//-*- coding: utf-8 -*-
#include "gb-include.h"
#include "hash.h"
#include "XmlDoc.h"
#include "Indexdb.h" // for TERMID_MASK definition and g_indexdb.getTermId()
#include "Conf.h"
#include "Query.h" // getFieldCode()
#include "Clusterdb.h" // g_clusterdb
#include "Categories.h" // g_categories
#include "iana_charset.h"
//#include "Checksumdb.h"
//#include "Msg24.h"
#include "Stats.h"
#include "Sanity.h"
#include "Speller.h"
#include "CountryCode.h"
//#include "SiteBonus.h"
#include "linkspam.h"
#include "Tagdb.h"
//#include "Dates.h"
#include "Repair.h"
//#include "Links.h"
#include "HashTableX.h"
#include "LanguageIdentifier.h" // g_langId
#include "CountryCode.h" // g_countryCode
#include "sort.h"
#include "Wiki.h"
#include "Speller.h"
#include "SiteGetter.h"
#include "Placedb.h"
#include "Test.h"
#include "Synonyms.h"
//#include "Revdb.h"
#include "Timedb.h"
#ifdef _USETURKS_
//#include "PageTurk.h"
#endif
#include "PageInject.h"
#include "HttpServer.h"
#include "Facebook.h"
#include "Posdb.h"
#include "Highlight.h"
#include "Wiktionary.h"
#include "seo.h" // Msg99Request etc.
//#include <regex.h>
#include "PingServer.h"
#include "Parms.h"
extern int g_inMemcpy;
//#define MAXDOCLEN (1024*1024 * 5)
//#define MAXDOCLEN (1024*1024)
HashTableX *g_ct = NULL;
XmlDoc *g_doc = NULL;
char *g_ptr = NULL;
int32_t *g_int32_t = NULL;
#define SENT_UNITS 30
static int32_t getIsContacty ( Url *url ,
LinkInfo *info1 ,
int32_t hops ,
uint8_t ct ,
bool isRoot ,
int32_t niceness );
static int32_t getTopGigabits ( HashTableX *ht ,
GigabitInfo **top ,
int32_t max ,
int32_t minDocCount ) ;
static void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase
int64_t wid1 ,
int64_t pid2 ,
int64_t wid2 , // post word
float *ww ,
HashTableX *tt1 ,
int32_t titleRecVersion ) ;
static bool addGigabit ( HashTableX *ht ,
char *s ,
int32_t slen ,
int64_t docId ,
Section *sp ,
bool singleWord ,
uint8_t langId ,
// starts with word #i
int32_t i ,
int32_t ptsArg = -1 ) ;
static bool getWordPosVec ( Words *words ,
Sections *sections,
//int32_t wordStart,
//int32_t wordEnd,
int32_t startDist,
char *fragVec,
int32_t niceness ,
SafeBuf *wpos ) ;
static void getMetaListWrapper ( void *state ) ;
char *getFirstJSONObject ( char *p ,
int32_t niceness ,
bool *isProduct ,
bool *isImage ) ;
char *getJSONObjectEnd ( char *p , int32_t niceness ) ;
void doneReadingArchiveFileWrapper ( int fd, void *state );
XmlDoc::XmlDoc() {
m_readThreadOut = false;
for ( int32_t i = 0 ; i < MAXMSG7S ; i++ ) m_msg7s[i] = NULL;
m_esbuf.setLabel("exputfbuf");
for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL;
m_freed = false;
m_contentInjected = false;
m_wasContentInjected = false;
// warc parsing stuff
m_msg7 = NULL;
m_warcError = 0;
m_arcError = 0;
m_doneInjectingWarc = false;
m_numInjectionsOut = 0;
m_fptr = NULL;
m_fptrEnd = NULL;
m_fileBuf = NULL;
m_warcContentPtr = NULL;
m_calledWgetThread = false;
//m_coll = NULL;
m_ubuf = NULL;
m_pbuf = NULL;
//m_contactDoc = NULL;
m_rootDoc = NULL;
m_oldDoc = NULL;
m_dx = NULL;
m_printedMenu = false;
// reset all *valid* flags to false
void *p = &m_VALIDSTART;
void *pend = &m_VALIDEND;
memset ( p , 0 , (char *)pend - (char *)p );//(int32_t)pend-(int32_t)p
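// note: this relies on every m_*Valid flag being declared contiguously
// between the m_VALIDSTART and m_VALIDEND marker members in XmlDoc.h,
// so a single memset() clears them all at once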
m_msg22Request.m_inUse = 0;
m_msg4Waiting = false;
m_msg4Launched = false;
//m_sectiondbData = NULL;
//m_placedbData = NULL;
m_dupTrPtr = NULL;
m_oldTitleRec = NULL;
m_filteredContent = NULL;
m_filteredContentAllocSize = 0;
m_metaList = NULL;
m_metaListSize = 0;
m_metaListAllocSize = 0;
//m_titleRec = NULL;
//m_freeTitleRec = true;
m_rootTitleRec = NULL;
m_outlinkHopCountVector = NULL;
//m_gsbuf = NULL;
m_extraDoc = NULL;
m_ahrefsDoc = NULL;
m_wikiqbuf = NULL;
//m_cr = NULL;
//m_msg3aArray = NULL;
m_msg3a = NULL;
m_query3a = NULL;
//m_numMsg99Replies = 0;
m_numMsg95Replies = 0;
m_seoSocket = NULL;
m_hackSocket = NULL;
m_doingSEO = false;
//m_newxd = NULL;
//m_newxd2 = NULL;
//m_newMsg20 = NULL;
m_registeredSocketCallback = false;
//m_numMsg98Requests = 0;
//m_numMsg98Replies = 0;
m_numMsg8eReplies = 0;
m_numMsg8eRequests = 0;
m_tempMsg25Page = NULL;
m_tempMsg25Site = NULL;
m_numLinkRequestsOut = 0;
m_numLinkRequestsIn = 0;
m_numMsg3fReplies = 0;
m_numMsg3fRequests = 0;
m_numMsg4fRequests = 0;
m_numMsg4fReplies = 0;
m_sentMsg4fRequests = false;
//m_notifyBlocked = 0;
//m_mcasts = NULL;
//for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ )
// m_currentBinPtrs[i] = NULL;
m_registeredWgetReadCallback = false;
m_pipe = NULL;
reset();
};
XmlDoc::~XmlDoc() {
setStatus("freeing this xmldoc");
reset();
m_freed = true;
};
static int64_t s_lastTimeStart = 0LL;
// for debugging
class XmlDoc *g_xd;
void XmlDoc::reset ( ) {
m_zeroedOut = false;
m_oldDocExistedButHadError = false;
m_addedStatusDocId = 0;
if ( m_diffbotProxyReplyValid && m_diffbotProxyReply ) {
mfree ( m_diffbotProxyReply , sizeof(ProxyReply) , "dprox" );
m_diffbotProxyReply = NULL;
}
if ( m_readThreadOut )
log("build: deleting xmldoc class that has a read thread out "
"on a warc file");
if ( m_fileValid ) {
m_file.close();
m_file.unlink();
}
if ( m_fileBuf )
mfree ( m_fileBuf , m_fileBufAllocSize , "fbdd");
for ( int i = 0 ; i < MAXMSG7S ; i++ ) {
Msg7 *msg7 = m_msg7s[i];
if ( ! msg7 ) continue;
if(msg7->m_inUse) {
log("build: archive: resetting xmldoc when msg7s are outstanding");
}
mdelete ( msg7 , sizeof(Msg7) , "xdmsg7" );
delete ( msg7 );
m_msg7s[i] = NULL;
}
if ( m_msg7 ) {
mdelete ( m_msg7 , sizeof(Msg7) , "xdmsg7" );
delete ( m_msg7 );
m_msg7 = NULL;
}
m_warcContentPtr = NULL;
m_arcContentPtr = NULL;
m_anyContentPtr = NULL;
m_savedChar = '\0';
m_contentDelim = NULL;
if(m_registeredWgetReadCallback && m_pipe) {
log("build: unregistering wget read callback on reset");
g_loop.unregisterReadCallback( fileno(m_pipe), this,doneReadingArchiveFileWrapper);
m_registeredWgetReadCallback = false;
}
if(m_pipe) {
int32_t retCode = fclose(m_pipe);
if ( retCode != 0 )
log("build: error closing warc pipe on reset: %s",
mstrerror(errno));
m_pipe = NULL;
}
m_redirUrl.reset();
m_updatedMetaData = false;
m_ipStartTime = 0;
m_ipEndTime = 0;
m_diffbotReplyRetries = 0;
m_isImporting = false;
m_printedMenu = false;
// for hashing CT_STATUS docs consistently, this might be invalid
// so call it 0
m_pubDate = 0;
m_tmpBuf2.purge();
m_gotFacets = false;
m_bodyStartPos = 0;
m_mcastArray = NULL;
m_skipIframeExpansion = false;
m_indexedTime = 0;
m_didDelete = false;
m_metaList2.purge();
m_zbuf.purge();
m_kbuf.purge();
m_mySiteLinkInfoBuf.purge();
m_myPageLinkInfoBuf.purge();
m_myTempLinkInfoBuf.purge();
// reset count for nukeJSONObjects() function
m_joc = 0;
// notifications pending?
//if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; }
m_sentToDiffbot = 0;
m_gotDiffbotSuccessfulReply = 0;
// we need to reset this to false
m_useTimeAxis = false;
m_sentToDiffbotThisTime = false;
m_loaded = false;
m_msg4Launched = false;
m_diffbotReplyError = 0;
m_diffbotJSONCount = 0;
//m_downloadAttempted = false;
m_incrementedAttemptsCount = false;
m_incrementedDownloadCount = false;
if ( m_dx ) {
mdelete ( m_dx , sizeof(XmlDoc), "xddx" );
delete ( m_dx );
m_dx = NULL;
//log("diffbot: deleting m_dx2");
}
m_isDiffbotJSONObject = false;
m_dmozBuf.purge();
m_fakeIpBuf.purge();
m_fakeTagRecPtrBuf.purge();
m_tlbufTimer = 0LL;
m_gsbuf.reset();
//m_launchedAll = false;
m_qstringTable.reset();
//m_setForReplyPtrs = false;
//m_setForLinkPtrs = false;
// must be none outstanding
if ( m_numMsg3fReplies != m_numMsg3fRequests ) { char *xx=NULL;*xx=0;}
if ( m_numMsg4fReplies != m_numMsg4fRequests ) { char *xx=NULL;*xx=0;}
m_numMsg4fRequests = 0;
m_numMsg4fReplies = 0;
m_sentMsg4fRequests = false;
// free table's mem if used
//m_tmpDupTable.reset();
//m_newxd2Blocked = false;
m_lastPrintedDocId = 0LL;
m_loggedMsg3 = false;
m_progressBar = 0;
m_triedToAddWordPosInfoToCachedb = false;
if ( m_numLinkRequestsOut > m_numLinkRequestsIn ){char *xx=NULL;*xx=0;}
m_doConsistencyTesting = g_conf.m_doConsistencyTesting;
m_computedMetaListCheckSum = false;
m_msg3aErrno = 0;
m_hadMatchError = 0;
m_clientClosed = false;
m_lastCheckTime = 0;
m_calledMsg25ForSite = false;
m_calledMsg25ForPage = false;
m_checkedCachedbForSite = false;
m_checkedCachedbForPage = false;
m_allHashed = false;
// nuke it
if ( m_tempMsg25Page ) {
mdelete ( m_tempMsg25Page , sizeof(Msg25), "m25li" );
delete ( m_tempMsg25Page );
m_tempMsg25Page = NULL;
}
if ( m_tempMsg25Site ) {
mdelete ( m_tempMsg25Site , sizeof(Msg25), "m25li" );
delete ( m_tempMsg25Site );
m_tempMsg25Site = NULL;
}
m_numLinkRequestsOut = 0;
m_seoDebug = 0;
//m_seoInfoSetFromCache = false;
m_checkedCachedb = false;
m_processedCachedbReply = false;
m_cacheList.freeList();
for ( int32_t i = 0; m_numMsg8eReplies && i < g_hostdb.m_numHosts;i++) {
if ( ! m_msg8eReply[i] ) continue;
mfree ( m_msg8eReply[i] , m_msg8eReplySize[i] , "8erep" );
m_msg8eReply[i] = NULL;
}
m_numMsg8eRequests = 0;
m_numMsg8eReplies = 0;
for ( int32_t i = 0; m_numMsg95Replies && i < g_hostdb.m_numHosts;i++) {
if ( ! m_msg95ReplyPtrs[i] ) continue;
mfree ( m_msg95ReplyPtrs[i] , m_msg95ReplySizes[i] , "95rep" );
m_msg95ReplyPtrs[i] = NULL;
}
m_numMsg95Replies = 0;
m_numMsg3fRequests = 0;
m_numMsg3fReplies = 0;
m_qcursor = 0;
//m_binError = 0;
//m_msg98ReplyError = 0;
//m_binErrorForReplyPtrs = 0;
//m_binErrorForLinkPtrs = 0;
//m_msg17.reset();
//m_triedCache = false;
//m_cacheRec = NULL;
//m_cacheRecSize = 0;
// reset this crap
m_beginTimeAllMatch = 0LL;
m_beginTimeMatchUrl = 0LL;
m_beginTimeFullQueries = 0LL;
m_beginTimeLinks = 0LL;
//m_beginMsg98s = 0LL;
m_beginRelatedQueries = 0LL;
m_doledbKey.n0 = 0LL;
m_doledbKey.n1 = 0;
// sanity check, any outstanding?
//if( m_numMsg98Requests != m_numMsg98Replies ) { char *xx=NULL;*xx=0;}
// reset them now
//m_numMsg98Requests = 0;
//m_numMsg98Replies = 0;
//if ( m_newxd ) {
// mdelete ( m_newxd , sizeof(XmlDoc),"newxd");
// delete ( m_newxd );
// m_newxd = NULL;
//}
//if ( m_newxd2 ) {
// mdelete ( m_newxd2 , sizeof(XmlDoc),"newxd2");
// delete ( m_newxd2 );
// m_newxd2 = NULL;
//}
/*
if ( m_newMsg20 ) {
mdelete ( m_newMsg20 , sizeof(Msg20),"newmsg20");
delete ( m_newMsg20 );
m_newMsg20 = NULL;
}*/
/*
NO! we use this for clientClosedConnection() function now
if ( m_seoSocket ) {
TcpServer *tcp = m_seoSocket->m_this;
// gotta set this so it can be destroyed and closed
m_seoSocket->m_waitingOnHandler = false;
tcp->destroySocket ( m_seoSocket );
m_seoSocket = NULL;
}
*/
if ( m_registeredSocketCallback ) { char *xx=NULL; *xx=0; }
//for ( int32_t i = 0 ; i < m_numMsg99Replies ; i++ ) {
// if ( ! m_msg99ReplyPtrs[i] ) continue;
// mfree ( m_msg99ReplyPtrs [i] ,
// m_msg99ReplyAlloc[i] ,
// "m99reply" );
//}
//m_numMsg99Replies = 0;
//m_sentMsg99Requests = false;
if ( m_msg3a ) {
mdelete ( m_msg3a , sizeof(Msg3a) , "xdmsg3a" );
delete ( m_msg3a );
m_msg3a = NULL;
}
if ( m_query3a ) {
mdelete ( m_query3a , sizeof(Query),"xdqry3a");
delete ( m_query3a );
m_query3a = NULL;
}
m_surroundingTextBuf.purge();
m_rssItemBuf.purge();
//m_twbuf.purge();
m_topMatchingQueryBuf.purge();
//m_queryPtrs.purge();
m_queryOffsets.purge();
m_extraQueryBuf.purge();
//m_socketWriteBuf.purge();
m_relatedDocIdBuf.purge();
m_relatedTitleBuf.purge();
m_commonQueryNumBuf.purge();
m_queryLinkBuf.purge();
//m_relatedQueryLinksIntersected.purge();
m_queryLinkStringBuf.purge();
//m_queryRelBuf.purge();
//m_relPtrs.purge();
m_sortedPosdbListBuf.purge();
m_wpSortedPosdbListBuf.purge();
m_termListBuf.purge();
m_insertableTermsBuf.purge();
//m_iwfiBuf.purge();
m_wordPosInfoBuf.purge();
//m_msg20ReplyPtrBuf.purge();
m_recommendedLinksBuf.purge();
m_tmpMsg0Buf.purge();
m_msg20Array.purge();
m_newLinkerBuf.purge();
//m_msg99ReplyBuf.purge();
m_matchingQueryBuf.purge();
m_relatedQueryBuf.purge();
m_queryLinkBuf.purge();
m_matchingQueryStringBuf.purge();
m_relatedQueryStringBuf.purge();
m_queryLinkStringBuf.purge();
m_docIdListBuf.purge();
m_queryChangeBuf.purge();
m_queryLogBuf.purge();
//m_itStrBuf.purge();
m_debugScoreInfoBuf.purge();
m_origScoreInfoBuf.purge();
m_msg20Buf.purge();
m_topDocIdsBuf.purge();
m_missingTermBuf.purge();
m_termInfoBuf.purge();
m_newTermInfoBuf.purge();
m_matchingTermBuf.purge();
m_termId32Buf.purge();
m_storeList.freeList();
//m_queryHashTable.reset();
m_tidTable32.reset();
m_queryOffsetTable.reset();
m_tmpTable.reset();
m_fullQueryDedup.reset();
//m_dupVotes.reset();
m_wordSpamBuf.purge();
m_fragBuf.purge();
m_downloadLevel = 0;
for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) {
if ( ! m_xmlDocs[i] ) continue;
mdelete ( m_xmlDocs[i] , sizeof(XmlDoc), "xdarr" );
delete ( m_xmlDocs[i] );
m_xmlDocs[i] = NULL;
}
s_lastTimeStart = 0LL;
m_req = NULL;
m_doneWithAhrefs = false;
m_useAhrefs = false;
m_linkDedupTablePtr = NULL;
m_domDedupTablePtr = NULL;
m_storeTermListInfo = false;
m_gotDupStats = false;
//m_nextSection = (Section *)-1;
m_si = (Section *)-1;
// for limiting # of iframe tag expansions
m_numExpansions = 0;
// . are not allowed to exit if waiting for msg4 to complete
// . yes we are, it should be saved as addsinprogress.dat
if ( m_msg4Waiting ) {
log("doc: resetting xmldoc with outstanding msg4. should "
"be saved in addsinprogress.dat. docid=%"UINT64"",m_docId);
//char *xx=NULL;*xx=0; }
}
m_ei = 0;
m_lastLaunch = -1;
m_pbuf = NULL;
m_wts = NULL;
m_deleteFromIndex = false;
//if ( m_contactDocValid ) nukeDoc ( m_contactDoc );
if ( m_rootDocValid ) nukeDoc ( m_rootDoc );
if ( m_oldDocValid ) nukeDoc ( m_oldDoc );
if ( m_extraDocValid ) nukeDoc ( m_extraDoc );
if ( m_ahrefsDocValid ) nukeDoc ( m_ahrefsDoc );
if ( m_linkInfo1Valid && ptr_linkInfo1 && m_freeLinkInfo1 ) {
// it now points into m_myPageLinkInfoBuf !
//mfree ( ptr_linkInfo1 , size_linkInfo1, "LinkInfo1");
ptr_linkInfo1 = NULL;
m_linkInfo1Valid = false;
}
if ( m_linkInfo2Valid && ptr_linkInfo2 && m_freeLinkInfo2 ) {
// should point into a safebuf as well
//mfree ( ptr_linkInfo2 , size_linkInfo2, "LinkInfo2");
ptr_linkInfo2 = NULL;
m_linkInfo2Valid = false;
}
if ( m_rawUtf8ContentValid && m_rawUtf8Content && !m_setFromTitleRec
// was content supplied by pageInject.cpp?
//! m_contentInjected ) {
) {
mfree ( m_rawUtf8Content, m_rawUtf8ContentAllocSize,"Xml3");
}
// reset this
m_contentInjected = false;
m_rawUtf8ContentValid = false;
m_wasContentInjected = false;
m_rootDoc = NULL;
// if this is true, then only index if new
m_newOnly = 0;
//if ( m_sectiondbData ) {
// mfree ( m_sectiondbData , m_sectiondbDataSize ,"sdbdata" );
// m_sectiondbData = NULL;
//}
//if ( m_placedbData ) {
// mfree ( m_placedbData , m_placedbDataSize ,"pdbdata" );
// m_placedbData = NULL;
//}
if ( m_httpReplyValid && m_httpReply ) {
mfree(m_httpReply,m_httpReplyAllocSize,"httprep");
m_httpReply = NULL;
m_httpReplyValid = false;
}
if ( m_filteredContentAllocSize ) {
mfree (m_filteredContent,m_filteredContentAllocSize,"xdfc");
m_filteredContent = NULL;
m_filteredContentAllocSize = 0;
}
//if ( m_utf8ContentValid && ! m_setFromTitleRec && ptr_utf8Content )
// mfree ( ptr_utf8Content , m_utf8ContentAllocSize,"Xml3");
if ( m_metaList ) { // m_metaListValid && m_metaList ) {
mfree ( m_metaList , m_metaListAllocSize , "metalist");
m_metaList = NULL;
m_metaListSize = 0;
m_metaListAllocSize = 0;
}
if ( m_ubuf ) {
// log("xmldoc: delete m_ubuf=%"PTRFMT" this=%"PTRFMT
// , (PTRTYPE) m_ubuf
// , (PTRTYPE) this
// );
mfree ( m_ubuf , m_ubufAlloc , "ubuf");
m_ubuf = NULL;
}
//if ( m_freeTitleRec && m_titleRec ) { // && m_titleRecValid ) {
// mfree ( m_titleRec , m_titleRecAllocSize , "trec" );
//}
//m_titleRec = NULL;
m_titleRecBuf.purge();
if ( m_dupTrPtr ) {
mfree ( m_dupTrPtr , m_dupTrSize , "trecd" );
m_dupTrPtr = NULL;
}
if ( m_oldTitleRecValid && m_oldTitleRec ) {
mfree ( m_oldTitleRec , m_oldTitleRecSize , "treca" );
m_oldTitleRec = NULL;
m_oldTitleRecValid = false;
}
if ( m_rootTitleRecValid && m_rootTitleRec ) {
mfree ( m_rootTitleRec , m_rootTitleRecSize , "treca" );
m_rootTitleRec = NULL;
m_rootTitleRecValid = false;
}
if ( m_outlinkHopCountVectorValid && m_outlinkHopCountVector ) {
int32_t sz = m_outlinkHopCountVectorSize;
mfree ( m_outlinkHopCountVector,sz,"ohv");
}
m_outlinkHopCountVector = NULL;
//if ( m_gsbufValid && m_gsbuf ) {
// mfree ( m_gsbuf , m_gsbufAllocSize , "gsbuf" );
//}
//m_gsbuf = NULL;
m_gsbuf.reset();
// reset all *valid* flags to false
void *p = &m_VALIDSTART;
void *pend = &m_VALIDEND;
memset ( p , 0 , (char *)pend - (char *)p );
m_hashedMetas = false;
m_mcastBuf.purge();
m_serpBuf.purge();
// Doc.cpp:
m_mime.reset();
m_words.reset();
m_phrases.reset();
m_bits.reset();
m_sections.reset();
//m_weights.reset();
m_countTable.reset();
m_dates.reset();
m_addresses.reset();
// other crap
m_xml.reset();
m_links.reset();
m_bits2.reset();
m_pos.reset();
//m_synonyms.reset();
m_synBuf.reset();
//m_nsvt.reset();
//m_osvt.reset();
m_turkVotingTable.reset();
m_turkBitsTable.reset();
m_vtr.reset();
m_rdtab.reset();
m_vctab.reset();
m_vcduptab.reset();
m_images.reset();
m_countTable.reset();
m_mime.reset();
m_tagRec.reset();
m_newTagBuf.reset();
m_catRec.reset();
//m_clockCandidatesTable.reset();
//m_cctbuf.reset();
m_dupList.reset();
//m_oldMetaList.reset();
m_msg8a.reset();
//m_siteLinkInfo.reset();
//m_msg25.reset();
//m_msgc.reset();
m_msg13.reset();
m_tmpsb1.reset();
m_tmpsb2.reset();
m_turkBuf.reset();
m_msg0b.reset();
//m_siteGetter.reset();
m_msge0.reset();
m_msge1.reset();
m_reply.reset();
// more stuff skipped
m_wtsTable.reset();
m_wbuf.reset();
m_pageLinkBuf.reset();
m_siteLinkBuf.reset();
m_esbuf.reset();
m_xbuf.reset();
m_tagRecBuf.reset();
//m_titleRec = NULL;
//m_titleRecSize = 0;
// origin of this XmlDoc
m_setFromTitleRec = false;
m_setFromUrl = false;
m_setFromDocId = false;
m_setFromSpiderRec = false;
m_freeLinkInfo1 = false;
m_freeLinkInfo2 = false;
m_checkedUrlFilters = false;
m_indexCode = 0;
m_masterLoop = NULL;
m_masterState = NULL;
//m_isAddUrl = false;
m_isInjecting = false;
m_useFakeMime = false;
m_useSiteLinkBuf = false;
m_usePageLinkBuf = false;
m_printInXml = false;
m_check1 = false;
m_check2 = false;
m_prepared = false;
// keep track of updates to the rdbs we have done, so we do not re-do
m_listAdded = false;
m_listFlushed = false;
m_updatedCounts = false;
m_updatedCounts2 = false;
//m_updatedTagdb1 = false;
//m_updatedTagdb2 = false;
//m_updatedTagdb3 = false;
//m_updatedTagdb4 = false;
//m_updatedTagdb5 = false;
m_copied1 = false;
m_updatingSiteLinkInfoTags = false;
m_addressSetCalled = false;
m_hashedTitle = false;
m_registeredSleepCallback = false;
m_addedNegativeDoledbRec = false;
m_numRedirects = 0;
m_numOutlinksAdded = 0;
// . use sameDomain and sameIp waits?
// . these may be bypassed in getContactDoc()
//m_throttleDownload = true;
m_spamCheckDisabled = false;
m_useRobotsTxt = true;
m_redirectFlag = false;
// Scraper.cpp sets this to true
//m_isScraping = false;
m_allowSimplifiedRedirs = false;
//m_calledMsg22a = false;
//m_calledMsg22b = false;
//m_calledMsg22c = false;
m_didDelay = false;
m_didDelayUnregister = false;
m_calledMsg22d = 0LL;
m_calledMsg22e = false;
m_calledMsg22f = false;
m_calledMsg25 = false;
m_calledMsg25b = false;
m_calledMsg40 = false;
m_calledSections = false;
m_calledThread = false;
m_alreadyRegistered = false;
m_loaded = false;
m_firstEntry = true;
m_firstEntry2 = true;
m_launchedSpecialMsg8a = false;
m_launchedMsg8a2 = false;
m_numSectiondbReads = 0;
m_numSectiondbNeeds = 0;
m_sectiondbRecall = 0;
//m_triedVoteCache = false;
//m_storedVoteCache = false;
m_setTr = false;
//m_checkedRobots = false;
m_triedTagRec = false;
m_didGatewayPage = false;
m_didQuickDupCheck = false;
m_calledMsg8b = false;
m_recycleContent = false;
//m_loadFromOldTitleRec = false;
m_callback1 = NULL;
m_callback2 = NULL;
m_state = NULL;
// used for getHasContactInfo()
m_processed0 = false;
m_hasContactInfo = false;
m_hasContactInfo2 = false;
//m_checkForRedir = true;
m_processedLang = false;
m_doingConsistencyCheck = false;
// used for getting contact info
//m_triedRoot = false;
//m_winner = -2;
// tell Msg13 to just call HttpServer::getDoc() and not to forward
// the download request to another host. although this does not
// exclude forwarding it to a compression proxy if
// g_conf.m_useCompressionProxy is set
m_forwardDownloadRequest = false;
m_isChildDoc = false;
m_parentDocPtr = NULL;
// for utf8 content functions
m_savedp = NULL;
m_oldp = NULL;
m_didExpansion = false;
// Repair.cpp now explicitly sets these to false if it needs to
m_usePosdb = true;
//m_useDatedb = true;
m_useClusterdb = true;
m_useLinkdb = true;
m_useSpiderdb = true;
m_useTitledb = true;
m_useTagdb = true;
m_usePlacedb = true;
//m_useTimedb = true;
// only use for custom crawls for now to save disk space
m_useSectiondb = false;
//m_useRevdb = true;
m_useSecondaryRdbs = false;
//m_useIpsTxtFile = true;
// used by Msg13.cpp only. kinda a hack.
m_isSpiderProxy = false;
// do not cache the http reply in msg13 etc.
m_maxCacheAge = 0;
// reset these ptrs too!
void *px = &ptr_firstUrl;
void *pxend = &size_firstUrl;
memset ( px , 0 , (char *)pxend - (char *)px );
m_hasMetadata = false;
ptr_metadata = NULL;
size_metadata = 0;
}
// . set the url with the intention of adding it or deleting it from the index
// . Msg7 and Repair.cpp can also set other members of XmlDoc rather than just
// m_firstUrl. they can provide the ip, the http reply, content, filtered
// content, the forced next spider time, the forced first indexed date,
// and the hop count
// . they might also want to skip deduping, or any algo deemed unnecessary
// by setting, for instance, m_isDupValid = true, or something
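// . example usage (an illustrative sketch only; the url, collection name
//   and niceness value below are made up):
//
//     XmlDoc xd;
//     if ( ! xd.set1 ( "http://www.example.com/" , "main" , NULL , 1 ) )
//             log("build: set1 failed: %s",mstrerror(g_errno));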
bool XmlDoc::set1 ( char *url ,
char *coll ,
SafeBuf *pbuf ,
int32_t niceness ) {
reset();
// this is true
m_setFromUrl = true;
//m_coll = coll;
m_pbuf = pbuf;
m_niceness = niceness;
m_version = TITLEREC_CURRENT_VERSION;
m_versionValid = true;
// sanity check
if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
// copy this in case collection gets deleted i guess...
//m_forceDelete = forceDelete;
// did we get this url from PageAddUrl?
//m_isAddUrl = isAddUrl;
// set m_indexCode so that XmlDoc::indexDoc() will delete it
//if ( forceDelete ) m_indexCode = EDOCFORCEDELETE;
// set this important member var
//cr = g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
//if ( ! cr ) return false;
if ( ! setCollNum ( coll ) ) return false;
setFirstUrl ( url , false );
//setSpideredTime();
return true;
}
char *XmlDoc::getTestDir ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// return NULL if we are not the "qatest123" collection
if ( strcmp(cr->m_coll,"qatest123") ) return NULL;
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
// then return "test-spider" otherwise...
//if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
// return "qa";//"test-spider";
// ... default to "test-parser"
//return "test-parser";
return "qa";
/*
if ( getIsPageParser() )
return "test-page-parser";
//if ( m_sreqValid && m_sreq.m_isInjecting )
// return "test-page-inject";
else if ( g_conf.m_testParserEnabled )
return "test-parser";
else if ( g_conf.m_testSpiderEnabled )
return "test-spider";
// default to being from PageInject
return "test-page-inject";
*/
//else { char *xx=NULL;*xx=0; }
//return NULL;
}
int32_t XmlDoc::getSpideredTime ( ) {
// stop if already set
if ( m_spideredTimeValid ) return m_spideredTime;
// tmp var
int32_t date = 0;
CollectionRec *cr = getCollRec();
if ( ! cr ) return 0;
// if not test collection keep it simple
if ( strcmp(cr->m_coll,"qatest123") || cr->m_useTimeAxis) {
// . set spider time to current time
// . this might already be valid if we set it in
// getTestSpideredDate()
m_spideredTime = getTimeGlobal();
m_spideredTimeValid = true;
return m_spideredTime;
}
char *testDir = getTestDir();
// get url
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) { char *xx=NULL;*xx=0; }
// this returns false if not in there, in which case, add it
if ( ! getTestSpideredDate(cu,&date,testDir) ) {
m_spideredTime = getTimeGlobal();
m_spideredTimeValid = true;
addTestSpideredDate ( cu , m_spideredTime , testDir );
return m_spideredTime;
}
// if we are injecting into the test coll for the 2nd+ time
// we need to use the spidered date from the first time we
// injected the doc in order to ensure things are parsed
// exactly the same way since some things depend on the
// spideredTime, like Dates (for setting "in future"
// flags)
m_spideredTimeValid = true;
m_spideredTime = date;
// hack for test coll which has fake vals for these because
// the SpiderRequest::m_addedTime and m_parentPrevSpiderTime
//m_minPubDate = m_spideredTime - 48*3600;
//m_maxPubDate = m_spideredTime - 24*3600;
return m_spideredTime;
}
// . we need this so PageGet.cpp can get the cached web page
// . but not for Msg20::getSummary(), that uses XmlDoc::set(Msg20Request*)
// . returns false and sets g_errno on error
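// . example usage (an illustrative sketch; the docid and collection name
//   are made-up values):
//
//     XmlDoc xd;
//     if ( ! xd.set3 ( 123456789LL , "main" , 1 ) )
//             log("db: set3 failed: %s",mstrerror(g_errno));
//     // a caller can then call loadFromOldTitleRec() to load the doc
//     // from its titledb record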
bool XmlDoc::set3 ( int64_t docId ,
char *coll ,
int32_t niceness ) {
reset();
// this is true
m_setFromDocId = true;
m_docId = docId;
m_docIdValid = true;
//m_coll = coll;
m_niceness = niceness;
// . sanity check
// . why can't we allow this??? MDW
//if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
// set this important member var
//cr = g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
//if ( ! cr ) { m_errno = ENOCOLLREC; return false; }
if ( ! setCollNum ( coll ) ) return false;
// solidify some parms
//m_eliminateMenus = cr->m_eliminateMenus;
//m_eliminateMenusValid = true;
return true;
}
void loadFromOldTitleRecWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure it has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "loading from old title rec wrapper" );
// return if it blocked
if ( ! THIS->loadFromOldTitleRec ( ) ) return;
char *coll = "";
CollectionRec *cr = THIS->getCollRec();
if ( cr ) coll = cr->m_coll;
// error?
if ( g_errno ) log("doc: loadfromtitlerec coll=%s: %s",
coll,
mstrerror(g_errno));
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
// returns false if blocked, returns true and sets g_errno on error otherwise
bool XmlDoc::loadFromOldTitleRec ( ) {
// . we are an entry point.
// . if anything blocks, this will be called when it comes back
if ( ! m_masterLoop ) {
m_masterLoop = loadFromOldTitleRecWrapper;
m_masterState = this;
}
// if we already loaded!
if ( m_loaded ) return true;
// if set from a docid, use msg22 for this!
char **otr = getOldTitleRec ( );
// error?
if ( ! otr ) return true;
// blocked?
if ( otr == (void *)-1 ) return false;
// this is a not found
if ( ! *otr ) {
// so we do not retry
m_loaded = true;
// make it an error
g_errno = ENOTFOUND;
return true;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// use that. decompress it! this will also set
// m_setFromTitleRec to true
if ( ! set2 ( m_oldTitleRec ,
m_oldTitleRecSize , // maxSize
cr->m_coll ,
NULL , // pbuf
m_niceness )) {
// we are now loaded, do not re-call
m_loaded = true;
// return true with g_errno set on error uncompressing
return true;
}
// we are now loaded, do not re-call
m_loaded = true;
// sanity check
if ( ! m_titleRecBufValid ) { char *xx=NULL;*xx=0; }
// good to go
return true;
}
bool XmlDoc::setCollNum ( char *coll ) {
CollectionRec *cr;
cr = g_collectiondb.getRec ( coll , gbstrlen(coll) );
if ( ! cr ) {
g_errno = ENOCOLLREC;
return log("build: collrec not found for %s",coll);
}
// we can store this safely:
m_collnum = cr->m_collnum;
m_collnumValid = true;
// if user "resets" the collection we need to know
m_lastCollRecResetCount = cr->m_lastResetCount;
return true;
}
CollectionRec *XmlDoc::getCollRec ( ) {
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = g_collectiondb.m_recs[m_collnum];
if ( ! cr ) {
log("build: got NULL collection rec for collnum=%"INT32".",
(int32_t)m_collnum);
g_errno = ENOCOLLREC;
return NULL;
}
// was it reset since we started spidering this url?
// we don't do it this way; when resetting a coll we delete it and
// re-add it under a different collnum to avoid getting msg4 adds to it.
//if ( cr->m_lastResetCount != m_lastCollRecResetCount ) {
// log("build: collection rec was reset. returning null.");
// g_errno = ENOCOLLREC;
// return NULL;
//}
return cr;
}
// returns false and sets g_errno on error
bool XmlDoc::set4 ( SpiderRequest *sreq ,
key_t *doledbKey ,
char *coll ,
SafeBuf *pbuf ,
int32_t niceness ,
char *utf8ContentArg ,
bool deleteFromIndex ,
int32_t forcedIp ,
uint8_t contentType ,
uint32_t spideredTime ,
bool contentHasMimeArg ,
char *contentDelim,
char *metadata ,
uint32_t metadataLen,
int32_t payloadLen
) {
// sanity check
if ( sreq->m_dataSize == 0 ) { char *xx=NULL;*xx=0; }
reset();
if ( g_conf.m_logDebugSpider )
log("xmldoc: set4 uh48=%"UINT64" parentdocid=%"UINT64"",
sreq->getUrlHash48(),sreq->getParentDocId());
// used by PageSpiderdb.cpp
m_startTime = gettimeofdayInMilliseconds();
m_startTimeValid = true;
// this is true
m_setFromSpiderRec = true;
// did page inject (pageinject) request to delete it?
m_deleteFromIndex = deleteFromIndex;
// PageReindex.cpp will set this in the spider request
if ( sreq->m_forceDelete )
m_deleteFromIndex = true;
// if we are a container doc then we need the content delimiter,
// unless we are a warc or arc, since we already know how those
// delimit their records.
m_contentDelim = contentDelim;
m_contentDelimValid = true;
bool contentHasMime = contentHasMimeArg;
// but if we are a container doc then this parm applies to each subdoc
// not to us, so turn it off for this part.
if ( isContainerDoc() ) {
contentHasMime = false;
m_subDocsHaveMime = contentHasMimeArg;
}
char *utf8Content = utf8ContentArg;
if ( contentHasMime && utf8Content ) {
// get length of it all
int32_t clen = gbstrlen(utf8Content);
// return false on error with g_errno set
if ( ! m_mime.set ( utf8ContentArg , clen , NULL ) ) {
if ( ! g_errno ) g_errno = EBADMIME;
log("xmldoc: could not set mime: %s",
mstrerror(g_errno));
return false;
}
// it's valid
m_mimeValid = true;
// advance
utf8Content = m_mime.getContent();
if(payloadLen != -1) {
payloadLen -= m_mime.getContent() - utf8ContentArg;
}
}
// use this to avoid ip lookup if it is not zero
if ( forcedIp ) {
m_ip = forcedIp;
m_ipValid = true;
}
// sometimes they supply the content they want! like when zaks'
// injects pages from PageInject.cpp
if ( utf8Content ) {
// . this is the most basic content from the http reply
// . only set this since sometimes it is facebook xml and
// contains encoded html which needs to be decoded.
// like <name>Ben &amp; Jerry's</name> otherwise our
// sentence formation stops at the ';' in the "&amp;" and
// we also index "amp" which is bad.
m_content = utf8Content;
if(payloadLen != -1) {
m_contentLen = payloadLen;
}
else if ( m_mimeValid && m_mime.m_contentLen > 0) {
m_contentLen = m_mime.m_contentLen;
} else {
m_contentLen = gbstrlen(utf8Content);
}
m_contentValid = true;
//m_rawUtf8Content = utf8Content;
//m_expandedUtf8Content = utf8Content;
//ptr_utf8Content = utf8Content;
//size_utf8Content = slen+1;
//m_rawUtf8ContentValid = true;
//m_expandedUtf8ContentValid = true;
//m_utf8ContentValid = true;
m_contentInjected = true;
m_wasContentInjected = true;
m_contentType = contentType;
m_contentTypeValid = true;
// use this ip as well for now to avoid ip lookup
//m_ip = atoip("127.0.0.1");
//m_ipValid = true;
// do not need robots.txt then
m_isAllowed = true;
m_isAllowedValid = true;
// nor mime
m_httpStatus = 200;
m_httpStatusValid = true;
// this too
m_downloadStatus = 0;
m_downloadStatusValid = true;
// assume this is the download time since the content
// was pushed/provided to us
if ( spideredTime )
m_downloadEndTime = spideredTime;
else
m_downloadEndTime = gettimeofdayInMillisecondsGlobal();
// either way, validate it
m_downloadEndTimeValid = true;
// and need a legit mime
if ( ! m_mimeValid ) {
m_mime.m_bufLen = 1;
m_mimeValid = true;
m_mime.m_contentType = contentType;
}
m_isContentTruncated = false;
m_isContentTruncatedValid = true;
// no redir
ptr_redirUrl = NULL;
size_redirUrl = 0;
m_redirUrl.reset();
m_redirUrlPtr = NULL;//&m_redirUrl;
m_redirUrlValid = true;
m_redirErrorValid = true;
m_redirError = 0;
m_crawlDelay = -1;
m_crawlDelayValid = true;
}
// override content type based on mime for application/json
if ( m_mimeValid ) {
m_contentType = m_mime.m_contentType;
m_contentTypeValid = true;
}
//m_coll = coll;
m_pbuf = pbuf;
m_niceness = niceness;
m_version = TITLEREC_CURRENT_VERSION;
m_versionValid = true;
/*
// set min/max pub dates right away
m_minPubDate = -1;
m_maxPubDate = -1;
// parentPrevSpiderTime is 0 if that was the first time that the
// parent was spidered, in which case isNewOutlink will always be set
// for every outlink it had!
if ( sreq->m_isNewOutlink && sreq->m_parentPrevSpiderTime ) {
// sanity check
if ( ! sreq->m_parentPrevSpiderTime ) {char *xx=NULL;*xx=0;}
// pub date is somewhere between these two times
m_minPubDate = sreq->m_parentPrevSpiderTime;
m_maxPubDate = sreq->m_addedTime;
}
*/
// this is used to remove the rec from doledb after we spider it
m_doledbKey.setMin();
if ( doledbKey ) m_doledbKey = *doledbKey;
// . sanity check
// . we really don't want the parser holding up the query pipeline
// even if this page is being turked!
//if ( m_niceness == 0 &&
// // spider proxy uses xmldoc class to expand iframe tags and
// // sometimes the initiating msg13 class was re-niced to 0
// // in the niceness conversion logic.
// ! g_hostdb.m_myHost->m_isProxy ) {
// char *xx=NULL; *xx=0; }
if ( sreq->isCorrupt(m_collnum) ) {
// set g_errno so callers know why we failed
g_errno = ECORRUPTDATA;
return log("XmlDoc: set4() spider request is corrupt in coll "
"%s u=%s",coll,sreq->m_url);
}
m_sreqValid = true;
// store the whole rec, key+dataSize+data, in case it disappears.
gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );
// set m_collnum etc.
if ( ! setCollNum ( coll ) )
return log("XmlDoc: set4() coll %s invalid",coll);
// it should be valid since we just set it
CollectionRec *cr = getCollRec();
m_useRobotsTxt = cr->m_useRobotsTxt;
// solidify some parms
//m_eliminateMenus = cr->m_eliminateMenus;
//m_eliminateMenusValid = true;
// validate these here too
/*
m_titleWeight = cr->m_titleWeight;
m_headerWeight = cr->m_headerWeight;
m_urlPathWeight = cr->m_urlPathWeight;
m_externalLinkTextWeight = cr->m_externalLinkTextWeight;
m_internalLinkTextWeight = cr->m_internalLinkTextWeight;
m_conceptWeight = cr->m_conceptWeight;
m_titleWeightValid = true;
m_headerWeightValid = true;
m_urlPathWeightValid = true;
m_externalLinkTextWeightValid = true;
m_internalLinkTextWeightValid = true;
m_conceptWeightValid = true;
*/
// fix some corruption i've seen
if ( m_sreq.m_urlIsDocId && ! is_digit(m_sreq.m_url[0]) ) {
log("xmldoc: fixing sreq %s to non docid",m_sreq.m_url);
m_sreq.m_urlIsDocId = 0;
}
// if url is a docid... we are from pagereindex.cpp
//if ( sreq->m_isPageReindex ) {
// now we can have url-based page reindex requests because
// if we have a diffbot json object fake url reindex request
// we add a spider request of the PARENT url for it as page reindex
//if ( is_digit ( sreq->m_url[0] ) ) {
// watch out for 0.r.msn.com!!
if ( m_sreq.m_urlIsDocId ) {
m_docId = atoll(m_sreq.m_url);
// assume it's good
m_docIdValid = true;
// similar to set3() above
m_setFromDocId = true;
// use content and ip from old title rec to save time
// . crap this is making the query reindex not actually
// re-download the content.
// . we already check the m_deleteFromIndex flag below
// in getUtf8Content() and use the old content in that case
// so i'm not sure why we are recycling here, so take
// this out. MDW 9/25/2014.
//m_recycleContent = true;
// sanity
if ( m_docId == 0LL ) { char *xx=NULL;*xx=0; }
}
else {
// add www is now REQUIRED for all!
// crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
// www.tmblr.co has no IP
setFirstUrl ( m_sreq.m_url , false );//true ); // false );
// you can't call this from a docid based url until you
// know the uh48
//setSpideredTime();
}
// now query reindex can specify a recycle content option so it
// can replace the rebuild tool. try to recycle on global index.
if ( m_sreqValid )
m_recycleContent = m_sreq.m_recycleContent;
m_hasMetadata = (bool)metadata;
ptr_metadata = metadata;
size_metadata = metadataLen;
return true;
}
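// . quick recap of the set*() entry points:
// .   set1() - from a url string (above)
// .   set2() - from a titledb record (below)
// .   set3() - from a docid, so PageGet.cpp can fetch the cached page
// .   set4() - from a SpiderRequest, the spider/inject path (above)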
// . set our stuff from the TitleRec (from titledb)
// . returns false and sets g_errno on error
bool XmlDoc::set2 ( char *titleRec ,
int32_t maxSize ,
char *coll ,
SafeBuf *pbuf ,
int32_t niceness ,
SpiderRequest *sreq ) {
// NO! can't do this. see below
//reset();
setStatus ( "setting xml doc from title rec");
// . it resets us, so save this
// . we only save these for set2() not the other sets()!
//void (*cb1)(void *state) = m_callback1;
//bool (*cb2)(void *state) = m_callback2;
//void *state = m_state;
// . clear it all out
// . no! this is clearing our msg20/msg22 reply...
// . ok, but repair.cpp needs it so do it there then
//reset();
// restore callbacks
//m_callback1 = cb1;
//m_callback2 = cb2;
//m_state = state;
// sanity check - since we do not reset
if ( m_contentValid ) { char *xx=NULL;*xx=0; }
// this is true
m_setFromTitleRec = true;
// this is valid i guess. includes key, etc.
//m_titleRec = titleRec;
//m_titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key_t) + 4;
//m_titleRecValid = true;
// . should we free m_cbuf on our reset/destruction?
// . no because doConsistencyCheck calls XmlDoc::set2 with a titleRec
// that should not be freed, besides the alloc size is not known!
//m_freeTitleRec = false;
int32_t titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key_t) + 4;
// . should we free m_cbuf on our reset/destruction?
// . no because doConsistencyCheck calls XmlDoc::set2 with a titleRec
// that should not be freed, besides the alloc size is not known!
m_titleRecBuf.setBuf ( titleRec ,
titleRecSize , // bufmax
titleRecSize , // bytes in use
false, // ownData?
csUTF8); // encoding
m_titleRecBufValid = true;
//m_coll = coll;
m_pbuf = pbuf;
m_niceness = niceness;
// . sanity check
// . NO! could be from XmlDoc::getMsg20Reply()!
//if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
// it must be there!
if ( !titleRec||titleRecSize==0 ) {g_errno=ENOTFOUND; return false;}
// set our collection number
if ( ! setCollNum ( coll ) ) return false;
// store the whole rec, key+dataSize+data, in case it disappears.
if ( sreq ) {
gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );
m_sreqValid = true;
}
m_hashedTitle = false;
m_hashedMetas = false;
// save the compressed buffer in case we should free it when done
//m_titleRec = titleRec;
// should we free m_cbuf on our reset/destruction?
//m_freeTitleRec = true;
// our record may not occupy all of m_cbuf, careful
//m_titleRecAllocSize = maxSize;
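// . on-disk titledb record layout, as parsed below:
// .   key_t   key      - the docId is embedded in this key
// .   int32_t dataSize - number of bytes that follow this field
// .   int32_t ubufSize - uncompressed size of the compressed blob
// .   char    cdata[dataSize-4] - compressed header + ptr_*/size_* data
//     (decompressed below with gbuncompress())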
// get a parse ptr
char *p = titleRec ;
// . this is just like a serialized RdbList key/dataSize/data of 1 rec
// . first thing is the key
// . key should have docId embedded in it
m_titleRecKey = *(key_t *) p ;
//m_titleRecKeyValid = true;
p += sizeof(key_t);
// bail on error
if ( (m_titleRecKey.n0 & 0x01) == 0x00 ) {
g_errno = EBADTITLEREC;
log("db: Titledb record is a negative key.");
char *xx=NULL; *xx=0;
return false;
}
// set m_docId from key
m_docId = g_titledb.getDocIdFromKey ( m_titleRecKey );
// validate that
m_docIdValid = true;
// then the size of the data that follows this
int32_t dataSize = *(int32_t *) p ;
p += 4;
// bail on error
if ( dataSize < 4 ) {
g_errno = EBADTITLEREC;
return log("db: Titledb record has size of %"INT32" which "
"is less than 4. Probable disk corruption in a "
"titledb file.",
dataSize);
}
// what is the size of cbuf/titleRec in bytes?
int32_t cbufSize = dataSize + 4 + sizeof(key_t);
// . the actual data follows "dataSize"
// . this is the uncompressed size of the compressed data below here
m_ubufSize = *(int32_t *) p ; p += 4;
// . because of disk/network data corruption this may be wrong!
// . we can now have absolutely huge titlerecs...
if ( m_ubufSize <= 0 ) { //m_ubufSize > 2*1024*1024 || m_ubufSize < 0 )
g_errno = EBADTITLEREC;
return log("db: TitleRec::set: uncompress uncompressed "
"size=%"INT32".",m_ubufSize );
}
// trying to uncompress corrupt titlerecs sometimes results in
// a seg fault... watch out
if ( m_ubufSize > 100*1024*1024 ) {
g_errno = EBADTITLEREC;
return log("db: TitleRec::set: uncompress uncompressed "
"size=%"INT32" > 100MB. unacceptable, probable "
"corruption.",m_ubufSize );
}
// make buf space for holding the uncompressed stuff
m_ubufAlloc = m_ubufSize;
m_ubuf = (char *) mmalloc ( m_ubufAlloc ,"TitleRecu1");
// log("xmldoc: m_ubuf=%"PTRFMT" this=%"PTRFMT
// , (PTRTYPE) m_ubuf
// , (PTRTYPE) this
// );
if ( ! m_ubuf ) {
// we had bad ubufsizes on gb6, like > 1GB. print out the key
// so we can manually make a titledb.dat file to delete these
// bad keys
log("build: alloc failed ubufsize=%"INT32" key.n1=%"UINT32" "
"n0=%"UINT64,
m_ubufAlloc,m_titleRecKey.n1,m_titleRecKey.n0);
return false;
}
// we need to loop since uncompress is weird, sometimes it needs more
// space than it should. see how much it actually took.
int32_t realSize = m_ubufSize;
// time it
int64_t startTime = gettimeofdayInMilliseconds();
// debug msg
setStatus( "Uncompressing title rec." );
// . uncompress the data into m_ubuf
// . m_ubufSize should remain unchanged since we stored it
int err = gbuncompress ( (unsigned char *) m_ubuf ,
(uint32_t *) &realSize ,
(unsigned char *) p ,
(uint32_t ) (dataSize - 4) );
// hmmmm...
if ( err == Z_BUF_ERROR ) {
log("db: Buffer is too small to hold uncompressed "
"document. Probable disk corruption in a titledb file.");
g_errno = EUNCOMPRESSERROR;
return false;
}
// set g_errno and return false on error
if ( err != Z_OK ) {
g_errno = EUNCOMPRESSERROR;
return log("db: Uncompress of document failed. ZG_ERRNO=%i. "
"cbufSize=%"INT32" ubufsize=%"INT32" realSize=%"INT32"",
err , cbufSize , m_ubufSize , realSize );
}
if ( realSize != m_ubufSize ) {
g_errno = EBADENGINEER;
return log("db: Uncompressed document size is not what we "
"recorded it to be. Probable disk corruption in "
"a titledb file.");
}
// . add the stat
// . use white for the stat
g_stats.addStat_r ( 0 ,
startTime ,
gettimeofdayInMilliseconds(),
0x00ffffff );
// first 2 bytes in m_ubuf is the header size
int32_t headerSize = *(uint16_t *)m_ubuf;
int32_t shouldbe = (char *)&ptr_firstUrl - (char *)&m_headerSize;
if ( headerSize != shouldbe ) {
g_errno = ECORRUPTDATA;
return log("doc: bad header size in title rec");
}
// set our easy stuff
gbmemcpy ( (void *)this , m_ubuf , headerSize );
// NOW set the XmlDoc::ptr_* and XmlDoc::size_* members
// like in Msg.cpp and Msg20Reply.cpp
if ( m_pbuf ) {
int32_t crc = hash32(m_ubuf,headerSize);
m_pbuf->safePrintf("crchdr=0x%"XINT32" sizehdr=%"INT32", ",
crc,headerSize);
}
// point to the string data
char *up = m_ubuf + headerSize;
// end of the rec
char *upend = m_ubuf + m_ubufSize;
// how many XmlDoc::ptr_* members do we have? set "np" to that
int32_t np = ((char *)&size_firstUrl - (char *)&ptr_firstUrl) ;
np /= sizeof(char *);
// point to the first ptr
char **pd = (char **)&ptr_firstUrl;
// point to the first size
int32_t *ps = (int32_t *)&size_firstUrl;
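// . each ptr_*/size_* pair was serialized as [int32_t size][data bytes],
//   but only if its bit in m_internalFlags1 is set; pairs whose bit is
//   clear were empty and just get zeroed out here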
// loop over them
for ( int32_t i = 0 ; i < np ; i++ , pd++ , ps++ ) {
// zero out the ith ptr_ and size_ member
*pd = 0;
*ps = 0;
// make the mask
uint32_t mask = 1 << i ;
// do we have this member? skip if not.
if ( ! (m_internalFlags1 & mask) ) continue;
// watch out for corruption
if ( up > upend ) {
g_errno = ECORRUPTDATA;
return log("doc: corrupt titlerec.");
}
// get the size
*ps = *(int32_t *)up;
// this should never be 0, otherwise, why was its flag set?
if ( *ps <= 0 ) { char *xx=NULL;*xx=0; }
// skip over to point to data
up += 4;
// point to the data. could be 64-bit ptr.
*pd = up;//(int32_t)up;
// debug
if ( m_pbuf ) {
int32_t crc = hash32(up,*ps);
m_pbuf->safePrintf("crc%"INT32"=0x%"XINT32" size%"INT32"=%"INT32", ",
i,crc,i,*ps);
}
// skip over data
up += *ps;
// watch out for corruption
if ( up > upend ) {
g_errno = ECORRUPTDATA;
return log("doc: corrupt titlerec.");
}
}
// cap it
char *pend = m_ubuf + m_ubufSize;
// sanity check. must match exactly.
if ( up != pend ) { char *xx=NULL;*xx=0; }
// set the urls i guess
m_firstUrl.set ( ptr_firstUrl );
if ( ptr_redirUrl ) {
m_redirUrl.set ( ptr_redirUrl );
m_currentUrl.set ( ptr_redirUrl );
m_currentUrlValid = true;
m_redirUrlPtr = &m_redirUrl;
}
else {
m_currentUrl.set ( ptr_firstUrl );
m_currentUrlValid = true;
m_redirUrlPtr = NULL;
}
m_firstUrlValid = true;
m_redirUrlValid = true;
// convert 8 bit to a 32 bit
//m_numBannedOutlinks = score8to32 ( m_numBannedOutlinks8 );
// validate *shadow* members since bit flags cannot be returned
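// (they are packed bit fields in the serialized header, and C++ does not
// allow taking the address of or a reference to a bit field, hence the
// full-width *2 shadow copies below)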
m_isRSS2 = m_isRSS;
m_isPermalink2 = m_isPermalink;
m_isAdult2 = m_isAdult;
m_spiderLinks2 = m_spiderLinks;
m_isContentTruncated2 = m_isContentTruncated;
m_isLinkSpam2 = m_isLinkSpam;
m_hasAddress2 = m_hasAddress;
m_hasTOD2 = m_hasTOD;
//m_hasSiteVenue2 = m_hasSiteVenue;
m_hasContactInfo2 = m_hasContactInfo;
//m_skipIndexingByte = m_skipIndexing;
m_isSiteRoot2 = m_isSiteRoot;
// these members are automatically validated
m_ipValid = true;
m_spideredTimeValid = true;
m_indexedTimeValid = true;
m_pubDateValid = true;
m_firstIndexedValid = true;
m_outlinksAddedDateValid = true;
m_charsetValid = true;
m_countryIdValid = true;
/*
m_titleWeightValid = true;
m_headerWeightValid = true;
m_urlPathWeightValid = true;
m_externalLinkTextWeightValid = true;
m_internalLinkTextWeightValid = true;
m_conceptWeightValid = true;
*/
// new stuff
m_siteNumInlinksValid = true;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
//m_sitePopValid = true;
m_rootLangIdValid = true;
m_hasContactInfoValid = true;
m_metaListCheckSum8Valid = true;
m_hopCountValid = true;
//m_numBannedOutlinksValid = true;
m_langIdValid = true;
m_contentTypeValid = true;
m_isRSSValid = true;
m_isPermalinkValid = true;
m_isAdultValid = true;
//m_eliminateMenusValid = true;
m_spiderLinksValid = true;
m_isContentTruncatedValid = true;
m_isLinkSpamValid = true;
m_hasAddressValid = true;
m_tagRecDataValid = true;
m_gigabitHashesValid = true;
m_contentHash32Valid = true;
//m_tagHash32Valid = true;
m_tagPairHash32Valid = true;
m_adVectorValid = true;
m_wikiDocIdsValid = true;
m_imageDataValid = true;
m_catIdsValid = true;
m_indCatIdsValid = true;
// ptr_dmozTitles/Summs/Anchors valid:
m_dmozInfoValid = true;
m_utf8ContentValid = true;
//m_sectionsReplyValid = true;
//m_sectionsVotesValid = true;
//m_addressReplyValid = true;
m_siteValid = true;
m_linkInfo1Valid = true;
m_linkInfo2Valid = true;
m_versionValid = true;
m_httpStatusValid = true;
m_crawlDelayValid = true;
//m_sectiondbDataValid = true;
//m_placedbDataValid = true;
//m_clockCandidatesDataValid = true;
//m_skipIndexingValid = true;
m_isSiteRootValid = true;
// ptr_linkInfo2 is valid. so getDiffbotTitleHashes() works.
m_diffbotTitleHashBufValid = true;
// set "m_oldTagRec" from ptr_tagRecData
//gbmemcpy ( &m_oldTagRec , ptr_tagRecData , size_tagRecData );
//m_oldTagRecValid = true;
// there was no issue indexing it...
m_indexCode = 0;
m_indexCodeValid = true;
m_redirError = 0;
m_redirErrorValid = true;
// prevent a core dump when importing and calling getNewSpiderReply()
m_downloadEndTime = m_spideredTime;
m_downloadEndTimeValid = true;
// make a copy for new tag rec too, this one we modify
//gbmemcpy ( &m_newTagRec , ptr_tagRecData , size_tagRecData );
// set "m_siteNumInlinks" from m_oldTagRec
//Tag *tag = m_oldTagRec.getTag("sitenuminlinks");
// must always be there!
//if ( ! tag ) { char *xx=NULL;*xx=0; }
// must be null terminated
//if ( tag->getTagData()[tag->getTagData()Size-1] != 0 ) {
// char *xx=NULL;*xx=0; }
// grab that
//m_siteNumInlinks = atol(tag->getTagData());
//m_siteNumInlinksValid = true;
// must not be negative
if ( m_siteNumInlinks < 0 ) { char *xx=NULL;*xx=0; }
// set m_hasContactInfo in case someone calls ::getHasContactInfo()
// which will do a bunch of parsing!!
//tag = m_oldTagRec.getTag ("hascontactinfo");
//if ( tag ) m_hasContactInfo = true;
//else m_hasContactInfo = false;
//m_hasContactInfoValid = true;
// sanity check. if m_siteValid is true, this must be there
if ( ! ptr_site ) {
log("set2: ptr_site is null for docid %"INT64"",m_docId);
//char *xx=NULL;*xx=0; }
g_errno = ECORRUPTDATA;
return false;
}
// lookup the tagdb rec fresh if setting for a summary. that way we
// can see if it is banned or not
//if ( m_req ) m_tagRecDataValid = false;
// debug thing
ptr_sectiondbData = NULL;
size_sectiondbData = 0;
// set m_sections.m_nsvt from data. ptr_sectiondbData is the m_osvt
// serialized, which is from our read of sectiondb at the time we
// indexed it. but now that we may have nulled out our content to
// save space in titledb because m_skipIndexing is true, then we have
// to save our votes as well, BUT, only if we skipped indexing.
// and not allowed to serialize UNLESS we skipped because
// that would waste space as well
//if (! m_skipIndexing && size_sectionsVotes ) { char *xx=NULL;*xx=0; }
// success, return true then
return true;
}
bool XmlDoc::setFirstUrl ( char *u , bool addWWW , Url *baseUrl ) {
m_firstUrl.reset();
m_currentUrl.reset();
m_firstUrlValid = true;
// sanity check. "u" must be normalized
//if ( strncmp(u,"http",4 ) != 0 ) { char *xx=NULL;*xx=0; }
// assume url is not correct format
ptr_firstUrl = NULL;
size_firstUrl = 0;
if ( ! u || ! u[0] ) {
//if ( ! m_indexCode ) m_indexCode = EBADURL;
return true;
}
//if ( gbstrlen (u) + 1 > MAX_URL_LEN )
// m_indexCode = EURLTOOLONG;
m_firstUrl.set ( baseUrl , u , gbstrlen(u) , addWWW ) ;
// it is the active url
m_currentUrl.set ( &m_firstUrl , false );
m_currentUrlValid = true;
// set this to the normalized url
ptr_firstUrl = m_firstUrl.getUrl();
size_firstUrl = m_firstUrl.getUrlLen() + 1;
// is it a link loop?
//if ( m_firstUrl.isLinkLoop() ) {
// if ( ! m_indexCode ) m_indexCode = ELINKLOOP;
// return true;
//}
// is it illegal?
//if ( m_firstUrl.m_host && m_firstUrl.m_host[0] == '.' ) {
// if ( ! m_indexCode ) m_indexCode = EBADURL;
// return true;
//}
// check if url has porn words in it
//if ( cr->m_doUrlSpamCheck && m_firstUrl.isSpam() ) {
// if ( ! m_indexCode ) m_indexCode = EDOCURLSPAM;
// return true;
//}
return true;
}
//CollectionRec *XmlDoc::getCollRec ( ) {
// return g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) );
//}
//bool XmlDoc::setRedirUrl ( char *u , bool addWWW ) {
// m_redirUrl.set ( u , gbstrlen(u) , addWWW );
// ptr_redirUrl = m_redirUrl.getUrl();
// size_redirUrl = m_redirUrl.getUrlLen()+1;
// return true;
//}
void XmlDoc::setStatus ( char *s ) {
m_statusMsg = s;
m_statusMsgValid = true;
static char *s_last = NULL;
if ( s == s_last ) return;
bool timeIt = false;
// if ( m_sreqValid &&
// m_sreq.m_isInjecting &&
// m_sreq.m_isPageInject )
// timeIt = true;
if ( g_conf.m_logDebugBuildTime )
timeIt = true;
// log times to detect slowness
if ( timeIt ) {
int64_t now = gettimeofdayInMillisecondsLocal();
if ( s_lastTimeStart == 0LL ) s_lastTimeStart = now;
int32_t took = now - s_lastTimeStart;
//if ( took > 100 )
log("xmldoc: %s (xd=0x%"PTRFMT" "
"u=%s) took %"INT32"ms",
s_last,
(PTRTYPE)this,
m_firstUrl.m_url,
took);
s_lastTimeStart = now;
}
s_last = s;
bool logIt = g_conf.m_logDebugBuild;
// CollectionRec *cr = NULL;
// if ( m_collnumValid )
// cr = g_collectiondb.m_recs[m_collnum];
// if ( cr &&
// cr->m_coll &&
// cr->m_coll[0] == 'c' &&
// cr->m_coll[1] == 'r' &&
// strncmp(cr->m_coll,"crawlbottesting-",16) == 0 )
// logIt = true;
if ( ! logIt ) return;
//return;
if ( m_firstUrlValid )
logf(LOG_DEBUG,"build: status = %s for %s (this=0x%"PTRFMT")",
s,m_firstUrl.m_url,(PTRTYPE)this);
else
logf(LOG_DEBUG,"build: status = %s for docId %"INT64" "
"(this=0x%"PTRFMT")",
s,m_docId, (PTRTYPE)this);
}
// caller must now call XmlDoc::setCallback()
void XmlDoc::setCallback ( void *state, void (* callback) (void *state) ) {
m_state = state;
m_callback1 = callback;
// add this additional state==this constraint to prevent core when
// doing a page parser
if ( state == this &&
// i don't remember why i added this sanity check...
callback == getMetaListWrapper ) { char *xx=NULL;*xx=0; }
}
void XmlDoc::setCallback ( void *state, bool (*callback) (void *state) ) {
m_state = state;
m_callback2 = callback;
}
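// Example (an illustrative sketch): register a completion callback before
// calling a blocking entry point, mirroring what injectDoc() does below.
// "st" and "doneWrapper" here are hypothetical caller-side names:
//
//   xd->setCallback ( st , doneWrapper );
//   if ( ! xd->indexDoc ( ) ) return false; // doneWrapper called when done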
// . similar to XmlDoc::indexDoc() but just adds m_firstUrl to spiderdb
// . used by PageAddUrl.cpp
/*
bool XmlDoc::addToSpiderdb ( ) {
// set a flag
m_isAddUrl = true;
// url must be valid
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
// do not add if something wrong with url
if ( m_indexCode ) return true;
// this should just add to spiderdb because m_isAddUrl is true
return indexDoc(false,false,false,false,true,false);
}
*/
void indexDocWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure it has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "in index doc wrapper" );
// return if it blocked
if ( ! THIS->indexDoc( ) ) return;
// otherwise, all done, call the caller callback
// g_statsdb.addStat ( MAX_NICENESS,
// "docs_indexed",
// 20,
// 21,
// );
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
// for registerSleepCallback
void indexDocWrapper2 ( int fd , void *state ) {
indexDocWrapper ( state );
}
// . inject from http request
// . replace more of Msg7.cpp logic with this?
//bool XmlDoc::injectDoc ( HttpRequest *hr ) {
//}
// . the highest level function in here
// . user is requesting to inject this url
// . returns false if blocked and your callback will be called when done
// . returns true and sets g_errno on error
bool XmlDoc::injectDoc ( char *url ,
CollectionRec *cr ,
char *content ,
char *diffbotReply, // usually null
bool contentHasMimeArg ,
int32_t hopCount,
int32_t charset,
bool deleteUrl,
char *contentTypeStr, // text/html application/json
bool spiderLinks ,
char newOnly, // index iff new
void *state,
void (*callback)(void *state) ,
uint32_t firstIndexed,
uint32_t lastSpidered ,
int32_t injectDocIp ,
char *contentDelim,
char *metadata,
uint32_t metadataLen,
int32_t payloadLen
) {
// wait until we are synced with host #0
if ( ! isClockInSync() ) {
log("xmldoc: got injection request but clock not yet "
"synced with host #0");
g_errno = ETRYAGAIN;//CLOCKNOTSYNCED;
return true;
}
// normalize url
Url uu;
// do not add www to fix tmblr.co/ZHw5yo1E5TAaW injection
// which has no www.tmblr.co IP!
uu.set(url,gbstrlen(url),false);//true);
// if (!strncmp(url , "http://www.focusinfo.com/products/mxprodv" ,40))
// log("hey");
// remove >'s i guess and store in st1->m_url[] buffer
char cleanUrl[MAX_URL_LEN+1];
cleanInput ( cleanUrl,
MAX_URL_LEN,
uu.getUrl(),
uu.getUrlLen() );
int32_t contentType = CT_UNKNOWN;
if ( contentTypeStr && contentTypeStr[0] )
contentType = getContentTypeFromStr(contentTypeStr);
// use CT_HTML if contentTypeStr is empty or blank. default
if ( ! contentTypeStr || ! contentTypeStr[0] )
contentType = CT_HTML;
// this can go on the stack since set4() copies it
SpiderRequest sreq;
sreq.setFromInject ( cleanUrl );
if ( lastSpidered )
sreq.m_addedTime = lastSpidered;
if ( deleteUrl )
sreq.m_forceDelete = 1;
//static char s_dummy[3];
// sometimes the content is indeed NULL...
//if ( newOnly && ! content ) {
// // don't let it be NULL because then xmldoc will
// // try to download the page!
// s_dummy[0] = '\0';
// content = s_dummy;
// //char *xx=NULL;*xx=0; }
//}
// . use the enormous power of our new XmlDoc class
// . this returns false with g_errno set on error
if ( ! set4 ( &sreq ,
NULL ,
cr->m_coll ,
NULL , // pbuf
// from PageInject.cpp:
// give it a niceness of 1, we have to be
// careful since we are a niceness of 0!!!!
1, // niceness, // 1 ,
// inject this content
content ,
deleteUrl, // false, // deleteFromIndex ,
injectDocIp, // 0,//forcedIp ,
contentType ,
lastSpidered,//lastSpidered override
contentHasMimeArg ,
contentDelim,
metadata,
metadataLen,
payloadLen
)) {
// g_errno should be set if that returned false
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return true;
}
// a diffbot reply? should be in json
if ( diffbotReply ) {
if ( ! m_diffbotReply.safeStrcpy(diffbotReply) )
return true;
// it was injected so assume no error
m_diffbotReplyError = 0;
m_diffbotReplyValid = true;
}
//m_doConsistencyTesting = doConsistencyTesting;
// . set xd from the old title rec if recycle is true
// . can also use XmlDoc::m_loadFromOldTitleRec flag
//if ( recycleContent ) m_recycleContent = true;
// other crap. used for importing from the titledb of another coll/cluster.
if ( firstIndexed ) {
m_firstIndexedDate = firstIndexed;
m_firstIndexedDateValid = true;
}
if ( lastSpidered ) {
m_spideredTime = lastSpidered;
m_spideredTimeValid = true;
}
if ( hopCount != -1 ) {
m_hopCount = hopCount;
m_hopCountValid = true;
}
// PageInject calls memset on gigablastrequest so add '!= 0' here
if ( charset != -1 && charset != csUnknown && charset != 0 ) {
m_charset = charset;
m_charsetValid = true;
}
// avoid looking up ip of each outlink to add "firstip" tag to tagdb
// because that can be slow!!!!!!!
m_spiderLinks = spiderLinks;
m_spiderLinks2 = spiderLinks;
m_spiderLinksValid = true;
// . newOnly is true --> do not inject if document is already indexed!
// . maybe just set indexCode
m_newOnly = newOnly;
// do not re-lookup the robots.txt
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelay = -1; // unknown
m_crawlDelayValid = true;
m_isInjecting = true;
m_isInjectingValid = true;
// set this now
//g_inPageInject = true;
// log it now
//log("inject: indexing injected doc %s",cleanUrl);
// make this our callback in case something blocks
setCallback ( state , callback );
// . now tell it to index
// . this returns false if blocked
// . eventually it will call "callback" when done if it blocks
bool status = indexDoc ( );
if ( ! status ) return false;
// log it here only when indexDoc() did not block; when it blocks,
// xmldoc.cpp::indexDoc() calls logIt() itself later on.
logIt();
// undo it
//g_inPageInject = false;
return true;
}
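// . illustrative only: a minimal, commented-out sketch of how a caller
//   might drive injectDoc() above using the async callback convention.
//   injectOne(), injectDoneWrapper() and the cleanup step are assumptions
//   for illustration, not code that exists elsewhere in this file.
/*
static void injectDoneWrapper ( void *state ) {
	XmlDoc *xd = (XmlDoc *)state;
	if ( g_errno ) log("inject: %s",mstrerror(g_errno));
	// free or recycle xd here
}
static bool injectOne ( CollectionRec *cr , char *url , char *html ) {
	XmlDoc *xd;
	try { xd = new ( XmlDoc ); } catch ( ... ) { return true; }
	mnew ( xd , sizeof(XmlDoc) , "sketchxd" );
	// returns false if it blocked; injectDoneWrapper is called later
	if ( ! xd->injectDoc ( url , cr , html ,
			       NULL ,        // diffbotReply
			       false ,       // contentHasMime
			       -1 ,          // hopCount unknown
			       -1 ,          // charset unknown
			       false ,       // deleteUrl
			       "text/html" , // contentTypeStr
			       false ,       // spiderLinks
			       0 ,           // newOnly
			       xd , injectDoneWrapper ,
			       0 , 0 ,       // firstIndexed, lastSpidered
			       0 ,           // injectDocIp
			       NULL , NULL , 0 , 0 ) )
		return false;
	// did not block; g_errno may already be set
	injectDoneWrapper ( xd );
	return true;
}
*/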
// XmlDoc::injectDoc uses a fake spider request so we have to add
// a real spider request into spiderdb so that the injected doc can
// be spidered again in the future by the spidering process, otherwise,
// injected docs can never be re-spidered. they would end up having
// a SpiderReply in spiderdb but no matching SpiderRequest.
void XmlDoc::getRevisedSpiderRequest ( SpiderRequest *revisedReq ) {
if ( ! m_sreqValid ) { char *xx=NULL; *xx=0; }
// we are doing this because it has a fake first ip
if ( ! m_sreq.m_fakeFirstIp ) { char *xx=NULL;*xx=0; }
// copy it over from our current spiderrequest
gbmemcpy ( revisedReq , &m_sreq , m_sreq.getRecSize() );
// this must be valid for us of course
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// wtf? it might be invalid!!! parent caller will handle it...
//if ( m_firstIp == 0 || m_firstIp == -1 ) { char *xx=NULL;*xx=0; }
// store the real ip in there now
revisedReq->m_firstIp = m_firstIp;
// but turn off this flag! the whole point of all this...
revisedReq->m_fakeFirstIp = 0;
// re-make the key since it contains m_firstIp
int64_t uh48 = m_sreq.getUrlHash48();
int64_t parentDocId = m_sreq.getParentDocId();
// set the key properly to reflect the new "first ip" since
// we shard spiderdb by that.
revisedReq->m_key = g_spiderdb.makeKey ( m_firstIp,
uh48,
true, // is request?
parentDocId ,
false );// isDel );
revisedReq->setDataSize();
}
void XmlDoc::getRebuiltSpiderRequest ( SpiderRequest *sreq ) {
// memset 0
sreq->reset();
// assume not valid
sreq->m_siteNumInlinks = -1;
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// how many site inlinks?
sreq->m_siteNumInlinks = m_siteNumInlinks;
sreq->m_siteNumInlinksValid = true;
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// set other fields besides key
sreq->m_firstIp = m_firstIp;
sreq->m_hostHash32 = m_hostHash32a;
//sreq->m_domHash32 = m_domHash32;
//sreq->m_siteNumInlinks = m_siteNumInlinks;
//sreq->m_pageNumInlinks = m_pageNumInlinks;
sreq->m_hopCount = m_hopCount;
sreq->m_parentHostHash32 = 0;//m_sreq.m_parentHostHash32;
sreq->m_parentDomHash32 = 0;//m_sreq.m_parentDomHash32;
sreq->m_parentSiteHash32 = 0;//m_sreq.m_parentSiteHash32;
sreq->m_pageNumInlinks = 0;//m_sreq.m_parentFirstIp;
Url *fu = getFirstUrl();
sreq->m_isNewOutlink = 0;
sreq->m_isAddUrl = 0;//m_isAddUrl;
sreq->m_isPingServer = fu->isPingServer();
//sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat;
// transcribe from old spider rec, stuff should be the same
sreq->m_addedTime = m_firstIndexedDate;
sreq->m_sameDom = 0;//m_sreq.m_sameDom;
sreq->m_sameHost = 0;//m_sreq.m_sameHost;
sreq->m_sameSite = 0;//m_sreq.m_sameSite;
sreq->m_wasParentIndexed = 0;//m_sreq.m_parentWasIndexed;
sreq->m_parentIsRSS = 0;//m_sreq.m_parentIsRSS;
sreq->m_parentIsPermalink = 0;//m_sreq.m_parentIsPermalink;
sreq->m_parentIsPingServer = 0;//m_sreq.m_parentIsPingServer;
// validate the stuff so getUrlFilterNum() acks it
sreq->m_hopCountValid = 1;
// we need this now for ucp ucr upp upr new url filters that do
// substring matching on the url
if ( m_firstUrlValid )
strcpy(sreq->m_url,m_firstUrl.m_url);
// re-make the key since it contains m_firstIp
int64_t uh48 = fu->getUrlHash48();
// set the key properly to reflect the new "first ip"
// since we shard spiderdb by that.
sreq->m_key = g_spiderdb.makeKey ( m_firstIp,//ip,
uh48,
true,//is req?
0LL, // parentDocId ,
false );//isDel
sreq->setDataSize();
}
////////////////////////////////////////////////////////////////////
// THIS IS THE HEART OF HOW THE PARSER ADDS TO THE RDBS
////////////////////////////////////////////////////////////////////
// . returns false if blocked, true otherwise
// . sets g_errno on error and returns true
// . this is now a WRAPPER for indexDoc2() and it will deal with
// g_errnos by adding an error spider reply so we offload the
// logic to the url filters table
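// . illustrative note on the "meta list" format used by this function and
//   by getMetaList(): it is a flat buffer of records, each prefixed with a
//   one-byte rdbId that says which rdb the record belongs to. a minimal
//   sketch, assuming a SpiderReply "rep" has already been filled in
//   ("metaList" is just a stand-in for m_metaList2 used below):
/*
	SafeBuf metaList;
	metaList.pushChar ( (char)RDB_SPIDERDB );                // rdbId byte
	metaList.safeMemcpy ( (char *)&rep , rep.getRecSize() ); // the record
	// msg4 then routes each record to the right rdb and shard:
	// m_msg4.addMetaList ( metaList.getBufStart() , metaList.length() ,
	//                      m_collnum , state , callback , niceness );
*/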
bool XmlDoc::indexDoc ( ) {
// return from the msg4.addMetaList() below?
if ( m_msg4Launched ) {
// must have been waiting
if ( ! m_msg4Waiting ) { char *xx=NULL;*xx=0; }
return true;
}
// return true with g_errno set on error
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
if ( ! m_masterLoop ) {
m_masterLoop = indexDocWrapper;
m_masterState = this;
}
// do not index if already indexed and we are importing
// from the code in PageInject.cpp from a foreign titledb file
if ( m_isImporting && m_isImportingValid ) {
char *isIndexed = getIsIndexed();
if ( ! isIndexed ) {
log("import: import had error: %s",mstrerror(g_errno));
return true;
}
if ( isIndexed == (char *)-1)
return false;
if ( *isIndexed ) {
log("import: skipping import for %s. already indexed.",
m_firstUrl.getUrl());
return true;
}
}
// . even if not using diffbot, keep track of these counts
// . even if we had something like EFAKEFIRSTIP, OOM, or whatever
// it was an attempt we made to crawl this url
if ( ! m_isDiffbotJSONObject &&
! m_incrementedAttemptsCount ) {
// do not repeat
m_incrementedAttemptsCount = true;
// log debug
//log("build: attempted %s count=%"INT64"",m_firstUrl.getUrl(),
// cr->m_localCrawlInfo.m_pageDownloadAttempts);
// this is just how many urls we tried to index
//cr->m_localCrawlInfo.m_urlsConsidered++;
// avoid counting if it is a fake first ip
bool countIt = true;
// pagereindex.cpp sets this as does any add url (bulk job)
if ( m_sreqValid && m_sreq.m_fakeFirstIp )
countIt = false;
if ( countIt ) {
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
// changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
}
// need to save collection rec now during auto save
cr->m_needsSave = true;
// update this just in case we are the last url crawled
//int64_t now = gettimeofdayInMillisecondsGlobal();
//cr->m_diffbotCrawlEndTime = now;
}
bool status = true;
if ( ! g_errno ) status = indexDoc2 ( );
// blocked?
if ( ! status ) return false;
// done with no error?
bool success = true;
if ( g_errno ) success = false;
// if we were trying to spider a fakefirstip request then
// pass through because we lookup the real firstip below and
// add a new request as well as a reply for this one
if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) success = false;
if ( success ) return true;
// . ignore failed child docs like diffbot pages
// . they are getting EMALFORMEDSECTIONS
if ( m_isChildDoc ) {
log("build: done indexing child doc. error=%s. not adding "
"spider reply for %s",
mstrerror(g_errno),
m_firstUrl.m_url);
return true;
}
///
// otherwise, an internal error. we must add a SpiderReply
// to spiderdb to release the lock.
///
logErr:
if ( m_firstUrlValid && g_errno )
log("build: %s had internal error = %s. adding spider "
"error reply.",
m_firstUrl.m_url,mstrerror(g_errno));
else if ( g_errno )
log("build: docid=%"INT64" had internal error = %s. "
"adding spider error reply.",
m_docId,mstrerror(g_errno));
// seems like this was causing a core somehow...
if ( g_errno == ENOMEM )
return true;
// and do not add spider reply if shutting down the server
if ( g_errno == ESHUTTINGDOWN )
return true;
// i saw this on shard 9, how is it happening
if ( g_errno == EBADRDBID )
return true;
// if docid not found when trying to do a query reindex...
// this really shouldn't happen but i think we were adding
// additional SpiderRequests since we were using a fake first ip.
// but i have since fixed that code. so if the titlerec was not
// found when trying to do a force delete... it's not a temporary
// error and should not be retried. if we set indexCode to
// EINTERNALERROR it seems to be retried.
if ( g_errno == ENOTFOUND ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
// this should not be retried either. i am seeing it excessively
// retried from a
// "TitleRec::set: uncompress uncompressed size=-2119348471"
// error condition. it also said
// "Error spidering for doc http://www.... : Bad cached document"
if ( g_errno == EBADTITLEREC ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
// i've seen Multicast got error in reply from hostId 19 (msgType=0x22
// transId=496026 nice=1 net=default): Buf too small.
// so fix that with this
if ( g_errno == EBUFTOOSMALL ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
if ( g_errno == EBADURL ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
if ( g_errno == ENOTITLEREC ) {
m_indexCode = g_errno;
m_indexCodeValid = true;
}
// default to internal error which will be retried forever otherwise
if ( ! m_indexCodeValid ) {
m_indexCode = EINTERNALERROR;//g_errno;
m_indexCodeValid = true;
}
// if our spiderrequest had a fake "firstip" so that it could be
// injected quickly into spiderdb, then do the firstip lookup here
// and re-add the new spider request with that, and add the reply
// to the fake firstip request below.
if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) {
// at least get this if possible
int32_t *fip = getFirstIp();
if ( fip == (void *) -1 ) return false;
// error? g_errno will be changed if this is NULL
if ( ! fip ) {
log("build: error getting real firstip: %s",
mstrerror(g_errno));
m_indexCode = EINTERNALERROR;
m_indexCodeValid = true;
goto logErr;
}
// sanity log
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// sanity log
if ( *fip == 0 || *fip == -1 ) {
//
// now add a spider status doc for this so we know
// why a crawl might have failed to start
//
SafeBuf *ssDocMetaList = NULL;
// save this
int32_t saved = m_indexCode;
// make it the real reason for the spider status doc
m_indexCode = EDNSERROR;
// get the spiderreply ready to be added. false=del
ssDocMetaList =getSpiderStatusDocMetaList(NULL ,false);
// revert
m_indexCode = saved;
// error?
if ( ! ssDocMetaList ) return true;
// blocked?
if ( ssDocMetaList == (void *)-1 ) return false;
// need to alloc space for it too
char *list = ssDocMetaList->getBufStart();
int32_t len = ssDocMetaList->length();
//needx += len;
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;
char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error2 getting real firstip of "
"%"INT32" for "
"%s. Not adding new spider req. "
"spiderstatusdocsize=%"INT32, (int32_t)*fip,url,
m_addedStatusDocSize);
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
if ( ! m_metaList2.safeMemcpy ( list , len ) )
return true;
goto skipNewAdd1;
}
// store the new request (store reply for this below)
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
if ( ! m_metaList2.pushChar(rd) )
return true;
// store it here
SpiderRequest revisedReq;
// this fills it in
getRevisedSpiderRequest ( &revisedReq );
// and store that new request for adding
if ( ! m_metaList2.safeMemcpy (&revisedReq,
revisedReq.getRecSize()))
return true;
// make sure to log the size of the spider request
m_addedSpiderRequestSize = revisedReq.getRecSize();
m_addedSpiderRequestSizeValid = true;
}
skipNewAdd1:
SpiderReply *nsr = NULL;
// if only rebuilding posdb do not rebuild spiderdb
if ( m_useSpiderdb && ! m_addedSpiderReplySizeValid ) {
////
//
// make these fake so getNewSpiderReply() below does not block
//
////
nsr = getFakeSpiderReply ( );
// this can be NULL and g_errno set to ENOCOLLREC or something
if ( ! nsr )
return true;
//SafeBuf metaList;
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
if ( ! m_metaList2.pushChar( rd ) )
return true;
if ( ! m_metaList2.safeMemcpy ( (char *)nsr,nsr->getRecSize()))
return true;
m_addedSpiderReplySize = nsr->getRecSize();
m_addedSpiderReplySizeValid = true;
}
// for other errors like EBADTITLEREC we are not adding spider
// status docs, so add them here
/*
if ( ! m_addedStatusDocSizeValid ) {
SafeBuf *ssDocMetaList = NULL;
// if calling getSpiderStatusDocMetaList blocks then
// call addErrorStuffWrapper() to call msg4
//m_masterLoop = addErrorStuffWrapper();
//m_state = this;
// this uses m_indexCode to set it
// if this blocks it ends up calling m_masterLoop and
// re-entering this function with g_errno clear possibly
// so do we make it back here????? MDW
ssDocMetaList = getSpiderStatusDocMetaList(NULL ,false);
// error?
if ( ! ssDocMetaList ) return true;
// blocked?
if ( ssDocMetaList == (void *)-1 ) return false;
// need to alloc space for it too
char *list = ssDocMetaList->getBufStart();
int32_t len = ssDocMetaList->length();
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;
// also count it as a crawl attempt
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
if ( ! m_metaList2.safeMemcpy ( list , len ) )
return true;
}
*/
m_msg4Launched = true;
// display the url that had the error
logIt();
// log this for debug now
if ( nsr ) {
SafeBuf tmp;
nsr->print(&tmp);
log("xmldoc: added reply %s",tmp.getBufStart());
}
// clear g_errno
g_errno = 0;
// "cr" might have been deleted by calling indexDoc() above i think
// so use collnum here, not "cr"
if ( ! m_msg4.addMetaList ( m_metaList2.getBufStart() ,
m_metaList2.length() ,
m_collnum,//cr->m_coll ,
m_masterState , // state
m_masterLoop ,
m_niceness ) ) {
// spider hang bug
//if ( g_conf.m_testSpiderEnabled )
// logf(LOG_DEBUG,"build: msg4 meta add3 blocked"
// "msg4=0x%"XINT32"" ,(int32_t)&m_msg4);
m_msg4Waiting = true;
return false;
}
//logf(LOG_DEBUG,"build: msg4 meta add3 did NOT block" );
m_msg4Launched = false;
// all done
return true;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error and returns true
bool XmlDoc::indexDoc2 ( ) {
if ( g_isYippy ) return true;
// if anything blocks, this will be called when it comes back
if ( ! m_masterLoop ) {
m_masterLoop = indexDocWrapper;
m_masterState = this;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// do this before we increment pageDownloadAttempts below so that
// john's smoke tests, which use those counts, are not affected
if ( m_sreqValid &&
m_sreq.m_fakeFirstIp &&
// only do for add url, not for injects. injects expect
// the doc to be indexed while the browser waits. add url
// is really just adding the spider request and returning
// to the browser without delay.
! m_sreq.m_isInjecting &&
// not for page reindexes either!
! m_sreq.m_isPageReindex &&
// just add url
m_sreq.m_isAddUrl &&
// diffbot requests are ok though!
! strstr(m_sreq.m_url,"-diffbotxyz") ) {
m_indexCodeValid = true;
m_indexCode = EFAKEFIRSTIP;
return true;
}
// ensure that CollectionRec::m_globalCrawlInfo (spider stats)
// is at least 1 minute in sync with counts of
// all hosts in network. this returns false if it sent out requests
// to update the counts from all the hosts in the network, and
// when it updates CollectionRec::m_crawlInfoGlobal with all the
// requests from each hosts in the network it will call the
// specified callback, m_masterLoop with m_masterState. this code
// is all in Spider.cpp.
// this is now in a sleep wrapper in spider.cpp.
//setStatus ( "updating crawl info" );
//if ( ! g_errno &&
// ! updateCrawlInfo ( cr , m_masterState , m_masterLoop ) )
// return false;
// MDW: we do this in indexDoc() above why do we need it here?
/*
// even if not using diffbot, keep track of these counts
if ( ! m_isDiffbotJSONObject &&
! m_incrementedAttemptsCount ) {
// do not repeat
m_incrementedAttemptsCount = true;
// this is just how many urls we tried to index
//cr->m_localCrawlInfo.m_urlsConsidered++;
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
// need to save collection rec now during auto save
cr->m_needsSave = true;
// update this just in case we are the last url crawled
int64_t now = gettimeofdayInMillisecondsGlobal();
cr->m_diffbotCrawlEndTime = now;
}
*/
/*
// if we are being called from Spider.cpp and we met our max
// to crawl requirement, then bail out on this. this might
// become true when we are in the middle of processing this url...
if ( ! m_isDiffbotJSONObject &&
// this is just for this collection, from all hosts in network
cr->m_globalCrawlInfo.m_pageDownloadSuccesses >= //Attempts >=
cr->m_diffbotMaxToCrawl ) {
// set the code to badness
m_indexCode = EHITCRAWLLIMIT;//EABANDONED;
m_indexCodeValid = true;
log("diffbot: abandoning url because we hit crawl limit "
"of %"INT64". downloaded %"INT64". Disabling spiders."
,cr->m_diffbotMaxToCrawl
,cr->m_globalCrawlInfo.m_pageDownloadSuccesses
);
g_errno = m_indexCode;
// if spiders already off..
if ( ! cr->m_spideringEnabled ) return true;
// do not repeat call sendNotification()
cr->m_spideringEnabled = false;
// set this
m_emailInfo.reset();
m_emailInfo.m_finalCallback = m_masterLoop;
m_emailInfo.m_finalState = m_masterState;
m_emailInfo.m_collnum = m_collnum;
// note it
setStatus("sending notification");
// this returns false if it would block, so we return false
if ( ! sendNotification ( &m_emailInfo ) ) return false;
// it didn't block
g_errno = m_indexCode;
return true;
}
// likewise if we hit the max processing limit...
if ( ! m_isDiffbotJSONObject &&
cr->m_globalCrawlInfo.m_pageProcessSuccesses >= // Attempts >=
cr->m_diffbotMaxToProcess ) {
// set the code to badness
m_indexCode = EHITPROCESSLIMIT;//EABANDONED;
m_indexCodeValid = true;
log("diffbot: abandoning url because we hit process limit "
"of %"INT64". processed %"INT64". Disabling spiders."
, cr->m_diffbotMaxToProcess
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
);
g_errno = m_indexCode;
// if spiders already off...
if ( ! cr->m_spideringEnabled ) return true;
// turn them off and send notification (email or url)
cr->m_spideringEnabled = false;
// set this
m_emailInfo.reset();
m_emailInfo.m_finalCallback = m_masterLoop;
m_emailInfo.m_finalState = m_masterState;
m_emailInfo.m_collnum = m_collnum;
// note it
setStatus("sending notification");
// . this returns false if it would block, so we return false
// . this is now in PingServer.cpp
if ( ! sendNotification( &m_emailInfo ) ) return false;
// it didn't block
g_errno = m_indexCode;
return true;
}
*/
setStatus("indexing doc");
// maybe a callback had g_errno set?
if ( g_errno ) return true;
// before indexing this doc, index its inlinks it has according
// to ahrefs?
if ( m_downloadLevel == 1 && m_useAhrefs && ! m_doneWithAhrefs ) {
// do not repeat this call!
m_doneWithAhrefs = true;
// call it
if ( ! injectAhrefsLinks () ) return false;
}
if ( m_firstUrlValid && (m_firstUrl.isArc() || m_firstUrl.isWarc())) {
// this returns false if it would block and callback will be
// called
if ( ! indexWarcOrArc ( ) )
return false;
logIt();
// all done! no need to add the parent doc.
return true;
}
if ( isContainerDoc() ) {
// m_contentDelim should be set!
if ( ! indexContainerDoc () )
return false;
logIt();
// all done! no need to add the parent doc.
return true;
}
// . now get the meta list from it to add
// . returns NULL and sets g_errno on error
char *metaList = getMetaList ( );
// error?
if ( ! metaList ) {
// sanity check. g_errno must be set
if ( ! g_errno ) {
log("build: Error UNKNOWN error spidering. setting "
"to bad engineer.");
g_errno = EBADENGINEER;
//char *xx=NULL;*xx=0; }
}
log("build: Error spidering for doc %s: %s",
m_firstUrl.m_url,mstrerror(g_errno));
return true;
}
// did it block? return false if so, we will be recalled since
// we set m_masterLoop to indexDoc
if ( metaList == (char *) -1 ) return false;
// before we add the meta list let's updateTagdb()
//char *ret = updateTagdb();
// it returns NULL on error
//if ( ret == NULL ) return true;
// return false if it blocked
//if ( ret == (char *)-1 ) return false;
// . let's update tagdb's venue address default too
// . no. that is in getTitleRecBuf()
// must be valid
int32_t *indexCode = getIndexCode();
// NULL means error; return true with g_errno set
if ( ! indexCode ) return true;
// -1 means it blocked; we will be re-called via m_masterLoop
if ( indexCode == (void *)-1 ) return false;
// . check to make sure the parser is consistent so we can cleanly
// delete the various rdb records if we need to in the future solely
// based on the titleRec.
// . force = false
// . unless we force it, the test is only done at random intervals
// for performance reasons
if ( ! *indexCode ) doConsistencyTest ( false );
// ignore errors from that
g_errno = 0;
// unregister any sleep callback
if ( m_registeredSleepCallback ) {
g_loop.unregisterSleepCallback(m_masterState,indexDocWrapper2);
m_registeredSleepCallback = false;
}
//////////
// . add the doledb negative key quickly to our tree to avoid a
// respider because the msg4 doledb negative key is buffered by msg4
// . make it negative
// . well it should not be respidered because the lock is on it!!
// -- so let's comment this out
/////////
/*
key_t negative = m_doledbKey;
// make it negative
negative.n0 &= 0xfffffffffffffffeLL;
// . store it in our tree if we can
// . returns false and sets g_errno on error
// . i.e. g_errno == ETRYAGAIN
if ( ! m_addedNegativeDoledbRec &&
! g_doledb.m_rdb.addRecord(m_coll,(char *)&negative,
NULL,0,m_niceness)){
log("build: error trying to add to doledb: %s",
mstrerror(g_errno));
// set sleep wrapper
g_loop.registerSleepCallback(1000,m_masterState,
indexDocWrapper2,m_niceness);
// note it
m_registeredSleepCallback = true;
// sleep and retry
return false;
}
*/
// we did that
m_addedNegativeDoledbRec = true;
// now add it
if ( ! m_listAdded && m_metaListSize ) {
// only call this once
m_listAdded = true;
// show it for now
//printMetaList(m_metaList , m_metaList + m_metaListSize,NULL);
// test it
verifyMetaList ( m_metaList ,
m_metaList + m_metaListSize ,
false );
// do it
if ( ! m_msg4.addMetaList ( m_metaList ,
m_metaListSize ,
m_collnum,//cr->m_coll ,
m_masterState , // state
m_masterLoop ,
m_niceness ) ) {
// spider hang bug
if ( g_conf.m_testSpiderEnabled )
logf(LOG_DEBUG,"build: msg4 meta add blocked"
"msg4=0x%"PTRFMT"" ,(PTRTYPE)&m_msg4);
m_msg4Waiting = true;
return false;
}
// error with msg4? bail
if ( g_errno ) return logIt();
}
// make sure our msg4 is no longer in the linked list!
if (m_msg4Waiting && isInMsg4LinkedList(&m_msg4)){char *xx=NULL;*xx=0;}
if ( m_msg4Waiting && g_conf.m_testSpiderEnabled )
logf(LOG_DEBUG,"build: msg4=0x%"PTRFMT" returned"
,(PTRTYPE)&m_msg4);
// we are not waiting for the msg4 to return
m_msg4Waiting = false;
bool flush = false;
// no longer flush injections.
// TODO: pass in a flush flag with injection and flush in that
// case, but for now disable to make things faster. profiler
// indicates too much msg4 activity.
//if ( m_contentInjected ) flush = true;
//if ( m_sreqValid && m_sreq.m_isPageInject ) flush = true;
// to keep our qa runs consistent
if ( strcmp(cr->m_coll,"qatest123") == 0 ) flush = true;
if ( ! m_listAdded ) flush = false;
if ( m_listFlushed ) flush = false;
// HACK: flush it if we are injecting it in case the next thing we
// spider is dependent on this one
if ( flush ) {
// note it
setStatus ( "flushing msg4" );
// only do it once
m_listFlushed = true;
// do it
if ( ! flushMsg4Buffers ( m_masterState , m_masterLoop ) )
return false;
}
// . all done with that. core if we block i guess.
// . but what if we were not the function that set this to begin w/?
//m_masterLoop = NULL;
return logIt();
/*
// if not doing exact quotas, we're done
if ( ! cr->m_exactQuotas ) return logIt();
char *isIndexed = getIsIndexed();
// this means it blocked
if ( isIndexed == (char *)-1) { char *xx=NULL; *xx=0; }
// returns NULL with g_errno set
if ( isIndexed ) return logIt();
// otherwise, tell Msg36 to update our quota count for this site
// so we don't have to keep merging site: termlists
m_incCount = false;
m_decCount = false;
if ( m_indexCode ) m_decCount = true;
//if ( m_forceDelete ) m_decCount = true;
// fix for the exact quota bug found on eurekster collection. bug 229
// if we're not a new doc, then don't increment the count because
// we have been already counted as the old doc. MDW: i added the
// condition that if decCount is true we need to update the count!
if ( *isIndexed && ! m_decCount ) return logIt();
// if it is new and we are not adding it to the index then no need
// to update any quota count...
if ( ! *isIndexed && m_decCount ) return logIt();
// if not decrementing the count, must be incrementing it then!
if ( ! m_decCount ) m_incCount = true;
*/
// i am not using quotas, so disable this for now
/*
log(LOG_DEBUG,"build: inc'ing quota to REMOTE table "
"for termIdHost %"UINT64" termIdDom %"UINT64" for %s.",
m_msg16.m_termIdHost,m_msg16.m_termIdDom,m_url.getUrl());
setStatus ( "updating quota cache" );
// sanity checks
if ( m_msg16.m_termIdHost == 0 ) { char *xx = NULL; *xx = 0; }
if ( m_msg16.m_termIdDom == 0 ) { char *xx = NULL; *xx = 0; }
// . Msg36 gets the correct count from disk and puts it in cache. It
// doesn't try to increment or decrement the quotas in cache, because
// then it would have to be done on all twins, and also the correct
// split will have to be found.
// . Actually, we should only use the cache on one host to hold the
// sum of all splits. This will be the authority cache.
if ( ! m_updatedCounts ) {
// only call this once
m_updatedCounts = true;
// do it
if ( ! m_msg36.getTermFreq ( m_coll ,
0 , // maxAge
m_msg16.m_termIdHost ,
this ,
m_masterLoop ,
m_niceness ,
m_exactQuotas ,
m_incCount ,
m_decCount ,
false ))
// we blocked
return false;
// error?
if ( g_errno ) return logIt();
}
// add the second entry for domain
if ( ! m_updatedCounts2 ) {
// only call this once
m_updateCounts2 = true;
// do it
if ( ! m_msg36.getTermFreq ( m_coll ,
0 , // maxAge
m_msg16.m_termIdDom ,
this ,
doneAddingMsg36Entry2,
m_niceness ,
m_exactQuotas ,
m_incCount ,
m_decCount ,
false ))
// we blocked
return false;
// error?
if ( g_errno ) return logIt();
}
// that is it!
return logIt();
*/
}
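// . illustrative restatement of the blocking convention used throughout
//   indexDoc()/indexDoc2() above: any getX() helper that returns -1 has
//   registered m_masterLoop/m_masterState so the whole function can be
//   safely re-entered from the top when the data arrives. a minimal
//   sketch of that pattern (not new functionality):
/*
	if ( ! m_masterLoop ) {
		m_masterLoop  = indexDocWrapper; // re-entry point
		m_masterState = this;
	}
	char *metaList = getMetaList();
	if ( ! metaList )             return true;  // error, g_errno set
	if ( metaList == (char *)-1 ) return false; // blocked; the wrapper
	                                            // re-calls us later
*/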
bool isRobotsTxtFile ( char *u , int32_t ulen ) {
if ( ulen > 12 && ! strncmp ( u + ulen - 11 , "/robots.txt" , 11 ) )
return true;
return false;
}
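// . usage sketch for isRobotsTxtFile() above (urls are illustrative only):
/*
	char *u = "http://example.com/robots.txt";
	isRobotsTxtFile ( u , gbstrlen(u) );   // returns true
	char *v = "http://example.com/robots.txt.html";
	isRobotsTxtFile ( v , gbstrlen(v) );   // returns false
*/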
// does this doc consist of a sequence of smaller sub-docs?
// if so we'll index the subdocs and not the container doc itself.
bool XmlDoc::isContainerDoc ( ) {
if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
//if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
//if ( m_contentDelim ) return true;
if ( m_contentDelimValid && m_contentDelim ) return true;
return false;
}
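// . illustrative layout of a delimiter-separated container doc as handled
//   by indexContainerDoc() below. the delimiter string ("======" here) is
//   whatever was supplied with the inject request; when the subdocs have
//   mimes, a url line may follow each delimiter to name the subdoc:
/*
	======
	http://example.com/page1
	HTTP/1.0 200 OK
	Content-Type: text/html

	<html>first subdoc</html>
	======
	http://example.com/page2
	HTTP/1.0 200 OK
	...
*/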
// returns false if would block, true otherwise. returns true and sets g_errno on err
bool XmlDoc::indexContainerDoc ( ) {
if ( ! m_contentDelim ) {
log("build: can not index container doc. no delimeter.");
g_errno = EBADENGINEER;
return true;
}
// int8_t *hc = getHopCount();
// if ( ! hc ) return true; // error?
// if ( hc == (void *)-1 ) return false;
// first download
// in the case of a list of delimited http server replies let's
// not convert into utf8 here but just use as-is
char **cpp = getContent();//getUtf8Content();
// return true with g_errno set on error
if ( ! cpp ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return true;
}
// would block? return false then
if ( cpp == (void *)-1 )
return false;
// need this. it is almost 1MB in size, so alloc it
if ( ! m_msg7 ) {
try { m_msg7 = new ( Msg7 ); }
catch ( ... ) {
g_errno = ENOMEM;
return true;
}
mnew ( m_msg7 , sizeof(Msg7),"xdmsg7");
}
// inject input parms:
InjectionRequest *ir = &m_msg7->m_injectionRequest;
// the cursor for scanning the subdocs
if ( ! m_anyContentPtr ) {
// init the content cursor to point to the first subdoc
m_anyContentPtr = *cpp;
// but skip over an initial separator if present. that is a
// faux pas
int32_t dlen = gbstrlen(m_contentDelim);
if ( strncmp(m_anyContentPtr,m_contentDelim,dlen) == 0 )
m_anyContentPtr += dlen;
// init the input parms
memset ( ir , 0 , sizeof(InjectionRequest) );
// reset it
ir->m_spiderLinks = false;
ir->m_injectLinks = false;
ir->m_hopCount = 0;//*hc + 1;
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
ir->m_collnum = m_collnum;
// will this work on a content-delimited doc?
ir->m_deleteUrl = m_deleteFromIndex;
// subdocs have a mime only if the container says they do
ir->m_hasMime = m_subDocsHaveMime;//true;
}
subdocLoop:
QUICKPOLL ( m_niceness );
// EOF?
if ( m_anyContentPtr == (char *)-1 ) {
m_indexCode = 0;//m_warcError;
m_indexCodeValid = true;
return true;
}
// we had \0 terminated the end of the previous record, so put back
if ( m_savedChar && ! *m_anyContentPtr ) {
*m_anyContentPtr = m_savedChar;
m_anyContentPtr += gbstrlen(m_contentDelim);
}
// index this subdoc
ir->ptr_content = m_anyContentPtr;
// . should have the url as well.
// . the url, ip etc. are on a single \n terminated line for an arc!
char *separator = strstr(m_anyContentPtr,m_contentDelim);
if ( separator ) {
m_savedChar = *separator;
m_anyContentPtr = separator;
*m_anyContentPtr = '\0';
//ir->size_content = separator - ir->ptr_content;
}
// if no separator found, this is our last injection
if ( ! separator ) {
m_anyContentPtr = (char *)-1;
}
// these are not defined. will be autoset in set4() i guess.
ir->m_firstIndexed = 0;
ir->m_lastSpidered = 0;
bool setUrl = false;
// HOWEVER, if hasMime is true and an http:// follows
// the delimiter then use that as the url...
// this way we can specify our own urls.
if ( ir->m_hasMime ) {
char *du = ir->ptr_content;
//du += gbstrlen(delim);
if ( du && is_wspace_a ( *du ) ) du++;
if ( du && is_wspace_a ( *du ) ) du++;
if ( du && is_wspace_a ( *du ) ) du++;
if ( ir->m_hasMime &&
(strncasecmp( du,"http://",7) == 0 ||
strncasecmp( du,"https://",8) == 0 ) ) {
// flag it
setUrl = true;
// find end of it
char *uend = du + 7;
for ( ; *uend && ! is_wspace_a(*uend) ; uend++ );
// inject that then
m_injectUrlBuf.reset();
m_injectUrlBuf.safeMemcpy ( du , uend - du );
m_injectUrlBuf.nullTerm();
// and point to the actual http mime then,
// skipping the space right after the url
ir->ptr_content = uend + 1;
ir->ptr_url = m_injectUrlBuf.getBufStart();
ir->size_url = m_injectUrlBuf.length()+1; // include \0
// if (!strncmp(ir->ptr_url,"http://www.focusinfo.com/"
// "products/mxprodv" ,40) )
// log("hey");
}
}
QUICKPOLL ( m_niceness );
// make the url from parent url
// use hash of the content
int64_t ch64 = hash64n ( ir->ptr_content , 0LL );
// need this for an injection
ir->size_content = gbstrlen(ir->ptr_content) + 1;// improve this?
QUICKPOLL ( m_niceness );
if ( ! setUrl ) {
// reset it
m_injectUrlBuf.reset();
// by default append a -<ch64> to the provided url
m_injectUrlBuf.safePrintf("%s-%"UINT64"",
m_firstUrl.getUrl(),ch64);
ir->ptr_url = m_injectUrlBuf.getBufStart();
ir->size_url = m_injectUrlBuf.length()+1; // include \0
}
bool status = m_msg7->sendInjectionRequestToHost ( ir ,
m_masterState ,
m_masterLoop ) ;
// it would block, callback will be called later
if ( status )
return false;
QUICKPOLL ( m_niceness );
// error?
if ( g_errno ) {
log("build: index flatfile error %s",mstrerror(g_errno));
// returns true and sets g_errno on error
return true;
}
else
log("build: index flatfile did not block");
// loop it up
goto subdocLoop;
}
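// . illustrative sketch of the in-place splitting done above: the content
//   buffer is never copied per subdoc. instead the first byte of the next
//   delimiter is saved, overwritten with '\0' so ptr_content is a plain
//   c-string, and restored on the next pass. buf, delim and handleSubdoc()
//   below are hypothetical names for illustration only:
/*
	char *cursor = buf;   // start of the current subdoc
	char  saved  = 0;
	while ( cursor ) {
		// restore the byte we clobbered last pass and skip the delim
		if ( saved ) { *cursor = saved; cursor += gbstrlen(delim); }
		char *sep = strstr ( cursor , delim );
		if ( sep ) { saved = *sep; *sep = '\0'; }
		handleSubdoc ( cursor ); // cursor is now \0-terminated
		cursor = sep;            // NULL when no more subdocs remain
	}
*/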
void doneInjectingArchiveRec ( void *state ) {
Msg7 *THIS = (Msg7 *)state;
THIS->m_inUse = false;
XmlDoc *xd = THIS->m_stashxd;
xd->m_numInjectionsOut--;
log("build: archive: injection thread returned. %"INT32" out now.",
xd->m_numInjectionsOut);
// reset g_errno so it doesn't error out in ::indexDoc() when
// we are injecting a ton of these msg7s and then xmldoc ends up
// getting reset and when a msg7 reply comes back in, we core
g_errno = 0;
xd->m_masterLoop ( xd );
}
void doneReadingArchiveFileWrapper ( int fd, void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// . go back to the main entry function
// . make sure g_errno is clear from a msg3a g_errno before calling
// this lest it abandon the loop
THIS->m_masterLoop ( THIS->m_masterState );
}
#define MAXWARCRECSIZE 5000000
bool XmlDoc::readMoreWarc() {
// We read everything we can off the pipe in a sleep timer.
// When we have enough to start processing, we call the
// processing function.
// If reading gets too far ahead of the processing and we can
// no longer buffer the read, then we save the offset of what
// we processed, free the readbuffer and restart the pipe and
// skip until the offset we last processed
if(!m_calledWgetThread) {
m_pipe = getUtf8ContentInFile();
}
// return true with g_errno set on error
if ( ! m_pipe ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
log("We don't have the warc pipe.");
return true;
}
int64_t leftOver = 0;
int64_t skipAhead = 0;
// How much is unprocessed
if(m_fptr != m_fptrEnd) {
leftOver = m_fptrEnd - m_fptr;
}
if(leftOver < 0) {
// Happens when we skip a record which is too big
skipAhead = - leftOver;
leftOver = 0;
m_fptr = m_fileBuf;
m_fptrEnd = m_fileBuf;
}
// We don't want to be memmoving the buffer up for every single
// document we process so only do it when we need it.
if(leftOver > MAXWARCRECSIZE) return false;
int64_t bytesRemaining = m_fileBufAllocSize - (m_fptrEnd - m_fileBuf) - 1;
// Scoot up everything we haven't processed
if(bytesRemaining < MAXWARCRECSIZE) {
//log("scooting up by left over %"INT64, leftOver);
// count everything we've processed
m_bytesStreamed += m_fptr - m_fileBuf;
memmove(m_fileBuf, m_fptr, leftOver);
m_fptr = m_fileBuf;
m_fptrEnd = m_fileBuf + leftOver;
*m_fptrEnd = '\0';
bytesRemaining += leftOver;
}
int64_t toRead = m_fileBufAllocSize - leftOver - 1;
if(toRead > bytesRemaining) toRead = bytesRemaining;
if(toRead == 0) {
//log("build: not enough room to read, lets process the buffer" );
return false;
}
g_loop.disableTimer();
errno = 0;
int bytesRead = fread(m_fptrEnd, 1, toRead, m_pipe);
g_loop.enableTimer();
// if(bytesRead > 0) {
// log("build: warc pipe read %"INT32" more bytes of the pipe. errno = %s, buf space = %"INT64 " processed = %"INT64 " skipAhead=%"INT64,
// bytesRead, mstrerror(errno),toRead, m_bytesStreamed, skipAhead);
// }
if(bytesRead <= 0 && errno != EAGAIN) {
// if(errno == EAGAIN){
// log("build: fd is not ready, lets process the buffer" );
// return false;
// } else {
if(m_registeredWgetReadCallback) {
//log("build:came back from read callback");
g_loop.unregisterReadCallback(fileno(m_pipe), this,doneReadingArchiveFileWrapper);
m_registeredWgetReadCallback = false;
}
if(m_pipe) {
int32_t retCode = fclose(m_pipe);
if(retCode) {
log("we closed the pipe with error %s", mstrerror(retCode));
}
m_pipe = NULL;
}
//log("build: warc problem pipe terminated %s", mstrerror(errno));
m_hasMoreToRead = false;
return false;
// }
}
//m_fptr = m_fileBuf;
m_fptrEnd = m_fptrEnd + bytesRead;
*m_fptrEnd = '\0';
m_fptr += skipAhead;
return false;
}
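// . illustrative diagram of the streaming buffer readMoreWarc() manages.
//   m_fileBuf is a fixed (5*MAXWARCRECSIZE)+1 byte window over the wget
//   pipe; m_fptr is the parse cursor and m_fptrEnd marks the end of the
//   valid bytes:
/*
	m_fileBuf             m_fptr               m_fptrEnd
	| already parsed      | unparsed bytes     | free space        |'\0'
	+------------------------------------------------------------------+
	- when free space drops below MAXWARCRECSIZE the unparsed tail is
	  memmove()'d back to m_fileBuf and m_bytesStreamed is bumped by
	  the number of parsed bytes that were dropped
	- when a record is bigger than MAXWARCRECSIZE it is skipped by
	  letting m_fptr run past m_fptrEnd (the negative "leftOver" /
	  skipAhead case above)
*/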
// . returns false if would block, true otherwise.
// . returns true and sets g_errno on err
// . injectwarc
bool XmlDoc::indexWarcOrArc ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
if ( ! cr->m_indexWarcs ) {
g_errno = EDOCWARC;
return true;
}
// This can be a busy loop if we have max injections out but we
// are getting a read ready callback. Should we unregister
// when max injections are out and then reregister when we have room?
int32_t max = g_hostdb.m_numHosts * 2;
if ( max > MAXMSG7S ) max = MAXMSG7S;
if ( m_numInjectionsOut >= max ) return false;
char ctype;
if ( m_firstUrl.isWarc() ) {
ctype = CT_WARC;
} else {
ctype = CT_ARC;
}
int8_t *hc = getHopCount();
if ( ! hc ) return true; // error?
if ( hc == (void *)-1 ) return false;
if ( ! m_fileBuf ) {
// Do this exactly once.
m_fileBufAllocSize = (5 * MAXWARCRECSIZE) + 1;
m_fileBuf=(char *)mmalloc(m_fileBufAllocSize ,"sibuf");
m_fptr = m_fileBuf;
m_fptrEnd = m_fileBuf;
m_bytesStreamed = 0;
m_hasMoreToRead = true;
}
if ( ! m_fileBuf ) {
log("build: failed to alloc buf to read archive file %s",m_firstUrl.getUrl());
return true;
}
if(m_hasMoreToRead) readMoreWarc();
setStatus ("injecting archive records");
QUICKPOLL ( m_niceness );
// did an inject return?
if ( m_doneInjectingWarc ) {
warcDone:
// log("build: done parsing %"INT64" bytes of archive file %s. left over =%"INT32 "done injecting %"INT32 " hasmoretoread %"INT32,
// m_bytesStreamed + m_fptrEnd - m_fileBuf,
// m_firstUrl.getUrl(),
// (int32_t)(m_fptrEnd - m_fptr),
// (int32_t)m_doneInjectingWarc,
// (int32_t)m_hasMoreToRead);
m_doneInjectingWarc = true;
// return if all injects have returned.
if ( m_numInjectionsOut == 0) { // && !m_hasMoreToRead
g_errno = m_warcError;
m_indexCode = m_warcError;
m_indexCodeValid = true;
return true;
}
log("build: waiting for injection threads to return.");
// we would block
return false;
}
// Dup strings into here so we don't write nulls into our buffer, sometimes we have
// to rewind over a rec and we want the buf to be the same every time.
char scratchSpace[1024*10];
SafeBuf scratch(scratchSpace, 1024*10);
loop:
scratch.reset();
QUICKPOLL ( m_niceness );
if ( max > MAXMSG7S ) max = MAXMSG7S;
// wait for one to come back before launching another msg7
if ( m_numInjectionsOut >= max ) {
// Don't need to read anymore so don't call us
if(m_registeredWgetReadCallback && m_pipe && m_fptr < m_fptrEnd) {
g_loop.unregisterReadCallback(fileno(m_pipe), this,doneReadingArchiveFileWrapper);
m_registeredWgetReadCallback = false;
}
return false;
}
char *realStart = m_fptr;
// need at least say 100k for warc header
if ( m_fptr + 100000 > m_fptrEnd && m_hasMoreToRead ) {
//log("build need more of the record to process so sleeping.");
if(!m_registeredWgetReadCallback) {
if(!g_loop.registerReadCallback ( fileno(m_pipe),
this ,
doneReadingArchiveFileWrapper,
m_niceness )) {
log("build: failed to register warc read callback." );
return true;
}
log("build: reregistered the read callback. need more");
m_registeredWgetReadCallback = true;
}
return false;
}
int64_t recTime = 0;
char *recIp = NULL;
char *recUrl = NULL;
char *recContent = NULL;
int64_t recContentLen = 0;
// what we skip over
uint64_t recSize = 0;
//
// set recUrl, recIp, recTime, recContent, recContentLen and recSize
//
//log("buf size is %"INT64 " four chars %c%c%c%c%c%c",
//m_fptrEnd-m_fptr, m_fptr[0], m_fptr[1], m_fptr[2], m_fptr[3],m_fptr[4],m_fptr[5]);
if ( ctype == CT_WARC ) {
// find "WARC/1.0" or whatever
char *whp = m_fptr;
if( ! whp ) {
// FIXME: shouldn't get here with a NULL
log("build: No buffer for file=%s", m_firstUrl.getUrl());
goto warcDone;
}
// we do terminate last warc rec with \0 so be aware of that...
int32_t maxCount = 10;
for ( ; *whp && strncmp(whp,"WARC/",5) && --maxCount>0; whp++);
// none?
if ( ! *whp ) {
log("build: could not find WARC/1 header start for "
"file=%s", m_firstUrl.getUrl());
// we don't really need this and since we force the
// http reply to end in \0 before calling inject2() on
// it it gets messed up
goto warcDone;
}
char *warcHeader = whp;
// find end of warc mime HEADER not the content
char *warcHeaderEnd = strstr(warcHeader,"\r\n\r\n");
if ( ! warcHeaderEnd ) {
log("build: could not find end of WARC header for "
"file=%s.",
m_firstUrl.getUrl());
goto warcDone;
}
// \0 term for strstrs below
char tmp = *warcHeaderEnd;
*warcHeaderEnd = '\0';
char *warcLen = strstr(warcHeader,"Content-Length:");
char *warcUrl = strstr(warcHeader,"WARC-Target-URI:");
char *warcType = strstr(warcHeader,"WARC-Type:");
char *warcDate = strstr(warcHeader,"WARC-Date:");
char *warcIp = strstr(warcHeader,"WARC-IP-Address:");
char *warcCon = strstr(warcHeader,"Content-Type:");
// advance
if ( warcLen ) warcLen += 15;
if ( warcUrl ) warcUrl += 16;
if ( warcType ) warcType += 10;
if ( warcIp ) warcIp += 17;
if ( warcCon ) warcCon += 13;
if ( warcDate ) warcDate += 10;
// skip initial spaces
for ( ; warcUrl && is_wspace_a(*warcUrl ) ; warcUrl ++ );
for ( ; warcLen && is_wspace_a(*warcLen ) ; warcLen ++ );
for ( ; warcType && is_wspace_a(*warcType) ; warcType++ );
for ( ; warcDate && is_wspace_a(*warcDate) ; warcDate++ );
for ( ; warcIp && is_wspace_a(*warcIp ) ; warcIp ++ );
for ( ; warcCon && is_wspace_a(*warcCon ) ; warcCon ++ );
// get Content-Length: of WARC header for its content
if ( ! warcLen ) {
// this is a critical stop.
log("build: warc problem: could not find WARC Content-Length:");
goto warcDone;
}
//
// advance m_fptr to point to the next warc record in case we
// end up calling 'goto loop' below
//
recContent = warcHeaderEnd + 4;
recContentLen = atoll(warcLen);
//log("build content len was %"INT64, recContentLen);
char *warcContentEnd = recContent + recContentLen;
recSize = (warcContentEnd - realStart);
recUrl = warcUrl;
// point to the next warc record
m_fptr += recSize;
*warcHeaderEnd = tmp;
//log("skipping %"UINT64, recSize);
// advance the file offset to the next record as well
// get WARC-Type:
// revisit (if url was already done before)
// request (making a GET or DNS request)
// response (response to a GET or dns request)
// warcinfo (crawling parameters, robots: obey, etc)
// metadata (fetchTimeMs: 263, hopsFromSeed:P,outlink:)
if ( ! warcType ) {
log("build: could not find WARC-Type:");
goto loop;
}
//http://www.mpaa.org/Resources/5bec4ac9-a95e-443b-987b-bff6fb5455a9.pdf
// get Content-Type:
// application/warc-fields (fetch time, hops from seed)
// application/http; msgtype=request (the GET request)
// application/http; msgtype=response (the GET reply)
if ( ! warcCon ) {
log("build: could not find Content-Type:");
goto loop;
}
if ( ! warcUrl ) {
// no URI?
log("build: could not find url");
goto loop;
}
// if WARC-Type: is not response, skip it. so if it
// is a revisit then skip it i guess.
if ( strncmp ( warcType,"response", 8 ) != 0) {
//log("build: was not type response %s *****%s*****", warcUrl, warcType);
// read another warc record
goto loop;
}
// warcConType needs to be
// application/http; msgtype=response
if ( !(strncmp(warcCon,"application/http; msgtype=response",34) == 0 ||
strncmp(warcCon,"application/http;msgtype=response",33) == 0)) {
// read another warc record
//log("build: wrong content type %s ---%s---", warcUrl, warcCon);
goto loop;
}
recTime = 0;
if ( warcDate ) recTime = atotime ( warcDate );
recIp = warcIp;
}
// END WARC SPECIFIC PARSING
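	// . illustrative example of the kind of WARC record header parsed
	//   above (field values are made up):
	/*
		WARC/1.0
		WARC-Type: response
		WARC-Target-URI: http://example.com/page.html
		WARC-Date: 2014-03-27T05:10:27Z
		WARC-IP-Address: 93.184.216.34
		Content-Type: application/http; msgtype=response
		Content-Length: 25034
		<CRLF><CRLF>
		HTTP/1.1 200 OK   <- recContent starts here and runs
		...                  recContentLen bytes, followed by the
		                     next "WARC/1.0" record
	*/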
//
// set recUrl, recIp, recTime, recContent, recContentLen and recSize
//
if ( ctype == CT_ARC ) {
// find \n\nhttp://
char *whp = m_fptr;
for ( ; *whp ; whp++ ) {
if ( whp[0] != '\n' ) continue;
if ( strncmp(whp+1,"http://",7) == 0) break;
if ( strncmp(whp+1,"https://",8) == 0) break;
}
// none?
if ( ! *whp ) {
log("build: arc: could not find next \\nhttp:// in "
"arc file %s",m_firstUrl.getUrl());
goto warcDone;
}
char *arcHeader = whp;
// find end of arc header not the content
char *arcHeaderEnd = strstr(arcHeader+1,"\n");
if ( ! arcHeaderEnd ) {
log("build: warc problem: could not find end of ARC header. file=%s",
m_firstUrl.getUrl());
goto warcDone;
}
// \0 term for strstrs below
char tmp = *arcHeaderEnd;
*arcHeaderEnd = '\0';
char *arcContent = arcHeaderEnd + 1;
// parse arc header line
char *url = arcHeader + 1;
char *hp = url;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("build: warc problem: bad arc header 1.file=%s", m_firstUrl.getUrl());
goto warcDone;
}
url = scratch.pushStr(url, hp-url);
hp++;
char *ipStr = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("build: warc problem: bad arc header 2.file=%s", m_firstUrl.getUrl());
goto warcDone;
}
ipStr = scratch.pushStr(ipStr, hp - ipStr);
hp++;
char *timeStr = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("build: warc problem: bad arc header 3.file=%s", m_firstUrl.getUrl());
goto warcDone;
}
timeStr = scratch.pushStr(timeStr, hp - timeStr);
hp++;
char *arcConType = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {
log("build: warc problem: bad arc header 4.file=%s", m_firstUrl.getUrl());
goto warcDone;
}
arcConType = scratch.pushStr(arcConType, hp - arcConType);
hp++;
char *arcContentLenStr = hp;
// get arc content len
int64_t arcContentLen = atoll(arcContentLenStr);
char *arcContentEnd = arcContent + arcContentLen;
//uint64_t oldOff = s_off;
recSize = (arcContentEnd - realStart);
// point to the next arc record
m_fptr += recSize;
*arcHeaderEnd = tmp;
// advance the file offset to the next record as well
// arcConType needs to be indexable
int32_t ct = getContentTypeFromStr ( arcConType );
if ( ct != CT_HTML &&
ct != CT_TEXT &&
ct != CT_XML &&
ct != CT_PDF &&
ct != CT_XLS &&
ct != CT_PPT &&
ct != CT_PS &&
ct != CT_DOC &&
ct != CT_JSON ) {
// read another arc record
log("build: was not indexable response %s", arcConType);
goto loop;
}
// convert to timestamp
// this time structure, once filled, will help yield a time_t
struct tm t;
// DAY OF MONTH
t.tm_mday = atol2 ( timeStr + 6 , 2 );
// MONTH - tm_mon is 0-11 but the arc timestamp month is 01-12
t.tm_mon = atol2 ( timeStr + 4 , 2 ) - 1;
// YEAR - # of years since 1900
t.tm_year = atol2 ( timeStr , 4 ) - 1900 ;
// TIME
t.tm_hour = atol2 ( timeStr + 8 , 2 );
t.tm_min = atol2 ( timeStr + 10 , 2 );
t.tm_sec = atol2 ( timeStr + 12 , 2 );
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate to a time_t (utc) using timegm
recTime = timegm ( &t );
// set content as well
recContent = arcContent;
recContentLen = arcContentLen;
recUrl = url;
recIp = ipStr;
}
// END ARC SPECIFIC PARSING
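	// . illustrative example of the one-line ARC record header parsed
	//   above: space-separated url, ip, YYYYMMDDhhmmss fetch time,
	//   content type and content length (the values shown are made up):
	/*
	http://example.com/index.html 93.184.216.34 20061208123456 text/html 15327
	HTTP/1.1 200 OK   <- arcContent starts on the next line and runs
	...                  for arcContentLen bytes
	*/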
// must be http not dns:
// url must start with http:// or https://
// it's probably like WARC-Target-URI: dns:www.xyz.com
// so it is a dns response
if ( strncmp(recUrl,"http://" ,7) != 0 &&
strncmp(recUrl,"https://",8) != 0 )
goto loop;
// get length of it, null term it
char *recUrlEnd = recUrl;
for ( ; *recUrlEnd && ! is_wspace_a(*recUrlEnd) ; recUrlEnd++ );
int32_t recUrlLen = recUrlEnd - recUrl;
//*recUrlEnd = '\0';
// skip if robots.txt
if ( isRobotsTxtFile( recUrl , recUrlLen ) )
goto loop;
// how can there be no more to read?
if ( m_fptr > m_fptrEnd && ! m_hasMoreToRead ) {
log("build: warc problem: archive file %s exceeded file length.",
m_firstUrl.getUrl());
goto warcDone;
}
// if we fall outside of the current read buf, read next rec if too big
if ( m_fptr > m_fptrEnd && recSize > MAXWARCRECSIZE ) {
log("build: skipping archive file of %"INT64" "
"bytes which is too big",recSize);
if(!m_registeredWgetReadCallback) {
if(!g_loop.registerReadCallback ( fileno(m_pipe),
this ,
doneReadingArchiveFileWrapper,
m_niceness )) {
log("build: failed to register warc read callback." );
return true;
}
log("build: reregistered the read callback. skip bigrec");
m_registeredWgetReadCallback = true;
}
return false;
}
// don't read the next record, read THIS one again, we can fit it
if ( m_fptr > m_fptrEnd ) {
//log("build: record end is past the end of what we read by %"INT64 " %"UINT64, m_fptrEnd - m_fptr, recSize);
m_fptr -= recSize;
if(!m_registeredWgetReadCallback) {
if(!g_loop.registerReadCallback ( fileno(m_pipe),
this ,
doneReadingArchiveFileWrapper,
m_niceness )) {
log("build: failed to register warc read callback." );
return true;
}
log("build: reregistered the read callback. reread this record");
m_registeredWgetReadCallback = true;
}
return false;
}
char *httpReply = recContent;
int64_t httpReplySize = recContentLen;
// should be a mime that starts with GET or POST
HttpMime m;
if ( ! m.set ( httpReply , httpReplySize , NULL ) ) {
log("build: archive: failed to set http mime at in "
"file");
goto loop;
}
// check content type
int ct2 = m.getContentType();
if ( ct2 != CT_HTML &&
ct2 != CT_TEXT &&
ct2 != CT_XML &&
ct2 != CT_PDF &&
ct2 != CT_XLS &&
ct2 != CT_PPT &&
ct2 != CT_PS &&
ct2 != CT_DOC &&
ct2 != CT_JSON ) {
//log("build:got wrong type %"INT32, (int32_t)ct2);
goto loop;
}
// grab an available msg7
Msg7 *msg7 = NULL;
for ( int32_t i = 0 ; i < MAXMSG7S ; i++ ) {
msg7 = m_msg7s[i];
// if we got an available one stop
if ( msg7 ) {
if( msg7->m_inUse ) continue;
break; // reuse this one.
}
// ok, create one, 1MB each about
try { msg7 = new ( Msg7 ); }
catch ( ... ) {g_errno=ENOMEM;m_warcError=g_errno;return true;}
mnew ( msg7 , sizeof(Msg7),"xdmsgs7");
// store it for re-use
m_msg7s[i] = msg7;
break;
}
if(!msg7 || msg7->m_inUse) {
// shouldn't happen, but it does... why?
log("build: archive: Ran out of msg7s to inject doc.");
return false;
}
// inject input parms:
InjectionRequest *ir = &msg7->m_injectionRequest;
// reset it
ir->m_hopCount = *hc + 1;
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
ir->m_collnum = m_collnum;
// will this work on a content-delimited doc?
ir->m_deleteUrl = m_deleteFromIndex;
// each subdoc will have a mime since it is a warc
ir->m_hasMime = true;
// it has a mime so we shouldn't need to set this
ir->ptr_contentTypeStr = NULL;
// we are injecting a single page, not a container file
ir->ptr_contentDelim = NULL;
// miscellaneous. faster than memsetting the whole gr class (32k)
ir->m_getSections = 0;
ir->m_gotSections = 0;
ir->m_spiderLinks = false;
ir->m_injectLinks = false;
ir->m_shortReply = false;
ir->m_newOnly = false;
ir->m_recycle = false;
ir->m_dedup = true;
ir->m_doConsistencyTesting = false;
ir->m_charset = 0;
ir->ptr_queryToScrape = NULL;
ir->ptr_contentFile = NULL;
ir->ptr_diffbotReply = NULL;
// Stick the capture date in the metadata
StackBuf(newKey);
newKey.safePrintf("\"gbcapturedate\":%"INT64, recTime);
SafeBuf newMetadata(newKey.length() * 2 + size_metadata, "ModifiedMetadata");
newMetadata.safeMemcpy(ptr_metadata, size_metadata);
Json::prependKey(newMetadata, newKey.getBufStart());
ir->ptr_metadata = newMetadata.getBufStart();
ir->size_metadata = newMetadata.length();
newMetadata.nullTerm();
// set 'timestamp' for injection
ir->m_firstIndexed = recTime;
ir->m_lastSpidered = recTime;
// set 'ip' for injection
ir->m_injectDocIp = 0;
// get the record IP address from the warc header if there
if ( recIp ) {
// get end of ip
char *ipEnd = recIp;
// skip digits and periods
while ( *ipEnd && ! is_wspace_a(*ipEnd) ) ipEnd++;
// we now have the ip address for doing ip: searches
// this func is in ip.h
ir->m_injectDocIp = atoip ( recIp, ipEnd-recIp );
}
// we end up repopulating m_fileBuf to read the next warc sometimes
// so do not destroy the content we are injecting from the original
// m_fileBuf. so we have to copy it.
msg7->m_contentBuf.reset();
msg7->m_contentBuf.reserve ( httpReplySize + 5 );
msg7->m_contentBuf.safeMemcpy ( httpReply , httpReplySize );
msg7->m_contentBuf.nullTerm();
// set 'content' for injection
ir->ptr_content = msg7->m_contentBuf.getBufStart();
ir->size_content = msg7->m_contentBuf.getLength() + 1;
// set the rest of the injection parms
ir->m_hopCount = -1;
ir->m_newOnly = 0;
// all warc records have the http mime
ir->m_hasMime = true;
ir->ptr_url = recUrl;
ir->size_url = recUrlLen+1;
// stash this
msg7->m_stashxd = this;
QUICKPOLL ( m_niceness );
// log it
*recUrlEnd = '\0';
log("build: archive: injecting archive url %s",recUrl);
QUICKPOLL ( m_niceness );
if (msg7->sendInjectionRequestToHost(ir,msg7,doneInjectingArchiveRec)){
m_numInjectionsOut++;
msg7->m_inUse = true;
goto loop;
}
log("build: index archive: msg7 inject: %s",
mstrerror(g_errno));
goto loop;
}
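// . illustrative summary of the indexWarcOrArc() state machine above,
//   since the function is re-entered many times through m_masterLoop:
/*
	indexWarcOrArc():
	  readMoreWarc()          - top up m_fileBuf from the wget pipe
	  if done injecting and m_numInjectionsOut == 0 -> return true
	loop:
	  parse the next WARC/ARC header at m_fptr
	    - not a "response" record, robots.txt, bad content type
	      -> goto loop
	    - record runs past m_fptrEnd -> register the read callback and
	      return false (re-entered when more bytes arrive)
	  copy the http reply into msg7->m_contentBuf and call
	  sendInjectionRequestToHost() -> m_numInjectionsOut++, goto loop

	  doneInjectingArchiveRec() decrements m_numInjectionsOut and
	  re-enters this function through m_masterLoop
*/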
void getTitleRecBufWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "in get title rec wrapper" );
// return if it blocked
if ( THIS->getTitleRecBuf() == (void *)-1 ) return;
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
key_t *XmlDoc::getTitleRecKey() {
if ( m_titleRecBufValid ) return &m_titleRecKey;
SafeBuf *tr = getTitleRecBuf();
if ( ! tr || tr == (void *)-1 ) return (key_t *)tr;
return &m_titleRecKey;
}
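// . illustrative note on the tri-state return convention used by nearly
//   every getX() accessor in this class (getTitleRecBuf, getIndexCode,
//   getIp, getCharset, ...): NULL means error with g_errno set, -1 means
//   the call blocked and m_masterLoop will be invoked later, anything
//   else is the valid, cached value. a minimal caller sketch:
/*
	int32_t *ip = getIp();
	if ( ! ip                ) return NULL;       // error, g_errno set
	if ( ip == (int32_t *)-1 ) return (void *)-1; // blocked; we will be
	                                              // re-entered later
	// *ip is valid here and m_ipValid is true
*/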
int32_t *XmlDoc::getIndexCode ( ) {
int32_t *indexCode = getIndexCode2();
if ( ! indexCode || indexCode == (void *)-1 ) return indexCode;
// if zero good!
if ( *indexCode == 0 ) return indexCode;
//
// should we neutralize it?
//
// in the case of indexing dmoz urls output from
// 'dmozparse urldump -s', the dump includes a meta tag
// (<meta name=ignorelinksexternalerrors content=1>) that
// indicates to index the links even in the case of some errors,
// so that we are assured of having exactly the same urls dmoz
// has in our index. so when we do a gbcatid:xxx query we get the
// same urls in the search results that dmoz has for that category id.
if ( ! m_sreqValid || ! m_sreq.m_ignoreExternalErrors )
return indexCode;
// only neutralize certain errors
if ( * indexCode != EDNSTIMEDOUT
&& *indexCode != ETCPTIMEDOUT
&& *indexCode != EUDPTIMEDOUT
// from m_redirError
&& *indexCode != EDOCSIMPLIFIEDREDIR
&& *indexCode != EDOCNONCANONICAL
&& *indexCode != EDNSDEAD
&& *indexCode != ENETUNREACH
&& *indexCode != EHOSTUNREACH
&& *indexCode != EDOCFILTERED
&& *indexCode != EDOCREPEATSPAMMER
&& *indexCode != EDOCDUP
&& *indexCode != EDOCISERRPG
&& *indexCode != EDOCHIJACKED
&& *indexCode != EDOCBADHTTPSTATUS
&& *indexCode != EDOCDISALLOWED
&& *indexCode != EBADCHARSET
&& *indexCode != EDOCDUPWWW
&& *indexCode != EBADIP
&& *indexCode != EDOCEVILREDIRECT // fix video.google.com dmoz
&& *indexCode != EBADMIME
// index.t and .exe files are in dmoz but those
// extensions are "bad" according to Url::isBadExtension()
&& *indexCode != EDOCBADCONTENTTYPE
// repeat url path components are ok:
&& *indexCode != ELINKLOOP
&& *indexCode != ECONNREFUSED
// malformed sections:
&& *indexCode != EDOCBADSECTIONS
&& *indexCode != ECORRUPTHTTPGZIP
)
return indexCode;
// ok, neutralize it
*indexCode = 0;
// if we could not get an ip we need to make a fake one
if ( ! m_ipValid || m_ip == 0 || m_ip == -1 ) {
log("build: ip unattainable. forcing ip address of %s "
"to 10.5.123.45",m_firstUrl.m_url);
m_ip = atoip("10.5.123.45");
m_ipValid = true;
}
// make certain things valid to avoid core in getNewSpiderReply()
if ( ! m_crawlDelayValid ) {
m_crawlDelayValid = true;
m_crawlDelay = -1;
}
return indexCode;
}
// . return NULL and sets g_errno on error
// . returns -1 if blocked
int32_t *XmlDoc::getIndexCode2 ( ) {
// return it now if we got it already
if ( m_indexCodeValid ) return &m_indexCode;
setStatus ( "getting index code");
// page inject can set deletefromindex to true
if ( m_deleteFromIndex ) {
m_indexCode = EDOCFORCEDELETE;
m_indexCodeValid = true;
return &m_indexCode;
}
// . internal callback
// . so if any of the functions we end up calling directly or
// indirectly block and return -1, we will be re-called from the top
//if ( ! m_masterLoop ) {
// m_masterLoop = getTitleRecWrapper;
// m_masterState = this;
//}
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
if ( m_firstUrl.m_ulen <= 5 ) {
m_indexCode = EBADURL;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( m_firstUrl.m_ulen + 1 >= MAX_URL_LEN ) {
m_indexCode = EURLTOOLONG;
m_indexCodeValid = true;
return &m_indexCode;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// "url is repeating path components" error?
if ( ! m_check1 ) {
m_check1 = true;
if ( cr->m_isCustomCrawl == 0 && m_firstUrl.isLinkLoop() ) {
m_indexCode = ELINKLOOP;
m_indexCodeValid = true;
return &m_indexCode;
}
}
// fix for "http://.xyz.com/...."
if ( m_firstUrl.m_host && m_firstUrl.m_host[0] == '.' ) {
m_indexCode = EBADURL;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( cr->m_doUrlSpamCheck && ! m_check2 ) {
m_check2 = true;
if ( m_firstUrl.isSpam() ) {
m_indexCode = EDOCURLSPAM;
m_indexCodeValid = true;
return &m_indexCode;
}
}
// . don't spider robots.txt urls for indexing!
// . quickly see if we are a robots.txt url originally
int32_t fulen = getFirstUrl()->getUrlLen();
char *fu = getFirstUrl()->getUrl();
char *fp = fu + fulen - 11;
if ( fulen > 12 &&
fp[1] == 'r' &&
! strncmp ( fu + fulen - 11 , "/robots.txt" , 11 )) {
m_indexCode = EBADURL;
m_indexCodeValid = true;
return &m_indexCode;
}
// if this is an injection and "newonly" is not zero then we
// only want to do the injection if the url is "new", meaning not
// already indexed. "m_wasContentInjected" will be true if this is
// an injection. "m_newOnly" will be true if the injector only
// wants to proceed with the injection if this url is not already
// indexed.
if ( m_wasContentInjected && m_newOnly ) {
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod;
XmlDoc *od = *pod;
// if the old doc does exist and WAS NOT INJECTED itself
// then abandon this injection. it was spidered the old
// fashioned way and we want to preserve it and NOT overwrite
// it with this injection.
if ( od && ! od->m_wasContentInjected ) {
m_indexCode = EABANDONED;
m_indexCodeValid = true;
return &m_indexCode;
}
// if it was injected itself, only abandon this injection
// in the special case that m_newOnly is "1". otherwise
// if m_newOnly is 2 then we will overwrite any existing
// titlerecs that were not injected themselves.
if ( od && od->m_wasContentInjected && m_newOnly == 1 ) {
m_indexCode = EABANDONED;
m_indexCodeValid = true;
return &m_indexCode;
}
}
// need tagrec to see if banned
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
// this is an automatic ban!
if ( gr->getLong("manualban",0) ) {
m_indexCode = EDOCBANNED;
m_indexCodeValid = true;
return &m_indexCode;
}
// get the ip of the current url
int32_t *ip = getIp ( );
if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip;
if ( *ip == 0 ) {
m_indexCode = EBADIP;
m_indexCodeValid = true;
return &m_indexCode;
}
// . check robots.txt
	// . uses the current url
// . if we end in /robots.txt then this quickly returns true
// . no, we still might want to index if we got link text, so just
// check this again below
bool *isAllowed = getIsAllowed();
if ( ! isAllowed || isAllowed == (void *)-1) return (int32_t *)isAllowed;
/*
if ( ! *isAllowed ) {
m_indexCode = EDOCDISALLOWED;
m_indexCodeValid = true;
return &m_indexCode;
}
*/
// . TCPTIMEDOUT, NOROUTETOHOST, EDOCUNCHANGED, etc.
// . this will be the reply from diffbot.com if using diffbot
int32_t *dstatus = getDownloadStatus();
if ( ! dstatus || dstatus == (void *)-1 ) return (int32_t *)dstatus;
if ( *dstatus ) {
m_indexCode = *dstatus;
m_indexCodeValid = true;
return &m_indexCode;
}
// check the mime
HttpMime *mime = getMime();
if ( ! mime || mime == (HttpMime *)-1 ) return (int32_t *)mime;
// no, now the smart compression will nuke a reply if it has
// no good date or for other reasons...
// if empty, bad mime
//if ( mime->getMimeLen() <= 0 && ! m_recycleContent ) {
// m_indexCode = EBADMIME;
// m_indexCodeValid = true;
// return &m_indexCode;
//}
// check redir url
Url **redirp = getRedirUrl();
if ( ! redirp || redirp == (void *)-1 ) return (int32_t *)redirp;
// this must be valid now
if ( ! m_redirErrorValid ) { char *xx=NULL;*xx=0; }
if ( m_redirError ) {
m_indexCode = m_redirError;
m_indexCodeValid = true;
return &m_indexCode;
}
int64_t *d = getDocId();
if ( ! d || d == (void *)-1 ) return (int32_t *)d;
if ( *d == 0LL ) {
m_indexCode = ENODOCID;
m_indexCodeValid = true;
return &m_indexCode;
}
// . is the same url but with a www. present already in titledb?
// . example: if we are xyz.com and www.xyz.com is already in titledb
// then nuke ourselves by setting m_indexCode to EDOCDUPWWW
char *isWWWDup = getIsWWWDup ();
if ( ! isWWWDup || isWWWDup == (char *)-1) return (int32_t *)isWWWDup;
if ( *isWWWDup ) {
m_indexCode = EDOCDUPWWW;
m_indexCodeValid = true;
return &m_indexCode;
}
uint16_t *charset = getCharset();
if ( ! charset && g_errno == EBADCHARSET ) {
g_errno = 0;
m_indexCode = EBADCHARSET;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( ! charset || charset == (void *)-1) return (int32_t *)charset;
// we had a 2024 for charset come back and that had a NULL
// get_charset_str() but it was not supported
if ( ! supportedCharset(*charset) ) { //&&get_charset_str(*charset) ) {
m_indexCode = EBADCHARSET;
m_indexCodeValid = true;
return &m_indexCode;
}
// get local link info
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int32_t *)info1;
// get remote link info
LinkInfo **pinfo2 = getLinkInfo2();
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (int32_t *)pinfo2;
LinkInfo *info2 = *pinfo2;
// if robots.txt said no, and if we had no link text, then give up
bool disallowed = true;
if ( *isAllowed ) disallowed = false;
if ( info1 && info1->hasLinkText() ) disallowed = false;
if ( info2 && info2->hasLinkText() ) disallowed = false;
// if we generated a new sitenuminlinks to store in tagdb, we might
// want to add this for that only reason... consider!
if ( disallowed ) {
m_indexCode = EDOCDISALLOWED;
m_indexCodeValid = true;
return &m_indexCode;
}
// check for bad url extension, like .jpg
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (int32_t *)cu;
// take this check out because it is hurting
// http://community.spiceworks.com/profile/show/Mr.T
// because 't' was in the list of bad extensions.
// now we use the url filters table to exclude the extensions we want.
// and we use the 'ismedia' directive to exclude common media
// extensions. having this check here is no longer needed and confusing
	// BUT on the other hand stuff like .exe .rpm .deb is good to avoid!
// so i'll just edit the list to remove more ambiguous extensions
// like .f and .t
bool badExt = cu->isBadExtension ( m_version );
if ( badExt && ! info1->hasLinkText() &&
( ! info2 || ! info2->hasLinkText() ) ) {
m_indexCode = EDOCBADCONTENTTYPE;
m_indexCodeValid = true;
return &m_indexCode;
}
int16_t *hstatus = getHttpStatus();
if ( ! hstatus || hstatus == (void *)-1 ) return (int32_t *)hstatus;
if ( *hstatus != 200 ) {
m_indexCode = EDOCBADHTTPSTATUS;
m_indexCodeValid = true;
return &m_indexCode;
}
// debug point
//if ( cr->m_localCrawlInfo.m_pageDownloadAttempts >= 2 ) {
// m_indexCode = ETCPTIMEDOUT;
// m_indexCodeValid = true;
// return &m_indexCode;
//}
// if this page is hijacked, toss it!
char *hj = getIsHijacked();
if ( ! hj || hj == (char *)-1 ) return (int32_t *)hj;
// if not allowed m_indexCode will be set
if ( *hj ) {
m_indexCode = EDOCHIJACKED;
m_indexCodeValid = true;
return &m_indexCode;
}
// check for EDOCISERRPG (custom error pages)
char *isErrorPage = getIsErrorPage();
if ( !isErrorPage||isErrorPage==(void *)-1) return (int32_t *)isErrorPage;
if ( *isErrorPage ) {
m_indexCode = EDOCISERRPG;
m_indexCodeValid = true;
return &m_indexCode;
}
// . i moved this up to perhaps fix problems of two dup pages being
// downloaded at about the same time
// . are we a dup of another doc from any other site already indexed?
char *isDup = getIsDup();
if ( ! isDup || isDup == (char *)-1 ) return (int32_t *)isDup;
if ( *isDup ) {
m_indexCode = EDOCDUP;
m_indexCodeValid = true;
return &m_indexCode;
}
	// . is a non-canonical page that has <link href=xxx rel=canonical>
	// . also sets m_canonicalUrl.m_url to it if we are not
// . returns NULL if we are the canonical url
// . do not do this check if the page was injected
bool checkCanonical = true;
if ( m_wasContentInjected ) checkCanonical = false;
if ( m_isInjecting && m_isInjectingValid ) checkCanonical = false;
// do not do canonical deletion if recycling content either i guess
if ( m_sreqValid && m_sreq.m_recycleContent ) checkCanonical = false;
// do not delete from being canonical if doing a query reindex
if ( m_sreqValid && m_sreq.m_isPageReindex ) checkCanonical = false;
if ( checkCanonical ) {
Url **canon = getCanonicalRedirUrl();
if ( ! canon || canon == (void *)-1 ) return (int32_t *)canon;
		// if there is one then we are its leaf, it is the primary
// page so we should not index ourselves
if ( *canon ) {
m_indexCode = EDOCNONCANONICAL;
m_indexCodeValid = true;
return &m_indexCode;
}
}
// was page unchanged since last time we downloaded it?
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod;
XmlDoc *od = NULL;
if ( *pod ) od = *pod;
// if recycling content is true you gotta have an old title rec.
if ( ! od && m_recycleContent ) {
m_indexCode = ENOTITLEREC;
m_indexCodeValid = true;
return &m_indexCode;
}
bool check = true;
if ( ! od ) check = false;
// do not do this logic for diffbot because it might want to get
// the diffbot reply even if page content is the same, because it
// might have an ajax call that updates the product price.
// onlyProcessIfNewUrl defaults to true, so typically even diffbot
// crawls will do this check.
if ( cr->m_isCustomCrawl && ! cr->m_diffbotOnlyProcessIfNewUrl &&
// but allow urls like *-diffbotxyz2445187448 to be deduped,
// that is the whole point of this line
! m_isDiffbotJSONObject )
check = false;
if ( m_sreqValid && m_sreq.m_ignoreDocUnchangedError )
check = false;
// or if recycling content turn this off as well! otherwise
// it will always be 100% the same
if ( m_recycleContent )
check = false;
// never check for a bulk job
if ( cr->m_isCustomCrawl == 2 )
check = false;
if ( check ) {
// check inlinks now too!
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 )
return (int32_t *)info1;
LinkInfo *info2 = od->getLinkInfo1 ();
if ( ! info2 || info2 == (LinkInfo *)-1 )
return (int32_t *)info2;
Inlink *k1 = NULL;
Inlink *k2 = NULL;
char *s1, *s2;
int32_t len1,len2;
if ( info1->getNumGoodInlinks() !=
info2->getNumGoodInlinks() )
goto changed;
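		// walk the old and new inlink lists in lockstep; a change in
		// any inlink's siteNumInlinks or link text means the doc's
		// link info changed, so treat the whole doc as changed.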
for ( ; k1=info1->getNextInlink(k1) ,
k2=info2->getNextInlink(k2); ) {
if ( ! k1 )
break;
if ( ! k2 )
break;
if ( k1->m_siteNumInlinks != k2->m_siteNumInlinks )
goto changed;
s1 = k1->getLinkText();
len1 = k1->size_linkText - 1; // exclude \0
s2 = k2->getLinkText();
len2 = k2->size_linkText - 1; // exclude \0
if ( len1 != len2 )
goto changed;
if ( len1 > 0 && memcmp(s1,s2,len1) != 0 )
goto changed;
}
// no change in link text, look for change in page content now
int32_t *ch32 = getContentHash32();
if ( ! ch32 || ch32 == (void *)-1 ) return (int32_t *)ch32;
if ( *ch32 == od->m_contentHash32 ) {
m_indexCode = EDOCUNCHANGED;
m_indexCodeValid = true;
// hack these values on or off.
// really should be function calls.
// but it never gets set when it should if the
// doc is unchanged.
m_sentToDiffbot = od->m_sentToDiffbot;
return &m_indexCode;
}
}
changed:
// words
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (int32_t *)words;
// we set the D_IS_IN_DATE flag for these bits
Bits *bits = getBits(); if ( ! bits ) return NULL;
// . check for date buffer overflow before setting sections
// . returns false and sets g_errno on error
/*
if ( ! m_dates.parseDates ( words , DF_FROM_BODY , bits )) {
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("doc: parseDates: %s",mstrerror(g_errno));
// this just means we ran out of stack space to parse
// out all the dates, so ignore and continue... that way
// Spider.cpp does not give up and keep retrying us over
// and over again
if ( g_errno != EBUFOVERFLOW ) return NULL;
g_errno = 0;
m_indexCode = EDOCBADDATES;
m_indexCodeValid = true;
return &m_indexCode;
}
*/
// bad sections? fixes http://www.beerexpedition.com/northamerica.shtml
// being continuously respidered when its lock expires every
// MAX_LOCK_AGE seconds
Sections *sections = getSections();
// on EBUFOVERFLOW we will NEVER be able to parse this url
// correctly so do not retry!
if ( ! sections && g_errno == EBUFOVERFLOW ) {
g_errno = 0;
m_indexCode = EBUFOVERFLOW;
m_indexCodeValid = true;
return &m_indexCode;
}
if (!sections||sections==(Sections *)-1) return (int32_t *)sections;
if ( sections->m_numSections == 0 && words->m_numWords > 0 ) {
m_indexCode = EDOCBADSECTIONS;
m_indexCodeValid = true;
return &m_indexCode;
}
// i think an oom error is not being caught by Sections.cpp properly
if ( g_errno ) { char *xx=NULL;*xx=0; }
Dates *dp = getDates();
if ( ! dp && g_errno == EBUFOVERFLOW ) {
g_errno = 0;
m_indexCode = EBUFOVERFLOW;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( ! dp || dp == (Dates *)-1 ) return (int32_t *)dp;
// make sure address buffers did not overflow
Addresses *aa = getAddresses ();
if ( (! aa && g_errno == EBUFOVERFLOW) ||
// it sets m_breached now if there's a problem
(aa && aa->m_breached) ) {
g_errno = 0;
m_indexCode = EBUFOVERFLOW;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( ! aa || aa == (void *)-1 ) return (int32_t *)aa;
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (int32_t *)isRoot;
// get the tag rec
//TagRec *gr = getTagRec ();
//if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
bool spamCheck = true;
// if we are a root, allow repeat spam
if ( *isRoot ) spamCheck = false;
// if we are being spidered deep, allow repeat spam
if ( gr->getLong("deep",0) ) spamCheck = false;
// not for crawlbot
if ( cr->m_isCustomCrawl ) spamCheck = false;
// only html for now
if ( m_contentTypeValid && m_contentType != CT_HTML ) spamCheck =false;
// turn this off for now
spamCheck = false;
// otherwise, check the weights
if ( spamCheck ) {
char *ws = getWordSpamVec();
if ( ! ws || ws == (void *)-1 ) return (int32_t *)ws;
if ( m_isRepeatSpammer ) {
m_indexCode = EDOCREPEATSPAMMER;
m_indexCodeValid = true;
return &m_indexCode;
}
}
// validate this here so getSpiderPriority(), which calls
// getUrlFilterNum(), which calls getNewSpiderReply(), which calls
// us, getIndexCode() does not repeat all this junk
//m_indexCodeValid = true;
//m_indexCode = 0;
// fix query reindex on global-index from coring because
// the spider request is null
if ( m_isDiffbotJSONObject ) {
m_indexCode = 0;
m_indexCodeValid = true;
return &m_indexCode;
}
// this needs to be last!
int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1) {
// allow this though
if ( g_errno == EBUFOVERFLOW ) {
g_errno = 0;
m_indexCode = EBUFOVERFLOW;
m_indexCodeValid = true;
return &m_indexCode;
}
// but if it blocked, then un-validate it
m_indexCodeValid = false;
// and return to be called again i hope
return (int32_t *)priority;
}
if ( *priority == -3 ) { // SPIDER_PRIORITY_FILTERED ) {
m_indexCode = EDOCFILTERED;
m_indexCodeValid = true;
return &m_indexCode;
}
// if ( *priority == SPIDER_PRIORITY_BANNED ) {
// m_indexCode = EDOCBANNED;
// m_indexCodeValid = true;
// return &m_indexCode;
// }
// . if using diffbot and the diffbot reply had a time out error
// or otherwise... diffbot failure demands a re-try always i guess.
// put this above getSpiderPriority() call otherwise we end up in
// a recursive loop with getIndexCode() and getNewSpiderReply()
// . NO, don't do this anymore, however, if there is a diffbot
// reply error then record it in the spider reply BUT only if it is
// a diffbot reply error that warrants a retry. for instance,
// EDIFFBOTCOULDNOTDOWNLOAD happens when diffbot got a 404 or 500
// error trying to download the page so it probably should not
// retry. but EDIFFBOTREQUESTTIMEDOUT should retry.
// SafeBuf *dbr = getDiffbotReply();
// if ( ! dbr || dbr == (void *)-1 ) return (int32_t *)dbr;
// if ( m_diffbotReplyValid && m_diffbotReplyError ) {
// m_indexCode= m_diffbotReplyError;
// m_indexCodeValid = true;
// return &m_indexCode;
// }
// no error otherwise
m_indexCode = 0;
m_indexCodeValid = true;
return &m_indexCode;
}
char *XmlDoc::prepareToMakeTitleRec ( ) {
// do not re-call this for speed
if ( m_prepared ) return (char *)1;
int32_t *indexCode = getIndexCode();
if (! indexCode || indexCode == (void *)-1) return (char *)indexCode;
if ( *indexCode ) { m_prepared = true; return (char *)1; }
//
// do all the sets here
//
// . this gets our old doc from titledb, if we got it
// . TODO: make sure this is cached in the event of a backoff, we
// will redo this again!!! IMPORTANT!!!
char *isIndexed = getIsIndexed();
if ( ! isIndexed || isIndexed == (char *)-1) return (char *)isIndexed;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if we are injecting into the "qatest123" coll, then we need to have
// m_spideredTimeValid be true before calling getIsSpam() which calls
// getSiteNumInlinks() which adds tags to tagdb using that date, but
// only for the "qatest123" coll!
// that keeps our parser output consistent across runs!
char **content = NULL;
if ( ! strcmp ( cr->m_coll,"qatest123") ) {
content = getContent ( );
if ( ! content || content == (void *)-1 )
return (char *)content;
}
// get our site root
char *mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (char *)mysite;
// if we are a root page, update tagdb with the root lang id
//bool *status1 = updateRootLangId();
//if ( ! status1 || status1 == (void *)-1 ) return (char *)status1;
// if we are a root page, update tagdb with the root lang id
//bool *status2 = updateSiteTitleBuf();
//if ( ! status2 || status2 == (void *)-1 ) return (char *)status2;
// if we found some default venue addresses on page, add to tagdb
//bool *status3 = updateVenueAddresses();
//if ( ! status3 || status3 == (void *)-1 ) return (char *)status3;
// add "firstip" to tag rec if we need to
//bool *status4 = updateFirstIp();
//if ( ! status4 || status4 == (void *)-1 ) return (char *)status4;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
int32_t *datedbDate = getPubDate();
if ( ! datedbDate || datedbDate == (int32_t *)-1 )
return (char *)datedbDate;
getHostHash32a();
getContentHash32();
//Images *images = getImages();
//if ( ! images || images == (Images *)-1 ) return (char *)images;
char **id = getThumbnailData();
if ( ! id || id == (void *)-1 ) return (char *)id;
int8_t *hopCount = getHopCount();
if ( ! hopCount || hopCount == (void *)-1 ) return (char *)hopCount;
char *spiderLinks = getSpiderLinks();
if ( ! spiderLinks || spiderLinks == (char *)-1 )
return (char *)spiderLinks;
//int32_t *nextSpiderTime = getNextSpiderTime();
//if ( ! nextSpiderTime || nextSpiderTime == (int32_t *)-1 )
// return (char *)nextSpiderTime;
//int8_t *nextSpiderPriority = getNextSpiderPriority();
//if ( ! nextSpiderPriority || nextSpiderPriority == (void *)-1 )
// return (char *)nextSpiderPriority;
int32_t *firstIndexedDate = getFirstIndexedDate();
if ( ! firstIndexedDate || firstIndexedDate == (int32_t *)-1 )
return (char *)firstIndexedDate;
int32_t *outlinksAddedDate = getOutlinksAddedDate();
if ( ! outlinksAddedDate || outlinksAddedDate == (int32_t *)-1 )
return (char *)outlinksAddedDate;
uint16_t *countryId = getCountryId();
if ( ! countryId||countryId==(uint16_t *)-1) return (char *)countryId;
char *trunc = getIsContentTruncated();
if ( ! trunc || trunc == (char *)-1 ) return (char *)trunc;
char *pl = getIsPermalink();
if ( ! pl || pl == (char *)-1 ) return (char *)pl;
//int32_t *numBannedOutlinks = getNumBannedOutlinks();
// set this
//m_numBannedOutlinks8 = score32to8 ( *numBannedOutlinks );
Dates *dp = getDates();
if ( ! dp || dp == (Dates *)-1 ) return (char *)dp;
// . before storing this into title Rec, make sure all tags
// are valid and tagRec is up to date
// . like we might need to update the contact info, siteNumInlinks,
// or other tags because, for instance, contact info might not
// be in there because isSpam() never required it.
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
char *hci = getHasContactInfo();
if ( ! hci || hci == (char *)-1 ) return (char *)hci;
char *ict = getIsContentTruncated();
if ( ! ict || ict == (char *)-1 ) return (char *)ict;
int64_t **wd = getWikiDocIds();
if ( ! wd || wd == (void *)-1 ) return (char *)wd;
int64_t **avp = getAdVector();
if ( ! avp || avp == (void *)-1 ) return (char *)avp;
char *at = getIsAdult();
if ( ! at || at == (void *)-1 ) return (char *)at;
char *ls = getIsLinkSpam();
if ( ! ls || ls == (void *)-1 ) return (char *)ls;
uint32_t *tph = getTagPairHash32();
if ( ! tph || tph == (uint32_t *)-1 ) return (char *)tph;
// sets the ptr_sectionsReply, that is all we need it to do
//char **sd = getSectionsReply ( ) ;
//if ( ! sd || sd == (void *)-1 ) return (char *)sd;
// sets the ptr_addressReply, that is all we need it to do
//char **ad = getAddressReply ( ) ;
//if ( ! ad || ad == (void *)-1 ) return (char *)ad;
uint8_t *rl = getRootLangId();
if ( ! rl || rl == (void *)-1 ) return (char *)rl;
int32_t **pcids = getCatIds();
if ( ! pcids || pcids == (void *)-1) return (char *)pcids;
// get dmoz ptr_dmozTitles, ptr_dmozSumms, ptr_dmozAnchors
if ( ! setDmozInfo() ) return (char *)-1;
m_prepared = true;
return (char *)1;
}
#define MAX_DMOZ_TITLES 10
int32_t *XmlDoc::getNumDmozEntries() {
// MDW: wth is this?
//int32_t **getDmozCatIds();
int32_t nc = size_catIds / 4;
if ( nc > MAX_DMOZ_TITLES ) nc = MAX_DMOZ_TITLES;
m_numDmozEntries = nc;
return &m_numDmozEntries;
}
// list of \0 terminated titles, etc. use getNumDmozTitles() to get #
char **XmlDoc::getDmozTitles ( ) {
// returns false if blocked
if ( ! setDmozInfo() ) return (char **)-1;
if ( g_errno ) return NULL;
return &ptr_dmozTitles;
}
char **XmlDoc::getDmozSummaries ( ) {
// returns false if blocked
if ( ! setDmozInfo() ) return (char **)-1;
if ( g_errno ) return NULL;
return &ptr_dmozSumms;
}
char **XmlDoc::getDmozAnchors ( ) {
// returns false if blocked
if ( ! setDmozInfo() ) return (char **)-1;
if ( g_errno ) return NULL;
return &ptr_dmozAnchors;
}
// returns false if blocked, true otherwise. sets g_errno on error & rets true
bool XmlDoc::setDmozInfo () {
if ( m_dmozInfoValid ) return true;
g_errno = 0;
// return true and set g_errno on error
if ( ! m_dmozBuf.reserve(12000) ) {
log("xmldoc: error getting dmoz info: %s",mstrerror(g_errno));
// ensure log statement does not clear g_errno
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return true;
}
// start here
char *dmozBuf = m_dmozBuf.getBufStart();
char *titles = dmozBuf;
char *summs = dmozBuf+5000;
char *anchors = dmozBuf+10000;
// the end of it
char *dtend = dmozBuf + 5000;
char *dsend = dmozBuf + 10000;
char *daend = dmozBuf + 12000;
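	// layout of the 12000 bytes reserved in m_dmozBuf above:
	//   [    0, 5000) titles
	//   [ 5000,10000) summaries
	//   [10000,12000) anchors
	// each region is a run of \0-terminated strings, one per dmoz
	// category id, capped at MAX_DMOZ_TITLES entries.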
// point into those bufs
char *dt = titles;
char *ds = summs;
char *da = anchors;
// MDW: i limit this to 10 to save stack space!
int32_t nc = size_catIds / 4;
if ( nc > MAX_DMOZ_TITLES ) nc = MAX_DMOZ_TITLES;
for (int32_t i = 0; i < nc ; i++) {
// breathe
QUICKPOLL ( m_niceness );
// temp stuff
int32_t dtlen = 0;
int32_t dslen = 0;
unsigned char dalen = 0;
// . store all dmoz info separated by \0's into titles[] buffer
// . crap, this does a disk read and blocks on that
//
// . TODO: make it non-blocking!!!!
//
g_categories->getTitleAndSummary ( m_firstUrl.getUrl(),
m_firstUrl.getUrlLen(),
ptr_catIds[i],
dt,//&titles[titlesLen],
&dtlen,//&titleLens[i],
dtend-dt,
ds,//&summs[summsLen],
&dslen,//&summLens[i],
dsend-ds,
da,//&anchors[anchorsLen],
&dalen,//&anchorLens[i],
daend-da,
m_niceness);
// advance ptrs
dt += dtlen;
ds += dslen;
da += dalen;
// null terminate
*dt++ = 0;
*ds++ = 0;
		*da++ = 0;
}
// if empty, make it a \0 to keep in sync with the rest
if ( dt == titles ) *dt++ = '\0';
if ( ds == summs ) *ds++ = '\0';
if ( da == anchors ) *da++ = '\0';
// set these
ptr_dmozTitles = titles;
ptr_dmozSumms = summs;
ptr_dmozAnchors = anchors;
size_dmozTitles = dt - titles;
size_dmozSumms = ds - summs;
size_dmozAnchors = da - anchors;
m_dmozInfoValid = true;
return true;
}
// . create and store the titlerec into "buf".
// . it is basically the header part of all the member vars in this XmlDoc.
// . it has a key,dataSize,compressedData so it can be a record in an Rdb
// . return true on success, false on failure
bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, int64_t docId, int64_t uh48 ){
//setStatus ( "making title rec");
// assume could not make one because we were banned or something
tbuf->purge(); // m_titleRec = NULL;
	// start setting members in THIS's header before compression
m_version = TITLEREC_CURRENT_VERSION;
// tag rec must have "sitenuminlinks" in it
//if (! m_newTagRec.getTag("sitenuminlinks") ) { char *xx=NULL;*xx=0; }
// we often update m_oldTagRec above by calling updateRootLangId(), etc
	// so update the size of our tag rec here
//size_tagRecData = m_oldTagRec.getSize();
// and sanity check this
//if( ptr_tagRecData != (char *)&m_oldTagRec ) { char *xx=NULL;*xx=0; }
// lookup dmoz title and summary for this site
//int32_t titleLens [10];
//int32_t summLens [10];
//unsigned char anchorLens [10];
//int32_t titlesLen = 0;
//int32_t summsLen = 0;
//int32_t anchorsLen = 0;
//char titles [10*1024];
//char summs [10*4096];
//char anchors [10* 256];
/*
MDW oct 12 2013 -
why is this here? we should store this info at spider time?
char *titles = m_dmozBuf;
char *summs = m_dmozBuf+5000;
char *anchors = m_dmozBuf+10000;
// the end of it
char *dtend = m_dmozBuf + 5000;
char *dsend = m_dmozBuf + 10000;
char *daend = m_dmozBuf + 12000;
// point into those bufs
char *dt = titles;
char *ds = summs;
char *da = anchors;
// MDW: i limit this to 10 to save stack space!
int32_t nc = size_catIds / 4;
if ( nc > 10 ) nc = 10;
for (int32_t i = 0; i < nc ; i++) {
// breathe
QUICKPOLL ( m_niceness );
// temp stuff
int32_t dtlen = 0;
int32_t dslen = 0;
unsigned char dalen = 0;
// . store all dmoz info separated by \0's into titles[] buffer
// . crap, this does a disk read and blocks on that
//
// . TODO: make it non-blocking!!!!
//
g_categories->getTitleAndSummary ( m_firstUrl.getUrl(),
m_firstUrl.getUrlLen(),
ptr_catIds[i],
dt,//&titles[titlesLen],
&dtlen,//&titleLens[i],
dtend-dt,
ds,//&summs[summsLen],
&dslen,//&summLens[i],
dsend-ds,
da,//&anchors[anchorsLen],
&dalen,//&anchorLens[i],
daend-da,
m_niceness);
// advance ptrs
dt += dtlen;
ds += dslen;
da += dalen;
// null terminate
if ( dtlen>0 && dt[dtlen-1]!='\0' ) { *dt++=0; dtlen++; }
if ( dslen>0 && ds[dslen-1]!='\0' ) { *ds++=0; dslen++; }
if ( dalen>0 && da[dalen-1]!='\0' ) { *da++=0; dalen++; }
// must always be something!
if ( dtlen==0 ) {*dt++=0; dtlen++;}
if ( dslen==0 ) {*ds++=0; dslen++;}
if ( dalen==0 ) {*da++=0; dalen++;}
}
// set these
ptr_dmozTitles = titles;
ptr_dmozSumms = summs;
ptr_dmozAnchors = anchors;
size_dmozTitles = dt - titles;
size_dmozSumms = ds - summs;
size_dmozAnchors = da - anchors;
*/
// set our crap that is not necessarily set
//ptr_firstUrl = m_firstUrl.getUrl();
//ptr_redirUrl = m_redirUrl.getUrl();
//ptr_tagRecData = (char *)&m_oldTagRec;
// this must be valid now
//if ( ! m_skipIndexingValid ) { char *xx=NULL;*xx=0; }
// CT_STATUS docs do not have a valid XmlDoc really, it is
// just the first 2048 bytes, so there is no m_collnum member
// in the first 2048 bytes that is valid or even in legit memory.
// see 'char xdhead[2048];' below.
CollectionRec *cr = NULL;
if ( m_contentType != CT_STATUS ) {
cr = getCollRec();
if ( ! cr ) return false;
}
// zero out the content to save disk space if it is a custom crawl
// and the page was not processed (i.e. sent to diffbot).
// this will cause some undeletable data in the index, like for
// indexing meta tags perhaps, but in general we do not index
// most of the html document in custom crawls because we set
// 'indexBody/indexDoc' to false. but don't do this if we have
// ever sent this url to diffbot for processing before at any time.
// this may screw up content hash deduping, because the original
// hash will always be indexed, even if the doc changes or is
// deleted.
bool zeroOut = false;
if ( cr && cr->m_isCustomCrawl && ! m_sentToDiffbot ) zeroOut = true;
if ( zeroOut && m_isDiffbotJSONObject ) zeroOut = false;
if ( zeroOut && ! m_exactContentHash64Valid ) zeroOut = false;
// don't zero out spider status documents
if ( zeroOut && m_contentType == CT_STATUS ) zeroOut = false;
// disable for now. probably most disk space is from the spider status
// docs.
//zeroOut = false;
char *savedPtr = ptr_utf8Content;
int32_t savedSize = size_utf8Content;
if ( zeroOut ) {
// record the 64 bit content hash here and make
// getExactContentHash64() return it as a 64-bit binary number.
// that way we can preserve it.
sprintf(m_tmp9,"gbzeroedout:%"UINT64"",m_exactContentHash64);
ptr_utf8Content = m_tmp9;
size_utf8Content = gbstrlen(ptr_utf8Content) + 1;
m_zeroedOut = true;
}
// set this
m_headerSize = (char *)&ptr_firstUrl - (char *)&m_headerSize;
// add in variable length data
int32_t *ps = (int32_t *)&size_firstUrl;
// data ptr, consider a NULL to mean empty too!
char **pd = (char **)&ptr_firstUrl;
// how many XmlDoc::ptr_* members do we have? set "np" to that
int32_t np = ((char *)&size_firstUrl - (char *)&ptr_firstUrl) ;
np /= sizeof(char *);
// count up total we need to alloc
int32_t need1 = m_headerSize;
// clear these
m_internalFlags1 = 0;
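	// rough layout of the uncompressed title rec we are sizing here:
	//   [ fixed header: &m_headerSize up to &ptr_firstUrl )
	//   then, for each non-empty ptr_* member, in declaration order:
	//     [ 4-byte size ][ that many bytes of data ]
	// bit i of m_internalFlags1 records whether the i-th ptr_* member
	// was stored.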
// loop over em
for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) {
// skip if empty
if ( *ps <= 0 ) continue;
// or empty string ptr
if ( ! *pd ) continue;
// skip utf8content if we should -- no events or addresses
//if ( m_skipIndexing && pd == &ptr_utf8Content ) continue;
// 4 bytes for the size
need1 += 4;
// add it up
need1 += *ps;
// make the mask
uint32_t mask = 1 << i ;
// add it in
m_internalFlags1 |= mask;
}
// alloc the buffer
char *ubuf = (char *) mmalloc ( need1 , "xdtrb" );
// return NULL with g_errno set on error
if ( ! ubuf ) {
// restore if we were zeroed out
ptr_utf8Content = savedPtr;
size_utf8Content = savedSize;
return false;
}
// serialize into it
char *p = ubuf;
// copy our crap into there
gbmemcpy ( p , &m_headerSize , m_headerSize );
// skip it
p += m_headerSize;
// reset data ptrs
pd = (char **)&ptr_firstUrl;
// reset data sizes
ps = (int32_t *)&size_firstUrl;
// then variable length data
for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) {
// skip if empty, do not serialize
if ( ! *ps ) continue;
// or empty string ptr
if ( ! *pd ) continue;
// skip utf8content if we should -- no events or addresses
//if ( m_skipIndexing && pd == &ptr_utf8Content ) continue;
// store size first
*(int32_t *)p = *ps;
p += 4;
// then the data
gbmemcpy ( p , *pd , *ps );
// skip *ps bytes we wrote. should include a \0
p += *ps;
}
// sanity check
if ( p != ubuf + need1 ) { char *xx=NULL; *xx=0; }
// restore in case zeroOut was true
ptr_utf8Content = savedPtr;
size_utf8Content = savedSize;
// now restore it for other functions to use
//size_content = saved;
// . now compress our "title rec" data into a titleRec
// . cbuf should not be set
//if ( cbuf ) {
// log(LOG_LOGIC,"db: titlerec: compress: cbuf is set.");
// char *p = NULL; *p = 0; exit(-1);
//}
// should we free cbuf on our reset/destruction?
//m_owncbuf = ownCompressedData;
// . make a buf big enough to hold compressed, we'll realloc afterwards
// . according to zlib.h line 613 compress buffer must be .1% larger
// than source plus 12 bytes. (i add one for round off error)
// . now i added another extra 12 bytes cuz compress seemed to want it
int32_t need2 = ((int64_t)need1 * 1001LL) / 1000LL + 13 + 12;
// we also need to store a key then regular dataSize then
// the uncompressed size in cbuf before the compression of m_ubuf
int32_t hdrSize = sizeof(key_t) + 4 + 4;
// . now i add 12 bytes more so Msg14.cpp can also squeeze in a
// negative key to delete the old titleRec, cuz we use this cbuf
// to set our list that we add to our twins with
// . we now store the negative rec before the positive rec in Msg14.cpp
//hdrSize += sizeof(key_t) + 4;
need2 += hdrSize;
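	// final record layout once compression succeeds (written below):
	//   [ key_t key ][ int32 dataSize ][ int32 uncompressed size ]
	//   [ dataSize - 4 bytes of compressed ubuf ]
	// dataSize excludes the key and itself, so it covers just the
	// uncompressed-size field plus the compressed bytes.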
// alloc what we need
//char *cbuf = (char *) mmalloc ( need2 ,"TitleRecc");
//if ( ! cbuf ) return false;
// return false on error
if ( ! tbuf->reserve ( need2 ,"titbuf" ) ) return false;
	// shortcut
char *cbuf = tbuf->getBufStart();
	// set cbuf sizes; the SafeBuf length is set below to fit exactly the used buf
//int32_t cbufMaxSize = need2;
// . how big is the buf we're passing to ::compress()?
// . don't include the last 12 byte, save for del key in Msg14.cpp
int32_t size = need2 - hdrSize ;
	// . compress the data from ubuf into "cbuf + hdrSize"
	// . on success "size" is reset to the number of compressed bytes
	//   we wrote into "cbuf + hdrSize"
int err = gbcompress ( (unsigned char *)cbuf + hdrSize,
(uint32_t *)&size,
(unsigned char *)ubuf ,
(uint32_t )need1 );
// note it
//log("test: compressed %s from %"INT32" to %"INT32" bytes",
// m_firstUrl.m_url,need2-hdrSize,size);
// free the buf we were trying to compress now
mfree ( ubuf , need1 , "trub" );
// we should check ourselves
if ( err == Z_OK && size > (need2 - hdrSize ) ) {
//mfree ( cbuf , need2 ,"TitleRecc" );
tbuf->purge();
g_errno = ECOMPRESSFAILED;
log("db: Failed to compress document of %"INT32" bytes. "
"Provided buffer of %"INT32" bytes.",
size, (need2 - hdrSize ) );
return false;
}
// check for error
if ( err != Z_OK ) {
//mfree ( cbuf , need2 ,"TitleRecc" );
tbuf->purge();
g_errno = ECOMPRESSFAILED;
log("db: Failed to compress document.");
return false;
}
// calc cbufSize, the uncompressed header + compressed stuff
//cbufSize = hdrSize + size ;
//int64_t uh48 = getFirstUrlHash48();
// . make the key from docId
// . false = delkey?
//m_titleRecKey = g_titledb.makeKey (*getDocId(),uh48,false);//delkey?
key_t tkey = g_titledb.makeKey (docId,uh48,false);//delkey?
// validate it
//m_titleRecKeyValid = true;
// get a ptr to the Rdb record at start of the header
p = cbuf;
// skip over the negative rec reserved space for Msg14.cpp
//p += 12 + 4;
// . store key in header of cbuf
// . store in our host byte ordering so we can be a rec in an RdbList
*(key_t *) p = tkey;
p += sizeof(key_t);
// store total dataSize in header (excluding itself and key only)
int32_t dataSize = size + 4;
*(int32_t *) p = dataSize ;
p += 4;
// store uncompressed size in header
*(int32_t *) p = need1 ; p += 4;
// sanity check
if ( p != cbuf + hdrSize ) { char *xx = NULL; *xx = 0; }
// sanity check
if ( need1 <= 0 ) { char *xx = NULL; *xx = 0; }
// advance over data
p += size;
// update safebuf::m_length so it is correct
tbuf->setLength ( p - cbuf );
return true;
}
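// . a minimal sketch (comment only, not compiled) of how a reader would
//   undo setTitleRecBuf(), assuming gbuncompress() mirrors zlib's
//   uncompress() signature the way gbcompress() mirrors compress():
//
//   char *p = rec;
//   key_t tkey = *(key_t *)p; p += sizeof(key_t);
//   int32_t dataSize = *(int32_t *)p; p += 4;
//   int32_t ubufSize = *(int32_t *)p; p += 4;
//   SafeBuf ubuf;
//   if ( ! ubuf.reserve ( ubufSize ) ) return false;
//   uint32_t got = ubufSize;
//   gbuncompress ( (unsigned char *)ubuf.getBufStart() , &got ,
//                  (unsigned char *)p , dataSize - 4 );
//   // ubuf now holds the fixed header followed by each stored
//   // [size][data] ptr_* member in declaration order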
// . return NULL and sets g_errno on error
// . returns -1 if blocked
SafeBuf *XmlDoc::getTitleRecBuf ( ) {
// return it now if we got it already
if ( m_titleRecBufValid ) return &m_titleRecBuf;
setStatus ( "making title rec");
// did one of our many blocking function calls have an error?
if ( g_errno ) return NULL;
// . HACK so that TitleRec::isEmpty() return true
// . faster than calling m_titleRec.reset()
//m_titleRec.m_url.m_ulen = 0;
int32_t *indexCode = getIndexCode();
// not allowed to block here
if ( indexCode == (void *)-1) { char *xx=NULL;*xx=0; }
// return on errors with g_errno set
if ( ! indexCode ) return NULL;
// force delete? EDOCFORCEDELETE
if ( *indexCode ) { m_titleRecBufValid = true; return &m_titleRecBuf; }
// . internal callback
// . so if any of the functions we end up calling directly or
// indirectly block and return -1, we will be re-called from the top
if ( ! m_masterLoop ) {
m_masterLoop = getTitleRecBufWrapper;
m_masterState = this;
}
/*
// parsing knobs
if ( ! m_titleWeightValid ) {
// TODO: watchout for overruns!! these are 16-bits only!
//m_eliminateMenus = cr->m_eliminateMenus;
m_titleWeight = cr->m_titleWeight;
m_headerWeight = cr->m_headerWeight;
m_urlPathWeight = cr->m_urlPathWeight;
m_externalLinkTextWeight = cr->m_externalLinkTextWeight;
m_internalLinkTextWeight = cr->m_internalLinkTextWeight;
m_conceptWeight = cr->m_conceptWeight;
//int32_t siteNumInlinksBoost = cr->m_siteNumInlinksBoost;
// validate these
//m_eliminateMenusValid = true;
m_titleWeightValid = true;
m_headerWeightValid = true;
m_urlPathWeightValid = true;
m_externalLinkTextWeightValid = true;
m_internalLinkTextWeightValid = true;
m_conceptWeightValid = true;
}
*/
/////////
//
// IF ANY of these validation sanity checks fail then update
// prepareToMakeTitleRec() so it makes them valid!!!
//
/////////
// verify key parts
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// verify record parts
//if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
if ( ! m_firstIndexedDateValid ) { char *xx=NULL;*xx=0; }
if ( ! m_outlinksAddedDateValid ) { char *xx=NULL;*xx=0; }
if ( ! m_charsetValid ) { char *xx=NULL;*xx=0; }
if ( ! m_countryIdValid ) { char *xx=NULL;*xx=0; }
if ( ! m_httpStatusValid ) { char *xx=NULL;*xx=0; }
/*
if ( ! m_titleWeightValid ) { char *xx=NULL;*xx=0; }
if ( ! m_headerWeightValid ) { char *xx=NULL;*xx=0; }
if ( ! m_urlPathWeightValid ) { char *xx=NULL;*xx=0; }
if ( ! m_externalLinkTextWeightValid ) { char *xx=NULL;*xx=0; }
if ( ! m_internalLinkTextWeightValid ) { char *xx=NULL;*xx=0; }
if ( ! m_conceptWeightValid ) { char *xx=NULL;*xx=0; }
*/
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// if ( ! m_siteNumInlinksUniqueIpValid ) { char *xx=NULL;*xx=0; }
// if ( ! m_siteNumInlinksUniqueCBlockValid ) { char *xx=NULL;*xx=0; }
// if ( ! m_siteNumInlinksTotalValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; }
if ( ! m_rootLangIdValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
if ( ! m_metaListCheckSum8Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_numBannedOutlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isAdultValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_eliminateMenusValid ) { char *xx=NULL;*xx=0; }
if ( ! m_spiderLinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isContentTruncatedValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isLinkSpamValid ) { char *xx=NULL;*xx=0; }
// buffers
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
if ( ! m_redirUrlValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_metaRedirUrlValid ) { char *xx=NULL;*xx=0; }
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
if ( ! m_gigabitHashesValid ) { char *xx=NULL;*xx=0; }
if ( ! m_adVectorValid ) { char *xx=NULL;*xx=0; }
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
if ( ! m_imageDataValid ) { char *xx=NULL;*xx=0; }
if ( ! m_catIdsValid ) { char *xx=NULL;*xx=0; }
if ( ! m_indCatIdsValid ) { char *xx=NULL;*xx=0; }
if ( ! m_dmozInfoValid ) { char *xx=NULL;*xx=0; }
// if m_recycleContent is true, these are not valid
if ( ! m_recycleContent ) {
if ( ! m_rawUtf8ContentValid ) { char *xx=NULL;*xx=0; }
if ( ! m_expandedUtf8ContentValid ) { char *xx=NULL;*xx=0; }
}
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
if ( ! m_datesValid ) { char *xx=NULL;*xx=0; }
	// why do we need valid sections for a titlerec? we no longer use
// ptr_sectiondbData...
//if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_sectionsReplyValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_addressReplyValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
if ( ! m_linkInfo1Valid ) { char *xx=NULL;*xx=0; }
if ( ! m_linkInfo2Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_sectiondbDataValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_placedbDataValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_clockCandidatesDataValid ) { char *xx=NULL;*xx=0; }
// do we need these?
if ( ! m_hostHash32aValid ) { char *xx=NULL;*xx=0; }
if ( ! m_contentHash32Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_tagHash32Valid ) { char *xx=NULL;*xx=0; }
if ( ! m_tagPairHash32Valid ) { char *xx=NULL;*xx=0; }
// sanity checks
if ( ! m_addressesValid ) { char *xx=NULL;*xx=0; }
// breathe
QUICKPOLL( m_niceness );
setStatus ( "compressing into final title rec");
int64_t uh48 = getFirstUrlHash48();
int64_t *docId = getDocId();
// time it
int64_t startTime = gettimeofdayInMilliseconds();
//////
//
// fill in m_titleRecBuf
//
//////
// we need docid and uh48 for making the key of the titleRec
if ( ! setTitleRecBuf ( &m_titleRecBuf , *docId , uh48 ) )
return NULL;
// set this member down here because we can't set it in "xd"
	// because it is too short of an xmldoc stub
m_versionValid = true;
// breathe
QUICKPOLL( m_niceness );
// . add the stat
// . use white for the stat
g_stats.addStat_r ( 0 ,
startTime ,
gettimeofdayInMilliseconds(),
0x00ffffff );
QUICKPOLL( m_niceness );
char *cbuf = m_titleRecBuf.getBufStart();
m_titleRecKey = *(key_t *)cbuf;
m_titleRecKeyValid = true;
// we are legit
//m_freeTitleRec = true;
//m_titleRec = cbuf;
// key + dataSize + ubufSize + compressedData
//m_titleRecSize = sizeof(key_t)+ 4 + 4 + size;
//m_titleRecAllocSize = need2;
// now valid. congratulations!
m_titleRecBufValid = true;
return &m_titleRecBuf;
}
// . an "id" of 2 means very indicative of a dirty doc
// . an "id" of 1 means it must be joined with another dirty word to indicate
// . taken mostly from Url.cpp
// . see matches2.h for Needle class definition
static Needle s_dirtyWords [] = {
{"upskirt" ,0,2,0,0,NULL,0,NULL},
{"downblouse" ,0,2,0,0,NULL,0,NULL},
{"shemale" ,0,1,0,0,NULL,0,NULL},
{"spank" ,0,1,0,0,NULL,0,NULL},
{"dildo" ,0,2,0,0,NULL,0,NULL},
{"bdsm" ,0,2,0,0,NULL,0,NULL},
{"voyeur" ,0,2,0,0,NULL,0,NULL},
{"fisting" ,0,2,0,0,NULL,0,NULL},
{"vibrator" ,0,2,0,0,NULL,0,NULL},
{"ejaculat" ,0,2,0,0,NULL,0,NULL},
{"rgasm" ,0,2,0,0,NULL,0,NULL},
{"orgy" ,0,2,0,0,NULL,0,NULL},
{"orgies" ,0,2,0,0,NULL,0,NULL},
{"stripper" ,0,1,0,0,NULL,0,NULL},
{"softcore" ,0,2,0,0,NULL,0,NULL},
{"whore" ,0,2,0,0,NULL,0,NULL},
// gary slutkin on ted.com. make this just 1 point.
{"slut" ,0,1,0,0,NULL,0,NULL},
{"smut" ,0,2,0,0,NULL,0,NULL},
{"tits" ,0,2,0,0,NULL,0,NULL},
{"lesbian" ,0,2,0,0,NULL,0,NULL},
{"swinger" ,0,2,0,0,NULL,0,NULL},
{"fetish" ,0,2,0,0,NULL,0,NULL},
{"nude" ,0,1,0,0,NULL,0,NULL},
{"centerfold" ,0,2,0,0,NULL,0,NULL},
{"incest" ,0,2,0,0,NULL,0,NULL},
{"pedophil" ,0,2,0,0,NULL,0,NULL},
{"pedofil" ,0,2,0,0,NULL,0,NULL},
{"horny" ,0,2,0,0,NULL,0,NULL}, // horny toad
{"pussy" ,0,2,0,0,NULL,0,NULL}, // pussy willow pussy cat
{"pussies" ,0,2,0,0,NULL,0,NULL},
{"penis" ,0,2,0,0,NULL,0,NULL},
{"vagina" ,0,2,0,0,NULL,0,NULL},
{"phuck" ,0,2,0,0,NULL,0,NULL},
{"blowjob" ,0,2,0,0,NULL,0,NULL},
{"blow job" ,0,2,0,0,NULL,0,NULL},
{"gangbang" ,0,2,0,0,NULL,0,NULL},
{"xxx" ,0,1,0,0,NULL,0,NULL}, // yahoo.com has class="fz-xxxl"
{"porn" ,0,2,0,0,NULL,0,NULL},
{"felch" ,0,2,0,0,NULL,0,NULL},
{"cunt" ,0,2,0,0,NULL,0,NULL},
{"bestial" ,0,2,0,0,NULL,0,NULL},
{"beastial" ,0,2,0,0,NULL,0,NULL},
{"kink" ,0,2,0,0,NULL,0,NULL},
// . "sex" is often substring in tagids.
// . too many false positives, make "1" not "2"
{"sex" ,0,1,0,0,NULL,0,NULL},
{"anal" ,0,2,0,0,NULL,0,NULL},
{"cum" ,0,2,0,0,NULL,0,NULL}, // often used for cumulative
{"clit" ,0,2,0,0,NULL,0,NULL},
{"fuck" ,0,2,0,0,NULL,0,NULL},
{"boob" ,0,1,0,0,NULL,0,NULL},
{"wank" ,0,2,0,0,NULL,0,NULL},
{"fick" ,0,2,0,0,NULL,0,NULL},
{"eroti" ,0,2,0,0,NULL,0,NULL},
{"gay" ,0,1,0,0,NULL,0,NULL}, // make 1 pt. 'marvin gay'
// new stuff not in Url.cpp
{"thong" ,0,1,0,0,NULL,0,NULL},
{"masturbat" ,0,2,0,0,NULL,0,NULL},
{"bitch" ,0,1,0,0,NULL,0,NULL},
{"hell" ,0,1,0,0,NULL,0,NULL},
{"damn" ,0,1,0,0,NULL,0,NULL},
{"rimjob" ,0,2,0,0,NULL,0,NULL},
{"cunnilingu" ,0,2,0,0,NULL,0,NULL},
{"felatio" ,0,2,0,0,NULL,0,NULL},
{"fellatio" ,0,2,0,0,NULL,0,NULL},
{"dick" ,0,1,0,0,NULL,0,NULL},
{"cock" ,0,1,0,0,NULL,0,NULL},
{"rape" ,0,2,0,0,NULL,0,NULL},
{"raping" ,0,2,0,0,NULL,0,NULL},
{"bukake" ,0,2,0,0,NULL,0,NULL},
{"shit" ,0,2,0,0,NULL,0,NULL},
{"naked" ,0,1,0,0,NULL,0,NULL},
{"nympho" ,0,2,0,0,NULL,0,NULL},
{"hardcore" ,0,1,0,0,NULL,0,NULL}, // hardcore gamer, count as 1
{"sodom" ,0,2,0,0,NULL,0,NULL},
{"titties" ,0,2,0,0,NULL,0,NULL}, // re-do
{"twat" ,0,2,0,0,NULL,0,NULL},
{"bastard" ,0,1,0,0,NULL,0,NULL},
{"erotik" ,0,2,0,0,NULL,0,NULL},
// EXCEPTIONS
// smut
{"transmut" ,0,-2,0,0,NULL,0,NULL},
{"bismuth" ,0,-2,0,0,NULL,0,NULL},
// sex
{"middlesex" ,0,-1,0,0,NULL,0,NULL},
{"sussex" ,0,-1,0,0,NULL,0,NULL},
{"essex" ,0,-1,0,0,NULL,0,NULL},
{"deusex" ,0,-1,0,0,NULL,0,NULL},
{"sexchange" ,0,-1,0,0,NULL,0,NULL},
{"sexpress" ,0,-1,0,0,NULL,0,NULL},
{"sexpert" ,0,-1,0,0,NULL,0,NULL},
{"sexcel" ,0,-1,0,0,NULL,0,NULL},
{"sexist" ,0,-1,0,0,NULL,0,NULL},
{"sexile" ,0,-1,0,0,NULL,0,NULL},
{"sexperi" ,0,-1,0,0,NULL,0,NULL},
{"sexual" ,0,-1,0,0,NULL,0,NULL},
{"sexpose" ,0,-1,0,0,NULL,0,NULL},
{"sexclu" ,0,-1,0,0,NULL,0,NULL},
{"sexo" ,0,-1,0,0,NULL,0,NULL},
{"sexism" ,0,-1,0,0,NULL,0,NULL},
{"sexpan" ,0,-1,0,0,NULL,0,NULL}, // buttonsexpanion
{"same-sex" ,0,-1,0,0,NULL,0,NULL},
{"opposite sex",0,-1,0,0,NULL,0,NULL},
// anal
{"analog" ,0,-2,0,0,NULL,0,NULL},
{"analy" ,0,-2,0,0,NULL,0,NULL},
{"canal" ,0,-2,0,0,NULL,0,NULL},
{"kanal" ,0,-2,0,0,NULL,0,NULL},
{"banal" ,0,-2,0,0,NULL,0,NULL},
{"ianalbert" ,0,-2,0,0,NULL,0,NULL}, // ian albert
// cum
{"circum" ,0,-2,0,0,NULL,0,NULL},
{"cum laude" ,0,-2,0,0,NULL,0,NULL},
{"succum" ,0,-2,0,0,NULL,0,NULL},
{"cumber" ,0,-2,0,0,NULL,0,NULL},
{"docum" ,0,-2,0,0,NULL,0,NULL},
{"cumul" ,0,-2,0,0,NULL,0,NULL},
{"acumen" ,0,-2,0,0,NULL,0,NULL},
{"incum" ,0,-2,0,0,NULL,0,NULL},
{"capsicum" ,0,-2,0,0,NULL,0,NULL},
{"modicum" ,0,-2,0,0,NULL,0,NULL},
{"locum" ,0,-2,0,0,NULL,0,NULL},
{"scum" ,0,-2,0,0,NULL,0,NULL},
{"accum" ,0,-2,0,0,NULL,0,NULL},
{"cumbre" ,0,-2,0,0,NULL,0,NULL},
{"swank" ,0,-2,0,0,NULL,0,NULL},
{"fickle" ,0,-2,0,0,NULL,0,NULL},
{"traffick" ,0,-2,0,0,NULL,0,NULL},
{"scleroti" ,0,-2,0,0,NULL,0,NULL},
{"gaylor" ,0,-2,0,0,NULL,0,NULL},
{"gaynor" ,0,-2,0,0,NULL,0,NULL},
{"gayner" ,0,-2,0,0,NULL,0,NULL},
{"gayton" ,0,-2,0,0,NULL,0,NULL},
{"dipthong" ,0,-1,0,0,NULL,0,NULL},
// hell
{"hellen" ,0,-1,0,0,NULL,0,NULL},
{"hellman" ,0,-1,0,0,NULL,0,NULL},
{"shell" ,0,-1,0,0,NULL,0,NULL},
{"mitchell" ,0,-1,0,0,NULL,0,NULL},
{"chelle" ,0,-1,0,0,NULL,0,NULL}, // me/michelle
{"hello" ,0,-1,0,0,NULL,0,NULL},
{"moschella" ,0,-1,0,0,NULL,0,NULL},
{"othello" ,0,-1,0,0,NULL,0,NULL},
{"schelling" ,0,-1,0,0,NULL,0,NULL},
{"seychelles" ,0,-1,0,0,NULL,0,NULL},
{"wheller" ,0,-1,0,0,NULL,0,NULL},
{"winchell" ,0,-1,0,0,NULL,0,NULL},
// dick
{"dicker" ,0,-1,0,0,NULL,0,NULL},
{"dickins" ,0,-1,0,0,NULL,0,NULL},
{"dickies" ,0,-1,0,0,NULL,0,NULL},
{"dickran" ,0,-1,0,0,NULL,0,NULL},
// cock
{"babcock" ,0,-1,0,0,NULL,0,NULL},
{"cocked" ,0,-1,0,0,NULL,0,NULL},
{"cocking" ,0,-1,0,0,NULL,0,NULL},
{"cockpit" ,0,-1,0,0,NULL,0,NULL},
{"cockroach" ,0,-1,0,0,NULL,0,NULL},
{"cocktail" ,0,-1,0,0,NULL,0,NULL},
{"cocky" ,0,-1,0,0,NULL,0,NULL},
{"hancock" ,0,-1,0,0,NULL,0,NULL},
{"hitchcock" ,0,-1,0,0,NULL,0,NULL},
{"peacock" ,0,-1,0,0,NULL,0,NULL},
{"shuttlecock" ,0,-1,0,0,NULL,0,NULL},
{"stopcock" ,0,-1,0,0,NULL,0,NULL},
{"weathercock" ,0,-1,0,0,NULL,0,NULL},
{"woodcock" ,0,-1,0,0,NULL,0,NULL},
{"cockburn" ,0,-1,0,0,NULL,0,NULL},
// kink
{"kinko" ,0,-2,0,0,NULL,0,NULL},
{"ukink" ,0,-2,0,0,NULL,0,NULL}, // ink shop in uk
// naked
{"snaked" ,0,-1,0,0,NULL,0,NULL},
// rape
{"drape" ,0,-2,0,0,NULL,0,NULL},
{"grape" ,0,-2,0,0,NULL,0,NULL},
{"scrape" ,0,-2,0,0,NULL,0,NULL},
{"therape" ,0,-2,0,0,NULL,0,NULL},
{"trapez" ,0,-2,0,0,NULL,0,NULL},
{"parapet" ,0,-2,0,0,NULL,0,NULL},
{"scraping" ,0,-2,0,0,NULL,0,NULL},
{"draping" ,0,-2,0,0,NULL,0,NULL},
// twat
{"twatch" ,0,-2,0,0,NULL,0,NULL}, // courtwatch -- cspan.org
// clit
{"heraclitus" ,0,-2,0,0,NULL,0,NULL},
// boob
{"booboo" ,0,-1,0,0,NULL,0,NULL},
// shit
{"shitak" ,0,-2,0,0,NULL,0,NULL}
};
////
//// New stuff from sex.com adult word list
////
////
//// make it a 2nd part because of performance limits on matches2.cpp algo
////
static Needle s_dirtyWordsPart2 [] = {
{"amateurfoto" ,0,2,0,0,NULL,0,NULL},
{"amateurhardcore" ,0,2,0,0,NULL,0,NULL},
{"amateurindex" ,0,2,0,0,NULL,0,NULL},
{"amateurnaked" ,0,2,0,0,NULL,0,NULL},
{"amatuerhardcore" ,0,2,0,0,NULL,0,NULL},
{"ampland" ,0,2,0,0,NULL,0,NULL},
//{"animehentai" ,0,2,0,0,NULL,0,NULL}, dup
{"anitablonde" ,0,2,0,0,NULL,0,NULL},
{"asiacarrera" ,0,2,0,0,NULL,0,NULL},
{"asshole" ,0,2,0,0,NULL,0,NULL},
{"asslick" ,0,2,0,0,NULL,0,NULL},
{"asspic" ,0,2,0,0,NULL,0,NULL},
{"assworship" ,0,2,0,0,NULL,0,NULL},
//{"badgirl" ,0,2,0,0,NULL,0,NULL}, not necessarily bad
{"bareceleb" ,0,2,0,0,NULL,0,NULL},
{"barenaked" ,0,2,0,0,NULL,0,NULL},
{"beaverboy" ,0,2,0,0,NULL,0,NULL},
{"beavershot" ,0,2,0,0,NULL,0,NULL}, // was beavershots
//{"bigball" ,0,2,0,0,NULL,0,NULL}, // not necessarily bad
{"bigbreast" ,0,2,0,0,NULL,0,NULL},
//{"bigbutt" ,0,2,0,0,NULL,0,NULL}, // not necessarily bad
{"bigcock" ,0,2,0,0,NULL,0,NULL},
{"bigdick" ,0,2,0,0,NULL,0,NULL},
{"biggestdick" ,0,2,0,0,NULL,0,NULL},
{"biggesttit" ,0,2,0,0,NULL,0,NULL},
{"bighairyball" ,0,2,0,0,NULL,0,NULL},
{"bighooter" ,0,2,0,0,NULL,0,NULL},
{"bignipple" ,0,2,0,0,NULL,0,NULL},
{"bigtit" ,0,2,0,0,NULL,0,NULL},
{"blackbooty" ,0,2,0,0,NULL,0,NULL},
{"blackbutt" ,0,2,0,0,NULL,0,NULL},
{"blackcock" ,0,2,0,0,NULL,0,NULL},
{"blackdick" ,0,2,0,0,NULL,0,NULL},
{"blackhardcore" ,0,2,0,0,NULL,0,NULL},
{"blackonblonde" ,0,2,0,0,NULL,0,NULL},
{"blacksonblonde" ,0,2,0,0,NULL,0,NULL},
{"blacktit" ,0,2,0,0,NULL,0,NULL},
{"blacktwat" ,0,2,0,0,NULL,0,NULL},
{"boner" ,0,1,0,0,NULL,0,NULL}, // softcore, someone's lastname?
{"bordello" ,0,2,0,0,NULL,0,NULL},
{"braless" ,0,2,0,0,NULL,0,NULL},
{"brothel" ,0,2,0,0,NULL,0,NULL},
{"bukake" ,0,2,0,0,NULL,0,NULL},
{"bukkake" ,0,2,0,0,NULL,0,NULL},
{"bustyblonde" ,0,2,0,0,NULL,0,NULL},
{"bustyceleb" ,0,2,0,0,NULL,0,NULL},
{"butthole" ,0,2,0,0,NULL,0,NULL},
{"buttman" ,0,2,0,0,NULL,0,NULL},
{"buttpic" ,0,2,0,0,NULL,0,NULL},
{"buttplug" ,0,2,0,0,NULL,0,NULL},
{"buttthumbnails" ,0,2,0,0,NULL,0,NULL},
{"callgirl" ,0,2,0,0,NULL,0,NULL},
{"celebritiesnaked" ,0,2,0,0,NULL,0,NULL},
{"celebritybush" ,0,2,0,0,NULL,0,NULL},
{"celebritybutt" ,0,2,0,0,NULL,0,NULL},
{"chaseylain" ,0,2,0,0,NULL,0,NULL},
{"chickswithdick" ,0,2,0,0,NULL,0,NULL},
{"christycanyon" ,0,2,0,0,NULL,0,NULL},
{"cicciolina" ,0,2,0,0,NULL,0,NULL},
//{"cunilingus" ,0,2,0,0,NULL,0,NULL},
{"cunniling" ,0,2,0,0,NULL,0,NULL}, // abbreviate
{"cyberlust" ,0,2,0,0,NULL,0,NULL},
{"danniashe" ,0,2,0,0,NULL,0,NULL},
{"dicksuck" ,0,2,0,0,NULL,0,NULL},
{"dirtymind" ,0,2,0,0,NULL,0,NULL},
{"dirtypicture" ,0,2,0,0,NULL,0,NULL},
{"doggiestyle" ,0,2,0,0,NULL,0,NULL},
{"doggystyle" ,0,2,0,0,NULL,0,NULL},
{"domatrix" ,0,2,0,0,NULL,0,NULL},
{"dominatrix" ,0,2,0,0,NULL,0,NULL},
//{"dyke" ,0,2,0,0,NULL,0,NULL}, // dick van dyke!
{"ejaculation" ,0,2,0,0,NULL,0,NULL},
{"erosvillage" ,0,2,0,0,NULL,0,NULL},
{"facesit" ,0,2,0,0,NULL,0,NULL},
{"fatass" ,0,2,0,0,NULL,0,NULL},
{"feetfetish" ,0,2,0,0,NULL,0,NULL},
{"felatio" ,0,2,0,0,NULL,0,NULL},
{"fellatio" ,0,2,0,0,NULL,0,NULL},
{"femdom" ,0,2,0,0,NULL,0,NULL},
{"fetishwear" ,0,2,0,0,NULL,0,NULL},
{"fettegirl" ,0,2,0,0,NULL,0,NULL},
{"fingerbang" ,0,2,0,0,NULL,0,NULL},
{"fingering" ,0,1,0,0,NULL,0,NULL}, // fingering the keyboard? use 1
{"flesh4free" ,0,2,0,0,NULL,0,NULL},
{"footfetish" ,0,2,0,0,NULL,0,NULL},
{"footjob" ,0,2,0,0,NULL,0,NULL},
{"footlicking" ,0,2,0,0,NULL,0,NULL},
{"footworship" ,0,2,0,0,NULL,0,NULL},
{"fornication" ,0,2,0,0,NULL,0,NULL},
{"freeass" ,0,2,0,0,NULL,0,NULL},
{"freebigtit" ,0,2,0,0,NULL,0,NULL},
{"freedick" ,0,2,0,0,NULL,0,NULL},
{"freehardcore" ,0,2,0,0,NULL,0,NULL},
//{"freehentai" ,0,2,0,0,NULL,0,NULL}, dup
{"freehooter" ,0,2,0,0,NULL,0,NULL},
{"freelargehooter" ,0,2,0,0,NULL,0,NULL},
{"freenakedpic" ,0,2,0,0,NULL,0,NULL},
{"freenakedwomen" ,0,2,0,0,NULL,0,NULL},
{"freetit" ,0,2,0,0,NULL,0,NULL},
{"freevoyeur" ,0,2,0,0,NULL,0,NULL},
{"gratishardcoregalerie" ,0,2,0,0,NULL,0,NULL},
{"hardcorecelebs" ,0,2,0,0,NULL,0,NULL},
{"hardcorefree" ,0,2,0,0,NULL,0,NULL},
{"hardcorehooter" ,0,2,0,0,NULL,0,NULL},
{"hardcorejunkie" ,0,2,0,0,NULL,0,NULL},
{"hardcorejunky" ,0,2,0,0,NULL,0,NULL},
{"hardcoremovie" ,0,2,0,0,NULL,0,NULL},
{"hardcorepic" ,0,2,0,0,NULL,0,NULL},
{"hardcorepix" ,0,2,0,0,NULL,0,NULL},
{"hardcoresample" ,0,2,0,0,NULL,0,NULL},
{"hardcorestories" ,0,2,0,0,NULL,0,NULL},
{"hardcorethumb" ,0,2,0,0,NULL,0,NULL},
{"hardcorevideo" ,0,2,0,0,NULL,0,NULL},
{"harddick" ,0,2,0,0,NULL,0,NULL},
{"hardnipple" ,0,2,0,0,NULL,0,NULL},
{"hardon" ,0,2,0,0,NULL,0,NULL},
{"hentai" ,0,2,0,0,NULL,0,NULL},
{"interacialhardcore" ,0,2,0,0,NULL,0,NULL},
{"intercourseposition" ,0,2,0,0,NULL,0,NULL},
{"interracialhardcore" ,0,2,0,0,NULL,0,NULL},
{"ittybittytitty" ,0,2,0,0,NULL,0,NULL},
{"jackoff" ,0,2,0,0,NULL,0,NULL},
{"jennajameson" ,0,2,0,0,NULL,0,NULL},
{"jennicam" ,0,2,0,0,NULL,0,NULL},
{"jerkoff" ,0,2,0,0,NULL,0,NULL},
{"jism" ,0,2,0,0,NULL,0,NULL},
{"jiz" ,0,2,0,0,NULL,0,NULL},
{"justhardcore" ,0,2,0,0,NULL,0,NULL},
{"karasamateurs" ,0,2,0,0,NULL,0,NULL},
{"kascha" ,0,2,0,0,NULL,0,NULL},
{"kaylakleevage" ,0,2,0,0,NULL,0,NULL},
{"kobetai" ,0,2,0,0,NULL,0,NULL},
{"lapdance" ,0,2,0,0,NULL,0,NULL},
{"largedick" ,0,2,0,0,NULL,0,NULL},
{"largehooter" ,0,2,0,0,NULL,0,NULL},
{"largestbreast" ,0,2,0,0,NULL,0,NULL},
{"largetit" ,0,2,0,0,NULL,0,NULL},
{"lesben" ,0,2,0,0,NULL,0,NULL},
{"lesbo" ,0,2,0,0,NULL,0,NULL},
{"lickadick" ,0,2,0,0,NULL,0,NULL},
{"lindalovelace" ,0,2,0,0,NULL,0,NULL},
{"longdick" ,0,2,0,0,NULL,0,NULL},
{"lovedoll" ,0,2,0,0,NULL,0,NULL},
{"makinglove" ,0,2,0,0,NULL,0,NULL},
{"mangax" ,0,2,0,0,NULL,0,NULL},
{"manpic" ,0,2,0,0,NULL,0,NULL},
{"marilynchambers" ,0,2,0,0,NULL,0,NULL},
{"massivecock" ,0,2,0,0,NULL,0,NULL},
{"masterbating" ,0,2,0,0,NULL,0,NULL},
{"mensdick" ,0,2,0,0,NULL,0,NULL},
{"milf" ,0,2,0,0,NULL,0,NULL},
{"minka" ,0,2,0,0,NULL,0,NULL},
{"monstercock" ,0,2,0,0,NULL,0,NULL},
{"monsterdick" ,0,2,0,0,NULL,0,NULL},
{"muffdiving" ,0,2,0,0,NULL,0,NULL},
{"nacktfoto" ,0,2,0,0,NULL,0,NULL},
{"nakedblackwomen" ,0,2,0,0,NULL,0,NULL},
{"nakedceleb" ,0,2,0,0,NULL,0,NULL},
{"nakedcelebrity" ,0,2,0,0,NULL,0,NULL},
{"nakedcheerleader" ,0,2,0,0,NULL,0,NULL},
{"nakedchick" ,0,2,0,0,NULL,0,NULL},
{"nakedgirl" ,0,2,0,0,NULL,0,NULL},
{"nakedguy" ,0,2,0,0,NULL,0,NULL},
{"nakedladies" ,0,2,0,0,NULL,0,NULL},
{"nakedlady" ,0,2,0,0,NULL,0,NULL},
{"nakedman" ,0,2,0,0,NULL,0,NULL},
{"nakedmen" ,0,2,0,0,NULL,0,NULL},
{"nakedness" ,0,2,0,0,NULL,0,NULL},
{"nakedphoto" ,0,2,0,0,NULL,0,NULL},
{"nakedpic" ,0,2,0,0,NULL,0,NULL},
{"nakedstar" ,0,2,0,0,NULL,0,NULL},
{"nakedwife" ,0,2,0,0,NULL,0,NULL},
{"nakedwoman" ,0,2,0,0,NULL,0,NULL},
{"nakedwomen" ,0,2,0,0,NULL,0,NULL},
{"nastychat" ,0,2,0,0,NULL,0,NULL},
{"nastythumb" ,0,2,0,0,NULL,0,NULL},
{"naughtylink" ,0,2,0,0,NULL,0,NULL},
{"naughtylinx" ,0,2,0,0,NULL,0,NULL},
{"naughtylynx" ,0,2,0,0,NULL,0,NULL},
{"naughtynurse" ,0,2,0,0,NULL,0,NULL},
{"niceass" ,0,2,0,0,NULL,0,NULL},
{"nikkinova" ,0,2,0,0,NULL,0,NULL},
{"nikkityler" ,0,2,0,0,NULL,0,NULL},
{"nylonfetish" ,0,2,0,0,NULL,0,NULL},
{"nympho" ,0,2,0,0,NULL,0,NULL},
{"openleg" ,0,2,0,0,NULL,0,NULL},
{"oral4free" ,0,2,0,0,NULL,0,NULL},
{"pantyhosefetish" ,0,2,0,0,NULL,0,NULL},
{"peepcam" ,0,2,0,0,NULL,0,NULL},
{"persiankitty" ,0,2,0,0,NULL,0,NULL},
{"perverted" ,0,2,0,0,NULL,0,NULL},
{"pimpserver" ,0,2,0,0,NULL,0,NULL},
{"pissing" ,0,2,0,0,NULL,0,NULL},
{"poontang" ,0,2,0,0,NULL,0,NULL},
{"privatex" ,0,2,0,0,NULL,0,NULL},
{"prono" ,0,2,0,0,NULL,0,NULL},
{"publicnudity" ,0,2,0,0,NULL,0,NULL},
{"puffynipple" ,0,2,0,0,NULL,0,NULL},
{"racqueldarrian" ,0,2,0,0,NULL,0,NULL},
//{"rape" ,0,2,0,0,NULL,0,NULL}, // dup!
{"rawlink" ,0,2,0,0,NULL,0,NULL},
{"realhardcore" ,0,2,0,0,NULL,0,NULL},
{"rubberfetish" ,0,2,0,0,NULL,0,NULL},
{"seka" ,0,2,0,0,NULL,0,NULL},
{"sheboy" ,0,2,0,0,NULL,0,NULL},
{"showcam" ,0,2,0,0,NULL,0,NULL},
{"showercam" ,0,2,0,0,NULL,0,NULL},
{"smallbreast" ,0,2,0,0,NULL,0,NULL},
{"smalldick" ,0,2,0,0,NULL,0,NULL},
{"spycamadult" ,0,2,0,0,NULL,0,NULL},
{"strapon" ,0,2,0,0,NULL,0,NULL},
{"stripclub" ,0,2,0,0,NULL,0,NULL},
{"stripshow" ,0,2,0,0,NULL,0,NULL},
{"striptease" ,0,2,0,0,NULL,0,NULL},
{"strokeit" ,0,2,0,0,NULL,0,NULL},
{"strokeme" ,0,2,0,0,NULL,0,NULL},
{"suckdick" ,0,2,0,0,NULL,0,NULL},
{"sylviasaint" ,0,2,0,0,NULL,0,NULL},
{"teenhardcore" ,0,2,0,0,NULL,0,NULL},
{"teenie" ,0,2,0,0,NULL,0,NULL},
{"teenpic" ,0,2,0,0,NULL,0,NULL},
{"teensuck" ,0,2,0,0,NULL,0,NULL},
{"tgp" ,0,2,0,0,NULL,0,NULL},
{"threesome" ,0,2,0,0,NULL,0,NULL},
{"thumblord" ,0,2,0,0,NULL,0,NULL},
{"thumbzilla" ,0,2,0,0,NULL,0,NULL},
{"tiffanytowers" ,0,2,0,0,NULL,0,NULL},
{"tinytitties" ,0,2,0,0,NULL,0,NULL},
//{"tities" ,0,2,0,0,NULL,0,NULL}, // entities
{"titman" ,0,2,0,0,NULL,0,NULL},
{"titsandass" ,0,2,0,0,NULL,0,NULL},
{"titties" ,0,2,0,0,NULL,0,NULL},
{"titts" ,0,2,0,0,NULL,0,NULL},
{"titty" ,0,2,0,0,NULL,0,NULL},
{"tokyotopless" ,0,2,0,0,NULL,0,NULL},
{"tommysbookmark" ,0,2,0,0,NULL,0,NULL},
{"toplesswomen" ,0,2,0,0,NULL,0,NULL},
{"trannies" ,0,2,0,0,NULL,0,NULL},
{"twinks" ,0,2,0,0,NULL,0,NULL},
{"ultradonkey" ,0,2,0,0,NULL,0,NULL},
{"ultrahardcore" ,0,2,0,0,NULL,0,NULL},
{"uncutcock" ,0,2,0,0,NULL,0,NULL},
{"vividtv" ,0,2,0,0,NULL,0,NULL},
{"wendywhoppers" ,0,2,0,0,NULL,0,NULL},
{"wetdick" ,0,2,0,0,NULL,0,NULL},
{"wetpanties" ,0,2,0,0,NULL,0,NULL},
{"wifesharing" ,0,2,0,0,NULL,0,NULL},
{"wifeswapping" ,0,2,0,0,NULL,0,NULL},
{"xrated" ,0,2,0,0,NULL,0,NULL}
};
// . store this in clusterdb rec so family filter works!
// . check content for adult words
char *XmlDoc::getIsAdult ( ) {
if ( m_isAdultValid ) return &m_isAdult2;
// call that
setStatus ("getting is adult bit");
int32_t **pici = getIndCatIds();
if ( ! pici || pici == (void *)-1 ) return (char *)pici;
// check categories
for ( int32_t i = 0 ; i < size_indCatIds / 4 ; i++ ) {
int32_t ic = ptr_indCatIds[i];
// skip if not an adult category
if ( ! g_categories->isIdAdult ( ic ) ) continue;
// got it
m_isAdult = true;
m_isAdult2 = true;
m_isAdultValid = true;
return &m_isAdult2;
}
// . if any of the wiki docids we are in are adult.... then we are
// . we set the top bit of wiki docids to indicate if adult
//for ( int32_t i = 0 ; i < size_wikiDocIds / 8 ; i++ ) {
// int64_t d = ptr_wikiDocIds[i];
// if ( ! ( d & 0x8000000000000000 ) ) continue;
// // got it
// m_isAdult = true;
// m_isAdultValid = true;
// return &m_isAdult;
//}
// need the content
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1) return (char *)u8;
// time it
int64_t start = gettimeofdayInMilliseconds();
// score that up
int32_t total = getDirtyPoints ( ptr_utf8Content, size_utf8Content - 1 ,
m_niceness , m_firstUrl.m_url );
// then the url
//char *u = getFirstUrl()->getUrl();
//total += getDirtyPoints ( u , gbstrlen(u) );
// and redir url
//char *r = getRedirUrl()->getUrl();
//total += getDirtyPoints ( r , gbstrlen(r) );
// debug msg
int64_t took = gettimeofdayInMilliseconds() - start;
if ( took > 10 )
logf(LOG_DEBUG,
"build: Took %"INT64" ms to check doc of %"INT32" bytes for "
"dirty words.",took,size_utf8Content-1);
m_isAdult = false;
// adult?
if ( total >= 2 ) m_isAdult = true;
// set shadow member
m_isAdult2 = (bool)m_isAdult;
// validate
m_isAdultValid = true;
// note it
if ( m_isAdult2 && g_conf.m_logDebugDirty )
log("dirty: %s points = %"INT32"",m_firstUrl.m_url,total);
// no dirty words found
return &m_isAdult2;
}
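// . scans "s" for the needles in s_dirtyWords (and s_dirtyWordsPart2 when
//   enabled) using getMatches2() and sums each matched needle's m_id:
//   +2/-2 for really dirty/clean words, +1/-1 for borderline ones
// . example: a single +2 match is enough to trip the adult threshold in
//   getIsAdult() unless a negative (clean) needle cancels it out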
int32_t getDirtyPoints ( char *s , int32_t slen , int32_t niceness , char *url ) {
// . use the matches function to get all the matches
// . then check each match to see if it is actually a legit word
// . actually match the dirty words, then match the clean words
// then we can subtract counts.
int32_t numDirty = sizeof(s_dirtyWords) / sizeof(Needle);
getMatches2 ( s_dirtyWords ,
numDirty ,
s ,
slen ,
NULL , // linkPos
NULL , // needleNum
false , // stopAtFirstMatch?
NULL , // hadPreMatch ptr
true , // saveQuickTables?
niceness );
int32_t points = 0;
// each needle has an associated score
for ( int32_t i = 0 ; i < numDirty ; i++ ) {
// skip if no match
if ( s_dirtyWords[i].m_count <= 0 ) continue;
// . the "id", is positive for dirty words, - for clean
// . uses +2/-2 for really dirty words
// . uses +1/-1 for borderline dirty words
points += s_dirtyWords[i].m_id;
// log debug
if ( ! g_conf.m_logDebugDirty ) continue;
// show it in the log
log("dirty: %s %"INT32" %s"
,s_dirtyWords[i].m_string
,(int32_t)s_dirtyWords[i].m_id
,url
);
}
////
//
// repeat for part2
//
// we have to do two separate parts otherwise the algo in
// matches2.cpp gets really slow. it was not meant to match
// so many needles in one haystack.
//
///
int32_t numDirty2 = sizeof(s_dirtyWordsPart2) / sizeof(Needle);
// . disable this for now. most of these are phrases and they
// will not be detected.
// . TODO: hash the dirty words and phrases and just lookup
// words in that table like we do for isStopWord(), but use
// isDirtyWord(). Then replace the code in Speller.cpp
// with isDirtyUrl() which will split the string into words
// and call isDirtyWord() on each one. also use bi and tri grams
// in the hash table.
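// . a rough sketch of that idea (NOT wired in anywhere): hash each dirty
//   word/phrase into a HashTableX once at startup, make isDirtyWord(wid)
//   a simple table lookup, and have isDirtyUrl() split the url into
//   words and test each word hash plus its bi/tri-gram hashes against
//   that same table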
numDirty2 = 0;
getMatches2 ( s_dirtyWordsPart2 ,
numDirty2 ,
s ,
slen ,
NULL , // linkPos
NULL , // needleNum
false , // stopAtFirstMatch?
NULL , // hadPreMatch ptr
true , // saveQuickTables?
niceness );
// each needle has an associated score
for ( int32_t i = 0 ; i < numDirty2 ; i++ ) {
// skip if no match
if ( s_dirtyWordsPart2[i].m_count <= 0 ) continue;
// . the "id", is positive for dirty words, - for clean
// . uses +2/-2 for really dirty words
// . uses +1/-1 for borderline dirty words
points += s_dirtyWordsPart2[i].m_id;
// log debug
if ( ! g_conf.m_logDebugDirty ) continue;
// show it in the log
log("dirty: %s %"INT32" %s"
,s_dirtyWordsPart2[i].m_string
,(int32_t)s_dirtyWordsPart2[i].m_id
,url
);
}
return points;
}
int32_t **XmlDoc::getIndCatIds ( ) {
// if XmlDoc was set from a titleRec it should validate this
if ( m_indCatIdsValid ) return &ptr_indCatIds;
// otherwise, we must compute them!
CatRec *cat = getCatRec ();
// blocked or error?
if ( ! cat || cat == (CatRec *)-1 ) return (int32_t **)cat;
// set this
ptr_indCatIds = cat->m_indCatids;
size_indCatIds = cat->m_numIndCatids * 4;
m_indCatIdsValid = true;
// parse that up
return &ptr_indCatIds;
}
int32_t **XmlDoc::getCatIds ( ) {
// if XmlDoc was set from a titleRec it should validate this
if ( m_catIdsValid ) return &ptr_catIds;
// otherwise, we must compute them!
CatRec *cat = getCatRec ();
// blocked or error?
if ( ! cat || cat == (CatRec *)-1 ) return (int32_t **)cat;
// set this
ptr_catIds = cat->m_catids;
size_catIds = cat->m_numCatids * 4;
m_catIdsValid = true;
// parse that up
return &ptr_catIds;
}
CatRec *XmlDoc::getCatRec ( ) {
// return what we got
if ( m_catRecValid ) return &m_catRec;
// call that
setStatus ("getting dmoz cat rec");
// callback?
if ( m_calledMsg8b ) {
// return NULL on error
if ( g_errno ) return NULL;
// otherwise, success
m_catRecValid = true;
return &m_catRec;
}
// consider it called
m_calledMsg8b = true;
// assume empty and skip the call for now
m_catRec.reset();
m_catRecValid = true;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// let's bring dmoz back
//return &m_catRec;
// compute it otherwise
if ( ! m_msg8b.getCatRec ( &m_firstUrl ,
cr->m_coll ,
gbstrlen(cr->m_coll) ,
true , // use canonical name?
m_niceness ,
&m_catRec , // store here
m_masterState , // state
m_masterLoop )) // callback
// return -1 if we blocked
return (CatRec *)-1;
// error?
if ( g_errno ) return NULL;
// we got it somehow without blocking... local cached lookup?
m_catRecValid = true;
return &m_catRec;
}
void gotWikiResultsWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->gotWikiResults ( slot );
THIS->m_masterLoop ( THIS->m_masterState );
}
// . get the wiki pages that this page matches
// . use the docids of the wiki pages to represent them
// . use an independent 32-node cluster to index all of wikipedia so it is all
// in ram. do not need datedb, etc.
// . get the gigabits for this page, up to 50 of them, and use that as a rat=0
// query on the wiki cluster
// . score each wiki docid too, based on match
// . normalize scores so they range from 10% to 100%, based on # of gigabits
// that the wiki page matches
// . index these as gbwiki:<wikipagedocid> with the score given (8-bit) mapped
// to 32 bits using score8to32() so the score itself is preserved
// . WE CAN ALSO call this at QUERY TIME, using the actual query of the
// searcher instead of the string of gigabits
// . BUT i will probably just look at the wiki topics of the search results,
// that will be faster and maybe more accurate...
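// . implementation note: we build a raw "GET /search?..." request from the
//   gigabit query and send it via msg type 0xfd to the wiki cluster's
//   proxy; gotWikiResults() then parses the <docId>/<absScore> pairs out
//   of the xml reply into m_wikiDocIds/m_wikiScores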
int64_t **XmlDoc::getWikiDocIds ( ) {
if ( m_wikiDocIdsValid ) return (int64_t **)&ptr_wikiDocIds;
setStatus ( "getting wiki docids" );
// . get our gigabit vector
// . consists of array of 32-bit hashes
// . followed by 1-1 array of 16-bit scores
// . TODO: restrict gigabits to capitalized words and phrases, and
// also to 2+ word wiki titles
char *gq = getGigabitQuery ( );
if ( ! gq || gq == (char *)-1 ) return (int64_t **)gq;
// empty? then no wiki match i guess
//logf(LOG_DEBUG,"FIX ME FIX ME - getWikiDocIds");
// MDW: for now bail here too!
if ( ! gq[0] || 1 == 1 ) {
ptr_wikiDocIds = m_wikiDocIds;
ptr_wikiScores = m_wikiScores;
size_wikiDocIds = 0;
size_wikiScores = 0;
m_wikiDocIdsValid = true;
return (int64_t **)&ptr_wikiDocIds;
}
// set our query to these gigabits
// re-enable this later
//if ( ! m_calledMsg40 ) m_wq.set ( gq );
int32_t need = 200 + gbstrlen(gq);
// make buf
m_wikiqbuf = (char *)mmalloc ( need , "wikiqbuf");
// error?
if ( ! m_wikiqbuf ) return NULL;
// save size
m_wikiqbufSize = need;
// use large single tier for speed
char *p = m_wikiqbuf;
p += sprintf ( p ,
"GET /search?raw=9&n=%"INT32"&sc=0&dr=0&"//dio=1&"
"t0=1000000&rat=0&"
"c=wiki&q=%s", (int32_t)MAX_WIKI_DOCIDS, gq );
// terminate it
*p++ = '\0';
// then put in the ip
*(int32_t *)p = g_hostdb.m_myHost->m_ip;
// skip over ip
p += 4;
// sanity check
if ( p - m_wikiqbuf > need ) { char *xx=NULL;*xx=0; }
int32_t ip = g_conf.m_wikiProxyIp;
// if not given, make it gf1 for now
if ( ! ip ) ip = atoip ( "10.5.62.11" , 10 );
int32_t port = g_conf.m_wikiProxyPort;
// port default too to gf1
if ( ! port ) port = 9002;
// send it using msg 0xfd to the wiki cluster's proxy
if ( ! g_udpServer.sendRequest ( m_wikiqbuf ,
p - m_wikiqbuf ,
0xfd ,
ip ,
port ,
-1 , // hostId
NULL , // retSlot
this , // state
gotWikiResultsWrapper ,
1000 ) )
// we had an error, g_errno should be set
return NULL;
// got without blocking? no way!
return (int64_t **)-1;
}
void XmlDoc::gotWikiResults ( UdpSlot *slot ) {
setStatus ( "got wiki docids" );
// do not free our request in slot
slot->m_sendBufAlloc = NULL;
// free request buf
mfree ( m_wikiqbuf , m_wikiqbufSize , "wikiqbuf" );
// error getting the wiki results?
if ( g_errno ) return;
// TODO: normalize all scores with each other somehow. i think
// they are fairly absolute, but not sure with a lot of rat=0 terms!
logf(LOG_DEBUG,"wiki: fix my scoring stuff. have a min score... "
" and somehow normalize scores to be in [0,1.0]");
// . force this reply to be NULL terminated
// . i can't fix in the code now because the reply is coming from
// a different cluster running an older version of gb
char *s = slot->m_readBuf;
char *end = s + slot->m_readBufSize - 1;
// overwrite the last '>', who cares!
*end = '\0';
// make our xml
Xml xml;
if ( ! xml.set ( s ,
end - s ,
false , // ownData?
0 ,
false ,
TITLEREC_CURRENT_VERSION ,
false , // setParents?
m_niceness ,
CT_HTML ))
// return if g_errno got set
return;
// grab docids
int32_t nd = 0;
int32_t nn = xml.getNumNodes();
XmlNode *nodes = xml.getNodes();
float score = 0.0;
int64_t docId = 0LL;
for ( int32_t i = 0 ; i + 1 < nn ; i++ ) {
if ( nodes[i].m_nodeId != 1 ) continue;
// tagname is <docid>?
if ( nodes[i].m_tagNameLen == 5 &&
nodes[i].m_tagName[0] == 'd' &&
! strncmp(nodes[i].m_tagName,"docId",5) )
docId = atoll ( nodes[i].m_tagName );
// is <score>? (after docid tag)
if ( nodes[i].m_tagNameLen == 8 &&
nodes[i].m_tagName[0] == 'a' &&
! strncmp(nodes[i].m_tagName,"absScore",8) ) {
score = atof ( nodes[i].m_tagName );
// add it
m_wikiDocIds [ nd ] = docId;
m_wikiScores [ nd ] = score;
nd++;
// do not overflow
if ( nd >= MAX_WIKI_DOCIDS ) break;
}
}
// point to them
ptr_wikiDocIds = m_wikiDocIds;
ptr_wikiScores = m_wikiScores;
size_wikiDocIds = nd * 8;
size_wikiScores = nd * sizeof(rscore_t);
log ( LOG_DEBUG , "build: got %"INT32" wiki docids",nd);
m_wikiDocIdsValid = true;
}
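// . sets m_pubDate from Dates::getPubDate(); (uint32_t)-1 means unknown
// . also sets m_ageInDays = (spidered time - pub date) converted to days,
//   clamped to be non-negative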
int32_t *XmlDoc::getPubDate ( ) {
if ( m_pubDateValid ) return (int32_t *)&m_pubDate;
// get date parse
Dates *dp = getDates();
if ( ! dp || dp == (Dates *)-1 ) return (int32_t *)dp;
// got it
m_pubDateValid = true;
m_pubDate = dp->getPubDate();
// print it once for page parser. we now do this in XmlDoc::print()
//if ( m_pbuf ) m_dates.printPubDates ( m_pbuf );
// set m_ageInDays
if ( m_pubDate == (uint32_t)-1 ) return (int32_t *)&m_pubDate;
// for parsing date
//int32_t currentTime = getTimeGlobal();
// this must be valid
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
int32_t spideredTime = getSpideredTime();
// get doc age
//float age = currentTime - m_pubDate;
float age = spideredTime - m_pubDate;
// convert to days (could be negative if in the future)
m_ageInDays = age / (3600*24.0);
// fix it if negative
if ( m_ageInDays < 0.0 ) m_ageInDays = 0.0;
return (int32_t *)&m_pubDate;
}
Dates *XmlDoc::getDates ( ) {
if ( m_datesValid ) return &m_dates;
// skip for now
m_datesValid = true;
return &m_dates;
// set status. we can time status changes with this routine!
setStatus ( "getting dates");
Dates *dd = getSimpleDates();
// bail on error
if ( ! dd ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return NULL;
}
// need addresses
Addresses *aa = getAddresses ();
if ( ! aa || aa == (void *)-1 ) return (Dates *)aa;
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (Dates *)isRoot;
// . get root doc, from titlerec is ok ( TODO: make sure from titlerec)
// . TODO: make sure to save in titledb too???
// . we need this now too
// . now set DF_IN_ROOTDOC on dates that were in the same section but
// in the root doc.
// . if we are not the root, we use the root title rec to see if
// the website repeats the store hours on every page. in that case
// . TODO: a special cache just for holding "svt" for root pages.
// should be highly efficient!!!
//XmlDoc *rd = NULL;
// setPart2() needs the implied sections set, so set them
Sections *sections = getSections();
if ( !sections ||sections==(Sections *)-1) return(Dates *)sections;
//SectionVotingTable *osvt = getOldSectionVotingTable();
//if ( ! osvt || osvt == (void *)-1 ) return (Dates *)osvt;
// table should be empty if we are the root!
//HashTableX *rvt = getRootVotingTable();
//if ( ! rvt || rvt == (void *)-1 ) return (Dates *)rvt;
char *isRSS = getIsRSS();
if ( ! isRSS || isRSS == (void *)-1 ) return (Dates *)isRSS;
uint8_t *ctype = getContentType();
if ( ! ctype || ctype == (void *)-1 ) return (Dates *)ctype;
bool isXml = false;
if ( *isRSS ) isXml = true;
if ( *ctype == CT_XML ) isXml = true;
int32_t minPubDate = -1;
int32_t maxPubDate = -1;
// parentPrevSpiderTime is 0 if that was the first time that the
// parent was spidered, in which case isNewOutlink will always be set
// for every outlink it had!
if ( m_sreqValid &&
m_sreq.m_isNewOutlink &&
m_sreq.m_parentPrevSpiderTime ) {
// pub date is somewhere between these two times
minPubDate = m_sreq.m_parentPrevSpiderTime;
//maxPubDate = m_sreq.m_addedTime;
maxPubDate = m_sreq.m_discoveryTime;
}
// now set part2 , returns false and sets g_errno on error
if ( ! m_dates.setPart2 ( aa , minPubDate, maxPubDate,//osvt,
isXml , *isRoot )) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("doc: dates2: %s",mstrerror(g_errno));
// this just means we ran out of stack space to parse
// out all the dates, so ignore and continue... that way
// Spider.cpp does not give up and keep retrying us over
// and over again
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
// on all other errors, return NULL
if ( g_errno ) return NULL;
}
// debug EBADENGINEER error
if ( g_errno ) { char *xx=NULL;*xx=0; }
// overflow? does not set g_errno. at least clear all so we do not
// get a messed up partial representation.
//if ( m_dates.m_overflowed ) {
// log("doc: date overflow for %s",m_firstUrl.m_url);
// m_dates.reset();
//}
// only call it once
m_datesValid = true;
// return it
return &m_dates;
}
Dates *XmlDoc::getSimpleDates ( ) {
if ( m_simpleDatesValid ) return &m_dates;
// note that
setStatus("get dates part 1");
// try the current url
Url *u = getCurrentUrl();
// and ip
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (Dates *)ip;
// the docid
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Dates *)d;
// the site hash
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Dates *)sh32;
// words
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (Dates *)words;
// we set the D_IS_IN_DATE flag for these bits
Bits *bits = getBits(); if ( ! bits ) return NULL;
// sections. is it ok that these do not include implied sections?
Sections *sections = getExplicitSections();
if (!sections||sections==(Sections *)-1) return (Dates *)sections;
// link info (this is what we had the problem with)
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Dates *)info1;
//int32_t *sv = getPageSampleVector();
//if ( ! sv || sv == (int32_t *)-1 ) return (Dates *)sv;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Dates *)xml;
// this must be valid, cuz Dates.cpp uses it!
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0;}
// . get the xml doc of the previously stored title rec
// . Dates will compare the two docs to check for clocks, etc.
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (Dates *)pod;
Url **redir = getRedirUrl();
if ( ! redir || redir == (Url **)-1 ) return (Dates *)redir;
//char *ru = NULL;
//if ( *redir ) ru = (*redir)->getUrl();
// this should deserialize from its title rec data
//Dates *odp = NULL;
//if ( *pod ) odp = (*pod)->getDates ();
// the key in this table is the date tagHash and occNum, and the
// value is the timestamp of the date. this is used by the clock
// detection algorithm to compare a date in the previous version
// of this web page to see if it changed and is therefore a clock then.
// HashTableX *cct = NULL;
// if ( *pod ) cct = (*pod)->getClockCandidatesTable();
// this should be valid
uint8_t ctype = *getContentType();
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// this now returns false and sets g_errno on error, true on success
if ( ! m_dates.setPart1 ( u , //->getUrl(),
*redir, // ru ,
ctype ,
*ip ,
*d ,
*sh32 ,
xml ,
words ,
// set D_IS_IN_DATE flag so Address.cpp
// can avoid such words in addresses!
bits ,
sections ,
info1 ,
//sv ,
//odp , // old dates
NULL , // cct ,
this , // us
*pod , // old XmlDoc
cr->m_coll ,
m_niceness )) {
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("doc: dates1: %s",mstrerror(g_errno));
// this just means we ran out of stack space to parse
// out all the dates, so ignore and continue... that way
// Spider.cpp does not give up and keep retrying us over
// and over again
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
// on all other errors, return NULL
if ( g_errno ) return NULL;
}
// only call it once
m_simpleDatesValid = true;
// return it
return &m_dates;
}
/*
// returns NULL and sets g_errno on error, returns -1 if blocked
HashTableX *XmlDoc::getClockCandidatesTable ( ) {
// return if valid
if ( m_clockCandidatesTableValid ) return &m_clockCandidatesTable;
// otherwise, deserialize?
if ( m_clockCandidatesDataValid ) {
// and table is now valid
m_clockCandidatesTableValid = true;
// return empty table if ptr is NULL. take this out then.
if(!ptr_clockCandidatesData ) return &m_clockCandidatesTable;
// otherwise, deserialize
m_clockCandidatesTable.deserialize(ptr_clockCandidatesData ,
size_clockCandidatesData,
m_niceness );
// and return that
return &m_clockCandidatesTable;
}
// no longer using this since we got ptr_metadata
return &m_clockCandidatesTable;
// otherwise, get our dates
Dates *dp = getDates();
if ( ! dp || dp == (Dates *)-1 ) return (HashTableX *)dp;
// reset table just in case
m_clockCandidatesTable.reset();
// if no dates, bail
if ( dp->m_numDatePtrs == 0 ) {
m_clockCandidatesTableValid = true;
m_clockCandidatesDataValid = true;
// ptr_clockCandidatesData = NULL;
// size_clockCandidatesData = 0;
return &m_clockCandidatesTable;
}
// and set size to 32 buckets to start
if ( ! m_clockCandidatesTable.set (8,4,32,NULL,0,false,m_niceness,
"clockcands") )
return NULL;
// now stock the table
for ( int32_t i = 0 ; i < dp->m_numDatePtrs ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get date
Date *di = dp->m_datePtrs[i];
// skip if got nuked
if ( ! di ) continue;
// make the key
int64_t key ;
// lower 32 bits is taghash
key = di->m_tagHash;
// upper 32 bits is occNum
key |= ((int64_t)(di->m_occNum)) << 32;
// timestamp is the val
int32_t val = di->m_timestamp;
// then store it
if ( ! m_clockCandidatesTable.addKey ( &key , &val ) )
return NULL;
}
// that is now valid
m_clockCandidatesTableValid = true;
// how many bytes to serialize?
int32_t need = m_clockCandidatesTable.getStoredSize();
// now make the ptr valid
if ( ! m_cctbuf.reserve ( need ) ) return NULL;
// store it in there
m_clockCandidatesTable.serialize ( &m_cctbuf );
// point to it
// ptr_clockCandidatesData = m_cctbuf.getBufStart();
// size_clockCandidatesData = need;
// that is valid now
m_clockCandidatesDataValid = true;
return &m_clockCandidatesTable;
}
*/
// a date of -1 means not found or unknown
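// . tries the first url and then the last redirect url (if different),
//   using parseDateFromUrl() to extract a date embedded in the url itself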
int32_t XmlDoc::getUrlPubDate ( ) {
if ( m_urlPubDateValid ) return m_urlPubDate;
// need a first url. caller should have called setFirstUrl()
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
// use Dates
//Dates dp;
// -1 means unknown
m_urlPubDate = -1;
//m_urlAge = -1;
// try the FIRST url
Url *u = getFirstUrl();
// get last url we redirected to
Url **redir = getRedirUrl();
if ( ! redir || redir == (Url **)-1 ) {char *xx=NULL;*xx=0;}
subloop:
// . try to get the date just from the url
// . this will be zero if none found
m_urlPubDate = parseDateFromUrl ( u->getUrl() );
// we are kosher
m_urlPubDateValid = true;
// if we are unknown try last/redir url, if any
if ( m_urlPubDate == 0 && *redir && u != *redir ) {
u = *redir;
goto subloop;
}
// bail if we did not get a valid pub date from the url
if ( m_urlPubDate == 0 ) return m_urlPubDate;
// note it
log ( LOG_DEBUG, "date: Got url pub date: %"UINT32"",
(uint32_t)m_urlPubDate );
// set the age
//m_urlAge = getTimeGlobal() - m_urlPubDate;
//if ( m_urlAge < 0 ) m_urlAge = 0;
return m_urlPubDate;
}
// . use Dates to extract pub date from the url itself if pub date exists
// . an age of "-1" means unknown
/*
int32_t XmlDoc::getOutlinkAge ( int32_t outlinkNum ) {
// use Dates
Dates dp;
// sanity
if ( outlinkNum < 0 ) { char *xx=NULL;*xx=0; }
// get it
char *us = m_links.getLinkPtr(outlinkNum);
// for now set this, until we mod Dates to use normalized
// string urls
Url u;
u.set ( us );
// try to get the date just from the url
if ( ! dp.set ( &u ,
0 , // ip
0LL , // m_newDocId
0 , // siteHash
NULL , // Xml
NULL , // Words
NULL , // Bits
NULL , // Sections
NULL , // LinkInfo
NULL , // pageSampleVec
NULL , // old date parse2
NULL , // m_newDoc
NULL , // m_oldDoc
m_coll ,
0 , // defaultTimeZone
m_niceness )){
// should never block!
char *xx=NULL; *xx= 0; }
// this will be -1 if no date was found in the url
int32_t urlPubDate = dp.getPubDate();
// if we got a valid pub date from the url, set "m_urlAge"
if ( urlPubDate == -1 ) return -1;
// note it
//log ( LOG_DEBUG, "date: Got url pub date: %"UINT32"", m_urlDate );
// set the age
int32_t age = getTimeGlobal() - urlPubDate;
// keep positive
if ( age < 0 ) age = 0;
// return it
return age;
}
*/
// . sets g_errno on error and returns NULL
// . now returns a ptr to it so we can return NULL to signify error, that way
// all accessors have equivalent return values
// . an accessor function returns (char *)-1 if it blocked!
char *XmlDoc::getIsPermalink ( ) {
if ( m_isPermalinkValid ) return &m_isPermalink2;
Url *url = getCurrentUrl();
if ( ! url ) return NULL;
char *isRSS = getIsRSS();
// return NULL with g_errno set, -1 if blocked
if ( ! isRSS || isRSS == (char *)-1 ) return isRSS;
Links *links = getLinks();
// return NULL with g_errno set, -1 if blocked
if ( ! links || links == (Links *)-1 ) return (char *)links;
uint8_t *ct = getContentType();
// return NULL with g_errno set, -1 if blocked
if ( ! ct || ct == (uint8_t *)-1 ) return (char *)ct;
// GUESS if it is a permalink by the format of the url
int32_t p = ::isPermalink ( links , // Links ptr
url ,
*ct , // CT_HTML default?
NULL , // LinkInfo ptr
*isRSS );// isRSS?
m_isPermalink = p;
m_isPermalink2 = p;
m_isPermalinkValid = true;
return &m_isPermalink2;
}
// guess based on the format of the url if this is a permalink
char *XmlDoc::getIsUrlPermalinkFormat ( ) {
if ( m_isUrlPermalinkFormatValid ) return &m_isUrlPermalinkFormat;
setStatus ( "getting is url permalink format" );
Url *url = getCurrentUrl();
if ( ! url ) return NULL;
// just guess if we are rss here since we most likely do not have
// access to the url's content...
bool isRSS = false;
char *ext = url->getExtension();
if ( ext && strcasecmp(ext,"rss") == 0 ) isRSS = true;
// GUESS if it is a permalink by the format of the url
int32_t p = ::isPermalink ( NULL , // Links ptr
url ,
CT_HTML ,
NULL , // LinkInfo ptr
isRSS );// we guess this...
m_isUrlPermalinkFormat = p;
m_isUrlPermalinkFormatValid = true;
return &m_isUrlPermalinkFormat;
}
char *XmlDoc::getIsRSS ( ) {
if ( m_isRSSValid ) return &m_isRSS2;
// the xml tells us for sure
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
m_isRSS = xml->isRSSFeed();
m_isRSS2 = (bool)m_isRSS;
m_isRSSValid = true;
return &m_isRSS2;
}
char *XmlDoc::getIsSiteMap ( ) {
if ( m_isSiteMapValid ) return &m_isSiteMap;
uint8_t *ct = getContentType();
if ( ! ct || ct == (uint8_t *)-1 ) return (char *)ct;
char *uf = m_firstUrl.getFilename();
int32_t ulen = m_firstUrl.getFilenameLen();
// sitemap.xml
m_isSiteMap = false;
// must be xml to be a sitemap
if ( *ct == CT_XML &&
ulen == 11 &&
strncmp(uf,"sitemap.xml",11) == 0 )
m_isSiteMap = true;
m_isSiteMapValid = true;
return &m_isSiteMap;
}
// . this function should really be called getTagTokens() because it mostly
// works on HTML documents, not XML, and just sets an array of ptrs to
// the tags in the document, including ptrs to the text in between
// tags.
Xml *XmlDoc::getXml ( ) {
// return it if it is set
if ( m_xmlValid ) return &m_xml;
// note it
setStatus ( "parsing html");
// get the filtered content
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
int32_t u8len = size_utf8Content - 1;
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;
// set it
if ( ! m_xml.set ( *u8 ,
u8len ,
false , // ownData?
0 , // allocSize
false , // pure xml?
m_version ,
false , // setParentsArg?
m_niceness ,
*ct ) )
// return NULL on error with g_errno set
return NULL;
// set just once
m_xmlValid = true;
// all done
return &m_xml;
}
// Language support static stuff
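// each METHOD_* below indexes one language-guessing signal (meta tag,
// dmoz, url, outlinks, inlinks, word freq, default, ip, root page) used
// by the weighted-vote code that is now commented out in computeLangId()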
enum {
METHOD_TAG = 0,
METHOD_DMOZ,
METHOD_URL,
METHOD_OUTLINKS,
METHOD_INLINKS,
METHOD_FREQ,
METHOD_DEFAULT,
METHOD_IP,
METHOD_ROOT,
METHOD_CAP
};
bool setLangVec ( Words *words ,
SafeBuf *langBuf ,
Sections *ss ,
int32_t niceness ) {
int64_t *wids = words->getWordIds ();
char **wptrs = words->m_words;
int32_t nw = words->getNumWords ();
// allocate
if ( ! langBuf->reserve ( nw ) ) return false;
uint8_t *langVector = (uint8_t *)langBuf->getBufStart();
// now set the langid
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( niceness );
// default
langVector[i] = langUnknown;
// add the word
if ( wids[i] == 0LL ) continue;
// skip if number
if ( is_digit(wptrs[i][0]) ) {
langVector[i] = langTranslingual;
continue;
}
// get the lang bits. does not include langTranslingual
// or langUnknown
int64_t bits = g_speller.getLangBits64 ( &wids[i] );
// skip if not unique
char count = getNumBitsOn64 ( bits ) ;
// if we only got one lang we could be, assume that
if ( count == 1 ) {
// get it. bit #0 is english, so add 1
char langId = getBitPosLL((uint8_t *)&bits) + 1;
//langVector[i] = g_wiktionary.getLangId(&wids[i]);
langVector[i] = langId;
continue;
}
// ambiguous? set it to unknown then
if ( count >= 2 ) {
langVector[i] = langUnknown;
continue;
}
// try setting based on script. greek. russian. etc.
// if the word was not in the wiktionary.
// this will be langUnknown if not definitive.
langVector[i] = getCharacterLanguage(wptrs[i]);
}
// . now go sentence by sentence
// . get the 64 bit vector for each word in the sentence
// . then intersect them all
// . if the result is a unique langid, assign that langid to
// all words in the sentence
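// . example: in a short spanish sentence each word's wiktionary mask may
//   be ambiguous on its own, but ANDing the masks together will usually
//   leave only the spanish bit set, so every word in that sentence gets
//   langSpanish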
// get first sentence in doc
Section *si = NULL;
if ( ss ) si = ss->m_firstSent;
// scan the sentence sections and or in the bits we should
for ( ; si ; si = si->m_nextSent ) {
// breathe
QUICKPOLL ( niceness );
// reset vec
int64_t bits = LANG_BIT_MASK;
// get lang 64 bit vec for each wid in sentence
for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) {
// breathe
QUICKPOLL ( niceness );
// skip if not alnum word
if ( ! wids[j] ) continue;
// skip if starts with digit
if ( is_digit(wptrs[j][0]) ) continue;
// get 64 bit lang vec. does not include
// langUnknown or langTranslingual bits
bits &= g_speller.getLangBits64 ( &wids[j] );
}
// bail if none
if ( ! bits ) continue;
// skip if more than one language in intersection
if ( getNumBitsOn64(bits) != 1 ) continue;
// get it. bit #0 is english, so add 1
char langId = getBitPosLL((uint8_t *)&bits) + 1;
// ok, must be this language i guess
for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) {
// breathe
QUICKPOLL ( niceness );
// skip if not alnum word
if ( ! wids[j] ) continue;
// skip if starts with digit
if ( is_digit(wptrs[j][0]) ) continue;
// set it
langVector[j] = langId;
}
}
// try the same thing but do not use sentences. use windows of
// 5 words. this will pick up pages that have an english menu
// where each menu item is an individual sentence and only
// one word.
// http://www.topicexchange.com/
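// note we wait until at least 3 dictionary words have been seen (the
// "++total <= 2" check below) before intersecting the window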
int64_t window[5];
int32_t wpos[5];
memset ( window , 0 , 8*5 );
int32_t wp = 0;
int32_t total = 0;
// now set the langid
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( niceness );
// must be alnum
if ( ! wids[i] ) continue;
// skip if starts with digit
if ( is_digit(wptrs[i][0]) ) continue;
// skip if lang already set to a language
//if ( langVector[i] != langUnknown &&
// langVector[i] != langTranslingual )
// continue;
// get last 5
window[wp] = g_speller.getLangBits64 ( &wids[i] );
// skip if not in dictionary!
if ( window[wp] == 0 ) continue;
// otherwise, store it
wpos [wp] = i;
if ( ++wp >= 5 ) wp = 0;
// need at least 3 samples
if ( ++total <= 2 ) continue;
// intersect them all together
int64_t bits = LANG_BIT_MASK;
for ( int32_t j = 0 ; j < 5 ; j++ ) {
// skip if uninitialized, like if we only have 3 or 4
// samples so far
if ( ! window[j] ) continue;
// otherwise, toss it in the intersection
bits &= window[j];
}
// skip if intersection empty
if ( ! bits ) continue;
// skip if more than one language in intersection
if ( getNumBitsOn64(bits) != 1 ) continue;
// get it. bit #0 is english, so add 1
char langId = getBitPosLL((uint8_t *)&bits) + 1;
// set all in window to this language
for ( int32_t j = 0 ; j < 5 ; j++ ) {
// skip if uninitialized
if ( ! window[j] ) continue;
// otherwise, set it
langVector[wpos[j]] = langId;
}
}
return true;
}
// 1-1 with the words!
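// . returns NULL with g_errno set on error, -1 if blocked, and
//   (uint8_t *)0x01 if the vector buffer is empty, since NULL means error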
uint8_t *XmlDoc::getLangVector ( ) {
if ( m_langVectorValid ) {
// can't return NULL, that means error!
uint8_t *v = (uint8_t *)m_langVec.getBufStart();
if ( ! v ) return (uint8_t *)0x01;
return v;
}
// words
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (uint8_t *)words;
// get the sections without implied sections
Sections *ss = getImpliedSections();
if ( ! ss || ss==(void *)-1) return (uint8_t *)ss;
if ( ! setLangVec ( words , &m_langVec , ss , m_niceness) )
return NULL;
m_langVectorValid = true;
// can't return NULL, that means error!
uint8_t *v = (uint8_t *)m_langVec.getBufStart();
if ( ! v ) return (uint8_t *)0x01;
return v;
}
// returns -1 and sets g_errno on error
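// . tries the body words first, then falls back to the meta description
//   and then the meta keywords if the language is still unknown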
uint8_t *XmlDoc::getLangId ( ) {
if ( m_langIdValid ) return &m_langId;
setStatus ( "getting lang id");
// debug hack
//m_langId = langRussian;
//m_langIdValid = true;
//return &m_langId;
// get the stuff we need
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (uint8_t *)ip;
// . if we got no ip, we can't get the page...
// . also getLinks() will call getSiteNumInlinks() which will
// call getSiteLinkInfo() and will core if ip is 0 or -1
if ( *ip == 0 || *ip == -1 ) {
m_langId = langUnknown;
m_langIdValid = true;
return &m_langId;
}
//Xml *xml = getXml ();
//if ( ! xml || xml == (Xml *)-1 ) return (uint8_t *)xml;
Words *words = getWords ();
if ( ! words || words == (Words *)-1 ) return (uint8_t *)words;
// do not get regular sections, getSections() which will call
// getImpliedSections(), because then that will need to set addresses
// and dates, etc. the addresses could return NULL with EBUFOVERFLOW
// from a static buffer overflow causing us some problems here and
// since that g_errno is only really handled well in getIndexCode()
// it will log that CRITICAL CRITICAL message. and we really only
// need the sections to avoid looking at script tag sections, etc.
// when calling Words::getLanguage()
Sections *sections = getExplicitSections();
// did it block?
if ( sections==(Sections *)-1) return(uint8_t *)sections;
// well, it still calls Dates::parseDates which can return g_errno
// set to EBUFOVERFLOW...
if ( ! sections && g_errno != EBUFOVERFLOW ) return NULL;
// if sections is still NULL - try lang id without sections then,
// reset g_errno
g_errno = 0;
//Links *links = getLinks();
//if ( ! links || links == (Links *)-1 ) return (uint8_t *)links;
//LinkInfo *info1 = getLinkInfo1();
//if ( ! info1 || info1 == (LinkInfo *)-1 ) return (uint8_t *)info1;
//CatRec *cat = getCatRec ();
//if ( ! cat || cat == (CatRec *)-1) return (uint8_t *)cat;
uint8_t *lv = getLangVector();
if ( ! lv || lv == (void *)-1 ) return (uint8_t *)lv;
setStatus ( "getting lang id");
// compute langid from vector
m_langId = computeLangId ( sections , words, (char *)lv );
if ( m_langId != langUnknown ) {
m_langIdValid = true;
return &m_langId;
}
// . try the meta description i guess
// . 99% of the time we don't need this because the above code
// captures the language
int32_t mdlen;
char *md = getMetaDescription( &mdlen );
Words mdw;
mdw.setx ( md , mdlen , m_niceness );
SafeBuf langBuf;
setLangVec ( &mdw,&langBuf,NULL,m_niceness);
char *tmpLangVec = langBuf.getBufStart();
m_langId = computeLangId ( NULL , &mdw , tmpLangVec );
if ( m_langId != langUnknown ) {
m_langIdValid = true;
return &m_langId;
}
// try meta keywords
md = getMetaKeywords( &mdlen );
mdw.setx ( md , mdlen , m_niceness );
langBuf.purge();
setLangVec ( &mdw,&langBuf,NULL,m_niceness);
tmpLangVec = langBuf.getBufStart();
m_langId = computeLangId ( NULL , &mdw , tmpLangVec );
m_langIdValid = true;
return &m_langId;
}
// lv = langVec
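// . tallies the per-word langids in lv[], skipping words inside script or
//   style sections and words that look like part of a url, then returns
//   the most frequent language (ignoring unknown and translingual)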
char XmlDoc::computeLangId ( Sections *sections , Words *words, char *lv ) {
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
// this means null too
if ( sections && sections->m_numSections == 0 ) sp = NULL;
int32_t badFlags = SEC_SCRIPT|SEC_STYLE;//|SEC_SELECT;
int32_t counts [ MAX_LANGUAGES ];
memset ( counts , 0 , MAX_LANGUAGES * 4);
int32_t nw = words->getNumWords ();
char **wptrs = words->m_words;
int32_t *wlens = words->m_wordLens;
// now set the langid
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if in script or style section
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
//
// skip if in a url
//
// blah/
if ( wptrs[i][wlens[i]] == '/' ) continue;
// blah.blah or blah?blah
if ( (wptrs[i][wlens[i]] == '.' ||
wptrs[i][wlens[i]] == '?' ) &&
is_alnum_a(wptrs[i][wlens[i]+1]) )
continue;
// /blah or ?blah
if ( (i>0 && wptrs[i][-1] == '/') ||
(i>0 && wptrs[i][-1] == '?') )
continue;
// add it up
counts[(unsigned char)lv[i]]++;
}
// get the majority count
int32_t max = 0;
int32_t maxi = 0;
// skip langUnknown by starting at 1, langEnglish
for ( int32_t i = 1 ; i < MAX_LANGUAGES ; i++ ) {
// skip translingual
if ( i == langTranslingual ) continue;
if ( counts[i] <= max ) continue;
max = counts[i];
maxi = i;
}
return maxi;
//m_langId = maxi;
//m_langIdValid = true;
//return &m_langId;
/*
int32_t freqScore = 0;
int32_t lang;
if ( ! m_processedLang ) {
// do not repeat this call for this document
m_processedLang = true;
lang = words->getLanguage( sections ,
1000 , // sampleSize ,
m_niceness,
&freqScore);
// return NULL on error with g_errno set
if ( lang == -1 ) return NULL;
// we got it from words, return
if ( lang != 0 ) {
m_langId = lang;
m_langIdValid = true;
return &m_langId;
}
}
m_langId = 0;
// try from charset
uint16_t *charset = getCharset ( );
if ( ! charset || charset == (uint16_t *)-1 )return (uint8_t *)charset;
// do based on charset
if ( *charset == csGB18030 ) m_langId = langChineseTrad;
if ( *charset == csGBK ) m_langId = langChineseSimp;
if ( m_langId ) {
m_langIdValid = true;
return &m_langId;
}
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
// this lookup here might be unnecessary
uint8_t *rl = NULL;
if ( ! *isRoot ) {
rl = getRootLangId();
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
}
//Url *u = getCurrentUrl();
Url *u = getFirstUrl();
uint8_t gs[METHOD_CAP];
// reset language method vector
memset( gs , 0, sizeof(uint8_t) * METHOD_CAP );
// Let the site tell us what language it's in
gs [METHOD_TAG] = g_langId.guessLanguageFromTag( xml );
// Guess from the FIRST URL (unredirected url)
gs [METHOD_URL] = g_langId.guessLanguageFromUrl( u->getUrl() );
// Guess from the outlinks
gs [METHOD_OUTLINKS] = g_langId.guessLanguageFromOutlinks( links );
// Guess from the inlinks
gs [METHOD_INLINKS] = g_langId.guessLanguageFromInlinks(info1, *ip);
// root page's language, if there was one
if ( ! *isRoot ) gs [METHOD_ROOT] = *rl;
int32_t scores[MAX_LANGUAGES];
memset( scores, 0, sizeof(int32_t) * MAX_LANGUAGES );
// weights for the 9 methods
char cw[] = { 8,9,4,7,6,7,8,1,2};
// add up weighted scores
for(int i = 0; i < METHOD_CAP; i++ )
scores[gs[i]] += cw[i];
// reset the "lang" to langUnknown which is 0
lang = langUnknown ;
int max, oldmax;
max = oldmax = 0;
// find best language
for ( int32_t i = MAX_LANGUAGES - 1; i > 0 ; i-- ) {
if ( scores[i] < max) continue;
oldmax = max;
max = scores[i];
lang = i;
}
// give up if not too conclusive
if( (max - oldmax) < 3 ) { // cr->m_languageThreshold) {
//log(LOG_DEBUG, "build: Language: Threshold, score "
// "(%"INT32" - %"INT32") %"INT32" vs. %"INT32".\n",
// (int32_t)max,
// (int32_t)oldmax,
// (int32_t)max - oldmax,
// (int32_t)3);//(int32_t)cr->m_languageThreshold);
lang = langUnknown;
}
// Make sure we're over the bailout value, this
// keeps low scoring methods like TLD from being
// the decider if it was the only successful method.
if ( max < 5 ) { // cr->m_languageBailout ) {
//log(LOG_DEBUG, "build: Language: Bailout, "
// "score %"INT32" vs. %"INT32".",
// (int32_t)max, (int32_t)5);//cr->m_languageBailout);
lang = langUnknown;
}
// If the language is still not known,
// use the language detected from the frames.
//if(lang == langUnknown) lang = frameFoundLang;
// . try dmoz if still unknown
// . limit to 10 of them
// all done, do not repeat
m_langIdValid = true;
m_langId = lang;
m_langIdScore = max;
return &m_langId;
*/
}
Words *XmlDoc::getWords ( ) {
// return it if it is set
if ( m_wordsValid ) return &m_words;
// this will set it if necessary
Xml *xml = getXml();
// returns NULL on error, -1 if blocked
if ( ! xml || xml == (Xml *)-1 ) return (Words *)xml;
// note it
setStatus ( "getting words");
// now set what we need
if ( ! m_words.set ( xml ,
true , // computeWordIds?
m_niceness ))
return NULL;
// we got it
m_wordsValid = true;
return &m_words;
}
Bits *XmlDoc::getBits ( ) {
// return it if it is set
if ( m_bitsValid ) return &m_bits;
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Bits *)words;
// now set what we need
if ( ! m_bits.set ( words , m_version , m_niceness ) )
return NULL;
// we got it
m_bitsValid = true;
return &m_bits;
}
Bits *XmlDoc::getBitsForSummary ( ) {
// return it if it is set
if ( m_bits2Valid ) return &m_bits2;
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Bits *)words;
// now set what we need
if ( ! m_bits2.setForSummary ( words ) ) return NULL;
// we got it
m_bits2Valid = true;
return &m_bits2;
}
Pos *XmlDoc::getPos ( ) {
// return it if it is set
if ( m_posValid ) return &m_pos;
// this will set it if necessary
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Pos *)ww;
//Sections *sections = getSections();
//if ( !sections ||sections==(Sections *)-1) return(Pos *)sections;
// now set what we need
//if ( ! m_pos.set ( ww , sections ) ) return NULL;
if ( ! m_pos.set ( ww , NULL ) ) return NULL;
// we got it
m_posValid = true;
return &m_pos;
}
Phrases *XmlDoc::getPhrases ( ) {
// return it if it is set
if ( m_phrasesValid ) return &m_phrases;
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Phrases *)words;
// get this
Bits *bits = getBits();
// bail on error
if ( ! bits ) return NULL;
// now set what we need
if ( ! m_phrases.set ( words ,
bits ,
true , // use stop words
false , // use stems
m_version ,
m_niceness ) )
return NULL;
// we got it
m_phrasesValid = true;
return &m_phrases;
}
/*
Synonyms *XmlDoc::getSynonyms ( ) {
// return if already set
if ( m_synonymsValid ) return &m_synonyms;
// this will set it if necessary
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (Synonyms *)words;
Phrases *phrases = getPhrases ();
if ( ! phrases || phrases == (void *)-1 ) return (Synonyms *)phrases;
uint8_t *lv = getLangVector();
if ( ! lv || lv == (void *)-1 ) return (Synonyms *)lv;
// primary language of the document
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (Synonyms *) langId;
// . now set what we need
// . provide a buf for which synonyms can be stored if we need to
SafeBuf *synBuf = NULL;
if ( m_pbuf || m_storeTermListInfo ) synBuf = &m_synBuf;
// force on for printing out the synonyms in the loop below
//synBuf = &m_synBuf;
if ( ! m_synonyms.set ( words,
(char *)lv,
(char)*langId,phrases,
m_niceness,synBuf) )
return NULL;
// we got it
m_synonymsValid = true;
return &m_synonyms;
}
*/
Sections *XmlDoc::getExplicitSections ( ) {
// these sections might or might not have the implied sections in them
if ( m_explicitSectionsValid ) return &m_sections;
// if json forget this it is only html
//uint8_t *ct = getContentType();
//if ( ! ct || ct == (void *)-1 ) return (Sections *)ct;
//if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) {
// m_sectionsValid = true;
// return &m_sections;
//}
setStatus ( "getting explicit sections" );
// use the old title rec to make sure we parse consistently!
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (Sections *)pod;
// shortcut
//XmlDoc *od = *pod;
// if the serialized section is valid, use that
//char *sd = NULL;
//bool valid = false;
//if ( od && od->m_sectionsReplyValid ) valid = true;
//if ( valid ) sd = od->ptr_sectionsReply;
// shouldn't we use the section data in ptr_sections for this???
//bool valid = m_sectionsReplyValid ;
//char *sd = NULL;
//if ( valid ) sd = ptr_sectionsReply;
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Sections *)words;
// need these too now
Phrases *phrases = getPhrases();
if ( ! phrases || phrases == (void *)-1 ) return (Sections *)phrases;
// get this
Bits *bits = getBits();
// bail on error
if ( ! bits ) return NULL;
// the site hash
int64_t *sh64 = getSiteHash64();
// sanity check
if ( ! sh64 && ! g_errno ) { char *xx=NULL; *xx=0; }
if ( ! sh64 || sh64 == (void *)-1 ) return (Sections *)sh64;
// the docid
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Sections *)d;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
setStatus ( "getting sections");
//char *sv = NULL;
//if ( m_setFromTitleRec ) sv = ptr_sectionsVotes;
// time this so the log message below can flag slow urls
int64_t start = gettimeofdayInMillisecondsLocal();
// this uses the sectionsReply to see which sections are "text", etc.
// rather than compute it expensively
if ( ! m_calledSections &&
// we get malformed sections error for some diffbot replies
//*ct != CT_JSON &&
! m_sections.set ( &m_words ,
&m_phrases ,
bits ,
getFirstUrl() ,
*d ,
*sh64 , // 64 bits
cr->m_coll ,
m_niceness ,
m_masterState , // state
m_masterLoop , // callback
*ct ,
&m_dates ,
NULL , // sd // sections data
true , // sections data valid?
NULL , // sv // for m_nsvt
//*tph ,
NULL , // buf
0 )) { // bufSize
m_calledSections = true;
// sanity check, this should not block, we are setting
// exclusively from the titleRec
//if ( sd ) { char *xx=NULL;*xx=0; }
// it blocked, return -1
return (Sections *) -1;
}
int64_t end = gettimeofdayInMillisecondsLocal();
if ( end - start > 1000 )
log("build: %s section set took %"INT64" ms",
m_firstUrl.m_url,end -start);
// error? ETAGBREACH for example... or maybe ENOMEM
if ( g_errno ) return NULL;
// set inlink bits
m_bits.setInLinkBits ( &m_sections );
// we got it
m_explicitSectionsValid = true;
return &m_sections;
}
Sections *XmlDoc::getImpliedSections ( ) {
if ( m_impliedSectionsValid ) return &m_sections;
// get the sections without implied sections
Sections *sections = getExplicitSections();
if ( ! sections || sections==(void *)-1) return (Sections *)sections;
// just use that for now if not doing events to save time! because
// adding implied sections really sucks the resources.
m_impliedSectionsValid = true;
return &m_sections;
// this will set it if necessary
Words *words = getWords();
// returns NULL on error, -1 if blocked
if ( ! words || words == (Words *)-1 ) return (Sections *)words;
// get this
Bits *bits = getBits();
// bail on error
if ( ! bits ) return NULL;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
// now we need basic date types to add implied sections that
// have a dow/dom header and tod brother sections
// THIS WAS in getExplicitSections() but now m_wids is NULL.
// m_wids is set in setPart1() called by XmlDoc::getSimpleDates(),
// which calls getExplicitSections().
// . This was called for the benefit of Sections::addImpliedSections()
// but now getAddresses() which we call below ends up calling
// getSimpleDates() which calls m_dates.setPart1() which calls
// m_dates.parseDates() so this is no longer needed i guess.
/*
if ( ! m_dates.parseDates ( words , DF_FROM_BODY , bits,
sections, m_niceness , &m_firstUrl ,
*ct )) {
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("doc: dates3: %s",mstrerror(g_errno));
// this just means we ran out of stack space to parse
// out all the dates, so ignore and continue... that way
// Spider.cpp does not give up and keep retrying us over
// and over again
//if ( g_errno == EBUFOVERFLOW ) g_errno = 0;
// on all other errors, return NULL
if ( g_errno ) return NULL;
}
*/
// if we got no sections it was bad html. so don't go any further
// lest we core in other code..
// it might have also just been an empty doc.
// either way we'll core in getAddresses cuz it calls getSimpleDates
// which will core in Dates::setPart1() trying to use m_sectionPtrs
if ( sections->m_numSections == 0 ) {
m_impliedSectionsValid = true;
// hack to avoid core for empty docs like www.mini-polis.com
sections->m_addedImpliedSections = true;
return &m_sections;
}
// . now set addresses so we can use those to add implied sections
// . this calls getSimpleDates() which calls m_dates.setPart1()
// which calls parseDates again
Addresses *aa = getAddresses ();
if ( ! aa || aa == (void *)-1 ) return (Sections *)aa;
// . now add implied sections
// . return NULL with g_errno set on error
if ( ! m_sections.addImpliedSections ( aa ) ) return NULL;
// we got it
m_impliedSectionsValid = true;
return &m_sections;
}
// add in Section::m_sentFlags bits having to do with our voting tables
Sections *XmlDoc::getSections ( ) {
setStatus("getting sections");
// get the sections without implied sections
Sections *ss = getImpliedSections();
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
// hash the turk votes (each vote maps a contenthash or taghash to
// a value) and use these to set sections sentence flags, etc.
//HashTableX *tvt = getTurkVotingTable ();
//if ( ! tvt || tvt == (void *)-1 ) return (Sections *)tvt;
// returns NULL if our url is root!
//HashTableX *rvt = getRootVotingTable();
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
SectionVotingTable *osvt = getOldSectionVotingTable();
if ( ! osvt || osvt == (void *)-1 ) return (Sections *)osvt;
uint32_t *tph = getTagPairHash32();
if ( ! tph || tph == (uint32_t *)-1 ) return (Sections *)tph;
// need a getUseSectiondb() function...
if ( ! m_useSectiondb ) {
m_sectionsValid = true;
return &m_sections;
}
// start here
Section *si;
/*
// get first sentence in doc
si = ss->m_firstSent;
// do not bother scanning if no votes
if ( osvt->getNumVotes() <= 0 ) si = NULL;
// scan the sentence sections and or in the bits we should
for ( ; si ; si = si->m_nextSent ) {
// breathe
QUICKPOLL(m_niceness);
// combine section tagHash with contentHashAll to get
// the "modified tagHash"
int32_t modified = si->m_tagHash ^ si->m_contentHash;
// save this
float dups = osvt->getNumSampled (modified,SV_TAGCONTENTHASH);
// . getNumSampled() combines both m_nsvt and m_osvt so it
// includes ourselves... NO!... let's change this!
// the osvt should not include votes from us!
// it strips those out in SectionVotingTable::addListOfVotes()
// . if it is a print-friendly version of the same page then
// one of the two should have been deduped and not indexed,
// so be strict with adhering to no more than 1!
if ( dups > 0 ) si->m_flags |= SEC_DUP;
// . content hash must be unique!
// . can detect texty bios repeated throughout the site
// . this is the hash of the words directly in the section
// . HACK: the contentHash is the "tagHash" for this call
// . SectionVote::m_numSampled is how many sections over all
// docs we indexed from this site have this m_contentHash
// . note that it is not restricted to pages with the same
// tagPairHash as us (i.e. pages with similar layouts)
// therefore it is very flexible!!! it is only restricted
// to pages with our same site hash.
// . getNumSampled() combines both m_nsvt and m_osvt so it
// includes ourselves
// . if it is a print-friendly version of the same page then
// one of the two should have been deduped and not indexed,
// so be strict with adhering to no more than 1!
if ( dups > 0 ) continue;
// . must be in a unique section
// . if the section has siblings, skip it!
if ( si->m_numOccurences > 1 ) continue;
// . eliminate dynamic menus
// . like "related posts" menus
// . therefore require that we must be "texty" ...
// . i.e. be like 80% plain text and no more than 20% link text
// . vote on this since in some cases article may be mostly
// just all in anchor text on a few article pages, but on
// other pages it is well-behaved
if ( osvt->getScore ( si->m_tagHash, SV_TEXTY) < .80 )
continue;
// . check for comment sections
// . these are text and the content is unique
// . BUT the section tagHash is typically repeated at least
// once on some other pages (HOPEFULLY!!!!)
// . if we only require there be X other pages from this site
// with the same layout, we might get unlucky in that each
// page has 1 or less comments!!! how to fix???
// . anyway, we ask for the max # sampled from all of the votes
// here because if just one page has 2+ copies of this
// section enum tag hash, that is enough to be a comment
// section
// . SV_TEXTY_MAX_SAMPLED is a statistic compiled from the
// voters and does not actually exist in sectiondb per se.
// we add this statistic transparently in addVote() below
// . it just gets the num sampled from the voter that had the
// maximum m_numSampled value, because we don't want an
// average in this case
if ( osvt->getNumSampled(si->m_tagHash,SV_TEXTY_MAX_SAMPLED)>0)
continue;
// set it
si->m_flags |= SEC_ARTICLE;
// tally it up
//m_numAlnumWordsInArticle += si->m_exclusive;
// and another flag
//m_hadArticle = true;
}
*/
//
// . how many other pages from this site have our tagpairhash?
// . that is all the unique adjacent tag pair hashes xor'd together
// . kind of represents the template of the webpage, ideally
//
//int32_t numSimLayouts = osvt->getNumSampled ( *tph , SV_TAGPAIRHASH );
///////////////////////////////////////
//
// set m_dupVotes and m_notDupVotes for each section
//
// answers the question... out of all the pages with this taghash,
// from this site, how often is this content repeated?
//
// trumba.com often repeats an event on its various feeds, but
// not on EVERY page. so we should adjust the event title penalties
// based on the ratio of repeated to not-repeated from the various
// pages on the site that have the same *taghash*
//
///////////////////////////////////////
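// in other words, for each sentence: m_votesForDup = how many pages from
// this site had the same tag hash AND the same sentence content, and
// m_votesForNotDup = pages that had the tag hash but different content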
// get first sentence in doc
si = ss->m_firstSent;
// do not bother scanning if no votes
if ( osvt->getNumVotes() <= 0 ) si = NULL;
// assume no dups
m_maxVotesForDup = 0;
// scan the sentence sections and or in the bits we should
for ( ; si ; si = si->m_nextSent ) {
// breathe
QUICKPOLL ( m_niceness );
// sanity check
if ( ! si->m_sentenceContentHash64 ) { char *xx=NULL;*xx=0; }
// how many pages from this site have this taghash for
// a sentence
float nt;
nt = osvt->getNumSampled(si->m_turkTagHash32,SV_TURKTAGHASH);
// skip if nobody! (except us)
if ( nt <= 0.0 ) continue;
// . get out tag content hash
// . for some reason m_contentHash is 0 for like menu-y sectns
int32_t modified =si->m_turkTagHash32^si->m_sentenceContentHash64;
// . now how many pages also had same content in that tag?
// . TODO: make sure numsampled only counts a docid once!
// and this is not each time it occurs on that page.
float nsam = osvt->getNumSampled(modified,SV_TAGCONTENTHASH);
// cast it to a int32_t
int32_t votes1 = (int32_t)nsam;
// by default, complement
int32_t votes2 = (int32_t)nt - votes1;
// store votes
si->m_votesForDup = votes1;
si->m_votesForNotDup = votes2;
// what's the most dup votes we had...
if ( votes1 > m_maxVotesForDup ) m_maxVotesForDup = votes1;
// set it
//if ( si->m_votesForDup > 2 * si->m_votesForNotDup &&
// si->m_votesForDup >= 1 &&
// ! (si->m_flags & SEC_HAS_NONFUZZYDATE) )
// si->m_sentFlags |= SENT_DUP_SECTION;
}
m_sectionsValid = true;
return &m_sections;
}
SectionVotingTable *XmlDoc::getNewSectionVotingTable ( ) {
if ( m_nsvtValid ) return &m_nsvt;
// need sections
Sections *ss = getSections();
if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
// and dates
Dates *dp = getDates();
if ( ! dp || dp == (Dates *)-1 ) return (SectionVotingTable *)dp;
// hash of all adjacent tag pairs
uint32_t *tph = getTagPairHash32 ( ) ;
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
// are we a site root url?
//char *isRoot = getIsSiteRoot();
//if ( ! isRoot || isRoot == (char *)-1 )
// return (SectionVotingTable *)isRoot;
// init table
if ( ! m_nsvt.init ( 4096,"nsvt",m_niceness) ) return NULL;
// . tally the section votes from the sections class
// . only add the date votes, not the taghash/contenthash keys
// from the root, since we add those from the root voting table
// into m_osvt directly!
// . we no longer have root voting table!
// . this adds keys of the hash of each tag xpath
// . and it adds keys of the hash of each tag path PLUS its innerhtml
if ( ! ss->addVotes ( &m_nsvt , *tph ) ) return NULL;
// tally the section votes from the dates
if ( ! dp->addVotes ( &m_nsvt ) ) return NULL;
// our new section voting table is now valid, and ready to be added
// to sectiondb by calling SectionVotingTable::hash()
m_nsvtValid = true;
return &m_nsvt;
}
// . scan every section and look up its tag and content hashes in
// sectiondb to find out how many pages and sites have the same hash
// . use the secondary sectiondb key, key2
// . then store the stats in the Sections::m_stats class
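// . note on control flow: m_si is the resume pointer, so when a
//   getSectionStats() lookup blocks we return -1 and pick up where we
//   left off on re-entry, keeping at most ~10 multicast lookups in
//   flight (UdpServer.cpp limits outstanding 0x39 requests)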
Sections *XmlDoc::getSectionsWithDupStats ( ) {
Sections *ss = getSections();
if ( !ss ||ss==(Sections *)-1) return(Sections *)ss;
if ( m_gotDupStats ) return ss;
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Sections *)sh32;
uint32_t siteHash32 = (uint32_t)*sh32;
//int64_t *shp64 = getSiteHash64();
//if ( ! shp64 || shp64 == (void *)-1 ) return (Sections *)shp64;
//int64_t siteHash48 = *shp64 & 0x0000ffffffffffffLL;
// first time called? then init m_nextSection.
//Section *si = m_si;
// if this is -1, we are called for the first time
if ( m_si == (void *)-1 ) {
m_si = ss->m_rootSection;
m_mcastRequestsIn = 0;
m_mcastRequestsOut = 0;
m_secStatsErrno = 0;
}
//sec_t menuFlags = SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ;
for ( ; m_si ; m_si = m_si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index.
if ( ! ( m_si->m_flags & SEC_HASHXPATH ) )
continue;
// skip if sentence, only hash tags now i guess for diffbot
//if ( m_si->m_sentenceContentHash64 )
// continue;
// get hash of sentences this tag contains indirectly
uint32_t val32 = (uint32_t)m_si->m_indirectSentHash64;
if ( ! val32 )
continue;
// skip if menu!
//if ( m_si->m_flags & menuFlags ) continue;
// get section xpath hash combined with sitehash
uint32_t secHash32 = m_si->m_turkTagHash32 ^ siteHash32;
// convert this to 32 bits
uint32_t innerHash32 ;
//sentHash32 = (uint32_t)m_si->m_sentenceContentHash64;
innerHash32 = (uint32_t)m_si->m_indirectSentHash64;
// save in case we need to read more than 5MB
//m_lastSection = si;
// . does a gbfacets:gbxpathsitehashxxxxxx query on secHash32
// . we hack the "sentContentHash32" into each posdb key
// as the "value" so we can do a facet-like histogram
// over all the possible values this xpath has for this site
SectionStats *stats = getSectionStats ( secHash32,
innerHash32,
false ); // cache only?
// it returns -1 if would block
if ( stats == (void *)-1 ) {
// count it as outstanding
//m_mcastRequestsOut++;
// launch more if we have room
// UdpServer.cpp has a limit of 10 on 0x39 requests
if ( m_mcastRequestsOut - m_mcastRequestsIn < 10)
continue;
// advance m_si so we do not repeat
m_si = m_si->m_next;
// otherwise, return -1 to indicate blocked
return (Sections *)-1;
}
// NULL means g_errno
if ( ! stats ) {
// ensure g_errno is set
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// save it
m_secStatsErrno = g_errno;
// clear it
g_errno = 0;
// if still waiting though return -1
if ( m_mcastRequestsOut > m_mcastRequestsIn )
return (Sections *)-1;
// otherwise, all done i guess
return NULL;
}
// if already in the table, skip it!
}
// waiting for more replies to come back?
if ( m_mcastRequestsOut > m_mcastRequestsIn )
return (Sections *) -1;
// now scan the sections and copy the stats from the table
// into Section::m_stats of each sentence section.
// use the key hash as the hash of the tag/xpath and the innerhtml
// and the val instead of being site hash will be hash of the
// content. then we can get the histogram of our content hash
// for this xpath on our site.
Section *si = ss->m_rootSection;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// skip if no content to hash
//if ( ! si->m_sentenceContentHash64 ) continue;
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index
if ( ! ( si->m_flags & SEC_HASHXPATH ) )
continue;
// skip if sentence, only hash tags now i guess for diffbot
//if ( si->m_sentenceContentHash64 )
// continue;
// get hash of sentences this tag contains indirectly
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
if ( ! val32 )
continue;
// skip if menu!
//if ( si->m_flags & menuFlags ) continue;
// get section xpath hash combined with sitehash
uint32_t secHash32 = si->m_turkTagHash32 ^ siteHash32;
// convert this to 32 bits
uint32_t innerHash32 ;
innerHash32 = (uint32_t)si->m_indirectSentHash64;
// the "stats" class should be in the table from
// the lookups above!!
SectionStats *stats = getSectionStats ( secHash32,
innerHash32,
true ); // cache only?
// sanity
//if ( ! stats || stats == (void *)-1 ) { char *xx=NULL;*xx=0;}
// must have had a network error or something
if ( ! stats ) continue;
// copy
gbmemcpy ( &si->m_stats , stats, sizeof(SectionStats) );
}
//
// now if a section has no stats but has the same
// m_indirectSentHash64 as a kid, take his stats
//
Section *sx = ss->m_rootSection;
for ( ; sx ; sx = sx->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index
if ( ! ( sx->m_flags & SEC_HASHXPATH ) )
continue;
// scan up parents and set their stats to ours as long as
// they have the same indirect sent hash64
Section *p = sx->m_parent;
for ( ; p ; p = p->m_parent ) {
// if parent is like an img tag, skip it
if ( p->m_tagId == TAG_IMG )
continue;
if ( p ->m_indirectSentHash64 !=
sx->m_indirectSentHash64 )
break;
// copy it to parent with the same inner html hash
gbmemcpy (&p->m_stats,&sx->m_stats,sizeof(SectionStats));
}
}
// now free the table's mem
m_sectionStatsTable.reset();
m_gotDupStats = true;
return ss;
}
static void gotReplyWrapper39 ( void *state1 , void *state2 ) {
//XmlDoc *THIS = (XmlDoc *)state;
XmlDoc *THIS = (XmlDoc *)state1;
Multicast *mcast = (Multicast *)state2;
THIS->gotSectionFacets ( mcast );
// this will end up calling getSectionsWithDupStats() again
// which will call getSectionStats() some more on new sections
// until m_gotDupStats is set to true.
THIS->m_masterLoop ( THIS->m_masterState );
}
// . launch a single msg3a::getDocIds() for a section hash, secHash32
SectionStats *XmlDoc::getSectionStats ( uint32_t secHash32 ,
uint32_t innerHash32 ,
bool cacheOnly ) {
// init cache?
if ( m_sectionStatsTable.m_numSlots == 0 &&
! m_sectionStatsTable.set(4,
sizeof(SectionStats),
32,
NULL,
0,
false,
m_niceness,
"secstatsch"))
return NULL;
// check in cache...
SectionStats *stats ;
stats = (SectionStats *)m_sectionStatsTable.getValue ( &secHash32 );
// if there, return it
if ( stats ) return stats;
// if cache only do not launch
if ( cacheOnly ) return NULL;
//
// TODO: shard gbxpathsitehashxxxxx by termid
// and make sure msg3a only sends to that single shard and sends
// the stats back. should make us much faster to sectionize
// a web page. but for now try without it...
//
//int32_t *sh32 = getSiteHash32();
//if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SectionStats *)sh32;
int32_t maxOut = 32;
// . need to make new msg39Request and a new Multicast arrays
// . only need multicast since these gbfacetstr:gbxpathsitehash123456
// terms are sharded by termid, otherwise we'd have to use msg3a
if ( ! m_mcastArray ) {
// how much mem to alloc?
int32_t need = 0;
need += sizeof(Multicast);
need += sizeof(Msg39Request);
// query buf str
need += 100;
need *= maxOut;
// a single query now to be shared
//need += sizeof(Query);
// just in case we are being re-used
m_mcastBuf.reset();
// alloc space
if ( ! m_mcastBuf.reserve(need) ) return NULL;
// point to buf
char *p = m_mcastBuf.getBufStart();
// set them up
m_mcastArray = (Multicast *)p;
p += sizeof(Multicast) * maxOut;
m_msg39RequestArray = (Msg39Request *)p;
p += sizeof(Msg39Request) * maxOut;
//m_queryArray = (Query *)p;
//p += sizeof(Query) * maxOut;
//m_sharedQuery = (Query *)p;
//p += sizeof(Query);
// for holding the query string
// assume query will not exceed 100 bytes including \0
m_queryBuf = p;
p += 100 * maxOut;
// initialize all!
for ( int32_t i = 0 ; i < maxOut ; i++ ) {
m_mcastArray [i].constructor();
m_msg39RequestArray[i].reset();//constructor();
//m_queryArray [i].constructor();
m_queryBuf[100*i] = '\0';
//m_inUse[i] = 0;
}
}
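// Illustrative sketch of the m_mcastBuf layout set up above (assuming
// maxOut is 32 as initialized above):
//
//   [Multicast x 32][Msg39Request x 32][100-byte query strings x 32]
//   ^m_mcastArray   ^m_msg39RequestArray                 ^m_queryBuf
//
// slot i of each array belongs to one outstanding facet lookup, and its
// query string lives at m_queryBuf + 100 * i.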
// get first available
int32_t i;
for ( i = 0 ; i < maxOut ; i++ )
if ( ! m_mcastArray[i].m_inUse ) break;
// wtf?
if ( i >= maxOut ) { char *xx=NULL;*xx=0; }
// and our vehicle
Multicast *mcast = &m_mcastArray[i];
// mark as in use up here in case we quickpoll into this same code?!
// yeah, i guess set2() calls quickpoll?
//mcast->m_inUse = 1;
// save this for reply
//mcast->m_hack = this;
char *qbuf = m_queryBuf + 100 * i;
// . hash this special term (was gbsectionhash)
// . the wordbits etc will be a number though, the hash of the content
// of the xpath, the inner html hash
// . preceding this term with gbfacet: will make gigablast return
// the statistics for all the values in the posdb keys of this
// termlist, which happen to be innerHTML hashes for all pages
// with this same xpath and on this same site.
sprintf(qbuf,"gbfacetstr:gbxpathsitehash%"UINT32"",
(uint32_t)secHash32);
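// For example, if secHash32 were 305419896 (a hypothetical value) the
// query built above would be:
//
//   gbfacetstr:gbxpathsitehash305419896
//
// i.e. one facet term whose posdb values are the innerHTML hashes seen
// for this xpath on this site.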
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// set the msg39 request
Msg39Request *r = &m_msg39RequestArray[i];
// reset all to defaults
r->reset();
//r-> ptr_coll = cr->m_coll;
//r->size_coll = gbstrlen(cr->m_coll)+1;
r->m_collnum = cr->m_collnum;
r->m_maxAge = 60; // cache timeout?
r->m_addToCache = true;
r->m_docsToGet = 0; // just calc stats
r->m_niceness = m_niceness;
r->m_debug = 0;
r->m_doSiteClustering = false;
//r->m_doIpClustering = false;
r->m_doDupContentRemoval = false;
r->m_boolFlag = 2;
r->m_familyFilter = 0;
r->m_language = 0;
r->ptr_query = qbuf;//m_sectionHashQueryBuf;
r->size_query = gbstrlen(r->ptr_query)+1;
r->m_timeout = 3600; //-1;// auto-determine based on #terms
r->m_maxQueryTerms = 10;
// how much of each termlist to read in bytes
int32_t readList = 10000;
r-> ptr_readSizes = (char *)&readList;
r->size_readSizes = 4;
// term freqs
float tfw = 1.0;
r-> ptr_termFreqWeights = (char *)&tfw;
r->size_termFreqWeights = 4;
// speed it up some with this flag
r->m_forSectionStats = true;
// only do a single read of docids... do not split up
r->m_numDocIdSplits = 1;
// 1 query term
r->m_nqt = 1;
///////////////////////
//
// this tells msg3a/msg39/posdbtable it's a hack! no need to do this
// because it's implied by the query.
// BUT REALLY let's eliminate this and just make our queries like
// gbfacet:gbxpathsitehash1234567 where 1234567 is the hash of
// the section's xpath with the site. the values of that term in
// the posdb key will be 32-bit hashes of the innerHtml for such
// sections from all pages with the same xpath on the same site.
// so no need for this now, comment out.
//
//r->m_getFacetStats = true;
//
/////////////////////////
// we need to know what site is the base site so the section stats
// can set m_onSiteDocIds and m_offSiteDocIds correctly
//r->m_siteHash32 = *sh32;
// . now we use the hash of the innerHtml of the xpath
// . this is our value for the facet field of gbxpathsitehash12345678
// which is the hash of the innerHTML for that xpath on this site.
// 12345678 is the hash of the xpath and the site.
//r->m_myFacetVal32 = sentHash32;
//Query *qq = &m_queryArray[i];
// set query for msg3a. queryExpansion=false
//qq->set2 ( r->ptr_query , langUnknown , false );
Query qq;
qq.set2 ( r->ptr_query , langUnknown , false );
// TODO: ensure this just hits the one host since it is sharded
// by termid...
// what shard owns this termlist. we shard these
// gbfacetstr:gbxpathsitehash123456 terms by termid.
int64_t termId = qq.getTermId(0);
int32_t shardNum = getShardNumFromTermId ( termId );
// hack in our inner html content hash for this xpath
mcast->m_hack32 = innerHash32;
mcast->m_hack64 = secHash32;
// malloc and store the request. mcast will free it when done.
int32_t reqSize;
char *req = serializeMsg ( sizeof(Msg39Request),
&r->size_readSizes,
&r->size_whiteList,
&r->ptr_readSizes,
r,
&reqSize,
NULL,
0,
false);
// . send out a msg39 request to each shard
// . multicasts to a host in group "groupId"
// . we always block waiting for the reply with a multicast
// . returns false and sets g_errno on error
// . sends the request to fastest host in group "groupId"
// . if that host takes more than about 5 secs then sends to
// next host
// . key should be largest termId in group we're sending to
bool status;
status = mcast->send ( req , // m_rbufPtr ,
reqSize , // request size
0x39 , // msgType 0x39
true , // mcast owns m_request?
shardNum , // group to send to
false , // send to whole group?
0,//(int32_t)qh , // 0 // startKey.n1
this , // state1 data
mcast , // state2 data
gotReplyWrapper39 ,
30 , //timeout in secs
m_niceness,//m_r->m_niceness ,
false , // realtime?
-1, // firstHostId, // -1// bestHandlingHostId ,
NULL , // m_replyBuf ,
0 , // MSG39REPLYSIZE,
// this is true if multicast should free the
// reply, otherwise caller is responsible
// for freeing it after calling
// getBestReply().
// actually, this should always be false,
// there is a bug in Multicast.cpp.
// no, if we error out and never steal
// the buffers then they will go unfreed
// so they are freed by multicast by default
// then we steal control explicitly
true );
m_mcastRequestsOut++;
// if successfully launched, wait...
if ( status ) return (SectionStats *) -1;
// error?
if ( g_errno ) return NULL;//{ mcast->m_inUse = 0; return NULL; }
// sets &m_sectionStats and adds to the table
gotSectionFacets ( mcast );
// i guess did not block...
//return &msg3a->m_sectionStats;
return &m_sectionStats;
}
// . come here when msg39 got the ptr_faceHashList for our single
// gbfacet:gbxpathsitehash
// . returns false and sets g_errno on error
bool XmlDoc::gotSectionFacets ( Multicast *mcast ) {
//SectionStats *stats = &msg39->m_sectionStats;
if ( mcast->m_inUse ) { char *xx=NULL;*xx=0;}
// count it as returned
m_mcastRequestsIn++;
// mark it as available now
int32_t num = mcast - m_mcastArray;
// sanity
//if ( ! msg39->m_inUse ) { char *xx=NULL;*xx=0; }
// grab the xpath/site hash
uint32_t secHash32 = mcast->m_hack64;
// and our inner html for that xpath
int32_t myFacetVal32 = mcast->m_hack32;
// sanity. should only be a gbfacet:gbxpathsitehash12345567 term.
//if ( mcast->m_q->m_numTerms != 1 ) { char *xx=NULL;*xx=0; }
// reset all counts to 0
m_sectionStats.reset();
//////
//
// compile m_sectionStats
//
///////
// set m_sectionStats from the list of facet values for this
// gbfacet:xpathsitehash term...
// Query::m_queryTerm.m_facetHashTable has the facets merged
// from all the shards. so now compute the stats from them.
// set the section stats.
//QueryTerm *qt = &msg3a->m_q->m_qterms[0];
//HashTableX *ft = &qt->m_facetHashTable;
// . get the list of facet field/value pairs.
// . see how Msg3a.cpp merges these to see how they are stored
Msg39Reply *mr = (Msg39Reply *)mcast->m_readBuf;//getBestReply();
// this is NULL with g_errno set on error
if ( ! mr ) {
log("xmldoc: got error from sec stats mcast: %s",
mstrerror(g_errno));
return false;
}
deserializeMsg ( sizeof(Msg39Reply) ,
&mr->size_docIds,
&mr->size_clusterRecs,
&mr->ptr_docIds,
mr->m_buf );
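// A rough sketch of the ptr_facetHashList layout the loop below expects
// (sizes in bytes, one termid's worth of facets):
//
//   [termId (8)][numUniqueVals nh (4)][val32 count32][val32 count32]...
//
// each (val32,count32) pair is a distinct innerHTML hash for this
// xpath/site and how many docids had it. if val32 equals our own
// innerHTML hash we credit those docids as "matches".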
char *p = (char *)(mr->ptr_facetHashList);
//char *pfinal = p + mr->size_facetHashList;
//
// should only be one termid of facets in here, so no need to re-loop
//
int32_t nh = 0;
// "matches" is how many docids with this facet field had our facet val
int32_t matches = 0;
// "totalDocIds" is how many docids had this facet field
int32_t totalFields = 0;
if ( p ) {
// first is the termid
//int64_t termId = *(int64_t *)p;
// skip that
p += 8;
// the # of unique 32-bit facet values
nh = *(int32_t *)p;
p += 4;
// the end point
char *pend = p + (8 * nh);
// now compile the facet hash list into there
for ( ; p < pend ; ) {
// does this facet value match ours?
// (i.e. same inner html?)
if ( *(int32_t *)p == myFacetVal32 )
matches += *(int32_t *)(p+4);
p += 4;
// now how many docids had this facet value?
totalFields += *(int32_t *)p;
p += 4;
}
}
// how many unique inner html content hashes for this xpath/site
// hash were there?
m_sectionStats.m_numUniqueVals = nh;//ft->m_numSlotsUsed;
// how many xpaths existed over all docs. doc can have multiple.
m_sectionStats.m_totalEntries = totalFields;
// total # unique docids that had this facet
m_sectionStats.m_totalDocIds = mr->m_estimatedHits;//totalHits;
// how many had the same inner html content hash for
// this xpath/site as we did?
m_sectionStats.m_totalMatches = matches;
////////
//
// store m_sectionStats in cache
//
////////
// cache them. this does a copy of m_sectionStats
if ( ! m_sectionStatsTable.addKey ( &secHash32 , &m_sectionStats ) )
log("xmldoc: failed to add sections stats: %s",
mstrerror(g_errno));
// reset that msg39 to free its data
//msg39->reset();
if ( mcast != &m_mcastArray[num] ) { char *xx=NULL;*xx=0; }
// . make it available again
// . do this after all in case we were in quickpoll interrupting
// the getSectionStats() function below
//mcast->m_inUse = 0;
// free query Query::m_qwords array etc. to stop mem leaks
m_mcastArray [num].reset();
m_msg39RequestArray[num].reset();
//m_queryArray [num].reset();
// now when the master loop calls getSectionsWithDupStats() it
// should find the stats class in the cache!
return true;
}
// . for all urls from this subdomain...
// . EXCEPT root url since we use msg17 to cache that, etc.
SectionVotingTable *XmlDoc::getOldSectionVotingTable ( ) {
if ( m_osvtValid ) return &m_osvt;
// do not consult sectiondb if we are set from the title rec,
// that way we avoid parsing inconsistencies since sectiondb changes!
if ( m_setFromTitleRec ) {
char *p = ptr_sectiondbData;
m_osvtValid = true;
m_osvt.m_totalSiteVoters = 0;
if ( size_sectiondbData <= 4 ) return &m_osvt;
m_osvt.m_totalSiteVoters = *(int32_t *)p;
p += 4;
int32_t remaining = size_sectiondbData - 4;
m_osvt.m_svt.deserialize(p,remaining,m_niceness);
return &m_osvt;
}
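// For reference, ptr_sectiondbData (deserialized above and rebuilt at
// the bottom of this function) is laid out roughly as:
//
//   [int32_t m_totalSiteVoters][serialized m_osvt.m_svt hash table]
//
// so a size of 4 or less means no votes were recorded.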
// returns empty table if WE are the site root url!
//HashTableX *rvt = getRootVotingTable();
//if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt;
// need sections
//Sections *ss = getSections();
//if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss;
// hash of all adjacent tag pairs
uint32_t *tph = getTagPairHash32 ( ) ;
if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph;
int64_t *siteHash64 = getSiteHash64();
if ( ! siteHash64 || siteHash64 == (void *)-1 )
return (SectionVotingTable *)siteHash64;
// the docid
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (SectionVotingTable *)d;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . for us, dates are really containers of the flags and tag hash
// . init this up here, it is re-set if we re-call getSectiondbList()
// because there were too many records in it to handle in one read
if ( m_numSectiondbReads == 0 ) {
// init table
if ( ! m_osvt.init ( 8192,"osvt",m_niceness) ) return NULL;
// use site hash as the main thing
int64_t termId = *siteHash64 & TERMID_MASK;
// . start key for reading list from sectiondb
// . read all the section votes for this site
m_sectiondbStartKey = g_datedb.makeStartKey(termId,0xffffffff);
// how many reads we have to do...
m_numSectiondbNeeds = 1;
}
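// A minimal sketch of the read range set up here: TERMID_MASK keeps
// just the termid portion (effectively the low 48 bits) of the site
// hash, e.g.
//
//   int64_t termId = *siteHash64 & TERMID_MASK;
//
// and the start/end keys bracket every sectiondb record for that
// termid, so one (possibly re-called) read sweeps all of this site's
// section votes.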
//bool skipRecall = false;
// always read 5MB at a time from sectiondb
int32_t minRecSizes = 5000000;
// crap! host #28 is being totally slammed!!!!!
// why?????? in the meantime do this
//minRecSizes = 100000;
//skipRecall = true;
// is it facebook?
bool limitSectiondb = false;
// limit now to speed up repair rebuild
// limit now to speed up injection!
limitSectiondb = true;
// facebook lists often clog the tree, and when we read 2MB worth of
// it, it takes 100ms, so reduce to 50k to so it takes 2.5ms...
// because facebook is a well structured xml feed so why read any
// really!
if ( limitSectiondb ) minRecSizes = 50000;
key128_t *lastKey = NULL;
// if msg0 blocked and came back with g_errno set, like
// in preparing to merge it got an OOM
if ( g_errno ) {
log("build: sectiondb read2: %s",mstrerror(g_errno));
return NULL;
}
readLoop:
// before looking up TitleRecs using Msg20, let's first consult
// datedb to see if we got adequate data as to what sections
// are the article sections
// only get the list once
if ( m_numSectiondbReads < m_numSectiondbNeeds ) {
// only do this once
m_numSectiondbReads++;
// make the termid
uint64_t termId = *siteHash64 & TERMID_MASK;
// end key is always the same
key128_t end = g_datedb.makeEndKey ( termId , 0 );
// shortcut
Msg0 *m = &m_msg0;
// get the group this list is in (split = false)
uint32_t shardNum;
shardNum = getShardNum ( RDB_SECTIONDB,(char *)&m_sectiondbStartKey);
// we need a group # from the groupId
//int32_t split = g_hostdb.getGroupNum ( gid );
// note it
//logf(LOG_DEBUG,"sections: "
// "reading list from sectiondb: "
// "sk.n1=0x%"XINT64" sk.n0=0x%"XINT64" "
// "ek.n1=0x%"XINT64" ek.n0=0x%"XINT64" "
// ,m_sectiondbStartKey.n1
// ,m_sectiondbStartKey.n0
// ,end.n1
// ,end.n0
// );
// . get the list
// . gets all votes for one particular site
if ( ! m->getList ( -1 , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // addToCache
RDB_SECTIONDB , // was RDB_DATEDB
cr->m_collnum ,
&m_secdbList ,
(char *)&m_sectiondbStartKey ,
(char *)&end ,
minRecSizes ,
m_masterState ,
m_masterLoop ,
m_niceness , // MAX_NICENESS
// default parms follow
true , // doErrorCorrection?
true , // includeTree?
true , // doMerge?
-1 , // firstHostId
0 , // startFileNum
-1 , // numFiles
999995 , // timeout
-1 , // syncPoint
-1 , // preferLocalReads
NULL , // msg5
NULL , // msg5b
false , // isrealmerge?
true , // allowpagecache?
false , // forceLocalIndexdb?
false , // doIndexdbSplit?
shardNum ) )//split ))
// return -1 if blocks
return (SectionVotingTable *)-1;
// error?
if ( g_errno ) {
log("build: sectiondb read: %s",mstrerror(g_errno));
return NULL;
}
}
// it also returns the lastKey in the list so we can use that to
// set the startKey for a re-call if we read >= 5MB
lastKey = NULL;
//logf(LOG_DEBUG,"sections: read list of %"INT32" bytes",
// m_secdbList.m_listSize);
bool recall = true;
if ( m_secdbList.m_listSize + 24 < minRecSizes ) recall = false;
// . unless it had special byte set in Msg0.cpp HACK
// . we send back a compressed list and tack on an extra 0 byte at
// the end so that we know we had a full list!
if ( (m_secdbList.m_listSize % 2) == 1 ) {
m_secdbList.m_listSize--;
m_secdbList.m_listEnd --;
recall = true;
}
// no longer bother re-calling, because facebook is way slow...
if ( limitSectiondb ) recall = false;
// . returns false and sets g_errno on error
// . compile the votes from sectiondb for this site into a hashtable
// . m_osvt is a SectionVotingTable and each entry in the hashtable
// is a SectionVote class.
// . the taghash is the key of the vote and is a hash of all the
// nested tags the section is in.
// . another vote uses the tag hash hashed with the hash of the
// content contained by the section
// . using these two vote counts we set Section::m_votesForDup
// or Section::m_votesForNotDup counts which let us know how the
// section is repeated or not repeated on the site
// . SectionVote::m_score is always 1.0 from what i can tell
// cuz it seems like addVote*() always uses a score of 1.0
// . SectionVote::m_numSampled is how many times that tagHash
// occurs in the document.
if ( ! m_osvt.addListOfVotes(&m_secdbList,
&lastKey,
*tph,
*d , // docid
m_niceness))
return NULL;
// why is this always zero it seems?
if ( g_conf.m_logDebugBuild )
log("xmldoc: added sectiondblist size=%"INT32" recall=%"INT32"",
m_secdbList.m_listSize,(int32_t)recall);
// . recall? yes if we had to truncate our list...
// . we need to be able to scan all votes for the website... that is
// why we recall here
// . limit votes by a special sectiondb key then that is a vote...
if ( recall ) {
// another debug
//logf(LOG_DEBUG,"sections: recallling read");
// just note it for now
//if ( m_sectiondbRecall > 5 )
if ( m_numSectiondbNeeds > 5 )
logf(LOG_DEBUG,"sect: msg0 sectiondb recall #%"INT32"",
m_sectiondbRecall++);
// we should really limit voting per site! we do now!
//if ( m_recall > 5 ) { char *xx=NULL;*xx=0; }
// update our start key
if ( lastKey ) m_sectiondbStartKey = *lastKey;
// inc by 2 since we already had this key
m_sectiondbStartKey += 2;
// unflag
m_numSectiondbNeeds++;
// and repeat
goto readLoop;
}
//
// set ptr_sectiondbData so this can be set from a title rec without
// having to lookup in sectiondb again which might have changed!
//
m_sectiondbData.purge();
// alloc
int32_t need = m_osvt.m_svt.getStoredSize() + 4;
if ( ! m_sectiondbData.reserve(need) )
// oom error?
return NULL;
// serialize this number
m_sectiondbData.pushLong(m_osvt.m_totalSiteVoters);
// serialize the hashtablex
m_osvt.m_svt.serialize ( &m_sectiondbData );
// reference it for title rec serialization
ptr_sectiondbData = m_sectiondbData.getBufStart();
size_sectiondbData = m_sectiondbData.length();
m_osvtValid = true;
return &m_osvt;
}
int32_t *XmlDoc::getLinkSiteHashes ( ) {
if ( m_linkSiteHashesValid )
return (int32_t *)m_linkSiteHashBuf.getBufStart();
// get the outlinks
Links *links = getLinks();
if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
// . get the outlink tag rec vector
// . each link's tagrec may have a "site" tag that is basically
// the cached SiteGetter::getSite() computation
TagRec ***grv = NULL;
if ( ! m_setFromTitleRec ) {
grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (int32_t *)grv;
}
// how many outlinks do we have on this page?
int32_t n = links->getNumLinks();
// reserve space
m_linkSiteHashBuf.purge();
if ( ! m_linkSiteHashBuf.reserve ( n * 4 ) ) return NULL;
if ( n == 0 ) {
	ptr_linkdbData = NULL;
	size_linkdbData = 0;
	// no outlinks. return a non-NULL dummy ptr so callers do not
	// mistake this for an error.
	return (int32_t *)0x1234;
}
// if set from titlerec then assume each site is the full hostname
// of the link, unless its specified explicitly in the hashtablex
// serialized in ptr_linkdbData
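// A small illustrative example of that serialized form: if outlinks
// #2 and #7 were the only ones whose site differed from their
// hostname, ptr_linkdbData would hold four int32_ts:
//
//   [ 2 ][ siteHash32 of link #2 ][ 7 ][ siteHash32 of link #7 ]
//
// every other link's site hash is just the hash of its hostname.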
if ( m_setFromTitleRec ) {
// this holds the sites that are not just the hostname
int32_t *p = (int32_t *)ptr_linkdbData;
int32_t *pend = (int32_t *)(ptr_linkdbData + size_linkdbData);
// loop over links
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get the link
char *u = links->getLinkPtr(i);
// assume site is just the host
int32_t hostLen = 0;
char *host = ::getHost ( u , &hostLen );
int32_t siteHash32 = hash32 ( host , hostLen , 0 );
// unless given otherwise
if ( p < pend && *p == i ) {
p++;
siteHash32 = *p;
p++;
}
// store that then. should not fail since we allocated
// right above
if ( ! m_linkSiteHashBuf.pushLong(siteHash32) ) {
char *xx=NULL;*xx=0; }
}
// return ptr of array, which is a safebuf
return (int32_t *)m_linkSiteHashBuf.getBufStart();
}
// ptr_linkdbData will point into this buf
m_linkdbDataBuf.purge();
// loop through them
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get the link
char *u = links->getLinkPtr(i);
// get full host from link
int32_t hostLen = 0;
char *host = ::getHost ( u , &hostLen );
int32_t hostHash32 = hash32 ( host , hostLen , 0 );
// get the site
TagRec *gr = (*grv)[i];
char *site = NULL;
int32_t siteLen = 0;
if ( gr ) {
int32_t dataSize = 0;
site = gr->getString("site",NULL,&dataSize);
if ( dataSize ) siteLen = dataSize - 1;
}
// otherwise, make it the host or make it cut off at
// a "/user/" or "/~xxxx" or whatever path component
if ( ! site ) {
// GUESS link site... like /~xxx
site = host;
siteLen = hostLen;
}
int32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 );
// only store if different from the host itself
if ( linkeeSiteHash32 != hostHash32 ) {
if ( ! m_linkdbDataBuf.pushLong(i) )
return NULL;
if ( ! m_linkdbDataBuf.pushLong(linkeeSiteHash32) )
return NULL;
}
// store it always in this buf
if ( ! m_linkSiteHashBuf.pushLong(linkeeSiteHash32) ) {
// space should have been reserved above!
char *xx=NULL;*xx=0; }
}
// set ptr_linkdbData
ptr_linkdbData = m_linkdbDataBuf.getBufStart();
size_linkdbData = m_linkdbDataBuf.length();
m_linkSiteHashesValid = true;
return (int32_t *)m_linkSiteHashBuf.getBufStart();
}
Links *XmlDoc::getLinks ( bool doQuickSet ) {
if ( m_linksValid ) return &m_links;
// set status
setStatus ( "getting outlinks");
// . add links from diffbot reply
// . get the reply of json objects from diffbot
// . this will be empty if we are a json object!
// . will also be empty if not meant to be sent to diffbot
// . the TOKENIZED reply consists of \0 separated json objects that
// we create from the original diffbot reply
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return (Links *)dbr;
// this will set it if necessary
Xml *xml = getXml();
// bail on error
if ( ! xml || xml == (Xml *)-1 ) return (Links *)xml;
// can't call getIsPermalink() here without entering a dependency loop
char *pp = getIsUrlPermalinkFormat();
if ( !pp || pp == (char *)-1 ) return (Links *)pp;
// use the old xml doc
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (Links *)od;
// get Links class of the old title rec
Links *oldLinks = NULL;
// if we were set from a title rec, do not do this
if ( *od ) {
oldLinks = (*od)->getLinks();
if (!oldLinks||oldLinks==(Links *)-1) return (Links *)oldLinks;
}
Url *baseUrl = getBaseUrl();
if ( ! baseUrl || baseUrl==(Url *)-1) return (Links *)baseUrl;
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (Links *)ip;
// this ensures m_contentLen is set
//char **content = getContent();
//if ( ! content || content == (char **)-1 ) return (Links *)content;
// this will set ptr_indCatIds and size_indCatIds
int32_t **pici = getIndCatIds();
if ( ! pici || pici == (void *)-1 ) return (Links *)pici;
char *ict = getIsContentTruncated();
if ( ! ict || ict == (char *)-1 ) return (Links *)ict;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (Links *)sni;
// get the latest url we are on
Url *u = getCurrentUrl();
//
// if we had a EDOCSIMPLIFIEDREDIR error, pretend it is a link
// so addOutlinkSpiderRecsToMetaList() will add it to spiderdb
//
if ( m_indexCodeValid && m_indexCode == EDOCSIMPLIFIEDREDIR ) {
m_links.set ( m_redirUrl.getUrl(),m_redirUrl.getUrlLen() );
m_linksValid = true;
return &m_links;
}
if ( m_indexCodeValid && m_indexCode == EDOCNONCANONICAL ) {
m_links.set(m_canonicalRedirUrl.getUrl(),
m_canonicalRedirUrl.getUrlLen());
m_linksValid = true;
return &m_links;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
bool useRelNoFollow = true;
if ( ! cr->m_obeyRelNoFollowLinks ) useRelNoFollow = false;
// to keep things simple, for diffbot custom crawls, if robots.txt
// is not used then do not use rel no follow
if ( ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
useRelNoFollow = false;
// . set it
// . if parent is a permalink we can avoid its suburl outlinks
// containing "comment" from being classified as permalinks
if ( ! m_links.set ( useRelNoFollow ,
xml ,
u ,
true , // setLinkHashes?
baseUrl ,
m_version ,
m_niceness ,
*pp , // parent url in permalink format?
oldLinks ,// oldLinks, might be NULL!
doQuickSet ,
dbr ) )
return NULL;
m_linksValid = true;
// do not bother setting that bit if we are being called for link
// text because that bit was already in the linkdb key, and it
// was set to zero! so if getting msg20 reply.... bail now
if ( m_req ) return &m_links;
// . apply link spam settings
// . set the "spam bits" in the Links class
setLinkSpam ( *ip ,
ptr_indCatIds ,
size_indCatIds / 4 ,
u , // linker url
*sni ,
xml ,
&m_links ,
*ict ,
m_niceness );
// we got it
return &m_links;
}
HashTableX *XmlDoc::getCountTable ( ) {
// return it if we got it
if ( m_countTableValid ) return &m_countTable;
setStatus ("getting count table");
// get the stuff we need
Xml *xml = getXml ();
if ( ! xml || xml == (Xml *)-1 ) return (HashTableX *)xml;
Words *words = getWords ();
if ( ! words || words == (Words *)-1 ) return (HashTableX *)words;
Phrases *phrases = getPhrases ();
if ( ! phrases || phrases==(Phrases *)-1) return (HashTableX *)phrases;
Bits *bits = getBits ();
if ( ! bits || bits == (Bits *)-1 ) return (HashTableX *)bits;
Sections *sections = getSections();
if ( !sections||sections==(Sections *)-1) return(HashTableX *)sections;
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (HashTableX *)info1;
// . reduce score of words in badly repeated fragments to 0 so we do
// not count them here!
// . ff[i] will have score of 0 if in repeated frag
// . make sure this is stored for whole doc... since we only use it
// for the body
char *fv = getFragVec();
if ( ! fv || fv == (void *)-1 ) return (HashTableX *)fv;
//LinkInfo *info2 = getLinkInfo2();
//if ( ! info2 || info2 == (LinkInfo *)-1 ) return (HashTableX *)info2;
// init our count table otherwise
//if(! m_countTable.set( 8,4,1024,NULL,0,false,m_niceness,"xmlcnttbl"))
// return NULL;
// breathe
QUICKPOLL ( m_niceness );
//
// this was in Weights.cpp, but now it is here...
//
// shortcut
HashTableX *ct = &m_countTable;
// reset the counts, just in case set() below does not
//ct->reset();
// ez var
int64_t *wids = words->getWordIds ();
nodeid_t *tids = words->getTagIds ();
int32_t nw = words->getNumWords ();
char **wptrs = words->m_words;
int32_t *wlens = words->m_wordLens;
int64_t *pids = phrases->getPhraseIds2();
// add 5000 slots for inlink text in hashString_ct() calls below
int32_t numSlots = nw * 3 + 5000;
// only alloc for this one if not provided
if (!ct->set(8,4,numSlots,NULL,0,false,m_niceness,"xmlct"))
return (HashTableX *)NULL;
//char *ff = getFragVec ( ) ;
//if ( ! ff ) return false;
// . now hash all the phrase ids we have in order to see if the phrase
// is unique or not. if phrase is repeated a lot we punish the scores
// of the individual words in the phrase and boost the score of the
// phrase itself. We check for uniqueness down below.
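// Illustrative example (hypothetical text): for the fragment
// "red apple pie" the loop below tallies the word ids for "red",
// "apple" and "pie" plus the phrase ids for "red apple" and
// "apple pie", each with a count of 1 per occurrence.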
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// add the word
if ( wids[i] == 0LL ) continue;
//if ( wids[i] == 708411945052722517LL )
// log("hey4 got new pid=%"INT64" i=%"INT32"",pids[i],i);
// . skip if in repeated fragment
// . unfortunately we truncate the frag vec to like
// the first 80,000 words for performance reasons
if ( i < MAXFRAGWORDS && fv[i] == 0 ) continue;
// accumulate the wid with a score of 1 each time it occurs
if ( ! ct->addTerm ( &wids[i] ) ) return (HashTableX *)NULL;
// skip if word #i does not start a phrase
if ( ! pids [i] ) continue;
// if phrase score is less than 100% do not consider as a
// phrase so that we do not phrase "albuquerque, NM" and stuff
// like that... in fact, we can only have a space here...
// bounds-guard like hashString_ct() below
if ( i+1 < nw ) {
	if ( wptrs[i+1][0] == ',' ) continue;
	if ( wlens[i+1]>=2 && wptrs[i+1][1] == ',' ) continue;
	if ( wlens[i+1]>=3 && wptrs[i+1][2] == ',' ) continue;
}
// put it in, accumulate, max score is 0x7fffffff
if ( ! ct->addTerm ( &pids[i] ) ) return (HashTableX *)NULL;
}
// now add each meta tag to the pot
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// skip if not a meta tag
if ( tids[i] != 68 ) continue;
// find the "content=" word
char *w = wptrs[i];
int32_t wlen = wlens[i];
char *wend = w + wlen;
char *p ;
p = strncasestr (w,wlen,"content=");
// skip if we did not have any content in this meta tag
if ( ! p ) continue;
// skip the "content="
p += 8;
// skip if empty meta content
if ( wend - p <= 0 ) continue;
// now hash the meta content ourselves
if ( ! hashString_ct ( ct , p , wend - p ) )
return (HashTableX *)NULL;
}
// add each incoming link text
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL ( m_niceness );
// shortcuts
char *p;
int32_t plen;
// hash link text (was hashPwids())
p = k-> getLinkText();
plen = k->size_linkText - 1;
if ( ! verifyUtf8 ( p , plen ) ) {
log("xmldoc: bad link text 3 from url=%s for %s",
k->getUrl(),m_firstUrl.m_url);
continue;
}
if ( ! hashString_ct ( ct , p , plen ) )
return (HashTableX *)NULL;
// hash this stuff (was hashPwids())
p = k->getSurroundingText();
plen = k->size_surroundingText - 1;
if ( ! hashString_ct ( ct , p , plen ) )
return (HashTableX *)NULL;
}
// we got it
m_countTableValid = true;
return &m_countTable;
}
// . a special function used by XmlDoc::getCountTable() above
// . kinda similar to XmlDoc::hashString()
bool XmlDoc::hashString_ct ( HashTableX *ct , char *s , int32_t slen ) {
Words words;
Bits bits;
Phrases phrases;
if ( ! words.set ( s , slen , m_version , true , m_niceness ) )
return false;
if ( ! bits.set ( &words , m_version , m_niceness ) )
return false;
if ( ! phrases.set(&words,&bits,true,false,m_version,m_niceness))
return false;
int32_t nw = words.getNumWords();
int64_t *wids = words.getWordIds();
int64_t *pids = phrases.m_phraseIds2;
char **wptrs = words.m_words;
int32_t *wlens = words.m_wordLens;
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// add the word
if ( wids[i] == 0LL ) continue;
// skip if in repeated fragment
// . NO, we do not use this for these short strings
//if ( ww[i] == 0 ) continue;
// accumulate the wid with a score of 1 each time it occurs
if ( ! ct->addTerm ( &wids[i] ) ) return false;
// skip if word #i does not start a phrase
if ( ! pids [i] ) continue;
// if phrase score is less than 100% do not consider as a
// phrase so that we do not phrase "albuquerque, NM" and stuff
// like that... in fact, we can only have a space here...
if ( i+1<nw ) {
if ( wptrs[i+1][0] == ',' ) continue;
if ( wlens[i+1]>=2 && wptrs[i+1][1] == ',' ) continue;
if ( wlens[i+1]>=3 && wptrs[i+1][2] == ',' ) continue;
}
// put it in, accumulate, max score is 0x7fffffff
if ( ! ct->addTerm ( &pids[i] ) ) return false;
}
return true;
}
uint8_t *XmlDoc::getSummaryLangId ( ) {
// return if we got it already
if ( m_summaryLangIdValid ) return &m_summaryLangId;
Summary *s = getSummary();
if ( ! s || s == (void *)-1 ) return (uint8_t *)s;
char *sum = s->getSummary();
// now set the words class
Words ww;
if ( ! ww.set9 ( sum , m_niceness ) ) return NULL;
// check it out. 0 means langUnknown. -1 means error.
int32_t ret = ww.getLanguage ( NULL , 100 , m_niceness , NULL );
// -1 means error! g_errno should be set
if ( ret < 0 ) return NULL;
// set it
m_summaryLangId = (uint8_t)ret;
// assume valid
m_summaryLangIdValid = true;
// return it
return &m_summaryLangId;
}
int cmp ( const void *h1 , const void *h2 ) ;
// vector components are 32-bit hashes
int32_t *XmlDoc::getTagPairHashVector ( ) {
if ( m_tagPairHashVecValid ) return m_tagPairHashVec;
Xml *xml = getXml ();
if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml;
// store the hashes here
uint32_t hashes [ 2000 ];
int32_t nh = 0;
// go through each node
XmlNode *nodes = xml->getNodes ();
int32_t n = xml->getNumNodes ();
// start with the ith node
int32_t i = 0;
uint32_t saved = 0;
uint32_t lastHash = 0;
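// Illustrative example (hypothetical tag sequence): for nodes
// <html><body><p> with per-tag hashes h1,h2,h3 the loop below stores
// h2^h1 and h3^h2, i.e. one hash per adjacent tag pair, which is what
// makes the vector template-like.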
// loop over the nodes
for ( ; i < n ; i++ ) {
// breathe a little
QUICKPOLL ( m_niceness );
// skip NON tags
if ( ! nodes[i].isTag() ) continue;
// use the tag id as the hash, it's unique
uint32_t h = hash32h ( nodes[i].getNodeId() , 0 );
// ensure hash is not 0, that has special meaning
if ( h == 0 ) h = 1;
// store in case we have only one hash
saved = h;
// if we are the first, set this
if ( ! lastHash ) {
lastHash = h;
continue;
}
// if they were the same do not xor, they will zero out
if ( h == lastHash ) hashes[nh++] = h;
// incorporate it into the last hash
else hashes[nh++] = h ^ lastHash;
// we are the new last hash
lastHash = h;
// bust out if no room
if ( nh >= 2000 ) break;
}
// if only had one tag after, use that
if ( nh == 0 && saved ) hashes[nh++] = saved;
// breathe
QUICKPOLL ( m_niceness ) ;
// . TODO: remove the link text hashes here?
// . because will probably be identical..
// . now sort hashes to get the top MAX_PAIR_HASHES
gbsort ( hashes , nh , 4 , cmp );
// breathe
QUICKPOLL ( m_niceness ) ;
// uniquify them
int32_t d = 0;
for ( int32_t j = 1 ; j < nh ; j++ ) {
if ( hashes[j] == hashes[d] ) continue;
hashes[++d] = hashes[j];
}
// breathe
QUICKPOLL ( m_niceness ) ;
// how many do we got? (d indexes the last unique hash kept above)
if ( nh > 0 ) nh = d + 1;
// truncate to MAX_PAIR_HASHES MINUS 1 so we can put a 0 at the end
if ( nh > MAX_TAG_PAIR_HASHES-1 ) nh = MAX_TAG_PAIR_HASHES-1;
// store the top MAX_PAIR_HASHES
gbmemcpy ( m_tagPairHashVec , hashes , nh * 4 );
// null term it. all vectors need this so computeSimilarity() works
m_tagPairHashVec [ nh++ ] = 0;
m_tagPairHashVecValid = true;
m_tagPairHashVecSize = nh * 4;
return m_tagPairHashVec;
}
// sort in descending order
int cmp ( const void *h1 , const void *h2 ) {
	// compare explicitly rather than subtracting, since subtracting
	// two uint32_t hashes can overflow an int and flip the sign
	uint32_t a = *(uint32_t *)h1;
	uint32_t b = *(uint32_t *)h2;
	if ( b > a ) return  1;
	if ( b < a ) return -1;
	return 0;
}
// . m_tagVector.setTagPairHashes(&m_xml, niceness);
// . Sections.cpp and getIsDup() both use this hash
// . returns NULL and sets g_errno on error
// . xors all the unique adjacent tag hashes together
// . kind of represents the template the web pages uses
// . we add this to sectiondb as a vote in Sections::addVotes()
uint32_t *XmlDoc::getTagPairHash32 ( ) {
// only compute once
if ( m_tagPairHash32Valid ) return &m_tagPairHash32;
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (uint32_t *)words;
// shortcuts
//int64_t *wids = words->getWordIds ();
nodeid_t *tids = words->getTagIds ();
int32_t nw = words->getNumWords ();
int32_t nt = words->m_numTags;
// . get the hash of all the tag pair hashes!
// . we then combine that with our site hash to get our site specific
// html template termid
// . put all tag pairs into a hash table
// . similar to Vector::setTagPairHashes() but we do not compute a
// vector, just a single scalar/hash of 32 bits, m_termId
HashTableX tp; // T<int64_t,char> tp;
if ( ! tp.set ( 4 , 1 , nt * 4 , NULL , 0 , true,m_niceness,"xmltp"))
return NULL;
uint32_t lastTid = 0;
char val = 1;
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if not tag
if ( tids[i] == 0LL ) continue;
// skip if back tag
if ( tids[i] & BACKBIT ) continue;
// get last tid
uint32_t h = hash32h ( tids[i] , lastTid );
//logf(LOG_DEBUG,"build: tph %"INT32" h=%"UINT64"",i,(int64_t)h);
// . add to table (skip if 0, means empty bucket)
// . return NULL and set g_errno on error
if ( h && ! tp.addKey ( &h , &val ) ) return NULL;
// update this
lastTid = h;
}
// linear scan on hash table to get all the hash, XOR together
uint32_t hx = 0;
int32_t nb = tp.getNumSlots();
char *flags = tp.m_flags;
// get keys
uint32_t *keys = (uint32_t *)tp.m_keys;
for ( int32_t i = 0 ; i < nb ; i++ ) {
// skip if empty
if ( flags[i] == 0 ) continue;
// skip if empty
//if ( keys[i] == 0LL ) continue;
// incorporate
hx ^= keys[i];
}
// never return 0, make it 1. 0 means an error
if ( hx == 0 ) hx = 1;
// set the hash
m_tagPairHash32 = hx ;
// it is now valid
m_tagPairHash32Valid = true;
return &m_tagPairHash32;
}
// . used for deduping search results
// . also uses the title
int32_t *XmlDoc::getSummaryVector ( ) {
if ( m_summaryVecValid ) return (int32_t *)m_summaryVec;
Summary *s = getSummary();
if ( ! s || s == (Summary *)-1 ) return (int32_t *)s;
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (int32_t *)ti;
// store title and summary into "buf" so we can call words.set()
//char buf[5000];
SafeBuf sb;
//char *p = buf;
//int32_t avail = 5000;
//int32_t len;
// put title into there
int32_t tlen = ti->m_titleBytes - 1;
//if ( len > avail ) len = avail - 10;
if ( tlen < 0 ) tlen = 0;
// put summary into there
int32_t slen = s->m_summaryLen;
// allocate space
int32_t need = tlen + 1 + slen + 1;
if ( ! sb.reserve ( need ) ) return NULL;
//gbmemcpy ( p , ti->m_title , len );
//p += len;
sb.safeMemcpy ( ti->m_title , tlen );
// space separating the title from summary
if ( tlen > 0 ) sb.pushChar(' ');
//if ( len > avail ) len = avail - 10;
//gbmemcpy ( p , s->m_summary , len );
//p += len;
sb.safeMemcpy ( s->m_summary , slen );
// null terminate it
//*p = '\0';
sb.nullTerm();
// word-ify it
Words words;
if ( ! words.set9 ( sb.getBufStart() , m_niceness ) ) return NULL;
// . now set the dedup vector from big summary and title
// . store sample vector in here
// . returns size in bytes including null terminating int32_t
m_summaryVecSize = computeVector ( NULL , &words ,
(uint32_t *)m_summaryVec );
m_summaryVecValid = true;
return m_summaryVec;
}
bool getWordVector ( char *s ,
HashTableX *ht ,
uint32_t *d ,
int32_t *nd ,
int32_t ndmax ) {
// utf8 char size
char size;
// grab each word and hash it
for ( ; *s ; s += size ) {
// get size
size = getUtf8CharSize(s);
// skip if tag
if ( *s == '<' ) {
while ( *s && *s!='>' )
s += getUtf8CharSize(s);
continue;
}
// skip if other type of punct
if ( ! is_alnum_utf8(s) ) continue;
// ok, we got a word then
char *start = s;
// see how long the word is
for ( ; *s && is_alnum_utf8(s);s+=getUtf8CharSize(s));
// get wordid, a simple hash, just like Words.cpp does
uint64_t h = hash64Lower_utf8(start,s - start);
// do not inc this time
size = 0;
// breathe
//QUICKPOLL ( m_niceness );
// make 32 bit
uint32_t wid32 = (uint32_t)h;
//
// TODO: ignore if it is a day name or month name or
// number because those are like dates
//
if ( ht ) {
// do not add if we already got it
if ( ht->getSlot ( &wid32 ) >= 0 ) continue;
// add to hash table. return NULL and set g_errno onerr
if ( ! ht->addKey (&wid32 )) return false;
}
// add it to our vector
d[*nd] = (uint32_t)wid32;
// inc it
*nd = *nd + 1;
// stop after 3000 for sure
if ( *nd >= ndmax ) return true;
}
return true;
}
// used by getIsDup() and Dates.cpp for detecting dups and for
// seeing if the content changed respectively
int32_t *XmlDoc::getPageSampleVector ( ) {
if ( m_pageSampleVecValid ) return m_pageSampleVec;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww;
Sections *ss = NULL;
//if ( m_eliminateMenus ) {
//ss = getSections();
//if ( ! ss || ss == (Sections *)-1) return (int32_t *)ss;
//}
m_pageSampleVecSize = computeVector ( ss, ww,
(uint32_t *)m_pageSampleVec );
m_pageSampleVecValid = true;
return m_pageSampleVec;
}
// . this is the vector of the words right after the hypertext for the link
// we are voting on.
// . it is used to dedup voters in Msg25.cpp
int32_t *XmlDoc::getPostLinkTextVector ( int32_t linkNode ) {
if ( m_postVecValid ) return m_postVec;
// assume none
m_postVecSize = 0;
// set up
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww;
// sanity check
if ( linkNode < 0 ) { char *xx=NULL;*xx=0; }
// linkNode starts pointing to a <a> tag so skip over that!
linkNode++;
// limit
int32_t nn = xml->getNumNodes();
XmlNode *nodes = xml->getNodes();
// and advance i to the next anchor tag thereafter, we do not
// want to include link text in this vector because it is usually
// repeated and will skew our "similarities"
for ( ; linkNode < nn ; linkNode++ ) {
// stop if we hit </a> or <a>
if ( (nodes[linkNode].m_nodeId & BACKBITCOMP) != 2 ) continue;
// advance over the </a> or <a>
linkNode++;
// then stop, we will start gathering link text here
break;
}
// if we hit end of the doc, we got not vector then
if ( linkNode >= nn ) return m_postVec;
// now convert the linkNode # to a word #, "start"
int32_t nw = ww->getNumWords ();
int64_t *wids = ww->getWordIds ();
nodeid_t *tids = ww->getTagIds ();
int32_t *wn = ww->m_nodes;
int32_t i = 0;
for ( ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// stop when we got the first word in this node #
if ( wn[i] == linkNode ) break;
}
// if none, bail now, size is 0
if ( i >= nw ) return m_postVec;
// save that
int32_t start = i;
// likewise, set the end of it
int32_t end = nw;
// count alnum words
int32_t count = 0;
// limit it
for ( i = start ; i < nw && count < 35 ; i++ ) {
// get tag id
nodeid_t tid = tids[i] & BACKBITCOMP;
// stop if certain ones
if ( tid == TAG_TABLE ) break;
if ( tid == TAG_UL ) break;
// <a>, </a> is ok
if ( tids[i] == TAG_A ) break;
// only up to 35 words allowed in the hash
if ( wids[i] ) count++;
}
// set the end of the words to hash
end = i;
// specify starting node # now
m_postVecSize = computeVector(NULL,ww,(uint32_t *)m_postVec,start,end);
// return what we got
return m_postVec;
}
// . was kinda like "m_tagVector.setTagPairHashes(&m_xml, niceness);"
// . this is used by getIsDup() (below)
// . this is used by Dates.cpp to see how much a doc has changed
// . this is also now used for getting the title/summary vector for deduping
// search results
// . if we couldn't extract a good pub date for the doc, and it has changed
// since last spidered, use the bisection method to come up with our own
// "last modified date" which we use as the pub date.
// . this replaces the clusterdb.getSimilarity() logic in Msg14.cpp used
// to do the same thing. but we call Vector::setForDates() from
// Dates.cpp. that way the logic is more contained in Dates!
// . doesn't Msg14 already do that?
// . yes, but it uses two TermTables and calls Clusterdb::getSimilarity()
// . returns false and sets g_errno on error
// . these words classes should have been set by a call to Words::set(Xml *...)
// so that we have "tids1" and "tids2"
// . returns NULL and sets g_errno on error
// . TODO: if our title rec is non-empty consider getting it from that
// . we use this vector to compare two docs to see how similar they are
int32_t XmlDoc::computeVector ( Sections *sections, Words *words, uint32_t *vec ,
int32_t start , int32_t end ) {
// assume empty vector
vec[0] = 0;
// skip if no article section. then we have no vector.
if ( sections && ! sections->m_hadArticle ) return 0;
// shortcuts
int32_t nw = words->getNumWords();
//int32_t nt = words->m_numTags;
int64_t *wids = words->getWordIds();
// set the end to the real end if it was specified as less than zero
if ( end < 0 ) end = nw;
// # of alnum words, about... minus the tags, then the punct words
// are half of what remains...
int32_t count = words->m_numAlnumWords;
// if we got sections, how many good words?
if ( sections ) count = sections->m_numAlnumWordsInArticle;
// google seems to index SEC_MARQUEE so i took that out
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
// these Section ptrs are 1-1 with the words
Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs;
// . Get sample vector from content section only.
// . This helps remove duplicate menu/ad from vector
// 4 bytes per hash, save the last one for a NULL terminator, 0 hash
int32_t maxTerms = SAMPLE_VECTOR_SIZE / 4 - 1;
// what portion of them do we want to mask out from the rest?
int32_t ratio = count / maxTerms ;
// a mask of 0 means to get them all
unsigned char mask = 0x00;
// if we got twice as many terms as we need, then set mask to 0x01
// to filter out half of them! but actually, let's aim for twice
// as many as we need to ensure we really get as many as we need.
// so if we got 4 or more than we need then cut in half...
while ( ratio >= 4 ) {
// shift the mask down, ensure hi bit is set
mask >>= 1;
mask |= 0x80;
ratio >>= 1; // /2
}
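// Worked example (assuming maxTerms is 31): with count = 4000 alnum
// words the initial ratio is 129, so the loop above sets one high bit
// per halving: 0x80,0xC0,0xE0,0xF0,0xF8,0xFC and stops at ratio = 2.
// A 6-bit mask passes ~1/64 of the word ids, i.e. about 62 of the
// 4000, roughly twice maxTerms as intended.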
// store vector into "d" for now. will sort below
uint32_t d [ 3000 ];
// dedup our vector using this hashtable, "ht"
char hbuf[3000*6*2];
HashTableX ht;
if ( ! ht.set(4,0,3000,hbuf,3000*6*2,false,m_niceness,"xmlvecdedup")){
char*xx=NULL;*xx=0;}
again:
// a buffer to hold the top termIds
int32_t nd = 0;
// count how many we mask out
int32_t mo = 0;
// . buffer should have at least "maxTerms" in it
// . these should all be 12 byte keys
for ( int32_t i = start ; i < end ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// skip if not alnum word
if ( wids[i] == 0 ) continue;
// skip if mask filters it
if ( ((wids[i]>>(NUMTERMIDBITS-8)) & mask)!=0) {mo++;continue;}
// skip if in select, style, script or marquee tag section
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
// make 32 bit
uint32_t wid32 = (uint32_t)wids[i];
// do not add if we already got it
if ( ht.getSlot ( &wid32 ) >= 0 ) continue;
// add to hash table. return NULL and set g_errno on error
if ( ! ht.addKey (&wid32 )){char*xx=NULL;*xx=0; }
// add it to our vector
d[nd] = (uint32_t)wids[i];
// stop after 3000 for sure
if ( ++nd < 3000 ) continue;
// bitch and break out on error
log(LOG_INFO,"build: Sample vector overflow. Slight "
"performance hit.");
break;
}
// . if nd was too small, don't use a mask to save time
// . well just make the mask less restrictive
if ( nd < maxTerms && mask && mo ) {
// shift the mask UP, allow more termIds to pass through
mask <<= 1;
// reset hash table since we are starting over
ht.clear();
goto again;
}
// bubble sort them
bool flag = true;
while ( flag ) {
// breathe
QUICKPOLL ( m_niceness );
flag = false;
for ( int32_t i = 1 ; i < nd ; i++ ) {
if ( d[i-1] <= d[i] ) continue;
uint32_t tmp = d[i-1];
d[i-1] = d[i];
d[i] = tmp;
flag = true;
}
}
// truncate
if ( nd > maxTerms ) nd = maxTerms;
// null terminate
d [ nd++ ] = 0;
// store in our sample vector
gbmemcpy ( vec , d , nd * 4 );
// return size in bytes
return nd * 4;
}
float *XmlDoc::getTagSimilarity ( XmlDoc *xd2 ) {
int32_t *tv1 = getTagPairHashVector();
if ( ! tv1 || tv1 == (int32_t *)-1 ) return (float *)tv1;
int32_t *tv2 = xd2->getTagPairHashVector();
if ( ! tv2 || tv2 == (int32_t *)-1 ) return (float *)tv2;
m_tagSimilarity = computeSimilarity ( tv1, tv2, NULL, NULL, NULL ,
m_niceness );
// this means error, g_errno should be set
if ( m_tagSimilarity == -1.0 ) return NULL;
return &m_tagSimilarity;
}
float *XmlDoc::getGigabitSimilarity ( XmlDoc *xd2 ) {
int32_t **gv1 = getGigabitHashes();
if ( ! gv1 || gv1 == (int32_t **)-1 ) return (float *)gv1;
int32_t **gv2 = xd2->getGigabitHashes();
if ( ! gv2 || gv2 == (int32_t **)-1 ) return (float *)gv2;
// *gv1 could be NULL if vec was empty in titlerec's ptr_gigabitHashes
m_gigabitSimilarity = computeSimilarity ( *gv1, *gv2, NULL, NULL, NULL,
m_niceness );
// this means error, g_errno should be set
if ( m_gigabitSimilarity == -1.0 ) return NULL;
return &m_gigabitSimilarity;
}
float *XmlDoc::getPageSimilarity ( XmlDoc *xd2 ) {
int32_t *sv1 = getPageSampleVector();
if ( ! sv1 || sv1 == (int32_t *)-1 ) return (float *)sv1;
int32_t *sv2 = xd2->getPageSampleVector();
if ( ! sv2 || sv2 == (int32_t *)-1 ) return (float *)sv2;
m_pageSimilarity = computeSimilarity ( sv1, sv2, NULL, NULL, NULL,
m_niceness );
// this means error, g_errno should be set
if ( m_pageSimilarity == -1.0 ) return NULL;
return &m_pageSimilarity;
}
// . compare old page vector with new
// . returns ptr to a float from 0.0 to 100.0
float *XmlDoc::getPercentChanged ( ) {
// if we got it
if ( m_percentChangedValid ) return &m_percentChanged;
// get the old doc
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (float *)od;
// if empty, assume 0% changed
if ( ! *od ) {
m_percentChanged = 0;
m_percentChangedValid = true;
return &m_percentChanged;
}
// get its page c
float *ps = getPageSimilarity ( *od );
if ( ! ps || ps == (float *)-1 ) return (float *)ps;
// got it
m_percentChanged = *ps;
m_percentChangedValid = true;
// just return it
return &m_percentChanged;
}
// . Address.cpp converts a place name into a vector for comparing via a
// call to computeSimilarity() below
// . returns -1 and set g_errno on error
// . "vbufSize" is in BYTES!
// . returns length of word vector in int32_ts (# components stored)
int32_t makeSimpleWordVector (char *s,int32_t *vbuf,int32_t vbufSize,int32_t niceness ) {
// nonsense?
if ( vbufSize < 4 ) { char *xx=NULL;*xx=0; }
// empty it
*vbuf = 0;
// no words, no vector
if ( ! s ) return 0;
// set them
Words w;
// return -1 with g_errno set on error
if ( ! w.set9 ( s , niceness ) ) return -1;
// skip if no words
if ( w.m_numWords == 0 ) return 0;
// shortcut
int64_t *wids = w.m_wordIds;
int64_t pid = 0LL;
// count insertions
int32_t count = 0;
// ptr
int32_t *vbufPtr = vbuf;
int32_t *vbufEnd = vbuf + vbufSize/4;
// put words into a vector
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
// skip if not alnum word
if ( ! wids[i] ) continue;
// if no room stop. need room for NULL terminator
if ( vbufPtr + 2 >= vbufEnd ) return count;
// put it in
//*vbufPtr = (int32_t)wids[i];
// . use the synonym instead if it had one
// . maps "theatre" to "theater", "4th" to "fourth", etc.
// . false = is street name?
int64_t *p = getSynonymWord ( &wids[i] , &pid , false );
// set this
pid = wids[i];
//int64_t *p = (int64_t *)synTable->getValue64( wids[i] );
// 0 means to ignore it
if ( *p == 0LL ) continue;
// otherwise add into our vector
*vbufPtr = *p;
// advance
vbufPtr++;
// NULL termination
*vbufPtr = 0;
// count it
count++;
}
// all done
return count;
}
// . compare two vectors
// . components in vectors are int32_ts
// . last component is a zero, to mark EOV = end of vector
// . discount any termIds that are in the query vector, qvec, which may be NULL
// . returns -1 and sets g_errno on error
// . vector components are 32-bit hashes of the words (hash32())???
// i would say they should be the lower 32 bits of the 64-bit hashes!
// . replaces:
// g_clusterdb.getGigabitSimilarity()
// m_tagVec->getLinkBrotherProbability()
// g_clusterdb.getSampleSimilarity()
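// . a tiny worked example (all scores 1, no query): vec0={A,B,C} and
//   vec1={B,C,D} give totalScore = 6; the two matches B and C each add
//   1 for the vec1 side plus the 1 stored for the vec0 side, so
//   matchScore = 4 and the returned similarity is 100*4/6 = ~66.7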
float computeSimilarity ( int32_t *vec0 ,
int32_t *vec1 ,
int32_t *s0 , // corresponding scores vector
int32_t *s1 , // corresponding scores vector
Query *q ,
int32_t niceness ,
bool dedupVectors ) {
static int32_t s_tmp = 0;
if ( ! vec0 ) vec0 = &s_tmp;
if ( ! vec1 ) vec1 = &s_tmp;
// if both empty, assume not similar at all
if ( *vec0 == 0 && *vec1 == 0 ) return 0;
// if either is empty, return 0 to be on the safe side
if ( *vec0 == 0 ) return 0;
if ( *vec1 == 0 ) return 0;
// flag if from query vector
HashTableX qt;
char qbuf[5000];
if ( q ) {
// init hash table
if ( ! qt.set ( 4,0,512,qbuf,5000,false,niceness,"xmlqvtbl") )
return -1;
// . stock the query term hash table
// . use the lower 32 bits of the termids to make compatible
// with the other vectors we use
//int64_t *qtids = q->getTermIds ();
int32_t nt = q->getNumTerms();
for ( int32_t i = 0 ; i < nt ; i++ ) {
// get query term
QueryTerm *QT = &q->m_qterms[i];
// get the termid
int64_t termId = QT->m_termId;
// get it
uint32_t h = (uint32_t)(termId & 0xffffffff);
// hash it
if ( ! qt.addKey ( &h ) ) return -1;
}
}
// if we ignore cardinality then it only matters if both vectors
// have a particular value, and not how many times they each have it.
// so we essentially dedup each vector if dedupVectors is true.
// but we do total up the score and put it behind the one unique
// occurrence though. we do this only for
// Sections::addDateBasedImpliedSections() right now
bool allowDups = true;
if ( dedupVectors ) allowDups = false;
HashTableX ht;
char hbuf[10000];
if ( ! ht.set ( 4,4,-1,hbuf,10000,allowDups,niceness,"xmlqvtbl2"))
return -1;
bool useScores = (bool)s0;
int32_t matches = 0;
int32_t total = 0;
int32_t matchScore = 0;
int32_t totalScore = 0;
// hash first vector. accumulating score total and total count
for ( int32_t *p = vec0; *p ; p++ , s0++ ) {
// breathe
QUICKPOLL(niceness);
// skip if matches a query term
if ( q && qt.getSlot ( p ) ) continue;
// count it
total++;
// get it
int32_t score = 1;
// get the score if valid
if ( useScores ) score = *s0;
// total it up
totalScore += score;
// add it
if ( dedupVectors ) {
// accumulate all the scores into this one bucket
// in the case of p being a dup
if ( ! ht.addTerm32 ( p , score ) ) return -1;
}
else {
// otherwise, add each into its own bucket since
// ht.m_allowDups should be true
if ( ! ht.addKey ( p , &score ) ) return -1;
}
}
int32_t zero = 0;
// see what components of this vector match
for ( int32_t *p = vec1; *p ; p++ , s1++ ) {
// breathe
QUICKPOLL(niceness);
// skip if matches a query term
if ( q && qt.getSlot ( p ) ) continue;
// count it
total++;
// get it
int32_t score = 1;
// get the score if valid
if ( useScores ) score = *s1;
// and total scores
totalScore += score;
// is it in there?
int32_t slot = ht.getSlot ( p );
// skip if unmatched
if ( slot < 0 ) continue;
// otherwise, it is a match!
matches++;
// and scores
matchScore += score;
// and score of what we matched
uint32_t *val = (uint32_t *)ht.getValueFromSlot ( slot );
// he is hit too
matchScore += *val;
// remove it as we match it to deal with dups
if ( allowDups ) {
// once we match it once, do not match again, score was
// already accumulated
ht.setValue ( slot , &zero );
}
else {
// otherwise, remove this dup and try to match any
// remaining dups in the table
ht.removeSlot ( slot );
}
}
// if after subtracting query terms we got no hits, return 0
if ( useScores && totalScore == 0 ) return 0;
if ( total == 0 ) return 0;
// . what is the max possible score we coulda had?
// . subtract the vector components that matched a query term
float percent = 100 * (float)matchScore / (float)totalScore;
//if ( useScores)percent = 100 * (float)matchScore / (float)totalScore;
//else percent = 100 * (float)matches / (float)total;
// sanity
//if ( percent > 100 ) percent = 100;
if ( percent > 100 ) { char *xx=NULL;*xx=0; }
return percent;
}
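// Worked example (illustration only, hypothetical values): with
// dedupVectors=false and no query, let vec0 = {A,B,C} carry scores {3,1,2}
// and vec1 = {B,C,D} carry scores {2,2,1}. The two loops above give
// totalScore = (3+1+2) + (2+2+1) = 11. B and C match, and each match adds
// its own score plus the score stored for the other vector's copy, so
// matchScore = (2+1) + (2+2) = 7. The returned similarity is then
// 100 * 7 / 11 = ~63.6%.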
// this returns true if the two vecs are "percentSimilar" or more similar
bool isSimilar_sorted ( int32_t *vec0 ,
int32_t *vec1 ,
int32_t nv0 , // how many int32_ts in vec?
int32_t nv1 , // how many int32_ts in vec?
// they must be this similar or more to return true
int32_t percentSimilar,
int32_t niceness ) {
// if both empty, assume not similar at all
if ( *vec0 == 0 && *vec1 == 0 ) return false;
// if either is empty, return false to be on the safe side
if ( *vec0 == 0 ) return false;
if ( *vec1 == 0 ) return false;
// do not include last 0
nv0--;
nv1--;
int32_t total = nv0 + nv1;
// so if the "noMatched" count ever EXCEEDS (not equals) this
// "brink" we can bail early because there's no chance of getting
// the similarity "percentSimilar" provided. should save some time.
int32_t brink = ((100-percentSimilar) * total) / 100;
// scan each like doing a merge
int32_t *p0 = vec0;
int32_t *p1 = vec1;
int32_t yesMatched = 0;
int32_t noMatched = 0;
mergeLoop:
// stop if both exhausted. we didn't bail on brink, so it's a match
if ( *p0 == 0 && *p1 == 0 )
return true;
if ( *p0 < *p1 || *p1 == 0 ) {
p0++;
if ( ++noMatched > brink ) return false;
goto mergeLoop;
}
if ( *p1 < *p0 || *p0 == 0 ) {
p1++;
if ( ++noMatched > brink ) return false;
goto mergeLoop;
}
yesMatched += 2;
p1++;
p0++;
goto mergeLoop;
}
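// Worked example (illustration only): with percentSimilar=80 and two
// vectors of 10 components each (after dropping the terminating zeros),
// total = 20 and brink = ((100-80) * 20) / 100 = 4. The merge above
// returns false as soon as a 5th unmatched component is seen, since at
// that point the vectors can no longer be 80% similar.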
uint64_t *XmlDoc::getFuzzyDupHash ( ) {
if ( m_dupHashValid ) return &m_dupHash;
uint32_t *h1 = getTagPairHash32();
if ( ! h1 || h1 == (uint32_t *)-1 ) return (uint64_t *)h1;
uint32_t *h2 = getGigabitVectorScorelessHash ( ) ;
if ( ! h2 || h2 == (uint32_t *)-1 ) return (uint64_t *)h2;
//uint64_t h2b = (uint64_t)*h2;
m_dupHash = hash64 ( (uint64_t)*h1 , (uint64_t)*h2 );
m_dupHashValid = true;
return &m_dupHash;
}
int64_t *XmlDoc::getExactContentHash64 ( ) {
if ( m_exactContentHash64Valid )
return &m_exactContentHash64;
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1) return (int64_t *)u8;
// if (m_docId==88581116800LL)
// log("got article1 diffbot");
// if (m_docId==201689682865LL)
// log("got article11 diffbot");
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if we are diffbot, then do not quite do an exact content hash.
// there is a "url:" field in the json that changes. so we have
// to exclude that field. otherwise getDupList() spider time dedup
// detection will fail the TestDuplicateContent.testDuplicate smoketest
if ( cr->m_isCustomCrawl == 1 && m_isDiffbotJSONObject ) {
int32_t *ch32 = getContentHashJson32();
if ( ! ch32 || ch32 == (void *)-1 ) return (int64_t *)ch32;
m_exactContentHash64Valid = true;
m_exactContentHash64 = (uint64_t)(uint32_t)*ch32;
return &m_exactContentHash64;
}
unsigned char *p = (unsigned char *)*u8;
int32_t plen = size_utf8Content;
if ( plen > 0 ) plen--;
// if we zeroed out this doc to save disk space, then we only
// record the exact 64-bit hash, so extract it here so that
// we can delete the gbcontenthash: term from the index if we are
// deleting this doc or updating it with a fresh copy.
if ( plen < 100 && p && plen > 12 &&
strncmp((char *)p,"gbzeroedout:",12) == 0 ) {
sscanf((char *)p+12,"%"UINT64,&m_exactContentHash64);
m_exactContentHash64Valid = true;
return &m_exactContentHash64;
}
// sanity
//if ( ! p ) return 0LL;
//if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; }
unsigned char *pend = (unsigned char *)p + plen;
uint64_t h64 = 0LL;
unsigned char pos = 0;
bool lastWasSpace = true;
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( m_niceness );
// treat sequences of white space as a single ' ' (space)
if ( is_wspace_a(*p) ) {
if ( lastWasSpace ) continue;
lastWasSpace = true;
// treat all white space as a space
h64 ^= g_hashtab[pos][(unsigned char)' '];
pos++;
continue;
}
lastWasSpace = false;
// xor this in right
h64 ^= g_hashtab[pos][p[0]];
pos++;
}
m_exactContentHash64Valid = true;
m_exactContentHash64 = h64;
return &m_exactContentHash64;
}
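// Illustrative note (not part of the original code): since runs of white
// space are hashed as a single ' ' above, a body of "foo   bar\n" and a
// body of "foo bar " produce the same 64-bit hash, so copies differing
// only in whitespace formatting dedup against each other at spider time.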
RdbList *XmlDoc::getDupList ( ) {
if ( m_dupListValid ) return &m_dupList;
// until we start using posdb and not indexdb, just return an
// empty list.
// TODO: MDW fix the deduping.
//m_dupList.reset();
//m_dupListValid = true;
//return &m_dupList;
//
// end temp hack
//
//uint64_t *dh = getDupHash ( );
//if ( ! dh || dh == (uint64_t *)-1 ) return (IndexList *)dh;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
int64_t *ph64 = getExactContentHash64();
//int64_t *ph64 = getLooseContentHash64();
if ( ! ph64 || ph64 == (void *)-1 ) return (RdbList *)ph64;
// must match term in XmlDoc::hashVectors()
char qbuf[256];
snprintf(qbuf, 256, "%"UINT64"",*ph64);
int64_t pre = hash64b ( "gbcontenthash" , 0LL );
int64_t rawHash = hash64b ( qbuf , 0LL );
int64_t termId = hash64 ( rawHash , pre );
// get the startkey, endkey for termlist
key144_t sk ;
key144_t ek ;
g_posdb.makeStartKey ( &sk,termId ,0);
g_posdb.makeEndKey ( &ek,termId ,MAX_DOCID);
// note it
log(LOG_DEBUG,"build: check termid=%"UINT64" for docid %"UINT64""
,(uint64_t)(termId&TERMID_MASK)
,m_docId);
// assume valid now
m_dupListValid = true;
// this is a no-split lookup by default now
if ( ! m_msg0.getList ( -1 , // hostId
0 , // ip
0 , // port
0 , // maxCacheAge
false , // add to cache?
RDB_POSDB, // INDEXDB ,
cr->m_collnum,
&m_dupList ,
(char *)&sk ,
(char *)&ek ,
606006 , // minRecSizes in bytes
m_masterState , // state
m_masterLoop ,
m_niceness ,
true , // error correction?
true , // include tree?
true , // domerge?
-1 , // firsthosti
0 , // startfilenum
-1, // # files
// never timeout when spidering in case
// a host is down.
9999977 , // timeout
-1 , // syncpoint
-1 , // preferlocal reads
NULL, // msg5
NULL, // msg5b
false , // isRealMerge
true , // allow page cache
false , // forcelocalindexdb
true ) ) // shardByTermId? THIS IS DIFFERENT!!!
// return -1 if this blocks
return (RdbList *)-1;
// assume valid!
m_dupListValid = true;
return &m_dupList;
}
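// Illustrative note (not part of the original code): the termId built
// above hashes the decimal content hash against the hashed field prefix
// "gbcontenthash", i.e. for a content hash of 123456789 it is presumably
// the same termId a query term like gbcontenthash:123456789 would map to,
// which is why it must match the term hashed in XmlDoc::hashVectors().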
// moved DupDetector.cpp into here...
char *XmlDoc::getIsDup ( ) {
if ( m_isDupValid ) return &m_isDup;
// assume we are not a dup
m_isDup = false;
// get it
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// skip if we should
if ( ! cr->m_dedupingEnabled ||
// bulk jobs never dedup
cr->m_isCustomCrawl == 2 ) {
m_isDupValid = true;
return &m_isDup;
}
// if &links was given in the diffbot api url then do not do
// spider time deduping because the pages are likely rendered using
// javascript, so they'd all seem to be dups of one another.
if ( cr->m_isCustomCrawl ) {
SafeBuf *au = getDiffbotApiUrl();
if ( ! au || au == (void *)-1 ) return (char *)au;
char *linksParm = NULL;
if ( au->length() > 0 )
linksParm = strstr ( au->getBufStart() , "&links");
if ( ! linksParm && au->length() > 0 )
linksParm = strstr ( au->getBufStart() , "?links");
if ( linksParm && linksParm[6] && linksParm[6] != '&' )
linksParm = NULL;
if ( linksParm ) {
m_isDupValid = true;
m_isDup = false;
return &m_isDup;
}
}
// do not dedup seeds
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
if ( cr->m_isCustomCrawl && isSeed ) {
m_isDupValid = true;
m_isDup = false;
return &m_isDup;
}
setStatus ( "checking for dups" );
// BUT if we are already indexed and a crawlbot/bulk diffbot job
// then do not kick us out just because another indexed doc is
// a dup of us, because that messes up the TestOnlyProcessIfNew smoketests.
// in the 2nd round we would end up deleting article1.html after
// indexing it in the first round, then adding article11.html's
// diffbot reply in the 2nd round because article1.html and its
// diffbot reply were deleted, thereby giving it a new timestamp and
// making the smoketest fail.
if ( cr->m_isCustomCrawl ) {
char *isIndexed = getIsIndexed();
if ( ! isIndexed || isIndexed == (char *)-1)
return (char *)isIndexed;
if ( *isIndexed ) {
m_isDupValid = true;
return &m_isDup;
}
}
//we need both vectors to be non-empty
//uint64_t *tv = getTagPairHash();
//if ( ! tv || tv == (uint64_t *)-1) return (char *)tv;
// get our docid
int64_t *mydocid = getDocId();
if ( ! mydocid || mydocid == (int64_t *)-1) return (char *)mydocid;
// get the duplist!
RdbList *list = getDupList();
if ( ! list || list == (RdbList *)-1 ) return (char *)list;
// sanity. must be posdb list.
if ( ! list->isEmpty() && list->m_ks != 18 ) { char *xx=NULL;*xx=0;}
// so getSiteRank() does not core
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
// . see if there are any pages that seem like they are dups of us
// . they must also have a HIGHER score than us, for us to be
// considered the dup
//if ( ! m_didQuickDupCheck ) {
// // do not repeat
// m_didQuickDupCheck = true;
int32_t myRank = getSiteRank ( );
// init
//uint8_t maxScore = 0;
//uint8_t myScore = 0;
//char maxSiteRank = -1;
//int64_t maxDocId = -1LL;
// assume not a dup
m_isDup = false;
// get the docid that we are a dup of
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
// breathe
QUICKPOLL(m_niceness);
//int64_t d = list->getCurrentDocId();
char *rec = list->getCurrentRec();
// get the docid
int64_t d = g_posdb.getDocId ( rec );
// get the score
//uint8_t score = list->getCurrentScore();
// just let the best site rank win i guess?
// even though one page may have more inlinks???
char sr = (char )g_posdb.getSiteRank ( rec );
// skip if us!
//if ( d == *getDocId() ) {
// // record our score
// //myScore = score;
// mySiteRank = sr;
// continue;
//}
// skip if us
if ( d == m_docId ) continue;
// for debug
//if ( d != m_docId )
//log("build: doc %s is dup of docid %"INT64"",
// m_firstUrl.m_url,d);
// if his site rank is >= ours then he was here first and wins,
// so we are the dup i guess...
if ( sr >= myRank ) {
log("build: doc %s is dup of docid %"INT64"",
m_firstUrl.m_url,d);
m_isDup = true;
m_isDupValid = true;
m_docIdWeAreADupOf = d;
return &m_isDup;
}
// get the winner
//if ( score > maxScore ) maxScore = score;
//if ( sr > maxSiteRank || maxSiteRank == -1 ) {
// maxSiteRank = sr;
// maxDocId = d;
// continue;
//}
//if ( sr < maxSiteRank ) continue;
// fallback to docid?
// do it first come first served otherwise i guess
// this will prevent dups from existing in the index at least
// if they have the same siterank...
//if ( d < maxDocId ) {
// maxDocId = d;
// continue;
//}
}
// are we the highest scoring doc with this template?
// corollary: if all dups have equal scores they will be
// removed until there is only one doc that matches the pattern
//if ( myScore >= maxScore ) {
//if ( maxDocId >= 0 && maxDocId != *mydocid && out) {
// m_isDup = true;
// m_isDupValid = true;
// return &m_isDup;
//}
m_isDup = false;
m_isDupValid = true;
return &m_isDup;
/*
we now temporarily at least, do exact dup checking...
later we will bring in the fuzzy code...
// reset its ptr for stuff below
list->resetListPtr();
loop:
// . get a title rec for the current docid
// . but if exhausted, we are not a dup!
if ( list->isExhausted() ) { m_isDupValid = true; return &m_isDup; }
// get the docid
int64_t d = list->getCurrentDocId();
// continue if us!
if ( d == *mydocid ) { list->skipCurrentRecord(); goto loop; }
// is this a dup of us?
char *dup = isDupOfUs ( d );
if ( ! dup || dup == (char *)-1 ) return (char *)dup;
// if dup of us, bail out
if ( *dup ) { m_isDup = true; m_isDupValid = true; return &m_isDup; }
// prepare for next
list->skipCurrentRecord();
// loop up
goto loop;
*/
}
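// Illustrative example (hypothetical values): if our doc has site rank 4
// and the gbcontenthash termlist above contains another docid whose posdb
// key carries site rank 4 or higher, that doc wins and we flag ourselves
// as the dup; if every competing docid has a lower site rank we stay in
// the index.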
char *XmlDoc::isDupOfUs ( int64_t d ) {
// sanity check
if ( d <= 0 ) { char *xx=NULL;*xx=0; }
// get our current title rec
SafeBuf *tr = getTitleRecBuf();
if ( ! tr || tr == (void *)-1 ) return (char *)tr;
// we should not be here if we know we are a dup of another doc
if ( m_isDup ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// get the title rec for this docid if we haven't yet done so
if ( m_calledMsg22d != d ) { // .m_docId != d ) {
bool s;
// note it
setStatus ( "getting possible dup title rec" );
// do not re-call
m_calledMsg22d = d;
// get the guy that might be a dup of us
s = m_msg22d.getTitleRec ( &m_msg22Request ,
NULL ,
d ,
cr->m_coll ,
&m_dupTrPtr ,
&m_dupTrSize ,
false , // just check tfndb?
false , // getAvailDocIdOnly
m_masterState, // state
m_masterLoop , // callback
m_niceness ,
false , // add to cache
60*60*24 , // maxcacheage
999999 );// timeout
// we blocked
if ( ! s ) return (char *)-1;
// error?
if ( g_errno ) return NULL;
}
// if not there do not count as an error
if ( ! m_dupTrPtr ) { g_errno = 0; return &m_isDup; }
// ignore any errors too i guess...
if ( m_msg22d.m_errno ) {
log(LOG_WARN, "build: Dup Detection error with "
"titlerec fetch: %s",mstrerror(m_msg22d.m_errno));
g_errno = 0;
return &m_isDup;
}
// we need to parse this potential dup doc
XmlDoc dd;
// . parse the possible dup title rec into another XmlDoc class
// . it returns false and sets g_errno on error
if ( ! dd.set2 ( m_dupTrPtr ,
m_dupTrSize ,
cr->m_coll ,
NULL , // m_pbuf ,
m_niceness ) )
return NULL;
LinkInfo *info1a = dd.getLinkInfo1();
LinkInfo *info1b = getLinkInfo1();
float pageNumInlinksA = info1a->m_numGoodInlinks;//getNumInlinksExtrapolated();
float pageNumInlinksB = info1b->m_numGoodInlinks;//getNumInlinksExtrapolated();
// . if the old dup doc is of lower quality than the new doc that
// we are checking, then that one should be removed, not us!
// if they are equal, we keep the shorter url of the two
// . dd was set from title rec so these numInlinks should be taken
// from the TagRec in ptr_tagRecData, and therefore NOT BLOCK!
if ( *dd.getSiteNumInlinks() < *getSiteNumInlinks() )
return &m_isDup;
if ( *dd.getSiteNumInlinks() == *getSiteNumInlinks() &&
pageNumInlinksA < pageNumInlinksB )
return &m_isDup;
if ( *dd.getSiteNumInlinks() == *getSiteNumInlinks() &&
pageNumInlinksA == pageNumInlinksB &&
dd.getFirstUrl()->getUrlLen() > getFirstUrl()->getUrlLen())
return &m_isDup;
float *ts = getTagSimilarity ( &dd );
if ( ! ts || ts == (float *)-1 ) return (char *)ts;
float *gs = getGigabitSimilarity ( &dd );
if ( ! gs || gs == (float *)-1 ) return (char *)gs;
float *ps = getPageSimilarity ( &dd );
if ( ! ps || ps == (float *)-1 ) return (char *)ps;
int32_t gigabitVecSimilarity = (int32_t)*gs;
int32_t tagVecSimilarity = (int32_t)*ts;
int32_t sampleVecSimilarity = (int32_t)*ps;
int32_t notSimilarCount = 0;
if ( gigabitVecSimilarity < 80 ) {
notSimilarCount++;
if ( gigabitVecSimilarity < 50 ) return &m_isDup;
}
if ( tagVecSimilarity < 80 ) {
notSimilarCount++;
if ( tagVecSimilarity < 50 ) return &m_isDup;
}
if ( sampleVecSimilarity < 80 ) {
notSimilarCount++;
if ( sampleVecSimilarity < 50 ) return &m_isDup;
}
// if it is similar enough, we got a dup!
if ( notSimilarCount <= 0 ) { m_isDupValid = true; m_isDup = true; }
return &m_isDup;
}
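// Illustrative example (hypothetical values): if the gigabit, tag and
// sample similarities come back as 90, 85 and 95, notSimilarCount stays 0
// and we are flagged as a dup of docid d. Any single score below 50
// returns "not a dup" immediately, and any score in the 50-79 range bumps
// notSimilarCount above 0, which also leaves us marked as not a dup.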
// hash a gigabit hash vector without its scores, also order independent
uint32_t *XmlDoc::getGigabitVectorScorelessHash ( ) {
if ( m_gigabitVectorHashValid ) return &m_gigabitVectorHash;
int32_t **gbvec = getGigabitHashes();
if ( ! gbvec || gbvec == (int32_t **)-1 ) return (uint32_t *)gbvec;
uint32_t h = 0;
// this bad boy is NULL terminated
uint32_t *gbv = (uint32_t *)*gbvec;
// i guess zak likes the simple XOR'ing thing...
for ( int32_t i = 0; gbv && gbv[i] ; i++) h ^= gbv[i];
m_gigabitVectorHashValid = true;
m_gigabitVectorHash = h;
return &m_gigabitVectorHash;
}
// . the original vector used for deduping similar search results is just from
// random sample of indexed terms, but gigabit vector is
// formed using the hashes of the top-scoring gigabits of the document, and
// therefore uses the words class
// . sets g_errno and returns NULL on error
// . ptr_gigabitHashes can be NULL...
int32_t **XmlDoc::getGigabitHashes ( ) {
// if it was already set, treat this as an accessor
if ( m_gigabitHashesValid ) return &ptr_gigabitHashes;
// this also sets the vector
char *gq = getGigabitQuery();
if ( ! gq || gq == (char *)-1) return (int32_t **)gq;
// it should be valid now!
if ( ! m_gigabitHashesValid ) { char *xx=NULL;*xx=0; }
return &ptr_gigabitHashes;
}
// . the new function to get gigabits
// . sets and validates m_gigabitQuery[] and m_gigabitHashes[] among others
// . candidates = capitalized word, capitalized sequence of words,
// uncapitalized 2+ word wikipedia phrase.
// . candidates exclude uncapitalized query stop words.
// . calls addGigabits() which is called by each doc in search results
// when we use this at query time.
// . separates gigabits with a comma (delimiter) in m_gigabitQuery[]
// . quotes multiple word gigabits
char *XmlDoc::getGigabitQuery ( ) {
if ( m_gigabitQueryValid ) return m_gigabitQuery;
setStatus ( "getting gigabit query" );
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (char *)ww;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (char *)d;
Sections *ss = getSections();
if ( ! ss || ss == (Sections *)-1 ) return (char *)ss;
//Weights *we = getWeights();
//if ( ! we || we == (Weights *)-1 ) return (char *)we;
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
LinkInfo **pinfo2 = getLinkInfo2();
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (char *)pinfo2;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
HashTableX ht;
char buf [ 200000 ];
// pass in niceness in case it has to grow really big and re-hash all!!
ht.set ( 8 , 4 , -1 , buf , 200000 , false, m_niceness,"xmlgbtbl");
// . add gigabits from our body words
// . includes title and header tags so pts can work well!
if ( ! addGigabits ( ww , *d , ss , *langId ) ) return NULL;
// add gigabits from link info
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
// sanity check
char *txt = k->getLinkText();
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
if ( ! verifyUtf8 ( txt , tlen ) ) {
log("xmldoc: bad link text 0 from url=%s for %s",
k->getUrl(),m_firstUrl.m_url);
continue;
}
// add those in
if (!addGigabits(txt, *d, *langId ) ) return NULL;
// add in neighborhoods
if(!addGigabits(k->getSurroundingText(),*d,*langId))
return NULL;
}
// add in gigabits for meta keywords
int32_t mdlen;
char *md = getMetaDescription( &mdlen );
if ( ! addGigabits2 ( md , mdlen, *d , *langId ) ) return NULL;
// add in gigabits for meta description
int32_t mklen;
char *mk = getMetaKeywords( &mklen );
if ( ! addGigabits2 ( mk , mklen , *d , *langId ) ) return NULL;
// set m_gigabitQuery and m_gigabitScores
//GigabitInfo *top[100];
// fill in "top" in order of score
m_numTop = getTopGigabits ( &ht , m_top , 100 , 0 );
// error? then g_errno should be set
if ( m_numTop == -1 ) return NULL;
char *p = m_gigabitQuery;
char *pend = m_gigabitQuery + XD_GQ_MAX_SIZE - 1;
// reset count of vector components for setting gigabit vector
int32_t ng = 0;
// total score
//int32_t total = 0;
// . now set the gigabit query!
// . start with the highest scoring node first, the last node since
// nodes are ranked by lowest to highest key
for ( int32_t i = 0 ; i < m_numTop ; i++ ) {
// get the info
GigabitInfo *gi = m_top[i];
// stop if too big
if ( p + gi->m_len + 10 >= pend ) continue;
// get 32 bit hash
uint32_t h = gi->m_hash & 0xffffffff;
// never allow 0
if ( h == 0 ) h = 1;
// add to vector
if ( ng + 1 < XD_MAX_GIGABIT_HASHES ) {
// the term hash
m_gigabitHashes[ng] = (int32_t)h ;
// and the score
m_gigabitScores[ng] = gi->m_pts;
// point into it, where we will copy it to
m_gigabitPtrs [ng] = p + 1;
// advance
ng++;
}
// quote it
*p++ = '\"';
// write into buffer
gbmemcpy ( p , gi->m_ptr , gi->m_len );
// finish quote
*p++ = '\"';
// separate terms just in case
//gbmemcpy ( p , " , ", 4 );
//p += 4;
*p++ = ',';
}
// done
*p++ = '\0';
// NULL terminate the vector to make it a legit vector
m_gigabitHashes [ ng ] = 0;
m_gigabitScores [ ng ] = 0;
// include the terminating 0
ng++;
// validate both the query and vector
m_gigabitQueryValid = true;
m_gigabitHashesValid = true;
// set this too
ptr_gigabitHashes = m_gigabitHashes;
ptr_gigabitScores = m_gigabitScores;
size_gigabitHashes = ng * 4 ; // 4 bytes each component
size_gigabitScores = ng * 4 ; // 4 bytes each score
return m_gigabitQuery;
}
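// Illustrative example (hypothetical terms): for a page about football the
// loop above might produce a gigabit query like
//   "denver broncos","super bowl","mile high stadium",
// i.e. each gigabit quoted and comma separated, with the parallel
// m_gigabitHashes/m_gigabitScores arrays NULL terminated by a zero entry.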
// . fill in "top" in order of score
// . returns -1 and sets g_errno on error
int32_t getTopGigabits ( HashTableX *ht ,
GigabitInfo **top ,
int32_t max ,
int32_t minDocCount ) {
// store top 100 into this tree
RdbTree tree;
if ( ! tree.set ( 4 , // fixedDataSize
max+2 , // maxNumNodes
true , // balance?
-1 , // maxMem
true , // own data?
"tree-topgbits" ))
return -1;
int32_t ns = ht->getNumSlots();
key_t minKey;
bool minKeyValid = false;
for ( int32_t i = 0 ; i < ns ; i++ ) {
// skip if empty
if ( ht->isEmpty(i) ) continue;
// get his info
GigabitInfo *gi = (GigabitInfo *)ht->getValueFromSlot(i);
// must be valid
if ( gi->m_count <= 0 ) { char *xx=NULL;*xx=0; }
// must be in this many docs minimum
if ( gi->m_numDocs < minDocCount ) continue;
// make the key
key_t key;
key.n1 = gi->m_pts;
key.n0 = gi->m_hash;
// should we add it?
if ( minKeyValid && key <= minKey ) continue;
// we should add it. use points as the key. use PTR as data
int32_t node = tree.addNode(0,key,(char *)&gi,4);
// error? g_errno should be set
if ( node < 0 ) return -1;
// if not full continue
if ( tree.getNumUsedNodes() < 100 ) continue;
// get the smallest node
int32_t tn = tree.getLowestNode ( ) ;
// sanity check
if ( tn < 0 ) { char *xx=NULL;*xx=0; }
// kick out smallest
tree.deleteNode3 ( tn , false );
// get new smallest
tn = tree.getLowestNode();
// set the new minkey
minKey = *(key_t *)tree.getKey ( tn );
// validate it
minKeyValid = true;
}
int32_t count = 0;
// . now set the array
// . start with the highest scoring node first, the last node since
// nodes are ranked by lowest to highest key
for ( int32_t nn=tree.getLastNode() ; nn>=0 ; nn=tree.getPrevNode(nn) ){
// get the info
GigabitInfo *gi = (GigabitInfo *)tree.getData(nn);
// store it
top[count++] = gi;
// stop if we are full
if ( count >= max ) break;
}
return count;
}
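// A minimal sketch of the same bounded top-N selection using the STL,
// shown only as an illustration of the technique (the build keeps the
// RdbTree version above):
//
//   #include <queue>
//   #include <vector>
//   #include <functional>
//   // min-heap keyed on points; evict the smallest once we exceed "max"
//   typedef std::pair<int32_t,GigabitInfo *> Entry;
//   std::priority_queue<Entry,std::vector<Entry>,std::greater<Entry> > pq;
//   for ( int32_t i = 0 ; i < ht->getNumSlots() ; i++ ) {
//           if ( ht->isEmpty(i) ) continue;
//           GigabitInfo *gi = (GigabitInfo *)ht->getValueFromSlot(i);
//           pq.push ( Entry ( gi->m_pts , gi ) );
//           if ( (int32_t)pq.size() > max ) pq.pop();
//   }
//   // popping the survivors now yields the top "max" gigabits, lowest
//   // score first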
char *XmlDoc::getMetaDescription( int32_t *mdlen ) {
if ( m_metaDescValid ) {
*mdlen = m_metaDescLen;
return m_metaDesc;
}
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
//xml->getMetaContent ( m_metaDesc, 1024, "description", 11 );
// we need to point to it in the html source so our WordPosInfo
// algo works right.
m_metaDesc = xml->getMetaContentPointer("description",
11,
"name",
&m_metaDescLen);
*mdlen = m_metaDescLen;
m_metaDescValid = true;
return m_metaDesc;
}
char *XmlDoc::getMetaSummary ( int32_t *mslen ) {
if ( m_metaSummaryValid ) {
*mslen = m_metaSummaryLen;
return m_metaSummary;
}
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
m_metaSummary = xml->getMetaContentPointer("summary",
7,
"name",
&m_metaSummaryLen);
*mslen = m_metaSummaryLen;
m_metaSummaryValid = true;
return m_metaSummary;
}
char *XmlDoc::getMetaKeywords( int32_t *mklen ) {
if ( m_metaKeywordsValid ) {
*mklen = m_metaKeywordsLen;
return m_metaKeywords;
}
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
//xml->getMetaContent ( m_metaKeywords, 1024, "keywords", 8 );
// we need to point to it in the html source so our WordPosInfo
// algo works right.
m_metaKeywords=xml->getMetaContentPointer("keywords",
8,
"name",
&m_metaKeywordsLen);
*mklen = m_metaKeywordsLen;
m_metaKeywordsValid = true;
return m_metaKeywords;
}
bool XmlDoc::addGigabits ( char *s ,
int64_t docId ,
uint8_t langId ) {
Words tmp;
// skip if none
if ( ! s ) return true;
// returns NULL with g_errno set on error
if ( ! tmp.set9 ( s , m_niceness ) ) return false;
// and weights!
//Weights we;
//if ( ! we.set ( &tmp , )
// and so does this
return addGigabits ( &tmp , docId , NULL , langId );
}
bool XmlDoc::addGigabits2 ( char *s ,
int32_t slen,
int64_t docId ,
uint8_t langId ) {
Words tmp;
// skip if none
if ( ! s ) return true;
// returns NULL with g_errno set on error
if ( ! tmp.setx ( s , slen , m_niceness ) ) return false;
// and weights!
//Weights we;
//if ( ! we.set ( &tmp , )
// and so does this
return addGigabits ( &tmp , docId , NULL , langId );
}
bool XmlDoc::addGigabits(Words *ww,int64_t docId,Sections *sections,
uint8_t langId ) {
// skip sections marked as these:
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
// get this
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
// not if we don't have any identified sections
if ( sections && sections->m_numSections <= 0 ) sp = NULL;
// shortcuts
int64_t *wids = ww->m_wordIds;
char **wptrs = ww->m_words;
int32_t *wlens = ww->m_wordLens;
nodeid_t *tids = ww->m_tagIds;
int32_t nw = ww->getNumWords();
//int32_t flags;
// initial # of slots
int32_t is = 0;
if ( m_wordsValid ) is = ww->m_numAlnumWords;
// put gigabits into this hash table
HashTableX ht;
if ( ! ht.set ( 8 , sizeof(GigabitInfo),is,NULL,0,false,m_niceness,
"gigabits") )
return false;
// scan through the words
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe if being called by spider
QUICKPOLL ( m_niceness );
// skip if not alnum word
if ( ! wids[i] ) continue;
// get section
Section *sx = NULL;
// get flags
if ( sp ) sx = sp[i];//flags = sp[i]->m_flags;
//else flags = 0;
// skip if ignored. i.e. in the menu or not in the article text
//if ( flags & badFlags ) continue;
// are we capitalized?
bool cap = ww->isCapitalized(i);
// ignore lower case query stop words
if (!cap&&isQueryStopWord(wptrs[i],wlens[i],wids[i],langId))
continue;
// hash of word then the phrase
//uint32_t h = wids[i] & 0xffffffff;
//uint64_t h = wids[i];
// add the word itself. return NULL with g_errno set on error
if ( ! addGigabit (&ht,wptrs[i],wlens[i],docId,
sx,true,langId,i)) return false;
// save position
int32_t j = i + 1 ;
// check this far out
int32_t maxj = i + 12; if ( maxj > nw ) maxj = nw;
// do we got a cap phrase?
bool capPhrase = false;
// if capitalized look for sequence
for ( ; cap && j < maxj ; j++ ) {
// . stop on tags
// . tids is NULL if being set from meta tag...
if ( tids && tids[j] ) break;
// skip if not alnum
if ( ! wids[j] ) {
// make sure it is like a single space or
// something we can "phrase across"
// TODO: can be like "capt. "
if ( wlens[j] == 1 ) continue;
// otherwise it stops the phrase
break;
}
// if not capitalized stop
if ( ! ww->isCapitalized(j) ) break;
// got one!
capPhrase = true;
// . hash it into the ongoing hash
// . Speller::getPopularity() should use this same
// method so we can get popularities of the gigabits!
//h = hash32Fast ( wids[j] & 0xffffffff , h );
//h = hash64Fast ( wids[j] , h );
}
// if we added something... skip whole phrase, if any
if ( capPhrase ) {
// get length of it
int32_t len = wptrs[j-1] + wlens[j-1] - wptrs[i];
// add that entire sequence, [i,j)
if ( ! addGigabit ( &ht,wptrs[i],len,docId,sx,
false,langId,i)) return false;
// advance to end of phrase
i = j - 1;
continue;
}
// reset
j = i + 1;
// this must be true
// . ok, look for a wiki phrase then!
// . we can speed this up if too slow... using a crazy hash tbl
int32_t wikij = -1;
// init the hash for wiki lookup
uint32_t h = 0;
// loop over successive terms
for ( ; j < maxj ; j++ ) {
// . stop on tags
// . tids is NULL if being set from meta tag
if ( tids && tids[j] ) break;
// skip if not alnum
if ( ! wids[j] ) {
// make sure it is like a single space or
// something we can "phrase across"
// TODO: can be like "capt. "
if ( wlens[j] == 1 ) continue;
// otherwise it stops the phrase
break;
}
// init it
if ( ! h ) h = hash32Fast ( wids[i] & 0xffffffff , 0 );
// hash it into the ongoing hash
h = hash32Fast ( wids[j] & 0xffffffff , h );
// is this in the wiki?
if ( ! g_wiki.isInWiki ( h ) ) continue;
// it is, mark it
wikij = j + 1;
}
// must be a 2+ word phrase in the wiki to be a gigabit
if ( wikij == -1 ) continue;
// bail if breach
if ( wikij >= nw ) continue;
// get len
int32_t len = wptrs[wikij] + wlens[wikij] - wptrs[i];
// add what we got
if ( ! addGigabit ( &ht,wptrs[i],len,docId,sx,false,
langId,i) ) return false;
// advance to end of phrase
i = wikij - 1;
}
return true;
}
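// Illustrative example (hypothetical text): scanning "the Denver Broncos
// won the super bowl", the lowercase query stop word "the" is skipped,
// "Denver" is added as a single-word gigabit, the capitalized run
// "Denver Broncos" is added as a phrase and the scan resumes after it,
// "won", "super" and "bowl" are added as single words, and "super bowl"
// is added as a phrase only if that lowercase 2+ word phrase is found in
// the wikipedia phrase table via g_wiki.isInWiki().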
char* XmlDoc::getMetadata(int32_t* retlen) {
if(!m_hasMetadata) {
*retlen = 0;
return NULL;
}
*retlen = size_metadata;
return ptr_metadata;
}
// . this is called by Msg40.cpp to intersect gigabits from multiple docs
// . returns -1 and sets g_errno on error
// . returns # of GigabitInfos stored into "top"
/*
int32_t intersectGigabits ( Msg20 **mp , // search results
int32_t n , // # of em
uint8_t langId , // searcher's langId
int32_t maxTop ,
int32_t docsToScan ,
int32_t minDocCount , // must be in this # docs
GigabitInfo *top ,
int32_t niceness ) {
// put gigabits into this hash table
HashTableX ht;
ht.set ( 8 , sizeof(GigabitInfo),0,NULL,0,false,niceness,"ginttbl");
for ( int32_t i = 0 ; i < n && i < docsToScan ; i++ ) {
// get the reply/searchResult
Msg20Reply *mr = mp[i]->m_r;
// sanity check
if ( ! mr && ! mp[i]->m_errno ) { char *xx=NULL;*xx=0; }
// this is NULL on error
if ( ! mr ) continue;
// count them
int32_t count = 0;
// add each gigabit for it
for ( char *p = mr->ptr_gigabitQuery ; p && *p ; count++ ) {
// skip the comma
p++;
// point to next
char *end = strchr ( p , ',' );
// do not allow NULLs
if ( ! end ) end = p + gbstrlen(p);
// get the score. aka GigabitInfo::m_pts
int32_t ptsArg = mr->ptr_gigabitScores[count];
// sanity check for bad scores
if ( ptsArg <= 0 ) { char *xx=NULL;*xx=0; }
// add it in
if ( ! addGigabit ( &ht ,
p ,
end - p , // length
mr->m_docId ,
NULL ,// section ptr
false , // singleWrd? unused
langId ,
-1 , // word #i not used
ptsArg ) )
return -1;
// advance p
p = end;
// if not comma, all done
if ( *p != ',' ) break;
// skip comma
p++;
}
}
// . get up to the top 50 gigabits
GigabitInfo *array [ 50 ];
int32_t numTop = getTopGigabits ( &ht , array , 50 , minDocCount );
// error? g_errno should be set
if ( numTop == -1 ) return -1;
// sanity check
if ( numTop > maxTop ) { char *xx=NULL;*xx=0; }
// now copy into our array
for ( int32_t i = 0 ; i < numTop ; i++ ) {
// get it
GigabitInfo *gi = array[i];
// copy it
gbmemcpy ( &top[i] , gi , sizeof(GigabitInfo) );
}
// return how many we copied
return numTop;
}
*/
// . "docId" is the document Id that "h" came from
// . if being called at query time we often get called on each search result!
// . if being called at parse/index time we are being called on a single docId
// . returns false and sets g_errno on error
bool addGigabit ( HashTableX *ht ,
char *s ,
int32_t slen ,
int64_t docId ,
Section *sp ,
bool singleWord ,
uint8_t langId ,
// starts with word #i
int32_t i ,
int32_t ptsArg ) {
// get its hash
uint64_t h = hash64d ( s , slen );
// get the slot where its at
int32_t slot = ht->getSlot ( &h );
// info for this hash/gigabit in the doc
GigabitInfo *gi ;
// otherwise, init a new slot. set the key to h
if ( slot < 0 ) {
// . add key to a new slot, set "gi" to the value ptr
// . use NULL for the GigabitInfo ptr temporarily so it should
// not gbmemcpy into the slot
if ( ! ht->addKey ( &h , NULL , &slot ) ) return false;
// get data ptr to the bogus data
gi = (GigabitInfo *)ht->getValueFromSlot ( slot );
// . set all the stuff now. this way avoids a gbmemcpy...
// . every wiki title should have a popularity i guess...
// . "pop" is # of docs out of 10,000 that have this phrase?
int32_t pop = g_speller.getPhrasePopularity(s,h,true,langId);
gi->m_pop = pop;
gi->m_pts = 0;
gi->m_count = 0;
gi->m_numDocs = 0;
gi->m_lastDocId = 0LL;
gi->m_currentDocCount = 0; // a char
gi->m_ptr = s;
gi->m_len = slen;
gi->m_hash = h;
// sanity test
GigabitInfo *tt = (GigabitInfo *)ht->getValue ( &h );
if ( tt->m_pop != pop ) { char *xx=NULL;*xx=0; }
}
else {
gi = (GigabitInfo *)ht->getValueFromSlot ( slot );
// only allow up to 5 votes per document!
if ( gi->m_currentDocCount >= 5 ) return true;
}
// inc the count, we got one more occurrence
gi->m_count++;
// doc count. how many docs have this gigabit? count it.
if ( docId != gi->m_lastDocId ) {
gi->m_numDocs++;
gi->m_lastDocId = docId;
gi->m_currentDocCount = 1;
}
else
gi->m_currentDocCount++;
// given?
if ( ptsArg != -1 ) {
gi->m_pts += ptsArg;
return true;
}
// base points on popularity
float pts = 1.0;
if ( gi->m_pop < 1 ) pts = 1000;
else if ( gi->m_pop < 2 ) pts = 500;
else if ( gi->m_pop < 3 ) pts = 250;
else if ( gi->m_pop < 4 ) pts = 200;
else if ( gi->m_pop < 5 ) pts = 150;
else if ( gi->m_pop < 6 ) pts = 100;
else if ( gi->m_pop < 7 ) pts = 20;
else if ( gi->m_pop < 8 ) pts = 10;
else if ( gi->m_pop < 10 ) pts = 5;
else if ( gi->m_pop < 15 ) pts = 3;
else if ( gi->m_pop < 20 ) pts = 2;
// . special boost if in title, header or anchor tag
// . the weights class ONLY boosts the first 20 or so words in
// header tags... how can we fix that??????????????????
// . TODO: FIX THAT!!!
//if ( flags & SEC_TITLE ) pts = pts * 6.0/(float)we->m_titleWeight;
//if ( flags & SEC_HEADER) pts = pts * 4.0/(float)we->m_headerWeight;
//if ( flags & SEC_A ) pts = pts * 4.0/(float)we->m_linkTextWeight;
if ( sp ) {
if ( sp->m_flags & SEC_IN_TITLE ) pts = pts * 6.0;
if ( sp->m_flags & SEC_IN_HEADER ) pts = pts * 4.0;
if ( sp->m_tagId == TAG_A ) pts = pts * 4.0;
}
// if for the query 'recreation' you get the phrase "park bench"
// 100 times and the word "bench" 100 times. the word weight
// for "bench" should be very low! Weights.cpp also demotes repreated
// sentence fragments, etc. it is generally a really handy thing!
// and i think it already boosts scores for being in the title, etc.
// IF BEING called from meta tag, weights are NULL!
// TODO: we need to use the diversity vector here then...
//if ( we ) {
// if ( singleWord ) pts *= we->m_ww[i];
// else pts *= we->m_pw[i];
//}
// add them in
gi->m_pts += (int32_t)pts;
// good to go
return true;
}
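// Worked example (hypothetical values): a gigabit whose popularity comes
// back as 3 gets a base of 200 points per occurrence; if its section is
// in the title (SEC_IN_TITLE) that is boosted 6x to 1200, and inside an
// anchor tag (TAG_A) it would instead be boosted 4x to 800. The points
// accumulate into gi->m_pts, with at most 5 occurrences counted per
// document.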
/*
-- this will be a url filter var like "numindexed"
int32_t *XmlDoc::getSiteSpiderQuota ( ) {
if ( m_siteSpiderQuotaValid ) return &m_siteSpiderQuota;
int32_t *siteNumInlinks = getSiteNumInlinks();
if ( ! siteNumInlinks ) return NULL;
if ( siteNumInlinks == (int32_t *)-1 ) return (int32_t *)-1;
// get this fresh each time
int32_t *rn = getRegExpNum ( -1 );
if ( ! rn || rn == (int32_t *)-1 ) return (int32_t *)rn;
// bail early? this happens if we match a banned/filtered rule in
// the url filters table
if ( m_indexCode ) return NULL;
// valid at this point
m_siteSpiderQuotaValid = true;
// if no match, or filtered or banned, assume no quota
if ( *rn == -1 ) m_siteSpiderQuota = -1;
else m_siteSpiderQuota = cr->m_spiderQuotas[*rn];
// get the quota, -1 means no limit
return &m_siteSpiderQuota;
}
*/
Url *XmlDoc::getCurrentUrl ( ) {
if ( m_currentUrlValid ) return &m_currentUrl;
// otherwise, get first url
Url *fu = getFirstUrl();
if ( ! fu || fu == (void *)-1 ) return (Url *)fu;
// make that current url
m_currentUrl.set ( &m_firstUrl , false );
m_currentUrlValid = true;
return &m_currentUrl;
/*
// need a valid url
Url *u = getFirstUrl();
if ( ! u ) return NULL;
// but use redir if we got that
Url *r = getRedirUrl();
if ( r && m_redirUrlValid ) return r;
return u;
*/
}
Url *XmlDoc::getFirstUrl() {
if ( m_firstUrlValid ) return &m_firstUrl;
// we might have a title rec
if ( m_setFromTitleRec ) {
setFirstUrl ( ptr_firstUrl , false );
m_firstUrlValid = true;
return &m_firstUrl;
}
// must be this otherwise
if ( ! m_setFromDocId ) { char *xx=NULL;*xx=0; }
// this must be valid
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (Url *)pod;
// shortcut
XmlDoc *od = *pod;
// now set it
setFirstUrl ( od->ptr_firstUrl , false );
m_firstUrlValid = true;
return &m_firstUrl;
}
int64_t XmlDoc::getFirstUrlHash48() {
if ( m_firstUrlHash48Valid ) return m_firstUrlHash48;
// this must work
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
if ( getUseTimeAxis() ) {
m_firstUrlHash48 = hash64b ( getTimeAxisUrl()->getBufStart() ) & 0x0000ffffffffffffLL;
m_firstUrlHash48Valid = true;
return m_firstUrlHash48;
}
m_firstUrlHash48 = hash64b ( m_firstUrl.m_url ) & 0x0000ffffffffffffLL;
m_firstUrlHash48Valid = true;
return m_firstUrlHash48;
}
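// Illustrative note (not part of the original code): the mask
// 0x0000ffffffffffffLL keeps only the low 48 bits of the 64-bit url hash,
// e.g. 0x1234567890abcdefLL becomes 0x0000567890abcdefLL, giving the
// 48-bit url hash used elsewhere by the spider code.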
int64_t XmlDoc::getFirstUrlHash64() {
if ( m_firstUrlHash64Valid ) return m_firstUrlHash64;
// this must work
if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; }
if ( getUseTimeAxis() ) {
m_firstUrlHash64 = hash64b ( getTimeAxisUrl()->getBufStart() );
m_firstUrlHash64Valid = true;
return m_firstUrlHash64;
}
m_firstUrlHash64 = hash64b ( m_firstUrl.m_url );
m_firstUrlHash64Valid = true;
return m_firstUrlHash64;
}
Url **XmlDoc::getLastRedirUrl() {
Url **ru = getRedirUrl();
if ( ! ru || ru == (void *)-1 ) return ru;
// m_redirUrlPtr will be NULL once there is no further redirect to
// follow, however, the last redir url we actually got will be set in
// m_redirUrl.m_url so return that.
m_lastRedirUrlPtr = &m_redirUrl;
return &m_lastRedirUrlPtr;
}
// . operates on the latest m_httpReply
Url **XmlDoc::getRedirUrl() {
if ( m_redirUrlValid ) return &m_redirUrlPtr;
setStatus ( "getting redir url" );
// assume no redirect
m_redirUrlPtr = NULL;
//ptr_redirUrl = NULL;
//size_redirUrl = 0;
// bail on this
//if ( ! m_checkForRedir ) {
// m_redirError = 0;
// m_redirErrorValid = true;
// return &m_redirUrlPtr;
//}
// we might have a title rec
if ( m_setFromTitleRec ) { char *xx=NULL;*xx=0; }
// or recycling content from old title rec
if ( m_recycleContent ) {
m_redirError = 0;
m_redirErrorValid = true;
m_redirUrlValid = true;
return &m_redirUrlPtr;
}
// get the current http reply, not the final http reply necessarily
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
// set a mime on the stack
HttpMime mime;
// shortcut
int32_t LEN = m_httpReplySize - 1;
// sanity check
if ( LEN > 0 && ! m_httpReply ) { char *xx=NULL;*xx=0; }
// empty reply, no redir
if ( LEN == 0 ) {
// bad mime, but i guess valid empty redir url
m_redirUrlValid = true;
// no error
m_redirError = 0;
m_redirErrorValid = true;
// return a fake thing. content length is 0.
return &m_redirUrlPtr;
}
// set it. if 'connection refused' then LEN is -1.
if ( LEN<0 || ! mime.set ( m_httpReply, LEN, getCurrentUrl() ) ) {
// set this on mime error
//if ( ! m_indexCode ) m_indexCode = EBADMIME;
// bad mime, but i guess valid empty redir url
m_redirUrlValid = true;
// return nothing, no redirect url was there
m_redirUrlPtr = NULL;
// no error
m_redirError = 0;
m_redirErrorValid = true;
// return a fake thing. content length is 0.
return &m_redirUrlPtr;
}
int32_t httpStatus = mime.getHttpStatus() ;
Url *loc = NULL;
// quickly see if we are a robots.txt url originally
bool isRobotsTxt = isFirstUrlRobotsTxt ( );
//
// check for <meta http-equiv="Refresh" content="1; URL=contact.htm">
// if httpStatus is not a redirect
//
if ( httpStatus < 300 || httpStatus > 399 ) {
// ok, crap, i was getting the xml here to get the meta
// http-equiv refresh tag, but that added an element of
// recursion that is just too confusing to deal with. so
// let's just parse out the meta tag by hand
bool checkMeta = true;
if ( isRobotsTxt ) checkMeta = false;
// if we are a doc that consists of a sequence of sub-docs that
// we are indexing/injecting then don't do this check.
if ( isContainerDoc() ) checkMeta = false;
if ( checkMeta ) {
Url **mrup = getMetaRedirUrl();
if ( ! mrup || mrup == (void *)-1) return (Url **)mrup;
// set it. might be NULL if not there.
loc = *mrup;
}
}
else
// get Location: url (the redirect url) from the http mime
loc = mime.getLocationUrl();
// get current url
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (Url **)cu;
// this call set size_catIds
int32_t **pcids = getCatIds();
if ( ! pcids || pcids == (void *)-1) return (Url **)pcids;
// get local link info
LinkInfo *info1 = getLinkInfo1();
// error or blocked
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Url **)info1;
// get remote link info
LinkInfo **pinfo2 = getLinkInfo2();
// error or blocked
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (Url **)pinfo2;
// convenience
LinkInfo *info2 = *pinfo2;
// breathe
QUICKPOLL(m_niceness);
// did we send a cookie with our last request?
bool sentCookieLastTime = false;
if ( m_redirCookieBuf.length() )
sentCookieLastTime = true;
// get cookie for redirect to fix nyt.com/nytimes.com
// for gap.com it uses multiple Set-Cookie:\r\n lines so we have
// to accumulate all of them into a buffer now
m_redirCookieBuf.reset();
mime.addCookiesIntoBuffer ( &m_redirCookieBuf );
m_redirCookieBufValid = true;
/*
char *cookie = mime.getCookie();
// find end of cookie at the semicolon
char *s = cookie;
for ( ; s && *s && *s != ';' ; s++ );
if ( s && *s == ';' ) {
// do not include ;
int32_t clen = s - cookie;
m_redirCookieBuf.reset();
m_redirCookieBuf.safeMemcpy ( cookie , clen );
m_redirCookieBuf.nullTerm();
m_redirCookieBufValid = true;
}
*/
// mdw23
//log("http: reply=%s",m_httpReply);
// a hack for removing session ids already in there. for
// brilliantshopper's bs4 collection and gk0 cluster
//bool forceRedirect = false;
if ( size_catIds == 0 &&
// must not have an actual redirect url in there
! loc &&
// must be a valid http status
httpStatus == 200 &&
(gb_strcasestr( cu->getUrl(), "sessionid") ||
gb_strcasestr( cu->getUrl(), "oscsid") ) ) {
Url *tt = &m_redirUrl;
tt->set ( cu->getUrl() ,
cu->getUrlLen() ,
true , // addwww?
true ); // strip sessid?
// if it no longer has the session id, force redirect it
if ( ! gb_strcasestr( tt->getUrl(), "sessionid") &&
! gb_strcasestr( tt->getUrl(), "oscsid") ) {
m_redirUrlValid = true;
m_redirUrlPtr = &m_redirUrl;
// TODO: log redir url in spider log output
//logf(LOG_INFO,"build: %s force redirected to %s",
// cu->getUrl(),m_redirUrl.getUrl());
m_redirUrlValid = true;
ptr_redirUrl = m_redirUrl.m_url;
size_redirUrl = m_redirUrl.m_ulen+1;
// no error
m_redirError = 0;
m_redirErrorValid = true;
return &m_redirUrlPtr;
}
}
// breathe
QUICKPOLL(m_niceness);
// if no location url, then there is no redirect; return a NULL redir url
if ( ! loc || loc->m_url[0] == '\0' ) {
// validate it
m_redirUrlValid = true;
// no error
m_redirError = 0;
m_redirErrorValid = true;
// and return an empty one
return &m_redirUrlPtr;
}
// breathe
QUICKPOLL(m_niceness);
// this is handy
//Url tmp;
// TODO: make sure we got this logic elsewhere
// if robots.txt said no, and if we had no link text, then give up
//if(! *isAllowed && !info1->hasLinkText() && !info2->hasLinkText() ) {
// m_indexCode = EDOCDISALLOWED;
// set our redir url from the mime's Location: field. addWWW=false
//if ( loc != &tmp ) tmp.set ( loc , false );
bool keep = false;
if ( size_catIds > 0 ) keep = true;
if ( info1->hasLinkText() ) keep = true;
if ( info2 && info2->hasLinkText() ) keep = true;
// at this point we do not block anywhere
m_redirUrlValid = true;
// store the redir error
m_redirError = 0;
m_redirErrorValid = true;
// i've seen a "Location: 2010..." bogus url as well, so make sure
// we got a legit url
if ( ! loc->getDomain() || loc->getDomainLen() <= 0 ) {
if ( ! keep ) m_redirError = EDOCBADREDIRECTURL;
return &m_redirUrlPtr;
}
//bool injected = false;
// get from spider request if there
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
// . if redirect url is nothing new, then bail (infinite loop)
// . www.xbox.com/SiteRequirements.htm redirects to itself
// until you send a cookie!!
// . www.twomileborris.com does the cookie thing, too
if ( strcmp ( cu->getUrl(), loc->getUrl() ) == 0 ) {
// try sending the cookie if we got one now and didn't have
// one for this last request
if ( ! sentCookieLastTime && m_redirCookieBuf.length() ) {
m_redirUrl.set ( loc->getUrl() );
m_redirUrlPtr = &m_redirUrl;
return &m_redirUrlPtr;
}
if ( ! keep ) m_redirError = EDOCREDIRECTSTOSELF;
return &m_redirUrlPtr;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . don't allow redirects when injecting!
// . otherwise, we would mfree(m_buf) which would free our
// injected reply... yet m_injectedReplyLen would still be
// positive! can you say 'seg fault'?
// . hmmm... seems to have worked though
if ( cr->m_recycleContent || m_recycleContent ) { // || injected
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
// . if we followed too many then bail
// . www.motorolamobility.com www.outlook.com ... failed when we
// had >= 4 here
if ( ++m_numRedirects >= 10 ) {
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
// sometimes idiots don't supply us with a Location: mime
if ( loc->getUrlLen() == 0 ) {
if ( ! keep ) m_redirError = EDOCBADREDIRECTURL;
return &m_redirUrlPtr;
}
// . protocol of url must be http or https
// . we had one url redirect to an ihttp:// protocol and caused
// spider to core dump when it saw that SpiderRequest record
char *proto = loc->getScheme();
if ( strncmp(proto,"http://" ,7) && strncmp(proto,"https://",8) ) {
m_redirError = EDOCBADREDIRECTURL;
return &m_redirUrlPtr;
}
// do not allow redirects to evil-G or bing
//if ( strstr(loc->getUrl(),".google.com/") ||
// strstr(loc->getUrl(),".bing.com/") ) {
// m_redirError = EDOCEVILREDIRECT;
// return &m_redirUrlPtr;
//}
// log a msg
if ( g_conf.m_logSpideredUrls )
logf(LOG_INFO,"build: %s redirected to %s",
cu->getUrl(),loc->getUrl());
// if not same Domain, it is not a simplified redirect
bool sameDom = true;
int32_t dlen = loc->getDomainLen();
if ( cu->getDomainLen() != dlen ) sameDom=false;
else if ( strncmp(cu->getDomain(),loc->getDomain(),dlen))sameDom=false;
if ( ! sameDom ) {
m_redirectFlag = true;
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
ptr_redirUrl = m_redirUrl.m_url;
size_redirUrl = m_redirUrl.m_ulen+1;
return &m_redirUrlPtr;
}
// if redirecting to the same domain, then do not add "www.".
// this way we can take care of slashdot.org, etc.
//bool addwww = false;
// but never modify if in dmoz, keep it pure
//if ( size_catIds > 0 ) addwww = false;
// debug msg
//if ( strcmp(m_redirUrl.getUrl(),url->getUrl())== 0 )
// log("Redirect error: same url");
//bool stripSessId = (size_catIds == 0);
// . reset m_redirUrl now (do not addWWW for slashdot.org, etc)
// . we now add "www." UNLESS it's a redirect from the same
// domain or firstUrl is in catdb
//tmp.set( loc->getUrl(),loc->getUrlLen(),addwww,stripSessId);
/*
// get this
bool sameHostLinks = false;
if ( *pi >= 0 ) sameHostLinks =cr->m_pq_spiderSameHostnameLinks[*pi];
// get first url ever
Url *f = getFirstUrl();
// . for same host links, addwww for comparing
// . so if we are doing google.com and it redirects to
// www.google.com then we will allow that... and vice versa
if ( sameHostLinks ) {
Url u1;
Url u2;
u1.set ( loc->getUrl () , loc->getUrlLen(), true ); // addwww?
u2.set ( f->getUrl() , f->getUrlLen () , true ); // addwww?
// host must match if we are restricted to a particular host
if ( u1.getHostLen() != u2.getHostLen() ||
strncmp ( u1.getHost() , u2.getHost() ,
u1.getHostLen () ) != 0 ) {
m_redirError = EDOCBADREDIRECTURL;
return &m_redirUrlPtr;
}
}
*/
// get first url ever
Url *f = getFirstUrl();
// breathe
QUICKPOLL(m_niceness);
// set this to true if the redirected urls is much preferred
bool simplifiedRedir = false;
// . if it redirected to a simpler url then stop spidering now
// and add the simpler url to the spider queue
// . by simpler, i mean one w/ fewer path components
// . or one with a www for hostname
// . or could be same as firstUrl but with a / appended
char *r = loc->getUrl();
char *u = f->getUrl();
int32_t rlen = loc->getUrlLen();
int32_t ulen = f->getUrlLen();
// simpler if new path depth is shorter
if ( loc->getPathDepth (true) < f->getPathDepth (true) )
simplifiedRedir = true;
// simpler if old has cgi and new does not
if ( f->isCgi() && ! loc->isCgi() )
simplifiedRedir = true;
// if we're a dmoz page, don't do this, unless just a / case,no
if ( size_catIds > 0 )
simplifiedRedir = false;
// simpler if new one is same as old but has a '/' at the end
if ( rlen == ulen+1 && r[rlen-1]=='/' && strncmp(r,u,ulen)==0)
simplifiedRedir = true;
// . if new url does not have semicolon but old one does
// . http://news.yahoo.com/i/738;_ylt=AoL4eFRYKEdXbfDh6W2cF
// redirected to http://news.yahoo.com/i/738
if ( strchr (u,';') && ! strchr (r,';') )
simplifiedRedir = true;
// simpler if new host is www and old is not
if ( loc->isHostWWW() && ! f->isHostWWW() )
simplifiedRedir = true;
// if redirect is to different domain, set simplified
// this helps locks from bunching on one domain
if ( loc->getDomainLen()!=f->getDomainLen() ||
strncasecmp ( loc->getDomain(),
f->getDomain(),
loc->getDomainLen() ) != 0 )
// crap, but www.hotmail.com redirects to live.msn.com
// login page ... so add this check here
if ( ! f->isRoot() )
simplifiedRedir = true;
bool allowSimplifiedRedirs = m_allowSimplifiedRedirs;
// follow redirects if injecting so we do not return
// EDOCSIMPLIFIEDREDIR
if ( getIsInjecting ( ) )
allowSimplifiedRedirs = true;
// or if disabled then follow the redirect
if ( ! cr->m_useSimplifiedRedirects )
allowSimplifiedRedirs = true;
// . if the redir url is simpler, but has no hostname we
// prepend a "www." to it
// . this should prevent www.russ.ru and russ.ru from being
// in the index at the same time and causing url: collisions
/*
if ( size_catIds == 0 &&
simplifiedRedir &&
loc->getDomainLen() == loc->getHostLen () )
loc->set (loc->getUrl(),
loc->getUrlLen(),
true, //false, addwww?
stripSessId );
*/
// if not allow, do not do them... except for the two below
//if ( ! m_useSimplifiedRedirects || m_isDirColl )
// simplifiedRedir = false;
// special hack for nytimes.com. do not consider simplified redirs
// because it uses a cookie along with redirs to get to the final
// page.
char *dom2 = m_firstUrl.getDomain();
int32_t dlen2 = m_firstUrl.getDomainLen();
if ( dlen2 == 11 && strncmp(dom2,"nytimes.com",dlen2)==0 )
allowSimplifiedRedirs = true;
// same for bananarepublic.gap.com ?
// if ( dlen2 == 7 && strncmp(dom2,"gap.com",dlen2)==0 )
// allowSimplifiedRedirs = true;
// if redirect is setting cookies we have to follow the redirect
// all the way through so we can stop now.
if ( m_redirCookieBufValid && m_redirCookieBuf.getLength() )
allowSimplifiedRedirs = true;
// . don't bother indexing this url if the redir is better
// . 301 means moved PERMANENTLY...
// . many people use 301 on their root pages though, so treat
// it like a temporary redirect, like exclusivelyequine.com
if ( simplifiedRedir && ! allowSimplifiedRedirs &&
// for custom BULK clients don't like this i guess
// AND for custom crawl it was messing up the processing
// url format for a nytimes blog subsite which was redirecting
// to the proper nytimes.com site...
// ! cr->m_isCustomCrawl ) {
// no, we need this for custom crawls because otherwise we
// get too many dups in the index. so for nyt we need something
// else
cr->m_isCustomCrawl != 2 ) {
// returns false if blocked, true otherwise
//return addSimplifiedRedirect();
m_redirError = EDOCSIMPLIFIEDREDIR;
// set this because getLinks() treats this redirUrl
// as a link now, it will add a SpiderRequest for it:
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
// mdw: let this path through so contactXmlDoc gets a proper
// redirect that we can follow. for the base xml doc at
// least the m_indexCode will be set
return &m_redirUrlPtr;
}
// good to go
m_redirectFlag = true;
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
ptr_redirUrl = m_redirUrl.m_url;
size_redirUrl = m_redirUrl.m_ulen+1;
return &m_redirUrlPtr;
}
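// Illustrative example (hypothetical urls): if http://example.com/a/b?x=1
// redirects to http://www.example.com/a/, the new url has a shorter path,
// drops the cgi parameters and gains a "www." hostname, so simplifiedRedir
// is set. Unless simplified redirects are allowed (injection, cookies
// being set, nytimes.com, etc.) the doc gets EDOCSIMPLIFIEDREDIR and the
// redirect url is left for getLinks() to add as its own SpiderRequest.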
int32_t *XmlDoc::getFirstIndexedDate ( ) {
if ( m_firstIndexedDateValid ) return (int32_t *)&m_firstIndexedDate;
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od;
// valid
m_firstIndexedDateValid = true;
// must be downloaded
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// assume now is the first time
m_firstIndexedDate = getSpideredTime();//m_spideredTime;
// inherit from our old title rec
if ( *od ) m_firstIndexedDate = (*od)->m_firstIndexedDate;
// return it
return (int32_t *)&m_firstIndexedDate;
}
int32_t *XmlDoc::getOutlinksAddedDate ( ) {
if ( m_outlinksAddedDateValid ) return (int32_t *)&m_outlinksAddedDate;
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od;
// valid
m_outlinksAddedDateValid = true;
// must be downloaded
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// assume we are doing it now
m_outlinksAddedDate = getSpideredTime();//m_spideredTime;
// get that
if ( *od ) m_outlinksAddedDate = (*od)->m_outlinksAddedDate;
// return it
return (int32_t *)&m_outlinksAddedDate;
}
/*
int32_t *XmlDoc::getNumBannedOutlinks ( ) {
if ( m_numBannedOutlinksValid ) return &m_numBannedOutlinks;
setStatus ( "getting num banned outlinks" );
// get the outlinks
Links *links = getLinks();
if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
// count em
int32_t n = links->getNumLinks();
// reset
m_numBannedOutlinks = 0;
// one vote per domain hash table
char buf[20000];
HashTableX ht; ht.set ( 4 , 0 , -1 , buf , 20000 ,false,m_niceness);
// loop through them
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get the link
char *u = links->getLinkPtr(i);
// get domain of the link
int32_t dlen; char *dom = getDomFast ( u , &dlen , false );
// skip if bad domain
if ( ! dom || dlen <= 0 ) continue;
// get domHash
int32_t h = hash32 ( dom , dlen );
// one check per domain
if ( ht.getSlot ( &h ) >= 0 ) continue;
// add it, return NULL on error, g_errno should be set
if ( ! ht.addKey ( &h ) ) return NULL;
// . loop over all regular expression in the url filters table
// . stop at first regular expression it matches
int32_t *rn = getRegExpNum2 ( i );
// need to wait for a callback at this point
if ( ! rn || rn == (int32_t *)-1 ) return (int32_t *)rn;
// skip if no match in url filters table
if ( *rn == -1 ) continue;
// get spider priority
int32_t pr = cr->m_spiderPriorities[*rn];
// skip if not banned
if ( pr != -2 ) continue;
// count it
m_numBannedOutlinks++;
}
// all done
m_numBannedOutlinksValid = true;
// convert this too!
//m_numBannedOutlinks8 = score32to8 ( m_numBannedOutlinks );
// sanity check on score32to8()
//if(m_numBannedOutlinks8>0&&!m_numBannedOutlinks){char*xx=NULL;*xx=0;}
return &m_numBannedOutlinks;
}
*/
uint16_t *XmlDoc::getCountryId ( ) {
if ( m_countryIdValid ) return &m_countryId;
setStatus ( "getting country id" );
// get it
CatRec *cat = getCatRec ();
if ( ! cat || cat == (CatRec *)-1) return (uint16_t *)cat;
// MDW: i limit this to 10 to save stack space!
Url *u = getCurrentUrl();
if ( ! u || u == (void *)-1) return (uint16_t *)u;
// use the url's tld to guess the country
uint16_t country = g_langId.guessCountryTLD ( u->getUrl ( ) );
// . 0 means no country i guess. try dmoz next.
// . limit to 10 of them
int32_t nc = cat->m_numCatids;
for ( int32_t i = 0; ! country && i < nc && i < 10 ; i++) {
int32_t catid = cat->m_catids[i];
country = g_countryCode.getCountryFromDMOZ ( catid );
}
m_countryIdValid = true;
m_countryId = country;
return &m_countryId;
}
/*
XmlDoc *XmlDoc::getOldDoc ( ) {
if ( m_oldDocValid ) return &m_oldDoc;
// get current url
Url *u = getCurrentUrl();
// set its url otherwise
m_oldDoc.setFirstUrl ( u , false );
// get the old title rec
char *ret = getOldTitleRec();
if ( ! ret || ret == (char *)-1 ) return (XmlDoc *)ret;
// all done
m_oldDocValid = true;
// return it
return m_oldDoc;
}
*/
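// . get the language id of this url's site root page
// . if we ARE the root just use our own langId
// . otherwise check tagdb for a "rootlang" tag first; if that is missing,
//   get the root doc (possibly downloading it) and use its langId
// . if the root doc is unavailable (e.g. when injecting, to avoid the
//   download) assume langUnknown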
uint8_t *XmlDoc::getRootLangId ( ) {
// return it if we got it
if ( m_rootLangIdValid ) return &m_rootLangId;
// note it
setStatus ( "getting root lang id from tagdb");
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
	// if this doc IS the site root then the root lang is just our own
	// lang id
if ( *isRoot ) {
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 )
return (uint8_t *) langId;
m_rootLangId = *langId;
m_rootLangIdValid = true;
return &m_rootLangId;
//char *xx=NULL;*xx=0; }
}
// get the tag rec
TagRec *gr = getTagRec ();
if ( ! gr || gr == (TagRec *)-1 ) return (uint8_t *)gr;
// just use one. there may be multiple ones!
Tag *tag = gr->getTag("rootlang");
// if there use that
if ( ! tag ) {
// . get the root doc
// . allow for a one hour cache of the titleRec
XmlDoc **prd = getRootXmlDoc( 3600 );
if ( ! prd || prd == (void *)-1 ) return (uint8_t *)prd;
		// shortcut
XmlDoc *rd = *prd;
// . if no root doc, then assume language unknown
// . this happens if we are injecting because we do not want
// to download the root page for speed purposes
if ( ! rd ) {
m_rootLangId = langUnknown;
m_rootLangIdValid = true;
return &m_rootLangId;
}
// . update tagdb rec
// . on root download error use language "xx" (unknown) to
// avoid hammering the root page
//bool *status = rd->updateRootLangId ();
//if (! status || status==(void *)-1) return (uint8_t *)status;
// update our tag rec now
//Tag *tt = rd->m_newTagRec.getTag("rootlang");
// must be there
//if ( ! tt ) { char *xx=NULL;*xx=0; }
// add it for us
//if ( ! m_newTagRec.addTag ( tt ) ) return NULL;
// get it
uint8_t *rl = rd->getLangId();
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
// must be legit now!
if ( ! rd->m_langIdValid ) { char *xx=NULL;*xx=0;}
// now validate our stuff
m_rootLangIdValid = true;
//m_rootLangIdScore = rd->m_langIdScore;
m_rootLangId = rd->m_langId;
return &m_rootLangId;
}
	// sanity check ( must be like "en,50\0" or could be
	// "en_US,50\0" or "zh_cn,50" )
if ( tag->getTagDataSize() > 6 ) { char *xx=NULL;*xx=0; }
// point to 2 character language abbreviation
char *abbr = tag->getTagData();
/*
// find comma
char *comma = strchr(abbr,',' );
// sanity check
if ( ! comma ) { char *xx=NULL;*xx=0; }
// tmp NULL
*comma = '\0';
*/
// map it to an id
uint8_t langId = getLangIdFromAbbr( abbr );
/*
// put it back
*comma = ',';
// get score
int32_t score = atol(comma+1);
// sanity check
if ( score < 0 || score > 100 ) { char *xx=NULL;*xx=0; }
*/
// set that up
m_rootLangId = langId;
//m_rootLangIdScore = score;
m_rootLangIdValid = true;
return &m_rootLangId;
}
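// . load the previously-indexed version of this url from titledb, if any
// . a NULL *m_oldDoc with m_oldDocValid set means there was no old version
// . if the titlerec found has our docid but a different url hash48 then we
//   hit a docid collision and bail with EDOCIDCOLLISION
// . a corrupted old titlerec is treated as "not found" so the url can
//   still be indexed instead of erroring out forever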
XmlDoc **XmlDoc::getOldXmlDoc ( ) {
if ( m_oldDocValid ) return &m_oldDoc;
// note it
setStatus ( "getting old xml doc");
// if we are set from a title rec, we are the old doc
if ( m_setFromTitleRec ) {
m_oldDocValid = true;
m_oldDoc = NULL;//this;
return &m_oldDoc;
}
// . cache age is 0... super fresh
// . returns NULL w/ g_errno if not found unless isIndexed is false
// and valid, and it is not valid for pagereindexes.
char **otr = getOldTitleRec ( );
if ( ! otr || otr == (char **)-1 ) return (XmlDoc **)otr;
// if no title rec, return ptr to a null
m_oldDoc = NULL;
if ( ! *otr ) { m_oldDocValid = true; return &m_oldDoc; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if provided title rec matches our docid but not uh48 then there
// was a docid collision and we should null out our title rec
// and return with an error and no index this puppy!
// crap, we can't call getFirstUrl() because it might not be
// valid if we are a docid based doc and THIS function was called
// from getFirstUrl() -- we end up in a recursive loop.
if ( ! m_setFromDocId ) {
//int64_t uh48 = getFirstUrl()->getUrlHash48();
int64_t uh48 = getFirstUrlHash48();
int64_t tuh48 = g_titledb.getUrlHash48 ( (key_t *)*otr );
if ( uh48 != tuh48 ) {
log("xmldoc: docid collision uh48 mismatch. cannot "
"index "
"%s",getFirstUrl()->getUrl() );
g_errno = EDOCIDCOLLISION;
return NULL;
}
}
// . if *otr is NULL that means not found
// . return a NULL old XmlDoc in that case as well?
// . make a new one
// . this will uncompress it and set ourselves!
try { m_oldDoc = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
return NULL;
}
mnew ( m_oldDoc , sizeof(XmlDoc),"xmldoc1");
// debug the mem leak
// log("xmldoc: xmldoc1=%"PTRFMT" u=%s"
// ,(PTRTYPE)m_oldDoc
// ,m_firstUrl.getUrl());
// if title rec is corrupted data uncompress will fail and this
// will return false!
if ( ! m_oldDoc->set2 ( m_oldTitleRec ,
m_oldTitleRecSize , // maxSize
cr->m_coll ,
NULL , // pbuf
m_niceness ) ) {
log("build: failed to set old doc for %s",m_firstUrl.m_url);
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
int32_t saved = g_errno;
// ok, fix the memleak here
mdelete ( m_oldDoc , sizeof(XmlDoc), "odnuke" );
delete ( m_oldDoc );
m_oldDocExistedButHadError = true;
//log("xmldoc: nuke xmldoc1=%"PTRFMT"",(PTRTYPE)m_oldDoc);
m_oldDoc = NULL;
g_errno = saved;
// MDW: i removed this on 2/8/2016 again so the code below
// would execute.
//return NULL; //mdwmdwmdw
// if it is data corruption, just assume empty so
// we don't stop spidering a url because of this. so we'll
// think this is the first time indexing it. otherwise
// we get "Bad cached document" in the logs and the
// SpiderReply and it never gets re-spidered because it is
// not a 'temporary' error according to the url filters.
log("build: treating corrupted titlerec as not found");
g_errno = 0;
m_oldDoc = NULL;
m_oldDocValid = true;
return &m_oldDoc;
}
m_oldDocValid = true;
// share our masterloop and state!
m_oldDoc->m_masterLoop = m_masterLoop;
m_oldDoc->m_masterState = m_masterState;
return &m_oldDoc;
}
void XmlDoc::nukeDoc ( XmlDoc *nd ) {
// skip if empty
if ( ! nd ) return;
// debug the mem leak
// if ( nd == m_oldDoc )
// log("xmldoc: nuke xmldoc1=%"PTRFMT" u=%s this=%"PTRFMT""
// ,(PTRTYPE)m_oldDoc
// ,m_firstUrl.getUrl()
// ,(PTRTYPE)this
// );
// do not nuke yerself!
if ( nd == this ) return;
// or root doc!
//if ( nd == m_rootDoc ) return;
// nuke it
mdelete ( nd , sizeof(XmlDoc) , "xdnuke");
delete ( nd );
// invalidate
if ( nd == m_extraDoc ) {
m_extraDocValid = false;
m_extraDoc = NULL;
}
if ( nd == m_rootDoc ) {
m_rootDocValid = false;
m_rootDoc = NULL;
}
if ( nd == m_oldDoc ) {
m_oldDocValid = false;
m_oldDoc = NULL;
}
if ( nd == m_ahrefsDoc ) {
m_ahrefsDocValid = false;
m_ahrefsDoc = NULL;
}
}
static LinkInfo s_dummy;
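// . make a child XmlDoc for downloading some other url on behalf of this
//   doc. the caller supplies the url and a max cache age for msg13.
// . s_dummy above is an empty LinkInfo that the child's linkInfo1/2 point
//   at, so getRedirectUrl() skips the unnecessary link info and dmoz catid
//   lookups it would otherwise do
// . the child shares our masterLoop/masterState and our firstIp so msg13
//   forwards its download requests to the same host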
XmlDoc **XmlDoc::getExtraDoc ( char *u , int32_t maxCacheAge ) {
if ( m_extraDocValid ) return &m_extraDoc;
// note that
setStatus ( "getting new doc" );
// we need a valid first ip first!
//int32_t *pfip = getFirstIp();
//if ( ! pfip || pfip == (void *)-1 ) return (XmlDoc **)pfip;
// must be NULL
if ( m_extraDoc ) { char *xx=NULL;*xx=0; }
// sanity check
if ( ! u || ! u[0] ) { char *xx=NULL;*xx=0; }//return &m_extraDoc;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . if *otr is NULL that means not found
// . return a NULL old XmlDoc in that case as well?
// . make a new one
// . this will uncompress it and set ourselves!
try { m_extraDoc = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
return NULL;
}
mnew ( m_extraDoc , sizeof(XmlDoc),"xmldoc2");
// . if we did not have it in titledb then download it!
// . or if titleRec was too old!
// a spider rec for the extra doc to use
SpiderRequest sreq;
// clear it
sreq.reset();
// spider the url "u"
strcpy ( sreq.m_url , u );
// inherit page parser
sreq.m_isPageParser = getIsPageParser();
// set the data size right
sreq.setDataSize();
// . prepare to download it, set it up
// . returns false and sets g_errno on error
if ( ! m_extraDoc->set4 ( &sreq ,
NULL , // doledbkey ptr
cr->m_coll ,
NULL , // SafeBuf
m_niceness ))
return NULL;
// share our masterloop and state!
m_extraDoc->m_masterLoop = m_masterLoop;
m_extraDoc->m_masterState = m_masterState;
// carry this forward always!
m_extraDoc->m_isSpiderProxy = m_isSpiderProxy;
// disable spam check because that is not necessary for this doc!
m_extraDoc->m_spamCheckDisabled = true;
	// tell msg13 to get this from its robots.txt cache if it can. it also
// keeps a separate html page cache for the root pages, etc. in case
m_extraDoc->m_maxCacheAge = maxCacheAge;
// a dummy thing
s_dummy.m_numStoredInlinks = 0;
s_dummy.m_numGoodInlinks = 0;
// we indirectly call m_extraDoc->getHttpReply() which calls
// m_extraDoc->getRedirectUrl(), which checks the linkInfo and
// dmoz catids of the original url to see if we should set m_indexCode
// to something bad or not. to avoid these unnecessary lookups we
// set these to NULL and validate them
m_extraDoc->ptr_catIds = NULL;
m_extraDoc->size_catIds = 0;
m_extraDoc->m_catIdsValid = true;
m_extraDoc->ptr_linkInfo1 = &s_dummy;
m_extraDoc->size_linkInfo1 = 0;
m_extraDoc->m_linkInfo1Valid = true;
m_extraDoc->ptr_linkInfo2 = &s_dummy;
m_extraDoc->size_linkInfo2 = 0;
m_extraDoc->m_linkInfo2Valid = true;
m_extraDoc->m_urlFilterNumValid = true;
m_extraDoc->m_urlFilterNum = 0;
// for redirects
m_extraDoc->m_allowSimplifiedRedirs = true;
	// always forward the http download request so that Msg13.cpp's
	// handleRequest13() can keep this same page from being downloaded
	// multiple times at once. also, if we are fetching robots.txt this
	// allows us to use the same cache since we select the host we
	// forward to based on ip address.
m_extraDoc->m_forwardDownloadRequest = true;
// set this flag so msg13.cpp doesn't print the "hammering ip" msg
m_extraDoc->m_isChildDoc = true;
m_extraDoc->m_parentDocPtr = this;
// debug it
//g_doc = this;
// and inherit test dir so getTestDir() doesn't core on us
bool isPageParser = getIsPageParser();
m_extraDoc->m_isPageParser = isPageParser;
m_extraDoc->m_isPageParserValid = true;
// without this we send all the msg13 requests to host #3! because
// Msg13 uses it to determine what host to handle it
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
m_extraDoc->m_firstIp = m_firstIp;
m_extraDoc->m_firstIpValid = true;
// i guess we are valid now
m_extraDocValid = true;
return &m_extraDoc;
}
bool XmlDoc::getIsPageParser ( ) {
if ( m_isPageParserValid ) return m_isPageParser;
// assume not
m_isPageParser = false;
// and set otherwise
if ( m_sreqValid && m_sreq.m_isPageParser ) m_isPageParser = true;
// and validate
m_isPageParserValid = true;
return m_isPageParser;
}
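// . get an XmlDoc for this url's site root page
// . if we ARE the root, return ourselves
// . otherwise use the root titlerec from titledb if we have it; if not,
//   set up a child doc that downloads the root on demand, except for
//   injections, rebuilds and recycled content where we skip the root
//   entirely to keep things fast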
XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
if ( m_rootDocValid ) return &m_rootDoc;
// help avoid mem leaks
if ( m_rootDoc ) { char *xx=NULL;*xx=0; }
// note it
setStatus ( "getting root doc");
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (XmlDoc **)isRoot;
// if we are root use us!!!!!
if ( *isRoot ) {
m_rootDoc = this;
m_rootDocValid = true;
return &m_rootDoc;
}
// get our site root
char *mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (XmlDoc **)mysite;
// otherwise, we gotta get it!
char **rtr = getRootTitleRec ( );
if ( ! rtr || rtr == (char **)-1 ) return (XmlDoc **)rtr;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (XmlDoc **)cu;
// if no title rec, return ptr to a null
//m_rootDoc = NULL;
//if ( ! *rtr ) {
// // damn, not in titledb, i guess download it then
// m_rootDocValid = true; return &m_rootDoc; }
// note it
setStatus ( "getting root doc");
// to keep injections fast, do not download the root page!
if ( ! *rtr && m_contentInjected ) {
// assume none
m_rootDoc = NULL;
m_rootDocValid = true;
return &m_rootDoc;
}
// likewise, if doing a rebuild
if ( ! *rtr && m_useSecondaryRdbs ) {
// assume none
m_rootDoc = NULL;
m_rootDocValid = true;
return &m_rootDoc;
}
// or recycling content like for query reindex. keep it fast.
if ( ! *rtr && m_recycleContent ) {
m_rootDoc = NULL;
m_rootDocValid = true;
return &m_rootDoc;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . if *otr is NULL that means not found
// . return a NULL root XmlDoc in that case as well?
// . make a new one
// . this will uncompress it and set ourselves!
try { m_rootDoc = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
return NULL;
}
mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3");
// if we had the title rec, set from that
if ( *rtr ) {
if ( ! m_rootDoc->set2 ( m_rootTitleRec ,
m_rootTitleRecSize , // maxSize ,
cr->m_coll ,
NULL , // pbuf
m_niceness ) ) {
// it was corrupted... delete this
// possibly printed
// " uncompress uncompressed size=..." bad uncompress
log("build: rootdoc set2 failed");
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
delete ( m_rootDoc );
// call it empty for now, we don't want to return
// NULL with g_errno set because it could stop
// the whole indexing pipeline
m_rootDoc = NULL;
m_rootDocValid = true;
return &m_rootDoc;
//return NULL;
}
}
// . otherwise, set the url and download it on demand
// . this junk copied from the contactDoc->* stuff below
else {
// a spider rec for the contact doc
SpiderRequest sreq;
// clear it
sreq.reset();
// spider the url "u"
char *p = sreq.m_url;
if ( cu->isHttps() ) p += sprintf ( p , "https://" );
else p += sprintf ( p , "http://" );
strcpy ( p , mysite );
// set this
if ( m_sreqValid ) {
// this will avoid it adding to tagdb!
sreq.m_isPageParser = m_sreq.m_isPageParser;
}
// reset the data size
sreq.setDataSize ();
// . prepare to download it, set it up
// . returns false and sets g_errno on error
if ( ! m_rootDoc->set4 ( &sreq ,
NULL , // doledbkey ptr
cr->m_coll ,
NULL , // SafeBuf
m_niceness )) {
mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
delete ( m_rootDoc );
m_rootDoc = NULL;
return NULL;
}
// do not throttle it!
//m_rootDoc->m_throttleDownload = false;
// . do not do robots check for it
// . no we must to avoid triggering a bot trap & getting banned
//m_rootDoc->m_isAllowed = m_isAllowed;
//m_rootDoc->m_isAllowedValid = true;
}
// share our masterloop and state!
m_rootDoc->m_masterLoop = m_masterLoop;
m_rootDoc->m_masterState = m_masterState;
// msg13 caches the pages it downloads
m_rootDoc->m_maxCacheAge = maxCacheAge;
// like m_contactDoc we avoid unnecessary lookups in call to
// getRedirUrl() by validating these empty members
m_rootDoc->ptr_catIds = NULL;
m_rootDoc->size_catIds = 0;
m_rootDoc->m_catIdsValid = true;
m_rootDoc->ptr_linkInfo1 = &s_dummy;
m_rootDoc->size_linkInfo1 = 0;
m_rootDoc->m_linkInfo1Valid = true;
m_rootDoc->ptr_linkInfo2 = &s_dummy;
m_rootDoc->size_linkInfo2 = 0;
m_rootDoc->m_linkInfo2Valid = true;
m_rootDoc->m_urlFilterNumValid = true;
m_rootDoc->m_urlFilterNum = 0;
// for redirects
m_rootDoc->m_allowSimplifiedRedirs = true;
	// always forward the http download request so that Msg13.cpp's
	// handleRequest13() can keep the same root page or contact page
	// from being downloaded multiple times at once. also, if we are
	// fetching robots.txt this allows us to use the same cache since we
	// select the host we forward to based on ip address.
m_rootDoc->m_forwardDownloadRequest = true;
// set this flag so msg13.cpp doesn't print the "hammering ip" msg
m_rootDoc->m_isChildDoc = true;
m_rootDoc->m_parentDocPtr = this;
// validate it
m_rootDocValid = true;
return &m_rootDoc;
}
/*
// no longer access Revdb to get the old metalist, now re-compute
RdbList *XmlDoc::getOldMetaList ( ) {
// if valid return that
if ( m_oldMetaListValid ) return &m_oldMetaList;
// update status msg
setStatus ( "getting old meta list");
// load the old title rec
XmlDoc **odp = getOldXmlDoc( );
if ( ! odp || odp == (XmlDoc **)-1 ) return (RdbList *)odp;
XmlDoc *od = *odp;
// empty old doc?
if ( ! od ) {
m_oldMetaList.reset();
m_oldMetaListValid = true;
return &m_oldMetaList;
}
// and use that. it has m_setFromTitleRec set to true.
char *old = od->getMetaList();
if ( ! old || old == (void *)-1 ) return (RdbList *)old;
// set it
m_oldMetaList.m_list = od->m_metaList; // old;
m_oldMetaList.m_listSize = od->m_metaListSize;
m_oldMetaList.m_ownData = false;
// assign it
m_oldMetaListValid = true;
return &m_oldMetaList;
}
*/
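// . when the time axis is enabled the same url can be indexed once per
//   unique piece of content, so we append the content hash to the url
// . e.g. (illustrative only) "http://example.com/a.html" with
//   m_contentHash32 = 12345 becomes "http://example.com/a.html.12345".
//   Msg22::getAvailDocId() then works off that extended string so the
//   different versions get different docids and don't collide in titledb.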
SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
if ( m_timeAxisUrlValid ) return &m_timeAxisUrl;
if ( m_setFromDocId ) return &m_timeAxisUrl;
m_timeAxisUrlValid = true;
Url *fu = getFirstUrl();
m_timeAxisUrl.reset();
m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),m_contentHash32);
return &m_timeAxisUrl;
}
// . look up TitleRec using Msg22 if we need to
// . set our m_titleRec member from titledb
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
// from scratch. this loads it from titledb.
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
char **XmlDoc::getOldTitleRec ( ) {
// clear if we blocked
//if ( g_errno == ENOTFOUND ) g_errno = 0;
// g_errno = EBADTITLEREC;
// return NULL;
// if valid return that
if ( m_oldTitleRecValid ) return &m_oldTitleRec;
// update status msg
setStatus ( "getting old title rec");
// if we are set from a title rec, we are the old doc
if ( m_setFromTitleRec ) {
m_oldTitleRecValid = true;
m_oldTitleRec = NULL;//m_titleRec;
return &m_oldTitleRec;
}
// sanity check
if ( m_oldTitleRecValid && m_msg22a.m_outstanding ) {
char *xx=NULL;*xx=0; }
// point to url
//char *u = getCurrentUrl()->getUrl();
//char *u = getFirstUrl()->getUrl();
// assume its valid
m_oldTitleRecValid = true;
// add it to the cache?
bool addToCache = false;
//if ( maxCacheAge > 0 ) addToCache = true;
// not if new! no we need to do this so XmlDoc::getDocId() works!
// this logic prevents us from setting g_errno to ENOTFOUND
	// when m_msg22a below calls indexDocWrapper(). however, a
	// query delete on a not-found docid will still succumb to
	// the g_errno because m_isIndexed is not valid, i think...
if ( m_isIndexedValid && ! m_isIndexed && m_docIdValid ) {
m_oldTitleRec = NULL;
m_oldTitleRecValid = true;
return &m_oldTitleRec;
}
// sanity check. if we have no url or docid ...
if ( ! m_firstUrlValid && ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// use docid if first url not valid
int64_t docId = 0;
if ( ! m_firstUrlValid ) docId = m_docId;
// if url not valid, use NULL
char *u = NULL;
if ( docId == 0LL && ptr_firstUrl ) u = getFirstUrl()->getUrl();
// if both are not given that is a problem
if ( docId == 0LL && ! u ) {
log("doc: no url or docid provided to get old doc");
g_errno = EBADENGINEER;
return NULL;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
	// if using time axis then append the content hash to the end of
	// the url (see getTimeAxisUrl()). this way Msg22::getAvailDocId()
	// will return a docid based on that so we don't collide with other
	// instances of this same url.
if ( u && getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
SafeBuf *tau = getTimeAxisUrl();
u = tau->getBufStart();
}
// the title must be local since we're spidering it
if ( ! m_msg22a.getTitleRec ( &m_msg22Request ,
u ,
docId , // probable docid
cr->m_coll ,
// . msg22 will set this to point to it!
// . if NULL that means NOT FOUND
&m_oldTitleRec ,
&m_oldTitleRecSize ,
false , // just chk tfndb?
false , // getAvailDocIdOnly
m_masterState ,
m_masterLoop ,
m_niceness , // niceness
addToCache , // add to cache?
0 , // max cache age
999999 , // timeout seconds
false ))// load balancing?
// return -1 if we blocked
return (char **)-1;
// not really an error
if ( g_errno == ENOTFOUND ) g_errno = 0;
// error?
if ( g_errno ) return NULL;
// got it
return &m_oldTitleRec;
}
// . look up TitleRec using Msg22 if we need to
// . set our m_titleRec member from titledb
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
// from scratch. this loads it from titledb.
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
char **XmlDoc::getRootTitleRec ( ) {
// if valid return that
if ( m_rootTitleRecValid ) return &m_rootTitleRec;
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
// if we are root use us!!!!! well, the old us...
if ( *isRoot ) {
char **otr = getOldTitleRec ( );
if ( ! otr || otr == (char **)-1 ) return (char **)otr;
m_rootTitleRec = m_oldTitleRec;
m_rootTitleRecSize = m_oldTitleRecSize;
return &m_rootTitleRec;
}
// get our site root
char *mysite = getSite();
if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// make it a url. keep it on stack since msg22 copies it into its
// url request buffer anyway! (m_msg22Request.m_url[])
Url site; site.set ( mysite );
// assume its valid
m_rootTitleRecValid = true;
// add it to the cache?
bool addToCache = false;
//if ( maxCacheAge > 0 ) addToCache = true;
// update status msg
setStatus ( "getting root title rec");
// the title must be local since we're spidering it
if ( ! m_msg22b.getTitleRec ( &m_msg22Request ,
site.getUrl() ,
0 , // probable docid
cr->m_coll ,
// . msg22 will set this to point to it!
// . if NULL that means NOT FOUND
&m_rootTitleRec ,
&m_rootTitleRecSize ,
false , // just chk tfndb?
false , // getAvailDocIdOnly
m_masterState ,
m_masterLoop ,
m_niceness , // niceness
addToCache , // add to cache?
0 , // max cache age
999999 , // timeout seconds
false ))// load balancing?
// return -1 if we blocked
return (char **)-1;
// not really an error
if ( g_errno == ENOTFOUND ) g_errno = 0;
// error?
if ( g_errno ) return NULL;
// got it
return &m_rootTitleRec;
}
/*
// . look up TitleRec using Msg22 if we need to
// . set our m_titleRec member from titledb
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
// from scratch. this loads it from titledb.
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
char **XmlDoc::getContactTitleRec ( char *u ) {
// clear if we blocked
//if ( g_errno == ENOTFOUND ) g_errno = 0;
// if valid return that
if ( m_contactTitleRecValid ) return &m_contactTitleRec;
// fake
static char *s_fake = NULL;
// if no url, we got no contact title rec in titledb then!
if ( ! u || u[0] == '\0' ) return &s_fake;
// update status msg
setStatus ( "getting contact title rec");
// assume its valid
m_contactTitleRecValid = true;
// add it to the cache?
bool addToCache = false;
//if ( maxCacheAge > 0 ) addToCache = true;
// the title must be local since we're spidering it
if ( ! m_msg22c.getTitleRec ( &m_msg22Request ,
u ,
0 , // probable docid
m_coll ,
// . msg22 will set this to point to it!
// . if NULL that means NOT FOUND
&m_contactTitleRec ,
&m_contactTitleRecSize ,
false , // just chk tfndb?
m_masterState ,
m_masterLoop ,
m_niceness , // niceness
addToCache , // add to cache?
0 , // max cache age
999999 , // timeout seconds
false ))// load balancing?
// return -1 if we blocked
return (char **)-1;
// not really an error
if ( g_errno == ENOTFOUND ) g_errno = 0;
// error?
if ( g_errno ) return NULL;
// got it
return &m_contactTitleRec;
}
*/
// used for indexing spider replies. we need a unique docid because it
// is treated as a different document even though its url will be the same.
// and there is never an "older" version of it because each reply is treated
// as a brand new document.
int64_t *XmlDoc::getAvailDocIdOnly ( int64_t preferredDocId ) {
if ( m_availDocIdValid && g_errno ) {
log("xmldoc: error getting availdocid: %s",
mstrerror(g_errno));
return NULL;
}
if ( m_availDocIdValid )
// this is 0 or -1 if no avail docid was found
return &m_msg22c.m_availDocId;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// pre-validate it
m_availDocIdValid = true;
if ( ! m_msg22c.getAvailDocIdOnly ( &m_msg22Requestc ,
preferredDocId ,
cr->m_coll ,
m_masterState ,
m_masterLoop ,
m_niceness ) )
return (int64_t *)-1;
// error?
log("xmldoc: error getting availdocid2: %s",mstrerror(g_errno));
return NULL;
}
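// . decide the docid for this document
// . if an old titlerec exists we reuse its docid, otherwise we take the
//   avail docid that m_msg22a found for us
// . if the old titlerec was there but corrupt we salvage the docid
//   straight from its key
// . sanity check: without the time axis the docid must fall in the
//   probable-docid range derived from the url hash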
int64_t *XmlDoc::getDocId ( ) {
if ( m_docIdValid ) return &m_docId;
setStatus ("getting docid");
XmlDoc **od = getOldXmlDoc( );
if ( ! od || od == (XmlDoc **)-1 ) return (int64_t *)od;
setStatus ("getting docid");
// . set our docid
// . *od is NULL if no title rec found with that docid in titledb
if ( *od ) {
m_docId = *(*od)->getDocId();
m_docIdValid = true;
return &m_docId;
}
m_docId = m_msg22a.getAvailDocId();
	// if the titlerec was there but *od is NULL then it had an error
	// uncompressing, because of the corruption bug in RdbMem.cpp when
	// dumping to disk.
if ( m_docId == 0 && m_oldTitleRec && m_oldTitleRecSize > 12 ) {
m_docId = g_titledb.getDocIdFromKey ( (key_t *)m_oldTitleRec );
log("build: salvaged docid %"INT64" from corrupt title rec "
"for %s",m_docId,m_firstUrl.m_url);
}
if ( m_docId == 0 ) {
log("build: docid is 0 for %s",m_firstUrl.m_url);
g_errno = ENODOCID;
return NULL;
}
// ensure it is within probable range
if ( ! getUseTimeAxis () ) {
char *u = getFirstUrl()->getUrl();
int64_t pd = g_titledb.getProbableDocId(u);
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
if ( m_docId < d1 || m_docId > d2 ) {
char *xx=NULL;*xx=0; }
}
	// if docid is zero, none is available!!!
//if ( m_docId == 0LL ) m_indexCode = ENODOCID;
m_docIdValid = true;
return &m_docId;
}
// . is our docid on disk? i.e. do we exist in the index already?
// . TODO: just check tfndb?
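// . if we already loaded the old doc, its presence answers this directly
// . otherwise this uses msg22e with "just check tfndb" set, so it only
//   tests whether a titlerec exists for this docid/url without loading it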
char *XmlDoc::getIsIndexed ( ) {
if ( m_isIndexedValid ) return &m_isIndexed;
setStatus ( "getting is indexed" );
// we must be old if this is true
//if ( m_setFromTitleRec ) {
// m_isNew = false;
// m_isNewValid = true;
// return &m_isNew;
//}
// get the url
//char *u = getFirstUrl()->getUrl();
if ( m_oldDocValid ) {
m_isIndexedValid = true;
if ( m_oldDoc ) m_isIndexed = true;
else m_isIndexed = false;
return &m_isIndexed;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// sanity check. if we have no url or docid ...
if ( ! m_firstUrlValid && ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// use docid if first url not valid
int64_t docId = 0;
char *url = NULL;
// use docid if its valid, otherwise use url
if ( m_docIdValid ) docId = m_docId;
else url = ptr_firstUrl;
// note it
if ( ! m_calledMsg22e )
setStatus ( "checking titledb for old title rec");
else
setStatus ( "back from msg22e call");
// . consult the title rec tree!
// . "justCheckTfndb" is set to true here!
if ( ! m_calledMsg22e &&
! m_msg22e.getTitleRec ( &m_msg22Request ,
url ,
docId , // probable docid
cr->m_coll ,
// . msg22 will set this to point to it!
// . if NULL that means NOT FOUND
NULL , // tr ptr
NULL , // tr size ptr
true , // just chk tfndb?
false, // getavaildocidonly
m_masterState ,
m_masterLoop ,
m_niceness , // niceness
false , // add to cache?
0 , // max cache age
999999 , // timeout seconds
false )){//load balancing?
// validate
m_calledMsg22e = true;
// return -1 if we blocked
return (char *)-1;
}
// got it
m_calledMsg22e = true;
// error?
if ( g_errno ) return NULL;
// get it
if ( m_msg22e.m_found ) m_isIndexed = true;
else m_isIndexed = false;
// validate
m_isIndexedValid = true;
return &m_isIndexed;
}
void gotTagRecWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// note it
THIS->setStatus ( "in got tag rec wrapper" );
// set these
if ( ! g_errno ) {
THIS->m_tagRec.serialize ( THIS->m_tagRecBuf );
THIS->ptr_tagRecData = THIS->m_tagRecBuf.getBufStart();
THIS->size_tagRecData = THIS->m_tagRecBuf.length();
// validate
THIS->m_tagRecValid = true;
}
// continue
THIS->m_masterLoop ( THIS->m_masterState );
}
// if tagrec changed enough so that it would affect what we would index
// since last time we indexed this doc, we need to know that!
/*
int32_t *XmlDoc::getTagHash32 ( ) {
// make it valid
if ( m_tagHash32Valid ) return &m_tagHash32;
// compute it
TagRec *gr = getTagRec ();
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
// init it
m_tagHash32 = 0;
// hash the values of all tags
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
// breathe
QUICKPOLL(m_niceness);
// get data
uint32_t h = hash32(tag->getTagData(),tag->getTagDataSize(),0);
// skip if 0
if ( ! h ) continue;
// xor it up
m_tagHash32 = hash32h ( h , m_tagHash32 );
}
// validate
m_tagHash32Valid = true;
return &m_tagHash32;
}
*/
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
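// . if we were set from a titlerec (version >= 118) we deserialize the
//   tag rec that was stored in it, to keep parsing consistent
// . otherwise we look it up fresh via msg8a, which may block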
TagRec *XmlDoc::getTagRec ( ) {
// if we got it give it
if ( m_tagRecValid ) return &m_tagRec;
// do we got a title rec?
if ( m_setFromTitleRec && m_version >= 118 &&
// lookup up fresh from tagdb when doing a rebuild so we get
// the latest sitenuminlinks! nah, we set m_tagRecValid and
// m_tagRecDataValid to false in Repair.cpp iff rebuilding
// titledb!! otherwise, we have to use what is in titlerec
// to avoid parsing inconsistencies that would result in
// undeletable posdb data.
//! m_useSecondaryRdbs &&
// lookup the tagdb rec fresh if setting for a summary. that way
// we can see if it is banned or not
m_tagRecDataValid ) {
// all done
m_tagRecValid = true;
// assume null if old version
//if ( m_version <= 115 ) return &m_tagRec;
// just return empty otherwise
m_tagRec.setFromBuf ( ptr_tagRecData , size_tagRecData );
return &m_tagRec;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// get our site, usually the hostname, but can be like
// "www.last.fm/user/breendaxx/"
// we can't call this because it CALLS getTagRec()!!!
//char *mysite = getSite();
//if ( ! mysite || mysite == (char *)-1 ) return (TagRec *)mysite;
// update status msg
setStatus ( "getting tagdb record" );
// get the final redirected url
//Url *u = getCurrentUrl();
// nah, try this
Url *u = getFirstUrl();
// if we are docid based url this might block!
//if ( ! u || u == (void *)-1 ) return (TagRec *)u;
// good to go
//m_oldTagRecValid = true;
// get it, user our collection for lookups, not m_tagdbColl[] yet!
if ( ! m_msg8a.getTagRec ( u ,
// we have to guess the site because
// we can't hit tagdb to get it at this
// point!!!
NULL, // guess it! // mysite ,
cr->m_collnum ,
false, // skip domain lookup? // true
m_niceness ,
this ,
gotTagRecWrapper ,
&m_tagRec ) )
// we blocked, return -1
return (TagRec *)-1;
// error? ENOCOLLREC?
if ( g_errno ) return NULL;
// assign it
m_tagRec.serialize ( m_tagRecBuf );
ptr_tagRecData = m_tagRecBuf.getBufStart();
size_tagRecData = m_tagRecBuf.length();
// validate
m_tagRecValid = true;
// our tag rec should be all valid now
return &m_tagRec;
}
// this is only for purposes of setting the site's TagRec
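// . a page "has contact info" if tagdb already says so ("hascontactinfo"
//   tag), or if it is a contacty page that yields at least one inlined
//   contact address or at least one official email address/contact form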
char *XmlDoc::getHasContactInfo ( ) {
if ( m_hasContactInfoValid ) return &m_hasContactInfo2;
setStatus ( "getting has contact info" );
// get it from the tag rec if we can
TagRec *gr = getTagRec ();
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
char *ic = getIsThisDocContacty ( );
if ( ! ic || ic == (void *)-1 ) return (char *)ic;
// the current top ip address
//int32_t *ip = getIp();
//if ( ! ip || ip == (int32_t *)-1) return (char *)ip;
//int32_t top = *ip & 0x00ffffff;
// and should have a contact page tag
Tag *tag = gr->getTag ("hascontactinfo");
if ( tag ) m_hasContactInfo = true;
else m_hasContactInfo = false;
m_hasContactInfo2 = m_hasContactInfo;
// are we a "contact" link? i.e. about us, etc. that would contain
// the physical address of the entity responsible for this website
//bool isContacty = getIsContacty( fu ,
// info1 ,
// hops ,
// *ct ,
// *isRoot ,
// m_niceness );
// bail early if not a candidate for contact info
if ( ! *ic ) { // check ) {
m_hasContactInfoValid = true;
return &m_hasContactInfo2;
}
//
// TODO: did IP change?? invalidate it???
//
// set status. we can time status changes with this routine!
setStatus ( "getting contact info on just this page" );
int32_t *nca = getNumContactAddresses();
if ( ! nca || nca == (void *)-1 ) return (char *)nca;
// did we have a contact address?
if ( *nca ) {
m_hasContactInfo = true;
m_hasContactInfo2 = true;
m_hasContactInfoValid = true;
return &m_hasContactInfo2;
}
// get the email addresses
int32_t *numOfficial = getNumOfficialEmails ( );
if ( ! numOfficial || numOfficial == (void *)-1)
return (char *)numOfficial;
// did we get some?
if ( *numOfficial > 0 ) {
m_hasContactInfo = true;
m_hasContactInfo2 = true;
m_hasContactInfoValid = true;
return &m_hasContactInfo2;
}
// this should set m_hasContactInfo as well as m_contact*[] arrays
//TagRec *pcitr = getContactInfoTagRec ();
//if ( ! pcitr || pcitr == (void *)-1 ) return (char *)pcitr;
// do not re-peat the above now
m_hasContactInfoValid = true;
return &m_hasContactInfo2;
}
// returns "type" of contact link, > 0
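// . scores how "contacty" a url is from its inlink anchor text and its
//   path
// . anchor text is only considered for hop counts < 3 (and only hop 1 for
//   javascript docs); phrases like "contact us", "about us", "privacy
//   policy", "terms of service", "site map", "faq", etc. each set a
//   distinct code
// . path substrings like "contact" or "/about" then add a constant and
//   multiply the running value, so the exact number just encodes which
//   heuristics fired; callers here only test it for non-zero
// . e.g. (illustrative) anchor text "contact us" on a hop-1 page sets
//   check to 11, and a path containing "contact" makes it (11+33)*90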
int32_t getIsContacty ( Url *url ,
LinkInfo *info1 ,
int32_t hops ,
uint8_t ct ,
bool isRoot ,
int32_t niceness ) {
static int64_t h_home ;
static int64_t h_site ;
static int64_t h_map ;
static int64_t h_sitemap ;
static int64_t h_contact ;
static int64_t h_about ;
static int64_t h_privacy ;
static int64_t h_policy ;
static int64_t h_statement ;
static int64_t h_terms ;
static int64_t h_of ;
static int64_t h_and ;
static int64_t h_service ;
static int64_t h_conditions ;
static int64_t h_use ;
static int64_t h_us ;
static int64_t h_help ;
static int64_t h_location ;
static int64_t h_faq ;
static int64_t h_faqs ;
static int64_t h_customer ;
static int64_t h_support ;
static int64_t h_advertise ;
static int64_t h_inquiry ;
static int64_t h_inquiries ;
static int64_t h_feedback ;
static int64_t h_company ;
static int64_t h_corporate ;
static bool s_inith = false;
if ( ! s_inith ) {
s_inith = true;
h_home = hash64n ("home");
h_site = hash64n ("site");
h_map = hash64n ("map");
h_sitemap = hash64n ("sitemap");
h_contact = hash64n ("contact");
h_about = hash64n ("about");
h_privacy = hash64n ("privacy");
h_policy = hash64n ("policy");
h_statement = hash64n ("statement");
h_terms = hash64n ("terms");
h_of = hash64n ("of");
h_and = hash64n ("and");
h_service = hash64n ("service");
h_conditions = hash64n ("conditions");
h_use = hash64n ("use");
h_us = hash64n ("us");
h_help = hash64n ("help");
h_location = hash64n ("location");
h_faq = hash64n ("faq");
h_faqs = hash64n ("faqs");
h_customer = hash64n ("customer");
h_support = hash64n ("support");
h_advertise = hash64n ("advertise");
h_inquiry = hash64n ("inquiry");
h_inquiries = hash64n ("inquiries");
h_feedback = hash64n ("feedback");
h_company = hash64n ("company");
h_corporate = hash64n ("corporate");
}
int32_t check = 0;
// loop over the link texts we got
for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) {
// never do anything if hop count >= 3
if ( hops >= 3 ) break;
// javascript must be hopcount 1 only
if ( ct == CT_JS && hops != 1 ) break;
// is this inlinker internal?
//bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
// skip if not local to site
//if ( ! internal ) continue;
// get the text
char *txt = k->getLinkText();
// get length of link text
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// assume utf-8. so do a utf-8 sanity check so it doesn't
// break Words::countWords() by thinking a character is
// 2+ bytes and breaching the buffer
if ( ! verifyUtf8 ( txt , tlen ) ) {
log("xmldoc: bad link text 1 from url=%s for %s",
k->getUrl(),url->m_url);
continue;
}
// convert into words i guess
Words ww;
// . TODO: use alt text if only an image in the link!!!!!
// . return -1 if it fails with g_errno set
		if ( ! ww.setx ( txt , tlen , niceness) ) return -1;
		// shortcut
int32_t nw = ww.getNumWords();
// skip if too big
if ( nw >= 30 ) continue;
		// shortcut
int64_t *wids = ww.getWordIds();
// reset alnumcount
int32_t count = 0;
// loop over its words
for ( int32_t j = 0 ; j < nw && ! check ; j++ ) {
// skip if not alnum
if ( ! wids[j] ) continue;
// keep track of alnum word position
count++;
// "contact..." only good from root or root kid
if ( wids[j] == h_contact && hops >= 1 && count == 1 )
check = 1;
// "about..." only good from root or root kid
if ( wids[j] == h_about && hops >= 1 && count == 1 )
check = 2;
// "...privacy policy..."
if ( wids[j ] == h_privacy && j+2<nw &&
wids[j+2] == h_policy )
check = 3;
// "...privacy statement..."
if ( wids[j ] == h_privacy && j+2<nw &&
wids[j+2] == h_statement )
check = 4;
// "...terms of service..."
if ( wids[j ] == h_terms && j+4<nw &&
wids[j+2] == h_of &&
wids[j+4] == h_service )
check = 5;
// "...terms of use..."
if ( wids[j ] == h_terms && j+4<nw &&
wids[j+2] == h_of &&
wids[j+4] == h_use )
check = 6;
// "... terms & conditions ..."
if ( wids[j ] == h_terms && j+2<nw &&
wids[j+2] == h_conditions )
check = 7;
// "... terms and conditions ..."
if ( wids[j ] == h_terms && j+4<nw &&
wids[j+2] == h_and &&
wids[j+4] == h_conditions )
check = 8;
// "...site map ..."
if ( wids[j] == h_site && j+2<nw &&
wids[j+2] == h_map )
check = 9;
// "...about us..."
if ( wids[j] == h_about && j+2<nw &&
wids[j+2] == h_us )
check = 10;
// "...contact us..."
if ( wids[j] == h_contact && j+2<nw &&
wids[j+2] == h_us)
check = 11;
// "help..."
if ( wids[j] == h_help && count == 1 )
check = 12;
// "faq..."
if ( wids[j] == h_faq && count == 1 )
check = 13;
// "faqs..."
if ( wids[j] == h_faqs && count == 1 )
check = 14;
// "...customer support..."
if ( wids[j] == h_customer && j+2<nw &&
wids[j+2] == h_support )
check = 15;
// "advertise..."
if ( wids[j] == h_advertise && count == 1)
check = 16;
// "...inquiry..."
if ( wids[j] == h_inquiry )
check = 17;
// "...inquiries..."
if ( wids[j] == h_inquiries )
check = 18;
// one word only below here
if ( ww.getNumAlnumWords() != 1 ) continue;
if ( wids[j] == h_about ) check = 2;
if ( wids[j] == h_home ) check = 19;
if ( wids[j] == h_support ) check = 20;
if ( wids[j] == h_advertise ) check = 21;
if ( wids[j] == h_help ) check = 22;
if ( wids[j] == h_faq ) check = 23;
if ( wids[j] == h_faqs ) check = 24;
if ( wids[j] == h_contact ) check = 25;
if ( wids[j] == h_feedback ) check = 26;
if ( wids[j] == h_sitemap ) check = 27;
if ( wids[j] == h_company ) check = 28;
if ( wids[j] == h_corporate ) check = 29;
if ( wids[j] == h_privacy ) check = 30;
if ( wids[j] == h_terms ) check = 31;
// "location" fixes guildcinema.com
if ( wids[j] == h_location && isRoot ) check = 32;
}
}
// check for certain things in the url path that would indicate that
// this is a contact info page
//char *path = m_firstUrl.getPath();
char *path = url->getPath();
if ( gb_strcasestr(path,"contact" ) ) { check += 33; check *= 90; }
if ( gb_strcasestr(path,"/about" ) ) { check += 34; check *= 91; }
if ( gb_strcasestr(path,"/feedback") ) { check += 35; check *= 92; }
if ( gb_strcasestr(path,"/help" ) ) { check += 36; check *= 93; }
if ( gb_strcasestr(path,"/faq" ) ) { check += 37; check *= 94; }
if ( gb_strcasestr(path,"advertise") ) { check += 38; check *= 95; }
if ( gb_strcasestr(path,"inquir" ) ) { check += 39; check *= 96; }
return check;
}
char *XmlDoc::getIsThisDocContacty() {
if ( m_isContactyValid ) return &m_isContacty;
setStatus ( "getting is contacty" );
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
int8_t *hc = getHopCount();
if ( ! hc || hc == (void *)-1 ) return (char *)hc;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
// get the first url
Url *fu = getFirstUrl();
	// shortcut
int32_t hops = *hc;
// check it
m_isContacty = getIsContacty ( fu ,
info1 ,
hops ,
*ct ,
*isRoot ,
m_niceness );
m_isContactyValid = true;
return &m_isContacty;
}
int32_t *XmlDoc::getNumContactAddresses ( ) {
// process
Address **ca = getContactAddresses();
if ( ! ca || ca == (void *)-1 ) return (int32_t *)ca;
// now we are valid
return &m_numContactAddresses;
}
Address **XmlDoc::getContactAddresses ( ) {
// assume none
if ( m_contactAddressesValid ) return m_contactAddresses;
// need this of course
Addresses *aa = getAddresses ();
if ( ! aa || aa == (void *)-1 ) return (Address **)aa;
// assume none
m_contactAddressesValid = true;
m_numContactAddresses = 0;
// not if not contacty. we gotta be a url like ".../contact.asp"
char *ic = getIsThisDocContacty ( );
if ( ! ic || ic == (void *)-1 ) return (Address **)ic;
	// if not of a contact-url form, return none
if ( ! *ic )
return m_contactAddresses;
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (Address **)isRoot;
// do not do this for root if multiple addresses. this
// fixes http://obits.abqjournal.com/
if ( *isRoot && aa->m_uniqueStreetHashes > 1 )
return m_contactAddresses;
// reset count
int32_t nca = 0;
// number of addresses in this doc
int32_t na = aa->m_am.getNumPtrs();
// add all addresses then???
for ( int32_t i = 0 ; i < na ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// get it
Address *ai = (Address *)aa->m_am.getPtr(i);
// do not add this to tagdb if not inlined!
if ( ! ( ai->m_flags & AF_INLINED ) ) continue;
// store it
m_contactAddresses[nca++] = ai;
// stop before breach
if ( nca >= MAX_CONTACT_ADDRESSES ) break;
}
// update count
m_numContactAddresses = nca;
return m_contactAddresses;
}
int32_t *XmlDoc::getNumOfficialEmails ( ) {
char *eb = getEmailBuf();
if ( ! eb || eb == (void *)-1 ) return (int32_t *)eb;
return &m_numOfficialEmails;
}
// . add email addresses to tag rec
// . add up to 3 of same domain and different domain addresses
// . return # of *official* contact infos added to tag rec
// . this now includes submission forms!
// . returns -1 and sets g_errno on error
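// . emails are gathered in three passes: literal addresses (where the '@'
//   may be an "/at." image to foil bots), obfuscated "john at xyz dot com"
//   style text, and mailto: links scanned char by char in the raw xml
// . a text input labeled "email"/"e-mail" followed by a <textarea> counts
//   as a contact form and stores "hascontactform"
// . everything found is comma-separated into m_emailBuf and each pass
//   stops once about 3 entries have been collected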
char *XmlDoc::getEmailBuf ( ) {
if ( m_emailBufValid ) return m_emailBuf;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (char *)ww;
// count # of official contacts we got
int32_t official = 0;
	// shortcuts
int64_t *wids = ww->m_wordIds;
char **wptrs = ww->m_words;
int32_t *wlens = ww->m_wordLens;
nodeid_t *tids = ww->m_tagIds;
int32_t nw = ww->getNumWords();
// get our url
Url *f = getFirstUrl();
// get its domain len
char *myDom = f->getMidDomain();
int32_t myDomLen = f->getMidDomainLen();
// point here
char *eptr = m_emailBuf;
char *emax = m_emailBuf + EMAILBUFSIZE;
m_emailBufValid = true;
// reset
*eptr = '\0';
//
// ADD EMAIL ADDRESSES
//
// count how many we find
int32_t ne = 0;
// loop over all the words
for ( int32_t i = 1 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// . email address? look for the '@'
// . might also have <img src="at.gif"> (bot proof)
if ( wptrs[i][0] != '@' && tids[i] != TAG_IMG ) continue;
// . make sure any image has an "/at." in it!
// . "mail<img src="/common/images/at.gif">pipl.com"
if(tids[i]==TAG_IMG&&!gb_strncasestr(wptrs[i],wlens[i],"/at."))
continue;
// must be a single char
if ( ! tids[i] && wlens[i] != 1 ) continue;
// if i was the last word, give up!
if ( i + 1 >= nw ) break;
// back up i until we hit a non-email char
int32_t a ;
for ( a = i ; a - 1 > 0 ; a-- ) {
if (wids [a-1] ) continue;
if (wptrs[a-1][0]=='.'&&wlens[a-1]==1)continue;
if (wptrs[a-1][0]=='-'&&wlens[a-1]==1)continue;
break;
}
// must not start with '.'
if ( wptrs[a][0]=='.' ) a++;
// now get the end of it
int32_t b;
int32_t periodCount = 0;
for ( b = i ; b+1 < nw ; b++ ) {
if (wids[b+1]) continue;
// only punct we allow is a single period
if ( wptrs[b+1][0]!='.' ) break;
if ( wlens[b+1] != 1 ) break;
periodCount++;
}
// must have at least one!
if ( ! periodCount ) continue;
// must not end on '.'
if ( wptrs[b][0]=='.') b--;
// hostname must have a valid tld
char *host = wptrs[i+1];
char *hend = wptrs[b]+wlens[b];
// temp null term
char c = *hend;
*hend = '\0';
int32_t tldLen ; char *tld = getTLDFast ( host, &tldLen , false );
// ignore the rest of this line for addresses even
// if tld is bogus
//ignoreLine = true;
// must have a legit tld!
if ( ! tld ) { *hend = c; continue; }
// if not from our same domain, use "emailaddressoffsite"
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
// use mid domain. subtract '.'
//int32_t midlen = tld - dom - 1;
// undo the temp NULL thing
*hend = c;
if ( ! dom ) continue;
// include last word
b++;
// normal buffer
char buf[100];
char *p = buf;
char *pend = buf + 100;
// normalize it
for ( int32_t j = a ; j < b ; j++ ) {
// include the at sign
if ( j == i ) {*p++ = '@'; continue;}
// skip tags
if ( tids[j] ) continue;
// skip punct
if ( ! wids[j] ) {*p++ ='.'; continue;}
// ensure minimal space
if ( p + wlens[j] + 1 >= pend ) break;
// write out wids
gbmemcpy ( p , wptrs[j] , wlens[j] );
p += wlens[j];
}
// NULL term it
*p = '\0';
// do we match domains?
//char *tn = "emailaddressoffsite";
// use this if we match domains
//if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
// tn = "emailaddressonsite";
// // this is an official contact method
// //official++;
//}
// we now count even offsite email addresses as official
// for addresses like @gmail.com etc. because we are now
// only checking "contact us" and "about us" and root pages,
// so they should never be email addresses of commenters.
// and often bloggers have external email addresses.
// http://www.christinesaari.com/html/about.php?psi=44
official++;
// store it
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,buf) )
// return -1;
int32_t blen = gbstrlen(buf);
// ignore if breach
if ( eptr + blen + 2 > emax ) continue;
// comma?
if ( eptr > m_emailBuf ) *eptr++ = ',';
// store it
gbmemcpy (eptr , buf , blen );
// advance
eptr += blen;
// limit it
if ( ++ne >= 3 ) break;
}
//
// ADD BOT-PROOF EMAIL ADDRESSES (bot proof)
//
// super dot john at xyz dot com
//
int64_t h_at = hash64Lower_utf8("at");
int64_t h_dot = hash64Lower_utf8("dot");
// loop over all the words
for ( int32_t i = 1 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// email address? look for the " at "
if ( wids[i] != h_at ) continue;
// front name word count
int32_t nameCount = 0;
// back up i until we hit a non-email word
int32_t a ;
// do a loop
for ( a = i - 1 ; a > 0 ; ) {
			// need a space/punct word
if ( wids[a] ) break;
if ( tids[a] ) break;
// skip it
a--;
// then need the "john" part
if ( ! wids[a] ) break;
if ( tids[a] ) break;
if ( wids[a] == h_dot ) break; // "dot" is bad
// count account name part
nameCount++;
// go back if like "mike dot smith"
if ( a - 4 >= 0 &&
! tids[a-1] &&
wids [a-2] == h_dot &&
! tids[a-3] &&
wids [a-4] != h_dot &&
wids [a-4] != h_at )
a -= 4;
// that is good enough
break;
}
// need a name at least one
if ( nameCount <= 0 ) continue;
// skip over that space/punct word
//a--;
// now must be regular word before that
//if ( tids[a-1] ) continue;
//if ( ! wids[a-1] ) continue;
// we got it
//a--;
// now get the end of it
int32_t b ;
// count the dots
int32_t dotCount = 0;
// make sure last word is a legit tld
int32_t tldLen = 0; char *tld = NULL;
// do a loop
for ( b = i + 1 ; b + 3 < nw ; b++ ) {
			// need a space/punct word
if ( wids[b] ) break;
if ( tids[b] ) break;
// skip it
b++;
// then need the "xyz" part
if ( ! wids[b] ) break;
if ( tids[b] ) break;
if ( wids[b] == h_dot ) break; // "dot" is bad
// remember it for tld detection
tld = wptrs[b];
tldLen = wlens[b];
// skip it
b++;
// need another space/punct word
if ( wids[b] ) break;
if ( tids[b] ) break;
// skip it
b++;
// now we need a "dot"
if ( wids[b] != h_dot ) break;
// count the dots
dotCount++;
}
// need at least one "dot"
if ( dotCount < 1 ) continue;
// not too many!
if ( dotCount > 5 ) continue;
// must have legit tld
if ( tld && ! isTLD ( tld , tldLen ) ) continue;
// normal buffer
char buf[100];
char *p = buf;
char *pend = buf + 100;
// normalize it
for ( int32_t j = a ; j < b ; j++ ) {
// skip tags
if ( tids[j] ) continue;
// skip punct
if ( ! wids[j] ) continue;
// ensure minimal space
if ( p + wlens[j] + 1 >= pend ) break;
// write out wids
if ( wids[j] == h_at ) {*p++ = '@'; continue;}
if ( wids[j] == h_dot ) {*p++ = '.'; continue;}
gbmemcpy ( p , wptrs[j] , wlens[j] );
p += wlens[j];
}
// NULL term it
*p = '\0';
// get the host
char *host = buf ; // wptrs[i+1]; ?? is this right?
// if not from our same domain, use "emailaddressoffsite"
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
if ( ! dom ) continue;
// use mid domain
int32_t tlen3; char *tld3 = getTLDFast ( dom, &tlen3 , false );
// limit domain by that. subtract '.'
int32_t midlen = tld3 - dom - 1;
// do we match domains?
char *tn = "emailaddressoffsite";
// use this if we match domains
if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
tn = "emailaddressonsite";
// this is an official contact method
//official++;
}
// we now count even offsite email addresses as official
// for addresses like @gmail.com etc. because we are now
// only checking "contact us" and "about us" and root pages,
// so they should never be email addresses of commenters
// and often bloggers have external email addresses.
// http://www.christinesaari.com/html/about.php?psi=44
official++;
// store that
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,buf) )
// return -1;
int32_t blen = gbstrlen(buf);
// ignore if breach
if ( eptr + blen + 2 > emax ) continue;
// comma?
if ( eptr > m_emailBuf ) *eptr++ = ',';
// store it
gbmemcpy (eptr , buf , blen );
// advance
eptr += blen;
// limit it
if ( ++ne >= 3 ) break;
}
//
// ADD EMAIL ADDRESSES IN MAILTO TAGS
//
// <a href=mailto:steve@xyz.com>
// <a href=mailto:"steve at xyz dot com">
// now we check char by char since a website had it in the javascript:
// http://www.botanique.com/bincgi/stateprov.CFM?state=NM
//
char *m = xml->m_xml;
char *mend = m + xml->m_xmlLen - 4;
// empty?
if ( ! m ) mend = m;
// scan
for ( ; ; m++ ) {
// breach?
if ( m >= mend ) break;
// breathe
QUICKPOLL ( m_niceness );
// skip if not possible mailto:
if ( *m != 'm' && *m !='M' ) continue;
// skip
m++;
// skip?
if ( *m != 'a' && *m !='A' ) continue;
// skip
m++;
// skip?
if ( *m != 'i' && *m !='I' ) continue;
// skip
m++;
// skip?
if ( *m != 'l' && *m !='L' ) continue;
// skip
m++;
// skip?
if ( *m != 't' && *m !='T' ) continue;
// skip
m++;
// skip?
if ( *m != 'o' && *m !='O' ) continue;
// skip
m++;
// skip?
if ( *m != ':' ) continue;
// skip
m++;
		// set a local end so we scan at most 100 chars for this address
char *mend = m + 100;
// skip over the mailto:
//m += 7;
// that is the start of the email address then
char *start = m;
// skip til '@'
for ( ; *m && m < mend && *m != '@' ; m++ ) {
// but give up if we hit a non-email name char
if ( is_alnum_a(*m) ) continue;
if ( *m == '.' ) continue;
if ( *m == '-' ) continue;
break;
}
// bad if no @
if ( *m != '@' ) continue;
// skip the @
m++;
// . skip until alnum
// . fix parsing of "dsquires@ unimelb.edu.au" for
// http://www.marcom1.unimelb.edu.au/public/contact.html
for (;*m && is_wspace_utf8(m); m+=getUtf8CharSize(m) );
// get the host
char *host = m;
// skip till end of hostname
for (;*m && m<mend && (is_alnum_a(*m)||*m=='.'||*m=='-');m++ );
// null term
char c = *m; *m = '\0';
// if not from our same domain, use "emailaddressoffsite"
int32_t dlen ; char *dom = getDomFast ( host , &dlen , false );
// skip if no valid domain
if ( ! dom ) { *m = c; continue; }
// use mid domain
int32_t tlen3; char *tld3 = getTLDFast ( dom, &tlen3 , false );
// limit domain by that. subtract '.'
int32_t midlen = tld3 - dom - 1;
// put it back
*m = c;
// point "end" to end of the email address
char *end = dom + dlen;
// do we match domains?
char *tn = "emailaddressoffsite";
// use this if we match domains
if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) {
tn = "emailaddressonsite";
// this is an official contact method
//official++;
}
// we now count even offsite email addresses as official
// for addresses like @gmail.com etc. because we are now
// only checking "contact us" and "about us" and root pages,
// so they should never be email addresses of commenters
// and often bloggers have external email addresses.
// http://www.christinesaari.com/html/about.php?psi=44
official++;
// store that
//if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,start,end-start) )
// return -1;
// cast it
char *buf = start;
int32_t blen = end - start;
// ignore if breach
if ( eptr + blen + 2 > emax ) continue;
// comma?
if ( eptr > m_emailBuf ) *eptr++ = ',';
// store it
gbmemcpy (eptr , buf , blen );
// advance
eptr += blen;
// limit it
if ( ++ne >= 3 ) break;
}
//
// ADD CONTACT FORM
//
bool gotEmailBox = false;
bool storedForm = false;
int32_t emailPos = -1;
int32_t alnumCount = 0;
// quick compares
int64_t he1 = hash64Lower_utf8 ( "email");
int64_t he2 = hash64Lower_utf8 ( "mail");
// loop over all words again
for ( int32_t i = 1 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get tag id if any
int32_t tid = tids[i] & BACKBITCOMP;
// . do we have a submit form?
// . first, do we have a text box for the sender's email?
if ( tid == TAG_INPUT ) {
int32_t ttlen;
			// i is not a node #, it is a word #, so map it
int32_t nn = ww->m_nodes[i];
// must be valid
char *tt = xml->getString(nn,"type",&ttlen);
if ( ! tt || ttlen <= 0 ) continue;
// must be of type text
if ( strncasecmp(tt,"text",4) ) continue;
// might have "email" or "e-mail" in the value
int32_t vlen;
char *val = xml->getString(nn,"value",&vlen);
// check that
if ( val ) {
if ( gb_strncasestr(val,vlen,"email") ||
gb_strncasestr(val,vlen,"e-mail") )
// flag it good
gotEmailBox = true;
}
// must have the word "email" or "e-mail" within
// a few words right before it!
if ( emailPos == -1 ) continue;
//if ( i - emailPos >= 7 ) continue;
if ( alnumCount > 7 ) continue;
// flag it
gotEmailBox = true;
}
		// text area? must happen AFTER the email address box
if ( tid == TAG_TEXTAREA && gotEmailBox ) {
// must have had the form before us
// do not double store into tagdb rec
if ( storedForm ) continue;
// store this bad boy into the tagdb rec
//if ( ! gr->addTag("hascontactform",
// timestamp,
// "xmldoc",
// ip,
// "1" ,
// 1 ) )
// return -1;
// copy it
char *buf = "hascontactform";
int32_t blen = gbstrlen(buf);
// ignore if breach
if ( eptr + blen + 2 > emax ) continue;
// comma?
if ( eptr > m_emailBuf ) *eptr++ = ',';
// store it
gbmemcpy (eptr , buf , blen );
// advance
eptr += blen;
// do not double store
storedForm = true;
// this is an official contact method
official++;
// another contact method
ne++;
// that's enough!
break;
}
// alnum counter
if ( wids[i] ) alnumCount++;
// special counter
if ( wids[i] == he1 || wids[i] == he2 ) {
// mark it
emailPos = i;
// reset counter
alnumCount = 0;
}
}
// null term
*eptr = '\0';
m_numOfficialEmails = official;
// i guess that is it
return m_emailBuf;
}
// returns vector 1-1 with Words.m_words[] array
/*
Spam *XmlDoc::getSpam ( ) {
if ( m_spamValid ) return &m_spam;
// set it
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Spam *)ww;
Bits *bits = getBits ();
if ( ! bits || bits == (Bits *)-1 ) return (Spam *)bits;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (Spam *)sni;
// if more than X% ("thresh") of words are spammed to some degree,
// index all words with a minimum score
int32_t thresh = 6;
if ( *sni > 10 ) thresh = 8;
if ( *sni > 30 ) thresh = 10;
if ( *sni > 100 ) thresh = 20;
if ( *sni > 500 ) thresh = 30;
//int64_t x[] = {30,40,50,70,90};
//int64_t y[] = {6,8,10,20,30};
//int32_t spamThresh = getY ( m_docQuality , x , y , 5 );
if ( ! m_spam.set ( ww ,
bits ,
m_version ,
thresh ,
20 ,
m_niceness ))
return NULL;
m_spamValid = true;
return &m_spam;
}
*/
// returns true if any date in the doc has a time-of-day (tod) component
bool *XmlDoc::getHasTOD ( ) {
if ( m_hasTODValid ) return &m_hasTOD2;
// scan the dates
Dates *dp = getDates() ;
if ( ! dp || dp == (Dates *)-1 ) return (bool *)dp;
// assume not
m_hasTOD2 = false;
m_hasTOD = false;
// scan the dates
for ( int32_t i = 0 ; i < dp->m_numDatePtrs ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get date
Date *di = dp->m_datePtrs[i];
// skip if got nuked
if ( ! di ) continue;
// tod?
if ( !(di->m_hasType & DT_TOD) ) continue;
// got one
m_hasTOD2 = true;
m_hasTOD = true;
}
// it is now valid
m_hasTODValid = true;
return &m_hasTOD2;
}
/*
bool *XmlDoc::getHasSiteVenue ( ) {
if ( m_hasSiteVenueValid ) return &m_hasSiteVenue2;
// get the tag rec
TagRec *gr = getTagRec ();
if ( ! gr || gr == (TagRec *)-1 ) return (bool *)gr;
// get tag from it
Tag *sv = gr->getTag("venueaddress") ;
// from that
m_hasSiteVenue2 = (bool)sv;
m_hasSiteVenue = (bool)sv;
m_hasSiteVenueValid = true;
return &m_hasSiteVenue2;
}
*/
// do not include addresses that are always in the header/footer of every page!
bool *XmlDoc::getHasAddress ( ) {
if ( m_hasAddressValid ) return &m_hasAddress2;
// get the addresses
Addresses *aa = getAddresses();
if ( ! aa || aa == (void *)-1 ) return (bool *)aa;
// from that
m_hasAddress2 = (aa->getNumNonDupAddresses() > 0);
m_hasAddress = (aa->getNumNonDupAddresses() > 0);
m_hasAddressValid = true;
return &m_hasAddress2;
}
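// . street address extraction for this document
// . NOTE: this is currently short-circuited: we mark m_addresses valid and
//   return the empty Addresses object right away, so the Addresses::set()
//   call further down is unreachable and kept only for reference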
Addresses *XmlDoc::getAddresses ( ) {
if ( m_addressesValid ) {
// return error if buf was breached
//if ( m_addresses.m_breached ) {
// g_errno = EBUFOVERFLOW;
// return NULL;
//}
// otherwise, return it
return &m_addresses;
}
// skip for now
m_addressesValid = true;
return &m_addresses;
// note it
setStatus ( "getting addresses");
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Addresses *)ww;
// we make sure that D_IS_IN_DATE is set by doing this
//Dates *dp = getDates();
//if ( ! dp || dp == (Dates *)-1) return (Addresses *)dp;
// we set the D_IS_IN_DATE flag for these bits
Bits *bits = getBits(); if ( ! bits ) return NULL;
Sections *sections = getExplicitSections();
if ( !sections||sections==(Sections *)-1) return (Addresses *)sections;
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (Addresses *)gr;
// the site hash
//int32_t *sh32 = getSiteHash32();
//if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Addresses *)sh32;
int32_t dh = getDomHash32();
// hash of all adjacent tag pairs
//uint32_t *tph = getTagPairHash32 ( ) ;
//if ( ! tph || tph == (void *)-1 ) return (Addresses *)tph;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Addresses *)d;
// get our ip
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1) return (Addresses *)ip;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
//char **stb = getSiteTitleBuf();
//if ( ! stb || stb == (void *)-1 ) return (Addresses *)stb;
// sanity check
//if ( ! m_siteTitleBufValid ) { char *xx=NULL;*xx=0; }
char **fbuf = getFilteredRootTitleBuf();
if ( ! fbuf || fbuf == (void *)-1 ) return (Addresses *)fbuf;
// this will set D_IS_IN_DATE in the Bits::m_bits[] array which
// Addresses::set() uses to avoid having addresses that are really
// just dates!
Dates *dd = getSimpleDates();
// return NULL on error
if ( ! dd ) return (Addresses *)NULL;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if the serialized section is valid, use that
//char *sd = NULL;
//bool valid = false;
//if ( od && od->m_sectionsReplyValid ) valid = true;
//if ( valid ) sd = od->ptr_sectionsReply;
// assume valid, really only when it returns in case it blocked...
//m_addressesValid = true;
// this should not be outstanding!
if ( m_addressSetCalled ) { char *xx=NULL;*xx=0; }
// assume valid, really only when it returns in case it blocked...
m_addressesValid = true;
// set it
m_addressSetCalled = true;
// make a copy of the tag rec here in case it gets mangled later
// because the m_addresses class may reference its buffer
//m_savedTagRec1.copy ( gr );
// . this returns false if blocked
// . it uses the "venueaddress" from the tagrec, "gr", BUT if this
// page is the one that sets the venue address, it won't be able
// to use it as a default city/state thingy until next time it is
// spidered, since that info is in the tagrec
// . PROBLEM: if the venue address is on this page, we can't take
// advantage of it by using its city/state as a default for the
// other addresses on this page
if ( ! m_addresses.set ( sections ,
ww ,
bits ,
&m_tagRec , // &m_savedTagRec1 , // gr
&m_firstUrl ,
*d ,
cr->m_collnum ,
dh , // *sh32
*ip ,
//(int32_t)*tph ,
m_niceness ,
m_pbuf ,
m_masterState ,
m_masterLoop ,
*ct ,
//ptr_addressReply ,
//size_addressReply ,
//m_addressReplyValid ,
m_filteredRootTitleBuf ,
m_filteredRootTitleBufSize ,
this ))
return (Addresses *)-1;
// sanity check
if ( m_addresses.m_msg2c &&
m_addresses.m_msg2c->m_requests !=
m_addresses.m_msg2c->m_replies) {
char *xx=NULL;*xx=0; }
// error?
if ( g_errno ) return NULL;
// return it if not breached
//if ( ! m_addresses.m_breached ) return &m_addresses;
// return that error otherwise
//g_errno = EBUFOVERFLOW;
//return NULL;
return &m_addresses;
}
/*
int32_t *XmlDoc::getSiteNumInlinksUniqueIp ( ) {
if ( m_siteNumInlinksUniqueIpValid )
return &m_siteNumInlinksUniqueIp;
// get our companion number
int32_t *ni = getSiteNumInlinks();
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
// sanity check
if ( ! m_siteNumInlinksUniqueIp ) { char *xx=NULL;*xx=0; }
// ok we must be valid
return &m_siteNumInlinksUniqueIp;
}
int32_t *XmlDoc::getSiteNumInlinksUniqueCBlock ( ) {
if ( m_siteNumInlinksUniqueCBlockValid )
return &m_siteNumInlinksUniqueCBlock;
// get our companion number
int32_t *ni = getSiteNumInlinks();
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
// sanity check
if ( ! m_siteNumInlinksUniqueCBlock ) { char *xx=NULL;*xx=0; }
// ok we must be valid
return &m_siteNumInlinksUniqueCBlock;
}
int32_t *XmlDoc::getSiteNumInlinksTotal ( ) {
if ( m_siteNumInlinksTotalValid )
return &m_siteNumInlinksTotal;
// get our companion number
int32_t *ni = getSiteNumInlinks();
if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni;
// sanity check
if ( ! m_siteNumInlinksTotal ) { char *xx=NULL;*xx=0; }
// ok we must be valid
return &m_siteNumInlinksTotal;
}
*/
// we need this for setting SpiderRequest::m_parentFirstIp of each outlink
int32_t *XmlDoc::getFirstIp ( ) {
// return it if we got it
if ( m_firstIpValid ) return &m_firstIp;
// note it
setStatus ( "getting first ip");
// get tag rec
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
// got it
Tag *tag = gr->getTag ( "firstip" );
// get from tag
m_firstIp = 0;
if ( tag ) m_firstIp = atoip(tag->getTagData());
// if no tag, or is bogus in tag... set from ip
if ( m_firstIp == 0 || m_firstIp == -1 ) {
// need ip then!
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip;
// set that
m_firstIp = *ip;
}
m_firstIpValid = true;
return &m_firstIp;
// used to have to be 4 bytes - now it is stored as a string
//if ( tag->getTagDataSize() != 4 ) { char *xx=NULL;*xx=0; }
}
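// Illustrative note for getFirstIp() above: a "firstip" tag whose data is a
// dotted-quad string like "1.2.3.4" (hypothetical value) is parsed with
// atoip(); if the tag is missing or holds a bogus value (0 or -1) we fall
// back to the freshly resolved ip from getIp().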
uint8_t *XmlDoc::getSiteNumInlinks8 () {
if ( m_siteNumInlinks8Valid ) return &m_siteNumInlinks8;
// get the full count
int32_t *si = getSiteNumInlinks();
if ( ! si || si == (int32_t *)-1 ) return (uint8_t *)si;
// convert to 8
m_siteNumInlinks8 = score32to8 ( *si );
// validate
m_siteNumInlinks8Valid = true;
return &m_siteNumInlinks8;
}
// this is the # of GOOD INLINKS to the site. so it is no more than
// 1 per c block, and it has to pass link spam detection. this is the
// highest-level count of inlinks to the site. use it a lot.
int32_t *XmlDoc::getSiteNumInlinks ( ) {
if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;
// sanity check
if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {char *xx=NULL;*xx=0;}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// hack for speed. computeSiteNumInlinks is true by default,
// but if the user turns it off then just use sitelinks.txt
if ( cr && ! cr->m_computeSiteNumInlinks ) {
int32_t hostHash32 = getHostHash32a();
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
// fix core by setting these
// m_siteNumInlinksUniqueIp = 0;
// m_siteNumInlinksUniqueCBlock = 0;
// m_siteNumInlinksTotal = 0;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
// and this
m_siteNumInlinksValid = true;
m_siteNumInlinks = 0;
// if still not in sitelinks.txt, just use 0
if ( min < 0 ) {
return &m_siteNumInlinks;
}
m_siteNumInlinks = min;
return &m_siteNumInlinks;
}
setStatus ( "getting site num inlinks");
// get it from the tag rec if we can
TagRec *gr = getTagRec ();
if ( ! gr || gr == (void *)-1 ) return (int32_t *)gr;
// the current top ip address
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip;
//int32_t top = *ip & 0x00ffffff;
// this happens when its NXDOMAIN reply from dns so assume
// no site inlinks
if ( *ip == 0 ) {
m_siteNumInlinks = 0;
// m_siteNumInlinksUniqueIp = 0;
// m_siteNumInlinksUniqueCBlock = 0;
// m_siteNumInlinksTotal = 0;
m_siteNumInlinksValid = true;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
return &m_siteNumInlinks;
}
if ( *ip == -1 ) {
log("xmldoc: ip is %"INT32", can not get site inlinks",*ip);
g_errno = EBADIP;
return NULL;
}
// wait for clock to sync before calling getTimeGlobal
int32_t wfts = waitForTimeSync();
// 0 means error, i guess g_errno should be set, -1 means blocked
if ( ! wfts ) return NULL;
if ( wfts == -1 ) return (int32_t *)-1;
setStatus ( "getting site num inlinks");
// check the tag first
Tag *tag = gr->getTag ("sitenuminlinks");
// is it valid?
bool valid = true;
// current time
int32_t now = getTimeGlobal();
// use the spidered time for the test collection for consistency
if ( !strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
now = getSpideredTime();//m_spideredTime;
}
// get tag age in days
int32_t age = 0; if ( tag ) age = (now - tag->m_timestamp) ;
// add in some flutter to avoid having all hosts in the network
// calling msg25 for this site at the same time.
// a random jitter of up to 10,000 seconds (roughly 3 hours).
int32_t flutter = rand() % 10000;
// add it in
age += flutter;
// . if site changes ip then toss the contact info out the window,
// but give it a two week grace period
// . well now we use the "ownershipchanged" tag to indicate that
//if (tag && age>14*3600*24) valid=false;
// . we also expire it periodically to keep the info up to date
// . the higher quality the site, the longer the expiration date
int32_t ns = 0;
int32_t maxAge = 0;
int32_t sni = -1;
if ( tag ) {
// how many site inlinks?
ns = atol(tag->getTagData());
// for less popular sites use smaller maxAges
maxAge = 90;
if ( ns < 10 ) maxAge = 10;
else if ( ns < 30 ) maxAge = 15;
else if ( ns < 50 ) maxAge = 30;
else if ( ns < 100 ) maxAge = 60;
// if index size is tiny then maybe we are just starting to
// build something massive, so reduce the cached max age
int64_t nt = g_titledb.m_rdb.getCollNumTotalRecs(m_collnum);
if ( nt < 100000000 ) //100M
maxAge = 3;
if ( nt < 10000000 ) //10M
maxAge = 1;
// for every 100 urls you already got, add a day!
sni = atol(tag->getTagData());
// double if repairing
//if ( m_useSecondaryRdbs ) maxAge = (maxAge+1) * 2;
// fix bug for rebuild. rebuild any tag before now because
// the MAX_LINKERS_IN_TERMLIST was too small in Linkdb.cpp
// and i raised from 1M to 3M. it was hurting mahalo.com.
if ( m_useSecondaryRdbs && tag->m_timestamp < 1345819704 )
valid = false;
// force another rebuild of siterank because i fixed
// the 'beds' query a little to use firstip, so recompute
// siterank for those spammers.
if ( m_useSecondaryRdbs && tag->m_timestamp < 1348257346 &&
// leave really big guys intact
sni < 300 )
valid = false;
// convert into seconds
maxAge *= 3600*24;
// so youtube which has 2997 links will add an extra 29 days
maxAge += (sni / 100) * 86400;
// hack for global index. never affect siteinlinks i imported
if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) age = 0;
// invalidate for that as well
if ( age > maxAge ) valid = false;
}
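// Worked example for the maxAge logic above (numbers are illustrative):
// a site whose "sitenuminlinks" tag says ns=25 good inlinks starts with
// maxAge = 15 days; if titledb holds fewer than 10M recs that drops to
// 1 day; sni/100 adds 0 extra days for such a small site; so the tag is
// recomputed about once a day while the index is still small.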
// our companion tags, sitePop and fresh inlinks
// Tag *tag2 = gr->getTag ( "sitenuminlinksuniqueip" );
// Tag *tag3 = gr->getTag ( "sitenuminlinksuniquecblock");
// Tag *tag4 = gr->getTag ( "sitenuminlinkstotal");
// if we are missing either of those, invalidate as well
// if ( ! tag2 ) valid = false;
// if ( ! tag3 ) valid = false;
// if ( ! tag4 ) valid = false;
// if we have already been through this
if ( m_updatingSiteLinkInfoTags ) valid = false;
// if rebuilding linkdb assume we have no links to sample from!
if ( tag && m_useSecondaryRdbs && g_repair.m_rebuildLinkdb )
valid = true;
// debug log
if ( g_conf.m_logDebugLinkInfo )
log("xmldoc: valid=%"INT32" "
"age=%"INT32" ns=%"INT32" sni=%"INT32" "
"maxage=%"INT32" "
"tag=%"PTRFMT" "
// "tag2=%"PTRFMT" "
// "tag3=%"PTRFMT" "
"url=%s",
(int32_t)valid,age,ns,sni,
maxAge,
(PTRTYPE)tag,
// (PTRTYPE)tag2,
// (PTRTYPE)tag3,
m_firstUrl.m_url);
LinkInfo *sinfo = NULL;
char *mysite = NULL;
// if we are good return it
if ( tag && valid ) {
// set it
m_siteNumInlinks = atol(tag->getTagData());
m_siteNumInlinksValid = true;
// companion tags
// if ( tag2 ) {
// m_siteNumInlinksUniqueIp = atol(tag2->getTagData());
// m_siteNumInlinksUniqueIpValid = true;
// }
// if ( tag3 ) {
// m_siteNumInlinksUniqueCBlock =atol(tag3->getTagData());
// m_siteNumInlinksUniqueCBlockValid = true;
// }
// if ( tag4 ) {
// m_siteNumInlinksTotal =atol(tag4->getTagData());
// m_siteNumInlinksTotalValid = true;
// }
// . consult our sitelinks.txt file
// . returns -1 if not found
goto updateToMin;
}
// set status. we can time status changes with this routine!
//setStatus ( "getting site link info");
// if ip is bad we can't do this. we need to have a legit ip
// so we know if a linker is internal or not
/*
if ( *ip == 0 || *ip == -1 ) {
log("gb: bad ip so we can't get site num inlinks right");
m_siteNumInlinks = 0;
m_sitePop = 0;
m_siteNumInlinksFresh = 0;
m_siteNumInlinksValid = true;
m_siteNumInlinksFreshValid = true;
m_sitePopValid = true;
return &m_siteNumInlinks;
}
*/
// set this flag so when we are re-called, "valid" will be set to false
// so we can come down here and continue this. "flutter" might
// otherwise cause us to not make it down here.
m_updatingSiteLinkInfoTags = true;
// we need to re-get both if either is NULL
sinfo = getSiteLinkInfo();
// block or error?
if ( ! sinfo || sinfo == (LinkInfo *)-1) return (int32_t *)sinfo;
//
// now update tagdb!
//
// ok, get the sites of the external outlinks and they must
// also be NEW outlinks, added to the page since the last time
// we spidered it...
//Links *links = getLinks ();
//if ( ! links || links == (Links *)-1 ) return (int32_t *)links;
mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (int32_t *)mysite;
setStatus ( "adding site info tags to tagdb 1");
// why are we adding tag again! should already be in tagdb!!!
if ( m_doingConsistencyCheck ) {char*xx=NULL;*xx=0;}
// do not re-call at this point
//m_siteNumInlinks = sinfo->m_numInlinksExtrapolated;
m_siteNumInlinks = (int32_t)sinfo->m_numGoodInlinks;
//m_siteNumInlinksFresh = sinfo->m_numInlinksFresh;
//m_sitePop = sinfo->m_pagePop;
// m_siteNumInlinksUniqueIp = sinfo->m_numUniqueIps;
// m_siteNumInlinksUniqueCBlock = sinfo->m_numUniqueCBlocks;
// m_siteNumInlinksTotal = sinfo->m_totalInlinkingDocIds;
m_siteNumInlinksValid = true;
// m_siteNumInlinksUniqueIpValid = true;
// m_siteNumInlinksUniqueCBlockValid = true;
// m_siteNumInlinksTotalValid = true;
updateToMin:
// . consult our sitelinks.txt file
// . returns -1 if not found
int32_t hostHash32 = getHostHash32a();
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
if ( min >= 0 ) {
if ( m_siteNumInlinks < min ||
! m_siteNumInlinksValid ) {
m_siteNumInlinks = min;
m_siteNumInlinksValid = true;
}
// if ( ! m_siteNumInlinksUniqueIpValid ||
// m_siteNumInlinksUniqueIp < min ) {
// m_siteNumInlinksUniqueIp = min;
// m_siteNumInlinksUniqueIpValid = true;
// }
// if ( ! m_siteNumInlinksUniqueCBlockValid ||
// m_siteNumInlinksUniqueCBlock < min ) {
// m_siteNumInlinksUniqueCBlock = min;
// m_siteNumInlinksUniqueCBlockValid = true;
// }
// if ( ! m_siteNumInlinksTotalValid ||
// m_siteNumInlinksTotal < min ) {
// m_siteNumInlinksTotal = min;
// m_siteNumInlinksTotalValid = true;
// }
}
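// Illustrative example for the sitelinks.txt floor above (numbers are
// hypothetical): if the file says this host has at least 120 inlinks but
// the msg25 sample only found 80 good inlinks, m_siteNumInlinks is bumped
// up to 120.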
// deal with it
return &m_siteNumInlinks;
}
// . do a 'site:xyz.com | gbnuminlinks' query to get the top docs
// from a site and get the gigabits from that query!
// . then store the resulting gigabits into tagdb for efficiency
// . recompute once per month or so ... or if ip changes i guess
// . we need the root title as a source for city and adm1's for
// Addresses::set() function
//char **XmlDoc::getSiteGigabits ( ) {
//}
// TODO: can we have a NULL LinkInfo without having had an error?
LinkInfo *XmlDoc::getSiteLinkInfo() {
// lookup problem?
if ( g_errno ) {
log("build: error getting link info: %s",
mstrerror(g_errno));
return NULL;
}
setStatus ( "getting site link info" );
if ( m_siteLinkInfoValid )
//return msg25.m_linkInfo;
return (LinkInfo *)m_mySiteLinkInfoBuf.getBufStart();
char *mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite;
int32_t *fip = getFirstIp();
if ( ! fip || fip == (int32_t *)-1) return (LinkInfo *)fip;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// can we be cancelled?
bool canBeCancelled = true;
// not if pageparser though
if ( m_pbuf ) canBeCancelled = false;
// not if injecting
if ( ! m_sreqValid ) canBeCancelled = false;
// assume valid when it returns
m_siteLinkInfoValid = true;
// use this buffer so XmlDoc::print() can display it where it wants
SafeBuf *sb = NULL;
if ( m_pbuf ) sb = &m_siteLinkBuf;
// only do this for showing them!!!
if ( m_useSiteLinkBuf ) sb = &m_siteLinkBuf;
//bool onlyGetGoodInlinks = true;
//if ( m_useSiteLinkBuf ) onlyGetGoodInlinks = false;
// get this
int32_t lastUpdateTime = getTimeGlobal();
// get from spider request if there
//bool injected = false;
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
// but be consistent if doing the "qatest123" collection
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
lastUpdateTime = getSpideredTime();//m_spideredTime;
}
bool onlyNeedGoodInlinks = true;
// so if steve wants to display all links then set this
// to false so we get titles of bad inlinks
// seems like pageparser.cpp just sets m_pbuf and not
// m_usePageLinkBuf any more
if ( sb ) onlyNeedGoodInlinks = false;
// shortcut
//Msg25 *m = &m_msg25;
if ( ! getLinkInfo ( &m_tmpBuf11,
&m_mcast11,
mysite , // site
mysite , // url
true , // isSiteLinkInfo?
*fip ,
0 , // docId
cr->m_collnum , //linkInfoColl
NULL , // qbuf
0 , // qbufSize
m_masterState ,
m_masterLoop ,
m_contentInjected ,// isInjecting?
sb ,
m_printInXml ,
0 , // sitenuminlinks -- dunno!
//0 , // sitePop
NULL , // oldLinkInfo1 ,
m_niceness ,
cr->m_doLinkSpamCheck ,
cr->m_oneVotePerIpDom ,
canBeCancelled ,
lastUpdateTime ,
onlyNeedGoodInlinks ,
false,
0,
0,
// it will store the linkinfo into this safebuf
&m_mySiteLinkInfoBuf) )
// return -1 if it blocked
return (LinkInfo *)-1;
// sanity check
//if ( ! m_msg25.m_linkInfo ) {
// log("build: error making link info: %s",mstrerror(g_errno));
// return NULL;
//}
// we got it
//return m_msg25.m_linkInfo;
// getLinkInfo() now calls multicast so it returns true on errors only
log("build: error making link info: %s",mstrerror(g_errno));
return NULL;
}
static void gotIpWrapper ( void *state , int32_t ip ) ;
static void delayWrapper ( int fd , void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_masterLoop ( THIS->m_masterState );
}
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
int32_t *XmlDoc::getIp ( ) {
// return if we got it
if ( m_ipValid ) return &m_ip;
// update status msg
setStatus ( "getting ip" );
m_ipStartTime = 0;
// assume the same in case we get it right away
m_ipEndTime = 0;
// if set from docid and recycling
if ( m_recycleContent ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (int32_t *)pod;
// shortcut
XmlDoc *od = *pod;
// set it
if ( od ) {
m_ip = od->m_ip;
m_ipValid = true;
return &m_ip;
}
}
// fake it for now
//log("FAKING IT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
//m_ip = atoip("74.201.80.152",13);
//m_ipValid = true;
//return &m_ip;
// get the best url
Url *u = getCurrentUrl();
if ( ! u || u == (void *)-1 ) return (int32_t *)u;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
// when building the "qatest123" collection try to get the ip from
// "./test/ips.txt" so our injections are consistent every time
// Test.cpp runs its injection loop into the "qatest123" collection
if ( useTestCache ) { // && m_useIpsTxtFile ) {
// stolen from msgc.cpp:
// if url is already in a.b.c.d format return that
int32_t ip2 = 0;
char *host = u->getHost();
if ( host ) ip2 = atoip ( host,u->getHostLen() );
if ( ip2 != 0 ) {
m_ip = ip2;
m_ipValid = true;
return &m_ip;
}
// assume not found in our file
bool found = false;
// get test dir
char *testDir = getTestDir();
// get it from "./test/ips.txt"
getTestIp ( u->getUrl() , &m_ip , &found , m_niceness,testDir);
// if we found a match...
if ( found ) { // m_ip != 0 ) {
// we are valid now
return gotIp ( false );
//m_ipValid = true;
// return it
//return &m_ip;
}
}
// we need the ip before we download the page, but before we get
// the IP and download the page, wait for this many milliseconds.
// this basically slows the spider down.
int32_t delay = cr->m_spiderDelayInMilliseconds;
// ignore for testing
if ( ! strcmp(cr->m_coll,"qatest123") ) delay = 0;
// injected?
if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
if ( m_sreqValid && m_sreq.m_isScraping ) delay = 0;
if ( m_sreqValid && m_sreq.m_fakeFirstIp ) delay = 0;
// . don't do the delay when downloading extra doc, robots.txt etc.
// . this also reports a status msg of "getting new doc" when it
// really means "delaying spider"
if ( m_isChildDoc ) delay = 0;
if ( delay > 0 && ! m_didDelay ) {
// we did it
m_didDelay = true;
m_statusMsg = "delaying spider";
// random fuzz so we don't get everyone being unleashed at once
int32_t radius = (int32_t)(.20 * (double)delay);
int32_t fuzz = (rand() % (radius * 2)) - radius;
delay += fuzz;
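// e.g. (illustrative) a configured delay of 1000 ms gives radius=200,
// fuzz in [-200,199], so the actual sleep lands in [800,1199] ms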
// make a callback wrapper.
// this returns false and sets g_errno on error
if ( g_loop.registerSleepCallback ( delay ,
m_masterState ,
delayWrapper,//m_masterLoop
m_niceness ))
// wait for it, return -1 since we blocked
return (int32_t *)-1;
// if was not able to register, ignore delay
}
if ( m_didDelay && ! m_didDelayUnregister ) {
g_loop.unregisterSleepCallback(m_masterState,delayWrapper);
m_didDelayUnregister = true;
}
// update status msg
setStatus ( "getting ip" );
m_ipStartTime = gettimeofdayInMillisecondsGlobal();
// assume valid! if reply handler gets g_errno set then m_masterLoop
// should see that and call the final callback
//m_ipValid = true;
// get it
if ( ! m_msgc.getIp ( u->getHost () ,
u->getHostLen() ,
&m_ip ,
this ,
gotIpWrapper ))
// we blocked
return (int32_t *)-1;
// wrap it up
return gotIp ( true );
}
void gotIpWrapper ( void *state , int32_t ip ) {
// point to us
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_ipEndTime = gettimeofdayInMillisecondsGlobal();
// wrap it up
THIS->gotIp ( true );
// . call the master callback
// . m_masterState usually equals THIS, unless THIS is the
// Xml::m_contactDoc or something...
THIS->m_masterLoop ( THIS->m_masterState );
}
int32_t *XmlDoc::gotIp ( bool save ) {
// return NULL on error
if ( g_errno ) return NULL;
// this is bad too
//if ( m_ip == 0 || m_ip == -1 ) m_indexCode = EBADIP;
//log("db: got ip %s for %s",iptoa(m_ip),getCurrentUrl()->getUrl());
setStatus ("got ip");
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// note it for crawlbot
if ( cr->m_isCustomCrawl && ( m_ip == 0 || m_ip == -1 ) )
log("db: got ip %"INT32" for %s",
m_ip,getCurrentUrl()->getUrl());
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
// when building the "qatest123" collection try to get the ip from
// "./test/ips.txt" so our injections are consistent every time
// Test.cpp runs its injection loop into the "qatest123" collection
if ( save && useTestCache ) {
// ip of 0 means NXDOMAIN i think (-1 means error)
//if ( m_ip == 0 ) {
// log("waiting for debug break");
// sleep(3600);
//}
// get the best url
Url *u = getCurrentUrl();
if ( !u || u == (void *)-1 ) { char *xx=NULL;*xx=0; }
// . add it to "./test/ips.txt"
// . this function is in Msge1.cpp
addTestIp ( u->getHost() , u->getHostLen() , m_ip );
// get test dir
char *testDir = getTestDir();
// save it
saveTestBuf ( testDir );
}
// we got it
m_ipValid = true;
// give it to them
return &m_ip;
}
#include "Mime.h"
// taken from Robotdb.cpp
bool isAllowed2 ( Url *url ,
char *userAgent ,
char *file ,
int32_t fileLen ,
bool *userAgentFound ,
bool substringMatch ,
int32_t *crawlDelay ,
char **cacheStart ,
int32_t *cacheLen ,
bool *hadAllowOrDisallow ) {
// assume nothing to cache yet
*cacheLen = 0;
*cacheStart = file;
// assume user agent is not in the file
*userAgentFound = false;
*hadAllowOrDisallow = false;
// assume no crawl delay (-1)
// *crawlDelay = -1;
// if fileLen is 0 it is allowed
if ( fileLen <= 0 ) return true;
// get path from url, include cgi stuff
char *path = url->getPath();
int32_t pathLen = url->getPathLenWithCgi();
// set the Mime class to this Mime file
Mime mime;
mime.set ( file , fileLen );
// get a line of Mime
char *f , *v;
int32_t flen, vlen;
// user agent length
int32_t uaLen = gbstrlen (userAgent);
// ptr into "file"
char *p = file;
char flag;
bool allowed = true;
loop:
// if p is NULL now we're done
if ( ! p ) return allowed;
// get the next Mime line
p = mime.getLine ( p , &f , &flen , &v , &vlen );
// if this field is NOT "user-agent" skip it
if ( flen != 10 ) goto loop;
if ( strncasecmp ( f , "user-agent" , 10 ) != 0 ) goto loop;
gotAgent:
//some webmasters put comments at the end of their lines,
//because they think this is a shell script or something.
char* vv = v;
while(vv - v < vlen && *vv != '#') vv++;
vlen = vv - v;
// decrement vlen to hack off spaces after the user-agent so that vlen
// is really the length of the user agent
while ( vlen > 0 && is_wspace_a(v[vlen-1]) ) vlen--;
// now match the user agent
if ( ! substringMatch && vlen != uaLen ) goto loop;
// otherwise take the min of the lengths
if ( uaLen < vlen ) vlen = uaLen;
// is it the right user-agent?
if ( strncasecmp ( v , userAgent , vlen ) != 0 ) goto loop;
// we got it, if first instance start our cache here
if ( !*userAgentFound ) *cacheStart = f;
*userAgentFound = true;
flag = 0;
urlLoop:
// if p is NULL now there is no more lines
if ( ! p ) {
// set our cache stop to the end of the file
*cacheLen = (file + fileLen) - *cacheStart;
return allowed;
}
// now loop over lines until we hit another user-agent line
p = mime.getLine ( p , &f , &flen , &v , &vlen );
// if it's another user-agent line ... ignore it unless we already
// saw an allow/disallow/crawl-delay/sitemap line, in which case this
// starts another rule set, so go back and re-check the agent name
if ( flag && flen==10 && strncasecmp(f,"user-agent",10)==0) {
// set our cache stop here
*cacheLen = f - *cacheStart;
goto gotAgent;
}
// if a crawl delay, get the delay
if ( flen == 11 && strncasecmp ( f , "crawl-delay", 11 ) == 0 ) {
// set flag
flag = 1;
// skip if invalid. it could be ".5" seconds
if ( ! is_digit ( *v ) && *v != '.' ) goto urlLoop;
// get this. multiply crawl delay by x1000 to be in
// milliseconds/ms
int64_t vv = (int64_t)(atof(v) * 1000LL);
// truncate to 0x7fffffff
if ( vv > 0x7fffffff ) *crawlDelay = 0x7fffffff;
else if ( vv < 0 ) *crawlDelay = -1;
else *crawlDelay = (int32_t)vv;
// get the delay
//*crawlDelay = atol(v) * 1000;
goto urlLoop;
}
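// e.g. "Crawl-delay: .5" yields *crawlDelay = 500 (ms),
// "Crawl-delay: 86400" yields 86,400,000 ms, and anything whose
// millisecond value overflows 32 bits is clamped to 0x7fffffff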
// if already disallowed, just goto the next line
if ( !allowed ) goto urlLoop;
// if we have an allow line or sitemap: line, then set flag to 1
// so we can go to another user-agent line.
// fixes romwebermarketplace.com/robots.txt
// (doc.156447320458030317.txt)
if ( flen==5 && strncasecmp(f,"allow" ,5)==0 ) {
*hadAllowOrDisallow = true;
flag = 1;
}
if ( flen==7 && strncasecmp(f,"sitemap",7)==0 ) {
flag = 1;
}
// if not disallow go to loop at top
if ( flen != 8 ) goto urlLoop;
if ( strncasecmp ( f , "disallow" , 8 ) != 0 ) {
goto urlLoop;
}
// we had a disallow
*hadAllowOrDisallow = true;
// set flag
flag = 1;
// . take off trailing chars from the banned path name
// . this is now done below
//while ( vlen > 0 && is_space(v[vlen-1]) ) vlen--;
// . skip leading spaces
// . this should be done in mime class
// while ( vlen > 0 && is_space(v[0]) ) { v++; vlen--; }
// now stop at first space after url or end of line
char *s = v;
char *send = v + vlen;
// skip all non-space chars
while ( s < send && ! is_wspace_a(*s) ) s++;
// stop there
vlen = s - v;
// check for match
char *tmpPath = path;
int32_t tmpPathLen = pathLen;
// assume path begins with /
if ( vlen > 0 && v[0] != '/'){tmpPath++;tmpPathLen--;}
if ( vlen > tmpPathLen ) goto urlLoop;
if ( strncasecmp(tmpPath,v,vlen) != 0 ) goto urlLoop;
// an exact match
if ( vlen == tmpPathLen ) {
//return false;
allowed = false;
goto urlLoop;
}
// must be something
if ( vlen <= 0 ) goto urlLoop;
// "v" may or may not end in a /, it really should end in a / though
if ( v[vlen-1] == '/' && tmpPath[vlen-1] == '/' ) {
//return false;
allowed = false;
goto urlLoop;
}
if ( v[vlen-1] != '/' && tmpPath[vlen ] == '/' ) {
//return false;
allowed = false;
goto urlLoop;
}
// let's be stronger. just do the substring match. if the webmaster
// does not want us splitting path or file names then they should end
// all of their robots.txt entries in a '/'. this also fixes the
// problem of the "Disallow: index.htm?" line.
//return false;
allowed = false;
// get another url path
goto urlLoop;
}
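// Illustrative example for isAllowed2() above (agent name and paths are
// hypothetical). Given this robots.txt body:
//
//   User-agent: Gigabot
//   Crawl-delay: 2
//   Disallow: /private
//
// and assuming the configured spider user agent begins with "Gigabot",
// the left-anchored substring match finds that block, sets
// *crawlDelay = 2000 ms, and isAllowed2() returns false for any path
// that starts with "/private" (e.g. /private/page.html) but true for
// /public/x.html, since disallow lines are treated as left-anchored
// prefix matches on the path.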
// when doing a custom crawl we have to decide between the provided crawl
// delay, and the one in the robots.txt...
int32_t *XmlDoc::getFinalCrawlDelay() {
if ( m_finalCrawlDelayValid )
return &m_finalCrawlDelay;
bool *isAllowed = getIsAllowed();
if ( ! isAllowed || isAllowed == (void *)-1 ) return (int32_t *)isAllowed;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
m_finalCrawlDelayValid = true;
// getIsAllowed already sets m_crawlDelayValid to true
if ( ! cr->m_isCustomCrawl ) {
m_finalCrawlDelay = m_crawlDelay;
// default to 250ms i guess if none specified in robots
// just to be somewhat nice by default
if ( m_crawlDelay < 0 ) m_finalCrawlDelay = 250;
return &m_finalCrawlDelay;
}
// get manually specified crawl delay in seconds. convert to ms.
int32_t manual = (int32_t)(cr->m_collectiveCrawlDelay * 1000.0);
// negative means -1 means unknown or not specified
if ( manual < 0 ) manual = -1;
// if both are unknown...
if ( m_crawlDelay == -1 && manual == -1 ) {
m_finalCrawlDelay = -1;
return &m_finalCrawlDelay;
}
// if not in robots.txt use manual
if ( m_crawlDelay == -1 ) {
m_finalCrawlDelay = manual;
return &m_finalCrawlDelay;
}
// if manually provided crawldelay is -1, use robots.txt then
if ( manual == -1 ) {
m_finalCrawlDelay = m_crawlDelay;
return &m_finalCrawlDelay;
}
// let robots.txt dictate if both are >= 0
if ( m_useRobotsTxt ) {
m_finalCrawlDelay = m_crawlDelay;
return &m_finalCrawlDelay;
}
// if not using robots.txt, pick the smallest
if ( m_crawlDelay < manual ) m_finalCrawlDelay = m_crawlDelay;
else m_finalCrawlDelay = manual;
return &m_finalCrawlDelay;
}
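// Summary of the custom-crawl branch above (non-custom crawls just take the
// robots.txt value, defaulting to 250 ms if none was specified):
//   robots.txt delay | manual delay | final delay
//   -----------------+--------------+------------------------------------
//        -1          |     -1       | -1 (unknown)
//        -1          |    >= 0      | manual
//       >= 0         |     -1       | robots.txt
//       >= 0         |    >= 0      | robots.txt if m_useRobotsTxt is set,
//                    |              | else the smaller of the two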
bool XmlDoc::isFirstUrlRobotsTxt ( ) {
if ( m_isRobotsTxtUrlValid )
return m_isRobotsTxtUrl;
Url *fu = getFirstUrl();
m_isRobotsTxtUrl = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
m_isRobotsTxtUrlValid = true;
return m_isRobotsTxtUrl;
}
// . get the Robots.txt and see if we are allowed
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
// . getting a robots.txt is not trivial since we need to follow redirects,
// so we make use of the powerful XmlDoc class for this
bool *XmlDoc::getIsAllowed ( ) {
// return if we got it
if ( m_isAllowedValid ) return &m_isAllowed;
// could be turned off for everyone
if ( ! m_useRobotsTxt ) {
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelayValid = true;
m_crawlDelay = -1;
//log("xmldoc: skipping robots.txt lookup for %s",
// m_firstUrl.m_url);
return &m_isAllowed;
}
// . if setting from a title rec, assume allowed
// . this avoids doConsistencyCheck() from blocking and coring
if ( m_setFromTitleRec ) {
m_isAllowed = true;
m_isAllowedValid = true;
return &m_isAllowed;
}
if ( m_recycleContent ) {
m_isAllowed = true;
m_isAllowedValid = true;
return &m_isAllowed;
}
// HACK: so we can spider archive.org warcs and arcs internally
if ( m_firstUrlValid &&
m_firstUrl.getDomainLen() == 11 &&
strncmp ( m_firstUrl.getDomain() , "archive.org" , 11 ) == 0 ) {
m_isAllowed = true;
m_isAllowedValid = true;
return &m_isAllowed;
}
// double get?
if ( m_crawlDelayValid ) { char *xx=NULL;*xx=0; }
// bulk jobs don't need this
CollectionRec *cr = getCollRec();
if ( cr && cr->m_isCustomCrawl == 2 ) {
m_isAllowed = true;
m_isAllowedValid = true;
return &m_isAllowed;
}
// . if WE are robots.txt that is always allowed!!!
// . check the *first* url since these often redirect to weird things
if ( isFirstUrlRobotsTxt() ) {
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelayValid = true;
// make it super fast...
m_crawlDelay = 0;
return &m_isAllowed;
}
// or if using the "qatest123" collection, assume yes!
//if ( ! strcmp ( m_coll , "qatest123" ) ) {
// m_isAllowed = true;
// m_isAllowedValid = true;
// return &m_isAllowed;
//}
// update status msg
setStatus ( "getting robots.txt" );
// sanity
int32_t *ip = getIp ();
// error? or blocked?
if ( ! ip || ip == (void *)-1 ) return (bool *)ip;
Url *fu = getFirstUrl();
// if ip does not exist on the dns, do not try to download robots.txt
// it is pointless... this can happen in the dir coll and we basically
// have "m_siteInCatdb" set to true
if ( *ip == 1 || *ip == 0 || *ip == -1 ) {
// note this
log("build: robots.txt ip is %s for url=%s. allowing for now.",
fu->getUrl(),iptoa(*ip));
// just core for now
//char *xx=NULL;*xx=0;
m_isAllowed = true;
m_isAllowedValid = true;
// since ENOMIME is no longer causing the indexCode
// to be set, we are getting a core because crawlDelay
// is invalid in getNewSpiderReply()
m_crawlDelayValid = true;
m_crawlDelay = -1;
return &m_isAllowed;
}
// we need this so getExtraDoc does not core
int32_t *pfip = getFirstIp();
if ( ! pfip || pfip == (void *)-1 ) return (bool *)pfip;
// get the current url after redirects
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (bool *)cu;
// set m_extraUrl to the robots.txt url
char buf[MAX_URL_LEN+2];
char *p = buf;
if ( cu->isHttps() ) p += sprintf ( p , "https://" );
else p += sprintf ( p , "http://" );
// sanity
if ( ! cu->getHost() ) { char *xx=NULL;*xx=0; }
gbmemcpy ( p , cu->getHost() , cu->getHostLen() );
p += cu->getHostLen();
int32_t port = cu->getPort();
// 80 is the default port
int32_t defPort = 80;
// is it https://?
if ( cu->m_url[4] == 's' ) defPort = 443;
if ( port != defPort ) p += sprintf ( p , ":%"INT32"",port );
p += sprintf ( p , "/robots.txt" );
m_extraUrl.set ( buf );
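// e.g. (hypothetical url) a current url of https://example.com:8443/a/b?c=1
// yields an m_extraUrl of https://example.com:8443/robots.txt, while
// http://example.com/a/b yields http://example.com/robots.txt since 80 is
// the default port and is omitted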
// . maxCacheAge = 3600 seconds = 1 hour for robots.txt
// . if this is non-zero then msg13 should store it as well!
// . for robots.txt it should only cache the portion of the doc
// relevant to our user agent!
// . getHttpReply() should use msg13 to get cached reply!
XmlDoc **ped = getExtraDoc ( m_extraUrl.getUrl() , 3600 );
if ( ! ped || ped == (void *)-1 ) return (bool *)ped;
// assign it
XmlDoc *ed = *ped;
// return NULL on error with g_errno set
if ( ! ed ) {
// sanity check, g_errno must be set
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// log it -- should be rare?
log("doc: had error getting robots.txt: %s",
mstrerror(g_errno));
return NULL;
}
// inherit this
//if ( ! m_useIpsTxtFile ) ed->m_useIpsTxtFile = false;
// . steal m_firstIp from us to avoid tag rec lookup
// . why was this commented out?
// . maybe because if we redirect, this is not the same!!!
//ed->m_firstIp = m_firstIp;
//ed->m_firstIpValid = m_firstIpValid;//true;
// also, steal our ip! neither is this!
//ed->m_ip = m_ip;
//ed->m_ipValid = m_ipValid;
// . now try the content
// . should call getHttpReply
char **pcontent = ed->getContent();
if ( ! pcontent || pcontent == (void *)-1 ) return (bool *)pcontent;
// get the mime
HttpMime *mime = ed->getMime();
if ( ! mime || mime == (HttpMime *)-1 ) return (bool *)mime;
// get this
int32_t contentLen = ed->m_contentLen;
// save this
m_robotsTxtLen = contentLen;
m_robotsTxtLenValid = true;
// get content
char *content = *pcontent;
// sanity check
if ( content && contentLen>0 && content[contentLen] != '\0'){
char*xx=NULL;*xx=0;}
// reset this. -1 means unknown or none found.
m_crawlDelay = -1;
m_crawlDelayValid = true;
// assume valid and ok to spider
m_isAllowed = true;
m_isAllowedValid = true;
// put in a crawldelay test for diffbot
/*
SafeBuf tmp;
if ( strstr(m_firstUrl.getUrl(),"diffbot.com") ) {
tmp.safePrintf("User-Agent: *\n"
"Crawl-Delay: 10.1\n"
);
content = tmp.getBufStart();
contentLen = tmp.getLength();
}
// if not success, assume no robots.txt
else*/
if ( mime->getHttpStatus() != 200 ) {
// nuke it to save mem
nukeDoc ( ed );
return &m_isAllowed;
}
// get the url we lookup
//Url *cu = getCurrentUrl();
// this is set to true if our userAgent was found explicitly
bool uaFound;
bool allowed;
char *cacheStart;
int32_t cacheLen;
bool hadAllowOrDisallow;
int32_t savedCrawlDelay = -1;
// now use left-anchored substring match so we can match Gigabot/1.0
allowed = isAllowed2 ( cu ,
g_conf.m_spiderUserAgent ,
content ,
contentLen ,
&uaFound ,
true , // substrmatch?
&m_crawlDelay ,
&cacheStart ,
&cacheLen ,
&hadAllowOrDisallow );
// save it
savedCrawlDelay = m_crawlDelay;
// . if didn't find our user agent so check for * as a user-agent
// . www.wikihow.com/robots.txt just has "Gigabot: crawl-delay:10\n"
// and then a "User-Agent: *" after that with the disallows, so
// i added the hadAllowDisallow parm
if ( ! uaFound || ! hadAllowOrDisallow )
allowed = isAllowed2 ( cu ,
"*" ,
content ,
contentLen ,
&uaFound ,
false , // substrmatch?
&m_crawlDelay ,
&cacheStart ,
&cacheLen ,
&hadAllowOrDisallow );
// restore the agent-specific crawl-delay if the "*" pass overwrote it
if ( savedCrawlDelay != -1 ) m_crawlDelay = savedCrawlDelay;
// nuke it to save mem
nukeDoc ( ed );
// we are legit
m_isAllowed = allowed;
m_isAllowedValid = true;
return &m_isAllowed;
}
// . lookup the title rec with the "www." if we do not have that in the url
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
char *XmlDoc::getIsWWWDup ( ) {
// this is not a real error really
//if ( g_errno == ENOTFOUND ) g_errno = 0;
// return if we got it
if ( m_isWWWDupValid ) return &m_isWWWDup;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// could be turned off for everyone
if ( ! cr->m_dupCheckWWW ) {
m_isWWWDup = false;
m_isWWWDupValid = true;
return &m_isWWWDup;
}
// get the FIRST URL... (no longer current url after redirects)
Url *u = getFirstUrl(); // CurrentUrl();
// if we are NOT a DOMAIN-ONLY url, then no need to do this dup check
if ( u->getDomainLen() != u->getHostLen() ) {
m_isWWWDup = false;
m_isWWWDupValid = true;
return &m_isWWWDup;
}
// must NOT have a www
if ( ! u->isHostWWW() ) {
m_isWWWDup = false;
m_isWWWDupValid = true;
return &m_isWWWDup;
}
// watch out for idiot urls like www.gov.uk and www.gov.za
// treat them as though the TLD is uk/za and the domain
// is gov.uk and gov.za
if ( u->getDomain() &&
strncmp ( u->getDomain() , "www." , 4 ) == 0 ) {
m_isWWWDup = false;
m_isWWWDupValid = true;
return &m_isWWWDup;
}
// make it without the www
char withoutWWW[MAX_URL_LEN+1];
char *proto = "http";
if ( u->isHttps() ) proto = "https";
sprintf(withoutWWW,"%s://%s",proto,u->getDomain());
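// e.g. (hypothetical) a first url of http://www.example.com/ builds
// withoutWWW = "http://example.com"; if titledb already has a doc for that
// domain-only host then this www version is considered a dup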
// assume yes
m_isWWWDup = true;
if ( ! m_calledMsg22f )
setStatus ( "getting possible www dup title rec" );
// . does this title rec exist in titledb?
// . "justCheckTfndb" is set to true here!
if ( ! m_calledMsg22f &&
! m_msg22f.getTitleRec ( &m_msg22Request ,
withoutWWW ,
0 , // probable docid
cr->m_coll ,
// . msg22 will set this to point to it!
// . if NULL that means NOT FOUND
NULL , // tr ptr
NULL , // tr size ptr
true , // just chk tfndb?
false, // getavaildocidonly
m_masterState ,
m_masterLoop ,
m_niceness , // niceness
false , // add to cache?
0 , // max cache age
999999 , // timeout seconds
false )){//load balancing?
// validate
m_calledMsg22f = true;
// return -1 if we blocked
return (char *)-1;
}
// got it
m_calledMsg22f = true;
// valid now
m_isWWWDupValid = true;
// found?
if ( ! g_errno && m_msg22f.m_found ) {
// crap we are a dup
m_isWWWDup = true;
// set the index code
//m_indexCode = EDOCDUPWWW;
}
// return us
return &m_isWWWDup;
}
LinkInfo s_dummy2;
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
LinkInfo *XmlDoc::getLinkInfo1 ( ) {
if ( m_linkInfo1Valid && ptr_linkInfo1 )
return ptr_linkInfo1;
// do not generate in real-time from a msg20 request for a summary,
// because if this falls through then getFirstIp() below can return -1
// and we return -1, causing all kinds of bad things to happen for
// handling the msg20 request
if ( m_setFromTitleRec && m_req && ! ptr_linkInfo1 ) {
returnDummy:
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
s_dummy2.m_lisize = sizeof(LinkInfo);
ptr_linkInfo1 = &s_dummy2;
size_linkInfo1 = sizeof(LinkInfo);
return ptr_linkInfo1;
}
// at least get our firstip so if cr->m_getLinkInfo is false
// then getRevisedSpiderReq() will not core because it is invalid
int32_t *ip = getFirstIp();
if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo *)ip;
// just return nothing if not doing link voting
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// to keep things fast we avoid getting link info for some collections
if ( ! m_linkInfo1Valid && ! cr->m_getLinkInfo ) {
ptr_linkInfo1 = NULL;
m_linkInfo1Valid = true;
}
// sometimes it is NULL in title rec when setting from title rec
if ( m_linkInfo1Valid && ! ptr_linkInfo1 ) {
goto returnDummy;
}
// return if we got it
if ( m_linkInfo1Valid )
return ptr_linkInfo1;
// change status
setStatus ( "getting local inlinkers" );
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo *)od;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo *)sni;
//int32_t *fip = getFirstIp();
//if ( ! fip || fip == (int32_t *)-1 ) return (LinkInfo *)fip;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (LinkInfo *)d;
// sanity check. error?
if ( *d == 0LL ) {
log("xmldoc: crap no g_errno");
g_errno = EBADENGINEER;
return NULL;
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return NULL;
}
char *mysite = getSite();
if ( ! mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite;
// no linkinfo for diffbot custom crawls to speed up
if ( cr->m_isCustomCrawl ) {
m_linkInfo1Valid = true;
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
s_dummy2.m_lisize = sizeof(LinkInfo);
ptr_linkInfo1 = &s_dummy2;
size_linkInfo1 = sizeof(LinkInfo);
return ptr_linkInfo1;
}
// grab a ptr to the LinkInfo contained in our Doc class
LinkInfo *oldLinkInfo1 = NULL;
if ( *od ) oldLinkInfo1 = (*od)->getLinkInfo1();
// if ip does not exist, make it 0
if ( *ip == 0 || *ip == -1 ) {
m_linkInfo1Valid = true;
memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
s_dummy2.m_lisize = sizeof(LinkInfo);
ptr_linkInfo1 = &s_dummy2;
size_linkInfo1 = sizeof(LinkInfo);
return ptr_linkInfo1;
}
//link info generation requires an IP for internal/external computation
// UNLESS we are from getSpiderStatusDocMetaList2() ... so handle
// -1 above!
//if ( *ip == -1 || *ip == 0 ) { char *xx=NULL;*xx=0; }
// . error getting linkers?
// . on udp timeout we were coring below because msg25.m_linkInfo
// was NULL
if ( g_errno && m_calledMsg25 ) return NULL;
// prevent core as well
//if ( m_calledMsg25 && ! size_linkInfo1 ) { // m_msg25.m_linkInfo ) {
// log("xmldoc: msg25 had null link info");
// g_errno = EBADENGINEER;
// return NULL;
//}
// . now search for some link info for this url/doc
// . this queries the search engine to get linking docIds along
// with their termIds/scores from anchor text and then compiles
// it all into one IndexList
// . if we have no linkers to this url then we set siteHash, etc.
// for this linkInfo class
// . this is my google algorithm
// . let's use the first url (before redirects) for this
// . m_newDocId is used for classifying doc under predefined news topic
// . catSiteRec is used for classifying pages under a predefined
// newstopic. this is currently for news search only.
// . use the rootTitleRecPtr if there and we are doing our link info
// stuff in this collection, but if doing it in another collection
// the msg25 will look up the root in that collection...
if ( ! m_calledMsg25 ) {
// get this
int32_t lastUpdateTime = getTimeGlobal();
// but be consistent if doing the "qatest123" collection
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
lastUpdateTime = getSpideredTime();//m_spideredTime;
}
// do not redo it
m_calledMsg25 = true;
// shortcut
//Msg25 *m = &m_msg25;
// can we be cancelled?
bool canBeCancelled = true;
// not if pageparser though
if ( m_pbuf ) canBeCancelled = false;
// not if injecting
if ( ! m_sreqValid ) canBeCancelled = false;
// use this buffer so XmlDoc::print() can display wherever
SafeBuf *sb = NULL;
if ( m_pbuf ) sb = &m_pageLinkBuf;
// only do this for showing them!!!
if ( m_usePageLinkBuf ) sb = &m_pageLinkBuf;
// get from spider request if there
//bool injected = false;
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
// we do not want to waste time computing the page title
// of bad inlinks if we only want the good inlinks, because
// as of oct 25, 2012 we only store the "good" inlinks
// in the titlerec
bool onlyNeedGoodInlinks = true;
// so if steve wants to display all links then set this
// to false so we get titles of bad inlinks
if ( m_usePageLinkBuf ) onlyNeedGoodInlinks = false;
// seems like pageparser.cpp just sets m_pbuf and not
// m_usePageLinkBuf any more
if ( m_pbuf ) onlyNeedGoodInlinks = false;
// status update
setStatus ( "calling msg25 for url" );
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// we want to get all inlinks if doing a custom crawlbot crawl
// because we need the anchor text to pass in to diffbot
bool doLinkSpamCheck = cr->m_doLinkSpamCheck;
bool oneVotePerIpDom = cr->m_oneVotePerIpDom;
// this seems to overdo it when we have a ton of linktext
// perhaps, so take this out...
//if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
// doLinkSpamCheck = false;
// oneVotePerIpDom = false;
// onlyNeedGoodInlinks = false;
//}
// call it. this is defined in Linkdb.cpp
char *url = getFirstUrl()->getUrl();
if ( ! getLinkInfo ( &m_tmpBuf12,
&m_mcast12,
mysite ,
url ,
false , // isSiteLinkInfo?
*ip ,
*d ,
cr->m_collnum , //linkInfoColl
NULL , // qbuf
0 , // qbufSize
m_masterState ,
m_masterLoop ,
m_contentInjected ,//m_injectedReply ,
sb ,
m_printInXml ,
*sni ,
//m_sitePop ,
oldLinkInfo1 ,
m_niceness ,
doLinkSpamCheck ,
oneVotePerIpDom ,
canBeCancelled ,
lastUpdateTime ,
onlyNeedGoodInlinks ,
false, // getlinkertitles
0, // ourhosthash32 (special)
0, // ourdomhash32 (special)
&m_myPageLinkInfoBuf
) )
// blocked
return (LinkInfo *)-1;
// error?
if ( g_errno ) return NULL;
// panic! what the fuck? why did it return true and then
// call our callback???
//if ( g_conf.m_logDebugBuild ) {
log("build: xmldoc call to msg25 did not block");
// must now block since it uses multicast now to
// send the request onto the network
char *xx=NULL;*xx=0;
//}
}
// at this point assume its valid
m_linkInfo1Valid = true;
// . get the link info we got set
// . this ptr references into m_myPageLinkInfoBuf safebuf
//ptr_linkInfo1 = m_msg25.m_linkInfo;
//size_linkInfo1 = m_msg25.m_linkInfo->getSize();
ptr_linkInfo1 = (LinkInfo *)m_myPageLinkInfoBuf.getBufStart();
size_linkInfo1 = m_myPageLinkInfoBuf.length();
// we should free it
m_freeLinkInfo1 = true;
// this can not be NULL!
if ( ! ptr_linkInfo1 || size_linkInfo1 <= 0 ) {
log("build: error getting linkinfo1: %s",mstrerror(g_errno));
char *xx=NULL;*xx=0;
return NULL;
}
// take it from msg25 permanently
//m_msg25.m_linkInfo = NULL;
// set flag
m_linkInfo1Valid = true;
// . validate the hop count thing too
// . i took hopcount out of linkdb to put in lower ip byte for steve
//m_minInlinkerHopCount = -1;//m_msg25.getMinInlinkerHopCount();
// return it
return ptr_linkInfo1;
}
static void *s_null = NULL;
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
LinkInfo **XmlDoc::getLinkInfo2 ( ) {
// this can now be title hashes for XmlDoc::m_diffbotTitleHashes
// but otherwise, we don't use it for link info from another cluster
// any more.
m_linkInfo2Valid = true;
return (LinkInfo **)&s_null;
// return if we got it
if ( m_linkInfo2Valid ) return &ptr_linkInfo2;
m_linkInfo2Valid = true;
ptr_linkInfo2 = NULL;
return &ptr_linkInfo2;
/*
if ( ! cr->m_importFromHosts2Conf ) {
m_linkInfo2Valid = true;
ptr_linkInfo2 = NULL;
return &ptr_linkInfo2;
}
// change status
setStatus ( "getting remote hosts2.conf inlinkers" );
XmlDoc **od = getOldXmlDoc ( );
if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo **)od;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo **)sni;
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo **)ip;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (LinkInfo **)d;
// grab a ptr to the LinkInfo contained in our Doc class
LinkInfo *oldLinkInfo2 = NULL;
if ( *od ) oldLinkInfo2 = *(*od)->getLinkInfo2();
// . now search for some link info for this url/doc
// . this queries the search engine to get linking docIds along
// with their termIds/scores from anchor text and then compiles
// it all into one IndexList
// . if we have no linkers to this url then we set siteHash, etc.
// for this linkInfo class
// . this is my google algorithm
// . let's use the first url (before redirects) for this
// . m_newDocId is used for classifying doc under predefined news topic
// . catSiteRec is used for classifying pages under a predefined
// newstopic. this is currently for news search only.
// . use the rootTitleRecPtr if there and we are doing our link info
// stuff in this collection, but if doing it in another collection
// the msg25 will look up the root in that collection...
if ( ! m_calledMsg25b ) {
// do not redo it
m_calledMsg25b = true;
// shortcut
Msg25 *m = &m_msg25;
// can we be cancelled?
bool canBeCancelled = true;
// not if pageparser though
if ( m_pbuf ) canBeCancelled = false;
// not if injecting
if ( ! m_sreqValid ) canBeCancelled = false;
// use this buffer so XmlDoc::print() can display wherever
//SafeBuf *sb = NULL;
//if ( m_pbuf ) sb = &m_pageLinkBuf2;
// call it
if ( ! m->getPageLinkInfo2 ( getFirstUrl() ,
m_coll ,
cr->m_externalColl ,
m_masterState ,
m_masterLoop ,
cr->m_doLinkSpamCheck ,
cr->m_oneVotePerIpDom ,
canBeCancelled ) )
// blocked
return (LinkInfo **)-1;
// error?
if ( g_errno ) return NULL;
}
// at this point assume its valid
m_linkInfo2Valid = true;
// get the link info we got set
ptr_linkInfo2 = m_msg25.m_linkInfo;
// we should free it
m_freeLinkInfo2 = true;
// take it from msg25 permanently
m_msg25.m_linkInfo = NULL;
// set flag
m_linkInfo2Valid = true;
// validate the hop count thing too
//m_minInlinkerHopCount = m_msg25.getMinInlinkerHopCount();
// return it
return &ptr_linkInfo2;
*/
}
static void gotSiteWrapper ( void *state ) ;
// . we should store the site in the title rec because site getter might
// change what it thinks the site is!
char *XmlDoc::getSite ( ) {
// was there a problem getting site?
if ( m_siteValid && m_siteGetter.m_errno ) {
g_errno = m_siteGetter.m_errno;
return NULL;
}
// ok, return it
if ( m_siteValid ) return ptr_site;//m_siteGetter.m_site;
// note it
setStatus ( "getting site");
// need this
TagRec *gr = getTagRec();
// sanity check
if ( ! gr && ! g_errno ) { char *xx=NULL;*xx=0; }
// blocked or error?
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// get url
Url *f = getFirstUrl();
// bogus first url? prevent core in getIsSiteRoot().
if ( f->getUrlLen() <= 1 ) {
log("xmldoc: getSite: got bogus first url.");
g_errno = EBADURL;
return NULL;
}
// this must be valid
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
int32_t timestamp = getSpideredTime();//m_spideredTime;
// add tags to tagdb?
//bool addTags = true;
//if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false;
//if ( getIsPageParser() ) addTags = false;
// do it
if ( ! m_siteGetter.getSite ( f->getUrl() ,
gr ,
timestamp ,
cr->m_collnum ,
m_niceness ,
//addTags ,
this , // state
gotSiteWrapper ))
// return -1 if we blocked
return (char *)-1;
// error?
if ( g_errno ) return NULL;
// set these then
gotSite();
return ptr_site;//m_siteGetter.m_site;
}
// set it
void gotSiteWrapper ( void *state ) {
// point to us
XmlDoc *THIS = (XmlDoc *)state;
THIS->gotSite ();
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
}
void XmlDoc::gotSite ( ) {
// sanity check
if ( ! m_siteGetter.m_allDone && ! g_errno ) { char *xx=NULL;*xx=0; }
// this sets g_errno on error
ptr_site = m_siteGetter.m_site;
size_site = m_siteGetter.m_siteLen+1; // include \0
// sanity check -- must have a site
if ( ! g_errno && size_site <= 1 ) { char *xx=NULL;*xx=0; }
// sitegetter.m_errno might be set!
m_siteValid = true;
// must be valid
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
// add the sitepathdepth tag to our tagrec
//Tag *a = m_siteGetter.m_addedTag.getFirstTag();
//if ( a ) m_newTagRec.addTag ( a );
}
int64_t *XmlDoc::getSiteHash64 ( ) {
if ( m_siteHash64Valid ) return &m_siteHash64;
char *site = getSite();
// sanity check
if ( ! site && ! g_errno ) { char *xx=NULL;*xx=0; }
if ( ! site || site == (void *)-1) return (int64_t *)site;
m_siteHash64 = hash64 ( site , gbstrlen(site) );
m_siteHash64Valid = true;
return &m_siteHash64;
}
int32_t *XmlDoc::getSiteHash32 ( ) {
if ( m_siteHash32Valid ) return &m_siteHash32;
char *site = getSite();
if ( ! site || site == (void *)-1) return (int32_t *)site;
m_siteHash32 = hash32 ( site , gbstrlen(site) );
m_siteHash32Valid = true;
return &m_siteHash32;
}
void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
XmlDoc *THIS = (XmlDoc *)state;
bool hadError = false;
THIS->setStatus("got diffbot reply");
// wha?
if ( g_errno ) {
log("diffbot: http error2 %s",mstrerror(g_errno));
THIS->m_diffbotReplyError = g_errno;
hadError = true;
}
// just retry if connection got reset by peer!
if ( g_errno == ECONNRESET ||
g_errno == ETIMEDOUT ) {
retry:
// reset error in case was set below before our retry.
// getDiffbotReply() will retry because we never set
// m_diffbotReplyValid to true, below.
THIS->m_diffbotReplyError = 0;
log("buld: retrying diffbot reply");
THIS->m_diffbotReplyRetries++;
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
return;
}
THIS->m_diffbotReplyEndTime = gettimeofdayInMillisecondsGlobal();
//char *buf = s->m_readBuf;
// do not allow TcpServer.cpp to free it since m_diffbotReply
// is now responsible for that
//s->m_readBuf = NULL;
// set the mime
HttpMime mime;
if ( ! hadError && s && s->m_readOffset>0 &&
// set location url to "null"
! mime.set ( s->m_readBuf , s->m_readOffset , NULL ) ) {
// g_errno should be set
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("build: error setting diffbot mime");
THIS->m_diffbotReplyError = EDIFFBOTMIMEERROR;
hadError = true;
}
bool retryUrl = false;
// check the status
if ( ! hadError && mime.getHttpStatus() != 200 ) {
THIS->m_diffbotReplyError = EDIFFBOTBADHTTPSTATUS;
log("xmldoc: diffbot reply mime was %"INT32"",
mime.getHttpStatus());
hadError = true;
// gateway timed out? then retry.
if ( mime.getHttpStatus() == 504 )
retryUrl = true;
}
if ( hadError )
log("build: diffbot error for url %s",
THIS->m_diffbotUrl.getBufStart());
CollectionRec *cr = THIS->getCollRec();
if ( cr && strncmp(cr->m_coll,"crawlbottesting-",16) == 0 ) {
log("build: diffbot reply for url %s = %s",
THIS->m_diffbotUrl.getBufStart(),
s->m_readBuf);
}
if ( retryUrl )
goto retry;
// get page content
char *page = NULL;
int32_t pageLen = 0;
if ( ! hadError && mime.getMimeLen() >= 0 ) {
page = s->m_readBuf + mime.getMimeLen();
char *end = s->m_readBuf + s->m_readOffset;
pageLen = end - page;
}
// "-1" means diffbot had an error
if ( page &&
page[0] == '-' &&
page[1] == '1' ) {
log("xmldoc: diffbot reply was -1");
THIS->m_diffbotReplyError = EDIFFBOTINTERNALERROR;
}
// . verify that it contains legit json and has the last field
// b/c we saw a case where the diffbot reply was truncated
// somehow
// . check to make sure it has the "url": field as all diffbot
// json replies must
if ( ! THIS->m_diffbotReplyError ) {
char *ttt = strstr ( page , "\"url\":\"");
if ( ! ttt ) ttt = strstr ( page , "\"pageUrl\":\"");
if ( ! ttt ) {
log("xmldoc: diffbot reply for %s using %s is missing "
"the url: field in the json reply. reply=%s",
THIS->m_firstUrl.m_url,
THIS->m_diffbotUrl.getBufStart(),
page
);
// try to get the right error code
char *err = strstr(page,"\"error\":\"");
if ( err ) err += 9;
int32_t code = EDIFFBOTUNKNOWNERROR;
if ( ! err &&
page[0]=='{' &&
page[1]=='}' )
code = EDIFFBOTCURLYREPLY;
if ( err && !strncmp(err,"Unable to apply rules",21))
code = EDIFFBOTUNABLETOAPPLYRULES;
// like .pdf pages get this error
if ( err && !strncmp(err,"Could not parse page",20))
code = EDIFFBOTCOULDNOTPARSE;
// if it is 404... 502, etc. any http status code
if ( err && !strncmp(err,"Could not download page",23))
code = EDIFFBOTCOULDNOTDOWNLOAD;
// custom api does not apply to the url
if ( err && !strncmp(err,"Invalid API",11))
code = EDIFFBOTINVALIDAPI;
if ( err && !strncmp(err,"Version required",16))
code = EDIFFBOTVERSIONREQ;
if ( err && !strncmp(err,"Empty content",13))
code = EDIFFBOTEMPTYCONTENT;
if ( err && !strncmp(err,"The selected pages contains too many TextNodes",46))
code = EDIFFBOTTOOMANYTEXTNODES;
if ( err && !strncmp(err,"No content received",19))
code = EDIFFBOTEMPTYCONTENT;
if ( err && !strncmp(err,"Request timed",13))
code = EDIFFBOTREQUESTTIMEDOUT;
if ( err &&!strncmp(err,"Request of third-party c",24))
code = EDIFFBOTREQUESTTIMEDOUTTHIRDPARTY;
// error processing url
if ( err && !strncmp(err,"Error processing",16))
code = EDIFFBOTURLPROCESSERROR;
if ( err && !strncmp(err,"Your token has exp",18))
code = EDIFFBOTTOKENEXPIRED;
if ( err && !strncmp(err,"Not authorized API tok",22))
code = EDIFFBOTTOKENUNAUTHORIZED;
if ( err && !strncmp(err,"Error.",6) )
code = EDIFFBOTPLAINERROR;
THIS->m_diffbotReplyError = code;
}
// a hack for detecting if token is expired
		if ( cr && THIS->m_diffbotReplyError == EDIFFBOTTOKENEXPIRED ) {
// note it
log("xmldoc: pausing crawl %s (%"INT32") because "
"token is expired",cr->m_coll,
(int32_t)cr->m_collnum);
// pause the crawl
SafeBuf parmList;
// spidering enabled is the "cse" cgi parm in Parms.cpp
g_parms.addNewParmToList1 ( &parmList ,
cr->m_collnum,
"0", // val
-1 ,
"cse");
// this uses msg4 so parm ordering is guaranteed
g_parms.broadcastParmList ( &parmList , NULL , NULL );
}
}
// reply is now valid but might be empty
THIS->m_diffbotReplyValid = true;
// if json reply was truncated, that is an error as well.
// likewise we have to check if such bad json is in the serps
// when doing an icc=1 and print 'bad json' in json instead.
if ( ! THIS->m_diffbotReplyError && s->m_readOffset > 1 &&
// json must end with '}' (ignores trailing whitespace)
! endsInCurly ( s->m_readBuf , s->m_readOffset ) ) {
// hopefully this can be re-tried later.
THIS->m_diffbotReplyError = EJSONMISSINGLASTCURLY;
// make a note of it
log("build: got diffbot reply missing curly for %s",
THIS->m_firstUrl.m_url);
}
//if ( ! cr ) return;
bool countIt = true;
if ( ! cr ) countIt = false;
if ( THIS->m_diffbotReplyError ) countIt = false;
/*
// solution for bug #2092 but probably not really needed so
// commented out.
	// if doing /vxxx/analyze?mode=xxxx then ensure matches
bool isAnalyze = false;
if ( countIt &&
THIS->m_diffbotApiUrlValid &&
strstr ( THIS->m_diffbotApiUrl.getBufStart(), "/analyze?") )
isAnalyze = true;
char *mode = NULL;
if ( isAnalyze ) {
mode = strstr (THIS->m_diffbotApiUrl.getBufStart(), "mode=");
if ( mode ) mode += 5;
// find end of it
}
char *pageType = NULL;
int32_t pageTypeLen;
if ( mode &&
THIS->m_diffbotReplyValid &&
THIS->m_diffbotReply.length() > 5 ) {
char *reply = THIS->m_diffbotReply.getBufStart();
pageType = strstr ( reply , "\"type\":\"" );
if ( pageType ) pageType += 8;
char *e = pageType;
for ( ; *e && *e != '\"' ; e++ );
pageTypeLen = e - pageType;
}
// if it does not match, do not count it
if ( mode && pageType && strncmp ( mode , pageType , pageTypeLen ) )
countIt = false;
*/
// increment this counter on a successful reply from diffbot
if ( countIt ) { // ! THIS->m_diffbotReplyError && cr ) {
// mark this flag
THIS->m_gotDiffbotSuccessfulReply = 1;
// count it for stats
cr->m_localCrawlInfo.m_pageProcessSuccesses++;
cr->m_globalCrawlInfo.m_pageProcessSuccesses++;
// per round as well
cr->m_localCrawlInfo.m_pageProcessSuccessesThisRound++;
cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound++;
// log it
log(LOG_INFO,
"build: processed page %s (pageLen=%"INT32")",
THIS->m_firstUrl.m_url,
pageLen);
// changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
// sanity!
// crap, this can happen if we try to get the metalist
// of an old page for purposes of incremental indexing or
// deletion. we do not re-download it, but it seems we try
// to re-process it...
//if ( cr->m_localCrawlInfo.m_pageProcessAttempts >
// cr->m_localCrawlInfo.m_pageDownloadAttempts ) {
// char *xx=NULL;*xx=0; }
// need to save collection rec now during auto save
cr->m_needsSave = true;
// the diffbot api url we used
//SafeBuf *au = THIS->getDiffbotApiUrl();
//if ( ! au || au == (void *)-1 ) {char *xx=NULL;*xx=0;}
// set the reply properly
int32_t need = pageLen + 1;// + au->length() + 1;
if ( ! THIS->m_diffbotReply.reserve ( need ) )
goto skip;
// first store the url we used on first line
//THIS->m_diffbotReply.safeMemcpy ( au->getBufStart(),
// au->length() );
//THIS->m_diffbotReply.pushChar('\n');
// convert the \u1f23 to utf8 (\n and \r as well)
// crap, this decodes \\\\\" to \\" which is causing
// the json parser to believe it is an encoded \ then
// a REAL quote... but quote is contained...
//THIS->m_diffbotReply.safeDecodeJSONToUtf8 ( page , pageLen ,
// THIS->m_niceness );
		// do not do that anymore; jsonparse can call it
		// on a per-string basis
THIS->m_diffbotReply.safeMemcpy ( page , pageLen );
// convert embedded \0 to space
//char *p = THIS->m_diffbotReply.getBufStart();
//char *pend = p + THIS->m_diffbotReply.getLength();
// tack on a \0 but don't increment m_length
THIS->m_diffbotReply.nullTerm();
// any embedded \0's in the utf8?
int32_t testLen1 = THIS->m_diffbotReply.length();
int32_t testLen2 = gbstrlen(THIS->m_diffbotReply.getBufStart());
if ( testLen1 != testLen2 ) { char *xx=NULL;*xx=0; }
// convert the \u1f23 to utf8 (\n and \r as well)
//THIS->m_diffbotReply.decodeJSONToUtf8 ( THIS->m_niceness );
//THIS->m_diffbotReply.nullTerm();
}
skip:
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
}
SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
if ( m_diffbotApiUrlValid )
return &m_diffbotApiUrl;
// if we are a diffbot json object, do not re-send to diffbot!
if ( m_isDiffbotJSONObject ) {
//m_diffbotApiNum = DBA_NONE;
m_diffbotApiUrlValid = true;
return &m_diffbotApiUrl;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
m_diffbotApiUrl.safeMemcpy ( &cr->m_diffbotApiUrl );
m_diffbotApiUrl.nullTerm();
m_diffbotApiUrlValid = true;
// this now automatically sets m_diffbotApiUrl and m_diffbotApiUrlValid
// in case the url filters table changes while spidering this!!!
// gotta be careful of that.
//int32_t *ufn = getUrlFilterNum();
//if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn;
// ensure it does set it!
//if ( ! m_diffbotApiUrlValid ) { char *xx=NULL;*xx=0; }
//m_diffbotApiNum = cr->m_spiderDiffbotApiNum[*ufn];
// sanity check
//if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; }
//m_diffbotApiNumValid = true;
return &m_diffbotApiUrl;
}
// if only processing NEW URLs is enabled, then do not get diffbot reply
// if we already got one before
bool *XmlDoc::getRecycleDiffbotReply ( ) {
if ( m_recycleDiffbotReplyValid )
return &m_recycleDiffbotReply;
// if from pageparser.cpp re-call diffbot for debugging
if ( getIsPageParser() ) {
m_recycleDiffbotReply = false;
m_recycleDiffbotReplyValid = true;
return &m_recycleDiffbotReply;
}
XmlDoc **odp = getOldXmlDoc( );
if ( ! odp || odp == (XmlDoc **)-1 ) return (bool *)odp;
XmlDoc *od = *odp;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if doc has been successfully processed in the past then
// ***RECYCLE*** the diffbot reply!
m_recycleDiffbotReply = false;
if ( cr->m_diffbotOnlyProcessIfNewUrl &&
od && od->m_gotDiffbotSuccessfulReply )
m_recycleDiffbotReply = true;
// to fight off corrupted title recs just assume that even though
// we could not uncompress the title rec that it had a successful reply
// if ( cr->m_diffbotOnlyProcessIfNewUrl &&
// m_oldDocExistedButHadError )
// m_recycleDiffbotReply = true;
	// don't recycle if specifically asked to reindex though
if ( m_sreqValid && m_sreq.m_isPageReindex )
m_recycleDiffbotReply = false;
// unless the 'recycle content' checkbox was checked when doing
// the query (page) reindex...
if ( m_sreqValid && m_sreq.m_recycleContent )
m_recycleDiffbotReply = true;
m_recycleDiffbotReplyValid = true;
return &m_recycleDiffbotReply;
}
// get hashes of the json objects in the diffbotreply
int32_t *XmlDoc::getDiffbotTitleHashes ( int32_t *numHashes ) {
*numHashes = size_linkInfo2 / 4;
if ( ! ptr_linkInfo2 ) *numHashes = 0;
// hack: use linkdbdata2 field
if ( m_diffbotTitleHashBufValid ) {
// do not return NULL without g_errno set
if ( ptr_linkInfo2 == NULL ) return (int32_t *)0x01;
return (int32_t *)ptr_linkInfo2;
}
SafeBuf *tdbr = getTokenizedDiffbotReply();
if ( ! tdbr || tdbr == (void *)-1 ) return (int32_t *)tdbr;
HashTableX dedup;
if ( ! dedup.set ( 4,0,1024,NULL,0,false,m_niceness,"ddthbuf") )
return NULL;
// parse out the json items in the reply
char *p = tdbr->getBufStart();
char *pend = p + tdbr->length();
int32_t plen;
for ( ; p < pend ; p += plen + 1 ) {
// breathe some in case diffbot reply is 250MB
QUICKPOLL(m_niceness);
// set this
plen = gbstrlen(p);
// get title from it
int32_t valLen;
char *val = getJSONFieldValue ( p , "title", &valLen );
int32_t th32 = 0;
// hash the title
if ( val && valLen ) {
th32 = hash32 ( val , valLen );
// avoid 0
if ( th32 == 0 ) th32 = 1;
}
// if no title, use hash of body
if ( th32 == 0 ) {
th32 = hash32 ( p , plen );
// avoid 0
if ( th32 == 0 ) th32 = 2;
}
// if our hash is duplicated then increment until unique
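		// e.g. (hypothetical) two items both titled "Widget" hash
		// to the same value H; the second is stored as H+1 so each
		// json item keeps a distinct title hash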
while ( dedup.isInTable ( &th32 ) ) th32++;
// store it for deduping
dedup.addKey ( &th32 );
// store it
m_diffbotTitleHashBuf.pushLong(th32);
}
ptr_linkInfo2 = (LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
size_linkInfo2 = m_diffbotTitleHashBuf.length();
*numHashes = size_linkInfo2 / 4;
m_diffbotTitleHashBufValid = true;
// if no hashes return 0x01 because NULL means g_errno
if ( ptr_linkInfo2 == NULL ) return (int32_t *)0x01;
return (int32_t *)ptr_linkInfo2;
}
// . we now get the TOKENIZED diffbot reply.
// . that converts a single diffbot reply into multiple \0 separated
// json objects.
// . for instance, the diffbot product api returns an array like
// "products":[{...},{...}],"url":... that consists of multiple
// json product items, but the json elements that are not in
// this array are description of the page itself, like url and title.
// so we need to carry over these outer json objects to each
// inner json object we tokenize.
// . in this fashion we'll have separate objects that can each be indexed
// as a single page, which is what we want for searching.
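// . illustrative (hypothetical) example: a reply like
//     {"url":"http://example.com/","title":"T","products":[{"a":1},{"b":2}]}
//   is tokenized into two \0-separated objects that each keep the outer
//   fields:
//     {"url":"http://example.com/","title":"T","product":{"a":1}}\0
//     {"url":"http://example.com/","title":"T","product":{"b":2}}\0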
SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) {
if ( m_tokenizedDiffbotReplyValid )
return m_tokenizedDiffbotReplyPtr;
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return dbr;
// empty? that's easy. might be just "{}\n" i guess
if ( dbr->length() <= 3 ) return dbr;
char *text = dbr->getBufStart();
Json jp;
if ( ! jp.parseJsonStringIntoJsonItems ( text , m_niceness ) ) {
g_errno = EBADJSONPARSER;
return NULL;
}
JsonItem *jsonItem = jp.getItem("objects");
char *array = NULL;
int32_t arrayLen = 0;
if ( jsonItem ) {
array = jsonItem->getArrayStart();
arrayLen = jsonItem->getArrayLen();
}
if ( array && arrayLen > 0 ) {
m_v3buf.safeMemcpy( array , arrayLen );
m_v3buf.nullTerm();
// trim off the enclosing []'s
char *p = m_v3buf.getBufStart();
for ( ; *p && is_wspace_a(*p) ; p++ );
if ( *p == '[') *p = ' ';
char *e = m_v3buf.getBuf()-1;
for ( ; e>p && is_wspace_a(*e) ;e--);
if ( *e ==']') *e=' ';
// replace top level commas with \0's
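		// e.g. (illustrative) an "objects" array body of
		//   {"a":1},{"b":2,"c":"x,y"},{"d":3}
		// becomes the \0-separated items
		//   {"a":1}\0{"b":2,"c":"x,y"}\0{"d":3}
		// (commas inside quotes or nested curlies are left alone)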
int32_t curlies = 0;
char *x = p;
bool inQuotes = false;
// scan now
for ( ; *x ; x++ ) {
// escaping a backslash?
if ( *x == '\\' && x[1] == '\\' ) {
// skip two bytes then..
x++;
continue;
}
// escaping a quote? ignore quote then.
if ( *x == '\\' && x[1] == '\"' ) {
// skip two bytes then..
x++;
continue;
}
if ( *x == '\"' ) {
inQuotes = ! inQuotes;
continue;
}
// if in a quote, ignore {} in there
if ( inQuotes ) continue;
if ( *x== '{' ) {
curlies++;
continue;
}
if ( *x == '}' ) {
curlies--;
continue;
}
if ( curlies != 0 ) continue;
if ( *x == ',' ) *x = '\0';
}
m_tokenizedDiffbotReplyPtr = &m_v3buf;
m_tokenizedDiffbotReplyValid = true;
return m_tokenizedDiffbotReplyPtr;
}
// it must have \"type\":\"product or \"type\":\"image
// in order for us to do the array separation logic below.
// we don't want to do this logic for articles because they
// contain an image array!!!
// this must be on the FIRST level of the json object, otherwise
// we get errors because we got type:article and it
// contains an images array!
int32_t valLen;
char *val = getJSONFieldValue ( text , "type", &valLen );
bool isProduct = false;
bool isImage = false;
if ( val && valLen == 7 && strncmp ( val , "product", 7) == 0 )
isProduct = true;
if ( val && valLen == 5 && strncmp ( val , "image", 5) == 0 )
isImage = true;
if ( ! isProduct && ! isImage ) {
m_tokenizedDiffbotReplyValid = true;
m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
return m_tokenizedDiffbotReplyPtr;
}
char *needle;
char *newTerm;
if ( isProduct ) {
needle = ",\"products\":[";
newTerm = "product";
}
else {
needle = ",\"images\":[";
newTerm = "image";
}
char *parray = strstr ( text , needle );
// if not found, no need to do anything...
if ( ! parray ) {
m_tokenizedDiffbotReplyValid = true;
m_tokenizedDiffbotReplyPtr = &m_diffbotReply;
return m_tokenizedDiffbotReplyPtr;
}
// point to [
char *pstart = parray + gbstrlen(needle) - 1;
//
	// ok, now we have to do some json ju jitsu to fix it
//
// point to array. starting at the '['
char *p = pstart;
int32_t brackets = 0;
bool inQuotes = false;
for ( ; *p ; p++ ) {
// escaping a quote? ignore quote then.
if ( *p == '\\' && p[1] == '\"' ) {
// skip two bytes then..
p++;
continue;
}
if ( *p == '\"' ) {
inQuotes = ! inQuotes;
continue;
}
// if in a quote, ignore {} in there
if ( inQuotes ) continue;
if ( *p == '[' ) brackets++;
if ( *p != ']' ) continue;
brackets--;
// stop if array is done. p points to ']'
if ( brackets == 0 ) break;
}
	// now point to outer items to the left of the ",\"products\":[...
char *left1 = dbr->getBufStart();
char *left2 = parray;
// then to the right. skip over the ending ']'
char *right1 = p + 1;
char *right2 = dbr->getBuf(); // end of the buffer
SafeBuf *tbuf = &m_tokenizedDiffbotReply;
// now scan the json products or images in the array
char *x = pstart;
// skip over [
x++;
// each product item in array is enclosed in {}'s
if ( *x != '{' ) {
log("build: something is wrong with diffbot reply");
g_errno = EBADENGINEER;
return NULL;
}
// reset CURLY bracket count
int32_t curlies = 0;
char *xstart = NULL;
inQuotes = false;
// scan now
for ( ; x < right1 ; x++ ) {
// escaping a quote? ignore quote then.
if ( *x == '\\' && x[1] == '\"' ) {
// skip two bytes then..
x++;
continue;
}
if ( *x == '\"' ) {
inQuotes = ! inQuotes;
continue;
}
// if in a quote, ignore {} in there
if ( inQuotes ) continue;
if ( *x== '{' ) {
if ( curlies == 0 ) xstart = x;
curlies++;
continue;
}
if ( *x == '}' ) {
curlies--;
if ( curlies != 0 ) continue;
// unreciprocated '{'? wtf???
if ( ! xstart ) continue;
// skip empty curlies
if ( x[-1] == '{' ) continue;
//
// ok, we got an item!
//
// left top items
if ( ! tbuf->safeMemcpy ( left1 , left2-left1 ) )
return NULL;
// use "product":
if ( ! tbuf->safePrintf(",\"%s\":" , newTerm ) )
return NULL;
			// the item itself, include its curlies.
if ( ! tbuf->safeMemcpy ( xstart , x - xstart+1 ) )
return NULL;
// right top items
if ( ! tbuf->safeMemcpy ( right1 , right2-right1 ) )
return NULL;
// then a \0
if ( ! tbuf->pushChar('\0') )
return NULL;
// reset this!
xstart = NULL;
}
}
// now show the items. debug!
//p = tbuf->getBufStart();
//for ( ; p < tbuf->getBuf() ; p += gbstrlen(p) + 1 )
// fprintf(stderr,"ITEM\n%s\n\n",p);
m_tokenizedDiffbotReplyPtr = tbuf;
m_tokenizedDiffbotReplyValid = true;
return m_tokenizedDiffbotReplyPtr;
}
void gotDiffbotProxyReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_diffbotProxyReply = NULL;
// if a valid reply, then point to it
if ( slot->m_readBufSize == sizeof(ProxyReply) ) {
THIS->m_diffbotProxyReply = (ProxyReply *)slot->m_readBuf;
// steal it, we will free it in XmlDoc::reset()
slot->m_readBuf = NULL;
}
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
}
// . convert document into json representing multiple documents
// if it makes sense. sometimes a single url contains multiple
// subdocuments that each should have their own url, but do not,
// so we fix that here.
// . the diffbot reply will be a list of json objects we want to index
SafeBuf *XmlDoc::getDiffbotReply ( ) {
// got reply of malformed json missing final '}'
if ( m_diffbotReplyValid &&
m_diffbotReplyError == EJSONMISSINGLASTCURLY ) {
// hopefully spider will retry later
g_errno = m_diffbotReplyError;
return NULL;
}
if ( m_diffbotReplyValid )
return &m_diffbotReply;
// . check the url filters table to see if diffbot api is specified
// . just return "\0" if none, but NULL means error i guess
SafeBuf *au = getDiffbotApiUrl();
if ( ! au || au == (void *)-1 ) return (SafeBuf *)au;
// if no url, assume do not access diffbot
if ( au->length() <= 0 ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// if we are json do not send that to diffbot, like an injected
	// json diffbot object. should fix json injections into global index
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (SafeBuf *)ct;
if ( *ct == CT_JSON ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// we make a "fake" url for the diffbot reply when indexing it
// by appending -diffbotxyz%"UINT32". see "fakeUrl" below.
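	// e.g. a (hypothetical) indexed child url would look like
	//   http://example.com/page.html-diffbotxyz1234567890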
if ( m_firstUrl.getUrlLen() + 24 >= MAX_URL_LEN ) {
if ( m_firstUrlValid )
log("build: diffbot url would be too long for "
"%s", m_firstUrl.getUrl() );
else
log("build: diffbot url would be too long for "
"%"INT64"", m_docId );
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// getIndexCode() calls getDiffbotReply(), so avoid a loop!
//if ( *getIndexCode() )
// return &m_diffbotReply;
if ( m_indexCodeValid && m_indexCode )
return &m_diffbotReply;
if ( m_isDiffbotJSONObject ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// if this is a robots.txt or a root page we are downloading
// separately to get the title for to compare to this page's title,
// or whatever, do not pass to diffbot
if ( m_isChildDoc ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// get list of substring patterns
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
if ( ucp && ! ucp[0] ) ucp = NULL;
// do we match the url process pattern or regex?
// get the compiled regular expressions
//regex_t *ucr = &cr->m_ucr;
regex_t *upr = &cr->m_upr;
//if ( ! cr->m_hasucr ) ucr = NULL;
if ( ! cr->m_hasupr ) upr = NULL;
// get the url
Url *f = getFirstUrl();
char *url = f->getUrl();
// . "upp" is a ||-separated list of substrings
// . "upr" is a regex
// . regexec returns 0 for a match
if ( upr && regexec(upr,url,0,NULL,0) ) {
// return empty reply
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
if ( upp && !upr &&!doesStringContainPattern(url,upp)) {
// return empty reply
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// if already processed and onlyprocessifnewurl is enabled then
// we recycle and do not bother with this, we also do not nuke
// the diffbot json objects we have already indexed by calling
// nukeJSONObjects()
bool *recycle = getRecycleDiffbotReply();
if ( ! recycle || recycle == (void *)-1) return (SafeBuf *)recycle;
if ( *recycle ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// if set from title rec, do not do it. we are possibly an "old doc"
// and we should only call diffbot.com with new docs
if ( m_setFromTitleRec ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// "none" means none too! Parms.cpp doesn't like &dapi1=& because
	// it does not call setParm() on such things (even though it probably
	// should) since it doesn't like empty values, so i put "none" in there.
if ( strncasecmp(au->getBufStart(),"none",4) == 0 ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
if ( strncasecmp(au->getBufStart(),"donotprocess",12) == 0 ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// invalid url?
Url apiUrl; apiUrl.set ( au->getBufStart() );
if (apiUrl.getUrlLen() <= 0 ||
apiUrl.getHostLen() <= 0 ||
apiUrl.getDomainLen() <= 0 ) {
log("build: invalid diffbot api url of \"%s\".",
au->getBufStart() );
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// when respidering an "old" doc, never call this. we already
// have the diffbot replies xyz.com/-diffbot-0 and xyz.com/-diffbot-1
// etc.
//if ( m_setFromTitleRec ) { char *xx = NULL; *xx = 0; }
// sanity check. no! barfs on legit url with -diffbot- in it
//if ( strstr(m_firstUrl.m_url,"-diffbot-") ) {
// char *xx=NULL; *xx = 0; }
// we should not "process" (i.e. send to diffbot) urls that do
// not match the supplied CollectionRec::m_diffbotUrlProcessPattern
// let's just put a checkbox in the url filters box for this!
// i.e. Send to Diffbot? [X]
//if ( m_useDiffbot && ! doesUrlMatchDiffbotProcessPattern() ) {
// m_diffbotReplyValid = true;
// return &m_diffbotReply;
//}
// empty content, do not send to diffbot then
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (SafeBuf *)u8;
if ( ! *u8 ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// do not send to diffbot if its binary!
char *ib = getIsBinary();
if ( ! ib || ib == (void *)-1 ) return (SafeBuf *)ib;
if ( *ib ) {
m_diffbotReplyValid = true;
log("diffbot: skipping binary page %s",m_firstUrl.m_url);
return &m_diffbotReply;
}
// or if original page content matches the page regex dont hit diffbot
if ( ! doesPageContentMatchDiffbotProcessPattern() ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
// now include referring link anchor text, etc.
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1;
setStatus("getting diffbot reply");
// set up dedup table for deduping on link text
HashTableX dedup;
char tmp[512];
if ( ! dedup.set ( 4,0,32,tmp,512,false,m_niceness,"difdedup") )
return NULL;
SafeBuf headers;
bool first = true;
// . make additional headers
// . add two headers for every "good" (non-dup) link
// . do NOT end headers in \r\n since HttpServer adds that!
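	// . an illustrative example of the extra headers built below
	//   (urls and text are hypothetical):
	//     X-referring-url: http://referrer.example.com/page.html
	//     X-anchor-text: example widgets
	//     X-surrounding-text: see our list of example widgets here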
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL(m_niceness);
// sanity
if ( k->size_urlBuf <= 1 ) continue;
// skip if too long
if ( k->size_linkText > 1024 ) continue;
// or not enough! (size includes \0)
if ( k->size_linkText <= 1 ) continue;
// sanity check
char *txt = k->getLinkText();
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// this seems to happen sometimes..
if ( ! verifyUtf8 ( txt , tlen ) ) continue;
// if anchor text has \0 skip it
if ( gbstrlen(txt) != tlen ) continue;
// or if surrounding text has \0 skip as well
char *surStr = k->getSurroundingText();
int32_t surLen = k->size_surroundingText;
if ( surLen > 0 ) surLen--;
if ( surStr && gbstrlen(surStr) != surLen ) continue;
// dedup on that
int32_t h32 = hash32 ( txt , tlen );
if ( dedup.isInTable ( &h32 ) ) continue;
if ( ! dedup.addKey ( &h32 ) ) return NULL;
// separate with \r\n
if ( ! first && ! headers.safePrintf("\r\n" ) )
return NULL;
first = false;
// add to http header
if ( ! headers.safePrintf("X-referring-url: ") )
return NULL;
// do not include the terminating \0, so -1
if ( ! headers.safeMemcpy(k->getUrl() , k->size_urlBuf-1 ))
return NULL;
// and link text
if ( ! headers.safePrintf("\r\nX-anchor-text: ") )
return NULL;
// store the anchor text without any \r or \n chars
if ( ! headers.reserve ( tlen ) ) return NULL;
char *p = txt;
char *pend = txt + tlen;
for ( ; p < pend ; p++ ) {
if ( *p == '\r' ) continue;
if ( *p == '\n' ) continue;
headers.pushChar(*p);
}
// do not include it if more than 2000 chars big
if ( surLen > 0 && surLen < 2000 ) {
if ( ! headers.safePrintf("\r\nX-surrounding-text: ") )
return NULL;
// make room for copying the surrounding text
if ( ! headers.reserve ( surLen ) ) return NULL;
// copy minus any \r or \n so its mime header safe
p = surStr;
pend = surStr + surLen;
for ( ; p < pend ; p++ ) {
if ( *p == '\r' ) continue;
if ( *p == '\n' ) continue;
headers.pushChar(*p);
}
}
}
// make sure to null term the headers
if ( headers.length() && ! headers.nullTerm() ) return NULL;
//char *path = "api";
//if ( strcmp(cr->m_diffbotApi.getBufStart(),"product") == 0 )
// path = "v2";
//
// DIFFBOT injection interface TODO
//
// if we are intercepting a direct injection diffbot request
// then we will probably take the exact same parms provided and
// just relay them to diffbot here. maybe Diffbot.cpp can set
// the original diffbot.com request url in this xmldoc class that
	// is being injected, using the url encoded in that request.
//
// url can be on the stack since httpserver.cpp makes an http mime
// from this url
//SafeBuf diffbotUrl;
// TODO: make sure "api" works as hostname for not just product...
//diffbotUrl.safePrintf("http://www.diffbot.com/");
// skip extra '/'?
//char *api = au->getBufStart();
//int32_t apiLen = au->length();
//if ( api && api[0] == '/' ) { api++; apiLen--; }
// append the custom url. i.e. /api/analyze?mode=auto&u=
//if ( api ) diffbotUrl.safeMemcpy ( api , apiLen );
// reset it in case we are a re-call from gotDiffbotReplyWrapper()
// if g_errno == ECONNRESET
m_diffbotUrl.reset();
// store the api url into here
m_diffbotUrl.safeMemcpy ( apiUrl.getUrl() , apiUrl.getUrlLen() );
// . m_diffbotApi Is like "article" or "product" etc.
// . if classify is true we always return the classification
// of the page in the JSON. like "type":"frontpage" regardless
// of the "api" specified.
// . otherwise, if classify is false empty json will be returned
// if there is no json objects of the specified page type, "api"
// . BUT if api is "all" return all types of json objects
// . SHOULD we return "type" in the json output?
/*
if ( *an == DBA_ALL )
diffbotUrl.safePrintf("analyze?mode=auto&" );
else if ( *an == DBA_ARTICLE_FORCE )
diffbotUrl.safePrintf("article?");
else if ( *an == DBA_ARTICLE_AUTO )
diffbotUrl.safePrintf("analyze?mode=article&");
else if ( *an == DBA_PRODUCT_FORCE )
diffbotUrl.safePrintf("product?");
else if ( *an == DBA_PRODUCT_AUTO )
diffbotUrl.safePrintf("analyze?mode=product&");
else if ( *an == DBA_IMAGE_FORCE )
diffbotUrl.safePrintf("image?");
else if ( *an == DBA_IMAGE_AUTO )
diffbotUrl.safePrintf("analyze?mode=image&");
else if ( *an == DBA_FRONTPAGE_FORCE )
diffbotUrl.safePrintf("frontpage?");
else if ( *an == DBA_FRONTPAGE_AUTO )
diffbotUrl.safePrintf("analyze?mode=frontpage&");
else {
log("build: unknown diffbot api num = %"INT32". assuming all",*an );
diffbotUrl.safePrintf("analyze?mode=auto&" );
}
*/
//CollectionRec *cr = getCollRec();
//if ( ! cr ) return NULL;
// add a '?' if none
if ( ! strchr ( apiUrl.getUrl() , '?' ) )
m_diffbotUrl.pushChar('?');
else
m_diffbotUrl.pushChar('&');
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
// only print token if we have one, because if user provides their
// own diffbot url (apiUrl in Parms.cpp) then they might include
// the token in that for their non-custom crawl. m_customCrawl=0.
if ( cr->m_diffbotToken.length())
m_diffbotUrl.safePrintf("token=%s",
cr->m_diffbotToken.getBufStart());
bool useProxies = true;
// user can turn off proxy use with this switch
if ( ! g_conf.m_useProxyIps ) useProxies = false;
// did collection override?
if ( cr->m_forceUseFloaters ) useProxies = true;
// we gotta have some proxy ips that we can use
if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
// until we fix https CONNECT support for https urls diffbot can't
// go through gb. we should fix that by downloading the whole page
// ourselves and sending it back, and tell diffbot's phantomjs not
// to do the certificate check.
//
// for now, allow http and NOT https urls through though.
// TODO: if the url redirects to an https url will this mess us up?
// if ( ! m_firstUrlValid )
// useProxies = false;
// if ( m_firstUrlValid && m_firstUrl.isHttps() )
// useProxies = false;
// turn off for now always
//useProxies = false;
if ( useProxies && ! m_diffbotProxyReplyValid && m_ipValid ) {
// a special opcode used in SpiderProxy.cpp
Msg13Request *r = &m_diffbotProxyRequest;
r->m_opCode = OP_GETPROXYFORDIFFBOT;
r->m_banProxyIp = 0;
r->m_urlIp = m_ip;
m_diffbotProxyReplyValid = true;
// get first alive host, usually host #0 but if he is dead then
// host #1 must take over! if all are dead, it returns host #0.
// so we are guaranteed "h will be non-null
Host *h = g_hostdb.getFirstAliveHost();
// now ask that host for the best spider proxy to send to
if ( ! g_udpServer.sendRequest ( (char *)r,
// just the top part of the
// Msg13Request is sent to
// handleRequest54() now
r->getProxyRequestSize() ,
0x54 , // msgType 0x54
h->m_ip ,
h->m_port ,
-1 , // h->m_hostId ,
NULL ,
this , // state data
gotDiffbotProxyReplyWrapper,
9999999 )){// 99999sectimeout
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// report it
log("spider: msg54 request3: %s %s",
mstrerror(g_errno),r->ptr_url);
return NULL;
}
// wait for reply
return (SafeBuf *)-1;
}
// if we used a proxy to download the doc, then diffbot should too
// BUT tell diffbot to go through host #0 so we can send it to the
// correct proxy using our load balancing & backoff algos.
	// guard against a missing proxy reply; gotDiffbotProxyReplyWrapper()
	// leaves m_diffbotProxyReply NULL if the reply was not valid
	if ( useProxies && m_diffbotProxyReply ) {
//Host *h0 = g_hostdb.getHost(0);
// use a random host now to avoid host #0 running
// out of sockets from diffbot trying to connect
// for downloading hundreds of urls from the same
// high crawl delay site.
// round robin over the hosts just to be more evenly
// distributed. it will likely get several http requests
// from diffbot.
// static int32_t s_lastHostId = -1;
// if ( s_lastHostId == -1 )
// s_lastHostId = g_hostdb.m_myHost->m_hostId;
// int32_t r = s_lastHostId;//rand() % g_hostdb.m_numHosts;
// if ( ++s_lastHostId >= g_hostdb.m_numHosts )
// s_lastHostId = 0;
// Host *h0 = g_hostdb.getHost(r);
// m_diffbotUrl.safePrintf("&proxy=%s:%"INT32"",
// iptoa(h0->m_ip),
// (int32_t)h0->m_httpPort);
ProxyReply *prep = m_diffbotProxyReply;
m_diffbotUrl.safePrintf("&proxy=%s:%"UINT32"",
iptoa(prep->m_proxyIp),
(uint32_t)prep->m_proxyPort);
m_diffbotUrl.safePrintf("&proxyAuth=");
m_diffbotUrl.urlEncode(prep->m_usernamePwd);
}
// char *p = g_conf.m_proxyAuth.getBufStart();
// if ( useProxies && p ) {
// char *p1 = p;
// for ( ; *p1 && is_wspace_a(*p1) ; p1++ );
// char *p2 = p1;
// for ( ; *p2 && ! is_wspace_a(*p2) ; p2++ );
// char c = *p2;
// *p2 = '\0';
// m_diffbotUrl.safePrintf("&proxyAuth=");
// m_diffbotUrl.urlEncode(p1);
// *p2 = c;
// }
// now so it works just give it a proxy directly, so it doesn't
// have to go through gb.
// if ( useProxies ) {
// // msg13 typically uses this to get an unbanned proxy
// getProxiesToUse();
// }
// if we use proxies then increase the timeout since proxies
// increase the crawl delay in hopes of backing off to discover
// the website's policy so we don't hit it too hard and get banned.
// so to avoid diffbot timing out tell it to wait up to a minute
// because the crawl delay can be as high as that, even higher
if ( useProxies )
m_diffbotUrl.safePrintf("&timeout=%"INT32"",
(int32_t)MAX_PROXYCRAWLDELAYMS+10000);
m_diffbotUrl.safePrintf("&url=");
// give diffbot the url to process
m_diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
// append this just in case the next thing doesn't have it.
//if ( cr->m_diffbotApiQueryString.length() &&
// cr->m_diffbotApiQueryString.getBufStart()[0] != '&' )
// diffbotUrl.pushChar('&');
// then user provided parms that are dependent on if it is an
// article, product, etc. like "&dontstripads=1" or whatever
//diffbotUrl.safeStrcpy ( cr->m_diffbotApiQueryString.getBufStart());
// for analyze requests without mode=, make sure that diffbot expands all objects
// "expand" is not used for all crawls as of Defect #2292: User crawls should only index embedded objects if crawling with analyze
// null term it so that we can use strstr (shouldn't be necessary since safePrintf appears to do this already and is called above)
if (m_diffbotUrl.nullTerm()) {
char *u = m_diffbotUrl.getBufStart();
if (strstr(u, "/analyze") && !strstr(u, "mode=")) {
m_diffbotUrl.safePrintf("&expand");
}
}
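	// an illustrative (hypothetical) final request url built above, with
	// the api url, token, optional proxy parms and the encoded target:
	//   http://api.diffbot.com/v3/article?token=XXX&proxy=1.2.3.4:8080
	//     &proxyAuth=user%3Apass&timeout=...&url=http%3A%2F%2Fexample.com%2F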
// null term it
m_diffbotUrl.nullTerm();
// mark as tried
if ( m_srepValid ) { char *xx=NULL;*xx=0; }
m_sentToDiffbotThisTime = true;
// might have been a recall if gotDiffbotReplyWrapper() sensed
// g_errno == ECONNRESET and it will retry
if ( ! m_sentToDiffbot ) {
m_sentToDiffbot = 1;
// count it for stats
cr->m_localCrawlInfo.m_pageProcessAttempts++;
cr->m_globalCrawlInfo.m_pageProcessAttempts++;
// changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
cr->m_needsSave = true;
}
char *additionalHeaders = NULL;
if ( headers.length() > 0 )
additionalHeaders = headers.getBufStart();
// if did not get the web page first and we are crawling, not
// doing a bulk, then core. we need the webpage to harvest links
// and sometimes to check the pageprocesspattern to see if we should
// process.
if ( cr->m_isCustomCrawl ==1 && ! m_downloadStatusValid ) {
char *xx=NULL;*xx=0; }
log(LOG_INFO,
"diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
additionalHeaders);
m_diffbotReplyStartTime = gettimeofdayInMillisecondsGlobal();
if ( ! g_httpServer.getDoc ( m_diffbotUrl.getBufStart() ,
0 , // ip
0 , // offset
-1 , // size
0 , // ifmodifiedsince
this , // state
gotDiffbotReplyWrapper ,
// MDW: boost timeout from 180 to 18000
// seconds so we can figure out why
// diffbot times out, etc. what is
// going on.
// this is slowing things too much
// so make it 240 seconds
240*1000, // 240 sec timeout
0,//proxyip
0,//proxyport
// unlimited replies i guess
-1,//maxtextdoclen unlimited
-1,//maxotherdoclen unlimited
g_conf.m_spiderUserAgent ,
"HTTP/1.0",
false, // do post?
NULL, // cookie
additionalHeaders ) )
// return -1 if blocked
return (SafeBuf *)-1;
// error?
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// wha?
log("diffbot: http error %s",mstrerror(g_errno));
// had an error!
return NULL;
}
char **XmlDoc::getHttpReply ( ) {
// both must be valid now
if ( m_redirUrlValid && m_httpReplyValid ) {
// might have been a download error of ECORRUPTDATA
if ( m_downloadStatus == ECORRUPTDATA ) {
// set g_errno so caller knows
g_errno = m_downloadStatus;
// null means error
return NULL;
}
// otherwise, assume reply is valid
return &m_httpReply;
}
setStatus("getting http reply");
// come back up here if a redirect invalidates it
loop:
// sanity test -- only if not the test collection (NO, might be EBADIP)
//if ( m_indexCode && strcmp(m_coll,"qatest123")){char*xx=NULL;*xx=0;}
// get the http reply
char **replyPtr = getHttpReply2();
if ( ! replyPtr || replyPtr == (void *)-1 ) return (char **)replyPtr;
// . now if the reply was a redirect we should set m_redirUrl to it
// and re-do all this code
// . this often sets m_indexCode to stuff like ESIMPLIFIEDREDIR, etc.
Url **redirp = getRedirUrl();
// we often lookup the assocaited linkInfo on the original url to
// see if it is worth keeping and indexing just to take advantage of
// the incoming link text it has, so we may block on that!
// but in the case of a contactDoc, getContactDoc() sets these things
// to NULL to avoid unnecessary lookups.
if ( ! redirp || redirp == (void *)-1 ) return (char **)redirp;
// sanity check
if ( *redirp && ! m_redirUrlValid ) { char *xx=NULL;*xx=0; }
// if NULL, we are done
if ( ! *redirp ) return &m_httpReply;
// . also, hang it up if we got a simplified redir url now
// . we set m_redirUrl so that getLinks() can add a spiderRequest
// for it, but we do not want to actually redirect to it to get
// the content for THIS document
if ( m_redirError ) return &m_httpReply;
// and invalidate the redir url because we do not know if the
// current url will redirect or not (mdwmdw)
m_redirUrlValid = false;
m_metaRedirUrlValid = false;
// free it
mfree ( m_httpReply , m_httpReplyAllocSize, "freehr" );
// always nullify if we free so we do not re-use freed mem
m_httpReply = NULL;
// otherwise, we had a redirect, so invalidate what we had set
m_httpReplyValid = false;
// do not invalidate this any more, now it is when we STARTED spidering
// the document
//m_spideredTimeValid = false;
m_isContentTruncatedValid = false;
// do not redo robots.txt lookup if the redir url just changed from
// http to https or vice versa
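	// e.g. http://example.com/a redirecting to https://example.com/a :
	// the parts after the scheme compare equal below, so we keep the
	// cached robots.txt verdict instead of re-fetching it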
Url *ru = *redirp;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1) return (char **)cu;
if ( strcmp ( ru->getUrl() + ru->getSchemeLen() ,
cu->getUrl() + cu->getSchemeLen() ) ) {
// redo robots.txt lookup. might be cached.
m_isAllowedValid = false;
m_crawlDelayValid = false;
}
// keep the same ip if hostname is unchanged
if ( ru->getHostLen() != cu->getHostLen() ||
strncmp ( ru->getHost() , cu->getHost(), cu->getHostLen() ) )
// ip is supposed to be that of the current url, which changed
m_ipValid = false;
// we set our m_xml to the http reply to check for meta redirects
// in the html sometimes in getRedirUrl() so since we are redirecting,
// invalidate that xml
m_xmlValid = false;
m_wordsValid = false;
m_rawUtf8ContentValid = false;
m_expandedUtf8ContentValid= false;
m_utf8ContentValid = false;
m_filteredContentValid = false;
m_contentValid = false;
m_mimeValid = false;
// update our current url now to be the redirected url
m_currentUrl.set ( *redirp , false );
m_currentUrlValid = true;
// loop it
goto loop;
}
void gotHttpReplyWrapper ( void *state ) {
// point to us
XmlDoc *THIS = (XmlDoc *)state;
// this sets g_errno on error
THIS->gotHttpReply ( );
// resume. this checks g_errno for being set.
THIS->m_masterLoop ( THIS->m_masterState );
}
// "NULL" can be a valid http reply (empty page) so we need to use "char **"
char **XmlDoc::getHttpReply2 ( ) {
if ( m_httpReplyValid ) return &m_httpReply;
setStatus("getting http reply2");
// if recycle is set then NEVER download if doing query reindex
// but if doing an injection then i guess we can download.
// do not even do ip lookup if no old titlerec, which is how we
// ended up here...
if ( m_recycleContent && m_sreqValid && m_sreq.m_isPageReindex ) {
g_errno = ENOTITLEREC;
return NULL;
}
// doing a query reindex on diffbot objects does not have a
// valid spider request, only sets m_recycleContent to true
// in reindexJSONObjects()/redoJSONObjects()
if ( m_recycleContent && m_isDiffbotJSONObject ) {
g_errno = ENOTITLEREC;
return NULL;
}
// get ip
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (char **)ip;
// reset
m_httpReplySize = 0;
m_httpReply = NULL;
// if ip is bogus, we are done
if ( *ip == 0 || *ip == -1 ) {
log("xmldoc: ip is bogus 0 or -1 for %s. skipping download",
m_firstUrl.getUrl());
m_httpReplyValid = true;
m_isContentTruncated = false;
m_isContentTruncatedValid = true;
// need this now too. but don't hurt a nonzero val if we have
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
}
return &m_httpReply;
//return gotHttpReply ( );
}
// get this. should operate on current url (i.e. redir url if there)
bool *isAllowed = getIsAllowed();
// error or blocked
if ( ! isAllowed || isAllowed == (void *)-1) return (char **)isAllowed;
// this must be valid, since we share m_msg13 with it
if ( ! m_isAllowedValid ) { char *xx=NULL;*xx=0; }
int32_t *cd = getFinalCrawlDelay();
if ( ! cd || cd == (void *)-1 ) return (char **)cd;
// we might bail
if ( ! *isAllowed ) {
m_httpReplyValid = true;
m_isContentTruncated = false;
m_isContentTruncatedValid = true;
// need this now too. but don't hurt a nonzero val if we have
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
}
m_downloadStatusValid = true;
// forbidden? assume we downloaded it and it was empty
m_downloadStatus = 0; // EDOCDISALLOWED;//403;
return &m_httpReply;
//return gotHttpReply ( );
}
// are we site root page?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
//int8_t *hc = getHopCount();
//if ( ! hc || hc == (void *)-1 ) return (char **)hc;
XmlDoc *od = NULL;
if ( ! m_isSpiderProxy &&
// don't lookup xyz.com/robots.txt in titledb
! isFirstUrlRobotsTxt() ) {
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (char **)pod;
// get ptr to old xml doc, could be NULL if non exists
od = *pod;
}
// sanity check
if ( od && m_recycleContent ) {char *xx=NULL;*xx=0; }
// validate m_firstIpValid
int32_t *pfip = getFirstIp();
if ( ! pfip || pfip == (void *)-1 ) return (char **)pfip;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// robots.txt and css files etc should have m_isChildDoc as true
//if ( ! m_downloadAttempted && ! m_isChildDoc )
// // keep track of spider stats
// cr->m_localCrawlInfo.m_pageDownloadAttempts++;
// we made an attempt to download, so mark it
//m_downloadAttempted = true;
// if we didn't block getting the lock, keep going
setStatus ( "getting web page" );
// sanity check
if ( ! m_masterLoop ) { char *xx=NULL;*xx=0; }
	// shortcut. this will return the redirUrl if it is non-empty.
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (char **)cu;
/*
// if on google, make it empty so we do not hit them
if ( strstr(cu->getUrl(),".google.com/") ) {
log("spider: encountered google.com url. emptying.");
m_httpReplyValid = true;
m_isContentTruncated = false;
m_isContentTruncatedValid = true;
// need this now too. but don't hurt a nonzero val if we have
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
}
return &m_httpReply;
}
*/
// no ip found means empty page i guess
//if ( *ip == 0 || *ip == -1 )
// return gotHttpReply ( );
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
// sanity check
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// set parms
Msg13Request *r = &m_msg13Request;
// clear it first
r->reset();
// and set the url
//strcpy ( r->m_url , cu->getUrl() );
r->ptr_url = cu->getUrl();
r->size_url = cu->getUrlLen()+1;
// caution: m_sreq.m_hopCountValid is false sometimes for page parser
// this is used for Msg13.cpp's ipWasBanned()
// we use hopcount now instead of isInSeedBuf(cr,r->ptr_url)
bool isInjecting = getIsInjecting();
if ( ! isInjecting && m_sreqValid && m_sreq.m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
if ( ! isInjecting && m_hopCountValid && m_hopCount == 0 )
r->m_isRootSeedUrl = 1;
// sanity check
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// r->m_maxTextDocLen = maxDownload;
// r->m_maxOtherDocLen = maxDownload;
r->m_maxTextDocLen = cr->m_maxTextDocLen;
r->m_maxOtherDocLen = cr->m_maxOtherDocLen;
// max to download in bytes. currently 1MB.
//int32_t maxDownload = (int32_t)MAXDOCLEN;
// but if url is http://127.0.0.1.... or local then
if ( m_ipValid ) {
// make into a string
char *ipStr = iptoa(m_ip);
// is it local?
bool isLocal = false;
if ( strncmp(ipStr,"192.168.",8) == 0) isLocal = true;
if ( strncmp(ipStr,"10." ,3) == 0) isLocal = true;
if ( m_ip == 16777343 ) isLocal = true; // 127.0.0.1 ?
// . if local then make web page download max size unlimited
// . this is for adding the gbdmoz.urls.txt.* files to
// populate dmoz. those files are about 25MB each.
if ( isLocal ) {
//maxDownload = -1;
r->m_maxTextDocLen = -1;
r->m_maxOtherDocLen = -1;
}
}
// m_maxCacheAge is set for getting contact or root docs in
// getContactDoc() and getRootDoc() and it only applies to
// titleRecs in titledb i guess... but still... for Msg13 it applies
// to its cache ... for robots.txt files too
r->m_maxCacheAge = m_maxCacheAge;
r->m_urlIp = *ip;
r->m_firstIp = m_firstIp;
r->m_urlHash48 = getFirstUrlHash48();
if ( r->m_maxTextDocLen < 100000 ) r->m_maxTextDocLen = 100000;
if ( r->m_maxOtherDocLen < 200000 ) r->m_maxOtherDocLen = 200000;
r->m_forwardDownloadRequest = (bool)m_forwardDownloadRequest;
r->m_useTestCache = (bool)useTestCache;
r->m_spideredTime = getSpideredTime();//m_spideredTime;
r->m_ifModifiedSince = 0;
r->m_skipHammerCheck = 0;
//if ( g_conf.m_qaBuildMode ) r->m_addToTestCache = true;
//else r->m_addToTestCache = false;
r->m_addToTestCache = (bool)useTestCache;
if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) {
r->ptr_cookie = m_redirCookieBuf.getBufStart();
r->size_cookie = m_redirCookieBuf.length() + 1;
// . only do once per redirect
// . do not invalidate because we might have to carry it
// through to the next redir... unless we change domain
// . this fixes the nyt.com/nytimes.com bug some more
//m_redirCookieBufValid = false;
}
// . this is -1 if unknown. none found in robots.txt or provided
// in the custom crawl parms.
// . it should also be 0 for the robots.txt file itself
r->m_crawlDelayMS = *cd;
// let's time our crawl delay from the initiation of the download
// not from the end of the download. this will make things a little
// faster but could slam servers more.
r->m_crawlDelayFromEnd = false;
// need this in order to get all languages, etc. and avoid having
// to set words class at the spider compression proxy level
r->m_forEvents = 0;
// new stuff
r->m_contentHash32 = 0;
// if valid in SpiderRequest, use it. if spider compression proxy
// sees the content is unchanged it will not send it back! it will
// send back g_errno = EDOCUNCHANGED or something
if ( m_sreqValid )
r->m_contentHash32 = m_sreq.m_contentHash32;
// if we have the old doc already set use that
if ( od )
r->m_contentHash32 = od->m_contentHash32;
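	// illustrative flow: if the remote side computes the same 32-bit
	// content hash for the freshly downloaded page, it replies with
	// g_errno = EDOCUNCHANGED and gotHttpReply() below maps that to
	// m_recycleContent = true so the old content gets reused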
// force floater usage on even if "use spider proxies" parms is off
// if we're a diffbot crawl and use robots is off.
//if ( cr && ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
// r->m_forceUseFloaters = true;
// for beta testing, make it a collection specific parm for diffbot
// so we can turn on manually
if ( cr->m_forceUseFloaters )
r->m_forceUseFloaters = true;
// eventgurubot is the max
//char *userAgent = g_conf.m_spiderUserAgent;
// hardcode it
//char *userAgent = "EventGuruBot";
//int32_t uaLen = gbstrlen(userAgent);
//if ( uaLen > 12 ) {
// log("spider: user agent string too long");
// uaLen = 12;
//}
//strncpy(r->m_userAgent,userAgent,uaLen);
//r->m_userAgent[uaLen] = '\0';
// turn this off too
r->m_attemptedIframeExpansion = false;
r->m_collnum = (collnum_t)-1;
if ( m_collnumValid )r->m_collnum = m_collnum;
// turn off
r->m_useCompressionProxy = false;
r->m_compressReply = false;
r->m_isCustomCrawl = cr->m_isCustomCrawl;
// set it for this too
if ( g_conf.m_useCompressionProxy &&
// do not use for the test collection ever, that is qa'ing
strcmp(cr->m_coll,"qatest123") ) {
r->m_useCompressionProxy = true;
r->m_compressReply = true;
}
// are we a robots.txt file?
//bool isRobotsTxt = isRobotsTxtFile ( cu->getUrl() , cu->getUrlLen());
char *td = getTestDir();
if ( td ) strncpy ( r->m_testDir, td, 31);
//r->m_isPageParser = getIsPageParser();
//r->m_isPageInject = ( m_sreqValid && m_sreq.m_isInjecting );
// if current url IS NOT EQUAL to first url then set redir flag
if ( strcmp(cu->m_url,m_firstUrl.m_url) )
r->m_skipHammerCheck = 1;
// or if this an m_extraDoc or m_rootDoc for another url then
// do not bother printing the hammer ip msg in msg13.cpp either
if ( m_isChildDoc )
r->m_skipHammerCheck = 1;
if ( m_contentInjected ) // oldsrValid && m_sreq.m_isInjecting )
r->m_skipHammerCheck = 1;
// or if ahrefs
if ( strncmp(cu->m_url,"http://api.ahrefs.com/",22) == 0 )
r->m_skipHammerCheck = 1;
if ( r->m_skipHammerCheck )
log(LOG_DEBUG,"build: skipping hammer check");
// if we had already spidered it... try to save bandwidth and time
if ( od ) {
// sanity check
if ( ! od->m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// only get it if modified since last spider time
r->m_ifModifiedSince = od->m_spideredTime;
}
// tell msg13 he is scraping...
if ( m_sreqValid && m_sreq.m_isScraping )
r->m_isScraping = 1;
// if doing frame expansion on a doc we just downloaded as the
// spider proxy, we are asking ourselves now to download the url
// from an <iframe src=...> tag. so definitely use msg13 again
// so it can use the robots.txt cache, and regular html page cache.
if ( m_isSpiderProxy ) {
r->m_useCompressionProxy = false;
r->m_compressReply = false;
r->m_skipHammerCheck = 1;
//r->m_requireGoodDate = false;
// no frames within frames
r->m_attemptedIframeExpansion = 1;
log(LOG_DEBUG,"build: skipping hammer check 2");
}
// . use msg13 to download the file, robots.txt
// . msg13 will ensure only one download of that url w/ locks
// . msg13 can use the compress the http reply before
// sending it back to you via udp (compression proxy)
// . msg13 uses XmlDoc::getHttpReply() function to handle
// redirects, etc.? no...
bool isTestColl = false;
if ( ! strcmp(cr->m_coll,"qatest123") ) isTestColl = true;
//if ( isTestColl && m_contentType == CT_IMAGE )
// isTestColl = false;
// sanity check. keep injections fast. no downloading!
if ( m_wasContentInjected ) {
log("xmldoc: url injection failed! error!");
char *xx=NULL;*xx=0;
}
// sanity check
if ( m_deleteFromIndex ) {
log("xmldoc: trying to download page to delete");
char *xx=NULL;*xx=0;
}
m_downloadStartTimeValid = true;
m_downloadStartTime = gettimeofdayInMillisecondsGlobal();
if ( ! m_msg13.getDoc ( r , isTestColl,this , gotHttpReplyWrapper ) )
// return -1 if blocked
return (char **)-1;
return gotHttpReply ( );
}
// . this returns false if blocked, true otherwise
// . sets g_errno on error
char **XmlDoc::gotHttpReply ( ) {
// save it
int32_t saved = g_errno;
// note it
setStatus ( "got web page" );
// sanity check. are we already valid?
if ( m_httpReply && m_httpReplyValid ) { char *xx=NULL;*xx=0; }
// do not re-call
m_httpReplyValid = true;
// assume none
m_httpReply = NULL;
// . get the HTTP reply
// . TODO: free it on reset/destruction, we own it now
// . this is now NULL terminated thanks to changes in
// Msg13.cpp, but watch the buf size, need to subtract 1
// . therefore, we can set the Xml class with it
m_httpReply = m_msg13.m_replyBuf;
m_httpReplySize = m_msg13.m_replyBufSize;
// how much to free?
m_httpReplyAllocSize = m_msg13.m_replyBufAllocSize;
// sanity check
if ( m_httpReplySize > 0 && ! m_httpReply ) { char *xx=NULL;*xx=0; }
// what is this for? that makes it into a length not a size!
//if ( m_httpReplySize > 0 ) m_httpReplySize--;
// . save entire reply length we read from the net so
// SpiderCache
// can use it for its m_avgReplyLen for throttling
// . m_bufLen may change due to filtering
//m_replyLen = m_bufLen;
// . don't let UdpServer free m_buf when socket is
// recycled/closed
// . we own it now and are responsible for freeing it
//slot->m_readBuf = NULL;
m_msg13.m_replyBuf = NULL;
// relabel mem so we know where it came from
relabel( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
// breathe
QUICKPOLL ( m_niceness );
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . sanity test -- only if not the test collection
// . i.e. what are you doing downloading the page if there was
// a problem with the page we already know about
if ( m_indexCode && m_indexCodeValid &&
strcmp(cr->m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// fix this
if ( saved == EDOCUNCHANGED ) {
// assign content from it since unchanged
m_recycleContent = true;
// clear the error
saved = 0;
g_errno = 0;
}
// . save the error in download status
// . could now be EDOCUNCHANGED or EDOCNOGOODDATE (w/ tod)
m_downloadStatus = saved; // g_errno;
// validate
m_downloadStatusValid = true;
// update m_downloadEndTime if we should, used for sameIpWait
m_downloadEndTime = gettimeofdayInMillisecondsGlobal();
m_downloadEndTimeValid = true;
// make it so
g_errno = saved;
bool doIncrement = true;
if ( m_isChildDoc ) doIncrement = false;
if ( m_incrementedDownloadCount ) doIncrement = false;
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
// if it doesn't match the crawl pattern, just the process pattern
// then do not increment download successes
if ( doIncrement &&
cr->m_isCustomCrawl == 1 &&
// allow seeds to be counted
! isSeed &&
//! sreq->m_isPageReindex &&
//! sreq->m_isInjecting &&
! doesUrlMatchDiffbotCrawlPattern() )
doIncrement = false;
// . do not count bad http status in mime as failure i guess
// . do not inc this count for robots.txt and root page downloads, etc.
if ( doIncrement ) {
cr->m_localCrawlInfo.m_pageDownloadSuccesses++;
cr->m_globalCrawlInfo.m_pageDownloadSuccesses++;
cr->m_localCrawlInfo.m_pageDownloadSuccessesThisRound++;
cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound++;
m_incrementedDownloadCount = true;
cr->m_needsSave = true;
// changing status, resend local crawl info to all
cr->localCrawlInfoUpdate();
}
// this means the spider compression proxy's reply got corrupted
// over roadrunner's crappy wireless internet connection
if ( saved == ECORRUPTDATA ) return NULL;
// this one happens too! for the same reason...
if ( saved == EBADREPLYSIZE ) return NULL;
// might as well check this too while we're at it
if ( saved == ENOMEM ) return NULL;
// sanity check -- check after bailing on corruption because
// corrupted replies do not end in NULLs
if ( m_httpReplySize > 0 && m_httpReply[m_httpReplySize-1] ) {
log("http: httpReplySize=%"INT32" http reply does not end in \\0 "
"for %s in collnum=%"INT32". blanking out reply."
,m_httpReplySize
,m_firstUrl.m_url
,(int32_t)m_collnum
);
// free it i guess
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
// and reset it
m_httpReplySize = 0;
m_httpReply = NULL;
m_httpReplyAllocSize = 0;
// call it data corruption i guess for now
g_errno = ECORRUPTDATA;
//char *xx=NULL;*xx=0;
}
// if its a bad gzip reply, a compressed http reply, then
// make the whole thing empty? some websites return compressed replies
// even though we do not ask for them. and then the compression
// is corrupt.
if ( saved == ECORRUPTHTTPGZIP ||
// if somehow we got a page too big for MAX_DGRAMS... treat
// it like an empty page...
saved == EMSGTOOBIG ) {
// free it i guess
mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
// and reset it
m_httpReplySize = 0;
m_httpReply = NULL;
m_httpReplyAllocSize = 0;
}
// if errors were not local, reset g_errno and set m_indexCode
//if ( g_errno == ETCPTIMEDOUT ) m_indexCode = ETCPTIMEDOUT;
//if ( g_errno == EBADMIME ) m_indexCode = EBADMIME;
// clear g_errno
//if ( m_indexCode ) g_errno = 0;
// return if cancelled, etc.
//if ( g_errno ) return NULL;
// clear this i guess
g_errno = 0;
/*
MDW: 2/8/16 this logic now below in getIsContentTruncated() function
// int16_tcut - convert size to length
int32_t LEN = m_httpReplySize - 1;
m_isContentTruncated = false;
// was the content truncated? these might label a doc is truncated
// when it really is not... but we only use this for link spam stuff,
// so it should not matter too much. it should only happen rarely.
//if ( LEN >= cr->m_maxTextDocLen-1 ) m_isContentTruncated = true;
//if ( LEN >= cr->m_maxOtherDocLen-1 ) m_isContentTruncated = true;
if ( LEN > MAXDOCLEN ) m_isContentTruncated = true;
// set this
m_isContentTruncated2 = (bool)m_isContentTruncated;
// validate it
m_isContentTruncatedValid = true;
*/
return &m_httpReply;
}
char *XmlDoc::getIsContentTruncated ( ) {
if ( m_isContentTruncatedValid ) return &m_isContentTruncated2;
setStatus ( "getting is content truncated" );
// if recycling content use its download end time
if ( m_recycleContent ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (char *)pod;
		// shortcut
XmlDoc *od = *pod;
// this is non-NULL if it existed
if ( od ) {
m_isContentTruncated = od->m_isContentTruncated;
m_isContentTruncated2 = (bool)m_isContentTruncated;
m_isContentTruncatedValid = true;
return &m_isContentTruncated2;
}
}
// need a valid reply
char **replyPtr = getHttpReply ();
if ( ! replyPtr || replyPtr == (void *)-1 ) return (char *)replyPtr;
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (char *)ct;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// shortcut - convert size to length
int32_t LEN = m_httpReplySize - 1;
m_isContentTruncated = false;
	// was the content truncated? these might label a doc as truncated
// when it really is not... but we only use this for link spam stuff,
// so it should not matter too much. it should only happen rarely.
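	// e.g. (hypothetical numbers) with cr->m_maxTextDocLen = 1048576, an
	// html reply whose length reaches 1048575 or more is flagged truncated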
if ( cr->m_maxTextDocLen >= 0 &&
LEN >= cr->m_maxTextDocLen-1 &&
*ct == CT_HTML )
m_isContentTruncated = true;
if ( cr->m_maxOtherDocLen >= 0 &&
LEN >= cr->m_maxOtherDocLen-1 &&
*ct != CT_HTML )
m_isContentTruncated = true;
//if ( LEN > MAXDOCLEN ) m_isContentTruncated = true;
// set this
m_isContentTruncated2 = (bool)m_isContentTruncated;
// validate it
m_isContentTruncatedValid = true;
return &m_isContentTruncated2;
}
int32_t *XmlDoc::getDownloadStatus ( ) {
if ( m_downloadStatusValid ) return &m_downloadStatus;
// log it
setStatus ( "getting download status");
// if recycling content, we're 200!
if ( m_recycleContent ) {
m_downloadStatus = 0;
m_downloadStatusValid = true;
return &m_downloadStatus;
}
// get ip
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip;
// . first try ip
// . this means the dns lookup timed out
if ( *ip == -1 ) {
m_downloadStatus = EDNSTIMEDOUT;
m_downloadStatusValid = true;
return &m_downloadStatus;
}
// this means ip does not exist
if ( *ip == 0 ) {
m_downloadStatus = EBADIP;
m_downloadStatusValid = true;
return &m_downloadStatus;
}
// need a valid reply
char **reply = getHttpReply ();
if ( ! reply || reply == (void *)-1 ) return (int32_t *)reply;
// must be valid now
if ( ! m_downloadStatusValid ) { char *xx=NULL;*xx=0; }
// return it
return &m_downloadStatus;
}
int64_t *XmlDoc::getDownloadEndTime ( ) {
if ( m_downloadEndTimeValid ) return &m_downloadEndTime;
// log it
setStatus ( "getting download end time");
// do not cause us to core in getHttpReply2() because m_deleteFromIndex
// is set to true...
if ( m_deleteFromIndex ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
return &m_downloadEndTime;
}
// if recycling content use its download end time
if ( m_recycleContent ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (int64_t *)pod;
// shortcut
XmlDoc *od = *pod;
// this is non-NULL if it existed
if ( od ) {
m_downloadEndTime = od->m_downloadEndTime;
m_downloadEndTimeValid = true;
return &m_downloadEndTime;
}
}
// need a valid reply
char **reply = getHttpReply ();
if ( ! reply || reply == (void *)-1 ) return (int64_t *)reply;
// must be valid now
if ( ! m_downloadEndTimeValid ) { char *xx=NULL;*xx=0;}
// return it
return &m_downloadEndTime;
}
int16_t *XmlDoc::getHttpStatus ( ) {
// if we got a title rec then return that
if ( m_httpStatusValid ) return &m_httpStatus;
// get mime otherwise
HttpMime *mime = getMime();
if ( ! mime || mime == (HttpMime *)-1 ) return (int16_t *)mime;
// get from that
m_httpStatus = mime->getHttpStatus();
m_httpStatusValid = true;
return &m_httpStatus;
}
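// . returns the parsed mime of the http reply
// . injections (m_useFakeMime) and recycled content get a fake
//   200/text/html mime since there is no raw reply to parse
// . if the reply's mime is malformed we still return the fake mime
//   (content length 0) rather than setting g_errno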
HttpMime *XmlDoc::getMime () {
if ( m_mimeValid ) return &m_mime;
// log debug
setStatus("getting http mime");
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1) return (HttpMime *)cu;
// injection from SpiderLoop.cpp sets this to true
if ( m_useFakeMime ) {
usefake:
m_mime.set ( NULL , 0 , cu );
m_mime.setHttpStatus ( 200 );
m_mime.setContentType ( CT_HTML );
m_mimeValid = true;
return &m_mime;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if recycling content, fake this mime
if ( cr->m_recycleContent || m_recycleContent ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (HttpMime *)pod;
// shortcut
XmlDoc *od = *pod;
// . this is non-NULL if it existed
// . fake it for now
if ( od ) goto usefake;
}
// need a valid reply
char **reply = getHttpReply ();
if ( ! reply || reply == (void *)-1 ) return (HttpMime *)reply;
// fake it for now
m_mime.set ( NULL , 0 , cu );
m_mime.setHttpStatus ( 200 );
m_mime.setContentType ( CT_HTML );
// shortcut
int32_t LEN = m_httpReplySize - 1;
// validate it
m_mimeValid = true;
// TODO: try again on failures because server may have been overloaded
// and closed the connection w/o sending anything
if ( LEN>0 && ! m_mime.set ( m_httpReply , LEN , cu ) ) {
// set this on mime error
//m_indexCode = EBADMIME;
// return a fake thing. content length is 0.
return &m_mime;
}
// . check the mime status, should be in the 200's for success
// . spider should redirect on 3xx codes
// . 404 means not found, etc.
// . 304 is not modified since
// . >= 300 should only happen if redirect chain was too long to follow
//int32_t httpStatus = m_mime.getHttpStatus();
// sanity check, these must be reserved! no longer, we have
// a separate m_httpStatus in the SpiderReply class now
//if ( mstrerror(httpStatus) ) { char *xx=NULL;*xx=0; }
// sanity check
//if ( m_indexCode ) { char *xx=NULL;*xx=0; }
// set it
//m_indexCode = httpStatus;
// clear if it was ok though
//if ( m_indexCode == 200 ) m_indexCode = 0;
// bail out now
return &m_mime;
}
// need to use "char **" since content might be NULL itself, if none
char **XmlDoc::getContent ( ) {
if ( m_contentValid ) return &m_content;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// recycle?
if ( cr->m_recycleContent || m_recycleContent ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (char **)pod;
// shortcut
XmlDoc *od = *pod;
// this is non-NULL if it existed
if ( od ) {
m_content = od-> ptr_utf8Content;
m_contentLen = od->size_utf8Content - 1;
m_contentValid = true;
return &m_content;
}
if ( m_recycleContent )
log("xmldoc: failed to load old title rec "
"when recycle content was true and url = "
"%s",ptr_firstUrl);
// if could not find title rec and we are docid-based then
// we can't go any further!!
if ( m_setFromDocId ) {
log("xmldoc: null content for docid-based titlerec "
"lookup which was not found");
m_content = NULL;
m_contentLen = 0;
m_contentValid = true;
return &m_content;
}
}
if ( m_recycleContent ) {
if ( m_firstUrlValid )
log("xmldoc: failed to recycle content for %s. could "
"not load title rec",m_firstUrl.m_url);
else if ( m_docIdValid )
log("xmldoc: failed to recycle content for %"UINT64". "
"could "
"not load title rec",m_docId );
else
log("xmldoc: failed to recycle content. "
"could not load title rec" );
// let's let it pass and just download i guess, then
// we can get page stats for urls not in the index
//g_errno = EBADENGINEER;
//return NULL;
}
// if we were set from a title rec then we do not have the original
// content, and the caller should be calling getUtf8Content() anyway!!
if ( m_setFromTitleRec ) { char *xx=NULL; *xx=0; }
// query reindex has m_setFromDocId to true and we WANT to re-download
// the content... so why did i have this here? MDW 9/25/2014
//if ( m_setFromDocId ) { char *xx=NULL; *xx=0; }
// recycle?
//if ( m_recycleContent ) { char *xx=NULL; *xx=0; }
// get the mime first
HttpMime *mime = getMime();
if ( ! mime || mime == (HttpMime *)-1 ) return (char **)mime;
// http reply must be valid
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
// make it valid
m_contentValid = true;
// assume none
m_content = NULL;
m_contentLen = 0;
// all done if no reply
if ( ! m_httpReply ) return &m_content;
// set the content, account for mime header
m_content = m_httpReply + mime->getMimeLen() ;
m_contentLen = m_httpReplySize - mime->getMimeLen() ;
// watch out for this!
if ( m_useFakeMime ) {
m_content = m_httpReply;
m_contentLen = m_httpReplySize;
}
// why is this not really the size???
m_contentLen--;
// sanity check
if ( m_contentLen < 0 ) { char *xx = NULL; *xx = 0; }
return &m_content;
}
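// . sniff the content type from a leading <!DOCTYPE ...> declaration
// . only the first 100 bytes of "p" are scanned
// . e.g. "<!DOCTYPE html PUBLIC ...>" gives CT_HTML and
//   "<!DOCTYPE xml ...>" gives CT_XML
// . returns 0 if no doctype is recognized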
char getContentTypeFromContent ( char *p , int32_t niceness ) {
char ctype = 0;
// max
char *pmax = p + 100;
// check that out
for ( ; p && *p && p < pmax ; p++ ) {
QUICKPOLL(niceness);
if ( p[0] != '<' ) continue;
if ( p[1] != '!' ) continue;
if ( to_lower_a(p[2]) != 'd' ) continue;
if ( strncasecmp(p,"<!doctype ",10) ) continue;
char *dt = p + 10;
// skip spaces
for ( ; *dt ; dt++ ) {
QUICKPOLL(niceness);
if ( ! is_wspace_a ( *dt ) ) break;
}
// point to that
if ( ! strncasecmp(dt,"html" ,4) ) ctype = CT_HTML;
if ( ! strncasecmp(dt,"xml" ,3) ) ctype = CT_XML;
if ( ! strncasecmp(dt,"text/html",9) ) ctype = CT_HTML;
if ( ! strncasecmp(dt,"text/xml" ,8) ) ctype = CT_XML;
break;
}
return ctype;
}
uint8_t *XmlDoc::getContentType ( ) {
if ( m_contentTypeValid ) return &m_contentType;
// log debug
setStatus("getting content type");
// get the mime first
HttpMime *mime = getMime();
if ( ! mime || mime == (HttpMime *)-1 ) return (uint8_t *)mime;
// then get mime
m_contentType = mime->getContentType();
// but if they specify <!DOCTYPE html> in the document that overrides
// the content type in the mime! fixes planet.mozilla.org
char **pp = getContent();
if ( ! pp || pp == (void *)-1 ) return (uint8_t *)pp;
char *p = *pp;
// scan content for content type. returns 0 if none found.
char ctype2 = getContentTypeFromContent ( p , m_niceness );
// valid?
if ( ctype2 != 0 ) m_contentType = ctype2;
// it is valid now
m_contentTypeValid = true;
// give it to them
return &m_contentType;
}
// . similar to getMetaRedirUrl but look for different strings
// . rel="canonical" or rel=canonical in a link tag.
Url **XmlDoc::getCanonicalRedirUrl ( ) {
// return if we got it
if ( m_canonicalRedirUrlValid ) return &m_canonicalRedirUrlPtr;
//if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
// assume none in doc
m_canonicalRedirUrlPtr = NULL;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// disable for crawlbot, not good really for deduping
if ( cr->m_isCustomCrawl ) {
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
if ( ! cr->m_useCanonicalRedirects ) {
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
// are we site root page? don't follow canonical url then.
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (Url **)isRoot;
if ( *isRoot ) {
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
// if this page has an inlink, then let it stand
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Url **)info1;
if ( info1->getNumGoodInlinks() > 0 ) {
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
// these canonical links only supported in xml/html i think
if ( *ct != CT_HTML && *ct != CT_XML ) {
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Url **)xml;
// scan nodes looking for a <link> node. like getBaseUrl()
for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) {
// breathe some
QUICKPOLL(m_niceness);
// we only want <link> tags here
if ( xml->getNodeId ( i ) != TAG_LINK ) continue;
// get the href field of this base tag
int32_t linkLen;
char *link = (char *) xml->getString ( i, "href", &linkLen );
// skip if not valid
if ( ! link || linkLen == 0 ) continue;
// must also have rel=canonical
int32_t relLen;
char *rel = xml->getString(i,"rel",&relLen);
if ( ! rel ) continue;
// skip if does not match "canonical"
if ( strncasecmp(rel,"canonical",relLen) ) continue;
// allow for relative urls
Url *cu = getCurrentUrl();
// set base to it. addWWW=false
m_canonicalRedirUrl.set(cu,link,linkLen,false);//true
// assume it is not our url
bool isMe = false;
// if it is us, then skip!
if(strcmp(m_canonicalRedirUrl.getUrl(),m_firstUrl.getUrl())==0)
isMe = true;
// might also be our redir url i guess
if(strcmp(m_canonicalRedirUrl.getUrl(),m_redirUrl.getUrl())==0)
isMe = true;
// if it is us, keep it NULL, it's not a redirect. we are
// the canonical url.
if ( isMe ) break;
// ignore if in an expanded iframe (<gbframe>) tag
char *pstart = xml->m_xml;
char *p = link;
// scan backwards
if ( ! m_didExpansion ) p = pstart;
bool skip = false;
for ( ; p > pstart ; p-- ) {
QUICKPOLL(m_niceness);
if ( p[0] != '<' )
continue;
if ( p[1] == '/' &&
p[2] == 'g' &&
p[3] == 'b' &&
p[4] == 'f' &&
p[5] == 'r' &&
p[6] == 'a' &&
p[7] == 'm' &&
p[8] == 'e' &&
p[9] == '>' )
break;
if ( p[1] == 'g' &&
p[2] == 'b' &&
p[3] == 'f' &&
p[4] == 'r' &&
p[5] == 'a' &&
p[6] == 'm' &&
p[7] == 'e' &&
p[8] == '>' ) {
skip = true;
break;
}
}
if ( skip ) continue;
// otherwise, it is not us, we are NOT the canonical url
// and we should not be indexed, but just add the canonical
// url as a spiderrequest into spiderdb, just like
// simplified meta redirect does.
m_canonicalRedirUrlPtr = &m_canonicalRedirUrl;
break;
}
m_canonicalRedirUrlValid = true;
return &m_canonicalRedirUrlPtr;
}
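// . "p" should point at the content attribute value of a
//   <meta http-equiv="refresh" ...> tag, e.g. "0; URL=/newpage.html"
// . delays of 10 seconds or more are not treated as redirects
// . relative urls are resolved against "cu"; if the url starts with
//   '?' only the query string of "cu" is replaced
// . urls over 1024 bytes are rejected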
// returns false if none found
bool setMetaRedirUrlFromTag ( char *p , Url *metaRedirUrl , char niceness ,
Url *cu ) {
// limit scan
char *limit = p + 30;
// skip whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must be a num
if ( ! is_digit(*p) ) return false;
// init delay
int32_t delay = atol ( p );
// ignore long delays
if ( delay >= 10 ) return false;
// now find the semicolon, if any
for ( ; *p && p < limit && *p != ';' ; p++ );
// must have semicolon
if ( *p != ';' ) return false;
// skip it
p++;
// skip whitespace some more
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// must have URL
if ( strncasecmp(p,"URL",3) ) return false;
// skip that
p += 3;
// skip white space
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// then an equal sign
if ( *p != '=' ) return false;
// skip equal sign
p++;
// then maybe more whitespace
for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
// an optional quote
if ( *p == '\"' ) p++;
// can also be a single quote!
if ( *p == '\'' ) p++;
// set the url start
char *url = p;
// now advance to next quote or space or >
for ( ; *p && !is_wspace_a(*p) &&
*p !='\'' &&
*p !='\"' &&
*p !='>' ;
p++);
// that is the end
char *urlEnd = p;
// get size
int32_t usize = urlEnd - url;
// skip if too big
if ( usize > 1024 ) {
log("build: meta redirurl of %"INT32" bytes too big",usize);
return false;
}
// get our current url
//Url *cu = getCurrentUrl();
// decode what we got
char decoded[MAX_URL_LEN];
// convert &amp; to "&"
int32_t decBytes = htmlDecode(decoded,url,usize,false,niceness);
decoded[decBytes]='\0';
// . then the url
// . set the url to the one in the redirect tag
// . but if the http-equiv meta redirect url starts with a '?'
// then just replace our cgi with that one
if ( *url == '?' ) {
char foob[MAX_URL_LEN*2];
char *pf = foob;
int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
gbmemcpy(foob,cu->getUrl(),cuBytes);
pf += cuBytes;
gbmemcpy ( pf , decoded , decBytes );
pf += decBytes;
*pf = '\0';
metaRedirUrl->set(foob);
}
// . otherwise, append it right on
// . use "url" as the base Url
// . it may be the original url or the one we redirected to
// . redirUrl is set to the original at the top
else
// addWWW = false, stripSessId=true
metaRedirUrl->set(cu,decoded,decBytes,false,true);
return true;
}
// scan document for <meta http-equiv="refresh" content="0;URL=xxx">
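// . a fast string match over the raw reply finds a candidate first,
//   then the doc is parsed with Xml so that refresh tags buried inside
//   document.write('<meta ...') scripts are not treated as redirects
// . meta redirects are ignored when recycling or injecting content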
Url **XmlDoc::getMetaRedirUrl ( ) {
if ( m_metaRedirUrlValid ) return &m_metaRedirUrlPtr;
// get ptr to utf8 content
//char **u8 = getHttpReply();
//if ( ! u8 || u8 == (void *)-1 ) return (Url **)u8;
if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; }
char *p = m_httpReply;
// subtract one since this is a size not a length
char *pend = p + m_httpReplySize - 1;//size_utf8Content;
// assume no meta refresh url
m_metaRedirUrlPtr = NULL;
// make it valid regardless i guess
m_metaRedirUrlValid = true;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if we are recycling or injecting, do not consider meta redirects
if ( cr->m_recycleContent || m_recycleContent )
return &m_metaRedirUrlPtr;
// will this work in here?
//uint8_t *ct = getContentType();
//if ( ! ct ) return NULL;
Url *cu = getCurrentUrl();
bool gotOne = false;
// advance a bit, we are initially looking for the '=' sign
p += 10;
// begin the string matching loop
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( m_niceness );
// fix <!--[if lte IE 6]>
// <meta http-equiv="refresh" content="0; url=/error-ie6/" />
if ( *p == '!' &&
p[-1]=='<' &&
p[1] == '-' &&
p[2] == '-' ) {
// find end of comment
for ( ; p < pend ; p++ ) {
QUICKPOLL(m_niceness);
if ( p[0] == '-' &&
p[1] == '-' &&
p[2] == '>' )
break;
}
// if found no end of comment, then stop
if ( p >= pend )
break;
// resume looking for meta redirect tags
continue;
}
// base everything off the equal sign
if ( *p != '=' ) continue;
// did we match "http-equiv="?
if ( to_lower_a(p[-1]) != 'v' ) continue;
if ( to_lower_a(p[-2]) != 'i' ) continue;
if ( to_lower_a(p[-3]) != 'u' ) continue;
if ( to_lower_a(p[-4]) != 'q' ) continue;
if ( to_lower_a(p[-5]) != 'e' ) continue;
if ( p[-6] != '-' ) continue;
if ( to_lower_a(p[-7]) != 'p' ) continue;
if ( to_lower_a(p[-8]) != 't' ) continue;
if ( to_lower_a(p[-9]) != 't' ) continue;
if ( to_lower_a(p[-10])!= 'h' ) continue;
// skip the equal sign
p++;
// skip quote if there
if ( *p == '\"' ) p++;
// must be "refresh", continue if not
if ( strncasecmp(p,"refresh",7) ) continue;
// skip that
p += 7;
// skip another quote if there
if ( *p == '\"' ) p++;
// limit the # of white spaces
char *limit = p + 20;
// skip white spaces
while ( *p && p < limit && is_wspace_a(*p) ) p++;
// must be content now
if ( strncasecmp(p,"content=",8) ) continue;
// skip that
p += 8;
// skip possible quote
if ( *p == '\"' ) p++;
// PARSE OUT THE URL
Url dummy;
if ( ! setMetaRedirUrlFromTag ( p , &dummy , m_niceness ,cu))
continue;
gotOne = true;
break;
}
if ( ! gotOne )
return &m_metaRedirUrlPtr;
// to fix issue with scripts containing
// document.write('<meta http-equiv="Refresh" content="0;URL=http://ww
// we have to get the Xml. we can't call getXml() because of
// recursion bugs so just do it directly here
Xml xml;
if ( ! xml.set ( m_httpReply ,
m_httpReplySize - 1, // make it a length
false , // ownData?
0 , // allocSize
false , // pure xml?
m_version ,
false , // setParentsArg?
m_niceness ,
// assume html since getContentType() is recursive
// on us.
CT_HTML ) ) // *ct ) )
// return NULL on error with g_errno set
return NULL;
XmlNode *nodes = xml.getNodes();
int32_t n = xml.getNumNodes();
// find the first meta summary node
for ( int32_t i = 0 ; i < n ; i++ ) {
// continue if not a meta tag
if ( nodes[i].m_nodeId != 68 ) continue;
// only get content for <meta http-equiv=..>
int32_t tagLen;
char *tag ;
tag = xml.getString ( i , "http-equiv" , &tagLen );
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// if not a refresh, skip it
if ( strncasecmp ( tag , "refresh", 7 ) ) continue;
// get the content
tag = xml.getString ( i ,"content", &tagLen );
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// PARSE OUT THE URL
if (!setMetaRedirUrlFromTag(p,&m_metaRedirUrl,m_niceness,cu) )
continue;
// set it
m_metaRedirUrlPtr = &m_metaRedirUrl;
// return it
return &m_metaRedirUrlPtr;
}
// nothing found
return &m_metaRedirUrlPtr;
}
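// . determine the charset of the document in "s"
// . precedence: the charset in the http mime, then a unicode BOM if
//   the mime gave nothing, then (if still unknown) a charset= or
//   encoding= attribute found in a <meta>, <xml> or <?xml> tag
// . a claimed utf-8 charset is verified byte by byte; bad sequences
//   knock it back to unknown (or to ISO-Latin-1 on the final pass)
// . a few GB/EUC-KR charsets are aliased so iconv understands them and
//   we fall back to utf-8 if nothing was detected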
uint16_t getCharsetFast ( HttpMime *mime,
char *url,
char *s ,
int32_t slen ,
int32_t niceness ){
int16_t charset = csUnknown;
if ( slen < 0 ) slen = 0;
char *pstart = s;
char *pend = s + slen;
char *cs = mime->getCharset();
int32_t cslen = mime->getCharsetLen();
if ( cslen > 31 ) cslen = 31;
if ( cs && cslen > 0 ) {
char *p2 = cs + cslen ; char c = *p2; *p2 = '\0';
// get it
charset = get_iana_charset ( cs , gbstrlen(cs) );
// restore
*p2 = c;
}
// look for Unicode BOM first though
cs = ucDetectBOM ( pstart , pend - pstart );
if ( cs && charset == csUnknown ) {
log(LOG_DEBUG, "build: Unicode BOM signature detected: %s",cs);
int32_t len = gbstrlen(cs); if ( len > 31 ) len = 31;
charset = get_iana_charset ( cs , len );
}
// prepare to scan doc
char *p = pstart;
// if the doc claims it is utf-8 let's double check because
// newmexicomusic.org says its utf-8 in the mime header and it says
// it is another charset in a meta content tag, and it is NOT in
// utf-8, so don't trust that!
if ( charset == csUTF8 ) {
// loop over every char
for ( char *s = pstart ; s < pend ; s += getUtf8CharSize(s) ) {
// breathe
QUICKPOLL(niceness);
// sanity check
if ( ! isFirstUtf8Char ( s ) ) {
// note it
log(LOG_DEBUG,
"build: mime says UTF8 but does not "
"seem to be for url %s",url);
// reset it back to unknown then
charset = csUnknown;
break;
}
}
}
// do not scan the doc if we already got it set
if ( charset != csUnknown ) p = pend;
//
// it is inefficient to set xml just to get the charset.
// so let's put in some quick string matching for this!
//
// . how big is one char? usually this is 1 unless we are in utf16...
// . if we are in utf16 natively then this code needs to know that and
// set oneChar to 2! TODO!!
//char oneChar = 1;
// advance a bit, we are initially looking for the = sign
if ( p ) p += 10;
// begin the string matching loop
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL ( niceness );
// base everything off the equal sign
if ( *p != '=' ) continue;
// must have a 't' or 'g' before the equal sign
char c = to_lower_a(p[-1]);
// did we match "charset="?
if ( c == 't' ) {
if ( to_lower_a(p[-2]) != 'e' ) continue;
if ( to_lower_a(p[-3]) != 's' ) continue;
if ( to_lower_a(p[-4]) != 'r' ) continue;
if ( to_lower_a(p[-5]) != 'a' ) continue;
if ( to_lower_a(p[-6]) != 'h' ) continue;
if ( to_lower_a(p[-7]) != 'c' ) continue;
}
// did we match "encoding="?
else if ( c == 'g' ) {
if ( to_lower_a(p[-2]) != 'n' ) continue;
if ( to_lower_a(p[-3]) != 'i' ) continue;
if ( to_lower_a(p[-4]) != 'd' ) continue;
if ( to_lower_a(p[-5]) != 'o' ) continue;
if ( to_lower_a(p[-6]) != 'c' ) continue;
if ( to_lower_a(p[-7]) != 'n' ) continue;
if ( to_lower_a(p[-8]) != 'e' ) continue;
}
// if not either, go to next char
else
continue;
// . make sure a <xml or a <meta precedes us
// . do not look back more than 500 chars
char *limit = p - 500;
// assume charset= or encoding= did NOT occur in a tag
bool inTag = false;
// check crazy wrap if m_content was close to a NULL ptr...
if ( limit >= pend ) limit = pstart;
if ( limit < pstart ) limit = pstart;
for ( char *s = p ; s >= limit ; s -= 1 ) { // oneChar ) {
// break at > or <
if ( *s == '>' ) break;
if ( *s != '<' ) continue;
// . TODO: this could be in a quoted string too! fix!!
// . is it in a <meta> tag?
if ( to_lower_a(s[1]) == 'm' &&
to_lower_a(s[2]) == 'e' &&
to_lower_a(s[3]) == 't' &&
to_lower_a(s[4]) == 'a' ) {
inTag = true;
break;
}
// is it in an <xml> tag?
if ( to_lower_a(s[1]) == 'x' &&
to_lower_a(s[2]) == 'm' &&
to_lower_a(s[3]) == 'l' ) {
inTag = true;
break;
}
// is it in an <?xml> tag?
if ( to_lower_a(s[1]) == '?' &&
to_lower_a(s[2]) == 'x' &&
to_lower_a(s[3]) == 'm' &&
to_lower_a(s[4]) == 'l' ) {
inTag = true;
break;
}
}
// if not in a tag proper, it is useless
if ( ! inTag ) continue;
// skip over equal sign
p += 1;//oneChar;
// skip over ' or "
if ( *p == '\'' ) p += 1;//oneChar;
if ( *p == '\"' ) p += 1;//oneChar;
// keep start ptr
char *csString = p;
// set a limit
limit = p + 50;
if ( limit > pend ) limit = pend;
if ( limit < p ) limit = pend;
// stop at first special character
while ( p < limit &&
*p &&
*p !='\"' &&
*p !='\'' &&
! is_wspace_a(*p) &&
*p !='>' &&
*p != '<' &&
*p !='?' &&
*p !='/' &&
// fix yaya.pro-street.us which has
// charset=windows-1251;charset=windows-1"
*p !=';' &&
*p !='\\' )
p += 1;//oneChar;
// save it
char d = *p;
// do the actual NULL termination
*p = 0;
// get the character set
int16_t metaCs = get_iana_charset(csString, gbstrlen(csString));
// put it back
*p = d;
// update "charset" to "metaCs" if known, it overrides all
if (metaCs != csUnknown ) charset = metaCs;
// all done, only if we got a known char set though!
if ( charset != csUnknown ) break;
}
// alias these charsets so iconv understands
if ( charset == csISO58GB231280 ||
charset == csHZGB2312 ||
charset == csGB2312 )
charset = csGB18030;
if ( charset == csEUCKR )
charset = csKSC56011987; //x-windows-949
// use utf8 if still unknown
if ( charset == csUnknown ) {
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"doc: forcing utf8 charset");
charset = csUTF8;
}
// once again, if the doc is claiming utf8 let's double check it!
if ( charset == csUTF8 ) {
// use this for iterating
char size;
// loop over every char
for ( char *s = pstart ; s < pend ; s += size ) {
// breathe
QUICKPOLL(niceness);
// set
size = getUtf8CharSize(s);
// sanity check
if ( ! isFirstUtf8Char ( s ) ) {
// but let 0x80 slide? it is for the
// 0x80 0x99 apostrophe i've seen for
// eventvibe.com. it did have a first byte,
// 0xe2 that led that sequence but it was
// converted into &acirc; by something that
// thought it was a latin1 byte.
if ( s[0] == (char)0x80 &&
s[1] == (char)0x99 ) {
s += 2;
size = 0;
continue;
}
// note it
log(LOG_DEBUG,
"build: says UTF8 (2) but does not "
"seem to be for url %s"
" Resetting to ISOLatin1.",url);
// reset it to ISO then! that's pretty common
// no! was causing problems for
// eventvibe.com/...Yacht because it had
// some messed up utf8 in it but it really
// was utf8. CRAP, but really messes up
// sunsetpromotions.com and washingtonia
// if we do not have this here
charset = csISOLatin1;
break;
}
}
}
// breathe
QUICKPOLL ( niceness );
//char *csName = get_charset_str(charset);
// breathe
//QUICKPOLL ( m_niceness );
// if we are not supported, set m_indexCode
//if ( csName && ! supportedCharset(charset) ) {
// log("build: xml: Unsupported charset: %s", csName);
// g_errno = EBADCHARSET;
// return NULL;
// //charset = csUnknown;
// // i guess do not risk it
// //m_indexCode = EBADCHARSET;
//}
// all done
return charset;
}
uint16_t *XmlDoc::getCharset ( ) {
if ( m_charsetValid ) return &m_charset;
// . get ptr to filtered content
// . we can't get utf8 content yet until we know what charset this
// junk is so we can convert it!
char **fc = getFilteredContent();
if ( ! fc || fc == (void *)-1 ) return (uint16_t *)fc;
// scan document for two things:
// 1. charset= (in a <meta> tag)
// 2. encoding= (in an <?xml> tag)
char *pstart = *fc;
//char *pend = *fc + m_filteredContentLen;
// assume known charset
m_charset = csUnknown;
// make it valid regardless i guess
m_charsetValid = true;
// check in http mime for charset
HttpMime *mime = getMime();
m_charset = getCharsetFast ( mime ,
m_firstUrl.getUrl(),
pstart ,
m_filteredContentLen,
m_niceness );
m_charsetValid = true;
return &m_charset;
}
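// . heuristic check for binary junk that slipped through as text
// . counts single-byte chars that is_binary_a() flags and how many
//   DIFFERENT such byte values occur
// . if we see more than 10 binary bytes spread over 5+ distinct values,
//   or more than 6% of the doc is binary, we zero out ptr_utf8Content
//   and set m_isBinary so we do not index the garbage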
char *XmlDoc::getIsBinary ( ) {
if ( m_isBinaryValid ) return &m_isBinary;
// get the content
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (char *)u8;
//char *ctype = getContentType();
//if ( ! ctype || ctype == (void *)-1 ) return (char *)ctype;
//bool doBinaryCheck = false;
// the "abq-g" query gives a lot of binary content, use that
// as a testbed to make sure we filter it out!
//if ( *ctype == CT_TEXT ) doBinaryCheck = true;
//if ( *ctype == CT_UNKNOWN ) doBinaryCheck = true;
//if ( *ctype == CT_XML ) doBinaryCheck = true;
//if ( *ctype == CT_HTML ) doBinaryCheck = true;
//if ( csEnum == csUnknown ) doBinaryCheck = true;
//if ( csEnum == csASCII ) doBinaryCheck = true;
//if ( csEnum == csISOLatin1 ) doBinaryCheck = true;
//if ( slen <= 0 ) doBinaryCheck = false;
// why shouldn't we binary check everything? now that we are utf8...
//doBinaryCheck = true;
// assume not
m_isBinary = false;
m_isBinaryValid = true;
// if content is not identified as a type known to us, then check it
// for binary characters. yes, this can be utf8 or utf16 and then
// detected as binary i think, but it should really be identified as
// being html or txt or something...
//if ( ! doBinaryCheck ) return &m_isBinary;
// use a table
char table[256];
memset ( table , 0 , 256 );
// see if we had deceitful binary content
char *s = ptr_utf8Content;
char *send = s + size_utf8Content - 1;
// for now just count the binary chars
int32_t count = 0;
// no content?
if ( ! s ) return &m_isBinary;
for ( ; s < send ; s += getUtf8CharSize(s) ) {
// yield
QUICKPOLL(m_niceness);
// skip valid utf8 characters
if ( getUtf8CharSize(s) > 1 ) continue;
// . do not count \0's
// . the fctypes.cpp isBinary array takes into account
// that people mix windows 1254 characters into
// latin-1. windows 1254 is a superset of latin-1.
// so the more common quotes and dashes are no longer
// counted as binary characters, but some of the
// rarer ones are! however, the "diff" count
// constraint helps us make up for that.
// . the first char of a utf8 character sequence always has
// the high bit off, so just test that...
if ( ! is_binary_a(*s) || ! *s ) continue;
// count it up
count++;
table[(unsigned char)*s]++;
}
// how many DIFFERENT bin chars do we have?
int32_t diff = 0;
for ( int32_t i = 0 ; i < 256 ; i++ )
if ( table[i] ) diff++;
// . is binary if more than 10 bin chars and at least 5
// DIFFERENT binary chars
// . is binary if more than 6% of chars are binary
if ( (count > 10 && diff>=5) || ( 100 * count ) / size_utf8Content>6) {
// note it for now
logf(LOG_DEBUG,"build: Got binary content for %s. "
"Zeroing out content. (diff=%"INT32" count=%"INT32" "
"len=%"INT32")",
m_firstUrl.getUrl(),diff,count,size_utf8Content-1);
// do not try to index binary content, but keep it
// around for site: queries or in case we have
// inlink text for it!
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_isBinary = true;
}
return &m_isBinary;
}
// declare these two routines for using threads
static void filterDoneWrapper ( void *state , ThreadEntry *te ) ;
static void *filterStartWrapper_r ( void *state , ThreadEntry *te ) ;
// filters m_content if its pdf, word doc, etc.
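// . html, text, xml, js, json, arc and warc pass through untouched;
//   other non-filterable types (images, css, ...) come back as NULL
// . pdf/doc/xls/ppt/ps are converted by spawning a FILTER_THREAD that
//   runs filterStart_r(), which shells out to pdftohtml, antiword,
//   xlhtml, ppthtml or pstotext
// . the output buffer is capped at maxOtherDocLen or at 5x the raw
//   content length plus 10k, whichever is smaller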
char **XmlDoc::getFilteredContent ( ) {
// return it if we got it already
if ( m_filteredContentValid ) return &m_filteredContent;
// this must be valid
char **content = getContent();
if ( ! content || content == (void *)-1 ) return content;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
// it needs this
HttpMime *mime = getMime();
if ( ! mime || mime == (void *)-1 ) return (char **)mime;
// make sure NULL terminated always
// Why? pdfs can have nulls embedded
// if ( m_content &&
// m_contentValid &&
// m_content[m_contentLen] ) {
// char *xx=NULL;*xx=0; }
int32_t max , max2;
CollectionRec *cr;
bool filterable = false;
if ( m_calledThread ) goto skip;
// assume we do not need filtering by default
m_filteredContent = m_content;
m_filteredContentLen = m_contentLen;
m_filteredContentValid = true;
m_filteredContentAllocSize = 0;
// empty content?
if ( ! m_content ) return &m_filteredContent;
if ( *ct == CT_HTML ) return &m_filteredContent;
if ( *ct == CT_TEXT ) return &m_filteredContent;
if ( *ct == CT_XML ) return &m_filteredContent;
// javascript - sometimes has address information in it, so keep it!
if ( *ct == CT_JS ) return &m_filteredContent;
if ( m_contentLen == 0 ) return &m_filteredContent;
// we now support JSON for diffbot
if ( *ct == CT_JSON ) return &m_filteredContent;
if ( *ct == CT_ARC ) return &m_filteredContent;
if ( *ct == CT_WARC ) return &m_filteredContent;
// unknown content types are 0 since it is probably binary... and
// we do not want to parse it!!
if ( *ct == CT_PDF ) filterable = true;
if ( *ct == CT_DOC ) filterable = true;
if ( *ct == CT_XLS ) filterable = true;
if ( *ct == CT_PPT ) filterable = true;
if ( *ct == CT_PS ) filterable = true;
// if its a jpeg, gif, text/css etc. bail now
if ( ! filterable ) {
m_filteredContent = NULL;
m_filteredContentLen = 0;
m_filteredContentValid = true;
return &m_filteredContent;
}
// invalidate
m_filteredContentValid = false;
cr = getCollRec();
if ( ! cr ) return NULL;
// . if we have no filter specified...
// . usually "gbfilter" and it is a script in the working directory
//if ( ! cr->m_filter[0] ) {
// m_indexCode = EDOCBADCONTENTTYPE;
// return &m_filteredContent;
//}
// if not text/html or text/plain, use the other max
//max = MAXDOCLEN; // cr->m_maxOtherDocLen;
max = cr->m_maxOtherDocLen;
// now we base this on the pre-filtered length to save memory because
// our maxOtherDocLen can be 30M and when we have a lot of injections
// at the same time we lose all our memory quickly
max2 = 5 * m_contentLen + 10*1024;
if ( max > max2 ) max = max2;
// user uses -1 to specify no maxTextDocLen or maxOtherDocLen
if ( max < 0 ) max = max2;
// make a buf to hold filtered reply
m_filteredContentAllocSize = max;
m_filteredContent = (char *)mmalloc(m_filteredContentAllocSize,"xdfc");
if ( ! m_filteredContent ) {
log("build: Could not allocate %"INT32" bytes for call to "
"content filter.",m_filteredContentMaxSize);
return NULL;
}
// breathe
QUICKPOLL ( m_niceness );
// reset this here in case thread gets killed by the kill() call below
m_filteredContentLen = 0;
// update status msg so its visible in the spider gui
setStatus ( "filtering content" );
// reset this... why?
g_errno = 0;
// . call thread to call popen
// . callThread returns true on success, in which case we block
// . do not repeat
m_calledThread = true;
// reset this since filterStart_r() will set it on error
m_errno = 0;
// how can this be? don't core like this in thread, because it
// does not save our files!!
if ( ! m_mimeValid ) { char *xx=NULL;*xx=0; }
// do it
if ( g_threads.call ( FILTER_THREAD ,
MAX_NICENESS ,
this ,
filterDoneWrapper ,
filterStartWrapper_r ) )
// return -1 if blocked
return (char **)-1;
// clear error!
g_errno = 0;
// note it
log("build: Could not spawn thread for call to "
"content filter.");
// get the data
filterStart_r ( false ); // am thread?
// skip down here if thread has returned and we got re-called
skip:
// if size is 0, free the buf
if ( m_filteredContentLen <= 0 ) {
mfree ( m_filteredContent ,
m_filteredContentAllocSize,"fcas");
m_filteredContent = NULL;
m_filteredContentLen = 0;
m_filteredContentAllocSize = 0;
}
// did we have an error from the thread?
if ( m_errno ) g_errno = m_errno;
// but bail out if it set g_errno
if ( g_errno ) return NULL;
// must be valid now - sanity check
if ( ! m_filteredContentValid ) { char *xx=NULL;*xx=0; }
// return it
return &m_filteredContent;
}
// come back here
void filterDoneWrapper ( void *state , ThreadEntry *te ) {
// jump back into the brawl
XmlDoc *THIS = (XmlDoc *)state;
// if size is 0, free the buf. have to do this outside the thread
// since malloc/free cannot be called in thread
if ( THIS->m_filteredContentLen <= 0 ) {
mfree ( THIS->m_filteredContent ,
THIS->m_filteredContentAllocSize,"fcas");
THIS->m_filteredContent = NULL;
THIS->m_filteredContentLen = 0;
THIS->m_filteredContentAllocSize = 0;
}
// . call the master callback
// . it will ultimately re-call getFilteredContent()
THIS->m_masterLoop ( THIS->m_masterState );
}
// thread starts here
void *filterStartWrapper_r ( void *state , ThreadEntry *te ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->filterStart_r ( true ); // am thread?
return NULL;
}
//int my_system_r ( char *cmd , int32_t timeout ) ;
// sets m_errno on error
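// . runs inside the filter thread (or inline if the thread could not
//   be spawned)
// . writes m_content to <workdir>in.<tid>, runs the external converter
//   with ulimit/timeout guards via gbsystem(), then reads
//   <workdir>out.<tid> back into m_filteredContent
// . e.g. for a pdf the command looks like:
//   ulimit -v 25000 ; ulimit -t 30 ; timeout 30s nice -n 19
//   <workdir>/pdftohtml -q -i -noframes -stdout <in> > <out>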
void XmlDoc::filterStart_r ( bool amThread ) {
// get thread id
pthread_t id = getpidtid();
// sanity check
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
// shortcut
int32_t ctype = m_contentType;
// assume none
m_filteredContentLen = 0;
//if ( amThread ) id = pthread_self();
//else id = getpid();
// pass the input to the program through this file
// rather than a pipe, since popen() seems broken
char in[1024];
snprintf(in,1023,"%sin.%"INT64"", g_hostdb.m_dir , (int64_t)id );
unlink ( in );
// collect the output from the filter from this file
char out[1024];
snprintf ( out , 1023,"%sout.%"INT64"", g_hostdb.m_dir, (int64_t)id );
unlink ( out );
// ignore errno from those unlinks
errno = 0;
// open the input file
retry11:
int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry11;
m_errno = errno;
log("build: Could not open file %s for writing: %s.",
in,mstrerror(m_errno));
return;
}
// we are in a thread, this must be valid!
if ( ! m_mimeValid ) { char *xx=NULL;*xx=0;}
retry12:
// write the content into the input file
int32_t w = write ( fd , m_content , m_contentLen );
// valgrind
if ( w < 0 && errno == EINTR ) goto retry12;
// did we get an error
if ( w != m_contentLen ) {
//int32_t w = fwrite ( m_buf , 1 , m_bufLen , pd );
//if ( w != m_bufLen ) {
m_errno = errno;
log("build: Error writing to %s: %s.",in,
mstrerror(m_errno));
close(fd);
return;
}
// close the file
close ( fd );
// shortcut
char *wdir = g_hostdb.m_dir;
// . open a pipe to pdf2html program
// . the output will go to stdout
char cmd[2048];
// different commands to filter different ctypes
// -i : ignore images
// -stdout: send output to stdout
// -c : generate complex document
// Google generates complex docs, but the large ones are horribly slow
// in the browser, but docs with 2 cols don't display right w/o -c.
// damn, -stdout doesn't work when -c is specified.
// These ulimit sizes are max virtual memory in kilobytes. let's
// keep them to 25 Megabytes
if ( ctype == CT_PDF )
snprintf(cmd,2047 ,"ulimit -v 25000 ; ulimit -t 30 ; timeout 30s nice -n 19 %s/pdftohtml -q -i -noframes -stdout %s > %s", wdir , in ,out );
else if ( ctype == CT_DOC )
// "wdir" include trailing '/'? not sure
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=%s/antiword-dir ; timeout 30s nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
else if ( ctype == CT_XLS )
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/xlhtml %s > %s" , wdir , in , out );
// this is too buggy for now... causes hanging threads because it
// hangs, so i added 'timeout 10s' but that only works on newer
// linux version, so it'll just error out otherwise.
else if ( ctype == CT_PPT )
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/ppthtml %s > %s" , wdir , in , out );
else if ( ctype == CT_PS )
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30; timeout 10s nice -n 19 %s/pstotext %s > %s" , wdir , in , out );
else { char *xx=NULL;*xx=0; }
// breach sanity check
//if ( gbstrlen(cmd) > 2040 ) { char *xx=NULL;*xx=0; }
// execute it
int retVal = gbsystem ( cmd );
if ( retVal == -1 )
log("gb: system(%s) : %s",
cmd,mstrerror(g_errno));
// all done with input file
// clean up the binary input file from disk
if ( unlink ( in ) != 0 ) {
// log error
log("gbfilter: unlink (%s): %s\n",in, strerror(errno));
// ignore it, since it was not a processing error per se
errno = 0;
}
// don't use too much memory, i think xhtml uses so much that it
// swaps out all the gb processes?
//struct rlimit lim;
//lim.rlim_cur = lim.rlim_max = 24 * 1024 * 1024 ;
//if ( setrlimit ( RLIMIT_AS , &lim ) )
// fprintf (stderr,"gbfilter:setrlimit: %s", strerror(errno) );
retry13:
fd = open ( out , O_RDONLY );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry13;
m_errno = errno;
log("gbfilter: Could not open file %s for reading: %s.",
out,mstrerror(m_errno));
return;
}
// sanity -- need room to store a \0
if ( m_filteredContentAllocSize < 2 ) { char *xx=NULL;*xx=0; }
// to read - leave room for \0
int32_t toRead = m_filteredContentAllocSize - 1;
retry14:
// read right from pipe descriptor
int32_t r = read (fd, m_filteredContent,toRead);
// note errors
if ( r < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry14;
log("gbfilter: reading output: %s",mstrerror(errno));
// this is often bad fd from an oom error, so ignore it
//m_errno = errno;
errno = 0;
r = 0;
}
// clean up shop
close ( fd );
// delete output file
unlink ( out );
// validate now
m_filteredContentValid = 1;
// save the new buf len
m_filteredContentLen = r;
// ensure enough room for null term
if ( r >= m_filteredContentAllocSize ) { char *xx=NULL;*xx=0; }
// ensure filtered stuff is NULL terminated so we can set the Xml class
m_filteredContent [ m_filteredContentLen ] = '\0';
// it is good
m_filteredContentValid = true;
// . at this point we got the filtered content
// . bitch if we didn't allocate enough space
if ( r > 0 && r == toRead )
log(LOG_LOGIC,"build: Had to truncate document to %"INT32" bytes "
"because did not allocate enough space for filter. "
"This should never happen. It is a hack that should be "
"fixed right.", toRead );
// if we got something, then we're done
//if ( r > 0 ) return;
// otherwise, free it up
// . NO! not in a thread!!
//mfree ( m_filteredContent , m_filteredContentAllocSize, "fcas" );
//m_filteredContent = NULL;
//m_filteredContentLen = 0;
//m_filteredContentAllocSize = 0;
}
pid_t g_pid = -1;
int32_t g_ticker = 0;
int32_t g_filterTimeout = -1;
/*
static int startUp ( void *cmd ) ;
#include <sys/types.h> // waitpid()
#include <sys/wait.h> // waitpid()
#include <sched.h> // clone()
static char cloneStack[250000];
int my_system_r ( char *cmd , int32_t timeout ) {
// bail if cmd empty
if ( ! cmd ) {
log(LOG_LOGIC,"build: empty command.");
return -1;
}
errno = 0;
// this gives out of memory on newer kernels, was that causing our
// older kernerls to crash, too, in addition to the e1000 driver?
//pid_t pid = fork();
// let's use clone() instead now
// error forking?
pid_t pid = clone ( startUp ,
cloneStack + 250000 ,
CLONE_FS | CLONE_FILES | CLONE_VM | SIGCHLD ,
cmd );
if (pid == -1) {
log("build: fork: %s.",mstrerror(errno));
return -1;
}
// sanity check
if ( g_pid != -1 ) { char *xx = NULL; *xx = 0; }
// set the process group id of this guy to itself, so he becomes
// the process leader, so any processes he spawns should all receive
// the same HUP or kill signals he receives. uhhhh probably not...
//setpgid ( pid , pid );
// save the pid globally so Threads.cpp can kill(9,g_pid) it if it
// stalls too long. but to measure how long it is out for, keep a
// ticker count. this ticker count is incremented in the sleep wrapper
// in Threads.cpp.
g_ticker = 0;
g_pid = pid;
g_filterTimeout = timeout;
loop:
int status;
if ( waitpid ( pid , &status , 0 ) == -1 ) {
// reset g_pid so Threads.cpp's kill wrapper chills out
if ( errno != EINTR ) {
log("build: waitpid pid=%"INT32": %s.",
(int32_t)g_pid,mstrerror(errno));
g_pid = -1;
return -1;
}
// if we got interrupted by a different signal keep waiting
goto loop;
}
// reset g_pid so Threads.cpp's kill wrapper chills out
g_pid = -1;
if ( status < 0 ) log("build: Got bad status from child.");
// we got the signal
return status;
}
int startUp ( void *cmd ) {
char *argv[4];
argv[0] = "sh";
argv[1] = "-c";
argv[2] = (char *)cmd;
argv[3] = 0;
char *envp[2];
char buf[1024];
// antiword needs this environment var so it can find
// the .antiword/ dir , we should put it in gb's working dir
snprintf(buf,1023,"HOME=%s", g_hostdb.m_dir );
envp[0] = buf;
envp[1] = 0;
execve("/bin/sh", argv, envp );
//exit(127);
return 1;
}
*/
// return downloaded content as utf8
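// . converts m_filteredContent from the charset getCharset() detected
//   into utf-8 using ucToUtf8(), budgeting roughly 2x the input plus 4k
// . unsupported charsets and iconv "bad charset" errors just yield an
//   empty doc instead of an indexing error
// . embedded \0 bytes are turned into spaces and the utf-8 is walked
//   once so a truncated multi-byte char cannot make later parsers
//   overrun the buffer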
char **XmlDoc::getRawUtf8Content ( ) {
// if we already computed it, return that
if ( m_rawUtf8ContentValid ) return &m_rawUtf8Content;
// . get our characterset
// . crap! this can be recursive. it calls getXml() which calls
// getUtf8Content() which is us!
uint16_t *charset = getCharset ( );
if ( ! charset || charset == (uint16_t *)-1 ) return (char **)charset;
char *csName = get_charset_str(*charset);
// . if not supported fix that!
// . m_indexCode should be set to EBADCHARSET ultimately, but not here
if ( ! supportedCharset(*charset) && csName ) {
m_rawUtf8Content = NULL;
m_rawUtf8ContentSize = 0;
m_rawUtf8ContentAllocSize = 0;
m_rawUtf8ContentValid = true;
return &m_rawUtf8Content;
}
// get ptr to filtered content
char **fc = getFilteredContent();
if ( ! fc || fc == (void *)-1 ) return (char **)fc;
// make sure NULL terminated always
if ( m_filteredContent &&
m_filteredContentValid &&
m_filteredContent[m_filteredContentLen] ) {
char *xx=NULL;*xx=0; }
// NULL out if no content
if ( ! m_filteredContent ) {
m_rawUtf8Content = NULL;
m_rawUtf8ContentSize = 0;
m_rawUtf8ContentAllocSize = 0;
m_rawUtf8ContentValid = true;
return &m_rawUtf8Content;
}
// assume already utf8
m_rawUtf8Content = m_filteredContent;
m_rawUtf8ContentSize = m_filteredContentLen + 1;
m_rawUtf8ContentAllocSize = 0;
// if we are not ascii or utf8 already, encode it into utf8
if ( m_rawUtf8ContentSize > 1 &&
csName &&
*charset != csASCII &&
*charset != csUTF8 ) {
// ok, no-go
//ptr_utf8Content = NULL;
m_rawUtf8Content = NULL;
// assume utf8 will be twice the size ... then add a little
int32_t need = (m_filteredContentLen * 2) + 4096;
char *buf = (char *) mmalloc(need, "Xml3");
// log oom error
if ( ! buf ) {
log("build: xml: not enough memory for utf8 buffer");
return NULL;
}
// sanity check
if ( ! csName ) { char *xx=NULL;*xx=0; }
// note it
setStatus ( "converting doc to utf8" );
// returns # of bytes i guess
int32_t used = ucToUtf8 ( buf ,
// fix core dump by subtracting 10!
need - 10,
m_filteredContent ,
m_filteredContentLen ,
csName ,
-1 ,//allowBadChars
m_niceness );
// clear this if successful, otherwise, it sets errno
if ( used > 0 ) g_errno = 0;
// unrecoverable error? bad charset is g_errno == 7
// which is like argument list too long or something
// error from Unicode.cpp's call to iconv()
if ( g_errno )
log(LOG_INFO, "build: xml: failed parsing buffer: %s "
"(cs=%d)", mstrerror(g_errno), *charset);
if ( g_errno && g_errno != 7 ) {
mfree ( buf, need, "Xml3");
// do not index this doc, delete from spiderdb/tfndb
//if ( g_errno != ENOMEM ) m_indexCode = g_errno;
// if conversion failed NOT because of bad charset
// then return NULL now and bail out. probably ENOMEM
return NULL;
}
// if bad charset... just make doc empty as a utf8 doc
if ( g_errno == 7 ) {
used = 0;
buf[0] = '\0';
buf[1] = '\0';
// clear g_errno
g_errno = 0;
// and make a note for getIndexCode() so it will not
// bother indexing the doc! nah, just index it
// but with no content...
}
// crazy? this is pretty important...
if ( used + 10 >= need )
log("build: utf8 using too much buf space!!! u=%s",
getFirstUrl()->getUrl());
// re-assign
//ptr_utf8Content = buf;
//size_utf8Content = used + 1;
//m_utf8ContentAllocSize = need;
m_rawUtf8Content = buf;
m_rawUtf8ContentSize = used + 1;
m_rawUtf8ContentAllocSize = need;
}
// convert \0's to spaces. why do we see these in some pages?
// http://www.golflink.com/golf-courses/ has one in the middle after
// about 32k of content.
char *p = m_rawUtf8Content;
char *pend = p + m_rawUtf8ContentSize - 1;
for ( ; p < pend ; p++ ) {
QUICKPOLL(m_niceness);
if ( ! *p ) *p = ' ';
}
//
// VALIDATE the UTF-8
//
// . make a buffer to hold the decoded content now
// . we were just using the m_expandedUtf8Content buf itself, but "n"
// ended up equalling m_expandedUtf8ContentSize one time for a
// doc, http://ediso.net/, which probably had corrupt utf8 in it,
// and that breached our buffer! so verify that this is good
// utf8, and that we can parse it without breaching our buffer!
p = m_rawUtf8Content;
// make sure NULL terminated always
if ( p[m_rawUtf8ContentSize-1]) { char *xx=NULL;*xx=0;}
// make sure we don't breach the buffer when parsing it
char size;
char *lastp = NULL;
for ( ; ; p += size ) {
QUICKPOLL(m_niceness);
if ( p >= pend ) break;
lastp = p;
size = getUtf8CharSize(p);
}
// overflow?
if ( p > pend && lastp ) {
// back up to the bad utf8 char that made us overshoot
p = lastp;
// space it out
for ( ; p < pend ; p++ ) *p = ' ';
// log it; maybe because we are not a keepalive http server?
log("doc: fix bad utf8 overflow (because we are not "
"keepalive?) in doc %s",m_firstUrl.m_url);
}
// overflow?
if ( p != pend ) { char *xx=NULL;*xx=0; }
// sanity check for breach. or underrun in case we encountered a
// premature \0
if (p-m_rawUtf8Content!=m_rawUtf8ContentSize-1) {char*xx=NULL;*xx=0;}
// sanity -- must be \0 terminated
if ( m_rawUtf8Content[m_rawUtf8ContentSize-1] ) {char *xx=NULL;*xx=0; }
// it might have shrunk us
//m_rawUtf8ContentSize = n + 1;
// we are good to go
m_rawUtf8ContentValid = true;
//return &ptr_utf8Content;
return &m_rawUtf8Content;
}
// this is so Msg13.cpp can call getExpandedUtf8Content() to do its
// iframe expansion logic
void getExpandedUtf8ContentWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
char **retVal = THIS->getExpandedUtf8Content();
// return if blocked again
if ( retVal == (void *)-1 ) return;
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
// now if there are any <iframe> tags let's substitute them for
// the html source they represent here. that way we will get all the
// information you see on the page. this is somewhat critical since
// a lot of pages have their content in the frame.
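// . each <frame>/<iframe> src (except empty, about:blank, non-http,
//   google/bing and self references, or tags inside <script>) is
//   fetched as an extra doc and its utf-8 content is spliced in right
//   after the original tag, wrapped in <gbframe>...</gbframe> (plus
//   <script> tags if the frame content is javascript) so Sections.cpp
//   can tell it apart
// . e.g. <iframe src="/nav.html"> is kept and immediately followed by
//   <gbframe>...the fetched /nav.html content...</gbframe>
// . at most 5 frames are expanded per doc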
char **XmlDoc::getExpandedUtf8Content ( ) {
// if we already computed it, return that
if ( m_expandedUtf8ContentValid ) return &m_expandedUtf8Content;
// if called from spider compression proxy we need to set
// masterLoop here now
if ( ! m_masterLoop ) {
m_masterLoop = getExpandedUtf8ContentWrapper;
m_masterState = this;
}
// get the unexpanded content first
char **up = getRawUtf8Content ();
if ( ! up || up == (void *)-1 ) return up;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (char **)cu;
// NULL out if no content
if ( ! *up ) {
m_expandedUtf8Content = NULL;
m_expandedUtf8ContentSize = 0;
m_expandedUtf8ContentValid = true;
return &m_expandedUtf8Content;
}
// do not do iframe expansion in order to keep injections fast
if ( m_wasContentInjected ) {
m_expandedUtf8Content = m_rawUtf8Content;
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
m_expandedUtf8ContentValid = true;
return &m_expandedUtf8Content;
}
bool skip = m_skipIframeExpansion;
// if we are a warc, arc or doc that consists of a sequence of
// sub-docs that we are indexing/injecting then skip iframe expansion
if ( isContainerDoc() )
skip = true;
// or if this is set to true
if ( skip ) {
m_expandedUtf8Content = m_rawUtf8Content;
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
m_expandedUtf8ContentValid = true;
return &m_expandedUtf8Content;
}
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (char **)ct;
// if we have a json reply, leave it alone... do not expand iframes
// in json, it will mess up the json
if ( *ct == CT_JSON ) {
m_expandedUtf8Content = m_rawUtf8Content;
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
m_expandedUtf8ContentValid = true;
return &m_expandedUtf8Content;
}
// we need this so getExtraDoc does not core
int32_t *pfip = getFirstIp();
if ( ! pfip || pfip == (void *)-1 ) return (char **)pfip;
// point to it
char *p = *up;
char *pend = *up + m_rawUtf8ContentSize; // includes \0
// declare crap up here so we can jump into the for loop
int32_t urlLen;
char *url;
char *fend;
Url furl;
XmlDoc **ped;
XmlDoc *ed;
bool inScript = false;
bool match;
// assign saved value if we got that
if ( m_savedp ) {
// restore "p"
p = m_savedp;
// update this
ed = m_extraDoc;
// and see if we got the mime now
goto gotMime;
}
// now loop for frame and iframe tags
for ( ; p < pend ; p += getUtf8CharSize(p) ) {
// breathe
QUICKPOLL(m_niceness);
// if never found a frame tag, just keep on chugging
if ( *p != '<' ) continue;
// <script>?
if ( to_lower_a(p[1]) == 's' &&
to_lower_a(p[2]) == 'c' &&
to_lower_a(p[3]) == 'r' &&
to_lower_a(p[4]) == 'i' &&
to_lower_a(p[5]) == 'p' &&
to_lower_a(p[6]) == 't' )
inScript = 1;
// </script>?
if ( p[1]=='/' &&
to_lower_a(p[2]) == 's' &&
to_lower_a(p[3]) == 'c' &&
to_lower_a(p[4]) == 'r' &&
to_lower_a(p[5]) == 'i' &&
to_lower_a(p[6]) == 'p' &&
to_lower_a(p[7]) == 't' )
inScript = 0;
// . skip if in script
// . fixes guysndollsllc.com which has an iframe tag in
// a script section, "document.write ('<iframe..."
if ( inScript ) continue;
// iframe or frame?
match = false;
if ( to_lower_a(p[1]) == 'f' &&
to_lower_a(p[2]) == 'r' &&
to_lower_a(p[3]) == 'a' &&
to_lower_a(p[4]) == 'm' &&
to_lower_a(p[5]) == 'e' )
match = true;
if ( to_lower_a(p[1]) == 'i' &&
to_lower_a(p[2]) == 'f' &&
to_lower_a(p[3]) == 'r' &&
to_lower_a(p[4]) == 'a' &&
to_lower_a(p[5]) == 'm' &&
to_lower_a(p[6]) == 'e' )
match = true;
// skip tag if not iframe or frame
if ( ! match ) continue;
// check for frame or iframe
//if ( strncasecmp(p+1,"frame " , 6) &&
// strncasecmp(p+1,"iframe ", 7) )
// continue;
// get src tag (function in Words.h)
url = getFieldValue ( p , pend - p ,"src" , &urlLen );
// needs a src field
if ( ! url ) continue;
// "" is not acceptable either. techcrunch.com has
// <iframe src=""> which ends up embedding the root url.
if ( urlLen == 0 )
continue;
// skip if "about:blank"
if ( urlLen==11 && strncmp(url,"about:blank",11) == 0 )
continue;
// get our current url
//cu = getCurrentUrl();
// set our frame url
furl.set ( cu , url , urlLen );
// no recursion
if ( strcmp(furl.getUrl(),m_firstUrl.getUrl()) == 0 )
continue;
// must be http or https, not ftp! ftp was causing us to
// core in Msg22.cpp where it checks the url's protocol
// when trying to lookup the old title rec.
// http://sweetaub.ipower.com/ had an iframe with a ftp url.
if ( ! furl.isHttp() && ! furl.isHttps() ) continue;
// ignore google.com/ assholes for now
if ( strstr(furl.getUrl(),"google.com/" ) ) continue;
// and bing just to be safe
if ( strstr(furl.getUrl(),"bing.com/" ) ) continue;
// save it in case we have to return and come back later
m_savedp = p;
// break here
//log("mdw: breakpoing here");
// . download that. get as a doc. use 0 for max cache time
// . no, use 5 seconds since we often have the same iframe
// in the root doc that we have in the main doc, like a
// facebook iframe or something.
// . use a m_maxCacheAge of 5 seconds now!
ped = getExtraDoc ( furl.m_url , 5 );
// should never block
if ( ! ped ) {
log("xmldoc: getExpandedutf8content = %s",
mstrerror(g_errno));
return NULL;
}
// . return -1 if it blocked???
// . no, this is not supported right now
// . it will mess up our for loop
if ( ped == (void *)-1 ) {char *xx=NULL;*xx=0;}
// cast it
ed = *ped;
// sanity
if ( ! ed ) { char *xx=NULL;*xx=0; }
// jump in here from above
gotMime:
// make it not use the ips.txt cache
//ed->m_useIpsTxtFile = false;
//ed->m_readFromTestCache = false;
// get the mime
HttpMime *mime = ed->getMime();
if ( ! mime || mime == (void *)-1 ) return (char **)mime;
// if not success, do not expand it i guess...
if ( mime->getHttpStatus() != 200 ) {
// free it
nukeDoc ( ed );
// and continue
continue;
}
// update m_downloadEndTime if we should
if ( ed->m_downloadEndTimeValid ) {
// we must already be valid
if ( ! m_downloadEndTimeValid ) {char *xx=NULL;*xx=0;}
// only replace it if it had ip and robots.txt allowed
if ( ed->m_downloadEndTime )
m_downloadEndTime = ed->m_downloadEndTime;
}
// re-write that extra doc into the content
char **puc = ed->getRawUtf8Content();
// this should not block
//if ( puc == (void *)-1 ) { char *xx=NULL;*xx=0; }
// it blocked before! because the charset was not known!
if ( puc == (void *)-1 ) return (char **)puc;
// error?
if ( ! puc ) return (char **)puc;
// cast it
char *uc = *puc;
// or if no content, and no mime (like if robots.txt disallows)
if ( ! uc || ed->m_rawUtf8ContentSize == 1 ) {
// free it
nukeDoc ( ed );
// and continue
continue;
}
// size includes terminating \0
if ( uc[ed->m_rawUtf8ContentSize-1] ) { char *xx=NULL;*xx=0;}
// if first time we are expanding, set this
if ( ! m_oldp ) m_oldp = *up;
// find end of frame tag
fend = p;
for ( ; fend < pend ; fend += getUtf8CharSize(fend) ) {
// breathe
QUICKPOLL(m_niceness);
// if never found a frame tag, just keep on chugging
if ( *fend == '>' ) break;
}
// if no end to the iframe tag was found, bail then...
if ( fend >= pend ) continue;
// skip the >
fend++;
// insert the non-frame crap first AND the frame/iframe tag
m_esbuf.safeMemcpy ( m_oldp , fend - m_oldp );
// end the frame
//m_esbuf.safeMemcpy ( "</iframe>", 9 );
// use our own special tag so Sections.cpp can set
// Section::m_gbFrameNum which it uses internally
m_esbuf.safePrintf("<gbframe>"); // gbiframe
// identify javascript
bool javascript = false;
if ( *ed->getContentType() == CT_JS ) javascript = true;
// so we do not mine javascript for cities and states etc.
// in Address.cpp
if ( javascript ) m_esbuf.safePrintf("<script>");
// store that
m_esbuf.safeMemcpy ( uc , ed->m_rawUtf8ContentSize - 1 );
// our special tag has an end tag as well
if ( javascript ) m_esbuf.safePrintf("</script>");
m_esbuf.safePrintf("</gbframe>");
// free up ed
nukeDoc ( ed );
// end of frame tag, skip over whole thing
m_oldp = fend ;
// sanity check
if ( m_oldp > pend ) { char *xx=NULL;*xx=0; }
// another flag
m_didExpansion = true;
// count how many we did
if ( ++m_numExpansions >= 5 ) break;
}
// default
m_expandedUtf8Content = m_rawUtf8Content;
m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
// point to expansion buffer if we did any expanding
if ( m_didExpansion ) {
// copy over the rest
m_esbuf.safeMemcpy ( m_oldp , pend - m_oldp );
// null term it
m_esbuf.pushChar('\0');
// and point to that buffer
m_expandedUtf8Content = m_esbuf.getBufStart();//m_buf;
// include the \0 as part of the size
m_expandedUtf8ContentSize = m_esbuf.m_length; // + 1;
}
// sanity -- must be \0 terminated
if ( m_expandedUtf8Content[m_expandedUtf8ContentSize-1] ) {
char *xx=NULL;*xx=0; }
m_expandedUtf8ContentValid = true;
return &m_expandedUtf8Content;
}
static SafeBuf s_cookieBuf;
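// . thread entry that wgets m_firstUrl (passing the archive.org cookie
//   in s_cookieBuf) into <workdir>gbarchivefile<ptr>.gz and gunzips it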
void *systemStartWrapper_r ( void *state , ThreadEntry *t ) {
XmlDoc *THIS = (XmlDoc *)state;
char filename[2048];
snprintf(filename,2048,"%sgbarchivefile%"UINT32".gz",
g_hostdb.m_dir,
(int32_t)(int64_t)THIS);
char cmd[MAX_URL_LEN+256];
snprintf( cmd,
MAX_URL_LEN+256,
"wget -q --header=\"Cookie: %s\" \"%s\" -O %s" ,
s_cookieBuf.getBufStart() ,
THIS->m_firstUrl.getUrl() ,
filename );
log("build: wget: %s",cmd );
int ret;
ret = system(cmd);
if ( ret == -1 )
log("build: wget system failed: %s",mstrerror(errno));
else
log("build: wget system returned %"INT32"",ret);
// unzip it now
snprintf ( cmd , MAX_URL_LEN+256, "gunzip -f %s" , filename );
log("build: wget begin: %s",cmd );
ret = system(cmd);
if ( ret == -1 )
log("build: gunzip system failed: %s",mstrerror(errno));
else
log("build: gunzip system returned %"INT32"",ret);
log("build: done with gunzip");
return NULL;
}
// come back here
void systemDoneWrapper ( void *state , ThreadEntry *t ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_masterLoop ( THIS->m_masterState );
}
// we download large files to a file on disk, like warcs and arcs
FILE *XmlDoc::getUtf8ContentInFile () {
setStatus ("wgetting archive file");
// if ( m_calledWgetThread ) {
// char filename[2048];
// snprintf ( filename,
// 2048,
// "gbarchivefile%"UINT32"",
// (int32_t)(int64_t)this);
// m_file.set ( g_hostdb.m_dir , filename );
// m_fileSize = m_file.getFileSize();
// m_fileValid = true;
// *fileSizeArg = m_fileSize;
// m_file.open(O_RDONLY);
// // explicitly set it to false now to make it harder for
// // it not to be true because that messes things up
// m_file.m_usePartFiles = false;
// return &m_file;
// }
// before calling the system wget thread we gotta set the cookiebuf
// HACK: for archive.org
// if getting a page from archive.org then append the cookie
// so we have the proper permissions
static bool s_triedToLoadCookie = false;
char *x = m_firstUrl.getUrl();
	// only scan the first 25 chars looking for the start of .archive.org/
char *xend = x + 25;
bool isArchiveOrg = false;
for ( ; x < xend && *x ; x++ ) {
if ( x[ 0] != '.' && x[0] != '/' ) continue; // /archive.org?
if ( x[ 1] != 'a' ) continue;
if ( x[ 2] != 'r' ) continue;
if ( x[ 3] != 'c' ) continue;
if ( x[ 4] != 'h' ) continue;
if ( x[ 5] != 'i' ) continue;
if ( x[ 6] != 'v' ) continue;
if ( x[ 7] != 'e' ) continue;
if ( x[ 8] != '.' ) continue;
if ( x[ 9] != 'o' ) continue;
if ( x[10] != 'r' ) continue;
if ( x[11] != 'g' ) continue;
if ( x[12] != '/' ) continue;
isArchiveOrg = true;
break;
}
if ( isArchiveOrg && ! s_triedToLoadCookie ) {
// try to load it up if haven't tried yet
s_triedToLoadCookie = true;
SafeBuf tmp;
//int32_t loaded = tmp.load ( "/home/mwells/.config/internetarchive.yml");
int32_t loaded = tmp.load ( "auth/internetarchive.yml");
if(loaded <= 0) {
if ( ! g_errno ) g_errno = EDOCTOOBIG;
log("gb: failed to load auth/internetarchive.yml: "
"%s",mstrerror(g_errno));
// do not restart gb in a loop, so return 0 to shell
exit(0);
//return NULL;
// FIXME
char *xx=NULL;*xx=0;
}
char *s = tmp.getBufStart();
char *line;
		char *lineEnd = NULL;
line = strstr ( s , "logged-in-user: " );
if ( line ) lineEnd = strstr(line,"\n");
if ( lineEnd ) {
s_cookieBuf.safePrintf("logged-in-user=");
line += 16;
s_cookieBuf.safeMemcpy(line,lineEnd-line);
s_cookieBuf.pushChar(';');
s_cookieBuf.pushChar(' ');
s_cookieBuf.nullTerm();
}
line = strstr ( s , "logged-in-sig: " );
if ( line ) lineEnd = strstr(line,"\n");
if ( lineEnd ) {
s_cookieBuf.safePrintf("logged-in-sig=");
line += 15;
s_cookieBuf.safeMemcpy(line,lineEnd-line);
//s_cookieBuf.pushChar(';');
//s_cookieBuf.pushChar(' ');
s_cookieBuf.nullTerm();
}
}
// if we loaded something use it
if ( isArchiveOrg && s_cookieBuf.length() ) {
//cookie = s_cookieBuf.getBufStart();
log("http: using archive cookie %s",s_cookieBuf.getBufStart());
// and set user-agent too
// userAgent = "python-requests/2.3.0 "
// "CPython/2.7.3 Linux/3.5.0-32-generic";
}
char cmd[MAX_URL_LEN+256];
snprintf( cmd,
MAX_URL_LEN+256,
"set -o pipefail|"
"wget --limit-rate=10M -O- --header=\"Cookie: %s\" \"%s\"|" //
"zcat|"
"mbuffer -t -m 10M -o-", //this is useful but we need a new version of mbuffer -W 30
s_cookieBuf.getBufStart() ,
m_firstUrl.getUrl());
log("build: wget: %s",cmd );
FILE* fh = gbpopen(cmd);
int fd = fileno(fh);
int flags = fcntl(fd, F_GETFL, 0);
if(fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
log("build: could not set wget stream to nonblocking %s",
m_firstUrl.getUrl());
//error
}
if(!g_loop.registerReadCallback ( fd,
this ,
doneReadingArchiveFileWrapper,
m_niceness )) {
log("build: failed to register warc read callback." );
return NULL;
}
m_registeredWgetReadCallback = true;
log("build: called popen");
m_calledWgetThread = true;
m_hasMoreToRead = true;
return fh;
// return getUtf8ContentInFile ( fileSizeArg );
// . callThread returns true on success, in which case we block
// if ( g_threads.call ( FILTER_THREAD ,
// MAX_NICENESS ,
// (void *)this , // this
// systemDoneWrapper ,
// systemStartWrapper_r ) )
// // would block, wait for thread
// return (BigFile *)-1;
// // failed?
// log("build: failed to launch wget thread");
// If we run it in this thread then if we are fetching
// a local url it will block forever.
// systemStartWrapper_r(this,NULL);
// return getUtf8ContentInFile ( fileSizeArg );
//g_errno = ETHREADSDISABLED;
//return NULL;
}
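// Illustrative sketch (an assumption about the callback, not the actual
// implementation): with the popen'd wget|zcat|mbuffer pipeline set to
// O_NONBLOCK and registered with g_loop above, the read side can drain
// the fd roughly like this, returning on EAGAIN and resuming when the
// next read callback fires:
//
//	char buf[64*1024];
//	int n = read ( fd , buf , sizeof(buf) );
//	if      ( n >  0 ) { /* append to the warc/arc parse buffer */ }
//	else if ( n == 0 ) { /* pipeline done; clear m_hasMoreToRead */ }
//	else if ( errno == EAGAIN ) { /* no data yet; wait for callback */ }
//	else               { /* real read error */ }
//
// The real logic lives in doneReadingArchiveFileWrapper() and the warc/arc
// parsing code, not here.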
// . get the final utf8 content of the document
// . all html entities are replaced with utf8 chars
// . all iframes are expanded
// . if we are using diffbot then getting the utf8 content should return
// the json which is the output from the diffbot api. UNLESS we are getting
// the webpage itself for harvesting outlinks to spider later.
char **XmlDoc::getUtf8Content ( ) {
// if we already computed it, return that
if ( m_utf8ContentValid ) return &ptr_utf8Content;
if ( m_setFromTitleRec ) {
m_utf8ContentValid = true;
return &ptr_utf8Content;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
setStatus("getting utf8 content");
// recycle?
if ( cr->m_recycleContent || m_recycleContent ||
// if trying to delete from index, load from old titlerec
m_deleteFromIndex ) {
// get the old xml doc from the old title rec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (void *)-1 ) return (char **)pod;
		// shortcut
XmlDoc *od = *pod;
// this is non-NULL if it existed
if ( od ) {
ptr_utf8Content = od-> ptr_utf8Content;
size_utf8Content = od->size_utf8Content;
m_utf8ContentValid = true;
m_contentType = od->m_contentType;
m_contentTypeValid = true;
// sanity check
if ( ptr_utf8Content &&
ptr_utf8Content[size_utf8Content-1] ) {
char *xx=NULL;*xx=0; }
return &ptr_utf8Content;
}
// if could not find title rec and we are docid-based then
// we can't go any further!!
if ( m_setFromDocId ||
// it should be there if trying to delete as well!
m_deleteFromIndex ) {
log("xmldoc: null utf8 content for docid-based "
"titlerec (d=%"INT64") lookup which was not found",
m_docId);
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_utf8ContentValid = true;
m_contentType = CT_HTML;
m_contentTypeValid = true;
return &ptr_utf8Content;
}
}
char **ep = getExpandedUtf8Content();
if ( ! ep || ep == (void *)-1 ) return ep;
// NULL out if no content
if ( ! *ep ) {
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_utf8ContentValid = true;
return &ptr_utf8Content;
}
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (char **)ct;
// if we have a json reply, leave it alone... expanding a &quot;
// into a double quote will mess up the JSON!
if ( *ct == CT_JSON ) {
ptr_utf8Content = (char *)m_expandedUtf8Content;
size_utf8Content = m_expandedUtf8ContentSize;
m_utf8ContentValid = true;
return &ptr_utf8Content;
}
	// why would the spider proxy, which uses msg13.cpp to call
	// XmlDoc::getExpandedUtf8Content(), want to call this??? it seems
	// to destroy expandedutf8content with a call to htmldecode
if ( m_isSpiderProxy ) { char *xx=NULL;*xx=0; }
// not if rss file extension
//bool isRSSExt = false;
//char *ext = m_firstUrl.getExtension();
//if ( ext && strcasecmp(ext,"rss") == 0 ) isRSSExt = true;
//if ( ext && strcasecmp(ext,"xml") == 0 ) isRSSExt = true;
//if ( ext && strcasecmp(ext,"atom") == 0 ) isRSSExt = true;
//if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
//if ( m_contentTypeValid && m_contentType == CT_XML ) isRSSExt = true;
// convert &lt; to <gb and &gt; to gb/> ???? and &amp; to utf32 char
// for a double wide ampersand?
//bool doSpecial = true;
// convert to what it should be if we are an .rss file extension
//if ( isRSSExt ) doSpecial = false;
	// sanity check
if ( m_xmlValid ) { char *xx=NULL;*xx=0; }
if ( m_wordsValid ) { char *xx=NULL;*xx=0; }
QUICKPOLL(m_niceness);
//
// convert illegal utf8 characters into spaces
//
// fixes santaclarachorale.vbotickets.com/tickets/g.f._handels_israel_in_egypt/1062
// which has a 228,0x80,& sequence (3 chars, last is ascii)
uint8_t *x = (uint8_t *)m_expandedUtf8Content;
char size;
for ( ; *x ; x += size ) {
QUICKPOLL(m_niceness);
size = getUtf8CharSize(x);
// ok, make it a space i guess if it is a bad utf8 char
if ( ! isSaneUtf8Char(x) ) {
*x = ' ';
size = 1;
continue;
}
// skip if only one byte
if ( size == 1 ) continue;
// now each byte in the sequence must have 0x80 set...
if ( ! (x[1] & 0x80) ) {
x[0] = ' ';
size = 1;
continue;
}
if ( size == 2 ) continue;
if ( ! (x[2] & 0x80) ) {
x[0] = ' ';
size = 1;
continue;
}
if ( size == 3 ) continue;
if ( ! (x[3] & 0x80) ) {
x[0] = ' ';
size = 1;
continue;
}
}
// sanity
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
// if we are an xml doc, then before we call htmlDecode translate
// all tags like <title> or <link> to <gbtitle> or <gblink> so we
// know they are xml tags. because stuff like &lt;br&gt; will
// become <br> and will be within its xml tag like <gbdescription>
// or <gbtitle>.
// MDW: 9/28/2014. no longer do this since i added hashXmlFields().
/*
if ( m_contentType == CT_XML ) {
// count the xml tags
char *p = m_expandedUtf8Content;
char *pend = p + m_expandedUtf8ContentSize - 1;
int32_t need = m_expandedUtf8ContentSize;
for ( ; p < pend ; p++ ) {
QUICKPOLL(m_niceness);
if ( *p == '<' ) need += 5; // for adding "gbxml"
}
if ( ! m_xbuf.reserve(need) ) return NULL;
// reset ptr
p = m_expandedUtf8Content;
		// point to dst
char *dst = m_xbuf.getBufStart();
// do the copy
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL(m_niceness);
// copy it over
*dst++ = *p;
if ( *p != '<' ) continue;
// if <?xml> copy over as is, do not insert 'gb'
if ( p[1] == '?' ) continue;
// same for comments <!--...-->
if ( p[1] == '!' ) continue;
// point to tagname
char *tagName = p+1;
if ( p[1] == '/' ) tagName++;
// also get the full node now
NodeType *nt; getTagId ( tagName , &nt );
// if it is not an html tag, do not fuss with it!
if ( ! nt ) continue;
// if its in the list but is xml, let it go too
if ( nt->m_isXmlTag ) continue;
// . otherwise, its an html tag being used as an xml
// tag and we need to encode (append gbxml to it)
// . insert / first if there
if ( p[1] == '/' ) {p++;*dst++ = *p;}
// then "gb"
*dst++ = 'g';
*dst++ = 'b';
*dst++ = 'x';
*dst++ = 'm';
*dst++ = 'l';
}
// update
m_xbuf.m_length = dst - m_xbuf.getBufStart();
// final \0
*dst = '\0';
// re-assign these
m_expandedUtf8Content = m_xbuf.getBufStart();//m_buf;
m_expandedUtf8ContentSize = m_xbuf.m_length + 1;
// free esbuf if we were referencing that to save mem
m_esbuf.purge();
}
*/
// richmondspca.org has &quot; in some tags and we do not like
// expanding that to " because it messes up XmlNode::getTagLen()
// and creates big problems. same for www.first-avenue.com. so
	// by setting doSpecial to true we change &lt; &gt; and &quot; to
	// [ ] and ' which have no meaning in html per se.
bool doSpecial = true;
if ( m_contentType == CT_XML ) doSpecial = false;
	// . now decode those html entities into utf8 so that we never have to
	//   check for html entities anywhere else in the code. a big win!!
	// . doSpecial = true, so that &lt;, &gt;, &amp; and &quot; are
// encoded into high value
// utf8 chars so that Xml::set(), etc. still work properly and don't
// add any more html tags than it should
// . this will decode in place
// . MDW: 9/28/2014. no longer do for xml docs since i added
// hashXmlFields()
int32_t n = m_expandedUtf8ContentSize - 1;
if ( m_contentType != CT_XML )
n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content,
m_expandedUtf8Content,//ptr_utf8Content,
m_expandedUtf8ContentSize-1,//size_utf8Con
doSpecial,
m_niceness);
// can't exceed this! n does not include the final \0 even though
	// we do write it out.
if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; }
// sanity
if ( m_expandedUtf8Content[n] != '\0' ) { char *xx=NULL;*xx=0; }
// now rss has crap in it like "&amp;nbsp;" so we have to do another
// decoding pass
// . MDW: 9/28/2014. no longer do for xml docs since i added
// hashXmlFields()
// if ( m_contentType == CT_XML ) // isRSSExt )
// n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content,
// m_expandedUtf8Content,//ptr_utf8Content,
// n,
// false,//doSpecial,
// m_niceness);
// sanity
if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; }
// sanity
if ( m_expandedUtf8Content[n] != '\0' ) { char *xx=NULL;*xx=0; }
	// finally transform utf8 apostrophes into regular apostrophes
// to make parsing easier
uint8_t *p = (uint8_t *)m_expandedUtf8Content;
uint8_t *dst = (uint8_t *)m_expandedUtf8Content;
uint8_t *pend = p + n;
for ( ; *p ; p += size ) {
QUICKPOLL(m_niceness);
size = getUtf8CharSize(p);
// quick copy
if ( size == 1 && p[0] != '<' ) { *dst++ = *p; continue; }
// make "1<super>st</super>" into "1st" so Dates.cpp can
// have an easier time
if ( p[0] == '<' &&
to_lower_a(p[1]) == 's' &&
to_lower_a(p[2]) == 'u' &&
to_lower_a(p[3]) == 'p' ) {
// assume no go!
*dst++ = '<';
// use this
char *s = (char *)p;
			// did a number precede?
char *pn = s - 1;
for (;pn>=m_expandedUtf8Content&&is_wspace_a(*pn);pn--)
QUICKPOLL(m_niceness);
// must be like "1st" or "32nd"
if ( ! is_digit(*pn) ) continue;
// skip the "<sup"
s += 4;
// skip until >
for (; *s && *s != '>' ; s++ )
QUICKPOLL(m_niceness);
// crazy?
if ( ! *s ) continue;
// skip the '>'
s++;
// skip spaces after the "<sup>" tag
for (; *s && is_wspace_a(*s) ; s++ )
QUICKPOLL(m_niceness);
// crazy?
if ( ! *s ) continue;
// check for "st" etc
bool gotIt = false;
char *suffix = s;
if ( (to_lower_a(s[0])=='s'&&to_lower_a(s[1]) == 't')||
(to_lower_a(s[0])=='n'&&to_lower_a(s[1]) == 'd')||
(to_lower_a(s[0])=='r'&&to_lower_a(s[1]) == 'd')||
(to_lower_a(s[0])=='t'&&to_lower_a(s[1]) == 'h'))
gotIt = true;
if ( ! gotIt ) continue;
// skip that
s += 2;
// skip more spaces
for (; *s && is_wspace_a(*s) ; s++ )
QUICKPOLL(m_niceness);
// crazy?
if ( ! *s ) continue;
			// find the </sup> end tag
if ( s[0] != '<' ) continue;
if ( s[1] != '/' ) continue;
if ( to_lower_a(s[2]) != 's' ) continue;
if ( to_lower_a(s[3]) != 'u' ) continue;
if ( to_lower_a(s[4]) != 'p' ) continue;
if ( s[5] != '>' ) continue;
// skip it, point to >
s += 5;
// assign p to that
p = (unsigned char *)s;
			// back up over the no-go '<' we emitted above
dst--;
// rewrite it
*dst++ = to_lower_a(suffix[0]);
*dst++ = to_lower_a(suffix[1]);
// do next round
continue;
}
// check for crazy apostrophes
if ( p[0]==0xe2 &&
p[1]==0x80 &&
(p[2]==0x99 ||
p[2]==0x98 ||
p[2]==0x9b ) ) {
*dst++ = '\'';
continue;
}
// utf8 control character?
if ( p[0] == 0xc2 &&
p[1] >= 0x80 &&
p[1] <= 0x9f ) {
*dst++ = ' ';
continue;
}
// double quotes in utf8
// DO NOT do this if type JSON!! json uses quotes as
// control characters
if ( p[0] == 0xe2 &&
p[1] == 0x80 &&
m_contentType != CT_JSON ) {
if (p[2] == 0x9c ) {
*dst++ = '\"';
continue;
}
if (p[2] == 0x9d ) {
*dst++ = '\"';
continue;
}
}
// and crazy hyphens (8 - 10pm)
if ( p[0]==0xc2 &&
p[1]==0xad ) {
*dst++ = '-';
continue;
}
if ( p[0]==0xe2 &&
p[1]==0x80 &&
p[2]==0x93 ) {
*dst++ = '-';
continue;
}
if ( p[0]==0xe2 &&
p[1]==0x80 &&
p[2]==0x94 ) {
*dst++ = '-';
continue;
}
// . convert all utf8 white space to ascii white space
// . should benefit the string matching algo in
// XmlDoc::getEventSummary() which needs to skip spaces
if ( ! g_map_is_ascii[(unsigned char)*p] &&
is_wspace_utf8(p) ) {
*dst++ = ' ';
continue;
}
// otherwise, just copy it
gbmemcpy(dst,p,size);
dst += size;
}
// null term
*dst++ = '\0';
// now set it up
ptr_utf8Content = (char *)m_expandedUtf8Content;
//size_utf8Content = n+1;//m_expandedUtf8ContentSize;
size_utf8Content = (char *)dst - m_expandedUtf8Content;
// sanity -- skipped over the \0???
if ( p > pend ) { char *xx=NULL;*xx=0; }
// sanity check
if ( ptr_utf8Content && ptr_utf8Content[size_utf8Content-1] ) {
char *xx=NULL;*xx=0; }
m_utf8ContentValid = true;
return &ptr_utf8Content;
}
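// Summary of the in-place normalization done above (added commentary; the
// byte sequences are taken from the code, the sample strings are
// hypothetical):
//   - insane/truncated utf8 sequences        -> ' '
//   - html entities (non-XML docs)           -> decoded by htmlDecode()
//   - "1<sup>st</sup>"                       -> "1st"
//   - e2 80 98/99/9b (curly apostrophes)     -> '\''
//   - e2 80 9c/9d (curly double quotes)      -> '"'  (skipped for JSON)
//   - c2 80..9f (C1 control chars)           -> ' '
//   - c2 ad, e2 80 93, e2 80 94 (hyphens)    -> '-'
//   - any other utf8 whitespace              -> ' '
// Every rewrite emits at most as many bytes as it consumes, which is why
// the loop can safely write over the same buffer it reads from.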
// *pend should be \0
int32_t getContentHash32Fast ( unsigned char *p ,
int32_t plen ,
int32_t niceness ) {
// sanity
if ( ! p ) return 0;
if ( plen <= 0 ) return 0;
if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; }
unsigned char *pend = p + plen;
static bool s_init = false;
static char s_qtab0[256];
static char s_qtab1[256];
static char s_qtab2[256];
static char *s_skips[] = {
"jan",
"feb",
"mar",
"apr",
"may",
"jun",
"jul",
"aug",
"sep",
"oct",
"nov",
"dec",
"sun",
"mon",
"tue",
"wed",
"thu",
"fri",
"sat" };
if ( ! s_init ) {
// only call this crap once
s_init = true;
// clear up
memset(s_qtab0,0,256);
memset(s_qtab1,0,256);
memset(s_qtab2,0,256);
for ( int32_t i = 0 ; i < 19 ; i++ ) {
unsigned char *s = (unsigned char *)s_skips[i];
s_qtab0[(unsigned char)to_lower_a(s[0])] = 1;
s_qtab0[(unsigned char)to_upper_a(s[0])] = 1;
// do the quick hash
unsigned char qh = to_lower_a(s[0]);
qh ^= to_lower_a(s[1]);
qh <<= 1;
qh ^= to_lower_a(s[2]);
s_qtab1[qh] = 1;
// try another hash, the swift hash
unsigned char sh = to_lower_a(s[0]);
sh <<= 1;
sh ^= to_lower_a(s[1]);
sh <<= 1;
sh ^= to_lower_a(s[2]);
s_qtab2[sh] = 1;
}
}
bool lastWasDigit = false;
bool lastWasPunct = true;
uint32_t h = 0LL;
//char size = 0;
unsigned char pos = 0;
for ( ; p < pend ; p++ ) { // += size ) {
// breathe
QUICKPOLL ( niceness );
// get size
// this might not be utf8!!!
//size = getUtf8CharSize(p);
// skip if not alnum
// this might not be utf8!!!
//if ( ! is_alnum_utf8 ( (char *)p ) ) {
if ( ! is_alnum_a ( *p ) ) {
lastWasDigit = false;
lastWasPunct = true;
continue;
}
// if its a digit, call it 1
if ( is_digit(*p) ) {
// skip consecutive digits
if ( lastWasDigit ) continue;
// xor in a '1'
h ^= g_hashtab[pos][(unsigned char)'1'];
pos++;
lastWasDigit = true;
continue;
}
// reset
lastWasDigit = false;
		// exclude month names and days of the week so clocks do
// not affect this hash
if ( s_qtab0[p[0]] && lastWasPunct && p[1] && p[2] ) {
// quick hash
unsigned char qh = to_lower_a(p[0]);
qh ^= to_lower_a(p[1]);
qh <<= 1;
qh ^= to_lower_a(p[2]);
// look that up
if ( ! s_qtab1[qh] ) goto skip;
// try another hash, the swift hash
unsigned char sh = to_lower_a(p[0]);
sh <<= 1;
sh ^= to_lower_a(p[1]);
sh <<= 1;
sh ^= to_lower_a(p[2]);
if ( ! s_qtab2[sh] ) goto skip;
// ok, probably a match..
unsigned char *s = p + 3;
// skip to end of word
//char size2;
//for ( ; s < pend ; s += size2 ) {
for ( ; s < pend ; s++ ) {
//size2 = getUtf8CharSize(s);
//if ( ! is_alnum_utf8 ((char *)s) )
if ( ! is_alnum_a ( *s ) )
break;
}
			// s already points to the next char, so clear this
//size = 0;
// advance p now
p = s;
// hash as one type of thing...
h ^= g_hashtab[pos][(unsigned char)'X'];
pos++;
continue;
}
skip:
// reset this
lastWasPunct = false;
// xor this in right
h ^= g_hashtab[pos][p[0]];
pos++;
// assume ascii or latin1
continue;
/*
// one more?
if ( size == 1 ) continue;
// do that
h ^= g_hashtab[pos][p[1]];
pos++;
// one more?
if ( size == 2 ) continue;
// do that
h ^= g_hashtab[pos][p[2]];
pos++;
// one more?
if ( size == 3 ) continue;
// do that
h ^= g_hashtab[pos][p[3]];
pos++;
// that should do it!
continue;
*/
}
return h;
}
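// Added note on the hash above: runs of digits are hashed as a single '1'
// and any word that starts with a month/day abbreviation (jan..dec,
// sun..sat) is hashed as a single 'X', so, for example (hypothetical
// strings), "Posted Jan 3 2014" and "Posted Feb 17 2015" contribute
// identically to the hash. That keeps getContentHash32Fast() stable for
// pages whose only difference is a date/clock widget.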
int32_t *XmlDoc::getContentHash32 ( ) {
// return it if we got it
if ( m_contentHash32Valid ) return &m_contentHash32;
setStatus ( "getting contenthash32" );
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (int32_t *)ct;
// we do not hash the url/resolved_url/html fields in diffbot json
// because the url field is a mirror of the url and the html field
// is redundant and would slow us down
if ( *ct == CT_JSON )
return getContentHashJson32();
// if we are a diffbot json object, fake this for now, it will
// be set for real in hashJSON()
// no, because we call this before hashJSON() for to set
// EDOCUNCHANGED above... so just hash the json normally for now
//if ( m_isDiffbotJSONObject ) {
// m_contentHash32 = 0;
// return &m_contentHash32;
//}
// . get the content. get the pure untouched content!!!
// . gotta be pure since that is what Msg13.cpp computes right
// after it downloads the doc...
// . if iframes are present, msg13 gives up
char **pure = getContent();
if ( ! pure || pure == (char **)-1 ) return (int32_t *)pure;
// size
//int32_t n = size_utf8Content - 1;
// hash up to first 10,000 chars
//if ( n > 10000 ) n = 10000;
// do it
//m_contentHash32 = hash32 ( ptr_utf8Content , n );
unsigned char *p = (unsigned char *)(*pure);
int32_t plen = m_contentLen;//size_utf8Content - 1;
// no content means no hash32
if ( plen <= 0 ) {//ptr_utf8Content ) {
m_contentHash32 = 0;
m_contentHash32Valid = true;
return &m_contentHash32;
}
// we set m_contentHash32 in ::hashJSON() below because it is special
// for diffbot since it ignores certain json fields like url: and the
// fields are independent, and numbers matter, like prices
//if ( m_isDiffbotJSONObject ) { char *xx=NULL; *xx=0; }
// *pend should be \0
m_contentHash32 = getContentHash32Fast ( p , plen , m_niceness );
// validate
m_contentHash32Valid = true;
return &m_contentHash32;
}
// we do not hash the url/resolved_url/html fields in diffbot json
// because the url field is a mirror of the url and the html field
// is redundant and would slow us down
int32_t *XmlDoc::getContentHashJson32 ( ) {
if ( m_contentHash32Valid ) return &m_contentHash32;
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (int32_t *)jp;
JsonItem *ji = jp->getFirstItem();
int32_t totalHash32 = 0;
//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not number or string
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
char *topName = NULL;
// what name level are we?
int32_t numNames = 1;
JsonItem *pi = ji->m_parent;
for ( ; pi ; pi = pi->m_parent ) {
// empty name?
if ( ! pi->m_name ) continue;
if ( ! pi->m_name[0] ) continue;
topName = pi->m_name;
numNames++;
}
// if we are the diffbot reply "html" field do not hash this
// because it is redundant and it hashes html tags etc.!
// plus it slows us down a lot and bloats the index.
if ( ji->m_name && numNames==1 &&
strcmp(ji->m_name,"html") == 0 )
continue;
if ( ji->m_name && numNames==1 &&
strcmp(ji->m_name,"url") == 0 )
continue;
if ( ji->m_name && numNames==1 &&
strcmp(ji->m_name,"pageUrl") == 0 )
continue;
// mike will track down how the hash works in article|3|123456
//if ( ji->m_name && numNames==1 &&
// strcmp(ji->m_name,"diffbotUri") == 0 )
// continue;
if ( ji->m_name && numNames==1 &&
strcmp(ji->m_name,"resolved_url") == 0 )
continue;
if ( topName && strcmp(topName,"stats") == 0 )
continue;
if ( topName && strcmp(topName,"queryString") == 0 )
continue;
if ( topName && strcmp(topName,"nextPages") == 0 )
continue;
if ( topName && strcmp(topName,"textAnalysis") == 0 )
continue;
if ( topName && strcmp(topName,"links") == 0 )
continue;
// hash the fully compound name
int32_t nameHash32 = 0;
JsonItem *p = ji;
char *lastName = NULL;
for ( ; p ; p = p->m_parent ) {
// empty name?
if ( ! p->m_name ) continue;
if ( ! p->m_name[0] ) continue;
// dup? can happen with arrays. parent of string
// in object, has same name as his parent, the
// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
if ( p->m_name == lastName ) continue;
// update
lastName = p->m_name;
// hash it up
nameHash32 = hash32(p->m_name,p->m_nameLen,nameHash32);
}
//
// now Json.cpp decodes and stores the value into
// a buffer, so ji->getValue() should be decoded completely
//
// . get the value of the json field
// . if it's a number or bool it converts into a string
int32_t vlen;
char *val = ji->getValueAsString( &vlen );
//
// for deduping search results we set m_contentHash32 here for
// diffbot json objects.
//
// we use this hash for setting EDOCUNCHANGED when reindexing
// a diffbot reply. we also use to see if the diffbot reply
// is a dup with another page in the index. thirdly, we use
// to dedup search results, which could be redundant because
// of our spider-time deduping.
//
// make the content hash so we can set m_contentHash32
// for deduping. do an exact hash for now...
int32_t vh32 = hash32 ( val , vlen , m_niceness );
// combine
int32_t combined32 = hash32h ( nameHash32 , vh32 );
// accumulate field/val pairs order independently
totalHash32 ^= combined32;
// debug note
//logf(LOG_DEBUG,"ch32: field=%s nh32=%"UINT32" vallen=%"INT32"",
// ji->m_name,
// nameHash32,
// vlen);
}
m_contentHash32 = totalHash32;
m_contentHash32Valid = true;
return &m_contentHash32;
}
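// Added note (hypothetical field names): for a reply like
//   { "title":"Foo", "offerPrice":"$5" }
// each string/number leaf contributes hash32h(nameHash32, hash32(value)),
// where nameHash32 chains the leaf's name with its ancestors' names, and
// the contributions are XOR'd into totalHash32. The XOR makes the final
// m_contentHash32 independent of field order, so a reply with the same
// fields serialized in a different order dedups to the same hash. The
// html/url/pageUrl/resolved_url fields and anything under stats,
// queryString, nextPages, textAnalysis or links are excluded above.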
// do not consider tags except frame, iframe and img... make all months
// and days of the week and digits basically the same
int64_t *XmlDoc::getLooseContentHash64 ( ) {
if ( m_looseContentHash64Valid )
return &m_looseContentHash64;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (int64_t *)xml;
int64_t h64 = 0LL;
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes ();
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if not the right kinda tag
if ( nodes[i].isTag() &&
nodes[i].getNodeId() != TAG_FRAME &&
nodes[i].getNodeId() != TAG_IFRAME &&
nodes[i].getNodeId() != TAG_IMG )
continue;
// hash that node up
int64_t ch64;
// this is really a 32-bit hash
ch64=getContentHash32Fast((unsigned char *)nodes[i].getNode() ,
nodes[i].getNodeLen() ,
m_niceness );
// incorporate hash from that node
h64 = hash64h ( ch64 , h64 );
}
m_looseContentHash64Valid = true;
m_looseContentHash64 = h64;
return &m_looseContentHash64;
}
int32_t XmlDoc::getHostHash32a ( ) {
if ( m_hostHash32aValid ) return m_hostHash32a;
m_hostHash32aValid = true;
Url *f = getFirstUrl();
m_hostHash32a = f->getHostHash32();
return m_hostHash32a;
}
int32_t XmlDoc::getHostHash32b ( ) {
if ( m_hostHash32bValid ) return m_hostHash32b;
m_hostHash32bValid = true;
Url *c = getCurrentUrl();
m_hostHash32b = c->getHostHash32();
return m_hostHash32b;
}
int32_t XmlDoc::getDomHash32( ) {
if ( m_domHash32Valid ) return m_domHash32;
m_domHash32Valid = true;
Url *f = getFirstUrl();
m_domHash32 = hash32 ( f->getDomain(), f->getDomainLen() );
return m_domHash32;
}
// . this will be the actual pnm data of the image thumbnail
// . you can inline it in an image tag like
// <img src="data:image/png;base64,iVBORw0...."/>
// background-image:url(data:image/png;base64,iVBORw0...);
// . FORMAT of ptr_imageData:
// <origimageUrl>\0<4bytethumbwidth><4bytethumbheight><thumbnaildatajpg>
char **XmlDoc::getThumbnailData ( ) {
if ( m_imageDataValid ) return &ptr_imageData;
Images *images = getImages();
if ( ! images || images == (Images *)-1 ) return (char **)images;
ptr_imageData = NULL;
size_imageData = 0;
m_imageDataValid = true;
if ( ! images || ! images->m_imageBufValid ) return &ptr_imageData;
if ( images->m_imageBuf.length() <= 0 ) return &ptr_imageData;
// this buffer is a ThumbnailArray
ptr_imageData = images->m_imageBuf.getBufStart();
size_imageData = images->m_imageBuf.length();
return &ptr_imageData;
}
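// Illustrative sketch (an assumption based only on the FORMAT comment
// above; the real accessors live in the Images/ThumbnailArray code):
// walking one entry of ptr_imageData would look roughly like
//	char   *p       = ptr_imageData;
//	char   *origUrl = p; p += gbstrlen(origUrl) + 1; // <origimageUrl>\0
//	int32_t tw; gbmemcpy ( &tw , p , 4 ); p += 4;    // thumb width
//	int32_t th; gbmemcpy ( &th , p , 4 ); p += 4;    // thumb height
//	char   *jpg     = p;                             // jpg bytes follow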
Images *XmlDoc::getImages ( ) {
if ( m_imagesValid ) return &m_images;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! cr->m_makeImageThumbnails ) {
m_images.reset();
m_imagesValid = true;
return &m_images;
}
if ( cr->m_isCustomCrawl ) {
m_images.reset();
m_imagesValid = true;
return &m_images;
}
setStatus ( "getting thumbnail" );
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (Images *)words;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Images *)xml;
Sections *sections = getSections();
if ( ! sections || sections==(Sections *)-1) return (Images *)sections;
char *site = getSite ();
if ( ! site || site == (char *)-1 ) return (Images *)site;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Images *)d;
int8_t *hc = getHopCount();
if ( ! hc || hc == (void *)-1 ) return (Images *)hc;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (Images *)cu;
// . this does not block or anything
// . if we are a diffbot json reply it should just use the primary
// image, if any, as the only candidate
m_images.setCandidates ( cu , words , xml , sections , this );
setStatus ("getting thumbnail");
// assume valid
m_imagesValid = true;
// now get the thumbnail
if ( ! m_images.getThumbnail ( site ,
gbstrlen(site) ,
*d ,
this ,
cr->m_collnum ,
//NULL , // statusPtr ptr
*hc ,
m_masterState,
m_masterLoop ) )
return (Images *)-1;
return &m_images;
}
// . get different attributes of the Links as vectors
// . these are 1-1 with the Links::m_linkPtrs[] array
TagRec ***XmlDoc::getOutlinkTagRecVector () {
// if page has a <meta name=usefakeips content=1> tag
// then use the hash of the links host as the firstip.
// this will speed things up when adding a gbdmoz.urls.txt.*
// file to index every url in dmoz.
char *useFakeIps = hasFakeIpsMetaTag();
if ( ! useFakeIps || useFakeIps == (void *)-1 )
return (TagRec ***)useFakeIps;
// no error and valid, return quick
if ( m_outlinkTagRecVectorValid && *useFakeIps )
return &m_outlinkTagRecVector;
// error?
if ( m_outlinkTagRecVectorValid && m_msge0.m_errno ) {
g_errno = m_msge0.m_errno;
return NULL;
}
// if not using fake ips, give them the real tag rec vector
if ( m_outlinkTagRecVectorValid )
return &m_msge0.m_tagRecPtrs;
Links *links = getLinks();
if ( ! links || links == (void *) -1 ) return (TagRec ***)links;
if ( *useFakeIps ) {
// set to those
m_fakeTagRec.reset();
// just make a bunch ptr to empty tag rec
int32_t need = links->m_numLinks * sizeof(TagRec *);
if ( ! m_fakeTagRecPtrBuf.reserve ( need ) ) return NULL;
// make them all point to the fake empty tag rec
TagRec **grv = (TagRec **)m_fakeTagRecPtrBuf.getBufStart();
for ( int32_t i = 0 ; i < links->m_numLinks ; i++ )
grv[i] = &m_fakeTagRec;
// set it
m_outlinkTagRecVector = grv;
m_outlinkTagRecVectorValid = true;
return &m_outlinkTagRecVector;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// update status msg
setStatus ( "getting outlink tag rec vector" );
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (TagRec ***)gr;
// assume valid
m_outlinkTagRecVectorValid = true;
// go get it
if ( ! m_msge0.getTagRecs ( links->m_linkPtrs ,
links->m_linkFlags ,
links->m_numLinks ,
false , // skip old?
// make it point to this basetagrec if
// the LF_SAMEHOST flag is set for the link
gr ,
cr->m_collnum ,
m_niceness ,
m_masterState ,
m_masterLoop )) {
// sanity check
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
// we blocked
return (TagRec ***)-1;
}
// error?
if ( g_errno ) return NULL;
// or this?
if ( m_msge0.m_errno ) {
g_errno = m_msge0.m_errno;
return NULL;
}
// set it
//m_outlinkTagRecVector = m_msge0.m_tagRecPtrs;
// ptr to a list of ptrs to tag recs
return &m_msge0.m_tagRecPtrs;
}
char *XmlDoc::hasNoIndexMetaTag() {
if ( m_hasNoIndexMetaTagValid )
return &m_hasNoIndexMetaTag;
// assume none
m_hasNoIndexMetaTag = false;
// store value/content of meta tag in here
char mbuf[16];
mbuf[0] = '\0';
char *tag = "noindex";
int32_t tlen = gbstrlen(tag);
// check the xml for a meta tag
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
xml->getMetaContent ( mbuf, 16 , tag , tlen );
if ( mbuf[0] == '1' ) m_hasNoIndexMetaTag = true;
m_hasNoIndexMetaTagValid = true;
return &m_hasNoIndexMetaTag;
}
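// Added note (hypothetical markup): this helper only recognizes the
// gigablast-specific form
//   <meta name="noindex" content="1">
// i.e. a meta tag literally named "noindex" whose content starts with '1';
// the standard <meta name="robots" content="noindex"> form is not what
// this helper checks.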
char *XmlDoc::hasFakeIpsMetaTag ( ) {
if ( m_hasUseFakeIpsMetaTagValid ) return &m_hasUseFakeIpsMetaTag;
char mbuf[16];
mbuf[0] = '\0';
char *tag = "usefakeips";
int32_t tlen = gbstrlen(tag);
// check the xml for a meta tag
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
xml->getMetaContent ( mbuf, 16 , tag , tlen );
m_hasUseFakeIpsMetaTag = false;
if ( mbuf[0] == '1' ) m_hasUseFakeIpsMetaTag = true;
m_hasUseFakeIpsMetaTagValid = true;
return &m_hasUseFakeIpsMetaTag;
}
int32_t **XmlDoc::getOutlinkFirstIpVector () {
Links *links = getLinks();
if ( ! links ) return NULL;
// if page has a <meta name=usefakeips content=1> tag
// then use the hash of the links host as the firstip.
// this will speed things up when adding a gbdmoz.urls.txt.*
// file to index every url in dmoz.
char *useFakeIps = hasFakeIpsMetaTag();
if ( ! useFakeIps || useFakeIps == (void *)-1 )
return (int32_t **)useFakeIps;
if ( *useFakeIps && m_outlinkIpVectorValid )
return &m_outlinkIpVector;
if ( *useFakeIps ) {
int32_t need = links->m_numLinks * 4;
m_fakeIpBuf.reserve ( need );
for ( int32_t i = 0 ; i < links->m_numLinks ; i++ ) {
uint64_t h64 = links->getHostHash64(i);
int32_t ip = h64 & 0xffffffff;
m_fakeIpBuf.pushLong(ip);
}
int32_t *ipBuf = (int32_t *)m_fakeIpBuf.getBufStart();
m_outlinkIpVector = ipBuf;
m_outlinkIpVectorValid = true;
return &m_outlinkIpVector;
}
// return msge1's buf otherwise
if ( m_outlinkIpVectorValid )
return &m_msge1.m_ipBuf;
// should we have some kinda error for msge1?
//if ( m_outlinkIpVectorValid && m_msge1.m_errno ) {
// g_errno = m_msge1.m_errno;
// return NULL;
//}
// . we now scrounge them from TagRec's "firstip" tag if there!
// . that way even if a domain changes its ip we still use the
// original ip, because the only reason we need this ip is for
// deciding which group of hosts will store this SpiderRequest and
// we use that for throttling, so we have to be consistent!!!
// . we never add -1 or 0 ips to tagdb though.... (NXDOMAIN,error...)
// . uses m_msgeForTagRecs for this one
TagRec ***grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (int32_t **)grv;
// note it
setStatus ( "getting outlink first ip vector" );
// assume valid
m_outlinkIpVectorValid = true;
// sanity check
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// use this
int32_t nowGlobal = getSpideredTime();//m_spideredTime;
// add tags to tagdb?
bool addTags = true;
//if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false;
if ( getIsPageParser() ) addTags = false;
// get this
char *testDir = getTestDir();
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . go get it
// . if coll is "qatest123" then try to use the file ./test/ips.txt to
// see if the ip is in there for the given url hostname
// . this will now update Tagdb with the "firstip" tags if it should!!
// . this just dns looks up the DOMAINS of each outlink because these
// are *first* ips and ONLY used by Spider.cpp for throttling!!!
if ( ! m_msge1.getFirstIps ( *grv ,
links->m_linkPtrs ,
links->m_linkFlags ,
links->m_numLinks ,
false , // skip old?
cr->m_coll ,
m_niceness ,
m_masterState ,
m_masterLoop ,
nowGlobal ,
addTags ,
testDir )) {
// sanity check
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
// we blocked
return (int32_t **)-1;
}
// error?
if ( g_errno ) return NULL;
// . ptr to a list of ptrs to tag recs
// . ip will be -1 on error
return &m_msge1.m_ipBuf;
}
/*
// really this could just check titledb in memory tree and tfndb and should
// be really fast!!
char **XmlDoc::getOutlinkIsIndexedVector () {
if ( m_outlinkIsIndexedVectorValid ) return &m_msge2.m_isIndexedBuf;
setStatus ( "getting outlink is indexed vector" );
Links *links = getLinks();
if ( ! links ) return NULL;
// assume valid
m_outlinkIsIndexedVectorValid = true;
// go get it
bool status = m_msge2.getIsIndexed ( links->m_linkPtrs ,
links->m_linkFlags ,
links->m_numLinks ,
false , // skip old?
m_coll ,
m_niceness ,
m_masterState ,
m_masterLoop );
// set it
//m_outlinkIsIndexedVector = m_msge2.m_isIndexedBuf;
// we blocked
if ( ! status ) return (char **)-1;
// error?
if ( g_errno ) return NULL;
// ptr to a list of ptrs to tag recs
return &m_msge2.m_isIndexedBuf;
}
*/
/*
char *XmlDoc::getIsVisible ( ) {
if ( m_isVisibleValid ) return &m_isVisible;
setStatus ( "getting is visible" );
// to get a live reading, invalidate tag rec from title rec
m_oldTagRecValid = false;
// . loop over all regular expression in the url filters table
// . stop at first regular expression it matches
int32_t *rn = getRegExpNum2 ( -1 );
// need to wait for a callback at this point (or we had critical error)
if ( ! rn || rn == (int32_t *)-1 ) return (char *)rn;
// assume yes
m_isVisible = true;
// and valid
m_isVisibleValid = true;
// no match
if ( *rn == -1 ) return &m_isVisible;
// get spider priority
int32_t pr = m_cr->m_spiderPriorities[*rn];
// test it
if ( pr == -2 ) m_isVisible = false;
if ( pr == -3 ) m_isVisible = false;
return &m_isVisible;
}
*/
int32_t *XmlDoc::getUrlFilterNum ( ) {
// return it if already set
if ( m_urlFilterNumValid ) return &m_urlFilterNum;
// note that
setStatus ( "getting url filter row num");
// . make the partial new spider rec
// . we need this for matching filters like lang==zh_cn
// . crap, but then it matches "hasReply" when it should not
// . PROBLEM! this is the new reply not the OLD reply, so it may
// end up matching a DIFFERENT url filter num then what it did
// before we started spidering it...
//SpiderReply *newsr = getNewSpiderReply ( );
// note it
//if ( ! newsr )
// log("doc: getNewSpiderReply: %s",mstrerror(g_errno));
//if ( ! newsr || newsr == (void *)-1 ) return (int32_t *)newsr;
// need language i guess
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (int32_t *)langId;
// make a fake one for now
// SpiderReply fakeReply;
// // fix errors
// fakeReply.reset();
// fakeReply.m_isIndexedINValid = true;
// // just language for now, so we can FILTER by language
// if ( m_langIdValid ) fakeReply.m_langId = m_langId;
int32_t langIdArg = -1;
if ( m_langIdValid ) langIdArg = m_langId;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// this must be valid
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
int32_t spideredTime = getSpideredTime();
// get the spider request
SpiderRequest *oldsr = &m_sreq;
// null it out if invalid...
if ( ! m_sreqValid ) oldsr = NULL;
// do not set the spideredTime in the spiderReply to 0
// so we do not trigger the lastSpiderTime
//int32_t saved = newsr->m_spideredTime;
//newsr->m_spideredTime = 0;
//
// PROBLEM: we end up matching "isIndexed" in the url filters
// even if this is a NEW document because we pass it in the spider
// reply that we generate now even though another spider reply
// may not exist.
//
// SOLUTION: just do not supply a spider reply, we only seem to
// use the urlfilternum to get a diffbot api url OR to see if the
// document is banned/filtered so we should delete it. otherwise
// we were supplying "newsr" above...
// . look it up
// . use the old spidered date for "nowGlobal" so we can be consistent
// for injecting into the "qatest123" coll
int32_t ufn = ::getUrlFilterNum ( oldsr,
NULL,//&fakeReply,
spideredTime,false,
m_niceness,cr,
false, // isOutlink?
NULL,
langIdArg);
// put it back
//newsr->m_spideredTime = saved;
// bad news?
if ( ufn < 0 ) {
log("build: failed to get url filter for xmldoc %s",
m_firstUrl.m_url);
//g_errno = EBADENGINEER;
//return NULL;
}
// store it
m_urlFilterNum = ufn;
m_urlFilterNumValid = true;
// set this too in case the url filters table changes while
// we are spidering this and a row is inserted or deleted or something
//SafeBuf *yy = &cr->m_spiderDiffbotApiUrl[ufn];
// copy to ours
//m_diffbotApiUrl.safeMemcpy ( yy );
// ensure null term
//m_diffbotApiUrl.nullTerm();
//m_diffbotApiUrlValid = true;
return &m_urlFilterNum;
}
// . both "u" and "site" must not start with http:// or https:// or protocol
bool isSiteRootFunc ( char *u , char *site ) {
// get length of each
int32_t slen = gbstrlen(site);//m_siteLen;
int32_t ulen = gbstrlen(u);
// "site" may or may not end in /, so remove that
if ( site[slen-1] == '/' ) slen--;
// same for url
if ( u[ulen-1] == '/' ) ulen--;
// skip http:// or https://
if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
if ( strncmp(site,"http://" ,7)==0 ) { site += 7; slen -= 7; }
if ( strncmp(site,"https://",8)==0 ) { site += 8; slen -= 8; }
// subtract default.asp etc. from "u"
//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.asp",11)==0 )
// ulen -= 11;
//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.html",12)==0 )
// ulen -= 12;
//if ( ulen > 15 && strncasecmp(u+ulen-11,"index.html",10)==0 )
// ulen -= 10;
// now they must match exactly
if ( slen == ulen && ! strncmp ( site, u, ulen ) ) return true;
// all done
return false;
}
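// Added usage note (hypothetical inputs): isSiteRootFunc() ignores a
// single trailing slash and any leading http:// or https://, so
//   isSiteRootFunc ( "www.ibm.com/"    , "www.ibm.com" ) -> true
//   isSiteRootFunc ( "www.ibm.com/foo" , "www.ibm.com" ) -> false
// isSiteRootFunc3() below does the same comparison against a precomputed
// hash32 of the normalized site instead of the site string itself.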
bool isSiteRootFunc3 ( char *u , int32_t siteRootHash32 ) {
// get length of each
int32_t ulen = gbstrlen(u);
// remove trailing /
if ( u[ulen-1] == '/' ) ulen--;
// skip http:// or https://
if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
// now they must match exactly
int32_t sh32 = hash32(u,ulen);
return ( sh32 == siteRootHash32 );
}
char *XmlDoc::getIsSiteRoot ( ) {
if ( m_isSiteRootValid ) return &m_isSiteRoot2;
// get our site
char *site = getSite ();
if ( ! site || site == (char *)-1 ) return (char *)site;
// get our url without the http:// or https://
char *u = getFirstUrl()->getHost();
if ( ! u ) {
g_errno = EBADURL;
return NULL;
}
// assume valid now
m_isSiteRootValid = true;
// get it
bool isRoot = isSiteRootFunc ( u , site );
	// seems like https://twitter.com/ is not getting set to root
if ( m_firstUrl.getPathDepth(true) == 0 && ! m_firstUrl.isCgi() )
isRoot = true;
m_isSiteRoot2 = m_isSiteRoot = isRoot;
return &m_isSiteRoot2;
}
/*
bool XmlDoc::getIsOutlinkSiteRoot ( char *u , TagRec *gr ) {
// get our site
Tag *tag = gr->getTag("site");
// make "host" point to u's hostname
int32_t hostLen; char *host = getHostFast ( u , &hostLen );
// use hostname?
char *site;
int32_t slen;
if ( tag ) {
site = tag->getTagData();
slen = tag->getTagDataSize() - 1;
}
// otherwise, use hostname as site
else {
// must be end, or could be '/'
if ( ! host[hostLen] || ! host[hostLen+1] ) return true;
// i guess we were more than just a hostname, so not site root
return false;
}
// get length of each
int32_t ulen = gbstrlen(u);
// "site" may or may not end in /, so remove that
if ( site[slen-1] == '/' ) slen--;
// same for url
if ( u[ulen-1] == '/' ) ulen--;
// now they must match exactly
if ( slen == ulen && ! strncmp ( site, u, ulen ) ) return true;
// all done
return false;
}
*/
int8_t *XmlDoc::getHopCount ( ) {
// return now if valid
if ( m_hopCountValid ) return &m_hopCount;
setStatus ( "getting hop count" );
CollectionRec *cr = this->getCollRec();
if(cr && cr->m_isCustomCrawl ) {
// for diffbot collections, compute hopcount without casting
// site/rss to 0 hopcount -- copied from below
LinkInfo *info1 = getLinkInfo1();
if (!info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1;
int32_t origHopCount = -1;
if ( m_sreqValid ) {
origHopCount = m_sreq.m_hopCount;
}
int32_t hc = -1;
// if(m_minInlinkerHopCount+1 < hc && m_minInlinkerHopCount>=0)
// hc = m_minInlinkerHopCount + 1;
// if ( hc == -1 && m_minInlinkerHopCount >= 0 )
// hc = m_minInlinkerHopCount + 1;
if ( origHopCount < hc && origHopCount >= 0 )
hc = origHopCount;
if ( hc == -1 && origHopCount >= 0 )
hc = origHopCount;
if ( hc == -1 )
hc = 1;
if ( hc > 0x7f ) hc = 0x7f;
m_hopCountValid = true;
m_hopCount = hc;
//printf("Custom hopcount: %d for url: %s",
//m_hopCount, this->ptr_firstUrl);
return &m_hopCount;
}
// the unredirected url
Url *f = getFirstUrl();
// get url as string, skip "http://" or "https://"
//char *u = f->getHost();
// if we match site, we are a site root, so hop count is 0
//char *isr = getIsSiteRoot();
//if ( ! isr || isr == (char *)-1 ) return (int8_t *)isr;
//if ( *isr ) {
// m_hopCount = 0;
// m_hopCountValid = true;
// return &m_hopCount;
//}
// ping servers have 0 hop counts
if ( f->isPingServer() ) {
// log("xmldoc: hc2 is 0 (pingserver) %s",m_firstUrl.m_url);
m_hopCount = 0;
m_hopCountValid = true;
return &m_hopCount;
}
char *isRSS = getIsRSS();
if ( ! isRSS || isRSS == (char *)-1) return (int8_t *)isRSS;
// check for site root
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (int8_t *)gr;
// and site roots
char *isSiteRoot = getIsSiteRoot();
if (!isSiteRoot ||isSiteRoot==(char *)-1) return (int8_t *)isSiteRoot;
if ( *isSiteRoot ) {
// log("xmldoc: hc1 is 0 (siteroot) %s",m_firstUrl.m_url);
m_hopCount = 0;
m_hopCountValid = true;
return &m_hopCount;
}
// make sure m_minInlinkerHopCount is valid
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1;
// . fix bad original hop counts
// . assign this hop count from the spider rec
int32_t origHopCount = -1;
if ( m_sreqValid ) origHopCount = m_sreq.m_hopCount;
// derive our hop count from our parent hop count
int32_t hc = -1;
// . BUT use inlinker if better
// . if m_linkInfo1Valid is true, then m_minInlinkerHopCount is valid
// if ( m_minInlinkerHopCount + 1 < hc && m_minInlinkerHopCount >= 0 )
// hc = m_minInlinkerHopCount + 1;
// or if parent is unknown, but we have a known inlinker with a
// valid hop count, use the inlinker hop count then
// if ( hc == -1 && m_minInlinkerHopCount >= 0 )
// hc = m_minInlinkerHopCount + 1;
// if ( origHopCount == 0 )
// log("xmldoc: hc3 is 0 (spiderreq) %s",m_firstUrl.m_url);
// or use our hop count from the spider rec if better
if ( origHopCount < hc && origHopCount >= 0 )
hc = origHopCount;
// or if neither parent or inlinker was valid hop count
if ( hc == -1 && origHopCount >= 0 )
hc = origHopCount;
// if we have no hop count at this point, i guess just pick 1!
if ( hc == -1 )
hc = 1;
// truncate, hop count is only one byte in the TitleRec.h::m_hopCount
if ( hc > 0x7f ) hc = 0x7f;
// and now so do rss urls.
if ( *isRSS && hc > 1 ) {
// force it to one, not zero, otherwise it gets pounded
// too hard on the aggregator sites. spider priority
// is too high
m_hopCount = 1;
m_hopCountValid = true;
return &m_hopCount;
}
	// unknown hop counts (-1) are propagated, except for root urls
m_hopCountValid = true;
m_hopCount = hc;
return &m_hopCount;
}
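// Added summary of the fallbacks above (no new logic): ping servers and
// site roots get hop count 0; otherwise the hop count from the
// SpiderRequest is used when it is >= 0, else it defaults to 1; the value
// is capped at 0x7f so it fits TitleRec's one-byte m_hopCount; and RSS
// urls with a larger hop count are clamped to 1 so aggregator sites do
// not get hit too hard. Custom (diffbot) crawls take the early branch and
// skip the site-root / rss / ping-server special cases.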
/*
int8_t *XmlDoc::getOutlinkHopCountVector ( ) {
if ( m_outlinkHopCountVectorValid ) return m_outlinkHopCountVector;
// need these of course
Links *links = getLinks();
if ( ! links || links == (Links *)-1 ) return (int8_t *)links;
// and these for seeing if outlink is a site root
TagRec ***grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (int8_t *)grv;
// hop count of parent
int8_t *ph = getHopCount();
if ( ! ph || ph == (void *)-1 ) return (int8_t *)ph;
	// shortcut
int32_t n = links->getNumLinks();
// sanity check
if ( m_outlinkHopCountVector ) { char *xx=NULL;*xx=0; }
// make some space
m_outlinkHopCountVector = (int8_t *)mmalloc ( n * 4 ,"xdhc");
// return NULL on error with g_errno set
if ( ! m_outlinkHopCountVector ) return NULL;
// save size
m_outlinkHopCountVectorSize = n * 4;
// stock it
for ( int32_t i = 0 ; i < n ; i++ ) {
// get it
char *u = links->getLinkPtr(i);
// and this
TagRec *gr = (*grv)[i];
// flags
linkflags_t flags = links->m_linkFlags[i];
// hop count. default to 1.
int32_t hc = 1;
if ( getIsOutlinkSiteRoot ( u , gr ) ) hc = 0;
else if ( isPingServer ( u ) ) hc = 0;
else if ( flags & LF_RSS ) hc = 0;
else hc = *ph + 1;
// assign it
m_outlinkHopCountVector[i] = hc;
}
m_outlinkHopCountVectorValid = true;
return m_outlinkHopCountVector;
}
*/
// set to false for injecting and validate it... if &spiderlinks=0
// should we spider links?
char *XmlDoc::getSpiderLinks ( ) {
// set it to false on issues
//if ( m_indexCode ) {
// m_spiderLinks = false;
// m_spiderLinks2 = false;
// m_spiderLinksValid = true ; }
// this slows importing down because we end up doing ip lookups
// for every outlink if "firstip" not in tagdb.
// shoot. set2() already sets m_spiderLinksValid to true so we
// have to override if importing.
if ( m_isImporting && m_isImportingValid ) {
m_spiderLinks = false;
m_spiderLinks2 = false;
m_spiderLinksValid = true;
return &m_spiderLinks2;
}
// return the valid value
if ( m_spiderLinksValid ) return &m_spiderLinks2;
setStatus ( "getting spider links flag");
// do not add links now if doing the parser test
if ( g_conf.m_testParserEnabled ||
m_isDiffbotJSONObject ) {
m_spiderLinks = false;
m_spiderLinks2 = false;
m_spiderLinksValid = true;
return &m_spiderLinks2;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return (char *)cr;
int32_t *ufn = getUrlFilterNum();
if ( ! ufn || ufn == (void *)-1 ) return (char *)ufn;
// if url filters forbids it
if ( ! cr->m_harvestLinks[*ufn] ) {
m_spiderLinksValid = true;
m_spiderLinks2 = false;
m_spiderLinks = false;
return &m_spiderLinks2;
}
// hack for bulk job detection. never spider links
//if ( cr->m_isCustomCrawl == 2 ) {
// m_spiderLinks = false;
// m_spiderLinks2 = false;
// m_spiderLinksValid = true;
// return &m_spiderLinks2;
//}
// check the xml for a meta robots tag
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
// assume true
m_spiderLinks = true;
// or if meta tag says not to
char buf1 [256];
char buf2 [256];
buf1[0] = '\0';
buf2[0] = '\0';
xml->getMetaContent ( buf1, 255 , "robots" , 6 );
xml->getMetaContent ( buf2, 255 , "gigabot", 7 );
if ( strstr ( buf1 , "nofollow" ) ||
strstr ( buf2 , "nofollow" ) ||
strstr ( buf1 , "none" ) ||
strstr ( buf2 , "none" ) )
m_spiderLinks = false;
// spider links if doing custom crawl or not using robots.txt
if ( ! m_useRobotsTxt || cr->m_isCustomCrawl )
m_spiderLinks = true;
// spider request forbade it? diffbot.cpp crawlbot api when
// specifying urldata (list of urls to add to spiderdb) usually
// they do not want the links crawled i'd imagine.
if ( m_sreqValid && m_sreq.m_avoidSpiderLinks )
m_spiderLinks = false;
// also check in url filters now too
// set shadow member
m_spiderLinks2 = m_spiderLinks;
// validate
m_spiderLinksValid = true;
return &m_spiderLinks2;
}
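// Added example (hypothetical markup): a page containing
//   <meta name="robots"  content="noindex,nofollow">
// or
//   <meta name="gigabot" content="none">
// gets m_spiderLinks set to false above, unless robots.txt use is
// disabled or this is a custom (diffbot) crawl, both of which force link
// harvesting back on; a SpiderRequest with m_avoidSpiderLinks set then
// turns it off again last.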
//
// . DELETE ALL SPAM FROM THE INDEX!!!
//
// . for a page to be spam these must ALL be true, with the current ip:
// . site is not in google
// . site has no "stars" in google's dir
// . site has no authorityinlink tag
// . site has less than 10 fresh inlinks
// . site has less than 500 total inlinks
// . ip is not from ultra dns
// . TODO: site is not linked to by wikipedia.com
// . TODO: site is not linked to by about.com
// . TODO: site is not linked to by a .gov site
// . the page IP address changed significantly since the same since last
// time we indexed it when it was not spam (if applicable)
//
// . if the page was indexed at one time and then we decided it was spam,
// and its ip changed significantly since last time, we just
// reschedule the spider rec for 15 days later and do not touch anything
// else. that way we keep the index somewhat stable.
//
/*
char *XmlDoc::getIsSpam() {
// return it if valid
if ( m_isSpamValid ) return &m_isSpam;
setStatus ("getting is spam");
// assume it is not spam
m_isSpam = false;
// debug
//logf(LOG_DEBUG,"doc: NOT SPAM!!");
//m_isSpamValid = true; return &m_isSpam;
// we disable this check for the contact doc
if ( m_spamCheckDisabled ) { m_isSpamValid = true; return &m_isSpam; }
// . i put this here for debugging purposes
// . some big sites have no easy to find contact info
// . get our domain
Url *fu = getFirstUrl();
char *dom = fu->getDomain ();
int32_t dlen = fu->getDomainLen();
if ( dlen == 12 && !strncmp(dom,"facebook.com",dlen) ) {
m_isSpamValid = true; return &m_isSpam; }
if ( dlen == 9 && !strncmp(dom,"yahoo.com",dlen) ) {
m_isSpamValid = true; return &m_isSpam; }
// get our site's tag rec
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
// are we already in the index?
//char *isIndexed = getIsIndexed();
//if (!isIndexed || isIndexed == (char *)-1 ) return (char *)isIndexed;
// this will update m_oldTagRec with the latest info if its stale
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
char *hci = getHasContactInfo();
if ( ! hci || hci == (char *)-1 ) return (char *)hci;
//int32_t *ip = getIp();
//if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip;
//XmlDoc **od = getOldXmlDoc ( );
//if ( ! od || od == (void *)-1 ) return (char *)od;
//int32_t oldIp = 0 ;
//if ( *od ) {
// int32_t *ip2 = (*od)->getIp();
// if ( ! ip2 || ip2 == (int32_t *)-1 ) return (char *)ip2;
// oldIp = *ip2;
//}
// i am guessing that most sites that use ultra dns will have a lot
	// of site inlinks! so comment this out for now
//char *ultra = getIpIsUltraDns();
//if ( ultra || ultra==(char *)-1 ) return (char *)ultra;
// spammers do not use ultradns
//if ( *ultra ) return false;
Url *f = getFirstUrl();
char *u = f->getUrl();
int32_t now = getTimeGlobal();
// this will be valid
m_isSpamValid = true;
// use this routine
m_isSpam = isSpam ( u,
gr,
now,
// *isIndexed,
//oldIp ,
// *ip ,
*hci );
// we are doomed! delete in its entirety
if ( m_isSpam ) m_indexCode = EDOCSPAM;
return &m_isSpam;
}
// . "u" must be NORMALIZED. i.e. start with http:// or https:// etc.
// . we call this on outlinks as well
// . we no longer look at the old and newip to determine ownership change,
// because that is not reliable enough
// . we now maybe rely on a major change to the site root page...
bool XmlDoc::isSpam ( char *u ,
TagRec *gr ,
int32_t now ,
char isIndexed ,
int32_t oldIp ,
int32_t newIp ,
bool hasContactInfo ) {
// we need to mine that same database that firefox does...
Tag *tag = gr->getTag ( "malware" );
if ( tag && tag->getTagData()[0] != '0' ) return true;
// if they have contact info, that is a really good sign
if ( hasContactInfo ) return false;
// .edu and .gov sites are always fine
int32_t tlen; char *tld = getTLDFast(u,&tlen);
if ( tlen == 3 && ! strncmp(tld,"edu",3) ) return false;
if ( tlen == 3 && ! strncmp(tld,"gov",3) ) return false;
// the current top ip address
//int32_t top = newIp & 0x00ffffff;
// TODO: in the case of multiple ips on one domain, ensure we select
// the same IP every time we do a lookup in MsgC.
// ok if in google
if ( gr->getTag ( "ingoogle" ) ) return false;
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
// can also be in google's dmoz dir. must have a decent page rank.
if ( gr->getTag ( "pagerank" ) ) return false;
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
// . if was linked to by a high quality root as a new external outlink
// . TODO: include about.com and wikipedia.com i guess (TODO)
if ( gr->getTag ( "authorityinlink" ) ) return false;
//if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false;
tag = gr->getTag("sitenuminlinks");
// i guess if it has no entry for this, assume the best
if ( ! tag ) return false;
// or just a massive amount of any-age inlinks
if ( atol(tag->getTagData()) >= 500 ) return false;
tag = gr->getTag("sitenuminlinksfresh");
// i guess if it has no entry for this, assume the best
if ( ! tag ) return false;
// if site has enough good FRESH inlinks from the last 3 mos, no spam
if( atol(tag->getTagData()) >= 10 ) return false;
// if we are old and the top 3 bytes of the ip is the same as the last
// time we were indexed and thereby not identified as spam...
// then assume we are still not spam! because it was unlikely that
// the domain ownership changed...
//if ( isIndexed (oldIp & 0x00ffffff) == top ) return false;
// if they have contact info, that is a really good sign
//if ( hasContactInfo && (oldIp & 0x00ffffff) == top ) return false;
// if first time... accept them if they got contact info
//if ( ! oldIp && hasContactInfo ) return false;
// . if it has had the same ip for the last 365 days, let it in
// . getTagRec() updates this tag immediately if the ip changes
// . so we can't really use this tag for outlinks, because they might
// never get thrown into spiderdb to where we can add this tag to
// their tag rec... UNLESS msgc/msge were to update their tag rec...
// . i've seen quite a few old spam sites/pages. they just kinda stay
// there. so let's not do this...
//tag = gr->get("iptimestamp");
//int32_t now;
//if ( tag ) now = getTimeGlobal();
//if(tag&&now-atol(tag->getTagData())>365*24*3600&&
// ((tag->m_ip&0x00ffffff)==top))
// return false;
return true;
}
*/
// should we index the doc? if already indexed, and is filtered, we delete it
char *XmlDoc::getIsFiltered ( ) {
if ( m_isFilteredValid ) return &m_isFiltered;
if ( m_isDiffbotJSONObject ) {
m_isFiltered = false;
m_isFilteredValid = true;
return &m_isFiltered;
}
int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
m_isFiltered = false;
// if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
// if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
if ( *priority == -3 ) m_isFiltered = true;
m_isFilteredValid = true;
return &m_isFiltered;
}
int32_t *XmlDoc::getSpiderPriority ( ) {
if ( m_priorityValid ) return &m_priority;
setStatus ("getting spider priority");
// need tagrec to see if banned
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
// this is an automatic ban!
if ( gr->getLong("manualban",0) ) {
m_priority = -3;//SPIDER_PRIORITY_BANNED;
m_priorityValid = true;
return &m_priority;
}
int32_t *ufn = getUrlFilterNum();
if ( ! ufn || ufn == (void *)-1 ) return (int32_t *)ufn;
// sanity check
if ( *ufn < 0 ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
m_priority = cr->m_spiderPriorities[*ufn];
// continue to use -3 to indicate SPIDER_PRIORITY_FILTERED for now
if ( cr->m_forceDelete[*ufn] ) m_priority = -3;
m_priorityValid = true;
return &m_priority;
}
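// . logs one space-separated line of key=value fields for this spider
//   attempt, terminated by ": <error/status string>" from m_indexCode
// . fields are only printed when their members are valid, so grep on
//   field names rather than positions
// . if a SafeBuf "bb" is provided we print into it and skip the log file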
bool XmlDoc::logIt ( SafeBuf *bb ) {
// set errCode
int32_t errCode = m_indexCode;
if ( ! errCode && g_errno ) errCode = g_errno;
// were we new?
//char isIndexed = -1;
//if ( m_isIndexedValid ) isIndexed = m_isIndexed;
bool isNew = true;
if ( m_sreqValid && m_sreq.m_hadReply ) isNew = false;
// keep track of stats
g_stats.addSpiderPoint ( errCode, isNew ); // !isIndexed );
// do not log if we should not, saves some time
//if ( ! g_conf.m_logSpideredUrls && ! m_forceDelete ) return true;
if ( ! g_conf.m_logSpideredUrls ) return true;
// patch the ip
int32_t ip = m_ip;
// invalid?
if ( ! m_ipValid ) ip = 0;
char *coll = "nuked";
CollectionRec *cr = getCollRec();
if ( cr ) coll = cr->m_coll;
SafeBuf tmpsb;
// print into this now
SafeBuf *sb = &tmpsb;
// log into provided safebuf if not null
if ( bb ) sb = bb;
//
// coll
//
sb->safePrintf("coll=%s ",coll);
sb->safePrintf("collnum=%"INT32" ",(int32_t)m_collnum);
//
// print ip
//
if ( m_ipValid )
sb->safePrintf("ip=%s ",iptoa(m_ip) );
if ( m_firstIpValid )
sb->safePrintf("firstip=%s ",iptoa(m_firstIp) );
// . first ip from spider req if it is fake
// . we end up spidering the same url twice because it will have
// different "firstips" in the SpiderRequest key. maybe just
// use domain hash instead of firstip, and then let msg13
// make queues in the case of hammering an ip, which i think
// it already does...
if ( m_sreqValid && m_sreq.m_firstIp != m_firstIp )
sb->safePrintf("fakesreqfirstip=%s ",iptoa(m_sreq.m_firstIp) );
//
// print when this spider request was added
//
//if ( m_sreqValid && m_sreq.m_addedTime ) {
// struct tm *timeStruct = gmtime ( &m_sreq.m_addedTime );
// char tmp[64];
// strftime(tmp,64,"requestadded=%b-%d-%Y(%H:%M:%S)", timeStruct);
// sb->safePrintf("%s(%"UINT32") ",tmp,m_sreq.m_addedTime);
//}
//
// print spidered time
//
//if ( m_spideredTimeValid ) {
time_t spideredTime = (time_t)getSpideredTime();
struct tm *timeStruct = gmtime ( &spideredTime );
char tmp[64];
strftime(tmp,64,"spidered=%b-%d-%Y(%H:%M:%S)", timeStruct );
sb->safePrintf("%s(%"UINT32") ",tmp,(uint32_t)spideredTime);
// when it was scheduled to be spidered
if ( m_sreqValid && m_sreq.m_addedTime ) {
time_t ts = m_sreq.m_addedTime;
struct tm *timeStruct = gmtime ( &ts );
char tmp[64];
strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
sb->safePrintf("scheduledtime=%s(%"UINT32") ",
tmp,(uint32_t)m_sreq.m_addedTime);
}
// discovery date, first time spiderrequest was added to spiderdb
if ( m_sreqValid && m_sreq.m_discoveryTime ) {
time_t ts = m_sreq.m_discoveryTime;
struct tm *timeStruct = gmtime ( &ts );
char tmp[64];
strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
sb->safePrintf("discoverydate=%s(%"UINT32") ",
tmp,(uint32_t)m_sreq.m_discoveryTime);
}
// print first indexed time
if ( m_firstIndexedDateValid ) {
time_t ts = m_firstIndexedDate;
timeStruct = gmtime ( &ts );//m_firstIndexedDate );
strftime(tmp,64,"firstindexed=%b-%d-%Y(%H:%M:%S)", timeStruct);
sb->safePrintf("%s(%"UINT32") ",tmp,
(uint32_t)m_firstIndexedDate);
}
//if ( ! m_isIndexedValid ) { char *xx=NULL;*xx=0; }
// just use the oldurlfilternum for grepping i guess
//if ( m_oldDocValid && m_oldDoc )
// when injecting a request we have no idea if it had a reply or not
if ( m_sreqValid && m_sreq.m_isInjecting )
sb->safePrintf("firsttime=? ");
else if ( m_sreqValid && m_sreq.m_hadReply )
sb->safePrintf("firsttime=0 ");
else if ( m_sreqValid )
sb->safePrintf("firsttime=1 ");
else
sb->safePrintf("firsttime=? ");
//
// print # of link texts
//
if ( m_linkInfo1Valid && ptr_linkInfo1 ) {
LinkInfo *info = ptr_linkInfo1;
int32_t nt = info->getNumLinkTexts();
sb->safePrintf("goodinlinks=%"INT32" ",nt );
// new stuff. includes ourselves i think.
//sb->safePrintf("ipinlinks=%"INT32" ",info->m_numUniqueIps);
//sb->safePrintf("cblockinlinks=%"INT32" ",
//info->m_numUniqueCBlocks);
}
//
// print # of link texts from 2nd coll
//
// this is not used for what it was used for.
// if ( m_linkInfo2Valid && size_linkInfo2 > 4 ) {
// LinkInfo *info = ptr_linkInfo2;
// int32_t nt = 0;
// if ( info ) nt = info->getNumLinkTexts();
// if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt );
// }
if ( m_docIdValid )
sb->safePrintf("docid=%"UINT64" ",m_docId);
char *u = getFirstUrl()->getUrl();
int64_t pd = g_titledb.getProbableDocId(u);
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
sb->safePrintf("probdocid=%"UINT64" ",pd);
sb->safePrintf("probdocidmin=%"UINT64" ",d1);
sb->safePrintf("probdocidmax=%"UINT64" ",d2);
sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);
if ( m_siteNumInlinksValid ) {
sb->safePrintf("siteinlinks=%04"INT32" ",m_siteNumInlinks );
// sb->safePrintf("siteipinlinks=%"INT32" ",
// m_siteNumInlinksUniqueIp);
// sb->safePrintf("sitecblockinlinks=%"INT32" ",
// m_siteNumInlinksUniqueCBlock);
int32_t sr = ::getSiteRank ( m_siteNumInlinks );
sb->safePrintf("siterank=%"INT32" ", sr );
}
if ( m_sreqValid )
sb->safePrintf("pageinlinks=%04"INT32" ",
m_sreq.m_pageNumInlinks);
// shortcut
int64_t uh48 = hash64b ( m_firstUrl.m_url );
// mask it
uh48 &= 0x0000ffffffffffffLL;
sb->safePrintf ("uh48=%"UINT64" ",uh48 );
if ( m_charsetValid )
sb->safePrintf("charset=%s ",get_charset_str(m_charset));
if ( m_contentTypeValid )
sb->safePrintf("ctype=%s ",
g_contentTypeStrings [m_contentType]);
if ( m_sreqValid )
sb->safePrintf("parentlang=%02"INT32"(%s) ",
(int32_t)m_sreq.m_parentLangId,
getLanguageAbbr(m_sreq.m_parentLangId));
if ( m_langIdValid )
sb->safePrintf("lang=%02"INT32"(%s) ",(int32_t)m_langId,
getLanguageAbbr(m_langId));
if ( m_countryIdValid )
sb->safePrintf("country=%02"INT32"(%s) ",(int32_t)m_countryId,
g_countryCode.getAbbr(m_countryId));
if ( m_hopCountValid )
sb->safePrintf("hopcount=%02"INT32" ",(int32_t)m_hopCount);
if ( m_contentValid )
sb->safePrintf("contentlen=%06"INT32" ",m_contentLen);
if ( m_contentValid && cr && cr->m_isCustomCrawl )
sb->safePrintf("zeroedout=%i ",(int)m_zeroedOut);
if ( m_isContentTruncatedValid )
sb->safePrintf("contenttruncated=%"INT32" ",
(int32_t)m_isContentTruncated);
if ( m_robotsTxtLenValid )
sb->safePrintf("robotstxtlen=%04"INT32" ",m_robotsTxtLen );
if ( m_isAllowedValid )
sb->safePrintf("robotsallowed=%i ", (int)m_isAllowed);
else
sb->safePrintf("robotsallowed=? " );
if ( m_contentHash32Valid )
sb->safePrintf("ch32=%010"UINT32" ",m_contentHash32);
if ( m_domHash32Valid )
sb->safePrintf("dh32=%010"UINT32" ",m_domHash32);
if ( m_siteHash32Valid )
sb->safePrintf("sh32=%010"UINT32" ",m_siteHash32);
if ( m_isPermalinkValid )
sb->safePrintf("ispermalink=%"INT32" ",(int32_t)m_isPermalink);
if ( m_isRSSValid )
sb->safePrintf("isrss=%"INT32" ",(int32_t)m_isRSS);
if ( m_linksValid )
sb->safePrintf("hasrssoutlink=%"INT32" ",
(int32_t)m_links.hasRSSOutlink() );
if ( m_numOutlinksAddedValid ) {
sb->safePrintf("outlinksadded=%04"INT32" ",
(int32_t)m_numOutlinksAdded);
sb->safePrintf("outlinksaddedfromsamedomain=%04"INT32" ",
(int32_t)m_numOutlinksAddedFromSameDomain);
}
if ( m_metaListValid )
sb->safePrintf("addlistsize=%05"INT32" ",
(int32_t)m_metaListSize);
else
sb->safePrintf("addlistsize=%05"INT32" ",(int32_t)0);
if ( m_addedSpiderRequestSizeValid )
sb->safePrintf("addspiderreqsize=%05"INT32" ",
m_addedSpiderRequestSize);
else
sb->safePrintf("addspiderreqsize=%05"INT32" ",0);
if ( m_addedSpiderReplySizeValid )
sb->safePrintf("addspiderrepsize=%05"INT32" ",
m_addedSpiderReplySize);
else
sb->safePrintf("addspiderrepsize=%05"INT32" ",0);
if ( m_addedStatusDocSizeValid ) {
sb->safePrintf("addstatusdocsize=%05"INT32" ",
m_addedStatusDocSize);
sb->safePrintf("addstatusdocid=%"UINT64" ",
m_addedStatusDocId);
}
else {
sb->safePrintf("addstatusdocsize=%05"INT32" ",0);
sb->safePrintf("addstatusdocid=0 ");
}
if ( m_useSecondaryRdbs ) {
sb->safePrintf("useposdb=%i ",(int)m_usePosdb);
sb->safePrintf("usetitledb=%i ",(int)m_useTitledb);
sb->safePrintf("useclusterdb=%i ",(int)m_useClusterdb);
sb->safePrintf("usespiderdb=%i ",(int)m_useSpiderdb);
sb->safePrintf("uselinkdb=%i ",(int)m_useLinkdb);
if ( cr )
sb->safePrintf("indexspiderreplies=%i ",(int)
cr->m_indexSpiderReplies);
}
if ( size_imageData && m_imageDataValid ) {
// url is in data now
ThumbnailArray *ta = (ThumbnailArray *)ptr_imageData;
int32_t nt = ta->getNumThumbnails();
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
sb->safePrintf("thumbnail=%s,%"INT32"bytes,%"INT32"x%"INT32",(%"INT32") ",
ti->getUrl(),
ti->m_dataSize,
ti->m_dx,
ti->m_dy,
nt);
}
else
sb->safePrintf("thumbnail=none ");
/*
if ( m_hasAddressValid && m_addressesValid )
sb->safePrintf("numaddr=%"INT32" ",(int32_t)m_addresses.m_numValid);
//if ( m_skipIndexingValid )
// sb->safePrintf("skipindexing=%"INT32" ",(int32_t)m_skipIndexing);
if ( m_hasTODValid )
sb->safePrintf("hastod=%"INT32" ",(int32_t)m_hasTOD);
*/
// get the content type
uint8_t ct = CT_UNKNOWN;
if ( m_contentTypeValid ) ct = m_contentType;
bool isRoot = false;
if ( m_isSiteRootValid ) isRoot = m_isSiteRoot;
// make sure m_minInlinkerHopCount is valid
LinkInfo *info1 = NULL;
if ( m_linkInfo1Valid ) info1 = ptr_linkInfo1;
//bool isContacty = getIsContacty(&m_firstUrl,
// info1,
// m_hopCount ,
// ct , // contentType
// isRoot ,
// m_niceness );
/*
// just use this now
if ( m_hasContactInfoValid )
sb->safePrintf("iscontacty=%"INT32" ",(int32_t)m_hasContactInfo);
if ( m_hasSiteVenueValid )
sb->safePrintf("hassitevenue=%"INT32" ",(int32_t)m_hasSiteVenue);
*/
// hack this kinda
// . in PageInject.cpp we do not have a valid priority without
// blocking because we did a direct injection!
// so ignore this!!
// . a diffbot json object, an xmldoc we set from a json object
// in a diffbot reply, is a childDoc (m_isChildDoc) is true
// and does not have a spider priority. only the parent doc
// that we used to get the diffbot reply (array of json objects)
// will have the spider priority
if ( ! getIsInjecting() && ! m_isDiffbotJSONObject ) {
//int32_t *priority = getSpiderPriority();
//if ( ! priority ||priority==(void *)-1){char *xx=NULL;*xx=0;}
if ( m_priorityValid )
sb->safePrintf("priority=%"INT32" ",
(int32_t)m_priority);
}
// should be valid since we call getSpiderPriority()
if ( m_urlFilterNumValid )
sb->safePrintf("urlfilternum=%"INT32" ",(int32_t)m_urlFilterNum);
if ( m_diffbotApiUrlValid &&
m_diffbotApiUrl.getBufStart() &&
m_diffbotApiUrl.getBufStart()[0] )
sb->safePrintf("diffbotjsonobjects=%"INT32" ",
(int32_t)m_diffbotJSONCount);
if ( m_diffbotReplyValid )
sb->safePrintf("diffboterror=%"INT32" ",m_diffbotReplyError);
if ( m_siteValid )
sb->safePrintf("site=%s ",ptr_site);
if ( m_isSiteRootValid )
sb->safePrintf("siteroot=%"INT32" ",m_isSiteRoot );
else
sb->safePrintf("siteroot=? ");
// like how we index it, do not include the filename. so we can
// have a bunch of pathdepth 0 urls with filenames like xyz.com/abc.htm
if ( m_firstUrlValid ) {
int32_t pd = -1;
// fix core
if ( m_firstUrl.m_url &&
m_firstUrl.m_ulen > 0 &&
m_firstUrl.m_path )
pd = m_firstUrl.getPathDepth(false);
sb->safePrintf("pathdepth=%"INT32" ",pd);
}
else {
sb->safePrintf("pathdepth=? ");
}
//
// . sometimes we print these sometimes we do not
// . put this at the end so we can awk out the above fields reliably
//
// print when it was last spidered
if ( m_oldDocValid && m_oldDoc ) {
time_t spideredTime = m_oldDoc->getSpideredTime();
struct tm *timeStruct = gmtime ( &spideredTime );
char tmp[64];
strftime(tmp,64,"lastindexed=%b-%d-%Y(%H:%M:%S)",timeStruct);
sb->safePrintf("%s(%"UINT32") ", tmp,(uint32_t)spideredTime);
}
// print new pubdate
if ( m_pubDateValid && m_pubDate!=(uint32_t)-1 && m_pubDate!=0 ) {
char tmp[64];
time_t ts = (time_t)m_pubDate;
struct tm *timeStruct = gmtime ( &ts );
strftime ( tmp, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
sb->safePrintf("pubdate=%s ", tmp );
}
if ( m_linkInfo1Valid && ptr_linkInfo1 && ptr_linkInfo1->hasRSSItem())
sb->safePrintf("hasrssitem=1 ");
// was the content itself injected?
if ( m_wasContentInjected )
sb->safePrintf("contentinjected=1 ");
else
sb->safePrintf("contentinjected=0 ");
// might have just injected the url and downloaded the content?
if ( (m_sreqValid && m_sreq.m_isInjecting) ||
(m_isInjecting && m_isInjectingValid) )
sb->safePrintf("urlinjected=1 ");
else
sb->safePrintf("urlinjected=0 ");
if ( m_sreqValid && m_sreq.m_isAddUrl )
sb->safePrintf("isaddurl=1 ");
else
sb->safePrintf("isaddurl=0 ");
if ( m_sreqValid && m_sreq.m_isPageReindex )
sb->safePrintf("pagereindex=1 ");
if ( m_spiderLinksValid && m_spiderLinks )
sb->safePrintf("spiderlinks=1 ");
if ( m_spiderLinksValid && ! m_spiderLinks )
sb->safePrintf("spiderlinks=0 ");
if ( m_crawlDelayValid && m_crawlDelay != -1 )
sb->safePrintf("crawldelayms=%"INT32" ",(int32_t)m_crawlDelay);
if ( m_recycleContent )
sb->safePrintf("recycleContent=1 ");
if ( m_exactContentHash64Valid )
sb->safePrintf("exactcontenthash=%"UINT64" ",
m_exactContentHash64 );
// . print percent changed
// . only print if non-zero!
if ( m_percentChangedValid && m_oldDocValid && m_oldDoc &&
m_percentChanged )
sb->safePrintf("changed=%.00f%% ",m_percentChanged);
// only print if different now! good for grepping changes
if ( m_oldDocValid && m_oldDoc && m_oldDoc->m_docId != m_docId )
sb->safePrintf("olddocid=%"UINT64" ",m_oldDoc->m_docId);
// only print if different now! good for grepping changes
if ( m_sreqValid && m_sreq.m_ufn >= 0 &&
m_sreq.m_ufn != m_urlFilterNum )
sb->safePrintf("oldurlfilternum=%"INT32" ",
(int32_t)m_sreq.m_ufn);
if ( m_sreqValid && m_sreq.m_priority >= 0 &&
m_sreq.m_priority != m_priority )
sb->safePrintf("oldpriority=%"INT32" ",
(int32_t)m_sreq.m_priority);
if ( m_oldDoc && m_oldDoc->m_langIdValid &&
m_oldDoc->m_langId != m_langId )
sb->safePrintf("oldlang=%02"INT32"(%s) ",(int32_t)m_oldDoc->m_langId,
getLanguageAbbr(m_oldDoc->m_langId));
if ( m_useSecondaryRdbs &&
m_useTitledb &&
m_logLangId != m_langId )
sb->safePrintf("oldlang=%02"INT32"(%s) ",(int32_t)m_logLangId,
getLanguageAbbr(m_logLangId));
if ( m_useSecondaryRdbs &&
m_useTitledb &&
m_logSiteNumInlinks != m_siteNumInlinks )
sb->safePrintf("oldsiteinlinks=%04"INT32" ",m_logSiteNumInlinks);
if ( m_useSecondaryRdbs &&
m_useTitledb &&
m_oldDocValid &&
m_oldDoc &&
strcmp(ptr_site,m_oldDoc->ptr_site) )
sb->safePrintf("oldsite=%s ",m_oldDoc->ptr_site);
// . print old pubdate
// . -1 means unsupported, 0 means could not find one
// . only print if different now! good for grepping changes
if ( m_oldDocValid && m_oldDoc &&
m_oldDoc->m_pubDate!= (uint32_t)-1 &&
m_oldDoc->m_pubDate !=0 &&
m_oldDoc->m_pubDate != m_pubDate ) {
char tmp[64];
time_t ts = m_oldDoc->m_pubDate;
struct tm *timeStruct = gmtime ( &ts );
strftime ( tmp, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
sb->safePrintf("oldpubdate=%s ",tmp );
}
if ( m_isAdultValid )
sb->safePrintf("isadult=%"INT32" ",(int32_t)m_isAdult);
// only print if different now! good for grepping changes
if ( m_oldDocValid && m_oldDoc &&
m_oldDoc->m_siteNumInlinks >= 0 &&
m_oldDoc->m_siteNumInlinks != m_siteNumInlinks ) {
int32_t sni = -1;
if ( m_oldDoc ) sni = m_oldDoc->m_siteNumInlinks;
sb->safePrintf("oldsiteinlinks=%04"INT32" ",sni);
}
// Spider.cpp sets m_sreq.m_errCount before adding it to doledb
if ( m_sreqValid ) // && m_sreq.m_errCount )
sb->safePrintf("errcnt=%"INT32" ",(int32_t)m_sreq.m_errCount );
else
sb->safePrintf("errcnt=? ");
if ( ptr_redirUrl ) { // m_redirUrlValid && m_redirUrlPtr ) {
sb->safePrintf("redir=%s ",ptr_redirUrl);//m_redirUrl.getUrl());
if ( m_numRedirects > 2 )
sb->safePrintf("numredirs=%"INT32" ",m_numRedirects);
}
if ( m_canonicalRedirUrlValid && m_canonicalRedirUrlPtr )
sb->safePrintf("canonredir=%s ",
m_canonicalRedirUrlPtr->getUrl());
if ( m_httpStatusValid && m_httpStatus != 200 )
sb->safePrintf("httpstatus=%"INT32" ",(int32_t)m_httpStatus);
if ( m_updatedMetaData )
sb->safePrintf("updatedmetadata=1 ");
if ( m_isDupValid && m_isDup )
sb->safePrintf("dupofdocid=%"INT64" ",m_docIdWeAreADupOf);
if ( m_firstUrlValid )
sb->safePrintf("url=%s ",m_firstUrl.m_url);
else
sb->safePrintf("urldocid=%"INT64" ",m_docId);
//
// print error/status
//
sb->safePrintf(": %s",mstrerror(m_indexCode));
// breathe
QUICKPOLL ( m_niceness );
// if safebuf provided, do not log to log
if ( bb ) return true;
// log it out
logf ( LOG_INFO ,
"build: %s",
//getFirstUrl()->getUrl(),
sb->getBufStart() );
return true;
}
// . returns false and sets g_errno on error
// . make sure that the title rec we generated creates the exact same
// meta list as what we got
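// . strategy: build a second XmlDoc from the title rec we just made
//   (via set2()), have it generate its own meta list, hash both lists
//   and verify every record in one appears byte-for-byte in the other
// . spiderdb and tagdb records are ignored by hashMetaList() here
// . note: currently short-circuited by the early "return true" below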
bool XmlDoc::doConsistencyTest ( bool forceTest ) {
// skip for now it was coring on a json doc test
return true;
CollectionRec *cr = getCollRec();
if ( ! cr )
return true;
if ( ! m_doConsistencyTesting && strcmp(cr->m_coll,"qatest123") != 0 )
return true;
// if we had an old doc then our meta list will have removed
// stuff already in the database from indexing the old doc.
// so it will fail the parsing consistency check... because of
// the 'incremental indexing' algo above
// disable for now... just a second, for testing cheatcc.com
if ( m_oldDoc && m_oldDocValid && g_conf.m_doIncrementalUpdating )
return true;
// if not test coll skip this
//if ( strcmp(cr->m_coll,"qatest123") ) return true;
// title rec is null if we are reindexing an old doc
// and "unchanged" was true.
if ( m_unchangedValid && m_unchanged ) {
if ( ! m_titleRecBufValid ) return true;
if ( m_titleRecBuf.length()==0 ) return true;
}
// leave this uncommented so we can see if we are doing it
setStatus ( "doing consistency check" );
// log debug
log("spider: doing consistency check for %s",ptr_firstUrl);
// . set another doc from that title rec
// . do not keep on stack since so huge!
XmlDoc *doc ;
try { doc = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
return false;
}
mnew ( doc , sizeof(XmlDoc),"xmldcs");
if ( ! doc->set2 ( m_titleRecBuf.getBufStart() ,
-1 , cr->m_coll , NULL , m_niceness ,
// no we provide the same SpiderRequest so that
// it can add the same SpiderReply to the metaList
&m_sreq ) ) {
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
delete ( doc );
return false;
}
// . some hacks
// . do not look up title rec in titledb, assume it is new
doc->m_isIndexed = false;
doc->m_isIndexedValid = true;
// so we don't core in getRevisedSpiderRequest()
doc->m_firstIp = m_firstIp;
doc->m_firstIpValid = true;
// inherit this doc's tag rec since it has not called updateTagdb() yet
//doc->ptr_tagRecData = ptr_tagRecData;
//doc->size_tagRecData = size_tagRecData;
// getNewSpiderReply() calls getDownloadEndTime() which is not valid
// and causes the page to be re-downloaded, so stop that..!
doc->m_downloadEndTime = m_downloadEndTime;
doc->m_downloadEndTimeValid = true;
// inherit doledb key as well to avoid a core there
doc->m_doledbKey = m_doledbKey;
// skip the robots.txt lookup! that was causing this to block!
//doc->m_isAllowed = true;
//doc->m_isAllowedValid = true;
// do not get outlink info for this, that stuff is for adding outlinks
// to spiderdb, and tagdb may have changed. so we can't really compare
// spider recs! if this is false then the call to doc->getMetaList()
// blocks to lookup the tagdb and titledb recs for each outlink!
// therefore, set it to true!
//doc->m_isInjecting = true;
// mdw: shouldn't this have the same effect?
//doc->m_spiderLinks2 = false;
//doc->m_spiderLinksValid = true;
// flag it
doc->m_doingConsistencyCheck = true;
// go get its meta list. rv = return value
char *rv = doc->getMetaList ( );
// sanity check - compare urls
if ( doc->m_firstUrl.m_ulen != m_firstUrl.m_ulen){char *xx=NULL;*xx=0;}
// error setting it?
if ( ! rv ) {
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// free it
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
delete ( doc );
// error
return false;
}
// blocked? that is not allowed
if ( rv == (void *)-1 ) { char *xx=NULL; *xx=0; }
// compare with the old list
char *list1 = m_metaList;
int32_t listSize1 = m_metaListSize;
char *list2 = doc->m_metaList;
int32_t listSize2 = doc->m_metaListSize;
// show it for now
//log("build: printing meta list 1");
//printMetaList(list1,list1+listSize1,NULL);
//log("build: printing meta list 2");
//printMetaList(list2,list2+listSize2,NULL);
// do a compare
HashTableX ht1;
HashTableX ht2;
ht1.set ( sizeof(key224_t),sizeof(char *),
262144,NULL,0,false,m_niceness,"xmlht1");
ht2.set ( sizeof(key224_t),sizeof(char *),
262144,NULL,0,false,m_niceness,"xmlht2");
// format of a metalist... see XmlDoc::addTable() where it adds keys
// from a table into the metalist
// <nosplitflag|rdbId><key><dataSize><data>
// where nosplitflag is 0x80
char *p1 = list1;
char *p2 = list2;
char *pend1 = list1 + listSize1;
char *pend2 = list2 + listSize2;
// see if each key in list1 is in list2
if ( ! hashMetaList ( &ht1 , p1 , pend1 , false ) ) {
char *xx=NULL;*xx=0;
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
delete ( doc );
return log("doc: failed consistency test for %s",ptr_firstUrl);
}
if ( ! hashMetaList ( &ht2 , p2 , pend2 , false ) ) {
char *xx=NULL;*xx=0;
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
delete ( doc );
return log("doc: failed consistency test for %s",ptr_firstUrl);
}
// . now make sure each list matches the other
// . first scan the guys in "p1" and make sure in "ht2"
hashMetaList ( &ht2 , p1 , pend1 , true );
// . second scan the guys in "p2" and make sure in "ht1"
hashMetaList ( &ht1 , p2 , pend2 , true );
mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
delete ( doc );
log ("spider: passed consistency test for %s",ptr_firstUrl );
// no serious error, although there might be an inconsistency
return true;
}
int32_t XmlDoc::printMetaList ( ) {
SafeBuf sb;
printMetaList ( m_metaList ,
m_metaList + m_metaListSize ,
&sb );
fprintf(stderr,"%s\n",sb.getBufStart());
return 0;
}
#define TABLE_ROWS 25
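// . each meta list record is laid out as:
//   <rdbId byte><key[ks]>[<dataSize int32><data[dataSize]>]
// . the high bit of the rdbId byte is reserved, so it gets masked with
//   0x7f; for posdb the shard-by-termid property is a bit in the key
// . ks and the fixed dataSize come from the rdbId; a dataSize of -1
//   means the 4-byte dataSize field is present in the record
// . negative (delete) keys carry no data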
// print this also for page parser output!
void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) {
verifyMetaList ( p , pend , false );
SafeBuf tmp;
if ( ! sb ) sb = &tmp;
char *hdr =
"<table border=1>\n"
"<tr>"
"<td><b>rdb</b></td>"
"<td><b>del?</b></td>"
"<td><b>shardByTermId?</b></td>"
// illustrates key size
"<td><b>key</b></td>"
// break it down. based on rdb, of course.
"<td><b>desc</b></td>"
"</tr>\n" ;
sb->safePrintf("%s",hdr);
int32_t recSize = 0;
int32_t rcount = 0;
for ( ; p < pend ; p += recSize ) {
// get rdbid
uint8_t rdbId = *p & 0x7f;
// skip
p++;
// get key size
int32_t ks = getKeySizeFromRdbId ( rdbId );
// point to it
char *rec = p;
// init this. note: it shadows the outer recSize, and p is advanced
// manually below, so the for-loop's "p += recSize" adds 0.
int32_t recSize = ks;
// convert into a key128_t, the biggest possible key
//key224_t k ;
char k[MAX_KEY_BYTES];
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
//k.setMin();
gbmemcpy ( &k , p , ks );
// is it a negative key?
char neg = false;
if ( ! ( p[0] & 0x01 ) ) neg = true;
// this is now a bit in the posdb key so we can rebalance
char shardByTermId = false;
if ( rdbId==RDB_POSDB && g_posdb.isShardedByTermId(k))
shardByTermId = true;
// skip it
p += ks;
// get datasize
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
// . always zero if key is negative
// . this is not the case unfortunately...
if ( neg ) dataSize = 0;
// if -1, read it in
if ( dataSize == -1 ) {
dataSize = *(int32_t *)p;
// inc this
recSize += 4;
// sanity check
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
p += 4;
}
// point to it
char *data = p;
// skip the data
p += dataSize;
// inc it
recSize += dataSize;
// NULL it for negative keys
if ( dataSize == 0 ) data = NULL;
// see if one big table causes a browser slowdown
if ( (++rcount % TABLE_ROWS) == 0 )
sb->safePrintf("<!--ignore--></table>%s",hdr);
//if ( rdbId != RDB_LINKDB ) continue;
// print dbname
sb->safePrintf("<tr>");
char *dn = getDbnameFromId ( rdbId );
sb->safePrintf("<td>%s</td>",dn);
if ( neg ) sb->safePrintf("<td>D</td>");
else sb->safePrintf("<td>&nbsp;</td>");
if ( shardByTermId ) sb->safePrintf("<td>shardByTermId</td>");
else sb->safePrintf("<td>&nbsp;</td>");
sb->safePrintf("<td><nobr>%s</nobr></td>", KEYSTR(k,ks));
if ( rdbId == RDB_POSDB ) {
// get termid et al
key144_t *k2 = (key144_t *)k;
int64_t tid = g_posdb.getTermId(k2);
//uint8_t score8 = g_posdb.getScore ( *k2 );
//uint32_t score32 = score8to32 ( score8 );
// sanity check
if(dataSize!=0){char*xx=NULL;*xx=0;}
sb->safePrintf("<td>"
"termId=%020"UINT64" "
//"score8=%03"UINT32" "
//"score32=%010"UINT32""
"</td>"
,(uint64_t)tid
//(int32_t)score8,
//(int32_t)score32
);
}
else if ( rdbId == RDB_DATEDB ) {
// get termid et al
key128_t *k2 = (key128_t *)k;
int64_t tid = g_datedb.getTermId(k2);
// use indexdb's function for this. should be the same
uint8_t score8 = g_indexdb.getScore ( (char *)k );
int32_t date = g_datedb.getDate ( k2 );
uint32_t score32 = score8to32 ( score8 );
// sanity check
if(dataSize!=0){char*xx=NULL;*xx=0;}
sb->safePrintf("<td>"
"termId=%020"UINT64" "
"date=%010"UINT32" "
"score8=%03"UINT32" "
"score32=%010"UINT32""
"</td>",
tid,
date,
(int32_t)score8,
(int32_t)score32);
}
// key parsing logic from Sections.cpp::gotSectiondbList()
else if ( rdbId == RDB_SECTIONDB ) {
key128_t *k2 = (key128_t *)k;
int32_t secType = g_indexdb.getScore ( (char *)k2);
int32_t tagHash = g_datedb.getDate ( k2 );
int64_t tid = g_datedb.getTermId(k2);
int64_t siteHash = tid; // not quite 64 bits
SectionVote *sv = (SectionVote *)data;
char *dd = "tagHash32";
if ( secType == SV_TAGCONTENTHASH )
dd ="tagcontentHash32";
if ( secType == SV_TAGPAIRHASH )
dd = "tagPairHash32";
// sanity check
int32_t ds = sizeof(SectionVote);
if (!neg&&dataSize!=ds){char*xx=NULL;*xx=0;}
if ( neg&&dataSize!=0 ){char*xx=NULL;*xx=0;}
float score = 0.0;
float numSampled = 0.0;
if ( data ) {
score = sv->m_score;
numSampled = sv->m_numSampled;
}
sb->safePrintf("<td>"
"<nobr>"
"siteHash48=0x%016"XINT64" "
"%s=0x%08"XINT32" "
"secType=%s "
"score=%.02f "
"numSampled=%.02f"
"</nobr>"
"</td>",
siteHash,
dd,tagHash,
getSectionTypeAsStr(secType),
score,
numSampled);
}
else if ( rdbId == RDB_LINKDB ) {
key224_t *k2 = (key224_t *)k;
int64_t linkHash=g_linkdb.getLinkeeUrlHash64_uk(k2);
int32_t linkeeSiteHash = g_linkdb.getLinkeeSiteHash32_uk(k2);
int32_t linkerSiteHash = g_linkdb.getLinkerSiteHash32_uk(k2);
char linkSpam = g_linkdb.isLinkSpam_uk (k2);
int32_t siteRank = g_linkdb.getLinkerSiteRank_uk (k2);
//int32_t hopCount = g_linkdb.getLinkerHopCount_uk (k2);
//int32_t ip24 = g_linkdb.getLinkerIp24_uk (k2);
int32_t ip32 = g_linkdb.getLinkerIp_uk (k2);
int64_t docId = g_linkdb.getLinkerDocId_uk (k2);
// sanity check
if(dataSize!=0){char*xx=NULL;*xx=0;}
sb->safePrintf("<td>"
"<nobr>"
"linkeeSiteHash32=0x%08"XINT32" "
"linkeeUrlHash=0x%016"XINT64" "
"linkSpam=%"INT32" "
"siteRank=%"INT32" "
//"hopCount=%03"INT32" "
"sitehash32=0x%"XINT32" "
"IP32=%s "
"docId=%"UINT64""
"</nobr>"
"</td>",
linkeeSiteHash,
linkHash,
(int32_t)linkSpam,
siteRank,
//hopCount,
linkerSiteHash,
iptoa(ip32),
docId);
}
else if ( rdbId == RDB_CLUSTERDB ) {
key128_t *k2 = (key128_t *)k;
char *r = (char *)k2;
int32_t siteHash26 = g_clusterdb.getSiteHash26 ( r );
char lang = g_clusterdb.getLanguage ( r );
int64_t docId = g_clusterdb.getDocId ( r );
char ff = g_clusterdb.getFamilyFilter ( r );
// sanity check
if(dataSize!=0){char*xx=NULL;*xx=0;}
sb->safePrintf("<td>"
// 26 bit site hash
"siteHash26=0x%08"XINT32" "
"family=%"INT32" "
"lang=%03"INT32" "
"docId=%"UINT64""
"</td>",
siteHash26 ,
(int32_t)ff,
(int32_t)lang,
docId );
}
// key parsing logic taken from Address::makePlacedbKey
else if ( rdbId == RDB_PLACEDB ) {
key128_t *k2 = (key128_t *)k;
int64_t bigHash = g_placedb.getBigHash ( k2 );
int64_t docId = g_placedb.getDocId ( k2 );
int32_t snh = g_placedb.getStreetNumHash ( k2 );
//int32_t smallHash = g_placedb.getSmallHash ( k2 );
// sanity check
if(!neg &&dataSize<=0){char*xx=NULL;*xx=0;}
if( neg &&dataSize!=0){char*xx=NULL;*xx=0;}
sb->safePrintf("<td><nobr>"
"bigHash64=0x%016"XINT64" "
"docId=%"UINT64" "
"streetNumHash25=0x%08"XINT32" "
"dataSize=%010"INT32" "
"address=%s"
"</nobr>"
"</td>",
bigHash,
docId,
snh,
dataSize ,
data );
}
// key parsing logic taken from Address::makePlacedbKey
else if ( rdbId == RDB_SPIDERDB ) {
sb->safePrintf("<td><nobr>");
key128_t *k2 = (key128_t *)k;
if ( g_spiderdb.isSpiderRequest(k2) ) {
SpiderRequest *sreq = (SpiderRequest *)rec;
sreq->print ( sb );
}
else {
SpiderReply *srep = (SpiderReply *)rec;
srep->print ( sb );
}
sb->safePrintf("</nobr></td>");
}
else if ( rdbId == RDB_DOLEDB ) {
key_t *k2 = (key_t *)k;
sb->safePrintf("<td><nobr>");
sb->safePrintf("priority=%"INT32" "
"spidertime=%"UINT32" "
//"uh48=%"XINT64" "
"isdel=%"INT32"",
g_doledb.getPriority(k2),
(uint32_t)g_doledb.getSpiderTime(k2),
//g_doledb.getUrlHash48(k2),
g_doledb.getIsDel(k2));
sb->safePrintf("</nobr></td>");
}
else if ( rdbId == RDB_TITLEDB ) {
//XmlDoc tr;
//SafeBuf tmp;
//tr.set2 ( rec,recSize ,"qatest123",&tmp,m_niceness);
// print each offset and size for the variable crap
sb->safePrintf("<td><nobr>titlerec datasize=%"INT32" "
//"sizeofxmldoc=%"INT32" "
//"hdrSize=%"INT32" "
//"version=%"INT32" "
//"%s"
"</nobr></td>",
dataSize
//(int32_t)sizeof(XmlDoc),
//(int32_t)tr.m_headerSize,
//(int32_t)tr.m_version,
//tmp.getBufStart());
);
}
//else if ( rdbId == RDB_REVDB ) {
// sb->safePrintf("<td><nobr>revdb datasize=%"INT32" ",
// dataSize);
//}
else if ( rdbId == RDB_TAGDB ) {
Tag *tag = (Tag *)rec;
sb->safePrintf("<td><nobr>");
if ( rec[0] & 0x01 ) tag->printToBuf(sb);
else sb->safePrintf("negativeTagKey");
sb->safePrintf("</nobr></td>");
}
else {
char *xx=NULL;*xx=0;
}
// close it up
sb->safePrintf("</tr>\n");
/*
// hash the data into a int32_t for hash table
char *ns = "no";
if ( noSplit ) ns = "yes";
char *del = "";
if ( neg ) del = " (delete)";
if ( ks==12 ) {
key_t *k2 = (key_t *)k;
int64_t tid = g_indexdb.getTermId(k2);
uint8_t score8 = g_indexdb.getScore ( *k2 );
uint32_t score32 = score8to32 ( score8 );
log("build: key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
"tid=%"UINT64" score8=%"UINT32" score32=%"UINT32" nosplit=%s%s",
count,getDbnameFromId(rdbId),(int32_t)ks,
(int32_t)dataSize,tid ,(int32_t)score8,(int32_t)score32,
ns,del);
}
else {
log("build: key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
"nosplit=%s%s",
count,getDbnameFromId(rdbId),(int32_t)ks,
(int32_t)dataSize,ns,del);
}
*/
}
sb->safePrintf("</table>\n");
if ( sb == &tmp )
sb->print();
}
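// . debug-only walk of the meta list (only runs for coll "qatest123")
// . per record it checks: sane key size for that rdb, posdb keys carry
//   our docId (unless spider replies are indexed), spiderdb requests
//   have a url/data payload, and the walk ends exactly on "pend"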
bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// do not do this if not test collection for now
if ( strcmp(cr->m_coll,"qatest123") ) return true;
log(LOG_DEBUG, "xmldoc: VERIFYING METALIST");
// store each record in the list into the send buffers
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// first is rdbId
//char rdbId = -1; // m_rdbId;
//if ( rdbId < 0 ) rdbId = *p++;
uint8_t rdbId = *p++;
// mask off rdbId
rdbId &= 0x7f;
// get the key of the current record
//char *key = p;
// negative key?
bool del ;
if ( *p & 0x01 ) del = false;
else del = true;
// must always be negative if deleting
// spiderdb is exempt because we add a spiderreply that is
// positive and a spiderdoc
// no, this is no longer the case because we add spider
// replies to the index when deleting or rejecting a doc.
//if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) {
// char *xx=NULL;*xx=0; }
// get the key size. a table lookup in Rdb.cpp.
int32_t ks ;
if ( rdbId == RDB_POSDB || rdbId == RDB2_POSDB2 ) {
ks = 18;
// no compress bits set!
if ( p[0] & 0x06 ) { char*xx=NULL;*xx=0; }
// alignment bit set or cleared
if ( ! ( p[1] & 0x02 ) ) { char *xx=NULL;*xx=0; }
if ( ( p[7] & 0x02 ) ) { char *xx=NULL;*xx=0; }
int64_t docId = g_posdb.getDocId(p);
if ( docId != m_docId && !cr->m_indexSpiderReplies) {
log("xmldoc: %"INT64" != %"INT64""
, docId
, m_docId );
char *xx=NULL;*xx=0;
}
// else
// log("xmldoc: %"INT64" == %"INT64""
// , docId
// , m_docId );
// uint64_t termId = g_posdb.getTermId(p);
// if ( termId == 59194288760543LL ) {
// log("xmldoc: debug");
// //char *xx=NULL;*xx=0;
// }
}
else if ( rdbId == RDB_DATEDB ) ks = 16;
else ks = getKeySizeFromRdbId ( rdbId );
// sanity
if ( ks < 12 ) { char *xx=NULL;*xx=0; }
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
// another check
Rdb *rdb = getRdbFromId(rdbId);
if ( ! rdb ) { char *xx=NULL;*xx=0; }
if ( rdb->m_ks < 12 || rdb->m_ks > MAX_KEY_BYTES ) {
char *xx=NULL;*xx=0;}
// special linkdb check
//if ( rdbId == RDB_LINKDB ) {
// // parse it up
// key192_t *k = (key192_t *)p;
// unsigned char hc = g_linkdb.getLinkerHopCount_uk(k);
// if ( hc != 0 ){ char *xx=NULL;*xx=0; }
//}
char *rec = p;
// set this
//bool split = true;
//if(rdbId == RDB_POSDB && g_posdb.isShardedByTermId(p) )
// split =false;
// skip key
p += ks;
// . if key belongs to same group as firstKey then continue
// . titledb now uses last bits of docId to determine groupId
// . but uses the top 32 bits of key still
// . spiderdb uses last 64 bits to determine groupId
// . tfndb now is like titledb(top 32 bits are top 32 of docId)
//uint32_t gid = getGroupId ( rdbId , key , split );
// get the record, is -1 if variable. a table lookup.
int32_t dataSize;
if ( rdbId == RDB_POSDB || rdbId==RDB2_POSDB2)dataSize=0;
else if ( rdbId == RDB_DATEDB ) dataSize = 0;
//else if ( rdbId == RDB_REVDB ) dataSize = -1;
else if ( rdbId == RDB2_POSDB2 ) dataSize = 0;
else if ( rdbId == RDB2_DATEDB2 ) dataSize = 0;
//else if ( rdbId == RDB2_REVDB2 ) dataSize = -1;
else dataSize = getDataSizeFromRdbId ( rdbId );
// . for delete never stores the data
// . you can have positive keys without any dataSize member
// when they normally should have one, like titledb
if ( forDelete ) dataSize = 0;
// . negative keys have no data
// . this is not the case unfortunately
if ( del ) dataSize = 0;
// ensure spiderdb request recs have data/url in them
if ( (rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2) &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)rec ) &&
! forDelete &&
! del &&
dataSize == 0 ) {
char *xx=NULL;*xx=0; }
// if variable read that in
if ( dataSize == -1 ) {
// -1 means to read it in
dataSize = *(int32_t *)p;
// sanity check
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
// skip dataSize
p += 4;
}
// skip over the data, if any
p += dataSize;
// breach us?
if ( p > pend ) { char *xx=NULL;*xx=0; }
}
// must be exactly equal to end
if ( p != pend ) return false;
return true;
/*
int32_t recSize = 0;
int32_t count = 0;
for ( ; p < pend ; p += recSize , count++ ) {
// get rdbid
char rdbId = *p & 0x7f;
// get nosplit flag
char noSplit = *p & 0x80;
// skip
p++;
// get key size
int32_t ks = getKeySizeFromRdbId ( rdbId );
// sanity
if ( ks > 16 ) { char *xx=NULL;*xx=0;}
// negative key?
bool del;
if ( *p & 0x01 ) del = false;
else del = true;
// convert into a key128_t, the biggest possible key
char k[16];
gbmemcpy ( &k , p , ks );
// skip it
p += ks;
// flip this
char split = ! noSplit;
// test it
g_hostdb.getGroupId(rdbId,k,split);
// if negative, no data size allowed
if ( ( k[0] & 0x01 ) == 0x00 ) continue;
// get datasize
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
// no negative key has data
if ( del ) dataSize = 0;
// if -1, read it in
if ( dataSize == -1 ) {
dataSize = *(int32_t *)p;
// sanity check
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
p += 4;
}
// skip the data
p += dataSize;
}
*/
}
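// . two-pass helper for doConsistencyTest():
// . checkList=false: add each record's key -> rec ptr into "ht"
// . checkList=true: verify each record in [p,pend) is present in "ht"
//   and byte-identical, else log the term info and crash on purpose
// . spiderdb and tagdb records are skipped entirely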
bool XmlDoc::hashMetaList ( HashTableX *ht ,
char *p ,
char *pend ,
bool checkList ) {
int32_t recSize = 0;
int32_t count = 0;
for ( ; p < pend ; p += recSize , count++ ) {
// breathe
QUICKPOLL(m_niceness);
// get rdbid
char rdbId = *p & 0x7f;
// skip rdb id
p++;
// save that
char *rec = p;
// get key size
int32_t ks = getKeySizeFromRdbId ( rdbId );
// sanity check
if ( ks > 28 ) { char *xx=NULL;*xx=0; }
// is it a delete key?
char del ;
if ( ( p[0] & 0x01 ) == 0x00 ) del = true;
else del = false;
// convert into a key128_t, the biggest possible key
char k[MAX_KEY_BYTES];//key128_t k ;
// zero out
KEYMIN(k,MAX_KEY_BYTES);
//k.setMin();
gbmemcpy ( k , p , ks );
// skip it
p += ks;
// if negative, no data size allowed -- no
if ( del ) continue;
// get datasize
int32_t dataSize = getDataSizeFromRdbId ( rdbId );
// if -1, read it in
if ( dataSize == -1 ) {
dataSize = *(int32_t *)p;
// sanity check
if ( dataSize < 0 ) { char *xx=NULL;*xx=0; }
p += 4;
}
// hash the data into a int32_t for hash table
//int32_t h32 = 0;
//h32 = hash32 ( p , dataSize );
// do not allow 0
//if ( h32 == 0 ) h32 = 1;
// skip the data
p += dataSize;
// ignore spiderdb recs for parsing consistency check
if ( rdbId == RDB_SPIDERDB ) continue;
if ( rdbId == RDB2_SPIDERDB2 ) continue;
// ignore tagdb as well!
if ( rdbId == RDB_TAGDB || rdbId == RDB2_TAGDB2 ) continue;
// skip revdb for now too
//if ( rdbId == RDB_REVDB ) continue;
// set our rec size, includes key/dataSize/data. note: this shadows
// the outer recSize, so the for-loop's "p += recSize" adds 0; p was
// already advanced past the record above.
int32_t recSize = p - rec;
// debug point
//if ( *(uint64_t *)k == 4828936067112479745LL )
// log("hey");
// if just adding, do it
if ( ! checkList ) {
// we now store ptr to the rec, not hash!
if ( ! ht->addKey ( k , &rec ) ) return false;
continue;
}
// check to see if this rec is in the provided hash table
int32_t slot = ht->getSlot ( k );
// bitch if not found
if ( slot < 0 && ks==12 ) {
key144_t *k2 = (key144_t *)k;
int64_t tid = g_posdb.getTermId(k2);
char shardByTermId = g_posdb.isShardedByTermId(k2);
//uint8_t score8 = g_indexdb.getScore ( *k2 );
//uint32_t score32 = score8to32 ( score8 );
log("build: missing key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
"tid=%"UINT64" "
"key=%s "
//"score8=%"UINT32" score32=%"UINT32" "
"shardByTermId=%"INT32"",
count,getDbnameFromId(rdbId),(int32_t)ks,
(int32_t)dataSize,tid ,
//(int32_t)score8,(int32_t)score32,
KEYSTR(k2,ks),
(int32_t)shardByTermId);
// look it up
// shortcut
HashTableX *wt = m_wts;
// point to keys, termids?
//TermInfo **tp = (TermInfo **)wt->m_keys;
// now print the table we stored all we hashed into
for ( int32_t i = 0 ; i < wt->m_numSlots ; i++ ) {
// skip if empty
if ( wt->m_flags[i] == 0 ) continue;
// get the TermInfo
TermDebugInfo *ti;
ti = (TermDebugInfo *)wt->getValueFromSlot(i);
// skip if not us
if((ti->m_termId & TERMID_MASK)!=tid)continue;
// got us
char *start = m_wbuf.getBufStart();
char *term = start + ti->m_termOff;
char *prefix = "";
if ( ti->m_prefixOff >= 0 ) {
prefix = start + ti->m_prefixOff;
//prefix[ti->m_prefixLen] = '\0';
}
// NULL term it
term[ti->m_termLen] = '\0';
// print it
log("parser: term=%s prefix=%s",//score32=%"INT32"",
term,prefix);//,(int32_t)ti->m_score32);
}
char *xx=NULL; *xx=0;
// count it for PageStats.cpp
g_stats.m_parsingInconsistencies++;
continue;
}
if ( slot < 0 && ks != 12 ) {
// if it is sectiondb and the orig doc did not
// add sectiondb recs because m_totalSiteVoters >=
// MAX_SITE_VOTERS, then that is ok!
if ( (rdbId == RDB_SECTIONDB ||
rdbId == RDB2_SECTIONDB2 ) &&
m_sectionsValid &&
m_sections.m_totalSiteVoters >= MAX_SITE_VOTERS )
continue;
log("build: missing key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" "
"ks=%s "
,count,getDbnameFromId(rdbId),(int32_t)ks,
(int32_t)dataSize,KEYSTR(k,ks));
char *xx=NULL; *xx=0;
// count it for PageStats.cpp
g_stats.m_parsingInconsistencies++;
continue;
}
// if in there, check the hashes
//int32_t h2 = *(int32_t *)ht->getValueFromSlot ( slot );
char *rec2 = *(char **)ht->getValueFromSlot ( slot );
// get his dataSize
int32_t dataSize2 = getDataSizeFromRdbId(rdbId);
// his keysize
int32_t ks2 = getKeySizeFromRdbId(rdbId);
// get his recsize
int32_t recSize2 = ks2 ;
// if -1 that is variable
if ( dataSize2 == -1 ) {
dataSize2 = *(int32_t *)(rec2+ks2);
recSize2 += 4;
}
// add it up
recSize2 += dataSize2;
// keep on chugging if they match
if ( recSize2==recSize && !memcmp(rec,rec2,recSize) ) continue;
// otherwise, bitch
char shardByTermId = false;
if ( rdbId == RDB_POSDB )
shardByTermId = g_posdb.isShardedByTermId(rec2);
log("build: data not equal for key=%s "
"rdb=%s splitbytermid=%"INT32" dataSize=%"INT32"",
KEYSTR(k,ks2),
getDbnameFromId(rdbId),(int32_t)shardByTermId,dataSize);
// print into here
SafeBuf sb1;
SafeBuf sb2;
// print it out
if ( rdbId == RDB_SPIDERDB ) {
// get rec
if ( g_spiderdb.isSpiderRequest((key128_t *)rec) ) {
SpiderRequest *sreq1 = (SpiderRequest *)rec;
SpiderRequest *sreq2 = (SpiderRequest *)rec2;
sreq1->print(&sb1);
sreq2->print(&sb2);
}
else {
SpiderReply *srep1 = (SpiderReply *)rec;
SpiderReply *srep2 = (SpiderReply *)rec2;
srep1->print(&sb1);
srep2->print(&sb2);
}
log("build: rec1=%s",sb1.getBufStart());
log("build: rec2=%s",sb2.getBufStart());
}
char *xx=NULL; *xx=0;
// count it for PageStats.cpp
g_stats.m_parsingInconsistencies++;
}
return true;
}
/*
bool checkRegex ( SafeBuf *regex ,
char *target ,
bool *boolVal ,
bool *boolValValid ,
int32_t *compileError ,
CollectionRec *cr ) {
if ( compileError ) *compileError = false;
if ( *boolValValid )
return *boolVal;
// if not using diffbot or there is no regex, it is ok
if ( regex->length() <= 0 ) {
*boolVal = true;
*boolValValid = true;
return boolVal;
}
// whip out the regex shit i guess...
regex_t buf;
// this will store the compiled regular expression into "buf"
int32_t ret = regcomp ( &buf ,
// the actual regular expression to compile
regex->getBufStart() ,
// some flags
REG_EXTENDED|REG_ICASE|REG_NEWLINE|REG_NOSUB);
if ( ret ) {
//g_errno = ret;
if ( compileError ) *compileError = errno;
log("xmldoc: diffbot regcomp failed: %s. This should have "
"been tested before crawl was started. Ignoring.",
mstrerror(errno));
return true;
}
// now see if it is a match
if ( regexec(&buf,target,0,NULL,0) ) *boolVal = true;
else *boolVal = false;
*boolValValid = true;
return boolVal;
}
*/
// . should we send this url off to diffbot for processing?
// . if the url's downloaded content does not match the provided regex
// in m_diffbotPageProcessPattern, then we do not send the url to diffbot
// for processing
// . make sure this regex is pre-tested before starting the crawl
// so we know it compiles
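// . matching logic below: if the collection has a compiled url crawl
//   regex (cr->m_ucr) it alone decides; otherwise the plain substring
//   pattern list in m_diffbotUrlCrawlPattern is checked with
//   doesStringContainPattern()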
bool XmlDoc::doesUrlMatchDiffbotCrawlPattern() {
if ( m_matchesCrawlPatternValid )
return m_matchesCrawlPattern;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// get the compiled regular expressions
regex_t *ucr = &cr->m_ucr;
if ( ! cr->m_hasucr ) ucr = NULL;
if ( ! m_firstUrlValid ) return false;
m_matchesCrawlPatternValid = true;
m_matchesCrawlPattern = false;
Url *furl = getFirstUrl();
char *url = furl->getUrl();
// if we had a url crawl regex then regexec will return non-zero
// if our url does NOT match i guess
if ( ucr && regexec(ucr,url,0,NULL,0) )
return false;
// shortcut
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
// do not require a match on ucp if ucr is given
if ( ucp && ! ucr && ! doesStringContainPattern(url,ucp) )
return false;
m_matchesCrawlPattern = true;
return true;
}
/*
bool XmlDoc::doesUrlMatchDiffbotProcessPattern() {
return checkRegex ( &cr->m_diffbotUrlProcessPattern ,
m_firstUrl.m_url ,
&m_diffbotUrlProcessPatternMatch,
&m_diffbotUrlProcessPatternMatchValid,
NULL,
cr);
}
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
return checkRegex ( &cr->m_diffbotPageProcessPattern ,
ptr_utf8Content,
&m_diffbotPageProcessPatternMatch,
&m_diffbotPageProcessPatternMatchValid,
NULL,
cr);
}
*/
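// . an empty/unset page process pattern matches everything; otherwise
//   this is a plain substring pattern check against the raw downloaded
//   content (m_content)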
bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
char *p = cr->m_diffbotPageProcessPattern.getBufStart();
// empty? no pattern matches everything.
if ( ! p ) return true;
if ( ! m_content ) return false;
// does the content contain any of the process patterns?
return doesStringContainPattern ( m_content , p );
}
int32_t *XmlDoc::reindexJSONObjects ( int32_t *newTitleHashes,
int32_t numNewHashes ) {
return redoJSONObjects (newTitleHashes,numNewHashes,false );
}
int32_t *XmlDoc::nukeJSONObjects ( int32_t *newTitleHashes ,
int32_t numNewHashes ) {
return redoJSONObjects (newTitleHashes,numNewHashes,true );
}
// . returns ptr to status
// . diffbot uses this to remove the indexed json pages associated with
// a url. each json object is basically its own url. a json object
// url is the parent page's url with a -diffbotxyz-%"UINT32" appended to it
// where %"INT32" is the object # starting at 0 and incrementing from there.
// . XmlDoc::m_diffbotJSONCount is how many json objects the parent url had.
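// . how it works: hash the new title hashes into a dedup table, walk
//   the old title hashes from our title rec, and for each old hash no
//   longer present (or for every one when reindexing) point m_dx at the
//   fake "-diffbotxyz<hash>" url and index/delete that child doc, one
//   at a time, re-entering here via m_masterLoop until m_joc catches up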
int32_t *XmlDoc::redoJSONObjects ( int32_t *newTitleHashes ,
int32_t numNewHashes ,
bool deleteFromIndex ) {
// use this
static int32_t s_return = 1;
// if none, we are done
if ( m_diffbotJSONCount <= 0 ) return &s_return;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// i was trying to re-index some diffbot json docs in the global
// index but it wasn't set as custom crawl
//if ( ! cr->m_isCustomCrawl ) return &s_return;
// already did it?
if ( m_joc >= m_diffbotJSONCount ) return &s_return;
// new guy here
if ( ! m_dx ) {
try { m_dx = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
log("xmldoc: failed to alloc m_dx");
return NULL;
}
mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
}
//
// index the hashes of the latest diffbot json items for this parent
//
HashTableX dedup;
if ( ! dedup.set(4,0,numNewHashes*4,NULL,0,false,m_niceness,"njodt") )
return NULL;
for ( int32_t i = 0 ; i < numNewHashes ; i++ )
dedup.addKey ( &newTitleHashes[i] );
// get this old doc's current title hashes
int32_t numOldHashes;
int32_t *oldTitleHashes = getDiffbotTitleHashes ( &numOldHashes );
// sanity. should return right away without having to block
if ( oldTitleHashes == (void *)-1 ) { char *xx=NULL;*xx=0; }
//int32_t count = m_diffbotJSONCount;
// sanity again
if ( numOldHashes != m_diffbotJSONCount ) {
log("build: can't remove json objects. "
"jsoncount mismatch %"INT32" != %"INT32
,numOldHashes
,m_diffbotJSONCount
);
g_errno = EBADENGINEER;
return NULL;
//count = 0;
//char *xx=NULL;*xx=0;
}
// scan down each
for ( ; m_joc < m_diffbotJSONCount ; ) {
// only NUKE the json items for which title hashes we lost
int32_t th32 = oldTitleHashes[m_joc];
// . if still in the new diffbot reply, do not DELETE!!!
// . if there was no title, it uses hash of entire object
if ( deleteFromIndex && dedup.isInTable(&th32) ) {
m_joc++;
continue;
}
// if m_dx has no url set, call set4 i guess
if ( ! m_dx->m_firstUrlValid ) {
// make the fake url for this json object for indexing
SafeBuf fakeUrl;
fakeUrl.set ( m_firstUrl.getUrl() );
// get his title hash32
//int32_t jsonTitleHash32 = titleHashes[m_joc];
// append -diffbotxyz%"UINT32" for fake url
fakeUrl.safePrintf("-diffbotxyz%"UINT32"",
(uint32_t)th32);
// set url of new xmldoc
if ( ! m_dx->set1 ( fakeUrl.getBufStart(),
cr->m_coll ,
NULL , // pbuf
m_niceness ) )
// g_errno should be set!
return NULL;
// we are indexing json objects, don't use all these
m_dx->m_useClusterdb = false;
m_dx->m_useSpiderdb = false;
m_dx->m_useTagdb = false;
m_dx->m_usePlacedb = false;
m_dx->m_useLinkdb = false;
m_dx->m_isChildDoc = true;
m_dx->m_parentDocPtr = this;
// are we doing a query reindex or a nuke?
m_dx->m_deleteFromIndex = deleteFromIndex;//true;
// do not try to download this url
if ( ! deleteFromIndex )
m_dx->m_recycleContent = true;
// we need this because only m_dx->m_oldDoc will
// load from titledb and have it set
m_dx->m_isDiffbotJSONObject = true;
// for debug
char *str = "reindexing";
if ( deleteFromIndex ) str = "nuking";
log("xmldoc: %s %s",str,fakeUrl.getBufStart());
}
// when the indexdoc completes, or if it blocks, call us!
// we should just pass through here
m_dx->setCallback ( m_masterState , m_masterLoop );
// . this should ultimately load from titledb and not
// try to download the page since m_deleteFromIndex is
// set to true
// . if m_dx got its msg4 reply it ends up here, in which
// case do NOT re-call indexDoc() so check for
// m_listAdded.
if ( ! m_dx->m_listAdded && ! m_dx->indexDoc ( ) )
return (int32_t *)-1;
// critical error on our part trying to index it?
// does not include timeouts or 404s, etc. mostly just
// OOM errors.
if ( g_errno ) return NULL;
// count as deleted
cr->m_localCrawlInfo.m_objectsDeleted++;
cr->m_globalCrawlInfo.m_objectsDeleted++;
cr->m_needsSave = true;
// but gotta set this crap back
//log("diffbot: resetting %s",m_dx->m_firstUrl.m_url);
// clear for next guy if there is one. clears
// m_dx->m_contentValid so the set4() can be called again above
m_dx->reset();
// try to do more json objects indexed from this parent doc
m_joc++;
}
// nuke it
mdelete ( m_dx , sizeof(XmlDoc), "xddx" );
delete ( m_dx );
m_dx = NULL;
return &s_return;
}
void getMetaListWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "in get meta list wrapper" );
// get it
char *ml = THIS->getMetaList ( );
// sanity check
if ( ! ml && ! g_errno ) {
log("doc: getMetaList() returned NULL without g_errno");
sleep(5);
char *xx=NULL;*xx=0;
}
// return if it blocked
if ( ml == (void *)-1 ) return;
// sanity check
if ( THIS->m_callback1 == getMetaListWrapper ) { char *xx=NULL;*xx=0;}
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
XmlDoc *g_od = NULL;
// . returns NULL and sets g_errno on error
// . make a meta list to call Msg4::addMetaList() with
// . called by Msg14.cpp
// . a meta list is just a buffer of Rdb records of the following format:
// rdbid | rdbRecord
// . meta list does not include title rec since Msg14 adds that using Msg1
// . returns false and sets g_errno on error
// . sets m_metaList ptr and m_metaListSize
// . if "deleteIt" is true, we are a delete op on "old"
// . returns (char *)-1 if it blocks and will call your callback when done
// . generally only Repair.cpp changes these use* args to false
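// . rough flow of what follows: handle the special cases first (spider
//   status docs during a rebuild, force-deletes, docid-based query
//   reindexes of multi-doc children), then gather the validated pieces
//   (content hash, http status, site, old title rec, ...) and assemble
//   the list of rdb records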
char *XmlDoc::getMetaList ( bool forDelete ) {
if ( m_metaListValid ) return m_metaList;
setStatus ( "getting meta list" );
// force it true?
// "forDelete" means we want the metalist to consist of "negative"
// keys that will annihilate with the positive keys in the index,
// posdb and the other rdbs, in order to delete them. "deleteFromIndex"
// means to just call getMetaList(true) on the m_oldDoc (old XmlDoc)
// which is built from the titlerec in Titledb. so don't confuse
// these two things. otherwise when i add this we were not adding
// the spiderreply of "Doc Force Deleted" from doing a query reindex
// and it kept repeating every time we started gb up.
//if ( m_deleteFromIndex ) forDelete = true;
// assume valid
m_metaList = "";
m_metaListSize = 0;
// . internal callback
// . so if any of the functions we end up calling directly or
// indirectly block, this callback will be called
if ( ! m_masterLoop ) {
m_masterLoop = getMetaListWrapper;
m_masterState = this;
}
// returning from a handler that had an error?
if ( g_errno ) return NULL;
// if we are a spider status doc/titlerec and we are doing a rebuild
// operation, then keep it simple
if ( m_setFromTitleRec &&
m_useSecondaryRdbs &&
m_contentTypeValid &&
m_contentType == CT_STATUS ) {
// if not rebuilding posdb then done, list is empty since
// spider status docs do not contribute to linkdb, clusterdb,..
if ( ! m_usePosdb && ! m_useTitledb ) {
m_metaListValid = true;
return m_metaList;
}
/////////////
//
// if user disabled spider status docs then delete the titlerec
// AND the posdb index list from our dbs for this ss doc
//
/////////////
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! cr->m_indexSpiderReplies ) {
int64_t uh48 = m_firstUrl.getUrlHash48();
// delete title rec. true = delete?
key_t tkey = g_titledb.makeKey (m_docId,uh48,true);
// shortcut
SafeBuf *ssb = &m_spiderStatusDocMetaList;
// add to list. and we do not add the spider status
// doc to posdb since we deleted its titlerec.
ssb->pushChar(RDB_TITLEDB); // RDB2_TITLEDB2
ssb->safeMemcpy ( &tkey , sizeof(key_t) );
m_metaList = ssb->getBufStart();
m_metaListSize = ssb->getLength ();
m_metaListValid = true;
return m_metaList;
}
// set safebuf to the json of the spider status doc
SafeBuf jd;
if ( ! jd.safeMemcpy ( ptr_utf8Content , size_utf8Content ) )
return NULL;
// set m_spiderStatusDocMetaList from the json
if ( ! setSpiderStatusDocMetaList ( &jd , m_docId ) )
return NULL;
// TODO: support titledb rebuild as well
m_metaList = m_spiderStatusDocMetaList.getBufStart();
m_metaListSize = m_spiderStatusDocMetaList.getLength();
m_metaListValid = true;
return m_metaList;
}
// any other indexing issue? hey! g_errno might not be set here
//if ( m_indexCode ) { g_errno = m_indexCode; return NULL; }
// a hacky thing
//XmlDoc *od = (XmlDoc *)1;
//bool diffbotEmptyReply = false;
/*
// fake this for diffbot?
if ( m_useDiffbot &&
! m_isDiffbotJSONObject &&
! doesUrlMatchDiffbotCrawlPattern() ) {
// flag it so we only add the SpiderReply to spiderdb and bail
//diffbotEmptyReply = true;
// we should not delete the json objects for this url
// from the index just because the user decided to remove
// it from her crawl
m_isIndexedValid = true;
m_isIndexed = false;
m_oldDocValid = true;
m_oldDoc = NULL;
}
*/
// if "rejecting" from index fake all this stuff
if ( m_deleteFromIndex ) {
// if we are using diffbot api and diffbot found no json objects
// or we never even processed the url, we really just want to
// add the SpiderReply for this url to spiderdb and nothing more.
// NO! we still want to store the page content in titledb
// so we can see if it has changed i guess
//diffbotEmptyReply ) {
// set these things to bogus values since we don't need them
m_contentHash32Valid = true;
m_contentHash32 = 0;
m_httpStatusValid = true;
m_httpStatus = 200;
m_siteValid = true;
ptr_site = "www.poopoo.com";
size_site = gbstrlen(ptr_site)+1;
m_isSiteRootValid = true;
m_isSiteRoot2 = 1;
//m_tagHash32Valid = true;
//m_tagHash32 = 0;
m_tagPairHash32Valid = true;
m_tagPairHash32 = 0;
m_siteHash64Valid = true;
m_siteHash64 = 0LL;
m_spiderLinksValid = true;
m_spiderLinks2 = 1;
m_langIdValid = true;
m_langId = 1;
m_siteNumInlinksValid = true;
m_siteNumInlinks = 0;
m_isIndexed = true;
m_isIndexedValid = true;
m_ipValid = true;
m_ip = 123456;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
//
// BEGIN MULTI DOC QUERY REINDEX HACK
//
// this fixes it so we can do a query reindex on fake child urls
// of their original parent multidoc url. the child urls are
// subsections of the original parent url that were indexed as
// separate documents with their own docid. if we try to do a
// query reindex on such things, detect it, and add the request
// for the original parent multidoc url.
//
if ( m_sreqValid && m_sreq.m_isPageReindex &&
// if it is a force delete, then allow the user to delete
// such diffbot reply json children documents, however.
! m_sreq.m_forceDelete ) {
// see if its diffbot json object
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (char *)pod;
XmlDoc *od = *pod;
// if no old doc then we might have just been a diffbot
// json url that was directly injected into GLOBAL-INDEX
// like xyz.com/-diffbotxyz12345 (my format) or
if ( ! od ) goto skip9;
// if we are indexing a subdoc piece of a multidoc url
// then parentUrl should return non-NULL
char *parentUrl = getDiffbotParentUrl(od->m_firstUrl.m_url);
if ( ! parentUrl && od->m_contentType != CT_STATUS )
goto skip9;
// in that case we need to reindex the parent url not the
// subdoc url, so make the spider reply gen quick
//SpiderReply *newsr = od->getFakeSpiderReply();
//if ( ! newsr || newsr == (void *)-1 ) return (char *)newsr;
// use our ip though
//newsr->m_firstIp = od->m_firstIp;
// however we have to use our docid-based spider request
SpiderReply srep;
srep.reset();
// it MUST match up with original spider request so the
// lock key in Spider.cpp can unlock it. that lock key
// uses the "uh48" (48bit hash of the url) and "srep.m_firstIp"
// in this case the SpiderRequest, sreq, is docid-based because
// it was added through PageReindex.cpp (query reindex) so
// it will be the 48 bit hash64b() of the docid
// (see PageReindex.cpp)'s call to SpiderRequest::setKey()
srep.m_firstIp = m_sreq.m_firstIp;
// assume no error
srep.m_errCount = 0;
// do not inherit this one, it MIGHT HAVE CHANGED!
srep.m_siteHash32 = m_sreq.m_siteHash32;
srep.m_domHash32 = m_sreq.m_domHash32;
srep.m_spideredTime = getTimeGlobal();
int64_t uh48 = m_sreq.getUrlHash48();
int64_t parentDocId = 0LL;
srep.m_contentHash32 = 0;
// were we already in titledb before we started spidering?
// yes otherwise we would have called "goto skip9" above
srep.m_wasIndexed = 1;
srep.m_wasIndexedValid = 1;
srep.m_isIndexed = 1;
srep.m_isIndexedINValid = false;
srep.m_errCode = EREINDEXREDIR; // indexCode
srep.m_downloadEndTime = 0;
srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false );
// lock of request needs to match that of reply so the
// reply, when received by Rdb.cpp which calls addSpiderReply()
// can unlock this url so it can be spidered again.
int64_t lock1 = makeLockTableKey(&m_sreq);
int64_t lock2 = makeLockTableKey(&srep);
if ( lock1 != lock2 ) { char *xx=NULL;*xx=0; }
// make a fake spider reply so this docid-based spider
// request is not used again
//SpiderReply srep;
// store the rdbid
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
if ( ! m_zbuf.pushChar(rd) )
return NULL;
// store that reply to indicate this spider request has
// been fulfilled!
if( ! m_zbuf.safeMemcpy (&srep, srep.getRecSize()))
return NULL;
// but also store a new spider request for the parent url
SpiderRequest ksr;
int64_t pd;
// skip if doc is a spider status "document". their docids
// often get added during a query reindex but we should ignore
// them completely.
if ( od->m_contentType == CT_STATUS )
goto returnList;
//goto returnList;
// complain
if ( cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl )
log("build: doing query reindex but diffbot api "
"url is not set in spider controls");
// just copy original request
gbmemcpy ( &ksr , &m_sreq , m_sreq.getRecSize() );
// do not spider links, it's a page reindex of a multidoc url
ksr.m_avoidSpiderLinks = 1;
// avoid EDOCUNCHANGED
ksr.m_ignoreDocUnchangedError = 1;
// no longer docid based we set it to parentUrl
ksr.m_urlIsDocId = 0;
// but consider it a manual add. this should already be set.
ksr.m_isPageReindex = 1;
// but it is not docid based, so overwrite the docid
		// in ksr.m_url with the parent multidoc url. strcpy() null-terminates it.
strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1);
// this must be valid
//if ( ! od->m_firstIpValid ) { char *xx=NULL;*xx=0; }
// set the key, ksr.m_key. isDel = false
// fake docid
pd = g_titledb.getProbableDocId(parentUrl);
ksr.setKey ( m_sreq.m_firstIp, pd , false );
// store this
if ( ! m_zbuf.pushChar(rd) )
return NULL;
// then the request
if ( ! m_zbuf.safeMemcpy(&ksr,ksr.getRecSize() ) )
return NULL;
returnList:
// prevent cores in indexDoc()
m_indexCode = EREINDEXREDIR;
m_indexCodeValid = true;
// for now we set this crap
m_metaList = m_zbuf.getBufStart();
m_metaListSize = m_zbuf.length();
m_metaListValid = true;
return m_metaList;
}
//
// END DIFFBOT OBJECT QUERY REINDEX HACK
//
skip9:
// get our checksum
int32_t *plainch32 = getContentHash32();
if ( ! plainch32 || plainch32 == (void *)-1 ) return (char *)plainch32;
// get this too
int16_t *hs = getHttpStatus ();
if ( ! hs || hs == (void *)-1 ) return (char *)hs;
// make sure site is valid
char *site = getSite();
if ( ! site || site == (void *)-1 ) return (char *)site;
// this seems to be an issue as well for "unchanged" block below
char *isr = getIsSiteRoot();
if ( ! isr || isr == (void *)-1 ) return (char *)isr;
// get hash of all tags from tagdb that affect what we index
//int32_t *tagHash = getTagHash32 ( );
//if ( ! tagHash || tagHash == (void *)-1 ) return (char *)tagHash;
int64_t *sh64 = getSiteHash64();
if ( ! sh64 || sh64 == (void *)-1 ) return (char *)sh64;
// make sure docid valid
int64_t *mydocid = getDocId();
if ( ! mydocid || mydocid == (int64_t *)-1) return (char *)mydocid;
// . get the old version of our XmlDoc from the previous spider time
// . set using the old title rec in titledb
// . should really not do any more than set m_titleRec...
// . should not even uncompress it!
// . getNewSpiderReply() will use this to set the reply if
// m_indexCode == EDOCUNCHANGED...
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (char *)pod;
// point to the old xml doc if no error, etc.
XmlDoc *od = *pod;
// check if we are already indexed
char *isIndexed = getIsIndexed ();
if ( ! isIndexed || isIndexed == (char *)-1 ) return (char *)isIndexed;
// do not delete anything in these cases, but do remove us from
// spiderdb, and from tfndb (except for EDOCNOTNEW)
//if ( m_indexCode == EDOCNOTNEW || m_indexCode == EDOCNOTOLD )
// od = NULL;
// why call this way down here? it ends up downloading the doc!
int32_t *indexCode = getIndexCode();
if ( ! indexCode || indexCode ==(void *)-1) return (char *)indexCode;
// sanity check
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// this means to abandon the injection
if ( *indexCode == EABANDONED ||
*indexCode == EHITCRAWLLIMIT ||
*indexCode == EHITPROCESSLIMIT ) {
m_metaList = (char *)0x123456;
m_metaListSize = 0;
m_metaListValid = true;
return m_metaList;
}
// if diffbot reply is empty, don't bother adding anything except
// for the spider reply... reply might be "-1" too!
//if ( m_useDiffbot &&
// ! m_isDiffbotJSONObject &&
// m_diffbotReplyValid &&
// m_diffbotReply.length() <= 3 )
// diffbotEmptyReply = true;
// . some index code warrant retries, like EDNSTIMEDOUT, ETCPTIMEDOUT,
// etc. these are deemed temporary errors. other errors basically
// indicate a document that will never be indexable and should,
// if currently indexed, be deleted.
// . just add the spider reply and we're done
if ( *indexCode == EDNSTIMEDOUT
|| *indexCode == ETCPTIMEDOUT
|| *indexCode == EUDPTIMEDOUT
|| *indexCode == EDNSDEAD
|| *indexCode == ENETUNREACH
|| *indexCode == EHOSTUNREACH
// . rejected from a diffbot regex url crawl filter?
// . or no json objects returned from diffbot?
	     // . or rejected from the processing regex filter?
// . then just add the SpiderReply to avoid respidering
// . NO! still need to add outlinks
//|| diffbotEmptyReply
// . treat this as a temporary error i guess
// . getNewSpiderReply() below will clear the error in it and
// copy stuff over from m_sreq and m_oldDoc for this case
|| *indexCode == EDOCUNCHANGED
) {
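		// for these temporary errors (and EDOCUNCHANGED) the meta
		// list we return is minimal: the spider status doc meta
		// list (if any) followed by a single rdbId byte and the
		// new SpiderReply. nothing is added to posdb, titledb or
		// linkdb in this branch.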
// sanity - in repair mode?
if ( m_useSecondaryRdbs ) { char *xx=NULL;*xx=0; }
// . this seems to be an issue for blocking
// . if we do not have a valid ip, we can't compute this,
// in which case it will not be valid in the spider reply
// . why do we need this for timeouts etc? if the doc is
// unchanged
// we should probably update its siteinlinks in tagdb
// periodically and reindex the whole thing...
// . i think we were getting the sitenuminlinks for
// getNewSpiderReply()
if ( m_ipValid &&
m_ip != 0 &&
m_ip != -1 ) {
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
}
// all done!
bool addReply = true;
// Scraper.cpp uses this
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
// page parser calls set4 and sometimes gets a dns time out!
if ( m_sreqValid && m_sreq.m_isPageParser ) addReply = false;
// return nothing if done
if ( ! addReply ) {
m_metaListSize = 0;
m_metaList = (char *)0x1;
return m_metaList;
}
// save this
int32_t savedCode = *indexCode;
// before getting our spider reply, assign crap from the old
// doc to us since we are unchanged! this will allow us to
// call getNewSpiderReply() without doing any processing, like
// setting the Xml or Words classes, etc.
copyFromOldDoc ( od );
// need this though! i don't want to print out "Success"
// in the log in the logIt() function
m_indexCode = savedCode;
m_indexCodeValid = true;
// but set our m_contentHash32 from the spider request
// which got it from the spiderreply in the case of
// EDOCUNCHANGED. this way ch32=xxx will log correctly.
// I think this is only when EDOCUNCHANGED is set in the
// Msg13.cpp code, when we have a spider compression proxy.
if ( *indexCode == EDOCUNCHANGED &&
m_sreqValid &&
! m_contentHash32Valid ) {
m_contentHash32 = m_sreq.m_contentHash32;
m_contentHash32Valid = true;
}
		// we need these for getNewSpiderReply()
m_wasInIndex = false;
if ( od ) m_wasInIndex = true;
m_isInIndex = m_wasInIndex;
m_wasInIndexValid = true;
m_isInIndexValid = true;
// unset our ptr_linkInfo1 so we do not free it and core
// since we might have set it in copyFromOldDoc() above
ptr_linkInfo1 = NULL;
size_linkInfo1 = 0;
m_linkInfo1Valid = false;
bool indexNewTimeStamp = false;
if ( getUseTimeAxis() &&
od &&
m_hasMetadata &&
*indexCode == EDOCUNCHANGED
//m_spideredTimeValid &&
//od->m_spideredTime != m_spideredTime
)
indexNewTimeStamp = true;
// . if not using spiderdb we are done at this point
// . this happens for diffbot json replies (m_dx)
if ( ! m_useSpiderdb && ! indexNewTimeStamp ) {
m_metaList = NULL;
m_metaListSize = 0;
return (char *)0x01;
}
// get our spider reply
SpiderReply *newsr = getNewSpiderReply();
// return on error
if ( ! newsr ) return (char *)newsr;
// . panic on blocking! this is supposed to be fast!
// . it might still have to lookup the tagdb rec?????
if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; }
// how much we need
int32_t needx = sizeof(SpiderReply) + 1;
// . INDEX SPIDER REPLY (1a)
// . index ALL spider replies as separate doc. error or not.
// . then print out error histograms.
// . we should also hash this stuff when indexing the
// doc as a whole
// i guess it is safe to do this after getting the spiderreply
SafeBuf *spiderStatusDocMetaList = NULL;
// if ( cr->m_indexSpiderReplies &&
// m_useSpiderdb &&
// // doing it for diffbot throws off smoketests.
// // yeah, but we need it, so we'll just have to update
// // the smoketests
// ! cr->m_isCustomCrawl ) {
// get the spiderreply ready to be added
spiderStatusDocMetaList = getSpiderStatusDocMetaList(newsr ,
forDelete);
// error?
if ( ! spiderStatusDocMetaList ) return NULL;
// blocked?
if (spiderStatusDocMetaList==(void *)-1)
return (char *)-1;
// . now append the new stuff.
// . we overwrite the old titlerec with the new one that has
// some more json in the ptr_metaInfo buffer so we hash
// its new timestamp. 'gbspiderdate' and any info from
// the meta info given in the injection request if there.
// this allows you to tag each document, even multiple
// versions of the same url with the same content. so if
// you spider the doc again and it is unchanged since last
// time we still index some of this meta stuff.
if ( indexNewTimeStamp )
appendNewMetaInfo (spiderStatusDocMetaList,forDelete);
// need to alloc space for it too
int32_t len = spiderStatusDocMetaList->length();
needx += len;
// this too
m_addedStatusDocSize = len;
m_addedStatusDocSizeValid = true;
//}
// doledb key?
//if ( m_doledbKey.n0 || m_doledbKey.n1 )
// needx += 1 + sizeof(key_t); // + 4;
// the FAKEDB unlock key for msg12 in spider.cpp
//needx += 1 + sizeof(key_t); // FAKEDB
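		// layout of the buffer we are about to allocate:
		//   [spider status doc meta list bytes, if any]
		//   [1 byte rdbId (RDB_SPIDERDB or RDB2_SPIDERDB2)]
		//   [SpiderReply record]
		// needx is an exact size here, hence the
		// (m_p - saved != needx) sanity check below.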
// make the buffer
m_metaList = (char *)mmalloc ( needx , "metalist");
if ( ! m_metaList ) return NULL;
// save size for freeing later
m_metaListAllocSize = needx;
// ptr and boundary
m_p = m_metaList;
m_pend = m_metaList + needx;
// save it
char *saved = m_p;
// first store spider reply "document"
if ( spiderStatusDocMetaList ) {
gbmemcpy ( m_p,
spiderStatusDocMetaList->getBufStart(),
spiderStatusDocMetaList->length() );
m_p += spiderStatusDocMetaList->length();
}
/*
Not any more, now we remove from doledb as soon
as we get all the lock grants in our group (shard)
using Msg4 in Spider.cpp. That way we can add a
"0" entry into the waiting tree (or a time X ms into
the future from now) to try to enforce a sameIpWait
constraint and also allow up to maxSpidersPerIP.
// remove from doledb if we had a valid key
// (BEFORE adding SpiderReply)
if ( m_doledbKey.n0 || m_doledbKey.n1 ) {
// note it
setStatus ( "removing key from doledb" );
// . now remove the original spider rec from "doledb"
// . rdbid first
*m_p = RDB_DOLEDB;
m_p++;
// then the key
*(key_t *)m_p = m_doledbKey;
// nukey, clear del bit to delete it
*m_p = *m_p & 0xfe;
// skip key
m_p += sizeof(key_t);
// then zero for data size
// *(int32_t *)m_p = 0;
//m_p += 4;
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
}
*/
// sanity check
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// . make a fake titledb key
// . remove the spider lock (Msg12 in Spider.cpp)
	// . now Spider.cpp uses SpiderReply reception to remove lock
// - mdw 9/28/13
//*m_p++ = RDB_FAKEDB;
//*(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true );
//key_t fakeKey;
//fakeKey.n1 = 0;
//fakeKey.n0 = m_docId;
//gbmemcpy ( m_p , &fakeKey , sizeof(key_t) );
//m_p += sizeof(key_t);
// now add the new rescheduled time
setStatus ( "adding SpiderReply to spiderdb" );
// rdbid first
char rd = RDB_SPIDERDB;
if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
*m_p++ = rd;
// get this
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
// store the spider rec
int32_t newsrSize = newsr->getRecSize();
gbmemcpy ( m_p , newsr , newsrSize );
m_p += newsrSize;
m_addedSpiderReplySize = newsrSize;
m_addedSpiderReplySizeValid = true;
// sanity check
if ( m_p - saved != needx ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
// verify it
m_metaListValid = true;
// set size
m_metaListSize = m_p - m_metaList;
// all done
return m_metaList;
}
// if using diffbot do not index the content of the web page we
// got the json objects from, although, do keep it cached in titledb
// because that can be useful
// Not any more, now index the pages as well! then restrict search
// to type:json to just search json objects.
//if ( m_useDiffbot && ! m_isDiffbotJSONObject ) {
// m_usePosdb = false;
// m_useClusterdb = false;
//}
// get the old meta list if we had an old doc
char *oldList = NULL;
int32_t oldListSize = 0;
if ( od ) {
od->m_useSpiderdb = false;
od->m_useTagdb = false;
// do not use diffbot for old doc since we call
		// od->nukeJSONObjects() below
od->m_diffbotApiUrlValid = true;
// api url should be empty by default
//od->m_diffbotApiNum = DBA_NONE;
//log("break it here. shit this is not getting the list!!!");
// if we are doing diffbot stuff, we are still indexing this
// page, so we need to get the old doc meta list
oldList = od->getMetaList ( true );
oldListSize = od->m_metaListSize;
if ( ! oldList || oldList ==(void *)-1) return (char *)oldList;
}
// . set whether we should add recs to titledb, posdb, linkdb, etc.
// . if this doc is set by titlerec we won't change these
// . we only turn off m_usePosdb, etc. if there is a
// <meta name=noindex content=1>
// . we will still add to spiderdb, but not posdb, linkdb, titledb
// and clusterdb.
// . so we'll add the spiderreply for this doc and the spiderrequests
// for all outlinks and "firstIp" tagrecs to tagdb for those outlinks
// . we use this for adding the url seed file gbdmoz.urls.txt
// which contains a list of all the dmoz urls we want to spider.
// gbdmoz.urls.txt is generated by dmozparse.cpp. we spider all
// these dmoz urls so we can search the CONTENT of the pages in dmoz,
// something dmoz won't let you do.
char *mt = hasNoIndexMetaTag();
if ( ! mt || mt == (void *)-1 ) return (char *)mt;
if ( *mt ) {
m_usePosdb = false;
m_useLinkdb = false;
m_useTitledb = false;
m_useClusterdb = false;
// do not add the "firstIp" tagrecs of the outlinks any more
// because it might hurt us?
m_useTagdb = false;
}
if ( cr->m_isCustomCrawl )
m_useLinkdb = false;
// . should we recycle the diffbot reply for this url?
// . if m_diffbotOnlyProcessIfNewUrl is true then we want to keep
// our existing diffbot reply, i.e. recycle it, even though we
// respidered this page.
bool *recycle = getRecycleDiffbotReply();
if ( ! recycle || recycle == (void *)-1) return (char *)recycle;
// in that case inherit this from the old doc...
if ( od && *recycle && od->m_diffbotJSONCount &&
// somehow i am seeing that this is empty!
// this is how many title hashes of diffbot replies we've
// stored in the old doc's titlerec. if these are not equal
// and we call reindexJSONObjects() below then it cores
// in redoJSONObjects().
od->size_linkInfo2/4 == od->m_diffbotJSONCount &&
	     // only call this once otherwise we double-store
// m_diffbotTitleHashBuf
m_diffbotJSONCount == 0 ) {//cr->m_isCustomCrawl){
m_diffbotJSONCount = od->m_diffbotJSONCount;
m_sentToDiffbot = od->m_sentToDiffbot;
m_gotDiffbotSuccessfulReply = od->m_gotDiffbotSuccessfulReply;
// copy title hashes info. it goes hand in hand with the
// NUMBER of diffbot items we have.
int nh = 0;
int32_t *ohbuf = od->getDiffbotTitleHashes ( &nh );
if ( ! m_diffbotTitleHashBuf.safeMemcpy ( ohbuf , nh*4 ) )
return NULL;
ptr_linkInfo2 =(LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
size_linkInfo2=m_diffbotTitleHashBuf.length();
}
// but we might have to call reindexJSONObjects() multiple times if
// it would block
if ( od && *recycle &&
// only reindex if it is a query reindex i guess otherwise
// just leave it alone
m_sreqValid && m_sreq.m_isPageReindex &&
od->m_diffbotJSONCount &&
size_linkInfo2 ) {
// similar to od->nukeJSONObjects
int32_t *ohbuf =(int32_t *)m_diffbotTitleHashBuf.getBufStart();
int32_t nh =m_diffbotTitleHashBuf.length() / 4;
int32_t *status = reindexJSONObjects( ohbuf , nh );
if ( ! status || status == (void *)-1) return (char *)status;
}
// just delete the json items whose "title hashes" are present
// in the "old doc" but NOT i the "new doc".
// we use the title hash to construct a unique url for each json item.
// if the title hash is present in both the old and new docs then
// do not delete it here, but we will reindex it later in
// getMetaList() below when we call indexDoc() on each one after
// setting m_dx to each one.
bool nukeJson = true;
if ( ! od ) nukeJson = false;
if ( od && od->m_diffbotJSONCount <= 0 ) nukeJson = false;
// if recycling json objects, leave them there!
if ( *recycle ) nukeJson = false;
// you have to be a diffbot crawl to do this
	// no, not if you have the diffbot api url set... so take this out
//if ( ! cr->m_isCustomCrawl ) nukeJson = false;
// do not remove old diffbot json objects if pageparser.cpp test
// because that can not change the index, etc.
if ( getIsPageParser() ) nukeJson = false;
if ( nukeJson ) {
// it should only nuke/delete the json items that we LOST,
// so if we still have the title hash in our latest
// diffbot reply, then do not nuke that json item, which
		// will have a url ending in -diffbotxyz%"UINT32"
// (where %"UINT32" is the json item title hash).
// This will download the diffbot reply if not already there.
int32_t numHashes;
int32_t *th = getDiffbotTitleHashes(&numHashes);
if ( ! th && ! g_errno ) { char *xx=NULL;*xx=0; }
if ( ! th || th == (void *)-1 ) return (char *)th;
// this returns false if it blocks
int32_t *status = od->nukeJSONObjects( th , numHashes );
if ( ! status || status == (void *)-1) return (char *)status;
}
// . need this if useTitledb is true
// . otherwise XmlDoc::getTitleRecBuf() cores because its invalid
// . this cores if rebuilding just posdb because hashAll() needs
// the inlink texts for hashing
//if ( m_useTitledb ) {
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 )
return (char *)info1;
//}
// global debug
g_od = od;
/*
// is the document content unchanged?
bool unchanged = false;
if ( od && od->m_contentHash32 == *ch32 ) unchanged = true;
// http status of 304 means "not modified since"
if ( od && *hs == 304 ) unchanged = true;
// compare to last time
if ( od && *tagHash != od->m_tagHash32 ) unchanged = false;
// do not do this if from pageparser.cpp
//if ( m_sreqValid && m_sreq.m_isPageParser ) unchanged = false;
if ( getIsPageParser() ) unchanged = false;
// force reindex if it was from query reindex (docid based spider req)
if ( m_sreqValid && m_sreq.m_urlIsDocId ) unchanged = false;
// if we were turked... how to tell????
if ( m_sreqValid && m_sreq.m_isInjecting ) unchanged = false;
// just turn it all off for now because our parsing logic might
// have changed
unchanged = false;
// set this i guess for doConsistencyTest()
m_unchanged = unchanged;
m_unchangedValid = true;
// . if doc content was unchanged just add the SpiderReply to the
// meta list so that spiderdb knows we attempted it at this time.
// . copy over member vars of the old titleRec/XmlDoc into us so
// we can save time and cpu
if ( unchanged ) {
// this seems to be an issue for blocking
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
// all done!
bool addReply = true;
// Scraper.cpp uses this
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
// return nothing if done
if ( ! addReply ) {
m_metaListSize = 0;
m_metaList = (char *)0x1;
return m_metaList;
}
// before getting our spider reply, assign crap from the old
// doc to us since we are unchanged! this will allow us to
// call getNewSpiderReply() without doing any processing, like
// setting the Xml or Words classes, etc.
copyFromOldDoc ( od );
// and don't forget to validate this
int32_t *ic = getIndexCode();
// should never block since we copied from old doc
if ( ic == (void *)-1 ) { char *xx=NULL;*xx=0; }
// get our spider reply
SpiderReply *newsr = getNewSpiderReply();
// return on error
if ( ! newsr ) return (char *)newsr;
// . panic on blocking! this is supposed to be fast!
// . it might still have to lookup the tagdb rec?????
if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; }
// unset our ptr_linkInfo1 so we do not free it and core
// since we might have set it in copyFromOldDoc() above
ptr_linkInfo1 = NULL;
size_linkInfo1 = 0;
// how much we need
int32_t needx = sizeof(SpiderReply) + 1;
// doledb key?
if ( m_doledbKey.n0 || m_doledbKey.n1 )
needx += 1 + sizeof(key_t); // + 4;
// the titledb unlock key for msg12 in spider.cpp
needx += 1 + sizeof(key_t);
// make the buffer
m_metaList = (char *)mmalloc ( needx , "metalist");
if ( ! m_metaList ) return NULL;
// save size for freeing later
m_metaListAllocSize = needx;
// ptr and boundary
m_p = m_metaList;
m_pend = m_metaList + needx;
// save it
char *saved = m_p;
// remove from doledb if we had a valid key (BEFORE adding SpiderReply)
if ( m_doledbKey.n0 || m_doledbKey.n1 ) {
// note it
setStatus ( "removing key from doledb" );
// . now remove the original spider rec from "doledb"
// . rdbid first
*m_p = RDB_DOLEDB;
m_p++;
// then the key
*(key_t *)m_p = m_doledbKey;
// nukey, clear del bit to delete it
*m_p = *m_p & 0xfe;
// skip key
m_p += sizeof(key_t);
// then zero for data size
// *(int32_t *)m_p = 0;
//m_p += 4;
// sanity check
verifyMetaList( m_metaList , m_p );
}
// sanity check
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// . make a fake titledb key
// . remove the spider lock (Msg12 in Spider.cpp)
*m_p++ = RDB_FAKEDB;
*(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true );
m_p += sizeof(key_t);
// now add the new rescheduled time
// note it
setStatus ( "adding SpiderReply to spiderdb" );
// rdbid first
*m_p = RDB_SPIDERDB;
// use secondary?
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
m_p++;
// get this
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
// store the spider rec
int32_t newsrSize = newsr->getRecSize();
gbmemcpy ( m_p , newsr , newsrSize );
m_p += newsrSize;
// sanity check
if ( m_p - saved != needx ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p );
// verify it
m_metaListValid = true;
// set size
m_metaListSize = m_p - m_metaList;
// all done
return m_metaList;
}
*/
// so getSiteRank() works
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
// so addTable144 works
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId;
//
// . before making the title rec we need to set all the ptrs!
// . so at least now set all the data members we will need to
	// serialize into the title rec because we can't be blocking further
// down below after we set all the hashtables and XmlDoc::ptr_ stuff
if ( ! m_setFromTitleRec || m_useSecondaryRdbs ) {
// all member vars should already be valid if set from titlerec
char *ptg = prepareToMakeTitleRec ();
// return NULL with g_errno set on error
if ( ! ptg || ptg == (void *)-1 ) return (char *)ptg;
}
// sanity check - if the valid title rec is null, m_indexCode is set!
//if ( ! *tr && ! m_indexCode ) { char *xx=NULL;*xx=0; }
// . bail. return an empty meta list, m_metaListSize should be 0!
// . NO! we need to add a SpiderReply to spiderdb...
//if ( ! *tr )
// log("HEY");
/*
if ( ! *tr ) {
m_metaList = "";
m_metaListSize = 0;
m_metaListValid = true;
return m_metaList;
}
*/
// get this for hashing stuff
//Spam *sp = getSpam();
//if ( ! sp || sp == (void *)-1 ) return (char *)sp;
// our next slated spider priority
char *spiderLinks3 = getSpiderLinks();
if ( ! spiderLinks3 || spiderLinks3 == (char *)-1 )
return (char *)spiderLinks3;
bool spideringLinks = *spiderLinks3;
	// shortcut
XmlDoc *nd = this;
///////////////////////////////////
///////////////////////////////////
//
//
// if we had an error, do not add us regardless to the index
// although we might add SOME things depending on the error.
	// Like add the redirecting url if we had an ESIMPLIFIEDREDIR error.
	// So what we add to the Rdbs depends on the indexCode.
//
if ( m_indexCode ) nd = NULL;
// OR if deleting from index, we just want to get the metalist
// directly from "od"
if ( m_deleteFromIndex ) nd = NULL;
//
//
///////////////////////////////////
///////////////////////////////////
if ( ! nd )
spideringLinks = false;
// set these for getNewSpiderReply() so it can set
// SpiderReply::m_wasIndexed and m_isIndexed...
m_wasInIndex = false;
m_isInIndex = false;
if ( od ) m_wasInIndex = true;
if ( nd ) m_isInIndex = true;
m_wasInIndexValid = true;
m_isInIndexValid = true;
// if we are adding a simplified redirect as a link to spiderdb
if ( m_indexCode == EDOCSIMPLIFIEDREDIR )
spideringLinks = true;
// likewise if there error was ENONCANONICAL treat it like that
if ( m_indexCode == EDOCNONCANONICAL )
spideringLinks = true;
//
// . prepare the outlink info if we are adding links to spiderdb!
// . do this before we start hashing so we do not block and re-hash!!
//
if ( spideringLinks && ! m_doingConsistencyCheck && m_useSpiderdb){
setStatus ( "getting outlink info" );
TagRec ***grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (char *)grv;
//char **iiv = getOutlinkIsIndexedVector();
//if ( ! iiv || iiv == (void *)-1 ) return (char *)iiv;
int32_t **ipv = getOutlinkFirstIpVector();
if ( ! ipv || ipv == (void *)-1 ) return (char *)ipv;
//int8_t *hcv = getOutlinkHopCountVector();
//if ( ! hcv || hcv == (void *)-1 ) return (char *)hcv;
//char *ipi = getIsIndexed(); // is the parent indexed?
//if ( ! ipi || ipi == (char *)-1 ) return (char *)ipi;
}
// get the tag buf to add to tagdb
SafeBuf *ntb = NULL;
if ( m_useTagdb && ! m_deleteFromIndex ) {
ntb = getNewTagBuf();
if ( ! ntb || ntb == (void *)-1 ) return (char *)ntb;
}
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
Words *ww = getWords();
if ( ! ww || ww == (void *)-1 ) return (char *)ww;
int64_t *pch64 = getExactContentHash64();
//int64_t *pch64 = getLooseContentHash64();
if ( ! pch64 || pch64 == (void *)-1 ) return (char *)pch64;
// get the voting table which we will add to sectiondb
SectionVotingTable *nsvt = NULL;
SectionVotingTable *osvt = NULL;
// seems like
	// sectiondb takes up about 15% of the disk space like this. no!
// cuz then there is revdb, so we are 30%. so that's a no go.
bool addSectionVotes = false;
if ( nd ) addSectionVotes = true;
if ( ! m_useSectiondb ) addSectionVotes = false;
	// to save disk space no longer add the roots! not only saves sectiondb
// but also saves space in revdb
//if ( nd && *isRoot ) addSectionVotes = true;
if ( addSectionVotes ) {
nsvt = getNewSectionVotingTable();
if ( ! nsvt || nsvt == (void *)-1 ) return (char *)nsvt;
// get the old table too!
osvt = getNewSectionVotingTable();
if ( ! osvt || osvt == (void *)-1 ) return (char *)osvt;
}
// get the addresses for hashing tag hashes that indicate place names
Addresses *na = NULL;
//Addresses *oa = NULL;
if ( nd ) na = getAddresses();
//if ( od ) oa = od->getAddresses();
// get dates ready for hashing
Dates *ndp = NULL;
//Dates *odp = NULL;
if ( nd ) {
ndp = nd->getDates();
if ( ! ndp || ndp==(void *)-1) return (char *)ndp;
}
//if ( od ) {
// odp = od->getDates();
// if ( ! odp || odp==(void *)-1) return (char *)odp;
//}
// need firstip if adding a rebuilt spider request
if ( m_useSecondaryRdbs && ! m_isDiffbotJSONObject && m_useSpiderdb ) {
int32_t *fip = getFirstIp();
if ( ! fip || fip == (void *)-1 ) return (char *)fip;
}
// shit, we need a spider reply so that it will not re-add the
// spider request to waiting tree, we ignore docid-based
// recs that have spiderreplies in Spider.cpp
SpiderReply *newsr = NULL;
if ( m_useSpiderdb ) { // && ! m_deleteFromIndex ) {
newsr = getNewSpiderReply();
if ( ! newsr || newsr == (void *)-1 ) return (char *)newsr;
}
// the site hash for hashing
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (char *)sh32;
// set ptr_clockCandidatesData
// if ( nd ) {
// HashTableX *cct = nd->getClockCandidatesTable();
// if ( ! cct || cct==(void *)-1) return (char *)cct;
// }
if ( m_useLinkdb && ! m_deleteFromIndex ) {
int32_t *linkSiteHashes = getLinkSiteHashes();
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 )
return (char *)linkSiteHashes;
}
//SafeBuf *au = getDiffbotApiUrl();
//if ( ! au || au == (void *)-1 ) return (char *)au;
// test json parser
//
/*
char *json = "{\"icon\":\"http://www.pixar.com/sites/default/files/pixar_2012_favicon_0.ico\",\"text\":\"\",\"title\":\"Pixar\",\"type\":\"article\",\"media\":[{\"primary\":\"true\",\"link\":\"http://www.pixar.com/sites/default/files/home_billboard_v7.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/roz1_0.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/home_bu-thumb_v1.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/mu_home_thumb.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/brenda_homepage.jpg\",\"type\":\"image\"}],\"url\":\"http://www.pixar.com/\"}";
char *dd = getNextJSONObject ( json );
if ( *dd ) { char *xx=NULL;*xx=0; }
*/
///////////
//
// BEGIN the diffbot json object index hack
//
// if we are using diffbot, then each json object in the diffbot reply
// should be indexed as its own document.
//
///////////
// . get the reply of json objects from diffbot
// . this will be empty if we are a json object!
// . will also be empty if not meant to be sent to diffbot
// . the TOKENIZED reply consists of \0 separated json objects that
// we create from the original diffbot reply
SafeBuf *tdbr = getTokenizedDiffbotReply();
if ( ! tdbr || tdbr == (void *)-1 ) return (char *)tdbr;
// i guess it is safe to do this after getting the spiderreply
SafeBuf *spiderStatusDocMetaList = NULL;
//if ( indexReply ) {
// get the spiderreply ready to be added to the rdbs w/ msg4
// but if doing a rebuild operation then do not get it, we'll rebuild
// it since it will have its own titlerec
if ( ! m_useSecondaryRdbs ) {
spiderStatusDocMetaList =
getSpiderStatusDocMetaList (newsr,forDelete);
if ( ! spiderStatusDocMetaList ) {
log("build: ss doc metalist null. bad!");
return NULL;
}
}
if ( spiderStatusDocMetaList == (void *)-1)
return (char *)spiderStatusDocMetaList;
//}
int32_t tdbrLen = tdbr->length();
// do not index json items as separate docs if we are page parser
if ( getIsPageParser() ) tdbrLen = 0;
// same goes if appending -diffbotxyz%UINT32 would be too long
if ( m_firstUrl.getUrlLen() + 11 + 10 > MAX_URL_LEN )
tdbrLen = 0;
	// once we have the tokenized diffbot reply we can get a unique
// hash of the title of each json item. that way, if a page changes
// and it gains or loses a diffbot item, the old items will still
// have the same url and we can set their m_indexCode to EDOCUNCHANGED
// if the individual json item itself has not changed when we
// call m_dx->indexDoc() below.
int32_t numHashes = 0;
int32_t *titleHashBuf = NULL;
//
// if we got a json object or two from diffbot, index them
// as their own child xmldocs.
// watch out for reply from diffbot of "-1" indicating error!
//
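	// overview of the loop below: the tokenized diffbot reply is a
	// sequence of \0-separated json items. for each item we build a
	// fake url (our url + "-diffbotxyz" + the item's title hash),
	// call m_dx->set4() with that url and the raw json as CT_JSON
	// content, call m_dx->indexDoc(), then reset m_dx and advance
	// m_diffbotObj to the next item. if indexDoc() blocks we return
	// -1; when the child doc finishes, m_masterLoop re-calls us and
	// we fall back into "jsonloop" since m_dx is already allocated.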
if ( tdbrLen > 3 ) {
// get title hashes of the json items
titleHashBuf = getDiffbotTitleHashes(&numHashes);
if (!titleHashBuf || titleHashBuf == (void *)-1){
char *xx=NULL;*xx=0;}
// make sure diffbot reply is valid for sure
if ( ! m_diffbotReplyValid ) { char *xx=NULL;*xx=0; }
// set status for this
setStatus ( "indexing diffbot json doc");
// new guy here
if ( ! m_dx ) {
try { m_dx = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
log("xmldoc: failed to alloc m_dx");
return NULL;
}
mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
// we now parse the array of products out of the
// diffbot reply. each product is an item/object.
m_diffbotObj = tdbr->getBufStart();
m_diffbotJSONCount = 0;
}
// loop back up here to process next json object from below
jsonloop:
// if m_dx has no url set, call set4 i guess
if ( ! m_dx->m_contentValid ) {
// sanity. ensure the json item we are trying to
// index has a title hash in this buf
if(m_diffbotJSONCount>=numHashes){char *xx=NULL;*xx=0;}
// get the title of the json we are indexing
int32_t jth = titleHashBuf [ m_diffbotJSONCount ];
// make the fake url for this json object for indexing
SafeBuf fakeUrl;
fakeUrl.set ( m_firstUrl.getUrl() );
			// append -diffbotxyz<titleHash> for the fake url
fakeUrl.safePrintf("-diffbotxyz%"UINT32"",
//(int32_t)m_diffbotJSONCount);
(uint32_t)jth);
if ( fakeUrl.length() > MAX_URL_LEN ) {
log("build: diffbot enhanced url too long for "
"%s",fakeUrl.getBufStart());
char *xx=NULL;*xx=0;
}
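			// e.g. (hypothetical values) a parent url of
			// http://example.com/page whose json item has a
			// title hash of 1234567890 gets the fake url
			// http://example.com/page-diffbotxyz1234567890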
m_diffbotJSONCount++;
// this can go on the stack since set4() copies it
SpiderRequest sreq;
sreq.reset();
// string ptr
char *url = fakeUrl.getBufStart();
// use this as the url
strcpy( sreq.m_url, url );
// parentdocid of 0
int32_t firstIp = hash32n ( url );
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
sreq.setKey( firstIp,0LL, false );
sreq.m_isInjecting = 1;
sreq.m_isPageInject = 1;
sreq.m_hopCount = m_hopCount;
sreq.m_hopCountValid = m_hopCountValid;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// so we can match url filters' "insitelist" directive
// in Spider.cpp::getUrlFilterNum()
sreq.m_domHash32 = m_domHash32;
sreq.m_siteHash32 = m_siteHash32;
sreq.m_hostHash32 = m_siteHash32;
// set this
if (!m_dx->set4 ( &sreq ,
NULL ,
cr->m_coll ,
NULL , // pbuf
// give it a niceness of 1, we have
// to be careful since we are a
// niceness of 0!!!!
m_niceness, // 1 ,
// inject this content
m_diffbotObj,
false, // deleteFromIndex ,
0, // forcedIp ,
CT_JSON, // contentType ,
0, // lastSpidered ,
false )) // hasMime
// g_errno should be set!
return NULL;
// we are indexing json objects, don't use all these
m_dx->m_useClusterdb = false;
m_dx->m_useSpiderdb = false;
m_dx->m_useTagdb = false;
m_dx->m_usePlacedb = false;
m_dx->m_useLinkdb = false;
m_dx->m_isChildDoc = true;
m_dx->m_parentDocPtr = this;
// we like to sort json objects using
// 'gbsortby:spiderdate' query to get the most
// recent json objects, so this must be valid
if ( m_spideredTimeValid ) {
m_dx->m_spideredTimeValid = true;
m_dx->m_spideredTime = m_spideredTime;
}
m_dx->m_isDiffbotJSONObject = true;
}
// when the indexdoc completes, or if it blocks, call us!
// we should just pass through here
//xd->setCallback ( this , getMetaListWrapper );
m_dx->setCallback ( m_masterState , m_masterLoop );
///////////////
// . inject the content of the json using this fake url
// . return -1 if this blocks
// . if m_dx got its msg4 reply it ends up here, in which
// case do NOT re-call indexDoc() so check for
// m_listAdded.
///////////////
if ( ! m_dx->m_listAdded && ! m_dx->indexDoc ( ) )
return (char *)-1;
// critical error on our part trying to index it?
// does not include timeouts or 404s, etc. mostly just
// OOM errors.
if ( g_errno ) return NULL;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
		// count this json object as added
cr->m_localCrawlInfo.m_objectsAdded++;
cr->m_globalCrawlInfo.m_objectsAdded++;
cr->m_needsSave = true;
// we successfully index the json object, skip to next one
m_diffbotObj += gbstrlen(m_diffbotObj) + 1;
// but gotta set this crap back
log(LOG_INFO,"diffbot: resetting %s",m_dx->m_firstUrl.m_url);
// clear for next guy if there is one. clears
// m_dx->m_contentValid so the set4() can be called again above
m_dx->reset();
		// have we reached the end of the buffer of json objects? if not, do the next one
if ( m_diffbotObj < tdbr->getBuf() ) goto jsonloop;
}
/////
//
// END the diffbot json object index hack
//
/////
//
// CAUTION
//
// CAUTION
//
// We should never "block" after this point, lest the hashtables
// we create get messed up.
//
//
//
// START HASHING
//
//
// store what we hash into this table
if ( (m_pbuf || m_storeTermListInfo) && ! m_wts ) {
// init it. the value is a TermInfo class. allowDups=true!
m_wtsTable.set (12,sizeof(TermDebugInfo),
0,NULL,0,true,m_niceness,
"wts-tab");
// point to it, make it active
m_wts = &m_wtsTable;
}
// how much to alloc? compute an upper bound
int32_t need = 0;
// should we index this doc?
bool index1 = true;
setStatus ( "hashing posdb and datedb terms" );
// . hash our documents terms into "tt1"
// . hash the old document's terms into "tt2"
	// . by old, we mean the older versioned doc of this url spidered before
HashTableX tt1;
HashTableX tt2;
// how many words we got?
int32_t nw = m_words.getNumWords();
// . prepare it, 5000 initial terms
	// . make it nw*4 + 5000 to avoid having to re-alloc the table!!!
// . i guess we can have link and neighborhood text too! we don't
// count it here though... but add 5k for it...
int32_t need4 = nw * 4 + 5000;
if ( nd && index1 && m_usePosdb ) {
if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness,
"posdb-indx"))
return NULL;
int32_t did = tt1.m_numSlots;
//bool index2 = true;
// . hash the document terms into "tt1"
// . this is a biggie!!!
// . only hash ourselves if m_indexCode is false
// . m_indexCode is non-zero if we should delete the doc from
// index
// . i think this only adds to posdb
//log("xmldoc: CALLING HASHALL");
// shit, this blocks which is bad!!!
char *nod = hashAll ( &tt1 ) ;
// you can't block here because if we are re-called we lose tt1
if ( nod == (char *)-1 ) { char *xx=NULL;*xx=0; }
// error?
if ( ! nod ) return NULL;
int32_t done = tt1.m_numSlots;
if ( done != did )
log("xmldoc: reallocated big table! bad. old=%"INT32" "
"new=%"INT32" nw=%"INT32"",did,done,nw);
}
// if indexing the spider reply as well under a different docid
// there is no reason we can't toss it into our meta list here
if ( spiderStatusDocMetaList )
need += spiderStatusDocMetaList->length();
// now we use revdb
// before hashing the old doc into it
//if ( od && index2 ) {
// // if this hash table init fails, return NULL
// if (!tt2.set(12,4,5000,NULL,0,false,m_niceness)) return NULL;
// char *rod = od->hash ( &tt2 ) ;
// if ( ! rod || rod == (char *)-1 ) return rod;
//}
// space for indexdb AND DATEDB! +2 for rdbids
int32_t needIndexdb = 0;
needIndexdb +=tt1.m_numSlotsUsed*(sizeof(key144_t)+2+sizeof(key128_t));
//needIndexdb+=tt2.m_numSlotsUsed * (sizeof(key_t)+2+sizeof(key128_t));
need += needIndexdb;
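	// note: "needIndexdb" budgets sizeof(key144_t)+2 for each posdb
	// key plus rdbId AND sizeof(key128_t) for a datedb key per slot,
	// even though we may no longer add datedb keys, so this is an
	// upper bound rather than an exact size.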
// sanity check
//if ( ! od && m_skipIndexing && needIndexdb ) { char *xx=NULL;*xx=0; }
// . sanity check - must have one or the other!
// . well, not in the case of EDOCNOTNEW or EDOCNOTOLD, in which
// case we just remove ourselves from spiderdb, and in the case
// of EDOCNOTOLD, from tfndb as well
//if ( ! od && ! nd ) { char *xx=NULL;*xx=0; }
// what pub dates do the old and new doc have? -1 means none.
int32_t date1 = -1; if ( nd ) date1 = nd->m_pubDate;
//int32_t date2 = -1; if ( od ) date2 = od->m_pubDate;
// now we also add the title rec. true = ownsCbuf? ret NULL on error
// with g_errno set.
//if ( nd && ! nd->compress( true , m_niceness ) ) return NULL;
/*
now we have the bit in the posdb key, so this should not be needed...
	use Posdb::isShardedByTermId() to see if it is such a special case key
like Hostdb::getShardNum() now does...
setStatus ( "hashing nosplit keys" );
// hash no split terms into ns1 and ns2
HashTableX ns1;
// prepare it, 500 initial terms
if ( ! ns1.set ( 18 , 4 , 500,NULL,0,false,m_niceness,"nosplt-indx" ))
return NULL;
// . hash for no splits
// . like above, but these are "no split" termids
if ( nd && m_usePosdb && ! hashNoSplit ( &ns1 ) ) return NULL;
//if(index2 && od && ! od->hashNoSplit ( &ns2 ) ) return NULL;
// needs for hashing no split terms
int32_t needNoSplit1 = 0;
// add em up. +1 for rdbId. add to both indexdb AND datedb i guess...
needNoSplit1 += ns1.m_numSlotsUsed * (18+1); // +16+1);
//needNoSplit += ns2.m_numSlotsUsed * (12+1+16+1);
// add it in
need += needNoSplit1;
// sanity check
//if ( ! od && m_skipIndexing && needNoSplit ) { char *xx=NULL;*xx=0; }
*/
setStatus ( "hashing sectiondb keys" );
// add in special sections keys. "ns" = "new sections", etc.
// add in the special nosplit datedb terms from the Sections class
// these hash into the term table so we can do incremental updating
HashTableX st1; // <key128_t,char> dt1;
//HashTableX st2; // <key128_t,char> dt2;
// set key/data size
int32_t svs = sizeof(SectionVote);
st1.set(sizeof(key128_t),svs,0,NULL,0,false,m_niceness,"sectdb-indx");
// tell hashtable to use the sectionhash for determining the slot,
// not the lower 4 bytes because that is the docid which is the
// same for every key
st1.m_maskKeyOffset = 6;
//st2.set(sizeof(key128_t),svs,0,NULL,0,false,m_niceness);
// do not bother if deleting
if ( m_indexCode ) nsvt = NULL;
// . now we hash the root just to get some section votes i guess
//if ( nts && ! *isr ) nsvt = NULL;
	// if the old voting table has more than 100,000 votes forget it!!! do
// not bloat sectiondb that big...
if ( osvt && osvt->m_totalSiteVoters >= MAX_SITE_VOTERS ) nsvt = NULL;
// hash terms into a table that uses full datedb keys
if ( nsvt && ! nsvt->hash (m_docId,&st1,*sh64,m_niceness)) return NULL;
// needs for hashing no split terms
int32_t needSectiondb = 0;
// add em up. plus one for rdbId
needSectiondb += st1.m_numSlotsUsed * (16+svs+1);
//needSectiondb += st2.m_numSlotsUsed * (16+svs+1);
// add it in
need += needSectiondb;
// Sections::respiderLineWaiters() adds one docid-based spider rec
// for every url waiting in line. Sections::m_numLineWaiters. assume
// 64 bytes per line waiter spider rec i guess
//int32_t needLineWaiters = 0;
// +1 for rdbId
//if ( ns ) needLineWaiters = ns->m_numLineWaiters * 64;
// forgot to add this?
//need += needLineWaiters;
// . for adding Sections.cpp keys
// . Sections::hash() does not bother with invalid sections
// . waitInLine might be true in Sections::hash() too, so always add 12
//if ( ns ) need += (ns->m_numSections - ns->m_numInvalids)*12 + 12;
//if ( os ) need += (os->m_numSections - os->m_numInvalids)*12 + 12;
// for adding Addresses::m_keys[] (Addresses::hash())
//if ( na ) need += (na->m_numKeys * 16);
//if ( oa ) need += (oa->m_numKeys * 16);
// don't forget Dates!
//if ( ndp ) need += ndp->m_numPubDates * sizeof(key_t);
//if ( odp ) need += odp->m_numPubDates * sizeof(key_t);
// clusterdb keys. plus one for rdbId
int32_t needClusterdb = 0;
//if ( nd && ! nd->m_skipIndexing ) needClusterdb += 13;
//if ( od && ! od->m_skipIndexing ) needClusterdb += 13;
if ( nd ) needClusterdb += 13;
//if ( od ) needClusterdb += 13;
need += needClusterdb;
// . LINKDB
// . linkdb records. assume one per outlink
// . we may index 2 16-byte keys for each outlink
Links *nl2 = NULL;
//if ( spideringLinks ) nl2 = &m_links;
// if injecting, spideringLinks is false, but then we don't
// add the links to linkdb, which causes the qainlinks() test to fail
nl2 = &m_links;
// do not bother if deleting. but we do add simplified redirects
// to spiderdb as SpiderRequests now.
int32_t code = m_indexCode;
if ( code == EDOCSIMPLIFIEDREDIR ) code = 0;
if ( code == EDOCNONCANONICAL ) code = 0;
if ( code ) nl2 = NULL;
//Links *ol = NULL; if ( od ) ol = od->getLinks();
// . set key/data size
// . use a 16 byte key, not the usual 12
// . use 0 for the data, since these are pure keys, which have no
// scores to accumulate
HashTableX kt1;
//HashTableX kt2;
int32_t nis = 0;
if ( nl2 && m_useLinkdb ) nis = nl2->getNumLinks() * 4;
// pre-grow table based on # outlinks
kt1.set ( sizeof(key224_t),0,nis,NULL,0,false,m_niceness,"link-indx" );
// use magic to make fast
kt1.m_useKeyMagic = true;
// linkdb keys will have the same lower 4 bytes, so make hashing fast.
// they are 28 byte keys. bytes 20-23 are the hash of the linkEE
// so that will be the most random.
kt1.m_maskKeyOffset = 20;
// faster
//kt2.set ( sizeof(key128_t) , 0,0,NULL,0,false,m_niceness );
// do not add these
//bool add1 = true;
// do not add negative key if no old doc
//if ( ! od ) add2 = false;
// . we already have a Links::hash into the Termtable for links: terms,
// but this will have to be for adding to Linkdb. basically take a
// lot of it from Linkdb::fillLinkdbList()
// . these return false with g_errno set on error
if ( m_useLinkdb && nl2 && ! hashLinksForLinkdb(&kt1) ) return NULL;
//if ( add2 && ol && ! !od->m_skipIndexing &&
// ol->hash(&kt2,od,m_niceness) )
// return NULL;
// add up what we need. +1 for rdbId
int32_t needLinkdb = 0;
needLinkdb += kt1.m_numSlotsUsed * (sizeof(key224_t)+1);
//needLinkdb += kt2.m_numSlotsUsed * (sizeof(key128_t)+1);
need += needLinkdb;
// sanity check
//if ( ! od && m_skipIndexing && needLinkdb ) { char *xx=NULL;*xx=0; }
// PLACEDB
HashTableX pt1;
//HashTableX pt2;
// . set key/data size
// . limit every address to 512 bytes
pt1.set(sizeof(key128_t),512,0,NULL,0,false,m_niceness,"placedb-indx");
//pt2.set(sizeof(key128_t),512,0,NULL,0,false,m_niceness);
//
// if this is true, then we just store the placedb recs
// directly into the title rec. That way we do not have
// to store the content of the web page, and we save space.
//
// otherwise, we have to parse out the sections and it is much slower
//else if (oa && !oa->hashForPlacedb(m_docId,*sh32,*od->getIp(),&pt2) )
// return NULL;
// hash terms into a table that uses full datedb keys
if ( na && !na->hashForPlacedb(m_docId,*sh32,*nd->getIp(),&pt1))
return NULL;
setStatus("hashing place info");
int32_t needPlacedb = 0;
// . +1 for rdbId
// . up to 512 bytes per address
needPlacedb += pt1.m_numSlotsUsed * (sizeof(key128_t)+1+512);
//needPlacedb += pt2.m_numSlotsUsed * (sizeof(key128_t)+1+512);
need += needPlacedb;
// sanity check -- coring here because we respider the page and
// the address is gone so it tries to delete it!
//if ( ! od && m_skipIndexing && needPlacedb ) { char *xx=NULL;*xx=0; }
// we add a negative key to doledb usually (include datasize now)
int32_t needDoledb = sizeof(key_t) + 1 ; // + 4;
if ( forDelete ) needDoledb = 0;
need += needDoledb;
// for adding the SpiderReply to spiderdb (+1 for rdbId)
int32_t needSpiderdb1 = sizeof(SpiderReply) + 1;
if ( forDelete ) needSpiderdb1 = 0;
need += needSpiderdb1;
// if injecting we add a spiderrequest to be able to update it
// but don't do this if it is pagereindex. why is pagereindex
// setting the injecting flag anyway?
int32_t needSpiderdb3 = 0;
if ( m_sreqValid &&
m_sreq.m_isInjecting &&
m_sreq.m_fakeFirstIp &&
! m_sreq.m_forceDelete &&
// do not rebuild spiderdb if only rebuilding posdb
// this is explicitly for injecting so we need to add
// the spider request to spiderdb...
//m_useSpiderdb &&
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
! m_isDiffbotJSONObject ) {
needSpiderdb3 = m_sreq.getRecSize() + 1;
// NO! because when injecting a warc and the subdocs
// it contains, gb then tries to spider all of them !!! sux...
needSpiderdb3 = 0;
}
// or if we are rebuilding spiderdb
else if (m_useSecondaryRdbs && !m_isDiffbotJSONObject && m_useSpiderdb)
needSpiderdb3 = sizeof(SpiderRequest) + m_firstUrl.m_ulen+1;
need += needSpiderdb3;
//int32_t needSpiderdb3 = 0;
//if ( m_sreqValid ) needSpiderdb3 = m_sreq.getRecSize() + 1;
//need += needSpiderdb3;
// . for adding our outlinks to spiderdb
// . see SpiderRequest::getRecSize() for description
// . SpiderRequest::getNeededSize() will include the null terminator
int32_t hsize = SpiderRequest::getNeededSize ( 0 );
int32_t needSpiderdb2 = hsize * m_links.getNumLinks();
// and the url buffer of outlinks. includes \0 terminators i think
needSpiderdb2 += m_links.getLinkBufLen();
	// don't need this if doing consistency check
if ( m_doingConsistencyCheck ) needSpiderdb2 = 0;
// nor for generating the delete meta list for incremental indexing
if ( forDelete ) needSpiderdb2 = 0;
// accumulate it
need += needSpiderdb2;
// the new tags for tagdb
int32_t needTagdb = 0;
if ( ntb ) needTagdb = ntb->length() ;
// add 1 byte for up to 128 rdbids
//needTagdb += needTagdb/sizeof(Tag) + 1;
// add that in
need += needTagdb;
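	// unlike "needx" in the unchanged/error branch above, "need" is
	// only an upper bound for the full meta list. each section below
	// does its own (m_p - saved > needX) sanity check after appending
	// its records.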
// . add in title rec size
// . should be valid because we called getTitleRecBuf() above
// . this should include the key
// . add in possible negative key for deleting old title rec
//int32_t needTitledb = sizeof(key96_t);
// +1 for rdbId
//if ( nd && m_useTitledb ) needTitledb = m_titleRecSize + 1;
//need += needTitledb;
//
// . CHECKSUM PARSING CONSISTENCY TEST
//
// . set m_metaListChecksum member (will be stored in titleRec header)
// . gotta set m_metaListCheckSum8 before making titleRec below
// . also, if set from titleRec, verify metalist is the same!
//
if ( ! m_computedMetaListCheckSum ) {
// do not call twice!
m_computedMetaListCheckSum = true;
// all keys in tt1, ns1, kt1 and pt1
int32_t ck32 = 0;
ck32 ^= tt1.getKeyChecksum32();
// show tt1
//
// UNCOMMENT this to debug parsing inconsistencies!!!
//
// SafeBuf sb;
// tt1.print(&sb);
// if(sb.getBufStart()) fprintf(stderr,"%s", sb.getBufStart());
//ck32 ^= ns1.getKeyChecksum32();
//ck32 ^= kt1.getKeyChecksum32();
//ck32 ^= pt1.getKeyChecksum32();
// set this before calling getTitleRecBuf() below
uint8_t currentMetaListCheckSum8 = (uint8_t)ck32;
// see if matches what was in old titlerec
if ( m_metaListCheckSum8Valid &&
// if we were set from a titleRec, see if we got
// a different hash of terms to index this time around...
m_setFromTitleRec &&
// fix for import log spam
! m_isImporting &&
m_version >= 120 &&
m_metaListCheckSum8 != currentMetaListCheckSum8 ) {
log("xmldoc: checksum parsing inconsistency for %s "
"(old)%i != %i(new). Uncomment tt1.print() "
"above to debug.",
m_firstUrl.getUrl(),
(int)m_metaListCheckSum8,
(int)currentMetaListCheckSum8);
// if doing qa test drop core
CollectionRec *cr = getCollRec();
if ( cr && strcmp(cr->m_coll,"qatest123") == 0 ) {
log("xmldoc: sleep 1000");
sleep(1000);
exit(0);}//char *xx=NULL;*xx=0; }
}
// assign the new one, getTitleRecBuf() call below needs this
m_metaListCheckSum8 = currentMetaListCheckSum8;
m_metaListCheckSum8Valid = true;
}
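	// note: the 8-bit checksum stored in the titlerec header is just
	// the low byte of tt1's 32-bit key checksum. it is cheap, but it
	// is enough to flag the parsing inconsistency logged above when a
	// titlerec written by an older parser version is re-processed.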
//
// now that we've set all the ptr_* members vars, we can make
// the title rec
//
// . MAKE the title rec from scratch, that is all we need at this point
// . sets m_indexCode to EDOCNOTNEW or EDOCNOTOLD sometimes
// . if repairing and not rebuilding titledb, we do not need the
// titlerec
if ( m_useTitledb ) {
// this buf includes key/datasize/compressdata
SafeBuf *tr = getTitleRecBuf ();
// panic if this blocks! it should not at this point because
// we'd have to re-hash the crap above
if ( tr == (void *) -1 ) { char *xx=NULL;*xx=0; }
// return NULL with g_errno set on error
if ( ! tr ) return (char *)tr;
// sanity check - if the valid title rec is null,
// m_indexCode is set!
if ( tr->length()==0 && ! m_indexCode ) { char *xx=NULL;*xx=0;}
}
// . add in title rec size
// . should be valid because we called getTitleRecBuf() above
// . this should include the key
// . add in possible negative key for deleting old title rec
int32_t needTitledb = sizeof(key96_t) + 1;
// +1 for rdbId
if ( nd && m_useTitledb && ! forDelete )
needTitledb += m_titleRecBuf.length();
// set new and old keys for titledb
//key_t ok;
key_t nk;
//ok.setMin();
nk.setMin();
//if ( od ) ok = *od->getTitleRecKey();
if ( nd && m_useTitledb ) nk = *nd->getTitleRecKey();
//if ( od && m_useTitledb && ok != nk ) needTitledb += sizeof(key_t)+1;
if ( m_useTitledb ) {
// then add it in
need += needTitledb;
// the titledb unlock key for msg12 in spider.cpp
need += sizeof(key_t);
}
//
// now space for the revdb record, which is the meta list itself!
//
//need = need + 12 + 4 + need;
// . alloc mem for metalist
// . sanity
if ( m_metaListSize > 0 ) { char *xx=NULL;*xx=0; }
// make the buffer
m_metaList = (char *)mmalloc ( need , "metalist");
if ( ! m_metaList ) return NULL;
// save size for freeing later
m_metaListAllocSize = need;
// ptr and boundary
m_p = m_metaList;
m_pend = m_metaList + need;
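	// from here on each section appends records to m_metaList in the
	// same general format:
	//   [1 byte rdbId][key bytes][dataSize + data for rdbs with
	//    variable-size records like titledb/sectiondb/placedb]
	// rdbs with pure fixed-size keys (posdb, clusterdb, linkdb) just
	// store the bare key after the rdbId byte.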
//
// TITLEDB
//
setStatus ("adding titledb recs");
// checkpoint
char *saved = m_p;
// . delete old title rec key if different
// . Repair.cpp might set useTitledb to false!
//if ( od && m_useTitledb && ok != nk ) {
// // rdbId
// *m_p++ = RDB_TITLEDB;
// // key
// *(key_t *)m_p = *od->getTitleRecKey();
// // make it negative
// *m_p &= 0xfe;
// // skip over it
// m_p += sizeof(key_t);
// // then data size, 0
// //*(int32_t *)m_p = 0;
// //m_p+= 4;
//}
// . store title rec
// . Repair.cpp might set useTitledb to false!
if ( nd && m_useTitledb ) {
// rdbId
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_TITLEDB2;
else *m_p++ = RDB_TITLEDB;
// sanity
if ( ! nd->m_titleRecBufValid ) { char *xx=NULL;*xx=0; }
// key, dataSize, data is the whole rec
int32_t tsize = nd->m_titleRecBuf.length();
// if getting an "oldList" to do incremental posdb updates
// then do not include the data portion of the title rec
if ( forDelete ) tsize = sizeof(key_t);
gbmemcpy ( m_p , nd->m_titleRecBuf.getBufStart() , tsize );
// make it a negative key
//if ( forDelete ) *m_p = *m_p & 0xfe;
m_p += tsize;//nd->m_titleRecSize;
// store a zero datasize, key is still positive until the dt8
// table deletes it
//if ( forDelete ) { *(int32_t *)m_p = 0; m_p += 4; }
}
// sanity check
if ( m_p - saved > needTitledb ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD BASIC INDEXDB/DATEDB TERMS
//
setStatus ( "adding posdb and datedb terms");
// checkpoint
saved = m_p;
// store indexdb terms into m_metaList[]
if ( m_usePosdb && ! addTable144 ( &tt1 , m_docId )) return NULL;
//if(!addTable96 ( &tt2, &tt1, date2, date1, true ,false)) return NULL;
//if ( od ) tt2.clear();
// sanity check
if ( m_p - saved > needIndexdb ) { char*xx=NULL;*xx=0; }
// free all mem
tt1.reset();
//tt2.reset();
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD NOSPLIT INDEXDB/DATEDB TERMS
//
/*
we added these now in hashAll() to tt1, no longer ns1 since we
have the sharded by termid bit in the actual posdb key now so
Rebalance.cpp works
setStatus ( "adding posdb shardByTermId terms");
// checkpoint
saved = m_p;
// no longer anything special now since the
	// Posdb::isShardedByTermId() bit
// is in the key now so Rebalance.cpp can work
if ( m_usePosdb && ! addTable144 ( &ns1 )) return NULL;
//if(! addTable96 ( &ns2, &ns1, -1, -1, true ,true)) return NULL;
// sanity check
if ( m_p - saved > needNoSplit1 ) { char*xx=NULL;*xx=0; }
// free all mem
ns1.reset();
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
*/
/*
setStatus ( "adding datedb nosplit terms");
// checkpoint
saved = m_p;
// this is now for datedb
if ( m_useDatedb && ! addTableDate(&ns2,m_docId,RDB_DATEDB,true))
return NULL;
// sanity check
if ( m_p - saved > needNoSplit2 ) { char*xx=NULL;*xx=0; }
// free all mem
ns2.reset();
// sanity check
verifyMetaList( m_metaList , m_p );
*/
//
// ADD SECTIONS SPECIAL TERMS
//
setStatus ( "adding sectiondb keys");
// checkpoint
saved = m_p;
// add that table to the metalist
if ( m_useSectiondb && !addTable128(&st1,RDB_SECTIONDB,forDelete))
return NULL;
//if(! addTable128 (&st2,&st1, RDB_SECTIONDB,true ,true))return NULL;
// sanity check
if ( m_p - saved > needSectiondb ) { char *xx=NULL;*xx=0; }
// free mem
st1.reset();
//st2.reset();
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD CLUSTERDB KEYS
//
setStatus ( "adding clusterdb keys" );
// checkpoint
saved = m_p;
// . do we have adult content?
// . should already be valid!
if ( nd && ! m_isAdultValid ) { char *xx=NULL;*xx=0; }
// . get new clusterdb key
// . we use the host hash for the site hash! hey, this is only 26 bits!
key_t newk ; newk.setMin();
if ( nd )
newk = g_clusterdb.makeClusterRecKey ( *nd->getDocId() ,
*nd->getIsAdult() ,
*nd->getLangId(),
nd->getHostHash32a(),
false ); // del?
//key_t oldk; oldk.setMin();
//if ( od ) // && add2 )
// oldk = g_clusterdb.makeClusterRecKey ( *od->getDocId(),
// *od->getIsAdult() ,
// *od->getLangId() ,
// od->getHostHash32a(),
// true ); // del?
// . store old only if new tr is good and keys are different from old
// . now we store even if skipIndexing is true because i'd like to
// see how many titlerecs we have and count them towards the
// docsIndexed count...
if ( nd && m_useClusterdb ) {
// store rdbid
*m_p = RDB_CLUSTERDB;
// use secondary if we should
if ( m_useSecondaryRdbs ) *m_p = RDB2_CLUSTERDB2;
// skip
m_p++;
// and key
*(key_t *)m_p = newk;
// skip it
m_p += sizeof(key_t);
}
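	// the clusterdb key packs the docid, adult bit, language id and
	// the (26-bit) host hash so that results from the same site can
	// be clustered at query time without having to load each titlerec.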
// store new if different
//if ( od && ( ! nd || newk != oldk ) ) { // && !od->m_skipIndexing ) {
// // store rdbid
// *m_p = RDB_CLUSTERDB;
// // use secondary if we should
// if ( m_useSecondaryRdbs ) *m_p = RDB2_CLUSTERDB2;
// // skip
// m_p++;
// // turn on last bit (undo del)
// //newk.n0 |= 0x01;
// // and key
// *(key_t *)m_p = oldk;
// // skip it
// m_p += sizeof(key_t);
//}
// sanity check
if ( m_p - saved > needClusterdb ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD LINKDB KEYS
//
setStatus ( "adding linkdb keys" );
// checkpoint
saved = m_p;
// add that table to the metalist (LINKDB)
if ( m_useLinkdb && !addTable224(&kt1))
return NULL;
//if(add2&&!addTable128(&kt2,&kt1,RDB_LINKDB, false))return NULL;
// sanity check
if ( m_p - saved > needLinkdb ) { char *xx=NULL;*xx=0; }
// all done
kt1.reset();
//kt2.reset();
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// . ADD ADDRESSES TO NAMEDB/PLACEDB
// . key is basically a hash of the address (excluding place name
// and street indicators)
//
setStatus ( "adding to placedb" );
// checkpoint
saved = m_p;
// add that table to the metalist
if ( m_usePlacedb && ! addTable128 ( &pt1, RDB_PLACEDB,forDelete))
return NULL;
//if(! addTable128 ( &pt2, &pt1, RDB_PLACEDB, true , true))return NULL;
// sanity check
if ( m_p - saved > needPlacedb ) { char *xx=NULL;*xx=0; }
// free mem
pt1.reset();
//pt2.reset();
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
/*
//
// ADD REVDB RECORD
//
//
// . add the metalist to itself
// . this way, when we delete this doc from the index, we just
// lookup the original metalist in revdb, set all the
	// delbits, and re-add that. this avoids having to ensure
// parsing consistency, which is a royal pain in the ass
// . now we also update getMetaList() to check revdb to get
// the meta list if the doc is already indexed...
//
// define current meta list
char *x = m_metaList;
char *xend = m_p;
// skip adding to revdb?
if ( ! m_useRevdb ) xend = x;
int32_t *dataSizePtr;
char *savedp;
// if nothing in current list do not add revdb rec
bool hadStuff = ( x < xend);
if ( hadStuff ) {
// put in the rdbId
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_REVDB2;
else *m_p++ = RDB_REVDB;
// the key
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
*(key_t *)m_p = g_revdb.makeKey ( m_docId , false );
m_p += sizeof(key_t);
// data size
dataSizePtr = (int32_t *)m_p;
// skip for now
m_p += 4;
// save it
savedp = m_p;
}
// scan the current metalist and add keys to the revdb record
for ( ; x < xend ; ) {
// breathe
QUICKPOLL(m_niceness);
// save this
char byte = *x;
// get rdbId
char rdbId = byte & 0x7f;
//
// convert if adding to secondary rdbids!!!!!!!!
//
if ( m_useSecondaryRdbs ) {
if ( rdbId == RDB2_POSDB2 )
rdbId = RDB_POSDB;
else if ( rdbId == RDB2_DATEDB2 )
rdbId = RDB_DATEDB;
else if ( rdbId == RDB2_SECTIONDB2 )
rdbId = RDB_SECTIONDB;
else if ( rdbId == RDB2_PLACEDB2 )
rdbId = RDB_PLACEDB;
else if ( rdbId == RDB2_TITLEDB2 )
rdbId = RDB_TITLEDB;
else if ( rdbId == RDB2_LINKDB2 )
rdbId = RDB_LINKDB;
else if ( rdbId == RDB2_CLUSTERDB2 )
rdbId = RDB_CLUSTERDB;
else if ( rdbId == RDB2_SPIDERDB2 )
rdbId = RDB_SPIDERDB;
else if ( rdbId == RDB2_TAGDB2 )
rdbId = RDB_TAGDB;
// must be covered!!
else { char *xx=NULL;*xx=0; }
// rewrite byte now b/c we store it below
byte = (byte & 0x80) | rdbId;
}
// skip that
x++;
// copy that over
*m_p++ = byte;
// sanity check -- no negative keys allowed in here
if ( (x[0] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; }
// get key size
int32_t ks = getKeySizeFromRdbId(rdbId);
// copy that over
gbmemcpy ( m_p , x , ks );
// skip that
m_p += ks;
x += ks;
// datasize?
int32_t ds = getDataSizeFromRdbId(rdbId);
if ( ds == -1 ) {
ds = *(int32_t *)x;
x += 4;
}
// skip data
x += ds;
}
// record size of what we wrote
if ( hadStuff )
*dataSizePtr = ( m_p - savedp );
// sanity check
if ( m_p > m_pend || m_p < m_metaList ) { char *xx=NULL;*xx=0;}
// sanity check
verifyMetaList( m_metaList , m_p );
*/
//////
//
	// add SPIDERREPLY BEFORE SPIDERREQUEST!!!
//
// add spider reply first so we do not immediately respider
// this same url if we were injecting it because no SpiderRequest
// may have existed, and SpiderColl::addSpiderRequest() will
// spawn a spider of this url again unless there is already a REPLY
// in spiderdb!!! crazy...
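	// note on layout: each spiderdb entry added below is the 1-byte
	// rdbId followed by the full serialized SpiderReply/SpiderRequest
	// record (key included); its length comes from getRecSize()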
bool addReply = true;
// Scraper.cpp uses this
if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false;
// save it
saved = m_p;
// now add the new rescheduled time
if ( addReply && m_useSpiderdb && ! forDelete ) {
// note it
setStatus ( "adding SpiderReply to spiderdb" );
// rdbid first
*m_p = RDB_SPIDERDB;
// use secondary?
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
m_p++;
// get this
if ( ! m_srepValid ) { char *xx=NULL;*xx=0; }
// store the spider rec
int32_t newsrSize = newsr->getRecSize();
gbmemcpy ( m_p , newsr , newsrSize );
m_p += newsrSize;
m_addedSpiderReplySize = newsrSize;
m_addedSpiderReplySizeValid = true;
// sanity check - must not be a request, this is a reply
if ( g_spiderdb.isSpiderRequest( &newsr->m_key ) ) {
char *xx=NULL;*xx=0; }
// sanity check
if ( m_p - saved != needSpiderdb1 ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
}
// if we are injecting we must add the spider request
// we are injecting from so the url can be scheduled to be
// spidered again.
// NO! because when injecting a warc and the subdocs
// it contains, gb then tries to spider all of them !!! sux...
if ( needSpiderdb3 ) {
// note it
setStatus("adding spider request");
// checkpoint
saved = m_p;
// store it here
SpiderRequest revisedReq;
// if doing a repair/rebuild of spiderdb...
if ( m_useSecondaryRdbs )
getRebuiltSpiderRequest ( &revisedReq );
// this fills it in for doing injections
if ( ! m_useSecondaryRdbs ) {
getRevisedSpiderRequest ( &revisedReq );
// sanity log
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// sanity log
if ( m_firstIp == 0 || m_firstIp == -1 ) {
char *url = "unknown";
if ( m_sreqValid ) url = m_sreq.m_url;
log("build: error3 getting real firstip of "
"%"INT32" for %s. not adding new request.",
(int32_t)m_firstIp,url);
goto skipNewAdd2;
}
}
// copy it
if ( m_useSecondaryRdbs ) *m_p++ = RDB2_SPIDERDB2;
else *m_p++ = RDB_SPIDERDB;
// store it back
gbmemcpy ( m_p , &revisedReq , revisedReq.getRecSize() );
// skip over it
m_p += revisedReq.getRecSize();
// sanity check
if ( m_p - saved > needSpiderdb3 ) { char *xx=NULL;*xx=0; }
m_addedSpiderRequestSize = revisedReq.getRecSize();
m_addedSpiderRequestSizeValid = true;
}
skipNewAdd2:
//
// ADD SPIDERDB RECORDS of outlinks
//
// - do this AFTER computing revdb since we do not want spiderdb recs
// to be in revdb.
//
setStatus ( "adding spiderdb keys" );
// sanity check. cannot spider until in sync
if ( ! isClockInSync() ) { char *xx=NULL;*xx=0; }
// checkpoint
saved = m_p;
// . should be fixed from Links::setRdbList
// . we should contain the msge that msg16 uses!
// . we were checking m_msg16.m_recycleContent, but i have not done
// that in years!!! MDW
// . we were also checking if the # of banned outlinks >= 2, then
// we would not do this...
	// . should also add with a time of now plus 5 seconds so that if
	//   we spider an outlink, linkdb will already be updated with this
	//   doc pointing to it and the outlink can get its link text then!!
if ( spideringLinks && nl2 && ! m_doingConsistencyCheck &&
m_useSpiderdb && ! forDelete ){
// returns NULL and sets g_errno on error
char *ret = addOutlinkSpiderRecsToMetaList ();
// sanity check
if ( ! ret && ! g_errno ) { char *xx=NULL;*xx=0; }
// return NULL on error
if ( ! ret ) return NULL;
// this MUST not block down here, to avoid re-hashing above
if ( ret == (void *)-1 ) { char *xx=NULL;*xx=0; }
}
// sanity check
if ( m_p - saved > needSpiderdb2 ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD TAG RECORDS TO TAGDB
//
// checkpoint
saved = m_p;
// . only do this if NOT setting from a title rec
// . it might add a bunch of forced spider recs to spiderdb
// . store into tagdb even if indexCode is set!
if ( ntb && m_useTagdb && ! forDelete ) {
// ntb is a safebuf of Tags, which are already Rdb records
// so just gbmemcpy them directly over
char *src = ntb->getBufStart();
int32_t srcSize = ntb->length();
gbmemcpy ( m_p , src , srcSize );
m_p += srcSize;
}
// sanity check
if ( m_p - saved > needTagdb ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
//
// ADD INDEXED SPIDER REPLY with different docid so we can
// search index of spider replies! (NEW!)
//
// . index spider reply with separate docid so they are all searchable.
// . see getSpiderStatusDocMetaList() function to see what we index
// and the titlerec we create for it
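	// . that buffer is already in metalist format (rdbId bytes, keys
	//   and data), so it gets appended verbatim below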
if ( spiderStatusDocMetaList ) {
gbmemcpy ( m_p ,
spiderStatusDocMetaList->getBufStart() ,
spiderStatusDocMetaList->length() );
m_p += spiderStatusDocMetaList->length();
m_addedStatusDocSize = spiderStatusDocMetaList->length();
m_addedStatusDocSizeValid = true;
}
/*
//
// ADD FORCED RESPIDER DOCID-BASED SPIDER RECS for Sections
//
// used by Sections.cpp to respider docs because we just identified an
// article section and they need to be re-indexed to take advantage
// of that
//
// checkpoint
saved = m_p;
// . only do this if NOT setting from a title rec
// . it might add a bunch of forced spider recs to spiderdb
if ( ! m_setFromTitleRec && nd ) { // && ! m_isInjecting ) {
Sections *ss = &m_sections;
m_p = ss->respiderLineWaiters ( m_p , m_pend );
if ( ! m_p ) return NULL;
}
// sanity check
if ( m_p - saved > needLineWaiters ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p );
*/
//
// NOW UPDATE OURSELVES (OUR URL) IN SPIDERDB
//
// but not if injecting!
//if ( ! m_sreqValid ) {
// // set the list size, different from the alloc size
// m_metaListSize = m_p - m_metaList;
// // all done
// return m_metaList;
//}
// note it
//setStatus ( "deleting old spider rec key" );
// rdbid first
// *p = RDB_SPIDERDB;
// use secondary?
//if ( m_useSecondaryRdbs ) *p = RDB2_SPIDERDB2;
//p++;
// must be legit
//if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
// then the key
// *(key_t *)p = m_sreq.m_key;
// nukey, clear del bit to delete it
// *p &= 0xfe;
// skip key
//p += sizeof(key_t);
	// shortcut
saved = m_p;
/*
See comment under DOLEDB above! this approach is no longer used.
// . remove from doledb if we had a valid key
// . DO THIS BEFORE adding the SpiderReply since
// Spider.cpp::addSpiderReply() will
// decrement the count for firstIp in m_doleIpTable
if ( (m_doledbKey.n0 || m_doledbKey.n1) &&
! m_useSecondaryRdbs &&
// do not add if we are generating the meta list for incremental
// indexing purposes from an old doc
! forDelete ) {
// note it
setStatus ( "removing key from doledb" );
// . now remove the original spider rec from "doledb"
// . rdbid first
*m_p = RDB_DOLEDB;
m_p++;
// then the key
*(key_t *)m_p = m_doledbKey;
// nukey, clear del bit to delete it
*m_p = *m_p & 0xfe;
// skip key
m_p += sizeof(key_t);
// datasize is 0
// *(int32_t *)m_p = 0;
//m_p += 4;
// sanity check
if ( m_p - saved != needDoledb ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p , forDelete );
}
*/
// note it
//setStatus ( "removing spider lock");
// . make a fake titledb key
// . remove the spider lock (Msg12 in Spider.cpp)
// . no need to do this if called from Repair.cpp
// . the uh48 is zero, that means fake!
// . i added "&& m_useSpiderdb" here because it was messing up
// the cacheTermLists() function which ONLY wants posdb keys and
// any other keys in the metalist messes it up. MDW 1/26/13
	// . now Spider.cpp uses SpiderReply reception to remove lock
// - mdw 9/28/13
//if ( ! m_useSecondaryRdbs && ! forDelete && m_useSpiderdb ) {
// *m_p++ = RDB_FAKEDB;
// ((key_t *)m_p)->n1 = 0;
// ((key_t *)m_p)->n0 = m_docId;
// //= g_titledb.makeKey ( m_docId , 0LL , true );
// m_p += sizeof(key_t);
//}
// MDW: new spider algo does not need this
/*
// save it
saved = m_p;
// re-add the same request since it was removed from Spider.cpp's
// m_urlBuf and the associated orderTree,ipTree, etc. and now
// since we are un-doling (undoling) it we need to re-add and this
// is the easiest way. it really was never removed from spiderdb
// but it will no longer be in the spider's cache since we delete
// it from there when we add it to doledb. so this is just a quick
// way of getting it back into the cache.
	// now, we add this first since now Rdb.cpp calls evaluateAllRequests()
// AFTER the REPLY now
if ( m_sreqValid &&
// page parser has an invalid firstIp which causes printMetaList()
// to core when trying to print this out, so don't add it when
// doing page parser
! m_sreq.m_isPageParser ) {
// note it
setStatus ( "adding SpiderRequest back to spiderdb" );
// rdbid first
*m_p = RDB_SPIDERDB;
// use secondary?
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2;
m_p++;
// store the spider rec
int32_t size = m_sreq.getRecSize();
gbmemcpy ( m_p , &m_sreq , size );
// set this one bit
SpiderRequest *rr = (SpiderRequest *)m_p;
rr->m_readd = 1;
// and hafta reset this junk otherwise it cores
// (see Spider.h::SpiderRequest::reset())
rr->m_ufn = -1;
rr->m_priority = -1;
rr->m_doled = 0;
// skip over the whole rec
m_p += size;
// sanity check - must not be a request, this is a reply
if ( ! g_spiderdb.isSpiderRequest( &m_sreq.m_key ) ) {
char *xx=NULL;*xx=0; }
// sanity check
if ( m_p - saved != needSpiderdb3 ) { char *xx=NULL;*xx=0; }
// sanity check
verifyMetaList( m_metaList , m_p );
}
*/
// sanity check
if ( m_p > m_pend || m_p < m_metaList ) { char *xx=NULL;*xx=0;}
int32_t now = getTimeGlobal();
/////////////////
//
// INCREMENTAL INDEXING / INCREMENTAL UPDATING
//
// now prune/manicure the metalist to remove records that
// were already added, and insert deletes for records that
// changed since the last time. this is how we do deletes
// now that we have revdb. this allows us to avoid
// parsing inconsistency errors.
//
/////////////////
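	//
	// note on metalist layout: each record is
	//   [1-byte rdbId (high bit used as a flag)][key]
	//   [4-byte dataSize, only for variable-size rdbs and positive keys]
	//   [data]
	// the first loop below walks the old list, which is stripped down to
	// rdbIds and keys only; the second loop walks the full current
	// metalist. a rough sketch of that generic walk ("list"/"listEnd"
	// are just placeholder names here):
	//
	/*
	for ( char *p = list ; p < listEnd ; ) {
		char rdbId = *p++ & 0x7f;
		char *key  = p;
		int32_t ks = getKeySizeFromRdbId ( rdbId );
		p += ks;
		int32_t ds = getDataSizeFromRdbId ( rdbId );
		// negative keys (low bit clear) never carry data
		if ( (key[0] & 0x01) == 0x00 ) ds = 0;
		// variable-size data is preceded by its 4-byte length
		if ( ds == -1 ) { ds = *(int32_t *)p; p += 4; }
		p += ds;
	}
	*/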
// disable for parsing consistency testing of already indexed docs
//oldList = NULL;
if ( oldList ) { // && oldList->m_listSize > 16 ) {
// point to start of the old meta list, the first and only
// record in the oldList
char *om = oldList;// + 12 + 4;
// the size
int32_t osize = oldListSize;//*(int32_t *)(oldList + 12);
// the end
char *omend = om + osize;
int32_t needx = 0;
// init these. data is just the rdbid, a single byte.
//HashTableX dt12;
//HashTableX dt16;
//char dbuf12[30000];
//char dbuf16[40000];
//dt12.set ( 12,1,2048,dbuf12,30000,false,m_niceness);
//dt16.set ( 16,1,2048,dbuf16,40000,false,m_niceness);
HashTableX dt8;
char dbuf8[34900];
// value is the ptr to the rdbId/key in the oldList
dt8.set ( 8,sizeof(char *),2048,dbuf8,34900,
false,m_niceness,"dt8-tab");
// just for linkdb:
//HashTableX dt9;
//char dbuf9[30000];
//dt9.set ( 8,4,2048,dbuf9,30000,false,m_niceness,"dt9-tab");
// scan recs in that and hash them
for ( char *p = om ; p < omend ; ) {
// breathe
QUICKPOLL(m_niceness);
// save this
char byte = *p;
// save this
char *rec = p;
// get the rdbid for this rec
char rdbId = byte & 0x7f;
// skip that
p++;
// get the key size
int32_t ks = getKeySizeFromRdbId ( rdbId );
// get that
char *k = p;
// unlike a real meta list, this meta list has
// no data field, just rdbIds and keys only! because
// we only use it for deleting, which only requires
// a key and not the data
p += ks;
// tally this up in case we have to add the delete
// version of this key back (add 1 for rdbId)
needx += ks + 1;
// always re-add titledb record!
// if our current/new list is basically empty
// except for a SpiderReply because it got deleted
// from the index, we need to store the titledb key
// in dt8 so we can add it as a negative! so i
// don't really know what this was trying to fix
// because it broke that!
//if ( rdbId == RDB_TITLEDB ) continue;
// for linkdb, sometimes we also add a "lost" link
// key in addition to deleting the old key! see below
if ( rdbId == RDB_LINKDB ) needx += ks + 1;
// do not add it if datasize > 0
uint64_t hk;
// do not include discovery or lost dates in the
// linkdb key...
if ( rdbId == RDB_LINKDB )
hk = hash64 (k+12,ks-12);
else
hk = hash64 (k,ks);
// sanity check
if ( rdbId == RDB_LINKDB &&
g_linkdb.getLinkerDocId_uk((key224_t *)k)!=
m_docId ) {
char *xx=NULL;*xx=0; }
//if ( getDataSize(rdbId) != 0 ) continue;
// hash this key
//bool status;
// sectiondb keys all have the same last few bits...
// so this clogs up the hash table.
// so mix up the key bits for hashing
//uint64_t hk = hash64 ( k,ks);
//if (ks == 12 ) status = dt12.addKey ( k, &byte);
//else if (ks == 16 ) status = dt16.addKey ( k, &byte);
//else { char *xx=NULL; *xx=0; }
if ( ! dt8.addKey(&hk,&rec) ) return NULL;
// return NULL with g_errno set on error
//if ( ! status ) return NULL;
}
// also need all the new keys just to be sure, in case none
// are already in the rdbs
needx += (m_p - m_metaList);
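		// at this point dt8 maps hash64(old key) -> pointer to that
		// key's rdbId byte in the old list, and needx is a worst
		// case: every current record kept plus a delete key for
		// every old record (and an extra lost-link key per old
		// linkdb key)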
// now alloc for our new manicured metalist
char *nm = (char *)mmalloc( needx, "newmeta" );
if ( ! nm ) return NULL;
char *nptr = nm;
char *nmax = nm + needx;
		// scan each rec in the current meta list and see if it's in
		// the dt8 hash table of old recs; if it already is, then
// do NOT add it to the new metalist, nm, because there is
// no need to.
char *p = m_metaList;
char *pend = p + (m_p - m_metaList);
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// save it with the flag
char byte = *p;
// get rdbId
char rdbId = byte & 0x7f;
// skip that
p++;
// key size
int32_t ks = getKeySizeFromRdbId(rdbId);
// get key
char *key = p;
// skip that
p += ks;
// get data size
int32_t ds = getDataSizeFromRdbId(rdbId);
// assume we do not store the datasize
bool neg = false;
// . if key is negative, no data is present
// . the doledb key is negative for us here
if ( (key[0] & 0x01) == 0x00 ) { neg = true; ds = 0; }
// if datasize variable, read it in
if ( ds == -1 ) {
// get data size
ds = *(int32_t *)p;
				// skip the 4-byte data size
p += 4;
}
// point to data
char *data = p;
// skip data if not zero
p += ds;
// mix it up for hashtable speed
uint64_t hk ;//= hash64 ( key,ks);
			// for linkdb, exclude the discovery/lost dates
			// from the hash (handled specially below)
if ( rdbId == RDB_LINKDB )
hk = hash64(key+12,ks-12);
else
hk = hash64(key,ks);
// was this key already in the "old" list?
int32_t slot = dt8.getSlot(&hk);
			// do we have a linkdb key that existed last time
// we indexed this doc? if so, inherit its discovery
// date.
if ( slot >= 0 && rdbId == RDB_LINKDB ) {
/*
// get old key from last time
char *oldk=*(char**)dt8.getValueFromSlot(slot);
// skip rdbid
oldk++;
// sanity
if(g_linkdb.getLinkerDocId_uk((key224_t *)oldk)
!=m_docId){
char *xx=NULL;*xx=0; }
// copy rdbid into new meta list
*nptr++ = byte;
// point to where key will be stored in new lst
char *nk = nptr;
// store the new key in the new meta list
gbmemcpy ( nptr , key , ks );
// advance ptr
nptr += ks;
			// get discovery time of old key from last time
int32_t dd = g_linkdb.getDiscoveryDate_uk(oldk);
// sanity
if ( dd < 0 ) { char *xx=NULL;*xx=0; }
// but mod the new key's discovery time
g_linkdb.setDiscoveryDate_uk ( nk, dd );
*/
// . no need to deal with this any further
// . yeah, because there could be dups!
// so don't delete it just yet
// . but make the data ptr NULL so we
// know to disregard it below...???
dt8.removeSlot(slot);
// all done for this key
continue;
}
// see if already in an rdb, IFF dataless, otherwise
// the keys might be the same but with different data!
if ( slot >= 0 ) { // dt8.isInTable(&hk) ) {
// remove from hashtable so we do not add it
// as a delete key below
// dt8.removeKey(&hk);
dt8.removeSlot(slot);
// but do add like a titledb rec that has the
// same key, because its data is probably
// different...
// HACK: enable for now since we lost
// the url:www.geico.com term somehow!!!
// geico got deleted but not the title rec!!
// MAKE SURE TITLEREC gets deleted then!!!
if ( ds==0 && g_conf.m_doIncrementalUpdating )
continue;
}
// ok, it is not already in an rdb, so add it
*nptr++ = byte;
// store key
gbmemcpy ( nptr, key , ks );
// skip over it
nptr += ks;
// store data size. BUT not if negative key!
if ( getDataSizeFromRdbId(rdbId) == -1 && ! neg ) {
*(int32_t *)nptr = ds;
nptr += 4;
}
// store data
if ( ds ) {
gbmemcpy ( nptr , data , ds );
nptr += ds;
}
}
// now scan dt8 and add their keys as del keys
for ( int32_t i = 0 ; i < dt8.m_numSlots ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if empty
if ( ! dt8.m_flags[i] ) continue;
// store rdbid first
char *rec = *(char **)dt8.getValueFromSlot(i);
// get rdbId with hi bit possibly set
char rdbId = rec[0] & 0x7f;
// key size
int32_t ks = getKeySizeFromRdbId(rdbId);
// sanity test - no negative keys
if ( (rec[1] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0;}
// copy the rdbId byte and key
gbmemcpy ( nptr , rec , 1 + ks );
// skip over rdbid
nptr++;
// make it a negative key by clearing lsb
*nptr = *nptr & 0xfe;
// skip it
nptr += ks;
			// if it is from linkdb and unmatched, then it is a
// lost link, so set the lost date of it. we keep
// these so we can graph lost links
if ( rdbId == RDB_LINKDB ) {
// the real linkdb rec is at rec+1
int32_t lost = g_linkdb.getLostDate_uk( rec+1 );
// how can it be non-zero? it should have
// been freshly made from the old titlerec...
if ( lost ) { char *xx=NULL;*xx=0; }
// if zero, set it to now!
//g_linkdb.setLostDate_uk(realRec,now);
// copy the rdbId byte and key
gbmemcpy ( nptr , rec , 1 + ks );
// set it in there now
g_linkdb.setLostDate_uk(nptr+1,now);
// carry it through on revdb, do not delete
// it! we want a linkdb history for seomasters
nptr += 1 + ks;
// and go on to delete the old linkdb key that
// did not have a lost date
//continue;
}
}
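		// whatever was left in dt8 existed in the old list but not
		// in the current one. each such key was re-emitted above
		// with the low bit of its first key byte cleared, making it
		// a negative (delete) key, except linkdb keys which also get
		// a positive copy carrying a lost date so we keep a history
		// of lost links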
// sanity. check for metalist breach
if ( nptr > nmax ) { char *xx=NULL;*xx=0; }
// free the old meta list
mfree ( m_metaList , m_metaListAllocSize , "fm" );
// now switch over to the new one
m_metaList = nm;
m_metaListAllocSize = needx;
m_p = nptr;
}
// if we only removed it from index, set this flag
if ( oldList && ! nd ) m_didDelete = true;
//
// repeat this logic special for linkdb since we keep lost links
// and may update the discovery date or lost date in the keys
//
// 1. hash keys of old linkdb keys into dt9 here
// 2. do not hash the discovery/lost dates when making key hash for dt9
// 3. scan keys in meta list and add directly into new meta list
// if not in dt9
// 4. if in dt9 then add dt9 key instead
// 5. remove dt9 keys as we add them
// 6. then add remaining dt9 keys into meta list but with lost date
// set to now UNLESS it's already set
//
//
// validate us!
//
m_metaListValid = true;
// set the list size, different from the alloc size
m_metaListSize = m_p - m_metaList;//end - m_p;
// sanity check
verifyMetaList( m_metaList , m_metaList + m_metaListSize , forDelete );
// all done
return m_metaList;
}
// . copy from old title rec to us to speed things up!
// . returns NULL and sets g_errno on error
// . returns -1 if blocked
// . returns 1 otherwise
// . when the doc content is unchanged, just inherit crap from the old title
// rec so we can make the spider reply in getNewSpiderReply()
void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
// skip if none
if ( ! od ) return;
// skip if already did it
if ( m_copied1 ) return;
// do not repeat
m_copied1 = true;
// set these
m_percentChanged = 0;
m_percentChangedValid = true;
// copy over bit members
m_contentHash32 = od->m_contentHash32;
//m_tagHash32 = od->m_tagHash32;
m_tagPairHash32 = od->m_tagPairHash32;
//m_sitePop = od->m_sitePop;
m_httpStatus = od->m_httpStatus;
m_hasAddress = od->m_hasAddress;
m_hasTOD = od->m_hasTOD;
//m_hasSiteVenue = od->m_hasSiteVenue;
m_isRSS = od->m_isRSS;
m_isPermalink = od->m_isPermalink;
m_hasContactInfo= od->m_hasContactInfo;
m_hopCount = od->m_hopCount;
m_crawlDelay = od->m_crawlDelay;
// do not forget the shadow members of the bit members
m_hasAddress2 = m_hasAddress;
m_hasTOD2 = m_hasTOD;
//m_hasSiteVenue2 = m_hasSiteVenue;
m_isRSS2 = m_isRSS;
m_isPermalink2 = m_isPermalink;
// validate them
m_contentHash32Valid = true;
//m_tagHash32Valid = true;
m_tagPairHash32Valid = true;
//m_sitePopValid = true;
m_httpStatusValid = true;
m_hasAddressValid = true;
m_hasTODValid = true;
//m_hasSiteVenueValid = true;
m_isRSSValid = true;
m_isPermalinkValid = true;
m_hasContactInfoValid= true;
m_hopCountValid = true;
m_crawlDelayValid = true;
m_pubDate = od->m_pubDate;
m_langId = od->m_langId;
m_pubDateValid = true;
m_langIdValid = true;
	// so getSiteNumInlinks() doesn't crash when called by getNewSpiderReply
// because dns timed out. it timed out with EDNSTIMEDOUT before.
// so overwrite it here...
if ( m_ip == -1 || m_ip == 0 || ! m_ipValid ) {
m_ip = od->m_ip;
m_ipValid = true;
m_siteNumInlinks = od->m_siteNumInlinks;
// m_siteNumInlinksUniqueIp = od->m_siteNumInlinksUniqueIp;
// m_siteNumInlinksUniqueCBlock= od->m_siteNumInlinksUniqueCBlo
// m_siteNumInlinksTotal = od->m_siteNumInlinksTotal;
m_siteNumInlinksValid =
od->m_siteNumInlinksValid;
// m_siteNumInlinksUniqueIpValid =
// od->m_siteNumInlinksUniqueIpValid;
// m_siteNumInlinksUniqueCBlockValid =
// od->m_siteNumInlinksUniqueCBlockValid;
// m_siteNumInlinksTotal =
// od->m_siteNumInlinksTotalValid;
}
m_indexCode = 0;//od->m_indexCode;
m_indexCodeValid = true;
// we need the link info too!
ptr_linkInfo1 = od->ptr_linkInfo1;
size_linkInfo1 = od->size_linkInfo1;
if ( ptr_linkInfo1 && size_linkInfo1 ) m_linkInfo1Valid = true;
else m_linkInfo1Valid = false;
// turn off for debug
ptr_sectiondbData = NULL;
size_sectiondbData = 0;
}
// for adding a quick reply for EFAKEIP and for diffbot query reindex requests
SpiderReply *XmlDoc::getFakeSpiderReply ( ) {
if ( ! m_tagRecValid ) {
m_tagRec.reset();
m_tagRecValid = true;
}
if ( ! m_siteHash32Valid ) {
m_siteHash32 = 1;
m_siteHash32Valid = true;
}
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
}
if ( ! m_ipValid ) {
m_ipValid = true;
m_ip = atoip("1.2.3.4");
}
if ( ! m_spideredTimeValid ) {
m_spideredTimeValid = true;
m_spideredTime = getTimeGlobal();//0; use now!
}
// don't let it get the diffbot reply either! it should be empty.
if ( ! m_diffbotReplyValid ) {
m_diffbotReplyValid = true;
}
// if doing diffbot query reindex
// TODO: does this shard the request somewhere else???
if ( ! m_firstIpValid ) {
m_firstIp = m_ip;//atoip("1.2.3.4");
m_firstIpValid = true;
}
// this was causing nsr to block and core below on a bad engineer
// error loading the old title rec
if ( ! m_isPermalinkValid ) {
m_isPermalink = false;
m_isPermalinkValid = true;
}
//if ( ! m_sreqValid ) {
// m_sreqValid = true;
// m_sreq.m_parentDocId = 0LL;
// }
// if error is EFAKEFIRSTIP, do not core
//if ( ! m_isIndexedValid ) {
// m_isIndexed = false;
// m_isIndexedValid = true;
//}
// if this is EABANDONED or EHITCRAWLLIMIT or EHITPROCESSLIMIT
// or ECORRUPTDATA (corrupt gzip reply)
// then this should not block. we need a spiderReply to release the
// url spider lock in SpiderLoop::m_lockTable.
// if m_isChildDoc is true, like for diffbot url, this should be
// a bogus one.
SpiderReply *nsr = getNewSpiderReply ();
if ( nsr == (void *)-1) { char *xx=NULL;*xx=0; }
if ( ! nsr ) {
log("doc: crap, could not even add spider reply "
"to indicate internal error: %s",mstrerror(g_errno));
if ( ! g_errno ) g_errno = EBADENGINEER;
//return true;
return NULL;
}
return nsr;
//if ( nsr->getRecSize() <= 1) { char *xx=NULL;*xx=0; }
//CollectionRec *cr = getCollRec();
//if ( ! cr ) return true;
}
// getSpiderReply()
SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if ( m_srepValid ) return &m_srep;
setStatus ( "getting spider reply" );
	// diffbot guys, robots.txt, frames, should not be here
if ( m_isChildDoc ) { char *xx=NULL;*xx=0; }
// . get the mime first
// . if we are setting XmlDoc from a titleRec, this causes
// doConsistencyCheck() to block and core
//HttpMime *mime = getMime();
//if ( ! mime || mime == (HttpMime *)-1 ) return (SpiderReply *)mime;
// if we had a critical error, do not do this
int32_t *indexCode = getIndexCode();
if (! indexCode || indexCode == (void *)-1)
return (SpiderReply *)indexCode;
// if it has been abandoned early, i.e. cut-off, then we should
// add a "fake" spider reply to release the lock in
// SpiderLoop::m_lockTable at least. see Spider.cpp's addSpiderReply()
// to see what parts of this are relevant.
/*
if ( *indexCode == EABANDONED ||
// . any internal "error" needs to be here really
// . was there an error unzipping the title rec?
*indexCode == ECORRUPTDATA ||
*indexCode == EHITCRAWLLIMIT ||
*indexCode == EHITPROCESSLIMIT ) {
// clear everything
m_srep.reset();
// get from spider request, if there
int32_t firstIp = 0;
if ( m_sreqValid ) firstIp = m_sreq.m_firstIp;
// otherwise, wtf?
if ( ! firstIp )
log("build: no first ip to make fake spiderReply. "
"injected?");
// we at least need this
m_srep.m_firstIp = firstIp;
Url *fu = getFirstUrl();
// this is the lock key
int64_t uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL;
m_srep.setKey ( firstIp, 0 , uh48 , false );
// tell it we are fake and not to really add us to
// spiderdb, but just to release the lock
m_srep.m_errCode = *indexCode;
m_srepValid = true;
return &m_srep;
}
*/
TagRec *gr = getTagRec();
if ( ! gr || gr == (TagRec *)-1 ) return (SpiderReply *)gr;
// can't call getIsPermalink() here without entering a dependency loop
//char *pp = getIsUrlPermalinkFormat();
//if ( !pp || pp == (char *)-1 ) return (SpiderReply *)pp;
// the site hash
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SpiderReply *)sh32;
int64_t *de = getDownloadEndTime();
if ( ! de || de == (void *)-1 ) return (SpiderReply *)de;
// need to set m_sentToDiffbot!!
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return (SpiderReply *)dbr;
	// was the doc indexed when we started trying to spider this url?
//char *wasIndexed = getIsIndexed();
//if ( ! wasIndexed || wasIndexed == (void *)-1 )
// return (SpiderReply *)wasIndexed;
//Tag *vt = m_oldTagRec.getTag("venueaddress");
//bool siteHasVenue = (bool)vt;
	// shortcut
Url *fu = NULL;
// watch out for titlerec lookup errors for docid based spider reqs
if ( m_firstUrlValid ) fu = getFirstUrl();
// reset
m_srep.reset();
int32_t firstIp = -1;
// inherit firstIp
Tag *tag = m_tagRec.getTag("firstip");
// tag must be there?
if ( tag ) firstIp = atoip(tag->getTagData());
// this is usually the authority
if ( m_firstIpValid )
firstIp = m_firstIp;
// otherwise, inherit from oldsr to be safe
// BUT NOT if it was a fakeip and we were injecting because
// the SpiderRequest was manufactured and not actually taken
// from spiderdb! see XmlDoc::injectDoc() because that is where
// it came from!! if it has m_sreq.m_isAddUrl and
// m_sreq.m_fakeFirstIp then we actually do add the reply with that
// fake ip so that they will exist in the same shard.
	// BUT if it is docid based from PageReindex.cpp (a query reindex)
	// we set the injection bit and the pagereindex bit, and we should let
	// these guys keep the firstip because the docid-based spider request
// is in spiderdb. it needs to match up.
if ( m_sreqValid && (!m_sreq.m_isInjecting||m_sreq.m_isPageReindex) )
firstIp = m_sreq.m_firstIp;
// sanity
if ( firstIp == 0 || firstIp == -1 ) {
if ( m_firstUrlValid )
log("xmldoc: BAD FIRST IP for %s",m_firstUrl.getUrl());
else
log("xmldoc: BAD FIRST IP for %"INT64"",m_docId);
firstIp = 12345;
//char *xx=NULL;*xx=0; }
}
// store it
m_srep.m_firstIp = firstIp;
// assume no error
// MDW: not right...
m_srep.m_errCount = 0;
// otherwise, inherit from oldsr to be safe
//if ( m_sreqValid )
// m_srep.m_firstIp = m_sreq.m_firstIp;
	// do not inherit this one, it MIGHT HAVE CHANGED!
m_srep.m_siteHash32 = m_siteHash32;
// need this for updating crawl delay table, m_cdTable in Spider.cpp
if ( fu ) m_srep.m_domHash32 = getDomHash32();
else m_srep.m_domHash32 = 0;
if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// . set other fields besides key
// . crap! if we are the "qatest123" collection then m_spideredTime
// was read from disk usually and is way in the past! watch out!!
m_srep.m_spideredTime = getSpideredTime();//m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// crap, for the test coll this is often a very old time and it
// causes the spider request to be repeatedly executed, so let's
// fix that
if ( ! strcmp(cr->m_coll,"qatest123") )
m_srep.m_spideredTime = getTimeGlobal();
// TODO: expire these when "ownershipchanged" tag is newer!!
if ( gr->getTag ( "ingoogle" ) ) {
m_srep.m_inGoogle = 1;
m_srep.m_inGoogleValid = 1;
}
if ( gr->getTag ( "authorityinlink" ) )
m_srep.m_hasAuthorityInlink = 1;
// automatically valid either way
m_srep.m_hasAuthorityInlinkValid = 1;
// but for this tag, it must exist even if it has no contact info
//tag = gr->getTag ( "hascontactinfo" );
//if ( tag ) {
int64_t uh48 = 0LL;
// we might be a docid based spider request so fu could be invalid
// if the titlerec lookup failed
if ( fu ) uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL;
int64_t parentDocId = 0LL;
if ( m_sreqValid )
parentDocId = m_sreq.getParentDocId();
//else { char *xx=NULL;*xx=0; }
// for docid based urls from PageReindex.cpp we have to make
// sure to set the urlhash48 correctly from that.
if ( m_sreqValid ) uh48 = m_sreq.getUrlHash48();
// note it
if ( g_conf.m_logDebugSpider )
log("xmldoc: uh48=%"UINT64" parentdocid=%"UINT64"",uh48,parentDocId);
// set the key, m_srep.m_key
m_srep.setKey ( firstIp, parentDocId , uh48 , false );
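	// the reply key is built from the same firstIp and 48-bit url hash
	// as the originating request, which is how the reply matches up
	// with its request in spiderdb (and, per the firstIp notes above,
	// lands in the same shard)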
// . did we download a page? even if indexcode is set we might have
	// . if this is non-zero that means it's valid
if ( m_contentHash32Valid )
m_srep.m_contentHash32 = m_contentHash32;
// injecting the content (url implied)
if ( m_contentInjected ) // m_sreqValid && m_sreq.m_isInjecting )
m_srep.m_fromInjectionRequest = 1;
// can be injecting a url too, content not necessarily implied
if ( m_sreqValid && m_sreq.m_isInjecting )
m_srep.m_fromInjectionRequest = 1;
if ( m_sentToDiffbotThisTime )
m_srep.m_sentToDiffbotThisTime = true;
else
m_srep.m_sentToDiffbotThisTime = false;
if ( m_diffbotReplyError )
m_srep.m_hadDiffbotError = true;
else
m_srep.m_hadDiffbotError = false;
// if we only had an error code in the diffbot reply, record that
if ( ! m_indexCode && m_diffbotReplyError )
m_srep.m_errCode = m_diffbotReplyError;
// sanity. if being called directly from indexDoc() because of
// an error like out of memory, then we do not know if it is
// indexed or not or was indexed...
//if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; }
// were we already in titledb before we started spidering?
m_srep.m_wasIndexed = m_wasInIndex;
// note whether m_wasIndexed is valid because if it isn't then
// we shouldn't be counting this reply towards the page counts.
// if we never made it this far i guess we should not forcibly call
// getIsIndexed() at this point so our performance is fast in case
// this is an EFAKEFIRSTIP error or something similar where we
// basically just add this reply and we're done.
// NOTE: this also pertains to SpiderReply::m_isIndexed.
m_srep.m_wasIndexedValid = m_wasInIndexValid;
// assume no change
m_srep.m_isIndexed = m_isInIndex;
// we need to know if the m_isIndexed bit is valid or not
// because sometimes like if we are being called directly from
// indexDoc() because of an error situation, we do not know!
if ( m_isInIndexValid ) m_srep.m_isIndexedINValid = false;
else m_srep.m_isIndexedINValid = true;
// likewise, we need to know if we deleted it so we can decrement the
// quota count for this subdomain/host in SpiderColl::m_quotaTable
//if ( m_srep.m_wasIndexed ) m_srep.m_isIndexed = true;
	// treat error replies specially i guess, since langId, etc. will be
// invalid
if ( m_indexCode ) {
// validate
m_srepValid = true;
// set these items if valid already, but don't bother
// trying to compute them, since we are not indexing.
if ( m_siteNumInlinksValid ) {
m_srep.m_siteNumInlinks = m_siteNumInlinks;
m_srep.m_siteNumInlinksValid = true;
}
//if ( m_percentChangedValid )
// m_srep.m_percentChangedPerDay = m_percentChanged;
if ( m_crawlDelayValid && m_crawlDelay >= 0 )
// we already multiply x1000 in isAllowed2()
m_srep.m_crawlDelayMS = m_crawlDelay;// * 1000;
else
m_srep.m_crawlDelayMS = -1;
if ( m_pubDateValid ) m_srep.m_pubDate = m_pubDate;
if ( m_langIdValid ) m_srep.m_langId = m_langId;
if ( m_isRSSValid ) m_srep.m_isRSS = m_isRSS;
if ( m_isPermalinkValid ) m_srep.m_isPermalink =m_isPermalink;
if ( m_httpStatusValid ) m_srep.m_httpStatus = m_httpStatus;
// stuff that is automatically valid
m_srep.m_isPingServer = 0;
if ( fu ) m_srep.m_isPingServer = (bool)fu->isPingServer();
// this was replaced by m_contentHash32
//m_srep.m_newRequests = 0;
m_srep.m_errCode = m_indexCode;
if ( m_downloadEndTimeValid )
m_srep.m_downloadEndTime = m_downloadEndTime;
else
m_srep.m_downloadEndTime = 0;
// is the original spider request valid?
if ( m_sreqValid ) {
// preserve the content hash in case m_indexCode is
// EDOCUNCHANGED. so we can continue to get that
// in the future. also, if we had the doc indexed,
// just carry the contentHash32 forward for the other
// errors like EDNSTIMEDOUT or whatever.
m_srep.m_contentHash32 = m_sreq.m_contentHash32;
			// shortcuts
SpiderReply *n = &m_srep;
SpiderRequest *o = &m_sreq;
// more stuff
n->m_inGoogle = o->m_inGoogle;
n->m_hasContactInfo = o->m_hasContactInfo;
n->m_isContacty = o->m_isContacty;
n->m_hasAuthorityInlink = o->m_hasAuthorityInlink;
n->m_isPingServer = o->m_isPingServer;
// the validator flags
n->m_inGoogleValid = o->m_inGoogleValid;
n->m_hasContactInfoValid = o->m_hasContactInfoValid;
n->m_isContactyValid = o->m_isContactyValid;
n->m_hasAuthorityInlinkValid =
o->m_hasAuthorityInlinkValid;
// get error count from original spider request
int32_t newc = m_sreq.m_errCount;
// inc for us, since we had an error
newc++;
// contain to one byte
if ( newc > 255 ) newc = 255;
// store in our spiderreply
m_srep.m_errCount = newc;
}
// . and do not really consider this an error
// . i don't want the url filters treating it as an error reply
// . m_contentHash32 should have been carried forward from
// the block of code right above
if ( m_indexCode == EDOCUNCHANGED ) {
// we should have had a spider request, because that's
// where we got the m_contentHash32 we passed to
// Msg13Request.
if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
// make it a success
m_srep.m_errCode = 0;
// and no error count, it wasn't an error per se
m_srep.m_errCount = 0;
// call it 200
m_srep.m_httpStatus = 200;
}
// copy flags and data from old doc...
if ( m_indexCode == EDOCUNCHANGED &&
m_oldDocValid &&
m_oldDoc ) {
m_srep.m_pubDate = m_oldDoc->m_pubDate;
m_srep.m_langId = m_oldDoc->m_langId;
m_srep.m_isRSS = m_oldDoc->m_isRSS;
m_srep.m_isPermalink = m_oldDoc->m_isPermalink;
m_srep.m_hasAddress = m_oldDoc->m_hasAddress;
m_srep.m_hasTOD = m_oldDoc->m_hasTOD;
//m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue;
m_srep.m_siteNumInlinks = m_oldDoc->m_siteNumInlinks;
// they're all valid
m_srep.m_hasAddressValid = true;
m_srep.m_hasTODValid = true;
//m_srep.m_hasSiteVenueValid = true;
m_srep.m_siteNumInlinksValid = true;
}
// do special things if
return &m_srep;
}
// this will help us avoid hammering ips & respect same ip wait
if ( ! m_downloadEndTimeValid ) { char *xx=NULL;*xx=0; }
m_srep.m_downloadEndTime = m_downloadEndTime;
// . if m_indexCode was 0, we are indexed then...
// . this logic is now above
//m_srep.m_isIndexed = 1;
// get ptr to old doc/titlerec
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (SpiderReply *)pod;
// this is non-NULL if it existed
XmlDoc *od = *pod;
// status is -1 if not found
int16_t *hs = getHttpStatus ();
if ( ! hs || hs == (void *)-1 ) return (SpiderReply *)hs;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (SpiderReply *)sni;
float *pc = getPercentChanged();
if ( ! pc || pc == (void *)-1 ) return (SpiderReply *)pc;
// these are "non-dup" addresses (nondup)
bool *hasAddress = getHasAddress();
if ( ! hasAddress || hasAddress == (void *)-1 )
return (SpiderReply *)hasAddress;
// does it have a tod (i.e. 6pm) in there somewhere?
bool *hasTOD = getHasTOD();
if ( ! hasTOD || hasTOD == (void *)-1 )
return (SpiderReply *)hasTOD;
// does it have a venue address?
//bool *hasSiteVenue = getHasSiteVenue();
//if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 )
// return (SpiderReply *)hasSiteVenue;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (SpiderReply *)isRoot;
char *hci = getHasContactInfo();
if ( ! hci || hci == (char *)-1 ) return (SpiderReply *)hci;
int32_t *pubDate = getPubDate();
if ( ! pubDate || pubDate == (int32_t *)-1 )
return (SpiderReply *)pubDate;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 )
return (SpiderReply *)langId;
char *isRSS = getIsRSS();
if ( ! isRSS || isRSS == (char *)-1 )
return (SpiderReply *)isRSS;
char *pl = getIsPermalink();
if ( ! pl || pl == (char *)-1 )
return (SpiderReply *)pl;
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
if ( m_hasContactInfo ) {
m_srep.m_hasContactInfo = 1;
m_srep.m_hasContactInfoValid = 1;
}
	// this is only known if we download the robots.txt...
if ( od && m_recycleContent ) {
m_crawlDelay = od->m_crawlDelay;
m_crawlDelayValid = true;
}
// sanity checks
//if(! m_sreqValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_pageNumInlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_percentChangedValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_isSpamValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_crawlDelayValid ) { char *xx=NULL;*xx=0; }
// httpStatus is -1 if not found (like for empty http replies)
m_srep.m_httpStatus = *hs;
// zero if none
//m_srep.m_percentChangedPerDay = 0;
// . only if had old one
// . we use this in url filters to set the respider wait time usually
if ( od ) {
int32_t spideredTime = getSpideredTime();
int32_t oldSpideredTime = od->getSpideredTime();
		// the spidered times are in seconds, so convert the
		// elapsed time to days to match m_percentChangedPerDay
		float numDays = (spideredTime - oldSpideredTime) / 86400.0;
m_srep.m_percentChangedPerDay = (m_percentChanged+.5)/numDays;
}
// . update crawl delay, but we must store now as milliseconds
// because Spider.cpp like it better that way
// . -1 implies crawl delay unknown or not found
if ( m_crawlDelay >= 0 && m_crawlDelayValid )
// we already multiply x1000 in isAllowed2()
m_srep.m_crawlDelayMS = m_crawlDelay;// * 1000;
else
// -1 means invalid/unknown
m_srep.m_crawlDelayMS = -1;
if ( ! m_hasAddressValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hasTODValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hasContactInfoValid) { char *xx=NULL;*xx=0; }
// . we use this to store "bad" spider recs to keep from respidering
// a "bad" url over and over again
// . it is up to the url filters whether they want to retry this
// again or not!
// . TODO: how to represent "ETCPTIMEDOUT"????
// . EUDPTIMEDOUT, EDNSTIMEDOUT, ETCPTIMEDOUT, EDNSDEAD, EBADIP,
	// ENETUNREACH,EBADMIME,ECONNREFUSED,EHOSTUNREACH
m_srep.m_siteNumInlinks = m_siteNumInlinks;
m_srep.m_pubDate = *pubDate;
// this was replaced by m_contentHash32
//m_srep.m_newRequests = 0;
m_srep.m_langId = *langId;
m_srep.m_isRSS = (bool)*isRSS;
m_srep.m_isPermalink = (bool)*pl;
m_srep.m_isPingServer = (bool)fu->isPingServer();
//m_srep.m_isSpam = m_isSpam;
m_srep.m_siteNumInlinksValid = true;
// . ignore address in dup sections (nondup/non-dup addresses only)
// . this way if the place always has their address in the header or
// footer of every web page we will ignore it
m_srep.m_hasAddress = *hasAddress;
m_srep.m_isContacty = *hci;//getIsContacty(fu,
// info1,
// m_hopCount ,
// *ct , // contentType
// *isRoot ,
// m_niceness );
m_srep.m_hasTOD = *hasTOD;
//m_srep.m_hasSiteVenue = *hasSiteVenue;
// validate all
m_srep.m_inGoogleValid = 1;
m_srep.m_hasContactInfoValid = 1;
m_srep.m_hasAuthorityInlinkValid = 1;
m_srep.m_isContactyValid = 1;
m_srep.m_hasAddressValid = 1;
m_srep.m_hasTODValid = 1;
//m_srep.m_hasSiteVenueValid = 1;
// a quick validation. reply must unlock the url from the lock table.
// so the locks must be equal.
if ( m_sreqValid &&
// we create a new spiderrequest if injecting with a fake firstip
// so it will fail this test...
! m_sreq.m_isInjecting ) {
int64_t lock1 = makeLockTableKey(&m_sreq);
int64_t lock2 = makeLockTableKey(&m_srep);
if ( lock1 != lock2 ) {
log("build: lock1 != lock2 lock mismatch for %s",
m_firstUrl.m_url);
char *xx=NULL;*xx=0;
}
}
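	// note: makeLockTableKey() presumably derives the lock from fields
	// both the request and reply share (the url hash and firstIp), so a
	// mismatch here would mean this reply could never release the lock
	// its request took in SpiderLoop::m_lockTable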
// validate
m_srepValid = true;
return &m_srep;
}
// . so Msg20 can see if we are banned now or not...
// . we must skip certain rules in getUrlFilterNum() when doing this for Msg20
// because things like "parentIsRSS" can be both true or false since a url
// can have multiple spider recs associated with it!
void XmlDoc::setSpiderReqForMsg20 ( SpiderRequest *sreq ,
SpiderReply *srep ) {
// sanity checks
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; }
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_isUrlPermalinkFormatValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_pageNumInlinksValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_percentChangedValid ) { char *xx=NULL;*xx=0; }
Url *fu = getFirstUrl();
// get this
//TagRec *gr = (TagRec *)ptr_tagRecData;
//Tag *tag = NULL;
//if ( gr ) tag = gr->getTag("sitenuminlinks");
// reset
sreq->reset();
// assume not valid
sreq->m_siteNumInlinks = -1;
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// how many site inlinks?
sreq->m_siteNumInlinks = m_siteNumInlinks;
sreq->m_siteNumInlinksValid = true;
// set other fields besides key
sreq->m_firstIp = m_ip;
sreq->m_hostHash32 = m_hostHash32a;
//sreq->m_domHash32 = m_domHash32;
//sreq->m_siteNumInlinks = m_siteNumInlinks;
//sreq->m_pageNumInlinks = m_pageNumInlinks;
sreq->m_hopCount = m_hopCount;
sreq->m_parentHostHash32 = 0;//m_sreq.m_parentHostHash32;
sreq->m_parentDomHash32 = 0;//m_sreq.m_parentDomHash32;
sreq->m_parentSiteHash32 = 0;//m_sreq.m_parentSiteHash32;
sreq->m_pageNumInlinks = 0;//m_sreq.m_parentFirstIp;
sreq->m_isNewOutlink = 0;
sreq->m_isAddUrl = 0;//m_isAddUrl;
sreq->m_isPingServer = fu->isPingServer();
//sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat;
// transcribe from old spider rec, stuff should be the same
sreq->m_addedTime = m_firstIndexedDate;
sreq->m_sameDom = 0;//m_sreq.m_sameDom;
sreq->m_sameHost = 0;//m_sreq.m_sameHost;
sreq->m_sameSite = 0;//m_sreq.m_sameSite;
sreq->m_wasParentIndexed = 0;//m_sreq.m_parentWasIndexed;
sreq->m_parentIsRSS = 0;//m_sreq.m_parentIsRSS;
sreq->m_parentIsPermalink = 0;//m_sreq.m_parentIsPermalink;
sreq->m_parentIsPingServer = 0;//m_sreq.m_parentIsPingServer;
// validate the stuff so getUrlFilterNum() acks it
sreq->m_hopCountValid = 1;
srep->reset();
srep->m_spideredTime = getSpideredTime();//m_spideredTime;
//srep->m_isSpam = isSpam; // real-time update this!!!
srep->m_isRSS = m_isRSS;
srep->m_isPermalink = m_isPermalink;
srep->m_httpStatus = 200;
//srep->m_retryNum = 0;
srep->m_langId = m_langId;
srep->m_percentChangedPerDay = 0;//m_percentChanged;
// we need this now for ucp ucr upp upr new url filters that do
// substring matching on the url
if ( m_firstUrlValid )
strcpy(sreq->m_url,m_firstUrl.m_url);
}
// defined in PageCrawlBot.cpp
int32_t isInSeedBuf ( CollectionRec *cr , char *url, int len ) ;
// . add the spiderdb recs to the meta list
// . used by XmlDoc::setMetaList()
// . returns NULL and sets g_errno on error
// . otherwise returns the "new p"
// . Scraper.cpp, PageAddUrl.cpp and Msg7.cpp should all use the XmlDoc
// class even if just adding links. they should make a fake html page and
// "inject" it, with only m_useSpiderdb set to true...
char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; }
// do not do this if recycling content
// UNLESS REBUILDING...
if ( m_recycleContent && ! m_useSecondaryRdbs ) return (char *)0x01;
// for now skip in repair tool
if ( m_useSecondaryRdbs && ! g_conf.m_rebuildAddOutlinks )
return (char *)0x01;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
Links *links = getLinks();
if ( ! links || links == (Links *)-1 ) return (char *)links;
char *spiderLinks = getSpiderLinks();
if ( ! spiderLinks || spiderLinks == (char *)-1 )
return (char *)spiderLinks;
TagRec ***grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (char *)grv;
//char **iiv = getOutlinkIsIndexedVector();
//if ( ! iiv || iiv == (void *)-1 ) return (char *)iiv;
int32_t **ipv = getOutlinkFirstIpVector();
if ( ! ipv || ipv == (void *)-1 ) return (char *)ipv;
//int8_t *hcv = getOutlinkHopCountVector();
//if ( ! hcv || hcv == (void *)-1 ) return (char *)hcv;
char *ipi = getIsIndexed(); // is the parent indexed?
if ( ! ipi || ipi == (char *)-1 ) return (char *)ipi;
Addresses *aa = getAddresses ();
if ( ! aa || aa == (Addresses *)-1 ) return (char *)aa;
// sanity check
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
// . ignore address in dup sections
// . this way if the place always has their address in the header or
// footer of every web page we will ignore it (SEC_DUP section flag)
bool parentHasAddress = (bool)(aa->getNumNonDupAddresses()>0);
// need this
int32_t parentDomHash32 = getDomHash32();
if ( parentDomHash32 != m_domHash32 ) { char *xx=NULL;*xx=0; }
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot;
int32_t *psni = getSiteNumInlinks();
if ( ! psni || psni == (int32_t *)-1 ) return (char *)psni;
int32_t *pfip = getFirstIp();
if ( ! pfip || pfip == (void *)-1 ) return (char *)pfip;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (char *)d;
Url *fu = getFirstUrl();
if ( ! fu || fu == (void *)-1 ) return (char *)fu;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (char *)cu;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (char *)langId;
// validate this to prevent core for simplified redirect links
int32_t hostHash32a = getHostHash32a();
// so linkSites[i] is site for link #i in Links.cpp class
int32_t *linkSiteHashes = getLinkSiteHashes ( );
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 )
return (char *)linkSiteHashes;
XmlDoc *nd = this;
// set "od". will be NULL if no old xml doc, i.e. no old title rec
//XmlDoc **pod = getOldXmlDoc ( );
//if ( ! pod || pod == (void *)-1 ) return (char *)pod;
//XmlDoc *od = *pod;
// if this page is hacked, then do not spider external outlinks
//char *comp = getIsCompromised();
//if ( ! comp || comp == (char *)-1 ) return (char *)comp;
//if ( *comp )
// onlyInternal = true;
bool isParentRSS = false;
bool parentIsPermalink = false;
bool parentIsSiteMap = false;
// PageAddUrl.cpp does not supply a valid new doc, so this is NULL
if ( nd ) {
isParentRSS = *nd->getIsRSS() ;
parentIsPermalink = *nd->getIsPermalink();
parentIsSiteMap = *nd->getIsSiteMap();
}
int32_t n = links->m_numLinks;
// return early if nothing to do. do not return NULL though cuz we
// do not have g_errno set!
if ( n <= 0 ) return (char *)0x01;
// sanity checks
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hostHash32aValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
int64_t myUh48 = m_firstUrl.getUrlHash48();
// . pre-allocate a buffer to hold the spider recs
// . taken from SpiderRequest::store()
int32_t size = 0;
for ( int32_t i = 0 ; i < n ; i++ )
size += SpiderRequest::getNeededSize ( links->getLinkLen(i) );
// append spider recs to this list ptr
char *p = m_p;
// hash table to avoid dups
HashTableX ht;
char buf2[8192];
if ( ! ht.set ( 4,0,1000,buf2 , 8192,false,m_niceness,"linkdedup" ) )
return NULL;
// count how many we add
int32_t numAdded = 0;
int32_t numAddedFromSameDomain = 0;
int32_t linksBanned = 0;
int32_t linksFiltered = 0;
bool isParentPingServer = false;
if ( fu && fu->isPingServer() ) isParentPingServer = true;
if ( cu && cu->isPingServer() ) isParentPingServer = true;
	// shortcut
bool isScraping = (m_sreqValid && m_sreq.m_isScraping);
//bool useTestSpiderDir = (m_sreqValid && m_sreq.m_useTestSpiderDir);
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// do not do this if not test collection for now
bool isTestColl = (! strcmp(cr->m_coll,"qatest123") );
// turn off for now
isTestColl = false;
//char **wptrs = m_words.getWords();
//int32_t *wlens = m_words.getWordLens();
// need this for setting SpiderRequest::m_spiderTime
//int32_t nowGlobal = getTimeGlobal();
// for setting LF_CONTACTY bit on the outlinks
char disbuf[1000];
HashTableX disqualify;
disqualify.set(4,0,32,disbuf,1000,false,m_niceness,"disqual");
int32_t consec = 0;
int32_t linkTypes[2000];
int32_t lastType = 0;
// if the file we are indexing now has
// "<meta name=spiderlinkslinks value=0>" then that means to
// add the links to spiderdb, but do not spider their links!
// dmozparse uses this to make a file called gbdmoz.urs.txt.0
// that is just filled with urls that are in dmoz. and we want
// to index just those urls.
//
// now just make dmozparse output urls as <a href=> tags.
//
char mbuf[16];
mbuf[0] = '\0';
char *tag = "spiderlinkslinks";
int32_t tlen = gbstrlen(tag);
xml->getMetaContent ( mbuf, 16 , tag , tlen );
bool avoid = false;
if ( mbuf[0] == '0' ) avoid = true;
// if this is a simplified redir and we should not be spidering
// links then turn it off as well! because we now add simplified
// redirects back into spiderdb using this function.
if ( m_spiderLinksValid && ! m_spiderLinks )
avoid = true;
// it also has this meta tag now too
mbuf[0] = '\0';
tag = "ignorelinksexternalerrors";
tlen = gbstrlen(tag);
xml->getMetaContent ( mbuf, 16 , tag , tlen );
bool ignore = false;
if ( mbuf[0] == '1' ) ignore = true;
// for diffbot crawlbot, if we are a seed url and redirected to a
// different domain... like bn.com --> barnesandnoble.com
int32_t redirDomHash32 = 0;
int32_t redirHostHash32 = 0;
//int32_t redirSiteHash32 = 0;
if ( //cr->m_isCustomCrawl == 1 &&
//isInSeedBuf(cr,m_firstUrl.getUrl(),m_firstUrl.getUrlLen() ) &&
m_hopCount == 0 &&
m_redirUrlValid &&
ptr_redirUrl &&
//m_redirUrlPtr && (this gets reset to NULL as being LAST redir)
// this is the last non-empty redir here:
m_redirUrl.getUrlLen() > 0 ) {
log("build: seed REDIR: %s",m_redirUrl.getUrl());
redirDomHash32 = m_redirUrl.getDomainHash32();
redirHostHash32 = m_redirUrl.getHostHash32();
}
//SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( m_collnum );
//
// serialize each link into the metalist now
//
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// grab our info
TagRec *gr = (*grv)[i];
int32_t firstIp = (*ipv)[i];
//char isIndexed = (*iiv)[i];
//int32_t hc = hcv[i];
// ip lookup failed? do not add to spiderdb then
if ( firstIp == 0 || firstIp == -1 ) continue;
// if firstIp is in the SpiderColl::m_overflowFirstIps list
// then do not add any more links to it. it already has
// more than 500MB worth.
// this was moved to Rdb.cpp's addRecord()
// if ( sc && sc->isFirstIpInOverflowList ( firstIp ) ) {
// m_linkOverflows++;
// g_stats.m_totalOverflows++;
// continue;
// }
// sanity check
//if ( firstIp == 0x03 ) {char *xx=NULL;*xx=0; }
// get flags
linkflags_t flags = links->m_linkFlags[i];
// . skip if we are rss page and this link is an <a href> link
// . we only harvest <link> urls from rss feeds, not href links
// . or in the case of feedburner, those orig tags
if ( isParentRSS && (flags & LF_AHREFTAG) ) continue;
// if we have a <feedburner:origLink> tag, then ignore <link>
// tags and only get the links from the original links
if ( links->m_isFeedBurner && !(flags & LF_FBTAG) ) continue;
// do not add self links, pointless
if ( flags & LF_SELFLINK ) continue;
// do not add if no follow
if ( flags & LF_NOFOLLOW ) continue;
// point to url
char *s = links->getLink (i);
int32_t slen = links->getLinkLen(i);
// breathe
QUICKPOLL(m_niceness);
// get hash
int32_t uh = hash32 ( s , slen );
// it does not like keys of 0, that means empty slot
if ( uh == 0 ) uh = 1;
// skip if dup
if ( ht.isInTable ( &uh ) ) continue;
// add it, returns false and sets g_errno on error
if ( ! ht.addKey ( &uh ) ) return NULL;
		// we now support HTTPS
if ( strncmp(s,"http://",7) && strncmp(s,"https://",8) )
continue;
// . do not add if "old"
// . Links::set() calls flagOldOutlinks()
// . that just means we probably added it the last time
// we spidered this page
// . no cuz we might have a different siteNumInlinks now
// and maybe this next hop count is now allowed where as
// before it was not!
//if ( flags & LF_OLDLINK ) continue;
// set it. addWWW = true! no.. make it false because of issues
// like tmblr.co/ZHw5yo1E5TAaW injection where
// www.tmblr.co has no IP
Url url; url.set ( s , slen , false ); // true );
// if hostname length is <= 2 then SILENTLY reject it
if ( url.getHostLen() <= 2 ) continue;
		// is this a new outlink from this page, i.e. a "hot link"? assume so
bool newOutlink = true;
// if no old links, can not be a new outlink then
if ( flags & LF_OLDLINK ) newOutlink = false;
// . do not consider outlinks of new pages to be newOutlinks.
// that is somewhat redundant.
// . you can use "parentisnew" to do what you want in the url
// filters table
//if ( ! isIndexed ) newOutlink = false;
// get # of inlinks to this site... if recorded...
int32_t ksni = -1;
Tag *st = NULL;
if ( gr ) st = gr->getTag ("sitenuminlinks");
if ( st ) ksni = atol(st->getTagData());
int32_t hostHash32 = url.getHostHash32();
// . consult our sitelinks.txt file
// . returns -1 if not found
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! url.hasSubdomain() ) {
int32_t wwwHash32 = url.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
if ( min >= 0 && ksni < min )
ksni = min;
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
//int32_t ksni = m_siteNumInlinks;
// . get possible pub date from url (.../2008/09/23/page.htm)
// . this returns 0 if none found
//int32_t urlPubDate = parseDateFromUrl(s);
// use zero for the timestamp so SiteGetter does not recompute
// any tags in the tagRec thereby blocking!
//SiteGetter sg;
//sg.getSite ( s , gr , 0, m_coll, m_niceness,false,NULL,NULL);
// get this
bool issiteroot = isSiteRootFunc3 ( s , linkSiteHashes[i] );
//int32_t siteHash32 = hash32n ( linkSite );
// get it quick
bool ispingserver = url.isPingServer();
int32_t domHash32 = url.getDomainHash32();
// is link rss?
//bool isrss = false;
//if (slen>6 && !strncasecmp(s+slen-4,".rss",4)) isrss = true;
bool isRSSExt = false;
char *ext = url.getExtension();
if ( ext && strcasecmp(ext,"rss" ) == 0 ) isRSSExt = true;
if ( ext && strcasecmp(ext,"xml" ) == 0 ) isRSSExt = true;
if ( ext && strcasecmp(ext,"atom") == 0 ) isRSSExt = true;
// make the spider request rec for it
SpiderRequest ksr;
// to defaults (zero out)
ksr.reset();
// set other fields besides key
ksr.m_firstIp = firstIp;
ksr.m_hostHash32 = hostHash32;
ksr.m_domHash32 = domHash32;
ksr.m_siteHash32 = linkSiteHashes[i];//siteHash32;
ksr.m_siteNumInlinks = ksni;
ksr.m_siteNumInlinksValid = true;
ksr.m_isRSSExt = isRSSExt;
// continue using "test-spider" subdir to cache web pages
// if our parent was using that
//ksr.m_useTestSpiderDir = useTestSpiderDir;
ksr.m_parentIsSiteMap = parentIsSiteMap;
ksr.m_hasMediaExtension = url.hasMediaExtension();
ksr.m_hasMediaExtensionValid = 1;
// now we need this so we can share Msg12 spider locks with
// query reindex docid-based spider requests. that way
// we do not spider the same document at the same time.
//ksr.m_probDocId = g_titledb.getProbableDocId(&url);
//ksr.m_pageNumInlinks = 0;
// hop count is now 16 bits so do not wrap that around
int32_t hc = m_hopCount + 1;
if ( hc > 65535 ) hc = 65535;
ksr.m_hopCount = hc;
// keep hopcount the same for redirs
if ( m_indexCodeValid &&
( m_indexCode == EDOCSIMPLIFIEDREDIR ||
m_indexCode == EDOCNONCANONICAL ) )
ksr.m_hopCount = m_hopCount;
// for diffbot custom crawls we keep the computed hopcount
if ( ! cr->m_isCustomCrawl ) {
if ( issiteroot ) ksr.m_hopCount = 0;
if ( ispingserver ) ksr.m_hopCount = 0;
//if ( isrss ) ksr.m_hopCount = 0;
}
// log("ksr: url=%s hc=%i (isr=%i ips=%i icv=%i ic=%i mhc=%i)",
// url.getUrl(),(int)ksr.m_hopCount,
// (int)issiteroot,(int)ispingserver,(int)m_indexCodeValid,
// (int)m_indexCode,(int)m_hopCount
// );
// validate it
ksr.m_hopCountValid = true;
ksr.m_addedTime = getSpideredTime();//m_spideredTime;
//ksr.m_lastAttempt = 0;
//ksr.m_urlPubDate = urlPubDate;
//ksr.m_errCode = 0;
ksr.m_parentHostHash32 = hostHash32a;
ksr.m_parentDomHash32 = m_domHash32;
ksr.m_parentSiteHash32 = m_siteHash32;
		// if a seed/hopcount0 url redirected to a different domain
		// then treat the redirect's domain/host as the parent's so
		// that outlinks on it can still satisfy the "isonsamedomain"
		// expression in the url filters table.
if ( redirDomHash32 == domHash32 && redirDomHash32 )
ksr.m_parentDomHash32 = redirDomHash32;
if ( redirHostHash32 == hostHash32 && redirHostHash32 )
ksr.m_parentHostHash32 = redirHostHash32;
//ksr.m_parentFirstIp = *pfip;//m_ip;
ksr.m_pageNumInlinks = 0;
ksr.m_parentHasAddress = parentHasAddress;
// get this
bool isupf = ::isPermalink(NULL,&url,CT_HTML,NULL,isRSSExt);
// set some bit flags. the rest are 0 since we call reset()
if ( newOutlink ) ksr.m_isNewOutlink = 1;
if ( isupf ) ksr.m_isUrlPermalinkFormat = 1;
//if ( isIndexed ) ksr.m_isIndexed = 1;
if ( ispingserver ) ksr.m_isPingServer = 1;
// is it like www.xxx.com/* (does not include www.xxx.yyy.com)
// includes xxx.com/* however
ksr.m_isWWWSubdomain = url.isSimpleSubdomain();
// get link text we use for this outlink
/*
char tbuf[200];
int32_t tlen = links->getLinkText2 ( i ,
tbuf ,
200 ,
NULL ,
NULL ,
NULL ,
m_niceness );
*/
// the updated isContacty algo to fix www.apha.org which
// has a ton of apha.org/about/* links
int32_t t = getIsContacty ( &url,
NULL ,
ksr.m_hopCount ,
0 , // content type
(ksr.m_hopCount==0),
m_niceness );
// if same type as last one we might disqualify if 3 in a row
if ( t && t == lastType ) consec++;
else consec = 0;
		// disqualify this pattern as a contacty link if it is abused
if ( consec >= 3 )
if ( ! disqualify.addKey(&t) )
return NULL;
// remember. use numAdded as the index for this since we do
// not add all the outlinks to this list.
if ( numAdded < 2000 ) linkTypes[numAdded] = t;
// set this
lastType = t;
// validate
ksr.m_isContactyValid = 1;
// if parent is a root of a popular site, then it is considered
// an authority linker. (see updateTagdb() function above)
if ( *isRoot && *psni >= 500 )
ksr.m_hasAuthorityInlink = 1;
// this is in request now as well as reply
//Tag *tag;
// hascontactinfo tag can have a value of 0 or 1
//tag = gr->getTag("hascontactinfo");
//if ( tag ) {
if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; }
if ( m_hasContactInfo ) {
ksr.m_hasContactInfo = 1;
ksr.m_hasContactInfoValid = true;
}
// if we just set the contact info, use us, more recent
if ( linkSiteHashes[i]==m_siteHash32 && m_hasContactInfoValid){
ksr.m_hasContactInfo = m_hasContactInfo;
ksr.m_hasContactInfoValid = true;
}
if ( gr->getTag("ingoogle" ) ) {
ksr.m_inGoogle = 1;
ksr.m_inGoogleValid = true;
}
// the mere existence of these tags is good
if ( gr->getTag("authorityinlink"))ksr.m_hasAuthorityInlink =1;
ksr.m_hasAuthorityInlinkValid = true;
// if our url was a seed and redirected to another domain
// allow outlinks on that other domain to be on domain too.
// only used for diffbot crawlbot right now.
if ( domHash32 == redirDomHash32 && redirDomHash32 )
ksr.m_sameDom = 1;
if ( hostHash32 == redirHostHash32 && redirHostHash32 )
ksr.m_sameHost = 1;
// if ( linkSiteHashes[i]==redirSiteHash32 && redirSiteHash32)
// ksr.m_sameSite = 1;
// set parent based info
if ( domHash32 == m_domHash32 ) ksr.m_sameDom = 1;
if ( hostHash32 == m_hostHash32a ) ksr.m_sameHost = 1;
if ( linkSiteHashes[i]==m_siteHash32 ) ksr.m_sameSite = 1;
if ( *ipi ) ksr.m_wasParentIndexed = 1;
if ( isParentRSS ) ksr.m_parentIsRSS = 1;
if ( parentIsPermalink ) ksr.m_parentIsPermalink = 1;
if ( isParentPingServer ) ksr.m_parentIsPingServer= 1;
if ( parentIsSiteMap ) ksr.m_parentIsSiteMap = 1;
// this is used for building dmoz. we just want to index
// the urls in dmoz, not their outlinks.
if ( avoid ) ksr.m_avoidSpiderLinks = 1;
// this is used for building dmoz. we need to index this
// url even in the case of ETCPTIMEDOUT, etc.
if ( ignore ) ksr.m_ignoreExternalErrors = 1;
// . if this is the 2nd+ time we were spidered and this outlink
// wasn't there last time, then set this!
// . if this is the first time spidering this doc then set it
// to zero so that m_minPubDate is set to -1 when the outlink
// defined by "ksr" is spidered.
if ( m_oldDocValid && m_oldDoc ) {
int32_t oldSpideredTime = m_oldDoc->getSpideredTime();
ksr.m_parentPrevSpiderTime = oldSpideredTime;
}
else
ksr.m_parentPrevSpiderTime = 0;
//
	// . inherit the manual add bit if redirecting to a simplified url
	// . so we always spider the seed url even if it is prohibited by
	//   the regex, and even if it redirects to a simplified url
//
if ( m_indexCodeValid &&
( m_indexCode == EDOCSIMPLIFIEDREDIR ||
m_indexCode == EDOCNONCANONICAL ) &&
m_sreqValid ) {
if ( m_sreq.m_isInjecting )
ksr.m_isInjecting = 1;
if ( m_sreq.m_isAddUrl )
ksr.m_isAddUrl = 1;
}
// it is useful to know the primary langid of the parent
// when prioritizing links for spidering in the case of
// focussing the search engine on a particular set of langs
ksr.m_parentLangId = *langId;
// don't forget this one!
//ksr.m_spiderTime = nowGlobal;
// . is it "spam"? XmlDoc.cpp::isSpam()
// . we need to make that root quality into site root quality!
// . let's put spam detection logic into url filters
//if ( isSpam ( s,gr,m_spideredTime,true ) )
// // set the bit flag
// ksr.m_isSpam = 1;
// copy the url into SpiderRequest::m_url buffer
strcpy(ksr.m_url,s);
// this must be valid
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// set the key, ksr.m_key. isDel = false
ksr.setKey ( firstIp, *d , false );
// we were hopcount 0, so if we link to ourselves we override
// our original hopcount of 0 with this guy that has a
// hopcount of 1. that sux... so don't do it.
if ( ksr.getUrlHash48() == myUh48 ) continue;
// if we've recently added this url to spiderdb in Spider.cpp, skip it
//if ( sc && sc->isInDupCache ( &ksr , false ) )
// continue;
// . technically speaking we do not have any reply so we
// should not be calling this! cuz we don't have all the info
// . see if banned or filtered, etc.
// . at least try to call it. getUrlFilterNum() should
// break out and return -1 if it encounters a filter rule
// that it does not have enough info to answer.
// so if your first X filters all map to a "FILTERED"
// priority and this url matches one of them we can
// confidently toss this guy out.
// . show this for debugging!
// int32_t ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
// false, m_niceness, cr,
// false,//true , // outlink?
// NULL ); // quotatable
// logf(LOG_DEBUG,"build: ufn=%"INT32" for %s",
// ufn,ksr.m_url);
// bad?
//if ( ufn < 0 ) {
// log("build: link %s had bad url filter."
// , ksr.m_url );
// g_errno = EBADENGINEER;
// return NULL;
//}
//int32_t priority = -1;
//if ( ufn >= 0 )
// priority = cr->m_spiderPriorities[ufn];
// debug
if ( g_conf.m_logDebugUrlAttempts || isScraping ) {
// print the tag rec out into sb2
SafeBuf sb2;
if ( gr ) gr->printToBuf ( &sb2 );
// get it
//SafeBuf sb1;
char *action = "add";
if ( isScraping ) action = "scrape";
logf(LOG_DEBUG,
"spider: attempting to %s link. "
"%s "
"tags=%s "
"onpage=%s"
,
action ,
ksr.m_url,
//sb1.getBufStart(),
sb2.getBufStart(),
m_firstUrl.m_url);
}
// do not add if bad priority, SPIDER_PRIORITY_FILTERED, ...
// . mdw: oct 24, 2013. now i add so the urls show up in
// the pagecrawlbot.cpp spiderdb dump, so you can examine
// exactly why a url was crawled or not. plus if you change
// your mind about banning/filtering then it'd be nice to
// have these urls readily available.
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
// linksFiltered++; continue; }
//if ( priority == SPIDER_PRIORITY_BANNED ) {
// linksBanned++; continue; }
// serialize into the buffer
int32_t need = ksr.getRecSize();
// is that what we thought it would be?
//int32_t thought = links->m_linkLens[i] + 1 + hsize;
// sanity check
//if ( need + 12 + 4 > thought ) { char *xx=NULL;*xx=0; }
// sanity check
if ( p + 1 + need > m_pend ) { char *xx=NULL;*xx=0; }
// store the rdbId
if ( m_useSecondaryRdbs ) *p++ = RDB2_SPIDERDB2;
else *p++ = RDB_SPIDERDB;
// print it for debug
if ( isTestColl ) {
SafeBuf tmp;
ksr.print(&tmp);
log("spider: attempting to add outlink "
"%s",tmp.getBufStart());
}
// store the spider rec
gbmemcpy ( p , &ksr , need );
// skip it
p += need;
// count it
numAdded++;
// check domain
//if ( domHash32 == m_domHash32 ) numAddedFromSameDomain++;
if ( ksr.m_sameDom ) numAddedFromSameDomain++;
}
//
// scan through requests and set m_isContacty
//
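	// each record in [m_p,p) is one rdbId byte followed by a
	// variable-size SpiderRequest, so we step by 1 + getRecSize().
	// linkTypes[] and "disqualify" were filled in the loop above,
	// indexed by the same numAdded counter (capped at 2000)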
char *s = m_p;
int32_t k = 0;
for ( ; s < p ; k++ ) {
// advance over rdbid
s++;
// breathe
QUICKPOLL(m_niceness);
// cast
SpiderRequest *ksr = (SpiderRequest *)s;
// set size
size = ksr->getRecSize();
// advance over that
s += size;
// stop if breach
if ( k >= 2000 ) break;
// must be isContacty
if ( ! linkTypes[k] ) continue;
// and not disqualified
if ( disqualify.isInTable(&linkTypes[k] )) continue;
// ok, we are good to go
ksr->m_isContacty = 1;
}
// . this is just how many urls we tried to index
// . move into Spider::addSpiderRequest()
//cr->m_localCrawlInfo.m_urlsHarvested += numAdded;
//cr->m_globalCrawlInfo.m_urlsHarvested += numAdded;
//cr->m_needsSave = true;
// save it
m_numOutlinksAdded = numAdded;
m_numOutlinksAddedValid = true;
m_numOutlinksAddedFromSameDomain = numAddedFromSameDomain;
m_numOutlinksFiltered = linksFiltered;
m_numOutlinksBanned = linksBanned;
// update end of list once we have successfully added all spider recs
m_p = p;
// return current ptr
return m_p ;
}
/*
// add keys/recs from the table into the metalist
bool XmlDoc::addTable96 ( HashTableX *tt1 ,
int32_t date1 ,
bool nosplit ) {
// sanity check
if ( tt1->m_numSlots ) {
if ( tt1->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
}
// docid is handy
int64_t d = *getDocId();
uint8_t f = 0;
if ( nosplit ) f = 0x80;
// use secondary rdbs if repairing
//bool useRdb2 = ( g_repair.isRepairActive() &&
// ! g_repair.m_fullRebuild &&
// ! g_repair.m_removeBadPages );
char rdbId1 = RDB_INDEXDB;
char rdbId2 = RDB_DATEDB;
if ( m_useSecondaryRdbs ) { // useRdb2 ) {
rdbId1 = RDB2_INDEXDB2;
rdbId2 = RDB2_DATEDB2;
}
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// get its key
int64_t *termId1 = (int64_t *)tt1->getKey ( i );
// get the score
uint8_t score1 = score32to8( tt1->getScoreFromSlot(i) );
// sanity check
if ( score1 <= 0 ) { char *xx=NULL;*xx=0; }
// store rdbid
*m_p++ = (rdbId1 | f);
// store it. not a del key.
*(key_t *)m_p=g_indexdb.makeKey(*termId1,score1,d,false);
// skip it
m_p += sizeof(key_t);
// add to datedb?
if ( date1 == -1 ) continue;
// yes
*m_p++ = (rdbId2 | f);
// store it. not a del key.
*(key128_t *)m_p=
g_datedb.makeKey(*termId1,date1,score1,d,false);
// advance over that
m_p += sizeof(key128_t);
}
return true;
}
*/
bool XmlDoc::addTable128 ( HashTableX *tt1 , // T <key128_t,char> *tt1
uint8_t rdbId ,
bool forDelete ) {
// sanity check
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
// store this rdbId into the list
char useRdbId = rdbId;
//if ( useRdb2 && rdbId == RDB_CLUSTERDB ) useRdbId = RDB2_CLUSTERDB2;
if ( useRdb2 && rdbId == RDB_LINKDB ) useRdbId = RDB2_LINKDB2;
if ( useRdb2 && rdbId == RDB_DATEDB ) useRdbId = RDB2_DATEDB2;
if ( useRdb2 && rdbId == RDB_PLACEDB ) useRdbId = RDB2_PLACEDB2;
if ( useRdb2 && rdbId == RDB_SECTIONDB ) useRdbId = RDB2_SECTIONDB2;
// sanity checks
if ( tt1->m_ks != 16 ) { char *xx=NULL;*xx=0; }
if ( rdbId == RDB_PLACEDB ) {
if ( tt1->m_ds != 512 ) { char *xx=NULL;*xx=0; }
}
else if ( rdbId == RDB_SECTIONDB ) {
int32_t svs = sizeof(SectionVote);
if ( tt1->m_ds != svs ) { char *xx=NULL;*xx=0; }
}
else {
if ( tt1->m_ds != 0 ) { char *xx=NULL;*xx=0; }
}
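	// per-slot record layout appended to the metalist below:
	//   [1 byte rdbId][key128_t]                     most rdbs
	//   [1 byte rdbId][key128_t][SectionVote]        sectiondb
	//   [1 byte rdbId][key128_t][dataSize][string\0] placedb
	// when forDelete is true only the bare keys are stored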
int32_t count = 0;
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// breathe
QUICKPOLL(m_niceness);
// get its key
key128_t *k = (key128_t *)tt1->getKey ( i );
// no key is allowed to have the del bit clear at this point
// because we reserve that for making negative keys!
if ( ! ( k->n0 & 0x0000000000000001LL ) ){char*xx=NULL;*xx=0;}
// store rdbid
*m_p++ = useRdbId; // (useRdbId | f);
// store it
// *(key128_t *)m_p = *k; does this work?
gbmemcpy ( m_p , k , sizeof(key128_t) );
// all keys must be positive at this point
if ( ! ( m_p[0] & 0x01 ) ) { char *xx=NULL;*xx=0; }
// or if getting for incremental indexing and this is
// from the "oldList"
//if ( forDelete ) *m_p = *m_p & 0xfe;
// skip key
m_p += sizeof(key128_t);
// count it
count++;
// do not add the data if deleting
if ( forDelete ) continue;
// skip if not sectiondb or placedb
if ( rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue;
// ok test it out (MDW)
//logf(LOG_DEBUG,"doc: UNDO ME!!!!!!!!"); // this below
//if ( count > 1 ) continue;
// get the data value
char *val = (char *)tt1->getValue ( k );
// get the size of the data to store. assume Sectiondb vote.
int32_t ds = sizeof(SectionVote);
		// placedb is special: its data is a string, so include the \0 terminator
if ( rdbId == RDB_PLACEDB ) {
// "ds" is how many bytes we store as data
ds = gbstrlen(val)+1;
// store dataSize first
*(int32_t *)m_p = ds;
// skip it
m_p += 4;
}
// store possible accompanying date of the rdb record
gbmemcpy (m_p,val, ds );
// skip it
m_p += ds;
}
//if(rdbId==RDB_LINKDB ) log("doc: added %"INT32" linkdb keys" ,count);
//if(rdbId==RDB_SECTIONDB ) log("doc: added %"INT32" sectiondb keys",count);
return true;
}
int32_t XmlDoc::getSiteRank ( ) {
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
return ::getSiteRank ( m_siteNumInlinks );
}
// . add keys/recs from the table into the metalist
// . we store the keys into "m_p" unless "buf" is given
bool XmlDoc::addTable144 ( HashTableX *tt1 , int64_t docId , SafeBuf *buf ) {
// sanity check
if ( tt1->m_numSlots ) {
if ( tt1->m_ks != sizeof(key144_t) ) {char *xx=NULL;*xx=0;}
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
}
// assume we are storing into m_p
char *p = m_p;
// reserve space if we had a safebuf and point into it if there
if ( buf ) {
int32_t slotSize = (sizeof(key144_t)+2+sizeof(key128_t));
int32_t need = tt1->getNumSlotsUsed() * slotSize;
if ( ! buf->reserve ( need ) ) return false;
// get cursor into buf, NOT START of buf
p = buf->getBufStart();
}
int32_t siteRank = getSiteRank ();
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
char rdbId = RDB_POSDB;
if ( m_useSecondaryRdbs ) rdbId = RDB2_POSDB2;
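	// for each slot below we copy the 18-byte posdb key as-is and then
	// patch in the docid. if the key's alignment bit is clear it encodes
	// a float (e.g. a gbsortby: price) in the word-position bits, so we
	// just restore that bit and leave siterank/langid at zero to keep
	// the termlist sorted by the float; otherwise we also stamp in the
	// siterank and langid bits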
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// get its key
char *kp = (char *)tt1->getKey ( i );
// store rdbid
*p++ = rdbId; // (rdbId | f);
// store it as is
gbmemcpy ( p , kp , sizeof(key144_t) );
// sanity check
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//h64 &= TERMID_MASK;
//if ( g_posdb.getTermId(kp) == h64 ) {
// log("hey: docid=%"INT64" float=%f",m_docId,
// g_posdb.getFloat(kp) );
//}
/*
// get the score
int32_t score = tt1->getScoreFromSlot ( i ) ;
// set the M-bits to the score. used to accumulate link texts
// that are the same so pages like google.com do not have
// the word 'google' like 1 million times. this should reduce
// our "score" logarithmacly into the 7-bits or whatever.
//
// NO! now we just always increment the distance cursor
// m_dist so there will never be a collision of any posdb
// key we add... so we think
if ( score ) {
int32_t newScore = score;
if ( score >= 65 ) newScore = 65 +(score/100);
//if ( score >= 65+3200) newScore = 65 +(score/100);
if ( newScore > MAXMULTIPLIER )
newScore = MAXMULTIPLIER;
g_posdb.setMultiplierBits(m_p,(unsigned char)newScore);
}
*/
		// the docid bits were zero when we added these keys to the
		// table, so set them now
g_posdb.setDocIdBits ( p , docId );
// if this is a numeric field we do not want to set
// the siterank or langid bits because it will mess up
// sorting by the float which is basically in the position
// of the word position bits.
if ( g_posdb.isAlignmentBitClear ( p ) ) {
// make sure it is set again. it was just cleared
// to indicate that this key contains a float
// like a price or something, and we should not
// set siterank or langid so that its termlist
// remains sorted just by that float
g_posdb.setAlignmentBit ( p , 1 );
}
// otherwise, set the siterank and langid
else {
// this too
g_posdb.setSiteRankBits ( p , siteRank );
// set language here too
g_posdb.setLangIdBits ( p , m_langId );
}
// advance over it
p += sizeof(key144_t);
}
// all done
if ( ! buf ) { m_p = p; return true; }
// update safebuf otherwise
char *start = buf->getBufStart();
// fix SafeBuf::m_length
buf->setLength ( p - start );
// sanity
if ( buf->length() > buf->getCapacity() ) { char *xx=NULL;*xx=0; }
return true;
}
// add keys/recs from the table into the metalist
bool XmlDoc::addTable224 ( HashTableX *tt1 ) {
// sanity check
if ( tt1->m_numSlots ) {
if ( tt1->m_ks != sizeof(key224_t) ) {char *xx=NULL;*xx=0;}
if ( tt1->m_ds != 0 ) {char *xx=NULL;*xx=0;}
}
char rdbId = RDB_LINKDB;
if ( m_useSecondaryRdbs ) rdbId = RDB2_LINKDB2;
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// get its key
char *kp = (char *)tt1->getKey ( i );
// store rdbid
*m_p++ = rdbId; // (rdbId | f);
// store it as is
gbmemcpy ( m_p , kp , sizeof(key224_t) );
// advance over it
m_p += sizeof(key224_t);
}
return true;
}
/*
// . add table into our metalist pointed to by m_p
// . k.n1 = date (see hashWords() below)
// . k.n0 = termId (see hashWords() below)
// . and the value is the score, 32-bits
bool XmlDoc::addTableDate ( HashTableX *tt1 , // T <key128_t,char> *tt1
uint64_t docId ,
uint8_t rdbId ,
bool nosplit ) {
if ( tt1->m_numSlotsUsed == 0 ) return true;
uint8_t f = 0;
if ( nosplit ) f = 0x80;
// sanity check
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
// sanity checks
if ( nosplit ) {
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
}
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
//if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
// sanity checks
if ( tt1->m_ks != 12 ) { char *xx=NULL;*xx=0; }
if ( tt1->m_ds != 4 ) { char *xx=NULL;*xx=0; }
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// breathe
QUICKPOLL(m_niceness);
// get its key
key96_t *k = (key96_t *)tt1->getKey ( i );
// get its value
uint32_t v = *(uint32_t *)tt1->getValueFromSlot ( i );
// convert to 8 bits
v = score32to8 ( v );
// . make the meta list key for datedb
// . a datedb key (see Datedb.h)
key128_t mk = g_datedb.makeKey ( k->n0 , // termId
k->n1 , // date
v , // score (8 bits)
docId ,
false );// del key?
// store rdbid with optional "nosplit" flag
*m_p++ = (rdbId | f);
// store it. it is a del key.
*(key128_t *)m_p = mk;
// skip it
m_p += sizeof(key128_t);
}
return true;
}
*/
/*
// add keys/recs from the table into the metalist
bool XmlDoc::addTable96 ( HashTableX *tt1 ,
HashTableX *tt2 ,
int32_t date1 ,
int32_t date2 ,
bool del ,
bool nosplit ) {
// sanity check
if ( tt1->m_numSlots ) {
if ( tt1->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;}
}
if ( tt2->m_numSlots ) {
if ( tt2->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;}
if ( tt2->m_ds != 4 ) {char *xx=NULL;*xx=0;}
}
// docid is handy
int64_t d = *getDocId();
uint8_t f = 0;
if ( nosplit ) f = 0x80;
// use secondary rdbs if repairing
//bool useRdb2 = ( g_repair.isRepairActive() &&
// ! g_repair.m_fullRebuild &&
// ! g_repair.m_removeBadPages );
char rdbId1 = RDB_INDEXDB;
char rdbId2 = RDB_DATEDB;
if ( m_useSecondaryRdbs ) { // useRdb2 ) {
rdbId1 = RDB2_INDEXDB2;
rdbId2 = RDB2_DATEDB2;
}
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// breathe
QUICKPOLL(m_niceness);
// get its key
int64_t *termId1 = (int64_t *)tt1->getKey ( i );
// get the score
uint8_t score1 = score32to8( tt1->getScoreFromSlot(i) );
// sanity check
if ( score1 <= 0 ) { char *xx=NULL;*xx=0; }
// see if in "tt2"
int32_t slot = tt2->getSlot ( termId1 );
// assume 0
uint8_t score2 = 0;
// look it up in the positive key table
if ( slot >= 0 ) {
score2 = score32to8 ( tt2->getScoreFromSlot(slot) );
// sanity check
if ( score2 <= 0 ) { char *xx=NULL;*xx=0; }
}
// we annihilate!
if ( score1 != score2 ) {
// store rdbid
*m_p++ = (rdbId1 | f);
// store it. it is a del key.
*(key_t *)m_p=g_indexdb.makeKey(*termId1,score1,d,del);
// skip it
m_p += sizeof(key_t);
}
// add to datedb?
if ( date1 == -1 ) continue;
// same dates too?
if ( date1 == date2 && score1 == score2 ) continue;
// yes
*m_p++ = (rdbId2 | f);
// store it. it is a del key.
*(key128_t *)m_p=g_datedb.makeKey(*termId1,date1,score1,d,del);
// advance over that
m_p += sizeof(key128_t);
}
return true;
}
// . add table into our metalist pointed to by m_p
// . k.n1 = date (see hashWords() below)
// . k.n0 = termId (see hashWords() below)
// . and the value is the score, 32-bits
bool XmlDoc::addTableDate ( HashTableX *tt1 , // T <key128_t,char> *tt1
HashTableX *tt2 , // <key128_t,char> *tt2
uint64_t docId ,
uint8_t rdbId ,
bool del ,
bool nosplit ) {
uint8_t f = 0;
if ( nosplit ) f = 0x80;
// sanity check
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
// sanity checks
if ( nosplit ) {
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
}
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
// sanity checks
if ( tt1->m_ks != 12 ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ks != 12 ) { char *xx=NULL;*xx=0; }
if ( tt1->m_ds != 4 ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ds != 4 ) { char *xx=NULL;*xx=0; }
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// breathe
QUICKPOLL(m_niceness);
// get its key
key96_t *k = (key96_t *)tt1->getKey ( i );
// get its value
uint32_t v = *(uint32_t *)tt1->getValueFromSlot ( i );
// convert to 8 bits
v = score32to8 ( v );
// see if in "tt2"
int32_t slot = tt2->getSlot ( k );
// get value if there
if ( slot >= 0 ) {
// get it
uint32_t val =*(uint32_t *)tt2->getValueFromSlot(slot);
// convert to 8 bits
val = score32to8 ( val );
// compare, if same, skip it!
if ( val == v ) continue;
}
// . make the meta list key for datedb
// . a datedb key (see Datedb.h)
key128_t mk = g_datedb.makeKey ( k->n0 , // termId
k->n1 , // date
v , // score (8 bits)
docId ,
del );// del key?
// store rdbid with optional "nosplit" flag
*m_p++ = (rdbId | f);
// store it. it is a del key.
*(key128_t *)m_p = mk;
// skip it
m_p += sizeof(key128_t);
}
return true;
}
bool XmlDoc::addTable128 ( HashTableX *tt1 , // T <key128_t,char> *tt1
HashTableX *tt2 , // <key128_t,char> *tt2
uint8_t rdbId ,
bool del ,
bool nosplit ) {
uint8_t f = 0;
if ( nosplit ) f = 0x80;
// sanity check
if ( rdbId == 0 ) { char *xx=NULL;*xx=0; }
// sanity checks
if ( nosplit ) {
if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; }
if ( rdbId == RDB_DATEDB ) { char *xx=NULL;*xx=0; }
}
bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive();
//if ( g_repair.m_fullRebuild ) useRdb2 = false;
//if ( g_repair.m_removeBadPages ) useRdb2 = false;
if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2;
if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2;
if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2;
// sanity checks
if ( tt1->m_ks != 16 ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ks != 16 ) { char *xx=NULL;*xx=0; }
if ( rdbId == RDB_PLACEDB ) {
if ( tt1->m_ds != 512 ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ds != 512 ) { char *xx=NULL;*xx=0; }
}
else if ( rdbId == RDB_SECTIONDB ) {
int32_t svs = sizeof(SectionVote);
if ( tt1->m_ds != svs ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ds != svs ) { char *xx=NULL;*xx=0; }
}
else {
if ( tt1->m_ds != 0 ) { char *xx=NULL;*xx=0; }
if ( tt2->m_ds != 0 ) { char *xx=NULL;*xx=0; }
}
int32_t count = 0;
// store terms from "tt1" table
for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) {
// skip if empty
if ( tt1->m_flags[i] == 0 ) continue;
// breathe
QUICKPOLL(m_niceness);
// get its key
key128_t *k = (key128_t *)tt1->getKey ( i );
// no key is allowed to have the del bit clear at this point
// because we reserve that for making negative keys!
if ( ! ( k->n0 & 0x0000000000000001LL ) ){char*xx=NULL;*xx=0;}
// see if in "tt2"
int32_t slot = tt2->getSlot ( k );
// . skip if already indexed
// . do not do incremental indexing for sectiondb/placedb since
// it may have the same key but different data!!!!!!!
if ( slot >= 0 &&
rdbId != RDB_SECTIONDB &&
rdbId != RDB_PLACEDB )
continue;
// store rdbid with optional "nosplit" flag
*m_p++ = (rdbId | f);
// store it
// *(key128_t *)m_p = *k; does this work?
gbmemcpy ( m_p , k , sizeof(key128_t) );
// all keys must be positive at this point
if ( ! ( m_p[0] & 0x01 ) ) { char *xx=NULL;*xx=0; }
// clear the del bit if we are an unmatched key and "del"
// is true. we need to be a negative key now
if ( del ) m_p[0] = m_p[0] & 0xfe;
// skip key
m_p += sizeof(key128_t);
// count it
count++;
// skip if not sectiondb or placedb
if ( rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue;
// ok test it out (MDW)
//logf(LOG_DEBUG,"doc: UNDO ME!!!!!!!!"); // this below
//if ( count > 1 ) continue;
// if we were a negative key, do not add a value, even for
// sectiondb
if ( del ) continue;
// get the data value
char *val = (char *)tt1->getValue ( k );
// get the size of the data to store. assume Sectiondb vote.
int32_t ds = sizeof(SectionVote);
// placedb is special even. include the \0 terminator
if ( rdbId == RDB_PLACEDB ) {
// "ds" is how many bytes we store as data
ds = gbstrlen(val)+1;
// store dataSize first
*(int32_t *)m_p = ds;
// skip it
m_p += 4;
}
// store possible accompanying date of the rdb record
gbmemcpy (m_p,val, ds );
// skip it
m_p += ds;
}
//if(rdbId==RDB_LINKDB ) log("doc: added %"INT32" linkdb keys" ,count);
//if(rdbId==RDB_SECTIONDB ) log("doc: added %"INT32" sectiondb keys",count);
return true;
}
*/
//
// . hash terms that are sharded by TERMID not DOCID!!
//
// . returns false and sets g_errno on error
// . these terms are stored in indexdb/datedb, but all terms with the same
// termId reside in one and only one group. whereas normally the records
// are split based on docid and every group gets 1/nth of the termlist.
// . we do this "no splitting" so that only one disk seek is required, and
// we know the termlist is small, or the termlist is being used for spidering
// or parsing purposes and is usually not sent across the network.
bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
//if ( m_pbuf )
// m_pbuf->safePrintf("<h3>Terms which are immune to indexdb "
// "splitting:</h3>");
//if ( m_skipIndexing ) return true;
// this should be ready to go and not block!
int64_t *pch64 = getExactContentHash64();
//int64_t *pch64 = getLooseContentHash64();
if ( ! pch64 || pch64 == (void *)-1 ) { char *xx=NULL;*xx=0; }
	// shortcut
Url *fu = getFirstUrl();
if ( ! hashVectors ( tt ) ) return false;
// constructor should set to defaults automatically
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
// usually we shard by docid, but these are terms we shard by termid!
hi.m_shardByTermId = true;
// for exact content deduping
setStatus ( "hashing gbcontenthash (deduping) no-split keys" );
char cbuf[64];
int32_t clen = sprintf(cbuf,"%"UINT64"",*pch64);
hi.m_prefix = "gbcontenthash";
if ( ! hashString ( cbuf,clen,&hi ) ) return false;
////
//
// let's stop here for now, until other stuff is actually used again
//
////
// let's bring back image thumbnail support for the widget project
//return true;
char *host = fu->getHost ();
//int32_t hlen = fu->getHostLen ();
/*
setStatus ( "hashing no-split qdom keys" );
char *dom = fu->getDomain ();
int32_t dlen = fu->getDomainLen();
// desc is NULL, prefix will be used as desc
hi.m_prefix = "qdom";
if ( ! hashString ( dom,dlen,&hi ) ) return false;
setStatus ( "hashing no-split qhost keys" );
// desc is NULL, prefix will be used as desc
hi.m_prefix = "qhost";
if ( ! hashString ( host,hlen,&hi ) ) return false;
*/
// now hash the site
setStatus ( "hashing no-split SiteGetter terms");
//
// HASH terms for SiteGetter.cpp
//
// these are now no-split terms
//
char *s = fu->getUrl ();
int32_t slen = fu->getUrlLen();
// . this termId is used by SiteGetter.cpp for determining subsites
// . matches what is in SiteGet::getSiteList()
// for www.xyz.com/a/ HASH www.xyz.com
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
bool add = true;
// we only hash this for urls that end in '/'
if ( s[slen-1] != '/' ) add = false;
// and no cgi
if ( fu->isCgi() ) add = false;
// skip if root
if ( fu->m_plen <= 1 ) add = false;
// sanity check
if ( ! m_linksValid ) { char *xx=NULL; *xx=0; }
// . skip if we have no subdirectory outlinks
// . that way we do not confuse all the pages in dictionary.com or
// wikipedia.org as subsites!!
if ( ! m_links.hasSubdirOutlink() ) add = false;
// hash it
if ( add ) {
// remove the last path component
char *end2 = s + slen - 2;
// back up over last component
for ( ; end2 > fu->m_path && *end2 != '/' ; end2-- ) ;
// hash that part of the url
hi.m_prefix = "siteterm";
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
}
//Dates *dp = getDates ();
// hash the clocks into indexdb
//if ( ! dp->hash ( m_docId , tt , this ) ) return false;
// . hash special site/hopcount thing for permalinks
// . used by Images.cpp for doing thumbnails
// . this returns false and sets g_errno on error
// . let's try thumbnails for all...
//if ( ! *getIsPermalink() ) return true;
setStatus ( "hashing no-split gbsitetemplate keys" );
// must be valid
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
char buf[MAX_URL_LEN+20];
//uint32_t th = m_tagVector.getVectorHash();
uint32_t tph = *getTagPairHash32();
// . skip this so we can do site:xyz.com queries
// . but if this is https:// then you will have to
// specify that...
char *site = getSite();
// sanity check, must NOT start with http://
if ( ! strncmp ( site , "http://", 7 ) ) { char *xx=NULL;*xx=0;}
// this must match what we search in Images.cpp::getThumbnail()
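	// e.g. with tph=305419896 and site "xyz.com/" the buffer below is
	// "305419896xyz.com/", indexed as a single term under the
	// "gbsitetemplate" prefix (values illustrative)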
int32_t blen = sprintf(buf,"%"UINT32"%s",tph,site);
// use the prefix as the description if description is NULL
hi.m_prefix = "gbsitetemplate";
//if ( ! hashString ( buf,blen,&hi ) ) return false;
if ( ! hashSingleTerm ( buf,blen,&hi ) ) return false;
setStatus ( "hashing no-split gbimage keys" );
hi.m_prefix = "gbimage";
// hash gbimage: for permalinks only for Images.cpp
for ( int32_t i = 0 ; i < m_images.m_numImages ; i++ ) {
// get the node number
//int32_t nn = m_images.m_imageNodes[i];
// get the url of the image
//XmlNode *xn = m_xml.getNodePtr(nn);
int32_t srcLen;
char *src = m_images.getImageUrl(i,&srcLen);
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
Url *cu = getCurrentUrl();
// we can addwww to normalize since this is for deduping kinda
iu.set ( cu , src , srcLen , true ); // addWWW? yes...
char *u = iu.getUrl ();
int32_t ulen = iu.getUrlLen();
// hash each one
//if ( ! hashString ( u,ulen,&hi ) ) return false;
// hash a single entity
if ( ! hashSingleTerm ( u,ulen,&hi) ) return false;
//log("test: %s",u);
}
return true;
}
// . returns -1 if blocked, returns NULL and sets g_errno on error
// . "sr" is the tagdb Record
// . "ws" store the terms for PageParser.cpp display
char *XmlDoc::hashAll ( HashTableX *table ) {
setStatus ( "hashing document" );
if ( m_allHashed ) return (char *)1;
// sanity checks
if ( table->m_ks != 18 ) { char *xx=NULL;*xx=0; }
if ( table->m_ds != 4 ) { char *xx=NULL;*xx=0; }
if ( m_wts && m_wts->m_ks != 12 ) { char *xx=NULL;*xx=0; }
// ptr to term = 4 + score = 4 + ptr to sec = 4
if ( m_wts && m_wts->m_ds!=sizeof(TermDebugInfo)){char *xx=NULL;*xx=0;}
unsigned char *hc = (unsigned char *)getHopCount();
if ( ! hc || hc == (void *)-1 ) return (char *)hc;
// need this for hashing
HashTableX *cnt = getCountTable();
if ( ! cnt ) return (char *)cnt;
if ( cnt == (void *)-1 ) { char *xx=NULL;*xx=0; }
// and this
//Weights *we = getWeights();
//if ( ! we || we == (void *)-1 ) return (char *)we;
// and this
Links *links = getLinks();
if ( ! links ) return (char *)links;
if ( links == (Links *)-1 ) { char *xx=NULL;*xx=0; }
// and now this
//Synonyms *syn = getSynonyms();
//if ( ! syn || syn == (void *)-1 ) return (char *)syn;
char *wordSpamVec = getWordSpamVec();
if (!wordSpamVec) return (char *)wordSpamVec;
if (wordSpamVec==(void *)-1) {char *xx=NULL;*xx=0;}
char *fragVec = getFragVec();//m_fragBuf.getBufStart();
if ( ! fragVec ) return (char *)fragVec;
if ( fragVec == (void *)-1 ) { char *xx=NULL;*xx=0; }
// why do we need this?
if ( m_wts ) {
uint8_t *lv = getLangVector();
if ( ! lv ) return (char *)lv;
if ( lv == (void *)-1 ) { char *xx=NULL;*xx=0; }
}
TagRec *gr = getTagRec();
if ( ! gr ) return (char *)gr;
if ( gr == (void *)-1 ) {char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// just keep it somewhat sane...
//if ( nw > 30000 ) nw = 30000;
// then each singleton has one phrase, and 1 empty for good hashing
//if ( ! table->setTableSize ( nw * 4 ) )
// return log("build: Could not allocate %"INT32" bytes for table "
// "for indexing document.",
// (nw*4)*(8+sizeof(int32_t)));
/*
const char *help =
"<table><td bgcolor=lightgreen>\n"
"Each document has several associated pieces. Each piece "
"is indexed individually. The pieces are listed below and "
"are preceeded with a table dictating the parameters with "
"which the piece was indexed."
"<br><br>"
"Below that table the actual text of the piece is displayed. "
"Each alphanumeric word in the text has two subscripts of the "
"form <i>X/Y</i> where X and Y are percentage weights on the "
"score of that particular alphanumeric word. X is the weight "
"on the word itself and Y is the weight on the phrase which "
"is started by that word. A weight of 100% "
"indicates a weight which does not affect the score."
"<br><br>"
"Words that are struck out and in a box with a red background "
"instead of light blue are considered to be spam, meaning "
"they are repeated in a pattern. They "
"contain a number in that box which indicates the probability "
"they are spam and 100 minus that probability is weighted "
"with their score to get a new, spam-adjusted score. "
"<br>\n"
"</tr>\n"
"</table>\n"
"</td></table>\n"
"<br><br>\n";
if ( m_pbuf ) m_pbuf->safePrintf("%s",help);
*/
/*
int32_t inlinks = *getSiteNumInlinks();
int32_t boost1 = getBoostFromSiteNumInlinks ( inlinks );
// . now we hard code "boost2"
// . based on # of alnum words
// . this makes us look at keyword density, not just the
// plain keyword count
int32_t naw = m_words.getNumAlnumWords();
// . keep at 100% for up to 200 words then reduce linearly
// . only do this for newer title recs to avoid undeletable data
// . if we have a huge document, it can still contain a very
// relevant paragraph that is dense in the query terms, so
// we really only want to punish enough so the post query
// reranking has some good candidates for doing proximity
// scoring.
// . back off by .90 every 1000 words
float nn = naw;
float bb = 100.0;
while ( nn > 1000 ) {
nn *= .9;
bb *= .9;
}
// never drop below %1
if ( bb < 1.0 ) bb = 1.0;
// set it
int64_t boost2 = (int64_t)bb;
*/
/*
int32_t siteNumInlinks = *getSiteNumInlinks();
if ( m_pbuf )
m_pbuf->safePrintf(
"<table border=1 cellpadding=2>"
"<tr><td>siteNumInlinks</td><td><b>%"INT32"%%</b></td></tr>"
"<tr><td>siteNumInlinksBoost</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>numAlnumWords</td>"
"<td>%"INT32"</td></tr> "
"<tr><td>scoreWeightFromNumAlnumWords"
"</td><td>%"INT32"%%</td></tr>"
"<tr><td>headerWeight</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>urlPathWeight</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>externalLinkTextWeight</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>internalLinkTextWeight</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>conceptWeight</td>"
"<td>%"INT32"%%</td></tr>"
"<tr><td>titleWeight</td>"
"<td>%"INT32"%%</td></tr>"
"</table>"
"<br>"
,
(int32_t)siteNumInlinks,
(int32_t)boost1,
//(int32_t)len,
(int32_t)naw,
(int32_t)boost2,
(int32_t)boost1,
(int32_t)boost2,
//(int32_t)boost1,
(int32_t)m_headerWeight,
(int32_t)m_urlPathWeight,
(int32_t)m_externalLinkTextWeight,
(int32_t)m_internalLinkTextWeight,
(int32_t)m_conceptWeight,
(int32_t)m_titleWeight,
(int32_t)m_titleWeight,
(int32_t)boost1,
(int32_t)boost1,
);
*/
// do not repeat this if the cachedb storage call blocks
m_allHashed = true;
// reset distance cursor
m_dist = 0;
// hash diffbot's json output here
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
/*
if ( *ct == CT_JSON ) { // && m_isDiffbotJSONObject ) {
// hash the content type for type:json query
if ( ! hashContentType ( table ) ) return NULL;
// and the url: query support
if ( ! hashUrl ( table ) ) return NULL;
// language support
if ( ! hashLanguage ( table ) ) return NULL;
// country?
if ( ! hashCountry ( table ) ) return NULL;
if ( ! hashTagRec ( table ) ) return NULL;
// hash for gbsortby:gbspiderdate
if ( ! hashDateNumbers ( table ) ) return NULL;
// has gbhasthumbnail:1 or 0
if ( ! hashImageStuff ( table ) ) return NULL;
// and the json itself
return hashJSON ( table );
}
*/
if ( ! hashContentType ( table ) ) return NULL;
if ( ! hashUrl ( table ) ) return NULL;
if ( ! hashLanguage ( table ) ) return NULL;
if ( ! hashCountry ( table ) ) return NULL;
if ( ! hashSiteNumInlinks( table ) ) return NULL;
if ( ! hashTagRec ( table ) ) return NULL;
if ( ! hashAds ( table ) ) return NULL;
if ( ! hashSubmitUrls ( table ) ) return NULL;
if ( ! hashIsAdult ( table ) ) return NULL;
// has gbhasthumbnail:1 or 0
if ( ! hashImageStuff ( table ) ) return NULL;
// . hash sectionhash:xxxx terms
// . diffbot still needs to hash this for voting info
if ( ! hashSections ( table ) ) return NULL;
// now hash the terms sharded by termid and not docid here since they
// just set a special bit in posdb key so Rebalance.cpp can work.
// this will hash the content checksum which we need for deduping
// which we use for diffbot custom crawls as well.
if ( ! hashNoSplit ( table ) ) return NULL;
// MDW: i think we just inject empty html with a diffbotreply into
// global index now, so don't need this... 9/28/2014
// stop indexing xml docs
bool indexDoc = true;
if ( cr->m_isCustomCrawl ) indexDoc = false;
if ( ! cr->m_indexBody ) indexDoc = false;
// if ( cr->m_isCustomCrawl && m_isDiffbotJSONObject )
// indexDoc = true;
// always index diffbot json objects for GI (custom crawl is false)
if ( m_isDiffbotJSONObject )
indexDoc = true;
// global index unless this is a json object in which case it is
// hashed above in the call to hashJSON(). this will decrease disk
// usage by about half, posdb* files are pretty big.
if ( ! indexDoc ) return (char *)1;
// hash json fields
if ( *ct == CT_JSON ) {
// this hashes both with and without the fieldname
hashJSONFields ( table );
goto skip;
}
// same for xml now, so we can search for field:value like w/ json
if ( *ct == CT_XML ) {
// this hashes both with and without the fieldname
hashXMLFields ( table );
goto skip;
}
// hash the body of the doc first so m_dist is 0 to match
// the rainbow display of sections
if ( ! hashBody2 (table ) ) return NULL;
// hash the title now too so neighborhood singles have more
// to match. plus, we only hash these title terms iff they
// are not already in the hash table, so as to avoid hashing
// repeated title terms because we do not do spam detection
// on them. thus, we need to hash these first before anything
// else. give them triple the body score
if ( ! hashTitle ( table )) return NULL;
// . hash the keywords tag, limited to first 2k of them so far
// . hash above the neighborhoods so the neighborhoods only index
// what is already in the hash table
if ( ! hashMetaKeywords(table ) ) return NULL;
// then hash the incoming link text, NO ANOMALIES, because
// we index the single words in the neighborhoods next, and
// we had songfacts.com coming up for the 'street light facts'
// query because it had a bunch of anomalous inlink text.
if ( ! hashIncomingLinkText(table,false,true)) return NULL;
// then the meta summary and description tags with half the score of
// the body, and only hash a term if was not already hashed above
// somewhere.
if ( ! hashMetaSummary(table) ) return NULL;
skip:
// this will only increment the scores of terms already in the table
	// because the neighborhoods are not technically in the document
// necessarily and we do not want to ruin our precision
if ( ! hashNeighborhoods ( table ) ) return NULL;
if ( ! hashLinks ( table ) ) return NULL;
if ( ! hashDateNumbers ( table ) ) return NULL;
if ( ! hashMetaTags ( table ) ) return NULL;
if ( ! hashMetaZip ( table ) ) return NULL;
if ( ! hashDMOZCategories( table ) ) return NULL;
if ( ! hashCharset ( table ) ) return NULL;
if ( ! hashRSSInfo ( table ) ) return NULL;
if ( ! hashPermalink ( table ) ) return NULL;
// hash gblang:de last for parsing consistency
if ( ! hashLanguageString ( table ) ) return NULL;
// we set this now in hashWords3()
if ( m_doingSEO )
m_wordPosInfoBufValid = true;
// store the m_wordPosInfoBuf into cachedb
// NO! we are not allowed to block in here it messes shit up!!!
//if ( m_doingSEO && ! storeWordPosInfoBufIntoCachedb ( ) )
// return (char *)-1;
// . hash gbkeyword:gbmininlinks where the score is the inlink count
// . the inlink count can go from 1 to 255
// . an ip neighborhood can vote no more than once
// . this is in LinkInfo::hash
//if ( ! hashMinInlinks ( table , linkInfo ) ) return NULL;
if ( ! hashMetaData ( table ) ) return NULL;
// return true if we don't need to print parser info
//if ( ! m_pbuf ) return true;
// print out the table into g_bufPtr now if we need to
//table->print ( );
return (char *)1;
}
// . "inlinks" is # of inlinks to the SITE
// . returns a percentage boost
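// . e.g. a site with 500 inlinks falls into the ">= 400" bucket below
//   and gets a 350% boost; fewer than 10 inlinks stays at 100%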
int32_t XmlDoc::getBoostFromSiteNumInlinks ( int32_t inlinks ) {
// . base on # of site inlinks
// . just hard code this for now
int32_t boost1 = 100;
if ( inlinks >= 10 ) boost1 = 150;
if ( inlinks >= 50 ) boost1 = 200;
if ( inlinks >= 100 ) boost1 = 250;
if ( inlinks >= 200 ) boost1 = 300;
if ( inlinks >= 400 ) boost1 = 350;
if ( inlinks >= 800 ) boost1 = 400;
if ( inlinks >= 1600 ) boost1 = 450;
if ( inlinks >= 3200 ) boost1 = 500;
if ( inlinks >= 6400 ) boost1 = 550;
if ( inlinks >= 12800 ) boost1 = 600;
if ( inlinks >= 25600 ) boost1 = 650;
if ( inlinks >= 51200 ) boost1 = 700;
return boost1;
}
bool XmlDoc::appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) {
// set4() called from the inject sets these two things for meta data
// which is basically json that augments the doc, tags it with stuff
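	// flow below: append the new json metadata to the old doc's
	// metadata, rehash the merged blob into posdb keys via
	// hashMetaData() + addTable144(), rebuild the old doc's titlerec,
	// then push the posdb keys and the updated titlerec onto metaList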
if ( ! m_hasMetadata ) return true;
if ( ! ptr_metadata ) return true;
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod ) { char *xx=NULL;*xx=0; }
if ( pod == (XmlDoc **)-1 ) { char *xx=NULL;*xx=0; }
// this is non-NULL if it existed
XmlDoc *od = *pod;
// wtf?
if ( ! od ) return true;
// dedup. if already in there, do not re-add it
if ( strstr ( od->ptr_metadata , ptr_metadata ) )
return true;
SafeBuf md;
// copy over and append
if ( ! md.safeMemcpy ( od->ptr_metadata , od->size_metadata ) )
return false;
// remove trailing \0 if there
md.removeLastChar ( '\0' );
// separate from the new stuff
if ( ! md.safePrintf(",\n") )
return false;
if ( ! md.safeMemcpy ( ptr_metadata , size_metadata ) )
return false;
if ( ! md.nullTerm ( ) )
return false;
// update his meta data
od->ptr_metadata = md.getBufStart();
od->size_metadata = md.length();
int32_t nw = od->size_metadata * 4;
HashTableX tt1;
int32_t need4 = nw * 4 + 5000;
if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness,"posdb-i2"))
return false;
od->hashMetaData ( &tt1 );
// store the posdb keys from tt1 into our safebuf, tmp
SafeBuf sb;
if ( m_usePosdb && ! addTable144 ( &tt1 , od->m_docId , &sb ) )
return false;
// this could use time axis so that is taken into account
int64_t uh48 = getFirstUrlHash48();
// and re-formulate (and compress) his new title rec
SafeBuf trec;
if ( ! od->setTitleRecBuf ( &trec , od->m_docId , uh48 ) )
return false;
// force the title rec key to be the same
// if ( od->m_titleRecKeyValid && trec.getLength() >= sizeof(key_t) ) {
// char *p = trec.getBufStart();
// *(key_t *)p = od->m_titleRecKey;
// }
// else {
// log("build: old titlerec invalid docid=%"INT64,od->m_docId);
// }
// store the posdb keys in the meta list
if ( m_usePosdb && ! metaList->safeMemcpy ( &sb ) )
return false;
// store the updated titlerec into the meta list
if ( m_useTitledb && ! metaList->pushChar(RDB_TITLEDB) )
return false;
if ( m_useTitledb && ! metaList->safeMemcpy(&trec) )
return false;
m_updatedMetaData = true;
return true;
}
// . this is kinda hacky because it uses a short XmlDoc on the stack
// . no need to hash this stuff for regular documents since all the terms
// are fielded by gberrorstr, gberrornum or gbisreply.
// . normally we might use a separate xmldoc class for this but i wanted
// something more lightweight
SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ,
bool forDelete ) {
// set status for this
setStatus ( "getting spider reply meta list");
if ( m_spiderStatusDocMetaListValid )
return &m_spiderStatusDocMetaList;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! cr->m_indexSpiderReplies || forDelete ) {
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
// if docid based do not hash a spider reply. docid-based spider
// requests are added to spiderdb from the query reindex tool.
// do not do for diffbot subdocuments either, usespiderdb should be
// false for those.
// MDW: i disagree, i want to see when these get updated! 9/6/2014
// ok, let's index for diffbot objects so we can see if they are
// a dup of another diffbot object, or so we can see when they get
// revisted, etc.
//if ( m_setFromDocId || ! m_useSpiderdb ) {
if ( ! m_useSpiderdb && ! m_isDiffbotJSONObject ) {
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
// do not add a status doc if doing a query delete on a status doc
if ( m_contentTypeValid && m_contentType == CT_STATUS ) {
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
// doing it for diffbot throws off smoketests
// ok, smoketests are updated now, so remove this
// if ( strncmp(cr->m_coll,"crawlbottesting-",16) == 0 ) {
// m_spiderStatusDocMetaListValid = true;
// return &m_spiderStatusDocMetaList;
// }
// we double add regular html urls in a query reindex because the
// json url adds the parent, so the parent gets added twice sometimes,
// and for some reason it is adding a spider status doc the 2nd time
// so cut that out. this is kinda a hack b/c i'm not sure what's
// going on. but you can set a break point here and see what's up if
// you want.
// MDW: likewise, take this out, i want these recorded as well..
// if ( m_indexCodeValid && m_indexCode == EDOCFORCEDELETE ) {
// m_spiderStatusDocMetaListValid = true;
// return &m_spiderStatusDocMetaList;
// }
// . fake this out so we do not core
// . hashWords3() uses it i guess
bool forcedLangId = false;
if ( ! m_langIdValid ) {
forcedLangId = true;
m_langIdValid = true;
m_langId = langUnknown;
}
// prevent more cores
bool forcedSiteNumInlinks = false;
if ( ! m_siteNumInlinksValid ) {
forcedSiteNumInlinks = true;
m_siteNumInlinks = 0;
m_siteNumInlinksValid = true;
}
SafeBuf *mbuf = getSpiderStatusDocMetaList2 ( reply );
if ( forcedLangId )
m_langIdValid = false;
if ( forcedSiteNumInlinks ) {
m_siteNumInlinksValid = false;
}
return mbuf;
}
// . the spider status doc
// . TODO:
// usedProxy:1
// proxyIp:1.2.3.4
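// . the status doc hashed below is a small json blob roughly like this
//   (values illustrative):
//   { "type":"status", "gbssUrl":"http://example.com/page",
//     "gbssStatusCode":0, "gbssStatusMsg":"Success",
//     "gbssDomain":"example.com", "gbssSubdomain":"www.example.com",
//     "gbssHopCount":1, "gbssSpiderTime":1400000000, ... }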
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {
setStatus ( "making spider reply meta list");
// . we also need a unique docid for indexing the spider *reply*
// as a separate document
// . use the same url, but use a different docid.
// . use now to mix it up
//int32_t now = getTimeGlobal();
//int64_t h = hash64(m_docId, now );
// to keep qa test consistent this docid should be consistent
// so base it on spidertime of parent doc.
// if doc is being force deleted then this is invalid!
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
int64_t h = hash64(m_docId, m_spideredTime );
// mask it out
int64_t d = h & DOCID_MASK;
// try to get an available docid, preferring "d" if available
int64_t *uqd = getAvailDocIdOnly ( d );
if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd;
m_addedStatusDocId = *uqd;
// unsigned char *hc = (unsigned char *)getHopCount();
// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;
int32_t tmpVal = -1;
int32_t *priority = &tmpVal;
int32_t *ufn = &tmpVal;
// prevent a core if sreq is not valid, these will freak out
// diffbot replies may not have a valid m_sreq
if ( m_sreqValid ) {
priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 )
return (SafeBuf *)priority;
ufn = getUrlFilterNum();
if ( ! ufn || ufn == (void *)-1 )
return (SafeBuf *)ufn;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
Json *jp1 = NULL;
// i've seen ptr_utf8Content NULL and content type as html for
// some reason when deleting a diffbot object doc so check for that
// here and forget it. we don't want getParsedJson() to core.
if ( m_isDiffbotJSONObject &&
m_contentType == CT_JSON &&
m_contentTypeValid ) {
jp1 = getParsedJson();
if ( ! jp1 || jp1 == (void *)-1) return (SafeBuf *)jp1;
}
// sanity
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
// why isn't gbhopcount: being indexed consistently?
//if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
// reset just in case
m_spiderStatusDocMetaList.reset();
// sanity
if ( *uqd <= 0 || *uqd > MAX_DOCID ) {
log("xmldoc: avail docid = %"INT64". could not index spider "
"reply or %s",*uqd,m_firstUrl.m_url);
//char *xx=NULL;*xx=0; }
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
// the old doc
XmlDoc *od = NULL;
if ( m_oldDocValid && m_oldDoc ) od = m_oldDoc;
Url *fu = &m_firstUrl;
// . make a little json doc that we'll hash up
// . only index the fields in this doc, no extra gbdocid: inurl:
// hash terms
SafeBuf jd;
jd.safePrintf("{\n");
// so type:status query works
jd.safePrintf("\"type\":\"status\",\n");
jd.safePrintf("\"gbssUrl\":\"%s\",\n" , fu->getUrl() );
if ( ptr_redirUrl )
jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
ptr_redirUrl);
if ( m_indexCodeValid ) {
jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
jd.safePrintf("\"gbssStatusMsg\":\"");
jd.jsonEncode (mstrerror(m_indexCode));
jd.safePrintf("\",\n");
}
else {
jd.safePrintf("\"gbssStatusCode\":-1,\n");
jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
}
if ( m_httpStatusValid )
jd.safePrintf("\"gbssHttpStatus\":%"INT32",\n",
(int32_t)m_httpStatus);
// do not index gbssIsSeedUrl:0 because there will be too many usually
bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
if ( isSeed )
jd.safePrintf("\"gbssIsSeedUrl\":1,\n");
if ( od )
jd.safePrintf("\"gbssWasIndexed\":1,\n");
else
jd.safePrintf("\"gbssWasIndexed\":0,\n");
int32_t now = getTimeGlobal();
if ( od )
jd.safePrintf("\"gbssAgeInIndex\":"
"%"UINT32",\n",now - od->m_spideredTime);
if ( m_isDiffbotJSONObject ) { // && cr->m_isCustomCrawl
jd.safePrintf("\"gbssIsDiffbotObject\":1,\n");
JsonItem *jsonItem = NULL;
if ( jp1 ) jsonItem = jp1->getItem("diffbotUri");
if ( jsonItem ) {
jd.safePrintf("\"gbssDiffbotUri\":\"");
int32_t vlen;
char *val = jsonItem->getValueAsString( &vlen );
if ( val ) jd.safeMemcpy ( val , vlen );
jd.safePrintf("\",\n");
}
else
jd.safePrintf("\"gbssDiffbotUri\":"
"\"none\",\n");
// show the type as gbssDiffbotType:"article" etc.
JsonItem *dti = NULL;
if ( jp1 )
dti = jp1->getItem("type");
if ( dti ) {
jd.safePrintf("\"gbssDiffbotType\":\"");
int32_t vlen;
char *val = dti->getValueAsString( &vlen );
if ( val ) jd.jsonEncode ( val , vlen );
jd.safePrintf("\",\n");
}
}
else { // if ( cr->m_isCustomCrawl ) {
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
}
jd.safePrintf("\"gbssDomain\":\"");
jd.safeMemcpy(fu->getDomain(), fu->getDomainLen() );
jd.safePrintf("\",\n");
jd.safePrintf("\"gbssSubdomain\":\"");
jd.safeMemcpy(fu->getHost(), fu->getHostLen() );
jd.safePrintf("\",\n");
//if ( m_redirUrlPtr && m_redirUrlValid )
//if ( m_numRedirectsValid )
jd.safePrintf("\"gbssNumRedirects\":%"INT32",\n",m_numRedirects);
if ( m_docIdValid )
jd.safePrintf("\"gbssDocId\":%"INT64",\n", m_docId);//*uqd);
if ( m_parentDocPtr && m_isChildDoc && m_parentDocPtr->m_docIdValid )
jd.safePrintf("\"gbssParentDocId\":%"INT64",\n",
m_parentDocPtr->m_docId);
if ( m_hopCountValid )
//jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)*hc);
jd.safePrintf("\"gbssHopCount\":%"INT32",\n",(int32_t)m_hopCount);
// crawlbot round
if ( cr->m_isCustomCrawl )
jd.safePrintf("\"gbssCrawlRound\":%"INT32",\n",
cr->m_spiderRoundNum);
// for -diffbotxyz fake docs addedtime is 0
if ( m_sreqValid && m_sreq.m_discoveryTime != 0 ) {
// in Spider.cpp we try to set m_sreq's m_addedTime to the
// min of all the spider requests, and we try to ensure
// that in the case of deduping we preserve the one with
// the oldest time. no, now we actually use
// m_discoveryTime since we were using m_addedTime in
// the url filters as it was originally intended.
jd.safePrintf("\"gbssDiscoveredTime\":%"INT32",\n",
m_sreq.m_discoveryTime);
}
if ( m_isDupValid && m_isDup )
jd.safePrintf("\"gbssDupOfDocId\":%"INT64",\n",
m_docIdWeAreADupOf);
// how many spiderings were successful vs. failed
// these don't work because we only store one reply
// which overwrites any older reply. that's how the
// key is. we can change the key to use the timestamp
// and not parent docid in makeKey() for spider
// replies later.
// if ( m_sreqValid ) {
// jd.safePrintf("\"gbssPrevTotalNumIndexAttempts\":%"INT32",\n",
// m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
// jd.safePrintf("\"gbssPrevTotalNumIndexSuccesses\":%"INT32",\n",
// m_sreq.m_reservedc1);
// jd.safePrintf("\"gbssPrevTotalNumIndexFailures\":%"INT32",\n",
// m_sreq.m_reservedc2);
// }
if ( m_spideredTimeValid )
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",
m_spideredTime);
else
jd.safePrintf("\"gbssSpiderTime\":%"INT32",\n",0);
if ( m_firstIndexedDateValid )
jd.safePrintf("\"gbssFirstIndexed\":%"UINT32",\n",
m_firstIndexedDate);
if ( m_contentHash32Valid )
jd.safePrintf("\"gbssContentHash32\":%"UINT32",\n",
m_contentHash32);
// so we know what hostid spidered the url. this is not the
// same hostid that will store it necessarily
jd.safePrintf("\"gbssSpideredByHostId\":%"INT32",\n",
(int32_t)g_hostdb.getMyHostId());
// which shard will store the titlerec and index terms? it
// is based on docid.
if ( m_docIdValid ) {
int32_t shardNum = getShardNumFromDocId ( m_docId );
jd.safePrintf("\"gbssStoredOnShard\":%"INT32",\n",shardNum);
}
if ( m_downloadStartTimeValid && m_downloadEndTimeValid ) {
jd.safePrintf("\"gbssDownloadStartTimeMS\":%"INT64",\n",
m_downloadStartTime);
jd.safePrintf("\"gbssDownloadEndTimeMS\":%"INT64",\n",
m_downloadEndTime);
int64_t took = m_downloadEndTime - m_downloadStartTime;
jd.safePrintf("\"gbssDownloadDurationMS\":%"INT64",\n",took);
jd.safePrintf("\"gbssDownloadStartTime\":%"UINT32",\n",
(uint32_t)(m_downloadStartTime/1000));
jd.safePrintf("\"gbssDownloadEndTime\":%"UINT32",\n",
(uint32_t)(m_downloadEndTime/1000));
}
jd.safePrintf("\"gbssUsedRobotsTxt\":%"INT32",\n",
m_useRobotsTxt);
if ( m_linksValid )
jd.safePrintf("\"gbssNumOutlinksOnPage\":%"INT32",\n",
(int32_t)m_links.getNumLinks());
//if ( m_numOutlinksAddedValid )
// crap, this is not right because we only call addOutlinksToMetaList()
// after we call this function.
// jd.safePrintf("\"gbssNumOutlinksAdded\":%"INT32",\n",
// (int32_t)m_numOutlinksAdded);
// how many download/indexing errors we've had, including this one
// if applicable.
if ( m_srepValid )
jd.safePrintf("\"gbssConsecutiveErrors\":%"INT32",\n",
m_srep.m_errCount);
else
jd.safePrintf("\"gbssConsecutiveErrors\":%"INT32",\n",0);
if ( m_ipValid )
jd.safePrintf("\"gbssIp\":\"%s\",\n",iptoa(m_ip));
else
jd.safePrintf("\"gbssIp\":\"0.0.0.0\",\n");
if ( m_ipEndTime ) {
int64_t took = m_ipEndTime - m_ipStartTime;
jd.safePrintf("\"gbssIpLookupTimeMS\":%"INT64",\n",took);
}
if ( m_siteNumInlinksValid ) {
jd.safePrintf("\"gbssSiteNumInlinks\":%"INT32",\n",
(int32_t)m_siteNumInlinks);
char siteRank = getSiteRank();
jd.safePrintf("\"gbssSiteRank\":%"INT32",\n",
(int32_t)siteRank);
}
jd.safePrintf("\"gbssContentInjected\":%"INT32",\n",
(int32_t)m_contentInjected);
if ( m_percentChangedValid && od )
jd.safePrintf("\"gbssPercentContentChanged\""
":%.01f,\n",
m_percentChanged);
if ( ! m_isDiffbotJSONObject )
jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n",
*priority);
// this could be -1, careful
if ( *ufn >= 0 && ! m_isDiffbotJSONObject )
jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n",
cr->m_regExs[*ufn].getBufStart());
// we forced the langid valid above
if ( m_langIdValid && m_contentLen )
jd.safePrintf("\"gbssLanguage\":\"%s\",\n",
getLangAbbr(m_langId));
if ( m_contentTypeValid && m_contentLen )
jd.safePrintf("\"gbssContentType\":\"%s\",\n",
g_contentTypeStrings[m_contentType]);
if ( m_contentValid )
jd.safePrintf("\"gbssContentLen\":%"INT32",\n",
m_contentLen);
if ( m_isContentTruncatedValid )
jd.safePrintf("\"gbssIsContentTruncated\":%"INT32",\n",
(int32_t)m_isContentTruncated);
// do not show the -1 any more, just leave it out then
// to make things look prettier
if ( m_crawlDelayValid && m_crawlDelay >= 0 &&
! m_isDiffbotJSONObject )
// -1 if none?
jd.safePrintf("\"gbssCrawlDelayMS\":%"INT32",\n",
(int32_t)m_crawlDelay);
// was this url ever sent to diffbot either now or at a previous
// spider time?
if ( ! m_isDiffbotJSONObject ) {
jd.safePrintf("\"gbssSentToDiffbotAtSomeTime\":%i,\n",
(int)m_sentToDiffbot);
// sent to diffbot?
jd.safePrintf("\"gbssSentToDiffbotThisTime\":%i,\n",
(int)m_sentToDiffbotThisTime);
}
// page must have been downloaded for this one
if ( cr->m_isCustomCrawl &&
m_utf8ContentValid &&
! m_isDiffbotJSONObject &&
m_content &&
m_contentValid &&
cr->m_diffbotPageProcessPattern.getBufStart() &&
cr->m_diffbotPageProcessPattern.getBufStart()[0] ) {
char match = doesPageContentMatchDiffbotProcessPattern();
jd.safePrintf("\"gbssMatchesPageProcessPattern\":%i,\n",
(int)match);
}
if ( cr->m_isCustomCrawl && m_firstUrlValid && !m_isDiffbotJSONObject){
char *url = getFirstUrl()->getUrl();
// the crawl regex
int match = 1;
regex_t *ucr = &cr->m_ucr;
if ( ! cr->m_hasucr ) ucr = NULL;
if ( ucr && regexec(ucr,url,0,NULL,0) ) match = 0;
if ( ucr )
jd.safePrintf("\"gbssMatchesUrlCrawlRegEx\":%i,\n",
match);
// now the substring pattern
match = 1;
char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
if ( ucp && ! doesStringContainPattern(url,ucp) ) match = 0;
if ( ucp )
jd.safePrintf("\"gbssMatchesUrlCrawlPattern\":%i,\n",
match);
// now process regex
match = 1;
regex_t *upr = &cr->m_upr;
if ( ! cr->m_hasupr ) upr = NULL;
if ( upr && regexec(upr,url,0,NULL,0) ) match = 0;
if ( upr )
			jd.safePrintf("\"gbssMatchesUrlProcessRegEx\":%i,\n",
match);
// now process pattern
match = 1;
char *upp = cr->m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
if ( upp && ! doesStringContainPattern(url,upp) ) match = 0;
if ( upp )
jd.safePrintf("\"gbssMatchesUrlProcessPattern\":%i,\n",
match);
}
if ( m_diffbotReplyValid && m_sentToDiffbotThisTime &&
! m_isDiffbotJSONObject ) {
jd.safePrintf("\"gbssDiffbotReplyCode\":%"INT32",\n",
m_diffbotReplyError);
jd.safePrintf("\"gbssDiffbotReplyMsg\":\"");
jd.jsonEncode(mstrerror(m_diffbotReplyError));
jd.safePrintf("\",\n");
jd.safePrintf("\"gbssDiffbotReplyLen\":%"INT32",\n",
m_diffbotReply.length());
int64_t took = m_diffbotReplyEndTime - m_diffbotReplyStartTime;
jd.safePrintf("\"gbssDiffbotReplyResponseTimeMS\":%"INT64",\n",
took );
jd.safePrintf("\"gbssDiffbotReplyRetries\":%"INT32",\n",
m_diffbotReplyRetries );
// this is not correct at this point we haven't parsed the json
// jd.safePrintf("\"gbssDiffbotReplyNumObjects\":%"INT32",\n",
// m_diffbotJSONCount);
}
// remove last ,\n
jd.incrementLength(-2);
// end the json spider status doc
jd.safePrintf("\n}\n");
// BEFORE ANY HASHING
int32_t savedDist = m_dist;
// add the index list for it. it returns false and sets g_errno on err
// otherwise it sets m_spiderStatusDocMetaList
if ( ! setSpiderStatusDocMetaList ( &jd , *uqd ) )
return NULL;
// now make the titlerec
char xdhead[2048];
// just the head of it. this is the hacky part.
XmlDoc *xd = (XmlDoc *)xdhead;
// clear it out
memset ( xdhead, 0 , 2048);
// copy stuff from THIS so the spider reply "document" has the same
// header info stuff
int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
if ( hsize > 2048 ) { char *xx=NULL;*xx=0; }
gbmemcpy ( xdhead , (char *)this , hsize );
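	// note: hsize covers only the fixed-size header (everything before
	// ptr_firstUrl); the ptr_*/size_* members are assigned by hand below
	// so this fake doc shares our url/site but uses the json above as
	// its content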
// override spider time in case we had error to be consistent
// with the actual SpiderReply record
//xd->m_spideredTime = reply->m_spideredTime;
//xd->m_spideredTimeValid = true;
// sanity
//if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;}
// this will cause the maroon box next to the search result to
// say "STATUS" similar to "PDF" "DOC" etc.
xd->m_contentType = CT_STATUS;
int32_t fullsize = &m_dummyEnd - (char *)this;
if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; }
/*
// the ptr_* were all zero'd out, put the ones we want to keep back in
SafeBuf tmp;
// was "Spider Status: %s" but that is unnecessary
tmp.safePrintf("<title>%s</title>",
mstrerror(m_indexCode));
// if we are a dup...
if ( m_indexCode == EDOCDUP )
tmp.safePrintf("Dup of docid %"INT64"<br>", m_docIdWeAreADupOf );
if ( m_redirUrlPtr && m_redirUrlValid )
tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
*/
// put stats like we log out from logIt
//tmp.safePrintf("<div style=max-width:800px;>\n");
// store log output into doc
//logIt(&tmp);
//tmp.safePrintf("\n</div>");
// the content is just the title tag above
// xd->ptr_utf8Content = tmp.getBufStart();
// xd->size_utf8Content = tmp.length()+1;
xd->ptr_utf8Content = jd.getBufStart();
xd->size_utf8Content = jd.length()+1;
// keep the same url as the doc we are the spider reply for
xd->ptr_firstUrl = ptr_firstUrl;
xd->size_firstUrl = size_firstUrl;
// serps need site, otherwise search results core
xd->ptr_site = ptr_site;
xd->size_site = size_site;
// if this is null then ip lookup failed i guess so just use
// the subdomain
if ( ! ptr_site && m_firstUrlValid ) {
xd->ptr_site = m_firstUrl.getHost();
xd->size_site = m_firstUrl.getHostLen();
}
// we can't do this the head is not big enough
// xd->m_collnum = m_collnum;
// xd->m_collnumValid = m_collnumValid;
	// use the same uh48 as our parent
int64_t uh48 = m_firstUrl.getUrlHash48();
// then make into a titlerec but store in metalistbuf, not m_titleRec
SafeBuf titleRecBuf;
// this should not include ptrs that are NULL when compressing
// using its m_internalFlags1
if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
return NULL;
// concat titleRec to our posdb key records
if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
return NULL;
if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
return NULL;
// return the right val
m_dist = savedDist;
// ok, good to go, ready to add to posdb and titledb
m_spiderStatusDocMetaListValid = true;
return &m_spiderStatusDocMetaList;
}
bool XmlDoc::setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t uqd ) {
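	// overview: parse the json doc we just built back into JsonItems,
	// hash each gbss* field into a temporary posdb table (tt4), then
	// serialize that table into posdb records for docid uqd using
	// addTable144() below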
// the posdb table
HashTableX tt4;
if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx"))
return false;
Json jp2;
if (! jp2.parseJsonStringIntoJsonItems (jd->getBufStart(),m_niceness)){
g_errno = EBADJSONPARSER;
return false;
}
// re-set to 0
m_dist = 0;
// hash like gbstatus:"Tcp Timed out" or gbstatus:"Doc unchanged"
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = &tt4;
hi.m_desc = "json spider status object";
hi.m_useCountTable = false;
hi.m_useSections = false;
// fill up tt4. false -> do not hash without field prefixes.
hashJSONFields2 ( &tt4 , &hi , &jp2 , false );
/*
char buf[64];
int32_t bufLen;
// hash 'type:status' similar to 'type:json' etc.
hi.m_prefix = "type";
if ( ! hashString("status" , &hi ) ) return NULL;
// . hash gbstatus:0 for no error, otherwise the error code
// . this also hashes it as a number so we don't have to
// . so we can do histograms on this #
hi.m_prefix = "gbstatus";
hi.m_desc = "spider error number as string";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_indexCode );
if ( ! hashString( buf , &hi ) ) return NULL;
*/
/*
logf(LOG_DEBUG,"url: %s",m_firstUrl.m_url);
logf(LOG_DEBUG,"hashing indexcode=%"INT32"",m_indexCode);
bool ok = false;
if ( m_indexCode ) ok = true;
// scan the keys in tt and make sure the termid fo
addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList );
int32_t recSize = 0;
int32_t rcount = 0;
char *p = m_spiderStatusDocMetaList.getBufStart();
char *pend =m_spiderStatusDocMetaList.getBuf();
for ( ; p < pend ; p += recSize ) {
// get rdbid, RDB_POSDB
uint8_t rdbId = *p & 0x7f;
// skip
p++;
// get key size
int32_t ks = getKeySizeFromRdbId ( rdbId );
// init this
int32_t recSize = ks;
// convert into a key128_t, the biggest possible key
//key224_t k ;
char k[MAX_KEY_BYTES];
if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; }
//k.setMin();
gbmemcpy ( &k , p , ks );
// is it a negative key?
char neg = false;
if ( ! ( p[0] & 0x01 ) ) neg = true;
// this is now a bit in the posdb key so we can rebalance
char shardByTermId = false;
if ( rdbId==RDB_POSDB && g_posdb.isShardedByTermId(k))
shardByTermId = true;
// skip it
p += ks;
// . always zero if key is negative
// . this is not the case unfortunately...
if ( neg ) {char *xx=NULL;*xx=0; }
// print dbname
if ( rdbId != RDB_POSDB ) { char *xx=NULL;*xx=0; }
// get termid et al
key144_t *k2 = (key144_t *)k;
int64_t tid = g_posdb.getTermId(k2);
log("db: tid=%"INT64"",tid);
if ( tid == 199947062354729LL ) ok = true;
//if ( m_indexCode == 0 && tid != 199947062354729LL ) {
// char *xx=NULL;*xx=0; }
}
if ( ! ok ) { char *xx=NULL;*xx=0; }
goto SKIP;
// was here....
*/
/*
// gbstatus:"tcp timed out"
hi.m_prefix = "gbstatusmsg";
hi.m_desc = "spider error msg";
if ( ! hashString( mstrerror(m_indexCode) , &hi ) ) return NULL;
//hi.m_prefix = "gbdocid";
//hi.m_desc = "docid";
//bufLen = sprintf ( buf , "%"UINT64"", *uqd ) ;
//if ( ! hashString( buf , &hi ) ) return NULL;
// . then the url. url: site: ip: etc. terms
// . do NOT hash non-fielded terms so we do not get "status"
	//   results polluting the serps => false
if ( ! hashUrl ( &tt4 , true ) ) return NULL;
// false --> do not hash the gbdoc* terms (CT_STATUS)
hashDateNumbers ( &tt4 , true );
*/
// store keys in safebuf then to make our own meta list
addTable144 ( &tt4 , uqd , &m_spiderStatusDocMetaList );
// debug this shit
//SafeBuf tmpsb;
//printMetaList ( m_spiderStatusDocMetaList.getBufStart() ,
// m_spiderStatusDocMetaList.getBuf(),
// &tmpsb );
//logf(LOG_DEBUG,"%s\n",tmpsb.getBufStart());
return true;
}
// returns false and sets g_errno on error
bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
setStatus ( "hashing meta tags" );
// assume it's empty
char buf [ 32*1024 ];
int32_t bufLen = 32*1024 - 1;
buf[0] = '\0';
int32_t n = m_xml.getNumNodes();
XmlNode *nodes = m_xml.getNodes();
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INMETATAG;
hi.m_tt = tt;
hi.m_desc = "custom meta tag";
	// scan all the nodes and hash each generic <meta name=...> tag
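	// e.g. <meta name="myfield" content="Bob Smith"> gets hashed below
	// with prefix "myfield", so a query like myfield:bob should match
	// this doc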
for ( int32_t i = 0 ; i < n ; i++ ) {
// continue if not a meta tag
if ( nodes[i].m_nodeId != 68 ) continue;
// only get content for <meta name=..> not <meta http-equiv=..>
int32_t tagLen;
char *tag = m_xml.getString ( i , "name" , &tagLen );
char *tptr = tag;
char tagLower[128];
int32_t j ;
int32_t code;
// skip if empty
if ( ! tag || tagLen <= 0 ) continue;
// make tag name lower case and do not allow bad chars
if ( tagLen > 126 ) tagLen = 126 ;
to_lower3_a ( tag , tagLen , tagLower );
for ( j = 0 ; j < tagLen ; j++ ) {
// bail if has unacceptable chars
if ( ! is_alnum_a ( tag[j] ) &&
tag[j] != '-' &&
tag[j] != '_' &&
tag[j] != '.' ) break;
// convert to lower
tagLower[j] = to_lower_a ( tag[j] );
}
// skip this meta if had unacceptable chars
if ( j < tagLen ) continue;
// is it recognized?
code = getFieldCode ( tag , tagLen );
// after version 45 or more, do not allow gbrss
// meta tags, because those are now reserved for us
if ( code == FIELD_GBRSS ) continue;
		// allow gbrss: fields for earlier versions though
		// (note: unreachable given the unconditional continue above)
		if ( code == FIELD_GBRSS ) code = FIELD_GENERIC;
// . do not allow reserved tag names
// . title,url,suburl,
if ( code != FIELD_GENERIC ) continue;
// this is now reserved
// do not hash keyword, keywords, description, or summary metas
// because that is done in hashRange() below based on the
// tagdb (ruleset) record
if ((tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)||
(tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)||
(tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)||
(tagLen==11&&strncasecmp(tag,"description",11)== 0) )
continue;
// . don't allow reserved names: site, url, suburl, link and ip
// . actually, the colon is included as part of those
// field names, so we really lucked out...!
// . index this converted tag name
tptr = tagLower;
// get the content
int32_t len;
char *s = m_xml.getString ( i , "content" , &len );
if ( ! s || len <= 0 ) continue;
// . ensure not too big for our buffer (keep room for a \0)
// . TODO: this is wrong, should be len+1 > bufLen,
// but can't fix w/o resetting the index (COME BACK HERE
// and see where we index meta tags besides this place!!!)
// remove those other places, except... what about keywords
// and description?
if ( len+1 >= bufLen ) {
//len = bufLen - 1;
// assume no punct to break on!
len = 0;
			// only cut off at punctuation; scan just the
			// portion that will fit in the buffer
			char *p = s;
			char *pend = s + bufLen - 1;
char *last = NULL;
int32_t size ;
for ( ; p < pend ; p += size ) {
// skip if utf8 char
size = getUtf8CharSize(*p);
// skip if 2+ bytes
if ( size > 1 ) continue;
// skip if not punct
if ( is_alnum_a(*p) ) continue;
// mark it
last = p;
}
if ( last ) len = last - s;
// this old way was faster...:
//while ( len > 0 && is_alnum(s[len-1]) ) len--;
}
// convert html entities to their chars
len = saftenTags ( buf , bufLen , s , len );
// NULL terminate the buffer
buf[len] = '\0';
// temp null term
char c = tptr[tagLen];
tptr[tagLen] = 0;
// custom
hi.m_prefix = tptr;
// desc is NULL, prefix will be used as desc
bool status = hashString ( buf,len,&hi );
// put it back
tptr[tagLen] = c;
// bail on error, g_errno should be set
if ( ! status ) return false;
// return false with g_errno set on error
//if ( ! hashNumber ( buf , bufLen , &hi ) )
// return false;
}
return true;
}
bool XmlDoc::hashMetaData ( HashTableX *tt ) {
if ( ! ptr_metadata || !ptr_metadata[0] ) return true;
Json jp;
if ( ! jp.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)) {
log("XmlDoc had error parsing json in metadata %s",
ptr_metadata);
return false;
}
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INMETATAG;
hi.m_tt = tt;
hi.m_desc = "meta data";
hi.m_useCountTable = false;
// always reset to word pos to 0 now when hashing a json field
// since it shouldn't matter because they are in a field so we
// have to search like myfield:whatever. this way we can
// augment ptr_metadata on an EDOCUNCHANGED error and
// not end up with undeleteable data in posdb. if we have
// duplicate fields in our doc and our doc is json, we could have
// some word position conflicts, which kinda sucks, but can be
	// avoided because this is HASHGROUP_INMETATAG, but should really
// be HASHGROUP_INMETADATA just to be sure.
int32_t saved = m_dist;
m_dist = 0;
hashJSONFields2 ( tt , &hi , &jp , false );
m_dist = saved;
return true;
}
// slightly greater than m_spideredTime, which is the download time.
// we use this for sorting as well, like for the widget so things
// don't really get added out of order and not show up in the top spot
// of the widget list.
int32_t XmlDoc::getIndexedTime() {
if ( m_indexedTimeValid ) return m_indexedTime;
m_indexedTime = getTimeGlobal();
return m_indexedTime;
}
// . hash dates for sorting by using gbsortby: and gbrevsortby:
// . do 'gbsortby:gbspiderdate' as your query to see this in action
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
	// bail if the spider time was never set
if ( ! m_spideredTimeValid ) return true;
int32_t indexedTime = getIndexedTime();
// first the last spidered date
HashInfo hi;
hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field
hi.m_tt = tt;
hi.m_desc = "last spidered date";
hi.m_prefix = "gbspiderdate";
char buf[64];
int32_t bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_spideredTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
// and index time is >= spider time, so you want to sort by that for
// the widget for instance
hi.m_desc = "last indexed date";
hi.m_prefix = "gbindexdate";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)indexedTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
// do not index the rest if we are a "spider reply" document
// which is like a fake document for seeing spider statuses
//if ( isStatusDoc == CT_STATUS ) return true;
//if ( isStatusDoc ) return true;
	// CT_STATUS spider status "documents" also index gbspiderdate
	// above, so additionally index gbdocspiderdate, which only real
	// documents get; that way a gbsortby:gbdocspiderdate query returns
	// real DOCUMENTS and not the spider status "documents"
hi.m_desc = "doc last spidered date";
hi.m_prefix = "gbdocspiderdate";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_spideredTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
hi.m_desc = "doc last indexed date";
hi.m_prefix = "gbdocindexdate";
bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)indexedTime );
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
return false;
// all done
return true;
}
bool XmlDoc::hashMetaZip ( HashTableX *tt ) {
setStatus ( "hashing meta zip" );
// . set the score based on quality
// . scores are multiplied by 256 to preserve fractions for adding
uint32_t score = *getSiteNumInlinks8() * 256 ;
if ( score <= 0 ) score = 1;
	// search for the zipcode meta tag
char buf [ 32 ];
int32_t bufLen = m_xml.getMetaContent ( buf, 32, "zipcode", 7 );
if ( bufLen <= 0 ) bufLen = m_xml.getMetaContent ( buf, 32, "zip",3);
char *p = buf;
char *pend = buf + bufLen ;
if ( bufLen <= 0 ) return true;
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
//hi.m_prefix = "zipcode";
hi.m_prefix = "gbzipcode";
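	// scan the meta content for every run of exactly 5 digits and hash
	// each one (plus its prefixes, see below) as a zip code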
nextZip:
// . parse out the zip codes, may be multiple ones
// . skip non-digits
while ( p < pend && ! is_digit(*p) ) p++;
// skip if no digits
if ( p == pend ) return true;
// need at least 5 consecutive digits
if ( p + 5 > pend ) return true;
// if not a zip code, skip it
if ( ! is_digit(p[1]) ) { p += 1; goto nextZip; }
if ( ! is_digit(p[2]) ) { p += 2; goto nextZip; }
if ( ! is_digit(p[3]) ) { p += 3; goto nextZip; }
if ( ! is_digit(p[4]) ) { p += 4; goto nextZip; }
	// do we have too many consecutive digits?
if ( p + 5 != pend && is_digit(p[5]) ) {
// if so skip this whole string of digits
p += 5; while ( p < pend && is_digit(*p) ) p++;
goto nextZip;
}
// 90210 --> 90 902 9021 90210
for ( int32_t i = 0 ; i <= 3 ; i++ )
// use prefix as description
if ( ! hashString ( p,5-i,&hi ) ) return false;
p += 5;
goto nextZip;
}
// returns false and sets g_errno on error
bool XmlDoc::hashContentType ( HashTableX *tt ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
uint8_t ctype = *getContentType();
char *s = NULL;
setStatus ( "hashing content type" );
// hash numerically so we can do gbfacetint:type on it
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_prefix = "type";
char tmp[6];
sprintf(tmp,"%"UINT32"",(uint32_t)ctype);
if ( ! hashString (tmp,gbstrlen(tmp),&hi ) ) return false;
// these ctypes are defined in HttpMime.h
switch (ctype) {
case CT_HTML: s = "html"; break;
case CT_TEXT: s = "text"; break;
case CT_XML : s = "xml" ; break;
case CT_PDF : s = "pdf" ; break;
case CT_DOC : s = "doc" ; break;
case CT_XLS : s = "xls" ; break;
case CT_PPT : s = "ppt" ; break;
case CT_PS : s = "ps" ; break;
// for diffbot. so we can limit search to json objects
// in Diffbot.cpp
case CT_JSON: s = "json" ; break;
}
// bail if unrecognized content type
if ( ! s ) return true;
// hack for diffbot. do not hash type:json because diffbot uses
// that for searching diffbot json objects
if ( cr->m_isCustomCrawl && ctype==CT_JSON && !m_isDiffbotJSONObject )
return true;
// . now hash it
// . use a score of 1 for all
// . TODO: ensure doc counting works ok with this when it does
// it's interpolation
return hashString (s,gbstrlen(s),&hi );
}
// . hash the link: terms
// . ensure that more useful linkers are scored higher
// . useful for computing offsite link text for qdb-ish algorithm
// . NOTE: for now i do not hash links to the same domain in order to
// hopefully save 10%-25% index space
// . NOTE: PLUS, they may clog up the link-adjusted quality ratings since
// different site links with no link text will be ranked behind them
// . the 8-bit bitmap of the score of a link: term:
// . 00ubdcss u = link is Unbanned? b = link isBanned?
// d = link dirty? c = link clean?
// s = 01 if no link text, 10 if link text
// . NOTE: this is used in Msg18.cpp for extraction
// . CAUTION: IndexList::score32to8() will warp our score if its >= 128
// so i moved the bits down
bool XmlDoc::hashLinks ( HashTableX *tt ) {
setStatus ( "hashing links" );
	// shortcuts
bool isRSSFeed = *getIsRSS();
Url *cu = getCurrentUrl() ;
Url *ru = *getRedirUrl() ;
char dbuf[8*4*1024];
HashTableX dedup;
dedup.set( 8,0,1024,dbuf,8*4*1024,false,m_niceness,"hldt");
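	// dedup table keyed on 64-bit hashes of each normalized link url
	// and host, so the link: and sitelink: terms below get hashed at
	// most once per unique url/host in this doc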
// see ../url/Url2.cpp for hashAsLink() algorithm
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
		// skip links with zero length
if ( m_links.m_linkLens[i] == 0 ) continue;
// . skip if we are rss page and this link is an <a href> link
// . we only harvest/index <link> urls from rss feeds
// . or in the case of feedburner, those orig tags
if ( isRSSFeed && (m_links.m_linkFlags[i] & LF_AHREFTAG) )
continue;
// if we have a <feedburner:origLink> tag, then ignore <link>
// tags and only get the links from the original links
if ( m_links.m_isFeedBurner &&
!(m_links.m_linkFlags[i] & LF_FBTAG) )
continue;
// normalize the link
Url link;
// now we always add "www" to these links so that any link
// to cnn.com is same as link to www.cnn.com, because either
// we index cnn.com or www.cnn.com but not both providing
// their content is identical (deduping). This way whichever
// one we index, we can take advantage of all link text whether
// it's to cnn.com or www.cnn.com.
// Every now and then we add new session ids to our list in
// Url.cpp, too, so we have to version that.
// Since this is just for hashing, it shouldn't matter that
// www.tmblr.co has no IP whereas only tmblr.co does.
link.set ( m_links.m_linkPtrs[i] ,
m_links.m_linkLens[i] ,
true , // addWWW?
m_links.m_stripIds ,
false , // stripPound?
false , // stripCommonFile?
m_version );// used for new session id stripping
// breathe
QUICKPOLL(m_niceness);
// . the score depends on some factors:
// . NOTE: these are no longer valid! (see score bitmap above)
// . 4 --> if link has different domain AND has link text
// . 3 --> if link has same domain AND has link text
// . 2 --> if link has different domain AND no link text
		// . 1 --> if link has same domain AND no link text
// . is domain the same as ours?
// . NOTE: ideally, using the IP domain would be better, but
// we do not know the ip of the linker right now... so scores
// may be topped with a bunch of same-ip domain links so that
// we may not get as much link text as we'd like, since we
// only sample from one link text per ip domain
// . now we also just use the mid domain! (excludes TLD)
bool internal = false;
int32_t mdlen = cu->getMidDomainLen();
if ( mdlen == link.getMidDomainLen() &&
strncmp(cu->getMidDomain(),link.getMidDomain(),mdlen)==0)
//continue; // sameMidDomain = true;
internal = true;
// also check the redir url
if ( ru ) {
mdlen = ru->getMidDomainLen();
if ( mdlen == link.getMidDomainLen() &&
strncmp(ru->getMidDomain(),
link.getMidDomain(),mdlen)==0)
//continue; // sameMidDomain = true;
internal = true;
}
// now make the score
//unsigned char score ;
// . TODO: consider not hashing link w/o text!
// . otherwise, give it a higher score if it's got link TEXT
//bool gotLinkText = m_links.hasLinkText ( i, m_version );
// otherwise, beginning with version 21, allow internal links,
// but with lower scores
// score
// internal, no link text: 2
// internal, w/ link text: 4
// external, no link text: 6
// external, w/ link text: 8
//if ( internal ) {
// if ( ! gotLinkText ) score = 0x02;
// else score = 0x04;
//}
//else {
// if ( ! gotLinkText ) score = 0x06;
// else score = 0x08;
//}
// dedup this crap
int64_t h = hash64 ( link.getUrl(), link.getUrlLen() );
if ( dedup.isInTable ( &h ) ) continue;
if ( ! dedup.addKey ( &h ) ) return false;
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_prefix = "link";
// hash link:<url>
if ( ! hashSingleTerm ( link.getUrl(),link.getUrlLen(),&hi ))
return false;
h = hash64 ( link.getHost() , link.getHostLen() );
if ( dedup.isInTable ( &h ) ) continue;
if ( ! dedup.addKey ( &h ) ) return false;
// fix parm
hi.m_prefix = "sitelink";
// hash sitelink:<urlHost>
if ( ! hashSingleTerm ( link.getHost(),link.getHostLen(),&hi))
return false;
// breathe
QUICKPOLL(m_niceness);
}
// skip this for now
return true;
/*
setStatus ("hashing gbhasbannedoutlink" );
// only lets a domain vote once
int32_t numBannedOutlinks = *getNumBannedOutlinks();
//if ( numBannedOutlinks <= 0 ) return true;
// a score of 235 seems to give a negative return for score8to32()
uint32_t score = score8to32 ( numBannedOutlinks );
// make score at least 1!
if ( score <= 0 ) score = 1;
// a hack fix
if ( score > 0x7fffffff ) score = 0x7fffffff;
// set up the hashing parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "gbhasbannedoutlink";
// hash this special thing to help us de-spam the index
if ( numBannedOutlinks > 0 ) return hashString ("1",1,&hi );
else return hashString ("0",1,&hi );
*/
}
// . returns false and sets g_errno on error
// . hash for linkdb
bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) {
// sanity check
if ( dt->m_ks != sizeof(key224_t) ) { char *xx=NULL;*xx=0; }
if ( dt->m_ds != 0 ) { char *xx=NULL;*xx=0; }
// this will be different with our new site definitions
uint32_t linkerSiteHash32 = *getSiteHash32();
char siteRank = getSiteRank();
if ( ! m_linksValid ) { char *xx=NULL;*xx=0; }
// we need to store this in the title rec for re-building
// the meta list from the title rec...
// is this just site info?
//TagRec ***pgrv = getOutlinkTagRecVector();
//if ( ! pgrv || pgrv == (void *)-1 ) { char *xx=NULL;*xx=0; }
//TagRec **grv = *pgrv;
int32_t *linkSiteHashes = getLinkSiteHashes();
if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ){
char *xx=NULL;*xx=0;}
// convert siteNumInlinks into a score
//int32_t numSiteInlinks = *xd->getSiteNumInlinks();
unsigned char hopCount = *getHopCount();
// use spidered time! might not be current time! like if rebuilding
// or injecting from a past spider time
int32_t discoveryDate = getSpideredTime();//TimeGlobal();
int32_t lostDate = 0;
// add in new links
for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) {
// give up control
QUICKPOLL ( m_niceness );
// skip if empty
if ( m_links.m_linkLens[i] == 0 ) continue;
// . skip if spam, ALWAYS allow internal outlinks though!!
// . CAUTION: now we must version islinkspam()
bool spam = m_links.isLinkSpam(i) ;
// or if it has no link text, skip it
//if ( ! links->hasLinkText(i,TITLEREC_CURRENT_VERSION) )
//continue;
// get site of outlink from tagrec if in there
int32_t linkeeSiteHash32 = linkSiteHashes[i];
/*
TagRec *gr = grv[i];
char *site = NULL;
int32_t siteLen = 0;
if ( gr ) {
int32_t dataSize = 0;
site = gr->getString("site",NULL,&dataSize);
if ( dataSize ) siteLen = dataSize - 1;
}
// otherwise, make it the host or make it cut off at
// a "/user/" or "/~xxxx" or whatever path component
if ( ! site ) {
// GUESS link site... TODO: augment for /~xxx
char *s = m_links.getLink(i);
//int32_t slen = m_links.getLinkLen(i);
//siteLen = slen;
site = ::getHost ( s , &siteLen );
}
uint32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 );
*/
//
// when setting the links class it should set the site hash
//
// set this key, it is the entire record
key224_t k;
k = g_linkdb.makeKey_uk ( linkeeSiteHash32 ,
m_links.getLinkHash64(i) ,
spam , // link spam?
siteRank , // was quality
hopCount,
*getIp() ,
*getDocId() ,
discoveryDate ,
lostDate ,
false , // new add?
linkerSiteHash32 ,
false );// delete?
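		// so each outlink becomes one linkdb key encoding the
		// linkee's site/url hash plus our (the linker's) ip, docid,
		// siterank, hopcount and discovery date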
/*
// debug
if ( m_links.getLinkHash64(i) != 0x3df1c439a364e18dLL )
continue;
//char c = site[siteLen];
//site[siteLen]=0;
//char tmp[1024];
//sprintf(tmp,"xmldoc: hashinglink site=%s sitelen=%"INT32" ",
// site,siteLen);
//site[siteLen] = c;
log(//"%s "
"url=%s "
"linkeesitehash32=0x%08"XINT32" "
"linkersitehash32=0x%08"XINT32" "
"urlhash64=0x%16llx "
"docid=%"INT64" k=%s",
//tmp,
m_links.getLink(i),
(int32_t)linkeeSiteHash32,
linkerSiteHash32,
m_links.getLinkHash64(i),
*getDocId(),
KEYSTR(&k,sizeof(key224_t))
);
*/
// store in hash table
if ( ! dt->addKey ( &k , NULL ) ) return false;
}
return true;
}
bool XmlDoc::getUseTimeAxis ( ) {
if ( m_useTimeAxisValid )
return m_useTimeAxis;
if ( m_setFromTitleRec )
// return from titlerec header
return m_useTimeAxis;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) return false;
m_useTimeAxis = cr->m_useTimeAxis;
m_useTimeAxisValid = true;
// sanity check
// if ( cr->m_isCustomCrawl && m_useTimeAxis ) {
// log("build: custom crawls can't use time axis");
// char *xx=NULL;*xx=0;
// m_useTimeAxis = false;
// }
return m_useTimeAxis;
}
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashUrl ( HashTableX *tt ) { // , bool isStatusDoc ) {
setStatus ( "hashing url colon" );
// get the first url
Url *fu = getFirstUrl();
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
// we do not need diversity bits for this
hi.m_useCountTable = false;
//
// HASH url: term
//
// append a "www." for doing url: searches
Url uw; uw.set ( fu->getUrl() , fu->getUrlLen() , true );
hi.m_prefix = "url";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "url2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
if ( getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
hi.m_prefix = "gbtimeurl";
SafeBuf *tau = getTimeAxisUrl();
hashSingleTerm ( tau->getBufStart(),tau->length(),&hi);
}
// use hash of url as score so we can get a # of docs per site est.
//uint16_t score = hash16 ( fu->getUrl() , fu->getUrlLen() );
setStatus ( "hashing inurl colon" );
//
// HASH inurl: terms
//
char *s = fu->getUrl ();
int32_t slen = fu->getUrlLen();
hi.m_prefix = "inurl";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "inurl2";
if ( ! hashString ( s,slen, &hi ) ) return false;
setStatus ( "hashing ip colon" );
//
// HASH ip:a.b.c.d
//
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
// copy it to save it
char ipbuf[64];
int32_t iplen = sprintf(ipbuf,"%s",iptoa(m_ip));
//char *tmp = iptoa ( m_ip );
//int32_t tlen = gbstrlen(tmp);
hi.m_prefix = "ip";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ip2";
if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false;
//
// HASH ip:a.b.c
//
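	// e.g. if m_ip printed as "1.2.3.4" above, this also hashes
	// ip:1.2.3 so a query can cover the whole 1.2.3.* range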
char *end1 = ipbuf + iplen - 1;
while ( *end1 != '.' ) end1--;
if ( ! hashSingleTerm(ipbuf,end1-ipbuf,&hi) ) return false;
// . sanity check
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// get the boost
//floatboost1=(float)getBoostFromSiteNumInlinks(m_siteNumInlinks)/100.0
//
// HASH the url path plain as if in body
//
// get number of components in the path. does not include the filename
int32_t pathDepth = fu->getPathDepth(false);
// make it a density thing
//pathScore /= ( pathDepth + 1 );
// ensure score positive
//if ( pathScore <= 0 ) pathScore = 1;
// get it
char *path = fu->getPath();
int32_t plen = fu->getPathLen();
/*
// update it
float boost2 = (float)m_urlPathWeight / 100;
// again
float boost3 = 1.0 / ((float)pathDepth + 1.0) ;
// make a description
char tmp3[190];
sprintf( tmp3 ,
"path score = "
"siteInlinksBoost * "
"urlPathWeight * "
"pathDepthBoost * "
"256 = %.02f * %.02f * %.02f * 256 " ,
boost1 ,
boost2 ,
boost3 );
*/
//int32_t pathScore = (int32_t) (256.0 * boost1 * boost2 * boost3);
// update parms
//hi.m_desc = tmp3;
hi.m_prefix = NULL;
hi.m_desc = "url path";
hi.m_hashGroup = HASHGROUP_INURL;
// if parm "index article content only" is true, do not index this!
//if ( m_eliminateMenus ) skipIndex=true;
setStatus ( "hashing gbpathdepth");
//
// HASH gbpathdepth:X
//
// xyz.com/foo --> 0
// xyz.com/foo/ --> 1
// xyz.com/foo/boo --> 1
// xyz.com/foo/boo/ --> 2
char buf[20];
int32_t blen = sprintf(buf,"%"INT32"",pathDepth);
// update parms
hi.m_prefix = "gbpathdepth";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash gbpathdepth:X
if ( ! hashString ( buf,blen,&hi) ) return false;
//
// HASH gbhopcount:X
//
setStatus ( "hashing gbhopcount");
if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; }
blen = sprintf(buf,"%"INT32"",(int32_t)m_hopCount);
// update parms
hi.m_prefix = "gbhopcount";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhopcount2";
hi.m_hashGroup = HASHGROUP_INTAG;
	// hash gbhopcount:X
if ( ! hashString ( buf,blen,&hi) ) return false;
setStatus ( "hashing gbhasfilename");
//
// HASH gbhasfilename:0 or :1
//
char *hm;
if ( fu->getFilenameLen() ) hm = "1";
else hm = "0";
// update parms
hi.m_prefix = "gbhasfilename";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2";
// hash gbhasfilename:[0|1]
if ( ! hashString ( hm,1,&hi) ) return false;
setStatus ( "hashing gbiscgi");
//
// HASH gbiscgi:0 or gbiscgi:1
//
if ( fu->isCgi() ) hm = "1";
else hm = "0";
hi.m_prefix = "gbiscgi";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbiscgi2";
if ( ! hashString ( hm,1,&hi) ) return false;
setStatus ( "hashing gbext");
//
// HASH gbhasext:0 or gbhasext:1 (does it have a fileextension)
//
// . xyz.com/foo --> gbhasext:0
// . xyz.com/foo.xxx --> gbhasext:1
if ( fu->getExtensionLen() ) hm = "1";
else hm = "0";
hi.m_prefix = "gbhasext";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbhasext2";
if ( ! hashString ( hm,1,&hi) ) return false;
//
// HASH the url's mid domain and host as they were in the body
//
setStatus ( "hashing site colon terms");
//
// HASH the site: terms
//
// . hash the pieces of the site
// . http://host.domain.com/~harry/level1/ should hash to:
// . site:host.domain.com/~harry/level1/
// . site:host.domain.com/~harry/
// . site:host.domain.com/~
// . site:host.domain.com/
// . site:domain.com/~harry/level1/
// . site:domain.com/~harry/
// . site:domain.com/~
// . site:domain.com/
// ensure score is positive
//if ( siteScore <= 0 ) siteScore = 1;
// get the hostname (later we set to domain name)
char *name = fu->getHost();
int32_t nameLen = fu->getHostLen();
// . point to the end of the whole thing, including port field
// . add in port, if non default
char *end3 = name + fu->getHostLen() + fu->getPortLen();
loop:
// now loop through the sub paths of this url's path
for ( int32_t i = 0 ; ; i++ ) {
// get the subpath
int32_t len = fu->getSubPathLen(i);
// FIX: always include first /
if ( len == 0 ) len = 1;
// write http://www.whatever.com/path into buf
char buf[MAX_URL_LEN+10];
char *p = buf;
gbmemcpy ( p , "http://" , 7 ); p += 7;
gbmemcpy ( p , name , nameLen ); p += nameLen;
gbmemcpy ( p , fu->getPath() , len ); p += len;
*p = '\0';
// update hash parms
hi.m_prefix = "site";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "site2";
hi.m_hashGroup = HASHGROUP_INURL;
// this returns false on failure
if ( ! hashSingleTerm (buf,p-buf,&hi ) ) return false;
// break when we hash the root path
if ( len <=1 ) break;
}
// now keep moving the period over in the hostname
while ( name < end3 && *name != '.' ) { name++; nameLen--; }
// skip the '.'
name++; nameLen--;
	// if there is more hostname left, hash the next smaller host/domain
if ( name < end3 ) goto loop;
setStatus ( "hashing ext colon");
//
// HASH ext: term
//
// i.e. ext:gif ext:html ext:htm ext:pdf, etc.
char *ext = fu->getExtension();
int32_t elen = fu->getExtensionLen();
// update hash parms
hi.m_prefix = "ext";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "ext2";
if ( ! hashSingleTerm(ext,elen,&hi ) ) return false;
setStatus ( "hashing gbdocid" );
hi.m_prefix = "gbdocid";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbdocid2";
char buf2[32];
sprintf(buf2,"%"UINT64"",(m_docId) );
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
// if indexing a json diffbot object, index
// gbparenturl:xxxx of the original url from which the json was
// datamined. we use this so we can act as a diffbot json cache.
if ( m_isDiffbotJSONObject ) {
setStatus ( "hashing gbparenturl term");
char *p = fu->getUrl() + fu->getUrlLen() - 1;
// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
for ( ; *p && *p != '-' ; p-- );
// set up the hashing parms
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
hi.m_desc = "diffbot parent url";
// append a "www." as part of normalization
uw.set ( fu->getUrl() , p - fu->getUrl() , true );
hi.m_prefix = "gbparenturl";
// no longer, we just index json now
//if ( isStatusDoc ) hi.m_prefix = "gbparenturl2";
if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) )
return false;
}
//if ( isStatusDoc ) return true;
setStatus ( "hashing SiteGetter terms");
//
// HASH terms for SiteGetter.cpp
//
// . this termId is used by SiteGetter.cpp for determining subsites
// . matches what is in SiteGet::getSiteList()
// for www.xyz.com/a/ HASH www.xyz.com
// for www.xyz.com/a/b/ HASH www.xyz.com/a/
// for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/
bool add = true;
// we only hash this for urls that end in '/'
if ( s[slen-1] != '/' ) add = false;
// and no cgi
if ( fu->isCgi() ) add = false;
// skip if root
if ( fu->m_plen <= 1 ) add = false;
// sanity check
if ( ! m_linksValid ) { char *xx=NULL; *xx=0; }
// . skip if we have no subdirectory outlinks
// . that way we do not confuse all the pages in dictionary.com or
// wikipedia.org as subsites!!
if ( ! m_links.hasSubdirOutlink() ) add = false;
char *host = fu->getHost ();
int32_t hlen = fu->getHostLen ();
// tags from here out
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_shardByTermId = true;
// hash it
if ( add ) {
// remove the last path component
char *end2 = s + slen - 2;
// back up over last component
for ( ; end2 > fu->m_path && *end2 != '/' ; end2-- ) ;
// hash that part of the url
hi.m_prefix = "siteterm";
if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false;
}
hi.m_shardByTermId = false;
setStatus ( "hashing urlhashdiv10 etc");
//
// HASH urlhash: urlhashdiv10: urlhashdiv100: terms
//
// this is for proving how many docs are in the index
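	// (presumably each urlhashdiv10:/urlhashdiv100: value buckets 10 or
	// 100 adjacent url hashes, so counting the matches for a few sample
	// values lets us extrapolate an estimate of the total index size)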
uint32_t h = hash32 ( s , slen );
blen = sprintf(buf,"%"UINT32"",h);
hi.m_prefix = "urlhash";
if ( ! hashString(buf,blen,&hi) ) return false;
blen = sprintf(buf,"%"UINT32"",h/10);
// update hashing parms
hi.m_prefix = "urlhashdiv10";
if ( ! hashString(buf,blen,&hi) ) return false;
blen = sprintf(buf,"%"UINT32"",h/100);
// update hashing parms
hi.m_prefix = "urlhashdiv100";
if ( ! hashString(buf,blen,&hi) ) return false;
setStatus ( "hashing url mid domain");
// the final score
//int32_t plainScore = (int32_t)(256.0 * boost1 * boost2 * fw);
// update parms
hi.m_prefix = NULL;
hi.m_desc = "middle domain";//tmp3;
hi.m_hashGroup = HASHGROUP_INURL;
// if parm "index article content only" is true, do not index this!
//if ( m_eliminateMenus ) plainScore = 0;
//char *mid = fu->getMidDomain ();
//int32_t mlen = fu->getMidDomainLen();
//hi.m_desc = "url mid dom";
//if ( ! hashString ( mid,mlen ,&hi ) ) return false;
//hi.m_desc = "url host";
if ( ! hashString ( host,hlen,&hi)) return false;
setStatus ( "hashing url path");
// hash the path plain
if ( ! hashString (path,plen,&hi) ) return false;
return true;
}
/////////////
//
// CHROME DETECTION
//
// we search for these terms we hash here in getSectionsWithDupStats()
// so we can remove chrome.
//
/////////////
// . returns false and sets g_errno on error
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashSections ( HashTableX *tt ) {
//if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
//if ( m_contentType == CT_HTML ) return true;
setStatus ( "hashing sections" );
if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteValid ) { char *xx=NULL;*xx=0; }
Sections *ss = &m_sections;
int32_t siteHash32 = *getSiteHash32();
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
// the prefix is custom set for each section below
//hi.m_prefix = "gbsectionhash";
// put all guys with the same xpath/site on the same shard
hi.m_shardByTermId = true;
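	// each term is gbxpathsitehash<H> where H mixes the section's xpath
	// hash with the site hash; the facet value is a 32-bit hash of the
	// sentences under that xpath, so boilerplate repeated across a
	// site's pages produces identical values and can be detected as
	// chrome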
Section *si = ss->m_rootSection;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// . skip if empty
// . this needs to be like 48 bits because 32 bits is not
		//   big enough!
//uint64_t ih64 = si->m_sentenceContentHash64;
// don't bother with the section if it doesn't have this set
// because this eliminates parent dupage to reduce amount
// of gbxpathsitehash123456 terms we index
if ( ! ( si->m_flags & SEC_HASHXPATH ) )
continue;
// skip if sentence, only hash tags now i guess for diffbot
//if ( si->m_sentenceContentHash64 )
// continue;
// get hash of sentences this tag contains indirectly
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
if ( ! val32 )
continue;
// the termid is now the xpath and the sitehash, the "value"
// will be the hash of the innerhtml, m_sentenceContentHash64
uint64_t thash64 = (uint32_t)si->m_turkTagHash32;
// combine with site hash
thash64 ^= (uint32_t)siteHash32;
// this is a special hack we need to make it the
// hash of the inner html
//hi.m_sentHash32 = (uint32_t)ih64;
// . get section xpath & site hash
// . now if user does a gbfacets:gbxpathsitehashxxxxxx query
// he will get back a histogram of the values it hash,
// which are 32-bit hashes of the innerhtml for that
// xpath on this site.
char prefix[96];
sprintf(prefix,"gbxpathsitehash%"UINT64"",thash64);
// like a normal key but we store "ih64" the innerHTML hash
// of the section into the key instead of wordbits etc.
// similar to hashNumber*() functions.
//if ( ! hashSectionTerm ( term , &hi, (uint32_t)ih64 ) )
// return false;
// i guess use facets
hi.m_prefix = prefix;
// we already have the hash of the inner html of the section
hashFacet2 ( "gbfacetstr",
prefix,
//(int32_t)(uint32_t)ih64 ,
val32,
hi.m_tt ,
// shard by termId?
true );
}
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
bool hashAnomalies ,
bool hashNonAnomalies ) {
// do not index ANY of the body if it is NOT a permalink and
// "menu elimination" technology is enabled.
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
setStatus ( "hashing link text" );
// . now it must have an rss item to be indexed in all its glory
// . but if it tells us it has an rss feed, toss it and wait for
// the feed.... BUT sometimes the rss feed outlink is 404!
// . NO, now we discard with ENORSS at Msg16.cpp
//if ( ! *getHasRSSItem() && m_eliminateMenus ) return true;
// sanity check
if ( hashAnomalies == hashNonAnomalies ) { char *xx = NULL; *xx =0; }
// display this note in page parser
char *note = "hashing incoming link text";
// sanity
if ( ! m_linkInfo1Valid ) { char *xx=NULL;*xx=0; }
if ( ! m_linkInfo2Valid ) { char *xx=NULL;*xx=0; }
// . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to
// somewhere in TitleRec
// . otherwise, we generated it from merging a bunch of LinkInfos
// and storing them in this new TitleRec
LinkInfo *info1 = getLinkInfo1 ();
LinkInfo **pinfo2 = getLinkInfo2 ();
LinkInfo *info2 = *pinfo2;
LinkInfo *linkInfo = info1;
// pick the one with the most inlinks with valid incoming link text,
// otherwise, we end up with major bias when we stop importing
// link text from another cluster, because some pages will have
// twice as many links as they should!
if ( info2 && info2->getNumLinkTexts() > info1->getNumLinkTexts() ) {
linkInfo = info2;
note = "hashing incoming link text from other cluster";
}
// sanity checks
if ( ! m_ipValid ) { char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
//
// brought the following code in from LinkInfo.cpp
//
int32_t noteLen = 0;
if ( note ) noteLen = gbstrlen ( note );
// count "external" inlinkers
int32_t ecount = 0;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_useSynonyms = true;
// hashstring should update this like a cursor.
hi.m_startDist = 0;
// loop through the link texts and hash them
for ( Inlink *k = NULL; (k = linkInfo->getNextInlink(k)) ; ) {
// is this inlinker internal?
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
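		// an inlinker sharing those 16 ip bits is treated as internal
		// and hashed below with HASHGROUP_INTERNALINLINKTEXT, which is
		// presumably weighted lower than true external inlink text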
// count external inlinks we have for indexing gbmininlinks:
if ( ! internal ) ecount++;
// get score
//int64_t baseScore = k->m_baseScore;
// get the weight
//int64_t ww ;
//if ( internal ) ww = m_internalLinkTextWeight;
//else ww = m_externalLinkTextWeight;
// modify the baseScore
//int64_t final = (baseScore * ww) / 100LL;
// get length of link text
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// get the text
char *txt = k->getLinkText();
// sanity check
if ( ! verifyUtf8 ( txt , tlen ) ) {
log("xmldoc: bad link text 2 from url=%s for %s",
k->getUrl(),m_firstUrl.m_url);
continue;
}
// if it is anomalous, set this, we don't
//if ( k->m_isAnomaly )
// hi.m_hashIffNotUnique = true;
//hi.m_baseScore = final;
if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT;
else hi.m_hashGroup = HASHGROUP_INLINKTEXT;
// store the siterank of the linker in this and use that
// to set the multiplier M bits i guess
hi.m_linkerSiteRank = k->m_siteRank;
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
k->m_wordPosStart = m_dist; // hi.m_startDist;
// . hash the link text into the table
// . returns false and sets g_errno on error
// . we still have the score punish from # of words though!
// . for inlink texts that are the same it should accumulate
// and use the reserved bits as a multiplier i guess...
if ( ! hashString ( txt,tlen,&hi) ) return false;
// now record this so we can match the link text to
// a matched offsite inlink text term in the scoring info
//k->m_wordPosEnd = hi.m_startDist;
// spread it out
hi.m_startDist += 20;
}
/*
// . hash gbkeyword:numinlinks where score is # of inlinks from 1-255
// . do not hash gbkeyword:numinlinks if we don't got any
if ( ecount <= 0 ) return true;
// limit it since our score can't be more than 255 (8-bits)
//if ( ecount > 255 ) ecount = 255;
// convert our 32 bit score to 8-bits so we trick it!
//int32_t score = score8to32 ( (uint8_t)ecount );
// watch out for wrap
//if ( score < 0 ) score = 0x7fffffff;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "gbkeyword";
hi.m_hashGroup = HASHGROUP_INTAG;
// for terms where word position/density/diversity is irrelevant,
// we can store this value...
hi.m_fakeValue = ecount;
// hash gbkeyword:numinlinks term
if ( ! hashString ( "numinlinks",10,&hi ) )return false;
*/
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
// seems like iffUnique is off, so do this
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
setStatus ( "hashing neighborhoods" );
//g_tt = table;
// . now we also hash the neighborhood text of each inlink, that is,
// the text surrounding the inlink text.
// . this is also destructive in that it will remove termids that
// were not in the document being linked to in order to save
// space in the titleRec
// . now we only do one or the other, not both
LinkInfo *info1 = getLinkInfo1 ();
LinkInfo **pinfo2 = getLinkInfo2 ();
LinkInfo *info2 = *pinfo2;
LinkInfo *linkInfo = info1;
char *note = " (internal cluster)";
// pick the one with the most inlinks with valid incoming link text
// otherwise, we end up with major bias when we stop importing
// link text from another cluster, because some pages will have
// twice as many links as they should!
if ( info2 && info2->getNumLinkTexts() > info1->getNumLinkTexts() ) {
linkInfo = info2;
note = " (external cluster)";
}
// loop over all the Inlinks
Inlink *k = NULL;
loop:
// get the next inlink
k = linkInfo->getNextInlink( k );
// break if done
if ( ! k ) return true;
// skip if internal, they often have the same neighborhood text
if ( (k->m_ip&0x0000ffff)==(m_ip&0x0000ffff) ) goto loop;
// get the left and right texts and hash both
char *s = k->getSurroundingText();
if ( ! s || k->size_surroundingText <= 1 ) goto loop;
//int32_t inlinks = *getSiteNumInlinks();
// HACK: to avoid having to pass a flag to TermTable, then to
// Words::hash(), Phrases::hash(), etc. just flip a bit in the
// table to make it not add anything unless it is already in there.
tt->m_addIffNotUnique = true;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "surrounding text";
hi.m_hashGroup = HASHGROUP_NEIGHBORHOOD;
// . hash that
// . this returns false and sets g_errno on error
int32_t len = k->size_surroundingText - 1;
if ( ! hashString ( s, len, &hi ) ) return false;
// now turn it back off
tt->m_addIffNotUnique = false;
// get the next Inlink
goto loop;
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
setStatus ( "hashing rss info" );
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
// . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to
// somewhere in TitleRec
// . otherwise, we generated it from merging a bunch of LinkInfos
// and storing them in this new TitleRec
LinkInfo *linkInfo = getLinkInfo1();
// get the xml of the first rss/atom item/entry referencing this url
Xml xml;
// . returns NULL if no item xml
// . this could also be a "channel" blurb now, so we index channel pgs
if ( ! linkInfo->getItemXml ( &xml , m_niceness ) ) return false;
if ( xml.isEmpty() )
// hash gbrss:0
return hashRSSTerm ( tt , false );
// parser info msg
//if ( m_pbuf ) {
// m_pbuf->safePrintf(
// "<br><b>--BEGIN RSS/ATOM INFO HASH--</b><br><br>");
//}
// hash nothing if not a permalink and eliminating "menus"
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
// . IMPORTANT: you must be using the new link algo, so turn it on
// in the spider controls. this allows us to include LinkTexts from
// the same IP in our LinkInfo class in the TitleRec.
// . is it rss or atom? both use title tag, so doesn't matter
// . get the title tag
bool isHtmlEncoded;
int32_t titleLen;
char *title = xml.getRSSTitle ( &titleLen , &isHtmlEncoded );
char c = 0;
// sanity check
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
bool hashIffUnique = true;
// but if we had no content because we were an mp3 or whatever,
// do not worry about avoiding double hashing
if ( size_utf8Content <= 0 ) hashIffUnique = false;
// decode it?
// should we decode it? if they don't use [CDATA[]] then we should
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
// but most other feeds do not use it
if ( isHtmlEncoded && title && titleLen > 0 ) {
// it is html encoded so that the <'s are encoded to &lt;'s so
// we must decode them back. this could turn latin1 into utf8
// though? no, because the &'s should have been encoded, too!
int32_t newLen =htmlDecode(title,title,titleLen,false,m_niceness);
// make sure we don't overflow the buffer
if ( newLen > titleLen ) { char *xx = NULL; *xx = 0; }
// reassign the length
titleLen = newLen;
// NULL terminate it
c = title[titleLen];
title[titleLen] = '\0';
}
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_TITLE;
hi.m_desc = "rss title";
// . hash the rss title
// . only hash the terms if they are unique to stay balanced with docs
// that are not referenced by an rss feed
bool status = hashString ( title,titleLen,&hi ) ;
// pop the end back just in case
if ( c ) title[titleLen] = c;
// return false with g_errno set on error
if ( ! status ) return false;
// get the rss description
int32_t descLen;
char *desc = xml.getRSSDescription ( &descLen , &isHtmlEncoded );
// for advanced hashing
Xml xml2;
Words w;
//Scores scores;
Words *wordsPtr = NULL;
//Scores *scoresPtr = NULL;
c = 0;
// should we decode it? if they don't use [CDATA[]] then we should
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
// but most other feeds do not use it
if ( isHtmlEncoded && desc && descLen > 0 ) {
// it is html encoded so that the <'s are encoded to &lt;'s so
// we must decode them back. this could turn latin1 into utf8
// though? no, because the &'s should have been encoded, too!
int32_t newLen = htmlDecode(desc,desc,descLen,false,m_niceness);
// make sure we don't overflow the buffer
if ( newLen > descLen ) { char *xx = NULL; *xx = 0; }
// reassign the length
descLen = newLen;
}
// NULL terminate it
if ( desc ) {
c = desc[descLen];
desc[descLen] = '\0';
// set the xml class from the decoded html
if ( ! xml2.set ( desc ,
descLen ,
false , // own data?
0 , // allocSize
false , // pure xml?
m_version ,
true , // set parents?
m_niceness ,
*ct ) )
return false;
// set the words class from the xml, returns false and sets
// g_errno on error
if ( ! w.set ( &xml2 ,
true , // compute Ids
true ))// has html ents? (WERE encoded twice!)
return false;
// pass it in to TermTable::hash() below
wordsPtr = &w;
}
// update hash parms
hi.m_tt = tt;
hi.m_desc = "rss body";
hi.m_hashGroup = HASHGROUP_BODY;
// . hash the rss/atom description
// . only hash the terms if they are unique to stay balanced with docs
// that are not referenced by an rss feed
status = hashString ( desc, descLen, &hi );
// pop the end back just in case
if ( c ) desc[descLen] = c;
// return false with g_errno set
if ( ! status ) return false;
// hash gbrss:1
if ( ! hashRSSTerm ( tt , true ) ) return false;
// parser info msg
//if ( m_pbuf ) {
// m_pbuf->safePrintf("<br><b>--END RSS/ATOM INFO HASH--"
// "</b><br><br>");
//}
return true;
}
bool XmlDoc::hashRSSTerm ( HashTableX *tt , bool inRSS ) {
// hash gbinrss:0 or gbinrss:1 (and gbisrss:0|1 below)
char *value;
if ( inRSS ) value = "1";
else value = "0";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "gbinrss";
hi.m_hashGroup = HASHGROUP_INTAG;
// returns false and sets g_errno on error
if ( ! hashString(value,1,&hi ) ) return false;
// hash gbisrss:1 if we are an rss page ourselves
if ( *getIsRSS() ) value = "1";
else value = "0";
// update hash parms
hi.m_prefix = "gbisrss";
// returns false and sets g_errno on error
if ( ! hashString(value,1,&hi) ) return false;
return true;
}
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
// does have an <index> block in the ruleset.
// . the new Weights class hashes title as part of body now with a high weight
// given by "titleWeight" parm
bool XmlDoc::hashTitle ( HashTableX *tt ) {
// sanity check
if ( m_hashedTitle ) { char *xx=NULL ; *xx=0; }
setStatus ( "hashing title" );
// this has been called, note it
m_hashedTitle = true;
nodeid_t *tids = m_words.m_tagIds;
int32_t nw = m_words.m_numWords;
// find the first <title> tag in the doc
int32_t i ;
for ( i = 0 ; i < nw ; i++ )
if ( tids[i] == TAG_TITLE ) break;
// return true if no title
if ( i >= nw ) return true;
// skip tag
i++;
// mark it as start of title
int32_t a = i;
// limit end
int32_t max = i + 40;
if ( max > nw ) max = nw;
// find end of title, either another <title> tag or the closing </title> tag
for ( ; i < max ; i++ )
if ( (tids[i] & BACKBITCOMP) == TAG_TITLE ) break;
// empty title? (a title tag immediately follows the opening <title>)
if ( i == a ) return true;
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "title";
hi.m_useSynonyms= true;
// the new posdb info
hi.m_hashGroup = HASHGROUP_TITLE;
// . hash it up! use 0 for the date
// . use XmlDoc::hashWords()
// . use "title" as both prefix and description
//if ( ! hashWords (a,i,&hi ) ) return false;
char **wptrs = m_words.getWords();
int32_t *wlens = m_words.getWordLens();
char *title = wptrs[a];
char *titleEnd = wptrs[i-1] + wlens[i-1];
int32_t titleLen = titleEnd - title;
if ( ! hashString ( title, titleLen, &hi) ) return false;
// now hash it again without the title: prefix so plain, unprefixed
// query terms can also match words in the title
hi.m_prefix = NULL;
if ( ! hashString ( title, titleLen, &hi) ) return false;
return true;
}
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
// does have an <index> block in the ruleset.
bool XmlDoc::hashBody2 ( HashTableX *tt ) {
// do not index ANY of the body if it is NOT a permalink and
// "menu elimination" technology is enabled.
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
setStatus ( "hashing body" );
// if more than X% of words are spammed to some degree, index all
// words with a minimum score
//int64_t x[] = {30,40,50,70,90};
//int64_t y[] = {6,8,10,20,30};
//int32_t mp = getY ( *getSiteNumInlinks8() , x , y , 5 );
//int32_t nw = m_words.getNumWords();
// record this
m_bodyStartPos = m_dist;
m_bodyStartPosValid = true;
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "body";
hi.m_useSynonyms= true;
hi.m_hashGroup = HASHGROUP_BODY;
// use NULL for the prefix
return hashWords (&hi );
}
bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) {
// do not index meta tags if "menu elimination" technology is enabled.
//if ( m_eliminateMenus ) return true;
setStatus ( "hashing meta keywords" );
// hash the meta keywords tag
//char buf [ 2048 + 2 ];
//int32_t len=m_xml.getMetaContentPointer ( buf , 2048 , "keywords" , 8 );
int32_t mklen;
char *mk = getMetaKeywords( &mklen );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_desc = "meta keywords";
hi.m_hashGroup = HASHGROUP_INMETATAG;
// call XmlDoc::hashString
return hashString ( mk , mklen , &hi);
}
// . hash the meta summary, description and keyword tags
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {
// sanity check
if ( m_hashedMetas ) { char *xx=NULL ; *xx=0; }
// this has been called, note it
m_hashedMetas = true;
// do not index meta tags if "menu elimination" technology is enabled.
//if ( m_eliminateMenus ) return true;
setStatus ( "hashing meta summary" );
// hash the meta keywords tag
//char buf [ 2048 + 2 ];
//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
int32_t mslen;
char *ms = getMetaSummary ( &mslen );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INMETATAG;
// update hashing parms
hi.m_desc = "meta summary";
// hash it
if ( ! hashString ( ms , mslen , &hi )) return false;
//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
int32_t mdlen;
char *md = getMetaDescription ( &mdlen );
// update hashing parms
hi.m_desc = "meta desc";
// . TODO: only hash if unique????? set a flag on ht then i guess
if ( ! hashString ( md , mdlen , &hi ) ) return false;
return true;
}
//bool XmlDoc::linksToGigablast ( ) {
// // check m_links for a link to gigablast.com or www.gigablast.com
// return m_links.linksToGigablast();
//}
bool XmlDoc::searchboxToGigablast ( ) {
// . they may have a form variable like
// . <form method=get action=http://www.gigablast.com/cgi/0.cgi name=f>
return m_xml.hasGigablastForm();
}
// . bring back support for dmoz integration
// . when clicking on a "search within this category" it does a gbpdcat:<catid>
// search to capture all pages that have that dmoz category as one of their
// parent topics
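// . note: the term prefixes actually hashed below are gbcatid and
//   gbpcatid (plus gbicatid/gbipcatid for the indirect category ids)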
bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) {
getDmozTitles();
char *titlePtr = ptr_dmozTitles;
char *sumPtr = ptr_dmozSumms;
//char *anchPtr = ptr_dmozAnchors;
char buf[128];
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
int32_t *catIds = (int32_t *)ptr_catIds;
int32_t numCatIds = size_catIds / 4;
// go through the catIds and hash them
for (int32_t i = 0; i < numCatIds; i++) {
// write the catid as a string
sprintf(buf, "%"UINT32"", (uint32_t)catIds[i]);
// term prefix for hashing
hi.m_prefix = "gbcatid";
// hash it
hashString ( buf , gbstrlen(buf) , &hi );
// we also want to hash the parents
int32_t currCatId = catIds[i];
int32_t currParentId = catIds[i];
int32_t currCatIndex;
// loop to the Top, Top = 1
while ( currCatId > 1 ) {
// hash the parent
sprintf(buf, "%"UINT32"", (uint32_t)currParentId);
hi.m_prefix = "gbpcatid";
hashString ( buf , gbstrlen(buf), &hi );
// next cat
currCatId = currParentId;
// get the index for this cat
currCatIndex = g_categories->getIndexFromId(currCatId);
if ( currCatIndex <= 0 ) break;
// get the parent for this cat
currParentId =
g_categories->m_cats[currCatIndex].m_parentid;
}
// do not hash titles or summaries if "index article content
// only" parm is on
//if ( tr->eliminateMenus() ) continue;
// hash dmoz title
hi.m_prefix = NULL;
// call this DMOZ title as regular title i guess
hi.m_hashGroup = HASHGROUP_TITLE;
// hash the DMOZ title
hashString ( titlePtr , gbstrlen(titlePtr), &hi );
// next title
titlePtr += gbstrlen(titlePtr) + 1;
// hash DMOZ summary
hi.m_prefix = NULL;
// call this DMOZ summary as body i guess
hi.m_hashGroup = HASHGROUP_BODY;
// hash the DMOZ summary
hashString ( sumPtr , gbstrlen(sumPtr), &hi );
// next summary
sumPtr += gbstrlen(sumPtr) + 1;
}
int32_t numIndCatIds = size_indCatIds / 4;
int32_t *indCatIds = (int32_t *)ptr_indCatIds;
// go through the INDIRECT catIds and hash them
for (int32_t i = 0 ; i < numIndCatIds; i++) {
// write the catid as a string
sprintf(buf, "%"UINT32"", (uint32_t)indCatIds[i]);
// use prefix
hi.m_prefix = "gbicatid";
hi.m_hashGroup = HASHGROUP_INTAG;
// hash it
hashString ( buf , gbstrlen(buf), &hi );
// we also want to hash the parents
int32_t currCatId = indCatIds[i];
int32_t currParentId = indCatIds[i];
int32_t currCatIndex;
// loop to the Top, Top = 1
while (currCatId > 1) {
// hash the parent
sprintf(buf, "%"UINT32"", (uint32_t)currParentId);
// new prefix
hi.m_prefix = "gbipcatid";
// hash it
hashString ( buf , gbstrlen(buf), &hi );
// next cat
currCatId = currParentId;
// get the index for this cat
currCatIndex = g_categories->getIndexFromId(currCatId);
if ( currCatIndex <= 0 ) break;
// get the parent for this cat
currParentId =
g_categories->m_cats[currCatIndex].m_parentid;
}
}
return true;
}
bool XmlDoc::hashLanguage ( HashTableX *tt ) {
setStatus ( "hashing language" );
int32_t langId = (int32_t)*getLangId();
char s[32]; // numeric langid
int32_t slen = sprintf(s, "%"INT32"", langId );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gblang";
if ( ! hashString ( s, slen, &hi ) ) return false;
// try lang abbreviation
sprintf(s , "%s ", getLangAbbr(langId) );
// note: slen is deliberately NOT updated here. we went back to this
// "broken way" (hashing with the stale numeric-langid length) to fix a
// parsing consistency bug, and hashLanguageString() below hashes the
// abbreviation with the correct length instead
//sprintf(s , "%s ", getLangAbbr(langId) );
if ( ! hashString ( s, slen, &hi ) ) return false;
return true;
}
bool XmlDoc::hashLanguageString ( HashTableX *tt ) {
setStatus ( "hashing language string" );
int32_t langId = (int32_t)*getLangId();
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gblang";
// try lang abbreviation
char s[32];
int32_t slen = sprintf(s , "%s ", getLangAbbr(langId) );
// go back to broken way to try to fix parsing consistency bug
if ( ! hashString ( s, slen, &hi ) ) return false;
return true;
}
bool XmlDoc::hashCountry ( HashTableX *tt ) {
setStatus ( "hashing country" );
//uint16_t *cids = getCountryIds();
//if ( ! cids ) return true;
//if ( cids == (uint16_t *)-1 ) return false;
uint16_t *cid = getCountryId();
if ( ! cid || cid == (uint16_t *)-1 ) return false;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbcountry";
for ( int32_t i = 0 ; i < 1 ; i++ ) {
// get the ith country id
//int32_t cid = cids[i];
// convert it
char buf[32];
int32_t blen = sprintf(buf,"%s", g_countryCode.getAbbr(*cid) );
// hash it
if ( ! hashString ( buf, blen, &hi ) ) return false;
}
// all done
return true;
}
bool XmlDoc::hashSiteNumInlinks ( HashTableX *tt ) {
setStatus ( "hashing site num inlinks" );
char s[32];
int32_t slen = sprintf(s, "%"INT32"", (int32_t)*getSiteNumInlinks() );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbsitenuminlinks";
// hack test
// slen = sprintf(s,"%"UINT32"",
// ((uint32_t)m_firstUrl.getUrlHash32()) % 1000);
// log("xmldoc: sitenuminlinks for %s is %s",m_firstUrl.getUrl(),s);
return hashString ( s, slen, &hi );
}
bool XmlDoc::hashCharset ( HashTableX *tt ) {
setStatus ( "hashing charset" );
char s[128]; // charset string
int32_t slen;
// hash the charset as a string
if ( ! get_charset_str(*getCharset()))
slen = sprintf(s, "unknown");
else
slen = sprintf(s, "%s", get_charset_str(*getCharset()));
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbcharset";
if ( ! hashString ( s,slen, &hi ) ) return false;
// hash charset as a number
slen = sprintf(s, "%d", *getCharset());
return hashString ( s,slen, &hi ) ;
}
// . only hash certain tags (single byte scores and ST_COMMENT)
// . do not hash clocks, ST_SITE, ST_COMMENT
// . term = gbtag:blog1 score=0-100
// . term = gbtag:blog2 score=0-100
// . term = gbtag:english1 score=0-100
// . term = gbtag:pagerank1 score=0-100, etc. ...
// . term = gbtagmeta:"this site"(special hashing,ST_META,score=qlty)
// . later we can support query like gbtag:english1>30
bool XmlDoc::hashTagRec ( HashTableX *tt ) {
setStatus ( "hashing tag rec" );
//char *field = "gbtag:";
//int32_t fieldlen = gbstrlen(field);
//bool retval = true;
// . this tag rec does not have the ST_SITE tag in it to save space
// . it does not have clocks either?
TagRec *gr = getTagRec();
// count occurrence of each tag id
//int16_t count [ LAST_TAG ];
//memset ( count , 0 , 2 * LAST_TAG );
// loop over all tags in the title rec
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
// breathe
QUICKPOLL(m_niceness);
// get id
int32_t type = tag->m_type;
// skip tags we are not supposed to index, like
// ST_CLOCK, etc. or anything with a dataSize not 1
if ( ! tag->isIndexable() ) continue;
// hash these metas below
//if ( type == ST_META ) continue;
//if ( tag->isType("meta") ) continue;
// only single byters. this should have been covered by the
// isIndexable() function.
//if ( tag->getTagDataSize() != 1 ) continue;
// get the name
char *str = getTagStrFromType ( type );
// get data size
//uint8_t *data = (uint8_t *)tag->getTagData();
// make it a string
//char dataStr[6];
//sprintf ( dataStr , "%"INT32"",(int32_t)*data );
// skip if has non numbers
//bool num = true;
//for ( int32_t i = 0 ; i < tag->getTagDataSize() ; i++ )
// if ( ! is_digit(tag->getTagData()[i]) ) num = false;
// skip if it has more than just digits, we are not indexing
// strings at this point
//if ( ! num ) continue;
// point to it, should be a NULL terminated string
char *dataStr = tag->getTagData();
// skip if number is too big
//int32_t val = atol ( dataStr );
// boost by one so we can index "0" score
//val++;
// we really only want to index scores from 0-255
//if ( val > 255 ) continue;
// no negatives
//if ( val <= 0 ) continue;
// count occurrence
//count [ type ]++;
// . make the term name to hash after the gbtag:
// . we want to hash "gbtag:english3" for example, for the
// ST_ENGLISH tag id.
char prefix[64];
// . do not include the count for the first occurrence
// . follows the gbruleset:36 convention
// . index gbtagspam:0 or gbtagspam:1, etc.!!!
//if ( count[type] == 1 )
sprintf ( prefix , "gbtag%s",str);
// assume that is good enough
//char *prefix = tmp;
// store prefix into m_wbuf so XmlDoc::print() works!
//if ( m_pbuf ) {
// int32_t tlen = gbstrlen(tmp);
// m_wbuf.safeMemcpy(tmp,tlen+1);
// prefix = m_wbuf.getBuf() - (tlen+1);
//}
//else
// sprintf ( tmp , "gbtag%s%"INT32"",str,(int32_t)count[type]);
// "unmap" it so when it is hashed it will have the correct
// 8-bit score. IndexList.cpp will convert it back to 8 bits
// in IndexList::set(table), which sets our termlist from
// this "table".
//int32_t score = score8to32 ( val );
// we already incorporate the score as a string when we hash
// gbtagtagname:tagvalue so why repeat it?
//int32_t score = 1;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = prefix;
hi.m_hashGroup = HASHGROUP_INTAG;
// meta is special now
if ( tag->isType("meta") ) {
hi.m_prefix = NULL;
}
// hash it. like "gbtagenglish:1" with a score of 1, etc.
// or "gbtagspam:33" with a score of 33. this would also
// hash gbtagclock:0xfe442211 type things as well.
int32_t dlen = gbstrlen(dataStr);
if ( ! hashString ( dataStr,dlen,&hi ) ) return false;
}
return true;
}
bool XmlDoc::hashPermalink ( HashTableX *tt ) {
setStatus ( "hashing is permalink" );
// put a colon in there so it can't be faked using a meta tag.
char *s = "0";
if ( *getIsPermalink() ) s = "1";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbpermalink";
return hashString ( s,1,&hi );
}
//hash the tag pair vector, the gigabit vector and the sample vector
bool XmlDoc::hashVectors ( HashTableX *tt ) {
setStatus ( "hashing vectors" );
int32_t score = *getSiteNumInlinks8() * 256;
if ( score <= 0 ) score = 1;
char buf[32];
uint32_t h;
//char *field;
//char *descr;
//h = m_tagVector.getVectorHash();
uint32_t tph = *getTagPairHash32();
int32_t blen = sprintf(buf,"%"UINT32"", tph);
//field = "gbtagvector";
//descr = "tag vector hash";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbtagvector";
hi.m_desc = "tag vector hash";
hi.m_shardByTermId = true;
// this returns false on failure
if ( ! hashString ( buf,blen, &hi ) ) return false;
h = *getGigabitVectorScorelessHash();
blen = sprintf(buf,"%"UINT32"",(uint32_t)h);
// update hash parms
hi.m_prefix = "gbgigabitvector";
hi.m_desc = "gigabit vector hash";
// this returns false on failure
if ( ! hashString ( buf,blen,&hi) ) return false;
// . dup checking uses the two hashes above, not this hash!!! MDW
// . i think this vector is just used to see if the page changed
// significantly since last spidering
// . it is used by getPercentChanged() and by Dates.cpp
// . sanity check
//if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; }
//int32_t *pc = m_pageSampleVec;
//h = hash32((char *)m_pageSampleVec, SAMPLE_VECTOR_SIZE);
//blen = sprintf(buf,"%"UINT32"",(int32_t unsigned int)h);
//field = "gbsamplevector";
//descr = "sample vector hash";
// this returns false on failure
//if ( ! hashString ( tt,buf,blen,score,field,descr) )
// return false;
// . hash combined for Dup Detection
// . must match XmlDoc::getDupList ( );
//uint64_t h1 = m_tagVector.getVectorHash();
//uint64_t h2 = getGigabitVectorScorelessHash(gigabitVec);
//uint64_t h64 = hash64 ( h1 , h2 );
// take this out for now
/*
uint64_t *dh = getDupHash ( );
blen = sprintf(buf,"%"UINT64"", *dh );//h64);
//field = "gbduphash";
//descr = "dup vector hash";
// update hash parms
hi.m_prefix = "gbduphash";
hi.m_desc = "dup vector hash";
// this returns false on failure
if ( ! hashString ( buf,blen,&hi ) ) return false;
*/
// hash the wikipedia docids we match
if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; }
for ( int32_t i = 0 ; i < size_wikiDocIds/8 ; i++ ) {
blen = sprintf(buf,"%"UINT64"",ptr_wikiDocIds[i]);
// convert to int32_t
//int32_t convScore = (int32_t)ptr_wikiScores[i];
// get score
//uint32_t ws = score8to32 ( convScore );
// update hash parms
hi.m_prefix = "gbwikidocid";
hi.m_desc = "wiki docid";
hi.m_hashGroup = HASHGROUP_INTAG;
// this returns false on failure
if ( ! hashString ( buf,blen,&hi ) ) return false;
}
return true;
}
bool XmlDoc::hashAds ( HashTableX *tt ) {
setStatus ( "hashing ad ids" );
for(int32_t i = 0; i < size_adVector / 8 ; i++) {
int32_t score = *getSiteNumInlinks8() * 256;
if ( score <= 0 ) score = 1;
char buf[128];
char *field;
char *descr;
//buflen = snprintf(buf,128,"%s-%s",
// m_adProvider[i],m_adClient[i]);
snprintf(buf,128,"%"UINT64"",ptr_adVector[i] );
int32_t bufLen = gbstrlen(buf);
field = "gbad";
descr = "ad provider and id";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbad";
hi.m_desc = "ad provider and id";
//log(LOG_WARN, "build: url %s indexing ad termid %s:%s",
// getFirstUrl()->getUrl(), field, buf);
//this returns false on failure
if ( ! hashString ( buf,bufLen,&hi ) ) return false;
}
return true;
}
Url *XmlDoc::getBaseUrl ( ) {
if ( m_baseUrlValid ) return &m_baseUrl;
// need this
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Url *)xml;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (Url *)cu;
// no longer set addWWW to true since tmblr.co has an IP but
// www.tmblr.co does not
m_baseUrl.set ( cu , false ); // addWWW = true
// look for base url
for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) {
// 12 is the <base href> tag id
if ( xml->getNodeId ( i ) != TAG_BASE ) continue;
// get the href field of this base tag
int32_t linkLen;
char *link = (char *) xml->getString ( i, "href", &linkLen );
// skip if not valid
if ( ! link || linkLen == 0 ) continue;
// set base to it (addWWW is now false, per the note above)
m_baseUrl.set(link, linkLen, false);//true);
break;
}
// fix invalid <base href="/" target="_self"/> tag
if ( m_baseUrl.getHostLen () <= 0 || m_baseUrl.getDomainLen() <= 0 )
m_baseUrl.set ( cu , false );
m_baseUrlValid = true;
return &m_baseUrl;
}
// hash gbhasthumbnail:0|1
bool XmlDoc::hashImageStuff ( HashTableX *tt ) {
setStatus ("hashing image stuff");
char *val = "0";
char **td = getThumbnailData();
if ( *td ) val = "1";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbhasthumbnail";
hi.m_desc = "has a thumbnail";
// this returns false on failure
if ( ! hashString ( val,1,&hi ) ) return false;
return true;
}
// returns false and sets g_errno on error
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {
setStatus ("hashing isadult");
char *ia = getIsAdult();
// this should not block or return error! should have been
// set in prepareToMakeTitleRec() before hashAll() was called!
if ( ! ia || ia == (void *)-1 ) {char *xx=NULL;*xx=0; }
// index gbisadult:1 if adult or gbisadult:0 if not
char *val;
if ( *ia ) val = "1";
else val = "0";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbisadult";
hi.m_desc = "is document adult content";
// this returns false on failure
if ( ! hashString ( val,1,&hi ) ) return false;
return true;
}
// hash destination urls for embedded gb search boxes
bool XmlDoc::hashSubmitUrls ( HashTableX *tt ) {
setStatus ( "hashing submit urls" );
Url *baseUrl = getBaseUrl();
if ( ! baseUrl || baseUrl == (Url *)-1) { char*xx=NULL;*xx=0;}
for ( int32_t i = 0 ; i < m_xml.getNumNodes() ; i++ ) {
// Find forms
if ( m_xml.getNodeId(i) != TAG_FORM ) continue;
if ( m_xml.isBackTag(i) ) continue;
int32_t score = *getSiteNumInlinks8() * 256;
if ( score <= 0 ) score = 1;
int32_t len;
char *s = m_xml.getString ( i , "action" , &len );
if (!s || len == 0) continue;
Url url; url.set(baseUrl, s, len, true);
char *buf = url.getUrl();
int32_t blen = url.getUrlLen();
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbsubmiturl";
hi.m_desc = "submit url for form";
// this returns false on failure
if ( ! hashString ( buf,blen,&hi ) ) return false;
}
return true;
}
//
// STUFF IMPORTED FROM INDEXLIST.CPP
//
// we also assume all scores are above 256
uint8_t score32to8 ( uint32_t score ) {
// a zero score maps to 0 now (we no longer bump it to 1 here; score
// is unsigned, so <= 0 just means == 0)
if ( score <= 0 ) return (unsigned char) 0;
// extremely large scores need an adjustment to avoid wrapping
if ( score < (uint32_t)0xffffffff - 128 )
score += 128;
// scores are multiplied by 256 to preserve fractions, so undo that
score /= 256;
// ensure score is > 0
if ( score <= 0 ) return (unsigned char) 1;
// if score < 128 return it now
if ( score < 128 ) return (unsigned char) score;
// now shrink it so it's now from 1 upwards
score -= 127;
// . take NATURAL log of score now
// . PROBLEM: for low scores logscore may increase by close to 1.0
// for a score increase of 1.0. and since s_maxscore is about 22.0
// we end up moving 1.0/22.0 of 128 total pts causing a jump of
// 2 or more score points!! oops!!! to fix, let's add 10 pts
// to the score
score += 10;
double logscore = ::log ( (double)score );
// now the max it can be
//double maxscore = ::log ( (double)(0x00ffffff - 127));
static double s_maxscore = -1.0;
static double s_minscore = -1.0;
if ( s_maxscore == -1.0 ) {
uint32_t max = ((0xffffffff + 0)/256) - 127 + 10;
uint32_t min = ( 128 ) - 127 + 10;
s_maxscore = ::log((double)max);
s_minscore = ::log((double)min);
// adjust
s_maxscore -= s_minscore;
}
// adjust it
logscore -= s_minscore;
// scale it into [0,127] (add .5 for rounding)
double scaled = (logscore* 127.0) / s_maxscore + .5;
// sanity check
if ( (unsigned char)scaled >= 128 ) { char *xx=NULL;*xx=0; }
// . go into the 8 bit score now
// . set the hi bit so they know we took its log
unsigned char score8 = (unsigned char)scaled | 128;
return score8;
}
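// . a rough worked example of the mapping above (these values follow
//   from the code and the s_scoreMap table below, shown only for
//   illustration):
//     score32to8(1)     == 1    (tiny scores clamp to 1)
//     score32to8(385)   == 2    ((385+128)/256 = 2, linear range)
//     score32to8(32385) == 127  (last value of the linear range)
//     score32to8(32641) == 128  (first log-compressed value, hi bit set)
// . so 8-bit scores 1-127 cover 32-bit scores up to ~32K linearly and
//   128-255 cover the rest logarithmically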
// for score8to32() below
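// . each entry j is (approximately) the smallest 32-bit score that
//   score32to8() maps to j, so score32to8(s_scoreMap[j]) == j for
//   j = 1..255 (see the commented-out generator and sanity test in
//   score8to32() below)
// . that makes score8to32(score32to8(x)) a lossy round trip that lands
//   at or a bit below the original x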
static uint32_t s_scoreMap[] = {
0UL,
1UL,
385UL,
641UL,
897UL,
1153UL,
1409UL,
1665UL,
1921UL,
2177UL,
2433UL,
2689UL,
2945UL,
3201UL,
3457UL,
3713UL,
3969UL,
4225UL,
4481UL,
4737UL,
4993UL,
5249UL,
5505UL,
5761UL,
6017UL,
6273UL,
6529UL,
6785UL,
7041UL,
7297UL,
7553UL,
7809UL,
8065UL,
8321UL,
8577UL,
8833UL,
9089UL,
9345UL,
9601UL,
9857UL,
10113UL,
10369UL,
10625UL,
10881UL,
11137UL,
11393UL,
11649UL,
11905UL,
12161UL,
12417UL,
12673UL,
12929UL,
13185UL,
13441UL,
13697UL,
13953UL,
14209UL,
14465UL,
14721UL,
14977UL,
15233UL,
15489UL,
15745UL,
16001UL,
16257UL,
16513UL,
16769UL,
17025UL,
17281UL,
17537UL,
17793UL,
18049UL,
18305UL,
18561UL,
18817UL,
19073UL,
19329UL,
19585UL,
19841UL,
20097UL,
20353UL,
20609UL,
20865UL,
21121UL,
21377UL,
21633UL,
21889UL,
22145UL,
22401UL,
22657UL,
22913UL,
23169UL,
23425UL,
23681UL,
23937UL,
24193UL,
24449UL,
24705UL,
24961UL,
25217UL,
25473UL,
25729UL,
25985UL,
26241UL,
26497UL,
26753UL,
27009UL,
27265UL,
27521UL,
27777UL,
28033UL,
28289UL,
28545UL,
28801UL,
29057UL,
29313UL,
29569UL,
29825UL,
30081UL,
30337UL,
30593UL,
30849UL,
31105UL,
31361UL,
31617UL,
31873UL,
32129UL,
32385UL,
32641UL,
32897UL,
33488UL,
33842UL,
34230UL,
34901UL,
35415UL,
35979UL,
36598UL,
37278UL,
38025UL,
39319UL,
40312UL,
41404UL,
43296UL,
44747UL,
46343UL,
48098UL,
51138UL,
53471UL,
56037UL,
58859UL,
61962UL,
65374UL,
71287UL,
75825UL,
80816UL,
86305UL,
92342UL,
98982UL,
110492UL,
119326UL,
129042UL,
139728UL,
151481UL,
171856UL,
187496UL,
204699UL,
223622UL,
244437UL,
267333UL,
307029UL,
337502UL,
371022UL,
407893UL,
448450UL,
493062UL,
570408UL,
629783UL,
695095UL,
766938UL,
845965UL,
982981UL,
1088163UL,
1203862UL,
1331130UL,
1471124UL,
1625117UL,
1892110UL,
2097072UL,
2322530UL,
2570533UL,
2843335UL,
3143416UL,
3663697UL,
4063102UL,
4502447UL,
4985726UL,
5517332UL,
6439034UL,
7146599UL,
7924919UL,
8781070UL,
9722836UL,
10758778UL,
12554901UL,
13933735UL,
15450451UL,
17118838UL,
18954063UL,
20972809UL,
24472927UL,
27159874UL,
30115514UL,
33366717UL,
36943040UL,
43143702UL,
47903786UL,
53139877UL,
58899576UL,
65235244UL,
72204478UL,
84287801UL,
93563849UL,
103767501UL,
114991518UL,
127337936UL,
140918995UL,
164465962UL,
182542348UL,
202426372UL,
224298798UL,
248358466UL,
290073346UL,
322096762UL,
357322519UL,
396070851UL,
438694015UL,
485579494UL,
566869982UL,
629274552UL,
697919578UL,
773429105UL,
856489583UL,
947856107UL,
1106268254UL,
1227877095UL,
1361646819UL,
1508793514UL,
1670654878UL,
1951291651UL,
2166729124UL,
2403710344UL,
2664389686UL,
2951136962UL,
3266558965UL,
3813440635UL,
4233267317UL
};
uint32_t score8to32 ( uint8_t score8 ) {
/*
int32_t test = score32to8((uint32_t)0xffffffff);
static bool s_set = false;
if ( ! s_set ) {
s_set = true;
uint8_t lasts = 0;
int32_t step = 128;
int64_t start = gettimeofdayInMilliseconds();
for ( uint64_t i=1 ; i<(uint32_t)0xffffffff ; i+=step) {
// get the score
uint8_t s = score32to8(i);
// print it out now
if ( s != lasts ) {
fprintf(stderr,"\t%"UINT32"UL,\n",i);
}
// if no change, skip it
if (lasts != 0 && s == lasts ) {
if ( s > 128 )
step = (int32_t)((float)step * 1.1);
continue;
}
// otherwise set it
s_scoreMap[s] = i;
// reset
lasts = s;
}
// sanity test
for ( int32_t j = 1 ; j < 256 ; j++ ) {
uint32_t big = s_scoreMap[j];
if ( score32to8(big) != j ) { char *xx=NULL;*xx=0;}
}
int64_t end = gettimeofdayInMilliseconds();
logf(LOG_DEBUG,
"gb: took %"INT64" ms to build score table.",
end-start);
}
// sanity test
static bool s_set = false;
if ( ! s_set ) {
for ( int32_t j = 1 ; j < 256 ; j++ ) {
uint32_t big = s_scoreMap[j];
uint8_t tt;
tt = score32to8(big);
if ( tt != j ) { char *xx=NULL;*xx=0;}
}
s_set = true;
}
*/
return(s_scoreMap[score8]);
}
////////////////////////////////////////////////////////////
//
// Summary/Title generation for Msg20
//
////////////////////////////////////////////////////////////
void XmlDoc::set20 ( Msg20Request *req ) {
// clear it all out
reset();
// this too
m_reply.reset();
m_pbuf = NULL;//pbuf;
m_niceness = req->m_niceness;
// remember this
m_req = req;
// and this!
//m_coll = req->ptr_coll;
//setCollNum ( req->ptr_coll );
m_collnum = req->m_collnum;
m_collnumValid = true;
// make this stuff valid
if ( m_req->m_docId > 0 ) {
m_docId = m_req->m_docId;
m_docIdValid = true;
}
// set url too if we should
if ( m_req->size_ubuf > 1 )
setFirstUrl ( m_req->ptr_ubuf , false );
}
#define MAX_LINK_TEXT_LEN 512
#define MAX_RSSITEM_SIZE 30000
void getMsg20ReplyWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// return if it blocked
if ( THIS->getMsg20Reply ( ) == (void *)-1 ) return;
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
// . returns NULL with g_errno set on error
// . returns -1 if blocked
Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// return it right away if valid
if ( m_replyValid ) return &m_reply;
// . internal callback
// . so if any of the functions we end up calling directly or
// indirectly block, this callback will be called
if ( ! m_masterLoop ) {
m_masterLoop = getMsg20ReplyWrapper;
m_masterState = this;
}
// used by Msg20.cpp to time this XmlDoc::getMsg20Reply() function
if ( ! m_startTimeValid && isClockInSync() ) {
m_startTime = gettimeofdayInMilliseconds();
m_startTimeValid = true;
}
// caller should have the callback set
if ( ! m_callback1 && ! m_callback2 ) { char *xx=NULL;*xx=0; }
//char safeStack[100000];
//safeStack[0] = 0;
//safeStack[90000] = 0;
// shortcut
Msg20Reply *reply = &m_reply;
m_niceness = m_req->m_niceness;
m_collnum = m_req->m_collnum;//cr->m_collnum;
m_collnumValid = true;
//char *coll = m_req->ptr_coll;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) { g_errno = ENOCOLLREC; return NULL; }
//CollectionRec *cr = getCollRec();
//if ( ! cr ) return NULL;
// set this important member var
//if (!cr ) cr=g_collectiondb.getRec(cr->m_coll,gbstrlen(cr->m_coll));
// return NULL with g_errno set on error
//if ( ! cr ) return NULL;
// . cache it for one hour
// . this will set our ptr_ and size_ member vars
char **otr = getOldTitleRec ( );
if ( ! otr || otr == (void *)-1 ) return (Msg20Reply *)otr;
// must have a title rec in titledb
if ( ! *otr ) { g_errno = ENOTFOUND; return NULL; }
// sanity
if ( *otr != m_oldTitleRec ) { char *xx=NULL;*xx=0; }
// what is this?
int32_t maxSize = 0;
// . set our ptr_ and size_ member vars from it after uncompressing
// . returns false and sets g_errno on error
if ( ! m_setTr ) {
// . this completely resets us
// . this returns false with g_errno set on error
bool status = set2( *otr, maxSize, cr->m_coll, NULL,
m_niceness);
// sanity check
if ( ! status && ! g_errno ) { char *xx=NULL;*xx=0; }
// if there was an error, g_errno should be set.
if ( ! status ) return NULL;
m_setTr = true;
}
// breathe
QUICKPOLL(m_niceness);
// init
reply->m_nextMerged = NULL;
reply->m_collnum = m_collnum;
// MsgE uses this one
if ( m_req->m_getTitleRec ) {
// this is the original compressed titleRec, preceeded
// by key and dataSize and followed by the data
reply-> ptr_tr = m_oldTitleRec;
reply->size_tr = m_oldTitleRecSize;
m_replyValid = true;
return reply;
}
// if they provided a query with gbfacet*: terms then we have
// to get those facet values.
if ( ! m_gotFacets ) {
// only do this once
m_gotFacets = true;
// get facet term
char *qs = m_req->ptr_qbuf;
facetPrintLoop:
for ( ; qs && *qs ; qs++ ) {
if ( qs[0] != 'g' ) continue;
if ( qs[1] != 'b' ) continue;
if ( qs[2] != 'f' ) continue;
if ( strncasecmp(qs,"gbfacet",7) ) continue;
qs += 7;
// gbfacetstr: gbfacetint: gbfacetfloat:
if ( strncasecmp(qs,"str:" ,4) == 0 ) qs += 4;
else if ( strncasecmp(qs,"int:" ,4) == 0 ) qs += 4;
else if ( strncasecmp(qs,"float:",6) == 0 ) qs += 6;
else continue;
break;
}
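// example (the field name here is purely illustrative): for a query
// containing "gbfacetstr:color", qs now points at "color"; if no
// gbfacet*: term was found, qs is NULL or points at the terminating
// \0 and the block below is skipped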
// if we had a facet, get the values it has in the doc
if ( qs && *qs ) {
// need this for storeFacetValues() if we are json
if ( m_contentType == CT_JSON ||
// spider status docs are really json
m_contentType == CT_STATUS ) {
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1)
return (Msg20Reply *)jp;
}
if ( m_contentType == CT_HTML ||
m_contentType == CT_XML ) {
Xml *xml = getXml();
if ( ! xml || xml==(void *)-1)
return (Msg20Reply *)xml;
}
// find end of it
char *e = qs;
for ( ; *e && ! is_wspace_a(*e) ; e++ );
// tmp null it
char c = *e; *e = '\0';
// this is zero if unspecified
FacetValHash_t fvh = m_req->m_facetValHash;
// . this will store facetField/facetValue pairs
// . stores into safebuf, m_tmpBuf2
// . it will terminate all stored strings with \0
// . we check meta tags for html docs
// . otherwise we check xml/json doc fields
// . returns false with g_errno set on error
bool ret = storeFacetValues ( qs , &m_tmpBuf2 , fvh ) ;
// revert the \0
*e = c;
// return NULL with g_errno set on error
if ( ! ret ) return NULL;
// advance
qs = e;
// do another one
goto facetPrintLoop;
}
// assign
reply-> ptr_facetBuf = m_tmpBuf2.getBufStart();
reply->size_facetBuf = m_tmpBuf2.length();
}
if ( m_req->m_justGetFacets ) {
m_replyValid = true;
return reply;
}
if ( m_req->m_getTermListBuf ) {
// ensure content is recycled from title rec
m_recycleContent = true;
//xd->m_recycleLinkInfo = true;
// only get posdb keys really for this stuff
m_useTitledb = false;
m_useTagdb = false;
m_useClusterdb = false;
m_useSpiderdb = false;
m_useLinkdb = false;
// time it
if ( m_tlbufTimer == 0 )
m_tlbufTimer = gettimeofdayInMilliseconds();
// . shit limit content for speed!!!
// . this is for getting matching queries/relatedqueries
// anyway, so should be ok
if ( size_utf8Content > 150000 ) {
char *p = ptr_utf8Content + 150000 - 1;
char *pstart = ptr_utf8Content;
// back up until we hit punct
for ( ; p > pstart ; p-- )
if ( is_punct_utf8(p) ) break;
// set new size then
*p = '\0';
size_utf8Content = p - pstart + 1;
}
// hack: should be sorted by lower 32bits of termids
// so handleRequest8e does not have to sort before doing
// its query matching algo with queries in g_qbuf.
// but these termlists are really mostly used for doing
// the gbdocid:|xxxx queries in handleRequest8e.
SafeBuf *tbuf = getTermListBuf();
if ( ! tbuf || tbuf == (void *)-1 ) return (Msg20Reply *)tbuf;
SafeBuf *tibuf = getTermId32Buf();
if ( ! tibuf || tibuf == (void *)-1)return (Msg20Reply *)tibuf;
// time it
int64_t took = gettimeofdayInMilliseconds() - m_tlbufTimer;
log("seo: tlistbuf gen took %"INT64" ms for docid %"INT64"",
took,m_docId);
// just that
reply-> ptr_tlistBuf = tbuf->getBufStart();
reply->size_tlistBuf = tbuf->length();
reply-> ptr_tiBuf = tibuf->getBufStart();
reply->size_tiBuf = tibuf->length();
m_replyValid = true;
return reply;
}
// lookup the tagdb rec fresh if setting for a summary. that way we
// can see if it is banned or not. but for getting m_getTermListBuf
// and stuff above, skip the tagrec lookup!
// save some time when SPIDERING/BUILDING by skipping fresh
// tagdb lookup and using tags in titlerec
if ( m_req && ! m_req->m_getLinkText && ! m_checkedUrlFilters )
m_tagRecDataValid = false;
// set and validate member vars
//if ( ! m_setFromTitleRec )
// // return NULL with g_errno set on error
// if ( ! set ( tr , NULL , m_niceness ) ) return NULL;
// if shard responsible for tagrec is dead, then
// just recycle!
if ( m_req && ! m_checkedUrlFilters && ! m_tagRecDataValid ) {
char *site = getSite();
TAGDB_KEY tk1 = g_tagdb.makeStartKey ( site );
TAGDB_KEY tk2 = g_tagdb.makeDomainStartKey ( &m_firstUrl );
uint32_t shardNum1 = g_hostdb.getShardNum(RDB_TAGDB,&tk1);
uint32_t shardNum2 = g_hostdb.getShardNum(RDB_TAGDB,&tk2);
// shardnum1 and shardnum2 are often different!
// log("db: s1=%i s2=%i",(int)shardNum1,(int)shardNum2);
if ( g_hostdb.isShardDead ( shardNum1 ) ) {
log("query: skipping tagrec lookup for dead shard "
"# %"INT32""
,shardNum1);
m_tagRecDataValid = true;
}
if ( g_hostdb.isShardDead ( shardNum2 ) && m_firstUrlValid ) {
log("query: skipping tagrec lookup for dead shard "
"# %"INT32""
,shardNum2);
m_tagRecDataValid = true;
}
}
// if we are showing sites that have been banned in tagdb, we dont
// have to do a tagdb lookup. that should speed things up.
TagRec *gr = NULL;
if ( cr && cr->m_doTagdbLookups ) {
gr = getTagRec();
if ( ! gr || gr == (void *)-1 ) return (Msg20Reply *)gr;
}
//reply-> ptr_tagRec = (char *)gr;
//reply->size_tagRec = gr->getSize();
// we use this instead of nowGlobal
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// this should be valid, it is stored in title rec
if ( m_contentHash32Valid ) reply->m_contentHash32 = m_contentHash32;
else reply->m_contentHash32 = 0;
// if this page is potential spam, toss it!
//char *isSpam = getIsSpam();
//if ( ! isSpam || isSpam == (char *)-1 ) return (Msg20Reply *)isSpam;
if ( ! m_checkedUrlFilters ) {
// do it
//int32_t *rn = getRegExpNum2(-1);
//if ( ! rn || rn == (int32_t *)-1 ) return (Msg20Reply *)rn;
// do not re-check
m_checkedUrlFilters = true;
// a non-www url?
/*
now we allow domain-only urls in the index, so this is
hurting us...
if ( ! m_req->m_getLinkText ) {
Url tmp;
tmp.set ( ptr_firstUrl );
if ( tmp.getHostLen() == tmp.getDomainLen() ) {
// set m_errno
reply->m_errno = EDOCFILTERED;
// tmp debug
log("xmldoc: filtering non www url %s",
ptr_firstUrl);
// and this
reply->m_isFiltered = true;
// give back the url at least
reply->ptr_ubuf = getFirstUrl()->getUrl();
reply->size_ubuf =getFirstUrl()->getUrlLen()+1;
// validate
m_replyValid = true;
// and return
return reply;
}
}
*/
// get this
//time_t nowGlobal = getTimeGlobal();
// get this
SpiderRequest sreq;
SpiderReply srep;
setSpiderReqForMsg20 ( &sreq , &srep );//, *isSpam );
int32_t spideredTime = getSpideredTime();
int32_t langIdArg = -1;
if ( m_langIdValid ) langIdArg = m_langId;
// get it
int32_t ufn;
ufn=::getUrlFilterNum(&sreq,&srep,spideredTime,true,
m_niceness,cr,
false, // isOutlink?
NULL ,
langIdArg);
// sanity check
if ( ufn < 0 ) {
log("msg20: bad url filter for url %s", sreq.m_url);
}
// save it
reply->m_urlFilterNum = ufn;
// get spider priority if ufn is valid
int32_t pr = 0;
//if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn];
if ( ufn >= 0 && cr->m_forceDelete[ufn] ) pr = -3; // guard the ufn<0 case logged above
// this is an automatic ban!
if ( gr && gr->getLong("manualban",0))
pr=-3;//SPIDER_PRIORITY_BANNED;
// is it banned
if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2
// set m_errno
reply->m_errno = EDOCBANNED;
// and this
reply->m_isBanned = true;
}
//
// for now always allow it until we can fix this better
// we probably should assume NOT filtered unless it matches
// a string match only url filter... but at least we will
// allow it to match "BANNED" filters for now...
//
pr = 0;
// if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3
// // set m_errno
// reply->m_errno = EDOCFILTERED;
// // and this
// reply->m_isFiltered = true;
// }
// bail out now if banned/filtered and we are not showing banned results
if ( reply->m_errno && ! m_req->m_showBanned ) {
// give back the url at least
reply->ptr_ubuf = getFirstUrl()->getUrl();
reply->size_ubuf = getFirstUrl()->getUrlLen() + 1;
m_replyValid = true;
return reply;
}
}
// breathe
QUICKPOLL ( m_niceness );
// a special hack for XmlDoc::getRecommendedLinksBuf() so we exclude
// links that link to the main url's site/domain as well as a
// competitor url (aka related docid)
Links *links = NULL;
if ( m_req->m_ourHostHash32 || m_req->m_ourDomHash32 ) {
links = getLinks();
if ( ! links || links==(Links *)-1) return (Msg20Reply *)links;
}
// breathe
QUICKPOLL ( m_niceness );
// truncate content length if we should
// this was hurting our linkdb lookups! do not do it for those!
/*
if ( size_utf8Content > cr->m_contentLenMaxForSummary &&
// fix for link text fetching!
! req->m_getLinkText ) {
logf(LOG_DEBUG,"summary: truncating doc of len %"INT32" to %"INT32" for "
"generating summary",
size_utf8Content,cr->m_contentLenMaxForSummary);
size_utf8Content = cr->m_contentLenMaxForSummary ;
// null term just in case
ptr_utf8Content[size_utf8Content-1] = '\0';
}
*/
// do they want a summary?
if ( m_req->m_numSummaryLines>0 && ! reply->ptr_displaySum ) {
char *hsum = getHighlightedSummary();
if ( ! hsum || hsum == (void *)-1 ) return (Msg20Reply *)hsum;
//Summary *s = getSummary();
//if ( ! s || s == (void *)-1 ) return (Msg20Reply *)s;
//int32_t sumLen = m_finalSummaryBuf.length();
// is it size and not length?
int32_t hsumLen = 0;
// seems like it can return 0x01 if none...
if ( hsum == (char *)0x01 ) hsum = NULL;
// get len. this is the HIGHLIGHTED summary so it is ok.
if ( hsum ) hsumLen = gbstrlen(hsum);
// must be \0 terminated. not any more, it can be a subset
// of a larger summary used for deduping
if ( hsumLen > 0 && hsum[hsumLen] ) { char *xx=NULL;*xx=0; }
// assume size is 0
//int32_t sumSize = 0;
// include the \0 in size
//if ( sum ) sumSize = sumLen + 1;
// do not get any more than "me" lines/excerpts of summary
//int32_t max = m_req->m_numSummaryLines;
// grab stuff from it!
//reply->m_proximityScore = s->getProximityScore();
reply-> ptr_displaySum = hsum;//s->getSummary();
reply->size_displaySum = hsumLen+1;//sumSize;//s->getSummaryLen
// this is unhighlighted for deduping, and it might be longer
// . seems like we are not using this for deduping but using
// the gigabit vector in Msg40.cpp, so take out for now
//reply-> ptr_dedupSum = s->m_summary;
//reply->size_dedupSum = s->m_summaryLen+1;
//if ( s->m_summaryLen == 0 ) reply->size_dedupSum = 0;
//reply->m_diversity = s->getDiversity();
}
reply->m_numAlnumWords = 0;
if ( m_wordsValid )
reply->m_numAlnumWords = m_words.m_numAlnumWords;
// . we filter out search results that do not have all the query terms
// . Matches.cpp checks the link text, dmoz, etc. for all query terms
// . it must get into the results from indexdb corruption?
// . this filtering method is/was known as the "BIG HACK"
// . We also make sure that matches aren't based on
// . "anomalous" link text, where a doc has so many link texts
// . that most common dictionary terms appear in or around
// . a link to the site.
if ( m_req->size_qbuf > 1 ) {
Matches *mm = getMatches();
int32_t numInlinks = getLinkInfo1()->getNumLinkTexts( );
reply->m_hasAllQueryTerms = mm->docHasQueryTerms(numInlinks);
}
// breathe
QUICKPOLL ( m_niceness );
// copy the link info stuff?
if ( ! m_req->m_getLinkText ) {
reply->ptr_linkInfo = (char *)ptr_linkInfo1;
reply->size_linkInfo = size_linkInfo1;
}
// breathe
QUICKPOLL ( m_niceness );
bool getThatTitle = true;
if ( m_req->m_titleMaxLen <= 0 ) getThatTitle = false;
if ( reply->ptr_tbuf ) getThatTitle = false;
// if steve's requesting the inlink summary we will want to get
// the title of each linker even if they are spammy!
// only get title here if NOT getting link text otherwise
// we only get it down below if not a spammy voter, because
// this sets the damn slow sections class
if ( m_req->m_getLinkText &&
! m_useSiteLinkBuf &&
! m_usePageLinkBuf &&
// m_pbuf is used by pageparser.cpp now, not the other two things
// above this.
! m_pbuf )
getThatTitle = false;
// if steve is getting the inlinks, bad and good, for displaying
// then get the title here now... otherwise, if we are just spidering
// and getting the inlinks, do not bother getting the title because
// the inlink might be linkspam... and we check down below...
if ( ! m_req->m_onlyNeedGoodInlinks )
getThatTitle = true;
// ... no more seo so stop it... disable this for sp
if ( m_req->m_getLinkText )
getThatTitle = false;
if ( getThatTitle ) {
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (Msg20Reply *)ti;
char *tit = ti->getTitle();
int32_t titLen = ti->getTitleLen();
reply-> ptr_tbuf = tit;
reply->size_tbuf = titLen + 1; // include \0
// sanity
if ( tit && tit[titLen] != '\0' ) { char *xx=NULL;*xx=0; }
if ( ! tit || titLen <= 0 ) {
reply->ptr_tbuf = NULL;
reply->size_tbuf = 0;
}
}
// this is not documented because i don't think it will be popular
if ( m_req->m_getHeaderTag ) {
SafeBuf *htb = getHeaderTagBuf();
if ( ! htb || htb == (SafeBuf *)-1 ) return (Msg20Reply *)htb;
// . it should be null terminated
// . actually now it is a \0 separated list of the first
// few h1 tags
// . we call SafeBuf::pushChar(0) to add each one
reply->ptr_htag = htb->getBufStart();
reply->size_htag = htb->getLength();
}
// breathe
QUICKPOLL ( m_niceness );
if ( m_req->m_getMatches && ! reply->ptr_mbuf ) {
MatchOffsets *mo = getMatchOffsets();
if ( ! mo || mo == (MatchOffsets *)-1) return (Msg20Reply *)mo;
reply-> ptr_mbuf = (char *)mo->m_matchOffsets;
reply->size_mbuf = mo->m_numMatches*4;
}
// breathe
QUICKPOLL ( m_niceness );
// get site
reply->ptr_site = ptr_site;
reply->size_site = size_site;
// assume unknown
reply->m_noArchive = 0;
// are we noarchive? only check this if not getting link text
if ( ! m_req->m_getLinkText ) {
char *na = getIsNoArchive();
if ( ! na || na == (char *)-1 ) return (Msg20Reply *)na;
reply->m_noArchive = *na;
}
// breathe
QUICKPOLL ( m_niceness );
int32_t nowUTC2 = m_req->m_nowUTC;
if ( m_req->m_clockSet ) nowUTC2 = m_req->m_clockSet;
// . summary vector for deduping
// . does not compute anything if we should not! (svSize will be 0)
if ( ! reply->ptr_vbuf &&
m_req->m_getSummaryVector &&
cr->m_percentSimilarSummary > 0 &&
cr->m_percentSimilarSummary < 100 ) {
int32_t *sv = getSummaryVector ( );
if ( ! sv || sv == (void *)-1 ) return (Msg20Reply *)sv;
reply-> ptr_vbuf = (char *)m_summaryVec;
reply->size_vbuf = m_summaryVecSize;
}
// breathe
QUICKPOLL ( m_niceness );
if ( m_req->m_numSummaryLines > 0 ) {
// turn off for now since we added this to posdb
uint8_t *sl = getSummaryLangId();
if ( ! sl || sl == (void *)-1 ) return (Msg20Reply *)sl;
reply->m_summaryLanguage = *sl;
}
// breathe
QUICKPOLL ( m_niceness );
// returns values of specified meta tags
if ( ! reply->ptr_dbuf && m_req->size_displayMetas > 1 ) {
int32_t dsize; char *d;
d = getDescriptionBuf(m_req->ptr_displayMetas,&dsize);
if ( ! d || d == (char *)-1 ) return (Msg20Reply *)d;
reply->ptr_dbuf = d;
reply->size_dbuf = dsize; // includes \0
}
// breathe
QUICKPOLL ( m_niceness );
// . sample buffer for doing gigabit generation
// . Msg40.cpp calls intersectGigabits on all these samples from
// all the Msg20Replies it gets in the search results
//if ( ! reply->ptr_gigabitQuery && m_req->m_bigSampleMaxLen > 0 ) {
if ( ! reply->ptr_gigabitSample && m_req->m_bigSampleMaxLen > 0 ) {
// before we got a chunk of text from the doc
SafeBuf *gsbuf = getSampleForGigabits();
if ( ! gsbuf||gsbuf ==(void *)-1) return (Msg20Reply *)gsbuf;
reply->ptr_gigabitSample = gsbuf->getBufStart();
reply->size_gigabitSample = gsbuf->length();
// . now we use the gigabit query!
// . this is really used to find out what wikipedia pages
// we match the best...
// . this also sets the vector
/*
char *gq = getGigabitQuery();
if ( ! gq || gq == (char *)-1) return (Msg20Reply *)gq;
reply-> ptr_gigabitQuery = m_gigabitQuery;
reply->size_gigabitQuery = gbstrlen(m_gigabitQuery)+1;
reply-> ptr_gigabitScores = ptr_gigabitScores;
reply->size_gigabitScores = size_gigabitScores;
*/
}
// get full image url. but not if we already have a thumbnail...
if ( ! reply->ptr_imgUrl&&!reply->ptr_imgData&&!m_req->m_getLinkText){
// && m_req->m_getImageUrl ) {
char **iu = getImageUrl();
if ( ! iu || iu == (char **)-1 ) return (Msg20Reply *)iu;
reply-> ptr_imgUrl = *iu;
reply->size_imgUrl = 0;
if ( *iu ) reply->size_imgUrl = gbstrlen(*iu)+1;
}
// get thumbnail image DATA
if ( ! reply->ptr_imgData && ! m_req->m_getLinkText ) {
// && m_req->m_getImageUrl ) {
reply-> ptr_imgData = ptr_imageData;
reply->size_imgData = size_imageData;
}
// . adids contained in the doc
// . get from title rec rather than generating
// . but we need to generate to store in titleRec at index time
// . they are 32 bits each
int64_t **avp = getAdVector();
if ( ! avp || avp == (void *)-1 ) return (Msg20Reply *)avp;
// get firstip
int32_t *fip = getFirstIp();
if ( ! fip || fip == (void *)-1 ) return (Msg20Reply *)fip;
//Url **redir = getRedirUrl();
//if ( ! redir || redir == (Url **)-1 ) return (Msg20Reply *)redir;
//int32_t redirSize = 0;
//if ( *redir ) redirSize = (*redir)->getUrlLen() + 1;
//char *ru = NULL;
//if ( *redir ) ru = (*redir)->getUrl();
char *ru = ptr_redirUrl;
int32_t rulen = 0;
if ( ru ) rulen = gbstrlen(ru)+1;
// . Msg25.cpp uses m_adIdHash for restricting voting
// . these are 64 bit termids hashes
reply-> ptr_gbAdIds = (char *)*avp;
// this size is in bytes and includes the \0
reply->size_gbAdIds = size_adVector;
// need full cached page of each search result?
// include it always for spider status docs.
if ( m_req->m_includeCachedCopy || m_contentType == CT_STATUS ) {
reply-> ptr_content = ptr_utf8Content;
reply->size_content = size_utf8Content;
}
// if ( m_req->m_getSectionVotingInfo && m_tmpBuf3.getCapacity() <=0) {
// Sections *ss = getSections();
// if ( ! ss || ss == (void *)-1) return (Msg20Reply *)ss;
// // will at least store a \0 in there, but will not count
// // as part of the m_tmpBuf.length()
// ss->printVotingInfoInJSON ( &m_tmpBuf3 );
// reply-> ptr_sectionVotingInfo = m_tmpBuf3.getBufStart();
// reply->size_sectionVotingInfo = m_tmpBuf3.length() + 1;
// }
// breathe
QUICKPOLL ( m_niceness );
// do they want to know if this doc has an outlink to a url
// that has the provided site and domain hash, Msg20Request::
// m_ourHostHash32 and m_ourDomHash32?
int32_t nl = 0;
if ( links ) nl = links->getNumLinks();
// scan all outlinks we have on this page
int32_t i ; for ( i = 0 ; i < nl ; i++ ) {
// get the normalized url
//char *url = links->getLinkPtr(i);
// get the site. this will not block or have an error.
int32_t hh32 = (int32_t)((uint32_t)links->getHostHash64(i));
if ( hh32 == m_req->m_ourHostHash32 ) break;
int32_t dh32 = links->getDomHash32(i);
if ( dh32 == m_req->m_ourDomHash32 ) break;
}
reply->m_hasLinkToOurDomOrHost = false;
if ( i < nl )
reply->m_hasLinkToOurDomOrHost = true;
// easy ones
reply->m_isPermalink = m_isPermalink;
reply->m_ip = m_ip;
reply->m_firstIp = *fip;
reply->m_domHash = getDomHash32();//domHash;
reply->m_docId = m_docId;
reply->m_urlHash48 = getFirstUrlHash48();
reply->m_contentLen = size_utf8Content;
reply->m_lastSpidered = getSpideredTime();//m_spideredTime;
reply->m_datedbDate = m_pubDate;
reply->m_firstIndexedDate = m_firstIndexedDate;
reply->m_firstSpidered = m_firstIndexedDate;
reply->m_contentType = m_contentType;
reply->m_hostHash = getHostHash32a();
//reply->m_contentHash = *getContentHash32();
reply->m_language = m_langId;
reply->m_country = *getCountryId();
//reply->m_hasAllQueryTerms = false;
reply->m_hopcount = m_hopCount;
reply->m_siteRank = getSiteRank();
reply->ptr_ubuf = getFirstUrl()->getUrl();
reply->ptr_rubuf = ru;
reply->ptr_catIds = ptr_catIds;
reply->ptr_indCatIds = ptr_indCatIds;
reply->ptr_dmozTitles = ptr_dmozTitles;
reply->ptr_dmozSumms = ptr_dmozSumms;
reply->ptr_dmozAnchors = ptr_dmozAnchors;
reply->ptr_metadataBuf = ptr_metadata;
reply->size_ubuf = getFirstUrl()->getUrlLen() + 1;
reply->size_rubuf = rulen;
reply->size_catIds = size_catIds;
reply->size_indCatIds = size_indCatIds;
reply->size_dmozTitles = size_dmozTitles;
reply->size_dmozSumms = size_dmozSumms;
reply->size_dmozAnchors = size_dmozAnchors;
reply->size_metadataBuf = size_metadata;
// breathe
QUICKPOLL( m_req->m_niceness );
/*
// truncate if necessary (buzz)
int32_t maxLen = 150000;
// truncate it?
bool trunc = true;
// not if getting link text
if ( req->m_getLinkText ) trunc = false;
// or outlinks
if ( req->m_getOutlinks ) trunc = false;
// or any niceness 1+ for that matter, that indicates a build operation
if ( req->m_niceness > 0 ) trunc = false;
// this is causing us to get EMISSINGQUERYTERMS errors!!!
trunc = false;
// MDW: shorten for speed test
//int32_t maxLen = 1000;
if ( trunc && contentLen > maxLen+1 ) {
contentLen = maxLen;
content [maxLen ] = '\0';
}
*/
// check the tag first
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; }
//Tag *tag1 = gr->getTag ("sitenuminlinks");
//Tag *tag2 = gr->getTag ("sitepop");
//int32_t sni = 0;
//int32_t spop = 0;
//if ( tag1 ) sni = atol(tag1->m_data);
//if ( tag2 ) spop = atol(tag2->m_data);
reply->m_siteNumInlinks = m_siteNumInlinks;
//reply->m_siteNumInlinksTotal = m_siteNumInlinksTotal;
//reply->m_siteNumUniqueIps = m_siteNumInlinksUniqueIp;
//reply->m_siteNumUniqueCBlocks = m_siteNumInlinksUniqueCBlock;
//reply->m_sitePop = m_sitePop;
// . get stuff from link info
// . this is so fast, just do it for all Msg20 requests
// . no! think about it -- this can be huge for pages like
// google.com!!!
LinkInfo *info1 = ptr_linkInfo1;
if ( info1 ) { // && m_req->m_getLinkInfo ) {
reply->m_pageNumInlinks = info1->m_totalInlinkingDocIds;
reply->m_pageNumGoodInlinks = info1->m_numGoodInlinks;
reply->m_pageNumUniqueIps = info1->m_numUniqueIps;
reply->m_pageNumUniqueCBlocks = info1->m_numUniqueCBlocks;
reply->m_pageInlinksLastUpdated = info1->m_lastUpdated;
//reply->m_pagePop = 0;//info1->m_pagePop;
//reply->m_siteNumInlinks = info1->m_siteNumInlinks;
//reply->m_sitePop = info1->m_sitePop;
}
// breathe
QUICKPOLL ( m_niceness );
// getLinkText is true if we are getting the anchor text for a
// supplied url as part of the SPIDER process..
// this was done by Msg23 before
if ( ! m_req->m_getLinkText ) {
m_replyValid = true;
return &m_reply;
}
// use the first url of the linker by default
Url *linker = &m_firstUrl;
// the base url, used for doing links: terms, is the final url,
// just in case there were any redirects
Url redir;
if ( ru ) {
redir.set ( ru );
linker = &redir;
}
// breathe
QUICKPOLL( m_niceness );
// . get score weight of link text
// . phase out the sitedb*.xml files
//int64_t x[] = {0,20,30,40,50,70,90,100}; qualities!
// map these siteNumInlinks (x) to a weight (y)
//int64_t x[] = {0,50,100,200,500,3000,10000,50000};
// these are the weights the link text will receive
//int64_t y[] = {10,30,2000,3000,4000,5000,6000,7000};
// sanity check
//if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// shortcut
//int32_t sni = m_siteNumInlinks;// *getSiteNumInlinks();
// get the final link text weight as a percentage
//int32_t ltw = getY ( m_siteNumInlinks , x , y , 8 );
// store the weight in the reply
//reply->m_linkTextScoreWeight = ltw;
//log(LOG_DEBUG,"build: got score weight of %"INT32" for sni=%"INT32"",
// (int32_t)reply->m_linkTextScoreWeight, m_siteNumInlinks);
// breathe
//QUICKPOLL( m_niceness );
// . we need the mid domain hash in addition to the ip domain because
// chat.yahoo.com has different ip domain than www.yahoo.com , ...
// and we don't want them both to be able to vote
// . the reply is zeroed out in call the reply->reset() above so
// if this is not yet set it will be 0
if ( reply->m_midDomHash == 0 ) {
char *m = linker->getMidDomain();
int32_t mlen = linker->getMidDomainLen();
reply->m_midDomHash = hash32 ( m , mlen );
}
// breathe
QUICKPOLL( m_niceness );
int64_t start = gettimeofdayInMilliseconds();
// if not set from above, set it here
if ( ! links ) links = getLinks ( true ); // do quick set?
if ( ! links || links == (Links *)-1 ) return (Msg20Reply *)links;
Pos *pos = getPos();
if ( ! pos || pos == (Pos *)-1 ) return (Msg20Reply *)pos;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Msg20Reply *)ww;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Msg20Reply *)xml;
//Sections *ss = getSections();
//if ( ! ss || ss == (void *)-1) return (Msg20Reply *)ss;
// . is this page a dynamic page?
// . like a guestbook, access log stats, etc.
// . we don't like to count such pages for links analysis because
// they can be spammed so easily
// . TODO: guestbooks and message boards typically contain cgi links
// can we use that to identify?
// . the coll size includes the \0
//CollectionRec *cr ;
//cr = g_collectiondb.getRec ( m_req->ptr_coll,m_req->size_coll-1);
// g_errno should be ENOCOLLREC
//if ( ! cr ) return NULL;
// . we want link text for this url, "linkee"
// . TODO: true --> add "www" to see if that fixes our problem
// i guess Links.cpp does that with the outlinks, so when
// Linkdb::fillList() uses Links.cpp, the outlinks have "www"
// prepended on them...
//Url linkee;
//linkee.set ( m_req->ptr_linkee , m_req->size_linkee );
// get a ptr to the link in the content. will point to the
// stuff in the href field of the anchor tag. used for seeing if
// we have bad links or not.
int32_t linkNode = -1;
int32_t linkNum = -1;
// . get associated link text from the linker's document for our "url"
// . only gets from FIRST link to us
// . TODO: allow more link text from better quality pages?
// . TODO: limit score based on link text length?
// . should always be NULL terminated
// . should not break in the middle of a word
// . this will return the item/entry if we are extracting from an
// rss/atom feed
char *rssItem = NULL;
int32_t rssItemLen = 0;
// store link text in here
//char linkTextBuf[MAX_LINK_TEXT_LEN];
//
// TODO: for getting siteinlinks just match the site in the url
	// not the full url... and maybe match the one with the shortest path.
//
// . get the link text
// . linkee might be a site if m_isSiteLinkInfo is true in which
// case we get the best inlink to that site, and linkee is
// something like blogspot.com/mary/ or some other site.
int32_t blen = links->getLinkText ( m_req->ptr_linkee ,//&linkee,
m_req->m_isSiteLinkInfo ,
m_linkTextBuf ,
MAX_LINK_TEXT_LEN-2 ,
&rssItem ,
&rssItemLen ,
&linkNode ,
&linkNum ,
m_niceness );
// . BUT this skips the news topic stuff too. bad?
// . THIS HAPPENED before because we were truncating the xml(see above)
if ( linkNode < 0 ) {
int64_t took = gettimeofdayInMilliseconds() - start;
if ( took > 100 )
log("build: took %"INT64" ms to get link text for "
"%s from linker %s",
took,
m_req->ptr_linkee,
m_firstUrl.m_url );
logf(LOG_DEBUG,"build: Got linknode = %"INT32" < 0. Cached "
"linker %s does not have outlink to %s like linkdb "
"says it should. page is probably too big and the "
"outlink is past our limit. contentLen=%"INT32". or "
"a sitehash collision, or an area tag link.",
linkNode,getFirstUrl()->getUrl(),m_req->ptr_linkee,
m_xml.getContentLen());
//g_errno = ECORRUPTDATA;
// do not let multicast forward to a twin! so use this instead
// of ECORRUTPDATA
g_errno = EBADENGINEER;
//char *xx=NULL;*xx=0;
return NULL;
}
// breathe
QUICKPOLL(m_niceness);
if ( ! verifyUtf8 ( m_linkTextBuf , blen ) ) {
log("xmldoc: bad OUT link text from url=%s for %s",
m_req->ptr_linkee,m_firstUrl.m_url);
m_linkTextBuf[0] = '\0';
blen = 0;
}
// verify for rss as well. seems like we end up coring because
// length/size is not in cahoots and [size-1] != '\0' sometimes
	if ( rssItem && ! verifyUtf8 ( rssItem , rssItemLen ) ) {
log("xmldoc: bad RSS ITEM text from url=%s for %s",
m_req->ptr_linkee,m_firstUrl.m_url);
rssItem[0] = '\0';
rssItemLen = 0;
}
// point to it, include the \0.
if ( blen > 0 ) {
reply->ptr_linkText = m_linkTextBuf;
// save the size into the reply, include the \0
reply->size_linkText = blen + 1;
// sanity check
if ( blen + 2 > MAX_LINK_TEXT_LEN ) { char *xx=NULL;*xx=0; }
// sanity check. null termination required.
if ( m_linkTextBuf[blen] ) { char *xx=NULL;*xx=0; }
}
// . the link we link to
// . important when getting site info because the link url
// can be different than the root url!
reply-> ptr_linkUrl = links->getLink (linkNum);
reply->size_linkUrl = links->getLinkLen(linkNum)+1;
// save the rss item in our state so we can point to it, include \0
//if(rssItemLen > MAX_RSSITEM_SIZE-2 ) rssItemLen = MAX_RSSITEM_SIZE-2;
//char rssItemBuf[MAX_RSSITEM_SIZE];
if ( rssItemLen > MAX_RSSITEM_SIZE )
rssItemLen = MAX_RSSITEM_SIZE;
if ( rssItemLen > 0) {
m_rssItemBuf.safeMemcpy ( rssItem , rssItemLen );
m_rssItemBuf.pushChar('\0');
// gbmemcpy ( rssItemBuf, rssItem , rssItemLen );
// // NULL terminate it
// rssItemBuf[rssItemLen] = 0;
}
// point to it, include the \0
if ( rssItemLen > 0 ) {
reply->ptr_rssItem = m_rssItemBuf.getBufStart();
reply->size_rssItem = m_rssItemBuf.getLength();
}
// breathe
QUICKPOLL( m_niceness );
if ( ! m_req->m_doLinkSpamCheck )
reply->m_isLinkSpam = false;
if ( m_req->m_doLinkSpamCheck ) {
// reset to NULL to avoid gbstrlen segfault
char *note = NULL;
// need this
if ( ! m_xmlValid ) { char *xx=NULL;*xx=0; }
// time it
//int64_t start = gettimeofdayInMilliseconds();
Url linkeeUrl;
linkeeUrl.set ( m_req->ptr_linkee );
// get it. does not block.
reply->m_isLinkSpam = ::isLinkSpam ( linker ,
m_ip ,
ptr_indCatIds ,
size_indCatIds / 4 ,
m_siteNumInlinks,
&m_xml,
links,
// if doc length more
// than 150k then consider
// it linkspam
// automatically so it
// can't vote
150000,//MAXDOCLEN//150000
&note ,
&linkeeUrl , // url ,
linkNode ,
cr->m_coll ,
m_niceness );
// store it
if ( note ) {
// include the \0
reply->ptr_note = note;
reply->size_note = gbstrlen(note)+1;
}
// log the reason why it is a log page
if ( reply->m_isLinkSpam )
log(LOG_DEBUG,"build: linker %s: %s.",
linker->getUrl(),note);
// sanity
if ( reply->m_isLinkSpam && ! note )
log("linkspam: missing note for d=%"INT64"!",m_docId);
// store times... nah, might have yielded cpu!
reply->m_timeLinkSpam = 0;
}
// breathe
QUICKPOLL(m_niceness);
// sanity check
if ( reply->ptr_rssItem &&
reply->size_rssItem>0 &&
reply->ptr_rssItem[reply->size_rssItem-1]!=0) {
char *xx=NULL;*xx=0; }
//log ("nogl=%"INT32"",(int32_t)m_req->m_onlyNeedGoodInlinks );
// . skip all this junk if we are a spammy voter
// . we get the title above in "getThatTitle"
if ( reply->m_isLinkSpam ) {
m_replyValid = true; return reply; }
// . this vector is set from a sample of the entire doc
// . it is used to dedup voters in Msg25.cpp
// . this has pretty much been replaced by vector2, it was
// also saying a doc was a dup if all its words were
// contained by another, like if it was a small subset, which
// wasn't the best behaviour.
// . yeah neighborhood text is much better and this is setting
// the slow sections class, so i took it out
getPageSampleVector ();
// must not block or error out. sanity check
if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; }
//st->m_v1.setPairHashes ( ww , -1 , m_niceness );
// breathe
QUICKPOLL( m_niceness );
//st->m_v2.setPairHashes ( ww,linkWordNum, m_niceness );
// . this vector is set from the text after the link text
// . it terminates at at a breaking tag
// . check it out in ~/fff/src/Msg20.cpp
getPostLinkTextVector ( linkNode );
// must not block or error out. sanity check
//if ( ! m_postLinkTextVecValid ) { char *xx=NULL;*xx=0; }
// breathe
QUICKPOLL( m_niceness );
// set from the hashes of the tag id pairs
//st->m_v3.setTagPairHashes ( xml , m_niceness );
// get it
getTagPairHashVector();
// must not block or error out. sanity check
if ( ! m_tagPairHashVecValid ) { char *xx=NULL;*xx=0; }
// breathe
QUICKPOLL( m_niceness );
// this vector is set from the hashes of the path components
// with punctuation stripped out
//v4.set ( xml, NULL , linker, -1 ,buf4,size);
// . the 4th vector is provided, this will point to m_topIps[] buffer
// . this is temporarily disabled
// . this is the top 2 bytes of the ips of each inlink
// . we were looking this info up in linkdb
// . so if two good inlinkers had their inlinks from the same ip
// neighborhoods, then one would have its voting power "deduped".
// . see the old LinkText.cpp for the logic that read these from linkdb
//v5.set2 ( (char *)incomingIps , numIncomingIps );
// reference the vectors in our reply
reply-> ptr_vector1 = m_pageSampleVec;//(char *)&st->m_v1;
reply->size_vector1 = m_pageSampleVecSize;//st->m_v1.getSize();
reply-> ptr_vector2 = m_postVec;//(char *)&st->m_v2;
reply->size_vector2 = m_postVecSize;//st->m_v2.getSize();
reply-> ptr_vector3 = m_tagPairHashVec; // (char *)&st->m_v3;
reply->size_vector3 = m_tagPairHashVecSize;//st->m_v3.getSize();
// crap, we gotta bubble sort these i think
// but only tag pair hash vec
bool flag = true;
uint32_t *d = (uint32_t *)m_tagPairHashVec;
// exclude the terminating 0 int32_t
int32_t nd = (m_tagPairHashVecSize / 4) - 1;
while ( flag ) {
// breathe
QUICKPOLL ( m_niceness );
flag = false;
for ( int32_t i = 1 ; i < nd ; i++ ) {
if ( d[i-1] <= d[i] ) continue;
uint32_t tmp = d[i-1];
d[i-1] = d[i];
d[i] = tmp;
flag = true;
}
}
// just always do it
//if ( ! req->m_getInlinkNeighborhoods ) return true;
// convert "linkNode" into a string ptr into the document
char *node = xml->getNodePtr(linkNode)->m_node;
// . find the word index, "n" for this node
// . this is INEFFICIENT!!
char **wp = ww->getWords();
int32_t nw = ww->getNumWords();
int32_t n;
for ( n = 0; n < nw && wp[n] < node ; n++ )
QUICKPOLL(m_niceness);
// sanity check
//if ( n >= nw ) { char *xx=NULL; *xx=0; }
if ( n >= nw ) {
log("links: crazy! could not get word before linknode");
g_errno = EBADENGINEER;
return NULL;
}
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
// get the ptrs to the sections, 1-1 with words
//Section **sp = NULL;
//if ( ss ) sp = ss->m_sectionPtrs;
// . even tags in the article section have positive scores
// . the scores array is 1-1 with the words in Words, not the nodes
// in Xml. so we had to do that conversion.
//if ( ! sp || !(sp[n]->m_flags & NOINDEXFLAGS) )
// reply->m_outlinkInContent = true;
//
// get the surrounding link text, around "linkNode"
//
// radius of 80 characters around n
char sbuf[1201];
int32_t radius = 80;
char *p = sbuf;
char *pend = sbuf + 600;
// . make a neighborhood in the "words" space [a,b]
// . radius is in characters, so "convert" into words by dividing by 5
int32_t a = n - radius / 5;
int32_t b = n + radius / 5;
if ( a < 0 ) a = 0;
if ( b > nw ) b = nw;
int32_t *pp = pos->m_pos;
int32_t len;
	// if too big shrink the biggest, a or b?
while ( (len=pp[b]-pp[a]) >= 2 * radius + 1 ) {
// decrease the largest, a or b
if ( a<n && (pp[n]-pp[a])>(pp[b]-pp[n])) a++;
else if ( b>n ) b--;
}
// only store it if we can
if ( p + len + 1 < pend ) {
// store it
// FILTER the html entities!!
int32_t len2 = pos->filter(p,pend,ww,a,b,NULL);//ss);
// ensure NULL terminated
p[len2] = '\0';
// store in reply. it will be serialized when sent.
// thanks to isj for finding this bug fix.
m_surroundingTextBuf.safeMemcpy ( p , len2 + 1 );
reply->ptr_surroundingText =m_surroundingTextBuf.getBufStart();
reply->size_surroundingText=m_surroundingTextBuf.getLength();
}
// breathe
QUICKPOLL ( m_niceness );
// get title? its slow because it sets the sections class
if ( m_req->m_titleMaxLen > 0 && ! reply->ptr_tbuf &&
// don't get it anymore if getting link info because it
// is slow...
getThatTitle ) {
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (Msg20Reply *)ti;
char *tit = ti->getTitle();
int32_t titLen = ti->getTitleLen();
reply-> ptr_tbuf = tit;
reply->size_tbuf = titLen + 1; // include \0
if ( ! tit || titLen <= 0 ) {
reply->ptr_tbuf = NULL;
reply->size_tbuf = 0;
}
}
int64_t took = gettimeofdayInMilliseconds() - start;
if ( took > 100 )
log("build: took %"INT64" ms to get link text for "
"%s from linker %s",
took,
m_req->ptr_linkee,
m_firstUrl.m_url );
m_replyValid = true;
return reply;
}
//static void gotMsg5ListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
// XmlDoc *THIS = (XmlDoc *)state;
// THIS->m_masterLoop ( THIS->m_masterState );
//}
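// . get the primary image url from the parsed diffbot json reply, if any
// . scans the json items for the first "url" field under an "images" array,
//   or a "link" field under a "media" array in the case of product replies
// . returns a ptr to m_imageUrl2, which is NULL if no such field was found
// . propagates -1 (blocked) or NULL (error) from getParsedJson()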
char **XmlDoc::getDiffbotPrimaryImageUrl ( ) {
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (char **)jp;
JsonItem *ji = jp->getFirstItem();
// assume none
m_imageUrl2 = NULL;
m_imageUrl2Valid = true;
//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not number or string
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
//char *topName = NULL;
// what name level are we?
// int32_t numNames = 1;
// JsonItem *pi = ji->m_parent;
// for ( ; pi ; pi = pi->m_parent ) {
// // empty name?
// if ( ! pi->m_name ) continue;
// if ( ! pi->m_name[0] ) continue;
// topName = pi->m_name;
// numNames++;
// }
char *name0 = ji->m_name;
char *name1 = NULL;
char *name2 = NULL;
		if ( ji->m_parent )
			name1 = ji->m_parent->m_name;
		if ( ji->m_parent && ji->m_parent->m_parent )
			name2 = ji->m_parent->m_parent->m_name;
		// skip items that have no name at all
		if ( ! name0 ) continue;
		// stop at first image for "images":[{ indicator
		if ( strcmp(name0,"url") == 0 &&
name1 &&
strcmp(name1,"images") == 0 )
break;
// for products
if ( strcmp(name0,"link") == 0 &&
name1 &&
strcmp(name1,"media") == 0 )
break;
}
if ( ! ji )
return &m_imageUrl2;
int32_t vlen;
char *val = ji->getValueAsString( &vlen );
// ok, we got it, just copy that
m_imageUrlBuf2.safeMemcpy ( val , vlen );
m_imageUrlBuf2.nullTerm();
m_imageUrl2 = m_imageUrlBuf2.getBufStart();
return &m_imageUrl2;
}
// get the image url SPECIFIED by the page, so there is no guesswork here
// unlike with the Images.cpp class
char **XmlDoc::getImageUrl() {
// return if valid
if ( m_imageUrlValid ) return &m_imageUrl;
// get first url
Url *f = getFirstUrl();
if ( ! f || f == (Url *)-1 ) return (char **)f;
// assume none
m_imageUrl = NULL;
m_imageUrlValid = true;
// we use getDiffbotPrimaryImageUrl() above for doing thumbs
if ( m_isDiffbotJSONObject || m_contentType == CT_JSON )
return &m_imageUrl;
// all done if not youtube or meta cafe
char *host = f->getHost();
char found = 0;
if ( ! strncmp ( host , "www.youtube.com/" , 16 ) ) found = 1;
if ( ! strncmp ( host , "youtube.com/" , 12 ) ) found = 1;
if ( ! strncmp ( host , "www.metacafe.com/" , 17 ) ) found = 2;
if ( ! strncmp ( host , "metacafe.com/" , 13 ) ) found = 2;
if ( ! found ) return &m_imageUrl;
// char ptr
char *u = f->getUrl();
// make it
if ( found == 1 ) {
char *s = strstr(u,"v=");
// if url does not contain a "v=" then forget it
if ( ! s ) return &m_imageUrl;
// point to the id
s += 2;
//m_imageUrl = m_imageUrlBuf;
//char *p = m_imageUrlBuf;
m_imageUrlBuf.safeStrcpy("http://img.youtube.com/vi/");
// do not break
//char *pend = m_imageUrlBuf + 80;
// copy the id/number
//for ( ; is_digit(*s) && p < pend ; ) *p++ = *s++;
for ( ; is_digit(*s) ; s++ )
m_imageUrlBuf.pushChar(*s);
// wrap it up
m_imageUrlBuf.safeStrcpy ( "/2.jpg" );
// size includes \0;
//m_imageUrlSize = p - m_imageUrl ;
// sanity check
//if ( m_imageUrlSize > 100 ) { char *xx=NULL;*xx=0; }
m_imageUrl = m_imageUrlBuf.getBufStart();
return &m_imageUrl;
}
// must be meta cafe now
// http://www.metacafe.com/watch/559561/surfer_girls_vol_2/
// http://s2.mcstatic.com/thumb/559561.jpg
// scan url path for first digit
for ( char *t = f->getPath() ; *t ; t++ ) {
		// skip to the first digit in the path
		if ( ! is_digit ( *t ) ) continue;
		// grab that number
		int32_t id = atol ( t );
		// skip if not good
		if ( id <= 0 ) continue;
// make the url
//m_imageUrl = m_imageUrlBuf;
//char *p = m_imageUrlBuf;
//gbmemcpy ( p , "http://s2.mcstatic.com/thumb/" , 29 );
//p += 29;
//p += sprintf ( p , "%"INT32"" , id );
//gbmemcpy ( p , ".jpg\0" , 5 );
//p += 5;
m_imageUrlBuf.safePrintf("http://s2.mcstatic."
"com/thumb/%"INT32".jpg", id);
m_imageUrl = m_imageUrlBuf.getBufStart();
// size includes \0;
//m_imageUrlSize = p - m_imageUrl ;
// sanity check
//if ( m_imageUrlSize > 100 ) { char *xx=NULL;*xx=0; }
break;
}
return &m_imageUrl;
}
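// . set m_matchOffsets from the xml, the words and the query matches
// . propagates -1 (blocked) or NULL (error) from those dependencies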
MatchOffsets *XmlDoc::getMatchOffsets () {
// return it if it is set
if ( m_matchOffsetsValid ) return &m_matchOffsets;
// need a buncha crap
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (MatchOffsets *)ww;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (MatchOffsets *)xml;
Matches *mm = getMatches();
if ( ! mm || mm == (Matches *)-1 ) return (MatchOffsets *)mm;
m_matchOffsets.set ( xml , ww , mm , true ); // getMatches=true
m_matchOffsetsValid = true;
return &m_matchOffsets;
}
Query *XmlDoc::getQuery() {
if ( m_queryValid ) return &m_query;
// bail if no query
if ( ! m_req || ! m_req->ptr_qbuf ) {
m_queryValid = true;
return &m_query;
}
// return NULL with g_errno set on error
if ( ! m_query.set2( m_req->ptr_qbuf ,
m_req->m_langId ,
true ) ) return NULL;
m_queryValid = true;
return &m_query;
}
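// . set m_matches, the occurrences of the query terms in this document
// . if the Msg20Request supplied no query we return an empty Matches class
// . needs the words, bits, sections, positions, title, phrases and query,
//   so this can block (-1) until all of those are computed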
Matches *XmlDoc::getMatches () {
// return it if it is set
if ( m_matchesValid ) return &m_matches;
// if no query, matches are empty
if ( ! m_req->ptr_qbuf ) {
m_matchesValid = true;
return &m_matches;
}
// cache it for one hour
//XmlDoc *od = getOldXmlDoc ( 3600 );
//if ( ! od || od == (XmlDoc *)-1 ) return (Matches *)od;
//if ( od->isEmpty() ) od = NULL;
// need a buncha crap
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Matches *)ww;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Matches *)xml;
Bits *bits = getBitsForSummary();
if ( ! bits || bits == (Bits *)-1 ) return (Matches *)bits;
Sections *ss = getSections();
if ( ! ss || ss == (void *)-1) return (Matches *)ss;
Pos *pos = getPos();
if ( ! pos || pos == (Pos *)-1 ) return (Matches *)pos;
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (Matches *)ti;
//Synonyms *syn = getSynonyms();
//if ( ! syn || syn == (void *)-1 ) return (Matches *)syn;
Phrases *phrases = getPhrases();
if ( ! phrases || phrases == (void *)-1 ) return (Matches *)phrases;
Query *q = getQuery();
if ( ! q ) return (Matches *)q;
// set it up
m_matches.setQuery ( q );
// returns false and sets g_errno on error
if ( ! m_matches.set ( this ,
ww ,
//syn ,
phrases ,
ss ,
bits ,
pos ,
xml ,
ti ,
m_niceness ) )
return NULL;
// we got it
m_matchesValid = true;
return &m_matches;
}
// sender wants meta description, custom tags, etc.
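// . "displayMetas" is a whitespace-separated list of meta tag names, each
//   optionally followed by ":<maxLen>" to cap that tag's content length
//   (e.g. something like "description:256 keywords")
// . the contents are stored back to back in m_dbuf, each one \0-terminated,
//   and *dsize is set to the total number of bytes used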
char *XmlDoc::getDescriptionBuf ( char *displayMetas , int32_t *dsize ) {
// return the buffer if we got it
if ( m_dbufValid ) { *dsize = m_dbufSize; return m_dbuf; }
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
// now get the content of the requested display meta tags
//char dbuf [ 1024*64 ];
char *dbufEnd = m_dbuf + 1024;//1024*64;
char *dptr = m_dbuf;
char *pp = displayMetas;
char *ppend = pp + gbstrlen(displayMetas);
// loop over the list of requested meta tag names
while ( pp < ppend && dptr < dbufEnd ) {
// skip initial spaces. meta tag names are ascii always i guess
while ( *pp && is_wspace_a(*pp) ) pp++;
// that's the start of the meta tag name
char *s = pp;
// . find end of that meta tag name
// . can end in :<integer> which specifies max len
while ( *pp && ! is_wspace_a(*pp) && *pp != ':' ) pp++;
// assume no max length to the content of this meta tag
int32_t maxLen = 0x7fffffff;
// save current char
char c = *pp;
// . NULL terminate the name
// . before, overflowed the request buffer and caused core!
// . seems like it is already NULL terminated
if ( *pp ) *pp = '\0';
// always advance regardless though
pp++;
// if ':' was specified, get the max length
if ( c == ':' ) {
if ( is_digit(*pp) ) maxLen = atoi ( pp );
// skip over the digits
while ( *pp && ! is_wspace_a (*pp) ) pp++;
}
// don't exceed our total buffer size (save room for \0 at end)
int32_t avail = dbufEnd - dptr - 1;
if ( maxLen > avail ) maxLen = avail;
// store the content at "dptr" (do not exceed "maxLen" bytes)
int32_t wlen = xml->getMetaContent ( dptr , // write buf
maxLen , // buf length
s , // name value
gbstrlen(s) , // name len
"name" , // http-equiv/name
false );// convert &#'s?
dptr[wlen] = '\0';
// test it out
if ( ! verifyUtf8 ( dptr ) ) {
log("xmldoc: invalid utf8 content for meta tag %s.",s);
continue;
}
// advance and NULL terminate
dptr += wlen;
*dptr++ = '\0';
// bitch if we truncated
if ( dptr >= dbufEnd )
log("query: More than %"INT32" bytes of meta tag "
"content "
"was encountered. Truncating.",
(int32_t)(dbufEnd-m_dbuf));
}
// what is the size of the content of displayed meta tags?
m_dbufSize = dptr - m_dbuf;
m_dbufValid = true;
*dsize = m_dbufSize;
return m_dbuf;
}
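// . concatenate the text of the first few <h1> sections (up to four) into
//   m_htb, each copy \0-terminated
// . m_htb is left empty if the document has no <h1> section with text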
SafeBuf *XmlDoc::getHeaderTagBuf() {
if ( m_htbValid ) return &m_htb;
Sections *ss = getSections();
if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;
int32_t count = 0;
// scan sections
Section *si = ss->m_rootSection;
moreloop:
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
if ( si->m_tagId != TAG_H1 ) continue;
		// if it contains no text, this will be -1
// so give up on it
if ( si->m_firstWordPos < 0 ) continue;
if ( si->m_lastWordPos < 0 ) continue;
// ok, it works, get it
break;
}
// if no h1 tag then make buf empty
if ( ! si ) {
m_htb.nullTerm();
m_htbValid = true;
return &m_htb;
}
// otherwise, set it
char *a = m_words.m_words[si->m_firstWordPos];
char *b = m_words.m_words[si->m_lastWordPos] ;
b += m_words.m_wordLens[si->m_lastWordPos];
// copy it
m_htb.safeMemcpy ( a , b - a );
m_htb.pushChar('\0');
si = si->m_next;
// add more?
if ( count++ < 3 ) goto moreloop;
m_htbValid = true;
return &m_htb;
}
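// . set m_title using Title::setTitle()
// . the max title length comes from the collection rec (or the Msg20
//   request if present) and is capped at 256 chars for speed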
Title *XmlDoc::getTitle ( ) {
if ( m_titleValid ) return &m_title;
// need a buncha crap
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Title *)xml;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Title *)ww;
Sections *sections = getSections();
if ( ! sections ||sections==(Sections *)-1) return (Title *)sections;
Pos *pos = getPos();
if ( ! pos || pos == (Pos *)-1 ) return (Title *)pos;
Query *q = getQuery();
if ( ! q ) return (Title *)q;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
int32_t titleMaxLen = cr->m_titleMaxLen;
if ( m_req ) titleMaxLen = m_req->m_titleMaxLen;
// limit for speed, some guys have a 100k word title!
if ( titleMaxLen > 256 ) titleMaxLen = 256;
m_titleValid = true;
if ( ! m_title.setTitle ( this ,
xml ,
ww ,
sections ,
pos ,
titleMaxLen ,
0xffff ,
NULL ,
q ,
cr ,
m_niceness ) )
return NULL;
return &m_title;
}
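// . compute the summary for this document relative to the current query
// . xml and json docs currently get an empty summary
// . we may compute more summary lines than we display so the extra lines
//   can be used for the summary dedup vector (see m_summDedupNumLines)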
Summary *XmlDoc::getSummary () {
if ( m_summaryValid ) return &m_summary;
// xml and json docs have empty summaries for now
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Summary *)ct;
if ( *ct == CT_JSON || *ct == CT_XML ) {
m_summaryValid = true;
return &m_summary;
}
// need a buncha crap
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Summary *)ww;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (Summary *)xml;
Bits *bits = getBitsForSummary();
if ( ! bits || bits == (Bits *)-1 ) return (Summary *)bits;
Sections *sections = getSections();
if ( ! sections ||sections==(Sections *)-1) return (Summary *)sections;
Pos *pos = getPos();
if ( ! pos || pos == (Pos *)-1 ) return (Summary *)pos;
char *site = getSite ();
if ( ! site || site == (char *)-1 ) return (Summary *)site;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Summary *)d;
Matches *mm = getMatches();
if ( ! mm || mm == (Matches *)-1 ) return (Summary *)mm;
Title *ti = getTitle();
if ( ! ti || ti == (Title *)-1 ) return (Summary *)ti;
Query *q = getQuery();
if ( ! q ) return (Summary *)q;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . get the highest number of summary lines that we need
// . the summary vector we generate for doing summary-based deduping
// typically has more lines in it than the summary we generate for
// displaying to the user
int32_t numLines = m_req->m_numSummaryLines;
if ( cr->m_percentSimilarSummary > 0 &&
cr->m_percentSimilarSummary < 100 &&
m_req->m_getSummaryVector &&
cr->m_summDedupNumLines > numLines )
// request more lines than we will display
numLines = cr->m_summDedupNumLines;
	// shortcut
Summary *s = &m_summary;
// time cpu set time
int64_t start = gettimeofdayInMilliseconds();
m_cpuSummaryStartTime = start;
// make sure summary does not include title
char *tbuf = ti->m_title;
// this does not include the terminating \0
int32_t tbufLen = ti->m_titleBytes;
// compute the summary
bool status;
status = s->set2( xml ,
ww ,
bits ,
sections ,
pos ,
q ,
(int64_t *)m_req->ptr_termFreqs ,
(float *)m_req->ptr_affWeights ,
false , // doStemming
m_req->m_summaryMaxLen ,
numLines ,
// . displayLines, # lines we are displaying
// . Summary::getDisplayLen() will return the
// length of the summary to display
m_req->m_numSummaryLines ,
m_req->m_summaryMaxNumCharsPerLine,
m_req->m_ratInSummary ,
getFirstUrl() ,
//&reply->m_queryProximityScore ,
mm ,
tbuf ,
tbufLen );
// error, g_errno should be set!
if ( ! status ) return NULL;
m_summaryValid = true;
return &m_summary;
}
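// . return the displayable part of the summary with the query terms
//   wrapped in <b>...</b> tags, stored in m_finalSummaryBuf
// . if highlighting is disabled in the request, or the summary is empty,
//   the summary is copied over unhighlighted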
char *XmlDoc::getHighlightedSummary ( ) {
if ( m_finalSummaryBufValid ) {
//char *fsum = m_finalSummaryBuf.getBufStart();
//if ( ! fsum ) fsum = (char *)0x01;
return m_finalSummaryBuf.getBufStart();
}
Summary *s = getSummary();
if ( ! s || s == (void *)-1 ) return (char *)s;
Query *q = getQuery();
if ( ! q ) return (char *)q;
// get the summary
char *sum = s->getSummary();
//int32_t sumLen = s->getSummaryLen();
int32_t sumLen = s->getSummaryDisplayLen();
//sum[sumLen] = 0;
// assume no highlighting?
if ( ! m_req->m_highlightQueryTerms || sumLen == 0 ) {
m_finalSummaryBuf.safeMemcpy ( sum , sumLen );
m_finalSummaryBuf.nullTerm();
m_finalSummaryBufValid = true;
return m_finalSummaryBuf.getBufStart();
//char *fsum = m_finalSummaryBuf.getBufStart();
//if ( ! fsum ) fsum = (char *)0x01;
//return fsum;
}
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
Highlight hi;
StackBuf(hb);
// highlight the query in it
int32_t hlen = hi.set ( &hb,
sum,
sumLen,
m_langId,
q,
false , // doStemming?
false , //click&scroll?
NULL , // base url
"<b>" , // front tag
"</b>" , // back tag
0,
m_niceness );
// highlight::set() returns 0 on error
if ( hlen < 0 ) {
log("build: highlight class error = %s",mstrerror(g_errno));
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return NULL;
}
// store into our safebuf then
m_finalSummaryBuf.safeMemcpy ( &hb );//tt , hlen + 1 );
m_finalSummaryBufValid = true;
m_finalSummaryBuf.nullTerm();
return m_finalSummaryBuf.getBufStart();
//char *fsum = m_finalSummaryBuf.getBufStart();
//if ( ! fsum ) fsum = (char *)0x01;
//return fsum;
}
//
// GET GIGABIT SAMPLE
//
//
// This will get samples surrounding all the query terms for purposes
// of gigabits generation. We don't just generate gigabits from the
// WHOLE document because it takes much longer?? is that still true?
// We assume that the first call to getTopLines() above set
// matches/numMatches. We use those arrays to
// skip directly to just the query terms in the document and save time.
// We may have to reset the Scores array here if we want to use it ltr.
//
// aka getGigabitSample. get gigabit sample
//
SafeBuf *XmlDoc::getSampleForGigabits ( ) {
if ( m_gsbufValid ) return &m_gsbuf;
// assume empty
//m_gsbuf = NULL;
// basically, exit now if no sample needed
if ( m_req->m_bigSampleMaxLen <= 0 ||
m_req->m_bigSampleRadius <= 0 ) {
m_gsbufValid = true;
return &m_gsbuf;
}
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (SafeBuf *)ct;
// if it is json then only return the json fields that are strings
// and json decode them... separate each field with a \0.
if ( *ct == CT_JSON )
return getSampleForGigabitsJSON();
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
// just send back the whole page, but separate each section
// with \0. make only sentences end with ? ! or ., headers
// not with anything, and no menu items
Sections *sections = getSections();
if ( ! sections ||sections==(Sections *)-1) return (SafeBuf *)sections;
Section *sp = sections->m_rootSection;
SafeBuf reply;
reply.setLabel("gbtrepbuf");
// m_contentLen is invalid, don't use that here use size_utf8Content
if ( ! reply.reserve ( size_utf8Content + 1000 ) ) return NULL;
// scan the sections of the document
for ( ; sp ; sp = sp->m_next ) {
QUICKPOLL(m_niceness);
// do not allow menu crap
if ( sp->m_flags & ( SEC_MENU |
SEC_MENU_SENTENCE |
SEC_MENU_HEADER ) )
continue;
// must be sentence or header
bool ok = false;
if ( sp->m_flags & SEC_SENTENCE ) ok = true;
// headings are ok, just don't use as sentences...
if ( sp->m_flags & SEC_HEADING ) ok = true;
if ( ! ok ) continue;
// store without tags
char *p = ww->m_words[sp->m_a];
// include period after final word in section
int32_t b = sp->m_b - 1;
char *e = ww->m_words[b] + ww->m_wordLens[b];
// if 3+ commas and one comma for every 4 words, forget it,
// it is probably a list! well, process it, but make sure it
// does not end in a period so we do not display it
// as a fast fact, but we use it for gigabits.
bool isList = false;
int32_t commaCount = 0;
int32_t bracketCount = 0;
for ( char *z = p ; z < e ; z++ ) {
if ( *z == ',' ) commaCount++;
// fix ] [AllTheWeb] [Gigablast] [Google] [HotBot]...
if ( *z == '[' ) bracketCount++;
}
int32_t naw = (b - sp->m_a) / 2;
// just skip even for gigabits if too long. most likely
// a spammy list of nouns.
if ( naw >= 130 ) continue;
if ( commaCount >= 3 && commaCount *4 >= naw )
isList = true;
if ( commaCount >= 10 )
isList = true;
if ( bracketCount >= 3 )
isList = true;
// too much uppercase?
bool yelling = false;
int32_t upper = 0;
int32_t lower = 0;
char cs = 0;
for ( char *z = p ; z < e ; z += cs ) {
cs = getUtf8CharSize(z);
if ( ! is_alpha_utf8(z) ) continue;
if ( is_upper_utf8(z) ) upper++;
if ( is_lower_utf8(z) ) lower++;
}
if ( upper > lower ) yelling = true;
// ending ) or ]
if ( e[0] == ')' ) e++;
else if ( e[0] == ']' ) e++;
// incorporate period etc.
if ( e[0] == '.' ) e++;
else if ( e[0] == '!' ) e++;
else if ( e[0] == '?' ) e++;
else if ( e[0] == ';' ) e++;
// must end in a period, or .) or .]
bool endsInPeriod = false;
if ( e-2 >= p &&
( e[-1] =='.' ||
e[-1] =='!' ||
e[-1] =='?' ) )
endsInPeriod = true;
		if ( e-2 >= p &&
		     (e[-1] == ')' ||
		      e[-1] == ']'   ) &&
		     (e[-2] == '.' ||
		      e[-2] == '?' ||
		      e[-2] == '!'   ) )
			endsInPeriod = true;
//int32_t off = reply.length();
// filter out tags and \n's and \r's and store into "reply"
if ( ! reply.safePrintFilterTagsAndLines ( p , e-p ,false ) )
return NULL;
// if a sentence and does not end in period, toss one in
//if ( sp->m_flags & SEC_SENTENCE ) {
// if ( e[-1] !='.' &&
// e[-1] !='!' &&
// e[-1] !='?' &&
// e[-1] !=']' &&
// e[-1] !=')' )
// reply.pushChar('.');
//}
// too huge? if # of ALNUM words > 70 it's too big.
bool isHuge = false;
if ( naw > 70 ) isHuge = true;
// ending in a * indicates a printable sentence for fast facts
if ( (sp->m_flags & SEC_SENTENCE) &&
! isList &&
! isHuge &&
! yelling &&
endsInPeriod )
reply.pushChar('*');
// delineate sentences/headers/sections with | now so
// we can still allow a word to be a gigabit even if it is
// not in a sentence with a query term
//reply.pushChar('\0');
reply.pushChar('|');
char *pc = reply.getBufStart() + reply.length() - 1;
*pc = '\0';
// debug
//char *x = reply.getBufStart() + off;
// turn off fast fact debug for now
//log("fastfact: fastfact: %s",x);
// revert back to |
*pc = '|';
// stop? this fixes the query 'lesbain vedeo porno' on
// my cluster taking 10 seconds to get gigabits for.
// bigsamplemaxlen is 1000 as of 12/4/2013.
if ( reply.length() >= m_req->m_bigSampleMaxLen )
break;
}
// a final \0
reply.pushChar('\0');
// move it over to m_gsbuf now
m_gsbuf.stealBuf ( &reply );
// we are valid
m_gsbufValid = true;
// success
return &m_gsbuf;
// need a buncha crap
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml;
Pos *pos = getPos();
if ( ! pos || pos == (Pos *)-1 ) return (SafeBuf *)pos;
Matches *mm = getMatches();
if ( ! mm || mm == (Matches *)-1 ) return (SafeBuf *)mm;
// convert length to number of words
int32_t bigSampleRadius = m_req->m_bigSampleRadius / 5;
// at least 1
if ( bigSampleRadius <= 0 ) bigSampleRadius = 1;
// alloc for whole document?
int32_t max = xml->getContentLen() ;
// do not exceed
if ( max > m_req->m_bigSampleMaxLen ) max = m_req->m_bigSampleMaxLen;
// make sure we have something in words too. i guess no sample?
if ( max <= 2 ) { m_gsbufValid = true; return &m_gsbuf; }
// a flag so we don't overlap samples...
int32_t lastb = -1;
// . set m_buf to where we write the sample
// . add a byte for the terminating \0
int32_t gsbufAllocSize = max + 1;
// temp hack
//m_gsbuf = (char *)mmalloc(m_gsbufAllocSize,"gsbuf");
if ( ! m_gsbuf.reserve ( gsbufAllocSize, "gsbuf" ) ) return NULL;
// g_errno should be set...
//if ( ! m_gsbuf ) return NULL;
//m_freeBuf = true;
// set our pointer
char *pstart = m_gsbuf.getBufStart();
char *p = pstart;
char *pend = pstart + max;
int32_t nw = ww->m_numWords;
// skip to first query term
for ( int32_t i = 0 ; i < mm->m_numMatches ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get the match
Match *m = &mm->m_matches[i];
// break out if match is not from the document's Words class
if ( m->m_words != ww ) break;
// the word #
int32_t n = m->m_wordNum;
// got a match, add this samplet, [a,b]
int32_t a = n - bigSampleRadius;
int32_t b = n + bigSampleRadius;
if ( a < 0 ) a = 0;
if ( b > nw ) b = nw;
if ( a < lastb ) a = lastb;
// ensure the samples are separated by \0
else if ( p > pstart && p + 2 < pend ) {
*p++ = '\0';
}
Pos *pos = m->m_pos;
int32_t *pp = pos->m_pos;
int32_t len = pp[b+1] - pp[a];
// if match would send us over, we are done
if ( p + len >= pend ) break;
len = pos->filter(p,pend,m->m_words,a,b,m->m_sections);
// for debug (mdw)
//log("query: gigabitsample#%"INT32"=%s",i,p);
p += len;
// we are the new lastb
lastb = b;
}
// always null terminate
*p++ = '\0';
// . set sample size
// . this includes terminating 0\'s in this case
//int32_t gsbufSize = p - m_gsbuf;
m_gsbuf.setLength( p - m_gsbuf.getBufStart() );
// we are valid
m_gsbufValid = true;
// for debug (mdw)
//log("query: finalgigabitsample=%s",m_gsbuf);
// success
return &m_gsbuf;
}
// if it is json then only return the json fields that are strings
// and json decode them... separate each field with a \0.
SafeBuf *XmlDoc::getSampleForGigabitsJSON ( ) {
SafeBuf tmp;
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (SafeBuf *)jp;
JsonItem *ji = jp->getFirstItem();
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not string
if ( ji->m_type != JT_STRING )
continue;
// store field value
char *val = ji->getValue();
int valLen = ji->getValueLen();
// if it contains html then skip it as a gigabit candidate.
// otherwise our fast facts end up including html tags in them
// in computeFastFacts() in Msg40.cpp
int i;
for ( i = 0 ; i < valLen ; i++ )
if ( val[i] == '<' ) break;
if ( i < valLen ) continue;
if ( ! tmp.pushChar('\n') )
return NULL;
// if ( ! tmp.safePrintf("<p>"))
// return NULL;
// decode the json
//SafeBuf xx;
if ( ! tmp.safeDecodeJSONToUtf8(val,valLen,m_niceness))
return NULL;
// escape out the html
// if ( ! tmp.htmlEncode ( xx.getBufStart() ))
// return NULL;
// two new lines
if ( ! tmp.safePrintf("<hr>"))
return NULL;
if ( ! tmp.pushChar('\n') )
return NULL;
if ( ! tmp.pushChar('\n') )
return NULL;
if ( ! tmp.pushChar('\n') )
return NULL;
}
if ( ! tmp.nullTerm() )
return NULL;
Xml xml;
if ( ! xml.set ( tmp.getBufStart() ,
tmp.length() ,
false , // ownData?
0 , // allocSize
false , // pure xml?
m_version ,
false , // setParentsArg?
m_niceness ,
CT_HTML ) ) // *ct ) )
return NULL;
Words ww;
if ( ! ww.set ( &xml , true , m_niceness ) ) return NULL;
Bits bb;
if ( ! bb.set ( &ww ,0 ,m_niceness ) ) return NULL;
Phrases pp;
if ( ! pp.set ( &ww , &bb , true,false,0,m_niceness) ) return NULL;
// this uses the sectionsReply to see which sections are
// "text", etc. rather than compute it expensively
Sections sec;
if ( !sec.set ( &ww ,
&pp ,
&bb ,
getFirstUrl() ,
0,//*d ,
0,//*sh64 , // 64 bits
"",//cr->m_coll ,
m_niceness ,
NULL,//m_masterState , // state
NULL,//m_masterLoop , // callback
CT_JSON, // *ct ,
NULL,//&m_dates ,
NULL , // sd // sections data
true , // sections data valid?
NULL , // sv // for m_nsvt
NULL , // buf
0 )) { // bufSize
return NULL;
}
// now add each sentence section into the buffer
// scan the sentences if we got those
char **wptrs = ww.getWords();
int32_t *wlens = ww.getWordLens();
Section *ss = sec.m_firstSent;
for ( ; ss ; ss = ss->m_nextSent ) {
// breathe
QUICKPOLL(m_niceness);
// count of the alnum words in sentence
int32_t count = ss->m_alnumPosB - ss->m_alnumPosA;
// start with one word!
count--;
// how can it be less than one alnum word
if ( count < 0 ) continue;
// store it
char *wp1 = wptrs[ss->m_senta];
char *wp2 = wptrs[ss->m_sentb-1] + wlens[ss->m_sentb-1];
bool gotTerm = (wp2[0]=='.' || wp2[0]=='?' || wp2[0]=='!' ) ;
//if ( ! gotTerm ) continue;
if ( ! m_gsbuf.safeMemcpy ( wp1 , wp2 - wp1 ) )
return NULL;
// puncty?
if ( gotTerm && ! m_gsbuf.pushChar(wp2[0]))
return NULL;
// to indicate end of header or sentence, in order to
// qualify as a fast fact, we must add a '*'. see
// PageResults.cpp, search for ''*''
if ( gotTerm && ! m_gsbuf.pushChar('*') )
return NULL;
if ( ! m_gsbuf.pushChar('\0') )
return NULL;
}
m_gsbufValid = true;
return &m_gsbuf;
}
// . good sites sometimes have hacked pages
// . try to identify those
char *XmlDoc::getIsCompromised ( ) {
if ( m_isCompromisedValid ) return &m_isCompromised;
Xml *xml = getXml();
if ( ! xml || xml == (void *)-1 ) return (char *)xml;
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes();
// assume compromised
m_isCompromised = true;
m_isCompromisedValid = true;
	// scan for <font> tags with a suspicious style attribute
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// continue if not a font tag
		if ( nodes[i].m_nodeId != TAG_FONT ) continue;
		// get its style attribute, if any
		int32_t stlen;
		char *style = nodes[i].getFieldValue ( "style" , &stlen );
// skip if none
if ( ! style || stlen <= 6 ) continue;
// NULL term
char c = style[stlen];
style[stlen] = '\0';
char *hc = strstr(style,"height");
char *wc = strstr(style,"width");
		// skip if neither
		if ( ! hc && ! wc ) continue;
		// advance past the attribute names, guarding against NULL
		if ( hc ) hc += 6;
		if ( wc ) wc += 5;
		while ( hc && is_wspace_a(*hc) ) hc++;
		while ( wc && is_wspace_a(*wc) ) wc++;
		if ( hc && *hc == ':' ) hc++;
		if ( wc && *wc == ':' ) wc++;
		while ( hc && is_wspace_a(*hc) ) hc++;
		while ( wc && is_wspace_a(*wc) ) wc++;
		style[stlen] = c;
		// a zero height or width is a signal of invisible text and of
		// our syzygy compromised-site to compromised-site spammer
		if ( hc && *hc == '0' ) return &m_isCompromised;
		if ( wc && *wc == '0' ) return &m_isCompromised;
}
m_isCompromised = false;
return &m_isCompromised;
}
// <meta name=robots value=noarchive>
// <meta name=gigabot value=noarchive>
char *XmlDoc::getIsNoArchive ( ) {
if ( m_isNoArchiveValid ) return &m_isNoArchive;
Xml *xml = getXml();
if ( ! xml || xml == (void *)-1 ) return (char *)xml;
m_isNoArchive = false;
m_isNoArchiveValid = true;
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes();
// find the meta tags
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// continue if not a meta tag
if ( nodes[i].m_nodeId != TAG_META ) continue;
// get robots attribute
int32_t alen; char *att;
// <meta name=robots value=noarchive>
att = nodes[i].getFieldValue ( "name" , &alen );
// need a name!
if ( ! att ) continue;
// get end
char *end = att + alen;
// skip leading spaces
while ( att < end && *att && is_wspace_a(*att) ) att++;
// must be robots or gigabot. skip if not
if ( strncasecmp(att,"robots" ,6) &&
strncasecmp(att,"gigabot",7) ) continue;
// get the content vaue
att = nodes[i].getFieldValue("content",&alen);
// skip if none
if ( ! att ) continue;
// get end
end = att + alen;
// skip leading spaces
while ( att < end && *att && is_wspace_a(*att) ) att++;
		// is it noarchive? skip if no such match
if ( strncasecmp(att,"noarchive",9) ) continue;
// ok, we got it
m_isNoArchive = true;
break;
}
// return what we got
return &m_isNoArchive;
}
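// . scan the document's <script> tags for ad network client ids (google
//   adsense, yahoo publisher, doubleclick) and hash each one into m_adIds
//   as a "gbad:<provider>-<clientid>" term
// . Msg25/LinkInfo can use these ids to dedup voters running the same ads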
// this vector's components are 64-bit, not the usual 32-bit
int64_t **XmlDoc::getAdVector ( ) {
if ( m_adVectorValid ) return &ptr_adVector;
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (int64_t **)xml;
setStatus ( "parsing out ad ids");
// assume valid
m_adVectorValid = true;
int32_t na = 0;
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes();
// find the meta tags
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// continue if not a script tag
if ( nodes[i].m_nodeId != TAG_SCRIPT ) continue; // 83
// must be a front tag, not a back tag
if ( xml->isBackTag ( i ) ) continue;
// find the back tag for it
		int32_t j;
		for ( j = i ; j < n ; j++ ) {
			// must be another script tag
			if ( nodes[j].m_nodeId != TAG_SCRIPT ) continue;
			// must be a back tag this time
			if ( ! xml->isBackTag ( j ) ) continue;
			// ok, we got it
			break;
		}
// if no back tag, give up
if ( j == n ) break;
// buf/len defines the script area
char *buf = xml->getNode(i);
int32_t len = xml->getNode(j) - buf;
// skip this script tag for next loop
i = j;
bool found = false;
// start off looking for google
char *needles[3] =
{ "google_ad_client" ,
"ctxt_ad_partner",
"http://ad" };
char *providers[3] =
{ "google" ,
"yahoo",
"doubleclick" };
for ( int32_t k = 0 ; k < 3 ; k++ ) {
// try to match this needle
char *match = needles[k];
// try to get a match
char *p = strnstr ( buf, match , len );
// go again
if ( ! p ) continue;
// do not exceed the script area
char *pend = buf + len;
// it is in quotes
// pub-uint64_t for google ad, uint32_t for yahoo
// check for double or single quote
while (k<2 && p<pend && *p != '"' && *p != '\'') p++;
// it must have them!... i guess
if ( p >= pend ) continue;
// point to after the quote
char *pbegin = ++p;
// find the ending quote
while (k<2 && p<pend && *p != '"' && *p != '\'') p++;
// if none, bail
if ( p >= pend ) continue;
// get length of the ad client id between the quotes
int32_t adClientLen = p - pbegin;
if ( k == 2 ) {
p = strnstr(p,".doubleclick.net/",pend-p);
if ( ! p ) continue;
p += 17;
// look for doubleclick ads
// user name is the second element of the path
while(p < pend && *p != '/') p++;
pbegin = ++p;
while(p < pend && *p != '/') p++;
if(p >= pend) continue;
adClientLen = p - pbegin;
found = true;
}
char *f = pbegin;
char *fend = pbegin + adClientLen;
for ( ; f < fend ; f++ ) {
if ( is_alnum_a ( *f ) ) continue;
if ( *f == '-' || *f == '_' || *f == '.' )
continue;
break;
}
if ( f < fend ) continue;
if ( adClientLen >= 400 ) continue;
if ( adClientLen < 4 ) continue;
// null term temp
char c = *fend;
*fend = '\0';
// hash it
char buf[512];
sprintf(buf,"gbad:%s-%s",providers[k],pbegin);
// put it back
*fend = c;
// . make the query term id
// . first hash the field
uint64_t h = hash64 ( "gbad" , 4 );
// then add in the other junk
h = hash64 ( buf , gbstrlen(buf) , h );
// . now we will index that as-is
// . and Msg25/LinkInfo can use to dedup voters!
			m_adIds[na++] = h;
			// stop if too many. save room for NULL termination.
			if ( na + 1 >= XD_MAX_AD_IDS ) break;
		}
		// also stop scanning script tags once the ad id buffer is full
		if ( na + 1 >= XD_MAX_AD_IDS ) break;
		//look for another if not found or not ok.
	}
// null term it like a good vector! no, those are 32-bit components,
// we are a 64-bit component vector
//m_adIds[na++] = 0;
// point to where we should put them
ptr_adVector = m_adIds;
// store this i guess
size_adVector = na * 8;
// *lastNode = nn;
return &ptr_adVector;
}
char *XmlDoc::getIsLinkSpam ( ) {
if ( m_isLinkSpamValid ) return &m_isLinkSpam2;
setStatus ( "checking if linkspam" );
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
Links *links = getLinks();
if ( ! links || links == (Links *)-1 ) return (char *)links;
int32_t *ip = getIp();
if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip;
int32_t **pici = getIndCatIds();
if ( ! pici || pici == (void *)-1 ) return (char *)pici;
//LinkInfo *info1 = getLinkInfo1();
//if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// reset note
m_note = NULL;
// . if a doc is "link spam" then it cannot vote, or its
// voting power is reduced
// . look for indications that the link is from a guestbook
// . doc length over 100,000 bytes consider it link spam
m_isLinkSpamValid = true;
m_isLinkSpam = ::isLinkSpam ( getFirstUrl(), // linker
*ip ,
ptr_indCatIds ,
size_indCatIds / 4 ,
*sni ,
xml,
links,
150000,//MAXDOCLEN,//maxDocLen ,
&m_note ,
NULL , // &linkee , // url ,
-1 , // linkNode ,
cr->m_coll ,
m_niceness );
// set shadow
m_isLinkSpam2 = (bool)m_isLinkSpam;
return &m_isLinkSpam2;
}
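// zlib allocation callbacks. we route zlib's internal allocations through
// our own memory layer (g_mem/mmalloc) so they are tracked like any other
// gigablast allocation.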
void *zliballoc ( void *opaque , unsigned int items , unsigned int size ) {
//log("db: got zlib alloc");
return (void *)mmalloc ( items * size , "zlib" );
}
void zlibfree ( void *opaque , void *address ) {
//log("db: got zlib free");
// -1 will tell Mem.cpp to look it up in the table
mfree ( address , -1 , "zlib" );
}
void *malloc_replace (void *pf , unsigned int nitems , unsigned int size ) {
return g_mem.gbmalloc(size*nitems,"malloc_replace");
}
void free_replace ( void *pf , void *s ) {
// -1 means we don't know the size
g_mem.gbfree(s,-1,"free_replace");
}
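// . inflate "source" into "dest". the 47 window bits passed to
//   inflateInit2() below enable automatic gzip/zlib header detection.
// . on input *destLen is the size of "dest"; on success it is set to the
//   number of uncompressed bytes written
// . returns a zlib error code, Z_OK on success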
int gbuncompress ( unsigned char *dest ,
uint32_t *destLen ,
unsigned char *source ,
uint32_t sourceLen ) {
z_stream stream;
int err;
stream.next_in = (Bytef*)source;
stream.avail_in = (uInt)sourceLen;
// Check for source > 64K on 16-bit machine:
if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR;
stream.next_out = dest;
stream.avail_out = (uInt)*destLen;
if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
//stream.zalloc = (alloc_func)0;
//stream.zfree = (free_func)0;
stream.zalloc = malloc_replace;//zliballoc;
stream.zfree = free_replace;//zlibfree;
// this calls memcpy so make sure Profiler.cpp doesn't crash
// since when it calls backtrace() that calls memcpy() too
// and it's not async safe
g_inMemcpy = 2;
//we can be gzip or deflate
err = inflateInit2(&stream, 47);
g_inMemcpy = 0;
if (err != Z_OK) return err;
err = inflate(&stream, Z_FINISH);
if (err != Z_STREAM_END) {
inflateEnd(&stream);
if (err == Z_NEED_DICT ||
(err == Z_BUF_ERROR && stream.avail_in == 0))
return Z_DATA_ERROR;
return err;
}
*destLen = stream.total_out;
err = inflateEnd(&stream);
return err;
}
void deflateQuickPoll ( ) {
QUICKPOLL(1);
}
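// . deflate "source" into "dest"
// . if "encoding" is ET_DEFLATE we use deflateInit() (zlib format),
//   otherwise deflateInit2() with windowBits=31, which emits gzip framing
// . on input *destLen is the size of "dest"; on success it is set to the
//   compressed length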
int gbcompress ( unsigned char *dest ,
uint32_t *destLen ,
unsigned char *source ,
uint32_t sourceLen ,
int32_t encoding ) {
int level = Z_DEFAULT_COMPRESSION;
z_stream stream;
int err;
int method = Z_DEFLATED;
//lots of mem, faster, more compressed, see zlib.h
int windowBits = 31;
int memLevel = 8;
int strategy = Z_DEFAULT_STRATEGY;
stream.next_in = (Bytef*)source;
stream.avail_in = (uInt)sourceLen;
#ifdef MAXSEG_64K
// Check for source > 64K on 16-bit machine:
if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR;
#endif
stream.next_out = dest;
stream.avail_out = (uInt)*destLen;
if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
//stream.zalloc = (alloc_func)0;
//stream.zfree = (free_func)0;
stream.zalloc = malloc_replace;//zliballoc;
stream.zfree = free_replace;//zlibfree;
stream.opaque = (voidpf)0;
//we can be gzip or deflate
if(encoding == ET_DEFLATE) err = deflateInit (&stream, level);
else err = deflateInit2(&stream, level,
method, windowBits,
memLevel, strategy);
if (err != Z_OK) {
// zlib's incompatible version error?
if ( err == -6 ) {
log("zlib: zlib did you forget to add #pragma pack(4) to "
"zlib.h when compiling libz.a so it aligns on 4-byte "
"boundaries because we have that pragma in "
"gb-include.h so its used when including zlib.h");
}
return err;
}
// cygwin uses the system libz.a which is not hacked for our quickpoll
#ifndef CYGWIN
	// tell deflate() to call quickpoll
// MDW: 11/14/2014 don't do this for the 64bit zlib for now just to
// save some time. do it later when it proves to be an issue.
//setQuickPoll ( (char *)&g_loop.m_needsToQuickPoll, deflateQuickPoll);
#endif
// this calls memcpy so make sure Profiler.cpp doesn't crash
// since when it calls backtrace() that calls memcpy() too
// and it's not async safe
g_inMemcpy = 3;
err = deflate(&stream, Z_FINISH);
g_inMemcpy = 0;
if (err != Z_STREAM_END) {
deflateEnd(&stream);
return err == Z_OK ? Z_BUF_ERROR : err;
}
*destLen = stream.total_out;
err = deflateEnd(&stream);
return err;
}
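// a minimal round-trip sketch of the two calls above (buffer sizes here
// are hypothetical, not taken from real callers):
//
//   unsigned char raw[1000], packed[2000], restored[1000];
//   uint32_t rawLen = ... ;              // bytes of data placed in raw[]
//   uint32_t packedLen = sizeof(packed);
//   if ( gbcompress ( packed, &packedLen, raw, rawLen, ET_DEFLATE ) == Z_OK ) {
//       uint32_t outLen = sizeof(restored);
//       gbuncompress ( restored, &outLen, packed, packedLen );
//       // outLen should now equal rawLen and restored should match raw
//   }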
//
// NO NO: don't use this until the fixed-size in[64]/out[64] buffers below
// are replaced with SafeBufs
//
int gbcompress7 ( unsigned char *dest ,
uint32_t *destLen ,
unsigned char *source ,
uint32_t sourceLen ,
bool compress ) {
//int32_t id = 1;
// pass the input to the program through this file
// rather than a pipe, since popen() seems broken
char in[64];
if ( compress ) sprintf ( in , "%s/in.7z", g_hostdb.m_dir );
else sprintf ( in , "%s/out.7z", g_hostdb.m_dir );
unlink ( in );
// collect the output from the filter from this file
char out[64];
if ( compress ) sprintf ( out , "%s/out.7z", g_hostdb.m_dir );
else sprintf ( out , "%s/in.7z", g_hostdb.m_dir );
if ( ! compress )
unlink ( out );
// ignore errno from those unlinks
errno = 0;
// open the input file
retry11:
int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry11;
log("build: Could not open file %s for writing: %s.",
in,mstrerror(errno));
return -1;
}
retry12:
// write the content into the input file
int32_t w = write ( fd , source , sourceLen );
// valgrind
if ( w < 0 && errno == EINTR ) goto retry12;
// did we get an error
if ( w != (int32_t)sourceLen ) {
log("build: Error writing to %s: %s.",in,mstrerror(errno));
close(fd);
return -1;
}
// close the file
close ( fd );
	// . build the 7za shell command
	// . when compressing: "7za a out.7z in.7z"
	// . when decompressing: "7za -o<dir> -y e <archive>" (-y answers yes
	//   to all prompts so we overwrite the existing output file)
	// . stdout is redirected to /dev/null
	//char cmd[2048];
	SafeBuf cmd;
if ( compress )
// 7za a out.7z in.7z
cmd.safePrintf( "%s7za a %s %s > /dev/null",
g_hostdb.m_dir , out,in);
else
// -y = yes on all. so we overwrite "in.7z"
cmd.safePrintf( "%s7za -o%s -y e %s > /dev/null",
g_hostdb.m_dir,g_hostdb.m_dir , in);//,in);
// breach sanity check
//if ( gbstrlen(cmd) > 2040 ) { char *xx=NULL;*xx=0; }
	// execute it
int retVal = gbsystem ( cmd.getBufStart() );
if ( retVal == -1 )
log("gb: system(%s) : %s",cmd.getBufStart(),
mstrerror(g_errno));
// all done with input file
// clean up the binary input file from disk
//if ( unlink ( in ) != 0 ) {
// // log error
// log("gbfilter: unlink (%s): %s\n",in,strerror(errno));
// // ignore it, since it was not a processing error per se
// errno = 0;
//}
retry13:
fd = open ( out , O_RDONLY );
if ( fd < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry13;
log("7zip: Could not open file %s for reading: %s.",
out,mstrerror(errno));
return -1;
}
	// to read - leave room for \0
	//int32_t toRead = MAXDOCLEN + 1000;
	int32_t toRead = 150000 + 1000;
	// never read more bytes than the destination buffer can hold
	if ( toRead > (int32_t)*destLen ) toRead = (int32_t)*destLen;
retry14:
// read right from pipe descriptor
int32_t r = read (fd, dest,toRead);
// note errors
if ( r < 0 ) {
// valgrind
if ( errno == EINTR ) goto retry14;
log("7zip: reading output: %s",mstrerror(errno));
// this is often bad fd from an oom error, so ignore it
errno = 0;
r = 0;
}
// clean up shop
close ( fd );
// delete output file
//unlink ( out );
if ( r > (int32_t)*destLen ) { char *xx=NULL;*xx=0; }
// assign
*destLen = r;
// debug for now
char *pre = "";
if ( ! compress ) pre = "un";
log("7zip: %scompressed %"UINT32" to %"UINT32" bytes"
, pre,sourceLen , *destLen );
return Z_OK;
}
int gbuncompress7 ( unsigned char *dest ,
uint32_t *destLen ,
unsigned char *source ,
uint32_t sourceLen ) {
return gbcompress7(dest,destLen,source,sourceLen,false);
}
/*
bool XmlDoc::hashSingleTerm ( int64_t termId , HashInfo *hi ) {
// combine with a non-NULL prefix
if ( hi->m_prefix ) {
int64_t prefixHash = hash64b ( hi->m_prefix );
// sanity test, make sure it is in supported list
if ( getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) {
char *xx=NULL;*xx=0; }
termId = hash64 ( termId , prefixHash );
}
// save it?
if ( m_wts && ! ::storeTerm ( "binary",6,termId,hi,0,0,
MAXDENSITYRANK,
MAXDIVERSITYRANK,
MAXWORDSPAMRANK,
hi->m_hashGroup,
false,&m_wbuf,m_wts,false) )
return false;
	// shortcut
HashTableX *dt = hi->m_tt;
// sanity check
if ( dt->m_ks != sizeof(key_t) ) { char *xx=NULL;*xx=0; }
// make the key like we do in hashWords()
key96_t k;
k.n1 = hi->m_date;
k.n0 = termId;
// get current score for this wordid
int32_t slot = dt->getSlot ( &k );
// does this termid/date already exist?
if ( slot >= 0 ) {
// done
return true;
}
// otherwise, add a new slot
char val = 1;
if ( ! hi->m_tt->addKey ( (char *)k , &val ) )
return false;
// return true on success
return true;
}
*/
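// . record one hashed term in the "wts" debug table so PageParser.cpp can
//   show exactly which terms were indexed, with their prefix, description,
//   ranks, language bits and posdb key
// . the prefix/description/term strings are appended to "wbuf" and the
//   TermDebugInfo keeps offsets into that buffer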
bool storeTerm ( char *s ,
int32_t slen ,
int64_t termId ,
HashInfo *hi ,
int32_t wordNum ,
int32_t wordPos ,
char densityRank,
char diversityRank ,
char wordSpamRank ,
char hashGroup,
//bool isPhrase ,
SafeBuf *wbuf ,
HashTableX *wts ,
char synSrc ,
char langId ,
POSDBKEY key ) {
// store prefix
int32_t poff = wbuf->length();
	// shortcut
char *p = hi->m_prefix;
// add the prefix too!
if ( p && ! wbuf->safeMemcpy(p,gbstrlen(p)+1)) return false;
// none?
if ( ! p ) poff = -1;
// store description
int32_t doff = wbuf->length();
	// shortcut
char *d = hi->m_desc;
// add the desc too!
if ( d && ! wbuf->safeMemcpy(d,gbstrlen(d)+1) ) return false;
// none?
if ( ! d ) doff = -1;
// store term
int32_t toff = wbuf->length();
// add it
if ( ! wbuf->safeMemcpy ( s , slen ) ) return false;
// make this
TermDebugInfo ti;
ti.m_termOff = toff;
ti.m_termLen = slen;
ti.m_descOff = doff;
ti.m_prefixOff = poff;
ti.m_date = hi->m_date;
ti.m_shardByTermId = hi->m_shardByTermId;
ti.m_termId = termId;
//ti.m_weight = 1.0;
//ti.m_spam = -1.0;
ti.m_diversityRank = diversityRank;
ti.m_densityRank = densityRank;
ti.m_wordSpamRank = wordSpamRank;
ti.m_hashGroup = hashGroup;
ti.m_wordNum = wordNum;
ti.m_wordPos = wordPos;
ti.m_langId = langId;
ti.m_key = key;
// was sitehash32
//ti.m_facetVal32 = hi->m_facetVal32;//sentHash32 = hi->m_sentHash32;
// save for printing out an asterisk
ti.m_synSrc = synSrc; // isSynonym = isSynonym;
// get language bit vec
ti.m_langBitVec64 = g_speller.getLangBits64(&termId);
//if ( isPhrase ) ti.m_synSrc = SOURCE_NGRAM;
/*
// the weight vec for the words and phrases
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) ti.m_rv[j] = 1.0;
int32_t *wscores = NULL;
if ( weights && ! isPhrase ) wscores = weights->m_ww;
if ( weights && isPhrase ) wscores = weights->m_pw;
	// shortcut
int32_t i = wordNum;
if ( weights && ! weights->m_rvw ) { char *xx=NULL;*xx=0; }
if ( weights && ! weights->m_rvp ) { char *xx=NULL;*xx=0; }
float *rv = NULL;
if ( weights && ! isPhrase ) rv = &weights->m_rvw[i*MAX_RULES];
if ( weights && isPhrase ) rv = &weights->m_rvp[i*MAX_RULES];
if ( weights ) ti.m_weight = (float)wscores[i] / (float)DW;
if ( weights )
gbmemcpy ( &ti.m_rv, rv , MAX_RULES*sizeof(float));
// no, because if this is zero we force it up to 1!
//if ( weights )
// ti.m_score32 = (int32_t)((float)ti.m_score32 * ti.m_weight);
ti.m_score32 = score;
if ( isSynonym )
ti.m_score32 = score;
*/
// make the key
key96_t k;
k.n1 = 0; // date
k.n0 = termId;
// store it
return wts->addKey ( &k , &ti ) ;
}
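// . hash an entire string as a single term (no word splitting) into the
//   posdb term table hi->m_tt
// . if hi->m_prefix is set, the term id is combined with the prefix hash
//   so the term is scoped to that prefix/field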
bool XmlDoc::hashSingleTerm ( char *s ,
int32_t slen ,
HashInfo *hi ) {
// empty?
if ( slen <= 0 ) return true;
if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; }
//
// POSDB HACK: temporarily turn off posdb until we hit 1B pages!
//
//if ( ! m_storeTermListInfo )
// return true;
// a single blob hash
int64_t termId = hash64 ( s , slen );
// combine with prefix
int64_t final = termId;
// combine with a non-NULL prefix
int64_t prefixHash = 0LL;
if ( hi->m_prefix ) {
prefixHash = hash64b ( hi->m_prefix );
final = hash64 ( termId , prefixHash );
}
// call the other guy now
//return hashSingleTerm ( final , hi );
	// shortcut
HashTableX *dt = hi->m_tt;
// sanity check
if ( dt->m_ks != sizeof(key144_t) ) { char *xx=NULL;*xx=0; }
// make the key like we do in hashWords()
key144_t k;
g_posdb.makeKey ( &k ,
final,
0LL, // docid
0, // dist
MAXDENSITYRANK, // density rank
MAXDIVERSITYRANK, // diversity rank
MAXWORDSPAMRANK, // wordspamrank
0, // siterank
hi->m_hashGroup,
// we set to docLang in final hash loop
langUnknown,// langid
0, // multiplier
0, // syn?
false , // delkey?
hi->m_shardByTermId );
//
// HACK: mangle the key if its a gbsitehash:xxxx term
// used for doing "facets" like stuff on section xpaths.
//
// no longer do this because we just hash the term
// gbxpathsitehash1234567 where 1234567 is that hash.
// but
//
//static int64_t s_gbsectionhash = 0LL;
//if ( ! s_gbsectionhash ) s_gbsectionhash = hash64b("gbsectionhash");
//if ( prefixHash == s_gbsectionhash )
// g_posdb.setSectionSentHash32 ( &k, hi->m_sentHash32 );
// . otherwise, add a new slot
// . key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
if ( ! dt->addTerm144 ( &k ) ) return false;
// add to wts for PageParser.cpp display
if ( m_wts && ! storeTerm ( s,slen,final,hi,
0, // wordnum
0, // wordPos,
MAXDENSITYRANK,
MAXDIVERSITYRANK,
MAXWORDSPAMRANK,
hi->m_hashGroup,
//false,
&m_wbuf,
m_wts,
SOURCE_NONE, // synsrc
langUnknown,
k) )
return false;
return true;
}
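// Illustrative sketch (comments only, not compiled): for a hypothetical
// prefix "type" and the value "text/html", hashSingleTerm() above composes
// its termid like this:
//
//   int64_t termId = hash64 ( "text/html" , 9 );
//   int64_t final  = hash64 ( termId , hash64b ( "type" ) );
//
// and then calls g_posdb.makeKey() with "final" and the maximum density,
// diversity and wordspam ranks, since a single blob term has no real word
// position of its own.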
bool XmlDoc::hashString ( char *s, HashInfo *hi ) {
return hashString ( s , gbstrlen(s), hi ); }
bool XmlDoc::hashString ( char *s ,
int32_t slen ,
HashInfo *hi ) {
if ( ! m_versionValid ) { char *xx=NULL;*xx=0; }
if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; }
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
int32_t *sni = getSiteNumInlinks();
return hashString3( s ,
slen ,
hi ,
&m_countTable ,
m_pbuf ,
m_wts ,
&m_wbuf ,
m_version ,
*sni ,
m_niceness );
}
bool XmlDoc::hashString3( char *s ,
int32_t slen ,
HashInfo *hi ,
HashTableX *countTable ,
SafeBuf *pbuf ,
HashTableX *wts ,
SafeBuf *wbuf ,
int32_t version ,
int32_t siteNumInlinks ,
int32_t niceness ) {
Words words;
Bits bits;
Phrases phrases;
//Weights weights;
//Synonyms synonyms;
if ( ! words.set ( s , slen , version , true , niceness ) )
return false;
if ( ! bits.set ( &words , version , niceness ) )
return false;
if ( ! phrases.set(&words,&bits,true,false,version,niceness ) )
return false;
// use primary langid of doc
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
// words
//SafeBuf myLangVec;
//if ( ! setLangVec ( &words , &myLangVec , m_niceness ) )
// return false;
//char *langVec = (char *)myLangVec.getBufStart();
/*
// debugBuf for synonyms? yes if we are debugging
SafeBuf synDebugBuf;
SafeBuf *sdbp = NULL;
if ( pbuf || m_storeTermListInfo ) sdbp = &synDebugBuf;
// now we can set it...
if ( hi->m_useSynonyms && !synonyms.set(&words,
NULL, // langVec,
m_langId,
&phrases,
niceness,
sdbp))
return false;
*/
// set weights because of count table
//if ( countTable && ! weights.set ( &words ,
/*
if ( hi->m_useWeights &&
! weights.set ( &words ,
&phrases ,
&bits ,
NULL ,
pbuf ,
false ,
false ,
version ,
100 , // titleWeight
100 , // headerWeight
countTable ,
false , // isLinkText
false , // isCntTable?
siteNumInlinks ,
niceness ) )
return false;
Weights *wp = &weights;
if ( ! hi->m_useWeights ) wp = NULL;
*/
//Synonyms *sp = NULL;
//if ( hi->m_useSynonyms ) sp = &synonyms;
return hashWords3 ( //0 ,
//words.getNumWords() ,
hi ,
&words ,
&phrases ,
NULL,//sp , synonyms
NULL , // sections
countTable ,
NULL , // fragvec
NULL , // wordspamvec
NULL , // langvec
langUnknown , // default langid doclangid
pbuf ,
wts ,
wbuf ,
niceness );
}
bool XmlDoc::hashWords ( //int32_t wordStart ,
//int32_t wordEnd ,
HashInfo *hi ) {
// sanity checks
if ( ! m_wordsValid ) { char *xx=NULL; *xx=0; }
if ( ! m_phrasesValid ) { char *xx=NULL; *xx=0; }
if ( hi->m_useCountTable &&!m_countTableValid){char *xx=NULL; *xx=0; }
if ( ! m_bitsValid ) { char *xx=NULL; *xx=0; }
if ( ! m_sectionsValid) { char *xx=NULL; *xx=0; }
//if ( ! m_synonymsValid) { char *xx=NULL; *xx=0; }
if ( ! m_fragBufValid ) { char *xx=NULL; *xx=0; }
if ( ! m_wordSpamBufValid ) { char *xx=NULL; *xx=0; }
if ( m_wts && ! m_langVectorValid ) { char *xx=NULL; *xx=0; }
if ( ! m_langIdValid ) { char *xx=NULL; *xx=0; }
// . is the word repeated in a pattern?
// . this should only be used for the document body; for meta tags,
// inlink text, etc. we should make sure words are unique
char *wordSpamVec = getWordSpamVec();
char *fragVec = m_fragBuf.getBufStart();
char *langVec = m_langVec.getBufStart();
return hashWords3( //wordStart ,
//wordEnd ,
hi ,
&m_words ,
&m_phrases ,
NULL,//&m_synonyms ,
&m_sections ,
&m_countTable ,
fragVec ,
wordSpamVec ,
langVec ,
m_langId , // defaultLangId docLangId
m_pbuf ,
m_wts ,
&m_wbuf ,
m_niceness );
}
// . this now uses posdb exclusively
bool XmlDoc::hashWords3 ( //int32_t wordStart ,
//int32_t wordEnd ,
HashInfo *hi ,
Words *words ,
Phrases *phrases ,
Synonyms *synonyms ,
Sections *sectionsArg ,
HashTableX *countTable ,
char *fragVec ,
char *wordSpamVec ,
char *langVec ,
char docLangId , // default lang id
//Weights *weights ,
SafeBuf *pbuf ,
HashTableX *wts ,
SafeBuf *wbuf ,
int32_t niceness ) {
//
// POSDB HACK: temporarily turn off posdb until we hit 1B pages!
//
//if ( ! m_storeTermListInfo )
// return true;
Sections *sections = sectionsArg;
// for getSpiderStatusDocMetaList() we don't use sections; it'll
// mess us up
if ( ! hi->m_useSections ) sections = NULL;
// shortcuts
uint64_t *wids = (uint64_t *)words->getWordIds();
//nodeid_t *tids = words->m_tagIds;
uint64_t *pids2 = (uint64_t *)phrases->m_phraseIds2;
//uint64_t *pids3 = (uint64_t *)phrases->m_phraseIds3;
HashTableX *dt = hi->m_tt;
// . sanity checks
// . posdb just uses the full keys with docid
if ( dt->m_ks != 18 ) { char *xx=NULL;*xx=0; }
if ( dt->m_ds != 4 ) { char *xx=NULL;*xx=0; }
// if provided...
if ( wts ) {
if ( wts->m_ks != 12 ) { char *xx=NULL;*xx=0; }
if ( wts->m_ds != sizeof(TermDebugInfo)){char *xx=NULL;*xx=0; }
if ( ! wts->m_allowDups ) { char *xx=NULL;*xx=0; }
}
// ensure caller set the hashGroup
if ( hi->m_hashGroup < 0 ) { char *xx=NULL;*xx=0; }
// handy
char **wptrs = words->getWordPtrs();
int32_t *wlens = words->getWordLens();
// hash in the prefix
uint64_t prefixHash = 0LL;
int32_t plen = 0;
if ( hi->m_prefix ) plen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && plen ) {
// we gotta make this case insensitive and skip spaces:
// if the field is 'focal length' we can't search
// 'focal length:10' because that comes across as TWO terms.
prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
// . sanity test, make sure it is in supported list
// . hashing diffbot json output of course fails this so
// skip in that case if diffbot
//if ( ! m_isDiffbotJSONObject &&
// getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) {
// if (hi->m_desc&&strcmp(hi->m_desc,"custom meta tag")) {
// char *xx=NULL;*xx=0; }
//}
}
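// illustrative example: a hypothetical field named "Focal Length" hashes
// the same here as "focallength", so it can be queried as a single
// fielded term like focallength:10 instead of two space-separated ones.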
bool hashIffUnique = false;
//if ( hi->m_hashGroup == HASHGROUP_INLINKTEXT ) hashIffUnique = true;
if ( hi->m_hashGroup == HASHGROUP_INMETATAG ) hashIffUnique = true;
if ( hi->m_hashGroup == HASHGROUP_INTAG ) hashIffUnique = true;
HashTableX ut; ut.set ( 8,0,0,NULL,0,false,niceness,"uqtbl");
///////
//
// diversity rank vector.
//
///////
// the final diversity which is a multiplier
// is converted into a rank from 0-15 i guess.
// so 'mexico' in "new mexico" should receive a low word score but high
// phrase score. thus, a search for 'mexico' should not bring up
// the page for university of new mexico!
SafeBuf dwbuf;
if(!getDiversityVec ( words,phrases,countTable,&dwbuf,niceness))
return false;
char *wdv = dwbuf.getBufStart();
int32_t nw = words->getNumWords();
/////
//
// calculate density ranks
//
/////
//
// this now varies depending on the length of the sentence/header etc.
// so if the hashGroup is not title, link text or meta tag, we have to
// use a safebuf.
SafeBuf densBuf;
// returns false and sets g_errno on error
if ( ! getDensityRanks((int64_t *)wids,
nw,//wordStart,
//wordEnd,
hi->m_hashGroup,
&densBuf,
sections,
m_niceness))
return false;
// a handy ptr
char *densvec = (char *)densBuf.getBufStart();
////////////
//
// get word positions
//
///////////
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
SafeBuf wpos;
if ( ! getWordPosVec ( words ,
sections,
//wordStart,
//wordEnd,
m_dist, // hi->m_startDist,
fragVec,
niceness,
&wpos) ) return false;
// a handy ptr
int32_t *wposvec = (int32_t *)wpos.getBufStart();
/*
// show that for debug
if ( m_docId == 192304365235LL ) {
for ( int32_t i = 0 ; i < nw ; i++ ) {
char buf[1000];
int32_t len = wlens[i];
if ( len > 900 ) len = 900;
gbmemcpy(buf,wptrs[i],len);
buf[len]='\0';
log("seopipe: wptr=%s pos[%"INT32"]=%"INT32"",buf,i,wposvec[i]);
}
}
*/
//int32_t wc = 0;
//int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
int32_t i;
for ( i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL(niceness);
if ( ! wids[i] ) continue;
// ignore if in repeated fragment
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) continue;
// ignore if in style section
if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue;
// do not breach wordpos bits
if ( wposvec[i] > MAXWORDPOS ) break;
// . hash the startHash with the wordId for this word
// . we must mask it before adding it to the table because
// this table is also used to hash in IndexLists that come
// from LinkInfo classes (incoming link text). And when
// those IndexLists are hashed they use masked termIds.
// So we should too...
//uint64_t h = g_indexdb.getTermId ( startHash , wids[i] ) ;
uint64_t h ;
if ( plen > 0 ) h = hash64 ( wids[i] , prefixHash );
else h = wids[i];
// . get word spam rank. 0 means not spammed
// . just mod Weights class to use a weight rank...
// . and diversity rank
// . need to separate weights by spam vs. diversity.
// . maybe just have a diversity class and a pattern class
// and leave the poor weights class alone
//int32_t wsr = 0;
int32_t hashGroup = hi->m_hashGroup;
Section *sx = NULL;
if ( sp ) {
sx = sp[i];
// . this is taken care of in hashTitle()
// . it is slightly different if the title is
// multiple sentences because when hashing the
// body the density rank is per sentence, but in
// hashTitle we count all the words in the title
// towards the density rank even if they are
// in different sentences
if ( sx->m_flags & SEC_IN_TITLE )
//hashGroup = HASHGROUP_TITLE;
continue;
if ( sx->m_flags & SEC_IN_HEADER )
hashGroup = HASHGROUP_HEADING;
if ( sx->m_flags & ( SEC_MENU |
SEC_MENU_SENTENCE |
SEC_MENU_HEADER ) )
hashGroup = HASHGROUP_INMENU;
}
// this is for link text and meta tags mostly
if ( hashIffUnique ) {
// skip if already did it
if ( ut.isInTable ( &h ) ) continue;
if ( ! ut.addKey ( &h ) ) return false;
}
char ws = 15;
if ( wordSpamVec ) ws = wordSpamVec[i];
// HACK:
// if this is inlink text, use the wordspamrank to hold the
// inlinker's site rank!
if ( hashGroup == HASHGROUP_INLINKTEXT )
ws = hi->m_linkerSiteRank;
// default to the document's primary language if it is not
// clear what language this word belongs to.
// if the word is only in german it should be german,
// otherwise it will be the document's primary language.
char langId = langUnknown;
if ( m_wts && langVec ) langId = langVec[i];
// keep it as the original vector. i'm not sure we use
// this for anything but for display, so show the user
// how we made our calculation of the document's primary lang
//if ( langId == langUnknown ) langId = docLangId;
char wd;
if ( hi->m_useCountTable ) wd = wdv[i];
else wd = MAXDIVERSITYRANK;
// if using posdb
key144_t k;
// if ( i == 11429 )
// log("foo");
g_posdb.makeKey ( &k ,
h ,
0LL,//docid
wposvec[i], // dist,
densvec[i],// densityRank , // 0-15
wd, // diversityRank 0-15
ws, // wordSpamRank 0-15
0, // siterank
hashGroup ,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
false , // syn?
false , // delkey?
hi->m_shardByTermId );
// get the one we lost
// char *kstr = KEYSTR ( &k , sizeof(POSDBKEY) );
// if (!strcmp(kstr,"0x0ca3417544e400000000000032b96bf8aa01"))
// log("got lost key");
// key should NEVER collide since we are always incrementing
// the distance cursor, m_dist
dt->addTerm144 ( &k );
// . make the m_wordPosInfoBuf here because we need to set
// WordPosInfo::m_wordPtr/m_wordLen.
// . could also use instead of the "wts" buffer?
if ( m_doingSEO ) {
// alloc in 10k chunks
if ( m_wordPosInfoBuf.getAvail() <
(int32_t)sizeof(WordPosInfo) ) {
int32_t newSize = m_wordPosInfoBuf.length();
newSize += 10000;
if ( ! m_wordPosInfoBuf.reserve ( newSize ) )
return false;
}
// make it
WordPosInfo wi;
wi.m_wordPtr = wptrs[i];
wi.m_wordLen = wlens[i];
wi.m_wordPos = wposvec[i];
wi.m_densityRank = densvec[i];
wi.m_wordSpamRank = ws;
wi.m_diversityRank = wd;//v[i];
wi.m_hashGroup = hashGroup;
wi.m_trafficGain = 0;
int32_t cs = sizeof(WordPosInfo);
if(!m_wordPosInfoBuf.safeMemcpy(&wi,cs)) return false;
}
// add to wts for PageParser.cpp display
if ( wts ) {
if ( ! storeTerm ( wptrs[i],wlens[i],h,hi,i,
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
wd,//v[i],
ws,
hashGroup,
//false, // is phrase?
wbuf,
wts,
SOURCE_NONE, // synsrc
langId ,
k))
return false;
}
//
// STRIP POSSESSIVE WORDS for indexing
//
// . for now do simple stripping here
// . if word is "bob's" hash "bob"
//
if ( wlens[i] >= 3 &&
wptrs[i][wlens[i]-2] == '\'' &&
to_lower_a(wptrs[i][wlens[i]-1]) == 's' ) {
int64_t nah ;
nah = hash64Lower_utf8 ( wptrs[i], wlens[i]-2 );
if ( plen>0 ) nah = hash64 ( nah , prefixHash );
g_posdb.makeKey ( &k ,
nah,
0LL,//docid
wposvec[i], // dist,
densvec[i],// densityRank , // 0-15
wd,//v[i], // diversityRank ,
ws, // wordSpamRank ,
0, //siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
true , // syn?
false , // delkey?
hi->m_shardByTermId );
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144 ( &k );
// keep going if not debug
if ( ! wts ) continue;
// print the synonym
if ( ! storeTerm(wptrs[i], // synWord,
wlens[i] -2, // gbstrlen(synWord),
nah, // termid
hi,
i, // wordnum
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
wd,//v[i],
ws,
hashGroup,
//false, // is phrase?
wbuf,
wts,
SOURCE_GENERATED,
langId,
k) )
return false;
}
/////////////
//
// synonyms (alt words,morphs,synonyms)
//
/////////////
/*
int64_t *aids = NULL;
int16_t naids = 0;
int64_t syh;
if ( synonyms ) {
aids = synonyms->getAltIds (i);
naids = synonyms->getNumAlts(i);
//ascore = saved / 4;
//if ( ascore <= 0 ) ascore = 1;
//asaved = ascore;
}
for ( int32_t j = 0 ; j < naids ; j++ ) {
// skip if same as original
if ( (uint64_t)aids[j] == wids[i] ) continue;
// . hash it with the prefix if any
// . fixes gbwhere:galleries bug...
if ( plen>0 ) syh = hash64 ( aids[j] , prefixHash );
else syh = aids[j];
g_posdb.makeKey ( &k ,
syh ,
0LL,//docid
wposvec[i], // dist,
densvec[i],// densityRank , // 0-15
wdv[i], // diversityRank ,
ws, // wordSpamRank ,
0, //siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
true , // syn?
false ); // delkey?
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144 ( &k );
// keep going if not debug
if ( ! wts ) continue;
// get the junk
char *synWord = synonyms->getStringFromId(&aids[j]);
// sanity
if ( ! synWord ) { char *xx=NULL;*xx=0; }
// print the synonym
if ( ! storeTerm(synWord,
gbstrlen(synWord),
syh, // termid
hi,
i, // wordnum
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
wdv[i],
ws,
hashGroup,
//false, // is phrase?
wbuf,
wts,
synonyms->m_source[i], // synsrc
langId) )
return false;
}
*/
////////
//
// two-word phrase
//
////////
int64_t npid = pids2[i];
int32_t npw = 2;
uint64_t ph2 = 0;
// repeat for the two word hash if different!
if ( npid ) {
// hash with prefix
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
else ph2 = npid;
g_posdb.makeKey ( &k ,
ph2 ,
0LL,//docid
wposvec[i],//dist,
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK, //phrase
ws, // wordSpamRank ,
0,//siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
true , // syn?
false , // delkey?
hi->m_shardByTermId );
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144 ( &k );
}
// add to wts for PageParser.cpp display
if ( wts && npid ) {
// get phrase as a string
int32_t plen;
char *phr=phrases->getPhrase(i,&plen,npw);
// store it
if ( ! storeTerm ( phr,plen,ph2,hi,i,
wposvec[i], // wordPos
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK,//phrase
ws,
hashGroup,
//true,
wbuf,
wts,
SOURCE_BIGRAM, // synsrc
langId,
k) )
return false;
}
////////
//
// three-word phrase
//
////////
/*
npid = pids3[i];
npw = 3;
// repeat for the two word hash if different!
if ( npid ) {
// hash with prefix
uint64_t ph2 ;
if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash );
else ph2 = npid;
g_posdb.makeKey ( &k ,
ph2 ,
0LL,//docid
wposvec[i],//dist,
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK, //phrase
ws, // wordSpamRank ,
0,//siterank
hashGroup,
// we set to docLang final hash loop
langUnknown, // langid
0 , // multiplier
true , // syn?
false ); // delkey?
// key should NEVER collide since we are always
// incrementing the distance cursor, m_dist
dt->addTerm144 ( &k );
}
// add to wts for PageParser.cpp display
if ( wts && npid ) {
// get phrase as a string
int32_t plen;
char *phr=phrases->getPhrase(i,&plen,npw);
// store it
if ( ! storeTerm ( phr,plen,ph2,hi,i,
wposvec[i], // wordpos
densvec[i],// densityRank , // 0-15
MAXDIVERSITYRANK,//phrase
ws,
hashGroup,
//true, // is phrase?
wbuf,
wts,
SOURCE_TRIGRAM, // synsrc
langId ) )
return false;
}
*/
// update for hashIncomingLinkText()
//hi->m_startDist = wposvec[i];
// debug point
//if ( ph2 == (uint64_t)-233869093807964777LL ) {
// log("hey slot=%"INT32" date=%"UINT32" n0=%"INT64" score=%"INT32"",
// slot,
// k.n1,k.n0,
// score);
// //char *xx=NULL;*xx=0;
//}
//
// NUMERIC SORTING AND RANGES
//
// only store numbers in fields this way
if ( prefixHash == 0 ) continue;
// this may or may not be numeric.
if ( ! is_digit ( wptrs[i][0] ) ) continue;
// this might have to "back up" before any '.' or '-' symbols
if ( ! hashNumber ( wptrs[0] ,
wptrs[i] ,
wlens[i] ,
hi ) )
return false;
}
// hash a single term so they can do gbfacet:ext or
// gbfacet:siterank or gbfacet:price. a field on a field.
if ( prefixHash && words->m_numWords )
// hash gbfacet:price and store the price value in the key
hashFacet1 ( hi->m_prefix, words ,hi->m_tt);//, hi );
// between calls? i.e. hashTitle() and hashBody()
//if ( wc > 0 ) m_dist = wposvec[wc-1] + 100;
if ( i > 0 ) m_dist = wposvec[i-1] + 100;
return true;
}
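// Rough summary (illustrative): for a single indexable body word such as
// "bob's" at word position p, the loop above can emit several posdb keys:
// the plain word key built from wids[i] (combined with the prefix hash if
// hi->m_prefix is set), a possessive-stripped key for "bob" marked as a
// synonym, and, when pids2[i] is non-zero, a two-word phrase key at the
// same position. The numeric gbsortby/gbrevsortby keys are only added when
// a prefix is present and the word starts with a digit.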
// just like hashNumber*() functions but we use "gbfacet" as the
// primary prefix, NOT gbminint, gbmin, gbmax, gbmaxint, gbsortby,
// gbsortbyint, gbrevsortby, gbrevsortbyint
bool XmlDoc::hashFacet1 ( char *term ,
Words *words ,
HashTableX *tt ) {
// need a prefix
//if ( ! hi->m_prefix ) return true;
// hash the ENTIRE content, all words as one blob
int32_t nw = words->getNumWords();
char *a = words->m_words[0];
char *b = words->m_words[nw-1]+words->m_wordLens[nw-1];
// hash the whole string as one value, the value of the facet
int32_t val32 = hash32 ( a , b - a );
if ( ! hashFacet2 ( "gbfacetstr",term, val32 , tt ) ) return false;
//
// why do this if we already do it for hashNumber() using gbsortby: ?
//
/*
// if it's a number hash as float and int
if ( nw != 1 ) return true;
char **wptrs = words->m_words;
if ( ! is_digit ( wptrs[0][0] ) ) return true;
// hash with a float val
float f = atof(wptrs[0]);
int32_t vf32 = *(int32_t *)&f;
if ( ! hashFacet2 ( "gbfacetfloat",term, vf32 , tt ) ) return false;
// and an int val
int32_t vi32 = atoi(wptrs[0]);
if ( ! hashFacet2 ( "gbfacetint",term, vi32 , tt ) ) return false;
*/
return true;
}
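// Illustrative sketch (comments only, not compiled): hashing a hypothetical
// meta tag "ext" whose content is "html" boils down to
//
//   int32_t val32 = hash32 ( "html" , 4 );
//   hashFacet2 ( "gbfacetstr" , "ext" , val32 , tt );
//
// which is what lets a query like gbfacetstr:ext build a histogram of the
// 32-bit values stored in those posdb keys.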
bool XmlDoc::hashFacet2 ( char *prefix,
char *term ,
int32_t val32 ,
HashTableX *tt ,
// we only use this for gbxpathsitehash terms:
bool shardByTermId ) {
// need a prefix
//if ( ! hi->m_prefix ) return true;
//int32_t plen = gbstrlen ( hi->m_prefix );
//if ( plen <= 0 ) return true;
// we gotta make this case insensitive, and skip spaces
// because if it is 'focal length' we can't search
// 'focal length:10' because that comes across as TWO terms.
//int64_t prefixHash =hash64Lower_utf8_nospaces ( hi->m_prefix,plen);
// now any field has to support gbfacet:thatfield
// and store the 32-bit termid into where we normally put
// the word position bits, etc.
//static int64_t s_facetPrefixHash = 0LL;
//if ( ! s_facetPrefixHash )
// s_facetPrefixHash = hash64n ( "gbfacet" );
// this is case-sensitive
int64_t prefixHash = hash64n ( prefix );
// "term" is something like "object.price" or whatever.
// it is the json field itself, or the meta tag name, etc.
int64_t termId64 = hash64n ( term );
// combine with the "gbfacet" prefix. old prefix hash on right.
// like "price" on right and "gbfacetfloat" on left... see Query.cpp.
int64_t ph2 = hash64 ( termId64, prefixHash );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
g_posdb.setInt ( &k , val32 );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
HashTableX *dt = tt;//hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
bool isFloat = false;
if ( strcmp(prefix,"gbfacetfloat")==0 ) isFloat = true;
// store in buffer for display on pageparser.cpp output
char buf[130];
if ( isFloat )
snprintf(buf,128,"facetField=%s facetVal32=%f",term,
*(float *)&val32);
else
snprintf(buf,128,"facetField=%s facetVal32=%"UINT32"",
term,(uint32_t)val32);
int32_t bufLen = gbstrlen(buf);
// make a special hashinfo for this facet
HashInfo hi;
hi.m_tt = tt;
// the full prefix
char fullPrefix[66];
snprintf(fullPrefix,64,"%s:%s",prefix,term);
hi.m_prefix = fullPrefix;//"gbfacet";
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
ph2, // prefixHash, // s_facetPrefixHash,
&hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;
return true;
}
bool XmlDoc::hashFieldMatchTerm ( char *val , int32_t vlen , HashInfo *hi ) {
HashTableX *tt = hi->m_tt;
uint64_t val64 = hash64 ( val , vlen );
// "term" is something like "object.price" or whatever.
// it is the json field itself, or the meta tag name, etc.
uint64_t middlePrefix = hash64n ( hi->m_prefix );
// hash "This is a new product." with "object.desc".
// "object.desc" (termId64) is case-sensitive.
uint64_t composite = hash64 ( val64 , middlePrefix );
// hash that with "gbfieldmatch"
char *prefix = "gbfieldmatch";
uint64_t prefixHash = hash64n ( prefix );
uint64_t ph2 = hash64 ( composite , prefixHash );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
false ) ; // shardByTermId? no, by docid.
HashTableX *dt = tt;//hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer for display on pageparser.cpp output
char buf[128];
int32_t bufLen ;
bufLen = sprintf(buf,"gbfieldmatch:%s:%"UINT64"",hi->m_prefix,val64);
// make a special hashinfo for this facet
HashInfo hi2;
hi2.m_tt = tt;
// the full prefix
char fullPrefix[64];
snprintf(fullPrefix,62,"%s:%s",prefix,hi->m_prefix);
hi2.m_prefix = fullPrefix;//"gbfacet";
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
ph2, // prefixHash, // s_facetPrefixHash,
&hi2,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;
return true;
}
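// Illustrative sketch (comments only, not compiled): for a hypothetical
// json field "object.desc" holding "This is a new product." the termid
// above composes as three nested hashes:
//
//   uint64_t val64     = hash64 ( "This is a new product." , 22 );
//   uint64_t composite = hash64 ( val64 , hash64n ( "object.desc" ) );
//   uint64_t ph2       = hash64 ( composite , hash64n ( "gbfieldmatch" ) );
//
// so gbfieldmatch:object.desc only matches documents whose field value
// hashes to the same val64 (an exact, case-sensitive match).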
// . we store numbers as floats in the top 4 bytes of the lower 6 bytes of the
// posdb key
// . the termid is the hash of the preceding field
// . in json docs a field is like "object.details.price"
// . in meta tags it is just the meta tag name
// . credit card numbers are 16 digits. we'd need like 58 bits to store those
// so we can't do that here, but we can approximate as a float
// . the binary representation of non-negative floating point numbers is
// ordered the same way as the floats themselves! so we are lucky and can
// keep our usual KEYCMP sorting algos to keep the floats in order.
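// . worked example of that ordering property (for non-negative values):
// 0.5f -> 0x3F000000 , 1.0f -> 0x3F800000 , 2.0f -> 0x40000000 ,
// so comparing the raw 32-bit patterns as unsigned integers gives the same
// order as comparing the floats themselves, which is why we can drop the
// bits straight into the key and keep the normal key sort.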
bool XmlDoc::hashNumber ( char *beginBuf ,
char *buf ,
int32_t bufLen ,
HashInfo *hi ) {
if ( ! is_digit(buf[0]) ) return true;
char *p = buf;
char *bufEnd = buf + bufLen;
// back-up over any .
if ( p > beginBuf && p[-1] == '.' ) p--;
// negative sign?
if ( p > beginBuf && p[-1] == '-' ) p--;
// . convert it to a float
// . this now allows for commas in numbers like "1,500.62"
float f = atof2 ( p , bufEnd - p );
// debug
//log("build: hashing %s %f",hi->m_prefix,f);
if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
return false;
// also hash in reverse order for sorting from low to high
f = -1.0 * f;
if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) )
return false;
//
// also hash as an int, 4 byte-integer so our lastSpidered timestamps
// dont lose 128 seconds of resolution
//
int32_t i = (int32_t) atoll2 ( p , bufEnd - p );
if ( ! hashNumber3 ( i , hi , "gbsortbyint" ) )
return false;
// also hash in reverse order for sorting from low to high
i = -1 * i;
if ( ! hashNumber3 ( i , hi , "gbrevsortbyint" ) )
return false;
return true;
}
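// Illustrative sketch (comments only, not compiled): for a hypothetical
// field "price" whose digit string is "42", hashNumber() above ends up
// adding four numeric terms for that one value:
//
//   hashNumber2 (  42.0 , hi , "gbsortby"       );
//   hashNumber2 ( -42.0 , hi , "gbrevsortby"    );
//   hashNumber3 (  42   , hi , "gbsortbyint"    );
//   hashNumber3 ( -42   , hi , "gbrevsortbyint" );
//
// the negated copies are what let queries sort the same field ascending or
// descending without having to scan the termlist backwards.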
// . THIS IS NOW replaced by ::hashFacet2() being called by hashSections()
// above. it is a more generic, faceted approach.
// . the term is gbxpathsite123456, the prefix is gbfacet, and the val32
// stored in the posdb key is the inner html hash of the section, and
// the "123456" is the hash of the xpath and site. so the field names
// are very custom, not your typical "ext" or "title"
// . CHROME DETECTION
// . hash a special "gbxpathsitehash12345678" term which has the hash of the
// innerHTML content embedded in it.
// . we do this for doing gbfacetstr:gbxpathsitehash12345678 etc. on every
// section with innerHTML so we can figure out the histogram of each
// section on this page relative to its subdomain. like the distribution
// of the innerHTML for this section as it appears on other pages from
// this site. this allows killer CHROME DETECTION!!!!
/*
bool XmlDoc::hashSectionTerm ( char *term , HashInfo *hi , int32_t sentHash32 ) {
int64_t termId = hash64 ( term , gbstrlen(term) );
key144_t k;
g_posdb.makeKey ( &k ,
termId,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
g_posdb.setInt ( &k , sentHash32 );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
// sanity
int t = g_posdb.getInt ( &k );
if ( t != sentHash32 ) { char *xx=NULL;*xx=0; }
HashTableX *dt = hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer
//char buf[128];
//int32_t bufLen = sprintf(buf,"%"UINT32"",sentHash32);
// if no gbmin or gbmax or gbsorty or gbrevsortby we need gbfacet
//int64_t truePrefix64 = hash64n ( "gbfacet" );
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( term,//buf,
gbstrlen(term),//bufLen,
0LL,//truePrefix64,
hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k))
return false;
return true;
}
*/
bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
// prefix is something like price. like the meta "name" or
// the json name with dots in it like "product.info.price" or something
int64_t nameHash = 0LL;
int32_t nameLen = 0;
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && nameLen )
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
// need a prefix for hashing numbers... for now
else { char *xx=NULL; *xx=0; }
// combine prefix hash with a special hash to make it unique to avoid
// collisions. this is the "TRUE" prefix.
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
// hash with the "TRUE" prefix
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
g_posdb.setFloat ( &k , f );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
// sanity
float t = g_posdb.getFloat ( &k );
if ( t != f ) { char *xx=NULL;*xx=0; }
HashTableX *dt = hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer
char buf[128];
snprintf(buf,126,"%s:%s float32=%f",sortByStr,hi->m_prefix,f);
int32_t bufLen = gbstrlen(buf);
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
truePrefix64,
hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;
return true;
}
bool XmlDoc::hashNumber3 ( int32_t n , HashInfo *hi , char *sortByStr ) {
// prefix is something like price. like the meta "name" or
// the json name with dots in it like "product.info.price" or something
int64_t nameHash = 0LL;
int32_t nameLen = 0;
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && nameLen )
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
// need a prefix for hashing numbers... for now
else { char *xx=NULL; *xx=0; }
// combine prefix hash with a special hash to make it unique to avoid
// collisions. this is the "TRUE" prefix.
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
// hash with the "TRUE" prefix
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
//g_posdb.setFloat ( &k , f );
g_posdb.setInt ( &k , n );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
// sanity
//float t = g_posdb.getFloat ( &k );
int32_t x = g_posdb.getInt ( &k );
if ( x != n ) { char *xx=NULL;*xx=0; }
HashTableX *dt = hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer
char buf[128];
snprintf(buf,126,"%s:%s int32=%"INT32"",sortByStr, hi->m_prefix,n);
int32_t bufLen = gbstrlen(buf);
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
truePrefix64,
hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k ) )
return false;
return true;
}
// . many many websites got hijacked pages in them...
// . revkim.org/mcdrt/mgntf/sata/sata.htm
// . collegefootballweekly.net/hswsj/riime/sata/sata.htm
char *XmlDoc::getIsHijacked() {
bool hj = false;
if ( ! hj ) hj = isHijackerFormat ( ptr_firstUrl );
if ( ! hj ) hj = isHijackerFormat ( ptr_redirUrl );
if ( ! hj ) {
m_isHijacked = false;
m_isHijackedValid = true;
return &m_isHijacked;
}
uint32_t *h1 = getTagPairHash32();
if ( ! h1 || h1 == (void *)-1 ) return (char *)h1;
// TODO: check it for the malicious tag formats here!!
m_isHijacked = false;
m_isHijackedValid = true;
return &m_isHijacked;
}
// is it a custom error page? ppl do not always use status 404!
char *XmlDoc::getIsErrorPage ( ) {
if ( m_isErrorPageValid ) return &m_isErrorPage;
setStatus ( "getting is error page");
// need a buncha crap
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
// get local link info
LinkInfo *info1 = getLinkInfo1();
// error or blocked
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
// get remote link info
LinkInfo **pinfo2 = getLinkInfo2();
// error or blocked
if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (char *)pinfo2;
// convenience
LinkInfo *info2 = *pinfo2;
// default
LinkInfo *li = info1;
//we have to be more sophisticated with longer pages because they
//could actually be talking about an error message.
//if(xml->getContentLen() > 4096) return false;
// assume not
m_isErrorPage = false;
m_isErrorPageValid = true;
int32_t nn = xml->getNumNodes();
int32_t i;
char* s;
int32_t len;
int32_t len2;
char* errMsg = NULL;
int32_t numChecked = 0;
// check the first header and title tag
// limit it to first 32 nodes
if(nn > 32) nn = 32;
for ( i = 0 ; i < nn ; i++ ) {
switch(xml->getNodeId(i)) {
case TAG_TITLE:
case TAG_H1:
case TAG_H2:
case TAG_H3:
case TAG_SPAN:
char* p = xml->getString(i,true,&len);
if(len == 0 || len > 1024) continue;
char* pend = p + len;
errMsg = matchErrorMsg(p, pend );
++numChecked;
break;
}
if(errMsg || numChecked > 1) break;
}
if(!errMsg) return &m_isErrorPage;
len = gbstrlen(errMsg);
// make sure the error message was not present in the link text
loop:
if ( li && li->getNumGoodInlinks() > 5 ) return &m_isErrorPage;
for (Inlink *k=NULL;li && (k=li->getNextInlink(k)); ) {
//int32_t nli = li->getNumLinkTexts();
//if we can index some link text from the page, then do it
//if(nli > 5) return false;
//for ( int32_t i = 0 ; i < nli ; i++ ) {
s = k->getLinkText();
len2 = k->size_linkText - 1; // exclude \0
//if(!s) break;
//allow error msg to contain link text or vice versa
if(len < len2) {
if(strncasestr(errMsg, s,len,len2) != NULL)
return &m_isErrorPage;
}
else {
if(strncasestr(s, errMsg,len2,len) != NULL)
return &m_isErrorPage;
}
}
if ( li ) { li = info2; info2 = NULL; goto loop; }
m_isErrorPage = true;
return &m_isErrorPage;
}
char* XmlDoc::matchErrorMsg(char* p, char* pend ) {
char utf8Buf[1024];
// int32_t utf8Len = 0;
int32_t len = pend - p;
if(len > 1024) len = 1024;
pend = p + len;
char* tmp = utf8Buf;
while(p < pend) {
*tmp = to_lower_a(*p);
tmp++; p++;
}
p = utf8Buf;
pend = p + len;
char* errMsg = NULL;
while(p < pend) {
int32_t r = pend - p;
switch (*p) { //sorted by first letter, then by frequency
case '4':
errMsg = "404 error";
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
errMsg = "403 forbidden";
if(r>=13&&strncmp(p, errMsg, 13) == 0) return errMsg;
break;
case 'd':
errMsg = "detailed error information follows";
if(r>=34&&strncmp(p, errMsg, 34) == 0) return errMsg;
break;
case 'e':
errMsg = "error 404";
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
errMsg = "error was encountered while processing "
"your request";
if(r>=51&&strncmp(p, errMsg,51) == 0) return errMsg;
errMsg = "error occurred while processing request";
if(r>=39&&strncmp(p, errMsg, 39) == 0) return errMsg;
errMsg = "exception error has occurred";
if(r>=28&&strncmp(p, errMsg,28) == 0) return errMsg;
errMsg = "error occurred";
if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg;
//http://www.gnu.org/fun/jokes/unix.errors.html
//errMsg = "error message";
//if(strncmp(p, errMsg, 13) == 0) return errMsg;
break;
case 'f':
errMsg = "file not found";
if(r>=14&&strncmp(p, errMsg, 14) == 0) return errMsg;
break;
case 'h':
errMsg = "has moved";
if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
break;
case 'n':
errMsg = "no referrer";
if(r>=11&&strncmp(p, errMsg,11) == 0) return errMsg;
break;
case 'o':
errMsg = "odbc error code = ";
if(r>=18&&strncmp(p, errMsg,18) == 0) return errMsg;
errMsg = "object not found";
if(r>=16&&strncmp(p, errMsg,16) == 0) return errMsg;
break;
case 'p':
errMsg = "page not found";
if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg;
break;
case 's':
errMsg = "system error";
if(r>=12&&strncmp(p, errMsg, 12) == 0) return errMsg;
break;
case 't':
errMsg = "the application encountered an "
"unexpected problem";
if(r>=49&&strncmp(p, errMsg, 49) == 0) return errMsg;
errMsg = "the page you requested has moved";
if(r>=32&&strncmp(p, errMsg, 32) == 0) return errMsg;
errMsg = "this page has moved";
if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg;
break;
case 'u':
errMsg = "unexpected problem has occurred";
if(r>=31&&strncmp(p, errMsg, 31) == 0) return errMsg;
errMsg = "unexpected error has occurred";
if(r>=29&&strncmp(p, errMsg, 29) == 0) return errMsg;
errMsg = "unexpected problem occurred";
if(r>=27&&strncmp(p, errMsg, 27) == 0) return errMsg;
errMsg ="unexpected error occurred";
if(r>=25&&strncmp(p, errMsg, 25) == 0) return errMsg;
errMsg ="unexpected result has occurred";
if(r>=30&&strncmp(p, errMsg, 30) == 0) return errMsg;
errMsg ="unhandled exception";
if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg;
break;
case 'y':
errMsg = "you have been blocked";
if(r>=21&&strncmp(p, errMsg, 21) == 0) return errMsg;
break;
}
//skip to the beginning of the next word
while(p < pend && !is_wspace_a(*p)) p++;
while(p < pend && is_wspace_a(*p)) p++;
}
return NULL;
}
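// Illustrative example: a title like "404 Error - Page Not Found" gets
// lowercased into the local buffer above, the scan hits the leading '4',
// and the candidate phrase "404 error" matches its first 9 bytes, so that
// phrase is returned and getIsErrorPage() then makes sure it does not also
// appear in the page's inlink text before flagging the page as an error.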
#include "Spider.h"
static SafeBuf *s_wbuf = NULL;
// . this is used by gbsort() above
// . sorts TermDebugInfos by prefix, then alphabetically by term
int cmptp (const void *v1, const void *v2) {
TermDebugInfo *t1 = *(TermDebugInfo **)v1;
TermDebugInfo *t2 = *(TermDebugInfo **)v2;
char *start = s_wbuf->getBufStart();
// prefix first
char *ps1 = start + t1->m_prefixOff;
char *ps2 = start + t2->m_prefixOff;
if ( t1->m_prefixOff < 0 ) ps1 = NULL;
if ( t2->m_prefixOff < 0 ) ps2 = NULL;
int32_t plen1 = 0; if ( ps1 ) plen1 = gbstrlen(ps1);
int32_t plen2 = 0; if ( ps2 ) plen2 = gbstrlen(ps2);
int32_t pmin = plen1;
if ( plen2 < pmin ) pmin = plen2;
int32_t pn = strncmp ( ps1 , ps2 , pmin );
if ( pn ) return pn;
if ( plen1 != plen2 ) return ( plen1 - plen2 );
// return if groups differ
int32_t len1 = t1->m_termLen;
int32_t len2 = t2->m_termLen;
int32_t min = len1;
if ( len2 < min ) min = len2;
char *s1 = start + t1->m_termOff;
char *s2 = start + t2->m_termOff;
int32_t n = strncasecmp ( s1 , s2 , min );
if ( n ) return n;
// . if length same, we are tied
// . otherwise, prefer the shorter
return ( len1 - len2 );
}
// . this is used by gbsort() above
// . sorts TermDebugInfos by their TermDebugInfo::m_wordPos member
int cmptp2 (const void *v1, const void *v2) {
TermDebugInfo *t1 = *(TermDebugInfo **)v1;
TermDebugInfo *t2 = *(TermDebugInfo **)v2;
// word position first
int32_t d = t1->m_wordPos - t2->m_wordPos;
if ( d ) return d;
// secondly drop back to hashgroup i guess
//d = t1->m_hashGroup - t2->m_hashGroup;
d = t1->m_synSrc - t2->m_synSrc;
if ( d ) return d;
// word len
d = t1->m_termLen - t2->m_termLen;
if ( d ) return d;
return 0;
}
bool printLangBits ( SafeBuf *sb , TermDebugInfo *tp ) {
char printed = false;
if ( tp->m_synSrc ) {
sb->safePrintf("&nbsp;");
printed = true;
}
int32_t j = 0;
if ( printed ) j = MAX_LANGUAGES;
for ( ; j < MAX_LANGUAGES ; j++ ) {
int64_t mask = 1LL << j;
//if ( j == tp->m_langId )
// sb->safePrintf("[%s]",
// getLangAbbr(tp->m_langId));
if ( ! (tp->m_langBitVec64 & mask) ) continue;
char langId = j+1;
// match in langvec? that means even if the
// word is in multiple languages we put it in
// this language because we interesect its lang bit
// vec with its neighbors in the sliding window
// algo in setLangVector.
if ( langId == tp->m_langId )
sb->safePrintf("<b>");
sb->safePrintf("%s ", getLangAbbr(langId) );
if ( langId == tp->m_langId )
sb->safePrintf("</b>");
printed = true;
}
if ( ! printed ) {
sb->safePrintf("??");
}
return true;
}
bool XmlDoc::printDoc ( SafeBuf *sb ) {
if ( ! sb ) return true;
Url *u = getFirstUrl();
// hash the url into 64 bits
int64_t uh64 = hash64(u->getUrl(),u->getUrlLen());
// shortcut
char *fu = ptr_firstUrl;
char *allowed = "???";
if ( m_isAllowedValid && m_isAllowed ) allowed = "yes";
else if ( m_isAllowedValid ) allowed = "no";
int32_t ufn = -1;
if ( m_urlFilterNumValid ) ufn = m_urlFilterNum;
time_t spideredTime = getSpideredTime();
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
sb->safePrintf ("<meta http-equiv=\"Content-Type\" "
"content=\"text/html; charset=utf-8\">"
"<table cellpadding=3 border=0>\n"
"<tr>"
"<td width=\"25%%\">docId</td>"
"<td><a href=/get?c=%s&d=%"UINT64">%"UINT64"</a></td>"
"</tr>\n"
"<tr>"
"<td width=\"25%%\">uh48</td>"
"<td>%"UINT64"</td>"
"</tr>\n"
"<tr>"
"<td width=\"25%%\">uh64</td>"
"<td>%"UINT64"</td>"
"</tr>\n"
"<tr>"
"<td>index error code</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>http status</td>"
"<td>%i</td>"
"</tr>\n"
"<tr>"
"<td>url filter num</td>"
"<td>%"INT32"</td>"
"</tr>\n"
"<tr>"
"<td>other - errno</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>robots.txt allows</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>metalist size</td>"
"<td>%"INT32"</td>"
"</tr>\n"
"<tr>"
"<td>url</td>"
"<td><a href=\"%s\">%s</a></td>"
"</tr>\n"
,
cr->m_coll,
m_docId ,
m_docId ,
getFirstUrlHash48(), // uh48
getFirstUrlHash64(), // uh64
mstrerror(m_indexCode),
m_httpStatus,
ufn,
mstrerror(g_errno),
allowed,
m_metaListSize,
fu,
fu
);
if ( ptr_redirUrl )
sb->safePrintf(
"<tr>"
"<td>redir url</td>"
"<td><a href=\"%s\">%s</a></td>"
"</tr>\n"
,ptr_redirUrl
,ptr_redirUrl
);
else
sb->safePrintf(
"<tr>"
"<td>redir url</td>"
"<td>--</td>"
"</tr>\n"
);
sb->safePrintf("<tr><td>hostHash64</td><td>0x%"XINT64"</td></tr>",
(uint64_t)getHostHash32a());
sb->safePrintf("<tr><td>site</td><td>");
sb->safeMemcpy(ptr_site,size_site-1);
sb->safePrintf("</td></tr>\n");
if ( m_siteHash32Valid )
sb->safePrintf("<tr><td>siteHash32</td><td>0x%"XINT32"</td></tr>\n",
m_siteHash32);
if ( m_domHash32Valid )
sb->safePrintf("<tr><td>domainHash32</td><td>0x%"XINT32"</td></tr>\n",
m_domHash32);
sb->safePrintf ( "<tr>"
"<td>domainHash8</td>"
"<td>0x%"XINT32"</td>"
"</tr>\n"
,
(int32_t)g_titledb.getDomHash8FromDocId(m_docId)
);
sb->safePrintf(
"<tr>"
"<td>coll</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>spidered date</td>"
"<td>%s UTC</td>"
"</tr>\n"
,
cr->m_coll,
asctime(gmtime ( &spideredTime ))
);
/*
char *ms = "-1";
if ( m_minPubDate != -1 ) ms = asctime(gmtime ( &m_minPubDate ));
sb->safePrintf (
"<tr>"
"<td>min pub date</td>"
"<td>%s UTC</td>"
"</tr>\n" , ms );
ms = "-1";
if ( m_maxPubDate != -1 ) ms = asctime(gmtime ( &m_maxPubDate ));
sb->safePrintf (
"<tr>"
"<td>max pub date</td>"
"<td>%s UTC</td>"
"</tr>\n" , ms );
*/
// our html template fingerprint
sb->safePrintf ("<tr><td>tag pair hash 32</td><td>");
if ( m_tagPairHash32Valid )sb->safePrintf("%"UINT32"",
(uint32_t)m_tagPairHash32);
else sb->safePrintf("invalid");
sb->safePrintf("</td></tr>\n" );
// print list we added to delete stuff
if ( m_indexCode && m_oldDocValid && m_oldDoc ) {
// skip debug printing for now...
//return true;
sb->safePrintf("</table><br>\n");
sb->safePrintf("<h2>Delete Meta List</h2>");
printMetaList ( m_metaList , m_metaList + m_metaListSize ,sb);
}
if ( m_indexCode || g_errno ) {
printMetaList ( m_metaList , m_metaList + m_metaListSize, sb );
}
if ( m_indexCode ) return true;
if ( g_errno ) return true;
// sanity check
//if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; }
/*
sb->safePrintf("<tr><td>next spider date</td>"
"<td>%s UTC</td></tr>\n"
"<tr><td>next spider priority</td>"
"<td>%"INT32"</td></tr>\n" ,
asctime(gmtime( &m_nextSpiderTime )) ,
(int32_t)m_nextSpiderPriority );
*/
// must always start with http i guess!
if ( strncmp ( fu , "http" , 4 ) ) { char *xx=NULL;*xx=0; }
// show the host that should spider it
//int32_t domLen ; char *dom = getDomFast ( fu , &domLen , true );
//int32_t hostId;
if ( m_sreqValid ) {
// must not block
SpiderRequest *oldsr = &m_sreq;
uint32_t shard = g_hostdb.getShardNum(RDB_SPIDERDB,oldsr);
sb->safePrintf ("<tr><td><b>assigned spider shard</b>"
"</td>\n"
"<td><b>%"UINT32"</b></td></tr>\n",shard);
}
time_t ts = m_firstIndexedDate;
sb->safePrintf("<tr><td>first indexed date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime(gmtime(&ts )) );
ts = m_outlinksAddedDate;
sb->safePrintf("<tr><td>outlinks last added date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime(gmtime(&ts )) );
// hop count
sb->safePrintf("<tr><td>hop count</td><td>%"INT32"</td></tr>\n",
(int32_t)m_hopCount);
// thumbnails
ThumbnailArray *ta = (ThumbnailArray *) ptr_imageData;
if ( ta ) {
int32_t nt = ta->getNumThumbnails();
sb->safePrintf("<tr><td># thumbnails</td>"
"<td>%"INT32"</td></tr>\n",nt);
for ( int32_t i = 0 ; i < nt ; i++ ) {
ThumbnailInfo *ti = ta->getThumbnailInfo(i);
sb->safePrintf("<tr><td>thumb #%"INT32"</td>"
"<td>%s (%"INT32"x%"INT32",%"INT32"x%"INT32") "
, i
, ti->getUrl()
, ti->m_origDX
, ti->m_origDY
, ti->m_dx
, ti->m_dy
);
ti->printThumbnailInHtml ( sb , 100,100,true,NULL) ;
// end the row for this thumbnail
sb->safePrintf("</td></tr>\n");
}
}
char *ddd;
time_t datedbDate = (time_t)m_pubDate;
if ( datedbDate != -1 ) ddd = asctime ( gmtime(&datedbDate ));
else ddd = "---";
char strLanguage[128];
languageToString(m_langId, strLanguage);
// print tags
//if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; }
SafeBuf tb;
TagRec *ogr = NULL;
if ( m_tagRecValid ) ogr = &m_tagRec;
if ( ogr ) ogr->printToBufAsHtml ( &tb , "old tag" );
SafeBuf *ntb = NULL;
if ( m_newTagBufValid ) ntb = getNewTagBuf();
if ( ntb ) {
// this is just a sequence of tags like an rdblist
char *pt = ntb->getBufStart();
char *ptend = pt + ntb->length();
for ( ; pt < ptend ; ) {
// skip rdbid
pt++;
// cast it
Tag *tag = (Tag *)pt;
// skip it
pt += tag->getRecSize();
// print tag out
tag->printToBufAsHtml ( &tb, "new tag");
}
}
// prevent (null) from being displayed
tb.pushChar('\0');
//Tag *tag1 = gr->getTag ("sitenuminlinks");
//Tag *tag2 = gr->getTag ("sitepop");
//int32_t sni = 0;
//int32_t spop = 0;
//if ( tag1 ) sni = atol(tag1->m_data);
//if ( tag2 ) spop = atol(tag2->m_data);
int32_t sni = m_siteNumInlinks;
//int32_t spop = m_sitePop;
LinkInfo *info1 = ptr_linkInfo1;
//LinkInfo *info2 = ptr_linkInfo2;
//int32_t sni ;
//int32_t extrapolated = 0;
//if ( info1 ) extrapolated = info1->m_numInlinksExtrapolated;
//if ( info1 ) sni = info1->m_siteNumInlinks;
char *ipString = iptoa(m_ip);
char *estimated = "";
if ( datedbDate & 0x01 ) // tr->datedbDateIsEstimated() )
estimated = "<nobr><b>[estimated from bisection]</b></nobr>";
//char *ls = getIsLinkSpam();
Links *links = getLinks();
// sanity check. should NEVER block!
if ( links == (void *)-1 ) { char *xx=NULL;*xx=0; }
// this is all to get "note"
//char *note = NULL;
// make it a URL
Url uu; uu.set ( ptr_firstUrl , false );
// sanity check
Xml *xml = getXml();
// sanity check
if ( xml == (void *)-1 ) { char *xx=NULL;*xx=0; }
sb->safePrintf (
"<tr><td>datedb date</td><td>%s UTC (%"UINT32")%s"
"</td></tr>\n"
"<tr><td>compressed size</td><td>%"INT32" bytes</td></tr>\n"
"<tr><td>original charset</td><td>%s</td></tr>\n"
//"<tr><td>site num inlinks</td><td><b>%"INT32"%</b></td></tr>\n"
//"<tr><td>total extrapolated linkers</td><td>%"INT32"</td></tr>\n"
"<tr><td><b>title rec version</b></td><td><b>%"INT32"</b>"
"</td></tr>\n"
"<tr><td>adult bit</td><td>%"INT32"</td></tr>\n"
//"<tr><td>is link spam?</td><td>%"INT32" <b>%s</b></td></tr>\n"
"<tr><td>is permalink?</td><td>%"INT32"</td></tr>\n"
"<tr><td>is RSS feed?</td><td>%"INT32"</td></tr>\n"
//"<tr><td>index article only?</td><td>%"INT32"</td></tr>\n"
"%s\n"
"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
"%s</td></tr>\n"
"<tr><td>content len</td><td>%"INT32" bytes</td></tr>\n"
"<tr><td>content truncated</td><td>%"INT32"</td></tr>\n"
"<tr><td>content type</td><td>%"INT32" (%s)</td></tr>\n"
"<tr><td>language</td><td>%"INT32" (%s)</td></tr>\n"
"<tr><td>country</td><td>%"INT32" (%s)</td></tr>\n"
"<tr><td>time axis used</td><td>%"INT32"</td></tr>\n"
"<tr><td>metadata</td><td>%s</td></tr>\n"
"</td></tr>\n",
ddd ,
(uint32_t)datedbDate ,
estimated ,
m_oldTitleRecSize,
get_charset_str(m_charset),
//sni ,
//ptr_linkInfo1->m_numInlinksExtrapolated,
(int32_t)m_version ,
(int32_t)m_isAdult,
//(int32_t)m_isLinkSpam,
//m_note,
(int32_t)m_isPermalink,
(int32_t)m_isRSS,
//(int32_t)m_eliminateMenus,
// tag rec
tb.getBufStart(),
ipString,
cr->m_coll,
ipString,
size_utf8Content - 1,
(int32_t)m_isContentTruncated,
(int32_t)m_contentType,
g_contentTypeStrings[(int)m_contentType] ,
(int32_t)m_langId,
strLanguage,
(int32_t)m_countryId,
g_countryCode.getName(m_countryId),
m_useTimeAxis,
ptr_metadata);
/*
int32_t boost1 = getBoostFromSiteNumInlinks ( sni );
sb->safePrintf (
"<tr><td><b>title weight</b></td>"
"<td><b>%"UINT32"%%</b></td></tr>\n"
"<tr><td>header weight</td>"
"<td>%"UINT32"%%</td></tr>\n"
"<tr><td>url path weight</td>"
"<td>%"UINT32"%%</td></tr>\n"
"<tr><td>external link text weight</td>"
"<td>%"UINT32"%%</td></tr>\n"
"<tr><td>internal link text weight</td>"
"<td>%"UINT32"%%</td></tr>\n"
"<tr><td>concept weight</td>"
"<td>%"UINT32"%%</td></tr>\n"
"<tr><td>score boost from site num inlinks</td>"
"<td>%"INT32"%%</td>"
"</tr>\n",
(int32_t)m_titleWeight,
(int32_t)m_headerWeight,
(int32_t)m_urlPathWeight,
(int32_t)m_externalLinkTextWeight,
(int32_t)m_internalLinkTextWeight,
(int32_t)m_conceptWeight ,
boost1 );
*/
// print title
//sb->safePrintf( "<tr><td>title</td><td>%s</td></tr>\n" ,
// ti->m_title );
// print the new, unstored, gigabit vector
if ( size_gigabitHashes ) {
// get gigabit vector
int32_t *vec = ptr_gigabitHashes;
// point to scores
int32_t *ss = ptr_gigabitScores;
int32_t count = 0;
int32_t total = 0;
sb->safePrintf ( "<tr><td>stored gigabit vector</td><td>");
while ( *vec ) {
sb->safePrintf ( "%08"XINT32" ", *vec );
sb->safePrintf ( "(%05"INT32") ", *ss );
vec++;
ss++;
count++;
total++;
//if ( total >= GIGABITS_IN_VECTOR ) break;
if ( count < 4 ) continue;
count = 0;
sb->safePrintf ( "<br>\n");
}
sb->safePrintf ( "</tr>\n");
}
// print dmoz stuff
int32_t numCatIds = size_catIds/4;
int32_t numIndCatIds = size_indCatIds/4;
sb->safePrintf( "<tr><td>Number of Category IDs</td>"
"<td>%"INT32"</td></tr>\n", numCatIds );
char *dtp = ptr_dmozTitles;
char *dsp = ptr_dmozSumms;
char *dap = ptr_dmozAnchors;
for (int32_t i = 0; i < numCatIds; i++) {
// print the ID
sb->safePrintf( "<tr><td>ID #%"INT32"</td><td>%"INT32"</td></tr>\n",
i, ptr_catIds[i]);
// print the title
if ( dtp ) {
sb->safePrintf( "<tr><td>Title #%"INT32" </td><td>",i);
sb->safeMemcpy( dtp,gbstrlen(dtp) );
sb->safePrintf( "</td></tr>\n");
dtp += gbstrlen(dtp) + 1;
}
// print the summary
if ( dsp ) {
sb->safePrintf( "<tr><td>Summary #%"INT32"</td><td>", i);
sb->safeMemcpy( dsp , gbstrlen(dsp ) ) ;
sb->safePrintf( "</td></tr>\n");
dsp += gbstrlen ( dsp ) + 1;
}
// print the anchor
if ( dap ) {
sb->safePrintf( "<tr><td>Anchor #%"INT32"</td><td>",i);
sb->safeMemcpy( dap , gbstrlen(dap) );
sb->safePrintf( "</td></tr>\n");
dap += gbstrlen ( dap ) + 1;
}
}
sb->safePrintf( "<tr><td>Number of Indirect Category IDs</td>"
"<td>%"INT32"</td></tr>\n", numIndCatIds);
for (int32_t i = 0; i < numIndCatIds; i++) {
// print the ID
sb->safePrintf( "<tr><td>Indirect ID #%"INT32"</td>"
"<td>%"INT32"</td></tr>\n",
i, ptr_indCatIds[i]);
}
if ( info1 ) {
//sb->safePrintf("<tr><td>page pop</td><td>%"INT32"</td></tr>\n",
// info1->m_pagePop );
//sb->safePrintf("<tr><td>whole site pop</td>"
// "<td>%"INT32"</td></tr>\n",
// spop );
sb->safePrintf("<tr><td>num GOOD links to whole site</td>"
"<td>%"INT32"</td></tr>\n",
sni );
}
// close the table
sb->safePrintf ( "</table></center><br>\n" );
//
// convert document into json representing multiple documents
// if it makes sense. sometimes a single url contains multiple
// subdocuments that each should have their own url, but do not,
// so we fix that here.
//
SafeBuf *dbr = getDiffbotReply();
if ( dbr->length() ) {
sb->safePrintf("<b>START EXACT DIFFBOT REPLY</b><br>\n");
sb->safePrintf("<pre>");
sb->safeMemcpy ( dbr );
sb->safePrintf("</pre>");
sb->safePrintf("<b>END EXACT DIFFBOT REPLY</b><br><br>\n");
}
// print outlinks
links->print( sb );
//
// PRINT ADDRESSES (prints streets first)
//
Addresses *aa = getAddresses ();
if ( ! aa || aa == (Addresses *)-1 ) { char *xx=NULL;*xx=0;}
aa->print(sb,uh64);
//
// PRINT PUB DATE CANDIDATES
//
// print stored pub date candidates which we indexed as clock
// or not clock!
Dates *dp = getDates() ;
// should never block!
if ( dp == (void *)-1 ) { char *xx=NULL;*xx=0; }
// print it out
if ( dp ) dp->printDates ( sb );
//return true;
//
// PRINT SECTIONS
//
Sections *sections = getSections();
if ( ! sections ||sections==(Sections *)-1) {char*xx=NULL;*xx=0;}
//SectionVotingTable *nsvt = getNewSectionVotingTable();
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
//SectionVotingTable *osvt = getOldSectionVotingTable();
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
// these are nice
//HashTableX *pt = dp->getPhoneTable();
//HashTableX *et = dp->getEmailTable();
//HashTableX *at = aa->getPlaceTable();
//HashTableX *tt = dp->getTODTable();
//HashTableX *rt = ev->getRegistrationTable();
//HashTableX *priceTable = dp->getPriceTable();
//sections->print ( sb , pt , et , NULL , at , tt , priceTable );
// try the new print function
//sections->print2 ( sb , NULL, NULL , NULL , false );
printRainbowSections ( sb , NULL );
//nsvt->print ( sb , "NEW Sections Voting Table" );
//osvt->print ( sb , "OLD Sections Voting Table" );
//
// PRINT LINKINFO
//
//if ( info1 )
// info1->print ( sb , cr->m_coll );
//if ( info2 ) {
// sb->safePrintf ( "<tr><td><b>IMPORTED LINK INFO:"
// "</b></td></tr>" );
// info2->print ( sb , cr->m_coll );
//}
// cut it short for debugging
logf(LOG_DEBUG,"xmldoc: FIX ME remove return");
//return true;
//
// PRINT LINKINFO
//
char *p = m_pageLinkBuf.getBufStart();
int32_t plen = m_pageLinkBuf.length();
sb->safeMemcpy ( p , plen );
//
// PRINT SITE LINKINFO
//
p = m_siteLinkBuf.getBufStart();
plen = m_siteLinkBuf.length();
sb->safeMemcpy ( p , plen );
//
// BEGIN PRINT GIGABITS
//
// print out for PageParser.cpp
const char *help =
"The <i>Gigabits</i> are words extracted from the document "
"that are deemed to best represent it. The <i>Pop</i> column "
"is the popularity of the word and it ranges from 0 to 1000 "
"and is how many documents out of a sample of 1000 that "
"contained that word. The <i>Score</i> of each Gigabit is "
"based on the popularity and how many times the word appeared "
"in the document. Higher scores are deemed more "
"representative of the document. The hashes of these Gigabits "
"are stored with the cached copy of the document as numeric "
"hashes for purposes of topic clustering. You can see these "
"hashes by clicking on the <i>[info]</i> link next to "
"any search result.<br><br>";
if ( m_numTop > 0 )
sb->safePrintf( "<table width=100%%>"
"<td bgcolor=pink>\n"
"%s"
"<table>"
"<tr><td>#</td><td>"
"<b>%"INT32" Gigabits</b></td><td><b>Score</b>"
"</td>"
"<td><b>Pop</b></td>"
"<td><b>Hash</b></td>"
"</tr>\n",
help,m_numTop);
// . print out the top gigabits we harvested
// . start with the highest scoring node first, the last node since
// nodes are ranked by lowest to highest key
int32_t total = 0;
for ( int32_t i = 0 ; i < m_numTop ; i++ ) {
// get the info
GigabitInfo *gi = m_top[i];
// print row
sb->safePrintf("<tr><td>%"INT32"</td><td>",i);
// print gigabit
sb->safeMemcpy(gi->m_ptr , gi->m_len );
// get 32 bit hash
uint32_t h = gi->m_hash & 0xffffffff;
// never allow 0
if ( h == 0 ) h = 1;
// if unicode, pop's hi bit is set
sb->safePrintf( "</td>"
"<td>%"INT32"</td>"
"<td>%"INT32"</td>"
"<td>%08"XINT32"</td>"
"</tr>\n",
(int32_t)gi->m_pts,
(int32_t)gi->m_pop,
(int32_t)h );
// add up all scores
total += gi->m_pts;
}
// close table
if ( m_numTop > 0 ) {
sb->safePrintf("<tr><td></td><td></td><td>"
"<b>%"INT32"</b></td></tr>\n",total);
sb->safePrintf("</table>\n");
}
//
// END PRINT GIGABITS
//
// note this
sb->safePrintf("<h2>NEW Meta List</h2>");
printMetaList ( m_metaList , m_metaList + m_metaListSize , sb );
// all done if no term table to print out
if ( ! m_wts ) return true;
// print out the rules in Weights.cpp
/*
sb->safePrintf ("<br>"
"<table border=1 cellpadding=0>"
"<tr><td>Rule #3</td>"
"<td>First 40 words in ()'s.</td></tr>\n"
"<tr><td>Rule #4</td>"
"<td>Adjacent to bad punct.</td></tr>\n"
"<tr><td>Rule #5</td>"
"<td>In a link.</td></tr>\n"
"<tr><td>Rule #6</td>"
"<td>First occurence in a section. Actual weight "
"depends on section word count.</td></tr>\n"
"<tr><td>Rule #7</td>"
"<td>In a header tag. h1 is most weight.</td></tr>\n"
"<tr><td>Rule #8</td>"
"<td>In a \"ul\" list.</td></tr>\n"
"<tr><td>Rule #9</td>"
"<td>Repeated occurence in the same fragment or "
"sentence.</td></tr>\n"
"<tr><td>Rule #10</td>"
"<td>In a comma-separated list.</td></tr>\n"
"<tr><td>Rule #11</td>"
"<td>Promoted isolated capitalized words, demote "
"if it is in a capitalized phrase.</td></tr>\n"
"<tr><td>Rule #13</td>"
"<td>First occurence in document.</td></tr>\n"
"<tr><td>Rule #15</td>"
"<td>Word to phrase ratio weight.</td></tr>\n"
"<tr><td>Rule #16</td>"
"<td>At the beginning of a fragment or sentence."
"</td></tr>\n"
"<tr><td>Rule #17</td>"
"<td>If immediately after a quote, iff not "
"promoted by Rule #18.</td></tr>\n"
"<tr><td>Rule #18</td>"
"<td>Promote phrase if capitalized. Demote phrase "
"if mixed case without hypehn.</td></tr>\n"
"<tr><td>Rule #22</td>"
"<td>Demote phrases containing bad punct.</td></tr>\n"
"<tr><td>Rule #23</td>"
"<td>In script, style, select or marquee tag. "
"</td></tr>\n"
"<tr><td>Rule #23</td>"
"<td>Follows a number.</td></tr>\n"
"<tr><td>Rule #25</td>"
"<td>Demote non-hyphenated phrases that would split "
"adjacent hyphenated phrases.</td></tr>\n"
"<tr><td>Rule #26</td>"
"<td>Demote if in a repeated fragment.</td></tr>\n"
"<tr><td>Rule #27</td>"
"<td>Demote if in a menu section.</td></tr>\n"
"<tr><td>Rule #28</td>"
"<td>Pattern spam detector.</td></tr>\n"
"</table>\n"
"<br>"
);
*/
//
// BEGIN PRINT HASHES TERMS
//
// shortcut
HashTableX *wt = m_wts;
// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
TermDebugInfo **tp = NULL;
// add them with this counter
int32_t nt = 0;
int32_t nwt = 0;
if ( wt ) {
nwt = wt->m_numSlots;
tp = (TermDebugInfo **)wt->m_keys;
}
// now print the table we stored all we hashed into
for ( int32_t i = 0 ; i < nwt ; i++ ) {
// skip if empty
if ( wt->m_flags[i] == 0 ) continue;
// breathe
//QUICKPOLL(m_niceness);
// get its key, date=32bits termid=64bits
//key96_t *k = (key96_t *)wt->getKey ( i );
// get the TermDebugInfo
TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
// point to it for sorting
tp[nt++] = ti;
}
// set this for cmptp
s_wbuf = &m_wbuf;
// sort them alphabetically by Term
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp , m_niceness );
// determine how many non 1.0 weight fields we got in the vectors
/*
int32_t count [ MAX_RULES ];
memset ( count , 0 , MAX_RULES * 4 );
for ( int32_t i = 0 ; i < nt ; i++ ) {
TermDebugInfo *ti = tp[i];
for ( int32_t j = 0 ; j < MAX_RULES ; j++ )
if ( ti->m_rv[j] != 1.0 ) count[j]++;
}
// count the counts
char fbuf[9024];
char *fp = fbuf;
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) {
if ( ! count[j] ) continue;
fp += sprintf(fp ,"<td><b>R#%"INT32"</b></td>",j);
}
*/
// print them out in a table
char hdr[1000];
sprintf(hdr,
"<table border=1 cellpadding=0>"
"<tr>"
// this messes up Test.cpp diff'ing
//"<td><b>#</b></td>"
"<td><b>Prefix</b></td>"
"<td><b>WordNum</b></td>"
"<td><b>Lang</b></td>"
"<td><b>Term</b></td>"
//"%s"
//"<td><b>Weight</b></td>"
//"<td><b>Spam</b></td>"
"<td><b>Desc</b></td>"
"<td><b>TermId/TermHash48</b></td>"
"<td><b>ShardByTermId?</b></td>"
"<td><b>Note</b></td>"
"</tr>\n"
//,fbuf
);
sb->safePrintf("%s",hdr);
char *start = m_wbuf.getBufStart();
int32_t rcount = 0;
for ( int32_t i = 0 ; i < nt ; i++ ) {
// start a new table every TABLE_ROWS rows so one huge table does not bog down the browser
if ( (++rcount % TABLE_ROWS) == 0 )
sb->safePrintf("<!--ignore--></table>%s",hdr);
char *prefix = "&nbsp;";
if ( tp[i]->m_prefixOff >= 0 )
prefix = start + tp[i]->m_prefixOff;
bool isFacet = false;
if ( prefix &&
prefix[0]=='g' &&
strncmp(prefix,"gbfacet",7)== 0 )
isFacet = true;
sb->safePrintf ( "<tr>"
//"<td><b>%"INT32"</b></td>"
"<td>%s</td>"
//i ,
, prefix
);
if ( isFacet )
sb->safePrintf("<td>--</td>");
else
sb->safePrintf( "<td>%"INT32"</td>"
, tp[i]->m_wordNum );
// print lang
//char langId = tp[i]->m_langId;
// print out all langs word is in if it's not clear
// what language it is. we use a sliding window to
// resolve some ambiguity, but not all, so print out
// the possible langs here
sb->safePrintf("<td>");
if ( isFacet )
sb->safePrintf("--");
else
printLangBits ( sb , tp[i] );
sb->safePrintf("</td>");
// print the term
sb->safePrintf("<td><nobr>");
if ( tp[i]->m_synSrc )
sb->pushChar('*');
char *term = start + tp[i]->m_termOff;
int32_t termLen = tp[i]->m_termLen;
sb->safeMemcpy ( term , termLen );
/*
char *dateStr = "&nbsp;";
int32_t ddd = tp[i]->m_date;
uint8_t *tddd = (uint8_t *)&ddd;
char tbbb[32];
if ( ddd && tddd[2] == 0 && tddd[3] == 0 &&
tddd[0] && tddd[1] && tddd[1] <= tddd[0] ) {
sprintf(tbbb,"evIds %"INT32"-%"INT32"",
(int32_t)tddd[1],(int32_t)tddd[0]);
dateStr = tbbb;
}
else if ( ddd )
dateStr = asctime ( gmtime(&ddd ));
*/
//char ss[30];
//if ( tp[i]->m_spam == -1.0 ) sprintf(ss,"&nbsp;");
//else if ( tp[i]->m_spam == 0.0 ) sprintf(ss,"--");
//else sprintf ( ss , "%.03f",1.0-tp[i]->m_spam);
sb->safePrintf ( "</nobr></td>"
);
// print the weight vector before Weight and Spam
/*
float prod = 1.0;
for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) {
if ( ! count[j] ) continue;
if ( tp[i]->m_isSynonym )
sb->safePrintf("<td>&nbsp;</td>" );
else if ( tp[i]->m_rv[j] == 1.0 )
sb->safePrintf("<td>&nbsp;</td>" );
else sb->safePrintf("<td>%.02f</td>",tp[i]->m_rv[j] );
// product up
prod *= tp[i]->m_rv[j];
}
// sanity check
// maybe look into this at some point, but not a big deal!!
//float err = prod - tp[i]->m_weight;
//if ( err > .05 )
// logf(LOG_DEBUG,"weights: prod was %.02f should be "
// "%.02f",prod,tp[i]->m_weight);
*/
//char *desc = "&nbsp;";
//if ( tp[i]->m_descOff >= 0 )
// desc = start + tp[i]->m_descOff;
/*
// synonyms are always 1/4 weight of original
if ( tp[i]->m_isSynonym )
sb->safePrintf("<td>&nbsp;</td>" );
else
sb->safePrintf("<td>%.03f</td>", tp[i]->m_weight );
*/
sb->safePrintf ( //"<td>%s</td>"
//"<td><b>%"UINT32"</b></td>"
//"<td><nobr>%s</nobr></td>"
"<td><nobr>%s",
getHashGroupString(tp[i]->m_hashGroup)
);
//if ( tp[i]->m_synSrc ) {
// char ss = tp[i]->m_synSrc;
// sb->safePrintf(" - %s",g_synonyms.getSourceString(ss));
//}
sb->safePrintf ( "</nobr></td>" );
sb->safePrintf ( "<td>%016"UINT64"</td>"
,
//ss ,
//(uint32_t)tp[i]->m_score32 ,
//dateStr ,
//desc, // start + tp[i]->m_descOff ,
(uint64_t)(tp[i]->m_termId & TERMID_MASK) );
if ( tp[i]->m_shardByTermId ) sb->safePrintf("<td><b>1</b></td>" );
else sb->safePrintf("<td>0</td>" );
sb->safePrintf("<td>");
// there is no prefix for such terms now
// TODO: store actual key in there i guess?? or just this bit.
int32_t val32 = 0;
if ( strncmp(prefix,"gbfacet",7) == 0 )
val32 = g_posdb.getInt(&tp[i]->m_key);
// . this is like gbxpathsitehash1234567
// . the number following it is the hash
// . the value stored in the posdb key is the hash of the
// inner html content of that xpath/site for this page
if ( strncmp(term,"facetField=gbxpathsitehash",26)==0)
sb->safePrintf("<b>Term</b> is a 32-bit hash of the "
"X-path of "
"a section XOR'ed with the 32-bit "
"hash of this document's subdomain. "
"[%"UINT32"] is the 32-bit hash of the "
"Inner HTML of this section stored "
"in the posdb key instead of "
"the usual stuff. This is also "
"sharded by termId!",
(uint32_t)val32
//(int32_t)tp[i]->m_sentHash32
);
sb->safePrintf("</td>");
sb->safePrintf("</tr>\n");
}
sb->safePrintf("</table><br>\n");
//
// END PRINT HASHES TERMS
//
return true;
}
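// . emits the utf-8 content-type meta tag for the doc-info pages
// . the per-page navigation links are currently commented out below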
bool XmlDoc::printMenu ( SafeBuf *sb ) {
// encode it
SafeBuf ue;
ue.urlEncode ( ptr_firstUrl );
// get
sb->safePrintf ("<meta http-equiv=\"Content-Type\" "
"content=\"text/html; charset=utf-8\">" );
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
/*
char *coll = cr->m_coll;
int64_t d = m_docId;
// print links at top
sb->safePrintf(
//"<a href=/print?c=%s&u=%s&page=1>general info</a> | "
//"<a href=/print?c=%s&u=%s&page=2>page inlinks</a> | "
//"<a href=/print?c=%s&u=%s&page=3>site inlinks</a> | "
//"<a href=/print?c=%s&u=%s&page=4>sections</a> | "
//"<a href=/print?c=%s&u=%s&page=5>indexed terms</a> | "
// the breakdown of when it was spidered and when it
// is due to be spidered again. and any errors
// encountered when spidering
//"<a href=/print?c=%s&u=%s&page=6>spider stats</a> | "
//"<a href=/print?c=%s&u=%s&page=7>cached page</a>"
"<a href=/print?c=%s&d=%"INT64"&page=1>general info</a> | "
"<a href=/print?c=%s&d=%"INT64"&page=2&recompute=1>"
"page inlinks</a> | "
"<a href=/print?c=%s&d=%"INT64"&page=3>site inlinks</a> | "
//"<a href=/print?c=%s&d=%"INT64"&page=4>sections</a> | "
"<a href=/print?c=%s&d=%"INT64"&page=5>indexed terms</a>"
// the breakdown of when it was spidered and when it
// is due to be spidered again. and any errors
// encountered when spidering
//"<a href=/print?c=%s&d=%"INT64"&page=6>spider stats</a> |"
//" <a href=/print?c=%s&d=%"INT64"&page=7>cached page</a>"
"<br>"
"<br>"
,coll,d//ue.getBufStart()
,coll,d//ue.getBufStart()
,coll,d//ue.getBufStart()
//,coll,d//ue.getBufStart()
,coll,d//ue.getBufStart()
//,coll,d//ue.getBufStart()
//,coll,d//ue.getBufStart()
);
*/
return true;
}
// if printDocForProCog, an entry function, blocks, we gotta re-call it
static void printDocForProCogWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// make sure has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "in print doc for pro cog wrapper" );
// get it
bool status = THIS->printDocForProCog ( THIS->m_savedSb ,
THIS->m_savedHr );
// return if it blocked
if ( ! status ) return;
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
}
// in PageRoot.cpp
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ,
bool printGigablast );
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {
if ( ! sb ) return true;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
m_masterLoop = printDocForProCogWrapper;
m_masterState = this;
m_savedSb = sb;
m_savedHr = hr;
// if we are generating site or page inlinks info for a
// non docid based url, then store that info in the respective
// safe bufs
m_useSiteLinkBuf = true;
m_usePageLinkBuf = true;
int32_t page = hr->getLong("page",1);
// for some reason sections page blocks forever in browser
if ( page != 7 && ! m_printedMenu ) { // && page != 5 )
printFrontPageShell ( sb , "search" , cr , false );
m_printedMenu = true;
//printMenu ( sb );
}
if ( page == 1 )
return printGeneralInfo(sb,hr);
if ( page == 2 )
return printPageInlinks(sb,hr);
if ( page == 3 )
return printSiteInlinks(sb,hr);
if ( page == 4 )
return printRainbowSections(sb,hr);
if ( page == 5 )
return printTermList(sb,hr);
if ( page == 6 )
return printSpiderStats(sb,hr);
if ( page == 7 )
return printCachedPage(sb,hr);
return true;
}
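// . page 1 of the doc-info viewer: docid, host, index error code,
//   robots.txt status, index/spider dates, hop count, charset, language,
//   country and inlink counts
// . honors &xml=1 to emit the same info as an xml <response>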
bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {
// shortcut
char *fu = ptr_firstUrl;
// sanity check
Xml *xml = getXml();
// blocked?
if ( xml == (void *)-1 ) return false;
// error?
if ( ! xml ) return true;
char *ict = getIsContentTruncated();
if ( ! ict ) return true; if ( ict == (char *)-1 ) return false;
char *at = getIsAdult();
if ( ! at ) return true; if ( at == (void *)-1 ) return false;
char *ls = getIsLinkSpam();
if ( ! ls ) return true; if ( ls == (void *)-1 ) return false;
uint8_t *ct = getContentType();
if ( ! ct ) return true; if ( ct == (void *)-1 ) return false;
uint16_t *cs = getCharset ( );
if ( ! cs ) return true; if ( cs == (uint16_t *)-1 ) return false;
char *pl = getIsPermalink();
if ( ! pl ) return true; if ( pl == (char *)-1 ) return false;
char *isRSS = getIsRSS();
if ( ! isRSS ) return true; if ( isRSS == (char *)-1 ) return false;
int32_t *ip = getIp();
if ( ! ip ) return true; if ( ip == (int32_t *)-1 ) return false;
uint8_t *li = getLangId();
if ( ! li ) return true; if ( li == (uint8_t *)-1 ) return false;
uint16_t *cid = getCountryId();
if ( ! cid ) return true; if ( cid == (uint16_t *)-1 ) return false;
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 ) return true; if ( info1 == (void *)-1 ) return false;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
//char *ls = getIsLinkSpam();
//Links *links = getLinks();
// blocked?
//if ( links == (void *)-1 ) { char *xx=NULL;*xx=0;}//return false;
// error?
//if ( ! links ) return true;
// make it a URL
Url uu; uu.set ( fu , false );
char *allowed = "???";
int32_t allowedInt = 1;
if ( m_isAllowedValid && m_isAllowed ) {
allowed = "yes";
allowedInt = 1;
}
else if ( m_isAllowedValid ) {
allowed = "no";
allowedInt = 0;
}
int32_t ufn = -1;
if ( m_urlFilterNumValid ) ufn = m_urlFilterNum;
char *es = mstrerror(m_indexCode);
if ( ! m_indexCode ) es = mstrerror(g_errno);
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
//int32_t groupId = g_hostdb.getGroupIdFromDocId(m_docId);
//Host *group = g_hostdb.getGroup(groupId);
int32_t shardNum = getShardNumFromDocId ( m_docId );
Host *hosts = g_hostdb.getShard ( shardNum );
Host *h = &hosts[0];
if ( ! isXml )
sb->safePrintf (
"<table cellpadding=3 border=0>\n"
"<tr>"
"<td width=\"25%%\">docId</td>"
"<td><a href=/get?c=%s&d=%"UINT64">%"UINT64"</a></td>"
"</tr>\n"
"<tr>"
"<td width=\"25%%\">on host #</td>"
"<td>%"INT32"</td>"
"</tr>\n"
"<tr>"
"<td>index error code</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>robots.txt allows</td>"
"<td>%s</td>"
"</tr>\n"
"<tr>"
"<td>url</td>"
"<td><a href=\"%s\">%s</a></td>"
"</tr>\n"
,
cr->m_coll,
m_docId ,
m_docId ,
h->m_hostId,
es,
allowed,
fu,
fu
);
else
sb->safePrintf (
"<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
"\t<coll><![CDATA[%s]]></coll>\n"
"\t<docId>%"INT64"</docId>\n"
"\t<indexError><![CDATA[%s]]></indexError>\n"
"\t<robotsTxtAllows>%"INT32""
"</robotsTxtAllows>\n"
"\t<url><![CDATA[%s]]></url>\n"
,
cr->m_coll,
m_docId ,
es,
allowedInt,//(int32_t)m_isAllowed,
fu
);
char *redir = ptr_redirUrl;
if ( redir && ! isXml ) {
sb->safePrintf(
"<tr>"
"<td>redir url</td>"
"<td><a href=\"%s\">%s</a></td>"
"</tr>\n"
,redir
,redir );
}
else if ( redir ) {
sb->safePrintf("\t<redirectUrl><![CDATA[%s]]>"
"</redirectUrl>\n" ,redir );
}
if ( m_indexCode || g_errno ) {
if ( ! isXml ) sb->safePrintf("</table><br>\n");
else sb->safePrintf("</response>\n");
return true;
}
// must always start with http i guess!
if ( strncmp ( fu , "http" , 4 ) ) { char *xx=NULL;*xx=0; }
time_t ts = (time_t)m_firstIndexedDate;
if ( ! isXml )
sb->safePrintf("<tr><td>first indexed date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime(gmtime(&ts)) );
else
sb->safePrintf("\t<firstIndexedDateUTC>%"UINT32""
"</firstIndexedDateUTC>\n",
(uint32_t)m_firstIndexedDate );
ts = m_spideredTime;
if ( ! isXml )
sb->safePrintf("<tr><td>last indexed date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime(gmtime(&ts )) );
else
sb->safePrintf("\t<lastIndexedDateUTC>%"UINT32""
"</lastIndexedDateUTC>\n",
(uint32_t)m_spideredTime );
ts = m_outlinksAddedDate;
if ( ! isXml )
sb->safePrintf("<tr><td>outlinks last added date</td>"
"<td>%s UTC</td></tr>\n" ,
asctime(gmtime(&ts )) );
else
sb->safePrintf("\t<outlinksLastAddedUTC>%"UINT32""
"</outlinksLastAddedUTC>\n",
(uint32_t)m_outlinksAddedDate );
// hop count
if ( ! isXml )
sb->safePrintf("<tr><td>hop count</td><td>%"INT32"</td>"
"</tr>\n",
(int32_t)m_hopCount);
else
sb->safePrintf("\t<hopCount>%"INT32"</hopCount>\n",
(int32_t)m_hopCount);
char strLanguage[128];
languageToString(m_langId, strLanguage);
// print tags
//SafeBuf tb;
int32_t sni = m_siteNumInlinks;
char *ipString = iptoa(m_ip);
//int32_t sni = info1->getNumGoodInlinks();
time_t tlu = info1->getLastUpdated();
struct tm *timeStruct3 = gmtime ( &tlu );//info1->m_lastUpdated );
char tmp3[64];
strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );
if ( ! isXml )
sb->safePrintf (
"<tr><td>original charset</td><td>%s</td></tr>\n"
"<tr><td>adult bit</td><td>%"INT32"</td></tr>\n"
//"<tr><td>is link spam?</td><td>%"INT32" <b>%s</b></td></tr>\n"
"<tr><td>is permalink?</td><td>%"INT32"</td></tr>\n"
"<tr><td>is RSS feed?</td><td>%"INT32"</td></tr>\n"
"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
"%s</td></tr>\n"
"<tr><td>content len</td><td>%"INT32" bytes</td></tr>\n"
"<tr><td>content truncated</td><td>%"INT32"</td></tr>\n"
"<tr><td>content type</td><td>%s</td></tr>\n"
"<tr><td>language</td><td>%s</td></tr>\n"
"<tr><td>country</td><td>%s</td></tr>\n"
"<tr><td><b>good inlinks to site</b>"
"</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique IP inlinks to site"
// "</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique CBlock inlinks to site"
// "</td><td>%"INT32"</td></tr>\n"
"<tr><td><b>site rank</b></td><td>%"INT32"</td></tr>\n"
"<tr><td>good inlinks to page"
"</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique IP inlinks to page"
// "</td><td>%"INT32"</td></tr>\n"
// "<tr><td>unique CBlock inlinks to page"
// "</td><td>%"INT32"</td></tr>\n"
// "<tr><td>total inlinks to page"
// "</td><td>%"INT32"</td></tr>\n"
"<tr><td><nobr>page inlinks last computed</nobr></td>"
"<td>%s</td></tr>\n"
"</td></tr>\n",
get_charset_str(m_charset),
(int32_t)m_isAdult,
//(int32_t)m_isLinkSpam,
//m_note,
(int32_t)m_isPermalink,
(int32_t)m_isRSS,
ipString,
cr->m_coll,
ipString,
size_utf8Content - 1,
(int32_t)m_isContentTruncated,
g_contentTypeStrings[(int)m_contentType] ,
strLanguage,
g_countryCode.getName(m_countryId) ,
sni,
//m_siteNumInlinksUniqueIp,
//m_siteNumInlinksUniqueCBlock,
::getSiteRank(sni),
//info1->getNumTotalInlinks(),
info1->getNumGoodInlinks(),
// info1->m_numUniqueIps,
// info1->m_numUniqueCBlocks,
// info1->m_totalInlinkingDocIds,
tmp3
);
else {
sb->safePrintf (
"\t<charset><![CDATA[%s]]></charset>\n"
"\t<isAdult>%"INT32"</isAdult>\n"
"\t<isLinkSpam>%"INT32"</isLinkSpam>\n"
"\t<siteRank>%"INT32"</siteRank>\n"
"\t<numGoodSiteInlinks>%"INT32"</numGoodSiteInlinks>\n"
//"\t<numTotalSiteInlinks>%"INT32"</numTotalSiteInlinks>\n"
// "\t<numUniqueIpsLinkingToSite>%"INT32""
// "</numUniqueIpsLinkingToSite>\n"
// "\t<numUniqueCBlocksLinkingToSite>%"INT32""
// "</numUniqueCBlocksLinkingToSite>\n"
// how many inlinks, external and internal, we have
// to this page not filtered in any way!!!
//"\t<numTotalPageInlinks>%"INT32"</numTotalPageInlinks>\n"
// how many inlinking ips we got, including our own if
// we link to ourself
// "\t<numUniqueIpsLinkingToPage>%"INT32""
// "</numUniqueIpsLinkingToPage>\n"
// how many inlinking cblocks we got, including our own
// if we link to ourself
// "\t<numUniqueCBlocksLinkingToPage>%"INT32""
// "</numUniqueCBlocksLinkingToPage>\n"
"\t<numGoodPageInlinks>%"INT32"</numGoodPageInlinks>\n"
"\t<pageInlinksLastComputed>%"INT32""
"</pageInlinksLastComputed>\n"
,get_charset_str(m_charset)
,(int32_t)m_isAdult
,(int32_t)m_isLinkSpam
,::getSiteRank(sni)
,sni
// ,m_siteNumInlinksTotal
// ,m_siteNumInlinksUniqueIp
// ,m_siteNumInlinksUniqueCBlock
//,info1->m_totalInlinkingDocIds
//,info1->m_numUniqueIps
//,info1->m_numUniqueCBlocks
,info1->getNumGoodInlinks()
//,tmp3
,(int32_t)info1->m_lastUpdated
);
//if ( m_note )
// sb->safePrintf("\t<isLinkSpamReason><![CDATA[%s]]>"
// "</isLinkSpamReason>\n"
// , m_note );
sb->safePrintf("\t<isPermalink>%"INT32"</isPermalink>\n"
"\t<isRSSFeed>%"INT32"</isRSSFeed>\n"
"\t<ipAddress><![CDATA[%s]]></ipAddress>\n"
"\t<contentLenInBytes>%"INT32""
"</contentLenInBytes>\n"
"\t<isContentTruncated>%"INT32""
"</isContentTruncated>\n"
"\t<contentType><![CDATA[%s]]></contentType>\n"
"\t<language><![CDATA[%s]]></language>\n"
"\t<country><![CDATA[%s]]></country>\n",
(int32_t)m_isPermalink,
(int32_t)m_isRSS,
ipString,
size_utf8Content - 1,
(int32_t)m_isContentTruncated,
g_contentTypeStrings[(int)m_contentType] ,
strLanguage,
g_countryCode.getName(m_countryId) );
}
//sb->safePrintf("<tr><td>site</td><td>");
//sb->safeMemcpy(ptr_site,size_site-1);
//sb->safePrintf("</td></tr>\n");
TagRec *ogr = NULL;
if ( m_tagRecDataValid && m_version >= 118 ) {
ogr = getTagRec(); // &m_tagRec;
// sanity. should be set from titlerec, so no blocking!
if ( ! ogr || ogr == (void *)-1 ) { char *xx=NULL;*xx=0; }
}
if ( ogr && ! isXml ) ogr->printToBufAsHtml ( sb , "tag" );
else if ( ogr ) ogr->printToBufAsXml ( sb );
// show the good inlinks we used when indexing this
if ( ! isXml )
info1->print(sb,cr->m_coll);
// close the table
if ( ! isXml )
sb->safePrintf ( "</table></center><br>\n" );
else
sb->safePrintf("</response>\n");
return true;
}
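// . page 3 of the doc-info viewer: inlinks to the whole site
// . m_useSiteLinkBuf is set in printDocForProCog() so the link info ends
//   up in m_siteLinkBuf, which we copy out here (xml-wrapped if &xml=1)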
bool XmlDoc::printSiteInlinks ( SafeBuf *sb , HttpRequest *hr ) {
// use msg25 to hit linkdb and give us a link info class i guess
// but we need paging functionality so we can page through like
// 100 links at a time. clustered by c-class ip.
// do we need to mention how many from each ip c-class then? because
// then we'd have to read the whole termlist, might be several
// separate disk reads.
// we need to re-get both if either is NULL
LinkInfo *sinfo = getSiteLinkInfo();
// block or error?
if ( ! sinfo ) return true; if ( sinfo == (LinkInfo *)-1) return false;
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
if ( isXml )
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
sb->safeMemcpy ( &m_siteLinkBuf );
if ( isXml )
sb->safePrintf ("</response>\n" );
// just print that
//sinfo->print ( sb , cr->m_coll );
return true;
}
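// . page 2 of the doc-info viewer: inlinks to this page
// . prints the stored LinkInfo, or m_pageLinkBuf when &recompute=1 is given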
bool XmlDoc::printPageInlinks ( SafeBuf *sb , HttpRequest *hr ) {
// we need to re-get both if either is NULL
LinkInfo *info1 = getLinkInfo1();
// block or error?
if ( ! info1 ) return true; if ( info1 == (LinkInfo *)-1) return false;
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
if ( isXml )
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
int32_t recompute = hr->getLong("recompute",0);
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
// i guess we need this
if ( ! recompute ) // m_setFromTitleRec )
info1->print ( sb , cr->m_coll );
else
sb->safeMemcpy ( &m_pageLinkBuf );
if ( isXml )
sb->safePrintf ("</response>\n" );
return true;
}
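// callback re-entered when getInlineSectionVotingBuf() blocks; once the
// buffer is ready (or we hit an error) hand control back to the original caller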
static void getInlineSectionVotingBufWrapper ( void *state ) {
XmlDoc *xd = (XmlDoc *)state;
SafeBuf *vb = xd->getInlineSectionVotingBuf();
// return if blocked
if ( vb == (void *)-1 ) return;
// error?
if ( ! vb ) log("xmldoc: error getting inline section votes: %s",
mstrerror(g_errno));
// all done then. call original entry callback
log("xmldoc: returning control to original caller");
xd->m_callback1 ( xd->m_state );
}
// . returns false if blocked, true otherwise
// . returns true with g_errno set on error
// . this actually returns the page content with inserted information
// based on sectiondb data
// . for example, <div id=poo> --> <div id=poo d=5 n=20>
// means that the section is repeated on 20 pages from this site and 5 of
// which have the same innerHtml as us
SafeBuf *XmlDoc::getInlineSectionVotingBuf ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// . if we block anywhere below we want to come back here until done
// . this can be a main entry point, so set m_masterLoop
if ( ! m_masterLoop ) {
m_masterLoop = getInlineSectionVotingBufWrapper;
m_masterState = this;
log("xmldoc: getting section voting info from coll=%s",
cr->m_coll);
}
if ( m_inlineSectionVotingBufValid )
return &m_inlineSectionVotingBuf;
Sections *sections = getSectionsWithDupStats();
if ( ! sections || sections == (void *)-1 ) return (SafeBuf *)sections;
Words *words = getWords();
if ( ! words || words == (void *)-1 ) return (SafeBuf *)words;
HttpMime *mime = getMime();
if ( ! mime || mime == (void *)-1 ) return (SafeBuf *)mime;
int32_t siteHash32 = *getSiteHash32();
//int32_t nw = words->getNumWords();
//int64_t *wids = words->getWordIds();
SafeBuf *sb = &m_inlineSectionVotingBuf;
// store mime first then content
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
// we no longer use this through a proxy, so take this out
//sb->safeMemcpy ( m_httpReply , mime->getMimeLen() );
// but hack the Content-Length: field to something alien
// because we mark up the html and the length will be different...
//sb->nullTerm();
// we no longer use this through a proxy so take this out
//char *cl = strstr(sb->getBufStart(),"\nContent-Length:");
//if ( cl ) cl[1] = 'Z';
//sec_t mflags = SEC_SENTENCE | SEC_MENU;
// just print out each word
// map the word to a section.
// if it's the first time we've printed the section then we
// can inject the stuff
// set a printed bit to indicate when we print out a section so
// we do not re-print it...
// these are 1-1 with words
Section **sptrs = sections->m_sectionPtrs;
int32_t nw = words->getNumWords();
char **wptrs = words->m_words;
int32_t *wlens = words->m_wordLens;
for ( int32_t i = 0 ; i < nw ; i++ ) {
char *a = wptrs[i];
// skip if not a front tag
if ( *a != '<' || a[1] == '/' ) {
sb->safeMemcpy(a,wlens[i]);
continue;
}
Section *sa = sptrs[i];
// straight copy if no stats
if ( ! sa || ! sa->m_stats.m_totalEntries ) {
sb->safeMemcpy ( a , wlens[i] );
continue;
}
// should be tag then
char *e = a;
for ( ; *e && *e != '>' && ! is_wspace_a(*e) ; e++);
// copy that
sb->safeMemcpy ( a , e-a);
// the hash of the turktaghash and sitehash32 combined
// so you can do gbfacetstr:gbxpathsitehash12345
// where the 12345 is this h32 value.
uint32_t h32 = sa->m_turkTagHash32 ^ siteHash32;
// insert our stuff into the tag
//sb->safePrintf("<!--");
//sb->safePrintf("<font color=red>");
SectionStats *sx = &sa->m_stats;
// # docs from our site had the same innerHTML?
sb->safePrintf(" _s=M%"INT32"D%"INT32"n%"INT32"u%"INT32"h%"UINT32"",
// total # of docs that had an xpath with
// our same innerHtml
(int32_t)sx->m_totalMatches,
// # of docids with this facet
(int32_t)sx->m_totalDocIds,
// . total # of times this xpath occurred
// . can be multiple times per doc
(int32_t)sx->m_totalEntries,
// unique values in the xpath innerhtml
(int32_t)sx->m_numUniqueVals,
// xpathsitehash
h32 );
// copy the rest of the tag
sb->safeMemcpy( e, wlens[i]-(e-a) );
//sb->safePrintf("-->");
//sb->safePrintf("</font>");
// print it here
}
sb->nullTerm();
m_inlineSectionVotingBufValid = true;
return &m_inlineSectionVotingBuf;
}
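// . page 4 of the doc-info viewer: the "rainbow" sections view
// . computes the density, diversity, word-spam and word-position vectors
//   for the body, then either renders the colored html view or, with
//   &xml=1, one <section> element per section with byte offsets, flags,
//   colors and the innerContentHash64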
bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {
// what wordposition to scroll to and blink?
int32_t hiPos = -1;
if ( hr ) hiPos = hr->getLong("hipos",-1);
//
// PRINT SECTIONS
//
Sections *sections ;
// hr is NULL if being called from page parser which does not have the
// dup stats! and we core if we block here!
if ( hr ) sections = getSectionsWithDupStats();
else sections = getSections();
if ( ! sections) return true;if (sections==(Sections *)-1)return false;
//SectionVotingTable *nsvt = getNewSectionVotingTable();
//if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
//SectionVotingTable *osvt = getOldSectionVotingTable();
//if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;}
Words *words = getWords();
if ( ! words ) return true; if ( words == (Words *)-1 ) return false;
Phrases *phrases = getPhrases();
if ( ! phrases ) return true; if (phrases == (void *)-1 ) return false;
HashTableX *cnt = getCountTable();
if ( ! cnt ) return true; if ( cnt == (void *)-1 ) return false;
int32_t nw = words->getNumWords();
//int32_t wordStart = 0;
//int32_t wordEnd = nw;
int64_t *wids = words->getWordIds();
int32_t isXml = false;
if ( hr ) isXml = (bool)hr->getLong("xml",0);
//if ( ! isXml ) printMenu ( sb );
// now complement, cuz bigger is better in the ranking world
//int32_t densityRank = getDensityRank ( wids , 0 , nw , HASHGROUP_BODY );
SafeBuf densBuf;
// returns false and sets g_errno on error
if ( ! getDensityRanks((int64_t *)wids,
nw,
HASHGROUP_BODY,//hi->m_hashGroup,
&densBuf,
sections,
m_niceness))
return true;
// a handy ptr
char *densityVec = (char *)densBuf.getBufStart();
/*
if ( ! isXml )
sb->safePrintf("<br><b>density rank of body = %"INT32"</b> "
"(out of %"INT32")"
"<br>"
"<br>"
, densityRank
, (int32_t)MAXDENSITYRANK
);
*/
char *wordSpamVec = getWordSpamVec();
char *fragVec = m_fragBuf.getBufStart();
SafeBuf dwbuf;
if(!getDiversityVec(words,phrases,cnt,&dwbuf,m_niceness))return true;
char *diversityVec = dwbuf.getBufStart();
// hack fack debug
//m_bodyStartPos =2136;
SafeBuf wpos;
if ( ! getWordPosVec ( words ,
sections,
//wordStart,
//wordEnd,
// we save this in the titlerec, when we
// start hashing the body. we have the url
// terms before the body, so this is necessary.
m_bodyStartPos,//0, // hi->m_startDist,
fragVec,
m_niceness,
&wpos) ) return true;
// a handy ptr
int32_t *wposVec = (int32_t *)wpos.getBufStart();
if ( ! isXml ) {
// put url in for steve to parse out
sb->safePrintf("%s\n",
m_firstUrl.m_url);
sb->safePrintf("<font color=black>w</font>"
"/"
"<font color=purple>x</font>"
//"/"
//"<font color=green>y</font>"
"/"
"<font color=red>z</font>"
": "
"w=wordPosition "
"x=densityRank "
//"y=diversityRank "
"z=wordSpamRank "
"<br>"
"<br>"
""
);
}
if ( ! isXml ) {
// try the new print function
sections->print2 ( sb ,
hiPos,
wposVec,
densityVec,
diversityVec,
wordSpamVec,
fragVec,
NULL,
NULL ,
&m_addresses ,
true );
return true;
}
if ( isXml )
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
Section *si = sections->m_rootSection;
sec_t mflags = SEC_SENTENCE | SEC_MENU;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// print it out
sb->safePrintf("\t<section>\n");
// get our offset in the array of sections
int32_t num = si - sections->m_sections;
sb->safePrintf("\t\t<id>%"INT32"</id>\n",num);
Section *parent = si->m_parent;
if ( parent ) {
int32_t pnum = parent - sections->m_sections;
sb->safePrintf("\t\t<parent>%"INT32"</parent>\n",pnum);
}
char *byte1 = words->m_words[si->m_a];
char *byte2 = words->m_words[si->m_b-1] +
words->m_wordLens[si->m_b-1];
int32_t off1 = byte1 - words->m_words[0];
int32_t size = byte2 - byte1;
sb->safePrintf("\t\t<byteOffset>%"INT32"</byteOffset>\n",off1);
sb->safePrintf("\t\t<numBytes>%"INT32"</numBytes>\n",size);
if ( si->m_flags & mflags ) {
sb->safePrintf("\t\t<flags><![CDATA[");
bool printed = false;
if ( si->m_flags & SEC_SENTENCE ) {
sb->safePrintf("sentence");
printed = true;
}
if ( si->m_flags & SEC_MENU ) {
if ( printed ) sb->pushChar(' ');
sb->safePrintf("ismenu");
printed = true;
}
sb->safePrintf("]]></flags>\n");
}
int32_t bcolor = (int32_t)si->m_colorHash& 0x00ffffff;
int32_t fcolor = 0x000000;
//int32_t rcolor = 0x000000;
uint8_t *bp = (uint8_t *)&bcolor;
bool dark = false;
if ( bp[0]<128 && bp[1]<128 && bp[2]<128 )
dark = true;
// or if any two channels are less than 100
if ( bp[0]<100 && bp[1]<100 ) dark = true;
if ( bp[1]<100 && bp[2]<100 ) dark = true;
if ( bp[0]<100 && bp[2]<100 ) dark = true;
// if bg color is dark, make font color light
if ( dark ) {
fcolor = 0x00ffffff;
//rcolor = 0x00ffffff;
}
sb->safePrintf("\t\t<bgColor>%06"XINT32"</bgColor>\n",bcolor);
sb->safePrintf("\t\t<textColor>%06"XINT32"</textColor>\n",fcolor);
// count stats
uint64_t ch64 = (int32_t)si->m_sentenceContentHash64;
if ( ! ch64 ) {
sb->safePrintf("\t</section>\n");
continue;
}
/* take this out for now it is not quite right any more.
we now use the xpath hash and site hash as the key
and the "value" is the sentence/innerHtml hash
sb->safePrintf("\t\t<numOnSitePagesThatDuplicateContent>%"INT32""
"</numOnSitePagesThatDuplicateContent>\n",
(int32_t)si->m_stats.m_onSiteDocIds);
sb->safePrintf("\t\t<numOffSitePagesThatDuplicateContent>%"INT32""
"</numOffSitePagesThatDuplicateContent>\n",
(int32_t)si->m_stats.m_offSiteDocIds);
sb->safePrintf("\t\t<numSitesThatDuplicateContent>%"INT32""
"</numSitesThatDuplicateContent>\n",
(int32_t)si->m_stats.m_numUniqueSites);
*/
// you can do a sitehash:xxxxx query with this number to see who the
// dups are!
sb->safePrintf("\t\t<innerContentHash64>%"UINT64""
"</innerContentHash64>\n",
si->m_sentenceContentHash64);
sb->safePrintf("\t</section>\n");
}
// now print out the entire page content so the offsets make sense!
sb->safePrintf("\t<utf8Content><![CDATA[");
if ( ptr_utf8Content )
sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,false);
sb->safePrintf("]]></utf8Content>\n");
// end xml response
sb->safePrintf("</response>\n");
return true;
}
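// . page 5 of the doc-info viewer: the list of indexed (posdb) terms
// . sets m_storeTermListInfo so getMetaList() records a TermDebugInfo for
//   every term it hashes, then prints that table sorted by term (&sortby=0)
//   or by word position (&sortby=1)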
bool XmlDoc::printTermList ( SafeBuf *sb , HttpRequest *hr ) {
// set debug buffer
m_storeTermListInfo = true;
// default to sorting by wordpos
m_sortTermListBy = hr->getLong("sortby",1);
// cores in getNewSpiderReply() if we do not have this and provide
// the docid...
m_useSpiderdb = false;
char *metaList = getMetaList ( );
if ( ! metaList ) return true; if (metaList==(char *) -1) return false;
CollectionRec *cr = getCollRec();
if ( ! cr ) return false;
int32_t isXml = hr->getLong("xml",0);
if ( isXml ) {
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
sb->safePrintf(
"\t<maxDens>%"INT32"</maxDens>\n"
//"\t<maxDiv>%"INT32"</maxDiv>\n"
"\t<maxSpam>%"INT32"</maxSpam>\n"
, (int32_t)MAXDENSITYRANK
//, (int32_t)MAXDIVERSITYRANK
, (int32_t)MAXWORDSPAMRANK
);
}
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
if ( ! isXml ) {
//printMenu ( sb );
//sb->safePrintf("<i>* indicates word is a synonym or "
// "alternative word form<br><br>");
sb->safePrintf("N column = DensityRank (0-%"INT32")<br>"
//"V column = DiversityRank (0-%"INT32")<br>"
"S column = WordSpamRank (0-%"INT32") "
"[or linker "
"siterank if its offsite link text]<br>"
"Lang column = language used for purposes "
"of detecting the document's primary language "
"using a simple majority vote"
"<br>"
"</i>"
"<br>"
"Document Primary Language: <b>%s</b> (%s)"
"<br>"
"<br>"
, (int32_t)MAXDENSITYRANK
//, (int32_t)MAXDIVERSITYRANK
, (int32_t)MAXWORDSPAMRANK
, getLanguageString (m_langId)
, getLangAbbr(m_langId)
);
// encode it
SafeBuf ue;
ue.urlEncode ( ptr_firstUrl );
sb->safePrintf("Sort by: " );
if ( m_sortTermListBy == 0 )
sb->safePrintf("<b>Term</b>");
else
sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&"
"sortby=0>"
"Term</a>"
, cr->m_coll
, ue.getBufStart()
);
sb->safePrintf(" | ");
if ( m_sortTermListBy == 1 )
sb->safePrintf("<b>WordPos</b>");
else
sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&"
"sortby=1>"
"WordPos</a>"
, cr->m_coll
, ue.getBufStart()
);
sb->safePrintf("<br>"
"<br>"
);
}
//
// BEGIN PRINT HASHES TERMS (JUST POSDB)
//
// shortcut
HashTableX *wt = m_wts;
// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
TermDebugInfo **tp = NULL;
// add them with this counter
int32_t nt = 0;
int32_t nwt = 0;
if ( wt ) {
nwt = wt->m_numSlots;
tp = (TermDebugInfo **)wt->m_keys;
}
// now print the table we stored all we hashed into
for ( int32_t i = 0 ; i < nwt ; i++ ) {
// skip if empty
if ( wt->m_flags[i] == 0 ) continue;
// breathe
//QUICKPOLL(m_niceness);
// get its key, date=32bits termid=64bits
//key96_t *k = (key96_t *)wt->getKey ( i );
// get the TermDebugInfo
TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
// point to it for sorting
tp[nt++] = ti;
}
// set this for cmptp
s_wbuf = &m_wbuf;
if ( m_sortTermListBy == 0 )
// sort them alphabetically
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp , m_niceness );
else
// sort by word pos
gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp2 , m_niceness );
// print the weight tables
//printLocationWeightsTable(sb,isXml);
//printDiversityWeightsTable(sb,isXml);
//printDensityWeightsTable(sb,isXml);
//printWordSpamWeightsTable(sb,isXml);
// print them out in a table
char hdr[1000];
sprintf(hdr,
"<table border=1 cellpadding=0>"
"<tr>"
// this messes up Test.cpp diff'ing
//"<td><b>#</b></td>"
"<td><b>Prefix</b></td>"
"<td><b>WordPos</b></td>"
"<td><b>Lang</b></td>"
"<td><b>Term</b></td>"
//"%s"
//"<td><b>Weight</b></td>"
//"<td><b>Spam</b></td>"
"<td><b>Desc</b></td>"
"<td><b>N</b></td>"
//"<td><b>V</b></td>" // diversityRank
"<td><b>S</b></td>"
"<td><b>Score</b></td>"
//"<td><b>Date</b></td>"
//"<td><b>Desc</b></td>"
//"<td><b>TermId</b></td>"
"</tr>\n"
//,fbuf
);
if ( ! isXml )
sb->safePrintf("%s",hdr);
char *start = m_wbuf.getBufStart();
int32_t rcount = 0;
for ( int32_t i = 0 ; i < nt ; i++ ) {
// start a new table every TABLE_ROWS rows so one huge table does not bog down the browser
if ( (++rcount % TABLE_ROWS) == 0 && ! isXml )
sb->safePrintf("<!--ignore--></table>%s",hdr);
char *prefix = NULL;//"&nbsp;";
if ( tp[i]->m_prefixOff >= 0 )
prefix = start + tp[i]->m_prefixOff;
if ( isXml ) sb->safePrintf("\t<term>\n");
if ( isXml && prefix )
sb->safePrintf("\t\t<prefix><![CDATA[%s]]>"
"</prefix>\n",prefix);
if ( ! isXml ) {
sb->safePrintf ( "<tr>");
if ( prefix )
sb->safePrintf("<td>%s:</td>",prefix);
else
sb->safePrintf("<td>&nbsp;</td>");
}
if ( ! isXml )
sb->safePrintf("<td>%"INT32""
"/%"INT32""
"</td>" ,
tp[i]->m_wordPos
,tp[i]->m_wordNum
);
//char *abbr = getLangAbbr(tp[i]->m_langId);
//if ( tp[i]->m_langId == langTranslingual ) abbr ="??";
//if ( tp[i]->m_langId == langUnknown ) abbr ="--";
//if ( tp[i]->m_synSrc ) abbr = "";
// print out all langs word is in if it's not clear
// what language it is. we use a sliding window to
// resolve some ambiguity, but not all, so print out
// the possible langs here
if ( ! isXml ) {
sb->safePrintf("<td>");
printLangBits ( sb , tp[i] );
sb->safePrintf("</td>");
}
//if ( ! isXml && abbr[0] )
// sb->safePrintf("<td>%s</td>", abbr );
//else if ( ! isXml )
// sb->safePrintf("<td>&nbsp;</td>" );
//else if ( abbr[0] )
// sb->safePrintf("\t\t<lang><![CDATA["
// "]]>%s</lang>\n", abbr );
if ( isXml )
sb->safePrintf("\t\t<s><![CDATA[");
if ( ! isXml )
sb->safePrintf ("<td><nobr>" );
//if ( tp[i]->m_synSrc )
// sb->pushChar('*');
sb->safeMemcpy_nospaces ( start + tp[i]->m_termOff ,
tp[i]->m_termLen );
/*
char *dateStr = "&nbsp;";
int32_t ddd = tp[i]->m_date;
uint8_t *tddd = (uint8_t *)&ddd;
char tbbb[32];
if ( ddd && tddd[2] == 0 && tddd[3] == 0 &&
tddd[0] && tddd[1] && tddd[1] <= tddd[0] ) {
sprintf(tbbb,"evIds %"INT32"-%"INT32"",
(int32_t)tddd[1],(int32_t)tddd[0]);
dateStr = tbbb;
}
else if ( ddd )
dateStr = asctime ( gmtime(&ddd ));
char tmp[20];
if ( tp[i]->m_noSplit ) sprintf ( tmp,"<b>1</b>" );
else sprintf ( tmp,"0" );
*/
if ( isXml )
sb->safePrintf("]]></s>\n");
else
sb->safePrintf ( "</nobr></td>" );
if ( isXml )
sb->safePrintf("\t\t<wordPos>%"INT32"</wordPos>\n",
tp[i]->m_wordPos);
char *desc = NULL;
if ( tp[i]->m_descOff >= 0 )
desc = start + tp[i]->m_descOff;
// use hashgroup
int32_t hg = tp[i]->m_hashGroup;
if ( ! desc || ! strcmp(desc,"body") )
desc = getHashGroupString(hg);
if ( isXml && desc )
sb->safePrintf("\t\t<loc>%s</loc>\n", desc);
else if ( ! isXml ) {
if ( ! desc ) desc = "&nbsp;";
sb->safePrintf ( "<td>%s", desc );
char ss = tp[i]->m_synSrc;
if ( ss )
sb->safePrintf(" - %s",
getSourceString(ss));
sb->safePrintf("</td>");
}
int32_t dn = (int32_t)tp[i]->m_densityRank;
if ( isXml )
sb->safePrintf("\t\t<dens>%"INT32"</dens>\n",dn);
if ( ! isXml && dn >= MAXDENSITYRANK )
sb->safePrintf("<td>%"INT32"</td>\n",dn);
else if ( ! isXml )
sb->safePrintf("<td><font color=purple>%"INT32"</font>"
"</td>",dn);
// the diversityrank/wordspamrank
/*
int32_t ds = (int32_t)tp[i]->m_diversityRank;
if ( isXml )
sb->safePrintf("\t\t<div>%"INT32"</div>\n",ds);
if ( ! isXml && ds >= MAXDIVERSITYRANK )
sb->safePrintf("<td>%"INT32"</td>\n",ds);
else if ( ! isXml )
sb->safePrintf("<td><font color=green>%"INT32"</font>"
"</td>",ds);
*/
int32_t ws = (int32_t)tp[i]->m_wordSpamRank;
if ( isXml && hg == HASHGROUP_INLINKTEXT )
sb->safePrintf("\t\t<linkerSiteRank>%"INT32""
"</linkerSiteRank>\n",ws);
else if ( isXml )
sb->safePrintf("\t\t<spam>%"INT32"</spam>\n",ws);
if ( ! isXml && ws >= MAXWORDSPAMRANK )
sb->safePrintf("<td>%"INT32"</td>",ws);
else if ( ! isXml )
sb->safePrintf("<td><font color=red>%"INT32"</font></td>",
ws);
float score = 1.0;
// square this like we do in the query ranking algo
score *= getHashGroupWeight(hg) * getHashGroupWeight(hg);
//score *= getDiversityWeight(tp[i]->m_diversityRank);
score *= getDensityWeight(tp[i]->m_densityRank);
if ( tp[i]->m_synSrc ) score *= SYNONYM_WEIGHT;
if ( hg == HASHGROUP_INLINKTEXT ) score *= getLinkerWeight(ws);
else score *= getWordSpamWeight(ws);
if ( isXml )
sb->safePrintf("\t\t<score>%.02f</score>\n",score);
else
sb->safePrintf("<td>%.02f</td>\n",score);
if ( isXml )
sb->safePrintf("\t</term>\n");
else
sb->safePrintf("</tr>\n");
}
if ( isXml )
sb->safePrintf ("</response>\n" );
else
sb->safePrintf("</table><br>\n");
//
// END PRINT HASHES TERMS
//
return true;
}
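// page 6 of the doc-info viewer: spider stats are not implemented yet,
// so just print a placeholder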
bool XmlDoc::printSpiderStats ( SafeBuf *sb , HttpRequest *hr ) {
int32_t isXml = hr->getLong("xml",0);
if ( ! isXml ) printMenu ( sb );
sb->safePrintf("<b>Coming Soon</b>");
return true;
}
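// page 7 of the doc-info viewer: dump the stored utf8 content as-is, or
// CDATA-wrapped inside an xml <response> when &xml=1 is given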
bool XmlDoc::printCachedPage ( SafeBuf *sb , HttpRequest *hr ) {
char **c = getUtf8Content();
if ( ! c ) return true; if ( c==(void *)-1) return false;
int32_t isXml = hr->getLong("xml",0);
int32_t raw = hr->getLong("raw",0);
if ( ! isXml && ! raw ) printMenu ( sb );
if ( ! isXml ) {
// just copy it otherwise
if ( ptr_utf8Content )
sb->safeMemcpy ( ptr_utf8Content ,size_utf8Content -1);
return true;
}
sb->safePrintf ("<?xml version=\"1.0\" "
"encoding=\"UTF-8\" ?>\n"
"<response>\n"
);
sb->safePrintf("\t<utf8Content><![CDATA[");
if ( ptr_utf8Content )
sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,
false);
sb->safePrintf("]]></utf8Content>\n");
// end xml response
sb->safePrintf("</response>\n");
return true;
}
// . get the possible titles of the root page
// . includes the title tag text
// . includes various inlink text
// . used to match the VERIFIED place name 1 or 2 of addresses on this
// site in order to set Address::m_flags's AF_VENUE_DEFAULT bit which
// indicates the address is the address of the website (a venue website)
char **XmlDoc::getRootTitleBuf ( ) {
// return if valid
if ( m_rootTitleBufValid ) return (char **)&m_rootTitleBuf;
// get it from the tag rec first
setStatus ( "getting root title buf");
// sanity check, root must have been indexed
//if ( ! m_sreq.m_rootIndexed ) { char *xx=NULL;*xx=0; }
// . update it first before reading it!
// . do not update it here, just update it in getTitleRec() because
// this makes doConsistencyCheck() block and core
//bool *status2 = updateSiteTitleBuf();
//if ( ! status2 || status2 == (void *)-1 ) return (char **)status2;
// get it from the tag rec if we can
TagRec *gr = getTagRec ();
if ( ! gr || gr == (void *)-1 ) return (char **)gr;
// clear this if not set from title rec
//if ( ! m_setFromTitleRec ) {
// ptr_siteTitleBuf = NULL;
// size_siteTitleBuf = 0;
//}
// PROBLEM: new title rec is the only thing which has sitetitles tag
// sometimes and we do not store that in the title rec. in this case
// we should maybe store ptr_siteTitleBuf/size_siteTitleBuf in the
// title rec?
Tag *tag = gr->getTag("roottitles");
char *src = NULL;
int32_t srcSize = 0;
if ( ptr_rootTitleBuf || m_setFromTitleRec ) {
src = ptr_rootTitleBuf;
srcSize = size_rootTitleBuf;
}
else if ( tag ) {
src = tag->getTagData();
srcSize = tag->getTagDataSize();
// no need to add to title rec since already in the tag so
// make sure we did not double add
if ( ptr_rootTitleBuf ) { char *xx=NULL;*xx=0; }
}
else {
// . get the root doc
// . allow for a one hour cache of the titleRec
XmlDoc **prd = getRootXmlDoc( 3600 );
if ( ! prd || prd == (void *)-1 ) return (char **)prd;
// shortcut
XmlDoc *rd = *prd;
// . if no root doc, then assume no root title
// . this happens if we are injecting because we do not want
// to download the root page for speed purposes
if ( ! rd ) {
m_rootTitleBuf[0] = '\0';
m_rootTitleBufSize = 0;
m_rootTitleBufValid = true;
return (char **)&m_rootTitleBuf;
}
// . ONLY do this if root doc was NOT set from titleRec to
// avoid that core in updateSiteTitleBuf(). this can happen
// if the root doc had no title! (or no content)
//if ( rd->m_setFromTitleRec ) {
// // emptyt
// m_siteTitleBuf[0] = '\0';
// // set the size of it
// m_siteTitleBufSize = 0;
// // validate it
// m_siteTitleBufValid = true;
// // return a ptr to it
// return (char **)&m_siteTitleBuf;
//}
// a \0 separated list
char **rtl = rd->getTitleBuf();
if ( ! rtl || rtl == (void *)-1 ) return (char **)rtl;
// ptr
src = rd->m_titleBuf;
srcSize = rd->m_titleBufSize;
}
int32_t max = (int32_t)ROOT_TITLE_BUF_MAX - 5;
// sanity
if ( srcSize >= max ) {
// truncate
srcSize = max;
// back up so we split on a space
for ( ; srcSize>0 && ! is_wspace_a(src[srcSize]); srcSize--);
// null term
src[srcSize] = '\0';
// include it
srcSize++;
}
// copy that over in case root is destroyed
gbmemcpy ( m_rootTitleBuf , src , srcSize );
m_rootTitleBufSize = srcSize;
// sanity check, must include the null in the size
if ( m_rootTitleBufSize > 0 &&
m_rootTitleBuf [ m_rootTitleBufSize - 1 ] ) {
log("build: bad root titlebuf size not end in null char for "
"collnum=%i",(int)m_collnum);
ptr_rootTitleBuf = NULL;
size_rootTitleBuf = 0;
m_rootTitleBufValid = true;
return (char **)&m_rootTitleBuf;
char *xx=NULL;*xx=0;
//m_rootTitleBuf [ m_rootTitleBufSize - 1 ] = '\0';
//m_rootTitleBufSize++;
}
// sanity check - breach check
if ( m_rootTitleBufSize > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0;}
// serialize into our titlerec
ptr_rootTitleBuf = m_rootTitleBuf;
size_rootTitleBuf = m_rootTitleBufSize;
m_rootTitleBufValid = true;
return (char **)&m_rootTitleBuf;
}
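// . like getRootTitleBuf() but with punctuation collapsed so each
//   \0-separated entry is a clean candidate site name (see the
//   walmart.com example below)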
char **XmlDoc::getFilteredRootTitleBuf ( ) {
if ( m_filteredRootTitleBufValid )
return (char **)&m_filteredRootTitleBuf;
// get unfiltered. m_rootTitleBuf should be set from this call.
char **rtbp = getRootTitleBuf();
if ( ! rtbp || rtbp == (void *)-1 ) return (char **)rtbp;
/*
// assume none
m_filteredRootTitleBuf[0] = '\0';
m_filteredRootTitleBufSize = 0;
m_filteredRootTitleBufValid = true;
return (char **)&m_filteredRootTitleBuf;
*/
// filter all the punct to \0 so that something like
// "walmart.com : live better" is reduced to 3 potential
// names, "walmart", "com" and "live better"
char *src = m_rootTitleBuf;
char *srcEnd = src + m_rootTitleBufSize;
char *dst = m_filteredRootTitleBuf;
// save some room to add a \0, so subtract 5
char *dstEnd = dst + ROOT_TITLE_BUF_MAX - 5;
//char *src = tag->getTagData();
//char *srcEnd = src + tag->getTagDataSize();
int32_t size = 0;
bool lastWasPunct = true;
for ( ; src < srcEnd && dst < dstEnd ; src += size ) {
// set the char size
size = getUtf8CharSize(src);
// space?
if ( is_wspace_a (*src) ||
// allow periods too
*src=='.' ) {
// no back to back punct
if ( lastWasPunct ) continue;
// flag it
lastWasPunct = true;
// add it in
*dst++ = '.';
// that's it
continue;
}
// x'y or x-y
if ( ( *src == '\'' ||
*src == '.' ||
*src == '-' ) &&
! lastWasPunct &&
is_alnum_a(src[1]) ) {
// add it in
*dst++ = *src;
// that's it
continue;
}
// x & y is ok
if ( *src == '&' ) {
// assume not punct (stands for and)
lastWasPunct = false;
// add it in
*dst++ = *src;
// that's it
continue;
}
// store alnums right in
if ( is_alnum_a(*src) ) {
// flag it
lastWasPunct = false;
// copy it over
gbmemcpy ( dst , src , size );
// skip what we copied
dst += size;
continue;
}
// if the previous output char was a separator, turn it into a terminator
if ( lastWasPunct ) dst[-1] = '\0';
// store it
else *dst++ = '\0';
}
// make sure we end on a \0
if ( dst > m_filteredRootTitleBuf && dst[-1] != '\0' )
*dst++ = '\0';
// shortcut
char *str = m_filteredRootTitleBuf;
int32_t strSize = dst - m_filteredRootTitleBuf;
// copy that over in case root is destroyed
gbmemcpy ( m_filteredRootTitleBuf , str , strSize );
m_filteredRootTitleBufSize = strSize;
// sanity check, must include the null in the size
if ( m_filteredRootTitleBufSize > 0 &&
m_filteredRootTitleBuf [ m_filteredRootTitleBufSize - 1 ] ) {
char *xx=NULL;*xx=0;
//m_filteredRootTitleBuf [ m_filteredRootTitleBufSize-1]='\0';
//m_filteredRootTitleBufSize++;
}
// sanity check - breach check
if ( m_filteredRootTitleBufSize > ROOT_TITLE_BUF_MAX ) {
char *xx=NULL;*xx=0;}
m_filteredRootTitleBufValid = true;
// make this static so we can return its address (a local's address
// would dangle); re-assign it on every call so it tracks this instance
static char *fp;
fp = m_filteredRootTitleBuf;
return (char **)&fp;
//return (char **)&m_filteredRootTitleBuf;
}
//static bool s_dummyBool = 1;
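// holds one inlink's link text plus a score/hash; used by getTitleBuf()
// below to pick candidate titles for a root page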
class Binky {
public:
char *m_text;
int32_t m_textLen;
int32_t m_score;
int64_t m_hash;
};
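// comparator for gbsort(): ascending by m_score. note the sort call in
// getTitleBuf() is currently commented out.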
int cmpbk ( const void *v1, const void *v2 ) {
Binky *b1 = (Binky *)v1;
Binky *b2 = (Binky *)v2;
return b1->m_score - b2->m_score;
}
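// . builds a \0-separated list of candidate titles for a root page: the
//   <title> tag text, intended to be followed by de-duped inlink link text
// . NOTE: the per-inlink hashing below is commented out, so every
//   bk[i].m_hash stays 0 and the dedup loop zeroes every score, which
//   means only the <title> text actually makes it into m_titleBuf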
char **XmlDoc::getTitleBuf ( ) {
if ( m_titleBufValid ) return (char **)&m_titleBuf;
// recalc this every time the root page is indexed
setStatus ( "getting title buf on root");
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
// this should only be called on the root!
// . if the site changed for us, but the title rec of what we
// think is now the root thinks that it is not the root because
// it is using the old site, then it cores here!
// . i.e. if the new root is www.xyz.com/user/ted/ and the old root
// is www.xyz.com then and the old root is stored in ptr_site for
// the title rec for www.xyz.com/user/ted/ then we core here,
// . so take this sanity check out
// . but if the title rec does not think he is the site root yet
// then just wait until he does so we can get his
// ptr_rootTitleBuf below
if ( ! *isRoot ) {
m_titleBuf[0] = '\0';
m_titleBufSize = 0;
m_titleBufValid = true;
return (char **)&m_titleBuf;
}
// sanity check
if ( m_setFromTitleRec ) {
gbmemcpy(m_titleBuf, ptr_rootTitleBuf, size_rootTitleBuf );
m_titleBufSize = size_rootTitleBuf;
m_titleBufValid = true;
return (char **)&m_titleBuf;
}
char *mysite = getSite();
if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite;
// get link info first
LinkInfo *info1 = getLinkInfo1();
// error or blocked
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char **)info1;
// sanity check
Xml *xml = getXml();
// return -1 if it blocked
if ( xml == (void *)-1 ) return (char **)-1;
// set up for title
int32_t tlen ;
char *title ;
// on error, ignore it to avoid hammering the root!
if ( xml == (void *)NULL ) {
// log it
log("build: error downloading root xml: %s",
mstrerror(g_errno));
// clear it
g_errno = 0;
// make it 0
tlen = 0;
title = NULL;
}
else {
// get the title
title = m_xml.getTextForXmlTag ( 0,
999999 ,
"title" ,
&tlen ,
true ); // skip leading spaces
}
// truncate to 100 chars
//for ( ; tlen>0 && (tlen > 100 || is_alnum_a(title[tlen])) ; tlen-- )
// if ( tlen == 0 ) break;
if ( tlen > 100 ) {
char *tpend = title + 100;
char *prev = getPrevUtf8Char ( tpend , title );
// make that the end so we don't split a utf8 char
tlen = prev - title;
}
// store tag in here
char tmp[1024];
// point to it
char *ptmp = tmp;
// set this
char *pend = tmp + 1024;
// add that in
gbmemcpy ( ptmp, title, tlen); ptmp += tlen;
// null terminate it
*ptmp++ = '\0';
// only let the first couple of internal inlinks contribute
int32_t internalCount = 0;
// count inlinkers
int32_t linkNum = 0;
Binky bk[1000];
// init this
//char stbuf[2000];
//HashTableX scoreTable;
//scoreTable.set(8,4,64,stbuf,2000,false,m_niceness,"xmlscores");
// scan each link in the link info
for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) {
// do not breach
if ( linkNum >= 1000 ) break;
// is this inlinker internal?
bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
// get length of link text
int32_t tlen = k->size_linkText;
if ( tlen > 0 ) tlen--;
// get the text
char *txt = k->getLinkText();
// skip corrupted
if ( ! verifyUtf8 ( txt , tlen ) ) {
log("xmldoc: bad link text 4 from url=%s for %s",
k->getUrl(),m_firstUrl.m_url);
continue;
}
// store these
// zero out hash
bk[linkNum].m_hash = 0;
bk[linkNum].m_text = txt;
bk[linkNum].m_textLen = tlen;
bk[linkNum].m_score = 0;
// internal count
if ( internal && ++internalCount >= 3 ) continue;
// it's good
bk[linkNum].m_score = 1;
linkNum++;
/*
// set into words
Words w;
// return NULL on error with g_errno set
if ( ! w.setx ( txt , tlen , m_niceness ) ) return NULL;
// int16_tcut
int64_t *wids = w.getWordIds();
// init hash
int64_t h = 0LL;
// hash all words together
for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) {
// skip if not hashable
if ( ! wids[i] ) continue;
// mix it up
h <<= 1LL;
// xor it in
h ^= wids[i];
}
// update hash
bk[linkNum].m_hash = h;
// store in table, return NULL with g_errno set on error
if ( ! scoreTable.addTerm ( &h ) ) return NULL;
*/
}
// init this
char dtbuf[1000];
HashTableX dupTable;
dupTable.set(8,0,64,dtbuf,1000,false,m_niceness,"xmldup");
// now set the scores and isdup
for ( int32_t i = 0 ; i < linkNum ; i++ ) {
// skip if ignored
if ( bk[i].m_score == 0 ) continue;
// get hash
int64_t h = bk[i].m_hash;
// assume a dup
bk[i].m_score = 0;
// skip if zero'ed out
if ( ! h ) continue;
// only do each hash once!
if ( dupTable.isInTable(&h) ) continue;
// add to it. return NULL with g_errno set on error
if ( ! dupTable.addKey(&h) ) return NULL;
// is it in there?
bk[i].m_score = 1; // scoreTable.getScore ( &h );
}
// now sort the bk array by m_score
//gbsort ( bk , linkNum , sizeof(Binky), cmpbk , m_niceness );
// sanity check - make sure sorted right
//if ( linkNum >= 2 && bk[0].m_score < bk[1].m_score ) {
// char *xx=NULL; *xx=0; }
// . now add the winners to the buffer
// . skip if score is 0
for ( int32_t i = 0 ; i < linkNum ; i++ ) {
// skip if score is zero
if ( bk[i].m_score == 0 ) continue;
// skip if too big
if ( bk[i].m_textLen + 1 > pend - ptmp ) continue;
// store it
gbmemcpy ( ptmp , bk[i].m_text , bk[i].m_textLen );
// advance
ptmp += bk[i].m_textLen;
// null terminate it
*ptmp++ = '\0';
}
// sanity
int32_t size = ptmp - tmp;
if ( size > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0; }
gbmemcpy ( m_titleBuf , tmp , ptmp - tmp );
m_titleBufSize = size;
m_titleBufValid = true;
// ensure null terminated
if ( size > 0 && m_titleBuf[size-1] ) { char *xx=NULL;*xx=0; }
//ptr_siteTitleBuf = m_siteTitleBuf;
//size_siteTitleBuf = m_siteTitleBufSize;
return (char **)&m_titleBuf;
}
// . now we just get all the tagdb rdb recs to add using this function
// . then we just use the metalist to update tagdb
SafeBuf *XmlDoc::getNewTagBuf ( ) {
if ( m_newTagBufValid ) return &m_newTagBuf;
setStatus ( "getting new tags");
int32_t *ic = getIndexCode();
if ( ic == (void *)-1 ) { char *xx=NULL;*xx=0; }
// get our ip
int32_t *ip = getIp();
// this must not block to avoid re-computing "addme" above
if ( ip == (void *)-1 ) { char *xx=NULL;*xx=0; }
if ( ! ip || ip == (int32_t *)-1) return (SafeBuf *)ip;
// . do not bother if there is a problem
// . otherwise if our ip is invalid (0 or 1) we core in
// getNumSiteInlinks() which requires a valid ip
// . if it is robots.txt disallowed, then indexCode will be set, but we
// still want to cache our sitenuminlinks in tagdb! delicious.com was
// recomputing the sitelinkinfo each time because we were not storing
// these tags in tagdb!!
if ( ! *ip || *ip == -1 ) { // *ic ) {
m_newTagBuf.reset();
m_newTagBufValid = true;
return &m_newTagBuf;
}
// get the tags already in tagdb
TagRec *gr = getTagRec ( );
if ( ! gr || gr == (void *)-1 ) return (SafeBuf *)gr;
// get our site
char *mysite = getSite();
// this must not block to avoid re-computing "addme" above
if ( mysite == (void *)-1 ) { char *xx=NULL;*xx=0; }
if ( ! mysite || mysite == (char *)-1 ) return (SafeBuf *)mysite;
// age of tag in seconds
int32_t timestamp;
// always just use the primary tagdb so we can cache our sitenuminlinks
char rdbId = RDB_TAGDB;
//if ( m_useSecondaryRdbs ) rdbId = RDB2_TAGDB2;
//else rdbId = RDB_TAGDB;
// sitenuminlinks special for repair
if ( m_useSecondaryRdbs &&
// and not rebuilding titledb
! m_useTitledb ) {
m_newTagBuf.reset();
m_newTagBufValid = true;
int32_t old1 = gr->getLong("sitenuminlinks",-1,NULL,&timestamp);
if ( old1 == m_siteNumInlinks &&
old1 != -1 &&
! m_updatingSiteLinkInfoTags )
return &m_newTagBuf;
int32_t now = getTimeGlobal();
if ( g_conf.m_logDebugLinkInfo )
log("xmldoc: adding tag site=%s sitenuminlinks=%"INT32"",
mysite,m_siteNumInlinks);
if ( ! m_newTagBuf.addTag2(mysite,"sitenuminlinks",now,
"xmldoc",
*ip,m_siteNumInlinks,rdbId) )
return NULL;
return &m_newTagBuf;
}
// if doing consistency check, this buf is for adding to tagdb
// so just ignore those. we use ptr_tagRecData in getTagRec() function
// but this is really for updating tagdb.
if ( m_doingConsistencyCheck ) {
m_newTagBuf.reset();
m_newTagBufValid = true;
return &m_newTagBuf;
}
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
char *isIndexed = getIsIndexed();
if ( !isIndexed || isIndexed==(char *)-1 ) return (SafeBuf *)isIndexed;
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (SafeBuf *)isRoot;
int32_t *siteNumInlinks = getSiteNumInlinks();
if ( ! siteNumInlinks ) return NULL;
if ( siteNumInlinks == (int32_t *)-1) return (SafeBuf *)-1;
// ok, get the sites of the external outlinks and they must
// also be NEW outlinks, added to the page since the last time
// we spidered it...
Links *links = getLinks ();
if ( ! links || links == (Links *)-1 ) return (SafeBuf *)links;
// our next slated spider priority
char *spiderLinks = getSpiderLinks();
if ( ! spiderLinks || spiderLinks == (char *)-1 )
return (SafeBuf *)spiderLinks;
// . get ips of all outlinks.
// . use m_msgeForIps class just for that
// . it sucks if the outlink's ip is a dns timeout, then we never
// end up being able to store it in tagdb, that is why when
// rebuilding we need to skip adding firstip tags for the outlinks
int32_t **ipv = NULL;
TagRec ***grv = NULL;
bool addLinkTags = true;
if ( ! *spiderLinks ) addLinkTags = false;
if ( ! m_useSpiderdb ) addLinkTags = false;
if ( addLinkTags ) {
ipv = getOutlinkFirstIpVector ();
if ( ! ipv || ipv == (void *)-1 ) return (SafeBuf *)ipv;
// . uses m_msgeForTagRecs for this one
grv = getOutlinkTagRecVector();
if ( ! grv || grv == (void *)-1 ) return (SafeBuf *)grv;
}
// get root langid of root page
uint8_t *rl = getRootLangId();
if ( ! rl || rl == (void *)-1 ) return (SafeBuf *)rl;
char *hci = getHasContactInfo();
if ( ! hci || hci == (char *)-1 ) return (SafeBuf *)hci;
// get the address class
Addresses *aa = getAddresses ();
if ( ! aa || aa == (Addresses *)-1 ) return (SafeBuf *)aa;
// get comma separated list of email address on page
char *emails = getEmailBuf ( );
if ( ! emails || emails == (void *)-1 ) return (SafeBuf *)emails;
#ifdef _USETURKS_
//HashTableX *tvt = getTurkVotingTable ();
//if ( ! tvt || tvt == (void *)-1 ) return (SafeBuf *)tvt;
#endif
//
// init stuff
//
// . this gets the root doc and and parses titles out of it
// . sets our m_rootTitleBuf/m_rootTitleBufSize
char **rtbufp = getRootTitleBuf();
if ( ! rtbufp || rtbufp == (void *)-1) return (SafeBuf *)rtbufp;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// overwrite "getting root title buf" status
setStatus ("computing new tags");
if ( g_conf.m_logDebugLinkInfo )
log("xmldoc: adding tags for mysite=%s",mysite);
// shortcut
//TagRec *tr = &m_newTagRec;
// current time
int32_t now = getTimeGlobal();
// actually, use spider download time if we can. that way
// Test.cpp's injection runs will be more consistent!
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
now = getSpideredTime();//m_spideredTime;
}
// store tags into here
SafeBuf *tbuf = &m_newTagBuf;
// allocate space to hold the tags we will add
Tag *tag;
int32_t need = 512;
// add in root title buf in case we add it too
need += m_rootTitleBufSize;
// reserve it all now
if ( ! tbuf->reserve(need) ) return NULL;
//
// add root langid if we need to
//
char *oldrl = gr->getString("rootlang",NULL,&timestamp);
// assume no valid id
int32_t oldrlid = -99;
// convert to id
if ( oldrl ) oldrlid = getLangIdFromAbbr ( oldrl );
// if not in old tag, or changed from what was in tag, or it has
// been 10 days or more, then update tagdb with this tag.
bool addRootLang = false;
if ( ! oldrl ) addRootLang = true;
if ( oldrlid != *rl ) addRootLang = true;
if ( now-timestamp > 10*86400 ) addRootLang = true;
// injects do not download the root doc for speed reasons, so do not
// bother for them unless the doc itself is the root.
if ( m_wasContentInjected && !*isRoot ) addRootLang = false;
// . get the two letter (usually) language code from the id
// . i think the two chinese languages are 5 letters
char *newrl = NULL;
if ( addRootLang )
// i've seen this return NULL because *rl is a corrupt 215
// for some reason
newrl = getLanguageAbbr( *rl );
if ( newrl )
tbuf->addTag3(mysite,"rootlang",now,"xmldoc",*ip,newrl,rdbId);
//
// add hascontactinfo if we need to
//
int32_t oldhci = gr->getLong("hascontactinfo",-1,NULL,&timestamp);
if ( oldhci == -1 || oldhci != *hci || now-timestamp > 10 *86400 ) {
char *val = "0";
if ( m_hasContactInfo ) val = "1";
tbuf->addTag3 (mysite,"hascontactinfo",now,"xmldoc",*ip,val,
rdbId);
}
//
// add "site" tag
//
char *oldsite = gr->getString("site",NULL);
if ( ! oldsite || strcmp(oldsite,mysite) || now-timestamp > 10*86400)
tbuf->addTag3(mysite,"site",now,"xmldoc",*ip,mysite,rdbId);
//
// add firstip if not there at all
//
char *oldfip = gr->getString("firstip",NULL);
// convert it
int32_t ip3 = 0;
if ( oldfip ) ip3 = atoip(oldfip);
// if not there or if bogus, add it!! should override bogus firstips
if ( ! ip3 || ip3 == -1 ) {
char *ipstr = iptoa(m_ip);
//if ( m_ip == 0 || m_ip == -1 ) { char *xx=NULL;*xx=0; }
//int32_t iplen = gbstrlen(ipstr);
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
tbuf->addTag3(mysite,"firstip",now,"xmldoc",*ip,ipstr,
rdbId);
}
//if ( strncmp(m_firstUrl.m_url,"http://delicious.com/",21)==0 )
// log("boo");
// sitenuminlinks
int32_t old1 = gr->getLong("sitenuminlinks",-1,NULL,&timestamp);
if ( old1 == -1 || old1 != m_siteNumInlinks ||
m_updatingSiteLinkInfoTags ) {
if ( g_conf.m_logDebugLinkInfo )
log("xmldoc: adding tag site=%s sitenuminlinks=%"INT32"",
mysite,m_siteNumInlinks);
if ( ! tbuf->addTag2(mysite,"sitenuminlinks",now,"xmldoc",
*ip,m_siteNumInlinks,rdbId) )
return NULL;
}
//int32_t old2, old3, old4;
// if running for diffbot crawlbot then isCustomCrawl is true
// so do not update the siteinlink info already in tagdb since i
// imported it from my main collection. we do not want to overwrite it.
// NO, because for single site crawls we bottleneck on msg25
// when there are millions of urls. we only skip this
// for the global-index and if already in tagdb!
// No, let's just not invalidate the sitenuminlinks* tags
// in XmlDoc::getSiteNumInlinks()
//if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) ) goto skipSiteInlinks;
// sitenuminlinksfresh
// old2 = gr->getLong("sitenuminlinksuniqueip",-1,NULL,&timestamp);
// if ( old2 == -1 || old2 != m_siteNumInlinksUniqueIp ||
// m_updatingSiteLinkInfoTags )
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniqueip",
// now,"xmldoc",
// *ip,m_siteNumInlinksUniqueIp,rdbId))
// return NULL;
// // sitepop
// old3 = gr->getLong("sitenuminlinksuniquecblock",-1,NULL,
// &timestamp);
// if ( old3 == -1 || old3 != m_siteNumInlinksUniqueCBlock ||
// m_updatingSiteLinkInfoTags )
// if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniquecblock",
// now,"xmldoc",
// *ip,m_siteNumInlinksUniqueCBlock,rdbId))
// return NULL;
// // total site inlinks
// old4 = gr->getLong("sitenuminlinkstotal",-1,NULL,
// &timestamp);
// if ( old4 == -1 || old4 != m_siteNumInlinksTotal ||
// m_updatingSiteLinkInfoTags )
// if ( ! tbuf->addTag2(mysite,"sitenuminlinkstotal",
// now,"xmldoc",
// *ip,m_siteNumInlinksTotal,rdbId))
// return NULL;
// skipSiteInlinks:
// get root title buf from old tag
char *data = NULL;
int32_t dsize = 0;
Tag *rt = gr->getTag("roottitles");
if ( rt ) {
data = rt->getTagData();
dsize = rt->getTagDataSize();
}
bool addRootTitle = false;
// store the root title buf if we need to. if we had no tag yet...
if ( ! rt )
addRootTitle = true;
// or if differs in size
else if ( dsize != m_rootTitleBufSize )
addRootTitle = true;
// or if differs in content
else if ( memcmp(data,m_rootTitleBuf,m_rootTitleBufSize))
addRootTitle =true;
// or if it is 10 days old or more
if ( now-timestamp > 10*86400 ) addRootTitle = true;
// but not if injected
if ( m_wasContentInjected && ! *isRoot ) addRootTitle = false;
// add it then
if ( addRootTitle &&
! tbuf->addTag(mysite,"roottitles",now,"xmldoc",
*ip,m_rootTitleBuf,m_rootTitleBufSize,
rdbId,true) )
return NULL;
//
// add the VENUEADDRESS tags
//
// init the dedup table so we do not add the same address many times
char dtbuf[1000];
HashTableX dt;
dt.set(8,0,32,dtbuf,1000,false,m_niceness,"xmldt");
// reset counts
int32_t numContactAddressTags = 0;
int32_t numContactEmailTags = 0;
int32_t tagType2 = getTagTypeFromStr ( "contactaddress" );
int32_t tagType3 = getTagTypeFromStr ( "contactemails" );
// before we add the sitevenue to the tagrec let's make sure it is
// not a dedup.. i.e. that we do not already have this address
// in there.
int32_t tagType = getTagTypeFromStr ( "venueaddress" );
// start at the first tag
tag = gr->getFirstTag();
// loop over all tags in the buf, see if we got a dup
for ( ; tag ; tag = gr->getNextTag ( tag ) ) {
// count current contact addresses we have
if ( tag->m_type == tagType2 ) numContactAddressTags++;
if ( tag->m_type == tagType3 ) numContactEmailTags++;
// skip if not a venueaddress tag
if ( tag->m_type != tagType ) continue;
// point to the serialized address
char *data = tag->getTagData();
// get that address hash i guess
uint64_t ah = getHashFromAddr ( data );
// add to dedup table - return NULL with g_errno set on error
if ( ! dt.addKey ( &ah ) ) return NULL;
}
int32_t na = aa->getNumAddresses();
// add up to 10 for now
for ( int32_t i = 0 ; i < na ; i++ ) {
// get it
Address *a = (Address *)aa->m_am.getPtr(i);
// check if venue
if ( ! ( a->m_flags & AF_VENUE_DEFAULT ) ) continue;
// must have street on the page, not pointing into a tagrec
// from tagdb... otherwise we keep re-adding
if ( a->m_street->m_a < 0 ) continue;
// dedup against the addresses already in tagdb for the
// venueaddress tag. TODO: can we use the dc[] array from
// Address.cpp? we would need another set of bit flags for the
// address class.
if ( dt.isInTable ( &a->m_hash ) ) continue;
// sanity
if ( a->m_hash == 0 ) { char *xx=NULL;*xx=0; }
// . serialize it
// . TODO: get rid of Address::addToTagRec() functions
char abuf[5000];
a->serialize ( abuf , 5000, m_firstUrl.getUrl(),false,true);
// store in safebuf of tags
if ( ! tbuf->addTag3 (mysite,"venueaddress",now,"xmldoc",
*ip,abuf,rdbId) ) return NULL;
// only add once
if ( ! dt.addKey (&a->m_hash) ) return NULL;
}
//
//
// contact info stuff
//
//
// ensure m_numContactAddresses etc. are valid
Address **ca = getContactAddresses();
// blocked?
if ( ! ca || ca == (void *)-1 ) return (SafeBuf *)ca;
// do not do this for root if multiple addresses. this
// fixes http://obits.abqjournal.com/
if ( *isRoot && aa->m_uniqueStreetHashes > 1 ) na = 0;
// do not store more than 2 contact addresses, or 2 contact emails
// to avoid tagdb bloat. and also because we do not need that many.
// . store contact address if we had one
// . this is a buffer of Address ptrs
for ( int32_t i = 0 ; i < m_numContactAddresses ; i++ ) {
// stop on breach
if ( numContactAddressTags >= 2 ) break;
// inc it
numContactAddressTags++;
// breathe
QUICKPOLL(m_niceness);
// get it
Address *a = ca[i];
// . serialize it
// . TODO: get rid of Address::addToTagRec() functions
char abuf[5000];
a->serialize ( abuf , 5000, m_firstUrl.getUrl(),false,true);
// store in safebuf of tags
if ( ! tbuf->addTag3 (mysite,"contactaddress",now,"xmldoc",
*ip,abuf,rdbId) ) return NULL;
}
// . add email addresses and submission forms to tag
// . this does not block, so make sure only called once!
// . contact emails. comma separated list
if ( emails && numContactEmailTags <= 1 ) {
numContactEmailTags++;
if ( ! tbuf->addTag3 (mysite,"contactemails",now,"xmldoc",
*ip,emails,rdbId) ) return NULL;
}
//
//
// NOW add tags for our outlinks
//
//
bool oldHighQualityRoot = true;
// if we are new, do not add anything, because we only add a tagdb
// rec entry for "new" outlinks that were added to the page since
// the last time we spidered it
if ( ! *isIndexed ) oldHighQualityRoot = false;
// special tags for google search results pages for scraping
char inGoogle = false;
if ( strstr(mysite,"google.com") ) inGoogle = true;
// no updating if we are not root
if ( ! inGoogle && ! *isRoot ) oldHighQualityRoot = false;
// must be high quality, too
if ( ! inGoogle && *siteNumInlinks < 500 ) oldHighQualityRoot = false;
// . if we are a google url then add tags for each outlink!
// . more google special tags to replace Scraper.cpp
char *fu = m_firstUrl.getUrl();
//char *name = NULL;
bool inGoogleBlogs = false;
bool inGoogleNews = false;
if ( ! strncmp ( fu , "http://www.google.com/blogsearch?", 33 ) )
inGoogleBlogs = true;
if ( ! strncmp ( fu , "http://blogsearch.google.com/blogsearch?", 40 ))
inGoogleBlogs = true;
if ( ! strncmp ( fu , "http://news.google.com/", 23 ))
inGoogleNews = true;
// only do once per site
char buf[1000];
HashTableX ht; ht.set (4,0,-1 , buf , 1000 ,false,m_niceness,"sg-tab");
// get site of outlink
SiteGetter siteGetter;
// . must be from an EXTERNAL DOMAIN and must be new
// . we should already have its tag rec, if any, since we have msge
int32_t n = links->getNumLinks();
// not if not spidering links
if ( ! addLinkTags ) n = 0;
// get the flags
linkflags_t *flags = links->m_linkFlags;
// scan all outlinks we have on this page
for ( int32_t i = 0 ; i < n ; i++ ) {
// get its tag rec
TagRec *gr = (*grv)[i];
// does this hostname have a "firstIp" tag?
char *ips = gr->getString("firstip",NULL);
bool skip = false;
// skip if we are not "old" high quality root
if ( ! oldHighQualityRoot ) skip = true;
// . skip if not external domain
// . we added this above, so just "continue"
if ( flags[i] & LF_SAMEDOM ) continue;//skip = true;
// skip links in the old title rec
if ( flags[i] & LF_OLDLINK ) skip = true;
// skip if determined to be link spam! should help us
// with the text ads we hate so much
if ( links->m_spamNotes[i] ) skip = true;
// if we should skip, and they have firstip already...
if ( skip && ips ) continue;
// get the normalized url
char *url = links->getLinkPtr(i);
// get the site. this will not block or have an error.
siteGetter.getSite(url,gr,timestamp,cr->m_collnum,m_niceness);
// these are now valid and should reference into
// Links::m_buf[]
char *site = siteGetter.m_site;
int32_t siteLen = siteGetter.m_siteLen;
int32_t linkIp = (*ipv)[i];
// get site hash
uint32_t sh = hash32 ( site , siteLen );
// ensure site is unique
if ( ht.getSlot ( &sh ) >= 0 ) continue;
// add it. returns false and sets g_errno on error
if ( ! ht.addKey ( &sh ) ) return NULL;
// . need to add firstip tag for this link's subdomain?
// . this was in Msge1.cpp but now we do it here
if ( ! ips && linkIp && linkIp != -1 ) {
// make it
char *ips = iptoa(linkIp);
if (!tbuf->addTag3(site,"firstip",now,"xmldoc",*ip,ips,
rdbId))
return NULL;
}
if ( skip ) continue;
// if outlink is a .gov or .edu site, do not bother, because
// getIsSpam() always returns false for those
// TODO: verify this
//if ( flags[i] & LF_EDUTLD ) continue;
//if ( flags[i] & LF_GOVTLD ) continue;
// this must be valid
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
//int32_t timestamp = m_spideredTime;
// how much avail for adding tags?
int32_t avail = tbuf->getAvail();
// reserve space
int32_t need = 512;
// make sure enough
if ( need > avail && ! tbuf->reserve ( need ) ) return NULL;
// add tag for this outlink
if ( inGoogle ) {// && ! gr->getTag("ingoogle") ) {
if ( ! tbuf->addTag(site,"ingoogle",now,"xmldoc",
*ip,"1",2,rdbId,true) )
return NULL;
}
if ( inGoogleBlogs && //! gr->getTag("ingoogleblogs") &&
!tbuf->addTag(site,"ingoogleblogs",now,"xmldoc",*ip,"1",2,
rdbId,true))
return NULL;
if ( inGoogleNews && //! gr->getTag("ingooglenews") &&
!tbuf->addTag(site,"ingooglenews",now,"xmldoc",*ip,"1",2,
rdbId,true))
return NULL;
// link is linked to by a high quality site! 500+ inlinks.
if ( gr->getNumTagTypes("authorityinlink") < 5 &&
! tbuf->addTag(site,"authorityinlink",now,"xmldoc",
*ip,"1",2,rdbId,true) )
return NULL;
}
m_newTagBufValid = true;
return &m_newTagBuf;
}
//
//
// BEGIN OLD SPAM.CPP class
//
//
#define WTMPBUFSIZE (MAX_WORDS *21*3)
// . RULE #28, repetitive word/phrase spam detector
// . sets the "spam" member of each word from 0 (no spam) to 100 (100% spam)
// . "bits" describe each word in phrasing terminology
// . if more than maxPercent of the words are spammed to some degree then we
// consider all of the words to be spammed, and give each word the minimum
// score possible when indexing the document.
// . returns NULL and sets g_errno on error
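// . illustrative example (hypothetical counts): if a doc has 200 distinct
//   candidate words and 60 of them get flagged by setSpam(), that is 30%,
//   which exceeds maxPercent (25 here), so every indexable word is bumped
//   to 99% spam and m_totallySpammed is set
// . the returned vector holds ranks, not percents: each entry is converted
//   to 0..MAXWORDSPAMRANK below, where MAXWORDSPAMRANK means "no spam"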
char *XmlDoc::getWordSpamVec ( ) {
if ( m_wordSpamBufValid ) {
char *wbuf = m_wordSpamBuf.getBufStart();
if ( ! wbuf ) return (char *)0x01;
return wbuf;
}
setStatus("getting word spam vec");
// assume not the repeat spammer
m_isRepeatSpammer = false;
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (char *)words;
m_wordSpamBuf.purge();
int32_t nw = words->getNumWords();
if ( nw <= 0 ) {
m_wordSpamBufValid = true;
return (char *)0x01;
}
Phrases *phrases = getPhrases ();
if ( ! phrases || phrases == (void *)-1 ) return (char *)phrases;
Bits *bits = getBits();
if ( ! bits ) return (char *)NULL;
m_wordSpamBufValid = true;
//if ( m_isLinkText ) return true;
//if ( m_isCountTable ) return true;
// shortcuts
//Words *words = m_words;
//Bits *bits = m_bits;
// if 20 words totally spammed, call it all spam?
m_numRepeatSpam = 20;
// shortcut
int32_t sni = m_siteNumInlinks;
if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; }
// set "m_maxPercent"
int32_t maxPercent = 6;
if ( sni > 10 ) maxPercent = 8;
if ( sni > 30 ) maxPercent = 10;
if ( sni > 100 ) maxPercent = 20;
if ( sni > 500 ) maxPercent = 30;
// fix this a bit so we're not always totally spammed
maxPercent = 25;
// assume not totally spammed
m_totallySpammed = false;
// get # of words we have to set spam for
int32_t numWords = words->getNumWords();
// set up the size of the hash table (number of buckets)
int32_t size = numWords * 3;
// . add a tmp buf as a scratch pad -- will be freed right after
// . allocate this second to avoid mem fragmentation more
// . * 2 for double the buckets
char tmpBuf [ WTMPBUFSIZE ];
char *tmp = tmpBuf;
int32_t need = (numWords * 21) * 3 + numWords;
if ( need > WTMPBUFSIZE ) {
tmp = (char *) mmalloc ( need , "Spam" );
if ( ! tmp ) {
log("build: Failed to allocate %"INT32" more "
"bytes for spam detection: %s.",
need,mstrerror(g_errno));
return NULL;
}
}
QUICKPOLL(m_niceness);
// set up ptrs
char *p = tmp;
// first this
unsigned char *spam = (unsigned char *)p; p += numWords ;
// . this allows us to make linked lists of indices of words
// . i.e. next[13] = 23--> word #23 FOLLOWS word #13 in the linked list
int32_t *next = (int32_t *)p; p += size * 4;
// hash of this word's stem (or word itself if useStem if false)
int64_t *bucketHash = (int64_t *)p; p += size * 8;
// that word's position in document
int32_t *bucketWordPos = (int32_t *)p; p += size * 4;
// profile of a word
int32_t *profile = (int32_t *)p; p += size * 4;
// is it a common word?
char *commonWords = (char *)p; p += size * 1;
// sanity check
if ( p - tmp > need ) { char *xx=NULL;*xx=0; }
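// illustrative sizing (hypothetical numWords of 1000): size is 3000, so
// the scratch space is 1000 (spam) + 12000 (next) + 24000 (bucketHash) +
// 12000 (bucketWordPos) + 12000 (profile) + 3000 (commonWords) = 64000
// bytes, which matches need = 1000*21*3 + 1000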
// clear all our spam percentages for these words
memset ( spam , 0 , numWords );
int32_t np;
// clear the hash table
int32_t i;
for ( i = 0 ; i < size ; i++ ) {
bucketHash [i] = 0;
bucketWordPos[i] = -1;
commonWords [i] = 0;
}
// count position since Words class can now have tags in it
//
//int32_t pos = 0;
//bool usePos = false;
//if ( words->m_tagIds ) usePos = true;
int64_t *wids = words->getWordIds();
// . loop through each word
// . hash their stems and place in linked list
// . if no stemming then don't do stemming
for ( i = 0 ; i < numWords ; i++ ) {
// . skip punctuation
// . this includes tags now , too i guess
//if ( words->isPunct(i) ) continue;
if ( wids[i] == 0 ) continue;
// skip if will not be indexed cuz score is too low
//if ( wscores && wscores[i] <= 0 ) continue;
QUICKPOLL(m_niceness);
// TODO: get phrase stem if stemming is on
// store the phrase stem of this word into the buffer
// blen = words->getPhraseStem(i,buf,100);
// if (blen<=0) continue;
// get the hash of the ith word
int64_t h = words->getWordId(i);
// use secondary wordId if available
//if ( words->getStripWordId(i) )
// h = words->getStripWordId(i);
// "j" is the bucket index
int32_t j = (uint64_t)h % size;
// make sure j points to the right bucket
while (bucketHash[j]) {
if ( h == bucketHash[j] ) break;
if (++j == size) j = 0;
}
// if this bucket is occupied by a word then replace it but
// make sure it adds onto the "linked list"
if (bucketHash[j]) {
// if Words class contain tags as words, do this
//if ( usePos ) {
// next [pos] = bucketWordPos[j];
// bucketWordPos[ j] = pos++;
//}
//else {
// add onto linked list for the ith word
next[i] = bucketWordPos[j];
// replace bucket with index to this word
bucketWordPos[j] = i;
//}
}
// otherwise, we have a new occurrence of this word
else {
bucketHash [j] = h;
// if Words class contain tags as words, do this
//if ( usePos ) {
// bucketWordPos[ j] = pos++;
// next [pos] = -1;
//}
//else {
// store our position # (i) in bucket
bucketWordPos[j] = i;
// no next occurrence of the ith word yet
next[i] = -1;
//}
}
// if stop word or number then mark it
if ( bits->isStopWord(i) ) commonWords[j] = 1;
if ( words->isNum ( i ) ) commonWords[j] = 1;
}
// count distinct candidates that had spam and did not have spam
int32_t spamWords = 0;
int32_t goodWords = 0;
// . now cruise down the hash table looking for filled buckets
// . grab the linked list of indices and make a "profile"
for ( i = 0 ; i < size ; i++ ) {
// skip empty buckets
if (bucketHash[i] == 0) continue;
np=0;
// word #j is in bucket #i
int32_t j = bucketWordPos[i];
// . cruise down the linked list for this word
while ( j!=-1) {
// store position of occurence of this word in profile
profile [ np++ ] = j;
// get the position of next occurence of this word
j = next[ j ];
}
// if 2 or fewer occurrences of this word, don't check for spam
if ( np < 3 ) { goodWords++; continue; }
//
// set m_isRepeatSpammer
//
// look for a word repeated in phrases, in a big list,
// where each phrase is different
//
int32_t max = 0;
int32_t count = 0;
int32_t knp = np;
// must be 3+ letters, not a stop word, not a number
if ( words->m_wordLens[profile[0]] <= 2 || commonWords[i] )
knp = 0;
// scan to see if they are a tight list
for ( int32_t k = 1 ; k < knp ; k++ ) {
// breathe
QUICKPOLL(m_niceness);
// are they close together? if not, bail
if ( profile[k-1] - profile[k] >= 25 ) {
count = 0;
continue;
}
// otherwise inc it
count++;
// must have another word in between or tag
int32_t a = profile[k];
int32_t b = profile[k-1];
bool gotSep = false;
bool inLink = false;
for ( int32_t j = a+1 ; j <b ; j++ ) {
// if in link do not count, chinese spammer
// does not have his crap in links
if ( words->m_words[j][0] == '<' &&
words->m_wordLens[j]>=3 ) {
// get the next char after the <
char nc;
nc=to_lower_a(words->m_words[j][1]);
// now check it for anchor tag
if ( nc == 'a' ) {
inLink = true; break; }
}
if ( words->m_words[j][0] == '<' )
gotSep = true;
if ( is_alnum_a(words->m_words[j][0]) )
gotSep = true;
}
// . the chinese spammer always has a separator,
// usually another tag
// . and fix "BOW BOW BOW..." which has no separators
if ( ! gotSep ) count--;
else if ( inLink ) count--;
// get the max
if ( count > max ) max = count;
}
// a count of 50 such monsters indicates the chinese spammer
if ( max >= 50 )
m_isRepeatSpammer = true;
//
// end m_isRepeatSpammer detection
//
// . determine the probability this word was spammed by looking
//   at the distribution of its positions in the document
// . sets "spam" member of each word in this profile
// . don't check if word occurred 2 or fewer times
// . TODO: what about TORA! TORA! TORA!
// . returns true if 1+ occurrences were considered spam
QUICKPOLL(m_niceness);
bool isSpam = setSpam ( profile , np , numWords , spam );
// don't count stop words or numbers towards this threshold
if ( commonWords[i] ) continue;
// tally them up
if ( isSpam ) spamWords++;
else goodWords++;
}
// what percent of distinct candidate words were spammed?
int32_t totalWords = spamWords + goodWords;
// if there are no or very few candidate words, skip the percentage check
int32_t percent;
if ( totalWords <= 10 ) goto done;
percent = ( spamWords * 100 ) / totalWords;
// if more than maxPercent of the words were spammed, punish everybody now to 100% spam
// if we had < 100 candidates and < 20% spam, don't bother
//if ( percent < 5 ) goto done;
if ( percent <= maxPercent ) goto done;
// set flag so linkspam.cpp can see if all is spam and will not allow
// this page to vote
m_totallySpammed = true;
// now only set to 99 so each singleton usually gets hashed
for ( i = 0 ; i < numWords ; i++ )
if ( words->getWordId(i) && spam[i] < 99 )
spam[i] = 99;
done:
// update the weights for the words
//for ( i = 0 ; i < numWords ; i++ ) {
// m_ww[i] = ( m_ww[i] * (100 - spam[i]) ) / 100;
//}
// TODO: use the min word spam algo as in Phrases.cpp for this!
//for ( i = 0 ; i < numWords ; i++ ) {
// m_pw[i] = ( m_pw[i] * (100 - spam[i]) ) / 100;
//}
// convert from percent spammed into rank.. from 0 to 10 i guess
for ( i = 0 ; i < numWords ; i++ )
spam[i] = (MAXWORDSPAMRANK * (100 - spam[i])) / 100;
// copy into our buffer
if ( ! m_wordSpamBuf.safeMemcpy ( (char *)spam , numWords ) )
return NULL;
// free our temporary table stuff
if ( tmp != tmpBuf ) mfree ( tmp , need , "Spam" );
return m_wordSpamBuf.getBufStart();
}
// . a "profile" is an array of all the positions of a word in the document
// . a "position" is just the word #, like first word, word #8, etc...
// . we map "each" subProfile to a probability of spam (from 0 to 100)
// . if the profile is really big we get really slow (O(n^2)) iterating through
// many subProfiles
// . so after the first 50 occurrences the rest are automatically considered spam
// . return true if one word was spammed w/ probability > 20%
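// . illustrative example (hypothetical positions): a word repeated at
//   evenly spaced positions like 10, 20, 30, 40, ... makes getProbSpam()
//   return a high probability for some window/step combination, so those
//   occurrences get a high spam value; one occurrence is always left with
//   spam 0 so the term still gets indexed at least once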
bool XmlDoc::setSpam ( int32_t *profile, int32_t plen , int32_t numWords ,
unsigned char *spam ) {
// don't bother detecting spam if 2 or fewer occurrences of the word
if ( plen < 3 ) return false;
int32_t i;
// if we have more than 10 words and this word is 20% or more of
// them then all but the first occurrence is spammed
//log(LOG_INFO,"setSpam numRepeatSpam = %f", m_numRepeatSpam);
if (numWords > 10 && (plen*100)/numWords >= m_numRepeatSpam) {
for (i=1; i<plen; i++) spam[profile[i]] = 100;
return true ;
}
// . over 50 repeated words is ludicrous
// . set all past 50 to spam and continue detecting
// . no, our doc length based weight takes care of that kind of thing
//if (plen > 50 && m_version < 93 ) {
// // TODO: remember, profile[i] is in reverse order!! we should
// // really do i=0;i<plen-50, but this is obsolete anyway...
// for (i=50; i<plen;i++) m_spam[profile[i]] = 100;
// plen = 50;
//}
// we have to do this otherwise it takes FOREVER to do for plens in
// the thousands, like i saw a plen of 8338!
if ( plen > 50 ) { // && m_version >= 93 ) {
// . set all but the last 50 to a spam of 100%
// . the last 50 actually occur as the first 50 in the doc
for (i=0; i<plen-50;i++) spam[profile[i]] = 100;
// we now have only 50 occurences
plen = 50;
// we want to skip the first plen-50 because they actually
// occur at the END of the document
profile += plen - 50;
}
QUICKPOLL(m_niceness);
// higher quality docs allow more "freebies", but only starting with
// version 93... (see Titledb.h)
// profile[i] is actually in reverse order so we subtract off from wlen
//int32_t off ;
//if ( m_version >= 93 ) {
// off = (m_docQuality - 30) / 3;
// if ( off < 0 ) off = 0;
//}
// just use 40% "quality"
int32_t off = 3;
// . now the nitty-gritty part
// . compute all sub sequences of the profile
// . similar to a compression scheme (wavelets?)
// . TODO: word positions should count by two's since punctuation is
// not included so start step @ 2 instead of 1
// . if "step" is 1 we look at every word position in the profile
// . if "step" is 2 we look at every other word position
// . if "step" is 3 we look at every 3rd word position, etc...
int32_t maxStep = plen / 4;
if ( maxStep > 4 ) maxStep = 4;
// . loop through all possible tuples
int32_t window, wlen, step, prob;
for ( step = 1 ; step <= maxStep ; step++ ) {
for ( window = 0 ; window + 3 < plen ; window+=1) {
for (wlen = 3; window+wlen <= plen ; wlen+=1) {
// continue if step isn't aligned with window
// length
if (wlen % step != 0) continue;
// . get probability that this tuple is spam
// . returns 0 to 100
prob = getProbSpam ( profile + window ,
wlen , step);
// printf("(%i,%i,%i)=%i\n",step,window,
// wlen,prob);
// . if the probability is too low continue
// . was == 100
if ( prob <= 20 ) continue;
// set the spammed words spam to "prob"
// only if it's bigger than their current spam
for (i=window; i<window+wlen;i++) {
// first occurrences can have immunity
// due to doc quality being high
if ( i >= plen - off ) break;
if (spam[profile[i]] < prob)
spam[profile[i]] = prob;
}
QUICKPOLL(m_niceness);
}
}
}
// was this word spammed at all?
bool hadSpam = false;
for (i=0;i<plen;i++) if ( spam[profile[i]] > 20 ) hadSpam = true;
// make sure at least one word survives
for (i=0;i<plen;i++) if ( spam[profile[i]] == 0) return hadSpam;
// clear the spam level on this guy
spam[profile[0]] = 0;
// return true if we had spam, false if not
return hadSpam;
}
bool getWordPosVec ( Words *words ,
Sections *sections,
//int32_t wordStart,
//int32_t wordEnd,
int32_t startDist, // m_dist
char *fragVec,
int32_t niceness ,
SafeBuf *wpos ) {
int32_t dist = startDist; // 0;
Section *lastsx = NULL;
int32_t tagDist = 0;
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
nodeid_t *tids = words->m_tagIds;
int64_t *wids = words->m_wordIds;
int32_t *wlens = words->getWordLens();
char **wptrs = words->getWords();
int32_t nw = words->getNumWords();
if ( ! wpos->reserve ( nw * 4 ) ) return false;
int32_t *wposvec = (int32_t *)wpos->getBufStart();
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL(niceness);
// save it
wposvec[i] = dist;
// tags affect the distance/wordposition cursor
if ( tids && tids[i] ) {
// tag distance affects
nodeid_t tid = tids[i] & BACKBITCOMP;
if ( isBreakingTagId ( tid ) ) tagDist += SENT_UNITS;
dist++;
continue;
}
// . and so do sequences of punct
// . must duplicate this code in Query.cpp for setting
// QueryWord::m_posNum
if ( ! wids[i] ) {
// simple space or sequence of just white space
if ( words->isSpaces(i) )
dist++;
// 'cd-rom'
else if ( wptrs[i][0]=='-' && wlens[i]==1 )
dist++;
// 'mr. x'
else if ( wptrs[i][0]=='.' && words->isSpaces2(i,1))
dist++;
// animal (dog)
else
dist += 2;
continue;
}
// ignore if in repeated fragment
if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) {
dist++; continue; }
Section *sx = NULL;
if ( sp ) {
sx = sp[i];
// ignore if in style tag, etc. and do not
// increment the distance
if ( sx->m_flags & NOINDEXFLAGS )
continue;
}
// different sentence?
if ( sx &&
( ! lastsx ||
sx->m_sentenceSection != lastsx->m_sentenceSection ) ) {
// separate different sentences with 30 units
dist += SENT_UNITS; // 30;
// limit this!
if ( tagDist > 120 ) tagDist = 120;
// and add in tag distances as well here, otherwise
// we do not want "<br>" to really increase the
// distance if the separated words are in the same
// sentence!
dist += tagDist;
// new last then
lastsx = sx;
// store the vector AGAIN
wposvec[i] = dist;
}
tagDist = 0;
dist++;
}
return true;
}
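// illustrative example for getWordPosVec() above (hypothetical fragment):
// in "cheap hotels. <br> best rates", "cheap" and "hotels" get consecutive
// positions, the punctuation and <br> only nudge the cursor, and, assuming
// Sections puts "best rates" in a new sentence, "best" jumps ahead by
// SENT_UNITS (30) plus any accumulated breaking-tag distance, which keeps
// cross-sentence word pairs from looking adjacent when these positions are
// used for posdb keys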
bool getDensityRanks ( int64_t *wids ,
int32_t nw ,
int32_t hashGroup ,
SafeBuf *densBuf ,
Sections *sections ,
int32_t niceness ) {
//int32_t nw = wordEnd - wordStart;
// make the vector
if ( ! densBuf->reserve ( nw ) ) return false;
// convenience
char *densVec = densBuf->getBufStart();
// clear i guess
memset ( densVec , 0 , nw );
if ( hashGroup != HASHGROUP_BODY &&
hashGroup != HASHGROUP_HEADING )
sections = NULL;
// scan the sentences if we got those
Section *ss = NULL;
if ( sections ) ss = sections->m_firstSent;
// sanity
//if ( sections && wordStart != 0 ) { char *xx=NULL;*xx=0; }
for ( ; ss ; ss = ss->m_nextSent ) {
// breathe
QUICKPOLL(niceness);
// count of the alnum words in sentence
int32_t count = ss->m_alnumPosB - ss->m_alnumPosA;
// start with one word!
count--;
// how can it be less than one alnum word
if ( count < 0 ) continue;
// . base density rank on that
// . count is 0 for one alnum word now
int32_t dr = MAXDENSITYRANK - count;
// ensure not negative. make it at least 1. zero means un-set.
if ( dr < 1 ) dr = 1;
// mark all in sentence then
for ( int32_t i = ss->m_senta ; i < ss->m_sentb ; i++ ) {
// breathe
QUICKPOLL(niceness);
// assign
densVec[i] = dr;
}
}
// all done if using sections
if ( sections ) return true;
// count # of alphanumeric words in this string
int32_t na = 0;
for ( int32_t i = 0 ; i < nw ; i++ ) if ( wids[i] ) na++;
// a single alnum should map to 0 "na"
na--;
// wtf?
if ( na < 0 ) return true;
// compute density rank
int32_t dr = MAXDENSITYRANK - na ;
// at least 1 to not be confused with 0 which means un-set
if ( dr < 1 ) dr = 1;
// assign
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL(niceness);
// assign
densVec[i] = dr;
}
return true;
}
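// illustrative example for getDensityRanks() above (hypothetical counts):
// a sentence with 8 alnum words gives count = 8 - 1 = 7, so each word in
// it gets dr = MAXDENSITYRANK - 7; a one-word heading gets count = 0 and
// the full MAXDENSITYRANK; ranks are floored at 1 because 0 means un-set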
// . called by hashString() for hashing purposes, i.e. creating posdb keys
// . string is usually the document body or inlink text of an inlinker or
// perhaps meta keywords. it could be anything. so we need to create this
// vector based on that string, which is represented by words/phrases here.
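// . illustrative example (hypothetical counts): if "beds" occurs 10 times
//   and every occurrence sits inside the same phrase ("dog beds"), its
//   word-to-phrase ratio is high and getWordToPhraseRatioWeights() returns
//   a small weight, so "beds" gets a low diversity rank; if those 10
//   occurrences are spread over many different phrases the ratio is low
//   and "beds" keeps a much higher rank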
bool getDiversityVec ( Words *words ,
Phrases *phrases ,
HashTableX *countTable ,
SafeBuf *sbWordVec ,
//SafeBuf *sbPhraseVec ,
int32_t niceness ) {
int64_t *wids = words->getWordIds ();
//nodeid_t *tids = words->getTagIds ();
int32_t nw = words->getNumWords();
int64_t *pids = phrases->getPhraseIds2();
// . make the vector
// . it will be diversity ranks, so one float per word for now
// cuz we convert to rank below though, one byte rank
if ( ! sbWordVec ->reserve ( nw*4 ) ) return false;
//if ( ! sbPhraseVec->reserve ( nw*4 ) ) return false;
// get it
float *ww = (float *)sbWordVec ->getBufStart();
//float *pw = (float *)sbPhraseVec->getBufStart();
int32_t nexti = -10;
int64_t pidLast = 0;
// . now consider ourselves the last word in a phrase
// . adjust the score of the first word in the phrase to be
for ( int32_t i = 0 ; i < nw ; i++ ) {
// yield
QUICKPOLL ( niceness );
// skip if not alnum word
if ( ! wids[i] ) { ww[i] = 0.0; continue; }
// try to inline this
int64_t nextWid = 0;
int64_t lastPid = 0;
// how many words in the bigram?
int32_t nwp = phrases->getNumWordsInPhrase2(i);
if ( nwp > 0 ) nextWid = wids [i + nwp - 1] ;
if ( i == nexti ) lastPid = pidLast;
// get current pid
int64_t pid = pids[i];
// get the word and phrase weights for term #i
float ww2;
//float pw2;
getWordToPhraseRatioWeights ( lastPid , // pids[i-1],
wids[i] ,
pid ,
nextWid , // wids[i+1] ,
&ww2 ,
//&pw2 ,
countTable ,
1);//m_version );
// 0 to 1.0
if ( ww2 < 0 || ww2 > 1.0 ) { char *xx=NULL;*xx=0; }
// save the last phrase id
if ( nwp > 0 ) {
nexti = i + nwp - 1;
pidLast = pid; // pids[i] ;
}
// . apply the weights
// . do not hit all the way down to zero though...
// . Words.cpp::hash() will not index it then...
//if ( ww[i] > 0 ) {
ww[i] = ww2;
//}
/*
//if ( pw[i] > 0 ) {
pw[i] = (int32_t)(pw[i] * pw2);
if ( pw[i] <= 0 ) pw[i] = 1;
//}
// MDW: why was this here?
//if ( isLinkText ) continue;
// do not demote all the way to 0
//if ( ww[i] <= 0 ) ww[i] = 2;
// skip if phrase score is 0
if ( ! pw[i] ) continue;
if ( pid == 0 ) { pw[i] = 0; continue; }
// skip if does not start phrase
if ( nwp <= 0 ) continue;
// sanity check
if ( nwp == 99 ) { char *xx = NULL; *xx = 0; }
// now mod the score
float avg = pw[i];
// weight by punct in between
//for ( int32_t j = i+1 ; j < i+nwp ; j++ ) {
// if ( wids[j] ) continue;
// avg = (avg * (int64_t)pw[j]) / DW;
//}
// do not demote all the way to zero, we still want to index it
// and when normalized on a 100 point scale, like when printed
// out by PageParser.cpp, a score of 1 here gets normalized to
// 0, so make sure it is at least 2.
if ( avg < 2 )
avg = 2;
// set that as our new score
pw[i] = avg;
*/
}
// overwrite the array of floats with an array of chars (ranks)
char *nww = (char *)ww;
//char *npw = (char *)pw;
// convert from float into a rank from 0-15
for ( int32_t i = 0 ; i < nw ; i++ ) {
if ( ! ww[i] ) { nww[i] = 0; continue; }
// scale the float weight from getWordToPhraseRatioWeights() into a rank;
// .55 approximates the top of the usual weight range, and anything that
// maps above MAXDIVERSITYRANK is clamped just below
char wrank = (char) ((ww[i] * ((float)MAXDIVERSITYRANK))/.55);
// sanity
if ( wrank > MAXDIVERSITYRANK ) wrank = MAXDIVERSITYRANK;
if ( wrank < 0 ) { char *xx=NULL;*xx=0; }
//char prank = (char) ((pw[i] * 15.0) / 2.50);
// assign now
nww[i] = wrank;
//npw[i] = prank;
}
return true;
}
// match word sequences of NUMWORDS or more words
#define NUMWORDS 5
// . repeated sentence frags
// . 1-1 with words in body of doc
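// illustrative sketch of the rolling hash used below (w0..w5 stand for the
// alnum word ids in document order; hypothetical names):
//
//   h = w0 ^ w1 ^ w2 ^ w3 ^ w4   // hash of the window starting at word 0
//   h ^= w0; h ^= w5;            // slide: now h covers words 1..5
//
// identical 5-word fragments hash to the same bucket wherever they appear,
// and since xor is order-independent, longer matches are then confirmed
// word-by-word in the matchLoop below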
char *XmlDoc::getFragVec ( ) {
if ( m_fragBufValid ) {
char *fb = m_fragBuf.getBufStart();
if ( ! fb ) return (char *)0x01;
return fb;
}
setStatus("getting frag vec");
Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (char *)words;
Bits *bits = getBits();
if ( ! bits ) return NULL;
m_fragBuf.purge();
// ez vars
int64_t *wids = words->getWordIds ();
int32_t nw = words->getNumWords();
// if no words, nothing to do
if ( nw == 0 ) {
m_fragBufValid = true;
return (char *)0x01;//true;
}
// truncate for performance reasons. i've seen this be over 4M
// and it was VERY VERY SLOW... over 10 minutes...
// - i saw this take over 200MB for an alloc for
// WeightsSet3 below, so lower from 200k to 50k. this will probably
// make parsing inconsistencies for really large docs...
if ( nw > MAXFRAGWORDS ) nw = MAXFRAGWORDS;
int64_t ringWids [ NUMWORDS ];
int32_t ringPos [ NUMWORDS ];
int32_t ringi = 0;
int32_t count = 0;
uint64_t h = 0;
// . make the hash table
// . make it big enough so there are gaps, so chains are not too long
int32_t minBuckets = (int32_t)(nw * 1.5);
uint32_t nb = 2 * getHighestLitBitValue ( minBuckets ) ;
int32_t need = nb * (8+4+4);
char *buf = NULL;
char tmpBuf[50000];
if ( need < 50000 ) buf = tmpBuf;
else buf = (char *)mmalloc ( need , "WeightsSet3" );
char *ptr = buf;
uint64_t *hashes = (uint64_t *)ptr; ptr += nb * 8;
int32_t *vals = (int32_t *)ptr; ptr += nb * 4;
float *ww = (float *)ptr; ptr += nb * 4;
if ( ! buf ) return NULL;
for ( int32_t i = 0 ; i < nw ; i++ ) ww[i] = 1.0;
if ( ptr != buf + need ) { char *xx=NULL;*xx=0; }
// make the mask
uint32_t mask = nb - 1;
// clear the hash table
memset ( hashes , 0 , nb * 8 );
// clear ring of hashes
memset ( ringWids , 0 , NUMWORDS * 8 );
// for sanity check
int32_t lastStart = -1;
// . hash EVERY NUMWORDS-word sequence in the document
// . if we get a match look and see what sequences it matches
// . we allow multiple instances of the same hash to be stored in
// the hash table, so keep checking for a matching hash until you
// chain to a 0 hash, indicating the chain ends
// . check each matching hash to see if more than NUMWORDS words match
// . get the max words that matched from all of the candidates
// . demote the word and phrase weights based on the total/max
// number of words matching
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if not alnum word
if ( ! wids[i] ) continue;
// yield
QUICKPOLL ( m_niceness );
// add new to the 5 word hash
h ^= wids[i];
// . remove old from 5 word hash before adding new...
// . initial ring wids are 0, so should be benign at startup
h ^= ringWids[ringi];
// add to ring
ringWids[ringi] = wids[i];
// save our position
ringPos[ringi] = i;
// wrap the ring ptr if we need to, that is why we are a ring
if ( ++ringi >= NUMWORDS ) ringi = 0;
// this 5-word sequence starts with word # "start"
int32_t start = ringPos[ringi];
// need at least NUMWORDS words in ring buffer to do analysis
if ( ++count < NUMWORDS ) continue;
// . skip if it starts with a word which can not start phrases
// . that way "a new car" being repeated a lot will not
// decrease the weight of the phrase term "new car"
// . setCountTable() calls set3() with this set to NULL
//if ( bits && ! bits->canStartPhrase(start) ) continue;
// sanity check
if ( start <= lastStart ) { char *xx = NULL; *xx = 0; }
// reset max matched
int32_t max = 0;
// look up in the hash table
uint32_t n = h & mask;
// sanity breach check
if ( n >= nb ) { char *xx=NULL;*xx=0; }
loop:
// all done if empty
if ( ! hashes[n] ) {
// sanity check
//if ( n >= nb ) { char *xx = NULL; *xx = 0; }
// add ourselves to the hash table now
hashes[n] = h;
// sanity check
//if ( wids[start] == 0 ) { char *xx = NULL; *xx = 0; }
// this is where the 5-word sequence starts
vals [n] = start;
// save it
lastStart = start;
// debug point
//if ( start == 7948 )
// log("heystart");
// do not demote words if less than NUMWORDS matched
if ( max < NUMWORDS ) continue;
// . how much should we demote
// . 10 matching words pretty much means 0 weights
float demote = 1.0 - ((max-5)*.10);
if ( demote >= 1.0 ) continue;
if ( demote < 0.0 ) demote = 0.0;
// . RULE #26 ("long" phrases)
// . if we got 3, 4 or 5 in our matching sequence
// . basically divide by the # of *phrase* terms
// . multiply by 1/(N-1)
// . HOWEVER, should we also look at HOW MANY other
// sequences matches this too!???
//float demote = 1.0 / ((float)max-1.0);
// set3() is still called from setCountTable() to
// discount the effects of repeated fragments, and
// the count table only understands score or no score
//if ( max >= 15 ) demote = 0.0;
// demote the next "max" words
int32_t mc = 0;
int32_t j;
for ( j = start ; mc < max ; j++ ) {
// sanity
if ( j >= nw ) { char *xx=NULL;*xx=0; }
if ( j < 0 ) { char *xx=NULL;*xx=0; }
// skip if not an alnum word
if ( ! wids[j] ) continue;
// count it
mc++;
// demote it
ww[j] = (int32_t)(ww[j] * demote);
if ( ww[j] <= 0 ) ww[j] = 2;
}
// save the original i
int32_t mini = i;
// advance i, it will be incremented by 1 immediately
// after hitting the "continue" statement
i = j - 1;
// must be at least the original i, we are monotonic
// otherwise ringPos[] will not be monotonic and core
// dump ultimately cuz j and k will be equal below
// and we increment matched++ forever.
if ( i < mini ) i = mini;
// get next word
continue;
}
// get next in chain if hash does not match
if ( hashes[n] != h ) {
// wrap around the hash table if we hit the end
if ( ++n >= nb ) n = 0;
// check out bucket #n now
goto loop;
}
// how many words match so far
int32_t matched = 0;
// . we have to check starting at the beginning of each word
// sequence since the XOR compositional hash is order
// independent
// . see what word offset this guy has
int32_t j = vals[n] ;
// k becomes the start of the current 5-word sequence
int32_t k = start;
// sanity check
if ( j == k ) { char *xx = NULL; *xx = 0; }
// skip to next in chain to check later
if ( ++n >= nb ) n = 0;
// keep advancing k and j as long as the words match
matchLoop:
// get next wid for k and j
while ( k < nw && ! wids[k] ) k++;
while ( j < nw && ! wids[j] ) j++;
if ( k < nw && wids[k] == wids[j] ) {
matched++;
k++;
j++;
goto matchLoop;
}
// keep track of the max matched for i0
if ( matched > max ) max = matched;
// get another matching string of words, if possible
goto loop;
}
if ( nw <= 0 ) { char *xx=NULL;*xx=0;}
// make space
if ( ! m_fragBuf.reserve ( nw ) ) {
// save it
int32_t saved = g_errno;
if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" );
// reinstate it
g_errno = saved;
return NULL;
}
// validate
m_fragBufValid = true;
// handy ptr
char *ff = m_fragBuf.getBufStart();
// convert from floats into frag score, 0 or 1 really
for ( int32_t i = 0 ; i < nw ; i++ ) {
if ( ww[i] <= 0.0 ) ff[i] = 0;
else ff[i] = 1;
}
if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" );
// wtf?
if ( ! ff ) { char *xx=NULL;*xx=0; }
return ff;
}
float g_wtab[30][30];
// . inline this for speed
// . if a word repeats in different phrases, promote the word
// and demote the phrase
// . if a word repeats in pretty much the same phrase, promote
// the phrase and demote the word
// . if you have the window of text "new mexico good times"
// and word #i is mexico, then:
// pid1 is "new mexico"
// wid1 is "mexico"
// pid2 is "mexico good"
// wid2 is "good"
// . we store sliderParm in titleRec so we can update it along
// with title and header weights on the fly from the spider controls
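// . illustrative walk-through (hypothetical counts): if "mexico" occurs 10
//   times and every occurrence is inside "new mexico", the ratio is
//   10/10 = 1.0, so g_wtab[10][10] bottoms out at 0.0001 and the bare word
//   is worth almost nothing; if "new mexico" accounts for only 2 of those
//   10 occurrences the ratio is 0.2 and g_wtab[10][2] is 0.8, so the word
//   keeps most of its weight. the final *retww is further scaled by "mod"
//   (based on phrcountMax) and blended toward 1.0 by the sliderParm.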
void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase
int64_t wid1 ,
int64_t pid2 ,
int64_t wid2 , // post word
float *retww ,
//float *retpw ,
HashTableX *tt1 ,
int32_t titleRecVersion ) {
static float s_fsp;
// from 0 to 100
char sliderParm = g_conf.m_sliderParm;
// i'm not too keen on putting this as a parm in the CollectionRec
// because it is so cryptic...
//static char sliderParm = 25;
// . to support RULE #15 (word to phrase ratio)
// . these weights are based on the ratio of word to phrase count
// for a particular word
static char s_sp = -1;
if ( s_sp != sliderParm ) {
// . set it to the newly updated value
// . should range from 0 up to 100
s_sp = sliderParm;
// the float version
s_fsp = (float)sliderParm / 100.0;
// sanity test
if ( s_fsp < 0.0 || s_fsp > 1.0 ) { char *xx = NULL; *xx = 0; }
// i is the word count, how many times a particular word
// occurs in the document
for ( int32_t i = 0 ; i < 30 ; i++ ) {
// . k is the phrase count, how many times a particular phrase
// occurs in the document
// . k can be GREATER than i because we index only phrase terms
// sometimes when indexing neighborhoods, and not the
// single words that compose them
for ( int32_t k = 0 ; k < 30 ; k++ ) {
// do not allow phrase count to be greater than
// word count, even though it can happen since we
// add imported neighborhood pwids to the count table
int32_t j = k;
if ( k > i ) j = i;
// get ratio
//float ratio = (float)phrcount / (float)wrdcount;
float ratio = (float)j/(float)i;
// it should be impossible that this can be over 1.0
// but might happen due to hash collisions
if ( ratio > 1.0 ) ratio = 1.0;
// restrict the range we can weight a word or phrase
// based on the word count
//float r = 1.0;
//if ( i >= 20 ) r = 2.1;
//else if ( i >= 10 ) r = 1.8;
//else if ( i >= 4 ) r = 1.5;
//else r = 1.3;
//g_ptab[i][k] = 1.00;
g_wtab[i][k] = 1.00;
if ( i <= 1 ) continue;
// . we used to have a sliding bar between 0.0 and 1.0.
// word is weighted (1.0 - x) and phrase is weighted
// by (x). however, x could go all the way to 1.0
// even when i = 2, so we need to restrict x.
// . x is actually "ratio"
// . when we have 8 or fewer word occurrences, do not
// remove more than 80% of its score, a 1/5 penalty
// is good enough for now. but for words that occur
// a lot in the link text or pwids, go to town...
if ( i <= 2 && ratio >= .50 ) ratio = .50;
else if ( i <= 4 && ratio >= .60 ) ratio = .60;
else if ( i <= 8 && ratio >= .80 ) ratio = .80;
else if ( i <= 12 && ratio >= .95 ) ratio = .95;
// round up, so many "new mexico" phrases but only
// make it up to 95%...
if ( ratio >= .95 ) ratio = 1.00;
// if word's phrase is repeated 3 times or more then
// is a pretty good indication that we should weight
// the phrase more and the word itself less
//if ( k >= 3 && ratio < .90 ) ratio = .90;
// compute the weights
//float pw = 2.0 * ratio;
//float ww = 2.0 * (1.0 - ratio);
float ww = (1.0 - ratio);
// . punish words a little more
// . if we got 50% ratio, words should not get as much
// weight as the phrase
//ww *= .45;
// do not weight to 0, no less than .15
if ( ww < 0.0001 ) ww = 0.0001;
//if ( pw < 0.0001 ) pw = 0.0001;
// do not overpromote either
//if ( ww > 2.50 ) ww = 2.50;
//if ( pw > 2.50 ) pw = 2.50;
// . do a sliding weight of the weight
// . a "ww" of 1.0 means to do no weight
// . can't do this for ww cuz we use "mod" below
//float newWW = s_fsp*ww + (1.0-s_fsp)*1.00;
//float newPW = s_fsp*pw + (1.0-s_fsp)*1.00;
// limit how much we promote a word because it
// may occur 30 times total, but have a phrase count
// of only 1. however, the other 29 times it occurs it
// is in the same phrase, just not this particular
// phrase.
//if ( ww > 2.0 ) ww = 2.0;
g_wtab[i][k] = ww;
//g_ptab[i][k] = newPW;
//logf(LOG_DEBUG,"build: wc=%"INT32" pc=%"INT32" ww=%.2f "
//"pw=%.2f",i,k,g_wtab[i][k],g_ptab[i][k]);
}
}
}
int32_t phrcount1 = 0;
int32_t phrcount2 = 0;
int32_t wrdcount1 = 0;
int32_t wrdcount2 = 0;
if ( tt1->m_numSlotsUsed > 0 ) {
if (pid1) phrcount1 = tt1->getScore(&pid1);
if (pid2) phrcount2 = tt1->getScore(&pid2);
if (wid1) wrdcount1 = tt1->getScore(&wid1);
if (wid2) wrdcount2 = tt1->getScore(&wid2);
}
// if we are always ending the same phrase, like "Mexico"
// in "New Mexico"... get the most popular phrase this word is
// in...
int32_t phrcountMax = phrcount1;
int32_t wrdcountMin = wrdcount1;
// these must actually exist to be part of the selection
if ( pid2 && phrcount2 > phrcountMax ) phrcountMax = phrcount2;
if ( wid2 && wrdcount2 < wrdcountMin ) wrdcountMin = wrdcount2;
// . but if we are 'beds' and in a popular phrase like 'dog beds'
// there may be a lot of other phrases mentioned that have 'beds'
// in them like 'pillow beds', 'pet beds', but we need to assume
// that if phrcountMax is high enough, we should not give much weight to
// the word... otherwise you can subvert this algorithm by just
// adding other random phrases with the word 'bed' in them.
// . BUT, if a page has 'X beds' with a lot of different X's then you
// still want to index 'beds' with a high score!!! we are trying to
// balance those 2 things.
// . do this up here before you truncate phrcountMax below!!
float mod = 1.0;
if ( phrcountMax <= 6 ) mod = 0.50;
else if ( phrcountMax <= 8 ) mod = 0.20;
else if ( phrcountMax <= 10 ) mod = 0.05;
else if ( phrcountMax <= 15 ) mod = 0.03;
else mod = 0.01;
// scale wrdcount1/phrcountMax down for the g_wtab table
if ( wrdcount1 > 29 ) {
float ratio = (float)phrcountMax / (float)wrdcount1;
phrcountMax = (int32_t)((29.0 * ratio) + 0.5);
wrdcount1 = 29;
}
if ( phrcountMax > 29 ) {
float ratio = (float)wrdcount1 / (float)phrcountMax;
wrdcount1 = (int32_t)((29.0 * ratio) + 0.5);
phrcountMax = 29;
}
// . sanity check
// . neighborhood.cpp does not always have wid/pid pairs
// that match up right for some reason... so we can't do this
//if ( phrcount1 > wrdcount1 ) { char *xx = NULL; *xx = 0; }
//if ( phrcount2 > wrdcount2 ) { char *xx = NULL; *xx = 0; }
// apply the weights from the table we computed above
*retww = mod * g_wtab[wrdcount1][phrcountMax];
// slide it
*retww = s_fsp*(*retww) + (1.0-s_fsp)*1.00;
// ensure we do not punish too hard
if ( *retww <= 0.0 ) *retww = 0.01;
if ( *retww > 1.0 ) { char *xx=NULL;*xx=0; }
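// illustrative arithmetic (hypothetical values): with
// g_wtab[wrdcount1][phrcountMax] = 0.8, mod = 0.50 and sliderParm = 25
// (s_fsp = 0.25), the raw weight is 0.40 and the blended result is
// 0.25*0.40 + 0.75*1.00 = 0.85, so the slider softens how hard the
// word-to-phrase ratio can punish a term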
/*
if ( phrcountMax >= 0 ) {
int64_t sh = getPrefixHash ( (char *)NULL , 0 , NULL , 0 );
int64_t tid = g_indexdb.getTermId ( sh , wid1 );
logf(LOG_DEBUG,"build: phrcountMax=%"INT32" wrdCount1=%"INT32" "
"*ww=%.4f for word with tid=%"UINT64"",
phrcountMax,wrdcount1,(float)*ww,tid);
//if ( phrcountMax < 10 && tid == 16944700235015LL )
// log("hey");
}
*/
// sanity check
//if ( *ww == 0.0 ) { char *xx = NULL; *xx = 0; }
/*
// scale wrdcountMin/phrcount down for the g_ptab table
if ( wrdcountMin > 29 ) {
float ratio = (float)phrcount2 / (float)wrdcountMin;
phrcount2 = (int32_t)((29.0 * ratio) + 0.5);
wrdcountMin = 29;
}
if ( phrcount2 > 29 ) {
float ratio = (float)wrdcountMin / (float)phrcount2;
wrdcountMin = (int32_t)((29.0 * ratio) + 0.5);
phrcount2 = 29;
}
*/
// . if the word is Mexico in 'New Mexico good times' then
// phrase term #i which is, say, "Mexico good" needs to
// get the min word count when doings its word to phrase
// ratio.
// . it has two choices, it can use the word count of
// "Mexico" or it can use the word count of "good".
// . say, each is pretty high in the document so the phrase
// ends up getting penalized heavily, which is good because
// it is a nonsense phrase.
// . if we had "united socialist soviet republic" repeated
// a lot, the phrase "socialist soviet" would score high
// and the individual words would score low. that is good.
// . try to seek the highest weight possible for this phrase
// by choosing the lowest word count possible
// . NO LONGER AFFECT phrase weights because just because the
// words occur a lot in the document and this may be the only
// occurence of this phrase, does not mean we should punish
// the phrase. -- MDW
//*retpw = 1.0;
return;
// do it the old way...
//*pw = g_ptab[wrdcountMin][phrcount2];
// sanity check
//if ( *pw == 0.0 ) { char *xx = NULL; *xx = 0; }
}
// for registerSleepCallback
static void clockSyncWaitWrapper ( int fd , void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_masterLoop ( THIS->m_masterState );
}
// . a special call
// . returns -1 if blocked, 1 otherwise, 0 on error
char XmlDoc::waitForTimeSync ( ) {
// unregister?
if ( isClockInSync() && m_alreadyRegistered ) {
// note it
log("build: clock now synced for %s",m_firstUrl.m_url);
g_loop.unregisterSleepCallback(m_masterState,
clockSyncWaitWrapper);
}
// return 1 if synced!
if ( isClockInSync() ) return 1;
// already registered? wait another 1000ms
if ( m_alreadyRegistered ) return -1;
// flag it
m_alreadyRegistered = true;
// note it
log("build: waiting for clock to sync for %s",m_firstUrl.m_url);
// this should mean it is re-called later
if ( g_loop.registerSleepCallback ( 1000 , // 1000 ms
m_masterState ,
clockSyncWaitWrapper ,
m_niceness ))
// wait for it, return -1 since we blocked
return -1;
// if was not able to register, ignore delay
log("doc: failed to register clock wait callback");
return 0;
}
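// a minimal usage sketch (hypothetical caller, not from this file):
//
//   char rc = waitForTimeSync();
//   if ( rc == -1 ) return false; // blocked; m_masterLoop re-called later
//   if ( rc ==  0 ) { /* could not register, proceed without waiting */ }
//   // rc == 1: clock is synced, safe to use getTimeGlobal()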
////////////////////////////
//
// SCRAPING TOOLS
//
////////////////////////////
void doInjectLoopWrapper ( void *state ) {
XmlDoc *XD = (XmlDoc *)state;
// if it blocked, wait
if ( ! XD->doInjectLoop ( ) ) return;
// . if we did not inject any links, i guess we are done!
// . this happens if the ahrefs.com doc had the same outlinks
// as the ahrefs.com doc for another search result, they are all
// deduped and it does not block.
XD->m_finalCallback ( XD->m_finalState );
}
// . return false if blocks, true otherwise
// . return true and set error on error, with no blocks outstanding
// . TODO: make this work for the ahrefs.com list of links in xml feed
bool XmlDoc::injectLinks (HashTableX *linkDedupTablePtr ,
HashTableX *domDedupTablePtr,
void *finalState ,
void (* finalCallback)(void *)) {
// INJECT 10 at a time. xmldoc is 1MB.
int32_t i; for ( i = 0 ; i < MAX_XML_DOCS ; i++ ) {
XmlDoc *nd;
// continue if already set it. this was overwriting it
// and causing a mem leak before
if ( m_xmlDocs[i] ) continue;
try { nd = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
break;
}
mnew ( nd , sizeof(XmlDoc),"xmldocarr");
m_xmlDocs[i] = nd;
}
// did the alloc loop above break early (out of memory)?
if ( i < (int32_t)MAX_XML_DOCS ) {
log("scrape: one xmldoc alloc failed");
return true;
}
m_masterLoop = doInjectLoopWrapper;
m_masterState = this;
m_finalState = finalState;
m_finalCallback = finalCallback;
// note it
//log("xmldoc: injecting outlinks of %s",m_firstUrl.getUrl());
m_linkDedupTablePtr = linkDedupTablePtr;
m_domDedupTablePtr = domDedupTablePtr;
// loop over all links
m_i = 0;
m_blocked = 0;
memset ( m_used , 0 , (int32_t)MAX_XML_DOCS );
return doInjectLoop();
}
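// Illustrative sketch only: how a caller might wire up the dedup tables and
// kick off injectLinks(). The table sizes, "state" and "doneCallback" are
// assumptions, not taken from a real call site.
//
//   HashTableX linkDedup;
//   HashTableX domDedup;
//   linkDedup.set ( 4,0,1024,NULL,0,false,0,"lnkddt" );
//   domDedup .set ( 4,0,1024,NULL,0,false,0,"domddt" );
//   if ( ! xd->injectLinks ( &linkDedup, &domDedup, state, doneCallback ) )
//           return false; // blocked; doneCallback(state) is called once
//                         // every outlink injection has completed
//   // returned true: nothing blocked (check g_errno for errors)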
void doneInjectingWrapper ( void *state ) {
XmlDoc *xd = (XmlDoc *)state;
XmlDoc *XD = (XmlDoc *)xd->m_hack;
XD->doneInjecting ( xd );
}
// . return false if blocks, true otherwise
// . return true and set error on error, with no blocks outstanding
bool XmlDoc::doInjectLoop ( ) {
setStatus("inject outlinks");
//Links *links = getLinks();
//if ( ! links ) return (m_blocked == 0);
//if ( links == (void *)-1 ) return false;
Sections *sections = getSections();
if ( ! sections ) return (m_blocked == 0);
if ( sections == (void *)-1 ) return false;
Links *links = getLinks();
if ( ! links ) return (m_blocked == 0);
if ( links == (void *)-1 ) return false;
Words *words = getWords();
if ( ! words ) return (m_blocked == 0);
if ( words == (void *)-1 ) return false;
Bits *bp = getBits();
if ( ! bp ) return (m_blocked == 0);
if ( bp == (void *)-1 ) return false;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
int32_t n = links->getNumLinks();
Url tmpUrl;
Section *prev = NULL;
// scan the links now
for ( ; m_i < n ; ) {
// get xml doc then
int32_t j; for ( j = 0 ; j < MAX_XML_DOCS ; j++ )
if ( ! m_used[j] ) break;
// no free slot? we are waiting on outstanding injections, so return false (blocked).
if ( j >= MAX_XML_DOCS ) return false;
// get the m_ith link
char *link = links->getLink ( m_i );
int32_t linkLen = links->getLinkLen ( m_i );
// temp term
if ( link[linkLen] ) { char *xx=NULL;*xx=0; }
// skip to next link to index
m_i++;
// skip injecting if it's an internal bing/google outlink
if ( strncmp(link,"http://www.bing.com/",20) == 0 )
continue;
// skip youtube query links. they contain our exact
// query!! so almost always come up #1
if ( strstr(link,".youtube.com/") && strstr(link,"&q="))
continue;
if ( strstr(link,".msn.com/") )
continue;
if ( strstr(link,".microsoft.com/") )
continue;
if ( strstr(link,".discoverbing.com/") )
continue;
if ( strstr(link,".googleusercontent.com/") )
continue;
//if(!strncmp(link,"http://webcache.googleusercontent.com/",38)
if(!strncmp(link,"http://www.google.com/url?q=http",32)){
// grab the real url from that
char *embed = strstr(link,"url?q=http");
if ( ! embed ) continue;
link = embed+6;
char *end = embed;
for ( ; *end && *end != '&' ; end++) {
// google appends query to url.. strange
//if ( end[0] == '%' &&
// end[1] == '2' &&
// to_lower_a(end[2]) == 'b' )
// break;
}
SafeBuf mbuf;
mbuf.reserve ( end - link + 100 );
int32_t dlen;
char *bs = mbuf.getBufStart();
dlen=urlDecode(bs,link , end - link );
bs[dlen] = '\0';
tmpUrl.set ( bs );
link = tmpUrl.getUrl();
linkLen = tmpUrl.getUrlLen();
}
// skip maps.google.com etc.
if ( strstr(link,".google.com/") )
continue;
// ok, point to title and summary for this result!
// go up to prev node for first non-clickable text which
// should be summary
//Section **sp = sections->m_sectionPtrs;
// get the section
int32_t ln = links->getNodeNum(m_i-1);
// get node ptr
XmlNode *node = m_xml.getNodePtr(ln);
char *ptr = node->m_node;
// find section that contains it i guess
Section *sx = sections->m_rootSection;
Section *last = NULL;
char **wptrs = words->getWords();
//nodeid_t *tids = words->getTagIds();
for ( ; sx ; sx = sx->m_next ) {
// get section ptr
char *sw = wptrs[sx->m_b-1];
if ( sw < ptr ) continue;
// over?
sw = wptrs[sx->m_a];
if ( sw > ptr ) break;
last = sx;
}
// assign
sx = last;
// telescope section up one i guess
//sx = sx->m_parent;
// shortcut
wbit_t *bits = bp->m_bits;
// if still same first alnum, go another
//for ( ; sx ; sx = sx->m_parent ) {
// // skip if same word starts this section
// //if ( sx->m_firstWordPos == fa ) continue;
// // must have alnum
// if ( sx->m_firstWordPos <= 1 ) continue;
// // must be in link! should be the result TITLE
// if ( bits[sx->m_firstWordPos] & D_IN_LINK ) break;
// // word must not be "cached" or whatever...
//}
// if in bold tag, should telescope up some more
//if ( sx && sx->m_tagId == TAG_B ) sx = sx->m_parent;
//if ( sx && sx->m_tagId == TAG_STRONG ) sx = sx->m_parent;
// save
//int32_t fa = sx->m_firstWordPos;
// that's the title so telescope up as long as that is the
// first alnum!!!
for ( ; sx ; sx = sx->m_parent ) {
//Section *ps = sx->m_parent;
// do we have a next brother? stop then! that means
// we are in a list!
//if ( sx->m_nextBrother ) break;
//if ( ps->m_firstWordPos != fa ) break;
// stop when we hit a result delimiter!!
if ( sx->m_tagId == TAG_LI ) {
// bing...
if ( strncmp(wptrs[sx->m_a],
"<li class=\"sa_wr\">",
17) == 0 ) {
break;
}
// google...
if ( strncmp(wptrs[sx->m_a],
"<li class=\"g\">",
13) == 0 ) {
break;
}
}
}
// if no indicator, bail
if ( ! sx ) continue;
// skip link if contained in prev section
if ( prev == sx )
continue;
// save it
prev = sx;
// record search result details
Section *title = NULL;
Section *cite = NULL;
Section *summary = NULL;
// . that is probably the full result then...
// . title is first sentence
for ( ; sx ; sx = sx->m_next ) {
// only sentences
if ( ! ( sx->m_flags & SEC_SENTENCE ) ) continue;
// grab it
if ( ! title ) {
title = sx;
continue;
}
// skip section if in link
if ( bits[sx->m_firstWordPos] & D_IN_LINK ) continue;
// we are sentence section so fix it so we are one
// above!
Section *rs = sx; // ->m_parent;
// telescope up to a div or whatever...
//for ( ; rs ; rs = rs->m_parent ) {
// if ( rs->m_tagId == TAG_DIV ) break;
// if ( rs->m_tagId == TAG_P ) break;
//}
// and out of bold
if ( rs && rs->m_tagId == TAG_B ) rs = rs->m_parent;
if ( rs && rs->m_tagId == TAG_STRONG) rs=rs->m_parent;
// bail if no good!
if ( ! rs ) continue;
// then site if google
if ( ! cite ) {
cite = rs;
continue;
}
// then summary
summary = rs;
break;
}
m_serpBuf.safePrintf("\t\t<result>\n");
// print <title> tag
if ( title ) printSerpFiltered(title,"title");
// print <sum> tag
if ( summary ) printSerpFiltered(summary,"sum");
m_serpBuf.safePrintf("\t\t\t<url>");
m_serpBuf.safeMemcpy ( link , linkLen );
m_serpBuf.safePrintf("</url>\n");
m_serpBuf.safePrintf("\t\t</result>\n");
// if not injecting, skip
//continue;
if ( ! m_reallyInjectLinks ) continue;
// dedup
int32_t linkHash32 = hash32 ( link , linkLen );
if ( m_linkDedupTablePtr &&
m_linkDedupTablePtr->isInTable (&linkHash32) ) continue;
// add it otherwise
if ( m_linkDedupTablePtr )
m_linkDedupTablePtr->addKey ( &linkHash32 );
// we use this when injecting ahrefs links
if ( m_domDedupTablePtr ) {
int32_t domLen;
char *dom = getDomFast ( link , &domLen );
int32_t dh32 = hash32 ( dom , domLen );
if ( m_domDedupTablePtr->isInTable (&dh32) ) continue;
m_domDedupTablePtr->addKey ( &dh32 );
}
// get it
XmlDoc *xd = m_xmlDocs[j];
if ( ! xd ) { char *xx=NULL;*xx=0; }
// add www to it
Url lu;
lu.set ( link , linkLen , true );
char *wwwLink = lu.getUrl();
// this can go on the stack since set4() copies it
SpiderRequest sreq;
sreq.reset();
// index this link!
strcpy(sreq.m_url,wwwLink);
// parentdocid of 0
int32_t firstIp = hash32n(wwwLink);
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
sreq.setKey( firstIp,0LL, false );
sreq.m_isInjecting = 1;
sreq.m_isPageInject = 1;
sreq.m_hopCount = 0;//hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
setStatus("injecting an outlink");
// . use the enormous power of our new XmlDoc class
// . this returns false with g_errno set on error
if ( ! xd->set4 ( &sreq ,
NULL ,
cr->m_coll ,
NULL , // pbuf
// give it a niceness of 1, we have to be
// careful since we are a niceness of 0!!!!
m_niceness, // 1 ,
// inject this content
NULL, // content ,
false, // deleteFromIndex ,
0, // forcedIp ,
0, // contentType ,
0, // lastSpidered ,
false )) { // hasMime
// . g_errno should be set if that returned false
// . return true if does not need to block
log("xmldoc: outlink inject: %s",mstrerror(g_errno));
break;
}
xd->m_hack = this;
// make this our callback in case something blocks
xd->setCallback ( xd , doneInjectingWrapper );
// . set xd from the old title rec if recycle is true
// . can also use XmlDoc::m_loadFromOldTitleRec flag
xd->m_recycleContent = false;//true;
// avoid looking up ip of each outlink to add "firstip" tag to
// tagdb because that can be slow!!!!!!!
xd->m_spiderLinks = false;
xd->m_spiderLinks2 = false;
xd->m_spiderLinksValid = true;
// . newOnly is true --> do not inject if document is already
// indexed!
// . maybe just set indexCode
xd->m_newOnly = true;//false;//newOnly;
// need to refresh it!!
//xd->m_newOnly = false;//newOnly;
// turn off robots.txt lookups
xd->m_isAllowed = true;
xd->m_isAllowedValid = true;
xd->m_crawlDelay = -1; // unknown
xd->m_crawlDelayValid = true;
// log it now
log("inject: indexing outlink %s (hash=%"UINT32")",wwwLink,
(uint32_t)linkHash32);
// costs one API unit, which is one cent. but if we do
// top 50 on google, top 50 on procog, it can be like
// $1 every time we do this.
//xd->injectAhrefsLinks();
bool status = true;
// this will tell it to index ahrefs first before indexing
// the doc. but do NOT do this if we are from ahrefs.com
// ourselves to avoid recursive explosion!!
xd->m_downloadLevel = m_downloadLevel + 1;
xd->m_useAhrefs = m_useAhrefs;
// inherit dedup tables as well!
xd->m_linkDedupTablePtr = m_linkDedupTablePtr;
// . now tell it to index
// . this returns false if blocked
status = xd->indexDoc ( );
// log it. i guess only for errors when it does not block?
// because xmldoc.cpp::indexDoc calls logIt()
if ( status ) xd->logIt();
// otherwise, it blocks
else {
m_blocked++;
log("xmldoc: blockedout=%"INT32" slotj=%"INT32" "
"(this=0x%"PTRFMT",xd=0x%"PTRFMT")",
m_blocked,j,(PTRTYPE)this,(PTRTYPE)xd);
m_used[j] = true;
}
}
// return true if all done
return (m_blocked == 0);
}
void XmlDoc::doneInjecting ( XmlDoc *xd ) {
// find it in our list
int32_t i;
for ( i = 0 ; i < MAX_XML_DOCS ; i++ ) {
if ( ! m_used[i] ) continue;
if ( m_xmlDocs[i] != xd ) continue;
break;
}
// core if not found in our list, it must be there
if ( i >= MAX_XML_DOCS ) { char *xx=NULL;*xx=0; }
// free it up now!
m_used[i] = 0;
// free it up
//mdelete ( m_xmlDocs[i] , sizeof(XmlDoc), "xdarr" );
//delete ( m_xmlDocs[i] );
//m_xmlDocs[i] = NULL;
m_xmlDocs[i]->reset();
// uncount it as being outstanding
m_blocked--;
// log debug
log("xmldoc: blockedin=%"INT32" (this=0x%"PTRFMT")",
m_blocked,(PTRTYPE)this);
// return if still blocked
if ( ! doInjectLoop() ) return;
// log debug
log("xmldoc: final callback");
// ok, all have been indexed
m_finalCallback ( m_finalState );
}
bool XmlDoc::injectAhrefsLinks ( ) {
setStatus("get inlinks from ahrefs.com");
// skip for now
//return true;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// make the ahrefs urls
try { m_ahrefsDoc = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
return true;
}
mnew ( m_ahrefsDoc , sizeof(XmlDoc),"xmldocah");
// make the url
SafeBuf ubuf;
// count is 350 for now; could be turned down to 10 if needed
ubuf.safePrintf("http://api.ahrefs.com/get_backlinks.php?count=350&mode=exact&output=xml&AhrefsKey=0452f27fd5a7fec5e9702e23ba4af223&target=");
//ubuf.safePrintf("http://www.gigablast.com/?q=poo&u=");
ubuf.urlEncode (m_firstUrl.getUrl() );
Url url;
url.set ( ubuf.getBufStart() );
char *up = url.getUrl();
// set by url i guess
SpiderRequest sreq;
sreq.reset();
strcpy(sreq.m_url,up);
// parentdocid of 0
int32_t firstIp = hash32n(up);
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
sreq.setKey( firstIp,0LL, false );
sreq.m_isInjecting = 1;
sreq.m_isPageInject = 1;
sreq.m_hopCount = 0;//hopCount;
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// shortcut
XmlDoc *ah = m_ahrefsDoc;
// . use the enormous power of our new XmlDoc class
// . this returns false with g_errno set on error
if ( ! ah->set4 ( &sreq ,
NULL ,
cr->m_coll ,
NULL , // pbuf
// give it a niceness of 1, we have to be
// careful since we are a niceness of 0!!!!
m_niceness, // 1 ,
// inject this content
NULL, // content ,
false, // deleteFromIndex ,
0, // forcedIp ,
0, // contentType ,
0, // lastSpidered ,
false )) { // hasMime
log("xmldoc: ahref doc error %s",mstrerror(g_errno));
// g_errno should be set if that returned false
return true;
}
// do not re-call the set
//m_needsSet = false;
// make this our callback in case something blocks
//ah->setCallback ( state , callback );
// do not re-lookup the robots.txt
ah->m_isAllowed = true;
ah->m_isAllowedValid = true;
ah->m_crawlDelay = -1; // unknown
ah->m_crawlDelayValid = true;
ah->m_downloadLevel = m_downloadLevel + 1;
// reset domain table for deduping ahref's links by domain
// before injecting them... only inject one per domain
if ( ! m_domDedupTablePtr ) {
m_domDedupTable.set(4,0,512,NULL,0,false,m_niceness,"dmtab2");
m_domDedupTablePtr = &m_domDedupTable;
}
// log it now
//log("inject: indexing injected doc %s",url);
// if we are a url like api.ahrefs.com/get_backlinks... then
// our links can use our table for deduping based on domain, AND
// they can use our link dedup table in case one outlink is also
// a search result on google's page...
if ( ! ah->injectLinks ( m_linkDedupTablePtr,
m_domDedupTablePtr,
m_masterState ,
m_masterLoop ) )
return false;
return true;
}
bool XmlDoc::printSerpFiltered ( Section *sx , char *tagName ) {
//int64_t *wids = m_words.getWordIds();
char **wptrs = m_words.getWords();
int32_t *wlens = m_words.getWordLens();
int32_t fa = sx->m_firstWordPos;
nodeid_t *tids = m_words.getTagIds();
if ( fa > 0 && tids[fa-1] == TAG_B ) fa--;
if ( fa > 0 && tids[fa-1] == TAG_STRONG ) fa--;
int32_t la = sx->m_b;
int32_t nw = m_words.getNumWords();
if ( la+1 < nw && tids[la+1] == (TAG_B|BACKBIT) ) la++;
if ( la+1 < nw && tids[la+1] == (TAG_STRONG|BACKBIT) ) la++;
// advance la even more if regular words or br tags or b or strong tags
for ( ; la < nw ; la++ ) {
if ( ! tids[la] ) continue;
if ( (tids[la]&BACKBITCOMP) == TAG_BR ) continue;
if ( (tids[la]&BACKBITCOMP) == TAG_STRONG ) continue;
if ( tids[la] == TAG_BR ) continue;
break;
}
m_serpBuf.safePrintf("\t\t\t<%s>",tagName);
// cdata!
m_serpBuf.safePrintf("<![CDATA[");
// print the word range [fa, la), skipping <br> tags
for ( int32_t i = fa ; i < la ; i++ ) {
// skip if br
if ( tids[i] == TAG_BR ) continue;
m_serpBuf.cdataEncode ( wptrs[i] , wlens[i] );
}
// cdata!
m_serpBuf.safePrintf("]]>");
m_serpBuf.safePrintf("</%s>\n",tagName);
return true;
}
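// For reference, one scraped search result serialized into m_serpBuf by
// doInjectLoop() + printSerpFiltered() looks roughly like this (the values
// are made up):
//
//   <result>
//           <title><![CDATA[Example Result Title]]></title>
//           <sum><![CDATA[First non-link sentence used as the summary...]]></sum>
//           <url>http://www.example.com/page.html</url>
//   </result>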
//////////
//
// BEGIN NEW SEO MATCHING QUERIES TOOL CODE
//
//////////
static void loadTitleRecFromDiskOrSpiderWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
if ( ! THIS->loadTitleRecFromDiskOrSpider() ) return;
THIS->m_callback1 ( THIS->m_state );
}
// . if we can't load titlerec from titledb, spider it, index it and
// use that new titlerec
// . returns false if blocks
// . returns true and sets g_errno on error
bool XmlDoc::loadTitleRecFromDiskOrSpider() {
if ( ! m_masterLoop ) {
m_masterState = this;
m_masterLoop = loadTitleRecFromDiskOrSpiderWrapper;
}
// fix a core when getTermListBuf() calls getMetaList()
// which calls getNewSpiderReply() which calls
// getDownloadEndTime() and tries to download the page
// even though we have a valid titlerec!
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTimeValid = true;
m_downloadEndTime = 0;
}
// . try to recycle the content first
// . try to load it from title rec first
// . we have to do this otherwise our ptr_linkInfo link texts
// will be somewhat random and cause us to get different scores
// for the queries we match!!
// . so do this not just for speed, but to be consistent.
if ( ! loadFromOldTitleRec() ) return false;
// if the old title rec was found we are done; otherwise just spider and index it
if ( m_oldTitleRecValid && m_oldTitleRec )
return true;
// ok, we gotta index it
if ( ! m_loggedMsg3 ) {
m_loggedMsg3 = true;
log("xmldoc: url %s not in titledb, spidering and indexing",
m_firstUrl.m_url);
}
// clear that
g_errno = 0;
// turn off recycling i guess since we don't have it
m_recycleContent = false;
// first index it, but only if not already indexed
// did it block?
// error indexing doc? indexCode should be set then
if ( ! indexDoc() ) return false;
// no blocking
return true;
}
/*
void getSEOQueryInfoWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
// note it
THIS->setStatus ( "seoqueryinfowrapper" );
// make sure has not been freed from under us!
if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
// note it
THIS->setStatus ( "in seo query info wrapper" );
// return if it blocked
if ( THIS->getSEOQueryInfo( ) == (void *)-1 ) return;
// print any error
if ( g_errno )
log("seopipe: getSeoQueryInfo error: %s",mstrerror(g_errno));
// all done
else
log("seopipe: getSeoQueryInfo is done");
// show timing info
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - THIS->m_beginSEOTime;
log("seopipe: time: getSeoQueryInfo took %"INT64"ms",took);
// otherwise, all done, call the caller callback
if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
else THIS->m_callback2 ( THIS->m_state );
return;
}
void getSEOQueryInfoWrapper2 ( int fd , void *state ) {
// just pump! otherwise we might re-launch a msg3a request while
// one is outstanding causing a core in Multicast::reset()
XmlDoc *THIS = (XmlDoc *)state;
// debug log
THIS->setStatus ("getseoqueryinfowrapper2");
// if we are waiting just on the pump i guess we are all done!
if ( ! THIS->m_socketWriteBufValid ) {
log("seopipe: pumping socket");
THIS->pumpSocketWriteBuf();
return;
}
// not pumping?
log("seopipe: pumping socket ready wrapper");
// otherwise, let it call the callback
getSEOQueryInfoWrapper ( state );
}
// . return safebuf of xml containing matching and related queries and
// related urls/titles
// . this transmits the xml as it generates it to "m_seoSocket" if non-null
// . returns -1 if blocked, returns NULL and sets g_errno on error
// . stores the xml in the m_socketWriteBuf SafeBuf
// . will keep blocking (returning -1) until the xml is delivered to socket
// if it is non-NULL
SafeBuf *XmlDoc::getSEOQueryInfo ( ) {
setStatus ( "seo query info" );
// only set to valid once it has been all written out!!
if ( m_socketWriteBufValid ) {
// all done?
if ( ! m_seoSocket ) return &m_socketWriteBuf;
// pump
pumpSocketWriteBuf();
// if socket not done being pumped... we block. it's
// ready wrappers should re-call our wrapper.
if ( m_socketWriteBufSent >= m_socketWriteBuf.length() )
return &m_socketWriteBuf;
// wait for write to finish
return (SafeBuf *)-1;
}
// the g_errno could be a title rec not found reply coming back
// so do not process that here! it needs to be processed
// by the function whose request resulted in an error reply.
// for instance, the getTitle() call below needs to set g_errno
// when we call it now, responding to its msg22 reply.
//if ( g_errno ) return NULL;
// a good place to init stuff we need here
if ( ! m_masterState ) {
m_printedQueries = false;
m_printedRelatedDocIds = false;
m_printedRelatedQueries = false;
m_printedRecommendedLinks = false;
m_printedScoredInsertableTerms = false;
//m_docIndexed = false;
// time it
m_beginSEOTime = gettimeofdayInMilliseconds();
// for our m_masterLoop function, it uses this as the state
m_masterState = this;
// this is a main entry point function so anything that blocks
// should re-call this function
m_masterLoop = getSEOQueryInfoWrapper;
// assume indexed
m_docIndexed = true;
// fix a core when getTermListBuf() calls getMetaList()
// which calls getNewSpiderReply() which calls
// getDownloadEndTime() and tries to download the page
// even though we have a valid titlerec!
if ( ! m_downloadEndTimeValid ) {
m_downloadEndTimeValid = true;
m_downloadEndTime = 0;
}
}
// . try to load it from title rec first
// . we have to do this otherwise our ptr_linkInfo link texts
// will be somewhat random and cause us to get different scores
// for the queries we match!!
// . so do this not just for speed, but to be consistent.
if ( m_recycleContent && ! loadFromOldTitleRec()) return (SafeBuf *)-1;
// did that fail? i.e. not found!?!?! ignore and just index it
if ( m_oldTitleRecValid && ! m_oldTitleRec && m_recycleContent ) {
// just skip this asshole then
log("xmldoc: url %s load3 failed",m_firstUrl.m_url);
// clear that
g_errno = 0;
// need to index it
m_docIndexed = false;
}
// first index it, but only if not already indexed
if ( ! m_docIndexed ) {
// turn off recycling i guess since we don't have it
m_recycleContent = false;
// did it block?
// error indexing doc? indexCode should be set then
if ( ! indexDoc() ) return (SafeBuf *)-1;
// do not re-call
m_docIndexed = true;
}
// was indexing successful?
int32_t *indexCode = getIndexCode();
if ( ! indexCode || indexCode == (void *)-1 )
return (SafeBuf *)indexCode;
// if not successfully indexed send back error msg
if ( *indexCode && m_seoSocket ) {
m_socketWriteBuf.safePrintf(
"\t<errorMsg><![CDATA[%s]]>"
"</errorMsg>\n"
"</response>"
, mstrerror(*indexCode) );
// send on socket
pumpSocketWriteBuf();
// if socket not done being pumped... we block
if ( m_socketWriteBufSent < m_socketWriteBuf.length() )
return (SafeBuf *)-1;
// otherwise, we are done sending
return &m_socketWriteBuf;
}
// seo.cpp needs this in printDupSentences
Sections *sections = getSectionsWithDupStats();
if ( ! sections || sections == (void *)-1) return (SafeBuf *)sections;
// seo.cpp needs this now when it calls getSiteRank()
int32_t *sni = getSiteNumInlinks();
if ( ! sni || sni == (void *)-1 ) return (SafeBuf *)sni;
// . find all logged queries that this document matches
// . this will launch msg99 requests to each host in the network
// . then it scores them
// . don't worry about sending back in real-time for this since it
// should be fast
SafeBuf *qpbuf = getMatchingQueriesScored();
if ( ! qpbuf || qpbuf == (void *)-1 ) return qpbuf;
// . how many queries do we have that match this url?
// . they should be sorted by our url's score
int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
// shortcut
SafeBuf *sb = &m_socketWriteBuf;
// cast the msg99 reply ptrs, i.e. query ptrs
Msg99Reply **queryPtrs = (Msg99Reply **)qpbuf->getBufStart();
// store each one as xml then into m_headerBuf99
if ( ! m_printedQueries && m_seoSocket ) {
m_printedQueries = true;
// do not flood the socket! so limit to 1000 queries
// they should be sorted by queryImportance!
// cheatcodes.com has like 50,000 matching queries.
int32_t max = numQueryPtrs;
if ( max > 1000 ) max = 1000;
for ( int32_t i = 0 ; i < max ; i++ ) {
// shortcut
Msg99Reply *qp = queryPtrs[i];
// sometimes queries like 'gallery-view' are
// hard-phrased and do not show up for us, so skip.
// they should be at the very end so we should be
// trimming the tail for them, so don't worry about
// <queryNum> having holes in it.
if ( qp->m_myDocId == 0LL && qp->m_myScore == 0.0 )
continue;
// shortcut
QueryLogEntry *qe = &qp->m_queryLogEntry;
sb->safePrintf("\t<seoQuery>\n"
"\t\t<queryNum>%"INT32"</queryNum>\n"
"\t\t<query><![CDATA[%s]]></query>\n"
"\t\t<queryTrafficPerDay>%"INT32""
"</queryTrafficPerDay>\n"
// our url's score
"\t\t<myDocId>%"INT64"</myDocId>\n"
"\t\t<myScore>%f</myScore>\n"
//"\t\t<mySiteHash32>%"UINT32""
//"</mySiteHash32>\n"
"\t\t<queryImportance>%f"
"</queryImportance>\n"
"\t</seoQuery>\n"
, i
, qp->m_queryStr
// x 10 to estimate google?
, qe->m_gigablastTraffic *
GB_TRAFFIC_MODIFIER
, qp->m_myDocId
, qp->m_myScore
//, qp->m_mySiteHash32
, qp->m_queryImportance
//,qp->m_queryInfo.m_numUniqueWordForms
//,qp->m_queryInfo.m_numRepeatWordForms
//qp->m_queryInfo.m_smallestNormTermFreq
);
}
}
// pump it some. i.e. send m_socketWriteBuf contents back to
// m_seoSocket if it is non-NULL
pumpSocketWriteBuf();
// . now instead try getting the top "imax" queries scored on the
// whole index
// . transmit them back on m_seoSocket AS WE GET THEM by calling
// pumpSocketWriteBuf() function and storing into m_socketWriteBuf
//qpbuf = getMatchingQueriesScoredForFullQuery ( );
//if ( ! qpbuf || qpbuf == (void *)-1 ) return qpbuf;
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
// how many related docids do we have?
int32_t nr = rdbuf->length() / sizeof(RelatedDocId);
//
// print out the related urls
//
if ( ! m_printedRelatedDocIds && nr && m_seoSocket ) {
m_printedRelatedDocIds = true;
int32_t max = 200; // m_maxRelatedUrls;
if ( max == -1 ) max = nr;
if ( nr < max ) max = nr;
sb->safePrintf("\t<relatedUrls>\n");
for ( int32_t i = 0 ; i < max ; i++ ) {
RelatedDocId *rd = &rds[i];
// fix for titlerec not found errors
char *title = rd->ptr_rd_title;
char *url = rd->ptr_rd_url;
if ( ! title ) title = "";
if ( ! url ) url = "";
// print it out
sb->safePrintf("\t\t<relatedUrl>\n"
"\t\t\t<urlNum>%"INT32"</urlNum>\n"
"\t\t\t<url><![CDATA[%s]]></url>\n"
"\t\t\t<docId>%"INT64"</docId>\n"
"\t\t\t<siteHash32>%"UINT32"</siteHash32>\n"
"\t\t\t<title><![CDATA["
, i
, url
, rd->m_docId
, rd->m_siteHash32
);
// encode CDATA stuff in title
sb->cdataEncode(title);
sb->safePrintf("]]></title>\n"
"\t\t\t<queriesInCommon>%"INT32""
"</queriesInCommon>\n"
"\t\t\t<similarityScore>%f"
"</similarityScore>\n"
, rd->m_numCommonQueries
, rd->m_dotProduct // similarityScore
);
// print the actual query nums in common
int32_t firstOff = rd->m_firstCommonQueryNumOff;
int32_t offset = firstOff;
sb->safePrintf("\t\t\t<queriesInCommon>\n");
for ( ; offset >= 0 ; ) {
// get that node
char *buf = m_commonQueryNumBuf.getBufStart();
// and offset
buf += offset;
// then cast
QueryNumLinkedNode *qn;
qn = (QueryNumLinkedNode *)buf;
// print that
sb->safePrintf("\t\t\t\t<queryNum>%"INT32""
"</queryNum>\n"
, qn->m_queryNum );
// advance. will be -1 when done
offset = qn->m_nextOff;
}
sb->safePrintf("\t\t\t</queriesInCommon>\n");
sb->safePrintf("\t\t</relatedUrl>\n");
}
sb->safePrintf("\t</relatedUrls>\n");
}
//
// recommended inlinks!
//
// pump it some. i.e. send m_socketWriteBuf contents back to
// m_seoSocket if it is non-NULL
pumpSocketWriteBuf();
SafeBuf *kbuf = getRecommendedLinksBuf();
if ( ! kbuf || kbuf == (void *)-1 ) return kbuf;
// print out the recommended links in xml
if ( ! m_printedRecommendedLinks && m_seoSocket ) {
sb->safePrintf("\t<recommendedLinks>\n");
char *p = kbuf->getBufStart();
char *pend = kbuf->getBuf();
for ( ; p < pend ; ) {
// cast it
RecommendedLink *ri = (RecommendedLink *)p;
// skip it
p += ri->getSize();
// print it out
sb->safePrintf("\t\t<link>\n"
"\t\t\t<url><![CDATA[%s]]></url>\n"
"\t\t\t<title><![CDATA[%s]]></title>\n"
"\t\t\t<score>%f</score>\n"
"\t\t\t<siteRank>%"INT32"</siteRanke>\n"
,ri->getUrl(kbuf)
,ri->getTitle(kbuf)
,ri->m_totalRecommendedScore
,(int32_t)ri->m_siteRank
);
}
sb->safePrintf("\t</recommendedLinks>\n");
m_printedRecommendedLinks = true;
}
//
// related queries
//
// write out
pumpSocketWriteBuf();
SafeBuf *relBuf = getRelatedQueryBuf();
if ( ! relBuf || relBuf == (void *)-1 ) return relBuf;
QueryRel **rels = (QueryRel **)relBuf->getBufStart();
int32_t numRels = relBuf->length() / sizeof(QueryRel *);
//
// print out the related queries
//
if ( ! m_printedRelatedQueries && numRels && m_seoSocket ) {
sb->safePrintf("\t<relatedQueries>\n");
int32_t max = 200; // m_maxRelatedQueries;
if ( max == -1 ) max = numRels;
if ( numRels < max ) max = numRels;
for ( int32_t i = 0 ; i < max ; i++ ) {
QueryRel *rel = rels[i];
// must be a first!
if ( ! rel->m_isFirst ) { char *xx=NULL;*xx=0; }
// shortcut
//QueryInfo *qi = &rel->m_queryInfo;
// print it out
sb->safePrintf("\t\t<relatedQuery>\n"
"\t\t\t<query><![CDATA[%s]]></query>\n"
"\t\t\t<relatedDocIdsInCommon>%"INT32""
"</relatedDocIdsInCommon>\n"
"\t\t\t<relatedQueryImportance>%f"
"</relatedQueryImportance>\n"
//"\t</relatedUrl>\n"
, rel->m_queryStr
, rel->m_docIdVotes
//, qi->m_numUniqueWordForms
//, qi->m_numRepeatWordForms
//, qi->m_smallestNormTermFreq
, rel->m_totalRelatedQueryImportance
//, qi->m_myScoreRelated
);
// print details!
sb->safePrintf("\t\t\t<matchingDocIds>\n");
// linked list of Msg99Replies for the related queries.
// all in linked list are for the same query but
// restricted to a different docid!
for ( ; rel ; rel = rel->m_next ) {
// get his related docid
RelatedDocId *rd = rel->m_relatedDocId;
// print that
sb->safePrintf("\t\t\t\t<match>\n"
"\t\t\t\t\t<relatedDocId>%"INT64""
"</relatedDocId>\n"
"\t\t\t\t\t<siteHash32>%"UINT32""
"</siteHash32>\n"
//"\t\t\t\t\t"
//"<queryImportance>%f"
//"</queryImportance>\n"
"\t\t\t\t\t<docIdSimilarity>%f"
"</docIdSimilarity>\n"
"\t\t\t\t\t<docIdScore>%f"
"</docIdScore>\n"
"\t\t\t\t</match>\n"
, rd->m_docId
, rd->m_siteHash32
//, rd->m_similarityScore
, rd->m_dotProduct
, rel->m_myScore
);
}
sb->safePrintf("\t\t\t</matchingDocIds>\n");
sb->safePrintf("\t\t</relatedQuery>\n");
}
sb->safePrintf("\t</relatedQueries>\n");
m_printedRelatedQueries = true;
}
// write out
pumpSocketWriteBuf();
// this is the Keyword Insertion Tool data (KIT data)
SafeBuf *sits = getScoredInsertableTerms();
if ( ! sits || sits == (void *)-1 ) return sits;
// try to store into cachedb in case user clicks a different
// insertable term and we have to update the wordposinfo::m_rankChange
// stuff in the html src display
//if ( ! storeIntoCachedb() )
// // return -1 if it blocked and wait for store to complete
// return (SafeBuf *)-1;
// print out query changes
if ( ! m_printedScoredInsertableTerms && m_seoSocket ) {
// dump out each insertable term and it's corresponding
// QueryChanges
if ( ! printScoredInsertableTerms ( sb ) )
return NULL;
m_printedScoredInsertableTerms = true;
// end of xml response?
sb->safePrintf("</response>\n");
}
// even if not fully pumped, set it to valid here
m_socketWriteBufValid = true;
if ( ! m_seoSocket ) return &m_socketWriteBuf;
// write out
pumpSocketWriteBuf();
// if socket not done being pumped... we block
if ( m_socketWriteBufSent < m_socketWriteBuf.length() )
return (SafeBuf *)-1;
// ok, we are done
return &m_socketWriteBuf;
}
*/
// have the smallest twids on top!
int twidcmp ( const void *a, const void *b ) {
TermInfo *ua = (TermInfo *)a;
TermInfo *ub = (TermInfo *)b;
//uint32_t ua = *(uint32_t *)a;
//uint32_t ub = *(uint32_t *)b;
// HACKY: sort by lower 32 bits of the 64 bit termids so
// seo.cpp can use them with its QueryLogEntries which use 32 bit
// termids to save mem.
uint32_t ta = (uint32_t)ua->m_termId64;
uint32_t tb = (uint32_t)ub->m_termId64;
// lower first
if ( ta > tb ) return 1; // swap
if ( ta < tb ) return -1;
return 0;
}
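// What the truncation above means in practice (values are made up): two
// 64-bit termids compare equal here whenever their low 32 bits agree, which
// is what lets seo.cpp intersect against the 32-bit termids stored in its
// QueryLogEntries.
//
//   int64_t  tid64 = 0x123456789abcdef0LL;
//   uint32_t tid32 = (uint32_t)tid64; // 0x9abcdef0 -- the part twidcmp sorts by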
// . 1. make a vector of the words in the title, headers, page-inlink-text,
// and site-inlink-text
//
// . 2. pass that word vector to every machine in network to see what queries
// in the query logs we match. use Msg99.cpp. it should initialize
// on startup and load in its share of the query logs. the query log file
// should be sorted by filtered query and then split. it should also
// remove queries from the most aggressive IPs (bots). we would need
// a program, filterquerylog.cpp to do all that on gk37, our query log
// storage server. it needs to store # of times query was done, too.
// all queries should have back to back spaces removed and made lowercase.
// remove queries that have double quotes or colon operators in them.
// index each query term in the query log into HashTableX, which will
// point to the query in the buffer. then we just store the termlist
// in a SafeBuf that we save on disk. 40GB of queries split 256 ways
// is still like 175MB per server! (if one server is dead, skip it)
//
// . 3. merge all queries received from all hosts and sort by traffic.
//
// . 4. perform the queries on procog and cache the scores of the top 10
// results for each query. should be cached on machine that houses the
// query. try a 60-day cache max age.
//
// . 5. now redo the queries but with a "url:thisurl |" to get this page's
// score for each query. if the min score of the query on procog is
// well beyond our grasp, we could just skip it.
//
// . 6. then determine the # of inlinks we need to add to get more traffic
// for each query. assume siterank of 0 per inlink. if that would be
// impossible then increment the siterank until it gets us in the top 10.
//
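// A rough sketch of the step-2 query-log index described above. The real
// handler lives in seo.cpp / Msg99, so the names and table geometry below
// are assumptions for illustration only: each host keeps its slice of the
// query log in one big buffer and indexes every 32-bit termid to the byte
// offset of the query it came from, so a document's sorted termid vector
// can be intersected against it quickly.
//
//   SafeBuf    queryBuf;  // query strings, NUL-terminated, back to back
//   HashTableX termIndex; // 32-bit termid -> 32-bit offset into queryBuf
//   termIndex.set ( 4,4,65536,NULL,0,true,0,"qlogidx" ); // allow dup keys
//   int32_t off = queryBuf.length();
//   queryBuf.safeMemcpy ( "used car prices" , 16 ); // 15 chars + NUL
//   uint32_t tid32 = (uint32_t)hash64n ( "used" );
//   termIndex.addKey ( &tid32 , &off );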
// just use getTopTermsVector
HashTableX *XmlDoc::getTermIdBufDedupTable32 ( ) {
SafeBuf *tiBuf = getTermInfoBuf();
if ( ! tiBuf || tiBuf == (void *)-1 ) return (HashTableX *)tiBuf;
return &m_tidTable32;
}
// . used by handleRequest8e() which uses msg20::getSummary() with
// m_getTermListBuf to call this in the local host msg20 handler.
// . this buf is used to determine what queries this document matches
SafeBuf *XmlDoc::getTermId32Buf() {
if ( m_termId32BufValid )
return &m_termId32Buf;
SafeBuf *tiBuf = getTermInfoBuf ();
if ( ! tiBuf || tiBuf == (void *) -1 ) return tiBuf;
int32_t need = 4 * (tiBuf->length() / sizeof(TermInfo));
if ( ! m_termId32Buf.reserve(need) ) return NULL;
// scan those
char *p = tiBuf->getBufStart();
char *pend = tiBuf->getBuf();
uint32_t last = 0;
for ( ; p < pend ; ) {
TermInfo *ti = (TermInfo *)p;
p += sizeof(TermInfo);
uint32_t tid32 = (uint32_t)(ti->m_termId64);
m_termId32Buf.pushLong(tid32);
// sanity
if ( last && tid32 <= last ) { char *xx=NULL;*xx=0; }
last = tid32;
}
m_termId32BufValid = true;
return &m_termId32Buf;
}
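// The buf above is sorted ascending, so the receiving handler can intersect
// it against another sorted 32-bit termid list with one linear merge. A
// minimal sketch of that merge (hypothetical helper; the real matching code
// is in seo.cpp's handleRequest8e()):
//
//   int32_t countCommonTermIds32 ( uint32_t *a , int32_t na ,
//                                  uint32_t *b , int32_t nb ) {
//           int32_t i = 0, j = 0, common = 0;
//           while ( i < na && j < nb ) {
//                   if      ( a[i] < b[j] ) i++;
//                   else if ( a[i] > b[j] ) j++;
//                   else { common++; i++; j++; }
//           }
//           return common;
//   }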
// . used by getTermId32Buf() for getting this document's matching queries
// . serialize the words in the title and inlink text into a vector
// . SafeBuf is filled with class TermInfos! defined in seo.h. currently
// just a int64_t m_termId64 though!
// . get synonyms of each word too!
// . we sort them by the 32-bit termid so handleRequest8e() can do its fast
// compare algo to find matching queries which are also sorted by the lower
// 32 bits of terms in the query.
SafeBuf *XmlDoc::getTermInfoBuf ( ) {
setStatus ( "getterminfobuf" );
if ( m_termInfoBufValid ) return &m_termInfoBuf;
bool includeSynonyms = true;
Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;
LinkInfo *info1 = getLinkInfo1();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1;
uint8_t *langId = getLangId();
if ( ! langId || langId == (uint8_t *)-1 ) return (SafeBuf *)langId;
if (!m_tidTable32.set(4,0,16384,NULL,0,false,m_niceness,"twidtabl"))
return NULL;
//
// add document body words now to m_termInfoBuf
//
if ( ! addUniqueWordsToBuf ( &m_termInfoBuf ,
&m_tidTable32 , // dedup table
NULL, // filter table
NULL, // mincounttable
false ,
ww ,
includeSynonyms) )
return NULL;
//
// store count of each term we hash after this into "TMP"
//
HashTableX TMP;
if(!TMP.set(4,4,4096,NULL,0,false,m_niceness,"tmttt") )
return NULL;
//
// hash meta desc into TMP table
//
int32_t mdlen;
char *md = getMetaDescription( &mdlen );
if ( md ) {
Words ww3;
ww3.setx ( md , mdlen , m_niceness );
if (!addUniqueWordsToBuf(NULL,
NULL , // dedup table
NULL, // filter table
&TMP, // mincounttable
true, // store counts?
&ww3,
includeSynonyms))
return NULL;
}
//
// hash meta keywords into TMP table
//
int32_t mklen;
char *mk = getMetaKeywords( &mklen );
if ( mk ) {
Words ww4;
ww4.setx ( mk , mklen , m_niceness );
if (!addUniqueWordsToBuf(NULL,
NULL, // dedup table
NULL, // filter table
&TMP, // mincounttable
true, // store counts?
&ww4,
includeSynonyms))
return NULL;
}
//
// hash each link text into TMP table
//
// loop over every link text to this page
for ( Inlink *k = NULL; info1 && (k = info1->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL(m_niceness);
// get the link text
if ( k->size_linkText <= 1 ) continue;
// set Url
Url u;
u.set ( k->getUrl() , k->size_urlBuf );
// do not allow anomalous link text to match query
//if ( k->m_isAnomaly ) continue;
char *p = k->getLinkText();
int32_t plen = k->size_linkText - 1;
if ( ! verifyUtf8 ( p , plen ) ) {
log("title: set4 bad link text from url=%s",
k->getUrl());
continue;
}
// debug
//log("seo: counttable for link text '%s'",k->getLinkText());
// now the words.
Words ww2;
if ( ! ww2.set ( k->getLinkText() ,
k->size_linkText-1, // len
TITLEREC_CURRENT_VERSION ,
true , // computeIds
m_niceness ))// niceness
// g_errno set on error, return NULL
return NULL;
// shortcuts on link text
if ( ! addUniqueWordsToBuf( NULL,
NULL, // dedup table
NULL, // filter table
&TMP, // mincounttable
true, // store counts?
&ww2,
includeSynonyms))
return NULL;
}
//
// now only add a link-text term to the main table and buffer if it
// occurs already in the body, or occurs TWICE in "TMP"
//
// loop over every link text to this page
for ( Inlink *k = NULL; info1 && (k = info1->getNextInlink(k)) ; ) {
// breathe
QUICKPOLL(m_niceness);
// get the link text
if ( k->size_linkText <= 1 ) continue;
// set Url
Url u;
u.set ( k->getUrl() , k->size_urlBuf );
// do not allow anomalous link text to match query
//if ( k->m_isAnomaly ) continue;
char *p = k->getLinkText();
int32_t plen = k->size_linkText - 1;
if ( ! verifyUtf8 ( p , plen ) ) {
log("title: set4 bad link text from url=%s",
k->getUrl());
continue;
}
// now the words.
Words ww2;
if ( ! ww2.set ( k->getLinkText() ,
k->size_linkText-1, // len
TITLEREC_CURRENT_VERSION ,
true , // computeIds
m_niceness ))// niceness
// g_errno set on error, return NULL
return NULL;
if ( !addUniqueWordsToBuf( &m_termInfoBuf,
&m_tidTable32, // dedup table
NULL, // filter table
&TMP, // mincounttable, >=2 counts
false, // store counts?
&ww2,
includeSynonyms))
return NULL;
}
// how many 32-bit twids do we got?
//m_numTwids = m_twbuf.length() / 4;
//m_twids = (int32_t *)m_twbuf.getBufStart();
QUICKPOLL(m_niceness);
// . sort that buf now
// . HACK: only sorts by last 32 bits of termid!!!!
qsort ( m_termInfoBuf.getBufStart(),
m_termInfoBuf.length() / sizeof(TermInfo),
sizeof(TermInfo), // element size (one 64-bit termid per TermInfo)
twidcmp );
QUICKPOLL(m_niceness);
// if no twids then return a -2 ptr, not NULL, that means error
// not -1 that means blocked!
//if ( m_numTwids == 0 ) m_twids = (int32_t *)-2;
// do not repeat this logic
//m_twidsValid = true;
m_termInfoBufValid = true;
// return the vector
return &m_termInfoBuf;
}
// . just like getTermInfoBuf but also includes terms from related queries
// that our document does not have!
// . we do it this way because for seo.cpp::handleRequest95() it finds
// matching queries locally based on getNewTermInfoBuf()'s m_newTermInfoBuf.
SafeBuf *XmlDoc::getNewTermInfoBuf ( ) {
setStatus ( "getnewterminfobuf" );
if ( m_newTermInfoBufValid ) return &m_newTermInfoBuf;
SafeBuf *oldBuf = getTermInfoBuf ();
if ( ! oldBuf || oldBuf == (void *) -1 ) return oldBuf;
SafeBuf *itBuf = getInsertableTerms();
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
// this should be valid automatically
HashTableX *oldDedupTable = getTermIdBufDedupTable32 ( );
// get old guy
if ( ! m_newTermInfoBuf.safeMemcpy ( oldBuf ) )
return NULL;
// a dedup table on stack
HashTableX newDedup32;
if (! newDedup32.set(4,0,16384,NULL,0,false,m_niceness,"newdtabl"))
return NULL;
// now scan the insertable terms buf
char *p = itBuf->getBufStart();
char *pend = itBuf->getBuf();
// scan each "term" which might be one or more words
for ( ; p < pend ; ) {
QUICKPOLL(m_niceness);
// cast it
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
char *term = it->getTerm();
Words ww;
ww.set9 ( term , m_niceness );
// we add entries to the dedup table, "newDedup32",
// but only filter and not add to "oldDedupTable"
if ( ! addUniqueWordsToBuf ( &m_newTermInfoBuf,
&newDedup32 , // dedup table
oldDedupTable, // filter table
NULL, // mincounttable
false,
&ww ,
true ) )
return NULL;
}
QUICKPOLL(m_niceness);
// . sort that buf now.
// . HACK: only sorts by last 32 bits of termid!!!!
qsort ( m_newTermInfoBuf.getBufStart(),
m_newTermInfoBuf.length() / sizeof(TermInfo),
sizeof(TermInfo), // element size (one 64-bit termid per TermInfo)
twidcmp );
QUICKPOLL(m_niceness);
/*
// set the term freq of each one
p = m_newTermInfoBuf.getBufStart();
pend = m_newTermInfoBuf.getBuf();
for ( ; p < pend ; ) {
QUICKPOLL(m_niceness);
TermInfo *ti = (TermInfo *)p;
p += sizeof(TermInfo);
// look it up
int64_t tf = g_posdb.getTermFreq (cr->m_coll,ti->m_termId64);
// store it
ti->m_termFreq64 = tf;
}
*/
// do not repeat this logic
m_newTermInfoBufValid = true;
// return the vector
return &m_newTermInfoBuf;
}
bool XmlDoc::addUniqueWordsToBuf ( SafeBuf *termInfoBuf ,
HashTableX *dedupTable ,
HashTableX *filterTable ,
HashTableX *minCountTable ,
bool storeCounts,
Words *ww ,
bool getSynonyms ) {
int32_t nw = ww->getNumWords ();
uint64_t *wids = (uint64_t *)ww->getWordIds ();
//nodeid_t *tids = ww->getTagIds ();
uint8_t *langId = getLangId();
// this should have been set by parent caller
if ( ! langId || langId == (uint8_t *)-1 ) {char *xx=NULL;*xx=0; }
// store the langId here
uint8_t useLangId = *langId;
// default that to english i guess if unknown
if ( useLangId == langUnknown ) {
static XmlDoc *s_lastPrint = NULL;
if ( s_lastPrint != this ) {
log("seopipe: langid of page is unknown for twid "
"synonyms. assuming english.");
s_lastPrint = this;
}
useLangId = langEnglish;
}
Synonyms syn;
//bool inTitle = false;
// scan for title
for ( int32_t i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// out of a link
//if(tids && tids[i] == TAG_TITLE ) inTitle = true;
//if(tids && tids[i] == (TAG_TITLE | BACKBIT)) inTitle = false;
// count it, limit to 30
//if ( inTitle ) tw++;
// skip if not alnumword
if ( ! wids[i] ) continue;
// make it 32 bit
uint32_t wid32 = (uint32_t)wids[i];
// filter table
if ( filterTable && filterTable->isInTable(&wid32) ) continue;
/*
// debug
if ( minCountTable && storeCounts ) {
int32_t wlen = ww->m_wordLens[i];
char *wptr = ww->m_words[i];
char c= wptr[wlen];
wptr[wlen] = '\0';
log("seo: storecount wid=%"UINT32" word=%s",
(uint32_t)((uint64_t)wids[i]),wptr);
wptr[wlen] = c;
}
*/
// to avoid link text anomalies, the word must have been
// repeated in another link text or a meta tag. should
// fix ibm.com from getting 'lincoln' or 'unc' as high-scoring
// matching queries. should fix artdaily.com from getting
// that foreign language phrase in danish. (bedste pa nettet)
// (best of the web)
if ( minCountTable &&
! storeCounts &&
minCountTable->getScore32(&wid32) <= 1 )
continue;
// get slot
if ( dedupTable && dedupTable->isInTable(&wid32) ) continue;
// count it!
if ( storeCounts && ! minCountTable->addTerm32(&wid32) )
return false;
// show it
//if ( wid32 == 1174583722 && storeCounts ) {
// log("seo: storing occurence. current count=%"INT32"",
// (int32_t)minCountTable->getScore32(&wid32) );
//}
// add it to vector
TermInfo ti;
ti.m_termId64 = wids[i];
//ti.m_termFreq64 = -1;
if ( termInfoBuf && !
termInfoBuf->safeMemcpy(&ti,sizeof(TermInfo)) )
return false;
// add it then
if ( dedupTable && ! dedupTable->addKey ( &wid32 ) )
return false;
// do synonyms now?
if ( ! getSynonyms ) continue;
// get its synonyms into tmpBuf
char tmpBuf[TMPSYNBUFSIZE];
int32_t naids = syn.getSynonyms(ww,i,useLangId,tmpBuf,m_niceness);
for ( int32_t j = 0 ; j < naids ; j++ ) {
// get it
uint32_t aid32 = (uint32_t)syn.m_aids[j];
// get slot
if ( dedupTable && dedupTable->isInTable(&aid32) )
continue;
// add it to vector
TermInfo ti;
ti.m_termId64 = syn.m_aids[j]; // 64 bit version
//ti.m_termFreq64 = -1;
if ( termInfoBuf &&
! termInfoBuf->safeMemcpy(&ti,sizeof(TermInfo)) )
return false;
// add it then
if ( dedupTable && ! dedupTable->addKey(&aid32) )
return false;
// count it!
if ( storeCounts && ! minCountTable->addTerm32(&aid32))
return false;
}
}
return true;
}
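// How the callers above drive addUniqueWordsToBuf() (see getTermInfoBuf()):
//   body pass:      termInfoBuf + dedup table, no min-count filtering, so
//                   every unique body termid (and synonym) is appended.
//   counting pass:  termInfoBuf=NULL, minCountTable=&TMP, storeCounts=true,
//                   which just tallies meta-tag and link-text termids.
//   link-text pass: storeCounts=false, minCountTable=&TMP, so a link-text
//                   termid is only appended if TMP counted it at least twice
//                   (termids already added by the body pass are dropped by
//                   the dedup table).
// filterTable, when supplied (see getNewTermInfoBuf()), excludes termids
// already present in another buffer.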
/*
static void gotMsg99ReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->gotMsg99Reply ( slot );
}
void XmlDoc::gotMsg99Reply ( UdpSlot *slot ) {
// get replying hostid
int32_t hostId = slot->m_hostId;
// log
setStatus ( "gotmsg99reply" );
// sanity
if ( hostId < 0 || hostId >= g_hostdb.m_numHosts) {char*xx=NULL;*xx=0;}
// save it
int32_t i = m_numMsg99Replies;
m_msg99ReplyPtrs [i] = slot->m_readBuf;
m_msg99ReplySizes[i] = slot->m_readBufSize;
m_msg99ReplyAlloc[i] = slot->m_readBufMaxSize;
m_msg99HostIds [i] = hostId;
// steal it so it doesn't free it
slot->m_readBuf = NULL;
// note it
//log("seopipe: got msg99 reply from host #%"INT32" i=%"INT32" alloc=%"INT32"",
// hostId,i,slot->m_readBufMaxSize);
// inc the counter
m_numMsg99Replies++;
// sanity!
if ( m_numMsg99Replies > m_numMsg99Requests ) { char *xx=NULL;*xx=0; }
if ( m_numMsg99Replies > g_hostdb.m_numHosts ) { char *xx=NULL;*xx=0; }
// don't free the sendbuf, it is shared between all hosts UNLESS
// we are the last reply received!!!
if ( m_numMsg99Replies < g_hostdb.m_numHosts )
slot->m_sendBufAlloc = NULL;
// return control to transmit function. it will call m_callback1
// if the function is done. but if a different parent function than
// transmit called us then we call that. it just depends on the
// intial entry function that called getMatchingQueries()
m_masterLoop ( m_masterState );
}
*/
/*
float getQueryImportance2 ( QueryInfo *qi , float myScore ) {
// now divide by the top score (or 50th score) for the query
// so we can see how high we score relatively speaking...
// although, if all search results for this query have the
// same score this method kinda sux...
float imp = myScore / qe->m_minTop50Score;
return imp;
// mod because one word query terms get higher scores than
// multi-word queries because they are divided by distance in
// the search algo.
// this hurts 'gigablast' query.
if ( qi->m_numUniqueWordForms <= 1 ) score /= 10.0;
// multiply by it?
score *= qi->m_numUniqueWordForms;
// until we have the code to fix things like 'coast to coast'
// where the term is repeated, we have to punish...
if ( qi->m_numRepeatWordForms >= 1 ) score /= 30.0;
// kill 'search+engine+search+engine'
if ( qi->m_numRepeatWordForms >= 2 ) score /= 30.0;
// if every word in query is repeated... push it down
// try to fix 'bot+bot' and 'search+search' 'http+http'
if ( qi->m_numUniqueWordForms == qi->m_numRepeatWordForms )
score /= 2000.0;
// fix 'web search search'
if ( qi->m_numRepeatWordForms > 0 &&
qi->m_numUniqueWordForms == qi->m_numRepeatWordForms + 1 )
score /= 200.0;
// try to kill those queries that are just a single stop word
// or forms of stop words.
// this hurts 'gigablast' query, so make it > .9. no, then crap like
// 'web' and 'http' come up too high...
if ( qi->m_numUniqueWordForms == 1 ) {
score *= (1.1 - qi->m_smallestNormTermFreq);
score *= (1.1 - qi->m_smallestNormTermFreq);
}
// http is very common! so make the 'http' or 'http+http' queries
// very low importance
if ( qi->m_numControlWordForms == qi->m_numUniqueWordForms )
score /= 1000000.0;
// TODO: if query is a single term and it's exact syn min
// hash is that for 'and' then kill it. fix 'anding'
// boost it for more accuracy since we gotta make it into anint
//score *= 1000;
return score;
}
// set Msg99Reply::m_queryImportance for all msg99replies
void setQueryImportance ( Msg99Reply **qptrs , int32_t numQueryPtrs ) {
}
void setQueryImportanceRelated ( QueryRel **qptrs , int32_t numQueryPtrs ) {
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
QueryRel *qrel = qptrs[i];
float score = qrel->m_queryInfo.m_myScoreRelated;
QueryInfo *qi = &qrel->m_queryInfo;
float imp = getQueryImportance2 ( qi , score );
qi->m_queryImportance = imp;
}
}
*/
/*
int qp99cmp ( const void *a, const void *b ) {
Msg99Reply *qa = *(Msg99Reply **)a;
Msg99Reply *qb = *(Msg99Reply **)b;
// make sure manually added queries are on top
if ( qa->m_isManuallyAdded && ! qb->m_isManuallyAdded ) return 1;
if ( qb->m_isManuallyAdded && ! qa->m_isManuallyAdded ) return -1;
//QueryInfo *qia = &qa->m_queryInfo;
//QueryInfo *qib = &qb->m_queryInfo;
// get scores
float scorea = qa->m_queryImportance;
float scoreb = qb->m_queryImportance;
if ( scorea < scoreb ) return 1;
if ( scorea > scoreb ) return -1;
// fallback to traffic otherwise i guess
int32_t traffica = qa->m_queryLogEntry.m_gigablastTraffic;
int32_t trafficb = qb->m_queryLogEntry.m_gigablastTraffic;
if ( qa->m_queryLogEntry.m_googleTraffic != -1 )
traffica = qa->m_queryLogEntry.m_googleTraffic;
if ( qb->m_queryLogEntry.m_googleTraffic != -1 )
trafficb = qb->m_queryLogEntry.m_googleTraffic;
if ( traffica < trafficb ) return 1;
if ( traffica > trafficb ) return -1;
// fallback alphabetical otherwise?
char *qsa = qa->m_queryStr;
char *qsb = qb->m_queryStr;
if ( ! qsa ) return 0;
if ( ! qsb ) return 0;
return strcmp( qsa , qsb );
//return 0;
}
*/
#include "Cachedb.h"
// . only check cachedb once per url
// . return false if blocked, true otherwise
// . returns true and sets g_errno on error
bool XmlDoc::checkCachedb ( ) {
if ( ! m_readFromCachedb ) return true;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// already set?
//if ( m_seoInfoSetFromCache )
// return true;
// return -1 if this blocked
if ( ! m_checkedCachedb ) {
// we now use the contenthash as part of the key because the
// data we cache is dependent on the content. i guess we don't
// need to use the user id then...
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
// first check cachedb. enum type cr_MatchingQueries
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t sk = g_cachedb.makeStartKey ( uh32 , ch32 );
key_t ek = g_cachedb.makeEndKey ( uh32 , ch32 );
// debug
log("seo: checking cachedb uh32=%"UINT32" ch32=%"UINT32"",
(uint32_t)uh32,
(uint32_t)ch32);
// do not repeat
m_checkedCachedb = true;
// . get it from the appropriate host
// . get cachedb rec for all types of safebufs for this
// url/content
// . then we will set safebufs based on what recs we find
// in the returned list
if ( ! m_msg0.getList ( -1, // hostid
0 , // ip
0 , // port
0 , // maxcacheage
false, // addtocache?
RDB_CACHEDB,
cr->m_collnum ,
&m_cacheList,
(char *)&sk ,
(char *)&ek ,
30000000, // minrecsizes 30MB
m_masterState,
m_masterLoop,
m_niceness ) )
// return FALSE if this blocks
return false;
}
if ( m_processedCachedbReply ) return true;
// only scan list once
m_processedCachedbReply = true;
// if empty, that was easy
if ( m_cacheList.isEmpty() ) return true;
// we might have one rec set from cache and another not, and we
// still want to cache the one that is not in storeIntoCachedb()!
//m_seoInfoSetFromCache = true;
// otherwise, parse out the cache recs
for ( ; ! m_cacheList.isExhausted() ; m_cacheList.skipCurrentRec() ) {
// breathe
QUICKPOLL(m_niceness);
// get it
char *rec = m_cacheList.getCurrentRec();
// . get type of cached rec
// . enum types cr_MatchingQueries etc. as in Cachedb.h
char recType = g_cachedb.getTypeFromKey(rec);
int32_t dataSize = m_cacheList.getCurrentDataSize();
// sanity. must at least have the cached date
if ( dataSize < 4 ) { char *xx=NULL;*xx=0; }
char *data = m_cacheList.getCurrentData ();
// in data, first int32_t is the cached time in utc
//int32_t cachedDate = *(int32_t *)data;
// skip the TIMESTAMP!
//int32_t timestamp = *(int32_t *)data;
data += 4;
dataSize -= 4;
// and version
data += 4;
dataSize -= 4;
// . 1
// . is it a cached rec for matching queries?
// . getSeoQueryInfo() needs this
if (recType == cr_MatchingQueries && !m_matchingQueryBufValid){
// debug
log("seo: found matching queries");
// total size of the msg99replies (totalMsg99ReplySize)
int32_t size1 = *(int32_t *)data;
data += 4;
// just point into the list itself. we will
// free m_cacheList on reset then.
m_matchingQueryBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
data += size1;
// now the m_matchingQueryStringBuf
size1 = *(int32_t *)data;
data += 4;
m_matchingQueryStringBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding
data += size1;
m_matchingQueryBufValid = true;
continue;
}
// . 2
// . is it a cached rec for related docids with titles?
// . getSeoQueryInfo() calls getRelatedDocIdsWithTitles()
// . m_relatedDocIdBuf SafeBuf is a buf of RelatedDocId classes
// . m_relatedTitleBuf is buf of titles and urls referenced
// by those classes
if ( recType == cr_RelatedDocIds &&
! m_relatedDocIdsWithTitlesValid ) {
// debug
log("seo: found related docids");
// first is the safebuf of RelatedDocId classes
int32_t size1 = *(int32_t *)data;
data += 4;
// point into it
//char *p = data;
//char *pend = data + size1;
// just point into the list itself. we will
// free m_cacheList on reset then.
m_relatedDocIdBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// skip that
data += size1;
size1 = *(int32_t *)data;
data += 4;
// save this
//char *rtbuf = data;
// now the string buffer
m_relatedTitleBuf.setBuf ( data ,
size1 ,
size1 ,
false ,
0 );
// skip that
data += size1;
size1 = *(int32_t *)data;
data += 4;
// now the string buffer
m_commonQueryNumBuf.setBuf ( data ,
size1 ,
size1 ,
false ,
0 );
// now the RelatedDocId::ptr_url/ptr_rd_title members
// were hacked to be offsets into this for storage
// into the cache!
/*
for ( ; p < pend ; p += sizeof(RelatedDocId) ) {
QUICKPOLL(m_niceness);
// cast it
RelatedDocId *rd = (RelatedDocId *)p;
// get offsets
int32_t off1 = (int32_t)rd->ptr_rd_title;
int32_t off2 = (int32_t)rd->ptr_rd_url;
int32_t off3 = (int32_t)rd->ptr_rd_site;
// normalize/store back
rd->ptr_rd_title = rtbuf + off1;
rd->ptr_rd_url = rtbuf + off2;
rd->ptr_rd_site = rtbuf + off3;
}
*/
m_relatedDocIdsWithTitlesValid = true;
m_relatedTitleBufValid = true;
m_relatedDocIdBufValid = true;
continue;
}
// . 3
// . is it a cached rec for related queries?
// . getSeoQueryInfo() calls getRelatedQueryBuf()
if ( recType == cr_RelatedQueries && ! m_queryLinkBufValid ) {
// we changed the format of relatedquerystringbuf
// to be a bunch of QueryLogEntries now. so ignore
// if old format.
//if ( timestamp <= 1367704324 ) continue;
// debug
log("seo: found related queries");
int32_t size1;
// first is the m_relatedQueryBuf safebuf
size1 = *(int32_t *)data;
data += 4;
m_relatedQueryBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
data += size1;
// now the m_relatedQueryStringBuf
size1 = *(int32_t *)data;
data += 4;
m_relatedQueryStringBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding
data += size1;
/*
// now the ptrs, sorted
size1 = *(int32_t *)data;
data += 4;
m_relPtrs.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// test sorting
char *p = m_relPtrs.getBufStart();
char *pend = m_relPtrs.getBuf();
char *base = m_queryLinkBuf.getBufStart();
QueryLink *lastqr = NULL;
for ( ; p < pend ; p += 4 ) {
QUICKPOLL(m_niceness);
int32_t qkOff = *(int32_t *)p;
QueryLink *qr = (QueryRel *)(base+qkOff);
// no, longer, it is more complicated because
// if m_uniqueRound scoring addition
//if ( lastqr &&
// lastqr->m_totalRelatedQueryImportance <
// qr ->m_totalRelatedQueryImportance ) {
// char *xx=NULL;*xx=0;}
lastqr = qr;
}
*/
// validate
//m_relPtrsValid = true;
//m_queryLinkStringBufValid = true;
m_relatedQueryBufValid = true;
continue;
}
// if it is debug and we are not, skip it!!
//if(recType == cr_ScoredInsertableTermsDebug && ! m_seoDebug )
// continue;
// or if we are debug and it is not, skip it!
//if (recType == cr_ScoredInsertableTerms && m_seoDebug )
// continue;
/*
if ( (recType == cr_MissingTermBuf ) &&
! m_missingTermBufValid ) {
// debug
log("seo: found missingtermbuf");
int32_t size1;
size1 = *(int32_t *)data;
data += 4;
m_missingTermBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
m_missingTermBufValid = true;
}
*/
// 3b
if ( (recType == cr_WordPosInfoBuf ) &&
! m_wordPosInfoBufValid ) {
// debug
log("seo: found wordposinfo");
int32_t size1;
size1 = *(int32_t *)data;
data += 4;
m_wordPosInfoBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// WordPosInfo::m_wordPtr was stored as an offset relative to ptr_utf8Content
char *p = m_wordPosInfoBuf.getBufStart();
char *pend = m_wordPosInfoBuf.getBuf();
for ( ; p < pend ; p += sizeof(WordPosInfo) ) {
QUICKPOLL(m_niceness);
WordPosInfo *wp = (WordPosInfo *)p;
int64_t off = (int64_t)wp->m_wordPtr;
char *ptr = ptr_utf8Content + off;
if ( off == -1 ) ptr = NULL;
wp->m_wordPtr = ptr;
}
m_wordPosInfoBufValid = true;
}
// . 4
// . and the insertable terms buffer with its querychanges
// linked lists!
if ( recType == cr_ScoredInsertableTerms &&
! m_scoredInsertableTermsBufValid ) {
// debug
log("seo: found scored insertable terms");
int32_t size1;
// first is the m_insertableTermsBuf safebuf
size1 = *(int32_t *)data;
data += 4;
// just point into the list itself. we will
// free m_cacheList on reset then.
m_insertableTermsBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// skip that
data += size1;
size1 = *(int32_t *)data;
data += 4;
// now the buffer of query changes
// these are normally just referenced by
// InsertableTerm and in the linked list directly
// into the Msg95Reply::ptr_queryChanges, but for
// caching we have to use a new safebuf
m_queryChangeBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// skip that
data += size1;
size1 = *(int32_t *)data;
data += 4;
m_queryLogBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
/*
// skip that
data += size1;
size1 = *(int32_t *)data;
data += 4;
m_itStrBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
*/
/*
// debug scoring. QueryChange::m_debugScoreInfoOffset
data += size1;
size1 = *(int32_t *)data;
data += 4;
m_debugScoreInfoBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
// debug scoring. QueryChange::m_origScoreInfoOffset
data += size1;
size1 = *(int32_t *)data;
data += 4;
m_origScoreInfoBuf.setBuf ( data ,
size1 , // size
size1 , // allocated
false , // owndata?
0 ); // encoding none
*/
// insertable terms deserialization logic
char *p = m_insertableTermsBuf.getBufStart();
char *pend = m_insertableTermsBuf.getBuf();
for ( ; p < pend ; ) {
QUICKPOLL(m_niceness);
// cast it
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// normalize m_firstQueryChange
int64_t off =(int64_t)(it->m_firstQueryChange);
// fix this
char *buf = m_queryChangeBuf.getBufStart();
				// shortcut
QueryChange *fqc = (QueryChange *)(buf+off);
// -1 means NULL
if ( off == -1 ) fqc = NULL;
// put back
it->m_firstQueryChange = fqc;
// terms
//off = (int32_t)it->m_termStr;
// to this
//buf = m_itStrBuf.getBufStart();
// cast it
//it->m_termStr = (char *)(buf+off);
}
// . now we set QueryChange::m_next and
// InsertableTerm::m_firstQueryChange to be offsets
// into the new m_queryChangeBuf before we stored
// into the cache....
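			// . e.g. a stored m_next of 96 means "the QueryChange
			//   at m_queryChangeBuf.getBufStart() + 96", and a
			//   stored -1 means NULL / end of the list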
p = m_queryChangeBuf.getBufStart();
pend = m_queryChangeBuf.getBuf();
for ( ; p < pend ; p += sizeof(QueryChange) ) {
QUICKPOLL(m_niceness);
// cast it
QueryChange *qc = (QueryChange *)p;
// normalize m_next
int64_t off = (int64_t)qc->m_next;
// offset into this
char *buf = m_queryChangeBuf.getBufStart();
// put back
qc->m_next = (QueryChange *)(buf + off);
// -1 means NULL
if ( off == -1 ) qc->m_next = NULL;
}
// now all ptrs should be set correctly
m_scoredInsertableTermsBufValid = true;
m_insertableTermsBufValid = true;
continue;
}
		// . is it a cached rec for the recommended links buf?
		// . m_recommendedLinksBuf is restored here exactly as it was
		//   serialized by storeRecommendedLinksBuf() below
if ( recType == cr_RecommendedLinks &&
! m_recommendedLinksBufValid ) {
// debug
log("seo: found recommended links buf");
			// first is the size of the recommended links buf
int32_t size1 = *(int32_t *)data;
data += 4;
			// now the recommended links buf itself
m_recommendedLinksBuf.setBuf ( data ,
size1 ,
size1 ,
false ,
0 );
m_recommendedLinksBufValid = true;
continue;
}
}
return true;
}
#define CACHEDB_CURRENT_VERSION 1
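// . each store*IntoCachedb() function below writes a single cachedb record
//   with (roughly) this layout, and the cachedb-reading code above parses
//   it back out the same way:
//
//     key_t   key        // g_cachedb.makeKey ( uh32 , ch32 , recType )
//     int32_t dataSize   // size of everything after this field
//     int32_t cachedTime // getTimeGlobal() when the record was stored
//     int32_t version    // CACHEDB_CURRENT_VERSION
//     [ int32_t len , <len bytes> ]*  // one (length,data) pair per SafeBuf
//
// . pointers inside the serialized SafeBufs (QueryChange::m_next,
//   InsertableTerm::m_firstQueryChange, WordPosInfo::m_wordPtr, ...) are
//   converted into offsets (-1 for NULL) before storing and converted back
//   into pointers on load, since the bufs land at different addresses then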
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . adds the list via m_msg1 and calls m_masterLoop back when it completes
bool XmlDoc::storeMatchingQueriesIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
// all these things should already be validated so they should
// not block or have errors
//SafeBuf *qpbuf = getMatchingQueriesScored();
//SafeBuf *qpbuf = &m_queryPtrs;
if ( ! m_matchingQueryBufValid ) { char *xx=NULL;*xx=0; }
int32_t now = getTimeGlobal();
// calc how much space we need
//int32_t totalMsg99ReplySize = 0;
//int32_t numQueryPtrs = 0;
//Msg99Reply **qptrs = NULL;
// 1. msg99replies for matchingQueries
int32_t need = 0;
need += sizeof(key_t) + 4 + 4+4; // key + dataSize+cacheDate(now)+ver
need += 4 + m_matchingQueryBuf.length();
need += 4 + m_matchingQueryStringBuf.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: mq listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
//
// 1. first add the matching queries, msg99 replies
//
k = g_cachedb.makeKey ( uh32, ch32 , cr_MatchingQueries );
// note it
log("seo: cachedb storing matchingqueries "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_matchingQueryBuf.length();
dataSize += 4 + m_matchingQueryStringBuf.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_matchingQueryBuf.length() );
listBuf.safeMemcpy ( &m_matchingQueryBuf );
listBuf.pushLong ( m_matchingQueryStringBuf.length() );
listBuf.safeMemcpy ( &m_matchingQueryStringBuf );
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding matching query list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
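// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error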
bool XmlDoc::storeRelatedDocIdsIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
if ( ! m_queryPtrsWholeValid ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
if ( ! m_relatedDocIdsWithTitlesValid ) { char *xx=NULL;*xx=0;}
if ( ! m_relatedTitleBufValid ) { char *xx=NULL;*xx=0;}
// 2. related docids
int32_t need = 0;
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
need += 4 + m_relatedDocIdBuf.length();
need += 4 + m_relatedTitleBuf.length();
need += 4 + m_commonQueryNumBuf.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: rd listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
char *p1;
char *p2;
int32_t now = getTimeGlobal();
// 2. then add related docids
k = g_cachedb.makeKey ( uh32 , ch32, cr_RelatedDocIds );
// note it
log("seo: cachedb storing relateddocids "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_relatedDocIdBuf.length();
dataSize += 4 + m_relatedTitleBuf.length();
dataSize += 4 + m_commonQueryNumBuf.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_relatedDocIdBuf.length() );
p1 = listBuf.getBuf();
listBuf.safeMemcpy ( &m_relatedDocIdBuf );
p2 = listBuf.getBuf();
listBuf.pushLong ( m_relatedTitleBuf.length() );
listBuf.safeMemcpy ( &m_relatedTitleBuf );
//char *tbuf = m_relatedTitleBuf.getBufStart();
listBuf.pushLong ( m_commonQueryNumBuf.length() );
listBuf.safeMemcpy ( &m_commonQueryNumBuf );
// make ptrs into offsets into m_relatedTitleBuf
/*
for ( ; p1 < p2 ; p1 += sizeof(RelatedDocId )) {
QUICKPOLL(m_niceness);
RelatedDocId *rd = (RelatedDocId *)p1;
int32_t off;
off = rd->ptr_rd_url - tbuf;
rd->ptr_rd_url = (char *)off;
off = rd->ptr_rd_title - tbuf;
rd->ptr_rd_title = (char *)off;
off = rd->ptr_rd_site - tbuf;
rd->ptr_rd_site = (char *)off;
}
*/
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding related docids list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
bool XmlDoc::storeRecommendedLinksBuf ( ) {
if ( ! m_writeToCachedb ) return true;
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
if ( ! m_recommendedLinksBufValid ) { char *xx=NULL;*xx=0;}
int32_t need = 0;
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
need += 4 + m_recommendedLinksBuf.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: reclnx listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
int32_t now = getTimeGlobal();
// 2. then add related docids
k = g_cachedb.makeKey ( uh32 , ch32, cr_RecommendedLinks );
// note it
log("seo: cachedb storing recommendedlinksbuf "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_recommendedLinksBuf.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_recommendedLinksBuf.length() );
listBuf.safeMemcpy ( &m_recommendedLinksBuf );
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding recommendedlinksbuf list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
bool XmlDoc::storeRelatedQueriesIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
if ( ! m_relatedQueryBufValid ) { char *xx=NULL;*xx=0; }
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
//SafeBuf *relBuf = NULL;
//if ( m_relPtrsValid ) relBuf = &m_relPtrs;
int32_t now = getTimeGlobal();
// calc how much space we need
int32_t need = 0;
// 3. related queries. buf of QueryLinks
need += sizeof(key_t) + 4 + 4 +4; // key + dataSize + timestamp + ver
need += 4 + m_relatedQueryBuf.length();
need += 4 + m_relatedQueryStringBuf.length();
//need += 4 + m_relPtrs.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: rq listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
//
// 3. then related queries (STORED by m_queryImportanceRelated)
//
//int32_t sizeRels = (m_relPtrs.length() / 4) * sizeof(QueryLink);
k = g_cachedb.makeKey ( uh32 , ch32, cr_RelatedQueries );
// note it
log("seo: cachedb storing relatedqueries "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_relatedQueryBuf.length(); // sizeRels;
dataSize += 4 + m_relatedQueryStringBuf.length();
//dataSize += 4 + m_relPtrs.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_relatedQueryBuf.length() );
//char *p3 = listBuf.getBuf();
listBuf.safeMemcpy ( &m_relatedQueryBuf );
//char *p4 = listBuf.getBuf();
listBuf.pushLong ( m_relatedQueryStringBuf.length() );
listBuf.safeMemcpy ( &m_relatedQueryStringBuf );
//listBuf.pushLong ( m_relPtrs.length() );
//char *p5 = listBuf.getBuf();
//listBuf.safeMemcpy ( &m_relPtrs );
// sanity tests
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding related queries list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
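// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error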
bool XmlDoc::storeWordPosInfoBufIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
if ( ! m_wordPosInfoBufValid ) { char *xx=NULL;*xx=0; }
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
int32_t now = getTimeGlobal();
// calc how much space we need
int32_t need = 0;
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
need += 4 + m_wordPosInfoBuf.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: wpi listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
	// 3b. the word position info buf (read back by the cr_WordPosInfoBuf
	// branch of the cachedb reader above)
uint8_t cr8 = cr_WordPosInfoBuf;
k = g_cachedb.makeKey ( uh32 , ch32, cr8 );
// note it
log("seo: cachedb storing wordposinfobuf "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_wordPosInfoBuf.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_wordPosInfoBuf.length() );
char *p8 = listBuf.getBuf();
listBuf.safeMemcpy ( &m_wordPosInfoBuf );
char *p9 = listBuf.getBuf();
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
	// store WordPosInfo::m_wordPtr as an offset relative to the html
	// ptr_utf8Content
for ( ; p8 < p9 ; p8 += sizeof(WordPosInfo) ) {
QUICKPOLL(m_niceness);
WordPosInfo *wp = (WordPosInfo *)p8;
int64_t off = wp->m_wordPtr - ptr_utf8Content;
		// if it's a tag or fielded term like ext:html or
		// filetype:html it won't be in the html itself
if ( wp->m_wordPtr< ptr_utf8Content )
off = -1;
if ( wp->m_wordPtr>=ptr_utf8Content +size_utf8Content)
off = -1;
wp->m_wordPtr = (char *)off;
}
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding wordposinfobuf list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
/*
bool XmlDoc::storeMissingTermBufIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
if ( ! m_missingTermBufValid ) { char *xx=NULL;*xx=0; }
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
int32_t now = getTimeGlobal();
// calc how much space we need
int32_t need = 0;
need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver
need += 4 + m_missingTermBuf.length();
// sanity
if ( need > 20000000 ) {
log("cachedb: wpi listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
// add 1 byte padding to ensure copying a 0 byte buf to listBuf
// does not trigger a reserve
if ( ! listBuf.reserve ( need + 4 ) ) return true;
// ensure no reallocating - that would screw logic below up
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
// 4. then the insertable terms and their query changes and log buf
// mangle key a little if in debug mode because that is the only
// time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf
uint8_t cr = cr_MissingTermBuf;
k = g_cachedb.makeKey ( uh32 , ch32, cr );
// note it
log("seo: cachedb storing missingtermbuf "
"uh32=%"UINT32" ch32=%"UINT32"",uh32,ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_missingTermBuf.length();
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
listBuf.pushLong ( m_missingTermBuf.length() );
listBuf.safeMemcpy ( &m_missingTermBuf );
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding missingtermbuf list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
*/
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
// . adds the list via m_msg1 and calls m_masterLoop back when it completes
bool XmlDoc::storeScoredInsertableTermsIntoCachedb ( ) {
if ( ! m_writeToCachedb ) return true;
if ( ! m_scoredInsertableTermsBufValid ) return true;
int32_t *ch32p = getContentHash32();
if ( ! ch32p ) return true;
if ( ch32p == (void *)-1 ) return false;
int32_t ch32 = *ch32p;
// include spider date now in case indexed copy changes
// site rank, tags, etc.
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
CollectionRec *cr = getCollRec();
if ( ! cr ) return true;
int32_t now = getTimeGlobal();
// calc how much space we need
int32_t need = 0;
need += sizeof(key_t) + 4 + 4 +4; // key + dataSize + timestamp + ver
need += 4 + m_insertableTermsBuf.length();
	// InsertableTerm::m_firstQueryChange:
need += 4 + m_queryChangeBuf.length();
	// QueryChange::m_replyQueryOffset:
need += 4 + m_queryLogBuf.length();
//InsertableTerm::m_termStr reference
//need += 4 + m_itStrBuf.length();
//need += 4 + m_wordPosInfoBuf.length();
	// TOO BIG to store into cachedb!
//need += 4 + m_debugScoreInfoBuf.length(); // debug only
//need += 4 + m_origScoreInfoBuf.length(); // debug only
// sanity
if ( need > 20000000 ) {
log("cachedb: listsize %"INT32" too big for cachedb",need);
return true;
}
SafeBuf listBuf;
	// add a few bytes of padding so that copying a 0 byte buf into
	// listBuf below does not trigger a reserve
	if ( ! listBuf.reserve ( need + 4 ) ) return true;
	// ensure listBuf never reallocs - that would screw up the logic below
char *orig = listBuf.getBufStart();
int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
key_t k;
int32_t dataSize = 0;
char *p1;
char *p2;
// 4. then the insertable terms and their query changes and log buf
// mangle key a little if in debug mode because that is the only
// time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf
uint8_t cr8 = cr_ScoredInsertableTerms;
//if ( m_seoDebug ) cr = cr_ScoredInsertableTermsDebug;
k = g_cachedb.makeKey ( uh32 , ch32, cr8 );
// note it
log("seo: cachedb storing scoredinsertableterms "
"uh32=%"UINT32" ch32=%"UINT32""
,(uint32_t)uh32,(uint32_t)ch32);
listBuf.safeMemcpy ( &k , sizeof(key_t) );
dataSize = 0;
dataSize += 4; // timestamp
dataSize += 4; // version
dataSize += 4 + m_insertableTermsBuf.length();
dataSize += 4 + m_queryChangeBuf.length();
dataSize += 4 + m_queryLogBuf.length();
//dataSize += 4 + m_itStrBuf.length();
//dataSize += 4 + m_wordPosInfoBuf.length();
//dataSize += 4 + m_debugScoreInfoBuf.length(); // debug only
//dataSize += 4 + m_origScoreInfoBuf .length(); // debug only
listBuf.pushLong ( dataSize );
listBuf.pushLong ( now ); // cached date
listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION );
// m_insertableTermsBuf
listBuf.pushLong ( m_insertableTermsBuf.length() );
p1 = listBuf.getBuf();
listBuf.safeMemcpy ( &m_insertableTermsBuf );
char *p1End = listBuf.getBuf();
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// m_queryChangeBuf
listBuf.pushLong ( m_queryChangeBuf.length() );
p2 = listBuf.getBuf();
listBuf.safeMemcpy ( &m_queryChangeBuf );
char *p2End = listBuf.getBuf();
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// m_queryLogBuf
listBuf.pushLong ( m_queryLogBuf.length() );
listBuf.safeMemcpy ( &m_queryLogBuf );
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// m_itStrBuf referenced by InsertableTerm::m_termStr
//listBuf.pushLong ( m_itStrBuf.length() );
//listBuf.safeMemcpy ( &m_itStrBuf );
//if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// m_itStrBuf referenced by InsertableTerm::m_termStr
//listBuf.pushLong ( m_wordPosInfoBuf.length() );
//char *p8 = listBuf.getBuf();
//listBuf.safeMemcpy ( &m_wordPosInfoBuf );
//char *p9 = listBuf.getBuf();
//if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// debug buffers, QueryChange::m_*Offset parms ref them if
// m_seoDebug is true. TOO BIG TO STORE INTO CACHEDB!
//listBuf.pushLong ( m_debugScoreInfoBuf.length() );
//listBuf.safeMemcpy ( &m_debugScoreInfoBuf );
//listBuf.pushLong ( m_origScoreInfoBuf.length() );
//listBuf.safeMemcpy ( &m_origScoreInfoBuf );
	// convert the InsertableTerm::m_firstQueryChange pointers into
	// offsets into m_queryChangeBuf (-1 means NULL)
for ( ; p1 < p1End ; ) { // p1 += sizeof(InsertableTerm) ) {
QUICKPOLL(m_niceness);
InsertableTerm *it = (InsertableTerm *)p1;
p1 += it->getSize();
QueryChange *qc = it->m_firstQueryChange;
int64_t qoff =(char *)qc - m_queryChangeBuf.getBufStart();
if ( qc == NULL ) qoff = -1;
it->m_firstQueryChange = (QueryChange *)qoff;
// and m_termStr
//int32_t off = it->m_termStr - m_itStrBuf.getBufStart();
//it->m_termStr = (char *)off;
}
// make QueryChange::m_next ptrs into offsets as well
for ( ; p2 < p2End ; p2 += sizeof(QueryChange) ) {
QUICKPOLL(m_niceness);
QueryChange *qc = (QueryChange *)p2;
QueryChange *next = qc->m_next;
int64_t noff =(char *)next-m_queryChangeBuf.getBufStart();
if ( next == NULL ) noff = -1;
qc->m_next = (QueryChange *)noff;
}
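	// . the cr_ScoredInsertableTerms branch of the cachedb reader above
	//   undoes these two loops on load, turning the stored offsets back
	//   into pointers (and -1 back into NULL)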
	// WordPosInfo::m_wordPtr relative to html ptr_utf8Content!
/*
for ( ; p8 < p9 ; p8 += sizeof(WordPosInfo) ) {
QUICKPOLL(m_niceness);
WordPosInfo *wp = (WordPosInfo *)p8;
int32_t off = wp->m_wordPtr - ptr_utf8Content;
// if its a tag or fielded term it won't be in the
// html like ext:html or filetype:html
if ( wp->m_wordPtr< ptr_utf8Content )
off = -1;
if ( wp->m_wordPtr>=ptr_utf8Content +size_utf8Content)
off = -1;
wp->m_wordPtr = (char *)off;
}
*/
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// ensure list did not realloc, that would screw up everything!
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 );
key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 );
// . list is ready now
// . this only returns when each record has been added
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
m_storeList.printList();
QUICKPOLL(m_niceness);
log("xmldoc: adding insertable terms list of %"INT32" bytes to cachedb",
m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
return false;
return true;
}
#define MAX_TOP_MATCHING_QUERIES 300
/*
// returns -1 if blocked, NULL with g_errno set on error
SafeBuf *XmlDoc::getMatchingQueriesScored ( ) {
setStatus ( "getmatchingqueriesscored" );
// try to set m_queryPtrs from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
// just re-use the same m_queryPtrs SafeBuf we used above but we
// set the Msg99Reply::m_myScore here and sort them by that
if ( m_queryPtrsSortedValid )
return &m_queryPtrs;
// get the queries from msg99 replies first
SafeBuf *mq = getMatchingQueries(false,-1);
if ( mq == NULL || mq == (void *)-1 ) return mq;
// time it
if ( ! m_beginTimeMatchUrl )
m_beginTimeMatchUrl = gettimeofdayInMilliseconds();
	// i'm assuming this is query ptrs!?!?!
int32_t numQueryPtrs = mq->length() / sizeof(Msg99Reply *);
// get the qptrs
Msg99Reply **qptrs = (Msg99Reply **)mq->getBufStart();
// score them in parallel over all hosts in network
if ( ! scoreDocIdRestrictedQueries ( qptrs,NULL,numQueryPtrs) )
return (SafeBuf *)-1;
// error?
if ( g_errno ) return NULL;
// total pages indexed!
int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
// take 25% of that. i think 'the', the most common term, is in about
// 25% of those pages
numPagesIndexed /= 4;
//
// SET QUERY IMPORTANCE
//
// . set the m_queryImportance float and sort by that
// . how important is the matching query for the main url?
// . just divide the main url's score by the
// QueryLogEntry::m_mintop50Score for the query to normalize it
// . however, when we compute RelatedDocId::m_dotProduct we normalize
// using the score of the #1 result because we executed the full
// query, so keep that in mind. we can't mix the two.
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
Msg99Reply *qp = qptrs[i];
		// shortcut
QueryLogEntry *qe = &qp->m_queryLogEntry;
// get # results
int64_t numResults = qe->m_numTotalResultsInSlice;
// fix it to be global
numResults *= (int64_t)g_hostdb.getNumGroups();
// big indexes did the "slice logic" restricting docid
// range to MAX_DOCID * .10 when setting this!
if ( numPagesIndexed > 10000000 ) numResults *= 10;
// point to query
char *qstr = qp->m_queryStr;
// if not processed assume like 1M?
if ( numResults < 0 ) {
log("seo: guessing query importance for '%s' from "
"hostid #%"INT32"",
qstr,(int32_t)qp->m_replyingHostId);
qp->m_queryImportance = 0.0;
continue;
}
// zero means make it 1 to avoid div by zero below
if ( numResults == 0 ) numResults = 1;
// and also weight by traffic! the more traffic the
// more important perhaps...
// NO! with this we get 'www' 'view' etc for
// jezebelgallery.com coming up in the top 50 matching
// queries by importance. crap, but it hurts cheatcodes.com
// then.
// fix
//if ( strcmp(qstr,"search engine") == 0 )
// log("poo");
// adjust since numPagesIndexed is actually a quarter of
// the # of pages indexed since 'the' is only in about
// 1/4 of the pages and it is the most common term
if ( numResults > numPagesIndexed )
numResults = numPagesIndexed;
// try doubling this to get rid of www problem for
// jezebelgallery.com. it put www and view down some more.
float popRatio = (float)numResults / (float)numPagesIndexed;
// stuff like 'www' and 'view' will be near 1.0
float weight = 1.0 - popRatio;//(popRatio * popRatio);
// go crazy
weight *= weight;
weight *= weight;
weight *= weight;
weight *= weight;
// do not let this be 1.0 because 'web page searching' is
// getting 1.0 for it and getting a weight of 0.0 and making
// it the same as the ignored matching queries for
// gigablast.com, so we end up using the ignored common
// word matching queries for getting competitor pages and it
// is bad! we need to fix that to not use such queries if
// their importance is 0!
if ( weight < .01 ) weight = .01;
// because you are in the top 50
//numResults = (int32_t)powf ( (float)numResults , .4 );
//if ( numResults == 0 )
// imp /= 1;
// otherwise, normalize by division
//else
// imp /= numResults;
// boost it!
//imp *= 10000;
//QueryInfo *qi = &qp->m_queryInfo;
//float imp = getQueryImportance2 ( qi , score );
// just try sorting by your serp score, hopefully we remove
		// shit like 'www' because isCommonQueryWordInEnglish()
// takes care of it below.
// consider *= weight on this
// the idea is to ignore the top serp score because
// you do not want terms that you may be able to be #1
// for but are not really relevant for your doc. so for this
// let's focus on just getting the queries that best represent
// your doc...
double imp = qp->m_myScore * weight;
qp->m_queryImportance = (float)imp;
// just use this!!!
//qp->m_queryImportance = qp->m_myScore /
// (float)(numResults*numResults);
// set importance to 0 for queries with minus sign in them
// that indicates negative terms...
for ( char *p = qstr; *p ; p++ ) {
if ( *p != ' ' ) continue;
if ( p[1] != '-' ) continue;
// 'a - b' is ok
if ( p[2] == ' ' ) continue;
qp->m_queryImportance = 0.00;
log("seo: ignoring query '%s' with minus sign", qstr);
break;
}
// avoid common queries with just common words in them:
// http web www com org us we 1 2 3 by on i https one page
Words ww;
ww.set3 ( qstr );
int32_t i; for ( i = 0 ; i < ww.m_numWords ; i++ ) {
int64_t wid = ww.m_wordIds[i];
if ( wid == 0 ) continue;
if ( ! isCommonQueryWordInEnglish ( wid ) ) break;
}
if ( i >= ww.m_numWords ) {
qp->m_queryImportance = 0.00;
log("seo: ignoring common query '%s'", qstr);
}
// skip debug for now
if ( ! m_seoDebug ) continue;
// note it
log("seo: "
"imp=%f "
"numresults=%"INT64" "
"numpagesindexed=%"INT64" "
"popweight=%f "
"myscore=%f "
"topscore=%f "
"qstr=%s",
qp->m_queryImportance,
numResults,
numPagesIndexed,
weight,
qp->m_myScore,
qe->m_topSERPScore,
qstr);
}
// let's sort them first
qsort ( qptrs ,
numQueryPtrs ,
sizeof(Msg99Reply *),
qp99cmp );
// log for debug
int32_t maxk = numQueryPtrs;
// limit to logging 300 to avoid log spam
if ( maxk > MAX_TOP_MATCHING_QUERIES )
maxk = MAX_TOP_MATCHING_QUERIES; // 300;
// limit to top 300 dammit, otherwise we can't store all
// into cachedb!!!
int32_t newLen = maxk * sizeof(Msg99Reply *);
m_queryPtrs.setLength ( newLen );
for ( int32_t k = 0 ; k < maxk ; k++ ) {
Msg99Reply *kp = qptrs[k];
log("seopipe: newquery=\"%s\" myscore=%f imp=%f",
kp->m_queryStr,
kp->m_myScore,
kp->m_queryImportance);
}
// time it
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginTimeMatchUrl;
log("seopipe: time: matchingscoredqueries took %"INT64" ms",took);
m_queryPtrsSortedValid = true;
if ( ! storeMatchingQueriesIntoCachedb() )
// return -1 if it blocked and wait for store to complete
return (SafeBuf *)-1;
return mq;
}
*/
static void gotMsg3aReplyForFullQueryWrapper ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->setStatus ( "gotmsg3areplyforfullquerywrapper" );
THIS->gotMsg3aReplyForFullQuery();
// . go back to the main entry function
// . make sure g_errno is clear from a msg3a g_errno before calling
// this lest it abandon the loop
THIS->m_masterLoop ( THIS->m_masterState );
}
/*
void XmlDoc::gotMsg3aReplyForFullQueryCached ( char *cachedRec ,
Msg99Reply *qp ) {
// try again for next guy
m_triedCache = false;
char *p = cachedRec;
// # docids
int32_t numDocIds = *(int32_t *)p;
p += 4;
// total # results
int32_t numTotalResults = *(int32_t *)p;
p += 4;
// docids
int64_t *docIds = (int64_t *)p;
p += 8 * numDocIds;
// scores
float *scores = (float *)p;
p += sizeof(float) * numDocIds;
// site hashes
int32_t *siteHashes = (int32_t *)p;
p += 4 * numDocIds;
// store score info into this class
TopDocIds *td = qp->m_topDocIds;
// store reply info, like # docids, in the query ptr
int32_t max = numDocIds;
if ( max > (int32_t)NUM_TOP_RESULTS ) max = (int32_t)NUM_TOP_RESULTS;
td->m_numDocIds = max;
// count replies
m_numMsg3aReplies++;
// log to log as well
char tmp[50000];
p = tmp;
p += sprintf(p,
"seopipe: got full results CACHED "
"qrynum=%"INT32"of%"INT32" docids=%"INT32" "
"query=\"%s\" ",
m_numMsg3aReplies,//m_msg3a->m_hackQNum,
m_maxFullQueries ,
td->m_numDocIds,
qp->m_queryStr );
// log each docid
for ( int32_t i = 0 ; i < max ; i++ ) {
//float score = m_msg3a->getScores()[i];
int64_t d = docIds[i];
//int32_t sh32 = m_msg3a->getSiteHash32(i);
p += sprintf(p,"d%"INT32"=%"INT64" ",i,d);
}
log(tmp);
	// shortcut. pumpSocket() sends the contents of this to m_seoSocket
SafeBuf *sb = &m_socketWriteBuf;
sb->safePrintf(
"\t<seoQueryScoreInfo>\n"
"\t\t<queryNum>%"INT32"</queryNum>\n"
"\t\t<numTotalEstimatedSearchResults>%"INT32""
"</numTotalEstimatedSearchResults>\n"
"\t\t<numDocIds>%"INT32"</numDocIds>\n"
, m_msg3a->m_hackQNum
, numTotalResults
, numDocIds
);
// print the top 50 scores
for ( int32_t i = 0 ; i < max ; i++ ) {
float score = scores[i];
int64_t d = docIds[i];
int32_t sh32 = siteHashes[i];
sb->safePrintf("\t\t<searchResult>\n");
sb->safePrintf("\t\t\t<rank>%"INT32"</rank>\n",i+1);
sb->safePrintf("\t\t\t<score>%f</score>\n",score);
sb->safePrintf("\t\t\t<docId>%"INT64"</docId>\n",d);
sb->safePrintf("\t\t\t<siteHash32>%"UINT32"</siteHash32>\n",sh32);
sb->safePrintf("\t\t</searchResult>\n");
// store results for this Msg99Reply
td->m_topDocIds[i] = d;
td->m_topScores[i] = score;
td->m_topSiteHashes[i] = sh32;
}
// reset rest so it prints pretty on gdb debug print cmd
for ( int32_t i = max ; i < (int32_t)NUM_TOP_RESULTS ; i++ ) {
td->m_topDocIds[i] = 0LL;
td->m_topScores[i] = 0.0;
td->m_topSiteHashes[i] = 0;
}
sb->safePrintf("\t</seoQueryScoreInfo>\n");
// pump m_socketWriteBuf to m_seoSocket
pumpSocketWriteBuf ( );
}
*/
// . this is the msg3a reply for related docids only
// . the full replies we get for determining ranks from scores for the
//   HTML simulator are handled in seo.cpp using State95::m_msg3a.
void XmlDoc::gotMsg3aReplyForFullQuery ( ) {
int32_t err = g_errno;
// save it so we know related docid generation had an error...
if ( g_errno && ! m_msg3aErrno )
m_msg3aErrno = g_errno;
setStatus ( "gotmsg3areplyforfullquery" );
if ( g_errno ) {
log("seopipe: got msg3a reply error: %s",mstrerror(g_errno));
g_errno = 0;
}
// try again for next guy
//m_triedCache = false;
// how many docids in the search results were returned to us?
int32_t numDocIds = m_msg3a->getNumDocIds();
// total # search results estimated
//int32_t numTotalResults = m_msg3a->getNumTotalEstimatedHits();
// get the query as we received it in the msg99 reply
//Msg99Reply *qp = (Msg99Reply *)m_msg3a->m_hackQPtr;
int32_t queryNum = (int32_t)m_msg3a->m_hackQNum;
// . point to the empty class we reserved in the buf
// . store score info into this class
//TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBuf();//Start();
// ensure enough room
//if ( m_topDocIdsBuf.getAvail() < sizeof(TopDocIds) )
// m_topDocIdsBuf.reserve(sizeof(TopDocIds) )
// get next available spot to store this
TopDocIds *td = (TopDocIds *)m_topDocIdsBuf.getBuf();
int32_t tdnum = m_topDocIdsBuf.length() / sizeof(TopDocIds);
m_topDocIdsBuf.incrementLength(sizeof(TopDocIds));
if ( m_topDocIdsBuf.length() > m_topDocIdsBuf.m_capacity ) {
char *xx=NULL;*xx=0; }
QueryLink *qks = (QueryLink *)m_matchingQueryBuf.getBufStart();
QueryLink *qk = &qks[queryNum];
// the relateddocidnum hack
if ( tdnum > 32000 ) { char *xx=NULL;*xx=0; }
qk->m_relatedDocIdNum = tdnum;
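	// . TopDocIds::m_queryNum and QueryLink::m_relatedDocIdNum tie a
	//   query to its full result set: getRelatedDocIds() walks
	//   m_topDocIdsBuf and uses m_queryNum to get back to the QueryLink.
	//   the 32000 sanity check above suggests m_relatedDocIdNum is a
	//   small (16-bit) field.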
// store reply info, like # docids, in the query ptr
int32_t max = numDocIds;
if ( max > (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS )
max = (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS;
td->m_numDocIds = max;
// QueryLink # in the m_matchingQueryBuf buffer we represent
td->m_queryNum = queryNum;
// keep it clean
//qp->m_docIdVotes = 0;
// get the query base hash and use that to
// dedup. the query base hash ignores common
// words and converts words to their synonym
// with the smallest hash
//int64_t qbh = getQueryBaseHash(qstr);
//m_msg3a->m_hackQNum = m_queryNum;
//m_msg3a->m_hackQPtr = (char *)qp;
// count replies
m_numMsg3aReplies++;
// log to log as well
//char tmp[50000];
SafeBuf tmp;
//char *p = tmp;
tmp.safePrintf(
"seopipe: got list of %"INT32" related docids for "
"qrynum=%"INT32" "
//"of%"INT32""
"numDocids=%"INT32" "
"query=\"",
numDocIds,
m_numMsg3aReplies,//m_msg3a->m_hackQNum,
//m_maxFullQueries ,
td->m_numDocIds);
char *qqq = qk->getQueryString(&m_matchingQueryStringBuf);
tmp.safeStrcpy(qqq);
tmp.safePrintf("\" (err=%s)",
mstrerror(err));
// log each docid
//for ( int32_t i = 0 ; i < max ; i++ ) {
// //float score = m_msg3a->getScores()[i];
// int64_t d = m_msg3a->m_docIds[i];//getDocIds()[i];
// //int32_t sh32 = m_msg3a->getSiteHash32(i);
// p += sprintf(p,"d%"INT32"=%"INT64" ",i,d);
//}
char *msg = tmp.getBufStart();
log("%s",msg);
/*
	// shortcut. pumpSocket() sends the contents of this to m_seoSocket
SafeBuf *sb = &m_socketWriteBuf;
sb->safePrintf(
"\t<seoQueryScoreInfo>\n"
"\t\t<queryNum>%"INT32"</queryNum>\n"
"\t\t<numTotalEstimatedSearchResults>%"INT32""
"</numTotalEstimatedSearchResults>\n"
"\t\t<numDocIds>%"INT32"</numDocIds>\n"
, m_msg3a->m_hackQNum
, numTotalResults
, numDocIds
);
*/
// print the top 50 scores
for ( int32_t i = 0 ; i < max ; i++ ) {
float score = m_msg3a->m_scores[i];//getScores()[i];
int64_t d = m_msg3a->m_docIds[i];//getDocIds()[i];
int32_t sh26 = m_msg3a->getSiteHash26(i);
/*
sb->safePrintf("\t\t<searchResult>\n");
sb->safePrintf("\t\t\t<rank>%"INT32"</rank>\n",i+1);
sb->safePrintf("\t\t\t<score>%f</score>\n",score);
sb->safePrintf("\t\t\t<docId>%"INT64"</docId>\n",d);
sb->safePrintf("\t\t\t<siteHash32>%"UINT32"</siteHash32>\n",sh32);
sb->safePrintf("\t\t</searchResult>\n");
*/
// store results for this Msg99Reply
td->m_topDocIds[i] = d;
td->m_topScores[i] = score;
td->m_topSiteHashes26[i] = sh26;
}
// reset rest so it prints pretty on gdb debug print cmd
for ( int32_t i = max ; i < (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS; i++ ) {
td->m_topDocIds[i] = 0LL;
td->m_topScores[i] = 0.0;
td->m_topSiteHashes26[i] = 0;
}
/*
sb->safePrintf("\t</seoQueryScoreInfo>\n");
*/
// give front-end the progress bar info
if ( m_seoSocket && m_progressBar ) {
// tmp buf
char tmp[16];
float percent = (float)m_numMsg3aReplies ;
//percent /= (float)m_maxFullQueries;
percent *= 100.0;
// these are 80% of the pipeline if getting competitor
// backlinks
if ( m_progressBar == 2 ) percent *= .80;
int32_t percentLong = (int32_t)percent;
if ( percentLong >= 100 ) percentLong = 99;
int32_t tmpLen = sprintf(tmp,"%02"INT32"%%",percentLong);
if ( tmpLen !=3)log("seo: bad progress bar output %"INT32"",tmpLen);
// try a send on non-blocking socket
int32_t n = ::send ( m_seoSocket->m_sd , tmp,tmpLen , 0 );
if ( n != tmpLen ) log("seo: bad progress bar send %"INT32"",n);
// forget error
errno = 0;
}
}
bool XmlDoc::clientClosedConnection ( ) {
if ( ! m_seoSocket ) return false;
if ( m_clientClosed ) return true;
if ( g_now - m_lastCheckTime < 50 ) return m_clientClosed;
m_lastCheckTime = g_now;
char buffer[100];
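	// recv() returning 0 on a connected TCP socket means the client did
	// an orderly shutdown. MSG_PEEK leaves any pending bytes in the
	// socket buffer and MSG_DONTWAIT keeps this probe from blocking.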
if ( recv(m_seoSocket->m_sd,buffer,99,MSG_PEEK|MSG_DONTWAIT) == 0 ) {
m_clientClosed = true;
log("xmldoc: CLIENT CLOSED CONNECTION!!");
}
return m_clientClosed;
}
// . returns -1 if blocked, NULL with g_errno set on error
// . we do this to get related docids
SafeBuf *XmlDoc::getMatchingQueriesScoredForFullQuery ( ) {
setStatus ( "getmatchingqueriesscoredforfullquery" );
// just re-use the same m_queryPtrs SafeBuf we used above but we
// set the Msg99Reply::m_myScore here and sort them by that
if ( m_queryPtrsWholeValid )
return &m_matchingQueryBuf;
// get the queries sorted by the url: | scores for our main url
SafeBuf *mq = getMatchingQueryBuf();
if ( mq == NULL || mq == (void *)-1 ) return mq;
// setup timer
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
if ( ! m_beginTimeFullQueries )
m_beginTimeFullQueries = gettimeofdayInMilliseconds();
// this buffer holds a ptr to each query in each msg99 reply we
// received from all hosts in the network
QueryLink *qks = (QueryLink *)mq->getBufStart();
int32_t nks = mq->length()/sizeof(QueryLink);
int32_t maxFullQueries = 50;
int32_t tneed = maxFullQueries * sizeof(TopDocIds);
if ( m_topDocIdsBuf.length() == 0 && ! m_topDocIdsBuf.reserve(tneed) )
return NULL;
// . now launch msg3as at them
// . this is 60k so new it here
if ( ! m_msg3a ) {
// reset the query # we are processing
m_queryNum = 0;
m_numMsg3aRequests = 0;
m_numMsg3aReplies = 0;
if ( ! m_fullQueryDedup.set(8,0,256,NULL,0,
false,m_niceness,"fqdd"))
return NULL;
try { m_msg3a = new ( Msg3a ); }
catch ( ... ) {
g_errno = ENOMEM;
return NULL;
}
mnew ( m_msg3a, sizeof(Msg3a),"xdmsg3a");
// need this too now i guess since it is 65k
try { m_query3a = new ( Query ); }
catch ( ... ) {
g_errno = ENOMEM;
return NULL;
}
mnew ( m_query3a, sizeof(Query),"xdqry3a");
}
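	// . the loop below is a launch-and-wait state machine: walk the
	//   matching QueryLinks in order, dedup them by synonym base hash,
	//   skip zero-importance ones and launch a full msg3a for up to
	//   maxFullQueries of them. each reply is handled by
	//   gotMsg3aReplyForFullQuery() which appends a TopDocIds entry,
	//   and we keep returning -1 (blocked) until every launched
	//   request has replied.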
loop:
// breath in case we hit all cache
QUICKPOLL(m_niceness);
// have we launched all the requests we need to
bool exhausted = false;
if ( m_queryNum >= nks ) exhausted = true;
if ( m_numMsg3aRequests >= maxFullQueries ) exhausted = true;
// if client closed browser connection by hitting the stop sign
// then stop here!
if ( clientClosedConnection() ) m_hadMatchError = ESOCKETCLOSED;
if ( m_hadMatchError ) exhausted = true;
// if nothing to launch
if ( exhausted &&
// and all replies received
m_numMsg3aReplies >= m_numMsg3aRequests ) {
// nuke the msg3a to save mem
mdelete ( m_msg3a, sizeof(Msg3a) , "msg3a" );
delete ( m_msg3a );
m_msg3a = NULL;
mdelete ( m_query3a , sizeof(Query), "qry3a" );
delete ( m_query3a );
m_query3a = NULL;
// time it
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginTimeFullQueries;
log("seopipe: time: fullqueries took %"INT64" ms",took);
// force closed?
if ( m_hadMatchError ) return NULL;
// we are done!
m_queryPtrsWholeValid = true;
return &m_matchingQueryBuf;//queryPtrs;
}
// if nothing to launch wait for all replies
if ( exhausted )
return (SafeBuf *)-1;
// get the current query to process
//Msg99Reply *qp = queryPtrs[m_queryNum];
QueryLink *qk = &qks[m_queryNum];
int32_t savedQueryNum = m_queryNum;
QueryLogEntry *qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
	// shortcut
//int64_t h64 = qk->m_querySynBaseHash64;
int64_t h64 = getSynBaseHash64 ( qe->getQueryString(),qe->m_langId);
// . if we already did a similar query, then skip it
// . Msg99Reply::m_topDocIds will be NULL so getRelatedDocIds() will
// know we skipped this query and to ignore it
if ( m_fullQueryDedup.isInTable(&h64) ) {
m_queryNum++;
goto loop;
}
// or if importance is 0, which means to ignore!
if ( qk->m_queryImportance <= 0.0 ) {
m_queryNum++;
goto loop;
}
	// shortcut
char *qstr = qk->getQueryString(&m_matchingQueryStringBuf);
// sanity
if ( ! cr->m_coll || ! cr->m_coll[0] ) { char *xx=NULL;*xx=0; }
// this is required for synonyms!
// TODO: use whatever language the query is!!!
uint8_t langId = langEnglish;
	// shortcut
int32_t qlen = gbstrlen(qstr);
//int32_t collLen = gbstrlen(cr->m_coll);
// set the request
m_mr2.reset();
m_mr2.ptr_query = qstr;
m_mr2.size_query = qlen+1;
//m_mr2.ptr_coll = cr->m_coll;
//m_mr2.size_coll = collLen+1;
m_mr2.m_collnum = cr->m_collnum;
m_mr2.m_queryExpansion = 1;
m_mr2.m_language = langId;
m_mr2.m_niceness = m_niceness;
// . get top 50 results now
// . then related docids will have to be in there
m_mr2.m_docsToGet = (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS;
m_mr2.m_useSeoResultsCache = true;
// we do not need this, we just want the related docids/scores
m_mr2.m_getDocIdScoringInfo = false;
// use cache for 7 days since it is just for getting related docids
// right now. make sure that that cache saves to disk.
// MDW: why is this not working?
//m_mr2.m_maxAge = 86400 * 7;
//m_mr2.m_addToCache = true;
//m_mr2.m_debug = 1;
// prepend to the query?
int32_t ulen = m_firstUrl.m_ulen;
// go to next guy if this query is too big already
if ( ulen + qlen + 10 > ABS_MAX_QUERY_LEN ) {
m_queryNum++;
goto loop;
}
// support for the new TopDocIds class which holds detailed search
// results for selected matching queries QueryLinks
//int32_t maxt = numQueryPtrs;
//if ( maxt > m_maxQueries ) maxt = m_maxQueries;
//if ( ! maxt ) { char *xx=NULL;*xx=0; }
// we also need the top docids
//if ( ! m_topDocIdsBuf.m_capacity ) {
// int32_t need = sizeof(TopDocIds) * (int32_t)MAX_MATCHING_QUERIES;
// if ( ! m_topDocIdsBuf.reserve ( need ,"tdbuf" ) ) return NULL;
// //m_nextAvailTopDocIdsOffset = 0;// = m_topDocIdsBuf;
//}
// make matching query, "qk", point to the topdocids that we
// will fill in when we execute this query in full below
// sanity!
//int32_t off3 = m_nextAvailTopDocIdsOffset ;
//if ( off3/(int32_t)sizeof(TopDocIds)>=maxt){char *xx=NULL;*xx=0;}
// seo.cpp's handleRequest99() should have set it to -1
//if ( qp->m_topDocIdsBufOffset != -1 ) { char *xx=NULL;*xx=0; }
// assign this TopDocIds class to this query ptr now
//qp->m_topDocIdsBufOffset = m_nextAvailTopDocIdsOffset;
// get that ptr to reset its count to 0
//TopDocIds *ttt = qp->getTopDocIds(&m_topDocIdsBuf);
//ttt->m_numDocIds = 0;
// inc it
//m_nextAvailTopDocIdsOffset += sizeof(TopDocIds);
// update length since we store topdocids buf based on its m_length
//m_topDocIdsBuf.setLength ( m_nextAvailTopDocIdsOffset );
// advance for next guy
m_queryNum++;
// add it to dedup table
if ( ! m_fullQueryDedup.addKey(&h64) ) {
m_hadMatchError = g_errno;
goto loop;
}
	// count this request as launched (outstanding)
m_numMsg3aRequests++;
// . set the query class for msg3a
// . queryExpansion = true
m_query3a->set2 ( qstr , langId , true );
// a debug thing
m_query3a->m_containingParent = (void *)this;
// secret variable latchon
m_msg3a->m_hack = this;
m_msg3a->m_hackQNum = savedQueryNum;
m_msg3a->m_hackQPtr = NULL;//(char *)qp;
// note it
setStatus("launching msg3a");
// . get the docIds
// . this sets m_msg3a.m_clusterLevels[] for us
// . it sends a msg39 request to each alive host in the network
bool status = m_msg3a->getDocIds ( &m_mr2,
m_query3a,
this,//m_msg3a , // this ,
gotMsg3aReplyForFullQueryWrapper);
// return false if msg3a blocked
if ( ! status ) return (SafeBuf *)-1;
// error?
if ( g_errno ) {
m_hadMatchError = g_errno;
m_numMsg3aReplies++;
goto loop;
}
// i guess did not block... can this happen? cached?
//log("xmldoc: msg3a did not block");
// not supported yet. we need to process reply.
//char *xx=NULL;*xx=0;
// yeah, msg17 in there can cache in seoresults cache now
gotMsg3aReplyForFullQuery();
// try looping
goto loop;
}
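// qsort comparator: sort RelatedDocIds by m_relatedWeight, highest first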
static int rdCmp ( const void *a, const void *b ) {
RelatedDocId *da = (RelatedDocId *)a;
RelatedDocId *db = (RelatedDocId *)b;
// get scores
float scorea = da->m_relatedWeight;//dotProduct;//similarityScore;
float scoreb = db->m_relatedWeight;//dotProduct;//similarityScore;
if ( scorea < scoreb ) return 1;
if ( scorea > scoreb ) return -1;
return 0;
}
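// qsort comparator: sort QueryNumLinkedNode ptrs by related docid rank
// (unranked entries last), falling back to the related docid serp score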
static int lkCmp ( const void *a, const void *b ) {
QueryNumLinkedNode *ka = *(QueryNumLinkedNode **)a;
QueryNumLinkedNode *kb = *(QueryNumLinkedNode **)b;
// get scores
int32_t ra = ka->m_relatedDocIdRank;
int32_t rb = kb->m_relatedDocIdRank;
if ( ra >= 0 && rb >= 0 ) {
if ( ra < rb ) return -1;
if ( ra > rb ) return 1; // swap
}
if ( ra >= 0 ) return -1;
if ( rb >= 0 ) return 1; // swap
// if neither ranked, go by serp score i guess
float sa = ka->m_relatedDocIdSerpScore;
float sb = kb->m_relatedDocIdSerpScore;
if ( sa > sb ) return -1;
if ( sa < sb ) return 1; // swap
return 0;
}
// buf is an array of RelatedDocId members
SafeBuf *XmlDoc::getRelatedDocIds ( ) {
setStatus ( "getrelateddocids" );
if ( m_relatedDocIdBufValid )
return &m_relatedDocIdBuf;
// get the full replies with the top 50 docids and scores listed
// for each query. should be sorted by m_myScore.
SafeBuf *mq = getMatchingQueriesScoredForFullQuery ( );
if ( ! mq || mq == (void *)-1 ) return mq;
// . how many queries do we have that match this url?
// . they should be sorted by our url's score
//QueryLink *qks = (QueryLink *)mq->getBufStart();
//int32_t nks = mq->length()/sizeof(QueryLink);
int32_t *sh32 = getSiteHash32();
if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SafeBuf *)sh32;
int32_t dh32 = getDomHash32();
//if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
//if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; }
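	// keep only the low 26 bits so these are comparable to the 26-bit
	// per-result site hashes from msg3a (m_topSiteHashes26 below)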
int32_t ourSiteHash26 = *sh32 & 0x03ffffff;
int32_t ourDomHash26 = dh32 & 0x03ffffff;
// for deduping queries with the same "base hash" we do not want
// them to count twice for RelatedDocId::m_numCommonQueries
//HashTableX dedup;
//if ( ! dedup.set(8,0,1024,NULL,0,false,0,"dddtab"))
// return NULL;
// scan the top docids
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
int32_t ntds = m_topDocIdsBuf.length() / sizeof(TopDocIds);
for ( int32_t i = 0 ; i < ntds ; i++ ) {
TopDocIds *td = &tds[i];
int32_t queryNum = td->m_queryNum;
//QueryLink *qk = &qks[queryNum];
// sanity
int32_t nd = td->m_numDocIds;
if( nd < 0) { char *xx=NULL;*xx=0; }
if( nd > (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS){
char *xx=NULL;*xx=0;}
// get main url score for query
//float ourScore = qp->m_myScore;
// and the score of the top result
//float normScore = td->m_topScores[0];
// norm main url score
//ourScore /= normScore;
// scan the top 50 (or more) docids for this query
for ( int32_t j = 0 ; j < nd ; j++ ) {
			// . do not allow a related docid (aka competitor
			//   page) to be from our own site! that also excludes
			//   our url itself. otherwise the competitor backlinks
			//   would mention links that already point at us, and
			//   we don't care about those - we already have them.
			//   we just want to see recommended backlinks we do
			//   not yet have, so we can go get them.
			// . so skip it if it is from our same sitehash26
if ( td->m_topSiteHashes26[j] == ourSiteHash26 )
continue;
// fix cheatcodes.com being a competitor page when
// our main url is www.cheatcodes.com
if ( td->m_topSiteHashes26[j] == ourDomHash26 )
continue;
// skip twitter facebook, etc
int64_t docId = td->m_topDocIds[j];
if ( docId == 114607849462LL || // https://www.twitter
docId == 273941610476LL || // twitter.com
docId == 1628437294LL || // facebook.com
docId == 146394931444LL ) // cnn.com/video/
continue;
// add RelatedDocId into m_relatedDocIdBuf and/or
// augment its linked list of query/score pairs
addRelatedDocIdInfo ( td->m_topDocIds[j],
queryNum ,
td->m_topScores[j], // score
j , // rank
td->m_topSiteHashes26[j] );
}
}
QUICKPOLL(m_niceness);
// this is now in getRelatedDocIdsScored()!!!!!!!
/*
char *rdbuf = m_relatedDocIdBuf.getBufStart();
int32_t numDocIds = m_relatedDocIdBuf.length()/sizeof(RelatedDocId);
// now sort by RelatedDocId::m_relatedWeight
qsort ( rdbuf , numDocIds, sizeof(RelatedDocId),qp99docIdCmp );
QUICKPOLL(m_niceness);
// limit to top MAX_RELATED_DOCIDS related docids
// will take longer to get titles/urls and related queries the
// higher this number is, but we will have more competitor backlinks
// and terms etc.
int32_t maxLen = sizeof(RelatedDocId) * MAX_RELATED_DOCIDS;
int32_t currentLen = m_relatedDocIdBuf.length();
if ( currentLen > maxLen ) currentLen = maxLen;
m_relatedDocIdBuf.setLength(currentLen);
numDocIds = currentLen / sizeof(RelatedDocId);
*/
int32_t numDocIds = m_relatedDocIdBuf.length() / sizeof(RelatedDocId);
/*
// log out for debug
char *rdbuf = m_relatedDocIdBuf.getBufStart();
RelatedDocId *rds = (RelatedDocId *)rdbuf;
for ( int32_t i = 0 ; g_conf.m_logDebugSEO && i < numDocIds ; i++ ) {
log("seopipe: related docId #%"INT32" docid=%"INT64" "
"score=?? common=%"INT32"",
i,
rds[i].m_docId,
//rds[i].m_relatedWeight,//dotProduct, // similarityScore,
rds[i].m_numCommonQueries);
}
*/
log("seo: got %"INT32" related docids in buf",numDocIds);
m_relatedDocIdBufValid = true;
return &m_relatedDocIdBuf;
}
// used as part of the msg4f request
SafeBuf *XmlDoc::getTopMatchingQueryBuf ( ) {
if ( m_topMatchingQueryBufValid )
return &m_topMatchingQueryBuf;
// scan matching queries that we evaluated fully using msg3a
SafeBuf *qkbuf = getMatchingQueriesScoredForFullQuery ( );
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
//Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
//int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
//int32_t nks = qkbuf->length()/sizeof(QueryLink);
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
int32_t ntds = m_topDocIdsBuf.length() / sizeof(TopDocIds);
for ( int32_t i = 0 ; i < ntds ; i++ ) {
TopDocIds *td = &tds[i];
int32_t queryNum = td->m_queryNum;
QueryLink *qk = &qks[queryNum];
// ok, get it
char *qstr = qk->getQueryString(&m_matchingQueryStringBuf);
int32_t qlen = gbstrlen(qstr);
// store query #
if ( ! m_topMatchingQueryBuf.pushLong(queryNum) )
return NULL;
// then query
if ( ! m_topMatchingQueryBuf.safeMemcpy(qstr,qlen+1))
return NULL;
}
m_topMatchingQueryBufValid = true;
return &m_topMatchingQueryBuf;
}
static void gotMsg4fReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
// a bit of a hack
THIS->m_savedSlot = slot;
// ultimately, getRelatedDocIdsScored() will be called from this
THIS->m_masterLoop ( THIS->m_masterState );
}
// . lets just put everything in this one function
// . launch a msg4f request for each relateddocid
// . get the msg4f reply back and add the positive scoring queries to the
// related docids linked list of QueryNumLinkedNodes in the
// m_commonQueryNumBuf, avoid dups.
// . then score each related docid by calling setRelatedDocIdScores()
SafeBuf *XmlDoc::getRelatedDocIdsScored ( ) {
setStatus ( "getrelateddocidsscored");
if ( m_relatedDocIdsScoredBufValid ) {
// and return the buf of RelatedDocIds
return &m_relatedDocIdBuf;
}
// what docids share our TOP-scoring matching queries?
SafeBuf *rdbuf = getRelatedDocIds();
if ( ! rdbuf || rdbuf == (void *)-1) return (SafeBuf *) rdbuf;
SafeBuf *tmq = getTopMatchingQueryBuf();
if ( ! tmq || tmq == (void *)-1) return (SafeBuf *) tmq;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// the top 50 or so matching queries will each be scored for
// every related docid we have in m_relatedDocIdBuf. these are
// the same queries we got the full results for above!!!
// we have to score them for each related docid here because we only
// get the top 300 or so results above for each one. so if the
// related docid matched the query but was not in the top 300 results,
// it would have appeared to NOT match the query. bad. that was
// causing google to come up high in related docids because it
// ranked high for so many generic queries. and the other good
// related docids did not rank in the top 300 for those same
// generic queries. so at least this logic will show that the
// related docids do indeed match those generic queries, too.
// and they will get higher scores (RelatedDocId::m_relatedWeight)
// we must be an incoming reply if we already sent out all the requests
if ( m_numMsg4fRequests > 0 ) {
// increment our reply counter
m_numMsg4fReplies++;
// . m_savedSlot is a hack
// . now parse the reply and add QueryNumLinkedNode
// into m_commonQueryNumBuf.
char *p = m_savedSlot->m_readBuf;
char *pend = m_savedSlot->m_readBufSize + p;
// now scan the reply
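		// each record in the reply is:
		//   [int32_t queryNum][int64_t docId][float score]
		// where queryNum indexes into our full matching-query list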
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// the queryNum is relative to the m_queryPtrs array
// which has all the matching queries of this document,
// not just the "top" 50 matching queries by score.
int32_t queryNum = *(int32_t *)p;
// sanity
if ( queryNum<0 ) {char *xx=NULL;*xx=0; }
p += 4;
// then docid of related docid that had this score
int64_t docId = *(int64_t *)p;
p += 8;
// then score
float score = *(float *)p;
p += 4;
// this will add the query/score pair into the
// related docid buf. it will not add dups if already
// ranked!
addRelatedDocIdInfo ( docId ,
queryNum ,
score ,
-1 , // rank unknown
-1 ); // sitehash26 unknown
}
// return if awaiting more replies
if ( m_numMsg4fReplies < m_numMsg4fRequests )
return (SafeBuf *)-1;
// point to buffer of related docids
char *rdbuf = m_relatedDocIdBuf.getBufStart();
RelatedDocId *rds = (RelatedDocId *)rdbuf;
int32_t nr = m_relatedDocIdBuf.length() / sizeof(RelatedDocId);
for ( int32_t i = 0 ; i < nr ; i++ ) {
			// shortcut
RelatedDocId *rd = &rds[i];
// now score it since we have all the serpscores for
// all top matching queries.
setRelatedDocIdWeightAndRank(rd);
}
// breathe
QUICKPOLL(m_niceness);
// now sort by RelatedDocId::m_relatedWeight
qsort ( rdbuf , nr , sizeof(RelatedDocId),rdCmp );
// breathe
QUICKPOLL(m_niceness);
		// limit to top MAX_RELATED_DOCIDS related docids. the higher
		// this limit, the longer it takes to get titles/urls and
		// related queries, but we get more competitor backlinks and
		// terms etc. capping it also keeps the cachedb record smaller.
int32_t maxLen = MAX_RELATED_DOCIDS * sizeof(RelatedDocId);
int32_t newLen = m_relatedDocIdBuf.length();
if ( newLen > maxLen ) newLen = maxLen;
m_relatedDocIdBuf.setLength(newLen);
//
// make a new buffer for m_commonQueryNumBuf just for the
// related docids we picked, and sort them by rel docid rank.
// so it will be smaller and sorted.
//
SafeBuf tmpBuf;
if ( ! tmpBuf.reserve ( m_commonQueryNumBuf.length() ) )
return NULL;
// scan each related docid in the top 300 or so
for ( int32_t i = 0 ; i < nr ; i++ ) {
			// shortcut
RelatedDocId *rd = &rds[i];
// store ptrs to query nums so we can sort them
QueryNumLinkedNode *links[1024];
int32_t nn = 0;
int32_t fo = rd->m_firstCommonQueryNumOff;
char *base = m_commonQueryNumBuf.getBufStart();
// scan down the linked list and store ptrs to links[]
for ( ; fo >= 0 ; ) {
// cast it
QueryNumLinkedNode *qn;
qn = (QueryNumLinkedNode *)(base + fo);
// point to next
fo = qn->m_nextOff;
// store this guy for sorting
links[nn] = qn;
nn++;
if ( nn >= 1024 ) break;
}
// now sort them by m_relatedDocIdRank
qsort( links, nn,sizeof(QueryNumLinkedNode *),lkCmp);
// point to our new linked list in tmpBuf, we will
// store them here.
rd->m_firstCommonQueryNumOff = tmpBuf.length();
QueryNumLinkedNode *prev = NULL;
// now store into tmpbuf
			for ( int32_t k = 0 ; k < nn ; k++ ) {
				QueryNumLinkedNode *qn = links[k];
				int32_t size = sizeof(QueryNumLinkedNode);
				if ( !tmpBuf.reserve(size) ) return NULL;
				// use a distinct name here so we do not
				// shadow the int32_t "nn" count above
				QueryNumLinkedNode *node ;
				node = (QueryNumLinkedNode *)tmpBuf.getBuf();
				int32_t clen = tmpBuf.length();
				tmpBuf.safeMemcpy(qn,size);
				// we are the previous guy's next node
				if ( prev ) prev->m_nextOff = clen;
				// assume nobody follows us
				node->m_nextOff = -1;
				// we are now next guy's prev
				prev = node;
			}
}
// now steal tmpbuf, and free our old stuff
m_commonQueryNumBuf.stealBuf ( &tmpBuf );
// i guess we are done now!
m_relatedDocIdsScoredBufValid = true;
return &m_relatedDocIdBuf;
}
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
// . there's a massive # of related docids at this point
// . possibly 50 x 300 = 15,000
// . so launch one msg4f for each host in our network
// . just specify all the related docids in the msg4f request and have
// the handleRequest4f() function in seo.cpp get the title rec.
// . make sure all docids are local to that host
// . dispatch the msg4f request to the machine that has that docid
// local so it can just hit disk
	// . handleRequest4f() can follow the same logic as in
	//   getRelatedQueryLinks() which makes a new xmldoc. then it can
	//   call newxd->getTermListBuf() instead of us passing it in.
// . so each host has a bin, a host bin
//#ifdef __APPLE__
SafeBuf hostBin[MAX_HOSTS];
//#else
//SafeBuf hostBin[g_hostdb.m_numHosts];
//#endif
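	// each per-host request is laid out as:
	//   [collName + NUL][int32_t len][topMatchingQueryBuf bytes]
	//   [int64_t docId][int64_t docId]...
	// handleRequest4f() in seo.cpp consumes this; a rough sketch of how
	// such a request could be walked (illustration only, not the actual
	// handler code):
	//   char *p        = req;
	//   char *coll     = p; p += gbstrlen(coll) + 1;
	//   int32_t  qlen  = *(int32_t *)p; p += 4;
	//   char *queryBuf = p; p += qlen;
	//   while ( p < req + reqSize ) {
	//           int64_t docId = *(int64_t *)p; p += 8;
	//           // ... look up the local title rec for docId ...
	//   }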
// scan the related docids and send the requests if we have not already
for ( int32_t i = 0 ; ! m_sentMsg4fRequests && i < numRelated ; i++ ) {
RelatedDocId *rd = &rds[i];
//uint32_t gid=g_hostdb.getGroupIdFromDocId (rd->m_docId);
// pick host in that group
//Host *group = g_hostdb.getGroup ( gid );
int32_t shardNum = getShardNumFromDocId ( rd->m_docId );
Host *group = g_hostdb.getShard ( shardNum );
int32_t nh = g_hostdb.m_numHostsPerShard;
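		// spread the title rec lookups across the hosts in this
		// shard by taking the docid modulo the number of hosts
		// per shard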
int32_t hostNum = rd->m_docId % nh;
Host *h = &group[hostNum];
int32_t hostId = h->m_hostId;
// skip if dead
int32_t count = 0;
if ( g_hostdb.isDead(hostId) && h->m_wasEverAlive ) {
// increment hostnum if that one is dead
if ( ++hostNum >= nh ) hostNum = 0;
// set these again
h = &group[hostNum];
hostId = h->m_hostId;
// if all dead, just pick this one i guess
if ( ++count >= nh ) break;
}
		// shortcut
SafeBuf *hbin = &hostBin[hostId];
// if bin is empty initialize
if ( hbin->length() == 0 ) {
// provide only collection to handleRequest4f()
if ( ! hbin->safeMemcpy(cr->m_coll,
gbstrlen(cr->m_coll)+1) )
return NULL;
			// . store the queries we want it to evaluate
			// . these are null-terminated query strings preceded
			//   by their corresponding query number in our
			//   m_queryPtrs[] array which pts to a Msg99Reply
if ( ! hbin->pushLong(tmq->length()))
return NULL;
if ( ! hbin->safeMemcpy(tmq))
return NULL;
}
// store this new docid, which is local to this host
if ( ! hbin->pushLongLong(rd->m_docId) ) return NULL;
}
// shotgun out the msg4f requests now
for ( int32_t i = 0 ;
! m_sentMsg4fRequests && i < g_hostdb.getNumHosts() ; i++ ) {
		// shortcut
SafeBuf *hbin = &hostBin[i];
// get that host
Host *host = g_hostdb.getHost(i);
// make a copy for sending out
SafeBuf copy;
if ( ! copy.safeMemcpy ( hbin ) ) continue;
// get the bin copy
char *req = copy.getBufStart();
int32_t reqSize = copy.length();
// detach it so udpserver can free it when done transmitting
copy.detachBuf ();
// free this guy now i guess
hbin->purge();
// count as launched
m_numMsg4fRequests++;
// launch it
if ( ! g_udpServer.sendRequest ( req ,
reqSize,
0x4f , // msgtype
host->m_ip , // ip
host->m_port , // port
host->m_hostId,
NULL, // retslot
this,
gotMsg4fReplyWrapper,
10000 , // timeout
-1 , // backoff
-1 , // maxwait
NULL, // replybuf
0, // replybufmaxsize
m_niceness // niceness
)) {
// let admin know about error
log("seopipe: sendRequest 4f had error: %s",
mstrerror(g_errno));
// count it as replied then
m_numMsg4fReplies++;
continue;
}
}
// do not re-send the requests
m_sentMsg4fRequests = true;
// wait for all replies to come in
if ( m_numMsg4fRequests > m_numMsg4fReplies ) return (SafeBuf *)-1;
// how can they all be done? all errors!
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
return NULL;
}
// remote host will alloc an xmldoc, about 1MB each...
#define MAX_OUT_MSG20S 30
// . like getRelatedDocIds() but with titles, etc.
// . return a list of competing docids/titles/etc.
SafeBuf *XmlDoc::getRelatedDocIdsWithTitles ( ) {
setStatus ( "getrelateddocidswithtitles" );
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_relatedDocIdsWithTitlesValid )
return &m_relatedDocIdBuf;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
SafeBuf *rdbuf = getRelatedDocIdsScored();
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
// now look up each docid in titledb and store the url title
// into m_relatedTitleBuf safebuf and set the RelatedDocId::
// rd_title_off and rd_url_off into that when done. store offsets for
// now and make into full out ptrs when done in case the
// m_relatedTitleBuf reallocs.
if ( ! m_msg20Buf.length() ) {
int32_t need = sizeof(Msg20) * MAX_OUT_MSG20S ;
if ( ! m_msg20Buf.reserve ( need,"m20buf" ) ) return NULL;
// mark it all in use
m_msg20Buf.setLength(need);
// init them
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) mp[i].constructor();
// reset cursor to start with first related docid
m_rdCursor = 0;
m_relatedDocIdError = 0;
m_numMsg20Replies = 0;
}
// point to buffer of related docids
	RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
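	// this function is re-entered via m_masterLoop each time a msg20
	// reply comes back. we harvest any completed msg20s below, then
	// re-launch the free slots on the next unprocessed related docids.
	// m_hack2 holds the index of the RelatedDocId a msg20 was launched
	// for so its reply can be matched back to it.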
// scan the msg20s we allocated to see if any got a reply
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
		// shortcut
Msg20 *msg20 = &mp[i];
// skip if never launched
if ( ! msg20->m_launched ) continue;
// skip if it is in progress, awaiting its reply
if ( msg20->m_inProgress ) continue;
// get the reply from it (might be NULL iff g_errno is set)
Msg20Reply *reply = msg20->getReply(); // m_r
// get the corresponding related docid
int32_t hisCursor = msg20->m_hack2;
		// shortcut
RelatedDocId *rd = &rds[hisCursor];
// ok, it has a reply. could be NULL if g_errno was set.
if ( ! setRelatedDocIdInfoFromMsg20Reply ( rd , reply ) )
m_relatedDocIdError = g_errno;
		// reset it for later use... or not...
msg20->reset();
// count reply as back now
m_numMsg20Replies++;
}
// launch more if we can. one launch per msg20.
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
// no more related docids left to launch?
if ( m_rdCursor >= numRelated ) break;
		// shortcut
Msg20 *msg20 = &mp[i];
// skip if already launched/inuse
if ( msg20->m_inProgress ) continue;
// get current related docid
RelatedDocId *rd = &rds[m_rdCursor];
// make the request
Msg20Request req;
//req.ptr_coll = cr->m_coll;
//req.size_coll = gbstrlen(cr->m_coll)+1;
req.m_collnum = cr->m_collnum;
req.m_docId = rd->m_docId;
req.m_expected = true;
req.m_niceness = m_niceness;
req.m_state = m_masterState;
req.m_callback2 = m_masterLoop;
// do not get summary stuff. too slow.
req.m_numSummaryLines = 0;
// if it has an outlink to our site/domain set
// Msg20Reply::m_hasLinkToOurDomOrHost
req.m_ourHostHash32 = getHostHash32a();
req.m_ourDomHash32 = getDomHash32();
// store cursor in msg20 itself so we know what rd it's using
msg20->m_hack2 = m_rdCursor;
// advance cursor!!!
m_rdCursor++;
// launch it
if ( ! msg20->getSummary ( &req ) ) continue;
// it did not block... wtf? g_errno might be set. ENOMEM?
if ( ! setRelatedDocIdInfoFromMsg20Reply ( rd , NULL ) )
m_relatedDocIdError = g_errno;
// reset it
msg20->reset();
// count reply as back now
m_numMsg20Replies++;
		// the call completed without blocking, so this msg20 slot is
		// free again. back up so we reuse it for the next docid.
		i--;
}
// wait for one reply per related docid
if ( m_numMsg20Replies < numRelated )
return (SafeBuf *)-1;
// call msg20 destructor
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
Msg20 *msg20 = &mp[i];
msg20->destructor();
}
// purge the mem they used
m_msg20Buf.purge();
// now we are done
m_relatedDocIdsWithTitlesValid = true;
m_relatedTitleBufValid = true;
// store it in cachedb
if ( ! storeRelatedDocIdsIntoCachedb( ))
return (SafeBuf *)-1;
return &m_relatedDocIdBuf;
}
bool XmlDoc::setRelatedDocIdInfoFromMsg20Reply ( RelatedDocId *rd ,
Msg20Reply *reply ) {
// get error. g_errno can be ENOTFOUND if titlerec not found
int32_t error = g_errno;
// . or could be EDOCBANNED/EDOCFILTERED etc.
// . if reply is NULL then g_errno MUST be set
if ( ! error ) error = reply->m_errno;
	// shortcuts
char *urlStr = NULL;
char *titleStr = NULL;
char *siteStr = NULL;
if ( reply ) {
urlStr = reply->ptr_ubuf;
titleStr = reply->ptr_tbuf;
siteStr = reply->ptr_site;
}
// did that fail? i.e. docid not found!?!?!
if ( error ) {
// . just skip this asshole then
// . might be EDOCBANNED or EDOCFILTERED!
// . some are filtered because they are domain-only urls
// which should not be in the index because we force
// a "www." prepend on all urls now.
log("seo: msg20 reply for docid=%"INT64" url=%s had "
"error: %s", rd->m_docId,urlStr,mstrerror(error));
// clear that
g_errno = 0;
ignoreRelatedDocId:
		// mark those offsets as not-found
rd->rd_title_off = -1;
rd->rd_url_off = -1;
rd->rd_site_off = -1;
return true;
}
// bar facebook.com and twitter.com roots... too popular for all!
// was coming up for jezebelgallery.com
if ( strcmp(urlStr,"http://www.twitter.com/") == 0 )
goto ignoreRelatedDocId;
if ( strcmp(urlStr,"https://www.twitter.com/") == 0 )
goto ignoreRelatedDocId;
if ( strcmp(urlStr,"http://www.facebook.com/") == 0 )
goto ignoreRelatedDocId;
// "/home.php?" or "home.*"
if ( strncmp(urlStr,"http://www.facebook.com/home.",29) == 0 )
goto ignoreRelatedDocId;
if ( strcmp(urlStr,"https://www.facebook.com/") == 0 )
goto ignoreRelatedDocId;
if ( strcmp(urlStr,"http://www.cnn.com/video/") == 0 )
goto ignoreRelatedDocId;
// fix robothits.com competitor pages
if ( strcmp(urlStr,"http://www.google.com/") == 0 )
goto ignoreRelatedDocId;
if ( strcmp(urlStr,"http://www.msn.com/") == 0 )
goto ignoreRelatedDocId;
// null means no title i guess
if ( ! titleStr ) titleStr = "";
// or if he links to us
if ( reply->m_hasLinkToOurDomOrHost ) {
log("seo: related docid=%"INT64" url=%s links to our domain",
reply->m_docId,
urlStr);
goto ignoreRelatedDocId;
}
// store title
int32_t titleOffset = m_relatedTitleBuf.length();
if ( ! m_relatedTitleBuf.safeStrcpy ( titleStr ) ) return false;
m_relatedTitleBuf.pushChar('\0');
// then url
int32_t urlOffset = m_relatedTitleBuf.length();
if ( ! m_relatedTitleBuf.safeStrcpy ( urlStr ) ) return false;
m_relatedTitleBuf.pushChar('\0');
// then site
int32_t siteOffset = m_relatedTitleBuf.length();
if ( ! m_relatedTitleBuf.safeStrcpy ( siteStr ) ) return false;
m_relatedTitleBuf.pushChar('\0');
// then linkinfo
//int32_t linkInfo1Offset = m_relatedTitleBuf.length();
//if(!m_relatedTitleBuf.safeMemcpy(info1,info1->getSize()))return NULL;
// store as offset for easy serialization for storage into cachedb
//rd->m_linkInfo1Offset = linkInfo1Offset;
rd->m_relatedFirstIp = reply->m_firstIp;
rd->m_relatedCurrentIp = reply->m_ip;
rd->m_rd_siteRank = reply->m_siteRank;
rd->m_rd_langId = reply->m_language;
rd->m_rd_siteHash32 = 0;
if ( reply->ptr_site )
rd->m_rd_siteHash32 = hash32n ( reply->ptr_site );
// record the offsets of title/url/site in the m_relatedTitleBuf
rd->rd_title_off = titleOffset;
rd->rd_url_off = urlOffset;
rd->rd_site_off = siteOffset;
SafeBuf *rdbuf = getRelatedDocIds();
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
// log out for debug
log(LOG_DEBUG,
"seopipe: related docid (%"INT32"of%"INT32") docid=%"INT64" score=%f "
"title=\"%s\" url=\"%s\"",
m_numMsg20Replies,
numRelated-1,
rd->m_docId,
rd->m_relatedWeight,
titleStr,
urlStr);
return true;
}
/*
HashTableX *XmlDoc::getMatchingQueryHashTable ( ) {
setStatus ( "getmatchingqueryhashtable" );
if ( m_queryHashTableValid )
return &m_queryHashTable;
SafeBuf *qpbuf = getMatchingQueries(false);
if ( ! qpbuf || qpbuf == (void *)-1) return (HashTableX *)qpbuf;
// how many queries do we have that match this url?
Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
// init it
if ( ! m_queryHashTable.set(8,
0,
numQueryPtrs*4,
NULL,
0,
false,
m_niceness,
"qdht") )
return NULL;
for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) {
// cast it
Msg99Reply *qp = qptrs[i];
		// shortcut
int64_t eh64 = qp->m_queryInfo.m_queryExactHash64;
// hash it up
if ( ! m_queryHashTable.addKey ( &eh64 ) )
return NULL;
}
// all done
m_queryHashTableValid = true;
return &m_queryHashTable;
}
*/
/*
HashTableX *XmlDoc::getMatchingQueryOffsetTable ( ) {
setStatus ( "getmatchingqueryoffsettable" );
if ( m_queryOffsetTableValid )
return &m_queryOffsetTable;
SafeBuf *qkbuf = getMatchingQueryBuf();
if ( ! qkbuf || qkbuf == (void *)-1) return (HashTableX *)qkbuf;
// how many queries do we have that match this url?
//Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart();
//int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *);
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
int32_t nks = qkbuf->length()/sizeof(QueryLink);
// init it
if ( ! m_queryOffsetTable.set(8,
0,
nks*4,
NULL,
0,
false,
m_niceness,
"qdot") )
return NULL;
for ( int32_t i = 0 ; i < nks ; i++ ) {
// cast it
QueryLink *qk = &qks[i];
		// shortcut
//int64_t eh64 = qp->m_queryInfo.m_queryExactHash64;
int64_t eh64 = qp->m_replyingHostId;
eh64 <<= 32;
eh64 |= qp->m_qbufOffset;
// hash it up
if ( ! m_queryOffsetTable.addKey ( &eh64 ) )
return NULL;
}
// all done
m_queryOffsetTableValid = true;
return &m_queryOffsetTable;
}
//static char *s_base = NULL;
// related QUERY compare
int qp99relatedCmp ( const void *a, const void *b ) {
// these are offsets
//int32_t offa = *(int32_t *)a;
//int32_t offb = *(int32_t *)b;
QueryLink *qa = *(QueryLink **)a;
QueryLink *qb = *(QueryLink **)b;
// make sure manually added queries are on top
//if ( qa->m_isManuallyAdded && ! qb->m_isManuallyAdded ) return 1;
//if ( qb->m_isManuallyAdded && ! qa->m_isManuallyAdded ) return -1;
//QueryInfo *qia = &qa->m_queryInfo;
//QueryInfo *qib = &qb->m_queryInfo;
// get scores
float scorea = qa->m_rq_totalScore;
float scoreb = qb->m_rq_totalScore;
if ( scorea < scoreb ) return 1;
if ( scorea > scoreb ) return -1;
//return 0;
// let docidsincommon break ties
return qb->m_docIdVotes - qa->m_docIdVotes;
}
*/
/*
static int qlCmp ( const void *a, const void *b ) {
QueryLink *qa = (QueryLink *)a;
QueryLink *qb = (QueryLink *)b;
// let docid break ties
int64_t da = qa->getRelatedDocId(s_rdBuf)->m_docId;
int64_t db = qb->getRelatedDocId(s_rdBuf)->m_docId;
//int64_t da = qa->m_relatedDocId->m_docId;
//int64_t db = qb->m_relatedDocId->m_docId;
// always niceness 1 i guess
QUICKPOLL(1);
if ( da > db )
return 1; // 1 means to swap!
if ( da < db )
return -1;
return 0;
}
*/
#include <math.h> // sqrtf()
// now we can do square roots in gdb by calling this
float gbsqrt ( float x ) {
return sqrtf(x);
}
/*
// sort the related query links intersected buf by docid
QueryLink *ptrs;
ptrs = (QueryLink *)m_relatedQueryLinksIntersected.getBufStart();
int32_t nk = m_relatedQueryLinksIntersected.length() / sizeof(QueryLink);
qsort ( ptrs ,
nk,
sizeof(QueryLink),
qlCmp );
// show time
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - start;
log("seopipe: time: relatedqueryintersection took %"INT64" ms",took);
*/
/*
void XmlDoc::gotMsg98Reply ( UdpSlot *slot ) {
// get replying hostid
int32_t hostId = slot->m_hostId;
// log
setStatus ( "gotmsg98reply" );
// sanity
if ( hostId < 0 || hostId >= g_hostdb.m_numHosts) {char*xx=NULL;*xx=0;}
// point to it
char *p = slot->m_readBuf;
char *pend = p + slot->m_readBufSize;
	// shortcuts
QueryLink *qks = (QueryLink *)m_tmpBuf5.getBufStart();
// sanity, i guess if oom
int32_t maxLinkOff = m_tmpBuf5.length() ;
maxLinkOff /= sizeof(QueryLink);
// make some space
int32_t need = slot->m_readBufSize;
if ( ! m_tmpStringBuf5.reserve(need,"rqdbuf") ) {
m_msg98ReplyError = g_errno;
// do not bother scanning the reply
p = pend;
}
// init table
if ( m_qstringTable.m_numSlots == 0 ) {
// 1M slots!
if ( ! m_qstringTable.set(4,4,1000000,NULL,0,false,
m_niceness,"qstrtbl") ) {
m_msg98ReplyError = g_errno;
// do not bother scanning the reply
p = pend;
}
}
//int32_t numQueryLinks = m_relatedQueryLinksIntersected.length() ;
//numQueryLinks /= sizeof(QueryLink);
// put strings into m_tmpStringBuf5
// parse these strings
// maybe index so we can assign to QueryLinks::m_queryStringOffset
// maybe include querylink # so we can assign quickly!
QueryLink *qk;
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// offset of query link
int32_t queryLinkOff = *(int32_t *)p;
p += 4;
// crazy? maybe we went oom on m_relatedQueryLinksIntersected
if ( queryLinkOff >= maxLinkOff ) {
log("seopipe: msg98 reply link off breach %"INT32">=%"INT32"",
queryLinkOff,maxLinkOff);
m_msg98ReplyError = ENOMEM;
break;
}
// get that
QueryLogEntry *qe = (QueryLogEntry *)p;
// skip it
p += qe->getSize();
// point to it
qk = &qks[queryLinkOff];
// do not duplicate query strings!
int32_t qh32 = hash32n ( qe->getQueryString() );
int32_t slot = m_qstringTable.getSlot ( &qh32 );
if ( slot >= 0 ) {
int32_t qeOff;
qeOff =*(int32_t *)m_qstringTable.getValueFromSlot(slot);
qk->m_queryStringOffset = qeOff;
qk->m_queryHostId = -1;
continue;
}
		// get offset of string in string buf
int32_t stringOff = m_tmpStringBuf5.length();
// store good serp score
if ( ! m_tmpStringBuf5.safeMemcpy(qe,qe->getSize() ) ) {
m_msg98ReplyError = g_errno;
break;
}
// add to table
if ( ! m_qstringTable.addKey(&qh32,&stringOff) ) {
m_msg98ReplyError = g_errno;
break;
}
// show it
//log("seopipe: DEBUG. mapped remote off %"INT32" (hostid%"INT32") to "
// "local off %"INT32" (%s)"
// ,qk->m_queryStringOffset,qk->m_queryHostId,stringOff,qstr);
// . save string offset
// . THIS OVERWRITES the g_qbuf offset that was in there!!!
qk->m_queryStringOffset = stringOff;
// to indicate that this QueryLink::m_queryStringOffset is now
// an offset into m_relatedQueryStringBuf and no longer an
// offset into g_qbuf of the specific hostid, we set hostid
// to -1
qk->m_queryHostId = -1;
}
// steal it so it doesn't free it
//slot->m_readBuf = NULL;
// inc the counter
m_numMsg98Replies++;
// return control to transmit function. it will call m_callback1
// if the function is done. but if a different parent function than
// transmit called us then we call that. it just depends on the
	// initial entry function that called getMatchingQueries()
m_masterLoop ( m_masterState );
}
static void gotMsg3fReplyWrapper ( void *state , void *state2 ) {
XmlDoc *THIS = (XmlDoc *)state;
//Multicast *m = (Multicast *)state2;
Bin *bin = (Bin *)state2;
THIS->gotMsg3fReply ( bin ); // m
}
*/
static int mtCmp ( const void *a, const void *b ) {
MissingTerm *wa = *(MissingTerm **)a;
MissingTerm *wb = *(MissingTerm **)b;
if ( wb->m_importance > wa->m_importance ) return 1; // swap
if ( wb->m_importance < wa->m_importance ) return -1;
if ( wb->m_votes > wa->m_votes ) return 1; // swap
if ( wb->m_votes < wa->m_votes ) return -1;
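	// final tie-break on the pointer addresses just to get a
	// deterministic ordering among otherwise-equal terms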
if ( (int64_t)b < (int64_t)a ) return 1; // swap
if ( (int64_t)b > (int64_t)a ) return -1;
return 0;
}
// . called by getMissingTermBuf() and getMatchingTermBuf()
// . returns false and sets g_errno on error
bool XmlDoc::addTermsFromQuery ( char *qstr,
uint8_t queryLangId,
int32_t gigablastTraffic,
int32_t googleTraffic2,
//QueryLogEntry *qe ,
int32_t hackqoff,
SafeBuf *tmpBuf ,
HashTableX *scoreTable ,
HashTableX *topTermsTable ,
float imp, // importance
bool isRelatedQuery ) {
// sanity
if ( hackqoff < 0 ) { char *xx=NULL;*xx=0; }
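	// "hackqoff" is the byte offset of the originating QueryLink in its
	// query buf; each term stashes up to 10 of these in
	// m_hackQueryOffsets[] so we can later show which queries the term
	// came from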
	// parse the query so we can pull out its terms
Query qq;
//SafeBuf *rqsb = &m_relatedQueryStringBuf;
// doQueryExpansion = false
//char *qstr = qe->getQueryString ( );
qq.set2 ( qstr , queryLangId , false );
int32_t lastStart = -1;
for ( int32_t k = 0 ; k < qq.m_numWords ; k++ ) {
QUICKPOLL(m_niceness);
QueryWord *qw = &qq.m_qwords[k];
int32_t tid32 = qw->m_wordId & 0xffffffff;
		// skip punctuation / words with no term id
if ( ! tid32 ) continue;
// skip if we contain it already
if ( isRelatedQuery && topTermsTable->isInTable ( &tid32 ) )
continue;
// skip if common word like "on" "at" etc.
if ( isCommonQueryWordInEnglish(tid32) ) continue;
// get start of wikipedia phrase it is in
int32_t start = qw->m_wikiPhraseStart;
int32_t nwk = qw->m_numWordsInWikiPhrase;
// if not in wiki phrase at all, just use single word
if ( qw->m_wikiPhraseId == 0 ) {
start = k;
nwk = 1;
}
// do not re-do any words in here
if ( start == lastStart ) continue;
lastStart = start;
// hash each wordid in the term into the th64 hash
int64_t th64 = 0LL;
//int32_t alnumWordCount = 0;
for ( int32_t j = start ; j < start + nwk ; j++ ) {
			// shortcut
QueryWord *qw = &qq.m_qwords[j];
// skip punct
if ( qw->m_wordId == 0 ) continue;
// hash otherwise
th64 ^= qw->m_wordId;
// count it
//alnumWordCount++;
}
// get traffic of related query
int32_t traffic = gigablastTraffic;
// make gb traffic into google monthly traffic
traffic *= GB_TRAFFIC_MODIFIER;
		// use google numbers if we have them, more accurate
int32_t googleTraffic = googleTraffic2;
if ( googleTraffic >= 0 ) traffic = googleTraffic;
// now score that term
int32_t slot = scoreTable->getSlot ( &th64 );
if ( slot >= 0 ) {
int32_t off;
off=*(int32_t *)scoreTable->getValueFromSlot(slot);
char *base = tmpBuf->getBufStart();
MissingTerm *pt=(MissingTerm *)(base + off);
pt->m_importance += imp;
pt->m_votes++;
pt->m_traffic += traffic;
// store first 10 related query strings
// we got this term from
for ( int32_t x = 1 ; x < 10 ; x++ ) {
if ( pt->m_hackQueryOffsets[x] != -1 )
continue;
// grab it. querylogentry ptr!!
pt->m_hackQueryOffsets[x] = hackqoff;
break;
}
continue;
}
// set a class to store in safebuf
MissingTerm mt;
mt.m_importance = imp;
//mt.m_numAlnumWords = alnumWordCount;
mt.m_synOf = NULL;
mt.m_votes = 1;
mt.m_traffic = traffic;
mt.m_hackQueryOffsets[0] = hackqoff;
// if not a missing term, we are a MATCHING term
mt.m_isMissingTerm = isRelatedQuery;
// invalidate the remaining 9 query offsets
for ( int32_t x = 1 ; x < 10 ; x++ )
mt.m_hackQueryOffsets[x] = -1;
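		// each record we append to tmpBuf is the MissingTerm class
		// followed by the term's words (punctuation collapsed to a
		// single space) and a NUL. we record the string portion's
		// size in m_termSize below so getSize() can step over the
		// whole record later (see sortTermsIntoBuf()).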
int32_t offset = tmpBuf->length();
int32_t toCopy = sizeof(MissingTerm);
if ( ! tmpBuf->safeMemcpy(&mt,toCopy))
return false;
// for calculating length of stored term string
int32_t startLen = tmpBuf->length();
// . if first time in scoretable, add stuff
// . store the string, each word separately
for ( int32_t j = start ; j < start + nwk ; j++ ) {
			// shortcut
QueryWord *qw = &qq.m_qwords[j];
// point to word as string
char *str = qw->m_word;
int32_t len = qw->m_wordLen;
// make all punct a space
if ( qw->m_wordId == 0 ) {
str = " ";
len = 1;
}
// store term string after MissingTerm class
if ( ! tmpBuf->safeMemcpy(str,len) )
return false;
}
tmpBuf->pushChar('\0');
// record MissingTerm::m_termSize
int32_t delta = tmpBuf->length() - startLen;
char *base = tmpBuf->getBufStart();
MissingTerm *pmt = (MissingTerm *)(base + offset);
pmt->m_termSize = delta;
// now score table entry
if ( ! scoreTable->addKey ( &th64 , &offset ) )
return false;
}
return true;
}
// this is used to sort the MissingTerm instances in a safeBuf,
// missingTermBuf. it is now also used to sort the matching terms from
// getMatchingTermBuf().
bool XmlDoc::sortTermsIntoBuf ( HashTableX *scoreTable ,
SafeBuf *tmpBuf ,
SafeBuf *missingTermBuf ) {
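	// build an array of ptrs to the MissingTerm records in tmpBuf, sort
	// the ptrs with mtCmp() (importance desc, then votes desc), then
	// copy each record into missingTermBuf in that order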
// make ptrs for sorting
int32_t numTerms = scoreTable->getNumUsedSlots();
int32_t need = numTerms * 4;
SafeBuf ptrBuf;
if ( ! ptrBuf.reserve ( need ,"srtbuf") ) return false;
char *p = tmpBuf->getBufStart();
char *pend = tmpBuf->getBuf();
for ( ; p < pend ; ) {
MissingTerm *mt = (MissingTerm *)p;
p += mt->getSize();
ptrBuf.pushPtr ( mt );
}
gbqsort ( ptrBuf.getBufStart(),
numTerms,
sizeof(MissingTerm *),
mtCmp,
m_niceness);
// now write the missingTerm instances into m_missingTermBuf
int32_t need2 = tmpBuf->length();
if ( ! missingTermBuf->reserve ( need2 ,"mtbuf") ) return false;
// now write back into the real buf
MissingTerm **pp = (MissingTerm **)ptrBuf.getBufStart();
for ( int32_t i = 0 ; i < numTerms ; i++ ) {
MissingTerm *mt = pp[i];
missingTermBuf->safeMemcpy ( mt , mt->getSize() );
}
return true;
}
// . now this uses the related queries
// . use logic from getInsertableTerms()!!!
SafeBuf *XmlDoc::getMissingTermBuf ( ) {
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_missingTermBufValid )
return &m_missingTermBuf;
SafeBuf *qkbuf = getRelatedQueryBuf ();
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
HashTableX *topTermsTable = getTermIdBufDedupTable32();
if ( ! topTermsTable || topTermsTable == (void *)-1 )
return (SafeBuf *)topTermsTable;
SafeBuf tmpBuf;
if ( ! tmpBuf.reserve ( 100000 ,"t3buf" ) ) return NULL;
// maps 64-bit term hash (can be multiple words in a term) to
// an offset into tmpBuf.
HashTableX scoreTable;
if ( ! scoreTable.set(8,4,1024,NULL,0,false,m_niceness,"mttst") )
return NULL;
//
// taken from seo.cpp's printRelatedQueries() function
//
//int32_t *qrOffs = (int32_t *)relBuf->getBufStart();
//int32_t numRels = relBuf->length() / sizeof(int32_t);
//char *base = m_queryRelBuf.getBufStart();
//SafeBuf *rqsb = &m_relatedQueryStringBuf;
int32_t nks = qkbuf->length() / sizeof(QueryLink);
QueryLink *qks = (QueryLink *)qkbuf->getBufStart();
int32_t i;
for ( i = 0 ; i < nks ; i++ ) {
QUICKPOLL(m_niceness);
// stop at 300?
//if ( i >= 300 ) break;
QueryLink *qk = &qks[i];
int32_t qkOff = (char *)qk - qkbuf->getBufStart();
//int32_t relOff = qrOffs[i];
//QueryRel *rel = (QueryRel *)(base+relOff);
// skip if not head of a linked list
if ( ! qk->m_isFirst ) continue;
QueryLogEntry *qe ;
qe = qk->getQueryLogEntry(&m_relatedQueryStringBuf);
// relative to rqsb! m_relatedQueryStringBuf
float imp = qk->m_totalQueryImportance;
// modify by unique round? not yet...
//imp -= rel->m_uniqueRound * 1000;
// now use this function
if ( ! addTermsFromQuery ( qe->getQueryString() ,
qe->m_langId,
qe->m_gigablastTraffic,
qe->m_googleTraffic,
qkOff, // hackqoff
&tmpBuf ,
&scoreTable ,
topTermsTable ,
imp ,
true ) ) // is related query?
return NULL;
}
// sort MissingTerms from tmpBuf into m_missingTermBuf by
// MissingTerm::m_importance
if ( ! sortTermsIntoBuf ( &scoreTable,
&tmpBuf,
&m_missingTermBuf ) )
return NULL;
m_missingTermBufValid = true;
//m_numMissingTerms = i;
// store it
//if ( ! storeMissingTermBufIntoCachedb() )
// return (SafeBuf *)-1;
return &m_missingTermBuf;
}
// . now get the best terms from our matching queries
// . basically the exact same algo as getMissingTermBuf
SafeBuf *XmlDoc::getMatchingTermBuf ( ) {
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_matchingTermBufValid )
return &m_matchingTermBuf;
SafeBuf *mq = getMatchingQueryBuf();
if ( mq == NULL || mq == (void *)-1 ) return mq;
HashTableX *topTermsTable = getTermIdBufDedupTable32();
if ( ! topTermsTable || topTermsTable == (void *)-1 )
return (SafeBuf *)topTermsTable;
// tmpBuf will hold the MissingTerms we add.
SafeBuf tmpBuf;
if ( ! tmpBuf.reserve ( 100000 ,"t4buf") ) return NULL;
// maps 64-bit term hash (can be multiple words in a term) to
// an offset into tmpBuf. tmpBuf holds the missing terms, so we
// use scoreTable to accumulate MissingTerm::m_importance for
// the same term in different queries.
HashTableX scoreTable;
if ( ! scoreTable.set(8,4,1024,NULL,0,false,m_niceness,"mttst") )
return NULL;
// scan the queries this doc matches and add MissingTerms for them
// into tmpBuf
int32_t nks = mq->length() / sizeof(QueryLink);
QueryLink *qks = (QueryLink *)mq->getBufStart();
int32_t i; for ( i = 0 ; i < nks ; i++ ) {
QUICKPOLL(m_niceness);
QueryLink *qk = &qks[i];
// stop at 300?
if ( i >= 300 ) break;
// "matching terms" have different hackqoff than missing terms
int32_t qkOff = (char *)qk - mq->getBufStart();
// relative to rqsb! m_relatedQueryStringBuf
float imp = qk->m_queryImportance;
// querylogentry does not have string info here! it is
// just the basic class
QueryLogEntry *qe ;
qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
// . now use this function
if ( ! addTermsFromQuery ( qe->getQueryString(),
qe->m_langId,
qe->m_gigablastTraffic,
qe->m_googleTraffic,
qkOff, // hackqoff
&tmpBuf ,
&scoreTable ,
topTermsTable ,
imp ,
false ) ) // is related query?
return NULL;
}
// sort MatchingTerms from tmpBuf into m_matchingTermBuf by
// MatchingTerm::m_importance
if ( ! sortTermsIntoBuf ( &scoreTable,
&tmpBuf,
&m_matchingTermBuf ) )
return NULL;
m_matchingTermBufValid = true;
//m_numMatchingTerms = i;
// store it
//if ( ! storeMatchingTermBufIntoCachedb() )
// return (SafeBuf *)-1;
return &m_matchingTermBuf;
}
/*
// . max # of outstanding msg3f requests we can send to one host
// . now just make it 1 since it is msg3f NOT msg39
#define MAXOUT 1
//#define BINSIZE 100000
class Bin {
public:
// the current position for adding queries into m_buf
int32_t m_cursor;
int32_t m_maxCursor;
int32_t m_allocSize;
// some hack storage
Host *m_hackHost;
bool m_hackIsMsg99ReplyPtr;
// for sending the m_buf to its host
Multicast m_mcast;
// allocates size of BINSIZE bytes
char m_buf[0];
};
// . returns false and sets g_errno on error
// . returns true on successful launch of request, it will block always
bool XmlDoc::sendBin ( int32_t i ) {
Bin *bin = m_currentBinPtrs[i];
// get host
Host *h = g_hostdb.getHost(i);
// copy it
//int32_t reqSize = p - tmpBuf;
//char *req = mdup ( tmpBuf , reqSize , "3freq" );
//if ( ! req ) return true;
// increment outstanding requests he has
h->m_numOutstandingRequests++;
// this could be a ptr to a msg99reply or a querylink
Multicast *mcast = &bin->m_mcast;
//bin->m_hackxd = this;
//bin->m_hackPtrCursor = firstPtrCursor;
bin->m_hackHost = h;
// get his group id
uint32_t groupId = h->m_groupId;
char *req = bin->m_buf;
int32_t reqSize = bin->m_cursor;
// disown it so mcast can free it when its udpslot is destroyed
m_currentBinPtrs[i] = NULL;
// note that
setStatus("launching msg3f");
// log it too
//log("seopipe: launching msg3f request of %"INT32" gbdocid queries to "
// "score to host %"INT32"", queryCount,h->m_hostId);
// get the docIds for this query using msg3f.cpp's handleRequest3f()
bool status = mcast->send ( req ,
reqSize,
0x3f ,
false, // mcast frees request? no!!!
groupId, // group to send to
false, // send to whole group?
0 , // query hash for host in group select
this , // state1
bin,//mcast, // state2
gotMsg3fReplyWrapper,
86401, // timeout in seconds. LONG TIME!
m_niceness,
false, // realtime?
h->m_hostId // firsthostid to try
);
// mark it out
m_numMsg3fRequests++;
// if this is true then it was a success and we BLOCKED
if ( status ) {
// must BE IN USE!
if ( ! mcast->m_inUse ) { char *xx=NULL;*xx=0; }
// success
return true;
}
// it came back?
m_numMsg3fReplies++;
// undo this
h->m_numOutstandingRequests--;
// errno should be set
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// set error
m_binError = g_errno;
// note it
log("seopipe: mcast had error: %s", mstrerror(g_errno));
// free that bin i guess
mfree ( bin , bin->m_allocSize, "delbin" );
// return false on error
return false;
}
// . this is called from two places:
// 1. getMatchingQueriesScored() (Msg99Reply ptrs)
// 2. getRelatedQueryBuf() (QueryLink ptrs)
// . this can take Msg99Reply ptrs or it can take QueryLink ptrs
// . it will glean the docid from either of these two ptrs types as well
// as glean the pointer to the query string.
// . THEN it can create a 'gbdocid:xxxx | <queryString>' query which
// it will send to a host in the network.
// . it will try to keep each host in the network answering 5 such queries
// at any one time. bins are no longer used.
// . we need to implement heavy termlist caching remotely and locally to
// ensure optimal speed
// . returns false if blocked, true otherwise
// . returns true with g_errno set on error
bool XmlDoc::scoreDocIdRestrictedQueries ( Msg99Reply **replyPtrs ,
QueryLink *linkPtrs ,
int32_t numPtrs ) {
//log("debug: entered scoredocidrestrictedqueries");
if ( numPtrs == 0 ) return true;
// . sanity check
// . you can only score your Msg99Replies or your QueryLinks
// . score your Msg99Replies for queries that match the main url
// . score your QueryLinks for queries that match a related docid
if ( ! replyPtrs && ! linkPtrs ) { char *xx=NULL;*xx=0; }
if ( replyPtrs && m_setForReplyPtrs ) return true;
if ( linkPtrs && m_setForLinkPtrs ) return true;
// we now send the termlistbuf to each host receiving a msg3f
// request so when it performs the msg39 on a query we provide it
// will set QueryTerm::m_posdbListPtr to point to the termlists we
// provided only, just for this docid
SafeBuf *termListBuf = NULL;
if ( ! linkPtrs ) {
termListBuf = getTermListBuf();
if ( ! termListBuf ) return true;
if ( termListBuf==(void *)-1 ) return false;
}
// force to ten for debug
//numPtrs = 20;
sendLoop:
//
// cleanup if got all replies we can
//
if ( m_numMsg3fReplies == m_numMsg3fRequests &&
((m_qcursor >= numPtrs) || m_binError) ) {
//log("debug: cleanup");
// there might be remnant bins if we stopped trying to
// call sendBin because we hit m_binError
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
// see if that bin is still around
Bin *bin = m_currentBinPtrs[i];
if ( ! bin ) continue;
// this will core if the multicast is in use
bin->m_mcast.destructor();
// now nuke it then
mfree ( bin , bin->m_allocSize, "delbin" );
// now make it null
m_currentBinPtrs[i] = NULL;
}
// nuke this too!
if ( m_newxd2 ) {
mdelete ( m_newxd2 , sizeof(XmlDoc) , "newxd2" );
delete ( m_newxd2 );
m_newxd2 = NULL;
}
// free table's mem if used
m_tmpDupTable.reset();
// do not repeat this logic!
if ( replyPtrs ) {
m_setForReplyPtrs = true;
m_binErrorForReplyPtrs = m_binError;
}
if ( linkPtrs ) {
m_setForLinkPtrs = true;
m_binErrorForLinkPtrs = m_binError;
}
// inherit error? pass it on to caller
//if ( m_binError ) g_errno = m_binError;
// reset for another call to this function since we call
// if from two different places above
m_numMsg3fRequests = 0;
m_numMsg3fReplies = 0;
m_qcursor = 0;
m_binError = 0;
// all done!
g_errno = 0;
return true;
}
	// shortcut
char *base = m_tmpStringBuf5.getBufStart();
if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; }
// store the queries in our buffer into the various bins and send
// a bin off when it gets full
queryLoop:
// breathe
QUICKPOLL(m_niceness);
// nothing left to do except wait for replies?
if ( m_qcursor >= numPtrs ) return false;
// assume ptr is good
bool good = true;
// set these
int64_t docId;
// the query as a string
char *qstr = NULL;
// for passing to mcast::m_hackQPtrs
void *vptr;
// get the ith QueryLink?
if ( linkPtrs ) {
QueryLink *qk = &linkPtrs[m_qcursor];
// skip if was not successfully processed above
		// because its hostid was dead perhaps?
if ( qk->m_queryHostId != -1 ) good = false;
// get from related docid in this case
SafeBuf *rdbuf = &m_relatedDocIdBuf;
if ( ! m_relatedDocIdBufValid ) { char *xx=NULL;*xx=0; }
RelatedDocId *rd = qk->getRelatedDocId(rdbuf);
docId = rd->m_docId;
// get it
QueryLogEntry *qe ;
qe = (QueryLogEntry *)(qk->m_queryStringOffset + base);
// and this. skip over goodserpscore, gigablastTraffic and
// googleTraffic
qstr = qe->getQueryString();
// save it
vptr = qk;
}
// make a new one for the first time
if ( linkPtrs && ! m_newxd2 ) {
try { m_newxd2 = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
m_binError = g_errno;
goto sendLoop;
}
mnew ( m_newxd2, sizeof(XmlDoc),"newxd2");
}
// set the xmldoc to this new docid, if it is new...
if ( linkPtrs && m_newxd2->m_docId != docId ) {
// a good stopping point?
if ( clientClosedConnection() ) {
m_binError = ESOCKETCLOSED;
goto sendLoop;
}
// set it from related doc's docid
if ( ! m_newxd2->set3 ( docId ,cr->m_coll, m_niceness ) ) {
m_binError = g_errno;
goto sendLoop;
}
// alloc space for tablen
if ( m_tmpDupTable.getNumSlots() <= 0 &&
! m_tmpDupTable.set ( 8,0,1024,NULL,0,false,m_niceness,
"tdtbl") ) {
m_binError = g_errno;
goto sendLoop;
}
// must not be in there already!
if ( m_tmpDupTable.isInTable ( &docId ) ) {
char *xx=NULL;*xx=0; }
// add it
if ( ! m_tmpDupTable.addKey ( &docId ) ) {
m_binError = g_errno;
goto sendLoop;
}
// ensure content is recycled from title rec
m_newxd2->m_recycleContent = true;
// newxd2 needs to use our master functions. so
// anytime one of its internal functions blocks, then
// our m_masterLoop will be called
// and we'll end up right here again!
m_newxd2->m_masterLoop = m_masterLoop;
m_newxd2->m_masterState = m_masterState;
// only get posdb keys really for this stuff
m_newxd2->m_useTitledb = false;
m_newxd2->m_useTagdb = false;
m_newxd2->m_useClusterdb = false;
m_newxd2->m_useSpiderdb = false;
m_newxd2->m_useLinkdb = false;
// debug
log("seopipe: setting newxd2 docid=%"INT64"",docId);
}
// pump this
if ( linkPtrs && ! m_newxd2->m_loaded ) {
// . CRAP, blocking here sucks because when this function
// is re-entered it can also be from a Msg3f reply
// not because this document is back from msg22a...
//log("debug: loading newxd2");
// try to set from title rec first. return false if blocks.
if ( ! m_newxd2->loadFromOldTitleRec() ) {
m_newxd2Blocked = true;
//log("debug: newxd2 blocked");
return false;
}
}
// i guess no longer out
if ( linkPtrs && m_newxd2->m_loaded )
m_newxd2Blocked = false;
//if ( linkPtrs )
// log("debug: newxd2 loaded=%"INT32"",(int32_t)m_newxd2->m_loaded);
// sanity check
if ( linkPtrs && ! m_newxd2->m_oldTitleRecValid ) {
char *xx=NULL;*xx=0;
}
// . did that fail? i.e. docid not found!?!?!
// . do not increment m_qcursor if m_binError is set
if ( linkPtrs && ! m_newxd2->m_oldTitleRec && ! m_binError ) {
// just skip this asshole then
if ( m_lastPrintedDocId != docId ) {
log("seopipe: related docid %"INT64" titlerec "
"load failed99",
docId);
}
m_lastPrintedDocId = docId;
// clear that
g_errno = 0;
// skip it
m_qcursor++;
// try the next one
goto queryLoop;
}
if ( linkPtrs ) {
// . CRAP, blocking here sucks because when this function
// is re-entered it can also be from a Msg3f reply
// not because it has the termlistbuf ready
// . use termlist buf of related docid
// . we need to ENSURE that the QueryLinks are clustered
// by related docid so this logic is efficient here
termListBuf = m_newxd2->getTermListBuf();
// return false if it blocked
if ( termListBuf == (void *)-1 ) {
//log("debug: newxd2 blocked in termlistbuf");
m_newxd2Blocked = true;
return false;
}
// this sucks. error!
if ( ! termListBuf ) {
m_binError = g_errno;
goto sendLoop;
}
}
// i guess no longer out
if ( linkPtrs ) {
//log("debug: newxd2 UNblocked in termlistbuf");
m_newxd2Blocked = false;
}
// wait for replies to come in so we can stop even if m_qcursor
// did not complete its scan!
// shit, but what if we are a msg22 coming in for m_newxd2? that
// is why i moved this check down here so we can set m_newxd2Blocked
// to false and allow the msg3f replies to come back in and free
// all the bins. this is kinda fucked up because everything is
// asynchronous.
if ( m_binError ) return false;
// otherwise the Msg99Reply
if ( ! linkPtrs ) {
Msg99Reply *qp = replyPtrs[m_qcursor];
// tis us!
docId = m_docId;
// sanity
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// and query string
qstr = qp->m_queryStr;
// save it
vptr = qp;
}
int32_t qlen = gbstrlen(qstr);
// mark as bad if this query is too big already
if ( m_firstUrl.m_ulen + qlen + 10 > MAX_QUERY_LEN )
good = false;
// if ptr was bad, do not evaluate at all
if ( ! good ) {
m_qcursor++;
goto queryLoop;
}
// sanity
if ( ! cr->m_coll || ! cr->m_coll[0] ) { char *xx=NULL;*xx=0; }
// . get hash of query to determine bin
// . this keeps our term freqs consistent since every query goes
// back TO THE SAME HOST!!! thus our scores remain consistent.
// each host has a slightly different TermFreq/Weight for the
// exact same query because the termfreq is based on the termlist
// length for that termid. and each host has a different set of
// docids in its index for the most part.
uint32_t h32 = hash32n ( qstr );
int32_t numHosts = g_hostdb.getNumHosts();
// do not send to host #0 if we got a lot of hosts
if ( g_hostdb.getNumHosts() >= 8 ) numHosts--;
int32_t hostNum = h32 % numHosts;
// skip host #0 which is us i guess!
if ( g_hostdb.getNumHosts() >= 8 ) hostNum++;
// sanity for that
if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
// get the current bin for that host
Bin *bin = m_currentBinPtrs [ hostNum ];
// alloc on demand
if ( ! bin ) {
// how big is the termlistbuf?
int32_t tsize = termListBuf->length();
int32_t collLen = gbstrlen(cr->m_coll);
// how much space do we need for a good bin?
int32_t alloc = sizeof(Bin) + 8 +1+ collLen + 1 + tsize + 100000;
// make that
char *mem = (char *)mmalloc ( alloc ,"binreq" );
if ( ! mem ) {
m_binError = g_errno;
goto sendLoop;
}
// cast it
bin = (Bin *)mem;
// store it
m_currentBinPtrs [ hostNum ] = bin;
// this includes a Multicast in the Bin
bin->m_mcast.constructor();
// for freeing
bin->m_allocSize = alloc;
// the end of it
char *memEnd = mem + alloc;
// reset offset into Bin::m_buf
bin->m_cursor = 0;
// is it to a msg99reply? so the reply handler knows how to
// handle mcast::m_hackQPtr and what action to take. it is
// slightly different.
if ( linkPtrs ) bin->m_hackIsMsg99ReplyPtr = 0;
else bin->m_hackIsMsg99ReplyPtr = 1;
// . before we add any queries, store langid of QUERY
// . crap just use doc langid for now
char *bp = bin->m_buf;
// first is docid. if doing QueryLinks this is the docid
// of the related docid, otherwise, it is that of our main doc
*(int64_t *)bp = docId; bp += 8;
// then langid
*bp = m_langId; bp++;
// then the coll
gbmemcpy ( bp , cr->m_coll , collLen );
bp += collLen;
*bp++ = '\0';
// sanity!
if ( bp >= memEnd ) { char *xx=NULL;*xx=0; }
// the size of the termlist buf
*(int32_t *)bp = tsize; bp += 4;
		// then the termlistbuf that has all the termlists for our docid
gbmemcpy ( bp , termListBuf->getBufStart(), tsize ); bp += tsize;
// update bin's cursor
bin->m_cursor = bp - bin->m_buf;
// for breach detection. send off Bin when breach happens.
bin->m_maxCursor = alloc - sizeof(Bin);
}
// can we store the current query into this bin?
bool storeInBin = true;
// is there enough room for this query in the bin?
int32_t need = qlen + 40;
if ( bin->m_cursor + need >= bin->m_maxCursor )
storeInBin = false;
// does docid of bin match?
int64_t binDocId = *(int64_t *)(bin->m_buf);
if ( docId != binDocId )
storeInBin = false;
// if we can't store this query into the bin, send it off now
if ( ! storeInBin ) {
// use its multicast to send this bin off if too full
if ( ! sendBin ( hostNum ) ) {
m_binError = g_errno;
goto sendLoop;
}
// . now the current bin should have been emptied
// . go back to top to realloc Bin::m_buf to hold this query
goto queryLoop;
}
char *p = bin->m_buf + bin->m_cursor;
// first store the offset from the buf so we can return it
// in the reply which is a list of scores basically and we know
// what score goes with what m_qcursor
*(int32_t *)p = m_qcursor;
p += 4;
// now store queries in the request buf for the msg3f
p += sprintf(p,"gbdocid:%"UINT64" | %s",docId,qstr);
*p++ = '\0';
// update cursor
bin->m_cursor = p - bin->m_buf;
// skip to next query/docid to evaluate
m_qcursor++;
// if we have more queries left, add them to bins now
if ( m_qcursor < numPtrs ) goto queryLoop;
// now send every bin, we have no queries left.
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// skip if empty
if ( ! m_currentBinPtrs[i] ) continue;
// this will transfer the request buffer over to mcast
// so it will be freed when mcast returns
sendBin ( i );
}
goto sendLoop;
}
// we got back the score for each query link in
// the bin that we sent out for the docid specified in the bin header request
void XmlDoc::gotMsg3fReply ( Bin *bin ) { // Multicast *mcast ) {
setStatus ( "gotmsg3freply" );
// do some housekeeping
Host *h = bin->m_hackHost;
h->m_numOutstandingRequests--;
m_numMsg3fReplies++;
// sanity
Multicast *mcast = &bin->m_mcast;
if ( mcast->m_inUse ) { char *xx=NULL;*xx=0; }
// get the reply
bool freeIt = false;
int32_t replySize = 0;
int32_t replyMaxSize;
char *rbuf = mcast->getBestReply ( &replySize ,
&replyMaxSize ,
&freeIt ,
true ); // steal it?
// log it too
//log("seopipe: got msg3f reply from host %"INT32" size=%"INT32" bytes",
// h->m_hostId,replySize);
// cast it
//Msg3fReply *mr = (Msg3fReply *)rbuf;
// in case of mem-leak this helps
//if ( rbuf ) relabel(rbuf,replyMaxSize,"xx-rb");
// . we must be able to free it... we must own it
// . this is true if we should free it, but we should not have
// to free it since it is owned by the slot?
if ( freeIt ) {
log(LOG_LOGIC,"query: msg3f: Steal failed.");
char *xx = NULL; *xx=0;
}
// if it failed for some reason i guess just bail
if ( ! rbuf ) {
// clean up the bin and the multicast and the request buffer
mfree ( bin , bin->m_allocSize, "delbin" );
g_errno = EBADREPLYSIZE;
log(LOG_LOGIC,"seopipe: bad msg3f empty reply");
return;
}
// reply is just sequence of docid/score pairs
char *rp = rbuf;
char *rpEnd = rbuf + replySize;
//int32_t firstCursor = bin->m_hackPtrCursor;
// scan the msg99 replies and insert the scores we got for each
// query from the msg3f reply in "rbuf"
for ( ; rp < rpEnd ; ) {
// breathe
QUICKPOLL(m_niceness);
// . first is index, what query # in the request are we
// processing now, might not be in order because we launch
// a bunch of msg39s in parallel in handleRequest3f()'s call
// to processQueries()
// . but the corresponding msg99reply is reply # "qcursor"
int32_t qcursor = *(int32_t *)rp;
rp += 4;
int64_t docId = *(int64_t *)rp;
rp += 8;
float score = *(float *)rp;
rp += 4;
// . if this is true that means qcursor is referencing a
// msg99reply and we should set the score of that msg99
// reply to what the handlerequest3f provided
// . so store the docid and score for our url for this query
if ( bin->m_hackIsMsg99ReplyPtr ) {
SafeBuf *mqbuf = getMatchingQueries(false,-1);
Msg99Reply **qptrs=(Msg99Reply **)mqbuf->getBufStart();
Msg99Reply *qr = qptrs[qcursor];
qr->m_myScore = score;
qr->m_myDocId = docId;
int32_t numQueryPtrs=mqbuf->length()/sizeof(Msg99Reply *);
// if too many skip some
if ( numQueryPtrs > 1000 && (qcursor%1000)!=0)continue;
// if too many skip some
if ( numQueryPtrs > 400 && (qcursor%100) !=0)continue;
char *qstr = qr->m_queryStr;
log("seopipe: got query #%"INT32"of%"INT32" score=%f qstr=%s"
,qcursor+1
,numQueryPtrs
,score
,qstr
);
continue;
}
// might be storing in a QueryLink (doing related docids)
//SafeBuf *ibuf = getRelatedQueryLinksWithStrings();
QueryLink *qks =(QueryLink *)m_tmpBuf5.getBufStart();
//int32_t numQueryLinks = ibuf->length() / sizeof(QueryLink);
QueryLink *qk = &qks[qcursor];
// sanity. make sure qk->m_queryStringOffset is related to our
// local m_tmpStringBuf5 and not relative to the
// g_qbuf of the hostid that sent back the msg99 reply.
if ( qk->m_queryHostId != -1 ) { char *xx=NULL;*xx=0; }
// how many related query links do we got? for logging.
int32_t nks = m_tmpBuf5.length()/sizeof(QueryLink);
		// shortcuts
char *base = m_tmpStringBuf5.getBufStart();
// skip over gigablastTraffic and googleTraffic
QueryLogEntry *qe;
qe = (QueryLogEntry *)(base + qk->m_queryStringOffset);
SafeBuf *rdbuf = &m_relatedDocIdBuf;
if ( ! m_relatedDocIdBufValid ) { char *xx=NULL;*xx=0; }
RelatedDocId *rd = qk->getRelatedDocId(rdbuf);
// note it
if ( (qcursor % 1000) == 0 ) // || qcursor < 100 )
log("seopipe: got msg3f reply for related query "
"#%"INT32"of%"INT32" "
"query \"gbdocid:%"INT64" | %s\" gigablasttraffic=%"INT32" "
"googletraffic=%"INT32" serpscore=%f goodscore=%f"
,qcursor+1
,nks
,rd->m_docId
,qe->getQueryStr()
,qe->m_gigablastTraffic
,qe->m_googleTraffic
,score
,qe->m_topSERPScore // of a docid slice on 1 host
);
//
// no longer used queryrel!
//
// if we are scoring QueryLinks then we add a QueryRel
//QueryRel qr;
// clear that mem to zero
//memset ( &qr , 0 , sizeof(QueryRel));
// then add the info we know
//qr.m_relatedDocId = qk->m_relatedDocId;
//char *base2 = m_relatedDocIdBuf.getBufStart();
//int32_t rdOff = (char *)qk->m_relatedDocId - base2;
//qr.m_relatedDocIdOff = rdOff;
//qr.m_offsetIntoRelQStrBuf = qk->m_queryStringOffset;
//qr.m_myScore = score;
//qr.m_nextOff = -1;
//qr.m_tailOff = -1;
qk->m_serpScore = score;
// save that. WHAT IF THIS ERRORS?!?!?!
//if ( ! m_queryRelBuf.safeMemcpy(&qr,sizeof(QueryRel)) ) {
// m_binError = g_errno;
// log("xmldoc: panic. failed to store query rel");
// break;
//}
// debug test
//m_binError = EBADENGINEER;
//log("xmldoc: panic2. failed to store query rel");
//break;
}
// ok, we got the docid and score, now free it
mfree ( rbuf , replyMaxSize , "fmsg3f" );
// clean up the bin and the multicast and the request buffer
mfree ( bin , bin->m_allocSize, "delbin" );
//if ( m_newxd2Blocked )
// log("debug: got reply, but returning because newxd2 "
// "had blocked");
// prevent double entry bug from entering scoreDocIdRestrictedQueries()
// from a newxd2 function blocking and coming in through msg22
// callback or whatever, vs. coming in from here
if ( m_newxd2Blocked ) return;
//log("debug: got reply and calling masterloop");
// go back to the transmit function
m_masterLoop ( m_masterState );
// if not done, just return... otherwise we double enter
// scoreDocIdRestrictedQueries() along with it's call to
// getTermListBuf()... and all hell breaks loose
return;
}
*/
/*
// send contents of m_socketWriteBuf to m_seoSocket
void XmlDoc::pumpSocketWriteBuf ( ) {
if ( ! m_seoSocket ) return;
setStatus ( "pumpsocketwritebuf" );
SafeBuf *sb = &m_socketWriteBuf;
// insert http header into m_socketWriteBuf if not there
char *wbuf = sb->getBufStart();
bool insertIt = false;
if ( ! wbuf ) insertIt = true;
if ( wbuf && strncmp(wbuf,"HTTP/1.0 ",9 ) ) insertIt = true;
// add http header first
if ( insertIt ) {
// reset # bytes sent
m_socketWriteBufSent = 0;
m_registeredSocketCallback = false;
// xml-itize each query reply without scoring info
sb->insert("HTTP/1.0 200 OK\r\n"
"Content-Type: text/xml ; "
"charset=utf-8\r\n"
"\r\n"
"<response>\n",0);
}
// come back here to do another send
sendLoop:
// try sending out our xml buffer on the socket
// the very first things we do is send the queries over without
// the ranking info which we compute by calling msg39 on each query,
// so at least we can display something quite quickly.
if ( m_socketWriteBufSent < sb->length() ) {
int32_t sd = m_seoSocket->m_sd;
// just in case
if ( m_registeredSocketCallback ) {
g_loop.unregisterWriteCallback(sd,this,
getSEOQueryInfoWrapper2);
m_registeredSocketCallback = false;
}
// send that off
int32_t sendLen = sb->length();
char *sendStr = sb->getBufStart();
char *sendEnd = sendStr + sendLen;
// if we sent SOME last time, skip over that
sendStr += m_socketWriteBufSent;
// how much left?
int32_t remaining = sendEnd - sendStr;
// wtf?
if ( remaining <= 0 ) { char *xx=NULL;*xx=0; }
// try a send on non-blocking socket
int32_t n = ::send ( sd , sendStr , remaining , 0 );
// did we send something?
if ( n > 0 ) {
m_socketWriteBufSent += n;
goto sendLoop;
}
// maybe it sent 0 because it was waiting for something
// so set our callback for when the socket is ready for
// writing again. try sending more later.
g_loop.registerWriteCallback ( sd ,
this ,
getSEOQueryInfoWrapper2,
0 ); // niceness = 0
// flag it so we don't leak these
m_registeredSocketCallback = true;
}
}
*/
bool XmlDoc::getIsInjecting ( ) {
bool isInjecting = false;
//if ( g_inPageInject ) isInjecting = true;
if ( m_sreqValid && m_sreq.m_isInjecting ) isInjecting = true;
if ( m_isInjecting && m_isInjectingValid ) isInjecting = true;
return isInjecting;
}
int posdbKeyCmp ( const void *a, const void *b ) {
char *ka = (char *)a;
char *kb = (char *)b;
//int64_t tid64a = g_posdb.getTermId(ka);
//int64_t tid64b = g_posdb.getTermId(kb);
// a bit of a hack so handleRequest8e already has these
// guys sorted by their lower 32-bits of termids so it can
// match this doc to queries without having to sort first.
//uint32_t tid32a = (uint32_t)tid64a;
//uint32_t tid32b = (uint32_t)tid64b;
//if ( tid32a < tid32b ) return -1;
//if ( tid32a > tid32b ) return 1; // swap
//if ( tid64a < tid64b ) return -1;
//if ( tid64a > tid64b ) return 1; // swap
char val = KEYCMP(ka,kb,sizeof(POSDBKEY));
if ( val > 0 ) return 1;
if ( val < 0 ) return -1;
return 0;
}
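// Illustrative note (not compiled): KEYCMP() above compares the full
// 18-byte POSDBKEY, so sorting a buffer of full keys with posdbKeyCmp()
// groups all keys for the same termid together, which is exactly what
// getTermListBuf() below relies on when it builds one termlist per
// termid. A minimal sketch using only the g_posdb accessors already
// used in this file:
/*
char *p    = sortedBuf->getBufStart();
char *pend = sortedBuf->getBuf();
int64_t lastTermId = -1LL;
for ( ; p < pend ; p += sizeof(POSDBKEY) ) {
int64_t termId = g_posdb.getTermId ( p );
// a change in termid marks the start of a new per-termid run
if ( termId != lastTermId ) lastTermId = termId;
}
*/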
// . used by XmlDoc::getTermListBuf() below
// . sorted by posdb key straight up
SafeBuf *XmlDoc::getTermIdSortedPosdbListBuf ( ) {
if ( m_sortedPosdbListBufValid )
return &m_sortedPosdbListBuf;
// get the lists. forDelete = false.
char *metaList = getMetaList ( false );
if ( ! metaList || metaList==(void *)-1 ) return (SafeBuf *)metaList;
// sanity
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// make a tmp buf to hold posdb keys
//SafeBuf tmp;
if ( ! m_sortedPosdbListBuf.reserve(m_metaListSize,"spbuf"))
return NULL;
// point into it
char *dst = m_sortedPosdbListBuf.getBufStart();
// debug test
//verifyMetaList ( m_metaList ,
// m_metaList + m_metaListSize ,
// false );
// scan the meta list for posdb keys
char *p = metaList;
char *pend = p + m_metaListSize;
// stole this loop from getMetaList()
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// save it with the flag
char byte = *p;
// get rdbId
char rdbId = byte & 0x7f;
// skip that
p++;
// key size
int32_t ks = getKeySizeFromRdbId(rdbId);
// get key
char *key = p;
// skip that
p += ks;
// get datasize
int32_t ds = getDataSizeFromRdbId(rdbId);
// assume we do not store the datasize
//bool neg = false;
// . if key is negative, no data is present
// . the doledb key is negative for us here
if ( (key[0] & 0x01) == 0x00 ) ds = 0;
// if datasize variable, read it in
if ( ds == -1 ) {
// get data size
ds = *(int32_t *)p;
// skip data size int32_t
p += 4;
}
// point to data
//char *data = p;
// skip data if not zero
p += ds;
// if not posdb skip rec
if ( rdbId != RDB_POSDB ) continue;
// skip negative keys
if ( (key[0] & 0x01) == 0x00 ) continue;
// add to new buf now
gbmemcpy ( dst , key , sizeof(POSDBKEY) );
// advance
dst += sizeof(POSDBKEY);
}
char *start = m_sortedPosdbListBuf.getBufStart();
// update tmp
m_sortedPosdbListBuf.incrementLength ( dst - start );
// sanity
if ( m_sortedPosdbListBuf.length() > m_metaListSize ) {
char *xx=NULL;*xx=0; }
// point
char *pbuf = m_sortedPosdbListBuf.getBufStart();
int32_t numKeys = m_sortedPosdbListBuf.length()/sizeof(POSDBKEY);
// sort keys by termid
gbqsort ( pbuf ,
numKeys,
sizeof(POSDBKEY),
posdbKeyCmp,
m_niceness );
m_sortedPosdbListBufValid = true;
return &m_sortedPosdbListBuf;
}
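// Illustrative note (not compiled): the meta list scanned above is a flat
// sequence of variable-size records, each laid out as
//   [1 byte rdbId+flag][key of getKeySizeFromRdbId() bytes]
//   [4-byte dataSize, only present when the rdb has variable-size data
//    and the key is positive][dataSize bytes of data]
// A minimal record walker under those assumptions:
/*
for ( char *p = metaList ; p < metaList + m_metaListSize ; ) {
char rdbId = *p & 0x7f;                  // strip the flag bit
p++;                                     // skip the rdbId byte
char *key = p;
p += getKeySizeFromRdbId ( rdbId );      // fixed-size key
int32_t ds = getDataSizeFromRdbId ( rdbId );
if ( (key[0] & 0x01) == 0x00 ) ds = 0;   // negative key: no data
if ( ds == -1 ) { ds = *(int32_t *)p; p += 4; } // variable-size data
p += ds;                                 // skip the data
}
*/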
#define TLBUFSIZE 5000
// . used by the seo pipeline
// . this is a list of posdb termlists, one termlist per termid.
// . we store each termlist in this termlistbuf into g_termListCache
// . we use g_termListCache for evaluating gbdocid:xxx| restricted queries
// very quickly without having to hit disk because all the posdb termlists
// for that docid should be in g_termListCache
SafeBuf *XmlDoc::getTermListBuf ( ) {
if ( m_termListBufValid )
return &m_termListBuf;
// . ensure content is recycled from title rec
// . no, because if we had to download the doc fresh for the first
// time, this caused us headaches around line 30657 and we ended
// up setting m_docIndexed to false there and calling logIt() twice!
//m_recycleContent = true;
//m_recycleLinkInfo = true;
// try to set from title rec first. return false if it blocked.
//if ( ! loadFromOldTitleRec() ) return (SafeBuf *)-1;
// did that fail? i.e. docid not found!?!?!
//if ( m_oldTitleRecValid && ! m_oldTitleRec ) {
// g_errno = ENOTFOUND;
// return NULL;
//}
// only get posdb keys in getMetaList()
/*
m_useTitledb = false;
m_useTagdb = false;
m_useClusterdb = false;
m_useSpiderdb = false;
m_useLinkdb = false;
*/
// . these are FULL 18-byte keys, no compression
// . sorted by posdbkeys straight up, so by termid
SafeBuf *posdbBuf = getTermIdSortedPosdbListBuf ();
if ( ! posdbBuf || posdbBuf == (void *)-1 ) return posdbBuf;
int32_t numKeys = posdbBuf->length() / sizeof(POSDBKEY);
// . reserve mem for new termlistbuf
// . include 4 bytes for listsize
// . this buffer will be a list of lists
int32_t need = numKeys * (sizeof(POSDBKEY) + 4);
if ( ! m_termListBuf.reserve ( need ,"tlstbuf" ) )
return NULL;
int64_t lastTermId = -1LL;
/*
char tmpBuf[TLBUFSIZE];
// build termlists from the posdb records
RdbList termList;
// stolen from RdbList::set
termList.m_list = tmpBuf;
termList.m_listSize = 0;
termList.m_listEnd = tmpBuf;
termList.m_alloc = tmpBuf;
termList.m_allocSize = TLBUFSIZE;
termList.m_ownData = false;
termList.m_ks = sizeof(POSDBKEY);
termList.m_fixedDataSize = 0;
termList.m_ownData = false;
termList.m_useHalfKeys = true;
termList.resetListPtr();
bool breakOut = false;
*/
// start a size bookmark
int32_t *bookmark = NULL;
// scan all the sorted posdb keys and build posdb termlists and
// store the termlists into "m_termListBuf"
char *p = posdbBuf->getBufStart();
char *pend = p + posdbBuf->length();
for ( ; p < pend ; ) {
// get the key
char *key = p;
// must be full 18 byte keys!
if ( p[0] & 0x06 ) { char *xx=NULL;*xx=0; }
// skip it
p += sizeof(POSDBKEY);
// get key termid
int64_t termId = g_posdb.getTermId ( key );
// sanity
int64_t docId = g_posdb.getDocId ( key );
if ( docId != m_docId ) { char *xx=NULL;*xx=0; }
// sanity. is it sorted by termid?
if ( termId < lastTermId && lastTermId != -1 ) {
char *xx=NULL;*xx=0; }
// log it for debug
//if ( docId == 192304365235LL )
// log("tlist: docid=%"INT64" termId=%"INT64" wpos=%"INT32"",
// docId,
// termId,
// g_posdb.getWordPos(key));
// . store size of keys following that have same termid
// . assume just one for now!
if ( termId != lastTermId ) {
bookmark = (int32_t *)m_termListBuf.getBuf();
m_termListBuf.pushLong(sizeof(POSDBKEY));
}
// store the key
m_termListBuf.safeMemcpy ( key , sizeof(POSDBKEY) );
// if not first in the list, update size
if ( termId == lastTermId ) *bookmark += sizeof(POSDBKEY);
// . cache currently made list then
// . set startkey/endkey
//char startKey[sizeof(POSDBKEY)];
//char endKey [sizeof(POSDBKEY)];
//g_posdb.makeStartKey(startKey,lastTermId,m_docId);
//g_posdb.makeEndKey (endKey,lastTermId,m_docId);
// update it for next list
lastTermId = termId;
// . add to ongoing list? will use compression bit.
// . return true with g_errno set on error
// . use g_termListCache in Msg0.cpp
//if(!addToTermListCache(cr->m_coll,startKey,endKey,&termList))
// return true;
// first store the list size
//m_termListBuf.pushLong(termList.m_listSize);
// then the list data itself
//m_termListBuf.safeMemcpy(termList.m_list,termList.m_listSize)
// now reset
//termList.m_listSize = 0;
//termList.m_list = tmpBuf;
//termList.m_listEnd = tmpBuf;//ermList.m_list;
//termList.resetListPtr();
// if we are a loopback, bail
//if ( breakOut ) break;
// are we the last record?
//if ( p >= pend ) breakOut = true;
// add fresh to the new termlist
//goto addIt;
}
// sanity
if ( m_termListBuf.length() &&
g_posdb.getDocId(m_termListBuf.getBufStart()+4) != m_docId ) {
char *xx=NULL;*xx=0; }
m_termListBufValid = true;
return &m_termListBuf;
// print timing
//int64_t now = gettimeofdayInMilliseconds();
//int64_t took = now - m_cacheStartTime;
//log("seopipe: took %"INT64" ms to parse docid %"INT64"",took,m_docId);
// . flag it as being completely cached now
// . returns false and sets g_errno on error
//return addDocIdToTermListCache ( m_docId , cr->m_coll );
}
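// Illustrative note (not compiled): m_termListBuf built above is a "list
// of lists". Each per-termid termlist is a 4-byte size followed by that
// many bytes of full 18-byte posdb keys sharing one termid:
//   [int32 listSize][POSDBKEY ...][int32 listSize][POSDBKEY ...] ...
// which is also why the sanity check above reads the docid at offset +4.
// A minimal consumer sketch under that layout:
/*
char *p    = m_termListBuf.getBufStart();
char *pend = m_termListBuf.getBuf();
for ( ; p < pend ; ) {
int32_t listSize = *(int32_t *)p;
p += 4;
// all keys in this termlist share the same termid
int64_t termId = g_posdb.getTermId ( p );
p += listSize;
}
*/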
//int32_t XmlDoc::getNumInsertableTerms ( ) {
// // make sure they called getInsertableTerms() first!
// if ( ! m_insertableTermsBufValid ) { char *xx=NULL;*xx=0;}
// return m_insertableTermsBuf.length() / sizeof(InsertableTerm);
//}
// . return a list of InsertableTerms
// . these are just terms we will try to insert into the document in every
// possible place to see how they affect ranking of this document for
// all the applicable queries
// . then when we call getScoredInsertableTerms() it will fill in the
// m_queryChangeBuf array
SafeBuf *XmlDoc::getInsertableTerms ( ) {
if ( m_insertableTermsBufValid )
return &m_insertableTermsBuf;
// make sure related query string buf is valid
//SafeBuf *rrr = getRelatedQueryLinksWithStrings();
//if ( ! rrr || rrr == (void *)-1 ) return rrr;
// just use this now
SafeBuf *mtBuf = getMissingTermBuf();
if ( ! mtBuf || mtBuf == (void *)-1 ) return mtBuf;
// get buffer of ptrs to the msg99 replies for this url
//SafeBuf *mqbuf = getMatchingQueries ( false );
//if ( ! mqbuf || mqbuf == (void *)-1 ) return mqbuf;
// just use the MissingTerm class for these as well!!
SafeBuf *maBuf = getMatchingTermBuf();
if ( ! maBuf || maBuf == (void *)-1 ) return maBuf;
//
// alloc space for the insertable terms in its safebuf
//
int32_t need = 0;
char *p;
char *pend;
p = mtBuf->getBufStart();
pend = mtBuf->getBuf();
for ( ; p < pend ; ) {
MissingTerm *mt = (MissingTerm *)p;
p += mt->getSize();
need += sizeof(InsertableTerm);
need += mt->getTermSize();
}
// these are the matching terms, but use the same MissingTerm class
p = maBuf->getBufStart();
pend = maBuf->getBuf();
for ( ; p < pend ; ) {
MissingTerm *mt = (MissingTerm *)p;
p += mt->getSize();
need += sizeof(InsertableTerm);
need += mt->getTermSize();
}
if ( ! m_insertableTermsBuf.reserve ( need ,"itblbuf" ) ) return NULL;
//
// now interleave the matching terms with the related terms
//
char *p1 = mtBuf->getBufStart();
char *p1End = mtBuf->getBuf();
char *p2 = maBuf->getBufStart();
char *p2End = maBuf->getBuf();
// shortcut
SafeBuf *ib = &m_insertableTermsBuf;
int32_t count; for ( count = 0 ; ; count++ ) {
// . just get top 50 insertable terms
// . use #define MAX_INSERTABLE_TERMS 50?
if ( count >= 50 ) break;
bool add1 = false;
bool add2 = false;
if ( ( count % 2 ) == 0 && p1 < p1End ) add1 = true;
if ( ( count % 2 ) == 1 && p2 < p2End ) add2 = true;
if ( ! add1 && ! add2 ) break;
MissingTerm *mt;
if ( add1 ) {
mt = (MissingTerm *)p1;
p1 += mt->getSize();
}
if ( add2 ) {
mt = (MissingTerm *)p2;
p2 += mt->getSize();
}
// make an insertable term
InsertableTerm it;
if ( add1 ) it.m_isRelatedTerm = true;
else it.m_isRelatedTerm = false;
// sum of traffic of the queries that contained this term
it.m_trafficSum = mt->m_traffic;
// hash it up
char *term = mt->getTerm();
int32_t termSize = mt->getTermSize();
it.m_termHash64 = hash64 ( term , termSize - 1 );
it.m_termSize = termSize;
// reset this for later use
it.m_bestTrafficGain = -1;
it.m_bestInsertPos = -1;
// store that insertable term
ib->safeMemcpy(&it,sizeof(InsertableTerm));
// then the term string itself follows for easy serialization
// into cachedb...
ib->safeMemcpy(term,termSize);
}
if ( ib->length() > need ) { char *xx=NULL;*xx=0; }
//m_numInsertableTerms = count;
m_insertableTermsBufValid = true;
return &m_insertableTermsBuf;
}
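// Illustrative note (not compiled): each element appended above is a
// fixed-size InsertableTerm immediately followed by its NUL-terminated
// term string of m_termSize bytes, which is why consumers advance with
// it->getSize() instead of sizeof(InsertableTerm). The loop alternates
// one missing term (m_isRelatedTerm = true) with one matching term and
// caps the list at 50 entries. A minimal scan sketch:
/*
char *p    = m_insertableTermsBuf.getBufStart();
char *pend = m_insertableTermsBuf.getBuf();
for ( ; p < pend ; ) {
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();            // struct + trailing term string
}
*/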
static void gotMsg95ReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->gotMsg95Reply( slot );
}
void XmlDoc::gotMsg95Reply ( UdpSlot *slot ) {
// . store each msg95reply BEFORE the "still waiting" early return below,
// otherwise only the final reply ever gets saved and the earlier read
// buffers are lost with their slots
// . TODO: do we need m_msg95ReplyAlloc[] like m_msg99 has?
m_msg95ReplyPtrs [slot->m_hostId] = slot->m_readBuf;
m_msg95ReplySizes[slot->m_hostId] = slot->m_readBufSize;
// do not let it free it, we will free it
slot->m_readBuf = NULL;
// count it
m_numMsg95Replies++;
// return if still waiting
if ( m_numMsg95Replies < m_numMsg95Requests ) return;
// all done! should call getScoredInsertableTerms() indirectly
m_masterLoop ( m_masterState );
}
#include "seo.h" // for Msg95Request class
/*
// return a buffer of WordFreqInfo instances for every word in the
// insertable terms buffer. we use this so the msg95 handler can get the
// term freqs of any term in any matching query consistently, because
// we are host #0 calling this presumably. msg95 handler will use these
// to set the termfreqs in the Msg39Request when calling msg39.
// TODO: run through related queries as well! why didn't insertable terms
// work!?!?! it should...
SafeBuf *XmlDoc::getInsertableWordFreqInfoBuf ( ) {
// must always be host 0 or it's twin! we have to ensure
// consistency always when calling getTermFreq()...
if ( g_hostdb.m_groupId != 0 ) { char *xx=NULL;*xx=0; }
if ( m_iwfiBufValid )
return &m_iwfiBuf;
// get the same top word ids we pass to the msg95 request,
// because handleRequest95() uses those to get the queries
// that we match, and it evaluates each of those queries on each
// insertion we do.
// So that is the ptr_twid32Buf, which MUST include all
// insertable terms as well, like those insertable terms that are
// new to us!!
// scan list of insertable terms
SafeBuf *itBuf = getInsertableTerms();
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
// . true means to get synonyms
// . itBuf non-null will append new insertable terms we don't have
int32_t *twids = getTopTermsVectorWithNewTerms ( true , itBuf );
if ( ! twids || twids==(void *)-1 ) return (SafeBuf *)twids;
// shortcut
//InsertableTerm *its = (InsertableTerm *)itBuf->getBufStart();
//int32_t ni = itBuf->length() / sizeof(InsertableTerm);
// get buffer of ptrs to the msg99 replies for this url
//SafeBuf *mqbuf = getMatchingQueries ( false );
//if ( ! mqbuf || mqbuf == (void *)-1 ) return mqbuf;
//Msg99Reply **mrp = (Msg99Reply **)mqbuf->getBufStart();
//int32_t nmrp = mqbuf->length() / 4;
// use table to dedup so we do not store dups
HashTableX dups;
if ( ! dups.set ( 8,0,8192,NULL,0,false,m_niceness,"iwfidup") )
return NULL;
// . first store the langid in the buf!!!
// . then the wordfreqinfos follow!
if ( ! m_iwfiBuf.safeMemcpy ( &docLangId , 1 ) )
return NULL;
char *p = itBuf->getBufStart();
char*pend = itBuf->getBuf();
// scan each "term" which might be one or more words
for ( ; p < pend ; ) {
//for ( int32_t i = 0 ; i < nmrp ; i++ ) {
QUICKPOLL(m_niceness);
// cast it
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// add it in
if ( ! addTermFreqsForTerm ( it->getTerm() , &dups ) )
return NULL;
}
// do the same for all words and bigram terms in doc as well
m_iwfiBufValid = true;
return &m_iwfiBuf;
}
bool XmlDoc::addTermFreqsForTerm ( char *term , HashTableX *dups ) {
// we need this for synonyms
//uint8_t langId = langEnglish;
uint8_t *langIdPtr = getLangId();
// this should have been set by parent caller
if ( ! langIdPtr || langIdPtr == (uint8_t *)-1 ) {char *xx=NULL;*xx=0;}
// get the language this doc is in
uint8_t docLangId = *langIdPtr;
// if unknown, use english!
if ( docLangId == langUnknown ) docLangId = langEnglish;
//Msg99Reply *mr = mrp[i];
//Words ww;
//ww.set3 ( it->m_termStr );
//ww.set3(it->getTerm() );//mr->m_queryStr );//it->m_termStr );
Query qq;
// false = query expansion? i.e. use synonyms?
//qq.set2 ( it->getTerm(),docLangId,true);
qq.set2 ( term,docLangId,true);
//if ( strstr ( mr->m_queryStr, "bio wagner"))
// log("hey");
log("adding %s",term);
//int64_t *wids = ww.getWordIds();
// scan each word for term freq
for ( int32_t j = 0 ; j < qq.m_numTerms ; j++ ) {
// shortcut
QueryTerm *qt = &qq.m_qterms[j];
// get the full 64-bit hash of the word
int64_t wid = qt->m_rawTermId;
// skip if punct
if ( ! wid ) continue;
// dup?
if ( dups->isInTable ( &wid ) ) continue;
// add it
int64_t tf = g_posdb.getTermFreq ( cr->m_coll, wid );
if ( ! dups->addKey ( &wid ) ) return NULL;
WordFreqInfo wfi;
wfi.m_wordId64 = wid;
wfi.m_wordFreq64 = tf;
// note it
SafeBuf bb;
bb.safePrintf("seo: tf for term=\"");
bb.safeMemcpy ( qt->m_term, qt->m_termLen);
bb.safePrintf("\" = %"INT64"",tf);
log("seo: %s",bb.getBufStart());
// store it
if(!m_iwfiBuf.safeMemcpy(&wfi,sizeof(WordFreqInfo)))
return NULL;
}
return true;
}
*/
// 2. now transmit all the insertable terms to each host in the network. each
// host will evaluate each term in the list for every query that that
// host has in its memory for every new word position. kick this process
// off with the getNewRanks() function which returns a list of
// query terms where each query term has a wordposition/trafficgain
// array. [try to also insert entire phrases not just words]
// Each host will return an InsertedTerm class for each term. But then
// WE have to merge the InsertedTerm classes together for a particular
// term. That can be a bit tricky since we do not list a wordposition
// if its traffic gain was the same as its previous wordposition.
// PASS in the entire doc's termlist with each request in case not in cache
// so it can evaluate each query's scores very quickly!
//
// . send a msg95 request to each host consisting of a list of terms to
// insert, and the entire termlists of this document.
// . then merge the replies into a final list of InsertedTerms.
// . returned is buffer of InsertableTerms
SafeBuf *XmlDoc::getScoredInsertableTerms ( ) {
setStatus ( "getscoredinsertableterms" );
if ( m_scoredInsertableTermsBufValid )
return &m_insertableTermsBuf;
uint8_t *langIdPtr = getLangId();
if ( ! langIdPtr || langIdPtr == (void *)-1 )
return (SafeBuf *)langIdPtr;
SafeBuf *itBuf = getInsertableTerms();
if ( ! itBuf || itBuf == (void *)-1 ) return itBuf;
// these are the posdb keys of our document, makes it fast
// and easy for msg39 to return a serp score restricted to our docid
SafeBuf *termListBuf = getTermListBuf();
if ( ! termListBuf || termListBuf==(void *)-1 )
return termListBuf;
// this has all our documents terms and their synonyms in it,
// as well as the new terms we plan to insert that our doc does not
// have, from the getMissingTerms() buffer. in addition it
// has the term freq of each one!
SafeBuf *ntiBuf = getNewTermInfoBuf();
if ( ! ntiBuf || ntiBuf == (void *)-1 ) return (SafeBuf *)ntiBuf;
// get list of TermFreqInfo instances for all words in the
// list of insertable terms
//SafeBuf *wfib = getInsertableWordFreqInfoBuf ( );
//if ( ! wfib || wfib == (void *)-1 ) return wfib;
SafeBuf *wpib = getWordPosInfoBuf();
if ( ! wpib || wpib == (void *)-1 ) return wpib;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if still waiting for replies to come in, return -1
if ( m_numMsg95Requests > 0 && m_numMsg95Replies < m_numMsg95Requests )
return (SafeBuf *)-1;
top:
// otherwise, we are done!
if ( m_numMsg95Requests > 0 && m_numMsg95Replies >=m_numMsg95Requests){
// . calculate the best insertable position for each
// Insertable Term.
// . we get a QueryChange array back from each host for
// the same term, but for queries local on that host,
// so add them all up here and set
// InsertableTerm::m_bestTrafficGain/m_bestTermPosition
// . queries that did not have us in the top 50 will not
// be in the reply
processMsg95Replies();
// show how long it took
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginMsg95s;
log("seopipe: time: getscoredinsertableterms took %"INT64" ms",
took);
// return the list of InsertableTerms, scored
m_scoredInsertableTermsBufValid = true;
// cache it! if it blocks that is ok, since it is valid now
// disable for debug... MDW!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
if ( ! storeScoredInsertableTermsIntoCachedb() )
return (SafeBuf *)-1;
return &m_insertableTermsBuf;
}
// now send every term in this list to every host in the
// network so it can evaluate with each of the queries it contains
// in memory from the query log for every position in the doc.
// then it will return InsertableTerm::m_wordPositions/m_trafficGain
// arrays for each InsertableTerm.
// time how long this whole thing takes
m_beginMsg95s = gettimeofdayInMilliseconds();
// reset this crap i guess
m_numMsg95Requests = 0;
m_numMsg95Replies = 0;
// from seo.h
Msg95Request mr;
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
mr.m_docId = m_docId;
mr.m_docLangId = *langIdPtr;
mr.m_seoDebug = m_seoDebug;
mr.ptr_posdbTermList = termListBuf->getBufStart();
// a buffer of TermInfos. used to set the termFreq of each term
// and used to determine what queries match the doc and should be
// evaluated for every insertion.
mr.ptr_termInfoBuf = ntiBuf->getBufStart();
mr.ptr_coll = cr->m_coll;
//mr.ptr_wordFreqInfoBuf = wfib->getBufStart();
mr.ptr_wordPosInfoBuf = wpib->getBufStart();
// why do we need this? doesn't termInfoBuf have all that? no,
// because we limit insertableterms to like the top 300 highest
// scoring, so they are separate. the termInfoBuf is sorted by
// termid (lower 32-bits) and has a termfreq and is used to
// get the matching queries in seo.cpp:handlerequest95()
mr.ptr_insertableTerms = m_insertableTermsBuf.getBufStart();
mr.size_posdbTermList = termListBuf->length();
mr.size_termInfoBuf = ntiBuf->length();//m_numTwids * 4;
mr.size_coll = gbstrlen(cr->m_coll)+1;
//mr.size_wordFreqInfoBuf = wfib->length();
mr.size_wordPosInfoBuf = wpib->length();
mr.size_insertableTerms = m_insertableTermsBuf.length();
int32_t requestSize;
char *req = serializeMsg ( sizeof(Msg95Request),
&mr.size_posdbTermList ,// firstSizeParm
&mr.size_insertableTerms,//lastSizeP
&mr.ptr_posdbTermList ,// firststrptr
&mr ,// thisPtr
&requestSize ,
NULL ,
0 ,
true );
if ( ! req ) return NULL;
int32_t numHosts = g_hostdb.m_numHosts;
// do not re-send if we already did this!
if ( m_numMsg95Requests > 0 ) numHosts = 0;
// send one msg95 request to each host. skip if dead.
for ( int32_t i = 0; i < numHosts ; i++ ) {
// get ptr to the host
Host *host = g_hostdb.getHost(i);
// get hostid of host #i
int32_t hostId = host->m_hostId;
// count it
m_numMsg95Requests++;
// skip if dead. i guess no queries from that guy. we can't
// send to a twin because the twin does not have the same
// queries in its in-memory query log. once we get more
// machines we should probably make the twin have the same
// copy so we can be redundant.
if ( g_hostdb.isDead(hostId) && host->m_wasEverAlive ) {
log("seo: warning. host %"INT32" is dead so we could "
"not do the keyword tool right",hostId);
m_numMsg95Replies++;
continue;
}
// . send our posdb termlist to each host so it can
// call msg39 restricted to our docid very quickly
// . also send a ALL of the insertable terms to each
// host so they can evaluate the insertion for all of the
// relevant queries.
// . each host should be smart enough to realize that some
// queries need not be performed for an insertion because
// it is impossible to break the minimum score to be in the
// top 50 for that query. but we'll only have a minimum
// score for each query once we run a batch to eval
// each query at least partially to get a rough idea of
// the score needed to be in the top 50.
// . reply should be an array of QueryChanges for each
// insertable term for every query that matches this doc
// in the g_qlog buffer.
// . in most cases these arrays will be empty because we are
// not in the top 50 for that query
if ( ! g_udpServer.sendRequest ( req ,
requestSize ,
0x95 , // msgtype
host->m_ip , // ip
host->m_port , // port
hostId,
NULL, // retslot
this,
gotMsg95ReplyWrapper,
10000 , // timeout
-1 , // backoff
-1 , // maxwait
NULL, // replybuf
0, // replybufmaxsize
m_niceness // niceness
)) {
// let admin know about error
log("seopipe: sendRequest 95 had error: %s",
mstrerror(g_errno));
// count it as replied then
m_numMsg95Replies++;
continue;
}
}
// wait for all msg95 replies to come in
if ( m_numMsg95Requests > m_numMsg95Replies )
return (SafeBuf *)-1;
// somehow we finished without blocking
goto top;
// dummy return
return NULL;
}
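// Illustrative note (not compiled): getScoredInsertableTerms() follows the
// same blocking convention as the other getters in this file. It returns
// (SafeBuf *)-1 while msg95 replies are outstanding (the reply wrapper
// re-enters through m_masterLoop), NULL with g_errno set on error, and
// the scored InsertableTerm buffer once every host has replied. A minimal
// caller sketch:
/*
SafeBuf *itBuf = getScoredInsertableTerms();
if ( ! itBuf )                return NULL;           // error, g_errno set
if ( itBuf == (SafeBuf *)-1 ) return (SafeBuf *)-1;  // blocked, will re-enter
// itBuf now holds the scored InsertableTerms
*/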
// now sort the huge ptr buffer to QueryChanges first by:
// 1: QueryChange::m_termHash64
// 2: QueryChange::m_queryHash32
// 3: QueryChange::m_insertPos
int queryChangeCmp ( const void *a, const void *b ) {
QueryChange *qa = *(QueryChange **)a;
QueryChange *qb = *(QueryChange **)b;
// smallest term hash should be at the head of the list
if ( qa->m_termHash64 < qb->m_termHash64 ) return -1;
if ( qa->m_termHash64 > qb->m_termHash64 ) return 1;
if ( qa->m_queryHash32 < qb->m_queryHash32 ) return -1;
if ( qa->m_queryHash32 > qb->m_queryHash32 ) return 1;
if ( qa->m_insertPos < qb->m_insertPos ) return -1;
if ( qa->m_insertPos > qb->m_insertPos ) return 1;
return 0;
}
// . make each InsertableTerm point to a linked list of QueryChanges for it.
// . each QueryChange is a word position and a rank change
// . the linked list will be sorted by QueryChange::m_insertPos
// . there can be multiple QueryChanges for a single m_insertPos, but
// they will be for different queries.
bool XmlDoc::processMsg95Replies() {
int32_t need = 0;
// each reply is a list of QueryChanges
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// get reply
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
// skip if empty, error?
if ( ! mr ) continue;
// deserialize the msg95replies first
deserializeMsg ( sizeof(Msg95Reply) ,
(int32_t *)&mr->size_queryChangeBuf,//1stszparm
(int32_t *)&mr->size_queryLogBuf,//lastszparm
(char **)&mr->ptr_queryChangeBuf,//1ststrptr
mr->m_buf );
// scan the QueryChanges
//QueryChange *qcs = (QueryChange *)mr->ptr_queryChangeBuf;
int32_t ncs = mr->size_queryChangeBuf/sizeof(QueryChange);
need += ncs * sizeof(QueryChange *);
}
// alloc now
SafeBuf hugePtrBuf;
if ( ! hugePtrBuf.reserve ( need ,"hpbuf" ) ) return false;
// how big are all query log bufs?
int32_t sumTotal = 0;
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// get reply
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
// skip if empty, error?
if ( ! mr ) continue;
// how big
sumTotal += mr->size_queryLogBuf;
}
m_queryLogBuf.reset();
if ( ! m_queryLogBuf.reserve ( sumTotal ,"qlogbuf") ) return false;
char *orig = m_queryLogBuf.getBufStart();
int32_t ongoingOffset = 0;
int32_t ongoingDebugOffset = 0;
int32_t ongoingOrigOffset = 0;
// . fill up hugePtrBuf for sorting below
// . also fill up m_queryLogBuf now for store*IntoCachedb()
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// get reply
Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i];
// skip if empty, error?
if ( ! mr ) continue;
// ref it
//char *ref = m_queryLogBuf.getBuf();
//int32_t ref = m_queryLogBuf.length();
// add to our big buffer
m_queryLogBuf.safeMemcpy ( mr->ptr_queryLogBuf ,
mr->size_queryLogBuf );
// debug scores. should be length 0 if not debugging.
m_debugScoreInfoBuf.safeMemcpy ( mr->ptr_debugScoreInfoBuf ,
mr->size_debugScoreInfoBuf );
// original scores buf
m_origScoreInfoBuf.safeMemcpy ( mr->ptr_origScoreInfoBuf ,
mr->size_origScoreInfoBuf );
// scan the QueryChanges
QueryChange *qcs = (QueryChange *)mr->ptr_queryChangeBuf;
int32_t ncs = mr->size_queryChangeBuf/sizeof(QueryChange);
for ( int32_t j = 0 ; j < ncs ; j++ ) {
QueryChange *qc = &qcs[j] ;
// this is relative to ptr_queryLogBuf
qc->m_replyQueryOffset += ongoingOffset;
// if we have debug score info
if ( m_seoDebug >= 2 ) {
if ( qc->m_debugScoreInfoOffset < 0 ) {
char *xx=NULL;*xx=0; }
if ( qc->m_origScoreInfoOffset < 0 ) {
char *xx=NULL;*xx=0; }
qc->m_debugScoreInfoOffset +=
ongoingDebugOffset;
qc->m_origScoreInfoOffset +=
ongoingOrigOffset;
}
// that's relative to the msg95reply's ptr_queryLogBuf
//QueryLogEntry *qe;
//qe = (QueryLogEntry *)(mr->ptr_queryLogBuf + qoff);
//qe = (QueryLogEntry *)(ref + qoff);
// HACK that in. RELATIVE to m_queryLogBuf!!!
//qc->m_queryOffset3 = ref;//(int32_t)qe;
// add ptr to our global buffer
hugePtrBuf.pushPtr ( qc );
}
// sum it up
ongoingOffset += mr->size_queryLogBuf;
ongoingDebugOffset += mr->size_debugScoreInfoBuf;
ongoingOrigOffset += mr->size_origScoreInfoBuf;
}
// sanity. make sure doesn't grow since we reference it
if ( m_queryLogBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; }
// now sort the huge ptr buffer to QueryChanges first by:
// 1: QueryChange::m_termHash64
// 2: QueryChange::m_queryHash32
// 3: QueryChange::m_insertPos
char *hhh = hugePtrBuf.getBufStart();
int32_t size = hugePtrBuf.length();
// this should breathe with niceness!!
gbqsort ( hhh ,
size/sizeof(QueryChange *) ,
sizeof(QueryChange *),
queryChangeCmp ,
m_niceness ) ;
// now store those sorted query changes into m_queryChangeBuf
// so we can cache them in store*IntoCached() easily
int32_t nqc = (need / sizeof(QueryChange *)) ;
if ( ! m_queryChangeBuf.reserve ( nqc * sizeof(QueryChange),"qcbuf") )
return false;
// for sanity check
char *orig2 = m_queryChangeBuf.getBufStart();
// copy over sorted into m_queryChangeBuf so we can cache it in cachedb
char *p = hhh;
char *pend = hhh + size;
for ( ; p < pend ; p += sizeof(QueryChange *) ) {
// cast it
QueryChange *qc = *(QueryChange **)p;
// save ptr to it
char *ref = m_queryChangeBuf.getBuf();
// save it
m_queryChangeBuf.safeMemcpy ( qc , sizeof(QueryChange) );
// now ref that instead
*(QueryChange **)p = (QueryChange *)ref;
}
// sanity test
if ( m_queryChangeBuf.getBufStart() != orig2 ) { char *xx=NULL;*xx=0;}
// now we can free the replies since we stored the replies into
// m_queryLogBuf and m_queryChangeBuf for store*IntoCachedb()
for ( int32_t i = 0;i < g_hostdb.m_numHosts;i++) {
if ( ! m_msg95ReplyPtrs[i] ) continue;
mfree ( m_msg95ReplyPtrs[i] , m_msg95ReplySizes[i] , "95rep" );
m_msg95ReplyPtrs[i] = NULL;
}
// . now set QueryChange::m_next to make our linked list
// . if it is for a different query or termhash then end the linked
// list by setting m_next to NULL
QueryChange *lastqc = NULL;
for ( p = hhh ; p < pend ; p += sizeof(QueryChange *) ) {
// cast it
QueryChange *qc = *(QueryChange **)p;
// assume we are the last one in the linked list
qc->m_next = NULL;
// make linked list
if ( lastqc &&
// terms must match to be in same linked list
lastqc->m_termHash64 == qc->m_termHash64 )
// link them
lastqc->m_next = qc;
// set this for next qc
lastqc = qc;
}
// now set InsertableTerm::m_firstQueryChange to point to the head
// of the linked list for that term based on its m_termHash64.
// but the insertable terms are sorted by m_trafficSum.
// map a termHash64 to its corresponding first QueryChange.
HashTableX tit;
if ( ! m_insertableTermsBufValid ) { char *xx=NULL;*xx=0; }
int32_t ni = m_insertableTermsBuf.length() / sizeof(InsertableTerm);
if ( ! tit.set ( 8,4, ni*4,NULL,0,false,m_niceness,"tittbl") )
return false;
int64_t lastHash64 = 0LL;
// . store ptr to first querychange for each termhash64 into hash table
// . should be the head of the linked list for a termid
for ( p = hhh ; p < pend ; p += sizeof(QueryChange *) ) {
// cast it
QueryChange *qc = *(QueryChange **)p;
// skip if not a new term hash
if ( qc->m_termHash64 == lastHash64 ) continue;
// update it
lastHash64 = qc->m_termHash64;
// . map it in the hash table then
// . it should be pre-allocated!
if (!tit.addKey(&qc->m_termHash64,&qc)){char *xx=NULL;*xx=0;}
}
// now scan the insertable terms and set their
// InsertableTerm::m_firstQueryChange ptr. points to the head
// of the QueryChange linked list for this insertable term
SafeBuf *itBuf = getInsertableTerms();
p = itBuf->getBufStart();
pend = itBuf->getBuf();
for ( ; p < pend ; ) {
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// assume none
it->m_firstQueryChange = NULL;
char *val = (char *)tit.getValue(&it->m_termHash64);
// i guess there is none
if ( ! val ) continue;
// cast it
QueryChange *qc = *(QueryChange **)val;
// and assign
it->m_firstQueryChange = qc;
}
SafeBuf *wpib = getWordPosInfoBuf();
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
// now set InsertableTerm::m_bestTrafficGain/m_bestInsertPos/
// m_bestQueryChange by scanning the linked list and scoring each
// QueryChange::m_insertPos to see which is the highest traffic gain.
// and in the case of ties prefer the lowest word position.
p = itBuf->getBufStart();
pend = itBuf->getBuf();
for ( ; p < pend ; ) {
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// . use this function now so seo.cpp can call it too!
// . sets WordPosInfo::m_trafficGain members
setWordPosInfosTrafficGain ( it );
// now find the insert position with the most traffic gain!
int32_t bestTrafficGain = -1;
int32_t bestInsertPos = -1;
for ( int32_t j = 0 ; j < nwpis ; j++ ) {
// skip if not the best scoring position
if ( wpis[j].m_trafficGain <= bestTrafficGain &&
// and if not first time!
bestInsertPos != -1 )
continue;
// we got a new winner
bestTrafficGain = wpis[j].m_trafficGain;
bestInsertPos = wpis[j].m_wordPos;//insertPos;
}
// set it
it->m_bestTrafficGain = bestTrafficGain;
it->m_bestInsertPos = bestInsertPos;
}
return true;
}
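// Illustrative note (not compiled): after processMsg95Replies() each
// InsertableTerm::m_firstQueryChange points at the head of a linked list
// of QueryChanges for that term, ordered by query hash and then by insert
// position (per queryChangeCmp above). A minimal walk sketch:
/*
for ( QueryChange *qc = it->m_firstQueryChange ; qc ; qc = qc->m_next ) {
// qc->m_queryHash32            : which matching query this change is for
// qc->m_insertPos              : word position the term was inserted at
// qc->m_oldRank, qc->m_newRank : rank before/after the insertion
}
*/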
void XmlDoc::setWordPosInfosTrafficGain ( InsertableTerm *it ) {
// get the wordposinfobuf!
SafeBuf *wpib = getWordPosInfoBuf();
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
// . use the wordposinfo array to accumulate traffic gains
// for each word position, WordPosInfo::m_insertPos.
// . TODO: ignore tags like gblangid:
// . so reset the traffic gains first
for ( int32_t j = 0 ; j < nwpis ; j++ )
wpis[j].m_trafficGain = 0;
if ( ! it ) return;
// head of the linked list of QueryChanges for this InsertableTerm
QueryChange *qc = it->m_firstQueryChange;
// skip if no list. leave traffic gains set to 0 for all
if ( ! qc ) return;
// accumulate traffic gains
int32_t k = 0;
int32_t lastQueryHash32 = 0;
//bool firstQueryChangeForQuery;
QueryChange *lastqc = NULL;
// . scan the linked list of query changes
// . this is sorted by query first then m_insertPos
for ( ; qc ; qc = qc->m_next ) {
// assume NOT the first QueryChange for this query
//firstQueryChangeForQuery = false;
// . reset stuff for each different query
// . QueryChanges are sorted by m_queryHash32 secondly
// and by m_insertPos thirdly now...
if ( qc->m_queryHash32 != lastQueryHash32 ) {
// reset our WordPosInfo cursor
k = 0;
// for detecting the next set of QueryChanges
// for a different query
lastQueryHash32 = qc->m_queryHash32;
//firstQueryChangeForQuery = true;
lastqc = NULL;
}
// sanity
if ( lastqc && lastqc->m_insertPos > qc->m_insertPos ) {
char *xx=NULL;*xx=0; }
// compute the traffic in advance from the rank changes
int32_t trafficGain = getTrafficGain( qc );
// checkpoint
/*
if ( trafficGain > 0 )
log("got some traffic gain qh=%"UINT32" "
"pos=%"INT32" term=%s gain=%"INT32"",
qc->m_queryHash32,
qc->m_insertPos,
it->m_termStr,
trafficGain);
*/
// get next query change
QueryChange *nqc = qc->m_next;
// make it NULL if for a different query
if ( nqc && nqc->m_queryHash32 != qc->m_queryHash32 )
nqc = NULL;
// . we use a compression where we only store a
// QueryChange if different than the last QueryChange
// . so advance the WordPosInfos cursor "k" until
// we catch up to the qc->m_insertPos.
for ( ; k < nwpis ; k++ ) {
// stop if we are caught up
if ( wpis[k].m_wordPos >= qc->m_insertPos )
break;
}
// now this position and up to next qc "nqc" gets the traffic
for ( ; k < nwpis ; k++ ) {
// stop if we are caught up
if ( nqc && wpis[k].m_wordPos >= nqc->m_insertPos )
break;
wpis[k].m_trafficGain += trafficGain;
}
}
/*
// print out positives - debug
for ( int32_t k = 0 ; k < nwpis ; k++ ) {
// stop if we are caught up
if ( ! wpis[k].m_trafficGain ) continue;
if ( wpis[k].m_trafficGain <= 0 ) continue;
// note it
log("seo: gain pos=%"INT32" gain=%"INT32"",
wpis[k].m_wordPos,
wpis[k].m_trafficGain);
}
*/
}
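// Illustrative note: the QueryChanges consumed above are run-length
// compressed per query. A change recorded at insert position P applies to
// every candidate WordPosInfo position from P up to (but not including)
// the position of the next recorded change for the same query; with no
// next change it applies to all remaining positions. Example, assuming
// candidate positions 10,12,14,16 and two stored changes for one query,
// one at position 10 (gain +30) and one at position 14 (gain +5):
//   positions 10 and 12 each accumulate +30
//   positions 14 and 16 each accumulate +5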
double getTrafficPercent ( int32_t rank ) {
// from aol's query logs from that same searchenginewatch.com url
static double s_posClicks[1000] = {
.4230, // #1
.1192,
.0844,
.0603,
.0486,
.0399,
.0337,
.0298,
.0283,
.0270 // #10 (was .0297 but for our purposes, make it <)
};
//static float s_pageClicks[5];
// set total of clicks each page gets
static bool s_init = false;
if ( ! s_init ) {
s_init = true;
//float sum = 0.0;
//for ( int32_t i = 0 ; i < 10 ; i++ )
// sum += s_posClicks[i];
// this is about .11 or so
//float pageFactor = 1.0 - sum;
// HACK! make it pass the sanity check below!
//pageFactor *= .50;
// sanity. do not allow top result on 2nd page
// to rank higher!!
//if ( pageFactor * s_posClicks[0] > s_posClicks[9] ) {
// char *xx=NULL;*xx=0; }
// will be like .11 for second page, .01 for 3rd, etc.
//float pageMult = 1.0;
// fill in the rest
for ( int32_t i = 10 ; i < 1000 ; i++ ) {
// just make it linear since there is too much
// chaos as to our diffs with google. so this is
// a good estimation way...
s_posClicks[i] = .0270 - .0007 * i;
if ( s_posClicks[i] < 0 )
s_posClicks[i] = 0.0;
}
// sanity to make sure all in order
for ( int32_t i = 1 ; i < 1000 ; i++ ) {
if ( s_posClicks[i-1] < s_posClicks[i] ) {
char *xx=NULL;*xx=0; }
if ( s_posClicks[i] < 0 ) {
char *xx=NULL;*xx=0; }
}
}
if ( rank >= 1000 ) rank = 999;
if ( rank < 0 ) { char *xx=NULL;*xx=0; }
return s_posClicks[rank];
}
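// Illustrative note: example values from the table and linear fill-in
// above (rank is 0-based, so rank 0 is the #1 result):
//   getTrafficPercent(0)  = .4230
//   getTrafficPercent(9)  = .0270
//   getTrafficPercent(19) = .0270 - .0007*19 = .0137
// and the linear tail hits zero around rank index 39, so results at #40
// and beyond contribute no estimated traffic.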
// . based on difference between m_oldRank and m_newRank
// . m_*Rank starts at 0 and goes to 9 for first page of results
int32_t XmlDoc::getTrafficGain ( QueryChange *qc ) {
// no rank change? this can both be -1 if it is a missing
// term i guess... and we're not inserting it.
if ( qc->m_oldRank == qc->m_newRank ) return 0;
// get old clicks
int32_t oldRank = qc->m_oldRank;
double oldp;
// if not ranked before because this was inserting a brand new
// missing term, this will be -1
if ( oldRank == -1 ) oldp = 0.0;
else oldp = getTrafficPercent ( oldRank );
//if ( oldRank < 50 ) oldp = s_posClicks[oldRank];
// get new clicks
int32_t newRank = qc->m_newRank;
float newp = getTrafficPercent ( newRank );
//if ( newRank < 50 ) newp = s_posClicks[newRank];
// HACK
// we stored the entire querylogreply buf in here
char *ref = m_queryLogBuf.getBufStart();
// so we can use the replyqueryoffset then...
QueryLogEntry *qe = (QueryLogEntry *)(ref + qc->m_replyQueryOffset);
int32_t traffic = qe->m_gigablastTraffic;
traffic *= GB_TRAFFIC_MODIFIER;
int32_t trafficChange = (int32_t)((newp - oldp) * traffic);
// sanity.
if ( qc->m_oldRank > qc->m_newRank && trafficChange < 0 ) {
char *xx=NULL;*xx=0; }
// ignore this sanity check if not ranked before. i.e. inserting
// a new missing term...
if ( qc->m_oldRank != -1 &&
qc->m_oldRank < qc->m_newRank && trafficChange > 0 ) {
char *xx=NULL;*xx=0; }
// return the change. it might be negative!
return trafficChange;
}
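// Illustrative note: the gain above works out to
//   ( getTrafficPercent(newRank) - getTrafficPercent(oldRank) )
//       * m_gigablastTraffic * GB_TRAFFIC_MODIFIER
// with oldRank == -1 (inserting a brand new missing term) treated as 0%
// traffic. Hypothetical numbers: moving from rank index 4 (.0486) to rank
// index 1 (.1192) on a query whose adjusted traffic is 1000 searches gives
// about (0.1192 - 0.0486) * 1000 = 70 extra visits; moving down instead
// yields a negative change.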
// 4. then we just dump out all the InsertedTerms into xml so they can be
// displayed on the front end.
// dump the list of InsertedTerms into "sbuf" as xml
bool XmlDoc::printScoredInsertableTerms ( SafeBuf *sbuf ) {
// print the header
sbuf->safePrintf("\t<insertableTerms>\n");
// scan each term
SafeBuf *itBuf = getInsertableTerms();
// has to be there
if ( ! itBuf || itBuf == (void *)-1 ) { char *xx=NULL;*xx=0; }
SafeBuf *wpib = getWordPosInfoBuf();
if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
int32_t nwpis = wpib->length() / sizeof(WordPosInfo);
// cast it
//InsertableTerm *its = (InsertableTerm *)itBuf->getBufStart();
// how many terms do we have?
//int32_t ni = m_insertableTermsBuf.length() / sizeof(InsertableTerm);
// dedup queries used in query changes
HashTableX qdups;
if ( ! qdups.set(4,0,32,NULL,0,false,m_niceness,"qddd") ) return false;
//
// . print query map
// . print all query ids we use and their strings
//
bool firstTime = true;
char *p = itBuf->getBufStart();
char *pend = itBuf->getBuf();
for ( ; p < pend ; ) {
QUICKPOLL(m_niceness);
// cast it
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// scan its query changes
QueryChange *qc = it->m_firstQueryChange;
for ( ; qc ; qc = qc->m_next ) {
// skip if already printed
if ( qdups.isInTable(&qc->m_queryHash32) ) continue;
if ( firstTime ) {
sbuf->safePrintf("\t\t<queryMap>\n");
sbuf->safePrintf("\t\t\t<desc>"
"<![CDATA["
"32bitSignedQueryHash,"
"queryString"
"]]></desc>\n"
);
}
firstTime = false;
// HACK
char *ref = m_queryLogBuf.getBufStart();
QueryLogEntry *qe;
qe = (QueryLogEntry *)(ref + qc->m_replyQueryOffset);
// new query, print it. map the hash to the string
// so we can just show the hash when printing
// out all the QueryChanges below to save space
sbuf->safePrintf("\t\t\t<queryPoint>"
"<![CDATA[%"INT32",%s]]>"
"</queryPoint>\n"
, qc->m_queryHash32
// hack...
, qe->getQueryStr()
);
// do not re-print
if ( ! qdups.addKey(&qc->m_queryHash32) )return false;
}
}
if ( ! firstTime )
sbuf->safePrintf("\t\t</queryMap>\n");
// . now the word position map
// . we only provided querychange if it has a different score than
// the previously stored querychange. this is a kind of compression
// . so you need to know all the possible word positions we tried
// for each insertion we did
sbuf->safePrintf("\t\t<wordInsertionMap>\n");
sbuf->safePrintf("\t\t\t<desc>"
"<![CDATA["
"Describes all positions we attempt to insert each "
"insertable term into. The terms at that position "
"and up are pushed forward by the insertion. "
"&lt;sent&gt; is the sentence number."
"]]></desc>\n"
);
for ( int32_t i = 0 ; i < nwpis ; i++ ) {
WordPosInfo *wpi = &wpis[i];
sbuf->safePrintf("\t\t\t<word>\n"
"\t\t\t\t<pos>%"INT32"</pos>\n"
"\t\t\t\t<sent>%"INT32"</sent>\n"
"\t\t\t\t<hashGroup>%s</hashGroup>\n"
"\t\t\t\t<densityRank>%"INT32"</densityRank>\n"
"\t\t\t\t<spamRank>%"INT32"</spamRank>\n"
"\t\t\t</word>\n"
,wpi->m_wordPos
,wpi->m_sentNum
,getHashGroupString(wpi->m_hashGroup)
,(int32_t)wpi->m_densityRank
,(int32_t)wpi->m_wordSpamRank
);
}
sbuf->safePrintf("\t\t</wordInsertionMap>\n");
// scan all the insertable terms
p = itBuf->getBufStart();
pend = itBuf->getBuf();
for ( ; p < pend ; ) {
QUICKPOLL(m_niceness);
// cast it
InsertableTerm *it = (InsertableTerm *)p;
p += it->getSize();
// print the term
sbuf->safePrintf("\t\t<term>\n");
// the string
sbuf->safePrintf("\t\t\t<string><![CDATA[%s]]></string>\n",
it->getTerm());
// sum of traffic of all queries containing this term
sbuf->safePrintf("\t\t\t<importance>%"INT32"</importance>\n",
it->m_trafficSum);
// is it contained in the doc/linktext or is it "related"
sbuf->safePrintf("\t\t\t<isRelatedTerm>%"INT32"</isRelatedTerm>\n",
(int32_t)it->m_isRelatedTerm);
// get the first query change if any
QueryChange *qc = it->m_firstQueryChange;
// limit to fix firefox crash
//int32_t queryChangeLimit = 30;
// skip if no list
if ( ! qc ) goto skip;
// print the insert position that gives us the most traffic
sbuf->safePrintf("\t\t\t<bestInsertPosition>%"INT32""
"</bestInsertPosition>\n",
it->m_bestInsertPos);
sbuf->safePrintf("\t\t\t<bestTrafficGain>%"INT32""
"</bestTrafficGain>\n",
it->m_bestTrafficGain);
// print query changes
if ( it->m_firstQueryChange )
sbuf->safePrintf("\t\t\t<queryChanges><![CDATA["
);
// print out query changes for this term
for ( qc = it->m_firstQueryChange ; qc ; qc = qc->m_next ) {
// fix firefox crash for now
//if ( --queryChangeLimit <= 0 ) break;
// now store in binary
sbuf->pushLong(qc->m_insertPos);
sbuf->pushLong(qc->m_queryHash32);
sbuf->pushChar(qc->m_oldRank);
sbuf->pushChar(qc->m_newRank);
/*
// . TODO: make sure to remove QueryChanges that have
// the same old and new rank
// . print it
sbuf->safePrintf("\t\t\t<queryChange>\n");
sbuf->safePrintf("\t\t\t\t<insertPos>%"INT32""
"</insertPos>\n", qc->m_insertPos);
sbuf->safePrintf("\t\t\t\t<oldRank>%"INT32""
"</oldRank>\n",(int32_t)qc->m_oldRank);
sbuf->safePrintf("\t\t\t\t<newRank>%"INT32""
"</newRank>\n",(int32_t)qc->m_newRank);
sbuf->safePrintf("\t\t\t\t<queryId>%"INT32""
"</queryId>\n",
qc->m_queryHash32 );
sbuf->safePrintf("\t\t\t</queryChange>\n");
*/
}
if ( it->m_firstQueryChange )
sbuf->safePrintf("]]></queryChanges>\n");
skip:
// print the term end
sbuf->safePrintf("\t\t</term>\n");
}
sbuf->safePrintf("\t</insertableTerms>\n");
return true;
}
/*
static int wordPosInfoCmp ( const void *a, const void *b ) {
WordPosInfo *wa = (WordPosInfo *)a;
WordPosInfo *wb = (WordPosInfo *)b;
// smallest word position should be at the head of the list
if ( wa->m_wordPos < wb->m_wordPos ) return -1;
if ( wa->m_wordPos > wb->m_wordPos ) return 1;
return 0;
}
*/
static int wpPosdbKeyCmp ( const void *a, const void *b ) {
int32_t wpa = g_posdb.getWordPos((char *)a);
int32_t wpb = g_posdb.getWordPos((char *)b);
return wpa - wpb;
}
SafeBuf *XmlDoc::getWordPosSortedPosdbListBuf ( ) {
if ( m_wpSortedPosdbListBufValid )
return &m_wpSortedPosdbListBuf;
// get the lists. forDelete = false.
char *metaList = getMetaList ( false );
if ( ! metaList || metaList==(void *)-1 ) return (SafeBuf *)metaList;
// sanity
if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; }
// make a tmp buf to hold posdb keys
//SafeBuf tmp;
if ( ! m_wpSortedPosdbListBuf.reserve ( m_metaListSize,"wpsbuf" ) )
return NULL;
// point into it
char *dst = m_wpSortedPosdbListBuf.getBufStart();
// scan the meta list for posdb keys
char *p = metaList;
char *pend = p + m_metaListSize;
// stole this loop from getMetaList()
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// save it with the flag
char byte = *p;
// get rdbId
char rdbId = byte & 0x7f;
// skip that
p++;
// key size
int32_t ks = getKeySizeFromRdbId(rdbId);
// get key
char *key = p;
// skip that
p += ks;
// get datasize
int32_t ds = getDataSizeFromRdbId(rdbId);
// assume we do not store the datasize
//bool neg = false;
// . if key is negative, no data is present
// . the doledb key is negative for us here
if ( (key[0] & 0x01) == 0x00 ) ds = 0;
// if datasize variable, read it in
if ( ds == -1 ) {
// get data size
ds = *(int32_t *)p;
// skip data size int32_t
p += 4;
}
// point to data
//char *data = p;
// skip data if not zero
p += ds;
// if not posdb skip rec
if ( rdbId != RDB_POSDB ) continue;
// skip negative keys
if ( (key[0] & 0x01) == 0x00 ) continue;
// add to new buf now
gbmemcpy ( dst , key , sizeof(POSDBKEY) );
// advance
dst += sizeof(POSDBKEY);
}
char *start = m_wpSortedPosdbListBuf.getBufStart();
// update tmp
m_wpSortedPosdbListBuf.incrementLength ( dst - start );
// sanity
if ( m_wpSortedPosdbListBuf.length() > m_metaListSize ) {
char *xx=NULL;*xx=0; }
// point
char *pbuf = m_wpSortedPosdbListBuf.getBufStart();
int32_t numKeys = m_wpSortedPosdbListBuf.length()/sizeof(POSDBKEY);
// sort keys by word position
gbqsort ( pbuf ,
numKeys,
sizeof(POSDBKEY),
wpPosdbKeyCmp ,
m_niceness );
m_wpSortedPosdbListBufValid = true;
return &m_wpSortedPosdbListBuf;
}
// now pass this into Msg95Request so we only try to insert right before
// or after m_wordPos values in this WordPosInfo vector.
SafeBuf *XmlDoc::getWordPosInfoBuf ( ) {
// if it is valid and we have not yet added to cachedb...
if ( m_wordPosInfoBufValid && ! m_triedToAddWordPosInfoToCachedb ) {
// only do this once
m_triedToAddWordPosInfoToCachedb = true;
// store the m_wordPosInfoBuf into cachedb
if ( m_doingSEO && ! storeWordPosInfoBufIntoCachedb ( ) )
return (SafeBuf *)-1;
}
if ( m_wordPosInfoBufValid )
return &m_wordPosInfoBuf;
// it should be valid now from our logic in hashWords3() if
// m_doingSEO is set to true
char *xx=NULL; *xx=0;
// these are FULL 18-byte keys, no compression, sorted by word pos
SafeBuf *posdbBuf = getWordPosSortedPosdbListBuf ();
if ( ! posdbBuf || posdbBuf == (void *)-1 ) return posdbBuf;
// scan posdb keys
int32_t numKeys = posdbBuf->length() / sizeof(POSDBKEY);
// . reserve mem for new buf
int32_t need = numKeys * sizeof(WordPosInfo);
if ( ! m_wordPosInfoBuf.reserve ( need ,"wpibuf" ) )
return NULL;
int32_t sentNum = 0;
int32_t lastWordPos = -1;
//int32_t lastwp = -1;
int32_t lastSentNum = -1;
// scan all the sorted posdb keys and build posdb termlists and
// store the termlists into "m_termListBuf"
char *p = posdbBuf->getBufStart();
char *pend = p + posdbBuf->length();
for ( ; p < pend ; ) {
// breathe
QUICKPOLL(m_niceness);
// get the key
char *key = p;
// sanity
if ( g_posdb.getKeySize(p) != 18 ) { char *xx=NULL;*xx=0; }
// skip del keys
if ( (p[0] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; }
// skip it
p += sizeof(POSDBKEY);
// get key termid
//int64_t termId = g_posdb.getTermId ( key );
// sanity
//int64_t docId = g_posdb.getDocId ( key );
//if ( docId != m_docId ) { char *xx=NULL;*xx=0; }
// log it for debug
//if ( docId == 192304365235LL )
// log("tlist: docid=%"INT64" termId=%"INT64" wpos=%"INT32"",
// docId,
// termId,
// g_posdb.getWordPos(key));
WordPosInfo wpi;
int32_t wp = g_posdb.getWordPos(key);
// set "m_sentNum"
if ( wp >= lastWordPos + 50 ) sentNum++;
wpi.m_wordPos = wp;
wpi.m_sentNum = sentNum;
wpi.m_hashGroup = g_posdb.getHashGroup (key);
wpi.m_densityRank = g_posdb.getDensityRank (key);
wpi.m_wordSpamRank = g_posdb.getWordSpamRank (key);
wpi.m_trafficGain = 0;
// log it
/*
log("seopipe: term=%"INT64" pos=%"INT32" sent=%"INT32" hg=%s dr=%"INT32"",
g_posdb.getTermId(key),
(int32_t)wp,
sentNum,
getHashGroupString(wpi.m_hashGroup),
(int32_t)wpi.m_densityRank);
*/
// bigrams share the same word position as the single term.
// so ignore them. we only want unique insertion positions.
if ( wp == lastWordPos ) continue;
// . i thought sorted by word position??
// . word position 0 is used by generic terms, like tags
if ( wp < lastWordPos ) { char *xx=NULL;*xx=0; }
// additional position at the end of a sentence?
//if ( lastwp != wp && lastSentNum == sentNum )
// // store it
// m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
// to the right as well! so it can be in the same sentence, if this
// word is at the end of the sentence.
//wpi.m_wordPos = wp;// + 2;
// add it
m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
int32_t nextSent = -1;
if ( p < pend ) {
// assume same as current sentence
nextSent = sentNum;
// get word position of next term
int32_t nextwp = g_posdb.getWordPos(p);
// same as us? then it is a bigram, so try the
// word after that!
if ( nextwp == wp && p+18<pend )
nextwp = g_posdb.getWordPos(p+18);
// if the following word position is in a new sentence
// it will be separated by at least SENT_UNITS positions! that is
// our base for sentence skip.
if ( nextwp >= wp + SENT_UNITS )
nextSent = sentNum+1;
}
// HACK. if next word starts a new sentence, add a WordPosInfo
// here so we can insert term at end of THIS sentence.
// otherwise we are inserted BEFORE the term whose position
// we use.
if ( nextSent != sentNum ) {
wpi.m_wordPos += 2;
m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
}
// set these
lastWordPos = wp;
//lastwp = wp;// + 2;
lastSentNum = sentNum;
}
/*
// point to raw buf
char *raw = m_wordPosInfoBuf.getBufStart();
int32_t size = m_wordPosInfoBuf.length();
// this shit is sorted by termid then pos, so sort just by pos
// this should breathe with niceness!!
gbqsort ( raw ,
size / sizeof(WordPosInfo),
sizeof(WordPosInfo) ,
wordPosInfoCmp ,
m_niceness ) ;
*/
m_wordPosInfoBufValid = true;
return &m_wordPosInfoBuf;
}
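// Illustrative note (not compiled): the loop above bumps the sentence
// counter when word positions jump by 50 or more, uses SENT_UNITS for the
// end-of-sentence lookahead, skips bigram keys that repeat the previous
// word position, and appends an extra WordPosInfo at wordPos+2 when the
// next key starts a new sentence so a term can also be inserted at the
// END of the current sentence. Each resulting entry looks like:
/*
WordPosInfo wpi;
wpi.m_wordPos      = g_posdb.getWordPos      ( key );
wpi.m_sentNum      = sentNum;
wpi.m_hashGroup    = g_posdb.getHashGroup    ( key );
wpi.m_densityRank  = g_posdb.getDensityRank  ( key );
wpi.m_wordSpamRank = g_posdb.getWordSpamRank ( key );
wpi.m_trafficGain  = 0;  // filled in later by setWordPosInfosTrafficGain()
*/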
// . i made this easy to serialize by using offsets and not ptrs
// . so we can add to cachedb easily
// . and so it's immune to reallocs() on m_linkSourceBuf SafeBuf
class LinkSource {
public:
int32_t m_linkSiteRank;
// the actual url of the link, references into m_buf
int32_t m_linkUrlOffset;
// the title of the link, references into m_buf
int32_t m_linkTitleOffset;
// . we store the offsets of the RelatedDocIds in m_relatedDocIdBuf
// . these are the related docids that are linked to by this link src
int32_t m_offsetOfRelatedDocIdOffsets;
int32_t m_numRelatedDocIds;
char m_buf[0];
char *getLinkUrl ( SafeBuf *linkSourceBuf ) {
char *buf = linkSourceBuf->getBufStart();
buf += m_linkUrlOffset;
return buf;
};
char *getLinkTitle ( SafeBuf *linkSourceBuf ) {
char *buf = linkSourceBuf->getBufStart();
buf += m_linkTitleOffset;
return buf;
};
// crap, do we store RelatedDocIds into cachedb? we should
// make it use offsets and not ptrs too...
int32_t *getRelatedDocIdOffsets ( SafeBuf *linkSourceBuf ) {
// how can this be?
//if ( m_numRelatedDocIds == 0 ) return NULL;
char *buf = linkSourceBuf->getBufStart();
buf += m_offsetOfRelatedDocIdOffsets;
return (int32_t *)buf;
};
};
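// Illustrative note (not compiled): LinkSource stores offsets rather than
// pointers so a record stays valid across SafeBuf reallocs and can be
// copied into cachedb verbatim. A minimal read sketch, assuming the
// records live in an m_linkSourceBuf SafeBuf and the related docids live
// in m_relatedDocIdBuf as the comments above say:
/*
LinkSource *ls = (LinkSource *)recPtr;   // recPtr points into the buf
char *url      = ls->getLinkUrl   ( &m_linkSourceBuf );
char *title    = ls->getLinkTitle ( &m_linkSourceBuf );
int32_t *offs  = ls->getRelatedDocIdOffsets ( &m_linkSourceBuf );
for ( int32_t i = 0 ; i < ls->m_numRelatedDocIds ; i++ ) {
// offs[i] is an offset into m_relatedDocIdBuf
}
*/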
/*
static void gotLinkInfoReplyWrapper ( void *state ) {
//XmlDoc *newxd = (XmlDoc *)state;
Msg25 *msg25 = (Msg25 *)state;
XmlDoc *xd = msg25->m_xd;
// count it as returned
xd->m_numLinkRequestsIn++;
// this will nuke the msg25 as well after copying its linkinfo
xd->processLinkInfoMsg20Reply ( msg25 );
// try to send out more requests or intersect them if done
xd->m_masterLoop ( xd->m_masterState );
}
// . before we were just looking at the LinkInfo the msg25 makes from
// all the Msg20Replies it gets, but let's keep the msg20 replies
// intact because they have the titles we need!
// . return false on error, true otherwise
bool XmlDoc::processLinkInfoMsg20Reply ( Msg25 *msg25 ) {
// shortcut
//LinkInfo *info = msg25->getLinkInfo ();
// store into our buffer
//bool status ;
// i guess info can be NULL on error
//if ( info )
// status = m_linkInfoReplyBuf.safeMemcpy (info, info->getSize());
// give front-end the progress bar info
if ( m_seoSocket && m_progressBar ) {
// tmp buf
char tmp[16];
float percent = (float)m_rdCursor;
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId);
percent /= (float)numRelated;
// 80% of the pipeline was doing the full queries
percent *= .20;
percent += .80;
percent *= 100.0;
int32_t percentLong = (int32_t)percent;
if ( percentLong >= 100 ) percentLong = 99;
int32_t tmpLen = sprintf(tmp,"%02"INT32"%%",percentLong);
if ( tmpLen !=3)log("seo: bad progress bar output %"INT32"",tmpLen);
// try a send on non-blocking socket
int32_t n = ::send ( m_seoSocket->m_sd , tmp,tmpLen , 0 );
if ( n != tmpLen ) log("seo: bad progress bar send %"INT32"",n);
// forget error
errno = 0;
}
// store this
int32_t nr = msg25->m_numReplyPtrs;
// reserve space
if ( ! m_msg20ReplyPtrBuf.reserve ( 8 + nr * 4 * 2 ) ) {
m_hadLinkInfoError = g_errno;
nr = 0;
}
// first store related docid ptr into m_relatedDocIdBuf safebuf
RelatedDocId *rd = (RelatedDocId *)msg25->m_hackrd;
m_msg20ReplyPtrBuf.pushLong((int32_t)rd);
// then store the # of msg20 replies
m_msg20ReplyPtrBuf.pushLong(nr);
// . scan each msg20reply it got, each msg20reply is an inlink
// for this docid
// . seems like they are only freed in Msg25::reset()
for ( int32_t i = 0 ; i < nr ; i++ ) {
// get one
Msg20Reply *r = msg25->m_replyPtrs[i];
int32_t size = msg25->m_replySizes[i];
// steal it, we will free them ourselves below
m_msg20ReplyPtrBuf.pushLong((int32_t)r);
// we need this since we need to free it when done
m_msg20ReplyPtrBuf.pushLong(size);
}
// . do not allow Msg25 to free it, we will free it below
// . on OOM error above we set nr to 0 on error, so allow msg25
// to free the replies in that case
if ( nr ) msg25->m_numReplyPtrs = 0;
// nuke it
mdelete ( msg25 , sizeof(Msg25), "m25li" );
delete ( msg25 );
return true;
}
*/
static int riCmp ( const void *a, const void *b ) {
RecommendedLink *wa = *(RecommendedLink **)a;
RecommendedLink *wb = *(RecommendedLink **)b;
int32_t diff = wb->m_votes - wa->m_votes;
if ( diff ) return diff;
if ( wb->m_totalRecommendedScore > wa->m_totalRecommendedScore )
return 1;
if ( wb->m_totalRecommendedScore < wa->m_totalRecommendedScore )
return -1;
// docid to break all ties
if ( wb->m_rl_docId > wa->m_rl_docId )
return 1;
if ( wb->m_rl_docId < wa->m_rl_docId )
return -1;
return 0;
}
static void gotLinkdbListWrapper ( void *state ) {
Msg0 *msg0 = (Msg0 *)state;
XmlDoc *xd = msg0->m_hackxd;
// free its memory here lest we have a leak
//msg0->reset();
xd->m_numLinkRequestsIn++;
xd->m_masterLoop ( xd->m_masterState );
}
#define MAX_RECOMMENDED_LINKS 300
// . returns safebuf of RecommendedLinks
// . use RecommendedLink::getSize() to skip over element in array/safebuf
// . these are the recommended link sources
// . these are the links that your relateddocids (i.e. competing pages) have
// in common the most
// . TODO: store the returned safebuf in cachedb as well!
SafeBuf *XmlDoc::getRecommendedLinksBuf ( ) {
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_recommendedLinksBufValid )
return &m_recommendedLinksBuf;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// what docids share our matching queries?
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
// cast then
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
// how many related docids do we have?
int32_t numRelatedDocIds = rdbuf->length() / sizeof(RelatedDocId);
if ( m_numLinkRequestsOut == 0 ) {
// reset these on first call
m_rdCursor = 0;
m_numLinkRequestsIn = 0;
m_hadLinkInfoError = 0;
m_numMsg20sIn = 0;
m_numMsg20sOut = 0;
m_numValidMsg20s = 0;
m_titleCursor = 0;
m_msg20Phase = 0;
m_recommendedLinkError = 0;
}
if ( ! m_relatedTitleBufValid ) { char *xx=NULL;*xx=0; }
// if we are looking up the title/url of each docid in
// the m_recommendedLinksBuf now, go back there
if ( m_msg20Phase )
return lookupTitles();
for ( ; m_rdCursor < numRelatedDocIds ; m_rdCursor++ ) {
// wait if too many are out. only allow ~60 outstanding, otherwise
// each one can send out like 500 msg20s
if ( m_numLinkRequestsOut - m_numLinkRequestsIn > 60 )
// wait for 1 to come back
return (SafeBuf *)-1;
// skip the rest on error
if ( m_hadLinkInfoError ) continue;
// cast it
RelatedDocId *rd = &rds[m_rdCursor];
// bogus? a not found, EDOCBANNED/EDOCFILTERED or it
// linked to our domain
if ( rd->rd_url_off < 0 )
continue;
// bogus?
if ( ! rd->getUrl( &m_relatedTitleBuf ) ) {
log("seo: skipping null url");
continue;
}
if ( ! rd->getSite( &m_relatedTitleBuf ) ) {
log("seo: skipping null site");
continue;
}
// allocate msg0 array into m_tmpMsg0Buf safebuf
if ( ! m_tmpMsg0Buf.length() ) {
// fill tmpmsg0 buf
int32_t need = sizeof(Msg0) * numRelatedDocIds;
if ( ! m_tmpMsg0Buf.reserve ( need , "tmp20s" ) )
return NULL;
// do not re-call!
m_tmpMsg0Buf.setLength(need);
char *p = m_tmpMsg0Buf.getBufStart();
char *pend = p + need;
for ( ; p < pend ; p += sizeof(Msg0) ) {
Msg0 *msg0 = (Msg0 *)p;
msg0->constructor();
}
}
// debug it
if ( m_seoDebug >= 2 )
log("seo: getting inlinks to related docid=%"INT64" "
"weight=%f "
"url=%s",
rd->m_docId,
rd->m_relatedWeight,
rd->getUrl(&m_relatedTitleBuf));
// just get his linkdb list!
Msg0 *array = (Msg0 *)m_tmpMsg0Buf.getBufStart();
Msg0 *msg0 = &array[m_rdCursor];
key224_t startKey;
key224_t endKey;
char *rdurl = rd->getUrl(&m_relatedTitleBuf);
// by default, just hash of hostname, unless overridden
// with "site" tag in tagdb, or has a path like /~mwells
int32_t siteHash32 = rd->m_rd_siteHash32;
int64_t linkHash64 = hash64n(rdurl);
startKey = g_linkdb.makeStartKey_uk (siteHash32,linkHash64 );
endKey = g_linkdb.makeEndKey_uk (siteHash32,linkHash64 );
// hack that thing
msg0->m_hackxd = this;
// consider it outstanding
m_numLinkRequestsOut++;
// shortcut, piggyback on the msg0
RdbList *list = &msg0->m_handyList;
//RdbList list2;
if ( ! msg0->getList ( -1 , // hostId, -1 if none
0 , // hostId ip
0 , // hostId port
0 , // max cache age -secs
false , // addToCache?
RDB_LINKDB ,
cr->m_collnum ,
list , // linkdb list to fill
(char*)&startKey,
(char*)&endKey ,
1000000 , // 1MB minrecsizes
msg0 ,
gotLinkdbListWrapper ,
m_niceness ,
true , // error correct?
true , // includeTree
true , // do merge
-1,//hostId
0 , // startFileNum
-1 , // numFiles
60*60*24*365 )){//timeout of one year
// blocked? keep chugging
continue;
}
// . maybe it was cached or something, or we had an error!
// . this will nuke the msg25
// . returns false and sets g_errno on error
//processLinkInfoMsg20Reply ( msg25 );
m_numLinkRequestsIn++;
// save g_errno
int32_t saved = g_errno;
// free its memory here lest we have a leak
//msg0->reset();
// error? it will not have blocked then
if ( ! saved ) continue;
// save error, and stop launching any more requests
m_hadLinkInfoError = saved;
log("xmldoc: linksrc error3 = %s",mstrerror(saved));
}
// return -1 if waiting for more requests to come in
if ( m_numLinkRequestsOut > m_numLinkRequestsIn )
return (SafeBuf *)-1;
// vote table to allow inlink voting
HashTableX riTable;
// do not return on error setting this table because we'll leave
// the msg20 replies unfreed!
if ( ! riTable.set ( 8,4,1024,NULL,0,false,m_niceness,"ritbl") )
m_hadLinkInfoError = g_errno;
RecommendedLink *ri;
HashTableX dedupVotesTable;
if ( ! dedupVotesTable.set(8,0,1024,NULL,0,false,m_niceness,"dvtt") )
return NULL;
// need this for computing rdOff
char *rdStart = m_relatedDocIdBuf.getBufStart();
// store recommended links bufs here temporarily
SafeBuf tmpBuf;
if ( ! tmpBuf.reserve ( 10000000 ,"tt5buf" ) ) return NULL;
// all done. scan linkdb lists and intersect. there is one list
// per related docid.
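// data structures used by this intersection (a sketch):
// - riTable maps linker docid -> offset of its RecommendedLink in tmpBuf,
//   so a repeat linker just gets its vote/score bumped
// - dedupVotesTable keys on (linker docid ^ ipdom(related firstip)) so one
//   page can only vote once per competitor c-block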
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
// get related docid that had the following msg20replies
RelatedDocId *rd = &rds[i];
// his offset in his buf
int32_t rdOff = (char *)rd - rdStart;
// get linkdb list loaded from msg0 call above
Msg0 *msg0 = &((Msg0 *)m_tmpMsg0Buf.getBufStart())[i];
RdbList *list = &msg0->m_handyList;
list->resetListPtr();
// scan the docids in list
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
// get the current key if list has more left
key224_t key;
list->getCurrentKey( &key );
//int32_t itop = g_linkdb.getLinkerIp24_uk ( &key );
int32_t ip32 = g_linkdb.getLinkerIp_uk ( &key );
//bool isLinkSpam = g_linkdb.isLinkSpam_uk ( &key );
int64_t docId = g_linkdb.getLinkerDocId_uk ( &key );
//int32_t discovered = g_linkdb.getDiscoveryDate_uk(&key);
// skip if no longer there on page, we keep these
// only to graph lost links over time
int32_t lostDate = g_linkdb.getLostDate_uk ( &key );
if ( lostDate )
continue;
// if the inlink is from the same c-block IP as the
// related docid it links to, then do not consider.
// the ip used in linkdb is the current ip not the
// first ip actually.
if ( ipdom(ip32)==ipdom(rd->m_relatedCurrentIp))
continue;
if ( ipdom(ip32)==ipdom(rd->m_relatedFirstIp))
continue;
// if the linking document links to the same related
// docid multiple times/ we need to dedup so m_votes
// is not incremented multiple times!
// actually make it use c-block not docid to fix
// links/pages getting two m_votes for linking to
// two competitors, where each competitor linked to
// is on the same c-block... kinda strange.
int64_t dkey = docId ^ ipdom(rd->m_relatedFirstIp);
if ( dedupVotesTable.isInTable(&dkey) )
continue;
if ( ! dedupVotesTable.addKey(&dkey) ) return NULL;
// now we associate a new class with each unique linker
int32_t *poff = (int32_t *)riTable.getValue ( &docId );
// if there, it will be an offset into the links buf
if ( poff ) {
char *ptr = tmpBuf.getBufStart();
ptr += *poff;
RecommendedLink *rip = (RecommendedLink *)ptr;
rip->m_totalRecommendedScore +=
rd->m_relatedWeight;
rip->m_votes++;
// add to array of rd offs
int32_t k; for ( k = 0 ; k < 10 ; k++ ) {
if ( rip->m_relatedDocIdOff[k]==-1)
break;
}
if ( k < 10 )
rip->m_relatedDocIdOff[k] = rdOff;
continue;
}
// reserve space
int32_t need = sizeof(RecommendedLink);
// reserve
if ( ! tmpBuf.reserve ( need , "tt5buf" ) ) {
m_hadLinkInfoError = g_errno;
continue;
}
// save this
int32_t firstOff = tmpBuf.length();
// ref it
char *buf = tmpBuf.getBuf();
ri = (RecommendedLink *)buf;
// advance over that
int32_t over = sizeof(RecommendedLink);
// increase buf length
tmpBuf.incrementLength(over);
// this is how similar the relatedDocId is to the
// main url. these dotproducts are all relative
// with the other relatedDocIds for this url.
// the dotproduct was basically a dotproduct
// of the score vector of "rd" with that of
// the main url for the same queries. and that
// was normalized by the score of the top result
// for each query that have in common. see the
// the algo above for the "m_dotProduct" computation.
ri->m_totalRecommendedScore = rd->m_relatedWeight;
ri->m_votes = 1;
ri->m_rl_docId = docId;
// we do not know these things until we call msg20
// on the docid:
ri->m_rl_siteRank = -1;//reply->m_siteRank;
ri->m_rl_firstIp = 0;//reply->m_firstIp;
// each recommended link links to one or more
// related docids. so record them!
ri->m_relatedDocIdOff[0] = rdOff;
ri->m_relatedDocIdOff[1] = -1;
ri->m_relatedDocIdOff[2] = -1;
ri->m_relatedDocIdOff[3] = -1;
ri->m_relatedDocIdOff[4] = -1;
ri->m_relatedDocIdOff[5] = -1;
ri->m_relatedDocIdOff[6] = -1;
ri->m_relatedDocIdOff[7] = -1;
ri->m_relatedDocIdOff[8] = -1;
ri->m_relatedDocIdOff[9] = -1;
ri->m_urlSize = 0;
ri->m_titleSize = 0;
// store it in table then, pointing into the new buf
if ( ! riTable.addKey ( &docId, &firstOff ) )
m_hadLinkInfoError = g_errno;
}
// free that list now to save mem
list->freeList();
}
// free the msg0s now, including Msg0::m_handyList, what we used
// to hold the linkdb list
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
Msg0 *array = (Msg0 *)m_tmpMsg0Buf.getBufStart();
Msg0 *msg0 = &array[i];
// free the mem and the handylist now that we've processed them
msg0->reset();
}
// no longer need the msg0s and linkdb lists (Msg0::m_handyLists)
m_tmpMsg0Buf.purge();
//
// now sort RecommendedLinks in tmpBuf by their scores
//
// get the top 300 recommended links so we can save mem and
// store this beastie in cachedb
SafeBuf ptrBuf;
int32_t maxNumPtrs = tmpBuf.length() / sizeof(RecommendedLink);
if ( ! ptrBuf.reserve(maxNumPtrs *sizeof(RecommendedLink *),"ptrbuf"))
return NULL;
char *p = tmpBuf.getBufStart();
char *pend = tmpBuf.getBuf();
int32_t numPtrs = 0;
for ( ; p < pend ; ) {
RecommendedLink *ri = (RecommendedLink *)p;
ptrBuf.pushPtr ( ri );
p += sizeof(RecommendedLink);
// we have no title or url at this point...
if ( ri->getSize() != sizeof(RecommendedLink) ) {
char *xx=NULL;*xx=0; }
numPtrs++;
}
// now sort!
RecommendedLink **ptrs = (RecommendedLink **)ptrBuf.getBufStart();
gbqsort ( ptrs ,
numPtrs ,
sizeof(RecommendedLink *),
riCmp,
m_niceness );
// copy over the top recommended links into permanent buffer in order
// of score
// we only copy the fixed-size RecommendedLink headers here; the
// titles/urls get appended later by gotLinkerTitle() into m_newLinkerBuf
int32_t need2 = numPtrs * sizeof(RecommendedLink);
// allocate that now
if ( ! m_recommendedLinksBuf.reserve ( need2 ,"rlkbuf") ) return NULL;
// and copy over from tmpBuf, sorted by the score
for ( int32_t i = 0 ; i < numPtrs ; i++ )
m_recommendedLinksBuf.safeMemcpy(ptrs[i],
sizeof(RecommendedLink));
// this can be really huge! > 30MB
tmpBuf.purge();
// free the ptrs too!
ptrBuf.purge();
//
// now m_recommendedLinksBuf is a bunch of RecommendedLinks sorted
// by score. now use msg20 to lookup the top 300 or so that
// do not link to our main doc
//
m_msg20Phase = true;
return lookupTitles ();
}
//static void gotLinkerTitleWrapper ( void *state ) {
// Msg20 *msg20 = (Msg20 *)state;
// XmlDoc *THIS = (XmlDoc *)msg20->m_state2;
// THIS->gotLinkerTitle ( msg20 );
// THIS->m_masterLoop ( THIS->m_masterState );
//}
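// . lookupTitles() walks m_recommendedLinksBuf, which at this point holds
//   fixed-size RecommendedLink records with no url/title appended yet, and
//   uses a pool of Msg20s (at most ~60 outstanding) to fetch each linker's
//   url/title and to check if it links to our site/domain. it stops
//   launching once MAX_RECOMMENDED_LINKS valid replies are in. survivors
//   get appended, with url and title, to m_newLinkerBuf by gotLinkerTitle()
//   and that buf replaces m_recommendedLinksBuf when all replies are back.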
SafeBuf *XmlDoc::lookupTitles ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// none have a title/url following them in here yet
int32_t numLinkers = m_recommendedLinksBuf.length();
numLinkers /= sizeof(RecommendedLink);
if ( ! m_msg20Array.length() ) {
int32_t need = numLinkers * sizeof(Msg20);
if ( ! m_msg20Array.reserve ( need,"m20arr" ) )
return (SafeBuf *)-1;
// do not re-call!
m_msg20Array.setLength(need);
char *p = m_msg20Array.getBufStart();
char *pend = p + need;
for ( ; p < pend ; p += sizeof(Msg20) )
((Msg20 *)p)->constructor();
}
Msg20 *msg20s = (Msg20 *)m_msg20Array.getBufStart();
// one per linker
int32_t numMsg20s = numLinkers;
// we can use the array model because each element is fixed size
// because they do not have the url/title string following them
// yet...
char *ppp = m_recommendedLinksBuf.getBufStart();
RecommendedLink *ptr = (RecommendedLink *)ppp;
// scan the msg20s we allocated to see if any got a reply
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
// shortcut
Msg20 *msg20 = &msg20s[i];
// skip if never launched
if ( ! msg20->m_launched ) continue;
// skip if it is in progress, awaiting its reply
if ( msg20->m_inProgress ) continue;
// ok, it has a reply. could be NULL if g_errno was set.
if ( ! gotLinkerTitle ( msg20 ) )
m_recommendedLinkError = g_errno;
// reset it for later use... or not...
msg20->reset();
}
//
// call a msg20 on each recommendedlink to get url/title and
// see if it links to any url on our main url's site/domain
//
for ( ; m_titleCursor < numLinkers ; m_titleCursor++ ) {
// bail?
if ( m_numMsg20sOut - m_numMsg20sIn > 60 )
break;
// stop launching if got enough
if ( m_numValidMsg20s >= MAX_RECOMMENDED_LINKS )
break;
// cast it
RecommendedLink *rl = &ptr[m_titleCursor];
// get an available msg20. bound the scan by numMsg20s too so we
// never read past the end of the array when numMsg20s < 100
int32_t i; for ( i = 0 ; i < numMsg20s && i < 100 ; i++ ) {
if ( msg20s[i].m_inProgress ) continue;
break;
}
// sanity!
if ( i >= numMsg20s || i >= 100 ) { char *xx=NULL;*xx=0; }
// look it up
Msg20 *msg20 = &msg20s[i];
// make request
Msg20Request req;
req.m_docId = rl->m_rl_docId;
//req.m_state = msg20;
req.m_state = m_masterState;//this;
req.m_callback2 = m_masterLoop;//gotLinkerTitleWrapper;
//req.ptr_coll = cr->m_coll;
//req.size_coll = gbstrlen(cr->m_coll)+1;
req.m_collnum = cr->m_collnum;
req.m_expected = true;
req.m_niceness = m_niceness;
// do not get summary stuff. too slow.
req.m_numSummaryLines = 0;
// if it has an outlink to our site/domain set
// Msg20Reply::m_hasLinkToOurDomOrHost
req.m_ourHostHash32 = getHostHash32a();
req.m_ourDomHash32 = getDomHash32();
// store cursor in msg20 itself so we know what rd it's using
msg20->m_hack2 = m_titleCursor;
// assume outstanding
m_numMsg20sOut++;
// debug
//log("seo: DEBUG: launching msg20 d=%"INT64"",req.m_docId);
// get it. continue if blocked
if ( ! msg20->getSummary ( &req ) ) continue;
// error?
if ( ! gotLinkerTitle ( msg20 ) )
m_recommendedLinkError = g_errno;
// save mem
msg20->reset();
}
// wait for all to return?
if ( m_numMsg20sOut > m_numMsg20sIn )
return (SafeBuf *)-1;
// we called gotLinkerTitle() on all msg20s, so destroy them
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
// shortcut
Msg20 *msg20 = &msg20s[i];
// free
msg20->destructor();
}
// and free the lot of them
m_msg20Array.purge();
// now swap in the buf that has the urls/titles appended
m_recommendedLinksBuf.stealBuf ( &m_newLinkerBuf );
// . this is an array of Inlinks
// . shit, but we need to add a count of how many related docids
// had the inlink, and what the weight or score of it was
// . it should be based on the weights/scores of the related docids
// . maybe just hijack "Inlink::m_numUniqueIPs" or something
// . crap, we also need to store the RelatedDocIds, i guess we
// could store a list of offsets to them in m_relatedDocIdBuf
m_recommendedLinksBufValid = true;
// store in cachedb. if it blocks return -1. bufvalid is set to
// true so when this function is re-entered it should return
// the safebuf ptr right away.
if ( ! storeRecommendedLinksBuf () )
return (SafeBuf *)-1;
return &m_recommendedLinksBuf;
}
// returns false and sets g_errno on error
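// . on success it appends a variable-size record to m_newLinkerBuf:
//   [RecommendedLink header][url bytes][title bytes]
//   m_urlSize/m_titleSize are set first so RecommendedLink::getSize() can
//   skip over the whole record later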
bool XmlDoc::gotLinkerTitle ( Msg20 *msg20 ) {
// count it as returned
m_numMsg20sIn++;
// debug
//log("seo: DEBUG: got msg20 reply");
// get the recommendedlink for this (titleCursor)
char *vvv = m_recommendedLinksBuf.getBufStart();
RecommendedLink *rptrs = (RecommendedLink *)vvv;
int32_t titleCursor = msg20->m_hack2;
RecommendedLink *rl = &rptrs[titleCursor];
// sanity
if ( titleCursor < 0 ) {char *xx=NULL;*xx=0;}
// not found?
if ( g_errno ) {
log("seo: lookuptitles: %s",mstrerror(g_errno));
// ignore
g_errno = 0;
return true;
}
// get reply
Msg20Reply *reply = msg20->getReply();
// skip if linked to our site!
if ( reply->m_hasLinkToOurDomOrHost ) {
if ( m_seoDebug >= 2 )
log("seo: inlinker %s links to our "
"domain. ignoring.",
reply->ptr_ubuf);
return true;
}
// or if banned/filtered.. then skip
if ( reply->m_errno ) {
if ( m_seoDebug >= 2 )
log("seo: inlinker %s had error: %s",
reply->ptr_ubuf,
mstrerror(reply->m_errno));
return true;
}
// wtf?
if ( reply->size_ubuf <= 1 ) {
return true;
}
// set basic info
rl->m_rl_siteRank = reply->m_siteRank;
rl->m_rl_firstIp = reply->m_firstIp;
// sanity
if ( rl->m_rl_docId != reply->m_docId ) { char *xx=NULL;*xx=0; }
char *title = reply->ptr_tbuf;
int32_t titleSize = reply->size_tbuf;
if ( titleSize == 0 ) {
title = "\0";
titleSize = 1;
}
// debug
//log("seo: DEBUG: got VALID msg20 reply #%"INT32"",m_numValidMsg20s);
// count as valid
m_numValidMsg20s++;
rl->m_urlSize = reply->size_ubuf;
rl->m_titleSize = titleSize;
if ( ! m_newLinkerBuf.safeMemcpy ( rl , sizeof(RecommendedLink) ) )
return false;
if ( ! m_newLinkerBuf.safeMemcpy ( reply->ptr_ubuf,reply->size_ubuf))
return false;
if ( ! m_newLinkerBuf.safeMemcpy ( title , titleSize ) )
return false;
// i guess we are done then
return true;
}
/*
// returns false if blocked, true otherwise. sets g_errno on error
bool XmlDoc::printRecommendedLinksBuf ( SafeBuf *sb ) {
SafeBuf *recBuf = getRecommendedLinksBuf();
if ( ! recBuf ) return true;
if ( recBuf == (void *)-1 ) return false;
int32_t count = 1;
char *p = recBuf->getBufStart();
char *pend = recBuf->getBuf ();
for ( ; p < pend ; ) {
// cast it
RecommendedLink *ri = (RecommendedLink *)p;
// skip it
p += ri->getSize();
// print it out
sb->safePrintf("%"INT32") %.04f %s | %s<br>"
,count++
,ri->m_totalRecommendedScore
,ri->getUrl(recBuf)
,ri->getTitle(recBuf)
);
}
return true;
}
*/
// . use Msg25::m_numReplyPtrs and Msg25::m_replyPtrs[i] to access the
// Msg20s of the inlinks
// . NOT the same as getLinkInfo() because this does not filter out the
// "bad" inlinks, it gets everything and keeps the full Msg20Replies!!
Msg25 *XmlDoc::getAllInlinks ( bool forSite ) {
// if valid, return it now
if ( forSite && m_tempMsg25SiteValid )
return m_tempMsg25Site;
if ( ! forSite && m_tempMsg25PageValid )
return m_tempMsg25Page;
Msg25 *myMsg25 ;
if ( forSite ) myMsg25 = m_tempMsg25Site;
else myMsg25 = m_tempMsg25Page;
int32_t *ipp = getIp();
if ( ! ipp || ipp == (void *)-1 ) return (Msg25 *)ipp;
int64_t *d = getDocId();
if ( ! d || d == (int64_t *)-1 ) return (Msg25 *)d;
char *site = getSite ();
if ( ! site || site == (char *)-1 ) return (Msg25 *)site;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
Url *fu = getFirstUrl();
// make a new one
if ( ! myMsg25 ) {
Msg25 *msg25 = NULL;
try { msg25 = new ( Msg25 ); }
catch ( ... ) {
g_errno = ENOMEM;
log("xmldoc: linksrc error2 = %s",mstrerror(g_errno));
m_hadLinkInfoError = g_errno;
// bail now, otherwise we would call mnew() and
// getLinkInfo2() on a NULL msg25 below
return NULL;
}
mnew ( msg25, sizeof(Msg25),"m25li");
// record it for freeing/deleting later
if ( forSite ) m_tempMsg25Site = msg25;
else m_tempMsg25Page = msg25;
// reference it
myMsg25 = msg25;
}
int32_t type ;
if ( forSite ) type = cr_Msg25SiteInfo;
else type = cr_Msg25PageInfo;
// get list
RdbList *myList;
if ( forSite ) myList = &m_siteReplyList;
else myList = &m_pageReplyList;
int32_t uh32 =(uint32_t)((uint64_t)getFirstUrlHash64());
// first check cachedb!
bool checkIt = false;
if ( forSite && ! m_checkedCachedbForSite ) checkIt = true;
if ( ! forSite && ! m_checkedCachedbForPage ) checkIt = true;
if ( checkIt ) {
// do not repeat
if ( forSite ) m_checkedCachedbForSite = true;
else m_checkedCachedbForPage = true;
// use 0 for content hash since the link info is independent
// of your page's or site's content
key_t sk = g_cachedb.makeStartKey2 ( uh32 , 0 , type );
key_t ek = g_cachedb.makeEndKey2 ( uh32 , 0 , type );
// . get it from the appropriate host
// . get cachedb rec for all types of safebufs for this
// url/content
// . then we will set safebufs based on what recs we find
// in the returned list
if ( ! m_msg0.getList ( -1, // hostid
0 , // ip
0 , // port
0 , // maxcacheage
false, // addtocache?
RDB_CACHEDB,
cr->m_collnum ,
myList, // &m_cacheList,
(char *)&sk ,
(char *)&ek ,
30000000, // minrecsizes 30MB
m_masterState,
m_masterLoop,
m_niceness ) )
// blocked?
return (Msg25 *)-1;
}
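// the cachedb rec we parse below (and serialize further down) looks
// roughly like:
//   key_t key            (sizeof(key_t), 12 bytes)
//   int32 dataSize
//   int32 numReplies
//   then for each reply:  int32 replySize, serialized Msg20Reply
// the Msg20Reply ptrs we keep just point into myList's memory, which is
// why we set m_ownReplies to false below.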
Msg20Reply *reply;
// even if it had 0 msg20replies, list should be non-zero length
if ( ! myList->isEmpty() ) {
// get # replies
char *p = myList->getList();
// first is key
p += 12;
// then datasize
p += 4;
// then # msg20 replies
int32_t numReplies = *(int32_t *)p;
p += 4;
myMsg25->m_numReplyPtrs = numReplies;
// do not free any replies, they reference into m_pageList
myMsg25->m_ownReplies = false;
// loop over replies
for ( int32_t i = 0 ; i < numReplies ; i++ ) {
// get reply size
int32_t replySize = *(int32_t *)p;
p += 4;
// reply itself
reply = (Msg20Reply *)p;
// reconstruct ptrs from the offsets relative
// to start of "reply"
int32_t used = reply->deserialize();
if ( used < 0 ) {
log("xmldoc: reply deserialize error");
g_errno = ECORRUPTDATA;
return NULL;
}
// skip reply
p += replySize;
// store it
myMsg25->m_replyPtrs[i] = reply;
}
// validate!
if ( forSite ) m_tempMsg25SiteValid = true;
else m_tempMsg25PageValid = true;
// all done!
return myMsg25;
}
bool *calledItPtr ;
if ( forSite ) calledItPtr = &m_calledMsg25ForSite;
else calledItPtr = &m_calledMsg25ForPage;
// ok, get it the hard way
// send out the request now
if ( ! *calledItPtr ) {
// do not re-call!
*calledItPtr = true;
// call it now
if ( ! myMsg25->getLinkInfo2( site,
fu->getUrl() , // url
false , // isSiteLinkInfo?
*ipp,
*d, // docid
m_collnum,//cr->m_coll,
NULL, // qbuf
0, // qbufSize
m_masterState, // state
m_masterLoop, // callback
false, // isInjecting?
false, // pbuf (for printing)
//this, // xd holder (Msg25::m_xd
false, // printInXml
// this is irrelevant since we
// are getting all inlinks:
0, // siteNumInlinks, irrelevant
NULL, // oldlinkinfo
m_niceness,
true, // doLinkSpamCheck?
true, // onevoteperip. unused?
false,// can be cancelled?
0, // lastupdatetime
// !!!!!!!!!!
// we want all!!!!!!!!!!!!!!!!!!!
// !!!!!!!!!!
false ,//onlyneedgoodinlinks?
false,//getlinkertitles?
0, // ourhosthash32 (special)
0, // ourdomhash32 (special)
&m_myTempLinkInfoBuf ) )
// blocked?
return (Msg25 *)-1;
}
// validate it so when msg1 below returns and calls this function
// again at the top we return the ptr right away
if ( forSite ) m_tempMsg25SiteValid = true;
else m_tempMsg25PageValid = true;
// serialize the msg20 reply ptrs into a buf for list
SafeBuf listBuf;
// compute datasize
int32_t dataSize = 0;
// # of replies
dataSize += 4;
// each reply
for ( int32_t i = 0 ; i < myMsg25->m_numReplyPtrs ; i++ ) {
// reply size
dataSize += 4;
// reply data
//dataSize += myMsg25->m_replySizes[i];
// we can't use replySizes[i] because Linkdb.cpp will
// MODIFY the msg20 replies to add ptr_note/size_note
reply = myMsg25->m_replyPtrs[i];
// so we have to calculate the new serialized size
dataSize += reply->getStoredSize();
}
// how much to reserve?
int32_t need = sizeof(key_t) + 4 + dataSize;
// reserve that space!
if ( ! listBuf.reserve ( need ,"listbuf" ) ) {
// just ignore error
g_errno = 0;
// and return
if ( forSite ) return m_tempMsg25Site;
else return m_tempMsg25Page;
}
// make key for it, contenthash is 0, since it is irrelevant
key_t kk = g_cachedb.makeKey ( uh32 , 0 , type );
// store key
listBuf.safeMemcpy ( &kk , sizeof(key_t) );
// store datasize
listBuf.pushLong ( dataSize );
// # of replies
listBuf.pushLong ( myMsg25->m_numReplyPtrs );
// store each reply then
for ( int32_t i = 0 ; i < myMsg25->m_numReplyPtrs ; i++ ) {
// get reply
reply = myMsg25->m_replyPtrs[i];
// . how many bytes to store the MODIFIED msg20reply?
// . Linkdb.cpp adds the ptr_note AFTER it receives all replies
// so we can't just use Msg25::m_replySizes[i]
int32_t replySize = reply->getStoredSize();
listBuf.pushLong ( replySize );
// store that
int32_t stored = reply->serialize ( listBuf.getBuf() ,
listBuf.getAvail() );
// skip that
listBuf.incrementLength ( stored );
// sanity
if ( stored != replySize ) { char *xx=NULL;*xx=0; }
}
// sanity
if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; }
// make the list to add to cachedb. use the m_storeList member (not a
// local) since it must stay alive if the addList() below blocks.
key_t startKey = g_cachedb.makeStartKey2 ( uh32, 0 , type );
key_t endKey = g_cachedb.makeEndKey2 ( uh32, 0 , type );
m_storeList.set ( listBuf.getBufStart() ,
listBuf.length() ,
listBuf.getBufStart() , // alloc
listBuf.getCapacity(), // allocsize
startKey,
endKey,
-1, // fixeddatasize
true, // owndata?
false ); // use half keys?
// disconnect it from safebuf so it doesn't get freed
listBuf.detachBuf();
//m_storeList.printList();
QUICKPOLL(m_niceness);
char *tt ;
if ( forSite ) tt = "site";
else tt = "page";
log("xmldoc: adding msg20%slinkreplies list of %"INT32" bytes to cachedb",
tt,m_storeList.m_listSize);
// returns false if it blocks, true otherwise
if ( ! m_msg1.addList ( &m_storeList,
RDB_CACHEDB ,
cr->m_collnum,
m_masterState,
m_masterLoop,
false, // forcelocal?
m_niceness ) )
// blocked?
return (Msg25 *)-1;
if ( forSite ) return m_tempMsg25Site;
else return m_tempMsg25Page;
}
// . returns false and sets g_errno on error
// . sets RelatedDocId::m_relatedWeight
// . when printing the competitor pages, we sort by this, highest first
// 1. then scan the list of queries for each related docid
// 2. determine each of those matching queries weights
// 3. add up the weights and set RelatedDocId::m_relatedWeight to that
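// . roughly, each common query starts with weight 1.0 and gets multiplied
//   by:
//   - 100/10/1 depending on how generic the query is (numResults vs
//     point0/point1 below)
//   - a 1-10 bucket based on the related docid's serp score for it
//   - a 1-10 bucket based on the main url's serp score for it
//   and is then damped by 0.1 if it is a dup or subquery of an earlier
//   query. the sum of the weights becomes rd->m_relatedWeight.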
bool XmlDoc::setRelatedDocIdWeightAndRank ( RelatedDocId *rd ) {
// get our site hash
int32_t *shp = getSiteHash32();
if ( ! shp ) return false;
if ( shp == (int32_t *)-1 ) { char *xx=NULL;*xx=0; }
if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; }
int32_t mainUrlSiteRank = getSiteRank();
// max queries
int32_t nc = rd->m_numCommonQueries;
int32_t unit = 0;
unit += sizeof(float);
//unit += sizeof(Msg99Reply *);
unit += sizeof(Query);
unit += sizeof(HashTableX);
unit += sizeof(QueryNumLinkedNode *);
int32_t need = nc * unit;
char *mem = (char *)mmalloc ( need , "qrybuf" );
if ( ! mem ) {
log("seo: failed to set related docid weight: %s",
mstrerror(g_errno));
return false;
}
char *p = mem;
float *queryWeights = (float *)p;
p += nc * sizeof(float);
//Msg99Reply **replyPtrs = (Msg99Reply **)p;
//p += nc * sizeof(Msg99Reply *);
Query *queries = (Query *)p;
p += nc * sizeof(Query);
QueryNumLinkedNode **qnPtrs = (QueryNumLinkedNode **)p;
p += nc * sizeof(QueryNumLinkedNode *);
HashTableX *htables = (HashTableX *)p;
p += nc * sizeof(HashTableX);
// sanity
if ( p != mem + need ) { char *xx=NULL;*xx=0; }
// initialize the mem
for ( int32_t i = 0 ; i < nc ; i++ ) {
queryWeights[i] = 1.0;
qnPtrs[i] = NULL;
queries[i].constructor();
htables[i].constructor();
}
// total pages indexed!
//int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
float totalWeight;
// get matching queries
//SafeBuf *qpbuf = getMatchingQueriesScored();
//if ( ! qpbuf || qpbuf == (void *)-1 ) { char *xx=NULL;*xx=0; }
// cast it
//Msg99Reply **qptrs=(Msg99Reply **)qpbuf->getBufStart();
SafeBuf *mq = getMatchingQueryBuf();
if ( mq == NULL || mq == (void *)-1 ) { char *xx=NULL;*xx=0; }
int32_t nks = mq->length() / sizeof(QueryLink);
QueryLink *qks = (QueryLink *)mq->getBufStart();
// print the queries in common!
int32_t firstOff = rd->m_firstCommonQueryNumOff;
int32_t offset = firstOff;
int32_t qc = 0;
//int64_t numPagesIndexed = g_titledb.getGlobalNumDocs();
// this is fixed at the time we set QueryLogEntry::m_numResultsInSlice
int64_t numPagesIndexed = 1114000000;
int64_t point0 = numPagesIndexed / 119LL;
int64_t point1 = numPagesIndexed / 15LL;
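// point0/point1 are just genericness thresholds on the global result
// count: below point0 (~1/119th of the index) a query is considered rare
// and later gets a 100x weight, below point1 (~1/15th) it gets 10x,
// otherwise 1x. see the weight assignment further down.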
// loop over the query/score pairs this related docid matched
for ( ; offset >= 0 ; qc++ ) {
// get that node
char *buf = m_commonQueryNumBuf.getBufStart();
// and offset
buf += offset;
// then cast
QueryNumLinkedNode *qn;
qn = (QueryNumLinkedNode *)buf;
// advance. will be -1 when done
if ( qn ) offset = qn->m_nextOff;
else offset = -1;
// get #qn into there
//Msg99Reply *rp = qptrs[qn->m_queryNum];
if ( qn->m_queryNum < 0 || qn->m_queryNum >= nks ) {
char *xx=NULL;*xx=0; }
QueryLink *qk = &qks[qn->m_queryNum];
QueryLogEntry *qe ;
qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf);
char *qstr = qe->getQueryString();
qnPtrs[qc] = qn;
// save ptrs too
//replyPtrs[qc] = rp;
// get main url score for query
//float mainUrlScore = rp->m_myScore;
int32_t mainUrlSiteHash26 = m_siteHash32;
// seems like clusterdb masks them a bit in
// Clusterdb::getSiteHash()
mainUrlSiteHash26 &= 0x03ffffff;
int32_t mainUrlRank = -1;
int32_t rdRank = -1;
//float mainUrlSerpScore = -1.0;
// . the relateddocidnumhack
// . this is used as the topdocidnum # in the case of
// m_matchingQueryBuf (doMatchingQueries)
int32_t tdnum = qk->m_relatedDocIdNum;
TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart();
int32_t maxnum = m_topDocIdsBuf.length()/sizeof(TopDocIds);
if ( tdnum < 0 || tdnum >= maxnum ) { char *xx=NULL;*xx=0; }
TopDocIds *td = &tds[tdnum];
// assume none
//float rdScore = 0.0;
// find docid for this related docid
//TopDocIds *td = rp->getTopDocIds(&m_topDocIdsBuf);
int32_t nd = td->m_numDocIds;
for ( int32_t y = 0 ; y < nd ; y++ ) {
// if we first encounter a result from the same
// site as the main url then stop! you don't get
// the 10x bonus then!
if ( td->m_topSiteHashes26[y] == mainUrlSiteHash26 &&
mainUrlRank == -1 ) {
//mainUrlSerpScore = td->m_topScores[y];
mainUrlRank = y;
}
// set our score?
if ( td->m_topDocIds[y] == rd->m_docId ) {
//rdScore = td->m_topScores[y];
rdRank = y;
}
}
// these should always be set! even if not ranked in the
// top 300 because of our new logic using msg4f in
// getRelatedDocIdsScored()
float rdScore = qn->m_relatedDocIdSerpScore;
float mainUrlSerpScore = qk->m_serpScore;
bool better = false;
// give it a weight of 10 if higher-scoring!
//if ( rdRank < mainUrlRank ) better = true;
if ( rdScore >= mainUrlSerpScore ) better = true;
// if your site not in top 300 or so, and he is, he's better
//if ( mainUrlRank == -1 && rdRank >= 0 ) better = true;
// this is the specific url, not the SITE, like
// mainUrlRank is, for the entire site
//if ( rdScore > mainUrlScore ) better = true;
// how many search results does this query have total?
int64_t numResults = qe->m_numTotalResultsInSlice;
// fix it to be global
numResults *= (int64_t)g_hostdb.getNumShards();
// big indexes did the "slice logic" restricting docid
// range to MAX_DOCID * .10 when setting this!
if ( numPagesIndexed > 10000000 ) numResults *= 10;
////////////////////
//
// Scoring is what we do when the number of combinations
// it too high to effectively compute. - matt
//
////////////////////
// lower from 10 so google still won't dominate generic queries?
// crap, at 2.0 gigablast.com had bad competitors because
// they all match queries with gigablast in them.
// i put it down from 30.0 to 5.0 to fix chessusa.com
// which was getting bad competitor pages that had just
// 'ccc' matching non-generic queries, making them come up with too
// high a score.
//if ( better )
// queryWeights[qc] = 1.0;//30.0;//100.0; // 10.0;
//
// do not give related docid query that has YOUR brand in it
// much weight. we do not want it talking about you, because
// it is a competitor.
//
// PROBLEM: "cheatcodes.com"'s brand is descriptive!
//
// . if not generic and it beats YOU, give more!
// . try to fix ibm.com gigablast.com seomoz.org ahrefs.com
// that suffer because of matching their brand. actually
// maybe only do this if seomoz.org matches this query
// with their link text only...??? thus, pages that contain
// "seo moz" will match the "seo moz" query but will gain
// RELATIVELY little because they can't be seomoz.org on it.
// . crap though this will hurt chessusa.com right?? try again
// since algo changed a lot since then
bool isBrand = true;
// if other guy ranks better than you, probably not
// your brand, or if it is, it could be his brand too?
if ( better ) // && numResults < point0 )
isBrand = false;
// or if you are not in the top 100 it is probably not
// your brand name either!
if ( mainUrlRank == -1 )
isBrand = false;
// fix chessusa.com for 'chess' by lowering from 100 to 20...
if ( mainUrlRank >= 20 )
isBrand = false;
// fix 'corporation' for ibm.com. it is too generic to
// be a brand. on our 1.1B page index, point0 is like 9.3M.
// 'ibm' is 5.5M, 'corporation' is 25M,...
if ( numResults >= point0 )
isBrand = false;
// or for ibm.com ... or other pages with high siteranks,
// your brand queries should be in the top 10!! otherwise,
// ibm has so many other matching queries in the top 100 that
// are not brands for it because its siterank is so high.
if ( mainUrlSiteRank >= 10 && mainUrlRank >= 10 )
isBrand = false;
// top 5 for brands in siterank 11 sites
if ( mainUrlSiteRank >= 11 && mainUrlRank >= 5 )
isBrand = false;
// . good competitors will be in top 30 for a query
// . let's keep in mind though that we use these competitors
// to find backlinks AND to generate related terms, so
// it's not so important that they dominate a query, but
// rather that they match your content...
/*
if ( better &&
numResults < point0 &&
rdRank >= 0 &&
rdRank < 20 )
queryWeights[qc] *= 1.2;//50.0;
// top ten???
if ( better &&
numResults < point0 &&
rdRank >= 0 &&
rdRank < 10 )
queryWeights[qc] *= 1.3;//51.0;
// top 5?
if ( better &&
numResults < point0 &&
rdRank >= 0 &&
rdRank < 5 )
queryWeights[qc] *= 1.4;//52.0;
*/
// weight it by how relevant the query it matches is to us
//if ( better && numResults < point0 )
// queryWeights[qc] = (qk->m_serpScore / 1000000.0);
//
// generic query?
//
float weight = 1.0;
if ( numResults < point0 ) weight = 100.0;
else if ( numResults < point1 ) weight = 10.0;
queryWeights[qc] *= weight;
//
// weight by related docid's serp score
//
float ss = qk->m_serpScore;
float w2 = 1.0;
if ( ss > 1000000000.0 ) w2 = 10.0; // > 1B
else if ( ss > 100000000.0 ) w2 = 9.0; // > 100M
else if ( ss > 10000000.0 ) w2 = 8.0; // > 10M
else if ( ss > 1000000.0 ) w2 = 7.0; // > 1M
else if ( ss > 100000.0 ) w2 = 6.0; // > 100k
else if ( ss > 10000.0 ) w2 = 5.0; // > 10k
else if ( ss > 1000.0 ) w2 = 4.0; // > 1k
else if ( ss > 100.0 ) w2 = 3.0; // > 100
else if ( ss > 10.0 ) w2 = 2.0; // > 10
queryWeights[qc] *= w2;
//
// weight by main url's serp score as well!
//
ss = mainUrlSerpScore;//qk->m_serpScore;
w2 = 1.0;
if ( ss > 1000000000.0 ) w2 = 10.0; // > 1B
else if ( ss > 100000000.0 ) w2 = 9.0; // > 100M
else if ( ss > 10000000.0 ) w2 = 8.0; // > 10M
else if ( ss > 1000000.0 ) w2 = 7.0; // > 1M
else if ( ss > 100000.0 ) w2 = 6.0; // > 100k
else if ( ss > 10000.0 ) w2 = 5.0; // > 10k
else if ( ss > 1000.0 ) w2 = 4.0; // > 1k
else if ( ss > 100.0 ) w2 = 3.0; // > 100
else if ( ss > 10.0 ) w2 = 2.0; // > 10
queryWeights[qc] *= w2;
// punish query weight if it is your brand most likely
//if ( isBrand )
// queryWeights[qc] = 0.01;
// . store related docid rank and your rank
// . then we do not need cache m_topDocIdsBuf and seo.cpp
// has this info readily available.
qn->m_relatedDocIdRank = rdRank;
qn->m_mainUrlRank = mainUrlRank;
//qn->m_mainUrlSerpScore = mainUrlSerpScore;
/*
int64_t numResults = qe->m_numTotalResultsInSlice;
// fix it to be global
numResults *= (int64_t)g_hostdb.getNumGroups();
// big indexes did the "slice logic" restricting docid
// range to MAX_DOCID * .10 when setting this!
if ( numPagesIndexed > 10000000 ) numResults *= 10;
// fix divide by zero and make all rare queries similar weight
//if ( numResults < 1000 ) numResults = 1000;
// divide by # results query has so more generic stuff
// is down weighted
//queryWeights[qc] /= (float)numResults;
if ( numResults < 1000 )
queryWeights[qc] /= 1;
else if ( numResults < 10000 )
queryWeights[qc] /= 2;
else if ( numResults < 100000 )
queryWeights[qc] /= 4;
else if ( numResults < 1000000 ) // 1M
queryWeights[qc] /= 8;
else if ( numResults < 10000000 ) // 10M
queryWeights[qc] /= 16;
else if ( numResults < 10000000 ) // 100M
queryWeights[qc] /= 32;
else
queryWeights[qc] /= 64;
*/
//int32_t qlen = gbstrlen(qstr);
// shortcuts
Query *qp = &queries[qc];
HashTableX *ht = &htables[qc];
// this is currently a int64_t bit vector
int32_t vs = sizeof(qvec_t);
if ( ! ht->set ( 8,vs,128,NULL,0,false,m_niceness,"wbvbuf") )
// hopefully g_errno is preserved
goto done;
// if unknown use english so pandora's -> pandora,pandoras?
// because 'pandora's tower' was not matching
// 'pandoras tower' because both words could have been
// english or german, thus the queries were thought to be
// independent! giving rise to high-scoring competitive pages
// that matched only those two queries.
uint8_t qlangId = qe->m_langId;
if ( ! qlangId ) qlangId = langEnglish;
qp->set2 ( qstr , qlangId , true );
// hash it up
for ( int32_t i = 0 ; i < qp->m_numTerms ; i++ ) {
// shortcut
QueryTerm *qt = &qp->m_qterms[i];
// bigrams imply 2 explicit bits, one from each term
// in the bigram. synonym terms should share the same
// bit as the term they are a synonym of
int64_t bits = qt->m_implicitBits;
// . add bit vec. use rawTermId?
// . hash to wordbit vector of query words contained
if ( ! ht->addKey ( &qt->m_termId , &bits ) )
goto done;
}
}
// . set the dup flags!
// . scan queries related docid matches
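// sketch of the subset test: htables[x] maps each termId of query #x to
// that query's implicit term-bit vector. below we OR the vectors of the
// terms that query #j shares with query #i and mask with query #i's
// m_requiredBits; if that covers all of #i's required bits then query #i
// is a subquery of #j. the second pass does the same thing in reverse.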
for ( int32_t i = 0 ; i < qc ; i++ ) {
// get it
Query *qpi = &queries[i];
HashTableX *hti = &htables[i];
// scan all queries above
for ( int32_t j = i+1 ; j < qc ; j++ ) {
// reset
bool jIsSubQueryOfi = false;
bool iIsSubQueryOfj = false;
// skip ourselves
//if ( j == i ) continue;
// get it
Query *qpj = &queries[j];
HashTableX *htj = &htables[j];
// scan every query term in query #j and map each
// termid to the term bit vector that indicates what
// terms query #j has in query #i.
qvec_t totalVec = 0LL;
// is it a dup?
for ( int32_t k = 0 ; k < qpj->m_numTerms ; k++ ) {
// shortcut
QueryTerm *qt = &qpj->m_qterms[k];
// see if in there
char *val ;
val = (char *)hti->getValue(&qt->m_termId);
if ( ! val ) continue;
// get implied term bits
qvec_t vec = *(qvec_t *)val;
// this is the termbit vector for query #i.
// it tells us what terms query #j shares.
totalVec |= vec;
}
// we only care about "required" terms. i.e. bigrams
// are essentially ignored if not in quotes.
totalVec &= qpi->m_requiredBits;
// how many words do we match?
if ( sizeof(qvec_t) != 8 ) { char *xx=NULL;*xx=0; }
int32_t numSharedWithQueryi = getNumBitsOn64(totalVec);
// how many required bits does it have?
int32_t needi = getNumBitsOn64(qpi->m_requiredBits);
// if all terms in query #i are in query #j then subset
if ( numSharedWithQueryi == needi )
iIsSubQueryOfj = true;
//
// now go the other way
//
totalVec = 0LL;
// is it a dup?
for ( int32_t k = 0 ; k < qpi->m_numTerms ; k++ ) {
// shortcut
QueryTerm *qt = &qpi->m_qterms[k];
// see if in there
char *val;
val = (char *)htj->getValue(&qt->m_termId);
if ( ! val ) continue;
// get implied term bits
qvec_t vec = *(qvec_t *)val;
// this is the termbit vector for query #j.
// it tells us what terms query #i shares.
totalVec |= vec;
}
// we only care about "required" terms. i.e. bigrams
// are essentially ignored if not in quotes.
totalVec &= qpj->m_requiredBits;
// how many words do we match?
if ( sizeof(qvec_t) != 8 ) { char *xx=NULL;*xx=0; }
int32_t numSharedWithQueryj = getNumBitsOn64(totalVec);
// how many required bits does it have?
int32_t needj = getNumBitsOn64(qpj->m_requiredBits);
// if all terms in query #i are in query #j then subset
if ( numSharedWithQueryj == needj )
jIsSubQueryOfi = true;
// now set dup bit if query #i is same as query #j
// taking into account "missing spaces" so that we
// have two terms in one query , and their bigram
// in the other query. OR we have synonyms. OR we
// have differences of "ignored" words.
// "leg" = "legs"
// "cheat code" = "cheatcodes"
// "the tigers" = "tigers"
if ( jIsSubQueryOfi &&
iIsSubQueryOfj &&
queryWeights[j] > .02 ) {
// debug?
if ( m_seoDebug >= 2 )
log("seo: %s ISDUPOF %s",
qpj->m_orig,
qpi->m_orig);
// the dup weight is .02
queryWeights[j] *= .1; // = .02
}
// proper subquery examples:
// "leg" is subquery of "nice legs"
else if ( jIsSubQueryOfi &&
! iIsSubQueryOfj &&
queryWeights[j] > .05 ) {
// debug?
if ( m_seoDebug >= 2 )
log("seo: %s SUBQUERYOF %s",
qpj->m_orig,
qpi->m_orig);
// the subquery weight is .05
queryWeights[j] *= 0.1; // = 5.0;//.05;
}
// is query #i a PROPER subquery of query #j
else if ( iIsSubQueryOfj &&
! jIsSubQueryOfi &&
queryWeights[i] > .05 ) {
// debug?
if ( m_seoDebug >= 2 )
log("seo: %s SUBQUERYOF %s",
qpi->m_orig,
qpj->m_orig);
// the subquery weight is .05
// increase to 5.0 to try to drown out the
// anomaly queries promoting poker sites
// for cheatcodes.com competitors
queryWeights[i] *= 0.1; // = 5.0;//.05;
}
else {
// debug?
//if ( debug )
//log("seo: %s UNRELATEDTO %s",
// qpi->m_orig,
// qpj->m_orig);
}
}
}
// scan the queries again and add up their weights this time!
totalWeight = 0.0;
for ( int32_t i = 0 ; i < qc ; i++ ) {
totalWeight += queryWeights[i];
qnPtrs[i]->m_queryScoreWeight = queryWeights[i];
//Msg99Reply *ptr = replyPtrs[i];
Query *qp = &queries[i];
char *qstr = qp->m_orig;//ptr->m_queryStr;
// log it
if ( m_seoDebug >= 2 )
log("seo: docid=%"INT64" weight=%f qry=%s",
rd->m_docId,
queryWeights[i],
qstr);
}
// that is the docid related weight now
rd->m_relatedWeight = totalWeight;
done:
for ( int32_t i = 0 ; i < nc ; i++ ) {
queries[i].destructor();
htables[i].destructor();
}
mfree ( mem , need , "qrybuf" );
return true;
}
// returns false and sets g_errno on error
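// . each related docid keeps a singly-linked list of QueryNumLinkedNodes
//   in m_commonQueryNumBuf, one node per query it has in common with us.
//   nodes are chained by byte offsets, new nodes are prepended, and
//   rd->m_firstCommonQueryNumOff points at the head (-1 == empty list)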
bool XmlDoc::addRelatedDocIdInfo ( int64_t docId ,
int32_t queryNum ,
float score ,
int32_t rank ,
int32_t siteHash26 ) {
// do not add if does not match the query
if ( score <= 0.0 ) return true;
// alloc space if first time calling
if ( ! m_rdtab.m_numSlots ) {
if ( ! m_rdtab.set(8,sizeof(RelatedDocId),1024,NULL,0,
false,0,"rdtab"))
return false;
}
// get the related docid as it exists in m_relatedDocIdBuf
RelatedDocId *rd = NULL;
// now we also store these for intersecting
// in phase 2 to see what urls are most
// similar to us
int32_t slot = m_rdtab.getSlot(&docId);
// if not there, add it
if ( slot < 0 ) {
// make one
RelatedDocId rdx;
// the most important thing is the docid!
rdx.m_docId = docId;
// and now the 32-bit site hash
rdx.m_siteHash26 = siteHash26;
// how many search results we are in
rdx.m_numCommonQueries = 0;
// the queryImportance should be our score
// for this query divided by m_minTop50Score
// to normalize it.
//float qimp=qp->m_queryInfo.m_queryImportance;
// just add up the query importance for
// each query we share in common with main url
//rd.m_similarityScore = qip;
// now we do a dot product of this related
// docids score vector with the main url's
// score vector. both vector's are normalized
// using the score of the 1st result!
//rd.m_dotProduct = score;
// reset this
rdx.m_rd_siteRank = -1;
rdx.m_rd_langId = 255;
rdx.rd_title_off = -1;
rdx.rd_url_off = -1;
rdx.rd_site_off = -1;
// point to beginning of linked list of qrynums
rdx.m_firstCommonQueryNumOff = -1;//off;
//rdx.m_lastCommonQueryNumOff = -1;//off;
// remember offset
int32_t rdOff = m_relatedDocIdBuf.length();
// store it
m_relatedDocIdBuf.safeMemcpy ( &rdx , sizeof(RelatedDocId) );
// add OFFSET to table. data is 12 bytes
if(! m_rdtab.addKey(&docId,&rdOff)) return false;
// all done then
//continue;
// set this for adding to the linked list
char *p = m_relatedDocIdBuf.getBufStart() + rdOff;
// cast it
rd = (RelatedDocId *)p;
}
else {
// get the data
int32_t rdOff = *(int32_t *)m_rdtab.getValueFromSlot(slot);
// point to it
char *p = m_relatedDocIdBuf.getBufStart() + rdOff;
// cast it
rd = (RelatedDocId *)p;
}
// before we add the querynumlinkednode make sure not a dup!
char *qnbuf = m_commonQueryNumBuf.getBufStart();
// . offset of first node for this related docid
// . this is the start of his linked list of query/score nodes
int32_t firstOff = rd->m_firstCommonQueryNumOff;
// sanity
if ( firstOff == -1 && rd->m_numCommonQueries ) { char *xx=NULL;*xx=0;}
// assume no linked list
QueryNumLinkedNode *node = NULL;
// only a linked list if firstOff is not -1
if ( firstOff >= 0 ) node = (QueryNumLinkedNode *)(qnbuf + firstOff);
// scan the nodes (query/score pairs) we got for this related docid
for ( ; node ; ) {
// if this query is already in the linked list, stop! we
// do not want to add dup QueryNumLinkedNode nodes.
if ( node->m_queryNum == queryNum ) return true;
// end of linked list?
if ( node->m_nextOff == -1 ) break;
// advance to next node in linked list
node = (QueryNumLinkedNode *)(qnbuf+node->m_nextOff);
}
// store query num element in a linked list so
// we can print the actual queryNums a related
// docid has in common with the main url
int32_t nodeOff = m_commonQueryNumBuf.length();
// we can record our rank and your rank in this!
QueryNumLinkedNode qn;
qn.m_queryNum = queryNum; // qp->m_queryNum;
qn.m_nextOff = -1;
qn.m_relatedDocIdRank = rank;
qn.m_relatedDocIdSerpScore = score;
qn.m_mainUrlRank = -1;
//qn.m_mainUrlSerpScore = -1.0;
int32_t sq = sizeof(QueryNumLinkedNode);
// point to it
if ( ! m_commonQueryNumBuf.safeMemcpy(&qn,sq) )
return false;
// point to node we stored in the buf so we can adjust it below
QueryNumLinkedNode *stored ;
stored = (QueryNumLinkedNode *)(m_commonQueryNumBuf.getBuf() - sq);
// increment the count. the # of nodes in his linked list.
rd->m_numCommonQueries++;
// continue the linked list
qnbuf = m_commonQueryNumBuf.getBufStart();
// the first node?
if ( firstOff == -1 ) {
rd->m_firstCommonQueryNumOff = nodeOff;
//rd->m_lastCommonQueryNumOff = nodeOff;
return true;
}
// get the current first
int32_t oldFirstOff = rd->m_firstCommonQueryNumOff;
//char *vv = qnbuf + rd->m_firstCommonQueryNumOff;
//QueryNumLinkedNode *first = (QueryNumLinkedNode *)vv;
// we are the new first
rd->m_firstCommonQueryNumOff = nodeOff;
// we point to old first as our next
stored->m_nextOff = oldFirstOff;
// and update that node's next link
//last->m_nextOff = nodeOff;
// and our new tail
//rd->m_lastCommonQueryNumOff = nodeOff;
return true;
}
// . safebuf returned is a buffer of QueryLinks
// . use m_matchingQueryBuf/m_matchingStringBuf
SafeBuf *XmlDoc::getMatchingQueryBuf ( ) {
setStatus ( "getmatchingqueries" );
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_matchingQueryBufValid )
return &m_matchingQueryBuf;
if ( ! m_beginTimeAllMatch )
m_beginTimeAllMatch = gettimeofdayInMilliseconds();
if ( m_docIdListBuf.length() == 0 )
m_docIdListBuf.pushLongLong(m_docId);
// true = doMatchingQueries?
SafeBuf *qkbuf = getQueryLinkBuf ( &m_docIdListBuf , true );
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
m_matchingQueryBuf .stealBuf ( qkbuf );
m_matchingQueryStringBuf.stealBuf ( &m_queryLinkStringBuf );
// show time
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginTimeAllMatch;
log("seopipe: time: getMatchingQueries took %"INT64" ms",took);
m_matchingQueryBufValid = true;
// if getRelatedQueryBuf calls getQueryLinkBuf() it should
// do a recompute, so set this to false
m_queryLinkBufValid = false;
m_docIdListBuf.purge();
// store it
if ( ! storeMatchingQueriesIntoCachedb() )
return (SafeBuf *)-1;
return &m_matchingQueryBuf;
}
// . returns safebuf of QueryLinks, representing the intersected matching
// queries of all the related docids
SafeBuf *XmlDoc::getRelatedQueryBuf () {
// try to set from cachedb record
if ( ! checkCachedb() )
return (SafeBuf *)-1;
if ( m_relatedQueryBufValid )
return &m_relatedQueryBuf;
// we need these
SafeBuf *rdbuf = getRelatedDocIdsWithTitles();
if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf;
if ( ! m_beginRelatedQueries )
m_beginRelatedQueries = gettimeofdayInMilliseconds();
if ( m_docIdListBuf.length() == 0 ) {
int32_t numRelatedDocIds = rdbuf->length()/sizeof(RelatedDocId);
// just use the top 50 for related queries for speed!
if ( numRelatedDocIds > 50 ) numRelatedDocIds = 50;
RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) {
RelatedDocId *rd = &rds[i];
m_docIdListBuf.pushLongLong(rd->m_docId);
}
}
// false = doMatchingQueries?
SafeBuf *qkbuf = getQueryLinkBuf ( &m_docIdListBuf , false );
if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf;
m_relatedQueryBuf .stealBuf ( qkbuf );
m_relatedQueryStringBuf.stealBuf ( &m_queryLinkStringBuf );
m_relatedQueryBufValid = true;
m_queryLinkBufValid = false;
m_docIdListBuf.purge();
// show time
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginRelatedQueries;
log("seopipe: time: getRelatedQueries took %"INT64" ms",took);
// store it
if ( ! storeRelatedQueriesIntoCachedb() )
return (SafeBuf *)-1;
return &m_relatedQueryBuf;
}
static void gotMsg8eReplyWrapper ( void *state , UdpSlot *slot ) {
XmlDoc *THIS = (XmlDoc *)state;
int32_t hostId = slot->m_hostId;
THIS->m_msg8eReply [hostId] = slot->m_readBuf;
THIS->m_msg8eReplySize[hostId] = slot->m_readBufSize;
// do not let udpserver.cpp free it, we will later
slot->m_readBuf = NULL;
log("seo: got msg8e reply #%"INT32" of %"INT32" from host #%"INT32"",
(int32_t)THIS->m_numMsg8eReplies,
(int32_t)THIS->m_numMsg8eRequests,
(int32_t)hostId);
THIS->m_numMsg8eReplies++;
// do not free send buf until last reply!
if ( THIS->m_numMsg8eReplies < THIS->m_numMsg8eRequests ) {
slot->m_sendBufAlloc = NULL;
return;
}
// ok, sendBuf will auto free in UdpServer.cpp when we return from this
THIS->m_masterLoop ( THIS->m_masterState );
}
//static void gotMsg20ReplyWrapper ( void *state ) {
// XmlDoc *THIS = (XmlDoc *)state;
// THIS->m_numMsg20Replies++;
// if ( THIS->m_numMsg20Replies < THIS->m_numMsg20Requests )
// return;
// THIS->m_masterLoop ( THIS->m_masterState );
//}
// . returned safebuf is array of QueryLinks
// . gets all matching queries from all related docids and store them
// compactly as QueryLinks, otherwise we'd run out of memory because
// each docid has like 50,000 matching queries on avg.
// . we now get matching queries in modulus parts to avoid OOM, because
// with my new changes i made we are getting like a few hundred thousand
// matching queries per related docid.
// . we do not store the query string, etc, for the QueryLink,
// just the query offset and the hostid that has the query in its
// memory (g_qbuf). after we intersect the QueryLinks we will get the
// query strings, etc. there will be a lot fewer in the intersection.
SafeBuf *XmlDoc::getQueryLinkBuf(SafeBuf *docIdList, bool doMatchingQueries) {
if ( m_queryLinkBufValid )
return &m_queryLinkBuf;
bool doRelatedQueries = true;
if ( doMatchingQueries ) doRelatedQueries = false;
// get the 32-bit terms the main doc matches, so we may determine
// what terms in a related query are novel to this document.
SafeBuf *mainUrlTwidBuf32 = NULL;
if ( doRelatedQueries ) {
mainUrlTwidBuf32 = getTermId32Buf() ;//InfoBuf();
if ( ! mainUrlTwidBuf32 || mainUrlTwidBuf32 == (void *)-1 )
return mainUrlTwidBuf32;
}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
//
// SHIT! we can't use the keys in the termlistbuf for dual purpose
// role as terms the doc contains, because they do not have the
// synonym forms!!! So we have to get this terminfobuf as well
// as the termlistbuf for each docid!!!!
//
// so we might as well not sort by the lower 32 bit hack as well
//
//
//
// 1. get termlistbuf for each docid possibly using msg20s
//
// we need this for getting the QueryLink::m_serpScores in
// handleRequest8e
//
//
//int32_t numDocIds = docIdList->length() / 8;
//int64_t *docIds = (int64_t *)docIdList->getBufStart();
//SafeBuf *tlistBuf = NULL;
//SafeBuf *twidBuf32 = NULL;
// . we just want the termlistbuf of each related docid
// . hack: it should be sorted by the LOWER 32 bits of termid
// so handlerequest8e does not need to sort its termid32/twid32 buf
//if ( doMatchingQueries ) {
// tlistBuf = getTermListBuf();
// if ( ! tlistBuf || tlistBuf == (void *)-1 ) return tlistBuf;
// twidBuf32 = getTermId32Buf();
// if ( ! twidBuf32 || twidBuf32 == (void *)-1 ) return twidBuf32;
//}
/*
if ( doRelatedQueries && ! m_launchedAll ) {
int32_t need = sizeof(Msg20) * numDocIds;
// we also use this same buf in getRelatedDocIdsWithTitles
if ( ! m_msg20Buf.reserve ( need,"m20buf3" ) ) return NULL;
// mark it all in use
m_msg20Buf.setLength(need);
// init them
Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart();
int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20);
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) mp[i].constructor();
// reset cursor to start with first related docid
m_numMsg20Replies = 0;
m_numMsg20Requests = 0;
// launch all!
for ( int32_t i = 0 ; i < numMsg20s ; i++ ) {
// shortcut
Msg20 *msg20 = &mp[i];
// get current related docid
//RelatedDocId *rd = &rds[i];
// make the request
Msg20Request req;
req.ptr_coll = cr->m_coll;
req.size_coll = gbstrlen(cr->m_coll)+1;
req.m_docId = docIds[i];
req.m_expected = true;
req.m_niceness = m_niceness;
//req.m_state = m_masterState;
//req.m_callback2 = m_masterLoop;
req.m_state = this;
req.m_callback2 = gotMsg20ReplyWrapper;
// do not get summary stuff. too slow.
req.m_numSummaryLines = 0;
// get this
req.m_getTermListBuf = true;
// count these!
m_numMsg20Requests++;
// store cursor in msg20 itself so we know the rd
//msg20->m_hack2 = i;
// launch it
if ( ! msg20->getSummary ( &req ) ) continue;
// error?
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// note it
log("seo: error getting termlistbuf docid=%"INT64"",
docIds[i]);
// reset it
//msg20->reset();
// count reply as back now
m_numMsg20Replies++;
}
m_launchedAll = true;
}
// wait for one reply per related docid
if ( doRelatedQueries && m_numMsg20Replies < m_numMsg20Requests )
return (SafeBuf *)-1;
*/
//
//
// 2. send one msg8e request to each host with those termlistbufs
//
// it has one termlistbuf per relateddocid, enough info
// for handlerequest8e to return the list of matching QueryLinks
// intersected for all related docids.
//
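// the msg8e request we build below is laid out roughly as:
//   1 byte  : 1 = matching queries, 0 = related queries
//   coll\0  : collection name
//   int32   : byte length of the docid list, then the docids themselves
//   int32   : byte length of the main url's 32-bit twid buf, then the
//             twids (related-query case only)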
if ( m_numMsg8eRequests == 0 ) {
SafeBuf request;
// how big is the request?
int32_t need = 0;
need += 1; // for the byte flag
int32_t collLen = gbstrlen(cr->m_coll);
need += collLen + 1;
// list of docids (just one for matching queries)
need += 4;
need += docIdList->length();
// twidtable alloc
if ( doRelatedQueries ) {
need += 4;
need += mainUrlTwidBuf32->length();
}
//if ( doMatchingQueries ) {
// // just our main url's termlistbuf
// need += 4;
// need += tlistBuf->length();
// need += 4;
// need += twidBuf32->length();
//}
//
// make the 8e request
//
if ( ! request.reserve ( need ,"rep8ebuf" ) )
return NULL;
// first store flag to indicate if getting matching or
// related queries
if ( doMatchingQueries ) request.pushChar(1);
else request.pushChar(0);
// then coll\0
request.safeMemcpy ( cr->m_coll, collLen );
request.pushChar ( 0 );
// then docids after the collection name
request.pushLong ( docIdList->length() );
request.safeMemcpy ( docIdList );
// then if doing related queries we need to store our
// 32-bit twids of the main url for setting m_uniqueRound
if ( doRelatedQueries ) {
request.pushLong(mainUrlTwidBuf32->length());
request.safeMemcpy(mainUrlTwidBuf32->getBufStart(),
mainUrlTwidBuf32->length() );
}
/*
// then store each termlistbuf from each msg20
for ( int32_t i = 0 ; doRelatedQueries && i < numDocIds ; i++ ) {
// shortcut
Msg20 *mp = &mps[i];
Msg20Reply *rep = mp->getReply();
if ( rep ) {
request.pushLong ( rep->size_tlistBuf );
request.safeMemcpy ( rep->ptr_tlistBuf ,
rep->size_tlistBuf );
// then the 32-bit termid buf with synonyms
// that the above posdblist termlists don't
// have so we can match queries
request.pushLong ( rep->size_tiBuf );
request.safeMemcpy ( rep->ptr_tiBuf,
rep->size_tiBuf );
}
// make them empty i guess
else {
request.pushLong ( 0 );
request.pushLong ( 0 );
}
}
*/
/*
// just our main url's termlistbuf
if ( doMatchingQueries ) {
request.pushLong (tlistBuf->length());
request.safeMemcpy (tlistBuf);
// then the 32-bit termid buf with synonyms that
// the above posdblist termlists don't have so
// we can match queries
request.pushLong (twidBuf32->length());
request.safeMemcpy (twidBuf32);
}
*/
// sanity
if ( request.length() != need ) { char *xx=NULL;*xx=0; }
// do not free it here, let udpserver free it
char *req = request.getBufStart();
int32_t reqSize = request.length();
request.detachBuf();
// we've formulated the 8e request, no need for msg20s anymore
//for ( int32_t i = 0 ; doRelatedQueries && i < numDocIds ; i++ ){
// // shortcut
// Msg20 *mp = &mps[i];
// mp->destructor();
//}
// free the mem as well
//m_msg20Buf.purge();
// must be host #0 for this next algo to work
if ( g_hostdb.m_hostId != 0 ) { char *xx=NULL;*xx=0; }
//
// send msg8e request to each host. skip if dead.
//
for ( int32_t k = 1; k <= g_hostdb.m_numHosts ; k++ ) {
// breathe
QUICKPOLL(m_niceness);
// send to ourselves last so we can do all in parallel
int32_t hosti = k;
if ( k == g_hostdb.m_numHosts ) hosti = 0;
// get ptr to the host
Host *host = g_hostdb.getHost(hosti);
// get hostid of host #i
int32_t hostId = host->m_hostId;
if ( hostId != hosti ) { char *xx=NULL;*xx=0; }
// count it
m_numMsg8eRequests++;
// skip if dead. i guess no queries from that guy. we
// can't send to a twin because the twin does not have
// the same queries in its in-memory query log.
if ( g_hostdb.isDead(hostId) && host->m_wasEverAlive) {
log("seo: skipping msg8e to dead host %"INT32"",
hostId);
m_msg8eReply [hostId] = NULL;
m_msg8eReplySize[hostId] = 0;
m_numMsg8eReplies++;
continue;
}
// . send request to him
// . reply is the query strings
// . when reply comes in we store it in the query
// string buf and make the QueryLinks reference it
// with their QueryLink::m_queryStringOffset
if ( ! g_udpServer.sendRequest ( req ,
reqSize ,
0x8e , // msgtype
host->m_ip , // ip
host->m_port , // port
hostId,
NULL, // retslot
this,
gotMsg8eReplyWrapper,
999999, // timeout
-1 , // backoff
-1 , // maxwait
NULL, // replybuf
0, // replybufmaxsize
m_niceness // niceness
)) {
// let admin know about error
log("seopipe: sendRequest 8e had error: %s",
mstrerror(g_errno));
// count it as replied then
m_numMsg8eReplies++;
continue;
}
}
}
// this should never happen now with our new wrapper
if ( m_numMsg8eReplies < m_numMsg8eRequests )
return (SafeBuf *)-1;
//
//
// 3. MERGE the msg8e replies from all hosts
//
//
// gotMsg8eReplyWrapper() should have recorded each one into
// m_msg8eReply[i], the msg8e reply ptr. set up for merging.
char *bestPtr[MAX_HOSTS];
char *bufEnd [MAX_HOSTS];
for ( int32_t i = 0; i < g_hostdb.m_numHosts ; i++ ) {
char *reply = m_msg8eReply [i];
// this happens if host is dead...
if ( ! reply ) {
bestPtr[i] = NULL;
bufEnd [i] = NULL;
continue;
}
//int32_t replySize = m_msg8eReplySize [i];
// it should be a list of QueryLinks
char *p = reply;
int32_t queryLinkBufSize = *(int32_t *)p;
p += 4;
bestPtr[i] = p;
// bufEnd[i] also marks the start of the querystringbuf
bufEnd [i] = p + queryLinkBufSize;
}
int32_t count = 0;
int32_t maxQueryLinks = MAX_RELATED_QUERIES;
if ( doMatchingQueries ) maxQueryLinks = MAX_MATCHING_QUERIES;
// now merge the top "max" highest scoring
// QueryLinks and their corresponding QueryLogEntries into
// m_queryLinkBuf/m_queryLinkStringBuf
storeMore:
// get the max scoring QueryLink from the 8e replies
int32_t maxi = -1;
float maxScore = -1.0;
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
// skip if exhausted
if ( bestPtr[i] >= bufEnd[i] ) continue;
// cast it
QueryLink *qk = (QueryLink *)bestPtr[i];
// sanity: core dump if this is not a list head
if ( ! qk->m_isFirst ) { char *xx=NULL;*xx=0; }
// skip if score is not the current maximum
if ( qk->m_totalQueryImportance < maxScore ) continue;
// we got a new max!
maxScore = qk->m_totalQueryImportance;
maxi = i;
}
// store max into m_queryLinkBuf and m_queryLinkStringBuf
if ( maxi >= 0 ) {
// shortcut
QueryLink *best = (QueryLink *)bestPtr[maxi];
// get # to copy
int32_t toCopy = sizeof(QueryLink);
if ( doRelatedQueries )
// how many querylinks in this list? i.e. those
// that all share the same query, but different
// relateddocid?
toCopy = best->m_numInList * sizeof(QueryLink);
// copy the querylink
if ( ! m_queryLinkBuf.reserve ( toCopy ) ) return NULL;
// point to it
QueryLink *qk = (QueryLink *)m_queryLinkBuf.getBuf();
// THEN store it
m_queryLinkBuf.safeMemcpy( best , toCopy );
// point to its querylogentry buf, it occurs right
// after the list of QueryLinks!
char *p = bufEnd[maxi];
// and the query it is for
p += qk->m_queryStringOffset;
// cast that
QueryLogEntry *qe = (QueryLogEntry *)p;
// ensure enough space
if ( ! m_queryLinkStringBuf.reserve(qe->getSize(),"rqbb" ) )
return NULL;
// we are moving it into the final buf
qk->m_queryStringOffset = m_queryLinkStringBuf.length();
// store query log entry here now
m_queryLinkStringBuf.safeMemcpy ( qe, qe->getSize() );
// advance
bestPtr[maxi] += toCopy;
}
// limit
if ( ++count < maxQueryLinks ) goto storeMore;
// free the msg8e reply buffers now that they are merged
for ( int32_t i = 0; i < g_hostdb.m_numHosts;i++) {
if ( ! m_msg8eReply[i] ) continue;
mfree ( m_msg8eReply[i] , m_msg8eReplySize[i] , "8erep" );
m_msg8eReply[i] = NULL;
}
// reset our parms if we are re-called for related queries
m_numMsg8eReplies = 0;
m_numMsg8eRequests = 0;
// show time
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_beginRelatedQueries;
log("seopipe: getrelatedquerybuftook %"INT64" ms",took);
m_beginRelatedQueries = 0LL;
// validate
m_queryLinkBufValid = true;
/*
// log for debug
qks = (QueryLink *)m_queryLinkBuf->getBufStart();
nks = m_queryLinkBuf->length() / sizeof(QueryLink);
for ( int32_t k = 0 ; k < nks ; k++ ) {
// now we use offsets into m_relatedQueryBuf.m_buf[]
QueryRel *qk = &qks[k];
// skip if not a head
if ( ! qk->m_isFirst ) continue;
char *qstr = qk->getQueryString(&m_queryLinkStringBuf);
log("seopipe: relquery=\"%s\" imp=%f votes=%"INT32"",
qstr,
qk->m_rq_totalScore,
qk->m_docIdVotes);
}
*/
return &m_queryLinkBuf;
}
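// Illustrative sketch (not part of the build): the merge above is an
// N-way "take the best head" merge. Each host's 8e reply holds
// fixed-size records already sorted by score, and we repeatedly copy
// the highest-scoring head among all lists and advance that list.
// Generic version of the same pattern (names are hypothetical):
/*
#include <string.h> // memcpy

static int32_t mergeTopRecords ( char **heads , char **ends ,
				 int32_t numLists , int32_t recSize ,
				 float (*getScore)(char *rec) ,
				 char *dst , int32_t maxRecords ) {
	int32_t stored = 0;
	while ( stored < maxRecords ) {
		int32_t besti     = -1;
		float   bestScore = -1.0;
		for ( int32_t i = 0 ; i < numLists ; i++ ) {
			// skip exhausted lists
			if ( heads[i] >= ends[i] ) continue;
			float s = getScore ( heads[i] );
			if ( s < bestScore ) continue;
			bestScore = s;
			besti     = i;
		}
		// all lists exhausted?
		if ( besti < 0 ) break;
		memcpy ( dst + stored * recSize , heads[besti] , recSize );
		heads[besti] += recSize;
		stored++;
	}
	return stored;
}
*/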
// scan matches like XmlDoc::getSummary() does and get all sentences
// containing a query term...
//void XmlDoc::getGigabitExcerpts ( ) {
//}
// this is still used by Title.cpp to get the title: field quickly
char *getJSONFieldValue ( char *json , char *field , int32_t *valueLen ) {
if ( ! json ) return NULL;
// get length
int32_t fieldLen = gbstrlen(field);
// keep track of in a quote or not
bool inQuotes = false;
char *stringStart = NULL;
char *p = json;
bool gotOne = false;
int32_t depth = 0;
// scan
for ( ; *p ; p++ ) {
// escaping a quote? ignore quote then.
if ( *p == '\\' && p[1] == '\"' ) {
// skip two bytes then..
p++;
continue;
}
// count {} depth
if ( ! inQuotes ) {
if ( *p == '{' ) depth++;
if ( *p == '}' ) depth--;
}
// a quote?
if ( *p == '\"' ) {
inQuotes = ! inQuotes;
// set start of the string if quote is beginning
if ( inQuotes ) stringStart = p + 1;
// if quote is ending and a colon follows then
// it was a json field name. so if it matches the
// field we want, return the string value that follows it.
else if ( ! inQuotes &&
! gotOne &&
p[1] == ':' &&
// {"title":"whatever",...}
// could be product:{title:... depth=2
(depth == 1 ||depth==2) &&
stringStart &&
(p - stringStart) == fieldLen &&
strncmp(field,stringStart,fieldLen)==0 ) {
// now, the next time we set stringStart
// it will be set to the VALUE of this field
// assuming the field is a STRING!!!!
gotOne = true;
// return after the quote
//return p + 2;
}
// ok, we got the string after the field string...
else if ( ! inQuotes && gotOne ) {
if ( valueLen ) *valueLen = p - stringStart;
return stringStart;
}
// keep chugging
continue;
}
}
// done, not found
return NULL;
}
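// Illustrative usage (not part of the build): getJSONFieldValue()
// returns a pointer INTO the json buffer plus a length, it does not
// NUL-terminate, so a caller like Title.cpp would do something like:
/*
	int32_t vlen = 0;
	char *json  = (char *)"{\"title\":\"Cool Page\",\"type\":\"a\"}";
	char *value = getJSONFieldValue ( json , (char *)"title" , &vlen );
	if ( value )
		log("json title = %.*s", vlen, value); // "Cool Page"
*/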
Json *XmlDoc::getParsedJson ( ) {
if ( m_jpValid ) return &m_jp;
// core if not a json object
if ( m_contentTypeValid && m_contentType != CT_JSON &&
// spider status docs are now really json
m_contentType != CT_STATUS ) {
char *xx=NULL;*xx=0; }
// \0 terminated
char **pp = getUtf8Content();
if ( ! pp || pp == (void *)-1 ) return (Json *)pp;
// point to the json
char *p = *pp;
// empty? all done then.
//if ( ! p ) return (char *)pp;
// . returns NULL and sets g_errno on error
// . if p is NULL i guess this should still be ok and be empty
if ( ! m_jp.parseJsonStringIntoJsonItems ( p , m_niceness ) ) {
g_errno = EBADJSONPARSER;
return NULL;
}
m_jpValid = true;
return &m_jp;
}
// . returns -1 if blocked, returns NULL and sets g_errno on error
// . hash each json VALUE (not FIELD) ... AND ... hash each json
// VALUE with its FIELD like "title:cool" or "description:whatever"
// . example:
// [{"id":"b7df5d33-3fe5-4a6c-8ad4-dad495b586cd","finish":1378322570280,"matched":64,"status":"Stopped","start":1378322184332,"token":"poo","parameterMap":{"token":"poo","seed":"www.alleyinsider.com","api":"article"},"crawled":64},{"id":"830e0584-7f69-4bdd-
#include "Json.h"
char *XmlDoc::hashJSONFields ( HashTableX *table ) {
setStatus ( "hashing json fields" );
HashInfo hi;
hi.m_tt = table;
hi.m_desc = "json object";
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (char *)jp;
return hashJSONFields2 ( table , &hi , jp , true );
}
char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) {
JsonItem *ji = jp->getFirstItem();
char nb[1024];
SafeBuf nameBuf(nb,1024);
//int32_t totalHash32 = 0;
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not number or string
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
// reset, but don't free mem etc. just set m_length to 0
nameBuf.reset();
// get its full compound name like "meta.twitter.title"
JsonItem *p = ji;
char *lastName = NULL;
char *nameArray[20];
int32_t numNames = 0;
for ( ; p ; p = p->m_parent ) {
// empty name?
if ( ! p->m_name ) continue;
if ( ! p->m_name[0] ) continue;
// dup? can happen with arrays: the parent of a string
// in an object has the same name as its own parent,
// the name of the array. "dupname":[{"a":"b"},{"c":"d"}]
if ( p->m_name == lastName ) continue;
// update
lastName = p->m_name;
// add it up
nameArray[numNames++] = p->m_name;
// breach?
if ( numNames < 15 ) continue;
log("build: too many names in json tag");
break;
}
// if we are the diffbot reply "html" field do not hash this
// because it is redundant and it hashes html tags etc.!
// plus it slows us down a lot and bloats the index.
if ( ji->m_name && numNames==1 && strcmp(ji->m_name,"html")==0)
continue;
// assemble the names in reverse order which is correct order
for ( int32_t i = 1 ; i <= numNames ; i++ ) {
// copy into our safebuf
if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) )
return NULL;
// separate names with periods
if ( ! nameBuf.pushChar('.') ) return NULL;
}
// remove last period
nameBuf.removeLastChar('.');
// and null terminate
if ( ! nameBuf.nullTerm() ) return NULL;
// change all :'s in names to .'s since : is reserved!
char *px = nameBuf.getBufStart();
for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.';
//for ( px = nameBuf.getBufStart(); *px ; px++ ) if ( *px == '-' ) *px = '_';
//
// DIFFBOT special field hacks
//
char *name = nameBuf.getBufStart();
hi->m_hashGroup = HASHGROUP_BODY;
if ( strstr(name,"title") )
hi->m_hashGroup = HASHGROUP_TITLE;
if ( strstr(name,"url") )
hi->m_hashGroup = HASHGROUP_INURL;
if ( strstr(name,"resolved_url") )
hi->m_hashGroup = HASHGROUP_INURL;
if ( strstr(name,"tags") )
hi->m_hashGroup = HASHGROUP_INTAG;
if ( strstr(name,"meta") )
hi->m_hashGroup = HASHGROUP_INMETATAG;
//
// now Json.cpp decodes and stores the value into
// a buffer, so ji->getValue() should be decoded completely
//
// . get the value of the json field
// . if it's a number or bool it converts into a string
int32_t vlen;
char *val = ji->getValueAsString( &vlen );
char tbuf[32];
// if the value is clearly a date, just hash it as
// a number, so use a temporary value that holds the
// time_t and hash with that... this will hash
// diffbot's article date field as a number so we can
// sortby and constrain by it in the search results
if ( name && (strcasecmp(name,"date") == 0 || strcasecmp(name,"estimatedDate") == 0)) {
// this is in HttpMime.cpp
int64_t tt = atotime1 ( val );
// we can't store 64-bit dates... so truncate to -2147483648
// which is Dec 13 1901. so we don't quite get the 1898 date
// for the new york times dbpedia entry. maybe if we added
// an extra termlist for more precision to indicate century or
// something.
if ( tt && tt < (int32_t)0x80000000 )
tt = (int32_t)0x80000000;
// likewise, we can't be too big, past 2038
if ( tt && tt > 0x7fffffff )
tt = (int32_t)0x7fffffff;
if ( tt ) {
// print out the time_t in ascii
vlen = sprintf(tbuf,"%"INT32"",(int32_t)tt);
// and point to it for hashing/indexing
val = tbuf;
}
}
//
// for deduping search results we set m_contentHash32 here for
// diffbot json objects.
// we can't do this here anymore, we have to set the
// contenthash in ::getContentHash32() because we need it to
// set EDOCUNCHANGED in ::getIndexCode() above.
//
/*
if ( hi->m_hashGroup != HASHGROUP_INURL ) {
// make the content hash so we can set m_contentHash32
// for deduping
int32_t nh32 = hash32n ( name );
// do an exact hash for now...
int32_t vh32 = hash32 ( val , vlen , m_niceness );
// accumulate, order independently
totalHash32 ^= nh32;
totalHash32 ^= vh32;
}
*/
// index like "title:whatever"
hi->m_prefix = name;
hashString ( val , vlen , hi );
//log("hashing json var as %s %s %d", name, val, vlen);
// hash gbfieldmatch:some.fieldInJson:"case-sens field Value"
if ( name )
hashFieldMatchTerm ( val , (int32_t)vlen , hi );
if ( ! hashWithoutFieldNames )
continue;
// hash without the field name as well
hi->m_prefix = NULL;
hashString ( val , vlen , hi );
/*
// a number? hash special then as well
if ( ji->m_type != JT_NUMBER ) continue;
// use prefix for this though
hi->m_prefix = name;
// hash as a number so we can sort search results by
// this number and do range constraints
float f = ji->m_valueDouble;
if ( ! hashNumber2 ( f , hi ) )
return NULL;
*/
}
//m_contentHash32 = totalHash32;
//m_contentHash32Valid = true;
return (char *)0x01;
}
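// Illustrative sketch (not part of the build): two behaviors of
// hashJSONFields2() worth calling out. (1) For a doc like
//   {"meta":{"twitter":{"title":"Cool Page"}}}
// the value is hashed with its compound name as the prefix (roughly
// "meta.twitter.title:cool" style terms) and again with no prefix.
// (2) "date"/"estimatedDate" values get squeezed into the signed
// 32-bit time_t range so they can be indexed as sortable numbers;
// ignoring the tt==0 "no date" case preserved above, that clamp is:
/*
static int32_t clampTimeToInt32 ( int64_t tt ) {
	// before Dec 13 1901, the earliest representable second
	if ( tt < (int64_t)(int32_t)0x80000000 )
		return (int32_t)0x80000000;
	// past Jan 19 2038, the latest representable second
	if ( tt > (int64_t)0x7fffffff )
		return (int32_t)0x7fffffff;
	return (int32_t)tt;
}
*/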
char *XmlDoc::hashXMLFields ( HashTableX *table ) {
setStatus ( "hashing xml fields" );
HashInfo hi;
hi.m_tt = table;
hi.m_desc = "xml object";
hi.m_hashGroup = HASHGROUP_BODY;
Xml *xml = getXml();
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes ();
SafeBuf nameBuf;
// scan the xml nodes
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// . skip if it's a tag node, not a text node
// . we just want the "text" nodes
if ( nodes[i].isTag() ) continue;
//if(!strncmp(nodes[i].m_node,"Congress%20Presses%20Uber",20))
// log("hey:hy");
// assemble the full parent name
// like "tag1.tag2.tag3"
nameBuf.reset();
xml->getCompoundName ( i , &nameBuf );
// this is \0 terminated
char *tagName = nameBuf.getBufStart();
// get the utf8 text
char *val = nodes[i].m_node;
int32_t vlen = nodes[i].m_nodeLen;
// index like "title:whatever"
if ( tagName && tagName[0] ) {
hi.m_prefix = tagName;
hashString ( val , vlen , &hi );
}
// hash without the field name as well
hi.m_prefix = NULL;
hashString ( val , vlen , &hi );
}
return (char *)0x01;
}
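// Illustrative example (not part of the build): for xml like
//   <item><title>Cool Page</title></item>
// the text node "Cool Page" gets the compound name "item.title", so
// hashXMLFields() above indexes it once with that prefix (roughly
// "item.title:cool", "item.title:page") and once with no prefix,
// mirroring what hashJSONFields2() does for json fields.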
// if our url is that of a subdoc, then get the url of the parent doc
// from which we were a subsection
char *XmlDoc::getDiffbotParentUrl( char *myUrl ) {
// remove -diffbotxyz
if ( ! m_kbuf.safeStrcpy( myUrl ) ) return NULL;
char *p = m_kbuf.getBufStart();
char *s = strstr(p,"-diffbotxyz");
if ( s ) { *s = '\0'; return p; }
// temporarily until we inject "diffbotreply" uncomment this
/*
// otherwise i guess we got dan's format of -article|%"INT32"|%"INT32"
char *e = m_kbuf.getBuf() - 1;
for ( ; *e && is_digit(*e) ; e-- );
if ( *e != '|' ) return NULL;
e--;
for ( ; *e && is_digit(*e) ; e-- );
if ( *e != '|' ) return NULL;
e--;
// now to hyphen
char *estart = m_kbuf.getBufStart();
for ( ; e>estart && *e !='-' ; e-- );
if ( *e != '-' ) return NULL;
*e = '\0';
return p;
*/
return NULL;
}
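// Illustrative usage (not part of the build, "xd" is a hypothetical
// XmlDoc pointer): the "-diffbotxyz" suffix marks a diffbot
// subdocument url, so
//   char *parent = xd->getDiffbotParentUrl (
//       (char *)"http://host.com/page.html-diffbotxyz123456" );
// returns "http://host.com/page.html" (a pointer into m_kbuf), while
// urls without that suffix currently return NULL.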
bool XmlDoc::storeFacetValues ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
// sanity
if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
storeFacetValuesSite ( qs, sb, fvh );
if ( m_hasMetadata) {
Json jpMetadata;
if (jpMetadata.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)) {
storeFacetValuesJSON ( qs, sb, fvh, &jpMetadata );
}
}
// if "qa" is a gbxpathsitehash123456 type of beastie then we
// gotta scan the sections
if ( strncasecmp(qs,"gbxpathsitehash",15) == 0 )
return storeFacetValuesSections ( qs , sb , fvh );
// if a json doc, get json field
// spider status docs are really json now
if ( m_contentType == CT_JSON || m_contentType == CT_STATUS )
return storeFacetValuesJSON ( qs , sb , fvh, getParsedJson());
if ( m_contentType == CT_HTML )
return storeFacetValuesHtml ( qs , sb , fvh );
if ( m_contentType == CT_XML )
return storeFacetValuesXml ( qs , sb , fvh );
return true;
}
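// Illustrative sketch (not part of the build): each of the
// storeFacetValues*() helpers below appends pairs of NUL-terminated
// strings to "sb":
//   <facet field>\0<value>\0
// where string facets prefix the value with its 32-bit hash and a
// comma ("12345,actual text") so the caller, e.g.
// Msg40::lookUpFacets(), can map hashes back to strings. A consumer
// could walk that buffer like so (hypothetical helper):
/*
static void printFacetPairs ( char *buf , int32_t bufLen ) {
	char *p    = buf;
	char *pend = buf + bufLen;
	while ( p < pend ) {
		char *field = p;
		p += gbstrlen(p) + 1;       // skip field + '\0'
		if ( p >= pend ) break;
		char *value = p;
		p += gbstrlen(p) + 1;       // skip value + '\0'
		log("facet %s = %s", field, value);
	}
}
*/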
// Store facet for site
bool XmlDoc::storeFacetValuesSite ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
char* val = getSite();
int vlen = gbstrlen(val);
FacetValHash_t val32 = hash32 ( val , vlen );
// skip if not for us
if ( fvh && val32 != fvh ) return false;
if ( strcmp("gbtagsite",qs) ) return false;
// otherwise add facet FIELD to our buf
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// then add facet VALUE
if ( !sb->safePrintf("%"UINT32",",(uint32_t)val32)) return false;
if ( val && vlen && ! sb->safeMemcpy(val,vlen) ) return false;
if ( ! sb->pushChar('\0') ) return false;
return true;
}
bool XmlDoc::storeFacetValuesSections ( char *qs , SafeBuf *sb ,
FacetValHash_t fvh ) {
// scan all sections
Sections *ss = getSections();
if ( ! ss ) return false;
if ( ss == (void *)-1 ) { char *xx=NULL;*xx=0; }
Words *ww = getWords();
if ( ! ww ) return false;
if ( ww == (void *)-1 ) { char *xx=NULL;*xx=0; }
int32_t siteHash32 = *getSiteHash32();
// qs is like gbxpathsitehash1234567
// so get the digit part
char *p = qs;
for ( ; *p && ! is_digit(*p); p++ );
uint64_t xsh = (uint64_t)atoll(p);
// NOTE: qs presumably points just past the "str:"/"int:" part of the
// full facet term (e.g. "gbfacetstr:..."), so peeking at qs-4 here,
// and in the other storeFacetValues*() functions below, stays inside
// that original term buffer.
bool isString = false;
if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;
Section *si = ss->m_rootSection;
//sec_t mflags = SEC_SENTENCE | SEC_MENU;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
// is it a match?
uint64_t mod;
mod = (uint32_t)si->m_turkTagHash32;
mod ^= (uint32_t)siteHash32;
if ( mod != xsh ) continue;
// . then add facet VALUE
// . hash of the innerhtml of sentence
// . get hash of sentences this tag contains indirectly
uint32_t val32 = (uint32_t)si->m_indirectSentHash64;
if ( ! val32 ) continue;
// if a facetvalhash was provided we must match
if ( fvh && val32 != fvh ) continue;
// got one print the facet field
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
if ( isString && ! sb->safePrintf("%"UINT32",",val32) )
return false;
// but ALSO print the string itself, somewhat truncated
char *a = m_words.m_words[si->m_next->m_a];
char *b = m_words.m_words[si->m_next->m_b-1];
b += m_words.m_wordLens [si->m_next->m_b-1];
if ( ! sb->safeTruncateEllipsis (a,b-a,160) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// if wanted a specific string, we are done
if ( fvh ) return true;
}
return true;
}
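// Illustrative example (not part of the build): a sections facet term
// looks like "gbxpathsitehash3086947519" where the trailing number
// identifies one xpath on one site and is matched above against
// (si->m_turkTagHash32 ^ siteHash32). The digit scan at the top of
// this function recovers that number:
/*
	char    *qs  = (char *)"gbxpathsitehash3086947519";
	char    *p   = qs;
	for ( ; *p && ! is_digit(*p) ; p++ );
	uint64_t xsh = (uint64_t)atoll(p);   // 3086947519
*/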
bool XmlDoc::storeFacetValuesHtml(char *qs, SafeBuf *sb, FacetValHash_t fvh ) {
Xml *xml = getXml();
int32_t qsLen = gbstrlen(qs);
bool isString = false;
if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;
// check for gblang:en etc.
// if ( isString && strncmp(qs,"gblang",6)==0 ) {
// if (!sb->safeStrcpy(qs) ) return false;
// if (!sb->pushChar('\0') ) return false;
// // find the lang that has that hash!
// if (!sb->safePrintf("%"UINT32",",(uint32_t)val32))return false;
// if (!sb->safeMemcpy(content,contentLen) ) return false;
// if (!sb->pushChar('\0') ) return false;
//}
char *content;
int32_t contentLen;
int32_t nameLen;
char *s;
int32_t i = 0;
bool uniqueField = false;
// a title tag can count now too
if ( strcmp(qs,"title") == 0 ) {
// skip leading spaces = false
content = xml->getString ("title",&contentLen,false);
uniqueField = true;
goto skip;
}
// scan the meta nodes for ones whose name attribute matches qs
for ( i = 0 ; i < xml->m_numNodes ; i++ ) {
// continue if not a meta tag
if ( xml->m_nodes[i].m_nodeId != TAG_META ) continue;
// . does it have a type field that's "summary"
// . <meta name=summary content="...">
// . <meta http-equiv="refresh" content="0;URL=http://y.com/">
s = xml->getString ( i , "name", &nameLen );
// "s" can be "summary","description","keywords",...
if ( nameLen != qsLen ) continue;
if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue;
// point to the summary itself
content = xml->getString ( i , "content" , &contentLen );
if ( ! content || contentLen <= 0 ) continue;
skip:
// hash it to match it if caller specified a particular hash
// because they are coming from Msg40::lookUpFacets() function
// to convert the hashes to strings, like for rendering in
// the facets box to the left of the search results
FacetValHash_t val32 = hash32 ( content, contentLen);
if ( fvh && fvh != val32 ) continue;
// otherwise add facet FIELD to our buf
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// then add facet VALUE
if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32))
return false;
if ( !sb->safeMemcpy(content,contentLen) ) return false;
if ( !sb->pushChar('\0') ) return false;
// if only one specified, we are done
if ( fvh ) return true;
if ( uniqueField ) return true;
}
return true;
}
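// Illustrative example (not part of the build): for html containing
//   <title>Cool Page</title>
//   <meta name="description" content="A very cool page.">
// storeFacetValuesHtml() above with qs="title" stores the <title>
// text and stops after that single hit (uniqueField), while
// qs="description" keeps scanning and stores the content attribute of
// every matching <meta> tag, each written as field\0 value\0 with the
// "hash," prefix for string facets as described above.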
bool XmlDoc::storeFacetValuesXml(char *qs, SafeBuf *sb, FacetValHash_t fvh ) {
Xml *xml = getXml();
int32_t qsLen = gbstrlen(qs);
bool isString = false;
if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;
int32_t i = 0;
bool uniqueField = false;
SafeBuf nameBuf;
// scan all the xml nodes for tags whose compound name matches qs
for ( i = 0 ; i < xml->m_numNodes ; i++ ) {
// skip text nodes
if ( xml->m_nodes[i].m_nodeId == 0 ) continue;
// assemble the full parent name
// like "tag1.tag2.tag3"
nameBuf.reset();
xml->getCompoundName ( i , &nameBuf );
int32_t nameLen = nameBuf.length();
char *s = nameBuf.getBufStart();
// does the compound tag name match the requested facet field?
if ( nameLen != qsLen ) continue;
if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue;
// got a matching tag name; its text should be in the next node
if ( i + 1 >= xml->m_numNodes ) continue;
// point to the content! this is a text node?
// skip if not a text node, we don't return tag nodes i guess
if ( xml->m_nodes[i+1].m_nodeId ) continue;
char *content = xml->m_nodes[i+1].m_node;
int32_t contentLen = xml->m_nodes[i+1].m_nodeLen;
// skip if empty
if ( ! content || contentLen <= 0 ) continue;
// skip common cases too, like whitespace
if ( contentLen == 1 && is_wspace_a(content[0]) ) continue;
// hash it to match it if caller specified a particular hash
// because they are coming from Msg40::lookUpFacets() function
// to convert the hashes to strings, like for rendering in
// the facets box to the left of the search results
FacetValHash_t val32 = hash32 ( content, contentLen);
if ( fvh && fvh != val32 ) continue;
// otherwise add facet FIELD to our buf
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// then add facet VALUE
if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32))
return false;
if ( !sb->safeMemcpy(content,contentLen) ) return false;
if ( !sb->pushChar('\0') ) return false;
// if only one specified, we are done
if ( fvh ) return true;
if ( uniqueField ) return true;
}
return true;
}
bool XmlDoc::storeFacetValuesJSON (char *qs,
SafeBuf *sb,
FacetValHash_t fvh,
Json *jp ) {
JsonItem *ji = jp->getFirstItem();
char nb[1024];
SafeBuf nameBuf(nb,1024);
bool isString = false;
if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not number or string
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
// reset, but don't free mem etc. just set m_length to 0
nameBuf.reset();
// get its full compound name like "meta.twitter.title"
ji->getCompoundName ( nameBuf );
// skip if not for us
if ( strcmp(nameBuf.getBufStart(),qs) ) continue;
//
// now Json.cpp decodes and stores the value into
// a buffer, so ji->getValue() should be decoded completely
//
int32_t vlen;
char *val = ji->getValueAsString( &vlen );
// hash it to match it if caller specified a particular hash
// because they are coming from Msg40::lookUpFacets() function
// to convert the hashes to strings, like for rendering in
// the facets box to the left of the search results
FacetValHash_t val32 = hash32 ( val , vlen );
if ( fvh && val32 != fvh )
continue;
// otherwise add facet FIELD to our buf
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// then add facet VALUE
if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32))
return false;
if ( val && vlen && ! sb->safeMemcpy(val,vlen) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// if wanted a specific string, then we are done
if ( fvh ) return true;
}
return true;
}
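// Illustrative example (not part of the build): storeFacetValuesJSON()
// matches on the full compound name, so for a doc like
//   {"product":{"price":"4.99","title":"Cool Widget"}}
// qs="product.price" stores "4.99" while qs="price" alone matches
// nothing, since getCompoundName() yields "product.price" for that
// item.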