#include "gb-include.h"
#include "hash.h"
#include "XmlDoc.h"
#include "Conf.h"
#include "Query.h"     // getFieldCode()
#include "Clusterdb.h" // g_clusterdb
#include "Collectiondb.h"
#include "iana_charset.h"
#include "Stats.h"
#include "Sanity.h"
#include "Speller.h"
#include "CountryCode.h"
#include "linkspam.h"
#include "Tagdb.h"
#include "Repair.h"
#include "HashTableX.h"
#include "LanguageIdentifier.h" // g_langId
#include "CountryCode.h" // g_countryCode
#include "sort.h"
#include "Wiki.h"
#include "Speller.h"
#include "SiteGetter.h"
#include "Synonyms.h"
#include "PageInject.h"
#include "HttpServer.h"
#include "Posdb.h"
#include "Highlight.h"
#include "Wiktionary.h"
#include "Parms.h"
#include "Domains.h"
#include "AdultCheck.h"
#include "Doledb.h"
#include "IPAddressChecks.h"
#include "PageRoot.h"
#include "BitOperations.h"
#include "Robots.h"
#include <pthread.h>
#include "JobScheduler.h"
#include "Process.h"
#include "Statistics.h"
#include "GbCompress.h"
#include "GbUtil.h"
#include "ScopedLock.h"
#include "Mem.h"
#include "UrlBlockList.h"
#include <fcntl.h>


#ifdef _VALGRIND_
#include <valgrind/memcheck.h>
#endif


#define SENT_UNITS 30

#define NUMTERMIDBITS 48 // was in RdbList but only used in XmlDoc

static void getWordToPhraseRatioWeights ( int64_t   pid1 , // pre phrase
					  int64_t   wid1 ,
					  int64_t   pid2 ,
					  int64_t   wid2 , // post word
					  float      *ww   ,
					  const HashTableX *tt1);

static void getMetaListWrapper ( void *state ) ;


#if 0
static void doneReadingArchiveFileWrapper ( int fd, void *state );
#endif

XmlDoc::XmlDoc() {
	// clear all fields in the titledb structure (which are the first fields in this class)
	memset(&m_headerSize, 0, (size_t)((char*)&ptr_firstUrl-(char*)&m_headerSize));

	m_esbuf.setLabel("exputfbuf");
	m_freed = false;
	m_contentInjected = false;
	m_wasContentInjected = false;

	// warc parsing stuff

	//m_coll  = NULL;
	m_ubuf = NULL;
	m_pbuf = NULL;
	m_rootDoc    = NULL;
	m_oldDoc     = NULL;
	m_printedMenu = false;
	// reset all *valid* flags to false
	void *p    = &m_VALIDSTART;
	void *pend = &m_VALIDEND;
	memset ( p , 0 , (char *)pend - (char *)p );//(int32_t)pend-(int32_t)p
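	// NOTE: the bulk memset above assumes every m_*Valid flag is declared
	// contiguously between the m_VALIDSTART and m_VALIDEND markers in
	// XmlDoc.h, e.g. (a sketch of the assumed declaration order):
	//
	//   char m_VALIDSTART;   // marker only
	//   bool m_ipValid;
	//   bool m_docIdValid;
	//   ...                  // every other *Valid flag
	//   char m_VALIDEND;     // marker only
	//
	// a flag declared outside that range would not be cleared here.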
	m_msg22Request.m_inUse = 0;
	m_indexedDoc = false;
	m_msg4Waiting = false;
	m_msg4Launched = false;
	m_dupTrPtr = NULL;
	m_oldTitleRec = NULL;
	m_filteredContent = NULL;
	m_filteredContentAllocSize = 0;
	m_metaList = NULL;
	m_metaListSize = 0;
	m_metaListAllocSize = 0;
	m_rootTitleRec = NULL;
	m_isIndexed = 0;	// may be -1
	m_isInIndex = false;
	m_wasInIndex = false;
	m_outlinkHopCountVector = NULL;
	m_extraDoc = NULL;
	m_statusMsg = NULL;
	m_errno = 0;
	m_docId = 0;

	reset();
}

XmlDoc::~XmlDoc() {
	setStatus("freeing this xmldoc");
	reset();
	m_freed = true;
}

void XmlDoc::reset ( ) {
	m_redirUrl.reset();

	m_updatedMetaData = false;

	m_ipStartTime = 0;
	m_ipEndTime   = 0;

	m_isImporting = false;

	m_printedMenu = false;

	m_bodyStartPos = 0;

	m_indexedTime = 0;

	m_metaList2.purge();

	m_mySiteLinkInfoBuf.purge();
	m_myPageLinkInfoBuf.purge();

	// we need to reset this to false
	m_useTimeAxis = false;

	m_loaded = false;

	m_indexedDoc = false;
	m_msg4Launched = false;

	m_doConsistencyTesting = g_conf.m_doConsistencyTesting;

	m_computedMetaListCheckSum = false;

	m_allHashed = false;

	m_doledbKey.n0 = 0LL;
	m_doledbKey.n1 = 0;

	m_wordSpamBuf.purge();
	m_fragBuf.purge();

	m_lastTimeStart = 0LL;

	m_req = NULL;
	m_abortMsg20Generation = false;

	m_storeTermListInfo = false;

	// for limiting # of iframe tag expansions
	m_numExpansions = 0;

	// . we are allowed to exit even while waiting for msg4 to complete;
	//   the outstanding adds should be saved in addsinprogress.dat
	if ( m_msg4Waiting ) {
		if(m_docIdValid)
			log("doc: resetting xmldoc with outstanding msg4. should "
			    "be saved in addsinprogress.dat. docid=%" PRIu64,m_docId);
		else
			log("doc: resetting xmldoc with outstanding msg4. should "
			    "be saved in addsinprogress.dat.");
	}

	m_pbuf = NULL;
	m_wts  = NULL;

	m_deleteFromIndex = false;

	if ( m_rootDocValid    ) nukeDoc ( m_rootDoc    );
	if ( m_oldDocValid     ) nukeDoc ( m_oldDoc     );
	if ( m_extraDocValid   ) nukeDoc ( m_extraDoc   );

	if ( m_linkInfo1Valid && ptr_linkInfo1 && m_freeLinkInfo1 ) {
		// it now points into m_myPageLinkInfoBuf !
		//mfree ( ptr_linkInfo1 , size_linkInfo1, "LinkInfo1");
		ptr_linkInfo1    = NULL;
		m_linkInfo1Valid = false;
	}

	if ( m_rawUtf8ContentValid && m_rawUtf8Content && !m_setFromTitleRec
	     // was content supplied by pageInject.cpp?
	     //! m_contentInjected ) {
	     ) {
		mfree ( m_rawUtf8Content, m_rawUtf8ContentAllocSize,"Xml3");
	}

	// reset this
	m_contentInjected = false;
	m_rawUtf8ContentValid = false;
	m_wasContentInjected = false;

	m_rootDoc = NULL;

	// if this is true, then only index if new
	m_newOnly = 0;
	m_skipContentHashCheck = false;

	if ( m_httpReplyValid && m_httpReply ) {
		mfree(m_httpReply,m_httpReplyAllocSize,"httprep");
		m_httpReply = NULL;
		m_httpReplyValid = false;
	}

	if ( m_filteredContentAllocSize ) {
		mfree (m_filteredContent,m_filteredContentAllocSize,"xdfc");
		m_filteredContent = NULL;
		m_filteredContentAllocSize = 0;
	}

	if ( m_metaList ) { // m_metaListValid && m_metaList ) {
		mfree ( m_metaList , m_metaListAllocSize , "metalist");
		m_metaList          = NULL;
		m_metaListSize      = 0;
		m_metaListAllocSize = 0;
	}

	if ( m_ubuf ) {
		mfree ( m_ubuf     , m_ubufAlloc         , "ubuf");
		m_ubuf = NULL;
	}

	m_titleRecBuf.purge();

	if ( m_dupTrPtr ) {
		mfree ( m_dupTrPtr , m_dupTrSize , "trecd" );
		m_dupTrPtr = NULL;
	}

	if ( m_oldTitleRecValid && m_oldTitleRec ) {
		mfree ( m_oldTitleRec , m_oldTitleRecSize , "treca" );
		m_oldTitleRec = NULL;
		m_oldTitleRecValid = false;
	}

	if ( m_rootTitleRecValid && m_rootTitleRec ) {
		mfree ( m_rootTitleRec , m_rootTitleRecSize , "treca" );
		m_rootTitleRec = NULL;
		m_rootTitleRecValid = false;
	}


	if ( m_outlinkHopCountVectorValid && m_outlinkHopCountVector ) {
		int32_t sz = m_outlinkHopCountVectorSize;
		mfree ( m_outlinkHopCountVector,sz,"ohv");
	}
	m_outlinkHopCountVector = NULL;

	// reset all *valid* flags to false
	void *p    = &m_VALIDSTART;
	void *pend = &m_VALIDEND;
	memset ( p , 0 , (char *)pend - (char *)p );

	m_hashedMetas = false;

	// Doc.cpp:
	m_mime.reset();
	m_words.reset();
	m_phrases.reset();
	m_bits.reset();
	m_sections.reset();
	m_countTable.reset();

	// other crap
	m_xml.reset();
	m_links.reset();
	m_bits2.reset();
	m_pos.reset();
	m_synBuf.reset();
	m_images.reset();
	m_countTable.reset();
	m_mime.reset();
	m_tagRec.reset();
	m_newTagBuf.reset();
	m_dupList.reset();
	m_msg8a.reset();
	m_msg13.reset();
	m_msge0.reset();
	m_msge1.reset();
	m_reply.reset();
	// more stuff skipped

	m_wtsTable.reset();
	m_wbuf.reset();
	m_pageLinkBuf.reset();
	m_siteLinkBuf.reset();
	m_esbuf.reset();
	m_tagRecBuf.reset();

	// origin of this XmlDoc
	m_setFromTitleRec    = false;
	m_setFromUrl         = false;
	m_setFromDocId       = false;
	m_setFromSpiderRec   = false;
	m_freeLinkInfo1      = false;

	m_checkedUrlFilters  = false;

	m_indexCode   = 0;
	m_masterLoop  = NULL;
	m_masterState = NULL;

	//m_isAddUrl = false;
	m_isInjecting = false;
	m_useFakeMime = false;
	m_useSiteLinkBuf = false;
	m_usePageLinkBuf = false;
	m_printInXml = false;

	m_check1   = false;
	m_check2   = false;
	m_prepared = false;

	// keep track of updates to the rdbs we have done, so we do not re-do
	m_listAdded                = false;
	m_copied1                  = false;
	m_updatingSiteLinkInfoTags = false;
	m_hashedTitle              = false;

	m_numRedirects             = 0;
	m_numOutlinksAdded         = 0;
	m_useRobotsTxt             = true;

	m_allowSimplifiedRedirs    = false;

	m_didDelay                 = false;
	m_didDelayUnregister       = false;
	m_calledMsg22e             = false;
	m_calledMsg22f             = false;
	m_calledMsg25              = false;
	m_calledSections           = false;
	m_calledThread             = false;
	m_loaded                   = false;

	m_setTr                    = false;

	m_recycleContent           = false;
	m_callback1                = NULL;
	m_callback2                = NULL;
	m_state                    = NULL;

	m_doingConsistencyCheck    = false;

	m_isChildDoc = false;

	// for utf8 content functions
	m_savedp       = NULL;
	m_oldp         = NULL;
	m_didExpansion = false;

	// Repair.cpp now explicitly sets these to false if needs to
	m_usePosdb     = true;
	m_useClusterdb = true;
	m_useLinkdb    = true;
	m_useSpiderdb  = true;
	m_useTitledb   = true;
	m_useTagdb     = true;
	m_useSecondaryRdbs = false;

	// used by Msg13.cpp only. kind of a hack.
	m_isSpiderProxy = false;

	// do not cache the http reply in msg13 etc.
	m_maxCacheAge = 0;

	// reset these ptrs too!
	void *px    = &ptr_firstUrl;
	void *pxend = &m_dummyEnd;
	memset ( px , 0 , (char *)pxend - (char *)px );

	//unclear if this would make things blow up:
	//m_errno = 0;
}

int64_t XmlDoc::logQueryTimingStart() {
	if ( !g_conf.m_logTimingQuery ) {
		return 0;
	}

	return gettimeofdayInMilliseconds();
}

void XmlDoc::logQueryTimingEnd(const char* function, int64_t startTime) {
	if ( !g_conf.m_logTimingQuery ) {
		return;
	}

	int64_t endTime = gettimeofdayInMilliseconds();
	int64_t diff = endTime - startTime;

	//if (diff > 5) {
		log( LOG_TIMING, "query: XmlDoc::%s took %" PRId64 " ms for docId=%" PRId64, function, diff, m_docId );
	//}
}
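
// Illustrative use of the timing pair above (a sketch, not tied to any
// particular caller): bracket the work to measure and hand the start value
// back in. Both calls are cheap no-ops when g_conf.m_logTimingQuery is off.
//
//   int64_t start = logQueryTimingStart();
//   ... do the query-related work ...
//   logQueryTimingEnd ( __func__ , start );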

int32_t XmlDoc::getSpideredTime ( ) {
	// stop if already set
	if ( m_spideredTimeValid ) return m_spideredTime;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return 0;

	// . set spider time to current time
	// . this might already be valid if we set it in
	//   getTestSpideredDate()
	m_spideredTime      = getTimeGlobal();
	m_spideredTimeValid = true;
	return m_spideredTime;
}

// . we need this so PageGet.cpp can get the cached web page
// . but not for Msg20::getSummary(), that uses XmlDoc::set(Msg20Request*)
// . returns false and sets g_errno on error
bool XmlDoc::set3 ( int64_t  docId       ,
		    const char   *coll        ,
		    int32_t       niceness    ) {

	reset();

	// this is true
	m_setFromDocId = true;

	m_docId       = docId;
	m_docIdValid  = true;
	m_niceness    = niceness;

	if ( ! setCollNum ( coll ) ) return false;

	return true;
}
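
// A minimal usage sketch for set3() (hypothetical caller, along the lines of
// how PageGet.cpp fetches a cached page by docid; the error path and the
// "gotDocWrapper" callback are made up for illustration):
//
//   XmlDoc *xd = new XmlDoc();
//   if ( ! xd->set3 ( docId , coll , 0 /*niceness*/ ) )
//       return sendErrorReply ( g_errno );           // hypothetical
//   xd->setCallback ( state , gotDocWrapper );       // hypothetical
//   if ( ! xd->loadFromOldTitleRec() ) return false; // blocked, callback fires
//   // returned true: loaded, or g_errno set (e.g. ENOTFOUND)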

static void loadFromOldTitleRecWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// make sure has not been freed from under us!
	if ( THIS->m_freed ) { g_process.shutdownAbort(true);}
	// note it
	THIS->setStatus ( "loading from old title rec wrapper" );
	// return if it blocked
	if ( ! THIS->loadFromOldTitleRec ( ) ) return;

	const char *coll = "";
	CollectionRec *cr = THIS->getCollRec();
	if ( cr ) coll = cr->m_coll;

	// error?
	if ( g_errno ) log("doc: loadfromtitlerec coll=%s: %s",
			   coll,
			   mstrerror(g_errno));
	// otherwise, all done, call the caller callback
	THIS->callCallback();
}

// returns false if blocked, returns true and sets g_errno on error otherwise
bool XmlDoc::loadFromOldTitleRec ( ) {
	// . we are an entry point.
	// . if anything blocks, this will be called when it comes back
	if ( ! m_masterLoop ) {
		m_masterLoop  = loadFromOldTitleRecWrapper;
		m_masterState = this;
	}
	// if we already loaded!
	if ( m_loaded ) return true;
	// if set from a docid, use msg22 for this!
	char **otr = getOldTitleRec ( );
	// error?
	if ( ! otr ) return true;
	// blocked?
	if ( otr == (void *)-1 ) return false;
	// this is a not found
	if ( ! *otr ) {
		// so we do not retry
		m_loaded = true;
		// make it an error
		g_errno = ENOTFOUND;
		return true;
	}
	CollectionRec *cr = getCollRec();
	if ( ! cr ) return true;
	// use that. decompress it! this will also set
	// m_setFromTitleRec to true
	if ( ! set2 ( m_oldTitleRec     ,
		      m_oldTitleRecSize , // maxSize
		      cr->m_coll            ,
		      NULL              , // pbuf
		      m_niceness        )) {
		// we are now loaded, do not re-call
		m_loaded = true;
		// return true with g_errno set on error uncompressing
		return true;
	}
	// we are now loaded, do not re-call
	m_loaded = true;
	// sanity check
	if ( ! m_titleRecBufValid ) { g_process.shutdownAbort(true); }
	// good to go
	return true;
}

bool XmlDoc::setCollNum ( const char *coll ) {
	CollectionRec *cr;
	cr = g_collectiondb.getRec ( coll , strlen(coll) );
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		log(LOG_WARN, "build: collrec not found for %s",coll);
		return false;
	}
	// we can store this safely:
	m_collnum = cr->m_collnum;
	m_collnumValid = true;
	return true;
}



CollectionRec *XmlDoc::getCollRec ( ) {
	if ( ! m_collnumValid ) { g_process.shutdownAbort(true); }
	CollectionRec *cr = g_collectiondb.getRec(m_collnum);
	if ( ! cr ) {
		log("build: got NULL collection rec for collnum=%" PRId32".",
		    (int32_t)m_collnum);
		g_errno = ENOCOLLREC;
		return NULL;
	}
	// was it reset since we started spidering this url?
	// we don't do it this way any more; when resetting a coll we delete it
	// and re-add it under a different collnum to avoid getting msg4 adds to it.
	//if ( cr->m_lastResetCount != m_lastCollRecResetCount ) {
	//	log("build: collection rec was reset. returning null.");
	//	g_errno = ENOCOLLREC;
	//	return NULL;
	//}
	return cr;
}



// returns false and sets g_errno on error
bool XmlDoc::set4 ( SpiderRequest *sreq      ,
		    key96_t         *doledbKey ,
		    const char     *coll      ,
		    SafeBuf       *pbuf      ,
		    int32_t        niceness  ,
		    char          *utf8ContentArg ,
		    bool           deleteFromIndex ,
		    int32_t        forcedIp ,
		    uint8_t        contentType ,
		    uint32_t       spideredTime ,
		    bool           contentHasMimeArg) {

	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	// sanity check
	if ( sreq->m_dataSize == 0 ) { g_process.shutdownAbort(true); }

	reset();

	logDebug(g_conf.m_logDebugSpider, "xmldoc: set4 uh48=%" PRIu64" parentdocid=%" PRIu64,
	         sreq->getUrlHash48(),sreq->getParentDocId());

	// used by PageSpiderdb.cpp
	m_startTime      = gettimeofdayInMilliseconds();
	m_startTimeValid = true;

	// this is true
	m_setFromSpiderRec = true;

	// did page inject (pageinject) request to delete it?
	m_deleteFromIndex = deleteFromIndex;

	// PageReindex.cpp will set this in the spider request
	if ( sreq->m_forceDelete ) {
		m_deleteFromIndex = true;
	}

	char *utf8Content = utf8ContentArg;

	if ( contentHasMimeArg && utf8Content ) {
		// get length of it all
		int32_t clen = strlen(utf8Content);
		// return true on error with g_errno set
		if ( ! m_mime.set ( utf8ContentArg , clen , NULL ) ) {
			if ( ! g_errno ) g_errno = EBADMIME;
			log("xmldoc: could not set mime: %s", mstrerror(g_errno));
			logTrace( g_conf.m_logTraceXmlDoc, "END, returning false. Mime problem." );
			return false;
		}
		// it's valid
		m_mimeValid = true;
		// advance
		utf8Content = m_mime.getContent();
	}

	// use this to avoid ip lookup if it is not zero
	if ( forcedIp ) {
		m_ip = forcedIp;
		m_ipValid = true;
	}

	// sometimes they supply the content they want! like when zaks'
	// injects pages from PageInject.cpp
	if ( utf8Content ) {
		// . this is the most basic content from the http reply
		// . only set this since sometimes it is facebook xml and
		//   contains encoded html which needs to be decoded.
		//   like <name>Ben &amp; Jerry's</name> otherwise are
		//   sentence formation stops at the ';' in the "&amp;" and
		//   we also index "amp" which is bad.
		m_content             = utf8Content;
		if ( m_mimeValid && m_mime.getContentLen() > 0) {
			m_contentLen = m_mime.getContentLen();
		} else {
			m_contentLen = strlen(utf8Content);
		}

		m_contentValid        = true;

		m_contentInjected     = true;
		m_wasContentInjected  = true;
		m_contentType         = contentType;
		m_contentTypeValid    = true;
		// use this ip as well for now to avoid ip lookup
		//m_ip      = atoip("127.0.0.1");
		//m_ipValid = true;
		// do not need robots.txt then
		m_isAllowed      = true;
		m_isAllowedValid = true;
		// nor mime
		m_httpStatus      = 200;
		m_httpStatusValid = true;
		// this too
		m_downloadStatus      = 0;
		m_downloadStatusValid = true;
		// assume this is the download time since the content
		// was pushed/provided to us
		if ( spideredTime )
			m_downloadEndTime = spideredTime;
		else
			m_downloadEndTime = gettimeofdayInMilliseconds();
		// either way, validate it
		m_downloadEndTimeValid = true;
		// and need a legit mime
		if ( ! m_mimeValid ) {
			m_mime.setBufLen(1);
			m_mimeValid      = true;
			m_mime.setContentType(contentType);
		}
		m_isContentTruncated      = false;
		m_isContentTruncatedValid = true;
		// no redir
		ptr_redirUrl    = NULL;
		size_redirUrl   = 0;
		m_redirUrl.reset();
		m_redirUrlPtr   = NULL;//&m_redirUrl;
		m_redirUrlValid = true;
		m_redirErrorValid = true;
		m_redirError      = 0;
		m_crawlDelay      = -1;
		m_crawlDelayValid = true;
	}

	// override the content type based on the mime if one was supplied (e.g. application/json)
	if ( m_mimeValid ) {
		m_contentType = m_mime.getContentType();
		m_contentTypeValid = true;
	}


	//m_coll      = coll;
	m_pbuf      = pbuf;
	m_niceness  = niceness;
	m_version   = TITLEREC_CURRENT_VERSION;
	m_versionValid = true;

	/*
	// set min/max pub dates right away
	m_minPubDate = -1;
	m_maxPubDate = -1;
	// parentPrevSpiderTime is 0 if that was the first time that the
	// parent was spidered, in which case isNewOutlink will always be set
	// for every outlink it had!
	if ( sreq->m_isNewOutlink && sreq->m_parentPrevSpiderTime ) {
		// sanity check
		if ( ! sreq->m_parentPrevSpiderTime ) {g_process.shutdownAbort(true);}
		// pub date is somewhere between these two times
		m_minPubDate = sreq->m_parentPrevSpiderTime;
		m_maxPubDate = sreq->m_addedTime;
	}
	*/

	// this is used for removing the rec from doledb after we spider it
	m_doledbKey.setMin();
	if ( doledbKey ) m_doledbKey = *doledbKey;

	m_sreqValid    = true;

	// store the whole rec, key+dataSize+data, in case it disappears.
	gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );

	// set m_collnum etc.
	if ( ! setCollNum ( coll ) )
	{
		log("XmlDoc: set4() coll %s invalid",coll);
		logTrace( g_conf.m_logTraceXmlDoc, "END, returning false. Collection invalid" );
		return false;
	}

	// it should be valid since we just set it
	CollectionRec *cr = getCollRec();

	m_useRobotsTxt = cr ? cr->m_useRobotsTxt : true;

	// fix some corruption i've seen
	if ( m_sreq.m_urlIsDocId && ! is_digit(m_sreq.m_url[0]) ) {
		log("xmldoc: fixing sreq %s to non docid",m_sreq.m_url);
		m_sreq.m_urlIsDocId = 0;
	}

	// if url is a docid... we are from pagereindex.cpp
	//if ( sreq->m_isPageReindex ) {
	// now we can have url-based page reindex requests because
	// if we have a diffbot json object fake url reindex request
	// we add a spider request of the PARENT url for it as page reindex
	//if ( is_digit ( sreq->m_url[0] ) ) {
	// watch out for 0.r.msn.com!!
	if ( m_sreq.m_urlIsDocId ) {
		m_docId          = atoll(m_sreq.m_url);
		// assume its good
		m_docIdValid     = true;
		// similar to set3() above
		m_setFromDocId   = true;
		// use content and ip from old title rec to save time
		// . crap this is making the query reindex not actually
		//   re-download the content.
		// . we already check the m_deleteFromIndex flag below
		//   in getUtf8Content() and use the old content in that case
		//   so i'm not sure why we are recycling here, so take
		//   this out. MDW 9/25/2014.
		//m_recycleContent = true;
		// sanity
		if ( m_docId == 0LL ) { g_process.shutdownAbort(true); }
	}
	else {
		logTrace( g_conf.m_logTraceXmlDoc, "Calling setFirstUrl with [%s]", m_sreq.m_url);
		setFirstUrl ( m_sreq.m_url );
		// you can't call this from a docid based url until you
		// know the uh48
		//setSpideredTime();
	}

	// now query reindex can specify a recycle content option so it
	// can replace the rebuild tool. try to recycle on global index.
	if ( m_sreqValid )
		m_recycleContent = m_sreq.m_recycleContent;

	logTrace( g_conf.m_logTraceXmlDoc, "END, returning true" );
	return true;
}



// . set our stuff from the TitleRec (from titledb)
// . returns false and sets g_errno on error
bool XmlDoc::set2 ( char    *titleRec ,
		    int32_t     maxSize  ,
		    const char    *coll     ,
		    SafeBuf *pbuf     ,
		    int32_t     niceness ,
		    SpiderRequest *sreq ) {

	// NO! can't do this. see below
	//reset();

	setStatus ( "setting xml doc from title rec");

	// . it resets us, so save this
	// . we only save these for set2() not the other sets()!
	//void (*cb1)(void *state) = m_callback1;
	//bool (*cb2)(void *state) = m_callback2;
	//void *state = m_state;

	// . clear it all out
	// . no! this is clearing our msg20/msg22 reply...
	// . ok, but repair.cpp needs it so do it there then
	//reset();

	// restore callbacks
	//m_callback1 = cb1;
	//m_callback2 = cb2;
	//m_state     = state;

	// sanity check - since we do not reset
	if ( m_contentValid ) { g_process.shutdownAbort(true); }

	// this is true
	m_setFromTitleRec = true;

	// this is valid i guess. includes key, etc.
	//m_titleRec      = titleRec;
	//m_titleRecSize  = *(int32_t *)(titleRec+12) + sizeof(key96_t) + 4;
	//m_titleRecValid = true;
	// . should we free m_cbuf on our reset/destruction?
	// . no because doConsistencyCheck calls XmlDoc::set2 with a titleRec
	//   that should not be freed, besides the alloc size is not known!
	//m_freeTitleRec  = false;

	// it must be there!
	if ( !titleRec ) { g_errno=ENOTFOUND; return false; }

	int32_t titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key96_t) + 4;

	// it must be there!
	if ( titleRecSize==0 ) { g_errno=ENOTFOUND; return false; }

	// . should we free m_cbuf on our reset/destruction?
	// . no because doConsistencyCheck calls XmlDoc::set2 with a titleRec
	//   that should not be freed, besides the alloc size is not known!
	if( !m_titleRecBuf.setBuf(	titleRec,
								titleRecSize,	// bufmax
								titleRecSize,	// bytes in use
								false) ) {		// ownData?
		log(LOG_ERROR, "m_titleRecBuf.setBuf of size %" PRId32 " failed", titleRecSize);
		gbshutdownLogicError();
	}
	m_titleRecBufValid = true;

	//m_coll               = coll;
	m_pbuf               = pbuf;
	m_niceness           = niceness;

	// set our collection number
	if ( ! setCollNum ( coll ) ) return false;

	// store the whole rec, key+dataSize+data, in case it disappears.
	if ( sreq ) {
		gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );
		m_sreqValid = true;
	}

	m_hashedTitle        = false;
	m_hashedMetas        = false;

	// save the compressed buffer in case we should free it when done
	//m_titleRec = titleRec;
	// should we free m_cbuf on our reset/destruction?
	//m_freeTitleRec = true;
	// our record may not occupy all of m_cbuf, careful
	//m_titleRecAllocSize = maxSize;
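	// Rough on-disk layout of a titledb record as parsed below (a sketch
	// reconstructed from this function, not a formal spec):
	//
	//   key96_t  key        // 12 bytes, docId embedded, low bit 1 = positive rec
	//   int32_t  dataSize   // number of bytes that follow this field
	//   int32_t  ubufSize   // size of the uncompressed blob
	//   char     cdata[]    // zlib-compressed blob, dataSize - 4 bytes
	//
	// The uncompressed blob is [header][string data]; the header spans the
	// fixed members from m_headerSize up to ptr_firstUrl, and its first two
	// bytes are the header size itself.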

	// get a parse ptr
	char *p = titleRec;

	// . this is just like a serialized RdbList key/dataSize/data of 1 rec
	// . first thing is the key
	// . key should have docId embedded in it
	m_titleRecKey =  *(key96_t *) p ;
	//m_titleRecKeyValid = true;
	p += sizeof(key96_t);

	// bail on error
	if ( (m_titleRecKey.n0 & 0x01) == 0x00 ) {
		g_errno = EBADTITLEREC;
		log(LOG_ERROR, "db: Titledb record is a negative key.");
		g_process.shutdownAbort(true);
	}

	int64_t docId = Titledb::getDocIdFromKey(&m_titleRecKey);
	if (m_docIdValid) {
		// validate docId if already set
		if (m_docId != docId) {
			log(LOG_ERROR, "db: Mismatched in docid. Requested docId=%" PRId64 " but got docId=%" PRId64, m_docId, docId);
			gbshutdownLogicError();
		}
	} else {
		m_docId = docId;
		m_docIdValid = true;
	}

	// then the size of the data that follows this
	int32_t dataSize =  *(int32_t *) p ;
	p += 4;
	// bail on error
	if ( dataSize < 4 ) {
		g_errno = EBADTITLEREC;
		log(LOG_ERROR, "TITLEDB CORRUPTION. Record has size of %" PRId32" which is too small. Probable disk corruption in a titledb file. DocId=%" PRId64 "", dataSize, m_docId);
		gbshutdownLogicError();
		// return false;
	}


	// what is the size of cbuf/titleRec in bytes?
	int32_t cbufSize = dataSize + 4 + sizeof(key96_t);
	// . the actual data follows "dataSize"
	// . next is the uncompressed size of the compressed data below
	m_ubufSize = *(int32_t  *) p ; p += 4;

	// . because of disk/network data corruption this may be wrong!
	// . we can now have absolutely huge titlerecs...
	if ( m_ubufSize == 0 ) { //m_ubufSize > 2*1024*1024 || m_ubufSize < 0 )
		g_errno = EBADTITLEREC;
		log(LOG_ERROR, "POSSIBLE TITLEDB CORRUPTION. Uncompressed size=%" PRId32", docId=%" PRId64 ", dataSize=%" PRId32 ", cbufSize=%" PRId32 "", m_ubufSize, m_docId, dataSize, cbufSize);
		loghex(LOG_ERROR, titleRec, (cbufSize < 400 ? cbufSize : 400), "titleRec (first max. 400 bytes)");
		return false;
		//gbshutdownLogicError();
		//return false;
	}


	if ( m_ubufSize < 0 ) { //m_ubufSize > 2*1024*1024 || m_ubufSize < 0 )
		g_errno = EBADTITLEREC;
		log(LOG_ERROR, "TITLEDB CORRUPTION. Uncompressed size=%" PRId32", docId=%" PRId64 ", dataSize=%" PRId32 ", cbufSize=%" PRId32 "", m_ubufSize, m_docId, dataSize, cbufSize);
		loghex(LOG_ERROR, titleRec, (cbufSize < 400 ? cbufSize : 400), "titleRec (first max. 400 bytes)");

		gbshutdownLogicError();
		//return false;
	}

	// trying to uncompress corrupt titlerecs sometimes results in
	// a seg fault... watch out
	if ( m_ubufSize > 100*1024*1024 ) {
		g_errno = EBADTITLEREC;
		log(LOG_ERROR, "TITLEDB CORRUPTION. Uncompressed size=%" PRId32" > 100MB. unacceptable, probable corruption. docId=%" PRId64 "", m_ubufSize, m_docId);
		loghex(LOG_ERROR, titleRec, (cbufSize < 400 ? cbufSize : 400), "titleRec (first max. 400 bytes)");
		gbshutdownLogicError();
		//return false;
	}

	// make buf space for holding the uncompressed stuff
	m_ubufAlloc = m_ubufSize;
	m_ubuf = (char *) mmalloc ( m_ubufAlloc ,"TitleRecu1");

	if ( ! m_ubuf ) {
		// we had bad ubufsizes on gb6, like > 1GB. print out the key
		// so we can manually make a titledb.dat file to delete these
		// bad keys
		log("build: alloc failed ubufsize=%" PRId32" key.n1=%" PRIu32" n0=%" PRIu64,
		    m_ubufAlloc,m_titleRecKey.n1,m_titleRecKey.n0);
		return false;
	}
	// uncompress is weird, sometimes it needs more space than it
	// should. see how much it actually took.
	int32_t realSize = m_ubufSize;
	// time it
	int64_t startTime = gettimeofdayInMilliseconds();
	// debug msg

	setStatus( "Uncompressing title rec." );
	// . uncompress the data into m_ubuf
	// . m_ubufSize should remain unchanged since we stored it
	int err = gbuncompress ( (unsigned char *)  m_ubuf ,
				 (uint32_t *) &realSize   ,
				 (unsigned char *)  p ,
				 (uint32_t  ) (dataSize - 4) );
	// hmmmm...
	if ( err == Z_BUF_ERROR ) {
		log(LOG_ERROR, "!!! Buffer is too small to hold uncompressed document. Probable disk corruption in a titledb file.");
		g_errno = EUNCOMPRESSERROR;
		return false;
	}
	// set g_errno and return false on error
	if ( err != Z_OK ) {
		g_errno = EUNCOMPRESSERROR;
		log(LOG_ERROR, "!!! Uncompress of document failed. ZG_ERRNO=%i. cbufSize=%" PRId32" ubufsize=%" PRId32" realSize=%" PRId32,
		    err , cbufSize , m_ubufSize , realSize );
		return false;
	}

	if ( realSize != m_ubufSize ) {
		log(LOG_ERROR,"CORRUPTED TITLEREC detected for docId %" PRId64 "", m_docId);
		gbshutdownLogicError();
		//g_errno = EBADENGINEER;
		//log(LOG_WARN, "db: Uncompressed document size is not what we recorded it to be. Probable disk corruption in a titledb file.");
		//return false;
	}

	// . add the stat
	// . use white for the stat
	g_stats.addStat_r(0, startTime, gettimeofdayInMilliseconds(), 0x00ffffff);

	// first 2 bytes in m_ubuf are the header size
	int32_t headerSize = *(uint16_t *)m_ubuf;

	int32_t shouldbe = (char *)&ptr_firstUrl - (char *)&m_headerSize;

	if ( headerSize != shouldbe ) {
		log(LOG_ERROR,"CORRUPTED TITLEREC detected for docId %" PRId64 "", m_docId);
		gbshutdownLogicError();
		//g_errno = ECORRUPTDATA;
		//return false;
	}

	// set our easy stuff
	gbmemcpy ( (void *)this , m_ubuf , headerSize );

	// NOW set the XmlDoc::ptr_* and XmlDoc::size_* members
	// like in Msg.cpp and Msg20Reply.cpp
	if ( m_pbuf ) {
		int32_t crc = hash32(m_ubuf,headerSize);
		m_pbuf->safePrintf("crchdr=0x%" PRIx32" sizehdr=%" PRId32", ",
				   crc,headerSize);
	}


	// point to the string data
	char *up = m_ubuf + headerSize;

	// end of the rec
	char *upend = m_ubuf + m_ubufSize;

	// how many XmlDoc::ptr_* members do we have? set "np" to that
	int32_t np = ((char *)&size_firstUrl  - (char *)&ptr_firstUrl) ;
	np /= sizeof(char *);

	// point to the first ptr
	char **pd = (char **)&ptr_firstUrl;
	// point to the first size
	int32_t *ps = (int32_t *)&size_firstUrl;
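	// Each ptr_/size_ member pair is serialized as an optional
	// [int32 size][size bytes of data] chunk, present only when its bit in
	// m_internalFlags1 is set. A rough sketch for two present members:
	//
	//   m_internalFlags1 : ... 0 1 0 1          (bits 0 and 2 set)
	//   string data      : [size0][data0][size2][data2]
	//
	// so the loop below walks "up" forward in declaration order, assigning
	// ptr_xxx/size_xxx for present members and zeroing the rest.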

	// loop over them
	for ( int32_t i = 0 ; i < np ; i++ , pd++ , ps++ ) {
		// zero out the ith ptr_ and size_ member
		*pd = 0;
		*ps = 0;
		// make the mask
		uint32_t mask = 1 << i ;
		// do we have this member? skip if not.
		if ( ! (m_internalFlags1 & mask) ) {
			continue;
		}

		// watch out for corruption
		if ( up > upend ) {
			log(LOG_ERROR,"CORRUPTED TITLEREC detected for docId %" PRId64 "", m_docId);
			gbshutdownLogicError();
			//g_errno = ECORRUPTDATA;
			//return false;
		}

		// get the size
		*ps = *(int32_t *)up;
		// this should never be 0, otherwise, why was its flag set?
		if ( *ps <= 0 ) {
			log(LOG_ERROR,"CORRUPTED TITLEREC detected for docId %" PRId64 "", m_docId);
			gbshutdownLogicError();
		}
		// skip over to point to data
		up += 4;

		// point to the data. could be 64-bit ptr.
		*pd = up;//(int32_t)up;

		// Sanity - bail if size set, but no data
		if( *ps && !*pd ) {
			log(LOG_ERROR,"CORRUPTED TITLEREC detected for docId %" PRId64 "", m_docId);
			gbshutdownLogicError();
		}

		// debug
		if ( m_pbuf ) {
			int32_t crc = hash32(up,*ps);
			m_pbuf->safePrintf("crc%" PRId32"=0x%" PRIx32" size%" PRId32"=%" PRId32", ",
					   i,crc,i,*ps);
		}
		// skip over data
		up += *ps;

		// watch out for corruption
		if ( up > upend ) {
			log(LOG_ERROR,"CORRUPTED TITLEREC detected for docId %" PRId64 "", m_docId);
			gbshutdownLogicError();
			//g_errno = ECORRUPTDATA;
			//return false;
		}
	}
	// cap it
	char *pend = m_ubuf + m_ubufSize;
	// sanity check. must match exactly.
	if ( up != pend ) { g_process.shutdownAbort(true); }

	// set the urls i guess
	m_firstUrl.set   ( ptr_firstUrl );
	if ( ptr_redirUrl ) {
		m_redirUrl.set   ( ptr_redirUrl );
		m_currentUrl.set ( ptr_redirUrl );
		m_currentUrlValid = true;
		m_redirUrlPtr     = &m_redirUrl;
	}
	else {
		m_currentUrl.set ( ptr_firstUrl );
		m_currentUrlValid = true;
		m_redirUrlPtr     = NULL;
	}
	m_firstUrlValid               = true;
	m_redirUrlValid               = true;

	// validate *shadow* members since bit flags cannot be returned
	m_isRSS2              = m_isRSS;
	m_isPermalink2        = m_isPermalink;
	m_isAdult2            = m_isAdult;
	m_spiderLinks2        = m_spiderLinks;
	m_isContentTruncated2 = m_isContentTruncated;
	m_isLinkSpam2         = m_isLinkSpam;
	m_isSiteRoot2         = m_isSiteRoot;

	// these members are automatically validated
	m_ipValid                     = true;
	m_spideredTimeValid           = true;
	m_indexedTimeValid            = true;

	m_outlinksAddedDateValid      = true;
	m_charsetValid                = true;
	m_countryIdValid              = true;

	// new stuff
	m_siteNumInlinksValid         = true;
	m_metaListCheckSum8Valid      = true;

	m_hopCountValid               = true;
	m_langIdValid                 = true;
	m_contentTypeValid            = true;
	m_isRSSValid                  = true;
	m_isPermalinkValid            = true;
	m_isAdultValid                = true;
	m_spiderLinksValid            = true;
	m_isContentTruncatedValid     = true;
	m_isLinkSpamValid             = true;
	m_tagRecDataValid             = true;
	m_contentHash32Valid          = true;
	m_tagPairHash32Valid          = true;
	m_imageDataValid              = true;
	m_utf8ContentValid            = true;
	m_siteValid                   = true;
	m_linkInfo1Valid              = true;
	m_versionValid                = true;
	m_httpStatusValid             = true;
	m_crawlDelayValid             = true;
	m_isSiteRootValid             = true;

	// there was no issue indexing it...
	m_indexCode       = 0;
	m_indexCodeValid  = true;
	m_redirError      = 0;
	m_redirErrorValid = true;

	// stop core when importing and calling getNewSpiderReply()
	m_downloadEndTime = m_spideredTime;
	m_downloadEndTimeValid = true;

	// must not be negative
	if ( m_siteNumInlinks < 0 ) { g_process.shutdownAbort(true); }


	// sanity check. if m_siteValid is true, this must be there
	if ( ! ptr_site ) {
		log("set2: ptr_site is null for docid %" PRId64,m_docId);
		//g_process.shutdownAbort(true); }
		g_errno = ECORRUPTDATA;
		return false;
	}

	// success, return true then
	return true;
}


bool XmlDoc::setFirstUrl ( const char *u ) {
	m_firstUrl.reset();
	m_currentUrl.reset();

	m_firstUrlValid = true;

	// assume url is not in the correct format
	ptr_firstUrl  = NULL;
	size_firstUrl = 0;

	if ( ! u || ! u[0] ) {
		//if ( ! m_indexCode ) m_indexCode = EBADURL;
		return true;
	}

	//if ( strlen (u) + 1 > MAX_URL_LEN )
	//	m_indexCode = EURLTOOLONG;

	m_firstUrl.set( u );

	// it is the active url
	m_currentUrl.set ( &m_firstUrl );
	m_currentUrlValid = true;

	// set this to the normalized url
	ptr_firstUrl  = m_firstUrl.getUrl();
	size_firstUrl = m_firstUrl.getUrlLen() + 1;

	return true;
}



void XmlDoc::setStatus ( const char *s ) {
	bool timeIt = false;
	if ( g_conf.m_logDebugBuildTime )
		timeIt = true;

	// log times to detect slowness
	if ( timeIt  && m_statusMsgValid ) {
		int64_t now = gettimeofdayInMilliseconds();
		if ( m_lastTimeStart == 0LL ) m_lastTimeStart = now;
		int32_t took = now - m_lastTimeStart;
		//if ( took > 100 )
			log("xmldoc: %s (xd=0x%" PTRFMT" "
			    "u=%s) took %" PRId32"ms",
			    m_statusMsg,
			    (PTRTYPE)this,
			    m_firstUrl.getUrl(),
			    took);
		m_lastTimeStart = now;
	}

	m_statusMsg = s;
	m_statusMsgValid = true;

	bool logIt = g_conf.m_logDebugBuild;
	if ( ! logIt ) return;

	if ( m_firstUrlValid )
		logf(LOG_DEBUG,"build: status = %s for %s (this=0x%" PTRFMT")",
		     s,m_firstUrl.getUrl(),(PTRTYPE)this);
	else
		logf(LOG_DEBUG,"build: status = %s for docId %" PRId64" "
		     "(this=0x%" PTRFMT")",
		     s,m_docId, (PTRTYPE)this);
}

// callers must now call XmlDoc::setCallback() to register their completion callback
void XmlDoc::setCallback ( void *state, void (* callback) (void *state) ) {
	m_state     = state;
	m_callback1 = callback;
	// add this additional state==this constraint to prevent core when
	// doing a page parser
	if ( state == this &&
	     // i don't remember why i added this sanity check...
	     callback == getMetaListWrapper ) { g_process.shutdownAbort(true); }
}

void XmlDoc::setCallback ( void *state, bool (*callback) (void *state) ) {
	m_state     = state;
	m_callback2 = callback;
}
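
// A small sketch of the callback contract (the "State" struct and
// "myDoneWrapper" below are hypothetical): register a completion callback
// before calling an entry point that may block; the wrapper functions in this
// file then drive m_masterLoop and invoke the callback when done.
//
//   static void myDoneWrapper ( void *state ) {
//       State *st = (State *)state;
//       // ... g_errno is set here if something went wrong ...
//   }
//
//   st->m_xd.setCallback ( st , myDoneWrapper );
//   if ( ! st->m_xd.indexDoc() ) return;   // blocked; myDoneWrapper called later
//   // indexDoc() finished synchronously; handle the result now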



static void indexDoc3(void *state) {
	XmlDoc *that = reinterpret_cast<XmlDoc*>(state);
	logTrace( g_conf.m_logTraceXmlDoc, "Calling XmlDoc::indexDoc" );
	// return if it blocked
	if (!that->indexDoc()) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, indexDoc blocked");
		return;
	}
	
	// otherwise, all done, call the caller callback
	
	that->m_indexedDoc = true;

	logTrace(g_conf.m_logTraceXmlDoc, "END");
}

static void indexedDoc3(void *state, job_exit_t exit_type) {
	XmlDoc *that = reinterpret_cast<XmlDoc*>(state);
	if(that->m_indexedDoc) {
		logTrace(g_conf.m_logTraceXmlDoc, "Calling callback");
		that->callCallback();
	}
}


static void indexDocWrapper ( void *state ) {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	XmlDoc *THIS = (XmlDoc *)state;
	// make sure has not been freed from under us!
	if ( THIS->m_freed ) { g_process.shutdownAbort(true);}
	// note it
	THIS->setStatus ( "in index doc wrapper" );

#if 0
	//Running indexDoc()/indexDoc2()/IndexDoc3()/getMetaList() causes a bug in the
	//masterloop/callback logic to manifest itself. It is tricky to track down so
	//disable job submission for now until we have time to clean up the callback
	//logic. The downside is that some large documents can temporarily stall the
	//main thread.

	//shovel this off to a thread
	if(g_jobScheduler.submit(&indexDoc3,indexedDoc3,THIS,thread_type_spider_index,THIS->m_niceness)) {
		//excellent
		logTrace( g_conf.m_logTraceXmlDoc, "END, queued for thread" );
		return;
	}
#endif
	// threads not available (or oom or similar)
	indexDoc3(THIS);
	indexedDoc3(THIS, job_exit_normal);
}


// . the highest level function in here
// . user is requesting to inject this url
// . returns false if blocked and your callback will be called when done
// . returns true and sets g_errno on error
bool XmlDoc::injectDoc ( const char *url ,
			 CollectionRec *cr ,
			 char *content ,
			 bool contentHasMimeArg ,
			 int32_t hopCount,
			 int32_t charset,
			 int32_t langId,
			 bool deleteUrl,
			 const char *contentTypeStr, // text/html application/json
			 bool spiderLinks ,
			 char newOnly, // index iff new
			 bool skipContentHashCheck,
			 void *state,
			 void (*callback)(void *state) ,

			 uint32_t firstIndexed,
			 uint32_t lastSpidered ,
			 int32_t injectDocIp
			 ) {

	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	// normalize url
	Url uu;
	uu.set( url );

	// remove >'s i guess and store in st1->m_url[] buffer
	char cleanUrl[MAX_URL_LEN+1];
	cleanInput ( cleanUrl,
		     sizeof(cleanUrl),
		     uu.getUrl(),
		     uu.getUrlLen() );


	int32_t contentType = CT_UNKNOWN;
	if ( contentTypeStr && contentTypeStr[0] )
		contentType = getContentTypeFromStr(contentTypeStr, strlen(contentTypeStr));

	// use CT_HTML if contentTypeStr is empty or blank. default
	if ( ! contentTypeStr || ! contentTypeStr[0] )
		contentType = CT_HTML;

	// this can go on the stack since set4() copies it
	SpiderRequest sreq;
	sreq.setFromInject ( cleanUrl );

	if ( lastSpidered )
		sreq.m_addedTime = lastSpidered;

	if ( deleteUrl )
		sreq.m_forceDelete = 1;

	// . use the enormous power of our new XmlDoc class
	// . this returns false with g_errno set on error
	if ( ! set4 ( &sreq       ,
		      NULL        ,
		      cr->m_coll  ,
		      NULL        , // pbuf
		      // from PageInject.cpp:
		      // give it a niceness of 1, we have to be
		      // careful since we are a niceness of 0!!!!
		      1, // niceness, // 1 ,
		      // inject this content
		      content ,
		      deleteUrl, // false, // deleteFromIndex ,
		      injectDocIp, // 0,//forcedIp ,
		      contentType ,
		      lastSpidered, // lastSpidered override
		      contentHasMimeArg )) {
		// g_errno should be set if that returned false
		logTrace( g_conf.m_logTraceXmlDoc, "END, returning true. set4 returned false" );
		if ( ! g_errno ) { g_process.shutdownAbort(true); }
		return true;
	}

	// othercrap. used for importing from titledb of another coll/cluster.
	if ( firstIndexed ) {
		m_firstIndexedDate = firstIndexed;
		m_firstIndexedDateValid = true;
	}

	if ( lastSpidered ) {
		m_spideredTime      = lastSpidered;
		m_spideredTimeValid = true;
	}

	if ( hopCount != -1 ) {
		m_hopCount = hopCount;
		m_hopCountValid = true;
	}

	// PageInject calls memset on gigablastrequest so add '!= 0' here
	if ( charset != -1 && charset != csUnknown && charset != 0 ) {
		m_charset = charset;
		m_charsetValid = true;
	}

	if (langId > langUnknown && langId < langLast) {
		m_langId = langId;
		m_langIdValid = true;
	}

	// avoid looking up ip of each outlink to add "firstip" tag to tagdb
	// because that can be slow!!!!!!!
	m_spiderLinks = (char)spiderLinks;
	m_spiderLinks2 = (char)spiderLinks;
	m_spiderLinksValid = true;

	// . newOnly is true --> do not inject if document is already indexed!
	// . maybe just set indexCode
	m_newOnly = newOnly;

	m_skipContentHashCheck = skipContentHashCheck;

	// do not re-lookup the robots.txt
	m_isAllowed      = true;
	m_isAllowedValid = true;
	m_crawlDelay     = -1; // unknown
	m_crawlDelayValid = true;


	m_isInjecting = true;
	m_isInjectingValid = true;

	// log it now
	//log("inject: indexing injected doc %s",cleanUrl);

	// make this our callback in case something blocks
	setCallback ( state , callback );

	// . now tell it to index
	// . this returns false if blocked
	// . eventually it will call "callback" when done if it blocks
	logTrace( g_conf.m_logTraceXmlDoc, "Calling indexDoc" );
	bool status = indexDoc ( );
	if ( ! status ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, returning false. indexDoc returned false" );
		return false;
	}

	// log it. i guess only for errors when it does not block?
	// because xmldoc.cpp::indexDoc calls logIt()
	if ( status ) logIt();

	logTrace( g_conf.m_logTraceXmlDoc, "END, returning true. indexDoc returned true" );
	return true;
}
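
// Illustrative call of injectDoc() (a sketch along the lines of what
// PageInject.cpp does; "st" and "doneInjectingWrapper" are hypothetical):
//
//   if ( ! st->m_xd.injectDoc ( url , cr , content ,
//                               false ,        // contentHasMime
//                               -1 ,           // hopCount unknown
//                               csUnknown ,    // charset
//                               langUnknown ,  // langId
//                               false ,        // deleteUrl
//                               "text/html" ,  // contentTypeStr
//                               true ,         // spiderLinks
//                               0 ,            // newOnly
//                               false ,        // skipContentHashCheck
//                               st , doneInjectingWrapper ,
//                               0 , 0 , 0 ) )  // firstIndexed,lastSpidered,ip
//       return false; // blocked; doneInjectingWrapper will be called
//   // returned true: done (check g_errno)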

// XmlDoc::injectDoc uses a fake spider request so we have to add
// a real spider request into spiderdb so that the injected doc can
// be spidered again in the future by the spidering process, otherwise,
// injected docs can never be re-spidered. they would end up having
// a SpiderReply in spiderdb but no matching SpiderRequest as well.
void XmlDoc::getRevisedSpiderRequest ( SpiderRequest *revisedReq ) {

	if ( ! m_sreqValid ) { g_process.shutdownAbort(true); }

	// we are doing this because it has a fake first ip
	if ( ! m_sreq.m_fakeFirstIp ) { g_process.shutdownAbort(true); }

	// copy it over from our current spiderrequest
	gbmemcpy ( revisedReq , &m_sreq , m_sreq.getRecSize() );

	// this must be valid for us of course
	if ( ! m_firstIpValid ) { g_process.shutdownAbort(true); }

	// wtf? it might be invalid!!! parent caller will handle it...
	//if ( m_firstIp == 0 || m_firstIp == -1 ) { g_process.shutdownAbort(true); }

	// store the real ip in there now
	revisedReq->m_firstIp = m_firstIp;

	// but turn off this flag! the whole point of all this...
	revisedReq->m_fakeFirstIp = 0;

	// re-make the key since it contains m_firstIp
	int64_t uh48 = m_sreq.getUrlHash48();
	int64_t parentDocId = m_sreq.getParentDocId();

	// set the key properly to reflect the new "first ip" since
	// we shard spiderdb by that.
	revisedReq->m_key = Spiderdb::makeKey ( m_firstIp, uh48, true, parentDocId, false );
	revisedReq->setDataSize();
}
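
// For example (a sketch of the transformation above): a request injected with
// m_fakeFirstIp = 1 is copied verbatim, then patched to
//
//   revisedReq->m_firstIp     = <real first ip>;
//   revisedReq->m_fakeFirstIp = 0;
//   revisedReq->m_key         = Spiderdb::makeKey ( realIp, uh48, true,
//                                                   parentDocId, false );
//
// so the re-added request lands on the spiderdb shard that owns the real
// first ip and the doc can be re-spidered later.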

void XmlDoc::getRebuiltSpiderRequest ( SpiderRequest *sreq ) {

	// memset 0
	sreq->reset();

	// assume not valid
	sreq->m_siteNumInlinks = -1;

	if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }

	// how many site inlinks?
	sreq->m_siteNumInlinks       = m_siteNumInlinks;
	sreq->m_siteNumInlinksValid  = true;

	if ( ! m_firstIpValid ) { g_process.shutdownAbort(true); }

	// set other fields besides key
	sreq->m_firstIp              = m_firstIp;
	sreq->m_hostHash32           = m_hostHash32a;
	//sreq->m_domHash32            = m_domHash32;
	//sreq->m_siteNumInlinks       = m_siteNumInlinks;
	//sreq->m_pageNumInlinks     = m_pageNumInlinks;
	sreq->m_hopCount             = m_hopCount;

	sreq->m_pageNumInlinks       = 0;//m_sreq.m_parentFirstIp;

	Url *fu = getFirstUrl();

	sreq->m_isAddUrl             = 0;//m_isAddUrl;
	sreq->m_isPingServer         = fu->isPingServer();
	//sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat;

	// transcribe from old spider rec, stuff should be the same
	sreq->m_addedTime            = m_firstIndexedDate;

	// validate the stuff so getUrlFilterNum() acks it
	sreq->m_hopCountValid = 1;

	// we need this now for ucp ucr upp upr new url filters that do
	// substring matching on the url
	if ( m_firstUrlValid )
		strcpy(sreq->m_url,m_firstUrl.getUrl());

	// re-make the key since it contains m_firstIp
	int64_t uh48 = fu->getUrlHash48();
	// set the key properly to reflect the new "first ip"
	// since we shard spiderdb by that.
	sreq->m_key = Spiderdb::makeKey ( m_firstIp, uh48, true, 0LL, false );
	sreq->setDataSize();
}

////////////////////////////////////////////////////////////////////
//   THIS IS THE HEART OF HOW THE PARSER ADDS TO THE RDBS
////////////////////////////////////////////////////////////////////

// . returns false if blocked, true otherwise
// . sets g_errno on error and returns true
// . this is now a WRAPPER for indexDoc2() and it will deal with
//   g_errnos by adding an error spider reply so we offload the
//   logic to the url filters table
bool XmlDoc::indexDoc ( ) {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	// return from the msg4.addMetaList() below?
	if ( m_msg4Launched ) {
		// must have been waiting
		if ( ! m_msg4Waiting ) {
			g_process.shutdownAbort(true);
		}

		logTrace( g_conf.m_logTraceXmlDoc, "END, returning true. m_msg4Launched" );
		return true;
	}

	// return true with g_errno set on error
	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, returning true. Could not get collection." );
		return true;
	}

	if ( ! m_masterLoop ) {
		m_masterLoop  = indexDocWrapper;
		m_masterState = this;
	}

	// do not index if already indexed and we are importing
	// from the code in PageInject.cpp from a foreign titledb file
	if ( m_isImporting && m_isImportingValid ) {
		char *isIndexed = getIsIndexed();
		if ( ! isIndexed ) {
			log("import: import had error: %s",mstrerror(g_errno));
			logTrace( g_conf.m_logTraceXmlDoc, "END, returning true. Import error." );
			return true;
		}
		if ( isIndexed == (char *)-1)
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END, returning false. isIndexed = -1" );
			return false;
		}
		if ( *isIndexed ) {
			log("import: skipping import for %s. already indexed.",
			    m_firstUrl.getUrl());
			logTrace( g_conf.m_logTraceXmlDoc, "END, returning true." );
			return true;
		}
	}

	bool status = true;

	if ( ! g_errno ) status = indexDoc2 ( );

	// blocked?
	if ( ! status )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return false, indexDoc2 blocked" );
		return false;
	}

	// done with no error?
	bool success = true;
	if ( g_errno ) success = false;
	// if we were trying to spider a fakefirstip request then
	// pass through because we lookup the real firstip below and
	// add a new request as well as a reply for this one
	if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) success = false;

	if ( success )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return true, success!" );
		return true;
	}

	// . ignore failed child docs like diffbot pages
	// . they are getting EMALFORMEDSECTIONS
	if ( m_isChildDoc ) {
		log("build: done indexing child doc. error=%s. not adding "
		    "spider reply for %s",
		    mstrerror(g_errno),
		    m_firstUrl.getUrl());
		logTrace( g_conf.m_logTraceXmlDoc, "END, return true, indexed child doc" );
		return true;
	}

	///
	// otherwise, an internal error. we must add a SpiderReply
	// to spiderdb to release the lock.
	///

 logErr:

	if ( m_firstUrlValid && g_errno )
		log("build: %s had internal error = %s. adding spider "
		    "error reply.",
		    m_firstUrl.getUrl(),mstrerror(g_errno));
	else if ( g_errno )
		log("build: docid=%" PRId64" had internal error = %s. "
		    "adding spider error reply.",
		    m_docId,mstrerror(g_errno));

	// seems like this was causing a core somehow...
	if ( g_errno == ENOMEM )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return true, ENOMEM" );
		return true;
	}

	// and do not add spider reply if shutting down the server
	if ( g_errno == ESHUTTINGDOWN )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return true, ESHUTTINGDOWN" );
		return true;
	}

	// i saw this on shard 9, how is it happening
	if ( g_errno == EBADRDBID )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return true, EBADRDBID" );
		return true;
	}

	// if docid not found when trying to do a query reindex...
	// this really shouldn't happen but i think we were adding
	// additional SpiderRequests since we were using a fake first ip.
	// but i have since fixed that code. so if the titlerec was not
	// found when trying to do a force delete... it's not a temporary
	// error and should not be retried. if we set indexCode to
	// EINTERNALERROR it seems to be retried.
	if ( g_errno == ENOTFOUND ) {
		m_indexCode = g_errno;
		m_indexCodeValid = true;
	}

	// this should not be retried either. i am seeing it excessively
	// retried from a
	// "TitleRec::set: uncompress uncompressed size=-2119348471"
	// error condition. it also said
	// "Error spidering for doc http://www.... : Bad cached document"
	if ( g_errno == EBADTITLEREC ) {
		m_indexCode = g_errno;
		m_indexCodeValid = true;
	}

	// i've seen Multicast got error in reply from hostId 19 (msgType=0x22
	// transId=496026 nice=1 net=default): Buf too small.
	// so fix that with this
	if ( g_errno == EBUFTOOSMALL ) {
		m_indexCode = g_errno;
		m_indexCodeValid = true;
	}

	if ( g_errno == EBADURL ) {
		m_indexCode = g_errno;
		m_indexCodeValid = true;
	}

	if ( g_errno == ENOTITLEREC ) {
		m_indexCode = g_errno;
		m_indexCodeValid = true;
	}

	// default to internal error which will be retried forever otherwise
	if ( ! m_indexCodeValid ) {
		m_indexCode = EINTERNALERROR;//g_errno;
		m_indexCodeValid = true;
	}

	// if our spiderrequest had a fake "firstip" so that it could be
	// injected quickly into spiderdb, then do the firstip lookup here
	// and re-add the new spider request with that, and add the reply
	// to the fake firstip request below.
	if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) {
		// at least get this if possible
		int32_t *fip = getFirstIp();
		if ( fip == (void *) -1 ) return false;
		// error? g_errno will be changed if this is NULL
		if ( ! fip ) {
			log("build: error getting real firstip: %s",
			    mstrerror(g_errno));
			m_indexCode = EINTERNALERROR;
			m_indexCodeValid = true;
			goto logErr;
		}
		// sanity log
		if ( ! m_firstIpValid ) { g_process.shutdownAbort(true); }
		// sanity log
		if ( *fip == 0 || *fip == -1 ) {
			//
			// now add a spider status doc for this so we know
			// why a crawl might have failed to start
			//
			// save this
			int32_t saved = m_indexCode;
			// make it the real reason for the spider status doc
			m_indexCode = EDNSERROR;
			// get the spiderreply ready to be added. false=del
			SafeBuf *ssDocMetaList =getSpiderStatusDocMetaList(NULL ,false);
			// revert
			m_indexCode = saved;
			// error?
			if ( ! ssDocMetaList ) return true;
			// blocked?
			if ( ssDocMetaList == (void *)-1 ) return false;
			// need to alloc space for it too
			char *list = ssDocMetaList->getBufStart();
			int32_t len = ssDocMetaList->length();
			//needx += len;
			// this too
			m_addedStatusDocSize = len;
			m_addedStatusDocSizeValid = true;

			const char *url = "unknown";
			if ( m_sreqValid ) url = m_sreq.m_url;
			log("build: error2 getting real firstip of "
			    "%" PRId32" for "
			    "%s. Not adding new spider req. "
			    "spiderstatusdocsize=%" PRId32, (int32_t)*fip,url,
			    m_addedStatusDocSize);

			if ( ! m_metaList2.safeMemcpy ( list , len ) )
			{
				logTrace( g_conf.m_logTraceXmlDoc, "END, return true, metaList2 safeMemcpy returned false" );
				return true;
			}

			goto skipNewAdd1;
		}
		// store the new request (store reply for this below)
		rdbid_t rd = RDB_SPIDERDB;
		if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
		if ( ! m_metaList2.pushChar(rd) )
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END, return true, metaList2 pushChar returned false" );
			return true;
		}
		// store it here
		SpiderRequest revisedReq;
		// this fills it in
		getRevisedSpiderRequest ( &revisedReq );
		// and store that new request for adding
		if ( ! m_metaList2.safeMemcpy (&revisedReq,revisedReq.getRecSize()))
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END, return true, metaList2 safeMemcpy returned false" );
			return true;
		}
		// make sure to log the size of the spider request
		m_addedSpiderRequestSize = revisedReq.getRecSize();
		m_addedSpiderRequestSizeValid = true;
	}

 skipNewAdd1:

	SpiderReply *nsr = NULL;

	// if only rebuilding posdb do not rebuild spiderdb
	if ( m_useSpiderdb ) {

		////
		//
		// make these fake so getNewSpiderReply() below does not block
		//
		////
		nsr = getFakeSpiderReply (  );
		// this can be NULL and g_errno set to ENOCOLLREC or something
		if ( ! nsr )
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END, return true, getFakeSpiderReply returned false" );
			return true;
		}

		//SafeBuf metaList;

		rdbid_t rd = RDB_SPIDERDB;
		if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2;
		if ( ! m_metaList2.pushChar( rd ) )
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END, return true, metaList2 pushChar returned false" );
			return true;
		}

		if ( ! m_metaList2.safeMemcpy ( (char *)nsr,nsr->getRecSize()))
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END, return true, metaList2 safeMemcpy returned false" );
			return true;
		}

		m_addedSpiderReplySize = nsr->getRecSize();
		m_addedSpiderReplySizeValid = true;
	}


	m_msg4Launched = true;

	// display the url that had the error
	logIt();

	// log this for debug now
	if ( nsr ) {
#ifdef _VALGRIND_
		VALGRIND_CHECK_MEM_IS_DEFINED(nsr, sizeof(*nsr));
#endif
		SafeBuf tmp;
		nsr->print(&tmp);
		log("xmldoc: added reply %s",tmp.getBufStart());
	}

	// clear g_errno
	g_errno = 0;

	// "cr" might have been deleted by calling indexDoc() above i think
	// so use collnum here, not "cr"
	if (!m_msg4.addMetaList(&m_metaList2, m_collnum, m_masterState, m_masterLoop)) {
		m_msg4Waiting = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, return false, m_msg4.addMetaList returned false" );
		return false;
	}

	//logf(LOG_DEBUG,"build: msg4 meta add3 did NOT block" );

	m_msg4Launched = false;

	logTrace( g_conf.m_logTraceXmlDoc, "END, return true, all done" );
	// all done
	return true;
}



// . returns false if blocked, true otherwise
// . sets g_errno on error and returns true
bool XmlDoc::indexDoc2 ( ) {

	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	// if anything blocks, this will be called when it comes back
	if ( ! m_masterLoop ) {
		m_masterLoop  = indexDocWrapper;
		m_masterState = this;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return true. Could not get collection." );
		return true;
	}


	// do this before we increment pageDownloadAttempts below so that
	// john's smoke tests, which use those counts, are not affected
	if ( m_sreqValid && m_sreq.m_fakeFirstIp &&
	     // only do for add url, not for injects. injects expect
	     // the doc to be indexed while the browser waits. add url
	     // is really just adding the spider request and returning
	     // to the browser without delay.
	     ! m_sreq.m_isInjecting &&
	     // not for page reindexes either!
	     ! m_sreq.m_isPageReindex &&
	     // just add url
	     m_sreq.m_isAddUrl ) {
		m_indexCodeValid = true;
		m_indexCode = EFAKEFIRSTIP;

		logTrace( g_conf.m_logTraceXmlDoc, "END, return true. Set indexCode EFAKEFIRSTIP" );
		return true;
	}

	setStatus("indexing doc");

	// maybe a callback had g_errno set?
	if ( g_errno )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END. return true, g_errno set (%" PRId32")",g_errno);
		return true;
	}

	// . now get the meta list from it to add
	// . returns NULL and sets g_errno on error
	char *metaList = getMetaList ( );

	// error?
	if ( ! metaList ) {
		// sanity check. g_errno must be set
		if ( ! g_errno ) {
			log("build: Error UNKNOWN error spidering. setting "
			    "to bad engineer.");
			g_errno = EBADENGINEER;
			//g_process.shutdownAbort(true); }
		}
		log("build: Error spidering for doc %s: %s",
		    m_firstUrl.getUrl(),mstrerror(g_errno));
		logTrace( g_conf.m_logTraceXmlDoc, "END, return true. getMetaList returned false" );
		return true;
	}
	// did it block? return false if so, we will be recalled since
	// we set m_masterLoop to indexDoc
	if ( metaList == (char *) -1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return false. metaList = -1" );
		return false;
	}


	// must be valid
	int32_t *indexCode = getIndexCode();
	if (! indexCode || indexCode == (void *)-1)
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return %s based on indexCode", (bool*)indexCode?"true":"false");
		return (char *)indexCode;
	}

	// . check to make sure the parser is consistent so we can cleanly
	//   delete the various rdb records if we need to in the future solely
	//   based on the titleRec.
	// . force = false
	// . unless we force it, the test is only done at random intervals
	//   for performance reasons
	if ( ! *indexCode ) doConsistencyTest ( false );
	// ignore errors from that
	g_errno = 0;


	// now add it
	if ( ! m_listAdded && m_metaListSize ) {
		// only call this once
		m_listAdded = true;
		// show it for now
		//printMetaList(m_metaList , m_metaList + m_metaListSize,NULL);
		// test it
		verifyMetaList ( m_metaList , m_metaList + m_metaListSize , false );

		// do it
		if (!m_msg4.addMetaList(m_metaList, m_metaListSize, m_collnum, m_masterState, m_masterLoop)) {
			m_msg4Waiting = true;
			logTrace( g_conf.m_logTraceXmlDoc, "END, return false. addMetaList blocked" );
			return false;
		}

		// error with msg4? bail
		if ( g_errno ) {
			logIt();
			logTrace( g_conf.m_logTraceXmlDoc, "END, return true. g_errno %" PRId32" after addMetaList", g_errno);
			return true;
		}

	}

	// make sure our msg4 is no longer in the linked list!
	if (m_msg4Waiting && Msg4::isInLinkedList(&m_msg4)){
		g_process.shutdownAbort(true);
	}

	// we are not waiting for the msg4 to return
	m_msg4Waiting = false;

	// there used to be logic here to flush injections ("flush it if we
	// are injecting it in case the next thing we spider is dependent on
	// this one"), but it was disabled to make things faster.
	// removed in commit d23858c92d0d715d493a358ea69ecf77a5cc00fc

	logIt();
	logTrace( g_conf.m_logTraceXmlDoc, "END, all done. Returning true");

	return true;
}

#if 0
static void doneReadingArchiveFileWrapper ( int fd, void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// . go back to the main entry function
	// . make sure g_errno is clear from a msg3a g_errno before calling
	//   this lest it abandon the loop

	THIS->m_masterLoop ( THIS->m_masterState );
}
#endif


static void getTitleRecBufWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// make sure has not been freed from under us!
	if ( THIS->m_freed ) { g_process.shutdownAbort(true);}
	// note it
	THIS->setStatus ( "in get title rec wrapper" );
	// return if it blocked
	if ( THIS->getTitleRecBuf() == (void *)-1 ) return;
	// otherwise, all done, call the caller callback
	THIS->callCallback();
}

// . return NULL and sets g_errno on error
// . returns -1 if blocked
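// . typical caller pattern for these tri-state accessors (illustrative
//   sketch only, mirroring the existing callers in this file):
//       int32_t *code = getIndexCode();
//       if ( ! code                ) return NULL;      // error, g_errno set
//       if ( code == (int32_t *)-1 ) return (T *)-1;   // blocked, re-called later
//       // ... use *code ...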
int32_t *XmlDoc::getIndexCode ( ) {

	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	// return it now if we got it already
	if ( m_indexCodeValid )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, already valid: %" PRId32, m_indexCode);
		return &m_indexCode;
	}

	setStatus ( "getting index code");

	// page inject can set deletefromindex to true
	if ( m_deleteFromIndex ) {
		m_indexCode = EDOCFORCEDELETE;
		m_indexCodeValid = true;

		logTrace( g_conf.m_logTraceXmlDoc, "END, delete operation. Returning EDOCFORCEDELETE" );
		return &m_indexCode;
	}

	if ( ! m_firstUrlValid ) { g_process.shutdownAbort(true); }

	if ( m_firstUrl.getUrlLen() <= 5 ) {
		m_indexCode = EBADURL;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EBADURL. FirstURL len too short" );
		return &m_indexCode;
	}

	if ( m_firstUrl.getUrlLen() + 1 >= MAX_URL_LEN ) {
		m_indexCode      = EURLTOOLONG;
		m_indexCodeValid = true;

		logTrace( g_conf.m_logTraceXmlDoc, "END, EURLTOOLONG" );
		return &m_indexCode;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. Could not get collection." );
		return NULL;
	}

	// "url is repeating path components" error?
	if ( ! m_check1 ) {
		m_check1         = true;
		if ( m_firstUrl.isLinkLoop() ) {
			m_indexCode      = ELINKLOOP;
			m_indexCodeValid = true;

			logTrace( g_conf.m_logTraceXmlDoc, "END, ELINKLOOP" );
			return &m_indexCode;
		}
	}

	// fix for "http://.xyz.com/...."
	if ( m_firstUrl.getHost() && m_firstUrl.getHost()[0] == '.' ) {
		m_indexCode      = EBADURL;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EBADURL (URL first char is .)" );
		return &m_indexCode;
	}

	if ( cr->m_doUrlSpamCheck && ! m_check2 ) {
		m_check2         = true;
		if ( m_firstUrl.isAdult() ) {
			m_indexCode      = EDOCURLSPAM;
			m_indexCodeValid = true;
			logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCURLSPAM" );
			return &m_indexCode;
		}
	}

	// . don't spider robots.txt urls for indexing!
	// . quickly see if we are a robots.txt url originally
	int32_t fulen = getFirstUrl()->getUrlLen();
	char *fu   = getFirstUrl()->getUrl();
	char *fp   = fu + fulen - 11;
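	// fp points at where "/robots.txt" would start if the url ends with
	// it, so fp[1]=='r' is a cheap pre-check before the strncmp below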
	if ( fulen > 12 &&
	     fp[1] == 'r' &&
	     ! strncmp ( fu + fulen - 11 , "/robots.txt" , 11 )) {
		m_indexCode = EBADURL;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EBADURL (robots.txt)" );
		return &m_indexCode;
	}

	// if this is an injection and "newonly" is not zero then we
	// only want to do the injection if the url is "new", meaning not
	// already indexed. "m_wasContentInjected" will be true if this is
	// an injection. "m_newOnly" will be true if the injector only
	// wants to proceed with the injection if this url is not already
	// indexed.
	if ( m_wasContentInjected && m_newOnly ) {
		XmlDoc **pod = getOldXmlDoc ( );
		if ( ! pod || pod == (XmlDoc **)-1 )
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END return error, getOldXmlDoc failed" );
			return (int32_t *)pod;
		}
		XmlDoc *od = *pod;
		// if the old doc does exist and WAS NOT INJECTED itself
		// then abandon this injection. it was spidered the old
		// fashioned way and we want to preserve it and NOT overwrite
		// it with this injection.
		if ( od && ! od->m_wasContentInjected ) {
			m_indexCode = EABANDONED;
			m_indexCodeValid = true;
			logTrace( g_conf.m_logTraceXmlDoc, "END, EABANDONED" );
			return &m_indexCode;
		}
		// if it was injected itself, only abandon this injection
		// in the special case that m_newOnly is "1". otherwise
		// if m_newOnly is 2 then we will overwrite any existing
		// titlerecs that were not injected themselves.
		if ( od && od->m_wasContentInjected && m_newOnly == 1 ) {
			m_indexCode = EABANDONED;
			m_indexCodeValid = true;
			logTrace( g_conf.m_logTraceXmlDoc, "END, EABANDONED (2)" );
			return &m_indexCode;
		}

	}

	// need tagrec to see if banned
	TagRec *gr = getTagRec();
	if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
	// this is an automatic ban!
	if ( gr->getLong("manualban",0) ) {
		m_indexCode = EDOCBANNED;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCBANNED" );
		return &m_indexCode;
	}


	// get the ip of the current url
	int32_t *ip = getIp ( );
	if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip;
	if ( *ip == 0 ) {
		m_indexCode      = EBADIP;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EBADIP" );
		return &m_indexCode;
	}

	// . check robots.txt
	// . uses the current url
	// . if we end in /robots.txt then this quickly returns true
	// . no, we still might want to index if we got link text, so just
	//   check this again below
	bool *isAllowed = getIsAllowed();
	if ( ! isAllowed || isAllowed == (void *)-1) return (int32_t *)isAllowed;

	// . TCPTIMEDOUT, NOROUTETOHOST, EDOCUNCHANGED, etc.
	// . this will be the reply from diffbot.com if using diffbot
	int32_t *dstatus = getDownloadStatus();
	if ( ! dstatus || dstatus == (void *)-1 ) return (int32_t *)dstatus;
	if ( *dstatus ) {
		m_indexCode      = *dstatus;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, %" PRId32" (getDownloadStatus)", m_indexCode);
		return &m_indexCode;
	}

	// check the mime
	HttpMime *mime = getMime();
	if ( ! mime || mime == (HttpMime *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, error. Could not getMime" );
		return (int32_t *)mime;
	}

	// check redir url
	Url **redirp = getRedirUrl();
	if ( ! redirp || redirp == (void *)-1 ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, could not getRedirUrl" );
		return (int32_t *)redirp;
	}

	// this must be valid now
	if ( ! m_redirErrorValid ) { g_process.shutdownAbort(true); }
	if ( m_redirError ) {
		m_indexCode      = m_redirError;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, redirError (%" PRId32")", m_indexCode);
		return &m_indexCode;
	}

	int64_t *d = getDocId();
	if ( ! d || d == (void *)-1 ) return (int32_t *)d;
	if ( *d == 0LL ) {
		m_indexCode      = ENODOCID;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, ENODOCID" );
		return &m_indexCode;
	}

	// . is the same url but with a www. present already in titledb?
	// . example: if we are xyz.com and www.xyz.com is already in titledb
	//   then nuke ourselves by setting m_indexCode to EDOCDUPWWW
	char *isWWWDup = getIsWWWDup ();
	if ( ! isWWWDup || isWWWDup == (char *)-1) return (int32_t *)isWWWDup;
	if ( *isWWWDup ) {
		m_indexCode      = EDOCDUPWWW;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCDUPWWW" );
		return &m_indexCode;
	}


	uint16_t *charset = getCharset();
	if ( ! charset && g_errno == EBADCHARSET ) {
		g_errno = 0;
		m_indexCode      = EBADCHARSET;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EBADCHARSET" );
		return &m_indexCode;
	}

	if ( ! charset || charset == (void *)-1) return (int32_t *)charset;
	// we once got charset id 2024 back; it had a NULL get_charset_str()
	// and was not supported, so reject any unsupported charset here
	if ( ! supportedCharset(*charset) ) { //&&get_charset_str(*charset) ) {
		m_indexCode      = EBADCHARSET;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EBADCHARSET (2)" );
		return &m_indexCode;
	}

	// get local link info
	LinkInfo   *info1 = getLinkInfo1();
	if ( ! info1 || info1 == (LinkInfo *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getLinkInfo1 failed" );
		return (int32_t *)info1;
	}

	// if robots.txt said no, and if we had no link text, then give up
	bool disallowed = !( *isAllowed );
	if ( info1 && info1->hasLinkText() ) {
		disallowed = false;
	}
	// if we generated a new sitenuminlinks to store in tagdb, we might
	// want to add this for that only reason... consider!
	if ( disallowed ) {
		m_indexCode      = EDOCDISALLOWED;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCDISALLOWED" );
		return &m_indexCode;
	}

	// check for bad url extension, like .jpg
	Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, error getCurrentUrl" );
		return (int32_t *)cu;
	}

	// take this check out because it is hurting
	// http://community.spiceworks.com/profile/show/Mr.T
	// because 't' was in the list of bad extensions.
	// now we use the url filters table to exclude the extensions we want.
	// and we use the 'ismedia' directive to exclude common media
	// extensions. having this check here is no longer needed and confusing
	// BUT on the otherhand stuff like .exe .rpm .deb is good to avoid!
	// so i'll just edit the list to remove more ambiguous extensions
	// like .f and .t
	//
	bool badExt = cu->hasNonIndexableExtension(TITLEREC_CURRENT_VERSION);	// @todo BR: For now ignore actual TitleDB version. // m_version);
	if ( badExt && ! info1->hasLinkText() ) {
		m_indexCode      = EDOCBADCONTENTTYPE;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCBADCONTENTTYPE" );
		return &m_indexCode;
	}

	int16_t *hstatus = getHttpStatus();
	if ( ! hstatus || hstatus == (void *)-1 ) return (int32_t *)hstatus;
	if ( *hstatus != 200 ) {
		m_indexCode      = EDOCBADHTTPSTATUS;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCBADHTTPSTATUS (%d)", *hstatus);
		return &m_indexCode;
	}

	// check for EDOCISERRPG (custom error pages)
	char *isErrorPage = getIsErrorPage();
	if ( !isErrorPage||isErrorPage==(void *)-1)
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getIsErrorPage failed" );
		return (int32_t *)isErrorPage;
	}

	if ( *isErrorPage ) {
		m_indexCode      = EDOCISERRPG;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCISERRPG" );
		return &m_indexCode;
	}

	// . i moved this up to perhaps fix problems of two dup pages being
	//   downloaded at about the same time
	// . are we a dup of another doc from any other site already indexed?
	char *isDup = getIsDup();
	if ( ! isDup || isDup == (char *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getIsDup failed" );
		return (int32_t *)isDup;
	}
	if ( *isDup ) {
		m_indexCode      = EDOCDUP;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCDUP" );
		return &m_indexCode;
	}

	// . is this a non-canonical page that has <link href=xxx rel=canonical>?
	// . also sets m_canonicalRedirUrl to it if we are not canonical
	// . returns NULL if we are the canonical url
	// . do not do this check if the page was injected
	bool checkCanonical = true;

	if (m_wasContentInjected) {
		checkCanonical = false;
	}

	if (m_isInjecting && m_isInjectingValid) {
		checkCanonical = false;
	}

	// do not do canonical deletion if recycling content either i guess
	if (m_sreqValid && m_sreq.m_recycleContent) {
		checkCanonical = false;
	}

	/// @todo ALC do we want to delete during a query reindex?
	// do not delete from being canonical if doing a query reindex
	if (m_sreqValid && m_sreq.m_isPageReindex) {
		checkCanonical = false;
	}

	if (checkCanonical) {
		Url **canon = getCanonicalRedirUrl();
		if (!canon || canon == (void *)-1) {
			logTrace( g_conf.m_logTraceXmlDoc, "END, getCanonicalRedirUrl failed" );
			return (int32_t *)canon;
		}

		// if there is one then we are its non-canonical leaf; the
		// canonical url is the primary page, so do not index ourselves
		if (*canon) {
			m_indexCode = EDOCNONCANONICAL;
			m_indexCodeValid = true;

			// store canonical url in titlerec as well
			ptr_redirUrl    = m_canonicalRedirUrl.getUrl();
			size_redirUrl   = m_canonicalRedirUrl.getUrlLen()+1;

			logTrace(g_conf.m_logTraceXmlDoc, "END, EDOCNONCANONICAL");
			return &m_indexCode;
		}
	}

	// was page unchanged since last time we downloaded it?
	XmlDoc **pod = getOldXmlDoc ( );
	if ( ! pod || pod == (XmlDoc **)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getOldXmlDoc failed" );
		return (int32_t *)pod;
	}

	XmlDoc *od = NULL;
	if ( *pod ) od = *pod;

	// if recycling content is true you gotta have an old title rec.
	if ( ! od && m_recycleContent ) {
		m_indexCode = ENOTITLEREC;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, ENOTITLEREC" );
		return &m_indexCode;
	}

	bool check = true;
	if ( ! od ) check = false;

	// or if recycling content turn this off as well! otherwise
	// it will always be 100% the same
	if ( m_recycleContent )
		check = false;

	if ( check ) {
		// check inlinks now too!
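		// compare the fresh LinkInfo against the old doc's: if the
		// number of good inlinks, any inlink's siteNumInlinks, or any
		// inlink's link text differs, treat the doc as changed and
		// skip the EDOCUNCHANGED shortcut below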
		LinkInfo  *info1 = getLinkInfo1 ();
		if ( ! info1 || info1 == (LinkInfo *)-1 )
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END error, getLinkInfo1 failed" );
			return (int32_t *)info1;
		}

		LinkInfo  *info2 = od->getLinkInfo1 ();
		if ( ! info2 || info2 == (LinkInfo *)-1 )
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END error, getLinkInfo1 (od) failed" );
			return (int32_t *)info2;
		}

		Inlink *k1 = NULL;
		Inlink *k2 = NULL;
		char *s1, *s2;
		int32_t len1,len2;
		if ( info1->getNumGoodInlinks() !=
		     info2->getNumGoodInlinks() )
			goto changed;
		for ( ; k1=info1->getNextInlink(k1) ,
			      k2=info2->getNextInlink(k2); ) {
			if ( ! k1 )
				break;
			if ( ! k2 )
				break;
			if ( k1->m_siteNumInlinks != k2->m_siteNumInlinks )
				goto changed;
			s1   = k1->getLinkText();
			len1 = k1->size_linkText - 1; // exclude \0
			s2   = k2->getLinkText();
			len2 = k2->size_linkText - 1; // exclude \0
			if ( len1 != len2 )
				goto changed;
			if ( len1 > 0 && memcmp(s1,s2,len1) != 0 )
				goto changed;
		}
		// no change in link text, look for change in page content now
		int32_t *ch32 = getContentHash32();
		if ( ! ch32 || ch32 == (void *)-1 )
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END error, getContentHash32 failed" );
			return (int32_t *)ch32;
		}

		// disable content hash check if language differ (we could have overridden language when injecting doc)
		bool checkContentHash = true;
		if (m_wasContentInjected) {
			if (m_skipContentHashCheck || (m_langIdValid && m_langId != od->m_langId)) {
				checkContentHash = false;
			}
		}

		if (checkContentHash && *ch32 == od->m_contentHash32) {
			m_indexCode = EDOCUNCHANGED;
			m_indexCodeValid = true;
			logTrace(g_conf.m_logTraceXmlDoc, "END, EDOCUNCHANGED");
			return &m_indexCode;
		}
	}

 changed:
	// words
	Words *words = getWords();
	if ( ! words || words == (Words *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END error, getWords failed" );
		return (int32_t *)words;
	}

	// we set the D_IS_IN_DATE flag for these bits
	Bits *bits = getBits();
	if ( ! bits )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END error, getBits failed" );
		return NULL;
	}


	// bad sections? fixes http://www.beerexpedition.com/northamerica.shtml
	// being continuously respidered when its lock expires every
	// MAX_LOCK_AGE seconds
	Sections *sections = getSections();
	// on EBUFOVERFLOW we will NEVER be able to parse this url
	// correctly so do not retry!
	if ( ! sections && g_errno == EBUFOVERFLOW ) {
		g_errno = 0;
		m_indexCode      = EBUFOVERFLOW;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EBUFOVERFLOW (Sections)" );
		return &m_indexCode;
	}
	if (!sections||sections==(Sections *)-1)
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END error, getSections failed" );
		return (int32_t *)sections;
	}

	if ( sections->m_numSections == 0 && words->getNumWords() > 0 ) {
		m_indexCode      = EDOCBADSECTIONS;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCBADSECTIONS" );
		return &m_indexCode;
	}

	// i think an oom error is not being caught by Sections.cpp properly
	if ( g_errno ) { g_process.shutdownAbort(true); }


#if 0
	// @todo: See if this spam-check should be re-enabled and improved. Was hard coded to OFF below.

	// are we a root?
	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END error, getIsSiteRoot failed" );
		return (int32_t *)isRoot;
	}

	bool spamCheck = true;

	// if we are a root, allow repeat spam
	if ( *isRoot ) spamCheck = false;

	// if we are being spidered deep, allow repeat spam
	if ( gr->getLong("deep",0) ) {
		spamCheck = false;
	}

	// only html for now
	if ( m_contentTypeValid && m_contentType != CT_HTML ) spamCheck =false;

	// turn this off for now
	spamCheck = false;

	// otherwise, check the weights
	if ( spamCheck ) {
		char *ws = getWordSpamVec();
		if ( ! ws || ws == (void *)-1 ) return (int32_t *)ws;
		if ( m_isRepeatSpammer ) {
			m_indexCode      = EDOCREPEATSPAMMER;
			m_indexCodeValid = true;
			logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCREPEATSPAMMER" );
			return &m_indexCode;
		}
	}
#endif

	// validate this here so getSpiderPriority(), which calls
	// getUrlFilterNum(), which calls getNewSpiderReply(), which calls
	// us, getIndexCode() does not repeat all this junk
	//m_indexCodeValid = true;
	//m_indexCode      = 0;

	// this needs to be last!
	int32_t *priority = getSpiderPriority();
	if ( ! priority || priority == (void *)-1) {
		// allow this though
		if ( g_errno == EBUFOVERFLOW ) {
			g_errno = 0;
			m_indexCode      = EBUFOVERFLOW;
			m_indexCodeValid = true;
			logTrace( g_conf.m_logTraceXmlDoc, "END, EBUFOVERFLOW (getSpiderPriority)" );
			return &m_indexCode;
		}
		// but if it blocked, then un-validate it
		m_indexCodeValid = false;
		// and return to be called again i hope
		logTrace( g_conf.m_logTraceXmlDoc, "END, getSpiderPriority blocked" );
		return (int32_t *)priority;
	}

	if ( *priority  == -3 ) { // SPIDER_PRIORITY_FILTERED ) {
		m_indexCode      = EDOCFILTERED;
		m_indexCodeValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCFILTERED" );
		return &m_indexCode;
	}

	// no error otherwise
	m_indexCode      = 0;
	m_indexCodeValid = true;
	logTrace( g_conf.m_logTraceXmlDoc, "END." );
	return &m_indexCode;
}



char *XmlDoc::prepareToMakeTitleRec ( ) {
	// do not re-call this for speed
	if (m_prepared) {
		return (char *)1;
	}

	int32_t *indexCode = getIndexCode();
	if (! indexCode || indexCode == (void *)-1) return (char *)indexCode;
	if (*indexCode && (*indexCode != EDOCSIMPLIFIEDREDIR && *indexCode != EDOCNONCANONICAL)) {
		m_prepared = true;
		return (char *)1;
	}

	//
	// do all the sets here
	//

	// . this gets our old doc from titledb, if we got it
	// . TODO: make sure this is cached in the event of a backoff, we
	//   will redo this again!!! IMPORTANT!!!
	char *isIndexed = getIsIndexed();
	if (!isIndexed || isIndexed == (char *)-1) {
		return isIndexed;
	}

	CollectionRec *cr = getCollRec();
	if (!cr) {
		return NULL;
	}

	// get our site root
	char *mysite = getSite();
	if (!mysite || mysite == (void *)-1) {
		return mysite;
	}

	uint8_t *langId = getLangId();
	if (!langId || langId == (uint8_t *)-1) {
		return (char *)langId;
	}

	getHostHash32a();
	getContentHash32();

	char **id = getThumbnailData();
	if (!id || id == (void *)-1) {
		return (char *)id;
	}

	int8_t *hopCount = getHopCount();
	if (!hopCount || hopCount == (void *)-1) {
		return (char *)hopCount;
	}

	char *spiderLinks = getSpiderLinks();
	if (!spiderLinks || spiderLinks == (char *)-1) {
		return spiderLinks;
	}

	int32_t *firstIndexedDate = getFirstIndexedDate();
	if (!firstIndexedDate || firstIndexedDate == (int32_t *)-1) {
		return (char *)firstIndexedDate;
	}

	int32_t *outlinksAddedDate = getOutlinksAddedDate();
	if (!outlinksAddedDate || outlinksAddedDate == (int32_t *)-1) {
		return (char *)outlinksAddedDate;
	}

	uint16_t *countryId = getCountryId();
	if (!countryId || countryId == (uint16_t *)-1) {
		return (char *)countryId;
	}

	char *trunc = getIsContentTruncated();
	if (!trunc || trunc == (char *)-1) {
		return trunc;
	}

	char *pl = getIsPermalink();
	if (!pl || pl == (char *)-1) {
		return pl;
	}

	// . before storing this into title Rec, make sure all tags
	//   are valid and tagRec is up to date
	// . like we might need to update the siteNumInlinks,
	//   or other tags because, for instance, contact info might not
	//   be in there because isSpam() never required it.
	int32_t *sni = getSiteNumInlinks();
	if (!sni || sni == (int32_t *)-1) {
		return (char *)sni;
	}

	char *ict = getIsContentTruncated();
	if (!ict || ict == (char *)-1) {
		return ict;
	}

	char *at = getIsAdult();
	if (!at || at == (void *)-1) {
		return at;
	}

	char *ls = getIsLinkSpam();
	if (!ls || ls == (void *)-1) {
		return ls;
	}

	uint32_t *tph = getTagPairHash32();
	if (!tph || tph == (uint32_t *)-1) {
		return (char *)tph;
	}

	m_prepared = true;
	return (char *)1;
}


// . create and store the titlerec into "buf".
// . it is basically the header part of all the member vars in this XmlDoc.
// . it has a key,dataSize,compressedData so it can be a record in an Rdb
// . return true on success, false on failure
bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, int64_t docId, int64_t uh48 ){

	//setStatus ( "making title rec");

	// assume could not make one because we were banned or something
	tbuf->purge(); // m_titleRec = NULL;

	// start seting members in THIS's header before compression
	m_version           = TITLEREC_CURRENT_VERSION;

	// set this
	m_headerSize = (char *)&ptr_firstUrl - (char *)&m_headerSize;

	// add in variable length data
	int32_t *ps = (int32_t *)&size_firstUrl;
	// data ptr, consider a NULL to mean empty too!
	char **pd = (char **)&ptr_firstUrl;
	// how many XmlDoc::ptr_* members do we have? set "np" to that
	int32_t np = ((char *)&size_firstUrl  - (char *)&ptr_firstUrl) ;
	np /= sizeof(char *);
	// count up total we need to alloc
	int32_t need1 = m_headerSize;
	// clear these
	m_internalFlags1 = 0;
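	// m_internalFlags1 is a bitmask: bit i is set iff the i-th ptr_*
	// member is non-empty and therefore serialized below, so the record
	// can later be deserialized knowing which (size,data) pairs follow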
	// loop over em
	for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) {
		// skip if empty
		if ( *ps <= 0 ) continue;
		// or empty string ptr
		if ( ! *pd ) continue;
		// 4 bytes for the size
		need1 += 4;
		// add it up
		need1 += *ps;
		// make the mask
		uint32_t mask = 1 << i ;
		// add it in
		m_internalFlags1 |= mask;
	}
	// alloc the buffer
	char *ubuf = (char *) mmalloc ( need1 , "xdtrb" );
	// return NULL with g_errno set on error
	if ( ! ubuf ) return false;
	// serialize into it
	char *p = ubuf;
	// copy our crap into there
#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(&m_headerSize,(size_t)((char*)&ptr_firstUrl-(char*)&m_headerSize));
#endif
	gbmemcpy ( p , &m_headerSize , m_headerSize );
	// skip it
	p += m_headerSize;
	// reset data ptrs
	pd = (char **)&ptr_firstUrl;
	// reset data sizes
	ps = (int32_t *)&size_firstUrl;

	// then variable length data
	for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) {
		// skip if empty, do not serialize
		if ( ! *ps ) continue;
		// or empty string ptr
		if ( ! *pd ) continue;

		// Sanity
		if( *ps < 0 ) {
			log(LOG_ERROR,"DATA CORRUPTION AVOIDED in setTitleRec. Variable length data item %" PRId32 " has negative length: %" PRId32 "", i, *ps);
			gbshutdownLogicError();
		}

		// store size first
		*(int32_t *)p = *ps;
		p += 4;
		// then the data
#ifdef _VALGRIND_
		VALGRIND_CHECK_MEM_IS_DEFINED(*pd,*ps);
#endif
		gbmemcpy ( p , *pd , *ps );
		// skip *ps bytes we wrote. should include a \0
		p += *ps;
	}
	// sanity check
	if ( p != ubuf + need1 ) { g_process.shutdownAbort(true); }

	// . make a buf big enough to hold compressed, we'll realloc afterwards
	// . according to zlib.h line 613 compress buffer must be .1% larger
	//   than source plus 12 bytes. (i add one for round off error)
	// . now i added another extra 12 bytes cuz compress seemed to want it
	int32_t need2 = ((int64_t)need1 * 1001LL) / 1000LL + 13 + 12;

	// we also need to store a key then regular dataSize then
	// the uncompressed size in cbuf before the compression of m_ubuf
	int32_t hdrSize = sizeof(key96_t) + 4 + 4;
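	// resulting record layout in tbuf:
	//   key96_t  titledb key (made from docId and uh48)
	//   int32_t  dataSize = compressed size + 4 (excludes key and itself)
	//   int32_t  uncompressed size (need1)
	//   <compressed header + variable-length ptr_* data>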

	// . now i add 12 bytes more so Msg14.cpp can also squeeze in a
	//   negative key to delete the old titleRec, cuz we use this cbuf
	//   to set our list that we add to our twins with
	// . we now store the negative rec before the positive rec in Msg14.cpp
	//hdrSize += sizeof(key96_t) + 4;
	need2 += hdrSize;

	// return false on error
	if ( ! tbuf->reserve ( need2 ,"titbuf" ) ) return false;

	// shortcut
	char *cbuf = tbuf->getBufStart();

	// . how big is the buf we're passing to ::compress()?
	// . don't include the last 12 byte, save for del key in Msg14.cpp
	int32_t size = need2 - hdrSize ;

	// . compress the data in ubuf into "cbuf + hdrSize"
	// . "size" is in/out: on input the space available, on output how
	//   many bytes we wrote into "cbuf + hdrSize"
	int err = gbcompress ( (unsigned char *)cbuf + hdrSize,
			       (uint32_t *)&size,
			       (unsigned char *)ubuf ,
			       (uint32_t  )need1 );

	// free the buf we were trying to compress now
	mfree ( ubuf , need1 , "trub" );

	// sanity check: the compressed output must fit in the space we reserved
	if ( err == Z_OK && size > (need2 - hdrSize ) ) {
		tbuf->purge();
		g_errno = ECOMPRESSFAILED;
		log(LOG_ERROR, "!!! Compressed output of %" PRId32" bytes exceeds "
		    "the provided buffer of %" PRId32" bytes.",
		    size, (need2 - hdrSize ) );
		return false;
	}

	// check for error
	if ( err != Z_OK ) {
		tbuf->purge();
		g_errno = ECOMPRESSFAILED;
		log(LOG_ERROR,"!!! Failed to compress document.");
		return false;
	}

	key96_t tkey = Titledb::makeKey (docId,uh48,false); // false => not a delete key

	// get a ptr to the Rdb record at start of the header
	p = cbuf;

	// . store key in header of cbuf
	// . store in our host byte ordering so we can be a rec in an RdbList
	*(key96_t *) p = tkey;
	p += sizeof(key96_t);

	// store total dataSize in header (excluding itself and key only)
	int32_t dataSize = size + 4;
	*(int32_t  *) p = dataSize ;
	p += 4;

	// store uncompressed size in header
	*(int32_t  *) p = need1;
	p += 4;

	// sanity check
	if ( p != cbuf + hdrSize ) {
		g_process.shutdownAbort(true);
	}

	// sanity check
	if ( need1 <= 0 ) {
		g_process.shutdownAbort(true);
	}

	// advance over data
	p += size;

	// update safebuf::m_length so it is correct
	tbuf->setLength ( p - cbuf );

	logTrace( g_conf.m_logTraceXmlDoc, "dataSize=%" PRId32 ", uncompressed=%" PRId32 ", docId=%" PRId64 "", dataSize, need1, docId);
	return true;
}


// . return NULL and sets g_errno on error
// . returns -1 if blocked
SafeBuf *XmlDoc::getTitleRecBuf ( ) {

	// return it now if we got it already
	if ( m_titleRecBufValid ) return &m_titleRecBuf;

	setStatus ( "making title rec");

	// did one of our many blocking function calls have an error?
	if ( g_errno ) return NULL;

	// . HACK so that TitleRec::isEmpty() return true
	// . faster than calling m_titleRec.reset()
	//m_titleRec.m_url.m_ulen = 0;

	int32_t *indexCode = getIndexCode();
	// not allowed to block here
	if ( indexCode == (void *)-1) { g_process.shutdownAbort(true); }
	// return on errors with g_errno set
	if ( ! indexCode ) return NULL;
	// force delete? EDOCFORCEDELETE
	if (*indexCode) {
		if (*indexCode == EDOCSIMPLIFIEDREDIR || *indexCode == EDOCNONCANONICAL) {
			// make sure we store an empty document if it's a simplified redirect/non-canonical
			ptr_utf8Content = NULL;
			size_utf8Content = 0;
		} else {
			m_titleRecBufValid = true;
			return &m_titleRecBuf;
		}
	}

	// . internal callback
	// . so if any of the functions we end up calling directly or
	//   indirectly block and return -1, we will be re-called from the top
	if ( ! m_masterLoop ) {
		m_masterLoop  = getTitleRecBufWrapper;
		m_masterState = this;
	}


	/////////
	//
	// IF ANY of these validation sanity checks fail then update
	// prepareToMakeTitleRec() so it makes them valid!!!
	//
	/////////

	// verify key parts
	if ( ! m_docIdValid                  ) { g_process.shutdownAbort(true); }

	// verify record parts
	//if ( ! m_versionValid                ) { g_process.shutdownAbort(true); }
	if ( ! m_ipValid                     ) { g_process.shutdownAbort(true); }
	if ( ! m_spideredTimeValid           ) { g_process.shutdownAbort(true); }
	if ( ! m_firstIndexedDateValid       ) { g_process.shutdownAbort(true); }
	if ( ! m_outlinksAddedDateValid      ) { g_process.shutdownAbort(true); }
	if ( ! m_charsetValid                ) { g_process.shutdownAbort(true); }
	if ( ! m_countryIdValid              ) { g_process.shutdownAbort(true); }
	if ( ! m_httpStatusValid             ) { g_process.shutdownAbort(true); }

	if ( ! m_siteNumInlinksValid         ) { g_process.shutdownAbort(true); }

	if ( ! m_hopCountValid               ) { g_process.shutdownAbort(true); }
	if ( ! m_metaListCheckSum8Valid      ) { g_process.shutdownAbort(true); }
	if ( ! m_langIdValid                 ) { g_process.shutdownAbort(true); }
	if ( ! m_contentTypeValid            ) { g_process.shutdownAbort(true); }

	if ( ! m_isRSSValid                  ) { g_process.shutdownAbort(true); }
	if ( ! m_isPermalinkValid            ) { g_process.shutdownAbort(true); }
	if ( ! m_isAdultValid                ) { g_process.shutdownAbort(true); }
	if ( ! m_spiderLinksValid            ) { g_process.shutdownAbort(true); }
	if ( ! m_isContentTruncatedValid     ) { g_process.shutdownAbort(true); }
	if ( ! m_isLinkSpamValid             ) { g_process.shutdownAbort(true); }

	// buffers
	if ( ! m_firstUrlValid               ) { g_process.shutdownAbort(true); }
	if ( ! m_redirUrlValid               ) { g_process.shutdownAbort(true); }
	if ( ! m_tagRecValid                 ) { g_process.shutdownAbort(true); }
	if ( ! m_imageDataValid              ) { g_process.shutdownAbort(true); }
	if ( ! m_recycleContent ) {
		if ( ! m_rawUtf8ContentValid         ) { g_process.shutdownAbort(true); }
		if ( ! m_expandedUtf8ContentValid    ) { g_process.shutdownAbort(true); }
	}
	if ( ! m_utf8ContentValid            ) { g_process.shutdownAbort(true); }
	if ( ! m_siteValid                   ) { g_process.shutdownAbort(true); }
	if ( ! m_linkInfo1Valid              ) { g_process.shutdownAbort(true); }

	// do we need these?
	if ( ! m_hostHash32aValid            ) { g_process.shutdownAbort(true); }
	if ( ! m_contentHash32Valid          ) { g_process.shutdownAbort(true); }
	if ( ! m_tagPairHash32Valid          ) { g_process.shutdownAbort(true); }

	setStatus ( "compressing into final title rec");

	int64_t uh48 = getFirstUrlHash48();

	int64_t *docId = getDocId();

	// time it
	int64_t startTime = gettimeofdayInMilliseconds();

	//////
	//
	// fill in m_titleRecBuf
	//
	//////

	// we need docid and uh48 for making the key of the titleRec
	if ( ! setTitleRecBuf ( &m_titleRecBuf , *docId , uh48 ) )
		return NULL;

	// set this member down here because we can't set it in "xd"
	// because it is too short of an xmldoc stub
	m_versionValid = true;

	// . add the stat
	// . use white for the stat
	g_stats.addStat_r(0, startTime, gettimeofdayInMilliseconds(), 0x00ffffff);

	char *cbuf = m_titleRecBuf.getBufStart();
	m_titleRecKey = *(key96_t *)cbuf;
	m_titleRecKeyValid = true;

	// now valid. congratulations!
	m_titleRecBufValid   = true;
	return &m_titleRecBuf;
}


// . store this in clusterdb rec so family filter works!
// . check content for adult words
char *XmlDoc::getIsAdult ( ) {

	if ( m_isAdultValid ) return &m_isAdult2;

	// call that
	setStatus ("getting is adult bit");


	// need the content
	char **u8 = getUtf8Content();
	if ( ! u8 || u8 == (char **)-1) return (char *)u8;

	// time it
	int64_t start = gettimeofdayInMilliseconds();

	// score that up
	int32_t total = getAdultPoints ( ptr_utf8Content, size_utf8Content - 1 , m_firstUrl.getUrl() );


	// debug msg
	int64_t took = gettimeofdayInMilliseconds() - start;
	if ( took > 10 )
		logf(LOG_DEBUG,
		     "build: Took %" PRId64" ms to check doc of %" PRId32" bytes for "
		     "dirty words.",took,size_utf8Content-1);

	m_isAdult  = false;
	// require at least two adult points so a single incidental match
	// does not flag the whole document as adult
	if ( total >= 2 ) m_isAdult = true;
	// set shadow member
	m_isAdult2 = (bool)m_isAdult;
	// validate
	m_isAdultValid = true;

	// note it
	if ( m_isAdult2 && g_conf.m_logDebugDirty )
		log("dirty: %s points = %" PRId32,m_firstUrl.getUrl(),total);

	// return the result
	return &m_isAdult2;
}



// . sets g_errno on error and returns NULL
// . now returns a ptr to it so we can return NULL to signify error, that way
//   all accessors have equivalent return values
// . an accessor function returns (char *)-1 if it blocked!
char *XmlDoc::getIsPermalink ( ) {
	if ( m_isPermalinkValid ) return &m_isPermalink2;
	Url *url      = getCurrentUrl();
	if ( ! url      ) return NULL;
	char *isRSS   = getIsRSS();
	// return NULL with g_errno set, -1 if blocked
	if ( ! isRSS || isRSS == (char  *)-1 ) return isRSS;
	Links *links  = getLinks();
	// return NULL with g_errno set, -1 if blocked
	if ( ! links || links == (Links *)-1 ) return (char *)links;
	uint8_t  *ct     = getContentType();
	// return NULL with g_errno set, -1 if blocked
	if ( ! ct    || ct    == (uint8_t  *)-1 ) return (char *)ct;
	// GUESS if it is a permalink by the format of the url
	int32_t p = ::isPermalink ( links  , // Links ptr
				 url    ,
				 *ct    , // CT_HTML default?
				 NULL   , // LinkInfo ptr
				 *isRSS );// isRSS?
	m_isPermalink      = p;
	m_isPermalink2     = p;
	m_isPermalinkValid = true;
	return &m_isPermalink2;
}

// guess based on the format of the url if this is a permalink
//@todo BR: FLAKY at best...
char *XmlDoc::getIsUrlPermalinkFormat ( ) {
	if ( m_isUrlPermalinkFormatValid ) return &m_isUrlPermalinkFormat;

	setStatus ( "getting is url permalink format" );

	Url *url      = getCurrentUrl();
	if ( ! url      ) return NULL;
	// just guess if we are rss here since we most likely do not have
	// access to the url's content...
	bool isRSS = false;
	const char *ext = url->getExtension();
	if ( ext && strcasecmp(ext,"rss") == 0 ) isRSS = true;
	// GUESS if it is a permalink by the format of the url
	int32_t p = ::isPermalink ( NULL    , // Links ptr
				 url     ,
				 CT_HTML ,
				 NULL    , // LinkInfo ptr
				 isRSS   );// we guess this...
	m_isUrlPermalinkFormat      = p;
	m_isUrlPermalinkFormatValid = true;
	return &m_isUrlPermalinkFormat;
}

char *XmlDoc::getIsRSS ( ) {
	if ( m_isRSSValid ) return &m_isRSS2;
	// the xml tells us for sure
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
	m_isRSS      = xml->isRSSFeed();
	m_isRSS2     = (bool)m_isRSS;
	m_isRSSValid = true;
	return &m_isRSS2;
}

bool *XmlDoc::getIsSiteMap ( ) {
	if ( m_isSiteMapValid ) return &m_isSiteMap;
	uint8_t  *ct = getContentType();
	if ( ! ct    || ct    == (uint8_t  *)-1 ) return (bool *)ct;
	char *uf = m_firstUrl.getFilename();
	int32_t ulen = m_firstUrl.getFilenameLen();
	// sitemap.xml
	m_isSiteMap = false;
	// must be xml to be a sitemap
	if ( *ct == CT_XML &&
	     ulen == 11 &&
	     strncmp(uf,"sitemap.xml",11) == 0 )
		m_isSiteMap = true;
	m_isSiteMapValid = true;
	return &m_isSiteMap;
}

// . this function should really be called getTagTokens() because it mostly
//   works on HTML documents, not XML, and just sets an array of ptrs to
//   the tags in the document, including ptrs to the text in between
//   tags.
Xml *XmlDoc::getXml ( ) {

	// return it if it is set
	if ( m_xmlValid ) {
		return &m_xml;
	}

	// note it
	setStatus ( "parsing html");

	// get the filtered content
	char **u8 = getUtf8Content();
	if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
	int32_t u8len = size_utf8Content - 1;

	uint8_t *ct = getContentType();
	if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;

	int64_t start = logQueryTimingStart();

	// set it
	if ( !m_xml.set( *u8, u8len, m_version, *ct ) ) {
		// return NULL on error with g_errno set
		return NULL;
	}

	logQueryTimingEnd( __func__, start );

	m_xmlValid = true;
	return &m_xml;
}

static bool setLangVec ( Words *words ,
			 SafeBuf *langBuf ,
			 Sections *ss ) {
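	// . three passes over the words:
	//   1) per-word: assign a langid when the word's wiktionary lang
	//      bits identify exactly one language, else fall back to the
	//      character script
	//   2) per-sentence: intersect the lang bits of all words in a
	//      sentence and, if unique, assign that langid to the sentence
	//   3) sliding window of 5 dictionary words, same intersection
	//      trick, to catch one-word "sentences" like menu items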

	const int64_t *wids       = words->getWordIds();
	const char * const *wptrs = words->getWordPtrs();
	int32_t nw                = words->getNumWords();

	// allocate
	if ( ! langBuf->reserve ( nw ) ) return false;

	uint8_t *langVector = (uint8_t *)langBuf->getBufStart();

	// now set the langid
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// default
		langVector[i] = langUnknown;
		// add the word
		if ( wids[i] == 0LL ) continue;
		// skip if number
		if ( is_digit(wptrs[i][0]) ) {
			langVector[i] = langTranslingual;
			continue;
		}
		// get the lang bits. does not include langTranslingual
		// or langUnknown
		int64_t bits = g_speller.getLangBits64 ( wids[i] );
		// skip if not unique
		char count = getNumBitsOn64 ( bits ) ;
		// if we only got one lang we could be, assume that
		if ( count == 1 ) {
			// get it. bit #0 is english, so add 1
			char langId = getBitPosLL((uint8_t *)&bits) + 1;
			//langVector[i] = g_wiktionary.getLangId(&wids[i]);
			langVector[i] = langId;
			continue;
		}
		// ambiguous? set it to unknown then
		if ( count >= 2 ) {
			langVector[i] = langUnknown;
			continue;
		}
		// try setting based on script. greek. russian. etc.
		// if the word was not in the wiktionary.
		// this will be langUnknown if not definitive.
		langVector[i] = getCharacterLanguage(wptrs[i]);
	}

	// . now go sentence by sentence
	// . get the 64 bit vector for each word in the sentence
	// . then intersect them all
	// . if the result is a unique langid, assign that langid to
	//   all words in the sentence

	// get first sentence in doc
	Section *si = NULL;
	if ( ss ) si = ss->m_firstSent;
	// scan the sentence sections and or in the bits we should
	for ( ; si ; si = si->m_nextSent ) {
		// reset vec
		int64_t bits = LANG_BIT_MASK;
		// get lang 64 bit vec for each wid in sentence
		for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) {
			// skip if not alnum word
			if ( ! wids[j] ) continue;
			// skip if starts with digit
			if ( is_digit(wptrs[j][0]) ) continue;
			// get 64 bit lang vec. does not include
			// langUnknown or langTransligual bits
			bits &= g_speller.getLangBits64 ( wids[j] );
		}
		// bail if none
		if ( ! bits ) continue;
		// skip if more than one language in intersection
		if ( getNumBitsOn64(bits) != 1 ) continue;
		// get it. bit #0 is english, so add 1
		char langId = getBitPosLL((uint8_t *)&bits) + 1;
		// ok, must be this language i guess
		for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) {
			// skip if not alnum word
			if ( ! wids[j] ) continue;
			// skip if starts with digit
			if ( is_digit(wptrs[j][0]) ) continue;
			// set it
			langVector[j] = langId;
		}
	}

	// try the same thing but do not use sentences. use windows of
	// 5 words. this will pick up pages that have an english menu
	// where each menu item is an individual sentence and only
	// one word.
	// http://www.topicexchange.com/
	int64_t window[5];
	int32_t wpos[5];
	memset ( window , 0 , 8*5 );
	int32_t wp = 0;
	int32_t total = 0;
	// now set the langid
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// must be alnum
		if ( ! wids[i] ) continue;
		// skip if starts with digit
		if ( is_digit(wptrs[i][0]) ) continue;
		// skip if lang already set to a language
		//if ( langVector[i] != langUnknown &&
		//     langVector[i] != langTranslingual )
		//	continue;
		// get last 5
		window[wp] = g_speller.getLangBits64 ( wids[i] );
		// skip if not in dictionary!
		if ( window[wp] == 0 ) continue;
		// otherwise, store it
		wpos  [wp] = i;
		if ( ++wp >= 5 ) wp = 0;
		// need at least 3 samples
		if ( ++total <= 2 ) continue;
		// intersect them all together
		int64_t bits = LANG_BIT_MASK;
		for ( int32_t j = 0 ; j < 5 ; j++ ) {
			// skip if uninitialized, e.g. if we only have 3 or
			// 4 samples so far
			if ( ! window[j] ) continue;
			// otherwise, toss it in the intersection
			bits &= window[j];
		}
		// skip if intersection empty
		if ( ! bits ) continue;
		// skip if more than one language in intersection
		if ( getNumBitsOn64(bits) != 1 ) continue;
		// get it. bit #0 is english, so add 1
		char langId = getBitPosLL((uint8_t *)&bits) + 1;
		// set all in window to this language
		for ( int32_t j = 0 ; j < 5 ; j++ ) {
			// skip if uninitialized
			if ( ! window[j] ) continue;
			// otherwise, set it
			langVector[wpos[j]] = langId;
		}
	}


	return true;
}

// 1-1 with the words!
uint8_t *XmlDoc::getLangVector ( ) {

	if ( m_langVectorValid ) {
		// can't return NULL, that means error!
		uint8_t *v = (uint8_t *)m_langVec.getBufStart();
		if ( ! v ) return (uint8_t *)0x01;
		return v;
	}

	// words
	Words *words = getWords();
	if ( ! words || words == (Words *)-1 ) return (uint8_t *)words;

	// get the sections
	Sections *ss = getSections();
	if ( ! ss || ss==(void *)-1) return (uint8_t *)ss;


	if ( ! setLangVec ( words , &m_langVec , ss ) )
		return NULL;

	m_langVectorValid = true;
	// can't return NULL, that means error!
	uint8_t *v = (uint8_t *)m_langVec.getBufStart();
	if ( ! v ) return (uint8_t *)0x01;
	return v;
}

// returns -1 and sets g_errno on error
uint8_t *XmlDoc::getLangId ( ) {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	if ( m_langIdValid ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, already valid" );
		return &m_langId;
	}
	setStatus ( "getting lang id");
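	// fallback chain: try the per-word language vector of the body
	// first, then the meta description, then the meta keywords, and
	// finally guess from the charset if all of those are inconclusive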

	// get the stuff we need
	int32_t *ip = getIp();
	if ( ! ip || ip == (int32_t *)-1 ) return (uint8_t *)ip;

	// . if we got no ip, we can't get the page...
	// . also getLinks() will call getSiteNumInlinks() which will
	//   call getSiteLinkInfo() and will core if ip is 0 or -1
	if ( *ip == 0 || *ip == -1 ) {
		m_langId = langUnknown;
		m_langIdValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, IP unknown" );
		return &m_langId;
	}

	Words    *words    = getWords   ();
	if ( ! words || words == (Words *)-1 ) {
		return (uint8_t *)words;
	}

	Sections *sections = getSections();
	// did it block?
	if ( sections==(Sections *)-1) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, invalid section" );
		return(uint8_t *)sections;
	}

	// well, it still calls Dates::parseDates which can return g_errno
	// set to EBUFOVERFLOW...
	if ( ! sections && g_errno != EBUFOVERFLOW ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, invalid section" );
		return NULL;
	}

	// if sections is still NULL, try lang id without sections;
	// reset g_errno
	g_errno = 0;

	uint8_t *lv = getLangVector();
	if ( ! lv || lv == (void *)-1 ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, invalid lang vector" );
		return (uint8_t *)lv;
	}

	setStatus ( "getting lang id");

	// compute langid from vector
	m_langId = computeLangId ( sections , words, (char *)lv );
	if ( m_langId != langUnknown ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, returning langid=%s from langVector", getLanguageAbbr(m_langId) );
		m_langIdValid = true;
		return &m_langId;
	}

	// . try the meta description i guess
	// . 99% of the time we don't need this because the above code
	//   captures the language
	int32_t mdlen;
	char *md = getMetaDescription( &mdlen );
	Words mdw;
	mdw.set ( md , mdlen , true );

	SafeBuf langBuf;
	setLangVec ( &mdw,&langBuf,NULL);
	char *tmpLangVec = langBuf.getBufStart();
	m_langId = computeLangId ( NULL , &mdw , tmpLangVec );
	if ( m_langId != langUnknown ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, returning langid=%s from metaDescription", getLanguageAbbr(m_langId) );
		m_langIdValid = true;
		return &m_langId;
	}

	// try meta keywords
	md = getMetaKeywords( &mdlen );
	mdw.set ( md , mdlen , true );

	langBuf.purge();
	setLangVec ( &mdw,&langBuf,NULL);
	tmpLangVec = langBuf.getBufStart();
	m_langId = computeLangId ( NULL , &mdw , tmpLangVec );
	if (m_langId != langUnknown) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, returning langid=%s from metaKeywords", getLanguageAbbr(m_langId));
		m_langIdValid = true;
		return &m_langId;
	}

	// try charset
	if (m_charsetValid && m_charset != csUnknown) {
		m_langId = getLangIdFromCharset(m_charset);
		if (m_langId != langUnknown) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, returning langid=%s from charset", getLanguageAbbr(m_langId));
			m_langIdValid = true;
			return &m_langId;
		}
	}

	logTrace(g_conf.m_logTraceXmlDoc, "END, returning langid=%s", getLanguageAbbr(m_langId));
	m_langIdValid = true;
	return &m_langId;
}


// lv = langVec
char XmlDoc::computeLangId ( Sections *sections , Words *words, char *lv ) {

	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;
	// this means null too
	if ( sections && sections->m_numSections == 0 ) sp = NULL;
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE;//|SEC_SELECT;

	int32_t counts [ MAX_LANGUAGES ];
	memset(counts, 0, sizeof(counts));
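	// counts[] is indexed by langid; every surviving word votes for the
	// language assigned to it in lv[] and the majority wins below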



	int32_t             nw    = words->getNumWords();
	const char * const *wptrs = words->getWordPtrs();


	// now set the langid
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if in script or style section
		if ( sp && (sp[i]->m_flags & badFlags) ) continue;
		//
		// skip if in a url
		//
		// blah/
		int32_t wlen = words->getWordLen(i);
		if ( wptrs[i][wlen] == '/' ) continue;
		// blah.blah or blah?blah
		if ( (wptrs[i][wlen] == '.' ||
		      wptrs[i][wlen] == '?' ) &&
		     is_alnum_a(wptrs[i][wlen+1]) )
			continue;
		// /blah or ?blah
		if ( (i>0 && wptrs[i][-1] == '/') ||
		     (i>0 && wptrs[i][-1] == '?')    )
			continue;
		// add it up
		counts[(unsigned char)lv[i]]++;
	}

	// get the majority count
	int32_t max = 0;
	int32_t maxi = 0;
	// skip langUnknown by starting at 1, langEnglish
	for ( int32_t i = 1 ; i < MAX_LANGUAGES ; i++ ) {
		// skip translingual
		if ( i == langTranslingual ) {
			continue;
		}
		if ( counts[i] <= max ) {
			continue;
		}
		max = counts[i];
		maxi = i;
	}

	return maxi;
}



Words *XmlDoc::getWords ( ) {
	// return it if it is set
	if ( m_wordsValid ) {
		return &m_words;
	}

	// this will set it if necessary
	Xml *xml = getXml();
	// returns NULL on error, -1 if blocked
	if ( ! xml || xml == (Xml *)-1 ) return (Words *)xml;

	// note it
	setStatus ( "getting words");

	int64_t start = logQueryTimingStart();

	// now set what we need
	if ( !m_words.set( xml, true ) ) {
		return NULL;
	}

	logQueryTimingEnd( __func__, start );

	m_wordsValid = true;
	return &m_words;
}

Bits *XmlDoc::getBits ( ) {
	// return it if it is set
	if ( m_bitsValid ) return &m_bits;

	// this will set it if necessary
	Words *words = getWords();
	// returns NULL on error, -1 if blocked
	if ( ! words || words == (Words *)-1 ) return (Bits *)words;

	int64_t start = logQueryTimingStart();

	// now set what we need
	if ( !m_bits.set(words))
		return NULL;

	logQueryTimingEnd( __func__, start );

	// we got it
	m_bitsValid = true;
	return &m_bits;
}

Bits *XmlDoc::getBitsForSummary ( ) {
	// return it if it is set
	if ( m_bits2Valid ) return &m_bits2;

	// this will set it if necessary
	Words *words = getWords();
	// returns NULL on error, -1 if blocked
	if ( ! words || words == (Words *)-1 ) return (Bits *)words;

	int64_t start = logQueryTimingStart();

	// now set what we need
	if ( ! m_bits2.setForSummary ( words ) ) return NULL;

	logQueryTimingEnd( __func__, start );

	// we got it
	m_bits2Valid = true;
	return &m_bits2;
}

Pos *XmlDoc::getPos ( ) {
	// return it if it is set
	if ( m_posValid ) return &m_pos;

	// this will set it if necessary
	Words *ww = getWords();
	if ( ! ww || ww == (Words *)-1 ) return (Pos *)ww;

	int64_t start = logQueryTimingStart();

	if ( ! m_pos.set ( ww ) ) return NULL;

	logQueryTimingEnd( __func__, start );

	// we got it
	m_posValid = true;
	return &m_pos;
}

Phrases *XmlDoc::getPhrases ( ) {
	// return it if it is set
	if ( m_phrasesValid ) {
		return &m_phrases;
	}

	// this will set it if necessary
	Words *words = getWords();
	// returns NULL on error, -1 if blocked
	if ( ! words || words == (Words *)-1 ) return (Phrases *)words;

	// get this
	Bits *bits = getBits();
	// bail on error
	if ( ! bits ) return NULL;

	int64_t start = logQueryTimingStart();

	// now set what we need
	if ( !m_phrases.set( words, bits ) ) {
		return NULL;
	}

	logQueryTimingEnd( __func__, start );

	// we got it
	m_phrasesValid = true;
	return &m_phrases;
}


Sections *XmlDoc::getSections ( ) {
	// return it if it is set
	if ( m_sectionsValid ) return &m_sections;

	setStatus ( "getting sections" );

	// use the old title rec to make sure we parse consistently!
	XmlDoc **pod = getOldXmlDoc ( );
	if ( ! pod || pod == (XmlDoc **)-1 ) return (Sections *)pod;

	Words *words = getWords();
	// returns NULL on error, -1 if blocked
	if ( ! words || words == (Words *)-1 ) return (Sections *)words;

	// get this
	Bits *bits = getBits();
	// bail on error
	if ( ! bits ) return NULL;

	// the docid
	int64_t *d = getDocId();
	if ( ! d || d == (int64_t *)-1 ) return (Sections *)d;

	// get the content type
	uint8_t *ct = getContentType();
	if ( ! ct ) return NULL;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	setStatus ( "getting sections");

	int64_t start = logQueryTimingStart();

	// this uses the sectionsReply to see which sections are "text", etc.
	// rather than compute it expensively
	if ( !m_calledSections &&
		 !m_sections.set( &m_words, bits, getFirstUrl(), cr->m_coll, *ct ) ) {
		m_calledSections = true;
		// it blocked, return -1
		return (Sections *) -1;
	}

	// error? maybe ENOMEM
	if ( g_errno ) return NULL;

	// set inlink bits
	m_bits.setInLinkBits ( &m_sections );

	logQueryTimingEnd( __func__, start );

	// we got it
	m_sectionsValid = true;
	return &m_sections;
}

int32_t *XmlDoc::getLinkSiteHashes ( ) {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	if ( m_linkSiteHashesValid )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, already valid" );
		return (int32_t *)m_linkSiteHashBuf.getBufStart();
	}

	// get the outlinks
	Links *links = getLinks();
	if ( ! links || links == (Links *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getLinks returned -1" );
		return (int32_t *)links;
	}

	// . get the outlink tag rec vector
	// . each link's tagrec may have a "site" tag that is basically
	//   the cached SiteGetter::getSite() computation
	TagRec ***grv = NULL;
	if ( ! m_setFromTitleRec ) {
		logTrace( g_conf.m_logTraceXmlDoc, "!m_setFromTitleRec, calling getOutlinkTagRecVector" );
		grv = getOutlinkTagRecVector();
		if ( ! grv || grv == (void *)-1 )
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END, getOutlinkTagRecVector returned -1" );
			return (int32_t *)grv;
		}
	}

	// how many outlinks do we have on this page?
	int32_t n = links->getNumLinks();
	logTrace( g_conf.m_logTraceXmlDoc, "%" PRId32" outlinks found on page", n);

	// reserve space
	m_linkSiteHashBuf.purge();
	if ( ! m_linkSiteHashBuf.reserve ( n * 4 ) )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, m_linkSiteHashBuf.reserve failed" );
		return NULL;
	}

	if ( n == 0 ) {
		ptr_linkdbData = NULL;
		size_linkdbData = 0;
		logTrace( g_conf.m_logTraceXmlDoc, "END, no outlinks" );
		return (int32_t *)0x1234;
	}

	// if set from titlerec then assume each site is the full hostname
	// of the link, unless it is specified explicitly in the
	// (outlink index, siteHash32) pairs serialized in ptr_linkdbData
	if ( m_setFromTitleRec ) {
		// this holds the sites that are not just the hostname
		int32_t *p = (int32_t *)ptr_linkdbData;
		int32_t *pend = (int32_t *)(ptr_linkdbData + size_linkdbData);
		// loop over links
		for ( int32_t i = 0 ; i < n ; i++ ) {
			// get the link
			char *u = links->getLinkPtr(i);
			// assume site is just the host
			int32_t hostLen = 0;
			const char *host = ::getHost ( u , &hostLen );
			int32_t siteHash32 = hash32 ( host , hostLen , 0 );
			// unless given otherwise in ptr_linkdbData
			if ( p < pend && *p == i ) {
				p++;
				siteHash32 = *p;
				p++;
			}
			// store that then. should not fail since we allocated
			// right above
			if ( ! m_linkSiteHashBuf.pushLong(siteHash32) ) {
				g_process.shutdownAbort(true); }
		}
		// return ptr of array, which is a safebuf
		logTrace( g_conf.m_logTraceXmlDoc, "END, m_setFromTitleRec. Returning link list" );
		return (int32_t *)m_linkSiteHashBuf.getBufStart();
	}

	// ptr_linkdbData will point into this buf
	m_linkdbDataBuf.purge();
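	// m_linkdbDataBuf holds (outlink index, siteHash32) int32_t pairs,
	// stored only for outlinks whose site hash differs from the hash of
	// their plain hostname; the m_setFromTitleRec branch above reads
	// this same format back out of ptr_linkdbData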

	// loop through them
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// get the link
		char *u = links->getLinkPtr(i);
		// get full host from link
		int32_t hostLen = 0;
		const char *host = ::getHost ( u , &hostLen );
		int32_t hostHash32 = hash32 ( host , hostLen , 0 );
		// get the site
		TagRec *gr = (*grv)[i];
		const char *site = NULL;
		int32_t  siteLen = 0;
		if ( gr ) {
			int32_t dataSize = 0;
			site = gr->getString("site",NULL,&dataSize);
			if ( dataSize ) siteLen = dataSize - 1;
		}
		// otherwise, make it the host or make it cut off at
		// a "/user/" or "/~xxxx" or whatever path component
		if ( ! site ) {
			// GUESS link site... like /~xxx
			site    = host;
			siteLen = hostLen;
		}
		int32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 );
		// only store if different from the host hash itself
		if ( linkeeSiteHash32 != hostHash32 ) {
			if ( ! m_linkdbDataBuf.pushLong(i) )
			{
				logTrace( g_conf.m_logTraceXmlDoc, "END, could not store in buffer (1)" );
				return NULL;
			}
			if ( ! m_linkdbDataBuf.pushLong(linkeeSiteHash32) )
			{
				logTrace( g_conf.m_logTraceXmlDoc, "END, could not store in buffer (2)" );
				return NULL;
			}
		}
		// store it always in this buf
		if ( ! m_linkSiteHashBuf.pushLong(linkeeSiteHash32) ) {
			// space should have been reserved above!
			g_process.shutdownAbort(true); }
	}
	// set ptr_linkdbData
	ptr_linkdbData  = m_linkdbDataBuf.getBufStart();
	size_linkdbData = m_linkdbDataBuf.length();
	m_linkSiteHashesValid = true;

	logTrace( g_conf.m_logTraceXmlDoc, "END, returning list" );
	return (int32_t *)m_linkSiteHashBuf.getBufStart();
}



Links *XmlDoc::getLinks ( bool doQuickSet ) {
	if ( m_linksValid ) return &m_links;
	// set status
	setStatus ( "getting outlinks");

	// this will set it if necessary
	Xml *xml = getXml();

	// bail on error
	if ( ! xml || xml == (Xml *)-1 ) return (Links *)xml;

	// can't call getIsPermalink() here without entering a dependency loop
	char *pp = getIsUrlPermalinkFormat();
	if ( !pp || pp == (char *)-1 ) return (Links *)pp;

	// use the old xml doc
	XmlDoc **od = getOldXmlDoc ( );
	if ( ! od || od == (XmlDoc **)-1 ) return (Links *)od;

	// get Links class of the old title rec
	Links *oldLinks = NULL;

	// if we were set from a title rec, do not do this
	if ( *od ) {
		oldLinks = (*od)->getLinks();
		if (!oldLinks||oldLinks==(Links *)-1) return (Links *)oldLinks;
	}

	Url *baseUrl = getBaseUrl();
	if ( ! baseUrl || baseUrl==(Url *)-1) return (Links *)baseUrl;

	int32_t *ip = getIp();
	if ( ! ip || ip == (int32_t *)-1 ) return (Links *)ip;

	// this ensures m_contentLen is set
	//char **content = getContent();
	//if ( ! content || content == (char **)-1 ) return (Links *)content;

	char *ict = getIsContentTruncated();
	if ( ! ict || ict == (char *)-1 ) return (Links *)ict;

	int32_t *sni = getSiteNumInlinks();
	if ( ! sni || sni == (int32_t *)-1 ) return (Links *)sni;

	// get the latest url we are on
	Url *u = getCurrentUrl();

	//
	// if we had a EDOCSIMPLIFIEDREDIR error, pretend it is a link
	// so addOutlinkSpiderRecsToMetaList() will add it to spiderdb
	//
	if ( m_indexCodeValid && m_indexCode == EDOCSIMPLIFIEDREDIR ) {
		m_links.set(m_redirUrl.getUrl());
		m_linksValid = true;
		return &m_links;
	}

	if ( m_indexCodeValid && m_indexCode == EDOCNONCANONICAL ) {
		m_links.set(m_canonicalRedirUrl.getUrl());
		m_linksValid = true;
		return &m_links;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) {
		return NULL;
	}

	bool useRelNoFollow = true;
	if ( ! cr->m_obeyRelNoFollowLinks ) {
		useRelNoFollow = false;
	}

	// . set it
	// . if parent is a permalink we can avoid its suburl outlinks
	//   containing "comment" from being classified as permalinks
	if ( ! m_links.set ( useRelNoFollow ,
			     xml         ,
			     u           ,
			     baseUrl     ,
			     m_version   ,
			     *pp         , // parent url in permalink format?
			     oldLinks    ,// oldLinks, might be NULL!
			     doQuickSet ) )
		return NULL;

	m_linksValid = true;

	// do not bother setting that bit if we are being called for link
	// text because that bit was already in the linkdb key, and it
	// was set to zero! so if getting msg20 reply.... bail now
	if ( m_req ) return &m_links;

	// . apply link spam settings
	// . set the "spam bits" in the Links class
	setLinkSpam ( *ip                ,
		      u                  , // linker url
		      *sni               ,
		      xml                ,
		      &m_links           ,
		      *ict               );
	// we got it
	return &m_links;
}
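
// Illustrative sketch (not compiled): the return convention used by getLinks()
// and the rest of the getters in this class -- NULL means error (g_errno is
// set), a pointer value of -1 means "blocked, a callback will re-enter us
// later", and anything else is the finished result. Callers propagate both
// sentinels by casting, exactly as the call sites above do. isErrorOrBlocked()
// is a hypothetical helper, not part of this class.
#if 0
template < class T >
static bool isErrorOrBlocked ( const T *p ) {
	return p == (const T *)0 || p == (const T *)-1;
}

// typical call site, as seen throughout getLinks() above:
//
//	Xml *xml = getXml();
//	if ( ! xml || xml == (Xml *)-1 ) return (Links *)xml; // pass it along
#endif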


HashTableX *XmlDoc::getCountTable ( ) {
	// return it if we got it
	if ( m_countTableValid ) return &m_countTable;

	setStatus ("getting count table");

	// get the stuff we need
	Xml      *xml      = getXml     ();
	if ( ! xml || xml == (Xml *)-1 ) return (HashTableX *)xml;
	Words    *words    = getWords   ();
	if ( ! words || words == (Words *)-1 ) return (HashTableX *)words;
	Phrases  *phrases  = getPhrases ();
	if ( ! phrases || phrases==(Phrases *)-1) return (HashTableX *)phrases;
	Bits     *bits     = getBits    ();
	if ( ! bits || bits == (Bits *)-1 ) return (HashTableX *)bits;
	Sections *sections = getSections();
	if ( !sections||sections==(Sections *)-1) return(HashTableX *)sections;
	LinkInfo *info1    = getLinkInfo1();
	if ( ! info1 || info1 == (LinkInfo *)-1 ) return (HashTableX *)info1;

	// . reduce score of words in badly repeated fragments to 0 so we do
	//   not count them here!
	// . ff[i] will have score of 0 if in repeated frag
	// . make sure this is stored for whole doc... since we only use it
	//   for the body
	char *fv = getFragVec();
	if ( ! fv || fv == (void *)-1 ) return (HashTableX *)fv;

	//
	// this was in Weights.cpp, but now it is here...
	//

	// shortcut
	HashTableX *ct = &m_countTable;

	// ez var
	const nodeid_t *tids  = words->getTagIds();
	int32_t   nw    = words->getNumWords   ();
	const int64_t  *pids  = phrases->getPhraseIds2();

	// add 5000 slots for inlink text in hashString_ct() calls below
	int32_t numSlots = nw * 3 + 5000;
	// only alloc for this one if not provided
	if (!ct->set(8,4,numSlots,NULL,0,false,"xmlct"))
	  return (HashTableX *)NULL;

	// . now hash all the phrase ids we have in order to see if the phrase
	//   is unique or not. if phrase is repeated a lot we punish the scores
	//   of the individual words in the phrase and boost the score of the
	//   phrase itself. We check for uniqueness down below.
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// add the word
		int64_t wid = words->getWordId(i);
		if ( wid == 0LL )
			continue;

		// . skip if in repeated fragment
		// . unfortunately we truncate the frag vec to like
		//   the first 80,000 words for performance reasons
		if ( i < MAXFRAGWORDS && fv[i] == 0 ) continue;
		// accumulate the wid with a score of 1 each time it occurs
		if ( ! ct->addTerm(wid) ) return (HashTableX *)NULL;
		// skip if word #i does not start a phrase
		if ( ! pids [i] ) continue;
		// if phrase score is less than 100% do not consider as a
		// phrase so that we do not phrase "albuquerque, NM" and stuff
		// like that... in fact, we can only have a space here...
		const char *wptr = words->getWord(i+1);
		if ( wptr[0] == ',' ) continue;
		if ( wptr[1] == ',' ) continue;
		if ( wptr[2] == ',' ) continue;
		// put it in, accumulate, max score is 0x7fffffff
		if ( ! ct->addTerm(pids[i]) ) return (HashTableX *)NULL;
	}

	// now add each meta tag to the pot
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if not a meta tag
		if ( tids[i] != TAG_META ) continue;
		// find the "content=" word
		const char *w    = words->getWord(i);
		int32_t  wlen = words->getWordLen(i);
		const char *wend = w + wlen;
		const char *p = strncasestr  (w,wlen,"content=");
		// skip if we did not have any content in this meta tag
		if ( ! p ) continue;
		// skip the "content="
		p += 8;
		// skip if empty meta content
		if ( wend - p <= 0 ) continue;

		// do our own hash of the meta content
		//const_cast because hashString_ct calls Words::set and that is still not const-sane
		if ( ! hashString_ct ( ct , const_cast<char*>(p) , wend - p ) )
			return (HashTableX *)NULL;
	}
	// add each incoming link text
	for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) {
		// shortcuts
		char *p;
		int32_t  plen;
		// hash link text (was hashPwids())
		p    = k-> getLinkText();
		plen = k->size_linkText - 1;
		if ( ! verifyUtf8 ( p , plen ) ) {
			log("xmldoc: bad link text 3 from url=%s for %s",
			    k->getUrl(),m_firstUrl.getUrl());
			continue;
		}
		if ( ! hashString_ct ( ct , p , plen ) )
		  return (HashTableX *)NULL;
		// hash this stuff (was hashPwids())
		p    = k->getSurroundingText();
		plen = k->size_surroundingText - 1;
		if ( ! hashString_ct ( ct , p , plen ) )
		  return (HashTableX *)NULL;
	}

	// we got it
	m_countTableValid = true;
	return &m_countTable;
}

// . a special function used by XmlDoc::getCountTable() above
// . kinda similar to XmlDoc::hashString()
bool XmlDoc::hashString_ct ( HashTableX *ct , char *s , int32_t slen ) {

	Words   words;
	Bits    bits;
	Phrases phrases;
	if ( ! words.set   ( s , slen , true ) )
		return false;
	if ( !bits.set(&words))
		return false;
	if ( !phrases.set( &words, &bits ) )
		return false;
	int32_t nw = words.getNumWords();
	const int64_t  *pids  = phrases.getPhraseIds2();

	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// add the word
		int64_t wid = words.getWordId(i);
		if ( wid == 0LL ) continue;
		// skip if in repeated fragment
		// . NO, we do not use this for these short strings
		//if ( ww[i] == 0 ) continue;
		// accumulate the wid with a score of 1 each time it occurs
		if ( ! ct->addTerm(wid) ) return false;
		// skip if word #i does not start a phrase
		if ( ! pids [i] ) continue;
		// if phrase score is less than 100% do not consider as a
		// phrase so that we do not phrase "albuquerque, NM" and stuff
		// like that... in fact, we can only have a space here...
		if ( i+1<nw ) {
			const char *wptr = words.getWord(i+1);
			if ( wptr[0] == ',' ) continue;
			int32_t wlen = words.getWordLen(i);
			if ( wlen>=2 && wptr[1] == ',' ) continue;
			if ( wlen>=3 && wptr[2] == ',' ) continue;
		}
		// put it in, accumulate, max score is 0x7fffffff
		if ( ! ct->addTerm(pids[i]) ) return false;
	}
	return true;
}
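
// Illustrative sketch (not compiled): the essence of the count table built by
// getCountTable()/hashString_ct() above is "tally every word hash and every
// adjacent two-word phrase hash" across the body, meta tag contents and
// inlink text. The helper names and the trivial tokenizer below are
// hypothetical stand-ins for Words/Phrases and only use standard C++.
#if 0
#include <cstdint>
#include <string>
#include <sstream>
#include <vector>
#include <unordered_map>

static uint64_t toyHash ( const std::string &s ) {
	// stand-in for the 64-bit word ids produced by Words::getWordId()
	uint64_t h = 1469598103934665603ULL;            // FNV-1a
	for ( unsigned char c : s ) { h ^= c; h *= 1099511628211ULL; }
	return h;
}

static void tallyWordsAndPhrases ( const std::string &text ,
				   std::unordered_map<uint64_t,int32_t> &ct ) {
	std::istringstream in ( text );
	std::vector<std::string> words;
	for ( std::string w ; in >> w ; ) words.push_back ( w );
	for ( size_t i = 0 ; i < words.size() ; i++ ) {
		// count the word itself
		ct[ toyHash ( words[i] ) ]++;
		// count the two-word phrase starting here, if any
		if ( i + 1 < words.size() )
			ct[ toyHash ( words[i] + " " + words[i+1] ) ]++;
	}
}

// usage: after tallying the body, meta tags and inlink text the same way,
// ct[hash] says how often a word or phrase occurred, which is what the later
// scoring code consults.
#endif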

static int cmp(const void *h1, const void *h2);

// vector components are 32-bit hashes
int32_t *XmlDoc::getTagPairHashVector ( ) {

	if ( m_tagPairHashVecValid ) return m_tagPairHashVec;

	Xml      *xml      = getXml     ();
	if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml;

	// store the hashes here
	uint32_t hashes [ 2000 ];
	int32_t          nh = 0;
	// go through each node
	XmlNode *nodes = xml->getNodes    ();
	int32_t   n       = xml->getNumNodes ();

	// start with the ith node
	int32_t i = 0;

	uint32_t saved = 0;
	uint32_t lastHash = 0;
	// loop over the nodes
	for ( ; i < n ; i++ ) {
		// skip NON tags
		if ( ! nodes[i].isTag() ) continue;
		// use the tag id as the hash, it's unique
		uint32_t h = hash32h ( nodes[i].getNodeId() , 0 );
		// ensure hash is not 0, that has special meaning
		if ( h == 0 ) h = 1;
		// store in case we have only one hash
		saved = h;

		// if we are the first, set this
		if ( ! lastHash ) {
			lastHash = h;
			continue;
		}

		// if they were the same do not xor, they will zero out
		if ( h == lastHash ) hashes[nh++] = h;
		// incorporate it into the last hash
		else                 hashes[nh++] = h ^ lastHash;

		// we are the new last hash
		lastHash = h;
		// bust out if no room
		if ( nh >= 2000 ) break;
	}
	// if we only saw one tag (so no pair was formed), use its hash
	if ( nh == 0 && saved ) hashes[nh++] = saved;

	// . TODO: remove the link text hashes here?
	// . because will probably be identical..
	// . now sort hashes to get the top MAX_PAIR_HASHES
	gbsort ( hashes , nh , 4 , cmp );

	// uniquify them (they are sorted, so dups are adjacent)
	int32_t d = 0;
	for ( int32_t j = 1 ; j < nh ; j++ ) {
		if ( hashes[j] == hashes[d] ) continue;
		hashes[++d] = hashes[j];
	}

	// "d" indexes the last unique hash, so the unique count is d+1
	if ( nh > 0 ) nh = d + 1;
	// truncate to MAX_PAIR_HASHES MINUS 1 so we can put a 0 at the end
	if ( nh > MAX_TAG_PAIR_HASHES-1 ) nh = MAX_TAG_PAIR_HASHES-1;
	// store the top MAX_PAIR_HASHES
	gbmemcpy ( m_tagPairHashVec , hashes , nh * 4 );
	// null term it. all vectors need this so computeSimilarity() works
	m_tagPairHashVec [ nh++ ] = 0;
	m_tagPairHashVecValid = true;
	m_tagPairHashVecSize = nh * 4;
	return m_tagPairHashVec;
}

// sort in descending order
static int cmp(const void *h1, const void *h2) {
	uint32_t a = *(const uint32_t *)h1;
	uint32_t b = *(const uint32_t *)h2;
	// compare explicitly; subtracting the unsigned values can wrap around
	// and give the wrong sign once cast down to int
	if ( a < b ) return  1;
	if ( a > b ) return -1;
	return 0;
}
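
// Illustrative sketch (not compiled): the shape of the vector built by
// getTagPairHashVector() above -- hash each adjacent pair of tag ids, sort
// descending, drop duplicates, truncate and 0-terminate so the vector can be
// fed to computeSimilarity(). makePairVector() and mixHash() are hypothetical
// names; only standard C++ is used.
#if 0
#include <cstdint>
#include <vector>
#include <algorithm>
#include <functional>

static uint32_t mixHash ( uint32_t v , uint32_t seed ) {
	// stand-in for hash32h(); any decent 32-bit mix works for the sketch
	uint32_t h = v * 2654435761u ^ seed;
	return h ? h : 1;                  // never 0, 0 terminates the vector
}

static std::vector<uint32_t> makePairVector ( const std::vector<uint32_t> &tagIds ,
					      size_t maxHashes ) {
	std::vector<uint32_t> pairs;
	for ( size_t i = 0 ; i + 1 < tagIds.size() ; i++ ) {
		uint32_t a = mixHash ( tagIds[i]   , 0 );
		uint32_t b = mixHash ( tagIds[i+1] , 0 );
		pairs.push_back ( a == b ? a : (a ^ b) );  // avoid xor-to-zero
	}
	// sort descending, then uniquify
	std::sort ( pairs.begin() , pairs.end() , std::greater<uint32_t>() );
	pairs.erase ( std::unique ( pairs.begin(), pairs.end() ) , pairs.end() );
	// keep room for the 0 terminator (assumes maxHashes >= 1)
	if ( pairs.size() >= maxHashes ) pairs.resize ( maxHashes - 1 );
	pairs.push_back ( 0 );
	return pairs;
}
#endif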

// . m_tagVector.setTagPairHashes(&m_xml, niceness);
// . Sections.cpp and getIsDup() both use this hash
// . returns NULL and sets g_errno on error
// . xors all the unique adjacent tag hashes together
// . kind of represents the template the web pages uses
// . we add this to sectiondb as a vote in Sections::addVotes()
uint32_t *XmlDoc::getTagPairHash32 ( ) {

	// only compute once
	if ( m_tagPairHash32Valid ) return &m_tagPairHash32;

	Words *words = getWords();
	if ( ! words || words == (Words *)-1 ) return (uint32_t *)words;

	// shortcuts
	//int64_t *wids  = words->getWordIds  ();
	const nodeid_t *tids  = words->getTagIds();
	int32_t           nw  = words->getNumWords();
	int32_t           nt  = words->getNumTags();

	// . get the hash of all the tag pair hashes!
	// . we then combine that with our site hash to get our site specific
	//   html template termid
	// . put all tag pairs into a hash table
	// . similar to Vector::setTagPairHashes() but we do not compute a
	//   vector, just a single scalar/hash of 32 bits, m_termId
	HashTableX tp; // T<int64_t,char> tp;
	if ( ! tp.set ( 4 , 1 , nt * 4  , NULL , 0 , true,"xmltp"))
		return NULL;
	uint32_t lastTid = 0;
	char val = 1;
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if not tag
		if ( tids[i] == 0LL ) continue;
		// skip if back tag
		if ( tids[i] & BACKBIT ) continue;
		// get last tid
		uint32_t h = hash32h ( tids[i] , lastTid );
		//logf(LOG_DEBUG,"build: tph %" PRId32" h=%" PRIu64,i,(int64_t)h);
		// . add to table (skip if 0, means empty bucket)
		// . return NULL and set g_errno on error
		if ( h && ! tp.addKey ( &h , &val ) ) return NULL;
		// update this
		lastTid = h;
	}
	// linear scan on hash table to get all the hash, XOR together
	uint32_t hx = 0;
	int32_t nb = tp.getNumSlots();
	char *flags = tp.m_flags;
	// get keys
	uint32_t *keys = (uint32_t *)tp.m_keys;
	for ( int32_t i = 0 ; i < nb ; i++ ) {
		// skip if empty
		if ( flags[i] == 0 ) continue;
		// skip if empty
		//if ( keys[i] == 0LL ) continue;
		// incorporate
		hx ^= keys[i];
	}
	// never return 0, make it 1. 0 means an error
	if ( hx == 0 ) hx = 1;
	// set the hash
	m_tagPairHash32 = hx ;
	// it is now valid
	m_tagPairHash32Valid = true;
	return &m_tagPairHash32;
}
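
// Illustrative sketch (not compiled): getTagPairHash32() above reduces the
// same pair hashes to a single 32-bit scalar by deduping them and xor-ing the
// unique values together, giving an order-independent "template" fingerprint.
// templateHash32() is a hypothetical name.
#if 0
#include <cstdint>
#include <vector>
#include <unordered_set>

static uint32_t templateHash32 ( const std::vector<uint32_t> &pairHashes ) {
	std::unordered_set<uint32_t> uniq ( pairHashes.begin(), pairHashes.end() );
	uint32_t h = 0;
	for ( uint32_t v : uniq ) h ^= v;   // order-independent combine
	return h ? h : 1;                   // 0 is reserved for "error"
}
#endif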

// . used for deduping search results
// . also uses the title
int32_t *XmlDoc::getSummaryVector ( ) {
	if ( m_summaryVecValid ) return (int32_t *)m_summaryVec;
	Summary *s = getSummary();
	if ( ! s || s == (Summary *)-1 ) return (int32_t *)s;
	Title *ti = getTitle();
	if ( ! ti || ti == (Title *)-1 ) return (int32_t *)ti;

	int64_t start = logQueryTimingStart();

	// store title and summary into "buf" so we can call words.set()
	SafeBuf sb;

	// put title into there
	int32_t tlen = ti->getTitleLen() - 1;
	if ( tlen < 0 ) tlen = 0;

	// put summary into there
	int32_t slen = s->getSummaryLen();

	// allocate space
	int32_t need = tlen + 1 + slen + 1;
	if ( ! sb.reserve ( need ) ) return NULL;

	sb.safeMemcpy ( ti->getTitle() , tlen );

	// space separating the title from the summary
	if ( tlen > 0 ) sb.pushChar(' ');

	sb.safeMemcpy ( s->getSummary() , slen );

	// null terminate it
	//sb.nullTerm();
	//workaround for truncation causing a multibyte utf8 character to be
	//split and then text parsing traversing past the defined bytes.
	sb.nullTerm4();

	// word-ify it
	Words words;
	if ( ! words.set ( sb.getBufStart() , true ) ) {
		return NULL;
	}

	// . now set the dedup vector from big summary and title
	// . store sample vector in here
	// . returns size in bytes including null terminating int32_t
	m_summaryVecSize = computeVector ( &words , (uint32_t *)m_summaryVec );

	logQueryTimingEnd(__func__, start);

	m_summaryVecValid = true;
	return m_summaryVec;
}


// used by getIsDup() and Dates.cpp for detecting dups and for
// seeing if the content changed respectively
int32_t *XmlDoc::getPageSampleVector ( ) {
	if ( m_pageSampleVecValid ) return m_pageSampleVec;
	Words *ww = getWords();
	if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww;
	m_pageSampleVecSize = computeVector( ww, (uint32_t *)m_pageSampleVec );
	m_pageSampleVecValid = true;
	return m_pageSampleVec;
}

// . this is the vector of the words right after the hypertext for the link
//   we are voting on.
// . it is used to dedup voters in Msg25.cpp
int32_t *XmlDoc::getPostLinkTextVector ( int32_t linkNode ) {

	if ( m_postVecValid ) return m_postVec;
	// assume none
	m_postVecSize = 0;

	// set up
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml;
	Words *ww = getWords();
	if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww;

	// sanity check
	if ( linkNode < 0 ) { g_process.shutdownAbort(true); }

	// linkNode starts pointing to a <a> tag so skip over that!
	linkNode++;
	// limit
	int32_t     nn    = xml->getNumNodes();
	XmlNode *nodes = xml->getNodes();
	// and advance linkNode to the next anchor tag thereafter; we do not
	// want to include link text in this vector because it is usually
	// repeated and will skew our "similarities"
	for ( ; linkNode < nn ; linkNode++ ) {
		// stop if we hit </a> or <a>
		if ( (nodes[linkNode].m_nodeId & BACKBITCOMP) != TAG_A ) continue;
		// advance over the </a> or <a>
		linkNode++;
		// then stop, we will start gathering link text here
		break;
	}
	// if we hit the end of the doc, we have no vector then
	if ( linkNode >= nn ) return m_postVec;

	// now convert the linkNode # to a word #, "start"
	int32_t          nw   = ww->getNumWords();
	const int64_t   *wids = ww->getWordIds();
	const nodeid_t  *tids = ww->getTagIds();
	const int32_t   *wn   = ww->getNodes();
	int32_t       i    = 0;
	for ( ; i < nw ; i++ ) {
		// stop when we got the first word in this node #
		if ( wn[i] == linkNode ) break;
	}
	// if none, bail now, size is 0
	if ( i >= nw ) return m_postVec;
	// save that
	int32_t start = i;

	// likewise, set the end of it
	int32_t end = nw;
	// count alnum words
	int32_t count = 0;
	// limit it
	for ( i = start ; i < nw && count < 35 ; i++ ) {
		// get tag id
		nodeid_t tid = tids[i] & BACKBITCOMP;
		// stop if certain ones
		if ( tid     == TAG_TABLE ) break;
		if ( tid     == TAG_UL    ) break;
		// <a>, </a> is ok
		if ( tids[i] == TAG_A     ) break;
		// only up to 35 words allowed in the hash
		if ( wids[i] ) count++;
	}
	// set the end of the words to hash
	end = i;

	// specify starting node # now
	m_postVecSize = computeVector( ww, (uint32_t *)m_postVec, start, end );

	// return what we got
	return m_postVec;
}

// . was kinda like "m_tagVector.setTagPairHashes(&m_xml, niceness);"
// . this is used by getIsDup() (below)
// . this is used by Dates.cpp to see how much a doc has changed
// . this is also now used for getting the title/summary vector for deduping
//   search results
// . if we couldn't extract a good pub date for the doc, and it has changed
//   since last spidered, use the bisection method to come up with our own
//   "last modified date" which we use as the pub date.
// . this replaces the clusterdb.getSimilarity() logic in Msg14.cpp used
//   to do the same thing. but we call Vector::setForDates() from
//   Dates.cpp. that way the logic is more contained in Dates!
// . doesn't Msg14 already do that?
// . yes, but it uses two TermTables and calls Clusterdb::getSimilarity()
// . returns false and sets g_errno on error
// . these words classes should have been set by a call to Words::set(Xml *...)
//   so that we have "tids1" and "tids2"

// . returns NULL and sets g_errno on error
// . TODO: if our title rec is non-empty consider getting it from that
// . we use this vector to compare two docs to see how similar they are
int32_t XmlDoc::computeVector( Words *words, uint32_t *vec, int32_t start, int32_t end ) {
	// assume empty vector
	vec[0] = 0;

	// shortcuts
	int32_t       nw     = words->getNumWords();
	const int64_t *wids  = words->getWordIds();

	// set the end to the real end if it was specified as less than zero
	if ( end < 0 ) end = nw;

	// # of alnum words in the doc (tags and punctuation "words" are
	// not counted by getNumAlnumWords())
	int32_t count = words->getNumAlnumWords();

	// . Get sample vector from content section only.
	// . This helps remove duplicate menu/ad from vector

	// 4 bytes per hash, save the last one for a NULL terminator, 0 hash
	int32_t maxTerms = SAMPLE_VECTOR_SIZE / 4  - 1;
	// what portion of them do we want to mask out from the rest?
	int32_t ratio = count / maxTerms ;
	// a mask of 0 means to keep them all
	unsigned char mask = 0x00;
	// each pass of this loop sets one more high bit in the mask, which
	// filters out roughly half of the remaining termIds. we aim for about
	// twice as many terms as we need, so only start masking once we have
	// 4x or more, and keep halving from there...
	while ( ratio >= 4 ) {
		// shift the mask down, ensure hi bit is set
		mask >>= 1;
		mask |= 0x80;
		ratio >>= 1; // /2
	}

	// store vector into "d" for now. will sort below
	uint32_t d [ 3000 ];

	// dedup our vector using this hashtable, "ht"
	char hbuf[3000*6*2];
	HashTableX ht;
	if ( ! ht.set(4,0,3000,hbuf,3000*6*2,false,"xmlvecdedup")){
		g_process.shutdownAbort(true);}

 again:
	// a buffer to hold the top termIds
	int32_t nd = 0;
	// count how many we mask out
	int32_t mo = 0;
	// . buffer should have at least "maxTerms" in it
	// . these should all be 12 byte keys
	for ( int32_t i = start ; i < end ; i++ ) {
		// skip if not alnum word
		if ( wids[i] == 0 ) continue;

		// skip if mask filters it
		if ( ((wids[i]>>(NUMTERMIDBITS-8)) & mask)!=0) {mo++;continue;}

		// make 32 bit
		uint32_t wid32 = (uint32_t)wids[i];

		// do not add if we already got it
		if ( ht.getSlot ( &wid32 ) >= 0 ) continue;

		// add to hash table. return NULL and set g_errno on error
		if ( ! ht.addKey (&wid32 )){g_process.shutdownAbort(true); }

		// add it to our vector
		d[nd] = (uint32_t)wids[i];

		// stop after 3000 for sure
		if ( ++nd < 3000 ) continue;

		// bitch and break out on error
		log(LOG_INFO,"build: Sample vector overflow. Slight performance hit.");
		break;
	}

	// . if nd came up short and the mask filtered some terms out,
	//   make the mask less restrictive and try again
	if ( nd < maxTerms && mask && mo ) {
		// shift the mask UP, allow more termIds to pass through
		mask <<= 1;
		// reset hash table since we are starting over
		ht.clear();
		goto again;
	}

	// bubble sort them
	bool flag = true;
	while ( flag ) {
		flag = false;
		for ( int32_t i = 1 ; i < nd ; i++ ) {
			if ( d[i-1] <= d[i] ) continue;
			uint32_t tmp = d[i-1];
			d[i-1] = d[i];
			d[i]   = tmp;
			flag   = true;
		}
	}

	// truncate
	if ( nd > maxTerms ) nd = maxTerms;
	// null terminate
	d [ nd++ ] = 0;
	// store in our sample vector
	gbmemcpy ( vec , d , nd * 4 );
	// return size in bytes
	return nd * 4;
}
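
// Illustrative sketch (not compiled): the down-sampling idea in
// computeVector() above -- keep only wordids whose high bits are zero under a
// mask, and widen the mask until enough terms survive; then dedup, sort,
// truncate and 0-terminate. sampleVector() is a hypothetical name; only
// standard C++ is used and the 48-bit termid width mirrors NUMTERMIDBITS.
#if 0
#include <cstdint>
#include <vector>
#include <algorithm>
#include <unordered_set>

static std::vector<uint32_t> sampleVector ( const std::vector<uint64_t> &wids ,
					    size_t maxTerms ) {
	unsigned char mask = 0x00;
	// start with a restrictive mask if we have many more words than slots
	for ( size_t ratio = wids.size() / (maxTerms ? maxTerms : 1) ;
	      ratio >= 4 ; ratio >>= 1 ) {
		mask >>= 1; mask |= 0x80;
	}
	std::vector<uint32_t> out;
	for ( ;; ) {
		std::unordered_set<uint32_t> seen;
		out.clear();
		for ( uint64_t w : wids ) {
			if ( ((w >> (48 - 8)) & mask) != 0 ) continue; // masked out
			uint32_t lo = (uint32_t)w;                     // low 32 bits
			if ( seen.insert ( lo ).second ) out.push_back ( lo );
		}
		if ( out.size() >= maxTerms || mask == 0 ) break;
		mask <<= 1;                                    // be less strict
	}
	std::sort ( out.begin() , out.end() );
	if ( out.size() > maxTerms ) out.resize ( maxTerms );
	out.push_back ( 0 );                                   // terminator
	return out;
}
#endif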

float *XmlDoc::getPageSimilarity ( XmlDoc *xd2 ) {
	int32_t *sv1 = getPageSampleVector();
	if ( ! sv1 || sv1 == (int32_t *)-1 ) return (float *)sv1;
	int32_t *sv2 = xd2->getPageSampleVector();
	if ( ! sv2 || sv2 == (int32_t *)-1 ) return (float *)sv2;
	m_pageSimilarity = computeSimilarity ( sv1, sv2, NULL, NULL, NULL);
	// this means error, g_errno should be set
	if ( almostEqualFloat(m_pageSimilarity, -1.0) ) return NULL;
	return &m_pageSimilarity;
}

// . compare old page vector with new
// . returns ptr to a float from 0.0 to 100.0
float *XmlDoc::getPercentChanged ( ) {
	// if we got it
	if ( m_percentChangedValid ) return &m_percentChanged;
	// get the old doc
	XmlDoc **od = getOldXmlDoc ( );
	if ( ! od || od == (XmlDoc **)-1 ) return (float *)od;
	// if empty, assume 0% changed
	if ( ! *od ) {
		m_percentChanged      = 0;
		m_percentChangedValid = true;
		return &m_percentChanged;
	}
	// get its page c
	float *ps = getPageSimilarity    ( *od );
	if ( ! ps || ps == (float *)-1 ) return (float *)ps;
	// got it
	m_percentChanged      = *ps;
	m_percentChangedValid = true;
	// just return it
	return &m_percentChanged;
}

// . compare two vectors
// . components in vectors are int32_ts
// . last component is a zero, to mark EOV = end of vector
// . discount any termIds that are in the query vector, qvec, which may be NULL
// . returns -1 and sets g_errno on error
// . vector components are the lower 32 bits of the 64-bit word hashes
//   (see computeVector() above)
// . replaces:
//   m_tagVec->getLinkBrotherProbability()
//   g_clusterdb.getSampleSimilarity()
float computeSimilarity ( const int32_t   *vec0,
			  const int32_t   *vec1,
			  const int32_t   *s0, // corresponding scores vector
			  const int32_t   *s1, // corresponding scores vector
			  Query  *q    ,
			  bool    dedupVectors ) {
	// if both empty, assume not similar at all
	if(!vec0 || !vec1)
		return 0;


	// flag if from query vector
	HashTableX qt;
	char qbuf[5000];
	if ( q ) {
		// init hash table
		if ( ! qt.set ( 4,0,512,qbuf,5000,false,"xmlqvtbl") )
			return -1;
		// . stock the query term hash table
		// . use the lower 32 bits of the termids to make compatible
		//   with the other vectors we use
		//int64_t *qtids = q->getTermIds ();
		int32_t       nt    = q->getNumTerms();
		for ( int32_t i = 0 ; i < nt ; i++ ) {
			// get query term
			QueryTerm *QT = &q->m_qterms[i];
			// get the termid
			int64_t termId = QT->m_termId;
			// get it
			uint32_t h = (uint32_t)(termId & 0xffffffff);
			// hash it
			if ( ! qt.addKey ( &h ) ) return -1;
		}
	}

	// if we ignore cardinality then it only matters if both vectors
	// have a particular value, and not how many times they each have it.
	// so we essentially dedup each vector if dedupVectors is true.
	// but we do total up the score and put it behind the one unique
	// occurence though. we do this only for
	// Sections::addDateBasedImpliedSections() right now
	bool allowDups = true;
	if ( dedupVectors ) allowDups = false;

	HashTableX ht;
	char  hbuf[10000];
	if ( ! ht.set ( 4,4,-1,hbuf,10000,allowDups,"xmlqvtbl2"))
		return -1;

	bool useScores  = s0 ? true : false;

	int32_t matches    = 0;
	int32_t total      = 0;

	int32_t matchScore = 0;
	int32_t totalScore = 0;

	// hash first vector. accumulating score total and total count
	for ( const int32_t *p = vec0; *p; p++, s0++ ) {
		// skip if matches a query term
		if ( q && qt.getSlot ( p ) ) continue;
		// count it
		total++;
		// get it
		int32_t score = 1;
		// get the score if valid
		if ( useScores ) score = *s0;
		// total it up
		totalScore += score;
		// add it
		if ( dedupVectors ) {
			// accumulate all the scores into this one bucket
			// in the case of p being a dup
			if ( ! ht.addTerm32(*p, score) ) return -1;
		}
		else {
			// otherwise, add each into its own bucket since
			// ht.m_allowDups should be true
			if ( ! ht.addKey ( p , &score ) ) return -1;
		}
	}

	int32_t zero = 0;

	// see what components of this vector match
	for ( const int32_t *p = vec1; *p; p++, s1++ ) {
		// skip if matches a query term
		if ( q && qt.getSlot ( p ) ) continue;
		// count it
		total++;
		// get it
		int32_t score = 1;
		// get the score if valid
		if ( useScores ) score = *s1;
		// and total scores
		totalScore += score;
		// is it in there?
		int32_t slot = ht.getSlot ( p );
		// skip if unmatched
		if ( slot < 0 ) continue;
		// otherwise, it is a match!
		matches++;
		// and scores
		matchScore += score;
		// and score of what we matched
		uint32_t *val = (uint32_t *)ht.getValueFromSlot ( slot );
		// he is hit too
		matchScore += *val;

		// remove it as we match it to deal with dups
		if ( allowDups ) {
			// once we match it once, do not match again, score was
			// already accumulated
			ht.setValue ( slot , &zero );
		}
		else {
			// otherwise, remove this dup and try to match any
			// remaining dups in the table
			ht.removeSlot ( slot );
		}
	}

	// if after subtracting query terms we got nothing to score, return 0
	if ( useScores && totalScore == 0 ) return 0;
	if ( total                   == 0 ) return 0;
	// . what is the max possible score we coulda had?
	// . subtract the vector components that matched a query term
	float percent = 100 * (float)matchScore / (float)totalScore;
	//if ( useScores)percent = 100 * (float)matchScore / (float)totalScore;
	//else           percent = 100 * (float)matches    / (float)total;
	// sanity
	//if ( percent > 100 ) percent = 100;
	if ( percent > 100 ) { g_process.shutdownAbort(true); }

	return percent;
}
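
// Illustrative sketch (not compiled): with unit scores, no query vector and
// no duplicate components, the percentage above reduces to a Dice-style
// overlap, 100 * 2*matches / (n0+n1). e.g. {A,B,C} vs {B,C,D} has 2 matches,
// so 100*4/6 = 66.7%. diceSimilarity() is a hypothetical name; the vectors
// are 0-terminated like the real ones.
#if 0
#include <cstdint>
#include <unordered_set>

static float diceSimilarity ( const int32_t *vec0 , const int32_t *vec1 ) {
	if ( ! vec0 || ! vec1 ) return 0;
	std::unordered_multiset<int32_t> bag;
	int32_t total = 0;
	for ( const int32_t *p = vec0 ; *p ; p++ ) { bag.insert ( *p ); total++; }
	int32_t matches = 0;
	for ( const int32_t *p = vec1 ; *p ; p++ ) {
		total++;
		std::unordered_multiset<int32_t>::iterator it = bag.find ( *p );
		if ( it == bag.end() ) continue;
		bag.erase ( it );      // consume so dups must match one-for-one
		matches++;
	}
	if ( total == 0 ) return 0;
	// each match accounts for one component from each vector
	return 100.0f * (float)(2 * matches) / (float)total;
}
#endif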

int64_t *XmlDoc::getExactContentHash64 ( ) {

	if ( m_exactContentHash64Valid )
		return &m_exactContentHash64;

	char **u8 = getUtf8Content();
	if ( ! u8 || u8 == (char **)-1) return (int64_t *)u8;


	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;


	unsigned char *p = (unsigned char *)*u8;

	int32_t plen = size_utf8Content;
	if ( plen > 0 ) plen--;

	// sanity
	//if ( ! p ) return 0LL;
	//if ( p[plen] != '\0' ) { g_process.shutdownAbort(true); }

	unsigned char *pend = (unsigned char *)p + plen;
	uint64_t h64 = 0LL;
	unsigned char pos = 0;
	bool lastWasSpace = true;
	for ( ; p < pend ; p++ ) {
		// treat sequences of white space as a single ' ' (space)
		if ( is_wspace_a(*p) ) {
			if ( lastWasSpace ) continue;
			lastWasSpace = true;
			// treat all white space as a space
			h64 ^= g_hashtab[pos][(unsigned char)' '];
			pos++;
			continue;
		}
		lastWasSpace = false;
		// xor this in right
		h64 ^= g_hashtab[pos][p[0]];
		pos++;
	}

	m_exactContentHash64Valid = true;
	m_exactContentHash64 = h64;
	return &m_exactContentHash64;
}
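
// Illustrative sketch (not compiled): the whitespace normalization done by
// getExactContentHash64() above -- runs of whitespace count as a single ' ',
// and every kept byte is mixed in with a position-dependent hash so
// reordering the content changes the result. normalizedContentHash() and
// mix() are hypothetical; the real code uses the g_hashtab lookup table.
#if 0
#include <cstdint>
#include <cstddef>
#include <cctype>

static uint64_t mix ( unsigned char pos , unsigned char byte ) {
	// cheap stand-in for g_hashtab[pos][byte]
	uint64_t x = ( (uint64_t)pos << 8 ) | byte;
	x *= 0x9E3779B97F4A7C15ULL;
	x ^= x >> 29;
	return x;
}

static uint64_t normalizedContentHash ( const unsigned char *p , size_t len ) {
	uint64_t h = 0;
	unsigned char pos = 0;           // wraps at 256, like the real code
	bool lastWasSpace = true;        // also strips leading whitespace
	for ( size_t i = 0 ; i < len ; i++ ) {
		if ( isspace ( p[i] ) ) {
			if ( lastWasSpace ) continue;
			lastWasSpace = true;
			h ^= mix ( pos++ , ' ' );
			continue;
		}
		lastWasSpace = false;
		h ^= mix ( pos++ , p[i] );
	}
	return h;
}
#endif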



RdbList *XmlDoc::getDupList ( ) {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	if ( m_dupListValid )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, already valid" );
		return &m_dupList;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, could not get collection" );
		return NULL;
	}

	int64_t *ph64 = getExactContentHash64();
	if ( ! ph64 || ph64 == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getExactContentHash64 returned -1" );
		return (RdbList *)ph64;
	}

	// must match term in XmlDoc::hashVectors()
	char qbuf[256];
	snprintf(qbuf, 256, "%" PRIu64, (uint64_t)(*ph64));
	int64_t pre     = hash64b ( "gbcontenthash" , 0LL );
	int64_t rawHash = hash64b ( qbuf , 0LL );
	int64_t termId  = hash64 ( rawHash , pre );
	// get the startkey, endkey for termlist
	key144_t sk ;
	key144_t ek ;
	Posdb::makeStartKey ( &sk,termId ,0);
	Posdb::makeEndKey   ( &ek,termId ,MAX_DOCID);
	// note it
	log(LOG_DEBUG,"build: check termid=%" PRIu64" for docid %" PRIu64
	    ,(uint64_t)(termId&TERMID_MASK)
	    ,(uint64_t)m_docId);
	// assume valid now
	m_dupListValid = true;
	// this is a no-split lookup by default now
	if ( ! m_msg0.getList ( -1    , // hostId
				RDB_POSDB, // INDEXDB ,
				cr->m_collnum,
				&m_dupList  ,
				(char *)&sk          ,
				(char *)&ek          ,
				606006        , // minRecSizes in bytes
				m_masterState , // state
				m_masterLoop  ,
				m_niceness    ,
				true , // error correction?
				true , // include tree?
				-1 , // firsthosti
				0 , // startfilenum
				-1, // # files
				// never timeout when spidering in case
				// a host is down.
				msg0_getlist_infinite_timeout , // timeout
				NULL, // msg5
				false , // isRealMerge
				true, // shardByTermId? THIS IS DIFFERENT!!!
				-1  ) ) // forceParitySplit
	{
		// return -1 if this blocks
		logTrace( g_conf.m_logTraceXmlDoc, "END, return -1. msg0.getList blocked." );
		return (RdbList *)-1;
	}

	// assume valid!
	m_dupListValid = true;
	logTrace( g_conf.m_logTraceXmlDoc, "END, done." );
	return &m_dupList;
}


// moved DupDetector.cpp into here...
char *XmlDoc::getIsDup ( ) {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	if ( m_isDupValid )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, already valid" );
		return &m_isDup;
	}

	// assume we are not a dup
	m_isDup = (char)false;
	// get it
	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, could not get collection" );
		return NULL;
	}

	// skip if we should
	if ( ! cr->m_dedupingEnabled ) {
		m_isDupValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END, deduping not enabled" );
		return &m_isDup;
	}

	setStatus ( "checking for dups" );

	// get our docid
	int64_t *mydocid = getDocId();
	if ( ! mydocid || mydocid == (int64_t *)-1)
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getDocId returned -1" );
		return (char *)mydocid;
	}

	// get the duplist!
	RdbList *list = getDupList();
	if ( ! list || list == (RdbList *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getDupList returned -1" );
		return (char *)list;
	}

	// sanity. must be posdb list.
	if ( ! list->isEmpty() && list->getKeySize() != 18 ) { g_process.shutdownAbort(true);}

	// so getSiteRank() does not core
	int32_t *sni = getSiteNumInlinks();
	if ( ! sni || sni == (int32_t *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getSiteNumInlinks returned -1" );
		return (char *)sni;
	}

	int32_t myRank = getSiteRank ( );

	// assume not a dup
	m_isDup = (char)false;
	// get the docid that we are a dup of
	for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
		char *rec = list->getCurrentRec();

		// get the docid
		int64_t d = Posdb::getDocId ( rec );

		// just let the best site rank win i guess?
		// even though one page may have more inlinks???
		char sr = (char )Posdb::getSiteRank ( rec );

		// skip if us
		if ( d == m_docId ) continue;

		// if his rank is >= ours then he wins the tie and we
		// are the dup i guess...
		if ( sr >= myRank ) {
			log("build: doc %s is dup of docid %" PRId64,
			    m_firstUrl.getUrl(),d);
			m_isDup = (char)true;
			m_isDupValid = true;
			m_docIdWeAreADupOf = d;
			logTrace( g_conf.m_logTraceXmlDoc, "END, we are a duplicate" );
			return &m_isDup;
		}

	}

	m_isDup = (char)false;
	m_isDupValid = true;
	logTrace( g_conf.m_logTraceXmlDoc, "END, done. Not dup." );
	return &m_isDup;
}
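
// Illustrative sketch (not compiled): the tie-break rule used by getIsDup()
// above -- among docs sharing the same exact-content hash, we become the dup
// if any other doc has a site rank at least as high as ours. DupCandidate and
// findDocWeAreADupOf() are hypothetical names.
#if 0
#include <cstdint>
#include <vector>

struct DupCandidate {
	int64_t docId;
	int32_t siteRank;
};

// returns the docid we are a dup of, or 0 if we win and stay indexed
static int64_t findDocWeAreADupOf ( int64_t myDocId , int32_t myRank ,
				    const std::vector<DupCandidate> &sameHashDocs ) {
	for ( const DupCandidate &c : sameHashDocs ) {
		if ( c.docId == myDocId ) continue;    // skip ourselves
		if ( c.siteRank >= myRank ) return c.docId;
	}
	return 0;
}
#endif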

char *XmlDoc::getMetaDescription( int32_t *mdlen ) {
	if ( m_metaDescValid ) {
		*mdlen = m_metaDescLen;
		return m_metaDesc;
	}
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;

	// we need to point to it in the html source so our WordPosInfo algo works right.
	m_metaDesc = xml->getMetaContentPointer( "description", 11, "name", &m_metaDescLen );
	*mdlen = m_metaDescLen;
	m_metaDescValid = true;
	return m_metaDesc;
}


char *XmlDoc::getMetaSummary ( int32_t *mslen ) {
	if ( m_metaSummaryValid ) {
		*mslen = m_metaSummaryLen;
		return m_metaSummary;
	}
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;

	m_metaSummary = xml->getMetaContentPointer( "summary", 7, "name", &m_metaSummaryLen );
	*mslen = m_metaSummaryLen;
	m_metaSummaryValid = true;
	return m_metaSummary;
}


char *XmlDoc::getMetaKeywords( int32_t *mklen ) {
	if ( m_metaKeywordsValid ) {
		*mklen = m_metaKeywordsLen;
		return m_metaKeywords;
	}
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;

	// we need to point to it in the html source so our WordPosInfo algo works right.
	m_metaKeywords = xml->getMetaContentPointer( "keywords", 8, "name", &m_metaKeywordsLen );
	*mklen = m_metaKeywordsLen;
	m_metaKeywordsValid = true;
	return m_metaKeywords;
}


char *XmlDoc::getMetaGeoPlacename( int32_t *mgplen ) {
	if ( m_metaGeoPlacenameValid ) {
		*mgplen = m_metaGeoPlacenameLen;
		return m_metaGeoPlacename;
	}
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;

	// we need to point to it in the html source so our WordPosInfo algo works right.
	m_metaGeoPlacename = xml->getMetaContentPointer( "geo.placename", 13, "name", &m_metaGeoPlacenameLen );
	*mgplen = m_metaGeoPlacenameLen;
	m_metaGeoPlacenameValid = true;
	return m_metaGeoPlacename;
}

Url *XmlDoc::getCurrentUrl ( ) {
	if ( m_currentUrlValid ) return &m_currentUrl;
	// otherwise, get first url
	Url *fu = getFirstUrl();
	if ( ! fu || fu == (void *)-1 ) return (Url *)fu;
	// make that current url
	m_currentUrl.set ( &m_firstUrl );
	m_currentUrlValid = true;
	return &m_currentUrl;
}

Url *XmlDoc::getFirstUrl() {
	if ( m_firstUrlValid ) return &m_firstUrl;
	// we might have a title rec
	if ( m_setFromTitleRec ) {
		setFirstUrl ( ptr_firstUrl );
		m_firstUrlValid = true;
		return &m_firstUrl;
	}
	// must be this otherwise
	if ( ! m_setFromDocId ) { g_process.shutdownAbort(true); }
	// this must be valid
	if ( ! m_docIdValid ) { g_process.shutdownAbort(true); }

	// get the old xml doc from the old title rec
	XmlDoc **pod = getOldXmlDoc ( );
	if ( ! pod || pod == (void *)-1 ) return (Url *)pod;
	// shortcut
	XmlDoc *od = *pod;
	// now set it
	if (od) {
		setFirstUrl(od->ptr_firstUrl);
		m_firstUrlValid = true;
	}

	return &m_firstUrl;
}


int64_t XmlDoc::getFirstUrlHash48() {
	if ( m_firstUrlHash48Valid ) return m_firstUrlHash48;
	// this must work
	if ( ! m_firstUrlValid ) { g_process.shutdownAbort(true); }
	if ( getUseTimeAxis() ) {
		m_firstUrlHash48 = hash64b ( getTimeAxisUrl()->getBufStart() ) & 0x0000ffffffffffffLL;
		m_firstUrlHash48Valid = true;
		return m_firstUrlHash48;
	}

	m_firstUrlHash48 = hash64b ( m_firstUrl.getUrl() ) & 0x0000ffffffffffffLL;
	m_firstUrlHash48Valid = true;
	return m_firstUrlHash48;
}

int64_t XmlDoc::getFirstUrlHash64() {
	if ( m_firstUrlHash64Valid ) return m_firstUrlHash64;
	// this must work
	if ( ! m_firstUrlValid ) { g_process.shutdownAbort(true); }

	if ( getUseTimeAxis() ) {
		m_firstUrlHash64 = hash64b ( getTimeAxisUrl()->getBufStart() );
		m_firstUrlHash64Valid = true;
		return m_firstUrlHash64;
	}

	m_firstUrlHash64 = hash64b ( m_firstUrl.getUrl() );
	m_firstUrlHash64Valid = true;
	return m_firstUrlHash64;
}

// . operates on the latest m_httpReply
Url **XmlDoc::getRedirUrl() {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	if ( m_redirUrlValid )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, returning already valid redirUrl" );
		return &m_redirUrlPtr;
	}

	setStatus ( "getting redir url" );

	// assume no redirect
	m_redirUrlPtr = NULL;

	// we might have a title rec
	if ( m_setFromTitleRec ) { g_process.shutdownAbort(true); }

	// or recycling content from old title rec
	if ( m_recycleContent ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, recycling content, no redirect" );
		m_redirError = 0;
		m_redirErrorValid = true;
		m_redirUrlValid = true;
		return &m_redirUrlPtr;
	}

	// get the current http reply, not the final http reply necessarily
	if ( ! m_httpReplyValid ) { g_process.shutdownAbort(true); }

	// set a mime on the stack
	HttpMime mime;

	// shortcut
	int32_t httpReplyLen = m_httpReplySize - 1;

	// sanity check
	if ( httpReplyLen > 0 && ! m_httpReply ) { g_process.shutdownAbort(true); }

	// empty reply, no redir
	if ( httpReplyLen == 0 ) {
		// empty reply, but i guess valid empty redir url
		m_redirUrlValid = true;
		// no error
		m_redirError = 0;
		m_redirErrorValid = true;

		logTrace( g_conf.m_logTraceXmlDoc, "END, returning fake. Length is 0" );
		// return a fake thing. content length is 0.
		return &m_redirUrlPtr;
	}

	// set it
	if ( httpReplyLen<0 || ! mime.set ( m_httpReply, httpReplyLen, getCurrentUrl() ) ) {
		// bad mime, but i guess valid empty redir url
		m_redirUrlValid = true;

		// return nothing, no redirect url was there
		m_redirUrlPtr = NULL;

		// no error
		m_redirError = 0;
		m_redirErrorValid = true;

		// return a fake thing. content length is 0.
		logTrace( g_conf.m_logTraceXmlDoc, "END, returning fake. Bad mime." );

		return &m_redirUrlPtr;
	}

	int32_t httpStatus = mime.getHttpStatus();

	Url *loc = NULL;

	// quickly see if we are a robots.txt url originally
	bool isRobotsTxt = isFirstUrlRobotsTxt ( );

	//
	// check for <meta http-equiv="Refresh" content="1; URL=contact.htm">
	// if httpStatus is not a redirect
	//
	if ( httpStatus < 300 || httpStatus > 399 ) {
		logTrace( g_conf.m_logTraceXmlDoc, "Checking meta for redirect, if not robots.txt" );

		// ok, crap, i was getting the xml here to get the meta
		// http-equiv refresh tag, but that added an element of
		// recursion that is just too confusing to deal with. so
		// let's just parse out the meta tag by hand
		if ( !isRobotsTxt ) {
			Url **mrup = getMetaRedirUrl();
			if ( ! mrup || mrup == (void *)-1) {
				logTrace( g_conf.m_logTraceXmlDoc, "END, bad meta?" );
				return (Url **)mrup;
			}

			// set it. might be NULL if not there.
			loc = *mrup;
		}
	} else {
		logTrace( g_conf.m_logTraceXmlDoc, "call mime.getLocationUrl" );
		// get Location: url (the redirect url) from the http mime
		loc = mime.getLocationUrl();
	}

	// get current url
	Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1 ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, error, could not get current url" );
		return (Url **)cu;
	}

	// get local link info
	LinkInfo *info1 = getLinkInfo1();
	// error or blocked
	if ( ! info1 || info1 == (LinkInfo *)-1 ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, error, could not get LinkInfo1" );
		return (Url **)info1;
	}

	// did we send a cookie with our last request?
	bool sentCookieLastTime = false;
	if ( m_redirCookieBuf.length() ) {
		sentCookieLastTime = true;
	}

	// get cookie for redirect
	mime.addToCookieJar(getCurrentUrl(), &m_redirCookieBuf);
	m_redirCookieBufValid = true;

	// a hack for removing session ids already in there
	// must not have an actual redirect url in there & must be a valid http status
	if ( !loc && httpStatus == 200 ) {
		Url *tt = &m_redirUrl;
		tt->set( cu->getUrl(), cu->getUrlLen(), false, true );

		// if url changes, force redirect it
		if ( strcmp ( cu->getUrl(), tt->getUrl() ) != 0 ) {
			m_redirUrlValid = true;
			m_redirUrlPtr   = &m_redirUrl;
			ptr_redirUrl    = m_redirUrl.getUrl();
			size_redirUrl   = m_redirUrl.getUrlLen()+1;

			/// @todo ALC should we use EDOCSIMPLIFIEDREDIR
			// m_redirError = EDOCSIMPLIFIEDREDIR;

			// no error
			m_redirError = 0;

			m_redirErrorValid = true;

			logTrace( g_conf.m_logTraceXmlDoc, "END, Forced redirect from '%s' to '%s'", cu->getUrl(),m_redirUrl.getUrl() );
			return &m_redirUrlPtr;
		}
	}

	// if no location url, then there is no redirect; return a NULL redir url
	if ( ! loc || loc->getUrl()[0] == '\0' ) {
		// validate it
		m_redirUrlValid = true;
		// no error
		m_redirError = 0;
		m_redirErrorValid = true;
		// and return an empty one
		logTrace( g_conf.m_logTraceXmlDoc, "END, no redir url (no loc)" );
		return &m_redirUrlPtr;
	}

	bool keep = false;
	if ( info1->hasLinkText() ) keep = true;

	// at this point we do not block anywhere
	m_redirUrlValid = true;

	// store the redir error
	m_redirError      = 0;
	m_redirErrorValid = true;

	// i've seen a "Location: 2010..." bogus url as well, so make sure
	// we got a legit url
	if ( ! loc->getDomain() || loc->getDomainLen() <= 0 ) {
		if ( ! keep ) m_redirError = EDOCBADREDIRECTURL;

		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCBADREDIRECTURL" );
		return &m_redirUrlPtr;
	}

	// . if redirect url is nothing new, then bail (infinite loop)
	// . www.xbox.com/SiteRequirements.htm redirects to itself
	//   until you send a cookie!!
	// . www.twomileborris.com does the cookie thing, too
	if ( strcmp ( cu->getUrl(), loc->getUrl() ) == 0 ) {
		// try sending the cookie if we got one now and didn't have
		// one for this last request
		if ( ! sentCookieLastTime && m_redirCookieBuf.length() ) {
		        m_redirUrl.set ( loc->getUrl() );
		        m_redirUrlPtr = &m_redirUrl;
		        return &m_redirUrlPtr;
		}
		if ( ! keep ) m_redirError = EDOCREDIRECTSTOSELF;

		logTrace( g_conf.m_logTraceXmlDoc, "END, redir err" );
		return &m_redirUrlPtr;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. getCollRec returned false" );
		return NULL;
	}

	// . don't allow redirects when injecting!
	// . otherwise, we would mfree(m_buf) which would free our
	//   injected reply... yet m_injectedReplyLen would still be
	//   positive! can you say 'seg fault'?
	// . hmmm... seems to have worked though
	if ( cr->m_recycleContent || m_recycleContent ) {
		if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCTOOMANYREDIRECTS (recycled)" );
		return &m_redirUrlPtr;
	}

	// . if we followed too many then bail
	// . www.motorolamobility.com www.outlook.com ... failed when we
	//   had >= 4 here
	if ( ++m_numRedirects >= 10 ) {
		if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCTOOMANYREDIRECTS" );
		return &m_redirUrlPtr;
	}

	// sometimes idiots don't supply us with a Location: mime
	if ( loc->getUrlLen() == 0 ) {
		if ( ! keep ) m_redirError = EDOCBADREDIRECTURL;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCBADREDIRECTURL" );
		return &m_redirUrlPtr;
	}

	// . protocol of url must be http or https
	// . we had one url redirect to an ihttp:// protocol and caused
	//   spider to core dump when it saw that SpiderRequest record
	const char *proto = loc->getScheme();
	if ( strncmp(proto,"http://" ,7) && strncmp(proto,"https://",8) ) {
		m_redirError = EDOCBADREDIRECTURL;
		logTrace( g_conf.m_logTraceXmlDoc, "END, EDOCBADREDIRECTURL (wrong scheme)" );
		return &m_redirUrlPtr;
	}

	// log a msg
	if ( g_conf.m_logSpideredUrls ) {
		logf( LOG_INFO, "build: %s redirected to %s", cu->getUrl(), loc->getUrl());
	}

	// if not same Domain, it is not a simplified redirect
	bool sameDom = true;
	int32_t dlen = loc->getDomainLen();
	if ( cu->getDomainLen() != dlen  || ( strncmp(cu->getDomain(), loc->getDomain(), dlen) ) ) {
		sameDom = false;
	}

	if ( ! sameDom ) {
		m_redirUrl.set ( loc );
		m_redirUrlPtr   = &m_redirUrl;
		ptr_redirUrl    = m_redirUrl.getUrl();
		size_redirUrl    = m_redirUrl.getUrlLen()+1;
		logTrace( g_conf.m_logTraceXmlDoc, "END, return redirUrl. Not same domain [%s]", m_redirUrlPtr->getUrl());
		return &m_redirUrlPtr;
	}

	// get first url ever
	Url *f = getFirstUrl();

	// set this to true if the redirected url is much preferred
	bool simplifiedRedir = false;

	// . if it redirected to a simpler url then stop spidering now
	//   and add the simpler url to the spider queue
	// . by simpler, i mean one w/ fewer path components
	// . or one with a www for hostname
	// . or could be same as firstUrl but with a / appended
	char *r   = loc->getUrl();
	char *u   = f->getUrl();
	int32_t rlen = loc->getUrlLen();
	int32_t ulen = f->getUrlLen();

	// simpler if new path depth is shorter
	if ( loc->getPathDepth( true ) < f->getPathDepth( true ) ) {
		logTrace(g_conf.m_logTraceXmlDoc, "redirected url path depth is shorter. simplifiedRedir=true");
		simplifiedRedir = true;
	}

	// simpler if old has cgi and new does not
	if ( !simplifiedRedir && f->isCgi() && ! loc->isCgi() ) {
		logTrace(g_conf.m_logTraceXmlDoc, "redirected url doesn't have query param, old url does. simplifiedRedir=true");
		simplifiedRedir = true;
	}

	// simpler if new one is same as old but has a '/' at the end
	if ( !simplifiedRedir && rlen == ulen+1 && r[rlen-1]=='/' && strncmp(r, u, ulen) == 0 ) {
		logTrace(g_conf.m_logTraceXmlDoc, "redirected url has '/', old url doesn't. simplifiedRedir=true");
		simplifiedRedir = true;
	}

	// . if new url does not have semicolon but old one does
	// . http://news.yahoo.com/i/738;_ylt=AoL4eFRYKEdXbfDh6W2cF
	//   redirected to http://news.yahoo.com/i/738
	if ( !simplifiedRedir && strchr ( u, ';' ) &&  ! strchr ( r, ';' ) ) {
		logTrace(g_conf.m_logTraceXmlDoc, "redirected url doesn't have semicolon, old url does. simplifiedRedir=true");
		simplifiedRedir = true;
	}

	// simpler if new host is www and old is not
	if ( !simplifiedRedir && loc->isHostWWW() && ! f->isHostWWW() ) {
		logTrace(g_conf.m_logTraceXmlDoc, "redirect is www & original is not. simplifiedRedir=true");
		simplifiedRedir = true;
	}

	// if redirect is to different domain, set simplified
	// this helps locks from bunching on one domain
	if ( !simplifiedRedir && ( loc->getDomainLen() != f->getDomainLen() ||
	     strncasecmp ( loc->getDomain(), f->getDomain(), loc->getDomainLen() ) != 0 ) ) {
		// crap, but www.hotmail.com redirects to live.msn.com
		// login page ... so add this check here
		if ( !f->isRoot() ) {
			logTrace(g_conf.m_logTraceXmlDoc, "different domain & not root. simplifiedRedir=true");
			simplifiedRedir = true;
		}
	}

	bool allowSimplifiedRedirs = m_allowSimplifiedRedirs;
	logTrace(g_conf.m_logTraceXmlDoc, "allowSimplifiedRedirs=%s", allowSimplifiedRedirs ? "true" : "false");

	// follow redirects if injecting so we do not return
	// EDOCSIMPLIFIEDREDIR
	if ( getIsInjecting ( ) ) {
		logTrace(g_conf.m_logTraceXmlDoc, "is injecting. allowSimplifiedRedirs=true");
		allowSimplifiedRedirs = true;
	}

	// or if disabled then follow the redirect
	if ( ! cr->m_useSimplifiedRedirects ) {
		logTrace(g_conf.m_logTraceXmlDoc, "collection disallow useSimplifiedRedirects. allowSimplifiedRedirs=true");
		allowSimplifiedRedirs = true;
	}

	// if redirect is setting cookies we have to follow the redirect
	// all the way through so we can stop now.
	if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) {
		logTrace(g_conf.m_logTraceXmlDoc, "has redirCookie. allowSimplifiedRedirs=true");
		allowSimplifiedRedirs = true;
	}

	// . don't bother indexing this url if the redir is better
	// . 301 means moved PERMANENTLY...
	// . many people use 301 on their root pages though, so treat
	//   it like a temporary redirect, like exclusivelyequine.com
	if ( simplifiedRedir && ! allowSimplifiedRedirs ) {
		m_redirError = EDOCSIMPLIFIEDREDIR;

		// set this because getLinks() treats this redirUrl
		// as a link now, it will add a SpiderRequest for it:
		m_redirUrl.set ( loc );
		m_redirUrlPtr = &m_redirUrl;

		// store redirUrl in titlerec as well
		ptr_redirUrl = m_redirUrl.getUrl();
		size_redirUrl = m_redirUrl.getUrlLen() + 1;

		// mdw: let this path through so contactXmlDoc gets a proper
		// redirect that we can follow. for the base xml doc at
		// least the m_indexCode will be set
		logTrace( g_conf.m_logTraceXmlDoc, "END, return [%s]. Simplified, but not allowed.", m_redirUrlPtr->getUrl());
		return &m_redirUrlPtr;
	}

	// good to go
	m_redirUrl.set ( loc );
	m_redirUrlPtr = &m_redirUrl;
	ptr_redirUrl = m_redirUrl.getUrl();
	size_redirUrl = m_redirUrl.getUrlLen()+1;
	logTrace( g_conf.m_logTraceXmlDoc, "END, return [%s]", m_redirUrlPtr->getUrl());
	return &m_redirUrlPtr;
}
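
// Illustrative sketch (not compiled): the "simplified redirect" heuristics
// applied above, over plain strings instead of the Url class -- shallower
// path, dropped cgi parameters, same url plus a trailing slash, or a dropped
// ';' session id all make the target look more canonical. pathDepth() and
// isSimplifiedRedirect() are hypothetical names; the real checks use
// Url::getPathDepth(), Url::isCgi(), Url::isHostWWW(), etc.
#if 0
#include <string>
#include <algorithm>

static int pathDepth ( const std::string &u ) {
	// count '/' after the "scheme://host" part
	std::string::size_type start = u.find ( "://" );
	start = ( start == std::string::npos ) ? 0 : u.find ( '/' , start + 3 );
	if ( start == std::string::npos ) return 0;
	return (int)std::count ( u.begin() + start , u.end() , '/' );
}

static bool isSimplifiedRedirect ( const std::string &oldUrl ,
				   const std::string &newUrl ) {
	// new path is shallower than the old one
	if ( pathDepth ( newUrl ) < pathDepth ( oldUrl ) ) return true;
	// old had cgi parameters, new does not
	if ( oldUrl.find ( '?' ) != std::string::npos &&
	     newUrl.find ( '?' ) == std::string::npos ) return true;
	// new is old plus a trailing slash
	if ( newUrl == oldUrl + "/" ) return true;
	// old had a ';' (session-id style), new does not
	if ( oldUrl.find ( ';' ) != std::string::npos &&
	     newUrl.find ( ';' ) == std::string::npos ) return true;
	return false;
}
#endif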



int32_t *XmlDoc::getFirstIndexedDate ( ) {
	if ( m_firstIndexedDateValid ) return (int32_t *)&m_firstIndexedDate;
	XmlDoc **od = getOldXmlDoc ( );
	if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od;
	// valid
	m_firstIndexedDateValid = true;
	// must be downloaded
	//if ( ! m_spideredTimeValid ) { g_process.shutdownAbort(true); }
	// assume now is the first time
	m_firstIndexedDate = getSpideredTime();//m_spideredTime;
	// inherit from our old title rec
	if ( *od ) m_firstIndexedDate = (*od)->m_firstIndexedDate;
	// return it
	return (int32_t *)&m_firstIndexedDate;
}


int32_t *XmlDoc::getOutlinksAddedDate ( ) {
	if ( m_outlinksAddedDateValid ) return (int32_t *)&m_outlinksAddedDate;
	XmlDoc **od = getOldXmlDoc ( );
	if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od;
	// valid
	m_outlinksAddedDateValid = true;
	// must be downloaded
	//if ( ! m_spideredTimeValid ) { g_process.shutdownAbort(true); }
	// assume we are doing it now
	m_outlinksAddedDate = getSpideredTime();//m_spideredTime;
	// get that
	if ( *od ) m_outlinksAddedDate = (*od)->m_outlinksAddedDate;
	// return it
	return (int32_t *)&m_outlinksAddedDate;
}

uint16_t *XmlDoc::getCountryId ( ) {
	if ( m_countryIdValid ) return &m_countryId;

	setStatus ( "getting country id" );

	// get it
	Url *u = getCurrentUrl();
	if ( ! u || u == (void *)-1) return (uint16_t *)u;

	// use the url's tld to guess the country
	uint16_t country = LanguageIdentifier::guessCountryTLD ( u->getUrl ( ) );

	m_countryIdValid = true;
	m_countryId      = country;

	return &m_countryId;
}

XmlDoc **XmlDoc::getOldXmlDoc ( ) {

	if ( m_oldDocValid ) return &m_oldDoc;

	// note it
	setStatus ( "getting old xml doc");

	// if we are set from a title rec, we are the old doc
	if ( m_setFromTitleRec ) {
		m_oldDocValid = true;
		m_oldDoc      = NULL;//this;
		return &m_oldDoc;
	}

	// . cache age is 0... super fresh
	// . returns NULL w/ g_errno if not found unless isIndexed is false
	//   and valid, and it is not valid for pagereindexes.
	char **otr = getOldTitleRec ( );
	if ( ! otr || otr == (char **)-1 ) return (XmlDoc **)otr;
	// if no title rec, return ptr to a null
	m_oldDoc = NULL;
	if ( ! *otr ) { m_oldDocValid = true; return &m_oldDoc; }

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// if provided title rec matches our docid but not uh48 then there
	// was a docid collision and we should null out our title rec
	// and return with an error and no index this puppy!
	// crap, we can't call getFirstUrl() because it might not be
	// valid if we are a docid based doc and THIS function was called
	// from getFirstUrl() -- we end up in a recursive loop.
	if ( ! m_setFromDocId ) {
		//int64_t uh48 = getFirstUrl()->getUrlHash48();
		int64_t uh48 = getFirstUrlHash48();
		int64_t tuh48 = Titledb::getUrlHash48 ( (key96_t *)*otr );
		if ( uh48 != tuh48 ) {
			log("xmldoc: docid collision uh48 mismatch. cannot "
				"index "
				"%s",getFirstUrl()->getUrl() );
			g_errno = EDOCIDCOLLISION;
			return NULL;
		}
	}

	// . if *otr is NULL that means not found
	// . return a NULL old XmlDoc in that case as well?
	// . make a new one
	// . this will uncompress it and set ourselves!
	try { m_oldDoc = new ( XmlDoc ); }
	catch(std::bad_alloc&) {
		g_errno = ENOMEM;
		return NULL;
	}
	mnew ( m_oldDoc , sizeof(XmlDoc),"xmldoc1");
	// debug the mem leak
	// log("xmldoc: xmldoc1=%" PTRFMT" u=%s"
	//     ,(PTRTYPE)m_oldDoc
	//     ,m_firstUrl.getUrl());
	// if title rec is corrupted data uncompress will fail and this
	// will return false!
	if ( ! m_oldDoc->set2 ( m_oldTitleRec ,
				m_oldTitleRecSize , // maxSize
				cr->m_coll     ,
				NULL       , // pbuf
				m_niceness ) ) {
		log("build: failed to set old doc for %s",m_firstUrl.getUrl());
		if ( ! g_errno ) { g_process.shutdownAbort(true); }
		//int32_t saved = g_errno;
		// ok, fix the memleak here
		mdelete ( m_oldDoc , sizeof(XmlDoc), "odnuke" );
		delete ( m_oldDoc );

		//m_oldDocExistedButHadError = true;
		//log("xmldoc: nuke xmldoc1=%" PTRFMT"",(PTRTYPE)m_oldDoc);
		m_oldDoc = NULL;
		// g_errno = saved;
		// MDW: i removed this on 2/8/2016 again so the code below
		// would execute.
		//return NULL; //mdwmdwmdw
		// if it is data corruption, just assume empty so
		// we don't stop spidering a url because of this. so we'll
		// think this is the first time indexing it. otherwise
		// we get "Bad cached document" in the logs and the
		// SpiderReply and it never gets re-spidered because it is
		// not a 'temporary' error according to the url filters.
		log("build: treating corrupted titlerec as not found");
		g_errno = 0;
		m_oldDoc = NULL;
		m_oldDocValid = true;
		return &m_oldDoc;
	}
	m_oldDocValid = true;
	// share our masterloop and state!
	m_oldDoc->m_masterLoop  = m_masterLoop;
	m_oldDoc->m_masterState = m_masterState;
	return &m_oldDoc;
}

void XmlDoc::nukeDoc ( XmlDoc *nd ) {
	// skip if empty
	if (!nd) {
		return;
	}

	// do not nuke yerself!
	if ( nd == this ) return;
	// or root doc!
	//if ( nd == m_rootDoc ) return;
	// invalidate
	if ( nd == m_extraDoc ) {
		m_extraDocValid = false;
		m_extraDoc      = NULL;
	}
	if ( nd == m_rootDoc    ) {
		m_rootDocValid    = false;
		m_rootDoc         = NULL;
	}
	if ( nd == m_oldDoc     ) {
		m_oldDocValid     = false;
		m_oldDoc          = NULL;
	}
	// nuke it
	mdelete ( nd , sizeof(XmlDoc) , "xdnuke");
	delete ( nd );
}

static LinkInfo s_dummy;

XmlDoc **XmlDoc::getExtraDoc ( char *u , int32_t maxCacheAge ) {

	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN [%s]", u);

	if ( m_extraDocValid )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END. m_extraDocValid is true" );
		return &m_extraDoc;
	}

	// note that
	setStatus ( "getting new doc" );
	// we need a valid first ip first!
	//int32_t *pfip = getFirstIp();
	//if ( ! pfip || pfip == (void *)-1 ) return (XmlDoc **)pfip;
	// must be NULL
	if ( m_extraDoc ) { g_process.shutdownAbort(true); }
	// sanity check
	if ( ! u || ! u[0] ) { g_process.shutdownAbort(true); }//return &m_extraDoc;
	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END - collection not found" );
		return NULL;
	}

	// . if *otr is NULL that means not found
	// . return a NULL old XmlDoc in that case as well?
	// . make a new one
	// . this will uncompress it and set ourselves!
	try { m_extraDoc = new ( XmlDoc ); }
	catch(std::bad_alloc&) {
		g_errno = ENOMEM;
		logTrace( g_conf.m_logTraceXmlDoc, "END - out of memory" );
		return NULL;
	}
	mnew ( m_extraDoc , sizeof(XmlDoc),"xmldoc2");

	// . if we did not have it in titledb then download it!
	// . or if titleRec was too old!

	// a spider rec for the extra doc to use
	SpiderRequest sreq;
	// clear it
	sreq.reset();
	// spider the url "u"
	strcpy ( sreq.m_url , u );
	// inherit page parser
	sreq.m_isPageParser = getIsPageParser();
	// set the data size right
	sreq.setDataSize();
	// . prepare to download it, set it up
	// . returns false and sets g_errno on error
	if ( ! m_extraDoc->set4 ( &sreq        ,
				  NULL         , // doledbkey ptr
				  cr->m_coll       ,
				  NULL         , // SafeBuf
				  m_niceness   ))
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END. set4 failed" );
		return NULL;
	}

	// share our masterloop and state!
	m_extraDoc->m_masterLoop  = m_masterLoop;
	m_extraDoc->m_masterState = m_masterState;

	// carry this forward always!
	m_extraDoc->m_isSpiderProxy = m_isSpiderProxy;

	// tell msg13 to get this from its robots.txt cache if it can. it also
	// keeps a separate html page cache for the root pages, etc.
	m_extraDoc->m_maxCacheAge = maxCacheAge;

	// a dummy thing
	s_dummy.m_numStoredInlinks = 0;
	s_dummy.m_numGoodInlinks  = 0;

	// we indirectly call m_extraDoc->getHttpReply() which calls
	// m_extraDoc->getRedirectUrl(), which checks the linkInfo and
	// dmoz catids of the original url to see if we should set m_indexCode
	// to something bad or not. to avoid these unnecessary lookups we
	// set these to NULL and validate them
	m_extraDoc->ptr_linkInfo1           = &s_dummy;
	m_extraDoc->size_linkInfo1          = 0;
	m_extraDoc->m_linkInfo1Valid        = true;
	m_extraDoc->m_urlFilterNumValid     = true;
	m_extraDoc->m_urlFilterNum          = 0;
	// for redirects
	m_extraDoc->m_allowSimplifiedRedirs = true;
	// set this flag so msg13.cpp doesn't print the "hammering ip" msg
	m_extraDoc->m_isChildDoc = true;

	// and inherit test dir so getTestDir() doesn't core on us
	bool isPageParser = getIsPageParser();
	m_extraDoc->m_isPageParser      = isPageParser;
	m_extraDoc->m_isPageParserValid = true;

	// without this we send all the msg13 requests to host #3! because
	// Msg13 uses it to determine what host to handle it
	if ( ! m_firstIpValid ) { g_process.shutdownAbort(true); }
	m_extraDoc->m_firstIp = m_firstIp;
	m_extraDoc->m_firstIpValid = true;

	// i guess we are valid now
	m_extraDocValid = true;

	logTrace( g_conf.m_logTraceXmlDoc, "END." );
	return &m_extraDoc;
}


bool XmlDoc::getIsPageParser ( ) {
	if ( m_isPageParserValid ) return m_isPageParser;
	// assume not
	m_isPageParser = false;
	// and set otherwise
	if ( m_sreqValid && m_sreq.m_isPageParser ) m_isPageParser = true;
	// and validate
	m_isPageParserValid = true;
	return m_isPageParser;
}

XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
	if ( m_rootDocValid ) return &m_rootDoc;
	// help avoid mem leaks
	if ( m_rootDoc ) { g_process.shutdownAbort(true); }
	// note it

	setStatus ( "getting root doc");
	// are we a root?
	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 ) return (XmlDoc **)isRoot;
	// if we are root use us!!!!!
	if ( *isRoot ) {
		m_rootDoc = this;
		m_rootDocValid = true;
		return &m_rootDoc;
	}
	// get our site root
	char *_mysite = getSite();
	if ( ! _mysite || _mysite == (void *)-1 ) return (XmlDoc **)_mysite;


	// BR 20151215: Prefix domain with the scheme, otherwise it will later
	// prefix with http:// in Url::set even for https sites.
	char sitebuf[MAX_SITE_LEN + MAX_SCHEME_LEN+4];	// +4 = :// + 0-terminator
	char *mysite 	= sitebuf;
	const char *myscheme 	= getScheme();
	if( myscheme ) {
		mysite += sprintf(mysite, "%s://", myscheme);
	}
	sprintf(mysite, "%s", _mysite);
	mysite = sitebuf;


	// otherwise, we gotta get it!
	char **rtr = getRootTitleRec ( );
	if ( ! rtr || rtr == (char **)-1 ) return (XmlDoc **)rtr;
	// if no title rec, return ptr to a null
	//m_rootDoc = NULL;
	//if ( ! *rtr ) {
	//	// damn, not in titledb, i guess download it then
	//	m_rootDocValid = true; return &m_rootDoc; }
	// note it
	setStatus ( "getting root doc");

	// to keep injections fast, do not download the root page!
	if ( ! *rtr && m_contentInjected ) {
		// assume none
		m_rootDoc = NULL;
		m_rootDocValid = true;
		return &m_rootDoc;
	}

	// likewise, if doing a rebuild
	if ( ! *rtr && m_useSecondaryRdbs ) {
		// assume none
		m_rootDoc = NULL;
		m_rootDocValid = true;
		return &m_rootDoc;
	}

	// or recycling content like for query reindex. keep it fast.
	if ( ! *rtr && m_recycleContent ) {
		m_rootDoc = NULL;
		m_rootDocValid = true;
		return &m_rootDoc;
	}


	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// . if *otr is NULL that means not found
	// . return a NULL root XmlDoc in that case as well?
	// . make a new one
	// . this will uncompress it and set ourselves!
	try { m_rootDoc = new ( XmlDoc ); }
	catch(std::bad_alloc&) {
		g_errno = ENOMEM;
		return NULL;
	}
	mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3");
	// if we had the title rec, set from that
	if ( *rtr ) {
		if ( ! m_rootDoc->set2 ( m_rootTitleRec     ,
					 m_rootTitleRecSize , // maxSize    ,
					 cr->m_coll             ,
					 NULL               , // pbuf
					 m_niceness         ) ) {
			// the title rec was corrupted... delete this doc.
			// set2 possibly printed
			// "uncompress uncompressed size=..." for the bad uncompress
			log("build: rootdoc set2 failed");
			mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
			delete ( m_rootDoc );
			// call it empty for now, we don't want to return
			// NULL with g_errno set because it could stop
			// the whole indexing pipeline
			m_rootDoc = NULL;
			m_rootDocValid = true;
			return &m_rootDoc;
			//return NULL;
		}
	}
	// . otherwise, set the url and download it on demand
	// . this junk copied from the contactDoc->* stuff below
	else {
		// a spider request for the root doc (copied from the contact doc code)
		SpiderRequest sreq;
		// clear it
		sreq.reset();
		// spider the url "u"
		strcpy ( sreq.m_url , mysite );
		// set this
		if ( m_sreqValid ) {
			// this will avoid it adding to tagdb!
			sreq.m_isPageParser = m_sreq.m_isPageParser;
		}
		// reset the data size
		sreq.setDataSize();
		// . prepare to download it, set it up
		// . returns false and sets g_errno on error
		if ( ! m_rootDoc->set4 ( &sreq        ,
					 NULL         , // doledbkey ptr
					 cr->m_coll       ,
					 NULL         , // SafeBuf
					 m_niceness   )) {
			mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke");
			delete ( m_rootDoc );
			m_rootDoc = NULL;
			return NULL;
		}
		// do not throttle it!
		//m_rootDoc->m_throttleDownload = false;
		// . do not do robots check for it
		// . no we must to avoid triggering a bot trap & getting banned
		//m_rootDoc->m_isAllowed      = m_isAllowed;
		//m_rootDoc->m_isAllowedValid = true;
	}

	// share our masterloop and state!
	m_rootDoc->m_masterLoop  = m_masterLoop;
	m_rootDoc->m_masterState = m_masterState;

	// msg13 caches the pages it downloads
	m_rootDoc->m_maxCacheAge = maxCacheAge;

	// like m_contactDoc we avoid unnecessary lookups in call to
	// getRedirUrl() by validating these empty members
	m_rootDoc->ptr_linkInfo1           = &s_dummy;
	m_rootDoc->size_linkInfo1          = 0;
	m_rootDoc->m_linkInfo1Valid        = true;
	m_rootDoc->m_urlFilterNumValid     = true;
	m_rootDoc->m_urlFilterNum          = 0;
	// for redirects
	m_rootDoc->m_allowSimplifiedRedirs = true;
	// set this flag so msg13.cpp doesn't print the "hammering ip" msg
	m_rootDoc->m_isChildDoc = true;

	// validate it
	m_rootDocValid = true;
	return &m_rootDoc;
}

SafeBuf *XmlDoc::getTimeAxisUrl ( ) {
	if ( m_timeAxisUrlValid ) return &m_timeAxisUrl;
	if ( m_setFromDocId ) return &m_timeAxisUrl;
	m_timeAxisUrlValid = true;
	Url *fu = getFirstUrl();
	m_timeAxisUrl.reset();
	m_timeAxisUrl.safePrintf("%s.%u",fu->getUrl(),m_contentHash32);
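	// hypothetical example: for first url "http://example.com/page" and
	// m_contentHash32 = 12345 this yields "http://example.com/page.12345",
	// so different content versions of the same url do not collide on
	// docid (see getOldTitleRec() below).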
	return &m_timeAxisUrl;
}

// . look up TitleRec using Msg22 if we need to
// . set our m_titleRec member from titledb
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
//   from scratch. this loads it from titledb.
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
char **XmlDoc::getOldTitleRec() {
	// if valid return that
	if ( m_oldTitleRecValid ) {
		return &m_oldTitleRec;
	}

	// update status msg
	setStatus ( "getting old title rec");
	// if we are set from a title rec, we are the old doc
	if ( m_setFromTitleRec ) {
		m_oldTitleRecValid = true;
		m_oldTitleRec      = NULL;//m_titleRec;
		return &m_oldTitleRec;
	}
	// sanity check
	if ( m_oldTitleRecValid && m_msg22a.isOutstanding() ) {
		g_process.shutdownAbort(true);
	}

	// assume it's valid
	m_oldTitleRecValid = true;

	// we need to do this so XmlDoc::getDocId() works!
	// this logic prevents us from setting g_errno to ENOTFOUND
	// when m_msg22a below calls indexDocWrapper(). however, a query
	// delete on a docid that is not found will still succumb to
	// the g_errno because m_isIndexed is not valid, i think...
	if ( m_isIndexedValid && ! m_isIndexed && m_docIdValid ) {
		m_oldTitleRec      = NULL;
		m_oldTitleRecValid = true;
		return &m_oldTitleRec;
	}
	// sanity check. if we have no url or docid ...
	if ( ! m_firstUrlValid && ! m_docIdValid ) {
		g_process.shutdownAbort(true);
	}

	// use docid if first url not valid
	int64_t docId = 0;
	if ( ! m_firstUrlValid ) {
		docId = m_docId;
	}
	// if url not valid, use NULL
	char *u = NULL;
	if ( docId == 0LL && ptr_firstUrl ) u = getFirstUrl()->getUrl();
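	// i.e. prefer looking up by url when we have a valid first url, and
	// fall back to the probable docid only when this doc was set from a
	// docid alone.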
	// if both are not given that is a problem
	if ( docId == 0LL && ! u ) {
		log(LOG_WARN, "doc: no url or docid provided to get old doc");
		g_errno = EBADENGINEER;
		return NULL;
	}
	CollectionRec *cr = getCollRec();
	if ( ! cr ) {
		return NULL;
	}

	// if using the time axis then append the content hash to the end of
	// the url (see getTimeAxisUrl()). this way Msg22::getAvailDocId()
	// will return a docid based on that so we don't collide with other
	// instances of this same url.
	if ( u && getUseTimeAxis() ) { // g_conf.m_useTimeAxis ) {
		SafeBuf *tau = getTimeAxisUrl();
		u = tau->getBufStart();
	}

	// the title must be local since we're spidering it
	if ( ! m_msg22a.getTitleRec ( &m_msg22Request      ,
				      u                    ,
				      docId                , // probable docid
				      cr->m_coll               ,
				      // . msg22 will set this to point to it!
				      // . if NULL that means NOT FOUND
				      &m_oldTitleRec       ,
				      &m_oldTitleRecSize   ,
				      false                , // just chk tfndb?
				      false , // getAvailDocIdOnly
				      m_masterState        ,
				      m_masterLoop         ,
				      m_niceness           , // niceness
				      999999               )) {// timeout seconds
		// return -1 if we blocked
		return (char **)-1;
	}

	// not really an error
	if ( g_errno == ENOTFOUND ) {
		g_errno = 0;
	}

	// error?
	if ( g_errno ) {
		return NULL;
	}

	// got it
	return &m_oldTitleRec;
}


// . look up TitleRec using Msg22 if we need to
// . set our m_titleRec member from titledb
// . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec
//   from scratch. this loads it from titledb.
// . NULL is a valid value (EDOCNOTFOUND) so return a char **
char **XmlDoc::getRootTitleRec ( ) {
	// if valid return that
	if ( m_rootTitleRecValid ) return &m_rootTitleRec;
	// are we a root?
	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot;
	// if we are root use us!!!!! well, the old us...
	if ( *isRoot ) {
		char **otr = getOldTitleRec ( );
		if ( ! otr || otr == (char **)-1 ) return (char **)otr;
		m_rootTitleRec     = m_oldTitleRec;
		m_rootTitleRecSize = m_oldTitleRecSize;
		return &m_rootTitleRec;
	}
	// get our site root
	char *mysite = getSite();
	if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite;
	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;
	// make it a url. keep it on stack since msg22 copies it into its
	// url request buffer anyway! (m_msg22Request.m_url[])
	Url site; site.set ( mysite );
	// assume it's valid
	m_rootTitleRecValid = true;
	//if ( maxCacheAge > 0 ) addToCache = true;
	// update status msg
	setStatus ( "getting root title rec");
	// the title must be local since we're spidering it
	if ( ! m_msg22b.getTitleRec ( &m_msg22Request      ,
				      site.getUrl()        ,
				      0                    , // probable docid
				      cr->m_coll               ,
				      // . msg22 will set this to point to it!
				      // . if NULL that means NOT FOUND
				      &m_rootTitleRec      ,
				      &m_rootTitleRecSize  ,
				      false                , // just chk tfndb?
				      false , // getAvailDocIdOnly
				      m_masterState        ,
				      m_masterLoop         ,
				      m_niceness           , // niceness
				      999999               )) // timeout seconds
		// return -1 if we blocked
		return (char **)-1;
	// not really an error
	if ( g_errno == ENOTFOUND ) g_errno = 0;
	// error?
	if ( g_errno ) return NULL;
	// got it
	return &m_rootTitleRec;
}



// used for indexing spider replies. we need a unique docid because it
// is treated as a different document even though its url will be the same.
// and there is never an "older" version of it because each reply is treated
// as a brand new document.
int64_t *XmlDoc::getAvailDocIdOnly ( int64_t preferredDocId ) {
	if ( m_availDocIdValid && g_errno ) {
		log("xmldoc: error getting availdocid: %s",
		    mstrerror(g_errno));
		return NULL;
	}
	if ( m_availDocIdValid )
		// this is 0 or -1 if no avail docid was found
		return &m_msg22c.m_availDocId;
	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;
	// pre-validate it
	m_availDocIdValid = true;
	if ( ! m_msg22c.getAvailDocIdOnly ( &m_msg22Requestc ,
					    preferredDocId ,
					    cr->m_coll ,
					    m_masterState ,
					    m_masterLoop ,
					    m_niceness ) )
		return (int64_t *)-1;
	// error?
	log("xmldoc: error getting availdocid2: %s",mstrerror(g_errno));
	return NULL;
}


int64_t *XmlDoc::getDocId ( ) {
	if ( m_docIdValid ) return &m_docId;
	setStatus ("getting docid");
	XmlDoc **od = getOldXmlDoc( );
	if ( ! od || od == (XmlDoc **)-1 ) return (int64_t *)od;
	setStatus ("getting docid");

	// . set our docid
	// . *od is NULL if no title rec found with that docid in titledb
	if ( *od ) {
		m_docId = *(*od)->getDocId();
		m_docIdValid = true;
		return &m_docId;
	}

	m_docId = m_msg22a.getAvailDocId();

	// if the titlerec was there but *od is NULL, it had an error
	// uncompressing because of the corruption bug in RdbMem.cpp when
	// dumping to disk.
	if ( m_docId == 0 && m_oldTitleRec && m_oldTitleRecSize > 12 ) {
		m_docId = Titledb::getDocIdFromKey ( (key96_t *)m_oldTitleRec );
		log(LOG_WARN, "build: salvaged docid %" PRId64" from corrupt title rec for %s",m_docId,m_firstUrl.getUrl());
	}

	if ( m_docId == 0 ) {
		log(LOG_WARN, "build: docid is 0 for %s",m_firstUrl.getUrl());
		g_errno = ENODOCID;
		return NULL;
	}

	// ensure it is within probable range
	if ( ! getUseTimeAxis () ) {
		char *u = getFirstUrl()->getUrl();
		int64_t pd = Titledb::getProbableDocId(u);
		int64_t d1 = Titledb::getFirstProbableDocId ( pd );
		int64_t d2 = Titledb::getLastProbableDocId  ( pd );
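		// in other words, the assigned docid must fall within the
		// probable-docid window [d1, d2] derived from the url hash;
		// a docid outside that window presumably could never be matched
		// back to this url, so treat it as corruption and abort below.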
		if ( m_docId < d1 || m_docId > d2 ) {
			g_process.shutdownAbort(true); }
	}

	m_docIdValid = true;
	return &m_docId;
}

// . is our docid on disk? i.e. do we exist in the index already?
// . TODO: just check tfndb?
char *XmlDoc::getIsIndexed ( ) {
	if ( m_isIndexedValid ) return &m_isIndexed;

	setStatus ( "getting is indexed" );

	// we must be old if this is true
	//if ( m_setFromTitleRec ) {
	//	m_isNew      = false;
	//	m_isNewValid = true;
	//	return &m_isNew;
	//}
	// get the url
	//char *u = getFirstUrl()->getUrl();

	if ( m_oldDocValid ) {
		m_isIndexedValid = true;
		if ( m_oldDoc ) m_isIndexed = (char)true;
		else            m_isIndexed = (char)false;
		return &m_isIndexed;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// sanity check. if we have no url or docid ...
	if ( ! m_firstUrlValid && ! m_docIdValid ) { g_process.shutdownAbort(true); }
	// use docid if first url not valid
	int64_t docId = 0;
	char      *url  = NULL;
	// use docid if its valid, otherwise use url
	if ( m_docIdValid ) docId = m_docId;
	else                url   = ptr_firstUrl;

	// note it
	if(!m_calledMsg22e) {
		setStatus ( "checking titledb for old title rec");
		m_calledMsg22e = true;

		// . consult the title rec tree!
		// . "justCheckTfndb" is set to true here!
		if(!m_msg22e.getTitleRec(&m_msg22Request,
					 url,
					 docId                , // probable docid
					 cr->m_coll               ,
					 // . msg22 will set this to point to it!
					 // . if NULL that means NOT FOUND
					 NULL                 , // tr ptr
					 NULL                 , // tr size ptr
					 true                 , // just chk tfndb?
					 false, // getavaildocidonly
					 m_masterState        ,
					 m_masterLoop         ,
					 m_niceness           , // niceness
					 999999               )){ // timeout seconds
			logTrace( g_conf.m_logTraceXmlDoc, "END, called msg22e.getTitleRec, which blocked. Return -1" );
			// return -1 if we blocked
			return (char *)-1;
		}
		logTrace( g_conf.m_logTraceXmlDoc, "msg22e.getTitleRec did not block" );
	} else
		setStatus ( "back from msg22e call");

	// error?
	if ( g_errno ) return NULL;
	// get it
	m_isIndexed = (char)m_msg22e.wasFound();
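	// note: we passed "just chk tfndb?" = true above, so wasFound() only
	// tells us whether a title rec key exists for this url/docid, which
	// is all we need here.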

	// validate
	m_isIndexedValid = true;
	logTrace( g_conf.m_logTraceXmlDoc, "END, returning isIndexed [%s]", m_isIndexed?"true":"false");
	return &m_isIndexed;
}

static void gotTagRecWrapper(void *state) {
	XmlDoc *THIS = (XmlDoc *)state;
	// note it
	THIS->setStatus ( "in got tag rec wrapper" );
	// set these
	if ( ! g_errno ) {
		THIS->m_tagRec.serialize ( THIS->m_tagRecBuf );
		THIS->ptr_tagRecData =  THIS->m_tagRecBuf.getBufStart();
		THIS->size_tagRecData = THIS->m_tagRecBuf.length();
		// validate
		THIS->m_tagRecValid = true;
	}
	// continue
	THIS->m_masterLoop ( THIS->m_masterState );
}

// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
TagRec *XmlDoc::getTagRec ( ) {
	// if we got it give it
	if ( m_tagRecValid ) return &m_tagRec;

	// do we have a title rec?
	if ( m_setFromTitleRec && m_tagRecDataValid ) {
		// we set m_tagRecValid and m_tagRecDataValid to false in Repair.cpp
		// if rebuilding titledb!! otherwise, we have to use what is in the
		// titlerec to avoid parsing inconsistencies that would result in
		// undeletable posdb data.

		// lookup the tagdb rec fresh if setting for a summary. that way
		// we can see if it is banned or not

		// all done
		m_tagRecValid = true;

		// set our TagRec from the data stored in the titlerec
		m_tagRec.setFromBuf ( ptr_tagRecData , size_tagRecData );

		return &m_tagRec;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// update status msg
	setStatus ( "getting tagdb record" );

	// nah, try this
	Url *u = getFirstUrl();

	// get it. use our collection for lookups, not m_tagdbColl[] yet!
	if ( !m_msg8a.getTagRec( u, cr->m_collnum, m_niceness, this, gotTagRecWrapper, &m_tagRec ) ) {
		// we blocked, return -1
		return (TagRec *) -1;
	}

	// error? ENOCOLLREC?
	if ( g_errno ) {
		return NULL;
	}

	// assign it
	m_tagRec.serialize ( m_tagRecBuf );
	ptr_tagRecData =  m_tagRecBuf.getBufStart();
	size_tagRecData = m_tagRecBuf.length();

	// our tag rec should be all valid now
	m_tagRecValid = true;
	return &m_tagRec;
}




// we need this for setting SpiderRequest::m_parentFirstIp of each outlink
int32_t *XmlDoc::getFirstIp ( ) {
	// return it if we got it
	if ( m_firstIpValid ) return &m_firstIp;
	// note it
	setStatus ( "getting first ip");
	// get tag rec
	TagRec *gr = getTagRec();
	if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
	// got it
	Tag *tag = gr->getTag ( "firstip" );
	// get from tag
	m_firstIp = 0;
	if ( tag ) m_firstIp = atoip(tag->getTagData());
	// if no tag, or is bogus in tag... set from ip
	if ( m_firstIp == 0 || m_firstIp == -1 ) {
		// need ip then!
		int32_t *ip = getIp();
		if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip;
		// set that
		m_firstIp = *ip;
	}
	m_firstIpValid = true;
	return &m_firstIp;
	// must be 4 bytes - no, now it's a string
	//if ( tag->getTagDataSize() != 4 ) { g_process.shutdownAbort(true); }
}

// this is the # of GOOD INLINKS to the site. so it is no more than
// 1 per c block, and it has to pass link spam detection. this is the
// highest-level count of inlinks to the site. we use it a lot.
int32_t *XmlDoc::getSiteNumInlinks ( ) {

	if ( m_siteNumInlinksValid ) return &m_siteNumInlinks;

	// sanity check
	if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {g_process.shutdownAbort(true);}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// hack for speed. computeSiteNumInlinks is true by default,
	// but if the user turns it off then just use sitelinks.txt
	if ( cr && ! cr->m_computeSiteNumInlinks ) {
		int32_t hostHash32 = getHostHash32a();
		int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
		// try with www if not there
		if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
			int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
			min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
		}
		// fix a core by setting these
		m_siteNumInlinksValid = true;
		m_siteNumInlinks      = 0;
		// if still not in sitelinks.txt, just use 0
		if ( min < 0 ) {
			return &m_siteNumInlinks;
		}
		m_siteNumInlinks = min;
		return &m_siteNumInlinks;
	}

	setStatus ( "getting site num inlinks");

	// get it from the tag rec if we can
	TagRec *gr = getTagRec ();
	if ( ! gr || gr == (void *)-1 ) return (int32_t *)gr;

	// the current top ip address
	int32_t *ip = getIp();
	if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip;
	//int32_t top = *ip & 0x00ffffff;

	// this happens when its NXDOMAIN reply from dns so assume
	// no site inlinks
	if ( *ip == 0 ) {
		m_siteNumInlinks             = 0;
		m_siteNumInlinksValid             = true;
		return &m_siteNumInlinks;
	}

	if ( *ip == -1 ) {
		log("xmldoc: ip is %" PRId32", can not get site inlinks",*ip);
		g_errno = EBADIP;
		return NULL;
	}

	setStatus ( "getting site num inlinks");
	// check the tag first
	Tag *tag = gr->getTag ("sitenuminlinks");
	// is it valid?
	bool valid = true;
	// current time
	int32_t now = getTimeGlobal();

	// get tag age in days
	int32_t age = 0; if ( tag ) age = (now - tag->m_timestamp) ;
	// add in some flutter to avoid having all hsots in the network
	// calling msg25 for this site at the same time.
	// a 10,000 second jitter. 3 hours.
	int32_t flutter = rand() % 10000;
	// add it in
	age += flutter;
	// . if site changes ip then toss the contact info out the window,
	//   but give it a two week grace period
	// . well now we use the "ownershipchanged" tag to indicate that
	//if (tag && age>14*3600*24) valid=false;
	// . we also expire it periodically to keep the info up to date
	// . the higher quality the site, the longer the expiration date
	int32_t ns = 0;
	int32_t maxAge = 0;
	int32_t sni = -1;
	if ( tag ) {
		// how many site inlinks?
		ns = atol(tag->getTagData());
		// for less popular sites use smaller maxAges
		maxAge = 90;
		if      ( ns <  10 ) maxAge = 10;
		else if ( ns <  30 ) maxAge = 15;
		else if ( ns <  50 ) maxAge = 30;
		else if ( ns < 100 ) maxAge = 60;
		// if index size is tiny then maybe we are just starting to
		// build something massive, so reduce the cached max age
		int64_t nt = g_titledb.getRdb()->getCollNumTotalRecs(m_collnum);
		if ( nt < 100000000 ) //100M
			maxAge = 3;
		if ( nt < 10000000 ) //10M
			maxAge = 1;
		// for every 100 good site inlinks, add a day to maxAge below!
		sni = atol(tag->getTagData());

		/// @note if we need to force an update in tagdb for sitenuminlinks, add it here

		// convert into seconds
		maxAge *= 3600*24;
		// so youtube which has 2997 links will add an extra 29 days
		maxAge += (sni / 100) * 86400;
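		// worked example (hypothetical values): sni = 250 good inlinks
		// in a collection with >= 100M titlerecs gives maxAge = 90 days,
		// converted to 90*86400 seconds, plus (250/100)*86400 = 2 more
		// days, so roughly 92 days before we recompute.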
		// hack for global index. never affect siteinlinks i imported
		if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) age = 0;
		// invalidate for that as well
		if ( age > maxAge ) valid = false;
	}

	// if we have already been through this
	if ( m_updatingSiteLinkInfoTags ) valid = false;
	// if rebuilding linkdb we have no links to sample from, so keep the cached tag!
	if ( tag && m_useSecondaryRdbs && g_repair.linkdbRebuildPending() )
		valid = true;

	// debug log
	if ( g_conf.m_logDebugLinkInfo )
		log("xmldoc: valid=%" PRId32" "
		    "age=%" PRId32" ns=%" PRId32" sni=%" PRId32" "
		    "maxage=%" PRId32" "
		    "tag=%" PTRFMT" "
		    // "tag2=%" PTRFMT" "
		    // "tag3=%" PTRFMT" "
		    "url=%s",
		    (int32_t)valid,age,ns,sni,
		    maxAge,
		    (PTRTYPE)tag,
		    // (PTRTYPE)tag2,
		    // (PTRTYPE)tag3,
		    m_firstUrl.getUrl());

	LinkInfo *sinfo = NULL;
	char *mysite = NULL;

	// if we are good return it
	if ( tag && valid ) {
		// set it
		m_siteNumInlinks = atol(tag->getTagData());
		m_siteNumInlinksValid = true;

		// . consult our sitelinks.txt file
		// . returns -1 if not found
		goto updateToMin;
	}

	// set this flag so when we are re-called, "valid" will be set to false
	// so we can come down here and continue this. "flutter" might
	// otherwise cause us to not make it down here.
	m_updatingSiteLinkInfoTags = true;

	// we need to re-get both if either is NULL
	sinfo = getSiteLinkInfo();
	// block or error?
	if ( ! sinfo || sinfo == (LinkInfo *)-1) return (int32_t *)sinfo;

	//
	// now update tagdb!
	//

	mysite = getSite();
	if ( ! mysite || mysite == (void *)-1 ) return (int32_t *)mysite;

	setStatus ( "adding site info tags to tagdb 1");

	// why are we adding tag again! should already be in tagdb!!!
	if ( m_doingConsistencyCheck ) {g_process.shutdownAbort(true);}

	// do not re-call at this point
	m_siteNumInlinks      = (int32_t)sinfo->m_numGoodInlinks;
	m_siteNumInlinksValid      = true;

 updateToMin:

	// . consult our sitelinks.txt file
	// . returns -1 if not found
	int32_t hostHash32 = getHostHash32a();
	int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );

	// try with www if not there
	if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
		int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
		min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
	}

	if ( min >= 0 ) {
		if ( m_siteNumInlinks < min ||
		     ! m_siteNumInlinksValid ) {
			m_siteNumInlinks = min;
			m_siteNumInlinksValid = true;
		}
	}

	// deal with it
	return &m_siteNumInlinks;
}

// TODO: can we have a NULL LinkInfo without having had an error?
LinkInfo *XmlDoc::getSiteLinkInfo() {
	// lookup problem?
	if ( g_errno ) {
		log("build: error getting link info: %s",
		    mstrerror(g_errno));
		return NULL;
	}

	setStatus ( "getting site link info" );

	if ( m_siteLinkInfoValid )
	{
		//return msg25.m_linkInfo;
		return (LinkInfo *)m_mySiteLinkInfoBuf.getBufStart();
	}

	char *mysite = getSite();
	if ( ! mysite || mysite == (void *)-1 )
	{
		return (LinkInfo *)mysite;
	}

	int32_t *fip = getFirstIp();
	if ( ! fip || fip == (int32_t *)-1)
	{
		return (LinkInfo *)fip;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		return NULL;
	}

	// can we be cancelled?
	bool canBeCancelled = true;
	// not if pageparser though
	if ( m_pbuf ) canBeCancelled = false;
	// not if injecting
	if ( ! m_sreqValid ) canBeCancelled = false;
	// assume valid when it returns
	m_siteLinkInfoValid = true;
	// use this buffer so XmlDoc::print() can display it where it wants
	SafeBuf *sb = NULL;
	if ( m_pbuf ) sb = &m_siteLinkBuf;
	// only do this for showing them!!!
	if ( m_useSiteLinkBuf ) sb = &m_siteLinkBuf;
	//bool onlyGetGoodInlinks = true;
	//if ( m_useSiteLinkBuf ) onlyGetGoodInlinks = false;
	// get this
	int32_t lastUpdateTime = getTimeGlobal();
	// get from spider request if there
	//bool injected = false;
	//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;

	bool onlyNeedGoodInlinks = true;
	// so if steve wants to display all links then set this
	// to false so we get titles of bad inlinks
	// seems like pageparser.cpp just sets m_pbuf and not
	// m_usePageLinkBuf any more
	if ( sb ) onlyNeedGoodInlinks = false;

	// shortcut
	//Msg25 *m = &m_msg25;
	if ( ! getLinkInfo ( &m_tmpBuf11,
			     &m_mcast11,
			     mysite , // site
				mysite , // url
				true , // isSiteLinkInfo?
				*fip                 ,
				0 , // docId
				cr->m_collnum           , //linkInfoColl
				m_masterState       ,
				m_masterLoop        ,
				m_contentInjected ,// isInjecting?
				sb                  ,
			     m_printInXml        ,
				0 , // sitenuminlinks -- dunno!
				NULL , // oldLinkInfo1        ,
				m_niceness          ,
				cr->m_doLinkSpamCheck ,
				cr->m_oneVotePerIpDom ,
				canBeCancelled        ,
				lastUpdateTime ,
				onlyNeedGoodInlinks ,
				false,
				0,
				0,
				// it will store the linkinfo into this safebuf
				&m_mySiteLinkInfoBuf) )
		// return -1 if it blocked
		return (LinkInfo *)-1;

	// getLinkInfo() now calls multicast so it returns true on errors only
	log("build: error making link info: %s",mstrerror(g_errno));
	return NULL;
}

static void gotIpWrapper ( void *state , int32_t ip ) ;

static void delayWrapper ( int fd , void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	THIS->m_masterLoop ( THIS->m_masterState );
}

// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
int32_t *XmlDoc::getIp ( ) {

	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	// return if we got it
	if ( m_ipValid )
	{
		char ipbuf[16];
		logTrace( g_conf.m_logTraceXmlDoc, "END, already valid [%s]", iptoa(m_ip,ipbuf));
		return &m_ip;
	}

	// update status msg
	setStatus ( "getting ip" );

	m_ipStartTime = 0;
	// assume the same in case we get it right away
	m_ipEndTime = 0;

	// if set from docid and recycling
	if ( m_recycleContent ) {
		// get the old xml doc from the old title rec
		XmlDoc **pod = getOldXmlDoc ( );
		if ( ! pod || pod == (void *)-1 )
		{
			logTrace( g_conf.m_logTraceXmlDoc, "END, return -1. getOldXmlDoc failed" );
			return (int32_t *)pod;
		}

		// shortcut
		XmlDoc *od = *pod;
		// set it
		if ( od ) {
			m_ip      = od->m_ip;
			m_ipValid = true;

			char ipbuf[16];
			logTrace( g_conf.m_logTraceXmlDoc, "END, got it from old XmlDoc [%s]", iptoa(m_ip,ipbuf));
			return &m_ip;
		}
	}


	// get the best url
	Url *u = getCurrentUrl();
	if ( ! u || u == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return -1. getCurrentUrl failed." );
		return (int32_t *)u;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. getCollRec failed" );
		return NULL;
	}

	// we need the ip before we download the page, but before we get
	// the IP and download the page, wait for this many milliseconds.
	// this basically slows the spider down.
	int32_t delay = cr->m_spiderDelayInMilliseconds;
	// injected?
	if ( m_sreqValid && m_sreq.m_isInjecting  ) delay = 0;
	if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
	if ( m_sreqValid && m_sreq.m_fakeFirstIp  ) delay = 0;
	// . don't do the delay when downloading extra doc, robots.txt etc.
	// . this also reports a status msg of "getting new doc" when it
	//   really means "delaying spider"
	if ( m_isChildDoc ) delay = 0;

	if ( delay > 0 && ! m_didDelay ) {
		// we did it
		m_didDelay = true;
		m_statusMsg = "delaying spider";
		// random fuzz so we don't get everyone being unleashed at once
		int32_t radius = delay/5;
		if(radius<=0)
			radius = 1;
		int32_t fuzz = (rand() % (radius * 2)) - radius;
		delay += fuzz;
		if(delay<=0)
			delay = 1;
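		// worked example (hypothetical values): delay = 500 ms gives
		// radius = 100, so fuzz is in [-100, 99] and the final sleep is
		// roughly 400..599 ms; non-positive results are clamped to 1 ms
		// above.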
		logTrace( g_conf.m_logTraceXmlDoc, "SLEEPING %" PRId32" msecs", delay);
		// make a callback wrapper.
		// this returns false and sets g_errno on error
		if (g_loop.registerSleepCallback(delay, m_masterState, delayWrapper, "XmlDoc::delayWrapper", m_niceness))
			// wait for it, return -1 since we blocked
			return (int32_t *)-1;
		// if was not able to register, ignore delay
	}

	if ( m_didDelay && ! m_didDelayUnregister ) {
		g_loop.unregisterSleepCallback(m_masterState,delayWrapper);
		m_didDelayUnregister = true;
	}

	// update status msg
	setStatus ( "getting ip" );

	m_ipStartTime = gettimeofdayInMilliseconds();

	// assume valid! if reply handler gets g_errno set then m_masterLoop
	// should see that and call the final callback
	//m_ipValid = true;
	// get it
	logTrace( g_conf.m_logTraceXmlDoc, "Calling MsgC.getIp [%s]", u->getHost());
	if (!m_msgc.getIp(u->getHost(), u->getHostLen(), &m_ip, this, gotIpWrapper)) {
		// we blocked
		logTrace( g_conf.m_logTraceXmlDoc, "END, return -1. Blocked." );
		return (int32_t *)-1;
	}

	// wrap it up
	int32_t *rval2 = gotIp ( true );
	char ipbuf[16];
	logTrace( g_conf.m_logTraceXmlDoc, "END, return [%s]", rval2 ? iptoa(*rval2,ipbuf) : "NULL");
	return rval2;
}



void gotIpWrapper ( void *state , int32_t ip ) {
	// point to us
	XmlDoc *THIS = (XmlDoc *)state;

	THIS->m_ipEndTime = gettimeofdayInMilliseconds();

	char ipbuf[16];
	logTrace( g_conf.m_logTraceXmlDoc, "Got IP [%s]. Took %" PRId64" msec", iptoa(ip,ipbuf), THIS->m_ipEndTime - THIS->m_ipStartTime);

	// wrap it up
	THIS->gotIp ( true );
	// . call the master callback
	// . m_masterState usually equals THIS, unless THIS is the
	//   Xml::m_contactDoc or something...
	THIS->m_masterLoop ( THIS->m_masterState );
}


int32_t *XmlDoc::gotIp ( bool save ) {
	// return NULL on error
	if ( g_errno ) return NULL;
	// this is bad too
	//if ( m_ip == 0 || m_ip == -1 ) m_indexCode = EBADIP;
	//log("db: got ip %s for %s",iptoa(m_ip),getCurrentUrl()->getUrl());

	setStatus ("got ip");

	// we got it
	m_ipValid = true;
	// give it to them
	return &m_ip;
}


// when doing a custom crawl we have to decide between the provided crawl
// delay, and the one in the robots.txt...
int32_t *XmlDoc::getFinalCrawlDelay() {

	if ( m_finalCrawlDelayValid ) {
		if ( g_conf.m_logDebugRobots  ) {
			log(LOG_DEBUG,"getFinalCrawlDelay: returning %" PRId32 " - m_finalCrawlDelayValid is true", m_finalCrawlDelay);
		}
		return &m_finalCrawlDelay;
	}

	bool *isAllowed = getIsAllowed();
	if ( ! isAllowed || isAllowed == (void *)-1 ) {
		if ( g_conf.m_logDebugRobots  ) {
			log(LOG_DEBUG,"getFinalCrawlDelay: not allowed");
		}
		return (int32_t *)isAllowed;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) {
		if ( g_conf.m_logDebugRobots  ) {
			log(LOG_DEBUG,"getFinalCrawlDelay: Returning NULL, no CollectionRec");
		}
		return NULL;
	}

	m_finalCrawlDelayValid = true;

	// getIsAllowed already sets m_crawlDelayValid to true
	m_finalCrawlDelay = m_crawlDelay;

	// Changed previously hard coded default of 250ms to the 
	// configurable delay for sites with no robots.txt
	if ( m_crawlDelay < 0 )	{
		m_finalCrawlDelay = cr->m_crawlDelayDefaultForNoRobotsTxtMS;
	}

	if ( g_conf.m_logDebugRobots  ) {
		log(LOG_DEBUG,"getFinalCrawlDelay: returning %" PRId32 ". Setting m_finalCrawlDelayValid to true", m_finalCrawlDelay);
	}

	return &m_finalCrawlDelay;
}


bool XmlDoc::isFirstUrlRobotsTxt ( ) {
	if ( m_isRobotsTxtUrlValid )
		return m_isRobotsTxtUrl;
	Url *fu = getFirstUrl();

	m_isRobotsTxtUrl = ( fu->getUrlLen() > 12 && ! strncmp ( fu->getUrl() + fu->getUrlLen() - 11 , "/robots.txt" , 11 ) );
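	// hypothetical example: "http://example.com/robots.txt" matches since
	// its last 11 chars are "/robots.txt"; the urlLen > 12 check just
	// keeps the suffix comparison within the url buffer.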
	m_isRobotsTxtUrlValid = true;
	return m_isRobotsTxtUrl;
}


// . get the Robots.txt and see if we are allowed
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
// . getting a robots.txt is not trivial since we need to follow redirects,
//   so we make use of the powerful XmlDoc class for this
bool *XmlDoc::getIsAllowed ( ) {

	logTrace( g_conf.m_logTraceSpider, "BEGIN" );

	// return if we got it
	if ( m_isAllowedValid )
	{
		logTrace( g_conf.m_logTraceSpider, "END. Valid. Allowed=%s",(m_isAllowed?"true":"false"));
		return &m_isAllowed;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) {
		log(LOG_ERROR,"getIsAllowed - NOT allowed, could not get CollectionRec!");
		m_isAllowed      = false;
		return &m_isAllowed;
	}

	// could be turned off for everyone
	if ( ! m_useRobotsTxt ) {
		m_isAllowed      = true;
		m_isAllowedValid = true;
		m_crawlDelayValid = true;
		m_crawlDelay = cr->m_crawlDelayDefaultForNoRobotsTxtMS;

		//log("xmldoc: skipping robots.txt lookup for %s",
		//    m_firstUrl.m_url);
		logTrace( g_conf.m_logTraceSpider, "END. !m_useRobotsTxt" );
		return &m_isAllowed;
	}

	// . if setting from a title rec, assume allowed
	// . this avoids doConsistencyCheck() from blocking and coring
	if ( m_setFromTitleRec ) {
		m_isAllowed      = true;
		m_isAllowedValid = true;
		logTrace( g_conf.m_logTraceSpider, "END. Allowed, m_setFromTitleRec" );
		return &m_isAllowed;
	}

	if ( m_recycleContent ) {
		m_isAllowed      = true;
		m_isAllowedValid = true;
		logTrace( g_conf.m_logTraceSpider, "END. Allowed, m_recycleContent" );
		return &m_isAllowed;
	}

	// double get?
	if ( m_crawlDelayValid ) { g_process.shutdownAbort(true); }

	// . if WE are robots.txt that is always allowed!!!
	// . check the *first* url since these often redirect to weird things
	if ( isFirstUrlRobotsTxt() ) {
		m_isAllowed      = true;
		m_isAllowedValid = true;
		m_crawlDelayValid = true;
		// make it super fast...
		m_crawlDelay      = 0;
		logTrace( g_conf.m_logTraceSpider, "END. Allowed, WE are robots.txt" );
		return &m_isAllowed;
	}

	// update status msg
	setStatus ( "getting robots.txt" );
	// sanity
	int32_t *ip = getIp();
	// error? or blocked?
	if ( ! ip || ip == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceSpider, "END. getIp failed" );
		return (bool *)ip;
	}

	Url *fu = getFirstUrl();
	// if the ip does not exist in dns, do not try to download robots.txt;
	// it is pointless... this can happen in the dir coll and we basically
	// have "m_siteInCatdb" set to true
	char ipbuf[16];
	logTrace( g_conf.m_logTraceSpider, "IP=%s", iptoa(*ip,ipbuf));

	if ( *ip == 1 || *ip == 0 || *ip == -1 ) {
		// note this
		log("build: robots.txt ip is %s for url=%s. allowing for now.", iptoa(*ip,ipbuf), fu->getUrl());
		// just core for now
		//g_process.shutdownAbort(true);

		//@todo BR: WHY allow when we couldn't get IP??
		m_isAllowed      = true;
		m_isAllowedValid = true;
		// since ENOMIME is no longer causing the indexCode
		// to be set, we are getting a core because crawlDelay
		// is invalid in getNewSpiderReply()
		m_crawlDelayValid = true;
		m_crawlDelay = cr->m_crawlDelayDefaultForNoRobotsTxtMS;
		logTrace( g_conf.m_logTraceSpider, "END. We allow it. FIX?" );
		return &m_isAllowed;
	}

	// we need this so getExtraDoc does not core
	int32_t *pfip = getFirstIp();
	if ( ! pfip || pfip == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceSpider, "END. No first IP, return %s", ((bool *)pfip?"true":"false"));
		return (bool *)pfip;
	}


	// get the current url after redirects
	Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceSpider, "END. No current URL, return %s", ((bool *)cu?"true":"false"));
		return (bool *)cu;
	}


	// set m_extraUrl to the robots.txt url
	char buf[MAX_URL_LEN+1];
	char *p = buf;
	if ( ! cu->getScheme() )
	{
		p += sprintf ( p , "http://" );
	}
	else
	{
		gbmemcpy ( p , cu->getScheme() , cu->getSchemeLen() );
		p += cu->getSchemeLen();
		p += sprintf(p,"://");
	}



	// sanity
	if ( ! cu->getHost() ) { g_process.shutdownAbort(true); }
	gbmemcpy ( p , cu->getHost() , cu->getHostLen() );
	p += cu->getHostLen();

	// add port if not default
	if ( cu->getPort() != cu->getDefaultPort() ) {
		p += sprintf( p, ":%" PRId32, cu->getPort() );
	}

	p += sprintf ( p , "/robots.txt" );
	m_extraUrl.set ( buf );
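	// hypothetical example: for current url
	// "https://www.example.com:8443/a/b.html" m_extraUrl becomes
	// "https://www.example.com:8443/robots.txt"; the port is only
	// appended when it differs from the scheme's default.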

	logTrace( g_conf.m_logTraceSpider, "m_extraUrl [%s]", buf);


	// . maxCacheAge = 3600 seconds = 1 hour for robots.txt
	// . if this is non-zero then msg13 should store it as well!
	// . for robots.txt it should only cache the portion of the doc
	//   relevant to our user agent!
	// . getHttpReply() should use msg13 to get cached reply!
	XmlDoc **ped = getExtraDoc(m_extraUrl.getUrl(), cr->m_maxRobotsCacheAge);
	if ( ! ped || ped == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceSpider, "END. getExtraDoc (ped) failed, return %s", ((bool *)ped?"true":"false"));
		return (bool *)ped;
	}

	// assign it
	XmlDoc *ed = *ped;
	// return NULL on error with g_errno set
	if ( ! ed ) {
		// sanity check, g_errno must be set
		if ( ! g_errno ) { g_process.shutdownAbort(true); }
		// log it -- should be rare?
		log("doc: had error getting robots.txt: %s",
		    mstrerror(g_errno));

		logTrace( g_conf.m_logTraceSpider, "END. Return NULL, ed failed" );

		return NULL;
	}

	// . now try the content
	// . should call getHttpReply
	char **pcontent = ed->getContent();
	if ( ! pcontent || pcontent == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceSpider, "END. pcontent failed, return %s", ((bool *)pcontent?"true":"false"));
		return (bool *)pcontent;
	}

	// get the mime
	HttpMime *mime = ed->getMime();
	if ( ! mime || mime == (HttpMime *)-1 )
	{
		logTrace( g_conf.m_logTraceSpider, "END. mime failed, return %s", ((bool *)mime?"true":"false"));
		return (bool *)mime;
	}

	// get this
	int32_t contentLen = ed->m_contentLen;

	// save this
	m_robotsTxtLen = contentLen;
	m_robotsTxtLenValid = true;

	// get content
	char *content = *pcontent;

	// sanity check
	if ( content && contentLen > 0 && content[contentLen] != '\0'){
		g_process.shutdownAbort(true);}

	// reset this. -1 means unknown or none found. We now use a more sane default
	// as the caller would have defaulted to 250ms if set to -1 here.
	m_crawlDelay = cr->m_crawlDelayDefaultForNoRobotsTxtMS;
	m_crawlDelayValid = true;

	// assume valid and ok to spider
	m_isAllowed      = true;
	m_isAllowedValid = true;


	if ( mime->getHttpStatus() != 200 )
	{
		/// @todo ALC we should allow more error codes
		/// 2xx (successful) : allow
		/// 3xx (redirection) : follow
		/// 4xx (client errors) : allow
		/// 5xx (server errors) : disallow

		// We could not get robots.txt - use default crawl-delay for
		// sites with no robots.txt
		m_crawlDelay = cr->m_crawlDelayDefaultForNoRobotsTxtMS;


		// BR 20151215: Do not allow spidering if we cannot read robots.txt EXCEPT if the error code is 404 (Not Found).
		if( mime->getHttpStatus() != 404 )
		{
			m_isAllowed = false;
		}

		logTrace( g_conf.m_logTraceSpider, "END. httpStatus != 200. Return %s", (m_isAllowed?"true":"false"));

		// nuke it to save mem
		nukeDoc ( ed );
		return &m_isAllowed;
	}


	/// @todo ALC cache robots instead of robots.txt
	// initialize robots
	Robots robots( content, contentLen, g_conf.m_spiderBotName );

	m_isAllowed = robots.isAllowed( cu );
	m_crawlDelay = robots.getCrawlDelay();

	if( m_crawlDelay == -1 ) {
		// robots.txt found, but it contains no crawl-delay for us. Set to configured default.
		m_crawlDelay = cr->m_crawlDelayDefaultForRobotsTxtMS;
	}
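	// to summarize the crawl-delay selection above:
	//   - robots.txt not fetchable (non-200): m_crawlDelayDefaultForNoRobotsTxtMS
	//   - robots.txt fetched but no Crawl-delay for our agent: m_crawlDelayDefaultForRobotsTxtMS
	//   - robots.txt specifies a Crawl-delay: use that value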

	m_isAllowedValid = true;

	// nuke it to save mem
	nukeDoc ( ed );

	logTrace( g_conf.m_logTraceSpider, "END. Returning %s (m_crawlDelay=%" PRId32 "", (m_isAllowed?"true":"false"), m_crawlDelay);

	return &m_isAllowed;
}



// . lookup the title rec with the "www." if we do not have that in the url
// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
char *XmlDoc::getIsWWWDup ( ) {
	// this is not a real error really
	//if ( g_errno == ENOTFOUND ) g_errno = 0;
	// return if we got it
	if ( m_isWWWDupValid ) return &m_isWWWDup;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// could be turned off for everyone
	if ( ! cr->m_dupCheckWWW ) {
		m_isWWWDup      = (char)false;
		m_isWWWDupValid = true;
		return &m_isWWWDup;
	}
	// get the FIRST URL... (no longer current url after redirects)
	Url *u = getFirstUrl(); // CurrentUrl();
	// if we are NOT a DOMAIN-ONLY url, then no need to do this dup check
	if ( u->getDomainLen() != u->getHostLen() ) {
		m_isWWWDup      = (char)false;
		m_isWWWDupValid = true;
		return &m_isWWWDup;
	}

	// must NOT have a www
	if ( ! u->isHostWWW() ) {
		m_isWWWDup      = (char)false;
		m_isWWWDupValid = true;
		return &m_isWWWDup;
	}

	// watch out for idiot urls like www.gov.uk and www.gov.za;
	// treat them as though the TLD is uk/za and the domain
	// is gov.uk and gov.za
	if ( u->getDomain() &&
	     strncmp ( u->getDomain() , "www." , 4 ) == 0 ) {
		m_isWWWDup      = (char)false;
		m_isWWWDupValid = true;
		return &m_isWWWDup;
	}

	// make it without the www
	char withoutWWW[MAX_URL_LEN+1];
	const char *proto = "http";
	if ( u->isHttps() ) proto = "https";
	sprintf(withoutWWW,"%s://%s",proto,u->getDomain());
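	// hypothetical example: first url "https://www.example.com/" yields
	// withoutWWW = "https://example.com", which is what we check for in
	// titledb below to see if the no-www twin already exists.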

	// assume yes
	m_isWWWDup = (char)true;

	if ( ! m_calledMsg22f )
		setStatus ( "getting possible www dup title rec" );

	// . does this title rec exist in titledb?
	// . "justCheckTfndb" is set to true here!
	if ( ! m_calledMsg22f &&
	     ! m_msg22f.getTitleRec ( &m_msg22Request      ,
				      withoutWWW           ,
				      0                    , // probable docid
				      cr->m_coll               ,
				      // . msg22 will set this to point to it!
				      // . if NULL that means NOT FOUND
				      NULL                 , // tr ptr
				      NULL                 , // tr size ptr
				      true                 , // just chk tfndb?
				      false, // getavaildocidonly
				      m_masterState        ,
				      m_masterLoop         ,
				      m_niceness           , // niceness
				      999999               )){ // timeout seconds
		// validate
		m_calledMsg22f = true;
		// return -1 if we blocked
		return (char *)-1;
	}
	// got it
	m_calledMsg22f = true;
	// valid now
	m_isWWWDupValid = true;
	// found?
	if(!g_errno && m_msg22f.wasFound()) {
		// crap we are a dup
		m_isWWWDup = (char)true;
		// set the index code
		//m_indexCode = EDOCDUPWWW;
	}
	// return us
	return &m_isWWWDup;
}



static LinkInfo s_dummy2;

// . returns NULL and sets g_errno on error
// . returns -1 if blocked, will re-call m_callback
LinkInfo *XmlDoc::getLinkInfo1 ( ) {
	if ( m_linkInfo1Valid && ptr_linkInfo1 )
		return ptr_linkInfo1;

	// do not generate in real-time from a msg20 request for a summary,
	// because if this falls through then getFirstIp() below can return -1
	// and we return -1, causing all kinds of bad things to happen for
	// handling the msg20 request
	if ( m_setFromTitleRec && m_req && ! ptr_linkInfo1 ) {
		memset ( &s_dummy2 , 0 , sizeof(s_dummy2) );
		s_dummy2.m_lisize = sizeof(s_dummy2);
		ptr_linkInfo1  = &s_dummy2;
		size_linkInfo1 = sizeof(s_dummy2);
		return ptr_linkInfo1;
	}

	// at least get our firstip so if cr->m_getLinkInfo is false
	// then getRevisedSpiderReq() will not core because it is invalid
	int32_t *ip = getFirstIp();
	if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo *)ip;


	// just return nothing if not doing link voting
	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;
	// to keep things fast we avoid getting link info for some collections
	if ( ! m_linkInfo1Valid && ! cr->m_getLinkInfo ) {
		ptr_linkInfo1 = NULL;
		m_linkInfo1Valid = true;
	}

	// sometimes it is NULL in title rec when setting from title rec
	if ( m_linkInfo1Valid && ! ptr_linkInfo1 ) {
		memset ( &s_dummy2 , 0 , sizeof(s_dummy2) );
		s_dummy2.m_lisize = sizeof(s_dummy2);
		ptr_linkInfo1  = &s_dummy2;
		size_linkInfo1 = sizeof(s_dummy2);
		return ptr_linkInfo1;
	}

	// return if we got it
	if ( m_linkInfo1Valid )
		return ptr_linkInfo1;

	// change status
	setStatus ( "getting local inlinkers" );

	XmlDoc **od = getOldXmlDoc ( );
	if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo *)od;
	int32_t *sni = getSiteNumInlinks();
	if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo *)sni;
	//int32_t *fip = getFirstIp();
	//if ( ! fip || fip == (int32_t *)-1 ) return (LinkInfo *)fip;
	int64_t *d = getDocId();
	if ( ! d || d == (int64_t *)-1 ) return (LinkInfo *)d;
	// sanity check. error?
	if ( *d == 0LL ) {
		log("xmldoc: crap no g_errno");
		g_errno = EBADENGINEER;
		return NULL;
	}
	char *mysite = getSite();
	if ( ! mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite;

	// grab a ptr to the LinkInfo contained in our Doc class
	LinkInfo  *oldLinkInfo1 = NULL;
	if ( *od ) oldLinkInfo1 = (*od)->getLinkInfo1();

	// link info generation requires an IP for internal/external computation
	// UNLESS we are from getSpiderStatusDocMetaList2()
	// if ip does not exist, make it 0
	if ( *ip == 0 || *ip == -1 ) {
		m_linkInfo1Valid = true;
		memset ( &s_dummy2 , 0 , sizeof(LinkInfo) );
		s_dummy2.m_lisize = sizeof(LinkInfo);
		ptr_linkInfo1  = &s_dummy2;
		size_linkInfo1 = sizeof(LinkInfo);
		return ptr_linkInfo1;
	}

	// . error getting linkers?
	// . on udp timeout we were coring below because msg25.m_linkInfo
	//   was NULL
	if ( g_errno && m_calledMsg25 ) return NULL;

	// . now search for some link info for this url/doc
	// . this queries the search engine to get linking docIds along
	//   with their termIds/scores from anchor text and then compiles
	//   it all into one IndexList
	// . if we have no linkers to this url then we set siteHash, etc.
	//   for this linkInfo class
	// . this is my google algorithm
	// . let's use the first url (before redirects) for this
	// . m_newDocId is used for classifying doc under predefined news topic
	// . catSiteRec is used for classifying pages under a predefined
	//   newstopic. this is currently for news search only.
	// . use the rootTitleRecPtr if there and we are doing our link info
	//   stuff in this collection, but if doing it in another collection
	//   the msg25 will look up the root in that collection...
	if ( ! m_calledMsg25 ) {
		// get this
		int32_t lastUpdateTime = getTimeGlobal();

		// do not redo it
		m_calledMsg25 = true;
		// shortcut
		//Msg25 *m = &m_msg25;
		// can we be cancelled?
		bool canBeCancelled = true;
		// not if pageparser though
		if ( m_pbuf ) canBeCancelled = false;
		// not if injecting
		if ( ! m_sreqValid ) canBeCancelled = false;
		// use this buffer so XmlDoc::print() can display wherever
		SafeBuf *sb = NULL;
		if ( m_pbuf ) sb = &m_pageLinkBuf;
		// only do this for showing them!!!
		if ( m_usePageLinkBuf ) sb = &m_pageLinkBuf;
		// get from spider request if there
		//bool injected = false;
		//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
		// we do not want to waste time computing the page title
		// of bad inlinks if we only want the good inlinks, because
		// as of oct 25, 2012 we only store the "good" inlinks
		// in the titlerec
		bool onlyNeedGoodInlinks = true;
		// so if steve wants to display all links then set this
		// to false so we get titles of bad inlinks
		if ( m_usePageLinkBuf ) onlyNeedGoodInlinks = false;
		// seems like pageparser.cpp just sets m_pbuf and not
		// m_usePageLinkBuf any more
		if ( m_pbuf           ) onlyNeedGoodInlinks = false;
		// status update
		setStatus ( "calling msg25 for url" );
		CollectionRec *cr = getCollRec();
		if ( ! cr ) return NULL;

		// we want to get all inlinks if doing a custom crawlbot crawl
		// because we need the anchor text to pass in to diffbot
		bool doLinkSpamCheck = cr->m_doLinkSpamCheck;
		bool oneVotePerIpDom = cr->m_oneVotePerIpDom;

		// call it. this is defined in Linkdb.cpp
		char *url = getFirstUrl()->getUrl();
		if ( ! getLinkInfo ( &m_tmpBuf12,
				     &m_mcast12,
				        mysite ,
					url ,
					false , // isSiteLinkInfo?
					*ip                 ,
					*d                  ,
					cr->m_collnum       , //linkInfoColl
					m_masterState       ,
					m_masterLoop        ,
					m_contentInjected ,//m_injectedReply ,
					sb                  ,
					m_printInXml        ,
					*sni                ,
					oldLinkInfo1        ,
					m_niceness          ,
					doLinkSpamCheck ,
					oneVotePerIpDom ,
					canBeCancelled        ,
					lastUpdateTime ,
					onlyNeedGoodInlinks ,
					false, // getlinkertitles
					0, // ourhosthash32 (special)
					0,  // ourdomhash32 (special)
					&m_myPageLinkInfoBuf
					) )
			// blocked
			return (LinkInfo *)-1;
		// error?
		if ( g_errno ) return NULL;
		// panic! what the fuck? why did it return true and then
		// call our callback???
		log(LOG_ERROR, "build: xmldoc call to msg25 did not block");
		// must now block since it uses multicast now to
		// send the request onto the network
		gbshutdownLogicError();
	}

	// at this point assume its valid
	m_linkInfo1Valid = true;
	// . get the link info we got set
	// . this ptr references into m_myPageLinkInfoBuf safebuf
	//ptr_linkInfo1  = m_msg25.m_linkInfo;
	//size_linkInfo1 = m_msg25.m_linkInfo->getSize();
	ptr_linkInfo1  = (LinkInfo *)m_myPageLinkInfoBuf.getBufStart();
	size_linkInfo1 = m_myPageLinkInfoBuf.length();

	// we should free it
	m_freeLinkInfo1 = true;

	// this can not be NULL!
	if ( ! ptr_linkInfo1 || size_linkInfo1 <= 0 ) {
		log(LOG_ERROR, "build: error getting linkinfo1: %s",mstrerror(g_errno));
		gbshutdownLogicError();
	}

	// validate linkinfo
	if (ptr_linkInfo1->m_version != 0 ||
		ptr_linkInfo1->m_lisize < 0 || ptr_linkInfo1->m_lisize != size_linkInfo1 ||
		ptr_linkInfo1->m_numStoredInlinks < 0 || ptr_linkInfo1->m_numGoodInlinks < 0) {
		gbshutdownCorrupted();
	}

	// set flag
	m_linkInfo1Valid = true;
	// . validate the hop count thing too
	// . i took hopcount out of linkdb to put in lower ip byte for steve
	//m_minInlinkerHopCount = -1;//m_msg25.getMinInlinkerHopCount();
	// return it
	return ptr_linkInfo1;
}

static void gotSiteWrapper ( void *state ) ;

// . we should store the site in the title rec because site getter might
//   change what it thinks the site is!
char *XmlDoc::getSite ( ) {
	// was there a problem getting site?
	if ( m_siteValid && m_siteGetter.getErrno() ) {
		g_errno = m_siteGetter.getErrno();
		return NULL;
	}
	// ok, return it
	if ( m_siteValid ) {
		return ptr_site;
	}

	// note it
	setStatus ( "getting site");
	// need this
	TagRec *gr = getTagRec();
	// sanity check
	if ( ! gr && ! g_errno ) { g_process.shutdownAbort(true); }
	// blocked or error?
	if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr;
	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;
	// get url
	Url *f = getFirstUrl();
	// bogus first url? prevent core in getIsSiteRoot().
	if ( f->getUrlLen() <= 1 ) {
		log("xmldoc: getSite: got bogus first url.");
		g_errno = EBADURL;
		return NULL;
	}

	int32_t timestamp = getSpideredTime();

	// do it
	if ( ! m_siteGetter.getSite ( f->getUrl(), gr, timestamp, cr->m_collnum, m_niceness, this, gotSiteWrapper )) {
		// return -1 if we blocked
		return (char *) -1;
	}

	// error?
	if ( g_errno ) {
		return NULL;
	}

	// set these then
	gotSite();
	return ptr_site;
}

// set it
void gotSiteWrapper ( void *state ) {
	// point to us
	XmlDoc *THIS = (XmlDoc *)state;
	THIS->gotSite ();
	// resume. this checks g_errno for being set.
	THIS->m_masterLoop ( THIS->m_masterState );
}

void XmlDoc::gotSite ( ) {
	// sanity check
	if ( ! m_siteGetter.allDone() && ! g_errno ) { g_process.shutdownAbort(true); }

	// this sets g_errno on error
	ptr_site    = const_cast<char*>(m_siteGetter.getSite());
	size_site   = m_siteGetter.getSiteLen()+1; // include \0

	// sanity check -- must have a site
	if ( ! g_errno && size_site <= 1 ) { g_process.shutdownAbort(true); }

	// BR 20151215: Part of fix that avoids defaulting to http:// when getting
	// robots.txt and the root document of an https:// site.
	ptr_scheme    = const_cast<char*>(m_siteGetter.getScheme());
	size_scheme   = m_siteGetter.getSchemeLen()+1; // include \0

	// sitegetter.m_errno might be set!
	m_siteValid = true;

	// must be valid
	if ( ! m_tagRecValid ) { g_process.shutdownAbort(true); }
}


int32_t *XmlDoc::getSiteHash32 ( ) {
	if ( m_siteHash32Valid ) return &m_siteHash32;
	char *site = getSite();
	if ( ! site || site == (void *)-1) return (int32_t *)site;
	m_siteHash32 = hash32 ( site , strlen(site) );
	m_siteHash32Valid = true;
	return &m_siteHash32;
}


const char *XmlDoc::getScheme ( ) {
	// was there a problem getting site?
	if ( m_siteValid && m_siteGetter.getErrno() ) {
		g_errno = m_siteGetter.getErrno();
		return NULL;
	}
	// ok, return it
	if ( m_siteValid ) return ptr_scheme;//m_siteGetter.m_scheme;

	return "";
}



char **XmlDoc::getHttpReply ( ) {
	// both must be valid now
	if ( m_redirUrlValid && m_httpReplyValid ) {
		// might have been a download error of ECORRUPTDATA
		if ( m_downloadStatus == ECORRUPTDATA ) {
			// set g_errno so caller knows
			g_errno = m_downloadStatus;
			// null means error
			return NULL;
		}
		// otherwise, assume reply is valid
		return &m_httpReply;
	}

	setStatus("getting http reply");

	// come back up here if a redirect invalidates it
	for ( ; ; ) {
		// get the http reply
		char **replyPtr = getHttpReply2();
		if ( ! replyPtr || replyPtr == (void *)-1 ) return (char **)replyPtr;

		// . now if the reply was a redirect we should set m_redirUrl to it
		//   and re-do all this code
		// . this often sets m_indexCode to stuff like ESIMPLIFIEDREDIR, etc.
		Url **redirp = getRedirUrl();

		// we often look up the associated linkInfo on the original url to
		// see if it is worth keeping and indexing just to take advantage of
		// the incoming link text it has, so we may block on that!
		// but in the case of a contactDoc, getContactDoc() sets these things
		// to NULL to avoid unnecessary lookups.
		if ( ! redirp || redirp == (void *)-1 ) return (char **)redirp;

		// sanity check
		if ( *redirp && ! m_redirUrlValid ) { g_process.shutdownAbort(true); }

		// if NULL, we are done
		if ( ! *redirp ) return &m_httpReply;

		// . also, hang it up if we got a simplified redir url now
		// . we set m_redirUrl so that getLinks() can add a spiderRequest
		//   for it, but we do not want to actually redirect to it to get
		//   the content for THIS document
		if ( m_redirError ) return &m_httpReply;

		// and invalidate the redir url because we do not know if the
		// current url will redirect or not (mdwmdw)
		m_redirUrlValid           = false;
		m_metaRedirUrlValid       = false;

		// free it
		mfree ( m_httpReply , m_httpReplyAllocSize, "freehr" );

		// always nullify if we free so we do not re-use freed mem
		m_httpReply = NULL;

		// otherwise, we had a redirect, so invalidate what we had set
		m_httpReplyValid          = false;

		m_isContentTruncatedValid = false;

		// do not redo robots.txt lookup if the redir url just changed from
		// http to https or vice versa
		Url *ru = *redirp;
		Url *cu = getCurrentUrl();
		if ( ! cu || cu == (void *)-1) return (char **)cu;
		if ( strcmp ( ru->getUrl() + ru->getSchemeLen(), cu->getUrl() + cu->getSchemeLen() ) != 0 ) {
			// redo robots.txt lookup. might be cached.
			m_isAllowedValid  = false;
			m_crawlDelayValid = false;
		}
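		// hypothetical example: a redirect from "http://example.com/a"
		// to "https://example.com/a" compares the identical suffixes
		// "://example.com/a", so the cached robots.txt verdict is kept;
		// any other host or path change forces a fresh robots.txt check.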

		// keep the same ip if hostname is unchanged
		if ( ru->getHostLen() != cu->getHostLen() || strncmp(ru->getHost(), cu->getHost(), cu->getHostLen()) != 0 ) {
			// ip is supposed to be that of the current url, which changed
			m_ipValid = false;
		}

		// we set our m_xml to the http reply to check for meta redirects
		// in the html sometimes in getRedirUrl() so since we are redirecting,
		// invalidate that xml
		m_xmlValid                = false;
		m_wordsValid              = false;
		m_rawUtf8ContentValid     = false;
		m_expandedUtf8ContentValid= false;
		m_utf8ContentValid        = false;
		m_filteredContentValid    = false;
		m_contentValid            = false;
		m_mimeValid               = false;

		// update our current url now to be the redirected url
		m_currentUrl.set ( *redirp );
		m_currentUrlValid = true;
	}
}

static void gotHttpReplyWrapper ( void *state ) {
	// point to us
	XmlDoc *THIS = (XmlDoc *)state;
	// this sets g_errno on error
	THIS->gotHttpReply ( );
	// resume. this checks g_errno for being set.
	THIS->m_masterLoop ( THIS->m_masterState );
}



// "NULL" can be a valid http reply (empty page) so we need to use "char **"
char **XmlDoc::getHttpReply2 ( ) {

	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	if ( m_httpReplyValid )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, already has valid reply" );
		return &m_httpReply;
	}

	setStatus("getting http reply2");


	// if recycle is set then NEVER download if doing query reindex
	// but if doing an injection then i guess we can download.
	// do not even do ip lookup if no old titlerec, which is how we
	// ended up here...
	if ( m_recycleContent && m_sreqValid && m_sreq.m_isPageReindex ) {
		g_errno = ENOTITLEREC;
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. ENOTITLEREC (1)" );
		return NULL;
	}

	// get ip
	int32_t *ip = getIp();
	if ( ! ip || ip == (int32_t *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. no IP" );
		return (char **)ip;
	}

	// reset
	m_httpReplySize = 0;
	m_httpReply     = NULL;

	// if ip is bogus, we are done
	if ( *ip == 0 || *ip == -1 ) {
		log("xmldoc: ip is bogus 0 or -1 for %s. skipping download",
		    m_firstUrl.getUrl());
		m_httpReplyValid          = true;
		m_isContentTruncated      = false;
		m_isContentTruncatedValid = true;
		// need this now too. but don't hurt a nonzero val if we have
		if ( ! m_downloadEndTimeValid ) {
			m_downloadEndTime      = 0;
			m_downloadEndTimeValid = true;
		}

		logTrace( g_conf.m_logTraceXmlDoc, "END, return empty reply, IP is bogus" );
		return &m_httpReply;
		//return gotHttpReply ( );
	}

	// get this. should operate on current url (i.e. redir url if there)
	bool *isAllowed = getIsAllowed();

	// error or blocked
	if ( ! isAllowed || isAllowed == (void *)-1)
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return, not allowed." );
		return (char **)isAllowed;
	}

	// this must be valid, since we share m_msg13 with it
	if ( ! m_isAllowedValid ) { g_process.shutdownAbort(true); }

	int32_t *cd = getFinalCrawlDelay();
	if ( ! cd || cd == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. could not get crawl delay" );
		return (char **)cd;
	}

	// we might bail
	if ( ! *isAllowed ) {
		m_httpReplyValid          = true;
		m_isContentTruncated      = false;
		m_isContentTruncatedValid = true;
		// need this now too. but don't clobber a nonzero value if we already have one
		if ( ! m_downloadEndTimeValid ) {
			m_downloadEndTime      = 0;
			m_downloadEndTimeValid = true;
		}
		m_downloadStatusValid = true;
		// forbidden? assume we downloaded it and it was empty
		m_downloadStatus = 0; // EDOCDISALLOWED;//403;

		logTrace( g_conf.m_logTraceXmlDoc, "END, return empty reply, download not allowed" );
		return &m_httpReply;
		//return gotHttpReply ( );
	}

	// are we site root page?
	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return, error calling getIsSiteRoot" );
		return (char **)isRoot;
	}

	XmlDoc *od = NULL;
	if ( ! m_isSpiderProxy &&
		 // don't lookup xyz.com/robots.txt in titledb
		 ! isFirstUrlRobotsTxt() ) {
		    XmlDoc **pod = getOldXmlDoc ( );
		    if ( ! pod || pod == (XmlDoc **)-1 ) {
		    	logTrace( g_conf.m_logTraceXmlDoc, "END, return, error calling getOldXmlDoc" );
		    	return (char **)pod;
		    }
		    // get ptr to old xml doc, could be NULL if none exists
		    od = *pod;
	}



	// sanity check
	if ( od && m_recycleContent ) {g_process.shutdownAbort(true); }

	// validate m_firstIpValid
	int32_t *pfip = getFirstIp();
	if ( ! pfip || pfip == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return, error calling getFirstIp" );
		return (char **)pfip;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. getCollRec returned false" );
		return NULL;
	}

	// if we didn't block getting the lock, keep going
	setStatus ( "getting web page" );

	// sanity check
	if ( ! m_masterLoop ) { g_process.shutdownAbort(true); }

	// shortcut. this will return the redirUrl if it is non-empty.
	Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return, getCurrentUrl returned false" );
		return (char **)cu;
	}

	// set parms
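	// . the Msg13Request carries everything the download needs: the
	//   current url, per-collection doc length caps, cache age, crawl
	//   delay, the previous content hash (so the proxy can reply with
	//   EDOCUNCHANGED for unchanged pages), compression proxy flags and
	//   whether to skip the hammer-ip check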
	Msg13Request *r = &m_msg13Request;
	// clear it first
	r->reset();
	// and set the url
	r->ptr_url  = cu->getUrl();
	r->size_url = cu->getUrlLen()+1;

	// sanity check
	if ( ! m_firstIpValid ) { g_process.shutdownAbort(true); }

	// max to download in bytes.
	r->m_maxTextDocLen          = cr->m_maxTextDocLen;
	r->m_maxOtherDocLen         = cr->m_maxOtherDocLen;

	// but if url is on the intranet/internal nets
	if ( m_ipValid && is_internal_net_ip(m_ip) ) {
		// . if local then make web page download max size unlimited
		// . this is for adding the gbdmoz.urls.txt.* files to
		//   populate dmoz. those files are about 25MB each.
		r->m_maxTextDocLen  = -1;
		r->m_maxOtherDocLen = -1;
	}
	// m_maxCacheAge is set when getting contact or root docs in
	// getContactDoc() and getRootDoc(). it mainly applies to titleRecs
	// in titledb, but Msg13 also honors it for its own page cache,
	// including robots.txt files
	r->m_maxCacheAge            = m_maxCacheAge;
	r->m_urlIp                  = *ip;
	r->m_firstIp                = m_firstIp;
	r->m_urlHash48              = getFirstUrlHash48();
	r->m_spideredTime           = getSpideredTime();//m_spideredTime;
	r->m_ifModifiedSince        = 0;
	r->m_skipHammerCheck        = 0;

	if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) {
		r->ptr_cookie  = m_redirCookieBuf.getBufStart();
		r->size_cookie = m_redirCookieBuf.length() + 1;
		// . only do once per redirect
		// . do not invalidate because we might have to carry it
		//   through to the next redir... unless we change domain
		// . this fixes the nyt.com/nytimes.com bug some more
		//m_redirCookieBufValid = false;
	}

	// . this is -1 if unknown. none found in robots.txt or provided
	//   in the custom crawl parms.
	// . it should also be 0 for the robots.txt file itself
	r->m_crawlDelayMS = *cd;

	// let's time our crawl delay from the initiation of the download
	// not from the end of the download. this will make things a little
	// faster but could slam servers more.
	r->m_crawlDelayFromEnd = false;

	// new stuff
	r->m_contentHash32 = 0;
	// if valid in SpiderRequest, use it. if spider compression proxy
	// sees the content is unchanged it will not send it back! it will
	// send back g_errno = EDOCUNCHANGED or something
	if ( m_sreqValid )
		r->m_contentHash32 = m_sreq.m_contentHash32;

	// if we have the old doc already set use that
	if ( od )
		r->m_contentHash32 = od->m_contentHash32;

	// for beta testing, make it a collection specific parm for diffbot
	// so we can turn on manually
	if ( cr->m_forceUseFloaters )
		r->m_forceUseFloaters = true;


	// turn this off too
	r->m_attemptedIframeExpansion = false;

	r->m_collnum = (collnum_t)-1;
	if ( m_collnumValid )r->m_collnum = m_collnum;

	// turn off
	r->m_useCompressionProxy = false;
	r->m_compressReply       = false;

	// set it for this too
	if ( g_conf.m_useCompressionProxy ) {
		r->m_useCompressionProxy = true;
		r->m_compressReply       = true;
	}

	logTrace( g_conf.m_logTraceXmlDoc, "cu->m_url [%s]", cu->getUrl());
	logTrace( g_conf.m_logTraceXmlDoc, "m_firstUrl.m_url [%s]", m_firstUrl.getUrl());

	// if current url IS NOT EQUAL to first url then set redir flag
	if ( strcmp(cu->getUrl(),m_firstUrl.getUrl()) != 0 )
		r->m_skipHammerCheck = 1;
	// or if this an m_extraDoc or m_rootDoc for another url then
	// do not bother printing the hammer ip msg in msg13.cpp either
	if ( m_isChildDoc )
		r->m_skipHammerCheck = 1;

	if ( m_contentInjected ) // oldsrValid && m_sreq.m_isInjecting )
		r->m_skipHammerCheck = 1;

	if ( r->m_skipHammerCheck )
		log(LOG_DEBUG,"build: skipping hammer check");

	// if we had already spidered it... try to save bandwidth and time
	if ( od ) {
		// sanity check
		if ( ! od->m_spideredTimeValid ) { g_process.shutdownAbort(true); }
		// only get it if modified since last spider time
		r->m_ifModifiedSince = od->m_spideredTime;
	}

	// if doing frame expansion on a doc we just downloaded as the
	// spider proxy, we are asking ourselves now to download the url
	// from an <iframe src=...> tag. so definitely use msg13 again
	// so it can use the robots.txt cache, and regular html page cache.
	if ( m_isSpiderProxy ) {
		r->m_useCompressionProxy = false;
		r->m_compressReply       = false;
		r->m_skipHammerCheck = 1;
		// no frames within frames
		r->m_attemptedIframeExpansion = 1;
		log(LOG_DEBUG,"build: skipping hammer check 2");

	}

	// . use msg13 to download the file, robots.txt
	// . msg13 will ensure only one download of that url w/ locks
	// . msg13 can compress the http reply before
	//   sending it back to us via udp (compression proxy)
	// . msg13 does not use XmlDoc::getHttpReply() to handle redirects;
	//   redirect handling stays in this class

	// sanity check. keep injections fast. no downloading!
	if ( m_wasContentInjected ) {
		log("xmldoc: url injection failed! error!");
		g_process.shutdownAbort(true);
	}

	// sanity check
	if ( m_deleteFromIndex ) {
		log("xmldoc: trying to download page to delete");
		g_process.shutdownAbort(true);
	}

	m_downloadStartTimeValid = true;
	m_downloadStartTime = gettimeofdayInMilliseconds();

	logTrace( g_conf.m_logTraceXmlDoc, "Calling msg13.getDoc" );

	if ( ! m_msg13.getDoc ( r,this , gotHttpReplyWrapper ) )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return -1. msg13.getDoc blocked" );
		// return -1 if blocked
		return (char **)-1;
	}

	logTrace( g_conf.m_logTraceXmlDoc, "END, calling gotHttpReply and returning result" );
	return gotHttpReply ( );
}

// . returns NULL with g_errno set on error, otherwise a pointer to the
//   (possibly blanked-out) http reply
char **XmlDoc::gotHttpReply ( ) {

	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	// save it
	int32_t saved = g_errno;
	// note it
	setStatus ( "got web page" );

	// sanity check. are we already valid?
	if ( m_httpReply && m_httpReplyValid ) { g_process.shutdownAbort(true); }

	// do not re-call
	m_httpReplyValid = true;

	// assume none
	m_httpReply = NULL;

	// . get the HTTP reply
	// . TODO: free it on reset/destruction, we own it now
	// . this is now NULL terminated thanks to changes in
	//   Msg13.cpp, but watch the buf size, need to subtract 1
	// . therefore, we can set the Xml class with it
	m_httpReply     = m_msg13.m_replyBuf;
	m_httpReplySize = m_msg13.m_replyBufSize;
	// how much to free?
	m_httpReplyAllocSize = m_msg13.m_replyBufAllocSize;

	// sanity check
	if ( m_httpReplySize > 0 && ! m_httpReply ) { g_process.shutdownAbort(true); }

	// . don't let UdpServer free m_buf when socket is
	//   recycled/closed
	// . we own it now and are responsible for freeing it
	m_msg13.m_replyBuf = NULL;
	m_msg13.m_replyBufSize = 0;
	m_msg13.m_replyBufAllocSize = 0;

	// relabel mem so we know where it came from
	relabel( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );

	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. Could not get collection" );
		return NULL;
	}

	// . sanity test
	// . i.e. what are you doing downloading the page if there was
	//   a problem with the page we already know about
	if ( m_indexCode && m_indexCodeValid ) {
		g_process.shutdownAbort(true);
	}

	// fix this
	if ( saved == EDOCUNCHANGED ) {
		logTrace( g_conf.m_logTraceXmlDoc, "EDOCUNCHANGED" );
		// assign content from it since unchanged
		m_recycleContent = true;
		// clear the error
		saved = 0;
		g_errno = 0;
	}

	// . save the error in download status
	// . could now be EDOCUNCHANGED or EDOCNOGOODDATE (w/ tod)
	m_downloadStatus = saved; // g_errno;
	// validate
	m_downloadStatusValid = true;

	// update m_downloadEndTime if we should, used for sameIpWait
	m_downloadEndTime      = gettimeofdayInMilliseconds();
	m_downloadEndTimeValid = true;

	// make it so
	g_errno = saved;

	// this means the spider compression proxy's reply got corrupted
	// over roadrunner's crappy wireless internet connection
	if ( saved == ECORRUPTDATA )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. ECORRUPTDATA" );
		return NULL;
	}

	// this one happens too! for the same reason...
	if ( saved == EBADREPLYSIZE )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. EBADREPLYSIZE" );
		return NULL;
	}

	// might as well check this too while we're at it
	if ( saved == ENOMEM )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, return NULL. ENOMEM" );
		return NULL;
	}

	// sanity check -- check after bailing on corruption because
	// corrupted replies do not end in NULLs
	if ( m_httpReplySize > 0 && m_httpReply[m_httpReplySize-1] ) {
		log("http: httpReplySize=%" PRId32" http reply does not end in \\0 "
		    "for %s in collnum=%" PRId32". blanking out reply."
		    ,m_httpReplySize
		    ,m_firstUrl.getUrl()
		    ,(int32_t)m_collnum
		    );
		// free it i guess
		mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
		// and reset it
		m_httpReplySize      = 0;
		m_httpReply          = NULL;
		m_httpReplyAllocSize = 0;
		// call it data corruption i guess for now
		g_errno = ECORRUPTDATA;
		//g_process.shutdownAbort(true);
		logTrace( g_conf.m_logTraceXmlDoc, "Clearing data, detected corruption" );
	}

	// if its a bad gzip reply, a compressed http reply, then
	// make the whole thing empty? some websites return compressed replies
	// even though we do not ask for them. and then the compression
	// is corrupt.
	if ( saved == ECORRUPTHTTPGZIP ||
	     // if somehow we got a page too big for MAX_DGRAMS... treat
	     // it like an empty page...
	     saved == EMSGTOOBIG ) {

	     logTrace( g_conf.m_logTraceXmlDoc, "Clearing data, ECORRUPTHTTPGZIP or EMSGTOOBIG" );
		// free it i guess
		mfree ( m_httpReply, m_httpReplyAllocSize, "XmlDocHR" );
		// and reset it
		m_httpReplySize      = 0;
		m_httpReply          = NULL;
		m_httpReplyAllocSize = 0;
	}

	// clear this i guess
	g_errno = 0;

	logTrace( g_conf.m_logTraceXmlDoc, "END, returning reply." );
	return &m_httpReply;
}



char *XmlDoc::getIsContentTruncated ( ) {
	if ( m_isContentTruncatedValid ) return &m_isContentTruncated2;

	setStatus ( "getting is content truncated" );

	// if recycling content use its download end time
	if ( m_recycleContent ) {
		// get the old xml doc from the old title rec
		XmlDoc **pod = getOldXmlDoc ( );
		if ( ! pod || pod == (void *)-1 ) return (char *)pod;
		// shortcut
		XmlDoc *od = *pod;
		// this is non-NULL if it existed
		if ( od ) {
			m_isContentTruncated  = od->m_isContentTruncated;
			m_isContentTruncated2 = (bool)m_isContentTruncated;
			m_isContentTruncatedValid = true;
			return &m_isContentTruncated2;
		}
	}

	// need a valid reply
	char **replyPtr = getHttpReply ();
	if ( ! replyPtr || replyPtr == (void *)-1 ) return (char *)replyPtr;

	uint8_t *ct = getContentType();
	if ( ! ct || ct == (void *)-1 ) return (char *)ct;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// shortcut - convert size to length
	int32_t LEN = m_httpReplySize - 1;

	m_isContentTruncated  = false;
	// was the content truncated? these checks might label a doc as
	// truncated when it really is not... but we only use this for link
	// spam stuff, so it should not matter too much. it should only
	// happen rarely.
	if ( cr->m_maxTextDocLen >= 0 &&
		LEN >= cr->m_maxTextDocLen-1  &&
		*ct == CT_HTML )
			m_isContentTruncated = true;

	if ( cr->m_maxOtherDocLen >= 0 &&
		LEN >= cr->m_maxOtherDocLen-1 &&
		*ct != CT_HTML )
			m_isContentTruncated = true;

	//if ( LEN > MAXDOCLEN ) m_isContentTruncated = true;
	// set this
	m_isContentTruncated2 = (bool)m_isContentTruncated;
	// validate it
	m_isContentTruncatedValid = true;

	return &m_isContentTruncated2;
}



int32_t *XmlDoc::getDownloadStatus ( ) {
	if ( m_downloadStatusValid ) return &m_downloadStatus;
	// log it
	setStatus ( "getting download status");
	// if recycling content, we're 200!
	if ( m_recycleContent ) {
		m_downloadStatus = 0;
		m_downloadStatusValid = true;
		return &m_downloadStatus;
	}
	// get ip
	int32_t *ip = getIp();
	if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip;
	// . first try ip
	// . this means the dns lookup timed out
	if ( *ip == -1 ) {
		m_downloadStatus = EDNSTIMEDOUT;
		m_downloadStatusValid = true;
		return &m_downloadStatus;
	}
	// this means ip does not exist
	if ( *ip == 0 ) {
		m_downloadStatus = EBADIP;
		m_downloadStatusValid = true;
		return &m_downloadStatus;
	}
	// need a valid reply
	char **reply = getHttpReply ();
	if ( ! reply || reply == (void *)-1 ) return (int32_t *)reply;
	// must be valid now
	if ( ! m_downloadStatusValid ) { g_process.shutdownAbort(true); }
	// return it
	return &m_downloadStatus;
}



int64_t *XmlDoc::getDownloadEndTime ( ) {
	if ( m_downloadEndTimeValid ) return &m_downloadEndTime;
	// log it
	setStatus ( "getting download end time");

	// do not cause us to core in getHttpReply2() because m_deleteFromIndex
	// is set to true...
	if ( m_deleteFromIndex ) {
		m_downloadEndTime = 0;
		m_downloadEndTimeValid = true;
		return &m_downloadEndTime;
	}

	// if recycling content use its download end time
	if ( m_recycleContent ) {
		// get the old xml doc from the old title rec
		XmlDoc **pod = getOldXmlDoc ( );
		if ( ! pod || pod == (void *)-1 ) return (int64_t *)pod;
		// shortcut
		XmlDoc *od = *pod;
		// this is non-NULL if it existed
		if ( od ) {
			m_downloadEndTime      = od->m_downloadEndTime;
			m_downloadEndTimeValid = true;
			return &m_downloadEndTime;
		}
	}

	// need a valid reply
	char **reply = getHttpReply ();
	if ( ! reply || reply == (void *)-1 ) return (int64_t *)reply;
	// must be valid now
	if ( ! m_downloadEndTimeValid ) { g_process.shutdownAbort(true);}
	// return it
	return &m_downloadEndTime;
}



int16_t *XmlDoc::getHttpStatus ( ) {
	// if we got a title rec then return that
	if ( m_httpStatusValid ) return &m_httpStatus;
	// get mime otherwise
	HttpMime *mime = getMime();
	if ( ! mime || mime == (HttpMime *)-1 ) return (int16_t *)mime;
	// get from that
	m_httpStatus = mime->getHttpStatus();
	m_httpStatusValid = true;
	return &m_httpStatus;
}



HttpMime *XmlDoc::getMime () {
	if ( m_mimeValid ) return &m_mime;

	// log debug
	setStatus("getting http mime");

	Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1) return (HttpMime *)cu;

	// injection from SpiderLoop.cpp sets this to true
	if ( m_useFakeMime ) {
	usefake:
		m_mime.set            ( NULL , 0 , cu );
		m_mime.setHttpStatus  ( 200 );
		m_mime.setContentType ( CT_HTML );
		m_mimeValid = true;
		return &m_mime;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// if recycling content, fake this mime
	if ( cr->m_recycleContent || m_recycleContent ) {
		// get the old xml doc from the old title rec
		XmlDoc **pod = getOldXmlDoc ( );
		if ( ! pod || pod == (void *)-1 ) return (HttpMime *)pod;
		// shortcut
		XmlDoc *od = *pod;
		// . this is non-NULL if it existed
		// . fake it for now
		if ( od ) goto usefake;
	}

	// need a valid reply
	char **reply = getHttpReply ();
	if ( ! reply || reply == (void *)-1 ) return (HttpMime *)reply;

	// fake it for now
	m_mime.set ( NULL , 0 , cu );
	m_mime.setHttpStatus ( 200 );
	m_mime.setContentType ( CT_HTML );

	// shortcut
	int32_t LEN = m_httpReplySize - 1;

	// validate it
	m_mimeValid = true;

	// TODO: try again on failures because server may have been overloaded
	// and closed the connection w/o sending anything
	if ( LEN>0 && ! m_mime.set ( m_httpReply , LEN , cu ) ) {
		// set this on mime error
		//m_indexCode = EBADMIME;
		// return a fake thing. content length is 0.
		return &m_mime;
	}

	return &m_mime;
}



// need to use "char **" since content might be NULL itself, if none
char **XmlDoc::getContent ( ) {
	if ( m_contentValid ) return &m_content;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// recycle?
	if ( cr->m_recycleContent || m_recycleContent ) {
		// get the old xml doc from the old title rec
		XmlDoc **pod = getOldXmlDoc ( );
		if ( ! pod || pod == (void *)-1 ) return (char **)pod;
		// shortcut
		XmlDoc *od = *pod;
		// this is non-NULL if it existed
		if ( od ) {
			m_content      = od-> ptr_utf8Content;
			m_contentLen   = od->size_utf8Content - 1;
			m_contentValid = true;
			return &m_content;
		}
		if ( m_recycleContent )
			log("xmldoc: failed to load old title rec "
			    "when recycle content was true and url = "
			    "%s",ptr_firstUrl);
		// if could not find title rec and we are docid-based then
		// we can't go any further!!
		if ( m_setFromDocId ) {
			log("xmldoc: null content for docid-based titlerec "
			    "lookup which was not found");
			m_content = NULL;
			m_contentLen = 0;
			m_contentValid = true;
			return &m_content;
		}
	}

	if ( m_recycleContent ) {
		if ( m_firstUrlValid )
			log("xmldoc: failed to recycle content for %s. could "
			    "not load title rec",m_firstUrl.getUrl());
		else if ( m_docIdValid )
			log("xmldoc: failed to recycle content for %" PRIu64". "
			    "could "
			    "not load title rec",m_docId );
		else
			log("xmldoc: failed to recycle content. "
			    "could not load title rec" );
		// let's let it pass and just download i guess, then
		// we can get page stats for urls not in the index
		//g_errno = EBADENGINEER;
		//return NULL;
	}


	// if we were set from a title rec then we do not have the original
	// content; caller should be calling getUtf8Content() anyway!!
	if ( m_setFromTitleRec ) { g_process.shutdownAbort(true); }

	// get the mime first
	HttpMime *mime = getMime();
	if ( ! mime || mime == (HttpMime *)-1 ) return (char **)mime;

	// http reply must be valid
	if ( ! m_httpReplyValid ) { g_process.shutdownAbort(true); }

	// make it valid
	m_contentValid = true;

	// assume none
	m_content    = NULL;
	m_contentLen = 0;

	// all done if no reply
	if ( ! m_httpReply ) return &m_content;

	// watch out for this!
	if (m_useFakeMime) {
		m_content = m_httpReply;
		m_contentLen = m_httpReplySize;
	} else {
		// set the content, account for mime header
		m_content = m_httpReply + mime->getMimeLen();
		m_contentLen = m_httpReplySize - mime->getMimeLen();
	}

	// m_httpReplySize includes the terminating \0, so back off one to get the true content length
	m_contentLen--;

	// sanity check
	if ( m_contentLen < 0 ) { g_process.shutdownAbort(true); }
	return &m_content;
}

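// . scan roughly the first 100 bytes of the document for a <!DOCTYPE ...>
//   declaration and map it to CT_HTML or CT_XML
// . returns 0 if no recognizable doctype is found, in which case the
//   caller keeps the content type from the http mime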
static char getContentTypeFromContent(const char *p) {
	char ctype = 0;
	// max
	const char *pmax = p + 100;
	// check that out
	for ( ; p && *p && p < pmax ; p++ ) {
		if ( p[0] != '<' ) continue;
		if ( p[1] != '!' ) continue;
		if ( to_lower_a(p[2]) != 'd' ) continue;
		if ( strncasecmp(p,"<!doctype ",10) != 0 ) continue;
		const char *dt = p + 10;
		// skip spaces
		for ( ; *dt ; dt++ ) {
			if ( ! is_wspace_a ( *dt ) ) break;
		}
		// point to that
		if ( ! strncasecmp(dt,"html"     ,4) ) ctype = CT_HTML;
		if ( ! strncasecmp(dt,"xml"      ,3) ) ctype = CT_XML;
		if ( ! strncasecmp(dt,"text/html",9) ) ctype = CT_HTML;
		if ( ! strncasecmp(dt,"text/xml" ,8) ) ctype = CT_XML;
		break;
	}
	return ctype;
}

uint8_t *XmlDoc::getContentType ( ) {
	if ( m_contentTypeValid ) return &m_contentType;
	// log debug
	setStatus("getting content type");
	// get the mime first
	HttpMime *mime = getMime();
	if ( ! mime || mime == (HttpMime *)-1 ) return (uint8_t *)mime;
	// then get mime
	m_contentType = mime->getContentType();
	// but if they specify <!DOCTYPE html> in the document that overrides
	// the content type in the mime! fixes planet.mozilla.org
	char **pp = getContent();
	if ( ! pp || pp == (void *)-1 ) return (uint8_t *)pp;
	char *p = *pp;
	// scan content for content type. returns 0 if none found.
	char ctype2 = getContentTypeFromContent ( p );
	// valid?
	if ( ctype2 != 0 ) m_contentType = ctype2;
	// it is valid now
	m_contentTypeValid = true;
	// give it to them
	return &m_contentType;
}



// . similar to getMetaRedirUrl but look for different strings
// . rel="canonical" or rel=canonical in a link tag.
Url **XmlDoc::getCanonicalRedirUrl ( ) {
	logTrace(g_conf.m_logTraceXmlDoc, "BEGIN");

	// return if we got it
	if (m_canonicalRedirUrlValid) {
		logTrace(g_conf.m_logTraceXmlDoc, "END. Already valid");
		return &m_canonicalRedirUrlPtr;
	}

	// assume none in doc
	m_canonicalRedirUrlPtr = NULL;

	CollectionRec *cr = getCollRec();
	if (!cr) {
		logTrace(g_conf.m_logTraceXmlDoc, "END. CollectionRec is null, returning NULL");
		return NULL;
	}

	if (!cr->m_useCanonicalRedirects) {
		logTrace(g_conf.m_logTraceXmlDoc, "END. Canonical redirects is disabled. No canonical redirection");
		m_canonicalRedirUrlValid = true;
		return &m_canonicalRedirUrlPtr;
	}

	// are we site root page? don't follow canonical url then.
	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 ) {
		logTrace(g_conf.m_logTraceXmlDoc, "END. Unable to check if site is root");
		return (Url **)isRoot;
	}
	if ( *isRoot ) {
		logTrace(g_conf.m_logTraceXmlDoc, "END. Site is root. No canonical redirection");
		m_canonicalRedirUrlValid = true;
		return &m_canonicalRedirUrlPtr;
	}

	uint8_t *ct = getContentType();
	if ( ! ct ) {
		logTrace(g_conf.m_logTraceXmlDoc, "END. content type is null, returning NULL");
		return NULL;
	}

	// these canonical links only supported in xml/html i think
	if ( *ct != CT_HTML && *ct != CT_XML ) {
		logTrace(g_conf.m_logTraceXmlDoc, "END. Content type not HTML/XML. No canonical redirection");
		m_canonicalRedirUrlValid = true;
		return &m_canonicalRedirUrlPtr;
	}

	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) {
		logTrace(g_conf.m_logTraceXmlDoc, "END. Unable to get xml");
		return (Url **)xml;
	}

	// scan nodes looking for a <link> node. like getBaseUrl()
	for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) {
		// skip anything that is not a <link> tag
		if ( xml->getNodeId ( i ) != TAG_LINK ) {
			continue;
		}

		// get the href field of this base tag
		int32_t linkLen;
		char *link = xml->getString ( i, "href", &linkLen );
		// skip if not valid
		if ( ! link || linkLen == 0 ) {
			continue;
		}

		// must also have rel=canonical
		int32_t relLen;
		char *rel = xml->getString(i,"rel",&relLen);
		if ( ! rel ) continue;
		// skip if does not match "canonical"
		if ( strncasecmp(rel,"canonical",relLen) != 0 ) {
			continue;
		}

		// allow for relative urls
		Url *cu = getCurrentUrl();
		// set base to it
		m_canonicalRedirUrl.set( cu, link, linkLen );

		// Detect invalid canonical URLs like <link rel="canonical" href="https://://jobs.dart.biz/search/" />
		// The Url class really should have a "isValid" function...
		if( m_canonicalRedirUrl.getTLDLen() == 0 || m_canonicalRedirUrl.getDomainLen() == 0 ) {
			log(LOG_DEBUG, "Invalid canonical URL ignored [%.*s]", linkLen, link);
			continue;
		}

		// assume it is not our url
		bool isMe = false;
		// if it is us, then skip!
		if(strcmp(m_canonicalRedirUrl.getUrl(),m_firstUrl.getUrl())==0)
			isMe = true;
		// might also be our redir url i guess
		if(strcmp(m_canonicalRedirUrl.getUrl(),m_redirUrl.getUrl())==0)
			isMe = true;
		// if it is us, keep it NULL, it's not a redirect. we are
		// the canonical url.
		if ( isMe ) break;

		// ignore if in an expanded iframe (<gbframe>) tag
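		// . <gbframe> wraps html pulled in from an <iframe src=...> during
		//   expansion, so a canonical link inside it belongs to the framed
		//   page, not to this document
		// . scanning backwards from the link: hitting an opening <gbframe>
		//   first means we are inside an expanded frame, so skip the link;
		//   hitting a closing </gbframe> first means we are outside one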
		char *pstart = xml->getContent();
		char *p      = link;
		// scan backwards
		if ( ! m_didExpansion ) p = pstart;
		bool skip = false;
		for ( ; p > pstart ; p-- ) {
			if ( p[0] != '<' )
				continue;
			if ( p[1] == '/' &&
			     p[2] == 'g' &&
			     p[3] == 'b' &&
			     p[4] == 'f' &&
			     p[5] == 'r' &&
			     p[6] == 'a' &&
			     p[7] == 'm' &&
			     p[8] == 'e' &&
			     p[9] == '>' )
				break;
			if ( p[1] == 'g' &&
			     p[2] == 'b' &&
			     p[3] == 'f' &&
			     p[4] == 'r' &&
			     p[5] == 'a' &&
			     p[6] == 'm' &&
			     p[7] == 'e' &&
			     p[8] == '>' ) {
				skip = true;
				break;
			}
		}
		if ( skip ) continue;

		// otherwise, it is not us, we are NOT the canonical url
		// and we should not be indexed, but just add the canonical
		// url as a spiderrequest into spiderdb, just like
		// simplified meta redirect does.
		m_canonicalRedirUrlPtr = &m_canonicalRedirUrl;
		logTrace(g_conf.m_logTraceXmlDoc, "Got canonical url");
		break;
	}

	logTrace(g_conf.m_logTraceXmlDoc, "END. Returning canonical url[%s]", m_canonicalRedirUrlPtr ? m_canonicalRedirUrlPtr->getUrl() : NULL);
	m_canonicalRedirUrlValid = true;
	return &m_canonicalRedirUrlPtr;
}



// returns false if none found
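// . a hypothetical example of the content attribute this parses:
//     <meta http-equiv="refresh" content="0; URL=http://example.com/new/">
//   with "p" pointing just past content= at the delay digits
// . the delay must be under 10 seconds, the url may be wrapped in single
//   or double quotes, and a url starting with '?' replaces only our cgi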
static bool setMetaRedirUrlFromTag(char *p, Url *metaRedirUrl, Url *cu) {
	// limit scan
	char *limit = p + 30;
	// skip whitespace
	for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
	// must be a num
	if ( ! is_digit(*p) ) return false;
	// init delay
	int32_t delay = atol ( p );
	// ignore long delays
	if ( delay >= 10 ) return false;
	// now find the semicolon, if any
	for ( ; *p && p < limit && *p != ';' ; p++ );
	// must have semicolon
	if ( *p != ';' ) return false;
	// skip it
	p++;
	// skip whitespace some more
	for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
	// must have URL
	if ( strncasecmp(p,"URL",3) != 0 ) return false;
	// skip that
	p += 3;
	// skip white space
	for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
	// then an equal sign
	if ( *p != '=' ) return false;
	// skip equal sign
	p++;
	// then maybe more whitespace
	for ( ; *p && p < limit && is_wspace_a(*p) ; p++ );
	// an optional quote
	if ( *p == '\"' ) p++;
	// can also be a single quote!
	if ( *p == '\'' ) p++;
	// set the url start
	char *url = p;
	// now advance to next quote or space or >
	for ( ; *p && !is_wspace_a(*p) &&
		      *p !='\'' &&
		      *p !='\"' &&
		      *p !='>' ;
	      p++);
	// that is the end
	char *urlEnd = p;
	// get size
	int32_t usize = urlEnd - url;
	// skip if too big
	if ( usize > 1024 ) {
		log("build: meta redirurl of %" PRId32" bytes too big",usize);
		return false;
	}
	// get our current url
	//Url *cu = getCurrentUrl();
	// decode what we got
	char decoded[MAX_URL_LEN];

	// convert &amp; to "&"
	int32_t decBytes = htmlDecode( decoded, url, usize, false );
	decoded[decBytes]='\0';

	// . then the url
	// . set the url to the one in the redirect tag
	// . but if the http-equiv meta redirect url starts with a '?'
	//   then just replace our cgi with that one
	if ( *url == '?' ) {
		char foob[MAX_URL_LEN*2];
		char *pf = foob;
		int32_t cuBytes = cu->getPathEnd() - cu->getUrl();
		gbmemcpy(foob,cu->getUrl(),cuBytes);
		pf += cuBytes;
		gbmemcpy ( pf , decoded , decBytes );
		pf += decBytes;
		*pf = '\0';
		metaRedirUrl->set(foob);
	}
	// . otherwise, append it right on
	// . use "url" as the base Url
	// . it may be the original url or the one we redirected to
	// . redirUrl is set to the original at the top
	else {
		// addWWW = false, stripSessId=true
		metaRedirUrl->set( cu, decoded, decBytes, false, true, false );
	}

	return true;
}



// scan document for <meta http-equiv="refresh" content="0;URL=xxx">
Url **XmlDoc::getMetaRedirUrl ( ) {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	if ( m_metaRedirUrlValid )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, already valid" );
		return &m_metaRedirUrlPtr;
	}


	// get ptr to utf8 content
	if ( ! m_httpReplyValid )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "DIE, reply not valid." );
		g_process.shutdownAbort(true);
	}

	char *p    = m_httpReply;
	// subtract one since this is a size not a length
	char *pend = p + m_httpReplySize - 1;//size_utf8Content;

	// assume no meta refresh url
	m_metaRedirUrlPtr = NULL;
	// make it valid regardless i guess
	m_metaRedirUrlValid = true;

	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getCollRec failed" );
		return NULL;
	}

	// if we are recycling or injecting, do not consider meta redirects
	if ( cr->m_recycleContent || m_recycleContent )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, recycleContent - do not consider meta redirects" );
		return &m_metaRedirUrlPtr;
	}

	Url *cu = getCurrentUrl();

	bool gotOne = false;

	// advance a bit; we match on the '=' and look back for "http-equiv"
	p += 10;
	// begin the string matching loop
	for ( ; p < pend ; p++ ) {
		// fix <!--[if lte IE 6]>
		// <meta http-equiv="refresh" content="0; url=/error-ie6/" />
		if ( *p == '!' &&
			 p[-1]=='<' &&
			 p[1] == '-' &&
			 p[2] == '-' ) {
				// find end of comment
				for ( ; p < pend ; p++ ) {
					if (p[0] == '-' &&
						p[1] == '-' &&
						p[2] == '>' )
							break;
				}
				// if found no end of comment, then stop
				if ( p >= pend )
					break;
				// resume looking for meta redirect tags
				continue;
		}

		// base everything off the equal sign
		if ( *p != '=' ) continue;
		// did we match "http-equiv="?
		if ( to_lower_a(p[-1]) != 'v' ||
		     to_lower_a(p[-2]) != 'i' ||
		     to_lower_a(p[-3]) != 'u' ||
		     to_lower_a(p[-4]) != 'q' ||
		     to_lower_a(p[-5]) != 'e' ||
		                p[-6]  != '-' ||
		     to_lower_a(p[-7]) != 'p' ||
		     to_lower_a(p[-8]) != 't' ||
		     to_lower_a(p[-9]) != 't' ||
		     to_lower_a(p[-10])!= 'h' ) continue;

		// BR 20160306: Fix comparison where we have spaces before and/or after =
		// limit the # of white spaces
		char *limit = p + 20;
		// skip white spaces
		while ( *p && p < limit && is_wspace_a(*p) ) p++;

		// skip the equal sign
		if ( *p != '=' )
		{
			continue;
		}
		p++;

		// limit the # of white spaces
		limit = p + 20;
		// skip white spaces
		while ( *p && p < limit && is_wspace_a(*p) ) p++;

		// skip quote if there
		if ( *p == '\"' || *p == '\'' ) p++;
		// must be "refresh", continue if not
		if ( strncasecmp(p,"refresh",7) != 0 ) continue;
		// skip that
		p += 7;
		// skip another quote if there
		if ( *p == '\"' || *p == '\'' ) p++;

		// limit the # of white spaces
		limit = p + 20;
		// skip white spaces
		while ( *p && p < limit && is_wspace_a(*p) ) p++;

		// must be content now
		if ( strncasecmp(p,"content",7) != 0 ) continue;
		// skip that
		p += 7;


		// BR 20160306: Fix comparison where we have spaces before and/or after =
		// e.g. http://dnr.state.il.us/

		// limit the # of white spaces
		limit = p + 20;
		// skip white spaces
		while ( *p && p < limit && is_wspace_a(*p) ) p++;

		// skip =
		if ( *p != '=' )
		{
			continue;
		}
		p++;

		// limit the # of white spaces
		limit = p + 20;
		// skip white spaces
		while ( *p && p < limit && is_wspace_a(*p) ) p++;


		// skip possible quote
		if ( *p == '\"' || *p == '\'' ) p++;
		// PARSE OUT THE URL
		logTrace( g_conf.m_logTraceXmlDoc, "Possible redirect URL [%s]", p);

		Url dummy;
		if ( ! setMetaRedirUrlFromTag(p, &dummy, cu))
		{
			logTrace( g_conf.m_logTraceXmlDoc, "Failed to set redirect URL" );
			continue;
		}

		gotOne = true;
		break;
	}


	if ( ! gotOne )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, none found" );
		return &m_metaRedirUrlPtr;
	}

	// to fix issue with scripts containing
	// document.write('<meta http-equiv="Refresh" content="0;URL=http://ww
	// we have to get the Xml. we can't call getXml() because of
	// recursion bugs so just do it directly here

	Xml xml;
	// assume html since getContentType() is recursive on us.
	if ( !xml.set( m_httpReply, m_httpReplySize - 1, m_version, CT_HTML ) ) {
		// return NULL on error with g_errno set
		logTrace( g_conf.m_logTraceXmlDoc, "END, xml.set failed" );
		return NULL;
	}

	XmlNode *nodes = xml.getNodes();
	int32_t     n  = xml.getNumNodes();
	// find the first meta summary node
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// continue if not a meta tag
		if ( nodes[i].m_nodeId != TAG_META ) continue;
		// only get content for <meta http-equiv=..>
		int32_t tagLen;
		char *tag ;
		tag = xml.getString ( i , "http-equiv" , &tagLen );
		// skip if empty
		if ( ! tag || tagLen <= 0 ) continue;
		// if not a refresh, skip it
		if ( strncasecmp ( tag , "refresh", 7 ) != 0 ) continue;
		// get the content
		tag = xml.getString ( i ,"content", &tagLen );
		// skip if empty
		if ( ! tag || tagLen <= 0 ) continue;

		logTrace( g_conf.m_logTraceXmlDoc, "Found possible URL in XmlNode" );
		// PARSE OUT THE URL from the content attribute we just extracted
		if (!setMetaRedirUrlFromTag(tag,&m_metaRedirUrl,cu) )
		{
			logTrace( g_conf.m_logTraceXmlDoc, "Failed to set URL from XmlNode data" );
			continue;
		}

		// set it
		m_metaRedirUrlPtr = &m_metaRedirUrl;

		logTrace( g_conf.m_logTraceXmlDoc, "END, got redirect URL from XmlNode data" );
		// return it
		return &m_metaRedirUrlPtr;
	}

	// nothing found
	logTrace( g_conf.m_logTraceXmlDoc, "END, nothing found" );
	return &m_metaRedirUrlPtr;
}



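// . determine the character set of a downloaded document
// . the http Content-Type header is consulted first, then a unicode BOM at
//   the start of the body; if the charset is still unknown (or a claimed
//   utf-8 encoding failed byte-level verification) the body is scanned for
//   a charset= or encoding= attribute inside a <meta> or <?xml>/<xml> tag
// . a few charsets are aliased so iconv understands them, and we default
//   to utf-8 when nothing can be determined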
static uint16_t getCharsetFast(HttpMime *mime,
			       const char *url,
			       const char *s,
			       int32_t slen) {

	int16_t httpHeaderCharset = csUnknown;
	int16_t unicodeBOMCharset = csUnknown;
	int16_t metaCharset = csUnknown;
	bool invalidUtf8Encoding = false;

	int16_t charset = csUnknown;

	if ( slen < 0 ) slen = 0;

	const char *pstart = s;
	const char *pend   = s + slen;

	const char *cs    = mime->getCharset();
	int32_t  cslen = mime->getCharsetLen();
	if ( cslen > 31 ) cslen = 31;
	if ( cs && cslen > 0 ) {
		charset = get_iana_charset ( cs , cslen );
		httpHeaderCharset = charset;
	}

	// look for Unicode BOM first though
	cs = ucDetectBOM ( pstart , pend - pstart );
	if (cs) {
		log(LOG_DEBUG, "build: Unicode BOM signature detected: %s", cs);
		int32_t len = strlen(cs);
		if (len > 31) len = 31;
		unicodeBOMCharset = get_iana_charset(cs, len);

		if (charset == csUnknown) {
			charset = unicodeBOMCharset;
		}
	}

	// prepare to scan doc
	const char *p = pstart;

	// if the doc claims it is utf-8 let's double check because
	// newmexicomusic.org says its utf-8 in the mime header and it says
	// it is another charset in a meta content tag, and it is NOT in
	// utf-8, so don't trust that!
	if ( charset == csUTF8 ) {
		// loop over every char
		for ( const char *s = pstart ; s < pend ; s += getUtf8CharSize(s) ) {
			// sanity check
			if ( ! isFirstUtf8Char ( s ) ) {
				// note it
				log(LOG_DEBUG,
				    "build: mime says UTF8 but does not "
				    "seem to be for url %s",url);
				// reset it back to unknown then
				charset = csUnknown;
				invalidUtf8Encoding = true;
				break;
			}
		}
	}

	// do not scan the doc if we already got it set
	if ( charset != csUnknown ) p = pend;

	//
	// it is inefficient to set xml just to get the charset.
	// so let's put in some quick string matching for this!
	//
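	// hypothetical examples of what this matching is meant to catch:
	//   <meta charset="utf-8">
	//   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
	//   <?xml version="1.0" encoding="UTF-8"?>
	// we key off the '=' following "charset"/"encoding" and then verify
	// that a <meta>, <xml> or <?xml> tag opens within the previous 500
	// bytes with no intervening '>'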

	// advance a bit, we are initially looking for the = sign
	if ( p ) p += 10;
	// begin the string matching loop
	for ( ; p < pend ; p++ ) {
		// base everything off the equal sign
		if ( *p != '=' ) continue;
		// must have a 't' or 'g' before the equal sign
		char c = to_lower_a(p[-1]);
		// did we match "charset="?
		if ( c == 't' ) {
			if ( to_lower_a(p[-2]) != 'e' ||
			     to_lower_a(p[-3]) != 's' ||
			     to_lower_a(p[-4]) != 'r' ||
			     to_lower_a(p[-5]) != 'a' ||
			     to_lower_a(p[-6]) != 'h' ||
			     to_lower_a(p[-7]) != 'c' ) continue;
		}
		// did we match "encoding="?
		else if ( c == 'g' ) {
			if ( to_lower_a(p[-2]) != 'n' ||
			     to_lower_a(p[-3]) != 'i' ||
			     to_lower_a(p[-4]) != 'd' ||
			     to_lower_a(p[-5]) != 'o' ||
			     to_lower_a(p[-6]) != 'c' ||
			     to_lower_a(p[-7]) != 'n' ||
			     to_lower_a(p[-8]) != 'e' ) continue;
		}
		// if not either, go to next char
		else
			continue;
		// . make sure a <xml or a <meta precedes us
		// . do not look back more than 500 chars
		const char *limit = p - 500;
		// assume charset= or encoding= did NOT occur in a tag
		bool inTag = false;
		if ( limit <  pstart ) limit = pstart;
		for ( const char *s = p ; s >= limit ; s -= 1 ) { // oneChar ) {
			// break at > or <
			if ( *s == '>' ) break;
			if ( *s != '<' ) continue;
			// . TODO: this could be in a quoted string too! fix!!
			// . is it in a <meta> tag?
			if ( to_lower_a(s[1]) == 'm' &&
			     to_lower_a(s[2]) == 'e' &&
			     to_lower_a(s[3]) == 't' &&
			     to_lower_a(s[4]) == 'a' ) {
				inTag = true;
				break;
			}
			// is it in an <xml> tag?
			if ( to_lower_a(s[1]) == 'x' &&
			     to_lower_a(s[2]) == 'm' &&
			     to_lower_a(s[3]) == 'l' ) {
				inTag = true;
				break;
			}
			// is it in an <?xml> tag?
			if ( to_lower_a(s[1]) == '?' &&
			     to_lower_a(s[2]) == 'x' &&
			     to_lower_a(s[3]) == 'm' &&
			     to_lower_a(s[4]) == 'l' ) {
				inTag = true;
				break;
			}
		}
		// if not in a tag proper, it is useless
		if ( ! inTag ) continue;
		// skip over equal sign
		p += 1;//oneChar;
		// skip over ' or "
		if ( *p == '\'' ) p += 1;//oneChar;
		if ( *p == '\"' ) p += 1;//oneChar;
		// keep start ptr
		const char *csString = p;
		// set a limit
		limit = p + 50;
		if ( limit > pend ) limit = pend;
		if ( limit < p    ) limit = pend;
		// stop at first special character
		while ( p < limit &&
			*p &&
			*p !='\"' &&
			*p !='\'' &&
			! is_wspace_a(*p) &&
			*p !='>' &&
			*p != '<' &&
			*p !='?' &&
			*p !='/' &&
			// fix yaya.pro-street.us which has
			// charset=windows-1251;charset=windows-1"
			*p !=';' &&
			*p !='\\' )
			p += 1;//oneChar;
		size_t csStringLen = (size_t)(p-csString);
		// get the character set
		metaCharset = get_iana_charset(csString, csStringLen);
		// update "charset" to "metaCs" if known, it overrides all
		if (metaCharset != csUnknown ) {
			charset = metaCharset;
			break;
		}
	}

        // alias these charsets so iconv understands
	if ( charset == csISO58GB231280 ||
	     charset == csHZGB2312      ||
	     charset == csGB2312         )
		charset = csGB18030;

	if ( charset == csEUCKR )
		charset = csKSC56011987; //x-windows-949

	// use utf8 if still unknown
	if ( charset == csUnknown ) {
		if ( g_conf.m_logDebugSpider )
			logf(LOG_DEBUG,"doc: forcing utf8 charset");
		charset = csUTF8;
	}

	// once again, if the doc is claiming utf8 let's double check it!
	if ( charset == csUTF8 ) {
		// use this for iterating
		char size;
		// loop over every char
		for ( const char *s = pstart ; s < pend ; s += size ) {
			// set
			size = getUtf8CharSize(s);
			// sanity check
			if ( ! isFirstUtf8Char ( s ) ) {
				// but let 0x80 slide? it is for the
				// 0x80 0x99 apostrophe i've seen for
				// eventvibe.com. it did have a first byte,
				// 0xe2 that led that sequence but it was
				// converted into &acirc; by something that
				// thought it was a latin1 byte.
				if ( s[0] == (char)0x80 &&
				     s[1] == (char)0x99 ) {
					s += 2;
					size = 0;
					continue;
				}
				// note it
				log(LOG_DEBUG,
				    "build: says UTF8 (2) but does not "
				    "seem to be for url %s"
				    " Resetting to ISOLatin1.",url);
				// reset it to ISO then! that's pretty common
				// no! was causing problems for
				// eventvibe.com/...Yacht because it had
				// some messed up utf8 in it but it really
				// was utf8. CRAP, but really messes up
				// sunsetpromotions.com and washingtonia
				// if we do not have this here
				charset = csISOLatin1;
				invalidUtf8Encoding = true;
				break;
			}
		}
	}

	log(LOG_INFO, "encoding: charset='%s' header='%s' bom='%s' meta='%s' invalid=%d url='%s'",
	    get_charset_str(charset), get_charset_str(httpHeaderCharset), get_charset_str(unicodeBOMCharset),
	    get_charset_str(metaCharset), invalidUtf8Encoding, url);

	// all done
	return charset;
}


uint16_t *XmlDoc::getCharset ( ) {
	if ( m_charsetValid ) {
		return &m_charset;
	}

	// . get ptr to filtered content
	// . we can't get utf8 content yet until we know what charset this
	//   junk is so we can convert it!
	char **fc = getFilteredContent();
	if ( ! fc || fc == (void *)-1 ) {
		return (uint16_t *)fc;
	}

	// scan document for two things:
	// 1.  charset=  (in a <meta> tag)
	// 2. encoding=  (in an <?xml> tag)
	char *pstart = *fc;
	//char *pend   = *fc + m_filteredContentLen;

	// assume known charset
	m_charset = csUnknown;
	// make it valid regardless i guess
	m_charsetValid = true;

	// check in http mime for charset
	HttpMime *mime = getMime();
	if (mime && mime->getContentType() == CT_PDF) {
		// assume UTF-8
		m_charset = csUTF8;
		m_charsetValid = true;

		return &m_charset;
	}

	if( !mime ) {
		return NULL;
	}

	m_charset = getCharsetFast ( mime ,
				     m_firstUrl.getUrl(),
				     pstart ,
				     m_filteredContentLen );
	m_charsetValid = true;
	return &m_charset;
}



// declare these two routines for using threads
static void filterDoneWrapper    ( void *state, job_exit_t exit_type );
static void filterStartWrapper_r ( void *state );

// filters m_content if its pdf, word doc, etc.
char **XmlDoc::getFilteredContent ( ) {
	// return it if we got it already
	if ( m_filteredContentValid ) return &m_filteredContent;

	// this must be valid
	char **content = getContent();
	if ( ! content || content == (void *)-1 ) return content;
	// get the content type
	uint8_t *ct = getContentType();
	if ( ! ct ) return NULL;
	// it needs this
	HttpMime *mime = getMime();
	if ( ! mime || mime == (void *)-1 ) return (char **)mime;

	// make sure NULL terminated always
	// Why? pdfs can have nulls embedded
	// if ( m_content &&
	//      m_contentValid &&
	//      m_content[m_contentLen] ) {
	// 	g_process.shutdownAbort(true); }

	int32_t max , max2;
	bool filterable = false;

	if ( !m_calledThread ) {

		// assume we do not need filtering by default
		m_filteredContent      = m_content;
		m_filteredContentLen   = m_contentLen;
		m_filteredContentValid = true;
		m_filteredContentAllocSize = 0;

		// empty content?
		if ( ! m_content ) return &m_filteredContent;

		if ( *ct == CT_HTML    ) return &m_filteredContent;
		if ( *ct == CT_TEXT    ) return &m_filteredContent;
		if ( *ct == CT_XML     ) return &m_filteredContent;
		// javascript - sometimes has address information in it, so keep it!
		if ( *ct == CT_JS      ) return &m_filteredContent;
		if ( m_contentLen == 0 ) return &m_filteredContent;

		// we now support JSON for diffbot
		if ( *ct == CT_JSON    ) return &m_filteredContent;

		if ( *ct == CT_ARC     ) return &m_filteredContent;
		if ( *ct == CT_WARC    ) return &m_filteredContent;

		// unknown content types are 0 since it is probably binary... and
		// we do not want to parse it!!
		if ( *ct == CT_PDF ) filterable = true;
		if ( *ct == CT_DOC ) filterable = true;
		if ( *ct == CT_XLS ) filterable = true;
		if ( *ct == CT_PPT ) filterable = true;
		if ( *ct == CT_PS  ) filterable = true;

		// if its a jpeg, gif, text/css etc. bail now
		if ( ! filterable ) {
			m_filteredContent      = NULL;
			m_filteredContentLen   = 0;
			m_filteredContentValid = true;
			return &m_filteredContent;
		}

		// invalidate
		m_filteredContentValid = false;

		CollectionRec *cr = getCollRec();
		if( !cr ) {
			return NULL;
		}

		// if not text/html or text/plain, use the other max
		//max = MAXDOCLEN; // cr->m_maxOtherDocLen;
		max = cr->m_maxOtherDocLen;

		// now we base this on the pre-filtered length to save memory because
		// our maxOtherDocLen can be 30M and when we have a lot of injections
		// at the same time we lose all our memory quickly
		max2 = 5 * m_contentLen + 10*1024;
		if ( max > max2 ) max = max2;
		// user uses -1 to specify no maxTextDocLen or maxOtherDocLen
		if ( max < 0    ) max = max2;
		// make a buf to hold filtered reply
		m_filteredContentAllocSize = max;
		m_filteredContent = (char *)mmalloc(m_filteredContentAllocSize,"xdfc");
		if ( ! m_filteredContent ) {
			log("build: Could not allocate %" PRId32" bytes for call to "
			    "content filter.",m_filteredContentMaxSize);
			return NULL;
		}

		// reset this here in case thread gets killed by the kill() call below
		m_filteredContentLen = 0;
		// update status msg so its visible in the spider gui
		setStatus ( "filtering content" );
		// reset this... why?
		g_errno = 0;
		// . call thread to call popen
		// . callThread returns true on success, in which case we block
		// . do not repeat
		m_calledThread = true;
		// reset this since filterStart_r() will set it on error
		m_errno = 0;

		// this must be valid before we spawn the thread; do not core
		// inside the thread because that would not save our files
		if ( ! m_mimeValid ) { g_process.shutdownAbort(true); }

		// do it
		if ( g_jobScheduler.submit(filterStartWrapper_r, filterDoneWrapper, this, thread_type_spider_filter, MAX_NICENESS) ) {
			// return -1 if blocked
			return (char **) -1;
		}

		// clear error!
		g_errno = 0;

		// note it
		log(LOG_INFO, "build: Could not spawn thread for call to content filter.");
		// get the data
		filterStart_r ( false ); // am thread?
	}
	// skip down here if thread has returned and we got re-called

	// if size is 0, free the buf
	if ( m_filteredContentLen <= 0 ) {
		mfree ( m_filteredContent , m_filteredContentAllocSize,"fcas");
		m_filteredContent          = NULL;
		m_filteredContentLen       = 0;
		m_filteredContentAllocSize = 0;
	}

	// did we have an error from the thread?
	if ( m_errno ) g_errno = m_errno;
	// but bail out if it set g_errno
	if ( g_errno ) return NULL;
	// must be valid now - sanity check
	if ( ! m_filteredContentValid ) { g_process.shutdownAbort(true); }
	// return it
	return &m_filteredContent;
}

// come back here
// Use of ThreadEntry parameter is NOT thread safe
static void filterDoneWrapper ( void *state, job_exit_t /*exit_type*/ ) {
	// jump back into the brawl
	XmlDoc *THIS = (XmlDoc *)state;

	// if size is 0, free the buf. have to do this outside the thread
	// since malloc/free cannot be called in thread
	if ( THIS->m_filteredContentLen <= 0 ) {
		mfree ( THIS->m_filteredContent, THIS->m_filteredContentAllocSize,"fcas");
		THIS->m_filteredContent          = NULL;
		THIS->m_filteredContentLen       = 0;
		THIS->m_filteredContentAllocSize = 0;
	}

	// . call the master callback
	// . it will ultimately re-call getFilteredContent()
	THIS->m_masterLoop ( THIS->m_masterState );
}

// thread starts here
// Use of ThreadEntry parameter is NOT thread safe
static void filterStartWrapper_r ( void *state ) {
	XmlDoc *that = (XmlDoc *)state;

	// assume no error since we're at the start of thread call
	that->m_errno = 0;

	that->filterStart_r ( true ); // am thread?

	if (g_errno && !that->m_errno) {
		that->m_errno = g_errno;
	}
}


// sets m_errno on error
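// . converts binary formats (pdf, doc, xls, ppt, ps) to text by shelling
//   out to an external converter
// . the flow: write m_content to a per-thread input file in the working
//   dir, run the converter under resource limits, then read the converted
//   text back from the output file into m_filteredContent
// . as an illustration (paths are per-install, so treat this as a sketch),
//   for a .doc the command built below looks roughly like:
//     ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=<wdir>/antiword-dir ;
//     timeout 30s nice -n 19 <wdir>/antiword <in> > <out>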
void XmlDoc::filterStart_r ( bool amThread ) {
	// get thread id
	pthread_t id = pthread_self();
	// sanity check
	if ( ! m_contentTypeValid ) { g_process.shutdownAbort(true); }
	// shortcut
	int32_t ctype = m_contentType;

	// assume none
	m_filteredContentLen = 0;

	// pass the input to the program through this file
	// rather than a pipe, since popen() seems broken
	char in[1024];
	snprintf(in,1023,"%sin.%" PRId64, g_hostdb.m_dir , (int64_t)id );
	unlink ( in );
	// collect the output from the filter from this file
	char out[1024];
	snprintf ( out , 1023,"%sout.%" PRId64, g_hostdb.m_dir, (int64_t)id );
	unlink ( out );
	// ignore errno from those unlinks
	errno = 0;
	// open the input file
 retry11:
	int fd = open ( in , O_WRONLY | O_CREAT , getFileCreationFlags() );
	if ( fd < 0 ) {
		// valgrind
		if ( errno == EINTR ) goto retry11;
		m_errno = errno;
		log(LOG_WARN, "build: Could not open file %s for writing: %s.", in,mstrerror(m_errno));
		return;
	}
	// we are in a thread, this must be valid!
	if ( ! m_mimeValid ) { g_process.shutdownAbort(true);}

 retry12:
	// write the content into the input file
	int32_t w = write ( fd , m_content , m_contentLen );
	// valgrind
	if ( w < 0 && errno == EINTR ) goto retry12;
	// did we get an error
	if ( w != m_contentLen ) {
		//int32_t w = fwrite ( m_buf , 1 , m_bufLen , pd );
		//if ( w != m_bufLen ) {
		m_errno = errno;
		log(LOG_WARN, "build: Error writing to %s: %s.",in, mstrerror(m_errno));
		close(fd);
		return;
	}
	// close the file
	close ( fd );

	// shortcut
	char *wdir = g_hostdb.m_dir;

	char cmd[2048] = {};

	if (ctype == CT_PDF) {
		snprintf(cmd, 2047, "%sgbconvert.sh %s %s %s", wdir, g_contentTypeStrings[ctype], in, out);
	} else if ( ctype == CT_DOC ) {
		// "wdir" include trailing '/'? not sure
		snprintf(cmd, 2047, "ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=%s/antiword-dir ; timeout 30s nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
	} else if ( ctype == CT_XLS ) {
		snprintf(cmd, 2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/xlhtml %s > %s" , wdir , in , out );
	// this converter is too buggy for now and causes hanging threads,
	// so we added 'timeout 10s'. that only works on newer linux
	// versions, so it will just error out otherwise.
	} else if ( ctype == CT_PPT ) {
		snprintf(cmd, 2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/ppthtml %s > %s" , wdir , in , out );
	} else if ( ctype == CT_PS  ) {
		snprintf(cmd, 2047, "ulimit -v 25000 ; ulimit -t 30; timeout 10s nice -n 19 %s/pstotext %s > %s" , wdir , in , out );
	} else {
		gbshutdownLogicError();
	}

	// breach sanity check
	//if ( strlen(cmd) > 2040 ) { g_process.shutdownAbort(true); }

	// execute it
	int retVal = gbsystem ( cmd );
	if ( retVal == -1 ) {
		log( LOG_WARN, "gb: system(%s) : %s", cmd, mstrerror( g_errno ) );
	}

	// all done with input file
	// clean up the binary input file from disk
	if ( unlink ( in ) != 0 ) {
		// log error
		log( LOG_WARN, "gbfilter: unlink(%s): %s",in, strerror(errno));

		// ignore it, since it was not a processing error per se
		errno = 0;
	}

retry13:
	fd = open ( out , O_RDONLY );
	if ( fd < 0 ) {
		// valgrind
		if ( errno == EINTR ) goto retry13;
		m_errno = errno;
		log( LOG_WARN, "gbfilter: Could not open file %s for reading: %s.", out,mstrerror(m_errno));
		return;
	}
	// sanity -- need room to store a \0
	if ( m_filteredContentAllocSize < 2 ) { g_process.shutdownAbort(true); }
	// to read - leave room for \0
	int32_t toRead = m_filteredContentAllocSize - 1;
retry14:
	// read right from pipe descriptor
	int32_t r = read (fd, m_filteredContent,toRead);
	// note errors
	if ( r < 0 ) {
		// valgrind
		if ( errno == EINTR ) goto retry14;
		log( LOG_WARN, "gbfilter: reading output: %s",mstrerror(errno));
		// this is often bad fd from an oom error, so ignore it
		//m_errno = errno;
		errno = 0;
		r = 0;
	}
	// clean up shop
	close ( fd );
	// delete output file
	unlink ( out );

	// validate now
	m_filteredContentValid = 1;
	// save the new buf len
	m_filteredContentLen = r;
	// ensure enough room for null term
	if ( r >= m_filteredContentAllocSize ) {
		g_process.shutdownAbort(true);
	}
	// ensure filtered stuff is NULL terminated so we can set the Xml class
	m_filteredContent [ m_filteredContentLen ] = '\0';
	// it is good
	m_filteredContentValid = true;

	// . at this point we got the filtered content
	// . bitch if we didn't allocate enough space
	if ( r > 0 && r == toRead )
		log(LOG_LOGIC,"build: Had to truncate document to %" PRId32" bytes "
		    "because did not allocate enough space for filter. "
		    "This should never happen. It is a hack that should be "
		    "fixed right.", toRead );
}


// return downloaded content as utf8
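// . the flow: detect the charset, convert the filtered content to utf-8
//   with ucToUtf8() if it is not already ascii/utf-8, replace any embedded
//   \0 bytes with spaces, then walk the bytes to verify utf-8 character
//   boundaries so later parsing cannot breach the buffer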
char **XmlDoc::getRawUtf8Content ( ) {
	// if we already computed it, return that
	if ( m_rawUtf8ContentValid ) return &m_rawUtf8Content;

	// . get our characterset
	// . crap! this can be recursive. it calls getXml() which calls
	//   getUtf8Content() which is us!
	uint16_t *charset = getCharset ( );
	if ( ! charset || charset == (uint16_t *)-1 ) return (char **)charset;

	const char *csName = get_charset_str(*charset);

	// . if not supported fix that!
	// . m_indexCode should be set to EBADCHARSET ultimately, but not here
	if ( ! supportedCharset(*charset) && csName ) {
		m_rawUtf8Content          = NULL;
		m_rawUtf8ContentSize      = 0;
		m_rawUtf8ContentAllocSize = 0;
		m_rawUtf8ContentValid     = true;
		return &m_rawUtf8Content;
	}

	// get ptr to filtered content
	char **fc = getFilteredContent();
	if ( ! fc || fc == (void *)-1 ) return (char **)fc;

	// make sure NULL terminated always
	if ( m_filteredContent &&
	     m_filteredContentValid &&
	     m_filteredContent[m_filteredContentLen] ) {
		g_process.shutdownAbort(true); }

	// NULL out if no content
	if ( ! m_filteredContent ) {
		m_rawUtf8Content          = NULL;
		m_rawUtf8ContentSize      = 0;
		m_rawUtf8ContentAllocSize = 0;
		m_rawUtf8ContentValid     = true;
		return &m_rawUtf8Content;
	}

	// assume already utf8
	m_rawUtf8Content          = m_filteredContent;
	m_rawUtf8ContentSize      = m_filteredContentLen + 1;
	m_rawUtf8ContentAllocSize = 0;

	// if we are not ascii or utf8 already, encode it into utf8
	if ( m_rawUtf8ContentSize > 1 &&
	     csName &&
	     *charset != csASCII &&
	     *charset != csUTF8 ) {
		// ok, no-go
		//ptr_utf8Content = NULL;
		m_rawUtf8Content = NULL;
		// assume utf8 will be twice the size ... then add a little
		int32_t  need = (m_filteredContentLen * 2) + 4096;
		char *buf  = (char *) mmalloc(need, "Xml3");
		// log oom error
		if ( ! buf ) {
			log("build: xml: not enough memory for utf8 buffer");
			return NULL;
		}
		// note it
		setStatus ( "converting doc to utf8" );
		// returns # of bytes i guess
		int32_t used = ucToUtf8 ( buf                  ,
				       // fix core dump by subtracting 10!
				       need - 10,
				       m_filteredContent    ,
				       m_filteredContentLen ,
				       csName               ,
				       -1                   );//allowBadChars
		// clear this if successful, otherwise, it sets errno
		if ( used > 0 ) g_errno = 0;
		// unrecoverable error? bad charset is g_errno == E2BIG
		// which is like argument list too long or something
		// error from Unicode.cpp's call to iconv()
		if ( g_errno )
			log(LOG_INFO, "build: xml: failed parsing buffer: %s "
			    "(cs=%d)", mstrerror(g_errno), *charset);
		if ( g_errno && g_errno != E2BIG ) {
			mfree ( buf, need, "Xml3");
			// do not index this doc, delete from spiderdb/tfndb
			//if ( g_errno != ENOMEM ) m_indexCode = g_errno;
			// if conversion failed NOT because of bad charset
			// then return NULL now and bail out. probably ENOMEM
			return NULL;
		}
		// if bad charset... just make doc empty as a utf8 doc
		if ( g_errno == E2BIG ) {
			used = 0;
			buf[0] = '\0';
			buf[1] = '\0';
			// clear g_errno
			g_errno = 0;
			// and make a note for getIndexCode() so it will not
			// bother indexing the doc! nah, just index it
			// but with no content...
		}
		// crazy? this is pretty important...
		if ( used + 10 >= need )
			log("build: utf8 using too much buf space!!! u=%s",
			    getFirstUrl()->getUrl());
		// re-assign
		//ptr_utf8Content        = buf;
		//size_utf8Content       = used + 1;
		//m_utf8ContentAllocSize = need;
		m_rawUtf8Content          = buf;
		m_rawUtf8ContentSize      = used + 1;
		m_rawUtf8ContentAllocSize = need;
	}

	// convert \0's to spaces. why do we see these in some pages?
	// http://www.golflink.com/golf-courses/ has one in the middle after
	// about 32k of content.
	char *p    =     m_rawUtf8Content;
	char *pend = p + m_rawUtf8ContentSize - 1;
	for ( ; p < pend ; p++ ) {
		if ( ! *p ) *p = ' ';
	}


	//
	// VALIDATE the UTF-8
	//

	// . make a buffer to hold the decoded content now
	// . we were just using the m_expandedUtf8Content buf itself, but "n"
	//   ended up equalling m_expandedUtf8ContentSize one time for a
	//   doc, http://ediso.net/, which probably had corrupt utf8 in it,
	//   and that breached our buffer! so verify that this is good
	//   utf8, and that we can parse it without breaching our buffer!
	p = m_rawUtf8Content;
	// make sure NULL terminated always
	if ( p[m_rawUtf8ContentSize-1]) { g_process.shutdownAbort(true);}
	// make sure we don't breach the buffer when parsing it
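	// (getUtf8CharSize() presumably keys off the utf8 lead byte:
	//  0xxxxxxx -> 1 byte, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4;
	//  a corrupt lead byte near the end can claim more bytes than remain
	//  and push "p" past "pend", which the overflow fixup below handles)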
	char size;
	char *lastp = NULL;
	for ( ; ; p += size ) {
		if ( p >= pend ) break;
		lastp = p;
		size = getUtf8CharSize(p);
	}
	// overflow?
	if ( p > pend && lastp ) {
		// back up to the bad utf8 char that made us overshoot
		p = lastp;
		// space it out
		for ( ; p < pend ; p++ ) *p = ' ';
		// log it maybe due to us not being keep alive http server?
		log("doc: fix bad utf8 overflow (because we are not "
		    "keepalive?) in doc %s",m_firstUrl.getUrl());
	}
	// sanity: the utf8 walk must land exactly on pend
	if ( p != pend ) { g_process.shutdownAbort(true); }
	// sanity check for breach. or underrun in case we encountered a
	// premature \0
	if (p-m_rawUtf8Content!=m_rawUtf8ContentSize-1) {g_process.shutdownAbort(true);}

	// sanity -- must be \0 terminated
	if ( m_rawUtf8Content[m_rawUtf8ContentSize-1] ) {g_process.shutdownAbort(true); }

	// it might have shrunk us
	//m_rawUtf8ContentSize = n + 1;
	// we are good to go
	m_rawUtf8ContentValid = true;

	//return &ptr_utf8Content;
	return &m_rawUtf8Content;
}

// this is so Msg13.cpp can call getExpandedUtf8Content() to do its
// iframe expansion logic
static void getExpandedUtf8ContentWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	char **retVal = THIS->getExpandedUtf8Content();
	// return if blocked again
	if ( retVal == (void *)-1 ) return;
	// otherwise, all done, call the caller callback
	THIS->callCallback();
}

// now if there are any <iframe> tags let's substitute them for
// the html source they represent here. that way we will get all the
// information you see on the page. this is somewhat critical since
// a lot of pages have their content in the frame.
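// an illustrative rewrite (the markers match what is printed below): given
//   ... <iframe src="http://host/widget.html"></iframe> ...
// the expanded buffer will contain
//   ... <iframe src="http://host/widget.html"><gbframe>downloaded utf8
//   content of widget.html</gbframe></iframe> ...
// and the inserted content is additionally wrapped in <script>...</script>
// when the frame's content type is javascript.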
char **XmlDoc::getExpandedUtf8Content ( ) {
	// if we already computed it, return that
	if ( m_expandedUtf8ContentValid ) return &m_expandedUtf8Content;

	// if called from spider compression proxy we need to set
	// masterLoop here now
	if ( ! m_masterLoop ) {
		m_masterLoop  = getExpandedUtf8ContentWrapper;
		m_masterState = this;
	}

	// get the unexpanded content first
	char **up = getRawUtf8Content ();
	if ( ! up || up == (void *)-1 ) return up;

	Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1 ) return (char **)cu;

	// NULL out if no content
	if ( ! *up ) {
		m_expandedUtf8Content          = NULL;
		m_expandedUtf8ContentSize      = 0;
		m_expandedUtf8ContentValid     = true;
		return &m_expandedUtf8Content;
	}

	// do not do iframe expansion in order to keep injections fast
	if ( m_wasContentInjected ) {
		m_expandedUtf8Content     = m_rawUtf8Content;
		m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
		m_expandedUtf8ContentValid = true;
		return &m_expandedUtf8Content;
	}

	uint8_t *ct = getContentType();
	if ( ! ct || ct == (void *)-1 ) return (char **)ct;

	// if we have a json reply, leave it alone... do not expand iframes
	// in json, it will mess up the json
	if ( *ct == CT_JSON ) {
		m_expandedUtf8Content     = m_rawUtf8Content;
		m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
		m_expandedUtf8ContentValid = true;
		return &m_expandedUtf8Content;
	}

	// we need this so getExtraDoc does not core
	int32_t *pfip = getFirstIp();
	if ( ! pfip || pfip == (void *)-1 ) return (char **)pfip;

	// point to it
	char *p    = *up;
	char *pend = *up + m_rawUtf8ContentSize; // includes \0
	// declare crap up here so we can jump into the for loop
	int32_t urlLen;
	char *url;
	char *fend;
	Url furl;
	XmlDoc **ped;
	XmlDoc *ed;
	bool inScript = false;
	bool match;
	// assign saved value if we got that
	if ( m_savedp ) {
		// restore "p"
		p = m_savedp;
		// update this
		ed = m_extraDoc;
		// and see if we got the mime now
		goto gotMime;
	}
	// now loop for frame and iframe tags
	for ( ; p < pend ; p += getUtf8CharSize(p) ) {
		// if never found a frame tag, just keep on chugging
		if ( *p != '<' ) continue;
		// <script>?
		if ( to_lower_a(p[1]) == 's' &&
		     to_lower_a(p[2]) == 'c' &&
		     to_lower_a(p[3]) == 'r' &&
		     to_lower_a(p[4]) == 'i' &&
		     to_lower_a(p[5]) == 'p' &&
		     to_lower_a(p[6]) == 't' )
			inScript = true;
		// </script>?
		if ( p[1]=='/' &&
		     to_lower_a(p[2]) == 's' &&
		     to_lower_a(p[3]) == 'c' &&
		     to_lower_a(p[4]) == 'r' &&
		     to_lower_a(p[5]) == 'i' &&
		     to_lower_a(p[6]) == 'p' &&
		     to_lower_a(p[7]) == 't' )
			inScript = false;
		// . skip if in script
		// . fixes guysndollsllc.com which has an iframe tag in
		//   a script section, "document.write ('<iframe..."
		if ( inScript ) continue;
		// iframe or frame?
		match = false;
		if ( to_lower_a(p[1]) == 'f' &&
		     to_lower_a(p[2]) == 'r' &&
		     to_lower_a(p[3]) == 'a' &&
		     to_lower_a(p[4]) == 'm' &&
		     to_lower_a(p[5]) == 'e' )
			match = true;
		if ( to_lower_a(p[1]) == 'i' &&
		     to_lower_a(p[2]) == 'f' &&
		     to_lower_a(p[3]) == 'r' &&
		     to_lower_a(p[4]) == 'a' &&
		     to_lower_a(p[5]) == 'm' &&
		     to_lower_a(p[6]) == 'e' )
			match = true;
		// skip tag if not iframe or frame
		if ( ! match ) continue;
		// check for frame or iframe
		//if ( strncasecmp(p+1,"frame " , 6) &&
		//     strncasecmp(p+1,"iframe ", 7) )
		//	continue;
		// get src tag (function in Words.h)
		url = getFieldValue ( p , pend - p ,"src" , &urlLen );
		// needs a src field
		if ( ! url ) continue;
		// "" is not acceptable either. techcrunch.com has
		// <iframe src=""> which ends up embedding the root url.
		if ( urlLen == 0 )
			continue;
		// skip if "about:blank"
		if ( urlLen==11 && strncmp(url,"about:blank",11) == 0 )
			continue;
		// get our current url
		//cu = getCurrentUrl();
		// set our frame url
		furl.set( cu, url, urlLen );
		// no recursion
		if ( strcmp(furl.getUrl(),m_firstUrl.getUrl()) == 0 )
			continue;
		// must be http or https, not ftp! ftp was causing us to
		// core in Msg22.cpp where it checks the url's protocol
		// when trying to lookup the old title rec.
		// http://sweetaub.ipower.com/ had an iframe with a ftp url.
		if ( ! furl.isHttp() && ! furl.isHttps() ) continue;
		/// @todo why are we ignoring specific domains here?
		// ignore google.com/ iframe urls for now
		if ( strstr(furl.getUrl(),"google.com/" ) ) continue;
		// and bing just to be safe
		if ( strstr(furl.getUrl(),"bing.com/" ) ) continue;
		// save it in case we have to return and come back later
		m_savedp = p;
		// break here
		//log("mdw: breakpoing here");
		// . download that. get as a doc. use 0 for max cache time
		// . no, use 5 seconds since we often have the same iframe
		//   in the root doc that we have in the  main doc, like a
		//   facebook iframe or something.
		// . use a m_maxCacheAge of 5 seconds now!
		ped = getExtraDoc ( furl.getUrl() , 5 );
		// should never block
		if ( ! ped ) {
			log("xmldoc: getExpandedutf8content = %s",
			    mstrerror(g_errno));
			return NULL;
		}
		// . return -1 if it blocked???
		// . no, this is not supported right now
		// . it will mess up our for loop
		if ( ped == (void *)-1 ) {g_process.shutdownAbort(true);}
		// cast it
		ed = *ped;
		// sanity
		if ( ! ed ) { g_process.shutdownAbort(true); }
		// jump in here from above
	gotMime:
		// make it not use the ips.txt cache
		//ed->m_useIpsTxtFile     = false;
		// get the mime
		HttpMime *mime = ed->getMime();
		if ( ! mime || mime == (void *)-1 ) return (char **)mime;
		// if not success, do not expand it i guess...
		if ( mime->getHttpStatus() != 200 ) {
			// free it
			nukeDoc ( ed );
			// and continue
			continue;
		}
		// update m_downloadEndTime if we should
		if ( ed->m_downloadEndTimeValid ) {
			// we must already be valid
			if ( ! m_downloadEndTimeValid ) {g_process.shutdownAbort(true);}
			// only replace it if it had ip and robots.txt allowed
			if ( ed->m_downloadEndTime )
				m_downloadEndTime = ed->m_downloadEndTime;
		}

		// re-write that extra doc into the content
		char **puc = ed->getRawUtf8Content();
		// this should not block
		//if ( puc == (void *)-1 ) { g_process.shutdownAbort(true); }
		// it blocked before! because the charset was not known!
		if ( puc == (void *)-1 ) return (char **)puc;
		// error?
		if ( ! puc ) return (char **)puc;
		// cast it
		char *uc = *puc;
		// or if no content, and no mime (like if robots.txt disallows)
		if ( ! uc || ed->m_rawUtf8ContentSize == 1 ) {
			// free it
			nukeDoc ( ed );
			// and continue
			continue;
		}
		// size includes terminating \0
		if ( uc[ed->m_rawUtf8ContentSize-1] ) { g_process.shutdownAbort(true);}

		// if first time we are expanding, set this
		if ( ! m_oldp ) m_oldp = *up;

		// find end of frame tag
		fend = p;
		for ( ; fend < pend ; fend += getUtf8CharSize(fend) ) {
			// scan for the '>' that closes this frame/iframe tag
			if ( *fend == '>' ) break;
		}
		// if no end to the iframe tag was found, bail then...
		if ( fend >= pend ) continue;
		// skip the >
		fend++;

		// insert the non-frame crap first AND the frame/iframe tag
		m_esbuf.safeMemcpy ( m_oldp , fend - m_oldp );
		// end the frame
		//m_esbuf.safeMemcpy ( "</iframe>", 9 );
		// use our own special tag so Sections.cpp can set
		// Section::m_gbFrameNum which it uses internally
		m_esbuf.safePrintf("<gbframe>"); // gbiframe
		// identify javascript
		bool javascript = false;

		uint8_t *ct = ed->getContentType();
		if ( ct && *ct == CT_JS ) {
			javascript = true;
		}

		// so we do not mine javascript for cities and states etc.
		// in Address.cpp
		if ( javascript ) {
			m_esbuf.safePrintf("<script>");
		}
		// store that
		m_esbuf.safeMemcpy ( uc , ed->m_rawUtf8ContentSize - 1 );
		// our special tag has an end tag as well
		if ( javascript ) m_esbuf.safePrintf("</script>");
		m_esbuf.safePrintf("</gbframe>");
		// free up ed
		nukeDoc ( ed );

		// end of frame tag, skip over whole thing
		m_oldp = fend ;
		// sanity check
		if ( m_oldp > pend ) { g_process.shutdownAbort(true); }
		// another flag
		m_didExpansion = true;
		// count how many we did
		if ( ++m_numExpansions >= 5 ) break;
	}
	// default
	m_expandedUtf8Content     = m_rawUtf8Content;
	m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
	// point to expansion buffer if we did any expanding
	if ( m_didExpansion ) {
		// copy over the rest
		m_esbuf.safeMemcpy ( m_oldp , pend - m_oldp );
		// null term it
		m_esbuf.pushChar('\0');
		// and point to that buffer
		m_expandedUtf8Content     = m_esbuf.getBufStart();//m_buf;
		// include the \0 as part of the size
		m_expandedUtf8ContentSize = m_esbuf.m_length; // + 1;
	}
	// sanity -- must be \0 terminated
	if ( m_expandedUtf8Content[m_expandedUtf8ContentSize-1] ) {
		g_process.shutdownAbort(true); }

	m_expandedUtf8ContentValid = true;
	return &m_expandedUtf8Content;
}





// . get the final utf8 content of the document
// . all html entities are replaced with utf8 chars
// . all iframes are expanded
// . if we are using diffbot then getting the utf8 content should return
//   the json which is the output from the diffbot api. UNLESS we are getting
//   the webpage itself for harvesting outlinks to spider later.
char **XmlDoc::getUtf8Content ( ) {

	// if we already computed it, return that
	if ( m_utf8ContentValid ) return &ptr_utf8Content;

	if ( m_setFromTitleRec ) {
		m_utf8ContentValid = true;
		return &ptr_utf8Content;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	setStatus("getting utf8 content");

	// recycle?
	if ( cr->m_recycleContent || m_recycleContent ||
	     // if trying to delete from index, load from old titlerec
	     m_deleteFromIndex ) {
		// get the old xml doc from the old title rec
		XmlDoc **pod = getOldXmlDoc ( );
		if ( ! pod || pod == (void *)-1 ) return (char **)pod;
		// shortcut
		XmlDoc *od = *pod;
		// this is non-NULL if it existed
		if ( od ) {
			ptr_utf8Content    = od-> ptr_utf8Content;
			size_utf8Content   = od->size_utf8Content;
			m_utf8ContentValid = true;
			m_contentType      = od->m_contentType;
			m_contentTypeValid = true;
			// sanity check
			if ( ptr_utf8Content &&
			     ptr_utf8Content[size_utf8Content-1] ) {
				g_process.shutdownAbort(true); }
			return &ptr_utf8Content;
		}
		// if could not find title rec and we are docid-based then
		// we can't go any further!!
		if ( m_setFromDocId ||
		     // it should be there if trying to delete as well!
		     m_deleteFromIndex ) {
			log("xmldoc: null utf8 content for docid-based "
			    "titlerec (d=%" PRId64") lookup which was not found",
			    m_docId);
			ptr_utf8Content = NULL;
			size_utf8Content = 0;
			m_utf8ContentValid = true;
			m_contentType = CT_HTML;
			m_contentTypeValid = true;
			return &ptr_utf8Content;
		}

	}

	char **ep = getExpandedUtf8Content();
	if ( ! ep || ep == (void *)-1 ) return ep;

	// NULL out if no content
	if ( ! *ep ) {
		ptr_utf8Content    = NULL;
		size_utf8Content   = 0;
		m_utf8ContentValid = true;
		return &ptr_utf8Content;
	}

	uint8_t *ct = getContentType();
	if ( ! ct || ct == (void *)-1 ) return (char **)ct;

	// if we have a json reply, leave it alone... expanding a &quot;
	// into a double quote will mess up the JSON!
	if ( *ct == CT_JSON ) {
		ptr_utf8Content  = (char *)m_expandedUtf8Content;
		size_utf8Content = m_expandedUtf8ContentSize;
		m_utf8ContentValid = true;
		return &ptr_utf8Content;
	}

	// why would the spider proxy, which uses Msg13.cpp to call
	// XmlDoc::getExpandedUtf8Content(), want to call this??? it seems
	// to destroy m_expandedUtf8Content with a call to htmlDecode()
	if ( m_isSpiderProxy ) { g_process.shutdownAbort(true); }

	// sanity check
	if ( m_xmlValid   ) { g_process.shutdownAbort(true); }
	if ( m_wordsValid ) { g_process.shutdownAbort(true); }

	//
	// convert illegal utf8 characters into spaces
	//
	// fixes santaclarachorale.vbotickets.com/tickets/g.f._handels_israel_in_egypt/1062
	// which has a 228,0x80,& sequence (3 chars, last is ascii)
	char *x = m_expandedUtf8Content;
	char size;
	for ( ; *x ; x += size ) {
		size = getUtf8CharSize(x);
		// ok, make it a space i guess if it is a bad utf8 char
		if ( ! isValidUtf8Char(x) ) {
			*x = ' ';
			size = 1;
			continue;
		}
	}

	// sanity
	if ( ! m_contentTypeValid ) { g_process.shutdownAbort(true); }

	// richmondspca.org has &quot; in some tags and we do not like
	// expanding that to " because it messes up XmlNode::getTagLen()
	// and creates big problems. same for www.first-avenue.com. so
	// by setting doSpecial to true we change &lt;, &gt; and &quot; to
	// [ ] and ' which have no meaning in html per se.
	bool doSpecial = ( m_contentType != CT_XML );

	// . now decode those html entites into utf8 so that we never have to
	//   check for html entities anywhere else in the code. a big win!!
	// . doSpecial = true, so that &lt;, &gt;, &amp; and &quot; are
	//   encoded into high value
	//   utf8 chars so that Xml::set(), etc. still work properly and don't
	//   add any more html tags than it should
	// . this will decode in place
	// . MDW: 9/28/2014. no longer do for xml docs since i added
	//   hashXmlFields()
	int32_t n = m_expandedUtf8ContentSize - 1;
	if ( m_contentType != CT_XML ) {
		n = htmlDecode( m_expandedUtf8Content, m_expandedUtf8Content, m_expandedUtf8ContentSize - 1,
						doSpecial );
	}

	// can't exceed this! n does not include the final \0 even though
	// we do write it out.
	if ( n > m_expandedUtf8ContentSize-1 ) {g_process.shutdownAbort(true); }

	// sanity
	if ( m_expandedUtf8Content[n] != '\0' ) { g_process.shutdownAbort(true); }

	// finally transform utf8 apostrophe's into regular apostrophes
	// to make parsing easier
	uint8_t *p   = (uint8_t *)m_expandedUtf8Content;
	uint8_t *dst = (uint8_t *)m_expandedUtf8Content;
	uint8_t *pend = p + n;

	for ( ; *p ; p += size ) {
		size = getUtf8CharSize(p);

		// quick copy
		if ( size == 1 ) {
			*dst++ = *p;
			continue;
		}

		// check for crazy apostrophes
		if ( p[0] == 0xe2 && p[1] == 0x80 &&
		     ( p[2] == 0x98 ||    // U+2018 LEFT SINGLE QUOTATION MARK
		       p[2] == 0x99 ||    // U+2019 RIGHT SINGLE QUOTATION MARK
		       p[2] == 0x9b ) ) { // U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK
			*dst++ = '\'';
			continue;
		}

		// utf8 control character?
		if ( p[0] == 0xc2 &&
		     p[1] >= 0x80 && p[1] <= 0x9f ) {
			*dst++ = ' ';
			continue;
		}

		// double quotes in utf8
		// DO NOT do this if type JSON!! json uses quotes as string delimiters
		if (m_contentType != CT_JSON) {
			if ( p[0] == 0xe2 && p[1] == 0x80 ) {
				if ( p[2] == 0x9c ) {
					*dst++ = '\"';
					continue;
				}
				if ( p[2] == 0x9d ) {
					*dst++ = '\"';
					continue;
				}
			}
		}

		// and crazy hyphens (8 - 10pm)
		if ( ( p[0] == 0xc2 && p[1] == 0xad ) ||                  // U+00AD SOFT HYPHEN
		     ( p[0] == 0xe2 && p[1] == 0x80 && p[2] == 0x93 ) ||  // U+2013 EN DASH
			 ( p[0] == 0xe2 && p[1] == 0x80 && p[2] == 0x94 ) ) { // U+2014 EM DASH
			*dst++ = '-';
			continue;
		}

		// . convert all utf8 white space to ascii white space
		// . should benefit the string matching algo in
		//   XmlDoc::getEventSummary() which needs to skip spaces
		if ( ! g_map_is_ascii[(unsigned char)*p]  &&
		     is_wspace_utf8(p) ) {
			*dst++ = ' ';
			continue;
		}

		// otherwise, just copy it
		gbmemcpy(dst,p,size);
		dst += size;
	}

	// null term
	*dst++ = '\0';

	// now set it up
	ptr_utf8Content  = (char *)m_expandedUtf8Content;
	size_utf8Content = (char *)dst - m_expandedUtf8Content;

	// sanity -- skipped over the \0???
	if ( p > pend ) { g_process.shutdownAbort(true); }

	// sanity check
	if ( ptr_utf8Content && ptr_utf8Content[size_utf8Content-1] ) {
		g_process.shutdownAbort(true); }

	m_utf8ContentValid = true;
	return &ptr_utf8Content;
}

// *pend should be \0
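// . roughly: punctuation is skipped, runs of digits collapse to a single
//   '1', and month/weekday abbreviations (see s_skips below) collapse to a
//   single 'X', so dates and clocks on a page should not change the hash
// . everything else is xor'd in with a position-dependent g_hashtab entry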
int32_t getContentHash32Fast ( unsigned char *p , int32_t plen ) {
	// sanity
	if ( ! p ) return 0;
	if ( plen <= 0 ) return 0;
	if ( p[plen] != '\0' ) { g_process.shutdownAbort(true); }
	unsigned char *pend = p + plen;

	static bool s_init = false;
	static char s_qtab0[256];
	static char s_qtab1[256];
	static char s_qtab2[256];
	static const char * const s_skips[] = {
		"jan",
		"feb",
		"mar",
		"apr",
		"may",
		"jun",
		"jul",
		"aug",
		"sep",
		"oct",
		"nov",
		"dec",
		"sun",
		"mon",
		"tue",
		"wed",
		"thu",
		"fri",
		"sat" };
	if ( ! s_init ) {
		// only call this crap once
		s_init = true;
		// clear up
		memset(s_qtab0,0,256);
		memset(s_qtab1,0,256);
		memset(s_qtab2,0,256);
		for ( int32_t i = 0 ; i < 19  ; i++ ) {
			unsigned char *s = (unsigned char *)s_skips[i];
			s_qtab0[(unsigned char)to_lower_a(s[0])] = 1;
			s_qtab0[(unsigned char)to_upper_a(s[0])] = 1;
			// do the quick hash
			unsigned char qh = to_lower_a(s[0]);
			qh ^= to_lower_a(s[1]);
			qh <<= 1;
			qh ^= to_lower_a(s[2]);
			s_qtab1[qh] = 1;
			// try another hash, the swift hash
			unsigned char sh = to_lower_a(s[0]);
			sh <<= 1;
			sh ^= to_lower_a(s[1]);
			sh <<= 1;
			sh ^= to_lower_a(s[2]);
			s_qtab2[sh] = 1;
		}
	}
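	// worked example (assuming to_lower_a() is plain ascii tolower):
	// for "jan", qh = ('j'^'a')<<1 ^ 'n' = 0x78 and
	//            sh = (('j'<<1 ^ 'a')<<1 ^ 'n') & 0xff = 0x04,
	// so s_qtab1[0x78] and s_qtab2[0x04] are both set; a candidate word
	// below must hit BOTH tables before we treat it as a month/day name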

	bool lastWasDigit = false;
	bool lastWasPunct = true;
	uint32_t h = 0LL;
	//char  size = 0;
	unsigned char pos = 0;
	for ( ; p < pend ; p++ ) { //  += size ) {
		if ( ! is_alnum_a ( *p ) ) {
			lastWasDigit = false;
			lastWasPunct = true;
			continue;
		}

		// if its a digit, call it 1
		if ( is_digit(*p) ) {
			// skip consecutive digits
			if ( lastWasDigit ) continue;
			// xor in a '1'
			h ^= g_hashtab[pos][(unsigned char)'1'];
			pos++;
			lastWasDigit = true;
			continue;
		}

		// reset
		lastWasDigit = false;

		// exclude days of the month or week so clocks do
		// not affect this hash
		if ( s_qtab0[p[0]] && lastWasPunct && p[1] && p[2] ) {
			// quick hash
			unsigned char qh = to_lower_a(p[0]);
			qh ^= to_lower_a(p[1]);
			qh <<= 1;
			qh ^= to_lower_a(p[2]);
			// look that up
			if ( ! s_qtab1[qh] ) goto skip;
			// try another hash, the swift hash
			unsigned char sh = to_lower_a(p[0]);
			sh <<= 1;
			sh ^= to_lower_a(p[1]);
			sh <<= 1;
			sh ^= to_lower_a(p[2]);
			if ( ! s_qtab2[sh] ) goto skip;
			// ok, probably a match..
			unsigned char *s = p + 3;
			// skip to end of word
			for ( ; s < pend ; s++ ) {
				if ( ! is_alnum_a ( *s ) )
					break;
			}

			// advance p now
			p = s;

			// hash as one type of thing...
			h ^= g_hashtab[pos][(unsigned char)'X'];

			pos++;
			continue;
		}

	skip:
		// reset this
		lastWasPunct = false;
		// xor this in right
		h ^= g_hashtab[pos][p[0]];
		pos++;
		// assume ascii or latin1
		continue;
	}
	return h;
}

int32_t *XmlDoc::getContentHash32 ( ) {
	// return it if we got it
	if ( m_contentHash32Valid ) return &m_contentHash32;
	setStatus ( "getting contenthash32" );

	uint8_t *ct = getContentType();
	if ( ! ct || ct == (void *)-1 ) return (int32_t *)ct;

	// we do not hash the url/resolved_url/html fields in diffbot json
	// because the url field is a mirror of the url and the html field
	// is redundant and would slow us down
	if ( *ct == CT_JSON )
		return getContentHashJson32();

	// . get the content. get the pure untouched content!!!
	// . gotta be pure since that is what Msg13.cpp computes right
	//   after it downloads the doc...
	// . if iframes are present, msg13 gives up
	char **pure = getContent();
	if ( ! pure || pure == (char **)-1 ) return (int32_t *)pure;

	unsigned char *p = (unsigned char *)(*pure);
	int32_t plen = m_contentLen;//size_utf8Content - 1;

	// no content means no hash32
	if ( plen <= 0 ) {//ptr_utf8Content ) {
		m_contentHash32 = 0;
		m_contentHash32Valid = true;
		return &m_contentHash32;
	}

	// we set m_contentHash32 in ::hashJSON() below because it is special
	// for diffbot since it ignores certain json fields like url: and the
	// fields are independent, and numbers matter, like prices

	// *pend should be \0
	m_contentHash32 = getContentHash32Fast ( p , plen );
	// validate
	m_contentHash32Valid = true;
	return &m_contentHash32;
}

// we do not hash the url/resolved_url/html fields in diffbot json
// because the url field is a mirror of the url and the html field
// is redundant and would slow us down
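// . roughly, for a reply like {"title":"Foo","offerPrice":"1.99"} the hash is
//     hash32h(hash32("title"),hash32("Foo")) ^
//     hash32h(hash32("offerPrice"),hash32("1.99"))
//   so field order does not matter but field names and values both do
//   (numbers are hashed via their string form from getValueAsString())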
int32_t *XmlDoc::getContentHashJson32 ( ) {

	if ( m_contentHash32Valid ) return &m_contentHash32;

	// use new json parser
	Json *jp = getParsedJson();
	if ( ! jp || jp == (void *)-1 ) return (int32_t *)jp;

	JsonItem *ji = jp->getFirstItem();
	int32_t totalHash32 = 0;

	//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);

	for ( ; ji ; ji = ji->m_next ) {
		// skip if not number or string
		if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
			continue;

		char *topName = NULL;

		// what name level are we?
		int32_t numNames = 1;
		JsonItem *pi = ji->m_parent;
		for ( ; pi ; pi = pi->m_parent ) {
			// empty name?
			if ( ! pi->m_name ) continue;
			if ( ! pi->m_name[0] ) continue;
			topName = pi->m_name;
			numNames++;
		}

		// if we are the diffbot reply "html" field do not hash this
		// because it is redundant and it hashes html tags etc.!
		// plus it slows us down a lot and bloats the index.
		if ( ji->m_name && numNames==1 &&
		     strcmp(ji->m_name,"html") == 0 )
			continue;

		if ( ji->m_name && numNames==1 &&
		     strcmp(ji->m_name,"url") == 0 )
			continue;

		if ( ji->m_name && numNames==1 &&
		     strcmp(ji->m_name,"pageUrl") == 0 )
			continue;

		if ( ji->m_name && numNames==1 &&
		     strcmp(ji->m_name,"resolved_url") == 0 )
			continue;

		if ( topName && strcmp(topName,"stats") == 0 )
			continue;

		if ( topName && strcmp(topName,"queryString") == 0 )
			continue;

		if ( topName && strcmp(topName,"nextPages") == 0 )
			continue;

		if ( topName && strcmp(topName,"textAnalysis") == 0 )
			continue;

		if ( topName && strcmp(topName,"links") == 0 )
			continue;


		// hash the fully compound name
		int32_t nameHash32 = 0;
		JsonItem *p = ji;
		char *lastName = NULL;
		for ( ; p ; p = p->m_parent ) {
			// empty name?
			if ( ! p->m_name ) continue;
			if ( ! p->m_name[0] ) continue;
			// dup? can happen with arrays. parent of string
			// in object, has same name as his parent, the
			// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
			if ( p->m_name == lastName ) continue;
			// update
			lastName = p->m_name;
			// hash it up
			nameHash32 = hash32(p->m_name,p->m_nameLen,nameHash32);
		}

		//
		// now Json.cpp decodes and stores the value into
		// a buffer, so ji->getValue() should be decoded completely
		//

		// . get the value of the json field
		// . if it's a number or bool it converts into a string
		int32_t vlen;
		char *val = ji->getValueAsString( &vlen );

		//
		// for deduping search results we set m_contentHash32 here for
		// diffbot json objects.
		//
		// we use this hash for setting EDOCUNCHANGED when reindexing
		// a diffbot reply. we also use to see if the diffbot reply
		// is a dup with another page in the index. thirdly, we use
		// to dedup search results, which could be redundant because
		// of our spider-time deduping.
		//
		// make the content hash so we can set m_contentHash32
		// for deduping. do an exact hash for now...
		int32_t vh32 = hash32 ( val , vlen , m_niceness );
		// combine
		int32_t combined32 = hash32h ( nameHash32 , vh32 );
		// accumulate field/val pairs order independently
		totalHash32 ^= combined32;
		// debug note
		//logf(LOG_DEBUG,"ch32: field=%s nh32=%" PRIu32" vallen=%" PRId32,
		//     ji->m_name,
		//     nameHash32,
		//     vlen);
	}

	m_contentHash32 = totalHash32;
	m_contentHash32Valid = true;
	return &m_contentHash32;
}

int32_t XmlDoc::getHostHash32a ( ) {
	if ( m_hostHash32aValid ) return m_hostHash32a;
	m_hostHash32aValid = true;
	Url *f = getFirstUrl();
	m_hostHash32a = f->getHostHash32();
	return m_hostHash32a;
}

int32_t XmlDoc::getDomHash32( ) {
	if ( m_domHash32Valid ) return m_domHash32;
	m_domHash32Valid = true;
	Url *f = getFirstUrl();
	m_domHash32 = hash32 ( f->getDomain(), f->getDomainLen() );
	return m_domHash32;
}

// . this will be the actual pnm data of the image thumbnail
// . you can inline it in an image tag like
//   <img src="data:image/png;base64,iVBORw0...."/>
//   background-image:url(data:image/png;base64,iVBORw0...);
// . FORMAT of ptr_imageData:
//   <origimageUrl>\0<4bytethumbwidth><4bytethumbheight><thumbnaildatajpg>
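// . the buffer can be read back through ThumbnailArray, e.g. as logIt() does:
//     ThumbnailArray *ta = (ThumbnailArray *)ptr_imageData;
//     ThumbnailInfo  *ti = ta->getThumbnailInfo(0);
//     // ti->getUrl(), ti->m_dx, ti->m_dy, ti->m_dataSize, ...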
char **XmlDoc::getThumbnailData ( ) {
	if ( m_imageDataValid ) return &ptr_imageData;
	Images *images = getImages();
	if ( ! images || images == (Images *)-1 ) return (char **)images;
	ptr_imageData  = NULL;
	size_imageData = 0;
	m_imageDataValid = true;
	if ( ! images->m_imageBufValid ) return &ptr_imageData;
	if ( images->m_imageBuf.length() <= 0 ) return &ptr_imageData;
	// this buffer is a ThumbnailArray
	ptr_imageData  = images->m_imageBuf.getBufStart();
	size_imageData = images->m_imageBuf.length();
	return &ptr_imageData;
}

Images *XmlDoc::getImages ( ) {
	if ( m_imagesValid ) return &m_images;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	if ( ! cr->m_makeImageThumbnails ) {
		m_images.reset();
		m_imagesValid = true;
		return &m_images;
	}

	setStatus ( "getting thumbnail" );

	Words *words = getWords();
	if ( ! words || words == (Words *)-1 ) return (Images *)words;
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (Images *)xml;
	Sections *sections = getSections();
	if ( ! sections || sections==(Sections *)-1) return (Images *)sections;
	char *site = getSite ();
	if ( ! site || site == (char *)-1 ) return (Images *)site;
	int64_t *d = getDocId();
	if ( ! d || d == (int64_t *)-1 ) return (Images *)d;
	Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1 ) return (Images *)cu;

	// . this does not block or anything
	// . if we are a diffbot json reply it should just use the primary
	//   image, if any, as the only candidate
	m_images.setCandidates ( cu , words , xml , sections );

	setStatus ("getting thumbnail");

	// assume valid
	m_imagesValid = true;

	// now get the thumbnail
	if ( ! m_images.getThumbnail ( site         ,
				       strlen(site) ,
				       *d           ,
				       this         ,
				       cr->m_collnum       ,
				       m_masterState,
				       m_masterLoop ) )
		return (Images *)-1;

	return &m_images;
}


// . get different attributes of the Links as vectors
// . these are 1-1 with the Links::m_linkPtrs[] array
TagRec ***XmlDoc::getOutlinkTagRecVector () {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	// error?
	if ( m_outlinkTagRecVectorValid && m_msge0.getErrno() ) {
		g_errno = m_msge0.getErrno();
		logTrace( g_conf.m_logTraceXmlDoc, "END, g_errno %" PRId32, g_errno);
		return NULL;
	}

	// if not using fake ips, give them the real tag rec vector
	if ( m_outlinkTagRecVectorValid )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, already valid (and not fake IPs)" );
		return m_msge0.getTagRecPtrsPtr();
	}

	Links *links = getLinks();
	if ( ! links || links == (void *) -1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getLinks returned -1" );
		return (TagRec ***)links;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getCollRec failed" );
		return NULL;
	}


	// update status msg
	setStatus ( "getting outlink tag rec vector" );
	TagRec *gr = getTagRec();
	if ( ! gr || gr == (TagRec *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getTagRec returned -1" );
		return (TagRec ***)gr;
	}

	// assume valid
	m_outlinkTagRecVectorValid = true;
	// go get it
	if ( ! m_msge0.getTagRecs ( const_cast<const char **>(links->m_linkPtrs),
				    links->m_linkFlags ,
				    links->m_numLinks  ,
				    // make it point to this basetagrec if
				    // the LF_SAMEHOST flag is set for the link
				    gr ,
				    cr->m_collnum             ,
				    m_niceness         ,
				    m_masterState      ,
				    m_masterLoop       )) {
		// sanity check
		if ( m_doingConsistencyCheck ) { g_process.shutdownAbort(true); }

		// we blocked
		logTrace( g_conf.m_logTraceXmlDoc, "END, msge0.getTagRecs blocked" );
		return (TagRec ***)-1;
	}

	// error?
	if ( g_errno )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, g_errno %" PRId32" after msge0.getTagRecs", g_errno);
		return NULL;
	}

	// or this?
	if ( m_msge0.getErrno() ) {
		g_errno = m_msge0.getErrno();
		logTrace( g_conf.m_logTraceXmlDoc, "END, m_msge0.m_errno=%" PRId32, g_errno);
		return NULL;
	}
	// set it
	//m_outlinkTagRecVector = m_msge0.m_tagRecPtrs;
	// ptr to a list of ptrs to tag recs
	logTrace( g_conf.m_logTraceXmlDoc, "END, got list" );
	return m_msge0.getTagRecPtrsPtr();
}


int32_t **XmlDoc::getOutlinkFirstIpVector () {

	Links *links = getLinks();
	if ( ! links ) return NULL;

	// error?
	if ( m_outlinkTagRecVectorValid && m_msge1.getErrno() ) {
		g_errno = m_msge1.getErrno();
		logTrace( g_conf.m_logTraceXmlDoc, "END, g_errno %" PRId32, g_errno);
		return NULL;
	}

	// return msge1's buf otherwise
	if ( m_outlinkIpVectorValid )
		return m_msge1.getIpBufPtr();

	// should we have some kinda error for msge1?
	//if ( m_outlinkIpVectorValid && m_msge1.m_errno ) {
	//	g_errno = m_msge1.m_errno;
	//	return NULL;
	//}

	// . we now scrounge them from TagRec's "firstip" tag if there!
	// . that way even if a domain changes its ip we still use the
	//   original ip, because the only reason we need this ip is for
	//   deciding which group of hosts will store this SpiderRequest and
	//   we use that for throttling, so we have to be consistent!!!
	// . we never add -1 or 0 ips to tagdb though.... (NXDOMAIN,error...)
	// . uses m_msgeForTagRecs for this one
	TagRec ***grv = getOutlinkTagRecVector();
	if ( ! grv || grv == (void *)-1 ) return (int32_t **)grv;
	// note it
	setStatus ( "getting outlink first ip vector" );
	// assume valid
	m_outlinkIpVectorValid = true;
	// sanity check
	//if ( ! m_spideredTimeValid ) { g_process.shutdownAbort(true); }
	// use this
	int32_t nowGlobal = getSpideredTime();//m_spideredTime;
	// add tags to tagdb?
	bool addTags = true;
	//if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false;
	if ( getIsPageParser() ) addTags = false;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// . go get it
	// . this will now update Tagdb with the "firstip" tags if it should!!
	// . this just dns looks up the DOMAINS of each outlink because these
	//   are *first* ips and ONLY used by Spider.cpp for throttling!!!
	if ( ! m_msge1.getFirstIps ( *grv               ,
				     const_cast<const char**>(links->m_linkPtrs),
				     links->m_linkFlags ,
				     links->m_numLinks  ,
				     m_niceness         ,
				     m_masterState      ,
				     m_masterLoop       ,
				     nowGlobal          )) {
		// sanity check
		if ( m_doingConsistencyCheck ) { g_process.shutdownAbort(true); }
		// we blocked
		return (int32_t **)-1;
	}
	// error?
	if ( g_errno ) return NULL;
	// or this?
	if ( m_msge1.getErrno() ) {
		g_errno = m_msge1.getErrno();
		logTrace( g_conf.m_logTraceXmlDoc, "END, m_msge1.m_errno=%" PRId32, g_errno);
		return NULL;
	}
	// . ptr to a list of ptrs to tag recs
	// . ip will be -1 on error
	return m_msge1.getIpBufPtr();
}

int32_t *XmlDoc::getUrlFilterNum ( ) {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	// return it if already set
	if ( m_urlFilterNumValid ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END. already valid: %" PRId32, m_urlFilterNum );
		return &m_urlFilterNum;
	}

	// note that
	setStatus ( "getting url filter row num");

	// . make the partial new spider rec
	// . we need this for matching filters like lang==zh_cn
	// . crap, but then it matches "hasReply" when it should not
	// . PROBLEM! this is the new reply not the OLD reply, so it may
	//   end up matching a DIFFERENT url filter num then what it did
	//   before we started spidering it...
	//SpiderReply *newsr = getNewSpiderReply ( );
	// note it
	//if ( ! newsr )
	//	log("doc: getNewSpiderReply: %s",mstrerror(g_errno));
	//if ( ! newsr || newsr == (void *)-1 ) return (int32_t *)newsr;

	// need language i guess
	uint8_t *langId = getLangId();
	if ( ! langId || langId == (uint8_t *)-1 ) {
//		log("build: failed to get url filter for xmldoc %s - could not get language id",
//		    m_firstUrl.getUrl());
		logTrace( g_conf.m_logTraceXmlDoc, "END. unable to get langId" );
		return (int32_t *)langId;
	}


	// make a fake one for now
	// SpiderReply fakeReply;
	// // fix errors
	// fakeReply.reset();
	// fakeReply.m_isIndexedINValid = true;
	// // just language for now, so we can FILTER by language
	// if ( m_langIdValid ) fakeReply.m_langId = m_langId;

	int32_t langIdArg = -1;
	if ( m_langIdValid ) langIdArg = m_langId;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;
	// this must be valid
	//if ( ! m_spideredTimeValid ) { g_process.shutdownAbort(true); }
	int32_t spideredTime = getSpideredTime();
	// get the spider request
	SpiderRequest *oldsr = &m_sreq;
	// null it out if invalid...
	if ( ! m_sreqValid ) oldsr = NULL;
	// do not set the spideredTime in the spiderReply to 0
	// so we do not trigger the lastSpiderTime
	//int32_t saved = newsr->m_spideredTime;
	//newsr->m_spideredTime = 0;
	//
	// PROBLEM: we end up matching "isIndexed" in the url filters
	// even if this is a NEW document because we pass it in the spider
	// reply that we generate now even though another spider reply
	// may not exist.
	//
	// SOLUTION: just do not supply a spider reply, we only seem to
	// use the urlfilternum to get a diffbot api url OR to see if the
	// document is banned/filtered so we should delete it. otherwise
	// we were supplying "newsr" above...

	// . look it up
	// . use the old spidered date for "nowGlobal" so we can be consistent
	//   for injecting into the "qatest123" coll
	int32_t ufn = ::getUrlFilterNum(oldsr, NULL, spideredTime, false, cr, false, NULL, langIdArg);

	// put it back
	//newsr->m_spideredTime = saved;

	// bad news?
	if ( ufn < 0 ) {
		log("build: failed to get url filter for xmldoc %s",
		    m_firstUrl.getUrl());
		//g_errno = EBADENGINEER;
		//return NULL;
	}


	// store it
	m_urlFilterNum      = ufn;
	m_urlFilterNumValid = true;

	logTrace( g_conf.m_logTraceXmlDoc, "END. returning %" PRId32, m_urlFilterNum );

	return &m_urlFilterNum;
}

// . "u" and "site" may or may not start with http:// or https://; any
//   protocol prefix is stripped off below before comparing
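// . e.g. isSiteRootFunc("http://example.com/"      ,"example.com/") is true
//        isSiteRootFunc("http://example.com/a.html","example.com/") is false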
static bool isSiteRootFunc ( const char *u , const char *site ) {
	// get length of each
	int32_t slen = strlen(site);//m_siteLen;
	int32_t ulen = strlen(u);
	// "site" may or may not end in /, so remove that
	if ( site[slen-1] == '/' ) slen--;
	// same for url
	if ( u[ulen-1] == '/' ) ulen--;
	// skip http:// or https://
	if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
	if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
	if ( strncmp(site,"http://" ,7)==0 ) { site += 7; slen -= 7; }
	if ( strncmp(site,"https://",8)==0 ) { site += 8; slen -= 8; }
	// subtract default.asp etc. from "u"
	//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.asp",11)==0 )
	//	ulen -= 11;
	//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.html",12)==0 )
	//	ulen -= 12;
	//if ( ulen > 15 && strncasecmp(u+ulen-11,"index.html",10)==0 )
	//	ulen -= 10;
	// now they must match exactly
	if ( slen == ulen && ! strncmp ( site, u, ulen ) ) return true;
	// all done
	return false;
}

static bool isSiteRootFunc3 ( const char *u , int32_t siteRootHash32 ) {
	// get length of each
	int32_t ulen = strlen(u);
	// remove trailing /
	if ( u[ulen-1] == '/' ) ulen--;
	// skip http:// or https://
	if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
	if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
	// now they must match exactly
	int32_t sh32 = hash32(u,ulen);
	return ( sh32 == siteRootHash32 );
}

char *XmlDoc::getIsSiteRoot ( ) {
	if ( m_isSiteRootValid ) return &m_isSiteRoot2;
	// get our site
	char *site = getSite ();
	if ( ! site || site == (char *)-1 ) return (char *)site;
	// get our url without the http:// or https://
	const char *u = getFirstUrl()->getHost();
	if ( ! u ) {
		g_errno = EBADURL;
		return NULL;
	}
	// assume valid now
	m_isSiteRootValid = true;
	// get it
	bool isRoot = isSiteRootFunc ( u , site );
	// seems like https://twitter.com/ is not getting set to root
	if ( m_firstUrl.getPathDepth(true) == 0  && ! m_firstUrl.isCgi() )
		isRoot = true;
	m_isSiteRoot2 = m_isSiteRoot = isRoot;
	return &m_isSiteRoot2;
}

int8_t *XmlDoc::getHopCount ( ) {
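	// rough precedence (see the checks below): ping servers and site
	// roots get hop count 0; otherwise we take the spider request's hop
	// count when it is known (>= 0), defaulting to 1, and cap it at 0x7f
	// to fit the one-byte TitleRec field; rss urls above 1 are forced
	// back down to 1 so aggregator sites are not hit too hard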
	// return now if valid
	if ( m_hopCountValid ) return &m_hopCount;

	setStatus ( "getting hop count" );

	// the unredirected url
	Url *f = getFirstUrl();
	// get url as string, skip "http://" or "https://"
	//char *u = f->getHost();
	// if we match site, we are a site root, so hop count is 0
	//char *isr = getIsSiteRoot();
	//if ( ! isr || isr == (char *)-1 ) return (int8_t *)isr;
	//if ( *isr ) {
	//	m_hopCount      = 0;
	//	m_hopCountValid = true;
	//	return &m_hopCount;
	//}
	// ping servers have 0 hop counts
	if ( f->isPingServer() ) {
		// log("xmldoc: hc2 is 0 (pingserver) %s",m_firstUrl.m_url);
		m_hopCount      = 0;
		m_hopCountValid = true;
		return &m_hopCount;
	}
	char *isRSS = getIsRSS();
	if ( ! isRSS || isRSS == (char *)-1) return (int8_t *)isRSS;
	// check for site root
	TagRec *gr = getTagRec();
	if ( ! gr || gr == (TagRec *)-1 ) return (int8_t *)gr;
	// and site roots
	char *isSiteRoot = getIsSiteRoot();
	if (!isSiteRoot ||isSiteRoot==(char *)-1) return (int8_t *)isSiteRoot;
	if ( *isSiteRoot ) {
		// log("xmldoc: hc1 is 0 (siteroot) %s",m_firstUrl.m_url);
		m_hopCount      = 0;
		m_hopCountValid = true;
		return &m_hopCount;
	}
	// make sure m_minInlinkerHopCount is valid
	LinkInfo *info1 = getLinkInfo1();
	if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1;
	// . fix bad original hop counts
	// . assign this hop count from the spider rec
	int32_t origHopCount = -1;
	if ( m_sreqValid ) origHopCount = m_sreq.m_hopCount;
	// derive our hop count from our parent hop count
	int32_t hc = -1;
	// . BUT use inlinker if better
	// . if m_linkInfo1Valid is true, then m_minInlinkerHopCount is valid
	// if ( m_minInlinkerHopCount + 1 < hc && m_minInlinkerHopCount >= 0 )
	// 	hc = m_minInlinkerHopCount + 1;
	// or if parent is unknown, but we have a known inlinker with a
	// valid hop count, use the inlinker hop count then
	// if ( hc == -1 && m_minInlinkerHopCount >= 0 )
	// 	hc = m_minInlinkerHopCount + 1;
	// if ( origHopCount == 0 )
	// 	log("xmldoc: hc3 is 0 (spiderreq) %s",m_firstUrl.m_url);
	// or use our hop count from the spider rec if better
	if ( origHopCount < hc && origHopCount >= 0 )
		hc = origHopCount;
	// or if neither parent or inlinker was valid hop count
	if ( hc == -1 && origHopCount >= 0 )
		hc = origHopCount;
	// if we have no hop count at this point, i guess just pick 1!
	if ( hc == -1 )
		hc = 1;
	// truncate, hop count is only one byte in the TitleRec.h::m_hopCount
	if ( hc > 0x7f ) hc = 0x7f;

	// and now so do rss urls.
	if ( *isRSS && hc > 1 ) {
		// force it to one, not zero, otherwise it gets pounded
		// too hard on the aggregator sites. spider priority
		// is too high
		m_hopCount      = 1;
		m_hopCountValid = true;
		return &m_hopCount;
	}

	// unknown hop counts (-1) are propagated, except for root urls
	m_hopCountValid = true;
	m_hopCount      = hc;
	return &m_hopCount;
}

// set to false for injecting and validate it... if &spiderlinks=0
// should we spider links?
char *XmlDoc::getSpiderLinks ( ) {
	// set it to false on issues
	//if ( m_indexCode ) {
	//	m_spiderLinks      = false;
	//	m_spiderLinks2     = false;
	//	m_spiderLinksValid = true ; }

	// this slows importing down because we end up doing ip lookups
	// for every outlink if "firstip" not in tagdb.
	// shoot. set2() already sets m_spiderLinksValid to true so we
	// have to override if importing.
	if ( m_isImporting && m_isImportingValid ) {
		m_spiderLinks  = (char)false;
		m_spiderLinks2 = (char)false;
		m_spiderLinksValid = true;
		return &m_spiderLinks2;
	}

	// return the valid value
	if ( m_spiderLinksValid ) return &m_spiderLinks2;

	setStatus ( "getting spider links flag");

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return (char *)cr;

	int32_t *ufn = getUrlFilterNum();
	if ( ! ufn || ufn == (void *)-1 ) return (char *)ufn;

	// if url filters forbids it
	if ( ! cr->m_harvestLinks[*ufn] ) {
		m_spiderLinksValid = true;
		m_spiderLinks2 = (char)false;
		m_spiderLinks  = (char)false;
		return &m_spiderLinks2;
	}

	// check the xml for a meta robots tag
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;

	// assume true
	m_spiderLinks = (char)true;

	// or if meta tag says not to
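	// e.g. <meta name="robots" content="noindex,nofollow"> or a bot
	// specific <meta name="..." content="none"> (the name checked is
	// whatever g_conf.m_spiderBotName is set to) will clear the flag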
	char buf1 [256];
	char buf2 [256];
	buf1[0] = '\0';
	buf2[0] = '\0';
	xml->getMetaContent ( buf1, 255 , "robots" , 6 );
	xml->getMetaContent ( buf2, 255 , g_conf.m_spiderBotName, strlen(g_conf.m_spiderBotName) );

	if ( strstr ( buf1 , "nofollow" ) ||
	     strstr ( buf2 , "nofollow" ) ||
	     strstr ( buf1 , "none"     ) ||
	     strstr ( buf2 , "none"     ) )
		m_spiderLinks = (char)false;

	// spider links if not using robots.txt
	if ( ! m_useRobotsTxt )
		m_spiderLinks = (char)true;

	// spider request forbade it? diffbot.cpp crawlbot api when
	// specifying urldata (list of urls to add to spiderdb) usually
	// they do not want the links crawled i'd imagine.
	if ( m_sreqValid && m_sreq.m_avoidSpiderLinks )
		m_spiderLinks = (char)false;


	// also check in url filters now too


	// set shadow member
	m_spiderLinks2 = m_spiderLinks;
	// validate
	m_spiderLinksValid = true;
	return &m_spiderLinks2;
}

int32_t *XmlDoc::getSpiderPriority ( ) {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	if ( m_priorityValid ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END. already valid: %" PRId32, m_priority );
		return &m_priority;
	}

	setStatus ("getting spider priority");
	// need tagrec to see if banned
	TagRec *gr = getTagRec();
	if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
	// this is an automatic ban!
	if ( gr->getLong("manualban",0) ) {
		m_priority      = -3;//SPIDER_PRIORITY_BANNED;
		m_priorityValid = true;
		logTrace( g_conf.m_logTraceXmlDoc, "END. Manual ban" );
		return &m_priority;
	}
	int32_t *ufn = getUrlFilterNum();
	if ( ! ufn || ufn == (void *)-1 ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END. Invalid ufn" );
		return (int32_t *)ufn;
	}

	// sanity check
	if ( *ufn < 0 ) { g_process.shutdownAbort(true); }
	CollectionRec *cr = getCollRec();
	if ( ! cr ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END. No collection" );
		return NULL;
	}

	m_priority = cr->m_spiderPriorities[*ufn];

	// continue to use -3 to indicate SPIDER_PRIORITY_FILTERED for now
	if ( cr->m_forceDelete[*ufn] ) {
		logTrace( g_conf.m_logTraceXmlDoc, "force delete" );
		m_priority = -3;
	}

	logTrace( g_conf.m_logTraceXmlDoc, "END. ufn=%" PRId32" priority=%" PRId32, *ufn, m_priority );
	m_priorityValid = true;
	return &m_priority;
}

void XmlDoc::logIt (SafeBuf *bb ) {
	// set errCode
	int32_t errCode = m_indexCode;
	if ( ! errCode && g_errno ) {
		errCode = g_errno;
	}

	// were we new?
	bool isNew = true;
	if ( m_sreqValid && m_sreq.m_hadReply ) isNew = false;

	// download time
	unsigned took = 0;
	if ( m_downloadStartTimeValid ) {
		if ( m_downloadEndTimeValid ) {
			took = static_cast<unsigned>( m_downloadEndTime - m_downloadStartTime );
		} else {
			took = static_cast<unsigned>( gettimeofdayInMilliseconds() - m_downloadStartTime );
		}
	}

	// keep track of stats
	Statistics::register_spider_time(isNew, errCode, m_httpStatus, took);
	Statistics::register_document_encoding(errCode, m_charset, m_langId, m_countryId);

	// do not log if we should not, saves some time
	if ( ! g_conf.m_logSpideredUrls ) return;

	const char *coll = "nuked";
	CollectionRec *cr = getCollRec();
	if ( cr ) coll = cr->m_coll;

	SafeBuf tmpsb;

	// print into this now
	SafeBuf *sb = &tmpsb;
	// log into provided safebuf if not null
	if ( bb ) sb = bb;

	//
	// coll
	//
	sb->safePrintf("coll=%s ",coll);
	sb->safePrintf("collnum=%" PRId32" ",(int32_t)m_collnum);

	//
	// print ip
	//
	if ( m_ipValid ) {
		char ipbuf[16];
		sb->safePrintf("ip=%s ",iptoa(m_ip,ipbuf) );
	}

	if ( m_firstIpValid ) {
		char ipbuf[16];
		sb->safePrintf("firstip=%s ",iptoa(m_firstIp,ipbuf) );
	}

	// . first ip from spider req if it is fake
	// . we end up spidering the same url twice because it will have
	//   different "firstips" in the SpiderRequest key. maybe just
	//   use domain hash instead of firstip, and then let msg13
	//   make queues in the case of hammering an ip, which i think
	//   it already does...
#ifdef _VALGRIND_
	if(m_sreqValid)
		VALGRIND_CHECK_MEM_IS_DEFINED(&m_sreq.m_firstIp,sizeof(m_sreq.m_firstIp));
	if(m_firstIpValid)
		VALGRIND_CHECK_MEM_IS_DEFINED(&m_firstIp,sizeof(m_firstIp));
#endif

	if ( m_sreqValid && m_firstIpValid && m_sreq.m_firstIp != m_firstIp ) {
		char ipbuf[16];
		sb->safePrintf("fakesreqfirstip=%s ",iptoa(m_sreq.m_firstIp,ipbuf) );
	}

	//
	// print when this spider request was added
	//
	//if ( m_sreqValid && m_sreq.m_addedTime ) {
	//	struct tm *timeStruct = gmtime_r( &m_sreq.m_addedTime );
	//	char tmp[64];
	//	strftime(tmp,64,"requestadded=%b-%d-%Y(%H:%M:%S)", timeStruct);
	//	sb->safePrintf("%s(%" PRIu32") ",tmp,m_sreq.m_addedTime);
	//}

	//
	// print spidered time
	//
	//if ( m_spideredTimeValid ) {
	time_t spideredTime = (time_t)getSpideredTime();
	struct tm tm_buf;
	struct tm *timeStruct = gmtime_r(&spideredTime,&tm_buf);
	char tmp[64];
	strftime(tmp,64,"spidered=%b-%d-%Y(%H:%M:%S)", timeStruct );
	sb->safePrintf("%s(%" PRIu32") ",tmp,(uint32_t)spideredTime);

	// when it was scheduled to be spidered
	if ( m_sreqValid && m_sreq.m_addedTime ) {
		time_t ts = m_sreq.m_addedTime;
		struct tm *timeStruct = gmtime_r(&ts,&tm_buf);
		char tmp[64];
		strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
		sb->safePrintf("scheduledtime=%s(%" PRIu32") ",
			       tmp,(uint32_t)m_sreq.m_addedTime);
	}

	// discovery date, first time spiderrequest was added to spiderdb
	if ( m_sreqValid && m_sreq.m_discoveryTime ) {
		time_t ts = m_sreq.m_discoveryTime;
		struct tm *timeStruct = gmtime_r(&ts,&tm_buf);
		char tmp[64];
		strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct );
		sb->safePrintf("discoverydate=%s(%" PRIu32") ",
			       tmp,(uint32_t)m_sreq.m_discoveryTime);
	}

	// print first indexed time
	if ( m_firstIndexedDateValid ) {
		time_t ts = m_firstIndexedDate;
		timeStruct = gmtime_r(&ts,&tm_buf);//m_firstIndexedDate );
		strftime(tmp,64,"firstindexed=%b-%d-%Y(%H:%M:%S)", timeStruct);
		sb->safePrintf("%s(%" PRIu32") ",tmp,
			       (uint32_t)m_firstIndexedDate);
	}


	//if ( ! m_isIndexedValid ) { g_process.shutdownAbort(true); }

	// just use the oldurlfilternum for grepping i guess
	//if ( m_oldDocValid && m_oldDoc )

	// when injecting a request we have no idea if it had a reply or not
	if ( m_sreqValid && m_sreq.m_isInjecting )
		sb->safePrintf("firsttime=? ");
	else if ( m_sreqValid && m_sreq.m_hadReply )
		sb->safePrintf("firsttime=0 ");
	else if ( m_sreqValid )
		sb->safePrintf("firsttime=1 ");
	else
		sb->safePrintf("firsttime=? ");

	//
	// print # of link texts
	//
	if ( m_linkInfo1Valid && ptr_linkInfo1 ) {
		LinkInfo *info = ptr_linkInfo1;
		int32_t nt = info->getNumLinkTexts();
		sb->safePrintf("goodinlinks=%" PRId32" ",nt );
		// new stuff. includes ourselves i think.
		//sb->safePrintf("ipinlinks=%" PRId32" ",info->m_numUniqueIps);
		//sb->safePrintf("cblockinlinks=%" PRId32" ",
		//info->m_numUniqueCBlocks);
	}

	if (  m_docIdValid )
		sb->safePrintf("docid=%" PRIu64" ",m_docId);

	char *u = getFirstUrl()->getUrl();
	int64_t pd = Titledb::getProbableDocId(u);
	int64_t d1 = Titledb::getFirstProbableDocId ( pd );
	int64_t d2 = Titledb::getLastProbableDocId  ( pd );
	sb->safePrintf("probdocid=%" PRIu64" ",pd);
	sb->safePrintf("probdocidmin=%" PRIu64" ",d1);
	sb->safePrintf("probdocidmax=%" PRIu64" ",d2);
	sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);

	if ( m_siteNumInlinksValid ) {
		sb->safePrintf("siteinlinks=%04" PRId32" ",m_siteNumInlinks );
		int32_t sr = ::getSiteRank ( m_siteNumInlinks );
		sb->safePrintf("siterank=%" PRId32" ", sr );
	}

	if ( m_sreqValid )
		sb->safePrintf("pageinlinks=%04" PRId32" ",
			       m_sreq.m_pageNumInlinks);

	// shortcut
	int64_t uh48 = hash64b ( m_firstUrl.getUrl() );
	// mask it
	uh48 &= 0x0000ffffffffffffLL;
	sb->safePrintf ("uh48=%" PRIu64" ",uh48 );


	if ( m_charsetValid )
		sb->safePrintf("charset=%s ",get_charset_str(m_charset));

	if ( m_contentTypeValid )
		sb->safePrintf("ctype=%s ",
			      g_contentTypeStrings [m_contentType]);

	if ( m_langIdValid ) {
		sb->safePrintf( "lang=%02" PRId32"(%s) ", ( int32_t ) m_langId, getLanguageAbbr( m_langId ) );
	}

	if ( m_countryIdValid )
		sb->safePrintf("country=%02" PRId32"(%s) ",(int32_t)m_countryId,
			      g_countryCode.getAbbr(m_countryId));

	if ( m_hopCountValid )
		sb->safePrintf("hopcount=%02" PRId32" ",(int32_t)m_hopCount);


	if ( m_contentValid )
		sb->safePrintf("contentlen=%06" PRId32" ",m_contentLen);

	if ( m_isContentTruncatedValid )
		sb->safePrintf("contenttruncated=%" PRId32" ",(int32_t)m_isContentTruncated);

	if ( m_robotsTxtLenValid )
		sb->safePrintf("robotstxtlen=%04" PRId32" ",m_robotsTxtLen );

	if ( m_isAllowedValid )
		sb->safePrintf("robotsallowed=%i ", (int)m_isAllowed);
	else
		sb->safePrintf("robotsallowed=? " );

	if ( m_contentHash32Valid )
		sb->safePrintf("ch32=%010" PRIu32" ",m_contentHash32);

	if ( m_domHash32Valid )
		sb->safePrintf("dh32=%010" PRIu32" ",m_domHash32);

	if ( m_siteHash32Valid )
		sb->safePrintf("sh32=%010" PRIu32" ",m_siteHash32);

	if ( m_isPermalinkValid )
		sb->safePrintf("ispermalink=%" PRId32" ",(int32_t)m_isPermalink);

	if ( m_isRSSValid )
		sb->safePrintf("isrss=%" PRId32" ",(int32_t)m_isRSS);

	if ( m_linksValid )
		sb->safePrintf("hasrssoutlink=%" PRId32" ",
			      (int32_t)m_links.hasRSSOutlink() );

	if ( m_numOutlinksAddedValid ) {
		sb->safePrintf("outlinksadded=%04" PRId32" ",
			       (int32_t)m_numOutlinksAdded);
	}

	if ( m_metaListValid )
		sb->safePrintf("addlistsize=%05" PRId32" ",
			       (int32_t)m_metaListSize);
	else
		sb->safePrintf("addlistsize=%05" PRId32" ",(int32_t)0);

	if ( m_addedSpiderRequestSizeValid )
		sb->safePrintf("addspiderreqsize=%05" PRId32" ",
			       m_addedSpiderRequestSize);
	else
		sb->safePrintf("addspiderreqsize=%05" PRId32" ",0);


	if ( m_addedSpiderReplySizeValid )
		sb->safePrintf("addspiderrepsize=%05" PRId32" ",
			       m_addedSpiderReplySize);
	else
		sb->safePrintf("addspiderrepsize=%05" PRId32" ",0);


	if ( m_addedStatusDocSizeValid )
		sb->safePrintf("addstatusdocsize=%05" PRId32" ",
			       m_addedStatusDocSize);
	else
		sb->safePrintf("addstatusdocsize=%05" PRId32" ",0);


	if ( m_useSecondaryRdbs ) {
		sb->safePrintf("useposdb=%i ",(int)m_usePosdb);
		sb->safePrintf("usetitledb=%i ",(int)m_useTitledb);
		sb->safePrintf("useclusterdb=%i ",(int)m_useClusterdb);
		sb->safePrintf("usespiderdb=%i ",(int)m_useSpiderdb);
		sb->safePrintf("uselinkdb=%i ",(int)m_useLinkdb);
		if ( cr )
			sb->safePrintf("indexspiderreplies=%i ",(int)
				       cr->m_indexSpiderReplies);
	}

	if ( m_imageDataValid && size_imageData ) {
		// url is in data now
		ThumbnailArray *ta = (ThumbnailArray *)ptr_imageData;
		int32_t nt = ta->getNumThumbnails();
		ThumbnailInfo *ti = ta->getThumbnailInfo(0);
		sb->safePrintf("thumbnail=%s,%" PRId32"bytes,%" PRId32"x%" PRId32",(%" PRId32") ",
			      ti->getUrl(),
			      ti->m_dataSize,
			      ti->m_dx,
			      ti->m_dy,
			      nt);
	}
	else
		sb->safePrintf("thumbnail=none ");


	if ( m_rawUtf8ContentValid )
		sb->safePrintf("rawutf8size=%" PRId32" ",
			       m_rawUtf8ContentSize);
	if ( m_utf8ContentValid )
		sb->safePrintf("utf8size=%" PRId32" ",
			       size_utf8Content);

	// get the content type
	uint8_t ct = CT_UNKNOWN;
	if ( m_contentTypeValid ) ct = m_contentType;

	bool isRoot = false;
	if ( m_isSiteRootValid ) isRoot = m_isSiteRoot;

	// make sure m_minInlinkerHopCount is valid
	LinkInfo *info1 = NULL;
	if ( m_linkInfo1Valid ) info1 = ptr_linkInfo1;


	// hack this kinda
	// . in PageInject.cpp we do not have a valid priority without
	//   blocking because we did a direct injection!
	//   so ignore this!!
	// . a diffbot json object, an xmldoc we set from a json object
	//   in a diffbot reply, is a childDoc (m_isChildDoc) is true
	//   and does not have a spider priority. only the parent doc
	//   that we used to get the diffbot reply (array of json objects)
	//   will have the spider priority
	if ( ! getIsInjecting() ) {
		//int32_t *priority = getSpiderPriority();
		//if ( ! priority ||priority==(void *)-1){g_process.shutdownAbort(true);}
		if ( m_priorityValid )
			sb->safePrintf("priority=%" PRId32" ",
				      (int32_t)m_priority);
	}

	// should be valid since we call getSpiderPriority()
	if ( m_urlFilterNumValid )
		sb->safePrintf("urlfilternum=%" PRId32" ",(int32_t)m_urlFilterNum);


	if ( m_siteValid )
		sb->safePrintf("site=%s ",ptr_site);

	if ( m_isSiteRootValid )
		sb->safePrintf("siteroot=%" PRId32" ",m_isSiteRoot );
	else
		sb->safePrintf("siteroot=? ");

	// like how we index it, do not include the filename. so we can
	// have a bunch of pathdepth 0 urls with filenames like xyz.com/abc.htm
	if ( m_firstUrlValid && m_firstUrl.getUrl() && m_firstUrl.getUrlLen() >= 3 ) {
		int32_t pd = m_firstUrl.getPathDepth(false);
		sb->safePrintf("pathdepth=%" PRId32" ",pd);
	}
	else {
		sb->safePrintf("pathdepth=? ");
	}

	//
	// . sometimes we print these sometimes we do not
	// . put this at the end so we can awk out the above fields reliably
	//

	// print when it was last spidered
	if ( m_oldDocValid && m_oldDoc ) {
		time_t spideredTime = m_oldDoc->getSpideredTime();
		struct tm *timeStruct = gmtime_r(&spideredTime,&tm_buf);
		char tmp[64];
		strftime(tmp,64,"lastindexed=%b-%d-%Y(%H:%M:%S)",timeStruct);
		sb->safePrintf("%s(%" PRIu32") ", tmp,(uint32_t)spideredTime);
	}

	if ( m_linkInfo1Valid && ptr_linkInfo1 && ptr_linkInfo1->hasRSSItem())
		sb->safePrintf("hasrssitem=1 ");

	// was the content itself injected?
	if ( m_wasContentInjected )
		sb->safePrintf("contentinjected=1 ");
	else
		sb->safePrintf("contentinjected=0 ");

	// might have just injected the url and downloaded the content?
	if ( (m_sreqValid && m_sreq.m_isInjecting) ||
	     (m_isInjecting && m_isInjectingValid) )
		sb->safePrintf("urlinjected=1 ");
	else
		sb->safePrintf("urlinjected=0 ");

	if ( m_sreqValid && m_sreq.m_isAddUrl )
		sb->safePrintf("isaddurl=1 ");
	else
		sb->safePrintf("isaddurl=0 ");

	if ( m_sreqValid && m_sreq.m_isPageReindex )
		sb->safePrintf("pagereindex=1 ");

	if ( m_spiderLinksValid && m_spiderLinks )
		sb->safePrintf("spiderlinks=1 ");
	if ( m_spiderLinksValid && ! m_spiderLinks )
		sb->safePrintf("spiderlinks=0 ");


	if ( m_crawlDelayValid && m_crawlDelay != -1 )
		sb->safePrintf("crawldelayms=%" PRId32" ",(int32_t)m_crawlDelay);

	if ( m_recycleContent )
		sb->safePrintf("recycleContent=1 ");

	if ( m_exactContentHash64Valid )
		sb->safePrintf("exactcontenthash=%" PRIu64" ",
			      m_exactContentHash64 );

	// . print percent changed
	// . only print if non-zero!
	if ( m_percentChangedValid && m_oldDocValid && m_oldDoc &&
	     m_percentChanged )
		sb->safePrintf("changed=%.00f%% ",m_percentChanged);

	// only print if different now! good for grepping changes
	if ( m_oldDocValid && m_oldDoc && m_oldDoc->m_docId != m_docId )
		sb->safePrintf("olddocid=%" PRIu64" ",m_oldDoc->m_docId);

	// only print if different now! good for grepping changes
	if ( m_sreqValid && m_sreq.m_ufn >= 0 &&
	     (!m_urlFilterNumValid || m_sreq.m_ufn != m_urlFilterNum) )
		sb->safePrintf("oldurlfilternum=%" PRId32" ",
			      (int32_t)m_sreq.m_ufn);

	if ( m_sreqValid && m_sreq.m_priority >= 0 &&
	     (!m_priorityValid || m_sreq.m_priority != m_priority) )
		sb->safePrintf("oldpriority=%" PRId32" ",
			      (int32_t)m_sreq.m_priority);

	if ( m_oldDoc && m_oldDoc->m_langIdValid &&
	     (!m_langIdValid || m_oldDoc->m_langId != m_langId) )
		sb->safePrintf("oldlang=%02" PRId32"(%s) ",(int32_t)m_oldDoc->m_langId,
			      getLanguageAbbr(m_oldDoc->m_langId));

	if ( m_useSecondaryRdbs &&
	     m_useTitledb &&
	     (!m_langIdValid || m_logLangId != m_langId) )
		sb->safePrintf("oldlang=%02" PRId32"(%s) ",(int32_t)m_logLangId,
			      getLanguageAbbr(m_logLangId));

	if ( m_useSecondaryRdbs &&
	     m_useTitledb &&
	     m_logSiteNumInlinks != m_siteNumInlinks )
		sb->safePrintf("oldsiteinlinks=%04" PRId32" ",m_logSiteNumInlinks);

	if ( m_useSecondaryRdbs &&
	     m_useTitledb &&
	     m_oldDocValid &&
	     m_oldDoc &&
	     strcmp(ptr_site,m_oldDoc->ptr_site) != 0 )
		sb->safePrintf("oldsite=%s ",m_oldDoc->ptr_site);

	if ( m_isAdultValid )
		sb->safePrintf("isadult=%" PRId32" ",(int32_t)m_isAdult);

	// only print if different now! good for grepping changes
	if ( m_oldDocValid && m_oldDoc &&
	     m_oldDoc->m_siteNumInlinks >= 0 &&
	     m_oldDoc->m_siteNumInlinks != m_siteNumInlinks ) {
		int32_t sni = -1;
		if  ( m_oldDoc ) sni = m_oldDoc->m_siteNumInlinks;
		sb->safePrintf("oldsiteinlinks=%04" PRId32" ",sni);
	}


	// Spider.cpp sets m_sreq.m_errCount before adding it to doledb
	if ( m_sreqValid ) // && m_sreq.m_errCount )
		sb->safePrintf("errcnt=%" PRId32" ",(int32_t)m_sreq.m_errCount );
	else
		sb->safePrintf("errcnt=? ");

	if ( ptr_redirUrl ) { // m_redirUrlValid && m_redirUrlPtr ) {
		sb->safePrintf("redir=%s ",ptr_redirUrl);//m_redirUrl.getUrl());
		if ( m_numRedirects > 2 )
			sb->safePrintf("numredirs=%" PRId32" ",m_numRedirects);
	}

	if ( m_canonicalRedirUrlValid && m_canonicalRedirUrlPtr )
		sb->safePrintf("canonredir=%s ",
			       m_canonicalRedirUrlPtr->getUrl());

	if ( m_httpStatusValid && m_httpStatus != 200 )
		sb->safePrintf("httpstatus=%" PRId32" ",(int32_t)m_httpStatus);

	if ( m_updatedMetaData )
		sb->safePrintf("updatedmetadata=1 ");

	if ( m_isDupValid && m_isDup )
		sb->safePrintf("dupofdocid=%" PRId64" ",m_docIdWeAreADupOf);

	if ( m_firstUrlValid )
		sb->safePrintf("url=%s ",m_firstUrl.getUrl());
	else
		sb->safePrintf("urldocid=%" PRId64" ",m_docId);

	//
	// print error/status
	//
	sb->safePrintf(": %s",mstrerror(m_indexCode));

	// if safebuf provided, do not log to log
	if ( bb ) return;

	// log it out
	logf ( LOG_INFO ,
	       "build: %s",
	       //getFirstUrl()->getUrl(),
	       sb->getBufStart() );

	return;
}


// . returns false and sets g_errno on error
// . make sure that the title rec we generated creates the exact same
//   meta list as what we got
bool XmlDoc::doConsistencyTest ( bool forceTest ) {

	// skip for now it was coring on a json doc test
	return true;
#if 0
	CollectionRec *cr = getCollRec();
	if ( ! cr )
		return true;

	if ( ! m_doConsistencyTesting )
		return true;

	// if we had an old doc then our meta list will have removed
	// stuff already in the database from indexing the old doc.
	// so it will fail the parsing consistency check... because of
	// the 'incremental indexing' algo above
	// disable for now... just a second, for testing cheatcc.com
	if ( m_oldDoc && m_oldDocValid && g_conf.m_doIncrementalUpdating )
		return true;

	// if not test coll skip this
	//if ( strcmp(cr->m_coll,"qatest123") ) return true;

	// title rec is null if we are reindexing an old doc
	// and "unchanged" was true.
	if ( m_unchangedValid && m_unchanged ) {
		if ( ! m_titleRecBufValid ) return true;
		if ( m_titleRecBuf.length()==0 ) return true;
	}

	// leave this uncommented so we can see if we are doing it
	setStatus ( "doing consistency check" );

	// log debug
	log("spider: doing consistency check for %s",ptr_firstUrl);

	// . set another doc from that title rec
	// . do not keep on stack since so huge!
	XmlDoc *doc ;
	try { doc = new ( XmlDoc ); }
	catch(std::bad_alloc&) {
		g_errno = ENOMEM;
		return false;
	}
	mnew ( doc , sizeof(XmlDoc),"xmldcs");


	if ( ! doc->set2 ( m_titleRecBuf.getBufStart() ,
			   -1 , cr->m_coll , NULL , m_niceness ,
			  // no we provide the same SpiderRequest so that
			  // it can add the same SpiderReply to the metaList
			   &m_sreq ) ) {
		mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
		delete ( doc );
		return false;
	}

	// . some hacks
	// . do not look up title rec in titledb, assume it is new
	doc->m_isIndexed      = false;
	doc->m_isIndexedValid = true;

	// so we don't core in getRevisedSpiderRequest()
	doc->m_firstIp = m_firstIp;
	doc->m_firstIpValid = true;

	// getNewSpiderReply() calls getDownloadEndTime() which is not valid
	// and causes the page to be re-downloaded, so stop that..!
	doc->m_downloadEndTime      = m_downloadEndTime;
	doc->m_downloadEndTimeValid = true;

	// inherit doledb key as well to avoid a core there
	doc->m_doledbKey = m_doledbKey;

	// flag it
	doc->m_doingConsistencyCheck = true;

	// go get its metalist. rv = return value
	char *rv = doc->getMetaList ( );

	// sanity check - compare urls
	if ( doc->m_firstUrl.getUrlLen() != m_firstUrl.getUrlLen()){g_process.shutdownAbort(true);}

	// error setting it?
	if ( ! rv ) {
		// sanity check
		if ( ! g_errno ) { g_process.shutdownAbort(true); }
		// free it
		mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
		delete ( doc );
		// error
		return false;
	}
	// blocked? that is not allowed
	if ( rv == (void *)-1 ) { g_process.shutdownAbort(true); }

	// compare with the old list
	char *list1     = m_metaList;
	int32_t  listSize1 = m_metaListSize;

	char *list2     = doc->m_metaList;
	int32_t  listSize2 = doc->m_metaListSize;

	// do a compare
	HashTableX ht1;
	HashTableX ht2;

	ht1.set ( sizeof(key224_t),sizeof(char *),
		  262144,NULL,0,false,m_niceness,"xmlht1");
	ht2.set ( sizeof(key224_t),sizeof(char *),
		  262144,NULL,0,false,m_niceness,"xmlht2");

	// format of a metalist... see XmlDoc::addTable() where it adds keys
	// from a table into the metalist
	// <nosplitflag|rdbId><key><dataSize><data>
	// where nosplitflag is 0x80
	char *p1    = list1;
	char *p2    = list2;
	char *pend1 = list1 + listSize1;
	char *pend2 = list2 + listSize2;

	// see if each key in list1 is in list2
	if ( ! hashMetaList ( &ht1 , p1 , pend1 , false ) ) {
		g_process.shutdownAbort(true);
		mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
		delete ( doc );
		log(LOG_WARN, "doc: failed consistency test for %s",ptr_firstUrl);
		return false;
	}
	if ( ! hashMetaList ( &ht2 , p2 , pend2 , false ) ) {
		g_process.shutdownAbort(true);
		mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
		delete ( doc );
		log(LOG_WARN, "doc: failed consistency test for %s",ptr_firstUrl);
		return false;
	}

	// . now make sure each list matches the other
	// . first scan the guys in "p1" and make sure in "ht2"
	hashMetaList ( &ht2 , p1 , pend1 , true );
	// . second scan the guys in "p2" and make sure in "ht1"
	hashMetaList ( &ht1 , p2 , pend2 , true );

	mdelete ( doc , sizeof(XmlDoc) , "xdnuke");
	delete ( doc );

	log ("spider: passed consistency test for %s",ptr_firstUrl );

	// no serious error, although there might be an inconsistency
	return true;
#endif
}

#define TABLE_ROWS 25

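// . quick reference for the record layout that the print/verify/hash
//   routines below walk:
//     [rdbId:1][key:ks][dataSize:4][data:dataSize]
//   where the rdbId is the low 7 bits of the first byte (the high bit is
//   masked off; see the nosplitflag note in doConsistencyTest() above),
//   ks comes from getKeySizeFromRdbId(), the 4-byte dataSize is only
//   present when getDataSizeFromRdbId() returns -1 (variable-size rdbs
//   like titledb), and a negative key (low bit of key[0] clear) carries
//   no data at all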
void XmlDoc::printMetaList() const {
	const char *p = m_metaList;
	const char *pend = m_metaList + m_metaListSize;
	for (; p < pend;) {
		// get rdbId
		rdbid_t rdbId = (rdbid_t)(*p & 0x7f);
		p++;

		// key size
		int32_t ks = getKeySizeFromRdbId(rdbId);

		// get key
		const char *key = p;
		p += ks;

		// . if key is negative, no data is present
		// . the doledb key is negative for us here
		bool isDel = ((key[0] & 0x01) == 0x00);
		int32_t ds = isDel ? 0 : getDataSizeFromRdbId(rdbId);

		// if datasize variable, read it in
		if (ds == -1) {
			// get data size
			ds = *(int32_t *)p;

			// skip data size int32_t
			p += 4;
		}

		// skip data if not zero
		p += ds;

		if (rdbId == RDB_POSDB || rdbId == RDB2_POSDB2) {
			Posdb::printKey(key);
		} else {
			/// @todo ALC implement other rdb types
			gbshutdownLogicError();
		}
	}
}

// print this also for page parser output!
void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) {

	verifyMetaList ( p , pend , false );

	SafeBuf tmp;
	if ( ! sb ) sb = &tmp;

	const char *hdr =
		"<table border=1>\n"
		"<tr>"
		"<td><b>rdb</b></td>"
		"<td><b>del?</b></td>"
		"<td><b>shardByTermId?</b></td>"
		// illustrates key size
		"<td><b>key</b></td>"
		// break it down. based on rdb, of course.
		"<td><b>desc</b></td>"
		"</tr>\n" ;

	sb->safePrintf("%s",hdr);

	int32_t rcount = 0;
	// "p" is advanced manually inside the loop body (rdbId byte, key,
	// optional dataSize and data), so no loop increment is needed here
	for ( ; p < pend ; ) {
		// get rdbid
		rdbid_t rdbId = (rdbid_t)(*p & 0x7f);
		// skip
		p++;
		// get key size
		int32_t ks = getKeySizeFromRdbId ( rdbId );
		// point to it
		char *rec = p;
		// init this
		int32_t recSize = ks;

		char k[MAX_KEY_BYTES];
		if ( ks > MAX_KEY_BYTES ) { g_process.shutdownAbort(true); }
		gbmemcpy ( k , p , ks );
		// is it a negative key?
		bool neg = false;
		if ( ! ( p[0] & 0x01 ) ) neg = true;
		// this is now a bit in the posdb key so we can rebalance
		bool shardByTermId = false;
		if ( rdbId==RDB_POSDB && Posdb::isShardedByTermId(k))
			shardByTermId = true;
		// skip it
		p += ks;
		// get datasize
		int32_t dataSize = getDataSizeFromRdbId ( rdbId );
		// . always zero if key is negative
		// . this is not the case unfortunately...
		if ( neg ) dataSize = 0;
		// if -1, read it in
		if ( dataSize == -1 ) {
			dataSize = *(int32_t *)p;
			// inc this
			recSize += 4;
			// sanity check
                        if ( dataSize < 0 ) { g_process.shutdownAbort(true); }
			p += 4;
                }
		// skip the data
		p += dataSize;
		// inc it
		recSize += dataSize;

		// see if one big table causes a browser slowdown
		if ( (++rcount % TABLE_ROWS) == 0 )
			sb->safePrintf("<!--ignore--></table>%s",hdr);

		// print dbname
		sb->safePrintf("<tr>");
		const char *dn = getDbnameFromId ( rdbId );
		sb->safePrintf("<td>%s</td>",dn);

		if ( neg ) sb->safePrintf("<td>D</td>");
		else       sb->safePrintf("<td>&nbsp;</td>");

		if ( shardByTermId ) sb->safePrintf("<td>shardByTermId</td>");
		else           sb->safePrintf("<td>&nbsp;</td>");

		sb->safePrintf("<td><nobr>%s</nobr></td>", KEYSTR(k,ks));

		if ( rdbId == RDB_POSDB ) {
			// get termid et al
			key144_t *k2 = (key144_t *)k;
			int64_t tid = Posdb::getTermId(k2);
			// sanity check
			if(dataSize!=0){g_process.shutdownAbort(true);}
			sb->safePrintf("<td>"
				       "termId=%020" PRIu64" "
				       "</td>"
				       ,(uint64_t)tid
				       );
		}
		else if ( rdbId == RDB_LINKDB ) {
			key224_t *k2 = (key224_t *)k;
			int64_t linkHash=Linkdb::getLinkeeUrlHash64_uk(k2);
			int32_t linkeeSiteHash  = Linkdb::getLinkeeSiteHash32_uk(k2);
			int32_t linkerSiteHash  = Linkdb::getLinkerSiteHash32_uk(k2);
			char linkSpam   = Linkdb::isLinkSpam_uk    (k2);
			int32_t siteRank = Linkdb::getLinkerSiteRank_uk (k2);
			int32_t ip32       = Linkdb::getLinkerIp_uk       (k2);
			int64_t docId = Linkdb::getLinkerDocId_uk      (k2);
			// sanity check
			if(dataSize!=0){g_process.shutdownAbort(true);}
			char ipbuf[16];
			sb->safePrintf("<td>"
				       "<nobr>"
				       "linkeeSiteHash32=0x%08" PRIx32" "
				       "linkeeUrlHash=0x%016" PRIx64" "
				       "linkSpam=%" PRId32" "
				       "siteRank=%" PRId32" "
				       //"hopCount=%03" PRId32" "
				       "sitehash32=0x%" PRIx32" "
				       "IP32=%s "
				       "docId=%" PRIu64
				       "</nobr>"
				       "</td>",
				       linkeeSiteHash,
				       linkHash,
				       (int32_t)linkSpam,
				       siteRank,
				       linkerSiteHash,
				       iptoa(ip32,ipbuf),
				       docId);

		}
		else if ( rdbId == RDB_CLUSTERDB ) {
			key128_t *k2 = (key128_t *)k;
			char *r = (char *)k2;
			int32_t siteHash26 = Clusterdb::getSiteHash26   ( r );
			char lang       = Clusterdb::getLanguage     ( r );
			int64_t docId = Clusterdb::getDocId        ( r );
			char ff         = Clusterdb::getFamilyFilter ( r );
			// sanity check
			if(dataSize!=0){g_process.shutdownAbort(true);}
			sb->safePrintf("<td>"
				       // 26 bit site hash
				       "siteHash26=0x%08" PRIx32" "
				       "family=%" PRId32" "
				       "lang=%03" PRId32" "
				       "docId=%" PRIu64
				       "</td>",
				       siteHash26 ,
				       (int32_t)ff,
				       (int32_t)lang,
				       docId );
		}
		// key parsing logic taken from Address::makePlacedbKey
		else if ( rdbId == RDB_SPIDERDB ) {
			sb->safePrintf("<td><nobr>");
			key128_t *k2 = (key128_t *)k;
			if ( Spiderdb::isSpiderRequest(k2) ) {
				SpiderRequest *sreq = (SpiderRequest *)rec;
				sreq->print ( sb );
			}
			else {
				SpiderReply *srep = (SpiderReply *)rec;
				srep->print ( sb );
			}
			sb->safePrintf("</nobr></td>");
		}
		else if ( rdbId == RDB_DOLEDB ) {
			key96_t *k2 = (key96_t *)k;
			sb->safePrintf("<td><nobr>");
			sb->safePrintf("priority=%" PRId32" "
				       "spidertime=%" PRIu32" "
				       "uh48=%" PRIx64" "
				       "isdel=%" PRId32,
			           Doledb::getPriority(k2),
				       (uint32_t)Doledb::getSpiderTime(k2),
				       Doledb::getUrlHash48(k2),
				       Doledb::getIsDel(k2));
			sb->safePrintf("</nobr></td>");
		}
		else if ( rdbId == RDB_TITLEDB ) {
			// print each offset and size for the variable crap
			sb->safePrintf("<td><nobr>titlerec datasize=%" PRId32" "
				       "</nobr></td>",
				       dataSize
				       );
		}
		else if ( rdbId == RDB_TAGDB ) {
			Tag *tag = (Tag *)rec;
			sb->safePrintf("<td><nobr>");
			if ( rec[0] & 0x01 ) tag->printToBuf(sb);
			else sb->safePrintf("negativeTagKey");
			sb->safePrintf("</nobr></td>");
		}
		else {
			g_process.shutdownAbort(true);
		}

		// close it up
		sb->safePrintf("</tr>\n");
	}
	sb->safePrintf("</table>\n");

	if ( sb == &tmp )
		sb->print();
}


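// . sanity-walks a metalist buffer checking per-record invariants
//   (key sizes, posdb alignment bits, docids, record boundaries)
// . note: the full check is compiled out (#if 0) below and only ever ran
//   for the qatest123 collection; as it stands the function just
//   returns true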
bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
	return true;

#if 0
	CollectionRec *cr = getCollRec();
	if ( ! cr ) return true;

	// do not do this if not test collection for now
	if ( strcmp(cr->m_coll,"qatest123") ) return true;


	log(LOG_DEBUG, "xmldoc: VERIFYING METALIST");

	// store each record in the list into the send buffers
	for ( ; p < pend ; ) {
		// first is rdbId
		rdbid_t rdbId = (rdbid_t)(*p++ & 0x7f);

		// negative key?
		bool del = !( *p & 0x01 );

		// must always be negative if deleting
		// spiderdb is exempt because we add a spiderreply that is
		// positive and a spiderdoc
		// no, this is no longer the case because we add spider
		// replies to the index when deleting or rejecting a doc.
		//if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) {
		//	g_process.shutdownAbort(true); }

		// get the key size. a table lookup in Rdb.cpp.
		int32_t ks = getKeySizeFromRdbId ( rdbId );
		if ( rdbId == RDB_POSDB || rdbId == RDB2_POSDB2 ) {
			// no compress bits set!
			if ( p[0] & 0x06 ) { g_process.shutdownAbort(true); }
			// alignment bit set or cleared
			if ( ! ( p[1] & 0x02 ) ) { g_process.shutdownAbort(true); }
			if (   ( p[7] & 0x02 ) ) { g_process.shutdownAbort(true); }
			int64_t docId = Posdb::getDocId(p);
			if ( docId != m_docId && !cr->m_indexSpiderReplies) {
				log( LOG_WARN, "xmldoc: %" PRId64" != %" PRId64, docId, m_docId );
				g_process.shutdownAbort(true);
			}
		}

		// sanity
		if ( ks < 12 ) { g_process.shutdownAbort(true); }
		if ( ks > MAX_KEY_BYTES ) { g_process.shutdownAbort(true); }
		// another check
		Rdb *rdb = getRdbFromId(rdbId);
		if ( ! rdb ) { g_process.shutdownAbort(true); }
		if ( rdb->m_ks < 12 || rdb->m_ks > MAX_KEY_BYTES ) {
			g_process.shutdownAbort(true);}

		char *rec = p;

		// set this
		//bool split = true;
		//if(rdbId == RDB_POSDB && Posdb::isShardedByTermId(p) )
		// split =false;
		// skip key
		p += ks;
		// . if key belongs to same group as firstKey then continue
		// . titledb now uses last bits of docId to determine groupId
		// . but uses the top 32 bits of key still
		// . spiderdb uses last 64 bits to determine groupId
		// . tfndb now is like titledb(top 32 bits are top 32 of docId)
		//uint32_t gid = getGroupId ( rdbId , key , split );
		// get the record, is -1 if variable. a table lookup.
		int32_t dataSize = getDataSizeFromRdbId ( rdbId );

		// . for delete never stores the data
		// . you can have positive keys without any dataSize member
		//   when they normally should have one, like titledb
		if ( forDelete ) dataSize = 0;
		// . negative keys have no data
		// . this is not the case unfortunately
		if ( del ) dataSize = 0;

		// ensure spiderdb request recs have data/url in them
		if ( (rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2) &&
		     g_spiderdb.isSpiderRequest ( (spiderdbkey_t *)rec ) &&
		     ! forDelete &&
		     ! del &&
		     dataSize == 0 ) {
			g_process.shutdownAbort(true); }

		// if variable read that in
		if ( dataSize == -1 ) {
			// -1 means to read it in
			dataSize = *(int32_t *)p;
			// sanity check
			if ( dataSize < 0 ) { g_process.shutdownAbort(true); }
			// skip dataSize
			p += 4;
		}
		// skip over the data, if any
		p += dataSize;
		// breach us?
		if ( p > pend ) { g_process.shutdownAbort(true); }
	}
	// must be exactly equal to end
	if ( p != pend ) return false;
	return true;
#endif
}

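// . with checkList==false: hashes each metalist record into "ht" keyed by
//   its rdb key, storing a pointer to the raw record as the value
// . with checkList==true: verifies every record is present in "ht" with
//   byte-identical contents and aborts on any mismatch
// . negative keys are skipped, as are spiderdb and tagdb recs (they are
//   ignored for the parsing consistency check)
// . doConsistencyTest() uses it roughly like this (see the #if 0 block
//   above):
//     hashMetaList ( &ht1 , p1 , pend1 , false ); // build table from list1
//     hashMetaList ( &ht2 , p2 , pend2 , false ); // build table from list2
//     hashMetaList ( &ht2 , p1 , pend1 , true  ); // each rec of list1 in ht2?
//     hashMetaList ( &ht1 , p2 , pend2 , true  ); // each rec of list2 in ht1?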
bool XmlDoc::hashMetaList ( HashTableX *ht        ,
			    char       *p         ,
			    char       *pend      ,
			    bool        checkList ) {
	int32_t count = 0;
	// "p" is advanced manually inside the loop body, so only the record
	// counter is bumped by the loop increment
	for ( ; p < pend ; count++ ) {
		// get rdbid
		rdbid_t rdbId = (rdbid_t)(*p & 0x7f);
		// skip rdb id
		p++;
		// save that
		char *rec = p;
		// get key size
		int32_t ks = getKeySizeFromRdbId ( rdbId );
		// sanity check
		if ( ks > 28 ) { g_process.shutdownAbort(true); }
		// is it a delete key?
		bool del;
		if ( ( p[0] & 0x01 ) == 0x00 ) del = true;
		else                           del = false;
		// convert into a key128_t, the biggest possible key
		char k[MAX_KEY_BYTES];//key128_t k ;
		// zero out
		KEYMIN(k,MAX_KEY_BYTES);
		//k.setMin();
		gbmemcpy ( k , p , ks );
		// skip it
		p += ks;
		// if negative, no data size allowed -- no
		if ( del ) continue;
		// get datasize
		int32_t dataSize = getDataSizeFromRdbId ( rdbId );
		// if -1, read it in
		if ( dataSize == -1 ) {
			dataSize = *(int32_t *)p;
			// sanity check
                        if ( dataSize < 0 ) { g_process.shutdownAbort(true); }
			p += 4;
                }

		// skip the data
		p += dataSize;
		// ignore spiderdb recs for parsing consistency check
		if ( rdbId == RDB_SPIDERDB ) continue;
		if ( rdbId == RDB2_SPIDERDB2 ) continue;
		// ignore tagdb as well!
		if ( rdbId == RDB_TAGDB || rdbId == RDB2_TAGDB2 ) continue;

		// set our rec size, includes key/dataSize/data
		int32_t recSize = p - rec;

		// if just adding, do it
		if ( ! checkList ) {
			// we now store ptr to the rec, not hash!
			if ( ! ht->addKey ( k , &rec ) ) return false;
			continue;
		}
		// check to see if this rec is in the provided hash table
		int32_t slot = ht->getSlot ( k );
		// bitch if not found
		if ( slot < 0 && ks==12 ) {
			key144_t *k2 = (key144_t *)k;
			int64_t tid = Posdb::getTermId(k2);
			char shardByTermId = Posdb::isShardedByTermId(k2);
			log("build: missing key #%" PRId32" rdb=%s ks=%" PRId32" ds=%" PRId32" "
			    "tid=%" PRIu64" "
			    "key=%s "
			    //"score8=%" PRIu32" score32=%" PRIu32" "
			    "shardByTermId=%" PRId32,
			    count,getDbnameFromId(rdbId),(int32_t)ks,
			    (int32_t)dataSize,tid ,
			    //(int32_t)score8,(int32_t)score32,
			    KEYSTR(k2,ks),
			    (int32_t)shardByTermId);
			// look it up


			// shortcut
			HashTableX *wt = m_wts;

			// now print the table we stored all we hashed into
			for ( int32_t i = 0 ; i < wt->getNumSlots() ; i++ ) {
				// skip if empty
				if ( wt->m_flags[i] == 0 ) continue;
				// get the TermInfo
				TermDebugInfo *ti;
				ti = (TermDebugInfo *)wt->getValueFromSlot(i);
				// skip if not us
				if((ti->m_termId & TERMID_MASK)!=tid)continue;
				// got us
				char *start = m_wbuf.getBufStart();
				char *term = start + ti->m_termOff;
				const char *prefix = "";
				if ( ti->m_prefixOff >= 0 ) {
					prefix = start + ti->m_prefixOff;
					//prefix[ti->m_prefixLen] = '\0';
				}
				// NULL term it
				term[ti->m_termLen] = '\0';
				// print it
				log("parser: term=%s prefix=%s",//score32=%" PRId32,
				    term,prefix);//,(int32_t)ti->m_score32);
			}

			g_process.shutdownAbort(true);
		}
		if ( slot < 0 && ks != 12 ) {
			log("build: missing key #%" PRId32" rdb=%s ks=%" PRId32" ds=%" PRId32" "
			    "ks=%s "
			    ,count,getDbnameFromId(rdbId),(int32_t)ks,
			    (int32_t)dataSize,KEYSTR(k,ks));
			g_process.shutdownAbort(true);
		}
		// if in there, check the hashes
		//int32_t h2 = *(int32_t *)ht->getValueFromSlot ( slot );
		char *rec2 = *(char **)ht->getValueFromSlot ( slot );
		// get his dataSize
		int32_t dataSize2 = getDataSizeFromRdbId(rdbId);
		// his keysize
		int32_t ks2 = getKeySizeFromRdbId(rdbId);
		// get his recsize
		int32_t recSize2 = ks2 ;
		// if -1 that is variable
		if ( dataSize2 == -1 ) {
			dataSize2 = *(int32_t *)(rec2+ks2);
			recSize2 += 4;
		}
		// add it up
		recSize2 += dataSize2;
		// keep on chugging if they match
		if ( recSize2==recSize && !memcmp(rec,rec2,recSize) ) continue;
		// otherwise, bitch
		bool shardByTermId = false;
		if ( rdbId == RDB_POSDB || rdbId == RDB2_POSDB2 )
			shardByTermId = Posdb::isShardedByTermId(rec2);
		log("build: data not equal for key=%s "
		    "rdb=%s splitbytermid=%" PRId32" dataSize=%" PRId32,
		    KEYSTR(k,ks2),
		    getDbnameFromId(rdbId),(int32_t)shardByTermId,dataSize);

		// print into here
		SafeBuf sb1;
		SafeBuf sb2;

		// print it out
		if ( rdbId == RDB_SPIDERDB ) {
			// get rec
			if ( Spiderdb::isSpiderRequest((key128_t *)rec) ) {
				SpiderRequest *sreq1 = (SpiderRequest *)rec;
				SpiderRequest *sreq2 = (SpiderRequest *)rec2;
				sreq1->print(&sb1);
				sreq2->print(&sb2);
			}
			else {
				SpiderReply *srep1 = (SpiderReply *)rec;
				SpiderReply *srep2 = (SpiderReply *)rec2;
				srep1->print(&sb1);
				srep2->print(&sb2);
			}
			log("build: rec1=%s",sb1.getBufStart());
			log("build: rec2=%s",sb2.getBufStart());

		}
		g_process.shutdownAbort(true);
	}
	return true;
}

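// . installed as m_masterLoop by getMetaList() below, so whenever a
//   blocking sub-operation completes it calls back in here
// . re-runs getMetaList() and, once it no longer returns -1 (blocked),
//   hands control back to the original caller via callCallback()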
void getMetaListWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// make sure has not been freed from under us!
	if ( THIS->m_freed ) { g_process.shutdownAbort(true);}
	// note it
	THIS->setStatus ( "in get meta list wrapper" );
	// get it
	char *ml = THIS->getMetaList ( );
	// sanity check
	if ( ! ml && ! g_errno ) {
		log(LOG_ERROR, "doc: getMetaList() returned NULL without g_errno");
		g_process.shutdownAbort(true);
	}
	// return if it blocked
	if ( ml == (void *)-1 ) return;
	// sanity check
	if ( THIS->m_callback1 == getMetaListWrapper ) { g_process.shutdownAbort(true);}
	// otherwise, all done, call the caller callback
	THIS->callCallback();
}


// . returns NULL and sets g_errno on error
// . makes a meta list to call Msg4::addMetaList() with
// . called by Msg14.cpp
// . a meta list is just a buffer of Rdb records of the following format:
//   rdbid | rdbRecord
// . sets m_metaList ptr and m_metaListSize
// . if "forDelete" is true we build negative keys to delete the old doc
// . returns (char *)-1 if it blocks and will call your callback when done
// . generally only Repair.cpp sets the m_use* flags to false
char *XmlDoc::getMetaList(bool forDelete) {
	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN forDelete=%s", forDelete ? "true" : "false" );

	if (m_metaListValid) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, already valid" );
		return m_metaList;
	}

	setStatus("getting meta list");

	// force it true?
	// "forDelete" means we want the metalist to consist of "negative"
	// keys that will annihilate with the positive keys in the index,
	// posdb and the other rdbs, in order to delete them. "deleteFromIndex"
	// means to just call getMetaList(true) on the m_oldDoc (old XmlDoc)
	// which is built from the titlerec in Titledb. so don't confuse
	// these two things. otherwise when i add this we were not adding
	// the spiderreply of "Doc Force Deleted" from doing a query reindex
	// and it kept repeating every time we started gb up.
	//if ( m_deleteFromIndex ) forDelete = true;

	// assume valid
	m_metaList     = "";
	m_metaListSize = 0;


	// . internal callback
	// . so if any of the functions we end up calling directly or
	//   indirectly block, this callback will be called
	if ( ! m_masterLoop ) {
		m_masterLoop  = getMetaListWrapper;
		m_masterState = this;
	}

	// returning from a handler that had an error?
	if (g_errno) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, g_errno=%" PRId32, g_errno);
		return NULL;
	}

	// if we are a spider status doc/titlerec and we are doing a rebuild
	// operation, then keep it simple
	if (m_setFromTitleRec && m_useSecondaryRdbs && m_contentTypeValid && m_contentType == CT_STATUS) {
		// if not rebuilding posdb then done, list is empty since
		// spider status docs do not contribute to linkdb, clusterdb,..
		if (!m_usePosdb && !m_useTitledb) {
			m_metaListValid = true;
			logTrace(g_conf.m_logTraceXmlDoc, "END, CT_STATUS");
			return m_metaList;
		}

		/////////////
		//
		// if user disabled spider status docs then delete the titlerec
		// AND the posdb index list from our dbs for this ss doc
		//
		/////////////
		CollectionRec *cr = getCollRec();
		if (!cr) {
			return NULL;
		}

		if (!cr->m_indexSpiderReplies) {
			logTrace(g_conf.m_logTraceXmlDoc, "Not indexing spider replies. Delete titlerec for this doc");

			int64_t uh48 = m_firstUrl.getUrlHash48();

			// delete title rec. true = delete?
			key96_t tkey = Titledb::makeKey (m_docId,uh48,true);

			// shortcut
			SafeBuf *ssb = &m_spiderStatusDocMetaList;

			// add to list. and we do not add the spider status
			// doc to posdb since we deleted its titlerec.
			ssb->pushChar(RDB_TITLEDB); // RDB2_TITLEDB2
			ssb->safeMemcpy(&tkey, sizeof(key96_t));
			m_metaList = ssb->getBufStart();
			m_metaListSize = ssb->length();
			m_metaListValid = true;

			logTrace( g_conf.m_logTraceXmlDoc, "END" );
			return m_metaList;
		}

		// set safebuf to the json of the spider status doc
		SafeBuf jd;
		if (!jd.safeMemcpy(ptr_utf8Content, size_utf8Content)) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, jd.safeMemcpy failed");
			return NULL;
		}

		// set m_spiderStatusDocMetaList from the json
		if (!setSpiderStatusDocMetaList(&jd, m_docId)) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, setSpiderStatusDocMetaList failed");
			return NULL;
		}

		// TODO: support titledb rebuild as well
		m_metaList = m_spiderStatusDocMetaList.getBufStart();
		m_metaListSize = m_spiderStatusDocMetaList.length();
		m_metaListValid = true;

		logTrace( g_conf.m_logTraceXmlDoc, "END, OK" );
		return m_metaList;
	}

	// if "rejecting" from index fake all this stuff
	if (m_deleteFromIndex) {
		logTrace(g_conf.m_logTraceXmlDoc, "deleteFromIndex true");

		// set these things to bogus values since we don't need them
		m_contentHash32Valid = true;
		m_contentHash32 = 0;
		m_httpStatusValid = true;
		m_httpStatus = 200;
		m_siteValid = true;
		ptr_site = "";
		size_site = strlen(ptr_site) + 1;
		m_isSiteRootValid = true;
		m_isSiteRoot2 = 1;
		m_tagPairHash32Valid = true;
		m_tagPairHash32 = 0;
		m_spiderLinksValid = true;
		m_spiderLinks2 = 1;
		m_langIdValid = true;
		m_langId = 1;
		m_siteNumInlinksValid = true;
		m_siteNumInlinks = 0;
		m_isIndexed = (char)true;	// may be -1
		m_isIndexedValid = true;
		m_ipValid = true;
		m_ip = 123456;
	}

	CollectionRec *cr = getCollRec();
	if (!cr) {
		logTrace(g_conf.m_logTraceXmlDoc, "getCollRec failed");
		return NULL;
	}

	// get our checksum
	int32_t *plainch32 = getContentHash32();
	if (!plainch32 || plainch32 == (void *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getContentHash32 failed");
		return (char *)plainch32;
	}

	// get this too
	int16_t *hs = getHttpStatus();
	if (!hs || hs == (void *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getHttpStatus failed");
		return (char *)hs;
	}

	// make sure site is valid
	char *site = getSite();
	if (!site || site == (void *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getSite failed");
		return (char *)site;
	}

	// this seems to be an issue as well for "unchanged" block below
	char *isr = getIsSiteRoot();
	if (!isr || isr == (void *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getIsSiteRoot failed");
		return (char *)isr;
	}

	// make sure docid valid
	int64_t *mydocid = getDocId();
	if (!mydocid || mydocid == (int64_t *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getDocId failed");
		return (char *)mydocid;
	}

	// . get the old version of our XmlDoc from the previous spider time
	// . set using the old title rec in titledb
	// . should really not do any more than set m_titleRec...
	// . should not even uncompress it!
	// . getNewSpiderReply() will use this to set the reply if
	//   m_indexCode == EDOCUNCHANGED...
	XmlDoc **pod = getOldXmlDoc();
	if (!pod || pod == (XmlDoc **)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getOldXmlDoc failed");
		return (char *)pod;
	}

	// point to the old xml doc if no error, etc.
	XmlDoc *od = *pod;

	// check if we are already indexed
	char *isIndexed = getIsIndexed();
	if (!isIndexed || isIndexed == (char *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getIsIndexed failed");
		return (char *)isIndexed;
	}

	// why call this way down here? it ends up downloading the doc!
	// @todo: BR: Eh, what? ^^^
	int32_t *indexCode = getIndexCode();
	if (!indexCode || indexCode == (void *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getIndexCode failed");
		return (char *)indexCode;
	}

	// sanity check
	if (!m_indexCodeValid) {
		g_process.shutdownAbort(true);
	}

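	// note: the error-handling paths below return small non-NULL
	// sentinels (0x123456, 0x1) with m_metaListSize set to 0, which
	// appears to let callers treat the result as success with an empty
	// metalist (the sentinel pointers are not meant to be dereferenced)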
	// this means to abandon the injection
	if (*indexCode == EABANDONED) {
		m_metaList = (char *)0x123456;
		m_metaListSize = 0;
		m_metaListValid = true;
		logTrace(g_conf.m_logTraceXmlDoc, "END, abandoned");
		return m_metaList;
	}

	// . some index code warrant retries, like EDNSTIMEDOUT, ETCPTIMEDOUT,
	//   etc. these are deemed temporary errors. other errors basically
	//   indicate a document that will never be indexable and should,
	//   if currently indexed, be deleted.
	// . just add the spider reply and we're done
	if (    *indexCode == EDNSTIMEDOUT
	     || *indexCode == ETCPTIMEDOUT
	     || *indexCode == EUDPTIMEDOUT
	     || *indexCode == EDNSDEAD
	     || *indexCode == ENETUNREACH
	     || *indexCode == EHOSTUNREACH
		// . treat this as a temporary error i guess
		// . getNewSpiderReply() below will clear the error in it and
		//   copy stuff over from m_sreq and m_oldDoc for this case
		|| *indexCode == EDOCUNCHANGED
		) {
		// sanity - in repair mode?
		if (m_useSecondaryRdbs) {
			g_process.shutdownAbort(true);
		}

		logTrace(g_conf.m_logTraceXmlDoc, "Temporary error state: %" PRId32, *indexCode);

		// . this seems to be an issue for blocking
		// . if we do not have a valid ip, we can't compute this,
		//   in which case it will not be valid in the spider reply
		// . why do we need this for timeouts etc? if the doc is
		//   unchanged
		//   we should probably update its siteinlinks in tagdb
		//   periodically and reindex the whole thing...
		// . i think we were getting the sitenuminlinks for
		//   getNewSpiderReply()
		if (m_ipValid && m_ip != 0 && m_ip != -1) {
			int32_t *sni = getSiteNumInlinks();
			if (!sni || sni == (int32_t *)-1) {
				logTrace(g_conf.m_logTraceXmlDoc, "getSiteNumInlinks failed");
				return (char *)sni;
			}
		}

		// all done!
		bool addReply = true;
		// page parser calls set4 and sometimes gets a dns time out!
		if (m_sreqValid && m_sreq.m_isPageParser) {
			addReply = false;
		}

		// return nothing if done
		if (!addReply) {
			m_metaListSize = 0;
			m_metaList = (char *)0x1;
			logTrace(g_conf.m_logTraceXmlDoc, "END, m_isPageParser and valid");
			return m_metaList;
		}

		// save this
		int32_t savedCode = *indexCode;

		// before getting our spider reply, assign crap from the old
		// doc to us since we are unchanged! this will allow us to
		// call getNewSpiderReply() without doing any processing, like
		// setting the Xml or Words classes, etc.
		copyFromOldDoc(od);

		// need this though! i don't want to print out "Success"
		// in the log in the logIt() function
		m_indexCode = savedCode;
		m_indexCodeValid = true;

		// but set our m_contentHash32 from the spider request
		// which got it from the spiderreply in the case of
		// EDOCUNCHANGED. this way ch32=xxx will log correctly.
		// I think this is only when EDOCUNCHANGED is set in the
		// Msg13.cpp code, when we have a spider compression proxy.
		if (*indexCode == EDOCUNCHANGED && m_sreqValid && !m_contentHash32Valid) {
			m_contentHash32 = m_sreq.m_contentHash32;
			m_contentHash32Valid = true;
		}

		// we need these for getNewSpiderReply()
		m_wasInIndex = (od != NULL);
		m_isInIndex = m_wasInIndex;
		m_wasInIndexValid = true;
		m_isInIndexValid = true;

		// unset our ptr_linkInfo1 so we do not free it and core
		// since we might have set it in copyFromOldDoc() above
		ptr_linkInfo1 = NULL;
		size_linkInfo1 = 0;
		m_linkInfo1Valid = false;

		// . if not using spiderdb we are done at this point
		// . this happens for diffbot json replies (m_dx)
		if (!m_useSpiderdb) {
			m_metaList = NULL;
			m_metaListSize = 0;
			logTrace(g_conf.m_logTraceXmlDoc, "END, not using spiderdb");
			return (char *)0x01;
		}

		// get our spider reply
		SpiderReply *newsr = getNewSpiderReply();
		// return on error
		if (!newsr) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, could not get spider reply");
			return (char *)newsr;
		}

		// . panic on blocking! this is supposed to be fast!
		// . it might still have to lookup the tagdb rec?????
		if (newsr == (void *)-1) {
			g_process.shutdownAbort(true);
		}

		// how much we need
		int32_t needx = sizeof(SpiderReply) + 1;
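		// (the +1 is the leading rdbId byte for the SpiderReply rec;
		// the spider status doc metalist length is added to needx below)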


		// . INDEX SPIDER REPLY (1a)
		// . index ALL spider replies as separate doc. error or not.
		// . then print out error histograms.
		// . we should also hash this stuff when indexing the
		//   doc as a whole

		// i guess it is safe to do this after getting the spiderreply
		// get the spiderreply ready to be added
		SafeBuf *spiderStatusDocMetaList = getSpiderStatusDocMetaList(newsr, forDelete);
		// error?
		if (!spiderStatusDocMetaList) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, getSpiderStatusDocMetaList failed");
			return NULL;
		}

		// blocked?
		if (spiderStatusDocMetaList==(void *)-1) {
			logTrace( g_conf.m_logTraceXmlDoc, "END, getSpiderStatusDocMetaList blocked" );
			return (char *)-1;
		}

		// need to alloc space for it too
		int32_t len = spiderStatusDocMetaList->length();
		needx += len;

		// this too
		m_addedStatusDocSize = len;
		m_addedStatusDocSizeValid = true;

		// make the buffer
		m_metaList = (char *)mmalloc(needx, "metalist");
		if (!m_metaList) {
			return NULL;
		}

		// save size for freeing later
		m_metaListAllocSize = needx;

		// ptr and boundary
		m_p = m_metaList;
		m_pend = m_metaList + needx;

		// save it
		char *saved = m_p;

		// first store spider reply "document"
		if (spiderStatusDocMetaList) {
			gbmemcpy (m_p, spiderStatusDocMetaList->getBufStart(), spiderStatusDocMetaList->length());
			m_p += spiderStatusDocMetaList->length();
		}

		// sanity check
		if (!m_docIdValid) {
			g_process.shutdownAbort(true);
		}

		// now add the new rescheduled time
		setStatus("adding SpiderReply to spiderdb");
		logTrace(g_conf.m_logTraceXmlDoc, "Adding spider reply to spiderdb");

		// rdbid first
		rdbid_t rd = m_useSecondaryRdbs ? RDB2_SPIDERDB2 : RDB_SPIDERDB;
		*m_p++ = (char)rd;

		// get this
		if (!m_srepValid) {
			g_process.shutdownAbort(true);
		}

		// store the spider rec
		int32_t newsrSize = newsr->getRecSize();
		gbmemcpy (m_p, newsr, newsrSize);
		m_p += newsrSize;
		m_addedSpiderReplySize = newsrSize;
		m_addedSpiderReplySizeValid = true;

		// sanity check
		if (m_p - saved != needx) {
			g_process.shutdownAbort(true);
		}

		// sanity check
		verifyMetaList(m_metaList, m_p, forDelete);

		// verify it
		m_metaListValid = true;

		// set size
		m_metaListSize = m_p - m_metaList;

		// all done
		logTrace(g_conf.m_logTraceXmlDoc, "END, all done");
		return m_metaList;
	}


	// get the old meta list if we had an old doc
	char *oldList = NULL;
	int32_t oldListSize = 0;
	if (od) {
		od->m_useSpiderdb = false;
		od->m_useTagdb = false;

		// if we are doing diffbot stuff, we are still indexing this
		// page, so we need to get the old doc meta list
		oldList = od->getMetaList(true);
		oldListSize = od->m_metaListSize;
		if (!oldList || oldList == (void *)-1) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, get old meta list failed");
			return oldList;
		}
	}

	// . need this if useTitledb is true
	// . otherwise XmlDoc::getTitleRecBuf() cores because its invalid
	// . this cores if rebuilding just posdb because hashAll() needs
	//   the inlink texts for hashing
	LinkInfo *info1 = getLinkInfo1();
	if (!info1 || info1 == (LinkInfo *)-1) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, getLinkInfo1 failed" );
		return (char *)info1;
	}

	// so getSiteRank() works
	int32_t *sni = getSiteNumInlinks();
	if (!sni || sni == (int32_t *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getSiteNumInlinks failed");
		return (char *)sni;
	}

	// so addTable144 works
	uint8_t *langId = getLangId();
	if (!langId || langId == (uint8_t *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getLangId failed");
		return (char *)langId;
	}

	// . before making the title rec we need to set all the ptrs!
	// . so at least now set all the data members we will need to
	//   serialize into the title rec because we can't be blocking further
	//   down below after we set all the hashtables and XmlDoc::ptr_ stuff
	if (!m_setFromTitleRec || m_useSecondaryRdbs) {
		// all member vars should already be valid if set from titlerec
		char *ptg = prepareToMakeTitleRec();

		// return NULL with g_errno set on error
		if (!ptg || ptg == (void *)-1) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, prepareToMakeTitleRec failed");
			return ptg;
		}
	}

	// our next slated spider priority
	char *spiderLinks3 = getSpiderLinks();
	if (!spiderLinks3 || spiderLinks3 == (char *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getSpiderLinks failed");
		return spiderLinks3;
	}

	bool spideringLinks = *spiderLinks3;

	bool addPosRec = false;
	bool addTitleRec = false;
	bool addClusterRec = false;
	bool addLinkInfo = true;

	///////////////////////////////////
	///////////////////////////////////
	//
	// if we had an error, do not add us to the index regardless,
	// although we might add SOME things depending on the error,
	// like the redirect url if we had an EDOCSIMPLIFIEDREDIR error.
	// So what we add to the Rdbs depends on the indexCode.
	//
	// OR if deleting from index, we just want to get the metalist
	// directly from "od"
	//
	m_isInIndex  = !(m_indexCode || m_deleteFromIndex);
	m_isInIndexValid  = true;

	// set these for getNewSpiderReply() so it can set
	// SpiderReply::m_wasIndexed and m_isIndexed...
	m_wasInIndex = (od != NULL);
	m_wasInIndexValid = true;

	if (m_isInIndex) {
		addPosRec = true;
		addTitleRec = true;
		addClusterRec = true;
	} else {
		if (m_indexCode == EDOCSIMPLIFIEDREDIR || m_indexCode == EDOCNONCANONICAL) {
			// we're adding titlerec to keep links between redirection intact
			addTitleRec = true;

			// since we're adding titlerec, add posrec as well
			addPosRec = true;

			// we are adding a simplified redirect as a link to spiderdb;
			// likewise if the error was EDOCNONCANONICAL, treat it the same way
			spideringLinks = true;

			// don't add linkinfo since titlerec is empty
			addLinkInfo = false;
		} else {
			spideringLinks = false;
		}
	}

	//
	// . prepare the outlink info if we are adding links to spiderdb!
	// . do this before we start hashing so we do not block and re-hash!!
	//
	if (m_useSpiderdb && spideringLinks && !m_doingConsistencyCheck) {
		setStatus("getting outlink info");
		logTrace(g_conf.m_logTraceXmlDoc, "call getOutlinkTagRecVector");

		TagRec ***grv = getOutlinkTagRecVector();
		if (!grv || grv == (void *)-1) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, getOutlinkTagRecVector returned -1");
			return (char *)grv;
		}

		logTrace(g_conf.m_logTraceXmlDoc, "call getOutlinkFirstIpVector");

		int32_t **ipv = getOutlinkFirstIpVector();
		if (!ipv || ipv == (void *)-1) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, getOutlinkFirstIpVector returned -1");
			return (char *)ipv;
		}
	}

	// get the tag buf to add to tagdb
	SafeBuf *ntb = NULL;
	if (m_useTagdb && !m_deleteFromIndex) {
		logTrace(g_conf.m_logTraceXmlDoc, "call getNewTagBuf");
		ntb = getNewTagBuf();
		if (!ntb || ntb == (void *)-1) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, getNewTagBuf failed");
			return (char *)ntb;
		}
	}

	logTrace(g_conf.m_logTraceXmlDoc, "call getIsSiteRoot");
	char *isRoot = getIsSiteRoot();
	if (!isRoot || isRoot == (char *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getIsSiteRoot returned -1");
		return isRoot;
	}

	Words *ww = getWords();
	if (!ww || ww == (void *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getWords returned -1");
		return (char *)ww;
	}

	int64_t *pch64 = getExactContentHash64();
	if (!pch64 || pch64 == (void *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getExactContentHash64 returned -1");
		return (char *)pch64;
	}

	// need firstip if adding a rebuilt spider request
	if (m_useSpiderdb && m_useSecondaryRdbs) {
		int32_t *fip = getFirstIp();
		if (!fip || fip == (void *)-1) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, getFirstIp returned -1");
			return (char *)fip;
		}
	}

	// we need a spider reply so that the spider request will not be
	// re-added to the waiting tree; Spider.cpp ignores docid-based
	// recs that have spider replies
	SpiderReply *newsr = NULL;
	if (m_useSpiderdb) {
		newsr = getNewSpiderReply();
		if (!newsr || newsr == (void *)-1) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, getNewSpiderReply failed");
			return (char *)newsr;
		}
	}

	// the site hash for hashing
	int32_t *sh32 = getSiteHash32();
	if (!sh32 || sh32 == (int32_t *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getSiteHash32 failed");
		return (char *)sh32;
	}

	if (m_useLinkdb && !m_deleteFromIndex) {
		int32_t *linkSiteHashes = getLinkSiteHashes();
		if (!linkSiteHashes || linkSiteHashes == (void *)-1) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, getLinkSiteHashes failed");
			return (char *)linkSiteHashes;
		}
	}


	///////////
	//
	// BEGIN the diffbot json object index hack
	//
	// if we are using diffbot, then each json object in the diffbot reply
	// should be indexed as its own document.
	//
	///////////


	// i guess it is safe to do this after getting the spiderreply
	SafeBuf *spiderStatusDocMetaList = NULL;

	// get the spiderreply ready to be added to the rdbs w/ msg4
	// but if doing a rebuild operation then do not get it, we'll rebuild
	// it since it will have its own titlerec
	if (!m_useSecondaryRdbs) {
		spiderStatusDocMetaList = getSpiderStatusDocMetaList(newsr, forDelete);
		if (!spiderStatusDocMetaList) {
			log("build: ss doc metalist null. bad!");

			logTrace(g_conf.m_logTraceXmlDoc, "END, getSpiderStatusDocMetaList failed");
			return NULL;
		}
	}

	if (spiderStatusDocMetaList == (void *)-1) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, getSpiderStatusDocMetaList failed");
		return (char *)spiderStatusDocMetaList;
	}

	//
	// CAUTION
	//
	// We should never "block" after this point, lest the hashtables
	// we create get messed up.
	//

	//
	//
	// START HASHING
	//
	//

	// store what we hash into this table
	if ((m_pbuf || m_storeTermListInfo) && !m_wts) {
		// init it. the value is a TermInfo class. allowDups=true!
		m_wtsTable.set(12, sizeof(TermDebugInfo), 0, NULL, 0, true, "wts-tab");

		// point to it, make it active
		m_wts = &m_wtsTable;
	}

	// how much to alloc? compute an upper bound
	int32_t need = 0;
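	// the upper bound accumulated into "need" below covers, in order:
	// the spider status doc metalist, posdb keys from tt1, a clusterdb
	// key, linkdb keys from kt1, a doledb key, the SpiderReply, an
	// optional rebuilt/injected SpiderRequest, SpiderRequests for the
	// outlinks, new tagdb recs and finally the titledb rec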

	setStatus("hashing posdb terms");

	// . hash our documents terms into "tt1"
	// . hash the old document's terms into "tt2"
	// . by old, we mean the older versioned doc of this url spidered b4
	HashTableX tt1;

	// . prepare it, 5000 initial terms
	// . make it nw*4 (+5000) to avoid having to re-alloc the table!!!
	// . i guess we can have link and neighborhood text too! we don't
	//   count it here though... but add 5k for it...
	int32_t need4 = m_words.getNumWords() * 4 + 5000;
	if (m_usePosdb && addPosRec) {
		if (!tt1.set(18, 4, need4, NULL, 0, false, "posdb-indx")) {
			logTrace(g_conf.m_logTraceXmlDoc, "tt1.set failed");
			return NULL;
		}

		int32_t did = tt1.getNumSlots();
		// . hash the document terms into "tt1"
		// . this is a biggie!!!
		// . only hash ourselves if m_indexCode is false
		// . m_indexCode is non-zero if we should delete the doc from
		//   index
		// . i think this only adds to posdb
		// shit, this blocks which is bad!!!
		char *nod = hashAll(&tt1);

		// you can't block here because if we are re-called we lose tt1
		if (nod == (char *)-1) {
			g_process.shutdownAbort(true);
		}

		// error?
		if (!nod) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, hashAll failed");
			return NULL;
		}

		int32_t done = tt1.getNumSlots();
		if (done != did) {
			log(LOG_WARN, "xmldoc: reallocated big table! bad. old=%" PRId32" new=%" PRId32" nw=%" PRId32, did, done, m_words.getNumWords());
		}
	}

	// if indexing the spider reply as well under a different docid
	// there is no reason we can't toss it into our meta list here
	if (spiderStatusDocMetaList) {
		need += spiderStatusDocMetaList->length();
	}

	/// @todo ALC verify that we actually need sizeof(key128_t)
	// space for indexdb AND DATEDB! +2 for rdbids
	int32_t needPosdb = tt1.getNumUsedSlots() * (sizeof(posdbkey_t) + 2 + sizeof(key128_t));
	if (!forDelete) {
		// need 1 additional key for special key (with termid 0)
		needPosdb += sizeof(posdbkey_t) + 1;
	}

	need += needPosdb;

	// clusterdb keys. plus one for rdbId
	int32_t needClusterdb = addClusterRec ? 13 : 0;
	need += needClusterdb;

	// . LINKDB
	// . linkdb records. assume one per outlink
	// . we may index 2 16-byte keys for each outlink
	// if injecting, spideringLinks is false, but then we don't
	// add the links to linkdb, which causes the qainlinks() test to fail
	Links *nl2 = &m_links;

	// do not bother if deleting. but we do add simplified redirects
	// to spiderdb as SpiderRequests now.
	int32_t code = m_indexCode;
	if (code == EDOCSIMPLIFIEDREDIR || code == EDOCNONCANONICAL) {
		code = 0;
	}

	if (code) {
		nl2 = NULL;
	}

	// . set key/data size
	// . use a 16 byte key, not the usual 12
	// . use 0 for the data, since these are pure keys, which have no
	//   scores to accumulate
	HashTableX kt1;

	int32_t nis = 0;
	if (m_useLinkdb && nl2) {
		nis = nl2->getNumLinks() * 4;
	}

	// pre-grow table based on # outlinks
	// linkdb keys will have the same lower 4 bytes, so make hashing fast.
	// they are 28 byte keys. bytes 20-23 are the hash of the linkEE
	// so that will be the most random.
	kt1.set(sizeof(key224_t), 0, nis, NULL, 0, false, "link-indx", true, 20);

	// . we already have a Links::hash into the Termtable for links: terms,
	//   but this will have to be for adding to Linkdb. basically take a
	//   lot of it from Linkdb::fillLinkdbList()
	// . these return false with g_errno set on error
	if (m_useLinkdb && nl2 && !hashLinksForLinkdb(&kt1)) {
		logTrace(g_conf.m_logTraceXmlDoc, "END, hashLinksForLinkdb failed");
		return NULL;
	}

	// add up what we need. +1 for rdbId
	int32_t needLinkdb = kt1.getNumUsedSlots() * (sizeof(key224_t)+1);
	need += needLinkdb;

	// we add a negative key to doledb usually (include datasize now)
	int32_t needDoledb = forDelete ? 0 : (sizeof(key96_t) + 1);
	need += needDoledb;

	// for adding the SpiderReply to spiderdb (+1 for rdbId)
	int32_t needSpiderdb1 = forDelete ? 0 : (sizeof(SpiderReply) + 1);
	need += needSpiderdb1;

	// if injecting we add a spiderrequest to be able to update it
	// but don't do this if it is pagereindex. why is pagereindex
	// setting the injecting flag anyway?
	int32_t needSpiderdbRequest = 0;
	if (m_sreqValid && m_sreq.m_isInjecting && m_sreq.m_fakeFirstIp && !m_sreq.m_forceDelete) {
		// NO! because when injecting a warc and the subdocs
		// it contains, gb then tries to spider all of them !!! sux...
		needSpiderdbRequest = 0;
	} else if (m_useSpiderdb && m_useSecondaryRdbs) {
		// or if we are rebuilding spiderdb
		needSpiderdbRequest = sizeof(SpiderRequest) + m_firstUrl.getUrlLen() + 1;
	}
	need += needSpiderdbRequest;

	// . for adding our outlinks to spiderdb
	// . see SpiderRequest::getRecSize() for description
	// . SpiderRequest::getNeededSize() will include the null terminator
	int32_t needSpiderdb2 = 0;

	// don't need this if doing a consistency check,
	// nor for generating the delete meta list for incremental indexing.
	// the size covers the url buffer of outlinks too (which includes
	// the \0 terminators, i think)
	if (!m_doingConsistencyCheck && !forDelete) {
		needSpiderdb2 = (SpiderRequest::getNeededSize(0) * m_links.getNumLinks()) + m_links.getLinkBufLen();
	}

	need += needSpiderdb2;

	// the new tags for tagdb
	int32_t needTagdb = ntb ? ntb->length() : 0;
	need += needTagdb;

	//
	// . CHECKSUM PARSING CONSISTENCY TEST
	//
	// . set m_metaListChecksum member (will be stored in titleRec header)
	// . gotta set m_metaListCheckSum8 before making titleRec below
	// . also, if set from titleRec, verify metalist is the same!
	//
	if (!m_computedMetaListCheckSum) {
		// do not call twice!
		m_computedMetaListCheckSum = true;
		// all keys in tt1, ns1, kt1 and pt1
		int32_t ck32 = tt1.getKeyChecksum32();

		// set this before calling getTitleRecBuf() below
		uint8_t currentMetaListCheckSum8 = (uint8_t)ck32;
		// see if matches what was in old titlerec
		if (m_metaListCheckSum8Valid &&
		    // if we were set from a titleRec, see if we got
		    // a different hash of terms to index this time around...
		    m_setFromTitleRec &&
		    // fix for import log spam
		    !m_isImporting &&
		    m_metaListCheckSum8 != currentMetaListCheckSum8) {
			log(LOG_WARN, "xmldoc: checksum parsing inconsistency for %s (old)%i != %i(new). ",
			    m_firstUrl.getUrl(), (int)m_metaListCheckSum8, (int)currentMetaListCheckSum8);
			//tt1.print();
		}

		// assign the new one, getTitleRecBuf() call below needs this
		m_metaListCheckSum8 = currentMetaListCheckSum8;
		m_metaListCheckSum8Valid = true;
	}


	//
	// now that we've set all the ptr_* members vars, we can make
	// the title rec
	//

	// . add in title rec size
	// . should be valid because we called getTitleRecBuf() above
	// . this should include the key
	// . add in possible negative key for deleting old title rec
	// +1 for rdbId
	int32_t needTitledb = sizeof(key96_t) + 1;

	// . MAKE the title rec from scratch, that is all we need at this point
	// . if repairing and not rebuilding titledb, we do not need the titlerec
	if (m_useTitledb) {
		// this buf includes key/datasize/compressdata
		SafeBuf *tr = getTitleRecBuf();

		// panic if this blocks! it should not at this point because
		// we'd have to re-hash the crap above
		if (tr == (void *)-1) {
			g_process.shutdownAbort(true);
		}

		// return NULL with g_errno set on error
		if (!tr) {
			return (char *)tr;
		}

		// sanity check - if the valid title rec is null,
		// m_indexCode is set!
		if (tr->length() == 0 && !m_indexCode) {
			g_process.shutdownAbort(true);
		}

		if (addTitleRec && !forDelete) {
			needTitledb += m_titleRecBuf.length();
		}

		// then add it in
		need += needTitledb;

		// the titledb unlock key for msg12 in spider.cpp
		need += sizeof(key96_t);
	}

	// . alloc mem for metalist
	// . sanity
	if (m_metaListSize > 0) {
		g_process.shutdownAbort(true);
	}

	// make the buffer
	m_metaList = (char *)mmalloc(need, "metalist");
	if (!m_metaList) {
		return NULL;
	}

	// save size for freeing later
	m_metaListAllocSize = need;

	// ptr and boundary
	m_p = m_metaList;
	m_pend = m_metaList + need;
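	// from here on each section follows the same pattern: remember the
	// current write position in "saved", append an rdbId byte plus the
	// record bytes at m_p, then sanity check that the section stayed
	// within its per-rdb budget computed above and that the partial
	// metalist still passes verifyMetaList()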

	//
	// TITLEDB
	//
	setStatus ("adding titledb recs");

	// checkpoint
	char *saved = m_p;

	// . store title rec
	// . Repair.cpp might set useTitledb to false!
	if (m_useTitledb && addTitleRec) {
		// rdbId
		*m_p++ = m_useSecondaryRdbs ? RDB2_TITLEDB2 : RDB_TITLEDB;

		// sanity
		if (!m_titleRecBufValid) {
			g_process.shutdownAbort(true);
		}

		// key, dataSize, data is the whole rec
		// if getting an "oldList" to do incremental posdb updates
		// then do not include the data portion of the title rec
		int32_t tsize = (forDelete) ? sizeof(key96_t) : m_titleRecBuf.length();
		gbmemcpy ( m_p , m_titleRecBuf.getBufStart() , tsize );

		// Sanity. Shut down if data sizes are wrong.
		if( !forDelete) {
			Titledb::validateSerializedRecord( m_p, tsize );
		}
		else {
			logTrace(g_conf.m_logTraceXmlDoc, "Storing delete key for DocId=%" PRId64 "", m_docId);
		}

		m_p += tsize;
	}

	// sanity check
	if (m_p - saved > needTitledb) {
		g_process.shutdownAbort(true);
	}

	// sanity check
	verifyMetaList(m_metaList, m_p, forDelete);

	//
	// ADD BASIC POSDB TERMS
	//
	setStatus("adding posdb terms");

	// checkpoint
	saved = m_p;

	// store indexdb terms into m_metaList[]
	if (m_usePosdb) {
		if (!addTable144(&tt1, m_docId)) {
			logTrace(g_conf.m_logTraceXmlDoc, "END, addTable144 failed");
			return NULL;
		}

		/// @todo ALC we need to handle delete keys for other rdb types

		// we need to add delete key per document when it's deleted (with term 0)
		// we also need to add positive key per document when it's new
		// in case there is already a delete key in the tree/bucket (this will not be persisted and will be removed in Rdb::addRecord)
		// we don't need to do this if getMetaList is called to get negative keys
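		// i.e. the special termid-0 doc key is only added when index
		// membership flips: (m_isInIndex XOR m_wasInIndex)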
		if (!forDelete) {
			if ((m_isInIndex && !m_wasInIndex) || (!m_isInIndex && m_wasInIndex)) {
				char key[MAX_KEY_BYTES];

				int64_t docId;
				bool delKey = (!m_isInIndex);
				if (!m_isInIndex) {
					// deleted doc
					docId = *od->getDocId();
				} else {
					// new doc
					docId = *getDocId();
				}

				// add posdb doc key
				*m_p++ = m_useSecondaryRdbs ? RDB2_POSDB2 : RDB_POSDB;

				Posdb::makeDeleteDocKey(key, docId, delKey);
				memcpy(m_p, key, sizeof(posdbkey_t));
				m_p += sizeof(posdbkey_t);
			}
		}
	}

	// sanity check
	if (m_p - saved > needPosdb) {
		g_process.shutdownAbort(true);
	}

	// free all mem
	tt1.reset();

	// sanity check
	verifyMetaList(m_metaList, m_p, forDelete);


	//
	// ADD CLUSTERDB KEYS
	//
	setStatus("adding clusterdb keys");

	// checkpoint
	saved = m_p;

	// . do we have adult content?
	// . should already be valid!
	if (addClusterRec && !m_isAdultValid) {
		g_process.shutdownAbort(true);
	}

	// . store old only if new tr is good and keys are different from old
	// . now we store even if skipIndexing is true because i'd like to
	//   see how many titlerecs we have and count them towards the
	//   docsIndexed count...
	if (m_useClusterdb && addClusterRec) {
		// . get new clusterdb key
		// . we use the host hash for the site hash! hey, this is only 26 bits!
		key96_t newk = Clusterdb::makeClusterRecKey(*getDocId(), *getIsAdult(), *getLangId(), getHostHash32a(), false);

		// store rdbid
		*m_p = RDB_CLUSTERDB;

		// use secondary if we should
		if (m_useSecondaryRdbs) {
			*m_p = RDB2_CLUSTERDB2;
		}

		// skip
		m_p++;

		// and key
		*(key96_t *)m_p = newk;

		// skip it
		m_p += sizeof(key96_t);
	}

	// sanity check
	if (m_p - saved > needClusterdb) {
		g_process.shutdownAbort(true);
	}

	// sanity check
	verifyMetaList(m_metaList, m_p, forDelete);


	//
	// ADD LINKDB KEYS
	//
	setStatus("adding linkdb keys");

	// checkpoint
	saved = m_p;

	// add that table to the metalist (LINKDB)
	if (m_useLinkdb && addLinkInfo && !addTable224(&kt1)) {
		logTrace(g_conf.m_logTraceXmlDoc, "addTable224 failed");
		return NULL;
	}

	// sanity check
	if (m_p - saved > needLinkdb) {
		g_process.shutdownAbort(true);
	}

	// all done
	kt1.reset();

	// sanity check
	verifyMetaList(m_metaList, m_p, forDelete);


	//////
	//
	// add SPIDERREPLY BEFORE and SPIDERREQUEST!!!
	//
	// add spider reply first so we do not immediately respider
	// this same url if we were injecting it because no SpiderRequest
	// may have existed, and SpiderColl::addSpiderRequest() will
	// spawn a spider of this url again unless there is already a REPLY
	// in spiderdb!!! crazy...
	bool addReply = true;

	// save it
	saved = m_p;

	// now add the new rescheduled time
	if (m_useSpiderdb && addReply && !forDelete) {
		// note it
		setStatus("adding SpiderReply to spiderdb");

		// rdbid first
		*m_p++ = (m_useSecondaryRdbs) ? RDB2_SPIDERDB2 : RDB_SPIDERDB;

		// get this
		if (!m_srepValid) {
			g_process.shutdownAbort(true);
		}

		// store the spider rec
		int32_t newsrSize = newsr->getRecSize();
		gbmemcpy (m_p, newsr, newsrSize);
		m_p += newsrSize;

		m_addedSpiderReplySize = newsrSize;
		m_addedSpiderReplySizeValid = true;

		// sanity check - must not be a request, this is a reply
		if (Spiderdb::isSpiderRequest(&newsr->m_key)) {
			g_process.shutdownAbort(true);
		}

		// sanity check
		if (m_p - saved != needSpiderdb1) {
			g_process.shutdownAbort(true);
		}

		// sanity check
		verifyMetaList(m_metaList, m_p, forDelete);
	}


	// if we are injecting we must add the spider request
	// we are injecting from so the url can be scheduled to be
	// spidered again.
	// NO! because when injecting a warc and the subdocs
	// it contains, gb then tries to spider all of them !!! sux...
	if (needSpiderdbRequest) {
		// note it
		setStatus("adding spider request");

		// checkpoint
		saved = m_p;

		// store it here
		SpiderRequest revisedReq;

		// if doing a repair/rebuild of spiderdb...
		if (m_useSecondaryRdbs) {
			getRebuiltSpiderRequest(&revisedReq);
		} else {
			// this fills it in for doing injections
			getRevisedSpiderRequest(&revisedReq);

			// sanity log
			if (!m_firstIpValid) {
				g_process.shutdownAbort(true);
			}

			// sanity log
			if (m_firstIp == 0 || m_firstIp == -1) {
				const char *url = m_sreqValid ? m_sreq.m_url : "unknown";
				log(LOG_WARN, "build: error3 getting real firstip of %" PRId32" for %s. not adding new request.",
				    (int32_t)m_firstIp,url);
				goto skipNewAdd2;
			}
		}

		// copy it
		*m_p++ = (m_useSecondaryRdbs) ? RDB2_SPIDERDB2 : RDB_SPIDERDB;

		// store it back
		gbmemcpy (m_p, &revisedReq, revisedReq.getRecSize());

		// skip over it
		m_p += revisedReq.getRecSize();

		// sanity check
		if (m_p - saved > needSpiderdbRequest) {
			g_process.shutdownAbort(true);
		}

		m_addedSpiderRequestSize = revisedReq.getRecSize();
		m_addedSpiderRequestSizeValid = true;
	}

skipNewAdd2:

	//
	// ADD SPIDERDB RECORDS of outlinks
	//
	// - do this AFTER computing revdb since we do not want spiderdb recs
	//   to be in revdb.
	//
	setStatus("adding spiderdb keys");

	// checkpoint
	saved = m_p;

	// . should be fixed from Links::setRdbList
	// . we should contain the msge that msg16 uses!
	// . we were checking m_msg16.m_recycleContent, but i have not done
	//   that in years!!! MDW
	// . we were also checking if the # of banned outlinks >= 2, then
	//   we would not do this...
	// . should also add with a time of now plus 5 seconds so that if
	//   we spider an outlink, linkdb should be updated with this doc
	//   pointing to it so it can get link text then!!
	if (m_useSpiderdb && spideringLinks && nl2 && !m_doingConsistencyCheck && !forDelete) {
		logTrace( g_conf.m_logTraceXmlDoc, "Adding spiderdb records of outlinks" );

		// returns NULL and sets g_errno on error
		char *ret = addOutlinkSpiderRecsToMetaList();

		// sanity check
		if (!ret && !g_errno) {
			g_process.shutdownAbort(true);
		}

		// return NULL on error
		if (!ret) {
			logTrace(g_conf.m_logTraceXmlDoc, "addOutlinkSpiderRecsToMetaList failed");
			return NULL;
		}

		// this MUST not block down here, to avoid re-hashing above
		if (ret == (void *)-1) {
			g_process.shutdownAbort(true);
		}
	}

	// sanity check
	if (m_p - saved > needSpiderdb2) {
		g_process.shutdownAbort(true);
	}

	// sanity check
	verifyMetaList(m_metaList, m_p, forDelete);

	//
	// ADD TAG RECORDS TO TAGDB
	//

	// checkpoint
	saved = m_p;

	// . only do this if NOT setting from a title rec
	// . it might add a bunch of forced spider recs to spiderdb
	// . store into tagdb even if indexCode is set!
	if (m_useTagdb && ntb && !forDelete) {
		// ntb is a safebuf of Tags, which are already Rdb records
		// so just gbmemcpy them directly over
		gbmemcpy (m_p, ntb->getBufStart(), ntb->length());
		m_p += ntb->length();
	}

	// sanity check
	if (m_p - saved > needTagdb) {
		g_process.shutdownAbort(true);
	}

	// sanity check
	verifyMetaList(m_metaList, m_p, forDelete);

	//
	// ADD INDEXED SPIDER REPLY with different docid so we can
	// search index of spider replies! (NEW!)
	//
	// . index spider reply with separate docid so they are all searchable.
	// . see getSpiderStatusDocMetaList() function to see what we index
	//   and the titlerec we create for it
	if (spiderStatusDocMetaList) {
		gbmemcpy (m_p, spiderStatusDocMetaList->getBufStart(), spiderStatusDocMetaList->length());
		m_p += spiderStatusDocMetaList->length();
		m_addedStatusDocSize = spiderStatusDocMetaList->length();
		m_addedStatusDocSizeValid = true;
	}

	// shortcut
	saved = m_p;

	// sanity check
	if (m_p > m_pend || m_p < m_metaList) {
		g_process.shutdownAbort(true);
	}

	/////////////////
	//
	// INCREMENTAL INDEXING / INCREMENTAL UPDATING
	//
	// now prune/manicure the metalist to remove records that
	// were already added, and insert deletes for records that
	// changed since the last time. this is how we do deletes
	// now that we have revdb. this allows us to avoid
	// parsing inconsistency errors.
	//
	/////////////////
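
	// the approach below: (1) hash every key from the old meta list into
	// dt8, (2) rescan the new meta list and drop dataless keys that are
	// already in dt8 (removing them from dt8 as we go), then (3) append
	// whatever is left in dt8 as negative (delete) keys so records that
	// disappeared since last time get removed from their rdbs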

	if (oldList) {
		// point to start of the old meta list, the first and only
		// record in the oldList
		char *om = oldList;

		// the size
		int32_t osize = oldListSize;

		// the end
		char *omend = om + osize;
		int32_t needx = 0;

		HashTableX dt8;
		char dbuf8[34900];

		// value is the ptr to the rdbId/key in the oldList
		dt8.set(8, sizeof(char *), 2048, dbuf8, 34900, false, "dt8-tab");
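
		// the 8-byte keys are 64-bit hashes of the rdb keys; the value
		// is a pointer back into the old list so we can recover the
		// full rdbId+key later when emitting delete keys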

		// scan recs in that and hash them
		for (char *p = om; p < omend;) {
			// save this
			char byte = *p;
			char *rec = p;

			// get the rdbid for this rec
			rdbid_t rdbId = (rdbid_t)(byte & 0x7f);
			p++;

			// get the key size
			int32_t ks = getKeySizeFromRdbId(rdbId);

			// get that
			char *k = p;

			// unlike a real meta list, this meta list has
			// no data field, just rdbIds and keys only! because
			// we only use it for deleting, which only requires
			// a key and not the data
			p += ks;

			// tally this up in case we have to add the delete
			// version of this key back (add 1 for rdbId)
			needx += ks + 1;

			// hash the key (the old list is dataless, see above)
			// do not include discovery or lost dates in the linkdb key hash...
			uint64_t hk = (rdbId == RDB_LINKDB) ? hash64(k + 12, ks - 12) : hash64(k, ks);

			// sanity check
			if (rdbId == RDB_LINKDB && Linkdb::getLinkerDocId_uk((key224_t *)k) != m_docId) {
				g_process.shutdownAbort(true);
			}

			if (!dt8.addKey(&hk, &rec)) {
				logTrace(g_conf.m_logTraceXmlDoc, "addKey failed");
				return NULL;
			}
		}

		// also need all the new keys just to be sure, in case none
		// are already in the rdbs
		needx += (m_p - m_metaList);

		// now alloc for our new manicured metalist
		char *nm = (char *)mmalloc(needx, "newmeta");
		if (!nm) {
			logTrace(g_conf.m_logTraceXmlDoc, "mmalloc failed");
			return NULL;
		}

		char *nptr = nm;
		char *nmax = nm + needx;

		// scan each rec in the current meta list, see if its in either
		// the dt12 or dt16 hash table, if it already is, then
		// do NOT add it to the new metalist, nm, because there is
		// no need to.
		char *p    = m_metaList;
		char *pend = p + (m_p - m_metaList);
		for (; p < pend;) {
			// save it with the flag
			char byte = *p;

			// get rdbId
			rdbid_t rdbId = (rdbid_t)(byte & 0x7f);
			p++;

			// key size
			int32_t ks = getKeySizeFromRdbId(rdbId);

			// get key
			char *key = p;
			p += ks;

			// . if key is negative, no data is present
			// . the doledb key is negative for us here
			bool isDel = ((key[0] & 0x01) == 0x00);
			int32_t ds = isDel ? 0 : getDataSizeFromRdbId(rdbId);

			// if datasize variable, read it in
			if (ds == -1) {
				// get data size
				ds = *(int32_t *)p;

				// skip data size int32_t
				p += 4;
			}

			// point to data
			char *data = p;

			// skip data if not zero
			p += ds;

			// mix it up for hashtable speed
			// skip if for linkdb, we do that below
			uint64_t hk = (rdbId == RDB_LINKDB) ? hash64(key + 12, ks - 12) : hash64(key, ks);

			// was this key already in the "old" list?
			int32_t slot = dt8.getSlot(&hk);

			// see if already in an rdb, IFF dataless, otherwise
			// the keys might be the same but with different data!
			if (slot >= 0) {
				// remove from hashtable so we do not add it
				// as a delete key below
				dt8.removeSlot(slot);

				// but do add like a titledb rec that has the
				// same key, because its data is probably
				// different...
				// HACK: enable for now since we lost
				// the url:www.geico.com term somehow!!!
				// geico got deleted but not the title rec!!
				// MAKE SURE TITLEREC gets deleted then!!!
				if (ds == 0 && g_conf.m_doIncrementalUpdating) {
					// don't do incremental updating when using index file
					Rdb *rdb = getRdbFromId(rdbId);
					if (!rdb->isUseIndexFile()) {
						continue;
					}
				}
			}

			// ok, it is not already in an rdb, so add it
			*nptr++ = byte;

			// store key
			gbmemcpy ( nptr, key , ks );

			// skip over it
			nptr += ks;

			// store data
			if (ds) {
				// store data size
				*(int32_t *)nptr = ds;
				nptr += 4;

				gbmemcpy (nptr, data, ds);
				nptr += ds;
			}
		}
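
		// whatever is still in dt8 at this point existed in the old
		// list but not in the new one, so those keys get re-added as
		// negative (delete) keys below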

		// now scan dt8 and add their keys as del keys
		for ( int32_t i = 0 ; i < dt8.getNumSlots() ; i++ ) {
			// skip if empty
			if (!dt8.m_flags[i]) {
				continue;
			}

			// store rdbid first
			char *rec = *(char **)dt8.getValueFromSlot(i);

			// get rdbId (mask off the hi bit, which may be set)
			rdbid_t rdbId = (rdbid_t)(rec[0] & 0x7f);

			// key size
			int32_t ks = getKeySizeFromRdbId(rdbId);

			// sanity test - no negative keys
			if ((rec[1] & 0x01) == 0x00) {
				g_process.shutdownAbort(true);
			}

			// copy the rdbId byte and key
			gbmemcpy ( nptr , rec , 1 + ks );

			// skip over rdbid
			nptr++;

			// make it a negative key by clearing lsb
			*nptr = *nptr & 0xfe;

			// skip it
			nptr += ks;
		}

		// sanity. check for metalist breach
		if (nptr > nmax) {
			g_process.shutdownAbort(true);
		}

		// free the old meta list
		mfree(m_metaList, m_metaListAllocSize, "fm");

		// now switch over to the new one
		m_metaList = nm;
		m_metaListAllocSize = needx;
		m_p = nptr;
	}

	//
	// repeat this logic special for linkdb since we keep lost links
	// and may update the discovery date or lost date in the keys
	//
	// 1. hash keys of old linkdb keys into dt9 here
	// 2. do not hash the discovery/lost dates when making key hash for dt9
	// 3. scan keys in meta list and add directly into new meta list
	//    if not in dt9
	// 4. if in dt9 then add dt9 key instead
	// 5. remove dt9 keys as we add them
	// 6. then add remaining dt9 keys into meta list but with lost date
	//    set to now UNLESS it's already set
	//

	//
	// validate us!
	//
	m_metaListValid = true;

	// set the list size, different from the alloc size
	m_metaListSize = m_p - m_metaList;

	// sanity check
	verifyMetaList(m_metaList, m_metaList + m_metaListSize, forDelete);

	// all done
	logTrace(g_conf.m_logTraceXmlDoc, "END, all done");
	return m_metaList;
}

// . copy from old title rec to us to speed things up!
// . when the doc content is unchanged, just inherit crap from the old title
//   rec so we can make the spider reply in getNewSpiderReply()
void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
	// skip if none
	if ( ! od ) return;
	// skip if already did it
	if ( m_copied1 ) return;
	// do not repeat
	m_copied1 = true;
	// set these
	m_percentChanged      = 0;
	m_percentChangedValid = true;

	// copy over bit members
	m_contentHash32 = od->m_contentHash32;
	//m_tagHash32     = od->m_tagHash32;
	m_tagPairHash32 = od->m_tagPairHash32;
	m_httpStatus    = od->m_httpStatus;
	m_isRSS         = od->m_isRSS;
	m_isPermalink   = od->m_isPermalink;
	m_hopCount      = od->m_hopCount;
	m_crawlDelay    = od->m_crawlDelay;

	// do not forget the shadow members of the bit members
	m_isRSS2         = m_isRSS;
	m_isPermalink2   = m_isPermalink;

	// validate them
	m_contentHash32Valid = true;
	//m_tagHash32Valid     = true;
	m_tagPairHash32Valid = true;
	m_httpStatusValid    = true;
	m_isRSSValid         = true;
	m_isPermalinkValid   = true;
	m_hopCountValid      = true;
	m_crawlDelayValid    = true;

	m_langId        = od->m_langId;

	m_langIdValid       = true;

	// so get sitenuminlinks doesn't crash when called by getNewSpiderReply
	// because dns timed out. it timed out with EDNSTIMEDOUT before.
	// so overwrite it here...
	if ( m_ip == -1 || m_ip == 0 || ! m_ipValid ) {
		m_ip                  = od->m_ip;
		m_ipValid             = true;
		m_siteNumInlinks            = od->m_siteNumInlinks;
		m_siteNumInlinksValid = od->m_siteNumInlinksValid;
	}

	m_indexCode      = 0;//od->m_indexCode;
	m_indexCodeValid = true;

	// we need the link info too!
	ptr_linkInfo1  = od->ptr_linkInfo1;
	size_linkInfo1 = od->size_linkInfo1;

	// validate linkinfo
	if (ptr_linkInfo1 && ptr_linkInfo1->m_lisize != size_linkInfo1) {
		gbshutdownAbort(true);
	}

	if ( ptr_linkInfo1 && size_linkInfo1 ) m_linkInfo1Valid = true;
	else m_linkInfo1Valid = false;
}

// for adding a quick reply for EFAKEIP and for diffbot query reindex requests
SpiderReply *XmlDoc::getFakeSpiderReply ( ) {

	if ( ! m_tagRecValid ) {
		m_tagRec.reset();
		m_tagRecValid = true;
	}

	if ( ! m_siteHash32Valid ) {
		m_siteHash32 = 1;
		m_siteHash32Valid = true;
	}

	if ( ! m_downloadEndTimeValid ) {
		m_downloadEndTime = 0;
		m_downloadEndTimeValid = true;
	}

	if ( ! m_ipValid ) {
		m_ipValid = true;
		m_ip = atoip("1.2.3.4");
	}

	if ( ! m_spideredTimeValid ) {
		m_spideredTimeValid = true;
		m_spideredTime = getTimeGlobal();//0; use now!
	}

	// if doing diffbot query reindex
	// TODO: does this shard the request somewhere else???
	if ( ! m_firstIpValid ) {
		m_firstIp = m_ip;//atoip("1.2.3.4");
		m_firstIpValid = true;
	}

	// this was causing nsr to block and core below on a bad engineer
	// error loading the old title rec
	if ( ! m_isPermalinkValid ) {
		m_isPermalink = false;
		m_isPermalinkValid = true;
	}

	//if ( ! m_sreqValid ) {
	// 	m_sreqValid = true;
	// 	m_sreq.m_parentDocId = 0LL;
	// }


	// if error is EFAKEFIRSTIP, do not core
	//if ( ! m_isIndexedValid ) {
	//	m_isIndexed = false;
	//	m_isIndexedValid = true;
	//}

	// if this is EABANDONED or ECORRUPTDATA (corrupt gzip reply)
	// then this should not block. we need a spiderReply to release the
	// url spider lock in SpiderLoop::m_lockTable.
	// if m_isChildDoc is true, like for diffbot url, this should be
	// a bogus one.
	SpiderReply *nsr = getNewSpiderReply ();
	if ( nsr == (void *)-1) { g_process.shutdownAbort(true); }
	if ( ! nsr ) {
		log("doc: crap, could not even add spider reply "
		    "to indicate internal error: %s",mstrerror(g_errno));
		if ( ! g_errno ) g_errno = EBADENGINEER;
		//return true;
		return NULL;
	}

	return nsr;

	//if ( nsr->getRecSize() <= 1) { g_process.shutdownAbort(true); }

	//CollectionRec *cr = getCollRec();
	//if ( ! cr ) return true;
}

// getSpiderReply()
SpiderReply *XmlDoc::getNewSpiderReply ( ) {

	if ( m_srepValid ) return &m_srep;

	setStatus ( "getting spider reply" );

	// diffbot guys, robots.txt, frames, should not be here
	if ( m_isChildDoc ) { g_process.shutdownAbort(true); }

	// . get the mime first
	// . if we are setting XmlDoc from a titleRec, this causes
	//   doConsistencyCheck() to block and core
	//HttpMime *mime = getMime();
	//if ( ! mime || mime == (HttpMime *)-1 ) return (SpiderReply *)mime;

	// if we had a critical error, do not do this
	int32_t *indexCode = getIndexCode();
	if (! indexCode || indexCode == (void *)-1)
		return (SpiderReply *)indexCode;

	TagRec *gr = getTagRec();
	if ( ! gr || gr == (TagRec *)-1 ) return (SpiderReply *)gr;

	// can't call getIsPermalink() here without entering a dependency loop
	//char *pp = getIsUrlPermalinkFormat();
	//if ( !pp || pp == (char *)-1 ) return (SpiderReply *)pp;

	// the site hash
	int32_t *sh32 = getSiteHash32();
	if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SpiderReply *)sh32;

	int64_t *de = getDownloadEndTime();
	if ( ! de || de == (void *)-1 ) return (SpiderReply *)de;

	// shortcut
	Url *fu = NULL;
	// watch out for titlerec lookup errors for docid based spider reqs
	if ( m_firstUrlValid ) fu = getFirstUrl();

	// reset
	m_srep.reset();

	int32_t firstIp = -1;
	// inherit firstIp
	Tag *tag = m_tagRec.getTag("firstip");
	// tag must be there?
	if ( tag ) firstIp = atoip(tag->getTagData());

	// this is usually the authority
	if ( m_firstIpValid )
		firstIp = m_firstIp;

	// otherwise, inherit from oldsr to be safe
	// BUT NOT if it was a fakeip and we were injecting because
	// the SpiderRequest was manufactured and not actually taken
	// from spiderdb! see XmlDoc::injectDoc() because that is where
	// it came from!! if it has m_sreq.m_isAddUrl and
	// m_sreq.m_fakeFirstIp then we actually do add the reply with that
	// fake ip so that they will exist in the same shard.
	// BUT if it is docid based from PageReindex.cpp (a query reindex)
	// we set the injection bit and the pagereindex bit, we should let
	// these guys keep the firstip because the docid-based spider request
	// is in spiderdb. it needs to match up.
	if ( m_sreqValid && (!m_sreq.m_isInjecting||m_sreq.m_isPageReindex) )
		firstIp = m_sreq.m_firstIp;

	// sanity
	if ( firstIp == 0 || firstIp == -1 ) {
		if ( m_firstUrlValid )
			log("xmldoc: BAD FIRST IP for %s",m_firstUrl.getUrl());
		else
			log("xmldoc: BAD FIRST IP for %" PRId64,m_docId);
		firstIp = 12345;
		//g_process.shutdownAbort(true); }
	}
	// store it
	m_srep.m_firstIp = firstIp;
	// assume no error
	// MDW: not right...
	m_srep.m_errCount = 0;
	// otherwise, inherit from oldsr to be safe
	//if ( m_sreqValid )
	//	m_srep.m_firstIp = m_sreq.m_firstIp;

	// do not inherit this one, it MIGHT HAVE CHANGED!
	m_srep.m_siteHash32 = m_siteHash32;

	// need this for updating crawl delay table, m_cdTable in Spider.cpp
	if ( fu ) m_srep.m_domHash32  = getDomHash32();
	else      m_srep.m_domHash32  = 0;

	if ( ! m_tagRecValid               ) { g_process.shutdownAbort(true); }
	if ( ! m_ipValid                   ) { g_process.shutdownAbort(true); }
	if ( ! m_siteHash32Valid           ) { g_process.shutdownAbort(true); }
	//if ( ! m_spideredTimeValid         ) { g_process.shutdownAbort(true); }

	// . set other fields besides key
	// . crap! if we are the "qatest123" collection then m_spideredTime
	//   was read from disk usually and is way in the past! watch out!!
	m_srep.m_spideredTime = getSpideredTime();//m_spideredTime;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;


	// TODO: expire these when "ownershipchanged" tag is newer!!

	if ( gr->getTag ( "authorityinlink" ) )
		m_srep.m_hasAuthorityInlink = 1;
	// automatically valid either way
	m_srep.m_hasAuthorityInlinkValid = 1;

	int64_t uh48        = 0LL;
	// we might be a docid based spider request so fu could be invalid
	// if the titlerec lookup failed
	if ( fu ) uh48 = hash64b(fu->getUrl()) & 0x0000ffffffffffffLL;
	int64_t parentDocId = 0LL;
	if ( m_sreqValid )
		parentDocId = m_sreq.getParentDocId();

	// for docid based urls from PageReindex.cpp we have to make
	// sure to set the urlhash48 correctly from that.
	if ( m_sreqValid ) uh48 = m_sreq.getUrlHash48();

	// note it
	logDebug( g_conf.m_logDebugSpider, "xmldoc: uh48=%" PRIu64" parentdocid=%" PRIu64, uh48, parentDocId );

	// set the key, m_srep.m_key
	m_srep.setKey (  firstIp, parentDocId, uh48, false );
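
	// the reply key is built from the same firstIp, parent docid and
	// 48-bit url hash as the original request so the two can be matched
	// up in spiderdb (and in the lock table check below)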

	// . did we download a page? even if indexcode is set we might have
	// . if this is non-zero that means its valid
	if ( m_contentHash32Valid )
		m_srep.m_contentHash32 = m_contentHash32;

	// injecting the content (url implied)
	if ( m_contentInjected ) // m_sreqValid && m_sreq.m_isInjecting )
		m_srep.m_fromInjectionRequest = 1;

	// can be injecting a url too, content not necessarily implied
	if ( m_sreqValid && m_sreq.m_isInjecting )
		m_srep.m_fromInjectionRequest = 1;

	// were we already in titledb before we started spidering?
	m_srep.m_wasIndexed = m_wasInIndex;

	// note whether m_wasIndexed is valid because if it isn't then
	// we shouldn't be counting this reply towards the page counts.
	// if we never made it this far i guess we should not forcibly call
	// getIsIndexed() at this point so our performance is fast in case
	// this is an EFAKEFIRSTIP error or something similar where we
	// basically just add this reply and we're done.
	// NOTE: this also pertains to SpiderReply::m_isIndexed.
	m_srep.m_wasIndexedValid = m_wasInIndexValid;

	// assume no change
	m_srep.m_isIndexed = m_isInIndex;

	// we need to know if the m_isIndexed bit is valid or not
	// because sometimes like if we are being called directly from
	// indexDoc() because of an error situation, we do not know!
	if ( m_isInIndexValid ) m_srep.m_isIndexedINValid = false;
	else                    m_srep.m_isIndexedINValid = true;

	// likewise, we need to know if we deleted it so we can decrement the
	// quota count for this subdomain/host in SpiderColl::m_quotaTable
	//if ( m_srep.m_wasIndexed ) m_srep.m_isIndexed = true;

	// treat error replies special i guess, since langId, etc. will be
	// invalid
	if ( m_indexCode ) {
		// validate
		m_srepValid = true;
		// set these items if valid already, but don't bother
		// trying to compute them, since we are not indexing.
		if ( m_siteNumInlinksValid       ) {
			m_srep.m_siteNumInlinks = m_siteNumInlinks;
			m_srep.m_siteNumInlinksValid = true;
		}
		//if ( m_percentChangedValid )
		//	m_srep.m_percentChangedPerDay = m_percentChanged;
		if ( m_crawlDelayValid && m_crawlDelay >= 0 )
			m_srep.m_crawlDelayMS = m_crawlDelay;
		else
			m_srep.m_crawlDelayMS = -1;
		//if ( m_pubDateValid     ) m_srep.m_pubDate = m_pubDate;
		                          m_srep.m_pubDate = 0;
		if ( m_langIdValid      ) m_srep.m_langId = m_langId;
		if ( m_isRSSValid       ) m_srep.m_isRSS = m_isRSS;
		if ( m_isPermalinkValid ) m_srep.m_isPermalink =m_isPermalink;
		if ( m_httpStatusValid  ) m_srep.m_httpStatus = m_httpStatus;
		// stuff that is automatically valid
		m_srep.m_isPingServer = 0;
		if ( fu ) m_srep.m_isPingServer = (bool)fu->isPingServer();
		// this was replaced by m_contentHash32
		//m_srep.m_newRequests  = 0;
		m_srep.m_errCode      = m_indexCode;
		if ( m_downloadEndTimeValid )
			m_srep.m_downloadEndTime = m_downloadEndTime;
		else
			m_srep.m_downloadEndTime = 0;
		// is the original spider request valid?
		if ( m_sreqValid ) {
			// preserve the content hash in case m_indexCode is
			// EDOCUNCHANGED. so we can continue to get that
			// in the future. also, if we had the doc indexed,
			// just carry the contentHash32 forward for the other
			// errors like EDNSTIMEDOUT or whatever.
			m_srep.m_contentHash32 = m_sreq.m_contentHash32;
			// shortcuts
			SpiderReply   *n = &m_srep;
			SpiderRequest *o = &m_sreq;
			// more stuff
			n->m_hasAuthorityInlink = o->m_hasAuthorityInlink;
			n->m_isPingServer       = o->m_isPingServer;
			// the validator flags
			n->m_hasAuthorityInlinkValid =
				o->m_hasAuthorityInlinkValid;
			// get error count from original spider request
			int32_t newc = m_sreq.m_errCount;
			// inc for us, since we had an error
			newc++;
			// contain to one byte
			if ( newc > 255 ) newc = 255;
			// store in our spiderreply
			m_srep.m_errCount = newc;
		}
		// . and do not really consider this an error
		// . i don't want the url filters treating it as an error reply
		// . m_contentHash32 should have been carried forward from
		//   the block of code right above
		if ( m_indexCode == EDOCUNCHANGED ) {
			// we should have had a spider request, because that's
			// where we got the m_contentHash32 we passed to
			// Msg13Request.
			if ( ! m_sreqValid ) { g_process.shutdownAbort(true); }
			// make it a success
			m_srep.m_errCode = 0;
			// and no error count, it wasn't an error per se
			m_srep.m_errCount = 0;
			// call it 200
			m_srep.m_httpStatus = 200;
		}
		// copy flags and data from old doc...
		if ( m_indexCode == EDOCUNCHANGED &&
		     m_oldDocValid &&
		     m_oldDoc ) {
			//m_srep.m_pubDate        = m_oldDoc->m_pubDate;
			m_srep.m_pubDate        = 0;
			m_srep.m_langId         = m_oldDoc->m_langId;
			m_srep.m_isRSS          = m_oldDoc->m_isRSS;
			m_srep.m_isPermalink    = m_oldDoc->m_isPermalink;
			m_srep.m_siteNumInlinks = m_oldDoc->m_siteNumInlinks;
			// they're all valid
			m_srep.m_siteNumInlinksValid = true;
		}
		// do special things if
		return &m_srep;
	}

	// this will help us avoid hammering ips & respect same ip wait
	if ( ! m_downloadEndTimeValid ) { g_process.shutdownAbort(true); }
	m_srep.m_downloadEndTime      = m_downloadEndTime;

	// . if m_indexCode was 0, we are indexed then...
	// . this logic is now above
	//m_srep.m_isIndexed = 1;

	// get ptr to old doc/titlerec
	XmlDoc **pod = getOldXmlDoc ( );
	if ( ! pod || pod == (XmlDoc **)-1 ) return (SpiderReply *)pod;
	// this is non-NULL if it existed
	XmlDoc *od = *pod;

	// status is -1 if not found
	int16_t *hs = getHttpStatus ();
	if ( ! hs || hs == (void *)-1 ) return (SpiderReply *)hs;

	int32_t *sni = getSiteNumInlinks();
	if ( ! sni || sni == (int32_t *)-1 ) return (SpiderReply *)sni;

	float *pc = getPercentChanged();
	if ( ! pc || pc == (void *)-1 ) return (SpiderReply *)pc;

	// get the content type
	uint8_t *ct = getContentType();
	if ( ! ct ) return NULL;
	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 ) return (SpiderReply *)isRoot;



	uint8_t *langId = getLangId();
	if ( ! langId || langId == (uint8_t *)-1 )
		return (SpiderReply *)langId;

	char *isRSS   = getIsRSS();
	if ( ! isRSS || isRSS == (char  *)-1 )
		return (SpiderReply *)isRSS;

	char *pl = getIsPermalink();
	if ( ! pl || pl == (char *)-1 )
		return (SpiderReply *)pl;

	// this is only known if we downloaded the robots.txt...
	if ( od && m_recycleContent ) {
		m_crawlDelay = od->m_crawlDelay;
		m_crawlDelayValid = true;
	}

	// sanity checks
	//if(! m_sreqValid                ) { g_process.shutdownAbort(true); }
	if ( ! m_siteNumInlinksValid       ) { g_process.shutdownAbort(true); }
	if ( ! m_hopCountValid             ) { g_process.shutdownAbort(true); }
	if ( ! m_langIdValid               ) { g_process.shutdownAbort(true); }
	if ( ! m_isRSSValid                ) { g_process.shutdownAbort(true); }
	if ( ! m_isPermalinkValid          ) { g_process.shutdownAbort(true); }
	//if ( ! m_pageNumInlinksValid     ) { g_process.shutdownAbort(true); }
	if ( ! m_percentChangedValid       ) { g_process.shutdownAbort(true); }
	//if ( ! m_isSpamValid               ) { g_process.shutdownAbort(true); }
	//if ( ! m_crawlDelayValid           ) { g_process.shutdownAbort(true); }

	// httpStatus is -1 if not found (like for empty http replies)
	m_srep.m_httpStatus = *hs;

	// zero if none
	//m_srep.m_percentChangedPerDay = 0;
	// . only if had old one
	// . we use this in url filters to set the respider wait time usually
	if ( od ) {
		int32_t spideredTime = getSpideredTime();
		int32_t oldSpideredTime = od->getSpideredTime();
		// the timestamps are in seconds, so convert the delta to days
		float numDays = (spideredTime - oldSpideredTime) / 86400.0;
		// guard against a zero or negative interval
		if ( numDays > 0.0 )
			m_srep.m_percentChangedPerDay =
				(m_percentChanged+.5)/numDays;
	}

	// . update crawl delay, but we must store now as milliseconds
	//   because Spider.cpp like it better that way
	// . -1 implies crawl delay unknown or not found
	if ( m_crawlDelay >= 0 && m_crawlDelayValid )
		m_srep.m_crawlDelayMS = m_crawlDelay;
	else
		// -1 means invalid/unknown
		m_srep.m_crawlDelayMS = -1;

	// . we use this to store "bad" spider recs to keep from respidering
	//   a "bad" url over and over again
	// . it is up to the url filters whether they want to retry this
	//   again or not!
	// . TODO: how to represent "ETCPTIMEDOUT"????
	// . EUDPTIMEDOUT, EDNSTIMEDOUT, ETCPTIMEDOUT, EDNSDEAD, EBADIP,
	//   ENETUNREACH,EBADMIME,ECONNREFUED,ECHOSTUNREACH
	m_srep.m_siteNumInlinks       = m_siteNumInlinks;
	//m_srep.m_pubDate              = *pubDate;
	m_srep.m_pubDate              = 0;
	// this was replaced by m_contentHash32
	//m_srep.m_newRequests          = 0;
	m_srep.m_langId               = *langId;
	m_srep.m_isRSS                = (bool)*isRSS;
	m_srep.m_isPermalink          = (bool)*pl;
	// fu can be NULL for docid-based spider requests whose titlerec
	// lookup failed, so guard it like we do above
	m_srep.m_isPingServer         = fu ? (bool)fu->isPingServer() : false;
	//m_srep.m_isSpam             = m_isSpam;

	m_srep.m_siteNumInlinksValid = true;

	// validate all
	m_srep.m_hasAuthorityInlinkValid = 1;

	// a quick validation. reply must unlock the url from the lock table.
	// so the locks must be equal.
	if ( m_sreqValid &&
	     // we create a new spiderrequest if injecting with a fake firstip
	     // so it will fail this test...
	     ! m_sreq.m_isInjecting ) {
		int64_t lock1 = makeLockTableKey(&m_sreq);
		int64_t lock2 = makeLockTableKey(&m_srep);
		if ( lock1 != lock2 ) {
			log("build: lock1 != lock2 lock mismatch for %s",
			    m_firstUrl.getUrl());
			g_process.shutdownAbort(true);
		}
	}

	// validate
	m_srepValid = true;

	return &m_srep;
}

// . so Msg20 can see if we are banned now or not...
// . we must skip certain rules in getUrlFilterNum() when doing so for Msg20
//   because things like "parentIsRSS" can be both true or false since a url
//   can have multiple spider recs associated with it!
void XmlDoc::setSpiderReqForMsg20 ( SpiderRequest *sreq   ,
				    SpiderReply   *srep   ) {

	// sanity checks
	if ( ! m_ipValid                   ) { g_process.shutdownAbort(true); }
	if ( ! m_hopCountValid             ) { g_process.shutdownAbort(true); }
	if ( ! m_langIdValid               ) { g_process.shutdownAbort(true); }
	if ( ! m_isRSSValid                ) { g_process.shutdownAbort(true); }
	if ( ! m_isPermalinkValid          ) { g_process.shutdownAbort(true); }

	Url *fu = getFirstUrl();

	// reset
	sreq->reset();
	// assume not valid
	sreq->m_siteNumInlinks = -1;

	if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
	// how many site inlinks?
	sreq->m_siteNumInlinks       = m_siteNumInlinks;
	sreq->m_siteNumInlinksValid  = true;

	// set other fields besides key
	sreq->m_firstIp              = m_ip;
	sreq->m_hostHash32           = m_hostHash32a;
	sreq->m_hopCount             = m_hopCount;

	sreq->m_pageNumInlinks       = 0;//m_sreq.m_parentFirstIp;

	sreq->m_isAddUrl             = 0;//m_isAddUrl;
	sreq->m_isPingServer         = fu->isPingServer();
	//sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat;

	// transcribe from old spider rec, stuff should be the same
	sreq->m_addedTime          = m_firstIndexedDate;

	// validate the stuff so getUrlFilterNum() acks it
	sreq->m_hopCountValid = 1;

	srep->reset();

	srep->m_spideredTime         = getSpideredTime();//m_spideredTime;
	//srep->m_isSpam             = isSpam; // real-time update this!!!
	srep->m_isRSS                = m_isRSS;
	srep->m_isPermalink          = m_isPermalink;
	srep->m_httpStatus           = 200;
	//srep->m_retryNum           = 0;
	srep->m_langId               = m_langId;
	srep->m_percentChangedPerDay = 0;//m_percentChanged;

	// we need this now for ucp ucr upp upr new url filters that do
	// substring matching on the url
	if ( m_firstUrlValid )
		strcpy(sreq->m_url,m_firstUrl.getUrl());
}


// . add the spiderdb recs to the meta list
// . used by XmlDoc::setMetaList()
// . returns NULL and sets g_errno on error
// . otherwise returns the "new p"
// . Scraper.cpp, PageAddUrl.cpp and Msg7.cpp should all use the XmlDoc
//   class even if just adding links. they should make a fake html page and
//   "inject" it, with only m_useSpiderdb set to true...
char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {

	logTrace( g_conf.m_logTraceXmlDoc, "BEGIN" );

	if ( m_doingConsistencyCheck ) { g_process.shutdownAbort(true); }

	// do not do this if recycling content
	// UNLESS REBUILDING...
	if ( m_recycleContent && ! m_useSecondaryRdbs )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, rebuilding" );
		return (char *)0x01;
	}


	// for now skip in repair tool
	if ( m_useSecondaryRdbs && ! g_conf.m_rebuildAddOutlinks )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, in repair mode" );
		return (char *)0x01;
	}


	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getXml failed" );
		return (char *)xml;
	}

	Links *links = getLinks();
	if ( ! links || links == (Links *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getLinks failed" );
		return (char *)links;
	}

	char *spiderLinks = getSpiderLinks();
	if ( ! spiderLinks || spiderLinks == (char *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getSpiderLinks failed" );
		return (char *)spiderLinks;
	}

	TagRec ***grv = getOutlinkTagRecVector();
	if ( ! grv || grv == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getOutlinkTagRecVector failed" );
		return (char *)grv;
	}

	int32_t    **ipv = getOutlinkFirstIpVector();
	if ( ! ipv || ipv == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "getOutlinkFirstIpVector failed" );
		return (char *)ipv;
	}

	char     *ipi = getIsIndexed(); // is the parent indexed?
	if ( ! ipi || ipi == (char *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getIsIndexed failed" );
		return (char *)ipi;
	}

	// need this
	int32_t parentDomHash32 = getDomHash32();
	if ( parentDomHash32 != m_domHash32 ) { g_process.shutdownAbort(true); }

	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getIsSiteRoot failed" );
		return (char *)isRoot;
	}

	int32_t *psni = getSiteNumInlinks();
	if ( ! psni || psni == (int32_t *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getSiteNumInlinks failed" );
		return (char *)psni;
	}

	int32_t *pfip = getFirstIp();
	if ( ! pfip || pfip == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getFirstIp failed" );
		return (char *)pfip;
	}

	int64_t *d = getDocId();
	if ( ! d || d == (int64_t *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getDocId failed" );
		return (char *)d;
	}

	Url *fu = getFirstUrl();
	if ( ! fu || fu == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getFirstUrl failed" );
		return (char *)fu;
	}

	Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getCurrentUrl failed" );
		return (char *)cu;
	}

	uint8_t *langId = getLangId();
	if ( ! langId || langId == (uint8_t *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getLangId failed" );
		return (char *)langId;
	}

	// so linkSites[i] is site for link #i in Links.cpp class
	int32_t *linkSiteHashes = getLinkSiteHashes ( );
	if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getLinkSiteHashes failed" );
		return (char *)linkSiteHashes;
	}

	int8_t *hopCount = getHopCount();
	if ( ! hopCount || hopCount == (int8_t *)-1 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, getHopCount failed" );
		return (char *)hopCount;
	}

	XmlDoc  *nd  = this;

	bool    isParentRSS       = false;
	// PageAddUrl.cpp does not supply a valid new doc, so this is NULL
	if ( nd ) {
		isParentRSS       = *nd->getIsRSS() ;
	}

	int32_t n = links->m_numLinks;
	// return early if nothing to do. do not return NULL though cuz we
	// do not have g_errno set!
	if ( n <= 0 )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, no links to add (%" PRId32").", n);
		return (char *)0x01;
	}

	// sanity checks
	if ( ! m_ipValid             ) { g_process.shutdownAbort(true); }
	if ( ! m_domHash32Valid      ) { g_process.shutdownAbort(true); }
	if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
	if ( ! m_hostHash32aValid    ) { g_process.shutdownAbort(true); }
	if ( ! m_siteHash32Valid     ) { g_process.shutdownAbort(true); }
	if ( ! m_hopCountValid       ) { g_process.shutdownAbort(true); }
	//if ( ! m_spideredTimeValid   ) { g_process.shutdownAbort(true); }

	int64_t myUh48 = m_firstUrl.getUrlHash48();

	// . pre-allocate a buffer to hold the spider recs
	// . taken from SpiderRequest::store()
	int32_t size = 0;
	for ( int32_t i = 0 ; i < n ; i++ )
		size += SpiderRequest::getNeededSize ( links->getLinkLen(i) );
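
	// note: the metalist buffer was already sized for these recs by the
	// caller (see the needSpiderdb2 sanity check after this function
	// returns), so "size" here is just an upper-bound estimate; the
	// per-link check against m_pend below guards against overruns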

	// append spider recs to this list ptr
	char *p = m_p;

	// hash table to avoid dups
	HashTableX ht;
	char buf2[8192];
	if ( ! ht.set ( 4,0,1000,buf2 , 8192,false,"linkdedup" ) )
	{
		logTrace( g_conf.m_logTraceXmlDoc, "END, ht.set failed" );
		return NULL;
	}
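
	// the table holds 32-bit url hashes with no data; it only serves to
	// skip duplicate outlinks on this page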

	// count how many we add
	int32_t numAdded = 0;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) {
		logTrace( g_conf.m_logTraceXmlDoc, "END, getCollRec failed" );
		return NULL;
	}

	bool avoid = false;

	// if this is a simplified redir and we should not be spidering
	// links then turn it off as well! because we now add simplified
	// redirects back into spiderdb using this function.
	if ( m_spiderLinksValid && ! m_spiderLinks )
		avoid = true;

	logTrace( g_conf.m_logTraceXmlDoc, "Handling %" PRId32" links", n);

	//
	// serialize each link into the metalist now
	//
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// grab our info
		TagRec *gr        = (*grv)[i];
		int32_t    firstIp   = (*ipv)[i];

		// ip lookup failed? do not add to spiderdb then
		if ( firstIp == 0 || firstIp == -1 ) continue;

		// get flags
		linkflags_t flags = links->m_linkFlags[i];

		// . skip if we are rss page and this link is an <a href> link
		// . we only harvest <link> urls from rss feeds, not href links
		// . or in the case of feedburner, those orig tags
		if ( isParentRSS && (flags & LF_AHREFTAG) ) continue;

		// if we have a <feedburner:origLink> tag, then ignore <link>
		// tags and only get the links from the original links
		if ( links->m_isFeedBurner && !(flags & LF_FBTAG) ) continue;

		// do not add self links, pointless
		if ( flags & LF_SELFLINK ) continue;

		// do not add if no follow
		if ( flags & LF_NOFOLLOW ) continue;

		// point to url
		char *s    = links->getLinkPtr(i);
		int32_t  slen = links->getLinkLen(i);

		// get hash
		int32_t uh = hash32 ( s , slen );

		// it does not like keys of 0, that means empty slot
		if ( uh == 0 ) uh = 1;

		// skip if dup
		if ( ht.isInTable ( &uh ) ) continue;

		// add it, returns false and sets g_errno on error
		if ( ! ht.addKey ( &uh ) ) return NULL;

		// we now support HTTPS
		if ( strncmp(s,"http://",7) && strncmp(s,"https://",8) )
			continue;

		// . do not add if "old"
		// . Links::set() calls flagOldOutlinks()
		// . that just means we probably added it the last time
		//   we spidered this page
		// . no cuz we might have a different siteNumInlinks now
		//   and maybe this next hop count is now allowed where as
		//   before it was not!
		//if ( flags & LF_OLDLINK ) continue;

		Url url;
		url.set( s, slen );

		// if hostname length is <= 2 then SILENTLY reject it
		if ( url.getHostLen() <= 2 ) continue;

		// BR 20160125: Do not create spiderdb entries for media URLs etc.
		if(	url.hasNonIndexableExtension(TITLEREC_CURRENT_VERSION) ||
			url.hasScriptExtension() ||
			url.hasJsonExtension() ||
//			url.hasXmlExtension() ||
			g_urlBlockList.isUrlBlocked(url))
		{
			logTrace( g_conf.m_logTraceXmlDoc, "Unwanted for indexing [%s]", url.getUrl());
			continue;
		}

		// get # of inlinks to this site... if recorded...
		int32_t ksni = -1;
		Tag *st = NULL;
		if ( gr ) st = gr->getTag ("sitenuminlinks");
		if ( st ) ksni = atol(st->getTagData());

		int32_t hostHash32   = url.getHostHash32();
		// . consult our sitelinks.txt file
		// . returns -1 if not found
		int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );

		// try with www if not there
		if ( min < 0 && ! url.hasSubdomain() ) {
			int32_t wwwHash32 = url.getHash32WithWWW();
			min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
		}

		if ( min >= 0 && ksni < min )
			ksni = min;

		// get this
		bool issiteroot = isSiteRootFunc3 ( s , linkSiteHashes[i] );

		// get it quick
		bool ispingserver = url.isPingServer();
		int32_t domHash32    = url.getDomainHash32();

		// is link rss?
		bool isRSSExt = false;
		const char *ext = url.getExtension();
		if ( ext ) {
			if ( strcasecmp( ext, "rss" ) == 0 ) {
				isRSSExt = true;
			} else if ( strcasecmp( ext, "xml" ) == 0 ) {
				isRSSExt = true;
			} else if ( strcasecmp( ext, "atom" ) == 0 ) {
				isRSSExt = true;
			}
		}

		logTrace( g_conf.m_logTraceXmlDoc, "link is RSS [%s]", isRSSExt?"true":"false");

		// make the spider request rec for it
		SpiderRequest ksr;

		// to defaults (zero out)
		ksr.reset();

		// set other fields besides key
		ksr.m_firstIp          = firstIp;
		ksr.m_hostHash32       = hostHash32;
		ksr.m_domHash32        = domHash32;
		ksr.m_siteHash32       = linkSiteHashes[i];//siteHash32;
		ksr.m_siteNumInlinks   = ksni;
		ksr.m_siteNumInlinksValid = true;
		ksr.m_isRSSExt            = isRSSExt;

		// hop count is now 16 bits so do not wrap that around
		int32_t hc = m_hopCount + 1;
		if ( hc > 65535 ) hc = 65535;
		ksr.m_hopCount         = hc;

		// keep hopcount the same for redirs
		if ( m_indexCodeValid &&
		     ( m_indexCode == EDOCSIMPLIFIEDREDIR || m_indexCode == EDOCNONCANONICAL ) ) {
			ksr.m_hopCount = m_hopCount;
		}

		if ( issiteroot   ) ksr.m_hopCount = 0;
		if ( ispingserver ) ksr.m_hopCount = 0;

		// validate it
		ksr.m_hopCountValid = true;

		ksr.m_addedTime        = getSpideredTime();//m_spideredTime;
		//ksr.m_lastAttempt    = 0;
		//ksr.m_errCode        = 0;

		ksr.m_pageNumInlinks   = 0;

		// get this
		bool isupf = ::isPermalink(NULL,&url,CT_HTML,NULL,isRSSExt);
		// set some bit flags. the rest are 0 since we call reset()
		if ( isupf        ) ksr.m_isUrlPermalinkFormat = 1;
		//if ( isIndexed    ) ksr.m_isIndexed          = 1;
		if ( ispingserver ) ksr.m_isPingServer         = 1;

		// is it like www.xxx.com/* (does not include www.xxx.yyy.com)
		// includes xxx.com/* however
		ksr.m_isWWWSubdomain = url.isSimpleSubdomain();


		// if parent is a root of a popular site, then it is considered
		// an authority linker.  (see updateTagdb() function above)

		//@todo BR: This is how site authority is decided. Improve?
		// the mere existence of authorityinlink tag is good
		if ( ( *isRoot && *psni >= 500 ) || ( gr->getTag("authorityinlink") ) ) {
			ksr.m_hasAuthorityInlink = 1;
		}
		ksr.m_hasAuthorityInlinkValid = true;

		// this is used for building dmoz. we just want to index
		// the urls in dmoz, not their outlinks.
		if ( avoid  ) ksr.m_avoidSpiderLinks = 1;

		// . if this is the 2nd+ time we were spidered and this outlink
		//   wasn't there last time, then set this!
		// . if this is the first time spidering this doc then set it
		//   to zero so that m_minPubDate is set to -1 when the outlink
		//   defined by "ksr" is spidered.
		if ( m_oldDocValid && m_oldDoc ) {
			int32_t oldSpideredTime = m_oldDoc->getSpideredTime();
			ksr.m_parentPrevSpiderTime = oldSpideredTime;
		} else {
			ksr.m_parentPrevSpiderTime = 0;
		}

		//
		// . inherit manual add bit if redirecting to simplified url
		// . so we always spider seed url even if prohibited by
		//   the regex, and even if it simplified redirects
		//
		if ( m_indexCodeValid &&
		     ( m_indexCode == EDOCSIMPLIFIEDREDIR || m_indexCode == EDOCNONCANONICAL ) &&
		     m_sreqValid ) {
			if ( m_sreq.m_isInjecting )
				ksr.m_isInjecting = 1;
			if ( m_sreq.m_isAddUrl )
				ksr.m_isAddUrl = 1;
		}

		// copy the url into SpiderRequest::m_url buffer
		strcpy(ksr.m_url,s);

		// this must be valid
		if ( ! m_docIdValid ) { g_process.shutdownAbort(true); }

		// set the key, ksr.m_key. isDel = false
		ksr.setKey ( firstIp, *d , false );
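
		// the request key embeds the outlink's firstIp, our docid as
		// the parent docid, and the 48-bit hash of the outlink url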

		// we were hopcount 0, so if we link to ourselves we override
		// our original hopcount of 0 with this guy that has a
		// hopcount of 1. that sux... so don't do it.
		if ( ksr.getUrlHash48() == myUh48 ) continue;

		// . technically speaking we do not have any reply so we
		//   should not be calling this! cuz we don't have all the info
		// . see if banned or filtered, etc.
		// . at least try to call it. getUrlFilterNum() should
		//   break out and return -1 if it encounters a filter rule
		//   that it does not have enough info to answer.
		//   so if your first X filters all map to a "FILTERED"
		//   priority and this url matches one of them we can
		//   confidently toss this guy out.
		// . show this for debugging!
		// int32_t ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime ,
		// 			       false, m_niceness, cr,
		// 			       false,//true , // outlink?
		// 			       NULL ); // quotatable
		// logf(LOG_DEBUG,"build: ufn=%" PRId32" for %s",
		//      ufn,ksr.m_url);

		// bad?
		//if ( ufn < 0 ) {
		//	log("build: link %s had bad url filter."
		//	    , ksr.m_url );
		//	g_errno = EBADENGINEER;
		//	return NULL;
		//}

		// debug
		if ( g_conf.m_logDebugUrlAttempts ) {
			// print the tag rec out into sb2
			SafeBuf sb2;
			if ( gr ) gr->printToBuf ( &sb2 );
			// get it
			//SafeBuf sb1;
			const char *action = "add";
			logf(LOG_DEBUG,
			     "spider: attempting to %s link. "
			     "%s "
			     "tags=%s "
			     "onpage=%s"
			     ,
			     action ,
			     ksr.m_url,
			     //sb1.getBufStart(),
			     sb2.getBufStart(),
			     m_firstUrl.getUrl());
		}

		// serialize into the buffer
		int32_t need = ksr.getRecSize();

		// sanity check
		if ( p + 1 + need > m_pend ) { g_process.shutdownAbort(true); }
		// store the rdbId
		if ( m_useSecondaryRdbs ) *p++ = RDB2_SPIDERDB2;
		else                      *p++ = RDB_SPIDERDB;

		// store the spider rec
		gbmemcpy ( p , &ksr , need );
		// skip it
		p += need;
		// count it
		numAdded++;
	}

	logTrace( g_conf.m_logTraceXmlDoc, "Added %" PRId32" links", numAdded);

	// save it
	m_numOutlinksAdded      = numAdded;
	m_numOutlinksAddedValid = true;

	// update end of list once we have successfully added all spider recs
	m_p = p;

	// return current ptr
	logTrace( g_conf.m_logTraceXmlDoc, "END, all done." );
	return m_p ;
}

int32_t XmlDoc::getSiteRank ( ) {
	if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }
	return ::getSiteRank ( m_siteNumInlinks );
}

// . add keys/recs from the table into the metalist
// . we store the keys into "m_p" unless "buf" is given
bool XmlDoc::addTable144 ( HashTableX *tt1 , int64_t docId , SafeBuf *buf ) {

	// sanity check
	if ( tt1->getNumSlots() ) {
		if ( tt1->getKeySize() != sizeof(key144_t) ) {g_process.shutdownAbort(true);}
		if ( tt1->getDataSize() != 4                ) {g_process.shutdownAbort(true);}
	}

	// assume we are storing into m_p
	char *p = m_p;

	// reserve space if we had a safebuf and point into it if there
	if ( buf ) {
		int32_t slotSize = (sizeof(key144_t)+2+sizeof(key128_t));
		int32_t need = tt1->getNumUsedSlots() * slotSize;
		if ( ! buf->reserve ( need ) ) return false;
		// get cursor into buf, NOT START of buf
		p = buf->getBufStart();
	}

	int32_t siteRank = getSiteRank ();

	if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }

	rdbid_t rdbId = RDB_POSDB;
	if ( m_useSecondaryRdbs ) rdbId = RDB2_POSDB2;

	// store terms from "tt1" table
	for ( int32_t i = 0 ; i < tt1->getNumSlots() ; i++ ) {
		// skip if empty
		if ( tt1->m_flags[i] == 0 ) continue;
		// get its key
		char *kp = (char *)tt1->getKeyFromSlot( i );
		// store rdbid
		*p++ = rdbId; // (rdbId | f);
		// store it as is
		gbmemcpy ( p , kp , sizeof(key144_t) );
		// the docid was zero when we added these keys to the table, so set it now
		Posdb::setDocIdBits ( p , docId );
		// if this is a numeric field we do not want to set
		// the siterank or langid bits because it will mess up
		// sorting by the float which is basically in the position
		// of the word position bits.
		if ( Posdb::isAlignmentBitClear ( p ) ) {
			// make sure it is set again. it was just cleared
			// to indicate that this key contains a float
			// like a price or something, and we should not
			// set siterank or langid so that its termlist
			// remains sorted just by that float
			Posdb::setAlignmentBit ( p , 1 );
		}
		// otherwise, set the siterank and langid
		else {
			// this too
			Posdb::setSiteRankBits ( p , siteRank );
			// set language here too
			Posdb::setLangIdBits ( p , m_langId );
		}
		// advance over it
		p += sizeof(key144_t);
	}

	// all done
	if ( ! buf ) { m_p = p; return true; }

	// update safebuf otherwise
	char *start = buf->getBufStart();
	// fix SafeBuf::m_length
	buf->setLength ( p - start );
	// sanity
	if ( buf->length() > buf->getCapacity() ) { g_process.shutdownAbort(true); }

	return true;
}

// add keys/recs from the table into the metalist
bool XmlDoc::addTable224 ( HashTableX *tt1 ) {

	// sanity check
	if ( tt1->getNumSlots() ) {
		if ( tt1->getKeySize() != sizeof(key224_t) ) {g_process.shutdownAbort(true);}
		if ( tt1->getDataSize() != 0                ) {g_process.shutdownAbort(true);}
	}

	rdbid_t rdbId = RDB_LINKDB;
	if ( m_useSecondaryRdbs ) rdbId = RDB2_LINKDB2;

	// store terms from "tt1" table
	for ( int32_t i = 0 ; i < tt1->getNumSlots() ; i++ ) {
		// skip if empty
		if ( tt1->m_flags[i] == 0 ) continue;
		// get its key
		char *kp = (char *)tt1->getKeyFromSlot( i );
		// store rdbid
		*m_p++ = rdbId; // (rdbId | f);
		// store it as is
		gbmemcpy ( m_p , kp , sizeof(key224_t) );
		// advance over it
		m_p += sizeof(key224_t);
	}
	return true;
}

// . this is kinda hacky because it uses a short XmlDoc on the stack
// . no need to hash this stuff for regular documents since all the terms
//   are fielded by gberrorstr, gberrornum or gbisreply.
// . normally we might use a separate xmldoc class for this but i wanted
//   something more lightweight
SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply, bool forDelete ) {

	// set status for this
	setStatus ( "getting spider reply meta list");

	if ( m_spiderStatusDocMetaListValid )
		return &m_spiderStatusDocMetaList;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	if ( ! cr->m_indexSpiderReplies || forDelete ) {
		m_spiderStatusDocMetaListValid = true;
		return &m_spiderStatusDocMetaList;
	}

	// if docid based do not hash a spider reply. docid-based spider
	// requests are added to spiderdb from the query reindex tool.
	// do not do for diffbot subdocuments either, usespiderdb should be
	// false for those.
	// MDW: i disagree, i want to see when these get updated! 9/6/2014
	// ok, let's index for diffbot objects so we can see if they are
	// a dup of another diffbot object, or so we can see when they get
	// revisted, etc.
	//if ( m_setFromDocId || ! m_useSpiderdb ) {
	if ( ! m_useSpiderdb ) {
		m_spiderStatusDocMetaListValid = true;
		return &m_spiderStatusDocMetaList;
	}

	// do not add a status doc if doing a query delete on a status doc
	if ( m_contentTypeValid && m_contentType == CT_STATUS ) {
		m_spiderStatusDocMetaListValid = true;
		return &m_spiderStatusDocMetaList;
	}

	// doing it for diffbot throws off smoketests
	// ok, smoketests are updated now, so remove this
	// if ( strncmp(cr->m_coll,"crawlbottesting-",16) == 0 ) {
	// 	m_spiderStatusDocMetaListValid = true;
	// 	return &m_spiderStatusDocMetaList;
	// }

	// we double add regular html urls in a query reindex because the
	// json url adds the parent, so the parent gets added twice sometimes,
	// and for some reason it is adding a spider status doc the 2nd time
	// so cut that out. this is kinda a hack b/c i'm not sure what's
	// going on. but you can set a break point here and see what's up if
	// you want.
	// MDW: likewise, take this out, i want these recorded as well..
	// if ( m_indexCodeValid && m_indexCode == EDOCFORCEDELETE ) {
	// 	m_spiderStatusDocMetaListValid = true;
	// 	return &m_spiderStatusDocMetaList;
	// }

	// . fake this out so we do not core
	// . hashWords3() uses it i guess
	bool forcedLangId = false;
	if ( ! m_langIdValid ) {
		forcedLangId = true;
		m_langIdValid = true;
		m_langId = langUnknown;
	}

	// prevent more cores
	bool forcedSiteNumInlinks = false;
	if ( ! m_siteNumInlinksValid ) {
		forcedSiteNumInlinks = true;
		m_siteNumInlinks = 0;
		m_siteNumInlinksValid = true;
	}

	SafeBuf *mbuf = getSpiderStatusDocMetaList2 ( reply );

	if ( forcedLangId )
		m_langIdValid = false;

	if ( forcedSiteNumInlinks ) {
		m_siteNumInlinksValid = false;
	}

	return mbuf;
}

// . the spider status doc
// . TODO:
//   usedProxy:1
//   proxyIp:1.2.3.4
SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply1 ) {

	setStatus ( "making spider reply meta list");

	// . we also need a unique docid for indexing the spider *reply*
	//   as a separate document
	// . use the same url, but use a different docid.
	// . use now to mix it up
	//int32_t now = getTimeGlobal();
	//int64_t h = hash64(m_docId, now );
	// to keep qa test consistent this docid should be consistent
	// so base it on spidertime of parent doc.
	// if doc is being force deleted then this is invalid!
	//if ( ! m_spideredTimeValid ) { g_process.shutdownAbort(true); }
	int64_t h = hash64(m_docId, m_spideredTime );
	// mask it out
	int64_t d = h & DOCID_MASK;
	// try to get an available docid, preferring "d" if available
	int64_t *uqd = getAvailDocIdOnly ( d );
	if ( ! uqd || uqd == (void *)-1 ) return  (SafeBuf *)uqd;

	// unsigned char *hc = (unsigned char *)getHopCount();
	// if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc;

	int32_t tmpVal = -1;
	int32_t *priority = &tmpVal;
	int32_t *ufn = &tmpVal;

	// prevent a core if sreq is not valid, these will freak out
	// diffbot replies may not have a valid m_sreq
	if ( m_sreqValid ) {
		priority = getSpiderPriority();
		if ( ! priority || priority == (void *)-1 )
			return (SafeBuf *)priority;

		ufn = getUrlFilterNum();
		if ( ! ufn || ufn == (void *)-1 )
			return (SafeBuf *)ufn;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;


	// sanity
	if ( ! m_indexCodeValid ) { g_process.shutdownAbort(true); }

	// why isn't gbhopcount: being indexed consistently?
	//if ( ! m_hopCountValid )  { g_process.shutdownAbort(true); }

	// reset just in case
	m_spiderStatusDocMetaList.reset();

	// sanity
	if ( *uqd <= 0 || *uqd > MAX_DOCID ) {
		log("xmldoc: avail docid = %" PRId64". could not index spider "
		    "reply or %s",*uqd,m_firstUrl.getUrl());
		//g_process.shutdownAbort(true); }
		m_spiderStatusDocMetaListValid = true;
		return &m_spiderStatusDocMetaList;
	}

	// the old doc
	XmlDoc *od = NULL;
	if ( m_oldDocValid && m_oldDoc ) od = m_oldDoc;

	Url *fu = &m_firstUrl;

	// . make a little json doc that we'll hash up
	// . only index the fields in this doc, no extra gbdocid: inurl:
	//   hash terms
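	// the resulting doc looks roughly like this (values illustrative):
	//   { "type":"status",
	//     "gbssUrl":"http://example.com/",
	//     "gbssStatusCode":0, "gbssStatusMsg":"...",
	//     "gbssHttpStatus":200, "gbssDocId":1234, ... }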
	SafeBuf jd;
	jd.safePrintf("{\n");

	// so type:status query works
	jd.safePrintf("\"type\":\"status\",\n");

	jd.safePrintf("\"gbssUrl\":\"%s\",\n" , fu->getUrl()  );

	if ( ptr_redirUrl )
		jd.safePrintf("\"gbssFinalRedirectUrl\":\"%s\",\n",
			      ptr_redirUrl);

	if ( m_indexCodeValid ) {
		jd.safePrintf("\"gbssStatusCode\":%i,\n",(int)m_indexCode);
		jd.safePrintf("\"gbssStatusMsg\":\"");
		jd.jsonEncode (mstrerror(m_indexCode));
		jd.safePrintf("\",\n");
	}
	else {
		jd.safePrintf("\"gbssStatusCode\":-1,\n");
		jd.safePrintf("\"gbssStatusMsg\":\"???\",\n");
	}


	if ( m_httpStatusValid )
		jd.safePrintf("\"gbssHttpStatus\":%" PRId32",\n",
			      (int32_t)m_httpStatus);

	// do not index gbssIsSeedUrl:0 because there will be too many usually
	bool isSeed = ( m_sreqValid && m_sreq.m_isAddUrl );
	if ( isSeed )
		jd.safePrintf("\"gbssIsSeedUrl\":1,\n");

	if ( od )
		jd.safePrintf("\"gbssWasIndexed\":1,\n");
	else
		jd.safePrintf("\"gbssWasIndexed\":0,\n");

	int32_t now = getTimeGlobal();
	if ( od )
		jd.safePrintf("\"gbssAgeInIndex\":"
			      "%" PRIu32",\n",now - od->m_spideredTime);

	jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");

	jd.safePrintf("\"gbssDomain\":\"");
	jd.safeMemcpy(fu->getDomain(), fu->getDomainLen() );
	jd.safePrintf("\",\n");

	jd.safePrintf("\"gbssSubdomain\":\"");
	jd.safeMemcpy(fu->getHost(), fu->getHostLen() );
	jd.safePrintf("\",\n");

	//if ( m_redirUrlPtr && m_redirUrlValid )
	//if ( m_numRedirectsValid )
	jd.safePrintf("\"gbssNumRedirects\":%" PRId32",\n",m_numRedirects);

	if ( m_docIdValid )
		jd.safePrintf("\"gbssDocId\":%" PRId64",\n", m_docId);//*uqd);

	if ( m_hopCountValid )
		//jd.safePrintf("\"gbssHopCount\":%" PRId32",\n",(int32_t)*hc);
		jd.safePrintf("\"gbssHopCount\":%" PRId32",\n",(int32_t)m_hopCount);

	// for -diffbotxyz fake docs addedtime is 0
	if ( m_sreqValid && m_sreq.m_discoveryTime != 0 ) {
		// in Spider.cpp we try to set m_sreq's m_addedTime to the
		// minimum over all the spider requests, and when deduping we
		// try to preserve the request with the oldest time. we now
		// use m_discoveryTime here instead, since m_addedTime is
		// used by the url filters as originally intended.
		jd.safePrintf("\"gbssDiscoveredTime\":%" PRId32",\n",
			      m_sreq.m_discoveryTime);
	}

	if ( m_isDupValid && m_isDup )
		jd.safePrintf("\"gbssDupOfDocId\":%" PRId64",\n",
			      m_docIdWeAreADupOf);

	// how many spiderings were successful vs. failed.
	// these counts don't work because we only store one reply,
	// which overwrites any older reply; that is how the key is
	// constructed. we could later change makeKey() for spider
	// replies to key on the timestamp instead of the parent docid.
	// if ( m_sreqValid ) {
	// 	jd.safePrintf("\"gbssPrevTotalNumIndexAttempts\":%" PRId32",\n",
	// 		      m_sreq.m_reservedc1 + m_sreq.m_reservedc2 );
	// 	jd.safePrintf("\"gbssPrevTotalNumIndexSuccesses\":%" PRId32",\n",
	// 		      m_sreq.m_reservedc1);
	// 	jd.safePrintf("\"gbssPrevTotalNumIndexFailures\":%" PRId32",\n",
	// 		      m_sreq.m_reservedc2);
	// }

	if ( m_spideredTimeValid )
		jd.safePrintf("\"gbssSpiderTime\":%" PRId32",\n",
			      m_spideredTime);
	else
		jd.safePrintf("\"gbssSpiderTime\":%" PRId32",\n",0);


	if ( m_firstIndexedDateValid )
		jd.safePrintf("\"gbssFirstIndexed\":%" PRIu32",\n",
			      m_firstIndexedDate);

	if ( m_contentHash32Valid )
		jd.safePrintf("\"gbssContentHash32\":%" PRIu32",\n",
			      m_contentHash32);

	if ( m_downloadStartTimeValid && m_downloadEndTimeValid ) {
		jd.safePrintf("\"gbssDownloadStartTimeMS\":%" PRId64",\n",
			      m_downloadStartTime);
		jd.safePrintf("\"gbssDownloadEndTimeMS\":%" PRId64",\n",
			      m_downloadEndTime);

		int64_t took = m_downloadEndTime - m_downloadStartTime;
		jd.safePrintf("\"gbssDownloadDurationMS\":%" PRId64",\n",took);

		jd.safePrintf("\"gbssDownloadStartTime\":%" PRIu32",\n",
			      (uint32_t)(m_downloadStartTime/1000));

		jd.safePrintf("\"gbssDownloadEndTime\":%" PRIu32",\n",
			      (uint32_t)(m_downloadEndTime/1000));
	}


	jd.safePrintf("\"gbssUsedRobotsTxt\":%" PRId32",\n",
		      m_useRobotsTxt);

	//if ( m_numOutlinksAddedValid )
	// this would not be right here because addOutlinksToMetaList()
	// is only called after this function, so the count is not yet set.
	// jd.safePrintf("\"gbssNumOutlinksAdded\":%" PRId32",\n",
	// 	      (int32_t)m_numOutlinksAdded);

	// how many download/indexing errors we've had, including this one
	// if applicable.
	if ( m_srepValid )
		jd.safePrintf("\"gbssConsecutiveErrors\":%" PRId32",\n",
			      m_srep.m_errCount);
	else
		jd.safePrintf("\"gbssConsecutiveErrors\":%" PRId32",\n",0);


	if ( m_ipValid ) {
		char ipbuf[16];
		jd.safePrintf("\"gbssIp\":\"%s\",\n",iptoa(m_ip,ipbuf));
	} else
		jd.safePrintf("\"gbssIp\":\"0.0.0.0\",\n");

	if ( m_ipEndTime ) {
		int64_t took = m_ipEndTime - m_ipStartTime;
		jd.safePrintf("\"gbssIpLookupTimeMS\":%" PRId64",\n",took);
	}

	if ( m_siteNumInlinksValid ) {
		jd.safePrintf("\"gbssSiteNumInlinks\":%" PRId32",\n",
			      (int32_t)m_siteNumInlinks);
		char siteRank = getSiteRank();
		jd.safePrintf("\"gbssSiteRank\":%" PRId32",\n",
			      (int32_t)siteRank);
	}

	jd.safePrintf("\"gbssContentInjected\":%" PRId32",\n",
		      (int32_t)m_contentInjected);

	if ( m_percentChangedValid && od )
		jd.safePrintf("\"gbssPercentContentChanged\""
			      ":%.01f,\n",
			      m_percentChanged);

	jd.safePrintf("\"gbssSpiderPriority\":%" PRId32",\n",
		      *priority);

	// this could be -1, careful
	if ( *ufn >= 0 )
		jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n",
			      cr->m_regExs[*ufn].getBufStart());

	// we forced the langid valid above
	if ( m_langIdValid && m_contentLen )
		jd.safePrintf("\"gbssLanguage\":\"%s\",\n",
			      getLanguageAbbr(m_langId));

	if ( m_contentTypeValid && m_contentLen )
		jd.safePrintf("\"gbssContentType\":\"%s\",\n",
			      g_contentTypeStrings[m_contentType]);

	if ( m_contentValid )
		jd.safePrintf("\"gbssContentLen\":%" PRId32",\n",
			      m_contentLen);

	// do not show the -1 any more, just leave it out then
	// to make things look prettier
	if (  m_crawlDelayValid && m_crawlDelay >= 0 )
		// -1 if none?
		jd.safePrintf("\"gbssCrawlDelayMS\":%" PRId32",\n",
			      (int32_t)m_crawlDelay);


	// remove last ,\n
	jd.incrementLength(-2);
	// end the json spider status doc
	jd.safePrintf("\n}\n");

	// BEFORE ANY HASHING
	int32_t savedDist = m_dist;

	// add the index list for it. it returns false and sets g_errno on err
	// otherwise it sets m_spiderStatusDocMetaList
	if ( ! setSpiderStatusDocMetaList ( &jd , *uqd ) )
		return NULL;

	// now make the titlerec
	char xdhead[2048];
	// just the head of it. this is the hacky part.
	XmlDoc *xd = (XmlDoc *)xdhead;
	// clear it out
	memset ( xdhead, 0 , 2048);

	// copy stuff from THIS so the spider reply "document" has the same
	// header info stuff
	int32_t hsize = (char *)&ptr_firstUrl - (char *)this;
	if ( hsize > 2048 ) { g_process.shutdownAbort(true); }
	gbmemcpy ( xdhead , (char *)this , hsize );
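
	// note: only the fixed-size header (everything before ptr_firstUrl)
	// is copied, so all of the ptr_*/size_* members of this stack-based
	// "xd" start out NULL/0; below we point just the ones we need
	// (content, url, site) at this XmlDoc's buffers, so the resulting
	// titlerec only serializes those fields.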

	// override spider time in case we had error to be consistent
	// with the actual SpiderReply record
	//xd->m_spideredTime = reply->m_spideredTime;
	//xd->m_spideredTimeValid = true;
	// sanity
	//if ( reply->m_spideredTime != m_spideredTime ) {g_process.shutdownAbort(true);}

	// this will cause the maroon box next to the search result to
	// say "STATUS" similar to "PDF" "DOC" etc.
	xd->m_contentType  = CT_STATUS;

	int32_t fullsize = &m_dummyEnd - (char *)this;
	if ( fullsize > 2048 ) { g_process.shutdownAbort(true); }

	/*
	// the ptr_* were all zero'd out, put the ones we want to keep back in
	SafeBuf tmp;
	// was "Spider Status: %s" but that is unnecessary
	tmp.safePrintf("<title>%s</title>",
		       mstrerror(m_indexCode));

	// if we are a dup...
	if ( m_indexCode == EDOCDUP )
		tmp.safePrintf("Dup of docid %" PRId64"<br>", m_docIdWeAreADupOf );

	if ( m_redirUrlPtr && m_redirUrlValid )
		tmp.safePrintf("Redirected to %s<br>",m_redirUrlPtr->getUrl());
	*/

	// put stats like we log out from logIt
	//tmp.safePrintf("<div style=max-width:800px;>\n");
	// store log output into doc
	//logIt(&tmp);
	//tmp.safePrintf("\n</div>");

	// the content is just the title tag above
	// xd->ptr_utf8Content = tmp.getBufStart();
	// xd->size_utf8Content = tmp.length()+1;
	xd->ptr_utf8Content = jd.getBufStart();
	xd->size_utf8Content = jd.length()+1;

	// keep the same url as the doc we are the spider reply for
	xd->ptr_firstUrl = ptr_firstUrl;
	xd->size_firstUrl = size_firstUrl;

	// serps need site, otherwise search results core
	xd->ptr_site = ptr_site;
	xd->size_site = size_site;

	// if ptr_site is null then the ip lookup presumably failed, so just
	// fall back to the subdomain
	if ( ! ptr_site && m_firstUrlValid ) {
		xd->ptr_site  = m_firstUrl.getHost();
		xd->size_site = m_firstUrl.getHostLen();
	}

	// use the same uh48 of our parent
	int64_t uh48 = m_firstUrl.getUrlHash48();
	// then make into a titlerec but store in metalistbuf, not m_titleRec
	SafeBuf titleRecBuf;
	// this should not include ptrs that are NULL when compressing
	// using its m_internalFlags1
	if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) )
		return NULL;

	// concat titleRec to our posdb key records
	if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) )
		return NULL;
	if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) )
		return NULL;

	// return the right val
	m_dist = savedDist;

	// ok, good to go, ready to add to posdb and titledb
	m_spiderStatusDocMetaListValid = true;
	return &m_spiderStatusDocMetaList;
}




// slightly greater than m_spideredTime, which is the download time.
// we also use this for sorting, e.g. for the widget, so newly added
// items don't show up out of order and miss the top spot of the
// widget list.
int32_t XmlDoc::getIndexedTime() {
	if ( m_indexedTimeValid ) return m_indexedTime;
	m_indexedTime = getTimeGlobal();
	return m_indexedTime;
}




Url *XmlDoc::getBaseUrl ( ) {
	if ( m_baseUrlValid ) return &m_baseUrl;
	// need this
	const Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (Url *)xml;
	const Url *cu = getCurrentUrl();
	if ( ! cu || cu == (void *)-1 ) return (Url *)cu;

	m_baseUrl.set ( cu );

	// look for base url and use it if it exists
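	// e.g. with <base href="http://example.com/dir/"> in the <head>,
	// relative outlinks like "page.html" should resolve against that
	// base rather than against the current url (example.com is just a
	// placeholder here).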
	for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) {
		// TAG_BASE is the node id of the <base href> tag
		if ( xml->getNodeId ( i ) == TAG_BASE ) {
			// get the href field of this base tag
			int32_t linkLen;
			const char *link = xml->getString ( i, "href", &linkLen );

			// https://www.w3.org/TR/html51/document-metadata.html#the-base-element
			// if there are multiple <base> elements with href attributes, all but the first are ignored
			if (link == NULL) {
				continue;
			}

			m_baseUrl.set(cu, link, linkLen);

			break;
		}
	}

	m_baseUrlValid = true;
	return &m_baseUrl;
}

////////////////////////////////////////////////////////////
//
// Summary/Title generation for Msg20
//
////////////////////////////////////////////////////////////

void XmlDoc::setMsg20Request(Msg20Request *req) {
	// clear it all out
	reset();
	// this too
	m_reply.reset();

	m_pbuf     = NULL;//pbuf;
	m_niceness = req->m_niceness;
	// remember this
	m_req = req;
	m_collnum = req->m_collnum;
	m_collnumValid = true;
	// make this stuff valid
	if ( m_req->m_docId > 0 ) {
		m_docId      = m_req->m_docId;
		m_docIdValid = true;
	}
	// set url too if we should
	if ( m_req->size_ubuf > 1 )
		setFirstUrl ( m_req->ptr_ubuf );
}
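
// GetMsg20State is a small condition-variable handshake between the worker
// thread that assembles the Msg20 reply and the async callbacks that complete
// its blocking steps: the worker calls wait_for_something() whenever
// getMsg20ReplyStepwise() returns -1 ("would block"), and the completion
// callback (wakeupMsg20Thread below) calls notify_something_is_ready() to
// wake it so it can retry the step.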


class GetMsg20State {
public:
	bool something_ready;
	pthread_mutex_t mtx;
	pthread_cond_t cond;	
	GetMsg20State()
	  : something_ready(false)
	{
		pthread_mutex_init(&mtx,NULL);
		pthread_cond_init(&cond,NULL);
	}
	~GetMsg20State() {
		pthread_mutex_destroy(&mtx);
		pthread_cond_destroy(&cond);
	}
	void wait_for_something() {
		ScopedLock sl(mtx);
		while(!something_ready)
			pthread_cond_wait(&cond,&mtx);
		something_ready = false;
	}
	void notify_something_is_ready() {
		ScopedLock sl(mtx);
		something_ready = true;
		int rc = pthread_cond_signal(&cond);
		assert(rc==0);
	}

	void abort(bool *abort_flag) {
		ScopedLock sl(mtx);
		*abort_flag = true;
		int rc = pthread_cond_signal(&cond);
		assert(rc==0);
	}
};


//Just notify the msg20 generation thread that a step has finished and it should call getMsg20ReplyStepwise() again
static void wakeupMsg20Thread(void *pv) {
	GetMsg20State *gm20s = static_cast<GetMsg20State*>(pv);
	gm20s->notify_something_is_ready();
}


// . returns NULL with g_errno set on error
Msg20Reply *XmlDoc::getMsg20Reply() {
	// return it right away if valid
	if ( m_replyValid ) return &m_reply;

	if(m_errno!=0) {
		g_errno = m_errno;
		return NULL;
	}

	// caller should have the callback set
	if ( ! m_callback1 && ! m_callback2 ) { g_process.shutdownAbort(true); }

	// used by Msg20.cpp to time this XmlDoc::getMsg20Reply() function
	if ( ! m_startTimeValid ) {
		m_startTime      = gettimeofdayInMilliseconds();
		m_startTimeValid = true;
	}

	GetMsg20State *gm20s = new GetMsg20State();
	// . internal callback
	// . so if any of the functions we end up calling directly or
	//   indirectly block, this callback will be called
	if ( ! m_masterLoop ) {
		m_masterLoop  = wakeupMsg20Thread;
		m_masterState = gm20s;
	}
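
	// from here on, whenever a blocking getXxxx() step inside
	// getMsg20ReplyStepwise() completes asynchronously, its callback ends
	// up invoking m_masterLoop (wakeupMsg20Thread), which signals gm20s so
	// loopUntilMsg20ReplyReady() can retry the step.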

	//ok, ready to start piecing together a msg20reply
	if(g_jobScheduler.submit(getMsg20ReplyThread, msg20Done, this, thread_type_query_summary, 0)) {
		return (Msg20Reply*)-1; //no result yet
	} else {
		//not expected to happen but we support it anyway
		m_errno = 0;
		loopUntilMsg20ReplyReady(gm20s);
		delete gm20s;
		if(m_errno!=0) {
			g_errno = m_errno;
			return NULL;
		}
		return &m_reply;
	}
}


//just a trampoline
void XmlDoc::getMsg20ReplyThread(void *pv) {
	XmlDoc *that = static_cast<XmlDoc*>(pv);
	that->getMsg20ReplyThread();
}


void XmlDoc::getMsg20ReplyThread() {
	GetMsg20State *gm20s = static_cast<GetMsg20State*>(m_masterState);
	loopUntilMsg20ReplyReady(gm20s);
	delete gm20s;
	callCallback();
}


void XmlDoc::msg20Done(void *pv, job_exit_t exit_type) {
	XmlDoc *that = static_cast<XmlDoc*>(pv);
	that->msg20Done(exit_type);
}

void XmlDoc::msg20Done(job_exit_t exit_type) {
	if(exit_type!=job_exit_normal) {
		//abort job by telling loopUntilMsg20ReplyReady to give up
		GetMsg20State *gm20s = static_cast<GetMsg20State*>(m_masterState);
		gm20s->abort(&m_abortMsg20Generation);
	}
}

//Repeat calling getMsg20ReplyStepwise() until a result is ready or an error has been encountered
void XmlDoc::loopUntilMsg20ReplyReady(GetMsg20State *gm20s) {
//	while(getMsg20ReplyStepwise() == (Msg20Reply*)-1)
//		gm20s->wait_for_something();
	for(;;) {
		Msg20Reply *r = getMsg20ReplyStepwise();
		if(r==(Msg20Reply*)-1)
			gm20s->wait_for_something();
		else {
			if(r==NULL) {
				if(g_errno)
					m_errno = g_errno;
				else
					g_process.shutdownAbort(true);
			}
			break;
		}
	}
}


//verify that a pointer returned from the getXxxx() methods is consistent. If NULL is returned it means an error occurred, in which case g_errno must be set
static void checkPointerError(const void *ptr) {
	if(ptr==NULL && g_errno==0)
		gbshutdownLogicError();
}
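
// the getXxxx() accessors used below follow a tri-state convention:
//   NULL      -> error (g_errno must be set)
//   (T *)-1   -> would block; a callback will fire when the data is ready
//   otherwise -> the result is ready
// so a typical caller looks like this (sketch; Foo is a stand-in type):
//   Foo *f = getFoo();
//   if ( ! f || f == (Foo *)-1 ) { checkPointerError(f); return (Msg20Reply *)f; }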


//Make progress toward getting a summary. Returns NULL on error, -1 if an async action is waiting,
//and a pointer to the reply when done.
Msg20Reply *XmlDoc::getMsg20ReplyStepwise() {
	if(m_abortMsg20Generation) {
		log(LOG_DEBUG,"msg20: aborted");
		if(!m_errno)
			m_errno = ECANCELED;
		return NULL;
	}

	m_niceness = m_req->m_niceness;

	m_collnum = m_req->m_collnum;//cr->m_collnum;
	m_collnumValid = true;

	//char *coll = m_req->ptr_coll;
	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
	if ( ! cr ) { g_errno = ENOCOLLREC; return NULL; }

	// . cache it for one hour
	// . this will set our ptr_ and size_ member vars
	char **otr = getOldTitleRec();
	if ( ! otr || otr == (void *)-1 ) {
		checkPointerError(otr);
		return (Msg20Reply *)otr;
	}

	// must have a title rec in titledb
	if ( ! *otr ) {
		g_errno = ENOTFOUND;
		return NULL;
	}

	// sanity
	if ( *otr != m_oldTitleRec ) {
		g_process.shutdownAbort(true);
	}

	// . set our ptr_ and size_ member vars from it after uncompressing
	// . returns false and sets g_errno on error
	if ( ! m_setTr ) {
		// . this completely resets us
		// . this returns false with g_errno set on error
		bool status = set2( *otr, 0, cr->m_coll, NULL, m_niceness);

		// sanity check
		if ( ! status && ! g_errno ) {
			g_process.shutdownAbort(true);
		}

		// if there was an error, g_errno should be set.
		if ( ! status ) {
			return NULL;
		}

		m_setTr = true;
	}

	m_reply.m_collnum = m_collnum;

	// look up the tagdb rec fresh when setting up for a summary. that way
	// we can see whether the site is banned or not. but for getting
	// m_getTermListBuf and such, skip the tagrec lookup!
	// save some time when SPIDERING/BUILDING by skipping the fresh
	// tagdb lookup and using the tags stored in the titlerec
	if ( m_req && ! m_req->m_getLinkText && ! m_checkedUrlFilters )
		m_tagRecDataValid = false;

	// if the shard responsible for the tagrec is dead, then
	// just recycle the tagrec data we already have!
	if ( m_req && ! m_checkedUrlFilters && ! m_tagRecDataValid ) {
		char *site = getSite();
		TAGDB_KEY tk1 = Tagdb::makeStartKey ( site );
		TAGDB_KEY tk2 = Tagdb::makeDomainStartKey ( &m_firstUrl );
		uint32_t shardNum1 = g_hostdb.getShardNum(RDB_TAGDB,&tk1);
		uint32_t shardNum2 = g_hostdb.getShardNum(RDB_TAGDB,&tk2);
		// shardnum1 and shardnum2 are often different!
		// log("db: s1=%i s2=%i",(int)shardNum1,(int)shardNum2);
		if ( g_hostdb.isShardDead ( shardNum1 ) ) {
			log("query: skipping tagrec lookup for dead shard "
			    "# %" PRId32
			    ,shardNum1);
			m_tagRecDataValid = true;
		}
		if ( g_hostdb.isShardDead ( shardNum2 ) && m_firstUrlValid ) {
			log("query: skipping tagrec lookup for dead shard "
			    "# %" PRId32
			    ,shardNum2);
			m_tagRecDataValid = true;
		}
	}

	// if we are showing sites that have been banned in tagdb, we don't
	// have to do a tagdb lookup. that should speed things up.
	TagRec *gr = NULL;
	if ( cr && cr->m_doTagdbLookups ) {
		gr = getTagRec();
		if ( ! gr || gr == (void *)-1 ) { checkPointerError(gr); return (Msg20Reply *)gr; }
	}

	// this should be valid, it is stored in title rec
	if ( m_contentHash32Valid ) m_reply.m_contentHash32 = m_contentHash32;
	else                        m_reply.m_contentHash32 = 0;

	if ( ! m_checkedUrlFilters ) {
		// do not re-check
		m_checkedUrlFilters = true;

		// get this
		SpiderRequest sreq;
		SpiderReply   srep;
		setSpiderReqForMsg20 ( &sreq , &srep );

		int32_t spideredTime = getSpideredTime();
		int32_t langIdArg = -1;
		if ( m_langIdValid ) {
			langIdArg = m_langId;
		}

		// get it
		int32_t ufn = ::getUrlFilterNum(&sreq, &srep, spideredTime, true, cr, false, NULL, langIdArg);

		// get spider priority if ufn is valid
		int32_t pr = 0;

		// sanity check
		if ( ufn < 0 ) {
			log("msg20: bad url filter for url [%s], langIdArg=%" PRId32, sreq.m_url, langIdArg);
		}
		else {
			if ( cr->m_forceDelete[ufn] ) {
				pr = -3;
			}
		}


		// this is an automatic ban!
		if ( gr && gr->getLong("manualban",0))
			pr=-3;//SPIDER_PRIORITY_BANNED;

		// is it banned
		if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2
			// set m_errno
			m_reply.m_errno = EDOCBANNED;
			// and this
			m_reply.m_isBanned = true;
		}

		//
		// for now always allow it until we can fix this better.
		// we should probably assume NOT filtered unless it matches
		// a string-match-only url filter... but at least we still
		// allow it to match "BANNED" filters for now...
		//
		pr = 0;

		// done if we are
		if ( m_reply.m_errno && ! m_req->m_showBanned ) {
			// give back the url at least
			m_reply.ptr_ubuf = getFirstUrl()->getUrl();
			m_reply.size_ubuf = getFirstUrl()->getUrlLen() + 1;
			m_replyValid = true;
			return &m_reply;
		}
	}

	// a special hack for XmlDoc::getRecommendedLinksBuf() so we exclude
	// links that link to the main url's site/domain as well as a
	// competitor url (aka related docid)
	Links *links = NULL;
	if ( m_req->m_ourHostHash32 || m_req->m_ourDomHash32 ) {
		links = getLinks();
		if ( ! links || links==(Links *)-1) { checkPointerError(links); return (Msg20Reply *)links; }
	}

	// do they want a summary?
	if ( m_req->m_numSummaryLines>0 && ! m_reply.ptr_displaySum ) {
		char *hsum = getHighlightedSummary( &(m_reply.m_isDisplaySumSetFromTags) );

		if ( ! hsum || hsum == (void *)-1 ) {
			checkPointerError(hsum);
			return (Msg20Reply *)hsum;
		}

		// hsumLen is a length (not a size); the terminating \0 is
		// accounted for when we set size_displaySum below
		int32_t hsumLen = 0;

		// seems like it can return 0x01 if none...
		if ( hsum == (char *)0x01 ) hsum = NULL;

		// get len. this is the HIGHLIGHTED summary so it is ok.
		if ( hsum ) hsumLen = strlen(hsum);

		// must be \0 terminated. (it can be a subset of a larger
		// summary used for deduping, but it still has to terminate
		// here.)
		if ( hsumLen > 0 && hsum[hsumLen] ) { g_process.shutdownAbort(true); }

		// grab stuff from it!
		m_reply.ptr_displaySum = hsum;
		m_reply.size_displaySum = hsumLen+1;
	}

	// copy the link info stuff?
	if ( ! m_req->m_getLinkText ) {
		m_reply.ptr_linkInfo  = (char *)ptr_linkInfo1;
		m_reply.size_linkInfo = size_linkInfo1;
	}

	bool getThatTitle = true;
	if ( m_req->m_titleMaxLen <= 0 ) getThatTitle = false;
	if ( m_reply.ptr_tbuf           ) getThatTitle = false;
	// if steve's requesting the inlink summary we will want to get
	// the title of each linker even if they are spammy!
	// only get the title here if NOT getting link text; otherwise we
	// only get it down below, and only if the linker is not a spammy
	// voter, because getting the title sets the slow sections class
	if ( m_req->m_getLinkText &&
	     ! m_useSiteLinkBuf &&
	     ! m_usePageLinkBuf &&
	     // m_pbuf is used by pageparser.cpp now, not the other two things
	     // above this.
	     ! m_pbuf )
		getThatTitle = false;

	// if steve is getting the inlinks, bad and good, for displaying
	// then get the title here now... otherwise, if we are just spidering
	// and getting the inlinks, do not bother getting the title because
	// the inlink might be linkspam... and we check down below...
	if ( ! m_req->m_onlyNeedGoodInlinks )
		getThatTitle = true;

	// ... no more seo so stop it... disable this when getting link text
	if ( m_req->m_getLinkText )
	        getThatTitle = false;

	if ( getThatTitle ) {
		Title *ti = getTitle();
		if ( ! ti || ti == (Title *)-1 ) { checkPointerError(ti); return (Msg20Reply *)ti; }
		char *tit = ti->getTitle();
		int32_t  titLen = ti->getTitleLen();
		m_reply.ptr_tbuf = tit;
		m_reply.size_tbuf = titLen + 1; // include \0
		// sanity
		if ( tit && tit[titLen] != '\0' ) { g_process.shutdownAbort(true); }
		if ( ! tit || titLen <= 0 ) {
			m_reply.ptr_tbuf = NULL;
			m_reply.size_tbuf = 0;
		}
	}

	// this is not documented because i don't think it will be popular
	if ( m_req->m_getHeaderTag ) {
		SafeBuf *htb = getHeaderTagBuf();
		if ( ! htb || htb == (SafeBuf *)-1 ) { checkPointerError(htb); return (Msg20Reply *)htb; }
		// . it should be null terminated
		// . actually now it is a \0 separated list of the first
		//   few h1 tags
		// . we call SafeBuf::pushChar(0) to add each one
		m_reply.ptr_htag = htb->getBufStart();
		m_reply.size_htag = htb->length();
	}

	// get site
	m_reply.ptr_site  = ptr_site;
	m_reply.size_site = size_site;

	// assume unknown
	m_reply.m_noArchive = 0;
	// are we noarchive? only check this if not getting link text
	if ( ! m_req->m_getLinkText ) {
		char *na = getIsNoArchive();
		if ( ! na || na == (char *)-1 ) { checkPointerError(na); return (Msg20Reply *)na; }
		m_reply.m_noArchive = *na;
	}

	// . summary vector for deduping
	// . does not compute anything if we should not! (svSize will be 0)
	if ( ! m_reply.ptr_vbuf &&
	     m_req->m_getSummaryVector &&
	     cr->m_percentSimilarSummary >   0 &&
	     cr->m_percentSimilarSummary < 100   ) {
		int32_t *sv = getSummaryVector ( );
		if ( ! sv || sv == (void *)-1 ) { checkPointerError(sv); return (Msg20Reply *)sv; }
		m_reply.ptr_vbuf = (char *)m_summaryVec;
		m_reply.size_vbuf = m_summaryVecSize;
	}

	// returns values of specified meta tags
	if ( ! m_reply.ptr_dbuf && m_req->size_displayMetas > 1 ) {
		int32_t dsize;  char *d;
		d = getDescriptionBuf(m_req->ptr_displayMetas,&dsize);
		if ( ! d || d == (char *)-1 ) { checkPointerError(d); return (Msg20Reply *)d; }
		m_reply.ptr_dbuf  = d;
		m_reply.size_dbuf = dsize; // includes \0
	}

	// get thumbnail image DATA
	if ( ! m_reply.ptr_imgData && ! m_req->m_getLinkText ) {
		m_reply.ptr_imgData = ptr_imageData;
		m_reply.size_imgData = size_imageData;
	}

	// get firstip
	int32_t *fip = getFirstIp();
	if ( ! fip || fip == (void *)-1 ) { checkPointerError(fip); return (Msg20Reply *)fip; }

	char *ru = ptr_redirUrl;
	int32_t  rulen = 0;
	if ( ru ) rulen = strlen(ru)+1;

	// need full cached page of each search result?
	// include it always for spider status docs.
	if ( m_req->m_includeCachedCopy || m_contentType == CT_STATUS ) {
		m_reply.ptr_content =  ptr_utf8Content;
		m_reply.size_content = size_utf8Content;
	}

	// do they want to know if this doc has an outlink to a url
	// that has the provided site and domain hash, Msg20Request::
	// m_ourHostHash32 and m_ourDomHash32?
	int32_t nl = 0;
	if ( links ) nl = links->getNumLinks();
	// scan all outlinks we have on this page
	int32_t i ; for ( i = 0 ; i < nl ; i++ ) {
		// get the normalized url
		//char *url = links->getLinkPtr(i);
		// get the site. this will not block or have an error.
		int32_t hh32 = (int32_t)((uint32_t)links->getHostHash64(i));
		if ( hh32 == m_req->m_ourHostHash32 ) break;
		int32_t dh32 = links->getDomHash32(i);
		if ( dh32 == m_req->m_ourDomHash32 ) break;
	}

	// easy ones
	m_reply.m_isPermalink      = m_isPermalink;
	m_reply.m_ip               = m_ip;
	m_reply.m_firstIp          = *fip;
	m_reply.m_docId            = m_docId;
	m_reply.m_httpStatus       = m_httpStatus;
	m_reply.m_contentLen       = size_utf8Content - 1;
	m_reply.m_lastSpidered     = getSpideredTime();//m_spideredTime;
	m_reply.m_datedbDate       = 0;
	m_reply.m_firstIndexedDate = m_firstIndexedDate;
	m_reply.m_firstSpidered    = m_firstIndexedDate;
	m_reply.m_contentType      = m_contentType;
	m_reply.m_language         = m_langId;
	m_reply.m_country          = *getCountryId();
	m_reply.m_hopcount         = m_hopCount;
	m_reply.m_siteRank         = getSiteRank();
	m_reply.m_isAdult          = m_isAdult; //QQQ getIsAdult()? hmmm

	m_reply.ptr_ubuf             = getFirstUrl()->getUrl();
	m_reply.ptr_rubuf            = ru;
	m_reply.ptr_metadataBuf      = NULL;


	m_reply.size_ubuf             = getFirstUrl()->getUrlLen() + 1;
	m_reply.size_rubuf            = rulen;
	m_reply.size_metadataBuf      = 0;

	// check the tag first
	if ( ! m_siteNumInlinksValid ) { g_process.shutdownAbort(true); }

	m_reply.m_siteNumInlinks       = m_siteNumInlinks;

	// . get stuff from link info
	// . this is so fast, just do it for all Msg20 requests
	// . no! think about it -- this can be huge for pages like
	//   google.com!!!
	LinkInfo *info1 = ptr_linkInfo1;
	if ( info1 ) {
		m_reply.m_pageNumInlinks        = info1->m_totalInlinkingDocIds;
		m_reply.m_pageNumGoodInlinks     = info1->m_numGoodInlinks;
		m_reply.m_pageNumUniqueIps       = info1->m_numUniqueIps;
		m_reply.m_pageNumUniqueCBlocks   = info1->m_numUniqueCBlocks;
		m_reply.m_pageInlinksLastUpdated = info1->m_lastUpdated;
	}

	// getLinkText is true if we are getting the anchor text for a
	// supplied url as part of the SPIDER process..
	// this was done by Msg23 before
	if ( ! m_req->m_getLinkText ) {
		m_replyValid = true;
		return &m_reply;
	}

	// use the first url of the linker by default
	Url *linker = &m_firstUrl;

	// the base url, used for doing links: terms, is the final url,
	// just in case there were any redirects
	Url redir;
	if ( ru ) {
		redir.set ( ru );
		linker = &redir;
	}

	// . we need the mid domain hash in addition to the ip domain because
	//   chat.yahoo.com has a different ip domain than www.yahoo.com , ...
	//   and we don't want them both to be able to vote
	// . the reply is zeroed out in the call to m_reply.reset() above, so
	//   if this is not yet set it will be 0
	if ( m_reply.m_midDomHash == 0 ) {
		m_reply.m_midDomHash = hash32 ( linker->getMidDomain(), linker->getMidDomainLen() );
	}

	int64_t start = gettimeofdayInMilliseconds();

	// if not set from above, set it here
	if ( ! links ) links = getLinks ( true ); // do quick set?
	if ( ! links || links == (Links *)-1 ) {checkPointerError(links); return (Msg20Reply *)links; }
	Pos *pos = getPos();
	if ( ! pos || pos == (Pos *)-1 ) { checkPointerError(pos); return (Msg20Reply *)pos; }
	Words *ww = getWords();
	if ( ! ww || ww == (Words *)-1 ) { checkPointerError(ww); return (Msg20Reply *)ww; }
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) { checkPointerError(xml); return (Msg20Reply *)xml; }

	// get a ptr to the link in the content. will point to the
	// stuff in the href field of the anchor tag. used for seeing if
	// we have bad links or not.
	int32_t linkNode = -1;
	int32_t linkNum  = -1;
	// . get associated link text from the linker's document for our "url"
	// . only gets from FIRST link to us
	// . TODO: allow more link text from better quality pages?
	// . TODO: limit score based on link text length?
	// . should always be NULL terminated
	// . should not break in the middle of a word
	// . this will return the item/entry if we are extracting from an
	//   rss/atom feed
	char  *rssItem    = NULL;
	int32_t   rssItemLen = 0;

	//
	// TODO: for getting siteinlinks just match the site in the url
	// not the full url... and maybe match the one with the shortest path.
	//

	//workaround for truncation causing a multibyte utf8 character to be
	//split and the text parsing then traversing past the defined bytes.
	m_linkTextBuf[sizeof(m_linkTextBuf)-3] = '\0';
	m_linkTextBuf[sizeof(m_linkTextBuf)-2] = '\0';
	m_linkTextBuf[sizeof(m_linkTextBuf)-1] = '\0';

	// . get the link text
	// . linkee might be a site if m_isSiteLinkInfo is true in which
	//   case we get the best inlink to that site, and linkee is
	//   something like blogspot.com/mary/ or some other site.
	int32_t blen = links->getLinkText ( m_req->ptr_linkee  ,//&linkee,
					 m_req->m_isSiteLinkInfo ,
					 m_linkTextBuf       ,
					 sizeof(m_linkTextBuf)-2,
					 &rssItem            ,
					 &rssItemLen         ,
					 &linkNode           ,
					 &linkNum            );

	// . BUT this skips the news topic stuff too. bad?
	// . THIS HAPPENED before because we were truncating the xml(see above)
	if ( linkNode < 0 ) {

		int64_t took = gettimeofdayInMilliseconds() - start;
		if ( took > 100 )
			log("build: took %" PRId64" ms to get link text for "
			    "%s from linker %s",
			    took,
			    m_req->ptr_linkee,
			    m_firstUrl.getUrl() );

		logf(LOG_DEBUG,"build: Got linknode = %" PRId32" < 0. Cached "
		     "linker %s does not have outlink to %s like linkdb "
		     "says it should. page is probably too big and the "
		     "outlink is past our limit. contentLen=%" PRId32". or "
		     "a sitehash collision, or an area tag link.",
		     linkNode,getFirstUrl()->getUrl(),m_req->ptr_linkee,
		     m_xml.getContentLen());
		//g_errno = ECORRUPTDATA;
		// do not let multicast forward to a twin! so use this instead
		// of ECORRUPTDATA
		g_errno = EBADENGINEER;
		//g_process.shutdownAbort(true);
		return NULL;
	}

	if ( ! verifyUtf8 ( m_linkTextBuf , blen ) ) {
		log("xmldoc: bad OUT link text from url=%s for %s",
		    m_req->ptr_linkee,m_firstUrl.getUrl());
		m_linkTextBuf[0] = '\0';
		blen = 0;
	}

	// verify for rss as well. seems like we end up coring because
	// length and size get out of sync and [size-1] != '\0' sometimes.
	// also guard against a NULL rssItem before touching it.
	if ( rssItem && ! verifyUtf8 ( rssItem , rssItemLen ) ) {
		log("xmldoc: bad RSS ITEM text from url=%s for %s",
		    m_req->ptr_linkee,m_firstUrl.getUrl());
		rssItem[0] = '\0';
		rssItemLen = 0;
	}

	// point to it, include the \0.
	if ( blen > 0 ) {
		m_reply.ptr_linkText  = m_linkTextBuf;
		// save the size into the reply, include the \0
		m_reply.size_linkText = blen + 1;
		// sanity check
		if ( (size_t)blen + 2 > sizeof(m_linkTextBuf) ) { g_process.shutdownAbort(true); }
		// sanity check. null termination required.
		if ( m_linkTextBuf[blen] ) { g_process.shutdownAbort(true); }
	}

	// . the link we link to
	// . important when getting site info because the link url
	//   can be different than the root url!
	m_reply. ptr_linkUrl = links->getLinkPtr(linkNum);
	m_reply.size_linkUrl = links->getLinkLen(linkNum)+1;

	// save the rss item in our state so we can point to it, include \0
	if ( (size_t)rssItemLen > sizeof(m_rssItemBuf)-2)
		rssItemLen = sizeof(m_rssItemBuf)-2;

	if ( rssItemLen > 0) {
		gbmemcpy ( m_rssItemBuf, rssItem , rssItemLen );
		// NULL terminate it
		m_rssItemBuf[rssItemLen] = 0;

		// point to it, include the \0
		m_reply.ptr_rssItem  = m_rssItemBuf;
		m_reply.size_rssItem = rssItemLen + 1;
	}

	if ( ! m_req->m_doLinkSpamCheck )
		m_reply.m_isLinkSpam = 0;

	if ( m_req->m_doLinkSpamCheck ) {
		// reset to NULL to avoid strlen segfault
		const char *note = NULL;
		// need this
		if ( ! m_xmlValid ) { g_process.shutdownAbort(true); }

		Url linkeeUrl;
		linkeeUrl.set ( m_req->ptr_linkee );

		// get it. does not block.
		m_reply.m_isLinkSpam = ::isLinkSpam ( linker ,
						     m_ip ,
						     m_siteNumInlinks,
						     &m_xml,
						     links,
						     // if doc length more
						     // than 150k then consider
						     // it linkspam
						     // automatically so it
						     // can't vote
						     150000,//MAXDOCLEN//150000
						     &note ,
						     &linkeeUrl , // url ,
						     linkNode );
		// store it
		if ( note ) {
			// include the \0
			m_reply.ptr_note  = note;
			m_reply.size_note = strlen(note)+1;
		}
		// log the reason why it is a link spam page
		if ( m_reply.m_isLinkSpam )
			log(LOG_DEBUG,"build: linker %s: %s.",
			    linker->getUrl(),note);
		// sanity
		if ( m_reply.m_isLinkSpam && ! note )
			log("linkspam: missing note for d=%" PRId64"!",m_docId);
	}

	// sanity check
	if ( m_reply.ptr_rssItem &&
	     m_reply.size_rssItem>0 &&
	     m_reply.ptr_rssItem[m_reply.size_rssItem-1]!=0) {
		g_process.shutdownAbort(true); }

	// . skip all this junk if we are a spammy voter
	// . we get the title above in "getThatTitle"
	if ( m_reply.m_isLinkSpam ) {
		m_replyValid = true; 
		return &m_reply;
	}

	// . this vector is set from a sample of the entire doc
	// . it is used to dedup voters in Msg25.cpp
	// . this has pretty much been replaced by vector2; it also said a
	//   doc was a dup if all of its words were contained by another
	//   doc, i.e. if it was a small subset, which wasn't the best
	//   behaviour.
	// . yeah, the neighborhood text is much better and this sets the
	//   slow sections class, so i took it out
	getPageSampleVector ();
	// must not block or error out. sanity check
	if ( ! m_pageSampleVecValid ) { g_process.shutdownAbort(true); }
	//st->m_v1.setPairHashes    ( ww , -1    , m_niceness );

	// . this vector is set from the text after the link text
	// . it terminates at at a breaking tag
	// . check it out in ~/fff/src/Msg20.cpp
	getPostLinkTextVector ( linkNode );

	// get it
	getTagPairHashVector();
	// must not block or error out. sanity check
	if ( ! m_tagPairHashVecValid ) { g_process.shutdownAbort(true); }

	// reference the vectors in our reply
	m_reply. ptr_vector1 = m_pageSampleVec;
	m_reply.size_vector1 = m_pageSampleVecSize;
	m_reply. ptr_vector2 = m_postVec;
	m_reply.size_vector2 = m_postVecSize;
	m_reply. ptr_vector3 = m_tagPairHashVec;
	m_reply.size_vector3 = m_tagPairHashVecSize;

	// sort these with a simple bubble sort (the vector is small),
	// but only the tag pair hash vec
	bool flag = true;
	uint32_t *d = (uint32_t *)m_tagPairHashVec;
	// exclude the terminating 0 int32_t
	int32_t nd = (m_tagPairHashVecSize / 4) - 1;
	while ( flag ) {
		flag = false;
		for ( int32_t i = 1 ; i < nd ; i++ ) {
			if ( d[i-1] <= d[i] ) continue;
			uint32_t tmp = d[i-1];
			d[i-1] = d[i];
			d[i]   = tmp;
			flag   = true;
		}
	}

	// convert "linkNode" into a string ptr into the document
	char *node = xml->getNodePtr(linkNode)->m_node;
	// . find the word index, "n" for this node
	// . this is INEFFICIENT!!
	char **wp = ww->getWordPtrs();
	int32_t   nw = ww->getNumWords();
	int32_t   n;

	for ( n = 0; n < nw && wp[n] < node ; n++ ) {
	}

	// sanity check
	if ( n >= nw ) {
		log("links: crazy! could not get word before linknode");
		g_errno = EBADENGINEER;
		return NULL;
	}

	//
	// get the surrounding link text, around "linkNode"
	//
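	// the [a,b] word window is first estimated from the 80-character
	// radius by assuming roughly 5 characters per word, then tightened
	// using the character positions in Pos until the span fits within
	// 2*radius+1 characters.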
	// radius of 80 characters around n
	int32_t  radius = 80;
	char *p      = m_surroundingTextBuf;
	char *pend   = m_surroundingTextBuf + sizeof(m_surroundingTextBuf)/2;
	// . make a neighborhood in the "words" space [a,b]
	// . radius is in characters, so "convert" into words by dividing by 5
	int32_t a = n - radius / 5;
	int32_t b = n + radius / 5;
	if ( a <     0 ) a =     0;
	if ( b >    nw ) b =    nw;
	int32_t *pp  = pos->m_pos;
	int32_t  len;
	// if too big, shrink the larger side, a or b
	while ( (len=pp[b]-pp[a]) >= 2 * radius + 1 ) {
		// decrease the largest, a or b
		if ( a<n && (pp[n]-pp[a])>(pp[b]-pp[n])) a++;
		else if ( b>n )                          b--;
	}
	// only store it if we can
	if ( p + len + 1 < pend ) {
		// store it
		// FILTER the html entities!!
		int32_t len2 = pos->filter( ww, a, b, false, p, pend, m_version );

		// ensure NULL terminated
		p[len2] = '\0';
		// store in reply. it will be serialized when sent.
		m_reply.ptr_surroundingText  = p;
		m_reply.size_surroundingText = len2 + 1;
	}

	// get title? its slow because it sets the sections class
	if ( m_req->m_titleMaxLen > 0 && ! m_reply.ptr_tbuf &&
	     // don't get it anymore if getting link info because it
	     // is slow...
	     getThatTitle ) {
		Title *ti = getTitle();
		if ( ! ti || ti == (Title *)-1 ) { checkPointerError(ti); return (Msg20Reply *)ti; }
		char *tit = ti->getTitle();
		int32_t  titLen = ti->getTitleLen();
		m_reply. ptr_tbuf = tit;
		m_reply.size_tbuf = titLen + 1; // include \0
		if ( ! tit || titLen <= 0 ) {
			m_reply.ptr_tbuf = NULL;
			m_reply.size_tbuf = 0;
		}
	}

	int64_t took = gettimeofdayInMilliseconds() - start;
	if ( took > 100 )
		log("build: took %" PRId64" ms to get link text for "
		    "%s from linker %s",
		    took,
		    m_req->ptr_linkee,
		    m_firstUrl.getUrl() );


	m_replyValid = true;
	return &m_reply;
}


Query *XmlDoc::getQuery() {
	if ( m_queryValid ) return &m_query;

	// bail if no query
	if ( ! m_req || ! m_req->ptr_qbuf ) {
		m_queryValid = true;
		return &m_query;
	}

	int64_t start = logQueryTimingStart();

	// return NULL with g_errno set on error
	if ( !m_query.set2( m_req->ptr_qbuf, m_req->m_langId, m_req->m_queryExpansion, m_req->m_useQueryStopWords ) ) {
		if(!g_errno)
			g_errno = EBADENGINEER; //can fail due to a multitude of problems
		return NULL;
	}

	logQueryTimingEnd( __func__, start );

	m_queryValid = true;
	return &m_query;
}

Matches *XmlDoc::getMatches () {
	// return it if it is set
	if ( m_matchesValid ) return &m_matches;

	// if no query, matches are empty
	if ( ! m_req || ! m_req->ptr_qbuf ) {
		m_matchesValid = true;
		return &m_matches;
	}

	// need a buncha crap
	Words *ww = getWords();
	if ( ! ww || ww == (Words *)-1 ) return (Matches *)ww;
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (Matches *)xml;
	Bits *bits = getBitsForSummary();
	if ( ! bits || bits == (Bits *)-1 ) return (Matches *)bits;
	Sections *ss = getSections();
	if ( ! ss || ss == (void *)-1) return (Matches *)ss;
	Pos *pos = getPos();
	if ( ! pos || pos == (Pos *)-1 ) return (Matches *)pos;
	Title *ti = getTitle();
	if ( ! ti || ti == (Title *)-1 ) return (Matches *)ti;
	Phrases *phrases = getPhrases();
	if ( ! phrases || phrases == (void *)-1 ) return (Matches *)phrases;

	Query *q = getQuery();
	if ( ! q ) return (Matches *)q;

	int64_t start = logQueryTimingStart();

	// set it up
	m_matches.setQuery ( q );

	LinkInfo *linkInfo = getLinkInfo1();
	if(linkInfo==(LinkInfo*)-1)
		linkInfo = NULL;
	// returns false and sets g_errno on error
	if ( !m_matches.set( ww, phrases, ss, bits, pos, xml, ti, getFirstUrl(), linkInfo ) ) {
		return NULL;
	}

	logQueryTimingEnd( __func__, start );

	// we got it
	m_matchesValid = true;
	return &m_matches;
}

// sender wants meta description, custom tags, etc.
char *XmlDoc::getDescriptionBuf ( char *displayMetas , int32_t *dsize ) {
	// return the buffer if we got it
	if ( m_dbufValid ) { *dsize = m_dbufSize; return m_dbuf; }
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
	// now get the content of the requested display meta tags
	//char  dbuf [ 1024*64 ];
	char *dbufEnd = m_dbuf + 1024;//1024*64;
	char *dptr    = m_dbuf;
	char *pp      = displayMetas;
	char *ppend   = pp + strlen(displayMetas);
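	// displayMetas is a whitespace-separated list of meta tag names, each
	// optionally followed by ":<maxlen>", e.g. "description:200 keywords"
	// (hypothetical example); the matching meta contents are written into
	// m_dbuf as a sequence of \0-terminated strings.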
	// loop over the list of requested meta tag names
	while ( pp < ppend && dptr < dbufEnd ) {
		// skip initial spaces. meta tag names are ascii always i guess
		while ( *pp && is_wspace_a(*pp) ) pp++;
		// that's the start of the meta tag name
		char *s = pp;
		// . find end of that meta tag name
		// . can end in :<integer> which specifies max len
		while ( *pp && ! is_wspace_a(*pp) && *pp != ':' ) pp++;
		// assume no max length to the content of this meta tag
		int32_t maxLen = 0x7fffffff;
		// save current char
		char c = *pp;
		// . NULL terminate the name
		// . before, overflowed the request buffer and caused core!
		// . seems like it is already NULL terminated
		if ( *pp ) *pp = '\0';
		// always advance regardless though
		pp++;
		// if ':' was specified, get the max length
		if ( c == ':' ) {
			if ( is_digit(*pp) ) maxLen = atoi ( pp );
			// skip over the digits
			while ( *pp && ! is_wspace_a (*pp) ) pp++;
		}
		// don't exceed our total buffer size (save room for \0 at end)
		int32_t avail = dbufEnd - dptr - 1;
		if ( maxLen > avail ) maxLen = avail;
		// store the content at "dptr" (do not exceed "maxLen" bytes)
		int32_t wlen = xml->getMetaContent( dptr, maxLen, s, strlen( s ) );
		dptr[wlen] = '\0';

		// test it out
		if ( ! verifyUtf8 ( dptr ) ) {
			log("xmldoc: invalid utf8 content for meta tag %s.",s);
			continue;
		}

		// advance and NULL terminate
		dptr    += wlen;
		*dptr++  = '\0';
		// bitch if we truncated
		if ( dptr >= dbufEnd )
			log("query: More than %" PRId32" bytes of meta tag "
			    "content "
			    "was encountered. Truncating.",
			    (int32_t)(dbufEnd-m_dbuf));
	}
	// what is the size of the content of displayed meta tags?
	m_dbufSize   = dptr - m_dbuf;
	m_dbufValid = true;
	*dsize = m_dbufSize;
	return m_dbuf;
}

SafeBuf *XmlDoc::getHeaderTagBuf() {
	if ( m_htbValid ) return &m_htb;

	Sections *ss = getSections();
	if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;

	int32_t count = 0;
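
	// collect the text of the first few non-empty <h1> sections into
	// m_htb as \0-separated strings (up to four, per the count check at
	// the bottom of the loop).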

	// scan sections
	Section *si = ss->m_rootSection;

 moreloop:

	for ( ; si ; si = si->m_next ) {
		if ( si->m_tagId != TAG_H1 ) continue;
		// if it contains no text, these will be -1,
		// so give up on it
		if ( si->m_firstWordPos < 0 ) continue;
		if ( si->m_lastWordPos  < 0 ) continue;
		// ok, it works, get it
		break;
	}
	// if no h1 tag then make buf empty
	if ( ! si ) {
		m_htb.nullTerm();
		m_htbValid = true;
		return &m_htb;
	}
	// otherwise, set it
	const char *a = m_words.getWord(si->m_firstWordPos);
	const char *b = m_words.getWord(si->m_lastWordPos);
	b += m_words.getWordLen(si->m_lastWordPos);

	// copy it
	m_htb.safeMemcpy ( a , b - a );
	m_htb.pushChar('\0');

	si = si->m_next;

	// add more?
	if ( count++ < 3 ) goto moreloop;

	m_htbValid = true;
	return &m_htb;
}

Title *XmlDoc::getTitle() {
	if ( m_titleValid ) {
		return &m_title;
	}

	uint8_t *contentTypePtr = getContentType();
	if ( ! contentTypePtr || contentTypePtr == (void *)-1 ) {
		return (Title *)contentTypePtr;
	}

	// xml and json docs have empty title
	if ( *contentTypePtr == CT_JSON || *contentTypePtr == CT_XML ) {
		m_titleValid = true;
		return &m_title;
	}

	int32_t titleMaxLen = 80;
	if ( m_req ) {
		titleMaxLen = m_req->m_titleMaxLen;
	} else {
		CollectionRec *cr = getCollRec();
		if (cr) {
			titleMaxLen = cr->m_titleMaxLen;
		}
	}

	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) {
		return (Title *)xml;
	}

	int64_t start = logQueryTimingStart();

	// we try to set from tags to avoid initializing everything else
	if ( m_title.setTitleFromTags( xml, titleMaxLen, *contentTypePtr ) ) {
		m_titleValid = true;

		logQueryTimingEnd( __func__, start );

		return &m_title;
	}

	Words *ww = getWords();
	if ( ! ww || ww == (Words *)-1 ) {
		return (Title *)ww;
	}

	Query *query = getQuery();
	if ( ! query ) {
		return (Title *)query;
	}

	m_titleValid = true;

	char *filteredRootTitleBuf = getFilteredRootTitleBuf();
	if ( filteredRootTitleBuf == (char*) -1) {
		filteredRootTitleBuf = NULL;
	}

	start = logQueryTimingStart();

	if ( !m_title.setTitle( xml, ww, titleMaxLen, query, getLinkInfo1(), getFirstUrl(), filteredRootTitleBuf,
							m_filteredRootTitleBufSize, *contentTypePtr, m_langId ) ) {
		g_errno = ETITLEERROR;
		return NULL;
	}

	logQueryTimingEnd( __func__, start );

	return &m_title;
}


Summary *XmlDoc::getSummary () {
	if ( m_summaryValid ) {
		return &m_summary;
	}

	// time cpu set time
	m_cpuSummaryStartTime = gettimeofdayInMilliseconds();

	uint8_t *ct = getContentType();
	if ( ! ct || ct == (void *)-1 ) {
		checkPointerError(ct);
		return (Summary *)ct;
	}
	// xml and json docs have empty summaries
	if ( *ct == CT_JSON || *ct == CT_XML ) {
		m_summaryValid = true;
		return &m_summary;
	}

	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) {
		checkPointerError(xml);
		return (Summary *)xml;
	}

	Title *ti = getTitle();
	if ( ! ti || ti == (Title *)-1 ) {
		checkPointerError(ti);
		return (Summary *)ti;
	}

	int64_t start = logQueryTimingStart();
    if ( m_summary.setSummaryFromTags( xml, m_req->m_summaryMaxLen, ti->getTitle(), ti->getTitleLen() ) ) {
		logQueryTimingEnd( __func__, start );

		m_summaryValid = true;
		return &m_summary;
	}

	Words *ww = getWords();
	if ( ! ww || ww == (Words *)-1 ) {
		checkPointerError(ww);
		return (Summary *)ww;
	}

	Sections *sections = getSections();
	if ( ! sections ||sections==(Sections *)-1) {
		checkPointerError(sections);
		return (Summary *)sections;
	}

	Pos *pos = getPos();
	if ( ! pos || pos == (Pos *)-1 ) {
		checkPointerError(pos);
		return (Summary *)pos;
	}

	char *site = getSite();
	if ( ! site || site == (char *)-1 ) {
		checkPointerError(site);
		return (Summary *)site;
	}

	int64_t *d = getDocId();
	if ( ! d || d == (int64_t *)-1 ) {
		checkPointerError(d);
		return (Summary *)d;
	}

	Matches *mm = getMatches();
	if ( ! mm || mm == (Matches *)-1 ) {
		checkPointerError(mm);
		return (Summary *)mm;
	}

	Query *q = getQuery();
	if ( ! q ) {
		checkPointerError(q);
		return (Summary *)q;
	}

	CollectionRec *cr = getCollRec();
	if ( ! cr ) {
		abort(); //bad abort for now
		return NULL;
	}

	start = logQueryTimingStart();

	// . get the highest number of summary lines that we need
	// . the summary vector we generate for doing summary-based deduping
	//   typically has more lines in it than the summary we generate for
	//   displaying to the user
	int32_t numLines = m_req->m_numSummaryLines;
	if ( cr->m_percentSimilarSummary >   0  &&
	     cr->m_percentSimilarSummary < 100  &&
	     m_req->m_getSummaryVector            &&
	     cr->m_summDedupNumLines > numLines   ) {
		// request more lines than we will display
		numLines = cr->m_summDedupNumLines;
	}

	// compute the summary
	bool status = m_summary.setSummary( xml, ww, sections, pos, q, m_req->m_summaryMaxLen, numLines,
	                                    m_req->m_numSummaryLines, m_req->m_summaryMaxNumCharsPerLine, getFirstUrl(), mm,
	                                    ti->getTitle(), ti->getTitleLen() );

	// error, g_errno should be set!
	if ( ! status ) {
		checkPointerError(NULL);
		return NULL;
	}

	logQueryTimingEnd( __func__, start );

	m_summaryValid = true;
	return &m_summary;
}

char *XmlDoc::getHighlightedSummary ( bool *isSetFromTagsPtr ) {
	if ( m_finalSummaryBufValid ) {
		if ( isSetFromTagsPtr ) {
			*isSetFromTagsPtr = m_isFinalSummarySetFromTags;
		}

		if(m_finalSummaryBuf.getBufStart()==NULL)
			gbshutdownLogicError();
		return m_finalSummaryBuf.getBufStart();
	}

	Summary *s = getSummary();
	if ( ! s || s == (void *)-1 ) {
		checkPointerError(s);
		return (char *)s;
	}

	Query *q = getQuery();
	if ( ! q ) {
		checkPointerError(q);
		return (char *)q;
	}

	// get the summary
	char *sum    = s->getSummary();
	int32_t sumLen = s->getSummaryDisplayLen();
	m_isFinalSummarySetFromTags = s->isSetFromTags();

	// assume no highlighting?
	if ( ! m_req->m_highlightQueryTerms || sumLen == 0 ) {
		if(!m_finalSummaryBuf.safeMemcpy(sum,sumLen) || !m_finalSummaryBuf.nullTerm())
			return NULL;
		m_finalSummaryBufValid = true;

		if ( isSetFromTagsPtr ) {
			*isSetFromTagsPtr = m_isFinalSummarySetFromTags;
		}

		return m_finalSummaryBuf.getBufStart();
	}

	if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }

	// html-encode the summary before highlighting
	StackBuf<> tmpSum;
	tmpSum.htmlEncode(sum, sumLen, false);
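
	// Highlight::set() wraps each query-term match in the given begin/end
	// tags, so for a query term "gigablast" a summary fragment like
	// "about gigablast" would come back as "about <b>gigablast</b>"
	// (illustrative example).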

	Highlight hi;
	StackBuf<> hb;

	// highlight the query in it
	int32_t hlen = hi.set ( &hb, tmpSum.getBufStart(), tmpSum.length(), q, "<b>", "</b>" );

	// treat a negative return from Highlight::set() as an error
	if ( hlen < 0 ) {
		log("build: highlight class error = %s",mstrerror(g_errno));
		if ( ! g_errno ) { g_process.shutdownAbort(true); }
		return NULL;
	}

	// store into our safebuf then
	if(!m_finalSummaryBuf.safeMemcpy(&hb) || !m_finalSummaryBuf.nullTerm())
		return NULL;
	m_finalSummaryBufValid = true;

	if ( isSetFromTagsPtr ) {
		*isSetFromTagsPtr = m_isFinalSummarySetFromTags;
	}

	return m_finalSummaryBuf.getBufStart();
}

// <meta name=robots content=noarchive>
// <meta name=<configured botname> content=noarchive>
char *XmlDoc::getIsNoArchive ( ) {
	if ( m_isNoArchiveValid ) return &m_isNoArchive;
	Xml *xml = getXml();
	if ( ! xml || xml == (void *)-1 ) return (char *)xml;
	m_isNoArchive      = (char)false;
	m_isNoArchiveValid = true;
	int32_t     n     = xml->getNumNodes();
	XmlNode *nodes = xml->getNodes();
	// find the meta tags
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// continue if not a meta tag
		if ( nodes[i].m_nodeId != TAG_META ) continue;
		// get robots attribute
		int32_t alen; char *att;
		// <meta name=robots content=noarchive>
		att = nodes[i].getFieldValue ( "name" , &alen );
		// need a name!
		if ( ! att ) continue;
		// get end
		char *end = att + alen;
		// skip leading spaces
		while ( att < end && *att && is_wspace_a(*att) ) att++;
		// must be robots or <configured botname>. skip if not
		if ( strncasecmp(att,"robots" ,6) &&
		     strncasecmp(att,g_conf.m_spiderBotName,strlen(g_conf.m_spiderBotName))   ) continue;

		// get the content value
		att = nodes[i].getFieldValue("content",&alen);
		// skip if none
		if ( ! att ) continue;
		// get end
		end = att + alen;
		// skip leading spaces
		while ( att < end && *att && is_wspace_a(*att) ) att++;
		// is it noarchive? skip if no such match
		if ( strncasecmp(att,"noarchive",9) != 0 ) continue;
		// ok, we got it
		m_isNoArchive = (char)true;
		break;
	}
	// return what we got
	return &m_isNoArchive;
}

char *XmlDoc::getIsLinkSpam ( ) {
	if ( m_isLinkSpamValid ) return &m_isLinkSpam2;

	setStatus ( "checking if linkspam" );

	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
	Links *links = getLinks();
	if ( ! links || links == (Links *)-1 ) return (char *)links;
	int32_t *ip = getIp();
	if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip;
	//LinkInfo *info1 = getLinkInfo1();
	//if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;
	int32_t *sni = getSiteNumInlinks();
	if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// reset note
	m_note = NULL;

	// . if a doc is "link spam" then it cannot vote, or its
	//   voting power is reduced
	// . look for indications that the link is from a guestbook
	// . doc length over 100,000 bytes consider it link spam
	m_isLinkSpamValid = true;
	m_isLinkSpam = ::isLinkSpam ( getFirstUrl(), // linker
				      *ip ,
				      *sni ,
				      xml,
				      links,
				      150000,//MAXDOCLEN,//maxDocLen ,
				      &m_note ,
				      NULL , // &linkee , // url ,
				      -1 ); // linkNode ,
	// set shadow
	m_isLinkSpam2 = (bool)m_isLinkSpam;
	return &m_isLinkSpam2;
}


// is it a custom error page? ppl do not always use status 404!
char *XmlDoc::getIsErrorPage ( ) {
	if ( m_isErrorPageValid ) {
		return &m_isErrorPage;
	}

	setStatus ( "getting is error page");

	// need a buncha crap
	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
	// get local link info
	LinkInfo   *info1  = getLinkInfo1();
	// error or blocked
	if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1;

	// default
	LinkInfo  *li = info1;

	//we have to be more sophisticated with longer pages because they
	//could actually be talking about an error message.
	//if(xml->getContentLen() > 4096) return false;

	// assume not
	m_isErrorPage      = (char)false;
	m_isErrorPageValid = true;

	int32_t nn = xml->getNumNodes();
	int32_t i;

	char* s;
	int32_t len;
	int32_t len2;

	const char* errMsg = NULL;

	int32_t numChecked = 0;
	// check the first header and title tag
	// limit it to first 32 nodes
	if(nn > 32) nn = 32;
	for ( i = 0 ; i < nn ; i++ ) {
		switch(xml->getNodeId(i)) {
		case TAG_TITLE:
		case TAG_H1:
		case TAG_H2:
		case TAG_H3:
		case TAG_SPAN:
			char* p = xml->getString(i,true,&len);
			if(len == 0 || len > 1024) continue;
			char* pend = p + len;
			errMsg = matchErrorMsg(p, pend );
			++numChecked;
			break;
		}
		if(errMsg || numChecked > 1) break;
	}
	if(!errMsg) return &m_isErrorPage;
	len = strlen(errMsg);

	// make sure the error message was not present in the link text
	if ( li && li->getNumGoodInlinks() > 5 ) return &m_isErrorPage;
	for (Inlink *k=NULL;li && (k=li->getNextInlink(k)); ) {
		//int32_t nli = li->getNumLinkTexts();
		//if we can index some link text from the page, then do it
		//if(nli > 5) return false;
		//for ( int32_t i = 0 ; i < nli ; i++ ) {
		s    = k->getLinkText();
		len2 = k->size_linkText - 1; // exclude \0
		//if(!s) break;
		//allow error msg to contain link text or vice versa
		if(len < len2) {
			if(strncasestr(errMsg, s,len,len2) != NULL)
				return &m_isErrorPage;
		}
		else {
			if(strncasestr(s, errMsg,len2,len) != NULL)
				return &m_isErrorPage;
		}
	}

	m_isErrorPage = (char)true;
	return &m_isErrorPage;
}


const char* XmlDoc::matchErrorMsg(char* p, char* pend ) {
 	char utf8Buf[1024];
	//	int32_t utf8Len = 0;
	int32_t len = pend - p;

	if(len > 1024) len = 1024;
	pend = p + len;
	char* tmp = utf8Buf;
	while(p < pend) {
		*tmp = to_lower_a(*p);
		tmp++; p++;
	}
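
	// utf8Buf now holds an ASCII-lowercased copy of at most the first
	// 1024 bytes of the node text; below we compare the start of each
	// word against a list of known error phrases.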

	p = utf8Buf;
	pend = p + len;

	const char* errMsg = NULL;

	while(p < pend) {
		int32_t r = pend - p;
		switch (*p) { //sorted by first letter, then by frequency
		case '4':
			errMsg = "404 error";
			if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
			errMsg = "403 forbidden";
			if(r>=13&&strncmp(p, errMsg, 13) == 0) return errMsg;
			break;

		case 'd':
			errMsg = "detailed error information follows";
			if(r>=34&&strncmp(p, errMsg, 34) == 0) return errMsg;
			break;

		case 'e':
			errMsg = "error 404";
			if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
			errMsg = "error was encountered while processing "
				"your request";
			if(r>=51&&strncmp(p, errMsg,51) == 0) return errMsg;

			errMsg = "error occurred while processing request";
			if(r>=39&&strncmp(p, errMsg, 39) == 0) return errMsg;
			errMsg = "exception error has occurred";
			if(r>=28&&strncmp(p, errMsg,28) == 0) return errMsg;
			errMsg = "error occurred";
			if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg;
			//http://www.gnu.org/fun/jokes/unix.errors.html
			//errMsg = "error message";
			//if(strncmp(p, errMsg, 13) == 0) return errMsg;
			break;

		case 'f':
			errMsg = "file not found";
			if(r>=14&&strncmp(p, errMsg, 14) == 0) return errMsg;
			break;

		case 'h':
			errMsg = "has moved";
			if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg;
			break;

		case 'n':
			errMsg = "no referrer";
			if(r>=11&&strncmp(p, errMsg,11) == 0) return errMsg;
			break;

		case 'o':
			errMsg = "odbc error code = ";
			if(r>=18&&strncmp(p, errMsg,18) == 0) return errMsg;
			errMsg = "object not found";
			if(r>=16&&strncmp(p, errMsg,16) == 0) return errMsg;
			break;

		case 'p':
			errMsg = "page not found";
			if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg;
			break;

		case 's':
			errMsg = "system error";
			if(r>=12&&strncmp(p, errMsg, 12) == 0) return errMsg;
			break;
		case 't':
			errMsg = "the application encountered an "
				"unexpected problem";
			if(r>=49&&strncmp(p, errMsg, 49) == 0) return errMsg;
			errMsg = "the page you requested has moved";
			if(r>=32&&strncmp(p, errMsg, 32) == 0) return errMsg;
			errMsg = "this page has moved";
			if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg;
			break;

		case 'u':
			errMsg = "unexpected problem has occurred";
			if(r>=31&&strncmp(p, errMsg, 31) == 0) return errMsg;
			errMsg = "unexpected error has occurred";
			if(r>=29&&strncmp(p, errMsg, 29) == 0) return errMsg;
			errMsg = "unexpected problem occurred";
			if(r>=27&&strncmp(p, errMsg, 27) == 0) return errMsg;
			errMsg ="unexpected error occurred";
			if(r>=25&&strncmp(p, errMsg, 25) == 0) return errMsg;
			errMsg ="unexpected result has occurred";
			if(r>=30&&strncmp(p, errMsg, 30) == 0) return errMsg;
			errMsg ="unhandled exception";
			if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg;

			break;

		case 'y':
			errMsg = "you have been blocked";
			if(r>=21&&strncmp(p, errMsg, 21) == 0) return errMsg;
			break;
		}
		//skip to the beginning of the next word
		while(p < pend && !is_wspace_a(*p)) p++;
		while(p < pend && is_wspace_a(*p)) p++;
	}
	return NULL;
}

#include "Spider.h"

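// the sort routines below read term/prefix strings out of this buffer.
// it is pointed at m_wbuf right before each gbsort() call.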
static SafeBuf *s_wbuf = NULL;

// . this is used by gbsort() below
// . sorts TermDebugInfos alphabetically by prefix, then by term
static int cmptp (const void *v1, const void *v2) {
	TermDebugInfo *t1 = *(TermDebugInfo **)v1;
	TermDebugInfo *t2 = *(TermDebugInfo **)v2;

	char *start = s_wbuf->getBufStart();

	// prefix first
	char *ps1 = start + t1->m_prefixOff;
	char *ps2 = start + t2->m_prefixOff;
	if ( t1->m_prefixOff < 0 ) ps1 = NULL;
	if ( t2->m_prefixOff < 0 ) ps2 = NULL;
	int32_t plen1 = 0; if ( ps1 ) plen1 = strlen(ps1);
	int32_t plen2 = 0; if ( ps2 ) plen2 = strlen(ps2);
	int32_t pmin = plen1;
	if ( plen2 < pmin ) pmin = plen2;
	// don't pass NULL to strncmp when a term has no prefix
	if ( ps1 && ps2 ) {
		int32_t pn = strncmp ( ps1 , ps2 , pmin );
		if ( pn ) return pn;
	}
	if ( plen1 != plen2 ) return ( plen1 - plen2 );

	// return if groups differ
	int32_t len1 = t1->m_termLen;
	int32_t len2 = t2->m_termLen;
	int32_t min = len1;
	if ( len2 < min ) min = len2;
	char *s1    = start + t1->m_termOff;
	char *s2    = start + t2->m_termOff;
	int32_t n = strncasecmp ( s1 , s2 , min );
	if ( n ) return n;
	// . if length same, we are tied
	// . otherwise, prefer the shorter
	return ( len1 - len2 );
}

// . this is used by gbsort() below
// . sorts TermDebugInfos by their TermDebugInfo::m_wordPos member
static int cmptp2 (const void *v1, const void *v2) {
	TermDebugInfo *t1 = *(TermDebugInfo **)v1;
	TermDebugInfo *t2 = *(TermDebugInfo **)v2;
	// word position first
	int32_t d = t1->m_wordPos - t2->m_wordPos;
	if ( d ) return d;
	// secondly drop back to hashgroup i guess
	//d = t1->m_hashGroup - t2->m_hashGroup;
	d = t1->m_synSrc - t2->m_synSrc;
	if ( d ) return d;
	// word len
	d = t1->m_termLen - t2->m_termLen;
	if ( d ) return d;
	return 0;
}

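// print the possible languages of a term as abbreviations, bolding the one
// the sliding-window algo in setLangVector() assigned to it. synonym terms
// just get a placeholder since their language follows the source term.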
static bool printLangBits ( SafeBuf *sb , TermDebugInfo *tp ) {

	bool printed = false;
	if ( tp->m_synSrc ) {
		sb->safePrintf("&nbsp;");
		printed = true;
	}
	int32_t j = 0;
	if ( printed ) j = MAX_LANGUAGES;
	for ( ; j < MAX_LANGUAGES ; j++ ) {
		int64_t mask = 1LL << j;
		//if ( j == tp->m_langId )
		//	sb->safePrintf("[%s]",
		//		       getLanguageAbbr(tp->m_langId));
		if ( ! (tp->m_langBitVec64 & mask) ) continue;
		char langId = j+1;
		// match in langvec? that means even if the
		// word is in multiple languages we put it in
		// this language because we intersect its lang bit
		// vec with its neighbors in the sliding window
		// algo in setLangVector.
		if ( langId == tp->m_langId )
			sb->safePrintf("<b>");
		sb->safePrintf("%s ", getLanguageAbbr(langId) );
		if ( langId == tp->m_langId )
			sb->safePrintf("</b>");
		printed = true;
	}
	if ( ! printed ) {
		sb->safePrintf("??");
	}
	return true;
}

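// . print a debug overview of this document as a set of html tables
// . includes ids, hashes, dates, link info, thumbnails, sections and the
//   table of terms we hashed (if m_wts was set)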
bool XmlDoc::printDoc ( SafeBuf *sb ) {

	if ( ! sb ) return true;


	// shortcut
	char *fu = ptr_firstUrl;

	const char *allowed = "???";
	if      ( m_isAllowedValid && m_isAllowed )  allowed = "yes";
	else if ( m_isAllowedValid                )  allowed = "no";

	int32_t ufn = -1;
	if ( m_urlFilterNumValid ) ufn = m_urlFilterNum;
	time_t spideredTime = getSpideredTime();

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return false;

	sb->safePrintf ("<meta http-equiv=\"Content-Type\" "
			"content=\"text/html; charset=utf-8\">"

			"<table cellpadding=3 border=0>\n"

			"<tr>"
			"<td width=\"25%%\">docId</td>"
			"<td><a href=/get?c=%s&d=%" PRIu64">%" PRIu64"</a></td>"
			"</tr>\n"

			"<tr>"
			"<td width=\"25%%\">uh48</td>"
			"<td>%" PRIu64"</td>"
			"</tr>\n"

			"<tr>"
			"<td width=\"25%%\">uh64</td>"
			"<td>%" PRIu64"</td>"
			"</tr>\n"

			"<tr>"
			"<td>index error code</td>"
			"<td>%s</td>"
			"</tr>\n"

			"<tr>"
			"<td>http status</td>"
			"<td>%i</td>"
			"</tr>\n"

			"<tr>"
			"<td>url filter num</td>"
			"<td>%" PRId32"</td>"
			"</tr>\n"


			"<tr>"
			"<td>other - errno</td>"
			"<td>%s</td>"
			"</tr>\n"

			"<tr>"
			"<td>robots.txt allows</td>"
			"<td>%s</td>"
			"</tr>\n"

			"<tr>"
			"<td>metalist size</td>"
			"<td>%" PRId32"</td>"
			"</tr>\n"


			"<tr>"
			"<td>url</td>"
			"<td><a href=\"%s\">%s</a></td>"
			"</tr>\n"

			,
			cr->m_coll,
			m_docId ,
			m_docId ,
			getFirstUrlHash48(), // uh48
			getFirstUrlHash64(), // uh64

			mstrerror(m_indexCode),
			m_httpStatus,
			ufn,
			mstrerror(g_errno),
			allowed,

			m_metaListSize,

			fu,
			fu

			);

	if ( ptr_redirUrl )
		sb->safePrintf(
			       "<tr>"
			       "<td>redir url</td>"
			       "<td><a href=\"%s\">%s</a></td>"
			       "</tr>\n"
			       ,ptr_redirUrl
			       ,ptr_redirUrl
			       );
	else
		sb->safePrintf(
			       "<tr>"
			       "<td>redir url</td>"
			       "<td>--</td>"
			       "</tr>\n"
			       );


	sb->safePrintf("<tr><td>hostHash32</td><td>0x%" PRIx32"</td></tr>",
		       (uint32_t)getHostHash32a());
	sb->safePrintf("<tr><td>site</td><td>");
	sb->safeMemcpy(ptr_site,size_site-1);
	sb->safePrintf("</td></tr>\n");
	if ( m_siteHash32Valid )
		sb->safePrintf("<tr><td>siteHash32</td><td>0x%" PRIx32"</td></tr>\n",
			       m_siteHash32);
	if ( m_domHash32Valid )
		sb->safePrintf("<tr><td>domainHash32</td><td>0x%" PRIx32"</td></tr>\n",
			       m_domHash32);
	sb->safePrintf ( "<tr>"
			 "<td>domainHash8</td>"
			 "<td>0x%" PRIx32"</td>"
			 "</tr>\n"
			 ,
			 (int32_t)Titledb::getDomHash8FromDocId(m_docId)
			 );

	struct tm tm_buf;
	char buf[64];
	sb->safePrintf(
			"<tr>"
			"<td>coll</td>"
			"<td>%s</td>"
			"</tr>\n"

			"<tr>"
			"<td>spidered date</td>"
			"<td>%s UTC</td>"
			"</tr>\n"
			,
			cr->m_coll,
			asctime_r(gmtime_r(&spideredTime,&tm_buf),buf)
			);


	/*
	char *ms = "-1";
	if ( m_minPubDate != -1 ) ms = asctime_r(gmtime_r( &m_minPubDate ));
	sb->safePrintf (
			"<tr>"
			"<td>min pub date</td>"
			"<td>%s UTC</td>"
			"</tr>\n" , ms );

	ms = "-1";
	if ( m_maxPubDate != -1 ) ms = asctime_r(gmtime_r( &m_maxPubDate ));
	sb->safePrintf (
			"<tr>"
			"<td>max pub date</td>"
			"<td>%s UTC</td>"
			"</tr>\n" , ms );
	*/

	// our html template fingerprint
	sb->safePrintf ("<tr><td>tag pair hash 32</td><td>");
	if ( m_tagPairHash32Valid )sb->safePrintf("%" PRIu32,
						  (uint32_t)m_tagPairHash32);
	else                       sb->safePrintf("invalid");
	sb->safePrintf("</td></tr>\n" );


	// print list we added to delete stuff
	if ( m_indexCode && m_oldDocValid && m_oldDoc ) {
		// skip debug printing for now...
		//return true;
		sb->safePrintf("</table><br>\n");
		sb->safePrintf("<h2>Delete Meta List</h2>");
		printMetaList ( m_metaList , m_metaList + m_metaListSize ,sb);
	}


	if ( m_indexCode || g_errno ) {
		printMetaList ( m_metaList , m_metaList + m_metaListSize, sb );
	}

	if ( m_indexCode ) return true;
	if ( g_errno     ) return true;


	// sanity check
	//if ( ! m_sreqValid ) { g_process.shutdownAbort(true); }

	/*
	sb->safePrintf("<tr><td>next spider date</td>"
		       "<td>%s UTC</td></tr>\n"

		       "<tr><td>next spider priority</td>"
		       "<td>%" PRId32"</td></tr>\n" ,
		       asctime_r(gmtime_r( &m_nextSpiderTime )) ,
		       (int32_t)m_nextSpiderPriority );
	*/

	// must always start with http
	if ( strncmp ( fu , "http" , 4 ) != 0 ) { g_process.shutdownAbort(true); }
	// show the host that should spider it
	//int32_t domLen ; char *dom = getDomFast ( fu , &domLen , true );
	//int32_t hostId;
	if ( m_sreqValid ) {
		// must not block
		SpiderRequest *oldsr = &m_sreq;
		uint32_t shard = g_hostdb.getShardNum(RDB_SPIDERDB,oldsr);
		sb->safePrintf ("<tr><td><b>assigned spider shard</b>"
				"</td>\n"
				"<td><b>%" PRIu32"</b></td></tr>\n",shard);
	}

	time_t ts = m_firstIndexedDate;
	sb->safePrintf("<tr><td>first indexed date</td>"
		       "<td>%s UTC</td></tr>\n" ,
		       asctime_r(gmtime_r(&ts,&tm_buf),buf) );

	ts = m_outlinksAddedDate;
	sb->safePrintf("<tr><td>outlinks last added date</td>"
		       "<td>%s UTC</td></tr>\n" ,
		       asctime_r(gmtime_r(&ts,&tm_buf),buf) );

	// hop count
	sb->safePrintf("<tr><td>hop count</td><td>%" PRId32"</td></tr>\n",
		      (int32_t)m_hopCount);

	// thumbnails
	ThumbnailArray *ta = (ThumbnailArray *) ptr_imageData;
	if ( ta ) {
		int32_t nt = ta->getNumThumbnails();
		sb->safePrintf("<tr><td># thumbnails</td>"
			       "<td>%" PRId32"</td></tr>\n",nt);
		for ( int32_t i = 0 ; i < nt ; i++ ) {
			ThumbnailInfo *ti = ta->getThumbnailInfo(i);
			sb->safePrintf("<tr><td>thumb #%" PRId32"</td>"
				       "<td>%s (%" PRId32"x%" PRId32",%" PRId32"x%" PRId32") "
				       , i
				       , ti->getUrl()
				       , ti->m_origDX
				       , ti->m_origDY
				       , ti->m_dx
				       , ti->m_dy
				       );
			ti->printThumbnailInHtml ( sb , 100,100,true,NULL) ;
			// end the row for this thumbnail
			sb->safePrintf("</td></tr>\n");
		}
	}



	const char *ddd = "---";

	char strLanguage[128];
	languageToString(m_langId, strLanguage);

	SafeBuf tb;

	TagRec *ogr = NULL;
	if ( m_tagRecValid ) ogr = &m_tagRec;
	if ( ogr ) ogr->printToBufAsHtml ( &tb , "old tag" );

	SafeBuf *ntb = NULL;
	if ( m_newTagBufValid ) ntb = getNewTagBuf();
	if ( ntb ) {
		// this is just a sequence of tags like an rdblist
		char *pt    = ntb->getBufStart();
		char *ptend = pt + ntb->length();
		for ( ; pt < ptend ; ) {
			// skip rdbid
			pt++;
			// cast it
			Tag *tag = (Tag *)pt;
			// skip it
			pt += tag->getRecSize();
			// print tag out
			tag->printToBufAsHtml ( &tb, "new tag");
		}
	}


	// prevent (null) from being displayed
	tb.pushChar('\0');

	int32_t sni  = m_siteNumInlinks;

	LinkInfo *info1 = ptr_linkInfo1;

	char ipString[16];
	iptoa(m_ip,ipString);
	const char *estimated = "";

	//char *ls = getIsLinkSpam();
	Links *links = getLinks();
	// sanity check. should NEVER block!
	if ( links == (void *)-1 ) { g_process.shutdownAbort(true); }

	// this is all to get "note"
	//char *note = NULL;
	// make it a URL
	Url uu; uu.set ( ptr_firstUrl );
	// sanity check
	Xml *xml = getXml();
	// sanity check
	if ( xml == (void *)-1 ) { g_process.shutdownAbort(true); }

	sb->safePrintf (
		  "<tr><td>datedb date</td><td>%s UTC (%" PRIu32")%s"
		  "</td></tr>\n"

		  "<tr><td>compressed size</td><td>%" PRId32" bytes</td></tr>\n"

		  "<tr><td>original charset</td><td>%s</td></tr>\n"

		  //"<tr><td>site num inlinks</td><td><b>%" PRId32"%</b></td></tr>\n"

		  //"<tr><td>total extrapolated linkers</td><td>%" PRId32"</td></tr>\n"

		  "<tr><td><b>title rec version</b></td><td><b>%" PRId32"</b>"
		  "</td></tr>\n"

		  "<tr><td>adult bit</td><td>%" PRId32"</td></tr>\n"

		  //"<tr><td>is link spam?</td><td>%" PRId32" <b>%s</b></td></tr>\n"

		  "<tr><td>is permalink?</td><td>%" PRId32"</td></tr>\n"
		  "<tr><td>is RSS feed?</td><td>%" PRId32"</td></tr>\n"
		  //"<tr><td>index article only?</td><td>%" PRId32"</td></tr>\n"
		  "%s\n"
		  "<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
		  "%s</td></tr>\n"
		  "<tr><td>content len</td><td>%" PRId32" bytes</td></tr>\n"
		  "<tr><td>content truncated</td><td>%" PRId32"</td></tr>\n"

		  "<tr><td>content type</td><td>%" PRId32" (%s)</td></tr>\n"
		  "<tr><td>language</td><td>%" PRId32" (%s)</td></tr>\n"
		  "<tr><td>country</td><td>%" PRId32" (%s)</td></tr>\n"
		  "<tr><td>time axis used</td><td>%" PRId32"</td></tr>\n"
		  "<tr><td>metadata</td><td>%s</td></tr>\n"
		  "</td></tr>\n",

		  ddd ,
		  0 ,
		  estimated ,

		  m_oldTitleRecSize,

		  get_charset_str(m_charset),

		  //sni ,

		  //ptr_linkInfo1->m_numInlinksExtrapolated,

		  (int32_t)m_version ,

		  (int32_t)m_isAdult,

		  //(int32_t)m_isLinkSpam,
		  //m_note,

		  (int32_t)m_isPermalink,

		  (int32_t)m_isRSS,


		  //(int32_t)m_eliminateMenus,


		  // tag rec
		  tb.getBufStart(),

		  ipString,
		  cr->m_coll,
		  ipString,
		  size_utf8Content - 1,
		  (int32_t)m_isContentTruncated,

		  (int32_t)m_contentType,
		  g_contentTypeStrings[(int)m_contentType] ,

		  (int32_t)m_langId,
		  strLanguage,

		  (int32_t)m_countryId,
		  g_countryCode.getName(m_countryId),
		  m_useTimeAxis,
		  "");

	if ( info1 ) {
		sb->safePrintf("<tr><td>num GOOD links to whole site</td>"
			       "<td>%" PRId32"</td></tr>\n",
			       sni );
	}

	// close the table
	sb->safePrintf ( "</table></center><br>\n" );

	// print outlinks
	if( links ) {
		links->print( sb );
	}

	//
	// PRINT SECTIONS
	//
	Sections *sections = getSections();
	if ( ! sections ||sections==(Sections *)-1) {g_process.shutdownAbort(true);}

	printRainbowSections ( sb , NULL );

	//
	// PRINT LINKINFO
	//

	char *p    = m_pageLinkBuf.getBufStart();
	int32_t  plen = m_pageLinkBuf.length();
	sb->safeMemcpy ( p , plen );


	//
	// PRINT SITE LINKINFO
	//
	p    = m_siteLinkBuf.getBufStart();
	plen = m_siteLinkBuf.length();
	sb->safeMemcpy ( p , plen );


	// note this
	sb->safePrintf("<h2>NEW Meta List</h2>");

	printMetaList ( m_metaList , m_metaList + m_metaListSize , sb );

	// all done if no term table to print out
	if ( ! m_wts ) return true;


	//
	// BEGIN PRINT HASHES TERMS
	//

	// shortcut
	HashTableX *wt = m_wts;

	// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
	TermDebugInfo **tp = NULL;
	// add them with this counter
	int32_t nt = 0;

	int32_t nwt = 0;
	if ( wt ) {
		nwt = wt->getNumSlots();
		tp = (TermDebugInfo **)wt->m_keys;
	}

	// now print the table we stored all we hashed into
	for ( int32_t i = 0 ; i < nwt ; i++ ) {
		// skip if empty
		if ( wt->m_flags[i] == 0 ) continue;
		// get its key, date=32bits termid=64bits
		//key96_t *k = (key96_t *)wt->getKey ( i );
		// get the TermDebugInfo
		TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
		// point to it for sorting
		tp[nt++] = ti;
	}

	// set this for cmptp
	s_wbuf = &m_wbuf;

	// sort them alphabetically by Term
	gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp );

	// print them out in a table
	char hdr[1000];
	sprintf(hdr,
		"<table border=1 cellpadding=0>"
		"<tr>"
		"<td><b>Prefix</b></td>"
		"<td><b>WordNum</b></td>"
		"<td><b>Lang</b></td>"
		"<td><b>Term</b></td>"
		"<td><b>Desc</b></td>"
		"<td><b>TermId/TermHash48</b></td>"
		"<td><b>ShardByTermId?</b></td>"
		"</tr>\n"
		);

	sb->safePrintf("%s",hdr);

	char *start = m_wbuf.getBufStart();
	int32_t rcount = 0;

	for ( int32_t i = 0 ; i < nt ; i++ ) {


		// see if one big table causes a browser slowdown
		if ( (++rcount % TABLE_ROWS) == 0 )
			sb->safePrintf("</table>%s",hdr);

		const char *prefix = "&nbsp;";
		if ( tp[i]->m_prefixOff >= 0 )
			prefix = start + tp[i]->m_prefixOff;

		sb->safePrintf ( "<tr><td>%s</td>", prefix);

		sb->safePrintf( "<td>%" PRId32 "</td>", tp[i]->m_wordNum );

		// print out all langs word is in if it's not clear
		// what language it is. we use a sliding window to
		// resolve some ambiguity, but not all, so print out
		// the possible langs here
		sb->safePrintf("<td>");
		printLangBits ( sb , tp[i] );
		sb->safePrintf("</td>");

		// print the term
		sb->safePrintf("<td><nobr>");

		if ( tp[i]->m_synSrc ) {
			sb->pushChar('*');
		}

		char *term = start + tp[i]->m_termOff;
		int32_t  termLen = tp[i]->m_termLen;
		sb->safeMemcpy ( term , termLen );

		sb->safePrintf ( "</nobr></td>");

		sb->safePrintf( "<td><nobr>%s</nobr></td>", getHashGroupString( tp[i]->m_hashGroup ) );

		sb->safePrintf ( "<td>%016" PRIu64"</td>", (uint64_t)(tp[i]->m_termId & TERMID_MASK) );

		if ( tp[i]->m_shardByTermId ) {
			sb->safePrintf( "<td><b>1</b></td>" );
		} else {
			sb->safePrintf( "<td>0</td>" );
		}

		sb->safePrintf("</tr>\n");
	}


	sb->safePrintf("</table><br>\n");

	//
	// END PRINT HASHES TERMS
	//

	return true;
}

bool XmlDoc::printMenu ( SafeBuf *sb ) {

	if( !sb ) {
		return false;
	}

	// encode it
	SafeBuf ue;
	urlEncode(&ue, ptr_firstUrl);

	// get
	sb->safePrintf ("<meta http-equiv=\"Content-Type\" "
			"content=\"text/html; charset=utf-8\">" );

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return false;

	return true;
}

// if printDocForProCog, an entry function, blocks, we gotta re-call it
static void printDocForProCogWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// make sure has not been freed from under us!
	if ( THIS->m_freed ) { g_process.shutdownAbort(true);}
	// note it
	THIS->setStatus ( "in print doc for pro cog wrapper" );
	// get it
	bool status = THIS->printDocForProCog ( THIS->m_savedSb ,
						THIS->m_savedHr );
	// return if it blocked
	if ( ! status ) return;
	// otherwise, all done, call the caller callback
	THIS->callCallback();
}


// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) {

	if ( ! sb ) return true;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return true;

	m_masterLoop = printDocForProCogWrapper;
	m_masterState = this;

	m_savedSb = sb;
	m_savedHr = hr;

	// if we are generating site or page inlinks info for a
	// non docid based url, then store that info in the respective
	// safe bufs
	m_useSiteLinkBuf = true;
	m_usePageLinkBuf = true;


	int32_t page = hr->getLong("page",1);


	// for some reason sections page blocks forever in browser
	if ( page != 7 && ! m_printedMenu ) {
		printFrontPageShell ( sb , "search" , cr , false );
		m_printedMenu = true;
		//printMenu ( sb );
	}


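	// page 1 = general info, 2 = page inlinks, 3 = site inlinks,
	// 4 = rainbow sections, 5 = term list, 6 = spider stats,
	// 7 = cached page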
	if ( page == 1 )
		return printGeneralInfo(sb,hr);

	if ( page == 2 )
		return printPageInlinks(sb,hr);

	if ( page == 3 )
		return printSiteInlinks(sb,hr);

	if ( page == 4 )
		return printRainbowSections(sb,hr);

	if ( page == 5 )
		return printTermList(sb,hr);

	if ( page == 6 )
		return printSpiderStats(sb,hr);

	if ( page == 7 )
		return printCachedPage(sb,hr);

	return true;
}

bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) {

	// shortcut
	char *fu = ptr_firstUrl;

	// sanity check
	Xml *xml = getXml();
	// blocked?
	if ( xml == (void *)-1 ) return false;
	// error?
	if ( ! xml ) return true;

	char *ict = getIsContentTruncated();
	if ( ! ict ) return true;
	if ( ict == (char *)-1 ) return false;

	char *at = getIsAdult();
	if ( ! at ) return true;
	if ( at == (void *)-1 ) return false;

	char *ls = getIsLinkSpam();
	if ( ! ls ) return true;
	if ( ls == (void *)-1 ) return false;

	uint8_t *ct = getContentType();
	if ( ! ct ) return true;
	if ( ct == (void *)-1 ) return false;

	uint16_t *cs = getCharset ( );
	if ( ! cs ) return true;
	if ( cs == (uint16_t *)-1 ) return false;

	char *pl = getIsPermalink();
	if ( ! pl ) return true;
	if ( pl == (char *)-1 ) return false;

	char *isRSS   = getIsRSS();
	if ( ! isRSS ) return true;
	if ( isRSS == (char  *)-1 )  return false;

	int32_t *ip = getIp();
	if ( ! ip ) return true;
	if ( ip == (int32_t *)-1 ) return false;

	uint8_t *li = getLangId();
	if ( ! li ) return true;
	if ( li == (uint8_t *)-1 ) return false;

	uint16_t *cid = getCountryId();
	if ( ! cid ) return true;
	if ( cid == (uint16_t *)-1 ) return false;

	LinkInfo   *info1  = getLinkInfo1();
	if ( ! info1 ) return true;
	if ( info1 == (void *)-1 ) return false;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return true;

	// make it a URL
	Url uu;
	uu.set ( fu );

	const char *allowed = "???";
	int32_t allowedInt = 1;
	if      ( m_isAllowedValid && m_isAllowed )  {
		allowed = "yes";
		allowedInt = 1;
	}
	else if ( m_isAllowedValid                )  {
		allowed = "no";
		allowedInt = 0;
	}

	const char *es = mstrerror(m_indexCode);
	if ( ! m_indexCode ) es = mstrerror(g_errno);

	int32_t isXml = hr->getLong("xml",0);

	if ( ! isXml ) printMenu ( sb );

	int32_t shardNum = getShardNumFromDocId ( m_docId );
	Host *hosts = g_hostdb.getShard ( shardNum );
	Host *h = &hosts[0];

	key128_t spiderKey = Spiderdb::makeFirstKey(m_firstIp);
	int32_t spiderShardNum = getShardNum(RDB_SPIDERDB, &spiderKey);
	int32_t spiderHostId = g_hostdb.getHostIdWithSpideringEnabled(spiderShardNum);

	if ( ! isXml )
		sb->safePrintf (
				"<table cellpadding=3 border=0>\n"

				"<tr>"
				"<td width=\"25%%\">docId</td>"
				"<td><a href=/get?c=%s&d=%" PRIu64">%" PRIu64"</a></td>"
				"</tr>\n"

				"<tr>"
				"<td width=\"25%%\">on host #</td>"
				"<td>%" PRId32"</td>"
				"</tr>\n"

				"<tr>"
				"<td width=\"25%%\">spidered on host #</td>"
				"<td>%" PRId32"</td>"
				"</tr>\n"

				"<tr>"
				"<td>index error code</td>"
				"<td>%s</td>"
				"</tr>\n"


				"<tr>"
				"<td>robots.txt allows</td>"
				"<td>%s</td>"
				"</tr>\n"


				"<tr>"
				"<td>url</td>"
				"<td><a href=\"%s\">%s</a></td>"
				"</tr>\n"

				,
				cr->m_coll,
				m_docId ,
				m_docId ,

				h->m_hostId,
				spiderHostId,
				es,
				allowed,

				fu,
				fu

				);
	else
		sb->safePrintf (
				"<?xml version=\"1.0\" "
				"encoding=\"UTF-8\" ?>\n"
				"<response>\n"
				"\t<coll><![CDATA[%s]]></coll>\n"
				"\t<docId>%" PRId64"</docId>\n"
				"\t<indexError><![CDATA[%s]]></indexError>\n"
				"\t<robotsTxtAllows>%" PRId32
				"</robotsTxtAllows>\n"
				"\t<url><![CDATA[%s]]></url>\n"
				,
				cr->m_coll,
				m_docId ,
				es,
				allowedInt,//(int32_t)m_isAllowed,
				fu
				);

	char *redir = ptr_redirUrl;
	if ( redir && ! isXml ) {
		sb->safePrintf(
			       "<tr>"
			       "<td>redir url</td>"
			       "<td><a href=\"%s\">%s</a></td>"
			       "</tr>\n"
			       ,redir
			       ,redir );
	}
	else if ( redir ) {
		sb->safePrintf("\t<redirectUrl><![CDATA[%s]]>"
			       "</redirectUrl>\n" ,redir );
	}


	if ( m_indexCode || g_errno ) {
		if ( ! isXml ) sb->safePrintf("</table><br>\n");
		else           sb->safePrintf("</response>\n");
		return true;
	}


	// must always start with http
	if ( strncmp ( fu , "http" , 4 ) != 0 ) { g_process.shutdownAbort(true); }

	struct tm tm_buf;
	char buf[64];
	time_t ts = (time_t)m_firstIndexedDate;

	if ( ! isXml )
		sb->safePrintf("<tr><td>first indexed date</td>"
			       "<td>%s UTC</td></tr>\n" ,
			       asctime_r(gmtime_r(&ts,&tm_buf),buf) );
	else
		sb->safePrintf("\t<firstIndexedDateUTC>%" PRIu32
			       "</firstIndexedDateUTC>\n",
			       (uint32_t)m_firstIndexedDate );

	ts = m_spideredTime;

	if ( ! isXml )
		sb->safePrintf("<tr><td>last indexed date</td>"
			       "<td>%s UTC</td></tr>\n" ,
			       asctime_r(gmtime_r(&ts,&tm_buf),buf) );
	else
		sb->safePrintf("\t<lastIndexedDateUTC>%" PRIu32
			       "</lastIndexedDateUTC>\n",
			       (uint32_t)m_spideredTime );

	ts = m_outlinksAddedDate;

	if ( ! isXml )
		sb->safePrintf("<tr><td>outlinks last added date</td>"
			       "<td>%s UTC</td></tr>\n" ,
			       asctime_r(gmtime_r(&ts,&tm_buf),buf) );
	else
		sb->safePrintf("\t<outlinksLastAddedUTC>%" PRIu32
			       "</outlinksLastAddedUTC>\n",
			       (uint32_t)m_outlinksAddedDate );

	// hop count
	if ( ! isXml )
		sb->safePrintf("<tr><td>hop count</td><td>%" PRId32"</td>"
			       "</tr>\n",
			       (int32_t)m_hopCount);
	else
		sb->safePrintf("\t<hopCount>%" PRId32"</hopCount>\n",
			       (int32_t)m_hopCount);


	char strLanguage[128];
	languageToString(m_langId, strLanguage);

	// print tags
	//SafeBuf tb;
	int32_t sni  = m_siteNumInlinks;

	char ipString[16];
	iptoa(m_ip,ipString);

	//int32_t sni = info1->getNumGoodInlinks();

	time_t tlu = info1->getLastUpdated();
	struct tm *timeStruct3 = gmtime_r(&tlu,&tm_buf);//info1->m_lastUpdated );
	char tmp3[64];
	strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 );


	if ( ! isXml )
		sb->safePrintf (
			"<tr><td>original charset</td><td>%s</td></tr>\n"
			"<tr><td>adult bit</td><td>%" PRId32"</td></tr>\n"
			//"<tr><td>is link spam?</td><td>%" PRId32" <b>%s</b></td></tr>\n"
			"<tr><td>is permalink?</td><td>%" PRId32"</td></tr>\n"
			"<tr><td>is RSS feed?</td><td>%" PRId32"</td></tr>\n"
			"<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">"
			"%s</td></tr>\n"
			"<tr><td>http status</td><td>%d</td></tr>"
			"<tr><td>content len</td><td>%" PRId32" bytes</td></tr>\n"
			"<tr><td>content truncated</td><td>%" PRId32"</td></tr>\n"
			"<tr><td>content type</td><td>%s</td></tr>\n"
			"<tr><td>language</td><td>%s</td></tr>\n"
			"<tr><td>country</td><td>%s</td></tr>\n"

			"<tr><td><b>good inlinks to site</b>"
			"</td><td>%" PRId32"</td></tr>\n"

			"<tr><td><b>site rank</b></td><td>%" PRId32"</td></tr>\n"

			"<tr><td>good inlinks to page"
			"</td><td>%" PRId32"</td></tr>\n"

			"<tr><td><nobr>page inlinks last computed</nobr></td>"
			"<td>%s</td></tr>\n"
			"</td></tr>\n",
			get_charset_str(m_charset),
			(int32_t)m_isAdult,
			(int32_t)m_isPermalink,
			(int32_t)m_isRSS,
			ipString,
			cr->m_coll,
			ipString,
			m_httpStatus,
			size_utf8Content - 1,
			(int32_t)m_isContentTruncated,
			g_contentTypeStrings[(int)m_contentType] ,
			strLanguage,
			g_countryCode.getName(m_countryId) ,
			sni,
			::getSiteRank(sni),
			info1->getNumGoodInlinks(),

			tmp3
			);
	else {
		sb->safePrintf (
			"\t<charset><![CDATA[%s]]></charset>\n"
			"\t<isAdult>%" PRId32"</isAdult>\n"
			"\t<isLinkSpam>%" PRId32"</isLinkSpam>\n"
			"\t<siteRank>%" PRId32"</siteRank>\n"

			"\t<numGoodSiteInlinks>%" PRId32"</numGoodSiteInlinks>\n"

			"\t<numGoodPageInlinks>%" PRId32"</numGoodPageInlinks>\n"
			"\t<pageInlinksLastComputed>%" PRId32
			"</pageInlinksLastComputed>\n"

			,get_charset_str(m_charset)
			,(int32_t)m_isAdult
			,(int32_t)m_isLinkSpam
			,::getSiteRank(sni)
			,sni

			,info1->getNumGoodInlinks()
			,(int32_t)info1->m_lastUpdated
			);
		sb->safePrintf("\t<isPermalink>%" PRId32"</isPermalink>\n"
			       "\t<isRSSFeed>%" PRId32"</isRSSFeed>\n"
			       "\t<ipAddress><![CDATA[%s]]></ipAddress>\n"
			       "\t<httpStatus>%d</httpStatus>"
			       "\t<contentLenInBytes>%" PRId32
			       "</contentLenInBytes>\n"
			       "\t<isContentTruncated>%" PRId32
			       "</isContentTruncated>\n"
			       "\t<contentType><![CDATA[%s]]></contentType>\n"
			       "\t<language><![CDATA[%s]]></language>\n"
			       "\t<country><![CDATA[%s]]></country>\n",
			       (int32_t)m_isPermalink,
			       (int32_t)m_isRSS,
			       ipString,
			       m_httpStatus,
			       size_utf8Content - 1,
			       (int32_t)m_isContentTruncated,
			       g_contentTypeStrings[(int)m_contentType] ,
			       strLanguage,
			       g_countryCode.getName(m_countryId) );
	}

	TagRec *ogr = NULL;
	if ( m_tagRecDataValid ) {
		ogr = getTagRec(); // &m_tagRec;
		// sanity. should be set from titlerec, so no blocking!
		if ( ! ogr || ogr == (void *)-1 ) { g_process.shutdownAbort(true); }
	}
	if ( ogr && ! isXml ) ogr->printToBufAsHtml ( sb , "tag" );
	else if ( ogr )       ogr->printToBufAsXml  ( sb  );

	// show the good inlinks we used when indexing this
	if ( ! isXml )
		info1->print(sb,cr->m_coll);

	// close the table
	if ( ! isXml )
		sb->safePrintf ( "</table></center><br>\n" );
	else
		sb->safePrintf("</response>\n");

	return true;
}

bool XmlDoc::printSiteInlinks ( SafeBuf *sb , HttpRequest *hr ) {

	// use msg25 to hit linkdb and give us a link info class i guess
	// but we need paging functionality so we can page through like
	// 100 links at a time. clustered by c-class ip.

	// do we need to mention how many from each ip c-class then? because
	// then we'd have to read the whole termlist, might be several
	// separate disk reads.

	// we need to re-get both if either is NULL
	LinkInfo *sinfo = getSiteLinkInfo();
	// block or error?
	if ( ! sinfo ) return true;
	if ( sinfo == (LinkInfo *)-1) return false;

	int32_t isXml = hr->getLong("xml",0);

	if ( ! isXml ) printMenu ( sb );

	if ( isXml )
		sb->safePrintf ("<?xml version=\"1.0\" "
				"encoding=\"UTF-8\" ?>\n"
				"<response>\n"
				);


	sb->safeMemcpy ( &m_siteLinkBuf );

	if ( isXml )
		sb->safePrintf ("</response>\n"	);

	// just print that
	//sinfo->print ( sb , cr->m_coll );

	return true;
}

bool XmlDoc::printPageInlinks ( SafeBuf *sb , HttpRequest *hr ) {

	// we need to re-get both if either is NULL
	LinkInfo *info1 = getLinkInfo1();
	// block or error?
	if ( ! info1 ) return true;
	if ( info1 == (LinkInfo *)-1) return false;

	int32_t isXml = hr->getLong("xml",0);

	if ( ! isXml ) printMenu ( sb );

	if ( isXml )
		sb->safePrintf ("<?xml version=\"1.0\" "
				"encoding=\"UTF-8\" ?>\n"
				"<response>\n"
				);

	int32_t recompute = hr->getLong("recompute",0);

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return false;

	// i guess we need this
	if ( ! recompute ) // m_setFromTitleRec )
		info1->print ( sb , cr->m_coll );
	else
		sb->safeMemcpy ( &m_pageLinkBuf );

	if ( isXml )
		sb->safePrintf ("</response>\n"	);

	return true;
}

bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) {

	// what wordposition to scroll to and blink?
	int32_t hiPos = -1;
	if ( hr ) hiPos = hr->getLong("hipos",-1);

	//
	// PRINT SECTIONS
	//
	Sections *sections = getSections();
	if ( ! sections) return true;
	if (sections==(Sections *)-1)return false;

	Words *words = getWords();
	if ( ! words ) return true;
	if ( words == (Words *)-1 ) return false;

	Phrases *phrases = getPhrases();
	if ( ! phrases ) return true;
	if (phrases == (void *)-1 ) return false;

	HashTableX *cnt = getCountTable();
	if ( ! cnt ) return true;
	if ( cnt == (void *)-1 ) return false;


	int32_t nw = words->getNumWords();
	int64_t *wids = words->getWordIds();

	int32_t isXml = 0;
	if ( hr ) isXml = hr->getLong("xml",0);

	// now complement, cuz bigger is better in the ranking world
	SafeBuf densBuf;

	// returns false and sets g_errno on error
	if ( ! getDensityRanks((int64_t *)wids,
			       nw,
			       HASHGROUP_BODY,//hi->m_hashGroup,
			       &densBuf,
			       sections))
		return true;
	// a handy ptr
	char *densityVec = (char *)densBuf.getBufStart();
	char *wordSpamVec = getWordSpamVec();
	char *fragVec = m_fragBuf.getBufStart();

	SafeBuf wpos;
	if ( ! getWordPosVec ( words ,
			       sections,
			       // we save this in the titlerec, when we
			       // start hashing the body. we have the url
			       // terms before the body, so this is necessary.
			       m_bodyStartPos,
			       fragVec,
			       &wpos) ) return true;
	// a handy ptr
	int32_t *wposVec = (int32_t *)wpos.getBufStart();

	if ( ! isXml ) {
		// put url in for steve to parse out
		sb->safePrintf("%s\n",
			       m_firstUrl.getUrl());
		sb->safePrintf("<font color=black>w</font>"
			       "/"
			       "<font color=purple>x</font>"
			       //"/"
			       //"<font color=green>y</font>"
			       "/"
			       "<font color=red>z</font>"
			       ": "
			       "w=wordPosition "
			       "x=densityRank "
			       "y=diversityRank "
			       "z=wordSpamRank "
			       "<br>"
			       "<br>"
			       ""
			       );

		// try the new print function
		sections->print( sb, hiPos, wposVec, densityVec, wordSpamVec, fragVec );
		return true;
	}

	// at this point, xml only
	sb->safePrintf ("<?xml version=\"1.0\" "
	                "encoding=\"UTF-8\" ?>\n"
	                "<response>\n"
	                );

	Section *si = sections->m_rootSection;

	sec_t mflags = SEC_SENTENCE | SEC_MENU;

	for ( ; si ; si = si->m_next ) {
		// print it out
		sb->safePrintf("\t<section>\n");
		// get our offset in the array of sections
		int32_t num = si - sections->m_sections;
		sb->safePrintf("\t\t<id>%" PRId32"</id>\n",num);
		Section *parent = si->m_parent;
		if ( parent ) {
			int32_t pnum = parent - sections->m_sections;
			sb->safePrintf("\t\t<parent>%" PRId32"</parent>\n",pnum);
		}
		const char *byte1 = words->getWord(si->m_a);
		const char *byte2 = words->getWord(si->m_b-1) +
				    words->getWordLen(si->m_b-1);
		int32_t off1 = byte1 - words->getWord(0);
		int32_t size = byte2 - byte1;
		sb->safePrintf("\t\t<byteOffset>%" PRId32"</byteOffset>\n",off1);
		sb->safePrintf("\t\t<numBytes>%" PRId32"</numBytes>\n",size);
		if ( si->m_flags & mflags ) {
			sb->safePrintf("\t\t<flags><![CDATA[");
			bool printed = false;
			if ( si->m_flags & SEC_SENTENCE ) {
				sb->safePrintf("sentence");
				printed = true;
			}
			if ( si->m_flags & SEC_MENU ) {
				if ( printed ) sb->pushChar(' ');
				sb->safePrintf("ismenu");
				printed = true;
			}
			sb->safePrintf("]]></flags>\n");
		}
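		// derive the section background color from its color hash
		// and switch to a light font color if the background is dark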
		int32_t bcolor = (int32_t)si->m_colorHash& 0x00ffffff;
		int32_t fcolor = 0x000000;
		//int32_t rcolor = 0x000000;
		uint8_t *bp = (uint8_t *)&bcolor;
		bool dark = false;
		if ( bp[0]<128 && bp[1]<128 && bp[2]<128 )
			dark = true;
		// or if any two channels are less than 100
		if ( (bp[0]<100 && bp[1]<100) ||
		     (bp[1]<100 && bp[2]<100) ||
		     (bp[0]<100 && bp[2]<100) ) dark = true;
		// if bg color is dark, make font color light
		if ( dark ) {
			fcolor = 0x00ffffff;
			//rcolor = 0x00ffffff;
		}
		sb->safePrintf("\t\t<bgColor>%06" PRIx32"</bgColor>\n",bcolor);
		sb->safePrintf("\t\t<textColor>%06" PRIx32"</textColor>\n",fcolor);
		sb->safePrintf("\t</section>\n");
	}

	// now print out the entire page content so the offsets make sense!
	sb->safePrintf("\t<utf8Content><![CDATA[");
	if ( ptr_utf8Content )
		sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,false);
	sb->safePrintf("]]></utf8Content>\n");

	// end xml response
	sb->safePrintf("</response>\n");

	return true;
}

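// log every term we hashed (from the m_wts debug table) at trace level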
void XmlDoc::printTermList() const {
	if (!m_wts) {
		return;
	}

	// shortcut
	HashTableX *wt = m_wts;

	// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
	TermDebugInfo **tp = NULL;

	// add them with this counter
	int32_t nt = 0;

	int32_t nwt = 0;
	if ( wt ) {
		nwt = wt->getNumSlots();
		tp = (TermDebugInfo **)wt->m_keys;
	}

	// now print the table we stored all we hashed into
	for ( int32_t i = 0 ; i < nwt ; i++ ) {
		// skip if empty
		if ( wt->m_flags[i] == 0 ) continue;

		// get the TermDebugInfo
		TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
		// point to it for sorting
		tp[nt++] = ti;
	}

	const char *start = m_wbuf.getBufStart();

	for ( int32_t i = 0 ; i < nt ; i++ ) {
		TermDebugInfo *tpi = tp[i];

		const char *prefix = NULL;
		if (tpi->m_prefixOff >= 0) {
			prefix = start + tpi->m_prefixOff;
		}

		const char *desc = NULL;
		if (tpi->m_descOff >= 0) {
			desc = start + tpi->m_descOff;
		}

		// use hashgroup
		int32_t hg = tpi->m_hashGroup;
		if (!desc || !strcmp(desc, "body"))
			desc = getHashGroupString(hg);

		logf(LOG_TRACE, "termId=%015" PRId64" prefix='%s' wordPos=%" PRId32" wordNum=%" PRId32" term='%.*s' desc='%s%s%s' densityRank=%hhd wordSpamRank=%hhd",
		     (int64_t)(tp[i]->m_termId & TERMID_MASK),
		     prefix ? prefix : "",
		     tpi->m_wordPos,
		     tpi->m_wordNum,
		     tpi->m_termLen, start + tpi->m_termOff,
		     desc,
		     tpi->m_synSrc ? " - " : "",
		     tpi->m_synSrc ? getSourceString(tpi->m_synSrc) : "",
		     tpi->m_densityRank,
		     tpi->m_wordSpamRank);

	}
}

bool XmlDoc::printTermList ( SafeBuf *sb , HttpRequest *hr ) {

	// set debug buffer
	m_storeTermListInfo = true;

	// default to sorting by wordpos
	m_sortTermListBy = hr->getLong("sortby",1);

	// cores in getNewSpiderReply() if we do not have this and provide
	// the docid...
	m_useSpiderdb = false;

	char *metaList = getMetaList ( );
	if ( ! metaList ) return true;
	if (metaList==(char *) -1) return false;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return false;


	int32_t isXml = hr->getLong("xml",0);

	if ( isXml ) {
		sb->safePrintf ("<?xml version=\"1.0\" "
				"encoding=\"UTF-8\" ?>\n"
				"<response>\n"
				);
		sb->safePrintf(
			       "\t<maxDens>%" PRId32"</maxDens>\n"
			       "\t<maxDiv>%" PRId32"</maxDiv>\n"
			       "\t<maxSpam>%" PRId32"</maxSpam>\n"
			       , (int32_t)MAXDENSITYRANK
			       , (int32_t)MAXDIVERSITYRANK
			       , (int32_t)MAXWORDSPAMRANK
			       );
	}

	if ( ! m_langIdValid ) { g_process.shutdownAbort(true); }

	if ( ! isXml ) {
		//printMenu ( sb );
		//sb->safePrintf("<i>* indicates word is a synonym or "
		//	       "alternative word form<br><br>");
		sb->safePrintf("N column = DensityRank (0-%" PRId32")<br>"
			       "V column = DiversityRank (0-%" PRId32")<br>"
			       "S column = WordSpamRank  (0-%" PRId32") "
			       "[or linker "
			       "siterank if it's offsite link text]<br>"

			       "Lang column = language used for purposes "
			       "of detecting the document's primary language "
			       "using a simple majority vote"
			       "<br>"

			       "</i>"
			       "<br>"
			       "Document Primary Language: <b>%s</b> (%s)"
			       "<br>"
			       "<br>"
			       , (int32_t)MAXDENSITYRANK
			       , (int32_t)MAXDIVERSITYRANK
			       , (int32_t)MAXWORDSPAMRANK
			       , getLanguageString (m_langId)
			       , getLanguageAbbr(m_langId)
			       );
		// encode it
		SafeBuf ue;
		urlEncode(&ue, ptr_firstUrl);

		sb->safePrintf("Sort by: " );
		if ( m_sortTermListBy == 0 )
			sb->safePrintf("<b>Term</b>");
		else
			sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&"
				       "sortby=0>"
				       "Term</a>"
				       , cr->m_coll
				       , ue.getBufStart()
				       );
		sb->safePrintf(" | ");
		if ( m_sortTermListBy == 1 )
			sb->safePrintf("<b>WordPos</b>");
		else
			sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&"
				       "sortby=1>"
				       "WordPos</a>"
				       , cr->m_coll
				       , ue.getBufStart()
				       );
		sb->safePrintf("<br>"
			       "<br>"
			       );
	}


	//
	// BEGIN PRINT HASHES TERMS (JUST POSDB)
	//

	// shortcut
	HashTableX *wt = m_wts;

	// use the keys to hold our list of ptrs to TermDebugInfos for sorting!
	TermDebugInfo **tp = NULL;
	// add them with this counter
	int32_t nt = 0;

	int32_t nwt = 0;
	if ( wt ) {
		nwt = wt->getNumSlots();
		tp = (TermDebugInfo **)wt->m_keys;
	}

	// now print the table we stored all we hashed into
	for ( int32_t i = 0 ; i < nwt ; i++ ) {
		// skip if empty
		if ( wt->m_flags[i] == 0 ) continue;
		// get its key, date=32bits termid=64bits
		//key96_t *k = (key96_t *)wt->getKey ( i );
		// get the TermDebugInfo
		TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i );
		// point to it for sorting
		tp[nt++] = ti;
	}

	// set this for cmptp
	s_wbuf = &m_wbuf;

	if ( m_sortTermListBy == 0 )
		// sort them alphabetically
		gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp );
	else
		// sort by word pos
		gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp2 );

	// print them out in a table
	char hdr[1000];
	sprintf(hdr,
		"<table border=1 cellpadding=0>"
		"<tr>"

		"<td><b>Term ID</b></td>"

		"<td><b>Prefix</b></td>"
		"<td><b>WordPos</b></td>"
		"<td><b>Lang</b></td>"

		"<td><b>Term</b></td>"

		"<td><b>Desc</b></td>"
		"<td><b>Density</b></td>"
		"<td><b>Diversity</b></td>"
		"<td><b>Spam</b></td>"
		"<td><b>Inlink PR</b></td>"
		"<td><b>Score</b></td>"

		"</tr>\n"
		//,fbuf
		);

	if ( ! isXml )
		sb->safePrintf("%s",hdr);

	char *start = m_wbuf.getBufStart();
	int32_t rcount = 0;

	for ( int32_t i = 0 ; i < nt ; i++ ) {

		// see if one big table causes a browser slowdown
		if ( (++rcount % TABLE_ROWS) == 0 && ! isXml )
			sb->safePrintf("<!--ignore--></table>%s",hdr);

		char *prefix = NULL;//"&nbsp;";
		if ( tp[i]->m_prefixOff >= 0 )
			prefix = start + tp[i]->m_prefixOff;

		if ( isXml ) sb->safePrintf("\t<term>\n");

		if ( isXml && prefix )
			sb->safePrintf("\t\t<prefix><![CDATA[%s]]>"
				       "</prefix>\n",prefix);

		if ( ! isXml ) {
			sb->safePrintf ( "<tr>");
			// Show termId in decimal, masked as it would be stored in posdb
			sb->safePrintf("<td align=\"right\">%" PRId64"</td>", (int64_t)(tp[i]->m_termId & TERMID_MASK));

			if ( prefix )
				sb->safePrintf("<td>%s:</td>",prefix);
			else
				sb->safePrintf("<td>&nbsp;</td>");

			sb->safePrintf("<td>%" PRId32
				       "/%" PRId32
				       "</td>" ,
				       tp[i]->m_wordPos
				       ,tp[i]->m_wordNum
				       );

			// print out all langs word is in if it's not clear
			// what language it is. we use a sliding window to
			// resolve some ambiguity, but not all, so print out
			// the possible langs here
			sb->safePrintf("<td>");
			printLangBits ( sb , tp[i] );
			sb->safePrintf("</td>");
		}

		if ( isXml )
			sb->safePrintf("\t\t<s><![CDATA[");

		if ( ! isXml )
			sb->safePrintf ("<td><nobr>" );

		sb->safeMemcpy_nospaces ( start + tp[i]->m_termOff ,
					  tp[i]->m_termLen );

		if ( isXml )
			sb->safePrintf("]]></s>\n");
		else
			sb->safePrintf ( "</nobr></td>" );


		if ( isXml )
			sb->safePrintf("\t\t<wordPos>%" PRId32"</wordPos>\n",
				       tp[i]->m_wordPos);

		const char *desc = NULL;
		if ( tp[i]->m_descOff >= 0 )
			desc = start + tp[i]->m_descOff;

		// use hashgroup
		int32_t hg = tp[i]->m_hashGroup;
		if ( ! desc || ! strcmp(desc,"body") )
			desc = getHashGroupString(hg);

		if ( isXml && desc )
			sb->safePrintf("\t\t<loc>%s</loc>\n", desc);
		else if ( ! isXml ) {
			if ( ! desc ) desc = "&nbsp;";
			sb->safePrintf ( "<td>%s", desc );
			char ss = tp[i]->m_synSrc;
			if ( ss )
				sb->safePrintf(" - %s",
					       getSourceString(ss));
			sb->safePrintf("</td>");
		}

		int32_t dn = (int32_t)tp[i]->m_densityRank;
		if ( isXml ) {
			sb->safePrintf("\t\t<dens>%" PRId32"</dens>\n",dn);
		}
		else {
			if( dn >= MAXDENSITYRANK ) {
				sb->safePrintf("<td>%" PRId32"</td>\n",dn);
			}
			else {
				sb->safePrintf("<td><font color=purple>%" PRId32"</font></td>",dn);
			}
		}

		int32_t dv = (int32_t)tp[i]->m_diversityRank;
		if ( isXml ) {
			sb->safePrintf("\t\t<divers>%" PRId32"</divers>\n",dv);
		}
		else {
			if( dv >= MAXDIVERSITYRANK ) {
				sb->safePrintf("<td>%" PRId32"</td>\n",dv);
			}
			else {
				sb->safePrintf("<td><font color=purple>%" PRId32"</font></td>",dv);
			}
		}


		// the wordspamrank
		int32_t ws = (int32_t)tp[i]->m_wordSpamRank;

		if ( isXml ) {
			if( hg == HASHGROUP_INLINKTEXT ) {
				sb->safePrintf("\t\t<linkerSiteRank>%" PRId32 "</linkerSiteRank>\n",ws);
			}
			else {
				sb->safePrintf("\t\t<spam>%" PRId32"</spam>\n",ws);
			}
		}
		else {
			if( hg == HASHGROUP_INLINKTEXT ) {
				sb->safePrintf("<td></td>");
				sb->safePrintf("<td>%" PRId32"</td>",ws);
			}
			else {
				if ( ws >= MAXWORDSPAMRANK ) {
					sb->safePrintf("<td>%" PRId32"</td>",ws);
				}
				else {
					sb->safePrintf("<td><font color=red>%" PRId32"</font></td>", ws);
				}
				sb->safePrintf("<td></td>");
			}
		}

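		// approximate the per-term score the query ranking algo would
		// compute: hash-group weight (squared, like at query time)
		// times the diversity, density, synonym and spam/linker weights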
		float score = 1.0;
		// square this like we do in the query ranking algo
		score *= getHashGroupWeight(hg) * getHashGroupWeight(hg);
		score *= getDiversityWeight(tp[i]->m_diversityRank);
		score *= getDensityWeight(tp[i]->m_densityRank);
		if ( tp[i]->m_synSrc ) score *= g_conf.m_synonymWeight;
		if ( hg == HASHGROUP_INLINKTEXT ) score *= getLinkerWeight(ws);
		else                           score *= getWordSpamWeight(ws);
		if ( isXml )
			sb->safePrintf("\t\t<score>%.02f</score>\n",score);
		else
			sb->safePrintf("<td>%.02f</td>\n",score);

		if ( isXml )
			sb->safePrintf("\t</term>\n");
		else
			sb->safePrintf("</tr>\n");
	}


	if ( isXml )
		sb->safePrintf ("</response>\n"	);
	else
		sb->safePrintf("</table><br>\n");

	//
	// END PRINT HASHES TERMS
	//

	return true;
}

bool XmlDoc::printSpiderStats ( SafeBuf *sb , HttpRequest *hr ) {

	int32_t isXml = hr->getLong("xml",0);

	if ( ! isXml ) printMenu ( sb );

	sb->safePrintf("<b>Coming Soon</b>");

	return true;
}

bool XmlDoc::printCachedPage ( SafeBuf *sb , HttpRequest *hr ) {

	char **c = getUtf8Content();
	if ( ! c ) return true;
	if ( c==(void *)-1) return false;

	int32_t isXml = hr->getLong("xml",0);

	if ( ! isXml ) {
		printMenu ( sb );
		// just copy it otherwise
		if ( ptr_utf8Content )
			sb->safeMemcpy ( ptr_utf8Content ,size_utf8Content -1);
		return true;
	}

	sb->safePrintf ("<?xml version=\"1.0\" "
			"encoding=\"UTF-8\" ?>\n"
			"<response>\n"
			);
	sb->safePrintf("\t<utf8Content><![CDATA[");
	if ( ptr_utf8Content )
		sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,
				 false);
	sb->safePrintf("]]></utf8Content>\n");
	// end xml response
	sb->safePrintf("</response>\n");
	return true;
}


// . get the possible titles of the root page
// . includes the title tag text
// . includes various inlink text
// . used to match the VERIFIED place name 1 or 2 of addresses on this
//   site in order to set Address::m_flags's AF_VENUE_DEFAULT bit which
//   indicates the address is the address of the website (a venue website)
char *XmlDoc::getRootTitleBuf ( ) {

	// return if valid
	if ( m_rootTitleBufValid ) return m_rootTitleBuf;

	// get it from the tag rec first
	setStatus ( "getting root title buf");

	// get it from the tag rec if we can
	TagRec *gr = getTagRec ();
	if ( ! gr || gr == (void *)-1 ) return (char *)(void*)gr;

	// PROBLEM: new title rec is the only thing which has sitetitles tag
	// sometimes and we do not store that in the title rec. in this case
	// we should maybe store ptr_siteTitleBuf/size_siteTitleBuf in the
	// title rec?
	Tag *tag = gr->getTag("roottitles");

	char *src     = NULL;
	int32_t  srcSize = 0;

	if ( ptr_rootTitleBuf || m_setFromTitleRec ) {
		src    =  ptr_rootTitleBuf;
		srcSize = size_rootTitleBuf;
	}
	else if ( tag ) {
		src     = tag->getTagData();
		srcSize = tag->getTagDataSize();
		// no need to add to title rec since already in the tag so
		// make sure we did not double add
		if ( ptr_rootTitleBuf ) { g_process.shutdownAbort(true); }
	}
	else {
		// . get the root doc
 		// . allow for a one hour cache of the titleRec
		XmlDoc **prd = getRootXmlDoc( 3600 );
		if ( ! prd || prd == (void *)-1 ) return (char*)(void*)prd;
		// shortcut
		XmlDoc *rd = *prd;
		// . if no root doc, then assume no root title
		// . this happens if we are injecting because we do not want
		//   to download the root page for speed purposes
		if ( ! rd ) {
			m_rootTitleBuf[0] = '\0';
			m_rootTitleBufSize = 0;
			m_rootTitleBufValid = true;
			return m_rootTitleBuf;
		}

		// a \0 separated list
		char *rtl = rd->getTitleBuf();
		if ( ! rtl || rtl == (void *)-1 ) return rtl;

		// ptr
		src     = rd->m_titleBuf;
		srcSize = rd->m_titleBufSize;
	}

	int32_t max = (int32_t)ROOT_TITLE_BUF_MAX - 5;
	// sanity
	if ( src && srcSize >= max ) {
		// truncate
		srcSize = max;
		// back up so we split on a space
		for ( ; srcSize>0 && ! is_wspace_a(src[srcSize]); srcSize--);
		// null term
		src[srcSize] = '\0';
		// include it
		srcSize++;
	}

	// copy that over in case root is destroyed
	if( src && srcSize ) {
		gbmemcpy ( m_rootTitleBuf , src , srcSize );
	}
	else {
		m_rootTitleBuf[0] = '\0';
	}
	m_rootTitleBufSize = srcSize;

	// sanity check, must include the null in the size
	if ( m_rootTitleBufSize > 0 &&
	     m_rootTitleBuf [ m_rootTitleBufSize - 1 ] ) {
		log("build: bad root titlebuf size not end in null char for "
		    "collnum=%i",(int)m_collnum);
		ptr_rootTitleBuf = NULL;
		size_rootTitleBuf = 0;
		m_rootTitleBufValid = true;
		return m_rootTitleBuf;
	}

	// sanity check - breach check
	if ( m_rootTitleBufSize > ROOT_TITLE_BUF_MAX ) { g_process.shutdownAbort(true);}

	// serialize into our titlerec
	ptr_rootTitleBuf  = m_rootTitleBuf;
	size_rootTitleBuf = m_rootTitleBufSize;

	m_rootTitleBufValid = true;

	return m_rootTitleBuf;
}


char *XmlDoc::getFilteredRootTitleBuf ( ) {

	if ( m_filteredRootTitleBufValid )
		return m_filteredRootTitleBuf;

	// get unfiltered. m_rootTitleBuf should be set from this call.
	char *rtbp = getRootTitleBuf();
	if ( ! rtbp || rtbp == (void *)-1 ) return rtbp;

	// filter all the punct to \0 so that something like
	// "walmart.com : live better" is reduced to 3 potential
	// names, "walmart", "com" and "live better"
#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(m_rootTitleBuf,m_rootTitleBufSize);
#endif
	char *src    =       m_rootTitleBuf;
	char *srcEnd = src + m_rootTitleBufSize;
	char *dst    =       m_filteredRootTitleBuf;

	// save some room to add a \0, so subtract 5
	char *dstEnd = dst + ROOT_TITLE_BUF_MAX - 5;

	int32_t  size = 0;
	bool lastWasPunct = true;
	for ( ; src < srcEnd && dst < dstEnd ; src += size ) {
		// set the char size
		size = getUtf8CharSize(src);
		// space?
		if ( is_wspace_a (*src) ||
		     // allow periods too
		     *src=='.' ) {
			// no back to back punct
			if ( lastWasPunct ) continue;
			// flag it
			lastWasPunct = true;
			// add it in
			*dst++ = '.';
			// that's it
			continue;
		}
		// x'y  or x-y
		if ( ( *src == '\'' ||
		       *src == '.'  ||
		       *src == '-'  ) &&
		     ! lastWasPunct &&
		     is_alnum_a(src[1]) ) {
			// add it in
			*dst++ = *src;
			// that's it
			continue;
		}
		// x & y is ok
		if ( *src == '&' ) {
			// assume not punct (stands for and)
			lastWasPunct = false;
			// add it in
			*dst++ = *src;
			// that's it
			continue;
		}
		// store alnums right in
		if ( is_alnum_a(*src) ) {
			// flag it
			lastWasPunct = false;
			// copy it over
			gbmemcpy ( dst , src , size );
			// skip what we copied
			dst += size;
			continue;
		}
		// if the last char was already punct, terminate the current
		// name in place, but never write before the buffer start
		if ( lastWasPunct ) {
			if ( dst > m_filteredRootTitleBuf ) dst[-1] = '\0';
		}
		// otherwise store a separator
		else *dst++ = '\0';
	}
	// make sure we end on a \0
	if ( dst > m_filteredRootTitleBuf && dst[-1] != '\0' )
		*dst++ = '\0';

	// the filtered titles were written directly into
	// m_filteredRootTitleBuf above, so just record the size
	m_filteredRootTitleBufSize = dst - m_filteredRootTitleBuf;

	// sanity check, must include the null in the size
	if ( m_filteredRootTitleBufSize > 0 &&
	     m_filteredRootTitleBuf [ m_filteredRootTitleBufSize - 1 ] ) {
		g_process.shutdownAbort(true);
	}

	// sanity check - breach check
	if ( m_filteredRootTitleBufSize > ROOT_TITLE_BUF_MAX ) {
		g_process.shutdownAbort(true);}

	m_filteredRootTitleBufValid = true;

#ifdef _VALGRIND_
	VALGRIND_CHECK_MEM_IS_DEFINED(m_filteredRootTitleBuf,m_filteredRootTitleBufSize);
#endif

	return m_filteredRootTitleBuf;
}

//static bool s_dummyBool = 1;

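// holds one inlink's anchor text while getTitleBuf() scores and dedups
// candidate root titles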
class Binky {
public:
	char      *m_text;
	int32_t       m_textLen;
	int32_t       m_score;
	int64_t  m_hash;
};


// static int cmpbk ( const void *v1, const void *v2 ) {
// 	Binky *b1 = (Binky *)v1;
// 	Binky *b2 = (Binky *)v2;
// 	return b1->m_score - b2->m_score;
// }

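// . build a \0-separated list of candidate titles for this root page
// . starts with the <title> tag text (truncated to ~100 chars) and appends
//   any inlink anchor text that survives the dedup pass below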
char *XmlDoc::getTitleBuf ( ) {
	if ( m_titleBufValid ) return m_titleBuf;

	// recalc this every time the root page is indexed
	setStatus ( "getting title buf on root");

	// are we a root?
	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 ) return (char*)isRoot;
	// this should only be called on the root!
	// . if the site changed for us, but the title rec of what we
	//   think is now the root thinks that it is not the root because
	//   it is using the old site, then it cores here!
	// . i.e. if the new root is www.xyz.com/user/ted/ and the old root
	//   is www.xyz.com then and the old root is stored in ptr_site for
	//   the title rec for www.xyz.com/user/ted/ then we core here,
	// . so take this sanity check out
	// . but if the title rec does not think he is the site root yet
	//   then just wait until he does so we can get his
	//   ptr_rootTitleBuf below
	if ( ! *isRoot ) {
		m_titleBuf[0] = '\0';
		m_titleBufSize = 0;
		m_titleBufValid = true;
		return m_titleBuf;
	}

	// sanity check
	if ( m_setFromTitleRec ) {
		gbmemcpy(m_titleBuf, ptr_rootTitleBuf, size_rootTitleBuf );
		m_titleBufSize  = size_rootTitleBuf;
		m_titleBufValid = true;
		return m_titleBuf;
	}

	char *mysite = getSite();
	if ( ! mysite || mysite == (char *)-1 ) return mysite;
	// get link info first
	LinkInfo   *info1  = getLinkInfo1();
	// error or blocked
	if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char*)(void*)info1;

	// sanity check
	Xml *xml = getXml();
	// return -1 if it blocked
	if ( xml == (void *)-1 ) return (char*)-1;
	// set up for title
	int32_t tlen ;
	char *title ;
	// on error, ignore it to avoid hammering the root!
	if ( xml == (void *)NULL ) {
		// log it
		log("build: error downloading root xml: %s",
		    mstrerror(g_errno));
		// clear it
		g_errno = 0;
		// make it 0
		tlen  = 0;
		title = NULL;
	}
	else {
		// get the title
		title = m_xml.getTextForXmlTag ( 0,
						 999999 ,
						 "title" ,
						 &tlen ,
						 true ); // skip leading spaces
	}

	// truncate to 100 chars
	//for ( ; tlen>0 && (tlen > 100 || is_alnum_a(title[tlen])) ; tlen-- )
	//	if ( tlen == 0 ) break;
	if ( tlen > 100 ) {
		char *tpend = title + 100;
		char *prev  = getPrevUtf8Char ( tpend , title );
		// make that the end so we don't split a utf8 char
		tlen = prev - title;
	}

	// store tag in here
	char tmp[1024];
	// point to it
	char *ptmp = tmp;
	// set this
	char *pend = tmp + 1024;
	// add that in
	gbmemcpy ( ptmp, title, tlen); ptmp += tlen;
	// null terminate it
	*ptmp++ = '\0';

	// two votes per internal inlink
	int32_t internalCount = 0;
	// count inlinkers
	int32_t linkNum = 0;
	Binky bk[1000];
	// init this
	//char stbuf[2000];
	//HashTableX scoreTable;
	//scoreTable.set(8,4,64,stbuf,2000,false,m_niceness,"xmlscores");
	// scan each link in the link info
	for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) {
		// do not breach
		if ( linkNum >= 1000 ) break;
		// is this inlinker internal?
		bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff));
		// get length of link text
		int32_t tlen = k->size_linkText;
		if ( tlen > 0 ) tlen--;
		// get the text
		char *txt = k->getLinkText();
		// skip corrupted
		if ( ! verifyUtf8 ( txt , tlen ) ) {
			log("xmldoc: bad link text 4 from url=%s for %s",
			    k->getUrl(),m_firstUrl.getUrl());
			continue;
		}
		// store these
		// zero out hash
		bk[linkNum].m_hash    = 0;
		bk[linkNum].m_text    = txt;
		bk[linkNum].m_textLen = tlen;
		bk[linkNum].m_score   = 0;
		// internal count
		if ( internal && ++internalCount >= 3 ) continue;
		// it's good
		bk[linkNum].m_score = 1;
		linkNum++;
	}
	// init this
	char dtbuf[1000];
	HashTableX dupTable;
	dupTable.set(8,0,64,dtbuf,1000,false,"xmldup");
	// now set the scores and isdup
	for ( int32_t i = 0 ; i < linkNum ; i++ ) {
		// skip if ignored
		if ( bk[i].m_score == 0 ) continue;
		// get hash
		int64_t h = bk[i].m_hash;
		// assume a dup
		bk[i].m_score = 0;
		// skip if zero'ed out
		if ( ! h ) continue;
		// only do each hash once!
		if ( dupTable.isInTable(&h) ) continue;
		// add to it. return NULL with g_errno set on error
		if ( ! dupTable.addKey(&h) ) return NULL;
		// is it in there?
		bk[i].m_score = 1; // scoreTable.getScore(h);
	}
	// now sort the bk array by m_score
	//gbsort ( bk , linkNum , sizeof(Binky), cmpbk );

	// sanity check - make sure sorted right
	//if ( linkNum >= 2 && bk[0].m_score < bk[1].m_score ) {
	//	g_process.shutdownAbort(true); }

	// . now add the winners to the buffer
	// . skip if score is 0
	for ( int32_t i = 0 ; i < linkNum ; i++ ) {
		// skip if score is zero
		if ( bk[i].m_score == 0 ) continue;
		// skip if too big
		if ( bk[i].m_textLen + 1 > pend - ptmp ) continue;
		// store it
		gbmemcpy ( ptmp , bk[i].m_text , bk[i].m_textLen );
		// advance
		ptmp += bk[i].m_textLen;
		// null terminate it
		*ptmp++ = '\0';
	}

	// sanity
	int32_t size = ptmp - tmp;
	if ( size > ROOT_TITLE_BUF_MAX ) { g_process.shutdownAbort(true); }

	gbmemcpy ( m_titleBuf , tmp , ptmp - tmp );
	m_titleBufSize = size;
	m_titleBufValid = true;
	// ensure null terminated
	if ( size > 0 && m_titleBuf[size-1] ) { g_process.shutdownAbort(true); }
	//ptr_siteTitleBuf = m_siteTitleBuf;
	//size_siteTitleBuf = m_siteTitleBufSize;
	return m_titleBuf;
}


// . now we just get all the tagdb rdb recs to add using this function
// . then we just use the metalist to update tagdb
SafeBuf *XmlDoc::getNewTagBuf ( ) {
	if ( m_newTagBufValid ) return &m_newTagBuf;

	setStatus ( "getting new tags");

	int32_t *ic = getIndexCode();
	if ( ic == (void *)-1 ) { g_process.shutdownAbort(true); }

	// get our ip
	int32_t *ip = getIp();
	// this must not block to avoid re-computing "addme" above
	if ( ip == (void *)-1 ) { g_process.shutdownAbort(true); }
	if ( ! ip || ip == (int32_t *)-1) return (SafeBuf *)ip;

	// . do not bother if there is a problem
	// . otherwise if our ip is invalid (0 or 1) we core in
	//   getNumSiteInlinks() which requires a valid ip
	// . if its robots.txt disallowed, then indexCode will be set, but we
	//   still want to cache our sitenuminlinks in tagdb! delicious.com was
	//   recomputing the sitelinkinfo each time because we were not storing
	//   these tags in tagdb!!
	if ( ! *ip || *ip == -1 ) { // *ic ) {
		m_newTagBuf.reset();
		m_newTagBufValid = true;
		return &m_newTagBuf;
	}

	// get the tags already in tagdb
	TagRec *gr = getTagRec ( );
	if ( ! gr || gr == (void *)-1 ) return (SafeBuf *)gr;

	// get our site
	char *mysite = getSite();
	// this must not block to avoid re-computing "addme" above
	if ( mysite == (void *)-1 ) { g_process.shutdownAbort(true); }
	if ( ! mysite || mysite == (char *)-1 ) return (SafeBuf *)mysite;

	// age of tag in seconds
	int32_t timestamp;

	// always just use the primary tagdb so we can cache our sitenuminlinks
	rdbid_t rdbId = RDB_TAGDB;

	// sitenuminlinks special for repair
	if ( m_useSecondaryRdbs &&
	     // and not rebuilding titledb
	     ! m_useTitledb ) {
		m_newTagBuf.reset();
		m_newTagBufValid = true;
		int32_t old1 = gr->getLong("sitenuminlinks",-1,&timestamp);
		if ( old1 == m_siteNumInlinks &&
		     old1 != -1 &&
		     ! m_updatingSiteLinkInfoTags )
			return &m_newTagBuf;
		int32_t now = getTimeGlobal();
		if ( g_conf.m_logDebugLinkInfo )
			log("xmldoc: adding tag site=%s sitenuminlinks=%" PRId32,
			    mysite,m_siteNumInlinks);
		if ( ! Tagdb::addTag2(&m_newTagBuf, mysite,"sitenuminlinks",now,
				      "xmldoc",
				      *ip,m_siteNumInlinks,rdbId) )
			return NULL;
		return &m_newTagBuf;
	}

	// if doing consistency check, this buf is for adding to tagdb
	// so just ignore those. we use ptr_tagRecData in getTagRec() function
	// but this is really for updating tagdb.
	if ( m_doingConsistencyCheck ) {
		m_newTagBuf.reset();
		m_newTagBufValid = true;
		return &m_newTagBuf;
	}

	Xml *xml = getXml();
	if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml;

	Words *ww = getWords();
	if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww;

	char *isIndexed = getIsIndexed();
	if ( !isIndexed || isIndexed==(char *)-1 ) return (SafeBuf *)isIndexed;

	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 ) return (SafeBuf *)isRoot;

	int32_t *siteNumInlinks = getSiteNumInlinks();
	if ( ! siteNumInlinks ) return NULL;
	if (   siteNumInlinks == (int32_t *)-1) return (SafeBuf *)-1;

	// ok, get the sites of the external outlinks and they must
	// also be NEW outlinks, added to the page since the last time
	// we spidered it...
	Links *links = getLinks ();
	if ( ! links || links == (Links *)-1 ) return (SafeBuf *)links;

	// should we add/spider our outlinks?
	char *spiderLinks = getSpiderLinks();
	if ( ! spiderLinks  || spiderLinks == (char *)-1 )
		return (SafeBuf *)spiderLinks;

	// . get ips of all outlinks.
	// . use m_msgeForIps class just for that
	// . it sucks if the outlink's ip is a dns timeout, then we never
	//   end up being able to store it in tagdb, that is why when
	//   rebuilding we need to skip adding firstip tags for the outlinks
	int32_t **ipv = NULL;
	TagRec ***grv = NULL;
	bool addLinkTags = true;
	if ( ! *spiderLinks ) addLinkTags = false;
	if ( ! m_useSpiderdb ) addLinkTags = false;
	if ( addLinkTags ) {
		ipv = getOutlinkFirstIpVector ();
		if ( ! ipv || ipv == (void *)-1 ) return (SafeBuf *)ipv;
		// . uses m_msgeForTagRecs for this one
		grv = getOutlinkTagRecVector();
		if ( ! grv || grv == (void *)-1 ) return (SafeBuf *)grv;
	}

	//
	// init stuff
	//

	// . this gets the root doc and parses titles out of it
	// . sets our m_rootTitleBuf/m_rootTitleBufSize
	char *rtbufp = getRootTitleBuf();
	if ( ! rtbufp || rtbufp == (void *)-1) return (SafeBuf*)(void*)rtbufp;

	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	// overwrite "getting root title buf" status
	setStatus ("computing new tags");

	if ( g_conf.m_logDebugLinkInfo )
		log("xmldoc: adding tags for mysite=%s",mysite);

	// shortcut
	//TagRec *tr = &m_newTagRec;
	// current time
	int32_t now = getTimeGlobal();

	// store tags into here
	SafeBuf *tbuf = &m_newTagBuf;
	// allocate space to hold the tags we will add
	int32_t need = 512;
	// add in root title buf in case we add it too
	need += m_rootTitleBufSize;
	// reserve it all now
	if ( ! tbuf->reserve(need) ) return NULL;

	//
	// add "site" tag
	//
	const char *oldsite = gr->getString( "site", NULL, NULL, &timestamp );
	if ( ! oldsite || strcmp(oldsite,mysite) != 0 || now-timestamp > 10*86400)
		Tagdb::addTag3(tbuf,mysite,"site",now,"xmldoc",*ip,mysite,rdbId);

	//
	// add firstip if not there at all
	//
	const char *oldfip = gr->getString("firstip",NULL);
	// convert it
	int32_t ip3 = 0;
	if ( oldfip ) ip3 = atoip(oldfip);
	// if not there or if bogus, add it!! should override bogus firstips
	if ( ! ip3 || ip3 == -1 ) {
		char ipbuf[16];
		Tagdb::addTag3(tbuf,mysite,"firstip",now,"xmldoc",*ip,iptoa(m_ip,ipbuf),
			       rdbId);
	}

	// sitenuminlinks
	int32_t old1 = gr->getLong("sitenuminlinks",-1,&timestamp);
	if ( old1 == -1 || old1 != m_siteNumInlinks || m_updatingSiteLinkInfoTags ) {
		if ( g_conf.m_logDebugLinkInfo )
			log("xmldoc: adding tag site=%s sitenuminlinks=%" PRId32,
			    mysite,m_siteNumInlinks);
		if ( ! Tagdb::addTag2(tbuf,mysite,"sitenuminlinks",now,"xmldoc",
				      *ip,m_siteNumInlinks,rdbId) )
			return NULL;
	}

	// get root title buf from old tag
	char *data  = NULL;
	int32_t  dsize = 0;
	Tag *rt = gr->getTag("roottitles");
	if ( rt ) {
		data  = rt->getTagData();
		dsize = rt->getTagDataSize();
	}

	bool addRootTitle = false;
	// store the root title buf if we need to. if we had no tag yet...
	if ( ! rt )
		addRootTitle = true;
	// or if differs in size
	else if ( dsize != m_rootTitleBufSize )
		addRootTitle = true;
	// or if differs in content
	else if ( memcmp(data,m_rootTitleBuf,m_rootTitleBufSize) != 0 )
		addRootTitle = true;
	// or if the existing tag is 10+ days old (the timestamp here comes
	// from the sitenuminlinks lookup above)
	if ( old1!=-1 && now-timestamp > 10*86400 ) addRootTitle = true;
	// but not if injected
	if ( m_wasContentInjected && ! *isRoot ) addRootTitle = false;
	// add it then
	if ( addRootTitle &&
	     ! Tagdb::addTag(tbuf,mysite,"roottitles",now,"xmldoc",
			     *ip,m_rootTitleBuf,m_rootTitleBufSize,
			    rdbId,true) )
		return NULL;


	//
	//
	// NOW add tags for our outlinks
	//
	//

	bool oldHighQualityRoot = true;

	// if we are new, do not add anything, because we only add a tagdb
	// rec entry for "new" outlinks  that were added to the page since
	// the last time we spidered it
	if ( ! *isIndexed ) oldHighQualityRoot = false;

	// no updating if we are not root
	if ( ! *isRoot ) oldHighQualityRoot = false;

	// must be high quality, too
	if ( *siteNumInlinks < 500 ) oldHighQualityRoot = false;
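
	// i.e. only a previously-indexed site root with 500+ site inlinks
	// gets to vote "authorityinlink" tags for its new external outlinks
	// below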

	// only do once per site
	char buf[1000];
	HashTableX ht; ht.set (4,0,-1 , buf , 1000 ,false,"sg-tab");
	// get site of outlink
	SiteGetter siteGetter;
	// . must be from an EXTERNAL DOMAIN and must be new
	// . we should already have its tag rec, if any, since we have msge
	int32_t n = links->getNumLinks();
	// not if not spidering links
	if ( ! addLinkTags ) n = 0;
	// get the flags
	linkflags_t *flags = links->m_linkFlags;
	// scan all outlinks we have on this page
	for ( int32_t i = 0 ; i < n ; i++ ) {

		// get its tag rec
		TagRec *gr = (*grv)[i];

		// does this hostname have a "firstIp" tag?
		const char *ips = gr->getString("firstip",NULL);

		bool skip = false;
		// skip if we are not "old" high quality root
		if ( ! oldHighQualityRoot ) skip = true;
		// . skip if not external domain
		// . we added this above, so just "continue"
		if ( flags[i] & LF_SAMEDOM ) continue;//skip = true;
		// skip links in the old title rec
		if ( flags[i] & LF_OLDLINK ) skip = true;
		// skip if determined to be link spam! should help us
		// with the text ads we hate so much
		if ( links->m_spamNotes[i] ) skip = true;

		// if we should skip, and they have firstip already...
		if ( skip && ips ) continue;

		// get the normalized url
		char *url = links->getLinkPtr(i);
		// get the site. this will not block or have an error.
		siteGetter.getSite(url,gr,timestamp,cr->m_collnum,m_niceness);
		// these are now valid and should reference into
		// Links::m_buf[]
		const char *site    = siteGetter.getSite();
		int32_t  siteLen = siteGetter.getSiteLen();

		int32_t linkIp  = (*ipv)[i];

		// get site hash
		uint32_t sh = hash32 ( site , siteLen );
		// ensure site is unique
		if ( ht.getSlot ( &sh ) >= 0 ) continue;
		// add it. returns false and sets g_errno on error
		if ( ! ht.addKey ( &sh ) ) return NULL;

		// . need to add firstip tag for this link's subdomain?
		// . this was in Msge1.cpp but now we do it here
		if ( ! ips && linkIp && linkIp != -1 ) {
			// make it
			char ipbuf[16];
			if (!Tagdb::addTag3(tbuf,site,"firstip",now,"xmldoc",*ip,iptoa(linkIp,ipbuf),
					    rdbId))
				return NULL;
		}

		if ( skip ) continue;

		// how much avail for adding tags?
		int32_t avail = tbuf->getAvail();
		// reserve space
		int32_t need = 512;
		// make sure enough
		if ( need > avail && ! tbuf->reserve ( need ) ) return NULL;

		// add tag for this outlink
		// link is linked to by a high quality site! 500+ inlinks.
		if ( gr->getNumTagTypes("authorityinlink") < 5 &&
		     ! Tagdb::addTag(tbuf,site,"authorityinlink",now,"xmldoc",
				     *ip,"1",2,rdbId,true) )
			return NULL;
	}

	m_newTagBufValid = true;
	return &m_newTagBuf;
}


//
//
// BEGIN OLD SPAM.CPP class
//
//

#define WTMPBUFSIZE (MAX_WORDS *21*3)

// RULE #28, repetitive word/phrase spam detector
// Sets the "spam" member of each word from 0 (no spam) to 100 (100% spam).
//
// "bits" describe each word in phrasing terminology.
//
// If more than maxPercent of the words are spammed to some degree then we
// consider all of the words to be spammed, and give each word the minimum
// score possible when indexing the document.
//
// Returns NULL and sets g_errno on error
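//
// On success the vector has one byte per word: a rank from 0 (fully
// spammed) up to MAXWORDSPAMRANK (no spam detected). Returns 0x01 when
// there are no words to check.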
char *XmlDoc::getWordSpamVec() {

	logTrace( g_conf.m_logTraceWordSpam, "BEGIN" );

	if ( m_wordSpamBufValid ) {
		char *wbuf = m_wordSpamBuf.getBufStart();
		if ( ! wbuf ) {
			logTrace( g_conf.m_logTraceWordSpam, "END - no buffer" );
			return (char *)0x01;
		}
		logTrace( g_conf.m_logTraceWordSpam, "END - Valid" );
		return wbuf;
	}

	setStatus("getting word spam vec");

	// assume not the repeat spammer
	m_isRepeatSpammer = false;

	Words *words = getWords();
	if ( ! words || words == (Words *)-1 ) {
		logTrace( g_conf.m_logTraceWordSpam, "END - no Words obj" );
		return (char *)words;
	}

	m_wordSpamBuf.purge();

	int32_t nw = words->getNumWords();
	if ( nw <= 0 ) {
		m_wordSpamBufValid = true;
		logTrace( g_conf.m_logTraceWordSpam, "END - no words" );
		return (char *)0x01;
	}

	Phrases *phrases = getPhrases ();
	if ( ! phrases || phrases == (void *)-1 ) {
		logTrace( g_conf.m_logTraceWordSpam, "END - no Phrases" );
		return (char *)phrases;
	}
	Bits *bits = getBits();
	if ( ! bits ) {
		logTrace( g_conf.m_logTraceWordSpam, "END - no Bits" );
		return (char *)NULL;
	}

	m_wordSpamBufValid = true;

	//if ( m_isLinkText   ) return true;
	//if ( m_isCountTable ) return true;


	// if 20 words totally spammed, call it all spam?
	m_numRepeatSpam = 20;

	if ( ! m_siteNumInlinksValid ) {
		g_process.shutdownAbort(true);
	}

#if 0
	// @todo: examine if this should be used. Was always hard coded to 25
	// shortcut
	int32_t sni = m_siteNumInlinks;
	// set "m_maxPercent"
	int32_t maxPercent = 6;
	if ( sni > 10  ) maxPercent = 8;
        if ( sni > 30  ) maxPercent = 10;
        if ( sni > 100 ) maxPercent = 20;
        if ( sni > 500 ) maxPercent = 30;
#endif
	// fix this a bit so we're not always totally spammed
	int32_t maxPercent = 25;

	// get # of words we have to set spam for
	int32_t numWords = words->getNumWords();

	// set up the size of the hash table (number of buckets)
	int32_t  numBuckets = numWords * 3;

	StackBuf<WTMPBUFSIZE> tmpBuf;
								// next, bucketHash, bucketWordPos, profile, commonWords
	int32_t need = (numWords * (sizeof(int32_t) + sizeof(int64_t) + sizeof(int32_t) + sizeof(int32_t) + sizeof(char))) * 3 + numWords;
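	// i.e. need = numWords bytes for spam[] plus numBuckets (= 3*numWords)
	// entries of next (4 bytes), bucketHash (8), bucketWordPos (4),
	// profile (4) and commonWords (1)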

	logTrace( g_conf.m_logTraceWordSpam, "numWords: %" PRId32 ", numBuckets: %" PRId32 ", need: %" PRId32 "", numWords, numBuckets, need);

	if(!tmpBuf.reserve(need)) {
		log(LOG_WARN, "Failed to allocate %" PRId32" more bytes for spam detection:  %s.", need, mstrerror(g_errno));
		logTrace( g_conf.m_logTraceWordSpam, "END - oom" );
		return NULL;
	}
	char *tmp = tmpBuf.getBufStart();

	
	//#
	//# We use one single memory block to store all data.
	//# Set up the pointers to each sub-block here.
	//#
	char *p = tmp;
	
	// One 1-byte spam indicator per word
	unsigned char *spam = (unsigned char *)p;
	p += numWords * sizeof(unsigned char);	// one per word, not per bucket
	
	// One "next pointer" per word (allocated with numBuckets entries).
	// This allows us to make linked lists of word positions for each
	// distinct word: next[i] points to the PREVIOUS occurrence of the
	// same word, e.g. next[23] = 13 -> word #13 precedes word #23.
	int32_t *next = (int32_t *)p;
	p += numBuckets * sizeof(int32_t);

	// Hash table of word IDs 
	int64_t *bucketHash = (int64_t *)p;
	p += numBuckets * sizeof(int64_t);

	// Position in document of word in bucketHash
	int32_t *bucketWordPos = (int32_t *)p;
	p += numBuckets * sizeof(int32_t);

	// Profile of word in bucketHash
	int32_t *profile = (int32_t *)p;
	p += numBuckets * sizeof(int32_t);

	// Is word in bucketHash a stopword or number?
	char *commonWords = (char *)p;
	p += numBuckets * sizeof(char);

	// sanity check
	if ( p - tmp > need ) {
		g_process.shutdownAbort(true);
	}

	// clear all our spam percentages for these words
	memset(spam, 0, numWords);

	// clear the hash table
	int32_t i;
	for(i=0; i < numBuckets; i++) {
		bucketHash   [i] =  0;
		bucketWordPos[i] = -1;
		commonWords  [i] =  0;
	}
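
	// note: collisions between DIFFERENT words are resolved by linear
	// probing below, while repeats of the SAME word are chained through
	// next[], newest occurrence first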


	int64_t *wids = words->getWordIds();
	const char *const*wptrs = words->getWordPtrs();
	const int32_t  *wlens = words->getWordLens();


	//#
	//# Register all word occurrences in our hash table
	//#
	for(i=0; i < numWords; i++) {
		// Skip punctuation, spaces and other non-word entries
		if ( wids[i] == 0 ) {
			continue;
		}

		// Get the hash of the ith word
		int64_t h = words->getWordId(i);

		// "j" is the bucket index
		int32_t j = (uint64_t)h % numBuckets;

		// If the hash bucket is already used, see if it is by our
		// word, otherwise increase the index until a free bucket
		// is found.
		while( bucketHash[j] ) {
			if ( h == bucketHash[j] ) {
				break;
			}
			if (++j == numBuckets) {
				j = 0;
			}
		}
		// j now points to either a free bucket or a bucket already
		// occupied by a previous instance of our word.

		if (bucketHash[j]) {
			// Bucket already occupied by a previous instance of our word.
			// Add the previous word position into the "linked list" for the ith word.
			// So if the bucket was used by word 6 and this is word 10, we set
			// next[10] to 6. If word 6 collided with word 4, next[6] will point to 4.
			next[i] = bucketWordPos[j];

			// replace bucket with index to this word
			bucketWordPos[j] = i;
		}
		else {
			// Bucket is free. We have the first occurrence of this word
			bucketHash[j] = h;

			// Store our position (i) in bucket
			bucketWordPos[j] = i;

			// no next occurrence of the ith word yet
			next[i] = -1;
		}

		// if stop word or number then mark it
		if ( bits->isStopWord(i) ) {
			commonWords[j] = 1;
		}
		if ( words->isNum(i) ) {
			commonWords[j] = 1;
		}

		logTrace( g_conf.m_logTraceWordSpam, "Word[%" PRId32 "] [%.*s] (%" PRIu64 ") -> bucket %" PRId32 ", next[%" PRId32 "]=%" PRId32"", i, wlens[i], wptrs[i], wids[i], j, i, next[i]);
	}


	// count distinct candidates that had spam and did not have spam
	int32_t spamWords = 0;
	int32_t goodWords = 0;
	int32_t numpos;


	//#
	//# Loop through the hash table looking for filled buckets.
	//# Grab the linked list of indices and make a "profile"
	//#
	for ( i=0; i < numBuckets; i++ ) {
		// skip empty buckets
		if( bucketHash[i] == 0 ) {
			continue;
		}

		// word #j is in bucket #i
		int32_t j = bucketWordPos[i];

		// Loop through the linked list for this word
		numpos=0;
		while( j != -1 ) {
			// Store position of occurrence of this word in profile
			profile[numpos++] = j;
			// get the position of next occurrence of this word
			j = next[j];
		}

		// if 2 or fewer occurrences of this word, don't check for spam
		if ( numpos < 3 ) {
			goodWords++;
			continue;
		}


#if 0
		// @todo: BR 20161109: This code is defective. It checks for <a tags in Words, 
		// but there are NO tags in Words. It also checks for separator using is_alnum_a
		// which does not consider a space a separator. In the current condition it
		// will never catch anything.

		//
		// set m_isRepeatSpammer
		//
		// look for a word repeated in phrases, in a big list,
		// where each phrase is different
		//
		int32_t max = 0;
		int32_t count = 0;
		int32_t knp = numpos;

		// must be 3+ letters, not a stop word, not a number
		if ( words->getWordLen(profile[0]) <= 2 || commonWords[i] ) {
			knp = 0;
		}

		// scan to see if they are a tight list
		for ( int32_t k = 1 ; k < knp ; k++ ) {
			// are they close together? if not, bail
			if ( profile[k-1] - profile[k] >= 25 ) {
				count = 0;
				continue;
			}

			// otherwise inc it
			count++;

			// must have another word in between or tag
			int32_t a = profile[k];
			int32_t b = profile[k-1];
			bool gotSep = false;
			bool inLink = false;


			for(int32_t j=a+1; j < b; j++) {
				// if in link do not count, chinese spammer
				// does not have his crap in links
				// @@@ BR: There are never tags in Words.. will never catch anything
				if ( words->getWord(j)[0] == '<' && words->getWordLen(j) >= 3 ) {
					// get the next char after the <
					char nc;
					nc=to_lower_a(words->getWord(j)[1]);

					// now check it for anchor tag
					if ( nc == 'a' ) {
						inLink = true;
						break;
					}
				}
				if ( words->getWord(j)[0] == '<' ) {
					gotSep = true;
				}

				//@@@ BR: Returns false for space .. which is what it always checks
				if ( is_alnum_a(words->getWord(j)[0]) ) {
					gotSep = true;
				}
			}

			// . the chinese spammer always has a separator,
			//   usually another tag
			// . and fix "BOW BOW BOW..." which has no separators
			if( !gotSep ) {
				count--;
			}
			else 
			if( inLink ) {
				count--;
			}

			// get the max
			if ( count > max ) {
				max = count;
			}
		}

		// a count of 50 such monsters indicates the chinese spammer
		if ( max >= 50 ) {
			m_isRepeatSpammer = true;
		}
		//
		// end m_isRepeatSpammer detection
		//
#endif


		// . determine the probability this word was spammed by looking
		//   at the distribution of its positions in the document
		// . sets "spam" member of each word in this profile
		// . don't check if word occurred 2 or fewer times
		// . TODO: what about TORA! TORA! TORA!
		// . returns true if 1+ occurrences were considered spam
		bool isSpam = setSpam(profile, numpos, numWords, spam);

		// don't count stop words or numbers towards this threshold
		if ( commonWords[i] ) {
			continue;
		}

		// tally them up
		if ( isSpam ) {
			spamWords++;
		}
		else {
			goodWords++;
		}
	}

	// what percent of distinct candidate words were spammed?
	int32_t totalWords = spamWords + goodWords;

	// if no or very few candidate words, skip the spam penalty
	int32_t percent;
	if ( totalWords <= 10 ) {
		goto done;
	}
	percent    = ( spamWords * 100 ) / totalWords;
	// if more than maxPercent of the candidate words were spammed,
	// punish everybody now to ~100% spam
	// if we had < 100 candidates and < 20% spam, don't bother
	//if ( percent < 5 ) goto done;
	if ( percent <= maxPercent ) {
		goto done;
	}

	// now only set to 99 so each singleton usually gets hashed
	for ( i = 0 ; i < numWords ; i++ ) {
		if ( words->getWordId(i) && spam[i] < 99 ) {
			spam[i] = 99;
		}
	}
 done:

	// update the weights for the words
	//for ( i = 0 ; i < numWords ; i++ ) {
	//	m_ww[i] = ( m_ww[i] * (100 - spam[i]) ) / 100;
	//}

	// TODO: use the min word spam algo as in Phrases.cpp for this!
	//for ( i = 0 ; i < numWords ; i++ ) {
	//	m_pw[i] = ( m_pw[i] * (100 - spam[i]) ) / 100;
	//}

	// convert from percent spammed into a rank from 0 to MAXWORDSPAMRANK
	for ( i = 0 ; i < numWords ; i++ ) {
		spam[i] = (MAXWORDSPAMRANK * (100 - spam[i])) / 100;
	}

	// copy into our buffer
	if ( ! m_wordSpamBuf.safeMemcpy ( (char *)spam , numWords ) ) {
		logTrace( g_conf.m_logTraceWordSpam, "END - buffer copy failed" );
		return NULL;
	}

	logTrace( g_conf.m_logTraceWordSpam, "END - done" );
	return m_wordSpamBuf.getBufStart();
}


// . a "profile" is an array of all the positions of a word in the document
// . a "position" is just the word #, like first word, word #8, etc...
// . we map "each" subProfile to a probability of spam (from 0 to 100)
// . if the profile is really big we get really slow (O(n^2)) iterating through
//   many subProfiles
// . so everything beyond the first 50 occurrences is automatically considered spam
// . return true if one word was spammed w/ probability > 20%
bool XmlDoc::setSpam ( const int32_t *profile, int32_t plen , int32_t numWords ,
			unsigned char *spam ) {
	// don't bother detecting spam if 2 or fewer occurrences of the word
	if ( plen < 3 ) return false;
	// if we have more than 10 words and this word is 20% or more of
	// them then all but the first occurrence is spammed
	//log(LOG_INFO,"setSpam numRepeatSpam = %f", m_numRepeatSpam);
	if (numWords > 10 && (plen*100)/numWords >= m_numRepeatSpam) {
		for (int32_t i=1; i<plen; i++)
			spam[profile[i]] = 100;
		return true ;
	}

	// we have to do this otherwise it takes FOREVER to do for plens in
	// the thousands, like i saw a plen of 8338!
	if ( plen > 50 ) { // && m_version >= 93 ) {
		// . set all but the last 50 profile entries to a spam of 100%
		// . the last 50 entries actually occur as the first 50
		//   occurrences in the doc (the profile is newest-first)
		for (int32_t i=0; i<plen-50;i++)
			spam[profile[i]] = 100;
		// skip the first plen-50 entries because they actually occur
		// at the END of the document. do this BEFORE resetting plen
		// so the offset is not zero.
		profile += plen - 50;
		// we now have only 50 occurrences
		plen = 50;
	}

	// just use 40% "quality"
	int32_t off = 3;

	// . now the nitty-gritty part
	// . compute all sub sequences of the profile
	// . similar to a compression scheme (wavelets?)
	// . TODO: word positions should count by two's since punctuation is
	//         not included so start step @ 2 instead of 1
	// . if "step" is 1 we look at every       word position in the profile
	// . if "step" is 2 we look at every other word position
	// . if "step" is 3 we look at every 3rd   word position, etc...
	int32_t maxStep = plen / 4;
	if ( maxStep > 4 ) maxStep = 4;
	// . loop through all possible tuples
	for ( int32_t step = 1 ; step <= maxStep ; step++ ) {
		for ( int32_t window = 0 ; window + 3 < plen ; window+=1) {
			for (int32_t wlen = 3; window+wlen <= plen ; wlen+=1) {
				// continue if step isn't aligned with window
				// length
				if (wlen % step != 0) continue;
				// . get probability that this tuple is spam
				// . returns 0 to 100
				int32_t prob = getProbSpam ( profile + window ,
						             wlen , step);
				// printf("(%i,%i,%i)=%i\n",step,window,
				// wlen,prob);
				// . if the probability is too low continue
				// . was == 100
				if ( prob <= 20 ) continue;
				// set the spammed words spam to "prob"
				// only if it's bigger than their current spam
				for (int32_t i=window; i<window+wlen;i++) {
					// first occurrences can have immunity
					// due to doc quality being high
					if ( i >= plen - off ) break;
					if (spam[profile[i]] < prob)
						spam[profile[i]] = prob;
				}
			}

		}
	 }
	 // was this word spammed at all?
	 bool hadSpam = false;
	 for (int32_t i=0; i<plen; i++)
		 if ( spam[profile[i]] > 20 )
			 hadSpam = true;
	 // make sure at least one word survives
	 for (int32_t i=0; i<plen; i++)
		 if ( spam[profile[i]] == 0)
			return hadSpam;
	 // clear the spam level on this guy
	 spam[profile[0]] = 0;
	 // return true if we had spam, false if not
	 return hadSpam;
}


// . returns 0 to 100 , the probability of spam for this subprofile
// . a "profile" is an array of all the positions of a word in the document
// . a "position" is just the word #, like first word, word #8, etc...
// . we are passed a subprofile, "profile", of the actual profile
//   because some of the document may be more "spammy" than other parts
// . inlined to speed things up because this may be called multiple times
//   for each word in the document
// . if "step" is 1 we look at every       word position in the profile
// . if "step" is 2 we look at every other word position
// . if "step" is 3 we look at every 3rd   word position, etc...
int32_t XmlDoc::getProbSpam(const int32_t *profile, int32_t plen, int32_t step) {

	// you can spam 2 or 1 letter words all you want to
	if ( plen <= 2 ) return 0;

	// if our step is bigger than the profile return 0
	if ( step == plen ) return 0;

	int32_t dev=0;
	
	for (int32_t j = 0; j < step; j++) {

		// find avg. of gaps between consecutive tokens in subprofile
		// TODO: isn't profile[i] < profile[i+1]??
		int32_t istop = plen-1;
		int32_t avgSpacing = 0;
		for (int32_t i=0; i < istop; i += step )
			avgSpacing += ( profile[i] - profile[i+1] );
		// there's 1 less spacing than positions in the profile
		// so we divide by plen-1
		avgSpacing = (avgSpacing * 256) / istop;

		// compute standard deviation of the gaps in this sequence
		int32_t stdDevSpacing = 0;
		for (int32_t i = 0 ; i < istop; i += step ) {
			int32_t d = (( profile[i] - profile[i+1]) * 256 ) - avgSpacing;
			if ( d < 0 ) stdDevSpacing -= d;
			else         stdDevSpacing += d;
		}

		// TODO: should we divide by istop-1 for stdDev??
		stdDevSpacing /= istop;

		// average of the stddevs for all sequences
		dev += stdDevSpacing;
	}

	dev /= step;
	
	// if the plen is big we should expect dev to be big
	// here's some interpolation points:
	// plen >=  2  and  dev<= 0.2  --> 100%
	// plen  =  7  and  dev = 1.0  --> 100%
	// plen  = 14  and  dev = 2.0  --> 100%
	// plen  = 21  and  dev = 3.0  --> 100%
	// plen  = 7   and  dev = 2.0  -->  50%

	// NOTE: dev has been multiplied by 256 to avoid using floats
	//@todo BR: So why do you compare with a float? 
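	// with that x256 scaling the integer math below works out to roughly
	//   prob = (100/7) * plen / realDev
	// which matches the interpolation points above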
	if ( dev <= 51.2 ) return 100;  // (.2 * 256)
	int32_t prob = ( (256*100/7) * plen ) / dev;

	if (prob>100) prob=100;

	return prob;
}


bool getWordPosVec ( const Words *words ,
		     const Sections *sections,
		     int32_t startDist,
		     const char *fragVec,
		     SafeBuf *wpos ) {
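	// . computes one word-position value per word for posdb hashing
	// . each alnum word normally advances the position cursor by 1;
	//   simple punctuation advances it by 1, other punctuation by 2
	// . breaking tags accumulate "tagDist" which, along with SENT_UNITS,
	//   is added when we cross into a new sentence (tagDist capped at 120)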

	int32_t dist = startDist; // 0;
	const Section *lastsx = NULL;
	int32_t tagDist = 0;
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;
	const nodeid_t *tids = words->getTagIds();
	const int32_t *wlens = words->getWordLens();
	const char *const*wptrs = words->getWordPtrs();
	int32_t nw = words->getNumWords();

	if ( ! wpos->reserve ( nw * sizeof(int32_t) ) ) return false;
	int32_t *wposvec = (int32_t *)wpos->getBufStart();


	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// save it
		wposvec[i] = dist;

		// tags affect the distance/wordposition cursor
		if ( tids && tids[i] ) {
			// tag distance affects
			nodeid_t tid = tids[i] & BACKBITCOMP;
			if ( isBreakingTagId ( tid ) ) tagDist += SENT_UNITS;
			dist++;
			continue;
		}
		// . and so do sequences of punct
		// . must duplicate this code in Query.cpp for setting
		//   QueryWord::m_posNum
		if ( ! words->getWordId(i) ) {
			// simple space or sequence of just white space
			if ( words->isSpaces(i) )
				dist++;
			// 'cd-rom'
			else if ( wptrs[i][0]=='-' && wlens[i]==1 )
				dist++;
			// 'mr. x'
			else if ( wptrs[i][0]=='.' && words->isSpaces(i,1))
				dist++;
			// animal (dog)
			else
				dist += 2;
			continue;
		}
		// ignore if in repeated fragment
		if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) {
			dist++; continue; }

		const Section *sx = NULL;
		if ( sp ) {
			sx = sp[i];
			// ignore if in style tag, etc. and do not
			// increment the distance
			if ( sx->m_flags & NOINDEXFLAGS )
				continue;
		}

		// different sentence?
		if ( sx &&
		     ( ! lastsx ||
		     sx->m_sentenceSection != lastsx->m_sentenceSection ) ) {
			// separate different sentences with 30 units
			dist += SENT_UNITS; // 30;
			// limit this!
			if ( tagDist > 120 ) tagDist = 120;
			// and add in tag distances as well here, otherwise
			// we do not want "<br>" to really increase the
			// distance if the separated words are in the same
			// sentence!
			dist += tagDist;
			// new last then
			lastsx = sx;
			// store the vector AGAIN
			wposvec[i] = dist;
		}

		tagDist = 0;

		dist++;
	}
	return true;
}

bool getDensityRanks ( const int64_t *wids ,
		       int32_t nw ,
		       int32_t hashGroup ,
		       SafeBuf *densBuf ,
		       const Sections *sections ) {
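	// . computes one density rank byte per word
	// . rank = MAXDENSITYRANK - (# of alnum words in the sentence - 1),
	//   floored at 1 since 0 means "not set";
	//   e.g. a sentence with 4 alnum words gets MAXDENSITYRANK - 3
	// . sentences are only used for body/heading text; otherwise the
	//   whole string is treated as one unit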

	//int32_t nw = wordEnd - wordStart;

	// make the vector
	if ( ! densBuf->reserve ( nw ) ) return false;

	// convenience
	char *densVec = densBuf->getBufStart();

	// clear i guess
	memset ( densVec , 0 , nw );

	if ( hashGroup != HASHGROUP_BODY &&
	     hashGroup != HASHGROUP_HEADING )
		sections = NULL;

	// scan the sentences if we got those
	Section *ss = NULL;
	if ( sections ) ss = sections->m_firstSent;
	// sanity
	//if ( sections && wordStart != 0 ) { g_process.shutdownAbort(true); }
	for ( ; ss ; ss = ss->m_nextSent ) {
		// count of the alnum words in sentence
		int32_t count = ss->m_alnumPosB - ss->m_alnumPosA;
		// a sentence with only one alnum word should count as 0
		count--;
		// skip if somehow there were no alnum words at all
		if ( count < 0 ) continue;
		// . base density rank on that
		// . count is 0 for one alnum word now
		int32_t dr = MAXDENSITYRANK - count;
		// ensure not negative. make it at least 1. zero means un-set.
		if ( dr < 1 ) dr = 1;
		// mark all in sentence then
		for ( int32_t i = ss->m_senta ; i < ss->m_sentb ; i++ ) {
			// assign
			densVec[i] = dr;
		}
	}
	// all done if using sections
	if ( sections ) return true;


	// count # of alphanumeric words in this string
	int32_t na = 0;
	for ( int32_t i = 0 ; i < nw ; i++ ) if ( wids[i] ) na++;
	// a single alnum should map to 0 "na"
	na--;
	// wtf?
	if ( na < 0 ) return true;
	// compute density rank
	int32_t dr  = MAXDENSITYRANK - na ;
	// at least 1 to not be confused with 0 which means un-set
	if ( dr < 1 ) dr = 1;
	// assign
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// assign
		densVec[i] = dr;
	}
	return true;
}

// . called by hashString() for hashing purposes, i.e. creating posdb keys
// . string is usually the document body or inlink text of an inlinker or
//   perhaps meta keywords. it could be anything. so we need to create this
//   vector based on that string, which is represented by words/phrases here.
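// . output is one byte per word: a diversity rank from 0 to
//   MAXDIVERSITYRANK derived from the word-count/phrase-count ratio of
//   each word (see getWordToPhraseRatioWeights() below)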
bool getDiversityVec( const Words *words, const Phrases *phrases, HashTableX *countTable, SafeBuf *sbWordVec ) {
	const int64_t  *wids  = words->getWordIds ();
	int32_t        nw    = words->getNumWords();
	const int64_t  *pids  = phrases->getPhraseIds2();

	// . make the vector
	// . it will be diversity ranks, so one float per word for now
	//   cuz we convert to rank below though, one byte rank
	if ( ! sbWordVec  ->reserve ( nw*sizeof(float) ) ) return false;

	// get it
	float *ww = (float *)sbWordVec  ->getBufStart();

	int32_t      nexti        = -10;
	int64_t pidLast      = 0;

	// . now consider ourselves the last word in a phrase
	// . adjust the score of the first word in the phrase to be
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if not alnum word
		if ( ! wids[i] ) { ww[i] = 0.0; continue; }
		// try to inline this
		int64_t nextWid = 0;
		int64_t lastPid = 0;
		// how many words in the bigram?
		int32_t      nwp = phrases->getNumWordsInPhrase2(i);
		if ( nwp > 0 ) nextWid = wids [i + nwp - 1] ;
		if ( i == nexti ) lastPid = pidLast;
		// get current pid
		int64_t pid = pids[i];
		// get the word and phrase weights for term #i
		float ww2;

		getWordToPhraseRatioWeights ( lastPid  ,
					      wids[i]  ,
					      pid      ,
					      nextWid  ,
					      &ww2     ,
					      countTable);
		// 0 to 1.0
		if ( ww2 < 0 || ww2 > 1.0 ) { g_process.shutdownAbort(true); }
		// save the last phrase id
		if ( nwp > 0 ) {
			nexti        = i + nwp - 1;
			pidLast      = pid;
		}
		// . apply the weights
		// . do not hit all the way down to zero though...
		// . Words.cpp::hash() will not index it then...
		ww[i] = ww2;
	}

	// overwrite the array of floats with an array of chars (ranks)
	char *nww = (char *)ww;

	// convert from float into a rank from 0-15
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		if ( almostEqualFloat(ww[i], 0) ) { 
			nww[i] = 0; 
			continue; 
		}
		// getWordToPhraseRatioWeights() returns at most 1.0; the /.55
		// boosts mid-range weights and we clamp to MAXDIVERSITYRANK below
		char wrank = (char) ((ww[i] * ((float)MAXDIVERSITYRANK))/.55);
		// sanity
		if ( wrank > MAXDIVERSITYRANK ) {
			wrank = MAXDIVERSITYRANK;
		}
		if ( wrank < 0 ) { g_process.shutdownAbort(true); }
		// assign now
		nww[i] = wrank;
	}

	return true;
}

// match word sequences of NUMWORDS or more words
#define NUMWORDS 5

// . repeated sentence frags
// . 1-1 with words in body of doc
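// . one byte per word: 0 = word is part of a repeated fragment and should
//   be ignored, 1 = keep
// . rolls an XOR hash over each NUMWORDS-word window and checks a hash
//   table for earlier windows with the same hash, then counts how many
//   consecutive words actually match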
char *XmlDoc::getFragVec ( ) {

	if ( m_fragBufValid ) {
		char *fb = m_fragBuf.getBufStart();
		if ( ! fb ) return (char *)0x01;
		return fb;
	}

	setStatus("getting frag vec");

	const Words *words = getWords();
	if ( ! words || words == (Words *)-1 ) return (char *)words;
	Bits *bits = getBits();
	if ( ! bits ) return NULL;

	m_fragBuf.purge();

	// ez vars
	const int64_t *wids  = words->getWordIds ();
	int32_t        nw    = words->getNumWords();

	// if no words, nothing to do
	if ( nw == 0 ) {
		m_fragBufValid = true;
		return (char *)0x01;//true;
	}

	// truncate for performance reasons. i've seen this be over 4M
	// and it was VERY VERY SLOW... over 10 minutes...
	// - i saw this take over 200MB for an alloc for
	//   WeightsSet3 below, so lower from 200k to 50k. this will probably
	//   make parsing inconsistencies for really large docs...
	if ( nw > MAXFRAGWORDS ) nw = MAXFRAGWORDS;

	int64_t   ringWids [ NUMWORDS ];
	int32_t        ringPos  [ NUMWORDS ];
	int32_t        ringi = 0;
	int32_t        count = 0;
	uint64_t   h     = 0;

	// . make the hash table
	// . make it big enough so there are gaps, so chains are not too long
	int32_t       minBuckets = (int32_t)(nw * 1.5);
	uint32_t  nb     = 2 * getHighestLitBitValue ( minBuckets ) ;
	int32_t       need       = nb * (8+4+4);
	StackBuf<50000> weightsBuf;
	if(!weightsBuf.reserve(need))
		return NULL;
	char *buf = weightsBuf.getBufStart();
	char      *ptr        = buf;
	uint64_t *hashes = (uint64_t *)ptr; ptr += nb * 8;
	int32_t      *vals       = (int32_t      *)ptr; ptr += nb * 4;
	float     *ww         = (float     *)ptr; ptr += nb * 4;

	for ( int32_t i = 0 ; i < nw ; i++ ) ww[i] = 1.0;

	if ( ptr != buf + need ) { g_process.shutdownAbort(true); }

	// make the mask
	uint32_t mask = nb - 1;

	// clear the hash table
	memset ( hashes , 0 , nb * 8 );

	// clear ring of hashes
	memset ( ringWids , 0 , NUMWORDS * 8 );

	// for sanity check
	int32_t lastStart = -1;

	// . hash EVERY NUMWORDS-word sequence in the document
	// . if we get a match look and see what sequences it matches
	// . we allow multiple instances of the same hash to be stored in
	//   the hash table, so keep checking for a matching hash until you
	//   chain to a 0 hash, indicating the chain ends
	// . check each matching hash to see if more than NUMWORDS words match
	// . get the max words that matched from all of the candidates
	// . demote the word and phrase weights based on the total/max
	//   number of words matching
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if not alnum word
		if ( ! wids[i] ) continue;

		// add new to the 5 word hash
		h ^= wids[i];
		// . remove old from 5 word hash before adding new...
		// . initial ring wids are 0, so should be benign at startup
		h ^= ringWids[ringi];
		// add to ring
		ringWids[ringi] = wids[i];
		// save our position
		ringPos[ringi] = i;
		// wrap the ring ptr if we need to, that is why we are a ring
		if ( ++ringi >= NUMWORDS ) ringi = 0;
		// this 5-word sequence starts with word # "start"
		int32_t start = ringPos[ringi];
		// need at least NUMWORDS words in ring buffer to do analysis
		if ( ++count < NUMWORDS ) continue;
		// . skip if it starts with a word which can not start phrases
		// . that way "a new car" being repeated a lot will not
		//   decrease the weight of the phrase term "new car"
		// . setCountTable() calls set3() with this set to NULL
		//if ( bits && ! bits->canStartPhrase(start) ) continue;
		// sanity check
		if ( start <= lastStart ) { g_process.shutdownAbort(true); }
		// reset max matched
		int32_t max = 0;
		// look up in the hash table
		uint32_t n = h & mask;
		// sanity breach check
		if ( n >= nb ) { g_process.shutdownAbort(true); }
	loop:
		// all done if empty
		if ( ! hashes[n] ) {
			// sanity check
			//if ( n >= nb ) { g_process.shutdownAbort(true); }
			// add ourselves to the hash table now
			hashes[n] = h;
			// sanity check
			//if ( wids[start] == 0 ) { g_process.shutdownAbort(true); }
			// this is where the 5-word sequence starts
			vals  [n] = start;
			// save it
			lastStart = start;
			// debug point
			//if ( start == 7948 )
			//	log("heystart");
			// do not demote words if less than NUMWORDS matched
			if ( max < NUMWORDS ) continue;
			// . how much we should we demote
			// . 10 matching words pretty much means 0 weights
			float demote = 1.0 - ((max-5)*.10);
			if ( demote >= 1.0 ) continue;
			if ( demote <  0.0 ) demote = 0.0;

			// . RULE #26 ("long" phrases)
			// . if we got 3, 4 or 5 in our matching sequence
			// . basically divide by the # of *phrase* terms
			// . multiply by 1/(N-1)
			// . HOWEVER, should we also look at HOW MANY other
			//   sequences matches this too!???
			//float demote = 1.0 / ((float)max-1.0);
			// set3() is still called from setCountTable() to
			// discount the effects of repeated fragments, and
			// the count table only understands score or no score
			//if ( max >= 15 ) demote = 0.0;

			// demote the next "max" words
			int32_t mc = 0;
			int32_t j;
			for ( j = start ; mc < max ; j++ ) {
				// sanity
				if ( j >= nw ) { g_process.shutdownAbort(true); }
				if ( j <  0 ) { g_process.shutdownAbort(true); }
				// skip if not an alnum word
				if ( ! wids[j] ) continue;
				// count it
				mc++;
				// demote it
				ww[j] = (int32_t)(ww[j] * demote);
				if ( ww[j] <= 0 ) ww[j] = 2;
			}
			// save the original i
			int32_t mini = i;
			// advance i, it will be incremented by 1 immediately
			// after hitting the "continue" statement
			i = j - 1;
			// must be at least the original i, we are monotonic
			// otherwise ringPos[] will not be monotonic and core
			// dump ultimately cuz j and k will be equal below
			// and we increment matched++ forever.
			if ( i < mini ) i = mini;
			// get next word
			continue;
		}
		// get next in chain if hash does not match
		if ( hashes[n] != h ) {
			// wrap around the hash table if we hit the end
			if ( ++n >= nb ) n = 0;
			// check out bucket #n now
			goto loop;
		}
		// how many words match so far
		int32_t matched = 0;
		// . we have to check starting at the beginning of each word
		//   sequence since the XOR compositional hash is order
		//   independent
		// . see what word offset this guy has
		int32_t j = vals[n] ;
		// k becomes the start of the current 5-word sequence
		int32_t k = start;
		// sanity check
		if ( j == k ) { g_process.shutdownAbort(true); }
		// skip to next in chain to check later
		if ( ++n >= nb ) n = 0;
		// keep advancing k and j as long as the words match
	matchLoop:
		// get next wid for k and j
		while ( k < nw && ! wids[k] ) k++;
		while ( j < nw && ! wids[j] ) j++;
		if ( k < nw && wids[k] == wids[j] ) {
			matched++;
			k++;
			j++;
			goto matchLoop;
		}
		// keep track of the max matched for this starting position
		if ( matched > max ) max = matched;
		// get another matching string of words, if possible
		goto loop;
	}

	if ( nw <= 0 ) { g_process.shutdownAbort(true);}

	// make space
	if ( ! m_fragBuf.reserve ( nw ) ) {
		return NULL;
	}
	// validate
	m_fragBufValid = true;
	// handy ptr
	char *ff = m_fragBuf.getBufStart();

	// wtf?
	if ( ! ff ) { g_process.shutdownAbort(true); }

	// convert from floats into frag score, 0 or 1 really
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		if ( ww[i] <= 0.0 ) ff[i] = 0;
		else                ff[i] = 1;
	}

	return ff;
}


// . inline this for speed
// . if a word repeats in different phrases, promote the word
//   and  demote the phrase
// . if a word repeats in pretty much the same phrase, promote
//   the  phrase and demote the word
// . if you have the window of text "new mexico good times"
//   and word #i is mexico, then:
//   pid1 is "new mexico"
//   wid1 is "mexico"
//   pid2 is "mexico good"
//   wid2 is "good"
// . we store sliderParm in titleRec so we can update it along
//   with title and header weights on the fly from the spider controls
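// . the result is written to *retww: a weight in (0.0, 1.0] where values
//   near 0 demote a word that almost always occurs inside the same phrase
// . the s_wtab[wordCount][phraseCount] table below is recomputed whenever
//   the sliderParm config value changes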
static void getWordToPhraseRatioWeights ( int64_t   pid1 , // pre phrase
					  int64_t   wid1 ,
					  int64_t   pid2 ,
					  int64_t   wid2 , // post word
					  float      *retww   ,
					  const HashTableX *tt1) {

	static float s_wtab[30][30];
	static float s_fsp;
	// from 0 to 100
	char sliderParm = g_conf.m_sliderParm;

	// . to support RULE #15 (word to phrase ratio)
	// . these weights are based on the ratio of word to phrase count
	//   for a particular word
	static char s_sp = -1;
	if ( s_sp != sliderParm ) {
		// . set it to the newly updated value
		// . should range from 0 up to 100
		s_sp = sliderParm;
		// the float version
		s_fsp = (float)sliderParm / 100.0;
		// sanity test
		if ( s_fsp < 0.0 || s_fsp > 1.0 ) { g_process.shutdownAbort(true); }
		// i is the word count, how many times a particular word
		// occurs in the document
		for ( int32_t i = 0 ; i < 30 ; i++ ) {
			// . k is the phrase count, how many times a particular phrase
			//   occurs in the document
			// . k can be GREATER than i because we index only phrase terms
			//   sometimes when indexing neighborhoods, and not the
			//   single words that compose them
			for ( int32_t k = 0 ; k < 30 ; k++ ) {
				// do not allow phrase count to be greater than
				// word count, even though it can happen since we
				// add imported neighborhood pwids to the count table
				int32_t j = k;
				if ( k > i ) j = i;
				// get ratio
				//float ratio = (float)phrcount / (float)wrdcount;
				float ratio = i ? (float)j/(float)i : 0;
				// it should be impossible that this can be over 1.0
				// but might happen due to hash collisions
				if ( ratio > 1.0 ) ratio = 1.0;
				// restrict the range we can weight a word or phrase
				// based on the word count
				//float r = 1.0;
				//if      ( i >= 20 ) r = 2.1;
				//else if ( i >= 10 ) r = 1.8;
				//else if ( i >=  4 ) r = 1.5;
				//else                r = 1.3;
				//g_ptab[i][k] = 1.00;
				s_wtab[i][k] = 1.00;
				if ( i <= 1 ) continue;
				// . we used to have a sliding bar between 0.0 and 1.0.
				//   word is weighted (1.0 - x) and phrase is weighted
				//   by (x). however, x could go all the way to 1.0
				//   even when i = 2, so we need to restrict x.
				// . x is actually "ratio"
				// . when we have 8 or fewer word occurrences, do not
				//   remove more than 80% of its score, a 1/5 penalty
				//   is good enough for now. but for words that occur
				//   a lot in the link text or pwids, go to town...
				if      ( i <=  2 && ratio >= .50 ) ratio = .50;
				else if ( i <=  4 && ratio >= .60 ) ratio = .60;
				else if ( i <=  8 && ratio >= .80 ) ratio = .80;
				else if ( i <= 12 && ratio >= .95 ) ratio = .95;
				// round up: even if the many "new mexico"
				// phrases only get the ratio up to 95%, treat
				// it as 100%...
				if ( ratio >= .95 ) ratio = 1.00;
				// if word's phrase is repeated 3 times or more then
				// is a pretty good indication that we should weight
				// the phrase more and the word itself less
				//if ( k >= 3 && ratio < .90 ) ratio = .90;
				// compute the weights
				//float pw = 2.0 * ratio;
				//float ww = 2.0 * (1.0 - ratio);
				float ww = (1.0 - ratio);

				// . punish words a little more
				// . if we got 50% ratio, words should not get as much
				//   weight as the phrase
				//ww *= .45;
				// do not weight all the way down to 0; floor at 0.0001
				if ( ww < 0.0001 ) ww = 0.0001;
				//if ( pw < 0.0001 ) pw = 0.0001;
				// do not overpromote either
				//if ( ww > 2.50 ) ww = 2.50;
				//if ( pw > 2.50 ) pw = 2.50;
				// . do a sliding weight of the weight
				// . a "ww" of 1.0 means to do no weight
				// . can't do this for ww cuz we use "mod" below
				//float newWW = s_fsp*ww + (1.0-s_fsp)*1.00;
				//float newPW = s_fsp*pw + (1.0-s_fsp)*1.00;
				// limit how much we promote a word because it
				// may occur 30 times total, but have a phrase count
				// of only 1. however, the other 29 times it occurs it
				// is in the same phrase, just not this particular
				// phrase.
				//if ( ww > 2.0 ) ww = 2.0;
				s_wtab[i][k] = ww;
				//g_ptab[i][k] = newPW;
				//logf(LOG_DEBUG,"build: wc=%" PRId32" pc=%" PRId32" ww=%.2f "
				//"pw=%.2f",i,k,s_wtab[i][k],g_ptab[i][k]);
			}
		}
	}

	int32_t phrcount1 = 0;
	int32_t phrcount2 = 0;
	int32_t wrdcount1 = 0;
	int32_t wrdcount2 = 0;
	if ( !tt1->isTableEmpty() ) {
		if (pid1) phrcount1 = tt1->getScore(pid1);
		if (pid2) phrcount2 = tt1->getScore(pid2);
		if (wid1) wrdcount1 = tt1->getScore(wid1);
		if (wid2) wrdcount2 = tt1->getScore(wid2);
	}
	// if we are always ending the same phrase, like "Mexico"
	// in "New Mexico"... get the most popular phrase this word is
	// in...
	int32_t phrcountMax = phrcount1;
	int32_t wrdcountMin = wrdcount1;
	// these must actually exist to be part of the selection
	if ( pid2 && phrcount2 > phrcountMax ) phrcountMax = phrcount2;
	if ( wid2 && wrdcount2 < wrdcountMin ) wrdcountMin = wrdcount2;


	// . but if we are 'beds' and in a popular phrase like 'dog beds'
	//   there may be a lot of other phrases mentioned that have 'beds'
	//   in them like 'pillow beds', 'pet beds', but we need to assume
	//   that if phrcountMax is high enough, do not give much weight to
	//   the word... otherwise you can subvert this algorithm by just
	//   adding other random phrases with the word 'bed' in them.
	// . BUT, if a page has 'X beds' with a lot of different X's then you
	//   still want to index 'beds' with a high score!!! we are trying to
	//   balance those 2 things.
	// . do this up here before you truncate phrcountMax below!!
	float mod = 1.0;
	if      ( phrcountMax <=  6 ) mod = 0.50;
	else if ( phrcountMax <=  8 ) mod = 0.20;
	else if ( phrcountMax <= 10 ) mod = 0.05;
	else if ( phrcountMax <= 15 ) mod = 0.03;
	else                          mod = 0.01;

	// scale wrdcount1/phrcountMax down for the s_wtab table
	if ( wrdcount1 > 29 ) {
		float ratio = (float)phrcountMax / (float)wrdcount1;
		phrcountMax = (int32_t)((29.0 * ratio) + 0.5);
		wrdcount1   = 29;
	}
	if ( phrcountMax > 29 ) {
		float ratio = (float)wrdcount1 / (float)phrcountMax;
		wrdcount1   = (int32_t)((29.0 * ratio) + 0.5);
		phrcountMax = 29;
	}

	// . sanity check
	// . neighborhood.cpp does not always have wid/pid pairs
	//   that match up right for some reason... so we can't do this
	//if ( phrcount1 > wrdcount1 ) { g_process.shutdownAbort(true); }
	//if ( phrcount2 > wrdcount2 ) { g_process.shutdownAbort(true); }

	// apply the weights from the table we computed above
	*retww = mod   *   s_wtab[wrdcount1][phrcountMax];

	// slide it
	*retww = s_fsp*(*retww) + (1.0-s_fsp)*1.00;

	// ensure we do not punish too hard
	if ( *retww <= 0.0 ) *retww = 0.01;

	if ( *retww > 1.0 ) { g_process.shutdownAbort(true); }

	// . if the word is Mexico in 'New Mexico good times' then
	//   phrase term #i which is, say, "Mexico good" needs to
	//   get the min word count when doing its word to phrase
	//   ratio.
	// . it has two choices, it can use the word count of
	//   "Mexico" or it can use the word count of "good".
	// . say, each is pretty high in the document so the phrase
	//   ends up getting penalized heavily, which is good because
	//   it is a nonsense phrase.
	// . if we had "united socialist soviet republic" repeated
	//   a lot, the phrase "socialist soviet" would score high
	//   and the individual words would score low. that is good.
	// . try to seek the highest weight possible for this phrase
	//   by choosing the lowest word count possible
	// . NO LONGER AFFECT phrase weights because just because the
	//   words occur a lot in the document and this may be the only
	//   occurrence of this phrase, does not mean we should punish
	//   the phrase.  -- MDW
	//*retpw = 1.0;
	return;
}

bool XmlDoc::getIsInjecting ( ) {
	bool isInjecting = false;
	if ( m_sreqValid && m_sreq.m_isInjecting ) isInjecting = true;
	if ( m_isInjecting && m_isInjectingValid ) isInjecting = true;
	return isInjecting;
}

Json *XmlDoc::getParsedJson ( ) {

	if ( m_jpValid ) return &m_jp;

	// core if not a json object
	if ( m_contentTypeValid && m_contentType != CT_JSON &&
	     // spider status docs are now really json
	     m_contentType != CT_STATUS ) {
		g_process.shutdownAbort(true); }

	// \0 terminated
	char **pp = getUtf8Content();
	if ( ! pp || pp == (void *)-1 ) return (Json *)pp;

	// point to the json
	char *p = *pp;

	// empty? all done then.
	//if ( ! p ) return (char *)pp;

	// . returns NULL and sets g_errno on error
	// . if p is NULL i guess this should still be ok and be empty
	if ( ! m_jp.parseJsonStringIntoJsonItems ( p ) ) {
		g_errno = EBADJSONPARSER;
		return NULL;
	}

	m_jpValid = true;
	return &m_jp;
}


void XmlDoc::callCallback() {
	if(m_callback1 )
		m_callback1(m_state);
	else
		m_callback2(m_state);
}