Remove unused variables/functions from XmlDoc

Ai Lin Chia
2016-03-02 22:22:22 +01:00
parent da3e30490b
commit c46a4c5841
5 changed files with 17 additions and 1021 deletions

@@ -312,20 +312,6 @@ bool processLoop ( void *state ) {
return status;
}
/*
// this was calling XmlDoc and setting sections, etc. to
// get the SpiderReply junk... no no no
// is it banned or filtered? this ignores the TagRec in the titleRec
// and uses msg8a to get it fresh instead
char *vi = xd->getIsFiltered();//Visible( );
// wait if blocked
if ( vi == (void *)-1 ) return false;
// error?
if ( ! vi ) return sendErrorReply ( st , g_errno );
// banned?
if ( ! st->m_isMasterAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
*/
// get the utf8 content
char **utf8 = xd->getUtf8Content();
//int32_t len = xd->size_utf8Content - 1;

@@ -567,12 +567,6 @@ class SpiderRequest {
//int32_t m_reservedc1;
int32_t m_reservedc2;
//int32_t m_parentPubDate;
// . pub date taken from url directly, not content
// . ie. http://mysite.com/blog/nov-06-2009/food.html
// . ie. http://mysite.com/blog/11062009/food.html
//int32_t m_urlPubDate;
// . replace this with something we need for smart compression
// . this is zero if none or invalid
int32_t m_contentHash32;

@@ -100,7 +100,6 @@ XmlDoc::XmlDoc() {
m_wasInIndex = false;
m_outlinkHopCountVector = NULL;
m_extraDoc = NULL;
m_wikiqbuf = NULL;
reset();
}
@@ -126,16 +125,11 @@ void XmlDoc::reset ( ) {
m_printedMenu = false;
m_tmpBuf2.purge();
m_bodyStartPos = 0;
m_skipIframeExpansion = false;
m_indexedTime = 0;
m_metaList2.purge();
m_zbuf.purge();
m_kbuf.purge();
m_mySiteLinkInfoBuf.purge();
m_myPageLinkInfoBuf.purge();
@@ -155,10 +149,6 @@ void XmlDoc::reset ( ) {
m_fakeIpBuf.purge();
m_fakeTagRecPtrBuf.purge();
m_tlbufTimer = 0LL;
m_gsbuf.reset();
m_doConsistencyTesting = g_conf.m_doConsistencyTesting;
m_computedMetaListCheckSum = false;
@@ -191,9 +181,6 @@ void XmlDoc::reset ( ) {
"be saved in addsinprogress.dat.");
}
m_ei = 0;
m_lastLaunch = -1;
m_pbuf = NULL;
m_wts = NULL;
@@ -277,9 +264,6 @@ void XmlDoc::reset ( ) {
}
m_outlinkHopCountVector = NULL;
m_gsbuf.reset();
// reset all *valid* flags to false
void *p = &m_VALIDSTART;
void *pend = &m_VALIDEND;
@@ -329,7 +313,6 @@ void XmlDoc::reset ( ) {
m_setFromDocId = false;
m_setFromSpiderRec = false;
m_freeLinkInfo1 = false;
m_freeLinkInfo2 = false;
m_checkedUrlFilters = false;
@@ -351,50 +334,35 @@ void XmlDoc::reset ( ) {
// keep track of updates to the rdbs we have done, so we do not re-do
m_listAdded = false;
m_listFlushed = false;
m_updatedCounts = false;
m_updatedCounts2 = false;
m_copied1 = false;
m_updatingSiteLinkInfoTags = false;
m_hashedTitle = false;
m_registeredSleepCallback = false;
m_addedNegativeDoledbRec = false;
m_numRedirects = 0;
m_numOutlinksAdded = 0;
m_spamCheckDisabled = false;
m_useRobotsTxt = true;
m_redirectFlag = false;
m_allowSimplifiedRedirs = false;
m_didDelay = false;
m_didDelayUnregister = false;
m_calledMsg22d = 0LL;
m_calledMsg22e = false;
m_calledMsg22f = false;
m_calledMsg25 = false;
m_calledMsg25b = false;
m_calledMsg40 = false;
m_calledSections = false;
m_calledThread = false;
m_alreadyRegistered = false;
m_loaded = false;
m_firstEntry = true;
m_firstEntry2 = true;
m_launchedSpecialMsg8a = false;
m_launchedMsg8a2 = false;
m_setTr = false;
m_calledMsg8b = false;
m_recycleContent = false;
m_callback1 = NULL;
m_callback2 = NULL;
m_state = NULL;
m_processedLang = false;
m_doingConsistencyCheck = false;
@@ -431,19 +399,6 @@ void XmlDoc::reset ( ) {
void *px = &ptr_firstUrl;
void *pxend = &m_dummyEnd;
memset ( px , 0 , (char *)pxend - (char *)px );
ptr_unused6 = NULL;
size_unused6 = 0;
ptr_unused7 = NULL;
size_unused7 = 0;
ptr_unused1 = NULL;
size_unused1 = 0;
ptr_unused2 = NULL;
size_unused2 = 0;
ptr_unused3 = NULL;
size_unused3 = 0;
ptr_unused5 = NULL;
size_unused5 = 0;
}
int64_t XmlDoc::logQueryTimingStart() {
@@ -638,8 +593,6 @@ bool XmlDoc::setCollNum ( const char *coll ) {
// we can store this safely:
m_collnum = cr->m_collnum;
m_collnumValid = true;
// if user "resets" the collection we need to know
m_lastCollRecResetCount = cr->m_lastResetCount;
return true;
}
@@ -1286,10 +1239,6 @@ bool XmlDoc::set2 ( char *titleRec ,
return false;
}
// debug thing
ptr_sectiondbData = NULL;
size_sectiondbData = 0;
// success, return true then
return true;
}
@@ -2255,37 +2204,6 @@ bool XmlDoc::indexDoc2 ( ) {
m_registeredSleepCallback = false;
}
//////////
// . add the doledb negative key quickly to our tree to avoid a
// respider because the msg4 doledb negative key is buffered by msg4
// . make it negative
// . well it should not be respidered because the lock is on it!!
// -- so let's comment this out
/////////
/*
key_t negative = m_doledbKey;
// make it negative
negative.n0 &= 0xfffffffffffffffeLL;
// . store it in our tree if we can
// . returns false and sets g_errno on error
// . i.e. g_errno == ETRYAGAIN
if ( ! m_addedNegativeDoledbRec &&
! g_doledb.m_rdb.addRecord(m_coll,(char *)&negative,
NULL,0,m_niceness)){
log("build: error trying to add to doledb: %s",
mstrerror(g_errno));
// set sleep wrapper
g_loop.registerSleepCallback(1000,m_masterState,
indexDocWrapper2,m_niceness);
// note it
m_registeredSleepCallback = true;
// sleep and retry
return false;
}
*/
// we did that
m_addedNegativeDoledbRec = true;
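	// A minimal hypothetical sketch of the convention the commented-out
	// block above relies on: a doledb key is "negative" (a delete marker)
	// when the low bit of its first 64-bit word is clear, so masking with
	// 0xfffffffffffffffeLL turns a positive key into its delete twin:
	//
	//   uint64_t pos = n0 | 0x01ULL;                 // positive key word
	//   uint64_t neg = pos & 0xfffffffffffffffeLL;   // negative (delete) key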
// now add it
if ( ! m_listAdded && m_metaListSize ) {
// only call this once
@@ -5610,18 +5528,6 @@ int32_t XmlDoc::computeVector( Words *words, uint32_t *vec, int32_t start, int32
return nd * 4;
}
float *XmlDoc::getTagSimilarity ( XmlDoc *xd2 ) {
int32_t *tv1 = getTagPairHashVector();
if ( ! tv1 || tv1 == (int32_t *)-1 ) return (float *)tv1;
int32_t *tv2 = xd2->getTagPairHashVector();
if ( ! tv2 || tv2 == (int32_t *)-1 ) return (float *)tv2;
m_tagSimilarity = computeSimilarity ( tv1, tv2, NULL, NULL, NULL ,
m_niceness );
// this means error, g_errno should be set
if ( m_tagSimilarity == -1.0 ) return NULL;
return &m_tagSimilarity;
}
float *XmlDoc::getPageSimilarity ( XmlDoc *xd2 ) {
int32_t *sv1 = getPageSampleVector();
if ( ! sv1 || sv1 == (int32_t *)-1 ) return (float *)sv1;
@@ -5869,17 +5775,6 @@ bool isSimilar_sorted ( int32_t *vec0 ,
goto mergeLoop;
}
uint64_t *XmlDoc::getFuzzyDupHash ( ) {
if ( m_dupHashValid ) return &m_dupHash;
uint32_t *h1 = getTagPairHash32();
if ( ! h1 || h1 == (uint32_t *)-1 ) return (uint64_t *)h1;
m_dupHash = *h1;
m_dupHashValid = true;
return &m_dupHash;
}
int64_t *XmlDoc::getExactContentHash64 ( ) {
if ( m_exactContentHash64Valid )
@@ -6599,7 +6494,6 @@ Url **XmlDoc::getRedirUrl() {
if ( cu->getDomainLen() != dlen ) sameDom=false;
else if ( strncmp(cu->getDomain(),loc->getDomain(),dlen))sameDom=false;
if ( ! sameDom ) {
m_redirectFlag = true;
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
ptr_redirUrl = m_redirUrl.m_url;
@@ -6701,7 +6595,6 @@ Url **XmlDoc::getRedirUrl() {
return &m_redirUrlPtr;
}
// good to go
m_redirectFlag = true;
m_redirUrl.set ( loc , false ); // addWWW=false
m_redirUrlPtr = &m_redirUrl;
ptr_redirUrl = m_redirUrl.m_url;
@@ -7118,9 +7011,6 @@ XmlDoc **XmlDoc::getExtraDoc ( char *u , int32_t maxCacheAge ) {
// carry this forward always!
m_extraDoc->m_isSpiderProxy = m_isSpiderProxy;
// disable spam check because that is not necessary for this doc!
m_extraDoc->m_spamCheckDisabled = true;
// tell msg13 to get this from its robots.txt cache if it can. it also
// keeps a separate html page cache for the root pages, etc. in case
m_extraDoc->m_maxCacheAge = maxCacheAge;
@@ -12559,14 +12449,6 @@ int32_t XmlDoc::getHostHash32a ( ) {
return m_hostHash32a;
}
int32_t XmlDoc::getHostHash32b ( ) {
if ( m_hostHash32bValid ) return m_hostHash32b;
m_hostHash32bValid = true;
Url *c = getCurrentUrl();
m_hostHash32b = c->getHostHash32();
return m_hostHash32b;
}
int32_t XmlDoc::getDomHash32( ) {
if ( m_domHash32Valid ) return m_domHash32;
m_domHash32Valid = true;
@@ -13421,19 +13303,6 @@ char *XmlDoc::getSpiderLinks ( ) {
return &m_spiderLinks2;
}
// should we index the doc? if already indexed, and is filtered, we delete it
char *XmlDoc::getIsFiltered ( ) {
if ( m_isFilteredValid ) return &m_isFiltered;
int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
m_isFiltered = false;
// if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
// if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
if ( *priority == -3 ) m_isFiltered = true;
m_isFilteredValid = true;
return &m_isFiltered;
}
int32_t *XmlDoc::getSpiderPriority ( ) {
if ( m_priorityValid ) return &m_priority;
setStatus ("getting spider priority");
@@ -14121,17 +13990,6 @@ bool XmlDoc::doConsistencyTest ( bool forceTest ) {
return true;
}
int32_t XmlDoc::printMetaList ( ) {
SafeBuf sb;
printMetaList ( m_metaList ,
m_metaList + m_metaListSize ,
&sb );
fprintf(stderr,"%s\n",sb.getBufStart());
return 0;
}
#define TABLE_ROWS 25
// print this also for page parser output!
@@ -16684,10 +16542,6 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
size_linkInfo1 = od->size_linkInfo1;
if ( ptr_linkInfo1 && size_linkInfo1 ) m_linkInfo1Valid = true;
else m_linkInfo1Valid = false;
// turn off for debug
ptr_sectiondbData = NULL;
size_sectiondbData = 0;
}
// for adding a quick reply for EFAKEIP and for diffbot query reindex requests
@@ -17726,7 +17580,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
ksr.m_addedTime = getSpideredTime();//m_spideredTime;
//ksr.m_lastAttempt = 0;
//ksr.m_urlPubDate = urlPubDate;
//ksr.m_errCode = 0;
ksr.m_parentHostHash32 = hostHash32a;
ksr.m_parentDomHash32 = m_domHash32;
@@ -17955,8 +17808,6 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
m_numOutlinksAdded = numAdded;
m_numOutlinksAddedValid = true;
m_numOutlinksAddedFromSameDomain = numAddedFromSameDomain;
m_numOutlinksFiltered = linksFiltered;
m_numOutlinksBanned = linksBanned;
// update end of list once we have successfully added all spider recs
m_p = p;
// return current ptr
@@ -23457,8 +23308,6 @@ char *XmlDoc::getWordSpamVec ( ) {
// fix this a bit so we're not always totally spammed
maxPercent = 25;
// assume not totally spammed
m_totallySpammed = false;
// get # of words we have to set spam for
int32_t numWords = words->getNumWords();
@@ -23670,9 +23519,7 @@ char *XmlDoc::getWordSpamVec ( ) {
// if we had < 100 candidates and < 20% spam, don't bother
//if ( percent < 5 ) goto done;
if ( percent <= maxPercent ) goto done;
// set flag so linkspam.cpp can see if all is spam and will not allow
// this page to vote
m_totallySpammed = true;
// now only set to 99 so each singleton usually gets hashed
for ( i = 0 ; i < numWords ; i++ )
if ( words->getWordId(i) && spam[i] < 99 )
@@ -24627,70 +24474,6 @@ bool XmlDoc::getIsInjecting ( ) {
return isInjecting;
}
// this is still used by Title.cpp to get the title: field quickly
char *getJSONFieldValue ( char *json , char *field , int32_t *valueLen ) {
if ( ! json ) return NULL;
// get length
int32_t fieldLen = gbstrlen(field);
// keep track of in a quote or not
bool inQuotes = false;
char *stringStart = NULL;
char *p = json;
bool gotOne = false;
int32_t depth = 0;
// scan
for ( ; *p ; p++ ) {
// escaping a quote? ignore quote then.
if ( *p == '\\' && p[1] == '\"' ) {
// skip two bytes then..
p++;
continue;
}
// count {} depth
if ( ! inQuotes ) {
if ( *p == '{' ) depth++;
if ( *p == '}' ) depth--;
}
// a quote?
if ( *p == '\"' ) {
inQuotes = ! inQuotes;
// set start of the string if quote is beginning
if ( inQuotes ) stringStart = p + 1;
// if quote is ending and a colon follows then
// it was a json field name. so if it matches the
// field we want return the following field for it.
else if ( ! inQuotes &&
! gotOne &&
p[1] == ':' &&
// {"title":"whatever",...}
// could be product:{title:... depth=2
(depth == 1 ||depth==2) &&
stringStart &&
(p - stringStart) == fieldLen &&
strncmp(field,stringStart,fieldLen)==0 ) {
// now, the next time we set stringStart
// it will be set to the VALUE of this field
// assuming the field is a STRING!!!!
gotOne = true;
// return after the quote
//return p + 2;
}
// ok, we got the string after the field string...
else if ( ! inQuotes && gotOne ) {
if ( valueLen ) *valueLen = p - stringStart;
return stringStart;
}
// keep chugging
continue;
}
}
// done, not found
return NULL;
}
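For reference, a minimal sketch of how a caller such as Title.cpp might invoke this helper; the sample JSON is invented, and the snippet assumes it is linked against the definition above:

#include <cstdio>
#include <cstdint>

// assumes linking against the getJSONFieldValue() defined above
char *getJSONFieldValue ( char *json , char *field , int32_t *valueLen );

int main ( ) {
	char json[] = "{\"title\":\"Example Page\",\"author\":\"nobody\"}";
	int32_t vlen = 0;
	char *v = getJSONFieldValue ( json , (char *)"title" , &vlen );
	// prints: title = Example Page
	if ( v ) printf ( "title = %.*s\n" , (int)vlen , v );
	return 0;
}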
Json *XmlDoc::getParsedJson ( ) {
if ( m_jpValid ) return &m_jp;

XmlDoc.h (152 changed lines)

@@ -71,10 +71,6 @@ bool setLangVec ( class Words *words ,
class Sections *sections ,
int32_t niceness ) ;
char *getJSONFieldValue ( char *json, char *field , int32_t *valueLen ) ;
bool logQueryLogs ( );
bool getDensityRanks ( int64_t *wids ,
int32_t nw,
//int32_t wordStart ,
@@ -192,7 +188,7 @@ public:
uint32_t m_spideredTime; // time_t
uint32_t m_indexedTime; // slightly > m_spideredTime (time_t)
uint32_t m_reserved32;
uint32_t reserved3; //was: m_pubDate; // aka m_datedbDate // time_t
uint32_t reserved3;
uint32_t m_firstIndexedDate; // time_t
uint32_t m_outlinksAddedDate; // time_t
@@ -206,7 +202,7 @@ public:
uint16_t m_bodyStartPos;
uint16_t m_reserved5;
uint16_t m_unused0; //was: m_diffbotJSONCount
uint16_t m_unused0;
int16_t m_httpStatus; // -1 if not found (empty http reply)
@@ -230,9 +226,9 @@ public:
uint16_t m_reserved799:1;
uint16_t m_isSiteRoot:1;
uint16_t m_reserved800:1; //was:m_isDiffbotJSONObject
uint16_t m_reserved801:1; //was:m_sentToDiffbot
uint16_t m_reserved802:1; //was:m_gotDiffbotSuccessfulReply
uint16_t m_reserved800:1;
uint16_t m_reserved801:1;
uint16_t m_reserved802:1;
uint16_t m_useTimeAxis:1; // m_reserved804:1;
uint16_t m_reserved805:1;
uint16_t m_reserved806:1;
@@ -273,7 +269,7 @@ public:
char *ptr_site;
LinkInfo *ptr_linkInfo1;
char *ptr_linkdbData;
char *ptr_sectiondbData;
char *ptr_unused14;
char *ptr_tagRecData;
LinkInfo *ptr_unused9;
@@ -296,7 +292,7 @@ public:
int32_t size_site;
int32_t size_linkInfo1;
int32_t size_linkdbData;
int32_t size_sectiondbData;
int32_t size_unused14;
int32_t size_tagRecData;
int32_t size_unused9;
@@ -395,10 +391,8 @@ public:
int32_t *getPageSampleVector ( ) ;
int32_t *getPostLinkTextVector ( int32_t linkNode ) ;
int32_t computeVector ( class Words *words, uint32_t *vec , int32_t start = 0 , int32_t end = -1 );
float *getTagSimilarity ( class XmlDoc *xd2 ) ;
float *getPageSimilarity ( class XmlDoc *xd2 ) ;
float *getPercentChanged ( );
uint64_t *getFuzzyDupHash ( );
int64_t *getExactContentHash64();
class RdbList *getDupList ( ) ;
char *getIsDup ( ) ;
@@ -471,7 +465,6 @@ public:
int32_t *getTagPairHashVector ( ) ;
uint32_t *getTagPairHash32 ( ) ;
int32_t getHostHash32a ( ) ;
int32_t getHostHash32b ( ) ;
int32_t getDomHash32 ( );
char **getThumbnailData();
class Images *getImages ( ) ;
@@ -482,7 +475,6 @@ public:
char *getIsSiteRoot ( ) ;
int8_t *getHopCount ( ) ;
char *getSpiderLinks ( ) ;
char *getIsFiltered ();
bool getIsInjecting();
int32_t *getSpiderPriority ( ) ;
int32_t *getIndexCode ( ) ;
@@ -492,7 +484,7 @@ public:
bool logIt ( class SafeBuf *bb = NULL ) ;
bool m_doConsistencyTesting;
bool doConsistencyTest ( bool forceTest ) ;
int32_t printMetaList ( ) ;
void printMetaList ( char *metaList , char *metaListEnd ,
class SafeBuf *pbuf );
bool verifyMetaList ( char *p , char *pend , bool forDelete ) ;
@@ -518,10 +510,6 @@ public:
// m_indexCode or g_errno was set!
class SpiderReply *getNewSpiderReply ( );
SpiderRequest m_redirSpiderRequest;
SpiderRequest *m_redirSpiderRequestPtr;
void setSpiderReqForMsg20 ( class SpiderRequest *sreq ,
class SpiderReply *srep );
@@ -542,7 +530,6 @@ public:
bool hashNoSplit ( class HashTableX *tt ) ;
char *hashAll ( class HashTableX *table ) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;
bool hashContentType ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
@@ -550,12 +537,9 @@ public:
SafeBuf *getTimeAxisUrl ( );
bool hashUrl ( class HashTableX *table, bool urlOnly );
bool hashDateNumbers ( class HashTableX *tt );
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText( class HashTableX *table, bool hashAnomalies, bool hashNonAnomalies );
bool hashLinksForLinkdb ( class HashTableX *table ) ;
bool hashNeighborhoods ( class HashTableX *table ) ;
bool hashRSSInfo ( class HashTableX *table ) ;
bool hashRSSTerm ( class HashTableX *table , bool inRSS ) ;
bool hashTitle ( class HashTableX *table );
bool hashBody2 ( class HashTableX *table );
bool hashMetaKeywords ( class HashTableX *table );
@@ -564,12 +548,8 @@ public:
bool hashLanguage ( class HashTableX *table ) ;
bool hashLanguageString ( class HashTableX *table ) ;
bool hashCountry ( class HashTableX *table ) ;
bool hashSiteNumInlinks ( class HashTableX *table ) ;
bool hashCharset ( class HashTableX *table ) ;
bool hashTagRec ( class HashTableX *table ) ;
bool hashPermalink ( class HashTableX *table ) ;
bool hashVectors(class HashTableX *table ) ;
class Url *getBaseUrl ( ) ;
bool hashIsAdult ( class HashTableX *table ) ;
@@ -611,10 +591,6 @@ public:
int32_t bufLen ,
class HashInfo *hi ) ;
bool hashNumberForSortingAsFloat ( float f ,
class HashInfo *hi ,
char *gbsortByStr ) ;
bool hashNumberForSortingAsInt32 ( int32_t x,
class HashInfo *hi ,
char *gbsortByStr ) ;
@@ -678,9 +654,7 @@ public:
int64_t m_firstUrlHash64;
Url m_currentUrl;
CollectionRec *m_lastcr;
collnum_t m_collnum;
int32_t m_lastCollRecResetCount;
class CollectionRec *getCollRec ( ) ;
bool setCollNum ( const char *coll ) ;
@@ -696,8 +670,6 @@ public:
int32_t m_addedStatusDocSize;
SafeBuf m_metaList2;
SafeBuf m_zbuf;
SafeBuf m_kbuf;
// used by msg7 to store udp slot
class UdpSlot *m_injectionSlot;
@@ -719,8 +691,6 @@ public:
char m_logLangId;
int32_t m_logSiteNumInlinks;
SafeBuf m_tmpBuf2;
SafeBuf m_timeAxisUrl;
Images m_images;
@@ -767,7 +737,6 @@ public:
char m_fragBufValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;
char m_redirSpiderRequestValid;
char m_hopCountValid;
char m_isInjectingValid;
@@ -782,7 +751,6 @@ public:
char m_datedbDateValid;
char m_isRSSValid;
char m_isSiteMapValid;
char m_spiderLinksArgValid;
char m_isContentTruncatedValid;
char m_xmlValid;
char m_linksValid;
@@ -790,10 +758,8 @@ public:
char m_bitsValid;
char m_bits2Valid;
char m_posValid;
char m_isUrlBadYearValid;
char m_phrasesValid;
char m_sectionsValid;
char m_subSentsValid;
char m_imageDataValid;
char m_imagesValid;
@@ -806,7 +772,6 @@ public:
bool m_firstIpValid;
bool m_spideredTimeValid;
bool m_indexedTimeValid;
bool m_firstIndexedValid;
bool m_isInIndexValid;
bool m_wasInIndexValid;
bool m_outlinksAddedDateValid;
@@ -828,9 +793,7 @@ public:
bool m_canonicalRedirUrlValid;
bool m_statusMsgValid;
bool m_mimeValid;
bool m_pubDateValid;
bool m_hostHash32aValid;
bool m_hostHash32bValid;
bool m_indexCodeValid;
bool m_priorityValid;
bool m_downloadStatusValid;
@@ -845,29 +808,23 @@ public:
bool m_isPermalinkValid;
bool m_isAdultValid;
bool m_urlPubDateValid;
bool m_isUrlPermalinkFormatValid;
bool m_percentChangedValid;
bool m_unchangedValid;
bool m_countTableValid;
bool m_summaryLangIdValid;
bool m_tagPairHashVecValid;
bool m_summaryVecValid;
bool m_titleVecValid;
bool m_pageSampleVecValid;
bool m_postVecValid;
bool m_dupListValid;
bool m_likedbListValid;
bool m_isDupValid;
bool m_metaDescValid;
bool m_metaSummaryValid;
bool m_metaKeywordsValid;
bool m_metaGeoPlacenameValid;
bool m_siteSpiderQuotaValid;
bool m_oldDocValid;
bool m_extraDocValid;
bool m_rootDocValid;
bool m_oldMetaListValid;
bool m_oldTitleRecValid;
bool m_rootTitleRecValid;
bool m_isIndexedValid;
@@ -881,12 +838,10 @@ public:
bool m_siteHash32Valid;
bool m_httpReplyValid;
bool m_contentTypeValid;
bool m_priorityQueueNumValid;
bool m_outlinkTagRecVectorValid;
bool m_outlinkIpVectorValid;
bool m_hasNoIndexMetaTagValid;
bool m_hasUseFakeIpsMetaTagValid;
bool m_outlinkIsIndexedVectorValid;
bool m_isSiteRootValid;
bool m_wasContentInjectedValid;
bool m_outlinkHopCountVectorValid;
@@ -906,7 +861,6 @@ public:
bool m_htbValid;
bool m_collnumValid;
bool m_summaryValid;
bool m_gsbufValid;
bool m_spiderStatusDocMetaListValid;
bool m_isCompromisedValid;
bool m_isNoArchiveValid;
@@ -914,9 +868,7 @@ public:
bool m_isLinkSpamValid;
bool m_isErrorPageValid;
bool m_isHijackedValid;
bool m_dupHashValid;
bool m_exactContentHash64Valid;
bool m_looseContentHash64Valid;
bool m_jpValid;
char m_isSiteMap;
@@ -933,43 +885,31 @@ public:
// DO NOT add validity flags below this line!
char m_VALIDEND;
bool m_printedMenu;
int32_t m_urlPubDate;
char m_isUrlPermalinkFormat;
uint8_t m_summaryLangId;
int32_t m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
int32_t m_tagPairHashVecSize;
int32_t m_summaryVec [SAMPLE_VECTOR_SIZE/4];
int32_t m_summaryVecSize;
int32_t m_titleVec [SAMPLE_VECTOR_SIZE/4];
int32_t m_titleVecSize;
int32_t m_pageSampleVec[SAMPLE_VECTOR_SIZE/4];
int32_t m_pageSampleVecSize;
int32_t m_postVec[POST_VECTOR_SIZE/4];
int32_t m_postVecSize;
float m_tagSimilarity;
float m_pageSimilarity;
float m_percentChanged;
bool m_unchanged;
// what docids are similar to us? docids are in this list
RdbList m_dupList;
RdbList m_likedbList;
uint64_t m_dupHash;
int64_t m_exactContentHash64;
int64_t m_looseContentHash64;
Msg0 m_msg0;
Msg5 m_msg5;
char m_isDup;
int64_t m_docIdWeAreADupOf;
int32_t m_ei;
int32_t m_lastLaunch;
Msg22Request m_msg22Request;
Msg22Request m_msg22Requestc;
Msg22 m_msg22a;
Msg22 m_msg22b;
Msg22 m_msg22c;
Msg22 m_msg22d;
Msg22 m_msg22e;
Msg22 m_msg22f;
// these now reference directly into the html src so our
@@ -983,13 +923,10 @@ public:
char *m_metaGeoPlacename;
int32_t m_metaGeoPlacenameLen;
int32_t m_siteSpiderQuota;
class XmlDoc *m_oldDoc;
class XmlDoc *m_extraDoc;
class XmlDoc *m_rootDoc;
RdbList m_oldMetaList;
char *m_oldTitleRec;
int32_t m_oldTitleRecSize;
char *m_rootTitleRec;
@@ -1002,12 +939,9 @@ public:
char m_wasInIndex;
Msg8a m_msg8a;
char *m_tagdbColl;
int32_t m_tagdbCollLen;
Url m_extraUrl;
uint8_t m_siteNumInlinks8;
LinkInfo m_siteLinkInfo;
SafeBuf m_mySiteLinkInfoBuf;
SafeBuf m_myPageLinkInfoBuf;
SafeBuf m_myTempLinkInfoBuf;
@@ -1021,9 +955,6 @@ public:
SafeBuf m_tmpBuf12;
Multicast m_mcast11;
Multicast m_mcast12;
// lists from cachedb for msg25's msg20 replies serialized
RdbList m_siteReplyList;
RdbList m_pageReplyList;
MsgC m_msgc;
bool m_isAllowed;
bool m_forwardDownloadRequest;
@@ -1035,22 +966,17 @@ public:
int32_t m_numExpansions;
char m_newOnly;
char m_isWWWDup;
char m_calledMsg0b;
SafeBuf m_linkSiteHashBuf;
SafeBuf m_linkdbDataBuf;
SafeBuf m_langVec;
Msg0 m_msg0b;
class RdbList *m_ulist;
char *m_linkInfoColl;
SiteGetter m_siteGetter;
int64_t m_siteHash64;
int32_t m_siteHash32;
char *m_httpReply;
char m_incrementedAttemptsCount;
char m_incrementedDownloadCount;
char m_redirectFlag;
char m_spamCheckDisabled;
char m_useRobotsTxt;
int32_t m_robotsTxtLen;
int32_t m_httpReplySize;
@@ -1062,16 +988,13 @@ public:
char m_calledThread;
int32_t m_errno;
int32_t m_hostHash32a;
int32_t m_hostHash32b;
int32_t m_domHash32;
int32_t m_priorityQueueNum;
// this points into m_msge0 i guess
Msge0 m_msge0;
// this points into m_msge1 i guess
int32_t *m_outlinkIpVector;
SafeBuf m_outlinkTagRecPtrBuf;
SafeBuf m_fakeIpBuf;
char m_hasNoIndexMetaTag;
char m_hasUseFakeIpsMetaTag;
@@ -1080,23 +1003,13 @@ public:
SafeBuf m_fakeTagRecPtrBuf;
TagRec m_fakeTagRec;
//
// diffbot parms for indexing diffbot's json output
//
char *hashJSONFields ( HashTableX *table );
char *hashJSONFields2 ( HashTableX *table , HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) ;
char *hashXMLFields ( HashTableX *table );
Json *getParsedJson();
// object that parses the json
Json m_jp;
// related query algo stuff
int64_t m_tlbufTimer;
// flow flags
bool m_computedMetaListCheckSum;
@@ -1104,18 +1017,11 @@ public:
// cachedb related args
bool m_allHashed;
// for getRelatedDocIdsWithTitles() launching msg20s
int32_t m_numMsg20Replies;
int32_t m_numMsg20Requests;
int8_t *m_outlinkHopCountVector;
int32_t m_outlinkHopCountVectorSize;
char m_isFiltered;
int32_t m_urlFilterNum;
int32_t m_numOutlinksAdded;
int32_t m_numOutlinksAddedFromSameDomain;
int32_t m_numOutlinksFiltered;
int32_t m_numOutlinksBanned;
int32_t m_numRedirects;
bool m_isPageParser;
Url m_baseUrl;
@@ -1124,12 +1030,8 @@ public:
char m_linkTextBuf[MAX_LINK_TEXT_LEN];
char m_surroundingTextBuf[MAX_SURROUNDING_TEXT_WIDTH];
char m_rssItemBuf[MAX_RSSITEM_SIZE];
SafeBuf m_gsbuf;
char *m_note;
char *m_imageUrl;
char *m_imageUrl2;
SafeBuf m_imageUrlBuf;
SafeBuf m_imageUrlBuf2;
Query m_query;
Matches m_matches;
// meta description buf
@@ -1138,7 +1040,6 @@ public:
SafeBuf m_htb;
Title m_title;
Summary m_summary;
char m_isCompromised;
char m_isNoArchive;
char m_isErrorPage;
char m_isHijacked;
@@ -1146,8 +1047,6 @@ public:
// stuff
char *m_statusMsg;
Msg4 m_msg4;
bool m_incCount;
bool m_decCount;
bool m_deleteFromIndex;
@@ -1171,33 +1070,19 @@ public:
bool m_check1 ;
bool m_check2 ;
bool m_prepared ;
bool m_updatedCounts ;
bool m_updatedCounts2 ;
bool m_copied1 ;
bool m_updatingSiteLinkInfoTags ;
int64_t m_calledMsg22d ;
bool m_didDelay ;
bool m_didDelayUnregister ;
bool m_calledMsg22e ;
bool m_calledMsg22f ;
bool m_calledMsg25 ;
bool m_calledMsg25b ;
bool m_calledMsg8b ;
bool m_calledMsg40 ;
bool m_calledSections ;
bool m_firstEntry ;
bool m_firstEntry2 ;
bool m_launchedSpecialMsg8a ;
bool m_launchedMsg8a2 ;
bool m_loaded ;
bool m_processedLang ;
bool m_doingConsistencyCheck ;
int32_t m_langIdScore;
int32_t m_dist;
// use to store a \0 list of "titles" of the root page so we can
@@ -1217,7 +1102,6 @@ public:
char m_titleBuf[ROOT_TITLE_BUF_MAX];
int32_t m_titleBufSize;
bool m_setTr ;
void (* m_masterLoop) ( void *state );
@@ -1227,8 +1111,6 @@ public:
bool (* m_callback2) ( void *state );
void *m_state;
bool m_skipIframeExpansion;
// this is non-zero if we decided not to index the doc
int32_t m_indexCode;
@@ -1250,12 +1132,8 @@ public:
int32_t m_maxCacheAge;
char *m_wikiqbuf;
int32_t m_wikiqbufSize;
bool m_registeredSleepCallback;
bool m_addedNegativeDoledbRec;
bool m_hashedTitle;
bool m_hashedMetas;
@@ -1293,7 +1171,6 @@ public:
bool m_setFromUrl;
bool m_setFromDocId;
bool m_freeLinkInfo1;
bool m_freeLinkInfo2;
bool m_contentInjected;
bool m_recycleContent;
@@ -1329,7 +1206,6 @@ public:
int32_t getProbSpam ( int32_t *profile, int32_t plen , int32_t step );
bool m_isRepeatSpammer;
int32_t m_numRepeatSpam;
bool m_totallySpammed;
// frag vector (repeated fragments). 0 means repeated, 1 means not.
// vector is 1-1 with words in the document body.
@@ -1358,10 +1234,6 @@ public:
void logQueryTimingEnd(const char* function, int64_t startTime);
int32_t m_i;
int32_t m_blocked;
void *m_finalState;
void (* m_finalCallback) ( void *state );
int64_t m_cacheStartTime;
};
// . PageParser.cpp uses this class for printing hashed terms out by calling

@@ -431,9 +431,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashCountry failed", __FILE__,__func__, __LINE__);
return NULL;
}
// BR 20160117 removed: if ( ! hashSiteNumInlinks( table ) ) return NULL;
// BR 20160117 removed: if ( ! hashTagRec ( table ) ) return NULL;
// BR 20160106 removed: if ( ! hashAds ( table ) ) return NULL;
// BR 20160106 removed: if ( ! hashSubmitUrls ( table ) ) return NULL;
if ( ! hashIsAdult ( table ) )
@@ -445,10 +443,6 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// has gbhasthumbnail:1 or 0
// BR 20160106 removed: if ( ! hashImageStuff ( table ) ) return NULL;
// . hash sectionhash:xxxx terms
// . diffbot still needs to hash this for voting info
// BR 20160106 removed: if ( ! hashSections ( table ) ) return NULL;
// now hash the terms sharded by termid and not docid here since they
// just set a special bit in posdb key so Rebalance.cpp can work.
// this will hash the content checksum which we need for deduping
@@ -477,17 +471,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
return (char *)1;
}
// hash json fields
if ( *ct == CT_JSON ) {
// this hashes both with and without the fieldname
// BR 20160107 removed: hashJSONFields ( table );
goto skip;
}
// same for xml now, so we can search for field:value like w/ json
if ( *ct == CT_XML ) {
// this hashes both with and without the fieldname
// BR 20160107 removed: hashXMLFields ( table );
if ( *ct == CT_JSON || *ct == CT_XML ) {
goto skip;
}
@@ -579,18 +563,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaTags failed", __FILE__,__func__, __LINE__);
return NULL;
}
/*
BR 20160220 removed.
if ( ! hashMetaZip ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashMetaZip failed", __FILE__,__func__, __LINE__);
return NULL;
}
*/
// BR 20160107 removed: if ( ! hashCharset ( table ) ) return NULL;
// BR 20160107 removed: if ( ! hashRSSInfo ( table ) ) return NULL;
if ( ! hashPermalink ( table ) )
if ( ! hashPermalink ( table ) )
{
if( g_conf.m_logTraceXmlDoc ) log(LOG_TRACE,"%s:%s:%d: END, hashPermaLink failed", __FILE__,__func__, __LINE__);
return NULL;
@@ -619,8 +593,6 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
}
bool XmlDoc::setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t uqd ) {
// the posdb table
@@ -870,57 +842,6 @@ bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { // , bool isStatusDoc ) {
return true;
}
bool XmlDoc::hashMetaZip ( HashTableX *tt ) {
setStatus ( "hashing meta zip" );
// . set the score based on quality
// . scores are multiplied by 256 to preserve fractions for adding
uint32_t score = *getSiteNumInlinks8() * 256 ;
if ( score <= 0 ) score = 1;
// search for meta date
char buf [ 32 ];
int32_t bufLen = m_xml.getMetaContent ( buf, 32, "zipcode", 7 );
if ( bufLen <= 0 ) bufLen = m_xml.getMetaContent ( buf, 32, "zip",3);
char *p = buf;
char *pend = buf + bufLen ;
if ( bufLen <= 0 ) return true;
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_tt = tt;
//hi.m_prefix = "zipcode";
hi.m_prefix = "gbzipcode";
nextZip:
// . parse out the zip codes, may be multiple ones
// . skip non-digits
while ( p < pend && ! is_digit(*p) ) p++;
// skip if no digits
if ( p == pend ) return true;
// need at least 5 consecutive digits
if ( p + 5 > pend ) return true;
// if not a zip code, skip it
if ( ! is_digit(p[1]) ) { p += 1; goto nextZip; }
if ( ! is_digit(p[2]) ) { p += 2; goto nextZip; }
if ( ! is_digit(p[3]) ) { p += 3; goto nextZip; }
if ( ! is_digit(p[4]) ) { p += 4; goto nextZip; }
// do we have too many consecutive digits?
if ( p + 5 != pend && is_digit(p[5]) ) {
// if so skip this whole string of digits
p += 5; while ( p < pend && is_digit(*p) ) p++;
goto nextZip;
}
// 90210 --> 90 902 9021 90210
for ( int32_t i = 0 ; i <= 3 ; i++ )
// use prefix as description
if ( ! hashString ( p,5-i,&hi ) ) return false;
p += 5;
goto nextZip;
}
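To make the prefix expansion concrete, a tiny standalone sketch (illustration only) of the terms the loop above generates for one zip code:

#include <cstdio>

// For "90210" the loop hashes lengths 5, 4, 3 and 2 under the
// gbzipcode prefix: 90210, 9021, 902 and 90, so partial-zip
// queries can still match. Standalone illustration only.
int main ( ) {
	const char *zip = "90210";
	for ( int i = 0 ; i <= 3 ; i++ )
		printf ( "gbzipcode:%.*s\n" , 5 - i , zip );
	return 0;
}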
// returns false and sets g_errno on error
bool XmlDoc::hashContentType ( HashTableX *tt ) {
@@ -1788,13 +1709,6 @@ bool XmlDoc::hashUrl ( HashTableX *tt, bool urlOnly ) { // , bool isStatusDoc )
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashSections ( HashTableX *tt ) {
// BR 20160106: No longer store xpath-hashes in posdb as we do not use them.
return true;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashIncomingLinkText ( HashTableX *tt ,
bool hashAnomalies ,
@@ -1981,186 +1895,6 @@ bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) {
goto loop;
}
// . returns false and sets g_errno on error
bool XmlDoc::hashRSSInfo ( HashTableX *tt ) {
setStatus ( "hashing rss info" );
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; }
// . finally hash in the linkText terms from the LinkInfo
// . the LinkInfo class has all the terms of hashed anchor text for us
// . if we're using an old TitleRec linkTermList is just a ptr to
// somewhere in TitleRec
// . otherwise, we generated it from merging a bunch of LinkInfos
// and storing them in this new TitleRec
LinkInfo *linkInfo = getLinkInfo1();
// get the xml of the first rss/atom item/entry referencing this url
Xml xml;
// . returns NULL if no item xml
// . this could also be a "channel" blurb now, so we index channel pgs
if ( ! linkInfo->getItemXml ( &xml , m_niceness ) ) return false;
if ( xml.isEmpty() )
// hash gbrss:0
return hashRSSTerm ( tt , false );
// parser info msg
//if ( m_pbuf ) {
// m_pbuf->safePrintf(
// "<br><b>--BEGIN RSS/ATOM INFO HASH--</b><br><br>");
//}
// hash nothing if not a permalink and eliminating "menus"
//if ( ! *getIsPermalink() && m_eliminateMenus ) return true;
// . IMPORTANT: you must be using the new link algo, so turn it on
// in the spider controls. this allows us to include LinkTexts from
// the same IP in our LinkInfo class in the TitleRec.
// . is it rss or atom? both use title tag, so doesn't matter
// . get the title tag
bool isHtmlEncoded;
int32_t titleLen;
char *title = xml.getRSSTitle ( &titleLen , &isHtmlEncoded );
char c = 0;
// sanity check
if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; }
bool hashIffUnique = true;
// but if we had no content because we were an mp3 or whatever,
// do not worry about avoiding double hashing
if ( size_utf8Content <= 0 ) hashIffUnique = false;
// decode it?
// should we decode it? if they don't use [CDATA[]] then we should
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
// but most other feeds do not use it
if ( isHtmlEncoded && title && titleLen > 0 ) {
// it is html encoded so that the <'s are encoded to &lt;'s so
// we must decode them back. this could turn latin1 into utf8
// though? no, because the &'s should have been encoded, too!
int32_t newLen =htmlDecode(title,title,titleLen,false,m_niceness);
// make sure we don't overflow the buffer
if ( newLen > titleLen ) { char *xx = NULL; *xx = 0; }
// reassign the length
titleLen = newLen;
// NULL terminate it
c = title[titleLen];
title[titleLen] = '\0';
}
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_TITLE;
hi.m_desc = "rss title";
// . hash the rss title
// . only hash the terms if they are unique to stay balanced with docs
// that are not referenced by an rss feed
bool status = hashString ( title,titleLen,&hi ) ;
// pop the end back just in case
if ( c ) title[titleLen] = c;
// return false with g_errno set on error
if ( ! status ) return false;
// get the rss description
int32_t descLen;
char *desc = xml.getRSSDescription ( &descLen , &isHtmlEncoded );
// for advanced hashing
Xml xml2;
Words w;
//Scores scores;
Words *wordsPtr = NULL;
//Scores *scoresPtr = NULL;
c = 0;
// should we decode it? if they don't use [CDATA[]] then we should
// ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA,
// but most other feeds do not use it
if ( isHtmlEncoded && desc && descLen > 0 ) {
// it is html encoded so that the <'s are encoded to &lt;'s so
// we must decode them back. this could turn latin1 into utf8
// though? no, because the &'s should have been encoded, too!
int32_t newLen = htmlDecode(desc,desc,descLen,false,m_niceness);
// make sure we don't overflow the buffer
if ( newLen > descLen ) { char *xx = NULL; *xx = 0; }
// reassign the length
descLen = newLen;
}
// NULL terminate it
if ( desc ) {
c = desc[descLen];
desc[descLen] = '\0';
// set the xml class from the decoded html
if ( !xml2.set( desc, descLen, m_version, m_niceness, *ct ) ) {
return false;
}
// set the words class from the xml, returns false and sets
// g_errno on error
if ( !w.set( &xml2, true, true ) ) {
return false;
}
// pass it in to TermTable::hash() below
wordsPtr = &w;
}
// update hash parms
hi.m_tt = tt;
hi.m_desc = "rss body";
hi.m_hashGroup = HASHGROUP_BODY;
// . hash the rss/atom description
// . only hash the terms if they are unique to stay balanced with docs
// that are not referenced by an rss feed
status = hashString ( desc, descLen, &hi );
// pop the end back just in case
if ( c ) desc[descLen] = c;
// return false with g_errno set
if ( ! status ) return false;
// hash gbrss:1
if ( ! hashRSSTerm ( tt , true ) ) return false;
// parser info msg
//if ( m_pbuf ) {
// m_pbuf->safePrintf("<br><b>--END RSS/ATOM INFO HASH--"
// "</b><br><br>");
//}
return true;
}
bool XmlDoc::hashRSSTerm ( HashTableX *tt , bool inRSS ) {
// hash gbrss:0 or gbrss:1
char *value;
if ( inRSS ) value = "1";
else value = "0";
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = "gbinrss";
hi.m_hashGroup = HASHGROUP_INTAG;
// returns false and sets g_errno on error
if ( ! hashString(value,1,&hi ) ) return false;
// hash gbisrss:1 if we are an rss page ourselves
if ( *getIsRSS() ) value = "1";
else value = "0";
// update hash parms
hi.m_prefix = "gbisrss";
// returns false and sets g_errno on error
if ( ! hashString(value,1,&hi) ) return false;
return true;
}
// . we now do the title hashing here for newer titlerecs, version 80+, rather
// than use the <index> block in the ruleset for titles.
// . this is not to be confused with hashing the title: terms which still
@@ -2426,170 +2160,6 @@ bool XmlDoc::hashCountry ( HashTableX *tt ) {
return true;
}
bool XmlDoc::hashSiteNumInlinks ( HashTableX *tt ) {
setStatus ( "hashing site num inlinks" );
char s[32];
int32_t slen = sprintf(s, "%"INT32"", (int32_t)*getSiteNumInlinks() );
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbsitenuminlinks";
// hack test
// slen = sprintf(s,"%"UINT32"",
// ((uint32_t)m_firstUrl.getUrlHash32()) % 1000);
// log("xmldoc: sitenuminlinks for %s is %s",m_firstUrl.getUrl(),s);
return hashString ( s, slen, &hi );
}
bool XmlDoc::hashCharset ( HashTableX *tt ) {
setStatus ( "hashing charset" );
char s[128]; // charset string
int32_t slen;
// hash the charset as a string
if ( ! get_charset_str(*getCharset()))
slen = sprintf(s, "unknown");
else
slen = sprintf(s, "%s", get_charset_str(*getCharset()));
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gbcharset";
if ( ! hashString ( s,slen, &hi ) ) return false;
// hash charset as a number
slen = sprintf(s, "%d", *getCharset());
return hashString ( s,slen, &hi ) ;
}
// . only hash certain tags (single byte scores and ST_COMMENT)
// . do not hash clocks, ST_SITE, ST_COMMENT
// . term = gbtag:blog1 score=0-100
// . term = gbtag:blog2 score=0-100
// . term = gbtag:english1 score=0-100
// . term = gbtag:pagerank1 score=0-100, etc. ...
// . term = gbtagmeta:"this site"(special hashing,ST_META,score=qlty)
// . later we can support query like gbtag:english1>30
bool XmlDoc::hashTagRec ( HashTableX *tt ) {
setStatus ( "hashing tag rec" );
//char *field = "gbtag:";
//int32_t fieldlen = gbstrlen(field);
//bool retval = true;
// . this tag rec does not have the ST_SITE tag in it to save space
// . it does not have clocks either?
TagRec *gr = getTagRec();
// count occurence of each tag id
//int16_t count [ LAST_TAG ];
//memset ( count , 0 , 2 * LAST_TAG );
// loop over all tags in the title rec
for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
// breathe
QUICKPOLL(m_niceness);
// get id
int32_t type = tag->m_type;
// skip tags we are not supposed to index, like
// ST_CLOCK, etc. or anything with a dataSize not 1
if ( ! tag->isIndexable() ) continue;
// hash these metas below
//if ( type == ST_META ) continue;
//if ( tag->isType("meta") ) continue;
// only single byters. this should have been covered by the
// isIndexable() function.
//if ( tag->getTagDataSize() != 1 ) continue;
// get the name
char *str = getTagStrFromType ( type );
// get data size
//uint8_t *data = (uint8_t *)tag->getTagData();
// make it a string
//char dataStr[6];
//sprintf ( dataStr , "%"INT32"",(int32_t)*data );
// skip if has non numbers
//bool num = true;
//for ( int32_t i = 0 ; i < tag->getTagDataSize() ; i++ )
// if ( ! is_digit(tag->getTagData()[i]) ) num = false;
// skip if it has more than just digits, we are not indexing
// strings at this point
//if ( ! num ) continue;
// point to it, should be a NULL terminated string
char *dataStr = tag->getTagData();
// skip if number is too big
//int32_t val = atol ( dataStr );
// boost by one so we can index "0" score
//val++;
// we really only want to index scores from 0-255
//if ( val > 255 ) continue;
// no negatives
//if ( val <= 0 ) continue;
// count occurence
//count [ type ]++;
// . make the term name to hash after the gbtag:
// . we want to hash "gbtag:english3" for example, for the
// ST_ENGLISH tag id.
char prefix[64];
// . do not include the count for the first occurence
// . follows the gbruleset:36 convention
// . index gbtagspam:0 or gbtagspam:1, etc.!!!
//if ( count[type] == 1 )
sprintf ( prefix , "gbtag%s",str);
// assume that is good enough
//char *prefix = tmp;
// store prefix into m_wbuf so XmlDoc::print() works!
//if ( m_pbuf ) {
// int32_t tlen = gbstrlen(tmp);
// m_wbuf.safeMemcpy(tmp,tlen+1);
// prefix = m_wbuf.getBuf() - (tlen+1);
//}
//else
// sprintf ( tmp , "gbtag%s%"INT32"",str,(int32_t)count[type]);
// "unmap" it so when it is hashed it will have the correct
// 8-bit score. IndexList.cpp will convert it back to 8 bits
// in IndexList::set(table), which sets our termlist from
// this "table".
//int32_t score = score8to32 ( val );
// we already incorporate the score as a string when we hash
// gbtagtagname:tagvalue so why repeat it?
//int32_t score = 1;
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_prefix = prefix;
hi.m_hashGroup = HASHGROUP_INTAG;
// meta is special now
if ( tag->isType("meta") ) {
hi.m_prefix = NULL;
}
// hash it. like "gbtagenglish:1" with a score of 1, etc.
// or "gbtagspam:33" with a score of 33. this would also
// hash gbtagclock:0xfe442211 type things as well.
int32_t dlen = gbstrlen(dataStr);
if ( ! hashString ( dataStr,dlen,&hi ) ) return false;
}
return true;
}
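A small standalone sketch of the term strings the loop above ends up hashing; the tag names and values here are made up:

#include <cstdio>

// For a tag rec holding an "english" tag with data "1" and a "spam"
// tag with data "33", the loop hashes "1" under the prefix
// gbtagenglish and "33" under gbtagspam. Illustration only.
int main ( ) {
	const char *names[] = { "english" , "spam" };
	const char *data [] = { "1" , "33" };
	for ( int i = 0 ; i < 2 ; i++ )
		printf ( "gbtag%s:%s\n" , names[i] , data[i] );
	return 0;
}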
bool XmlDoc::hashPermalink ( HashTableX *tt ) {
setStatus ( "hashing is permalink" );
@@ -2607,15 +2177,6 @@ bool XmlDoc::hashPermalink ( HashTableX *tt ) {
return hashString ( s,1,&hi );
}
bool XmlDoc::hashVectors ( HashTableX *tt ) {
setStatus ( "hashing vectors" );
return true;
}
// returns false and sets g_errno on error
bool XmlDoc::hashIsAdult ( HashTableX *tt ) {
@@ -3399,20 +2960,6 @@ bool XmlDoc::hashNumberForSorting ( char *beginBuf ,
// negative sign?
if ( p > beginBuf && p[-1] == '-' ) p--;
// BR 20160108: Removed all float numbers as we don't plan to use them
// . convert it to a float
// . this now allows for commas in numbers like "1,500.62"
// float f = atof2 ( p , bufEnd - p );
// if ( ! hashNumberForSortingAsFloat ( f , hi , "gbsortby" ) )
// return false;
// also hash in reverse order for sorting from low to high
// f = -1.0 * f;
// if ( ! hashNumberForSortingAsFloat ( f , hi , "gbrevsortby" ) )
// return false;
//
// also hash as an int, 4 byte-integer so our lastSpidered timestamps
// dont lose 128 seconds of resolution
@@ -3433,116 +2980,6 @@ bool XmlDoc::hashNumberForSorting ( char *beginBuf ,
return true;
}
bool XmlDoc::hashNumberForSortingAsFloat ( float f , HashInfo *hi , char *sortByStr ) {
// prefix is something like price. like the meta "name" or
// the json name with dots in it like "product.info.price" or something
int64_t nameHash = 0LL;
int32_t nameLen = 0;
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && nameLen )
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
// need a prefix for hashing numbers... for now
else { char *xx=NULL; *xx=0; }
// combine prefix hash with a special hash to make it unique to avoid
// collisions. this is the "TRUE" prefix.
int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
// hash with the "TRUE" prefix
int64_t ph2 = hash64 ( nameHash , truePrefix64 );
// . now store it
// . use field hash as the termid. normally this would just be
// a prefix hash
// . use mostly fake value otherwise
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
0 , // wordSpamRank ,
0 , //siterank
0 , // hashGroup,
// we set to docLang final hash loop
//langUnknown, // langid
// unless already set. so set to english here
// so it will not be set to something else
// otherwise our floats would be ordered by langid!
// somehow we have to indicate that this is a float
// termlist so it will not be mangled any more.
//langEnglish,
langUnknown,
0 , // multiplier
false, // syn?
false , // delkey?
hi->m_shardByTermId );
//int64_t final = hash64n("products.offerprice",0);
//int64_t prefix = hash64n("gbsortby",0);
//int64_t h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
g_posdb.setFloat ( &k , f );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
// sanity
float t = g_posdb.getFloat ( &k );
if ( t != f ) { char *xx=NULL;*xx=0; }
HashTableX *dt = hi->m_tt;
// the key may indeed collide, but that's ok for this application
if ( ! dt->addTerm144 ( &k ) )
return false;
if ( ! m_wts )
return true;
// store in buffer
char buf[128];
snprintf(buf,126,"%s:%s float32=%f",sortByStr,hi->m_prefix,f);
int32_t bufLen = gbstrlen(buf);
// add to wts for PageParser.cpp display
// store it
if ( ! storeTerm ( buf,
bufLen,
ph2,
hi,
0, // word#, i,
0, // wordPos
0,// densityRank , // 0-15
0, // MAXDIVERSITYRANK,//phrase
0, // ws,
0, // hashGroup,
//true,
&m_wbuf,
m_wts,
// a hack for display in wts:
SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
langUnknown ,
k) )
return false;
return true;
}
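The same float-in-key idea, reduced to a standalone sketch; the 18-byte layout, the offset, and the alignment-bit position below are hypothetical, not posdb's actual 144-bit format:

#include <cstdio>
#include <cstring>

// Store a float's raw 32 bits inside a fixed-size key so a termlist
// can be sorted numerically, then read it back; clear a flag bit so
// readers know the key carries a number. All offsets are made up.
struct Key144 { unsigned char b[18]; };

static void  setFloat ( Key144 *k , float f ) { memcpy ( k->b + 2 , &f , 4 ); }
static float getFloat ( Key144 *k ) { float f; memcpy ( &f , k->b + 2 , 4 ); return f; }

int main ( ) {
	Key144 k; memset ( &k , 0 , sizeof(k) );
	setFloat ( &k , 19.99f );
	k.b[0] &= (unsigned char)~0x02;   // hypothetical "alignment" flag bit
	printf ( "stored=%.2f\n" , getFloat ( &k ) );
	return 0;
}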
bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , char *sortByStr ) {
// prefix is something like price. like the meta "name" or
@@ -3662,22 +3099,6 @@ bool XmlDoc::hashNumberForSortingAsInt32 ( int32_t n , HashInfo *hi , char *sort
#include "Json.h"
char *XmlDoc::hashJSONFields ( HashTableX *table ) {
setStatus ( "hashing json fields" );
HashInfo hi;
hi.m_tt = table;
hi.m_desc = "json object";
// use new json parser
Json *jp = getParsedJson();
if ( ! jp || jp == (void *)-1 ) return (char *)jp;
return hashJSONFields2 ( table , &hi , jp , true );
}
char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
HashInfo *hi , Json *jp ,
bool hashWithoutFieldNames ) {
@@ -3815,63 +3236,3 @@ char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
return (char *)0x01;
}
char *XmlDoc::hashXMLFields ( HashTableX *table ) {
setStatus ( "hashing xml fields" );
HashInfo hi;
hi.m_tt = table;
hi.m_desc = "xml object";
hi.m_hashGroup = HASHGROUP_BODY;
Xml *xml = getXml();
int32_t n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes ();
SafeBuf nameBuf;
// scan the xml nodes
for ( int32_t i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// . skip if it's a tag not text node skip it
// . we just want the "text" nodes
if ( nodes[i].isTag() ) continue;
//if(!strncmp(nodes[i].m_node,"Congress%20Presses%20Uber",20))
// log("hey:hy");
// assemble the full parent name
// like "tag1.tag2.tag3"
nameBuf.reset();
xml->getCompoundName ( i , &nameBuf );
// this is \0 terminated
char *tagName = nameBuf.getBufStart();
// get the utf8 text
char *val = nodes[i].m_node;
int32_t vlen = nodes[i].m_nodeLen;
// index like "title:whatever"
if ( tagName && tagName[0] ) {
hi.m_prefix = tagName;
hashString ( val , vlen , &hi );
}
// hash without the field name as well
hi.m_prefix = NULL;
hashString ( val , vlen , &hi );
}
return (char *)0x01;
}
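For clarity, a standalone sketch of what the loop above would index for a small made-up document like <product><title>Red Shoe</title></product>:

#include <cstdio>

// The text node "Red Shoe" is hashed twice: once field-qualified by
// its compound tag name, and once bare. Document and terms invented.
int main ( ) {
	printf ( "product.title:Red Shoe\n" ); // with hi.m_prefix = "product.title"
	printf ( "Red Shoe\n" );               // and again with no prefix
	return 0;
}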