// Matt Wells, copyright Apr 2009

// . 2. you can also call setTitleRec() and then call getMetaList()
// . this class is used by Repair.cpp and by Msg7 (inject) and SpiderLoop.cpp
// . Msg7 and Repair.cpp and injections can also set more than just
//   m_firstUrl, like m_content, etc., or whatever elements are known, but
//   they must also set the corresponding "valid" flags of those elements
// . both methods must yield exactly the same result, the same "meta list"
// . after setting the contained classes XmlDoc::setMetaList() makes the list
//   of rdb records to be added to all the rdbs, this is the "meta list"
// . the meta list is made by hashing all the termIds/scores into some hash
//   tables in order to accumulate scores, then the hash tables are serialized
//   into the "meta list"
// . the meta list is added to all rdbs with a simple call to
//   Msg4::addMetaList(), which is only called by Msg14 or Repair.cpp for now

#ifndef GB_XMLDOC_H
#define GB_XMLDOC_H

#include "Lang.h"
#include "Words.h"
#include "Bits.h"
#include "Pos.h"
#include "Phrases.h"
#include "Xml.h"
#include "SafeBuf.h"
#include "Images.h"
#include "Sections.h"
#include "Msge0.h"
#include "Msge1.h"
#include "Msg4.h"
#include "SearchInput.h"
#include "Msg40.h"
#include "Msg0.h"
#include "Msg22.h"
#include "Tagdb.h"
#include "Url.h"
#include "Linkdb.h"
#include "MsgC.h"
#include "Msg13.h"
#include "RdbList.h"
#include "SiteGetter.h"
#include "Msg20.h"
#include "Matches.h"
#include "Query.h"
#include "Title.h"
#include "Summary.h"
#include "zlib.h"     // Z_OK
#include "Spider.h"   // SpiderRequest/SpiderReply definitions
#include "HttpMime.h" // ET_DEFLATE
#include "Msg1.h"
#include "Json.h"

#define MAXFRAGWORDS 80000
#define MAX_TAG_PAIR_HASHES 100

#define POST_VECTOR_SIZE (32*4)
#define MAX_LINK_TEXT_LEN 512
#define MAX_SURROUNDING_TEXT_WIDTH 600
#define MAX_RSSITEM_SIZE 30000

bool getDensityRanks ( const int64_t *wids ,
                       int32_t nw ,
                       int32_t hashGroup ,
                       SafeBuf *densBuf ,
                       const Sections *sections ,
                       int32_t niceness );

// diversity vector
bool getDiversityVec ( const Words *words ,
                       const Phrases *phrases ,
                       class HashTableX *countTable ,
                       class SafeBuf *sbWordVec ,
                       //class SafeBuf *sbPhraseVec ,
                       int32_t niceness );

float computeSimilarity ( int32_t *vec0 ,
                          int32_t *vec1 ,
                          // corresponding scores vectors
                          int32_t *s0 ,
                          int32_t *s1 ,
                          class Query *q ,
                          int32_t niceness ,
                          // only Sections::addDateBasedImpliedSections()
                          // sets this to true right now. if set to true
                          // we essentially dedup each vector, although
                          // the score is compounded into the remaining
                          // occurrence. i'm not sure if that is the right
                          // behavior though.
                          bool dedupVecs = false );

bool isSimilar_sorted ( int32_t *vec0 ,
                        int32_t *vec1 ,
                        int32_t nv0 , // how many int32_ts in vec?
                        int32_t nv1 , // how many int32_ts in vec?
                        // they must be this similar or more to return true
                        int32_t percentSimilar ,
                        int32_t niceness ) ;
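// A minimal usage sketch for the similarity helpers above. It assumes the
// vectors come from XmlDoc::getPageSampleVector() below and that NULL score
// vectors and a NULL query are acceptable; "oldDoc", "newDoc" and "niceness"
// are illustrative names, not part of this header:
//
//   int32_t *v1  = oldDoc->getPageSampleVector();
//   int32_t *v2  = newDoc->getPageSampleVector();
//   float    sim = computeSimilarity ( v1, v2, NULL, NULL, NULL, niceness );
//
// isSimilar_sorted() presumably expects both vectors pre-sorted and returns
// true when they are at least percentSimilar percent alike.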
// tell zlib to use our malloc/free functions
int gbuncompress ( unsigned char *dest , uint32_t *destLen ,
                   unsigned char *source , uint32_t sourceLen );
int gbcompress ( unsigned char *dest , uint32_t *destLen ,
                 unsigned char *source , uint32_t sourceLen ,
                 int32_t encoding = ET_DEFLATE );

uint32_t score8to32 ( uint8_t score8 );

// for Msg13.cpp
char getContentTypeFromContent ( char *p , int32_t niceness ) ;

// . for Msg13.cpp
// . *pend must equal \0
int32_t getContentHash32Fast ( unsigned char *p ,
                               int32_t plen ,
                               int32_t niceness ) ;

uint16_t getCharsetFast ( class HttpMime *mime ,
                          char *url ,
                          char *s ,
                          int32_t slen ,
                          int32_t niceness );

bool getWordPosVec ( const Words *words ,
                     const Sections *sections ,
                     int32_t startDist ,
                     const char *fragVec ,
                     int32_t niceness ,
                     SafeBuf *wpos );

#define ROOT_TITLE_BUF_MAX 512

class XmlDoc {

public:

    /// @warning Do NOT change. titlerec binary compatibility header
    //
    // BEGIN WHAT IS STORED IN THE TITLE REC (Titledb.h)
    //

    // headerSize = this->ptr_firstUrl - this->m_headerSize
    uint16_t m_headerSize;
    uint16_t m_version;
    // these flags are used to indicate which ptr_ members are present:
    uint32_t m_internalFlags1;
    int32_t m_ip;
    int32_t m_crawlDelay;
    // . use this to quickly detect if doc is unchanged
    // . we can avoid setting Xml and Words classes etc...
    int32_t m_contentHash32;
    // this is a hash of all adjacent tag pairs for templated identification
    uint32_t m_tagPairHash32;
    int32_t m_siteNumInlinks;
    int32_t m_reserved1;
    int32_t m_reserved2;
    uint32_t m_spideredTime;      // time_t
    uint32_t m_indexedTime;       // slightly > m_spideredTime (time_t)
    uint32_t m_reserved32;
    uint32_t reserved3;
    uint32_t m_firstIndexedDate;  // time_t
    uint32_t m_outlinksAddedDate; // time_t
    uint16_t m_charset;           // the ORIGINAL charset, we are always utf8!
    uint16_t m_countryId;
    int32_t m_reserved3;
    uint8_t m_metaListCheckSum8;  // bring it back!!
    char m_reserved3b;
    uint16_t m_bodyStartPos;
    uint16_t m_reserved5;
    uint16_t m_unused0;
    int16_t m_httpStatus;         // -1 if not found (empty http reply)
    int8_t m_hopCount;
    uint8_t m_langId;
    uint8_t m_reserved6;
    uint8_t m_contentType;

    // bit flags
    uint16_t m_isRSS:1;
    uint16_t m_isPermalink:1;
    uint16_t m_isAdult:1;
    uint16_t m_wasContentInjected:1; //eliminateMenus:1;
    uint16_t m_spiderLinks:1;
    uint16_t m_isContentTruncated:1;
    uint16_t m_isLinkSpam:1;
    uint16_t m_reserved796:1;
    uint16_t m_reserved797:1;
    uint16_t m_reserved798:1;
    uint16_t m_reserved799:1;
    uint16_t m_isSiteRoot:1;
    uint16_t m_reserved800:1;
    uint16_t m_reserved801:1;
    uint16_t m_reserved802:1;
    uint16_t m_useTimeAxis:1; // m_reserved804:1;
    uint16_t m_reserved805:1;
    uint16_t m_reserved806:1;
    uint16_t m_reserved807:1;
    uint16_t m_reserved808:1;
    uint16_t m_reserved809:1;
    uint16_t m_reserved810:1;
    uint16_t m_reserved811:1;
    uint16_t m_reserved812:1;
    uint16_t m_reserved813:1;
    uint16_t m_reserved814:1;
    uint16_t m_reserved815:1;
    uint16_t m_reserved816:1;

    // end of titlerec binary compatibility header

    /// @warning Do NOT change the structure of the following until m_dummyEnd.
    /// check in XmlDoc::set2 (sanity check. must match exactly)
    char *ptr_firstUrl;
    char *ptr_redirUrl;
    char *ptr_rootTitleBuf;
    int32_t *ptr_unused12;
    int32_t *ptr_unused13;
    void *ptr_unused8;
    int64_t *ptr_unused10;
    float *ptr_unused11;
    char *ptr_imageData;
    int32_t *ptr_unused6;
    int32_t *ptr_unused7;
    char *ptr_unused1;
    char *ptr_unused2;
    char *ptr_unused3;
    char *ptr_utf8Content;
    char *ptr_unused5;
    // do not let SiteGetter change this when we re-parse!
    char *ptr_site;
    LinkInfo *ptr_linkInfo1;
    char *ptr_linkdbData;
    char *ptr_unused14;
    char *ptr_tagRecData;
    LinkInfo *ptr_unused9;

    int32_t size_firstUrl;
    int32_t size_redirUrl;
    int32_t size_rootTitleBuf;
    int32_t size_unused12;
    int32_t size_unused13;
    int32_t size_unused8;
    int32_t size_unused10;
    int32_t size_unused11;
    int32_t size_imageData;
    int32_t size_unused6;
    int32_t size_unused7;
    int32_t size_unused1;
    int32_t size_unused2;
    int32_t size_unused3;
    int32_t size_utf8Content;
    int32_t size_unused5;
    int32_t size_site;
    int32_t size_linkInfo1;
    int32_t size_linkdbData;
    int32_t size_unused14;
    int32_t size_tagRecData;
    int32_t size_unused9;

    char m_dummyEnd;
    //
    // END WHAT IS STORED IN THE TITLE REC (Titledb.h)
    //

    char *ptr_scheme;
    int32_t size_scheme;

public:

    bool set2 ( char *titleRec ,
                int32_t maxSize ,
                const char *coll ,
                class SafeBuf *p ,
                int32_t niceness ,
                class SpiderRequest *sreq = NULL );

    // . since being set from a docId, we will load the old title rec
    //   and use that!
    // . used by PageGet.cpp
    bool set3 ( int64_t docId , const char *coll , int32_t niceness );

    bool set4 ( class SpiderRequest *sreq ,
                key_t *doledbKey ,
                const char *coll ,
                class SafeBuf *pbuf ,
                int32_t niceness ,
                char *utf8Content = NULL ,
                bool deleteFromIndex = false ,
                int32_t forcedIp = 0 ,
                uint8_t contentType = CT_HTML ,
                uint32_t spideredTime = 0 , // time_t
                bool contentHasMime = false );

    // we now call this right away rather than at download time!
    int32_t getSpideredTime();

    // time right before adding the termlists to the index, etc.
    // whereas spider time is the download time
    int32_t getIndexedTime();

    // another entry point, like set3() kinda
    bool loadFromOldTitleRec ();

    XmlDoc() ;
    ~XmlDoc() ;

    void nukeDoc ( class XmlDoc * );
    void reset ( ) ;
    bool setFirstUrl ( char *u ) ;
    void setStatus ( const char *s ) ;
    void setCallback ( void *state, void (*callback) (void *state) ) ;
    void setCallback ( void *state, bool (*callback) (void *state) ) ;
    void getRevisedSpiderRequest ( class SpiderRequest *revisedReq );
    void getRebuiltSpiderRequest ( class SpiderRequest *sreq ) ;
    bool indexDoc ( );
    bool indexDoc2 ( );
    key_t *getTitleRecKey() ;
    char *prepareToMakeTitleRec ( ) ;
    // store TitleRec into "buf" so it can be added to metalist
    bool setTitleRecBuf ( SafeBuf *buf , int64_t docId , int64_t uh48 );
    // sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
    SafeBuf *getTitleRecBuf ( );
    SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
                                          bool forDelete ) ;
    SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
    bool setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t ssDocId ) ;
    SafeBuf m_spiderStatusDocMetaList;
    char *getIsAdult ( ) ;
    char *getIsPermalink ( ) ;
    char *getIsUrlPermalinkFormat ( ) ;
    char *getIsRSS ( ) ;
    char *getIsSiteMap ( ) ;
    class Xml *getXml ( ) ;
    uint8_t *getLangVector ( ) ;
    uint8_t *getLangId ( ) ;
    char computeLangId ( Sections *sections , Words *words , char *lv ) ;
    class Words *getWords ( ) ;
    class Bits *getBits ( ) ;
    class Bits *getBitsForSummary ( ) ;
    class Pos *getPos ( );
    class Phrases *getPhrases ( ) ;
    class Sections *getSections ( ) ;
    int32_t *getLinkSiteHashes ( );
    class Links *getLinks ( bool doQuickSet = false ) ;
    class HashTableX *getCountTable ( ) ;
    bool hashString_ct ( class HashTableX *ht , char *s , int32_t slen ) ;
    int32_t *getSummaryVector ( ) ;
    int32_t *getPageSampleVector ( ) ;
    int32_t *getPostLinkTextVector ( int32_t linkNode ) ;
    int32_t computeVector ( class Words *words , uint32_t *vec ,
                            int32_t start = 0 , int32_t end = -1 );
    float *getPageSimilarity ( class XmlDoc *xd2 ) ;
    float *getPercentChanged ( );
    int64_t *getExactContentHash64();
    class RdbList *getDupList ( ) ;
    char *getIsDup ( ) ;
    char *getMetaDescription ( int32_t *mdlen ) ;
    char *getMetaSummary ( int32_t *mslen ) ;
    char *getMetaKeywords ( int32_t *mklen ) ;
    char *getMetaGeoPlacename ( int32_t *mgplen );
    class Url *getCurrentUrl ( ) ;
    class Url *getFirstUrl() ;
    int64_t getFirstUrlHash48();
    int64_t getFirstUrlHash64();
    class Url **getLastRedirUrl() ;
    class Url **getRedirUrl() ;
    class Url **getMetaRedirUrl() ;
    class Url **getCanonicalRedirUrl ( ) ;
    int32_t *getFirstIndexedDate ( ) ;
    int32_t *getOutlinksAddedDate ( ) ;
    uint16_t *getCountryId ( ) ;
    class XmlDoc **getOldXmlDoc ( ) ;
    class XmlDoc **getExtraDoc ( char *url , int32_t maxCacheAge = 0 ) ;
    bool getIsPageParser ( ) ;
    class XmlDoc **getRootXmlDoc ( int32_t maxCacheAge = 0 ) ;
    char **getOldTitleRec ( );
    char **getRootTitleRec ( ) ;
    int64_t *getAvailDocIdOnly ( int64_t preferredDocId ) ;
    int64_t *getDocId ( ) ;
    char *getIsIndexed ( ) ;
    class TagRec *getTagRec ( ) ; // non-dup/nondup addresses only
    int32_t *getFirstIp ( ) ;
    uint8_t *getSiteNumInlinks8 () ;
    int32_t *getSiteNumInlinks ( ) ;
    class LinkInfo *getSiteLinkInfo() ;
    int32_t *getIp ( ) ;
    int32_t *gotIp ( bool save ) ;
    bool *getIsAllowed ( ) ;
    int32_t *getFinalCrawlDelay();
    int32_t m_finalCrawlDelay;
    char *getIsWWWDup ( ) ;
    class LinkInfo *getLinkInfo1 ( ) ;
    char *getSite ( ) ;
    const char *getScheme ( ) ;
    void gotSite ( ) ;
    int32_t *getSiteHash32 ( ) ;
    char **getHttpReply ( ) ;
    char **getHttpReply2 ( ) ;
    char **gotHttpReply ( ) ;
    char *getIsContentTruncated ( );
    int32_t *getDownloadStatus ( ) ;
    int64_t *getDownloadEndTime ( ) ;
    int16_t *getHttpStatus ( );
    char waitForTimeSync ( ) ;
    bool m_alreadyRegistered;
    class HttpMime *getMime () ;
    char **getContent ( ) ;
    uint8_t *getContentType ( ) ;
    uint16_t *getCharset ( ) ;
    char **getFilteredContent ( ) ;
    void filterStart_r ( bool amThread ) ;
    char **getRawUtf8Content ( ) ;
    char **getExpandedUtf8Content ( ) ;
    char **getUtf8Content ( ) ;
    // we download large files to a file on disk, like warcs and arcs
    int32_t *getContentHash32 ( ) ;
    int32_t *getContentHashJson32 ( ) ;
    int32_t *getTagPairHashVector ( ) ;
    uint32_t *getTagPairHash32 ( ) ;
    int32_t getHostHash32a ( ) ;
    int32_t getDomHash32 ( );
    char **getThumbnailData();
    class Images *getImages ( ) ;
    class TagRec ***getOutlinkTagRecVector () ;
    char *hasNoIndexMetaTag();
    char *hasFakeIpsMetaTag ( );
    int32_t **getOutlinkFirstIpVector () ;
    char *getIsSiteRoot ( ) ;
    int8_t *getHopCount ( ) ;
    char *getSpiderLinks ( ) ;
    bool getIsInjecting();
    int32_t *getSpiderPriority ( ) ;
    int32_t *getIndexCode ( ) ;
    SafeBuf *getNewTagBuf ( ) ;
    bool logIt ( class SafeBuf *bb = NULL ) ;
    bool m_doConsistencyTesting;
    bool doConsistencyTest ( bool forceTest ) ;
    void printMetaList ( char *metaList , char *metaListEnd ,
                         class SafeBuf *pbuf );
    bool verifyMetaList ( char *p , char *pend , bool forDelete ) ;
    bool hashMetaList ( class HashTableX *ht , char *p , char *pend ,
                        bool checkList ) ;
    char *getMetaList ( bool forDelete = false );
    uint64_t m_downloadStartTime;
    uint64_t m_ipStartTime;
    uint64_t m_ipEndTime;
    bool m_updatedMetaData;
    void copyFromOldDoc ( class XmlDoc *od ) ;
    class SpiderReply *getFakeSpiderReply ( );
    // we add a SpiderReply to spiderdb when done spidering, even if
    // m_indexCode or g_errno was set!
    class SpiderReply *getNewSpiderReply ( );
    void setSpiderReqForMsg20 ( class SpiderRequest *sreq ,
                                class SpiderReply *srep );
    char *addOutlinkSpiderRecsToMetaList ( );
    int32_t getSiteRank ();
    bool addTable144 ( class HashTableX *tt1 , int64_t docId ,
                       class SafeBuf *buf = NULL );
    bool addTable224 ( HashTableX *tt1 ) ;
    bool hashNoSplit ( class HashTableX *tt ) ;
    char *hashAll ( class HashTableX *table ) ;
    bool hashMetaTags ( class HashTableX *table ) ;
    bool hashContentType ( class HashTableX *table ) ;
    bool hashLinks ( class HashTableX *table ) ;
    bool getUseTimeAxis ( ) ;
    SafeBuf *getTimeAxisUrl ( );
    bool hashUrl ( class HashTableX *table , bool urlOnly );
    bool hashDateNumbers ( class HashTableX *tt );
    bool hashIncomingLinkText ( class HashTableX *table ,
                                bool hashAnomalies ,
                                bool hashNonAnomalies );
    bool hashLinksForLinkdb ( class HashTableX *table ) ;
    bool hashNeighborhoods ( class HashTableX *table ) ;
    bool hashTitle ( class HashTableX *table );
    bool hashBody2 ( class HashTableX *table );
    bool hashMetaKeywords ( class HashTableX *table );
    bool hashMetaGeoPlacename ( class HashTableX *table );
    bool hashMetaSummary ( class HashTableX *table );
    bool hashLanguage ( class HashTableX *table ) ;
    bool hashLanguageString ( class HashTableX *table ) ;
    bool hashCountry ( class HashTableX *table ) ;
    bool hashPermalink ( class HashTableX *table ) ;
    class Url *getBaseUrl ( ) ;
    bool hashIsAdult ( class HashTableX *table ) ;
    void set20 ( Msg20Request *req ) ;
    class Msg20Reply *getMsg20Reply ( ) ;
    Query *getQuery() ;
    Matches *getMatches () ;
    char *getDescriptionBuf ( char *displayMetas , int32_t *dlen ) ;
    SafeBuf *getHeaderTagBuf();
    class Title *getTitle ();
    class Summary *getSummary () ;
    char *getHighlightedSummary ( bool *isSetFromTagsPtr );
    char *getIsNoArchive ( ) ;
    int32_t *getUrlFilterNum();
    char *getIsLinkSpam ( ) ;
    char *getIsErrorPage ( ) ;
    const char *matchErrorMsg ( char *p , char *pend );
    bool hashWords ( class HashInfo *hi );
    bool hashSingleTerm ( const char *s , int32_t slen , class HashInfo *hi );
    bool hashString ( char *s , int32_t slen , class HashInfo *hi );
    bool hashWords3 ( class HashInfo *hi ,
                      const Words *words ,
                      class Phrases *phrases ,
                      class Sections *sections ,
                      class HashTableX *countTable ,
                      char *fragVec ,
                      char *wordSpamVec ,
                      char *langVec ,
                      class HashTableX *wts ,
                      class SafeBuf *wbuf ,
                      int32_t niceness );
    bool hashString3 ( char *s ,
                       int32_t slen ,
                       class HashInfo *hi ,
                       class HashTableX *countTable ,
                       class SafeBuf *pbuf ,
                       class HashTableX *wts ,
                       class SafeBuf *wbuf ,
                       int32_t version ,
                       int32_t siteNumInlinks ,
                       int32_t niceness );
    // gbfieldmatch:
    bool hashFieldMatchTerm ( char *val , int32_t vlen , class HashInfo *hi );
    bool hashNumberForSorting ( const char *beginBuf ,
                                const char *buf ,
                                int32_t bufLen ,
                                class HashInfo *hi ) ;
    bool hashNumberForSortingAsInt32 ( int32_t x ,
                                       class HashInfo *hi ,
                                       const char *gbsortByStr ) ;
    // print out for PageTitledb.cpp and PageParser.cpp
    bool printDoc ( class SafeBuf *pbuf );
    bool printMenu ( class SafeBuf *pbuf );
    bool printDocForProCog ( class SafeBuf *sb , HttpRequest *hr ) ;
    bool printGeneralInfo ( class SafeBuf *sb , HttpRequest *hr ) ;
    bool printRainbowSections ( class SafeBuf *sb , HttpRequest *hr );
    bool printSiteInlinks ( class SafeBuf *sb , HttpRequest *hr );
    bool printPageInlinks ( class SafeBuf *sb , HttpRequest *hr );
    bool printTermList ( class SafeBuf *sb , HttpRequest *hr );
    bool printSpiderStats ( class SafeBuf *sb , HttpRequest *hr );
    bool printCachedPage ( class SafeBuf *sb , HttpRequest *hr );
    char *getTitleBuf ( );
    char *getRootTitleBuf ( );
    char *getFilteredRootTitleBuf ( );
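    // A minimal sketch of the indexing flow described in the comment at the
    // top of this file: set the document, then build the "meta list" of rdb
    // records. The collection name "main", niceness 0, the "state"/"wrapper"
    // callback pair and the error handling are illustrative assumptions, not
    // taken from this header:
    //
    //   XmlDoc xd;
    //   if ( ! xd.set4 ( &sreq , NULL , "main" , NULL , 0 ) )
    //       return false; // presumably g_errno is set
    //   xd.setCallback ( state , wrapper ); // called back if a step blocks
    //   char *metaList = xd.getMetaList();
    //   // ... the finished meta list is then handed to Msg4::addMetaList(),
    //   // as noted at the top of this file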
public:

    // stuff set from the key of the titleRec, above the compression area
    int64_t m_docId;

    char *m_ubuf;
    int32_t m_ubufSize;
    int32_t m_ubufAlloc;

    // private:

    // when we started spidering it, in milliseconds since the epoch
    int64_t m_startTime;
    int64_t m_injectStartTime;

    class XmlDoc *m_prevInject;
    class XmlDoc *m_nextInject;

    // when set() was called by Msg20.cpp so we can time how long it took
    // to generate the summary
    int64_t m_setTime;
    int64_t m_cpuSummaryStartTime;

    // . these should all be set using set*() function calls so their
    //   individual validity flags can be set to true, and successive
    //   calls to their corresponding get*() functions will not core
    // . these particular guys are set immediately on set(char *titleRec)
    Url m_redirUrl;
    Url *m_redirUrlPtr;
    Url *m_lastRedirUrlPtr;
    SafeBuf m_redirCookieBuf;
    Url m_metaRedirUrl;
    Url *m_metaRedirUrlPtr;
    Url m_canonicalRedirUrl;
    Url *m_canonicalRedirUrlPtr;
    int32_t m_redirError;
    char m_allowSimplifiedRedirs;
    Url m_firstUrl;
    int64_t m_firstUrlHash48;
    int64_t m_firstUrlHash64;
    Url m_currentUrl;

    collnum_t m_collnum;
    class CollectionRec *getCollRec ( ) ;
    bool setCollNum ( const char *coll ) ;

    char *m_content;
    int32_t m_contentLen;

    char *m_metaList;
    int32_t m_metaListSize;
    int32_t m_addedSpiderRequestSize;
    int32_t m_addedSpiderReplySize;
    int32_t m_addedStatusDocSize;

    SafeBuf m_metaList2;

    // used by msg7 to store udp slot
    class UdpSlot *m_injectionSlot;

    // . same thing, a little more complicated
    // . these classes are only set on demand
    Xml m_xml;
    Links m_links;
    Words m_words;
    Bits m_bits;
    Bits m_bits2;
    Pos m_pos;
    Phrases m_phrases;
    SafeBuf m_synBuf;
    Sections m_sections;

    // . for rebuild logging of what's changed
    // . Repair.cpp sets these based on titlerec
    char m_logLangId;
    int32_t m_logSiteNumInlinks;

    SafeBuf m_timeAxisUrl;

    bool isFirstUrlRobotsTxt();
    bool m_isRobotsTxtUrl;

    Images m_images;
    HashTableX m_countTable;
    HttpMime m_mime;
    TagRec m_tagRec;
    SafeBuf m_tagRecBuf;
    SafeBuf m_newTagBuf;
    SafeBuf m_fragBuf;
    SafeBuf m_wordSpamBuf;
    SafeBuf m_finalSummaryBuf;
    bool m_isFinalSummarySetFromTags;

    int32_t m_firstIp;

    class SafeBuf *m_savedSb;
    class HttpRequest *m_savedHr;
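    // A sketch of the validity-flag idiom described above: each get*() routine
    // returns its cached member once the matching *Valid flag below is set and
    // only computes it on the first call. The body is an illustrative guess,
    // not the actual implementation; hash32n() is assumed here and the error
    // and blocking checks are omitted:
    //
    //   int32_t *XmlDoc::getSiteHash32 ( ) {
    //       if ( m_siteHash32Valid ) return &m_siteHash32; // already computed
    //       char *site        = getSite();        // may call other getters first
    //       m_siteHash32      = hash32n ( site ); // compute once
    //       m_siteHash32Valid = true;             // mark valid for next time
    //       return &m_siteHash32;
    //   }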
    // validity flags. on reset() all these are set to false.
    char m_VALIDSTART;
    // DO NOT add validity flags above this line!
    char m_metaListValid;
    char m_addedSpiderRequestSizeValid;
    char m_addedSpiderReplySizeValid;
    char m_addedStatusDocSizeValid;
    char m_downloadStartTimeValid;
    char m_siteValid;
    char m_startTimeValid;
    char m_currentUrlValid;
    char m_useTimeAxisValid;
    char m_timeAxisUrlValid;
    char m_firstUrlValid;
    char m_firstUrlHash48Valid;
    char m_firstUrlHash64Valid;
    char m_lastUrlValid;
    char m_docIdValid;
    char m_availDocIdValid;
    char m_tagRecValid;
    char m_robotsTxtLenValid;
    char m_tagRecDataValid;
    char m_newTagBufValid;
    char m_rootTitleBufValid;
    char m_filteredRootTitleBufValid;
    char m_titleBufValid;
    char m_fragBufValid;
    char m_isRobotsTxtUrlValid;
    char m_wordSpamBufValid;
    char m_finalSummaryBufValid;
    char m_hopCountValid;
    char m_isInjectingValid;
    char m_isImportingValid;
    char m_metaListCheckSum8Valid;
    char m_contentValid;
    char m_filteredContentValid;
    char m_charsetValid;
    char m_langVectorValid;
    char m_langIdValid;
    char m_datedbDateValid;
    char m_isRSSValid;
    char m_isSiteMapValid;
    char m_isContentTruncatedValid;
    char m_xmlValid;
    char m_linksValid;
    char m_wordsValid;
    char m_bitsValid;
    char m_bits2Valid;
    char m_posValid;
    char m_phrasesValid;
    char m_sectionsValid;
    char m_imageDataValid;
    char m_imagesValid;
    char m_msge0Valid;
    char m_msge1Valid;
    char m_sreqValid;
    char m_srepValid;
    bool m_ipValid;
    bool m_firstIpValid;
    bool m_spideredTimeValid;
    bool m_indexedTimeValid;
    bool m_isInIndexValid;
    bool m_wasInIndexValid;
    bool m_outlinksAddedDateValid;
    bool m_countryIdValid;
    bool m_bodyStartPosValid;
    bool m_httpStatusValid;
    bool m_crawlDelayValid;
    bool m_finalCrawlDelayValid;
    bool m_titleRecKeyValid;
    bool m_versionValid;
    bool m_rawUtf8ContentValid;
    bool m_expandedUtf8ContentValid;
    bool m_utf8ContentValid;
    bool m_isAllowedValid;
    bool m_redirUrlValid;
    bool m_redirCookieBufValid;
    bool m_metaRedirUrlValid;
    bool m_canonicalRedirUrlValid;
    bool m_statusMsgValid;
    bool m_mimeValid;
    bool m_hostHash32aValid;
    bool m_indexCodeValid;
    bool m_priorityValid;
    bool m_downloadStatusValid;
    bool m_downloadEndTimeValid;
    bool m_redirErrorValid;
    bool m_domHash32Valid;
    bool m_contentHash32Valid;
    bool m_tagPairHash32Valid;
    bool m_spiderLinksValid;
    bool m_firstIndexedDateValid;
    bool m_isPermalinkValid;
    bool m_isAdultValid;
    bool m_isUrlPermalinkFormatValid;
    bool m_percentChangedValid;
    bool m_unchangedValid;
    bool m_countTableValid;
    bool m_tagPairHashVecValid;
    bool m_summaryVecValid;
    bool m_pageSampleVecValid;
    bool m_postVecValid;
    bool m_dupListValid;
    bool m_isDupValid;
    bool m_metaDescValid;
    bool m_metaSummaryValid;
    bool m_metaKeywordsValid;
    bool m_metaGeoPlacenameValid;
    bool m_oldDocValid;
    bool m_extraDocValid;
    bool m_rootDocValid;
    bool m_oldTitleRecValid;
    bool m_rootTitleRecValid;
    bool m_isIndexedValid;
    bool m_siteNumInlinksValid;
    bool m_siteNumInlinks8Valid;
    bool m_siteLinkInfoValid;
    bool m_isWWWDupValid;
    bool m_linkInfo1Valid;
    bool m_linkSiteHashesValid;
    bool m_siteHash32Valid;
    bool m_httpReplyValid;
    bool m_contentTypeValid;
    bool m_outlinkTagRecVectorValid;
    bool m_outlinkIpVectorValid;
    bool m_hasNoIndexMetaTagValid;
    bool m_hasUseFakeIpsMetaTagValid;
    bool m_isSiteRootValid;
    bool m_wasContentInjectedValid;
    bool m_outlinkHopCountVectorValid;
    bool m_isFilteredValid;
    bool m_urlFilterNumValid;
    bool m_numOutlinksAddedValid;
    bool m_baseUrlValid;
    bool m_replyValid;
    bool m_crawlInfoValid;
    bool m_isPageParserValid;
    bool m_imageUrlValid;
    bool m_imageUrl2Valid;
    bool m_queryValid;
    bool m_matchesValid;
    bool m_dbufValid;
    bool m_titleValid;
    bool m_htbValid;
    bool m_collnumValid;
    bool m_summaryValid;
    bool m_spiderStatusDocMetaListValid;
    bool m_isCompromisedValid;
    bool m_isNoArchiveValid;
    bool m_titleRecBufValid;
    bool m_isLinkSpamValid;
    bool m_isErrorPageValid;
    bool m_exactContentHash64Valid;
    bool m_jpValid;

    char m_isSiteMap;

    // shadows
    char m_isRSS2;
    char m_isPermalink2;
    char m_isAdult2;
    char m_spiderLinks2;
    char m_isContentTruncated2;
    char m_isLinkSpam2;
    char m_isSiteRoot2;

    // DO NOT add validity flags below this line!
    char m_VALIDEND;

    bool m_printedMenu;

    char m_isUrlPermalinkFormat;

    int32_t m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
    int32_t m_tagPairHashVecSize;

    int32_t m_summaryVec [SAMPLE_VECTOR_SIZE/4];
    int32_t m_summaryVecSize;
    int32_t m_pageSampleVec[SAMPLE_VECTOR_SIZE/4];
    int32_t m_pageSampleVecSize;
    int32_t m_postVec[POST_VECTOR_SIZE/4];
    int32_t m_postVecSize;
    float m_pageSimilarity;
    float m_percentChanged;
    bool m_unchanged;

    // what docids are similar to us? docids are in this list
    RdbList m_dupList;

    int64_t m_exactContentHash64;

    Msg0 m_msg0;

    char m_isDup;
    int64_t m_docIdWeAreADupOf;

    Msg22Request m_msg22Request;
    Msg22Request m_msg22Requestc;
    Msg22 m_msg22a;
    Msg22 m_msg22b;
    Msg22 m_msg22c;
    Msg22 m_msg22e;
    Msg22 m_msg22f;

    // these now reference directly into the html src so our
    // WordPosInfo::m_wordPtr algo works in seo.cpp
    char *m_metaDesc;
    int32_t m_metaDescLen;
    char *m_metaSummary;
    int32_t m_metaSummaryLen;
    char *m_metaKeywords;
    int32_t m_metaKeywordsLen;
    char *m_metaGeoPlacename;
    int32_t m_metaGeoPlacenameLen;

    class XmlDoc *m_oldDoc;
    class XmlDoc *m_extraDoc;
    class XmlDoc *m_rootDoc;
    char *m_oldTitleRec;
    int32_t m_oldTitleRecSize;
    char *m_rootTitleRec;
    int32_t m_rootTitleRecSize;

    char m_isIndexed;

    // confusing, i know! these are used exclusively by
    // getNewSpiderReply() for now
    char m_isInIndex;
    char m_wasInIndex;

    Msg8a m_msg8a;

    Url m_extraUrl;

    uint8_t m_siteNumInlinks8;

    SafeBuf m_mySiteLinkInfoBuf;
    SafeBuf m_myPageLinkInfoBuf;

    char m_isInjecting;
    char m_isImporting;
    char m_useFakeMime;
    char m_useSiteLinkBuf;
    char m_usePageLinkBuf;
    char m_printInXml;

    SafeBuf m_tmpBuf11;
    SafeBuf m_tmpBuf12;
    Multicast m_mcast11;
    Multicast m_mcast12;

    MsgC m_msgc;

    bool m_isAllowed;
    bool m_isChildDoc;

    Msg13 m_msg13;
    Msg13Request m_msg13Request;

    bool m_isSpiderProxy;

    // for limiting # of iframe tag expansions
    int32_t m_numExpansions;

    char m_newOnly;
    char m_isWWWDup;

    SafeBuf m_linkSiteHashBuf;
    SafeBuf m_linkdbDataBuf;
    SafeBuf m_langVec;

    SiteGetter m_siteGetter;
    int32_t m_siteHash32;
    char *m_httpReply;
    char m_incrementedAttemptsCount;
    char m_incrementedDownloadCount;
    char m_useRobotsTxt;
    int32_t m_robotsTxtLen;
    int32_t m_httpReplySize;
    int32_t m_httpReplyAllocSize;
    char *m_filteredContent;
    int32_t m_filteredContentLen;
    int32_t m_filteredContentAllocSize;
    int32_t m_filteredContentMaxSize;
    char m_calledThread;
    int32_t m_errno;
    int32_t m_hostHash32a;
    int32_t m_domHash32;

    // this points into m_msge0 i guess
    Msge0 m_msge0;

    // this points into m_msge1 i guess
    int32_t *m_outlinkIpVector;
    SafeBuf m_fakeIpBuf;
    char m_hasNoIndexMetaTag;
    char m_hasUseFakeIpsMetaTag;
    Msge1 m_msge1;
    TagRec **m_outlinkTagRecVector;
    SafeBuf m_fakeTagRecPtrBuf;
    TagRec m_fakeTagRec;

    char *hashJSONFields2 ( HashTableX *table ,
                            HashInfo *hi ,
                            Json *jp ,
                            bool hashWithoutFieldNames ) ;

    Json *getParsedJson();
    // object that parses the json
    Json m_jp;

    // flow flags
    bool m_computedMetaListCheckSum;

    // cachedb related args
    bool m_allHashed;

    int8_t *m_outlinkHopCountVector;
    int32_t m_outlinkHopCountVectorSize;
    int32_t m_urlFilterNum;
    int32_t m_numOutlinksAdded;
    int32_t m_numRedirects;
    bool m_isPageParser;
    Url m_baseUrl;
    Msg20Reply m_reply;
    Msg20Request *m_req;
    char m_linkTextBuf[MAX_LINK_TEXT_LEN];
    char m_surroundingTextBuf[MAX_SURROUNDING_TEXT_WIDTH];
    char m_rssItemBuf[MAX_RSSITEM_SIZE];

    const char *m_note;

    Query m_query;
    Matches m_matches;

    // meta description buf
    int32_t m_dbufSize;
    char m_dbuf[1024];

    SafeBuf m_htb;
    Title m_title;
    Summary m_summary;

    char m_isNoArchive;
    char m_isErrorPage;

    // stuff
    const char *m_statusMsg;

    Msg4 m_msg4;

    bool m_deleteFromIndex;

    // ptrs to stuff
    SafeBuf m_titleRecBuf;
    key_t m_titleRecKey;

    // for isDupOfUs()
    char *m_dupTrPtr;
    int32_t m_dupTrSize;

    key_t m_doledbKey;

    SpiderRequest m_sreq;
    SpiderReply m_srep; //newsr;

    // bool flags for what procedures we have done
    bool m_checkedUrlFilters;
    bool m_listAdded;
    bool m_listFlushed;
    bool m_check1;
    bool m_check2;
    bool m_prepared;
    bool m_copied1;
    bool m_updatingSiteLinkInfoTags;
    bool m_didDelay;
    bool m_didDelayUnregister;
    bool m_calledMsg22e;
    bool m_calledMsg22f;
    bool m_calledMsg25;
    bool m_calledSections;
    bool m_loaded;
    bool m_doingConsistencyCheck;

    int32_t m_dist;

    // used to store a \0 list of "titles" of the root page so we can
    // see which if any are the venue name, and thus match that to
    // addresses of the venue on the site, and we can use those addresses
    // as default venue addresses when no venues are listed on a page
    // on that site.
    char m_rootTitleBuf[ROOT_TITLE_BUF_MAX];
    int32_t m_rootTitleBufSize;

    // . this is filtered
    // . certain punct is replaced with \0
    char m_filteredRootTitleBuf[ROOT_TITLE_BUF_MAX];
    int32_t m_filteredRootTitleBufSize;

    // like m_rootTitleBuf but for the current page
    char m_titleBuf[ROOT_TITLE_BUF_MAX];
    int32_t m_titleBufSize;

    bool m_setTr;

    void (*m_masterLoop) ( void *state );
    void *m_masterState;

    void (*m_callback1) ( void *state );
    bool (*m_callback2) ( void *state );
    void *m_state;

    // this is non-zero if we decided not to index the doc
    int32_t m_indexCode;

    // the spider priority
    int32_t m_priority;

    // the download error, like ETIMEDOUT, ENOROUTE, etc.
    int32_t m_downloadStatus;

    // . when the download was completed. will be zero if no download done
    // . used to set SpiderReply::m_downloadEndTime because we need
    //   high resolution for that so we can dole out the next spiderrequest
    //   from that IP quickly if the sameipwait is like 500ms.
    int64_t m_downloadEndTime;

    int32_t m_metaListAllocSize;
    char *m_p;
    char *m_pend;

    int32_t m_maxCacheAge;
    bool m_registeredSleepCallback;
    bool m_hashedTitle;
    bool m_hashedMetas;

    int32_t m_niceness;

    bool m_usePosdb;
    bool m_useClusterdb;
    bool m_useLinkdb;
    bool m_useSpiderdb;
    bool m_useTitledb;
    bool m_useTagdb;
    bool m_useSecondaryRdbs;

    SafeBuf *m_pbuf;
    // used by SpiderLoop to set m_pbuf to
    SafeBuf m_sbuf;

    // store termlist into here if non-null
    bool m_storeTermListInfo;
    char m_sortTermListBy;

    // store the terms that we hash into this table so that PageParser.cpp
    // can print what was hashed and with what score and what description
    // (see the sketch just below)
    class HashTableX *m_wts;
    HashTableX m_wtsTable;
    SafeBuf m_wbuf;

    // Msg25.cpp stores its pageparser.cpp output into this one
    SafeBuf m_pageLinkBuf;
    SafeBuf m_siteLinkBuf;
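    // A minimal sketch of reading the debug term table above back out: each
    // slot holds a TermDebugInfo (defined at the bottom of this file) whose
    // term text lives in m_wbuf rather than in the struct itself. The names
    // "xd" and "ti" are illustrative:
    //
    //   TermDebugInfo *ti   = ... ; // one entry pulled out of xd->m_wtsTable
    //   const char    *term = xd->m_wbuf.getBufStart() + ti->m_termOff;
    //   int32_t        tlen = ti->m_termLen;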
    // which set() function was called above to set us?
    bool m_setFromTitleRec;
    bool m_setFromSpiderRec;
    bool m_setFromUrl;
    bool m_setFromDocId;
    bool m_freeLinkInfo1;
    bool m_contentInjected;
    bool m_recycleContent;

    char *m_rawUtf8Content;
    int32_t m_rawUtf8ContentSize;
    int32_t m_rawUtf8ContentAllocSize; // we overallocate sometimes
    char *m_expandedUtf8Content;
    int32_t m_expandedUtf8ContentSize;

    char *m_savedp;
    char *m_oldp;
    bool m_didExpansion;
    SafeBuf m_esbuf;

    // used by msg13
    class Msg13Request *m_r;

    // Msg20 uses this to stash its TcpSlot
    void *m_slot;

    bool m_freed;
    bool m_msg4Waiting;
    bool m_msg4Launched;

    // word spam detection
    char *getWordSpamVec ( );
    bool setSpam ( const int32_t *profile , int32_t plen ,
                   int32_t numWords , unsigned char *spam );
    int32_t getProbSpam ( const int32_t *profile , int32_t plen , int32_t step );
    bool m_isRepeatSpammer;
    int32_t m_numRepeatSpam;

    // frag vector (repeated fragments). 0 means repeated, 1 means not.
    // vector is 1-1 with words in the document body.
    char *getFragVec ( );

    bool injectDoc ( char *url ,
                     class CollectionRec *cr ,
                     char *content ,
                     bool contentHasMime ,
                     int32_t hopCount ,
                     int32_t charset ,
                     bool deleteUrl ,
                     char *contentTypeStr , // text/html, text/xml etc.
                     bool spiderLinks ,
                     char newOnly , // index iff new
                     void *state ,
                     void (*callback)(void *state) ,
                     uint32_t firstIndexedTime = 0 ,
                     uint32_t lastSpideredDate = 0 ,
                     int32_t injectDocIp = 0 );

    int64_t logQueryTimingStart();
    void logQueryTimingEnd ( const char *function , int64_t startTime );

    int32_t m_i;
};

// . PageParser.cpp uses this class for printing hashed terms out by calling
//   XmlDoc::print()
// . we store TermDebugInfos into XmlDoc::m_wtsTable, a HashTableX
// . one for each term hashed
// . the key is the termId. dups are allowed
// . the term itself is stored into a separate buffer, m_wbuf, a SafeBuf, so
//   that the term text (referenced via m_termOff/m_termLen) won't disappear
//   on us
class TermDebugInfo {
public:
    int32_t m_termOff;
    int32_t m_termLen;
    int32_t m_descOff;   // the description offset
    int32_t m_prefixOff; // the prefix offset, like "site" or "gbadid"
    int64_t m_termId;
    int32_t m_date;
    bool m_shardByTermId;
    char m_langId;
    char m_diversityRank;
    char m_densityRank;
    char m_wordSpamRank;
    char m_hashGroup;
    int32_t m_wordNum;
    int32_t m_wordPos;
    POSDBKEY m_key; // key144_t
    // 0 = not a syn, 1 = syn from presets, 2 = wikt, 3 = generated
    char m_synSrc;
    int64_t m_langBitVec64;
};

extern uint8_t score32to8 ( uint32_t score ) ;

#endif // GB_XMLDOC_H