// Matt Wells, copyright Apr 2009
// . two ways to set this class up:
// . 1. you can set m_firstUrl (and whatever else is known) and then call
//      getMetaList()
// . 2. you can also set it from a title rec (set2()) and then call
//      getMetaList()
// . this class is used by Repair.cpp and by Msg7 (inject) and SpiderLoop.cpp
// . Msg7, Repair.cpp and injections can also set more than just
//   m_firstUrl, like m_content, etc., or whatever elements are known, but
//   they must also set the corresponding "valid" flags of those elements
// . both methods must yield exactly the same result, the same "meta list"
// . after setting the contained classes, XmlDoc::getMetaList() makes the list
//   of rdb records to be added to all the rdbs; this is the "meta list"
// . the meta list is made by hashing all the termIds/scores into some hash
//   tables in order to accumulate scores, then the hash tables are serialized
//   into the "meta list"
// . the meta list is added to all rdbs with a simple call to
//   Msg4::addMetaList(), which is only called by Msg14 or Repair.cpp for now
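//
// . a rough usage sketch of the two paths (variable names here are
//   hypothetical; the real call sequences in SpiderLoop.cpp, Msg7 and
//   Repair.cpp are more involved, and the get*() calls may block,
//   returning NULL and invoking the registered callback when done):
//
//     XmlDoc xd;
//     // 1. from a SpiderRequest (spidering / injection)
//     xd.set4 ( &sreq , &doledbKey , coll , NULL /*pbuf*/ , 1 /*niceness*/ );
//     char *metaList = xd.getMetaList();
//
//     // 2. from an existing title rec (Repair.cpp rebuilds)
//     XmlDoc xd2;
//     xd2.set2 ( titleRec , titleRecSize /*maxSize*/ , coll , 1 /*niceness*/ );
//     char *metaList2 = xd2.getMetaList();
//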
#ifndef GB_XMLDOC_H
#define GB_XMLDOC_H
#include "Lang.h"
#include "tokenizer.h"
#include "Bits.h"
#include "Pos.h"
#include "Phrases.h"
#include "Xml.h"
#include "SafeBuf.h"
#include "Images.h"
#include "Sections.h"
#include "Msge0.h"
#include "Msge1.h"
#include "Msg4Out.h"
#include "SearchInput.h"
#include "Msg40.h"
#include "Msg0.h"
#include "Msg22.h"
#include "Tagdb.h"
#include "Url.h"
#include "Linkdb.h"
#include "MsgC.h"
#include "Msg13.h"
#include "RdbList.h"
#include "SiteGetter.h"
#include "Msg20.h"
#include "Matches.h"
#include "Query.h"
#include "Title.h"
#include "Summary.h"
#include "Spider.h" // SpiderRequest/SpiderReply definitions
#include "HttpMime.h" // ET_DEFLAT
#include "Json.h"
#include "Posdb.h"
#include <unordered_set>
// forward declaration
class GetMsg20State;
class HashTableX;
class HashInfo;
namespace GbDns {
struct DnsResponse;
}
#define MAXFRAGWORDS 80000
#define MAX_TAG_PAIR_HASHES 100
#include "Msg40.h"
#define POST_VECTOR_SIZE (32*4)
#define MAX_LINK_TEXT_LEN 512
#define MAX_SURROUNDING_TEXT_WIDTH 600
#define MAX_RSSITEM_SIZE 30000
bool getDensityRanks ( const TokenizerResult *tr,
int32_t hashGroup ,
SafeBuf *densBuf ,
const Sections *sections);
// diversity vector
bool getDiversityVec ( const TokenizerResult *tr,
const Phrases *phrases ,
class HashTableX *countTable ,
SafeBuf *sbWordVec );
float computeSimilarity ( const int32_t *vec0,
const int32_t *vec1,
// corresponding scores vectors
const int32_t *s0,
const int32_t *s1,
class Query *q ,
// only Sections::addDateBasedImpliedSections()
// sets this to true right now. if set to true
// we essentially dedup each vector, although
// the score is compounded into the remaining
// occurrence. i'm not sure if that is the right
// behavior though.
bool dedupVecs = false );
// tell zlib to use our malloc/free functions
int gbuncompress(unsigned char *dest,
uint32_t *destLen,
const unsigned char *source,
uint32_t sourceLen);
int gbcompress(unsigned char *dest,
uint32_t *destLen,
const unsigned char *source,
uint32_t sourceLen);
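// . a minimal round-trip sketch, assuming these wrappers follow zlib's
//   compress()/uncompress() convention (caller sizes dest, *destLen is
//   in/out, 0 means success); "src", "srcLen" and "dst" are hypothetical:
//
//     unsigned char cbuf[4096];
//     uint32_t      clen = sizeof(cbuf);
//     if ( gbcompress   ( cbuf , &clen , src , srcLen ) != 0 ) { /*error*/ }
//     uint32_t      ulen = srcLen;    // original size is known here
//     if ( gbuncompress ( dst  , &ulen , cbuf , clen  ) != 0 ) { /*error*/ }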
// . for Msg13.cpp
// . the byte at p + plen (i.e. *pend) must be \0
int32_t getContentHash32Fast ( unsigned char *p , int32_t plen ) ;
bool getWordPosVec ( const TokenizerResult *tr,
const Sections *sections,
int32_t startDist,
const char *fragVec,
SafeBuf *wpos );
#define ROOT_TITLE_BUF_MAX 512
class XmlDoc {
public:
/// @warning Do NOT change. titlerec binary compatibility header
//
// BEGIN WHAT IS STORED IN THE TITLE REC (Titledb.h)
//
// headerSize = (char *)&this->ptr_firstUrl - (char *)&this->m_headerSize
uint16_t m_headerSize;
uint16_t m_version;
// these flags are used to indicate which ptr_ members are present:
uint32_t m_internalFlags1;
int32_t m_ip;
int32_t m_crawlDelay;
// . use this to quickly detect if doc is unchanged
// . we can avoid setting Xml and Words classes etc...
int32_t m_contentHash32;
// this is a hash of all adjacent tag pairs for template identification
uint32_t m_tagPairHash32;
int32_t m_siteNumInlinks;
// this is non-zero if we decided not to index the doc
int32_t m_indexCode;
int32_t m_reserved2;
uint32_t m_spideredTime; // time_t
uint32_t m_indexedTime; // slightly > m_spideredTime (time_t)
uint32_t m_reserved32;
uint32_t m_reserved33;
uint32_t m_firstIndexedDate; // time_t
uint32_t m_outlinksAddedDate; // time_t
uint16_t m_charset; // the ORIGINAL charset, we are always utf8!
uint16_t m_countryId;
int32_t m_reserved3;
uint8_t m_metaListCheckSum8; // bring it back!!
char m_reserved3b;
uint16_t m_bodyStartPos;
uint16_t m_reserved5;
uint16_t m_unused0;
int16_t m_httpStatus; // -1 if not found (empty http reply)
int8_t m_reserved5a;
uint8_t m_langId;
uint8_t m_reserved6;
uint8_t m_contentType;
// bit flags
uint16_t m_isRSS:1;
uint16_t m_isPermalink:1;
uint16_t m_isAdult:1;
uint16_t m_wasContentInjected:1;
uint16_t m_spiderLinks:1;
uint16_t m_isContentTruncated:1;
uint16_t m_isLinkSpam:1;
uint16_t m_reserved796:1;
uint16_t m_reserved797:1;
uint16_t m_reserved798:1;
uint16_t m_reserved799:1;
uint16_t m_isSiteRoot:1;
uint16_t m_reserved800:1;
uint16_t m_reserved801:1;
uint16_t m_reserved802:1;
uint16_t m_reserved803:1;
uint16_t m_reserved805:1;
uint16_t m_reserved806:1;
uint16_t m_reserved807:1;
uint16_t m_reserved808:1;
uint16_t m_reserved809:1;
uint16_t m_reserved810:1;
uint16_t m_reserved811:1;
uint16_t m_reserved812:1;
uint16_t m_reserved813:1;
uint16_t m_reserved814:1;
uint16_t m_reserved815:1;
uint16_t m_reserved816:1;
//end of titlerec binary compatibility header
/// @warning Do NOT change the structure of the following until m_dummyEnd.
/// check in XmlDoc::set2 (sanity check. must match exactly)
char *ptr_firstUrl;
char *ptr_redirUrl;
char *ptr_rootTitleBuf;
int32_t *ptr_unused12;
int32_t *ptr_unused13;
void *ptr_unused8;
int64_t *ptr_unused10;
float *ptr_unused11;
char *ptr_imageData;
int32_t *ptr_unused6;
int32_t *ptr_unused7;
char *ptr_explicitKeywords;
char *ptr_unused2;
char *ptr_unused3;
char *ptr_utf8Content;
char *ptr_unused5;
// do not let SiteGetter change this when we re-parse!
char *ptr_site;
LinkInfo *ptr_linkInfo1;
char *ptr_linkdbData;
char *ptr_unused14;
char *ptr_tagRecData;
LinkInfo *ptr_unused9;
int32_t size_firstUrl;
int32_t size_redirUrl;
int32_t size_rootTitleBuf;
int32_t size_unused12;
int32_t size_unused13;
int32_t size_unused8;
int32_t size_unused10;
int32_t size_unused11;
int32_t size_imageData;
int32_t size_unused6;
int32_t size_unused7;
int32_t size_explicitKeywords;
int32_t size_unused2;
int32_t size_unused3;
int32_t size_utf8Content;
int32_t size_unused5;
int32_t size_site;
int32_t size_linkInfo1;
int32_t size_linkdbData;
int32_t size_unused14;
int32_t size_tagRecData;
int32_t size_unused9;
char m_dummyEnd;
//
// END WHAT IS STORED IN THE TITLE REC (Titledb.h)
//
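// . a rough picture of the serialized (uncompressed) title rec, as implied
//   by the layout above (the exact packing lives in XmlDoc.cpp/Titledb.h;
//   m_internalFlags1 flags which ptr_ buffers are present, and the data of
//   the present buffers is concatenated after the fixed header):
//
//     [ m_headerSize .. m_dummyEnd ]      fixed-size header + ptr_/size_ table
//     [ firstUrl bytes     ]  size_firstUrl
//     [ redirUrl bytes     ]  size_redirUrl
//     [ ...                ]  one chunk per present ptr_ member
//     [ tagRecData bytes   ]  size_tagRecData
//
//   on load, set2() points each ptr_ member back into this buffer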
char *ptr_scheme;
int32_t size_scheme;
public:
bool set2 ( char *titleRec,
int32_t maxSize,
const char *coll,
int32_t niceness ,
class SpiderRequest *sreq = NULL );
// . since we are being set from a docId, we will load the old title rec
// and use that!
// . used by PageGet.cpp
bool set3 ( int64_t docId ,
const char *coll ,
int32_t niceness );
bool set4 ( class SpiderRequest *sreq ,
const key96_t *doledbKey,
const char *coll ,
SafeBuf *pbuf ,
int32_t niceness ,
char *utf8Content = NULL ,
bool deleteFromIndex = false ,
int32_t forcedIp = 0 ,
uint8_t contentType = CT_HTML ,
uint32_t spideredTime = 0 , // time_t
bool contentHasMime = false );
// we now call this right away rather than at download time!
int32_t getSpideredTime();
// time right before adding the termlists to the index, etc.
// whereas spider time is the download time
int32_t getIndexedTime();
// another entry point, like set3() kinda
bool loadFromOldTitleRec ();
XmlDoc() ;
~XmlDoc() ;
void nukeDoc ( class XmlDoc *);
void reset ( ) ;
bool setFirstUrl ( const char *u ) ;
void setStatus ( const char *s ) ;
void setCallback ( void *state, void (*callback) (void *state) ) ;
void setCallback ( void *state, bool (*callback) (void *state) ) ;
void getRevisedSpiderRequest ( class SpiderRequest *revisedReq );
void getRebuiltSpiderRequest ( class SpiderRequest *sreq ) ;
bool indexDoc ( );
bool indexDoc2 ( );
char *prepareToMakeTitleRec ( ) ;
// store TitleRec into "buf" so it can be added to metalist
bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 );
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
SafeBuf *getTitleRecBuf ( );
char *getIsAdult ( ) ;
bool *checkBlockList();
unsigned *getDefaultSitePageTemperature();
bool *parseRobotsMetaTag();
void parseRobotsMetaTagContent(const char *content, int32_t contentLen);
char *getIsPermalink ( ) ;
char *getIsUrlPermalinkFormat ( ) ;
char *getIsRSS ( ) ;
bool *getIsSiteMap ( ) ;
class Xml *getXml ( ) ;
uint8_t *getLangVector ( ) ;
uint8_t *getLangId ( ) ;
lang_t getSummaryLangIdCLD2();
lang_t getContentLangIdCLD2();
lang_t getContentLangIdCLD3();
uint8_t computeLangId(Sections *sections, const TokenizerResult *tr, char *lv);
TokenizerResult *getTokenizerResult();
TokenizerResult *getTokenizerResult2();
class Bits *getBits ( ) ;
class Bits *getBitsForSummary ( ) ;
class Pos *getPos ( );
class Phrases *getPhrases ( ) ;
class Sections *getSections ( ) ;
int32_t *getLinkSiteHashes ( );
class Links *getLinks ( bool doQuickSet = false ) ;
class HashTableX *getCountTable ( ) ;
bool hashString_ct(HashTableX *ht, const char *s, int32_t slen);
int32_t *getSummaryVector ( ) ;
int32_t *getPageSampleVector ( ) ;
int32_t *getPostLinkTextVector ( int32_t linkNode ) ;
int32_t computeVector ( const TokenizerResult *tr, uint32_t *vec , int32_t start = 0 , int32_t end = -1 );
float *getPageSimilarity ( class XmlDoc *xd2 ) ;
float *getPercentChanged ( );
int64_t *getExactContentHash64();
class RdbList *getDupList ( ) ;
char *getIsDup ( ) ;
char *getMetaDescription( int32_t *mdlen ) ;
char *getMetaSummary ( int32_t *mslen ) ;
char *getMetaKeywords( int32_t *mklen ) ;
char *getMetaGeoPlacename( int32_t *mgplen );
class Url *getCurrentUrl ( ) ;
class Url *getFirstUrl() ;
int64_t getFirstUrlHash48();
int64_t getFirstUrlHash64();
class Url **getRedirUrl() ;
class Url **getMetaRedirUrl() ;
class Url *getCanonicalUrl();
class Url **getCanonicalRedirUrl ( ) ;
int32_t *getFirstIndexedDate ( ) ;
int32_t *getOutlinksAddedDate ( ) ;
uint16_t *getCountryId ( ) ;
class XmlDoc **getOldXmlDoc ( ) ;
class XmlDoc **getExtraDoc(const char *url, int32_t maxCacheAge = 0);
bool getIsPageParser ( ) ;
class XmlDoc **getRootXmlDoc ( int32_t maxCacheAge = 0 ) ;
char **getOldTitleRec ( );
char **getRootTitleRec ( ) ;
int64_t *getDocId ( ) ;
char *getIsIndexed ( ) ;
class TagRec *getTagRec ( ) ;
class TagRec *getCurrentTagRec ( ) ;
// non-dup/nondup addresses only
int32_t *getFirstIp ( ) ;
int32_t *getSiteNumInlinks ( ) ;
class LinkInfo *getSiteLinkInfo() ;
int32_t *getIp ( ) ;
void setIp(GbDns::DnsResponse *response);
std::vector<std::string>* getHostNameServers(const char *hostname, size_t hostnameLen);
static void gotHostNameServersWrapper(GbDns::DnsResponse *response, void *state);
static void gotIpWrapper(GbDns::DnsResponse *response, void *state);
bool *getIsAllowed ( ) ;
int32_t *getFinalCrawlDelay();
int32_t m_finalCrawlDelay;
char *getIsWWWDup ( ) ;
class LinkInfo *getLinkInfo1 ( ) ;
char *getSite ( ) ;
const char *getScheme ( ) ;
void gotSite ( ) ;
int32_t *getSiteHash32 ( ) ;
char **getHttpReply ( ) ;
char **getHttpReply2 ( ) ;
char **gotHttpReply ( ) ;
char *getIsContentTruncated ( );
int32_t *getDownloadStatus ( ) ;
int64_t *getDownloadEndTime ( ) ;
int16_t *getHttpStatus ( );
class HttpMime *getMime () ;
char **getContent ( ) ;
uint8_t *getContentType ( ) ;
uint16_t *getCharset ( ) ;
char **getFilteredContent ( ) ;
void filterStart_r ( bool amThread ) ;
char **getRawUtf8Content ( ) ;
char **getExpandedUtf8Content ( ) ;
char **getUtf8Content ( ) ;
// we download large files to a file on disk, like warcs and arcs
int32_t *getContentHash32 ( ) ;
int32_t *getContentHashJson32 ( ) ;
int32_t *getTagPairHashVector ( ) ;
uint32_t *getTagPairHash32 ( ) ;
int32_t getHostHash32a ( ) ;
int32_t getDomHash32 ( );
char **getThumbnailData();
class Images *getImages ( ) ;
class TagRec ***getOutlinkTagRecVector () ;
int32_t **getOutlinkFirstIpVector () ;
char *getIsSiteRoot ( ) ;
char *getSpiderLinks ( ) ;
bool getIsInjecting();
int32_t *getSpiderPriority ( ) ;
int32_t *getIndexCode ( ) ;
SafeBuf *getNewTagBuf ( ) ;
void logIt ( SafeBuf *bb = NULL ) ;
bool m_doConsistencyTesting;
bool doConsistencyTest ( bool forceTest ) ;
void printMetaList() const;
void printMetaList ( char *metaList , char *metaListEnd ,
SafeBuf *pbuf );
bool verifyMetaList ( char *p , char *pend , bool forDelete ) ;
bool hashMetaList ( class HashTableX *ht ,
char *p ,
char *pend ,
bool checkList ) ;
char *getMetaList ( bool forDelete = false );
uint64_t m_downloadStartTime;
uint64_t m_ipStartTime;
uint64_t m_ipEndTime;
int64_t m_getLinkInfoStartTime;
int64_t m_getLinkInfoEndTime;
int64_t m_getSiteLinkInfoStartTime;
int64_t m_getSiteLinkInfoEndTime;
bool m_updatedMetaData;
void copyFromOldDoc ( class XmlDoc *od ) ;
class SpiderReply *getFakeSpiderReply ( );
// we add a SpiderReply to spiderdb when done spidering, even if
// m_indexCode or g_errno was set!
class SpiderReply *getNewSpiderReply ( );
void setSpiderReqForMsg20 ( class SpiderRequest *sreq ,
class SpiderReply *srep );
char *addOutlinkSpiderRecsToMetaList ( );
void lookupAndSetExplicitKeywords();
int32_t getSiteRank ();
bool addTable144 ( class HashTableX *tt1 ,
int64_t docId ,
SafeBuf *buf = NULL );
bool addTable224 ( HashTableX *tt1 ) ;
bool hashNoSplit ( class HashTableX *tt ) ;
char *hashAll ( class HashTableX *table ) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashContentType ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table, bool urlOnly );
bool hashIncomingLinkText(HashTableX *table);
bool hashLinksForLinkdb ( class HashTableX *table ) ;
bool hashNeighborhoods ( class HashTableX *table ) ;
bool hashTitle ( class HashTableX *table );
bool hashBody2 ( class HashTableX *table );
bool hashMetaKeywords ( class HashTableX *table );
bool hashExplicitKeywords(HashTableX *table);
bool hashMetaGeoPlacename( class HashTableX *table );
bool hashMetaSummary ( class HashTableX *table );
bool hashLanguage ( class HashTableX *table ) ;
bool hashLanguageString ( class HashTableX *table ) ;
bool hashCountry ( class HashTableX *table ) ;
bool hashLemmas(class HashTableX *table);
void sortTokenizerResult(TokenizerResult *tr);
void getLanguageAndCountry(lang_t *lang, const char **country_code);
class Url *getBaseUrl ( ) ;
void setMsg20Request(Msg20Request *req);
class Msg20Reply *getMsg20Reply ( ) ;
class Msg20Reply *getMsg20ReplyStepwise();
void loopUntilMsg20ReplyReady(GetMsg20State *);
static void getMsg20ReplyThread(void *pv);
void getMsg20ReplyThread();
static void msg20Done(void *pv, job_exit_t exit_type);
void msg20Done(job_exit_t exit_type);
Query *getQuery() ;
Matches *getMatches () ;
char *getDescriptionBuf ( char *displayMetas , int32_t *dlen ) ;
SafeBuf *getHeaderTagBuf();
class Title *getTitle ();
class Summary *getSummary () ;
char *getHighlightedSummary ( bool *isSetFromTagsPtr );
bool *getIsNoArchive();
bool *getIsNoFollow();
bool *getIsNoIndex();
bool *getIsNoSnippet();
int32_t *getUrlFilterNum();
char *getIsLinkSpam ( ) ;
char *getIsErrorPage ( ) ;
const char* matchErrorMsg(char* p, char* pend );
bool hashWords( class HashInfo *hi );
bool hashSingleTerm( const char *s, int32_t slen, class HashInfo *hi );
bool hashString( const char *s, int32_t slen, class HashInfo *hi );
bool hashString(size_t begin_token, size_t end_token, HashInfo *hi);
bool hashString3(const char *s, int32_t slen, class HashInfo *hi,
HashTableX *wts, SafeBuf *wbuf);
bool hashString3(size_t begin_token, size_t end_token, HashInfo *hi,
HashTableX *wts, SafeBuf *wbuf);
bool hashWords3(HashInfo *hi, const TokenizerResult *tr,
Sections *sections, const Bits *bits,
const char *fragVec, const char *wordSpamVec, const char *langVec,
HashTableX *wts, SafeBuf *wbuf);
bool hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_token, size_t end_token,
Sections *sections, const Bits *bits,
const char *fragVec, const char *wordSpamVec, const char *langVec,
HashTableX *wts, SafeBuf *wbuf);
bool hashString4(const char *s, int32_t slen, HashInfo *hi);
// print out for PageTitledb.cpp and PageParser.cpp
bool printDoc ( class SafeBuf *pbuf );
bool printMenu ( class SafeBuf *pbuf );
bool printDocForProCog ( class SafeBuf *sb , HttpRequest *hr ) ;
bool printGeneralInfo ( class SafeBuf *sb , HttpRequest *hr ) ;
bool printRainbowSections ( class SafeBuf *sb , HttpRequest *hr );
bool printPageInlinks ( class SafeBuf *sb , HttpRequest *hr );
bool printTermList ( class SafeBuf *sb , HttpRequest *hr );
bool printCachedPage ( class SafeBuf *sb , HttpRequest *hr );
char *getTitleBuf ( );
char *getRootTitleBuf ( );
char *getFilteredRootTitleBuf ( );
public:
// stuff set from the key of the titleRec, above the compression area
int64_t m_docId;
char *m_ubuf;
int32_t m_ubufSize;
int32_t m_ubufAlloc;
// private:
// when we started spidering it, in milliseconds since the epoch
int64_t m_startTime;
class XmlDoc *m_prevInject;
class XmlDoc *m_nextInject;
// when set() was called by Msg20.cpp so we can time how long it took
// to generate the summary
int64_t m_setTime;
int64_t m_cpuSummaryStartTime;
// . these should all be set using set*() function calls so their
// individual validity flags can be set to true, and successive
// calls to their corresponding get*() functions will not core
// . these particular guys are set immediately on set(char *titleRec)
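// . the get*() accessors follow a common pattern, roughly like this sketch
//   (the real accessors in XmlDoc.cpp also handle blocking: they return
//   NULL if they had to launch an async lookup, or a sentinel like
//   (void *)-1 with g_errno set on error):
//
//     int32_t *XmlDoc::getIp ( ) {
//         if ( m_ipValid ) return &m_ip;     // already computed? return it
//         // ... launch/perform the DNS lookup, possibly blocking ...
//         m_ipValid = true;                  // mark as computed
//         return &m_ip;
//     }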
Url m_redirUrl;
Url *m_redirUrlPtr;
SafeBuf m_redirCookieBuf;
Url m_metaRedirUrl;
Url *m_metaRedirUrlPtr;
Url m_canonicalUrl;
Url *m_canonicalRedirUrlPtr;
int32_t m_redirError;
bool m_allowSimplifiedRedirs;
Url m_firstUrl;
int64_t m_firstUrlHash48;
int64_t m_firstUrlHash64;
Url m_currentUrl;
collnum_t m_collnum;
class CollectionRec *getCollRec ( ) ;
bool setCollNum ( const char *coll ) ;
char *m_content;
int32_t m_contentLen;
char *m_metaList;
int32_t m_metaListSize;
int32_t m_addedSpiderRequestSize;
int32_t m_addedSpiderReplySize;
SafeBuf m_metaList2;
// used by msg7 to store udp slot
class UdpSlot *m_injectionSlot;
// . same thing, a little more complicated
// . these classes are only set on demand
Xml m_xml;
Links m_links;
TokenizerResult m_tokenizerResult;
Bits m_bits;
Bits m_bits2;
Pos m_pos;
Phrases m_phrases;
Sections m_sections;
// . for rebuild logging of what's changed
// . Repair.cpp sets these based on titlerec
char m_logLangId;
int32_t m_logSiteNumInlinks;
bool isFirstUrlRobotsTxt();
bool m_isRobotsTxtUrl;
bool* isFirstUrlCanonical();
bool m_isUrlCanonical;
Images m_images;
HashTableX m_countTable;
HttpMime m_mime;
TagRec m_tagRec;
SafeBuf m_tagRecBuf;
TagRec m_currentTagRec;
SafeBuf m_newTagBuf;
SafeBuf m_fragBuf;
SafeBuf m_wordSpamBuf;
SafeBuf m_finalSummaryBuf;
bool m_isFinalSummarySetFromTags;
int32_t m_firstIp;
class SafeBuf *m_savedSb;
class HttpRequest *m_savedHr;
// validity flags. on reset() all these are set to false.
char m_VALIDSTART;
// DO NOT add validity flags above this line!
bool m_metaListValid;
bool m_addedSpiderRequestSizeValid;
bool m_addedSpiderReplySizeValid;
bool m_downloadStartTimeValid;
bool m_siteValid;
bool m_startTimeValid;
bool m_currentUrlValid;
bool m_firstUrlValid;
bool m_firstUrlHash48Valid;
bool m_firstUrlHash64Valid;
bool m_docIdValid;
bool m_tagRecValid;
bool m_currentTagRecValid;
bool m_robotsTxtLenValid;
bool m_tagRecDataValid;
bool m_newTagBufValid;
bool m_rootTitleBufValid;
bool m_filteredRootTitleBufValid;
bool m_titleBufValid;
bool m_fragBufValid;
bool m_isRobotsTxtUrlValid;
bool m_isUrlCanonicalValid;
bool m_wordSpamBufValid;
bool m_finalSummaryBufValid;
bool m_isInjectingValid;
bool m_metaListCheckSum8Valid;
bool m_contentValid;
bool m_filteredContentValid;
bool m_charsetValid;
bool m_langVectorValid;
bool m_langIdValid;
bool m_isRSSValid;
bool m_isSiteMapValid;
bool m_isContentTruncatedValid;
bool m_xmlValid;
bool m_linksValid;
bool m_tokenizerResultValid;
bool m_tokenizerResultValid2;
bool m_bitsValid;
bool m_bits2Valid;
bool m_posValid;
bool m_phrasesValid;
bool m_sectionsValid;
bool m_imageDataValid;
bool m_imagesValid;
bool m_sreqValid;
bool m_srepValid;
bool m_ipValid;
bool m_firstIpValid;
bool m_spideredTimeValid;
bool m_indexedTimeValid;
bool m_isInIndexValid;
bool m_wasInIndexValid;
bool m_outlinksAddedDateValid;
bool m_countryIdValid;
bool m_bodyStartPosValid;
bool m_httpStatusValid;
bool m_crawlDelayValid;
bool m_finalCrawlDelayValid;
bool m_titleRecKeyValid;
bool m_versionValid;
bool m_rawUtf8ContentValid;
bool m_expandedUtf8ContentValid;
bool m_utf8ContentValid;
bool m_isAllowedValid;
bool m_redirUrlValid;
bool m_redirCookieBufValid;
bool m_metaRedirUrlValid;
bool m_canonicalUrlValid;
bool m_canonicalRedirUrlValid;
bool m_statusMsgValid;
bool m_mimeValid;
bool m_hostHash32aValid;
bool m_indexCodeValid;
bool m_priorityValid;
bool m_downloadStatusValid;
bool m_downloadEndTimeValid;
bool m_redirErrorValid;
bool m_domHash32Valid;
bool m_contentHash32Valid;
bool m_tagPairHash32Valid;
bool m_spiderLinksValid;
bool m_firstIndexedDateValid;
bool m_isPermalinkValid;
bool m_isAdultValid;
bool m_isUrlPermalinkFormatValid;
bool m_percentChangedValid;
bool m_countTableValid;
bool m_tagPairHashVecValid;
bool m_summaryVecValid;
bool m_pageSampleVecValid;
bool m_postVecValid;
bool m_dupListValid;
bool m_isDupValid;
bool m_metaDescValid;
bool m_metaSummaryValid;
bool m_metaKeywordsValid;
bool m_metaGeoPlacenameValid;
bool m_oldDocValid;
bool m_extraDocValid;
bool m_rootDocValid;
bool m_oldTitleRecValid;
bool m_rootTitleRecValid;
bool m_isIndexedValid;
bool m_siteNumInlinksValid;
bool m_siteLinkInfoValid;
bool m_isWWWDupValid;
bool m_linkInfo1Valid;
bool m_linkSiteHashesValid;
bool m_siteHash32Valid;
bool m_httpReplyValid;
bool m_contentTypeValid;
bool m_outlinkTagRecVectorValid;
bool m_outlinkIpVectorValid;
bool m_isSiteRootValid;
bool m_wasContentInjectedValid;
bool m_urlFilterNumValid;
bool m_numOutlinksAddedValid;
bool m_baseUrlValid;
bool m_replyValid;
bool m_isPageParserValid;
bool m_queryValid;
bool m_matchesValid;
bool m_dbufValid;
bool m_titleValid;
bool m_htbValid;
bool m_collnumValid;
bool m_summaryValid;
bool m_titleRecBufValid;
bool m_isLinkSpamValid;
bool m_isErrorPageValid;
bool m_exactContentHash64Valid;
bool m_jpValid;
bool m_blockedDocValid;
bool m_defaultSitePageTemperatureValid;
bool m_hostNameServersValid;
bool m_ipsValid;
bool m_isSiteMap;
// shadows
char m_isRSS2;
char m_isPermalink2;
char m_isAdult2;
char m_spiderLinks2; // May be -1
char m_isContentTruncated2;
char m_isLinkSpam2;
char m_isSiteRoot2;
// DO NOT add validity flags below this line!
char m_VALIDEND;
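// . the m_VALIDSTART/m_VALIDEND markers exist so reset() can clear every
//   validity flag in one pass; presumably something like this sketch
//   (see XmlDoc.cpp for the real code):
//
//     memset ( &m_VALIDSTART , 0 , &m_VALIDEND - &m_VALIDSTART );
//
//   which is why no validity flag may be declared outside that span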
bool m_printedMenu;
char m_isUrlPermalinkFormat;
int32_t m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
int32_t m_tagPairHashVecSize;
int32_t m_summaryVec [SAMPLE_VECTOR_SIZE/4];
int32_t m_summaryVecSize;
int32_t m_pageSampleVec[SAMPLE_VECTOR_SIZE/4];
int32_t m_pageSampleVecSize;
int32_t m_postVec[POST_VECTOR_SIZE/4];
int32_t m_postVecSize;
float m_pageSimilarity;
float m_percentChanged;
// what docids are similar to us? docids are in this list
RdbList m_dupList;
int64_t m_exactContentHash64;
Msg0 m_msg0;
char m_isDup; // may be -1
int64_t m_docIdWeAreADupOf;
Msg22Request m_msg22Request;
Msg22 m_msg22a;
Msg22 m_msg22b;
Msg22 m_msg22e;
Msg22 m_msg22f;
// these now reference directly into the html src so our
// WordPosInfo::m_wordPtr algo works in seo.cpp
char *m_metaDesc;
int32_t m_metaDescLen;
char *m_metaSummary;
int32_t m_metaSummaryLen;
char *m_metaKeywords;
int32_t m_metaKeywordsLen;
char *m_metaGeoPlacename;
int32_t m_metaGeoPlacenameLen;
class XmlDoc *m_oldDoc;
class XmlDoc *m_extraDoc;
class XmlDoc *m_rootDoc;
char *m_oldTitleRec;
int32_t m_oldTitleRecSize;
char *m_rootTitleRec;
int32_t m_rootTitleRecSize;
char m_isIndexed; // may be -1
// confusing, i know! these are used exclusively by
// getNewSpiderReply() for now
bool m_isInIndex;
bool m_wasInIndex;
Msg8a m_msg8a;
Msg8a m_currentMsg8a;
Url m_extraUrl;
SafeBuf m_mySiteLinkInfoBuf;
SafeBuf m_myPageLinkInfoBuf;
bool m_isInjecting;
bool m_useFakeMime;
bool m_useSiteLinkBuf;
bool m_usePageLinkBuf;
bool m_printInXml;
SafeBuf m_tmpBuf11;
SafeBuf m_tmpBuf12;
Multicast m_mcast11;
Multicast m_mcast12;
bool m_isAllowed;
bool m_isChildDoc;
Msg13 m_msg13;
Msg13Request m_msg13Request;
bool m_isSpiderProxy;
// for limiting # of iframe tag expansions
int32_t m_numExpansions;
char m_newOnly;
bool m_skipContentHashCheck;
char m_isWWWDup; // May be -1
SafeBuf m_explicitKeywordsBuf;
SafeBuf m_linkSiteHashBuf;
SafeBuf m_linkdbDataBuf;
SafeBuf m_langVec;
SiteGetter m_siteGetter;
int32_t m_siteHash32;
char *m_httpReply;
bool m_useRobotsTxt;
int32_t m_robotsTxtLen;
bool m_robotsTxtHttpStatusDisallowed;
bool m_robotsTxtErrorDisallowed;
int32_t m_httpReplySize;
int32_t m_httpReplyAllocSize;
char *m_filteredContent;
int32_t m_filteredContentLen;
int32_t m_filteredContentAllocSize;
bool m_calledThread;
int32_t m_errno;
int32_t m_hostHash32a;
int32_t m_domHash32;
Msge0 m_msge0;
Msge1 m_msge1;
Json *getParsedJson();
// object that parses the json
Json m_jp;
// flow flags
bool m_computedMetaListCheckSum;
// cachedb related args
bool m_allHashed;
int32_t m_urlFilterNum;
int32_t m_numOutlinksAdded;
int32_t m_numRedirects;
bool m_isPageParser;
Url m_baseUrl;
Msg20Reply m_reply;
Msg20Request *m_req;
bool m_abortMsg20Generation;
char m_linkTextBuf[MAX_LINK_TEXT_LEN];
char m_surroundingTextBuf[MAX_SURROUNDING_TEXT_WIDTH];
char m_rssItemBuf[MAX_RSSITEM_SIZE];
const char *m_note;
Query m_query;
Matches m_matches;
// meta description buf
int32_t m_dbufSize;
char m_dbuf[1024];
SafeBuf m_htb;
Title m_title;
Summary m_summary;
char m_isErrorPage; // May be -1
// stuff
int64_t m_lastTimeStart;
const char *m_statusMsg;
Msg4 m_msg4;
bool m_deleteFromIndex;
// ptrs to stuff
SafeBuf m_titleRecBuf;
key96_t m_titleRecKey;
// for isDupOfUs()
char *m_dupTrPtr;
int32_t m_dupTrSize;
key96_t m_doledbKey;
SpiderRequest m_sreq;
SpiderReply m_srep;//newsr;
// bool flags for what procedures we have done
bool m_checkedUrlFilters;
bool m_listAdded ;
bool m_check1 ;
bool m_check2 ;
bool m_prepared ;
bool m_copied1 ;
bool m_updatingSiteLinkInfoTags ;
bool m_didDelay ;
bool m_didDelayUnregister ;
bool m_calledMsg22e ;
bool m_calledMsg22f ;
bool m_calledMsg25 ;
bool m_calledSections ;
bool m_loaded ;
bool m_doingConsistencyCheck ;
int32_t m_dist;
// used to store a \0-separated list of "titles" of the root page so we can
// see which if any are the venue name, and thus match that to
// addresses of the venue on the site, and we can use those addresses
// as default venue addresses when no venues are listed on a page
// on that site.
char m_rootTitleBuf[ROOT_TITLE_BUF_MAX];
int32_t m_rootTitleBufSize;
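// . walking the \0-separated titles is just (a sketch):
//
//     const char *p    = m_rootTitleBuf;
//     const char *pend = m_rootTitleBuf + m_rootTitleBufSize;
//     while ( p < pend ) {
//         // "p" is one candidate title, NUL-terminated
//         p += strlen(p) + 1;
//     }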
// . this is filtered
// . certain punct is replaced with \0
char m_filteredRootTitleBuf[ROOT_TITLE_BUF_MAX];
int32_t m_filteredRootTitleBufSize;
// like m_rootTitleBuf but for the current page
char m_titleBuf[ROOT_TITLE_BUF_MAX];
int32_t m_titleBufSize;
bool m_setTr ;
void (* m_masterLoop) ( void *state );
void * m_masterState;
void (* m_callback1) ( void *state );
bool (* m_callback2) ( void *state );
void *m_state;
// the spider priority
int32_t m_priority;
// the download error, like ETIMEDOUT, ENOROUTE, etc.
int32_t m_downloadStatus;
// . when the download was completed. will be zero if no download done
// . used to set SpiderReply::m_downloadEndTime because we need
// high resolution for that so we can dole out the next spiderrequest
// from that IP quickly if the sameipwait is like 500ms.
int64_t m_downloadEndTime;
int32_t m_metaListAllocSize;
char *m_p;
char *m_pend;
int32_t m_maxCacheAge;
bool m_hashedTitle;
bool m_hashedMetas;
int32_t m_niceness;
bool m_usePosdb ;
bool m_useClusterdb ;
bool m_useLinkdb ;
bool m_useSpiderdb ;
bool m_useTitledb ;
bool m_useTagdb ;
bool m_useSecondaryRdbs ;
SafeBuf *m_pbuf;
// store termlist into here if non-null
bool m_storeTermListInfo;
char m_sortTermListBy;
// store the terms that we hash into this table so that PageParser.cpp
// can print what was hashed and with what score and what description
class HashTableX *m_wts;
HashTableX m_wtsTable;
SafeBuf m_wbuf;
// During hashing of various sources (title, tags, body, ...) we put unique lemmas into this set
std::unordered_set<std::string> lemma_words;
// which set() function was called above to set us?
bool m_setFromTitleRec;
bool m_setFromSpiderRec;
bool m_setFromUrl;
bool m_setFromDocId;
bool m_freeLinkInfo1;
bool m_contentInjected;
bool m_recycleContent;
bool m_docRebuild;
char *m_rawUtf8Content;
int32_t m_rawUtf8ContentSize;
int32_t m_rawUtf8ContentAllocSize; // we overallocate sometimes
char *m_expandedUtf8Content;
int32_t m_expandedUtf8ContentSize;
char *m_savedp;
char *m_oldp;
bool m_didExpansion;
SafeBuf m_esbuf;
// used by msg13
class Msg13Request *m_r;
bool m_freed;
bool m_indexedDoc; //indexDoc() performed completely
bool m_msg4Waiting;
bool m_msg4Launched;
bool m_blockedDoc;
bool m_checkedUrlBlockList;
bool m_checkedDnsBlockList;
bool m_checkedIpBlockList;
unsigned m_defaultSitePageTemperature;
bool m_calledServiceSiteMedianPageTemperature;
bool m_parsedRobotsMetaTag;
bool m_robotsNoIndex;
bool m_robotsNoFollow;
bool m_robotsNoArchive;
bool m_robotsNoSnippet;
std::vector<std::string> m_hostNameServers;
std::vector<uint32_t> m_ips;
bool m_addSpiderRequest;
// word spam detection
char *getWordSpamVec ( );
bool setSpam ( const int32_t *profile, int32_t plen , int32_t numWords ,
unsigned char *spam );
int32_t getProbSpam ( const int32_t *profile, int32_t plen , int32_t step );
bool m_isRepeatSpammer;
int32_t m_numRepeatSpam;
// frag vector (repeated fragments). 0 means repeated, 1 means not.
// vector is 1-1 with words in the document body.
char *getFragVec ( );
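// . a sketch of how a caller might use the frag vector to skip repeated
//   fragments (hashWords3() above takes such a vector as its "fragVec"
//   argument; "numWords" is hypothetical here):
//
//     const char *fragVec = getFragVec();
//     for ( int32_t i = 0 ; i < numWords ; i++ ) {
//         if ( fragVec && fragVec[i] == 0 ) continue; // repeated, skip
//         // ... hash word #i ...
//     }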
bool injectDoc(const char *url,
class CollectionRec *cr,
char *content,
bool contentHasMime,
int32_t charset,
int32_t langId,
bool deleteUrl,
const char *contentTypeStr, // text/html, text/xml etc.
bool spiderLinks,
char newOnly, // index iff new
bool skipContentHashCheck,
void *state,
void (*callback)(void *state),
uint32_t firstIndexedTime = 0,
uint32_t lastSpideredDate = 0,
int32_t injectDocIp = 0,
const char *redirUrl = nullptr,
int32_t indexCode = 0,
int16_t httpStatus = 200);
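// . a rough injection sketch (hypothetical variable names; in practice
//   Msg7/PageInject drives this and supplies the state/callback; csUTF8
//   and langEnglish are the usual charset/lang constants):
//
//     XmlDoc *xd = new XmlDoc();
//     xd->injectDoc ( "http://example.com/" ,
//                     cr ,                  // CollectionRec *
//                     content ,             // document body (or with mime)
//                     false ,               // contentHasMime
//                     csUTF8 ,              // charset
//                     langEnglish ,         // langId
//                     false ,               // deleteUrl
//                     "text/html" ,         // contentTypeStr
//                     true ,                // spiderLinks
//                     0 ,                   // newOnly
//                     false ,               // skipContentHashCheck
//                     state , doneCallback );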
int64_t logQueryTimingStart();
void logQueryTimingEnd(const char* function, int64_t startTime);
void callCallback();
bool m_calledServiceSiteNumInlinks;
};
// . PageParser.cpp uses this class for printing hashed terms out by calling
//   XmlDoc::printTermList()
// . we store TermDebugInfos into XmlDoc::m_wtsTable, a HashTableX
// . one for each term hashed
// . the key is the termId. dups are allowed
// . the term itself is stored into a separate buffer, m_wbuf, a SafeBuf, so
//   that TermDebugInfo::m_termOff/m_termLen reference into it and the term
//   text won't disappear on us
class TermDebugInfo {
public:
int32_t m_termOff;
int32_t m_termLen;
int32_t m_descOff; // the description offset
int32_t m_prefixOff; // the prefix offset, like "site" or "gbadid"
int64_t m_termId;
int32_t m_date;
bool m_shardByTermId;
char m_langId;
char m_diversityRank;
char m_densityRank;
char m_wordSpamRank;
char m_hashGroup;
int32_t m_wordNum;
int32_t m_wordPos;
posdbkey_t m_key; // key144_t
// 0 = not a syn, 1 = syn from presets, 2 = wikt, 3 = generated
char m_synSrc;
int64_t m_langBitVec64;
};
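// . a sketch of how PageParser.cpp can walk the debug table (assuming the
//   usual HashTableX/SafeBuf accessors; m_termOff is an offset into
//   XmlDoc::m_wbuf, and "wts"/"wbuf" here are hypothetical pointers to
//   XmlDoc::m_wts and XmlDoc::m_wbuf):
//
//     for ( int32_t i = 0 ; i < wts->getNumSlots() ; i++ ) {
//         if ( wts->isEmpty(i) ) continue;
//         TermDebugInfo *ti = (TermDebugInfo *)wts->getValueFromSlot(i);
//         const char *term  = wbuf->getBufStart() + ti->m_termOff;
//         // print ti->m_termLen bytes of "term", ti->m_termId, ranks, etc.
//     }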
#endif // GB_XMLDOC_H