1221 lines
33 KiB
C++
1221 lines
33 KiB
C++
// Matt Wells, copyright Apr 2009

// . 2. you can also call setTitleRec() and then call getMetaList()
// . this class is used by Repair.cpp and by Msg7 (inject) and SpiderLoop.cpp
// . Msg7 and Repair.cpp and injections can also set more than just
//   m_firstUrl, like m_content, etc. or whatever elements are known, but
//   they must also set the corresponding "valid" flags of those elements
// . both methods must yield exactly the same result, the same "meta list"
// . after setting the contained classes XmlDoc::setMetaList() makes the list
//   of rdb records to be added to all the rdbs, this is the "meta list"
// . the meta list is made by hashing all the termIds/scores into some hash
//   tables in order to accumulate scores, then the hash tables are serialized
//   into the "meta list"
// . the meta list is added to all rdbs with a simple call to
//   Msg4::addMetaList(), which is only called by Msg14 or Repair.cpp for now
|
|
|
|
|
|
#ifndef GB_XMLDOC_H
|
|
#define GB_XMLDOC_H
|
|
|
|
#include "Lang.h"
|
|
#include "tokenizer.h"
|
|
#include "Bits.h"
|
|
#include "Pos.h"
|
|
#include "Phrases.h"
|
|
#include "Xml.h"
|
|
#include "SafeBuf.h"
|
|
#include "Images.h"
|
|
#include "Sections.h"
|
|
#include "Msge0.h"
|
|
#include "Msge1.h"
|
|
#include "Msg4Out.h"
|
|
|
|
#include "SearchInput.h"
|
|
#include "Msg40.h"
|
|
#include "Msg0.h"
|
|
#include "Msg22.h"
|
|
#include "Tagdb.h"
|
|
#include "Url.h"
|
|
#include "Linkdb.h"
|
|
#include "MsgC.h"
|
|
#include "Msg13.h"
|
|
#include "RdbList.h"
|
|
#include "SiteGetter.h"
|
|
#include "Msg20.h"
|
|
#include "Matches.h"
|
|
#include "Query.h"
|
|
#include "Title.h"
|
|
#include "Summary.h"
|
|
#include "Spider.h" // SpiderRequest/SpiderReply definitions
|
|
#include "HttpMime.h" // ET_DEFLAT
|
|
#include "Json.h"
|
|
#include "Posdb.h"
|
|
#include <unordered_set>
|
|
|
|
|
|
// Forward declarations for types that this header names without relying on
// their full definitions at this point.
// NOTE(review): HashTableX is also used by value further down
// (XmlDoc::m_countTable, XmlDoc::m_wtsTable), so its complete definition must
// arrive through one of the includes above -- confirm which one.
class GetMsg20State;
class HashTableX;
class HashInfo;

// Only pointers to GbDns::DnsResponse appear in the declarations visible in
// this header (XmlDoc::setIp() and the static DNS wrapper callbacks).
namespace GbDns {
struct DnsResponse;
}
|
|
|
|
// NOTE(review): not referenced anywhere else in the visible part of this
// header; presumably an upper bound on the number of words examined when
// building the fragment vector -- confirm against XmlDoc.cpp.
#define MAXFRAGWORDS 80000

// Number of int32_t entries in XmlDoc::m_tagPairHashVec.
#define MAX_TAG_PAIR_HASHES 100
|
|
|
|
#include "Msg40.h"
|
|
|
|
// Size in bytes of the post vector: XmlDoc::m_postVec is declared with
// POST_VECTOR_SIZE/4 int32_t entries, i.e. 32 of them.
#define POST_VECTOR_SIZE (32*4)

// Byte sizes of the fixed char buffers XmlDoc::m_linkTextBuf,
// XmlDoc::m_surroundingTextBuf and XmlDoc::m_rssItemBuf respectively.
#define MAX_LINK_TEXT_LEN 512
#define MAX_SURROUNDING_TEXT_WIDTH 600
#define MAX_RSSITEM_SIZE 30000
|
|
|
|
bool getDensityRanks ( const TokenizerResult *tr,
|
|
int32_t hashGroup ,
|
|
SafeBuf *densBuf ,
|
|
const Sections *sections);
|
|
|
|
// diversity vector
|
|
bool getDiversityVec ( const TokenizerResult *tr,
|
|
const Phrases *phrases ,
|
|
class HashTableX *countTable ,
|
|
SafeBuf *sbWordVec );
|
|
|
|
// . declaration only; the definition is not in this header
// . takes two int32_t vectors (vec0, vec1) together with their corresponding
//   score vectors (s0, s1) and returns a float similarity
// . NOTE(review): the range of the returned value and how the vectors are
//   terminated/sized are not visible here -- confirm against the definition
// . NOTE(review): the role of "q" is not visible here either -- confirm
float computeSimilarity ( const int32_t *vec0,
                          const int32_t *vec1,
                          // corresponding scores vectors
                          const int32_t *s0,
                          const int32_t *s1,
                          class Query *q ,
                          // only Sections::addDateBasedImpliedSections()
                          // sets this to true right now. if set to true
                          // we essentially dedup each vector, although
                          // the score is compounded into the remaining
                          // occurrence. i'm not sure if that is the right
                          // behavior though.
                          bool dedupVecs = false );
|
|
|
|
|
|
// tell zlib to use our malloc/free functions
// . declaration only; the definition is not in this header
// . the parameter list mirrors zlib's uncompress(): "dest"/"destLen" describe
//   the output buffer and "source"/"sourceLen" the compressed input
// . NOTE(review): presumably *destLen is the capacity of dest on entry and
//   the number of bytes written on return, and the int result is a zlib
//   status code (Z_OK on success) -- confirm against the definition
int gbuncompress(unsigned char *dest,
                 uint32_t *destLen,
                 const unsigned char *source,
                 uint32_t sourceLen);
|
|
|
|
// . declaration only; the definition is not in this header
// . the parameter list mirrors zlib's compress(): "dest"/"destLen" describe
//   the output buffer and "source"/"sourceLen" the uncompressed input
// . NOTE(review): presumably *destLen is the capacity of dest on entry and
//   the number of bytes written on return, and the int result is a zlib
//   status code (Z_OK on success) -- confirm against the definition
int gbcompress(unsigned char *dest,
               uint32_t *destLen,
               const unsigned char *source,
               uint32_t sourceLen);
|
|
|
|
// . for Msg13.cpp
// . *pend must equal \0 -- "pend" is not a parameter; presumably it means
//   p + plen, i.e. the buffer must be NUL-terminated right after its last
//   byte -- TODO confirm against the definition
// . declaration only; the definition is not in this header
int32_t getContentHash32Fast ( unsigned char *p , int32_t plen ) ;
|
|
|
|
bool getWordPosVec ( const TokenizerResult *tr,
|
|
const Sections *sections,
|
|
int32_t startDist,
|
|
const char *fragVec,
|
|
SafeBuf *wpos );
|
|
|
|
|
|
// Byte size of the fixed title buffers in XmlDoc: m_rootTitleBuf,
// m_filteredRootTitleBuf and m_titleBuf are all declared with this length.
#define ROOT_TITLE_BUF_MAX 512
|
|
|
|
class XmlDoc {
|
|
|
|
public:
|
|
|
|
/// @warning Do NOT change. titlerec binary compatibility header
|
|
|
|
//
|
|
// BEGIN WHAT IS STORED IN THE TITLE REC (Titledb.h)
|
|
//
|
|
|
|
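// headerSize = number of bytes from m_headerSize up to (not including) ptr_firstUrl, presumably (char *)&ptr_firstUrl - (char *)&m_headerSize -- confirm in XmlDoc.cpp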
|
|
uint16_t m_headerSize;
|
|
uint16_t m_version;
|
|
|
|
// these flags are used to indicate which ptr_ members are present:
|
|
uint32_t m_internalFlags1;
|
|
int32_t m_ip;
|
|
int32_t m_crawlDelay;
|
|
|
|
// . use this to quickly detect if doc is unchanged
|
|
// . we can avoid setting Xml and Words classes etc...
|
|
int32_t m_contentHash32;
|
|
|
|
// this is a hash of all adjacent tag pairs, used for template identification
|
|
uint32_t m_tagPairHash32;
|
|
int32_t m_siteNumInlinks;
|
|
|
|
// this is non-zero if we decided not to index the doc
|
|
int32_t m_indexCode;
|
|
|
|
int32_t m_reserved2;
|
|
uint32_t m_spideredTime; // time_t
|
|
uint32_t m_indexedTime; // slightly > m_spideredTime (time_t)
|
|
uint32_t m_reserved32;
|
|
uint32_t m_reserved33;
|
|
uint32_t m_firstIndexedDate; // time_t
|
|
uint32_t m_outlinksAddedDate; // time_t
|
|
|
|
uint16_t m_charset; // the ORIGINAL charset, we are always utf8!
|
|
uint16_t m_countryId;
|
|
|
|
int32_t m_reserved3;
|
|
|
|
uint8_t m_metaListCheckSum8; // bring it back!!
|
|
char m_reserved3b;
|
|
uint16_t m_bodyStartPos;
|
|
uint16_t m_reserved5;
|
|
|
|
uint16_t m_unused0;
|
|
|
|
int16_t m_httpStatus; // -1 if not found (empty http reply)
|
|
|
|
int8_t m_reserved5a;
|
|
uint8_t m_langId;
|
|
uint8_t m_reserved6;
|
|
uint8_t m_contentType;
|
|
|
|
|
|
// bit flags
|
|
uint16_t m_isRSS:1;
|
|
uint16_t m_isPermalink:1;
|
|
uint16_t m_isAdult:1;
|
|
uint16_t m_wasContentInjected:1;
|
|
uint16_t m_spiderLinks:1;
|
|
uint16_t m_isContentTruncated:1;
|
|
uint16_t m_isLinkSpam:1;
|
|
uint16_t m_reserved796:1;
|
|
uint16_t m_reserved797:1;
|
|
uint16_t m_reserved798:1;
|
|
uint16_t m_reserved799:1;
|
|
uint16_t m_isSiteRoot:1;
|
|
|
|
uint16_t m_reserved800:1;
|
|
uint16_t m_reserved801:1;
|
|
uint16_t m_reserved802:1;
|
|
uint16_t m_reserved803:1;
|
|
uint16_t m_reserved805:1;
|
|
uint16_t m_reserved806:1;
|
|
uint16_t m_reserved807:1;
|
|
uint16_t m_reserved808:1;
|
|
uint16_t m_reserved809:1;
|
|
uint16_t m_reserved810:1;
|
|
uint16_t m_reserved811:1;
|
|
uint16_t m_reserved812:1;
|
|
uint16_t m_reserved813:1;
|
|
uint16_t m_reserved814:1;
|
|
uint16_t m_reserved815:1;
|
|
uint16_t m_reserved816:1;
|
|
|
|
//end of titlerec binary compatibility header
|
|
|
|
/// @warning Do NOT change the structure of the following until m_dummyEnd.
|
|
/// check in XmlDoc::set2 (sanity check. must match exactly)
|
|
|
|
char *ptr_firstUrl;
|
|
char *ptr_redirUrl;
|
|
char *ptr_rootTitleBuf;
|
|
int32_t *ptr_unused12;
|
|
int32_t *ptr_unused13;
|
|
void *ptr_unused8;
|
|
int64_t *ptr_unused10;
|
|
float *ptr_unused11;
|
|
char *ptr_imageData;
|
|
int32_t *ptr_unused6;
|
|
int32_t *ptr_unused7;
|
|
char *ptr_explicitKeywords;
|
|
char *ptr_unused2;
|
|
char *ptr_unused3;
|
|
char *ptr_utf8Content;
|
|
char *ptr_unused5;
|
|
|
|
// do not let SiteGetter change this when we re-parse!
|
|
char *ptr_site;
|
|
LinkInfo *ptr_linkInfo1;
|
|
char *ptr_linkdbData;
|
|
char *ptr_unused14;
|
|
char *ptr_tagRecData;
|
|
LinkInfo *ptr_unused9;
|
|
|
|
int32_t size_firstUrl;
|
|
int32_t size_redirUrl;
|
|
int32_t size_rootTitleBuf;
|
|
int32_t size_unused12;
|
|
int32_t size_unused13;
|
|
int32_t size_unused8;
|
|
int32_t size_unused10;
|
|
int32_t size_unused11;
|
|
int32_t size_imageData;
|
|
int32_t size_unused6;
|
|
int32_t size_unused7;
|
|
int32_t size_explicitKeywords;
|
|
int32_t size_unused2;
|
|
int32_t size_unused3;
|
|
int32_t size_utf8Content;
|
|
int32_t size_unused5;
|
|
int32_t size_site;
|
|
int32_t size_linkInfo1;
|
|
int32_t size_linkdbData;
|
|
int32_t size_unused14;
|
|
int32_t size_tagRecData;
|
|
int32_t size_unused9;
|
|
|
|
char m_dummyEnd;
|
|
|
|
//
|
|
// END WHAT IS STORED IN THE TITLE REC (Titledb.h)
|
|
//
|
|
|
|
char *ptr_scheme;
|
|
int32_t size_scheme;
|
|
|
|
|
|
public:
|
|
bool set2 ( char *titleRec,
|
|
int32_t maxSize,
|
|
const char *coll,
|
|
int32_t niceness ,
|
|
class SpiderRequest *sreq = NULL );
|
|
|
|
// . since being set from a docId, we will load the old title rec
|
|
// and use that!
|
|
// . used by PageGet.cpp
|
|
bool set3 ( int64_t docId ,
|
|
const char *coll ,
|
|
int32_t niceness );
|
|
|
|
bool set4 ( class SpiderRequest *sreq ,
|
|
const key96_t *doledbKey,
|
|
const char *coll ,
|
|
SafeBuf *pbuf ,
|
|
int32_t niceness ,
|
|
char *utf8Content = NULL ,
|
|
bool deleteFromIndex = false ,
|
|
int32_t forcedIp = 0 ,
|
|
uint8_t contentType = CT_HTML ,
|
|
uint32_t spideredTime = 0 , // time_t
|
|
bool contentHasMime = false );
|
|
|
|
// we now call this right away rather than at download time!
|
|
int32_t getSpideredTime();
|
|
|
|
// time right before adding the termlists to the index, etc.
|
|
// whereas spider time is the download time
|
|
int32_t getIndexedTime();
|
|
|
|
// another entry point, like set3() kinda
|
|
bool loadFromOldTitleRec ();
|
|
|
|
XmlDoc() ;
|
|
~XmlDoc() ;
|
|
void nukeDoc ( class XmlDoc *);
|
|
void reset ( ) ;
|
|
bool setFirstUrl ( const char *u ) ;
|
|
void setStatus ( const char *s ) ;
|
|
void setCallback ( void *state, void (*callback) (void *state) ) ;
|
|
void setCallback ( void *state, bool (*callback) (void *state) ) ;
|
|
void getRevisedSpiderRequest ( class SpiderRequest *revisedReq );
|
|
void getRebuiltSpiderRequest ( class SpiderRequest *sreq ) ;
|
|
bool indexDoc ( );
|
|
bool indexDoc2 ( );
|
|
|
|
char *prepareToMakeTitleRec ( ) ;
|
|
// store TitleRec into "buf" so it can be added to metalist
|
|
bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 );
|
|
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
|
|
SafeBuf *getTitleRecBuf ( );
|
|
|
|
char *getIsAdult ( ) ;
|
|
|
|
bool *checkBlockList();
|
|
unsigned *getDefaultSitePageTemperature();
|
|
|
|
bool *parseRobotsMetaTag();
|
|
void parseRobotsMetaTagContent(const char *content, int32_t contentLen);
|
|
|
|
char *getIsPermalink ( ) ;
|
|
char *getIsUrlPermalinkFormat ( ) ;
|
|
char *getIsRSS ( ) ;
|
|
bool *getIsSiteMap ( ) ;
|
|
class Xml *getXml ( ) ;
|
|
uint8_t *getLangVector ( ) ;
|
|
uint8_t *getLangId ( ) ;
|
|
|
|
lang_t getSummaryLangIdCLD2();
|
|
|
|
lang_t getContentLangIdCLD2();
|
|
lang_t getContentLangIdCLD3();
|
|
|
|
uint8_t computeLangId(Sections *sections, const TokenizerResult *tr, char *lv);
|
|
TokenizerResult *getTokenizerResult();
|
|
TokenizerResult *getTokenizerResult2();
|
|
class Bits *getBits ( ) ;
|
|
class Bits *getBitsForSummary ( ) ;
|
|
class Pos *getPos ( );
|
|
class Phrases *getPhrases ( ) ;
|
|
class Sections *getSections ( ) ;
|
|
int32_t *getLinkSiteHashes ( );
|
|
class Links *getLinks ( bool doQuickSet = false ) ;
|
|
class HashTableX *getCountTable ( ) ;
|
|
bool hashString_ct(HashTableX *ht, const char *s, int32_t slen);
|
|
int32_t *getSummaryVector ( ) ;
|
|
int32_t *getPageSampleVector ( ) ;
|
|
int32_t *getPostLinkTextVector ( int32_t linkNode ) ;
|
|
int32_t computeVector ( const TokenizerResult *tr, uint32_t *vec , int32_t start = 0 , int32_t end = -1 );
|
|
float *getPageSimilarity ( class XmlDoc *xd2 ) ;
|
|
float *getPercentChanged ( );
|
|
int64_t *getExactContentHash64();
|
|
class RdbList *getDupList ( ) ;
|
|
char *getIsDup ( ) ;
|
|
char *getMetaDescription( int32_t *mdlen ) ;
|
|
char *getMetaSummary ( int32_t *mslen ) ;
|
|
char *getMetaKeywords( int32_t *mklen ) ;
|
|
char *getMetaGeoPlacename( int32_t *mgplen );
|
|
|
|
class Url *getCurrentUrl ( ) ;
|
|
class Url *getFirstUrl() ;
|
|
int64_t getFirstUrlHash48();
|
|
int64_t getFirstUrlHash64();
|
|
class Url **getRedirUrl() ;
|
|
class Url **getMetaRedirUrl() ;
|
|
class Url *getCanonicalUrl();
|
|
class Url **getCanonicalRedirUrl ( ) ;
|
|
int32_t *getFirstIndexedDate ( ) ;
|
|
int32_t *getOutlinksAddedDate ( ) ;
|
|
uint16_t *getCountryId ( ) ;
|
|
class XmlDoc **getOldXmlDoc ( ) ;
|
|
class XmlDoc **getExtraDoc(const char *url, int32_t maxCacheAge = 0);
|
|
bool getIsPageParser ( ) ;
|
|
class XmlDoc **getRootXmlDoc ( int32_t maxCacheAge = 0 ) ;
|
|
char **getOldTitleRec ( );
|
|
char **getRootTitleRec ( ) ;
|
|
int64_t *getDocId ( ) ;
|
|
char *getIsIndexed ( ) ;
|
|
class TagRec *getTagRec ( ) ;
|
|
class TagRec *getCurrentTagRec ( ) ;
|
|
// non-dup/nondup addresses only
|
|
int32_t *getFirstIp ( ) ;
|
|
int32_t *getSiteNumInlinks ( ) ;
|
|
class LinkInfo *getSiteLinkInfo() ;
|
|
int32_t *getIp ( ) ;
|
|
void setIp(GbDns::DnsResponse *response);
|
|
std::vector<std::string>* getHostNameServers(const char *hostname, size_t hostnameLen);
|
|
static void gotHostNameServersWrapper(GbDns::DnsResponse *response, void *state);
|
|
static void gotIpWrapper(GbDns::DnsResponse *response, void *state);
|
|
bool *getIsAllowed ( ) ;
|
|
int32_t *getFinalCrawlDelay();
|
|
int32_t m_finalCrawlDelay;
|
|
char *getIsWWWDup ( ) ;
|
|
class LinkInfo *getLinkInfo1 ( ) ;
|
|
char *getSite ( ) ;
|
|
const char *getScheme ( ) ;
|
|
|
|
void gotSite ( ) ;
|
|
int32_t *getSiteHash32 ( ) ;
|
|
char **getHttpReply ( ) ;
|
|
char **getHttpReply2 ( ) ;
|
|
char **gotHttpReply ( ) ;
|
|
char *getIsContentTruncated ( );
|
|
int32_t *getDownloadStatus ( ) ;
|
|
int64_t *getDownloadEndTime ( ) ;
|
|
int16_t *getHttpStatus ( );
|
|
class HttpMime *getMime () ;
|
|
char **getContent ( ) ;
|
|
uint8_t *getContentType ( ) ;
|
|
uint16_t *getCharset ( ) ;
|
|
char **getFilteredContent ( ) ;
|
|
void filterStart_r ( bool amThread ) ;
|
|
char **getRawUtf8Content ( ) ;
|
|
char **getExpandedUtf8Content ( ) ;
|
|
char **getUtf8Content ( ) ;
|
|
// we download large files to a file on disk, like warcs and arcs
|
|
int32_t *getContentHash32 ( ) ;
|
|
int32_t *getContentHashJson32 ( ) ;
|
|
int32_t *getTagPairHashVector ( ) ;
|
|
uint32_t *getTagPairHash32 ( ) ;
|
|
int32_t getHostHash32a ( ) ;
|
|
int32_t getDomHash32 ( );
|
|
char **getThumbnailData();
|
|
class Images *getImages ( ) ;
|
|
class TagRec ***getOutlinkTagRecVector () ;
|
|
int32_t **getOutlinkFirstIpVector () ;
|
|
char *getIsSiteRoot ( ) ;
|
|
char *getSpiderLinks ( ) ;
|
|
bool getIsInjecting();
|
|
int32_t *getSpiderPriority ( ) ;
|
|
int32_t *getIndexCode ( ) ;
|
|
SafeBuf *getNewTagBuf ( ) ;
|
|
|
|
void logIt ( SafeBuf *bb = NULL ) ;
|
|
bool m_doConsistencyTesting;
|
|
bool doConsistencyTest ( bool forceTest ) ;
|
|
|
|
void printMetaList() const;
|
|
void printMetaList ( char *metaList , char *metaListEnd ,
|
|
SafeBuf *pbuf );
|
|
bool verifyMetaList ( char *p , char *pend , bool forDelete ) ;
|
|
bool hashMetaList ( class HashTableX *ht ,
|
|
char *p ,
|
|
char *pend ,
|
|
bool checkList ) ;
|
|
|
|
char *getMetaList ( bool forDelete = false );
|
|
|
|
uint64_t m_downloadStartTime;
|
|
|
|
uint64_t m_ipStartTime;
|
|
uint64_t m_ipEndTime;
|
|
|
|
int64_t m_getLinkInfoStartTime;
|
|
int64_t m_getLinkInfoEndTime;
|
|
|
|
int64_t m_getSiteLinkInfoStartTime;
|
|
int64_t m_getSiteLinkInfoEndTime;
|
|
|
|
bool m_updatedMetaData;
|
|
|
|
void copyFromOldDoc ( class XmlDoc *od ) ;
|
|
|
|
class SpiderReply *getFakeSpiderReply ( );
|
|
|
|
// we add a SpiderReply to spiderdb when done spidering, even if
|
|
// m_indexCode or g_errno was set!
|
|
class SpiderReply *getNewSpiderReply ( );
|
|
|
|
void setSpiderReqForMsg20 ( class SpiderRequest *sreq ,
|
|
class SpiderReply *srep );
|
|
|
|
|
|
char *addOutlinkSpiderRecsToMetaList ( );
|
|
|
|
void lookupAndSetExplicitKeywords();
|
|
|
|
int32_t getSiteRank ();
|
|
bool addTable144 ( class HashTableX *tt1 ,
|
|
int64_t docId ,
|
|
SafeBuf *buf = NULL );
|
|
|
|
bool addTable224 ( HashTableX *tt1 ) ;
|
|
|
|
bool hashNoSplit ( class HashTableX *tt ) ;
|
|
char *hashAll ( class HashTableX *table ) ;
|
|
bool hashMetaTags ( class HashTableX *table ) ;
|
|
bool hashContentType ( class HashTableX *table ) ;
|
|
|
|
bool hashLinks ( class HashTableX *table ) ;
|
|
bool hashUrl ( class HashTableX *table, bool urlOnly );
|
|
bool hashIncomingLinkText(HashTableX *table);
|
|
bool hashLinksForLinkdb ( class HashTableX *table ) ;
|
|
bool hashNeighborhoods ( class HashTableX *table ) ;
|
|
bool hashTitle ( class HashTableX *table );
|
|
bool hashBody2 ( class HashTableX *table );
|
|
bool hashMetaKeywords ( class HashTableX *table );
|
|
bool hashExplicitKeywords(HashTableX *table);
|
|
bool hashMetaGeoPlacename( class HashTableX *table );
|
|
bool hashMetaSummary ( class HashTableX *table );
|
|
bool hashLanguage ( class HashTableX *table ) ;
|
|
bool hashLanguageString ( class HashTableX *table ) ;
|
|
bool hashCountry ( class HashTableX *table ) ;
|
|
bool hashLemmas(class HashTableX *table);
|
|
void sortTokenizerResult(TokenizerResult *tr);
|
|
void getLanguageAndCountry(lang_t *lang, const char **country_code);
|
|
|
|
class Url *getBaseUrl ( ) ;
|
|
|
|
void setMsg20Request(Msg20Request *req);
|
|
class Msg20Reply *getMsg20Reply ( ) ;
|
|
class Msg20Reply *getMsg20ReplyStepwise();
|
|
void loopUntilMsg20ReplyReady(GetMsg20State *);
|
|
static void getMsg20ReplyThread(void *pv);
|
|
void getMsg20ReplyThread();
|
|
static void msg20Done(void *pv, job_exit_t exit_type);
|
|
void msg20Done(job_exit_t exit_type);
|
|
Query *getQuery() ;
|
|
Matches *getMatches () ;
|
|
char *getDescriptionBuf ( char *displayMetas , int32_t *dlen ) ;
|
|
SafeBuf *getHeaderTagBuf();
|
|
class Title *getTitle ();
|
|
class Summary *getSummary () ;
|
|
char *getHighlightedSummary ( bool *isSetFromTagsPtr );
|
|
bool *getIsNoArchive();
|
|
bool *getIsNoFollow();
|
|
bool *getIsNoIndex();
|
|
bool *getIsNoSnippet();
|
|
int32_t *getUrlFilterNum();
|
|
char *getIsLinkSpam ( ) ;
|
|
char *getIsErrorPage ( ) ;
|
|
const char* matchErrorMsg(char* p, char* pend );
|
|
|
|
bool hashWords( class HashInfo *hi );
|
|
bool hashSingleTerm( const char *s, int32_t slen, class HashInfo *hi );
|
|
bool hashString( const char *s, int32_t slen, class HashInfo *hi );
|
|
bool hashString(size_t begin_token, size_t end_token, HashInfo *hi);
|
|
|
|
bool hashString3(const char *s, int32_t slen, class HashInfo *hi,
|
|
HashTableX *wts, SafeBuf *wbuf);
|
|
bool hashString3(size_t begin_token, size_t end_token, HashInfo *hi,
|
|
HashTableX *wts, SafeBuf *wbuf);
|
|
|
|
bool hashWords3(HashInfo *hi, const TokenizerResult *tr,
|
|
Sections *sections, const Bits *bits,
|
|
const char *fragVec, const char *wordSpamVec, const char *langVec,
|
|
HashTableX *wts, SafeBuf *wbuf);
|
|
bool hashWords3(HashInfo *hi, const TokenizerResult *tr, size_t begin_token, size_t end_token,
|
|
Sections *sections, const Bits *bits,
|
|
const char *fragVec, const char *wordSpamVec, const char *langVec,
|
|
HashTableX *wts, SafeBuf *wbuf);
|
|
bool hashString4(const char *s, int32_t slen, HashInfo *hi);
|
|
|
|
|
|
// print out for PageTitledb.cpp and PageParser.cpp
|
|
bool printDoc ( class SafeBuf *pbuf );
|
|
bool printMenu ( class SafeBuf *pbuf );
|
|
bool printDocForProCog ( class SafeBuf *sb , HttpRequest *hr ) ;
|
|
bool printGeneralInfo ( class SafeBuf *sb , HttpRequest *hr ) ;
|
|
bool printRainbowSections ( class SafeBuf *sb , HttpRequest *hr );
|
|
bool printPageInlinks ( class SafeBuf *sb , HttpRequest *hr );
|
|
bool printTermList ( class SafeBuf *sb , HttpRequest *hr );
|
|
bool printCachedPage ( class SafeBuf *sb , HttpRequest *hr );
|
|
|
|
char *getTitleBuf ( );
|
|
char *getRootTitleBuf ( );
|
|
char *getFilteredRootTitleBuf ( );
|
|
|
|
public:
|
|
|
|
// stuff set from the key of the titleRec, above the compression area
|
|
int64_t m_docId;
|
|
|
|
char *m_ubuf;
|
|
int32_t m_ubufSize;
|
|
int32_t m_ubufAlloc;
|
|
|
|
// private:
|
|
|
|
// when we started spidering it, in milliseconds since the epoch
|
|
int64_t m_startTime;
|
|
|
|
class XmlDoc *m_prevInject;
|
|
class XmlDoc *m_nextInject;
|
|
|
|
// when set() was called by Msg20.cpp so we can time how long it took
|
|
// to generate the summary
|
|
int64_t m_setTime;
|
|
int64_t m_cpuSummaryStartTime;
|
|
|
|
// . these should all be set using set*() function calls so their
|
|
// individual validity flags can be set to true, and successive
|
|
// calls to their corresponding get*() functions will not core
|
|
// . these particular guys are set immediately on set(char *titleRec)
|
|
|
|
Url m_redirUrl;
|
|
Url *m_redirUrlPtr;
|
|
SafeBuf m_redirCookieBuf;
|
|
Url m_metaRedirUrl;
|
|
Url *m_metaRedirUrlPtr;
|
|
Url m_canonicalUrl;
|
|
Url *m_canonicalRedirUrlPtr;
|
|
int32_t m_redirError;
|
|
bool m_allowSimplifiedRedirs;
|
|
Url m_firstUrl;
|
|
int64_t m_firstUrlHash48;
|
|
int64_t m_firstUrlHash64;
|
|
Url m_currentUrl;
|
|
|
|
collnum_t m_collnum;
|
|
class CollectionRec *getCollRec ( ) ;
|
|
bool setCollNum ( const char *coll ) ;
|
|
|
|
|
|
char *m_content;
|
|
int32_t m_contentLen;
|
|
|
|
char *m_metaList;
|
|
int32_t m_metaListSize;
|
|
|
|
int32_t m_addedSpiderRequestSize;
|
|
int32_t m_addedSpiderReplySize;
|
|
|
|
SafeBuf m_metaList2;
|
|
|
|
// used by msg7 to store udp slot
|
|
class UdpSlot *m_injectionSlot;
|
|
|
|
// . same thing, a little more complicated
|
|
// . these classes are only set on demand
|
|
Xml m_xml;
|
|
Links m_links;
|
|
TokenizerResult m_tokenizerResult;
|
|
Bits m_bits;
|
|
Bits m_bits2;
|
|
Pos m_pos;
|
|
Phrases m_phrases;
|
|
Sections m_sections;
|
|
|
|
// . for rebuild logging of what's changed
|
|
// . Repair.cpp sets these based on titlerec
|
|
char m_logLangId;
|
|
int32_t m_logSiteNumInlinks;
|
|
|
|
bool isFirstUrlRobotsTxt();
|
|
bool m_isRobotsTxtUrl;
|
|
|
|
bool* isFirstUrlCanonical();
|
|
bool m_isUrlCanonical;
|
|
|
|
Images m_images;
|
|
HashTableX m_countTable;
|
|
HttpMime m_mime;
|
|
TagRec m_tagRec;
|
|
SafeBuf m_tagRecBuf;
|
|
TagRec m_currentTagRec;
|
|
SafeBuf m_newTagBuf;
|
|
SafeBuf m_fragBuf;
|
|
SafeBuf m_wordSpamBuf;
|
|
SafeBuf m_finalSummaryBuf;
|
|
bool m_isFinalSummarySetFromTags;
|
|
int32_t m_firstIp;
|
|
|
|
class SafeBuf *m_savedSb;
|
|
class HttpRequest *m_savedHr;
|
|
|
|
// validity flags. on reset() all these are set to false.
|
|
char m_VALIDSTART;
|
|
// DO NOT add validity flags above this line!
|
|
bool m_metaListValid;
|
|
bool m_addedSpiderRequestSizeValid;
|
|
bool m_addedSpiderReplySizeValid;
|
|
bool m_downloadStartTimeValid;
|
|
bool m_siteValid;
|
|
bool m_startTimeValid;
|
|
bool m_currentUrlValid;
|
|
bool m_firstUrlValid;
|
|
bool m_firstUrlHash48Valid;
|
|
bool m_firstUrlHash64Valid;
|
|
bool m_docIdValid;
|
|
bool m_tagRecValid;
|
|
bool m_currentTagRecValid;
|
|
bool m_robotsTxtLenValid;
|
|
bool m_tagRecDataValid;
|
|
bool m_newTagBufValid;
|
|
bool m_rootTitleBufValid;
|
|
bool m_filteredRootTitleBufValid;
|
|
bool m_titleBufValid;
|
|
bool m_fragBufValid;
|
|
bool m_isRobotsTxtUrlValid;
|
|
bool m_isUrlCanonicalValid;
|
|
bool m_wordSpamBufValid;
|
|
bool m_finalSummaryBufValid;
|
|
|
|
bool m_isInjectingValid;
|
|
bool m_metaListCheckSum8Valid;
|
|
bool m_contentValid;
|
|
bool m_filteredContentValid;
|
|
bool m_charsetValid;
|
|
bool m_langVectorValid;
|
|
bool m_langIdValid;
|
|
bool m_isRSSValid;
|
|
bool m_isSiteMapValid;
|
|
bool m_isContentTruncatedValid;
|
|
bool m_xmlValid;
|
|
bool m_linksValid;
|
|
bool m_tokenizerResultValid;
|
|
bool m_tokenizerResultValid2;
|
|
bool m_bitsValid;
|
|
bool m_bits2Valid;
|
|
bool m_posValid;
|
|
bool m_phrasesValid;
|
|
bool m_sectionsValid;
|
|
|
|
bool m_imageDataValid;
|
|
bool m_imagesValid;
|
|
bool m_sreqValid;
|
|
bool m_srepValid;
|
|
|
|
bool m_ipValid;
|
|
bool m_firstIpValid;
|
|
bool m_spideredTimeValid;
|
|
bool m_indexedTimeValid;
|
|
bool m_isInIndexValid;
|
|
bool m_wasInIndexValid;
|
|
bool m_outlinksAddedDateValid;
|
|
bool m_countryIdValid;
|
|
bool m_bodyStartPosValid;
|
|
|
|
bool m_httpStatusValid;
|
|
bool m_crawlDelayValid;
|
|
bool m_finalCrawlDelayValid;
|
|
bool m_titleRecKeyValid;
|
|
bool m_versionValid;
|
|
bool m_rawUtf8ContentValid;
|
|
bool m_expandedUtf8ContentValid;
|
|
bool m_utf8ContentValid;
|
|
bool m_isAllowedValid;
|
|
bool m_redirUrlValid;
|
|
bool m_redirCookieBufValid;
|
|
bool m_metaRedirUrlValid;
|
|
bool m_canonicalUrlValid;
|
|
bool m_canonicalRedirUrlValid;
|
|
bool m_statusMsgValid;
|
|
bool m_mimeValid;
|
|
bool m_hostHash32aValid;
|
|
bool m_indexCodeValid;
|
|
bool m_priorityValid;
|
|
bool m_downloadStatusValid;
|
|
bool m_downloadEndTimeValid;
|
|
bool m_redirErrorValid;
|
|
bool m_domHash32Valid;
|
|
bool m_contentHash32Valid;
|
|
bool m_tagPairHash32Valid;
|
|
|
|
bool m_spiderLinksValid;
|
|
bool m_firstIndexedDateValid;
|
|
bool m_isPermalinkValid;
|
|
|
|
bool m_isAdultValid;
|
|
bool m_isUrlPermalinkFormatValid;
|
|
bool m_percentChangedValid;
|
|
bool m_countTableValid;
|
|
bool m_tagPairHashVecValid;
|
|
bool m_summaryVecValid;
|
|
bool m_pageSampleVecValid;
|
|
bool m_postVecValid;
|
|
bool m_dupListValid;
|
|
bool m_isDupValid;
|
|
bool m_metaDescValid;
|
|
bool m_metaSummaryValid;
|
|
bool m_metaKeywordsValid;
|
|
bool m_metaGeoPlacenameValid;
|
|
bool m_oldDocValid;
|
|
bool m_extraDocValid;
|
|
bool m_rootDocValid;
|
|
bool m_oldTitleRecValid;
|
|
bool m_rootTitleRecValid;
|
|
bool m_isIndexedValid;
|
|
bool m_siteNumInlinksValid;
|
|
bool m_siteLinkInfoValid;
|
|
bool m_isWWWDupValid;
|
|
bool m_linkInfo1Valid;
|
|
bool m_linkSiteHashesValid;
|
|
bool m_siteHash32Valid;
|
|
bool m_httpReplyValid;
|
|
bool m_contentTypeValid;
|
|
bool m_outlinkTagRecVectorValid;
|
|
bool m_outlinkIpVectorValid;
|
|
bool m_isSiteRootValid;
|
|
bool m_wasContentInjectedValid;
|
|
bool m_urlFilterNumValid;
|
|
bool m_numOutlinksAddedValid;
|
|
bool m_baseUrlValid;
|
|
bool m_replyValid;
|
|
bool m_isPageParserValid;
|
|
bool m_queryValid;
|
|
bool m_matchesValid;
|
|
bool m_dbufValid;
|
|
bool m_titleValid;
|
|
bool m_htbValid;
|
|
bool m_collnumValid;
|
|
bool m_summaryValid;
|
|
bool m_titleRecBufValid;
|
|
bool m_isLinkSpamValid;
|
|
bool m_isErrorPageValid;
|
|
bool m_exactContentHash64Valid;
|
|
bool m_jpValid;
|
|
bool m_blockedDocValid;
|
|
bool m_defaultSitePageTemperatureValid;
|
|
bool m_hostNameServersValid;
|
|
bool m_ipsValid;
|
|
bool m_isSiteMap;
|
|
|
|
// shadows
|
|
char m_isRSS2;
|
|
char m_isPermalink2;
|
|
char m_isAdult2;
|
|
char m_spiderLinks2; // May be -1
|
|
char m_isContentTruncated2;
|
|
char m_isLinkSpam2;
|
|
char m_isSiteRoot2;
|
|
|
|
// DO NOT add validity flags below this line!
|
|
char m_VALIDEND;
|
|
|
|
bool m_printedMenu;
|
|
char m_isUrlPermalinkFormat;
|
|
int32_t m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
|
|
int32_t m_tagPairHashVecSize;
|
|
int32_t m_summaryVec [SAMPLE_VECTOR_SIZE/4];
|
|
int32_t m_summaryVecSize;
|
|
int32_t m_pageSampleVec[SAMPLE_VECTOR_SIZE/4];
|
|
int32_t m_pageSampleVecSize;
|
|
int32_t m_postVec[POST_VECTOR_SIZE/4];
|
|
int32_t m_postVecSize;
|
|
float m_pageSimilarity;
|
|
float m_percentChanged;
|
|
// what docids are similar to us? docids are in this list
|
|
RdbList m_dupList;
|
|
int64_t m_exactContentHash64;
|
|
Msg0 m_msg0;
|
|
char m_isDup; // may be -1
|
|
int64_t m_docIdWeAreADupOf;
|
|
Msg22Request m_msg22Request;
|
|
Msg22 m_msg22a;
|
|
Msg22 m_msg22b;
|
|
Msg22 m_msg22e;
|
|
Msg22 m_msg22f;
|
|
// these now reference directly into the html src so our
|
|
// WordPosInfo::m_wordPtr algo works in seo.cpp
|
|
char *m_metaDesc;
|
|
int32_t m_metaDescLen;
|
|
char *m_metaSummary;
|
|
int32_t m_metaSummaryLen;
|
|
char *m_metaKeywords;
|
|
int32_t m_metaKeywordsLen;
|
|
|
|
char *m_metaGeoPlacename;
|
|
int32_t m_metaGeoPlacenameLen;
|
|
|
|
class XmlDoc *m_oldDoc;
|
|
class XmlDoc *m_extraDoc;
|
|
class XmlDoc *m_rootDoc;
|
|
char *m_oldTitleRec;
|
|
int32_t m_oldTitleRecSize;
|
|
char *m_rootTitleRec;
|
|
int32_t m_rootTitleRecSize;
|
|
char m_isIndexed; // may be -1
|
|
|
|
// confusing, i know! these are used exclusively by
|
|
// getNewSpiderReply() for now
|
|
bool m_isInIndex;
|
|
bool m_wasInIndex;
|
|
|
|
Msg8a m_msg8a;
|
|
Msg8a m_currentMsg8a;
|
|
|
|
Url m_extraUrl;
|
|
SafeBuf m_mySiteLinkInfoBuf;
|
|
SafeBuf m_myPageLinkInfoBuf;
|
|
|
|
bool m_isInjecting;
|
|
bool m_useFakeMime;
|
|
bool m_useSiteLinkBuf;
|
|
bool m_usePageLinkBuf;
|
|
bool m_printInXml;
|
|
|
|
SafeBuf m_tmpBuf11;
|
|
SafeBuf m_tmpBuf12;
|
|
Multicast m_mcast11;
|
|
Multicast m_mcast12;
|
|
bool m_isAllowed;
|
|
bool m_isChildDoc;
|
|
Msg13 m_msg13;
|
|
Msg13Request m_msg13Request;
|
|
bool m_isSpiderProxy;
|
|
// for limiting # of iframe tag expansions
|
|
int32_t m_numExpansions;
|
|
char m_newOnly;
|
|
bool m_skipContentHashCheck;
|
|
char m_isWWWDup; // May be -1
|
|
|
|
SafeBuf m_explicitKeywordsBuf;
|
|
SafeBuf m_linkSiteHashBuf;
|
|
SafeBuf m_linkdbDataBuf;
|
|
SafeBuf m_langVec;
|
|
|
|
SiteGetter m_siteGetter;
|
|
int32_t m_siteHash32;
|
|
char *m_httpReply;
|
|
bool m_useRobotsTxt;
|
|
int32_t m_robotsTxtLen;
|
|
bool m_robotsTxtHttpStatusDisallowed;
|
|
bool m_robotsTxtErrorDisallowed;
|
|
int32_t m_httpReplySize;
|
|
int32_t m_httpReplyAllocSize;
|
|
char *m_filteredContent;
|
|
int32_t m_filteredContentLen;
|
|
int32_t m_filteredContentAllocSize;
|
|
bool m_calledThread;
|
|
int32_t m_errno;
|
|
int32_t m_hostHash32a;
|
|
int32_t m_domHash32;
|
|
|
|
Msge0 m_msge0;
|
|
Msge1 m_msge1;
|
|
|
|
Json *getParsedJson();
|
|
// object that parses the json
|
|
Json m_jp;
|
|
|
|
// flow flags
|
|
|
|
bool m_computedMetaListCheckSum;
|
|
|
|
// cachedb related args
|
|
bool m_allHashed;
|
|
|
|
int32_t m_urlFilterNum;
|
|
int32_t m_numOutlinksAdded;
|
|
int32_t m_numRedirects;
|
|
bool m_isPageParser;
|
|
Url m_baseUrl;
|
|
Msg20Reply m_reply;
|
|
Msg20Request *m_req;
|
|
bool m_abortMsg20Generation;
|
|
char m_linkTextBuf[MAX_LINK_TEXT_LEN];
|
|
char m_surroundingTextBuf[MAX_SURROUNDING_TEXT_WIDTH];
|
|
char m_rssItemBuf[MAX_RSSITEM_SIZE];
|
|
|
|
const char *m_note;
|
|
Query m_query;
|
|
Matches m_matches;
|
|
// meta description buf
|
|
int32_t m_dbufSize;
|
|
char m_dbuf[1024];
|
|
SafeBuf m_htb;
|
|
Title m_title;
|
|
Summary m_summary;
|
|
char m_isErrorPage; // May be -1
|
|
|
|
// stuff
|
|
int64_t m_lastTimeStart;
|
|
const char *m_statusMsg;
|
|
Msg4 m_msg4;
|
|
|
|
bool m_deleteFromIndex;
|
|
|
|
// ptrs to stuff
|
|
SafeBuf m_titleRecBuf;
|
|
key96_t m_titleRecKey;
|
|
|
|
// for isDupOfUs()
|
|
char *m_dupTrPtr;
|
|
int32_t m_dupTrSize;
|
|
|
|
key96_t m_doledbKey;
|
|
SpiderRequest m_sreq;
|
|
SpiderReply m_srep;//newsr;
|
|
|
|
// bool flags for what procedures we have done
|
|
bool m_checkedUrlFilters;
|
|
|
|
bool m_listAdded ;
|
|
bool m_check1 ;
|
|
bool m_check2 ;
|
|
bool m_prepared ;
|
|
bool m_copied1 ;
|
|
bool m_updatingSiteLinkInfoTags ;
|
|
|
|
bool m_didDelay ;
|
|
bool m_didDelayUnregister ;
|
|
bool m_calledMsg22e ;
|
|
bool m_calledMsg22f ;
|
|
bool m_calledMsg25 ;
|
|
bool m_calledSections ;
|
|
bool m_loaded ;
|
|
|
|
bool m_doingConsistencyCheck ;
|
|
|
|
int32_t m_dist;
|
|
|
|
// use to store a \0 list of "titles" of the root page so we can
|
|
// see which if any are the venue name, and thus match that to
|
|
// addresses of the venue on the site, and we can use those addresses
|
|
// as default venue addresses when no venues are listed on a page
|
|
// on that site.
|
|
char m_rootTitleBuf[ROOT_TITLE_BUF_MAX];
|
|
int32_t m_rootTitleBufSize;
|
|
|
|
// . this is filtered
|
|
// . certain punct is replaced with \0
|
|
char m_filteredRootTitleBuf[ROOT_TITLE_BUF_MAX];
|
|
int32_t m_filteredRootTitleBufSize;
|
|
|
|
// like m_rootTitleBuf but for the current page
|
|
char m_titleBuf[ROOT_TITLE_BUF_MAX];
|
|
int32_t m_titleBufSize;
|
|
|
|
bool m_setTr ;
|
|
|
|
void (* m_masterLoop) ( void *state );
|
|
void * m_masterState;
|
|
|
|
void (* m_callback1) ( void *state );
|
|
bool (* m_callback2) ( void *state );
|
|
void *m_state;
|
|
|
|
// the spider priority
|
|
int32_t m_priority;
|
|
|
|
// the download error, like ETIMEDOUT, ENOROUTE, etc.
|
|
int32_t m_downloadStatus;
|
|
|
|
// . when the download was completed. will be zero if no download done
|
|
// . used to set SpiderReply::m_downloadEndTime because we need
|
|
// high resolution for that so we can dole out the next spiderrequest
|
|
// from that IP quickly if the sameipwait is like 500ms.
|
|
int64_t m_downloadEndTime;
|
|
|
|
int32_t m_metaListAllocSize;
|
|
char *m_p;
|
|
char *m_pend;
|
|
|
|
int32_t m_maxCacheAge;
|
|
|
|
bool m_hashedTitle;
|
|
bool m_hashedMetas;
|
|
|
|
int32_t m_niceness;
|
|
|
|
// . one switch per rdb, plus one for the secondary rdbs
// . NOTE(review): presumably each controls whether records for that rdb are
//   included in the meta list -- confirm against XmlDoc.cpp
bool m_usePosdb ;
|
|
bool m_useClusterdb ;
|
|
bool m_useLinkdb ;
|
|
bool m_useSpiderdb ;
|
|
bool m_useTitledb ;
|
|
bool m_useTagdb ;
|
|
bool m_useSecondaryRdbs ;
|
|
|
|
// . pointer to a SafeBuf that this object does not hold by value
// . NOTE(review): presumably a debug/print buffer supplied by the caller and
//   may be NULL -- confirm
SafeBuf *m_pbuf;
|
|
|
|
// store termlist into here if non-null
|
|
bool m_storeTermListInfo;
|
|
// NOTE(review): the accepted values of this sort selector are not visible here
char m_sortTermListBy;
|
|
|
|
// store the terms that we hash into this table so that PageParser.cpp
|
|
// can print what was hashed and with what score and what description
// . m_wts is a pointer; m_wtsTable is a table held by value
// . NOTE(review): presumably m_wts points at m_wtsTable when term storing
//   is enabled and is NULL otherwise -- confirm
|
|
class HashTableX *m_wts;
|
|
HashTableX m_wtsTable;
|
|
// NOTE(review): presumably holds the term text for m_wtsTable entries -- confirm
SafeBuf m_wbuf;
|
|
|
|
// During hashing of the various sources (title, tags, body, ...) we put unique lemmas into this set
|
|
std::unordered_set<std::string> lemma_words;
|
|
|
|
// which set() function was called above to set us?
// NOTE(review): presumably at most one of the four m_setFrom* flags is true
// at a time -- confirm
|
|
bool m_setFromTitleRec;
|
|
bool m_setFromSpiderRec;
|
|
bool m_setFromUrl;
|
|
bool m_setFromDocId;
|
|
// NOTE(review): presumably whether this object owns, and must free, the
// link info it holds -- confirm
bool m_freeLinkInfo1;
|
|
// NOTE(review): presumably true when the content was supplied by an
// injection rather than downloaded -- confirm
bool m_contentInjected;
|
|
|
|
bool m_recycleContent;
|
|
bool m_docRebuild;
|
|
|
|
// . raw utf8 content buffer, its size, and its allocated size
// . the alloc size is tracked separately because the buffer is sometimes
//   overallocated
// . NOTE(review): whether the sizes count a terminating \0 is not visible here
char *m_rawUtf8Content;
|
|
int32_t m_rawUtf8ContentSize;
|
|
int32_t m_rawUtf8ContentAllocSize; // we overallocate sometimes
|
|
// NOTE(review): what "expanded" means (e.g. frame/iframe expansion) is not
// stated in this header -- confirm
char *m_expandedUtf8Content;
|
|
int32_t m_expandedUtf8ContentSize;
|
|
// NOTE(review): m_savedp / m_oldp are undocumented cursors; presumably saved
// positions used while expanding the content -- confirm
char *m_savedp;
|
|
char *m_oldp;
|
|
bool m_didExpansion;
|
|
SafeBuf m_esbuf;
|
|
|
|
// used by msg13
|
|
class Msg13Request *m_r;
|
|
|
|
// NOTE(review): presumably set once this object's buffers were released -- confirm
bool m_freed;
|
|
|
|
bool m_indexedDoc; // indexDoc() performed completely
|
|
|
|
// NOTE(review): presumably track an outstanding Msg4 add (waiting to launch
// vs. already launched) -- confirm
bool m_msg4Waiting;
|
|
bool m_msg4Launched;
|
|
|
|
// NOTE(review): presumably m_blockedDoc records that the doc matched a block
// list, and the m_checked*BlockList flags record that each list (url, dns,
// ip) has already been consulted -- confirm
bool m_blockedDoc;
|
|
bool m_checkedUrlBlockList;
|
|
bool m_checkedDnsBlockList;
|
|
bool m_checkedIpBlockList;
|
|
|
|
// NOTE(review): scale and meaning of "page temperature" are not visible here
unsigned m_defaultSitePageTemperature;
|
|
// NOTE(review): presumably set once the site median page temperature
// service has been called -- confirm
bool m_calledServiceSiteMedianPageTemperature;
|
|
|
|
// . m_parsedRobotsMetaTag is declared alongside the four m_robotsNo* flags
// . NOTE(review): presumably it is set once the robots meta tag has been
//   parsed, and the m_robotsNo* flags hold the directives found (noindex,
//   nofollow, noarchive, nosnippet) and are only meaningful after that
//   -- confirm
bool m_parsedRobotsMetaTag;
|
|
bool m_robotsNoIndex;
|
|
bool m_robotsNoFollow;
|
|
bool m_robotsNoArchive;
|
|
bool m_robotsNoSnippet;
|
|
|
|
// NOTE(review): presumably the name servers of the url's host, and its
// resolved IPv4 addresses stored as uint32_t (byte order not visible here)
std::vector<std::string> m_hostNameServers;
|
|
std::vector<uint32_t> m_ips;
|
|
|
|
bool m_addSpiderRequest;
|
|
|
|
// word spam detection
// . declarations only; the implementations are not in this header
// . getWordSpamVec() returns a char pointer
//   NOTE(review): presumably one spam rank byte per word -- confirm
// . setSpam() takes a read-only int32_t array "profile" with a length
//   "plen", a word count, and a writable unsigned char buffer "spam";
//   returns bool
//   NOTE(review): presumably "spam" is indexed by word number and the return
//   value signals success -- confirm
// . getProbSpam() takes the same profile/plen pair plus a step; returns
//   an int32_t
//   NOTE(review): presumably a spam probability/percentage -- confirm
|
|
char *getWordSpamVec ( );
|
|
bool setSpam ( const int32_t *profile, int32_t plen , int32_t numWords ,
|
|
unsigned char *spam );
|
|
int32_t getProbSpam ( const int32_t *profile, int32_t plen , int32_t step );
|
|
bool m_isRepeatSpammer;
|
|
int32_t m_numRepeatSpam;
|
|
|
|
// frag vector (repeated fragments). 0 means repeated, 1 means not.
|
|
// vector is 1-1 with words in the document body.
// . declaration only; returns a char pointer to that vector
// . NOTE(review): the error / would-block return convention is not visible
//   in this header -- confirm before testing the result
|
|
char *getFragVec ( );
|
|
|
|
// . inject a document for the given url into the given collection
// . declaration only; the implementation is not in this header
// . the first thirteen parameters (url .. callback) are required
// . the trailing six are optional and default to:
//   firstIndexedTime=0, lastSpideredDate=0, injectDocIp=0,
//   redirUrl=nullptr, indexCode=0, httpStatus=200
// . contentTypeStr is a mime type string like "text/html" or "text/xml"
// . newOnly means "index iff new"
// . callback takes a single void* argument
//   NOTE(review): presumably it is invoked with "state" when the injection
//   completes -- confirm
// . NOTE(review): the meaning of the bool return value (e.g. false if
//   blocked, true if done) is not visible here -- confirm in XmlDoc.cpp
bool injectDoc(const char *url,
|
|
class CollectionRec *cr,
|
|
char *content,
|
|
bool contentHasMime,
|
|
int32_t charset,
|
|
int32_t langId,
|
|
bool deleteUrl,
|
|
const char *contentTypeStr, // text/html, text/xml etc.
|
|
bool spiderLinks,
|
|
char newOnly, // index iff new
|
|
bool skipContentHashCheck,
|
|
void *state,
|
|
void (*callback)(void *state),
|
|
|
|
uint32_t firstIndexedTime = 0,
|
|
uint32_t lastSpideredDate = 0,
|
|
int32_t injectDocIp = 0,
|
|
const char *redirUrl = nullptr,
|
|
int32_t indexCode = 0,
|
|
int16_t httpStatus = 200);
|
|
|
|
// . timing helpers, declared as a pair
// . logQueryTimingStart() returns an int64_t; logQueryTimingEnd() takes a
//   function name and an int64_t startTime
// . NOTE(review): presumably the value returned by logQueryTimingStart() is
//   what gets passed back as startTime; time units are not visible here
int64_t logQueryTimingStart();
|
|
void logQueryTimingEnd(const char* function, int64_t startTime);
|
|
|
|
// NOTE(review): presumably invokes whichever of m_callback1/m_callback2 is
// set, passing m_state -- confirm
void callCallback();
|
|
|
|
// NOTE(review): presumably set once the site-num-inlinks service has been
// called -- confirm
bool m_calledServiceSiteNumInlinks;
|
|
};
|
|
|
|
// . PageParser.cpp uses this class for printing hashed terms out by calling
|
|
//   XmlDoc::print()
|
|
// . we store TermDebugInfos into XmlDoc::m_wtsTable, a HashTableX
|
|
// . one for each term hashed
|
|
// . the key is the termId. dups are allowed
|
|
// . the term itself is stored into a separate buffer, m_wbuf, a SafeBuf, so
|
|
//   that a TermDebugInfo can reference it (presumably via m_termOff/m_termLen) and it won't disappear on us
|
|
// . debug record describing one hashed term; all members are public data
// . NOTE(review): instances are presumably copied around as raw bytes, so
//   keep the member order and types unchanged unless every user is checked
class TermDebugInfo {
|
|
public:
|
|
// NOTE(review): presumably byte offset and length of the term text within
// XmlDoc::m_wbuf -- confirm
int32_t m_termOff;
|
|
int32_t m_termLen;
|
|
int32_t m_descOff; // the description offset
|
|
int32_t m_prefixOff; // the prefix offset, like "site" or "gbadid"
|
|
int64_t m_termId;
|
|
int32_t m_date;
|
|
bool m_shardByTermId;
|
|
|
|
// NOTE(review): the fields below look like the per-term attributes that are
// also encoded into the posdb key (m_key) -- confirm against Posdb.h
char m_langId;
|
|
char m_diversityRank;
|
|
char m_densityRank;
|
|
char m_wordSpamRank;
|
|
char m_hashGroup;
|
|
int32_t m_wordNum;
|
|
int32_t m_wordPos;
|
|
posdbkey_t m_key; // key144_t
|
|
// 0 = not a syn, 1 = syn from presets,2=wikt,3=generated
|
|
char m_synSrc;
|
|
// NOTE(review): presumably a bit vector with one bit per language -- confirm
int64_t m_langBitVec64;
|
|
};
|
|
|
|
#endif // GB_XMLDOC_H
|