2687 lines
78 KiB
C++
2687 lines
78 KiB
C++
// Matt Wells, copyright Apr 2009
|
|
|
|
// . 2. you can also call setTitleRec() and then call getMetaList()
|
|
// . this class is used by Repair.cpp and by Msg7 (inject) and SpiderLoop.cpp
|
|
// . Msg7 and Repair.cpp and injections can also set more than just
|
|
// m_firstUrl, like m_content, etc. or whatever elements are known, but
|
|
// they must also set the corresponding "valid" flags of those elements
|
|
// . both methods must yield exactly the same result, the same "meta list"
|
|
// . after setting the contained classes XmlDoc::setMetaList() makes the list
|
|
// of rdb records to be added to all the rdbs, this is the "meta list"
|
|
// . the meta list is made by hashing all the termIds/scores into some hash
|
|
// tables in order to accumulate scores, then the hash table are serialized
|
|
// into the "meta list"
|
|
// . the meta list is added to all rdbs with a simple call to
|
|
// Msg4::addMetaList(), which is only called by Msg14 or Repair.cpp for now
|
|
|
|
|
|
#ifndef _XMLDOC_H_
|
|
#define _XMLDOC_H_
|
|
|
|
//#include "HashTableX.h"
|
|
#include "Lang.h"
|
|
#include "Words.h"
|
|
#include "Bits.h"
|
|
#include "Pos.h"
|
|
#include "Phrases.h"
|
|
//#include "Synonyms.h"
|
|
//#include "Weights.h"
|
|
#include "Xml.h"
|
|
#include "LangList.h"
|
|
#include "SafeBuf.h"
|
|
#include "Images.h"
|
|
#include "Sections.h"
|
|
#include "Msge0.h"
|
|
#include "Msge1.h"
|
|
//#include "Msge2.h"
|
|
#include "Msg4.h"
|
|
#include "Msg8b.h"
|
|
|
|
#include "SearchInput.h"
|
|
#include "Msg40.h"
|
|
#include "Dates.h"
|
|
//#include "IndexList.h"
|
|
#include "Msg0.h"
|
|
#include "Msg22.h"
|
|
#include "Tagdb.h"
|
|
#include "Url.h"
|
|
#include "Linkdb.h"
|
|
//#include "LinkInfo.h"
|
|
//#include "Msg25.h"
|
|
#include "MsgC.h"
|
|
#include "Msg13.h"
|
|
#include "RdbList.h"
|
|
#include "SiteGetter.h"
|
|
//#include "CollectionRec.h"
|
|
#include "Msg20.h"
|
|
#include "Matches.h"
|
|
#include "Query.h"
|
|
#include "Title.h"
|
|
#include "Summary.h"
|
|
#include "Msg8b.h"
|
|
#include "Address.h"
|
|
#include "zlib.h" // Z_OK
|
|
#include "Spider.h" // SpiderRequest/SpiderReply definitions
|
|
#include "HttpMime.h" // ET_DEFLAT
|
|
#include "Msg1.h"
|
|
#include "PingServer.h"
|
|
#include "Json.h"
|
|
|
|
//#define XMLDOC_MAX_AD_IDS 4
|
|
//#define XMLDOC_ADLEN 64
|
|
|
|
#define MAXFRAGWORDS 80000
|
|
|
|
#define MAX_WIKI_DOCIDS 20
|
|
|
|
#define MAX_TAG_PAIR_HASHES 100
|
|
|
|
#include "Msg40.h"
|
|
//#define SAMPLE_VECTOR_SIZE (32*4)
|
|
|
|
#define POST_VECTOR_SIZE (32*4)
|
|
|
|
#define XD_GQ_MAX_SIZE 1000
|
|
#define XD_MAX_GIGABIT_HASHES 48
|
|
|
|
#define XD_MAX_AD_IDS 5
|
|
|
|
double getTrafficPercent ( int32_t rank ) ;
|
|
|
|
bool setLangVec ( class Words *words ,
|
|
class SafeBuf *langBuf ,
|
|
class Sections *sections ,
|
|
int32_t niceness ) ;
|
|
|
|
char *getJSONFieldValue ( char *json, char *field , int32_t *valueLen ) ;
|
|
|
|
bool logQueryLogs ( );
|
|
|
|
bool checkRegex ( SafeBuf *regex ,
|
|
char *target ,
|
|
bool *boolVal ,
|
|
bool *boolValValid ,
|
|
int32_t *compileError ,
|
|
CollectionRec *cr ) ;
|
|
|
|
// Address.cpp calls this to make a vector from the "place name" for comparing
|
|
// to other places in placedb using the computeSimilarity() function. if
|
|
// we got a >75% similarity we set the AF_VERIFIED_PLACE_NAME bit in the
|
|
// Address::m_flags for that address on the web page.
|
|
int32_t makeSimpleWordVector ( char *s, int32_t *vbuf, int32_t vbufSize, int32_t niceness);
|
|
|
|
// this is used for making the event summary/title vectors as well as in
|
|
// Msg40.cpp where it merges events and does not want to repetitively display
|
|
// the same summary lines for an event
|
|
bool getWordVector ( char *s ,
|
|
HashTableX *ht ,
|
|
uint32_t *d ,
|
|
int32_t *nd ,
|
|
int32_t ndmax ) ;
|
|
|
|
bool getDensityRanks ( int64_t *wids ,
|
|
int32_t nw,
|
|
//int32_t wordStart ,
|
|
//int32_t wordEnd ,
|
|
int32_t hashGroup ,
|
|
SafeBuf *densBuf ,
|
|
Sections *sections ,
|
|
int32_t niceness );
|
|
|
|
// diversity vector
|
|
bool getDiversityVec ( class Words *words ,
|
|
class Phrases *phrases ,
|
|
class HashTableX *countTable ,
|
|
class SafeBuf *sbWordVec ,
|
|
//class SafeBuf *sbPhraseVec ,
|
|
int32_t niceness );
|
|
|
|
float computeSimilarity ( int32_t *vec0 ,
|
|
int32_t *vec1 ,
|
|
// corresponding scores vectors
|
|
int32_t *s0 ,
|
|
int32_t *s1 ,
|
|
class Query *q ,
|
|
int32_t niceness ,
|
|
// only Sections::addDateBasedImpliedSections()
|
|
// sets this to true right now. if set to true
|
|
// we essentially dedup each vector, although
|
|
// the score is compounded into the remaining
|
|
// occurrence. i'm not sure if that is the right
|
|
// behavior though.
|
|
bool dedupVecs = false );
|
|
|
|
bool isSimilar_sorted ( int32_t *vec0 ,
|
|
int32_t *vec1 ,
|
|
int32_t nv0 , // how many int32_ts in vec?
|
|
int32_t nv1 , // how many int32_ts in vec?
|
|
// they must be this similar or more to return true
|
|
int32_t percentSimilar,
|
|
int32_t niceness ) ;
|
|
|
|
// this is called by Msg40.cpp to set "top"
|
|
int32_t intersectGigabits ( Msg20 **mp , // search results
|
|
int32_t nmp ,
|
|
uint8_t langId , // searcher's langId
|
|
int32_t maxTop ,
|
|
int32_t docsToScan ,
|
|
int32_t minDocCount , // must be in this # docs
|
|
class GigabitInfo *top ,
|
|
int32_t niceness ) ;
|
|
|
|
int32_t getDirtyPoints ( char *s , int32_t len , int32_t niceness , char *logUrl ) ;
|
|
|
|
bool storeTerm ( char *s ,
|
|
int32_t slen ,
|
|
int64_t termId ,
|
|
class HashInfo *hi ,
|
|
int32_t wordNum ,
|
|
int32_t wordPos ,
|
|
char densityRank ,
|
|
char diversityRank ,
|
|
char wordSpamRank ,
|
|
char hashGroup ,
|
|
//bool isPhrase ,
|
|
class SafeBuf *wbuf ,
|
|
class HashTableX *wts ,
|
|
char synSrc ,
|
|
char langId ,
|
|
POSDBKEY key ) ;
|
|
|
|
// tell zlib to use our malloc/free functions
|
|
int gbuncompress ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen );
|
|
|
|
int gbcompress ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ,
|
|
int32_t encoding = ET_DEFLATE);
|
|
|
|
int gbcompress7 ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ,
|
|
bool compress = true );
|
|
|
|
int gbuncompress7 ( unsigned char *dest ,
|
|
uint32_t *destLen ,
|
|
unsigned char *source ,
|
|
uint32_t sourceLen ) ;
|
|
|
|
|
|
uint32_t score8to32 ( uint8_t score8 );
|
|
|
|
// for Msg13.cpp
|
|
char getContentTypeFromContent ( char *p , int32_t niceness ) ;
|
|
|
|
// . for Msg13.cpp
|
|
// . *pend must equal \0
|
|
int32_t getContentHash32Fast ( unsigned char *p ,
|
|
int32_t plen ,
|
|
int32_t niceness ) ;
|
|
|
|
uint16_t getCharsetFast ( class HttpMime *mime,
|
|
char *url ,
|
|
char *s ,
|
|
int32_t slen ,
|
|
int32_t niceness );
|
|
|
|
//#define MAX_CONTACT_OUTLINKS 5
|
|
|
|
#define MAX_CONTACT_ADDRESSES 20
|
|
#define EMAILBUFSIZE 512
|
|
|
|
#define ROOT_TITLE_BUF_MAX 512
|
|
|
|
// store the subsentences in an array now
|
|
class SubSent {
|
|
public:
|
|
sentflags_t m_subSentFlags;
|
|
//esflags_t m_esflags;
|
|
int32_t m_senta;
|
|
int32_t m_sentb;
|
|
int32_t m_subEnding;
|
|
float m_titleScore;
|
|
};
|
|
|
|
#define MAX_XML_DOCS 4
|
|
|
|
#define MAXMSG7S 50
|
|
|
|
class XmlDoc {
|
|
|
|
public:
|
|
|
|
// . variable size rdb records all start with key then dataSize
|
|
// . do not do that here since we compress our record's data!!
|
|
//key_t m_titleRecKey;
|
|
//int32_t m_dataSize;
|
|
|
|
//
|
|
// BEGIN WHAT IS STORED IN THE TITLE REC (Titledb.h)
|
|
//
|
|
|
|
|
|
// headerSize = this->ptr_firstUrl - this->m_headerSize
|
|
uint16_t m_headerSize;
|
|
uint16_t m_version;
|
|
// these flags are used to indicate which ptr_ members are present:
|
|
uint32_t m_internalFlags1;
|
|
int32_t m_ip;
|
|
int32_t m_crawlDelay;
|
|
// . use this to quickly detect if doc is unchanged
|
|
// . we can avoid setting Xml and Words classes etc...
|
|
int32_t m_contentHash32;
|
|
// like the above but hash of all tags in TagRec for this url
|
|
//int32_t m_tagHash32;
|
|
// this is a hash of all adjacent tag pairs for templated identificatn
|
|
uint32_t m_tagPairHash32;
|
|
int32_t m_siteNumInlinks;
|
|
//int32_t m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh
|
|
//int32_t m_siteNumInlinksUniqueCBlock; // m_sitePop;
|
|
int32_t m_reserved1;
|
|
int32_t m_reserved2;
|
|
uint32_t m_spideredTime; // time_t
|
|
// just don't throw away any relevant SpiderRequests and we have
|
|
// the data that m_minPubDate and m_maxPubDate provided
|
|
//time_t m_minPubDate;
|
|
//time_t m_maxPubDate;
|
|
uint32_t m_indexedTime; // slightly > m_spideredTime (time_t)
|
|
uint32_t m_reserved32;
|
|
uint32_t m_pubDate; // aka m_datedbDate // time_t
|
|
//time_t m_nextSpiderTime;
|
|
uint32_t m_firstIndexedDate; // time_t
|
|
uint32_t m_outlinksAddedDate; // time_t
|
|
uint16_t m_charset; // the ORIGINAL charset, we are always utf8!
|
|
uint16_t m_countryId;
|
|
//uint16_t m_reserved1;//titleWeight;
|
|
//uint16_t m_reserved2;//headerWeight;
|
|
//int32_t m_siteNumInlinksTotal;
|
|
int32_t m_reserved3;
|
|
//uint16_t m_reserved3;//urlPathWeight;
|
|
uint8_t m_metaListCheckSum8; // bring it back!!
|
|
char m_reserved3b;
|
|
uint16_t m_bodyStartPos;//m_reserved4;//externalLinkTextWeight;
|
|
uint16_t m_reserved5;//internalLinkTextWeight;
|
|
|
|
// a new parm from reserved6. need to know the count so we can
|
|
// delete the json objects derived from this page if we want to
|
|
// delete this page. or if this page is respidered then we get the
|
|
// json objects for it, REject the old json object urls, and inject
|
|
// the new ones i guess.
|
|
uint16_t m_diffbotJSONCount;
|
|
|
|
// these do not include header/footer (dup) addresses
|
|
//int16_t m_numAddresses;
|
|
int16_t m_httpStatus; // -1 if not found (empty http reply)
|
|
|
|
//int8_t m_nextSpiderPriority;
|
|
int8_t m_hopCount;
|
|
//int8_t m_metalistChecksum; // parser checksum
|
|
//uint8_t m_numBannedOutlinks8;
|
|
uint8_t m_langId;
|
|
uint8_t m_rootLangId;
|
|
uint8_t m_contentType;
|
|
|
|
|
|
// bit flags
|
|
uint16_t m_isRSS:1;
|
|
uint16_t m_isPermalink:1;
|
|
uint16_t m_isAdult:1;
|
|
uint16_t m_wasContentInjected:1;//eliminateMenus:1;
|
|
uint16_t m_spiderLinks:1;
|
|
uint16_t m_isContentTruncated:1;
|
|
uint16_t m_isLinkSpam:1;
|
|
uint16_t m_hasAddress:1;
|
|
uint16_t m_hasTOD:1;
|
|
uint16_t m_reserved_sv:1;//hasSiteVenue:1;
|
|
uint16_t m_hasContactInfo:1;
|
|
uint16_t m_isSiteRoot:1;
|
|
|
|
uint16_t m_isDiffbotJSONObject:1;
|
|
uint16_t m_sentToDiffbot:1;
|
|
uint16_t m_gotDiffbotSuccessfulReply:1;
|
|
uint16_t m_useTimeAxis:1; // m_reserved804:1;
|
|
uint16_t m_hasMetadata:1;
|
|
uint16_t m_reserved806:1;
|
|
uint16_t m_reserved807:1;
|
|
uint16_t m_reserved808:1;
|
|
uint16_t m_reserved809:1;
|
|
uint16_t m_reserved810:1;
|
|
uint16_t m_reserved811:1;
|
|
uint16_t m_reserved812:1;
|
|
uint16_t m_reserved813:1;
|
|
uint16_t m_reserved814:1;
|
|
uint16_t m_reserved815:1;
|
|
uint16_t m_reserved816:1;
|
|
|
|
|
|
char *ptr_firstUrl;
|
|
char *ptr_redirUrl;
|
|
//char *ptr_tagRecData;
|
|
char *ptr_rootTitleBuf;
|
|
int32_t *ptr_gigabitHashes;
|
|
int32_t *ptr_gigabitScores;
|
|
int64_t *ptr_adVector;
|
|
int64_t *ptr_wikiDocIds;
|
|
rscore_t *ptr_wikiScores;
|
|
char *ptr_imageData;
|
|
int32_t *ptr_catIds;
|
|
int32_t *ptr_indCatIds;
|
|
char *ptr_dmozTitles;
|
|
char *ptr_dmozSumms;
|
|
char *ptr_dmozAnchors;
|
|
char *ptr_utf8Content;
|
|
//char *ptr_sectionsReply; // votes read from sectiondb - m_osvt
|
|
//char *ptr_sectionsVotes; // our local votes - m_nsvt
|
|
//char *ptr_addressReply;
|
|
//char *ptr_clockCandidatesData;
|
|
char *ptr_metadata;
|
|
// . serialization of the sectiondb and placedb lists
|
|
// . that way we can store just these and not have to store the content
|
|
// of the entire page if we do not need to
|
|
//char *ptr_sectiondbData;
|
|
//char *ptr_placedbData;
|
|
// do not let SiteGetter change this when we re-parse!
|
|
char *ptr_site;
|
|
LinkInfo *ptr_linkInfo1;
|
|
char *ptr_linkdbData;
|
|
char *ptr_sectiondbData;
|
|
char *ptr_tagRecData;
|
|
LinkInfo *ptr_linkInfo2;
|
|
|
|
|
|
int32_t size_firstUrl;
|
|
int32_t size_redirUrl;
|
|
//int32_t size_tagRecData;
|
|
int32_t size_rootTitleBuf;
|
|
int32_t size_gigabitHashes;
|
|
int32_t size_gigabitScores;
|
|
int32_t size_adVector;
|
|
int32_t size_wikiDocIds;
|
|
int32_t size_wikiScores;
|
|
int32_t size_imageData;
|
|
int32_t size_catIds;
|
|
int32_t size_indCatIds;
|
|
int32_t size_dmozTitles;
|
|
int32_t size_dmozSumms;
|
|
int32_t size_dmozAnchors;
|
|
int32_t size_utf8Content;
|
|
//int32_t size_sectionsReply;
|
|
//int32_t size_sectionsVotes;
|
|
//int32_t size_addressReply;
|
|
//int32_t size_clockCandidatesData;
|
|
int32_t size_metadata;
|
|
//int32_t size_sectiondbData;
|
|
//int32_t size_placedbData;
|
|
int32_t size_site;
|
|
int32_t size_linkInfo1;
|
|
int32_t size_linkdbData;
|
|
int32_t size_sectiondbData;
|
|
int32_t size_tagRecData;
|
|
int32_t size_linkInfo2;
|
|
|
|
char m_dummyEnd;
|
|
|
|
//
|
|
// END WHAT IS STORED IN THE TITLE REC (Titledb.h)
|
|
//
|
|
|
|
public:
|
|
|
|
// . returns false and sets errno on error
|
|
// . once you call this you can call setMetaList() below
|
|
// . sets all the contained parser classes, Words, Xml, etc. if they
|
|
// have not already been set! that way Msg16/Msg14 can set bits
|
|
// and pieces here and there and we do not reset what it's done
|
|
// . our m_xml will contain ptrs into titleRec's content, be careful
|
|
// . if titleRec gets freed we should be freed too
|
|
//bool set ( char *titleRec ,
|
|
// class SafeBuf *pbuf = NULL ,
|
|
// int32_t niceness = MAX_NICENESS ,
|
|
// bool justSetLinks = false );
|
|
|
|
// . used by Msg16 to set the Xml to get meta redirect tag's content
|
|
// . used by Msg16 to get <META NAME="ROBOTS" CONTENT="index,follow">
|
|
// . this should be set by Msg16 so it can get meta redirect url
|
|
|
|
|
|
void print ( );
|
|
|
|
bool set1 ( char *url ,
|
|
char *coll,
|
|
SafeBuf *pbuf ,
|
|
int32_t niceness );
|
|
|
|
|
|
bool set2 ( char *titleRec,
|
|
int32_t maxSize,
|
|
char *coll,
|
|
class SafeBuf *p,
|
|
int32_t niceness ,
|
|
class SpiderRequest *sreq = NULL );
|
|
|
|
// . since being set from a docId, we will load the old title rec
|
|
// and use that!
|
|
// . used by PageGet.cpp
|
|
bool set3 ( int64_t docId ,
|
|
char *coll ,
|
|
int32_t niceness );
|
|
|
|
bool set4 ( class SpiderRequest *sreq ,
|
|
key_t *doledbKey ,
|
|
char *coll ,
|
|
class SafeBuf *pbuf ,
|
|
int32_t niceness ,
|
|
char *utf8Content = NULL ,
|
|
bool deleteFromIndex = false ,
|
|
int32_t forcedIp = 0 ,
|
|
uint8_t contentType = CT_HTML ,
|
|
uint32_t spideredTime = 0 , // time_t
|
|
bool contentHasMime = false ,
|
|
// for container docs, what is the separator of subdocs?
|
|
char *contentDelim = NULL,
|
|
char *metadata = NULL,
|
|
uint32_t metadataLen = 0,
|
|
// for injected docs we have the recv, buffer size don't exceed that
|
|
int32_t payloadLen = -1) ;
|
|
|
|
// we now call this right away rather than at download time!
|
|
int32_t getSpideredTime();
|
|
|
|
// time right before adding the termlists to the index, etc.
|
|
// whereas spider time is the download time
|
|
int32_t getIndexedTime();
|
|
|
|
// another entry point, like set3() kinda
|
|
bool loadFromOldTitleRec ();
|
|
|
|
XmlDoc() ;
|
|
~XmlDoc() ;
|
|
void nukeDoc ( class XmlDoc *);
|
|
void reset ( ) ;
|
|
bool setFirstUrl ( char *u , bool addWWW , Url *base = NULL ) ;
|
|
bool setRedirUrl ( char *u , bool addWWW ) ;
|
|
void setStatus ( char *s ) ;
|
|
void setCallback ( void *state, void (*callback) (void *state) ) ;
|
|
void setCallback ( void *state, bool (*callback) (void *state) ) ;
|
|
bool addToSpiderdb ( ) ;
|
|
void getRevisedSpiderRequest ( class SpiderRequest *revisedReq );
|
|
void getRebuiltSpiderRequest ( class SpiderRequest *sreq ) ;
|
|
bool indexDoc ( );
|
|
bool indexDoc2 ( );
|
|
bool isContainerDoc ( );
|
|
bool indexContainerDoc ( );
|
|
|
|
bool readMoreWarc();
|
|
bool indexWarcOrArc ( ) ;
|
|
key_t *getTitleRecKey() ;
|
|
//char *getSkipIndexing ( );
|
|
char *prepareToMakeTitleRec ( ) ;
|
|
// store TitleRec into "buf" so it can be added to metalist
|
|
bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 );
|
|
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
|
|
SafeBuf *getTitleRecBuf ( );
|
|
bool appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) ;
|
|
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
|
|
bool forDelete ) ;
|
|
SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
|
|
bool setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t ssDocId ) ;
|
|
SafeBuf m_spiderStatusDocMetaList;
|
|
char *getIsAdult ( ) ;
|
|
int32_t **getIndCatIds ( ) ;
|
|
int32_t **getCatIds ( ) ;
|
|
class CatRec *getCatRec ( ) ;
|
|
|
|
int32_t *getNumDmozEntries() ;
|
|
char **getDmozTitles ( ) ;
|
|
char **getDmozSummaries ( ) ;
|
|
char **getDmozAnchors ( ) ;
|
|
bool setDmozInfo () ;
|
|
|
|
int64_t **getWikiDocIds ( ) ;
|
|
void gotWikiResults ( class UdpSlot *slot );
|
|
int32_t *getPubDate ( ) ;
|
|
//class DateParse2 *getDateParse2 ( ) ;
|
|
class Dates *getSimpleDates();
|
|
class Dates *getDates();
|
|
//class HashTableX *getClockCandidatesTable();
|
|
int32_t getUrlPubDate ( ) ;
|
|
int32_t getOutlinkAge ( int32_t outlinkNum ) ;
|
|
char *getIsPermalink ( ) ;
|
|
char *getIsUrlPermalinkFormat ( ) ;
|
|
char *getIsRSS ( ) ;
|
|
char *getIsSiteMap ( ) ;
|
|
class Xml *getXml ( ) ;
|
|
uint8_t *getLangVector ( ) ;
|
|
uint8_t *getLangId ( ) ;
|
|
char computeLangId ( Sections *sections ,Words *words , char *lv ) ;
|
|
class Words *getWords ( ) ;
|
|
class Bits *getBits ( ) ;
|
|
class Bits *getBitsForSummary ( ) ;
|
|
class Pos *getPos ( );
|
|
class Phrases *getPhrases ( ) ;
|
|
//class Synonyms *getSynonyms ( );
|
|
class Sections *getExplicitSections ( ) ;
|
|
class Sections *getImpliedSections ( ) ;
|
|
class Sections *getSections ( ) ;
|
|
class Sections *getSectionsWithDupStats ( );
|
|
class SafeBuf *getInlineSectionVotingBuf();
|
|
bool gotSectionFacets( class Multicast *mcast );
|
|
class SectionStats *getSectionStats ( uint32_t secHash32 ,
|
|
uint32_t sentHash32 ,
|
|
bool cacheOnly );
|
|
class SectionVotingTable *getOldSectionVotingTable();
|
|
class SectionVotingTable *getNewSectionVotingTable();
|
|
char **getSectionsReply ( ) ;
|
|
char **getSectionsVotes ( ) ;
|
|
HashTableX *getSectionVotingTable();
|
|
int32_t *getLinkSiteHashes ( );
|
|
class Links *getLinks ( bool doQuickSet = false ) ;
|
|
class HashTableX *getCountTable ( ) ;
|
|
bool hashString_ct ( class HashTableX *ht, char *s , int32_t slen ) ;
|
|
uint8_t *getSummaryLangId ( ) ;
|
|
int32_t *getSummaryVector ( ) ;
|
|
int32_t *getPageSampleVector ( ) ;
|
|
int32_t *getPostLinkTextVector ( int32_t linkNode ) ;
|
|
int32_t computeVector ( class Sections *sections, class Words *words,
|
|
uint32_t *vec , int32_t start = 0 , int32_t end = -1 );
|
|
float *getTagSimilarity ( class XmlDoc *xd2 ) ;
|
|
float *getGigabitSimilarity ( class XmlDoc *xd2 ) ;
|
|
float *getPageSimilarity ( class XmlDoc *xd2 ) ;
|
|
float *getPercentChanged ( );
|
|
uint64_t *getFuzzyDupHash ( );
|
|
int64_t *getExactContentHash64();
|
|
int64_t *getLooseContentHash64();
|
|
class RdbList *getDupList ( ) ;
|
|
class RdbList *getLikedbListForReq ( );
|
|
class RdbList *getLikedbListForIndexing ( );
|
|
int32_t addLikedbRecords ( bool justGetSize ) ;
|
|
char *getIsDup ( ) ;
|
|
char *isDupOfUs ( int64_t d ) ;
|
|
uint32_t *getGigabitVectorScorelessHash ( ) ;
|
|
int32_t **getGigabitHashes ( );
|
|
char *getGigabitQuery ( ) ;
|
|
char *getMetaDescription( int32_t *mdlen ) ;
|
|
char *getMetaSummary ( int32_t *mslen ) ;
|
|
char *getMetaKeywords( int32_t *mklen ) ;
|
|
char *getMetadata(int32_t* retlen);
|
|
bool addGigabits ( char *s , int64_t docId , uint8_t langId ) ;
|
|
bool addGigabits2 ( char *s,int32_t slen,int64_t docId,uint8_t langId);
|
|
bool addGigabits ( class Words *ww ,
|
|
int64_t docId,
|
|
class Sections *sections,
|
|
//class Weights *we ,
|
|
uint8_t langId );
|
|
|
|
int32_t *getSiteSpiderQuota ( ) ;
|
|
class Url *getCurrentUrl ( ) ;
|
|
class Url *getFirstUrl() ;
|
|
int64_t getFirstUrlHash48();
|
|
int64_t getFirstUrlHash64();
|
|
class Url **getLastRedirUrl() ;
|
|
class Url **getRedirUrl() ;
|
|
class Url **getMetaRedirUrl() ;
|
|
class Url **getCanonicalRedirUrl ( ) ;
|
|
int32_t *getFirstIndexedDate ( ) ;
|
|
int32_t *getOutlinksAddedDate ( ) ;
|
|
//int32_t *getNumBannedOutlinks ( ) ;
|
|
uint16_t *getCountryId ( ) ;
|
|
class XmlDoc **getOldXmlDoc ( ) ;
|
|
//bool isRobotsTxtFile ( char *url , int32_t urlLen ) ;
|
|
class XmlDoc **getExtraDoc ( char *url , int32_t maxCacheAge = 0 ) ;
|
|
bool getIsPageParser ( ) ;
|
|
class XmlDoc **getRootXmlDoc ( int32_t maxCacheAge = 0 ) ;
|
|
//class XmlDoc **getGatewayXmlDoc ( ) ;
|
|
// . returns false if blocked, true otherwise.
|
|
// . returns true and sets g_errno on error
|
|
//bool setFromOldTitleRec ( ) ;
|
|
//RdbList *getOldMetaList ( ) ;
|
|
char **getOldTitleRec ( );
|
|
uint8_t *getRootLangId ();
|
|
//bool *updateRootLangId ( );
|
|
char **getRootTitleRec ( ) ;
|
|
//char **getContactTitleRec ( char *url ) ;
|
|
int64_t *getAvailDocIdOnly ( int64_t preferredDocId ) ;
|
|
int64_t *getDocId ( ) ;
|
|
char *getIsIndexed ( ) ;
|
|
class TagRec *getTagRec ( ) ;
|
|
char *getHasContactInfo ( ) ;
|
|
char *getIsThisDocContacty ( );
|
|
bool *getHasTOD();
|
|
bool *getHasSiteVenue();
|
|
// non-dup/nondup addresses only
|
|
bool *getHasAddress();
|
|
class Addresses *getAddresses ( ) ;
|
|
Address **getContactAddresses ( );
|
|
int32_t *getNumOfficialEmails ( ) ;
|
|
char *getEmailBuf ( ) ;
|
|
int32_t *getNumContactAddresses ( );
|
|
int32_t addEmailTags ( class Xml *xml , class Words *ww ,
|
|
class TagRec *gr , int32_t ip ) ;
|
|
//class Url *getContactUsLink ( ) ;
|
|
//class Url *getAboutUsLink ( ) ;
|
|
int32_t *getFirstIp ( ) ;
|
|
bool *updateFirstIp ( ) ;
|
|
//int32_t *getSiteNumInlinksUniqueIp ( ) ;
|
|
//int32_t *getSiteNumInlinksUniqueCBlock ( ) ;
|
|
//int32_t *getSiteNumInlinksTotal ( );
|
|
//int32_t *getSiteNumInlinksFresh ( ) ;
|
|
//int32_t *getSitePop ( ) ;
|
|
uint8_t *getSiteNumInlinks8 () ;
|
|
int32_t *getSiteNumInlinks ( ) ;
|
|
class LinkInfo *getSiteLinkInfo() ;
|
|
int32_t *getIp ( ) ;
|
|
int32_t *gotIp ( bool save ) ;
|
|
bool *getIsAllowed ( ) ;
|
|
int32_t *getFinalCrawlDelay();
|
|
int32_t m_finalCrawlDelay;
|
|
//int32_t getTryAgainTimeDelta() {
|
|
// if ( ! m_tryAgainTimeDeltaValid ) { char *xx=NULL;*xx=0;}
|
|
// return m_tryAgainTimeDelta;
|
|
//};
|
|
char *getIsWWWDup ( ) ;
|
|
class LinkInfo *getLinkInfo1 ( ) ;
|
|
class LinkInfo **getLinkInfo2 ( ) ;
|
|
char *getSite ( ) ;
|
|
void gotSite ( ) ;
|
|
int64_t *getSiteHash64 ( ) ;
|
|
int32_t *getSiteHash32 ( ) ;
|
|
char **getHttpReply ( ) ;
|
|
char **getHttpReply2 ( ) ;
|
|
char **gotHttpReply ( ) ;
|
|
char *getIsContentTruncated ( );
|
|
int32_t *getDownloadStatus ( ) ;
|
|
int64_t *getDownloadEndTime ( ) ;
|
|
int16_t *getHttpStatus ( );
|
|
char waitForTimeSync ( ) ;
|
|
bool m_alreadyRegistered;
|
|
class HttpMime *getMime () ;
|
|
char **getContent ( ) ;
|
|
uint8_t *getContentType ( ) ;
|
|
uint16_t *getCharset ( ) ;
|
|
char *getIsBinary ( ) ;
|
|
char **getFilteredContent ( ) ;
|
|
void filterStart_r ( bool amThread ) ;
|
|
char **getRawUtf8Content ( ) ;
|
|
char **getExpandedUtf8Content ( ) ;
|
|
char **getUtf8Content ( ) ;
|
|
// we download large files to a file on disk, like warcs and arcs
|
|
FILE *getUtf8ContentInFile ( );
|
|
int32_t *getContentHash32 ( ) ;
|
|
int32_t *getContentHashJson32 ( ) ;
|
|
//int32_t *getTagHash32 ( ) ;
|
|
int32_t *getTagPairHashVector ( ) ;
|
|
uint32_t *getTagPairHash32 ( ) ;
|
|
int32_t getHostHash32a ( ) ;
|
|
int32_t getHostHash32b ( ) ;
|
|
int32_t getDomHash32 ( );
|
|
char **getThumbnailData();
|
|
class Images *getImages ( ) ;
|
|
int8_t *getNextSpiderPriority ( ) ;
|
|
int32_t *getPriorityQueueNum ( ) ;
|
|
class TagRec ***getOutlinkTagRecVector () ;
|
|
char *hasNoIndexMetaTag();
|
|
char *hasFakeIpsMetaTag ( );
|
|
int32_t **getOutlinkFirstIpVector () ;
|
|
//char **getOutlinkIsIndexedVector () ;
|
|
int32_t *getRegExpNum ( int32_t outlinkNum ) ;
|
|
int32_t *getRegExpNum2 ( int32_t outlinkNum ) ;
|
|
char *getIsSiteRoot ( ) ;
|
|
bool getIsOutlinkSiteRoot ( char *u , class TagRec *gr ) ;
|
|
int8_t *getHopCount ( ) ;
|
|
//int8_t *getOutlinkHopCountVector ( ) ;
|
|
char *getSpiderLinks ( ) ;
|
|
int32_t *getNextSpiderTime ( ) ;
|
|
//char *getIsSpam() ;
|
|
char *getIsFiltered ();
|
|
bool getIsInjecting();
|
|
int32_t *getSpiderPriority ( ) ;
|
|
int32_t *getIndexCode ( ) ;
|
|
int32_t *getIndexCode2 ( ) ;
|
|
SafeBuf *getNewTagBuf ( ) ;
|
|
|
|
char *updateTagdb ( ) ;
|
|
bool logIt ( class SafeBuf *bb = NULL ) ;
|
|
bool m_doConsistencyTesting;
|
|
bool doConsistencyTest ( bool forceTest ) ;
|
|
int32_t printMetaList ( ) ;
|
|
void printMetaList ( char *metaList , char *metaListEnd ,
|
|
class SafeBuf *pbuf );
|
|
bool verifyMetaList ( char *p , char *pend , bool forDelete ) ;
|
|
bool hashMetaList ( class HashTableX *ht ,
|
|
char *p ,
|
|
char *pend ,
|
|
bool checkList ) ;
|
|
|
|
char *getMetaList ( bool forDelete = false );
|
|
|
|
char *getDiffbotParentUrl( char *myUrl );
|
|
|
|
int64_t m_diffbotReplyEndTime;
|
|
int64_t m_diffbotReplyStartTime;
|
|
int32_t m_diffbotReplyRetries;
|
|
|
|
bool m_sentToDiffbotThisTime;
|
|
|
|
uint64_t m_downloadStartTime;
|
|
//uint64_t m_downloadEndTime;
|
|
|
|
uint64_t m_ipStartTime;
|
|
uint64_t m_ipEndTime;
|
|
|
|
bool m_updatedMetaData;
|
|
|
|
void copyFromOldDoc ( class XmlDoc *od ) ;
|
|
|
|
class SpiderReply *getFakeSpiderReply ( );
|
|
|
|
// we add a SpiderReply to spiderdb when done spidering, even if
|
|
// m_indexCode or g_errno was set!
|
|
class SpiderReply *getNewSpiderReply ( );
|
|
|
|
|
|
SpiderRequest **getRedirSpiderRequest ( );
|
|
SpiderRequest m_redirSpiderRequest;
|
|
SpiderRequest *m_redirSpiderRequestPtr;
|
|
|
|
|
|
void setSpiderReqForMsg20 ( class SpiderRequest *sreq ,
|
|
class SpiderReply *srep );
|
|
|
|
|
|
char *addOutlinkSpiderRecsToMetaList ( );
|
|
|
|
//bool addTable96 ( class HashTableX *tt1 ,
|
|
// int32_t date1 ,
|
|
// bool nosplit ) ;
|
|
|
|
int32_t getSiteRank ();
|
|
bool addTable144 ( class HashTableX *tt1 ,
|
|
int64_t docId ,
|
|
class SafeBuf *buf = NULL );
|
|
|
|
bool addTable224 ( HashTableX *tt1 ) ;
|
|
|
|
//bool addTableDate ( class HashTableX *tt1 , //T<key128_t,char> *tt1
|
|
// uint64_t docId ,
|
|
// uint8_t rdbId ,
|
|
// bool nosplit ) ;
|
|
|
|
bool addTable128 ( class HashTableX *tt1 , // T <key128_t,char>*tt1
|
|
uint8_t rdbId ,
|
|
bool forDelete ) ;
|
|
|
|
bool hashNoSplit ( class HashTableX *tt ) ;
|
|
char *hashAll ( class HashTableX *table ) ;
|
|
int32_t getBoostFromSiteNumInlinks ( int32_t inlinks ) ;
|
|
bool hashSpiderReply (class SpiderReply *reply ,class HashTableX *tt) ;
|
|
bool hashMetaTags ( class HashTableX *table ) ;
|
|
bool hashMetaData ( class HashTableX *table ) ;
|
|
bool hashIsClean ( class HashTableX *table ) ;
|
|
bool hashZipCodes ( class HashTableX *table ) ;
|
|
bool hashMetaZip ( class HashTableX *table ) ;
|
|
bool hashContentType ( class HashTableX *table ) ;
|
|
bool hashDMOZCategories ( class HashTableX *table ) ;
|
|
bool hashLinks ( class HashTableX *table ) ;
|
|
bool getUseTimeAxis ( ) ;
|
|
SafeBuf *getTimeAxisUrl ( );
|
|
bool hashUrl ( class HashTableX *table );
|
|
bool hashDateNumbers ( class HashTableX *tt );
|
|
bool hashSections ( class HashTableX *table ) ;
|
|
bool hashIncomingLinkText ( class HashTableX *table ,
|
|
bool hashAnomalies ,
|
|
bool hashNonAnomalies ) ;
|
|
|
|
bool hashLinksForLinkdb ( class HashTableX *table ) ;
|
|
bool hashNeighborhoods ( class HashTableX *table ) ;
|
|
bool hashRSSInfo ( class HashTableX *table ) ;
|
|
bool hashRSSTerm ( class HashTableX *table , bool inRSS ) ;
|
|
bool hashTitle ( class HashTableX *table );
|
|
bool hashBody2 ( class HashTableX *table );
|
|
bool hashMetaKeywords ( class HashTableX *table );
|
|
bool hashMetaSummary ( class HashTableX *table );
|
|
bool linksToGigablast ( ) ;
|
|
bool searchboxToGigablast ( ) ;
|
|
bool hashLanguage ( class HashTableX *table ) ;
|
|
bool hashLanguageString ( class HashTableX *table ) ;
|
|
bool hashCountry ( class HashTableX *table ) ;
|
|
bool hashSiteNumInlinks ( class HashTableX *table ) ;
|
|
bool hashCharset ( class HashTableX *table ) ;
|
|
bool hashTagRec ( class HashTableX *table ) ;
|
|
bool hashPermalink ( class HashTableX *table ) ;
|
|
bool hashVectors(class HashTableX *table ) ;
|
|
bool hashAds(class HashTableX *table ) ;
|
|
class Url *getBaseUrl ( ) ;
|
|
bool hashSubmitUrls ( class HashTableX *table ) ;
|
|
bool hashImageStuff ( class HashTableX *table ) ;
|
|
bool hashIsAdult ( class HashTableX *table ) ;
|
|
|
|
void set20 ( Msg20Request *req ) ;
|
|
class Msg20Reply *getMsg20Reply ( ) ;
|
|
char **getDiffbotPrimaryImageUrl ( ) ;
|
|
char **getImageUrl() ;
|
|
class MatchOffsets *getMatchOffsets () ;
|
|
Query *getQuery() ;
|
|
Matches *getMatches () ;
|
|
char *getDescriptionBuf ( char *displayMetas , int32_t *dlen ) ;
|
|
SafeBuf *getHeaderTagBuf();
|
|
class Title *getTitle ();
|
|
class Summary *getSummary () ;
|
|
char *getHighlightedSummary ();
|
|
SafeBuf *getSampleForGigabits ( ) ;
|
|
SafeBuf *getSampleForGigabitsJSON ( ) ;
|
|
char *getIsCompromised ( ) ;
|
|
char *getIsNoArchive ( ) ;
|
|
int32_t *getUrlFilterNum();
|
|
//int32_t *getDiffbotApiNum();
|
|
SafeBuf *getDiffbotApiUrl();
|
|
int64_t **getAdVector ( ) ;
|
|
char *getIsLinkSpam ( ) ;
|
|
char *getIsHijacked();
|
|
char *getIsErrorPage ( ) ;
|
|
char* matchErrorMsg(char* p, char* pend );
|
|
|
|
bool hashWords ( //int32_t wordStart ,
|
|
//int32_t wordEnd ,
|
|
class HashInfo *hi ) ;
|
|
bool hashSingleTerm ( int64_t termId ,
|
|
class HashInfo *hi ) ;
|
|
bool hashSingleTerm ( char *s ,
|
|
int32_t slen ,
|
|
class HashInfo *hi );
|
|
bool hashString ( class HashTableX *ht ,
|
|
//class Weights *we ,
|
|
class Bits *bits ,
|
|
char *s ,
|
|
int32_t slen ) ;
|
|
bool hashString ( char *s ,
|
|
int32_t slen ,
|
|
class HashInfo *hi ) ;
|
|
bool hashString ( char *s ,
|
|
class HashInfo *hi ) ;
|
|
|
|
|
|
|
|
bool hashWords3 ( //int32_t wordStart ,
|
|
//int32_t wordEnd ,
|
|
class HashInfo *hi ,
|
|
class Words *words ,
|
|
class Phrases *phrases ,
|
|
class Synonyms *synonyms ,
|
|
class Sections *sections ,
|
|
class HashTableX *countTable ,
|
|
char *fragVec ,
|
|
char *wordSpamVec ,
|
|
char *langVec ,
|
|
char docLangId , // default lang id
|
|
class SafeBuf *pbuf ,
|
|
class HashTableX *wts ,
|
|
class SafeBuf *wbuf ,
|
|
int32_t niceness );
|
|
|
|
bool hashString3 ( char *s ,
|
|
int32_t slen ,
|
|
class HashInfo *hi ,
|
|
class HashTableX *countTable ,
|
|
class SafeBuf *pbuf ,
|
|
class HashTableX *wts ,
|
|
class SafeBuf *wbuf ,
|
|
int32_t version ,
|
|
int32_t siteNumInlinks ,
|
|
int32_t niceness );
|
|
|
|
|
|
//bool hashSectionTerm ( char *term ,
|
|
// class HashInfo *hi ,
|
|
// int32_t sentHash32 ) ;
|
|
|
|
bool hashFacet1 ( char *term, class Words *words , HashTableX *dt) ;
|
|
|
|
bool hashFacet2 ( char *prefix,char *term,int32_t val32, HashTableX *dt,
|
|
bool shardByTermId = false ) ;
|
|
|
|
// gbfieldmatch:
|
|
bool hashFieldMatchTerm ( char *val, int32_t vlen, class HashInfo *hi);
|
|
|
|
bool hashNumber ( char *beginBuf ,
|
|
char *buf ,
|
|
int32_t bufLen ,
|
|
class HashInfo *hi ) ;
|
|
|
|
bool hashNumber2 ( float f ,
|
|
class HashInfo *hi ,
|
|
char *gbsortByStr ) ;
|
|
|
|
bool hashNumber3 ( int32_t x,
|
|
class HashInfo *hi ,
|
|
char *gbsortByStr ) ;
|
|
|
|
bool storeFacetValues ( char *qs , class SafeBuf *sb ,
|
|
FacetValHash_t fvh ) ;
|
|
bool storeFacetValuesSite ( char *qs , SafeBuf *sb ,
|
|
FacetValHash_t fvh );
|
|
bool storeFacetValuesSections ( char *qs , class SafeBuf *sb ,
|
|
FacetValHash_t fvh ) ;
|
|
bool storeFacetValuesHtml ( char *qs , class SafeBuf *sb ,
|
|
FacetValHash_t fvh ) ;
|
|
bool storeFacetValuesXml ( char *qs , class SafeBuf *sb ,
|
|
FacetValHash_t fvh ) ;
|
|
bool storeFacetValuesJSON ( char *qs , class SafeBuf *sb ,
|
|
FacetValHash_t fvh,
|
|
Json* jp ) ;
|
|
|
|
// print out for PageTitledb.cpp and PageParser.cpp
|
|
bool printDoc ( class SafeBuf *pbuf );
|
|
bool printMenu ( class SafeBuf *pbuf );
|
|
bool printDocForProCog ( class SafeBuf *sb , HttpRequest *hr ) ;
|
|
bool printGeneralInfo ( class SafeBuf *sb , HttpRequest *hr ) ;
|
|
bool printRainbowSections ( class SafeBuf *sb , HttpRequest *hr );
|
|
bool printSiteInlinks ( class SafeBuf *sb , HttpRequest *hr );
|
|
bool printPageInlinks ( class SafeBuf *sb , HttpRequest *hr );
|
|
bool printTermList ( class SafeBuf *sb , HttpRequest *hr );
|
|
bool printSpiderStats ( class SafeBuf *sb , HttpRequest *hr );
|
|
bool printCachedPage ( class SafeBuf *sb , HttpRequest *hr );
|
|
|
|
bool printSerpFiltered ( class Section *sx , char *tagName ) ;
|
|
|
|
char **getTitleBuf ( );
|
|
char **getRootTitleBuf ( );
|
|
char **getFilteredRootTitleBuf ( );
|
|
|
|
// funcs that update our tagdb tagrec, m_tagRec, and also update tagdb
|
|
bool *updateVenueAddresses ( );
|
|
|
|
// called by msg0 handler to add posdb termlists into g_termListCache
|
|
// for faster seo pipeline
|
|
bool cacheTermLists();
|
|
|
|
public:
|
|
|
|
// stuff set from the key of the titleRec, above the compression area
|
|
//key_t m_key;
|
|
int64_t m_docId;
|
|
|
|
char *m_ubuf;
|
|
int32_t m_ubufSize;
|
|
int32_t m_ubufAlloc;
|
|
|
|
// does this page link to gigablast, or has a search form to it?
|
|
//bool linksToGigablast();
|
|
//bool searchboxToGigablast();
|
|
|
|
// private:
|
|
|
|
// we we started spidering it, in milliseconds since the epoch
|
|
int64_t m_startTime;
|
|
int64_t m_injectStartTime;
|
|
|
|
class XmlDoc *m_prevInject;
|
|
class XmlDoc *m_nextInject;
|
|
|
|
// when set() was called by Msg20.cpp so we can time how long it took
|
|
// to generate the summary
|
|
int64_t m_setTime;
|
|
int64_t m_cpuSummaryStartTime;
|
|
|
|
// timers
|
|
int64_t m_beginSEOTime;
|
|
int64_t m_beginTimeAllMatch;
|
|
int64_t m_beginTimeMatchUrl;
|
|
int64_t m_beginTimeFullQueries;
|
|
int64_t m_beginTimeLinks;
|
|
//int64_t m_beginMsg98s;
|
|
int64_t m_beginRelatedQueries;
|
|
int64_t m_beginMsg95s;
|
|
|
|
// . these should all be set using set*() function calls so their
|
|
// individual validity flags can bet set to true, and successive
|
|
// calls to their corresponding get*() functions will not core
|
|
// . these particular guys are set immediately on set(char *titleRec)
|
|
|
|
Url m_redirUrl;
|
|
Url *m_redirUrlPtr;
|
|
Url *m_lastRedirUrlPtr;
|
|
SafeBuf m_redirCookieBuf;
|
|
Url m_metaRedirUrl;
|
|
Url *m_metaRedirUrlPtr;
|
|
Url m_canonicalRedirUrl;
|
|
Url *m_canonicalRedirUrlPtr;
|
|
int32_t m_redirError;
|
|
char m_allowSimplifiedRedirs;
|
|
Url m_firstUrl;
|
|
int64_t m_firstUrlHash48;
|
|
int64_t m_firstUrlHash64;
|
|
Url m_currentUrl;
|
|
|
|
//char *m_coll;
|
|
//char m_collBuf[MAX_COLL_LEN+1]; // include \0
|
|
CollectionRec *m_lastcr;
|
|
collnum_t m_collnum;
|
|
int32_t m_lastCollRecResetCount;
|
|
class CollectionRec *getCollRec ( ) ;
|
|
bool setCollNum ( char *coll ) ;
|
|
|
|
|
|
char *m_content;
|
|
int32_t m_contentLen;
|
|
|
|
char *m_metaList;
|
|
int32_t m_metaListSize;
|
|
|
|
int32_t m_addedSpiderRequestSize;
|
|
int32_t m_addedSpiderReplySize;
|
|
int32_t m_addedStatusDocSize;
|
|
int64_t m_addedStatusDocId;
|
|
|
|
SafeBuf m_metaList2;
|
|
SafeBuf m_zbuf;
|
|
SafeBuf m_kbuf;
|
|
|
|
// warc parsing member vars
|
|
class Msg7 *m_msg7;
|
|
class Msg7 *m_msg7s[MAXMSG7S];
|
|
char *m_warcContentPtr;
|
|
char *m_arcContentPtr;
|
|
char *m_anyContentPtr;
|
|
char *m_contentDelim;
|
|
SafeBuf m_injectUrlBuf;
|
|
bool m_subDocsHaveMime;
|
|
int32_t m_warcError ;
|
|
int32_t m_arcError ;
|
|
bool m_doneInjectingWarc ;
|
|
|
|
int64_t m_bytesStreamed;
|
|
char *m_fileBuf ;
|
|
int32_t m_fileBufAllocSize;
|
|
bool m_registeredWgetReadCallback;
|
|
char *m_fptr ;
|
|
char *m_fptrEnd ;
|
|
|
|
FILE* m_pipe;
|
|
|
|
BigFile m_file;
|
|
int64_t m_fileSize;
|
|
FileState m_fileState;
|
|
bool m_readThreadOut;
|
|
bool m_hasMoreToRead;
|
|
int32_t m_numInjectionsOut;
|
|
bool m_calledWgetThread;
|
|
|
|
// used by msg7 to store udp slot
|
|
class UdpSlot *m_injectionSlot;
|
|
|
|
// . same thing, a little more complicated
|
|
// . these classes are only set on demand
|
|
Xml m_xml;
|
|
Links m_links;
|
|
Words m_words;
|
|
Bits m_bits;
|
|
Bits m_bits2;
|
|
Pos m_pos;
|
|
Phrases m_phrases;
|
|
//Synonyms m_synonyms;
|
|
SafeBuf m_synBuf;
|
|
//Weights m_weights;
|
|
Sections m_sections;
|
|
|
|
// a hack storage thing used by Msg13.cpp
|
|
class Msg13Request *m_hsr;
|
|
|
|
Section *m_si;
|
|
//Section *m_nextSection;
|
|
//Section *m_lastSection;
|
|
int32_t m_mcastRequestsOut;
|
|
int32_t m_mcastRequestsIn;
|
|
int32_t m_secStatsErrno;
|
|
char *m_queryBuf;
|
|
Msg39Request *m_msg39RequestArray;
|
|
SafeBuf m_mcastBuf;
|
|
Multicast *m_mcastArray;
|
|
//char *m_inUse;
|
|
//Query *m_queryArray;
|
|
//Query *m_sharedQuery;
|
|
bool m_gotDupStats;
|
|
//Query m_q4;
|
|
//Msg3a m_msg3a;
|
|
//Msg39Request m_r39;
|
|
Msg39Request m_mr2;
|
|
SectionStats m_sectionStats;
|
|
HashTableX m_sectionStatsTable;
|
|
//char m_sectionHashQueryBuf[128];
|
|
|
|
// also set in getSections()
|
|
int32_t m_maxVotesForDup;
|
|
|
|
// . for rebuild logging of what's changed
|
|
// . Repair.cpp sets these based on titlerec
|
|
char m_logLangId;
|
|
int32_t m_logSiteNumInlinks;
|
|
|
|
SectionVotingTable m_nsvt;
|
|
|
|
SectionVotingTable m_osvt;
|
|
int32_t m_numSectiondbReads;
|
|
int32_t m_numSectiondbNeeds;
|
|
key128_t m_sectiondbStartKey;
|
|
RdbList m_secdbList;
|
|
int32_t m_sectiondbRecall;
|
|
SafeBuf m_tmpBuf3;
|
|
|
|
bool m_gotFacets;
|
|
SafeBuf m_tmpBuf2;
|
|
|
|
SafeBuf m_inlineSectionVotingBuf;
|
|
|
|
//HashTableX m_rvt;
|
|
//Msg17 m_msg17;
|
|
//char *m_cachedRootVoteRec;
|
|
//int32_t m_cachedRootVoteRecSize;
|
|
//bool m_triedVoteCache;
|
|
//bool m_storedVoteCache;
|
|
//SafeBuf m_cacheRecBuf;
|
|
|
|
SafeBuf m_timeAxisUrl;
|
|
|
|
HashTableX m_turkVotingTable;
|
|
HashTableX m_turkBitsTable;
|
|
uint32_t m_confirmedTitleContentHash ;
|
|
uint32_t m_confirmedVenueContentHash ;
|
|
uint32_t m_confirmedTitleTagHash ;
|
|
uint32_t m_confirmedVenueTagHash ;
|
|
|
|
// turk voting tag rec
|
|
TagRec m_vtr;
|
|
// tagrec of banned turks
|
|
TagRec m_bannedTurkRec;
|
|
// and the table of the hashed banned turk users
|
|
HashTableX m_turkBanTable;
|
|
|
|
// used for displaying turk votes...
|
|
HashTableX m_vctab;
|
|
HashTableX m_vcduptab;
|
|
|
|
bool isFirstUrlRobotsTxt();
|
|
bool m_isRobotsTxtUrl;
|
|
|
|
Images m_images;
|
|
HashTableX m_countTable;
|
|
HttpMime m_mime;
|
|
TagRec m_tagRec;
|
|
SafeBuf m_tagRecBuf;
|
|
// copy of m_oldTagRec but with our modifications, if any
|
|
//TagRec m_newTagRec;
|
|
SafeBuf m_newTagBuf;
|
|
SafeBuf m_fragBuf;
|
|
SafeBuf m_wordSpamBuf;
|
|
SafeBuf m_finalSummaryBuf;
|
|
// this one is initially the same as m_tagRec, but we do not modify it
|
|
// so that Address.cpp can reference into its buffer, m_buf, without
|
|
// fear of getting the buffer overwritten by crap
|
|
//TagRec m_savedTagRec1;
|
|
//char *m_sampleVector ;
|
|
//uint32_t m_tagPairHash32;
|
|
int32_t m_firstIp;
|
|
|
|
class SafeBuf *m_savedSb;
|
|
class HttpRequest *m_savedHr;
|
|
|
|
char m_savedChar;
|
|
|
|
|
|
// validity flags. on reset() all these are set to false.
|
|
char m_VALIDSTART;
|
|
// DO NOT add validity flags above this line!
|
|
char m_metaListValid;
|
|
char m_addedSpiderRequestSizeValid;
|
|
char m_addedSpiderReplySizeValid;
|
|
char m_addedStatusDocSizeValid;
|
|
char m_downloadStartTimeValid;
|
|
char m_contentDelimValid;
|
|
char m_fileValid;
|
|
//char m_docQualityValid;
|
|
char m_siteValid;
|
|
char m_startTimeValid;
|
|
char m_currentUrlValid;
|
|
char m_useTimeAxisValid;
|
|
char m_timeAxisUrlValid;
|
|
char m_firstUrlValid;
|
|
char m_firstUrlHash48Valid;
|
|
char m_firstUrlHash64Valid;
|
|
char m_lastUrlValid;
|
|
char m_docIdValid;
|
|
char m_availDocIdValid;
|
|
//char m_collValid;
|
|
char m_tagRecValid;
|
|
char m_robotsTxtLenValid;
|
|
char m_tagRecDataValid;
|
|
char m_newTagBufValid;
|
|
char m_rootTitleBufValid;
|
|
char m_filteredRootTitleBufValid;
|
|
char m_titleBufValid;
|
|
char m_fragBufValid;
|
|
char m_isRobotsTxtUrlValid;
|
|
char m_inlineSectionVotingBufValid;
|
|
char m_wordSpamBufValid;
|
|
char m_finalSummaryBufValid;
|
|
char m_matchingQueryBufValid;
|
|
char m_matchesCrawlPatternValid;
|
|
char m_relatedQueryBufValid;
|
|
char m_queryLinkBufValid;
|
|
char m_redirSpiderRequestValid;
|
|
//char m_queryPtrsValid;
|
|
char m_queryOffsetsValid;
|
|
//char m_queryPtrsSortedValid;
|
|
char m_queryPtrsWholeValid;
|
|
char m_relatedDocIdBufValid;
|
|
char m_topMatchingQueryBufValid;
|
|
char m_relatedDocIdsScoredBufValid;
|
|
char m_relatedDocIdsWithTitlesValid;
|
|
char m_relatedTitleBufValid;
|
|
//char m_queryLinkBufValid;
|
|
char m_missingTermBufValid;
|
|
char m_matchingTermBufValid;
|
|
//char m_relPtrsValid;
|
|
char m_sortedPosdbListBufValid;
|
|
char m_wpSortedPosdbListBufValid;
|
|
char m_termListBufValid;
|
|
char m_insertableTermsBufValid;
|
|
char m_scoredInsertableTermsBufValid;
|
|
//char m_iwfiBufValid; // for holding WordFreqInfo instances
|
|
char m_wordPosInfoBufValid;
|
|
char m_recommendedLinksBufValid;
|
|
char m_tempMsg25PageValid;
|
|
char m_tempMsg25SiteValid;
|
|
|
|
//char m_queryHashTableValid;
|
|
char m_queryOffsetTableValid;
|
|
//char m_socketWriteBufValid;
|
|
//char m_numBannedOutlinksValid;
|
|
char m_hopCountValid;
|
|
char m_isInjectingValid;
|
|
char m_isImportingValid;
|
|
char m_metaListCheckSum8Valid;
|
|
char m_contentValid;
|
|
char m_filteredContentValid;
|
|
char m_charsetValid;
|
|
char m_langVectorValid;
|
|
char m_langIdValid;
|
|
char m_rootLangIdValid;
|
|
char m_datedbDateValid;
|
|
char m_isRSSValid;
|
|
char m_isSiteMapValid;
|
|
char m_spiderLinksArgValid;
|
|
char m_isContentTruncatedValid;
|
|
char m_xmlValid;
|
|
char m_linksValid;
|
|
char m_wordsValid;
|
|
char m_bitsValid;
|
|
char m_bits2Valid;
|
|
char m_posValid;
|
|
char m_isUrlBadYearValid;
|
|
char m_phrasesValid;
|
|
//char m_synonymsValid;
|
|
//char m_weightsValid;
|
|
char m_sectionsValid;
|
|
char m_subSentsValid;
|
|
char m_osvtValid;
|
|
char m_nsvtValid;
|
|
//char m_rvtValid;
|
|
char m_turkVotingTableValid;
|
|
char m_turkBitsTableValid;
|
|
char m_turkBanTableValid;
|
|
char m_vctabValid;
|
|
char m_explicitSectionsValid;
|
|
char m_impliedSectionsValid;
|
|
char m_sectionVotingTableValid;
|
|
char m_imageDataValid;
|
|
char m_imagesValid;
|
|
char m_msge0Valid;
|
|
char m_msge1Valid;
|
|
//char m_msge2Valid;
|
|
//char m_sampleVectorValid;
|
|
char m_gigabitHashesValid;
|
|
//char m_oldsrValid;
|
|
char m_sreqValid;
|
|
char m_srepValid;
|
|
|
|
bool m_ipValid;
|
|
bool m_firstIpValid;
|
|
bool m_spideredTimeValid;
|
|
//bool m_nextSpiderTimeValid;
|
|
bool m_indexedTimeValid;
|
|
bool m_firstIndexedValid;
|
|
bool m_isInIndexValid;
|
|
bool m_wasInIndexValid;
|
|
bool m_outlinksAddedDateValid;
|
|
bool m_countryIdValid;
|
|
bool m_bodyStartPosValid;
|
|
/*
|
|
bool m_titleWeightValid;
|
|
bool m_headerWeightValid;
|
|
bool m_urlPathWeightValid;
|
|
bool m_externalLinkTextWeightValid;
|
|
bool m_internalLinkTextWeightValid;
|
|
bool m_conceptWeightValid;
|
|
*/
|
|
bool m_httpStatusValid;
|
|
bool m_crawlDelayValid;
|
|
bool m_finalCrawlDelayValid;
|
|
bool m_titleRecKeyValid;
|
|
bool m_adVectorValid;
|
|
bool m_wikiDocIdsValid;
|
|
bool m_catIdsValid;
|
|
bool m_versionValid;
|
|
bool m_indCatIdsValid;
|
|
bool m_dmozTitlesValid;
|
|
bool m_dmozSummsValid;
|
|
bool m_dmozAnchorsValid;
|
|
bool m_dmozInfoValid;
|
|
bool m_rawUtf8ContentValid;
|
|
bool m_expandedUtf8ContentValid;
|
|
bool m_utf8ContentValid;
|
|
bool m_isAllowedValid;
|
|
//bool m_tryAgainTimeDeltaValid;
|
|
//bool m_eliminateMenusValid;
|
|
bool m_redirUrlValid;
|
|
bool m_redirCookieBufValid;
|
|
bool m_metaRedirUrlValid;
|
|
bool m_canonicalRedirUrlValid;
|
|
bool m_statusMsgValid;
|
|
bool m_mimeValid;
|
|
bool m_pubDateValid;
|
|
bool m_hostHash32aValid;
|
|
bool m_hostHash32bValid;
|
|
bool m_indexCodeValid;
|
|
bool m_priorityValid;
|
|
bool m_downloadStatusValid;
|
|
bool m_downloadEndTimeValid;
|
|
bool m_redirErrorValid;
|
|
bool m_domHash32Valid;
|
|
bool m_contentHash32Valid;
|
|
//bool m_tagHash32Valid;
|
|
bool m_tagPairHash32Valid;
|
|
|
|
bool m_linkInfo2Valid;
|
|
bool m_spiderLinksValid;
|
|
//bool m_nextSpiderPriorityValid;
|
|
bool m_firstIndexedDateValid;
|
|
bool m_isPermalinkValid;
|
|
|
|
bool m_isAdultValid;
|
|
bool m_hasAddressValid;
|
|
bool m_hasTODValid;
|
|
//bool m_hasSiteVenueValid;
|
|
bool m_catRecValid;
|
|
bool m_urlPubDateValid;
|
|
bool m_isUrlPermalinkFormatValid;
|
|
bool m_percentChangedValid;
|
|
bool m_unchangedValid;
|
|
bool m_countTableValid;
|
|
bool m_summaryLangIdValid;
|
|
bool m_tagPairHashVecValid;
|
|
bool m_summaryVecValid;
|
|
bool m_titleVecValid;
|
|
bool m_pageSampleVecValid;
|
|
bool m_postVecValid;
|
|
bool m_dupListValid;
|
|
bool m_likedbListValid;
|
|
bool m_isDupValid;
|
|
bool m_gigabitVectorHashValid;
|
|
bool m_gigabitQueryValid;
|
|
bool m_metaDescValid;
|
|
bool m_metaSummaryValid;
|
|
bool m_metaKeywordsValid;
|
|
bool m_siteSpiderQuotaValid;
|
|
bool m_oldDocValid;
|
|
bool m_extraDocValid;
|
|
bool m_ahrefsDocValid;
|
|
//bool m_contactDocValid;
|
|
bool m_rootDocValid;
|
|
//bool m_gatewayDocValid;
|
|
bool m_oldMetaListValid;
|
|
bool m_oldTitleRecValid;
|
|
bool m_rootTitleRecValid;
|
|
//bool m_contactTitleRecValid;
|
|
bool m_isIndexedValid;
|
|
bool m_hasContactInfoValid;
|
|
bool m_isContactyValid;
|
|
bool m_contactInfoTagRecValid;
|
|
bool m_addressesValid;
|
|
bool m_contactAddressesValid;
|
|
bool m_emailBufValid;
|
|
//bool m_contactUsLinkValid;
|
|
//bool m_aboutUsLinkValid;
|
|
//bool m_contactLinksValid;
|
|
bool m_siteNumInlinksValid;
|
|
//bool m_siteNumInlinksUniqueIpValid;//FreshValid;
|
|
//bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid
|
|
//bool m_siteNumInlinksTotalValid;
|
|
bool m_siteNumInlinks8Valid;
|
|
bool m_siteLinkInfoValid;
|
|
bool m_isWWWDupValid;
|
|
bool m_linkInfo1Valid;
|
|
bool m_linkSiteHashesValid;
|
|
//bool m_dateParse2Valid;
|
|
bool m_simpleDatesValid;
|
|
bool m_datesValid;
|
|
bool m_sectionsReplyValid;
|
|
bool m_sectionsVotesValid;
|
|
bool m_sectiondbDataValid;
|
|
bool m_placedbDataValid;
|
|
bool m_siteHash64Valid;
|
|
bool m_siteHash32Valid;
|
|
bool m_httpReplyValid;
|
|
bool m_contentTypeValid;
|
|
bool m_isBinaryValid;
|
|
bool m_priorityQueueNumValid;
|
|
bool m_outlinkTagRecVectorValid;
|
|
bool m_outlinkIpVectorValid;
|
|
bool m_hasNoIndexMetaTagValid;
|
|
bool m_hasUseFakeIpsMetaTagValid;
|
|
bool m_outlinkIsIndexedVectorValid;
|
|
bool m_isSiteRootValid;
|
|
bool m_wasContentInjectedValid;
|
|
bool m_outlinkHopCountVectorValid;
|
|
//bool m_isSpamValid;
|
|
bool m_isFilteredValid;
|
|
bool m_urlFilterNumValid;
|
|
bool m_numOutlinksAddedValid;
|
|
bool m_baseUrlValid;
|
|
bool m_replyValid;
|
|
bool m_recycleDiffbotReplyValid;
|
|
bool m_diffbotReplyValid;
|
|
bool m_tokenizedDiffbotReplyValid;
|
|
//bool m_diffbotUrlCrawlPatternMatchValid;
|
|
//bool m_diffbotUrlProcessPatternMatchValid;
|
|
//bool m_diffbotPageProcessPatternMatchValid;
|
|
//bool m_useDiffbotValid;
|
|
//bool m_diffbotApiNumValid;
|
|
bool m_diffbotApiUrlValid;
|
|
bool m_diffbotTitleHashBufValid;
|
|
bool m_crawlInfoValid;
|
|
bool m_isPageParserValid;
|
|
bool m_imageUrlValid;
|
|
bool m_imageUrl2Valid;
|
|
bool m_matchOffsetsValid;
|
|
bool m_queryValid;
|
|
bool m_diffbotProxyReplyValid;
|
|
bool m_matchesValid;
|
|
bool m_dbufValid;
|
|
bool m_titleValid;
|
|
bool m_htbValid;
|
|
bool m_collnumValid;
|
|
//bool m_twidsValid;
|
|
bool m_termId32BufValid;
|
|
bool m_termInfoBufValid;
|
|
bool m_newTermInfoBufValid;
|
|
bool m_summaryValid;
|
|
bool m_gsbufValid;
|
|
bool m_spiderStatusDocMetaListValid;
|
|
bool m_isCompromisedValid;
|
|
bool m_isNoArchiveValid;
|
|
//bool m_isVisibleValid;
|
|
//bool m_clockCandidatesTableValid;
|
|
//bool m_clockCandidatesDataValid;
|
|
bool m_titleRecBufValid;
|
|
bool m_isLinkSpamValid;
|
|
bool m_isErrorPageValid;
|
|
bool m_isHijackedValid;
|
|
bool m_dupHashValid;
|
|
bool m_exactContentHash64Valid;
|
|
bool m_looseContentHash64Valid;
|
|
bool m_jpValid;
|
|
|
|
char m_isSiteMap;
|
|
|
|
// shadows
|
|
char m_isRSS2;
|
|
char m_isPermalink2;
|
|
char m_isAdult2;
|
|
char m_spiderLinks2;
|
|
char m_isContentTruncated2;
|
|
char m_isLinkSpam2;
|
|
bool m_hasAddress2;
|
|
bool m_hasTOD2;
|
|
//bool m_hasSiteVenue2;
|
|
char m_hasContactInfo2;
|
|
char m_isSiteRoot2;
|
|
|
|
// DO NOT add validity flags below this line!
|
|
char m_VALIDEND;
|
|
|
|
// more stuff
|
|
//char *m_utf8Content;
|
|
//int32_t m_utf8ContentLen;
|
|
CatRec m_catRec;
|
|
// use this stuff for getting wiki docids that match our doc's gigabits
|
|
//Query m_wq;
|
|
//SearchInput m_si;
|
|
//Msg40 m_msg40;
|
|
//DateParse2 m_dateParse2;
|
|
bool m_printedMenu;
|
|
Dates m_dates;
|
|
//HashTableX m_clockCandidatesTable;
|
|
//SafeBuf m_cctbuf;
|
|
float m_ageInDays;
|
|
int32_t m_urlPubDate;
|
|
//int32_t m_urlAge;
|
|
char m_isUrlPermalinkFormat;
|
|
uint8_t m_summaryLangId;
|
|
int32_t m_tagPairHashVec[MAX_TAG_PAIR_HASHES];
|
|
int32_t m_tagPairHashVecSize;
|
|
int32_t m_summaryVec [SAMPLE_VECTOR_SIZE/4];
|
|
int32_t m_summaryVecSize;
|
|
int32_t m_titleVec [SAMPLE_VECTOR_SIZE/4];
|
|
int32_t m_titleVecSize;
|
|
int32_t m_pageSampleVec[SAMPLE_VECTOR_SIZE/4];
|
|
int32_t m_pageSampleVecSize;
|
|
int32_t m_postVec[POST_VECTOR_SIZE/4];
|
|
int32_t m_postVecSize;
|
|
float m_tagSimilarity;
|
|
float m_gigabitSimilarity;
|
|
float m_pageSimilarity;
|
|
float m_percentChanged;
|
|
bool m_unchanged;
|
|
// what docids are similar to us? docids are in this list
|
|
RdbList m_dupList;
|
|
RdbList m_likedbList;
|
|
uint64_t m_dupHash;
|
|
int64_t m_exactContentHash64;
|
|
int64_t m_looseContentHash64;
|
|
Msg0 m_msg0;
|
|
Msg5 m_msg5;
|
|
char m_isDup;
|
|
int64_t m_docIdWeAreADupOf;
|
|
int32_t m_ei;
|
|
int32_t m_lastLaunch;
|
|
Msg22Request m_msg22Request;
|
|
Msg22Request m_msg22Requestc;
|
|
Msg22 m_msg22a;
|
|
Msg22 m_msg22b;
|
|
Msg22 m_msg22c;
|
|
Msg22 m_msg22d;
|
|
Msg22 m_msg22e;
|
|
Msg22 m_msg22f;
|
|
//int32_t m_collLen;
|
|
uint32_t m_gigabitVectorHash;
|
|
char m_gigabitQuery [XD_GQ_MAX_SIZE];
|
|
int32_t m_gigabitHashes [XD_MAX_GIGABIT_HASHES];
|
|
int32_t m_gigabitScores [XD_MAX_GIGABIT_HASHES];
|
|
char *m_gigabitPtrs [XD_MAX_GIGABIT_HASHES];
|
|
// for debug printing really
|
|
class GigabitInfo *m_top[100];
|
|
int32_t m_numTop;
|
|
//char m_metaDesc[1025];
|
|
//char m_metaKeywords[1025];
|
|
// these now reference directly into the html src so our
|
|
// WordPosInfo::m_wordPtr algo works in seo.cpp
|
|
char *m_metaDesc;
|
|
int32_t m_metaDescLen;
|
|
char *m_metaSummary;
|
|
int32_t m_metaSummaryLen;
|
|
char *m_metaKeywords;
|
|
int32_t m_metaKeywordsLen;
|
|
int32_t m_siteSpiderQuota;
|
|
//int32_t m_numBannedOutlinks;
|
|
class XmlDoc *m_oldDoc;
|
|
class XmlDoc *m_extraDoc;
|
|
class XmlDoc *m_ahrefsDoc;
|
|
//class XmlDoc *m_contactDoc;
|
|
class XmlDoc *m_rootDoc;
|
|
//class XmlDoc *m_gatewayDoc;
|
|
RdbList m_oldMetaList;
|
|
char *m_oldTitleRec;
|
|
int32_t m_oldTitleRecSize;
|
|
char *m_rootTitleRec;
|
|
int32_t m_rootTitleRecSize;
|
|
//char *m_contactTitleRec;
|
|
//int32_t m_contactTitleRecSize;
|
|
char m_isIndexed;
|
|
|
|
// confusing, i know! these are used exclsusively by
|
|
// getNewSpiderReply() for now
|
|
char m_isInIndex;
|
|
char m_wasInIndex;
|
|
|
|
bool m_oldDocExistedButHadError;
|
|
|
|
Msg8a m_msg8a;
|
|
char *m_tagdbColl;
|
|
int32_t m_tagdbCollLen;
|
|
Addresses m_addresses;
|
|
|
|
Address *m_contactAddresses[MAX_CONTACT_ADDRESSES];
|
|
int32_t m_numContactAddresses;
|
|
|
|
char m_isContacty;
|
|
|
|
//Url m_contactUsLink;
|
|
//Url m_aboutUsLink;
|
|
/*
|
|
char *m_contactLinks [MAX_CONTACT_OUTLINKS];
|
|
int32_t m_contactLens [MAX_CONTACT_OUTLINKS];
|
|
int32_t m_contactScores [MAX_CONTACT_OUTLINKS];
|
|
int32_t m_contactFlags [MAX_CONTACT_OUTLINKS];
|
|
char m_contactProcessed [MAX_CONTACT_OUTLINKS];
|
|
char *m_contactText [MAX_CONTACT_OUTLINKS];
|
|
char *m_contactTextEnd [MAX_CONTACT_OUTLINKS];
|
|
int32_t m_minContactScore;
|
|
int32_t m_minContactIndex;
|
|
int32_t m_numContactLinks;
|
|
*/
|
|
Url m_extraUrl;
|
|
//int32_t m_siteNumInlinksFresh;
|
|
//int32_t m_sitePop;
|
|
uint8_t m_siteNumInlinks8;
|
|
//int32_t m_siteNumInlinks;
|
|
LinkInfo m_siteLinkInfo;
|
|
SafeBuf m_mySiteLinkInfoBuf;
|
|
SafeBuf m_myPageLinkInfoBuf;
|
|
SafeBuf m_myTempLinkInfoBuf;
|
|
char m_isInjecting;
|
|
char m_isImporting;
|
|
char m_useFakeMime;
|
|
char m_useSiteLinkBuf;
|
|
char m_usePageLinkBuf;
|
|
char m_printInXml;
|
|
//Msg25 m_msg25;
|
|
SafeBuf m_tmpBuf11;
|
|
SafeBuf m_tmpBuf12;
|
|
Multicast m_mcast11;
|
|
Multicast m_mcast12;
|
|
Msg25 *m_tempMsg25Page;
|
|
Msg25 *m_tempMsg25Site;
|
|
// for page or for site?
|
|
Msg25 *getAllInlinks ( bool forSite );
|
|
// lists from cachedb for msg25's msg20 replies serialized
|
|
RdbList m_siteReplyList;
|
|
RdbList m_pageReplyList;
|
|
bool m_checkedCachedbForSite;
|
|
bool m_checkedCachedbForPage;
|
|
bool m_triedToAddWordPosInfoToCachedb;
|
|
bool m_calledMsg25ForSite;
|
|
bool m_calledMsg25ForPage;
|
|
//void (* m_masterLoopWrapper) (void *state);
|
|
MsgC m_msgc;
|
|
bool m_isAllowed;
|
|
bool m_forwardDownloadRequest;
|
|
bool m_isChildDoc;
|
|
class XmlDoc *m_parentDocPtr;
|
|
Msg13 m_msg13;
|
|
Msg13Request m_msg13Request;
|
|
Msg13Request m_diffbotProxyRequest;
|
|
ProxyReply *m_diffbotProxyReply;
|
|
bool m_isSpiderProxy;
|
|
// for limiting # of iframe tag expansions
|
|
int32_t m_numExpansions;
|
|
char m_newOnly;
|
|
//int32_t m_tryAgainTimeDelta;
|
|
//int32_t m_sameIpWait;
|
|
//int32_t m_sameDomainWait;
|
|
//int32_t m_maxSpidersPerDomain;
|
|
char m_isWWWDup;
|
|
char m_calledMsg0b;
|
|
Url m_tmpUrl;
|
|
|
|
SafeBuf m_tmpsb1;
|
|
SafeBuf m_tmpsb2;
|
|
SafeBuf m_turkBuf;
|
|
SafeBuf m_linkSiteHashBuf;
|
|
SafeBuf m_linkdbDataBuf;
|
|
SafeBuf m_langVec;
|
|
Msg0 m_msg0b;
|
|
class RdbList *m_ulist;
|
|
void *m_hack;
|
|
class XmlDoc *m_hackxd;
|
|
//class LinkInfo *m_linkInfo1Ptr;
|
|
char *m_linkInfoColl;
|
|
//char m_injectedReply;
|
|
//int32_t m_minInlinkerHopCount;
|
|
//class LinkInfo *m_linkInfo2Ptr;
|
|
SiteGetter m_siteGetter;
|
|
int64_t m_siteHash64;
|
|
//char *m_site;
|
|
//int32_t m_siteLen;
|
|
//Url m_siteUrl;
|
|
int32_t m_siteHash32;
|
|
char *m_httpReply;
|
|
//char m_downloadAttempted;
|
|
char m_incrementedAttemptsCount;
|
|
char m_incrementedDownloadCount;
|
|
char m_redirectFlag;
|
|
//char m_isScraping;
|
|
//char m_throttleDownload;
|
|
char m_spamCheckDisabled;
|
|
char m_useRobotsTxt;
|
|
int32_t m_robotsTxtLen;
|
|
int32_t m_httpReplySize;
|
|
int32_t m_httpReplyAllocSize;
|
|
char m_isBinary;
|
|
char *m_filteredContent;
|
|
int32_t m_filteredContentLen;
|
|
char *m_filter;
|
|
int32_t m_filteredContentAllocSize;
|
|
int32_t m_filteredContentMaxSize;
|
|
char m_calledThread;
|
|
int32_t m_errno;
|
|
//class CollectionRec *m_cr;
|
|
//int32_t m_utf8ContentAllocSize;
|
|
int32_t m_hostHash32a;
|
|
int32_t m_hostHash32b;
|
|
int32_t m_domHash32;
|
|
int32_t m_priorityQueueNum;
|
|
|
|
// this points into m_msge0 i guess
|
|
//class TagRec **m_outlinkTagRecVector;
|
|
Msge0 m_msge0;
|
|
|
|
// this points into m_msge1 i guess
|
|
int32_t *m_outlinkIpVector;
|
|
SafeBuf m_outlinkTagRecPtrBuf;
|
|
SafeBuf m_fakeIpBuf;
|
|
char m_hasNoIndexMetaTag;
|
|
char m_hasUseFakeIpsMetaTag;
|
|
Msge1 m_msge1;
|
|
TagRec **m_outlinkTagRecVector;
|
|
SafeBuf m_fakeTagRecPtrBuf;
|
|
TagRec m_fakeTagRec;
|
|
|
|
//
|
|
// diffbot parms for indexing diffbot's json output
|
|
//
|
|
XmlDoc *m_dx;
|
|
char *m_diffbotObj;
|
|
SafeBuf m_diffbotReply;
|
|
SafeBuf m_v3buf;
|
|
SafeBuf *m_tokenizedDiffbotReplyPtr;
|
|
SafeBuf m_tokenizedDiffbotReply;
|
|
int32_t m_diffbotReplyError;
|
|
bool m_recycleDiffbotReply;
|
|
//bool m_diffbotUrlCrawlPatternMatch;
|
|
//bool m_diffbotUrlProcessPatternMatch;
|
|
//bool m_diffbotPageProcessPatternMatch;
|
|
//int32_t m_diffbotApiNum;
|
|
//bool m_useDiffbot;
|
|
// url to access diffbot with
|
|
SafeBuf m_diffbotApiUrl;
|
|
SafeBuf m_diffbotUrl; // exact url used to fetch reply from diffbot
|
|
|
|
bool *getRecycleDiffbotReply ( ) ;
|
|
SafeBuf *getTokenizedDiffbotReply ( ) ;
|
|
SafeBuf *getDiffbotReply ( ) ;
|
|
bool doesUrlMatchDiffbotCrawlPattern() ;
|
|
//bool doesUrlMatchDiffbotProcessPattern() ;
|
|
bool doesPageContentMatchDiffbotProcessPattern() ;
|
|
int32_t *getDiffbotTitleHashes ( int32_t *numHashes ) ;
|
|
char *hashJSONFields ( HashTableX *table );
|
|
char *hashJSONFields2 ( HashTableX *table , HashInfo *hi , Json *jp ,
|
|
bool hashWithoutFieldNames ) ;
|
|
|
|
char *hashXMLFields ( HashTableX *table );
|
|
int32_t *reindexJSONObjects ( int32_t *newTitleHashes ,
|
|
int32_t numNewHashes ) ;
|
|
int32_t *nukeJSONObjects ( int32_t *newTitleHashes ,
|
|
int32_t numNewHashes ) ;
|
|
int32_t *redoJSONObjects ( int32_t *newTitleHashes ,
|
|
int32_t numNewHashes ,
|
|
bool deleteFromIndex ) ;
|
|
|
|
int32_t m_joc;
|
|
SafeBuf m_diffbotTitleHashBuf;
|
|
|
|
Json *getParsedJson();
|
|
// object that parses the json
|
|
Json m_jp;
|
|
|
|
|
|
//EmailInfo m_emailInfo;
|
|
|
|
//
|
|
// functions and vars for the seo query matching tool
|
|
//
|
|
bool loadTitleRecFromDiskOrSpider();
|
|
//SafeBuf *getSEOQueryInfo ( );
|
|
HashTableX *getTermIdBufDedupTable32();
|
|
//int32_t *getTopWordsVector( bool includeSynonyms );
|
|
SafeBuf *getTermId32Buf();
|
|
SafeBuf *getTermInfoBuf();
|
|
SafeBuf *getNewTermInfoBuf();
|
|
SafeBuf *getMatchingQueryBuf();
|
|
SafeBuf *getQueryLinkBuf(SafeBuf *docIdListBuf,bool doMatchingQueries);
|
|
//SafeBuf *getMatchingQueriesScored();
|
|
SafeBuf *getMatchingQueriesScoredForFullQuery();
|
|
SafeBuf *getRelatedDocIds();
|
|
SafeBuf *getRelatedDocIdsScored();
|
|
SafeBuf *getTopMatchingQueryBuf();
|
|
bool addRelatedDocIdInfo ( int64_t docId ,
|
|
int32_t queryNum ,
|
|
float score ,
|
|
int32_t rank ,
|
|
int32_t siteHash26 ) ;
|
|
bool setRelatedDocIdWeightAndRank ( class RelatedDocId *rd );
|
|
SafeBuf *getRelatedDocIdsWithTitles();
|
|
bool setRelatedDocIdInfoFromMsg20Reply ( class RelatedDocId *rd ,
|
|
class Msg20Reply *reply );
|
|
|
|
SafeBuf *getRelatedQueryBuf();
|
|
//SafeBuf *getRelatedQueryLinksModPart ( int32_t modPart );
|
|
|
|
bool addTermsFromQuery ( char *queryStr,
|
|
uint8_t queryLangId,
|
|
int32_t gigablastTraffic,
|
|
int32_t googleTraffic,
|
|
int32_t hackqoff,
|
|
class SafeBuf *tmpBuf ,
|
|
class HashTableX *scoreTable ,
|
|
class HashTableX *topWordsTable ,
|
|
float imp,
|
|
bool isRelatedQuery ) ;
|
|
|
|
bool sortTermsIntoBuf ( class HashTableX *scoreTable ,
|
|
class SafeBuf *tmpBuf ,
|
|
class SafeBuf *missingTermBuf ) ;
|
|
|
|
|
|
SafeBuf *getMissingTermBuf ();
|
|
SafeBuf *getMatchingTermBuf ();
|
|
SafeBuf *getTermIdSortedPosdbListBuf();
|
|
SafeBuf *getWordPosSortedPosdbListBuf();
|
|
SafeBuf *getTermListBuf(); // list of posdb termlists for caching
|
|
SafeBuf *getWordPosInfoBuf ( ) ;
|
|
|
|
|
|
//bool sendBin ( int32_t i );
|
|
//bool scoreDocIdRestrictedQueries(class Msg99Reply **replyPtrs,
|
|
// class QueryLink *linkPtrs,
|
|
// int32_t numPtrs );
|
|
|
|
// private like functions
|
|
bool addUniqueWordsToBuf ( SafeBuf *termInfoBuf,
|
|
HashTableX *dedupTable ,
|
|
HashTableX *filterTable ,
|
|
HashTableX *minCountTable ,
|
|
bool storeCounts,
|
|
Words *words ,
|
|
bool includeSynonyms );
|
|
//void gotMsg99Reply ( UdpSlot *slot );
|
|
//void gotMsg98Reply ( UdpSlot *slot );
|
|
void gotMsg95Reply ( UdpSlot *slot );
|
|
//void gotMsg3aReplyForMainUrl ( );
|
|
void gotMsg3aReplyForFullQuery( );
|
|
//void gotMsg3aReplyForFullQueryCached ( char *cachedRec ,
|
|
// class Msg99Reply *qp );
|
|
//void gotMsg3aReplyForRelQuery ( class Msg3a *msg3a );
|
|
void gotMsg3fReply ( class Bin *bin );
|
|
//void pumpSocketWriteBuf ( );
|
|
//HashTableX *getMatchingQueryHashTable();
|
|
HashTableX *getMatchingQueryOffsetTable();
|
|
|
|
int32_t getNumInsertableTerms ( );
|
|
class SafeBuf *getInsertableTerms ( );
|
|
class SafeBuf *getScoredInsertableTerms ( );
|
|
//class SafeBuf *getInsertableWordFreqInfoBuf ();
|
|
bool processMsg95Replies();
|
|
void setWordPosInfosTrafficGain ( class InsertableTerm *it );
|
|
int32_t getTrafficGain( class QueryChange *qc ) ;
|
|
// print in xml
|
|
bool printScoredInsertableTerms ( SafeBuf *sbuf ) ;
|
|
|
|
|
|
HashTableX m_tidTable32;
|
|
//int32_t *m_twids;
|
|
//int32_t m_numTwids;
|
|
SafeBuf m_termId32Buf;
|
|
SafeBuf m_termInfoBuf;
|
|
SafeBuf m_newTermInfoBuf;
|
|
//int32_t m_maxQueries;
|
|
//int32_t m_maxRelatedQueries;
|
|
//int32_t m_maxRelatedUrls;
|
|
//int32_t m_numMsg99Requests;
|
|
//int32_t m_numMsg98Requests;
|
|
//int32_t m_numMsg99Replies;
|
|
//int32_t m_numMsg98Replies;
|
|
//char *m_msg99ReplyPtrs [MAX_HOSTS];
|
|
//int32_t m_msg99ReplySizes[MAX_HOSTS];
|
|
//int32_t m_msg99ReplyAlloc[MAX_HOSTS];
|
|
//int32_t m_msg99HostIds [MAX_HOSTS];
|
|
char *m_msg95ReplyPtrs [MAX_HOSTS];
|
|
int32_t m_msg95ReplySizes[MAX_HOSTS];
|
|
//HashTableX m_queryHashTable;
|
|
HashTableX m_queryOffsetTable;
|
|
HashTableX m_tmpTable;
|
|
HashTableX m_fullQueryDedup;
|
|
//SafeBuf m_twbuf;
|
|
//SafeBuf m_queryPtrs;
|
|
SafeBuf m_matchingQueryBuf;
|
|
SafeBuf m_matchingQueryStringBuf;
|
|
SafeBuf m_relatedQueryBuf;
|
|
SafeBuf m_relatedQueryStringBuf;
|
|
SafeBuf m_docIdListBuf;
|
|
SafeBuf m_queryOffsets;
|
|
SafeBuf m_extraQueryBuf;
|
|
//SafeBuf m_socketWriteBuf;
|
|
SafeBuf m_relatedDocIdBuf;
|
|
SafeBuf m_relatedTitleBuf;
|
|
SafeBuf m_commonQueryNumBuf;
|
|
SafeBuf m_topMatchingQueryBuf;
|
|
HashTableX m_rdtab;
|
|
|
|
// related query algo stuff
|
|
SafeBuf m_queryLinkBuf;
|
|
SafeBuf m_queryLinkStringBuf;
|
|
char *m_msg8eReply [MAX_HOSTS];
|
|
int32_t m_msg8eReplySize[MAX_HOSTS];
|
|
int32_t m_numMsg8eRequests;
|
|
int32_t m_numMsg8eReplies;
|
|
//bool m_launchedAll;
|
|
int64_t m_tlbufTimer;
|
|
|
|
SafeBuf m_missingTermBuf;
|
|
SafeBuf m_matchingTermBuf;
|
|
//SafeBuf m_queryRelBuf;
|
|
//SafeBuf m_relPtrs;
|
|
SafeBuf m_sortedPosdbListBuf;
|
|
SafeBuf m_wpSortedPosdbListBuf;
|
|
SafeBuf m_termListBuf;
|
|
SafeBuf m_insertableTermsBuf;
|
|
//SafeBuf m_iwfiBuf;
|
|
SafeBuf m_wordPosInfoBuf;
|
|
//SafeBuf m_msg20ReplyPtrBuf;
|
|
SafeBuf m_recommendedLinksBuf;
|
|
SafeBuf m_tmpMsg0Buf;
|
|
SafeBuf m_msg20Array;
|
|
SafeBuf m_newLinkerBuf;
|
|
|
|
//Msg17 m_msg17;
|
|
//key_t m_cacheKey;
|
|
//char *m_cacheRec;
|
|
//int32_t m_cacheRecSize;
|
|
//bool m_triedCache;
|
|
|
|
//class TopDocIds *m_topDocIdsBuf;
|
|
//int32_t m_topDocIdsBufSize;
|
|
SafeBuf m_topDocIdsBuf;
|
|
//class TopDocIds *m_nextAvailTopDocIds;
|
|
//int32_t m_nextAvailTopDocIdsOffset;
|
|
|
|
//int32_t m_maxFullQueries;
|
|
//XmlDoc *m_newxd;
|
|
//XmlDoc *m_newxd2;
|
|
//bool m_newxd2Blocked;
|
|
//HashTableX m_tmpDupTable;
|
|
//class Msg20 *m_newMsg20;
|
|
Msg3a *m_msg3a;
|
|
Query *m_query3a;
|
|
int32_t m_numMsg3aRequests;
|
|
int32_t m_numMsg3aReplies;
|
|
|
|
int32_t m_numMsg3fRequests;
|
|
int32_t m_numMsg3fReplies;
|
|
int32_t m_numMsg4fRequests;
|
|
int32_t m_numMsg4fReplies;
|
|
bool m_sentMsg4fRequests;
|
|
bool m_matchesCrawlPattern;
|
|
class UdpSlot *m_savedSlot;
|
|
int32_t m_numMsg95Requests;
|
|
int32_t m_numMsg95Replies;
|
|
int32_t m_qcursor;
|
|
char m_seoDebug;
|
|
char m_progressBar;
|
|
bool m_readFromCachedb;
|
|
bool m_writeToCachedb;
|
|
//bool m_setForReplyPtrs;
|
|
//bool m_setForLinkPtrs;
|
|
|
|
SafeBuf *getRecommendedLinksBuf ( );
|
|
bool processLinkInfoMsg20Reply ( class Msg25 *msg25 );
|
|
bool printRecommendedLinksBuf ( class SafeBuf *sb ) ;
|
|
|
|
// recommendedlinksbuf vars and functions
|
|
int32_t m_numLinkRequestsOut;
|
|
int32_t m_numLinkRequestsIn;
|
|
int32_t m_hadLinkInfoError;
|
|
int32_t m_numMsg20sIn;
|
|
int32_t m_numMsg20sOut;
|
|
int32_t m_numValidMsg20s;
|
|
int32_t m_titleCursor;
|
|
int32_t m_msg20Phase;
|
|
int32_t m_recommendedLinkError;
|
|
SafeBuf *lookupTitles();
|
|
bool gotLinkerTitle ( class Msg20 *msg20 );
|
|
|
|
// 1 *current* bin per host!
|
|
//class Bin *m_currentBinPtrs[MAX_HOSTS];
|
|
//int32_t m_binError;
|
|
//int32_t m_msg98ReplyError;
|
|
//int32_t m_binErrorForReplyPtrs;
|
|
//int32_t m_binErrorForLinkPtrs;
|
|
HashTableX m_qstringTable;
|
|
|
|
// flow flags
|
|
bool m_printedQueries;
|
|
bool m_printedRelatedDocIds;
|
|
bool m_printedRelatedQueries;
|
|
bool m_printedScoredInsertableTerms;
|
|
bool m_printedRecommendedLinks;
|
|
bool m_loggedMsg3;
|
|
int64_t m_lastPrintedDocId;
|
|
//bool m_docIndexed;
|
|
//bool m_sentMsg99Requests;
|
|
bool m_didSet3;
|
|
//bool m_didSet3b;
|
|
bool m_registeredSocketCallback;
|
|
// the caller's socket the expect the xml reply on
|
|
TcpSocket *m_seoSocket;
|
|
TcpSocket *m_hackSocket;
|
|
bool m_doingSEO;
|
|
|
|
|
|
bool clientClosedConnection ( );
|
|
bool m_hadMatchError;
|
|
bool m_clientClosed;
|
|
bool m_lastCheckTime;
|
|
int32_t m_msg3aErrno ;
|
|
bool m_computedMetaListCheckSum;
|
|
|
|
// cachedb related args
|
|
//bool m_seoInfoSetFromCache;
|
|
bool m_checkedCachedb;
|
|
bool m_processedCachedbReply;
|
|
//bool m_storedIntoCachedb;
|
|
RdbList m_cacheList;
|
|
//SafeBuf m_msg99ReplyBuf;
|
|
SafeBuf m_queryChangeBuf;
|
|
SafeBuf m_queryLogBuf;
|
|
//SafeBuf m_itStrBuf;
|
|
SafeBuf m_debugScoreInfoBuf;
|
|
SafeBuf m_origScoreInfoBuf;
|
|
RdbList m_storeList;
|
|
Msg1 m_msg1;
|
|
bool m_allHashed;
|
|
bool checkCachedb ( );
|
|
bool storeScoredInsertableTermsIntoCachedb ( ) ;
|
|
bool storeRelatedQueriesIntoCachedb ( ) ;
|
|
bool storeRelatedDocIdsIntoCachedb ( ) ;
|
|
bool storeMatchingQueriesIntoCachedb ( ) ; // only the top 1000 or so
|
|
bool storeMissingTermBufIntoCachedb ( );
|
|
bool storeWordPosInfoBufIntoCachedb ( );
|
|
bool storeRecommendedLinksBuf ( );
|
|
|
|
// cursors
|
|
int32_t m_socketWriteBufSent;
|
|
int32_t m_queryNum;
|
|
int32_t m_rdCursor;
|
|
int32_t m_relatedNum;
|
|
int32_t m_numRelatedAdded;
|
|
|
|
// for getRelatedDocIdsWithTitles() launching msg20s
|
|
int32_t m_relatedDocIdError;
|
|
int32_t m_numMsg20Replies;
|
|
int32_t m_numMsg20Requests;
|
|
SafeBuf m_msg20Buf;
|
|
|
|
// this points into m_msge2
|
|
//char *m_outlinkIsIndexedVector;
|
|
//Msge2 m_msge2;
|
|
|
|
bool m_doneWithAhrefs;
|
|
bool m_useAhrefs;
|
|
bool m_reallyInjectLinks;
|
|
int32_t m_downloadLevel;
|
|
int32_t m_numRegExs;
|
|
//char m_isSiteRoot;
|
|
int8_t *m_outlinkHopCountVector;
|
|
int32_t m_outlinkHopCountVectorSize;
|
|
//char m_isSpam;
|
|
char m_isUrlBadYear;
|
|
char m_isFiltered;
|
|
int32_t m_urlFilterNum;
|
|
int32_t m_numOutlinksAdded;
|
|
int32_t m_numOutlinksAddedFromSameDomain;
|
|
int32_t m_numOutlinksFiltered;
|
|
int32_t m_numOutlinksBanned;
|
|
int32_t m_numRedirects;
|
|
bool m_isPageParser;
|
|
Url m_baseUrl;
|
|
Msg20Reply m_reply;
|
|
Msg20Request *m_req;
|
|
//char *m_gsbuf;
|
|
SafeBuf m_surroundingTextBuf;
|
|
SafeBuf m_rssItemBuf;
|
|
SafeBuf m_gsbuf;
|
|
//int32_t m_gsbufSize;
|
|
//int32_t m_gsbufAllocSize;
|
|
char *m_note;
|
|
char *m_imageUrl;
|
|
char *m_imageUrl2;
|
|
//char m_imageUrlBuf[100];
|
|
SafeBuf m_imageUrlBuf;
|
|
SafeBuf m_imageUrlBuf2;
|
|
//int32_t m_imageUrlSize;
|
|
MatchOffsets m_matchOffsets;
|
|
Query m_query;
|
|
Matches m_matches;
|
|
// meta description buf
|
|
int32_t m_dbufSize;
|
|
char m_dbuf[1024];
|
|
SafeBuf m_htb;
|
|
Title m_title;
|
|
Summary m_summary;
|
|
char m_isCompromised;
|
|
char m_isNoArchive;
|
|
char m_isErrorPage;
|
|
char m_isHijacked;
|
|
//char m_isVisible;
|
|
//char m_dmozBuf[12000];
|
|
SafeBuf m_dmozBuf;
|
|
int32_t m_numDmozEntries;
|
|
|
|
// stuff
|
|
char *m_statusMsg;
|
|
Msg4 m_msg4;
|
|
Msg8b m_msg8b;
|
|
bool m_incCount;
|
|
bool m_decCount;
|
|
|
|
bool m_deleteFromIndex;
|
|
|
|
// ptrs to stuff
|
|
//char *m_titleRec;
|
|
SafeBuf m_titleRecBuf;
|
|
//int32_t m_titleRecSize;
|
|
//bool m_freeTitleRec;
|
|
//int32_t m_titleRecAllocSize;
|
|
key_t m_titleRecKey;
|
|
|
|
// for isDupOfUs()
|
|
char *m_dupTrPtr;
|
|
int32_t m_dupTrSize;
|
|
|
|
// parse these out of spider rec
|
|
/*
|
|
int32_t m_retryNum ;
|
|
int32_t m_spiderRecPriority ;
|
|
bool m_spiderRecIsNew ;
|
|
int32_t m_spiderRecSiteNumInlinks ;
|
|
int32_t m_spiderRecRetryCount ;
|
|
int32_t m_spiderRecHopCount ;
|
|
key_t m_spiderRecKey ;
|
|
bool m_spiderRecForced ;
|
|
int32_t m_spiderRecTime ;
|
|
int32_t m_srDataSize ;
|
|
char m_srData [ MAX_SPIDERREC_SIZE ];
|
|
*/
|
|
|
|
key_t m_doledbKey;
|
|
SpiderRequest m_sreq;
|
|
SpiderReply m_srep;//newsr;
|
|
|
|
// bool flags for what procedures we have done
|
|
bool m_checkedUrlFilters;
|
|
|
|
bool m_listAdded ;
|
|
bool m_listFlushed ;
|
|
bool m_check1 ;
|
|
bool m_check2 ;
|
|
bool m_prepared ;
|
|
bool m_updatedCounts ;
|
|
bool m_updatedCounts2 ;
|
|
//bool m_updatedTagdb1 ;
|
|
//bool m_updatedTagdb2 ;
|
|
//bool m_updatedTagdb3 ;
|
|
//bool m_updatedTagdb4 ;
|
|
//bool m_updatedTagdb5 ;
|
|
bool m_copied1 ;
|
|
bool m_updatingSiteLinkInfoTags ;
|
|
bool m_addressSetCalled ;
|
|
|
|
//bool m_calledMsg22a ;
|
|
//bool m_calledMsg22b ;
|
|
//bool m_calledMsg22c ;
|
|
int64_t m_calledMsg22d ;
|
|
bool m_didDelay ;
|
|
bool m_didDelayUnregister ;
|
|
bool m_calledMsg22e ;
|
|
bool m_calledMsg22f ;
|
|
bool m_calledMsg25 ;
|
|
bool m_calledMsg25b ;
|
|
bool m_calledMsg8b ;
|
|
bool m_calledMsg40 ;
|
|
bool m_calledSections ;
|
|
bool m_firstEntry ;
|
|
bool m_firstEntry2 ;
|
|
bool m_launchedSpecialMsg8a ;
|
|
bool m_launchedMsg8a2 ;
|
|
bool m_loaded ;
|
|
|
|
// used for getHasContactInfo()
|
|
bool m_processed0 ;
|
|
|
|
// a lock to prevent infinite loops
|
|
//bool m_checkForRedir ;
|
|
|
|
bool m_processedLang ;
|
|
|
|
bool m_doingConsistencyCheck ;
|
|
|
|
int32_t m_langIdScore;
|
|
//int32_t m_rootLangIdScore;
|
|
//uint8_t m_rootLangId;
|
|
|
|
// used for getting contact info
|
|
//bool m_triedRoot ;
|
|
//int32_t m_winner ;
|
|
|
|
int32_t m_dist;
|
|
|
|
// the tags in this tagRec are just contact info based tags and
|
|
// created in the addContactInfo() function. also, in that same
|
|
// function we add/sub the tags in m_citr to the m_newTagRec tag rec.
|
|
//TagRec m_citr ;
|
|
|
|
char m_emailBuf[EMAILBUFSIZE];
|
|
int32_t m_numOfficialEmails;
|
|
|
|
// use to store a \0 list of "titles" of the root page so we can
|
|
// see which if any are the venue name, and thus match that to
|
|
// addresses of the venue on the site, and we can use those addresses
|
|
// as default venue addresses when no venues are listed on a page
|
|
// on that site.
|
|
char m_rootTitleBuf[ROOT_TITLE_BUF_MAX];
|
|
int32_t m_rootTitleBufSize;
|
|
|
|
// . this is filtered
|
|
// . certain punct is replaced with \0
|
|
char m_filteredRootTitleBuf[ROOT_TITLE_BUF_MAX];
|
|
int32_t m_filteredRootTitleBufSize;
|
|
|
|
// like m_rootTitleBuf but for the current page
|
|
char m_titleBuf[ROOT_TITLE_BUF_MAX];
|
|
int32_t m_titleBufSize;
|
|
|
|
|
|
bool m_setTr ;
|
|
//bool m_checkedRobots ;
|
|
bool m_triedTagRec ;
|
|
bool m_didGatewayPage ;
|
|
bool m_didQuickDupCheck ;
|
|
|
|
void (* m_masterLoop) ( void *state );
|
|
void * m_masterState;
|
|
|
|
void (* m_callback1) ( void *state );
|
|
bool (* m_callback2) ( void *state );
|
|
void *m_state;
|
|
|
|
|
|
//void (* m_injectionCallback) ( void *state );
|
|
//void *m_injectionState;
|
|
|
|
// flags for spider
|
|
//bool m_isAddUrl;
|
|
//bool m_forceDelete;
|
|
bool m_didDelete;
|
|
|
|
bool m_skipIframeExpansion;
|
|
|
|
// this is non-zero if we decided not to index the doc
|
|
int32_t m_indexCode;
|
|
|
|
// the spider priority
|
|
int32_t m_priority;
|
|
|
|
// the download error, like ETIMEDOUT, ENOROUTE, etc.
|
|
int32_t m_downloadStatus;
|
|
|
|
// . when the download was completed. will be zero if no download done
|
|
// . used to set SpiderReply::m_downloadEndTime because we need
|
|
// high resolution for that so we can dole out the next spiderrequest
|
|
// from that IP quickly if the sameipwait is like 500ms.
|
|
int64_t m_downloadEndTime;
|
|
|
|
//char *m_metaListEnd;
|
|
int32_t m_metaListAllocSize;
|
|
char *m_p;
|
|
char *m_pend;
|
|
|
|
int32_t m_maxCacheAge;
|
|
|
|
// a list of 32-bit ints followed by a zero 32-bit int to terminate
|
|
int64_t m_adIds [ XD_MAX_AD_IDS ];
|
|
//char *m_adVector;// [XMLDOC_MAX_AD_IDS];
|
|
//int32_t m_adVectorSize;
|
|
|
|
char *m_wikiqbuf;
|
|
int32_t m_wikiqbufSize;
|
|
int64_t m_wikiDocIds [ MAX_WIKI_DOCIDS ];
|
|
rscore_t m_wikiScores [ MAX_WIKI_DOCIDS ];
|
|
|
|
bool m_registeredSleepCallback;
|
|
bool m_addedNegativeDoledbRec;
|
|
|
|
bool m_hashedTitle;
|
|
bool m_hashedMetas;
|
|
|
|
int32_t m_niceness;
|
|
|
|
bool m_usePosdb ;
|
|
//bool m_useDatedb ;
|
|
bool m_useClusterdb ;
|
|
bool m_useLinkdb ;
|
|
bool m_useSpiderdb ;
|
|
bool m_useTitledb ;
|
|
bool m_useTagdb ;
|
|
bool m_usePlacedb ;
|
|
//bool m_useTimedb ;
|
|
bool m_useSectiondb ;
|
|
//bool m_useRevdb ;
|
|
bool m_useSecondaryRdbs ;
|
|
|
|
int32_t m_linkeeQualityBoost;
|
|
|
|
SafeBuf *m_pbuf;
|
|
// used by SpiderLoop to set m_pbuf to
|
|
SafeBuf m_sbuf;
|
|
// store termlist into here if non-null
|
|
bool m_storeTermListInfo;
|
|
char m_sortTermListBy;
|
|
|
|
SafeBuf m_sectiondbData;
|
|
//char *m_sectiondbData;
|
|
char *m_placedbData;
|
|
//int32_t m_sectiondbDataSize;
|
|
int32_t m_placedbDataSize;
|
|
|
|
// we now have HashInfo to replace this
|
|
//bool m_inHashNoSplit;
|
|
|
|
// store the terms that we hash into this table so that PageParser.cpp
|
|
// can print what was hashed and with what score and what description
|
|
class HashTableX *m_wts;
|
|
HashTableX m_wtsTable;
|
|
SafeBuf m_wbuf;
|
|
|
|
// used by addContactInfo() to keep track of what urls we have
|
|
// processed for contact info to avoid re-processing them in the
|
|
// recursive loop thing that we do
|
|
//HashTableX m_pt;
|
|
|
|
// Msg25.cpp stores its pageparser.cpp output into this one
|
|
SafeBuf m_pageLinkBuf;
|
|
SafeBuf m_siteLinkBuf;
|
|
|
|
SafeBuf m_serpBuf;
|
|
|
|
// which set() function was called above to set us?
|
|
bool m_setFromTitleRec;
|
|
bool m_setFromSpiderRec;
|
|
bool m_setFromUrl;
|
|
bool m_setFromDocId;
|
|
bool m_freeLinkInfo1;
|
|
bool m_freeLinkInfo2;
|
|
bool m_contentInjected;
|
|
|
|
bool m_recycleContent;
|
|
//bool m_loadFromOldTitleRec;
|
|
|
|
char *m_rawUtf8Content;
|
|
int32_t m_rawUtf8ContentSize;
|
|
int32_t m_rawUtf8ContentAllocSize; // we overallocate sometimes
|
|
char *m_expandedUtf8Content;
|
|
int32_t m_expandedUtf8ContentSize;
|
|
char *m_savedp;
|
|
char *m_oldp;
|
|
bool m_didExpansion;
|
|
SafeBuf m_esbuf;
|
|
SafeBuf m_xbuf;
|
|
|
|
//bool m_useIpsTxtFile ;
|
|
//bool m_readFromTestCache ;
|
|
|
|
// used by msg13
|
|
class Msg13Request *m_r;
|
|
|
|
// Msg20 uses this to stash its TcpSlot
|
|
void *m_slot;
|
|
|
|
char *getTestDir();
|
|
|
|
bool m_freed;
|
|
|
|
bool m_msg4Waiting;
|
|
bool m_msg4Launched;
|
|
|
|
// word spam detection
|
|
char *getWordSpamVec ( );
|
|
bool setSpam ( int32_t *profile, int32_t plen , int32_t numWords ,
|
|
unsigned char *spam );
|
|
int32_t getProbSpam ( int32_t *profile, int32_t plen , int32_t step );
|
|
bool m_isRepeatSpammer;
|
|
int32_t m_numRepeatSpam;
|
|
bool m_totallySpammed;
|
|
|
|
// frag vector (repeated fragments). 0 means repeated, 1 means not.
|
|
// vector is 1-1 with words in the document body.
|
|
char *getFragVec ( );
|
|
|
|
bool injectDoc ( char *url ,
|
|
class CollectionRec *cr ,
|
|
char *content ,
|
|
char *diffbotReply, // usually null
|
|
bool contentHasMime ,
|
|
int32_t hopCount,
|
|
int32_t charset,
|
|
|
|
bool deleteUrl,
|
|
//char contentType, // CT_HTML, CT_XML
|
|
char *contentTypeStr, // text/html, text/xml etc.
|
|
bool spiderLinks ,
|
|
char newOnly, // index iff new
|
|
|
|
void *state,
|
|
void (*callback)(void *state) ,
|
|
|
|
uint32_t firstIndexedTime = 0,
|
|
uint32_t lastSpideredDate = 0 ,
|
|
int32_t injectDocIp = 0 ,
|
|
// for container docs consisting of subdocs to inject
|
|
char *contentDelim = NULL,
|
|
char* metadata = NULL,
|
|
uint32_t metadataLen = 0,
|
|
int32_t payloadLen = -1);
|
|
|
|
|
|
bool injectLinks ( HashTableX *linkDedupTable ,
|
|
HashTableX *domDedupTable ,
|
|
void *finalState ,
|
|
void (* finalCallback)(void *));
|
|
bool injectAhrefsLinks();
|
|
bool doInjectLoop ( );
|
|
void doneInjecting ( class XmlDoc *xd );
|
|
int32_t m_i;
|
|
int32_t m_blocked;
|
|
HashTableX m_domDedupTable;
|
|
HashTableX *m_linkDedupTablePtr;
|
|
HashTableX *m_domDedupTablePtr;
|
|
bool m_dedupLinkDomains;
|
|
void *m_finalState;
|
|
void (* m_finalCallback) ( void *state );
|
|
char m_used[MAX_XML_DOCS];
|
|
class XmlDoc *m_xmlDocs[MAX_XML_DOCS];
|
|
int64_t m_cacheStartTime;
|
|
};
|
|
|
|
// . PageParser.cpp uses this class for printing hashed terms out by calling
|
|
// XmlDoc::print()
|
|
// . we store TermInfos into XmlDoc::m_wtsTable, a HashTableX
|
|
// . one for each term hashed
|
|
// . the key is the termId. dups are allowed
|
|
// . the term itself is stored into a separate buffer, m_wbuf, a SafeBuf, so
|
|
// that TermInfo::m_term will reference that and it won't disappear on us
|
|
class TermDebugInfo {
|
|
public:
|
|
int32_t m_termOff;
|
|
int32_t m_termLen;
|
|
//uint32_t m_score32;
|
|
int32_t m_descOff; // the description offset
|
|
int32_t m_prefixOff; // the prefix offset, like "site" or "gbadid"
|
|
int64_t m_termId;
|
|
int32_t m_date;
|
|
bool m_shardByTermId;
|
|
|
|
//float m_weight;
|
|
char m_langId;
|
|
char m_diversityRank;
|
|
char m_densityRank;
|
|
char m_wordSpamRank;
|
|
char m_hashGroup;
|
|
int32_t m_wordNum;
|
|
int32_t m_wordPos;
|
|
POSDBKEY m_key; // key144_t
|
|
//bool m_isSynonym;
|
|
// 0 = not a syn, 1 = syn from presets,2=wikt,3=generated
|
|
char m_synSrc;
|
|
int64_t m_langBitVec64;
|
|
// used for gbsectionhash:xxxx terms to hack in the inner content
|
|
// hash, aka sentHash32 for doing xpath histograms on a site
|
|
//int32_t m_sentHash32;
|
|
//int32_t m_facetVal32;
|
|
// this is copied from Weights::m_rvw or m_rvp
|
|
//float m_rv[MAX_RULES];
|
|
};
|
|
|
|
// a ptr to HashInfo is passed to hashString() and hashWords()
|
|
class HashInfo {
|
|
public:
|
|
HashInfo() {
|
|
m_tt = NULL;
|
|
m_prefix = NULL;
|
|
m_desc = NULL;
|
|
m_date = 0;
|
|
// should we do sharding based on termid and not the usual docid???
|
|
// in general this is false, but for checksum we want to shard
|
|
// by the checksum and not docid to avoid having to do a
|
|
// gbchecksum:xxxxx search on ALL shards. much more efficient.
|
|
m_shardByTermId = false;
|
|
//m_useWeights = false;
|
|
m_useSynonyms = false;
|
|
m_hashGroup = -1;
|
|
m_useCountTable = true;
|
|
m_useSections = true;
|
|
m_startDist = 0;
|
|
// m_facetVal32 = 0;
|
|
// used for sectiondb stuff, but stored in posdb
|
|
//m_sentHash32 = 0;
|
|
};
|
|
class HashTableX *m_tt;
|
|
char *m_prefix;
|
|
// "m_desc" should detail the algorithm
|
|
char *m_desc;
|
|
int32_t m_date;
|
|
char m_shardByTermId;
|
|
char m_linkerSiteRank;
|
|
//char m_useWeights;
|
|
char m_useSynonyms;
|
|
char m_hashGroup;
|
|
int32_t m_startDist;
|
|
//int32_t m_facetVal32;
|
|
bool m_useCountTable;
|
|
bool m_useSections;
|
|
};
|
|
|
|
|
|
// g_tt is used for debugging
|
|
//extern class TermTable *g_tt;
|
|
|
|
extern uint8_t score32to8 ( uint32_t score ) ;
|
|
|
|
extern pid_t g_pid ;
|
|
extern int32_t g_ticker ;
|
|
extern int32_t g_filterTimeout ;
|
|
|
|
// as recommended in the "man system" page we use our own
|
|
int my_system_r ( char *cmd , int32_t timeout ) ;
|
|
|
|
// . returns 0 to 100 , the probability of spam for this subprofile
|
|
// . a "profile" is an array of all the positions of a word in the document
|
|
// . a "position" is just the word #, like first word, word #8, etc...
|
|
// . we are passed a subprofile, "profile", of the actual profile
|
|
// because some of the document may be more "spammy" than other parts
|
|
// . inlined to speed things up because this may be called multiple times
|
|
// for each word in the document
|
|
// . if "step" is 1 we look at every word position in the profile
|
|
// . if "step" is 2 we look at every other word position
|
|
// . if "step" is 3 we look at every 3rd word position, etc...
|
|
inline int32_t XmlDoc::getProbSpam(int32_t *profile, int32_t plen, int32_t step) {
|
|
|
|
// you can spam 2 or 1 letter words all you want to
|
|
if ( plen <= 2 ) return 0;
|
|
|
|
// if our step is bigger than the profile return 0
|
|
if ( step == plen ) return 0;
|
|
|
|
register int32_t avgSpacing, stdDevSpacing;
|
|
int32_t d,dev=0;
|
|
register int32_t i;
|
|
|
|
for (int32_t j = 0; j < step; j++) {
|
|
|
|
// find avg. of gaps between consecutive tokens in subprofile
|
|
// TODO: isn't profile[i] < profile[i+1]??
|
|
int32_t istop = plen-1;
|
|
avgSpacing = 0;
|
|
for (i=0; i < istop; i += step )
|
|
avgSpacing += ( profile[i] - profile[i+1] );
|
|
// there's 1 less spacing than positions in the profile
|
|
// so we divide by plen-1
|
|
avgSpacing = (avgSpacing * 256) / istop;
|
|
|
|
// compute standard deviation of the gaps in this sequence
|
|
stdDevSpacing = 0;
|
|
for (i = 0 ; i < istop; i += step ) {
|
|
d = (( profile[i] - profile[i+1]) * 256 ) - avgSpacing;
|
|
if ( d < 0 ) stdDevSpacing -= d;
|
|
else stdDevSpacing += d;
|
|
}
|
|
|
|
// TODO: should we divide by istop-1 for stdDev??
|
|
stdDevSpacing /= istop;
|
|
|
|
// average of the stddevs for all sequences
|
|
dev += stdDevSpacing;
|
|
}
|
|
|
|
dev /= step;
|
|
|
|
// if the plen is big we should expect dev to be big
|
|
// here's some interpolation points:
|
|
// plen >= 2 and dev<= 0.2 --> 100%
|
|
// plen = 7 and dev = 1.0 --> 100%
|
|
// plen = 14 and dev = 2.0 --> 100%
|
|
// plen = 21 and dev = 3.0 --> 100%
|
|
// plen = 7 and dev = 2.0 --> 50%
|
|
|
|
// NOTE: dev has been multiplied by 256 to avoid using floats
|
|
if ( dev <= 51.2 ) return 100; // (.2 * 256)
|
|
int32_t prob = ( (256*100/7) * plen ) / dev;
|
|
|
|
if (prob>100) prob=100;
|
|
|
|
return prob;
|
|
|
|
//if (prob>=0) {
|
|
// int32_t i;
|
|
//printf("dev=%i,plen=%i,nseq=%i,prob=%i----\n",dev,plen,step,prob);
|
|
// for (i=0;i<plen;i++)
|
|
// printf("%i#",profile[i]);
|
|
// printf("\n");
|
|
//}
|
|
}
|
|
|
|
#endif
|
|
|