// Matt Wells, copyright Apr 2009 // . 2. you can also call setTitleRec() and then call getMetaList() // . this class is used by Repair.cpp and by Msg7 (inject) and SpiderLoop.cpp // . Msg7 and Repair.cpp and injections can also set more than just // m_firstUrl, like m_content, etc. or whatever elements are known, but // they must also set the corresponding "valid" flags of those elements // . both methods must yield exactly the same result, the same "meta list" // . after setting the contained classes XmlDoc::setMetaList() makes the list // of rdb records to be added to all the rdbs, this is the "meta list" // . the meta list is made by hashing all the termIds/scores into some hash // tables in order to accumulate scores, then the hash table are serialized // into the "meta list" // . the meta list is added to all rdbs with a simple call to // Msg4::addMetaList(), which is only called by Msg14 or Repair.cpp for now #ifndef _XMLDOC_H_ #define _XMLDOC_H_ //#include "HashTableX.h" #include "Lang.h" #include "Words.h" #include "Bits.h" #include "Pos.h" #include "Phrases.h" //#include "Synonyms.h" //#include "Weights.h" #include "Xml.h" #include "LangList.h" #include "SafeBuf.h" #include "Images.h" #include "Sections.h" #include "Msge0.h" #include "Msge1.h" //#include "Msge2.h" #include "Msg4.h" #include "Msg8b.h" #include "SearchInput.h" #include "Msg40.h" #include "Dates.h" //#include "IndexList.h" #include "Msg0.h" #include "Msg22.h" #include "Tagdb.h" #include "Url.h" #include "Linkdb.h" //#include "LinkInfo.h" //#include "Msg25.h" #include "MsgC.h" #include "Msg13.h" #include "RdbList.h" #include "SiteGetter.h" //#include "CollectionRec.h" #include "Msg20.h" #include "Matches.h" #include "Query.h" #include "Title.h" #include "Summary.h" #include "Msg8b.h" #include "Address.h" #include "zlib.h" // Z_OK #include "Spider.h" // SpiderRequest/SpiderReply definitions #include "HttpMime.h" // ET_DEFLAT #include "Msg1.h" #include "PingServer.h" #include "Json.h" //#define 
// XMLDOC_MAX_AD_IDS 4   (tail of the commented-out "#define" that begins at the end of the previous line)
//#define XMLDOC_ADLEN 64

// Compile-time limits. Their use sites are not in this part of the header.
// NOTE(review): units/meaning are only suggested by the names -- confirm
// where each one is used before changing a value.
#define MAXFRAGWORDS 80000
#define MAX_WIKI_DOCIDS 20
#define MAX_TAG_PAIR_HASHES 100

// NOTE(review): "Msg40.h" is also named in the include list earlier in this
// header, so this second include looks redundant -- confirm before removing.
#include "Msg40.h"

//#define SAMPLE_VECTOR_SIZE (32*4)
// (32*4) evaluates to 128
#define POST_VECTOR_SIZE (32*4)
#define XD_GQ_MAX_SIZE 1000
#define XD_MAX_GIGABIT_HASHES 48
#define XD_MAX_AD_IDS 5
#define MAX_LINK_TEXT_LEN 512

//
// Free-function declarations. Only the prototypes are visible here; the
// definitions live in a source file that is not part of this header.
//

// NOTE(review): return range (fraction vs. 0-100) is not visible here -- confirm.
double getTrafficPercent ( int32_t rank ) ;

// NOTE(review): presumably fills "langBuf" from "words"/"sections" and
// returns false on failure -- confirm against the definition.
bool setLangVec ( class Words *words ,
		  class SafeBuf *langBuf ,
		  class Sections *sections ,
		  int32_t niceness ) ;

// NOTE(review): "valueLen" is a pointer, presumably an output length for the
// returned value -- confirm, including whether the result is NUL-terminated.
char *getJSONFieldValue ( char *json, char *field , int32_t *valueLen ) ;

bool logQueryLogs ( );

// NOTE(review): "boolVal", "boolValValid" and "compileError" are pointers,
// presumably outputs -- confirm against the definition.
bool checkRegex ( SafeBuf *regex ,
		  char *target ,
		  bool *boolVal ,
		  bool *boolValValid ,
		  int32_t *compileError ,
		  CollectionRec *cr ) ;

// Address.cpp calls this to make a vector from the "place name" for comparing
// to other places in placedb using the computeSimilarity() function. if
// we got a >75% similarity we set the AF_VERIFIED_PLACE_NAME bit in the
// Address::m_flags for that address on the web page.
int32_t makeSimpleWordVector ( char *s,
			       int32_t *vbuf,
			       int32_t vbufSize,
			       int32_t niceness);

// this is used for making the event summary/title vectors as well as in
// Msg40.cpp where it merges events and does not want to repetitively display
// the same summary lines for an event
bool getWordVector ( char *s ,
		     HashTableX *ht ,
		     uint32_t *d ,
		     int32_t *nd ,
		     int32_t ndmax ) ;

// the two commented-out parameters below are retired parts of the signature,
// kept for reference
bool getDensityRanks ( int64_t *wids ,
		       int32_t nw,
		       //int32_t wordStart ,
		       //int32_t wordEnd ,
		       int32_t hashGroup ,
		       SafeBuf *densBuf ,
		       Sections *sections ,
		       int32_t niceness );

// diversity vector
bool getDiversityVec ( class Words *words ,
		       class Phrases *phrases ,
		       class HashTableX *countTable ,
		       class SafeBuf *sbWordVec ,
		       //class SafeBuf *sbPhraseVec ,
		       int32_t niceness );

float computeSimilarity ( int32_t *vec0 ,
			  int32_t *vec1 ,
			  // corresponding scores vectors
			  int32_t *s0 ,
			  int32_t *s1 ,
			  class Query *q ,
			  int32_t niceness ,
			  // only Sections::addDateBasedImpliedSections()
			  // sets this to true right now. if set to true
			  // we essentially dedup each vector, although
			  // the score is compounded into the remaining
			  // occurrence. i'm not sure if that is the right
			  // behavior though.
			  // (defaults to false, so existing callers that omit
			  // it get the non-deduping behavior)
			  bool dedupVecs = false );

// NOTE(review): the name suggests both vectors must already be sorted --
// confirm against the definition.
bool isSimilar_sorted ( int32_t *vec0 ,
			int32_t *vec1 ,
			int32_t nv0 , // how many int32_ts in vec?
			int32_t nv1 , // how many int32_ts in vec?
			// they must be this similar or more to return true
			int32_t percentSimilar,
			int32_t niceness ) ;

// this is called by Msg40.cpp to set "top"
int32_t intersectGigabits ( Msg20 **mp , // search results
			    int32_t nmp ,
			    uint8_t langId , // searcher's langId
			    int32_t maxTop ,
			    int32_t docsToScan ,
			    int32_t minDocCount , // must be in this # docs
			    class GigabitInfo *top ,
			    int32_t niceness ) ;

int32_t getDirtyPoints ( char *s ,
			 int32_t len ,
			 int32_t niceness ,
			 char *logUrl ) ;

bool storeTerm ( char *s ,
		 int32_t slen ,
		 int64_t termId ,
		 class HashInfo *hi ,
		 int32_t wordNum ,
		 int32_t wordPos ,
		 char densityRank ,
		 char diversityRank ,
		 char wordSpamRank ,
		 char hashGroup ,
		 //bool isPhrase ,
		 class SafeBuf *wbuf ,
		 class HashTableX *wts ,
		 char synSrc ,
		 char langId ,
		 POSDBKEY key ) ;

// tell zlib to use our malloc/free functions
// NOTE(review): in all four functions "destLen" is passed by pointer,
// presumably in/out (capacity in, bytes written out) like zlib's own
// compress()/uncompress() -- confirm against the definitions.
int gbuncompress ( unsigned char *dest ,
		   uint32_t *destLen ,
		   unsigned char *source ,
		   uint32_t sourceLen );

// "encoding" defaults to ET_DEFLATE when the caller omits it
int gbcompress ( unsigned char *dest ,
		 uint32_t *destLen ,
		 unsigned char *source ,
		 uint32_t sourceLen ,
		 int32_t encoding = ET_DEFLATE);

// "compress" defaults to true when the caller omits it
int gbcompress7 ( unsigned char *dest ,
		  uint32_t *destLen ,
		  unsigned char *source ,
		  uint32_t sourceLen ,
		  bool compress = true );

int gbuncompress7 ( unsigned char *dest ,
		    uint32_t *destLen ,
		    unsigned char *source ,
		    uint32_t sourceLen ) ;

// NOTE(review): widens an 8-bit score to 32 bits; the mapping itself is not
// visible here.
uint32_t score8to32 ( uint8_t score8 );

// for Msg13.cpp
char getContentTypeFromContent ( char *p , int32_t niceness ) ;

// . for Msg13.cpp
// . 
*pend must equal \0 int32_t getContentHash32Fast ( unsigned char *p , int32_t plen , int32_t niceness ) ; uint16_t getCharsetFast ( class HttpMime *mime, char *url , char *s , int32_t slen , int32_t niceness ); //#define MAX_CONTACT_OUTLINKS 5 #define MAX_CONTACT_ADDRESSES 20 #define EMAILBUFSIZE 512 #define ROOT_TITLE_BUF_MAX 512 // store the subsentences in an array now class SubSent { public: sentflags_t m_subSentFlags; //esflags_t m_esflags; int32_t m_senta; int32_t m_sentb; int32_t m_subEnding; float m_titleScore; }; #define MAX_XML_DOCS 4 #define MAXMSG7S 50 class XmlDoc { public: // . variable size rdb records all start with key then dataSize // . do not do that here since we compress our record's data!! //key_t m_titleRecKey; //int32_t m_dataSize; // // BEGIN WHAT IS STORED IN THE TITLE REC (Titledb.h) // // headerSize = this->ptr_firstUrl - this->m_headerSize uint16_t m_headerSize; uint16_t m_version; // these flags are used to indicate which ptr_ members are present: uint32_t m_internalFlags1; int32_t m_ip; int32_t m_crawlDelay; // . use this to quickly detect if doc is unchanged // . we can avoid setting Xml and Words classes etc... 
int32_t m_contentHash32; // like the above but hash of all tags in TagRec for this url //int32_t m_tagHash32; // this is a hash of all adjacent tag pairs for templated identificatn uint32_t m_tagPairHash32; int32_t m_siteNumInlinks; //int32_t m_siteNumInlinksUniqueIp; // m_siteNumInlinksFresh //int32_t m_siteNumInlinksUniqueCBlock; // m_sitePop; int32_t m_reserved1; int32_t m_reserved2; uint32_t m_spideredTime; // time_t // just don't throw away any relevant SpiderRequests and we have // the data that m_minPubDate and m_maxPubDate provided //time_t m_minPubDate; //time_t m_maxPubDate; uint32_t m_indexedTime; // slightly > m_spideredTime (time_t) uint32_t m_reserved32; uint32_t m_pubDate; // aka m_datedbDate // time_t //time_t m_nextSpiderTime; uint32_t m_firstIndexedDate; // time_t uint32_t m_outlinksAddedDate; // time_t uint16_t m_charset; // the ORIGINAL charset, we are always utf8! uint16_t m_countryId; //uint16_t m_reserved1;//titleWeight; //uint16_t m_reserved2;//headerWeight; //int32_t m_siteNumInlinksTotal; int32_t m_reserved3; //uint16_t m_reserved3;//urlPathWeight; uint8_t m_metaListCheckSum8; // bring it back!! char m_reserved3b; uint16_t m_bodyStartPos;//m_reserved4;//externalLinkTextWeight; uint16_t m_reserved5;//internalLinkTextWeight; // a new parm from reserved6. need to know the count so we can // delete the json objects derived from this page if we want to // delete this page. or if this page is respidered then we get the // json objects for it, REject the old json object urls, and inject // the new ones i guess. 
uint16_t m_diffbotJSONCount; // these do not include header/footer (dup) addresses //int16_t m_numAddresses; int16_t m_httpStatus; // -1 if not found (empty http reply) //int8_t m_nextSpiderPriority; int8_t m_hopCount; //int8_t m_metalistChecksum; // parser checksum //uint8_t m_numBannedOutlinks8; uint8_t m_langId; uint8_t m_rootLangId; uint8_t m_contentType; // bit flags uint16_t m_isRSS:1; uint16_t m_isPermalink:1; uint16_t m_isAdult:1; uint16_t m_wasContentInjected:1;//eliminateMenus:1; uint16_t m_spiderLinks:1; uint16_t m_isContentTruncated:1; uint16_t m_isLinkSpam:1; uint16_t m_hasAddress:1; uint16_t m_hasTOD:1; uint16_t m_reserved_sv:1;//hasSiteVenue:1; uint16_t m_hasContactInfo:1; uint16_t m_isSiteRoot:1; uint16_t m_isDiffbotJSONObject:1; uint16_t m_sentToDiffbot:1; uint16_t m_gotDiffbotSuccessfulReply:1; uint16_t m_useTimeAxis:1; // m_reserved804:1; uint16_t m_hasMetadata:1; uint16_t m_reserved806:1; uint16_t m_reserved807:1; uint16_t m_reserved808:1; uint16_t m_reserved809:1; uint16_t m_reserved810:1; uint16_t m_reserved811:1; uint16_t m_reserved812:1; uint16_t m_reserved813:1; uint16_t m_reserved814:1; uint16_t m_reserved815:1; uint16_t m_reserved816:1; char *ptr_firstUrl; char *ptr_redirUrl; //char *ptr_tagRecData; char *ptr_rootTitleBuf; int32_t *ptr_gigabitHashes; int32_t *ptr_gigabitScores; int64_t *ptr_adVector; int64_t *ptr_wikiDocIds; rscore_t *ptr_wikiScores; char *ptr_imageData; int32_t *ptr_catIds; int32_t *ptr_indCatIds; char *ptr_dmozTitles; char *ptr_dmozSumms; char *ptr_dmozAnchors; char *ptr_utf8Content; //char *ptr_sectionsReply; // votes read from sectiondb - m_osvt //char *ptr_sectionsVotes; // our local votes - m_nsvt //char *ptr_addressReply; //char *ptr_clockCandidatesData; char *ptr_metadata; // . serialization of the sectiondb and placedb lists // . 
that way we can store just these and not have to store the content // of the entire page if we do not need to //char *ptr_sectiondbData; //char *ptr_placedbData; // do not let SiteGetter change this when we re-parse! char *ptr_site; LinkInfo *ptr_linkInfo1; char *ptr_linkdbData; char *ptr_sectiondbData; char *ptr_tagRecData; LinkInfo *ptr_linkInfo2; int32_t size_firstUrl; int32_t size_redirUrl; //int32_t size_tagRecData; int32_t size_rootTitleBuf; int32_t size_gigabitHashes; int32_t size_gigabitScores; int32_t size_adVector; int32_t size_wikiDocIds; int32_t size_wikiScores; int32_t size_imageData; int32_t size_catIds; int32_t size_indCatIds; int32_t size_dmozTitles; int32_t size_dmozSumms; int32_t size_dmozAnchors; int32_t size_utf8Content; //int32_t size_sectionsReply; //int32_t size_sectionsVotes; //int32_t size_addressReply; //int32_t size_clockCandidatesData; int32_t size_metadata; //int32_t size_sectiondbData; //int32_t size_placedbData; int32_t size_site; int32_t size_linkInfo1; int32_t size_linkdbData; int32_t size_sectiondbData; int32_t size_tagRecData; int32_t size_linkInfo2; char m_dummyEnd; // // END WHAT IS STORED IN THE TITLE REC (Titledb.h) // public: // . returns false and sets errno on error // . once you call this you can call setMetaList() below // . sets all the contained parser classes, Words, Xml, etc. if they // have not already been set! that way Msg16/Msg14 can set bits // and pieces here and there and we do not reset what it's done // . our m_xml will contain ptrs into titleRec's content, be careful // . if titleRec gets freed we should be freed too //bool set ( char *titleRec , // class SafeBuf *pbuf = NULL , // int32_t niceness = MAX_NICENESS , // bool justSetLinks = false ); // . used by Msg16 to set the Xml to get meta redirect tag's content // . used by Msg16 to get <META NAME="ROBOTS" CONTENT="index,follow"> // . 
this should be set by Msg16 so it can get meta redirect url void print ( ); bool set1 ( char *url , char *coll, SafeBuf *pbuf , int32_t niceness ); bool set2 ( char *titleRec, int32_t maxSize, char *coll, class SafeBuf *p, int32_t niceness , class SpiderRequest *sreq = NULL ); // . since being set from a docId, we will load the old title rec // and use that! // . used by PageGet.cpp bool set3 ( int64_t docId , char *coll , int32_t niceness ); bool set4 ( class SpiderRequest *sreq , key_t *doledbKey , char *coll , class SafeBuf *pbuf , int32_t niceness , char *utf8Content = NULL , bool deleteFromIndex = false , int32_t forcedIp = 0 , uint8_t contentType = CT_HTML , uint32_t spideredTime = 0 , // time_t bool contentHasMime = false , // for container docs, what is the separator of subdocs? char *contentDelim = NULL, char *metadata = NULL, uint32_t metadataLen = 0, // for injected docs we have the recv, buffer size don't exceed that int32_t payloadLen = -1) ; // we now call this right away rather than at download time! int32_t getSpideredTime(); // time right before adding the termlists to the index, etc. 
// whereas spider time is the download time int32_t getIndexedTime(); // another entry point, like set3() kinda bool loadFromOldTitleRec (); XmlDoc() ; ~XmlDoc() ; void nukeDoc ( class XmlDoc *); void reset ( ) ; bool setFirstUrl ( char *u , bool addWWW , Url *base = NULL ) ; bool setRedirUrl ( char *u , bool addWWW ) ; void setStatus ( char *s ) ; void setCallback ( void *state, void (*callback) (void *state) ) ; void setCallback ( void *state, bool (*callback) (void *state) ) ; bool addToSpiderdb ( ) ; void getRevisedSpiderRequest ( class SpiderRequest *revisedReq ); void getRebuiltSpiderRequest ( class SpiderRequest *sreq ) ; bool indexDoc ( ); bool indexDoc2 ( ); bool isContainerDoc ( ); bool indexContainerDoc ( ); bool readMoreWarc(); bool indexWarcOrArc ( ) ; key_t *getTitleRecKey() ; //char *getSkipIndexing ( ); char *prepareToMakeTitleRec ( ) ; // store TitleRec into "buf" so it can be added to metalist bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 ); // sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid] SafeBuf *getTitleRecBuf ( ); bool appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) ; SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply , bool forDelete ) ; SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ; bool setSpiderStatusDocMetaList ( SafeBuf *jd , int64_t ssDocId ) ; SafeBuf m_spiderStatusDocMetaList; char *getIsAdult ( ) ; int32_t **getIndCatIds ( ) ; int32_t **getCatIds ( ) ; class CatRec *getCatRec ( ) ; int32_t *getNumDmozEntries() ; char **getDmozTitles ( ) ; char **getDmozSummaries ( ) ; char **getDmozAnchors ( ) ; bool setDmozInfo () ; int64_t **getWikiDocIds ( ) ; void gotWikiResults ( class UdpSlot *slot ); int32_t *getPubDate ( ) ; //class DateParse2 *getDateParse2 ( ) ; class Dates *getSimpleDates(); class Dates *getDates(); //class HashTableX *getClockCandidatesTable(); int32_t getUrlPubDate ( ) ; int32_t getOutlinkAge ( int32_t outlinkNum ) ; char *getIsPermalink ( ) ; char 
*getIsUrlPermalinkFormat ( ) ; char *getIsRSS ( ) ; char *getIsSiteMap ( ) ; class Xml *getXml ( ) ; uint8_t *getLangVector ( ) ; uint8_t *getLangId ( ) ; char computeLangId ( Sections *sections ,Words *words , char *lv ) ; class Words *getWords ( ) ; class Bits *getBits ( ) ; class Bits *getBitsForSummary ( ) ; class Pos *getPos ( ); class Phrases *getPhrases ( ) ; //class Synonyms *getSynonyms ( ); class Sections *getExplicitSections ( ) ; class Sections *getImpliedSections ( ) ; class Sections *getSections ( ) ; class Sections *getSectionsWithDupStats ( ); class SafeBuf *getInlineSectionVotingBuf(); bool gotSectionFacets( class Multicast *mcast ); class SectionStats *getSectionStats ( uint32_t secHash32 , uint32_t sentHash32 , bool cacheOnly ); class SectionVotingTable *getOldSectionVotingTable(); class SectionVotingTable *getNewSectionVotingTable(); char **getSectionsReply ( ) ; char **getSectionsVotes ( ) ; HashTableX *getSectionVotingTable(); int32_t *getLinkSiteHashes ( ); class Links *getLinks ( bool doQuickSet = false ) ; class HashTableX *getCountTable ( ) ; bool hashString_ct ( class HashTableX *ht, char *s , int32_t slen ) ; uint8_t *getSummaryLangId ( ) ; int32_t *getSummaryVector ( ) ; int32_t *getPageSampleVector ( ) ; int32_t *getPostLinkTextVector ( int32_t linkNode ) ; int32_t computeVector ( class Sections *sections, class Words *words, uint32_t *vec , int32_t start = 0 , int32_t end = -1 ); float *getTagSimilarity ( class XmlDoc *xd2 ) ; float *getGigabitSimilarity ( class XmlDoc *xd2 ) ; float *getPageSimilarity ( class XmlDoc *xd2 ) ; float *getPercentChanged ( ); uint64_t *getFuzzyDupHash ( ); int64_t *getExactContentHash64(); int64_t *getLooseContentHash64(); class RdbList *getDupList ( ) ; class RdbList *getLikedbListForReq ( ); class RdbList *getLikedbListForIndexing ( ); int32_t addLikedbRecords ( bool justGetSize ) ; char *getIsDup ( ) ; char *isDupOfUs ( int64_t d ) ; uint32_t *getGigabitVectorScorelessHash ( ) ; int32_t 
**getGigabitHashes ( ); char *getGigabitQuery ( ) ; char *getMetaDescription( int32_t *mdlen ) ; char *getMetaSummary ( int32_t *mslen ) ; char *getMetaKeywords( int32_t *mklen ) ; char *getMetadata(int32_t* retlen); bool addGigabits ( char *s , int64_t docId , uint8_t langId ) ; bool addGigabits2 ( char *s,int32_t slen,int64_t docId,uint8_t langId); bool addGigabits ( class Words *ww , int64_t docId, class Sections *sections, //class Weights *we , uint8_t langId ); int32_t *getSiteSpiderQuota ( ) ; class Url *getCurrentUrl ( ) ; class Url *getFirstUrl() ; int64_t getFirstUrlHash48(); int64_t getFirstUrlHash64(); class Url **getLastRedirUrl() ; class Url **getRedirUrl() ; class Url **getMetaRedirUrl() ; class Url **getCanonicalRedirUrl ( ) ; int32_t *getFirstIndexedDate ( ) ; int32_t *getOutlinksAddedDate ( ) ; //int32_t *getNumBannedOutlinks ( ) ; uint16_t *getCountryId ( ) ; class XmlDoc **getOldXmlDoc ( ) ; //bool isRobotsTxtFile ( char *url , int32_t urlLen ) ; class XmlDoc **getExtraDoc ( char *url , int32_t maxCacheAge = 0 ) ; bool getIsPageParser ( ) ; class XmlDoc **getRootXmlDoc ( int32_t maxCacheAge = 0 ) ; //class XmlDoc **getGatewayXmlDoc ( ) ; // . returns false if blocked, true otherwise. // . 
returns true and sets g_errno on error //bool setFromOldTitleRec ( ) ; //RdbList *getOldMetaList ( ) ; char **getOldTitleRec ( ); uint8_t *getRootLangId (); //bool *updateRootLangId ( ); char **getRootTitleRec ( ) ; //char **getContactTitleRec ( char *url ) ; int64_t *getAvailDocIdOnly ( int64_t preferredDocId ) ; int64_t *getDocId ( ) ; char *getIsIndexed ( ) ; class TagRec *getTagRec ( ) ; char *getHasContactInfo ( ) ; char *getIsThisDocContacty ( ); bool *getHasTOD(); bool *getHasSiteVenue(); // non-dup/nondup addresses only bool *getHasAddress(); class Addresses *getAddresses ( ) ; Address **getContactAddresses ( ); int32_t *getNumOfficialEmails ( ) ; char *getEmailBuf ( ) ; int32_t *getNumContactAddresses ( ); int32_t addEmailTags ( class Xml *xml , class Words *ww , class TagRec *gr , int32_t ip ) ; //class Url *getContactUsLink ( ) ; //class Url *getAboutUsLink ( ) ; int32_t *getFirstIp ( ) ; bool *updateFirstIp ( ) ; //int32_t *getSiteNumInlinksUniqueIp ( ) ; //int32_t *getSiteNumInlinksUniqueCBlock ( ) ; //int32_t *getSiteNumInlinksTotal ( ); //int32_t *getSiteNumInlinksFresh ( ) ; //int32_t *getSitePop ( ) ; uint8_t *getSiteNumInlinks8 () ; int32_t *getSiteNumInlinks ( ) ; class LinkInfo *getSiteLinkInfo() ; int32_t *getIp ( ) ; int32_t *gotIp ( bool save ) ; bool *getIsAllowed ( ) ; int32_t *getFinalCrawlDelay(); int32_t m_finalCrawlDelay; //int32_t getTryAgainTimeDelta() { // if ( ! 
m_tryAgainTimeDeltaValid ) { char *xx=NULL;*xx=0;} // return m_tryAgainTimeDelta; //}; char *getIsWWWDup ( ) ; class LinkInfo *getLinkInfo1 ( ) ; class LinkInfo **getLinkInfo2 ( ) ; char *getSite ( ) ; void gotSite ( ) ; int64_t *getSiteHash64 ( ) ; int32_t *getSiteHash32 ( ) ; char **getHttpReply ( ) ; char **getHttpReply2 ( ) ; char **gotHttpReply ( ) ; char *getIsContentTruncated ( ); int32_t *getDownloadStatus ( ) ; int64_t *getDownloadEndTime ( ) ; int16_t *getHttpStatus ( ); char waitForTimeSync ( ) ; bool m_alreadyRegistered; class HttpMime *getMime () ; char **getContent ( ) ; uint8_t *getContentType ( ) ; uint16_t *getCharset ( ) ; char *getIsBinary ( ) ; char **getFilteredContent ( ) ; void filterStart_r ( bool amThread ) ; char **getRawUtf8Content ( ) ; char **getExpandedUtf8Content ( ) ; char **getUtf8Content ( ) ; // we download large files to a file on disk, like warcs and arcs FILE *getUtf8ContentInFile ( ); int32_t *getContentHash32 ( ) ; int32_t *getContentHashJson32 ( ) ; //int32_t *getTagHash32 ( ) ; int32_t *getTagPairHashVector ( ) ; uint32_t *getTagPairHash32 ( ) ; int32_t getHostHash32a ( ) ; int32_t getHostHash32b ( ) ; int32_t getDomHash32 ( ); char **getThumbnailData(); class Images *getImages ( ) ; int8_t *getNextSpiderPriority ( ) ; int32_t *getPriorityQueueNum ( ) ; class TagRec ***getOutlinkTagRecVector () ; char *hasNoIndexMetaTag(); char *hasFakeIpsMetaTag ( ); int32_t **getOutlinkFirstIpVector () ; //char **getOutlinkIsIndexedVector () ; int32_t *getRegExpNum ( int32_t outlinkNum ) ; int32_t *getRegExpNum2 ( int32_t outlinkNum ) ; char *getIsSiteRoot ( ) ; bool getIsOutlinkSiteRoot ( char *u , class TagRec *gr ) ; int8_t *getHopCount ( ) ; //int8_t *getOutlinkHopCountVector ( ) ; char *getSpiderLinks ( ) ; int32_t *getNextSpiderTime ( ) ; //char *getIsSpam() ; char *getIsFiltered (); bool getIsInjecting(); int32_t *getSpiderPriority ( ) ; int32_t *getIndexCode ( ) ; int32_t *getIndexCode2 ( ) ; SafeBuf *getNewTagBuf ( ) ; char 
*updateTagdb ( ) ; bool logIt ( class SafeBuf *bb = NULL ) ; bool m_doConsistencyTesting; bool doConsistencyTest ( bool forceTest ) ; int32_t printMetaList ( ) ; void printMetaList ( char *metaList , char *metaListEnd , class SafeBuf *pbuf ); bool verifyMetaList ( char *p , char *pend , bool forDelete ) ; bool hashMetaList ( class HashTableX *ht , char *p , char *pend , bool checkList ) ; char *getMetaList ( bool forDelete = false ); char *getDiffbotParentUrl( char *myUrl ); int64_t m_diffbotReplyEndTime; int64_t m_diffbotReplyStartTime; int32_t m_diffbotReplyRetries; bool m_sentToDiffbotThisTime; uint64_t m_downloadStartTime; //uint64_t m_downloadEndTime; uint64_t m_ipStartTime; uint64_t m_ipEndTime; bool m_updatedMetaData; void copyFromOldDoc ( class XmlDoc *od ) ; class SpiderReply *getFakeSpiderReply ( ); // we add a SpiderReply to spiderdb when done spidering, even if // m_indexCode or g_errno was set! class SpiderReply *getNewSpiderReply ( ); SpiderRequest **getRedirSpiderRequest ( ); SpiderRequest m_redirSpiderRequest; SpiderRequest *m_redirSpiderRequestPtr; void setSpiderReqForMsg20 ( class SpiderRequest *sreq , class SpiderReply *srep ); char *addOutlinkSpiderRecsToMetaList ( ); //bool addTable96 ( class HashTableX *tt1 , // int32_t date1 , // bool nosplit ) ; int32_t getSiteRank (); bool addTable144 ( class HashTableX *tt1 , int64_t docId , class SafeBuf *buf = NULL ); bool addTable224 ( HashTableX *tt1 ) ; //bool addTableDate ( class HashTableX *tt1 , //T<key128_t,char> *tt1 // uint64_t docId , // uint8_t rdbId , // bool nosplit ) ; bool addTable128 ( class HashTableX *tt1 , // T <key128_t,char>*tt1 uint8_t rdbId , bool forDelete ) ; bool hashNoSplit ( class HashTableX *tt ) ; char *hashAll ( class HashTableX *table ) ; int32_t getBoostFromSiteNumInlinks ( int32_t inlinks ) ; bool hashSpiderReply (class SpiderReply *reply ,class HashTableX *tt) ; bool hashMetaTags ( class HashTableX *table ) ; bool hashMetaData ( class HashTableX *table ) ; bool 
hashIsClean ( class HashTableX *table ) ; bool hashZipCodes ( class HashTableX *table ) ; bool hashMetaZip ( class HashTableX *table ) ; bool hashContentType ( class HashTableX *table ) ; bool hashDMOZCategories ( class HashTableX *table ) ; bool hashLinks ( class HashTableX *table ) ; bool getUseTimeAxis ( ) ; SafeBuf *getTimeAxisUrl ( ); bool hashUrl ( class HashTableX *table ); bool hashDateNumbers ( class HashTableX *tt ); bool hashSections ( class HashTableX *table ) ; bool hashIncomingLinkText ( class HashTableX *table , bool hashAnomalies , bool hashNonAnomalies ) ; bool hashLinksForLinkdb ( class HashTableX *table ) ; bool hashNeighborhoods ( class HashTableX *table ) ; bool hashRSSInfo ( class HashTableX *table ) ; bool hashRSSTerm ( class HashTableX *table , bool inRSS ) ; bool hashTitle ( class HashTableX *table ); bool hashBody2 ( class HashTableX *table ); bool hashMetaKeywords ( class HashTableX *table ); bool hashMetaSummary ( class HashTableX *table ); bool linksToGigablast ( ) ; bool searchboxToGigablast ( ) ; bool hashLanguage ( class HashTableX *table ) ; bool hashLanguageString ( class HashTableX *table ) ; bool hashCountry ( class HashTableX *table ) ; bool hashSiteNumInlinks ( class HashTableX *table ) ; bool hashCharset ( class HashTableX *table ) ; bool hashTagRec ( class HashTableX *table ) ; bool hashPermalink ( class HashTableX *table ) ; bool hashVectors(class HashTableX *table ) ; bool hashAds(class HashTableX *table ) ; class Url *getBaseUrl ( ) ; bool hashSubmitUrls ( class HashTableX *table ) ; bool hashImageStuff ( class HashTableX *table ) ; bool hashIsAdult ( class HashTableX *table ) ; void set20 ( Msg20Request *req ) ; class Msg20Reply *getMsg20Reply ( ) ; char **getDiffbotPrimaryImageUrl ( ) ; char **getImageUrl() ; class MatchOffsets *getMatchOffsets () ; Query *getQuery() ; Matches *getMatches () ; char *getDescriptionBuf ( char *displayMetas , int32_t *dlen ) ; SafeBuf *getHeaderTagBuf(); class Title *getTitle (); class 
Summary *getSummary () ; char *getHighlightedSummary (); SafeBuf *getSampleForGigabits ( ) ; SafeBuf *getSampleForGigabitsJSON ( ) ; char *getIsCompromised ( ) ; char *getIsNoArchive ( ) ; int32_t *getUrlFilterNum(); //int32_t *getDiffbotApiNum(); SafeBuf *getDiffbotApiUrl(); int64_t **getAdVector ( ) ; char *getIsLinkSpam ( ) ; char *getIsHijacked(); char *getIsErrorPage ( ) ; char* matchErrorMsg(char* p, char* pend ); bool hashWords ( //int32_t wordStart , //int32_t wordEnd , class HashInfo *hi ) ; bool hashSingleTerm ( int64_t termId , class HashInfo *hi ) ; bool hashSingleTerm ( char *s , int32_t slen , class HashInfo *hi ); bool hashString ( class HashTableX *ht , //class Weights *we , class Bits *bits , char *s , int32_t slen ) ; bool hashString ( char *s , int32_t slen , class HashInfo *hi ) ; bool hashString ( char *s , class HashInfo *hi ) ; bool hashWords3 ( //int32_t wordStart , //int32_t wordEnd , class HashInfo *hi , class Words *words , class Phrases *phrases , class Synonyms *synonyms , class Sections *sections , class HashTableX *countTable , char *fragVec , char *wordSpamVec , char *langVec , char docLangId , // default lang id class SafeBuf *pbuf , class HashTableX *wts , class SafeBuf *wbuf , int32_t niceness ); bool hashString3 ( char *s , int32_t slen , class HashInfo *hi , class HashTableX *countTable , class SafeBuf *pbuf , class HashTableX *wts , class SafeBuf *wbuf , int32_t version , int32_t siteNumInlinks , int32_t niceness ); //bool hashSectionTerm ( char *term , // class HashInfo *hi , // int32_t sentHash32 ) ; bool hashFacet1 ( char *term, class Words *words , HashTableX *dt) ; bool hashFacet2 ( char *prefix,char *term,int32_t val32, HashTableX *dt, bool shardByTermId = false ) ; // gbfieldmatch: bool hashFieldMatchTerm ( char *val, int32_t vlen, class HashInfo *hi); bool hashNumber ( char *beginBuf , char *buf , int32_t bufLen , class HashInfo *hi ) ; bool hashNumber2 ( float f , class HashInfo *hi , char *gbsortByStr ) ; bool 
hashNumber3 ( int32_t x, class HashInfo *hi , char *gbsortByStr ) ; bool storeFacetValues ( char *qs , class SafeBuf *sb , FacetValHash_t fvh ) ; bool storeFacetValuesSite ( char *qs , SafeBuf *sb , FacetValHash_t fvh ); bool storeFacetValuesSections ( char *qs , class SafeBuf *sb , FacetValHash_t fvh ) ; bool storeFacetValuesHtml ( char *qs , class SafeBuf *sb , FacetValHash_t fvh ) ; bool storeFacetValuesXml ( char *qs , class SafeBuf *sb , FacetValHash_t fvh ) ; bool storeFacetValuesJSON ( char *qs , class SafeBuf *sb , FacetValHash_t fvh, Json* jp ) ; // print out for PageTitledb.cpp and PageParser.cpp bool printDoc ( class SafeBuf *pbuf ); bool printMenu ( class SafeBuf *pbuf ); bool printDocForProCog ( class SafeBuf *sb , HttpRequest *hr ) ; bool printGeneralInfo ( class SafeBuf *sb , HttpRequest *hr ) ; bool printRainbowSections ( class SafeBuf *sb , HttpRequest *hr ); bool printSiteInlinks ( class SafeBuf *sb , HttpRequest *hr ); bool printPageInlinks ( class SafeBuf *sb , HttpRequest *hr ); bool printTermList ( class SafeBuf *sb , HttpRequest *hr ); bool printSpiderStats ( class SafeBuf *sb , HttpRequest *hr ); bool printCachedPage ( class SafeBuf *sb , HttpRequest *hr ); bool printSerpFiltered ( class Section *sx , char *tagName ) ; char **getTitleBuf ( ); char **getRootTitleBuf ( ); char **getFilteredRootTitleBuf ( ); // funcs that update our tagdb tagrec, m_tagRec, and also update tagdb bool *updateVenueAddresses ( ); // called by msg0 handler to add posdb termlists into g_termListCache // for faster seo pipeline bool cacheTermLists(); public: // stuff set from the key of the titleRec, above the compression area //key_t m_key; int64_t m_docId; char *m_ubuf; int32_t m_ubufSize; int32_t m_ubufAlloc; // does this page link to gigablast, or has a search form to it? 
//bool linksToGigablast(); //bool searchboxToGigablast(); // private: // we we started spidering it, in milliseconds since the epoch int64_t m_startTime; int64_t m_injectStartTime; class XmlDoc *m_prevInject; class XmlDoc *m_nextInject; // when set() was called by Msg20.cpp so we can time how long it took // to generate the summary int64_t m_setTime; int64_t m_cpuSummaryStartTime; // timers int64_t m_beginSEOTime; int64_t m_beginTimeAllMatch; int64_t m_beginTimeMatchUrl; int64_t m_beginTimeFullQueries; int64_t m_beginTimeLinks; //int64_t m_beginMsg98s; int64_t m_beginRelatedQueries; int64_t m_beginMsg95s; // . these should all be set using set*() function calls so their // individual validity flags can bet set to true, and successive // calls to their corresponding get*() functions will not core // . these particular guys are set immediately on set(char *titleRec) Url m_redirUrl; Url *m_redirUrlPtr; Url *m_lastRedirUrlPtr; SafeBuf m_redirCookieBuf; Url m_metaRedirUrl; Url *m_metaRedirUrlPtr; Url m_canonicalRedirUrl; Url *m_canonicalRedirUrlPtr; int32_t m_redirError; char m_allowSimplifiedRedirs; Url m_firstUrl; int64_t m_firstUrlHash48; int64_t m_firstUrlHash64; Url m_currentUrl; //char *m_coll; //char m_collBuf[MAX_COLL_LEN+1]; // include \0 CollectionRec *m_lastcr; collnum_t m_collnum; int32_t m_lastCollRecResetCount; class CollectionRec *getCollRec ( ) ; bool setCollNum ( char *coll ) ; char *m_content; int32_t m_contentLen; char m_zeroedOut; char *m_metaList; int32_t m_metaListSize; int32_t m_addedSpiderRequestSize; int32_t m_addedSpiderReplySize; int32_t m_addedStatusDocSize; int64_t m_addedStatusDocId; SafeBuf m_metaList2; SafeBuf m_zbuf; SafeBuf m_kbuf; // warc parsing member vars class Msg7 *m_msg7; class Msg7 *m_msg7s[MAXMSG7S]; char *m_warcContentPtr; char *m_arcContentPtr; char *m_anyContentPtr; char *m_contentDelim; SafeBuf m_injectUrlBuf; bool m_subDocsHaveMime; int32_t m_warcError ; int32_t m_arcError ; bool m_doneInjectingWarc ; int64_t 
m_bytesStreamed; char *m_fileBuf ; int32_t m_fileBufAllocSize; bool m_registeredWgetReadCallback; char *m_fptr ; char *m_fptrEnd ; FILE* m_pipe; BigFile m_file; int64_t m_fileSize; FileState m_fileState; bool m_readThreadOut; bool m_hasMoreToRead; int32_t m_numInjectionsOut; bool m_calledWgetThread; // used by msg7 to store udp slot class UdpSlot *m_injectionSlot; // . same thing, a little more complicated // . these classes are only set on demand Xml m_xml; Links m_links; Words m_words; Bits m_bits; Bits m_bits2; Pos m_pos; Phrases m_phrases; //Synonyms m_synonyms; SafeBuf m_synBuf; //Weights m_weights; Sections m_sections; // a hack storage thing used by Msg13.cpp class Msg13Request *m_hsr; Section *m_si; //Section *m_nextSection; //Section *m_lastSection; int32_t m_mcastRequestsOut; int32_t m_mcastRequestsIn; int32_t m_secStatsErrno; char *m_queryBuf; Msg39Request *m_msg39RequestArray; SafeBuf m_mcastBuf; Multicast *m_mcastArray; //char *m_inUse; //Query *m_queryArray; //Query *m_sharedQuery; bool m_gotDupStats; //Query m_q4; //Msg3a m_msg3a; //Msg39Request m_r39; Msg39Request m_mr2; SectionStats m_sectionStats; HashTableX m_sectionStatsTable; //char m_sectionHashQueryBuf[128]; // also set in getSections() int32_t m_maxVotesForDup; // . for rebuild logging of what's changed // . 
Repair.cpp sets these based on titlerec char m_logLangId; int32_t m_logSiteNumInlinks; SectionVotingTable m_nsvt; SectionVotingTable m_osvt; int32_t m_numSectiondbReads; int32_t m_numSectiondbNeeds; key128_t m_sectiondbStartKey; RdbList m_secdbList; int32_t m_sectiondbRecall; SafeBuf m_tmpBuf3; bool m_gotFacets; SafeBuf m_tmpBuf2; char m_tmp9[64]; SafeBuf m_inlineSectionVotingBuf; //HashTableX m_rvt; //Msg17 m_msg17; //char *m_cachedRootVoteRec; //int32_t m_cachedRootVoteRecSize; //bool m_triedVoteCache; //bool m_storedVoteCache; //SafeBuf m_cacheRecBuf; SafeBuf m_timeAxisUrl; HashTableX m_turkVotingTable; HashTableX m_turkBitsTable; uint32_t m_confirmedTitleContentHash ; uint32_t m_confirmedVenueContentHash ; uint32_t m_confirmedTitleTagHash ; uint32_t m_confirmedVenueTagHash ; // turk voting tag rec TagRec m_vtr; // tagrec of banned turks TagRec m_bannedTurkRec; // and the table of the hashed banned turk users HashTableX m_turkBanTable; // used for displaying turk votes... HashTableX m_vctab; HashTableX m_vcduptab; bool isFirstUrlRobotsTxt(); bool m_isRobotsTxtUrl; Images m_images; HashTableX m_countTable; HttpMime m_mime; TagRec m_tagRec; SafeBuf m_tagRecBuf; // copy of m_oldTagRec but with our modifications, if any //TagRec m_newTagRec; SafeBuf m_newTagBuf; SafeBuf m_fragBuf; SafeBuf m_wordSpamBuf; SafeBuf m_finalSummaryBuf; // this one is initially the same as m_tagRec, but we do not modify it // so that Address.cpp can reference into its buffer, m_buf, without // fear of getting the buffer overwritten by crap //TagRec m_savedTagRec1; //char *m_sampleVector ; //uint32_t m_tagPairHash32; int32_t m_firstIp; class SafeBuf *m_savedSb; class HttpRequest *m_savedHr; char m_savedChar; // validity flags. on reset() all these are set to false. char m_VALIDSTART; // DO NOT add validity flags above this line! 
char m_metaListValid; char m_addedSpiderRequestSizeValid; char m_addedSpiderReplySizeValid; char m_addedStatusDocSizeValid; char m_downloadStartTimeValid; char m_contentDelimValid; char m_fileValid; //char m_docQualityValid; char m_siteValid; char m_startTimeValid; char m_currentUrlValid; char m_useTimeAxisValid; char m_timeAxisUrlValid; char m_firstUrlValid; char m_firstUrlHash48Valid; char m_firstUrlHash64Valid; char m_lastUrlValid; char m_docIdValid; char m_availDocIdValid; //char m_collValid; char m_tagRecValid; char m_robotsTxtLenValid; char m_tagRecDataValid; char m_newTagBufValid; char m_rootTitleBufValid; char m_filteredRootTitleBufValid; char m_titleBufValid; char m_fragBufValid; char m_isRobotsTxtUrlValid; char m_inlineSectionVotingBufValid; char m_wordSpamBufValid; char m_finalSummaryBufValid; char m_matchingQueryBufValid; char m_matchesCrawlPatternValid; char m_relatedQueryBufValid; char m_queryLinkBufValid; char m_redirSpiderRequestValid; //char m_queryPtrsValid; char m_queryOffsetsValid; //char m_queryPtrsSortedValid; char m_queryPtrsWholeValid; char m_relatedDocIdBufValid; char m_topMatchingQueryBufValid; char m_relatedDocIdsScoredBufValid; char m_relatedDocIdsWithTitlesValid; char m_relatedTitleBufValid; //char m_queryLinkBufValid; char m_missingTermBufValid; char m_matchingTermBufValid; //char m_relPtrsValid; char m_sortedPosdbListBufValid; char m_wpSortedPosdbListBufValid; char m_termListBufValid; char m_insertableTermsBufValid; char m_scoredInsertableTermsBufValid; //char m_iwfiBufValid; // for holding WordFreqInfo instances char m_wordPosInfoBufValid; char m_recommendedLinksBufValid; char m_tempMsg25PageValid; char m_tempMsg25SiteValid; //char m_queryHashTableValid; char m_queryOffsetTableValid; //char m_socketWriteBufValid; //char m_numBannedOutlinksValid; char m_hopCountValid; char m_isInjectingValid; char m_isImportingValid; char m_metaListCheckSum8Valid; char m_contentValid; char m_filteredContentValid; char m_charsetValid; char 
m_langVectorValid; char m_langIdValid; char m_rootLangIdValid; char m_datedbDateValid; char m_isRSSValid; char m_isSiteMapValid; char m_spiderLinksArgValid; char m_isContentTruncatedValid; char m_xmlValid; char m_linksValid; char m_wordsValid; char m_bitsValid; char m_bits2Valid; char m_posValid; char m_isUrlBadYearValid; char m_phrasesValid; //char m_synonymsValid; //char m_weightsValid; char m_sectionsValid; char m_subSentsValid; char m_osvtValid; char m_nsvtValid; //char m_rvtValid; char m_turkVotingTableValid; char m_turkBitsTableValid; char m_turkBanTableValid; char m_vctabValid; char m_explicitSectionsValid; char m_impliedSectionsValid; char m_sectionVotingTableValid; char m_imageDataValid; char m_imagesValid; char m_msge0Valid; char m_msge1Valid; //char m_msge2Valid; //char m_sampleVectorValid; char m_gigabitHashesValid; //char m_oldsrValid; char m_sreqValid; char m_srepValid; bool m_ipValid; bool m_firstIpValid; bool m_spideredTimeValid; //bool m_nextSpiderTimeValid; bool m_indexedTimeValid; bool m_firstIndexedValid; bool m_isInIndexValid; bool m_wasInIndexValid; bool m_outlinksAddedDateValid; bool m_countryIdValid; bool m_bodyStartPosValid; /* bool m_titleWeightValid; bool m_headerWeightValid; bool m_urlPathWeightValid; bool m_externalLinkTextWeightValid; bool m_internalLinkTextWeightValid; bool m_conceptWeightValid; */ bool m_httpStatusValid; bool m_crawlDelayValid; bool m_finalCrawlDelayValid; bool m_titleRecKeyValid; bool m_adVectorValid; bool m_wikiDocIdsValid; bool m_catIdsValid; bool m_versionValid; bool m_indCatIdsValid; bool m_dmozTitlesValid; bool m_dmozSummsValid; bool m_dmozAnchorsValid; bool m_dmozInfoValid; bool m_rawUtf8ContentValid; bool m_expandedUtf8ContentValid; bool m_utf8ContentValid; bool m_isAllowedValid; //bool m_tryAgainTimeDeltaValid; //bool m_eliminateMenusValid; bool m_redirUrlValid; bool m_redirCookieBufValid; bool m_metaRedirUrlValid; bool m_canonicalRedirUrlValid; bool m_statusMsgValid; bool m_mimeValid; bool m_pubDateValid; 
bool m_hostHash32aValid; bool m_hostHash32bValid; bool m_indexCodeValid; bool m_priorityValid; bool m_downloadStatusValid; bool m_downloadEndTimeValid; bool m_redirErrorValid; bool m_domHash32Valid; bool m_contentHash32Valid; //bool m_tagHash32Valid; bool m_tagPairHash32Valid; bool m_linkInfo2Valid; bool m_spiderLinksValid; //bool m_nextSpiderPriorityValid; bool m_firstIndexedDateValid; bool m_isPermalinkValid; bool m_isAdultValid; bool m_hasAddressValid; bool m_hasTODValid; //bool m_hasSiteVenueValid; bool m_catRecValid; bool m_urlPubDateValid; bool m_isUrlPermalinkFormatValid; bool m_percentChangedValid; bool m_unchangedValid; bool m_countTableValid; bool m_summaryLangIdValid; bool m_tagPairHashVecValid; bool m_summaryVecValid; bool m_titleVecValid; bool m_pageSampleVecValid; bool m_postVecValid; bool m_dupListValid; bool m_likedbListValid; bool m_isDupValid; bool m_gigabitVectorHashValid; bool m_gigabitQueryValid; bool m_metaDescValid; bool m_metaSummaryValid; bool m_metaKeywordsValid; bool m_siteSpiderQuotaValid; bool m_oldDocValid; bool m_extraDocValid; bool m_ahrefsDocValid; //bool m_contactDocValid; bool m_rootDocValid; //bool m_gatewayDocValid; bool m_oldMetaListValid; bool m_oldTitleRecValid; bool m_rootTitleRecValid; //bool m_contactTitleRecValid; bool m_isIndexedValid; bool m_hasContactInfoValid; bool m_isContactyValid; bool m_contactInfoTagRecValid; bool m_addressesValid; bool m_contactAddressesValid; bool m_emailBufValid; //bool m_contactUsLinkValid; //bool m_aboutUsLinkValid; //bool m_contactLinksValid; bool m_siteNumInlinksValid; //bool m_siteNumInlinksUniqueIpValid;//FreshValid; //bool m_siteNumInlinksUniqueCBlockValid;//sitePopValid //bool m_siteNumInlinksTotalValid; bool m_siteNumInlinks8Valid; bool m_siteLinkInfoValid; bool m_isWWWDupValid; bool m_linkInfo1Valid; bool m_linkSiteHashesValid; //bool m_dateParse2Valid; bool m_simpleDatesValid; bool m_datesValid; bool m_sectionsReplyValid; bool m_sectionsVotesValid; bool m_sectiondbDataValid; bool 
m_placedbDataValid; bool m_siteHash64Valid; bool m_siteHash32Valid; bool m_httpReplyValid; bool m_contentTypeValid; bool m_isBinaryValid; bool m_priorityQueueNumValid; bool m_outlinkTagRecVectorValid; bool m_outlinkIpVectorValid; bool m_hasNoIndexMetaTagValid; bool m_hasUseFakeIpsMetaTagValid; bool m_outlinkIsIndexedVectorValid; bool m_isSiteRootValid; bool m_wasContentInjectedValid; bool m_outlinkHopCountVectorValid; //bool m_isSpamValid; bool m_isFilteredValid; bool m_urlFilterNumValid; bool m_numOutlinksAddedValid; bool m_baseUrlValid; bool m_replyValid; bool m_recycleDiffbotReplyValid; bool m_diffbotReplyValid; bool m_tokenizedDiffbotReplyValid; //bool m_diffbotUrlCrawlPatternMatchValid; //bool m_diffbotUrlProcessPatternMatchValid; //bool m_diffbotPageProcessPatternMatchValid; //bool m_useDiffbotValid; //bool m_diffbotApiNumValid; bool m_diffbotApiUrlValid; bool m_diffbotTitleHashBufValid; bool m_crawlInfoValid; bool m_isPageParserValid; bool m_imageUrlValid; bool m_imageUrl2Valid; bool m_matchOffsetsValid; bool m_queryValid; bool m_diffbotProxyReplyValid; bool m_matchesValid; bool m_dbufValid; bool m_titleValid; bool m_htbValid; bool m_collnumValid; //bool m_twidsValid; bool m_termId32BufValid; bool m_termInfoBufValid; bool m_newTermInfoBufValid; bool m_summaryValid; bool m_gsbufValid; bool m_spiderStatusDocMetaListValid; bool m_isCompromisedValid; bool m_isNoArchiveValid; //bool m_isVisibleValid; //bool m_clockCandidatesTableValid; //bool m_clockCandidatesDataValid; bool m_titleRecBufValid; bool m_isLinkSpamValid; bool m_isErrorPageValid; bool m_isHijackedValid; bool m_dupHashValid; bool m_exactContentHash64Valid; bool m_looseContentHash64Valid; bool m_jpValid; char m_isSiteMap; // shadows char m_isRSS2; char m_isPermalink2; char m_isAdult2; char m_spiderLinks2; char m_isContentTruncated2; char m_isLinkSpam2; bool m_hasAddress2; bool m_hasTOD2; //bool m_hasSiteVenue2; char m_hasContactInfo2; char m_isSiteRoot2; // DO NOT add validity flags below this line! 
char m_VALIDEND; // more stuff //char *m_utf8Content; //int32_t m_utf8ContentLen; CatRec m_catRec; // use this stuff for getting wiki docids that match our doc's gigabits //Query m_wq; //SearchInput m_si; //Msg40 m_msg40; //DateParse2 m_dateParse2; bool m_printedMenu; Dates m_dates; //HashTableX m_clockCandidatesTable; //SafeBuf m_cctbuf; float m_ageInDays; int32_t m_urlPubDate; //int32_t m_urlAge; char m_isUrlPermalinkFormat; uint8_t m_summaryLangId; int32_t m_tagPairHashVec[MAX_TAG_PAIR_HASHES]; int32_t m_tagPairHashVecSize; int32_t m_summaryVec [SAMPLE_VECTOR_SIZE/4]; int32_t m_summaryVecSize; int32_t m_titleVec [SAMPLE_VECTOR_SIZE/4]; int32_t m_titleVecSize; int32_t m_pageSampleVec[SAMPLE_VECTOR_SIZE/4]; int32_t m_pageSampleVecSize; int32_t m_postVec[POST_VECTOR_SIZE/4]; int32_t m_postVecSize; float m_tagSimilarity; float m_gigabitSimilarity; float m_pageSimilarity; float m_percentChanged; bool m_unchanged; // what docids are similar to us? docids are in this list RdbList m_dupList; RdbList m_likedbList; uint64_t m_dupHash; int64_t m_exactContentHash64; int64_t m_looseContentHash64; Msg0 m_msg0; Msg5 m_msg5; char m_isDup; int64_t m_docIdWeAreADupOf; int32_t m_ei; int32_t m_lastLaunch; Msg22Request m_msg22Request; Msg22Request m_msg22Requestc; Msg22 m_msg22a; Msg22 m_msg22b; Msg22 m_msg22c; Msg22 m_msg22d; Msg22 m_msg22e; Msg22 m_msg22f; //int32_t m_collLen; uint32_t m_gigabitVectorHash; char m_gigabitQuery [XD_GQ_MAX_SIZE]; int32_t m_gigabitHashes [XD_MAX_GIGABIT_HASHES]; int32_t m_gigabitScores [XD_MAX_GIGABIT_HASHES]; char *m_gigabitPtrs [XD_MAX_GIGABIT_HASHES]; // for debug printing really class GigabitInfo *m_top[100]; int32_t m_numTop; //char m_metaDesc[1025]; //char m_metaKeywords[1025]; // these now reference directly into the html src so our // WordPosInfo::m_wordPtr algo works in seo.cpp char *m_metaDesc; int32_t m_metaDescLen; char *m_metaSummary; int32_t m_metaSummaryLen; char *m_metaKeywords; int32_t m_metaKeywordsLen; int32_t m_siteSpiderQuota; 
//int32_t m_numBannedOutlinks; class XmlDoc *m_oldDoc; class XmlDoc *m_extraDoc; class XmlDoc *m_ahrefsDoc; //class XmlDoc *m_contactDoc; class XmlDoc *m_rootDoc; //class XmlDoc *m_gatewayDoc; RdbList m_oldMetaList; char *m_oldTitleRec; int32_t m_oldTitleRecSize; char *m_rootTitleRec; int32_t m_rootTitleRecSize; //char *m_contactTitleRec; //int32_t m_contactTitleRecSize; char m_isIndexed; // confusing, i know! these are used exclsusively by // getNewSpiderReply() for now char m_isInIndex; char m_wasInIndex; bool m_oldDocExistedButHadError; Msg8a m_msg8a; char *m_tagdbColl; int32_t m_tagdbCollLen; Addresses m_addresses; Address *m_contactAddresses[MAX_CONTACT_ADDRESSES]; int32_t m_numContactAddresses; char m_isContacty; //Url m_contactUsLink; //Url m_aboutUsLink; /* char *m_contactLinks [MAX_CONTACT_OUTLINKS]; int32_t m_contactLens [MAX_CONTACT_OUTLINKS]; int32_t m_contactScores [MAX_CONTACT_OUTLINKS]; int32_t m_contactFlags [MAX_CONTACT_OUTLINKS]; char m_contactProcessed [MAX_CONTACT_OUTLINKS]; char *m_contactText [MAX_CONTACT_OUTLINKS]; char *m_contactTextEnd [MAX_CONTACT_OUTLINKS]; int32_t m_minContactScore; int32_t m_minContactIndex; int32_t m_numContactLinks; */ Url m_extraUrl; //int32_t m_siteNumInlinksFresh; //int32_t m_sitePop; uint8_t m_siteNumInlinks8; //int32_t m_siteNumInlinks; LinkInfo m_siteLinkInfo; SafeBuf m_mySiteLinkInfoBuf; SafeBuf m_myPageLinkInfoBuf; SafeBuf m_myTempLinkInfoBuf; char m_isInjecting; char m_isImporting; char m_useFakeMime; char m_useSiteLinkBuf; char m_usePageLinkBuf; char m_printInXml; //Msg25 m_msg25; SafeBuf m_tmpBuf11; SafeBuf m_tmpBuf12; Multicast m_mcast11; Multicast m_mcast12; Msg25 *m_tempMsg25Page; Msg25 *m_tempMsg25Site; // for page or for site? 
Msg25 *getAllInlinks ( bool forSite ); // lists from cachedb for msg25's msg20 replies serialized RdbList m_siteReplyList; RdbList m_pageReplyList; bool m_checkedCachedbForSite; bool m_checkedCachedbForPage; bool m_triedToAddWordPosInfoToCachedb; bool m_calledMsg25ForSite; bool m_calledMsg25ForPage; //void (* m_masterLoopWrapper) (void *state); MsgC m_msgc; bool m_isAllowed; bool m_forwardDownloadRequest; bool m_isChildDoc; class XmlDoc *m_parentDocPtr; Msg13 m_msg13; Msg13Request m_msg13Request; Msg13Request m_diffbotProxyRequest; ProxyReply *m_diffbotProxyReply; bool m_isSpiderProxy; // for limiting # of iframe tag expansions int32_t m_numExpansions; char m_newOnly; //int32_t m_tryAgainTimeDelta; //int32_t m_sameIpWait; //int32_t m_sameDomainWait; //int32_t m_maxSpidersPerDomain; char m_isWWWDup; char m_calledMsg0b; Url m_tmpUrl; SafeBuf m_tmpsb1; SafeBuf m_tmpsb2; SafeBuf m_turkBuf; SafeBuf m_linkSiteHashBuf; SafeBuf m_linkdbDataBuf; SafeBuf m_langVec; Msg0 m_msg0b; class RdbList *m_ulist; void *m_hack; class XmlDoc *m_hackxd; //class LinkInfo *m_linkInfo1Ptr; char *m_linkInfoColl; //char m_injectedReply; //int32_t m_minInlinkerHopCount; //class LinkInfo *m_linkInfo2Ptr; SiteGetter m_siteGetter; int64_t m_siteHash64; //char *m_site; //int32_t m_siteLen; //Url m_siteUrl; int32_t m_siteHash32; char *m_httpReply; //char m_downloadAttempted; char m_incrementedAttemptsCount; char m_incrementedDownloadCount; char m_redirectFlag; //char m_isScraping; //char m_throttleDownload; char m_spamCheckDisabled; char m_useRobotsTxt; int32_t m_robotsTxtLen; int32_t m_httpReplySize; int32_t m_httpReplyAllocSize; char m_isBinary; char *m_filteredContent; int32_t m_filteredContentLen; char *m_filter; int32_t m_filteredContentAllocSize; int32_t m_filteredContentMaxSize; char m_calledThread; int32_t m_errno; //class CollectionRec *m_cr; //int32_t m_utf8ContentAllocSize; int32_t m_hostHash32a; int32_t m_hostHash32b; int32_t m_domHash32; int32_t m_priorityQueueNum; // this points into 
m_msge0 i guess //class TagRec **m_outlinkTagRecVector; Msge0 m_msge0; // this points into m_msge1 i guess int32_t *m_outlinkIpVector; SafeBuf m_outlinkTagRecPtrBuf; SafeBuf m_fakeIpBuf; char m_hasNoIndexMetaTag; char m_hasUseFakeIpsMetaTag; Msge1 m_msge1; TagRec **m_outlinkTagRecVector; SafeBuf m_fakeTagRecPtrBuf; TagRec m_fakeTagRec; // // diffbot parms for indexing diffbot's json output // XmlDoc *m_dx; char *m_diffbotObj; SafeBuf m_diffbotReply; SafeBuf m_v3buf; SafeBuf *m_tokenizedDiffbotReplyPtr; SafeBuf m_tokenizedDiffbotReply; int32_t m_diffbotReplyError; bool m_recycleDiffbotReply; //bool m_diffbotUrlCrawlPatternMatch; //bool m_diffbotUrlProcessPatternMatch; //bool m_diffbotPageProcessPatternMatch; //int32_t m_diffbotApiNum; //bool m_useDiffbot; // url to access diffbot with SafeBuf m_diffbotApiUrl; SafeBuf m_diffbotUrl; // exact url used to fetch reply from diffbot bool *getRecycleDiffbotReply ( ) ; SafeBuf *getTokenizedDiffbotReply ( ) ; SafeBuf *getDiffbotReply ( ) ; bool doesUrlMatchDiffbotCrawlPattern() ; //bool doesUrlMatchDiffbotProcessPattern() ; bool doesPageContentMatchDiffbotProcessPattern() ; int32_t *getDiffbotTitleHashes ( int32_t *numHashes ) ; char *hashJSONFields ( HashTableX *table ); char *hashJSONFields2 ( HashTableX *table , HashInfo *hi , Json *jp , bool hashWithoutFieldNames ) ; char *hashXMLFields ( HashTableX *table ); int32_t *reindexJSONObjects ( int32_t *newTitleHashes , int32_t numNewHashes ) ; int32_t *nukeJSONObjects ( int32_t *newTitleHashes , int32_t numNewHashes ) ; int32_t *redoJSONObjects ( int32_t *newTitleHashes , int32_t numNewHashes , bool deleteFromIndex ) ; int32_t m_joc; SafeBuf m_diffbotTitleHashBuf; Json *getParsedJson(); // object that parses the json Json m_jp; //EmailInfo m_emailInfo; // // functions and vars for the seo query matching tool // bool loadTitleRecFromDiskOrSpider(); //SafeBuf *getSEOQueryInfo ( ); HashTableX *getTermIdBufDedupTable32(); //int32_t *getTopWordsVector( bool includeSynonyms ); 
SafeBuf *getTermId32Buf(); SafeBuf *getTermInfoBuf(); SafeBuf *getNewTermInfoBuf(); SafeBuf *getMatchingQueryBuf(); SafeBuf *getQueryLinkBuf(SafeBuf *docIdListBuf,bool doMatchingQueries); //SafeBuf *getMatchingQueriesScored(); SafeBuf *getMatchingQueriesScoredForFullQuery(); SafeBuf *getRelatedDocIds(); SafeBuf *getRelatedDocIdsScored(); SafeBuf *getTopMatchingQueryBuf(); bool addRelatedDocIdInfo ( int64_t docId , int32_t queryNum , float score , int32_t rank , int32_t siteHash26 ) ; bool setRelatedDocIdWeightAndRank ( class RelatedDocId *rd ); SafeBuf *getRelatedDocIdsWithTitles(); bool setRelatedDocIdInfoFromMsg20Reply ( class RelatedDocId *rd , class Msg20Reply *reply ); SafeBuf *getRelatedQueryBuf(); //SafeBuf *getRelatedQueryLinksModPart ( int32_t modPart ); bool addTermsFromQuery ( char *queryStr, uint8_t queryLangId, int32_t gigablastTraffic, int32_t googleTraffic, int32_t hackqoff, class SafeBuf *tmpBuf , class HashTableX *scoreTable , class HashTableX *topWordsTable , float imp, bool isRelatedQuery ) ; bool sortTermsIntoBuf ( class HashTableX *scoreTable , class SafeBuf *tmpBuf , class SafeBuf *missingTermBuf ) ; SafeBuf *getMissingTermBuf (); SafeBuf *getMatchingTermBuf (); SafeBuf *getTermIdSortedPosdbListBuf(); SafeBuf *getWordPosSortedPosdbListBuf(); SafeBuf *getTermListBuf(); // list of posdb termlists for caching SafeBuf *getWordPosInfoBuf ( ) ; //bool sendBin ( int32_t i ); //bool scoreDocIdRestrictedQueries(class Msg99Reply **replyPtrs, // class QueryLink *linkPtrs, // int32_t numPtrs ); // private like functions bool addUniqueWordsToBuf ( SafeBuf *termInfoBuf, HashTableX *dedupTable , HashTableX *filterTable , HashTableX *minCountTable , bool storeCounts, Words *words , bool includeSynonyms ); //void gotMsg99Reply ( UdpSlot *slot ); //void gotMsg98Reply ( UdpSlot *slot ); void gotMsg95Reply ( UdpSlot *slot ); //void gotMsg3aReplyForMainUrl ( ); void gotMsg3aReplyForFullQuery( ); //void gotMsg3aReplyForFullQueryCached ( char *cachedRec , // class 
Msg99Reply *qp ); //void gotMsg3aReplyForRelQuery ( class Msg3a *msg3a ); void gotMsg3fReply ( class Bin *bin ); //void pumpSocketWriteBuf ( ); //HashTableX *getMatchingQueryHashTable(); HashTableX *getMatchingQueryOffsetTable(); int32_t getNumInsertableTerms ( ); class SafeBuf *getInsertableTerms ( ); class SafeBuf *getScoredInsertableTerms ( ); //class SafeBuf *getInsertableWordFreqInfoBuf (); bool processMsg95Replies(); void setWordPosInfosTrafficGain ( class InsertableTerm *it ); int32_t getTrafficGain( class QueryChange *qc ) ; // print in xml bool printScoredInsertableTerms ( SafeBuf *sbuf ) ; HashTableX m_tidTable32; //int32_t *m_twids; //int32_t m_numTwids; SafeBuf m_termId32Buf; SafeBuf m_termInfoBuf; SafeBuf m_newTermInfoBuf; //int32_t m_maxQueries; //int32_t m_maxRelatedQueries; //int32_t m_maxRelatedUrls; //int32_t m_numMsg99Requests; //int32_t m_numMsg98Requests; //int32_t m_numMsg99Replies; //int32_t m_numMsg98Replies; //char *m_msg99ReplyPtrs [MAX_HOSTS]; //int32_t m_msg99ReplySizes[MAX_HOSTS]; //int32_t m_msg99ReplyAlloc[MAX_HOSTS]; //int32_t m_msg99HostIds [MAX_HOSTS]; char *m_msg95ReplyPtrs [MAX_HOSTS]; int32_t m_msg95ReplySizes[MAX_HOSTS]; //HashTableX m_queryHashTable; HashTableX m_queryOffsetTable; HashTableX m_tmpTable; HashTableX m_fullQueryDedup; //SafeBuf m_twbuf; //SafeBuf m_queryPtrs; SafeBuf m_matchingQueryBuf; SafeBuf m_matchingQueryStringBuf; SafeBuf m_relatedQueryBuf; SafeBuf m_relatedQueryStringBuf; SafeBuf m_docIdListBuf; SafeBuf m_queryOffsets; SafeBuf m_extraQueryBuf; //SafeBuf m_socketWriteBuf; SafeBuf m_relatedDocIdBuf; SafeBuf m_relatedTitleBuf; SafeBuf m_commonQueryNumBuf; SafeBuf m_topMatchingQueryBuf; HashTableX m_rdtab; // related query algo stuff SafeBuf m_queryLinkBuf; SafeBuf m_queryLinkStringBuf; char *m_msg8eReply [MAX_HOSTS]; int32_t m_msg8eReplySize[MAX_HOSTS]; int32_t m_numMsg8eRequests; int32_t m_numMsg8eReplies; //bool m_launchedAll; int64_t m_tlbufTimer; SafeBuf m_missingTermBuf; SafeBuf m_matchingTermBuf; 
//SafeBuf m_queryRelBuf; //SafeBuf m_relPtrs; SafeBuf m_sortedPosdbListBuf; SafeBuf m_wpSortedPosdbListBuf; SafeBuf m_termListBuf; SafeBuf m_insertableTermsBuf; //SafeBuf m_iwfiBuf; SafeBuf m_wordPosInfoBuf; //SafeBuf m_msg20ReplyPtrBuf; SafeBuf m_recommendedLinksBuf; SafeBuf m_tmpMsg0Buf; SafeBuf m_msg20Array; SafeBuf m_newLinkerBuf; //Msg17 m_msg17; //key_t m_cacheKey; //char *m_cacheRec; //int32_t m_cacheRecSize; //bool m_triedCache; //class TopDocIds *m_topDocIdsBuf; //int32_t m_topDocIdsBufSize; SafeBuf m_topDocIdsBuf; //class TopDocIds *m_nextAvailTopDocIds; //int32_t m_nextAvailTopDocIdsOffset; //int32_t m_maxFullQueries; //XmlDoc *m_newxd; //XmlDoc *m_newxd2; //bool m_newxd2Blocked; //HashTableX m_tmpDupTable; //class Msg20 *m_newMsg20; Msg3a *m_msg3a; Query *m_query3a; int32_t m_numMsg3aRequests; int32_t m_numMsg3aReplies; int32_t m_numMsg3fRequests; int32_t m_numMsg3fReplies; int32_t m_numMsg4fRequests; int32_t m_numMsg4fReplies; bool m_sentMsg4fRequests; bool m_matchesCrawlPattern; class UdpSlot *m_savedSlot; int32_t m_numMsg95Requests; int32_t m_numMsg95Replies; int32_t m_qcursor; char m_seoDebug; char m_progressBar; bool m_readFromCachedb; bool m_writeToCachedb; //bool m_setForReplyPtrs; //bool m_setForLinkPtrs; SafeBuf *getRecommendedLinksBuf ( ); bool processLinkInfoMsg20Reply ( class Msg25 *msg25 ); bool printRecommendedLinksBuf ( class SafeBuf *sb ) ; // recommendedlinksbuf vars and functions int32_t m_numLinkRequestsOut; int32_t m_numLinkRequestsIn; int32_t m_hadLinkInfoError; int32_t m_numMsg20sIn; int32_t m_numMsg20sOut; int32_t m_numValidMsg20s; int32_t m_titleCursor; int32_t m_msg20Phase; int32_t m_recommendedLinkError; SafeBuf *lookupTitles(); bool gotLinkerTitle ( class Msg20 *msg20 ); // 1 *current* bin per host! 
//class Bin *m_currentBinPtrs[MAX_HOSTS]; //int32_t m_binError; //int32_t m_msg98ReplyError; //int32_t m_binErrorForReplyPtrs; //int32_t m_binErrorForLinkPtrs; HashTableX m_qstringTable; // flow flags bool m_printedQueries; bool m_printedRelatedDocIds; bool m_printedRelatedQueries; bool m_printedScoredInsertableTerms; bool m_printedRecommendedLinks; bool m_loggedMsg3; int64_t m_lastPrintedDocId; //bool m_docIndexed; //bool m_sentMsg99Requests; bool m_didSet3; //bool m_didSet3b; bool m_registeredSocketCallback; // the caller's socket the expect the xml reply on TcpSocket *m_seoSocket; TcpSocket *m_hackSocket; bool m_doingSEO; bool clientClosedConnection ( ); bool m_hadMatchError; bool m_clientClosed; bool m_lastCheckTime; int32_t m_msg3aErrno ; bool m_computedMetaListCheckSum; // cachedb related args //bool m_seoInfoSetFromCache; bool m_checkedCachedb; bool m_processedCachedbReply; //bool m_storedIntoCachedb; RdbList m_cacheList; //SafeBuf m_msg99ReplyBuf; SafeBuf m_queryChangeBuf; SafeBuf m_queryLogBuf; //SafeBuf m_itStrBuf; SafeBuf m_debugScoreInfoBuf; SafeBuf m_origScoreInfoBuf; RdbList m_storeList; Msg1 m_msg1; bool m_allHashed; bool checkCachedb ( ); bool storeScoredInsertableTermsIntoCachedb ( ) ; bool storeRelatedQueriesIntoCachedb ( ) ; bool storeRelatedDocIdsIntoCachedb ( ) ; bool storeMatchingQueriesIntoCachedb ( ) ; // only the top 1000 or so bool storeMissingTermBufIntoCachedb ( ); bool storeWordPosInfoBufIntoCachedb ( ); bool storeRecommendedLinksBuf ( ); // cursors int32_t m_socketWriteBufSent; int32_t m_queryNum; int32_t m_rdCursor; int32_t m_relatedNum; int32_t m_numRelatedAdded; // for getRelatedDocIdsWithTitles() launching msg20s int32_t m_relatedDocIdError; int32_t m_numMsg20Replies; int32_t m_numMsg20Requests; SafeBuf m_msg20Buf; // this points into m_msge2 //char *m_outlinkIsIndexedVector; //Msge2 m_msge2; bool m_doneWithAhrefs; bool m_useAhrefs; bool m_reallyInjectLinks; int32_t m_downloadLevel; int32_t m_numRegExs; //char m_isSiteRoot; int8_t 
*m_outlinkHopCountVector; int32_t m_outlinkHopCountVectorSize; //char m_isSpam; char m_isUrlBadYear; char m_isFiltered; int32_t m_urlFilterNum; int32_t m_numOutlinksAdded; int32_t m_numOutlinksAddedFromSameDomain; int32_t m_numOutlinksFiltered; int32_t m_numOutlinksBanned; int32_t m_numRedirects; bool m_isPageParser; Url m_baseUrl; Msg20Reply m_reply; Msg20Request *m_req; char m_linkTextBuf[MAX_LINK_TEXT_LEN]; //char *m_gsbuf; SafeBuf m_surroundingTextBuf; SafeBuf m_rssItemBuf; SafeBuf m_gsbuf; //int32_t m_gsbufSize; //int32_t m_gsbufAllocSize; char *m_note; char *m_imageUrl; char *m_imageUrl2; //char m_imageUrlBuf[100]; SafeBuf m_imageUrlBuf; SafeBuf m_imageUrlBuf2; //int32_t m_imageUrlSize; MatchOffsets m_matchOffsets; Query m_query; Matches m_matches; // meta description buf int32_t m_dbufSize; char m_dbuf[1024]; SafeBuf m_htb; Title m_title; Summary m_summary; char m_isCompromised; char m_isNoArchive; char m_isErrorPage; char m_isHijacked; //char m_isVisible; //char m_dmozBuf[12000]; SafeBuf m_dmozBuf; int32_t m_numDmozEntries; // stuff char *m_statusMsg; Msg4 m_msg4; Msg8b m_msg8b; bool m_incCount; bool m_decCount; bool m_deleteFromIndex; // ptrs to stuff //char *m_titleRec; SafeBuf m_titleRecBuf; //int32_t m_titleRecSize; //bool m_freeTitleRec; //int32_t m_titleRecAllocSize; key_t m_titleRecKey; // for isDupOfUs() char *m_dupTrPtr; int32_t m_dupTrSize; // parse these out of spider rec /* int32_t m_retryNum ; int32_t m_spiderRecPriority ; bool m_spiderRecIsNew ; int32_t m_spiderRecSiteNumInlinks ; int32_t m_spiderRecRetryCount ; int32_t m_spiderRecHopCount ; key_t m_spiderRecKey ; bool m_spiderRecForced ; int32_t m_spiderRecTime ; int32_t m_srDataSize ; char m_srData [ MAX_SPIDERREC_SIZE ]; */ key_t m_doledbKey; SpiderRequest m_sreq; SpiderReply m_srep;//newsr; // bool flags for what procedures we have done bool m_checkedUrlFilters; bool m_listAdded ; bool m_listFlushed ; bool m_check1 ; bool m_check2 ; bool m_prepared ; bool m_updatedCounts ; bool 
m_updatedCounts2 ; //bool m_updatedTagdb1 ; //bool m_updatedTagdb2 ; //bool m_updatedTagdb3 ; //bool m_updatedTagdb4 ; //bool m_updatedTagdb5 ; bool m_copied1 ; bool m_updatingSiteLinkInfoTags ; bool m_addressSetCalled ; //bool m_calledMsg22a ; //bool m_calledMsg22b ; //bool m_calledMsg22c ; int64_t m_calledMsg22d ; bool m_didDelay ; bool m_didDelayUnregister ; bool m_calledMsg22e ; bool m_calledMsg22f ; bool m_calledMsg25 ; bool m_calledMsg25b ; bool m_calledMsg8b ; bool m_calledMsg40 ; bool m_calledSections ; bool m_firstEntry ; bool m_firstEntry2 ; bool m_launchedSpecialMsg8a ; bool m_launchedMsg8a2 ; bool m_loaded ; // used for getHasContactInfo() bool m_processed0 ; // a lock to prevent infinite loops //bool m_checkForRedir ; bool m_processedLang ; bool m_doingConsistencyCheck ; int32_t m_langIdScore; //int32_t m_rootLangIdScore; //uint8_t m_rootLangId; // used for getting contact info //bool m_triedRoot ; //int32_t m_winner ; int32_t m_dist; // the tags in this tagRec are just contact info based tags and // created in the addContactInfo() function. also, in that same // function we add/sub the tags in m_citr to the m_newTagRec tag rec. //TagRec m_citr ; char m_emailBuf[EMAILBUFSIZE]; int32_t m_numOfficialEmails; // use to store a \0 list of "titles" of the root page so we can // see which if any are the venue name, and thus match that to // addresses of the venue on the site, and we can use those addresses // as default venue addresses when no venues are listed on a page // on that site. char m_rootTitleBuf[ROOT_TITLE_BUF_MAX]; int32_t m_rootTitleBufSize; // . this is filtered // . 
certain punct is replaced with \0 char m_filteredRootTitleBuf[ROOT_TITLE_BUF_MAX]; int32_t m_filteredRootTitleBufSize; // like m_rootTitleBuf but for the current page char m_titleBuf[ROOT_TITLE_BUF_MAX]; int32_t m_titleBufSize; bool m_setTr ; //bool m_checkedRobots ; bool m_triedTagRec ; bool m_didGatewayPage ; bool m_didQuickDupCheck ; void (* m_masterLoop) ( void *state ); void * m_masterState; void (* m_callback1) ( void *state ); bool (* m_callback2) ( void *state ); void *m_state; //void (* m_injectionCallback) ( void *state ); //void *m_injectionState; // flags for spider //bool m_isAddUrl; //bool m_forceDelete; bool m_didDelete; bool m_skipIframeExpansion; // this is non-zero if we decided not to index the doc int32_t m_indexCode; // the spider priority int32_t m_priority; // the download error, like ETIMEDOUT, ENOROUTE, etc. int32_t m_downloadStatus; // . when the download was completed. will be zero if no download done // . used to set SpiderReply::m_downloadEndTime because we need // high resolution for that so we can dole out the next spiderrequest // from that IP quickly if the sameipwait is like 500ms. 
int64_t m_downloadEndTime; //char *m_metaListEnd; int32_t m_metaListAllocSize; char *m_p; char *m_pend; int32_t m_maxCacheAge; // a list of 32-bit ints followed by a zero 32-bit int to terminate int64_t m_adIds [ XD_MAX_AD_IDS ]; //char *m_adVector;// [XMLDOC_MAX_AD_IDS]; //int32_t m_adVectorSize; char *m_wikiqbuf; int32_t m_wikiqbufSize; int64_t m_wikiDocIds [ MAX_WIKI_DOCIDS ]; rscore_t m_wikiScores [ MAX_WIKI_DOCIDS ]; bool m_registeredSleepCallback; bool m_addedNegativeDoledbRec; bool m_hashedTitle; bool m_hashedMetas; int32_t m_niceness; bool m_usePosdb ; //bool m_useDatedb ; bool m_useClusterdb ; bool m_useLinkdb ; bool m_useSpiderdb ; bool m_useTitledb ; bool m_useTagdb ; bool m_usePlacedb ; //bool m_useTimedb ; bool m_useSectiondb ; //bool m_useRevdb ; bool m_useSecondaryRdbs ; int32_t m_linkeeQualityBoost; SafeBuf *m_pbuf; // used by SpiderLoop to set m_pbuf to SafeBuf m_sbuf; // store termlist into here if non-null bool m_storeTermListInfo; char m_sortTermListBy; SafeBuf m_sectiondbData; //char *m_sectiondbData; char *m_placedbData; //int32_t m_sectiondbDataSize; int32_t m_placedbDataSize; // we now have HashInfo to replace this //bool m_inHashNoSplit; // store the terms that we hash into this table so that PageParser.cpp // can print what was hashed and with what score and what description class HashTableX *m_wts; HashTableX m_wtsTable; SafeBuf m_wbuf; // used by addContactInfo() to keep track of what urls we have // processed for contact info to avoid re-processing them in the // recursive loop thing that we do //HashTableX m_pt; // Msg25.cpp stores its pageparser.cpp output into this one SafeBuf m_pageLinkBuf; SafeBuf m_siteLinkBuf; SafeBuf m_serpBuf; // which set() function was called above to set us? 
bool m_setFromTitleRec; bool m_setFromSpiderRec; bool m_setFromUrl; bool m_setFromDocId; bool m_freeLinkInfo1; bool m_freeLinkInfo2; bool m_contentInjected; bool m_recycleContent; //bool m_loadFromOldTitleRec; char *m_rawUtf8Content; int32_t m_rawUtf8ContentSize; int32_t m_rawUtf8ContentAllocSize; // we overallocate sometimes char *m_expandedUtf8Content; int32_t m_expandedUtf8ContentSize; char *m_savedp; char *m_oldp; bool m_didExpansion; SafeBuf m_esbuf; SafeBuf m_xbuf; //bool m_useIpsTxtFile ; //bool m_readFromTestCache ; // used by msg13 class Msg13Request *m_r; // Msg20 uses this to stash its TcpSlot void *m_slot; char *getTestDir(); bool m_freed; bool m_msg4Waiting; bool m_msg4Launched; // word spam detection char *getWordSpamVec ( ); bool setSpam ( int32_t *profile, int32_t plen , int32_t numWords , unsigned char *spam ); int32_t getProbSpam ( int32_t *profile, int32_t plen , int32_t step ); bool m_isRepeatSpammer; int32_t m_numRepeatSpam; bool m_totallySpammed; // frag vector (repeated fragments). 0 means repeated, 1 means not. // vector is 1-1 with words in the document body. char *getFragVec ( ); bool injectDoc ( char *url , class CollectionRec *cr , char *content , char *diffbotReply, // usually null bool contentHasMime , int32_t hopCount, int32_t charset, bool deleteUrl, //char contentType, // CT_HTML, CT_XML char *contentTypeStr, // text/html, text/xml etc. 
bool spiderLinks , char newOnly, // index iff new void *state, void (*callback)(void *state) , uint32_t firstIndexedTime = 0, uint32_t lastSpideredDate = 0 , int32_t injectDocIp = 0 , // for container docs consisting of subdocs to inject char *contentDelim = NULL, char* metadata = NULL, uint32_t metadataLen = 0, int32_t payloadLen = -1); bool injectLinks ( HashTableX *linkDedupTable , HashTableX *domDedupTable , void *finalState , void (* finalCallback)(void *)); bool injectAhrefsLinks(); bool doInjectLoop ( ); void doneInjecting ( class XmlDoc *xd ); int32_t m_i; int32_t m_blocked; HashTableX m_domDedupTable; HashTableX *m_linkDedupTablePtr; HashTableX *m_domDedupTablePtr; bool m_dedupLinkDomains; void *m_finalState; void (* m_finalCallback) ( void *state ); char m_used[MAX_XML_DOCS]; class XmlDoc *m_xmlDocs[MAX_XML_DOCS]; int64_t m_cacheStartTime; }; // . PageParser.cpp uses this class for printing hashed terms out by calling // XmlDoc::print() // . we store TermInfos into XmlDoc::m_wtsTable, a HashTableX // . one for each term hashed // . the key is the termId. dups are allowed // . 
the term itself is stored into a separate buffer, m_wbuf, a SafeBuf, so // that TermInfo::m_term will reference that and it won't disappear on us class TermDebugInfo { public: int32_t m_termOff; int32_t m_termLen; //uint32_t m_score32; int32_t m_descOff; // the description offset int32_t m_prefixOff; // the prefix offset, like "site" or "gbadid" int64_t m_termId; int32_t m_date; bool m_shardByTermId; //float m_weight; char m_langId; char m_diversityRank; char m_densityRank; char m_wordSpamRank; char m_hashGroup; int32_t m_wordNum; int32_t m_wordPos; POSDBKEY m_key; // key144_t //bool m_isSynonym; // 0 = not a syn, 1 = syn from presets,2=wikt,3=generated char m_synSrc; int64_t m_langBitVec64; // used for gbsectionhash:xxxx terms to hack in the inner content // hash, aka sentHash32 for doing xpath histograms on a site //int32_t m_sentHash32; //int32_t m_facetVal32; // this is copied from Weights::m_rvw or m_rvp //float m_rv[MAX_RULES]; }; // a ptr to HashInfo is passed to hashString() and hashWords() class HashInfo { public: HashInfo() { m_tt = NULL; m_prefix = NULL; m_desc = NULL; m_date = 0; // should we do sharding based on termid and not the usual docid??? // in general this is false, but for checksum we want to shard // by the checksum and not docid to avoid having to do a // gbchecksum:xxxxx search on ALL shards. much more efficient. 
m_shardByTermId = false; //m_useWeights = false; m_useSynonyms = false; m_hashGroup = -1; m_useCountTable = true; m_useSections = true; m_startDist = 0; // m_facetVal32 = 0; // used for sectiondb stuff, but stored in posdb //m_sentHash32 = 0; }; class HashTableX *m_tt; char *m_prefix; // "m_desc" should detail the algorithm char *m_desc; int32_t m_date; char m_shardByTermId; char m_linkerSiteRank; //char m_useWeights; char m_useSynonyms; char m_hashGroup; int32_t m_startDist; //int32_t m_facetVal32; bool m_useCountTable; bool m_useSections; }; // g_tt is used for debugging //extern class TermTable *g_tt; extern uint8_t score32to8 ( uint32_t score ) ; extern pid_t g_pid ; extern int32_t g_ticker ; extern int32_t g_filterTimeout ; // as recommended in the "man system" page we use our own int my_system_r ( char *cmd , int32_t timeout ) ; // . returns 0 to 100 , the probability of spam for this subprofile // . a "profile" is an array of all the positions of a word in the document // . a "position" is just the word #, like first word, word #8, etc... // . we are passed a subprofile, "profile", of the actual profile // because some of the document may be more "spammy" than other parts // . inlined to speed things up because this may be called multiple times // for each word in the document // . if "step" is 1 we look at every word position in the profile // . if "step" is 2 we look at every other word position // . if "step" is 3 we look at every 3rd word position, etc... inline int32_t XmlDoc::getProbSpam(int32_t *profile, int32_t plen, int32_t step) { // you can spam 2 or 1 letter words all you want to if ( plen <= 2 ) return 0; // if our step is bigger than the profile return 0 if ( step == plen ) return 0; register int32_t avgSpacing, stdDevSpacing; int32_t d,dev=0; register int32_t i; for (int32_t j = 0; j < step; j++) { // find avg. of gaps between consecutive tokens in subprofile // TODO: isn't profile[i] < profile[i+1]?? 
int32_t istop = plen-1; avgSpacing = 0; for (i=0; i < istop; i += step ) avgSpacing += ( profile[i] - profile[i+1] ); // there's 1 less spacing than positions in the profile // so we divide by plen-1 avgSpacing = (avgSpacing * 256) / istop; // compute standard deviation of the gaps in this sequence stdDevSpacing = 0; for (i = 0 ; i < istop; i += step ) { d = (( profile[i] - profile[i+1]) * 256 ) - avgSpacing; if ( d < 0 ) stdDevSpacing -= d; else stdDevSpacing += d; } // TODO: should we divide by istop-1 for stdDev?? stdDevSpacing /= istop; // average of the stddevs for all sequences dev += stdDevSpacing; } dev /= step; // if the plen is big we should expect dev to be big // here's some interpolation points: // plen >= 2 and dev<= 0.2 --> 100% // plen = 7 and dev = 1.0 --> 100% // plen = 14 and dev = 2.0 --> 100% // plen = 21 and dev = 3.0 --> 100% // plen = 7 and dev = 2.0 --> 50% // NOTE: dev has been multiplied by 256 to avoid using floats if ( dev <= 51.2 ) return 100; // (.2 * 256) int32_t prob = ( (256*100/7) * plen ) / dev; if (prob>100) prob=100; return prob; //if (prob>=0) { // int32_t i; //printf("dev=%i,plen=%i,nseq=%i,prob=%i----\n",dev,plen,step,prob); // for (i=0;i<plen;i++) // printf("%i#",profile[i]); // printf("\n"); //} } #endif