privacore-open-source-searc.../Msg25.h

#ifndef MSG25_H_
#define MSG25_H_
#include "types.h"
#include "SafeBuf.h"
#include "Msg20.h" // for getting this url's LinkInfo from another cluster
#include "HashTableX.h"
#include "Msg22.h"
#include "Msg5.h"
#include "max_coll_len.h"
#include "max_url_len.h"
class LinkInfo;
class Inlink;
class Multicast;
class UdpSlot;
void handleRequest25(UdpSlot *slot, int32_t netnice);
// . get the inlinkers to this SITE (any page on this site)
// . use that to compute a site quality
// . also get the inlinkers sorted by date and see how many good inlinkers
// we had since X days ago. (each inlinker needs a pub/birth date)
class Msg25Request {
public:
    // either MODE_PAGELINKINFO or MODE_SITELINKINFO
    char m_mode;
    int32_t m_ip;
    int64_t m_docId;
    collnum_t m_collnum;
    bool m_isInjecting;
    bool m_printInXml;

    // when we get a reply we call this
    void *m_state;
    void (*m_callback)(void *state);

    int32_t m_siteNumInlinks;
    LinkInfo *m_oldLinkInfo;
    int32_t m_niceness;
    bool m_doLinkSpamCheck;
    bool m_oneVotePerIpDom;
    int32_t m_lastUpdateTime;
    bool m_onlyNeedGoodInlinks;
    int32_t m_ourHostHash32;
    int32_t m_ourDomHash32;

    // new stuff
    int32_t m_siteHash32;
    int64_t m_siteHash64;
    int64_t m_linkHash64;

    // for the linked list of these guys in g_lineTable in Linkdb.cpp,
    // but only used on the server end, not the client end
    Msg25Request *m_next;

    // the multicast we use
    Multicast *m_mcast;
    UdpSlot *m_udpSlot;
    bool m_printDebugMsgs;

    // store the final LinkInfo reply in here
    SafeBuf *m_linkInfoBuf;

    char *ptr_site;
    char *ptr_url;
    const char *ptr_oldLinkInfo;

    int32_t size_site;
    int32_t size_url;
    int32_t size_oldLinkInfo;

    // variable data begins here

    int32_t getStoredSize();
    void serialize();
    void deserialize();
};
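
// A minimal sketch of the wire layout implied by the ptr_*/size_* members
// above, assuming the usual Gigablast request pattern: the fixed-size part
// of the struct travels verbatim and the three variable-length buffers are
// appended right after it, with serialize()/deserialize() translating the
// ptr_* members between offsets and real pointers. The actual bodies live
// in the .cpp; treat this as illustrative only:
//
//   int32_t Msg25Request::getStoredSize() {
//       return (int32_t)sizeof(Msg25Request)
//            + size_site + size_url + size_oldLinkInfo;
//   }
//
//   void Msg25Request::deserialize() {
//       char *p = (char *)this + sizeof(Msg25Request);
//       ptr_site        = p;  p += size_site;
//       ptr_url         = p;  p += size_url;
//       ptr_oldLinkInfo = p;  // final size_oldLinkInfo bytes
//   }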
// . get ALL the linkText classes for a url and merge 'em into a LinkInfo class
// . also gets the link-adjusted quality of our site's url (root url)
// . first gets all docIds of docs that link to that url via a link: search
// . gets the LinkText, customized for our url, from each docId in that list
// . merge them into a final LinkInfo class for your url

#define MAX_LINKERS 3000

// if a linker is a "title rec not found" or log spam, then we get another
// linker's titleRec. churn through up to this many titleRecs in an attempt
// to get MAX_LINKERS good titleRecs before giving up.
//#define MAX_DOCIDS_TO_SAMPLE 25000
// on news.google.com, 22393 of the 25000 are link spam, and we only end
// up getting 508 good inlinks, so raise from 25000 to 50000
//#define MAX_DOCIDS_TO_SAMPLE 50000
// try a ton of lookups so we can ditch xfactor and keep the posdb key as
// simple as possible. just make sure we recycle link info a lot!
#define MAX_DOCIDS_TO_SAMPLE 1000000

// went down from 300 to 100 so XmlDoc::getRecommendLinksBuf() could launch
// like 5 msg25s with no fear of having >500 msg20 requests outstanding,
// which clogs things up.
// crap, no, on gk144 we have 128 hosts now, so put it back to 300...
// if we have fewer hosts then limit this proportionately in Linkdb.cpp
#define MAX_MSG20_OUTSTANDING 300

#define MAX_NOTE_BUF_LEN 20000

#define MSG25_MAX_REQUEST_SIZE (MAX_URL_LEN+MAX_COLL_LEN+64)
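
// A hedged sketch of the proportional limiting mentioned above. The real
// logic lives in Linkdb.cpp; the 128-host baseline and the use of
// g_hostdb.getNumHosts() here are assumptions for illustration:
//
//   int32_t maxOut   = MAX_MSG20_OUTSTANDING;
//   int32_t numHosts = g_hostdb.getNumHosts();
//   if ( numHosts < 128 )
//       maxOut = (MAX_MSG20_OUTSTANDING * numHosts) / 128;
//   if ( maxOut < 1 ) maxOut = 1;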
class Msg25 {
public:
    // . returns false if blocked, true otherwise
    // . sets errno on error
    // . this sets Msg25::m_siteRootQuality and Msg25::m_linkInfo
    // . "url/coll" should NOT be on the stack in case we block
    // . if "reallyGetLinkInfo" is false we don't actually try to fetch
    //   any link text and return true right away, which really saves a
    //   bunch of disk seeks when spidering small collections that don't
    //   need link text/info indexing/analysis
    bool getLinkInfo2(const char *site,
                      const char *url,
                      bool isSiteLinkInfo,
                      int32_t ip,
                      int64_t docId,
                      collnum_t collnum,
                      char *qbuf,
                      int32_t qbufSize,
                      void *state,
                      void (*callback)(void *state),
                      bool isInjecting,
                      bool printDebugMsgs, // into "Msg25::m_pbuf"
                      bool printInXml,
                      int32_t siteNumInlinks,
                      LinkInfo *oldLinkInfo,
                      int32_t niceness,
                      bool doLinkSpamCheck,
                      bool oneVotePerIpDom,
                      int32_t lastUpdateTime,
                      bool onlyNeedGoodInlinks,
                      // if an inlinking document has an outlink with
                      // one of these hashes then we set
                      // Msg20Reply::m_hadLinkToOurDomOrHost.
                      // it is used to remove an inlinker to a related
                      // docid which also links to our main seo url
                      // being processed, so we do not recommend
                      // such links since they already link to a page
                      // on your domain or hostname. set BOTH to zero
                      // to skip this algo in handleRequest20()'s
                      // call to XmlDoc::getMsg20Reply().
                      int32_t ourHostHash32,
                      int32_t ourDomHash32,
                      SafeBuf *myLinkInfoBuf);
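
    // A hedged usage sketch of the callback contract above (the caller,
    // its wrapper and most of the argument values are hypothetical):
    //
    //   static void gotLinkInfoWrapper(void *state) { /* resume caller */ }
    //   ...
    //   if ( ! msg25->getLinkInfo2(site, url,
    //                              true,    // isSiteLinkInfo
    //                              ip, docId, collnum,
    //                              NULL, 0, // no query buf
    //                              state, gotLinkInfoWrapper,
    //                              false,   // isInjecting
    //                              false,   // printDebugMsgs
    //                              false,   // printInXml
    //                              siteNumInlinks, oldLinkInfo,
    //                              1,       // niceness: background
    //                              true,    // doLinkSpamCheck
    //                              true,    // oneVotePerIpDom
    //                              0,       // lastUpdateTime
    //                              false,   // onlyNeedGoodInlinks
    //                              0, 0,    // skip host/dom-hash algo
    //                              &linkInfoBuf) )
    //       return false; // blocked; gotLinkInfoWrapper fires when done
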
    Msg25();
    ~Msg25();
    void reset();

    // a new parm referencing the request we got over the network
    Msg25Request *m_req25;

    Msg20Reply *getLoser(Msg20Reply *r, Msg20Reply *p);
    const char *isDup(Msg20Reply *r, Msg20Reply *p);
    bool addNote(const char *note, int32_t noteLen, int64_t docId);

    // the m_linkInfo ptr references into here. provided by the caller.
    SafeBuf *m_linkInfoBuf;

    SafeBuf m_realBuf;

// private:
    // these need to be public for wrappers to call:
    bool gotList();
    bool sendRequests();
    bool gotLinkText(class Msg20Request *req);
    bool doReadLoop();

    // input vars
    const char *m_url;
    const char *m_site;
    int32_t m_ourHostHash32;
    int32_t m_ourDomHash32;

    int32_t m_round;
    uint64_t m_linkHash64;
    key224_t m_nextKey;

    bool m_onlyNeedGoodInlinks;
    int64_t m_docId;
    collnum_t m_collnum;

    void *m_state;
    void (*m_callback)(void *state);

    int32_t m_siteNumInlinks;

    enum mode_t {
        MODE_UNSET = 0,
        MODE_PAGELINKINFO = 1,
        MODE_SITELINKINFO = 2
    } m_mode;

    bool m_printInXml;

    // url info
    int32_t m_ip;
    int32_t m_top;
    int32_t m_midDomHash;

    bool m_gettingList;

    // . we now use Msg2 since it has "restrictIndexdb" support to limit
    //   indexdb searches to just the root file to decrease disk seeks
    Msg5 m_msg5;
    RdbList m_list;

    Inlink *m_k;

    int32_t m_maxNumLinkers;

    // should we free the m_replyPtrs on destruction? default=true
    bool m_ownReplies;

    // now we just save the replies we get back from Msg20::getSummary().
    // we point to them with a LinkTextReply, which is just a pointer
    // and some access functions.
    Msg20Reply *m_replyPtrs[MAX_LINKERS];
    int32_t m_replySizes[MAX_LINKERS];
    int32_t m_numReplyPtrs;

    Msg20 m_msg20s[MAX_MSG20_OUTSTANDING];
    Msg20Request m_msg20Requests[MAX_MSG20_OUTSTANDING];
    char m_inUse[MAX_MSG20_OUTSTANDING];
    // for "fake" replies
    Msg20Reply m_msg20Replies[MAX_MSG20_OUTSTANDING];

    int32_t m_numDocIds;
    int32_t m_cblocks;
    int32_t m_uniqueIps;

    int32_t m_minRecSizes;

    // how many msg20s have we sent/received?
    int32_t m_numRequests;
    int32_t m_numReplies;

    int32_t m_linkSpamOut;

    // have we had an error for any transaction?
    int32_t m_errno;

    SafeBuf m_tmp;
    SafeBuf *m_pbuf; // will point to m_tmp if m_printDebugMsgs is true

    // copied from CollectionRec
    bool m_oneVotePerIpDom;
    bool m_doLinkSpamCheck;
    bool m_isInjecting;
    int32_t m_lastUpdateTime;

    Multicast m_mcast;

    int32_t m_errors;
    int32_t m_noText;
    bool m_spideringEnabled;
    int32_t m_dupCount;
    int32_t m_spamLinks;
    int32_t m_niceness;
    int32_t m_numFromSameIp;
    int32_t m_sameMidDomain;

    // stats for allowing some link-spam inlinks to vote
    int32_t m_spamCount;
    int32_t m_maxSpam;

    // this is used for the linkdb list
    HashTableX m_ipTable;
    HashTableX m_fullIpTable;
    HashTableX m_firstIpTable;

    // this is for deduping docids because we now combine the linkdb
    // list of docids with the old inlinks in the old link info
    //HashTableT <int64_t, char> m_docIdTable;
    HashTableX m_docIdTable;

    // special counts
    int32_t m_ipDupsLinkdb;
    int32_t m_docIdDupsLinkdb;
    int32_t m_linkSpamLinkdb;
    int32_t m_ipDups;
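
    // A small sketch of how the tables and counters above might interact
    // while scanning the linkdb list (the real loop is in Linkdb.cpp; the
    // HashTableX calls follow its usual addKey()/isInTable() interface,
    // but treat the exact flow as an assumption):
    //
    //   if ( m_docIdTable.isInTable ( &docId ) ) {
    //       m_docIdDupsLinkdb++;  // already voted via the old link info
    //       continue;
    //   }
    //   m_docIdTable.addKey ( &docId );
    //   if ( m_oneVotePerIpDom && m_ipTable.isInTable ( &ip ) ) {
    //       m_ipDupsLinkdb++;     // one vote per IP domain
    //       continue;
    //   }
    //   m_ipTable.addKey ( &ip );
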
    LinkInfo *m_oldLinkInfo;

    char m_buf[MAX_NOTE_BUF_LEN];
    char *m_bufPtr;
    char *m_bufEnd;

    HashTableX m_table;

    char m_request[MSG25_MAX_REQUEST_SIZE];
    int32_t m_requestSize;

    // for setting <absScore2> or determining if a search result's
    // inlinkers also have the query terms. buzz.
    char *m_qbuf;
    int32_t m_qbufSize;
};
#endif