#ifndef MSG25_H_
#define MSG25_H_

#include "types.h"
#include "SafeBuf.h"
#include "Msg20.h" // for getting this url's LinkInfo from another cluster
#include "HashTableX.h"
#include "Msg22.h"
#include "Msg5.h"
#include "max_coll_len.h"
#include "max_url_len.h"

class LinkInfo;
class Inlink;
class Multicast;
class UdpSlot;


void handleRequest25(UdpSlot *slot, int32_t netnice);


// . get the inlinkers to this SITE (any page on this site)
// . use that to compute a site quality
// . also get the inlinkers sorted by date and see how many good inlinkers
//   we have had in the last X days (each inlinker needs a pub/birth date)
class Msg25Request {
public:
    // either MODE_PAGELINKINFO or MODE_SITELINKINFO
    char m_mode;
    int32_t m_ip;
    int64_t m_docId;
    collnum_t m_collnum;
    bool m_isInjecting;
    bool m_printInXml;

    // when we get a reply we call this
    void *m_state;
    void (*m_callback)(void *state);

    int32_t m_siteNumInlinks;
    LinkInfo *m_oldLinkInfo;
    int32_t m_niceness;
    bool m_doLinkSpamCheck;
    bool m_oneVotePerIpDom;
    int32_t m_lastUpdateTime;
    bool m_onlyNeedGoodInlinks;
    int32_t m_ourHostHash32;
    int32_t m_ourDomHash32;

    // new stuff
    int32_t m_siteHash32;
    int64_t m_siteHash64;
    int64_t m_linkHash64;
    // for the linked list of these guys in g_lineTable in Linkdb.cpp,
    // but only used on the server end, not the client end
    Msg25Request *m_next;
    // the multicast we use
    Multicast *m_mcast;
    UdpSlot *m_udpSlot;
    bool m_printDebugMsgs;
    // store final LinkInfo reply in here
    SafeBuf *m_linkInfoBuf;

    char *ptr_site;
    char *ptr_url;
    const char *ptr_oldLinkInfo;

    int32_t size_site;
    int32_t size_url;
    int32_t size_oldLinkInfo;

    // variable data begins here

    int32_t getStoredSize();
    void serialize();
    void deserialize();
};
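// A minimal client-side sketch (hypothetical, not a verbatim trace of
// Linkdb.cpp; it assumes serialize() flattens the ptr_* members into the
// variable-length tail noted above so the request can travel over the wire
// as one flat buffer, and that handleRequest25() calls deserialize() on the
// receiving host to restore the pointers):
//
//   Msg25Request req;
//   req.m_mode    = (char)Msg25::MODE_SITELINKINFO;
//   req.m_collnum = collnum;                   // from the caller
//   req.m_docId   = docId;
//   req.ptr_site  = site;                      // e.g. "example.com"
//   req.size_site = strlen(site) + 1;          // include the NUL
//   req.ptr_url   = url;
//   req.size_url  = strlen(url) + 1;
//   int32_t need = req.getStoredSize();        // fixed fields + tail
//   req.serialize();                           // make it wire-ready
//   // ... send "need" bytes starting at (char *)&req to the host that
//   // handles this site's linkdb shard; its handleRequest25() replies
//   // with the final LinkInfo, which lands in m_linkInfoBuf.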

// . get ALL the linkText classes for a url and merge 'em into a LinkInfo class
// . also gets the link-adjusted quality of our site's url (root url)
// . first gets all docIds of docs that link to that url via a link: search
// . gets the LinkText, customized for our url, from each docId in that list
// . merge them into a final LinkInfo class for your url
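//
// a rough sketch of that pipeline in terms of the members declared below
// (an assumption pieced together from this header's comments, not a
// verbatim trace of Linkdb.cpp):
//
//   1. m_msg5 reads the linkdb list (m_list) of docids linking to the url
//   2. up to MAX_MSG20_OUTSTANDING Msg20s (m_msg20s) fetch each linker's
//      link text via Msg20::getSummary(), replies landing in m_replyPtrs
//   3. dups and spam are filtered via m_docIdTable, m_ipTable, isDup(),
//      getLoser() and the link-spam counters
//   4. the surviving inlinks are merged into the final LinkInfo stored
//      in m_linkInfoBuf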

#define MAX_LINKERS 3000

// if a linker is a "title rec not found" or log spam, then we get another
// linker's titleRec. churn through up to this many titleRecs in an attempt
// to get MAX_LINKERS good titleRecs before giving up.
//#define MAX_DOCIDS_TO_SAMPLE 25000
// on news.google.com, 22393 of the 25000 are link spam, and we only end
// up getting 508 good inlinks, so raise from 25000 to 50000
//#define MAX_DOCIDS_TO_SAMPLE 50000
// try a ton of lookups so we can ditch xfactor and keep the posdb key as
// simple as possible. just make sure we recycle link info a lot!
#define MAX_DOCIDS_TO_SAMPLE 1000000

// go down from 300 to 100 so XmlDoc::getRecommendLinksBuf() can launch
// like 5 msg25s and have no fear of having >500 msg20 requests outstanding,
// which clogs things up.
// crap, no, on gk144 we got 128 hosts now, so put back to 300...
// if we have fewer hosts then limit this proportionately in Linkdb.cpp
#define MAX_MSG20_OUTSTANDING 300

#define MAX_NOTE_BUF_LEN 20000

#define MSG25_MAX_REQUEST_SIZE (MAX_URL_LEN+MAX_COLL_LEN+64)

class Msg25 {

public:

    // . returns false if blocked, true otherwise
    // . sets errno on error
    // . this sets Msg25::m_siteRootQuality and Msg25::m_linkInfo
    // . "url/coll" should NOT be on the stack in case we block
    // . if "reallyGetLinkInfo" is false we don't actually try to fetch
    //   any link text and return true right away, which really saves a
    //   bunch of disk seeks when spidering small collections that don't
    //   need link text/info indexing/analysis
    bool getLinkInfo2(const char *site,
                      const char *url,
                      bool isSiteLinkInfo,
                      int32_t ip,
                      int64_t docId,
                      collnum_t collnum,
                      char *qbuf,
                      int32_t qbufSize,
                      void *state,
                      void (*callback)(void *state),
                      bool isInjecting,
                      bool printDebugMsgs, // into "Msg25::m_pbuf"
                      bool printInXml,
                      int32_t siteNumInlinks,
                      LinkInfo *oldLinkInfo,
                      int32_t niceness,
                      bool doLinkSpamCheck,
                      bool oneVotePerIpDom,
                      int32_t lastUpdateTime,
                      bool onlyNeedGoodInlinks,
                      // if an inlinking document has an outlink with one
                      // of these hashes then we set
                      // Msg20Reply::m_hadLinkToOurDomOrHost.
                      // it is used to remove an inlinker to a related
                      // docid which also links to our main seo url being
                      // processed, so we do not recommend such links
                      // since they already link to a page on your domain
                      // or hostname. set BOTH to zero to not perform this
                      // algo in handleRequest20()'s call to
                      // XmlDoc::getMsg20Reply().
                      int32_t ourHostHash32,
                      int32_t ourDomHash32,
                      SafeBuf *myLinkInfoBuf);
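    // A hypothetical call site (a sketch only; "ip", "docId", "collnum",
    // "siteNumInlinks", "niceness" and "state" come from the caller).
    // getLinkInfo2() returns false if it blocked, in which case
    // "callback" is invoked with "state" once the LinkInfo is ready in
    // the caller-provided SafeBuf:
    //
    //   static void gotSiteLinkInfo(void *state) { /* use linkInfoBuf */ }
    //   ...
    //   SafeBuf linkInfoBuf;
    //   Msg25 *m25 = new Msg25();
    //   if (m25->getLinkInfo2("example.com",         // site
    //                         "http://example.com/", // url
    //                         true,                  // isSiteLinkInfo
    //                         ip, docId, collnum,
    //                         NULL, 0,               // qbuf, qbufSize
    //                         state, gotSiteLinkInfo,
    //                         false,                 // isInjecting
    //                         false,                 // printDebugMsgs
    //                         false,                 // printInXml
    //                         siteNumInlinks,
    //                         NULL,                  // oldLinkInfo
    //                         niceness,
    //                         true,                  // doLinkSpamCheck
    //                         true,                  // oneVotePerIpDom
    //                         0,                     // lastUpdateTime
    //                         false,                 // onlyNeedGoodInlinks
    //                         0, 0, // ourHostHash32/ourDomHash32: algo off
    //                         &linkInfoBuf))
    //       ; // did not block: reply (or error) is already available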

    Msg25();
    ~Msg25();
    void reset();

    // a new parm referencing the request we got over the network
    Msg25Request *m_req25;

    Msg20Reply *getLoser(Msg20Reply *r, Msg20Reply *p);
    const char *isDup(Msg20Reply *r, Msg20Reply *p);

    bool addNote(const char *note, int32_t noteLen, int64_t docId);

    // m_linkInfo ptr references into here. provided by caller.
    SafeBuf *m_linkInfoBuf;

    SafeBuf m_realBuf;

    // private:
    // these need to be public for wrappers to call:
    bool gotList();
    bool sendRequests();
    bool gotLinkText(class Msg20Request *req);
    bool doReadLoop();

    // input vars
    const char *m_url;
    const char *m_site;

    int32_t m_ourHostHash32;
    int32_t m_ourDomHash32;

    int32_t m_round;
    uint64_t m_linkHash64;
    key224_t m_nextKey;

    bool m_onlyNeedGoodInlinks;
    int64_t m_docId;
    collnum_t m_collnum;
    void *m_state;
    void (*m_callback)(void *state);

    int32_t m_siteNumInlinks;
    enum mode_t {
        MODE_UNSET        = 0,
        MODE_PAGELINKINFO = 1,
        MODE_SITELINKINFO = 2
    } m_mode;
    bool m_printInXml;


    // url info
    int32_t m_ip;
    int32_t m_top;
    int32_t m_midDomHash;

    bool m_gettingList;

    // . we now use Msg2 since it has "restrictIndexdb" support to limit
    //   indexdb searches to just the root file to decrease disk seeks
    Msg5 m_msg5;
    RdbList m_list;

    Inlink *m_k;

    int32_t m_maxNumLinkers;

    // should we free the m_replyPtrs on destruction? default=true
    bool m_ownReplies;

    // Now we just save the replies we get back from Msg20::getSummary().
    // We point to them with a LinkTextReply, which is just a pointer
    // and some access functions.
    Msg20Reply *m_replyPtrs[MAX_LINKERS];
    int32_t m_replySizes[MAX_LINKERS];
    int32_t m_numReplyPtrs;

    Msg20 m_msg20s[MAX_MSG20_OUTSTANDING];
    Msg20Request m_msg20Requests[MAX_MSG20_OUTSTANDING];
    char m_inUse[MAX_MSG20_OUTSTANDING];
    // for "fake" replies
    Msg20Reply m_msg20Replies[MAX_MSG20_OUTSTANDING];

    int32_t m_numDocIds;
    int32_t m_cblocks;
    int32_t m_uniqueIps;

    int32_t m_minRecSizes;

    // how many msg20s have we sent/recvd?
    int32_t m_numRequests;
    int32_t m_numReplies;

    int32_t m_linkSpamOut;

    // have we had an error for any transaction?
    int32_t m_errno;

    SafeBuf m_tmp;
    SafeBuf *m_pbuf; // will point to m_tmp if m_printDebugMsgs

    // copied from CollectionRec
    bool m_oneVotePerIpDom;
    bool m_doLinkSpamCheck;
    bool m_isInjecting;
    int32_t m_lastUpdateTime;

    Multicast m_mcast;

    int32_t m_errors;
    int32_t m_noText;

    bool m_spideringEnabled;

    int32_t m_dupCount;
    int32_t m_spamLinks;
    int32_t m_niceness;
    int32_t m_numFromSameIp;
    int32_t m_sameMidDomain;

    // stats for allowing some link-spam inlinks to vote
    int32_t m_spamCount;
    int32_t m_maxSpam;

    // this is used for the linkdb list
    HashTableX m_ipTable;
    HashTableX m_fullIpTable;
    HashTableX m_firstIpTable;

    // this is for deduping docids because we now combine the linkdb
    // list of docids with the old inlinks in the old link info
    //HashTableT <int64_t, char> m_docIdTable;
    HashTableX m_docIdTable;

    // special counts
    int32_t m_ipDupsLinkdb;
    int32_t m_docIdDupsLinkdb;
    int32_t m_linkSpamLinkdb;
    int32_t m_ipDups;

    LinkInfo *m_oldLinkInfo;

    char m_buf[MAX_NOTE_BUF_LEN];
    char *m_bufPtr;
    char *m_bufEnd;
    HashTableX m_table;

    char m_request[MSG25_MAX_REQUEST_SIZE];
    int32_t m_requestSize;

    // for setting <absScore2> or determining if a search result's
    // inlinkers also have the query terms. buzz.
    char *m_qbuf;
    int32_t m_qbufSize;
};


#endif