privacore-open-source-searc.../Msg20.h

326 lines
10 KiB
C++

// Matt Wells, copyright Nov 2007
// get various information from a query and a docId, like summary, title, etc.
#ifndef GB_MSG20_H
#define GB_MSG20_H
#include "Multicast.h"
#include "collnum_t.h"
#include "WordVariationsConfig.h"
// Request side of a Msg20 lookup: names a docId and describes which pieces
// of information (summary, title, link text, ...) should be produced for it.
// The fixed-size members come first; the ptr_*/size_* pairs at the end
// describe the variable-length section that follows the object.
// NOTE: member order is part of the layout; do not reorder.
class Msg20Request {
public:
	// a new request always starts out with the defaults from reset()
	Msg20Request() {
		reset();
	}

	// Wipe every byte of the object, then apply the handful of defaults
	// that are not zero.
	void reset() {
		memset(this, 0, sizeof(Msg20Request));
		m_docId           = -1LL; // -1 marks the docid as "invalid"
		m_numSummaryLines = 1;
		m_titleMaxLen     = 80;
		m_summaryMaxLen   = 180;
	}

	// serialization helpers (defined out of line)
	int32_t getStoredSize() const;
	char   *serialize(int32_t *sizePtr) const;
	int32_t deserialize();

	int64_t makeCacheKey() const;

	//
	// fixed-size section
	//
	char    m_numSummaryLines;            // default is 1, see reset()
	bool    m_getHeaderTag;
	void   *m_state;
	void   *m_state2;                     // used by Msg25.cpp
	int32_t m_j;                          // used by Msg25.cpp
	bool  (*m_callback)(void *state);
	int64_t m_docId;                      // default is -1, see reset()
	int32_t m_niceness;
	int32_t m_titleMaxLen;                // default is 80, see reset()
	int32_t m_summaryMaxLen;              // default is 180, see reset()
	int32_t m_summaryMaxNumCharsPerLine;
	int64_t m_maxCacheAge;
	int32_t m_discoveryDate;

	// Lets us drop an inlinker to a related docid when that inlinker
	// also links to the main url we are processing seo for.
	// Set both to 0 to disable the check.
	int32_t m_ourHostHash32;
	int32_t m_ourDomHash32;

	// language of the query held in ptr_qbuf
	uint8_t m_langId;
	uint8_t m_prefferedResultLangId;

	// the collection is identified by number, not by a ptr_coll string
	collnum_t m_collnum;

	WordVariationsConfig m_word_variations_config;

	//
	// one-bit option flags
	//
	unsigned char m_useQueryStopWords          :1;
	unsigned char m_highlightQueryTerms        :1;
	unsigned char m_getSummaryVector           :1;
	unsigned char m_showBanned                 :1;
	unsigned char m_includeCachedCopy          :1;
	unsigned char m_doLinkSpamCheck            :1;
	unsigned char m_isLinkSpam                 :1; // Msg25 uses this for storage
	unsigned char m_isSiteLinkInfo             :1; // is this for site link info?
	unsigned char m_isDebug                    :1;
	// when set, just calls TitleRec::getLinkInfo() to set ptr_linkInfo
	unsigned char m_getLinkInfo                :1;
	// when set, the title etc. of BAD inlinks (those deemed link spam)
	// is not computed
	unsigned char m_onlyNeedGoodInlinks        :1;
	// when set, ptr_linkText etc. get filled in
	unsigned char m_getLinkText                :1;
	unsigned char m_allowHighFrequencyTermCache:1;

	//
	// pointer + size variable section
	//
	char       *ptr_qbuf;
	char       *ptr_ubuf;          // url buffer
	const char *ptr_linkee;        // used by Msg25 for getting link text
	char       *ptr_displayMetas;

	int32_t     size_qbuf;
	int32_t     size_ubuf;         // url buffer
	int32_t     size_linkee;       // size includes terminating \0
	int32_t     size_displayMetas; // size includes terminating \0

	// variable data comes here
};
// only used by pointer in Msg20Reply::sendReply(), so a forward
// declaration is enough here
struct Msg20State;

// Reply side of a Msg20 lookup. Fixed-size members come first; the
// ptr_*/size_* pairs at the end describe the variable-length section.
// NOTE: member order is part of the layout; do not reorder.
class Msg20Reply {
public:
	Msg20Reply();

	// frees the merge buf from Msg40.cpp merging event summaries
	~Msg20Reply();

	void destructor();

	// Set every byte of the object to zero.
	void reset() {
		memset(this, 0, sizeof(Msg20Reply));
	}

	// number of bytes needed if the reply were serialized
	int32_t getStoredSize() const;

	int32_t deserialize();
	int32_t serialize(char *buf, int32_t bufSize) const;

	bool sendReply(Msg20State *state);

	// The "clear" functions zero the size of selected strings so that a
	// later serialize() leaves them out. Msg40 uses this to cut down the
	// memory needed for caching the Msg40, which includes an array of
	// Msg20s.
	void clearOutlinks() {
		size_linkText        = 0;
		size_surroundingText = 0;
		size_outlinks        = 0;
	}

	void clearVectors() {
		size_vbuf = 0;
	}

	//
	// fixed-size section
	//
	int32_t   m_ip;
	int32_t   m_firstIp;
	int32_t   m_wordPosStart;
	int64_t   m_docId;
	int32_t   m_firstSpidered;
	int32_t   m_lastSpidered;
	int32_t   m_lastModified;
	int32_t   m_datedbDate;
	int32_t   m_firstIndexedDate;       // for the url/document as a whole
	int32_t   m_discoveryDate;          // for the inlink in question
	int32_t   m_errno;                  // LinkInfo uses it for LinkTextRepl
	collnum_t m_collnum;                // collection # we came from
	char      m_noArchive;
	char      m_contentType;
	char      m_siteRank;
	bool      m_isBanned;
	char      m_recycled;
	uint8_t   m_language;
	uint16_t  m_country;
	bool      m_isAdult;
	int16_t   m_httpStatus;
	int32_t   m_indexCode;
	int32_t   m_contentLen;             // was m_docLen
	int32_t   m_contentHash32;          // for deduping diffbot json objects streaming
	int32_t   m_pageNumInlinks;
	int32_t   m_pageNumGoodInlinks;
	int32_t   m_pageNumUniqueIps;       // includes our own inlinks
	int32_t   m_pageNumUniqueCBlocks;   // includes our own inlinks
	int32_t   m_pageInlinksLastUpdated;
	int32_t   m_siteNumInlinks;         // GOOD inlinks only
	int32_t   m_numOutlinks;            // replaced m_linkCount

	// storage for LinkInfo::set() to use
	int32_t   m_linkTextNumWords;
	int32_t   m_midDomHash;             // set for m_getLinkText
	char      m_isLinkSpam;             // set for m_getLinkText
	char      m_outlinkInContent;       // set for m_getLinkText
	char      m_outlinkInComment;       // set for m_getLinkText
	char      m_isPermalink;            // set for m_getLinkText (buzz)
	bool      m_isDisplaySumSetFromTags;

	//
	// pointer + size variable section: pointers
	//
	char *ptr_tbuf;        // title buffer
	char *ptr_htag;        // h1 tag buf
	char *ptr_ubuf;        // url buffer
	char *ptr_rubuf;       // redirect url buffer
	char *ptr_displaySum;  // summary for displaying
	char *ptr_dbuf;        // display metas, \0 separated
	char *ptr_vbuf;        // summary vector
	char *ptr_imgData;     // for encoded images
	char *ptr_site;

	// A serialized LinkInfo class (the inlinks).
	// . if m_computeLinkInfo is true this is computed fresh using Msg25
	// . if m_setLinkInfo is true this is just set from the titleRec
	char *ptr_linkInfo;

	// A serialized LinkInfo class, made using
	// LinkInfo::set ( Msg20Reply **ptrs )
	char *ptr_outlinks;

	// The following are used only by Msg25 to compute LinkInfo. Msg25
	// calls Msg20 on the docid of a potentially good inlinker instead
	// of calling the now obsolete Msg23::getLinkText().
	int32_t    *ptr_vector1;         // set for m_getLinkText
	int32_t    *ptr_vector2;         // set for m_getLinkText
	int32_t    *ptr_vector3;         // set for m_getLinkText
	char       *ptr_linkText;        // set for m_getLinkText
	char       *ptr_surroundingText; // set for m_getLinkText
	char       *ptr_linkUrl;         // what we link to
	char       *ptr_rssItem;         // set for m_getLinkText
	const char *ptr_categories;
	char       *ptr_content;         // page content in utf8
	char       *ptr_templateVector;
	char       *ptr_metadataBuf;
	const char *ptr_note;            // reason why it cannot vote

	//
	// pointer + size variable section: sizes, one per pointer above and
	// in the same order
	//
	int32_t size_tbuf;
	int32_t size_htag;
	int32_t size_ubuf;
	int32_t size_rubuf;
	int32_t size_displaySum;
	int32_t size_dbuf;
	int32_t size_vbuf;
	int32_t size_imgData;
	int32_t size_site;
	int32_t size_linkInfo;
	int32_t size_outlinks;
	int32_t size_vector1;
	int32_t size_vector2;
	int32_t size_vector3;
	int32_t size_linkText;
	int32_t size_surroundingText;
	int32_t size_linkUrl;
	int32_t size_rssItem;
	int32_t size_categories;
	int32_t size_content;            // page content in utf8
	int32_t size_templateVector;
	int32_t size_metadataBuf;
	int32_t size_note;

	// variable data comes here
};
// Driver object for one Msg20 lookup: takes a Msg20Request, sends it out
// through a Multicast and exposes the resulting Msg20Reply through m_r.
class Msg20 {
public:
	// . this should only be called once
	// . should also register our get record handlers with the udpServer
	static bool registerHandler();

	// see the definition of Msg20Request earlier in this header
	bool getSummary ( class Msg20Request *r );

	// the reply, or NULL if there is none; every accessor below that
	// forwards to the reply checks for NULL first
	// (original note: "this is cast to m_replyPtr")
	Msg20Reply *m_r ;
	int32_t m_replySize;
	int32_t m_replyMaxSize;

	// i guess Msg40.cpp looks at this flag
	bool m_gotReply;

	// set if we had an error
	int32_t m_errno;

	int64_t getRequestDocId () const { return m_requestDocId; }

	// . how many bytes the reply would need if serialized
	// . returns 0 if we have no reply
	int32_t getStoredSize() const {
		if ( ! m_r ) return 0;
		return m_r->getStoredSize();
	}

	// . return how many bytes we serialize into "buf"
	// . returns 0 if we have no reply to serialize
	// . sets g_errno and returns -1 on error
	// . const, like getStoredSize(): it only forwards to
	//   Msg20Reply::serialize(), which is itself const, so it can be
	//   called on a const Msg20 as well
	int32_t serialize ( char *buf , int32_t bufSize ) const {
		if ( ! m_r ) return 0;
		return m_r->serialize ( buf , bufSize );
	}

	// . this is destructive on the "buf". it converts offs to ptrs
	// . sets m_r to the modified "buf" when done
	// . sets g_errno and returns -1 on error, otherwise # of bytes deseril
	int32_t deserialize ( char *buf , int32_t bufSize ) ;

	// Msg40 caches each Msg20Reply when it caches the page of results, so,
	// to keep the size of the cached Msg40 down, we do not cache certain
	// things. so we have to "clear" these guys out before caching.
	// both are no-ops when there is no reply.
	void clearLinks   () { if ( m_r ) m_r->clearOutlinks (); }
	void clearVectors () { if ( m_r ) m_r->clearVectors  (); }

	// copy "src" to ourselves
	void moveFrom(Msg20 *src);

	void gotReply ( class UdpSlot *slot );

	// general purpose routines
	Msg20();
	~Msg20();

	// so we can alloc arrays of these using mmalloc()
	void constructor ();
	void destructor  ();

	void freeReply ();
	void reset     ();

	int32_t m_ii;

	// is the reply in progress? if msg20 has not launched a request
	// this is false. if msg20 received its reply, this is false.
	// otherwise this is true.
	bool m_inProgress;
	bool m_launched;

private:
	// the request buffer, its size and the docid it asks for
	char    *m_request;
	int32_t  m_requestSize;
	int64_t  m_requestDocId;

	// for sending the request
	Multicast m_mcast;

	// NOTE(review): presumably says whether freeReply() must release
	// m_r -- confirm against Msg20.cpp
	bool m_ownReply;

	bool (*m_callback ) ( void *state );
	void (*m_callback2) ( void *state );
	void *m_state;

	static void gotReplyWrapper20(void *state, void *state20);
};
#endif // GB_MSG20_H