// Mirror of https://github.com/privacore/open-source-search-engine.git
// (synced 2025-01-22 02:18:42 -05:00). Original file: 326 lines, 10 KiB, C++.
// Matt Wells, copyright Nov 2007
|
|
|
|
// get various information from a query and a docId, like summary, title, etc.
|
|
|
|
#ifndef GB_MSG20_H
|
|
#define GB_MSG20_H
|
|
|
|
#include "Multicast.h"
|
|
#include "collnum_t.h"
|
|
#include "WordVariationsConfig.h"
|
|
|
|
|
|
class Msg20Request {
|
|
public:
|
|
|
|
Msg20Request() { reset(); }
|
|
|
|
// zero ourselves out
|
|
void reset() {
|
|
memset(this,0,sizeof(*this));
|
|
// these are the only non-zero defaults
|
|
m_numSummaryLines = 1;
|
|
m_docId = -1LL; // set docid to "invalid"
|
|
m_titleMaxLen = 80 ;
|
|
m_summaryMaxLen = 180 ;
|
|
}
|
|
|
|
int32_t getStoredSize() const;
|
|
char *serialize(int32_t *sizePtr) const;
|
|
int32_t deserialize ( );
|
|
int64_t makeCacheKey() const;
|
|
|
|
char m_numSummaryLines ; // non-zero default
|
|
bool m_getHeaderTag ;
|
|
void *m_state ;
|
|
void *m_state2 ; // used by Msg25.cpp
|
|
int32_t m_j ; // used by Msg25.cpp
|
|
bool (* m_callback)( void *m_state );
|
|
int64_t m_docId ;
|
|
int32_t m_niceness ;
|
|
int32_t m_titleMaxLen ;
|
|
int32_t m_summaryMaxLen ;
|
|
int32_t m_summaryMaxNumCharsPerLine ;
|
|
int64_t m_maxCacheAge ;
|
|
int32_t m_discoveryDate ;
|
|
|
|
// special shit so we can remove an inlinker to a related docid
|
|
// if they also link to the main url we are processing seo for.
|
|
// set both of these to 0 to disregard.
|
|
int32_t m_ourHostHash32;
|
|
int32_t m_ourDomHash32;
|
|
|
|
// language the query is in (ptr_qbuf)
|
|
uint8_t m_langId;
|
|
uint8_t m_prefferedResultLangId;
|
|
// we now use the numeric collection # and not the ptr_coll
|
|
collnum_t m_collnum;
|
|
|
|
WordVariationsConfig m_word_variations_config;
|
|
|
|
unsigned char m_useQueryStopWords :1;
|
|
unsigned char m_highlightQueryTerms :1;
|
|
unsigned char m_getSummaryVector :1;
|
|
unsigned char m_showBanned :1;
|
|
unsigned char m_includeCachedCopy :1;
|
|
unsigned char m_doLinkSpamCheck :1;
|
|
unsigned char m_isLinkSpam :1; // Msg25 uses for storage
|
|
unsigned char m_isSiteLinkInfo :1; // site link info?
|
|
unsigned char m_isDebug :1;
|
|
// if true, just calls TitleRec::getLinkInfo() to set ptr_linkInfo
|
|
unsigned char m_getLinkInfo :1;
|
|
// if this is true we will not compute the title, etc. of BAD inlinks
|
|
// deemed link spam
|
|
unsigned char m_onlyNeedGoodInlinks :1;
|
|
// if true, sets ptr_linkText, etc.
|
|
unsigned char m_getLinkText :1;
|
|
unsigned char m_allowHighFrequencyTermCache:1;
|
|
|
|
// pointer+size variable section
|
|
char *ptr_qbuf ;
|
|
char *ptr_ubuf ; // url buffer
|
|
const char *ptr_linkee; // used by Msg25 for getting link text
|
|
char *ptr_displayMetas ;
|
|
|
|
int32_t size_qbuf ;
|
|
int32_t size_ubuf ; // url buffer
|
|
int32_t size_linkee ; // size includes terminating \0
|
|
int32_t size_displayMetas ; // size includes terminating \0
|
|
|
|
// variable data comes here
|
|
};
|
|
|
|
|
|
// Forward declaration only: Msg20Reply::sendReply() takes a Msg20State pointer.
struct Msg20State;
|
|
|
|
class Msg20Reply {
|
|
public:
|
|
|
|
Msg20Reply();
|
|
// free the merge buf from Msg40.cpp merging event summaries
|
|
~Msg20Reply();
|
|
void destructor();
|
|
|
|
// zero ourselves out
|
|
void reset() { memset(this,0,sizeof(*this)); }
|
|
|
|
// how many bytes if we had to serialize it?
|
|
int32_t getStoredSize() const;
|
|
|
|
int32_t deserialize ( ) ;
|
|
int32_t serialize(char *buf, int32_t bufSize) const;
|
|
|
|
|
|
bool sendReply(Msg20State *state);
|
|
|
|
// after calling these, when serialize() is called again it will
|
|
// exclude these strings which were "cleared". Used by Msg40 to
|
|
// reduce the memory required for caching the Msg40 which includes an
|
|
// array of Msg20s.
|
|
void clearOutlinks ( ) {
|
|
size_linkText = 0;
|
|
size_surroundingText = 0;
|
|
size_outlinks = 0;
|
|
}
|
|
|
|
void clearVectors() {
|
|
size_vbuf = 0;
|
|
}
|
|
|
|
int32_t m_ip ;
|
|
int32_t m_firstIp ;
|
|
int32_t m_wordPosStart ;
|
|
int64_t m_docId ;
|
|
int32_t m_firstSpidered ;
|
|
int32_t m_lastSpidered ;
|
|
int32_t m_lastModified ;
|
|
int32_t m_datedbDate ;
|
|
int32_t m_firstIndexedDate ; // for the url/document as a whole
|
|
int32_t m_discoveryDate ; // for the inlink in question...
|
|
int32_t m_errno ; // LinkInfo uses it for LinkTextRepl
|
|
collnum_t m_collnum ; // collection # we came from
|
|
char m_noArchive ;
|
|
char m_contentType ;
|
|
char m_siteRank ;
|
|
bool m_isBanned ;
|
|
char m_recycled ;
|
|
uint8_t m_language ;
|
|
uint16_t m_country ;
|
|
bool m_isAdult ;
|
|
|
|
int16_t m_httpStatus;
|
|
int32_t m_indexCode;
|
|
int32_t m_contentLen ; // was m_docLen
|
|
int32_t m_contentHash32 ; // for deduping diffbot json objects streaming
|
|
int32_t m_pageNumInlinks ;
|
|
int32_t m_pageNumGoodInlinks ;
|
|
int32_t m_pageNumUniqueIps ; // includes our own inlinks
|
|
int32_t m_pageNumUniqueCBlocks; // includes our own inlinks
|
|
int32_t m_pageInlinksLastUpdated;
|
|
|
|
int32_t m_siteNumInlinks ; // GOOD inlinks!
|
|
|
|
int32_t m_numOutlinks ; // replaced m_linkCount
|
|
|
|
// these are just storage for LinkInfo::set() to use
|
|
int32_t m_linkTextNumWords ;
|
|
|
|
int32_t m_midDomHash ; // set for m_getLinkText
|
|
|
|
char m_isLinkSpam ; // set for m_getLinkText
|
|
char m_outlinkInContent ; // set for m_getLinkText
|
|
char m_outlinkInComment ; // set for m_getLinkText
|
|
char m_isPermalink ; // set for m_getLinkText (buzz)
|
|
|
|
bool m_isDisplaySumSetFromTags;
|
|
|
|
// pointer+size variable section
|
|
char *ptr_tbuf ; // title buffer
|
|
char *ptr_htag ; // h1 tag buf
|
|
char *ptr_ubuf ; // url buffer
|
|
char *ptr_rubuf ; // redirect url buffer
|
|
char *ptr_displaySum ; // summary for displaying
|
|
char *ptr_dbuf ; // display metas \0 separated
|
|
char *ptr_vbuf ; // summary vector
|
|
char *ptr_imgData ; // for encoded images
|
|
char *ptr_site ;
|
|
|
|
// . if m_computeLinkInfo is true this is computed using Msg25 (fresh)
|
|
// . if m_setLinkInfo is true this is just set from the titleRec
|
|
// . this is a serialized LinkInfo class
|
|
char *ptr_linkInfo; // inlinks ;
|
|
// . made using LinkInfo::set ( Msg20Reply **ptrs )
|
|
// . this is a serialized LinkInfo class
|
|
char *ptr_outlinks ;
|
|
|
|
// . these are used only by Msg25 to compute LinkInfo
|
|
// . Msg25 will call Msg20 on the docid of a potentially good inlinker
|
|
// instead of calling the now obsolete Msg23::getLinkText()
|
|
int32_t *ptr_vector1 ; // set for m_getLinkText
|
|
int32_t *ptr_vector2 ; // set for m_getLinkText
|
|
int32_t *ptr_vector3 ; // set for m_getLinkText
|
|
char *ptr_linkText ; // set for m_getLinkText
|
|
char *ptr_surroundingText ; // set for m_getLinkText
|
|
char *ptr_linkUrl ; // what we link to
|
|
char *ptr_rssItem ; // set for m_getLinkText
|
|
const char *ptr_categories ;
|
|
char *ptr_content ; // page content in utf8
|
|
char *ptr_templateVector ;
|
|
char *ptr_metadataBuf;
|
|
|
|
const char *ptr_note ; // reason why it cannot vote
|
|
|
|
int32_t size_tbuf;
|
|
int32_t size_htag;
|
|
int32_t size_ubuf;
|
|
int32_t size_rubuf;
|
|
int32_t size_displaySum;
|
|
int32_t size_dbuf;
|
|
int32_t size_vbuf;
|
|
int32_t size_imgData;
|
|
int32_t size_site;
|
|
int32_t size_linkInfo;
|
|
int32_t size_outlinks;
|
|
int32_t size_vector1;
|
|
int32_t size_vector2;
|
|
int32_t size_vector3;
|
|
int32_t size_linkText;
|
|
int32_t size_surroundingText;
|
|
int32_t size_linkUrl;
|
|
int32_t size_rssItem;
|
|
int32_t size_categories;
|
|
int32_t size_content; // page content in utf8
|
|
int32_t size_templateVector;
|
|
int32_t size_metadataBuf;
|
|
int32_t size_note;
|
|
|
|
// variable data comes here
|
|
};
|
|
|
|
class Msg20 {
|
|
public:
|
|
|
|
// . this should only be called once
|
|
// . should also register our get record handlers with the udpServer
|
|
static bool registerHandler();
|
|
|
|
// see definition of Msg20Request below
|
|
bool getSummary ( class Msg20Request *r );
|
|
|
|
// this is cast to m_replyPtr
|
|
Msg20Reply *m_r ;
|
|
int32_t m_replySize;
|
|
int32_t m_replyMaxSize;
|
|
|
|
// i guess Msg40.cpp looks at this flag
|
|
bool m_gotReply;
|
|
|
|
// set if we had an error
|
|
int32_t m_errno;
|
|
|
|
int64_t getRequestDocId () const { return m_requestDocId; }
|
|
|
|
int32_t getStoredSize() const {
|
|
if ( ! m_r ) return 0;
|
|
return m_r->getStoredSize();
|
|
}
|
|
|
|
// . return how many bytes we serialize into "buf"
|
|
// . sets g_errno and returns -1 on error
|
|
int32_t serialize ( char *buf , int32_t bufSize ) {
|
|
if ( ! m_r ) return 0;
|
|
return m_r->serialize ( buf , bufSize );
|
|
}
|
|
|
|
// . this is destructive on the "buf". it converts offs to ptrs
|
|
// . sets m_r to the modified "buf" when done
|
|
// . sets g_errno and returns -1 on error, otherwise # of bytes deseril
|
|
int32_t deserialize ( char *buf , int32_t bufSize ) ;
|
|
|
|
// Msg40 caches each Msg20Reply when it caches the page of results, so,
|
|
// to keep the size of the cached Msg40 down, we do not cache certain
|
|
// things. so we have to "clear" these guys out before caching.
|
|
void clearLinks () { if ( m_r ) m_r->clearOutlinks (); }
|
|
void clearVectors () { if ( m_r ) m_r->clearVectors (); }
|
|
// copy "src" to ourselves
|
|
void moveFrom(Msg20 *src);
|
|
|
|
void gotReply ( class UdpSlot *slot );
|
|
|
|
// general purpose routines
|
|
Msg20();
|
|
~Msg20();
|
|
// so we can alloc arrays of these using mmalloc()
|
|
void constructor ();
|
|
void destructor ();
|
|
void freeReply ();
|
|
void reset ();
|
|
|
|
int32_t m_ii;
|
|
|
|
// is the reply in progress? if msg20 has not launched a request
|
|
// this is false. if msg20 received its reply, this is false.
|
|
// otherwise this is true.
|
|
bool m_inProgress;
|
|
bool m_launched;
|
|
|
|
private:
|
|
char *m_request;
|
|
int32_t m_requestSize;
|
|
|
|
int64_t m_requestDocId;
|
|
|
|
// for sending the request
|
|
Multicast m_mcast;
|
|
|
|
bool m_ownReply;
|
|
|
|
bool (*m_callback ) ( void *state );
|
|
void (*m_callback2) ( void *state );
|
|
void *m_state;
|
|
|
|
static void gotReplyWrapper20(void *state, void *state20);
|
|
};
|
|
|
|
#endif // GB_MSG20_H
|