822 lines
33 KiB
C++
822 lines
33 KiB
C++
// Matt Wells, copyright Nov 2007
|
|
|
|
// get various information from a query and a docId, like summary, title, etc.
|
|
|
|
#ifndef _MSG20_H_
|
|
#define _MSG20_H_
|
|
|
|
#include "UdpServer.h"
|
|
#include "Hostdb.h"
|
|
#include "Multicast.h"
|
|
#include "Xml.h"
|
|
#include "Summary.h"
|
|
#include "Titledb.h"
|
|
#include "Query.h"
|
|
//#include "LinkInfo.h"
|
|
#include "Tagdb.h" // TagRec
|
|
#include "Events.h" // EventIdBits
|
|
|
|
// values for SummaryLine::m_flags
|
|
//#define SL_TRUNCATED 0x0100
|
|
//#define SL_IS_TITLE 0x0080
|
|
//#define SL_HAS_DATE 0x0040
|
|
//#define SL_HAS_QTERM 0x0020
|
|
//#define SL_BULLET 0x0001
|
|
|
|
#define MSG20_CURRENT_VERSION 0
|
|
|
|
// MAX_QUERY_LEN is pretty big and Msg40 contains 50 or so Msg20s so let's
|
|
// cut down on memory usage here.
|
|
//#define MAX_MSG20_REQUEST_SIZE (500+500)
|
|
//#define MSG20_MAX_REPLY_SIZE (1*1024)
|
|
// see what happens if we eliminate these bufs
|
|
#define MAX_MSG20_REQUEST_SIZE (1)
|
|
#define MSG20_MAX_REPLY_SIZE (1)
|
|
|
|
#define REQ20FLAG1_USEDATELISTS 0x01
|
|
#define REQ20FLAG1_EXCLDATELIST 0x02
|
|
#define REQ20FLAG1_EXCLQTINANCH 0x04
|
|
#define REQ20FLAG1_PQRENABLED 0x08
|
|
#define REQ20FLAG1_PQRLOCENABLED 0x010
|
|
|
|
#define INLINK_FLAG_SPAM 0x01
|
|
#define INLINK_FLAG_HASTEXT 0x02
|
|
|
|
class Msg20Request {
|
|
public:
|
|
|
|
Msg20Request() { reset(); };
|
|
|
|
// zero ourselves out
|
|
void reset() {
|
|
memset ( (char *)this,0,sizeof(Msg20Request) );
|
|
// these are the only non-zero defaults
|
|
m_version = MSG20_CURRENT_VERSION;
|
|
m_maxNumCharsPerLine = 50;
|
|
m_numSummaryLines = 2;
|
|
m_expected = false;
|
|
m_allowPunctInPhrase = true;
|
|
m_docId = -1LL; // set docid to "invalid"
|
|
m_boolFlag = 2 ; // autodetect if query boolean
|
|
m_titleMaxLen = 64 ;
|
|
m_summaryMaxLen = 512 ;
|
|
};
|
|
|
|
int32_t getStoredSize ( );
|
|
char *serialize ( int32_t *sizePtr ,
|
|
char *userBuf ,
|
|
int32_t userBufSize ) ;
|
|
int32_t deserialize ( );
|
|
|
|
char m_version ; // non-zero default
|
|
char m_numSummaryLines ; // non-zero default
|
|
char m_expected ; // non-zero default
|
|
char m_allowPunctInPhrase ; // non-zero default
|
|
bool m_getHeaderTag ;
|
|
void *m_state ;
|
|
void *m_state2 ; // used by Msg25.cpp
|
|
int32_t m_j ; // used by Msg25.cpp
|
|
bool (* m_callback)( void *m_state );
|
|
void (* m_callback2)( void *m_state );
|
|
int64_t m_docId ;
|
|
Hostdb *m_hostdb ;
|
|
int32_t m_niceness ;
|
|
char m_boolFlag ;
|
|
int32_t m_titleMaxLen ;
|
|
int32_t m_summaryMaxLen ;
|
|
int32_t m_summaryMaxNumCharsPerLine ;
|
|
int32_t m_maxNumCharsPerLine ;
|
|
int32_t m_bigSampleRadius ;
|
|
int32_t m_bigSampleMaxLen ;
|
|
int32_t m_maxCacheAge ;
|
|
int32_t m_maxLinks ;
|
|
int32_t m_discoveryDate ;
|
|
|
|
// special shit so we can remove an inlinker to a related docid
|
|
// if they also link to the main url we are processing seo for.
|
|
// set both of these to 0 to disregard.
|
|
int32_t m_ourHostHash32;
|
|
int32_t m_ourDomHash32;
|
|
|
|
FacetValHash_t m_facetValHash;
|
|
|
|
char m_justGetFacets : 1 ;
|
|
|
|
// for sending msg20 request to another network
|
|
//int32_t m_hostIP;
|
|
//int32_t m_hostUDPPort;
|
|
|
|
// if titleRec not from this ruleset, return g_errno = EDOCFILTERED
|
|
//int32_t m_rulesetFilter ;
|
|
// add this many seconds to clock to simulate event search going
|
|
// forward or backward in time
|
|
int32_t m_clockOff;
|
|
// we force the clock time to this if "clockset" is a non-zero cgi parm
|
|
time_t m_clockSet;
|
|
// pass in the same time in UTC we used for the intersection algo
|
|
time_t m_nowUTC;
|
|
int32_t m_turkIp;
|
|
// language the query is in (ptr_qbuf)
|
|
uint8_t m_langId;
|
|
// . if not 0 then return the event from the docid with that eventId
|
|
// . include the title and text of the event, and the address
|
|
// serialized using Address::serialize(), and all the start dates
|
|
// from now onward
|
|
int32_t m_eventId ;
|
|
// we now use the numeric collection # and not the ptr_coll
|
|
collnum_t m_collnum;
|
|
// set this to true when you pass in m_eventIdBits...
|
|
char m_getEventSummary ;
|
|
char m_summaryMode ;
|
|
// typically we allow 1 vote per ip or host i guess, but buzz should
|
|
// allow for up to 4, for better influence determination.
|
|
char m_linksPerIpHost ;
|
|
char m_flags ;
|
|
|
|
char m_highlightQueryTerms :1;
|
|
char m_highlightDates :1; // for event dates
|
|
char m_wcache :1;
|
|
//char m_checkSitedb :1;
|
|
char m_getImageUrl :1;
|
|
char m_ratInSummary :1;
|
|
char m_countOutlinks :1;
|
|
char m_considerTitlesFromBody :1;
|
|
char m_getSummaryVector :1;
|
|
char m_showBanned :1;
|
|
//char m_excludeLinkText :1;
|
|
//char m_excludeMetaText :1;
|
|
//char m_hackFixWords :1;
|
|
//char m_hackFixPhrases :1;
|
|
char m_includeCachedCopy :1;
|
|
char m_getSectionVotingInfo :1; // in JSON for now
|
|
char m_getMatches :1;
|
|
char m_useLinkdbForInlinks :1;
|
|
char m_getTermListBuf :1;
|
|
//char m_getInlinks :1; // use m_getLinkInfo!
|
|
char m_getOutlinks :1;
|
|
char m_getTitleRec :1; // sets ptr_tr in reply
|
|
char m_maxInlinks :1;
|
|
char m_getGigabitVector :1;
|
|
char m_doLinkSpamCheck :1;
|
|
char m_isLinkSpam :1; // Msg25 uses for storage
|
|
char m_isSiteLinkInfo :1; // site link info?
|
|
char m_isDebug :1;
|
|
// if true, calls Msg25 and fills in ptr_linkInfo/size_linkInfo
|
|
char m_computeLinkInfo :1;
|
|
// if true, just calls TitleRec::getLinkInfo() to set ptr_linkInfo
|
|
char m_getLinkInfo :1;
|
|
// if this is true we will not compute the title, etc. of BAD inlinks
|
|
// deemed link spam
|
|
char m_onlyNeedGoodInlinks :1;
|
|
// if true, sets ptr_linkText, etc.
|
|
char m_getLinkText :1;
|
|
// if this is true then we set ptr_turkForm to be an input form
|
|
// for turking this event summary and title
|
|
char m_getTurkForm :1;
|
|
char m_showTurkInstructions :1;
|
|
char m_isTurkSpecialQuery :1;
|
|
char m_isMasterAdmin :1;
|
|
// . this is for buzz.
|
|
// . this says to compute the <absScore2> tag in their xml feed.
|
|
// . the document receives a score of 0 if it does not match the query
|
|
// . can we just keep it a binary score? let's try that.
|
|
char m_checkForQueryMatch :1;
|
|
|
|
// pointer+size variable section
|
|
char *ptr_qbuf ;
|
|
char *ptr_hqbuf ;
|
|
//char *ptr_q2buf ;
|
|
char *ptr_turkUser ;
|
|
char *ptr_ubuf ; // url buffer
|
|
char *ptr_rubuf ; // redirect url buffer
|
|
char *ptr_termFreqs ;
|
|
char *ptr_affWeights ;
|
|
char *ptr_linkee ; // used by Msg25 for getting link text
|
|
//char *ptr_coll ;
|
|
//char *ptr_imgUrl ;
|
|
char *ptr_displayMetas ;
|
|
|
|
int32_t size_qbuf ;
|
|
int32_t size_hqbuf ;
|
|
//int32_t size_q2buf ;
|
|
int32_t size_turkUser ;
|
|
int32_t size_ubuf ; // url buffer
|
|
int32_t size_rubuf ; // redirect url buffer
|
|
int32_t size_termFreqs ;
|
|
int32_t size_affWeights ;
|
|
int32_t size_linkee ; // size includes terminating \0
|
|
//int32_t size_coll ; // size includes terminating \0
|
|
//int32_t size_imgUrl ;
|
|
int32_t size_displayMetas ; // size includes terminating \0
|
|
|
|
// variable data comes here
|
|
};
|
|
|
|
// the Msg20Reply::ptr_eventSummaryLines is a list of these classes
|
|
class SummaryLine {
|
|
public:
|
|
int32_t m_totalSize;
|
|
//int32_t m_pageOff;
|
|
int32_t m_pageOff1;
|
|
int32_t m_pageOff2;
|
|
int32_t m_firstDatePageOff;
|
|
// so we know if two summary lines are adjacent. then we do not
|
|
// insert the "..." between them when displaying.
|
|
int32_t m_alnumPosA;
|
|
int32_t m_alnumPosB;
|
|
// copied from EventDesc::m_dflags. might also include some tags
|
|
// that we add in XmlDoc::getEventSummary(), like EDF_TRUNCATED
|
|
int32_t m_flags;
|
|
// if two summary lines are adjacent then do not print the ... between
|
|
// in the serps, will look cleaner...
|
|
//int32_t m_alnumWordA;
|
|
//int32_t m_alnumWordB;
|
|
char m_buf[0];
|
|
};
|
|
|
|
// values for m_flags3
|
|
//#define F3_STORE_HOURS 0x01
|
|
|
|
class Msg20Reply {
|
|
public:
|
|
|
|
Msg20Reply();
|
|
// free the merge buf from Msg40.cpp merging event summaries
|
|
~Msg20Reply();
|
|
void destructor();
|
|
|
|
// zero ourselves out
|
|
void reset() { memset ( (char *)this,0,sizeof(Msg20Reply) ); };
|
|
|
|
// how many bytes if we had to serialize it?
|
|
int32_t getStoredSize() ;
|
|
|
|
int32_t deserialize ( ) ;
|
|
int32_t serialize ( char *buf , int32_t bufSize );
|
|
|
|
char *getAttendeeUrl ( int32_t i ) { return ""; };
|
|
char *getLikerUrl ( int32_t i ) { return ""; };
|
|
|
|
bool sendReply ( class XmlDoc *xd ) ;
|
|
|
|
// after calling these, when serialize() is called again it will
|
|
// exclude these strings which were "cleared". Used by Msg40 to
|
|
// reduce the memory required for caching the Msg40 which includes an
|
|
// array of Msg20s.
|
|
//void clearBigSample ( ) { size_sbuf = 0; };
|
|
void clearOutlinks ( ) {
|
|
size_obuf = 0;
|
|
size_linkText = 0;
|
|
size_surroundingText = 0;
|
|
//size_linkInfo = 0;
|
|
size_outlinks = 0;
|
|
};
|
|
void clearVectors ( ) { size_vbuf = 0; };
|
|
|
|
// a new one for getting the display contents sequentially used
|
|
// by Msg24.cpp. this routine is the exclusive user of the "next"
|
|
// variable which must be set to "ptr_dbuf" when first called.
|
|
char *getNextDisplayBuf ( int32_t *len , char **next ) {
|
|
if ( ! *next ) return NULL;
|
|
if ( *next >= (char *)ptr_dbuf + size_dbuf ) return NULL;
|
|
char *s = *next;
|
|
*len = gbstrlen(*next);
|
|
*next += *len + 1;
|
|
return s;
|
|
};
|
|
|
|
char m_version ;
|
|
int32_t m_ip ;
|
|
int32_t m_firstIp ;
|
|
int32_t m_wordPosStart ;
|
|
int64_t m_domHash ;
|
|
int64_t m_docId ;
|
|
int64_t m_urlHash48 ;
|
|
uint64_t m_eventHash64 ;
|
|
int32_t m_eventId ;
|
|
uint64_t m_eventDateHash64 ;
|
|
uint32_t m_adch32 ; // event address/data content hash
|
|
uint32_t m_adth32 ; // event address/data tag hash
|
|
int32_t m_firstSpidered ;
|
|
int32_t m_lastSpidered ;
|
|
int32_t m_lastModified ;
|
|
int32_t m_datedbDate ;
|
|
int32_t m_firstIndexedDate ; // for the url/document as a whole
|
|
int32_t m_discoveryDate ; // for the inlink in question...
|
|
int32_t m_numAlnumWords ;
|
|
//int32_t m_numAttendees ;
|
|
//int32_t m_numLikers ;
|
|
bool m_datedbDateIsEstimated;
|
|
int32_t m_errno ; // LinkInfo uses it for LinkTextRepl
|
|
collnum_t m_collnum ; // collection # we came from
|
|
char m_sumFromDmoz ; // unused
|
|
int32_t m_hostHash ;
|
|
char m_noArchive ;
|
|
char m_contentType ;
|
|
//char m_docQuality ;
|
|
char m_siteRank ;
|
|
char m_isBanned ;
|
|
char m_isFiltered ;
|
|
char m_eventExpired ;
|
|
char m_hasLinkToOurDomOrHost;
|
|
//char m_isNormalized ;
|
|
char m_urlFilterNum ;
|
|
char m_hopcount ;
|
|
//char m_flags3 ;
|
|
char m_recycled ;
|
|
uint8_t m_language ;
|
|
uint8_t m_summaryLanguage ;
|
|
uint16_t m_country ;
|
|
uint16_t m_computedCountry ;
|
|
int16_t m_charset ;
|
|
// for use by caller
|
|
class Msg20Reply *m_nextMerged ;
|
|
//int32_t m_numCatIds ; // use size_catIds
|
|
//int32_t m_numIndCatIds ; // use size_indCatIds
|
|
int32_t m_contentLen ; // was m_docLen
|
|
int32_t m_contentHash32 ; // for deduping diffbot json objects streaming
|
|
//int32_t m_docSummaryScore ;
|
|
//int32_t m_inSectionScore ;
|
|
//float m_proximityScore ;
|
|
//int32_t m_ruleset ;
|
|
int32_t m_pageNumInlinks ;
|
|
int32_t m_pageNumGoodInlinks ;
|
|
int32_t m_pageNumUniqueIps ; // includes our own inlinks
|
|
int32_t m_pageNumUniqueCBlocks; // includes our own inlinks
|
|
int32_t m_pageInlinksLastUpdated;
|
|
|
|
int32_t m_siteNumInlinks ; // GOOD inlinks!
|
|
//int32_t m_siteNumInlinksTotal ; // TOTAL inlinks
|
|
//int32_t m_siteNumUniqueIps ;
|
|
//int32_t m_siteNumUniqueCBlocks;
|
|
|
|
int32_t m_numOutlinks ; // replaced m_linkCount
|
|
int32_t m_tmp ; // used by Msg40.cpp for event merge
|
|
//float m_diversity ;
|
|
uint32_t m_tagVectorHash ; // zak's hash of html template
|
|
uint32_t m_gigabitVectorHash ; // zak's hash of the gigabits
|
|
|
|
uint32_t m_eventSummaryHash ;
|
|
double m_eventGeocoderLat ; // lat/lon of the event
|
|
double m_eventGeocoderLon ;
|
|
uint64_t m_eventAddressHash64 ; // event address hash
|
|
uint64_t m_eventTitleHash64 ; // event title hash
|
|
int32_t m_eventTitleOff ; // offset of first word in title
|
|
evflags_t m_eventFlags ;
|
|
char m_timeZoneOffset ; // in hours
|
|
char m_useDST ; // does event place use dst?
|
|
int32_t m_nextStart ; // next occ starts at this time_t
|
|
int32_t m_nextEnd ; // end - start is how long it is
|
|
int32_t m_prevStart ;
|
|
int32_t m_prevEnd ;
|
|
int32_t m_displayStartTime ; // the event times to display
|
|
int32_t m_displayEndTime ;
|
|
|
|
double m_balloonLat;
|
|
double m_balloonLon;
|
|
char m_balloonLetter;
|
|
|
|
// . hash of all the event start times from now until 60 days from now
|
|
// . used by Msg40.cpp for merging event summaries deemed to be the
|
|
// same event
|
|
uint32_t m_timeIntervalHash;
|
|
|
|
// these are just storage for LinkInfo::set() to use
|
|
//int32_t m_linkTextScoreWeight ;
|
|
int32_t m_linkTextNumWords ;
|
|
//int32_t m_linkTextLinkerQualityBoost ;
|
|
//int32_t m_linkTextNumWordsBoost ;
|
|
//int32_t m_linkTextBaseScore ;
|
|
char *m_linkTextNote ;
|
|
|
|
//int32_t m_pagePop ; // set for m_computeLinkInfo
|
|
//int32_t m_siteRootPagePop ; // set for m_computeLinkInfo
|
|
//int32_t m_siteRootNumInlinks ; // set for m_computeLinkInfo
|
|
//int32_t m_sitePop ; // set for m_computeLinkInfo
|
|
int32_t m_midDomHash ; // set for m_getLinkText
|
|
int32_t m_adIdHash ; // set for m_getLinkText
|
|
int32_t m_timeLinkSpam ; // set for m_getLinkText
|
|
void *m_parentOwner;
|
|
char m_constructorId;
|
|
|
|
char m_inlinkWeight ; // set for m_getLinkText
|
|
char m_isLinkSpam ; // set for m_getLinkText
|
|
char m_isAnomaly ; // set for m_getLinkText
|
|
char m_outlinkInContent ; // set for m_getLinkText
|
|
char m_outlinkInComment ; // set for m_getLinkText
|
|
char m_hasAllQueryTerms ; // set for m_getLinkText (buzz)
|
|
char m_isPermalink ; // set for m_getLinkText (buzz)
|
|
|
|
// pointer+size variable section
|
|
char *ptr_tbuf ; // title buffer
|
|
char *ptr_htag ; // h1 tag buf
|
|
char *ptr_ubuf ; // url buffer
|
|
char *ptr_rubuf ; // redirect url buffer
|
|
char *ptr_displaySum ; // summary for displaying
|
|
char *ptr_dedupSum ; // summary for deduping
|
|
char *ptr_dbuf ; // display metas \0 separated
|
|
//char *ptr_sbuf ; // big sample buf for gigabits
|
|
char *ptr_gigabitSample ;
|
|
char *ptr_obuf ; // outlinks buf, \0 separated
|
|
char *ptr_mbuf ; // match offsets
|
|
char *ptr_vbuf ; // summary vector
|
|
char *ptr_tvbuf ; // title vector
|
|
char *ptr_gbvecbuf ; // gigabit vector
|
|
char *ptr_imgUrl ; // youtube/metacafe vid thumb
|
|
char *ptr_imgData ; // for encoded images
|
|
char *ptr_facetBuf ;
|
|
//char *ptr_eventEnglishTime ; // "every saturday [[]] jan"
|
|
//char *ptr_eventDateIntervals ;
|
|
char *ptr_likedbList ;
|
|
|
|
char *ptr_matchedQueryWords ;
|
|
char *ptr_numMatchedQueryWords ;
|
|
char *ptr_matchedTypes ;
|
|
|
|
int32_t *ptr_catIds ;
|
|
int32_t *ptr_indCatIds ;
|
|
//char *ptr_dmozTitleLens ;
|
|
char *ptr_dmozTitles ;
|
|
//char *ptr_dmozSummLens ;
|
|
char *ptr_dmozSumms ;
|
|
//char *ptr_dmozAnchorLens ;
|
|
char *ptr_dmozAnchors ;
|
|
//char *ptr_tagRec ;
|
|
char *ptr_site ;
|
|
char *ptr_gbAdIds ;
|
|
char *ptr_summLocs ;
|
|
char *ptr_summLocsPops ;
|
|
|
|
// . if m_computeLinkInfo is true this is computed using Msg25 (fresh)
|
|
// . if m_setLinkInfo is true this is just set from the titleRec
|
|
// . this is a serialized LinkInfo class
|
|
char *ptr_linkInfo; // inlinks ;
|
|
// . made using LinkInfo::set ( Msg20Reply **ptrs )
|
|
// . this is a serialized LinkInfo class
|
|
char *ptr_outlinks ;
|
|
|
|
// . these are used only by Msg25 to compute LinkInfo
|
|
// . Msg25 will call Msg20 on the docid of a potentially good inlinker
|
|
// instead of calling the now obsolete Msg23::getLinkText()
|
|
int32_t *ptr_vector1 ; // set for m_getLinkText
|
|
int32_t *ptr_vector2 ; // set for m_getLinkText
|
|
int32_t *ptr_vector3 ; // set for m_getLinkText
|
|
char *ptr_linkText ; // set for m_getLinkText
|
|
char *ptr_surroundingText ; // set for m_getLinkText
|
|
char *ptr_linkUrl ; // what we link to
|
|
char *ptr_rssItem ; // set for m_getLinkText
|
|
char *ptr_categories ;
|
|
char *ptr_gigabitQuery ; // , separated list of gigabits
|
|
int32_t *ptr_gigabitScores ; // 1-1 with the terms in query
|
|
char *ptr_content ; // page content in utf8
|
|
char *ptr_sectionVotingInfo ; // in JSON
|
|
char *ptr_tr ; // like just using msg22
|
|
char *ptr_tlistBuf ;
|
|
char *ptr_tiBuf ; // terminfobuf
|
|
char *ptr_templateVector ;
|
|
char *ptr_metadataBuf;
|
|
|
|
// . for eventIds include the title and text of the event, and the addr
|
|
// serialized using Address::serialize(), and all the start dates
|
|
// from now onward
|
|
// . contains serialized EventReply classes
|
|
// . usually just one, but if multiple events that had different
|
|
// addresses from this same docid matched the query, then we will
|
|
// have multiple EventReply classes in this buf
|
|
//char *ptr_eventSummaryLines ;
|
|
//char *ptr_eventAddr ;
|
|
//char *ptr_eventTagsFromContent ;
|
|
//char *ptr_eventTagsFromTagdb ;
|
|
//char *ptr_eventBestPlaceName ;
|
|
|
|
// . if Msg20Request::m_forTurk is true then the ptr_turkForm will
|
|
// be a little input form that lists every line in the title and
|
|
// description of the event along with controls that allow the turk
|
|
// to turn descriptions on/off and pick different titles.
|
|
// . when they submit their changes then it should basically add
|
|
// the turk tag hashes of each line to tagdb, but only if changed
|
|
// by the turk.
|
|
// . i guess it should submit directly to tagdb...
|
|
// . then we should do a query reindex on all docs with that
|
|
// tagformathash
|
|
//char *ptr_turkForm;
|
|
|
|
char *ptr_note ; // reason why it cannot vote
|
|
|
|
int32_t size_tbuf ;
|
|
int32_t size_htag ;
|
|
int32_t size_ubuf ;
|
|
int32_t size_rubuf ;
|
|
int32_t size_displaySum ;
|
|
int32_t size_dedupSum ;
|
|
int32_t size_dbuf ;
|
|
//int32_t size_sbuf ;
|
|
int32_t size_gigabitSample ; // includes \0
|
|
int32_t size_obuf ;
|
|
int32_t size_mbuf ;
|
|
int32_t size_vbuf ;
|
|
int32_t size_tvbuf ;
|
|
int32_t size_gbvecbuf ;
|
|
int32_t size_imgUrl ; // youtube/metacafe vid thumb
|
|
int32_t size_imgData ;
|
|
int32_t size_facetBuf ;
|
|
//int32_t size_eventEnglishTime ;
|
|
//int32_t size_eventDateIntervals ;
|
|
int32_t size_likedbList ;
|
|
|
|
int32_t size_matchedQueryWords ;
|
|
int32_t size_numMatchedQueryWords ;
|
|
int32_t size_matchedTypes ;
|
|
|
|
int32_t size_catIds ;
|
|
int32_t size_indCatIds ;
|
|
//int32_t size_dmozTitleLens ;
|
|
int32_t size_dmozTitles ;
|
|
//int32_t size_dmozSummLens ;
|
|
int32_t size_dmozSumms ;
|
|
//int32_t size_dmozAnchorLens ;
|
|
int32_t size_dmozAnchors ;
|
|
//int32_t size_tagRec ;
|
|
int32_t size_site ;
|
|
int32_t size_gbAdIds ;
|
|
int32_t size_summLocs ;
|
|
int32_t size_summLocsPops ;
|
|
|
|
int32_t size_linkInfo;//inlinks ;
|
|
int32_t size_outlinks ;
|
|
|
|
int32_t size_vector1 ;
|
|
int32_t size_vector2 ;
|
|
int32_t size_vector3 ;
|
|
int32_t size_linkText ;
|
|
int32_t size_surroundingText ;
|
|
int32_t size_linkUrl ;
|
|
int32_t size_rssItem ;
|
|
int32_t size_categories ;
|
|
int32_t size_gigabitQuery ;
|
|
int32_t size_gigabitScores ;
|
|
int32_t size_content ; // page content in utf8
|
|
int32_t size_sectionVotingInfo ; // in json, includes \0
|
|
int32_t size_tr ;
|
|
int32_t size_tlistBuf ;
|
|
int32_t size_tiBuf ;
|
|
int32_t size_templateVector ;
|
|
int32_t size_metadataBuf ;
|
|
|
|
//int32_t size_eventSummaryLines ;
|
|
//int32_t size_eventAddr ;
|
|
//int32_t size_eventTagsFromContent ;
|
|
//int32_t size_eventTagsFromTagdb ;
|
|
//int32_t size_eventBestPlaceName ;
|
|
|
|
//int32_t size_turkForm ;
|
|
|
|
int32_t size_note ;
|
|
|
|
// variable data comes here
|
|
|
|
int32_t getNumCatIds (){return size_catIds/4; };
|
|
int32_t getNumIndCatIds (){return size_indCatIds/4; };
|
|
int32_t getCatId (int32_t i){return ((int32_t *)ptr_catIds)[i]; };
|
|
int32_t getIndCatId (int32_t i){return ((int32_t *)ptr_indCatIds)[i];};
|
|
|
|
//int32_t getDmozTitleLen (int32_t i){
|
|
// return ((int32_t *)ptr_dmozTitleLens)[i];};
|
|
//int32_t getDmozSummLen (int32_t i){
|
|
// return ((int32_t *)ptr_dmozSummLens)[i]; };
|
|
//int32_t getDmozAnchorLen (int32_t i){
|
|
// return (int32_t)((uint8_t *)ptr_dmozAnchorLens)[i];};
|
|
//int32_t *getCatIds (){return (int32_t *)ptr_catIds; };
|
|
//int32_t *getIndCatIds (){return (int32_t *)ptr_indCatIds; };
|
|
//int32_t *getTitleLens (){return (int32_t *)ptr_dmozTitleLens;};
|
|
//int32_t *getSummLens (){return (int32_t *)ptr_dmozSummLens; };
|
|
//uint8_t*getAnchorLens (){return(uint8_t *)ptr_dmozAnchorLens;};
|
|
};
|
|
|
|
class Msg20 {
|
|
|
|
public:
|
|
|
|
// . this should only be called once
|
|
// . should also register our get record handlers with the udpServer
|
|
bool registerHandler ( );
|
|
|
|
// see definition of Msg20Request below
|
|
bool getSummary ( class Msg20Request *r );
|
|
|
|
// "m_request = r->serialize(&m_requestSize,m_requestBuf)"
|
|
char *m_request;
|
|
int32_t m_requestSize;
|
|
char m_requestBuf[MAX_MSG20_REQUEST_SIZE];
|
|
|
|
// this is cast to m_replyPtr
|
|
Msg20Reply *m_r ;
|
|
// m_replyPtr pts to either m_replyBuf or to mem allocated from the
|
|
// udp server to hold the reply.
|
|
//char *m_replyPtr;
|
|
int32_t m_replySize;
|
|
int32_t m_replyMaxSize;
|
|
//char m_replyBuf[MSG20_MAX_REPLY_SIZE];
|
|
// i guess Msg40.cpp looks at this flag
|
|
char m_gotReply;
|
|
// set if we had an error
|
|
int32_t m_errno;
|
|
|
|
int64_t getRequestDocId () { return m_requestDocId; };
|
|
int64_t m_requestDocId;
|
|
|
|
// and this is copied from the msg20request
|
|
int32_t m_eventId;
|
|
|
|
// when we merge two msg20 replies in Msg40.cpp keep track of the
|
|
// event ids via this bit vector so if the click on the [cached] page
|
|
// link we can highlight the relevant event sections.
|
|
//EventIdBits m_eventIdBits;
|
|
|
|
int32_t getStoredSize ( ) {
|
|
if ( ! m_r ) return 0;
|
|
return m_r->getStoredSize(); };
|
|
// . return how many bytes we serialize into "buf"
|
|
// . sets g_errno and returns -1 on error
|
|
int32_t serialize ( char *buf , int32_t bufSize ) {
|
|
if ( ! m_r ) return 0;
|
|
return m_r->serialize ( buf , bufSize ); };
|
|
// . this is destructive on the "buf". it converts offs to ptrs
|
|
// . sets m_r to the modified "buf" when done
|
|
// . sets g_errno and returns -1 on error, otherwise # of bytes deseril
|
|
int32_t deserialize ( char *buf , int32_t bufSize ) ;
|
|
// Msg40 caches each Msg20Reply when it caches the page of results, so,
|
|
// to keep the size of the cached Msg40 down, we do not cache certain
|
|
// things. so we have to "clear" these guys out before caching.
|
|
//void clearBigSample () { m_r->clearBigSample(); };
|
|
void clearOutlinks () { if ( m_r ) m_r->clearOutlinks (); };
|
|
void clearLinks () { if ( m_r ) m_r->clearOutlinks (); };
|
|
void clearVectors () { if ( m_r ) m_r->clearVectors (); };
|
|
// copy "src" to ourselves
|
|
void copyFrom ( class Msg20 *src ) ;
|
|
|
|
// inlinker information, used by PostQueryRerank.cpp
|
|
//class LinkInfo *getInlinks () {
|
|
// return (class LinkInfo *)m_r->ptr_inlinks ; };
|
|
//class LinkInfo *getOutlinks () {
|
|
// return (class LinkInfo *)m_r->ptr_outlinks; };
|
|
|
|
// just let caller parse it up
|
|
Msg20Reply *getReply () { return m_r; };
|
|
|
|
/*
|
|
// these just return what we parsed out from the reply
|
|
char *getSummary () { return m_r->ptr_sum ; };
|
|
char *getDisplayBuf () { return m_r->ptr_dbuf; };
|
|
char *getBigSampleBuf () { return m_r->ptr_sbuf; };
|
|
char *getTitle () { return m_r->ptr_tbuf; };
|
|
char *getUrl () { return m_r->ptr_ubuf; };
|
|
char *getRedirUrl () { return m_r->ptr_rubuf; };
|
|
char *getOutlinksBuf () { return m_r->ptr_obuf; };
|
|
char *getLinksBuf () { return m_r->ptr_obuf; };
|
|
char *getMatchOffBuf () { return m_r->ptr_mbuf; };
|
|
char *getSummaryVector () { return m_r->ptr_vbuf; };
|
|
// what was this? gigabit/sample vector combined?
|
|
char *getVectorRec () { return NULL; };
|
|
char *getGigabitVector () {return m_r->ptr_gbvecbuf;}
|
|
uint64_t *getSummaryLocs () { return (uint64_t*)m_r->ptr_summLocs;}
|
|
int32_t *getSummaryLocsPops() {return (int32_t *)m_r->ptr_summLocsPops;}
|
|
int32_t getSummaryLen () { return m_r->size_sum; };
|
|
int32_t getDisplayBufLen () { return m_r->size_dbuf; };
|
|
int32_t getBigSampleLen () { return m_r->size_sbuf; };
|
|
int32_t getTitleLen () { return m_r->size_tbuf; };
|
|
int32_t getUrlSize () { return m_r->size_ubuf; };
|
|
int32_t getRedirUrlSize () { return m_r->size_rubuf; };
|
|
int32_t getNumSummaryLocs() { return m_r->size_summLocs/sizeof(uint64_t); }
|
|
int32_t getNumSummaryLocsPops() { return m_r->size_summLocs/sizeof(int32_t); }
|
|
|
|
int32_t getUrlLen () { return m_r->size_ubuf-1; };
|
|
int32_t getRedirUrlLen () { return m_r->size_rubuf-1; };
|
|
int32_t getOutlinksBufLen() { return m_r->size_obuf; };
|
|
int32_t getLinksBufLen () { return m_r->size_obuf; };
|
|
int32_t getMatchOffBufLen () { return m_r->size_mbuf; };
|
|
int32_t getDocLen () { return m_r->m_contentLen; };
|
|
int32_t getContentLen () { return m_r->m_contentLen; };
|
|
bool isSumFromDmoz () { return m_r->m_sumFromDmoz; };
|
|
int32_t getIp () { return m_r->m_ip; };
|
|
int64_t getDomainHash () { return m_r->m_domHash; };
|
|
unsigned char getLanguage () { return m_r->m_language; };
|
|
unsigned char getSummaryLanguage() { return m_r->m_summaryLanguage; };
|
|
uint16_t getCountry () { return m_r->m_country; };
|
|
uint16_t getComputedCountry(){ return m_r->m_computedCountry; }
|
|
int16_t getCharset () { return m_r->m_charset; };
|
|
char getUrlFilterNum () { return m_r->m_urlFilterNum; }
|
|
int64_t getDocId () { return m_r->m_docId; };
|
|
time_t getFirstSpidered () { return m_r->m_firstSpidered; };
|
|
time_t getLastSpidered () { return m_r->m_lastSpidered; };
|
|
time_t getNextSpiderDate() { return m_r->m_nextSpiderDate; };
|
|
time_t getLastModified () { return m_r->m_lastModified; };
|
|
time_t getDatedbDate () { return m_r->m_datedbDate; };
|
|
bool getDatedbDateIsEstimated () {
|
|
return m_r->m_datedbDateIsEstimated; };
|
|
int32_t getRuleset () { return m_r->m_ruleset; };
|
|
int32_t getHostHash () { return m_r->m_hostHash; };
|
|
// if this is true, do not display a [cached] link for this result
|
|
char getNoArchive () { return m_r->m_noArchive; };
|
|
// content-type codes are in HttpMime.h (CT_HTML, CT_PDF, ...)
|
|
char getContentType () { return m_r->m_contentType; };
|
|
unsigned char getQuality () { return m_r->m_docQuality; };
|
|
unsigned char getDocQuality () { return m_r->m_docQuality; };
|
|
//int32_t getNumOutlinks () { return m_r->m_numOutlinks; };
|
|
bool isBanned () { return m_r->m_isBanned; };
|
|
bool isNormalized () { return m_r->m_isNormalized; } ;
|
|
bool hasAllQueryTerms () { return m_r->m_hasAllQueryTerms; };
|
|
char getHopCount () { return m_r->m_hopcount; };
|
|
float getDiversity () { return m_r->m_diversity;}
|
|
|
|
|
|
// . parse the TitleRec::m_flags3 variable
|
|
// . if the datedb date was estimated that means we computed it using
|
|
// the bisection method. so print "Updated XXXX" instead of
|
|
// "Published XXXX" next to the result.
|
|
char datedbDateIsEstimated () {
|
|
return m_r->m_flags3 & FLAG3_DATEDB_DATE_IS_ESTIMATED; };
|
|
|
|
float getProximityScore () {return m_r->m_proximityScore;};
|
|
int32_t getInSectionScore () {return m_r->m_inSectionScore;};
|
|
int32_t getSummaryScore () {return m_r->m_docSummaryScore; };
|
|
|
|
TagRec *getTagRec (){return (TagRec *)m_r->ptr_tagRec; };
|
|
int32_t getTagRecSize (){return m_r->size_tagRec; };
|
|
int32_t getNumCatids (){return m_r->size_catids/4; };
|
|
int32_t getNumIndCatids (){return m_r->size_indCatids/4; };
|
|
int32_t *getDmozCatIds (){return (int32_t *)m_r->ptr_catids; };
|
|
int32_t *getDmozCatids (){return (int32_t *)m_r->ptr_catids; };
|
|
int32_t *getDmozIndCatIds (){return (int32_t *)m_r->ptr_indCatids; };
|
|
int32_t *getDmozIndCatids (){return (int32_t *)m_r->ptr_indCatids; };
|
|
char *getDmozTitles (){return m_r->ptr_dmozTitles; };
|
|
int32_t *getDmozTitleLens (){return (int32_t *)m_r->ptr_dmozTitleLens;};
|
|
char *getDmozSumms (){return m_r->ptr_dmozSumms; };
|
|
int32_t *getDmozSummLens (){return (int32_t *)m_r->ptr_dmozSummLens; };
|
|
char *getDmozAnchors (){return m_r->ptr_dmozAnchors; };
|
|
uint8_t *getDmozAnchorLens (){
|
|
return(uint8_t *)m_r->ptr_dmozAnchorLens;};
|
|
|
|
uint32_t getTagVectorHash () {return m_r->m_tagVectorHash; }
|
|
uint32_t getGigabitVectorHash() {return m_r->m_gigabitVectorHash; }
|
|
|
|
char* getAdIds () {return m_r->ptr_gbAdIds;}
|
|
int32_t getAdIdsSize () {return m_r->size_gbAdIds;}
|
|
*/
|
|
static int32_t getApproxLinkCount(char* content, int32_t contentLen);
|
|
//static int32_t getLinkHashes(Links& ln, char* buf, int32_t bufSize);
|
|
|
|
//char *getNextDisplayBuf ( int32_t *len , char **next ) {
|
|
// return m_r->getNextDisplayBuf(len,next); };
|
|
|
|
// for sending the request
|
|
Multicast m_mcast;
|
|
|
|
void gotReply ( class UdpSlot *slot );
|
|
|
|
// general purpose routines
|
|
Msg20();
|
|
~Msg20();
|
|
// so we can alloc arrays of these using mmalloc()
|
|
void constructor ();
|
|
void destructor ();
|
|
void freeReply ();
|
|
void reset ();
|
|
|
|
void *m_hack;
|
|
int32_t m_hack2;
|
|
int32_t m_ii;
|
|
|
|
// is the reply in progress? if msg20 has not launched a request
|
|
// this is false. if msg20 received its reply, this is false.
|
|
// otherwise this is true.
|
|
bool m_inProgress;
|
|
bool m_launched;
|
|
|
|
char m_ownReply;
|
|
char m_expected;
|
|
|
|
bool (*m_callback ) ( void *state );
|
|
void (*m_callback2) ( void *state );
|
|
void *m_state;
|
|
|
|
// used by MsgE to store its data
|
|
void *m_state2;
|
|
void *m_state3;
|
|
|
|
void *m_owningParent;
|
|
char m_constructedId;
|
|
|
|
// PostQueryRerank storage area for printing out in PageResults.cpp
|
|
float m_pqr_old_score ;
|
|
float m_pqr_factor_diversity ;
|
|
float m_pqr_factor_quality ;
|
|
float m_pqr_factor_inlinkers ;
|
|
float m_pqr_factor_proximity ;
|
|
float m_pqr_factor_ctype ;
|
|
float m_pqr_factor_lang ; // includes country
|
|
};
|
|
|
|
#endif
|
|
|