// Matt Wells, copyright Jun 2007

// . gets the clusterRecs for a list of docIds
// . list of docids can be from an IndexList if provided, or a straight-up array
// . meant as a replacement for some of Msg38
// . see Clusterdb.h for format of clusterRec
// . actually only stores the lower 64 bits of each cluster rec, that is all
//   that is interesting

#ifndef GB_MSG51_H
#define GB_MSG51_H

#include "Msg0.h"
#include "Clusterdb.h"
#include "RdbList.h"
#include "Msg5.h"
#include "GbSignature.h"
#include <pthread.h>

// . m_clusterLevels[i] takes on one of these values
// . these describe a docid
// . they tell us why the docid is not ok to be displayed in the search results
// . this is used as part of the post query filtering step, after we get the
//   resulting docids from Msg3a.
// . some of these are set in Msg51.cpp, but most are set in Msg40.cpp
// . only CR_NOTFOUND has an explicit value; every other enumerator is the
//   previous one plus one, so inserting or reordering entries renumbers
//   everything after the change
enum {
	// if clusterdb rec was not found...
	CR_NOTFOUND = 0 ,

	// clusterdb rec never set... how did this happen?
	CR_UNINIT ,

	// we got the clusterdb rec, this is a transitional value.
	CR_GOT_REC ,

	// had adult content
	CR_DIRTY ,

	// language did not match the language filter (iff langFilter>0)
	CR_WRONG_LANG ,

	// a 3rd+ result from the same hostname
	CR_CLUSTERED ,

	// has xml tag syntax in the url
	CR_BAD_URL ,

	// the url is banned in tagdb or url filters table
	CR_BANNED_URL ,

	// the title & summary is empty
	CR_EMPTY_TITLE_SUMMARY ,

	// error getting summary (Msg20::m_errno is set)
	CR_ERROR_SUMMARY ,

	// a summary dup of a higher-scoring result
	CR_DUP_SUMMARY ,

	// another error getting it... could be one of many
	CR_ERROR_CLUSTERDB ,

	// the url is a dup of a previous url (wiki pages capitalization)
	CR_DUP_URL ,

	// the url doesn't have any content due to simplified redirection
	// page/non-canonical page
	CR_EMPTY_REDIRECTION_PAGE ,

	// the docid is ok to display!
	CR_OK ,

	// from a blacklisted site hash
	CR_BLACKLISTED_SITE ,

	// was filtered because of ruleset
	CR_RULESET_FILTERED ,

	// URL (or site) classified as malicious (spyware, trojan, phishing, ...)
	CR_MALICIOUS ,

	// verify this is LAST entry cuz we use i<CR_END for ending for-loops
	CR_END
};

// NOTE(review): presumably the printable names of the CR_* values above,
// indexed by cluster level -- defined elsewhere, confirm it stays in sync
// with the enum whenever an entry is added
extern const char * const g_crStrings[];

// . writes its result into clusterLevels[] (output); every other argument
//   is an input
// . NOTE(review): the implementation is not visible from this header;
//   presumably clusterRecs[], docIds[] and clusterLevels[] are parallel
//   arrays of numRecs entries and each output entry is one of the CR_*
//   values -- confirm against the definition
// . NOTE(review): the meaning of the bool return value is not documented
//   here -- confirm against the definition
bool setClusterLevels ( const key96_t *clusterRecs ,
			const int64_t *docIds ,
			int32_t        numRecs ,
			int32_t        maxDocIdsPerHostname ,
			bool           doHostnameClustering ,
			bool           familyFilter ,
			bool           isDebug ,
			// output to clusterLevels[]
			char          *clusterLevels );

class Msg51 {
public:
Msg51();
~Msg51();
void reset();
// . returns false if blocked, true otherwise
// . sets g_errno on error
2014-11-10 14:45:11 -08:00
// . we just store the "int32_t" part of the cluster rec
2016-08-12 16:40:51 +02:00
bool getClusterRecs ( const int64_t *docIds,
2013-08-02 13:12:24 -07:00
char *clusterLevels ,
key96_t *clusterRecs ,
2014-11-10 14:45:11 -08:00
int32_t numDocIds ,
collnum_t collnum ,
2013-08-02 13:12:24 -07:00
void *state ,
void (* callback)( void *state ) ,
2014-11-10 14:45:11 -08:00
int32_t niceness ,
2013-08-02 13:12:24 -07:00
// output to clusterRecs[]
bool isDebug ) ;
// see Clusterdb.h for this bitmap. we store the lower 64 bits of
// the clusterdb key into the "clusterRecs" array
2016-08-12 16:29:13 +02:00
//bool isFamilyBitOn ( uint64_t clusterRec ) {
// return g_clusterdb.hasAdultContent((char *)&clusterRec); }
//char getLangId ( uint64_t clusterRec ) {
// return g_clusterdb.getLanguage((char *)&clusterRec); }
//uint32_t getSiteHash26 ( uint64_t clusterRec ) {
// return g_clusterdb.getSiteHash26((char *)&clusterRec); }
2013-08-02 13:12:24 -07:00
key96_t getClusterRec ( int32_t i ) const { return m_clusterRecs[i]; }
2013-08-02 13:12:24 -07:00
2016-08-12 16:29:13 +02:00
private:
2014-11-10 14:45:11 -08:00
bool sendRequests ( int32_t k );
2016-08-12 17:47:27 +02:00
bool sendRequests_unlocked(int32_t k);
2014-11-10 14:45:11 -08:00
bool sendRequest ( int32_t i );
2013-08-02 13:12:24 -07:00
2016-08-24 15:46:01 +02:00
declare_signature
2013-08-02 13:12:24 -07:00
// docIds we're getting clusterRecs for
2016-08-12 16:40:51 +02:00
const int64_t *m_docIds;
2014-11-10 14:45:11 -08:00
int32_t m_numDocIds;
2013-08-02 13:12:24 -07:00
// the lower 64 bits of each cluster rec
key96_t *m_clusterRecs;
2013-08-02 13:12:24 -07:00
char *m_clusterLevels;
void (*m_callback ) ( void *state );
void *m_state;
2016-08-12 17:47:27 +02:00
pthread_mutex_t m_mtx; //protects m_nexti, m_numXxxx, m_slot, etc.
2013-08-02 13:12:24 -07:00
// next cluster rec # to get (for m_docIds[m_nexti])
2014-11-10 14:45:11 -08:00
int32_t m_nexti;
2013-08-02 13:12:24 -07:00
// use to get the cluster recs
2014-11-10 14:45:11 -08:00
int32_t m_numRequests;
int32_t m_numReplies;
int32_t m_errno;
2013-08-02 13:12:24 -07:00
2014-11-10 14:45:11 -08:00
int32_t m_niceness;
2013-08-02 13:12:24 -07:00
collnum_t m_collnum;
2013-08-02 13:12:24 -07:00
bool m_isDebug;
struct Slot {
Msg51 *m_msg51; //points to self
Msg0 m_msg0;
RdbList m_list;
Msg5 m_msg5;
bool m_inUse;
int32_t m_ci;
};
Slot *m_slot;
int32_t m_numSlots;
static void gotClusterRecWrapper51(void *state);
void gotClusterRec(Slot *slot);
2013-08-02 13:12:24 -07:00
};
// a forward declaration suffices: this header only declares the cache object
class RdbCache;

// declared here, defined elsewhere; note that despite the s_ prefix this
// name has external linkage
extern RdbCache s_clusterdbQuickCache;

#endif // GB_MSG51_H