privacore-open-source-searc.../Msg2.h
2016-08-30 16:37:53 +02:00

143 lines
3.5 KiB
C++

// Matt Wells, copyright July 2001
#ifndef GB_MSG2_H
#define GB_MSG2_H
#include "Msg5.h"
#include "RdbList.h"
#include "max_niceness.h"
#include "GbMutex.h"
#include "GbSignature.h"
// support the &sites=xyz.com+abc.com+... to restrict search results to provided sites.
#define MAX_WHITELISTS 500
class QueryTerm;
/**
*
* Msg2 is the "message" that gets the term lists from disk.
* It is one of the fundamental building blocks of the search: given a
* query term, it returns the documents that contain that term, read from
* disk. Each returned list takes the form of an RdbList of PosDB keys.
*
* It is used by Msg39 (which performs retrieval+intersection+ranking, see its doc).
* The main method of this message is getLists().
*
* For efficiency, getLists() retrieves several RdbLists at once, one per
* query term. Msg2 retains access to the returned RdbLists; use getList(i)
* to obtain the list for the i-th query term.
*
*/
class Msg2 {
public:
Msg2();
~Msg2();
// NOTE(review): presumably returns the object to its freshly-constructed
// state so it can be reused; the definition is not in this header - confirm.
void reset();
/** Main function of Msg2. Fetches the term lists (one per query term) and stores
* them in "lists". A copy is kept in this->m_lists.
* Returns false if blocked, true otherwise.
* Sets "errno" on error.
* NOTE(review): the original note here said '"termIds/termFreqs" should NOT be on
* the stack in case we block', but no such parameters exist any more. Presumably
* the same lifetime requirement applies to "qterms" and "lists" - confirm.
* NOTE(review): "callback" is presumably invoked with "state" once a blocked
* request completes - confirm against the implementation.
*/
bool getLists(collnum_t collnum,
bool addToCache,
const QueryTerm *qterms,
int32_t numQterms,
// restrict search results to this list of sites,
// i.e. "abc.com+xyz.com+..." (Custom Search)
const char *whiteList,
// for intersecting ranges of docids separately
// to prevent OOM errors
// NOTE(review): the remark above presumably describes
// docIdStart/docIdEnd rather than fileNum - confirm.
int fileNum,
int64_t docIdStart,
int64_t docIdEnd,
RdbList *lists,
void *state,
void (*callback)(void *state),
bool allowHighFrequencyTermCache,
int32_t niceness = MAX_NICENESS,
bool isDebug = false);
/** Get list "i". Once getLists(...) has been called, the lists are kept in m_lists.
* Returns the address of m_lists[i]; no bounds checking is done on "i". */
RdbList *getList(int32_t i) {
return &m_lists[i];
}
/** Return the number of lists == the number of query terms. */
int32_t getNumLists() const {
return m_numLists;
}
/** Accessors for the stored docid range (m_docIdStart / m_docIdEnd). */
int64_t docIdStart() const { return m_docIdStart; }
int64_t docIdEnd() const { return m_docIdEnd; }
/** Returns m_w.
* NOTE(review): a separate m_numWhitelists member also exists; confirm that
* m_w is the intended count to expose here. */
int32_t getNumWhiteLists() const { return m_w; }
/** Returns the address of m_whiteLists[i]; no bounds checking is done on "i". */
RdbList *getWhiteList(int32_t i) { return &(m_whiteLists[i]); }
private:
// macro supplied by GbSignature.h
// NOTE(review): presumably embeds an object-validity signature - confirm.
declare_signature
int m_fileNum;
// list of sites to restrict search results to. space separated
const char *m_whiteList;
int64_t m_docIdStart;
int64_t m_docIdEnd;
// NOTE(review): m_p and m_w look like a cursor into m_whiteList and a
// whitelist counter respectively; not verifiable from this header - confirm.
const char *m_p;
int32_t m_w;
RdbList *m_whiteLists;
int32_t m_numWhitelists;
// internal helper method that actually does the fetching of the lists
bool getLists();
// NOTE(review): presumably these hand out and give back entries of m_msg5,
// using m_avail to track which ones are free - confirm in the implementation.
Msg5 *getAvailMsg5();
void returnMsg5(Msg5 *msg5);
bool gotList();
// NOTE(review): the original comment said "we can get up to MAX_QUERY_TERMS
// term frequencies at the same time", but m_msg5 is a plain pointer here,
// so its capacity is decided elsewhere - confirm.
Msg5 *m_msg5;
bool *m_avail; // which msg5s are available?
// NOTE(review): presumably guards m_msg5/m_avail - confirm.
GbMutex m_mtxMsg5;
int32_t m_errno;
// the term lists; entries are exposed through getList()
RdbList *m_lists;
const QueryTerm *m_qterms;
// number of lists, exposed through getNumLists()
int32_t m_numLists;
// NOTE(review): purpose of m_getComponents is not evident from this header.
bool m_getComponents;
bool m_addToCache;
collnum_t m_collnum;
bool m_allowHighFrequencyTermCache;
int32_t m_numReplies;
int32_t m_numRequests;
bool m_requestsBeingSubmitted;
GbMutex m_mtxCounters; //protects the two counters and flag above
void incrementRequestCount();
bool incrementReplyCount();
bool allRequestsReplied();
// NOTE(review): the static overload has the shape of a completion callback
// (state, list, msg5) and presumably forwards to the member overload - confirm.
static void gotListWrapper(void *state, RdbList *list, Msg5 *msg5);
void gotListWrapper(Msg5 *msg5);
// caller-supplied state pointer and callback; the callback takes the state
// pointer as its only argument
void *m_state;
void (*m_callback)(void *state);
int32_t m_niceness;
// if this is true we log more output
bool m_isDebug;
// start time
// NOTE(review): unit and clock source are not visible in this header.
int64_t m_startTime;
};
#endif // GB_MSG2_H