privacore-open-source-searc…/Matches.h

// Matt Wells, copyright Jul 2001

#ifndef GB_MATCHES_H
#define GB_MATCHES_H

#include "tokenizer.h"
#include "Pos.h"
#include "Bits.h"

// . do not hash more than this many query words into the hash table
// . the Matches::m_qtableIds / m_qtableWordNums / m_qtableFlags arrays are
//   each declared with three times this many slots
#define MAX_QUERY_WORDS_TO_MATCH 1000

// . capacity of the Matches::m_matches[] array
// . raised from 500 to 3000 to better support the "BIG HACK" and the huge
//   UOR queries that SearchInput.cpp builds from facebook interests
// . beware: collecting 3000 matches slows down the summary generator a lot
#define MAX_MATCHES              3000

// . capacity of the per-match-group arrays in Matches
//   (m_tokenizerResultArray, m_bitsArray and m_posArray)
#define MAX_MATCHGROUPS 300

// . "match flags" type: holds the MF_* bit values defined below
// . used for Match::m_flags and for the Matches::m_qwordFlags array
typedef int32_t mf_t;

// . values for Match::m_flags
// . dictates the "match group" that the match belongs to
// . every value is a distinct single bit, so they can be tested with '&'
//   (as Matches::isTitleMatch() does with MF_TITLEGEN)
// . bits 0x0008, 0x0020, 0x0400 and 0x0800 are not assigned in this list
#define MF_TITLEGEN                   0x0001 // in generated title?
#define MF_TITLETAG                   0x0002 // presumably: in the title tag -- TODO confirm
#define MF_LINK                       0x0004 // in non-anomalous link text
#define MF_HOOD                       0x0010 // in non-anomalous neighborhood
#define MF_BODY                       0x0040 // in body
#define MF_METASUMM                   0x0080 // in meta summary
#define MF_METADESC                   0x0100 // in meta description
#define MF_METAKEYW                   0x0200 // in meta keywords
#define MF_RSSTITLE                   0x1000 // presumably: in an RSS title -- TODO confirm
#define MF_RSSDESC                    0x2000 // presumably: in an RSS description -- TODO confirm
#define MF_URL                        0x4000  // in url

// . forward declarations only: this header refers to these classes solely
//   through pointers, so their full definitions are not needed here
// . a translation unit that dereferences them must include the real headers
class Xml;
class Sections;
class Url;
class LinkInfo;
class Title;
class Phrases;
class QueryTerm;
class Query;

// . one match of a query word (or phrase) against a piece of document text
// . plain record: the class declares no constructor and no default member
//   values, so it does not initialize its own fields
// . it declares no destructor either, so it never frees the objects its
//   pointer members refer to
class Match {
 public:
	// . word # we match in the document
	// . NOTE(review): the original comment said 'using "m_words" below',
	//   but no m_words member exists; m_tr appears to be what is meant
	int32_t m_wordNum;

	// # of words in this match, like if we match a phrase
	// we have > 1 words in the match
	int32_t m_numWords;

	// word # we match in the query
	int32_t m_qwordNum;

	// # of query words we match if we are a phrase, otherwise
	// this is 1
	int32_t m_numQWords;

	// "match group" or type of match. i.e. MF_TITLETAG, MF_METASUMM, ...
	mf_t m_flags;

	// . for convenience, these four class ptrs are used by Summary.cpp
	// . they describe the text this match was found in; m_wordNum is
	//   relative to that text
	// . NOTE(review): the original comment spoke of a "words" class and of
	//   "scores"; neither exists here, m_tr presumably took over from
	//   "words" -- confirm
	const TokenizerResult  *m_tr;
	const Sections *m_sections;
	const Bits     *m_bits;
	const Pos      *m_pos;
};

// . collects the places where words of a Query occur in the text belonging to
//   a document, as a flat array of Match records (m_matches[])
// . the text is handled as separate "match groups" (body, one link text, one
//   meta tag, ...); each Match carries the MF_* flag of its group
// . all storage except m_qwordFlags is fixed-size arrays inside the object
//   (MAX_MATCHES Match records, MAX_MATCHGROUPS tokenizer/bits/pos objects),
//   so a Matches instance is large
// . objects of this class cannot be copied, see the deleted members below
class Matches {

 public:

	// . selects the query to match against
	// . NOTE(review): the query is presumably remembered in m_q by pointer
	//   and must then outlive the matching calls -- confirm in Matches.cpp
	void setQuery(const Query *q);

	// . takes the already-parsed body (tokens, phrases, sections, bits, pos)
	//   plus the xml, title, first url and link info of a document
	// . NOTE(review): presumably adds the matches of every match group and
	//   returns false on error like addMatches() below -- confirm
	bool set(const TokenizerResult *bodyTr, Phrases *bodyPhrases,
		 const Sections *bodySections, const Bits *bodyBits, const Pos *bodyPos, Xml *xml,
		 const Title *tt, const Url *firstUrl, LinkInfo *linkInfo);

	// . adds the matches found in the text buffer "s" of length "slen"
	// . "flags" is the MF_* match group the text belongs to
	bool addMatches(const char *s, int32_t slen, mf_t flags );

	// . adds the matches found in an already tokenized text
	// . "flags" is the MF_* match group the text belongs to
	// . phrases, sections, bits and pos are optional and default to NULL
	// . returns false and sets errno on error
	// . NOTE(review): the previous comment described m_matches[i] as -1 or
	//   a query term #; m_matches[] is an array of Match records, so that
	//   description no longer fits the declaration and was dropped
	bool addMatches(const TokenizerResult *tr, Phrases *phrases = NULL, const Sections *sections = NULL,
			const Bits *bits = NULL, const Pos *pos = NULL, mf_t flags = 0 );

	// how many matches are there? (the value of m_numMatches)
	int32_t getNumMatches() const {
		return m_numMatches;
	}

	// . "i" is an index into m_matches[]
	// . no range check is done here; NOTE(review): callers presumably keep
	//   0 <= i < getNumMatches()
	const Match& getMatch(int i) const { return m_matches[i]; }

	// . "i" is a QUERY WORD number, not a match number: it indexes
	//   m_qwordFlags[], which is 1-1 with Query::m_qwords[]
	// . true if that query word has the MF_TITLEGEN bit set
	// . no range or NULL check is done here
	bool isTitleMatch(int i) const { return ( m_qwordFlags[i] & MF_TITLEGEN ) != 0; }

	// janitorial stuff
	Matches();
	~Matches();
	void reset();

	// . copying is disabled
	// . the class declares a destructor and holds the raw m_qwordFlags
	//   pointer; a memberwise copy would leave two objects holding the
	//   same pointer value
	// . NOTE(review): m_qwordFlags is presumed to be released by
	//   reset()/the destructor -- confirm in Matches.cpp
	Matches(const Matches &) = delete;
	Matches &operator=(const Matches &) = delete;

private:
	void reset2();
	bool isMatchableTerm(const QueryTerm *qt) const;
	int32_t getNumWordsInMatch(const TokenizerResult *tr, unsigned wn, int32_t n, int32_t *numQWords, int32_t *qwn,
				   bool allowPunctInPhrase = true);

	// . 1-1 with Query::m_qwords[] array of QWords
	// . shows the match flags for that query word
	mf_t     *m_qwordFlags;

	// the matches themselves and how many of the slots are in use
	Match  m_matches[MAX_MATCHES];
	int32_t   m_numMatches;

	// . hash query word ids into a small hash table
	// . we use this to see what words in the document are query terms
	// . the three arrays are parallel: same slot index in each
	int64_t m_qtableIds      [ MAX_QUERY_WORDS_TO_MATCH * 3 ];
	int32_t      m_qtableWordNums [ MAX_QUERY_WORDS_TO_MATCH * 3 ];
	char      m_qtableFlags    [ MAX_QUERY_WORDS_TO_MATCH * 3 ];
	int32_t      m_numSlots;
	const Query *m_q;
	int32_t      m_numAlnums;

	// . NOTE(review): presumably m_qwordFlags points into m_tmpBuf when the
	//   flags fit in it and at a separate allocation of m_qwordAllocSize
	//   bytes otherwise -- confirm in Matches.cpp
	int32_t m_qwordAllocSize;
	char m_tmpBuf[128];

	// . one tokenizer-result/bits/pos object per "match group"
	// . match groups examples = body, a single link text, a meta tag, etc.
	// . match groups are basically disjoint chunks of text information
	// . the document body (web page) is considered a single match group
	// . a single link text is considered a match group
	// . a single meta summary tag is a match group, ...
	int32_t      m_numMatchGroups;

	TokenizerResult  m_tokenizerResultArray[MAX_MATCHGROUPS];
	Bits     m_bitsArray     [MAX_MATCHGROUPS];
	Pos      m_posArray      [MAX_MATCHGROUPS];
};

#endif // GB_MATCHES_H