// Matt Wells, copyright Jul 2001

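// . generate phrases and store their hashes into the m_phraseIds2[] and
//   m_phraseIds3[] arrays (the original m_phraseIds[] array is disabled)
// . the hash() method that hashed the phraseIds into the TermTable
//   (hashtable) is currently commented out of this class
// . will a word be hashed as a phrase if it's the only word? No, it will not.
//   only phrases of 2+ words are hashed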

#ifndef _PHRASES_H_
#define _PHRASES_H_

//#include "TermTable.h"
#include "Bits.h"
//#include "Spam.h"
//#include "Scores.h"
#include "Words.h"
//#include "Weights.h"

// size in bytes of Phrases::m_localBuf, the char buffer embedded in every
// Phrases object; it scales with MAX_WORDS (defined outside this header)
#define PHRASE_BUF_SIZE (MAX_WORDS * 14)

// sentinel stored in Phrases::m_phraseSpam[] for a word that cannot start a
// phrase (its phraseId is invalid); any other value is the phrase's spam
// percentage, typically 0 to 100
#define PSKIP 201

// . computes phrase hashes (phraseIds) for the words of a Words instance
// . the per-word arrays below are 1-1 with the words in the Words class
// . all members are public (the "private:" label below is commented out)
class Phrases {

 public:

	Phrases();
	~Phrases();
	void reset() ;

	// . convenience wrapper around set()
	// . passes useStopWords=true, useStems=false and
	//   TITLEREC_CURRENT_VERSION as the titleRecVersion
	bool set2 ( Words *words, Bits *bits , int32_t niceness ) {
		return set ( words , bits ,
			     true  , // useStopWords
			     false , // useStems
			     TITLEREC_CURRENT_VERSION ,
			     niceness ); }

	// . set the hashes (phraseIds) of the phrases for these words
	// . a phraseSpam of PSKIP means word is not in a phrase
	// . "bits" describes the words in a phrasing context
	// . the old "spam" and "scores" parameters are disabled
	// . NOTE(review): meaning of the bool return is not visible in this
	//   header -- presumably false on error, confirm in the definition
	bool set ( Words    *words, 
		   Bits     *bits ,
		   //Spam     *spam ,
		   //Scores   *scores ,
		   bool      useStopWords ,
		   bool      useStems     ,
		   int32_t      titleRecVersion,
		   int32_t      niceness);

	// . read-only accessors, const so they work on a const Phrases too
	// . "n" indexes the per-word arrays and is NOT range checked
	//int64_t getPhraseId   ( int32_t n ) { return m_phraseIds [n]; };
	int64_t getPhraseId2  ( int32_t n ) const { return m_phraseIds2[n]; }
	//int64_t *getPhraseIds (        ) { return m_phraseIds ; };
	int64_t *getPhraseIds2(        ) const { return m_phraseIds2; }
	int64_t *getPhraseIds3(        ) const { return m_phraseIds3; }
	//int64_t *getPhraseIds4(        ) { return m_phraseIds4; };
	//int64_t *getPhraseIds5(        ) { return m_phraseIds5; };

	//int64_t *getStripPhraseIds (      ) { return m_stripPhraseIds ; };
	//int64_t getStripPhraseId   ( int32_t n ) 
	//{ return m_stripPhraseIds [n]; };

	// the raw m_phraseSpam[n] entry, which is PSKIP if word #n has no
	// valid phraseId
	int32_t      getPhraseSpam ( int32_t n ) const {
		return m_phraseSpam[n]; }
	// . these two are the same test: word #n's phraseSpam is not PSKIP
	bool      hasPhraseId   ( int32_t n ) const {
		return ( m_phraseSpam[n] != PSKIP ); }
	bool      startsAPhrase ( int32_t n ) const {
		return ( m_phraseSpam[n] != PSKIP ); }
	// defined outside this header, so its signature is left as is
	bool      isInPhrase    ( int32_t n ) ;

	// . the accessors below are disabled
	// . often word #i is involved in 2 phrases
	// . the phraseId stored for word #i is only the one he starts
	// . these got the one he's in the middle of or on the right of
	// . were used by Query.cpp for phrase-forcing
	//int64_t getLeftPhraseId       ( int32_t i ) ;
	//int64_t getLeftStripPhraseId  ( int32_t i ) ;
	//int32_t      getLeftPhraseIndex    ( int32_t i ) ;

	// . disabled: hashed the phraseIds into a TermTable
	// . each non-spammy occurrence of phrase adds "baseScore" to its score
	/*
	bool hash ( TermTable      *table       ,
		    Weights        *weightsPtr  ,
		    uint32_t   baseScore   ,
		    uint32_t   maxScore    ,
		    int64_t       startHash   ,
		    char           *prefix1     ,
		    int32_t            prefixLen1  ,
		    char           *prefix2     ,
		    int32_t            prefixLen2  ,
		    bool            hashUniqueOnly ,
		    int32_t            titleRecVersion,
		    int32_t            niceness = 0);
	*/

	// . get the phrase that starts with word #i
	// . returns a char pointer; there is no "dest" parameter anymore
	// . NOTE(review): presumably the returned text is NULL terminated,
	//   its length is stored in *phrLen and "npw" is the number of
	//   words in the phrase -- confirm against the definition
	char *getPhrase ( int32_t i , int32_t *phrLen , int32_t npw );
	//char *getNWordPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) ;
	//char *getStripPhrase ( int32_t i , int32_t *phrLen );

	//int32_t  getNumWords         ( int32_t i ) { return m_numWordsTotal[i]; };
	//int32_t  getNumWordsInPhrase ( int32_t i ) { return m_numWordsTotal [i]; };
	// # words TOTAL (including punct words) in the two word phrase
	// that word #i starts
	int32_t  getNumWordsInPhrase2( int32_t i ) const {
		return m_numWordsTotal2[i]; }

	// . NOTE(review): presumably these return the word count of the
	//   longest/shortest phrase starting at word #i and store that
	//   phrase's id in *pid -- confirm against the definitions
	int32_t  getMaxWordsInPhrase( int32_t i , int64_t *pid ) ;
	int32_t  getMinWordsInPhrase( int32_t i , int64_t *pid ) ;

	// . leave this public so SimpleQuery.cpp can mess with it
	// . called by Phrases::set() above for each i
	// . we set phraseSpam to 0 to 100% typically
	// . we set phraseSpam to PSKIP if word #i cannot start a phrase
	void setPhrase ( int32_t i ,
			 int32_t niceness);

	// private:

	// buffer embedded in the object, PHRASE_BUF_SIZE bytes
	char  m_localBuf [ PHRASE_BUF_SIZE ];

	// NOTE(review): presumably the buffer backing the arrays below,
	// either m_localBuf or a larger allocation -- confirm in set()
	char *m_buf;
	int32_t  m_bufSize;

	// . these are 1-1 with the words in the Words class
	// . phraseSpam is PSKIP if the phraseId is invalid
	//int64_t     *m_phraseIds  ;
	// the two word hash
	int64_t     *m_phraseIds2  ;
	// NOTE(review): presumably the three word hash -- confirm
	int64_t     *m_phraseIds3  ;
	//int64_t     *m_phraseIds4  ;
	//int64_t     *m_phraseIds5  ;
	//int64_t     *m_stripPhraseIds  ;
	unsigned char *m_phraseSpam ;
	// . # words in phrase TOTAL (including punct words)
	// . used for printing
	// . used by SimpleQuery::getTermIds() for setting word ranges
	//   for phrases
	//unsigned char *m_numWordsTotal ;
	// for the two word phrases:
	unsigned char *m_numWordsTotal2 ;
	unsigned char *m_numWordsTotal3 ;
	//unsigned char *m_numWordsTotal4 ;
	//unsigned char *m_numWordsTotal5 ;
	int32_t           m_numPhrases; // should equal the # of words

	// placeholders to avoid passing to subroutine
	Words      *m_words;
	int64_t  *m_wids;
	char      **m_wptrs;
	int32_t       *m_wlens;

	Bits    *m_bits;
	bool     m_useStems;
	bool     m_useStopWords;
	int32_t     m_titleRecVersion;

	// replaces Scores
	//class Sections *m_sections;
	//class Section  *m_sectionPtrs;

	// word scores, set in Scores.cpp
	//int32_t    *m_wordScores;
	// the score of the phrase is the min of the scores of the words that
	// make up the phrase
	//int32_t    *m_phraseScores ;
};

#endif