open-source-search-engine/src/Language.h


#ifndef _LANGUAGE_H_
#define _LANGUAGE_H_
//#include <wchar.h>
#include "gb-include.h"
//#include "UnicodeProperties.h" //UChar32
#include "File.h"
#include "HashTableT.h"
#include "Query.h"
#include "Lang.h"
#include "Multicast.h"
#include "Threads.h"
#include "Titledb.h"
#include "Iso8859.h"
#include "IndexList.h"
//#include "Msg3a.h"

#include "Msg20.h"
#include "Msg37.h"

// max chars in any language (sizes Language::m_ruleStarts / m_ruleChars)
#define MAX_CHARS 256
// parenthesized so the product stays intact when the macro is embedded in
// a larger expression, e.g. "x / TOP_POP_PHRASES" or "x % TOP_POP_PHRASES"
#define TOP_POP_PHRASES (40 * 1024)
#define NUM_CHARS 40
#define MAX_FRAG_SIZE 1030
// max chars that start the rule
// NOTE(review): no define follows the comment above in this header;
// presumably a leftover -- confirm before deleting

// capacity in bytes of the Reco::reco text buffer
#define MAX_PHRASE_LEN 80
#define MAX_RECOMMENDATIONS 10
#define LARGE_SCORE 0xfffff
#define MAX_NARROW_SEARCHES 19

/*
// used only while generating titles from wikipedia pages, makeWikiFiles()
class StateWik {
public:
	bool getIndexList(  );
	bool getSummary (  );
	bool gotSummary (  );

	int       m_fdw;
	Msg0      m_msg0;
	IndexList m_list;
	Query     m_q;
	key_t     m_startKey;
	key_t     m_endKey;
	char     *m_coll;
	int32_t      m_collLen;
	int64_t m_termId;
	int32_t      m_minRecSize;
	Msg20     m_msg20s[MAX_FRAG_SIZE];
	int32_t      m_numMsg20sOutstanding;
	int32_t      m_numMsg20sLaunched;
	int32_t      m_numMsg20sReceived;
};

class StateDict{
 public:
	char      *m_dictBuf;
	int32_t       m_dictBufSize;
	char      *m_buf;
	int32_t       m_bufSize;
	char     **m_wordsPtr;
	int64_t *m_termIds;
	int64_t *m_termFreqs;
	int32_t       m_numTuples;
	Msg37      m_msg37;
};
*/

/*class StateAff{
 public:
	bool openAffinityFile ( );
	bool launchAffinity ( );
	bool gotAffinityFreqs1 ( );
	bool gotAffinityFreqs2 ( );
	bool doneAffinities ( );

	FILE      *m_fdr;
	int        m_fdw;
	int32_t       m_fileNum;
	char       m_buf[1026];
	Msg3a      m_msg3a;
	Query      m_q;
	int64_t  m_numerator;
	int64_t  m_denominator;
	};*/

// A candidate string paired with a score. Language::tryPhonet() takes an
// array of these together with the array length.
// NOTE(review): "reco" presumably abbreviates "recommendation" -- confirm.
typedef struct Reco {
	// text buffer of MAX_PHRASE_LEN bytes
	// NOTE(review): whether it is always NUL-terminated is not visible here
	char    reco[MAX_PHRASE_LEN];

	// score attached to the text above
	// NOTE(review): whether lower or higher is better is not visible here
	int32_t score;
} Reco;

// Per-language speller state: the phonetic rules, the dictionary hash
// tables and the edit-distance scoring parameters, plus the entry points
// checkDict() and getRecommendation().
// NOTE(review): only declarations are visible in this header; behavior
// described as "presumably" below must be confirmed against the
// implementation file.
class Language {

 public:

	Language();
	~Language();

	// NOTE(review): presumably releases the buffers/tables held by this
	// object so it can be re-initialized -- confirm
	void reset();

	// . set this object up for language "lang"
	// . unifiedBuf/unifiedBufSize, hostsPerSplit and myHash have the same
	//   types as the arguments loadSpellerDict() takes
	// . NOTE(review): presumably returns false on error -- confirm
	bool init( char *unifiedBuf, int32_t unifiedBufSize, int32_t lang,
		   int32_t hostsPerSplit, uint32_t myHash );

	// overwrite m_lang; nothing else is touched
	void setLang( int32_t lang ) { m_lang = lang; };

	//bool makeAffinities();

	//int32_t getPhrasePopularity ( char *s, uint64_t h,
	//		       bool checkTitleRecDict );

	// . "s" is a string of "slen" bytes
	// . NOTE(review): presumably returns true if the string is in the
	//   dictionary; meaning of encodeType is not visible here
	bool checkDict(char *s, int32_t slen, char encodeType);

	// . look for a recommendation for origWord (origWordLen bytes)
	// . "recommendation" is a caller-supplied buffer of
	//   "recommendationLen" bytes
	// . found, score and popularity are out-parameters
	// . forceReco defaults to false
	// . NOTE(review): exact meaning of the return value vs. *found is not
	//   visible here -- confirm
	bool getRecommendation( char *origWord, int32_t origWordLen,
				char *recommendation, int32_t recommendationLen,
				bool *found, int32_t *score, int32_t *popularity,
				bool  forceReco = false );

	//int32_t narrowPhrase ( char *request, char *phrases, int32_t *pops,
	//		    int32_t maxPhrases );

	//bool generateDicts ( int32_t numWordsToDump , char *coll );

	//bool convertLatin1DictToUTF8 ( char *infile );

	// needed for makeDict
	//bool       gotTermFreqs( StateDict *st );
	//StateDict *m_stateDict;

	// hash table of the dictionary (64-bit key, 32-bit value). this is
	// the only public data member.
	HashTableT <uint64_t, int32_t>m_dict;

 private:
	int32_t spellcheckDict();

	// always accepts only ascii chars. makeClean() converts unicode into
	// ascii. the result goes into "target", a buffer of targetLen bytes.
	bool getPhonetic( char *origWord, int32_t origWordLen,
			  char *target, int32_t targetLen );

	// NOTE(review): presumably fills m_rulesBuf/m_rulesPtr/m_ruleStarts/
	// m_ruleChars -- confirm
	bool loadRules();

	bool loadSpellerDict( char *spellerBuf, int32_t spellerbufSize,
			      int32_t hostsPerSplit, uint32_t myHash );

	//bool loadTitleRecDicts( );

	//bool loadNarrow( char *spellerBuf, int32_t spellerBufSize,
	//		 int32_t hostsPerSplit, uint32_t myHash );

	// NOTE(review): presumably populates m_dict -- confirm
	bool loadDictHashTable( );

	//bool genTopPopFile ( char *infile );

	bool genDistributedPopFile ( char *infile, uint32_t myHash );

	//bool cleanDictFile ( );

	// converts unicode input into ascii (see the getPhonetic() comment).
	// reads inBuf (inBufSize bytes), writes to outBuf (outBufSize bytes).
	bool makeClean( char *inBuf, int32_t inBufSize,
			char *outBuf, int32_t outBufSize );//, bool isUTF16 );

	//bool makePhonet( char *infile);

	//bool makeDict();

	//bool makeQueryFiles ( );

	//bool makeWikiFiles ( );

	// NOTE(review): presumably populates m_wiki -- confirm
	bool loadWikipediaWords();

	// NOTE(review): presumably populates m_misp -- confirm
	bool loadMispelledWords();

	bool hasMispelling(char *phrase, int32_t phraseLen);

	// . recos is an array of numRecos Reco entries
	// . lowestScore is an in/out or out pointer
	// . NOTE(review): meaning of the int32_t return value is not visible
	//   here -- confirm
	int32_t tryPhonet( char *phonetTmp, char *origPhonet,
			char *origClean, int32_t tryForScore,
			Reco *recos, int32_t numRecos, int32_t *lowestScore );

	// four-argument overload; a two-argument overload returning int16_t
	// is declared further down
	int32_t editDistance( char *a, char *b, int32_t level, // starting level
			   int32_t limit ); // maximum level

	// NOTE(review): presumably combines the two scores using
	// m_soundslikeWeight and m_wordWeight -- confirm
	int32_t weightedAverage(int32_t soundslikeScore, int32_t wordScore);

	int32_t limitEditDistance( char *a, char *b, int32_t limit );

	int32_t limit1EditDistance( char *a, char *b );

	int32_t limit2EditDistance( char *a, char *b );

	int32_t checkRest( char *a, char *b, int32_t w, char *amax, int32_t min );

	int32_t check2( char *a, char *b, int32_t w, char *amax, int32_t min );

	// two-argument overload of editDistance(); note the narrower int16_t
	// return type
	int16_t editDistance( char *a0, char *b0 );

	int16_t reduceScore ( char *a, char *b );

	//bool makeWordFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase ,
	//		     char *coll );

	//bool makePopFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase ,
	//			    char *coll);

	//bool makeScoreFiles ( int32_t maxWordsPerFile );

	// this map maps a char to a "dict char"
	//unsigned char m_map [ 256 ];

	// . when comparing letter pairs, we only allow them to consist of
	//   certain chars: 0-9, A-Z, apostrophe and space and \0; otherwise
	//   m_table gets too big. This implies a NUM_CHARS of
	//   (NOTE(review): the original sentence ends here, unfinished)
	// . this compresses the value, too
	// . \0, space, 0-9, A-Z, \'   is the ordering
	//unsigned char to_dict_char ( unsigned char c ) { return m_map[c]; };

	// Temporary unicode workaround for latin-1 compatibility
	//unsigned char uc_to_dict_char ( UChar c ) {
	//	if (c>255)c=0;
	//	return m_map[c];
	//};

	// what language loaded (assigned by setLang())
	int32_t  m_lang;

	// what charset does this language use
	unsigned char    m_charset;

	// buffer to store the phonetic rules
	char   *m_rulesBuf;
	int32_t    m_rulesBufSize;
	char  **m_rulesPtr;
	int32_t    m_rulesPtrSize;
	int32_t    m_numRules;
	// points to the index of each rule that starts with a new character
	int32_t    m_ruleStarts[MAX_CHARS];
	// the chars that are in a phonet
	bool    m_ruleChars[MAX_CHARS];

	// buffers to store the dictionaries
	char  *m_distributedBuf;
	int32_t   m_distributedBufSize;
	char **m_tuplePtr;
	int32_t   m_tuplePtrSize;
	int32_t   m_numTuples;

	// total number of phonets
	int32_t m_numPhonets;

	// narrow phrase
	// NOTE(review): the loadNarrow()/narrowPhrase() declarations above
	// are commented out; whether these members are still used cannot be
	// told from this header
	char  *m_narrowBuf;
	int32_t   m_narrowBufSize;
	int32_t   m_numNarrowPtrs;
	char **m_frntPtrs;
	char **m_bckPtrs;
	int32_t  *m_frntCharPtrs;//[NUM_CHARS][NUM_CHARS][NUM_CHARS];
	int32_t  *m_bckCharPtrs;//[NUM_CHARS][NUM_CHARS][NUM_CHARS];

	// m_phonetics stores the hash of the phonetic as the key.
	// the value is a composite: the high 32 bits are the index in
	// m_tuplePtr where the list starts, and the low 32 bits are the
	// number of words having the same phonetic
	HashTableT <uint64_t, uint64_t > m_phonetics;

	// hash table of the distributed pop words dictionary
	//	HashTableT <uint32_t, int32_t> m_titlerecDict;

	// hash table of the distributed pop words dictionary
	HashTableT <uint64_t, int32_t>m_distributedPopPhrases;

	// hash table of the top popular words in the dictionary
	//	HashTableT <uint32_t, char *> m_topPopPhrases;

	// hash table of misspelled words
	HashTableT <uint32_t, bool>m_misp;

	// hash table of wikipedia words
	HashTableT <uint32_t, bool>m_wiki;

	// PARMS, which can be adjusted. Currently all languages have the
	// same adjustments, so using the same parms.
	int32_t m_editDistanceWeightsDel1;
	int32_t m_editDistanceWeightsDel2;
	int32_t m_editDistanceWeightsSwap;
	int32_t m_editDistanceWeightsSub;
	int32_t m_editDistanceWeightsSimilar;
	int32_t m_editDistanceWeightsMin;
	int32_t m_editDistanceWeightsMax;
	int32_t m_soundslikeWeight;
	int32_t m_wordWeight;
	int32_t m_span;

	// NOTE(review): these three flags are not explained in this header;
	// presumably they come from the phonetic rules file -- confirm
	bool m_followup;
	bool m_collapseResult;
	bool m_removeAccents;
};

#endif