privacore-open-source-searc…/Speller.cpp

#include "Speller.h"
#include "Query.h"
#include "StopWords.h"
#include "Hostdb.h"
#include "Process.h"
#include "Conf.h"
#include "Lang.h"
#include <stdio.h>
#include <ctype.h>

// Process-wide Speller instance (used, for example, by dictLookupTest()).
Speller g_speller;

// Constructs an empty speller. Nothing is loaded here; the dictionary is
// read from disk by init().
Speller::Speller()
{
	// intentionally empty
}

// Tears the speller down by handing off to reset().
Speller::~Speller()
{
	reset();
}

// Loads the unified dictionary. The load is attempted only once per process;
// every later call returns the outcome of that first attempt.
// Returns true if the dictionary was loaded, false otherwise.
bool Speller::init(){
	// remember both that we ran and how it went, so a repeat call after a
	// failed load does not claim success
	static bool s_init   = false;
	static bool s_loaded = false;
	if ( s_init ) return s_loaded;
	s_init = true;

	log(LOG_INFO,"Loading unified dict");
	s_loaded = loadUnifiedDict();
	if ( ! s_loaded ) {
		log(LOG_WARN, "spell: Could not load unified dict from unifiedDict-buf.txt and unifiedDict-map.dat");
		return false;
	}
	// only announce success once we know the load worked
	log(LOG_INFO,"Loaded unified dict");

	// this seems to slow our startup way down!!!
	log("speller: turning off spell checking for now");
	return true;
}

// Discards the loaded dictionary: purges the record text buffer and resets
// the word-hash lookup table. Also invoked from the destructor.
void Speller::reset()
{
	m_unifiedBuf.purge();
	m_unifiedDict.reset();
}


// The unified dict is the combination of the word list, title rec and the top
// query dict of all languages. It has to be created by loading each languages
// dict into memory using Language.loadWordList(), loadTitleRecDict(), etc
//
// Loads two files from g_hostdb.m_dir:
//   unifiedDict-buf.txt  - one text record per word
//   unifiedDict-map.dat  - hash table: 64-bit word hash -> int32 offset of the
//                          START of that word's record in the text buffer
// If either cannot be loaded, both are rebuilt from unifiedDict.txt and
// wiktionary-lang.txt, saved, and then loaded again.
//
// Record layout: "<word>\t<phonet>\t<langId>\t<pop>[\t<langId>\t<pop>...]".
// Once loaded, each record's trailing \n is replaced with \0.
// A negative pop marks the word as present in wiktionary-lang.txt for that
// language; -1 means "in dictionary, popularity unknown".
//
// Returns true on success and false if a required source file is missing or
// a table/file operation fails.
// NOTE(review): calls exit(0) if freshly rebuilt files still fail to load.
// A zero exit status for a failure looks unintended - confirm with callers.
bool Speller::loadUnifiedDict() {

	bool building = false;

 reload:

	bool needRebuild = false;

	m_unifiedBuf.purge();
	m_unifiedBuf.setLabel("unibuf");

	// this MUST be there. treat a read error (<0) like a missing file, the
	// same way the other fillFromFile() calls below do, rather than going
	// on with an empty buffer
	if ( m_unifiedBuf.fillFromFile(g_hostdb.m_dir,
				       "unifiedDict-buf.txt" ) <= 0 )
		needRebuild = true;

	// . give it 2M slots
	// . unified dict currently has 1340223 entries
	if ( ! m_unifiedDict.set ( 8,4, 2*1024*1024,NULL,0,false,"udictht") )
		return false;

	// try to load in the hashtable and the buffer directly
	if ( ! m_unifiedDict.load(g_hostdb.m_dir,"unifiedDict-map.dat"))
		needRebuild = true;

	if ( ! needRebuild ) {
		// convert unifiedBuf \n's to \0's so every record is its own
		// C string
		char *bufStart = m_unifiedBuf.getBufStart();
		char *bufEnd   = bufStart + m_unifiedBuf.length();
		for ( char *c = bufStart ; c < bufEnd ; c++ )
			if ( *c == '\n' ) *c = '\0';
		log(LOG_DEBUG,"speller: done loading successfully");

		return true;
	}

	// second time through and still not loadable? give up.
	if ( building ) {
		log("gb: rebuild failed. exiting.");
		exit(0);
	}

	building = true;

	log("gb: REBUILDING unifiedDict-buf.txt and unifiedDict-map.dat");

	// just in case that was there and the buf wasn't
	m_unifiedDict.clear();
	// or vice versa
	m_unifiedBuf.purge();

	// load the .txt file. this is REQUIRED for rebuild
	SafeBuf ub;
	if ( ub.fillFromFile (g_hostdb.m_dir,"unifiedDict.txt") <= 0 )
		return false;

	//
	// change \n to \0
	// TODO: filter out the first word from each line?
	//
	char *ubStart = ub.getBufStart();
	char *ubEnd   = ubStart + ub.length();
	for ( char *c = ubStart ; c < ubEnd ; c++ )
		if ( *c == '\n' ) *c = '\0';


	// now scan wikitionary file wiktionary-lang.txt to get even
	// more words! this file is generated from Wiktionary.cpp when
	// it scans the wiktionary xml dump to generate the other
	// wiktionary-syns.dat and wiktionary-buf.txt files. it also
	// cranks this file out because we can use it since we do not
	// have czech in the unifiedDict.txt file.
	SafeBuf wkfBuf;
	if ( wkfBuf.fillFromFile ( g_hostdb.m_dir,"wiktionary-lang.txt") <= 0 )
		return false;

	// scan each line
	char *p = wkfBuf.getBufStart();
	char *pend = p + wkfBuf.length();
	HashTableX wkfMap;
	// true = allow dups. because same word can appear in multiple langs
	if ( ! wkfMap.set ( 8,1,1000000,NULL,0,true,"wkfmap") )
		return false;

	// "fr|livre" is how it's formatted
	for ( ; p && p < pend ; p = wkfBuf.getNextLine(p) ) {
		char *abbr = p;
		// skip til |
		for ( ; *p && *p != '|' ; p++ );
		// sanity check
		if ( *p != '|' ) { g_process.shutdownAbort(true); }
		// tmp NULL that
		*p = '\0';
		char langId = getLangIdFromAbbr(abbr);
		// revert
		*p = '|';
		if ( langId == langUnknown )
			continue;
		if ( langId == langTranslingual )
			continue;
		// skip |
		p++;
		// that's the word
		char *word = p;
		// find end
		char *wordEnd = p;
		for ( ; *wordEnd && *wordEnd != '\n' ; wordEnd++ ) ;
		// so hash it up
		int64_t wid = hash64d ( word , wordEnd - word );
		// add it to map
		if ( ! wkfMap.addKey ( &wid , &langId ) ) return false;
	}


	//
	// scan unifiedDict.txt file. each line is
	// "<phrase>\t<phonet>\t<langId>\t<pop>..." and is \0 terminated now
	//
	int32_t totalCollisions = 0;
	uint64_t atline = 0;
	p = ubStart;
	while ( p < ubEnd ) {
		atline++;
		char *phrase = p;
		// a blank line has nothing to parse
		if ( *p == '\0' ) {
			p++;
			continue;
		}
		// if line is a comment skip it
		if ( *p == '#' ){
			p += strlen(p) + 1;
			continue;
		}
		// skip phrase, but never scan past the end of this line
		while ( p < ubEnd && *p && *p != '\t' )
			p++;
		if ( p >= ubEnd || *p != '\t' ) {
			log(LOG_WARN,
			    "spell: Got entry without a phonet field in "
			    "unifiedDict at line %" PRIu64", skipping\n",
			    atline);
			// p is on this line's terminating \0 (or at the end)
			if ( p < ubEnd ) p++;
			continue;
		}
		// Null end the phrase
		*p = '\0';
		// step over that \0 so that "p" is back inside the line. the
		// skips below must measure the REST of the line, not the
		// empty string at the terminator we just wrote.
		p++;

		// skip empty phrases
		if(strlen(phrase) < 1) {
			log(LOG_WARN,
				"spell: Got zero length entry in unifiedDict "
			    "at line %" PRIu64", skipping\n",
				atline);
			p += strlen(p) + 1;
			continue;
		}

		// skip single byte words that are not alphabetic
		// Anything over 'Z' is likely unicode, so don't bother
		if(strlen(phrase) == 1 && (phrase[0] < 'a')) {
			log(LOG_WARN,
				"spell: Got questionable entry in "
			    "unifiedDict at line %" PRIu64", skipping: %s\n",
				atline,phrase);
			p += strlen(p) + 1;
			continue;
		}

		// phonet
		char *phonet = p;
		// next is the phonet
		while ( p < ubEnd && *p && *p != '\t' )
			p++;
		if ( p >= ubEnd || *p != '\t' ) {
			log(LOG_WARN,
			    "spell: Got entry without language fields in "
			    "unifiedDict at line %" PRIu64", skipping\n",
			    atline);
			if ( p < ubEnd ) p++;
			continue;
		}
		// Null end the phonet
		*p = '\0';
		p++;

		// "p" now points at the (lang,pop) tuples of this line
		char *phraseRec = p;
		char *nextLine  = p + strlen(p) + 1;

		// . i need to move everything over to utf8!!!
		// . this is the same hash function used by Words.cpp
		uint64_t key = hash64d(phrase,strlen(phrase));

		// make sure we haven't added this word/phrase yet
		if ( m_unifiedDict.isInTable ( &key ) ) {
			totalCollisions++;
			p = nextLine;
			continue;
		}

		// see how many langs this key is in in unifiedDict.txt file.
		// getPhraseLanguages2() zeroes the vector before filling it.
		int64_t pops[MAX_LANGUAGES];
		memset ( pops , 0 , sizeof(pops) );
		getPhraseLanguages2 ( phraseRec , pops );

		// assume none are in the official dict: flip negative pops
		// back to positive, they get re-negated from wiktionary
		// below. a pop of exactly -1 is left as it is.
		for ( int32_t i = 0 ; i < MAX_LANGUAGES ; i++ )
			if ( pops[i] < -1 ) pops[i] *= -1;

		// now add in from wiktionary
		int32_t slot = wkfMap.getSlot ( &key );
		for ( ; slot >= 0 ; slot = wkfMap.getNextSlot(slot,&key) ) {
			uint8_t langId = *(char *)wkfMap.getValueFromSlot(slot);
			if ( langId == langUnknown ) continue;
			if ( langId == langTranslingual ) continue;
			// never index outside the lang vector
			if ( langId >= MAX_LANGUAGES ) continue;
			// if it marked as already in that dictionary, cont
			if ( pops[langId] < 0 ) continue;
			// if it is positive, make it negative to mark
			// it as being in the official dictionary
			// -1 means pop unknown but in dictionary
			if ( pops[langId] == 0 ) pops[langId]  = -1;
			else                     pops[langId] *= -1;
		}

		// save the offset of the start of this record
		int32_t offset = m_unifiedBuf.length();

		// print the word/phrase and its phonet, if any
		m_unifiedBuf.safePrintf("%s\t%s\t",phrase,phonet);

		int32_t count = 0;
		// print the languages and their popularity scores.
		// start at 1 to skip "unknown" what does that really mean?
		for ( int32_t i = 1 ; i < MAX_LANGUAGES ; i++ ) {
			if ( pops[i] == 0 ) continue;
			m_unifiedBuf.safePrintf("%" PRId32"\t%" PRId32"\t",
						i,(int32_t)pops[i]);
			count++;
		}
		// if none, revert
		if ( count == 0 ) {
			m_unifiedBuf.setLength(offset);
			p = nextLine;
			continue;
		}

		// trim final tab i guess
		m_unifiedBuf.incrementLength(-1);
		// end line
		m_unifiedBuf.pushChar('\n');

		// map the word hash to the start of its record
		if ( ! m_unifiedDict.addKey(&key, &offset) ) return false;

		// skip "p" to next line in unifiedDict.txt
		p = nextLine;
	}

	log (LOG_WARN,"spell: got %" PRId32" TOTAL collisions in unified dict",
	     totalCollisions);

	HashTableX dedup;
	if ( ! dedup.set(8,0,1000000,NULL,0,false,"dmdm") )
		return false;

	// . now add entries from wkfBuf that were not also in "ub"
	// . format is "<langAbbr>|<word>\n"
	// . stay inside the buffer, same as the first pass over wkfBuf
	p = wkfBuf.getBufStart();
	for ( ; p && p < pend ; p = wkfBuf.getNextLine(p) ) {
		for ( ; *p && *p !='\n' && *p !='|' ; p++ );
		if ( *p != '|' ) {
			log("speller: bad format in wiktionary-lang.txt");
			g_process.shutdownAbort(true);
		}
		// get word
		char *word = p + 1;
		// get end of it
		for ( ; *p && *p !='\n' ; p++ );
		if ( *p != '\n' ) {
			log("speller: bad format in wiktionary-lang.txt");
			g_process.shutdownAbort(true);
		}
		int32_t wordLen = p - word;
		// nothing after the '|'? the checks below need >= 1 byte
		if ( wordLen <= 0 ) continue;
		// wiktinary has like prefixes ending in minus. skip!
		if ( word[wordLen-1] == '-' ) continue;
		// suffix in wiktionary? skip
		if ( word[0] == '-' ) continue;
		// .zr .dd
		if ( word[0] == '.' ) continue;

		// hash the word
		int64_t key = hash64d ( word , wordLen );

		// skip if we did it in the above loop
		if ( m_unifiedDict.isInTable ( &key ) ) continue;

		// skip if already did it in this loop
		if ( dedup.isInTable ( &key ) ) continue;
		if ( ! dedup.addKey ( &key ) ) return false;

		// reset lang vector
		int64_t pops[MAX_LANGUAGES];
		memset ( pops , 0 , sizeof(pops) );

		// now add in from wiktionary map
		int32_t slot = wkfMap.getSlot ( &key );
		for ( ; slot >= 0 ; slot = wkfMap.getNextSlot(slot,&key) ) {
			uint8_t langId = *(char *)wkfMap.getValueFromSlot(slot);
			if ( langId == langUnknown ) continue;
			if ( langId == langTranslingual ) continue;
			// never index outside the lang vector
			if ( langId >= MAX_LANGUAGES ) continue;
			if ( pops[langId] ) continue;
			// -1 means pop unknown but in dictionary
			pops[langId] = -1;
		}


		// save the offset of the start of this record
		int32_t offset = m_unifiedBuf.length();

		// . print the word/phrase and its phonet, if any
		// . phonet is unknown here, so that field is left empty
		m_unifiedBuf.safeMemcpy ( word, wordLen );
		m_unifiedBuf.safePrintf("\t\t");

		int32_t count = 0;
		// print the languages and their popularity scores.
		// start at 1 to skip "unknown" what does that really mean?
		for ( int32_t i = 1 ; i < MAX_LANGUAGES ; i++ ) {
			if ( pops[i] == 0 ) continue;
			m_unifiedBuf.safePrintf("%" PRId32"\t%" PRId32"\t",
						i,(int32_t)pops[i]);
			count++;
		}
		// if none, revert
		if ( count == 0 ) {
			m_unifiedBuf.setLength(offset);
			continue;
		}

		// trim final tab i guess
		m_unifiedBuf.incrementLength(-1);
		// end line
		m_unifiedBuf.pushChar('\n');

		// map the word hash to the start of its record
		if ( ! m_unifiedDict.addKey(&key, &offset) ) return false;

	}

	// save the text too! a merge of unifiedDict.txt and
	// wiktionary-lang.txt!!!
	if ( m_unifiedBuf.saveToFile(g_hostdb.m_dir,"unifiedDict-buf.txt") <=0)
		return false;

	// save it
	if ( !m_unifiedDict.save(g_hostdb.m_dir,"unifiedDict-map.dat") )
		return false;

	// start over and load what we created
	goto reload;

}

// Returns the popularity of a word/phrase from the unified dictionary.
//
//  str    - the word/phrase text, may be NULL (only used for the shortcut
//           checks on single characters and one/two digit numbers)
//  h      - 64-bit hash of the word/phrase, the dictionary key
//  langId - language to look up. in case the language is unknown
//           (langUnknown), the largest popularity over all of the word's
//           languages is returned.
//
// Returns MAX_PHRASE_POP for common words, query stop words, single
// characters and the numbers 0-99; 0 if the word is not in the dictionary or
// its record is malformed; otherwise the absolute value of the stored score.
int32_t Speller::getPhrasePopularity( const char *str, uint64_t h, unsigned char langId ) {
	// hack fixes.
	// common word like "and"?
	if ( isCommonWord(h) ) return MAX_PHRASE_POP;
	// another common word check
	if ( isQueryStopWord(NULL,0,h,langId) ) return MAX_PHRASE_POP;
	// single letter? (this also covers the single digits 0-9)
	if ( str && str[0] && str[1] == '\0' ) return MAX_PHRASE_POP;
	// 10-99
	if ( str && is_digit(str[0]) && is_digit(str[1]) && !str[2] )
		return MAX_PHRASE_POP;

	int32_t slot = m_unifiedDict.getSlot(&h);
	// if not in dictionary assume 0 popularity
	if ( slot < 0 ) return 0;
	// the table maps the hash to the offset of the record in the buffer
	int32_t offset =  *(int32_t *)m_unifiedDict.getValueFromSlot(slot);
	const char *p    = m_unifiedBuf.getBufStart() + offset;
	const char *pend = p + strlen(p);

	// skip the word itself and then the phonet, if any. stop at the end of
	// the record instead of running into whatever follows it.
	for ( int32_t field = 0 ; field < 2 ; field++ ) {
		while ( *p && *p != '\t' ) p++;
		if ( ! *p ) return 0;
		p++;
	}

	int32_t max = 0;

	// the tuples are in ascending order of the langid
	// get to the right language
	while ( p < pend ){

		int32_t currLang = atoi(p);

		// the pops are sorted by langId, return 0 if we already
		// passed the language we want
		if ( langId != langUnknown && currLang > langId )
			return 0;

		// skip language
		while ( *p && *p != '\t' ) p++;
		// lang id without a score? record is cut short.
		if ( ! *p ) break;
		p++;

		int32_t score = atoi(p);

		// the sign is only a flag (the dictionary builder negates the
		// score of words found in the wiktionary word list), the
		// magnitude is the popularity
		if ( score < 0 )
			score *= -1;

		if ( currLang == langId && langId != langUnknown )
			return score;

		// if lang is unknown get max
		if ( score > max ) max = score;

		// skip that score and go to the next <lang> <pop> tuple
		while ( *p && *p != '\t' ) p++;
		if ( ! *p ) break;
		p++;

	}
	// NOTE(review): for a known langId that is higher than every language
	// in the record this returns the max over the OTHER languages, while
	// the early exit above returns 0 when a higher language is seen.
	// Left as is - confirm which result callers expect.
	return max;
}


// This isn't really much use except for the spider
// language detection to keep from making 32 sequential
// calls for the same phrase to isolate the language.
//
// Hashes phrase[0..len) and looks the hash up in the unified dictionary.
// Returns a pointer to the start of the matching record inside the unified
// buffer, or NULL if "phrase" is NULL or the hash is not in the table.
const char *Speller::getPhraseRecord(const char *phrase, int len ) {
	if ( phrase == NULL )
		return NULL;

	int64_t phraseHash = hash64d(phrase, len);
	int32_t slotNum    = m_unifiedDict.getSlot(&phraseHash);
	if ( slotNum < 0 )
		return NULL;

	// the table value is the record's offset into the text buffer
	int32_t *recOffset = (int32_t *)m_unifiedDict.getValueFromSlot(slotNum);
	return m_unifiedBuf.getBufStart() + *recOffset;
}

// Returns a bit vector of the languages the word with hash "wid" is marked as
// an official dictionary word in. A language counts only if its popularity
// field in the record is negative (starts with '-'). Bit (langId-1) is set for
// each such language, which makes english "1". langUnknown, langTranslingual
// and ids that do not fit in 64 bits are never set.
// Returns 0 if the word is not in the dictionary or its record has no tuples.
int64_t Speller::getLangBits64 ( int64_t wid ) {
	int32_t slot = m_unifiedDict.getSlot(&wid);
	if (slot < 0) return 0LL;
	int32_t offset =  *(int32_t *)m_unifiedDict.getValueFromSlot(slot);
	char *p = m_unifiedBuf.getBufStart() + offset;
	// skip over word
	for ( ; *p && *p != '\t' ; ) p++;
	// nothing after?
	if ( !*p ) return 0LL;
	// skip tab
	p++;
	// skip over phonet
	for ( ; *p && *p != '\t' ; ) p++;
	// nothing after?
	if ( !*p ) return 0LL;
	// skip tab
	p++;
	// init
	int64_t bits = 0LL;
	// widest lang id that still has a bit in the result
	const int32_t maxLangId = (int32_t)(sizeof(bits) * 8);
	// loop over langid/pop pairs
	while ( *p ) {
		// get langid. keep the full int so that an out of range
		// value is not silently wrapped into a valid id.
		int32_t langId = atoi(p);
		// skip to next delimiter
		for ( ; *p && *p != '\t' ; p++ );
		// error?
		if ( ! *p ) break;
		// skip tab
		p++;
		// error?
		if ( ! *p ) break;
		// only a negative pop marks the word as being in the
		// official dictionary for this language
		bool official = ( *p == '-' );
		// skip pop
		for ( ; *p && *p != '\t' ; p++ );
		// shifting by a count outside 0..63 is undefined, so the id
		// must be in 1..64
		if ( official &&
		     langId != langTranslingual &&
		     langId != langUnknown &&
		     langId >= 1 && langId <= maxLangId )
			// make english "1"
			bits |= (int64_t)( 1ULL << (langId-1) );
		// done?
		if ( ! *p ) break;
		// skip tab
		p++;
	}
	return bits;
}

bool Speller::getPhraseLanguages(const char *phrase, int len,
				 int64_t *array) {
	const char *phraseRec = getPhraseRecord(phrase, len);
	if(!phraseRec || !array) return false;
	return getPhraseLanguages2 ( phraseRec,array );
}

// Parses a list of tab separated "<langId>\t<score>" tuples.
//
//  phraseRec - points at the first tuple (NOT at the word/phonet fields)
//  array     - MAX_LANGUAGES entries; zeroed first, then array[langId] is
//              set to the score of each tuple
//
// Tuples whose language id is outside [0,MAX_LANGUAGES) are ignored.
// Parsing stops at the end of the string, at a tuple without a score, or at
// an empty score field. Returns false only if an argument is NULL.
bool Speller::getPhraseLanguages2(const char *phraseRec , int64_t *array) {

	if ( ! phraseRec || ! array ) return false;

	memset(array, 0, sizeof(int64_t)*MAX_LANGUAGES);

	while(*phraseRec) {
		// skip leading whitespace
		while(*phraseRec == ' ' || *phraseRec == '\t')
			phraseRec++;

		if(!*phraseRec) break;

		int64_t lang = atoi(phraseRec);

		// skip to next delimiter
		if(!(phraseRec = strchr(phraseRec, '\t'))) break;

		// skip tab
		phraseRec++;

		if(!*phraseRec) break;

		// empty score field, stop here
		if ( *phraseRec == '\t' ) return true;

		// Save score, but never write outside of the caller's array
		if ( lang >= 0 && lang < MAX_LANGUAGES )
			array[lang] = atoi(phraseRec);

		// skip to next delimiter
		if(!(phraseRec = strchr(phraseRec, '\t'))) break;

		// skip over tab
		phraseRec++;
	}
	return(true);
}

// Timing test: reads the file "ff" line by line, looks up the popularity of
// every line through g_speller with language id 0 and logs how long the whole
// run took. Aborts the process if a lookup returns a negative popularity.
void Speller::dictLookupTest ( char *ff ){
	FILE *fd = fopen ( ff, "r" );
	if ( ! fd ) {
		log("speller: test: Could not open %s for "
		    "reading: %s.", ff,strerror(errno));
		return;
	}
	int64_t start = gettimeofdayInMilliseconds();
	char buf[1026];
	int32_t count = 0;
	// go through the words. size the read by the buffer itself so the two
	// can never get out of sync.
	while ( fgets ( buf , sizeof(buf) , fd ) ) {
		// length of word(s), including the terminating \n
		int32_t wlen = strlen(buf) ;
		// skip if empty
		if ( wlen <= 0 ) continue;
		// strip the \n, but only if there is one. the last line of
		// the file or an overlong line does not end in one and
		// would lose a real character otherwise.
		if ( buf[wlen-1] == '\n' ) buf[--wlen] = '\0';
		uint64_t h = hash64d ( buf, wlen );
		int32_t pop = g_speller.getPhrasePopularity( buf, h, 0 );
		if ( pop < 0 ){
			g_process.shutdownAbort(true);
		}
		count++;
	}
	log ( LOG_WARN,"speller: dictLookupTest took %" PRId64" ms to do "
	      "%" PRId32" words. Compare against 46-66ms taken for dict/words file.",
	      gettimeofdayInMilliseconds() - start, count );
	fclose(fd);
}