2252 lines
		
	
	
		
			61 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			2252 lines
		
	
	
		
			61 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| #include "gb-include.h"
 | |
| 
 | |
| #include "Mem.h"
 | |
| #include "Conf.h"
 | |
| #include "Dns.h"
 | |
| #include "HttpServer.h"
 | |
| #include "Loop.h"
 | |
| #include <sys/resource.h>  // setrlimit
 | |
| 
 | |
| #include "Speller.h"
 | |
| #include <stdio.h>
 | |
| #include <ctype.h>
 | |
| 
 | |
| /*
 | |
| static void handleRequestSpeller ( UdpSlot *slot , int32_t netnice );
 | |
| 
 | |
| static void gotSpellerReplyWrapper (void *state, void *state2);
 | |
| 
 | |
| bool Speller::registerHandler ( ) {
 | |
| 	// . register ourselves with the udp server
 | |
| 	// . it calls our callback when it receives a msg of type 0x3d
 | |
| 	if ( ! g_udpServer.registerHandler ( 0x3d, handleRequestSpeller )) 
 | |
| 		return false;
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| // . handle a request to spellcheck a phrase (a bool "narrow" flag followed by the null-terminated string)
 | |
| // . returns false if slot should be nuked and no reply sent
 | |
| // . sometimes sets g_errno on error
 | |
| void handleRequestSpeller ( UdpSlot *slot , int32_t netnice ) {
 | |
| 	// The request is the string to be spellchecked, null ended
 | |
| 	char *request = slot->m_readBuf;
 | |
| 
 | |
| 	// first tells us if we should narrow the search stuff
 | |
| 	bool narrowP = *(bool *) request;
 | |
| 	request += sizeof(bool);
 | |
| 
 | |
| 	// is it found in dict or pop words
 | |
| 	bool found;
 | |
| 	int32_t score;
 | |
| 	char reco[MAX_PHRASE_LEN];
 | |
| 	int32_t pop;
 | |
| 	int64_t start = gettimeofdayInMilliseconds();
 | |
| 	bool recommendation = g_speller.m_language[langEnglish].
 | |
| 		getRecommendation( request, gbstrlen(request), 
 | |
| 				   reco, MAX_PHRASE_LEN, 
 | |
| 				   &found, &score,
 | |
| 				   &pop );
 | |
| 
 | |
| 	log ( LOG_DEBUG,"speller: %s --> %s", request, reco );
 | |
| 
 | |
| 	int32_t numNarrow = 0;
 | |
| 	char narrow[MAX_NARROW_SEARCHES * MAX_PHRASE_LEN];
 | |
| 	int32_t narrowPops[MAX_NARROW_SEARCHES];
 | |
| 	//if ( narrowP )
 | |
| 	//	numNarrow = g_speller.m_language[langEnglish].
 | |
| 	//		narrowPhrase ( request, narrow, narrowPops,
 | |
| 	//			       MAX_NARROW_SEARCHES );
 | |
| 	
 | |
| 	// calculate total reply size
 | |
| 	// int32_t replySize = found + recommendation + score + pop + reco
 | |
| 	int32_t replySize = sizeof(bool) + sizeof(bool) + 4 + 4 + 
 | |
| 		gbstrlen(reco) + 1;
 | |
| 
 | |
| 	if ( narrowP ){
 | |
| 		replySize += 4; // numPhrases 
 | |
| 		for ( int32_t i = 0; i < numNarrow; i++ )
 | |
| 			replySize += 4 + gbstrlen(&narrow[i*MAX_FRAG_SIZE]) + 1;
 | |
| 	}
 | |
| 
 | |
| 	char *reply = (char*) mmalloc(replySize, "SpellerReplyBuf");
 | |
| 	if ( !reply ) {
 | |
| 		g_errno = ENOMEM;
 | |
| 		//g_udpServer.sendReply_ass( NULL, 0, NULL, 0, slot );
 | |
| 		g_udpServer.sendErrorReply( slot , g_errno );
 | |
| 		return;
 | |
| 	}
 | |
| 	char *p = reply;
 | |
| 
 | |
| 	*(bool *)p = found;
 | |
| 	p += sizeof(bool);
 | |
| 	
 | |
| 	*(bool *)p = recommendation;
 | |
| 	p += sizeof(bool);
 | |
| 
 | |
| 	// store the score and pop
 | |
| 	*(int32_t *) p = score; p += 4;
 | |
| 	*(int32_t *) p = pop; p += 4;
 | |
| 
 | |
| 	// store the recommendation
 | |
| 	strcpy( p, reco );
 | |
| 	p += gbstrlen(reco) + 1;
 | |
| 	if ( narrowP ){
 | |
| 		// store the number of narrow phrases found
 | |
| 		*(int32_t *) p = numNarrow;
 | |
| 		p += 4;
 | |
| 		for ( int32_t i = 0; i < numNarrow; i++ ){
 | |
| 			*(int32_t *)p = narrowPops[i];
 | |
| 			p += 4;
 | |
| 			strcpy(p, &narrow[i * MAX_FRAG_SIZE]);
 | |
| 			p += gbstrlen(&narrow[i * MAX_FRAG_SIZE]) + 1;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	//sanity check
 | |
| 	if ( p - reply != replySize ){
 | |
| 		char *xx = NULL; *xx = 0;
 | |
| 	}
 | |
| 
 | |
| 	int64_t end = gettimeofdayInMilliseconds();
 | |
| 	if ( end - start > 1 )
 | |
| 		log (LOG_INFO,"speller: took %"INT64" ms to spellcheck "
 | |
| 		     "fragment %s", end-  start, request);
 | |
| 	g_udpServer.sendReply_ass ( reply   ,
 | |
| 				    replySize, 
 | |
| 				    reply   , 
 | |
| 				    replySize,
 | |
| 				    slot    );
 | |
| }
 | |
| */
 | |
| 
 | |
| Speller g_speller;
 | |
| 
 | |
| Speller::Speller(){
 | |
| 	//m_unifiedBuf = NULL;
 | |
| 	//mm_unifiedBufSize = 0;
 | |
| }
 | |
| 
 | |
| Speller::~Speller(){
 | |
| 	reset();
 | |
| }
 | |
// NOTE(review): not referenced anywhere in the visible part of this
// file -- confirm it is still used elsewhere
char *g_str = NULL;
 | |
| bool Speller::init(){
 | |
| 
 | |
| 	static bool s_init = false;
 | |
| 	if ( s_init ) return true;
 | |
| 	s_init = true;
 | |
| 
 | |
| 	/*
 | |
| 	m_hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
 | |
| 	m_hostsPerSplit /= g_hostdb.m_numHostsPerShard;
 | |
| 	if ( m_hostsPerSplit <= 0 )
 | |
| 		return log("db: the <indexSplit> in gb.conf is probably not "
 | |
| 			   "too big. Are you using the wrong hosts.conf?");
 | |
| 	// check if we've got enough multicasts available
 | |
| 	if ( m_hostsPerSplit > MAX_UNIQUE_HOSTS_PER_SPLIT ){
 | |
| 		log( LOG_WARN,"speller: not enough multicasts available for "
 | |
| 		     "this host configuration. Increase multicasts" );
 | |
| 		return false;
 | |
| 	}
 | |
| 	*/
 | |
| 
 | |
| 	if ( !loadUnifiedDict() )
 | |
| 		return log("spell: Could not load unified dict from "
 | |
| 			   "unifiedDict-buf.txt and unifiedDict-map.dat");
 | |
| 
 | |
| 	// this seems to slow our startup way down!!!
 | |
| 	log("speller: turning off spell checking for now");
 | |
| 	return true;
 | |
| 
 | |
| 	/*
 | |
| 	int32_t myHash = g_hostdb.m_hostId % 
 | |
| 		( m_hostsPerSplit * g_hostdb.m_indexSplits );
 | |
| 	myHash /= g_hostdb.m_indexSplits;
 | |
| 
 | |
| 	//for ( int32_t i = 0; i < MAX_LANGUAGES; i++ )
 | |
| 	m_language[langEnglish].init ( m_unifiedBuf.getBufStart(), 
 | |
| 				       m_unifiedBuf.length(),
 | |
| 				       langEnglish, 
 | |
| 				       m_hostsPerSplit,
 | |
| 				       myHash );
 | |
| 
 | |
| 	return true;
 | |
| 	*/
 | |
| }
 | |
| 
 | |
| void Speller::reset(){
 | |
| 	//if ( m_unifiedBuf && m_unifiedBufSize > 0 )
 | |
| 	//	mfree ( m_unifiedBuf, m_unifiedBufSize, "SpellerBuf" );
 | |
| 	m_unifiedBuf.purge();
 | |
| 	
 | |
| 	m_unifiedDict.reset();
 | |
| 	/*
 | |
| 	for(int32_t i = 0; i < MAX_LANGUAGES; i++) 
 | |
| 		m_language[i].reset();
 | |
| 	*/
 | |
| 
 | |
| 	//m_unifiedBuf = NULL;
 | |
| 	//m_unifiedBufSize = 0;
 | |
| }
 | |
| 
 | |
| // test it.
 | |
| void Speller::test ( char *ff ) {
 | |
| 	//char *ff = "/tmp/sctest";
 | |
| 	FILE *fd = fopen ( ff, "r" );
 | |
| 	if ( ! fd ) {
 | |
| 		log("speller: test: Could not open %s for "
 | |
| 		    "reading: %s.", ff,strerror(errno));
 | |
| 		return;
 | |
| 	}
 | |
| 
 | |
| 	char buf[1026];
 | |
| 	//char dst[1026];
 | |
| 	// go through the words in dict/words
 | |
| 	while ( fgets ( buf , MAX_FRAG_SIZE , fd ) ) {
 | |
| 		// length of word(s), including the terminating \n
 | |
| 		int32_t wlen = gbstrlen(buf) ;
 | |
| 		// skip if empty
 | |
| 		if ( wlen <= 0 ) continue;
 | |
| 		buf[wlen-1]='\0';
 | |
| 		Query q;
 | |
| 		q.set2 ( buf , langUnknown , false );
 | |
| 
 | |
| 		//if ( getRecommendation ( &q, dst , 1024 ) )
 | |
| 		//  log(LOG_INIT,"speller: %s-->%s",buf,dst);
 | |
| 		//  else
 | |
| 		// log(LOG_INIT,"speller: %s",buf);
 | |
| 	}
 | |
| 	fclose(fd);
 | |
| }
 | |
| 
 | |
| /*
 | |
| ///////////////////////////////////////////////////////
 | |
| // RECOMMENDATION ROUTINES BELOW HERE
 | |
| //
 | |
| // These will spellcheck and give recommendations
 | |
| ///////////////////////////////////////////////////////
 | |
| 
 | |
| bool Speller::canStart( QueryWord *qw ) {
 | |
| 	// can only start with a alpha character, no numeric
 | |
| 	if ( ! is_alnum_utf8 ( qw->m_word+0 ) ) return false;
 | |
| 
 | |
| 	if ( qw->m_ignoreWord &&
 | |
| 	     qw->m_ignoreWord != IGNORE_CONNECTED &&
 | |
| 	     qw->m_ignoreWord != IGNORE_QUOTED ) return false;
 | |
| 
 | |
| 	// don't check 'rom' in phrase "cd-rom", or 't' in "ain't"
 | |
| 	if ( qw->m_leftConnected ) 
 | |
| 		return false;
 | |
| 
 | |
| 	// don't start with a stop word
 | |
| 	if ( qw->m_isStopWord )
 | |
| 		return false;
 | |
| 	
 | |
| 	// a lot of field terms should not be spell checked
 | |
| 	if ( qw->m_fieldCode ) {
 | |
| 		if ( qw->m_fieldCode != FIELD_TITLE   &&
 | |
| 		     qw->m_fieldCode != FIELD_CITY    &&
 | |
| 		     qw->m_fieldCode != FIELD_AUTHOR  &&
 | |
| 		     qw->m_fieldCode != FIELD_COUNTRY   )
 | |
| 			return false;
 | |
| 	}
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| 
 | |
| // . returns false if blocked, true otherwise
 | |
| // . if it recommended something different than the original query, "q",
 | |
| //   the recommendation ends up in "dst", otherwise "dst" is left empty
 | |
| // . returns true and sets g_errno on error
 | |
| // . stores recommended query in "dst" and NULL terminates it
 | |
| // . if dst is too small it will bitch and return true with g_errno set
 | |
| bool Speller::getRecommendation ( Query *q, 
 | |
| 				  bool   spellcheck,
 | |
| 				  char  *dst, // recommendation destination
 | |
| 				  int32_t   dstLen, // recommendation max len
 | |
| 				  bool   narrowSearch,
 | |
| 				  char  *narrow, // narrow search
 | |
| 				  int32_t   narrowLen,  // narrow search len
 | |
| 				  int32_t  *numNarrows, // num narrows found
 | |
| 				  void  *state, 
 | |
| 				  void (*callback)(void *state) ){
 | |
| 	*dst = '\0';
 | |
| 	*narrow = '\0';
 | |
| 	// no narrowing search if spellchecking is off
 | |
| 	if ( !spellcheck )
 | |
| 		return true;
 | |
| 
 | |
| 	// don't spellcheck queries that are MAX_FRAG_SIZE bytes or longer.
 | |
| 	if ( q->getQueryLen() >= MAX_FRAG_SIZE )
 | |
| 		return true;
 | |
| 
 | |
| 	StateSpeller *st ;
 | |
| 	try { st = new (StateSpeller); }
 | |
| 	catch ( ... ) { 
 | |
| 		g_errno = ENOMEM;
 | |
| 		log("Speller: new(%i): %s", sizeof(StateSpeller),
 | |
| 		    mstrerror(g_errno));
 | |
| 		return true; 
 | |
| 	}
 | |
| 	mnew ( st , sizeof(StateSpeller) , "State00" );
 | |
|        
 | |
| 	st->m_state = state;
 | |
| 	st->m_callback = callback;
 | |
| 	st->m_q = q;
 | |
| 	st->m_spellcheck = spellcheck;
 | |
| 	st->m_dst = dst;
 | |
| 	st->m_dend = dst + dstLen;
 | |
| 	st->m_narrowSearch = narrowSearch;
 | |
| 	st->m_nrw = narrow;
 | |
| 	st->m_nend = narrow + narrowLen;
 | |
| 	st->m_numNarrow = numNarrows;
 | |
| 	*st->m_numNarrow = 0;
 | |
| 	st->m_start = gettimeofdayInMilliseconds();
 | |
| 	st->m_numFrags = 0;
 | |
| 	st->m_numFragsReceived = 0;
 | |
| 	
 | |
| 	// . break query down into fragments
 | |
| 	// . each fragment is a string of words
 | |
| 	// . quotes and field names will separate fragments
 | |
| 	// . TODO: make field data in its own fragment
 | |
| 	int32_t nqw = q->m_numWords;
 | |
| 
 | |
| 	for ( int32_t i = 0 ; i < nqw ; i++ ) {
 | |
| 		// get a word in the Query to start a fragment with
 | |
| 		QueryWord *qw = &q->m_qwords[i];
 | |
| 		// can he start the phrase?
 | |
| 		if ( ! canStart( qw ) )
 | |
| 			continue;
 | |
| 
 | |
| 		bool inQuotes  = qw->m_inQuotes;
 | |
| 		char fieldCode = qw->m_fieldCode;
 | |
| 		// . get longest continual fragment that starts with word #i
 | |
| 		// . get the following words that can be in a fragment
 | |
| 		//   that starts with word #i
 | |
| 		// . start of the frag
 | |
| 		int32_t  endQword = i;
 | |
| 		int32_t  startQword = i;
 | |
| 		for ( ; i < nqw ; i++ ) {
 | |
| 			// . skip if we should
 | |
| 			// . keep punct, however
 | |
| 			QueryWord *qw1 = &q->m_qwords[i];
 | |
| 			if ( qw1->m_opcode                 ) break;
 | |
| 			if ( qw1->m_inQuotes  != inQuotes  ) break;
 | |
| 			if ( qw1->m_fieldCode != fieldCode ) break;
 | |
| 			if ( qw1->m_ignoreWord == IGNORE_FIELDNAME ) break;
 | |
| 			if ( qw1->m_phraseSign && 
 | |
| 			     !qw1->m_rightConnected ) break;
 | |
| 			// are we punct?
 | |
| 			if ( ! is_alnum_utf8(qw1->m_word) ) 
 | |
| 				endQword = i - 1;
 | |
| 			else    
 | |
| 				endQword = i;
 | |
| 		}
 | |
| 		// revisit this i in big loop since we did not include it
 | |
| 		i = endQword;
 | |
| 
 | |
| 		//create a new stateFrag
 | |
| 		StateFrag *stFrag;
 | |
| 		try { stFrag = new (StateFrag); }
 | |
| 		catch ( ... ) { 
 | |
| 			mdelete ( st, sizeof(StateSpeller),  "StateSpeller" );
 | |
| 			delete (st);
 | |
| 			g_errno = ENOMEM;
 | |
| 			log("Speller: new(%i): %s", sizeof(StateFrag),
 | |
| 			    mstrerror(g_errno));
 | |
| 			//continue;
 | |
| 			return true;
 | |
| 		}
 | |
| 		mnew ( stFrag, sizeof(StateFrag),
 | |
| 		       "StateFrag" );
 | |
| 
 | |
| 		stFrag->m_state = (void*) st;
 | |
| 		stFrag->m_narrowPhrase = st->m_narrowSearch;
 | |
| 		stFrag->m_q = q;
 | |
| 		stFrag->m_startQword = startQword;
 | |
| 		stFrag->m_endQword = endQword;
 | |
| 		stFrag->m_errno = 0;
 | |
| 		st->m_stFrag[st->m_numFrags] = stFrag;
 | |
| 		st->m_numFrags++;
 | |
| 		// blocked
 | |
| 		if ( !getRecommendation( stFrag ) ){
 | |
| 			continue;
 | |
| 		}
 | |
| 		st->m_numFragsReceived++;
 | |
| 	}
 | |
| 	// if outstanding frags
 | |
| 	if ( st->m_numFragsReceived < st->m_numFrags )
 | |
| 		return false;
 | |
| 	gotFrags(st);
 | |
| 	// delete state
 | |
| 	mdelete ( st, sizeof(StateSpeller),  "StateSpeller" );
 | |
| 	delete (st);
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| bool Speller::getRecommendation ( StateFrag *st ){
 | |
| 	st->m_recommended = false;
 | |
| 	st->m_numFound = 0;
 | |
| 	st->m_numNarrowPhrases = 0;
 | |
| 	char *dst = st->m_dst;
 | |
| 	
 | |
| 	// normalize this fragment and store in "dst"
 | |
| 	bool wasAlnum = true;
 | |
| 	for ( int32_t i = st->m_startQword; i <= st->m_endQword; i++ ){
 | |
| 		// start of each word
 | |
| 		st->m_wp[i] = dst;
 | |
| 		char *p = st->m_q->m_qwords[i].m_word;
 | |
| 		int32_t  plen = st->m_q->m_qwords[i].m_wordLen;
 | |
| 		for ( int32_t j = 0; dst-st->m_dst <MAX_FRAG_SIZE&&j<plen;j++ ) {
 | |
| 			if ( !getClean_utf8(p+j) ) 
 | |
| 				continue;
 | |
| 			// skip back to back punct/spaces
 | |
| 			if (j>0 && !is_alnum_utf8(p+j) &&!wasAlnum)
 | |
| 				continue;
 | |
| 			*dst = p[j];
 | |
| 			dst++;
 | |
| 			wasAlnum = is_alnum_utf8 ( p+j );
 | |
| 		}
 | |
| 		st->m_wplen[i] = dst - st->m_wp[i];
 | |
| 		st->m_isfound[i] = false;
 | |
| 	}
 | |
| 	*dst = '\0';
 | |
| 	
 | |
| 	// debug msg
 | |
| 	log(LOG_DEBUG,"speller: Getting recommendation for frag=%s",
 | |
| 	    st->m_dst);
 | |
| 
 | |
| 	// give each word in the phrase a chance to start the subphrase
 | |
| 	int32_t maxPhrase = st->m_endQword - st->m_startQword;
 | |
| 	if ( maxPhrase > MAX_WORDS_PER_PHRASE )
 | |
| 		maxPhrase = MAX_WORDS_PER_PHRASE;
 | |
| 
 | |
| 	// store the phraseLen and posn
 | |
| 	st->m_pLen = maxPhrase;
 | |
| 	st->m_pPosn = st->m_startQword;
 | |
| 	
 | |
| 	return launchReco(st);
 | |
| }
 | |
| 
 | |
| bool Speller::launchReco(StateFrag *st){
 | |
| 	// if we checked all the phrases or found all the words
 | |
| 	if ( st->m_numFound == st->m_endQword - st->m_startQword + 1 || 
 | |
| 	     st->m_pLen < 0 ){
 | |
| 		return true;
 | |
| 	}
 | |
| 
 | |
| 	bool launchPhrase = false;
 | |
|  	for ( ; st->m_pLen >= 0; st->m_pLen-- ){
 | |
| 		for ( ; st->m_pPosn + st->m_pLen <= st->m_endQword; 
 | |
| 		      st->m_pPosn++ ) {
 | |
| 			// find a word that can start the phrase
 | |
| 			QueryWord *qw = &st->m_q->m_qwords[st->m_pPosn];
 | |
| 			if ( !canStart (qw) )
 | |
| 				continue;
 | |
| 			// don't do this phrase if we have found even one
 | |
| 			// word in the phrase
 | |
| 			bool found = false;
 | |
| 			for ( int32_t k = st->m_pPosn; 
 | |
| 			      k <= st->m_pPosn + st->m_pLen; k++ ) {
 | |
| 				if ( st->m_isfound[k] ){
 | |
| 					found = true;
 | |
| 					break;
 | |
| 				}
 | |
| 			}
 | |
| 			if ( found )
 | |
| 				continue;
 | |
| 
 | |
| 			// cannot end on a stop word, punct, right-connected
 | |
| 			// word
 | |
| 			QueryWord *qwEnd = 
 | |
| 				&st->m_q->m_qwords[st->m_pPosn + st->m_pLen];
 | |
| 			if ( qwEnd->m_isStopWord || qwEnd->m_isPunct ||
 | |
| 			     qwEnd->m_rightConnected )
 | |
| 				continue;
 | |
| 			
 | |
| 			// found someone to start the phrase with
 | |
| 			// what is the new phrase parms?
 | |
| 			st->m_a = st->m_wp[st->m_pPosn];
 | |
| 			st->m_b = st->m_wp[st->m_pLen + st->m_pPosn]+
 | |
| 				st->m_wplen[st->m_pLen + st->m_pPosn];
 | |
| 			
 | |
| 			// also store the tmp char that we are changing
 | |
| 			st->m_c = *(st->m_b);
 | |
| 			*(st->m_b) = '\0';
 | |
| 
 | |
| 			// if it is just a number, don't get recommendation
 | |
| 			// lest we embarrass ourselves
 | |
| 			if ( st->m_pPosn == 0 && is_digit(st->m_a[0]) ) {
 | |
| 				char *k = st->m_a+1;
 | |
| 				while ( is_digit(*k) ) k++;
 | |
| 				if ( ! *k ) { 
 | |
| 					*st->m_b = st->m_c ; 
 | |
| 					continue;
 | |
| 				}
 | |
| 			}
 | |
| 
 | |
| 			// if it is an adult phrase, don't get a recommendation
 | |
| 			// check if isAdult really finds a word.
 | |
| 			char *adultLoc = NULL;
 | |
| 			if ( isAdult(st->m_a, gbstrlen(st->m_a), &adultLoc) &&
 | |
| 			     ( adultLoc == st->m_a || *(adultLoc-1) == ' ' ) ){
 | |
| 				// mark as found
 | |
| 				for ( int32_t k = st->m_pPosn; 
 | |
| 				      k <= st->m_pPosn + st->m_pLen; k++ )
 | |
| 					st->m_isfound[k] = true;
 | |
| 				*(st->m_b) = st->m_c;
 | |
| 				continue;
 | |
| 			}
 | |
| 			// if the phrase is in dict or in the top pop words,
 | |
| 			// phrase is found. Don't check if we are narrowing 
 | |
| 			// the phrase because we need to multicast anyways
 | |
| 			uint64_t h ;
 | |
| 			h = hash64d(st->m_a, gbstrlen(st->m_a) );
 | |
| 			if ( !st->m_narrowPhrase && 
 | |
| 			     getPhrasePopularity( st->m_a, h, false ) > 0 ){
 | |
| 				// mark as found
 | |
| 				for ( int32_t k = st->m_pPosn; 
 | |
| 				      k <= st->m_pPosn + st->m_pLen; k++ )
 | |
| 					st->m_isfound[k] = true;
 | |
| 				*(st->m_b) = st->m_c;
 | |
| 				continue;
 | |
| 			}
 | |
| 			launchPhrase = true;
 | |
| 			break;
 | |
| 		}
 | |
| 		if ( launchPhrase )
 | |
| 			break;
 | |
| 		st->m_pPosn = st->m_startQword;
 | |
| 	}
 | |
| 
 | |
| 	if ( st->m_pLen < 0 ){
 | |
| 		return true;
 | |
| 	}
 | |
| 
 | |
| 	// debug msg
 | |
| 	log(LOG_DEBUG,"speller: ----------");
 | |
| 	log(LOG_DEBUG,"speller: Checking phrase=%s", st->m_a);
 | |
| 
 | |
| 
 | |
| 	// launch for all the splits
 | |
| 	st->m_numRequests = 0;
 | |
| 	st->m_numReplies = 0;
 | |
| 
 | |
| 
 | |
| 	int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
 | |
| 	// don't send to twins...
 | |
| 	hostsPerSplit /= g_hostdb.m_numHostsPerShard;
 | |
| 	int32_t mySplit = g_hostdb.m_hostId % g_hostdb.m_indexSplits;
 | |
| 
 | |
| 	int32_t key = st->m_q->getQueryHash();//0;
 | |
| 	int32_t timeout = 30;
 | |
| 	int32_t niceness = 0;
 | |
| 	char request[MAX_FRAG_SIZE + 4];
 | |
| 	char *p = request;
 | |
| 	*(bool *)p = st->m_narrowPhrase;
 | |
| 	p += sizeof(bool);
 | |
| 	strcpy ( p, st->m_a );
 | |
| 	// send the null end too
 | |
| 	p += gbstrlen(st->m_a)+1;
 | |
| 	int32_t plen = p - request;
 | |
| 	for ( int32_t i = 0; i < hostsPerSplit; i++ ){
 | |
| 		// get the hostId of the host we're sending to
 | |
| 		uint32_t hostId = 
 | |
| 			mySplit + ( i * g_hostdb.m_indexSplits );
 | |
| 		Host *h = g_hostdb.getHost(hostId);
 | |
| 		st->m_mcast[i].reset();
 | |
| 
 | |
| 		bool status = st->m_mcast[i].
 | |
| 			send(request   ,
 | |
| 			     plen      , // request size
 | |
| 			     0x3d      , // msgType 0x3d
 | |
| 			     false     , // multicast owns m_request?
 | |
| 			     h->m_groupId, // group to send to (groupKey)
 | |
| 			     false     , // send to whole group?
 | |
| 			     key       , 
 | |
| 			     st        , // state data
 | |
| 			     NULL      , // state data
 | |
| 			     gotSpellerReplyWrapper ,
 | |
| 			     timeout      , // in seconds
 | |
| 			     niceness  ,
 | |
| 			     false     , // realtime?
 | |
| 			     -1        , // m_q->m_bestHandlingHostId ,
 | |
| 			     NULL      , // m_replyBuf   ,
 | |
| 			     0         , // MSG39REPLYSIZE,
 | |
| 			     // this is true if multicast should free
 | |
| 			     // the
 | |
| 			     // reply, otherwise caller is responsible
 | |
| 			     // for freeing it after calling
 | |
| 			     // getBestReply).
 | |
| 			     // actually, this should always be false,
 | |
| 			     // there
 | |
| 			     // is a bug in Multicast.cpp.
 | |
| 			     false        );
 | |
| 
 | |
| 		if (!status){
 | |
| 			st->m_numReplies++;
 | |
| 			log("speller: Multicast had error: %s",
 | |
| 			    mstrerror(g_errno));
 | |
| 			st->m_errno = g_errno;
 | |
| 			continue;
 | |
| 		}
 | |
| 		// blocked
 | |
| 		else
 | |
| 			st->m_numRequests++;
 | |
| 	}
 | |
| 
 | |
| 	if ( st->m_numReplies == st->m_numRequests )
 | |
| 		return true;
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
| void gotSpellerReplyWrapper( void *state, void *state2 ){
 | |
| 	StateFrag *stFrag = (StateFrag *) state;
 | |
| 	stFrag->m_numReplies++;
 | |
| 	if ( stFrag->m_numReplies < stFrag->m_numRequests )
 | |
| 		return;
 | |
| 	// blocked
 | |
| 	if ( !g_speller.gotSpellerReply(stFrag) )
 | |
| 		return;
 | |
| 
 | |
| 	StateSpeller *st = (StateSpeller *)stFrag->m_state;
 | |
| 	// One more frag received
 | |
| 	st->m_numFragsReceived++;
 | |
| 	if ( st->m_numFragsReceived < st->m_numFrags )
 | |
| 		return;
 | |
| 
 | |
| 	g_speller.gotFrags(st);
 | |
| 	// callback
 | |
| 	st->m_callback( st->m_state );
 | |
| 	// delete state
 | |
| 	mdelete ( st, sizeof(StateSpeller),  "StateSpeller" );
 | |
| 	delete (st);
 | |
| }
 | |
| 
 | |
| bool Speller::gotSpellerReply( StateFrag *st ){
 | |
| 	int32_t minScore = LARGE_SCORE;
 | |
| 	int32_t maxPop = -1;
 | |
| 	char *bestReco = NULL;
 | |
| 
 | |
| 	char *reply[MAX_UNIQUE_HOSTS_PER_SPLIT];
 | |
| 	int32_t  replySize[MAX_UNIQUE_HOSTS_PER_SPLIT];
 | |
| 	int32_t  replyMaxSize[MAX_UNIQUE_HOSTS_PER_SPLIT];
 | |
| 	bool  freeit;
 | |
| 	bool  found = false; //phrase was found in dict or pop words
 | |
| 	int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
 | |
| 	// don't send to twins...
 | |
| 	hostsPerSplit /= g_hostdb.m_numHostsPerShard;
 | |
| 
 | |
| 	int32_t  numNarrowPhrases[MAX_UNIQUE_HOSTS_PER_SPLIT];
 | |
| 	char *narrowPtrs[MAX_UNIQUE_HOSTS_PER_SPLIT];
 | |
| 
 | |
| 	// init narrowSearch arrays
 | |
| 	for ( int32_t i = 0; i < MAX_UNIQUE_HOSTS_PER_SPLIT; i++ ){
 | |
| 		numNarrowPhrases[i] = 0;
 | |
| 		narrowPtrs[i] = NULL;
 | |
| 	}
 | |
| 
 | |
| 	for ( int32_t i = 0; i < hostsPerSplit; i++ ){
 | |
| 		reply[i] = st->m_mcast[i].getBestReply( &replySize[i] ,
 | |
| 							&replyMaxSize[i] ,
 | |
| 							&freeit );
 | |
| 		// multicast may have an empty reply buffer if there was an
 | |
| 		// OOM error or something. m_errno should have been set, but
 | |
| 		// we have to loop through all the multicasts to free the
 | |
| 		// reply buffers.
 | |
| 		char *p = reply[i];
 | |
| 
 | |
| 		if ( g_errno || st->m_errno || !p){
 | |
| 			continue;
 | |
| 		}
 | |
| 		// was it found in dict
 | |
| 		bool foundInDict = *(bool *)p;
 | |
| 		p += sizeof(bool);
 | |
| 		if ( foundInDict )
 | |
| 			found = true;
 | |
| 
 | |
| 		// first is if there is a recommendation or not
 | |
| 		bool recommendation = *(bool *) p;
 | |
| 		p += sizeof (bool);
 | |
| 
 | |
| 		if ( !recommendation && !st->m_narrowPhrase )
 | |
| 			continue;
 | |
| 
 | |
| 		int32_t score = *(int32_t *)p;
 | |
| 		p += 4;
 | |
| 		int32_t pop = *(int32_t *)p;
 | |
| 		p += 4;
 | |
| 
 | |
| 		if ( recommendation ){
 | |
| 			log ( LOG_DEBUG,"speller: Received reco %s, "
 | |
| 			      "score=%"INT32", pop=%"INT32"", p, score, pop );
 | |
| 
 | |
| 			// we have a recommendation with score and pop
 | |
| 			// choose the one with the lowest score, and if the
 | |
| 			// score is same then the max pop 
 | |
| 			// HACK: we are getting bad recommendations for smaller
 | |
| 			// popularities. So don't consider them
 | |
| 			if ( pop > 8 && ( score < minScore || 
 | |
| 				   ( score == minScore && pop > maxPop ) ) ){
 | |
| 				bestReco = p;
 | |
| 				minScore = score;
 | |
| 				maxPop = pop;
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		p += gbstrlen(p) + 1;
 | |
| 		if ( st->m_narrowPhrase ){
 | |
| 			numNarrowPhrases[i] = *(int32_t *)p;
 | |
| 			p += 4;
 | |
| 			narrowPtrs[i] = p;
 | |
| 		}
 | |
| 	}
 | |
| 	
 | |
| 	// merge all the narrow results
 | |
| 	if ( st->m_narrowPhrase ){
 | |
| 		int32_t currPhrase[MAX_UNIQUE_HOSTS_PER_SPLIT];
 | |
| 		for ( int32_t i = 0; i < MAX_UNIQUE_HOSTS_PER_SPLIT; i++ )
 | |
| 			currPhrase[i] = 0;
 | |
| 		for ( int32_t i = 0; i < MAX_NARROW_SEARCHES; i++ ){
 | |
| 			int32_t maxHost = -1;
 | |
| 			int32_t maxPop = 0;
 | |
| 			for ( int32_t j = 0; j < hostsPerSplit; j++ ){
 | |
| 				if ( numNarrowPhrases[j] <= currPhrase[j] )
 | |
| 					continue;
 | |
| 				int32_t pop = *(int32_t *)narrowPtrs[j];
 | |
| 				if ( pop <= maxPop )
 | |
| 					continue;
 | |
| 				maxPop = pop;
 | |
| 				maxHost = j;
 | |
| 			}
 | |
| 			if ( maxHost < 0 )
 | |
| 				break;
 | |
| 			// 
 | |
| 			narrowPtrs[maxHost] += 4;
 | |
| 			strcpy( st->m_narrowPhrases[i], narrowPtrs[maxHost] );
 | |
| 			narrowPtrs[maxHost] +=gbstrlen(narrowPtrs[maxHost]) + 1;
 | |
| 			currPhrase[maxHost]++;
 | |
| 			st->m_numNarrowPhrases++;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// make narrowPhrase false here, so that its not launched a second time
 | |
| 	// for the same frag;
 | |
| 	st->m_narrowPhrase = false;
 | |
| 
 | |
| 	// revert
 | |
| 	*(st->m_b) = st->m_c;
 | |
| 
 | |
| 	// if we found a recommendation,or if the phrase was found in the
 | |
| 	// dictionary or pop words then mark all the
 | |
| 	// words that fall under the phrase as found
 | |
| 	if ( found || bestReco ){
 | |
| 		for ( int32_t k = st->m_pPosn; 
 | |
| 		      k <= st->m_pLen + st->m_pPosn; k++ )
 | |
| 			st->m_isfound[k] = true;
 | |
| 		st->m_numFound += st->m_pLen + 1;
 | |
| 	}
 | |
| 
 | |
| 	// if not found in the dictionary or a recommendation, copy the phrase
 | |
| 	if ( !found && bestReco){
 | |
| 		// this fragment is going to be recommended
 | |
| 		st->m_recommended = true;
 | |
| 		// insert our recommendation into the phrase to get a new one
 | |
| 		char *s1    = st->m_wp[st->m_startQword];
 | |
| 		int32_t  slen1 = st->m_a - st->m_wp[st->m_startQword];
 | |
| 		char *s2    = bestReco;
 | |
| 		int32_t  slen2 = gbstrlen(bestReco);
 | |
| 		char *s3    = st->m_b ;
 | |
| 		// store the difference in length between the reco and the 
 | |
| 		// original string
 | |
| 		int32_t  diff = slen2 - ( st->m_b - st->m_a );
 | |
| 		int32_t  slen3 = st->m_wp[st->m_endQword] + 
 | |
| 			st->m_wplen[st->m_endQword] - st->m_b;
 | |
| 
 | |
| 		if ( slen3 < 0 )
 | |
| 			slen3 = 0;
 | |
| 
 | |
| 		int32_t  tlen = slen1 + slen2 + slen3 ;
 | |
| 		if ( tlen > MAX_FRAG_SIZE ){
 | |
| 			log(LOG_LOGIC,"speller: buf too small. Fix me 3.");
 | |
| 			// blocked
 | |
| 			if ( !launchReco(st) )
 | |
| 				return false;
 | |
| 			return true;
 | |
| 		}
 | |
| 		// make substitution and store in "dst"
 | |
| 		char buf2 [ MAX_FRAG_SIZE];
 | |
| 		char *nf = buf2;
 | |
| 		gbmemcpy ( nf , s1 , slen1 ) ; nf += slen1;
 | |
| 		gbmemcpy ( nf , s2 , slen2 ) ; nf += slen2;
 | |
| 		gbmemcpy ( nf , s3 , slen3 ) ; 
 | |
| 		nf += slen3;
 | |
| 	
 | |
| 		// don't forget to NULL terminate
 | |
| 		*nf = '\0';
 | |
| 		// debug msg
 | |
| 		log( LOG_DEBUG,"speller: Trying substitution \"%s\"",
 | |
| 		     buf2 );
 | |
| 
 | |
| 		strcpy ( st->m_dst , buf2 );
 | |
| 
 | |
| 		// the pointers might have to be changed if the 
 | |
| 		// recommendation was not of the same length as the words
 | |
| 		if ( diff != 0 ){
 | |
| 			for ( int32_t k = st->m_pLen+st->m_pPosn+1; 
 | |
| 			      k <= st->m_endQword; k++ )
 | |
| 				st->m_wp[k] += diff;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// don't forget to free the replies
 | |
| 	for ( int32_t i = 0; i < hostsPerSplit; i++ )
 | |
| 		if ( reply[i] && replyMaxSize[i] > 0 )
 | |
| 			mfree( reply[i], replyMaxSize[i], "SpellerReplyBuf" );
 | |
| 	
 | |
| 	// go to the next position in the phrase. if we have reached the end
 | |
| 	// of the phrase position, decrement the phrase length and start again
 | |
| 	if ( st->m_pPosn + st->m_pLen >= st->m_endQword - 1 ){
 | |
| 		st->m_pLen--;
 | |
| 		st->m_pPosn = st->m_startQword;
 | |
| 	}
 | |
| 	else
 | |
| 		st->m_pPosn++;
 | |
| 
 | |
| 	if ( !launchReco(st) )
 | |
| 		return false;
 | |
| 	return true;
 | |
| }
 | |
| */
 | |
| // . break a NULL-terminated string down into a list of ptrs to the words
 | |
| // . return the number of words stored into "wp"
 | |
| /*
 | |
| int32_t Speller::getWords ( const char *s ,
 | |
| 			 char *wp     [MAX_FRAG_SIZE] ,
 | |
| 			 int32_t  wplen  [MAX_FRAG_SIZE] ,
 | |
| 			 bool *isstop                   ) {
 | |
| 	int32_t nwp = 0;
 | |
|  loop:
 | |
| 	// skip initial punct
 | |
| 	while ( *s && ! is_alnum ( *s ) ) s++;
 | |
| 	// bail if done
 | |
| 	if ( ! *s ) return nwp;
 | |
| 	// point to word
 | |
| 	wp [ nwp ] = (char *)s;
 | |
| 	// convenience ptr
 | |
| 	char *ww = (char *)s;
 | |
| 	// count over it
 | |
| 	while ( is_alnum ( *s ) ) s++;
 | |
| 	// how long is the word?
 | |
| 	int32_t slen = s - wp [ nwp ];
 | |
| 	// set length
 | |
| 	wplen [ nwp ] = slen ;
 | |
| 	// is it a stop word?
 | |
| 	if ( isstop ) {
 | |
| 		// TODO: make the stop words utf8!!!
 | |
| 		int64_t h = hash64Lower_utf8 ( ww , slen ) ;
 | |
| 		bool stop = ::isStopWord       ( ww , slen , h ) ;
 | |
| 		// BUT ok if Capitalized or number
 | |
| 		if ( stop ) {
 | |
| 			if ( is_digit (ww[0])    ) stop = false;
 | |
| 			if ( is_cap   (ww,slen ) ) stop = false;
 | |
| 			// e-mail, c file, c. s. lewis
 | |
| 			if ( slen  == 1 && ww[0] != 'a' ) stop = false;
 | |
| 		}
 | |
| 		isstop[nwp] = stop;
 | |
| 	}
 | |
| 	nwp++;
 | |
| 	goto loop;
 | |
| }
 | |
| */
 | |
| /*
 | |
| void Speller::gotFrags( void *state ){
 | |
| 	StateSpeller *st = (StateSpeller *) state;
 | |
| 	
 | |
| 	char *dptr = st->m_dst;
 | |
| 	char *nptr = st->m_nrw;
 | |
| 	bool recommendation = false;
 | |
| 	Query *q = st->m_q;
 | |
| 
 | |
| 	// . break query down into fragments
 | |
| 	// . each fragment is a string of words
 | |
| 	// . quotes and field names will separate fragments
 | |
| 	// . TODO: make field data in its own fragment
 | |
| 	int32_t nqw = q->m_numWords;
 | |
| 	int32_t currFrag = 0;
 | |
| 	for ( int32_t i = 0 ; i < nqw ; i++ ) {
 | |
| 		// get a word in the Query to start a fragment with
 | |
| 		QueryWord *qw = &q->m_qwords[i];
 | |
| 		// if he has a phraseSign, put it right away
 | |
| 		//if ( qw->m_phraseSign ) {
 | |
| 		// *dptr = qw->m_phraseSign;
 | |
| 		// dptr++;
 | |
| 		// }
 | |
| 		// can he start the phrase?
 | |
| 		// if he can't start our fragment, just copy over to "dst"
 | |
| 		if ( !canStart( qw )) {
 | |
| 			// copy to rp and get next word
 | |
| 			char *w    = qw->m_word;
 | |
| 			int32_t  wlen = qw->m_wordLen;
 | |
| 			if ( dptr + wlen >= st->m_dend ) { 
 | |
| 				g_errno = EBUFTOOSMALL; continue; }
 | |
| 			// watch out for LeFtP and RiGhP
 | |
| 			if      ( qw->m_opcode == OP_LEFTPAREN ) *dptr++ = '(';
 | |
| 			else if ( qw->m_opcode == OP_RIGHTPAREN) *dptr++ = ')';
 | |
| 			else if ( qw->m_opcode == OP_PIPE      ) *dptr++ = '|';
 | |
| 			else { 
 | |
| 				gbmemcpy ( dptr , w , wlen ); 
 | |
| 				dptr += wlen; 
 | |
| 			}
 | |
| 			*dptr = '\0';
 | |
| 			continue;
 | |
| 		}
 | |
| 		bool inQuotes  = qw->m_inQuotes;
 | |
| 		char fieldCode = qw->m_fieldCode;
 | |
| 		// . get longest continual fragment that starts with word #i
 | |
| 		// . get the following words that can be in a fragment
 | |
| 		//   that starts with word #i
 | |
| 		// . start of the frag
 | |
| 		int32_t  endQword = i;
 | |
| 		for ( ; i < nqw ; i++ ) {
 | |
| 			// . skip if we should
 | |
| 			// . keep punct, however
 | |
| 			QueryWord *qw1 = &q->m_qwords[i];
 | |
| 			if ( qw1->m_opcode                 ) break;
 | |
| 			if ( qw1->m_inQuotes  != inQuotes  ) break;
 | |
| 			if ( qw1->m_fieldCode != fieldCode ) break;
 | |
| 			if ( qw1->m_ignoreWord== IGNORE_FIELDNAME ) break;
 | |
| 			if ( qw1->m_phraseSign && !qw1->m_rightConnected ) 
 | |
| 				break;
 | |
| 			// are we punct?
 | |
| 			if ( ! is_alnum_utf8 (qw1->m_word) ) 
 | |
| 				endQword = i - 1;
 | |
| 			else    
 | |
| 				endQword = i;
 | |
| 		}
 | |
| 		// revisit this i in big loop since we did not include it
 | |
| 		i = endQword;
 | |
| 
 | |
| 		// OOM errors might cause us not to launch frags
 | |
| 		if ( currFrag >= st->m_numFrags )
 | |
| 			continue;
 | |
| 		StateFrag *stFrag = st->m_stFrag[currFrag];
 | |
| 		// don't breech
 | |
| 		if ( dptr + gbstrlen(stFrag->m_dst) >= st->m_dend ) {
 | |
| 			g_errno = EBUFTOOSMALL;
 | |
| 		}
 | |
| 		else {
 | |
| 			// store it
 | |
| 			strcpy ( dptr, stFrag->m_dst );
 | |
| 			dptr += gbstrlen ( dptr );
 | |
| 			// add a space between fragments
 | |
| 			//			*dptr = ' ';
 | |
| 			//dptr++;
 | |
| 			*dptr = '\0';
 | |
| 			// set the flag
 | |
| 			if ( stFrag->m_recommended )
 | |
| 				recommendation = true;
 | |
| 		}
 | |
| 		// copy over all the narrow searches that can fit
 | |
| 		for ( int32_t j = 0; j < stFrag->m_numNarrowPhrases; j++ ){
 | |
| 			// don't breech
 | |
| 			if ( nptr +gbstrlen(stFrag->m_narrowPhrases[j]) >
 | |
| 			     st->m_nend )
 | |
| 				break;
 | |
| 			strcpy(nptr, stFrag->m_narrowPhrases[j]);
 | |
| 			nptr += gbstrlen(stFrag->m_narrowPhrases[j]) + 1;
 | |
| 			(*st->m_numNarrow)++;
 | |
| 		}
 | |
| 		
 | |
| 		mdelete(stFrag, sizeof(StateFrag), "StateFrag");
 | |
| 		delete (stFrag);
 | |
| 		// now we get the next frag
 | |
| 		currFrag++;
 | |
| 	}
 | |
| 	if ( !recommendation )
 | |
| 		*st->m_dst = '\0';
 | |
| 	
 | |
| 	int64_t now = gettimeofdayInMilliseconds();
 | |
| 	if ( now - st->m_start > 50 )
 | |
| 		log(LOG_INFO,"speller: Took %"INT64" ms to spell check %s",
 | |
| 		    now - st->m_start, st->m_q->getQuery() );
 | |
| 	return;
 | |
| }
 | |
| */
 | |
| 
 | |
| 
 | |
| bool Speller::generateDicts ( int32_t numWordsToDump , char *coll ){
 | |
| 	m_language[2].setLang(2);
 | |
| 	//m_language[2].generateDicts ( numWordsToDump, coll );
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
| char *Speller::getRandomWord() {
 | |
| 	int32_t offset = rand() % m_unifiedBuf.length();//Size;
 | |
| 	// find nearest \0
 | |
| 	char *p = m_unifiedBuf.getBufStart() + offset;
 | |
| 	// backup until we hit \0
 | |
| 	for ( ; p > m_unifiedBuf.getBufStart() && *p ; p-- );
 | |
| 	// now advance!
 | |
| 	if ( p > m_unifiedBuf.getBufStart() ) p++;
 | |
| 	// that is the word
 | |
| 	return p;
 | |
| }
 | |
| 
 | |
| // The unified dict is the combination of the word list, title rec and the top
 | |
| // query dict of all languages. It has to be created by loading each languages
 | |
| // dict into memory using Language.loadWordList(), loadTitleRecDict(), etc
 | |
| bool Speller::loadUnifiedDict() {
 | |
| 
 | |
| 	bool building = false;
 | |
| 
 | |
|  reload:
 | |
| 
 | |
| 	bool needRebuild = false;
 | |
| 
 | |
| 	m_unifiedBuf.purge();
 | |
| 	m_unifiedBuf.setLabel("unibuf");
 | |
| 
 | |
| 	// this MUST be there
 | |
| 	if ( m_unifiedBuf.fillFromFile(g_hostdb.m_dir,
 | |
| 				       "unifiedDict-buf.txt" ) == 0 ) 
 | |
| 		needRebuild = true;
 | |
| 
 | |
| 	// . give it a million slots
 | |
| 	// . unified dict currently has 1340223 entries
 | |
| 	m_unifiedDict.set ( 8,4, 2*1024*1024,NULL,0,false,0,"udictht");	
 | |
| 
 | |
| 	// try to load in the hashtable and the buffer directly
 | |
| 	if ( ! m_unifiedDict.load(g_hostdb.m_dir,"unifiedDict-map.dat"))
 | |
| 		needRebuild = true;
 | |
| 
 | |
| 	if ( ! needRebuild ) {
 | |
| 		// convert unifiedBuf \n's to \0's
 | |
| 		char *start = m_unifiedBuf.getBufStart();
 | |
| 		char *end   = start + m_unifiedBuf.length();
 | |
| 		for ( char *p = start ; p < end ; p++ )
 | |
| 			if ( *p == '\n' ) *p = '\0';
 | |
| 		log(LOG_DEBUG,"speller: done loading successfully");
 | |
| 
 | |
| 		// a quick little checksum
 | |
| 		if ( ! g_conf.m_isLive ) return true;
 | |
| 
 | |
| 		// the size
 | |
| 		int64_t h1 = m_unifiedDict.getNumSlotsUsed();
 | |
| 		int64_t h2 = m_unifiedBuf .length();
 | |
| 		int64_t h = hash64 ( h1 , h2 );
 | |
| 		char *tail1 = (char *)m_unifiedDict.m_keys;
 | |
| 		char *tail2 = m_unifiedBuf.getBufStart()+h2-1000;
 | |
| 		h = hash64 ( tail1 , 1000 , h );
 | |
| 		h = hash64 ( tail2 , 1000 , h );
 | |
| 		//int64_t n = 8346765853685546681LL;
 | |
| 		int64_t n = -14450509118443930LL;
 | |
| 		if ( h != n ) {
 | |
| 			log("gb: unifiedDict-buf.txt or "
 | |
| 			    "unifiedDict-map.dat "
 | |
| 			    "checksum is not approved for "
 | |
| 			    "live service (%"INT64" != %"INT64")" ,h,n);
 | |
| 			//return false;
 | |
| 		}
 | |
| 
 | |
| 		return true;
 | |
| 	}
 | |
| 
 | |
| 	if ( building ) {
 | |
| 		log("gb: rebuild failed. exiting.");
 | |
| 		exit(0);
 | |
| 	}
 | |
| 
 | |
| 	building = true;
 | |
| 
 | |
| 	log("gb: REBUILDING unifiedDict-buf.txt and unifiedDict-map.dat");
 | |
| 
 | |
| 	// just in case that was there and the buf wasn't
 | |
| 	m_unifiedDict.clear();
 | |
| 	// or vice versa
 | |
| 	m_unifiedBuf.purge();
 | |
| 
 | |
| 	// load the .txt file. this is REQUIRED for rebuild
 | |
| 	SafeBuf ub;
 | |
| 	if ( ub.fillFromFile (g_hostdb.m_dir,"unifiedDict.txt") <= 0 )
 | |
| 		return false;
 | |
| 
 | |
| 	//
 | |
| 	// change \n to \0
 | |
| 	// TODO: filter out the first word from each line?
 | |
| 	//
 | |
| 	char *start = ub.getBufStart();
 | |
| 	char *end   = start + ub.length();
 | |
| 	for ( char *p = start ; p < end ; p++ )
 | |
| 		if ( *p == '\n' ) *p = '\0';
 | |
| 
 | |
| 
 | |
| 	// now scan wikitionary file wiktionary-lang.txt to get even
 | |
| 	// more words! this file is generated from Wiktionary.cpp when
 | |
| 	// it scans the wiktionary xml dump to generate the other
 | |
| 	// wiktionary-syns.dat and wiktionary-buf.txt files. it also
 | |
| 	// cranks this file out because we can use it since we do not
 | |
| 	// have czech in the unifiedDict.txt file.
 | |
| 	SafeBuf wkfBuf;
 | |
| 	if ( wkfBuf.fillFromFile ( g_hostdb.m_dir,"wiktionary-lang.txt") <= 0 )
 | |
| 		return false;
 | |
| 
 | |
| 	// scan each line
 | |
| 	char *p = wkfBuf.getBufStart();
 | |
| 	char *pend = p + wkfBuf.length();
 | |
| 	HashTableX wkfMap;
 | |
| 	// true = allow dups. because same word can appear in multiple langs
 | |
| 	if ( ! wkfMap.set ( 8,1,1000000,NULL,0,true,0,"wkfmap") )
 | |
| 		return false;
 | |
| 
 | |
| 	// "fr|livre" is how it's formatted
 | |
| 	for ( ; p && p < pend ; p = wkfBuf.getNextLine(p) ) {
 | |
| 		char *start = p;
 | |
| 		// skip til |
 | |
| 		for ( ; *p && *p != '|' ; p++ );
 | |
| 		// sanity check
 | |
| 		if ( *p != '|' ) { char *xx=NULL;*xx=0; }
 | |
| 		// tmp NULL that
 | |
| 		*p = '\0';
 | |
| 		char langId = getLangIdFromAbbr(start);
 | |
| 		// revert
 | |
| 		*p = '|';
 | |
| 		if ( langId == langUnknown )
 | |
| 			continue;
 | |
| 		if ( langId == langTranslingual )
 | |
| 			continue;
 | |
| 		// skip |
 | |
| 		p++;
 | |
| 		// that's the word
 | |
| 		char *word = p;
 | |
| 		// find end
 | |
| 		char *end = p;
 | |
| 		for ( ; *end && *end != '\n' ; end++ ) ;
 | |
| 		// so hash it up
 | |
| 		int64_t wid = hash64d ( word , end - word );
 | |
| 		// debug point
 | |
| 		//if ( wid == 5000864073612302341LL )
 | |
| 		//	log("download");
 | |
| 		// add it to map
 | |
| 		if ( ! wkfMap.addKey ( &wid , &langId ) ) return false;
 | |
| 	}
 | |
| 
 | |
| 
 | |
| 
 | |
| 	//
 | |
| 	// scan unifiedDict.txt file
 | |
| 	//
 | |
| 	int32_t totalCollisions = 0;
 | |
| 	uint64_t atline = 0;
 | |
| 	p = start;
 | |
| 	while ( p < end ) {
 | |
| 		atline++;
 | |
| 		char *phrase = p;
 | |
| 		// if line is a comment skip it
 | |
| 		if ( *p == '#' ){
 | |
| 			p += gbstrlen(p) + 1;
 | |
| 			continue;
 | |
| 		}
 | |
| 		// skip phrase
 | |
| 		while ( *p != '\t' )
 | |
| 			p++;
 | |
| 		// Null end the phrase
 | |
| 		*p = '\0';
 | |
| 
 | |
| 		// skip empty phrases
 | |
| 		if(gbstrlen(phrase) < 1) {
 | |
| 			log(LOG_WARN,
 | |
| 				"spell: Got zero length entry in unifiedDict "
 | |
| 			    "at line %"UINT64", skipping\n",
 | |
| 				atline);
 | |
| 			p += gbstrlen(p) + 1;
 | |
| 			continue;
 | |
| 		}
 | |
| 
 | |
| 		// skip single byte words that are not alphabetic
 | |
| 		// Anything over 'Z' is likely unicode, so don't bother
 | |
| 		if(gbstrlen(phrase) == 1 && (phrase[0] < 'a')) {
 | |
| 			log(LOG_WARN,
 | |
| 				"spell: Got questionable entry in "
 | |
| 			    "unifiedDict at line %"UINT64", skipping: %s\n",
 | |
| 				atline,p);
 | |
| 			p += gbstrlen(p) + 1;
 | |
| 			continue;
 | |
| 		}
 | |
| 		// . i need to move everything over to utf8!!!
 | |
| 		// . this is the same hash function used by Words.cpp so that
 | |
| 		p++;
 | |
| 		// phonet
 | |
| 		char *phonet = p;
 | |
| 		// next is the phonet
 | |
| 		while ( *p != '\t' )
 | |
| 			p++;
 | |
| 		// Null end the phonet
 | |
| 		*p = '\0';
 | |
| 		p++;
 | |
| 
 | |
| 		uint64_t key = hash64d(phrase,gbstrlen(phrase));
 | |
| 
 | |
| 		// make sure we haven't added this word/phrase yet
 | |
| 		if ( m_unifiedDict.isInTable ( &key ) ) {
 | |
| 			totalCollisions++;
 | |
| 			p += gbstrlen(p) + 1;
 | |
| 			continue;
 | |
| 		}
 | |
| 
 | |
| 		// reset lang vector
 | |
| 		int64_t pops[MAX_LANGUAGES];
 | |
| 		memset ( pops , 0 , MAX_LANGUAGES * 8 );
 | |
| 
 | |
| 		// see how many langs this key is in in unifiedDict.txt file
 | |
| 		char *phraseRec = p;
 | |
| 		getPhraseLanguages2 ( phraseRec , pops );
 | |
| 
 | |
| 		// make all pops positive if it has > 1 lang already
 | |
| 		//int32_t count = 0;
 | |
| 		//for ( int32_t i = 0 ; i < MAX_LANGUAGES ; i++ )
 | |
| 		//	if ( pops[i] ) count++;
 | |
| 
 | |
| 		int32_t imax = MAX_LANGUAGES;
 | |
| 		//if ( count <= 1 ) imax = 0;
 | |
| 		// assume none are in official dict
 | |
| 		// seems like nanny messed things up, so undo that
 | |
| 		// and set it negative if in wiktionary in loop below
 | |
| 		for ( int32_t i = 0 ; i < imax ; i++ )
 | |
| 			// HOWEVER, if it is -1 leave it be, i think it
 | |
| 			// was probably correct in that case for some reason.
 | |
| 			// Wiktionary fails to get a TON of forms for
 | |
| 			// many foreign languages in the english dict.
 | |
| 			// so nanny got these from some dict, so try to
 | |
| 			// keep them.
 | |
| 			// like 'abelhudo'
 | |
| 			// http://pt.wiktionary.org/wiki/abelhudo
 | |
| 			// and is not in en.wiktionary.org
 | |
| 			// . NO! because it has "ein" as english with
 | |
| 			//   a -1 popularity as well as "ist"! reconsider
 | |
| 			if ( pops[i] < -1 ) pops[i] *= -1;
 | |
| 			//if ( pops[i] < 0 ) pops[i] *= -1;
 | |
| 
 | |
| 		// debug
 | |
| 		//if ( strcmp(phrase,"download") == 0 )
 | |
| 		//	log("hey");
 | |
| 
 | |
| 		// now add in from wiktionary
 | |
| 		int32_t slot = wkfMap.getSlot ( &key );
 | |
| 		for ( ; slot >= 0 ; slot = wkfMap.getNextSlot(slot,&key) ) {
 | |
| 			uint8_t langId = *(char *)wkfMap.getDataFromSlot(slot);
 | |
| 			if ( langId == langUnknown ) continue;
 | |
| 			if ( langId == langTranslingual ) continue;
 | |
| 			// if it marked as already in that dictionary, cont
 | |
| 			if ( pops[langId] < 0 ) continue;
 | |
| 			// if it is positive, make it negative to mark
 | |
| 			// it as being in the official dictionary
 | |
| 			// -1 means pop unknown but in dictionary
 | |
| 			if ( pops[langId] == 0 ) pops[langId]  = -1;
 | |
| 			else                     pops[langId] *= -1;
 | |
| 		}
 | |
| 
 | |
| 		// save the offset
 | |
| 		int32_t offset = m_unifiedBuf.length();
 | |
| 
 | |
| 		// print the word/phrase and its phonet, if any
 | |
| 		m_unifiedBuf.safePrintf("%s\t%s\t",phrase,phonet);
 | |
| 
 | |
| 		int32_t count = 0;
 | |
| 		// print the languages and their popularity scores
 | |
| 		for ( int32_t i = 0 ; i < MAX_LANGUAGES ; i++ ) {
 | |
| 			if ( pops[i] == 0 ) continue;
 | |
| 			// skip "unknown" what does that really mean?
 | |
| 			if ( i == 0 ) continue;
 | |
| 			m_unifiedBuf.safePrintf("%"INT32"\t%"INT32"\t",
 | |
| 						i,(int32_t)pops[i]);
 | |
| 			count++;
 | |
| 		}
 | |
| 		// if none, revert
 | |
| 		if ( count == 0 ) {
 | |
| 			m_unifiedBuf.setLength(offset);
 | |
| 			// skip "p" to next line in unifiedBuf.txt
 | |
| 			p += gbstrlen(p) + 1;
 | |
| 			continue;
 | |
| 		}
 | |
| 
 | |
| 		// trim final tab i guess
 | |
| 		m_unifiedBuf.incrementLength(-1);
 | |
| 		// end line
 | |
| 		m_unifiedBuf.pushChar('\n');
 | |
| 
 | |
| 		// directly point to the (lang, score) tuples
 | |
| 		m_unifiedDict.addKey(&key, &offset);
 | |
| 
 | |
| 		// skip "p" to next line in unifiedBuf.txt
 | |
| 		p += gbstrlen(p) + 1;
 | |
| 	}
 | |
| 
 | |
| 	log (LOG_WARN,"spell: got %"INT32" TOTAL collisions in unified dict",
 | |
| 	     totalCollisions);
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 	HashTableX dedup;
 | |
| 	dedup.set(8,0,1000000,NULL,0,false,0,"dmdm");
 | |
| 
 | |
| 	// . now add entries from wkfBuf that were not also in "ub"
 | |
| 	// . format is "<langAbbr>|<word>\n"
 | |
| 	p = wkfBuf.getBufStart();
 | |
| 	end = p + wkfBuf.length();
 | |
| 	for ( ; p ; p = wkfBuf.getNextLine(p) ) {
 | |
| 		//char *langAbbr = p;
 | |
| 		for ( ; *p && *p !='\n' && *p !='|' ; p++ );
 | |
| 		if ( *p != '|' ) {
 | |
| 			log("speller: bad format in wiktionary-lang.txt");
 | |
| 			char *xx=NULL;*xx=0;
 | |
| 		}
 | |
| 		//*p = '\0';
 | |
| 		//uint8_t langId = getLangIdFromAbbr ( langAbbr );
 | |
| 		//*p = '|';
 | |
| 		// get word
 | |
| 		char *word = p + 1;
 | |
| 		// get end of it
 | |
| 		for ( ; *p && *p !='\n' ; p++ );
 | |
| 		if ( *p != '\n' ) {
 | |
| 			log("speller: bad format in wiktionary-lang.txt");
 | |
| 			char *xx=NULL;*xx=0;
 | |
| 		}
 | |
| 		int32_t wordLen = p - word;
 | |
| 		// wiktinary has like prefixes ending in minus. skip!
 | |
| 		if ( word[wordLen-1] == '-' ) continue;
 | |
| 		// suffix in wiktionary? skip
 | |
| 		if ( word[0] == '-' ) continue;
 | |
| 		// .zr .dd
 | |
| 		if ( word[0] == '.' ) continue;
 | |
| 
 | |
| 		// hash the word
 | |
| 		int64_t key = hash64d ( word , wordLen );
 | |
| 
 | |
| 		// skip if we did it in the above loop
 | |
| 		if ( m_unifiedDict.isInTable ( &key ) ) continue;
 | |
| 
 | |
| 		// skip if already did it in this loop
 | |
| 		if ( dedup.isInTable ( &key ) ) continue;
 | |
| 		if ( ! dedup.addKey ( &key ) ) return false;
 | |
| 
 | |
| 		// reset lang vector
 | |
| 		int64_t pops[MAX_LANGUAGES];
 | |
| 		memset ( pops , 0 , MAX_LANGUAGES * 8 );
 | |
| 
 | |
| 		// now add in from wiktionary map
 | |
| 		int32_t slot = wkfMap.getSlot ( &key );
 | |
| 		for ( ; slot >= 0 ; slot = wkfMap.getNextSlot(slot,&key) ) {
 | |
| 			uint8_t langId = *(char *)wkfMap.getDataFromSlot(slot);
 | |
| 			if ( langId == langUnknown ) continue;
 | |
| 			if ( langId == langTranslingual ) continue;
 | |
| 			if ( pops[langId] ) continue;
 | |
| 			// -1 means pop unknown but in dictionary
 | |
| 			pops[langId] = -1;
 | |
| 		}
 | |
| 
 | |
| 		
 | |
| 		// save the offset
 | |
| 		int32_t offset = m_unifiedBuf.length();
 | |
| 
 | |
| 		// . print the word/phrase and its phonet, if any
 | |
| 		// . phonet is unknown here...
 | |
| 		//char *phonet = "";
 | |
| 		m_unifiedBuf.safeMemcpy ( word, wordLen );
 | |
| 		m_unifiedBuf.safePrintf("\t\t");//word,phonet); 
 | |
| 
 | |
| 		int32_t count = 0;
 | |
| 		// print the languages and their popularity scores
 | |
| 		for ( int32_t i = 0 ; i < MAX_LANGUAGES ; i++ ) {
 | |
| 			if ( pops[i] == 0 ) continue;
 | |
| 			// skip "unknown" what does that really mean?
 | |
| 			if ( i == 0 ) continue;
 | |
| 			m_unifiedBuf.safePrintf("%"INT32"\t%"INT32"\t",
 | |
| 						i,(int32_t)pops[i]);
 | |
| 			count++;
 | |
| 		}
 | |
| 		// if none, revert
 | |
| 		if ( count == 0 ) {
 | |
| 			m_unifiedBuf.setLength(offset);
 | |
| 			continue;
 | |
| 		}
 | |
| 
 | |
| 		// trim final tab i guess
 | |
| 		m_unifiedBuf.incrementLength(-1);
 | |
| 		// end line
 | |
| 		m_unifiedBuf.pushChar('\n');
 | |
| 
 | |
| 		// directly point to the (lang, score) tuples
 | |
| 		m_unifiedDict.addKey(&key, &offset);
 | |
| 
 | |
| 	}
 | |
| 
 | |
| 
 | |
| 
 | |
| 	
 | |
| 
 | |
| 
 | |
| 	// save the text too! a merge of unifiedDict.txt and
 | |
| 	// wiktionary-lang.txt!!!
 | |
| 	if ( m_unifiedBuf.saveToFile(g_hostdb.m_dir,"unifiedDict-buf.txt") <=0)
 | |
| 		return false;
 | |
| 
 | |
| 	// save it
 | |
| 	if ( m_unifiedDict.save(g_hostdb.m_dir,"unifiedDict-map.dat")<=0 )
 | |
| 		return false;
 | |
| 
 | |
| 	// start over and load what we created
 | |
| 	goto reload;
 | |
| 
 | |
| 	// hmmm... seems like we need to re-run for some reason
 | |
| 	log("spell: PLEASE RERUN gb");
 | |
| 	log("spell: PLEASE RERUN gb");
 | |
| 	log("spell: PLEASE RERUN gb");
 | |
| 	exit(0);
 | |
| 
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| // in case the language is unknown, just give the pop of the
 | |
| // first found language
 | |
| int32_t Speller::getPhrasePopularity ( char *str, uint64_t h,
 | |
| 				    bool checkTitleRecDict,
 | |
| 				    unsigned char langId ){
 | |
| 	//char *xx=NULL;*xx=0;
 | |
| 
 | |
| 	// hack fixes. 
 | |
| 	// common word like "and"?
 | |
| 	if ( isCommonWord(h) ) return MAX_PHRASE_POP;
 | |
| 	// another common word check
 | |
| 	if ( isQueryStopWord(NULL,0,h,langId) ) return MAX_PHRASE_POP;
 | |
| 	// single letter?
 | |
| 	if ( str && str[0] && str[1] == '\0' ) return MAX_PHRASE_POP;
 | |
| 	// 0-99 only
 | |
| 	if ( str && is_digit(*str) ) {
 | |
| 		if ( !str[1]) return MAX_PHRASE_POP;
 | |
| 		if ( is_digit(str[1])&& !str[2]) return MAX_PHRASE_POP;
 | |
| 	}
 | |
| 
 | |
| 	// what up with this?
 | |
| 	//if ( !s ) return 0;
 | |
| 	int32_t slot = m_unifiedDict.getSlot(&h);
 | |
| 	// if not in dictionary assume 0 popularity
 | |
| 	if ( slot == -1 ) return 0;
 | |
| 	//char *p = *(char **)m_unifiedDict.getValueFromSlot(slot);
 | |
| 	int32_t offset =  *(int32_t *)m_unifiedDict.getValueFromSlot(slot);
 | |
| 	char *p = m_unifiedBuf.getBufStart() + offset;
 | |
| 	char *pend = p + gbstrlen(p);
 | |
| 
 | |
| 	// skip word itself
 | |
| 	while ( *p != '\t' ) p++;
 | |
| 	p++;
 | |
| 	// skip phonet, if any
 | |
| 	while ( *p != '\t' ) p++;
 | |
| 	p++;
 | |
| 
 | |
| 	int32_t max = 0;
 | |
| 
 | |
| 	// the tuples are in ascending order of the langid
 | |
| 	// get to the right language
 | |
| 	while ( p < pend ){
 | |
| 
 | |
| 		int32_t currLang = atoi(p);
 | |
| 
 | |
| 		// the the pops are sorted by langId, return 0 if the lang
 | |
| 		// was not found
 | |
| 		if ( langId != langUnknown && currLang > langId )
 | |
| 			return 0;
 | |
| 			
 | |
| 		// skip language
 | |
| 		while ( *p != '\t' ) p++;
 | |
| 		p++;
 | |
| 
 | |
| 		int32_t score = atoi(p);
 | |
| 
 | |
| 		// i think negative scores mean it is only from titlerec and
 | |
| 		// not in any of the dictionaries.
 | |
| 		if ( score < 0 )
 | |
| 			score *= -1;
 | |
| 
 | |
| 		if ( currLang == langId && langId != langUnknown )
 | |
| 			return score;
 | |
| 
 | |
| 		// if lang is unknown get max
 | |
| 		if ( score > max ) max = score;
 | |
| 
 | |
| 		// skip that score and go to the next <lang> <pop> tuple
 | |
| 		while ( *p != '\t' && *p != '\0' ) p++;
 | |
| 		p++;
 | |
| 
 | |
| 	}
 | |
| 	return max;
 | |
| }
 | |
| 
 | |
| // splits words and checks if they form a porn word or not. montanalinux.org 
 | |
| // is showing up as porn because it has 'anal' in the hostname. So try to
 | |
| // find a combination of words such that they are NOT porn.
 | |
| // try this only after isAdult() succeeds.
 | |
| // Always tries to find longer words first. so 'montanalinux' is split as
 | |
| // 'montana' and 'linux' and not as 'mont', 'analinux'
 | |
| // if it finds a seq of words leading upto a porn word, then it returns true
 | |
| // eg. shall split montanalinux into 'mont', 'anal', and return true without
 | |
| // checking if 'inux' is a word. Need to do this because isAdult() cannot
 | |
| // define where an adult word has ended. 
 | |
| // TODO: chatswingers.com NOT identified as porn because it is split as 
 | |
| // 'chats' and 'wingers'.
 | |
| 
 | |
| bool Speller::canSplitWords( char *s, int32_t slen, bool *isPorn, 
 | |
| 			     char *splitWords,
 | |
| 			     unsigned char langId, int32_t encodeType ){
 | |
| 	//char *xx=NULL;*xx=0;
 | |
| 
 | |
| 	*isPorn = false;
 | |
| 	char *index[1024];
 | |
| 	if ( slen == 0 )
 | |
| 		return true;
 | |
| 	*splitWords = '\0';
 | |
| 
 | |
| 	// this is the current word we're on
 | |
| 	int32_t curr = 0;
 | |
| 	index[curr++] = s;
 | |
| 	index[curr] = s + slen;
 | |
| 	while ( curr > 0 ){
 | |
| 		char *nextWord = NULL;
 | |
| 		while ( findNext( index[curr-1], index[curr], 
 | |
| 				  &nextWord, isPorn, langId, encodeType ) ){
 | |
| 			// next word in chain
 | |
| 			index[curr++] = nextWord;
 | |
| 			index[curr] = s + slen;
 | |
| 			// found a porn word OR 
 | |
| 			// finished making a sequence of words
 | |
| 			if ( *isPorn || nextWord == s + slen ){
 | |
| 				char *p = splitWords;
 | |
| 				for ( int32_t k = 1; k < curr; k++ ){
 | |
| 					gbmemcpy (p, index[k - 1], 
 | |
| 						index[k] - index[k - 1]);
 | |
| 					p += index[k] - index[k - 1];
 | |
| 					*p = ' ';
 | |
| 					p++;
 | |
| 				}
 | |
| 				*p = '\0';
 | |
| 				return true;
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		// did not find any word. reduce the current position
 | |
| 		while ( --curr > 0 ){
 | |
| 			if ( curr > 0 && index[curr] > index[curr-1] ){
 | |
| 				index[curr]--;
 | |
| 				break;
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
| bool Speller::findNext( char *s, char *send, char **nextWord, bool *isPorn,
 | |
| 			unsigned char langId, int32_t encodeType ){
 | |
| 	//char *xx=NULL;*xx=0;
 | |
| 
 | |
| 	char *loc = NULL;
 | |
| 	int32_t slen = send - s;
 | |
| 	// check if there is an adult word in there
 | |
| 	// NOTE: The word 'adult' gives a lot of false positives, so even 
 | |
| 	// though it is in the isAdult() list, skip it.
 | |
| 	// s/slen constitutes an individual word.
 | |
| 	if ( isAdult ( s, slen, &loc ) && strncmp ( s, "adult", 5 ) != 0 ){
 | |
| 		// if this string starts with the adult word, don't check 
 | |
| 		// further
 | |
| 		if ( loc == s ){
 | |
| 			*isPorn = true;
 | |
| 			*nextWord = send;
 | |
| 			return true;
 | |
| 		}
 | |
| 	}
 | |
| 	for ( char *a = send; a > s; a-- ){
 | |
| 		// a hack, if the word is only one letter long, check if it
 | |
| 		// is 'a' or 'i'. If not then continue
 | |
| 		if ( a - s == 1 && *s != 'a' && *s != 'i')
 | |
| 			continue;
 | |
| 		// another hack, the end word of the string cannot be 2 letters
 | |
| 		// or less. freesex was being split as 'frees ex'
 | |
| 		if ( a == send && a - s <= 2 )
 | |
| 			continue;
 | |
| 
 | |
| 		// do not allow "ult" to be a word because it always will
 | |
| 		// split "adult" into "ad+ult"
 | |
| 		if ( a - s == 3 && s[0]=='u' && s[1]=='l' && s[2]=='t' )
 | |
| 			continue;
 | |
| 		// adultsiteratings = "ad ul ts it era tings" 
 | |
| 		if ( a - s == 2 && s[0]=='u' && s[1]=='l' )
 | |
| 			continue;
 | |
| 		// lashaxxxnothing = "lash ax xx nothing"
 | |
| 		if ( a - s == 2 && s[0]=='u' && s[1]=='l' )
 | |
| 			continue;
 | |
| 		// livesexasian = "lives ex asian"
 | |
| 		if ( a - s == 2 && s[0]=='e' && s[1]=='x' )
 | |
| 			continue;
 | |
| 		// fuckedtits = "fu ck edt its"
 | |
| 		if ( a - s == 2 && s[0]=='c' && s[1]=='k' )
 | |
| 			continue;
 | |
| 		// blogsexe = "blogs exe" ... many others
 | |
| 		// any 3 letter fucking word starting with "ex"
 | |
| 		if ( a - s == 3 && s[0]=='e' && s[1]=='x' )
 | |
| 			continue;
 | |
| 		// shemales = "*s hem ales"
 | |
| 		if ( a - s == 4 && s[0]=='a' &&s[1]=='l'&&s[2]=='e'&&s[3]=='s')
 | |
| 			continue;
 | |
| 		// grooverotica = "groove rot ica"
 | |
| 		if ( a - s == 3 && s[0]=='i' && s[1]=='c' && s[2]=='a' )
 | |
| 			continue;
 | |
| 		// dinerotik = dinero tik
 | |
| 		if ( a - s == 3 && s[0]=='t' && s[1]=='i' && s[2]=='k' )
 | |
| 			continue;
 | |
| 		// nudeslutpics = "nud esl ut pics"
 | |
| 		if ( a - s == 3 && s[0]=='n' && s[1]=='u' && s[2]=='d' )
 | |
| 			continue;
 | |
| 		// seepornos = "seep or nos"
 | |
| 		if ( a - s == 3 && s[0]=='n' && s[1]=='o' && s[2]=='s' )
 | |
| 			continue;
 | |
| 		// bookslut = "books lut"
 | |
| 		if ( a - s == 3 && s[0]=='l' && s[1]=='u' && s[2]=='t' )
 | |
| 			continue;
 | |
| 		// lesexegratuit = "lese xe gratuit"
 | |
| 		if ( a - s == 2 && s[0]=='x' && s[1]=='e' )
 | |
| 			continue;
 | |
| 		// mooiemensensexdating = "mens ense xd a ting"
 | |
| 		if ( a - s == 2 && s[0]=='x' && s[1]=='d' )
 | |
| 			continue;
 | |
| 		// mpornlinks = mpo rn links
 | |
| 		if ( a - s == 2 && s[0]=='r' && s[1]=='n' )
 | |
| 			continue;
 | |
| 		// ukpornbases = ukp or nba bes
 | |
| 		if ( a - s == 2 && s[0]=='o' && s[1]=='r' )
 | |
| 			continue;
 | |
| 		// slut
 | |
| 		if ( a - s == 2 && s[0]=='l' && s[1]=='u' )
 | |
| 			continue;
 | |
| 		// independentstockholmescorts = "tock holme sco rts"
 | |
| 		if ( a - s == 3 && s[0]=='s' && s[1]=='c' && s[2]=='o' )
 | |
| 			continue;
 | |
| 		// relatosexcitantes = relat ose xci tan tes 
 | |
| 		if ( a - s == 3 && s[0]=='x' && s[1]=='c' && s[2]=='i' )
 | |
| 			continue;
 | |
| 		// babe = * bes
 | |
| 		if ( a - s == 3 && s[0]=='b' && s[1]=='e' && s[2]=='s' )
 | |
| 			continue;
 | |
| 		// xpornreviews "xp orn reviews "
 | |
| 		if ( a - s == 3 && s[0]=='o' && s[1]=='r' && s[2]=='n' )
 | |
| 			continue;
 | |
| 		// shemal fix
 | |
| 		if ( a - s == 3 && s[0]=='h' && s[1]=='e' && s[2]=='m' )
 | |
| 			continue;
 | |
| 		// adultswim = adults wim
 | |
| 		if ( a - s == 3 && s[0]=='w' && s[1]=='i' && s[2]=='m' )
 | |
| 			continue;
 | |
| 		// bdsm
 | |
| 		if ( a - s == 3 && s[0]=='d' && s[1]=='s' && s[2]=='m' )
 | |
| 			continue;
 | |
| 		// anal
 | |
| 		if ( a - s == 3 && s[0]=='n' && s[1]=='a' && s[2]=='l' )
 | |
| 			continue;
 | |
| 		// vibrator = bra 
 | |
| 		if ( a - s == 3 && s[0]=='b' && s[1]=='r' && s[2]=='a' )
 | |
| 			continue;
 | |
| 		// sitiospornox = sitio spor nox
 | |
| 		if ( a - s == 4 && s[0]=='s' && s[1]=='p' && s[2]=='o' &&
 | |
| 		     s[3] == 'r' )
 | |
| 			continue;
 | |
| 		// orn*
 | |
| 		if ( a - s == 4 && s[0]=='o' && s[1]=='r' && s[2]=='n' )
 | |
| 			continue;
 | |
| 		// hotescorts = hote scor
 | |
| 		if ( a - s == 4 && s[0]=='s' && s[1]=='c' && s[2]=='o' &&
 | |
| 		     s[3] == 'r' )
 | |
| 			continue;
 | |
| 		// uniformsluts = uniformts lutz
 | |
| 		if ( a - s == 4 && s[0]=='l' && s[1]=='u' && s[2]=='t' &&
 | |
| 		     s[3] == 'z' )
 | |
| 			continue;
 | |
| 		// free porn login = freep ornl
 | |
| 		if ( a - s == 5 && s[0]=='f' && s[1]=='r' && s[2]=='e' &&
 | |
| 		     s[3] == 'e' && s[4] == 'p' )
 | |
| 			continue;
 | |
| 		// shemal fix
 | |
| 		if ( a - s == 5 && s[0]=='h' && s[1]=='e' && s[2]=='m' &&
 | |
| 		     s[3] == 'a' && s[4] == 'l' )
 | |
| 			continue;
 | |
| 		// inbondage = inbond age
 | |
| 		if ( a - s == 6 && 
 | |
| 		     s[0]=='i' && s[1]=='n' && s[2]=='b' &&
 | |
| 		     s[3]=='o' && s[4]=='n' && s[5]=='d' )
 | |
| 			continue;
 | |
| 		// swingers = wingers
 | |
| 		if ( a - s == 7 && 
 | |
| 		     s[0]=='w' && s[1]=='i' && s[2]=='n' &&
 | |
| 		     s[3]=='g' && s[4]=='e' && s[5]=='r' &&
 | |
| 		     s[6]=='s' )
 | |
| 			continue;
 | |
| 		// free sex contents = freese xc ont ents
 | |
| 		if ( a - s == 2 && s[0]=='x' && s[1]=='c' )
 | |
| 			continue;
 | |
| 		// mosexstore = mose xs tore
 | |
| 		if ( a - s == 2 && s[0]=='x' && s[1]=='s' )
 | |
| 			continue;
 | |
| 		// phonesexfootsies
 | |
| 		if ( a - s == 8 && 
 | |
| 		     s[0]=='p' && s[1]=='h' && s[2]=='o' &&
 | |
| 		     s[3]=='n' && s[4]=='e' && s[5]=='s' &&
 | |
| 		     s[6]=='e' && s[7]=='x' )
 | |
| 			continue;
 | |
| 		// cybersex
 | |
| 		if ( a - s == 8 && 
 | |
| 		     s[0]=='c' && s[1]=='y' && s[2]=='b' &&
 | |
| 		     s[3]=='e' && s[4]=='r' && s[5]=='s' &&
 | |
| 		     s[6]=='e' && s[7]=='x' )
 | |
| 			continue;
 | |
| 		// hotescorts
 | |
| 		
 | |
| 
 | |
| 		// check if the word has popularity. if it is in the 
 | |
| 		// unifiedDict, then it is considered to be a word
 | |
| 		uint64_t h = hash64d(s, a-s);//a - s, encodeType);
 | |
| 		int32_t pop = getPhrasePopularity(s, h, false, langId);
 | |
| 
 | |
| 		// continue if did not find it
 | |
| 		if ( pop <= 0 )
 | |
| 			continue;
 | |
| 		// this is our next word
 | |
| 		*nextWord = a;
 | |
| 		return true;
 | |
| 	}
 | |
| 	return false;
 | |
| }	
 | |
| 
 | |
| //similar to one above but using recursion
 | |
| /*bool Speller::canSplitWords( char *s, int32_t slen, bool *isPorn, 
 | |
|   char *splitWords,
 | |
|   unsigned char langId, int32_t encodeType ){
 | |
| 
 | |
|   if ( slen == 0 )
 | |
|   return true;
 | |
|   char *loc = NULL;
 | |
|   // check if there is an adult word in there
 | |
|   if ( isAdult ( s, slen, &loc ) ){
 | |
|   // if this string starts with the adult word
 | |
|   if ( loc == s ){
 | |
|   gbmemcpy ( splitWords, s, slen );
 | |
|   splitWords[slen] = ' ';
 | |
|   splitWords[slen + 1] = '\0';
 | |
|   *isPorn = true;
 | |
|   return true;
 | |
|   }
 | |
|   }
 | |
| 
 | |
|   char *b = s + slen;
 | |
|   // split the phrase into two or more phrases.
 | |
|   for ( char *a = b; a > s; a-- ){
 | |
|   //	while ( a > s ){
 | |
|   // a hack, if the word is only one letter long, check if it
 | |
|   // is 'a' or 'i'. If not then continue
 | |
|   if ( a - s == 1 && *s != 'a' && *s != 'i')
 | |
|   continue;
 | |
| 
 | |
|   // check if the word has popularity. if it is in the 
 | |
|   // unifiedDict, then it is considered to be a word
 | |
|   uint64_t h = hash64d(s, a - s, encodeType);
 | |
|   int32_t pop = getPhrasePopularity(s, h, false, langId);
 | |
| 
 | |
|   // continue if did not find it
 | |
|   if ( pop <= 0 )
 | |
|   continue;
 | |
|   gbmemcpy ( splitWords, s, a - s );
 | |
|   splitWords[a - s] = ' ';
 | |
|   splitWords[a - s + 1] = '\0';
 | |
|   // see if we can split the rest
 | |
|   if ( canSplitWords ( a, b - a, isPorn, 
 | |
|   splitWords + (a - s + 1),
 | |
|   langId, encodeType ) )
 | |
|   return true;
 | |
|   }
 | |
|   // did not find any sequence of words that can make this string
 | |
|   return false;
 | |
|   }*/
 | |
| 
 | |
| bool Speller::createUnifiedDict (){
 | |
| 	// first get all the tuples from wordlist and query file
 | |
| 	//HashTableT <uint64_t, char*> ht[MAX_LANGUAGES];
 | |
| 	HashTableX ht[MAX_LANGUAGES];
 | |
| 	char ff[1024];
 | |
| 	for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ){
 | |
| 		ht[i].set ( 8,4,0,NULL,0,false,0,"cud");
 | |
| 		sprintf ( ff , "%sdict/%s/%s.wl.phonet", g_hostdb.m_dir,
 | |
| 			  getLanguageAbbr(i), getLanguageAbbr(i) );
 | |
| 		populateHashTable(ff, &ht[i], i);
 | |
| 
 | |
| 		sprintf ( ff , "%sdict/%s/%s.query.phonet.top", g_hostdb.m_dir,
 | |
| 			  getLanguageAbbr(i), getLanguageAbbr(i) );
 | |
| 		populateHashTable(ff, &ht[i], i);
 | |
| 
 | |
| 		for ( int32_t j = 0; j < NUM_CHARS; j++ ){
 | |
| 			sprintf ( ff , "%sdict/%s/%s.dict.%"INT32"", g_hostdb.m_dir,
 | |
| 				  getLanguageAbbr(i), getLanguageAbbr(i), j );
 | |
| 			populateHashTable(ff, &ht[i], i);
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	//sprintf ( ff, "%sdict/unifiedDict",g_hostdb.m_dir );
 | |
| 	sprintf ( ff, "%sunifiedDict.txt",g_hostdb.m_dir );
 | |
| 	// delete it first
 | |
| 	unlink ( ff );
 | |
| 	// then open a new one for appending
 | |
| 	int fdw = open ( ff , 
 | |
| 			 O_CREAT | O_RDWR | O_APPEND ,
 | |
| 			 getFileCreationFlags());
 | |
| 			 // S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
 | |
| 	if ( fdw < 0 ){
 | |
| 		return log("lang: Could not open for %s "
 | |
| 			   "writing: %s.",ff, strerror(errno));
 | |
| 	}
 | |
| 
 | |
| 	log(LOG_INIT,"spell: Making %s.", ff );
 | |
| 
 | |
| 	//HashTableT <uint64_t, int32_t> phrases;
 | |
| 	HashTableX phrases;
 | |
| 	phrases.set(8,4,0,NULL,0,false,0,"phud");
 | |
| 	char buf[1024];
 | |
| 	for ( int32_t  i = 0; i < MAX_LANGUAGES; i++ ){
 | |
| 		// get each slot
 | |
| 		for ( int32_t j = 0; j < ht[i].getNumSlots(); j++ ){
 | |
| 			uint64_t key = *(uint64_t *)ht[i].getKey(j);
 | |
| 			if ( key == 0 )
 | |
| 				continue;
 | |
| 			// if key is already found
 | |
| 			int32_t slot = phrases.getSlot(&key);
 | |
| 			if ( slot != -1 )
 | |
| 				continue;
 | |
| 
 | |
| 			char *tuple = *(char **)ht[i].getValueFromSlot(j);
 | |
| 
 | |
| 			// here we print the phrase and the phonet if present
 | |
| 			// skip the score
 | |
| 			while ( *tuple != '\t' )
 | |
| 				tuple++;
 | |
| 			tuple++;
 | |
| 			
 | |
| 			sprintf( buf, "%s", tuple );
 | |
| 			
 | |
| 			char *p = buf;
 | |
| 			p += gbstrlen(buf);
 | |
| 
 | |
| 			// if there wasn't a phonet, its from the titleRec.
 | |
| 			// add another tab
 | |
| 			bool fromTitleRec = false;
 | |
| 			if ( strstr (tuple,"\t") == NULL ){
 | |
| 				*p = '\t';
 | |
| 				p++;
 | |
| 				fromTitleRec = true;
 | |
| 			}
 | |
| 
 | |
| 			for ( int32_t k = 0; k < MAX_LANGUAGES; k++ ){
 | |
| 				slot = ht[k].getSlot(&key);
 | |
| 				if ( slot == -1 )
 | |
| 					continue;
 | |
| 				char *val = *(char **)ht[k].getValueFromSlot(slot);
 | |
| 				int32_t pop = atoi(val);
 | |
| 				if ( fromTitleRec ) pop *= -1;
 | |
| 				sprintf(p,"\t%"INT32"\t%"INT32"",k,pop);
 | |
| 				p += gbstrlen(p);
 | |
| 			}
 | |
| 			// write out the trailing \n as well
 | |
| 			*p = '\n';
 | |
| 			p++;
 | |
| 			*p = '\0';
 | |
| 			p++;
 | |
| 			int32_t bufLen = gbstrlen(buf);
 | |
| 			int32_t wn = write ( fdw , buf , bufLen ) ;
 | |
| 			if ( wn != bufLen )
 | |
| 				return log("lang:  write: %s",strerror(errno));
 | |
| 			int32_t val = 1;
 | |
| 			phrases.addKey(&key, &val);
 | |
| 		}
 | |
| 	}
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| 
 | |
| bool Speller::populateHashTable( char *ff, HashTableX *htable,
 | |
| 				 unsigned char langId ){
 | |
| 	File f;
 | |
| 	f.set(ff);
 | |
| 	// open file
 | |
| 	if ( ! f.open ( O_RDONLY ) ) {
 | |
| 		log("spell: open: %s",mstrerror(g_errno)); 
 | |
| 		return false; 
 | |
| 	}
 | |
| 	
 | |
| 	// get file size
 | |
| 	int32_t fileSize = f.getFileSize() ;
 | |
| 
 | |
| 	int32_t bufSize = fileSize + 1;
 | |
| 	char *buf = (char *) mmalloc(bufSize, "SpellerTmpBuf");
 | |
| 	if (!buf)
 | |
| 		return false;
 | |
| 	if ( !f.read(buf, fileSize,0) ){
 | |
| 		log("spell: read: %s", mstrerror(g_errno));
 | |
| 		return false;
 | |
| 	}
 | |
| 	for ( int32_t i = 0; i < bufSize; i++ ){
 | |
| 		if ( buf[i] == '\n' )
 | |
| 			buf[i] = '\0';
 | |
| 	}
 | |
| 
 | |
| 	char *p = buf;
 | |
| 	while ( p < buf + fileSize ){
 | |
| 		char *tuple = p;
 | |
| 		int32_t score = atoi(p);
 | |
| 		// many scores in dict have a pop of 0. ignore them
 | |
| 		if ( score <= 0 ){
 | |
| 			p += gbstrlen(p) + 1;
 | |
| 			continue;
 | |
| 		}
 | |
| 		while ( *p != '\t' )
 | |
| 			p++;
 | |
| 		p++;
 | |
| 		// at the phrase
 | |
| 		char *phrase = p;
 | |
| 		while ( *p != '\t' && *p != '\0' )
 | |
| 			p++;
 | |
| 		uint64_t key = hash64d(phrase, p-phrase );
 | |
| 		int32_t slot = htable->getSlot(&key);
 | |
| 		if ( slot == -1 )
 | |
| 			htable->addKey(&key,&tuple);
 | |
| 		p += gbstrlen(p) + 1;
 | |
| 	}
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| // This isn't really much use except for the spider
 | |
| // language detection to keep from making 32 sequential
 | |
| // calls for the same phrase to isolate the language.
 | |
| char *Speller::getPhraseRecord(char *phrase, int len ) {
 | |
| 	//char *xx=NULL;*xx=0;
 | |
| 	if ( !phrase ) return NULL;
 | |
| 	//char *rv = NULL;
 | |
| 	int64_t h = hash64d(phrase, len);
 | |
| 	int32_t slot = m_unifiedDict.getSlot(&h);
 | |
| 	//log("speller: h=%"UINT64" len=%i slot=%"INT32"",h,len,slot);
 | |
| 	if ( slot < 0 ) return NULL;
 | |
| 	//rv = *(char **)m_unifiedDict.getValueFromSlot(slot);
 | |
| 	int32_t offset =  *(int32_t *)m_unifiedDict.getValueFromSlot(slot);
 | |
| 	char *p = m_unifiedBuf.getBufStart() + offset;
 | |
| 	return p;
 | |
| }
 | |
| 
 | |
| /*
 | |
| uint8_t Speller::getUniqueLang ( int64_t *wid ) {
 | |
| 	int32_t slot = m_unifiedDict.getSlot(wid);
 | |
| 	if (slot < 0) return langUnknown;
 | |
| 	//char *p = *(char **)m_unifiedDict.getValueFromSlot(slot);
 | |
| 	int32_t offset =  *(int32_t *)m_unifiedDict.getValueFromSlot(slot);
 | |
| 	char *p = m_unifiedBuf.getBufStart() + offset;
 | |
| 	int32_t langId = langUnknown;
 | |
| 	char langCount = 0;
 | |
| 	// skip over word
 | |
| 	for ( ; *p && *p != '\t' ; ) p++;
 | |
| 	// nothing after?
 | |
| 	if ( !*p ) return langUnknown;
 | |
| 	// skip tab
 | |
| 	p++;
 | |
| 	// skip over phonet
 | |
| 	for ( ; *p && *p != '\t' ; ) p++;
 | |
| 	// nothing after?
 | |
| 	if ( !*p ) return langUnknown;
 | |
| 	// skip tab
 | |
| 	p++;
 | |
| 	// loop over langid/pop pairs
 | |
| 	while ( *p ) {
 | |
| 		// get langid
 | |
| 		langId = atoi(p);
 | |
| 		// skip to next delimiter
 | |
| 		for ( ; *p && *p != '\t' ; p++ );
 | |
| 		// error?
 | |
| 		if ( ! *p ) break;
 | |
| 		// skip tab
 | |
| 		p++;
 | |
| 		// error?
 | |
| 		if ( ! *p ) break;
 | |
| 		// . if pop is zero ignore it
 | |
| 		// . we now set pops to zero when generating
 | |
| 		//   unifiedDict-buf.txt if they are not in the wiktionary
 | |
| 		//   map for that language. seems like too many bad entries
 | |
| 		//   were put in there by john nanny.
 | |
| 		//char pop = 1;
 | |
| 		//if ( *p == '0' ) pop = 0;
 | |
| 		// require it be in the official dictionary here
 | |
| 		bool official;
 | |
| 		if ( *p == '-' ) official = true;
 | |
| 		else             official = false;   
 | |
| 		// skip pop
 | |
| 		for ( ; *p && *p != '\t' ; p++ );
 | |
| 		// multi lang count
 | |
| 		if ( langId != langUnknown && official ) langCount++;
 | |
| 		// no unique lang
 | |
| 		//if ( langCount >= 2 ) return langTranslingual;
 | |
| 		if ( langCount >= 2 ) return langUnknown;
 | |
| 		// done?
 | |
| 		if ( ! *p ) break;
 | |
| 		// skip tab
 | |
| 		p++;
 | |
| 	}
 | |
| 	// unique lang!
 | |
| 	return langId;
 | |
| }
 | |
| */
 | |
| 
 | |
| int64_t Speller::getLangBits64 ( int64_t *wid ) {
 | |
| 	int32_t slot = m_unifiedDict.getSlot(wid);
 | |
| 	if (slot < 0) return 0LL;
 | |
| 	int32_t offset =  *(int32_t *)m_unifiedDict.getValueFromSlot(slot);
 | |
| 	char *p = m_unifiedBuf.getBufStart() + offset;
 | |
| 	// skip over word
 | |
| 	for ( ; *p && *p != '\t' ; ) p++;
 | |
| 	// nothing after?
 | |
| 	if ( !*p ) return 0LL;
 | |
| 	// skip tab
 | |
| 	p++;
 | |
| 	// skip over phonet
 | |
| 	for ( ; *p && *p != '\t' ; ) p++;
 | |
| 	// nothing after?
 | |
| 	if ( !*p ) return 0LL;
 | |
| 	// skip tab
 | |
| 	p++;
 | |
| 	// init
 | |
| 	int64_t bits = 0LL;
 | |
| 	// loop over langid/pop pairs
 | |
| 	while ( *p ) {
 | |
| 		// get langid
 | |
| 		uint8_t langId = atoi(p);
 | |
| 		// skip to next delimiter
 | |
| 		for ( ; *p && *p != '\t' ; p++ );
 | |
| 		// error?
 | |
| 		if ( ! *p ) break;
 | |
| 		// skip tab
 | |
| 		p++;
 | |
| 		// error?
 | |
| 		if ( ! *p ) break;
 | |
| 		// . if pop is zero ignore it
 | |
| 		// . we now set pops to zero when generating
 | |
| 		//   unifiedDict-buf.txt if they are not in the wiktionary
 | |
| 		//   map for that language. seems like to many bad entries
 | |
| 		//   were put in there by john nanny.
 | |
| 		//char pop = 1;
 | |
| 		// if not official, cancel it?
 | |
| 		if ( *p != '-' ) langId = langUnknown;
 | |
| 		// skip pop
 | |
| 		for ( ; *p && *p != '\t' ; p++ );
 | |
| 		// multi lang count
 | |
| 		//if ( langId != langUnknown ) langCount++;
 | |
| 		// no unique lang
 | |
| 		//if ( langCount >= 2 ) return langTranslingual;
 | |
| 		if ( langId != langTranslingual &&
 | |
| 		     langId != langUnknown )
 | |
| 			// make english "1"
 | |
| 			bits |= 1LL << (langId-1);
 | |
| 		// done?
 | |
| 		if ( ! *p ) break;
 | |
| 		// skip tab
 | |
| 		p++;
 | |
| 	}
 | |
| 	return bits;
 | |
| }
 | |
| 
 | |
| /*
 | |
| int64_t *Speller::getPhraseLanguages(char *phrase, int len ) {
 | |
| 	//char *xx=NULL;*xx=0;
 | |
| 
 | |
| 	char *phraseRec = getPhraseRecord(phrase, len );
 | |
| 	if(!phraseRec) return(NULL);
 | |
| 	int64_t *rv = (int64_t *)mmalloc(sizeof(int64_t) * MAX_LANGUAGES,
 | |
| 					     "PhraseRec");
 | |
| 	if(!rv) return(NULL);
 | |
| 	if(!getPhraseLanguages(phrase, len, rv)) {
 | |
| 		mfree(rv, sizeof(int64_t) * MAX_LANGUAGES,
 | |
| 		      "PhraseRec");
 | |
| 		return(NULL);
 | |
| 	}
 | |
| 	return(rv);
 | |
| }
 | |
| */
 | |
|  
 | |
| bool Speller::getPhraseLanguages(char *phrase, int len,
 | |
| 				 int64_t *array) {
 | |
| 	//char *xx=NULL;*xx=0;
 | |
| 
 | |
| 	char *phraseRec = getPhraseRecord(phrase, len);
 | |
| 	if(!phraseRec || !array) return false;
 | |
| 	return getPhraseLanguages2 ( phraseRec,array );
 | |
| }
 | |
| 
 | |
| bool Speller::getPhraseLanguages2 (char *phraseRec , int64_t *array) {
 | |
| 
 | |
| 	int64_t l = 0;
 | |
| 	memset(array, 0, sizeof(int64_t)*MAX_LANGUAGES);
 | |
| 
 | |
| 	while(*phraseRec) {
 | |
| 		l = 0;
 | |
| 		// skip leading whitespace
 | |
| 		while(*phraseRec && (*phraseRec == ' ' ||
 | |
| 				     *phraseRec == '\t'))
 | |
| 			phraseRec++;
 | |
| 
 | |
| 		if(!*phraseRec) break;
 | |
| 
 | |
| 		int64_t l = atoi(phraseRec);
 | |
| 		// l = abs(l); // not using score method anymore, so this is moot.
 | |
| 
 | |
| 		// skip to next delimiter
 | |
| 		// while(*phraseRec && *phraseRec != '\t') phraseRec++;
 | |
| 		if(!(phraseRec = strchr(phraseRec, '\t'))) break;
 | |
| 
 | |
| 		// skip tab
 | |
| 		phraseRec++;
 | |
| 
 | |
| 		if(!*phraseRec) break;
 | |
| 
 | |
| 		// wtf?
 | |
| 		if ( *phraseRec == '\t' ) return true;
 | |
| 
 | |
| 		// Save score
 | |
| 		array[l] = atoi(phraseRec);
 | |
| 
 | |
| 		// skip to next delimiter
 | |
| 		// while(*phraseRec && *phraseRec != '\t') phraseRec++;
 | |
| 		if(!(phraseRec = strchr(phraseRec, '\t'))) break;
 | |
| 
 | |
| 		// skip over tab
 | |
| 		if(*phraseRec == '\t') phraseRec++;
 | |
| 	}
 | |
| 	return(true);
 | |
| }
 | |
| 
 | |
| bool Speller::getSynsInEnglish ( char *w , 
 | |
| 				 int32_t wlen ,
 | |
| 				 char nativeLang ,
 | |
| 				 char wikiLang ) {
 | |
| 	// no digits please!
 | |
| 	if ( is_digit(w[0]) ) return false;
 | |
| 
 | |
| 	char *p = getPhraseRecord(w,wlen);
 | |
| 	if ( ! p ) return false;
 | |
| 	bool inEnglish = false;
 | |
| 	// skip word
 | |
| 	for ( ; *p != '\t' ; p++ );
 | |
| 	// skip tab
 | |
| 	p++;
 | |
| 	// skip phonet
 | |
| 	for ( ; *p != '\t' ; p++ );
 | |
| 	// skip tab
 | |
| 	p++;
 | |
| 
 | |
| 	for ( ; *p ; ) {
 | |
| 		// end of line?
 | |
| 		if ( !*p ) return inEnglish;
 | |
| 		// get language id
 | |
| 		int32_t l = atoi(p);
 | |
| 		// english?
 | |
| 		//if ( l == langEnglish ) inEnglish = true;
 | |
| 		//if ( l > langEnglish && ! inEnglish ) return false;
 | |
| 		//if ( l == nativeLang ) return false;
 | |
| 		// skip langid
 | |
| 		for ( ; *p && *p != '\t' ; p++ );
 | |
| 		// end of line?
 | |
| 		if ( !*p ) return inEnglish;
 | |
| 		// skip tab
 | |
| 		p++;
 | |
| 		// . get popularity. if not negative undo inEnglish.
 | |
| 		// . it has to be negative because that means it is in the
 | |
| 		//   OFFICIAL wiktionary dictionary for that language
 | |
| 		if ( l == langEnglish && p[0] == '-' ) inEnglish = true;
 | |
| 		// if this word is in the doc's primary/native language
 | |
| 		// then do not try to get english synonyms of it
 | |
| 		if ( l == nativeLang && p[0] == '-' ) return false;
 | |
| 		// no chance? it MUST be in english, and these are
 | |
| 		// sorted by langid...
 | |
| 		if ( l > langEnglish && ! inEnglish ) return false;
 | |
| 		// skip popularity
 | |
| 		for ( ; *p && *p != '\t' ; p++ );
 | |
| 		// no more?
 | |
| 		if ( ! *p ) 
 | |
| 			return inEnglish;
 | |
| 		// skip tab
 | |
| 		p++;
 | |
| 	}
 | |
| 	return inEnglish;
 | |
| }
 | |
| 
 | |
| /*
 | |
| static inline int s_findMaxVal(int64_t *vals, int numVals) {
 | |
| 	int64_t max, oldmax, val;
 | |
| 	if(!vals) return(0);
 | |
| 	max = oldmax = INT_MIN;
 | |
| 	val = 0;
 | |
| 	for(int x = 0; x < numVals; x++) {
 | |
| 		if(vals[x] >= max) {
 | |
| 			oldmax = max;
 | |
| 			max = vals[x];
 | |
| 			val = x;
 | |
| 		}
 | |
| 	}
 | |
| 	if(oldmax == max) return(0);
 | |
| 	return(val);
 | |
| }
 | |
| 
 | |
| char Speller::getPhraseLanguage(char *phrase, int len) {
 | |
| 	//char *xx=NULL;*xx=0;
 | |
| 
 | |
| 	char lang;
 | |
| 	int64_t *langs = getPhraseLanguages(phrase, len);
 | |
| 	if(!langs) return(0);
 | |
| 	lang = s_findMaxVal(langs, MAX_LANGUAGES);
 | |
| 	if ( lang < 0 ) { char *xx=NULL;*xx=0; }
 | |
| 	if(langs[(uint8_t)lang] == 0) lang = 0;
 | |
| 	mfree(langs, sizeof(int) * MAX_LANGUAGES, "PhraseRec");
 | |
| 	return(lang);
 | |
| }
 | |
| */
 | |
| 
 | |
| void Speller::dictLookupTest ( char *ff ){
 | |
| 	//char *ff = "/tmp/sctest";
 | |
| 	FILE *fd = fopen ( ff, "r" );
 | |
| 	if ( ! fd ) {
 | |
| 		log("speller: test: Could not open %s for "
 | |
| 		    "reading: %s.", ff,strerror(errno));
 | |
| 		return;
 | |
| 	}
 | |
| 	int64_t start = gettimeofdayInMilliseconds();
 | |
| 	char buf[1026];
 | |
| 	int32_t count = 0;
 | |
| 	// go through the words
 | |
| 	while ( fgets ( buf , MAX_FRAG_SIZE , fd ) ) {
 | |
| 		// length of word(s), including the terminating \n
 | |
| 		int32_t wlen = gbstrlen(buf) ;
 | |
| 		// skip if empty
 | |
| 		if ( wlen <= 0 ) continue;
 | |
| 		buf[wlen-1]='\0';
 | |
| 		uint64_t h = hash64d ( buf, gbstrlen(buf));
 | |
| 		int32_t pop = g_speller.getPhrasePopularity(buf, h, true);
 | |
| 		if ( pop < 0 ){
 | |
| 			char *xx = NULL; *xx = 0;
 | |
| 		}
 | |
| 		count++;
 | |
| 	}
 | |
| 	log ( LOG_WARN,"speller: dictLookupTest took %"INT64" ms to do "
 | |
| 	      "%"INT32" words. Compare against 46-66ms taken for dict/words file.",
 | |
| 	      gettimeofdayInMilliseconds() - start, count );
 | |
| 	fclose(fd);
 | |
| }
 |