privacore-open-source-searc…/Wiktionary.cpp

#include "Wiktionary.h"

#include "Query.h"
#include "tokenizer.h"
#include "Titledb.h"
#include "Speller.h"
#include "Conf.h"
#include "Lang.h"
#include "Mem.h"
#include "Errno.h"
#include <sys/stat.h> //stat()
#include <fcntl.h>
#include <unistd.h>
#include "gbmemcpy.h"

// the global instance: the single Wiktionary object defined by this file.
// NOTE(review): presumably declared extern in Wiktionary.h so other
// translation units can reach it -- confirm against the header.
Wiktionary g_wiktionary;

Wiktionary::Wiktionary () {
	// nothing is open and no callback is pending on a fresh object
	m_opened   = false;
	m_callback = NULL;
	m_state    = NULL;

	// scalar bookkeeping starts out zeroed, as does the scratch buffer
	m_errno   = 0;
	m_txtSize = 0;
	memset(m_buf, 0, sizeof(m_buf));

	// . synonym table is set up with the values 6 and 4 as its first two
	//   arguments (the other set() calls in this file describe those as
	//   key size and data size) and with allowDups = true
	// . no slots are pre-allocated here (third argument is 0)
	m_synTable.set  ( 6 , 4,0,NULL,0,true,"wkt-synt");

	// label the synset text buffer
	m_synBuf.setLabel("synbuf");
}

// Release everything this object holds: the synonym table and its text
// buffer, the debug map/buffer, the temporary tables used while building
// from the raw wiktionary text, the language buffer and the local
// (mysynonyms.txt) buffer and table.
void Wiktionary::reset() {
	// synonym table + the synset text it refers to
	m_synTable .reset();
	m_synBuf.purge();

	// debug word map + its string storage
	m_debugMap .reset();
	m_debugBuf .purge();

	// scratch tables used only while generating from wiktionary.txt
	m_dedup.reset();
	m_tmp.reset();

	// language buffer
	m_langBuf.reset();

	// local overrides loaded by addSynsets()
	m_localBuf.purge();
	m_localTable.reset();
}

Wiktionary::~Wiktionary () {
	// only a file that was actually opened needs closing
	if ( ! m_opened ) return;
	m_f.close();
}


bool Wiktionary::test ( ) {

	// test words parsing here
	//Words w;
	//w.set9 ("get $4,500.00 now",0);

	// test it out!
	const char *str = "love";//pie"; //forsake";
	//int64_t wid = hash64Lower_utf8(str);
	int64_t wid = hash64n(str);
	// use this now
	const char *p = getSynSet ( wid, langEnglish );
	//char *p = (char *)m_synTable.getValue ( &wid );
	// must be there
	if ( ! p ) gbshutdownLogicError();
	// first # is number of forms
	//if ( *p < 0 || *p > 100 ) gbshutdownLogicError();
	// first is count!
	//int32_t n = *p;
	// skip that
	//p++;
	// find new line
	const char *end = p;
	for ( ; *end && *end !='\n' ; end++ );
	// cast it
	// only the first 6 bytes are valid
	//int64_t *termIds = (int64_t *)p;
	// header
	log("wikt: test \"%s\" -> \"%*.*s\"",str,(int)(end-p),(int)(end-p),p);

	return true;
}

#include "Synonyms.h"

// Interactive debugging loop.
//
// Reads lines from stdin of the form "word" or "<langAbbr>|word", looks up
// the synonyms of the first token via Synonyms::getSynonyms() and prints
// them, comma separated, to stderr. Without a "<langAbbr>|" prefix english
// is used.
//
// Returns true when an empty line is entered or stdin hits EOF/read error.
bool Wiktionary::test2 ( ) {

	for ( ;; ) {

		uint8_t langId = langEnglish; // langUnknown

		char input[256];
		// stop on EOF or read error; the buffer contents are
		// indeterminate in that case and must not be parsed
		if ( ! fgets(input,sizeof(input),stdin) ) return true;
		// strip the trailing newline only if one is really there.
		// (a blind input[strlen-1]=0 underflows on an empty string
		// and eats a real character on an unterminated last line)
		size_t inputLen = strlen(input);
		if ( inputLen > 0 && input[inputLen-1] == '\n' )
			input[inputLen-1] = '\0';
		// an empty line ends the session
		if ( input[0] == '\0' ) return true;

		// split off the optional language prefix
		char *str = input;
		char *pipe = strstr ( input, "|" );
		if ( pipe ) {
			*pipe = '\0';
			langId = getLangIdFromAbbr ( input );
			str = pipe + 1;
		}
		fprintf(stderr,"lang = %s\n",getLanguageAbbr(langId));

		TokenizerResult tr;
		plain_tokenizer_phase_1(str,strlen(str), &tr);
		calculate_tokens_hashes(&tr);

		// get the synonyms of the first token
		int32_t wordNum = 0;
		char tmpBuf[1000];
		Synonyms syn;
		int32_t naids = syn.getSynonyms(&tr,
					       wordNum ,
					       langId ,
					       tmpBuf );

		// must be there
		if ( ! naids ) {
			fprintf(stderr,"no forms\n");
			continue;
		}

		// join them with commas
		SafeBuf sb;
		for ( int32_t k = 0 ; k < naids ; k++ ) {
			char *termPtr = syn.m_termPtrs[k];
			int32_t termLen = syn.m_termLens[k];
			sb.safeMemcpy(termPtr,termLen);
			if ( k+1<naids) sb.pushChar(',');
		}
		sb.pushChar('\0');

		fprintf(stderr,"%s -> %s\n",str,sb.getBufStart());
	}
}

// Stat a file that must also be openable for reading.
// Returns 0 and fills *st on success; otherwise returns the non-zero errno
// of the open()/fstat() call that failed, captured immediately so that no
// later system call can overwrite it.
static int32_t statReadableFile ( const char *path , struct stat *st ) {
	int fd = open ( path , O_RDONLY );
	if ( fd < 0 ) return errno ? errno : -1;
	int32_t err = 0;
	if ( fstat ( fd , st ) == -1 ) err = errno ? errno : -1;
	close ( fd );
	return err;
}

// . load from disk
// . if the pre-built binaries (wiktionary-syns.dat + wiktionary-buf.txt)
//   both exist and wiktionary.txt.aa is either missing or older than
//   wiktionary-syns.dat, load the binaries and then apply mysynonyms.txt
// . otherwise regenerate everything from wiktionary.txt.aa
// . returns false on error; g_errno is set here when the source text file
//   cannot be opened
bool Wiktionary::load() {

	// load it from .dat file if exists and is newer
	char ff1[sizeof(g_hostdb.m_dir)+128];
	char ff3[sizeof(g_hostdb.m_dir)+128];
	char ff4[sizeof(g_hostdb.m_dir)+128];
	snprintf(ff1, sizeof(ff1), "%swiktionary.txt.aa", g_hostdb.m_dir);
	ff1[ sizeof(ff1)-1 ] = '\0';
	snprintf(ff3, sizeof(ff3), "%swiktionary-syns.dat", g_hostdb.m_dir);
	ff3[ sizeof(ff3)-1 ] = '\0';
	snprintf(ff4, sizeof(ff4), "%swiktionary-buf.txt", g_hostdb.m_dir);
	ff4[ sizeof(ff4)-1 ] = '\0';

	struct stat stats1;
	struct stat stats3;
	struct stat stats4;
	// each errnoN is 0 if the file could be opened and stat'ed
	int32_t errno1 = statReadableFile ( ff1 , &stats1 );
	int32_t errno3 = statReadableFile ( ff3 , &stats3 );
	if ( errno3 ) {
		log(LOG_INFO,"wikt: open %s: %s",ff3,mstrerror(errno3));
	}
	int32_t errno4 = statReadableFile ( ff4 , &stats4 );
	if ( errno4 ) {
		// report the file that actually failed (ff4, not ff1)
		log(LOG_INFO,"wikt: open %s: %s",ff4,mstrerror(errno4));
	}

	// if we got a newer binary version, use that
	if ( ! errno3 && ! errno4 &&
	     // load from binaries if orig txt is not there OR our
	     // binary make time is ahead of the orig txt make time
	     ( errno1 || stats3.st_mtime > stats1.st_mtime )
	     ) {
		log(LOG_INFO,"wikt: Loading %s",ff3);
		if ( ! m_synTable .load ( NULL , ff3 ) )
			return false;
		log(LOG_INFO,"wikt: Loading %s",ff4);
		if ( m_synBuf.fillFromFile ( NULL , ff4 ) <= 0 )
			return false;

		// augment wiktionary with our own overrides and additions
		// from mysynonyms.txt
		if ( ! addSynsets ( "mysynonyms.txt" ) )
			return false;

		return true;
	}
	// if no text file that is bad
	if ( errno1 ) {
		// errno1 now carries the real errno, so the message is useful
		g_errno = errno1 ;
		log (LOG_WARN, "gb: could not open %s for reading: %s",ff1, mstrerror(g_errno));
		return false;
	}

	// scratch tables used while parsing the raw text
	m_dedup.set    ( 8 , 0 , 16777216 , NULL , 0 , false,"ddtab");
	// this has to allow dups! it maps a baseForm to a variant/syn
	// now it includes langid
	m_tmp.set      ( 8 , 9 , 16777216 , NULL , 0 , true,"tmptab");
	m_debugMap.set  ( 8 , 4,0,NULL,0,false,"wkt-dmap");

	// get the size of it
	int32_t size = stats1.st_size;
	// now we have to load the text file
	// returns false and sets g_errno on error
	if ( ! generateHashTableFromWiktionaryTxt ( size ) ) return false;
	// success!
	return true;
}

// Lower-case language names as they appear in Wiktionary "==Language=="
// section headers.
// . the array INDEX is used directly as the langId by the header parser
//   (langId = i), so entries must never be reordered or removed
// . headers are matched case-insensitively by prefix (strncasecmp with the
//   entry's length)
// . index 24 used to read "bengala", which can never prefix-match the
//   Wiktionary header "Bengali"
static const char *s_lowerLangWikiStrings[] = {
	"unknown","english","french","spanish","russian","turkish","japanese",
	"cantonese", // "chinese traditional",
	"mandarin", // "chinese simplified",
	"korean","german","dutch",
	"italian","finnish","swedish","norwegian","portuguese","vietnamese",
	"arabic","hebrew","indonesian","greek","thai","hindi","bengali",
	"polish","tagalog",

	"latin",
	"esperanto",
	"catalan",
	"bulgarian",
	"translingual",
	"serbo-croatian",
	"hungarian",
	"danish",
	"lithuanian",
	"czech",
	"galician",
	"georgian",
	"scottish gaelic",
	"gothic",
	"romanian",
	"irish",
	"latvian",
	"armenian",
	"icelandic",
	"ancient greek",
	"manx",
	"ido",
	"persian",
	"telugu",
	"venetian",
	"malagasy",
	"kurdish",
	"luxembourgish",
	"estonian"
};

// add our special augmentation table
// Synonyms.cpp should check this table separately so we can keep it
// somewhat small and re-load it on the fly.
// mysynonyms.txt
//
// File format, one synset per line:
//     <lang>|word1,word2,...
// where <lang> is a 2-letter abbreviation ("en") or a 5-letter one of the
// form "zh_ch". Lines whose first non-blank character is '#' and blank
// lines are skipped.
//
// The file is read into m_localBuf and, for every non-empty word, its
// space-insensitive hash (xor'ed with g_hashtab[0][langId]) is added to
// m_localTable, mapping to the byte offset of the line's language
// abbreviation inside m_localBuf.
//
// Returns false if the file cannot be read, the table cannot be set up or
// grown, or a line has a missing/unknown language abbreviation.
bool Wiktionary::addSynsets ( const char *filename ) {

	// load it up
	if ( m_localBuf.fillFromFile ( g_hostdb.m_dir , filename ) < 0 ) {
		log(LOG_WARN, "wikt: error loading %s", filename);
		return false;
	}

	if ( ! m_localTable.set ( 8 ,4,9000,NULL,0,false,"synloc") )
		return false;

	char *p      = m_localBuf.getBufStart();
	char *bufEnd = m_localBuf.getBufPtr();

	// one iteration per line
	for ( ;; ) {
		// ran off the end of the buffer?
		if ( p >= bufEnd )
			return true;
		// get end of line
		char *eol = p;
		for ( ; *eol && *eol != '\n' ; eol++ );
		// skip spaces
		for ( ; *p == ' ' || *p == '\t' ; p++ );
		// skip comment lines and blank lines
		if ( *p == '#' || *p == '\n' ) {
			p = eol + 1;
			continue;
		}
		// over? last line?
		if ( p == eol ) return true;

		// need a langid like "en|vs,against"
		char *lang = p;
		// never step past the end of this line while skipping the
		// abbreviation, a short line must not make us read the
		// next line or beyond the buffer
		bool haveLang = ( eol - p >= 2 );
		if ( haveLang ) {
			p += 2;
			// is it like zh_ch?
			if ( *p == '_' ) {
				haveLang = ( eol - p >= 3 );
				if ( haveLang ) p += 3;
			}
		}
		// sanity
		if ( ! haveLang || *p != '|' ) {
			log(LOG_WARN, "wikt: bad %s file! no lang", filename);
			return false;
		}
		// null term the abbreviation just long enough to look it up
		*p = '\0';
		uint8_t langId = getLangIdFromAbbr ( lang );
		// put char back
		*p = '|';
		// skip the pipe then
		p++;
		// must be there
		if ( langId == 0 ) {
			log(LOG_WARN, "wikt: bad language abbr in %s", filename);
			return false;
		}

		//
		// JUST ADD THESE SYNSETS as separate from wiktionary-buf.txt
		// because even if duped it will not matter, Synonyms.cpp
		// dedups all the word forms.
		//
		// since we now only do synonyms at query time and never
		// index them it will make things much easier to deal with
		// when we make mods to this stuff.
		//

		// every word on this line maps to the offset of the line
		int32_t firstLineOffset = lang - m_localBuf.getBufStart();

		// one iteration per comma-separated word
		for ( ;; ) {
			// . find end of word
			// . start AT p, not p+1, so an empty word as in
			//   "en|read,,centimes" or a trailing comma is seen
			//   as empty instead of swallowing the delimiter and
			//   whatever follows it
			char *e = p;
			for ( ; *e && *e != '\n' && *e != ',' ; e++ );

			if ( e > p ) {
				// get word hash. ignore spaces in there...
				// we hash it like a bigram, although if a
				// stopword leads the phrase ids will xor in
				// a special number to prevent "the rapist"
				// from being "therapist". see Phrases.cpp
				int64_t wh64 = hash64n_nospaces(p,e-p);
				// mangle with language id so
				// Wiktionary::getSynSet() works
				wh64 ^= g_hashtab[0][langId];
				// point the word at its line... "en|..."
				if ( wh64 != 0 &&
				     ! m_localTable.addKey ( &wh64 ,
							     &firstLineOffset ) )
					return false;
			}

			// last word of the line?
			if ( *e != ',' ) break;
			// advance to next word
			p = e + 1;
		}

		// next line otherwise
		p = eol + 1;
	}
}

bool Wiktionary::generateHashTableFromWiktionaryTxt ( int32_t sizen ) {

	// for debug
	//sizen = 10000000;
	int32_t round = 0;

	//
	// FILE FORMAT HELP:
	//
	// https://secure.wikimedia.org/wiktionary/en/wiki/Wiktionary:Entry_layout_explained
	//  https://secure.wikimedia.org/wiktionary/en/wiki/Wiktionary:Entry_layout_explained/POS_headers
	//
	//
	// i downloaded this file from
	// http://dumps.wikimedia.org/enwiktionary/latest/
	// http://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-abstract.xml
	// THEN i ran split it on like 'split -b 2000000000 wiktionary.txt'
	// to divide it into two files, the first one being 2GB:
	// wiktionary.txt.aa and wiktionary.txt.ab
	// So read those files in here.
	//
	// OUTPUT files:
	//
	// wiktionary-syns.dat  (maps a wordId to ptr into wiktionary-buf.txt)
	// wiktionary-buf.txt   (one syn set per line)
	// wiktionary-lang.txt  (<landId>|<word>\n) (used by Speller.cpp)
	//
	char ff1[sizeof(g_hostdb.m_dir)+128];
	snprintf(ff1, sizeof(ff1), "%swiktionary.txt.aa", g_hostdb.m_dir);
	ff1[ sizeof(ff1)-1 ] = '\0';

	log(LOG_INFO,"wikt: Loading %s",ff1);
	int fd1 = open ( ff1 , O_RDONLY );
	if ( fd1 < 0 ) {
		log("wikt: open %s : %s",ff1,mstrerror(errno));
		return false;
	}
	// read in whole thing
	int64_t maxReadSize = 300000000; // 300MB
	char *buf = (char *)mmalloc ( maxReadSize + 1 , "wikt" );
	if ( ! buf ) {
		close ( fd1 );
		return false;
	}

	int64_t offset = 0LL;

	// use this to scrape popularity info and other words we are missing
	//if ( ! g_speller.init() ) return false;

	// the wiktionary file is like 2.6GB so we can't hold the whole thing
 readInSomeFile:

	// limit to 300MB
	int32_t readSize = sizen;
	if ( readSize > maxReadSize ) readSize = maxReadSize;

	// do not breach file size
	if ( offset + readSize > sizen )
		readSize = sizen - offset;

	//
	//
	// ARE WE DONE????
	//
	//
	if ( offset >= sizen ) {
		// don't forget to close
		close ( fd1 );

		// try reading next split file
		if ( round == 0 ) {
			round++;
			offset = 0;
			snprintf(ff1, sizeof(ff1), "%swiktionary.txt.ab",g_hostdb.m_dir);
			ff1[ sizeof(ff1)-1 ] = '\0';

			log(LOG_INFO,"wikt: Loading %s",ff1);
			fd1 = open ( ff1 , O_RDONLY );
			if ( fd1 < 0 ) {
				log("wikt: open %s : %s",ff1,mstrerror(errno));
				return false;
			}
			struct stat stats;
			if ( fstat ( fd1 , &stats ) == -1 ) {
				g_errno = errno;
				close ( fd1 );
				return false;
			}
			sizen = stats.st_size;
			goto readInSomeFile;
		}

		// do not save if we can't
		if ( g_conf.m_readOnlyMode ) return true;

		// build m_synTable from m_tmp table
		if ( ! compile() ) return false;

		// add unified dict entries into m_langTable if they
		// belong to one and only one language.
		// right now, this just cleans out m_langTable.
		if ( ! integrateUnifiedDict() ) return false;

		log("wikt: testing");

		//log("wiktL debug skipping test!");
		test();

		log("wikt: test passed");

		// now save this hash table for quicker loading next time
		//if ( ! m_langTable.save ( g_hostdb.m_dir ,
		//			  "wiktionary-langs.dat" ) )
		//	return false;

		// . and the synomnyms
		// . offsets into m_synBuf, text file of synsets
		if ( ! m_synTable.save ( g_hostdb.m_dir ,
					 "wiktionary-syns.dat" ,
					 NULL,
					 0 ) )
		     //m_synBuf.getBufStart() ,
		     //m_synBuf.length() ) )
			return false;
		// save text file
		if ( m_synBuf.saveToFile ( g_hostdb.m_dir,
					    "wiktionary-buf.txt" ) <= 0 )
			return false;

		if ( m_langBuf.saveToFile(g_hostdb.m_dir,
					    "wiktionary-lang.txt" ) <= 0 )
			return false;


		// this too?
		//if ( ! m_debugMap.save ( g_hostdb.m_dir ,
		//			 "wiktionary-strings.dat",
		//			 m_debugBuf.getBufStart() ,
		//			 m_debugBuf.length() ))
		//    return false;

		// clear this
		m_tmp  .reset();
		m_dedup.reset();

		m_debugMap.reset();
		m_debugBuf.purge();

		m_langBuf.reset();

		return true;
	}

	// log it
	log("wikt: reading %" PRId32" bytes of %s @ %" PRId64" (filesize=%" PRId32")",
	    readSize,ff1,offset,sizen);

	int32_t n = pread ( fd1 , buf , readSize , offset );

	if ( n != readSize ) {
		log("wikt: read: %s",mstrerror(errno));
		g_errno = EBADENGINEER;
		close ( fd1 );
		return false;
	}

	log("wikt: processing");

	// advance for next read
	offset += n;

	// null terminate
	buf[readSize] = '\0';

	//
	// simple filter. back to back spaces removed in next loop.
	//
	char *p = buf;
	for ( ; *p ; p++ ) {
		// fix # {{form of|Abbreviation|biography}} for 'bio'
		if ( p[0] == 'f' &&
		     p[1] == 'o' &&
		     p[2] == 'r' &&
		     p[3] == 'm' &&
		     p[4] == ' ' &&
		     p[5] == 'o' &&
		     p[6] == 'f' &&
		     p[7] == '|' &&
		     to_lower_a(p[8]) == 'a' &&
		     to_lower_a(p[9]) == 'b' &&
		     !strncasecmp(p ,"form of|abbreviation|",21) )
			// overwrite the pipe with a space
			gbmemcpy(p    ,"abbreviated  form of|",21);
	}


	char *src = buf;
	char *dst = buf;
	// filter out the annoying bold '''
	for ( ; *src ; src++ ) {
		// skip bold thingy
		if ( src[0] =='\'' &&
		     src[1] =='\'' &&
		     src[2] =='\'' ) {
			src += 2;
			continue;
		}
		// # {{present participle of|''[[snort]]''}}
		if ( src[0] =='\'' &&
		     src[1] =='\'' ) {
			src += 1;
			continue;
		}
		// <space>|  "for |" "form |"
		if ( src[0] == ' ' &&
		     src[1] == '|' )
			continue;
		// filter back-to-back spaces
		if ( src[0] == ' ' &&
		     src[1] == ' ' )
			continue;
		// <space>,
		if ( src[0] == ' ' &&
		     src[1] == ',' )
			continue;
		*dst++ = *src;
	}
	*dst = '\0';


	//
	// . filter the buffer
	// . set "name" to the word we are a form of
	//
	p = buf;
	for ( ; *p ; p++ ) {
		// REWRITE A LINE SEGMENT
		// # {{given name|male|diminutive=Samuel}}
		// # {{given name|male|diminut of|Samuel}}
		if ( p[0] == 'd' &&
		     p[1] == 'i' &&
		     p[2] == 'm' &&
		     !strncmp(p ,"diminutive=",11) ) {
			gbmemcpy(p,"diminut of|",11);
			p += 11;
			continue;
		}

		bool needPound = true;
		// assume no name
		char *name = NULL;
		// REWRITE A FULL LINE
		// # A [[diminutive]] of the male [[given name]] [[Douglas]].\n
		// # {{diminutive form of|Douglas}}                          \n
		if ( p[0] == 'm' &&
		     p[1] == 'a' &&
		     p[2] == 'l' &&
		     !strncmp(p ,"male [[given name]] [[",22) ) {
			needPound = false;
			name = p + 22;
		}
		//# {{given name|female}}, a [[diminutive]] of [[Abigail]].
		if ( p[0] == '[' &&
		     p[1] == '[' &&
		     p[2] == 'd' &&
		     p[3] == 'i' &&
		     !strncmp(p ,"[[diminutive]] of [[",20) ) {
			needPound = false;
			name = p + 20;
		}

		// set needPound = true for this below
		// variant spelling of [[poo]]
		if ( p[0] == 's' &&
		     p[1] == 'p' &&
		     p[2] == 'e' &&
		     p[3] == 'l' &&
		     ! strncasecmp(p ,"spelling of [[",14) )
			name = p + 14;

		// past participle of [[block]]
		if ( p[0] == 'p' &&
		     p[1] == 'a' &&
		     p[2] == 'r' &&
		     p[3] == 't' &&
		     p[4] == 'i' &&
		     ! strncasecmp(p ,"participle of [[",16) )
			name = p + 16;


		// past participle of to [[block]]
		if ( p[0] == 'p' &&
		     p[1] == 'a' &&
		     p[2] == 'r' &&
		     p[3] == 't' &&
		     p[4] == 'i' &&
		     ! strncasecmp(p ,"participle of to [[",19) )
			name = p + 19;

		// # [[present participle|Present participle]] of [[link]].
		if ( p[0] == 'a' &&
		     p[1] == 'r' &&
		     p[2] == 't' &&
		     p[3] == 'i' &&
		     p[4] == 'c' &&
		     ! strncasecmp(p ,"articiple]] of [[",17) )
			name = p + 17;

		// definite [s|S]ingular of [[block]]
		if ( p[0] == 'i' &&
		     p[1] == 'n' &&
		     p[2] == 'g' &&
		     p[3] == 'u' &&
		     p[4] == 'l' &&
		     ! strncasecmp(p ,"ingular of [[",14) )
			name = p + 14;

		// # Singular of {{term|airwaves|lang=en}};
		if ( p[0] == 'i' &&
		     p[1] == 'n' &&
		     p[2] == 'g' &&
		     p[3] == 'u' &&
		     p[4] == 'l' &&
		     ! strncasecmp(p ,"ingular of {{term|",18) )
			name = p + 18;

		// definite [p|P]lural of [[block]]
		if ( p[0] == 'l' &&
		     p[1] == 'u' &&
		     p[2] == 'r' &&
		     p[3] == 'a' &&
		     p[4] == 'l' &&
		     ! strncasecmp(p ,"lural of [[",11) )
			name = p + 11;

		// substitue form for case
		// "objective case of" ... treat like form
		// should fix page for "us" which is "objective case of we"
		bool mangled = false;
		if ( ! name &&
		     p[0] == 'c' &&
		     p[1] == 'a' &&
		     p[2] == 's' &&
		     p[3] == 'e' ) {
			gbmemcpy ( p , "form" , 4 );
			mangled = true;
		}

		// need "form of" for shit below
		if ( ! name &&
		     ( p[0] != 'f' ||
		       p[1] != 'o' ||
		       p[2] != 'r' ||
		       p[3] != 'm' ) )
			continue;

		bool doTailCheck = true;
		if ( name ) doTailCheck = false;

		// # Short form of [[hippopotamus]].
		if ( ! strncasecmp(p-5 ,"past form of",12) )
			name = p + 7;
		if ( ! strncasecmp(p-6 ,"short form of",13) )
			name = p + 7;
		if ( ! strncasecmp(p-6 ,"tense form of",13) )
			name = p + 7;
		if ( ! strncasecmp(p-7 ,"plural form of",14) )
			name = p + 7;
		if ( ! strncasecmp(p-7 ,"dative form of",14) )
			name = p + 7;
		if ( ! strncasecmp(p-8 ,"present form of",15) )
			name = p + 7;
		if ( ! strncasecmp(p-9 ,"familiar form of",16) )
			name = p + 7;
		if ( ! strncasecmp(p-9 ,"singular form of",16) )
			name = p + 7;
		if ( ! strncasecmp(p-9 ,"feminine form of",16) )
			name = p + 7;
		if ( ! strncasecmp(p-9 ,"emphatic form of",16) )
			name = p + 7;
		if ( ! strncasecmp(p-9 ,"genitive form of",16) )
			name = p + 7;
		if ( ! strncasecmp(p-10 ,"shortened form of",17) )
			name = p + 7;
		if ( ! strncasecmp(p-10 ,"inflected form of",17) )
			name = p + 7;
		if ( ! strncasecmp(p-10 ,"masculine form of",17) )
			name = p + 7;
		if ( ! strncasecmp(p-10 ,"imperfect form of",17) )
			name = p + 7;
		if ( ! strncasecmp(p-10 ,"objective form of",17) )
			name = p + 7;
		if ( ! strncasecmp(p-10 ,"partitive form of",17) )
			name = p + 7;
		if ( ! strncasecmp(p-10 ,"reflexive form of",17) )
			name = p + 7;
		if ( ! strncasecmp(p-11 ,"diminutive form of",18) )
			name = p + 7;
		if ( ! strncasecmp(p-11 ,"simplified form of",18) )
			name = p + 7;
		if ( ! strncasecmp(p-11 ,"imperative form of",18) )
			name = p + 7;
		if ( ! strncasecmp(p-11 ,"indicative form of",18) )
			name = p + 7;
		if ( ! strncasecmp(p-11 ,"possessive form of",18) )
			name = p + 7;
		if ( ! strncasecmp(p-11 ,"accusative form of",18) )
			name = p + 7;
		if ( ! strncasecmp(p-12 ,"abbreviated form of",19) )
			name = p + 7;
		if ( ! strncasecmp(p-12 ,"alternative form of",19) )
			name = p + 7;
		if ( mangled )
			gbmemcpy ( p , "case" , 4 );
		// skip if no match
		if ( ! name ) continue;

		// then after "of" comes a space
		if ( doTailCheck ) {
			// need to have this
			if ( strncmp(name," [[",3)== 0 ) name += 3;
			// OR YOU CAN HAVE THIS
			// # Past tense and past participle of ''to [[block]]''
			// for title of "blocked". the '' should have been
			// filtered out above.
			else if ( strncmp(name," to [[",6)== 0 ) name += 6;
			// otherwise, forget it!!
			else continue;
		}

		// ok, replace the line with a proper name line
		char *lineStart = p;
		for ( ; lineStart > buf&&*lineStart!='#'&&lineStart[-1]!='\n';
		      lineStart--);
		// need this? this is a numbered line used as a definition
		// line.
		if ( needPound && *lineStart != '#' )
			continue;
		// end end of it
		char *lineEnd = p;
		for ( ; *lineEnd&&*lineEnd !='\n';lineEnd++);
		// temp null that
		char c = *lineEnd;
		*lineEnd = '\0';
		//
		// check for badness
		// i don't like obsolete forms!!! filter out.
		//
		char *bad = NULL;
		if ( ! bad ) bad = gb_strcasestr(lineStart,"archaic");
		if ( ! bad ) bad = gb_strcasestr(lineStart,"rare ");
		if ( ! bad ) bad = gb_strcasestr(lineStart,"less common");
		if ( ! bad ) bad = gb_strcasestr(lineStart,"uncommon ");
		if ( ! bad ) bad = gb_strcasestr(lineStart,"obsolete");
		if ( ! bad ) bad = gb_strcasestr(lineStart,"older ");
		if ( ! bad ) bad = gb_strcasestr(lineStart,"old ");
		if ( ! bad ) bad = gb_strcasestr(lineStart,"nonstandard");
		if ( ! bad ) bad = gb_strcasestr(lineStart,"eye-dialect");
		if ( ! bad ) bad = gb_strcasestr(lineStart,"eye dialect");
		*lineEnd = c;
		if ( bad )
			continue;
		// now store a new form
		char *dst = lineStart;
		gbmemcpy(dst,"# {{form|",9);
		dst += 9;
		// point to name
		//char *name = p + 22;
		//
		// PUT it in the proper formation for parsing in the logic
		// below
		//
		// copy over name
		for ( ; *name !=']' &&
			      *name !='\n' &&
			      *name != '#' &&
			      *name != '|' ; name++ )
			*dst++ = *name;
		// close it up
		gbmemcpy(dst,"}}",2);
		dst += 2;
		// panic
		if ( dst > lineEnd ) gbshutdownLogicError();
		// space fill until lineEnd
		for ( ; dst < lineEnd ; dst++ )
			*dst = ' ';
		// skip over that line then
		p = lineEnd;
	}

	// start parsing here
	p = buf;

 wordLoop:

	// look for <title> tag
	char *title = strstr ( p , "<title>" );

	if ( ! title ) goto readInSomeFile;

	// find title after so we know we have a full page
	char *nextTitle = strstr ( title + 5 , "<title" );
	if ( ! nextTitle ) goto readInSomeFile;

	// advance
	p = nextTitle;

	// . scan from title to next title
	// . if it contains "Shavian" then bail! those are stupid
	//   shavian script characters. one of them is short for "of"
	//   so it shows up in of's synset!
	char c;
	if ( nextTitle ) {c = *nextTitle;*nextTitle = '\0';}
	char *found = strstr ( title , "Shavian ");
	if ( nextTitle ) *nextTitle = c;
	if ( found ) goto wordLoop;


	// get the word in the title, <title>
	char *word = title + 7;
	// find end of it
	char *wp = word ;
	for ( ; *wp && *wp != '<' ; wp++ ) {
		// any space is bad
		if ( is_wspace_a(*wp) ) break;
		// or colon
		if ( *wp == ':' ) break;
		// or * (f*ck)
		if ( *wp == '*' ) break;
	}
	// bad word that has space or colon in it?
	if ( *wp != '<' ) goto wordLoop;
	// remove any trailing spaces
	for  ( ; wp[-1] == ' ' ; wp-- );
	// if word ends in hyphen skip (anxio-)
	if ( wp[-1] == '-' ) goto wordLoop;
	// or starts with '
	if ( word[0] == '\'' ) goto wordLoop;
	// or ends with ' like "o'" form of "of"
	if ( wp[-1] == '\'' ) goto wordLoop;
	// null term so "title" is null terminated
	*wp = '\0';
	// and skip
	wp++;

	int32_t flag = 0;
	uint8_t langId = langUnknown;

	bool debug = false;
	//debug = true;

	// set nextline
	char *np = wp;
	for ( ; *np && np < nextTitle ; np++ )
		if ( *np =='#' || (*np == '=' && np[1]=='=') ) break;

 lineLoop:

	// advance to next line. unless its the first line for this word
	// in which np already equals wp.
	wp = np;

	// . set next line for next call to goto lineLoop.
	// . we do this this way because the code below inserts \0's into
	//   the line for easier parsing...
	np++;
	for ( ; *np == '=' ; np++ );
	for ( ; *np && np < nextTitle ; np++ ) {
		if ( *np =='#' ) break;
		//if ( np[-1] == '\n' ) break;
		if (*np == '=' && np[1]=='=') break;
	}

	// scan for next header OR part of speech description
	//for ( ; *wp && wp < nextTitle ; wp++ )
	//	if ( *wp =='#' || (*wp == '=' && wp[1]=='=') ) break;

	// get next word if no more lines
	if ( ! *wp || wp >= nextTitle ) goto wordLoop;

	// skip line break (\n)
	//if ( *wp == '\n' ) wp++;
	// get next word if no more lines
	//if ( ! *wp || wp >= nextTitle ) goto wordLoop;
	// need a header or a comment here
	//if ( *wp != '=' && *wp != '#' ) goto lineLoop;

	// we got a header, set langid or set POS
	if ( *wp == '=' ) {
		// count em
		int32_t equalCount = 0;
		// skip any extra ='s
		for ( ; *wp == '=' ; wp++ ) equalCount++;
		// if newline follows this equal, it was at the end of
		// an equal pair like "==English=="
		if ( *wp == '\n' )  goto lineLoop;
		// debug
		//int32_t diff = wp - buf;
		//log("diff = %" PRId32,diff);
		// a pos?
		if ( ! strncasecmp(wp,"noun",4) ) {
			flag = WF_NOUN;
			if ( debug )
				fprintf(stderr,"%s -> (noun)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"verb",4) ) {
			flag = WF_VERB;
			if ( debug )
				fprintf(stderr,"%s -> (verb)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"participle",10) ) {
			flag = WF_VERB;
			if ( debug )
				fprintf(stderr,"%s -> (particple)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"preposition",11) ) {
			flag = WF_PREPOSITION;
			if ( debug )
				fprintf(stderr,"%s -> (preposition)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"interjection",12) ) {
			flag = WF_INTERJECTION;
			if ( debug )
				fprintf(stderr,"%s -> (interjection)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"pronoun",7) ) {
			flag = WF_PRONOUN;
			if ( debug )
				fprintf(stderr,"%s -> (pronoun)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"proper",6) ) {
			flag = WF_NOUN; // proper noun
			if ( debug )
				fprintf(stderr,"%s -> (proper noun)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"abbrev",6) ) {
			flag = WF_ABBREVIATION;//NOUN; // abbreviation
			if ( debug )
				fprintf(stderr,"%s -> (abbreviation)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"letter",6) ) {
			flag = WF_LETTER;//NOUN; // abbreviation
			if ( debug )
				fprintf(stderr,"%s -> (letter)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"acronym",7) ) {
			flag = WF_NOUN;
			if ( debug )
				fprintf(stderr,"%s -> (acronym)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"initialism",10) ) {
			flag = WF_INITIALISM;
			if ( debug )
				fprintf(stderr,"%s -> (initialism)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"adjective",9) ) {
			flag = WF_ADJECTIVE;
			if ( debug )
				fprintf(stderr,"%s -> (adjective)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"adverb",6) ) {
			flag = WF_ADVERB;
			if ( debug )
				fprintf(stderr,"%s -> (adverb)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		if ( ! strncasecmp(wp,"article",7) ) {
			flag = WF_ARTICLE;
			if ( debug )
				fprintf(stderr,"%s -> (article)\n",word);
			addWord ( word, flag , langId , NULL );
			goto lineLoop;
		}
		// is it a language we support?
		int32_t n = sizeof(s_lowerLangWikiStrings) / sizeof(char *);
		for ( int32_t i = 0 ; i < n ; i++ ) {
			const char *str = s_lowerLangWikiStrings[i];
			if ( ! str ) gbshutdownLogicError();
			int32_t  len = strlen(str);
			if ( ! strncasecmp(wp,str,len) ) {
				langId = i;
				if ( debug )
					fprintf(stderr,"%s -> (%s)\n",
						word,getLanguageAbbr(langId));
				addWord ( word, 0 , langId , NULL);
				goto lineLoop;
			}
		}
		// unsupported lang?
		if ( equalCount == 2 ) {
			langId = langUnknown;
			if ( debug )
				fprintf(stderr,"%s -> (%s)\n",
					word,getLanguageAbbr(langId));
			addWord ( word, 0 , langId , NULL );
		}

		// ignore the header otherwise
		goto lineLoop;
	}

	bool gotGoodLine = false;

	// we might have "{{head|tr|abbreviation}} (''[[....
	// which does not start with a #
	//if ( wp[0] == '{' && wp[1] == '{' )
	//	gotGoodLine = true;

	// we got a comment
	if ( *wp == '#' ) {
		gotGoodLine = true;
		wp++;
	}

	if ( ! gotGoodLine ) goto lineLoop;

	// save this
	char *lineStart = wp;
	// skip #
	//wp++;
	// skip space
	if ( is_wspace_a(*wp) ) wp++;

	// debug point
	//if ( word[0] == 'b' && word[1] == 'i' && word[2] == 'o' && ! word[3])
	//	log("got bio");

	//
	// SPECIAL case for abbreviations.
	// like for http://en.wiktionary.org/wiki/KS we got
	// # [[Kansas]], a state of the [[United States of America]].
	/*
        if ( flag == WF_ABBREVIATION ||
	     flag == WF_INITIALISM ) {
		// save it
		char *wpsave = wp;
		// forget it if single letter! too much confusion!!
		if ( ! word[1] ) goto skipSpecialLogic;
		// if the line has a '{' in it then do not do this stuff
		// skip until we hit a [[ but stop on # or \n.
		// no! hurts # "{{economics}} [[gross domestic product]]"
		//for ( ; *wp &&
		//	      // if we hit this it might be of proper form
		//	      // like
		//	      // # [[operating system]];
		//	      // {{abbreviation of|operativsystem|lang=sv}}
		//	      *wp != '{' &&
		//	      *wp !='#' &&
		//	      *wp !='\n' ;
		//      wp++ );
		//if ( *wp == '{' ) { wp = wpsave; goto skipSpecialLogic; }
		// restore it
		wp = wpsave;
		// skip until we hit a [[ but stop on # or \n
		for ( ; *wp &&
			      *wp != '[' &&
			      *wp !='#' &&
			      *wp !='\n' ;
		      wp++ );
		// get [ for abbreviation lists. what are we an abbrev of?
		if ( *wp != '[' ) { wp = wpsave; goto skipSpecialLogic; }
		wp++;
		if ( *wp != '[' ) { wp = wpsave; goto skipSpecialLogic; }
		wp++;
		// skip w: for wikipedia references
		if ( wp[0] == 'w' && wp[1] == ':' ) wp += 2;
		// find ]
		char *wpend = wp + 1;
		for ( ; *wpend &&
			      //[[w:Maltese Cross#United Kingdom|Maltese Cross
			      *wpend != '#' &&
			      //[[w:Maltese Cross#United Kingdom|Maltese Cross
			      *wpend != '|' &&
			      *wpend != ']' ;
		      wpend++ ) ;
		if ( ! *wpend || *wpend != ']' ) {
			wp = wpsave; goto skipSpecialLogic; }
		// if word ends in '-' toss it out... "centi-" prefix
		if ( wpend[-1] == '-' ) {wp = wpsave; goto skipSpecialLogic; }
		// "w/"
		if ( wpend[-1] == '/' ) {wp = wpsave; goto skipSpecialLogic; }
		*wpend = '\0';
		// get that word then
		//if ( debug )
			fprintf(stderr,"%s|%s -> %s"
				"\n"
				//"(%s)\n",
				,getLanguageAbbr(langId)
				,word // TITLE!
				,wp
				);
		addWord ( word, flag , langId , wp );
		// try another line
		goto lineLoop;
	}

 skipSpecialLogic:
*/

	// look for something like "{{abbreviation of|Albuquerque|.."
	if ( *wp != '{' ) goto lineLoop;
	wp++;
	if ( *wp != '{' ) goto lineLoop;
	wp++;

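	// sometimes the line holds two template sets, like
	// # {{education}} {{initialism of|Artium Magister}}
	// in that case jump ahead to the second {{...}} set on the same
	// line (matched as "}} {{" below) and scan that one instead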
	char *secondSet = wp;
	for ( ; *secondSet && *secondSet != '\n'; secondSet++ ) {
		// check
		if ( secondSet[0] == '}' &&
		     secondSet[1] == '}' &&
		     secondSet[2] == ' ' &&
		     secondSet[3] == '{' &&
		     secondSet[4] == '{' ) {
			// skip to the second set of {{}}'s on the
			// same line
			wp = secondSet += 5;
			break;
		}
	}

	// start scan here
	//char *scanStart = wp;
	// assume not good until the scan below sees "form|" or " of|"
	bool good = false;
	// loop over all little pipe-delineated sections
 scanForFormIndicator:
	// scan until we hit |and not }
	for ( ; *wp && *wp != '}' && *wp != '|' ; wp++ ) {
		// # {{nl-noun-form|pl=1|wijziginkje}}
		if ( wp[0] == 'f' &&
		     wp[1] == 'o' &&
		     wp[2] == 'r' &&
		     wp[3] == 'm' &&
		     wp[4] == '|' )
			good = true;
		// # {{abbeviation of|camarade|...
		if ( wp[0] == ' ' &&
		     wp[1] == 'o' &&
		     wp[2] == 'f' &&
		     wp[3] == '|' )
			good = true;
		// for 'BM' page:
		// # {{head|tr|abbreviation}} (''[[B...
		/*
		if ( wp[0] == 'h' &&
		     wp[1] == 'e' &&
		     wp[2] == 'a' &&
		     wp[3] == 'd' &&
		     wp[4] == '|' )
			good = true;
		*/
	}
	// success?
	if ( *wp != '|' ) goto lineLoop;
	// "of" or "form" must precede the pipe
	if ( ! good ) {
		// maybe try next pipe delineated section
		wp++;
		goto scanForFormIndicator;
	}


	// broken:
	// # {{conjugation of|livrer||1|s|pres|ind|lang=fr}}
	// # {{form of|third-person singular present|pondre|lang=fr}}
	// # {{plural of|pie|lang=fr}}
	// # {{inflection of|[[pius#Latin|pius]]||voc|m|s|lang=la}}
	// # {{form of|Singular dative masculine|on|lang=cs}}

	// skip |
	wp++;

	// find terminating '}'
	char *end = wp;
	for ( ; *end && end < nextTitle && *end != '}' ;end++ );
	// try next line if could not find }
	if ( ! *end || end >= nextTitle ) goto lineLoop;
	// null term it
	*end = '\0';
	// in case there was a # in there!
	if ( np < end + 1 ) {
		np = end + 1;
		for ( ; *np && np < nextTitle ; np++ )
			if ( *np =='#' || (*np == '=' && np[1]=='=') )
				break;
	}


	// nuke all of it! "archaic third person ..."
	if ( gb_strcasestr(lineStart,"archaic ") )
		goto lineLoop;
	if ( gb_strcasestr(lineStart,"archaic|") )
		goto lineLoop;
	if ( gb_strcasestr(lineStart,"archaic}") )
		goto lineLoop;
	// fix 'goest' has {{archaic-verb-form
	if ( gb_strcasestr(lineStart,"{archaic") )
		goto lineLoop;
	if ( gb_strcasestr(lineStart,"eye dialect") )
		goto lineLoop;
	if ( gb_strcasestr(lineStart,"eye-dialect") )
		goto lineLoop;
	// obslete form or spelling
	if ( gb_strcasestr(lineStart,"obsolete ") )
		goto lineLoop;
	if ( gb_strcasestr(lineStart,"obsolete|") )
		goto lineLoop;
	if ( gb_strcasestr(lineStart,"obsolete}") )
		goto lineLoop;
	// {standard of identity|UK} (measurement)
	// prevent cream->UK
	if ( gb_strcasestr(lineStart,"standard ") )
		goto lineLoop;
	// fix 'gwine'
	if ( gb_strcasestr(lineStart,"nonstandard") )
		goto lineLoop;

	//
	// now wp = "|.....}" and end = the ending '}'
	//
	// CRAP: # {{sports}} {{initialism of|[[championship|Championship]] [[record|Record]] or [[competition|Competition]] Record}}
	// is messing up on converting pipes to \0 because it
	// ends up mapping "CR" to "championship".
	int32_t inBrackets = 0;
	for ( char *s = wp ; s < end ; s++ ) {
		if ( *s == '[' ) inBrackets++;
		if ( *s == ']' ) inBrackets--;
		if ( *s == '|' && ! inBrackets ) *s = '\0';
	}
	// scan the strings now
	char *start = NULL;
	int32_t slen;
	bool skipNext = false;
	for ( char *s = wp ; s < end ; s += slen + 1 ) {
		slen = strlen(s);
		// skip numbers |1|
		if ( slen == 1 && is_digit(*s) ) continue;
		// skip that {{l|en|... crap {{l|fro|...
		if ( ! strcmp(s,"{{l") ) { skipNext = true; continue;}
		if ( skipNext ) { skipNext = false; continue; }
		// skip certain words
		if ( ! strcmp(s,"pass") ) continue;
		if ( ! strcmp(s,"pres") ) continue;
		if ( ! strcmp(s,"fut") ) continue;
		if ( ! strcmp(s,"nom") ) continue;
		if ( ! strcmp(s,"act") ) continue;
		if ( ! strcmp(s,"voc") ) continue;
		if ( ! strcmp(s,"imp") ) continue;
		if ( ! strcmp(s,"acc") ) continue;
		if ( ! strcmp(s,"ind") ) continue;
		if ( ! strcmp(s,"sub") ) continue;
		if ( ! strcmp(s,"s") ) continue;
		if ( ! strcmp(s,"p") ) continue;
		if ( ! strcmp(s,"m") ) continue;
		if ( ! strcmp(s,"f") ) continue;
		// assignment like "lang=la"
		if ( strstr(s,"=" ) ) continue;
		// third-person singluar
		if ( gb_strcasestr(s,"person ") ) continue;
		if ( gb_strcasestr(s," person") ) continue;
		// third-person
		if ( gb_strcasestr(s,"-person") ) continue;
		// Singular dative masculine
		if ( gb_strcasestr(s,"dative ") ) continue;
		if ( gb_strcasestr(s,"nominative ") ) continue;
		if ( gb_strcasestr(s,"imperative ") ) continue;
		if ( gb_strcasestr(s,"comparative ") ) continue;
		if ( gb_strcasestr(s,"genitive") ) continue;
		if ( gb_strcasestr(s,"possessive ") ) continue;
		if ( gb_strcasestr(s," possessive") ) continue;
		if ( gb_strcasestr(s,"past tense") ) continue;
		// impersonal past
		if ( gb_strcasestr(s," past") ) continue;
		if ( gb_strcasestr(s,"present tense") ) continue;
		if ( gb_strcasestr(s,"future tense") ) continue;
		// passive voice
		if ( gb_strcasestr(s,"passive ") ) continue;
		// present analytic
		if ( gb_strcasestr(s," analytic") ) continue;
		if ( gb_strcasestr(s,"subjunctive ") ) continue;
		if ( gb_strcasestr(s," subjunctive ") ) continue;
		// Postal abbreviation
		if ( gb_strcasestr(s," abbreviation") ) continue;
		// abbreviation of
		if ( gb_strcasestr(s,"abbreviation ") ) continue;
		// infinitive passive
		if ( gb_strcasestr(s,"infinitive ") ) continue;
		// infinitive passive voice
		if ( gb_strcasestr(s," infinitive") ) continue;
		if ( gb_strcasestr(s,"appendix:") ) continue;
		// "form used..."
		if ( gb_strcasestr(s,"form ") ) continue;
		// inflection of
		if ( gb_strcasestr(s,"inflection ") ) continue;
		// front vowel variant
		if ( gb_strcasestr(s," variant") ) continue;
		if ( gb_strcasestr(s," spelling") ) continue;
		if ( gb_strcasestr(s," misspelling") ) continue;
		// definite and plural
		if ( gb_strcasestr(s,"definite") ) continue;
		if ( gb_strcasestr(s,"accusative ") ) continue;
		if ( gb_strcasestr(s,"vocative ") ) continue;
		if ( gb_strcasestr(s,"indicative") ) continue;
		if ( gb_strcasestr(s,"plural") ) continue;
		if ( gb_strcasestr(s,"feminine") ) continue;
		if ( gb_strcasestr(s,"masculine") ) continue;
		if ( gb_strcasestr(s,"oblique") ) continue;
		// singuler definite
		if ( gb_strcasestr(s,"singular ") ) continue;
		if ( gb_strcasestr(s," singular") ) continue;
		// prepositional singluar
		if ( gb_strcasestr(s,"prepositional") ) continue;
		if ( gb_strcasestr(s," participle") ) continue;
		// han form
		if ( gb_strcasestr(s," form") ) continue;
		// *PRENSENT* tense
		if ( gb_strcasestr(s," tense") ) continue;
		if ( gb_strcasestr(s,"lower case") ) continue;
		if ( gb_strcasestr(s,"upper case") ) continue;

		// kills the word "present"! so hardcode that!
		if ( ! strcmp(s,"present") ) continue;
		if ( ! strcmp(s,"past") ) continue;
		if ( ! strcmp(s,"capital form") ) continue;
		if ( ! strcmp(s,"capitalized form") ) continue;
		if ( ! strcmp(s,"obsolete capitalization") ) continue;
		if ( ! strcmp(s,"archaic form") ) continue;
		if ( ! strcmp(s,"shortened form") ) continue;
		if ( ! strcmp(s,"reduced form") ) continue;
		if ( ! strcmp(s,"unstressed form") ) continue;
		if ( ! strcmp(s,"lowercase form") ) continue;
		if ( ! strcmp(s,"uncapitalized form") ) continue;
		if ( ! strcmp(s,"imperative") ) continue;
		// assume that is it i guess
		start = s;
		break;
	}

	// skip if empty!!! wtf??
	if ( ! start ) { wp = end + 1 ; goto lineLoop; }
	// skip ['s and spaces
	// skipping ' made "ve" a form of "of" where it was "'ve"
	for ( ;
	      *start == '[' || *start == ' ' ; // || *start == '\'';
	      start++ );

	// and ]'s
	char *wend = start + strlen(start);
	for ( ; wend && wend>start && wend[-1] == ']' ;wend--);
	*wend = '\0';

	// sometimes they start with w: like for ANZAC:
	// # {{initialism of|[[w:Australian and New Zealand Army Corps|Australian and New Zealand Army Corps]]}}
	if ( start[0]=='w' && start[1]==':' ) {
		start += 2;
		// these are wikipedia titles, skip!
		//goto lineLoop;
	}
	if ( strncasecmp(start,"wikipedia:",10)==0 ) {
		start += 10;
		// these are wikipedia titles, skip!
		//goto lineLoop;
	}
	if ( start[0]==':' && start[1]=='w' && start[2]==':'){
		start += 3;
		// these are wikipedia titles, skip!
		//goto lineLoop;
	}

	// nuke after # anchor
	char *a = start;
	for ( ; *a ; a++ ) if ( *a == '#' ) { *a = '\0'; break; }
	// do not add huge words
	if ( strlen(start) > 1000 ) goto lineLoop;
	// skip that
	wp = end + 1;

	// or the word " or " in there!
	// identification|Identification]] or [[identity]] [[documentation]
	// # {{comparative of|[[good]] or [[well]]
	a = start;
	for ( ; *a ; a++ ) {
		if ( strncmp(a,"]] or [[",8) == 0 ) {
			*a = '\0';
			break;
		}
	}

	// if it has any pipes, i am not dealing with that
	// CRAP: # {{sports}} {{initialism of|[[championship|Championship]] [[record|Record]] or [[competition|Competition]] Record}}
	// cuz it gets too complicated!!!
	a = start;
	int32_t pipeCount = 0;
	for ( ; *a ; a++ ) { if ( *a == '|' ) pipeCount++; }
	a = start;
	// too many pipes?
	if ( pipeCount >= 2 )
		goto lineLoop;
	// if just one, pick the first term i guess
	// # {{initialism of|[[w:Americans for Democratic Action|Ame..
	for ( ; *a ; a++ ) {
		if ( *a == '|' ) {
			// fix
			// {{acronym of|Search for [[extraterrestrial|Extraterrestrial]] Intelligence\0
			char *bs = a;
			for ( ; *bs ; bs++ ) {
				if ( *bs == ']' )
					goto lineLoop;
			}
			// ok, good to go
			*a = '\0';
			break;
		}
	}


	// # {{British|Ireland|dated}} {{initialism of|[[&amp;pound;sd
	// nuke if semicolon
	a = start;
	for ( ; *a ; a++ ) {
		if ( *a == ';' ) goto lineLoop;
		if ( *a == '*' ) goto lineLoop; // f**k
		if ( *a == '+' ) goto lineLoop;
		if ( *a == ',' ) goto lineLoop;
		if ( *a == '{' ) goto lineLoop;
		if ( *a == '}' ) goto lineLoop;
		if ( *a == '(' ) goto lineLoop;
		if ( *a == ')' ) goto lineLoop;
		if ( *a == '/' ) goto lineLoop;
	}

	// skip initial spaces again
	for ( ; *start == ' ' ; start++ );

	// forget it if ends or begins with hyphen
	if ( start[0]  == '-' ) goto lineLoop;
	if ( a    [-1] == '-' ) goto lineLoop;

	// or starts with '
	// fix "'s" for "is"  (the dog's running after me)
	// fix "'ve" as a form of "of"
	if ( start[0] == '\'' ) goto lineLoop;

	// same with underscore (fix fotch->_)
	if ( start[0] == '_' ) goto lineLoop;
	if ( a[-1]    == '_' ) goto lineLoop;

	// re-write the base word and filter out [ and ]
	char normBuf[1024];
	dst = normBuf;
	src = start;
	for ( ; *src ; src++ ) {
		*dst = *src;
		if ( *dst == '[' ) continue;
		if ( *dst == ']' ) continue;
		dst++;
	}
	*dst = '\0';
	// trim off spaces
	wend = normBuf + strlen(normBuf);
	// fix ''sadden''
	for ( ; wend && wend>normBuf &&
		      (wend[-1] == ']' ||
		       wend[-1] == ' ' ||
		       wend[-1] == '\'' ) ;
	      wend--);
	*wend = '\0';


	// or starts with '
	// fix "'s" for "is"  (the dog's running after me)
	if ( normBuf[0] == '\'' ) goto lineLoop;

	if ( debug )
		fprintf(stderr,"%s -> %s"
			"\n"
			//"(%s)\n",
			,word // TITLE!
			,normBuf // baseform! // start
			//getLanguageAbbr(langId)
			);
	addWord ( word, flag , langId , normBuf ); // start );
	// try another line
	goto lineLoop;
}

// . record that "word" in language "langId" is a form of "formOf"
// . word    : NUL-terminated surface form (the wiktionary title)
// . posFlag : part-of-speech flag. currently NOT hashed into any key, the
//             line that did so is disabled below
// . langId  : language of the word. langUnknown entries are ignored and
//             reported as success
// . formOf  : NUL-terminated base form, or NULL to make the word a form
//             of itself
// . side effects:
//   - appends "<langAbbr>|<word>\n" to m_langBuf once per (word,lang)
//   - stores word and formOf in m_debugBuf, with m_debugMap mapping their
//     case-sensitive hash to the offset of the string in that buffer
//   - adds word AND the base form itself to m_tmp under the key
//     hash(formOf) ^ g_hashtab[0][langId], so compile() can list every
//     form of a base form
// . returns false if a hash table insert fails, true otherwise
bool Wiktionary::addWord ( char *word ,
			   uint8_t posFlag ,
			   uint8_t langId ,
			   char *formOf ) {

	// done if lang is unknown
	if ( langId == langUnknown ) return true;

	// case-sensitive hash of the word. compile() re-hashes in lower
	// case later on.
	//int64_t wid = hash64Lower_utf8(word);
	int64_t wid = hash64n(word);

	// if not form of something make it form of itself
	if ( ! formOf ) formOf = word;

	// to file like dict.cz: print each (word,lang) pair only once
	int64_t lk64 = wid ;
	lk64 ^= g_hashtab[4][langId];
	if ( ! m_dedup.isInTable ( &lk64 ) ) {
		// bail on a failed insert like every other addKey() in
		// this function. ignoring it would let the same pair be
		// printed again on a later call.
		if ( ! m_dedup.addKey ( &lk64 ) ) return false;
		m_langBuf.safePrintf ( "%s|%s\n",
				       getLanguageAbbr(langId),
				       word);
	}

	// store word so we can map its hash back to a string
	if ( ! m_debugMap.isInTable ( &wid ) ) {
		// offset the string will have in m_debugBuf
		int32_t off  = m_debugBuf.length();
		int32_t wlen = strlen(word);
		m_debugBuf.safeMemcpy ( word, wlen );
		m_debugBuf.pushChar('\0');
		if ( ! m_debugMap.addKey ( &wid , &off ) ) return false;
	}

	// need a POS for adding for synonyms
	//if ( ! posFlag ) return true;

	// . get hash of form of
	// . i.e. if word is "jumping" then formOf is "jump"
	// . so this maps "jump" to all the forms it has
	// . thus the same key is added to m_tmp several times
	//int64_t baseForm = hash64Lower_utf8(formOf);
	int64_t baseForm = hash64n(formOf);

	// also store formOf so compile() can look up its string
	if ( ! m_debugMap.isInTable ( &baseForm ) ) {
		int32_t off = m_debugBuf.length();
		m_debugBuf.safeStrcpy ( formOf );
		m_debugBuf.pushChar('\0');
		if ( ! m_debugMap.addKey ( &baseForm , &off ) ) return false;
	}

	// the m_tmp key: base form hash with the langid hashed in
	int64_t fh64 = baseForm;
	fh64 ^= g_hashtab[0][langId];
	// include POS flag too i guess
	//fh64 ^= g_hashtab[1][posFlag];

	// . add the word itself and then the base form under the same key
	// . the base form maps to itself as well so compile() works: if we
	//   have the word "jumping" an alt form is "jump"
	// . when word and formOf hash the same, the second round finds its
	//   dedup key already present and adds nothing
	int64_t forms[2] = { wid , baseForm };
	for ( int32_t k = 0 ; k < 2 ; k++ ) {
		// one m_tmp entry per (key,form) pair
		int64_t dk64 = hash64h ( fh64 , forms[k] );
		if ( m_dedup.isInTable ( &dk64 ) ) continue;
		// the data: 8 bytes of form hash followed by the langid,
		// which compile() reads back from data[8]
		char data[9];
		gbmemcpy ( data , &forms[k] , 8 );
		data[8] = langId;
		if ( ! m_tmp.addKey ( &fh64 , data ) ) return false;
		if ( ! m_dedup.addKey ( &dk64 ) ) return false;
	}

	// success!
	return true;
}

// . make the synonym/form table from m_tmp
// . every distinct m_tmp key stands for one base form in one language
//   (addWord() keys m_tmp by the base form hash xor'ed with a langid hash)
// . for each such key that has 2+ distinct forms we append one line
//   "<langAbbr>|form1,form2,...\n" to m_synBuf, holding at most 100 forms
// . m_synTable then maps each form (lower-case hash xor'ed with the langid
//   hash, 6 byte keys) to the offset of that line in m_synBuf
// . the part of speech flag is NOT part of any key here
// . forms are grouped using the case-sensitive hashes from addWord() and
//   only re-hashed in lower case once the groups are established, since
//   wiktionary is highly case-dependent
// . returns false if a hash table insert fails, true otherwise
bool Wiktionary::compile ( ) {

	// m_tmp keys whose synset has already been written out
	HashTableX dedup;
	dedup.set ( 8,0,16777216,NULL,0,false,"cdtab");

	// m_tmp is only read in here, so its slot count is fixed
	const int32_t numSlots = m_tmp.getNumSlots();

	// scan the m_tmp table
	for ( int32_t i = 0 ; i < numSlots ; i++ ) {
		// skip empty slots
		if ( ! m_tmp.m_flags[i] ) continue;
		// get this guys key
		int64_t fh64 = m_tmp.getKey64FromSlot(i);
		// do not repeat
		if ( dedup.isInTable ( &fh64 ) ) continue;
		// this uses 8 byte keys
		if ( ! dedup.addKey  ( &fh64 ) ) return false;

		// NOTE(review): both passes below walk forward from slot i
		// until the first empty slot. if a chain of equal keys wraps
		// past the end of the table, the entries sitting before the
		// wrap look unreachable from a low i. verify against how
		// HashTableX probes.

		//
		// pass 1: count the distinct forms. need 2+ to get added
		//
		HashTableX dd2;
		char dbuf2[512];
		dd2.set(8,0,8,dbuf2,512,false,"ddttt2");
		int32_t formCount = 0;
		for ( int32_t j = i ; ; j++ ) {
			// wrap around
			if ( j >= numSlots ) j = 0;
			// chain stops when we hit empty slot
			if ( ! m_tmp.m_flags[j] ) break;
			// must match
			if ( m_tmp.getKey64FromSlot(j) != fh64 ) continue;
			// the data leads with the case-sensitive hash of the
			// form, which is the m_debugMap key for its string
			char *data = (char *)m_tmp.getValueFromSlot(j);
			int32_t *offPtr = (int32_t *)m_debugMap.getValue(data);
			// must be there
			if ( ! offPtr ) gbshutdownLogicError();
			char *word = m_debugBuf.getBufStart() + *offPtr;
			// now re-hash it as lower case and dedup on that
			int64_t wid = hash64Lower_utf8(word);
			if ( dd2.isInTable ( &wid ) ) continue;
			if ( ! dd2.addKey ( &wid ) ) return false;
			// it matches!
			formCount++;

			//The original code generated synonyms from words with accents based on the unicode canonical-decomposition
			//data. On the surface that sounds like a good idea, eg. if you search for 'Chloe' you'll find hits on
			//'Chloë' too. However, that ignores whether the accent/mark is optional. Removing accents from 'bûche de Noël',
			//'mañaja', 'Ötjendorf' or 'kål' changes the words significantly. You must either not do it or sometimes do
			//language/orthography-dependent transliteration
		}
		// need 2+ forms!
		if ( formCount <= 1 ) continue;

		//
		// pass 2: write the synset line and index every form
		//

		// offset of this synset's line in m_synBuf
		int32_t bufLen = m_synBuf.length();
		// remove dups
		HashTableX dd;
		char dbuf[512];
		dd.set(8,0,8,dbuf,512,false,"ddttt");
		// # of forms written to the line so far
		int32_t count = 0;
		// chain for all keys that are the same
		for ( int32_t j = i ; ; j++ ) {
			// wrap around
			if ( j >= numSlots ) j = 0;
			// chain stops when we hit empty slot
			if ( ! m_tmp.m_flags[j] ) break;
			// must match
			if ( m_tmp.getKey64FromSlot(j) != fh64 ) continue;
			// get a form of the base form
			char *data = (char *)m_tmp.getValueFromSlot(j);
			// get the word itself
			int32_t *offPtr = (int32_t *)m_debugMap.getValue(data);
			// must be there
			if ( ! offPtr ) gbshutdownLogicError();
			char *word = m_debugBuf.getBufStart() + *offPtr;
			// now re-hash it in lower case
			int64_t wid = hash64Lower_utf8(word);
			// addWord() buries the langid after the 8 byte hash
			uint8_t langId = data[8];
			// xor in the langid
			wid ^= g_hashtab[0][langId];
			// only add this word form once per langId
			if ( dd.isInTable ( &wid ) ) continue;
			if ( ! dd.addKey ( &wid ) ) return false;
			// . the first form leads with a "<langAbbr>|", every
			//   later one with a comma
			// . writing the separator BEFORE the form keeps the
			//   line well-formed even when we stop early at the
			//   100 form limit or when this pass ends up with a
			//   different count than pass 1. keying it off
			//   formCount left a trailing comma on truncation.
			if ( count == 0 ) {
				m_synBuf.safeStrcpy(getLanguageAbbr(langId));
				m_synBuf.pushChar('|');
			}
			else {
				m_synBuf.pushChar(',');
			}
			// print that
			m_synBuf.safeStrcpy(word);
			// . a ptr to that sequence of alt forms in the buf
			// . this uses 6 byte keys
			// . a failed insert would silently lose this form's
			//   mapping, so give up like we do for dedup above
			if ( ! m_synTable.addKey(&wid,&bufLen) ) return false;

			//
			// wtf?
			// "won" has two bases "win" and "won"
			// en|won,wons,woned
			// en|win,won,winning,wins
			// and we seem to map to the first one only...
			// so maybe allow dup keys in syntable?
			//

			// see the note in pass 1 about accent-based synonym
			// generation

			// count em up
			count++;
			// limit to 100 synonyms per synset
			if ( count >= 100 ) break;
		}
		// new line
		m_synBuf.pushChar('\n');
		// . and of course the base form. "jump"
		// . no, the base form is mapped to itself in m_tmp by
		//   addWord() now, so it shows up as a regular form above
	}

	return true;
}

// . meant to add unified dict entries into m_langTable if they
//   belong to one and only one language
// . currently a no-op: both steps below are commented out, so nothing is
//   read or modified and this always returns true
bool Wiktionary::integrateUnifiedDict ( ) {

	/*
	// disabled step 1. this is only a sketch: numSlots, ud, justOneLang,
	// wid and langId are not declared anywhere in it.
	// scan unified dict
	for ( int32_t i = 0 ; i < numSlots ; i++ ) {
		// skip empty slots
		if ( ! ud->m_flags[i] ) continue;
		// get ptrs
		int32_t off = *(int32_t *)ud->getValueFromSlot(i);
		// reference
		char *p = g_speller.m_unifiedBuf + off;
		// just one lang?
		if ( ! justOneLang ) continue;
		// skip if already there
		if ( m_langTable.isInTable ( &wid ) ) continue;
		// add it then
		if ( ! m_langTable.addKey ( &wid , &langId ) ) return false;
	}
	*/

	/*
	// disabled step 2: copy every entry of m_langTableTmp that is not
	// marked langTranslingual into m_langTable
	// scan langtable and remove translingual entries
	for ( int32_t i = 0 ; i < m_langTableTmp.m_numSlots ; i++ ) {
		// skip empty slots
		if ( ! m_langTableTmp.m_flags[i] ) continue;
		// check it
		if ( *(uint8_t *)m_langTableTmp.getValueFromSlot(i) ==
		     langTranslingual )
			continue;
		// add it
		char *key = (char *)m_langTableTmp.getKeyFromSlot(i);
		char *val = (char *)m_langTableTmp.getValueFromSlot(i);
		if ( ! m_langTable.addKey ( key , val ) ) return false;
	}
	*/

	// nothing to integrate while the steps above stay disabled
	return true;
}