1985 lines
53 KiB
C++
1985 lines
53 KiB
C++
#include "Wiktionary.h"
|
|
|
|
#include "Query.h"
|
|
#include "tokenizer.h"
|
|
#include "Titledb.h"
|
|
#include "Speller.h"
|
|
#include "Conf.h"
|
|
#include "Lang.h"
|
|
#include "Mem.h"
|
|
#include "Errno.h"
|
|
#include <sys/stat.h> //stat()
|
|
#include <fcntl.h>
|
|
#include <unistd.h>
|
|
#include "gbmemcpy.h"
|
|
|
|
// The global Wiktionary instance. It is a namespace-scope object with no
// initializer, so it is built by the default constructor defined below.
|
|
Wiktionary g_wiktionary;
|
|
|
|
Wiktionary::Wiktionary () {
	// start out with nothing opened, no pending callback and no error
	m_opened   = false;
	m_errno    = 0;
	m_txtSize  = 0;
	m_callback = NULL;
	m_state    = NULL;

	// zero the scratch buffer
	memset ( m_buf , 0 , sizeof(m_buf) );

	// . synonym table: maps a word id to an offset into m_synBuf
	// . the sixth argument (true) is the allow-dups flag
	// . NOTE(review): the first two arguments look like key size (6) and
	//   data size (4); an older comment here claimed 8 and 2, so confirm
	//   against the hash table's set() declaration
	m_synTable.set ( 6 , 4 , 0 , NULL , 0 , true , "wkt-synt" );

	// label the synset text buffer
	m_synBuf.setLabel ( "synbuf" );
}
|
|
|
|
// Release every table and buffer this object holds.
void Wiktionary::reset() {
	// the synonym table and the synset text it points into
	m_synTable.reset();
	m_synBuf.purge();

	// debug word map and its backing buffer
	m_debugMap.reset();
	m_debugBuf.purge();

	// temporary tables used while building from the raw text dump
	m_dedup.reset();
	m_tmp.reset();

	// language buffer
	m_langBuf.reset();

	// local override synsets (filled by addSynsets())
	m_localBuf.purge();
	m_localTable.reset();
}
|
|
|
|
// Close the backing file, but only if it was ever opened.
Wiktionary::~Wiktionary () {
	if ( ! m_opened ) return;
	m_f.close();
}
|
|
|
|
|
|
bool Wiktionary::test ( ) {
|
|
|
|
// test words parsing here
|
|
//Words w;
|
|
//w.set9 ("get $4,500.00 now",0);
|
|
|
|
// test it out!
|
|
const char *str = "love";//pie"; //forsake";
|
|
//int64_t wid = hash64Lower_utf8(str);
|
|
int64_t wid = hash64n(str);
|
|
// use this now
|
|
const char *p = getSynSet ( wid, langEnglish );
|
|
//char *p = (char *)m_synTable.getValue ( &wid );
|
|
// must be there
|
|
if ( ! p ) gbshutdownLogicError();
|
|
// first # is number of forms
|
|
//if ( *p < 0 || *p > 100 ) gbshutdownLogicError();
|
|
// first is count!
|
|
//int32_t n = *p;
|
|
// skip that
|
|
//p++;
|
|
// find new line
|
|
const char *end = p;
|
|
for ( ; *end && *end !='\n' ; end++ );
|
|
// cast it
|
|
// only the first 6 bytes are valid
|
|
//int64_t *termIds = (int64_t *)p;
|
|
// header
|
|
log("wikt: test \"%s\" -> \"%*.*s\"",str,(int)(end-p),(int)(end-p),p);
|
|
|
|
return true;
|
|
}
|
|
|
|
#include "Synonyms.h"
|
|
|
|
bool Wiktionary::test2 ( ) {
|
|
|
|
loop2:
|
|
|
|
uint8_t langId = langEnglish; // langUnknown
|
|
|
|
char input[256];
|
|
fgets(input,200,stdin);
|
|
input[strlen(input)-1]='\0';
|
|
if ( input[0] == '\0' ) return true;
|
|
|
|
char *str;
|
|
|
|
// get language
|
|
char *pipe = strstr ( input, "|" );
|
|
if ( ! pipe ) {
|
|
fprintf(stderr,"lang = %s\n",getLanguageAbbr(langId));
|
|
str = input;
|
|
}
|
|
else {
|
|
*pipe = '\0';
|
|
langId = getLangIdFromAbbr ( input );
|
|
fprintf(stderr,"lang = %s\n",getLanguageAbbr(langId));
|
|
str = pipe + 1;
|
|
}
|
|
//wid = hash64Lower_utf8(str);
|
|
//wid = hash64n(str);
|
|
|
|
TokenizerResult tr;
|
|
plain_tokenizer_phase_1(str,strlen(str), &tr);
|
|
calculate_tokens_hashes(&tr);
|
|
|
|
int32_t wordNum = 0;
|
|
char tmpBuf[1000];
|
|
Synonyms syn;
|
|
int32_t naids = syn.getSynonyms(&tr,
|
|
wordNum ,
|
|
langId ,
|
|
tmpBuf );
|
|
// print those out
|
|
SafeBuf sb;
|
|
for ( int32_t k = 0 ; k < naids ; k++ ) {
|
|
char *str = syn.m_termPtrs[k];
|
|
int32_t len = syn.m_termLens[k];
|
|
sb.safeMemcpy(str,len);
|
|
if ( k+1<naids) sb.pushChar(',');
|
|
}
|
|
sb.pushChar('\0');
|
|
|
|
// use this now.
|
|
//p = getSynSet ( wid, langId );//, WF_NOUN );
|
|
|
|
// must be there
|
|
if ( ! naids ) {
|
|
fprintf(stderr,"no forms\n");
|
|
goto loop2;
|
|
}
|
|
|
|
fprintf(stderr,"%s -> %s\n",str,sb.getBufStart());
|
|
goto loop2;
|
|
}
|
|
|
|
// . load from disk
|
|
bool Wiktionary::load() {
|
|
|
|
// load it from .dat file if exists and is newer
|
|
char ff1[sizeof(g_hostdb.m_dir)+128];
|
|
//char ff2[sizeof(g_hostdb.m_dir)+128];
|
|
char ff3[sizeof(g_hostdb.m_dir)+128];
|
|
char ff4[sizeof(g_hostdb.m_dir)+128];
|
|
snprintf(ff1, sizeof(ff1), "%swiktionary.txt.aa", g_hostdb.m_dir);
|
|
ff1[ sizeof(ff1)-1 ] = '\0';
|
|
//sprintf(ff2, "%swiktionary-mybuf.txt", g_hostdb.m_dir);
|
|
snprintf(ff3, sizeof(ff3), "%swiktionary-syns.dat", g_hostdb.m_dir);
|
|
ff3[ sizeof(ff3)-1 ] = '\0';
|
|
snprintf(ff4, sizeof(ff4), "%swiktionary-buf.txt", g_hostdb.m_dir);
|
|
ff4[ sizeof(ff4)-1 ] = '\0';
|
|
int fd1 = open ( ff1 , O_RDONLY );
|
|
int fd3 = open ( ff3 , O_RDONLY );
|
|
if ( fd3 < 0 ) {
|
|
log(LOG_INFO,"wikt: open %s: %s",ff3,mstrerror(errno));
|
|
}
|
|
int fd4 = open ( ff4 , O_RDONLY );
|
|
if ( fd4 < 0 ) {
|
|
log(LOG_INFO,"wikt: open %s: %s",ff1,mstrerror(errno));
|
|
}
|
|
|
|
struct stat stats1;
|
|
struct stat stats3;
|
|
struct stat stats4;
|
|
int32_t errno1 = 0;
|
|
int32_t errno3 = 0;
|
|
int32_t errno4 = 0;
|
|
if ( fd1 < 0 || fstat ( fd1 , &stats1 ) == -1 ) errno1 = fd1 < 0 ? -1 : errno;
|
|
if ( fd3 < 0 || fstat ( fd3 , &stats3 ) == -1 ) errno3 = fd3 < 0 ? -1 : errno;
|
|
if ( fd4 < 0 || fstat ( fd4 , &stats4 ) == -1 ) errno4 = fd4 < 0 ? -1 : errno;
|
|
if( fd1 >= 0 ) close ( fd1 );
|
|
if( fd3 >= 0 ) close ( fd3 );
|
|
if( fd4 >= 0 ) close ( fd4 );
|
|
|
|
// if we got a newer binary version, use that
|
|
if ( ! errno3 && ! errno4 &&
|
|
// load from binaries if orig txt is not there OR our
|
|
// binary make time is ahead of the orig txt make time
|
|
( errno1 || stats3.st_mtime > stats1.st_mtime )
|
|
//&& ( errno2 || stats3.st_mtime > stats2.st_mtime )
|
|
) {
|
|
log(LOG_INFO,"wikt: Loading %s",ff3);
|
|
if ( ! m_synTable .load ( NULL , ff3 ) )
|
|
return false;
|
|
log(LOG_INFO,"wikt: Loading %s",ff4);
|
|
if ( m_synBuf.fillFromFile ( NULL , ff4 ) <= 0 )
|
|
return false;
|
|
|
|
// augment wiktionary with our own overrides and additions from
|
|
if ( ! addSynsets ( "mysynonyms.txt" ) )
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
// if no text file that is bad
|
|
if ( errno1 ) {
|
|
g_errno = errno1 ;
|
|
log (LOG_WARN, "gb: could not open %s for reading: %s",ff1, mstrerror(g_errno));
|
|
return false;
|
|
}
|
|
//if ( errno2 ) {
|
|
// g_errno = errno2 ;
|
|
// log (LOG_WARN, "gb: could not open %s for reading: %s",ff2,mstrerror(g_errno));
|
|
// return false;
|
|
//}
|
|
// init table slot sizes
|
|
//m_langTable.setTableSize ( 16777216 , NULL , 0 );
|
|
//m_synTable .setTableSize ( 16777216 , NULL , 0 );
|
|
//m_debugMap .setTableSize ( 8388608 , NULL , 0 );
|
|
m_dedup.set ( 8 , 0 , 16777216 , NULL , 0 , false,"ddtab");
|
|
// this has to allow dups! it maps a baseForm to a variant/syn
|
|
// now it includes langid
|
|
m_tmp.set ( 8 , 9 , 16777216 , NULL , 0 , true,"tmptab");
|
|
m_debugMap.set ( 8 , 4,0,NULL,0,false,"wkt-dmap");
|
|
//m_langTableTmp.set( 6 , 1,0,NULL,0,false,0 ,"wktlangt");
|
|
// this maps a pure word id (wid) to an offset in m_debugBuf for
|
|
// printing out the word
|
|
//m_debugMap.set ( 6 , 4 , 8388608 , NULL , 0 , false, 0,"dbgmap");
|
|
|
|
// get the size of it
|
|
int32_t size = stats1.st_size;
|
|
// now we have to load the text file
|
|
// returns false and sets g_errno on error
|
|
if ( ! generateHashTableFromWiktionaryTxt ( size ) ) return false;
|
|
// success!
|
|
return true;
|
|
}
|
|
|
|
// . lower-case names of the languages as they appear in Wiktionary's
//   "==Language==" section headers
// . the parser compares each entry against the header text with a
//   case-insensitive prefix match and uses the matching ARRAY INDEX as the
//   language id, so the order of entries must not change
// . index 24 used to read "bengala", which can never match Wiktionary's
//   "Bengali" header
static const char *s_lowerLangWikiStrings[] = {
	"unknown",
	"english",
	"french",
	"spanish",
	"russian",
	"turkish",
	"japanese",
	"cantonese", // "chinese traditional",
	"mandarin", // "chinese simplified",
	"korean",
	"german",
	"dutch",
	"italian",
	"finnish",
	"swedish",
	"norwegian",
	"portuguese",
	"vietnamese",
	"arabic",
	"hebrew",
	"indonesian",
	"greek",
	"thai",
	"hindi",
	"bengali",
	"polish",
	"tagalog",

	"latin",
	"esperanto",
	"catalan",
	"bulgarian",
	"translingual",
	"serbo-croatian",
	"hungarian",
	"danish",
	"lithuanian",
	"czech",
	"galician",
	"georgian",
	"scottish gaelic",
	"gothic",
	"romanian",
	"irish",
	"latvian",
	"armenian",
	"icelandic",
	"ancient greek",
	"manx",
	"ido",
	"persian",
	"telugu",
	"venetian",
	"malagasy",
	"kurdish",
	"luxembourgish",
	"estonian"
};
|
|
|
|
// add our special augmentation table
|
|
// Synonyms.cpp should check this table separately so we can keep it
|
|
// somewhat small and re-load it on the fly.
|
|
// mysynonyms.txt
|
|
bool Wiktionary::addSynsets ( const char *filename ) {
|
|
|
|
// load it up
|
|
//SafeBuf sb;
|
|
if ( m_localBuf.fillFromFile ( g_hostdb.m_dir , filename ) < 0 ) {
|
|
log(LOG_WARN, "wikt: error loading %s", filename);
|
|
return false;
|
|
}
|
|
|
|
if ( ! m_localTable.set ( 8 ,4,9000,NULL,0,false,"synloc") )
|
|
return false;
|
|
|
|
char *p = m_localBuf.getBufStart();
|
|
|
|
nextLine:
|
|
// get end of line
|
|
char *eol = p;
|
|
// sanity
|
|
char *bufEnd = m_localBuf.getBufPtr();
|
|
if ( eol >= bufEnd )
|
|
return true;
|
|
for ( ; *eol && *eol != '\n' ; eol++ );
|
|
// skip spaces
|
|
for ( ; *p == ' ' || *p == '\t' ; p++ );
|
|
// skip comment lines
|
|
if ( *p == '#' ) {
|
|
p = eol + 1;
|
|
goto nextLine;
|
|
}
|
|
// blank line?
|
|
if ( *p == '\n' ) {
|
|
p = eol + 1;
|
|
goto nextLine;
|
|
}
|
|
// over? last line?
|
|
if ( p == eol ) return true;
|
|
// pretty lines
|
|
//if ( *eol == '\n' )
|
|
// *eol = '\0';
|
|
// need a langid like "en|vs,against"
|
|
char *lang = p;
|
|
p += 2;
|
|
// is it like zh_ch?
|
|
if ( *p == '_' ) p += 3;
|
|
// sanity
|
|
if ( *p != '|' ) {
|
|
log(LOG_WARN, "wikt: bad %s file! no lang", filename);
|
|
return false;
|
|
}
|
|
// null term now
|
|
*p = '\0';
|
|
// skip that
|
|
uint8_t langId = getLangIdFromAbbr ( lang );
|
|
// put char back
|
|
*p = '|';
|
|
// skip the pipe then
|
|
p++;
|
|
// must be there
|
|
if ( langId == 0 ) {
|
|
log(LOG_WARN, "wikt: bad language abbr in %s", filename);
|
|
return false;
|
|
}
|
|
|
|
//
|
|
// JUST ADD THESE SYNSETS as separate form wiktionary-buf.txt
|
|
// because even if duped it will not matter, Synonyms.cpp dedups
|
|
// all the word forms.
|
|
//
|
|
|
|
//
|
|
// since we now only do synonyms at query time and never index them
|
|
// it will make things much easier to deal with when we make mods
|
|
// to this stuff.
|
|
//
|
|
|
|
// make it an offset
|
|
int32_t firstLineOffset = lang - m_localBuf.getBufStart();
|
|
|
|
// remember first word
|
|
//char *first = p;
|
|
//int64_t baseHash64;
|
|
|
|
wordLoop:
|
|
// find end of word
|
|
char *e = p+1;
|
|
for ( ; *e && *e != '\n' && *e != ',' ; e++ );
|
|
|
|
// CRAP, hash each word separately???
|
|
|
|
// get word hash. ignore spaces in there... we we hash it like
|
|
// a bigram, although if a stopword leads the phrase ids will
|
|
// xor in a special number to prevent "the rapist" from being
|
|
// "therapist". see Phrases.cpp... we do not have trigrams yet
|
|
// so we will have to do like bigram list chaning somehow to
|
|
// simulate trigrams.
|
|
int64_t wh64 = hash64n_nospaces(p,e-p);
|
|
// mangle with language id so Wiktionary::getSynSet() works
|
|
wh64 ^= g_hashtab[0][langId];
|
|
// last of it?
|
|
char *nextWord = NULL;
|
|
if ( *e == ',' ) nextWord = e + 1;
|
|
//
|
|
// now add the words
|
|
//
|
|
// . point to line start... "en|..."
|
|
// . fix "en|read,,centimes,phantasia" for empty word...
|
|
if ( wh64 != 0 &&
|
|
e-p > 0 &&
|
|
! m_localTable.addKey ( &wh64 , &firstLineOffset ) )
|
|
return false;
|
|
// advance to next word
|
|
p = nextWord;
|
|
// add the word into the synset
|
|
if ( p ) goto wordLoop;
|
|
|
|
// next line otherwise
|
|
p = eol+1;
|
|
goto nextLine;
|
|
}
|
|
|
|
bool Wiktionary::generateHashTableFromWiktionaryTxt ( int32_t sizen ) {
|
|
|
|
// for debug
|
|
//sizen = 10000000;
|
|
int32_t round = 0;
|
|
|
|
//
|
|
// FILE FORMAT HELP:
|
|
//
|
|
// https://secure.wikimedia.org/wiktionary/en/wiki/Wiktionary:Entry_layout_explained
|
|
// https://secure.wikimedia.org/wiktionary/en/wiki/Wiktionary:Entry_layout_explained/POS_headers
|
|
//
|
|
//
|
|
// i downloaded this file from
|
|
// http://dumps.wikimedia.org/enwiktionary/latest/
|
|
// http://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-abstract.xml
|
|
// THEN i ran split it on like 'split -b 2000000000 wiktionary.txt'
|
|
// to divide it into two files, the first one being 2GB:
|
|
// wiktionary.txt.aa and wiktionary.txt.ab
|
|
// So read those files in here.
|
|
//
|
|
// OUTPUT files:
|
|
//
|
|
// wiktionary-syns.dat (maps a wordId to ptr into wiktionary-buf.txt)
|
|
// wiktionary-buf.txt (one syn set per line)
|
|
// wiktionary-lang.txt (<landId>|<word>\n) (used by Speller.cpp)
|
|
//
|
|
char ff1[sizeof(g_hostdb.m_dir)+128];
|
|
snprintf(ff1, sizeof(ff1), "%swiktionary.txt.aa", g_hostdb.m_dir);
|
|
ff1[ sizeof(ff1)-1 ] = '\0';
|
|
|
|
log(LOG_INFO,"wikt: Loading %s",ff1);
|
|
int fd1 = open ( ff1 , O_RDONLY );
|
|
if ( fd1 < 0 ) {
|
|
log("wikt: open %s : %s",ff1,mstrerror(errno));
|
|
return false;
|
|
}
|
|
// read in whole thing
|
|
int64_t maxReadSize = 300000000; // 300MB
|
|
char *buf = (char *)mmalloc ( maxReadSize + 1 , "wikt" );
|
|
if ( ! buf ) {
|
|
close ( fd1 );
|
|
return false;
|
|
}
|
|
|
|
int64_t offset = 0LL;
|
|
|
|
// use this to scrape popularity info and other words we are missing
|
|
//if ( ! g_speller.init() ) return false;
|
|
|
|
// the wiktionary file is like 2.6GB so we can't hold the whole thing
|
|
readInSomeFile:
|
|
|
|
// limit to 300MB
|
|
int32_t readSize = sizen;
|
|
if ( readSize > maxReadSize ) readSize = maxReadSize;
|
|
|
|
// do not breach file size
|
|
if ( offset + readSize > sizen )
|
|
readSize = sizen - offset;
|
|
|
|
//
|
|
//
|
|
// ARE WE DONE????
|
|
//
|
|
//
|
|
if ( offset >= sizen ) {
|
|
// don't forget to close
|
|
close ( fd1 );
|
|
|
|
// try reading next split file
|
|
if ( round == 0 ) {
|
|
round++;
|
|
offset = 0;
|
|
snprintf(ff1, sizeof(ff1), "%swiktionary.txt.ab",g_hostdb.m_dir);
|
|
ff1[ sizeof(ff1)-1 ] = '\0';
|
|
|
|
log(LOG_INFO,"wikt: Loading %s",ff1);
|
|
fd1 = open ( ff1 , O_RDONLY );
|
|
if ( fd1 < 0 ) {
|
|
log("wikt: open %s : %s",ff1,mstrerror(errno));
|
|
return false;
|
|
}
|
|
struct stat stats;
|
|
if ( fstat ( fd1 , &stats ) == -1 ) {
|
|
g_errno = errno;
|
|
close ( fd1 );
|
|
return false;
|
|
}
|
|
sizen = stats.st_size;
|
|
goto readInSomeFile;
|
|
}
|
|
|
|
// do not save if we can't
|
|
if ( g_conf.m_readOnlyMode ) return true;
|
|
|
|
// build m_synTable from m_tmp table
|
|
if ( ! compile() ) return false;
|
|
|
|
// add unified dict entries into m_langTable if they
|
|
// belong to one and only one language.
|
|
// right now, this just cleans out m_langTable.
|
|
if ( ! integrateUnifiedDict() ) return false;
|
|
|
|
log("wikt: testing");
|
|
|
|
//log("wiktL debug skipping test!");
|
|
test();
|
|
|
|
log("wikt: test passed");
|
|
|
|
// now save this hash table for quicker loading next time
|
|
//if ( ! m_langTable.save ( g_hostdb.m_dir ,
|
|
// "wiktionary-langs.dat" ) )
|
|
// return false;
|
|
|
|
// . and the synomnyms
|
|
// . offsets into m_synBuf, text file of synsets
|
|
if ( ! m_synTable.save ( g_hostdb.m_dir ,
|
|
"wiktionary-syns.dat" ,
|
|
NULL,
|
|
0 ) )
|
|
//m_synBuf.getBufStart() ,
|
|
//m_synBuf.length() ) )
|
|
return false;
|
|
// save text file
|
|
if ( m_synBuf.saveToFile ( g_hostdb.m_dir,
|
|
"wiktionary-buf.txt" ) <= 0 )
|
|
return false;
|
|
|
|
if ( m_langBuf.saveToFile(g_hostdb.m_dir,
|
|
"wiktionary-lang.txt" ) <= 0 )
|
|
return false;
|
|
|
|
|
|
// this too?
|
|
//if ( ! m_debugMap.save ( g_hostdb.m_dir ,
|
|
// "wiktionary-strings.dat",
|
|
// m_debugBuf.getBufStart() ,
|
|
// m_debugBuf.length() ))
|
|
// return false;
|
|
|
|
// clear this
|
|
m_tmp .reset();
|
|
m_dedup.reset();
|
|
|
|
m_debugMap.reset();
|
|
m_debugBuf.purge();
|
|
|
|
m_langBuf.reset();
|
|
|
|
return true;
|
|
}
|
|
|
|
// log it
|
|
log("wikt: reading %" PRId32" bytes of %s @ %" PRId64" (filesize=%" PRId32")",
|
|
readSize,ff1,offset,sizen);
|
|
|
|
int32_t n = pread ( fd1 , buf , readSize , offset );
|
|
|
|
if ( n != readSize ) {
|
|
log("wikt: read: %s",mstrerror(errno));
|
|
g_errno = EBADENGINEER;
|
|
close ( fd1 );
|
|
return false;
|
|
}
|
|
|
|
log("wikt: processing");
|
|
|
|
// advance for next read
|
|
offset += n;
|
|
|
|
// null terminate
|
|
buf[readSize] = '\0';
|
|
|
|
//
|
|
// simple filter. back to back spaces removed in next loop.
|
|
//
|
|
char *p = buf;
|
|
for ( ; *p ; p++ ) {
|
|
// fix # {{form of|Abbreviation|biography}} for 'bio'
|
|
if ( p[0] == 'f' &&
|
|
p[1] == 'o' &&
|
|
p[2] == 'r' &&
|
|
p[3] == 'm' &&
|
|
p[4] == ' ' &&
|
|
p[5] == 'o' &&
|
|
p[6] == 'f' &&
|
|
p[7] == '|' &&
|
|
to_lower_a(p[8]) == 'a' &&
|
|
to_lower_a(p[9]) == 'b' &&
|
|
!strncasecmp(p ,"form of|abbreviation|",21) )
|
|
// overwrite the pipe with a space
|
|
gbmemcpy(p ,"abbreviated form of|",21);
|
|
}
|
|
|
|
|
|
|
|
char *src = buf;
|
|
char *dst = buf;
|
|
// filter out the annoying bold '''
|
|
for ( ; *src ; src++ ) {
|
|
// skip bold thingy
|
|
if ( src[0] =='\'' &&
|
|
src[1] =='\'' &&
|
|
src[2] =='\'' ) {
|
|
src += 2;
|
|
continue;
|
|
}
|
|
// # {{present participle of|''[[snort]]''}}
|
|
if ( src[0] =='\'' &&
|
|
src[1] =='\'' ) {
|
|
src += 1;
|
|
continue;
|
|
}
|
|
// <space>| "for |" "form |"
|
|
if ( src[0] == ' ' &&
|
|
src[1] == '|' )
|
|
continue;
|
|
// filter back-to-back spaces
|
|
if ( src[0] == ' ' &&
|
|
src[1] == ' ' )
|
|
continue;
|
|
// <space>,
|
|
if ( src[0] == ' ' &&
|
|
src[1] == ',' )
|
|
continue;
|
|
*dst++ = *src;
|
|
}
|
|
*dst = '\0';
|
|
|
|
|
|
//
|
|
// . filter the buffer
|
|
// . set "name" to the word we are a form of
|
|
//
|
|
p = buf;
|
|
for ( ; *p ; p++ ) {
|
|
// REWRITE A LINE SEGMENT
|
|
// # {{given name|male|diminutive=Samuel}}
|
|
// # {{given name|male|diminut of|Samuel}}
|
|
if ( p[0] == 'd' &&
|
|
p[1] == 'i' &&
|
|
p[2] == 'm' &&
|
|
!strncmp(p ,"diminutive=",11) ) {
|
|
gbmemcpy(p,"diminut of|",11);
|
|
p += 11;
|
|
continue;
|
|
}
|
|
|
|
bool needPound = true;
|
|
// assume no name
|
|
char *name = NULL;
|
|
// REWRITE A FULL LINE
|
|
// # A [[diminutive]] of the male [[given name]] [[Douglas]].\n
|
|
// # {{diminutive form of|Douglas}} \n
|
|
if ( p[0] == 'm' &&
|
|
p[1] == 'a' &&
|
|
p[2] == 'l' &&
|
|
!strncmp(p ,"male [[given name]] [[",22) ) {
|
|
needPound = false;
|
|
name = p + 22;
|
|
}
|
|
//# {{given name|female}}, a [[diminutive]] of [[Abigail]].
|
|
if ( p[0] == '[' &&
|
|
p[1] == '[' &&
|
|
p[2] == 'd' &&
|
|
p[3] == 'i' &&
|
|
!strncmp(p ,"[[diminutive]] of [[",20) ) {
|
|
needPound = false;
|
|
name = p + 20;
|
|
}
|
|
|
|
// set needPound = true for this below
|
|
// variant spelling of [[poo]]
|
|
if ( p[0] == 's' &&
|
|
p[1] == 'p' &&
|
|
p[2] == 'e' &&
|
|
p[3] == 'l' &&
|
|
! strncasecmp(p ,"spelling of [[",14) )
|
|
name = p + 14;
|
|
|
|
// past participle of [[block]]
|
|
if ( p[0] == 'p' &&
|
|
p[1] == 'a' &&
|
|
p[2] == 'r' &&
|
|
p[3] == 't' &&
|
|
p[4] == 'i' &&
|
|
! strncasecmp(p ,"participle of [[",16) )
|
|
name = p + 16;
|
|
|
|
|
|
// past participle of to [[block]]
|
|
if ( p[0] == 'p' &&
|
|
p[1] == 'a' &&
|
|
p[2] == 'r' &&
|
|
p[3] == 't' &&
|
|
p[4] == 'i' &&
|
|
! strncasecmp(p ,"participle of to [[",19) )
|
|
name = p + 19;
|
|
|
|
// # [[present participle|Present participle]] of [[link]].
|
|
if ( p[0] == 'a' &&
|
|
p[1] == 'r' &&
|
|
p[2] == 't' &&
|
|
p[3] == 'i' &&
|
|
p[4] == 'c' &&
|
|
! strncasecmp(p ,"articiple]] of [[",17) )
|
|
name = p + 17;
|
|
|
|
// definite [s|S]ingular of [[block]]
|
|
if ( p[0] == 'i' &&
|
|
p[1] == 'n' &&
|
|
p[2] == 'g' &&
|
|
p[3] == 'u' &&
|
|
p[4] == 'l' &&
|
|
! strncasecmp(p ,"ingular of [[",14) )
|
|
name = p + 14;
|
|
|
|
// # Singular of {{term|airwaves|lang=en}};
|
|
if ( p[0] == 'i' &&
|
|
p[1] == 'n' &&
|
|
p[2] == 'g' &&
|
|
p[3] == 'u' &&
|
|
p[4] == 'l' &&
|
|
! strncasecmp(p ,"ingular of {{term|",18) )
|
|
name = p + 18;
|
|
|
|
// definite [p|P]lural of [[block]]
|
|
if ( p[0] == 'l' &&
|
|
p[1] == 'u' &&
|
|
p[2] == 'r' &&
|
|
p[3] == 'a' &&
|
|
p[4] == 'l' &&
|
|
! strncasecmp(p ,"lural of [[",11) )
|
|
name = p + 11;
|
|
|
|
// substitue form for case
|
|
// "objective case of" ... treat like form
|
|
// should fix page for "us" which is "objective case of we"
|
|
bool mangled = false;
|
|
if ( ! name &&
|
|
p[0] == 'c' &&
|
|
p[1] == 'a' &&
|
|
p[2] == 's' &&
|
|
p[3] == 'e' ) {
|
|
gbmemcpy ( p , "form" , 4 );
|
|
mangled = true;
|
|
}
|
|
|
|
// need "form of" for shit below
|
|
if ( ! name &&
|
|
( p[0] != 'f' ||
|
|
p[1] != 'o' ||
|
|
p[2] != 'r' ||
|
|
p[3] != 'm' ) )
|
|
continue;
|
|
|
|
bool doTailCheck = true;
|
|
if ( name ) doTailCheck = false;
|
|
|
|
// # Short form of [[hippopotamus]].
|
|
if ( ! strncasecmp(p-5 ,"past form of",12) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-6 ,"short form of",13) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-6 ,"tense form of",13) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-7 ,"plural form of",14) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-7 ,"dative form of",14) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-8 ,"present form of",15) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-9 ,"familiar form of",16) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-9 ,"singular form of",16) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-9 ,"feminine form of",16) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-9 ,"emphatic form of",16) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-9 ,"genitive form of",16) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-10 ,"shortened form of",17) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-10 ,"inflected form of",17) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-10 ,"masculine form of",17) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-10 ,"imperfect form of",17) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-10 ,"objective form of",17) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-10 ,"partitive form of",17) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-10 ,"reflexive form of",17) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-11 ,"diminutive form of",18) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-11 ,"simplified form of",18) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-11 ,"imperative form of",18) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-11 ,"indicative form of",18) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-11 ,"possessive form of",18) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-11 ,"accusative form of",18) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-12 ,"abbreviated form of",19) )
|
|
name = p + 7;
|
|
if ( ! strncasecmp(p-12 ,"alternative form of",19) )
|
|
name = p + 7;
|
|
if ( mangled )
|
|
gbmemcpy ( p , "case" , 4 );
|
|
// skip if no match
|
|
if ( ! name ) continue;
|
|
|
|
// then after "of" comes a space
|
|
if ( doTailCheck ) {
|
|
// need to have this
|
|
if ( strncmp(name," [[",3)== 0 ) name += 3;
|
|
// OR YOU CAN HAVE THIS
|
|
// # Past tense and past participle of ''to [[block]]''
|
|
// for title of "blocked". the '' should have been
|
|
// filtered out above.
|
|
else if ( strncmp(name," to [[",6)== 0 ) name += 6;
|
|
// otherwise, forget it!!
|
|
else continue;
|
|
}
|
|
|
|
// ok, replace the line with a proper name line
|
|
char *lineStart = p;
|
|
for ( ; lineStart > buf&&*lineStart!='#'&&lineStart[-1]!='\n';
|
|
lineStart--);
|
|
// need this? this is a numbered line used as a definition
|
|
// line.
|
|
if ( needPound && *lineStart != '#' )
|
|
continue;
|
|
// end end of it
|
|
char *lineEnd = p;
|
|
for ( ; *lineEnd&&*lineEnd !='\n';lineEnd++);
|
|
// temp null that
|
|
char c = *lineEnd;
|
|
*lineEnd = '\0';
|
|
//
|
|
// check for badness
|
|
// i don't like obsolete forms!!! filter out.
|
|
//
|
|
char *bad = NULL;
|
|
if ( ! bad ) bad = gb_strcasestr(lineStart,"archaic");
|
|
if ( ! bad ) bad = gb_strcasestr(lineStart,"rare ");
|
|
if ( ! bad ) bad = gb_strcasestr(lineStart,"less common");
|
|
if ( ! bad ) bad = gb_strcasestr(lineStart,"uncommon ");
|
|
if ( ! bad ) bad = gb_strcasestr(lineStart,"obsolete");
|
|
if ( ! bad ) bad = gb_strcasestr(lineStart,"older ");
|
|
if ( ! bad ) bad = gb_strcasestr(lineStart,"old ");
|
|
if ( ! bad ) bad = gb_strcasestr(lineStart,"nonstandard");
|
|
if ( ! bad ) bad = gb_strcasestr(lineStart,"eye-dialect");
|
|
if ( ! bad ) bad = gb_strcasestr(lineStart,"eye dialect");
|
|
*lineEnd = c;
|
|
if ( bad )
|
|
continue;
|
|
// now store a new form
|
|
char *dst = lineStart;
|
|
gbmemcpy(dst,"# {{form|",9);
|
|
dst += 9;
|
|
// point to name
|
|
//char *name = p + 22;
|
|
//
|
|
// PUT it in the proper formation for parsing in the logic
|
|
// below
|
|
//
|
|
// copy over name
|
|
for ( ; *name !=']' &&
|
|
*name !='\n' &&
|
|
*name != '#' &&
|
|
*name != '|' ; name++ )
|
|
*dst++ = *name;
|
|
// close it up
|
|
gbmemcpy(dst,"}}",2);
|
|
dst += 2;
|
|
// panic
|
|
if ( dst > lineEnd ) gbshutdownLogicError();
|
|
// space fill until lineEnd
|
|
for ( ; dst < lineEnd ; dst++ )
|
|
*dst = ' ';
|
|
// skip over that line then
|
|
p = lineEnd;
|
|
}
|
|
|
|
// start parsing here
|
|
p = buf;
|
|
|
|
wordLoop:
|
|
|
|
// look for <title> tag
|
|
char *title = strstr ( p , "<title>" );
|
|
|
|
if ( ! title ) goto readInSomeFile;
|
|
|
|
// find title after so we know we have a full page
|
|
char *nextTitle = strstr ( title + 5 , "<title" );
|
|
if ( ! nextTitle ) goto readInSomeFile;
|
|
|
|
// advance
|
|
p = nextTitle;
|
|
|
|
// . scan from title to next title
|
|
// . if it contains "Shavian" then bail! those are stupid
|
|
// shavian script characters. one of them is short for "of"
|
|
// so it shows up in of's synset!
|
|
char c;
|
|
if ( nextTitle ) {c = *nextTitle;*nextTitle = '\0';}
|
|
char *found = strstr ( title , "Shavian ");
|
|
if ( nextTitle ) *nextTitle = c;
|
|
if ( found ) goto wordLoop;
|
|
|
|
|
|
// get the word in the title, <title>
|
|
char *word = title + 7;
|
|
// find end of it
|
|
char *wp = word ;
|
|
for ( ; *wp && *wp != '<' ; wp++ ) {
|
|
// any space is bad
|
|
if ( is_wspace_a(*wp) ) break;
|
|
// or colon
|
|
if ( *wp == ':' ) break;
|
|
// or * (f*ck)
|
|
if ( *wp == '*' ) break;
|
|
}
|
|
// bad word that has space or colon in it?
|
|
if ( *wp != '<' ) goto wordLoop;
|
|
// remove any trailing spaces
|
|
for ( ; wp[-1] == ' ' ; wp-- );
|
|
// if word ends in hyphen skip (anxio-)
|
|
if ( wp[-1] == '-' ) goto wordLoop;
|
|
// or starts with '
|
|
if ( word[0] == '\'' ) goto wordLoop;
|
|
// or ends with ' like "o'" form of "of"
|
|
if ( wp[-1] == '\'' ) goto wordLoop;
|
|
// null term so "title" is null terminated
|
|
*wp = '\0';
|
|
// and skip
|
|
wp++;
|
|
|
|
int32_t flag = 0;
|
|
uint8_t langId = langUnknown;
|
|
|
|
bool debug = false;
|
|
//debug = true;
|
|
|
|
// set nextline
|
|
char *np = wp;
|
|
for ( ; *np && np < nextTitle ; np++ )
|
|
if ( *np =='#' || (*np == '=' && np[1]=='=') ) break;
|
|
|
|
lineLoop:
|
|
|
|
// advance to next line. unless its the first line for this word
|
|
// in which np already equals wp.
|
|
wp = np;
|
|
|
|
// . set next line for next call to goto lineLoop.
|
|
// . we do this this way because the code below inserts \0's into
|
|
// the line for easier parsing...
|
|
np++;
|
|
for ( ; *np == '=' ; np++ );
|
|
for ( ; *np && np < nextTitle ; np++ ) {
|
|
if ( *np =='#' ) break;
|
|
//if ( np[-1] == '\n' ) break;
|
|
if (*np == '=' && np[1]=='=') break;
|
|
}
|
|
|
|
// scan for next header OR part of speech description
|
|
//for ( ; *wp && wp < nextTitle ; wp++ )
|
|
// if ( *wp =='#' || (*wp == '=' && wp[1]=='=') ) break;
|
|
|
|
// get next word if no more lines
|
|
if ( ! *wp || wp >= nextTitle ) goto wordLoop;
|
|
|
|
// skip line break (\n)
|
|
//if ( *wp == '\n' ) wp++;
|
|
// get next word if no more lines
|
|
//if ( ! *wp || wp >= nextTitle ) goto wordLoop;
|
|
// need a header or a comment here
|
|
//if ( *wp != '=' && *wp != '#' ) goto lineLoop;
|
|
|
|
// we got a header, set langid or set POS
|
|
if ( *wp == '=' ) {
|
|
// count em
|
|
int32_t equalCount = 0;
|
|
// skip any extra ='s
|
|
for ( ; *wp == '=' ; wp++ ) equalCount++;
|
|
// if newline follows this equal, it was at the end of
|
|
// an equal pair like "==English=="
|
|
if ( *wp == '\n' ) goto lineLoop;
|
|
// debug
|
|
//int32_t diff = wp - buf;
|
|
//log("diff = %" PRId32,diff);
|
|
// a pos?
|
|
if ( ! strncasecmp(wp,"noun",4) ) {
|
|
flag = WF_NOUN;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (noun)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"verb",4) ) {
|
|
flag = WF_VERB;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (verb)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"participle",10) ) {
|
|
flag = WF_VERB;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (particple)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"preposition",11) ) {
|
|
flag = WF_PREPOSITION;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (preposition)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"interjection",12) ) {
|
|
flag = WF_INTERJECTION;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (interjection)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"pronoun",7) ) {
|
|
flag = WF_PRONOUN;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (pronoun)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"proper",6) ) {
|
|
flag = WF_NOUN; // proper noun
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (proper noun)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"abbrev",6) ) {
|
|
flag = WF_ABBREVIATION;//NOUN; // abbreviation
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (abbreviation)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"letter",6) ) {
|
|
flag = WF_LETTER;//NOUN; // abbreviation
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (letter)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"acronym",7) ) {
|
|
flag = WF_NOUN;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (acronym)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"initialism",10) ) {
|
|
flag = WF_INITIALISM;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (initialism)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"adjective",9) ) {
|
|
flag = WF_ADJECTIVE;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (adjective)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"adverb",6) ) {
|
|
flag = WF_ADVERB;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (adverb)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
if ( ! strncasecmp(wp,"article",7) ) {
|
|
flag = WF_ARTICLE;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (article)\n",word);
|
|
addWord ( word, flag , langId , NULL );
|
|
goto lineLoop;
|
|
}
|
|
// is it a language we support?
|
|
int32_t n = sizeof(s_lowerLangWikiStrings) / sizeof(char *);
|
|
for ( int32_t i = 0 ; i < n ; i++ ) {
|
|
const char *str = s_lowerLangWikiStrings[i];
|
|
if ( ! str ) gbshutdownLogicError();
|
|
int32_t len = strlen(str);
|
|
if ( ! strncasecmp(wp,str,len) ) {
|
|
langId = i;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (%s)\n",
|
|
word,getLanguageAbbr(langId));
|
|
addWord ( word, 0 , langId , NULL);
|
|
goto lineLoop;
|
|
}
|
|
}
|
|
// unsupported lang?
|
|
if ( equalCount == 2 ) {
|
|
langId = langUnknown;
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> (%s)\n",
|
|
word,getLanguageAbbr(langId));
|
|
addWord ( word, 0 , langId , NULL );
|
|
}
|
|
|
|
// ignore the header otherwise
|
|
goto lineLoop;
|
|
}
|
|
|
|
bool gotGoodLine = false;
|
|
|
|
// we might have "{{head|tr|abbreviation}} (''[[....
|
|
// which does not start with a #
|
|
//if ( wp[0] == '{' && wp[1] == '{' )
|
|
// gotGoodLine = true;
|
|
|
|
// we got a comment
|
|
if ( *wp == '#' ) {
|
|
gotGoodLine = true;
|
|
wp++;
|
|
}
|
|
|
|
if ( ! gotGoodLine ) goto lineLoop;
|
|
|
|
// save this
|
|
char *lineStart = wp;
|
|
// skip #
|
|
//wp++;
|
|
// skip space
|
|
if ( is_wspace_a(*wp) ) wp++;
|
|
|
|
// debug point
|
|
//if ( word[0] == 'b' && word[1] == 'i' && word[2] == 'o' && ! word[3])
|
|
// log("got bio");
|
|
|
|
//
|
|
// SPECIAL case for abbreviations.
|
|
// like for http://en.wiktionary.org/wiki/KS we got
|
|
// # [[Kansas]], a state of the [[United States of America]].
|
|
/*
|
|
if ( flag == WF_ABBREVIATION ||
|
|
flag == WF_INITIALISM ) {
|
|
// save it
|
|
char *wpsave = wp;
|
|
// forget it if single letter! too much confusion!!
|
|
if ( ! word[1] ) goto skipSpecialLogic;
|
|
// if the line has a '{' in it then do not do this stuff
|
|
// skip until we hit a [[ but stop on # or \n.
|
|
// no! hurts # "{{economics}} [[gross domestic product]]"
|
|
//for ( ; *wp &&
|
|
// // if we hit this it might be of proper form
|
|
// // like
|
|
// // # [[operating system]];
|
|
// // {{abbreviation of|operativsystem|lang=sv}}
|
|
// *wp != '{' &&
|
|
// *wp !='#' &&
|
|
// *wp !='\n' ;
|
|
// wp++ );
|
|
//if ( *wp == '{' ) { wp = wpsave; goto skipSpecialLogic; }
|
|
// restore it
|
|
wp = wpsave;
|
|
// skip until we hit a [[ but stop on # or \n
|
|
for ( ; *wp &&
|
|
*wp != '[' &&
|
|
*wp !='#' &&
|
|
*wp !='\n' ;
|
|
wp++ );
|
|
// get [ for abbreviation lists. what are we an abbrev of?
|
|
if ( *wp != '[' ) { wp = wpsave; goto skipSpecialLogic; }
|
|
wp++;
|
|
if ( *wp != '[' ) { wp = wpsave; goto skipSpecialLogic; }
|
|
wp++;
|
|
// skip w: for wikipedia references
|
|
if ( wp[0] == 'w' && wp[1] == ':' ) wp += 2;
|
|
// find ]
|
|
char *wpend = wp + 1;
|
|
for ( ; *wpend &&
|
|
//[[w:Maltese Cross#United Kingdom|Maltese Cross
|
|
*wpend != '#' &&
|
|
//[[w:Maltese Cross#United Kingdom|Maltese Cross
|
|
*wpend != '|' &&
|
|
*wpend != ']' ;
|
|
wpend++ ) ;
|
|
if ( ! *wpend || *wpend != ']' ) {
|
|
wp = wpsave; goto skipSpecialLogic; }
|
|
// if word ends in '-' toss it out... "centi-" prefix
|
|
if ( wpend[-1] == '-' ) {wp = wpsave; goto skipSpecialLogic; }
|
|
// "w/"
|
|
if ( wpend[-1] == '/' ) {wp = wpsave; goto skipSpecialLogic; }
|
|
*wpend = '\0';
|
|
// get that word then
|
|
//if ( debug )
|
|
fprintf(stderr,"%s|%s -> %s"
|
|
"\n"
|
|
//"(%s)\n",
|
|
,getLanguageAbbr(langId)
|
|
,word // TITLE!
|
|
,wp
|
|
);
|
|
addWord ( word, flag , langId , wp );
|
|
// try another line
|
|
goto lineLoop;
|
|
}
|
|
|
|
skipSpecialLogic:
|
|
*/
|
|
|
|
// look for something like "{{abbreviation of|Albuquerque|.."
|
|
if ( *wp != '{' ) goto lineLoop;
|
|
wp++;
|
|
if ( *wp != '{' ) goto lineLoop;
|
|
wp++;
|
|
|
|
// sometimes we get something like
|
|
// # {{education}} {{initialism of|Artium Magister}}
|
|
// so go to next {{'s
|
|
// so skip spaces
|
|
char *secondSet = wp;
|
|
for ( ; *secondSet && *secondSet != '\n'; secondSet++ ) {
|
|
// check
|
|
if ( secondSet[0] == '}' &&
|
|
secondSet[1] == '}' &&
|
|
secondSet[2] == ' ' &&
|
|
secondSet[3] == '{' &&
|
|
secondSet[4] == '{' ) {
|
|
// skip to the second set of {{}}'s on the
|
|
// same line
|
|
wp = secondSet += 5;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// start scan here
|
|
//char *scanStart = wp;
|
|
// assume NOT good until we see a "form|" or " of|" indicator below
|
|
bool good = false;
|
|
// loop over all little pipe-delineated sections
|
|
scanForFormIndicator:
|
|
// scan until we hit a '|' or a '}'
|
|
for ( ; *wp && *wp != '}' && *wp != '|' ; wp++ ) {
|
|
// # {{nl-noun-form|pl=1|wijziginkje}}
|
|
if ( wp[0] == 'f' &&
|
|
wp[1] == 'o' &&
|
|
wp[2] == 'r' &&
|
|
wp[3] == 'm' &&
|
|
wp[4] == '|' )
|
|
good = true;
|
|
// # {{abbeviation of|camarade|...
|
|
if ( wp[0] == ' ' &&
|
|
wp[1] == 'o' &&
|
|
wp[2] == 'f' &&
|
|
wp[3] == '|' )
|
|
good = true;
|
|
// for 'BM' page:
|
|
// # {{head|tr|abbreviation}} (''[[B...
|
|
/*
|
|
if ( wp[0] == 'h' &&
|
|
wp[1] == 'e' &&
|
|
wp[2] == 'a' &&
|
|
wp[3] == 'd' &&
|
|
wp[4] == '|' )
|
|
good = true;
|
|
*/
|
|
}
|
|
// success?
|
|
if ( *wp != '|' ) goto lineLoop;
|
|
// "of" or "form" must precede the pipe
|
|
if ( ! good ) {
|
|
// maybe try next pipe delineated section
|
|
wp++;
|
|
goto scanForFormIndicator;
|
|
}
|
|
|
|
|
|
// broken:
|
|
// # {{conjugation of|livrer||1|s|pres|ind|lang=fr}}
|
|
// # {{form of|third-person singular present|pondre|lang=fr}}
|
|
// # {{plural of|pie|lang=fr}}
|
|
// # {{inflection of|[[pius#Latin|pius]]||voc|m|s|lang=la}}
|
|
// # {{form of|Singular dative masculine|on|lang=cs}}
|
|
|
|
// skip |
|
|
wp++;
|
|
|
|
// find terminating '}'
|
|
char *end = wp;
|
|
for ( ; *end && end < nextTitle && *end != '}' ;end++ );
|
|
// try next line if could not find }
|
|
if ( ! *end || end >= nextTitle ) goto lineLoop;
|
|
// null term it
|
|
*end = '\0';
|
|
// in case there was a # in there!
|
|
if ( np < end + 1 ) {
|
|
np = end + 1;
|
|
for ( ; *np && np < nextTitle ; np++ )
|
|
if ( *np =='#' || (*np == '=' && np[1]=='=') )
|
|
break;
|
|
}
|
|
|
|
|
|
// nuke all of it! "archaic third person ..."
|
|
if ( gb_strcasestr(lineStart,"archaic ") )
|
|
goto lineLoop;
|
|
if ( gb_strcasestr(lineStart,"archaic|") )
|
|
goto lineLoop;
|
|
if ( gb_strcasestr(lineStart,"archaic}") )
|
|
goto lineLoop;
|
|
// fix 'goest' has {{archaic-verb-form
|
|
if ( gb_strcasestr(lineStart,"{archaic") )
|
|
goto lineLoop;
|
|
if ( gb_strcasestr(lineStart,"eye dialect") )
|
|
goto lineLoop;
|
|
if ( gb_strcasestr(lineStart,"eye-dialect") )
|
|
goto lineLoop;
|
|
// obsolete form or spelling
|
|
if ( gb_strcasestr(lineStart,"obsolete ") )
|
|
goto lineLoop;
|
|
if ( gb_strcasestr(lineStart,"obsolete|") )
|
|
goto lineLoop;
|
|
if ( gb_strcasestr(lineStart,"obsolete}") )
|
|
goto lineLoop;
|
|
// {standard of identity|UK} (measurement)
|
|
// prevent cream->UK
|
|
if ( gb_strcasestr(lineStart,"standard ") )
|
|
goto lineLoop;
|
|
// fix 'gwine'
|
|
if ( gb_strcasestr(lineStart,"nonstandard") )
|
|
goto lineLoop;
|
|
|
|
//
|
|
// now wp = "|.....}" and end = the ending '}'
|
|
//
|
|
// CRAP: # {{sports}} {{initialism of|[[championship|Championship]] [[record|Record]] or [[competition|Competition]] Record}}
|
|
// is messing up on converting pipes to \0 because it
|
|
// ends up mapping "CR" to "championship".
|
|
int32_t inBrackets = 0;
|
|
for ( char *s = wp ; s < end ; s++ ) {
|
|
if ( *s == '[' ) inBrackets++;
|
|
if ( *s == ']' ) inBrackets--;
|
|
if ( *s == '|' && ! inBrackets ) *s = '\0';
|
|
}
|
|
// scan the strings now
|
|
char *start = NULL;
|
|
int32_t slen;
|
|
bool skipNext = false;
|
|
for ( char *s = wp ; s < end ; s += slen + 1 ) {
|
|
slen = strlen(s);
|
|
// skip numbers |1|
|
|
if ( slen == 1 && is_digit(*s) ) continue;
|
|
// skip that {{l|en|... crap {{l|fro|...
|
|
if ( ! strcmp(s,"{{l") ) { skipNext = true; continue;}
|
|
if ( skipNext ) { skipNext = false; continue; }
|
|
// skip certain words
|
|
if ( ! strcmp(s,"pass") ) continue;
|
|
if ( ! strcmp(s,"pres") ) continue;
|
|
if ( ! strcmp(s,"fut") ) continue;
|
|
if ( ! strcmp(s,"nom") ) continue;
|
|
if ( ! strcmp(s,"act") ) continue;
|
|
if ( ! strcmp(s,"voc") ) continue;
|
|
if ( ! strcmp(s,"imp") ) continue;
|
|
if ( ! strcmp(s,"acc") ) continue;
|
|
if ( ! strcmp(s,"ind") ) continue;
|
|
if ( ! strcmp(s,"sub") ) continue;
|
|
if ( ! strcmp(s,"s") ) continue;
|
|
if ( ! strcmp(s,"p") ) continue;
|
|
if ( ! strcmp(s,"m") ) continue;
|
|
if ( ! strcmp(s,"f") ) continue;
|
|
// assignment like "lang=la"
|
|
if ( strstr(s,"=" ) ) continue;
|
|
// third-person singular
|
|
if ( gb_strcasestr(s,"person ") ) continue;
|
|
if ( gb_strcasestr(s," person") ) continue;
|
|
// third-person
|
|
if ( gb_strcasestr(s,"-person") ) continue;
|
|
// Singular dative masculine
|
|
if ( gb_strcasestr(s,"dative ") ) continue;
|
|
if ( gb_strcasestr(s,"nominative ") ) continue;
|
|
if ( gb_strcasestr(s,"imperative ") ) continue;
|
|
if ( gb_strcasestr(s,"comparative ") ) continue;
|
|
if ( gb_strcasestr(s,"genitive") ) continue;
|
|
if ( gb_strcasestr(s,"possessive ") ) continue;
|
|
if ( gb_strcasestr(s," possessive") ) continue;
|
|
if ( gb_strcasestr(s,"past tense") ) continue;
|
|
// impersonal past
|
|
if ( gb_strcasestr(s," past") ) continue;
|
|
if ( gb_strcasestr(s,"present tense") ) continue;
|
|
if ( gb_strcasestr(s,"future tense") ) continue;
|
|
// passive voice
|
|
if ( gb_strcasestr(s,"passive ") ) continue;
|
|
// present analytic
|
|
if ( gb_strcasestr(s," analytic") ) continue;
|
|
if ( gb_strcasestr(s,"subjunctive ") ) continue;
|
|
if ( gb_strcasestr(s," subjunctive ") ) continue;
|
|
// Postal abbreviation
|
|
if ( gb_strcasestr(s," abbreviation") ) continue;
|
|
// abbreviation of
|
|
if ( gb_strcasestr(s,"abbreviation ") ) continue;
|
|
// infinitive passive
|
|
if ( gb_strcasestr(s,"infinitive ") ) continue;
|
|
// infinitive passive voice
|
|
if ( gb_strcasestr(s," infinitive") ) continue;
|
|
if ( gb_strcasestr(s,"appendix:") ) continue;
|
|
// "form used..."
|
|
if ( gb_strcasestr(s,"form ") ) continue;
|
|
// inflection of
|
|
if ( gb_strcasestr(s,"inflection ") ) continue;
|
|
// front vowel variant
|
|
if ( gb_strcasestr(s," variant") ) continue;
|
|
if ( gb_strcasestr(s," spelling") ) continue;
|
|
if ( gb_strcasestr(s," misspelling") ) continue;
|
|
// definite and plural
|
|
if ( gb_strcasestr(s,"definite") ) continue;
|
|
if ( gb_strcasestr(s,"accusative ") ) continue;
|
|
if ( gb_strcasestr(s,"vocative ") ) continue;
|
|
if ( gb_strcasestr(s,"indicative") ) continue;
|
|
if ( gb_strcasestr(s,"plural") ) continue;
|
|
if ( gb_strcasestr(s,"feminine") ) continue;
|
|
if ( gb_strcasestr(s,"masculine") ) continue;
|
|
if ( gb_strcasestr(s,"oblique") ) continue;
|
|
// singuler definite
|
|
if ( gb_strcasestr(s,"singular ") ) continue;
|
|
if ( gb_strcasestr(s," singular") ) continue;
|
|
// prepositional singluar
|
|
if ( gb_strcasestr(s,"prepositional") ) continue;
|
|
if ( gb_strcasestr(s," participle") ) continue;
|
|
// han form
|
|
if ( gb_strcasestr(s," form") ) continue;
|
|
// *PRENSENT* tense
|
|
if ( gb_strcasestr(s," tense") ) continue;
|
|
if ( gb_strcasestr(s,"lower case") ) continue;
|
|
if ( gb_strcasestr(s,"upper case") ) continue;
|
|
|
|
// kills the word "present"! so hardcode that!
|
|
if ( ! strcmp(s,"present") ) continue;
|
|
if ( ! strcmp(s,"past") ) continue;
|
|
if ( ! strcmp(s,"capital form") ) continue;
|
|
if ( ! strcmp(s,"capitalized form") ) continue;
|
|
if ( ! strcmp(s,"obsolete capitalization") ) continue;
|
|
if ( ! strcmp(s,"archaic form") ) continue;
|
|
if ( ! strcmp(s,"shortened form") ) continue;
|
|
if ( ! strcmp(s,"reduced form") ) continue;
|
|
if ( ! strcmp(s,"unstressed form") ) continue;
|
|
if ( ! strcmp(s,"lowercase form") ) continue;
|
|
if ( ! strcmp(s,"uncapitalized form") ) continue;
|
|
if ( ! strcmp(s,"imperative") ) continue;
|
|
// assume that is it i guess
|
|
start = s;
|
|
break;
|
|
}
|
|
|
|
// skip if empty!!! wtf??
|
|
if ( ! start ) { wp = end + 1 ; goto lineLoop; }
|
|
// skip ['s and spaces
|
|
// skipping ' made "ve" a form of "of" where it was "'ve"
|
|
for ( ;
|
|
*start == '[' || *start == ' ' ; // || *start == '\'';
|
|
start++ );
|
|
|
|
// and ]'s
|
|
char *wend = start + strlen(start);
|
|
for ( ; wend && wend>start && wend[-1] == ']' ;wend--);
|
|
*wend = '\0';
|
|
|
|
// sometimes they start with w: like for ANZAC:
|
|
// # {{initialism of|[[w:Australian and New Zealand Army Corps|Australian and New Zealand Army Corps]]}}
|
|
if ( start[0]=='w' && start[1]==':' ) {
|
|
start += 2;
|
|
// these are wikipedia titles, skip!
|
|
//goto lineLoop;
|
|
}
|
|
if ( strncasecmp(start,"wikipedia:",10)==0 ) {
|
|
start += 10;
|
|
// these are wikipedia titles, skip!
|
|
//goto lineLoop;
|
|
}
|
|
if ( start[0]==':' && start[1]=='w' && start[2]==':'){
|
|
start += 3;
|
|
// these are wikipedia titles, skip!
|
|
//goto lineLoop;
|
|
}
|
|
|
|
// nuke after # anchor
|
|
char *a = start;
|
|
for ( ; *a ; a++ ) if ( *a == '#' ) { *a = '\0'; break; }
|
|
// do not add huge words
|
|
if ( strlen(start) > 1000 ) goto lineLoop;
|
|
// skip that
|
|
wp = end + 1;
|
|
|
|
// or the word " or " in there!
|
|
// identification|Identification]] or [[identity]] [[documentation]
|
|
// # {{comparative of|[[good]] or [[well]]
|
|
a = start;
|
|
for ( ; *a ; a++ ) {
|
|
if ( strncmp(a,"]] or [[",8) == 0 ) {
|
|
*a = '\0';
|
|
break;
|
|
}
|
|
}
|
|
|
|
// if it has any pipes, i am not dealing with that
|
|
// CRAP: # {{sports}} {{initialism of|[[championship|Championship]] [[record|Record]] or [[competition|Competition]] Record}}
|
|
// cuz it gets too complicated!!!
|
|
a = start;
|
|
int32_t pipeCount = 0;
|
|
for ( ; *a ; a++ ) { if ( *a == '|' ) pipeCount++; }
|
|
a = start;
|
|
// too many pipes?
|
|
if ( pipeCount >= 2 )
|
|
goto lineLoop;
|
|
// if just one, pick the first term i guess
|
|
// # {{initialism of|[[w:Americans for Democratic Action|Ame..
|
|
for ( ; *a ; a++ ) {
|
|
if ( *a == '|' ) {
|
|
// fix
|
|
// {{acronym of|Search for [[extraterrestrial|Extraterrestrial]] Intelligence\0
|
|
char *bs = a;
|
|
for ( ; *bs ; bs++ ) {
|
|
if ( *bs == ']' )
|
|
goto lineLoop;
|
|
}
|
|
// ok, good to go
|
|
*a = '\0';
|
|
break;
|
|
}
|
|
}
|
|
|
|
|
|
// # {{British|Ireland|dated}} {{initialism of|[[&pound;sd
|
|
// nuke if semicolon
|
|
a = start;
|
|
for ( ; *a ; a++ ) {
|
|
if ( *a == ';' ) goto lineLoop;
|
|
if ( *a == '*' ) goto lineLoop; // f**k
|
|
if ( *a == '+' ) goto lineLoop;
|
|
if ( *a == ',' ) goto lineLoop;
|
|
if ( *a == '{' ) goto lineLoop;
|
|
if ( *a == '}' ) goto lineLoop;
|
|
if ( *a == '(' ) goto lineLoop;
|
|
if ( *a == ')' ) goto lineLoop;
|
|
if ( *a == '/' ) goto lineLoop;
|
|
}
|
|
|
|
// skip initial spaces again
|
|
for ( ; *start == ' ' ; start++ );
|
|
|
|
// forget it if ends or begins with hyphen
|
|
if ( start[0] == '-' ) goto lineLoop;
|
|
if ( a [-1] == '-' ) goto lineLoop;
|
|
|
|
// or starts with '
|
|
// fix "'s" for "is" (the dog's running after me)
|
|
// fix "'ve" as a form of "of"
|
|
if ( start[0] == '\'' ) goto lineLoop;
|
|
|
|
// same with underscore (fix fotch->_)
|
|
if ( start[0] == '_' ) goto lineLoop;
|
|
if ( a[-1] == '_' ) goto lineLoop;
|
|
|
|
// re-write the base word and filter out [ and ]
|
|
char normBuf[1024];
|
|
dst = normBuf;
|
|
src = start;
|
|
for ( ; *src ; src++ ) {
|
|
*dst = *src;
|
|
if ( *dst == '[' ) continue;
|
|
if ( *dst == ']' ) continue;
|
|
dst++;
|
|
}
|
|
*dst = '\0';
|
|
// trim off spaces
|
|
wend = normBuf + strlen(normBuf);
|
|
// fix ''sadden''
|
|
for ( ; wend && wend>normBuf &&
|
|
(wend[-1] == ']' ||
|
|
wend[-1] == ' ' ||
|
|
wend[-1] == '\'' ) ;
|
|
wend--);
|
|
*wend = '\0';
|
|
|
|
|
|
// or starts with '
|
|
// fix "'s" for "is" (the dog's running after me)
|
|
if ( normBuf[0] == '\'' ) goto lineLoop;
|
|
|
|
if ( debug )
|
|
fprintf(stderr,"%s -> %s"
|
|
"\n"
|
|
//"(%s)\n",
|
|
,word // TITLE!
|
|
,normBuf // baseform! // start
|
|
//getLanguageAbbr(langId)
|
|
);
|
|
addWord ( word, flag , langId , normBuf ); // start );
|
|
// try another line
|
|
goto lineLoop;
|
|
}
|
|
|
|
bool Wiktionary::addWord ( char *word ,
|
|
uint8_t posFlag ,
|
|
uint8_t langId ,
|
|
char *formOf ) {
|
|
|
|
// done if lang is unknown
|
|
if ( langId == langUnknown ) return true;
|
|
// hash the word
|
|
//int64_t wid = hash64Lower_utf8(word);
|
|
int64_t wid = hash64n(word);
|
|
|
|
/*
|
|
// see if already in there
|
|
uint8_t *langIdPtr = (uint8_t *)m_langTableTmp.getValue(&wid);
|
|
// if same
|
|
if ( langIdPtr && *langIdPtr != langId ) {
|
|
// mark it as multi-language, we will delete when done
|
|
*langIdPtr = langTranslingual;
|
|
}
|
|
// otherwise, add it!
|
|
else {
|
|
// . add that then
|
|
// . this only uses 6 byte keys
|
|
if ( ! m_langTableTmp.addKey ( &wid, &langId ) ) return false;
|
|
}
|
|
*/
|
|
|
|
// if not form of something make it form of itself
|
|
if ( ! formOf ) formOf = word;
|
|
|
|
// to file like dict.cz
|
|
int64_t lk64 = wid ;
|
|
lk64 ^= g_hashtab[4][langId];
|
|
if ( ! m_dedup.isInTable ( &lk64 ) ) {
|
|
m_dedup.addKey ( &lk64 );
|
|
m_langBuf.safePrintf ( "%s|%s\n",
|
|
getLanguageAbbr(langId),
|
|
word);
|
|
}
|
|
|
|
// store word so we can map word it to a string
|
|
int32_t len = m_debugBuf.length();
|
|
int32_t wlen = strlen(word);
|
|
if ( ! m_debugMap.isInTable ( &wid ) ) {
|
|
m_debugBuf.safeMemcpy ( word, wlen );
|
|
m_debugBuf.pushChar('\0');
|
|
// this only uess 6 byte keys
|
|
if ( ! m_debugMap.addKey ( &wid , &len ) ) return false;
|
|
}
|
|
|
|
// need a POS for adding for synonyms
|
|
//if ( ! posFlag ) return true;
|
|
|
|
// . get hash of form of
|
|
// . i.e. if word is "jumping" then formOf is "jump"
|
|
// . so this maps "jump" to all the forms it has
|
|
// . thus allowDups is true for this one too
|
|
// . but the "jump" key is language and POS sensitive
|
|
// . so "jump" as a noun does not map to "jumping" (verb) but only
|
|
// maps to "jumps" the noun
|
|
//int64_t fh64 = hash64Lower_utf8(formOf);
|
|
int64_t fh64 = hash64n(formOf);
|
|
// save that
|
|
int64_t baseForm = fh64;
|
|
|
|
|
|
// also add formOf
|
|
if ( ! m_debugMap.isInTable ( &baseForm ) ) {
|
|
len = m_debugBuf.length();
|
|
m_debugBuf.safeStrcpy ( formOf );
|
|
m_debugBuf.pushChar('\0');
|
|
// this only uess 6 byte keys
|
|
if ( ! m_debugMap.addKey ( &baseForm , &len ) ) return false;
|
|
}
|
|
|
|
// hash in langid
|
|
fh64 ^= g_hashtab[0][langId];
|
|
// include POS flag too i guess
|
|
//fh64 ^= g_hashtab[1][posFlag];
|
|
|
|
// dedup table
|
|
int64_t dk64 = hash64h ( fh64 , wid );
|
|
|
|
//if ( dk64 == 4174548643612680780LL )
|
|
// log("boo");
|
|
|
|
if ( ! m_dedup.isInTable ( &dk64 ) ) {
|
|
/*
|
|
// the data now includes popularity of wid
|
|
int32_t pop = g_speller.getPhrasePopularity(NULL,
|
|
wid,
|
|
true,
|
|
langId);
|
|
if ( pop > 32000 ) pop = 32000;
|
|
*/
|
|
// make the data
|
|
char data[9];
|
|
gbmemcpy ( data , &wid , 8 );
|
|
data[8] = langId;
|
|
// . add that. allowDups. so you should be able to get all the
|
|
// forms by just looking at the base form
|
|
// . this uses 8 byte keys
|
|
if ( ! m_tmp.addKey ( &fh64 , data ) ) return false;
|
|
// . add for both
|
|
// . this uses 8 byte keys
|
|
if ( ! m_dedup.addKey ( &dk64 ) ) return false;
|
|
}
|
|
|
|
// same for this
|
|
dk64 = hash64h ( fh64 , baseForm );
|
|
|
|
//if ( dk64 == 4174548643612680780LL )
|
|
// log("boo");
|
|
|
|
if ( ! m_dedup.isInTable ( &dk64 ) ) {
|
|
/*
|
|
// the data now includes popularity of wid
|
|
int32_t pop = g_speller.getPhrasePopularity(NULL,
|
|
baseForm,
|
|
true,
|
|
langId);
|
|
if ( pop > 32000 ) pop = 32000;
|
|
// make the data
|
|
char data[8];
|
|
gbmemcpy ( data , &baseForm , 6 );
|
|
gbmemcpy ( data + 8 , &pop , 2 );
|
|
*/
|
|
// make the data
|
|
char data[9];
|
|
gbmemcpy ( data , &baseForm , 8 );
|
|
data[8] = langId;
|
|
// . map the base form to itself as well! so compile() works
|
|
// so if we have the word "jumping" an alt for is "jump"
|
|
// . this uses 8 byte keys
|
|
if ( ! m_tmp.addKey ( &fh64, data ) ) return false;
|
|
// . add for both
|
|
// . this uses 8 byte keys
|
|
if ( ! m_dedup.addKey ( &dk64 ) ) return false;
|
|
}
|
|
|
|
// success!
|
|
return true;
|
|
}
|
|
|
|
// . make the synonym/form table from m_tmp
|
|
// . m_synTable maps a 48-bit wordid (combined with its language id and
|
|
// its part of speeach flag) to a list of alternative forms
|
|
// which are also 48-bit wordids, suitable for hashing into posdb
|
|
// . the reason we combine language id and part of speech flag with the
|
|
// word id, is because "jump" the english noun, does not map to
|
|
// "jumping" for example. so we assume a word is a noun only if it
|
|
// could be both a verb or a noun, as in the case of jump or jumps. however,
|
|
// jumping is treated as a verb.
|
|
bool Wiktionary::compile ( ) {
|
|
|
|
HashTableX dedup;
|
|
dedup.set ( 8,0,16777216,NULL,0,false,"cdtab");
|
|
|
|
// scan the m_tmp table
|
|
for ( int32_t i = 0 ; i < m_tmp.getNumSlots() ; i++ ) {
|
|
// skip empty slots
|
|
if ( ! m_tmp.m_flags[i] ) continue;
|
|
// get this guys key
|
|
int64_t fh64 = m_tmp.getKey64FromSlot(i);
|
|
// is base form "pie"? why doesn't "pie" map to it?
|
|
//if( fh64 == 4935258599006239294LL ) // balon baseform in turk
|
|
// log("en|UK");
|
|
// do not repeat
|
|
if ( dedup.isInTable ( &fh64 ) ) continue;
|
|
// this uses 8 byte keys
|
|
if ( ! dedup.addKey ( &fh64 ) ) return false;
|
|
// reset
|
|
//int64_t lastWid = 0LL;
|
|
// remove dups
|
|
HashTableX dd2;
|
|
char dbuf2[512];
|
|
dd2.set(8,0,8,dbuf2,512,false,"ddttt2");
|
|
// how many forms? must be 2+ to get added to syntable
|
|
int32_t formCount = 0;
|
|
for ( int32_t j = i ; ; j++ ) {
|
|
// wrap around
|
|
if ( j >= m_tmp.getNumSlots() ) j = 0;
|
|
// chain stops when we hit empty slot
|
|
if ( ! m_tmp.m_flags[j] ) break;
|
|
// make sure matches
|
|
int64_t kk = m_tmp.getKey64FromSlot(j);
|
|
// must match
|
|
if ( kk != fh64 ) continue;
|
|
// get a form of the base form, wid64
|
|
char *data = (char *)m_tmp.getValueFromSlot(j);
|
|
|
|
// must be there
|
|
int32_t *offPtr = (int32_t *)m_debugMap.getValue(data);
|
|
if ( ! offPtr ) gbshutdownLogicError();
|
|
char *word = m_debugBuf.getBufStart() + *offPtr;
|
|
// now re-hash it as lower case
|
|
int64_t wid = hash64Lower_utf8(word);
|
|
// dedup on it
|
|
if ( dd2.isInTable ( &wid ) ) continue;
|
|
dd2.addKey ( &wid );
|
|
|
|
// unique
|
|
//if ( *(int64_t *)data == lastWid ) continue;
|
|
// adjacent deduping
|
|
//lastWid = *(int64_t *)data;
|
|
// it matches!
|
|
formCount++;
|
|
|
|
//The original code generated synonyms from words with accents based on the unicode canonical-decomposition
|
|
//data. On the surface that sounds like a good idea, eg. if you search for 'Chloe' you'll find hits on
|
|
//'Chloë' too. However, that ignores whether the accent/mark is optional. Removing accents from 'bûche de Noël',
|
|
//'mañaja', 'Ötjendorf' or 'kål' changes the words significantly. You must either not do it or sometimes do
|
|
//language/orthography-dependent transliteration
|
|
}
|
|
// need 2+ forms!
|
|
if ( formCount <= 1 ) continue;
|
|
// base form
|
|
//int64_t wid = *(int64_t *)m_tmp.getValueFromSlot(i);
|
|
// remember buf start
|
|
int32_t bufLen = m_synBuf.length();
|
|
// remove dups
|
|
HashTableX dd;
|
|
char dbuf[512];
|
|
dd.set(8,0,8,dbuf,512,false,"ddttt");
|
|
// a byte for storing the # of synonym forms
|
|
//m_synBuf.pushChar(0);
|
|
// push the langid!
|
|
//m_synBuf.safePrintf("%" PRId32",",langId);
|
|
int32_t count = 0;
|
|
// chain for all keys that are the same
|
|
for ( int32_t j = i ; ; j++ ) {
|
|
// wrap around
|
|
if ( j >= m_tmp.getNumSlots() ) j = 0;
|
|
// chain stops when we hit empty slot
|
|
if ( ! m_tmp.m_flags[j] ) break;
|
|
// . get key of jth slot
|
|
// . this uses 8 byte keys
|
|
// . kk is the hash of the BASE form i think hashed
|
|
// with the langid
|
|
int64_t kk = m_tmp.getKey64FromSlot(j);
|
|
// must match
|
|
if ( kk != fh64 ) continue;
|
|
// get a form of the base form, wid64
|
|
char *data = (char *)m_tmp.getValueFromSlot(j);
|
|
// get the word id
|
|
//int64_t wid =*(int64_t *)data;
|
|
// CRAP! this is a case dependent hash! we need
|
|
// to make it lower case now that the synsets
|
|
// have been established based on case, since
|
|
// wiktionary is highly case-dependent.
|
|
// get the word itself
|
|
int32_t *offPtr = (int32_t *)m_debugMap.getValue(data);
|
|
// must be there
|
|
if ( ! offPtr ) gbshutdownLogicError();
|
|
char *word = m_debugBuf.getBufStart() + *offPtr;
|
|
// now re-hash it
|
|
int64_t wid = hash64Lower_utf8(word);
|
|
// i bury langid in there
|
|
uint8_t langId = data[8];
|
|
// find "pie"!
|
|
//if ( wid == 1050735555723194583LL )
|
|
// log("pie");
|
|
// xor in the langid
|
|
wid ^= g_hashtab[0][langId];
|
|
// only add this word form once per langId
|
|
if ( dd.isInTable ( &wid ) ) continue;
|
|
dd.addKey ( &wid );
|
|
// first first time lead with a "<langAbbr>|"
|
|
if ( count == 0 ) {
|
|
m_synBuf.safeStrcpy(getLanguageAbbr(langId));
|
|
m_synBuf.pushChar('|');
|
|
}
|
|
// first is the wid (6 bytes) then pop (2 bytes)
|
|
// exclude popularity for this
|
|
//m_synBuf.safeMemcpy(data , 6 );
|
|
// print that
|
|
m_synBuf.safeStrcpy(word);
|
|
// comma
|
|
if ( count+1<formCount )
|
|
m_synBuf.pushChar(',');
|
|
// . a ptr to that sequence of alt forms in the buf
|
|
// . this uses 6 byte keys
|
|
m_synTable.addKey(&wid,&bufLen);
|
|
// stratocumulus
|
|
//if ( wid == -1556090671932692078 )
|
|
// log("stratocumulus");
|
|
|
|
//
|
|
// wtf?
|
|
// "won" has two bases "win" and "won"
|
|
// en|won,wons,woned
|
|
// en|win,won,winning,wins
|
|
// and we seem to map to the first one only...
|
|
// so maybe allow dup keys in syntable?
|
|
//
|
|
|
|
//see note in preceeding lop about accent-based synonym generation
|
|
// count em up
|
|
count++;
|
|
// limit to 100 synonyms per synset
|
|
if ( count >= 100 ) break;
|
|
}
|
|
// new line
|
|
m_synBuf.pushChar('\n');
|
|
// store the count, the # of syns in this synset
|
|
//char *buf = m_synBuf.getBufStart();
|
|
//buf[bufLen] = (char)count;
|
|
// . and of course the base form. "jump"
|
|
// . no, i add the base form map to itself into m_tmp above
|
|
// in addWords() now
|
|
//m_synTable.addKey(&baseKey64,&bufLen);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// add unified dict entries into m_langTable if they
|
|
// belong to one and only one language
|
|
bool Wiktionary::integrateUnifiedDict ( ) {
|
|
|
|
/*
|
|
// scan unified dict
|
|
for ( int32_t i = 0 ; i < numSlots ; i++ ) {
|
|
// skip empty slots
|
|
if ( ! ud->m_flags[i] ) continue;
|
|
// get ptrs
|
|
int32_t off = *(int32_t *)ud->getValueFromSlot(i);
|
|
// refernce
|
|
char *p = g_speller.m_unifiedBuf + off;
|
|
// just one lang?
|
|
if ( ! justOneLang ) continue;
|
|
// skip if already there
|
|
if ( m_langTable.isInTable ( &wid ) ) continue;
|
|
// add it then
|
|
if ( ! m_langTable.addKey ( &wid , &langId ) ) return false;
|
|
}
|
|
*/
|
|
|
|
/*
|
|
// scan langtable and remove translingual entries
|
|
for ( int32_t i = 0 ; i < m_langTableTmp.m_numSlots ; i++ ) {
|
|
// skip empty slots
|
|
if ( ! m_langTableTmp.m_flags[i] ) continue;
|
|
// check it
|
|
if ( *(uint8_t *)m_langTableTmp.getValueFromSlot(i) ==
|
|
langTranslingual )
|
|
continue;
|
|
// add it
|
|
char *key = (char *)m_langTableTmp.getKeyFromSlot(i);
|
|
char *val = (char *)m_langTableTmp.getValueFromSlot(i);
|
|
if ( ! m_langTable.addKey ( key , val ) ) return false;
|
|
}
|
|
*/
|
|
|
|
return true;
|
|
}
|