// Matt Wells, copyright Jul 2001

// . language detector
// . TODO: use stopwords in doc to determine the language

#ifndef _LANG_H_
|
|
#define _LANG_H_
|
|
|
|
// Upper bound on the number of language ids.
// NOTE(review): the lang* enum in this header stops at langLast = 56, so
// the remaining room up to 64 is presumably reserved for future
// languages -- confirm before sizing tables with either value.
#define MAX_LANGUAGES 64

// Bit mask for langs 1-55 (original note: "exclude translingual").
// 64 - 8 is 56, then minus 1 is 55 bits: the literal below has exactly
// its low 55 bits set (bits 0..54).
// Per the original note, english (lang 1) is the first bit and
// translingual (lang 31) is the 31st bit, i.e. lang N <-> bit N-1.
// NOTE(review): the literal does NOT clear the 31st (translingual) bit,
// even though the note above says translingual is excluded -- confirm
// which of the two is intended before relying on it.
#define LANG_BIT_MASK 0x007fffffffffffffLL
|
|
#include "Unicode.h"
|
|
#include "Iso8859.h"
|
|
#include "iana_charset.h"
|
|
// Numeric language ids.
// . every value is spelled out explicitly; do not renumber or reorder
// . NOTE(review): the LANG_BIT_MASK comment in this header implies that
//   lang N maps to bit N-1 of a 64-bit mask, so these values are
//   presumably relied on outside this file -- confirm before changing any
// . langLast is one past the highest assigned id (langEstonian = 55);
//   add new languages just before it and bump it
enum {
	langUnknown        = 0,
	langEnglish        = 1,
	langFrench         = 2,
	langSpanish        = 3,
	langRussian        = 4,
	langTurkish        = 5,
	langJapanese       = 6,
	langChineseTrad    = 7, // cantonese
	langChineseSimp    = 8, // mandarin
	langKorean         = 9,
	langGerman         = 10,
	langDutch          = 11,
	langItalian        = 12,
	langFinnish        = 13,
	langSwedish        = 14,
	langNorwegian      = 15,
	langPortuguese     = 16,
	langVietnamese     = 17,
	langArabic         = 18,
	langHebrew         = 19,
	langIndonesian     = 20,
	langGreek          = 21,
	langThai           = 22,
	langHindi          = 23,
	langBengala        = 24,
	langPolish         = 25,
	langTagalog        = 26,

	// added for wiktionary
	langLatin          = 27,
	langEsperanto      = 28,
	langCatalan        = 29,
	langBulgarian      = 30,
	langTranslingual   = 31, // used by multiple langs in wiktionary
	langSerboCroatian  = 32,
	langHungarian      = 33,
	langDanish         = 34,
	langLithuanian     = 35,
	langCzech          = 36,
	langGalician       = 37,
	langGeorgian       = 38,
	langScottishGaelic = 39,
	langGothic         = 40,
	langRomanian       = 41,
	langIrish          = 42,
	langLatvian        = 43,
	langArmenian       = 44,
	langIcelandic      = 45,
	langAncientGreek   = 46,
	langManx           = 47,
	langIdo            = 48,
	langPersian        = 49,
	langTelugu         = 50,
	langVenetian       = 51,
	langMalgasy        = 52,
	langKurdish        = 53,
	langLuxembourgish  = 54,
	langEstonian       = 55,

	// sentinel: one past the last assigned language id
	langLast           = 56
};
|
|
|
|
// Language lookup helpers.
// . only the declarations are visible in this header; the definitions
//   live elsewhere
// . NOTE(review): the uint8_t / unsigned char results and the
//   langId / lang parameters are presumably the lang* ids from the enum
//   in this header -- confirm against the implementation, including what
//   is returned for unrecognized input
// . the pointer parameters are not const-qualified here; left as-is so
//   the declarations keep matching their existing definitions

// name -> id, and abbreviation <-> id
uint8_t getLanguageFromName(uint8_t *name);
uint8_t getLangIdFromAbbr ( char *abbr ) ;
char *getLangAbbr ( uint8_t langId ) ;

// id -> text; languageToString() writes into the caller-supplied buf
// NOTE(review): no buffer length is passed, so the required size of buf
// must come from the implementation -- confirm before calling
void languageToString ( unsigned char lang , char *buf );
char* getLanguageString ( unsigned char lang);
char* getNativeLanguageString ( unsigned char lang);
char* getLanguageAbbr ( unsigned char lang);
unsigned char getLanguageCharset ( unsigned char LangId );

// s/slen describe the text to check; loc is optional and defaults to
// NULL (the default argument makes this header C++-only)
// NOTE(review): what is stored through loc is not visible here -- confirm
bool isAdult( char *s, int32_t slen, char **loc = NULL );

//unsigned char getLanguageFromScript(UChar32 c);
unsigned char getLanguageFromAbbr(char *abbr);
unsigned char getLanguageFromAbbrN(char *abbr);
//unsigned char getLanguageFromUnicodeAbbr(UChar *abbr);
// abbr is now in utf8
unsigned char getLanguageFromUnicodeAbbr(char *abbr);
unsigned char getLanguageFromUserAgent(char *abbr);
unsigned char getLanguageFromCountryCode(char *code);
|
|
|
|
#endif
|