// LangList implementation: per-language word lists plus TLD -> country/language tables.
#include "gb-include.h"
|
||
|
||
#include "LangList.h"
|
||
#include "Indexdb.h"
|
||
|
||
// The global LangList instance defined by this translation unit.
LangList g_langList;
|
||
|
||
|
||
// One row of the static TLD table below: ties a top-level domain to a
// human-readable description and to the languages expected on it.
struct TldInfo {
	// the TLD text without a leading dot, e.g. "com" or "de"
	char     *m_tld;
	// country/purpose description; this is what getCountryFromTld()
	// hands back to its caller
	char     *m_country;
	// comma-separated, lowercase language names, or "unknown"
	char     *m_languages;
	// bit vector derived from m_languages: bit (langId - 1) is set for
	// each language id whose name occurs in m_languages. The value
	// 0xffffffff means "not computed yet"; isLangValidForTld() fills
	// it in on first use.
	uint32_t  m_languagebv;
};
|
||
|
||
// Number of rows in s_tldInfo. It is 0 until tldInit() has run; the TLD
// lookup functions test it to decide whether they must call tldInit().
static int32_t s_numTlds = 0;
|
||
|
||
// Static table of known top-level domains.
//
// Columns: TLD, description (returned by getCountryFromTld()), a
// comma-separated list of lowercase language names (or "unknown"), and the
// lazily computed language bit vector. Every row starts with 0xffffffff in
// the last column, which isLangValidForTld() treats as "not computed yet".
//
// isLangValidForTld() matches language names against the third column with
// strstr(), so only names that getLanguageString() can return influence
// validation; all other names are informational. A row whose list contains
// no recognised name behaves like "unknown" (every language is accepted).
//
// Changes relative to the previous revision of this table:
//  - repaired double-encoded (mojibake) text in names and language lists
//  - fixed rows whose language list was copied from a similarly named
//    country: do, gs, gw, io, ng, pg, ro, so
//  - dropped redundant duplicate names inside a list (am, gb, in, no, us)
static TldInfo s_tldInfo[] = {
	// infrastructure and generic TLDs
	{ "arpa", "Address and Routing Parameter Area", "unknown", 0xffffffff },
	{ "root", "N/A", "unknown", 0xffffffff },
	{ "aero", "air-transport industry", "unknown", 0xffffffff },
	{ "biz", "business", "unknown", 0xffffffff },
	{ "cat", "Catalan", "unknown", 0xffffffff },
	{ "com", "commercial", "unknown", 0xffffffff },
	{ "coop", "cooperatives", "unknown", 0xffffffff },
	{ "edu", "educational", "unknown", 0xffffffff },
	{ "gov", "governmental", "unknown", 0xffffffff },
	{ "info", "information", "unknown", 0xffffffff },
	{ "int", "international organizations", "unknown", 0xffffffff },
	{ "jobs", "companies", "unknown", 0xffffffff },
	{ "mil", "United States Military",
	  "english,carolinian,chamorro,hawaiian,samoan,spanish", 0xffffffff },
	{ "mobi", "mobile devices", "unknown", 0xffffffff },
	{ "museum", "museums", "unknown", 0xffffffff },
	{ "name", "individuals, by name", "unknown", 0xffffffff },
	{ "net", "network", "unknown", 0xffffffff },
	{ "org", "organization", "unknown", 0xffffffff },
	{ "pro", "professions", "unknown", 0xffffffff },
	{ "travel", "travel and travel-agency related sites", "unknown",
	  0xffffffff },

	// country-code TLDs
	{ "ac", "Ascension Island", "unknown", 0xffffffff },
	{ "ad", "Andorra", "catalan", 0xffffffff },
	{ "ae", "United Arab Emirates", "arabic", 0xffffffff },
	{ "af", "Afghanistan",
	  "arabic,balochi,dari,nuristani,pashto,pamiri,"
	  "pashai,turkmen,uzbek", 0xffffffff },
	{ "ag", "Antigua and Barbuda", "english", 0xffffffff },
	{ "ai", "Anguilla", "unknown", 0xffffffff },
	{ "al", "Albania", "albanian", 0xffffffff },
	{ "am", "Armenia", "armenian", 0xffffffff },
	{ "an", "Netherlands Antilles", "dutch,frisian", 0xffffffff },
	{ "ao", "Angola", "portuguese", 0xffffffff },
	{ "aq", "Antarctica", "unknown", 0xffffffff },
	{ "ar", "Argentina", "spanish,guarani", 0xffffffff },
	{ "as", "American Samoa", "english,samoan", 0xffffffff },
	{ "at", "Austria",
	  "croatian,czech,german,hungarian,slovak,"
	  "slovenian,romani", 0xffffffff },
	{ "au", "Australia", "australia", 0xffffffff },
	{ "aw", "Aruba", "unknown", 0xffffffff },
	{ "ax", "Åland", "unknown", 0xffffffff },
	{ "az", "Azerbaijan", "azerbaijani", 0xffffffff },
	{ "ba", "Bosnia and Herzegovina", "bosnian,croatian,serbian",
	  0xffffffff },
	{ "bb", "Barbados", "english", 0xffffffff },
	{ "bd", "Bangladesh", "bengala", 0xffffffff },
	{ "be", "Belgium", "dutch,french,german", 0xffffffff },
	{ "bf", "Burkina Faso", "french,more,jula,fula", 0xffffffff },
	{ "bg", "Bulgaria", "bulgarian", 0xffffffff },
	{ "bh", "Bahrain", "arabic", 0xffffffff },
	{ "bi", "Burundi", "french,kirundi", 0xffffffff },
	{ "bj", "Benin", "french", 0xffffffff },
	{ "bm", "Bermuda", "unknown", 0xffffffff },
	{ "bn", "Brunei Darussalam", "malay", 0xffffffff },
	{ "bo", "Bolivia", "spanish,aymara,quechua", 0xffffffff },
	{ "br", "Brazil", "portuguese", 0xffffffff },
	{ "bs", "Bahamas", "unknown", 0xffffffff },
	{ "bt", "Bhutan", "dzongkha,english", 0xffffffff },
	{ "bv", "Bouvet Island", "unknown", 0xffffffff },
	{ "bw", "Botswana", "english,kalanga,tswana", 0xffffffff },
	{ "by", "Belarus", "belarusian,russian", 0xffffffff },
	{ "bz", "Belize", "english", 0xffffffff },
	{ "ca", "Canada",
	  "chipewyan,cree,dogrib,english,french,gwich'in,inuinnaqtun,"
	  "inuktitut,inuvialuktun,slavey", 0xffffffff },
	{ "cc", "Cocos (Keeling) Islands", "unknown", 0xffffffff },
	{ "cd", "Democratic Republic of the Congo",
	  "french,lingala,kikongo,swahili,tshiluba", 0xffffffff },
	{ "cf", "Central African Republic", "french,sango", 0xffffffff },
	{ "cg", "Republic of the Congo", "french,lingala,munukutuba",
	  0xffffffff },
	{ "ch", "Switzerland (Confoederatio Helvetica)",
	  "french,german,italian,romansh", 0xffffffff },
	{ "ci", "Côte d'Ivoire", "french", 0xffffffff },
	{ "ck", "Cook Islands", "unknown", 0xffffffff },
	{ "cl", "Chile", "spanish", 0xffffffff },
	{ "cm", "Cameroon", "english,french", 0xffffffff },
	{ "cn", "People's Republic of China",
	  "cantonese,english,kazakh,korean,"
	  "mandarin,mongolian,portuguese,tajik,tibetan,uyghur,zhuang",
	  0xffffffff },
	{ "co", "Colombia", "spanish", 0xffffffff },
	{ "cr", "Costa Rica", "spanish", 0xffffffff },
	{ "cu", "Cuba", "spanish", 0xffffffff },
	{ "cv", "Cape Verde", "crioulo,portuguese", 0xffffffff },
	{ "cx", "Christmas Island", "unknown", 0xffffffff },
	{ "cy", "Cyprus", "greek,turkish", 0xffffffff },
	{ "cz", "Czech Republic", "czech", 0xffffffff },
	{ "de", "Germany (Deutschland)",
	  "danish,frisian,german,romani,"
	  "lower sorbian,upper sorbian", 0xffffffff },
	{ "dj", "Djibouti", "arabic,french", 0xffffffff },
	{ "dk", "Denmark", "danish,faroese,kalaallisut", 0xffffffff },
	{ "dm", "Dominica", "english", 0xffffffff },
	// was "english", apparently copied from Dominica above
	{ "do", "Dominican Republic", "spanish", 0xffffffff },
	{ "dz", "Algeria", "arabic,tamazight", 0xffffffff },
	{ "ec", "Ecuador", "spanish,quechua", 0xffffffff },
	{ "ee", "Estonia", "estonian", 0xffffffff },
	{ "eg", "Egypt", "arabic", 0xffffffff },
	{ "er", "Eritrea", "arabic,english,tigrinya", 0xffffffff },
	{ "es", "Spain (España)",
	  "basque,catalan,galician,occitan,spanish", 0xffffffff },
	{ "et", "Ethiopia", "amharic", 0xffffffff },
	{ "eu", "European Union", "unknown", 0xffffffff },
	{ "fi", "Finland", "finnish,sami,swedish", 0xffffffff },
	{ "fj", "Fiji", "english,fijian,hindustani", 0xffffffff },
	{ "fk", "Falkland Islands", "unknown", 0xffffffff },
	{ "fm", "Federated States of Micronesia",
	  "chuuk,english,kosraean,ponapean,"
	  "ulithian,yapese", 0xffffffff },
	{ "fo", "Faroe Islands", "unknown", 0xffffffff },
	{ "fr", "France", "french,tahitian", 0xffffffff },
	{ "ga", "Gabon", "french", 0xffffffff },
	{ "gb", "United Kingdom (Great Britain)",
	  "english,cornish,dgèrnésiais,french,irish,jèrriais,"
	  "pitcairnese,scots,scottish gaelic,welsh", 0xffffffff },
	{ "gd", "Grenada", "english", 0xffffffff },
	{ "ge", "Georgia", "abkhaz,georgian,ossetic,russian", 0xffffffff },
	{ "gf", "French Guiana", "unknown", 0xffffffff },
	{ "gg", "Guernsey", "unknown", 0xffffffff },
	{ "gh", "Ghana",
	  "adangme,dagaare,dagbani,english,ewe,ga,gonja,kasem,"
	  "nzema,twi", 0xffffffff },
	{ "gi", "Gibraltar", "unknown", 0xffffffff },
	{ "gl", "Greenland", "unknown", 0xffffffff },
	{ "gm", "The Gambia", "unknown", 0xffffffff },
	{ "gn", "Guinea", "french,fula", 0xffffffff },
	{ "gp", "Guadeloupe", "unknown", 0xffffffff },
	{ "gq", "Equatorial Guinea", "french,spanish", 0xffffffff },
	{ "gr", "Greece", "greek", 0xffffffff },
	// previously carried the language list of Georgia (the country)
	{ "gs", "South Georgia and the South Sandwich Islands", "unknown",
	  0xffffffff },
	{ "gt", "Guatemala", "spanish", 0xffffffff },
	{ "gu", "Guam", "unknown", 0xffffffff },
	// was "french,fula", apparently copied from Guinea
	{ "gw", "Guinea-Bissau", "portuguese", 0xffffffff },
	{ "gy", "Guyana", "english", 0xffffffff },
	{ "hk", "Hong Kong", "unknown", 0xffffffff },
	{ "hm", "Heard Island and McDonald Islands", "unknown", 0xffffffff },
	{ "hn", "Honduras", "spanish", 0xffffffff },
	{ "hr", "Croatia (Hrvatska)", "croatian,italian", 0xffffffff },
	{ "ht", "Haiti", "french,haitian creole", 0xffffffff },
	{ "hu", "Hungary", "hungarian", 0xffffffff },
	{ "id", "Indonesia", "balinese,javanese,indonesian,sundanese",
	  0xffffffff },
	{ "ie", "Ireland (Éire)", "unknown", 0xffffffff },
	{ "il", "Israel", "arabic,hebrew", 0xffffffff },
	{ "im", "Isle of Man", "unknown", 0xffffffff },
	{ "in", "India",
	  "assamese,bengala,bodo,dogri,english,gujarati,hindi,kannada,"
	  "kashmiri,konkani,maithili,malayalam,meitei,marathi,nepali,oriya,"
	  "punjabi,sanskrit,santali,sindhi,tamil,telugu,urdu,french,karbi,"
	  "bhojpuri,magadhi,chhattisgarhi,portuguese,pahari,tulu,"
	  "garo,khasi,mizo,rajasthani,kokborok,nicobarese", 0xffffffff },
	// previously carried the language list of India
	{ "io", "British Indian Ocean Territory", "unknown", 0xffffffff },
	{ "iq", "Iraq", "arabic,kurdish", 0xffffffff },
	{ "ir", "Iran", "persian", 0xffffffff },
	{ "is", "Iceland (Island)", "icelandic", 0xffffffff },
	// a longer list was deliberately disabled for Italy at some point:
	//   albanian,catalan,croatian,franco-provençal,french,friulian,
	//   german,greek,italian,ladin,occitan,sardinian,slovenian
	{ "it", "Italy", "italian", 0xffffffff },
	{ "je", "Jersey", "unknown", 0xffffffff },
	{ "jm", "Jamaica", "english", 0xffffffff },
	{ "jo", "Jordan", "arabic", 0xffffffff },
	{ "jp", "Japan", "japanese", 0xffffffff },
	{ "ke", "Kenya", "english,swahili", 0xffffffff },
	{ "kg", "Kyrgyzstan", "kirghiz,russian", 0xffffffff },
	{ "kh", "Cambodia (Khmer)", "khmer", 0xffffffff },
	{ "ki", "Kiribati", "english,kiribati", 0xffffffff },
	{ "km", "Comoros", "arabic,comorian,french", 0xffffffff },
	{ "kn", "Saint Kitts and Nevis", "english", 0xffffffff },
	{ "kr", "South Korea", "korean", 0xffffffff },
	{ "kw", "Kuwait", "arabic", 0xffffffff },
	{ "ky", "Cayman Islands", "unknown", 0xffffffff },
	{ "kz", "Kazakhstan", "kazakh,russian", 0xffffffff },
	{ "la", "Laos", "lao,french", 0xffffffff },
	{ "lb", "Lebanon", "arabic", 0xffffffff },
	{ "lc", "Saint Lucia", "english", 0xffffffff },
	{ "li", "Liechtenstein", "german", 0xffffffff },
	{ "lk", "Sri Lanka", "sinhala,tamil", 0xffffffff },
	{ "lr", "Liberia", "english", 0xffffffff },
	{ "ls", "Lesotho", "english,sotho", 0xffffffff },
	{ "lt", "Lithuania", "lithuanian", 0xffffffff },
	{ "lu", "Luxembourg", "french,german,luxembourgish", 0xffffffff },
	{ "lv", "Latvia", "latvian", 0xffffffff },
	{ "ly", "Libya", "arabic", 0xffffffff },
	{ "ma", "Morocco", "arabic", 0xffffffff },
	{ "mc", "Monaco", "french", 0xffffffff },
	{ "md", "Moldova", "gagauz,moldovan,russian,ukrainian", 0xffffffff },
	{ "mg", "Madagascar", "french,malagasy", 0xffffffff },
	{ "mh", "Marshall Islands", "english,marshallese", 0xffffffff },
	{ "mk", "Republic of Macedonia", "unknown", 0xffffffff },
	{ "ml", "Mali", "french", 0xffffffff },
	{ "mm", "Myanmar", "burmese", 0xffffffff },
	{ "mn", "Mongolia", "mongolian", 0xffffffff },
	{ "mo", "Macau", "unknown", 0xffffffff },
	{ "mp", "Northern Mariana Islands", "unknown", 0xffffffff },
	{ "mq", "Martinique", "unknown", 0xffffffff },
	{ "mr", "Mauritania", "arabic,fula,soninke,wolof", 0xffffffff },
	{ "ms", "Montserrat", "unknown", 0xffffffff },
	{ "mt", "Malta", "english,maltese", 0xffffffff },
	{ "mu", "Mauritius", "english,french", 0xffffffff },
	{ "mv", "Maldives", "dhivehi", 0xffffffff },
	{ "mw", "Malawi", "chichewa,english", 0xffffffff },
	{ "mx", "Mexico", "spanish", 0xffffffff },
	{ "my", "Malaysia", "malay", 0xffffffff },
	{ "mz", "Mozambique", "portuguese", 0xffffffff },
	{ "na", "Namibia", "english", 0xffffffff },
	{ "nc", "New Caledonia", "unknown", 0xffffffff },
	{ "ne", "Niger", "french", 0xffffffff },
	{ "nf", "Norfolk Island", "unknown", 0xffffffff },
	// was "french", apparently copied from Niger
	{ "ng", "Nigeria", "english", 0xffffffff },
	{ "ni", "Nicaragua", "spanish", 0xffffffff },
	{ "nl", "Netherlands", "dutch,frisian", 0xffffffff },
	{ "no", "Norway", "norwegian,sami", 0xffffffff },
	{ "np", "Nepal", "nepali", 0xffffffff },
	{ "nr", "Nauru", "english,nauruan", 0xffffffff },
	{ "nu", "Niue", "unknown", 0xffffffff },
	{ "nz", "New Zealand",
	  "english,maori,new zealand sign language,"
	  "cook islands maori,niuean,tokelauan", 0xffffffff },
	{ "om", "Oman", "arabic", 0xffffffff },
	{ "pa", "Panama", "spanish", 0xffffffff },
	{ "pe", "Peru", "quechua,aymara,spanish", 0xffffffff },
	{ "pf", "French Polynesia", "unknown", 0xffffffff },
	// was "french,fula", apparently copied from Guinea
	{ "pg", "Papua New Guinea", "english,hiri motu,tok pisin",
	  0xffffffff },
	{ "ph", "Philippines",
	  "arabic,bikol,cebuano,english,filipino,"
	  "hiligaynon,ilokano,kapampangan,kinaray-a,maranao,"
	  "maguindanao,pangasinan,spanish,tagalog,tausug,"
	  "waray-waray", 0xffffffff },
	{ "pk", "Pakistan", "english,urdu", 0xffffffff },
	{ "pl", "Poland", "polish", 0xffffffff },
	{ "pm", "Saint-Pierre and Miquelon", "unknown", 0xffffffff },
	{ "pn", "Pitcairn Islands", "unknown", 0xffffffff },
	{ "pr", "Puerto Rico", "unknown", 0xffffffff },
	{ "ps", "Palestinian territories", "unknown", 0xffffffff },
	{ "pt", "Portugal", "portuguese,mirandese", 0xffffffff },
	{ "pw", "Palau", "english,palauan,japanese", 0xffffffff },
	{ "py", "Paraguay", "guaraní,spanish", 0xffffffff },
	{ "qa", "Qatar", "arabic", 0xffffffff },
	{ "re", "Réunion", "unknown", 0xffffffff },
	// was "arabic", which rejected every other language on .ro
	{ "ro", "Romania", "romanian", 0xffffffff },
	{ "ru", "Russia",
	  "abaza,adyghe,agul,altay,avar,bashkir,"
	  "buryat,chechen,chukchi,chuvash,dargin,dolgan,"
	  "erzya,evenk,ingush,kabardian,kalmyk,karachay-balkar,"
	  "khakas,khanty,komi-permyak,komi-zyrian,koryak,kumyk,"
	  "lak,lezgi,mansi,mari,moksha,nogai,nenets,ossetic,russian,"
	  "tabasaran,tatar,tuvin,udmurt,yakut,yiddish", 0xffffffff },
	{ "rw", "Rwanda", "english,french,kinyarwanda", 0xffffffff },
	{ "sa", "Saudi Arabia", "arabic", 0xffffffff },
	{ "sb", "Solomon Islands", "english", 0xffffffff },
	{ "sc", "Seychelles", "english,french,seselwa", 0xffffffff },
	{ "sd", "Sudan", "arabic,english", 0xffffffff },
	{ "se", "Sweden",
	  "swedish,finnish,meänkieli,romani,sami,yiddish", 0xffffffff },
	{ "sg", "Singapore", "english,malay,mandarin,tamil", 0xffffffff },
	{ "sh", "Saint Helena", "unknown", 0xffffffff },
	{ "si", "Slovenia", "hungarian,italian,slovenian", 0xffffffff },
	{ "sj", "Svalbard and Jan Mayen Islands", "unknown", 0xffffffff },
	{ "sk", "Slovakia", "slovak", 0xffffffff },
	{ "sl", "Sierra Leone", "english", 0xffffffff },
	{ "sm", "San Marino", "italian", 0xffffffff },
	{ "sn", "Senegal",
	  "french,jola-fogny,malinke,mandinka,pulaar,"
	  "serer-sine,wolof", 0xffffffff },
	// was "french"
	{ "so", "Somalia", "arabic,somali", 0xffffffff },
	{ "sr", "Suriname", "dutch", 0xffffffff },
	{ "st", "São Tomé and Príncipe", "portuguese", 0xffffffff },
	{ "su", "former Soviet Union", "unknown", 0xffffffff },
	{ "sv", "El Salvador", "spanish", 0xffffffff },
	{ "sy", "Syria", "arabic,french", 0xffffffff },
	{ "sz", "Swaziland", "english,swazi", 0xffffffff },
	{ "tc", "Turks and Caicos Islands", "unknown", 0xffffffff },
	{ "td", "Chad", "arabic,french", 0xffffffff },
	{ "tf", "French Southern and Antarctic Lands", "unknown",
	  0xffffffff },
	{ "tg", "Togo", "french", 0xffffffff },
	{ "th", "Thailand", "thai", 0xffffffff },
	{ "tj", "Tajikistan", "tajik", 0xffffffff },
	{ "tk", "Tokelau", "unknown", 0xffffffff },
	{ "tl", "East Timor", "english,indonesian,portuguese,tetum",
	  0xffffffff },
	{ "tm", "Turkmenistan", "turkmen", 0xffffffff },
	{ "tn", "Tunisia", "arabic", 0xffffffff },
	{ "to", "Tonga", "english,tongan", 0xffffffff },
	{ "tp", "East Timor", "english,indonesian,portuguese,tetum",
	  0xffffffff },
	{ "tr", "Turkey", "turkish", 0xffffffff },
	{ "tt", "Trinidad and Tobago", "english", 0xffffffff },
	{ "tv", "Tuvalu", "english,tuvaluan", 0xffffffff },
	{ "tw", "Taiwan, Republic of China", "mandarin", 0xffffffff },
	{ "tz", "Tanzania", "english,swahili", 0xffffffff },
	{ "ua", "Ukraine", "ukrainian", 0xffffffff },
	{ "ug", "Uganda", "english,swahili", 0xffffffff },
	{ "uk", "United Kingdom",
	  "british,cornish,dgèrnésiais,"
	  "irish,jèrriais,pitcairnese,scots,scottish gaelic,"
	  "welsh", 0xffffffff },
	{ "um", "United States Minor Outlying Islands",
	  "english,carolinian,chamorro,hawaiian,samoan,spanish",
	  0xffffffff },
	{ "us", "United States of America",
	  "english,carolinian,chamorro,hawaiian,samoan,spanish",
	  0xffffffff },
	{ "uy", "Uruguay", "spanish", 0xffffffff },
	{ "uz", "Uzbekistan", "uzbek", 0xffffffff },
	{ "va", "Vatican City State", "latin", 0xffffffff },
	{ "vc", "Saint Vincent and the Grenadines", "english", 0xffffffff },
	{ "ve", "Venezuela", "spanish", 0xffffffff },
	{ "vg", "British Virgin Islands", "unknown", 0xffffffff },
	{ "vi", "U.S. Virgin Islands", "unknown", 0xffffffff },
	{ "vn", "Vietnam", "vietnamese", 0xffffffff },
	{ "vu", "Vanuatu", "bislama,english,french", 0xffffffff },
	{ "wf", "Wallis and Futuna", "unknown", 0xffffffff },
	{ "ws", "Samoa", "english,samoan", 0xffffffff },
	{ "ye", "Yemen", "arabic", 0xffffffff },
	{ "yt", "Mayotte", "unknown", 0xffffffff },
	{ "yu", "Yugoslavia", "unknown", 0xffffffff },
	{ "za", "South Africa (Zuid-Afrika)",
	  "afrikaans,english,ndebele,"
	  "northern sotho,sotho,swazi,tsonga,tswana,venda,xhosa,zulu",
	  0xffffffff },
	{ "zm", "Zambia", "english", 0xffffffff },
	{ "zw", "Zimbabwe", "unknown", 0xffffffff },
};
|
||
|
||
// Category id for each language id; the array index is the language id.
// A value of 0 means the language has no category of its own.
// (Sadly, there are no British, UK, or Australian topics.)
static int s_langToCatId[] = {
	/* langUnknown     */ 0,
	/* langEnglish     */ 0,
	/* langFrench      */ 476,
	/* langSpanish     */ 471,
	/* langRussian     */ 484,
	/* langJapanese    */ 49884,
	/* langChineseTrad */ 472,
	/* langChineseSimp */ 494,
	/* langKorean      */ 493,
	/* langGerman      */ 911729,
	/* langDutch       */ 478,
	/* langItalian     */ 477,
	/* langFinnish     */ 503,
	/* langSwedish     */ 485,
	/* langNorwegian   */ 487,
	/* langPortuguese  */ 483,
	/* langVietnamese  */ 116289,
	/* langArabic      */ 88070,
	/* langHebrew      */ 118215,
	/* langIndonesian  */ 464465,
	/* langGreek       */ 482,
	/* langThai        */ 501,
	/* langHindi       */ 51663,
	/* langBengala     */ 241315,
	/* langPolish      */ 480,
	/* langTagalog     */ 173548,
	/* langBritish     */ 0,
	/* langAustralia   */ 0,
	/* langUnknown, end of list */ 0
};
|
||
|
||
// Nothing to set up at construction time: the word table is filled by
// loadLists() and the TLD table by tldInit().
LangList::LangList ( ) { }
|
||
|
||
// Tear-down simply delegates to reset().
LangList::~LangList ( ) { reset(); }
|
||
|
||
void LangList::reset ( ) {
|
||
m_langTable.reset();
|
||
m_tldToCountry.reset();
|
||
}
|
||
|
||
// . returns false and sets errno on error
|
||
// . loads language lists into memory
|
||
// . looks under the langlist/ directory for langlist.# files
|
||
// each number corrisponds to a language
|
||
bool LangList::loadLists ( ) {
|
||
//log ( LOG_INIT, "lang: Loading Language Lists.");
|
||
// init the term table
|
||
m_langTable.set(8,4,100000*MAX_LANGUAGES,NULL,0,false,0,"tbl-lang");
|
||
// loop over the languages and load the files
|
||
int32_t listCount = 0;
|
||
int32_t dupCount = 0;
|
||
int32_t allocSize = 0;
|
||
char *buf = NULL;
|
||
Words w;
|
||
for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ) {
|
||
// load the file for reading
|
||
char ff[286];
|
||
sprintf(ff, "%slanglist/langlist.%" INT32 "", g_hostdb.m_dir, i );
|
||
int fd = open ( ff, O_RDONLY );
|
||
// no language file, don't complain
|
||
if ( fd < 0 ) continue;
|
||
// get the size
|
||
struct stat stats;
|
||
stats.st_size = 0;
|
||
int status = stat ( ff, &stats );
|
||
if ( status != 0 ) {
|
||
close(fd);
|
||
log ( "lang: Could not stat %s: %s.",
|
||
ff, strerror(errno) );
|
||
return false;
|
||
}
|
||
int32_t fileSize = stats.st_size;
|
||
// read the file into a buffer
|
||
int32_t thisAllocSize = 3 * fileSize;
|
||
if(thisAllocSize > allocSize) {
|
||
buf = (char*)mrealloc(buf, allocSize, thisAllocSize,
|
||
"LangList");
|
||
allocSize = thisAllocSize;
|
||
}
|
||
if ( !buf ) {
|
||
close(fd);
|
||
log ( "lang: Could not allocate %" INT32 " bytes for "
|
||
"langlist buffer: %s.",
|
||
thisAllocSize, mstrerror(g_errno) );
|
||
return false;
|
||
}
|
||
if ( read ( fd, buf, fileSize ) != fileSize ) {
|
||
close(fd);
|
||
log ( "lang: Could not read %s: %s.",
|
||
ff, strerror(errno) );
|
||
return false;
|
||
}
|
||
close(fd);
|
||
// read the words out of the file
|
||
// char *p = buf;
|
||
// char *pEnd = buf + fileSize;
|
||
// *pEnd = '\0';
|
||
|
||
//UChar* ucBuf = (UChar*)(buf + fileSize);
|
||
//int32_t ucBufLen = fileSize * 2;
|
||
int32_t wordsInList = 0;
|
||
int32_t writtenLen = gbstrlen(buf);
|
||
//int32_t writtenLen = ucToUnicode(ucBuf, ucBufLen,
|
||
// buf, fileSize,
|
||
// "UTF-8", -1,
|
||
// TITLEREC_CURRENT_VERSION);
|
||
|
||
w.reset();
|
||
//doubling the written length seems hackish, may
|
||
//need to be fixed in ucToUnicode.
|
||
|
||
if(!w.set (buf ,
|
||
fileSize ,
|
||
TITLEREC_CURRENT_VERSION,true, false)) {
|
||
char *xx = NULL; *xx = 0;
|
||
return false;
|
||
}
|
||
|
||
int32_t numWords = w.getNumWords();
|
||
for(int32_t j = 0; j < numWords; j++) {
|
||
int64_t wordId = w.m_wordIds[j];
|
||
if(wordId == 0) continue;
|
||
// add it to the table
|
||
uint32_t score = m_langTable.getScore(&wordId);
|
||
//log(LOG_WARN,
|
||
// "lang: Successfully hash %"INT64" from %s dictionary.",
|
||
//wordId, getLanguageString(i));
|
||
if ( score != (uint32_t)i ) {
|
||
if ( score > 0 ) {
|
||
dupCount++;
|
||
if ( score != 0x7fffffff )
|
||
m_langTable.addTerm ( &wordId,
|
||
0x7fffffff);
|
||
}
|
||
else {
|
||
m_langTable.addTerm ( &wordId, i );
|
||
wordsInList++;
|
||
}
|
||
}
|
||
}
|
||
|
||
// count the list
|
||
listCount++;
|
||
|
||
if ( wordsInList > 0 )
|
||
log ( LOG_DEBUG,
|
||
"lang: Successfully Loaded %" INT32 " out of %" INT32 " (%" INT32 " bytes) "
|
||
"words from %s dictionary.",
|
||
wordsInList, numWords>>1, writtenLen, getLanguageString(i) );
|
||
|
||
|
||
}
|
||
|
||
// free the buffer
|
||
if(buf) mfree ( buf, allocSize, "LangList" );
|
||
|
||
|
||
log ( LOG_INIT, "lang: Successfully Loaded %" INT32 " Language Lists and "
|
||
"%" INT32 " duplicate word hashes.",
|
||
listCount, dupCount );
|
||
// all good
|
||
return true;
|
||
}
|
||
|
||
// . lookup word in language lists
|
||
// . returns false if not found true if found and lang set
|
||
bool LangList::lookup ( int64_t termId,
|
||
unsigned char *lang ) {
|
||
// lookup the termId in the table
|
||
uint32_t score = m_langTable.getScore(&termId);
|
||
// is it unknown?
|
||
if ( score == 0 || score >= MAX_LANGUAGES ) {
|
||
*lang = 0;
|
||
return false;
|
||
}
|
||
// otherwise set lang to the score
|
||
*lang = (unsigned char)score;
|
||
return true;
|
||
}
|
||
|
||
|
||
char* LangList::getCountryFromTld(char* tld, int32_t tldLen) {
|
||
//initialize if not already initialized.
|
||
if(s_numTlds == 0) tldInit();
|
||
|
||
int32_t j = 0;
|
||
for(; j < tldLen; j++) {
|
||
if(tld[j] != '.') continue;
|
||
j++; //skip .
|
||
tld = &(tld[j]);
|
||
tldLen -= j;
|
||
break;
|
||
}
|
||
|
||
int32_t index = hash32(tld, tldLen);
|
||
int32_t slot = m_tldToCountry.getSlot(&index);
|
||
|
||
if(slot < 0) return NULL;
|
||
return s_tldInfo[*(int32_t *)m_tldToCountry.getValueFromSlot(slot)].m_country;
|
||
}
|
||
|
||
|
||
bool LangList::isLangValidForTld(char* tld, int32_t tldLen, unsigned char lang) {
|
||
if(lang == langUnknown) return true; //not much we can do here.
|
||
//initialize if not already initialized.
|
||
if(s_numTlds == 0) tldInit();
|
||
|
||
int32_t j = 0;
|
||
for(; j < tldLen; j++) {
|
||
if(tld[j] != '.') continue;
|
||
j++; //skip .
|
||
tld = &(tld[j]);
|
||
tldLen -= j;
|
||
break;
|
||
}
|
||
|
||
int32_t index = hash32(tld, tldLen);
|
||
int32_t slot = m_tldToCountry.getSlot(&index);
|
||
|
||
if(slot < 0) return true;
|
||
int32_t *tip = (int32_t *)m_tldToCountry.getValueFromSlot(slot);
|
||
if ( ! tip ) { char *xx=NULL;*xx=0; }
|
||
TldInfo* t = &s_tldInfo[*tip];
|
||
//it is uninitalized, init on demand.
|
||
if(t->m_languagebv == 0xffffffff) {
|
||
t->m_languagebv = 0;
|
||
for(int32_t i = 1; i <= langTagalog; i++) {
|
||
if(strstr(t->m_languages,getLanguageString(i)) == NULL)
|
||
continue;
|
||
//set the bit corresponding to lang
|
||
t->m_languagebv |= 0x1 << (i-1);
|
||
}
|
||
}
|
||
|
||
if(t->m_languagebv == 0) return true; //its unknown.
|
||
|
||
int32_t mask = 0x1 << (lang-1);
|
||
return mask & t->m_languagebv;
|
||
}
|
||
|
||
|
||
bool LangList::tldInit() {
|
||
s_numTlds = sizeof(s_tldInfo) / sizeof(TldInfo);
|
||
m_tldToCountry.set(4,4,0,NULL,0,false,0,"tldctrytbl");
|
||
for(int32_t i = 0; i < s_numTlds; i++) {
|
||
int32_t ndx = hash32n(s_tldInfo[i].m_tld);
|
||
if ( ! m_tldToCountry.addKey(&ndx , &i ) ) return false;
|
||
}
|
||
return true;
|
||
}
|
||
|
||
// Maps a category id back to a language id by scanning s_langToCatId.
// Returns the index of the first matching entry, or 0 when no entry
// matches. Because entry 0 holds 0, catid 0 also yields 0.
uint8_t LangList::catIdToLang(uint32_t catid) {
	// count elements by the array's own element size
	const uint32_t numEntries =
		(uint32_t)( sizeof(s_langToCatId) / sizeof(s_langToCatId[0]) );
	for ( uint32_t i = 0; i < numEntries; i++ ) {
		if ( catid == (uint32_t)s_langToCatId[i] ) return (uint8_t)i;
	}
	return 0;
}
|
||
|
||
uint32_t LangList::langToCatId(uint8_t lang) {
|
||
return(s_langToCatId[(int)lang]);
|
||
}
|
||
|
||
uint8_t LangList::isLangCat(int catid) {
|
||
for(int x = 0; x < MAX_LANGUAGES; x++)
|
||
if(catid == s_langToCatId[x])
|
||
return(x);
|
||
return(langUnknown);
|
||
}
|