Cleanup related to languages/Compiler warnings

This commit is contained in:
Ai Lin Chia
2015-11-23 14:45:18 +01:00
parent 0b2e2bbe3d
commit 6184e376a4
20 changed files with 41 additions and 8252 deletions

@ -6,6 +6,7 @@
#include "HashTable.h"
#include "Categories.h"
#include "LanguageIdentifier.h"
#include "Threads.h"
// record for unified language/country hash table
typedef union catcountryrec_t {

@ -1,438 +0,0 @@
#include "Iso8859.h"
// Default map for charsets that are highly "non-latin",
// i.e. only allow ASCII to pass...
// Indexed by the raw byte value (256 entries). Per the rows below: TAB (0x09),
// LF (0x0A), CR (0x0D) and printable ASCII 0x20-0x7E map to themselves; every
// other byte (the remaining control bytes, 0x7F and all of 0x80-0xFF) maps
// to ' '.
const unsigned char map_8859_default[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
};
// Latin-1
// Adapted from Wikipedia:
// Albanian, Basque, Catalan, Danish, Dutch (missing some letters),
// English, Estonian (missing some letters), Faroese,
// French (missing some letters), Finnish (missing some letters),
// Galician, German, Icelandic, Irish (new orthography), Italian,
// Latin, Norwegian, Portuguese, Rhaeto-Romanic, Scottish, Spanish,
// Swedish, Afrikaans, Swahili
//
// Byte -> ASCII table indexed by the raw byte value (256 entries).
// TAB/LF/CR and 0x20-0x7E map to themselves, the other bytes below 0xA0
// map to ' ', and 0xC0-0xFF map to an ASCII letter or ' '.
// NOTE(review): 0xD0 maps to 'D' but its lowercase partner 0xF0 maps to 'o';
// presumably one of the two is unintended - confirm before relying on it.
const unsigned char map_8859_1[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ', ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
'o', 'n', 'o', 'o', 'o', 'o', 'o', ' ', ' ', 'u', 'u', 'u', 'u', 'y', ' ', 'y'
};
// Latin-2
// Adapted from Wikipedia:
// Bosnian, Croatian, Czech, Hungarian, Polish, Romanian, Serbian,
// Serbocroatian, Slovak, Slovenian, Upper Sorbian and Lower Sorbian
//
// Byte -> ASCII table indexed by the raw byte value (256 entries).
// TAB/LF/CR and 0x20-0x7E map to themselves, the other bytes below 0xA0
// map to ' ', and 0xA0-0xFF map to an ASCII letter or ' '.
const unsigned char map_8859_2[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', 'A', ' ', 'L', ' ', 'L', 'S', ' ', ' ', 'S', 'S', 'T', 'Z', ' ', 'Z', 'Z',
' ', 'a', ' ', 'l', ' ', 'l', 's', ' ', ' ', 's', 's', 't', 'z', ' ', 'z', 'z',
'R', 'A', 'A', 'A', 'A', 'L', 'C', 'C', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'D',
'D', 'N', 'N', 'O', 'O', 'O', 'O', ' ', 'R', 'U', 'U', 'U', 'U', 'Y', 'T', ' ',
'r', 'a', 'a', 'a', 'a', 'l', 'c', 'c', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'd',
'd', 'n', 'n', 'o', 'o', 'o', 'o', ' ', 'r', 'u', 'u', 'u', 'u', 'y', 't', ' '
};
// Latin-3 (South European)
// Adapted from Wikipedia:
// Turkish (superseded by 8859-9), Maltese, Esperanto
//
// Byte -> ASCII table indexed by the raw byte value (256 entries).
// TAB/LF/CR and 0x20-0x7E map to themselves, the other bytes below 0xA0
// map to ' ', and 0xA0-0xFF map to an ASCII letter or ' '.
// 0xA9 (capital I with dot above) and 0xB9 (dotless i) fold to 'I' / 'i',
// matching how map_8859_9 folds the same two letters; they were previously
// folded to 'L' / 'l'.
const unsigned char map_8859_3[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', 'H', ' ', ' ', ' ', ' ', 'H', ' ', ' ', 'I', 'S', 'G', 'J', ' ', ' ', 'Z',
' ', 'h', ' ', ' ', ' ', ' ', 'h', ' ', ' ', 'i', 's', 'g', 'j', ' ', ' ', 'z',
'A', 'A', 'A', ' ', 'A', 'C', 'C', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
' ', 'N', 'O', 'O', 'O', 'G', 'O', ' ', 'G', 'U', 'U', 'U', 'U', 'U', 'S', ' ',
'a', 'a', 'a', ' ', 'a', 'c', 'c', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
' ', 'n', 'o', 'o', 'o', 'g', 'o', ' ', 'g', 'u', 'u', 'u', 'u', 'u', 's', ' '
};
// Latin-4 (North European)
// Adapted from Wikipedia:
// Estonian, Latvian, Lithuanian, Greenlandic, and Sami
//
// Byte -> ASCII table indexed by the raw byte value (256 entries).
// TAB/LF/CR and 0x20-0x7E map to themselves, the other bytes below 0xA0
// map to ' ', and 0xA0-0xFF map to an ASCII letter or ' '.
const unsigned char map_8859_4[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', 'A', 'k', 'R', ' ', 'I', 'L', ' ', ' ', 'S', 'E', 'G', 'T', ' ', 'Z', ' ',
' ', 'a', ' ', 'r', ' ', 'i', 'l', ' ', ' ', 's', 'e', 'g', 't', 'N', 'z', 'n',
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'I', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I',
'D', 'N', 'O', 'K', 'O', 'O', 'O', ' ', ' ', 'U', 'U', 'U', 'U', 'U', 'U', ' ',
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'i', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i',
'd', 'n', 'o', 'k', 'o', 'o', 'o', ' ', ' ', 'u', 'u', 'u', 'u', 'u', 'u', ' '
};
// Latin-5 (Turkish)
// Adapted from Wikipedia:
// Turkish
//
// Byte -> ASCII table indexed by the raw byte value (256 entries).
// TAB/LF/CR and 0x20-0x7E map to themselves, the other bytes below 0xC0
// map to ' ', and 0xC0-0xFF map to an ASCII letter or ' '.
const unsigned char map_8859_9[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
'G', 'N', 'O', 'O', 'O', 'O', 'O', ' ', ' ', 'U', 'U', 'U', 'U', 'I', 'S', ' ',
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
'g', 'n', 'o', 'o', 'o', 'o', 'o', ' ', ' ', 'u', 'u', 'u', 'u', 'i', 's', 'y'
};
// Latin-6 (Nordic)
// Adapted from Wikipedia
//
// Byte -> ASCII table indexed by the raw byte value (256 entries).
// TAB/LF/CR and 0x20-0x7E map to themselves, the other bytes below 0xA0
// map to ' ', and 0xA0-0xFF map to an ASCII letter or ' '.
// NOTE(review): 0xD0 maps to 'D' but its lowercase partner 0xF0 maps to 'o';
// presumably one of the two is unintended - confirm before relying on it.
const unsigned char map_8859_10[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', 'A', 'E', 'G', 'I', 'I', 'K', ' ', 'L', 'D', 'S', 'T', 'Z', ' ', 'U', 'N',
' ', 'a', 'e', 'g', 'i', 'i', 'k', ' ', 'l', 'd', 's', 't', 'z', ' ', 'u', 'n',
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'I', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I',
'D', 'N', 'O', 'O', 'O', 'O', 'O', 'U', ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'i', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i',
'o', 'n', 'o', 'o', 'o', 'o', 'o', 'u', ' ', 'u', 'u', 'u', 'u', 'y', ' ', 'k'
};
// Latin-7 (Baltic Rim)
// Adapted from Wikipedia
//
// Byte -> ASCII table indexed by the raw byte value (256 entries).
// TAB/LF/CR and 0x20-0x7E map to themselves, the other bytes below 0xA0
// map to ' ', and 0xA0-0xFF map to an ASCII letter or ' '.
// 0xBA and 0xBF are the lowercase partners of 0xAA and 0xAF, so they fold to
// 'r' / 'a' just as the uppercase forms fold to 'R' / 'A'; they were
// previously dropped to ' '.
const unsigned char map_8859_13[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'R', ' ', ' ', ' ', ' ', 'A',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'r', ' ', ' ', ' ', ' ', 'a',
'A', 'I', 'A', 'C', 'A', 'A', 'E', 'E', 'C', 'E', 'Z', 'E', 'G', 'K', 'I', 'L',
'S', 'N', 'N', 'O', 'O', 'O', 'O', ' ', 'U', 'L', 'S', 'U', 'U', 'Z', 'Z', ' ',
'a', 'i', 'a', 'c', 'a', 'a', 'e', 'e', 'c', 'e', 'z', 'e', 'g', 'k', 'i', 'l',
's', 'n', 'n', 'o', 'o', 'o', 'o', ' ', 'u', 'l', 's', 'u', 'u', 'z', 'z', ' '
};
// Latin-8 (Celtic)
// Adapted from Wikipedia:
// Gaelic, Welsh, Breton
//
// Byte -> ASCII table indexed by the raw byte value (256 entries).
// TAB/LF/CR and 0x20-0x7E map to themselves, the other bytes below 0xA0
// map to ' ', and 0xA0-0xFF map to an ASCII letter or ' '.
// 0xDE sits in the uppercase row (0xD0-0xDF) and folds to 'Y'; its lowercase
// partner 0xFE folds to 'y'. 0xDE was previously folded to lowercase 'y'.
const unsigned char map_8859_14[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', 'B', 'b', ' ', 'C', 'c', 'D', ' ', 'W', ' ', 'W', 'd', 'Y', ' ', ' ', 'Y',
'F', 'f', 'G', 'g', 'M', 'm', ' ', 'P', 'w', 'p', 'w', 'S', 'y', 'W', 'w', 's',
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
'W', 'N', 'O', 'O', 'O', 'O', 'O', 'T', ' ', 'U', 'U', 'U', 'U', 'Y', 'Y', ' ',
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
'w', 'n', 'o', 'o', 'o', 'o', 'o', 't', ' ', 'u', 'u', 'u', 'u', 'y', 'y', 'y'
};
// Latin-9
// Adapted from Wikipedia:
// Update of 8859-1
// English, French, German, Spanish and Portuguese
//
// Byte -> ASCII table indexed by the raw byte value (256 entries).
// TAB/LF/CR and 0x20-0x7E map to themselves, the other bytes below 0xA0
// map to ' ', and 0xA0-0xFF map to an ASCII letter or ' '.
// NOTE(review): 0xD0 maps to 'D' but its lowercase partner 0xF0 maps to 'o';
// presumably one of the two is unintended - confirm before relying on it.
const unsigned char map_8859_15[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', ' ', ' ', ' ', ' ', ' ', 'S', ' ', 's', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', 'Z', ' ', ' ', ' ', 'z', ' ', ' ', ' ', 'O', 'o', 'Y', ' ',
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ', ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
'o', 'n', 'o', 'o', 'o', 'o', 'o', ' ', ' ', 'u', 'u', 'u', 'u', 'y', ' ', 'y'
};
// Latin-10 "South-Eastern European"
// Adapted from Wikipedia:
// Albanian, Croatian, Hungarian, Polish, Romanian and Slovenian, French,
// Italian and Irish Gaelic (new orthography).
//
// Byte -> ASCII table indexed by the raw byte value (256 entries).
// TAB/LF/CR and 0x20-0x7E map to themselves, the other bytes below 0xA0
// map to ' ', and 0xA0-0xFF map to an ASCII letter or ' '.
const unsigned char map_8859_16[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', 'A', 'a', 'L', ' ', ' ', 'S', ' ', 's', ' ', 'S', ' ', 'Z', ' ', 'z', 'Z',
' ', ' ', 'C', 'l', 'Z', ' ', ' ', ' ', 'z', 'c', 's', ' ', 'O', 'o', 'Y', 'z',
'A', 'A', 'A', 'A', 'A', 'C', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
'D', 'N', 'O', 'O', 'O', 'O', 'O', 'S', 'U', 'U', 'U', 'U', 'U', 'E', 'T', ' ',
'a', 'a', 'a', 'a', 'a', 'c', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
'd', 'n', 'o', 'o', 'o', 'o', 'o', 's', 'u', 'u', 'u', 'u', 'u', 'e', 't', 'y'
};
// Array of all 8859 charsets, indexed by the ISO-8859 part number
// (map_8859[n] is the table used for ISO-8859-n). Index 0 and every part
// without a dedicated table point at map_8859_default.
const unsigned char* map_8859[ISO_8859_NUM_CHARSETS + 1] =
{
map_8859_default, // 0: no such part
map_8859_1, // 1
map_8859_2, // 2
map_8859_3, // 3
map_8859_4, // 4
map_8859_default, // 5: no dedicated table
map_8859_default, // 6: no dedicated table
map_8859_default, // 7: no dedicated table
map_8859_default, // 8: no dedicated table
map_8859_9, // 9
map_8859_10, // 10
map_8859_default, // 11: no dedicated table
map_8859_default, // 12: no dedicated table
map_8859_13, // 13
map_8859_14, // 14
map_8859_15, // 15
map_8859_16 // 16
};
// Byte -> ASCII table for the charset named win-1251, indexed by the raw
// byte value (256 entries). TAB/LF/CR and 0x20-0x7E map to themselves; all
// other bytes map to an ASCII character or ' '.
// Unlike the ISO tables, a few bytes inside the 0x80-0x9F rows map to
// letters here even though the markers still call that range "control chars".
// NOTE(review): the high-half letters are presumably visual look-alikes or
// transliterations of the Cyrillic letters at those code points - confirm.
const unsigned char map_win_1251[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'K', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'k', ' ', ' ',
// ^^^ control chars ^^^
' ', 'Y', 'y', 'J', ' ', ' ', ' ', ' ', 'E', ' ', ' ', ' ', ' ', '-', ' ', 'I',
' ', 'I', 'i', ' ', ' ', ' ', ' ', ' ', 'e', ' ', ' ', ' ', 'j', 'S', 's', 'i',
'A', 'b', 'B', ' ', ' ', 'E', ' ', ' ', 'N', 'N', 'K', ' ', 'M', 'H', 'O', ' ',
'P', 'C', 'T', 'Y', ' ', 'X', ' ', ' ', 'W', 'W', 'b', ' ', 'b', ' ', ' ', 'R',
'a', 'o', 'b', ' ', ' ', 'e', ' ', ' ', 'n', 'n', 'k', ' ', 'm', 'h', 'o', ' ',
'p', 'c', 't', 'y', ' ', 'x', ' ', ' ', 'w', 'w', 'b', ' ', 'b', ' ', ' ', 'r'
};
// Byte -> ASCII table for the charset named win-1252, indexed by the raw
// byte value (256 entries). TAB/LF/CR and 0x20-0x7E map to themselves; all
// other bytes map to an ASCII character or ' '.
// Unlike the ISO tables, a few bytes inside the 0x80-0x9F rows map to
// letters here even though the markers still call that range "control chars".
// NOTE(review): 0xD0 maps to 'D' but its lowercase partner 0xF0 maps to 'o';
// presumably one of the two is unintended - confirm before relying on it.
const unsigned char map_win_1252[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'S', ' ', 'O', ' ', 'Z', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 's', ' ', 'o', ' ', 'z', ' ',
// ^^^ control chars ^^^
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '-', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ', ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
'o', 'n', 'o', 'o', 'o', 'o', 'o', ' ', ' ', 'u', 'u', 'u', 'u', 'y', ' ', 'y'
};
// Byte -> ASCII table for the charset named win-1253, indexed by the raw
// byte value (256 entries). TAB/LF/CR and 0x20-0x7E map to themselves; the
// other bytes below 0xA0 map to ' ', and 0xA0-0xFF map to an ASCII letter
// or ' '.
// NOTE(review): the high-half letters are presumably visual look-alikes of
// the Greek letters at those code points - confirm.
const unsigned char map_win_1253[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', ' ', 'A', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'E', 'H', 'I', ' ', 'O', ' ', 'Y', ' ',
' ', 'A', 'B', ' ', ' ', 'E', 'Z', 'H', ' ', 'I', 'K', ' ', 'M', 'N', ' ', 'O',
' ', 'P', ' ', ' ', 'T', 'Y', ' ', 'X', ' ', ' ', 'I', 'Y', 'a', 'e', 'n', 'i',
'v', 'a', 'b', ' ', ' ', 'e', ' ', 'n', ' ', 'l', 'k', ' ', ' ', 'v', ' ', 'o',
' ', 'p', ' ', 'o', 't', 'v', ' ', 'X', ' ', 'w', 'i', 'v', 'o', 'v', 'w', ' '
};
// Byte -> ASCII table for the charset named win-1254, indexed by the raw
// byte value (256 entries). TAB/LF/CR and 0x20-0x7E map to themselves; all
// other bytes map to an ASCII character or ' '.
// Unlike the ISO tables, a few bytes inside the 0x80-0x9F rows map to
// letters here even though the markers still call that range "control chars".
// 0xD8 folds to the letter 'O' (it was the digit '0'), pairing with 0xF8,
// which folds to 'o'. 0xFF folds to 'y' (it was ' '), pairing with 0x9F -> 'Y'
// and matching map_8859_9.
const unsigned char map_win_1254[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'S', ' ', 'O', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 's', ' ', 'o', ' ', ' ', 'Y',
// ^^^ control chars ^^^
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
'G', 'N', 'O', 'O', 'O', 'O', 'O', 'X', 'O', 'U', 'U', 'U', 'U', 'I', 'S', ' ',
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
'g', 'n', 'o', 'o', 'o', 'o', 'o', ' ', 'o', 'u', 'u', 'u', 'u', 'i', 's', 'y'
};
// Byte -> ASCII table for the charset named win-1255, indexed by the raw
// byte value (256 entries). Per the rows below it is ASCII-only: TAB/LF/CR
// and 0x20-0x7E map to themselves and every other byte maps to ' ' (the
// contents match map_8859_default row for row).
const unsigned char map_win_1255[256] =
{
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
// vvv control chars vvv
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
// ^^^ control chars ^^^
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '
};
// Array of all windows charsets, indexed by the last digit of the codepage
// number (map_win_125x[n] is the table for win-125n). Slot 0 has no codepage
// of its own and is filled with the 1252 table so that it is never NULL.
const unsigned char* map_win_125x[WIN_125X_NUM_CHARSETS + 1] =
{
map_win_1252, // dummy, will never be used - no NULL here...
map_win_1251, // 1
map_win_1252, // 2
map_win_1253, // 3
map_win_1254, // 4
map_win_1255 // 5
};

@ -1,54 +0,0 @@
// Each map below is a 256-entry table indexed by a raw byte value.

// default map for charsets that are highly "non-latin"
extern const unsigned char map_8859_default[256];
// Latin-1
// Adapted from Wikipedia:
// Albanian, Basque, Catalan, Danish, Dutch (missing some letters),
// English, Estonian (missing some letters), Faroese,
// French (missing some letters), Finnish (missing some letters),
// Galician, German, Icelandic, Irish (new orthography), Italian,
// Latin, Norwegian, Portuguese, Rhaeto-Romanic, Scottish, Spanish,
// Swedish, Afrikaans, Swahili
extern const unsigned char map_8859_1[256];
// Latin-2
// Adapted from Wikipedia:
// Bosnian, Croatian, Czech, Hungarian, Polish, Romanian, Serbian,
// Serbocroatian, Slovak, Slovenian, Upper Sorbian and Lower Sorbian
extern const unsigned char map_8859_2[256];
// Latin-3 (South European)
// Adapted from Wikipedia:
// Turkish (superseded by 8859-9), Maltese, Esperanto
extern const unsigned char map_8859_3[256];
// Latin-4 (North European)
// Adapted from Wikipedia:
// Estonian, Latvian, Lithuanian, Greenlandic, and Sami
extern const unsigned char map_8859_4[256];
// Latin-5 (Turkish)
// Adapted from Wikipedia:
// Turkish
extern const unsigned char map_8859_9[256];
// Latin-6 (Nordic)
// Adapted from Wikipedia
extern const unsigned char map_8859_10[256];
// Latin-7 (Baltic Rim)
// Adapted from Wikipedia
extern const unsigned char map_8859_13[256];
// Latin-8 (Celtic)
// Adapted from Wikipedia:
// Gaelic, Welsh, Breton
extern const unsigned char map_8859_14[256];
// Latin-9
// Adapted from Wikipedia:
// Update of 8859-1
// English, French, German, Spanish and Portuguese
extern const unsigned char map_8859_15[256];
// Latin-10 "South-Eastern European"
// Adapted from Wikipedia:
// Albanian, Croatian, Hungarian, Polish, Romanian and Slovenian, French,
// Italian and Irish Gaelic (new orthography).
extern const unsigned char map_8859_16[256];
// array of all 8859 charsets; it has ISO_8859_NUM_CHARSETS + 1 slots so that
// indexes 0..ISO_8859_NUM_CHARSETS are all valid
#define ISO_8859_NUM_CHARSETS 16
extern const unsigned char* map_8859[ISO_8859_NUM_CHARSETS + 1];
// array of the MS-WIN 125x codepage maps; it has WIN_125X_NUM_CHARSETS + 1
// slots so that indexes 0..WIN_125X_NUM_CHARSETS are all valid
#define WIN_125X_NUM_CHARSETS 5
extern const unsigned char* map_win_125x[WIN_125X_NUM_CHARSETS + 1];

362
Lang.cpp

@ -8,107 +8,6 @@ void languageToString ( unsigned char langId , char *buf ) {
strcpy(buf,p);
}
// Language names written in the language itself where known, indexed by
// language id and terminated by a NULL sentinel. Entries marked "don't know
// yet" still carry the English name. Strings are UTF-8.
static char *s_nativeLangStrings[] = {
	"unknown",
	"english",
	"français",
	"español",
	"русский", // spelled entirely in Cyrillic letters
	"türkçe", // was stored mis-encoded (two replacement characters)
	"japanese", // don't know yet
	"chinese traditional", // don't know yet
	"chinese simplified", // don't know yet
	"korean", // don't know yet
	"deutsch",
	"nederlands",
	"italiano",
	"suomi",
	"svenska",
	"norsk",
	"português",
	"vietnamese", // don't know yet
	"arabic", // don't know yet
	"hebrew", // don't know yet
	"indonesian", // don't know yet
	"greek", // don't know yet
	"thai", // don't know yet
	"hindi", // don't know yet
	"bengala", // don't know yet
	"polski",
	"tagalog", // don't know yet
	"latin",
	"esperanto",
	"catalan",
	"bulgarian",
	"translingual",
	"serbo-croatian", // was misspelled "serbo-croatin"
	"hungarian",
	"danish",
	"lithuanian",
	"czech",
	"galician",
	"georgian",
	"scottish gaelic",
	"gothic",
	"romanian",
	"irish",
	"latvian",
	"armenian",
	"icelandic",
	"ancient greek",
	"manx",
	"ido",
	"persian",
	"telugu",
	"venetian",
	"malagasy",
	"kurdish",
	"luxembourgish",
	"estonian",
	NULL
};
// Lower-case English language names, indexed by language id and terminated
// by a NULL sentinel.
static char *s_lowerLangStrings[] = {
	"unknown",             //  0
	"english",             //  1
	"french",              //  2
	"spanish",             //  3
	"russian",             //  4
	"turkish",             //  5
	"japanese",            //  6
	"chinese traditional", //  7
	"chinese simplified",  //  8
	"korean",              //  9
	"german",              // 10
	"dutch",               // 11
	"italian",             // 12
	"finnish",             // 13
	"swedish",             // 14
	"norwegian",           // 15
	"portuguese",          // 16
	"vietnamese",          // 17
	"arabic",              // 18
	"hebrew",              // 19
	"indonesian",          // 20
	"greek",               // 21
	"thai",                // 22
	"hindi",               // 23
	"bengala",             // 24
	"polish",              // 25
	"tagalog",             // 26
	"latin",               // 27
	"esperanto",           // 28
	"catalan",             // 29
	"bulgarian",           // 30
	"translingual",        // 31
	"serbo-croatian",      // 32
	"hungarian",           // 33
	"danish",              // 34
	"lithuanian",          // 35
	"czech",               // 36
	"galician",            // 37
	"georgian",            // 38
	"scottish gaelic",     // 39
	"gothic",              // 40
	"romanian",            // 41
	"irish",               // 42
	"latvian",             // 43
	"armenian",            // 44
	"icelandic",           // 45
	"ancient greek",       // 46
	"manx",                // 47
	"ido",                 // 48
	"persian",             // 49
	"telugu",              // 50
	"venetian",            // 51
	"malagasy",            // 52
	"kurdish",             // 53
	"luxembourgish",       // 54
	"estonian",            // 55
	NULL                   // 56: end marker
};
static char *s_langStrings[] = {
"Unknown","English","French","Spanish","Russian","Turkish","Japanese",
"Chinese Traditional","Chinese Simplified","Korean","German","Dutch",
@ -151,12 +50,7 @@ static char *s_langStrings[] = {
// Returns the s_langStrings entry for langId, or NULL when langId lies past
// the end of that table.
char* getLanguageString ( unsigned char langId ) {
	// the element count comes from the array itself, so it tracks the table
	if ( langId >= sizeof(s_langStrings)/sizeof(char *) ) return NULL;
	return s_langStrings[langId];
}
// Returns the s_nativeLangStrings entry for langId, or NULL when langId lies
// past the end of that table.
char* getNativeLanguageString ( unsigned char langId ) {
	// the element count comes from the array itself, so it tracks the table
	if ( langId >= sizeof(s_nativeLangStrings)/sizeof(char *) ) return NULL;
	return s_nativeLangStrings[langId];
}
}
static char *s_langAbbr[] = {
"xx","en","fr","es","ru","tr","ja","zh_tw","zh_cn","ko","de","nl",
@ -195,31 +89,6 @@ static char *s_langAbbr[] = {
NULL
};
// Alias: this file spells the Latin-6 charset id csISOLatin6, which is
// mapped onto the identifier cslatin6.
#define csISOLatin6 cslatin6

// Charset id per language id, one entry per line (28 entries).
// The quoted labels are carried over from the original per-row comments.
// NOTE(review): the original comment listed four labels ("bn","pl","tl",
// "en_uk") for a row holding three values, so the labels of the final
// entries may be off by one - confirm against the language id enum.
static unsigned char s_langCharset[] = {
	csUnknown,   //  0 "xx"
	csISOLatin1, //  1 "en"
	csISOLatin1, //  2 "fr"
	csISOLatin1, //  3 "es"
	csUnknown,   //  4 "ru"
	csUnknown,   //  5 "zz"
	csUnknown,   //  6 "ja"
	csUnknown,   //  7 "zh_tw"
	csUnknown,   //  8 "zh_cn"
	csUnknown,   //  9 "ko"
	csISOLatin1, // 10 "de"
	csISOLatin1, // 11 "nl"
	csISOLatin1, // 12 "it"
	csISOLatin6, // 13 "fi"
	csISOLatin6, // 14 "sv"
	csISOLatin6, // 15 "no"
	csISOLatin1, // 16 "pt"
	csUnknown,   // 17 "vi"
	csUnknown,   // 18 "ar"
	csUnknown,   // 19 "he"
	csUnknown,   // 20 "id"
	csUnknown,   // 21 "el"
	csUnknown,   // 22 "th"
	csUnknown,   // 23 "hi"
	csUnknown,   // 24 "bn"
	csUnknown,   // 25 "pl"
	csUnknown,   // 26 "tl"
	csUnknown    // 27 "en_uk" / "en_au" in the original comment
};
// Looks name up case-insensitively, first among the lower-case English names
// and then among the native names, and returns the index of the first match.
// Returns 0 when neither table contains the name.
uint8_t getLanguageFromName(uint8_t *name) {
	const char *wanted = (char*)name;
	int idx;

	// pass 1: English names
	idx = 0;
	while ( idx < MAX_LANGUAGES && s_lowerLangStrings[idx] ) {
		if ( strcasecmp(wanted, s_lowerLangStrings[idx]) == 0 ) {
			return idx;
		}
		idx++;
	}

	// pass 2: native names
	idx = 0;
	while ( idx < MAX_LANGUAGES && s_nativeLangStrings[idx] ) {
		if ( strcasecmp(wanted, s_nativeLangStrings[idx]) == 0 ) {
			return idx;
		}
		idx++;
	}

	return 0;
}
uint8_t getLangIdFromAbbr ( const char *abbr ) {
int x;
for(x = 0; x < MAX_LANGUAGES && s_langAbbr[x]; x++)
@ -231,6 +100,16 @@ uint8_t getLangIdFromAbbr ( const char *abbr ) {
return langUnknown;//0;
}
// Returns the index of the first s_langAbbr entry that is a case-insensitive
// prefix of abbr, or langUnknown when abbr is NULL or no entry matches.
// Only strlen(entry) characters are compared, so abbr may carry a suffix
// after the abbreviation. The first match wins: an entry earlier in the
// table shadows any later entry that starts with it.
uint8_t getLangIdFromAbbrN ( const char *abbr ) {
	if ( ! abbr ) return langUnknown;
	for ( int x = 0; x < MAX_LANGUAGES && s_langAbbr[x]; ++x ) {
		const char *known = s_langAbbr[x];
		// abbr is only read, so no const-discarding cast is needed
		if ( strncasecmp ( abbr, known, strlen(known) ) == 0 ) {
			return (uint8_t)x;
		}
	}
	return langUnknown;
}
// Returns the s_langAbbr entry for langId, or NULL when langId lies past the
// end of that table.
char *getLangAbbr ( uint8_t langId ) {
	// same bounds check as getLanguageAbbr(); without it an out-of-range id
	// would read past the end of s_langAbbr
	if ( langId >= sizeof(s_langAbbr)/sizeof(s_langAbbr[0]) ) return NULL;
	return s_langAbbr[langId];
}
@ -238,225 +117,6 @@ char *getLangAbbr ( uint8_t langId ) {
// Returns the s_langAbbr entry for langId, or NULL when langId lies past the
// end of that table.
char* getLanguageAbbr ( unsigned char langId ) {
	// the element count comes from the array itself, so it tracks the table
	if ( langId >= sizeof(s_langAbbr)/sizeof(char *) ) return NULL;
	return s_langAbbr[langId];
}
// Returns the s_langCharset entry for langId, or csUnknown when langId lies
// past the end of that table.
unsigned char getLanguageCharset ( unsigned char langId ){
	// bound the index by the table that is actually read; the previous check
	// used the size of s_langAbbr, which does not describe s_langCharset
	if ( langId >= sizeof(s_langCharset)/sizeof(s_langCharset[0]) ) return csUnknown;
	return s_langCharset[langId];
}
/*
unsigned char getLanguageFromScript(UChar32 c) {
switch(ucGetScript(c)) {
case ucScriptArabic:
return langArabic;
break;
case ucScriptGreek:
return langGreek;
break;
case ucScriptHangul:
case ucScriptHanunoo:
return langKorean;
break;
//case ucScriptHan:
//return langChineseTrad;
case ucScriptHiragana:
case ucScriptKannada:
case ucScriptKatakana:
case ucScriptKatakana_Or_Hiragana:
return langJapanese;
break;
case ucScriptHebrew:
return langHebrew;
break;
case ucScriptThai:
return langThai;
break;
case ucScriptBengali:
return langBengala;
break;
case ucScriptDevanagari:
return langHindi;
break;
default:
return langUnknown;
break;
}
};
*/
// Translate a language abbreviation into a language id.
// The match is exact and case-sensitive (strcmp). British and Australian
// English are not distinguished; only the plain/US forms are listed.
// Returns langUnknown when the abbreviation is not in the table.
unsigned char getLanguageFromAbbr(char *abbr) {
	static const struct { const char *code; unsigned char lang; } s_codes[] = {
		{ "en_US", langEnglish     },
		{ "en-US", langEnglish     },
		{ "en",    langEnglish     },
		{ "fr",    langFrench      },
		{ "es_MX", langSpanish     },
		{ "es-MX", langSpanish     },
		{ "es",    langSpanish     },
		{ "ru",    langRussian     },
		{ "ua",    langRussian     }, // ukrainian?
		{ "ja",    langJapanese    },
		{ "zh_tw", langChineseTrad },
		{ "zh_cn", langChineseSimp },
		{ "ko",    langKorean      },
		{ "de",    langGerman      },
		{ "nl",    langDutch       },
		{ "it",    langItalian     },
		{ "fi",    langFinnish     },
		{ "sv",    langSwedish     },
		{ "no",    langNorwegian   },
		{ "pt",    langPortuguese  },
		{ "vi",    langVietnamese  },
		{ "ar",    langArabic      },
		{ "he",    langHebrew      },
		{ "id",    langIndonesian  },
		{ "el",    langGreek       },
		{ "th",    langThai        },
		{ "hi",    langHindi       },
		{ "bn",    langBengala     },
		{ "pl",    langPolish      },
		{ "tl",    langTagalog     },
		{ "tr",    langTurkish     },
	};
	const int numCodes = (int)(sizeof(s_codes)/sizeof(s_codes[0]));
	for(int i = 0; i < numCodes; i++) {
		if(strcmp(abbr, s_codes[i].code) == 0) return s_codes[i].lang;
	}
	return langUnknown;
}
// Prefix variant of getLanguageFromAbbr(): abbr only has to START with one
// of the known codes, compared case-insensitively over the code's length.
// Entries are tried in table order. Returns langUnknown when none matches.
unsigned char getLanguageFromAbbrN(char *abbr) {
	static const struct { const char *code; unsigned char lang; } s_codes[] = {
		{ "en_US", langEnglish     },
		{ "en-US", langEnglish     },
		{ "en",    langEnglish     },
		{ "fr",    langFrench      },
		{ "es_MX", langSpanish     },
		{ "es-MX", langSpanish     },
		{ "es",    langSpanish     },
		{ "ru",    langRussian     },
		{ "ua",    langRussian     }, // ukrainian?
		{ "ja",    langJapanese    },
		{ "zh_tw", langChineseTrad },
		{ "zh_cn", langChineseSimp },
		{ "ko",    langKorean      },
		{ "de",    langGerman      },
		{ "nl",    langDutch       },
		{ "it",    langItalian     },
		{ "fi",    langFinnish     },
		{ "sv",    langSwedish     },
		{ "no",    langNorwegian   },
		{ "pt",    langPortuguese  },
		{ "vi",    langVietnamese  },
		{ "ar",    langArabic      },
		{ "he",    langHebrew      },
		{ "id",    langIndonesian  },
		{ "el",    langGreek       },
		{ "th",    langThai        },
		{ "hi",    langHindi       },
		{ "bn",    langBengala     },
		{ "pl",    langPolish      },
		{ "tl",    langTagalog     },
		{ "tr",    langTurkish     },
	};
	const int numCodes = (int)(sizeof(s_codes)/sizeof(s_codes[0]));
	for(int i = 0; i < numCodes; i++) {
		const char *code = s_codes[i].code;
		if(strncasecmp(abbr, code, strlen(code)) == 0)
			return s_codes[i].lang;
	}
	return langUnknown;
}
// Translate a lower-case language abbreviation into a language id by
// prefix: abbr only has to start with one of the codes below. The
// five-character codes are tried before the two-character ones.
// Returns langUnknown when no code matches.
//
// The comparison uses strncmp rather than memcmp: abbr is a NUL-terminated
// string that may be shorter than five bytes, and memcmp is allowed to read
// all requested bytes even after a mismatch. strncmp stops at the
// terminator and gives the same answer for every valid input.
unsigned char getLanguageFromUnicodeAbbr(char *abbr) {
	static const struct { const char *code; unsigned char lang; } s_codes[] = {
		{ "en_us", langEnglish     },
		{ "en-us", langEnglish     },
		{ "es_mx", langSpanish     },
		{ "es-mx", langSpanish     },
		{ "zh_tw", langChineseTrad },
		{ "zh_cn", langChineseSimp },
		{ "en",    langEnglish     },
		{ "fr",    langFrench      },
		{ "es",    langSpanish     },
		{ "ru",    langRussian     },
		{ "ja",    langJapanese    },
		{ "ko",    langKorean      },
		{ "de",    langGerman      },
		{ "nl",    langDutch       },
		{ "it",    langItalian     },
		{ "fi",    langFinnish     },
		{ "sv",    langSwedish     },
		{ "no",    langNorwegian   },
		{ "pt",    langPortuguese  },
		{ "vi",    langVietnamese  },
		{ "ar",    langArabic      },
		{ "he",    langHebrew      },
		{ "id",    langIndonesian  },
		{ "el",    langGreek       },
		{ "th",    langThai        },
		{ "hi",    langHindi       },
		{ "bn",    langBengala     },
		{ "pl",    langPolish      },
		{ "tl",    langTagalog     },
		{ "tr",    langTurkish     },
	};
	if(!abbr) return langUnknown;
	const int numCodes = (int)(sizeof(s_codes)/sizeof(s_codes[0]));
	for(int i = 0; i < numCodes; i++) {
		const char *code = s_codes[i].code;
		if(strncmp(abbr, code, strlen(code)) == 0)
			return s_codes[i].lang;
	}
	return langUnknown;
}
// Translate a country code into a language id.
// Country codes whose language differs from the language abbreviation of
// the same spelling are handled by the override table first (note that
// "vi" here maps to English, not Vietnamese); everything else falls
// through to getLanguageFromAbbr().
unsigned char getLanguageFromCountryCode(char *code) {
	static const struct { const char *country; unsigned char lang; } s_overrides[] = {
		{ "us", langEnglish     },
		{ "uk", langEnglish     },
		{ "vi", langEnglish     },
		{ "ae", langArabic      },
		{ "cn", langChineseSimp },
		{ "tw", langChineseTrad },
		{ "vn", langVietnamese  },
	};
	const int numOverrides = (int)(sizeof(s_overrides)/sizeof(s_overrides[0]));
	for(int i = 0; i < numOverrides; i++) {
		if(strcmp(code, s_overrides[i].country) == 0)
			return(s_overrides[i].lang);
	}
	return(getLanguageFromAbbr(code));
}
// Prefix matcher used while scanning a user-agent string in place, so the
// caller does not have to cut the string into tokens first. Prefer
// getLanguageFromAbbr() whenever an isolated abbreviation is available.
// The comparison is case-sensitive and only looks at the first two or five
// characters, so any text starting with a listed code matches.
// Returns langUnknown when no code matches.
unsigned char getLanguageFromUserAgent(char *abbr) {
	static const struct { const char *code; unsigned char lang; } s_codes[] = {
		{ "en_US", langEnglish     },
		{ "en-US", langEnglish     },
		{ "en",    langEnglish     },
		{ "fr",    langFrench      },
		{ "es_MX", langSpanish     },
		{ "es-MX", langSpanish     },
		{ "es",    langSpanish     },
		{ "ru",    langRussian     },
		{ "ja",    langJapanese    },
		{ "zh_tw", langChineseTrad },
		{ "zh_cn", langChineseSimp },
		{ "ko",    langKorean      },
		{ "de",    langGerman      },
		{ "nl",    langDutch       },
		{ "it",    langItalian     },
		{ "fi",    langFinnish     },
		{ "sv",    langSwedish     },
		{ "no",    langNorwegian   },
		{ "pt",    langPortuguese  },
		{ "vi",    langVietnamese  },
		{ "ar",    langArabic      },
		{ "he",    langHebrew      },
		{ "id",    langIndonesian  },
		{ "el",    langGreek       },
		{ "th",    langThai        },
		{ "hi",    langHindi       },
		{ "bn",    langBengala     },
		{ "pl",    langPolish      },
		{ "tl",    langTagalog     },
		{ "tr",    langTurkish     },
	};
	const int numCodes = (int)(sizeof(s_codes)/sizeof(s_codes[0]));
	for(int i = 0; i < numCodes; i++) {
		const char *code = s_codes[i].code;
		if(strncmp(abbr, code, strlen(code)) == 0)
			return s_codes[i].lang;
	}
	return langUnknown;
}
// . these are going to be adult, in any language

15
Lang.h

@ -12,8 +12,8 @@
// translingual is the 31st bit, english is the first bit
#define LANG_BIT_MASK 0x007fffffffffffffLL
#include "Unicode.h"
#include "Iso8859.h"
#include "iana_charset.h"
enum {
langUnknown = 0,
langEnglish = 1,
@ -76,23 +76,14 @@ enum {
langLast = 56
};
// --- language id <-> name / abbreviation lookups ---
// Case-insensitive whole-string match of a language name; returns 0 when
// no name matches.
uint8_t getLanguageFromName(uint8_t *name);
uint8_t getLangIdFromAbbr ( const char *abbr ) ;
// Case-insensitive prefix match against the abbreviation table; returns
// langUnknown when nothing matches.
uint8_t getLangIdFromAbbrN ( const char *abbr ) ;
// Abbreviation string for a language id.
char *getLangAbbr ( uint8_t langId ) ;
void languageToString ( unsigned char lang , char *buf );
char* getLanguageString ( unsigned char lang);
char* getNativeLanguageString ( unsigned char lang);
// Abbreviation string for a language id, or NULL when the id is out of range.
char* getLanguageAbbr ( unsigned char lang);
// Charset recorded for a language id, or csUnknown when the id is out of range.
unsigned char getLanguageCharset ( unsigned char LangId );
bool isAdult( char *s, int32_t slen, char **loc = NULL );
//unsigned char getLanguageFromScript(UChar32 c);
// Exact, case-sensitive abbreviation match; langUnknown when not found.
unsigned char getLanguageFromAbbr(char *abbr);
// Case-insensitive prefix match; langUnknown when not found.
unsigned char getLanguageFromAbbrN(char *abbr);
//unsigned char getLanguageFromUnicodeAbbr(UChar *abbr);
// abbr is now in utf8
// Prefix match against lower-case codes; langUnknown when not found.
unsigned char getLanguageFromUnicodeAbbr(char *abbr);
// Case-sensitive prefix match intended for scanning a user-agent string.
unsigned char getLanguageFromUserAgent(char *abbr);
// Country-code overrides first, then falls back to getLanguageFromAbbr().
unsigned char getLanguageFromCountryCode(char *code);
#endif

File diff suppressed because it is too large Load Diff

@ -1,299 +0,0 @@
#ifndef _LANGUAGE_H_
#define _LANGUAGE_H_
//#include <wchar.h>
#include "gb-include.h"
//#include "UnicodeProperties.h" //UChar32
#include "File.h"
#include "HashTableT.h"
#include "Query.h"
#include "Lang.h"
#include "Multicast.h"
#include "Threads.h"
#include "Titledb.h"
#include "Iso8859.h"
#include "IndexList.h"
//#include "Msg3a.h"
#include "Msg20.h"
// max chars in any language
#define MAX_WORDS_PER_PHRASE 5
#define MAX_CHARS 256
// Parenthesized so the product keeps its value when the macro is used
// inside a larger expression (e.g. after '%' or '/').
#define TOP_POP_PHRASES (40 * 1024)
#define NUM_CHARS 40
#define MAX_FRAG_SIZE 1024
// max chars that start the rule
#define MAX_PHRASE_LEN 80
#define MAX_RECOMMENDATIONS 10
#define LARGE_SCORE 0xfffff
#define MAX_NARROW_SEARCHES 19
/*
// used only while generating titles from wikipedia pages, makeWikiFiles()
class StateWik {
public:
bool getIndexList( );
bool getSummary ( );
bool gotSummary ( );
int m_fdw;
Msg0 m_msg0;
IndexList m_list;
Query m_q;
key_t m_startKey;
key_t m_endKey;
char *m_coll;
int32_t m_collLen;
int64_t m_termId;
int32_t m_minRecSize;
Msg20 m_msg20s[MAX_FRAG_SIZE];
int32_t m_numMsg20sOutstanding;
int32_t m_numMsg20sLaunched;
int32_t m_numMsg20sReceived;
};
class StateDict{
public:
char *m_dictBuf;
int32_t m_dictBufSize;
char *m_buf;
int32_t m_bufSize;
char **m_wordsPtr;
int64_t *m_termIds;
int64_t *m_termFreqs;
int32_t m_numTuples;
Msg37 m_msg37;
};
*/
/*class StateAff{
public:
bool openAffinityFile ( );
bool launchAffinity ( );
bool gotAffinityFreqs1 ( );
bool gotAffinityFreqs2 ( );
bool doneAffinities ( );
FILE *m_fdr;
int m_fdw;
int32_t m_fileNum;
char m_buf[1026];
Msg3a m_msg3a;
Query m_q;
int64_t m_numerator;
int64_t m_denominator;
};*/
// A candidate string of at most MAX_PHRASE_LEN bytes paired with a score.
// Language::tryPhonet() takes an array of these.
// NOTE(review): presumably one spelling recommendation -- confirm against
// the implementation, which is not visible here.
typedef struct Reco{
char reco[MAX_PHRASE_LEN];
int32_t score;
}Reco;
// Per-language state declared for the speller: a dictionary hash table,
// phonetic rule buffers, word/phonetic tables and the weights listed at the
// bottom of the class. Only declarations are visible in this header, so the
// notes here describe what the declarations themselves show.
// NOTE(review): the m_narrow*, m_frnt* and m_bck* members have no visible
// user in this header (narrowPhrase() and loadNarrow() are commented out)
// -- confirm whether they are still needed.
class Language {
public:
Language();
~Language();
void reset();
bool init( char *unifiedBuf, int32_t unifiedBufSize, int32_t lang,
int32_t hostsPerSplit, uint32_t myHash );
// records the language id in m_lang; nothing else is touched
void setLang( int32_t lang ) { m_lang = lang; };
//bool makeAffinities();
//int32_t getPhrasePopularity ( char *s, uint64_t h,
// bool checkTitleRecDict );
bool checkDict(char *s, int32_t slen, char encodeType);
// results are passed back through recommendation, found, score and
// popularity; forceReco defaults to false
bool getRecommendation( char *origWord, int32_t origWordLen,
char *recommendation, int32_t recommendationLen,
bool *found, int32_t *score, int32_t *popularity,
bool forceReco = false );
//int32_t narrowPhrase ( char *request, char *phrases, int32_t *pops,
// int32_t maxPhrases );
//bool generateDicts ( int32_t numWordsToDump , char *coll );
//bool convertLatin1DictToUTF8 ( char *infile );
// needed for makeDict
//bool gotTermFreqs( StateDict *st );
//StateDict *m_stateDict;
// hash table of the dictionary
HashTableT <uint64_t, int32_t>m_dict;
private:
int32_t spellcheckDict();
// always accepts only ascii chars. makeClean() converts unicode into
// ascii
bool getPhonetic( char *origWord, int32_t origWordLen,
char *target, int32_t targetLen );
bool loadRules();
bool loadSpellerDict( char *spellerBuf, int32_t spellerbufSize,
int32_t hostsPerSplit, uint32_t myHash );
//bool loadTitleRecDicts( );
//bool loadNarrow( char *spellerBuf, int32_t spellerBufSize,
// int32_t hostsPerSplit, uint32_t myHash );
bool loadDictHashTable( );
//bool genTopPopFile ( char *infile );
bool genDistributedPopFile ( char *infile, uint32_t myHash );
//bool cleanDictFile ( );
bool makeClean( char *inBuf, int32_t inBufSize,
char *outBuf, int32_t outBufSize );//, bool isUTF16 );
//bool makePhonet( char *infile);
//bool makeDict();
//bool makeQueryFiles ( );
//bool makeWikiFiles ( );
bool loadWikipediaWords();
bool loadMispelledWords();
bool hasMispelling(char *phrase, int32_t phraseLen);
int32_t tryPhonet( char *phonetTmp, char *origPhonet,
char *origClean, int32_t tryForScore,
Reco *recos, int32_t numRecos, int32_t *lowestScore );
// NOTE: editDistance is overloaded; this four-argument form returns
// int32_t, the two-argument form further down returns int16_t
int32_t editDistance( char *a, char *b, int32_t level, // starting level
int32_t limit ); // maximum level
int32_t weightedAverage(int32_t soundslikeScore, int32_t wordScore);
int32_t limitEditDistance( char *a, char *b, int32_t limit );
int32_t limit1EditDistance( char *a, char *b );
int32_t limit2EditDistance( char *a, char *b );
int32_t checkRest( char *a, char *b, int32_t w, char *amax, int32_t min );
int32_t check2( char *a, char *b, int32_t w, char *amax, int32_t min );
int16_t editDistance( char *a0, char *b0 );
int16_t reduceScore ( char *a, char *b );
//bool makeWordFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase ,
// char *coll );
//bool makePopFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase ,
// char *coll);
//bool makeScoreFiles ( int32_t maxWordsPerFile );
// this map maps a char to a "dict char"
//unsigned char m_map [ 256 ];
// . when comparing letter pairs, we only allow them to consist of
// certain chars: 0-9, A-Z, apostrophe and space and \0 otherwise
// m_table gets too big. This implies a NUM_CHARS of
// . this compressed the value, too
// . \0, space, 0-9, A-Z, \' is the ordering
//unsigned char to_dict_char ( unsigned char c ) { return m_map[c]; };
// Temporary unicode workaround for latin-1 compatibility
//unsigned char uc_to_dict_char ( UChar c ) {
// if (c>255)c=0;
// return m_map[c];
//};
// what language loaded
int32_t m_lang;
// what charset does this language use
unsigned char m_charset;
// buffer to store the phonetic rules
char *m_rulesBuf;
int32_t m_rulesBufSize;
char **m_rulesPtr;
int32_t m_rulesPtrSize;
int32_t m_numRules;
// points to the index of each rule that starts with a new character
int32_t m_ruleStarts[MAX_CHARS];
// the chars that are in a phonet
bool m_ruleChars[MAX_CHARS];
// buffers to store the dictionaries
char *m_distributedBuf;
int32_t m_distributedBufSize;
char **m_tuplePtr;
int32_t m_tuplePtrSize;
int32_t m_numTuples;
// total number of phonets
int32_t m_numPhonets;
// narrow phrase
char *m_narrowBuf;
int32_t m_narrowBufSize;
int32_t m_numNarrowPtrs;
char **m_frntPtrs;
char **m_bckPtrs;
int32_t *m_frntCharPtrs;//[NUM_CHARS][NUM_CHARS][NUM_CHARS];
int32_t *m_bckCharPtrs;//[NUM_CHARS][NUM_CHARS][NUM_CHARS];
// m_phonetics stores the hash of the phonetic as the key.
// the value is a composite of index in m_tuplePtrs where the list
// starts as the high 32 bits of the value and the number of
// words having the same phonetic as the low 32 bits of the value
HashTableT <uint64_t, uint64_t > m_phonetics;
// hash table of the distributed pop words dictionary
// HashTableT <uint32_t, int32_t> m_titlerecDict;
// hash table of the distributed pop words dictionary
HashTableT <uint64_t, int32_t>m_distributedPopPhrases;
// hash table of the top popular words in the dictionary
// HashTableT <uint32_t, char *> m_topPopPhrases;
// hash table of mispelled words
HashTableT <uint32_t, bool>m_misp;
// hash table of wikipedia words
HashTableT <uint32_t, bool>m_wiki;
// PARMS, which can be adjusted. Currently all languages have the
// same adjustments, so using the same parms.
int32_t m_editDistanceWeightsDel1;
int32_t m_editDistanceWeightsDel2;
int32_t m_editDistanceWeightsSwap;
int32_t m_editDistanceWeightsSub;
int32_t m_editDistanceWeightsSimilar;
int32_t m_editDistanceWeightsMin;
int32_t m_editDistanceWeightsMax;
int32_t m_soundslikeWeight;
int32_t m_wordWeight;
int32_t m_span;
bool m_followup;
bool m_collapseResult;
bool m_removeAccents;
};
#endif

@ -12,46 +12,6 @@
LanguageIdentifier g_langId;
/// List of TLDs that should not be used for language detection.
/// NULL terminated.
///
/// Sadly, .de seems to be about half German pages and about half
/// English as well. We cannot use it to distinguish language.
/// Also, .at has some english pages.
/// Also, .nl has some english pages.
/// Also, .no has some english pages.
/// Also, .vn has some english pages.
/// Also, .ro has some english pages.
/// Also, .gr has some english pages.
/// Also, .th has some english pages.
/// Also, .pl has some english pages.
/// Also, .gs has some english pages.
///
/// (Pretty soon it will be faster to have a list of domains that
/// WILL work instead of domains that won't.)
///
/// The list also carries the generic TLDs (info, com, org, net, mil) and
/// tv, ws and ru, which the notes above do not mention. It is walked by
/// LanguageIdentifier::isAmbiguousTLD() until the NULL entry.
///
static char *ambiguousTLDs[] = {
"info",
"com",
"org",
"net",
"mil",
"de",
"at",
"tv",
"nl",
"no",
"ws",
"vn",
"ro",
"ru",
"gr",
"th",
"pl",
"gs",
NULL
};
const uint8_t *langToTopic[] = {
(uint8_t*)"Unknown",
(uint8_t*)"English",
@ -82,527 +42,10 @@ const uint8_t *langToTopic[] = {
(uint8_t*)"Tagalog"
};
#define MAX_DOCTYPE_SEARCH_LEN (512)
/// Find a language tag in a DOCTYPE element.
///
/// Locates the first double-quoted string in the tag and returns the text
/// that follows the last slash inside it, e.g. for
/// "-//W3C//DTD HTML 4.01//EN" the result points at EN".
///
/// The scan is confined to the quoted string. The earlier version started
/// one character past the closing quote, so it found nothing when the quote
/// was the final character and could pick up a slash outside the quotes.
///
/// @param content pointer to the document's content
///
/// @return pointer to the language tag, or NULL
///
static char * FindLanguageIndex(char *content) {
	char *openQuote;
	char *closeQuote;
	char *p;
	if(!content)
		return(NULL);
	openQuote = strchr(content, '"');
	if(!openQuote)
		return(NULL);
	closeQuote = strchr(openQuote + 1, '"');
	if(!closeQuote)
		return(NULL);
	// walk back from the closing quote to the last slash inside the quotes
	for(p = closeQuote; p > openQuote; p--) {
		if(*p == '/')
			return(p + 1);
	}
	return(NULL);
}
/// Copy a language tag.
///
/// Copies characters from src, lower-cased, until a double quote, the end
/// of src, or maxSize characters have been copied. Always NUL-terminates
/// dst, so dst must have room for maxSize + 1 bytes.
///
/// The earlier loop stopped one character early (maxSize - 1 characters),
/// which cut five-letter tags such as zh_cn when called with maxSize 5.
///
/// @param dst the destination
/// @param src the source (returned from FindLanguageIndex())
/// @param maxSize max length of dst, not counting NULL
///
/// @return true on successful copy, false otherwise
///
static bool copyLangTag(char *dst, char *src, int maxSize) {
	int len = 0;
	if(!dst || !src || maxSize < 1)
		return(false);
	while(len < maxSize && src[len] && src[len] != '"') {
		// tolower() requires a value representable as unsigned char
		dst[len] = (char)tolower((unsigned char)src[len]);
		len++;
	}
	dst[len] = 0;
	return(true);
}
// Nothing to set up: the constructor has no work to do.
LanguageIdentifier::LanguageIdentifier() {
}
// True when tld appears in the NULL-terminated ambiguousTLDs list.
// Each comparison covers the longer of len and the list entry's length.
inline bool LanguageIdentifier::isAmbiguousTLD(char *tld, int len) {
	for(char **entry = ambiguousTLDs; *entry; entry++) {
		if(strncmp(tld, *entry, maxOf(len, gbstrlen(*entry))) == 0)
			return(true);
	}
	return(false);
}
uint8_t getLanguageFromAbbr2 ( char *str , int32_t len ) {
// truncate
if ( len > 5 ) len = 5;
// copy it and check it
char lang[6];
for ( int32_t j = 0 ; j < len ; j++ )
lang[j] = to_lower_a(str[j]);
lang[len]='\0';
return getLanguageFromAbbr(lang);
}
// Look for an explicit language declaration in the markup.
// For every node, in document order:
//  - a META tag whose "name" starts with Content-Language, language or
//    Content_Language (case-insensitive) contributes its "content" value;
//    any other META tag is checked for http-equiv starting with Language;
//  - HTML, BODY and HEAD tags contribute their "lang" attribute.
// The first value that getLanguageFromAbbr2() recognizes is returned;
// langUnknown is returned when xml is NULL or nothing is recognized.
uint8_t LanguageIdentifier::guessLanguageFromTag(Xml *xml) {
	if(!xml) return(langUnknown);
	uint8_t found = langUnknown;
	int32_t attrLen = 0;
	char *attr;
	for(int32_t n = 0; n < xml->getNumNodes(); n++) {
		const int tagId = xml->getNodeId(n);
		if(tagId == TAG_META) {
			attr = (char *) xml->getString(n, "name", &attrLen);
			const bool namesLanguage = attr &&
				( strncasecmp(attr, "Content-Language",16) == 0 ||
				  strncasecmp(attr, "language",8) == 0 ||
				  strncasecmp(attr, "Content_Language",16) == 0 );
			if(namesLanguage) {
				attr = (char *) xml->getString(n, "content", &attrLen);
				found = getLanguageFromAbbr2(attr,attrLen);
				if(found != langUnknown) return(found);
			}
			else {
				attr = (char *) xml->getString(n, "http-equiv", &attrLen);
				if(attr && strncasecmp(attr, "Language", 8) == 0) {
					attr = (char *) xml->getString(n, "content", &attrLen);
					found = getLanguageFromAbbr2(attr,attrLen);
					if(found != langUnknown) return(found);
				}
			}
		}
		// only these three tags are consulted for a lang attribute
		const bool mayCarryLang = ( tagId == TAG_HTML ||
					    tagId == TAG_BODY ||
					    tagId == TAG_HEAD );
		if(!mayCarryLang) continue;
		attr = (char *) xml->getString(n, "lang", &attrLen);
		found = getLanguageFromAbbr2(attr,attrLen);
		if(found != langUnknown) return(found);
	}
	return(found);
}
// Guess the page language from the TLDs of its outlinks.
// Needs at least 15 links and looks at no more than the first 100. For
// each link the host is taken as the text after the first seven characters
// (the "http://" prefix) up to the next '/', and its TLD votes for every
// language id 1..31 that g_langList reports as valid for that TLD, unless
// the TLD is in the ambiguous list.
// A language wins only if its vote count beats the runner-up by more
// than 5; otherwise langUnknown is returned.
//
// Changes from the earlier version:
//  - the host copy is bounded by the link length and by the buffer size;
//  - links too short to hold a host are skipped instead of over-read;
//  - the runner-up is tracked even when the winner is seen first, so the
//    "beat 2nd place by 5" rule is actually enforced.
uint8_t LanguageIdentifier::guessLanguageFromOutlinks(Links *links) {
	char host[MAX_URL_LEN];
	int32_t langs[32];
	if(!links) return(langUnknown);
	if(links->getNumLinks() < 15) return(langUnknown);
	// clear list
	memset(langs, 0, sizeof(langs));
	// trim to only 100 links to prevent
	// spinning on some large pages
	for(int lc = 0; lc < links->getNumLinks() && lc < 100; lc++) {
		char *cp = links->getLink(lc);
		if(!cp) continue;
		// skip http://
		int len = links->getLinkLen(lc) - 7;
		if(len <= 0) continue;
		cp += 7;
		if(len > MAX_URL_LEN - 1) len = MAX_URL_LEN - 1;
		int n = 0;
		while(n < len && cp[n] && cp[n] != '/') {
			host[n] = cp[n];
			n++;
		}
		host[n] = '\0';
		char *tld = strrchr(host, '.');
		if(!tld) continue;
		// skip to tld
		tld++;
		int tldLen = gbstrlen(tld);
		// only bother if not a common TLD
		if(isAmbiguousTLD(tld, tldLen)) continue;
		for(uint8_t l = 1; l < 32; l++) {
			if(g_langList.isLangValidForTld(tld, tldLen, l))
				langs[l]++;
		}
	}
	// look for a clear winner from the list
	// don't bother with langUnknown, it reduces hits
	int best = 0;
	int runnerUp = 0;
	uint8_t bestLang = 0;
	for(uint8_t l = 1; l < 32; l++) {
		if(langs[l] >= best) {
			runnerUp = best;
			best = langs[l];
			bestLang = l;
		}
		else if(langs[l] > runnerUp) {
			runnerUp = langs[l];
		}
	}
	// 1st place must beat 2nd place by 5
	if(best - runnerUp > 5) return(bestLang);
	return(langUnknown);
}
// TLD-based guessing is switched off: the TLD was judged not to be a good
// indication of language, so this always reports langUnknown. The
// parameter is kept so existing callers do not change.
uint8_t LanguageIdentifier::guessLanguageFromTld(char *linktext) {
	return(langUnknown);
}
// Guess the page language from the languages of its inlinks.
// Requires at least 7 good inlinks and at least 7 counted votes. An inlink
// votes when its language id is 1..31 and its m_ip differs from ip in the
// bits selected by the 0x0000ffff mask. The most frequent language is
// returned only if its count exceeds half the number of good inlinks;
// otherwise langUnknown.
//
// Changes from the earlier version:
//  - vote counters are 32-bit; the old uint8_t counters wrapped after 255
//    votes because the inlink loop is not capped;
//  - a NULL linkInfo yields langUnknown;
//  - the format string has spaces around INT32 so it is not read as a
//    literal suffix by C++11 compilers (the resulting text is unchanged).
uint8_t LanguageIdentifier::guessLanguageFromInlinks(LinkInfo *linkInfo, int32_t ip) {
	if(!linkInfo) return(langUnknown);
	if(linkInfo->getNumGoodInlinks() < 7) return(langUnknown);
	int32_t counts[32];
	memset(counts, 0, sizeof(counts));
	int32_t hits = 0;
	for (Inlink *k = NULL; (k = linkInfo->getNextInlink(k)); ) {
		int32_t id = k->m_language;
		// sanity check, we are still getting bad lang ids!!
		if ( id < 0 || id >= 32 ) {
			log("build: Got bad lang id of %" INT32 ". how can this "
			    "happen?",id);
			continue;
		}
		// don't count langUnknown pages, it reduces hits
		if ( ! id ) continue;
		// skip if not from a different enough IP
		if((k->m_ip&0x0000ffff)==(ip&0x0000ffff) )
			continue;
		// otherwise count it
		counts[id]++;
		hits++;
	}
	if(hits < 7) return(langUnknown);
	int32_t best = 0;
	uint8_t bestLang = 0;
	for(int32_t x = 1; x < 32; x++) {
		// ties go to the higher language id, as before
		if(counts[x] >= best) {
			best = counts[x];
			bestLang = (uint8_t)x;
		}
	}
	// Need better than 50%
	if(best > (linkInfo->getNumGoodInlinks() / 2))
		return(bestLang);
	return(langUnknown);
}
// Guess the language from the document's DOCTYPE declaration.
// Walks the nodes until a DOCTYPE tag yields a language index (see
// FindLanguageIndex()), copies up to five characters of it in lower case
// and looks them up with getLanguageFromAbbr(). Only the first such
// DOCTYPE is considered.
// Returns langUnknown when xml or content is NULL (xml was previously
// dereferenced unchecked), or when no usable DOCTYPE is found.
uint8_t LanguageIdentifier::guessLanguageFromDoctype(Xml *xml, char *content) {
	if(!xml || !content) return(langUnknown);
	for(int32_t i = 0; i < xml->getNumNodes(); i++) {
		// skip if not DOCTYPE
		if ( xml->getNodeId(i) != TAG_DOCTYPE ) continue;
		// the tag text itself is searched, not the whole document
		char *str = FindLanguageIndex(xml->getNode(i));
		if(!str) continue;
		// room for five characters plus the terminator
		char lang[6];
		if(copyLangTag(lang, str, 5))
			return(getLanguageFromAbbr(lang));
		return(langUnknown);
	}
	return(langUnknown);
}
/// Advance past leading whitespace.
///
/// Space, tab, LF and CR count as whitespace.
///
/// @param str the string
///
/// @return pointer to the first character that is not whitespace (possibly
///         the terminating NUL), or NULL when str is NULL
///
static char *skipwhite(char *str) {
	if(!str)
		return(str);
	for(;; str++) {
		switch(*str) {
		case ' ':
		case '\t':
		case '\n':
		case '\r':
			// still whitespace, keep going
			continue;
		default:
			return(str);
		}
	}
}
/// Advance past one 'word'.
///
/// A word is any run of characters other than space, tab, LF and CR.
///
/// @param str the string to search
///
/// @return pointer to the next whitespace character or the terminating
///         NUL, or NULL when str is NULL
///
static char *skipword(char *str) {
	if(!str)
		return(str);
	for(; *str != '\0'; str++) {
		const char c = *str;
		if(c == ' ' || c == '\t' || c == '\n' || c == '\r')
			break;
	}
	return(str);
}
// Scan a user-agent string word by word, e.g.
//   Mozilla/5.0 (X11; U; Linux i686;
//   en-US; rv:1.8.1.4) Gecko/20070531 Firefox/2.0.0.4
// and return the language of the first whitespace-separated word that
// getLanguageFromUserAgent() recognizes, or langUnknown if none does.
// Recognition is a prefix test on the start of each word.
uint8_t LanguageIdentifier::guessLanguageFromUserAgent(char *str) {
	for(char *p = str; *p; ) {
		p = skipwhite(p);
		if(!p) break;
		const uint8_t lang = getLanguageFromUserAgent(p);
		if(lang != langUnknown)
			return(lang);
		p = skipword(p);
		if(!p) break;
	}
	return(langUnknown);
}
// Delegate to the category tree: returns whatever
// g_categories->findLanguage() reports for addr.
uint8_t LanguageIdentifier::guessLanguageFromDMOZ(char *addr) {
	const uint8_t lang = g_categories->findLanguage(addr);
	return(lang);
}
// Guess the language of a query from its terms.
//  - one term: the language g_langList has for that term, if any;
//  - several terms: the first language that two consecutive terms agree
//    on (not as good as a frequency count, but much faster).
// Returns langUnknown otherwise, including for a NULL query.
//
// Changes from the earlier version:
//  - the single-term case read term index 1, one past the only term;
//  - the previous term's language was never recorded, so the
//    two-in-a-row test could not succeed.
uint8_t LanguageIdentifier::guessLanguageFromQuery(Query *q) {
	if(!q) return(langUnknown);
	uint8_t lang = langUnknown;
	const int32_t numTerms = q->getNumTerms();
	if(numTerms == 1) {
		if(g_langList.lookup(q->getTermId(0), &lang))
			return(lang);
		return(langUnknown);
	}
	// sentinel meaning "previous term had no known language"
	const uint8_t noLang = 255;
	uint8_t last = noLang;
	for(int32_t i = 0; i < numTerms; i++) {
		if(!g_langList.lookup(q->getTermId(i), &lang)) {
			// an unknown term breaks the run
			last = noLang;
			continue;
		}
		if(lang == last) return(lang);
		last = lang;
	}
	return(langUnknown);
}
// Try each language detector in turn and return the first answer other
// than langUnknown. *method is set to the name of the detector that was
// tried last, so on return it names the one that produced the result (or
// "Doctype" when every detector came back unknown).
// Order: Tag, DMOZ, TLD, Outlinks, Freq, Doctype. The inlink detector is
// not consulted; linkInfo is unused.
uint8_t LanguageIdentifier::getBestLanguage(char** method,
					    Url* url,
					    Xml* xml,
					    Links* links,
					    LinkInfo* linkInfo,
					    char* content) {
	uint8_t guess;

	// Let the site tell us what language it's in
	guess = g_langId.guessLanguageFromTag(xml);
	*method = "Tag";
	if(guess != langUnknown) return guess;

	// DMOZ category: accurate, but low hit rate
	guess = g_langId.guessLanguageFromDMOZ(url->getUrl());
	*method = "DMOZ";
	if(guess != langUnknown) return guess;

	// TLD of the page itself
	guess = g_langId.guessLanguageFromTld(url->getUrl());
	*method = "TLD";
	if(guess != langUnknown) return guess;

	// TLDs of the outlinks
	guess = g_langId.guessLanguageFromOutlinks(links);
	*method = "Outlinks";
	if(guess != langUnknown) return guess;

	// Word frequency count
	guess = xml->getLanguage();
	*method = "Freq";
	if(guess != langUnknown) return guess;

	// Last resort: the doctype declaration
	guess = g_langId.guessLanguageFromDoctype(xml, content);
	*method = "Doctype";
	return guess;
}
// Pick the strongest languages out of a per-language count vector.
// langCount has MAX_LANGUAGES entries and is modified: the langUnknown
// slot and every selected slot are zeroed. Up to tagVecSize languages are
// written to langIds/langScores in descending count order; the score is
// the count as a percentage of all known-language counts. Selection stops
// at the first count that is zero or less than half of the top count.
// Returns the number of entries written (0 when every count is zero or
// only langUnknown has counts).
uint8_t LanguageIdentifier::getBestLangsFromVec(char* langCount,
						int32_t *langIds ,
						uint8_t *langScores ,
						int32_t tagVecSize) {
	int32_t total = 0;
	for(int32_t lang = 0; lang < MAX_LANGUAGES; lang++)
		total += langCount[lang];
	if(total == 0 || langCount[langUnknown] == total)
		return 0;

	// the unknown language is never reported
	total -= langCount[langUnknown];
	langCount[langUnknown] = 0;

	int32_t topCount = -1;
	uint8_t written = 0;
	for(int32_t slot = 0; slot < tagVecSize; slot++) {
		// first language with the highest remaining count
		int32_t winner = 0;
		int32_t winnerCount = 0;
		for(int32_t lang = 0; lang < MAX_LANGUAGES; lang++) {
			if(langCount[lang] <= winnerCount) continue;
			winnerCount = langCount[lang];
			winner = lang;
		}
		if(slot == 0) topCount = winnerCount;
		// nothing left, or less than half of the leader: stop
		if(winnerCount == 0) break;
		if(winnerCount < (topCount/2)) break;
		langIds   [slot] = winner;
		langScores[slot] = (uint8_t)((winnerCount * 100.0) / total);
		// take the winner out of the next round
		langCount[winner] = 0;
		written++;
	}
	return written;
}
uint8_t LanguageIdentifier::findLangFromDMOZTopic(char *topic) {
int x;
for(x = 0; x < (int)(sizeof(langToTopic)/sizeof(uint8_t *)); x++) {
@ -614,228 +57,6 @@ uint8_t LanguageIdentifier::findLangFromDMOZTopic(char *topic) {
return(langUnknown);
}
// Guess a language from a url, falling back to treating the leading run of
// letters, '_' and '-' (at most six characters) as a country/language code.
// guessLanguageFromUrl() is tried first; if it reports langUnknown the
// leading code is passed to getLanguageFromCountryCode().
//
// The code buffer has one byte more than the six characters copied: the
// earlier six-byte buffer was left without a terminator when all six
// characters qualified, and was then handed to strcmp.
uint8_t LanguageIdentifier::guessGBLanguageFromUrl(char *url) {
	if(!url) return(langUnknown);
	uint8_t lang = guessLanguageFromUrl(url);
	if(lang != langUnknown)
		return(lang);
	enum { MAX_CODE_CHARS = 6 };
	char code[MAX_CODE_CHARS + 1];
	memset(code, 0, sizeof(code));
	for(int x = 0; x < MAX_CODE_CHARS; x++) {
		const char c = url[x];
		const bool isLetter = (c >= 'a' && c <= 'z') ||
				      (c >= 'A' && c <= 'Z');
		// stops at the terminating NUL as well
		if(!isLetter && c != '_' && c != '-')
			break;
		code[x] = c;
	}
	return(getLanguageFromCountryCode(code));
}
// True for 7-bit characters that are neither a digit nor an ASCII letter
// (this includes NUL, punctuation and '_'). Bytes of 128 and above are
// never boundaries.
static inline bool s_checkCharIsBoundary(uint8_t x) {
	const bool isDigit = (x >= '0' && x <= '9');
	const bool isUpper = (x >= 'A' && x <= 'Z');
	const bool isLower = (x >= 'a' && x <= 'z');
	return(x < 128 && !(isDigit || isUpper || isLower));
}
// True when the abbreviation at pointer ends on a boundary: either the
// character at offset 2 is a boundary, or offset 3 holds '-' or '_' and
// the character at offset 5 is a boundary. The language id l is not used.
static inline bool s_isRightBoundedAbbr(char *pointer, uint8_t l) {
	if(s_checkCharIsBoundary(pointer[2]))
		return(true);
	const char sep = pointer[3];
	if(sep != '-' && sep != '_')
		return(false);
	return(s_checkCharIsBoundary(pointer[5]));
}
// True when the character right after a language name is a boundary. The
// position is measured with the length of the native name of l first and,
// if that is not a boundary, with the length of getLanguageString(l).
static inline bool s_isRightBoundedLanguageWord(char *pointer, uint8_t l) {
	const char afterNative = pointer[gbstrlen(getNativeLanguageString(l))];
	if(s_checkCharIsBoundary(afterNative))
		return(true);
	const char afterName = pointer[gbstrlen(getLanguageString(l))];
	return(s_checkCharIsBoundary(afterName));
}
// Look for a url parameter that names a language: lan=, lang=, language=
// or l=. The key must start the string or follow a boundary character.
// The value is accepted when it is a language name or abbreviation that
// ends on a boundary. Returns the language id, or 0 when none is found.
//
// Changes from the earlier version:
//  - a key at the very start of url no longer reads the byte before the
//    buffer; the start of the string counts as a boundary;
//  - the value is only examined after one of the keys actually matched;
//  - the l= value is looked up after the key (previously the lookup was
//    given "l=..." itself and could never match);
//  - the scan no longer steps past the terminator when a key is the last
//    thing in the string.
uint8_t s_lookForLanguageParam(char *url) {
	uint8_t l;
	if(!url) return(0);

	// Try to find lan= or lang= or language=
	char *cp = url;
	while(*cp && (cp = strstr(cp, "lan"))) {
		int keyLen = 0;
		if(cp == url || s_checkCharIsBoundary(*(cp - 1))) {
			if(!strncmp(cp, "lan=", 4)) keyLen = 4;
			else if(!strncmp(cp, "lang=", 5)) keyLen = 5;
			else if(!strncmp(cp, "language=", 9)) keyLen = 9;
		}
		if(!keyLen) {
			cp++;
			continue;
		}
		// step over the key to the value
		cp += keyLen;
		if((l = getLanguageFromName((uint8_t*)cp)) &&
		   s_isRightBoundedLanguageWord(cp, l))
			return(l);
		if((l = getLanguageFromAbbrN(cp)) &&
		   s_isRightBoundedAbbr(cp, l))
			return(l);
	}

	// Try to find l=
	cp = url;
	while(*cp && (cp = strstr(cp, "l="))) {
		if(cp != url && !s_checkCharIsBoundary(*(cp - 1))) {
			cp++;
			continue;
		}
		// step over the key to the value
		cp += 2;
		if((l = getLanguageFromName((uint8_t*)cp)) &&
		   s_isRightBoundedLanguageWord(cp, l))
			return(l);
		if((l = getLanguageFromAbbrN(cp)) &&
		   s_isRightBoundedAbbr(cp, l))
			return(l);
	}
	return(0);
}
// Check whether the url (after an optional "http://") begins with a
// language abbreviation or a language name that ends on a boundary.
// Only the prefix is examined; postfixes and TLDs are deliberately not,
// as they are not good indications. Returns the language id or 0.
uint8_t s_lookForLanguagePrefix(char *url) {
	char *start = strncmp(url, "http://", 7) ? url : url + 7;

	uint8_t l = getLanguageFromAbbrN(start);
	if(l && s_isRightBoundedAbbr(start, l))
		return(l);

	// Lookup, and see if it's on a word boundary
	l = getLanguageFromName((uint8_t*)start);
	if(l && s_isRightBoundedLanguageWord(start, l))
		return(l);

	return(0);
}
// Guess a language from a url. Tried in order:
//  1. a language parameter (s_lookForLanguageParam);
//  2. a language prefix (s_lookForLanguagePrefix);
//  3. a two-letter TLD, looked up with getLanguageFromCountryCode().
// Returns langUnknown for a NULL or empty url or when nothing matches.
//
// Changes to step 3 from the earlier version:
//  - a leading "scheme://" is skipped; previously the first '/' found was
//    the one inside "http://", so no dot was ever found for such urls;
//  - the host ends at the first '/', ':', '?' or '#';
//  - only TLDs of exactly two characters are looked up; longer TLDs used
//    to be cut to their first two letters (".travel" became "tr");
//  - an empty url and a host ending in '.' no longer read outside the
//    string.
uint8_t LanguageIdentifier::guessLanguageFromUrl(char *url) {
	uint8_t l = 0;
	if(!url || !*url) return(langUnknown);
	// Look for a parameter that would indicate the language
	if((l = s_lookForLanguageParam(url))) return(l);
	// Look for a prefix that would indicate the language
	if((l = s_lookForLanguagePrefix(url))) return(l);

	// locate the host part
	char *host = url;
	char *scheme = strstr(url, "://");
	if(scheme) host = scheme + 3;
	char *hostEnd = host + strcspn(host, "/:?#");

	// find the last dot in the host; tld ends up just after it
	char *tld = hostEnd;
	while(tld > host && *(tld - 1) != '.')
		tld--;
	// No dot?
	if(tld <= host) return(langUnknown);
	// country-code TLDs are two letters
	if(hostEnd - tld != 2) return(langUnknown);

	char code[3];
	code[0] = tld[0];
	code[1] = tld[1];
	code[2] = 0;
	return(getLanguageFromCountryCode(code));
}
// Return the index of the largest value in "list".
//
// When the largest value occurs more than once the result is ambiguous and
// 0 is returned instead. 0 is also returned for a NULL or empty list.
//
// @param list     array of values, may be NULL
// @param numItems number of entries in list
// @return index of the unique maximum, or 0
static inline int s_findMaxInList(int *list, int numItems) {
	if(!list) return 0;

	int best     = INT_MIN; // largest value seen so far
	int prevBest = INT_MIN; // value that "best" replaced last
	int bestIdx  = 0;

	int i = 0;
	while(i < numItems) {
		const int value = list[i];
		// ">=" on purpose: a repeated maximum makes prevBest == best
		if(value >= best) {
			prevBest = best;
			best     = value;
			bestIdx  = i;
		}
		i++;
	}

	return (prevBest == best) ? 0 : bestIdx;
}
// Guess the page language by letting every text node vote.
//
// For each of the first "limit" nodes that is not a tag and is at least two
// bytes long, the phrase record returned by g_speller.getPhraseRecord() is
// parsed as a sequence of "<language id>\t<score>" fields. The language with
// the unique highest score in that record gets one vote (index 0 gets the
// vote when there is no unique maximum). The language with the most votes
// wins.
//
// Malformed records are tolerated: a language id outside
// [0, MAX_LANGUAGES) is ignored, and a record that ends before its score is
// not read past its terminator.
//
// @param xml       the page's xml object, may be NULL
// @param pageLimit maximum number of nodes to inspect
// @return the winning language index, or langUnknown when xml is NULL or
//         the final vote is tied
uint8_t LanguageIdentifier::guessLanguageFreqCount(Xml *xml,
		int pageLimit /* = 512 */) {
	if(!xml) return(langUnknown);
	int votes[MAX_LANGUAGES];
	int limit = xml->getNumNodes();
	int scores[MAX_LANGUAGES];
	if(pageLimit < limit) limit = pageLimit;
	memset(votes, 0, sizeof(int) * MAX_LANGUAGES);
	// Do term frequency count
	for(int x = 0; x < limit; x++) {
		if(xml->isTag(x) || xml->getNodeLen((int32_t)x) < 2) continue;
		char *cp = g_speller.getPhraseRecord(xml->getNode((int32_t)x),
				xml->getNodeLen((int32_t)x));
		if(!cp) continue;
		memset(scores, 0, sizeof(int) * MAX_LANGUAGES);
		while(*cp) {
			// skip leading whitespace
			while(*cp && (*cp == ' ' || *cp == '\t')) cp++;
			// get language
			int l = atoi(cp);
			// skip to next delimiter
			while(*cp && *cp != '\t') cp++;
			// record ended without a score field
			if(!*cp) break;
			// skip over tab
			cp++;
			// get score, ignoring ids that do not fit the table
			if(l >= 0 && l < MAX_LANGUAGES)
				scores[l] = atoi(cp);
			// skip to next delimiter
			while(*cp && *cp != '\t') cp++;
		}
		votes[s_findMaxInList(scores, MAX_LANGUAGES)]++;
	}
	// Find max. A later language with the same number of votes replaces
	// the earlier one and leaves oldmax == max, which marks a tie.
	int max = 0;
	int maxidx = 0;
	int oldmax = 0;
	for(int x = 0; x < MAX_LANGUAGES; x++) {
		if(votes[x] < max) continue;
		oldmax = max;
		max = votes[x];
		maxidx = x;
	}
	if(max == 0) maxidx = 0;
#if 0
	// English, British, and Australian are no longer separate
	// If it's a toss up between any version of English, go with it.
	if((max == langEnglish || max == langAustralia || max == langBritish) &&
	   (oldmax == langEnglish || oldmax == langAustralia || oldmax == langBritish))
		return(maxidx);
#endif // 0
	// Note the winner
	if(oldmax <= 0 || max > oldmax)
		return maxidx;
	return langUnknown;
}
uint8_t LanguageIdentifier::guessCountryTLD(const char *url) {
uint8_t country = 0;
char code[3];
@ -864,46 +85,3 @@ uint8_t LanguageIdentifier::guessCountryTLD(const char *url) {
}
return(country);
}
// Length of the word at the start of "str".
//
// A word ends at the first space, tab, newline, carriage return, ';', '.',
// ',' or at the end of the string.
//
// @param str NUL-terminated string
// @return number of characters before the first terminator
static int s_wordLen(char *str) {
	int n = 0;
	for(;;) {
		switch(str[n]) {
		case '\0':
		case ' ':
		case ';':
		case '\t':
		case '\n':
		case '\r':
		case '.':
		case ',':
			return n;
		default:
			n++;
			break;
		}
	}
}
// Tell whether the word at the start of "str" has the shape of a language
// tag: either exactly two characters, or exactly five characters with '_'
// or '-' in the middle position.
//
// @param str NUL-terminated string
// @return true if the leading word looks like a language tag
static bool s_isLangTag(char *str) {
	switch(s_wordLen(str)) {
	case 2:
		return true;
	case 5:
		return (str[2] == '_' || str[2] == '-');
	default:
		return false;
	}
}
// Translate the language tag at the start of "str" into a country index.
//
// The leading word is copied, lower-cased ('A' through 'Z' inclusive) and
// looked up with g_countryCode.getIndexOfAbbr(). For a five character tag
// with '_' or '-' at position 2 only the part after the separator is looked
// up; otherwise the whole word is.
//
// @param str NUL-terminated string starting with the tag
// @return the value of getIndexOfAbbr(), or 0 when the leading word does
//         not fit into the five character buffer
static uint8_t s_getCountryFromSpec(char *str) {
	char code[6];
	int len = s_wordLen(str);
	// keep room for the terminating NUL; longer words are not tags
	if(len < 0 || len >= (int)sizeof(code)) return 0;
	memset(code, 0, sizeof(code));
	gbmemcpy(code, str, len);
	for(int x = 0; x < len; x++)
		if(code[x] >= 'A' && code[x] <= 'Z') code[x] += ('a' - 'A');
	if(code[2] == '_' || code[2] == '-')
		return g_countryCode.getIndexOfAbbr(&code[3]);
	return g_countryCode.getIndexOfAbbr(code);
}
// Guess a country from a user agent string.
//
// Walks the string word by word and returns the country of the first word
// that looks like a language tag and resolves to a non-zero country.
//
// @param ua NUL-terminated user agent string, may be NULL
// @return the country index, or 0 if none was found
uint8_t LanguageIdentifier::guessCountryFromUserAgent(char *ua) {
	if(!ua) return 0;
	for(; *ua; ) {
		// move to the start of the next word
		ua = skipwhite(ua);
		if(!ua) break;
		if(s_isLangTag(ua)) {
			uint8_t country = s_getCountryFromSpec(ua);
			if(country != 0) return country;
		}
		// not a usable tag, step over this word
		ua = skipword(ua);
		if(!ua) break;
	}
	return 0;
}

@ -3,22 +3,12 @@
/// Contains the main utility function, guessLanguage(), and all
/// the support routines for detecting the language of a web page.
///
/// 2007 May 24 09:02:52
/// $ID$
/// $Author: John Nanney$
/// $Workfile$
/// $Log$
///
// using a different macro because there's already a Language.h
#ifndef LANGUAGEIDENTIFIER_H
#define LANGUAGEIDENTIFIER_H
#include "gb-include.h"
#include "Xml.h"
#include "Linkdb.h"
//#include "LinkInfo.h"
#include "Query.h"
/// Contains methods of language identification by various means.
class LanguageIdentifier {
@ -29,97 +19,6 @@ class LanguageIdentifier {
/// Destructor, does very little.
~LanguageIdentifier() { return; }
/// Get the language from the page's lang="" tag.
///
/// Looks for a lang="x" property in the HTML, BODY, or HEAD
/// tag. Returns the first match. This is usually a very
/// accurate guess of the language, since the author of the
/// page went through all the trouble to make sure it was
/// in there.
///
/// @param xml the page's xml object
///
/// @return the language, or langUnknown
///
uint8_t guessLanguageFromTag(Xml *xml);
/// Guess the language from the TLDs of outlinks found in the page.
///
/// TLDs which are ambiguous like .com are skipped.
///
/// @param links a list of links
///
/// @return the language, or langUnknown
///
uint8_t guessLanguageFromOutlinks(Links *links);
/// Guess the language from the page's TLD.
///
/// @param linktext the ascii URL
///
/// @return the language, or langUnknown
///
uint8_t guessLanguageFromTld(char *linktext);
/// Guess the language from the languages of the inlinks.
///
/// @param linkInfo
///
/// @return the language, or langUnknown
///
uint8_t guessLanguageFromInlinks(LinkInfo *linkInfo, int32_t ip);
/// Determine whether a given TLD is suitable for language detection.
/// @param tld the TLD in ascii
/// @param len the length of tld
/// @return true if suitable, false if not
///
inline bool isAmbiguousTLD(char *tld, int len);
/// Return the greater of two ints (a when both are equal).
inline int maxOf(int a, int b) {
	return (b > a) ? b : a;
}
/// Guesses language from the DOCTYPE string present in many pages.
///
/// @param xml the page's xml object
/// @param content the page's content, for finding the doctype
///
/// @return the language, or langUnknown
///
uint8_t guessLanguageFromDoctype(Xml *xml, char *content);
/// Guess a language from a tag in the user agent string.
///
/// @param str the user agent string
///
/// @return the language, or langUnknown
///
uint8_t guessLanguageFromUserAgent(char *str);
/// Find an address in DMOZ for the language.
///
/// Looks up the page address in the category language tables.
///
/// @param addr the page address
///
/// @return language, or langUnknown if not found
///
uint8_t guessLanguageFromDMOZ(char *addr);
/// Guess the query language from the query terms.
///
/// This algorithm looks for two consecutive terms with the
/// same language.
///
/// @param q the query object
///
/// @return the language, or langUnknown
///
uint8_t guessLanguageFromQuery(Query *q);
/// Find a language from DMOZ topic.
///
/// The function name is a bit misleading, we expect
@ -131,29 +30,7 @@ class LanguageIdentifier {
///
uint8_t findLangFromDMOZTopic(char *topic);
uint8_t getBestLanguage(char** method,
Url* url,
Xml* xml,
Links* links,
LinkInfo* linkInfo,
char* content);
uint8_t getBestLangsFromVec(char* langCount,
//SiteType* typeVec,
int32_t *langIds ,
uint8_t *langScores ,
int32_t tagVecSize);
uint8_t guessGBLanguageFromUrl(char *url);
uint8_t guessLanguageFromUrl(char *url);
uint8_t guessLanguageFreqCount(Xml *xml,
int pageLimit /* = 512 */);
uint8_t guessCountryTLD(const char *url);
uint8_t guessCountryFromUserAgent(char *ua);
};
extern class LanguageIdentifier g_langId;

@ -45,7 +45,7 @@ OBJS = UdpSlot.o Rebalance.o \
Speller.o \
PingServer.o StopWords.o TopTree.o \
Parms.o Pages.o \
Unicode.o iana_charset.o Iso8859.o \
Unicode.o iana_charset.o \
SearchInput.o \
Categories.o Msg2a.o PageCatdb.o PageDirectory.o \
SafeBuf.o Datedb.o \
@ -56,7 +56,7 @@ OBJS = UdpSlot.o Rebalance.o \
PageLogView.o Msg1f.o Blaster.o MsgC.o \
PageSpam.o Proxy.o PageThreads.o Linkdb.o \
matches2.o LanguageIdentifier.o \
Language.o Repair.o Process.o \
Repair.o Process.o \
Abbreviations.o \
RequestTable.o TuringTest.o Msg51.o \
Msg40.o Msg4.o SpiderProxy.o \
@ -477,9 +477,6 @@ Spider.o:
test_parser2.o:
$(CXX) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
Language.o:
$(CXX) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
PostQueryRerank.o:
$(CXX) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp

@ -11,8 +11,6 @@
static void gotReplyWrapper3a ( void *state , void *state2 ) ;
//static void gotRerankedDocIds ( void *state );
int32_t *g_ggg = NULL;
// Default constructor: all initialization is delegated to constructor().
Msg3a::Msg3a ( ) {
constructor();
}

@ -49,9 +49,6 @@ public:
void (* callback) ( void *state ) ,
class Host *specialHost = NULL );
bool gotTermFreqs();
// Msg40 calls this to get Query m_q to pass to Summary class
Query *getQuery ( ) { return m_q ; };

@ -25,7 +25,7 @@ bool printHttpMime ( class State0 *st ) ;
//static void handleRequest40 ( UdpSlot *slot , int32_t netnice );
//static void gotExternalReplyWrapper ( void *state , void *state2 ) ;
static void gotCacheReplyWrapper ( void *state );
//static void gotCacheReplyWrapper ( void *state );
static void gotDocIdsWrapper ( void *state );
static bool gotSummaryWrapper ( void *state );
//static void didTaskWrapper ( void *state );
@ -508,15 +508,15 @@ bool Msg40::gotExternalReply ( ) {
*/
// msg17 calls this after it gets a reply
// "state" is the Msg40 that issued the cache request. The caller's callback
// is only invoked when gotCacheReply() returns true.
void gotCacheReplyWrapper ( void *state ) {
Msg40 *THIS = (Msg40 *)state;
// reset g_errno, we're just a cache
g_errno = 0;
// handle the reply
// a false return means nothing more is done here
if ( ! THIS->gotCacheReply() ) return;
// otherwise, call callback
THIS->m_callback ( THIS->m_state );
}
//void gotCacheReplyWrapper ( void *state ) {
// Msg40 *THIS = (Msg40 *)state;
// // reset g_errno, we're just a cache
// g_errno = 0;
// // handle the reply
// if ( ! THIS->gotCacheReply() ) return;
// // otherwise, call callback
// THIS->m_callback ( THIS->m_state );
//}
bool Msg40::gotCacheReply ( ) {
// if not found, get the result the hard way

@ -1052,7 +1052,6 @@ void sleepWrapper1 ( int bogusfd , void *state ) {
int32_t docsWanted;
int32_t firstResultNum;
int32_t nqterms;
int32_t rerankRuleset;
int32_t wait;
char exact;
//int32_t hid = -1;

@ -2,9 +2,9 @@
#define _PAGERESULTS_H_
#include "SafeBuf.h"
#include "Language.h" // MAX_FRAG_SIZE
#include "Msg40.h"
#include "Msg0.h"
#include "Speller.h" // MAX_FRAG_SIZE
// height of each search result div in the widget
#define RESULT_HEIGHT 120

@ -1851,49 +1851,6 @@ bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
}
// get the collection rec
/*
CollectionRec *cr = g_collectiondb.getRec ( coll );
uint8_t *hp = NULL;
int32_t hpLen;
int64_t docsInColl = -1;
if ( ! cr ) {
// use the default
Parm *pp = g_parms.getParm ( "hp" );
if ( ! pp ) {
g_errno = ENOTFOUND;
g_msg = " (error: no such collection)";
return g_httpServer.sendErrorReply(s,500,
mstrerror(g_errno));
}
hp = (uint8_t*)pp->m_def;
if ( hp ) hpLen = uint8strlen ( hp );
if ( hpLen <= 0 || ! hp )
log(LOG_INFO,"http: No root page html present.");
} else {
if(cr->m_useLanguagePages) {
uint8_t lang = g_langId.guessGBLanguageFromUrl(r->getHost());
if(lang && (hp = g_languagePages.getLanguagePage(lang)) != NULL) {
hpLen = uint8strlen(hp);
// Set sort language as well
// This might not be a good idea, as it
// overrides any other setting. May be
// better to let the user agent string
// tell us what the user wants.
strcpy(cr->m_defaultSortLanguage,
getLanguageAbbr(lang));
}
}
if(!hp) {
hp = (uint8_t*)cr->m_htmlRoot;
hpLen = cr->m_htmlRootLen;
}
//RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , coll );
RdbBase *base = getRdbBase ( (uint8_t)RDB_CLUSTERDB , coll );
if ( base ) docsInColl = base->getNumGlobalRecs();
}
*/
// print the page out
/*
expandRootHtml ( sb,

@ -11,114 +11,6 @@
#include <stdio.h>
#include <ctype.h>
/*
static void handleRequestSpeller ( UdpSlot *slot , int32_t netnice );
static void gotSpellerReplyWrapper (void *state, void *state2);
bool Speller::registerHandler ( ) {
// . register ourselves with the udp server
// . it calls our callback when it receives a msg of type 0x39
if ( ! g_udpServer.registerHandler ( 0x3d, handleRequestSpeller ))
return false;
return true;
}
// . handle a request to get a linkInfo for a given docId/url/collection
// . returns false if slot should be nuked and no reply sent
// . sometimes sets g_errno on error
void handleRequestSpeller ( UdpSlot *slot , int32_t netnice ) {
// The request is the string to be spellchecked, null ended
char *request = slot->m_readBuf;
// first tells us if we should narrow the search stuff
bool narrowP = *(bool *) request;
request += sizeof(bool);
// is it found in dict or pop words
bool found;
int32_t score;
char reco[MAX_PHRASE_LEN];
int32_t pop;
int64_t start = gettimeofdayInMilliseconds();
bool recommendation = g_speller.m_language[langEnglish].
getRecommendation( request, gbstrlen(request),
reco, MAX_PHRASE_LEN,
&found, &score,
&pop );
log ( LOG_DEBUG,"speller: %s --> %s", request, reco );
int32_t numNarrow = 0;
char narrow[MAX_NARROW_SEARCHES * MAX_PHRASE_LEN];
int32_t narrowPops[MAX_NARROW_SEARCHES];
//if ( narrowP )
// numNarrow = g_speller.m_language[langEnglish].
// narrowPhrase ( request, narrow, narrowPops,
// MAX_NARROW_SEARCHES );
// calculate total reply size
// int32_t replySize = found + recommendation + score + pop + reco
int32_t replySize = sizeof(bool) + sizeof(bool) + 4 + 4 +
gbstrlen(reco) + 1;
if ( narrowP ){
replySize += 4; // numPhrases
for ( int32_t i = 0; i < numNarrow; i++ )
replySize += 4 + gbstrlen(&narrow[i*MAX_FRAG_SIZE]) + 1;
}
char *reply = (char*) mmalloc(replySize, "SpellerReplyBuf");
if ( !reply ) {
g_errno = ENOMEM;
//g_udpServer.sendReply_ass( NULL, 0, NULL, 0, slot );
g_udpServer.sendErrorReply( slot , g_errno );
return;
}
char *p = reply;
*(bool *)p = found;
p += sizeof(bool);
*(bool *)p = recommendation;
p += sizeof(bool);
// store the score and pop
*(int32_t *) p = score; p += 4;
*(int32_t *) p = pop; p += 4;
// store the recommendation
strcpy( p, reco );
p += gbstrlen(reco) + 1;
if ( narrowP ){
// store the number of narrow phrases found
*(int32_t *) p = numNarrow;
p += 4;
for ( int32_t i = 0; i < numNarrow; i++ ){
*(int32_t *)p = narrowPops[i];
p += 4;
strcpy(p, &narrow[i * MAX_FRAG_SIZE]);
p += gbstrlen(&narrow[i * MAX_FRAG_SIZE]) + 1;
}
}
//sanity check
if ( p - reply != replySize ){
char *xx = NULL; *xx = 0;
}
int64_t end = gettimeofdayInMilliseconds();
if ( end - start > 1 )
log (LOG_INFO,"speller: took %"INT64" ms to spellcheck "
"fragment %s", end- start, request);
g_udpServer.sendReply_ass ( reply ,
replySize,
reply ,
replySize,
slot );
}
*/
Speller g_speller;
Speller::Speller(){
@ -219,764 +111,8 @@ void Speller::test ( char *ff ) {
fclose(fd);
}
/*
///////////////////////////////////////////////////////
// RECOMMENDATION ROUTINES BELOW HERE
//
// These will spellcheck and give recommendations
///////////////////////////////////////////////////////
bool Speller::canStart( QueryWord *qw ) {
// can only start with a alpha character, no numeric
if ( ! is_alnum_utf8 ( qw->m_word+0 ) ) return false;
if ( qw->m_ignoreWord &&
qw->m_ignoreWord != IGNORE_CONNECTED &&
qw->m_ignoreWord != IGNORE_QUOTED ) return false;
// don't check 'rom' in phrase "cd-rom", or 't' in "ain't"
if ( qw->m_leftConnected )
return false;
// don't start with a stop word
if ( qw->m_isStopWord )
return false;
// a lot of field terms should not be spell checked
if ( qw->m_fieldCode ) {
if ( qw->m_fieldCode != FIELD_TITLE &&
qw->m_fieldCode != FIELD_CITY &&
qw->m_fieldCode != FIELD_AUTHOR &&
qw->m_fieldCode != FIELD_COUNTRY )
return false;
}
return true;
}
// . returns false if blocked
// recommended something different than original query, "q"
// and false otherwise
// . also returns false and sets g_errno on error
// . stores recommended query in "dst" and NULL terminates it
// . if dst is too small it will bitch and return true with g_errno set
bool Speller::getRecommendation ( Query *q,
bool spellcheck,
char *dst, // recommendation destination
int32_t dstLen, // recommendation max len
bool narrowSearch,
char *narrow, // narrow search
int32_t narrowLen, // narrow search len
int32_t *numNarrows, // num narrows found
void *state,
void (*callback)(void *state) ){
*dst = '\0';
*narrow = '\0';
// no narrowing search if spellchecking is off
if ( !spellcheck )
return true;
// don't spellcheck queries that are more than MAX_FRAG_SIZE int32_t.
if ( q->getQueryLen() >= MAX_FRAG_SIZE )
return true;
StateSpeller *st ;
try { st = new (StateSpeller); }
catch ( ... ) {
g_errno = ENOMEM;
log("Speller: new(%i): %s", sizeof(StateSpeller),
mstrerror(g_errno));
return true;
}
mnew ( st , sizeof(StateSpeller) , "State00" );
st->m_state = state;
st->m_callback = callback;
st->m_q = q;
st->m_spellcheck = spellcheck;
st->m_dst = dst;
st->m_dend = dst + dstLen;
st->m_narrowSearch = narrowSearch;
st->m_nrw = narrow;
st->m_nend = narrow + narrowLen;
st->m_numNarrow = numNarrows;
*st->m_numNarrow = 0;
st->m_start = gettimeofdayInMilliseconds();
st->m_numFrags = 0;
st->m_numFragsReceived = 0;
// . break query down into fragments
// . each fragment is a string of words
// . quotes and field names will separate fragments
// . TODO: make field data in its own fragment
int32_t nqw = q->m_numWords;
for ( int32_t i = 0 ; i < nqw ; i++ ) {
// get a word in the Query to start a fragment with
QueryWord *qw = &q->m_qwords[i];
// can he start the phrase?
if ( ! canStart( qw ) )
continue;
bool inQuotes = qw->m_inQuotes;
char fieldCode = qw->m_fieldCode;
// . get longest continual fragment that starts with word #i
// . get the following words that can be in a fragment
// that starts with word #i
// . start of the frag
int32_t endQword = i;
int32_t startQword = i;
for ( ; i < nqw ; i++ ) {
// . skip if we should
// . keep punct, however
QueryWord *qw1 = &q->m_qwords[i];
if ( qw1->m_opcode ) break;
if ( qw1->m_inQuotes != inQuotes ) break;
if ( qw1->m_fieldCode != fieldCode ) break;
if ( qw1->m_ignoreWord == IGNORE_FIELDNAME ) break;
if ( qw1->m_phraseSign &&
!qw1->m_rightConnected ) break;
// are we punct?
if ( ! is_alnum_utf8(qw1->m_word) )
endQword = i - 1;
else
endQword = i;
}
// revisit this i in big loop since we did not include it
i = endQword;
//create a new stateFrag
StateFrag *stFrag;
try { stFrag = new (StateFrag); }
catch ( ... ) {
mdelete ( st, sizeof(StateSpeller), "StateSpeller" );
delete (st);
g_errno = ENOMEM;
log("Speller: new(%i): %s", sizeof(StateFrag),
mstrerror(g_errno));
//continue;
return true;
}
mnew ( stFrag, sizeof(StateFrag),
"StateFrag" );
stFrag->m_state = (void*) st;
stFrag->m_narrowPhrase = st->m_narrowSearch;
stFrag->m_q = q;
stFrag->m_startQword = startQword;
stFrag->m_endQword = endQword;
stFrag->m_errno = 0;
st->m_stFrag[st->m_numFrags] = stFrag;
st->m_numFrags++;
// blocked
if ( !getRecommendation( stFrag ) ){
continue;
}
st->m_numFragsReceived++;
}
// if outstanding frags
if ( st->m_numFragsReceived < st->m_numFrags )
return false;
gotFrags(st);
// delete state
mdelete ( st, sizeof(StateSpeller), "StateSpeller" );
delete (st);
return true;
}
bool Speller::getRecommendation ( StateFrag *st ){
st->m_recommended = false;
st->m_numFound = 0;
st->m_numNarrowPhrases = 0;
char *dst = st->m_dst;
// normalize this fragment and store in "dst"
bool wasAlnum = true;
for ( int32_t i = st->m_startQword; i <= st->m_endQword; i++ ){
// start of each word
st->m_wp[i] = dst;
char *p = st->m_q->m_qwords[i].m_word;
int32_t plen = st->m_q->m_qwords[i].m_wordLen;
for ( int32_t j = 0; dst-st->m_dst <MAX_FRAG_SIZE&&j<plen;j++ ) {
if ( !getClean_utf8(p+j) )
continue;
// skip back to back punct/spaces
if (j>0 && !is_alnum_utf8(p+j) &&!wasAlnum)
continue;
*dst = p[j];
dst++;
wasAlnum = is_alnum_utf8 ( p+j );
}
st->m_wplen[i] = dst - st->m_wp[i];
st->m_isfound[i] = false;
}
*dst = '\0';
// debug msg
log(LOG_DEBUG,"speller: Getting recommendation for frag=%s",
st->m_dst);
// give each word in the phrase a chance to start the subphrase
int32_t maxPhrase = st->m_endQword - st->m_startQword;
if ( maxPhrase > MAX_WORDS_PER_PHRASE )
maxPhrase = MAX_WORDS_PER_PHRASE;
// store the phraseLen and posn
st->m_pLen = maxPhrase;
st->m_pPosn = st->m_startQword;
return launchReco(st);
}
bool Speller::launchReco(StateFrag *st){
// if we checked all the phrases or found all the words
if ( st->m_numFound == st->m_endQword - st->m_startQword + 1 ||
st->m_pLen < 0 ){
return true;
}
bool launchPhrase = false;
for ( ; st->m_pLen >= 0; st->m_pLen-- ){
for ( ; st->m_pPosn + st->m_pLen <= st->m_endQword;
st->m_pPosn++ ) {
// find a word that can start the phrase
QueryWord *qw = &st->m_q->m_qwords[st->m_pPosn];
if ( !canStart (qw) )
continue;
// don't do this phrase if we have found even one
// word in the phrase
bool found = false;
for ( int32_t k = st->m_pPosn;
k <= st->m_pPosn + st->m_pLen; k++ ) {
if ( st->m_isfound[k] ){
found = true;
break;
}
}
if ( found )
continue;
// cannot end on a stop word, punct, right-connected
// word
QueryWord *qwEnd =
&st->m_q->m_qwords[st->m_pPosn + st->m_pLen];
if ( qwEnd->m_isStopWord || qwEnd->m_isPunct ||
qwEnd->m_rightConnected )
continue;
// found someone to start the phrase with
// what is the new phrase parms?
st->m_a = st->m_wp[st->m_pPosn];
st->m_b = st->m_wp[st->m_pLen + st->m_pPosn]+
st->m_wplen[st->m_pLen + st->m_pPosn];
// also store the tmp char that we are changing
st->m_c = *(st->m_b);
*(st->m_b) = '\0';
// if it is just a number, don't get recommendation
// lest we embarrass ourselves
if ( st->m_pPosn == 0 && is_digit(st->m_a[0]) ) {
char *k = st->m_a+1;
while ( is_digit(*k) ) k++;
if ( ! *k ) {
*st->m_b = st->m_c ;
continue;
}
}
// if it is an adult phrase, don't get a recommendation
// check if isAdult really finds a word.
char *adultLoc = NULL;
if ( isAdult(st->m_a, gbstrlen(st->m_a), &adultLoc) &&
( adultLoc == st->m_a || *(adultLoc-1) == ' ' ) ){
// mark as found
for ( int32_t k = st->m_pPosn;
k <= st->m_pPosn + st->m_pLen; k++ )
st->m_isfound[k] = true;
*(st->m_b) = st->m_c;
continue;
}
// if the phrase is in dict or in the top pop words,
// phrase is found. Don't check if we are narrowing
// the phrase because we need to multicast anyways
uint64_t h ;
h = hash64d(st->m_a, gbstrlen(st->m_a) );
if ( !st->m_narrowPhrase &&
getPhrasePopularity( st->m_a, h, false ) > 0 ){
// mark as found
for ( int32_t k = st->m_pPosn;
k <= st->m_pPosn + st->m_pLen; k++ )
st->m_isfound[k] = true;
*(st->m_b) = st->m_c;
continue;
}
launchPhrase = true;
break;
}
if ( launchPhrase )
break;
st->m_pPosn = st->m_startQword;
}
if ( st->m_pLen < 0 ){
return true;
}
// debug msg
log(LOG_DEBUG,"speller: ----------");
log(LOG_DEBUG,"speller: Checking phrase=%s", st->m_a);
// launch for all the splits
st->m_numRequests = 0;
st->m_numReplies = 0;
int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
// don't send to twins...
hostsPerSplit /= g_hostdb.m_numHostsPerShard;
int32_t mySplit = g_hostdb.m_hostId % g_hostdb.m_indexSplits;
int32_t key = st->m_q->getQueryHash();//0;
int32_t timeout = 30;
int32_t niceness = 0;
char request[MAX_FRAG_SIZE + 4];
char *p = request;
*(bool *)p = st->m_narrowPhrase;
p += sizeof(bool);
strcpy ( p, st->m_a );
// send the null end too
p += gbstrlen(st->m_a)+1;
int32_t plen = p - request;
for ( int32_t i = 0; i < hostsPerSplit; i++ ){
// get the hostId of the host we're sending to
uint32_t hostId =
mySplit + ( i * g_hostdb.m_indexSplits );
Host *h = g_hostdb.getHost(hostId);
st->m_mcast[i].reset();
bool status = st->m_mcast[i].
send(request ,
plen , // request size
0x3d , // msgType 0x3d
false , // multicast owns m_request?
h->m_groupId, // group to send to (groupKey)
false , // send to whole group?
key ,
st , // state data
NULL , // state data
gotSpellerReplyWrapper ,
timeout , // in seconds
niceness ,
false , // realtime?
-1 , // m_q->m_bestHandlingHostId ,
NULL , // m_replyBuf ,
0 , // MSG39REPLYSIZE,
// this is true if multicast should free
// the
// reply, otherwise caller is responsible
// for freeing it after calling
// getBestReply).
// actually, this should always be false,
// there
// is a bug in Multicast.cpp.
false );
if (!status){
st->m_numReplies++;
log("speller: Multicast had error: %s",
mstrerror(g_errno));
st->m_errno = g_errno;
continue;
}
// blocked
else
st->m_numRequests++;
}
if ( st->m_numReplies == st->m_numRequests )
return true;
return false;
}
void gotSpellerReplyWrapper( void *state, void *state2 ){
StateFrag *stFrag = (StateFrag *) state;
stFrag->m_numReplies++;
if ( stFrag->m_numReplies < stFrag->m_numRequests )
return;
// blocked
if ( !g_speller.gotSpellerReply(stFrag) )
return;
StateSpeller *st = (StateSpeller *)stFrag->m_state;
// One more frag received
st->m_numFragsReceived++;
if ( st->m_numFragsReceived < st->m_numFrags )
return;
g_speller.gotFrags(st);
// callback
st->m_callback( st->m_state );
// delete state
mdelete ( st, sizeof(StateSpeller), "StateSpeller" );
delete (st);
}
bool Speller::gotSpellerReply( StateFrag *st ){
int32_t minScore = LARGE_SCORE;
int32_t maxPop = -1;
char *bestReco = NULL;
char *reply[MAX_UNIQUE_HOSTS_PER_SPLIT];
int32_t replySize[MAX_UNIQUE_HOSTS_PER_SPLIT];
int32_t replyMaxSize[MAX_UNIQUE_HOSTS_PER_SPLIT];
bool freeit;
bool found = false; //phrase was found in dict or pop words
int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
// don't send to twins...
hostsPerSplit /= g_hostdb.m_numHostsPerShard;
int32_t numNarrowPhrases[MAX_UNIQUE_HOSTS_PER_SPLIT];
char *narrowPtrs[MAX_UNIQUE_HOSTS_PER_SPLIT];
// init narrowSearch arrays
for ( int32_t i = 0; i < MAX_UNIQUE_HOSTS_PER_SPLIT; i++ ){
numNarrowPhrases[i] = 0;
narrowPtrs[i] = NULL;
}
for ( int32_t i = 0; i < hostsPerSplit; i++ ){
reply[i] = st->m_mcast[i].getBestReply( &replySize[i] ,
&replyMaxSize[i] ,
&freeit );
// multicast may have an empty reply buffer if there was an
// OOM error or something. m_errno should have been set, but
// we have to loop through all the multicasts to free the
// reply buffers.
char *p = reply[i];
if ( g_errno || st->m_errno || !p){
continue;
}
// was it found in dict
bool foundInDict = *(bool *)p;
p += sizeof(bool);
if ( foundInDict )
found = true;
// first is if there is a recommendation or not
bool recommendation = *(bool *) p;
p += sizeof (bool);
if ( !recommendation && !st->m_narrowPhrase )
continue;
int32_t score = *(int32_t *)p;
p += 4;
int32_t pop = *(int32_t *)p;
p += 4;
if ( recommendation ){
log ( LOG_DEBUG,"speller: Received reco %s, "
"score=%"INT32", pop=%"INT32"", p, score, pop );
// we have a recommendation with score and pop
// choose the one with the lowest score, and if the
// score is same then the max pop
// HACK: we are getting bad recommendations for smaller
// popularities. So don't consider them
if ( pop > 8 && ( score < minScore ||
( score == minScore && pop > maxPop ) ) ){
bestReco = p;
minScore = score;
maxPop = pop;
}
}
p += gbstrlen(p) + 1;
if ( st->m_narrowPhrase ){
numNarrowPhrases[i] = *(int32_t *)p;
p += 4;
narrowPtrs[i] = p;
}
}
// merge all the narrow results
if ( st->m_narrowPhrase ){
int32_t currPhrase[MAX_UNIQUE_HOSTS_PER_SPLIT];
for ( int32_t i = 0; i < MAX_UNIQUE_HOSTS_PER_SPLIT; i++ )
currPhrase[i] = 0;
for ( int32_t i = 0; i < MAX_NARROW_SEARCHES; i++ ){
int32_t maxHost = -1;
int32_t maxPop = 0;
for ( int32_t j = 0; j < hostsPerSplit; j++ ){
if ( numNarrowPhrases[j] <= currPhrase[j] )
continue;
int32_t pop = *(int32_t *)narrowPtrs[j];
if ( pop <= maxPop )
continue;
maxPop = pop;
maxHost = j;
}
if ( maxHost < 0 )
break;
//
narrowPtrs[maxHost] += 4;
strcpy( st->m_narrowPhrases[i], narrowPtrs[maxHost] );
narrowPtrs[maxHost] +=gbstrlen(narrowPtrs[maxHost]) + 1;
currPhrase[maxHost]++;
st->m_numNarrowPhrases++;
}
}
// make narrowPhrase false here, so that its not launched a second time
// for the same frag;
st->m_narrowPhrase = false;
// revert
*(st->m_b) = st->m_c;
// if we found a recommendation,or if the phrase was found in the
// dictionary or pop words then mark all the
// words that fall under the phrase as found
if ( found || bestReco ){
for ( int32_t k = st->m_pPosn;
k <= st->m_pLen + st->m_pPosn; k++ )
st->m_isfound[k] = true;
st->m_numFound += st->m_pLen + 1;
}
// if not found in the dictionary or a recommendation, copy the phrase
if ( !found && bestReco){
// this fragment is going to be recommended
st->m_recommended = true;
// insert our recommendation into the phrase to get a new one
char *s1 = st->m_wp[st->m_startQword];
int32_t slen1 = st->m_a - st->m_wp[st->m_startQword];
char *s2 = bestReco;
int32_t slen2 = gbstrlen(bestReco);
char *s3 = st->m_b ;
// store the difference in length between the reco and the
// original string
int32_t diff = slen2 - ( st->m_b - st->m_a );
int32_t slen3 = st->m_wp[st->m_endQword] +
st->m_wplen[st->m_endQword] - st->m_b;
if ( slen3 < 0 )
slen3 = 0;
int32_t tlen = slen1 + slen2 + slen3 ;
if ( tlen > MAX_FRAG_SIZE ){
log(LOG_LOGIC,"speller: buf too small. Fix me 3.");
// blocked
if ( !launchReco(st) )
return false;
return true;
}
// make substitution and store in "dst"
char buf2 [ MAX_FRAG_SIZE];
char *nf = buf2;
gbmemcpy ( nf , s1 , slen1 ) ; nf += slen1;
gbmemcpy ( nf , s2 , slen2 ) ; nf += slen2;
gbmemcpy ( nf , s3 , slen3 ) ;
nf += slen3;
// don't forget to NULL terminate
*nf = '\0';
// debug msg
log( LOG_DEBUG,"speller: Trying substitution \"%s\"",
buf2 );
strcpy ( st->m_dst , buf2 );
// the pointers might have to be changed if the
// recommendation was not of the same length as the words
if ( diff != 0 ){
for ( int32_t k = st->m_pLen+st->m_pPosn+1;
k <= st->m_endQword; k++ )
st->m_wp[k] += diff;
}
}
// don't forget to free the replies
for ( int32_t i = 0; i < hostsPerSplit; i++ )
if ( reply[i] && replyMaxSize[i] > 0 )
mfree( reply[i], replyMaxSize[i], "SpellerReplyBuf" );
// go to the next position in the phrase. if we have reached the end
// of the phrase position, decrement the phrase length and start again
if ( st->m_pPosn + st->m_pLen >= st->m_endQword - 1 ){
st->m_pLen--;
st->m_pPosn = st->m_startQword;
}
else
st->m_pPosn++;
if ( !launchReco(st) )
return false;
return true;
}
*/
// . break a NULL-terminated string down into a list of ptrs to the words
// . return the number of words stored into "wp"
/*
int32_t Speller::getWords ( const char *s ,
char *wp [MAX_FRAG_SIZE] ,
int32_t wplen [MAX_FRAG_SIZE] ,
bool *isstop ) {
int32_t nwp = 0;
loop:
// skip initial punct
while ( *s && ! is_alnum ( *s ) ) s++;
// bail if done
if ( ! *s ) return nwp;
// point to word
wp [ nwp ] = (char *)s;
// convenience ptr
char *ww = (char *)s;
// count over it
while ( is_alnum ( *s ) ) s++;
// how long is the word?
int32_t slen = s - wp [ nwp ];
// set length
wplen [ nwp ] = slen ;
// is it a stop word?
if ( isstop ) {
// TODO: make the stop words utf8!!!
int64_t h = hash64Lower_utf8 ( ww , slen ) ;
bool stop = ::isStopWord ( ww , slen , h ) ;
// BUT ok if Capitalized or number
if ( stop ) {
if ( is_digit (ww[0]) ) stop = false;
if ( is_cap (ww,slen ) ) stop = false;
// e-mail, c file, c. s. lewis
if ( slen == 1 && ww[0] != 'a' ) stop = false;
}
isstop[nwp] = stop;
}
nwp++;
goto loop;
}
*/
/*
void Speller::gotFrags( void *state ){
StateSpeller *st = (StateSpeller *) state;
char *dptr = st->m_dst;
char *nptr = st->m_nrw;
bool recommendation = false;
Query *q = st->m_q;
// . break query down into fragments
// . each fragment is a string of words
// . quotes and field names will separate fragments
// . TODO: make field data in its own fragment
int32_t nqw = q->m_numWords;
int32_t currFrag = 0;
for ( int32_t i = 0 ; i < nqw ; i++ ) {
// get a word in the Query to start a fragment with
QueryWord *qw = &q->m_qwords[i];
// if he has a phraseSign, put it right away
//if ( qw->m_phraseSign ) {
// *dptr = qw->m_phraseSign;
// dptr++;
// }
// can he start the phrase?
// if he can't start our fragment, just copy over to "dst"
if ( !canStart( qw )) {
// copy to rp and get next word
char *w = qw->m_word;
int32_t wlen = qw->m_wordLen;
if ( dptr + wlen >= st->m_dend ) {
g_errno = EBUFTOOSMALL; continue; }
// watch out for LeFtP and RiGhP
if ( qw->m_opcode == OP_LEFTPAREN ) *dptr++ = '(';
else if ( qw->m_opcode == OP_RIGHTPAREN) *dptr++ = ')';
else if ( qw->m_opcode == OP_PIPE ) *dptr++ = '|';
else {
gbmemcpy ( dptr , w , wlen );
dptr += wlen;
}
*dptr = '\0';
continue;
}
bool inQuotes = qw->m_inQuotes;
char fieldCode = qw->m_fieldCode;
// . get longest continual fragment that starts with word #i
// . get the following words that can be in a fragment
// that starts with word #i
// . start of the frag
int32_t endQword = i;
for ( ; i < nqw ; i++ ) {
// . skip if we should
// . keep punct, however
QueryWord *qw1 = &q->m_qwords[i];
if ( qw1->m_opcode ) break;
if ( qw1->m_inQuotes != inQuotes ) break;
if ( qw1->m_fieldCode != fieldCode ) break;
if ( qw1->m_ignoreWord== IGNORE_FIELDNAME ) break;
if ( qw1->m_phraseSign && !qw1->m_rightConnected )
break;
// are we punct?
if ( ! is_alnum_utf8 (qw1->m_word) )
endQword = i - 1;
else
endQword = i;
}
// revisit this i in big loop since we did not include it
i = endQword;
// OOM errors might cause us not to launch frags
if ( currFrag >= st->m_numFrags )
continue;
StateFrag *stFrag = st->m_stFrag[currFrag];
// don't breach
if ( dptr + gbstrlen(stFrag->m_dst) >= st->m_dend ) {
g_errno = EBUFTOOSMALL;
}
else {
// store it
strcpy ( dptr, stFrag->m_dst );
dptr += gbstrlen ( dptr );
// add a space between fragments
// *dptr = ' ';
//dptr++;
*dptr = '\0';
// set the flag
if ( stFrag->m_recommended )
recommendation = true;
}
// copy over all the narrow searches that can fit
for ( int32_t j = 0; j < stFrag->m_numNarrowPhrases; j++ ){
// don't breach
if ( nptr +gbstrlen(stFrag->m_narrowPhrases[j]) >
st->m_nend )
break;
strcpy(nptr, stFrag->m_narrowPhrases[j]);
nptr += gbstrlen(stFrag->m_narrowPhrases[j]) + 1;
(*st->m_numNarrow)++;
}
mdelete(stFrag, sizeof(StateFrag), "StateFrag");
delete (stFrag);
// now we get the next frag
currFrag++;
}
if ( !recommendation )
*st->m_dst = '\0';
int64_t now = gettimeofdayInMilliseconds();
if ( now - st->m_start > 50 )
log(LOG_INFO,"speller: Took %"INT64" ms to spell check %s",
now - st->m_start, st->m_q->getQuery() );
return;
}
*/
// . dictionary generation entry point; the actual generation call below is
//   commented out, so neither "numWordsToDump" nor "coll" is used
// . as visible here it only calls setLang(2) on m_language[2]
// . always returns false
bool Speller::generateDicts ( int32_t numWordsToDump , char *coll ){
m_language[2].setLang(2);
//m_language[2].setLang(2);
//m_language[2].generateDicts ( numWordsToDump, coll );
return false;
}
@ -1776,107 +912,6 @@ bool Speller::findNext( char *s, char *send, char **nextWord, bool *isPorn,
return false;
}*/
// . merge the per-language dictionary files into "<g_hostdb.m_dir>unifiedDict.txt"
// . for each language i the files
//     dict/<abbr>/<abbr>.wl.phonet
//     dict/<abbr>/<abbr>.query.phonet.top
//     dict/<abbr>/<abbr>.dict.0 .. <abbr>.dict.<NUM_CHARS-1>
//   are loaded into ht[i] by populateHashTable()
// . every distinct key is then written exactly once as one line:
//     <tuple text after its first tab>[\t]{\t<langId>\t<pop>}...\n
//   where the extra tab is added when the tuple has no second field (the
//   original comments call that field the "phonet" and say such entries
//   come from the titleRec; their pop is written negated)
// . any existing unifiedDict.txt is deleted first
// . returns true on success; on failure logs and returns whatever log()
//   returns
// . lines that are malformed (no tab after the score) or that do not fit in
//   the line buffer are logged and skipped instead of overrunning the buffer
// . NOTE(review): values are read back from the tables as "char *" while the
//   tables are created with set(8,4,...) -- presumably key size 8 / value
//   size 4. confirm that is wide enough for a pointer on 64-bit builds.
bool Speller::createUnifiedDict (){
	// first get all the tuples from the wordlist, query and dict files
	HashTableX ht[MAX_LANGUAGES];
	char ff[1024];
	for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ){
		ht[i].set ( 8,4,0,NULL,0,false,0,"cud");
		// snprintf so an overly long m_dir cannot smash the stack
		snprintf ( ff , sizeof(ff) , "%sdict/%s/%s.wl.phonet", g_hostdb.m_dir,
			   getLanguageAbbr(i), getLanguageAbbr(i) );
		populateHashTable(ff, &ht[i], i);
		snprintf ( ff , sizeof(ff) , "%sdict/%s/%s.query.phonet.top", g_hostdb.m_dir,
			   getLanguageAbbr(i), getLanguageAbbr(i) );
		populateHashTable(ff, &ht[i], i);
		for ( int32_t j = 0; j < NUM_CHARS; j++ ){
			snprintf ( ff , sizeof(ff) , "%sdict/%s/%s.dict.%"INT32"", g_hostdb.m_dir,
				   getLanguageAbbr(i), getLanguageAbbr(i), j );
			populateHashTable(ff, &ht[i], i);
		}
	}

	// build the output filename, refusing a truncated path because we
	// are about to unlink() it
	int32_t flen = snprintf ( ff, sizeof(ff), "%sunifiedDict.txt",g_hostdb.m_dir );
	if ( flen < 0 || flen >= (int32_t)sizeof(ff) )
		return log("lang: Path to unifiedDict.txt is too long.");
	// delete it first
	unlink ( ff );
	// then open a new one for appending
	int fdw = open ( ff ,
			 O_CREAT | O_RDWR | O_APPEND ,
			 getFileCreationFlags());
	if ( fdw < 0 ){
		return log("lang: Could not open %s for "
			   "writing: %s.",ff, strerror(errno));
	}
	log(LOG_INIT,"spell: Making %s.", ff );

	// keys we have already written out, so each one is emitted only once
	HashTableX phrases;
	phrases.set(8,4,0,NULL,0,false,0,"phud");

	char buf[1024];
	// always keep 2 bytes in reserve for the trailing "\n\0"
	char *bufEnd = buf + sizeof(buf) - 2;

	for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ){
		// get each slot
		for ( int32_t j = 0; j < ht[i].getNumSlots(); j++ ){
			uint64_t key = *(uint64_t *)ht[i].getKey(j);
			// a zero key is skipped
			if ( key == 0 )
				continue;
			// skip if this key was already written
			if ( phrases.getSlot(&key) != -1 )
				continue;
			char *tuple = *(char **)ht[i].getValueFromSlot(j);
			// skip the score, which ends at the first tab. a tuple
			// without a tab is malformed; do not scan past its end
			char *tab = strchr ( tuple , '\t' );
			if ( ! tab ) {
				log("lang: Skipping malformed dict tuple \"%s\".",
				    tuple);
				continue;
			}
			tuple = tab + 1;
			// here we print the phrase and the phonet if present
			char *p = buf;
			int32_t room = bufEnd - p;
			int32_t n = snprintf ( p , room , "%s" , tuple );
			bool fits = ( n >= 0 && n < room );
			if ( fits ) p += n;
			// if there wasn't a phonet, its from the titleRec.
			// add another tab
			bool fromTitleRec = ( strchr ( tuple , '\t' ) == NULL );
			if ( fits && fromTitleRec ) {
				// p < bufEnd here, so there is room for this
				*p++ = '\t';
				*p   = '\0';
			}
			// append "\t<langId>\t<pop>" for every language that
			// has this key
			for ( int32_t k = 0; fits && k < MAX_LANGUAGES; k++ ){
				int32_t slot = ht[k].getSlot(&key);
				if ( slot == -1 )
					continue;
				char *val = *(char **)ht[k].getValueFromSlot(slot);
				// the tuple starts with its score
				int32_t pop = atoi(val);
				if ( fromTitleRec ) pop *= -1;
				room = bufEnd - p;
				n = snprintf(p,room,"\t%"INT32"\t%"INT32"",k,pop);
				if ( n < 0 || n >= room ) { fits = false; break; }
				p += n;
			}
			if ( ! fits ) {
				log("lang: Skipping dict tuple too long for "
				    "unified dict: \"%s\".", tuple);
				continue;
			}
			// write out the trailing \n as well
			*p++ = '\n';
			*p   = '\0';
			int32_t bufLen = p - buf;
			int32_t wn = write ( fdw , buf , bufLen ) ;
			if ( wn != bufLen ) {
				// log first so close() cannot clobber errno
				bool status = log("lang: write: %s",
						  strerror(errno));
				close ( fdw );
				return status;
			}
			int32_t val = 1;
			phrases.addKey(&key, &val);
		}
	}
	// release the descriptor; a failed close can mean lost data
	if ( close ( fdw ) != 0 )
		return log("lang: close: %s",strerror(errno));
	return true;
}
bool Speller::populateHashTable( char *ff, HashTableX *htable,
unsigned char langId ){

@ -9,11 +9,15 @@
#ifndef _SPELLER_H_
#define _SPELLER_H_
#define MAX_FRAG_SIZE 1024
// max int32_t returned by getPhrasePopularity() function
#define MAX_PHRASE_POP 16800
#include "StopWords.h"
#include "Language.h"
#include "Query.h"
#include "Multicast.h"
// . the height and width of m_stable[][] that takes a letter pair as an index
// . valid chars are returned by isValidChar() routine
// . we use A-Z, 0-9, space, hyphen, apostrophe and \0... that's it
@ -48,7 +52,7 @@ class StateFrag{
char m_c;
bool m_narrowPhrase;
int32_t m_numNarrowPhrases;
char m_narrowPhrases[MAX_NARROW_SEARCHES][MAX_FRAG_SIZE];
//char m_narrowPhrases[MAX_NARROW_SEARCHES][MAX_FRAG_SIZE];
};
@ -99,10 +103,10 @@ class Speller {
bool findNext( char *s, char *send, char **nextWord, bool *isPorn,
unsigned char langId, int32_t encodeType );
int32_t checkDict ( char *s, int32_t slen, char encodeType,
unsigned char lang = langEnglish ){
return m_language[lang].checkDict(s,slen,encodeType);
}
// int32_t checkDict ( char *s, int32_t slen, char encodeType,
// unsigned char lang = langEnglish ){
// return m_language[lang].checkDict(s,slen,encodeType);
// }
// should be same hash algo to make wordId
bool isInDict ( uint64_t wordId ) {
@ -137,11 +141,10 @@ class Speller {
int32_t wplen [MAX_FRAG_SIZE] ,
bool *isstop );
Language m_language[MAX_LANGUAGES];
// Language m_language[MAX_LANGUAGES];
char *getRandomWord() ;
bool loadUnifiedDict();
bool createUnifiedDict ();
void dictLookupTest ( char *ff );

@ -3,6 +3,7 @@
#include "Words.h"
//#include "AppendingWordsWindow.h"
#include "Sections.h"
#include "Msg20.h"
Summary::Summary()
: m_summaryLocs(m_summaryLocBuf,

@ -7266,118 +7266,6 @@ char XmlDoc::computeLangId ( Sections *sections , Words *words, char *lv ) {
}
return maxi;
//m_langId = maxi;
//m_langIdValid = true;
//return &m_langId;
/*
int32_t freqScore = 0;
int32_t lang;
if ( ! m_processedLang ) {
// do not repeat this call for this document
m_processedLang = true;
lang = words->getLanguage( sections ,
1000 , // sampleSize ,
m_niceness,
&freqScore);
// return NULL on error with g_errno set
if ( lang == -1 ) return NULL;
// we got it from words, return
if ( lang != 0 ) {
m_langId = lang;
m_langIdValid = true;
return &m_langId;
}
}
m_langId = 0;
// try from charset
uint16_t *charset = getCharset ( );
if ( ! charset || charset == (uint16_t *)-1 )return (uint8_t *)charset;
// do based on charset
if ( *charset == csGB18030 ) m_langId = langChineseTrad;
if ( *charset == csGBK ) m_langId = langChineseSimp;
if ( m_langId ) {
m_langIdValid = true;
return &m_langId;
}
// are we a root?
char *isRoot = getIsSiteRoot();
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
// this lookup here might be unnecessary
uint8_t *rl = NULL;
if ( ! *isRoot ) {
rl = getRootLangId();
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
}
//Url *u = getCurrentUrl();
Url *u = getFirstUrl();
uint8_t gs[METHOD_CAP];
// reset language method vector
memset( gs , 0, sizeof(uint8_t) * METHOD_CAP );
// Let the site tell us what language it's in
gs [METHOD_TAG] = g_langId.guessLanguageFromTag( xml );
// Guess from the FIRST URL (unredirected url)
gs [METHOD_URL] = g_langId.guessLanguageFromUrl( u->getUrl() );
// Guess from the outlinks
gs [METHOD_OUTLINKS] = g_langId.guessLanguageFromOutlinks( links );
// Guess from the inlinks
gs [METHOD_INLINKS] = g_langId.guessLanguageFromInlinks(info1, *ip);
// root page's language, if there was one
if ( ! *isRoot ) gs [METHOD_ROOT] = *rl;
int32_t scores[MAX_LANGUAGES];
memset( scores, 0, sizeof(int32_t) * MAX_LANGUAGES );
// weights for the 10 methods
char cw[] = { 8,9,4,7,6,7,8,1,2};
// add up weighted scores
for(int i = 0; i < METHOD_CAP; i++ )
scores[gs[i]] += cw[i];
// reset the "lang" to langUnknown which is 0
lang = langUnknown ;
int max, oldmax;
max = oldmax = 0;
// find best language
for ( int32_t i = MAX_LANGUAGES - 1; i > 0 ; i-- ) {
if ( scores[i] < max) continue;
oldmax = max;
max = scores[i];
lang = i;
}
// give up if not too conclusive
if( (max - oldmax) < 3 ) { // cr->m_languageThreshold) {
//log(LOG_DEBUG, "build: Language: Threshold, score "
// "(%"INT32" - %"INT32") %"INT32" vs. %"INT32".\n",
// (int32_t)max,
// (int32_t)oldmax,
// (int32_t)max - oldmax,
// (int32_t)3);//(int32_t)cr->m_languageThreshold);
lang = langUnknown;
}
// Make sure we're over the bailout value, this
// keeps low scoring methods like TLD from being
// the decider if it was the only successful method.
if ( max < 5 ) { // cr->m_languageBailout ) {
//log(LOG_DEBUG, "build: Language: Bailout, "
// "score %"INT32" vs. %"INT32".",
// (int32_t)max, (int32_t)5);//cr->m_languageBailout);
lang = langUnknown;
}
// If the language is still not known,
// use the language detected from the frames.
//if(lang == langUnknown) lang = frameFoundLang;
// . try dmoz if still unknown
// . limit to 10 of them
// all done, do not repeat
m_langIdValid = true;
m_langId = lang;
m_langIdScore = max;
return &m_langId;
*/
}