mirror of
https://github.com/privacore/open-source-search-engine.git
synced 2025-07-12 02:26:07 -04:00
Cleanup related to languages/Compiler warnings
This commit is contained in:
@ -6,6 +6,7 @@
|
||||
#include "HashTable.h"
|
||||
#include "Categories.h"
|
||||
#include "LanguageIdentifier.h"
|
||||
#include "Threads.h"
|
||||
|
||||
// record for unified language/country hash table
|
||||
typedef union catcountryrec_t {
|
||||
|
438
Iso8859.cpp
438
Iso8859.cpp
@ -1,438 +0,0 @@
|
||||
#include "Iso8859.h"
|
||||
// default for charsets that are highly "non-latin"
|
||||
// i.e. only allow ASCII to pass...
|
||||
const unsigned char map_8859_default[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
};
|
||||
// Latin-1
|
||||
// Adapted from Wikipedia:
|
||||
// Albanian, Basque, Catalan, Danish, Dutch (missing some letters),
|
||||
// English, Estonian (missing some letters), Faroese,
|
||||
// French (missing some letters), Finnish (missing some letters),
|
||||
// Galician, German, Icelandic, Irish (new orthography), Italian,
|
||||
// Latin, Norwegian, Portuguese, Rhaeto-Romanic, Scottish, Spanish,
|
||||
// Swedish, Afrikaans, Swahili
|
||||
const unsigned char map_8859_1[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
|
||||
'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ', ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
|
||||
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
||||
'o', 'n', 'o', 'o', 'o', 'o', 'o', ' ', ' ', 'u', 'u', 'u', 'u', 'y', ' ', 'y'
|
||||
};
|
||||
// Latin-2
|
||||
// Adapted from Wikipedia:
|
||||
// Bosnian, Croatian, Czech, Hungarian, Polish, Romanian, Serbian,
|
||||
// Serbocroatian, Slovak, Slovenian, Upper Sorbian and Lower Sorbian
|
||||
const unsigned char map_8859_2[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', 'A', ' ', 'L', ' ', 'L', 'S', ' ', ' ', 'S', 'S', 'T', 'Z', ' ', 'Z', 'Z',
|
||||
' ', 'a', ' ', 'l', ' ', 'l', 's', ' ', ' ', 's', 's', 't', 'z', ' ', 'z', 'z',
|
||||
'R', 'A', 'A', 'A', 'A', 'L', 'C', 'C', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'D',
|
||||
'D', 'N', 'N', 'O', 'O', 'O', 'O', ' ', 'R', 'U', 'U', 'U', 'U', 'Y', 'T', ' ',
|
||||
'r', 'a', 'a', 'a', 'a', 'l', 'c', 'c', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'd',
|
||||
'd', 'n', 'n', 'o', 'o', 'o', 'o', ' ', 'r', 'u', 'u', 'u', 'u', 'y', 't', ' '
|
||||
};
|
||||
// Latin-3 (South European)
|
||||
// Adapted from Wikipedia:
|
||||
// Turkish (superseded by 8859-9), Maltese, Esperanto
|
||||
const unsigned char map_8859_3[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', 'H', ' ', ' ', ' ', ' ', 'H', ' ', ' ', 'L', 'S', 'G', 'J', ' ', ' ', 'Z',
|
||||
' ', 'h', ' ', ' ', ' ', ' ', 'h', ' ', ' ', 'l', 's', 'g', 'j', ' ', ' ', 'z',
|
||||
'A', 'A', 'A', ' ', 'A', 'C', 'C', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
|
||||
' ', 'N', 'O', 'O', 'O', 'G', 'O', ' ', 'G', 'U', 'U', 'U', 'U', 'U', 'S', ' ',
|
||||
'a', 'a', 'a', ' ', 'a', 'c', 'c', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
||||
' ', 'n', 'o', 'o', 'o', 'g', 'o', ' ', 'g', 'u', 'u', 'u', 'u', 'u', 's', ' '
|
||||
};
|
||||
// Latin-4 (North European)
|
||||
// Adapted from Wikipedia:
|
||||
// Estonian, Latvian, Lithuanian, Greenlandic, and Sami
|
||||
const unsigned char map_8859_4[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', 'A', 'k', 'R', ' ', 'I', 'L', ' ', ' ', 'S', 'E', 'G', 'T', ' ', 'Z', ' ',
|
||||
' ', 'a', ' ', 'r', ' ', 'i', 'l', ' ', ' ', 's', 'e', 'g', 't', 'N', 'z', 'n',
|
||||
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'I', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I',
|
||||
'D', 'N', 'O', 'K', 'O', 'O', 'O', ' ', ' ', 'U', 'U', 'U', 'U', 'U', 'U', ' ',
|
||||
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'i', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i',
|
||||
'd', 'n', 'o', 'k', 'o', 'o', 'o', ' ', ' ', 'u', 'u', 'u', 'u', 'u', 'u', ' '
|
||||
};
|
||||
// Latin-5 (Turkish)
|
||||
// Adapted from Wikipedia:
|
||||
// Turkish
|
||||
const unsigned char map_8859_9[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
|
||||
'G', 'N', 'O', 'O', 'O', 'O', 'O', ' ', ' ', 'U', 'U', 'U', 'U', 'I', 'S', ' ',
|
||||
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
||||
'g', 'n', 'o', 'o', 'o', 'o', 'o', ' ', ' ', 'u', 'u', 'u', 'u', 'i', 's', 'y'
|
||||
};
|
||||
// Latin-6 (Nordic)
|
||||
// Adapted from Wikipedia
|
||||
const unsigned char map_8859_10[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', 'A', 'E', 'G', 'I', 'I', 'K', ' ', 'L', 'D', 'S', 'T', 'Z', ' ', 'U', 'N',
|
||||
' ', 'a', 'e', 'g', 'i', 'i', 'k', ' ', 'l', 'd', 's', 't', 'z', ' ', 'u', 'n',
|
||||
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'I', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I',
|
||||
'D', 'N', 'O', 'O', 'O', 'O', 'O', 'U', ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
|
||||
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'i', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i',
|
||||
'o', 'n', 'o', 'o', 'o', 'o', 'o', 'u', ' ', 'u', 'u', 'u', 'u', 'y', ' ', 'k'
|
||||
};
|
||||
// Latin-7 (Baltic Rim)
|
||||
// Adapted from Wikipedia:
|
||||
const unsigned char map_8859_13[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'R', ' ', ' ', ' ', ' ', 'A',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
'A', 'I', 'A', 'C', 'A', 'A', 'E', 'E', 'C', 'E', 'Z', 'E', 'G', 'K', 'I', 'L',
|
||||
'S', 'N', 'N', 'O', 'O', 'O', 'O', ' ', 'U', 'L', 'S', 'U', 'U', 'Z', 'Z', ' ',
|
||||
'a', 'i', 'a', 'c', 'a', 'a', 'e', 'e', 'c', 'e', 'z', 'e', 'g', 'k', 'i', 'l',
|
||||
's', 'n', 'n', 'o', 'o', 'o', 'o', ' ', 'u', 'l', 's', 'u', 'u', 'z', 'z', ' '
|
||||
};
|
||||
// Latin-8 (Celtic)
|
||||
// Adapted from Wikipedia:
|
||||
// Gaelic, Welsh, Breton
|
||||
const unsigned char map_8859_14[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', 'B', 'b', ' ', 'C', 'c', 'D', ' ', 'W', ' ', 'W', 'd', 'Y', ' ', ' ', 'Y',
|
||||
'F', 'f', 'G', 'g', 'M', 'm', ' ', 'P', 'w', 'p', 'w', 'S', 'y', 'W', 'w', 's',
|
||||
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
|
||||
'W', 'N', 'O', 'O', 'O', 'O', 'O', 'T', ' ', 'U', 'U', 'U', 'U', 'Y', 'y', ' ',
|
||||
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
||||
'w', 'n', 'o', 'o', 'o', 'o', 'o', 't', ' ', 'u', 'u', 'u', 'u', 'y', 'y', 'y'
|
||||
};
|
||||
// Latin-9
|
||||
// Adapted from Wikipedia:
|
||||
// Update of 8859-1
|
||||
// English, French, German, Spanish and Portuguese
|
||||
const unsigned char map_8859_15[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', ' ', ' ', ' ', ' ', ' ', 'S', ' ', 's', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', 'Z', ' ', ' ', ' ', 'z', ' ', ' ', ' ', 'O', 'o', 'Y', ' ',
|
||||
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
|
||||
'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ', ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
|
||||
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
||||
'o', 'n', 'o', 'o', 'o', 'o', 'o', ' ', ' ', 'u', 'u', 'u', 'u', 'y', ' ', 'y'
|
||||
};
|
||||
// Latin-10 "South-Eastern European"
|
||||
// Adapted from Wikipedia:
|
||||
// Albanian, Croatian, Hungarian, Polish, Romanian and Slovenian, French,
|
||||
// Italian and Irish Gaelic (new orthography).
|
||||
const unsigned char map_8859_16[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', 'A', 'a', 'L', ' ', ' ', 'S', ' ', 's', ' ', 'S', ' ', 'Z', ' ', 'z', 'Z',
|
||||
' ', ' ', 'C', 'l', 'Z', ' ', ' ', ' ', 'z', 'c', 's', ' ', 'O', 'o', 'Y', 'z',
|
||||
'A', 'A', 'A', 'A', 'A', 'C', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
|
||||
'D', 'N', 'O', 'O', 'O', 'O', 'O', 'S', 'U', 'U', 'U', 'U', 'U', 'E', 'T', ' ',
|
||||
'a', 'a', 'a', 'a', 'a', 'c', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
||||
'd', 'n', 'o', 'o', 'o', 'o', 'o', 's', 'u', 'u', 'u', 'u', 'u', 'e', 't', 'y'
|
||||
};
|
||||
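// array of all 8859 charsets, indexed by ISO-8859 part number (1..16);
// slot 0 and the parts without a dedicated table (5-8, 11, 12) use map_8859_default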
|
||||
const unsigned char* map_8859[ISO_8859_NUM_CHARSETS + 1] =
|
||||
{
|
||||
map_8859_default,
|
||||
map_8859_1,
|
||||
map_8859_2,
|
||||
map_8859_3,
|
||||
map_8859_4,
|
||||
map_8859_default,
|
||||
map_8859_default,
|
||||
map_8859_default,
|
||||
map_8859_default,
|
||||
map_8859_9,
|
||||
map_8859_10,
|
||||
map_8859_default,
|
||||
map_8859_default,
|
||||
map_8859_13,
|
||||
map_8859_14,
|
||||
map_8859_15,
|
||||
map_8859_16
|
||||
};
|
||||
const unsigned char map_win_1251[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'K', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'k', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', 'Y', 'y', 'J', ' ', ' ', ' ', ' ', 'E', ' ', ' ', ' ', ' ', '-', ' ', 'I',
|
||||
' ', 'I', 'i', ' ', ' ', ' ', ' ', ' ', 'e', ' ', ' ', ' ', 'j', 'S', 's', 'i',
|
||||
'A', 'b', 'B', ' ', ' ', 'E', ' ', ' ', 'N', 'N', 'K', ' ', 'M', 'H', 'O', ' ',
|
||||
'P', 'C', 'T', 'Y', ' ', 'X', ' ', ' ', 'W', 'W', 'b', ' ', 'b', ' ', ' ', 'R',
|
||||
'a', 'o', 'b', ' ', ' ', 'e', ' ', ' ', 'n', 'n', 'k', ' ', 'm', 'h', 'o', ' ',
|
||||
'p', 'c', 't', 'y', ' ', 'x', ' ', ' ', 'w', 'w', 'b', ' ', 'b', ' ', ' ', 'r'
|
||||
};
|
||||
const unsigned char map_win_1252[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'S', ' ', 'O', ' ', 'Z', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 's', ' ', 'o', ' ', 'z', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '-', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
|
||||
'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ', ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
|
||||
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
||||
'o', 'n', 'o', 'o', 'o', 'o', 'o', ' ', ' ', 'u', 'u', 'u', 'u', 'y', ' ', 'y'
|
||||
};
|
||||
const unsigned char map_win_1253[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', ' ', 'A', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'E', 'H', 'I', ' ', 'O', ' ', 'Y', ' ',
|
||||
' ', 'A', 'B', ' ', ' ', 'E', 'Z', 'H', ' ', 'I', 'K', ' ', 'M', 'N', ' ', 'O',
|
||||
' ', 'P', ' ', ' ', 'T', 'Y', ' ', 'X', ' ', ' ', 'I', 'Y', 'a', 'e', 'n', 'i',
|
||||
'v', 'a', 'b', ' ', ' ', 'e', ' ', 'n', ' ', 'l', 'k', ' ', ' ', 'v', ' ', 'o',
|
||||
' ', 'p', ' ', 'o', 't', 'v', ' ', 'X', ' ', 'w', 'i', 'v', 'o', 'v', 'w', ' '
|
||||
};
|
||||
const unsigned char map_win_1254[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'S', ' ', 'O', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 's', ' ', 'o', ' ', ' ', 'Y',
|
||||
// ^^^ control chars ^^^
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
|
||||
'G', 'N', 'O', 'O', 'O', 'O', 'O', 'X', '0', 'U', 'U', 'U', 'U', 'I', 'S', ' ',
|
||||
'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
|
||||
'g', 'n', 'o', 'o', 'o', 'o', 'o', ' ', 'o', 'u', 'u', 'u', 'u', 'i', 's', ' '
|
||||
};
|
||||
const unsigned char map_win_1255[256] =
|
||||
{
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',0x09,0x0A, ' ', ' ',0x0D, ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
|
||||
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
|
||||
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
|
||||
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
|
||||
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
|
||||
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E, ' ',
|
||||
// vvv control chars vvv
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
// ^^^ control chars ^^^
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '
|
||||
};
|
||||
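// array of all windows-125x charsets, indexed by the last digit of the
// code page (1 = 1251 ... 5 = 1255); slot 0 is only a non-NULL placeholder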
|
||||
const unsigned char* map_win_125x[WIN_125X_NUM_CHARSETS + 1] =
|
||||
{
|
||||
map_win_1252, // dummy, will never be used - no NULL here...
|
||||
map_win_1251,
|
||||
map_win_1252,
|
||||
map_win_1253,
|
||||
map_win_1254,
|
||||
map_win_1255
|
||||
};
|
54
Iso8859.h
54
Iso8859.h
@ -1,54 +0,0 @@
|
||||
// default map for charsets that are highly "non-latin"
|
||||
extern const unsigned char map_8859_default[256];
|
||||
// Latin-1
|
||||
// Adapted from Wikipedia:
|
||||
// Albanian, Basque, Catalan, Danish, Dutch (missing some letters),
|
||||
// English, Estonian (missing some letters), Faroese,
|
||||
// French (missing some letters), Finnish (missing some letters),
|
||||
// Galician, German, Icelandic, Irish (new orthography), Italian,
|
||||
// Latin, Norwegian, Portuguese, Rhaeto-Romanic, Scottish, Spanish,
|
||||
// Swedish, Afrikaans, Swahili
|
||||
extern const unsigned char map_8859_1[256];
|
||||
// Latin-2
|
||||
// Adapted from Wikipedia:
|
||||
// Bosnian, Croatian, Czech, Hungarian, Polish, Romanian, Serbian,
|
||||
// Serbocroatian, Slovak, Slovenian, Upper Sorbian and Lower Sorbian
|
||||
extern const unsigned char map_8859_2[256];
|
||||
// Latin-3 (South European)
|
||||
// Adapted from Wikipedia:
|
||||
// Turkish (superseded by 8859-9), Maltese, Esperanto
|
||||
extern const unsigned char map_8859_3[256];
|
||||
// Latin-4 (North European)
|
||||
// Adapted from Wikipedia:
|
||||
// Estonian, Latvian, Lithuanian, Greenlandic, and Sami
|
||||
extern const unsigned char map_8859_4[256];
|
||||
// Latin-5 (Turkish)
|
||||
// Adapted from Wikipedia:
|
||||
// Turkish
|
||||
extern const unsigned char map_8859_9[256];
|
||||
// Latin-6 (Nordic)
|
||||
// Adapted from Wikipedia
|
||||
extern const unsigned char map_8859_10[256];
|
||||
// Latin-7 (Baltic Rim)
|
||||
// Adapted from Wikipedia:
|
||||
extern const unsigned char map_8859_13[256];
|
||||
// Latin-8 (Celtic)
|
||||
// Adapted from Wikipedia:
|
||||
// Gaelic, Welsh, Breton
|
||||
extern const unsigned char map_8859_14[256];
|
||||
// Latin-9
|
||||
// Adapted from Wikipedia:
|
||||
// Update of 8859-1
|
||||
// English, French, German, Spanish and Portuguese
|
||||
extern const unsigned char map_8859_15[256];
|
||||
// Latin-10 "South-Eastern European"
|
||||
// Adapted from Wikipedia:
|
||||
// Albanian, Croatian, Hungarian, Polish, Romanian and Slovenian, French,
|
||||
// Italian and Irish Gaelic (new orthography).
|
||||
extern const unsigned char map_8859_16[256];
|
||||
// array of all 8859 charsets, indexed by ISO-8859 part number (slot 0 is the default map)
|
||||
#define ISO_8859_NUM_CHARSETS 16
|
||||
extern const unsigned char* map_8859[ISO_8859_NUM_CHARSETS + 1];
|
||||
// MS-WIN codepages 1251-1255 (slot 0 of the array is a placeholder)
|
||||
#define WIN_125X_NUM_CHARSETS 5
|
||||
extern const unsigned char* map_win_125x[WIN_125X_NUM_CHARSETS + 1];
|
362
Lang.cpp
362
Lang.cpp
@ -8,107 +8,6 @@ void languageToString ( unsigned char langId , char *buf ) {
|
||||
strcpy(buf,p);
|
||||
}
|
||||
|
||||
static char *s_nativeLangStrings[] = {
|
||||
"unknown",
|
||||
"english",
|
||||
"français",
|
||||
"español",
|
||||
"русcкий",
|
||||
"t<EFBFBD>rk<EFBFBD>e", // not sure...
|
||||
"japanese", // don't know yet
|
||||
"chinese traditional", // don't know yet
|
||||
"chinese simplified", // don't know yet
|
||||
"korean", // don't know yet
|
||||
"deutsch",
|
||||
"nederlands",
|
||||
"italiano",
|
||||
"suomi",
|
||||
"svenska",
|
||||
"norsk",
|
||||
"português",
|
||||
"vietnamese", // don't know yet
|
||||
"arabic", // don't know yet
|
||||
"hebrew", // don't know yet
|
||||
"indonesian", // don't know yet
|
||||
"greek", // don't know yet
|
||||
"thai", // don't know yet
|
||||
"hindi", // don't know yet
|
||||
"bengala", // don't know yet
|
||||
"polski",
|
||||
"tagalog", // don't know yet
|
||||
|
||||
"latin",
|
||||
"esperanto",
|
||||
"catalan",
|
||||
"bulgarian",
|
||||
"translingual",
|
||||
"serbo-croatin",
|
||||
"hungarian",
|
||||
"danish",
|
||||
"lithuanian",
|
||||
"czech",
|
||||
"galician",
|
||||
"georgian",
|
||||
"scottish gaelic",
|
||||
"gothic",
|
||||
"romanian",
|
||||
"irish",
|
||||
"latvian",
|
||||
"armenian",
|
||||
"icelandic",
|
||||
"ancient greek",
|
||||
"manx",
|
||||
"ido",
|
||||
"persian",
|
||||
"telugu",
|
||||
"venetian",
|
||||
"malagasy",
|
||||
"kurdish",
|
||||
"luxembourgish",
|
||||
"estonian",
|
||||
|
||||
NULL
|
||||
};
|
||||
// Lower-case English name of each language, indexed by language id.
// NULL-terminated; getLanguageFromName() matches against it
// case-insensitively.
static char *s_lowerLangStrings[] = {
	"unknown",
	"english",
	"french",
	"spanish",
	"russian",
	"turkish",
	"japanese",
	"chinese traditional",
	"chinese simplified",
	"korean",
	"german",
	"dutch",
	"italian",
	"finnish",
	"swedish",
	"norwegian",
	"portuguese",
	"vietnamese",
	"arabic",
	"hebrew",
	"indonesian",
	"greek",
	"thai",
	"hindi",
	"bengala",
	"polish",
	"tagalog",

	"latin",         "esperanto",       "catalan",  "bulgarian",
	"translingual",  "serbo-croatian",  "hungarian","danish",
	"lithuanian",    "czech",           "galician", "georgian",
	"scottish gaelic","gothic",         "romanian", "irish",
	"latvian",       "armenian",        "icelandic","ancient greek",
	"manx",          "ido",             "persian",  "telugu",
	"venetian",      "malagasy",        "kurdish",  "luxembourgish",
	"estonian",

	NULL
};
|
||||
|
||||
static char *s_langStrings[] = {
|
||||
"Unknown","English","French","Spanish","Russian","Turkish","Japanese",
|
||||
"Chinese Traditional","Chinese Simplified","Korean","German","Dutch",
|
||||
@ -151,12 +50,7 @@ static char *s_langStrings[] = {
|
||||
char* getLanguageString ( unsigned char langId ) {
|
||||
if ( langId >= sizeof(s_langStrings)/sizeof(char *) ) return NULL;
|
||||
return s_langStrings[langId];
|
||||
};
|
||||
|
||||
char* getNativeLanguageString ( unsigned char langId ) {
|
||||
if ( langId >= sizeof(s_nativeLangStrings)/sizeof(char *) ) return NULL;
|
||||
return s_nativeLangStrings[langId];
|
||||
};
|
||||
}
|
||||
|
||||
static char *s_langAbbr[] = {
|
||||
"xx","en","fr","es","ru","tr","ja","zh_tw","zh_cn","ko","de","nl",
|
||||
@ -195,31 +89,6 @@ static char *s_langAbbr[] = {
|
||||
NULL
|
||||
};
|
||||
|
||||
// fix bug:
|
||||
//#ifndef PRIVATESTUFF
|
||||
#define csISOLatin6 cslatin6
|
||||
//#endif
|
||||
|
||||
static unsigned char s_langCharset[] = {
|
||||
csUnknown,csISOLatin1,csISOLatin1,csISOLatin1,//"xx","en","fr","es",
|
||||
csUnknown,csUnknown,csUnknown,csUnknown,//"ru","zz","ja","zh_tw",
|
||||
csUnknown,csUnknown,csISOLatin1,csISOLatin1,//"zh_cn","ko","de","nl",
|
||||
csISOLatin1,csISOLatin6,csISOLatin6,csISOLatin6,//"it","fi","sv","no",
|
||||
csISOLatin1,csUnknown,csUnknown,csUnknown,//"pt","vi","ar","he",
|
||||
csUnknown,csUnknown,csUnknown,csUnknown,//"id","el","th","hi",
|
||||
csUnknown,csUnknown,csUnknown,//"bn","pl","tl","en_uk",
|
||||
csUnknown//"en_au"
|
||||
};
|
||||
|
||||
// Resolve a language name to its language id, ignoring case.
// The English names are tried first, then the native spellings.
// Returns 0 (the "unknown" slot) when nothing matches.
uint8_t getLanguageFromName(uint8_t *name) {
	const char *wanted = (const char *)name;

	int idx = 0;
	while ( idx < MAX_LANGUAGES && s_lowerLangStrings[idx] ) {
		if ( strcasecmp(wanted, s_lowerLangStrings[idx]) == 0 )
			return idx;
		idx++;
	}

	idx = 0;
	while ( idx < MAX_LANGUAGES && s_nativeLangStrings[idx] ) {
		if ( strcasecmp(wanted, s_nativeLangStrings[idx]) == 0 )
			return idx;
		idx++;
	}

	return 0;
}
|
||||
|
||||
uint8_t getLangIdFromAbbr ( const char *abbr ) {
|
||||
int x;
|
||||
for(x = 0; x < MAX_LANGUAGES && s_langAbbr[x]; x++)
|
||||
@ -231,6 +100,16 @@ uint8_t getLangIdFromAbbr ( const char *abbr ) {
|
||||
return langUnknown;//0;
|
||||
}
|
||||
|
||||
uint8_t getLangIdFromAbbrN ( const char *abbr ) {
|
||||
for (int x = 0; x < MAX_LANGUAGES && s_langAbbr[x]; ++x) {
|
||||
if (!strncasecmp((char*)abbr, s_langAbbr[x], strlen(s_langAbbr[x]))) {
|
||||
return x;
|
||||
}
|
||||
}
|
||||
|
||||
return langUnknown;
|
||||
}
|
||||
|
||||
char *getLangAbbr ( uint8_t langId ) {
|
||||
return s_langAbbr[langId];
|
||||
}
|
||||
@ -238,225 +117,6 @@ char *getLangAbbr ( uint8_t langId ) {
|
||||
char* getLanguageAbbr ( unsigned char langId ) {
|
||||
if ( langId >= sizeof(s_langAbbr)/sizeof(char *) ) return NULL;
|
||||
return s_langAbbr[langId];
|
||||
};
|
||||
|
||||
unsigned char getLanguageCharset ( unsigned char langId ){
|
||||
if ( langId >= sizeof(s_langAbbr)/sizeof(char *) ) return csUnknown;
|
||||
return s_langCharset[langId];
|
||||
}
|
||||
|
||||
/*
|
||||
unsigned char getLanguageFromScript(UChar32 c) {
|
||||
switch(ucGetScript(c)) {
|
||||
case ucScriptArabic:
|
||||
return langArabic;
|
||||
break;
|
||||
case ucScriptGreek:
|
||||
return langGreek;
|
||||
break;
|
||||
case ucScriptHangul:
|
||||
case ucScriptHanunoo:
|
||||
return langKorean;
|
||||
break;
|
||||
//case ucScriptHan:
|
||||
//return langChineseTrad;
|
||||
|
||||
case ucScriptHiragana:
|
||||
case ucScriptKannada:
|
||||
case ucScriptKatakana:
|
||||
case ucScriptKatakana_Or_Hiragana:
|
||||
return langJapanese;
|
||||
break;
|
||||
case ucScriptHebrew:
|
||||
return langHebrew;
|
||||
break;
|
||||
case ucScriptThai:
|
||||
return langThai;
|
||||
break;
|
||||
case ucScriptBengali:
|
||||
return langBengala;
|
||||
break;
|
||||
case ucScriptDevanagari:
|
||||
return langHindi;
|
||||
break;
|
||||
|
||||
default:
|
||||
return langUnknown;
|
||||
break;
|
||||
}
|
||||
};
|
||||
*/
|
||||
|
||||
unsigned char getLanguageFromAbbr(char *abbr) {
|
||||
// if(!strcmp(abbr, "en-GB")) return langBritish;
|
||||
// if(!strcmp(abbr, "en_AU")) return langAustralia;
|
||||
// if(!strcmp(abbr, "en-AU")) return langAustralia;
|
||||
if(!strcmp(abbr, "en_US")) return langEnglish;
|
||||
if(!strcmp(abbr, "en-US")) return langEnglish;
|
||||
if(!strcmp(abbr, "en")) return langEnglish;
|
||||
if(!strcmp(abbr, "fr")) return langFrench;
|
||||
if(!strcmp(abbr, "es_MX")) return langSpanish;
|
||||
if(!strcmp(abbr, "es-MX")) return langSpanish;
|
||||
if(!strcmp(abbr, "es")) return langSpanish;
|
||||
if(!strcmp(abbr, "ru")) return langRussian;
|
||||
if(!strcmp(abbr, "ua")) return langRussian; // ukrainian?
|
||||
if(!strcmp(abbr, "ja")) return langJapanese;
|
||||
if(!strcmp(abbr, "zh_tw")) return langChineseTrad;
|
||||
if(!strcmp(abbr, "zh_cn")) return langChineseSimp;
|
||||
if(!strcmp(abbr, "ko")) return langKorean;
|
||||
if(!strcmp(abbr, "de")) return langGerman;
|
||||
if(!strcmp(abbr, "nl")) return langDutch;
|
||||
if(!strcmp(abbr, "it")) return langItalian;
|
||||
if(!strcmp(abbr, "fi")) return langFinnish;
|
||||
if(!strcmp(abbr, "sv")) return langSwedish;
|
||||
if(!strcmp(abbr, "no")) return langNorwegian;
|
||||
if(!strcmp(abbr, "pt")) return langPortuguese;
|
||||
if(!strcmp(abbr, "vi")) return langVietnamese;
|
||||
if(!strcmp(abbr, "ar")) return langArabic;
|
||||
if(!strcmp(abbr, "he")) return langHebrew;
|
||||
if(!strcmp(abbr, "id")) return langIndonesian;
|
||||
if(!strcmp(abbr, "el")) return langGreek;
|
||||
if(!strcmp(abbr, "th")) return langThai;
|
||||
if(!strcmp(abbr, "hi")) return langHindi;
|
||||
if(!strcmp(abbr, "bn")) return langBengala;
|
||||
if(!strcmp(abbr, "pl")) return langPolish;
|
||||
if(!strcmp(abbr, "tl")) return langTagalog;
|
||||
if(!strcmp(abbr, "tr")) return langTurkish;
|
||||
return langUnknown;
|
||||
}
|
||||
|
||||
unsigned char getLanguageFromAbbrN(char *abbr) {
|
||||
// if(!strcmp(abbr, "en-GB")) return langBritish;
|
||||
// if(!strcmp(abbr, "en_AU")) return langAustralia;
|
||||
// if(!strcmp(abbr, "en-AU")) return langAustralia;
|
||||
if(!strncasecmp(abbr, "en_US", 5)) return langEnglish;
|
||||
if(!strncasecmp(abbr, "en-US", 5)) return langEnglish;
|
||||
if(!strncasecmp(abbr, "en", 2)) return langEnglish;
|
||||
if(!strncasecmp(abbr, "fr", 2)) return langFrench;
|
||||
if(!strncasecmp(abbr, "es_MX", 5)) return langSpanish;
|
||||
if(!strncasecmp(abbr, "es-MX", 5)) return langSpanish;
|
||||
if(!strncasecmp(abbr, "es", 2)) return langSpanish;
|
||||
if(!strncasecmp(abbr, "ru", 2)) return langRussian;
|
||||
if(!strncasecmp(abbr, "ua", 2)) return langRussian; // ukrainian?
|
||||
if(!strncasecmp(abbr, "ja", 2)) return langJapanese;
|
||||
if(!strncasecmp(abbr, "zh_tw", 5)) return langChineseTrad;
|
||||
if(!strncasecmp(abbr, "zh_cn", 5)) return langChineseSimp;
|
||||
if(!strncasecmp(abbr, "ko", 2)) return langKorean;
|
||||
if(!strncasecmp(abbr, "de", 2)) return langGerman;
|
||||
if(!strncasecmp(abbr, "nl", 2)) return langDutch;
|
||||
if(!strncasecmp(abbr, "it", 2)) return langItalian;
|
||||
if(!strncasecmp(abbr, "fi", 2)) return langFinnish;
|
||||
if(!strncasecmp(abbr, "sv", 2)) return langSwedish;
|
||||
if(!strncasecmp(abbr, "no", 2)) return langNorwegian;
|
||||
if(!strncasecmp(abbr, "pt", 2)) return langPortuguese;
|
||||
if(!strncasecmp(abbr, "vi", 2)) return langVietnamese;
|
||||
if(!strncasecmp(abbr, "ar", 2)) return langArabic;
|
||||
if(!strncasecmp(abbr, "he", 2)) return langHebrew;
|
||||
if(!strncasecmp(abbr, "id", 2)) return langIndonesian;
|
||||
if(!strncasecmp(abbr, "el", 2)) return langGreek;
|
||||
if(!strncasecmp(abbr, "th", 2)) return langThai;
|
||||
if(!strncasecmp(abbr, "hi", 2)) return langHindi;
|
||||
if(!strncasecmp(abbr, "bn", 2)) return langBengala;
|
||||
if(!strncasecmp(abbr, "pl", 2)) return langPolish;
|
||||
if(!strncasecmp(abbr, "tl", 2)) return langTagalog;
|
||||
if(!strncasecmp(abbr, "tr", 2)) return langTurkish;
|
||||
return langUnknown;
|
||||
}
|
||||
|
||||
unsigned char getLanguageFromUnicodeAbbr(char *abbr) {
|
||||
// if (!memcmp(abbr, "e\0n\0_\0g\0b\0",10)) return langBritish;
|
||||
// else if(!memcmp(abbr, "e\0n\0-\0g\0b\0",10)) return langBritish;
|
||||
// else if(!memcmp(abbr, "e\0n\0_\0a\0u\0",10)) return langAustralia;
|
||||
// else if(!memcmp(abbr, "e\0n\0-\0a\0u\0",10)) return langAustralia;
|
||||
if(!memcmp(abbr, "en_us",5)) return langEnglish;
|
||||
if(!memcmp(abbr, "en-us",5)) return langEnglish;
|
||||
if(!memcmp(abbr, "es_mx",5)) return langSpanish;
|
||||
if(!memcmp(abbr, "es-mx",5)) return langSpanish;
|
||||
if(!memcmp(abbr, "zh_tw",5)) return langChineseTrad;
|
||||
if(!memcmp(abbr, "zh_cn",5)) return langChineseSimp;
|
||||
if(!memcmp(abbr, "en",2)) return langEnglish;
|
||||
if(!memcmp(abbr, "fr",2)) return langFrench;
|
||||
if(!memcmp(abbr, "es",2)) return langSpanish;
|
||||
if(!memcmp(abbr, "ru",2)) return langRussian;
|
||||
if(!memcmp(abbr, "ja",2)) return langJapanese;
|
||||
if(!memcmp(abbr, "ko",2)) return langKorean;
|
||||
if(!memcmp(abbr, "de",2)) return langGerman;
|
||||
if(!memcmp(abbr, "nl",2)) return langDutch;
|
||||
if(!memcmp(abbr, "it",2)) return langItalian;
|
||||
if(!memcmp(abbr, "fi",2)) return langFinnish;
|
||||
if(!memcmp(abbr, "sv",2)) return langSwedish;
|
||||
if(!memcmp(abbr, "no",2)) return langNorwegian;
|
||||
if(!memcmp(abbr, "pt",2)) return langPortuguese;
|
||||
if(!memcmp(abbr, "vi",2)) return langVietnamese;
|
||||
if(!memcmp(abbr, "ar",2)) return langArabic;
|
||||
if(!memcmp(abbr, "he",2)) return langHebrew;
|
||||
if(!memcmp(abbr, "id",2)) return langIndonesian;
|
||||
if(!memcmp(abbr, "el",2)) return langGreek;
|
||||
if(!memcmp(abbr, "th",2)) return langThai;
|
||||
if(!memcmp(abbr, "hi",2)) return langHindi;
|
||||
if(!memcmp(abbr, "bn",2)) return langBengala;
|
||||
if(!memcmp(abbr, "pl",2)) return langPolish;
|
||||
if(!memcmp(abbr, "tl",2)) return langTagalog;
|
||||
if(!memcmp(abbr, "tr",2)) return langTurkish;
|
||||
return langUnknown;
|
||||
}
|
||||
|
||||
|
||||
unsigned char getLanguageFromCountryCode(char *code) {
|
||||
// Check the ones we know are different first,
|
||||
// then revert to abbr
|
||||
if(!strcmp(code, "us")) return(langEnglish);
|
||||
if(!strcmp(code, "uk")) return(langEnglish);
|
||||
// if(!strcmp(code, "gb")) return(langBritish);
|
||||
// if(!strcmp(code, "vg")) return(langBritish);
|
||||
if(!strcmp(code, "vi")) return(langEnglish);
|
||||
// if(!strcmp(code, "au")) return(langAustralia);
|
||||
if(!strcmp(code, "ae")) return(langArabic);
|
||||
if(!strcmp(code, "cn")) return(langChineseSimp);
|
||||
if(!strcmp(code, "tw")) return(langChineseTrad);
|
||||
if(!strcmp(code, "vn")) return(langVietnamese);
|
||||
return(getLanguageFromAbbr(code));
|
||||
}
|
||||
|
||||
// This is only here to avoid mangling the string
|
||||
// as we look for tags, if at all possible use the
|
||||
// getLanguageFromAbbr instead.
|
||||
unsigned char getLanguageFromUserAgent(char *abbr) {
|
||||
// if(!strncmp(abbr, "en_GB", 5)) return langBritish;
|
||||
// if(!strncmp(abbr, "en-GB", 5)) return langBritish;
|
||||
// if(!strncmp(abbr, "en_AU", 5)) return langAustralia;
|
||||
// if(!strncmp(abbr, "en-AU", 5)) return langAustralia;
|
||||
if(!strncmp(abbr, "en_US", 5)) return langEnglish;
|
||||
if(!strncmp(abbr, "en-US", 5)) return langEnglish;
|
||||
if(!strncmp(abbr, "en", 2)) return langEnglish;
|
||||
if(!strncmp(abbr, "fr", 2)) return langFrench;
|
||||
if(!strncmp(abbr, "es_MX", 5)) return langSpanish;
|
||||
if(!strncmp(abbr, "es-MX", 5)) return langSpanish;
|
||||
if(!strncmp(abbr, "es", 2)) return langSpanish;
|
||||
if(!strncmp(abbr, "ru", 2)) return langRussian;
|
||||
if(!strncmp(abbr, "ja", 2)) return langJapanese;
|
||||
if(!strncmp(abbr, "zh_tw", 5)) return langChineseTrad;
|
||||
if(!strncmp(abbr, "zh_cn", 5)) return langChineseSimp;
|
||||
if(!strncmp(abbr, "ko", 2)) return langKorean;
|
||||
if(!strncmp(abbr, "de", 2)) return langGerman;
|
||||
if(!strncmp(abbr, "nl", 2)) return langDutch;
|
||||
if(!strncmp(abbr, "it", 2)) return langItalian;
|
||||
if(!strncmp(abbr, "fi", 2)) return langFinnish;
|
||||
if(!strncmp(abbr, "sv", 2)) return langSwedish;
|
||||
if(!strncmp(abbr, "no", 2)) return langNorwegian;
|
||||
if(!strncmp(abbr, "pt", 2)) return langPortuguese;
|
||||
if(!strncmp(abbr, "vi", 2)) return langVietnamese;
|
||||
if(!strncmp(abbr, "ar", 2)) return langArabic;
|
||||
if(!strncmp(abbr, "he", 2)) return langHebrew;
|
||||
if(!strncmp(abbr, "id", 2)) return langIndonesian;
|
||||
if(!strncmp(abbr, "el", 2)) return langGreek;
|
||||
if(!strncmp(abbr, "th", 2)) return langThai;
|
||||
if(!strncmp(abbr, "hi", 2)) return langHindi;
|
||||
if(!strncmp(abbr, "bn", 2)) return langBengala;
|
||||
if(!strncmp(abbr, "pl", 2)) return langPolish;
|
||||
if(!strncmp(abbr, "tl", 2)) return langTagalog;
|
||||
if(!strncmp(abbr, "tr", 2)) return langTurkish;
|
||||
return langUnknown;
|
||||
}
|
||||
|
||||
// . these are going to be adult, in any language
|
||||
|
15
Lang.h
15
Lang.h
@ -12,8 +12,8 @@
|
||||
// translingual is the 31st bit, english is the first bit
|
||||
#define LANG_BIT_MASK 0x007fffffffffffffLL
|
||||
#include "Unicode.h"
|
||||
#include "Iso8859.h"
|
||||
#include "iana_charset.h"
|
||||
|
||||
enum {
|
||||
langUnknown = 0,
|
||||
langEnglish = 1,
|
||||
@ -76,23 +76,14 @@ enum {
|
||||
langLast = 56
|
||||
};
|
||||
|
||||
uint8_t getLanguageFromName(uint8_t *name);
|
||||
uint8_t getLangIdFromAbbr ( const char *abbr ) ;
|
||||
uint8_t getLangIdFromAbbrN ( const char *abbr ) ;
|
||||
char *getLangAbbr ( uint8_t langId ) ;
|
||||
|
||||
void languageToString ( unsigned char lang , char *buf );
|
||||
char* getLanguageString ( unsigned char lang);
|
||||
char* getNativeLanguageString ( unsigned char lang);
|
||||
char* getLanguageAbbr ( unsigned char lang);
|
||||
unsigned char getLanguageCharset ( unsigned char LangId );
|
||||
|
||||
bool isAdult( char *s, int32_t slen, char **loc = NULL );
|
||||
//unsigned char getLanguageFromScript(UChar32 c);
|
||||
unsigned char getLanguageFromAbbr(char *abbr);
|
||||
unsigned char getLanguageFromAbbrN(char *abbr);
|
||||
//unsigned char getLanguageFromUnicodeAbbr(UChar *abbr);
|
||||
// abbr is now in utf8
|
||||
unsigned char getLanguageFromUnicodeAbbr(char *abbr);
|
||||
unsigned char getLanguageFromUserAgent(char *abbr);
|
||||
unsigned char getLanguageFromCountryCode(char *code);
|
||||
|
||||
#endif
|
||||
|
5002
Language.cpp
5002
Language.cpp
File diff suppressed because it is too large
Load Diff
299
Language.h
299
Language.h
@ -1,299 +0,0 @@
|
||||
|
||||
#ifndef _LANGUAGE_H_
|
||||
#define _LANGUAGE_H_
|
||||
//#include <wchar.h>
|
||||
#include "gb-include.h"
|
||||
//#include "UnicodeProperties.h" //UChar32
|
||||
#include "File.h"
|
||||
#include "HashTableT.h"
|
||||
#include "Query.h"
|
||||
#include "Lang.h"
|
||||
#include "Multicast.h"
|
||||
#include "Threads.h"
|
||||
#include "Titledb.h"
|
||||
#include "Iso8859.h"
|
||||
#include "IndexList.h"
|
||||
//#include "Msg3a.h"
|
||||
|
||||
#include "Msg20.h"
|
||||
|
||||
// max chars in any language
|
||||
#define MAX_WORDS_PER_PHRASE 5
|
||||
#define MAX_CHARS 256
|
||||
#define TOP_POP_PHRASES 40 * 1024
|
||||
#define NUM_CHARS 40
|
||||
#define MAX_FRAG_SIZE 1024
|
||||
// max chars that start the rule
|
||||
|
||||
#define MAX_PHRASE_LEN 80
|
||||
#define MAX_RECOMMENDATIONS 10
|
||||
#define LARGE_SCORE 0xfffff
|
||||
#define MAX_NARROW_SEARCHES 19
|
||||
|
||||
/*
|
||||
// used only while generating titles from wikipedia pages, makeWikiFiles()
|
||||
class StateWik {
|
||||
public:
|
||||
bool getIndexList( );
|
||||
bool getSummary ( );
|
||||
bool gotSummary ( );
|
||||
|
||||
int m_fdw;
|
||||
Msg0 m_msg0;
|
||||
IndexList m_list;
|
||||
Query m_q;
|
||||
key_t m_startKey;
|
||||
key_t m_endKey;
|
||||
char *m_coll;
|
||||
int32_t m_collLen;
|
||||
int64_t m_termId;
|
||||
int32_t m_minRecSize;
|
||||
Msg20 m_msg20s[MAX_FRAG_SIZE];
|
||||
int32_t m_numMsg20sOutstanding;
|
||||
int32_t m_numMsg20sLaunched;
|
||||
int32_t m_numMsg20sReceived;
|
||||
};
|
||||
|
||||
class StateDict{
|
||||
public:
|
||||
char *m_dictBuf;
|
||||
int32_t m_dictBufSize;
|
||||
char *m_buf;
|
||||
int32_t m_bufSize;
|
||||
char **m_wordsPtr;
|
||||
int64_t *m_termIds;
|
||||
int64_t *m_termFreqs;
|
||||
int32_t m_numTuples;
|
||||
Msg37 m_msg37;
|
||||
};
|
||||
*/
|
||||
|
||||
/*class StateAff{
|
||||
public:
|
||||
bool openAffinityFile ( );
|
||||
bool launchAffinity ( );
|
||||
bool gotAffinityFreqs1 ( );
|
||||
bool gotAffinityFreqs2 ( );
|
||||
bool doneAffinities ( );
|
||||
|
||||
FILE *m_fdr;
|
||||
int m_fdw;
|
||||
int32_t m_fileNum;
|
||||
char m_buf[1026];
|
||||
Msg3a m_msg3a;
|
||||
Query m_q;
|
||||
int64_t m_numerator;
|
||||
int64_t m_denominator;
|
||||
};*/
|
||||
|
||||
typedef struct Reco{
|
||||
char reco[MAX_PHRASE_LEN];
|
||||
int32_t score;
|
||||
}Reco;
|
||||
|
||||
class Language {
|
||||
|
||||
public:
|
||||
|
||||
Language();
|
||||
~Language();
|
||||
|
||||
void reset();
|
||||
|
||||
bool init( char *unifiedBuf, int32_t unifiedBufSize, int32_t lang,
|
||||
int32_t hostsPerSplit, uint32_t myHash );
|
||||
|
||||
void setLang( int32_t lang ) { m_lang = lang; };
|
||||
|
||||
//bool makeAffinities();
|
||||
|
||||
//int32_t getPhrasePopularity ( char *s, uint64_t h,
|
||||
// bool checkTitleRecDict );
|
||||
|
||||
bool checkDict(char *s, int32_t slen, char encodeType);
|
||||
|
||||
bool getRecommendation( char *origWord, int32_t origWordLen,
|
||||
char *recommendation, int32_t recommendationLen,
|
||||
bool *found, int32_t *score, int32_t *popularity,
|
||||
bool forceReco = false );
|
||||
|
||||
//int32_t narrowPhrase ( char *request, char *phrases, int32_t *pops,
|
||||
// int32_t maxPhrases );
|
||||
|
||||
//bool generateDicts ( int32_t numWordsToDump , char *coll );
|
||||
|
||||
//bool convertLatin1DictToUTF8 ( char *infile );
|
||||
|
||||
// needed for makeDict
|
||||
//bool gotTermFreqs( StateDict *st );
|
||||
//StateDict *m_stateDict;
|
||||
|
||||
// hash table of the dictionary
|
||||
HashTableT <uint64_t, int32_t>m_dict;
|
||||
|
||||
private:
|
||||
int32_t spellcheckDict();
|
||||
|
||||
// always accepts only ascii chars. makeClean() converts unicode into
|
||||
// ascii
|
||||
bool getPhonetic( char *origWord, int32_t origWordLen,
|
||||
char *target, int32_t targetLen );
|
||||
|
||||
bool loadRules();
|
||||
|
||||
bool loadSpellerDict( char *spellerBuf, int32_t spellerbufSize,
|
||||
int32_t hostsPerSplit, uint32_t myHash );
|
||||
|
||||
//bool loadTitleRecDicts( );
|
||||
|
||||
//bool loadNarrow( char *spellerBuf, int32_t spellerBufSize,
|
||||
// int32_t hostsPerSplit, uint32_t myHash );
|
||||
|
||||
bool loadDictHashTable( );
|
||||
|
||||
//bool genTopPopFile ( char *infile );
|
||||
|
||||
bool genDistributedPopFile ( char *infile, uint32_t myHash );
|
||||
|
||||
//bool cleanDictFile ( );
|
||||
|
||||
bool makeClean( char *inBuf, int32_t inBufSize,
|
||||
char *outBuf, int32_t outBufSize );//, bool isUTF16 );
|
||||
|
||||
//bool makePhonet( char *infile);
|
||||
|
||||
//bool makeDict();
|
||||
|
||||
//bool makeQueryFiles ( );
|
||||
|
||||
//bool makeWikiFiles ( );
|
||||
|
||||
bool loadWikipediaWords();
|
||||
|
||||
bool loadMispelledWords();
|
||||
|
||||
bool hasMispelling(char *phrase, int32_t phraseLen);
|
||||
|
||||
int32_t tryPhonet( char *phonetTmp, char *origPhonet,
|
||||
char *origClean, int32_t tryForScore,
|
||||
Reco *recos, int32_t numRecos, int32_t *lowestScore );
|
||||
|
||||
int32_t editDistance( char *a, char *b, int32_t level, // starting level
|
||||
int32_t limit ); // maximum level
|
||||
|
||||
int32_t weightedAverage(int32_t soundslikeScore, int32_t wordScore);
|
||||
|
||||
int32_t limitEditDistance( char *a, char *b, int32_t limit );
|
||||
|
||||
int32_t limit1EditDistance( char *a, char *b );
|
||||
|
||||
int32_t limit2EditDistance( char *a, char *b );
|
||||
|
||||
int32_t checkRest( char *a, char *b, int32_t w, char *amax, int32_t min );
|
||||
|
||||
int32_t check2( char *a, char *b, int32_t w, char *amax, int32_t min );
|
||||
|
||||
int16_t editDistance( char *a0, char *b0 );
|
||||
|
||||
int16_t reduceScore ( char *a, char *b );
|
||||
|
||||
//bool makeWordFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase ,
|
||||
// char *coll );
|
||||
|
||||
//bool makePopFiles ( int32_t numWordsToDump , int32_t numWordsPerPhrase ,
|
||||
// char *coll);
|
||||
|
||||
//bool makeScoreFiles ( int32_t maxWordsPerFile );
|
||||
|
||||
// this map maps a char to a "dict char"
|
||||
//unsigned char m_map [ 256 ];
|
||||
|
||||
// . when comparing letter pairs, we only allow them to consist of
|
||||
// certain chars: 0-9, A-Z, apostrophe and space and \0 otherwise
|
||||
// m_table gets too big. This implies a NUM_CHARS of
|
||||
// . this compressed the value, too
|
||||
// . \0, space, 0-9, A-Z, \' is the ordering
|
||||
//unsigned char to_dict_char ( unsigned char c ) { return m_map[c]; };
|
||||
|
||||
// Temporary unicode workaround for latin-1 compatibility
|
||||
//unsigned char uc_to_dict_char ( UChar c ) {
|
||||
// if (c>255)c=0;
|
||||
// return m_map[c];
|
||||
//};
|
||||
|
||||
// what language loaded
|
||||
int32_t m_lang;
|
||||
|
||||
// what charset does this language use
|
||||
unsigned char m_charset;
|
||||
|
||||
// buffer to store the phonetic rules
|
||||
char *m_rulesBuf;
|
||||
int32_t m_rulesBufSize;
|
||||
char **m_rulesPtr;
|
||||
int32_t m_rulesPtrSize;
|
||||
int32_t m_numRules;
|
||||
// points to the index of each rule that starts with a new character
|
||||
int32_t m_ruleStarts[MAX_CHARS];
|
||||
// the chars that are in a phonet
|
||||
bool m_ruleChars[MAX_CHARS];
|
||||
|
||||
// buffers to store the dictionaries
|
||||
char *m_distributedBuf;
|
||||
int32_t m_distributedBufSize;
|
||||
char **m_tuplePtr;
|
||||
int32_t m_tuplePtrSize;
|
||||
int32_t m_numTuples;
|
||||
|
||||
// total number of phonets
|
||||
int32_t m_numPhonets;
|
||||
|
||||
// narrow phrase
|
||||
char *m_narrowBuf;
|
||||
int32_t m_narrowBufSize;
|
||||
int32_t m_numNarrowPtrs;
|
||||
char **m_frntPtrs;
|
||||
char **m_bckPtrs;
|
||||
int32_t *m_frntCharPtrs;//[NUM_CHARS][NUM_CHARS][NUM_CHARS];
|
||||
int32_t *m_bckCharPtrs;//[NUM_CHARS][NUM_CHARS][NUM_CHARS];
|
||||
|
||||
// m_phonetics stores the hash of the phonetic as the key.
|
||||
// the value is a composite of index in m_tuplePtrs where the list
|
||||
// starts as the high 32 bits of the value and the number of
|
||||
// words having the same phonetic as the low 32 bits of the value
|
||||
HashTableT <uint64_t, uint64_t > m_phonetics;
|
||||
|
||||
// hash table of the distributed pop words dictionary
|
||||
// HashTableT <uint32_t, int32_t> m_titlerecDict;
|
||||
|
||||
// hash table of the distributed pop words dictionary
|
||||
HashTableT <uint64_t, int32_t>m_distributedPopPhrases;
|
||||
|
||||
// hash table of the top popular words in the dictionary
|
||||
// HashTableT <uint32_t, char *> m_topPopPhrases;
|
||||
|
||||
// hash table of mispelled words
|
||||
HashTableT <uint32_t, bool>m_misp;
|
||||
|
||||
// hash table of wikipedia words
|
||||
HashTableT <uint32_t, bool>m_wiki;
|
||||
|
||||
// PARMS, which can be adjusted. Currently all languages have the
|
||||
// same adjustments, so using the same parms.
|
||||
int32_t m_editDistanceWeightsDel1;
|
||||
int32_t m_editDistanceWeightsDel2;
|
||||
int32_t m_editDistanceWeightsSwap;
|
||||
int32_t m_editDistanceWeightsSub;
|
||||
int32_t m_editDistanceWeightsSimilar;
|
||||
int32_t m_editDistanceWeightsMin;
|
||||
int32_t m_editDistanceWeightsMax;
|
||||
int32_t m_soundslikeWeight;
|
||||
int32_t m_wordWeight;
|
||||
int32_t m_span;
|
||||
|
||||
bool m_followup;
|
||||
bool m_collapseResult;
|
||||
bool m_removeAccents;
|
||||
};
|
||||
|
||||
#endif
|
@ -12,46 +12,6 @@
|
||||
|
||||
LanguageIdentifier g_langId;
|
||||
|
||||
/// List of TLDs that should not be used for language detection.
|
||||
/// NULL terminated.
|
||||
///
|
||||
/// Sadly, .de seems to be about half German pages and about half
|
||||
/// English as well. We cannot use it to distinguish language.
|
||||
/// Also, .at has some english pages.
|
||||
/// Also, .nl has some english pages.
|
||||
/// Also, .no has some english pages.
|
||||
/// Also, .vn has some english pages.
|
||||
/// Also, .ro has some english pages.
|
||||
/// Also, .gr has some english pages.
|
||||
/// Also, .th has some english pages.
|
||||
/// Also, .pl has some english pages.
|
||||
/// Also, .gs has some english pages.
|
||||
///
|
||||
/// (Pretty soon it will be faster to have a list of domains that
|
||||
/// WILL work instead of domains that won't.)
|
||||
///
|
||||
static char *ambiguousTLDs[] = {
|
||||
"info",
|
||||
"com",
|
||||
"org",
|
||||
"net",
|
||||
"mil",
|
||||
"de",
|
||||
"at",
|
||||
"tv",
|
||||
"nl",
|
||||
"no",
|
||||
"ws",
|
||||
"vn",
|
||||
"ro",
|
||||
"ru",
|
||||
"gr",
|
||||
"th",
|
||||
"pl",
|
||||
"gs",
|
||||
NULL
|
||||
};
|
||||
|
||||
const uint8_t *langToTopic[] = {
|
||||
(uint8_t*)"Unknown",
|
||||
(uint8_t*)"English",
|
||||
@ -82,527 +42,10 @@ const uint8_t *langToTopic[] = {
|
||||
(uint8_t*)"Tagalog"
|
||||
};
|
||||
|
||||
#define MAX_DOCTYPE_SEARCH_LEN (512)
|
||||
|
||||
/// Find a language tag in a DOCTYPE element.
///
/// The first double-quoted string in content is taken to be the public
/// identifier (e.g. "-//W3C//DTD HTML 4.01//EN"); the language tag is
/// whatever follows the last '/' inside those quotes.
///
/// The backward search for the slash is confined to the quoted string, so a
/// slash elsewhere in content is never mistaken for the separator, and the
/// tag is found even when the closing quote is the last character.
///
/// @param content NUL-terminated text containing the DOCTYPE
///
/// @return pointer into content at the first character of the language tag,
///         or NULL if content is NULL, has no quoted string, or the quoted
///         string contains no slash
///
static char * FindLanguageIndex(char *content) {
	if(!content)
		return(NULL);

	char *openQuote = strchr(content, '"');
	if(!openQuote)
		return(NULL);

	char *closeQuote = strchr(openQuote + 1, '"');
	if(!closeQuote)
		return(NULL);

	// walk back from the closing quote to the last slash
	for(char *p = closeQuote - 1; p > openQuote; p--) {
		if(*p == '/')
			return(p + 1);
	}
	return(NULL);
}
|
||||
|
||||
/// Copy a language tag, lower-casing it.
///
/// Copies characters from src until a double quote, the end of src, or the
/// capacity of dst is reached. dst is always NUL-terminated on success.
///
/// @param dst the destination
/// @param src the source (returned from FindLanguageIndex())
/// @param maxSize capacity of dst in bytes, INCLUDING the terminating NUL;
///                at most maxSize - 1 characters are copied
///
/// @return true on successful copy, false if dst or src is NULL or
///         maxSize is less than 1
///
static bool copyLangTag(char *dst, char *src, int maxSize) {
	if(!dst || !src || maxSize < 1)
		return(false);

	int len = 0;
	// test the bound BEFORE writing so that one byte is always left for
	// the terminator, even when maxSize is 1
	while ( *src && *src != '"' && len + 1 < maxSize ) {
		// tolower() needs a value representable as unsigned char
		*dst++ = (char)tolower((unsigned char)*src++);
		len++;
	}
	*dst = '\0';
	return(true);
}
|
||||
|
||||
|
||||
LanguageIdentifier::LanguageIdentifier() {
|
||||
return;
|
||||
}
|
||||
|
||||
inline bool LanguageIdentifier::isAmbiguousTLD(char *tld, int len) {
|
||||
register int x;
|
||||
for(x = 0; ambiguousTLDs[x]; x++) {
|
||||
if(!strncmp(tld, ambiguousTLDs[x],
|
||||
maxOf(len, gbstrlen(ambiguousTLDs[x]))))
|
||||
return(true);
|
||||
}
|
||||
return(false);
|
||||
}
|
||||
|
||||
uint8_t getLanguageFromAbbr2 ( char *str , int32_t len ) {
|
||||
// truncate
|
||||
if ( len > 5 ) len = 5;
|
||||
// copy it and check it
|
||||
char lang[6];
|
||||
for ( int32_t j = 0 ; j < len ; j++ )
|
||||
lang[j] = to_lower_a(str[j]);
|
||||
lang[len]='\0';
|
||||
return getLanguageFromAbbr(lang);
|
||||
}
|
||||
|
||||
uint8_t LanguageIdentifier::guessLanguageFromTag(Xml *xml) {
|
||||
uint8_t rv = langUnknown;
|
||||
int32_t len = 0;
|
||||
//char lang[6];
|
||||
int id;
|
||||
char *str;
|
||||
|
||||
if(!xml) return(langUnknown);
|
||||
|
||||
for(int32_t i = 0; i < xml->getNumNodes(); i++) {
|
||||
id = xml->getNodeId(i);
|
||||
|
||||
// look for meta tag
|
||||
if(id == TAG_META) {
|
||||
str = (char *) xml->getString(i, "name", &len);
|
||||
if(str &&
|
||||
(!strncasecmp(str, "Content-Language",16) ||
|
||||
!strncasecmp(str, "language",8) ||
|
||||
!strncasecmp(str, "Content_Language",16) ) ) {
|
||||
str = (char *) xml->getString(i, "content", &len);
|
||||
rv = getLanguageFromAbbr2(str,len);
|
||||
if(rv != langUnknown) return(rv);
|
||||
}
|
||||
else {
|
||||
str = (char *) xml->getString(i, "http-equiv", &len);
|
||||
if(str && !strncasecmp(str, "Language", 8) ) {
|
||||
str = (char *) xml->getString(i, "content", &len);
|
||||
rv = getLanguageFromAbbr2(str,len);
|
||||
if(rv != langUnknown) return(rv);
|
||||
}
|
||||
}
|
||||
} // end looking for meta tag
|
||||
|
||||
|
||||
if(id != TAG_HTML && // html
|
||||
id != TAG_BODY && // body
|
||||
id != TAG_HEAD) // head
|
||||
continue;
|
||||
|
||||
str = (char *) xml->getString(i, "lang", &len);
|
||||
rv = getLanguageFromAbbr2(str,len);
|
||||
if(rv != langUnknown) return(rv);
|
||||
}
|
||||
return(rv);
|
||||
}
|
||||
|
||||
// Guess a page's language from the TLDs of its outlinks.
//
// Every outlink whose TLD is not in the ambiguous list adds one vote to each
// language id (1..31) that g_langList reports as valid for that TLD. The
// language with the most votes is returned only if it beats the runner-up
// by more than 5 votes; otherwise langUnknown is returned.
//
// Pages with fewer than 15 outlinks are not judged, and only the first 100
// outlinks are examined so that very large pages stay cheap.
uint8_t LanguageIdentifier::guessLanguageFromOutlinks(Links *links) {
	if(!links) return(langUnknown);

	// too few links to trust the vote
	if(links->getNumLinks() < 15) {
		return(langUnknown);
	}

	char host[MAX_URL_LEN];
	int32_t votes[32];
	memset(votes, 0, sizeof(votes));

	// trim to only 100 links to prevent
	// spinning on some large pages
	for(int lc = 0; lc < links->getNumLinks() && lc < 100; lc++) {
		char *url = links->getLink(lc);
		if(!url) continue;

		// NOTE(review): assumes getLinkLen() is the length of the link
		// text in chars, as the original "getLinkLen(lc) - 7" did
		int urlLen = links->getLinkLen(lc);

		// skip "http://"; links too short to hold anything after it
		// are ignored instead of being read past their end
		if(urlLen <= 7) continue;
		char *src    = url + 7;
		char *srcEnd = url + urlLen;

		// copy the host part, never writing past the local buffer
		char *dst    = host;
		char *dstEnd = host + sizeof(host) - 1;
		while(src < srcEnd && dst < dstEnd && *src && *src != '/')
			*dst++ = *src++;
		*dst = '\0';

		char *tld = strrchr(host, '.');
		if(!tld) continue;
		// skip to tld
		tld++;

		// only bother if not a common TLD
		int tldLen = gbstrlen(tld);
		if(isAmbiguousTLD(tld, tldLen)) continue;

		for(uint8_t l = 1; l < 32; l++) {
			if(g_langList.isLangValidForTld(tld, tldLen, l))
				votes[l]++;
		}
	}

	// look for a clear winner from the list
	// don't bother with langUnknown, it reduces hits
	int     best     = 0;
	int     runnerUp = 0;
	uint8_t bestLang = 0;
	for(uint8_t l = 1; l < 32; l++) {
		if(votes[l] >= best) {
			// previous leader drops to second place
			runnerUp = best;
			best     = votes[l];
			bestLang = l;
		}
		else if(votes[l] > runnerUp) {
			// keep the true second place even when the leader
			// was seen first
			runnerUp = votes[l];
		}
	}

	// 1st place must beat 2nd place by 5
	if(best - runnerUp > 5) {
		return(bestLang);
	}
	return(langUnknown);
}
|
||||
|
||||
uint8_t LanguageIdentifier::guessLanguageFromTld(char *linktext) {
|
||||
#if 0
|
||||
// This is not a good check of language
|
||||
int len = 0;
|
||||
char *cp;
|
||||
|
||||
if(!linktext) return(langUnknown);
|
||||
|
||||
// skip http://
|
||||
cp = linktext + 7;
|
||||
|
||||
// if no slash, start at the end of the link
|
||||
if(!(cp = strchr(cp, '/')))
|
||||
cp = linktext + (gbstrlen(linktext) - 1);
|
||||
|
||||
// find last dot
|
||||
while(*cp && cp > linktext && *cp != '.') {
|
||||
cp--;
|
||||
len++;
|
||||
}
|
||||
|
||||
// skip '.'
|
||||
len--; cp++;
|
||||
|
||||
if(len != 2) return(langUnknown);
|
||||
#endif // 0
|
||||
|
||||
return(langUnknown);
|
||||
|
||||
}
|
||||
|
||||
/// Guess the language from the languages of the inlinks.
///
/// Every inlink returned by linkInfo->getNextInlink() whose m_language is
/// a valid, non-zero id and whose IP differs from 'ip' in the bits kept by
/// the 0x0000ffff mask casts one vote. At least 7 votes are needed, and
/// the top language must hold more than half of getNumGoodInlinks().
///
/// @param linkInfo the inlink information; NULL yields langUnknown
/// @param ip       the IP of the page itself, used to discount inlinks
///                 from a similar address
///
/// @return the language, or langUnknown
///
uint8_t LanguageIdentifier::guessLanguageFromInlinks(LinkInfo *linkInfo, int32_t ip) {
	if(!linkInfo) return(langUnknown);

	if(linkInfo->getNumGoodInlinks() < 7) return(langUnknown);

	// 32-bit counters: the loop below has no cap on the number of
	// inlinks, so byte-sized counters could wrap around.
	int32_t counts[32];
	memset(counts, 0, sizeof(counts));

	int32_t hits = 0;

	for (Inlink *k = NULL; (k = linkInfo->getNextInlink(k)); ) {
		int32_t id = k->m_language;
		// sanity check, we are still getting bad lang ids!!
		if ( id < 0 || id >= 32 ) {
			log("build: Got bad lang id of %"INT32". how can this "
			    "happen?",id);
			continue;
		}
		// don't count langUnknown pages, it reduces hits
		if ( ! id ) continue;

		// skip if not from a different enough IP
		if((k->m_ip&0x0000ffff)==(ip&0x0000ffff) )
			continue;
		// otherwise count it
		counts[id]++;
		hits++;
	}

	if(hits < 7) return(langUnknown);

	// pick the language with the most votes; on a tie the higher
	// language id wins, as before
	int32_t best = 0;
	uint8_t bestLang = 0;
	for(int32_t x = 1; x < 32; x++) {
		if(counts[x] >= best) {
			best = counts[x];
			bestLang = (uint8_t)x;
		}
	}

	// Need better than 50%
	if(best > (linkInfo->getNumGoodInlinks() / 2))
		return(bestLang);

	return(langUnknown);
}
|
||||
|
||||
/// Guess the language from the DOCTYPE string present in many pages.
///
/// Scans the xml nodes for TAG_DOCTYPE nodes. The first such node in
/// which FindLanguageIndex() locates something decides the result: the
/// located text is copied with copyLangTag() and mapped with
/// getLanguageFromAbbr(). Later DOCTYPE nodes are not examined.
///
/// @param xml     the page's xml object; NULL yields langUnknown
/// @param content the page's content; only checked for NULL here
///
/// @return the language, or langUnknown
///
uint8_t LanguageIdentifier::guessLanguageFromDoctype(Xml *xml, char *content) {
	if(!xml || !content) return(langUnknown);

	for(int32_t i = 0; i < xml->getNumNodes(); i++) {
		// skip if not DOCTYPE
		if ( xml->getNodeId(i) != TAG_DOCTYPE ) continue;

		// get the ptr to the tag
		char *tag = xml->getNode(i);

		// NOTE(review): FindLanguageIndex() presumably returns a
		// pointer to the language code inside the tag -- confirm.
		char *str = FindLanguageIndex(tag);
		if(!str) continue;

		// zero-filled so the buffer is terminated regardless of how
		// many of the 5 requested bytes copyLangTag() writes
		char lang[6] = {0};
		uint8_t rvDoc = langUnknown;
		if(copyLangTag(lang, str, 5))
			rvDoc = getLanguageFromAbbr(lang);
		return(rvDoc);
	}

	return(langUnknown);
}
|
||||
|
||||
/// Skip whitespace in a string.
///
/// Whitespace is space, tab, CR and LF.
///
/// @param str the string, may be NULL
///
/// @return pointer to the first character that is not whitespace (this
///         is the terminating NUL when only whitespace remains), or NULL
///         if str is NULL
///
static char *skipwhite(char *str) {
	if(!str) return(str);
	while(*str == ' '  ||
	      *str == '\t' ||
	      *str == '\n' ||
	      *str == '\r')
		str++;
	return(str);
}
|
||||
|
||||
/// Skip over a 'word' in a string.
///
/// Skips over everything until there is whitespace (space, tab, CR or LF)
/// or the end of the string.
///
/// @param str the string to search, may be NULL
///
/// @return pointer to the next whitespace character or to the
///         terminating NUL, or NULL if str is NULL
///
static char *skipword(char *str) {
	if(!str) return(str);
	while(*str          &&
	      *str != ' '   &&
	      *str != '\t'  &&
	      *str != '\n'  &&
	      *str != '\r')
		str++;
	return(str);
}
|
||||
|
||||
uint8_t LanguageIdentifier::guessLanguageFromUserAgent(char *str) {
|
||||
// Mozilla/5.0 (X11; U; Linux i686;
|
||||
// en-US; rv:1.8.1.4) Gecko/20070531 Firefox/2.0.0.4
|
||||
uint8_t lang = langUnknown;
|
||||
while(*str) {
|
||||
if(!(str = skipwhite(str)))
|
||||
return(langUnknown);
|
||||
if((lang = getLanguageFromUserAgent(str)) != langUnknown)
|
||||
return(lang);
|
||||
if(!(str = skipword(str)))
|
||||
return(langUnknown);
|
||||
}
|
||||
return(langUnknown);
|
||||
}
|
||||
|
||||
uint8_t LanguageIdentifier::guessLanguageFromDMOZ(char *addr) {
|
||||
return(g_categories->findLanguage(addr));
|
||||
}
|
||||
|
||||
/// Guess the query language from the query terms.
///
/// A single-term query takes the language g_langList.lookup() reports
/// for that term. Otherwise the terms are scanned in order for two
/// consecutive terms with the same known language. This is not as good
/// as a frequency count, but much faster.
///
/// @param q the query object; NULL yields langUnknown
///
/// @return the language, or langUnknown
///
uint8_t LanguageIdentifier::guessLanguageFromQuery(Query *q) {
	if(!q) return(langUnknown);

	uint8_t lang = langUnknown;
	int32_t numTerms = q->getNumTerms();

	if(numTerms == 1) {
		// the only term is at index 0
		if(g_langList.lookup(q->getTermId(0), &lang))
			return(lang);
		return(langUnknown);
	}

	// Look for two consecutive identical languages
	bool havePrev = false;
	uint8_t prev = langUnknown;
	for(int32_t i = 0; i < numTerms; i++) {
		// an unrecognised term breaks the run
		if(!g_langList.lookup(q->getTermId(i), &lang) ||
		   lang == langUnknown) {
			havePrev = false;
			continue;
		}
		if(havePrev && prev == lang)
			return(lang);
		// remember this term's language for the next comparison
		prev = lang;
		havePrev = true;
	}
	return(langUnknown);
}
|
||||
|
||||
uint8_t LanguageIdentifier::getBestLanguage(char** method,
|
||||
Url* url,
|
||||
Xml* xml,
|
||||
Links* links,
|
||||
LinkInfo* linkInfo,
|
||||
char* content) {
|
||||
uint8_t langEnum;
|
||||
// Let the site tell us what language it's in
|
||||
langEnum = g_langId.guessLanguageFromTag(xml);
|
||||
*method = "Tag";
|
||||
|
||||
if(langEnum != langUnknown) return langEnum;
|
||||
|
||||
// Get the language from a DMOZ category
|
||||
// Accurate, but low hit rate
|
||||
langEnum = g_langId.guessLanguageFromDMOZ(url->getUrl());
|
||||
*method = "DMOZ";
|
||||
if(langEnum != langUnknown) return langEnum;
|
||||
|
||||
|
||||
// Guess from the TLD
|
||||
uint8_t possibleLanguage = g_langId.guessLanguageFromTld(url->getUrl());
|
||||
if(possibleLanguage) langEnum = possibleLanguage;
|
||||
*method = "TLD";
|
||||
if(langEnum != langUnknown) return langEnum;
|
||||
|
||||
// m_newDoc->getLinks() can return a bad address
|
||||
// Guess from the outlinks
|
||||
langEnum = g_langId.guessLanguageFromOutlinks(links);
|
||||
*method = "Outlinks";
|
||||
if(langEnum != langUnknown) return langEnum;
|
||||
// m_newDoc->getLinks() can return a bad address
|
||||
|
||||
// Guess from the inlinks
|
||||
// langEnum = g_langId.guessLanguageFromInlinks(linkInfo);
|
||||
// *method = "Inlinks";
|
||||
if(langEnum != langUnknown) return langEnum;
|
||||
|
||||
// Word frequency count
|
||||
langEnum = xml->getLanguage();
|
||||
*method = "Freq";
|
||||
if(langEnum != langUnknown) return langEnum;
|
||||
|
||||
// Let the doctype tell us what language it's in
|
||||
langEnum = g_langId.guessLanguageFromDoctype(xml, content);
|
||||
*method = "Doctype";
|
||||
|
||||
return langEnum;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Extract the strongest languages from a per-language count vector.
///
/// Repeatedly picks the language with the highest remaining count and
/// stores its id and its share (percent of all counts for known
/// languages) in the output arrays. Stops after tagVecSize entries, when
/// no counts remain, or when a candidate has less than half the count of
/// the best language.
///
/// NOTE: langCount is consumed. The langUnknown slot and every selected
/// slot are zeroed.
///
/// @param langCount  MAX_LANGUAGES counts indexed by language id; read
///                   as unsigned bytes so counts above 127 stay positive
/// @param langIds    out: selected language ids, strongest first
/// @param langScores out: percentage share for each selected language
/// @param tagVecSize capacity of langIds and langScores
///
/// @return number of entries written (0 if nothing usable)
///
uint8_t LanguageIdentifier::getBestLangsFromVec(char* langCount,
						 int32_t *langIds ,
						 uint8_t *langScores ,
						 int32_t tagVecSize) {
	if(!langCount || !langIds || !langScores) return 0;

	// plain char may be signed; counts are never negative
	uint8_t *counts = (uint8_t *)langCount;

	int32_t bestCount = -1;
	uint8_t numTags = 0;

	int32_t langTotal = 0;
	for(int32_t j = 0; j < MAX_LANGUAGES; j++) {
		langTotal += counts[j];
	}
	if(langTotal == 0 || counts[langUnknown] == langTotal)
		return 0;

	//dont store unknown language
	langTotal -= counts[langUnknown];
	counts[langUnknown] = 0;

	for(int32_t i = 0; i < tagVecSize; i++) {
		int32_t maxCount = 0;
		int32_t maxCountNdx = 0;
		for(int32_t j = 0; j < MAX_LANGUAGES; j++) {
			if(counts[j] > maxCount) {
				maxCount = counts[j];
				maxCountNdx = j;
			}
		}
		if(i == 0) bestCount = maxCount;
		//if none found or this one is half as much as previous
		//then quit.
		if(maxCount == 0 ||
		   maxCount < (bestCount/2)) break;
		langIds   [i] = maxCountNdx;
		langScores[i] = (uint8_t)((maxCount * 100.0) / langTotal);
		// remove the winner so the next pass finds the runner-up
		counts[maxCountNdx] = 0;
		numTags++;
	}
	return numTags;
}
|
||||
|
||||
|
||||
uint8_t LanguageIdentifier::findLangFromDMOZTopic(char *topic) {
|
||||
int x;
|
||||
for(x = 0; x < (int)(sizeof(langToTopic)/sizeof(uint8_t *)); x++) {
|
||||
@ -614,228 +57,6 @@ uint8_t LanguageIdentifier::findLangFromDMOZTopic(char *topic) {
|
||||
return(langUnknown);
|
||||
}
|
||||
|
||||
uint8_t LanguageIdentifier::guessGBLanguageFromUrl(char *url) {
|
||||
if(!url) return(langUnknown);
|
||||
uint8_t lang;
|
||||
if((lang = guessLanguageFromUrl(url)) != langUnknown)
|
||||
return(lang);
|
||||
char code[6];
|
||||
char *cp = url;
|
||||
memset(code, 0, 6);
|
||||
for(int x = 0; x < 6; x++) {
|
||||
if((cp[x] < 'a' || cp[x] > 'z') &&
|
||||
(cp[x] < 'A' || cp[x] > 'Z') &&
|
||||
cp[x] != '_' && cp[x] != '-')
|
||||
break;
|
||||
code[x] = cp[x];
|
||||
}
|
||||
return(getLanguageFromCountryCode(code));
|
||||
}
|
||||
|
||||
/// Tell whether a byte separates words in a url.
///
/// Every 7-bit value that is not an ASCII letter or digit counts as a
/// boundary, including the terminating NUL. Bytes of 128 and above never
/// count as boundaries.
///
/// @param x the byte to classify
///
/// @return true if x is a boundary
///
static inline bool s_checkCharIsBoundary(uint8_t x) {
	if(x >= 128) return(false);
	bool isDigit = (x >= '0' && x <= '9');
	bool isUpper = (x >= 'A' && x <= 'Z');
	bool isLower = (x >= 'a' && x <= 'z');
	return(!(isDigit || isUpper || isLower));
}
|
||||
|
||||
static inline bool s_isRightBoundedAbbr(char *pointer, uint8_t l) {
|
||||
if(s_checkCharIsBoundary(*(pointer + 2)))
|
||||
return(true);
|
||||
if((*(pointer + 3) == '-' || *(pointer + 3) == '_') &&
|
||||
s_checkCharIsBoundary(*(pointer + 5)))
|
||||
return(true);
|
||||
return(false);
|
||||
}
|
||||
|
||||
static inline bool s_isRightBoundedLanguageWord(char *pointer, uint8_t l) {
|
||||
if(s_checkCharIsBoundary(*(pointer + gbstrlen(getNativeLanguageString(l)))))
|
||||
return(true);
|
||||
if(s_checkCharIsBoundary(*(pointer + gbstrlen(getLanguageString(l)))))
|
||||
return(true);
|
||||
return(false);
|
||||
}
|
||||
|
||||
uint8_t s_lookForLanguageParam(char *url) {
|
||||
char *cp = url;
|
||||
uint8_t l;
|
||||
// Try to find lan= or lang= or language=
|
||||
while(cp && *cp && (cp = strstr(cp, "lan"))) {
|
||||
if(!s_checkCharIsBoundary(*(cp - 1))) {
|
||||
cp++;
|
||||
continue;
|
||||
}
|
||||
if(!strncmp(cp, "lan=", 4)) cp += 4;
|
||||
else if(!strncmp(cp, "lang=", 5)) cp += 5;
|
||||
else if(!strncmp(cp, "language=", 9)) cp += 9;
|
||||
|
||||
if((l = getLanguageFromName((uint8_t*)cp)) &&
|
||||
s_isRightBoundedLanguageWord(cp, l))
|
||||
return(l);
|
||||
|
||||
if((l = getLanguageFromAbbrN(cp)) &&
|
||||
s_isRightBoundedAbbr(cp, l))
|
||||
return(l);
|
||||
cp++;
|
||||
}
|
||||
// Try to find l=
|
||||
cp = url;
|
||||
while(cp && *cp && (cp = strstr(cp, "l="))) {
|
||||
if(!s_checkCharIsBoundary(*(cp - 1))) {
|
||||
cp++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if((l = getLanguageFromName((uint8_t*)cp)) &&
|
||||
s_isRightBoundedLanguageWord(cp, l))
|
||||
return(l);
|
||||
|
||||
if((l = getLanguageFromAbbrN(cp)) &&
|
||||
s_isRightBoundedAbbr(cp, l))
|
||||
return(l);
|
||||
cp++;
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
|
||||
uint8_t s_lookForLanguagePrefix(char *url) {
|
||||
char *cp = url;
|
||||
uint8_t l = 0;
|
||||
// Look for a prefix on the url
|
||||
// Do not add a postfix or TLD detector,
|
||||
// they are not good indications at all.
|
||||
if(!strncmp(url, "http://", 7)) cp = url + 7;
|
||||
else cp = url;
|
||||
|
||||
if((l = getLanguageFromAbbrN(cp)) &&
|
||||
s_isRightBoundedAbbr(cp, l))
|
||||
return(l);
|
||||
|
||||
// Lookup, and see if it's on a word boundary
|
||||
if((l = getLanguageFromName((uint8_t*)cp)) &&
|
||||
s_isRightBoundedLanguageWord(cp, l))
|
||||
return(l);
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
||||
uint8_t LanguageIdentifier::guessLanguageFromUrl(char *url) {
|
||||
int len = 0;
|
||||
char *cp = url;
|
||||
char code[3];
|
||||
uint8_t l = 0;
|
||||
|
||||
if(!url) return(langUnknown);
|
||||
|
||||
// Look for a parameter that would indicate the language
|
||||
if((l = s_lookForLanguageParam(url))) return(l);
|
||||
|
||||
// Look for a prefix that would indicate the language
|
||||
if((l = s_lookForLanguagePrefix(url))) return(l);
|
||||
|
||||
// if no slash, start at the end of the link
|
||||
if(!(cp = strchr(url, '/')))
|
||||
cp = url + (gbstrlen(url) - 1);
|
||||
|
||||
// find last dot
|
||||
while(*cp && cp > url && *cp != '.') {
|
||||
cp--;
|
||||
len++;
|
||||
}
|
||||
|
||||
// No dot?
|
||||
if(cp <= url) return(langUnknown);
|
||||
|
||||
// skip '.'
|
||||
len--; cp++;
|
||||
|
||||
code[0] = cp[0];
|
||||
code[1] = cp[1];
|
||||
code[2] = 0;
|
||||
|
||||
return(getLanguageFromCountryCode(code));
|
||||
}
|
||||
|
||||
/// Find the index of the single largest value in a list.
///
/// @param list     the values, may be NULL
/// @param numItems number of entries in list
///
/// @return the index of the maximum if exactly one entry holds it;
///         0 if the maximum is shared, the list is NULL or empty, or the
///         only value present is INT_MIN. Note that 0 is also a valid
///         index, so callers cannot tell "index 0 won" from "no winner".
///
static inline int s_findMaxInList(int *list, int numItems) {
	if(!list) return(0);

	int best = INT_MIN;
	int bestIdx = 0;
	// set while 'best' is held by exactly one entry seen so far
	int unique = 0;

	for(int i = 0; i < numItems; i++) {
		int v = list[i];
		if(v > best) {
			best = v;
			bestIdx = i;
			unique = 1;
		}
		else if(v == best) {
			unique = 0;
		}
	}
	return(unique ? bestIdx : 0);
}
|
||||
|
||||
/// Guess the language by letting each text node vote.
///
/// For every non-tag node of at least 2 bytes (up to pageLimit nodes) the
/// phrase record from g_speller.getPhraseRecord() is parsed as
/// tab-separated "language<TAB>score" pairs. The language with the single
/// highest score in that record gets one vote; records without a single
/// winner vote for index 0. The language with the most votes is returned
/// unless it is tied for first place.
///
/// @param xml       the page's xml object; NULL yields langUnknown
/// @param pageLimit maximum number of nodes to examine
///
/// @return the language, or langUnknown
///
uint8_t LanguageIdentifier::guessLanguageFreqCount(Xml *xml,
						    int pageLimit /* = 512 */) {
	if(!xml) return(langUnknown);

	int votes[MAX_LANGUAGES];
	int scores[MAX_LANGUAGES];

	int limit = xml->getNumNodes();
	if(pageLimit < limit) limit = pageLimit;

	memset(votes, 0, sizeof(votes));

	// Do term frequency count
	for(int x = 0; x < limit; x++) {
		if(xml->isTag(x) || xml->getNodeLen((int32_t)x) < 2) continue;
		char *cp = g_speller.getPhraseRecord(xml->getNode((int32_t)x),
						     xml->getNodeLen((int32_t)x));
		if(!cp) continue;
		memset(scores, 0, sizeof(scores));
		while(*cp) {
			// skip leading whitespace
			while(*cp && (*cp == ' ' || *cp == '\t')) cp++;
			// get language
			int l = atoi(cp);
			// skip to next delimiter
			while(*cp && *cp != '\t') cp++;
			// record ended without a score: stop here rather
			// than stepping over the terminator
			if(!*cp) break;
			// skip over tab
			cp++;
			// get score, ignoring language ids outside the table
			if(l >= 0 && l < MAX_LANGUAGES)
				scores[l] = atoi(cp);
			// skip to next delimiter
			while(*cp && *cp != '\t') cp++;
		}
		votes[s_findMaxInList(scores, MAX_LANGUAGES)]++;
	}

	// Find max. A later entry equal to the current max replaces it and
	// sets oldmax == max, which is how ties are detected below.
	int max = 0;
	int maxidx = 0;
	int oldmax = 0;
	for(int x = 0; x < MAX_LANGUAGES; x++) {
		if(votes[x] < max) continue;
		oldmax = max;
		max = votes[x];
		maxidx = x;
	}

	if(max == 0) maxidx = 0;

	// Note the winner
	if(oldmax <= 0 || max > oldmax)
		return maxidx;
	return langUnknown;
}
|
||||
|
||||
uint8_t LanguageIdentifier::guessCountryTLD(const char *url) {
|
||||
uint8_t country = 0;
|
||||
char code[3];
|
||||
@ -864,46 +85,3 @@ uint8_t LanguageIdentifier::guessCountryTLD(const char *url) {
|
||||
}
|
||||
return(country);
|
||||
}
|
||||
|
||||
/// Length of the leading word of a string.
///
/// A word ends at the first space, tab, CR, LF, ';', '.', ',' or at the
/// end of the string.
///
/// @param str the string, may be NULL
///
/// @return number of characters in the leading word (0 for NULL)
///
static int s_wordLen(char *str) {
	if(!str) return(0);

	char *end = str;
	for(;;) {
		switch(*end) {
		case '\0':
		case ' ':
		case ';':
		case '\t':
		case '\n':
		case '\r':
		case '.':
		case ',':
			return((int)(end - str));
		default:
			end++;
		}
	}
}
|
||||
|
||||
static bool s_isLangTag(char *str) {
|
||||
int len = s_wordLen(str);
|
||||
if(len == 2) return(true);
|
||||
if(len != 5) return(false);
|
||||
if(str[2] == '_' || str[2] == '-') return(true);
|
||||
return(false);
|
||||
}
|
||||
|
||||
static uint8_t s_getCountryFromSpec(char *str) {
|
||||
char code[6];
|
||||
memset(code, 0,6);
|
||||
gbmemcpy(code, str, s_wordLen(str));
|
||||
for(int x = 0; x < 6; x++)
|
||||
if(code[x] > 'A' && code[x] < 'Z') code[x] -= ('A' - 'a');
|
||||
if(code[2] == '_' || code[2] == '-')
|
||||
return g_countryCode.getIndexOfAbbr(&code[3]);
|
||||
return g_countryCode.getIndexOfAbbr(code);
|
||||
}
|
||||
|
||||
uint8_t LanguageIdentifier::guessCountryFromUserAgent(char *ua) {
|
||||
if(!ua) return(0);
|
||||
uint8_t country = 0;
|
||||
while(*ua) {
|
||||
if(!(ua = skipwhite(ua)))
|
||||
return(0);
|
||||
if(s_isLangTag(ua) &&
|
||||
(country = s_getCountryFromSpec(ua)) != 0)
|
||||
return(country);
|
||||
if(!(ua = skipword(ua)))
|
||||
return(0);
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
@ -3,22 +3,12 @@
|
||||
/// Contains the main utility function, guessLanguage(), and all
|
||||
/// the support routines for detecting the language of a web page.
|
||||
///
|
||||
/// 2007 May 24 09:02:52
|
||||
/// $ID$
|
||||
/// $Author: John Nanney$
|
||||
/// $Workfile$
|
||||
/// $Log$
|
||||
///
|
||||
|
||||
// using a different macro because there's already a Language.h
|
||||
#ifndef LANGUAGEIDENTIFIER_H
|
||||
#define LANGUAGEIDENTIFIER_H
|
||||
|
||||
#include "gb-include.h"
|
||||
#include "Xml.h"
|
||||
#include "Linkdb.h"
|
||||
//#include "LinkInfo.h"
|
||||
#include "Query.h"
|
||||
|
||||
/// Contains methods of language identification by various means.
|
||||
class LanguageIdentifier {
|
||||
@ -29,97 +19,6 @@ class LanguageIdentifier {
|
||||
/// Destructor, does very little.
|
||||
~LanguageIdentifier() { return; }
|
||||
|
||||
/// Get the language from the page's lang="" tag.
|
||||
///
|
||||
/// Looks for a lang="x" property in the HTML, BODY, or HEAD
|
||||
/// tag. Returns the first match. This is usually a very
|
||||
/// accurate guess of the language, since the author of the
|
||||
/// page went through all the trouble to make sure it was
|
||||
/// in there.
|
||||
///
|
||||
/// @param xml the page's xml object
|
||||
///
|
||||
/// @return the language, or langUnknown
|
||||
///
|
||||
uint8_t guessLanguageFromTag(Xml *xml);
|
||||
|
||||
/// Guess the language from the TLDs of outlinks found in the page.
|
||||
///
|
||||
/// TLDs which are ambiguous like .com are skipped.
|
||||
///
|
||||
/// @param links a list of links
|
||||
///
|
||||
/// @return the language, or langUnknown
|
||||
///
|
||||
uint8_t guessLanguageFromOutlinks(Links *links);
|
||||
|
||||
/// Guess the language from the page's TLD.
|
||||
///
|
||||
/// @param linktext the ascii URL
|
||||
///
|
||||
/// @return the language, or langUnknown
|
||||
///
|
||||
uint8_t guessLanguageFromTld(char *linktext);
|
||||
|
||||
/// Guess the language from the languages of the inlinks.
|
||||
///
|
||||
/// @param linkInfo
|
||||
///
|
||||
/// @return the language, or langUnknown
|
||||
///
|
||||
uint8_t guessLanguageFromInlinks(LinkInfo *linkInfo, int32_t ip);
|
||||
|
||||
/// Determine whether a given TLD is suitable for language detection.
|
||||
/// @param tld the TLD in ascii
|
||||
/// @param len the length of tld
|
||||
/// @return true if suitable, false if not
|
||||
///
|
||||
inline bool isAmbiguousTLD(char *tld, int len);
|
||||
|
||||
/// Return the greater of two ints.
|
||||
inline int maxOf(int a, int b) {
|
||||
if(b > a) return(b);
|
||||
return(a);
|
||||
}
|
||||
|
||||
/// Guesses language from the DOCTYPE string present in many pages.
|
||||
///
|
||||
/// @param xml the page's xml object
|
||||
/// @param content the page's content, for finding the doctype
|
||||
///
|
||||
/// @return the language, or langUnknown
|
||||
///
|
||||
uint8_t guessLanguageFromDoctype(Xml *xml, char *content);
|
||||
|
||||
/// Guess a language from a tag in the user agent string.
|
||||
///
|
||||
/// @param str the user agent string
|
||||
///
|
||||
/// @return the language, or langUnknown
|
||||
///
|
||||
uint8_t guessLanguageFromUserAgent(char *str);
|
||||
|
||||
/// Find an address in DMOZ for the language.
|
||||
///
|
||||
/// Looks up the page address in the category language tables.
|
||||
///
|
||||
/// @param addr the page address
|
||||
///
|
||||
/// @return language, or langUnknown if not found
|
||||
///
|
||||
uint8_t guessLanguageFromDMOZ(char *addr);
|
||||
|
||||
/// Guess the query language from the query terms.
|
||||
///
|
||||
/// This algorithm looks for two consecutive terms with the
|
||||
/// same language.
|
||||
///
|
||||
/// @param q the query object
|
||||
///
|
||||
/// @return the language, or langUnknown
|
||||
///
|
||||
uint8_t guessLanguageFromQuery(Query *q);
|
||||
|
||||
/// Find a language from DMOZ topic.
|
||||
///
|
||||
/// The function name is a bit misleading, we expect
|
||||
@ -131,29 +30,7 @@ class LanguageIdentifier {
|
||||
///
|
||||
uint8_t findLangFromDMOZTopic(char *topic);
|
||||
|
||||
|
||||
uint8_t getBestLanguage(char** method,
|
||||
Url* url,
|
||||
Xml* xml,
|
||||
Links* links,
|
||||
LinkInfo* linkInfo,
|
||||
char* content);
|
||||
|
||||
uint8_t getBestLangsFromVec(char* langCount,
|
||||
//SiteType* typeVec,
|
||||
int32_t *langIds ,
|
||||
uint8_t *langScores ,
|
||||
int32_t tagVecSize);
|
||||
|
||||
uint8_t guessGBLanguageFromUrl(char *url);
|
||||
uint8_t guessLanguageFromUrl(char *url);
|
||||
|
||||
uint8_t guessLanguageFreqCount(Xml *xml,
|
||||
int pageLimit /* = 512 */);
|
||||
|
||||
uint8_t guessCountryTLD(const char *url);
|
||||
|
||||
uint8_t guessCountryFromUserAgent(char *ua);
|
||||
};
|
||||
|
||||
extern class LanguageIdentifier g_langId;
|
||||
|
7
Makefile
7
Makefile
@ -45,7 +45,7 @@ OBJS = UdpSlot.o Rebalance.o \
|
||||
Speller.o \
|
||||
PingServer.o StopWords.o TopTree.o \
|
||||
Parms.o Pages.o \
|
||||
Unicode.o iana_charset.o Iso8859.o \
|
||||
Unicode.o iana_charset.o \
|
||||
SearchInput.o \
|
||||
Categories.o Msg2a.o PageCatdb.o PageDirectory.o \
|
||||
SafeBuf.o Datedb.o \
|
||||
@ -56,7 +56,7 @@ OBJS = UdpSlot.o Rebalance.o \
|
||||
PageLogView.o Msg1f.o Blaster.o MsgC.o \
|
||||
PageSpam.o Proxy.o PageThreads.o Linkdb.o \
|
||||
matches2.o LanguageIdentifier.o \
|
||||
Language.o Repair.o Process.o \
|
||||
Repair.o Process.o \
|
||||
Abbreviations.o \
|
||||
RequestTable.o TuringTest.o Msg51.o \
|
||||
Msg40.o Msg4.o SpiderProxy.o \
|
||||
@ -477,9 +477,6 @@ Spider.o:
|
||||
test_parser2.o:
|
||||
$(CXX) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
|
||||
Language.o:
|
||||
$(CXX) $(DEFS) $(CPPFLAGS) -O3 -c $*.cpp
|
||||
|
||||
PostQueryRerank.o:
|
||||
$(CXX) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
|
||||
|
@ -11,8 +11,6 @@
|
||||
static void gotReplyWrapper3a ( void *state , void *state2 ) ;
|
||||
//static void gotRerankedDocIds ( void *state );
|
||||
|
||||
int32_t *g_ggg = NULL;
|
||||
|
||||
Msg3a::Msg3a ( ) {
|
||||
constructor();
|
||||
}
|
||||
|
3
Msg3a.h
3
Msg3a.h
@ -49,9 +49,6 @@ public:
|
||||
void (* callback) ( void *state ) ,
|
||||
class Host *specialHost = NULL );
|
||||
|
||||
|
||||
bool gotTermFreqs();
|
||||
|
||||
// Msg40 calls this to get Query m_q to pass to Summary class
|
||||
Query *getQuery ( ) { return m_q ; };
|
||||
|
||||
|
20
Msg40.cpp
20
Msg40.cpp
@ -25,7 +25,7 @@ bool printHttpMime ( class State0 *st ) ;
|
||||
|
||||
//static void handleRequest40 ( UdpSlot *slot , int32_t netnice );
|
||||
//static void gotExternalReplyWrapper ( void *state , void *state2 ) ;
|
||||
static void gotCacheReplyWrapper ( void *state );
|
||||
//static void gotCacheReplyWrapper ( void *state );
|
||||
static void gotDocIdsWrapper ( void *state );
|
||||
static bool gotSummaryWrapper ( void *state );
|
||||
//static void didTaskWrapper ( void *state );
|
||||
@ -508,15 +508,15 @@ bool Msg40::gotExternalReply ( ) {
|
||||
*/
|
||||
|
||||
// msg17 calls this after it gets a reply
|
||||
void gotCacheReplyWrapper ( void *state ) {
|
||||
Msg40 *THIS = (Msg40 *)state;
|
||||
// reset g_errno, we're just a cache
|
||||
g_errno = 0;
|
||||
// handle the reply
|
||||
if ( ! THIS->gotCacheReply() ) return;
|
||||
// otherwise, call callback
|
||||
THIS->m_callback ( THIS->m_state );
|
||||
}
|
||||
//void gotCacheReplyWrapper ( void *state ) {
|
||||
// Msg40 *THIS = (Msg40 *)state;
|
||||
// // reset g_errno, we're just a cache
|
||||
// g_errno = 0;
|
||||
// // handle the reply
|
||||
// if ( ! THIS->gotCacheReply() ) return;
|
||||
// // otherwise, call callback
|
||||
// THIS->m_callback ( THIS->m_state );
|
||||
//}
|
||||
|
||||
bool Msg40::gotCacheReply ( ) {
|
||||
// if not found, get the result the hard way
|
||||
|
@ -1052,7 +1052,6 @@ void sleepWrapper1 ( int bogusfd , void *state ) {
|
||||
int32_t docsWanted;
|
||||
int32_t firstResultNum;
|
||||
int32_t nqterms;
|
||||
int32_t rerankRuleset;
|
||||
int32_t wait;
|
||||
char exact;
|
||||
//int32_t hid = -1;
|
||||
|
@ -2,9 +2,9 @@
|
||||
#define _PAGERESULTS_H_
|
||||
|
||||
#include "SafeBuf.h"
|
||||
#include "Language.h" // MAX_FRAG_SIZE
|
||||
#include "Msg40.h"
|
||||
#include "Msg0.h"
|
||||
#include "Speller.h" // MAX_FRAG_SIZE
|
||||
|
||||
// height of each search result div in the widget
|
||||
#define RESULT_HEIGHT 120
|
||||
|
43
PageRoot.cpp
43
PageRoot.cpp
@ -1851,49 +1851,6 @@ bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
|
||||
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
|
||||
}
|
||||
|
||||
|
||||
// get the collection rec
|
||||
/*
|
||||
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
uint8_t *hp = NULL;
|
||||
int32_t hpLen;
|
||||
int64_t docsInColl = -1;
|
||||
if ( ! cr ) {
|
||||
// use the default
|
||||
Parm *pp = g_parms.getParm ( "hp" );
|
||||
if ( ! pp ) {
|
||||
g_errno = ENOTFOUND;
|
||||
g_msg = " (error: no such collection)";
|
||||
return g_httpServer.sendErrorReply(s,500,
|
||||
mstrerror(g_errno));
|
||||
}
|
||||
hp = (uint8_t*)pp->m_def;
|
||||
if ( hp ) hpLen = uint8strlen ( hp );
|
||||
if ( hpLen <= 0 || ! hp )
|
||||
log(LOG_INFO,"http: No root page html present.");
|
||||
} else {
|
||||
if(cr->m_useLanguagePages) {
|
||||
uint8_t lang = g_langId.guessGBLanguageFromUrl(r->getHost());
|
||||
if(lang && (hp = g_languagePages.getLanguagePage(lang)) != NULL) {
|
||||
hpLen = uint8strlen(hp);
|
||||
// Set sort language as well
|
||||
// This might not be a good idea, as it
|
||||
// overrides any other setting. May be
|
||||
// better to let the user agent string
|
||||
// tell us what the user wants.
|
||||
strcpy(cr->m_defaultSortLanguage,
|
||||
getLanguageAbbr(lang));
|
||||
}
|
||||
}
|
||||
if(!hp) {
|
||||
hp = (uint8_t*)cr->m_htmlRoot;
|
||||
hpLen = cr->m_htmlRootLen;
|
||||
}
|
||||
//RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , coll );
|
||||
RdbBase *base = getRdbBase ( (uint8_t)RDB_CLUSTERDB , coll );
|
||||
if ( base ) docsInColl = base->getNumGlobalRecs();
|
||||
}
|
||||
*/
|
||||
// print the page out
|
||||
/*
|
||||
expandRootHtml ( sb,
|
||||
|
967
Speller.cpp
967
Speller.cpp
@ -11,114 +11,6 @@
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
|
||||
/*
|
||||
static void handleRequestSpeller ( UdpSlot *slot , int32_t netnice );
|
||||
|
||||
static void gotSpellerReplyWrapper (void *state, void *state2);
|
||||
|
||||
bool Speller::registerHandler ( ) {
|
||||
// . register ourselves with the udp server
|
||||
// . it calls our callback when it receives a msg of type 0x39
|
||||
if ( ! g_udpServer.registerHandler ( 0x3d, handleRequestSpeller ))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// . handle a request to get a linkInfo for a given docId/url/collection
|
||||
// . returns false if slot should be nuked and no reply sent
|
||||
// . sometimes sets g_errno on error
|
||||
void handleRequestSpeller ( UdpSlot *slot , int32_t netnice ) {
|
||||
// The request is the string to be spellchecked, null ended
|
||||
char *request = slot->m_readBuf;
|
||||
|
||||
// first tells us if we should narrow the search stuff
|
||||
bool narrowP = *(bool *) request;
|
||||
request += sizeof(bool);
|
||||
|
||||
// is it found in dict or pop words
|
||||
bool found;
|
||||
int32_t score;
|
||||
char reco[MAX_PHRASE_LEN];
|
||||
int32_t pop;
|
||||
int64_t start = gettimeofdayInMilliseconds();
|
||||
bool recommendation = g_speller.m_language[langEnglish].
|
||||
getRecommendation( request, gbstrlen(request),
|
||||
reco, MAX_PHRASE_LEN,
|
||||
&found, &score,
|
||||
&pop );
|
||||
|
||||
log ( LOG_DEBUG,"speller: %s --> %s", request, reco );
|
||||
|
||||
int32_t numNarrow = 0;
|
||||
char narrow[MAX_NARROW_SEARCHES * MAX_PHRASE_LEN];
|
||||
int32_t narrowPops[MAX_NARROW_SEARCHES];
|
||||
//if ( narrowP )
|
||||
// numNarrow = g_speller.m_language[langEnglish].
|
||||
// narrowPhrase ( request, narrow, narrowPops,
|
||||
// MAX_NARROW_SEARCHES );
|
||||
|
||||
// calculate total reply size
|
||||
// int32_t replySize = found + recommendation + score + pop + reco
|
||||
int32_t replySize = sizeof(bool) + sizeof(bool) + 4 + 4 +
|
||||
gbstrlen(reco) + 1;
|
||||
|
||||
if ( narrowP ){
|
||||
replySize += 4; // numPhrases
|
||||
for ( int32_t i = 0; i < numNarrow; i++ )
|
||||
replySize += 4 + gbstrlen(&narrow[i*MAX_FRAG_SIZE]) + 1;
|
||||
}
|
||||
|
||||
char *reply = (char*) mmalloc(replySize, "SpellerReplyBuf");
|
||||
if ( !reply ) {
|
||||
g_errno = ENOMEM;
|
||||
//g_udpServer.sendReply_ass( NULL, 0, NULL, 0, slot );
|
||||
g_udpServer.sendErrorReply( slot , g_errno );
|
||||
return;
|
||||
}
|
||||
char *p = reply;
|
||||
|
||||
*(bool *)p = found;
|
||||
p += sizeof(bool);
|
||||
|
||||
*(bool *)p = recommendation;
|
||||
p += sizeof(bool);
|
||||
|
||||
// store the score and pop
|
||||
*(int32_t *) p = score; p += 4;
|
||||
*(int32_t *) p = pop; p += 4;
|
||||
|
||||
// store the recommendation
|
||||
strcpy( p, reco );
|
||||
p += gbstrlen(reco) + 1;
|
||||
if ( narrowP ){
|
||||
// store the number of narrow phrases found
|
||||
*(int32_t *) p = numNarrow;
|
||||
p += 4;
|
||||
for ( int32_t i = 0; i < numNarrow; i++ ){
|
||||
*(int32_t *)p = narrowPops[i];
|
||||
p += 4;
|
||||
strcpy(p, &narrow[i * MAX_FRAG_SIZE]);
|
||||
p += gbstrlen(&narrow[i * MAX_FRAG_SIZE]) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
//sanity check
|
||||
if ( p - reply != replySize ){
|
||||
char *xx = NULL; *xx = 0;
|
||||
}
|
||||
|
||||
int64_t end = gettimeofdayInMilliseconds();
|
||||
if ( end - start > 1 )
|
||||
log (LOG_INFO,"speller: took %"INT64" ms to spellcheck "
|
||||
"fragment %s", end- start, request);
|
||||
g_udpServer.sendReply_ass ( reply ,
|
||||
replySize,
|
||||
reply ,
|
||||
replySize,
|
||||
slot );
|
||||
}
|
||||
*/
|
||||
|
||||
Speller g_speller;
|
||||
|
||||
Speller::Speller(){
|
||||
@ -219,764 +111,8 @@ void Speller::test ( char *ff ) {
|
||||
fclose(fd);
|
||||
}
|
||||
|
||||
/*
|
||||
///////////////////////////////////////////////////////
|
||||
// RECOMMENDATION ROUTINES BELOW HERE
|
||||
//
|
||||
// These will spellcheck and give recommendations
|
||||
///////////////////////////////////////////////////////
|
||||
|
||||
bool Speller::canStart( QueryWord *qw ) {
|
||||
// can only start with a alpha character, no numeric
|
||||
if ( ! is_alnum_utf8 ( qw->m_word+0 ) ) return false;
|
||||
|
||||
if ( qw->m_ignoreWord &&
|
||||
qw->m_ignoreWord != IGNORE_CONNECTED &&
|
||||
qw->m_ignoreWord != IGNORE_QUOTED ) return false;
|
||||
|
||||
// don't check 'rom' in phrase "cd-rom", or 't' in "ain't"
|
||||
if ( qw->m_leftConnected )
|
||||
return false;
|
||||
|
||||
// don't start with a stop word
|
||||
if ( qw->m_isStopWord )
|
||||
return false;
|
||||
|
||||
// a lot of field terms should not be spell checked
|
||||
if ( qw->m_fieldCode ) {
|
||||
if ( qw->m_fieldCode != FIELD_TITLE &&
|
||||
qw->m_fieldCode != FIELD_CITY &&
|
||||
qw->m_fieldCode != FIELD_AUTHOR &&
|
||||
qw->m_fieldCode != FIELD_COUNTRY )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// . returns false if blocked, true otherwise
|
||||
// . the recommendation is only kept if at least one fragment was changed,
|
||||
// otherwise "dst" is set to the empty string
|
||||
// . also returns false and sets g_errno on error
|
||||
// . stores recommended query in "dst" and NULL terminates it
|
||||
// . if dst is too small it will bitch and return true with g_errno set
|
||||
bool Speller::getRecommendation ( Query *q,
|
||||
bool spellcheck,
|
||||
char *dst, // recommendation destination
|
||||
int32_t dstLen, // recommendation max len
|
||||
bool narrowSearch,
|
||||
char *narrow, // narrow search
|
||||
int32_t narrowLen, // narrow search len
|
||||
int32_t *numNarrows, // num narrows found
|
||||
void *state,
|
||||
void (*callback)(void *state) ){
|
||||
*dst = '\0';
|
||||
*narrow = '\0';
|
||||
// no narrowing search if spellchecking is off
|
||||
if ( !spellcheck )
|
||||
return true;
|
||||
|
||||
// don't spellcheck queries that are MAX_FRAG_SIZE bytes or longer.
|
||||
if ( q->getQueryLen() >= MAX_FRAG_SIZE )
|
||||
return true;
|
||||
|
||||
StateSpeller *st ;
|
||||
try { st = new (StateSpeller); }
|
||||
catch ( ... ) {
|
||||
g_errno = ENOMEM;
|
||||
log("Speller: new(%i): %s", sizeof(StateSpeller),
|
||||
mstrerror(g_errno));
|
||||
return true;
|
||||
}
|
||||
mnew ( st , sizeof(StateSpeller) , "State00" );
|
||||
|
||||
st->m_state = state;
|
||||
st->m_callback = callback;
|
||||
st->m_q = q;
|
||||
st->m_spellcheck = spellcheck;
|
||||
st->m_dst = dst;
|
||||
st->m_dend = dst + dstLen;
|
||||
st->m_narrowSearch = narrowSearch;
|
||||
st->m_nrw = narrow;
|
||||
st->m_nend = narrow + narrowLen;
|
||||
st->m_numNarrow = numNarrows;
|
||||
*st->m_numNarrow = 0;
|
||||
st->m_start = gettimeofdayInMilliseconds();
|
||||
st->m_numFrags = 0;
|
||||
st->m_numFragsReceived = 0;
|
||||
|
||||
// . break query down into fragments
|
||||
// . each fragment is a string of words
|
||||
// . quotes and field names will separate fragments
|
||||
// . TODO: make field data in its own fragment
|
||||
int32_t nqw = q->m_numWords;
|
||||
|
||||
for ( int32_t i = 0 ; i < nqw ; i++ ) {
|
||||
// get a word in the Query to start a fragment with
|
||||
QueryWord *qw = &q->m_qwords[i];
|
||||
// can he start the phrase?
|
||||
if ( ! canStart( qw ) )
|
||||
continue;
|
||||
|
||||
bool inQuotes = qw->m_inQuotes;
|
||||
char fieldCode = qw->m_fieldCode;
|
||||
// . get longest continual fragment that starts with word #i
|
||||
// . get the following words that can be in a fragment
|
||||
// that starts with word #i
|
||||
// . start of the frag
|
||||
int32_t endQword = i;
|
||||
int32_t startQword = i;
|
||||
for ( ; i < nqw ; i++ ) {
|
||||
// . skip if we should
|
||||
// . keep punct, however
|
||||
QueryWord *qw1 = &q->m_qwords[i];
|
||||
if ( qw1->m_opcode ) break;
|
||||
if ( qw1->m_inQuotes != inQuotes ) break;
|
||||
if ( qw1->m_fieldCode != fieldCode ) break;
|
||||
if ( qw1->m_ignoreWord == IGNORE_FIELDNAME ) break;
|
||||
if ( qw1->m_phraseSign &&
|
||||
!qw1->m_rightConnected ) break;
|
||||
// are we punct?
|
||||
if ( ! is_alnum_utf8(qw1->m_word) )
|
||||
endQword = i - 1;
|
||||
else
|
||||
endQword = i;
|
||||
}
|
||||
// revisit this i in big loop since we did not include it
|
||||
i = endQword;
|
||||
|
||||
//create a new stateFrag
|
||||
StateFrag *stFrag;
|
||||
try { stFrag = new (StateFrag); }
|
||||
catch ( ... ) {
|
||||
mdelete ( st, sizeof(StateSpeller), "StateSpeller" );
|
||||
delete (st);
|
||||
g_errno = ENOMEM;
|
||||
log("Speller: new(%i): %s", sizeof(StateFrag),
|
||||
mstrerror(g_errno));
|
||||
//continue;
|
||||
return true;
|
||||
}
|
||||
mnew ( stFrag, sizeof(StateFrag),
|
||||
"StateFrag" );
|
||||
|
||||
stFrag->m_state = (void*) st;
|
||||
stFrag->m_narrowPhrase = st->m_narrowSearch;
|
||||
stFrag->m_q = q;
|
||||
stFrag->m_startQword = startQword;
|
||||
stFrag->m_endQword = endQword;
|
||||
stFrag->m_errno = 0;
|
||||
st->m_stFrag[st->m_numFrags] = stFrag;
|
||||
st->m_numFrags++;
|
||||
// blocked
|
||||
if ( !getRecommendation( stFrag ) ){
|
||||
continue;
|
||||
}
|
||||
st->m_numFragsReceived++;
|
||||
}
|
||||
// if outstanding frags
|
||||
if ( st->m_numFragsReceived < st->m_numFrags )
|
||||
return false;
|
||||
gotFrags(st);
|
||||
// delete state
|
||||
mdelete ( st, sizeof(StateSpeller), "StateSpeller" );
|
||||
delete (st);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Speller::getRecommendation ( StateFrag *st ){
|
||||
st->m_recommended = false;
|
||||
st->m_numFound = 0;
|
||||
st->m_numNarrowPhrases = 0;
|
||||
char *dst = st->m_dst;
|
||||
|
||||
// normalize this fragment and store in "dst"
|
||||
bool wasAlnum = true;
|
||||
for ( int32_t i = st->m_startQword; i <= st->m_endQword; i++ ){
|
||||
// start of each word
|
||||
st->m_wp[i] = dst;
|
||||
char *p = st->m_q->m_qwords[i].m_word;
|
||||
int32_t plen = st->m_q->m_qwords[i].m_wordLen;
|
||||
for ( int32_t j = 0; dst-st->m_dst <MAX_FRAG_SIZE&&j<plen;j++ ) {
|
||||
if ( !getClean_utf8(p+j) )
|
||||
continue;
|
||||
// skip back to back punct/spaces
|
||||
if (j>0 && !is_alnum_utf8(p+j) &&!wasAlnum)
|
||||
continue;
|
||||
*dst = p[j];
|
||||
dst++;
|
||||
wasAlnum = is_alnum_utf8 ( p+j );
|
||||
}
|
||||
st->m_wplen[i] = dst - st->m_wp[i];
|
||||
st->m_isfound[i] = false;
|
||||
}
|
||||
*dst = '\0';
|
||||
|
||||
// debug msg
|
||||
log(LOG_DEBUG,"speller: Getting recommendation for frag=%s",
|
||||
st->m_dst);
|
||||
|
||||
// give each word in the phrase a chance to start the subphrase
|
||||
int32_t maxPhrase = st->m_endQword - st->m_startQword;
|
||||
if ( maxPhrase > MAX_WORDS_PER_PHRASE )
|
||||
maxPhrase = MAX_WORDS_PER_PHRASE;
|
||||
|
||||
// store the phraseLen and posn
|
||||
st->m_pLen = maxPhrase;
|
||||
st->m_pPosn = st->m_startQword;
|
||||
|
||||
return launchReco(st);
|
||||
}
|
||||
|
||||
bool Speller::launchReco(StateFrag *st){
|
||||
// if we checked all the phrases or found all the words
|
||||
if ( st->m_numFound == st->m_endQword - st->m_startQword + 1 ||
|
||||
st->m_pLen < 0 ){
|
||||
return true;
|
||||
}
|
||||
|
||||
bool launchPhrase = false;
|
||||
for ( ; st->m_pLen >= 0; st->m_pLen-- ){
|
||||
for ( ; st->m_pPosn + st->m_pLen <= st->m_endQword;
|
||||
st->m_pPosn++ ) {
|
||||
// find a word that can start the phrase
|
||||
QueryWord *qw = &st->m_q->m_qwords[st->m_pPosn];
|
||||
if ( !canStart (qw) )
|
||||
continue;
|
||||
// don't do this phrase if we have found even one
|
||||
// word in the phrase
|
||||
bool found = false;
|
||||
for ( int32_t k = st->m_pPosn;
|
||||
k <= st->m_pPosn + st->m_pLen; k++ ) {
|
||||
if ( st->m_isfound[k] ){
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( found )
|
||||
continue;
|
||||
|
||||
// cannot end on a stop word, punct, right-connected
|
||||
// word
|
||||
QueryWord *qwEnd =
|
||||
&st->m_q->m_qwords[st->m_pPosn + st->m_pLen];
|
||||
if ( qwEnd->m_isStopWord || qwEnd->m_isPunct ||
|
||||
qwEnd->m_rightConnected )
|
||||
continue;
|
||||
|
||||
// found someone to start the phrase with
|
||||
// what is the new phrase parms?
|
||||
st->m_a = st->m_wp[st->m_pPosn];
|
||||
st->m_b = st->m_wp[st->m_pLen + st->m_pPosn]+
|
||||
st->m_wplen[st->m_pLen + st->m_pPosn];
|
||||
|
||||
// also store the tmp char that we are changing
|
||||
st->m_c = *(st->m_b);
|
||||
*(st->m_b) = '\0';
|
||||
|
||||
// if it is just a number, don't get recommendation
|
||||
// lest we embarrass ourselves
|
||||
if ( st->m_pPosn == 0 && is_digit(st->m_a[0]) ) {
|
||||
char *k = st->m_a+1;
|
||||
while ( is_digit(*k) ) k++;
|
||||
if ( ! *k ) {
|
||||
*st->m_b = st->m_c ;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// if it is an adult phrase, don't get a recommendation
|
||||
// check if isAdult really finds a word.
|
||||
char *adultLoc = NULL;
|
||||
if ( isAdult(st->m_a, gbstrlen(st->m_a), &adultLoc) &&
|
||||
( adultLoc == st->m_a || *(adultLoc-1) == ' ' ) ){
|
||||
// mark as found
|
||||
for ( int32_t k = st->m_pPosn;
|
||||
k <= st->m_pPosn + st->m_pLen; k++ )
|
||||
st->m_isfound[k] = true;
|
||||
*(st->m_b) = st->m_c;
|
||||
continue;
|
||||
}
|
||||
// if the phrase is in dict or in the top pop words,
|
||||
// phrase is found. Don't check if we are narrowing
|
||||
// the phrase because we need to multicast anyways
|
||||
uint64_t h ;
|
||||
h = hash64d(st->m_a, gbstrlen(st->m_a) );
|
||||
if ( !st->m_narrowPhrase &&
|
||||
getPhrasePopularity( st->m_a, h, false ) > 0 ){
|
||||
// mark as found
|
||||
for ( int32_t k = st->m_pPosn;
|
||||
k <= st->m_pPosn + st->m_pLen; k++ )
|
||||
st->m_isfound[k] = true;
|
||||
*(st->m_b) = st->m_c;
|
||||
continue;
|
||||
}
|
||||
launchPhrase = true;
|
||||
break;
|
||||
}
|
||||
if ( launchPhrase )
|
||||
break;
|
||||
st->m_pPosn = st->m_startQword;
|
||||
}
|
||||
|
||||
if ( st->m_pLen < 0 ){
|
||||
return true;
|
||||
}
|
||||
|
||||
// debug msg
|
||||
log(LOG_DEBUG,"speller: ----------");
|
||||
log(LOG_DEBUG,"speller: Checking phrase=%s", st->m_a);
|
||||
|
||||
|
||||
// launch for all the splits
|
||||
st->m_numRequests = 0;
|
||||
st->m_numReplies = 0;
|
||||
|
||||
|
||||
int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
|
||||
// don't send to twins...
|
||||
hostsPerSplit /= g_hostdb.m_numHostsPerShard;
|
||||
int32_t mySplit = g_hostdb.m_hostId % g_hostdb.m_indexSplits;
|
||||
|
||||
int32_t key = st->m_q->getQueryHash();//0;
|
||||
int32_t timeout = 30;
|
||||
int32_t niceness = 0;
|
||||
char request[MAX_FRAG_SIZE + 4];
|
||||
char *p = request;
|
||||
*(bool *)p = st->m_narrowPhrase;
|
||||
p += sizeof(bool);
|
||||
strcpy ( p, st->m_a );
|
||||
// send the null end too
|
||||
p += gbstrlen(st->m_a)+1;
|
||||
int32_t plen = p - request;
|
||||
for ( int32_t i = 0; i < hostsPerSplit; i++ ){
|
||||
// get the hostId of the host we're sending to
|
||||
uint32_t hostId =
|
||||
mySplit + ( i * g_hostdb.m_indexSplits );
|
||||
Host *h = g_hostdb.getHost(hostId);
|
||||
st->m_mcast[i].reset();
|
||||
|
||||
bool status = st->m_mcast[i].
|
||||
send(request ,
|
||||
plen , // request size
|
||||
0x3d , // msgType 0x3d
|
||||
false , // multicast owns m_request?
|
||||
h->m_groupId, // group to send to (groupKey)
|
||||
false , // send to whole group?
|
||||
key ,
|
||||
st , // state data
|
||||
NULL , // state data
|
||||
gotSpellerReplyWrapper ,
|
||||
timeout , // in seconds
|
||||
niceness ,
|
||||
false , // realtime?
|
||||
-1 , // m_q->m_bestHandlingHostId ,
|
||||
NULL , // m_replyBuf ,
|
||||
0 , // MSG39REPLYSIZE,
|
||||
// this is true if multicast should free
|
||||
// the
|
||||
// reply, otherwise caller is responsible
|
||||
// for freeing it after calling
|
||||
// getBestReply).
|
||||
// actually, this should always be false,
|
||||
// there
|
||||
// is a bug in Multicast.cpp.
|
||||
false );
|
||||
|
||||
if (!status){
|
||||
st->m_numReplies++;
|
||||
log("speller: Multicast had error: %s",
|
||||
mstrerror(g_errno));
|
||||
st->m_errno = g_errno;
|
||||
continue;
|
||||
}
|
||||
// blocked
|
||||
else
|
||||
st->m_numRequests++;
|
||||
}
|
||||
|
||||
if ( st->m_numReplies == st->m_numRequests )
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
void gotSpellerReplyWrapper( void *state, void *state2 ){
|
||||
StateFrag *stFrag = (StateFrag *) state;
|
||||
stFrag->m_numReplies++;
|
||||
if ( stFrag->m_numReplies < stFrag->m_numRequests )
|
||||
return;
|
||||
// blocked
|
||||
if ( !g_speller.gotSpellerReply(stFrag) )
|
||||
return;
|
||||
|
||||
StateSpeller *st = (StateSpeller *)stFrag->m_state;
|
||||
// One more frag received
|
||||
st->m_numFragsReceived++;
|
||||
if ( st->m_numFragsReceived < st->m_numFrags )
|
||||
return;
|
||||
|
||||
g_speller.gotFrags(st);
|
||||
// callback
|
||||
st->m_callback( st->m_state );
|
||||
// delete state
|
||||
mdelete ( st, sizeof(StateSpeller), "StateSpeller" );
|
||||
delete (st);
|
||||
}
|
||||
|
||||
bool Speller::gotSpellerReply( StateFrag *st ){
|
||||
int32_t minScore = LARGE_SCORE;
|
||||
int32_t maxPop = -1;
|
||||
char *bestReco = NULL;
|
||||
|
||||
char *reply[MAX_UNIQUE_HOSTS_PER_SPLIT];
|
||||
int32_t replySize[MAX_UNIQUE_HOSTS_PER_SPLIT];
|
||||
int32_t replyMaxSize[MAX_UNIQUE_HOSTS_PER_SPLIT];
|
||||
bool freeit;
|
||||
bool found = false; //phrase was found in dict or pop words
|
||||
int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
|
||||
// don't send to twins...
|
||||
hostsPerSplit /= g_hostdb.m_numHostsPerShard;
|
||||
|
||||
int32_t numNarrowPhrases[MAX_UNIQUE_HOSTS_PER_SPLIT];
|
||||
char *narrowPtrs[MAX_UNIQUE_HOSTS_PER_SPLIT];
|
||||
|
||||
// init narrowSearch arrays
|
||||
for ( int32_t i = 0; i < MAX_UNIQUE_HOSTS_PER_SPLIT; i++ ){
|
||||
numNarrowPhrases[i] = 0;
|
||||
narrowPtrs[i] = NULL;
|
||||
}
|
||||
|
||||
for ( int32_t i = 0; i < hostsPerSplit; i++ ){
|
||||
reply[i] = st->m_mcast[i].getBestReply( &replySize[i] ,
|
||||
&replyMaxSize[i] ,
|
||||
&freeit );
|
||||
// multicast may have an empty reply buffer if there was an
|
||||
// OOM error or something. m_errno should have been set, but
|
||||
// we have to loop through all the multicasts to free the
|
||||
// reply buffers.
|
||||
char *p = reply[i];
|
||||
|
||||
if ( g_errno || st->m_errno || !p){
|
||||
continue;
|
||||
}
|
||||
// was it found in the dictionary?
|
||||
bool foundInDict = *(bool *)p;
|
||||
p += sizeof(bool);
|
||||
if ( foundInDict )
|
||||
found = true;
|
||||
|
||||
// first is if there is a recommendation or not
|
||||
bool recommendation = *(bool *) p;
|
||||
p += sizeof (bool);
|
||||
|
||||
if ( !recommendation && !st->m_narrowPhrase )
|
||||
continue;
|
||||
|
||||
int32_t score = *(int32_t *)p;
|
||||
p += 4;
|
||||
int32_t pop = *(int32_t *)p;
|
||||
p += 4;
|
||||
|
||||
if ( recommendation ){
|
||||
log ( LOG_DEBUG,"speller: Received reco %s, "
|
||||
"score=%"INT32", pop=%"INT32"", p, score, pop );
|
||||
|
||||
// we have a recommendation with score and pop
|
||||
// choose the one with the lowest score, and if the
|
||||
// score is same then the max pop
|
||||
// HACK: we are getting bad recommendations for smaller
|
||||
// popularities. So don't consider them
|
||||
if ( pop > 8 && ( score < minScore ||
|
||||
( score == minScore && pop > maxPop ) ) ){
|
||||
bestReco = p;
|
||||
minScore = score;
|
||||
maxPop = pop;
|
||||
}
|
||||
}
|
||||
|
||||
p += gbstrlen(p) + 1;
|
||||
if ( st->m_narrowPhrase ){
|
||||
numNarrowPhrases[i] = *(int32_t *)p;
|
||||
p += 4;
|
||||
narrowPtrs[i] = p;
|
||||
}
|
||||
}
|
||||
|
||||
// merge all the narrow results
|
||||
if ( st->m_narrowPhrase ){
|
||||
int32_t currPhrase[MAX_UNIQUE_HOSTS_PER_SPLIT];
|
||||
for ( int32_t i = 0; i < MAX_UNIQUE_HOSTS_PER_SPLIT; i++ )
|
||||
currPhrase[i] = 0;
|
||||
for ( int32_t i = 0; i < MAX_NARROW_SEARCHES; i++ ){
|
||||
int32_t maxHost = -1;
|
||||
int32_t maxPop = 0;
|
||||
for ( int32_t j = 0; j < hostsPerSplit; j++ ){
|
||||
if ( numNarrowPhrases[j] <= currPhrase[j] )
|
||||
continue;
|
||||
int32_t pop = *(int32_t *)narrowPtrs[j];
|
||||
if ( pop <= maxPop )
|
||||
continue;
|
||||
maxPop = pop;
|
||||
maxHost = j;
|
||||
}
|
||||
if ( maxHost < 0 )
|
||||
break;
|
||||
//
|
||||
narrowPtrs[maxHost] += 4;
|
||||
strcpy( st->m_narrowPhrases[i], narrowPtrs[maxHost] );
|
||||
narrowPtrs[maxHost] +=gbstrlen(narrowPtrs[maxHost]) + 1;
|
||||
currPhrase[maxHost]++;
|
||||
st->m_numNarrowPhrases++;
|
||||
}
|
||||
}
|
||||
|
||||
// make narrowPhrase false here, so that its not launched a second time
|
||||
// for the same frag;
|
||||
st->m_narrowPhrase = false;
|
||||
|
||||
// revert
|
||||
*(st->m_b) = st->m_c;
|
||||
|
||||
// if we found a recommendation,or if the phrase was found in the
|
||||
// dictionary or pop words then mark all the
|
||||
// words that fall under the phrase as found
|
||||
if ( found || bestReco ){
|
||||
for ( int32_t k = st->m_pPosn;
|
||||
k <= st->m_pLen + st->m_pPosn; k++ )
|
||||
st->m_isfound[k] = true;
|
||||
st->m_numFound += st->m_pLen + 1;
|
||||
}
|
||||
|
||||
// if not found in the dictionary or a recommendation, copy the phrase
|
||||
if ( !found && bestReco){
|
||||
// this fragment is going to be recommended
|
||||
st->m_recommended = true;
|
||||
// insert our recommendation into the phrase to get a new one
|
||||
char *s1 = st->m_wp[st->m_startQword];
|
||||
int32_t slen1 = st->m_a - st->m_wp[st->m_startQword];
|
||||
char *s2 = bestReco;
|
||||
int32_t slen2 = gbstrlen(bestReco);
|
||||
char *s3 = st->m_b ;
|
||||
// store the difference in length between the reco and the
|
||||
// original string
|
||||
int32_t diff = slen2 - ( st->m_b - st->m_a );
|
||||
int32_t slen3 = st->m_wp[st->m_endQword] +
|
||||
st->m_wplen[st->m_endQword] - st->m_b;
|
||||
|
||||
if ( slen3 < 0 )
|
||||
slen3 = 0;
|
||||
|
||||
int32_t tlen = slen1 + slen2 + slen3 ;
|
||||
if ( tlen > MAX_FRAG_SIZE ){
|
||||
log(LOG_LOGIC,"speller: buf too small. Fix me 3.");
|
||||
// blocked
|
||||
if ( !launchReco(st) )
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
// make substitution and store in "dst"
|
||||
char buf2 [ MAX_FRAG_SIZE];
|
||||
char *nf = buf2;
|
||||
gbmemcpy ( nf , s1 , slen1 ) ; nf += slen1;
|
||||
gbmemcpy ( nf , s2 , slen2 ) ; nf += slen2;
|
||||
gbmemcpy ( nf , s3 , slen3 ) ;
|
||||
nf += slen3;
|
||||
|
||||
// don't forget to NULL terminate
|
||||
*nf = '\0';
|
||||
// debug msg
|
||||
log( LOG_DEBUG,"speller: Trying substitution \"%s\"",
|
||||
buf2 );
|
||||
|
||||
strcpy ( st->m_dst , buf2 );
|
||||
|
||||
// the pointers might have to be changed if the
|
||||
// recommendation was not of the same length as the words
|
||||
if ( diff != 0 ){
|
||||
for ( int32_t k = st->m_pLen+st->m_pPosn+1;
|
||||
k <= st->m_endQword; k++ )
|
||||
st->m_wp[k] += diff;
|
||||
}
|
||||
}
|
||||
|
||||
// don't forget to free the replies
|
||||
for ( int32_t i = 0; i < hostsPerSplit; i++ )
|
||||
if ( reply[i] && replyMaxSize[i] > 0 )
|
||||
mfree( reply[i], replyMaxSize[i], "SpellerReplyBuf" );
|
||||
|
||||
// go to the next position in the phrase. if we have reached the end
|
||||
// of the phrase position, decrement the phrase length and start again
|
||||
if ( st->m_pPosn + st->m_pLen >= st->m_endQword - 1 ){
|
||||
st->m_pLen--;
|
||||
st->m_pPosn = st->m_startQword;
|
||||
}
|
||||
else
|
||||
st->m_pPosn++;
|
||||
|
||||
if ( !launchReco(st) )
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
// . break a NULL-terminated string down into a list of ptrs to the words
|
||||
// . return the number of words stored into "wp"
|
||||
/*
|
||||
int32_t Speller::getWords ( const char *s ,
|
||||
char *wp [MAX_FRAG_SIZE] ,
|
||||
int32_t wplen [MAX_FRAG_SIZE] ,
|
||||
bool *isstop ) {
|
||||
int32_t nwp = 0;
|
||||
loop:
|
||||
// skip initial punct
|
||||
while ( *s && ! is_alnum ( *s ) ) s++;
|
||||
// bail if done
|
||||
if ( ! *s ) return nwp;
|
||||
// point to word
|
||||
wp [ nwp ] = (char *)s;
|
||||
// convenience ptr
|
||||
char *ww = (char *)s;
|
||||
// count over it
|
||||
while ( is_alnum ( *s ) ) s++;
|
||||
// how long is the word?
|
||||
int32_t slen = s - wp [ nwp ];
|
||||
// set length
|
||||
wplen [ nwp ] = slen ;
|
||||
// is it a stop word?
|
||||
if ( isstop ) {
|
||||
// TODO: make the stop words utf8!!!
|
||||
int64_t h = hash64Lower_utf8 ( ww , slen ) ;
|
||||
bool stop = ::isStopWord ( ww , slen , h ) ;
|
||||
// BUT ok if Capitalized or number
|
||||
if ( stop ) {
|
||||
if ( is_digit (ww[0]) ) stop = false;
|
||||
if ( is_cap (ww,slen ) ) stop = false;
|
||||
// e-mail, c file, c. s. lewis
|
||||
if ( slen == 1 && ww[0] != 'a' ) stop = false;
|
||||
}
|
||||
isstop[nwp] = stop;
|
||||
}
|
||||
nwp++;
|
||||
goto loop;
|
||||
}
|
||||
*/
|
||||
/*
|
||||
void Speller::gotFrags( void *state ){
|
||||
StateSpeller *st = (StateSpeller *) state;
|
||||
|
||||
char *dptr = st->m_dst;
|
||||
char *nptr = st->m_nrw;
|
||||
bool recommendation = false;
|
||||
Query *q = st->m_q;
|
||||
|
||||
// . break query down into fragments
|
||||
// . each fragment is a string of words
|
||||
// . quotes and field names will separate fragments
|
||||
// . TODO: make field data in its own fragment
|
||||
int32_t nqw = q->m_numWords;
|
||||
int32_t currFrag = 0;
|
||||
for ( int32_t i = 0 ; i < nqw ; i++ ) {
|
||||
// get a word in the Query to start a fragment with
|
||||
QueryWord *qw = &q->m_qwords[i];
|
||||
// if he has a phraseSign, put it right away
|
||||
//if ( qw->m_phraseSign ) {
|
||||
// *dptr = qw->m_phraseSign;
|
||||
// dptr++;
|
||||
// }
|
||||
// can he start the phrase?
|
||||
// if he can't start our fragment, just copy over to "dst"
|
||||
if ( !canStart( qw )) {
|
||||
// copy to rp and get next word
|
||||
char *w = qw->m_word;
|
||||
int32_t wlen = qw->m_wordLen;
|
||||
if ( dptr + wlen >= st->m_dend ) {
|
||||
g_errno = EBUFTOOSMALL; continue; }
|
||||
// watch out for LeFtP and RiGhP
|
||||
if ( qw->m_opcode == OP_LEFTPAREN ) *dptr++ = '(';
|
||||
else if ( qw->m_opcode == OP_RIGHTPAREN) *dptr++ = ')';
|
||||
else if ( qw->m_opcode == OP_PIPE ) *dptr++ = '|';
|
||||
else {
|
||||
gbmemcpy ( dptr , w , wlen );
|
||||
dptr += wlen;
|
||||
}
|
||||
*dptr = '\0';
|
||||
continue;
|
||||
}
|
||||
bool inQuotes = qw->m_inQuotes;
|
||||
char fieldCode = qw->m_fieldCode;
|
||||
// . get longest continual fragment that starts with word #i
|
||||
// . get the following words that can be in a fragment
|
||||
// that starts with word #i
|
||||
// . start of the frag
|
||||
int32_t endQword = i;
|
||||
for ( ; i < nqw ; i++ ) {
|
||||
// . skip if we should
|
||||
// . keep punct, however
|
||||
QueryWord *qw1 = &q->m_qwords[i];
|
||||
if ( qw1->m_opcode ) break;
|
||||
if ( qw1->m_inQuotes != inQuotes ) break;
|
||||
if ( qw1->m_fieldCode != fieldCode ) break;
|
||||
if ( qw1->m_ignoreWord== IGNORE_FIELDNAME ) break;
|
||||
if ( qw1->m_phraseSign && !qw1->m_rightConnected )
|
||||
break;
|
||||
// are we punct?
|
||||
if ( ! is_alnum_utf8 (qw1->m_word) )
|
||||
endQword = i - 1;
|
||||
else
|
||||
endQword = i;
|
||||
}
|
||||
// revisit this i in big loop since we did not include it
|
||||
i = endQword;
|
||||
|
||||
// OOM errors might cause us not to launch frags
|
||||
if ( currFrag >= st->m_numFrags )
|
||||
continue;
|
||||
StateFrag *stFrag = st->m_stFrag[currFrag];
|
||||
// don't breach the destination buffer
|
||||
if ( dptr + gbstrlen(stFrag->m_dst) >= st->m_dend ) {
|
||||
g_errno = EBUFTOOSMALL;
|
||||
}
|
||||
else {
|
||||
// store it
|
||||
strcpy ( dptr, stFrag->m_dst );
|
||||
dptr += gbstrlen ( dptr );
|
||||
// add a space between fragments
|
||||
// *dptr = ' ';
|
||||
//dptr++;
|
||||
*dptr = '\0';
|
||||
// set the flag
|
||||
if ( stFrag->m_recommended )
|
||||
recommendation = true;
|
||||
}
|
||||
// copy over all the narrow searches that can fit
|
||||
for ( int32_t j = 0; j < stFrag->m_numNarrowPhrases; j++ ){
|
||||
// don't breach the narrow-search buffer
|
||||
if ( nptr +gbstrlen(stFrag->m_narrowPhrases[j]) >
|
||||
st->m_nend )
|
||||
break;
|
||||
strcpy(nptr, stFrag->m_narrowPhrases[j]);
|
||||
nptr += gbstrlen(stFrag->m_narrowPhrases[j]) + 1;
|
||||
(*st->m_numNarrow)++;
|
||||
}
|
||||
|
||||
mdelete(stFrag, sizeof(StateFrag), "StateFrag");
|
||||
delete (stFrag);
|
||||
// now we get the next frag
|
||||
currFrag++;
|
||||
}
|
||||
if ( !recommendation )
|
||||
*st->m_dst = '\0';
|
||||
|
||||
int64_t now = gettimeofdayInMilliseconds();
|
||||
if ( now - st->m_start > 50 )
|
||||
log(LOG_INFO,"speller: Took %"INT64" ms to spell check %s",
|
||||
now - st->m_start, st->m_q->getQuery() );
|
||||
return;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
// . dictionary generation entry point
// . as written here it only calls m_language[2].setLang(2) and then returns
//   false; "numWordsToDump" and "coll" are not used because the
//   generateDicts() call below is commented out
// . NOTE(review): the same setLang(2) call appears both live and commented
//   out in this span, which looks like before/after lines of a change shown
//   together -- confirm which one is current before relying on it
bool Speller::generateDicts ( int32_t numWordsToDump , char *coll ){
|
||||
m_language[2].setLang(2);
|
||||
//m_language[2].setLang(2);
|
||||
//m_language[2].generateDicts ( numWordsToDump, coll );
|
||||
return false;
|
||||
}
|
||||
@ -1776,107 +912,6 @@ bool Speller::findNext( char *s, char *send, char **nextWord, bool *isPorn,
|
||||
return false;
|
||||
}*/
|
||||
|
||||
bool Speller::createUnifiedDict (){
|
||||
// first get all the tuples from wordlist and query file
|
||||
//HashTableT <uint64_t, char*> ht[MAX_LANGUAGES];
|
||||
HashTableX ht[MAX_LANGUAGES];
|
||||
char ff[1024];
|
||||
for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ){
|
||||
ht[i].set ( 8,4,0,NULL,0,false,0,"cud");
|
||||
sprintf ( ff , "%sdict/%s/%s.wl.phonet", g_hostdb.m_dir,
|
||||
getLanguageAbbr(i), getLanguageAbbr(i) );
|
||||
populateHashTable(ff, &ht[i], i);
|
||||
|
||||
sprintf ( ff , "%sdict/%s/%s.query.phonet.top", g_hostdb.m_dir,
|
||||
getLanguageAbbr(i), getLanguageAbbr(i) );
|
||||
populateHashTable(ff, &ht[i], i);
|
||||
|
||||
for ( int32_t j = 0; j < NUM_CHARS; j++ ){
|
||||
sprintf ( ff , "%sdict/%s/%s.dict.%"INT32"", g_hostdb.m_dir,
|
||||
getLanguageAbbr(i), getLanguageAbbr(i), j );
|
||||
populateHashTable(ff, &ht[i], i);
|
||||
}
|
||||
}
|
||||
|
||||
//sprintf ( ff, "%sdict/unifiedDict",g_hostdb.m_dir );
|
||||
sprintf ( ff, "%sunifiedDict.txt",g_hostdb.m_dir );
|
||||
// delete it first
|
||||
unlink ( ff );
|
||||
// then open a new one for appending
|
||||
int fdw = open ( ff ,
|
||||
O_CREAT | O_RDWR | O_APPEND ,
|
||||
getFileCreationFlags());
|
||||
// S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
|
||||
if ( fdw < 0 ){
|
||||
return log("lang: Could not open for %s "
|
||||
"writing: %s.",ff, strerror(errno));
|
||||
}
|
||||
|
||||
log(LOG_INIT,"spell: Making %s.", ff );
|
||||
|
||||
//HashTableT <uint64_t, int32_t> phrases;
|
||||
HashTableX phrases;
|
||||
phrases.set(8,4,0,NULL,0,false,0,"phud");
|
||||
char buf[1024];
|
||||
for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ){
|
||||
// get each slot
|
||||
for ( int32_t j = 0; j < ht[i].getNumSlots(); j++ ){
|
||||
uint64_t key = *(uint64_t *)ht[i].getKey(j);
|
||||
if ( key == 0 )
|
||||
continue;
|
||||
// if key is already found
|
||||
int32_t slot = phrases.getSlot(&key);
|
||||
if ( slot != -1 )
|
||||
continue;
|
||||
|
||||
char *tuple = *(char **)ht[i].getValueFromSlot(j);
|
||||
|
||||
// here we print the phrase and the phonet if present
|
||||
// skip the score
|
||||
while ( *tuple != '\t' )
|
||||
tuple++;
|
||||
tuple++;
|
||||
|
||||
sprintf( buf, "%s", tuple );
|
||||
|
||||
char *p = buf;
|
||||
p += gbstrlen(buf);
|
||||
|
||||
// if there wasn't a phonet, its from the titleRec.
|
||||
// add another tab
|
||||
bool fromTitleRec = false;
|
||||
if ( strstr (tuple,"\t") == NULL ){
|
||||
*p = '\t';
|
||||
p++;
|
||||
fromTitleRec = true;
|
||||
}
|
||||
|
||||
for ( int32_t k = 0; k < MAX_LANGUAGES; k++ ){
|
||||
slot = ht[k].getSlot(&key);
|
||||
if ( slot == -1 )
|
||||
continue;
|
||||
char *val = *(char **)ht[k].getValueFromSlot(slot);
|
||||
int32_t pop = atoi(val);
|
||||
if ( fromTitleRec ) pop *= -1;
|
||||
sprintf(p,"\t%"INT32"\t%"INT32"",k,pop);
|
||||
p += gbstrlen(p);
|
||||
}
|
||||
// write out the trailing \n as well
|
||||
*p = '\n';
|
||||
p++;
|
||||
*p = '\0';
|
||||
p++;
|
||||
int32_t bufLen = gbstrlen(buf);
|
||||
int32_t wn = write ( fdw , buf , bufLen ) ;
|
||||
if ( wn != bufLen )
|
||||
return log("lang: write: %s",strerror(errno));
|
||||
int32_t val = 1;
|
||||
phrases.addKey(&key, &val);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool Speller::populateHashTable( char *ff, HashTableX *htable,
|
||||
unsigned char langId ){
|
||||
|
19
Speller.h
19
Speller.h
@ -9,11 +9,15 @@
|
||||
#ifndef _SPELLER_H_
|
||||
#define _SPELLER_H_
|
||||
|
||||
#define MAX_FRAG_SIZE 1024
|
||||
|
||||
// max int32_t returned by getPhrasePopularity() function
|
||||
#define MAX_PHRASE_POP 16800
|
||||
|
||||
#include "StopWords.h"
|
||||
#include "Language.h"
|
||||
#include "Query.h"
|
||||
#include "Multicast.h"
|
||||
|
||||
// . the height and width of m_stable[][] that takes a letter pair as an index
|
||||
// . valid chars are returned by isValidChar() routine
|
||||
// . we use A-Z, 0-9, space, hyphen, apostrophe and \0... that's it
|
||||
@ -48,7 +52,7 @@ class StateFrag{
|
||||
char m_c;
|
||||
bool m_narrowPhrase;
|
||||
int32_t m_numNarrowPhrases;
|
||||
char m_narrowPhrases[MAX_NARROW_SEARCHES][MAX_FRAG_SIZE];
|
||||
//char m_narrowPhrases[MAX_NARROW_SEARCHES][MAX_FRAG_SIZE];
|
||||
};
|
||||
|
||||
|
||||
@ -99,10 +103,10 @@ class Speller {
|
||||
bool findNext( char *s, char *send, char **nextWord, bool *isPorn,
|
||||
unsigned char langId, int32_t encodeType );
|
||||
|
||||
// . forwards to the checkDict() of the Language object selected by "lang"
//   (langEnglish when the caller does not pass one) and returns its result
// . NOTE(review): "lang" indexes m_language[] without a range check;
//   presumably callers always pass a value below MAX_LANGUAGES -- confirm
int32_t checkDict ( char *s, int32_t slen, char encodeType,
|
||||
unsigned char lang = langEnglish ){
|
||||
return m_language[lang].checkDict(s,slen,encodeType);
|
||||
}
|
||||
// int32_t checkDict ( char *s, int32_t slen, char encodeType,
|
||||
// unsigned char lang = langEnglish ){
|
||||
// return m_language[lang].checkDict(s,slen,encodeType);
|
||||
// }
|
||||
|
||||
// should be same hash algo to make wordId
|
||||
bool isInDict ( uint64_t wordId ) {
|
||||
@ -137,11 +141,10 @@ class Speller {
|
||||
int32_t wplen [MAX_FRAG_SIZE] ,
|
||||
bool *isstop );
|
||||
|
||||
Language m_language[MAX_LANGUAGES];
|
||||
// Language m_language[MAX_LANGUAGES];
|
||||
|
||||
char *getRandomWord() ;
|
||||
bool loadUnifiedDict();
|
||||
bool createUnifiedDict ();
|
||||
|
||||
void dictLookupTest ( char *ff );
|
||||
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "Words.h"
|
||||
//#include "AppendingWordsWindow.h"
|
||||
#include "Sections.h"
|
||||
#include "Msg20.h"
|
||||
|
||||
Summary::Summary()
|
||||
: m_summaryLocs(m_summaryLocBuf,
|
||||
|
112
XmlDoc.cpp
112
XmlDoc.cpp
@ -7266,118 +7266,6 @@ char XmlDoc::computeLangId ( Sections *sections , Words *words, char *lv ) {
|
||||
}
|
||||
|
||||
return maxi;
|
||||
//m_langId = maxi;
|
||||
//m_langIdValid = true;
|
||||
//return &m_langId;
|
||||
|
||||
/*
|
||||
int32_t freqScore = 0;
|
||||
int32_t lang;
|
||||
if ( ! m_processedLang ) {
|
||||
// do not repeat this call for this document
|
||||
m_processedLang = true;
|
||||
lang = words->getLanguage( sections ,
|
||||
1000 , // sampleSize ,
|
||||
m_niceness,
|
||||
&freqScore);
|
||||
// return NULL on error with g_errno set
|
||||
if ( lang == -1 ) return NULL;
|
||||
// we got it from words, return
|
||||
if ( lang != 0 ) {
|
||||
m_langId = lang;
|
||||
m_langIdValid = true;
|
||||
return &m_langId;
|
||||
}
|
||||
}
|
||||
|
||||
m_langId = 0;
|
||||
// try from charset
|
||||
uint16_t *charset = getCharset ( );
|
||||
if ( ! charset || charset == (uint16_t *)-1 )return (uint8_t *)charset;
|
||||
// do based on charset
|
||||
if ( *charset == csGB18030 ) m_langId = langChineseTrad;
|
||||
if ( *charset == csGBK ) m_langId = langChineseSimp;
|
||||
|
||||
if ( m_langId ) {
|
||||
m_langIdValid = true;
|
||||
return &m_langId;
|
||||
}
|
||||
|
||||
// are we a root?
|
||||
char *isRoot = getIsSiteRoot();
|
||||
if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot;
|
||||
// this lookup here might be unnecessary
|
||||
uint8_t *rl = NULL;
|
||||
if ( ! *isRoot ) {
|
||||
rl = getRootLangId();
|
||||
if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl;
|
||||
}
|
||||
|
||||
//Url *u = getCurrentUrl();
|
||||
Url *u = getFirstUrl();
|
||||
uint8_t gs[METHOD_CAP];
|
||||
// reset language method vector
|
||||
memset( gs , 0, sizeof(uint8_t) * METHOD_CAP );
|
||||
// Let the site tell us what language it's in
|
||||
gs [METHOD_TAG] = g_langId.guessLanguageFromTag( xml );
|
||||
// Guess from the FIRST URL (unredirected url)
|
||||
gs [METHOD_URL] = g_langId.guessLanguageFromUrl( u->getUrl() );
|
||||
// Guess from the outlinks
|
||||
gs [METHOD_OUTLINKS] = g_langId.guessLanguageFromOutlinks( links );
|
||||
// Guess from the inlinks
|
||||
gs [METHOD_INLINKS] = g_langId.guessLanguageFromInlinks(info1, *ip);
|
||||
// root page's language, if there was one
|
||||
if ( ! *isRoot ) gs [METHOD_ROOT] = *rl;
|
||||
|
||||
int32_t scores[MAX_LANGUAGES];
|
||||
memset( scores, 0, sizeof(int32_t) * MAX_LANGUAGES );
|
||||
// weights for the 10 methods
|
||||
char cw[] = { 8,9,4,7,6,7,8,1,2};
|
||||
// add up weighted scores
|
||||
for(int i = 0; i < METHOD_CAP; i++ )
|
||||
scores[gs[i]] += cw[i];
|
||||
|
||||
// reset the "lang" to langUnknown which is 0
|
||||
lang = langUnknown ;
|
||||
int max, oldmax;
|
||||
max = oldmax = 0;
|
||||
// find best language
|
||||
for ( int32_t i = MAX_LANGUAGES - 1; i > 0 ; i-- ) {
|
||||
if ( scores[i] < max) continue;
|
||||
oldmax = max;
|
||||
max = scores[i];
|
||||
lang = i;
|
||||
}
|
||||
// give up if not too conclusive
|
||||
if( (max - oldmax) < 3 ) { // cr->m_languageThreshold) {
|
||||
//log(LOG_DEBUG, "build: Language: Threshold, score "
|
||||
// "(%"INT32" - %"INT32") %"INT32" vs. %"INT32".\n",
|
||||
// (int32_t)max,
|
||||
// (int32_t)oldmax,
|
||||
// (int32_t)max - oldmax,
|
||||
// (int32_t)3);//(int32_t)cr->m_languageThreshold);
|
||||
lang = langUnknown;
|
||||
}
|
||||
// Make sure we're over the bailout value, this
|
||||
// keeps low scoring methods like TLD from being
|
||||
// the decider if it was the only successful method.
|
||||
if ( max < 5 ) { // cr->m_languageBailout ) {
|
||||
//log(LOG_DEBUG, "build: Language: Bailout, "
|
||||
// "score %"INT32" vs. %"INT32".",
|
||||
// (int32_t)max, (int32_t)5);//cr->m_languageBailout);
|
||||
lang = langUnknown;
|
||||
}
|
||||
// If the language is still not known,
|
||||
// use the language detected from the frames.
|
||||
//if(lang == langUnknown) lang = frameFoundLang;
|
||||
// . try dmoz if still unknown
|
||||
// . limit to 10 of them
|
||||
// all done, do not repeat
|
||||
m_langIdValid = true;
|
||||
m_langId = lang;
|
||||
m_langIdScore = max;
|
||||
return &m_langId;
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user