2016-09-23 05:05:22 -04:00
|
|
|
|
#include <gtest/gtest.h>
|
2015-11-13 11:55:44 -05:00
|
|
|
|
|
2018-02-05 08:54:18 -05:00
|
|
|
|
#include "utf8.h"
|
2018-02-04 04:59:31 -05:00
|
|
|
|
#include "TitleSummaryCodepointFilter.h"
|
|
|
|
|
|
2015-11-13 11:55:44 -05:00
|
|
|
|
#include <sstream>
|
|
|
|
|
|
|
|
|
|
#include "Log.h"
|
|
|
|
|
|
|
|
|
|
// Test is based on: http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
|
|
|
|
|
|
|
|
|
/// Verifies that isValidUtf8Char() accepts well-formed UTF-8 sequences:
/// ordinary multi-byte text plus the boundary sequences listed below.
/// Each entry is a separate test vector; the SCOPED_TRACE message reports
/// the index of the failing entry.
TEST(UnicodeTest, ValidUtf8) {
    const char* inputs[] = {
        // 1. correct UTF-8 text
        "κ",
        "ό",
        "σ",
        "μ",
        "ε",
        "幸",
        "运",
        "Б",
        "ъ",
        "л",
        "г",
        "а",
        "р",
        "с",
        "к",
        "и",

        // 2. boundary condition test cases
        // 2.1 first possible sequence of range
        // NOTE: passed as a const char*, "\x00" is an empty C string.
        "\x00",
        "\xc2\x80",
        "\xe0\xa0\x80",
        "\xe1\x80\x80",
        "\xf0\x90\x80\x80",
        "\xf1\x80\x80\x80",
        "\xf4\x80\x80\x80",

        // 2.2 last possible sequence of range
        "\x7f",
        // The comma after this entry matters: without it the literal is
        // concatenated with the next one and both vectors are lost.
        "\xdf\xbf",
        "\xe0\xbf\xbf",
        "\xef\xbf\xbf",
        "\xf0\xbf\xbf\xbf",
        "\xf3\xbf\xbf\xbf",
        "\xf4\x8f\xbf\xbf",

        // 2.3 other boundary conditions
        "\xed\x9f\xbf",
        "\xee\x80\x80",
        "\xef\xbf\xbd",
        "\xf4\x8f\x80\x80"
    };

    size_t len = sizeof(inputs) / sizeof(inputs[0]);
    for (size_t i = 0; i < len; i++) {
        // Tag any failure with the index of the offending vector.
        std::stringstream ss;
        ss << "inputs[" << i << "]";
        SCOPED_TRACE(ss.str());

        EXPECT_TRUE(isValidUtf8Char(inputs[i]));
    }
}
|
|
|
|
|
|
|
|
|
|
/// Verifies that isValidUtf8Char() rejects malformed UTF-8: stray
/// continuation bytes, lead bytes without continuations, truncated
/// sequences, impossible bytes and overlong encodings.
/// Section numbers follow the UTF-8 decoder stress test this file is based on.
TEST(UnicodeTest, InvalidUtf8) {
    const char* inputs[] = {
        // 3 malformed sequences
        // 3.1 unexpected continuation bytes
        "\x80",
        "\xbf",

        // 3.2 lonely start characters (each followed by a space)
        "\xc0 ",
        "\xdf ",
        "\xe0 ",
        "\xef ",
        "\xf0 ",
        "\xf7 ",
        "\xf8 ",
        "\xfb ",
        "\xfc ",
        "\xfd ",

        // 3.3 sequences with last continuation byte missing
        "\xc2",
        "\xe0\xa0",
        "\xe1\x80",
        "\xf0\x90\x80",
        "\xf1\x80\x80",
        "\xf4\x80\x80",
        "\xdf",
        "\xe0\xbf",
        "\xef\xbf",
        "\xf0\xbf\xbf",
        "\xf3\xbf\xbf",
        "\xf4\x8f\xbf",

        // 3.5 impossible bytes
        "\xfe",
        "\xff",
        "\xfe\xfe\xff\xff",

        // 4.1 examples of an overlong ascii character
        "\xc0\xaf",
        "\xe0\x80\xaf",
        "\xf0\x80\x80\xaf",

        // 4.2 maximum overlong sequences
        "\xc1\xbf",
        "\xe0\x9f\xbf",
        "\xf0\x8f\xbf\xbf",

        // 4.3 overlong representation of the NUL character
        "\xc0\x80",
        "\xe0\x80\x80",
        // Four bytes are required here: the three-byte form would only be
        // another truncated sequence, not an overlong NUL.
        "\xf0\x80\x80\x80",

        /// @todo ALC support for the rest of the test cases
    };

    size_t len = sizeof(inputs) / sizeof(inputs[0]);
    for (size_t i = 0; i < len; i++) {
        // Tag any failure with the index of the offending vector.
        std::stringstream ss;
        ss << "inputs[" << i << "]";
        SCOPED_TRACE(ss.str());
        EXPECT_FALSE(isValidUtf8Char(inputs[i]));
    }
}
|
2016-01-08 09:20:42 -05:00
|
|
|
|
|
|
|
|
|
/// Verifies that isUtf8UnwantedSymbols() flags symbol/pictograph code points.
/// For each Unicode block listed below, two code points from that block are
/// checked (typically its first and last).
///
/// Code points that are noncharacters or unassigned are written as explicit
/// UTF-8 hex escapes: as raw literals they are invisible and are easily
/// stripped by editors and tooling, which would silently turn the test
/// vector into an empty string.
TEST(UnicodeTest, UnwantedSymbols) {
    const char* inputs[] = {
        // Emoji & Pictographs
        // 2600–26FF: Miscellaneous Symbols
        "☀",
        "⛿",

        // 2700–27BF: Dingbats
        "✀",
        "➿",

        // FFF0-FFFF: Specials
        "\xef\xbf\xb0",  // U+FFF0
        "\xef\xbf\xbf",  // U+FFFF

        // 1F300–1F5FF: Miscellaneous Symbols and Pictographs
        "🌀",
        "🗿",

        // 1F600–1F64F: Emoticons
        "😀",
        "🙏",

        // 1F650–1F67F: Ornamental Dingbats
        "🙐",
        "🙿",

        // 1F680–1F6FF: Transport and Map Symbols
        "🚀",
        "🛰",

        // 1F900–1F9FF: Supplemental Symbols and Pictographs
        "🤀",
        "🧿",

        // Game Symbols
        // 1F000–1F02F: Mahjong Tiles
        "🀀",
        "\xf0\x9f\x80\xaf",  // U+1F02F

        // 1F030–1F09F: Domino Tiles
        "🀰",
        "\xf0\x9f\x82\x9f",  // U+1F09F

        // 1F0A0–1F0FF: Playing Cards
        "🂠",
        "\xf0\x9f\x83\xbf",  // U+1F0FF

        // Enclosed Alphanumeric Supplement
        // 1F1E6–1F1FF: Regional indicator symbols
        "🇦",
        "🇿",

        // Geometric Shapes
        // 25A0–25FF: Geometric Shapes
        "■",
        "◿",
    };

    size_t len = sizeof(inputs) / sizeof(inputs[0]);
    for (size_t i = 0; i < len; i++) {
        // Tag any failure with the index of the offending vector.
        std::stringstream ss;
        ss << "inputs[" << i << "]";
        SCOPED_TRACE(ss.str());

        EXPECT_TRUE(isUtf8UnwantedSymbols(inputs[i]));
    }
}
|
2017-10-06 11:12:17 -04:00
|
|
|
|
|
|
|
|
|
/// Checks getUtf8CharSize() against a table of
/// (UTF-8 encoded character, expected result) pairs.
TEST(UnicodeTest, CharSize) {
    std::vector<std::tuple<const char *, size_t>> expectations = {
        std::make_tuple("💩", 4)
    };

    for (auto &entry : expectations) {
        const char *encoded = std::get<0>(entry);
        const size_t expected_size = std::get<1>(entry);

        EXPECT_EQ(expected_size, getUtf8CharSize(encoded));
    }
}
|