privacore-open-source-searc.../test/unit/UnicodeTest.cpp
Ivan Skytte Jørgensen 2d3faa2b3e unicode: fix unittests
2018-02-08 16:13:18 +01:00

212 lines
4.0 KiB
C++
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <gtest/gtest.h>
#include "utf8.h"
#include "TitleSummaryCodepointFilter.h"
#include <sstream>
#include "Log.h"
// Test is based on: http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
// Checks that isValidUtf8Char() returns true for every entry of a table of
// well-formed UTF-8 sequences (plain text samples plus range boundaries).
//
// Every entry must be followed by a comma: adjacent string literals are
// concatenated by the compiler, so a missing comma silently merges two test
// cases into a single array element.
//
// NOTE(review): the literals that read as "" in section 1 may hold characters
// that do not render in every viewer - confirm against the repository copy
// before touching them.
TEST(UnicodeTest, ValidUtf8) {
	const char* inputs[] = {
		// 1. correct UTF-8 text
		"κ",
		"",
		"σ",
		"μ",
		"ε",
		"",
		"",
		"Б",
		"ъ",
		"л",
		"г",
		"а",
		"р",
		"с",
		"к",
		"и",

		// 2. boundary condition test cases
		// 2.1 first possible sequence of range
		"\x00",
		"\xc2\x80",
		"\xe0\xa0\x80",
		"\xe1\x80\x80",
		"\xf0\x90\x80\x80",
		"\xf1\x80\x80\x80",
		"\xf4\x80\x80\x80",

		// 2.2 last possible sequence of range
		"\x7f",
		"\xdf\xbf",
		"\xe0\xbf\xbf",
		"\xef\xbf\xbf",
		"\xf0\xbf\xbf\xbf",
		"\xf3\xbf\xbf\xbf",
		"\xf4\x8f\xbf\xbf",

		// 2.3 other boundary conditions
		"\xed\x9f\xbf",
		"\xee\x80\x80",
		"\xef\xbf\xbd",
		"\xf4\x8f\x80\x80"
	};

	const size_t count = sizeof(inputs) / sizeof(inputs[0]);
	for (size_t i = 0; i < count; i++) {
		// Tag any failure with the index of the offending table entry.
		std::stringstream ss;
		ss << "inputs[" << i << "]";
		SCOPED_TRACE(ss.str());

		EXPECT_TRUE(isValidUtf8Char(inputs[i]));
	}
}
// Checks that isValidUtf8Char() returns false for every entry of a table of
// malformed UTF-8 sequences: stray continuation bytes, lead bytes without a
// continuation, truncated sequences, impossible bytes and overlong encodings.
// Section numbers follow the UTF-8 decoder stress test this file is based on.
TEST(UnicodeTest, InvalidUtf8) {
	const char* inputs[] = {
		// 3 malformed sequences
		// 3.1 unexpected continuation bytes
		"\x80",
		"\xbf",

		// 3.2 lonely start characters (lead byte followed by a space)
		"\xc0 ",
		"\xdf ",
		"\xe0 ",
		"\xef ",
		"\xf0 ",
		"\xf7 ",
		"\xf8 ",
		"\xfb ",
		"\xfc ",
		"\xfd ",

		// 3.3 sequences with last continuation byte missing
		"\xc2",
		"\xe0\xa0",
		"\xe1\x80",
		"\xf0\x90\x80",
		"\xf1\x80\x80",
		"\xf4\x80\x80",
		"\xdf",
		"\xe0\xbf",
		"\xef\xbf",
		"\xf0\xbf\xbf",
		"\xf3\xbf\xbf",
		"\xf4\x8f\xbf",

		// 3.5 impossible bytes
		"\xfe",
		"\xff",
		"\xfe\xfe\xff\xff",

		// 4.1 examples of an overlong ascii character
		"\xc0\xaf",
		"\xe0\x80\xaf",
		"\xf0\x80\x80\xaf",

		// 4.2 maximum overlong sequences
		"\xc1\xbf",
		"\xe0\x9f\xbf",
		"\xf0\x8f\xbf\xbf",

		// 4.3 overlong representation of the NUL character
		"\xc0\x80",
		"\xe0\x80\x80",
		// Full 4-byte overlong form; a 3-byte "\xf0\x80\x80" would only be
		// another truncated sequence, which section 3.3 already covers.
		"\xf0\x80\x80\x80",

		/// @todo ALC support for the rest of the test cases
	};

	const size_t count = sizeof(inputs) / sizeof(inputs[0]);
	for (size_t i = 0; i < count; i++) {
		// Tag any failure with the index of the offending table entry.
		std::stringstream ss;
		ss << "inputs[" << i << "]";
		SCOPED_TRACE(ss.str());

		EXPECT_FALSE(isValidUtf8Char(inputs[i]));
	}
}
// Checks that isUtf8UnwantedSymbols() returns true for sample code points
// taken from each symbol block listed below (ranges are hexadecimal code
// points).
//
// Entries written as byte escapes replace literals that carried no visible
// character; spelling the bytes out keeps the test data unambiguous in any
// editor or viewer.
// NOTE(review): the escaped code points were chosen as the endpoints of the
// range named in the preceding comment (U+FFF0 for the first "Specials"
// entry) - confirm they match the originally intended characters.
TEST(UnicodeTest, UnwantedSymbols) {
	const char* inputs[] = {
		// Emoji & Pictographs
		// 2600-26FF: Miscellaneous Symbols
		"\xe2\x98\x80",  // U+2600
		"\xe2\x9b\xbf",  // U+26FF

		// 2700-27BF: Dingbats
		"\xe2\x9c\x80",  // U+2700
		"\xe2\x9e\xbf",  // U+27BF

		// FFF0-FFFF: Specials
		"\xef\xbf\xb0",  // U+FFF0
		"￿",

		// 1F300-1F5FF: Miscellaneous Symbols and Pictographs
		"🌀",
		"🗿",

		// 1F600-1F64F: Emoticons
		"😀",
		"🙏",

		// 1F650-1F67F: Ornamental Dingbats
		"🙐",
		"🙿",

		// 1F680-1F6FF: Transport and Map Symbols
		"🚀",
		"🛰",

		// 1F900-1F9FF: Supplemental Symbols and Pictographs
		"🤀",
		"🧿",

		// Game Symbols
		// 1F000-1F02F: Mahjong Tiles
		"🀀",
		"🀯",

		// 1F030-1F09F: Domino Tiles
		"🀰",
		"🂟",

		// 1F0A0-1F0FF: Playing Cards
		"🂠",
		"🃿",

		// Enclosed Alphanumeric Supplement
		// 1F1E6-1F1FF: Regional indicator symbols
		"🇦",
		"🇿",

		// Geometric Shapes
		// 25A0-25FF: Geometric Shapes
		"\xe2\x96\xa0",  // U+25A0
		"\xe2\x97\xbf",  // U+25FF
	};

	const size_t count = sizeof(inputs) / sizeof(inputs[0]);
	for (size_t i = 0; i < count; i++) {
		// Tag any failure with the index of the offending table entry.
		std::stringstream ss;
		ss << "inputs[" << i << "]";
		SCOPED_TRACE(ss.str());

		EXPECT_TRUE(isUtf8UnwantedSymbols(inputs[i]));
	}
}
// Table-driven check of getUtf8CharSize(): each row pairs a UTF-8 encoded
// character with the size the function is expected to report for it.
TEST(UnicodeTest, CharSize) {
	const std::vector<std::tuple<const char *, size_t>> cases = {
		std::make_tuple("💩", 4)
	};

	for (const auto &row : cases) {
		const char *text = std::get<0>(row);
		const size_t expected_size = std::get<1>(row);
		EXPECT_EQ(expected_size, getUtf8CharSize(text));
	}
}