Commit | Line | Data |
---|---|---|
9c92ab61 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
f0d6cc00 GKB |
2 | /* |
3 | * Kernel module for testing utf-8 support. | |
4 | * | |
5 | * Copyright 2017 Collabora Ltd. | |
f0d6cc00 GKB |
6 | */ |
7 | ||
8 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | |
9 | ||
10 | #include <linux/module.h> | |
11 | #include <linux/printk.h> | |
12 | #include <linux/unicode.h> | |
13 | #include <linux/dcache.h> | |
14 | ||
15 | #include "utf8n.h" | |
16 | ||
/*
 * Running counters for the self-test: _test() bumps total_tests for every
 * check and failed_tests for every failing one; init_test_ucd() resets and
 * reports them.  They are only used inside this file, so keep them static
 * instead of exporting two generically named global symbols.
 */
static unsigned int failed_tests;
static unsigned int total_tests;

/* Tests will be based on this version. */
#define UTF8_LATEST        UNICODE_AGE(12, 1, 0)
f0d6cc00 GKB |
22 | |
/*
 * _test() - record the outcome of one check.
 * @cond: expression that must be true for the check to pass
 * @func: name of the calling function, printed on failure
 * @line: source line of the check, printed on failure
 * @fmt:  printf-style detail message; "" (or NULL) when there is none
 *
 * Always bumps total_tests; on failure also bumps failed_tests and logs the
 * stringified condition, followed by the detail message when one was given.
 *
 * @cond is parenthesized before negation so that callers do not have to
 * wrap comparisons themselves (without it, test(a == b) would evaluate
 * !a == b).  Whether a detail message exists is decided by looking at the
 * first character of @fmt: a string literal is never NULL, so testing the
 * pointer alone can not tell "" from a real message.
 */
#define _test(cond, func, line, fmt, ...) do {				\
		const char *_test_msg = (fmt);				\
		int _test_has_msg = _test_msg && _test_msg[0];		\
									\
		total_tests++;						\
		if (!(cond)) {						\
			failed_tests++;					\
			pr_err("test %s:%d Failed: %s%s",		\
			       func, line, #cond,			\
			       _test_has_msg ? ":" : ".\n");		\
			if (_test_has_msg)				\
				pr_err(fmt, ##__VA_ARGS__);		\
		}							\
	} while (0)

/* Check @cond; on failure also print the printf-style message @fmt. */
#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)

/* Check @cond with no extra detail message. */
#define test(cond) _test(cond, __func__, __LINE__, "")
35 | ||
/*
 * Canonical decomposition (NFDI) vectors.  check_utf8_nfdi() normalizes
 * each .str and expects to read back exactly .dec.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[10];
	unsigned char dec[10];
} nfdi_test_data[] = {
	/* ---- Trivial sequence ---- */

	/* Plain ASCII "aBba" decomposes to itself. */
	{
		.str = "aBba",
		.dec = "aBba",
	},

	/* ---- Simple equivalent sequences ---- */

	/*
	 * U+00BC VULGAR FRACTION ONE QUARTER is left untouched: canonical
	 * decomposition does not split it into DIGIT ONE + FRACTION SLASH +
	 * DIGIT FOUR.
	 */
	{
		.str = {0xc2, 0xbc, 0x00},
		.dec = {0xc2, 0xbc, 0x00},
	},
	/*
	 * U+00E4 LATIN SMALL LETTER A WITH DIAERESIS becomes
	 * 'a' + U+0308 COMBINING DIAERESIS.
	 */
	{
		.str = {0xc3, 0xa4, 0x00},
		.dec = {0x61, 0xcc, 0x88, 0x00},
	},
	/*
	 * U+01C9 LATIN SMALL LETTER LJ is left untouched: canonical
	 * decomposition does not split it into 'l' + 'j'.
	 */
	{
		.str = {0xc7, 0x89, 0x00},
		.dec = {0xc7, 0x89, 0x00},
	},
	/* U+0387 GREEK ANO TELEIA becomes U+00B7 MIDDLE DOT. */
	{
		.str = {0xce, 0x87, 0x00},
		.dec = {0xc2, 0xb7, 0x00},
	},

	/* ---- Canonical ordering ---- */

	/*
	 * 'A' + U+0301 COMBINING ACUTE ACCENT + U+0328 COMBINING OGONEK is
	 * reordered to 'A' + COMBINING OGONEK + COMBINING ACUTE ACCENT.
	 */
	{
		.str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x00},
		.dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x00},
	},
	/*
	 * U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + U+0328 COMBINING
	 * OGONEK becomes 'a' + COMBINING OGONEK + COMBINING DIAERESIS.
	 */
	{
		.str = {0xc3, 0xa4, 0xcc, 0xa8, 0x00},
		.dec = {0x61, 0xcc, 0xa8, 0xcc, 0x88, 0x00},
	},
};
89 | ||
/*
 * Decomposition + case-folding (NFDICF) vectors.  check_utf8_nfdicf()
 * normalizes each .str and expects to read back exactly .ncf.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[30];
	unsigned char ncf[30];
} nfdicf_test_data[] = {
	/* ---- Trivial sequences ---- */

	/* "ABba" folds to lower-case "abba". */
	{
		.str = "ABba",
		.ncf = "abba",
	},
	/* All ASCII folds to lower-case; digits and '.' are unchanged. */
	{
		.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
		.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
	},
	/* U+00DF LATIN SMALL LETTER SHARP S folds to "ss". */
	{
		.str = {0xc3, 0x9f, 0x00},
		.ncf = "ss",
	},
	/*
	 * U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE folds to
	 * 'a' + U+030A COMBINING RING ABOVE.
	 */
	{
		.str = {0xc3, 0x85, 0x00},
		.ncf = {0x61, 0xcc, 0x8a, 0x00},
	},

	/* ---- Introduced by Unicode 8.0.0 ---- */

	/*
	 * Cherokee letters are interesting test-cases because they fold to
	 * upper-case.  Before 8.0.0 the Cherokee lower-case letters were
	 * undefined, so folding from lower-case is not stable between 7.0.0
	 * and 8.0.0, while folding from upper-case is.
	 */

	/* U+AB70 CHEROKEE SMALL LETTER A folds to U+13A0 CHEROKEE LETTER A. */
	{
		.str = {0xea, 0xad, 0xb0, 0x00},
		.ncf = {0xe1, 0x8e, 0xa0, 0x00},
	},
	/* U+13F8 CHEROKEE SMALL LETTER YE folds to U+13F0 CHEROKEE LETTER YE. */
	{
		.str = {0xe1, 0x8f, 0xb8, 0x00},
		.ncf = {0xe1, 0x8f, 0xb0, 0x00},
	},
	/*
	 * U+10C83 OLD HUNGARIAN CAPITAL LETTER AMB folds to
	 * U+10CC3 OLD HUNGARIAN SMALL LETTER AMB.
	 */
	{
		.str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
		.ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
	},

	/* ---- Introduced by Unicode 9.0.0 ---- */

	/*
	 * U+104B5 OSAGE CAPITAL LETTER CHA folds to
	 * U+104DD OSAGE SMALL LETTER CHA.
	 */
	{
		.str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
		.ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
	},
	/*
	 * U+A7AE LATIN CAPITAL LETTER SMALL CAPITAL I folds to
	 * U+026A LATIN LETTER SMALL CAPITAL I.
	 */
	{
		.str = {0xea, 0x9e, 0xae, 0x00},
		.ncf = {0xc9, 0xaa, 0x00},
	},

	/* ---- Introduced by Unicode 11.0.0 ---- */

	/*
	 * U+1C90 GEORGIAN MTAVRULI CAPITAL LETTER AN folds to
	 * U+10D0 GEORGIAN LETTER AN.
	 */
	{
		.str = {0xe1, 0xb2, 0x90, 0x00},
		.ncf = {0xe1, 0x83, 0x90, 0x00},
	},
};
160 | ||
6ca99ce7 CH |
161 | static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n, |
162 | const char *s) | |
9012d79c | 163 | { |
6ca99ce7 | 164 | return utf8nlen(um, n, s, (size_t)-1); |
9012d79c CH |
165 | } |
166 | ||
6ca99ce7 CH |
167 | static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, |
168 | enum utf8_normalization n, const char *s) | |
fbc59d65 | 169 | { |
6ca99ce7 | 170 | return utf8ncursor(u8c, um, n, s, (unsigned int)-1); |
fbc59d65 CH |
171 | } |
172 | ||
6ca99ce7 | 173 | static void check_utf8_nfdi(struct unicode_map *um) |
f0d6cc00 GKB |
174 | { |
175 | int i; | |
176 | struct utf8cursor u8c; | |
f0d6cc00 GKB |
177 | |
178 | for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { | |
179 | int len = strlen(nfdi_test_data[i].str); | |
180 | int nlen = strlen(nfdi_test_data[i].dec); | |
181 | int j = 0; | |
182 | unsigned char c; | |
183 | ||
6ca99ce7 CH |
184 | test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen)); |
185 | test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) == | |
186 | nlen)); | |
f0d6cc00 | 187 | |
6ca99ce7 | 188 | if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0) |
f0d6cc00 GKB |
189 | pr_err("can't create cursor\n"); |
190 | ||
191 | while ((c = utf8byte(&u8c)) > 0) { | |
192 | test_f((c == nfdi_test_data[i].dec[j]), | |
193 | "Unexpected byte 0x%x should be 0x%x\n", | |
194 | c, nfdi_test_data[i].dec[j]); | |
195 | j++; | |
196 | } | |
197 | ||
198 | test((j == nlen)); | |
199 | } | |
200 | } | |
201 | ||
6ca99ce7 | 202 | static void check_utf8_nfdicf(struct unicode_map *um) |
f0d6cc00 GKB |
203 | { |
204 | int i; | |
205 | struct utf8cursor u8c; | |
f0d6cc00 GKB |
206 | |
207 | for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { | |
208 | int len = strlen(nfdicf_test_data[i].str); | |
209 | int nlen = strlen(nfdicf_test_data[i].ncf); | |
210 | int j = 0; | |
211 | unsigned char c; | |
212 | ||
6ca99ce7 CH |
213 | test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) == |
214 | nlen)); | |
215 | test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) == | |
216 | nlen)); | |
f0d6cc00 | 217 | |
6ca99ce7 CH |
218 | if (utf8cursor(&u8c, um, UTF8_NFDICF, |
219 | nfdicf_test_data[i].str) < 0) | |
f0d6cc00 GKB |
220 | pr_err("can't create cursor\n"); |
221 | ||
222 | while ((c = utf8byte(&u8c)) > 0) { | |
223 | test_f((c == nfdicf_test_data[i].ncf[j]), | |
224 | "Unexpected byte 0x%x should be 0x%x\n", | |
225 | c, nfdicf_test_data[i].ncf[j]); | |
226 | j++; | |
227 | } | |
228 | ||
229 | test((j == nlen)); | |
230 | } | |
231 | } | |
232 | ||
6ca99ce7 | 233 | static void check_utf8_comparisons(struct unicode_map *table) |
f0d6cc00 GKB |
234 | { |
235 | int i; | |
f0d6cc00 GKB |
236 | |
237 | for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { | |
238 | const struct qstr s1 = {.name = nfdi_test_data[i].str, | |
239 | .len = sizeof(nfdi_test_data[i].str)}; | |
240 | const struct qstr s2 = {.name = nfdi_test_data[i].dec, | |
241 | .len = sizeof(nfdi_test_data[i].dec)}; | |
242 | ||
243 | test_f(!utf8_strncmp(table, &s1, &s2), | |
244 | "%s %s comparison mismatch\n", s1.name, s2.name); | |
245 | } | |
246 | ||
247 | for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { | |
248 | const struct qstr s1 = {.name = nfdicf_test_data[i].str, | |
249 | .len = sizeof(nfdicf_test_data[i].str)}; | |
250 | const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, | |
251 | .len = sizeof(nfdicf_test_data[i].ncf)}; | |
252 | ||
253 | test_f(!utf8_strncasecmp(table, &s1, &s2), | |
254 | "%s %s comparison mismatch\n", s1.name, s2.name); | |
255 | } | |
f0d6cc00 GKB |
256 | } |
257 | ||
2b3d0478 | 258 | static void check_supported_versions(struct unicode_map *um) |
f0d6cc00 GKB |
259 | { |
260 | /* Unicode 7.0.0 should be supported. */ | |
2b3d0478 | 261 | test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); |
f0d6cc00 GKB |
262 | |
263 | /* Unicode 9.0.0 should be supported. */ | |
2b3d0478 | 264 | test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); |
f0d6cc00 GKB |
265 | |
266 | /* Unicode 1x.0.0 (the latest version) should be supported. */ | |
2b3d0478 | 267 | test(utf8version_is_supported(um, UTF8_LATEST)); |
f0d6cc00 GKB |
268 | |
269 | /* Next versions don't exist. */ | |
2b3d0478 CH |
270 | test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); |
271 | test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); | |
272 | test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); | |
f0d6cc00 GKB |
273 | } |
274 | ||
275 | static int __init init_test_ucd(void) | |
276 | { | |
6ca99ce7 CH |
277 | struct unicode_map *um; |
278 | ||
f0d6cc00 GKB |
279 | failed_tests = 0; |
280 | total_tests = 0; | |
281 | ||
6ca99ce7 CH |
282 | um = utf8_load(UTF8_LATEST); |
283 | if (IS_ERR(um)) { | |
284 | pr_err("%s: Unable to load utf8 table.\n", __func__); | |
285 | return PTR_ERR(um); | |
286 | } | |
287 | ||
2b3d0478 | 288 | check_supported_versions(um); |
6ca99ce7 CH |
289 | check_utf8_nfdi(um); |
290 | check_utf8_nfdicf(um); | |
291 | check_utf8_comparisons(um); | |
f0d6cc00 GKB |
292 | |
293 | if (!failed_tests) | |
294 | pr_info("All %u tests passed\n", total_tests); | |
295 | else | |
296 | pr_err("%u out of %u tests failed\n", failed_tests, | |
297 | total_tests); | |
6ca99ce7 | 298 | utf8_unload(um); |
f0d6cc00 GKB |
299 | return 0; |
300 | } | |
301 | ||
302 | static void __exit exit_test_ucd(void) | |
303 | { | |
304 | } | |
305 | ||
306 | module_init(init_test_ucd); | |
307 | module_exit(exit_test_ucd); | |
308 | ||
309 | MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>"); | |
310 | MODULE_LICENSE("GPL"); |