Asterisk - The Open Source Telephony Project  18.5.0
utf8.c
Go to the documentation of this file.
1 /*
2  * Asterisk -- An open source telephony toolkit.
3  *
4  * Copyright (C) 2020, Sean Bright
5  *
6  * Sean Bright <[email protected]>
7  *
8  * See http://www.asterisk.org for more information about
9  * the Asterisk project. Please do not directly contact
10  * any of the maintainers of this project for assistance;
11  * the project provides a web site, mailing lists and IRC
12  * channels for your use.
13  *
14  * This program is free software, distributed under the terms of
15  * the GNU General Public License Version 2. See the LICENSE file
16  * at the top of the source tree.
17  */
18 
19 /*! \file
20  *
21  * \brief UTF-8 information and validation functions
22  */
23 
24 /*** MODULEINFO
25  <support_level>core</support_level>
26 ***/
27 
28 #include "asterisk.h"
29 
30 #include "asterisk/utils.h"
31 #include "asterisk/utf8.h"
32 #include "asterisk/test.h"
33 
34 /*
35  * BEGIN THIRD PARTY CODE
36  *
37  * Copyright (c) 2008-2010 Björn Höhrmann <[email protected]>
38  *
39  * Permission is hereby granted, free of charge, to any person obtaining a copy
40  * of this software and associated documentation files (the "Software"), to deal
41  * in the Software without restriction, including without limitation the rights
42  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
43  * copies of the Software, and to permit persons to whom the Software is
44  * furnished to do so, subject to the following conditions:
45  *
46  * The above copyright notice and this permission notice shall be included in all
47  * copies or substantial portions of the Software.
48  *
49  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
54  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
55  * SOFTWARE.
56  *
57  * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
58  */
59 
60 #define UTF8_ACCEPT 0
61 #define UTF8_REJECT 12
62 
/*
 * Lookup data for the UTF-8 decoding automaton.
 *
 * Entries 0..255 map each possible byte value to a character class (0..11).
 * Grouping bytes into classes keeps the transition table small.
 *
 * Entries from 256 onward are the transition table. The next state is read
 * from index 256 + current_state + character_class, so state values are
 * multiples of 12 (one row of 12 character classes per state).
 */
static const uint8_t utf8d[] = {
	/* Character classes, bytes 0x00 - 0x7f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* Character classes, bytes 0x80 - 0x8f */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* Character classes, bytes 0x90 - 0x9f */
	9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	/* Character classes, bytes 0xa0 - 0xbf */
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	/* Character classes, bytes 0xc0 - 0xdf */
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	/* Character classes, bytes 0xe0 - 0xef */
	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
	/* Character classes, bytes 0xf0 - 0xff */
	11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

	/* Transitions out of state 0 (UTF8_ACCEPT) */
	0,12,24,36,60,96,84,12,12,12,48,72,
	/* Transitions out of state 12 (UTF8_REJECT): every class stays in 12 */
	12,12,12,12,12,12,12,12,12,12,12,12,
	/* Transitions out of state 24 */
	12,0,12,12,12,12,12,0,12,0,12,12,
	/* Transitions out of state 36 */
	12,24,12,12,12,12,12,24,12,24,12,12,
	/* Transitions out of state 48 */
	12,12,12,12,12,12,12,24,12,12,12,12,
	/* Transitions out of state 60 */
	12,24,12,12,12,12,12,12,12,24,12,12,
	/* Transitions out of state 72 */
	12,12,12,12,12,12,12,36,12,36,12,12,
	/* Transitions out of state 84 */
	12,36,12,12,12,12,12,36,12,36,12,12,
	/* Transitions out of state 96 */
	12,36,12,12,12,12,12,12,12,12,12,12,
};
83 
84 #if 0
85 /* We can bring this back if we need the codepoint? */
86 static uint32_t inline decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
87  uint32_t type = utf8d[byte];
88 
89  *codep = (*state != UTF8_ACCEPT) ?
90  (byte & 0x3fu) | (*codep << 6) :
91  (0xff >> type) & (byte);
92 
93  *state = utf8d[256 + *state + type];
94  return *state;
95 }
96 #endif
97 
98 static uint32_t inline decode(uint32_t *state, uint32_t byte) {
99  uint32_t type = utf8d[byte];
100  *state = utf8d[256 + *state + type];
101  return *state;
102 }
103 
104 /*
105  * END THIRD PARTY CODE
106  *
107  * See copyright notice above.
108  */
109 
110 int ast_utf8_is_valid(const char *src)
111 {
112  uint32_t state = UTF8_ACCEPT;
113 
114  while (*src) {
115  decode(&state, (uint8_t) *src++);
116  }
117 
118  return state == UTF8_ACCEPT;
119 }
120 
121 int ast_utf8_is_validn(const char *src, size_t size)
122 {
123  uint32_t state = UTF8_ACCEPT;
124 
125  while (size && *src) {
126  decode(&state, (uint8_t) *src++);
127  size--;
128  }
129 
130  return state == UTF8_ACCEPT;
131 }
132 
133 void ast_utf8_copy_string(char *dst, const char *src, size_t size)
134 {
135  uint32_t state = UTF8_ACCEPT;
136  char *last_good = dst;
137 
138  ast_assert(size > 0);
139 
140  while (size && *src) {
141  if (decode(&state, (uint8_t) *src) == UTF8_REJECT) {
142  /* We _could_ replace with U+FFFD and try to recover, but for now
143  * we treat this the same as if we had run out of space */
144  break;
145  }
146 
147  *dst++ = *src++;
148  size--;
149 
150  if (size && state == UTF8_ACCEPT) {
151  /* last_good is where we will ultimately write the 0 byte */
152  last_good = dst;
153  }
154  }
155 
156  *last_good = '\0';
157 }
158 
160  uint32_t state;
161 };
162 
164 {
165  struct ast_utf8_validator *tmp = ast_malloc(sizeof(*tmp));
166 
167  if (!tmp) {
168  return 1;
169  }
170 
171  tmp->state = UTF8_ACCEPT;
172  *validator = tmp;
173  return 0;
174 }
175 
177  struct ast_utf8_validator *validator)
178 {
179  switch (validator->state) {
180  case UTF8_ACCEPT:
181  return AST_UTF8_VALID;
182  case UTF8_REJECT:
183  return AST_UTF8_INVALID;
184  default:
185  return AST_UTF8_UNKNOWN;
186  }
187 }
188 
190  struct ast_utf8_validator *validator, const char *data)
191 {
192  while (*data) {
193  decode(&validator->state, (uint8_t) *data++);
194  }
195 
196  return ast_utf8_validator_state(validator);
197 }
198 
200  struct ast_utf8_validator *validator, const char *data, size_t size)
201 {
202  while (size && *data) {
203  decode(&validator->state, (uint8_t) *data++);
204  size--;
205  }
206 
207  return ast_utf8_validator_state(validator);
208 }
209 
211 {
212  validator->state = UTF8_ACCEPT;
213 }
214 
216 {
217  ast_free(validator);
218 }
219 
220 #ifdef TEST_FRAMEWORK
221 
222 AST_TEST_DEFINE(test_utf8_is_valid)
223 {
224  switch (cmd) {
225  case TEST_INIT:
226  info->name = "is_valid";
227  info->category = "/main/utf8/";
228  info->summary = "Test ast_utf8_is_valid and ast_utf8_is_validn";
229  info->description =
230  "Tests UTF-8 string validation code.";
231  return AST_TEST_NOT_RUN;
232  case TEST_EXECUTE:
233  break;
234  }
235 
236  /* Valid UTF-8 */
237  ast_test_validate(test, ast_utf8_is_valid("Asterisk"));
238  ast_test_validate(test, ast_utf8_is_valid("\xce\xbb"));
239  ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b"));
240  ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e"));
241 
242  /* Valid with leading */
243  ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk"));
244  ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb"));
245  ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b"));
246  ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e"));
247 
248  /* Valid with trailing */
249  ast_test_validate(test, ast_utf8_is_valid("Asterisk aaa"));
250  ast_test_validate(test, ast_utf8_is_valid("\xce\xbb aaa"));
251  ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b aaa"));
252  ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e aaa"));
253 
254  /* Valid with leading and trailing */
255  ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk aaa"));
256  ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb aaa"));
257  ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b aaa"));
258  ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e aaa"));
259 
260  /* Valid if limited by number of bytes */
261  ast_test_validate(test, ast_utf8_is_validn("Asterisk" "\xff", strlen("Asterisk")));
262  ast_test_validate(test, ast_utf8_is_validn("\xce\xbb" "\xff", strlen("\xce\xbb")));
263  ast_test_validate(test, ast_utf8_is_validn("\xe2\x8a\x9b" "\xff", strlen("\xe2\x8a\x9b")));
264  ast_test_validate(test, ast_utf8_is_validn("\xf0\x9f\x93\x9e" "\xff", strlen("\xf0\x9f\x93\x9e")));
265 
266  /* Invalid */
267  ast_test_validate(test, !ast_utf8_is_valid("\xc0\x8a")); /* Overlong */
268  ast_test_validate(test, !ast_utf8_is_valid("98.6\xa7")); /* 'High ASCII' */
269  ast_test_validate(test, !ast_utf8_is_valid("\xc3\x28"));
270  ast_test_validate(test, !ast_utf8_is_valid("\xa0\xa1"));
271  ast_test_validate(test, !ast_utf8_is_valid("\xe2\x28\xa1"));
272  ast_test_validate(test, !ast_utf8_is_valid("\xe2\x82\x28"));
273  ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\xbc"));
274  ast_test_validate(test, !ast_utf8_is_valid("\xf0\x90\x28\xbc"));
275  ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\x28"));
276 
277  return AST_TEST_PASS;
278 }
279 
/*!
 * \brief Copy \a src into a scratch buffer and compare the result.
 *
 * \param src     String handed to ast_utf8_copy_string().
 * \param dst_len Size of the scratch destination buffer in bytes.
 * \param cmp     Expected contents of the buffer after the copy.
 *
 * \retval 1 The copied string equals \a cmp.
 * \retval 0 The copied string differs from \a cmp.
 */
static int test_copy_and_compare(const char *src, size_t dst_len, const char *cmp)
{
	char scratch[dst_len];

	ast_utf8_copy_string(scratch, src, dst_len);

	return !strcmp(scratch, cmp);
}
286 
287 AST_TEST_DEFINE(test_utf8_copy_string)
288 {
289  switch (cmd) {
290  case TEST_INIT:
291  info->name = "copy_string";
292  info->category = "/main/utf8/";
293  info->summary = "Test ast_utf8_copy_string";
294  info->description =
295  "Tests UTF-8 string copying code.";
296  return AST_TEST_NOT_RUN;
297  case TEST_EXECUTE:
298  break;
299  }
300 
301  ast_test_validate(test, test_copy_and_compare("Asterisk", 6, "Aster"));
302  ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 11, "Asterisk "));
303  ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 12, "Asterisk \xc2\xae"));
304  ast_test_validate(test, test_copy_and_compare("Asterisk \xc0\x8a", 12, "Asterisk "));
305  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 1, ""));
306  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 2, ""));
307  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 3, "\xce\xbb"));
308  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 4, "\xce\xbb "));
309  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 5, "\xce\xbb x"));
310  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 6, "\xce\xbb xy"));
311  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 7, "\xce\xbb xyz"));
312 
313  return AST_TEST_PASS;
314 }
315 
316 AST_TEST_DEFINE(test_utf8_validator)
317 {
318  struct ast_utf8_validator *validator;
319 
320  switch (cmd) {
321  case TEST_INIT:
322  info->name = "utf8_validator";
323  info->category = "/main/utf8/";
324  info->summary = "Test ast_utf8_validator";
325  info->description =
326  "Tests UTF-8 progressive validator code.";
327  return AST_TEST_NOT_RUN;
328  case TEST_EXECUTE:
329  break;
330  }
331 
332  if (ast_utf8_validator_new(&validator)) {
333  return AST_TEST_FAIL;
334  }
335 
336  ast_test_validate(test, ast_utf8_validator_feed(validator, "Asterisk") == AST_UTF8_VALID);
337  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc2") == AST_UTF8_UNKNOWN);
338  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xae") == AST_UTF8_VALID);
339  ast_test_validate(test, ast_utf8_validator_feed(validator, "Private") == AST_UTF8_VALID);
340  ast_test_validate(test, ast_utf8_validator_feed(validator, "Branch") == AST_UTF8_VALID);
341  ast_test_validate(test, ast_utf8_validator_feed(validator, "Exchange") == AST_UTF8_VALID);
342  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xe2") == AST_UTF8_UNKNOWN);
343  ast_test_validate(test, ast_utf8_validator_feed(validator, "\x84") == AST_UTF8_UNKNOWN);
344  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xbb") == AST_UTF8_VALID);
345  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc0\x8a") == AST_UTF8_INVALID);
346  ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
347  ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
348  ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
349 
350  ast_utf8_validator_destroy(validator);
351 
352  return AST_TEST_PASS;
353 }
354 
355 static void test_utf8_shutdown(void)
356 {
357  AST_TEST_UNREGISTER(test_utf8_is_valid);
358  AST_TEST_UNREGISTER(test_utf8_copy_string);
359  AST_TEST_UNREGISTER(test_utf8_validator);
360 }
361 
362 int ast_utf8_init(void)
363 {
364  AST_TEST_REGISTER(test_utf8_is_valid);
365  AST_TEST_REGISTER(test_utf8_copy_string);
366  AST_TEST_REGISTER(test_utf8_validator);
367 
369 
370  return 0;
371 }
372 
373 #else /* !TEST_FRAMEWORK */
374 
/*!
 * \brief UTF-8 module initialization for builds without TEST_FRAMEWORK.
 *
 * No unit tests are compiled in this configuration, so nothing is done.
 *
 * \return Always 0.
 */
int ast_utf8_init(void)
{
	return 0;
}
379 
380 #endif
static const char type[]
Definition: chan_ooh323.c:109
The consumed sequence is invalid UTF-8.
Definition: utf8.h:86
enum sip_cc_notify_state state
Definition: chan_sip.c:959
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
Destroy a UTF-8 validator.
Definition: utf8.c:215
int ast_utf8_validator_new(struct ast_utf8_validator **validator)
Create a new UTF-8 validator.
Definition: utf8.c:163
Asterisk main include file. File version handling, generic pbx functions.
int ast_utf8_is_validn(const char *src, size_t size)
Check if the first size bytes of a string are valid UTF-8.
Definition: utf8.c:121
void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
Reset the state of a UTF-8 validator.
Definition: utf8.c:210
static int test_copy_and_compare(const char *src, size_t dst_len, const char *cmp)
Definition: utf8.c:280
static uint32_t decode(uint32_t *state, uint32_t byte)
Definition: utf8.c:98
uint32_t state
Definition: utf8.c:160
int ast_utf8_is_valid(const char *src)
Check if a zero-terminated string is valid UTF-8.
Definition: utf8.c:110
static int tmp()
Definition: bt_open.c:389
#define UTF8_REJECT
Definition: utf8.c:61
Test Framework API.
#define AST_TEST_REGISTER(cb)
Definition: test.h:127
#define ast_assert(a)
Definition: utils.h:695
static void test_utf8_shutdown(void)
Definition: utf8.c:355
UTF-8 information and validation functions.
int ast_utf8_init(void)
Register UTF-8 tests.
Definition: utf8.c:362
Utility functions.
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
Definition: utf8.c:176
enum ast_utf8_validation_result ast_utf8_validator_feed(struct ast_utf8_validator *validator, const char *data)
Feed a zero-terminated string into the UTF-8 validator.
Definition: utf8.c:189
int ast_register_cleanup(void(*func)(void))
Register a function to be executed before Asterisk gracefully exits.
Definition: clicompat.c:19
void ast_utf8_copy_string(char *dst, const char *src, size_t size)
Copy a string safely ensuring valid UTF-8.
Definition: utf8.c:133
The consumed sequence is valid UTF-8.
Definition: utf8.h:78
#define ast_malloc(len)
A wrapper for malloc()
Definition: astmm.h:193
#define UTF8_ACCEPT
Definition: utf8.c:60
ast_utf8_validation_result
Definition: utf8.h:71
#define AST_TEST_UNREGISTER(cb)
Definition: test.h:128
def info(msg)
enum ast_utf8_validation_result ast_utf8_validator_feedn(struct ast_utf8_validator *validator, const char *data, size_t size)
Feed a string into the UTF-8 validator.
Definition: utf8.c:199
#define ast_free(a)
Definition: astmm.h:182
AST_TEST_DEFINE(test_utf8_is_valid)
Definition: utf8.c:222
The validator is in an intermediate state.
Definition: utf8.h:96
static const uint8_t utf8d[]
Definition: utf8.c:63