Asterisk - The Open Source Telephony Project  18.5.0
Data Structures | Macros | Functions | Variables
utf8.c File Reference

UTF-8 information and validation functions. More...

#include "asterisk.h"
#include "asterisk/utils.h"
#include "asterisk/utf8.h"
#include "asterisk/test.h"
Include dependency graph for utf8.c:

Go to the source code of this file.

Data Structures

struct  ast_utf8_validator
 

Macros

#define UTF8_ACCEPT   0
 
#define UTF8_REJECT   12
 

Functions

 AST_TEST_DEFINE (test_utf8_is_valid)
 
 AST_TEST_DEFINE (test_utf8_copy_string)
 
 AST_TEST_DEFINE (test_utf8_validator)
 
void ast_utf8_copy_string (char *dst, const char *src, size_t size)
 Copy a string safely ensuring valid UTF-8. More...
 
int ast_utf8_init (void)
 Register UTF-8 tests. More...
 
int ast_utf8_is_valid (const char *src)
 Check if a zero-terminated string is valid UTF-8. More...
 
int ast_utf8_is_validn (const char *src, size_t size)
 Check if the first size bytes of a string are valid UTF-8. More...
 
void ast_utf8_validator_destroy (struct ast_utf8_validator *validator)
 Destroy a UTF-8 validator. More...
 
enum ast_utf8_validation_result ast_utf8_validator_feed (struct ast_utf8_validator *validator, const char *data)
 Feed a zero-terminated string into the UTF-8 validator. More...
 
enum ast_utf8_validation_result ast_utf8_validator_feedn (struct ast_utf8_validator *validator, const char *data, size_t size)
 Feed a string into the UTF-8 validator. More...
 
int ast_utf8_validator_new (struct ast_utf8_validator **validator)
 Create a new UTF-8 validator. More...
 
void ast_utf8_validator_reset (struct ast_utf8_validator *validator)
 Reset the state of a UTF-8 validator. More...
 
enum ast_utf8_validation_result ast_utf8_validator_state (struct ast_utf8_validator *validator)
 Get the current UTF-8 validator state. More...
 
static uint32_t decode (uint32_t *state, uint32_t byte)
 
static int test_copy_and_compare (const char *src, size_t dst_len, const char *cmp)
 
static void test_utf8_shutdown (void)
 

Variables

static const uint8_t utf8d []
 

Detailed Description

UTF-8 information and validation functions.

Definition in file utf8.c.

Macro Definition Documentation

◆ UTF8_ACCEPT

#define UTF8_ACCEPT   0

Definition at line 60 of file utf8.c.

◆ UTF8_REJECT

#define UTF8_REJECT   12

Definition at line 61 of file utf8.c.

Referenced by ast_utf8_copy_string(), and ast_utf8_validator_state().

Function Documentation

◆ AST_TEST_DEFINE() [1/3]

AST_TEST_DEFINE ( test_utf8_is_valid  )

Definition at line 222 of file utf8.c.

References AST_TEST_NOT_RUN, AST_TEST_PASS, ast_utf8_is_valid(), ast_utf8_is_validn(), sip_to_pjsip::info(), TEST_EXECUTE, and TEST_INIT.

223 {
224  switch (cmd) {
225  case TEST_INIT:
226  info->name = "is_valid";
227  info->category = "/main/utf8/";
228  info->summary = "Test ast_utf8_is_valid and ast_utf8_is_validn";
229  info->description =
230  "Tests UTF-8 string validation code.";
231  return AST_TEST_NOT_RUN;
232  case TEST_EXECUTE:
233  break;
234  }
235 
236  /* Valid UTF-8 */
237  ast_test_validate(test, ast_utf8_is_valid("Asterisk"));
238  ast_test_validate(test, ast_utf8_is_valid("\xce\xbb"));
239  ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b"));
240  ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e"));
241 
242  /* Valid with leading */
243  ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk"));
244  ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb"));
245  ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b"));
246  ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e"));
247 
248  /* Valid with trailing */
249  ast_test_validate(test, ast_utf8_is_valid("Asterisk aaa"));
250  ast_test_validate(test, ast_utf8_is_valid("\xce\xbb aaa"));
251  ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b aaa"));
252  ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e aaa"));
253 
254  /* Valid with leading and trailing */
255  ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk aaa"));
256  ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb aaa"));
257  ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b aaa"));
258  ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e aaa"));
259 
260  /* Valid if limited by number of bytes */
261  ast_test_validate(test, ast_utf8_is_validn("Asterisk" "\xff", strlen("Asterisk")));
262  ast_test_validate(test, ast_utf8_is_validn("\xce\xbb" "\xff", strlen("\xce\xbb")));
263  ast_test_validate(test, ast_utf8_is_validn("\xe2\x8a\x9b" "\xff", strlen("\xe2\x8a\x9b")));
264  ast_test_validate(test, ast_utf8_is_validn("\xf0\x9f\x93\x9e" "\xff", strlen("\xf0\x9f\x93\x9e")));
265 
266  /* Invalid */
267  ast_test_validate(test, !ast_utf8_is_valid("\xc0\x8a")); /* Overlong */
268  ast_test_validate(test, !ast_utf8_is_valid("98.6\xa7")); /* 'High ASCII' */
269  ast_test_validate(test, !ast_utf8_is_valid("\xc3\x28"));
270  ast_test_validate(test, !ast_utf8_is_valid("\xa0\xa1"));
271  ast_test_validate(test, !ast_utf8_is_valid("\xe2\x28\xa1"));
272  ast_test_validate(test, !ast_utf8_is_valid("\xe2\x82\x28"));
273  ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\xbc"));
274  ast_test_validate(test, !ast_utf8_is_valid("\xf0\x90\x28\xbc"));
275  ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\x28"));
276 
277  return AST_TEST_PASS;
278 }
int ast_utf8_is_validn(const char *src, size_t size)
Check if the first size bytes of a string are valid UTF-8.
Definition: utf8.c:121
int ast_utf8_is_valid(const char *src)
Check if a zero-terminated string is valid UTF-8.
Definition: utf8.c:110
def info(msg)

◆ AST_TEST_DEFINE() [2/3]

AST_TEST_DEFINE ( test_utf8_copy_string  )

Definition at line 287 of file utf8.c.

References AST_TEST_NOT_RUN, AST_TEST_PASS, sip_to_pjsip::info(), test_copy_and_compare(), TEST_EXECUTE, and TEST_INIT.

288 {
289  switch (cmd) {
290  case TEST_INIT:
291  info->name = "copy_string";
292  info->category = "/main/utf8/";
293  info->summary = "Test ast_utf8_copy_string";
294  info->description =
295  "Tests UTF-8 string copying code.";
296  return AST_TEST_NOT_RUN;
297  case TEST_EXECUTE:
298  break;
299  }
300 
301  ast_test_validate(test, test_copy_and_compare("Asterisk", 6, "Aster"));
302  ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 11, "Asterisk "));
303  ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 12, "Asterisk \xc2\xae"));
304  ast_test_validate(test, test_copy_and_compare("Asterisk \xc0\x8a", 12, "Asterisk "));
305  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 1, ""));
306  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 2, ""));
307  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 3, "\xce\xbb"));
308  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 4, "\xce\xbb "));
309  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 5, "\xce\xbb x"));
310  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 6, "\xce\xbb xy"));
311  ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 7, "\xce\xbb xyz"));
312 
313  return AST_TEST_PASS;
314 }
static int test_copy_and_compare(const char *src, size_t dst_len, const char *cmp)
Definition: utf8.c:280
def info(msg)

◆ AST_TEST_DEFINE() [3/3]

AST_TEST_DEFINE ( test_utf8_validator  )

Definition at line 316 of file utf8.c.

References AST_TEST_FAIL, AST_TEST_NOT_RUN, AST_TEST_PASS, AST_UTF8_INVALID, AST_UTF8_UNKNOWN, AST_UTF8_VALID, ast_utf8_validator_destroy(), ast_utf8_validator_feed(), ast_utf8_validator_new(), sip_to_pjsip::info(), TEST_EXECUTE, and TEST_INIT.

317 {
318  struct ast_utf8_validator *validator;
319 
320  switch (cmd) {
321  case TEST_INIT:
322  info->name = "utf8_validator";
323  info->category = "/main/utf8/";
324  info->summary = "Test ast_utf8_validator";
325  info->description =
326  "Tests UTF-8 progressive validator code.";
327  return AST_TEST_NOT_RUN;
328  case TEST_EXECUTE:
329  break;
330  }
331 
332  if (ast_utf8_validator_new(&validator)) {
333  return AST_TEST_FAIL;
334  }
335 
336  ast_test_validate(test, ast_utf8_validator_feed(validator, "Asterisk") == AST_UTF8_VALID);
337  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc2") == AST_UTF8_UNKNOWN);
338  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xae") == AST_UTF8_VALID);
339  ast_test_validate(test, ast_utf8_validator_feed(validator, "Private") == AST_UTF8_VALID);
340  ast_test_validate(test, ast_utf8_validator_feed(validator, "Branch") == AST_UTF8_VALID);
341  ast_test_validate(test, ast_utf8_validator_feed(validator, "Exchange") == AST_UTF8_VALID);
342  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xe2") == AST_UTF8_UNKNOWN);
343  ast_test_validate(test, ast_utf8_validator_feed(validator, "\x84") == AST_UTF8_UNKNOWN);
344  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xbb") == AST_UTF8_VALID);
345  ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc0\x8a") == AST_UTF8_INVALID);
346  ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
347  ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
348  ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
349 
350  ast_utf8_validator_destroy(validator);
351 
352  return AST_TEST_PASS;
353 }
AST_UTF8_INVALID
The consumed sequence is invalid UTF-8.
Definition: utf8.h:86
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
Destroy a UTF-8 validator.
Definition: utf8.c:215
int ast_utf8_validator_new(struct ast_utf8_validator **validator)
Create a new UTF-8 validator.
Definition: utf8.c:163
enum ast_utf8_validation_result ast_utf8_validator_feed(struct ast_utf8_validator *validator, const char *data)
Feed a zero-terminated string into the UTF-8 validator.
Definition: utf8.c:189
AST_UTF8_VALID
The consumed sequence is valid UTF-8.
Definition: utf8.h:78
def info(msg)
AST_UTF8_UNKNOWN
The validator is in an intermediate state.
Definition: utf8.h:96

◆ ast_utf8_copy_string()

void ast_utf8_copy_string ( char *  dst,
const char *  src,
size_t  size 
)

Copy a string safely ensuring valid UTF-8.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0

This is similar to ast_copy_string, but it will only copy valid UTF-8 sequences from the source string into the destination buffer. If an invalid UTF-8 sequence is encountered, or the available space in the destination buffer is exhausted in the middle of an otherwise valid UTF-8 sequence, the destination buffer will be truncated to ensure that it only contains valid UTF-8.

Parameters
dst: The destination buffer.
src: The source string.
size: The size of the destination buffer.
Returns
Nothing.

Definition at line 133 of file utf8.c.

References ast_assert, decode(), UTF8_ACCEPT, and UTF8_REJECT.

Referenced by test_copy_and_compare().

134 {
135  uint32_t state = UTF8_ACCEPT;
136  char *last_good = dst;
137 
138  ast_assert(size > 0);
139 
140  while (size && *src) {
141  if (decode(&state, (uint8_t) *src) == UTF8_REJECT) {
142  /* We _could_ replace with U+FFFD and try to recover, but for now
143  * we treat this the same as if we had run out of space */
144  break;
145  }
146 
147  *dst++ = *src++;
148  size--;
149 
150  if (size && state == UTF8_ACCEPT) {
151  /* last_good is where we will ultimately write the 0 byte */
152  last_good = dst;
153  }
154  }
155 
156  *last_good = '\0';
157 }
static uint32_t decode(uint32_t *state, uint32_t byte)
Definition: utf8.c:98
#define UTF8_REJECT
Definition: utf8.c:61
#define ast_assert(a)
Definition: utils.h:695
#define UTF8_ACCEPT
Definition: utf8.c:60

◆ ast_utf8_init()

int ast_utf8_init ( void  )

Register UTF-8 tests.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0

Does nothing unless TEST_FRAMEWORK is defined.

Returns
Always returns 0

Definition at line 362 of file utf8.c.

References ast_register_cleanup(), AST_TEST_REGISTER, and test_utf8_shutdown().

Referenced by asterisk_daemon().

363 {
364  AST_TEST_REGISTER(test_utf8_is_valid);
365  AST_TEST_REGISTER(test_utf8_copy_string);
366  AST_TEST_REGISTER(test_utf8_validator);
367 
368  ast_register_cleanup(test_utf8_shutdown);
369 
370  return 0;
371 }
#define AST_TEST_REGISTER(cb)
Definition: test.h:127
static void test_utf8_shutdown(void)
Definition: utf8.c:355
int ast_register_cleanup(void(*func)(void))
Register a function to be executed before Asterisk gracefully exits.
Definition: clicompat.c:19

◆ ast_utf8_is_valid()

int ast_utf8_is_valid ( const char *  str)

Check if a zero-terminated string is valid UTF-8.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0
Parameters
str: The zero-terminated string to check.
Return values
0: if the string is not valid UTF-8
Non-zero: if the string is valid UTF-8

Definition at line 110 of file utf8.c.

References decode(), and UTF8_ACCEPT.

Referenced by AST_TEST_DEFINE().

111 {
112  uint32_t state = UTF8_ACCEPT;
113 
114  while (*src) {
115  decode(&state, (uint8_t) *src++);
116  }
117 
118  return state == UTF8_ACCEPT;
119 }
static uint32_t decode(uint32_t *state, uint32_t byte)
Definition: utf8.c:98
#define UTF8_ACCEPT
Definition: utf8.c:60

◆ ast_utf8_is_validn()

int ast_utf8_is_validn ( const char *  str,
size_t  size 
)

Check if the first size bytes of a string are valid UTF-8.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0

Similar to ast_utf8_is_valid() but checks the first size bytes or until a zero byte is reached, whichever comes first.

Parameters
str: The string to check.
size: The number of bytes to evaluate.
Return values
0: if the string is not valid UTF-8
Non-zero: if the string is valid UTF-8

Definition at line 121 of file utf8.c.

References decode(), and UTF8_ACCEPT.

Referenced by AST_TEST_DEFINE().

122 {
123  uint32_t state = UTF8_ACCEPT;
124 
125  while (size && *src) {
126  decode(&state, (uint8_t) *src++);
127  size--;
128  }
129 
130  return state == UTF8_ACCEPT;
131 }
static uint32_t decode(uint32_t *state, uint32_t byte)
Definition: utf8.c:98
#define UTF8_ACCEPT
Definition: utf8.c:60

◆ ast_utf8_validator_destroy()

void ast_utf8_validator_destroy ( struct ast_utf8_validator *  validator)

Destroy a UTF-8 validator.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0
Parameters
validator: The validator instance to destroy.

Definition at line 215 of file utf8.c.

References ast_free.

Referenced by AST_TEST_DEFINE().

216 {
217  ast_free(validator);
218 }
#define ast_free(a)
Definition: astmm.h:182

◆ ast_utf8_validator_feed()

enum ast_utf8_validation_result ast_utf8_validator_feed ( struct ast_utf8_validator *  validator,
const char *  data 
)

Feed a zero-terminated string into the UTF-8 validator.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0
Parameters
validator: The validator instance.
data: The zero-terminated string to feed into the validator.
Returns
The ast_utf8_validation_result indicating the current state of the validator.

Definition at line 189 of file utf8.c.

References ast_utf8_validator_state(), decode(), and ast_utf8_validator::state.

Referenced by AST_TEST_DEFINE().

191 {
192  while (*data) {
193  decode(&validator->state, (uint8_t) *data++);
194  }
195 
196  return ast_utf8_validator_state(validator);
197 }
static uint32_t decode(uint32_t *state, uint32_t byte)
Definition: utf8.c:98
uint32_t state
Definition: utf8.c:160
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
Definition: utf8.c:176

◆ ast_utf8_validator_feedn()

enum ast_utf8_validation_result ast_utf8_validator_feedn ( struct ast_utf8_validator *  validator,
const char *  data,
size_t  size 
)

Feed a string into the UTF-8 validator.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0

Similar to ast_utf8_validator_feed but will stop feeding in data if a zero byte is encountered or size bytes have been read.

Parameters
validator: The validator instance.
data: The string to feed into the validator.
size: The number of bytes to feed into the validator.
Returns
The ast_utf8_validation_result indicating the current state of the validator.

Definition at line 199 of file utf8.c.

References ast_utf8_validator_state(), decode(), and ast_utf8_validator::state.

201 {
202  while (size && *data) {
203  decode(&validator->state, (uint8_t) *data++);
204  size--;
205  }
206 
207  return ast_utf8_validator_state(validator);
208 }
static uint32_t decode(uint32_t *state, uint32_t byte)
Definition: utf8.c:98
uint32_t state
Definition: utf8.c:160
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
Definition: utf8.c:176

◆ ast_utf8_validator_new()

int ast_utf8_validator_new ( struct ast_utf8_validator **  validator)

Create a new UTF-8 validator.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0
Parameters
[out] validator: The validator instance.
Return values
0: on success
Non-zero: on failure (the implementation shown returns 1 when allocation fails, not -1)

Definition at line 163 of file utf8.c.

References ast_malloc, ast_utf8_validator::state, tmp(), and UTF8_ACCEPT.

Referenced by AST_TEST_DEFINE().

164 {
165  struct ast_utf8_validator *tmp = ast_malloc(sizeof(*tmp));
166 
167  if (!tmp) {
168  return 1;
169  }
170 
171  tmp->state = UTF8_ACCEPT;
172  *validator = tmp;
173  return 0;
174 }
uint32_t state
Definition: utf8.c:160
static int tmp()
Definition: bt_open.c:389
#define ast_malloc(len)
A wrapper for malloc()
Definition: astmm.h:193
#define UTF8_ACCEPT
Definition: utf8.c:60

◆ ast_utf8_validator_reset()

void ast_utf8_validator_reset ( struct ast_utf8_validator *  validator)

Reset the state of a UTF-8 validator.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0

Resets the provided UTF-8 validator to its initial state so that it can be reused.

Parameters
validator: The validator instance to reset.

Definition at line 210 of file utf8.c.

References ast_utf8_validator::state, and UTF8_ACCEPT.

211 {
212  validator->state = UTF8_ACCEPT;
213 }
uint32_t state
Definition: utf8.c:160
#define UTF8_ACCEPT
Definition: utf8.c:60

◆ ast_utf8_validator_state()

enum ast_utf8_validation_result ast_utf8_validator_state ( struct ast_utf8_validator *  validator)

Get the current UTF-8 validator state.

Since
13.36.0, 16.13.0, 17.7.0, 18.0.0
Parameters
validator: The validator instance.
Returns
The ast_utf8_validation_result indicating the current state of the validator.

Definition at line 176 of file utf8.c.

References AST_UTF8_INVALID, AST_UTF8_UNKNOWN, AST_UTF8_VALID, ast_utf8_validator::state, UTF8_ACCEPT, and UTF8_REJECT.

Referenced by ast_utf8_validator_feed(), and ast_utf8_validator_feedn().

178 {
179  switch (validator->state) {
180  case UTF8_ACCEPT:
181  return AST_UTF8_VALID;
182  case UTF8_REJECT:
183  return AST_UTF8_INVALID;
184  default:
185  return AST_UTF8_UNKNOWN;
186  }
187 }
AST_UTF8_INVALID
The consumed sequence is invalid UTF-8.
Definition: utf8.h:86
uint32_t state
Definition: utf8.c:160
#define UTF8_REJECT
Definition: utf8.c:61
AST_UTF8_VALID
The consumed sequence is valid UTF-8.
Definition: utf8.h:78
#define UTF8_ACCEPT
Definition: utf8.c:60
AST_UTF8_UNKNOWN
The validator is in an intermediate state.
Definition: utf8.h:96

◆ decode()

static uint32_t decode ( uint32_t *  state,
uint32_t  byte 
)
inline static

Definition at line 98 of file utf8.c.

References state, type, and utf8d.

Referenced by ast_utf8_copy_string(), ast_utf8_is_valid(), ast_utf8_is_validn(), ast_utf8_validator_feed(), and ast_utf8_validator_feedn().

98  {
99  uint32_t type = utf8d[byte];
100  *state = utf8d[256 + *state + type];
101  return *state;
102 }
static const char type[]
Definition: chan_ooh323.c:109
enum sip_cc_notify_state state
Definition: chan_sip.c:959
static const uint8_t utf8d[]
Definition: utf8.c:63

◆ test_copy_and_compare()

static int test_copy_and_compare ( const char *  src,
size_t  dst_len,
const char *  cmp 
)
static

Definition at line 280 of file utf8.c.

References ast_utf8_copy_string().

Referenced by AST_TEST_DEFINE().

281 {
282  char dst[dst_len];
283  ast_utf8_copy_string(dst, src, dst_len);
284  return strcmp(dst, cmp) == 0;
285 }
void ast_utf8_copy_string(char *dst, const char *src, size_t size)
Copy a string safely ensuring valid UTF-8.
Definition: utf8.c:133

◆ test_utf8_shutdown()

static void test_utf8_shutdown ( void  )
static

Definition at line 355 of file utf8.c.

References AST_TEST_UNREGISTER.

Referenced by ast_utf8_init().

356 {
357  AST_TEST_UNREGISTER(test_utf8_is_valid);
358  AST_TEST_UNREGISTER(test_utf8_copy_string);
359  AST_TEST_UNREGISTER(test_utf8_validator);
360 }
#define AST_TEST_UNREGISTER(cb)
Definition: test.h:128

Variable Documentation

◆ utf8d

const uint8_t utf8d[]
static

Definition at line 63 of file utf8.c.

Referenced by decode().