df/d51/utf8_8c_source.html

 /*
  * Asterisk -- An open source telephony toolkit.
  *
  * Copyright (C) 2020, Sean Bright
  *
  * Sean Bright <sean.bright@gmail.com>
  *
  * See http://www.asterisk.org for more information about
  * the Asterisk project. Please do not directly contact
  * any of the maintainers of this project for assistance;
  * the project provides a web site, mailing lists and IRC
  * channels for your use.
  *
  * This program is free software, distributed under the terms of
  * the GNU General Public License Version 2. See the LICENSE file
  * at the top of the source tree.
  */

 /*! \file
  *
  * \brief UTF-8 information and validation functions
  */

 /*** MODULEINFO
     <support_level>core</support_level>
 ***/

 #include "asterisk.h"

 #include "asterisk/utils.h"
 #include "asterisk/utf8.h"
 #include "asterisk/test.h"

 /*
  * BEGIN THIRD PARTY CODE
  *
  * Copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
  * in the Software without restriction, including without limitation the rights
  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  * copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
  * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
  */

 /* Automaton state meaning "everything consumed so far is complete, valid
  * UTF-8". It is also the state every validation starts from. */
 #define UTF8_ACCEPT 0
 /* Automaton state meaning "an invalid byte was consumed". Every entry in
  * this state's transition row is 12 again, so once entered it is never left. */
 #define UTF8_REJECT 12

 /*
  * Combined lookup table driving the UTF-8 validating automaton.
  *
  * Entries [0, 256) are indexed by byte value and yield a character
  * class in the range 0..11. Entries from 256 onward form the transition
  * table, laid out as rows of 12 entries (one per character class). State
  * values are stored pre-multiplied as row offsets (0, 12, 24, ...), which
  * is why decode() can index it as utf8d[256 + state + class] with no
  * multiplication.
  */
 static const uint8_t utf8d[] = {
    /* The first part of the table maps each byte value to a character
     * class, which reduces the size of the transition table and lets the
     * class double as a shift count for building bitmasks. */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    /* The second part is a transition table that maps a combination
     * of a state of the automaton and a character class to a state. */
    0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12,
 };

 #if 0
 /*
  * Disabled variant of decode() that also accumulates the decoded code
  * point into *codep while advancing the automaton. It is compiled out
  * because the code in this file only needs validation. We can bring this
  * back if we need the codepoint, but note that it shares its name with the
  * active decode() below, so the two cannot simply be enabled side by side.
  */
 static uint32_t inline decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
    uint32_t type = utf8d[byte];

    /* Mid-sequence: shift in the low 6 bits of the byte. At the start of a
     * sequence: mask the byte using the character class as a shift count. */
    *codep = (*state != UTF8_ACCEPT) ?
       (byte & 0x3fu) | (*codep << 6) :
       (0xff >> type) & (byte);

    *state = utf8d[256 + *state + type];
    return *state;
 }
 #endif

 /*
  * Advance the UTF-8 automaton by a single input byte.
  *
  * state - in/out: current automaton state, overwritten with the new state.
  * byte  - the next input byte; used directly as an index into the first
  *         part of utf8d, so callers pass it as a value in 0..255.
  *
  * Returns the new state (the same value that was stored in *state).
  */
 static inline uint32_t decode(uint32_t *state, uint32_t byte) {
    const uint32_t char_class = utf8d[byte];
    const uint32_t next_state = utf8d[256 + *state + char_class];

    *state = next_state;
    return next_state;
 }

 /*
  * END THIRD PARTY CODE
  *
  * See copyright notice above.
  */

 /*
  * Check whether a zero-terminated string is valid UTF-8.
  *
  * Every byte up to (not including) the terminator is run through the
  * automaton. Returns non-zero only if the automaton ends in UTF8_ACCEPT,
  * so a string that ends in the middle of a multi-byte sequence is
  * reported as not valid.
  */
 int ast_utf8_is_valid(const char *src)
 {
    uint32_t dfa_state = UTF8_ACCEPT;
    const char *cursor;

    for (cursor = src; *cursor; cursor++) {
       decode(&dfa_state, (uint8_t) *cursor);
    }

    return dfa_state == UTF8_ACCEPT;
 }

 /*
  * Check whether the leading bytes of a string are valid UTF-8.
  *
  * Examines at most size bytes of src, stopping early at a zero byte.
  * Returns non-zero only if the automaton is in UTF8_ACCEPT after the last
  * examined byte. With size == 0 nothing is read and the result is non-zero.
  */
 int ast_utf8_is_validn(const char *src, size_t size)
 {
    uint32_t dfa_state = UTF8_ACCEPT;
    size_t idx;

    for (idx = 0; idx < size && src[idx]; idx++) {
       decode(&dfa_state, (uint8_t) src[idx]);
    }

    return dfa_state == UTF8_ACCEPT;
 }

 /*
  * Copy a string into a fixed-size buffer, keeping the result valid UTF-8.
  *
  * dst  - destination buffer.
  * src  - zero-terminated source string.
  * size - total size of dst in bytes, including room for the terminator.
  *
  * Bytes are copied until the source ends, an invalid sequence is met, or
  * the buffer is full. The terminator is then written just after the last
  * complete, valid sequence that still leaves room for it, so a multi-byte
  * sequence is never cut in half and invalid input truncates the result
  * rather than being copied.
  *
  * A size of 0 is a caller error. It trips the assertion, and as a safety
  * net the function then returns without touching dst.
  */
 void ast_utf8_copy_string(char *dst, const char *src, size_t size)
 {
    uint32_t state = UTF8_ACCEPT;
    char *last_good = dst;

    ast_assert(size > 0);
    if (!size) {
       /* If ast_assert() does not halt execution (for instance when it is
        * compiled out), falling through would store the terminator into a
        * zero-length buffer. Refuse to write anything instead. */
       return;
    }

    while (size && *src) {
       if (decode(&state, (uint8_t) *src) == UTF8_REJECT) {
          /* We _could_ replace with U+FFFD and try to recover, but for now
           * we treat this the same as if we had run out of space */
          break;
       }

       *dst++ = *src++;
       size--;

       /* Only advance the terminator position when a full sequence has
        * been copied AND at least one byte remains to hold the terminator. */
       if (size && state == UTF8_ACCEPT) {
          /* last_good is where we will ultimately write the 0 byte */
          last_good = dst;
       }
    }

    /* Anything copied past last_good (a partial sequence, or the byte that
     * used up the terminator's slot) is cut off here. */
    *last_good = '\0';
 }

 /*
  * Progressive UTF-8 validator: holds the automaton state between calls so
  * input can be fed in pieces.
  */
 struct ast_utf8_validator {
    /* Current automaton state; UTF8_ACCEPT, UTF8_REJECT, or an
     * intermediate (mid-sequence) state. */
    uint32_t state;
 };

 /*
  * Allocate a new validator, initialised to the UTF8_ACCEPT state.
  *
  * validator - out: receives the new validator on success; left untouched
  *             on failure.
  *
  * Returns 0 on success, 1 if the allocation failed.
  */
 int ast_utf8_validator_new(struct ast_utf8_validator **validator)
 {
    struct ast_utf8_validator *created;

    created = ast_malloc(sizeof(*created));
    if (created) {
       created->state = UTF8_ACCEPT;
       *validator = created;
       return 0;
    }

    return 1;
 }

 /*
  * Report the validator's current verdict without consuming any input.
  *
  * Returns AST_UTF8_VALID when the automaton is in UTF8_ACCEPT,
  * AST_UTF8_INVALID when it is in UTF8_REJECT, and AST_UTF8_UNKNOWN for any
  * other (intermediate) state.
  */
 enum ast_utf8_validation_result ast_utf8_validator_state(
    struct ast_utf8_validator *validator)
 {
    const uint32_t current = validator->state;

    if (current == UTF8_ACCEPT) {
       return AST_UTF8_VALID;
    }

    if (current == UTF8_REJECT) {
       return AST_UTF8_INVALID;
    }

    return AST_UTF8_UNKNOWN;
 }

 /*
  * Feed a zero-terminated string into the validator.
  *
  * Every byte up to the terminator advances the validator's stored state,
  * which carries over from earlier feed calls. Returns the verdict after
  * the last byte, as reported by ast_utf8_validator_state().
  */
 enum ast_utf8_validation_result ast_utf8_validator_feed(
    struct ast_utf8_validator *validator, const char *data)
 {
    const char *cursor;

    for (cursor = data; *cursor; cursor++) {
       decode(&validator->state, (uint8_t) *cursor);
    }

    return ast_utf8_validator_state(validator);
 }

 /*
  * Feed at most size bytes of a string into the validator.
  *
  * Consumption stops after size bytes or at the first zero byte, whichever
  * comes first. The validator's stored state carries over from earlier
  * feed calls. Returns the verdict after the last consumed byte, as
  * reported by ast_utf8_validator_state().
  */
 enum ast_utf8_validation_result ast_utf8_validator_feedn(
    struct ast_utf8_validator *validator, const char *data, size_t size)
 {
    size_t idx;

    for (idx = 0; idx < size && data[idx]; idx++) {
       decode(&validator->state, (uint8_t) data[idx]);
    }

    return ast_utf8_validator_state(validator);
 }

 /*
  * Return a validator to its initial state (UTF8_ACCEPT), discarding any
  * intermediate or rejected state left by earlier input.
  */
 void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
 {
    validator->state = UTF8_ACCEPT;
 }

 /*
  * Release a validator obtained from ast_utf8_validator_new(). The pointer
  * must not be used afterwards.
  */
 void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
 {
    ast_free(validator);
 }

 #ifdef TEST_FRAMEWORK

 /*
  * Unit test for ast_utf8_is_valid() and ast_utf8_is_validn().
  *
  * Covers ASCII and 2-, 3- and 4-byte sequences on their own and with ASCII
  * text before and/or after them, length-limited validation, and a set of
  * malformed inputs that must be rejected.
  */
 AST_TEST_DEFINE(test_utf8_is_valid)
 {
    switch (cmd) {
    case TEST_INIT:
       info->name = "is_valid";
       info->category = "/main/utf8/";
       info->summary = "Test ast_utf8_is_valid and ast_utf8_is_validn";
       info->description =
          "Tests UTF-8 string validation code.";
       return AST_TEST_NOT_RUN;
    case TEST_EXECUTE:
       break;
    }

    /* Valid UTF-8 */
    ast_test_validate(test, ast_utf8_is_valid("Asterisk"));
    ast_test_validate(test, ast_utf8_is_valid("\xce\xbb"));
    ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b"));
    ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e"));

    /* Valid with leading */
    ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk"));
    ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb"));
    ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b"));
    ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e"));

    /* Valid with trailing */
    ast_test_validate(test, ast_utf8_is_valid("Asterisk aaa"));
    ast_test_validate(test, ast_utf8_is_valid("\xce\xbb aaa"));
    ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b aaa"));
    ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e aaa"));

    /* Valid with leading and trailing */
    ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk aaa"));
    ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb aaa"));
    ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b aaa"));
    ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e aaa"));

    /* Valid if limited by number of bytes: the trailing \xff would be
     * rejected, but the size limit stops validation just before it */
    ast_test_validate(test, ast_utf8_is_validn("Asterisk" "\xff", strlen("Asterisk")));
    ast_test_validate(test, ast_utf8_is_validn("\xce\xbb" "\xff", strlen("\xce\xbb")));
    ast_test_validate(test, ast_utf8_is_validn("\xe2\x8a\x9b" "\xff", strlen("\xe2\x8a\x9b")));
    ast_test_validate(test, ast_utf8_is_validn("\xf0\x9f\x93\x9e" "\xff", strlen("\xf0\x9f\x93\x9e")));

    /* Invalid */
    ast_test_validate(test, !ast_utf8_is_valid("\xc0\x8a")); /* Overlong */
    ast_test_validate(test, !ast_utf8_is_valid("98.6\xa7")); /* 'High ASCII' */
    ast_test_validate(test, !ast_utf8_is_valid("\xc3\x28"));
    ast_test_validate(test, !ast_utf8_is_valid("\xa0\xa1"));
    ast_test_validate(test, !ast_utf8_is_valid("\xe2\x28\xa1"));
    ast_test_validate(test, !ast_utf8_is_valid("\xe2\x82\x28"));
    ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\xbc"));
    ast_test_validate(test, !ast_utf8_is_valid("\xf0\x90\x28\xbc"));
    ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\x28"));

    return AST_TEST_PASS;
 }

 /*
  * Test helper: copy src into a scratch buffer of dst_len bytes with
  * ast_utf8_copy_string() and report whether the result equals cmp.
  *
  * Returns 1 when the copied string matches cmp exactly, 0 otherwise.
  */
 static int test_copy_and_compare(const char *src, size_t dst_len, const char *cmp)
 {
    char scratch[dst_len];
    int difference;

    ast_utf8_copy_string(scratch, src, dst_len);
    difference = strcmp(scratch, cmp);

    return difference == 0;
 }

 /*
  * Unit test for ast_utf8_copy_string().
  *
  * Each case copies a source string into a buffer of the given size and
  * compares the result with the expected, possibly truncated, string.
  */
 AST_TEST_DEFINE(test_utf8_copy_string)
 {
    switch (cmd) {
    case TEST_INIT:
       info->name = "copy_string";
       info->category = "/main/utf8/";
       info->summary = "Test ast_utf8_copy_string";
       info->description =
          "Tests UTF-8 string copying code.";
       return AST_TEST_NOT_RUN;
    case TEST_EXECUTE:
       break;
    }

    /* Plain ASCII truncation: 5 characters plus the terminator */
    ast_test_validate(test, test_copy_and_compare("Asterisk",           6, "Aster"));
    /* One byte short for the 2-byte sequence plus terminator: the whole
     * sequence is dropped rather than split */
    ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 11, "Asterisk "));
    /* Exactly enough room: copied in full */
    ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 12, "Asterisk \xc2\xae"));
    /* Invalid (overlong) sequence: the copy stops before it */
    ast_test_validate(test, test_copy_and_compare("Asterisk \xc0\x8a", 12, "Asterisk "));
    /* Growing buffer sizes over a string that starts with a 2-byte sequence */
    ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 1, ""));
    ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 2, ""));
    ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 3, "\xce\xbb"));
    ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 4, "\xce\xbb "));
    ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 5, "\xce\xbb x"));
    ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 6, "\xce\xbb xy"));
    ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 7, "\xce\xbb xyz"));

    return AST_TEST_PASS;
 }

 /*
  * Unit test for the progressive validator API.
  *
  * Feeds one validator a series of fragments and checks the verdict after
  * each: complete sequences report VALID, fragments ending mid-sequence
  * report UNKNOWN, and once an invalid sequence has been fed the validator
  * keeps reporting INVALID even for well-formed input.
  */
 AST_TEST_DEFINE(test_utf8_validator)
 {
    struct ast_utf8_validator *validator;

    switch (cmd) {
    case TEST_INIT:
       info->name = "utf8_validator";
       info->category = "/main/utf8/";
       info->summary = "Test ast_utf8_validator";
       info->description =
          "Tests UTF-8 progressive validator code.";
       return AST_TEST_NOT_RUN;
    case TEST_EXECUTE:
       break;
    }

    if (ast_utf8_validator_new(&validator)) {
       return AST_TEST_FAIL;
    }

    /* NOTE(review): if ast_test_validate() returns from this function when
     * its condition fails, the validator allocated above is never destroyed
     * on that path - confirm against the macro's definition in test.h. */
    ast_test_validate(test, ast_utf8_validator_feed(validator, "Asterisk") == AST_UTF8_VALID);
    /* A 2-byte sequence split across two feeds */
    ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc2")     == AST_UTF8_UNKNOWN);
    ast_test_validate(test, ast_utf8_validator_feed(validator, "\xae")     == AST_UTF8_VALID);
    ast_test_validate(test, ast_utf8_validator_feed(validator, "Private")  == AST_UTF8_VALID);
    ast_test_validate(test, ast_utf8_validator_feed(validator, "Branch")   == AST_UTF8_VALID);
    ast_test_validate(test, ast_utf8_validator_feed(validator, "Exchange") == AST_UTF8_VALID);
    /* A 3-byte sequence split across three feeds */
    ast_test_validate(test, ast_utf8_validator_feed(validator, "\xe2")     == AST_UTF8_UNKNOWN);
    ast_test_validate(test, ast_utf8_validator_feed(validator, "\x84")     == AST_UTF8_UNKNOWN);
    ast_test_validate(test, ast_utf8_validator_feed(validator, "\xbb")     == AST_UTF8_VALID);
    /* Overlong sequence: from here on the validator stays INVALID */
    ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc0\x8a") == AST_UTF8_INVALID);
    ast_test_validate(test, ast_utf8_validator_feed(validator, "valid")    == AST_UTF8_INVALID);
    ast_test_validate(test, ast_utf8_validator_feed(validator, "valid")    == AST_UTF8_INVALID);
    ast_test_validate(test, ast_utf8_validator_feed(validator, "valid")    == AST_UTF8_INVALID);

    ast_utf8_validator_destroy(validator);

    return AST_TEST_PASS;
 }

 /*
  * Cleanup callback: unregisters the three UTF-8 unit tests that
  * ast_utf8_init() registered.
  */
 static void test_utf8_shutdown(void)
 {
    AST_TEST_UNREGISTER(test_utf8_is_valid);
    AST_TEST_UNREGISTER(test_utf8_copy_string);
    AST_TEST_UNREGISTER(test_utf8_validator);
 }

 /*
  * Register the UTF-8 unit tests (TEST_FRAMEWORK builds only) and arrange
  * for them to be unregistered through test_utf8_shutdown().
  *
  * Always returns 0; the result of ast_register_cleanup() is not checked.
  */
 int ast_utf8_init(void)
 {
    AST_TEST_REGISTER(test_utf8_is_valid);
    AST_TEST_REGISTER(test_utf8_copy_string);
    AST_TEST_REGISTER(test_utf8_validator);

    ast_register_cleanup(test_utf8_shutdown);

    return 0;
 }

 #else /* !TEST_FRAMEWORK */

 /*
  * Stub used when TEST_FRAMEWORK is not defined: there are no tests to
  * register, so this does nothing and always returns 0.
  */
 int ast_utf8_init(void)
 {
    return 0;
 }

 #endif
type
static const char type[]
Definition: chan_ooh323.c:109

AST_UTF8_INVALID
The consumed sequence is invalid UTF-8.
Definition: utf8.h:86

state
enum sip_cc_notify_state state
Definition: chan_sip.c:959

ast_utf8_validator_destroy
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
Destroy a UTF-8 validator.
Definition: utf8.c:215

ast_utf8_validator_new
int ast_utf8_validator_new(struct ast_utf8_validator **validator)
Create a new UTF-8 validator.
Definition: utf8.c:163

asterisk.h
Asterisk main include file. File version handling, generic pbx functions.

ast_utf8_is_validn
int ast_utf8_is_validn(const char *src, size_t size)
Check if the first size bytes of a string are valid UTF-8.
Definition: utf8.c:121

ast_utf8_validator_reset
void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
Reset the state of a UTF-8 validator.
Definition: utf8.c:210

test_copy_and_compare
static int test_copy_and_compare(const char *src, size_t dst_len, const char *cmp)
Definition: utf8.c:280

decode
static uint32_t decode(uint32_t *state, uint32_t byte)
Definition: utf8.c:98

ast_utf8_validator::state
uint32_t state
Definition: utf8.c:160

TEST_EXECUTE
Definition: test.h:208

ast_utf8_is_valid
int ast_utf8_is_valid(const char *src)
Check if a zero-terminated string is valid UTF-8.
Definition: utf8.c:110

tmp
static int tmp()
Definition: bt_open.c:389

UTF8_REJECT
#define UTF8_REJECT
Definition: utf8.c:61

ast_utf8_validator
Definition: utf8.c:159

test.h
Test Framework API.

AST_TEST_REGISTER
#define AST_TEST_REGISTER(cb)
Definition: test.h:127

ast_assert
#define ast_assert(a)
Definition: utils.h:695

TEST_INIT
Definition: test.h:207

test_utf8_shutdown
static void test_utf8_shutdown(void)
Definition: utf8.c:355

utf8.h
UTF-8 information and validation functions.

ast_utf8_init
int ast_utf8_init(void)
Register UTF-8 tests.
Definition: utf8.c:362

AST_TEST_NOT_RUN
Definition: test.h:201

utils.h
Utility functions.

ast_utf8_validator_state
enum ast_utf8_validation_result ast_utf8_validator_state(struct ast_utf8_validator *validator)
Get the current UTF-8 validator state.
Definition: utf8.c:176

ast_utf8_validator_feed
enum ast_utf8_validation_result ast_utf8_validator_feed(struct ast_utf8_validator *validator, const char *data)
Feed a zero-terminated string into the UTF-8 validator.
Definition: utf8.c:189

ast_register_cleanup
int ast_register_cleanup(void(*func)(void))
Register a function to be executed before Asterisk gracefully exits.
Definition: clicompat.c:19

ast_utf8_copy_string
void ast_utf8_copy_string(char *dst, const char *src, size_t size)
Copy a string safely ensuring valid UTF-8.
Definition: utf8.c:133

AST_UTF8_VALID
The consumed sequence is valid UTF-8.
Definition: utf8.h:78

ast_malloc
#define ast_malloc(len)
A wrapper for malloc()
Definition: astmm.h:193

UTF8_ACCEPT
#define UTF8_ACCEPT
Definition: utf8.c:60

ast_utf8_validation_result
ast_utf8_validation_result
Definition: utf8.h:71

AST_TEST_UNREGISTER
#define AST_TEST_UNREGISTER(cb)
Definition: test.h:128

sip_to_pjsip.info
def info(msg)
Definition: sip_to_pjsip.py:1313

ast_utf8_validator_feedn
enum ast_utf8_validation_result ast_utf8_validator_feedn(struct ast_utf8_validator *validator, const char *data, size_t size)
Feed a string into the UTF-8 validator.
Definition: utf8.c:199

AST_TEST_PASS
Definition: test.h:202

ast_free
#define ast_free(a)
Definition: astmm.h:182

AST_TEST_DEFINE
AST_TEST_DEFINE(test_utf8_is_valid)
Definition: utf8.c:222

state
Definition: localtime.c:158

test
Definition: test_logger.c:43

AST_UTF8_UNKNOWN
The validator is in an intermediate state.
Definition: utf8.h:96

utf8d
static const uint8_t utf8d[]
Definition: utf8.c:63

AST_TEST_FAIL
Definition: test.h:203