details: https://hg.nginx.org/njs/rev/63106bd2e9bf branches: changeset: 1471:63106bd2e9bf user: Alexander Borisov <alexander.bori...@nginx.com> date: Wed Jul 15 19:19:18 2020 +0300 description: Introduced UTF-16 according to WHATWG encoding spec.
diffstat: auto/make | 4 +- auto/sources | 3 +- src/njs_main.h | 2 + src/njs_unicode.h | 23 +++ src/njs_utf16.c | 116 +++++++++++++++ src/njs_utf16.h | 25 +++ src/test/unicode_unit_test.c | 312 +++++++++++++++++++++++++++++++++++++++++++ src/test/utf8_unit_test.c | 202 --------------------------- 8 files changed, 482 insertions(+), 205 deletions(-) diffs (749 lines): diff -r c39329b57a06 -r 63106bd2e9bf auto/make --- a/auto/make Wed Jul 15 15:34:16 2020 +0000 +++ b/auto/make Wed Jul 15 19:19:18 2020 +0300 @@ -241,12 +241,12 @@ lib_test: $NJS_BUILD_DIR/njs_auto_config $NJS_BUILD_DIR/random_unit_test \\ $NJS_BUILD_DIR/rbtree_unit_test \\ $NJS_BUILD_DIR/lvlhsh_unit_test \\ - $NJS_BUILD_DIR/utf8_unit_test + $NJS_BUILD_DIR/unicode_unit_test $NJS_BUILD_DIR/random_unit_test $NJS_BUILD_DIR/rbtree_unit_test $NJS_BUILD_DIR/lvlhsh_unit_test - $NJS_BUILD_DIR/utf8_unit_test + $NJS_BUILD_DIR/unicode_unit_test unit_test: $NJS_BUILD_DIR/njs_auto_config.h \\ $NJS_BUILD_DIR/njs_unit_test diff -r c39329b57a06 -r 63106bd2e9bf auto/sources --- a/auto/sources Wed Jul 15 15:34:16 2020 +0000 +++ b/auto/sources Wed Jul 15 19:19:18 2020 +0300 @@ -6,6 +6,7 @@ NJS_LIB_SRCS=" \ src/njs_murmur_hash.c \ src/njs_djb_hash.c \ src/njs_utf8.c \ + src/njs_utf16.c \ src/njs_arr.c \ src/njs_rbtree.c \ src/njs_lvlhsh.c \ @@ -60,7 +61,7 @@ NJS_LIB_TEST_SRCS=" \ src/test/lvlhsh_unit_test.c \ src/test/random_unit_test.c \ src/test/rbtree_unit_test.c \ - src/test/utf8_unit_test.c \ + src/test/unicode_unit_test.c \ " NJS_TEST_SRCS=" \ diff -r c39329b57a06 -r 63106bd2e9bf src/njs_main.h --- a/src/njs_main.h Wed Jul 15 15:34:16 2020 +0000 +++ b/src/njs_main.h Wed Jul 15 19:19:18 2020 +0300 @@ -14,7 +14,9 @@ #include <njs_types.h> #include <njs_clang.h> #include <njs_str.h> +#include <njs_unicode.h> #include <njs_utf8.h> +#include <njs_utf16.h> #include <njs_diyfp.h> #include <njs_dtoa.h> #include <njs_dtoa_fixed.h> diff -r c39329b57a06 -r 63106bd2e9bf src/njs_unicode.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/njs_unicode.h Wed Jul 15 19:19:18 2020 +0300 @@ -0,0 +1,23 @@ + +/* + * Copyright (C) Alexander Borisov + * Copyright (C) NGINX, Inc. + */ + +#ifndef _NJS_UNICODE_H_INCLUDED_ +#define _NJS_UNICODE_H_INCLUDED_ + + +enum { + NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF, + NJS_UNICODE_ERROR = 0x1FFFFF, + NJS_UNICODE_CONTINUE = 0x2FFFFF +}; + +typedef struct { + uint32_t codepoint; + u_char upper; +} njs_unicode_decode_t; + + +#endif /* _NJS_UNICODE_H_INCLUDED_ */ diff -r c39329b57a06 -r 63106bd2e9bf src/njs_utf16.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/njs_utf16.c Wed Jul 15 19:19:18 2020 +0300 @@ -0,0 +1,116 @@ + +/* + * Copyright (C) Alexander Borisov + * Copyright (C) NGINX, Inc. + */ + + +#include <njs_main.h> + + +njs_inline void +njs_utf16_encode_write(uint32_t cp, u_char **start) +{ +#ifdef NJS_HAVE_BIG_ENDIAN + *(*start)++ = cp >> 8; + *(*start)++ = cp & 0x00FF; +#else + *(*start)++ = cp & 0x00FF; + *(*start)++ = cp >> 8; +#endif +} + + +ssize_t +njs_utf16_encode(uint32_t cp, u_char **start, const u_char *end) +{ + if ((*start + 2) > end) { + return NJS_ERROR; + } + + if (cp < 0x10000) { + njs_utf16_encode_write(cp, start); + + return 2; + } + + if ((*start + 4) > end) { + return NJS_ERROR; + } + + cp -= 0x10000; + + njs_utf16_encode_write((0xD800 | (cp >> 0x0A)), start); + njs_utf16_encode_write((0xDC00 | (cp & 0x03FF)), start); + + return 4; +} + + +uint32_t +njs_utf16_decode(njs_unicode_decode_t *ctx, const u_char **start, + const u_char *end) +{ + uint32_t unit; + unsigned lead; + + if (ctx->upper != 0x00) { + lead = ctx->upper - 0x01; + ctx->upper = 0x00; + + goto lead_state; + } + +pair_state: + + lead = *(*start)++; + + if (*start >= end) { + ctx->upper = lead + 0x01; + return NJS_UNICODE_CONTINUE; + } + +lead_state: + +#ifdef NJS_HAVE_BIG_ENDIAN + unit = (lead << 8) + *(*start)++; +#else + unit = (*(*start)++ << 8) + lead; +#endif + + if (ctx->codepoint != 0x00) { + if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) { + unit = 0x10000 + ((ctx->codepoint - 0xD800) << 10) + + (unit - 0xDC00); + + ctx->codepoint = 0x00; + + return unit; + } + + (*start)--; + + ctx->upper = lead + 0x01; + ctx->codepoint = 0x00; + + return NJS_UNICODE_ERROR; + } + + /* Surrogate pair. */ + + if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) { + if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) { + return NJS_UNICODE_ERROR; + } + + ctx->codepoint = unit; + + if (*start >= end) { + return NJS_UNICODE_CONTINUE; + } + + goto pair_state; + } + + return unit; +} diff -r c39329b57a06 -r 63106bd2e9bf src/njs_utf16.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/njs_utf16.h Wed Jul 15 19:19:18 2020 +0300 @@ -0,0 +1,25 @@ + +/* + * Copyright (C) Alexander Borisov + * Copyright (C) NGINX, Inc. + */ + +#ifndef _NJS_UTF16_H_INCLUDED_ +#define _NJS_UTF16_H_INCLUDED_ + + +NJS_EXPORT ssize_t njs_utf16_encode(uint32_t cp, u_char **start, + const u_char *end); +NJS_EXPORT uint32_t njs_utf16_decode(njs_unicode_decode_t *ctx, + const u_char **start, const u_char *end); + + +njs_inline void +njs_utf16_decode_init(njs_unicode_decode_t *ctx) +{ + ctx->upper = 0x00; + ctx->codepoint = 0x00; +} + + +#endif /* _NJS_UTF16_H_INCLUDED_ */ diff -r c39329b57a06 -r 63106bd2e9bf src/test/unicode_unit_test.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/test/unicode_unit_test.c Wed Jul 15 19:19:18 2020 +0300 @@ -0,0 +1,312 @@ + +/* + * Copyright (C) Igor Sysoev + * Copyright (C) NGINX, Inc. + */ + + +#include <njs_main.h> + + +#define NJS_UTF8_START_TEST 0xC2 + + +static u_char invalid[] = { + + /* Invalid first byte less than 0xC2. */ + 1, 0x80, 0x00, 0x00, 0x00, + 1, 0xC0, 0x00, 0x00, 0x00, + 2, 0xC0, 0x00, 0x00, 0x00, + 3, 0xC0, 0x00, 0x00, 0x00, + 4, 0xC0, 0x00, 0x00, 0x00, + + /* Invalid 0x0x110000 value. */ + 4, 0xF4, 0x90, 0x80, 0x80, + + /* Incomplete length. */ + 2, 0xE0, 0xAF, 0xB5, 0x00, + + /* Overlong values. */ + 2, 0xC0, 0x80, 0x00, 0x00, + 2, 0xC1, 0xB3, 0x00, 0x00, + 3, 0xE0, 0x80, 0x80, 0x00, + 3, 0xE0, 0x81, 0xB3, 0x00, + 3, 0xE0, 0x90, 0x9A, 0x00, + 4, 0xF0, 0x80, 0x8A, 0x80, + 4, 0xF0, 0x80, 0x81, 0xB3, + 4, 0xF0, 0x80, 0xAF, 0xB5, +}; + + +static njs_int_t +utf8_overlong(u_char *overlong, size_t len) +{ + u_char *p, utf8[4]; + size_t size; + uint32_t u, d; + njs_uint_t i; + const u_char *pp; + + pp = overlong; + + d = njs_utf8_decode(&pp, overlong + len); + + len = pp - overlong; + + if (d != 0xFFFFFFFF) { + p = njs_utf8_encode(utf8, d); + + size = (p != NULL) ? p - utf8 : 0; + + if (len != size || memcmp(overlong, utf8, size) != 0) { + + u = 0; + for (i = 0; i < len; i++) { + u = (u << 8) + overlong[i]; + } + + njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD, %uz\n", + u, len, d, size); + + return NJS_ERROR; + } + } + + return NJS_OK; +} + + +static njs_int_t +utf8_unit_test(njs_uint_t start) +{ + u_char *p, utf8[4]; + size_t len; + int32_t n; + uint32_t u, d; + njs_uint_t i, k, l, m; + const u_char *pp; + + njs_printf("utf8 test started\n"); + + /* Test valid UTF-8. */ + + for (u = 0; u < 0x110000; u++) { + + p = njs_utf8_encode(utf8, u); + + if (p == NULL) { + njs_printf("njs_utf8_encode(%05uXD) failed\n", u); + return NJS_ERROR; + } + + pp = utf8; + + d = njs_utf8_decode(&pp, p); + + if (u != d) { + njs_printf("njs_utf8_decode(%05uXD) failed: %05uxD\n", u, d); + return NJS_ERROR; + } + } + + /* Test some invalid UTF-8. */ + + for (i = 0; i < sizeof(invalid); i += 5) { + + len = invalid[i]; + utf8[0] = invalid[i + 1]; + utf8[1] = invalid[i + 2]; + utf8[2] = invalid[i + 3]; + utf8[3] = invalid[i + 4]; + + pp = utf8; + + d = njs_utf8_decode(&pp, utf8 + len); + + if (d != 0xFFFFFFFF) { + + u = 0; + for (i = 0; i < len; i++) { + u = (u << 8) + utf8[i]; + } + + njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD\n", + u, len, d); + return NJS_ERROR; + } + } + + /* Test all overlong UTF-8. */ + + for (i = start; i < 256; i++) { + utf8[0] = i; + + if (utf8_overlong(utf8, 1) != NJS_OK) { + return NJS_ERROR; + } + + for (k = 0; k < 256; k++) { + utf8[1] = k; + + if (utf8_overlong(utf8, 2) != NJS_OK) { + return NJS_ERROR; + } + + for (l = 0; l < 256; l++) { + utf8[2] = l; + + if (utf8_overlong(utf8, 3) != NJS_OK) { + return NJS_ERROR; + } + + for (m = 0; m < 256; m++) { + utf8[3] = m; + + if (utf8_overlong(utf8, 4) != NJS_OK) { + return NJS_ERROR; + } + } + } + } + } + + n = njs_utf8_casecmp((u_char *) "ABC АБВ ΑΒΓ", + (u_char *) "abc абв αβγ", + njs_length("ABC АБВ ΑΒΓ"), + njs_length("abc абв αβγ")); + + if (n != 0) { + njs_printf("njs_utf8_casecmp() failed\n"); + return NJS_ERROR; + } + + njs_printf("utf8 test passed\n"); + return NJS_OK; +} + + +static njs_int_t +utf16_unit_test() +{ + int8_t length, length_to; + u_char *start, *end, *end_to; + uint32_t cp, i; + njs_unicode_decode_t ctx; + u_char buf[8], to[4]; + + njs_printf("utf16 test started\n"); + + end = buf + sizeof(buf); + end_to = to + sizeof(to); + + for (i = 0; i <= NJS_UNICODE_MAX_CODEPOINT; i++) { + + /* Skip surrogate pair. */ + + if (i >= 0xD800 && i <= 0xDFFF) { + continue; + } + + start = buf; + + length = njs_utf16_encode(i, &start, end); + if (length < NJS_OK) { + njs_printf("utf16 test encode failed\n"); + return NJS_ERROR; + } + + njs_utf16_decode_init(&ctx); + + start = buf; + + cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + length); + if (cp > NJS_UNICODE_MAX_CODEPOINT) { + njs_printf("utf16 test decode failed\n"); + return NJS_ERROR; + } + + if (cp != i) { + njs_printf("utf16 test decode code point does not match\n"); + return NJS_ERROR; + } + + start = to; + + length_to = njs_utf16_encode(cp, &start, end_to); + if (length_to < NJS_OK) { + njs_printf("utf16 test encode failed\n"); + return NJS_ERROR; + } + + if (length_to != length || njs_strncmp(buf, to, length) != 0) { + njs_printf("utf16 test decode-encode failed\n"); + return NJS_ERROR; + } + } + + /* Surrogate pair. */ + + for (i = 0xD800; i <= 0xDFFF; i++) { + start = buf; + + length = njs_utf16_encode(i, &start, end); + if (length < NJS_OK) { + njs_printf("utf16 test surrogate pair encode lead failed\n"); + return NJS_ERROR; + } + + length_to = njs_utf16_encode(i - 0xD800 + 0xDC00, &start, end); + if (length_to < NJS_OK) { + njs_printf("utf16 test surrogate pair encode failed\n"); + return NJS_ERROR; + } + + njs_utf16_decode_init(&ctx); + + start = buf; + + cp = njs_utf16_decode(&ctx, (const u_char **) &start, + start + length + length_to); + if (cp > NJS_UNICODE_MAX_CODEPOINT) { + if (i < 0xDC00) { + njs_printf("utf16 test surrogate pair decode failed\n"); + return NJS_ERROR; + } + } + } + + njs_printf("utf16 test passed\n"); + + return NJS_OK; +} + + +int +main(int argc, char **argv) +{ + njs_int_t ret; + njs_uint_t start; + + njs_printf("unicode unit test started\n"); + + if (argc > 1 && argv[1][0] == 'a') { + start = NJS_UTF8_START_TEST; + + } else { + start = 256; + } + + ret = utf8_unit_test(start); + if (ret != NJS_OK) { + return ret; + } + + ret = utf16_unit_test(); + if (ret != NJS_OK) { + return ret; + } + + njs_printf("unicode unit test passed\n"); + + return 0; +} diff -r c39329b57a06 -r 63106bd2e9bf src/test/utf8_unit_test.c --- a/src/test/utf8_unit_test.c Wed Jul 15 15:34:16 2020 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ - -/* - * Copyright (C) Igor Sysoev - * Copyright (C) NGINX, Inc. - */ - - -#include <njs_main.h> - - -#define NJS_UTF8_START_TEST 0xC2 -//#define NJS_UTF8_START_TEST 0 - - -static u_char invalid[] = { - - /* Invalid first byte less than 0xC2. */ - 1, 0x80, 0x00, 0x00, 0x00, - 1, 0xC0, 0x00, 0x00, 0x00, - 2, 0xC0, 0x00, 0x00, 0x00, - 3, 0xC0, 0x00, 0x00, 0x00, - 4, 0xC0, 0x00, 0x00, 0x00, - - /* Invalid 0x0x110000 value. */ - 4, 0xF4, 0x90, 0x80, 0x80, - - /* Incomplete length. */ - 2, 0xE0, 0xAF, 0xB5, 0x00, - - /* Overlong values. */ - 2, 0xC0, 0x80, 0x00, 0x00, - 2, 0xC1, 0xB3, 0x00, 0x00, - 3, 0xE0, 0x80, 0x80, 0x00, - 3, 0xE0, 0x81, 0xB3, 0x00, - 3, 0xE0, 0x90, 0x9A, 0x00, - 4, 0xF0, 0x80, 0x8A, 0x80, - 4, 0xF0, 0x80, 0x81, 0xB3, - 4, 0xF0, 0x80, 0xAF, 0xB5, -}; - - -static njs_int_t -utf8_overlong(u_char *overlong, size_t len) -{ - u_char *p, utf8[4]; - size_t size; - uint32_t u, d; - njs_uint_t i; - const u_char *pp; - - pp = overlong; - - d = njs_utf8_decode(&pp, overlong + len); - - len = pp - overlong; - - if (d != 0xFFFFFFFF) { - p = njs_utf8_encode(utf8, d); - - size = (p != NULL) ? p - utf8 : 0; - - if (len != size || memcmp(overlong, utf8, size) != 0) { - - u = 0; - for (i = 0; i < len; i++) { - u = (u << 8) + overlong[i]; - } - - njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD, %uz\n", - u, len, d, size); - - return NJS_ERROR; - } - } - - return NJS_OK; -} - - -static njs_int_t -utf8_unit_test(njs_uint_t start) -{ - u_char *p, utf8[4]; - size_t len; - int32_t n; - uint32_t u, d; - njs_uint_t i, k, l, m; - const u_char *pp; - - njs_printf("utf8 unit test started\n"); - - /* Test valid UTF-8. */ - - for (u = 0; u < 0x110000; u++) { - - p = njs_utf8_encode(utf8, u); - - if (p == NULL) { - njs_printf("njs_utf8_encode(%05uXD) failed\n", u); - return NJS_ERROR; - } - - pp = utf8; - - d = njs_utf8_decode(&pp, p); - - if (u != d) { - njs_printf("njs_utf8_decode(%05uXD) failed: %05uxD\n", u, d); - return NJS_ERROR; - } - } - - /* Test some invalid UTF-8. */ - - for (i = 0; i < sizeof(invalid); i += 5) { - - len = invalid[i]; - utf8[0] = invalid[i + 1]; - utf8[1] = invalid[i + 2]; - utf8[2] = invalid[i + 3]; - utf8[3] = invalid[i + 4]; - - pp = utf8; - - d = njs_utf8_decode(&pp, utf8 + len); - - if (d != 0xFFFFFFFF) { - - u = 0; - for (i = 0; i < len; i++) { - u = (u << 8) + utf8[i]; - } - - njs_printf("njs_utf8_decode(%05uXD, %uz) failed: %05uXD\n", - u, len, d); - return NJS_ERROR; - } - } - - /* Test all overlong UTF-8. */ - - for (i = start; i < 256; i++) { - utf8[0] = i; - - if (utf8_overlong(utf8, 1) != NJS_OK) { - return NJS_ERROR; - } - - for (k = 0; k < 256; k++) { - utf8[1] = k; - - if (utf8_overlong(utf8, 2) != NJS_OK) { - return NJS_ERROR; - } - - for (l = 0; l < 256; l++) { - utf8[2] = l; - - if (utf8_overlong(utf8, 3) != NJS_OK) { - return NJS_ERROR; - } - - for (m = 0; m < 256; m++) { - utf8[3] = m; - - if (utf8_overlong(utf8, 4) != NJS_OK) { - return NJS_ERROR; - } - } - } - } - } - - n = njs_utf8_casecmp((u_char *) "ABC АБВ ΑΒΓ", - (u_char *) "abc абв αβγ", - njs_length("ABC АБВ ΑΒΓ"), - njs_length("abc абв αβγ")); - - if (n != 0) { - njs_printf("njs_utf8_casecmp() failed\n"); - return NJS_ERROR; - } - - njs_printf("utf8 unit test passed\n"); - return NJS_OK; -} - - -int -main(int argc, char **argv) -{ - njs_uint_t start; - - if (argc > 1 && argv[1][0] == 'a') { - start = NJS_UTF8_START_TEST; - - } else { - start = 256; - } - - return utf8_unit_test(start); -} _______________________________________________ nginx-devel mailing list nginx-devel@nginx.org http://mailman.nginx.org/mailman/listinfo/nginx-devel