details: https://hg.nginx.org/njs/rev/f1a70d67646d branches: changeset: 1037:f1a70d67646d user: Alexander Borisov <alexander.bori...@nginx.com> date: Wed Jul 10 14:20:53 2019 +0300 description: Added UTF8 validation for string literals.
All bad UTF-8 characters are replaced by '\uFFFD' (REPLACEMENT CHARACTER). diffstat: njs/njs_parser_terminal.c | 60 +++++++++++++-------- njs/test/njs_unit_test.c | 81 +++++++++++++++++++++++++++++ nxt/nxt_utf8.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++ nxt/nxt_utf8.h | 6 ++ 4 files changed, 250 insertions(+), 22 deletions(-) diffs (362 lines): diff -r b946c1073968 -r f1a70d67646d njs/njs_parser_terminal.c --- a/njs/njs_parser_terminal.c Mon Jul 08 17:51:58 2019 +0300 +++ b/njs/njs_parser_terminal.c Wed Jul 10 14:20:53 2019 +0300 @@ -907,31 +907,35 @@ done: nxt_int_t njs_parser_string_create(njs_vm_t *vm, njs_value_t *value) { - u_char *p; - ssize_t length; - nxt_str_t *src; + u_char *dst; + ssize_t size, length; + uint32_t cp; + nxt_str_t *src; + const u_char *p, *end; src = njs_parser_text(vm->parser); - length = nxt_utf8_length(src->start, src->length); + length = nxt_utf8_safe_length(src->start, src->length, &size); - if (nxt_slow_path(length < 0)) { - length = 0; + dst = njs_string_alloc(vm, value, size, length); + if (nxt_slow_path(dst == NULL)) { + return NXT_ERROR; } - p = njs_string_alloc(vm, value, src->length, length); - - if (nxt_fast_path(p != NULL)) { - memcpy(p, src->start, src->length); + p = src->start; + end = src->start + src->length; - if (length > NJS_STRING_MAP_STRIDE && (size_t) length != src->length) { - njs_string_offset_map_init(p, src->length); - } + while (p < end) { + cp = nxt_utf8_safe_decode(&p, end); - return NXT_OK; + dst = nxt_utf8_encode(dst, cp); } - return NXT_ERROR; + if (size > NJS_STRING_MAP_STRIDE && size != length) { + njs_string_offset_map_init(value->long_string.data->start, size); + } + + return NXT_OK; } @@ -1042,11 +1046,27 @@ njs_parser_escape_string_create(njs_vm_t continue; default: + if (c >= 0x80) { + src--; + goto utf8_copy; + } + break; } } - *dst++ = c; + if (c < 0x80) { + *dst++ = c; + + continue; + } + + utf8_copy: + + src--; + + cp = nxt_utf8_safe_decode2(&src, end); + dst = nxt_utf8_encode(dst, 
cp); continue; @@ -1166,13 +1186,9 @@ njs_parser_escape_string_calc_length(njs } if (*src >= 0x80) { - ptr = src; + cp = nxt_utf8_safe_decode2(&src, end); - if (nxt_slow_path(nxt_utf8_decode(&src, end) == 0xffffffff)) { - goto invalid; - } - - size += src - ptr; + size += nxt_utf8_size(cp); length++; continue; diff -r b946c1073968 -r f1a70d67646d njs/test/njs_unit_test.c --- a/njs/test/njs_unit_test.c Mon Jul 08 17:51:58 2019 +0300 +++ b/njs/test/njs_unit_test.c Wed Jul 10 14:20:53 2019 +0300 @@ -4367,6 +4367,48 @@ static njs_unit_test_t njs_test[] = { nxt_string("var a = '123'\n[2].toString();a"), nxt_string("3") }, + { nxt_string("'\xE5\x96\x9C\xE3\x81\xB6'"), + nxt_string("喜ぶ") }, + + /* Broken UTF-8 literals.*/ + + { nxt_string("'\x96\xE5\x9C\xE3\x81\xB6'"), + nxt_string("��ぶ") }, + + { nxt_string("'\x96\xE5\x9C'"), + nxt_string("��") }, + + { nxt_string("'\x96\xE5'"), + nxt_string("��") }, + + { nxt_string("'\x96'"), + nxt_string("�") }, + + { nxt_string("'\xF3'"), + nxt_string("�") }, + + { nxt_string("'\xF3\xFF'"), + nxt_string("��") }, + + { nxt_string("'\x96\x96\xE5\x9C\xE3\x81\xB6'"), + nxt_string("���ぶ") }, + + { nxt_string("'\x9C\x96\xE5\xE3\x81\xB6'"), + nxt_string("���ぶ") }, + + { nxt_string("'\xE5\x9C\xE3\x81\xB6'"), + nxt_string("�ぶ") }, + + { nxt_string("'\xEF\xBF\xBD\xE3\x81\xB6'"), + nxt_string("�ぶ") }, + + { nxt_string("'\xE5\xF6\x9C\xE3\x81\xB6'"), + nxt_string("���ぶ") }, + + { nxt_string("var a = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\xF3'; " + "[a.length, a[33], a[34]]"), + nxt_string("35,a,�") }, + /* Escape strings. 
*/ { nxt_string("'\\a \\' \\\" \\\\ \\0 \\b \\f \\n \\r \\t \\v'"), @@ -4495,6 +4537,45 @@ static njs_unit_test_t njs_test[] = { nxt_string("'\\u{D800}\\u{'"), nxt_string("SyntaxError: Invalid Unicode code point \"\\u{D800}\\u{\" in 1") }, + /* Broken UTF-8 literals.*/ + + { nxt_string("'\\a\x96\xE5\x9C\xE3\x81\xB6'"), + nxt_string("a��ぶ") }, + + { nxt_string("'\x96\\a\xE5\x9C'"), + nxt_string("�a�") }, + + { nxt_string("'\x96\xE5\\a'"), + nxt_string("��a") }, + + { nxt_string("'\\a\x96\\a'"), + nxt_string("a�a") }, + + { nxt_string("'\xF3\\a'"), + nxt_string("�a") }, + + { nxt_string("'\xF3\\a\xFF'"), + nxt_string("�a�") }, + + { nxt_string("'\\a\x96\x96\xE5\x9C\xE3\x81\xB6'"), + nxt_string("a���ぶ") }, + + { nxt_string("'\\a\x9C\x96\xE5\xE3\x81\xB6'"), + nxt_string("a���ぶ") }, + + { nxt_string("'\\a\xE5\x9C\xE3\x81\xB6'"), + nxt_string("a�ぶ") }, + + { nxt_string("'\\a\xEF\xBF\xBD\xE3\x81\xB6'"), + nxt_string("a�ぶ") }, + + { nxt_string("'\\a\xE5\xF6\x9C\xE3\x81\xB6'"), + nxt_string("a���ぶ") }, + + { nxt_string("var a = '\\aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\xF3'; " + "[a.length, a[34], a[35]]"), + nxt_string("36,a,�") }, + { nxt_string("''.hasOwnProperty('length')"), nxt_string("true") }, diff -r b946c1073968 -r f1a70d67646d nxt/nxt_utf8.c --- a/nxt/nxt_utf8.c Mon Jul 08 17:51:58 2019 +0300 +++ b/nxt/nxt_utf8.c Wed Jul 10 14:20:53 2019 +0300 @@ -163,6 +163,103 @@ nxt_utf8_decode2(const u_char **start, c } +uint32_t +nxt_utf8_safe_decode(const u_char **start, const u_char *end) +{ + uint32_t u; + + u = (uint32_t) **start; + + if (u < 0x80) { + (*start)++; + return u; + } + + return nxt_utf8_safe_decode2(start, end); +} + + +uint32_t +nxt_utf8_safe_decode2(const u_char **start, const u_char *end) +{ + u_char c; + size_t n; + uint32_t u, overlong; + const u_char *p; + + p = *start; + u = (uint32_t) *p; + + if (u >= 0xE0) { + + if (u >= 0xF0) { + + if (nxt_slow_path(u > 0xF4)) { + /* + * The maximum valid Unicode character is 0x10FFFF + * which is encoded as 0xF4 0x8F 
0xBF 0xBF. + */ + goto fail_one; + } + + u &= 0x07; + overlong = 0x00FFFF; + n = 3; + + } else { + u &= 0x0F; + overlong = 0x07FF; + n = 2; + } + + } else if (u >= 0xC2) { + + /* 0x80 is encoded as 0xC2 0x80. */ + + u &= 0x1F; + overlong = 0x007F; + n = 1; + + } else { + /* u < 0xC2 */ + goto fail_one; + } + + p++; + + while (p < end && n != 0) { + c = *p++; + /* + * The byte must be in the 0x80 - 0xBF range. + * Values below 0x80 become >= 0x80. + */ + c = c - 0x80; + + if (nxt_slow_path(c > 0x3F)) { + *start = --p; + return NXT_UTF8_REPLACEMENT; + } + + u = (u << 6) | c; + n--; + } + + *start = p; + + if (n == 0 && overlong < u && u < 0x110000) { + return u; + } + + return NXT_UTF8_REPLACEMENT; + +fail_one: + + (*start)++; + + return NXT_UTF8_REPLACEMENT; +} + + /* * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but * requires lengths of both strings because otherwise nxt_utf8_decode2() @@ -279,6 +376,34 @@ nxt_utf8_length(const u_char *p, size_t } +ssize_t +nxt_utf8_safe_length(const u_char *p, size_t len, ssize_t *out_size) +{ + ssize_t size, length; + uint32_t codepoint; + const u_char *end; + + size = 0; + length = 0; + + end = p + len; + + while (p < end) { + codepoint = nxt_utf8_safe_decode(&p, end); + + size += nxt_utf8_size(codepoint); + + length++; + } + + if (out_size != NULL) { + *out_size = size; + } + + return length; +} + + nxt_bool_t nxt_utf8_is_valid(const u_char *p, size_t len) { diff -r b946c1073968 -r f1a70d67646d nxt/nxt_utf8.h --- a/nxt/nxt_utf8.h Mon Jul 08 17:51:58 2019 +0300 +++ b/nxt/nxt_utf8.h Wed Jul 10 14:20:53 2019 +0300 @@ -21,6 +21,10 @@ NXT_EXPORT u_char *nxt_utf8_encode(u_char *p, uint32_t u); NXT_EXPORT uint32_t nxt_utf8_decode(const u_char **start, const u_char *end); NXT_EXPORT uint32_t nxt_utf8_decode2(const u_char **start, const u_char *end); +NXT_EXPORT uint32_t nxt_utf8_safe_decode(const u_char **start, + const u_char *end); +NXT_EXPORT uint32_t nxt_utf8_safe_decode2(const u_char **start, + const u_char 
*end); NXT_EXPORT nxt_int_t nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1, size_t len2); NXT_EXPORT uint32_t nxt_utf8_lower_case(const u_char **start, @@ -28,6 +32,8 @@ NXT_EXPORT uint32_t nxt_utf8_lower_case( NXT_EXPORT uint32_t nxt_utf8_upper_case(const u_char **start, const u_char *end); NXT_EXPORT ssize_t nxt_utf8_length(const u_char *p, size_t len); +NXT_EXPORT ssize_t nxt_utf8_safe_length(const u_char *p, size_t len, + ssize_t *out_size); NXT_EXPORT nxt_bool_t nxt_utf8_is_valid(const u_char *p, size_t len); _______________________________________________ nginx-devel mailing list nginx-devel@nginx.org http://mailman.nginx.org/mailman/listinfo/nginx-devel