details:   https://hg.nginx.org/njs/rev/46d505a902bb
branches:
changeset: 1953:46d505a902bb
user:      Dmitry Volyntsev <xei...@nginx.com>
date:      Wed Sep 14 22:14:50 2022 -0700
description:
Parser: properly handling unicode space characters.

diffstat:

 src/njs_lexer.c          | 35 +++++++++++++++++++++++++++++------
 src/njs_str.h            | 1 -
 src/test/njs_unit_test.c | 5 +++++
 3 files changed, 34 insertions(+), 7 deletions(-)

diffs (83 lines):

diff -r 05efe34376ab -r 46d505a902bb src/njs_lexer.c
--- a/src/njs_lexer.c Tue Sep 13 21:13:17 2022 -0700
+++ b/src/njs_lexer.c Wed Sep 14 22:14:50 2022 -0700
@@ -45,8 +45,8 @@ static const uint8_t njs_tokens[256] n
                 NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
                 NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
     /* \t */    NJS_TOKEN_ILLEGAL,           NJS_TOKEN_SPACE,
-    /* \n */    NJS_TOKEN_LINE_END,          NJS_TOKEN_ILLEGAL,
-    /* \r */    NJS_TOKEN_ILLEGAL,           NJS_TOKEN_SPACE,
+    /* \n */    NJS_TOKEN_LINE_END,          NJS_TOKEN_SPACE,
+    /* \r */    NJS_TOKEN_SPACE,             NJS_TOKEN_SPACE,
                 NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
 
     /* 0x10 */  NJS_TOKEN_ILLEGAL,           NJS_TOKEN_ILLEGAL,
@@ -437,15 +437,38 @@ njs_lexer_consume_token(njs_lexer_t *lex
 njs_int_t
 njs_lexer_make_token(njs_lexer_t *lexer, njs_lexer_token_t *token)
 {
-    u_char  c, *p;
+    u_char                c, *p;
+    uint32_t              cp;
+    njs_unicode_decode_t  ctx;
 
     c = ' ';
 
+    njs_utf8_decode_init(&ctx);
+
     while (lexer->start < lexer->end) {
-        c = *lexer->start++;
+        c = *lexer->start;
+
+        if (njs_fast_path(!(c & 0x80))) {
+            lexer->start++;
+
+            if (njs_tokens[c] != NJS_TOKEN_SPACE) {
+                break;
+            }
 
-        if (njs_tokens[c] != NJS_TOKEN_SPACE) {
-            break;
+        } else {
+
+            /* Unicode. */
+
+            cp = njs_utf8_decode(&ctx, (const u_char **) &lexer->start,
+                                 lexer->end);
+            if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
+                c = '\0';
+                break;
+            }
+
+            if (!njs_utf8_is_whitespace(cp)) {
+                break;
+            }
         }
     }
 
diff -r 05efe34376ab -r 46d505a902bb src/njs_str.h
--- a/src/njs_str.h Tue Sep 13 21:13:17 2022 -0700
+++ b/src/njs_str.h Wed Sep 14 22:14:50 2022 -0700
@@ -51,7 +51,6 @@ njs_is_whitespace(u_char c)
     case 0x0C:  /* <FF>   */
     case 0x0D:  /* <CR>   */
     case 0x20:  /* <SP>   */
-    case 0xA0:  /* <NBSP> */
         return 1;
 
     default:
diff -r 05efe34376ab -r 46d505a902bb src/test/njs_unit_test.c
--- a/src/test/njs_unit_test.c Tue Sep 13 21:13:17 2022 -0700
+++ b/src/test/njs_unit_test.c Wed Sep 14 22:14:50 2022 -0700
@@ -7341,6 +7341,11 @@ static njs_unit_test_t njs_test[] =
               "[a.length, a[33], a[34]]"),
       njs_str("35,a,�") },
 
+    /* Spaces: U+0009U+000BU+000CU+0020U+00A0U+000AU+000DU+2028U+2029 */
+
+    { njs_str("\x09\x0a\x0b\x0c\x0d \xc2\xa0'a'\xe2\x80\xa8+\xe2\x80\xa9'b'"),
+      njs_str("ab") },
+
     /* Escape strings. */
 
     { njs_str("'\\a \\' \\\" \\\\ \\0 \\b \\f \\n \\r \\t \\v'"),
_______________________________________________
nginx-devel mailing list -- nginx-devel@nginx.org
To unsubscribe send an email to nginx-devel-le...@nginx.org