Resending the email with the patches renamed to use a .txt extension.

On Sun, 16 Apr 2023 at 23:51, Costas Argyris <[email protected]> wrote:
> Fix the mbrtoc16 function reported here:
>
> https://sourceforge.net/p/mingw-w64/bugs/957/
>
> In contrast to mbrtoc32 which was only slightly misbehaving (now fixed),
> mbrtoc16 is entirely broken, returning the UTF-8 bytes unchanged.
>
> The proposed implementation in this patch has mbrtoc16 assume UTF-8
> input, something which its mbrtoc32 counterpart has always been doing,
> and reuses mbrtoc32 to simplify the implementation significantly, as it is
> quite easy to get the UTF-16 code unit(s) once you have the UTF-32
> code unit (Unicode code point value) first.
>
> I also added tests for both functions, which are essentially taken from
>
> https://en.cppreference.com/w/c/string/multibyte/mbrtoc16
> https://en.cppreference.com/w/c/string/multibyte/mbrtoc32
>
> with a little bit of added checking in the end.
>
> I put the tests in a separate patch because I think adding them requires
> re-generating mingw-w64-crt/Makefile.in and configure and I wasn't
> sure how this is typically handled and if we even want to do that in this
> case.
>
> Thanks,
> Costas
>
From 14847703ec4a860c7029e78261057f8672ab3f9f Mon Sep 17 00:00:00 2001 From: Costas Argyris <[email protected]> Date: Sun, 16 Apr 2023 22:48:04 +0100 Subject: [PATCH] Re-implement (broken) mbrtoc16 to assume UTF-8 input like its mbrtoc32 counterpart. The new implementation uses mbrtoc32 first to convert the UTF-8 character to UTF-32 (Unicode code point), and then converts that code point to UTF-16. Tested implementation against the example in: https://en.cppreference.com/w/c/string/multibyte/mbrtoc16 as reported in: https://sourceforge.net/p/mingw-w64/bugs/957/ Signed-off-by: Costas Argyris <[email protected]> --- mingw-w64-crt/misc/uchar_mbrtoc16.c | 53 +++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/mingw-w64-crt/misc/uchar_mbrtoc16.c b/mingw-w64-crt/misc/uchar_mbrtoc16.c index 9de35fe07..3e2c84075 100644 --- a/mingw-w64-crt/misc/uchar_mbrtoc16.c +++ b/mingw-w64-crt/misc/uchar_mbrtoc16.c @@ -21,13 +21,62 @@ #include <errno.h> #include <uchar.h> +#include <assert.h> + +static mbstate_t sstate = 0; size_t mbrtoc16 (char16_t *__restrict__ pc16, const char *__restrict__ s, size_t n, mbstate_t *__restrict__ state) { -/* wchar_t should compatible to char16_t on Windows */ - return mbrtowc((wchar_t *)pc16, s, n, state); + if (!state) state = &sstate; + + if (*state) { + /* State must be holding a valid UTF-16 low (trailing) surrogate. */ + char16_t ls = (char16_t) *state; + if ( !(0xDC00 <= ls && ls <= 0xDFFF) ) { + errno = EILSEQ; + return (size_t) -1; + } + if (pc16) *pc16 = ls; + *state = 0; + return (size_t) -3; + } + + size_t rc; + char32_t cp; /* UTF-32 value (Unicode code point) */ + rc = mbrtoc32 (&cp, s, n, NULL); /* state not used in mbrtoc32 */ + + /* Check if mbrtoc32 succeeded in getting a Unicode code point + from max n bytes of the UTF-8 multibyte character string s. 
*/ + assert (rc != (size_t) -3); /* no surrogate pairs in UTF-32 */ + if (rc == (size_t) -1) { + errno = EILSEQ; + return (size_t) -1; /* invalid input */ + } + if (rc == (size_t) -2) return (size_t) -2; /* truncated input */ + + /* At this point we have a valid Unicode code point. Convert it + to one or two UTF-16 code units. */ + if ((cp <= 0xD7FF) || (0xE000 <= cp && cp <= 0xFFFF)) { + /* Unicode code points in these ranges take a single UTF-16 + code unit that is numerically equal to the code point value. */ + if (pc16) *pc16 = (char16_t) cp; + } + else if (0x010000 <= cp && cp <= 0x10FFFF) { + /* In this range we have surrogate pairs. */ + /* Write high (leading) surrogate to output. */ + if (pc16) *pc16 = (char16_t) (0xD800 + ((cp - 0x10000) >> 10)); + /* Write low (trailing) surrogate to state to + return it with the next function call. */ + *state = (char16_t) (0xDC00 + (cp & 0x3FF)); + } + else { + /* mbrtoc32 should never return a bad code point. */ + assert (0); + } + + return rc; } -- 2.30.2
From 31a5824bdbed2b005b1c639dcfce79762b99d0f4 Mon Sep 17 00:00:00 2001 From: Costas Argyris <[email protected]> Date: Sun, 16 Apr 2023 23:19:22 +0100 Subject: [PATCH] Tests for mbrtoc{16,32} functions using examples from: https://en.cppreference.com/w/c/string/multibyte/mbrtoc16 https://en.cppreference.com/w/c/string/multibyte/mbrtoc32 Signed-off-by: Costas Argyris <[email protected]> --- mingw-w64-crt/Makefile.am | 4 +- mingw-w64-crt/testcases/t_mbrtoc16.c | 54 +++++++++++++++++++++++++++ mingw-w64-crt/testcases/t_mbrtoc32.c | 55 ++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 mingw-w64-crt/testcases/t_mbrtoc16.c create mode 100644 mingw-w64-crt/testcases/t_mbrtoc32.c diff --git a/mingw-w64-crt/Makefile.am b/mingw-w64-crt/Makefile.am index 3cf7203e9..43bb308bc 100644 --- a/mingw-w64-crt/Makefile.am +++ b/mingw-w64-crt/Makefile.am @@ -2480,7 +2480,9 @@ testcase_progs = \ testcases/t_trycatch \ testcases/t_stat_slash \ testcases/t_wreaddir \ - testcases/t_fseeko64 + testcases/t_fseeko64 \ + testcases/t_mbrtoc16 \ + testcases/t_mbrtoc32 testcases_tstmaincpp_SOURCES = testcases/tstmaincpp.cpp testcases_t_trycatch_SOURCES = testcases/t_trycatch.cpp diff --git a/mingw-w64-crt/testcases/t_mbrtoc16.c b/mingw-w64-crt/testcases/t_mbrtoc16.c new file mode 100644 index 000000000..d6de92529 --- /dev/null +++ b/mingw-w64-crt/testcases/t_mbrtoc16.c @@ -0,0 +1,54 @@ +// Test case from: +// https://en.cppreference.com/w/c/string/multibyte/mbrtoc16 + +#include <stdio.h> +#include <uchar.h> + +mbstate_t state; + +int main(void) +{ + char in[] = u8"zß水🍌"; // or "z\u00df\u6c34\U0001F34C" + size_t in_sz = sizeof in / sizeof *in; + + printf("Processing %zu UTF-8 code units: [ ", in_sz); + for(size_t n = 0; n < in_sz; ++n) printf("%#x ", (unsigned char)in[n]); + puts("]"); + + char16_t out[in_sz]; + char *p_in = in, *end = in + in_sz; + char16_t *p_out = out; + size_t rc; + while((rc = mbrtoc16(p_out, p_in, (size_t) (end - p_in), 
&state))) + { + if(rc == (size_t)-1) // invalid input + break; + else if(rc == (size_t)-2) // truncated input + break; + else if(rc == (size_t)-3) // UTF-16 high surrogate + p_out += 1; + else { + p_in += rc; + p_out += 1; + }; + } + + size_t out_sz = (size_t) (p_out - out) + 1; + printf("into %zu UTF-16 code units: [ ", out_sz); + for(size_t x = 0; x < out_sz; ++x) printf("%#x ", out[x]); + puts("]"); + + size_t out_sz_exp = 6; + if(out_sz_exp != out_sz) { + printf("Expected %zu UTF-16 code units but got %zu!\n", out_sz_exp, out_sz); + return 1; + } + char16_t out_exp[] = {0x7a, 0xdf, 0x6c34, 0xd83c, 0xdf4c, 0}; + for(size_t x = 0; x < out_sz; ++x) + if(out_exp[x] != out[x]) { + printf("Code unit %zu was expected %zu but found %zu!\n", x, out_exp[x], out[x]); + return 1; + } + + return 0; +} diff --git a/mingw-w64-crt/testcases/t_mbrtoc32.c b/mingw-w64-crt/testcases/t_mbrtoc32.c new file mode 100644 index 000000000..033a6041b --- /dev/null +++ b/mingw-w64-crt/testcases/t_mbrtoc32.c @@ -0,0 +1,55 @@ +// Test case from: +// https://en.cppreference.com/w/c/string/multibyte/mbrtoc32 + +#include <stdio.h> +#include <string.h> +#include <uchar.h> +#include <assert.h> + +int main(void) +{ + char in[] = u8"zß水🍌"; // or "z\u00df\u6c34\U0001F34C" + const size_t in_size = sizeof in / sizeof *in; + + printf("Processing %zu UTF-8 code units: [ ", in_size); + for (size_t i = 0; i < in_size; ++i) + printf("0x%02x ", (unsigned char)in[i]); + + puts("]"); + + char32_t out[in_size]; + char32_t *p_out = out; + char *p_in = in, *end = in + in_size; + mbstate_t state; + memset(&state, 0, sizeof(state)); + size_t rc; + while ((rc = mbrtoc32(p_out, p_in, end - p_in, &state))) + { + assert(rc != (size_t)-3); // no surrogate pairs in UTF-32 + if (rc == (size_t)-1) break; // invalid input + if (rc == (size_t)-2) break; // truncated input + p_in += rc; + ++p_out; + } + + size_t out_size = p_out+1 - out; + printf("into %zu UTF-32 code units: [ ", out_size); + for (size_t i = 0; i < out_size; 
++i) + printf("0x%08X ", out[i]); + + puts("]"); + + size_t out_size_exp = 5; + if(out_size_exp != out_size) { + printf("Expected %zu UTF-32 code units but got %zu!\n", out_size_exp, out_size); + return 1; + } + char32_t out_exp[] = {0x0000007A, 0x000000DF, 0x00006C34, 0x0001F34C, 0x00000000}; + for(size_t i = 0; i < out_size; ++i) + if(out_exp[i] != out[i]) { + printf("Code unit %zu was expected %zu but found %zu!\n", i, out_exp[i], out[i]); + return 1; + } + + return 0; +} -- 2.30.2
_______________________________________________ Mingw-w64-public mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/mingw-w64-public
