Repeating email with patches renamed to .txt extension.

On Sun, 16 Apr 2023 at 23:51, Costas Argyris <[email protected]>
wrote:

> Fix the mbrtoc16 function reported here:
>
> https://sourceforge.net/p/mingw-w64/bugs/957/
>
> In contrast to mbrtoc32 which was only slightly misbehaving (now fixed),
> mbrtoc16 is entirely broken, returning the UTF-8 bytes unchanged.
>
> The proposed implementation in this patch has mbrtoc16 assume UTF-8
> input, something which its mbrtoc32 counterpart has always been doing,
> and reuses mbrtoc32 to simplify the implementation significantly, as it is
> quite easy to get the UTF-16 code unit(s) once you have the UTF-32
> code unit (Unicode code point value) first.
>
> I also added tests for both functions, which are essentially taken from
>
> https://en.cppreference.com/w/c/string/multibyte/mbrtoc16
> https://en.cppreference.com/w/c/string/multibyte/mbrtoc32
>
> with a little bit of added checking at the end.
>
> I put the tests in a separate patch because I think adding them requires
> re-generating mingw-w64-crt/Makefile.in and configure and I wasn't
> sure how this is typically handled and if we even want to do that in this
> case.
>
> Thanks,
> Costas
>
From 14847703ec4a860c7029e78261057f8672ab3f9f Mon Sep 17 00:00:00 2001
From: Costas Argyris <[email protected]>
Date: Sun, 16 Apr 2023 22:48:04 +0100
Subject: [PATCH] Re-implement (broken) mbrtoc16 to assume UTF-8 input like its
 mbrtoc32 counterpart.

The new implementation uses mbrtoc32 first to convert
the UTF-8 character to UTF-32 (Unicode code point),
and then converts that code point to UTF-16.

Tested implementation against the example in:

https://en.cppreference.com/w/c/string/multibyte/mbrtoc16

as reported in:

https://sourceforge.net/p/mingw-w64/bugs/957/

Signed-off-by: Costas Argyris <[email protected]>
---
 mingw-w64-crt/misc/uchar_mbrtoc16.c | 53 +++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/mingw-w64-crt/misc/uchar_mbrtoc16.c b/mingw-w64-crt/misc/uchar_mbrtoc16.c
index 9de35fe07..3e2c84075 100644
--- a/mingw-w64-crt/misc/uchar_mbrtoc16.c
+++ b/mingw-w64-crt/misc/uchar_mbrtoc16.c
@@ -21,13 +21,62 @@
 
 #include <errno.h>
 #include <uchar.h>
+#include <assert.h>
+
+static mbstate_t sstate = 0;
 
 size_t mbrtoc16 (char16_t *__restrict__ pc16,
                 const char *__restrict__ s,
                 size_t n,
                 mbstate_t *__restrict__ state)
 {
-/* wchar_t should compatible to char16_t on Windows */
-    return mbrtowc((wchar_t *)pc16, s, n, state);
+    if (!state) state = &sstate;
+
+    if (*state) {
+        /* State must be holding a valid UTF-16 low (trailing) surrogate. */
+        char16_t ls = (char16_t) *state;
+        if ( !(0xDC00 <= ls && ls <= 0xDFFF) ) {
+            errno = EILSEQ;
+            return (size_t) -1;
+        }
+        if (pc16) *pc16 = ls;
+        *state = 0;
+        return (size_t) -3;
+    }
+
+    size_t rc;
+    char32_t cp; /* UTF-32 value (Unicode code point) */
+    rc = mbrtoc32 (&cp, s, n, NULL); /* state not used in mbrtoc32 */
+
+    /* Check if mbrtoc32 succeeded in getting a Unicode code point
+       from max n bytes of the UTF-8 multibyte character string s. */
+    assert (rc != (size_t) -3); /* no surrogate pairs in UTF-32 */
+    if (rc == (size_t) -1) {
+        errno = EILSEQ;
+        return (size_t) -1; /* invalid input */
+    }
+    if (rc == (size_t) -2) return (size_t) -2; /* truncated input */
+
+    /* At this point we have a valid Unicode code point. Convert it
+       to one or two UTF-16 code units. */
+    if ((cp <= 0xD7FF) || (0xE000 <= cp && cp <= 0xFFFF)) {
+        /* Unicode code points in these ranges take a single UTF-16
+           code unit that is numerically equal to the code point value. */
+        if (pc16) *pc16 = (char16_t) cp;
+    }
+    else if (0x010000 <= cp && cp <= 0x10FFFF) {
+        /* In this range we have surrogate pairs. */
+        /* Write high (leading) surrogate to output. */
+        if (pc16) *pc16 = (char16_t) (0xD800 + ((cp - 0x10000) >> 10));
+        /* Write low (trailing) surrogate to state to
+           return it with the next function call. */
+        *state = (char16_t) (0xDC00 + (cp & 0x3FF));
+    }
+    else {
+        /* mbrtoc32 should never return a bad code point. */
+        assert (0);
+    }
+
+    return rc;
 }
 
-- 
2.30.2

From 31a5824bdbed2b005b1c639dcfce79762b99d0f4 Mon Sep 17 00:00:00 2001
From: Costas Argyris <[email protected]>
Date: Sun, 16 Apr 2023 23:19:22 +0100
Subject: [PATCH] Tests for mbrtoc{16,32} functions using examples from:

https://en.cppreference.com/w/c/string/multibyte/mbrtoc16
https://en.cppreference.com/w/c/string/multibyte/mbrtoc32

Signed-off-by: Costas Argyris <[email protected]>
---
 mingw-w64-crt/Makefile.am            |  4 +-
 mingw-w64-crt/testcases/t_mbrtoc16.c | 54 +++++++++++++++++++++++++++
 mingw-w64-crt/testcases/t_mbrtoc32.c | 55 ++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 mingw-w64-crt/testcases/t_mbrtoc16.c
 create mode 100644 mingw-w64-crt/testcases/t_mbrtoc32.c

diff --git a/mingw-w64-crt/Makefile.am b/mingw-w64-crt/Makefile.am
index 3cf7203e9..43bb308bc 100644
--- a/mingw-w64-crt/Makefile.am
+++ b/mingw-w64-crt/Makefile.am
@@ -2480,7 +2480,9 @@ testcase_progs = \
   testcases/t_trycatch \
   testcases/t_stat_slash \
   testcases/t_wreaddir \
-  testcases/t_fseeko64
+  testcases/t_fseeko64 \
+  testcases/t_mbrtoc16 \
+  testcases/t_mbrtoc32
 
 testcases_tstmaincpp_SOURCES = testcases/tstmaincpp.cpp
 testcases_t_trycatch_SOURCES = testcases/t_trycatch.cpp
diff --git a/mingw-w64-crt/testcases/t_mbrtoc16.c b/mingw-w64-crt/testcases/t_mbrtoc16.c
new file mode 100644
index 000000000..d6de92529
--- /dev/null
+++ b/mingw-w64-crt/testcases/t_mbrtoc16.c
@@ -0,0 +1,54 @@
+// Test case from:
+// https://en.cppreference.com/w/c/string/multibyte/mbrtoc16
+
+#include <stdio.h>
+#include <uchar.h>
+
+mbstate_t state;
+
+int main(void)
+{
+    char in[] = u8"zß水🍌"; // or "z\u00df\u6c34\U0001F34C"
+    size_t in_sz = sizeof in / sizeof *in;
+ 
+    printf("Processing %zu UTF-8 code units: [ ", in_sz);
+    for(size_t n = 0; n < in_sz; ++n) printf("%#x ", (unsigned char)in[n]);
+    puts("]");
+ 
+    char16_t out[in_sz];
+    char *p_in = in, *end = in + in_sz;
+    char16_t *p_out = out;
+    size_t rc;
+    while((rc = mbrtoc16(p_out, p_in, (size_t) (end - p_in), &state)))
+    {
+        if(rc == (size_t)-1)      // invalid input
+            break;
+        else if(rc == (size_t)-2) // truncated input
+            break;
+        else if(rc == (size_t)-3) // UTF-16 high surrogate
+            p_out += 1;
+        else {
+            p_in += rc;
+            p_out += 1;
+        };
+    }
+ 
+    size_t out_sz = (size_t) (p_out - out) + 1;
+    printf("into %zu UTF-16 code units: [ ", out_sz);
+    for(size_t x = 0; x < out_sz; ++x) printf("%#x ", out[x]);
+    puts("]");
+
+    size_t out_sz_exp = 6;
+    if(out_sz_exp != out_sz) {
+        printf("Expected %zu UTF-16 code units but got %zu!\n", out_sz_exp, out_sz);
+        return 1;
+    }
+    char16_t out_exp[] = {0x7a, 0xdf, 0x6c34, 0xd83c, 0xdf4c, 0};
+    for(size_t x = 0; x < out_sz; ++x)
+        if(out_exp[x] != out[x]) {
+            printf("Code unit %zu was expected %zu but found %zu!\n", x, out_exp[x], out[x]);
+            return 1;
+        }
+
+    return 0;
+}
diff --git a/mingw-w64-crt/testcases/t_mbrtoc32.c b/mingw-w64-crt/testcases/t_mbrtoc32.c
new file mode 100644
index 000000000..033a6041b
--- /dev/null
+++ b/mingw-w64-crt/testcases/t_mbrtoc32.c
@@ -0,0 +1,55 @@
+// Test case from:
+// https://en.cppreference.com/w/c/string/multibyte/mbrtoc32
+
+#include <stdio.h>
+#include <string.h>
+#include <uchar.h>
+#include <assert.h>
+ 
+int main(void)
+{
+    char in[] = u8"zß水🍌"; // or "z\u00df\u6c34\U0001F34C"
+    const size_t in_size = sizeof in / sizeof *in;
+ 
+    printf("Processing %zu UTF-8 code units: [ ", in_size);
+    for (size_t i = 0; i < in_size; ++i)
+        printf("0x%02x ", (unsigned char)in[i]);
+ 
+    puts("]");
+ 
+    char32_t out[in_size];
+    char32_t *p_out = out;
+    char *p_in = in, *end = in + in_size;
+    mbstate_t state;
+    memset(&state, 0, sizeof(state));
+    size_t rc;
+    while ((rc = mbrtoc32(p_out, p_in, end - p_in, &state)))
+    {
+        assert(rc != (size_t)-3); // no surrogate pairs in UTF-32
+        if (rc == (size_t)-1) break; // invalid input
+        if (rc == (size_t)-2) break; // truncated input
+        p_in += rc;
+        ++p_out;
+    }
+ 
+    size_t out_size = p_out+1 - out;
+    printf("into %zu UTF-32 code units: [ ", out_size);
+    for (size_t i = 0; i < out_size; ++i)
+        printf("0x%08X ", out[i]);
+ 
+    puts("]");
+
+    size_t out_size_exp = 5;
+    if(out_size_exp != out_size) {
+        printf("Expected %zu UTF-32 code units but got %zu!\n", out_size_exp, out_size);
+        return 1;
+    }
+    char32_t out_exp[] = {0x0000007A, 0x000000DF, 0x00006C34, 0x0001F34C, 0x00000000};
+    for(size_t i = 0; i < out_size; ++i)
+        if(out_exp[i] != out[i]) {
+            printf("Code unit %zu was expected %zu but found %zu!\n", i, out_exp[i], out[i]);
+            return 1;
+        }
+
+    return 0;
+}
-- 
2.30.2

_______________________________________________
Mingw-w64-public mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public

Reply via email to