Hi,

I have noticed that the behavior of the `btowc` function is inconsistent between 
MSVCRT and UCRT when the current locale is "C".

UCRT's `btowc` converts bytes in range 128-255 as if source charset was 
ISO-8859-1 (code page 28591). MSVCRT's (on Windows 11) fails and returns `WEOF`.

I have attached a simple program which can be used to reproduce it. It takes 
one argument which is locale string (you can use ".CodePage" for simplicity).

I must mention that I have UTF-8 enabled globally, which means GetACP() == 
CP_UTF8. If anyone can test on a system where this is not the case, that would 
be helpful.

This made me wonder how mingw-w64 implements the replacement for the msvcr*.dll 
versions which do not have it: it passes the return value of 
`___lc_codepage_func()` directly to `MultiByteToWideChar`.

The `___lc_codepage_func()` function returns 0 for the "C" locale and the actual 
code page for any other locale. When 0 (CP_ACP) is passed to 
`MultiByteToWideChar`, it uses the code page returned by `GetACP()` to perform 
the conversion.

If this behavior is consistent when GetACP() != CP_UTF8, then I am just 
overthinking. Otherwise, I have attached a simple patch which checks the return 
value of `___lc_codepage_func()` and, if it returns 0, performs a simple range 
check as if we called `isascii()`.

I have also noticed that if you explicitly specify locale with code page 20127 
(ASCII) (e.g. just ".20127") with UCRT, the conversion will actually fail.

MultiByteToWideChar(20127, ...):

While experimenting, I noticed that `MultiByteToWideChar`, when used with code 
page 20127, succeeds even when the string contains characters in the range 
128-255. It acts as if it called `toascii()` on each character before 
converting it.

This slightly concerns me.

- Kirill Makurin
From e4778b8201a407e78cc7461d48e89744f0995435 Mon Sep 17 00:00:00 2001
From: Kirill Makurin <maiddais...@outlook.com>
Date: Sun, 1 Jun 2025 18:05:49 +0900
Subject: crt: check return value of ___lc_codepage_func() in btowc
 and wctob

When current locale is "C", ___lc_codepage_func() will return 0.
When 0 (CP_ACP) is passed to MultiByteToWideChar, it will use code page
returned by GetACP() during conversion.

This may lead to unexpected behavior in programs relying on "C" locale
being consistent.

Check return value of ___lc_codepage_func(), and if it returns 0, perform
simple range check as with isascii().

Signed-off-by: Kirill Makurin <maiddais...@outlook.com>
---
 mingw-w64-crt/misc/btowc.c | 22 +++++++++++++---------
 mingw-w64-crt/misc/wctob.c | 26 ++++++++++++++++----------
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/mingw-w64-crt/misc/btowc.c b/mingw-w64-crt/misc/btowc.c
index c8fbd8e74..5df50209d 100644
--- a/mingw-w64-crt/misc/btowc.c
+++ b/mingw-w64-crt/misc/btowc.c
@@ -15,14 +15,18 @@ wint_t btowc (int c)
 {
   if (c == EOF)
     return (WEOF);
-  else
-    {
-      unsigned char ch = c;
-      wchar_t wc = WEOF;
-      if (!MultiByteToWideChar (___lc_codepage_func(), MB_ERR_INVALID_CHARS,
-                                (char*)&ch, 1, &wc, 1))
-        return WEOF;

-      return wc;
-    }
+  unsigned cp = ___lc_codepage_func();
+
+  /* "C" locale */
+  if (cp == 0)
+    return (unsigned) c < 128 ? c : WEOF;
+
+  unsigned char ch = c;
+  wchar_t wc = WEOF;
+
+  if (!MultiByteToWideChar (cp, MB_ERR_INVALID_CHARS, (char*)&ch, 1, &wc, 1))
+    return WEOF;
+
+  return wc;
 }
diff --git a/mingw-w64-crt/misc/wctob.c b/mingw-w64-crt/misc/wctob.c
index 995f6db6e..1e11d51a2 100644
--- a/mingw-w64-crt/misc/wctob.c
+++ b/mingw-w64-crt/misc/wctob.c
@@ -14,16 +14,22 @@
 #include <windows.h>

 /* Return just the first byte after translating to multibyte.  */
-int wctob (wint_t wc )
+int wctob (wint_t wc)
 {
-    wchar_t w = wc;
-    char c;
-    int invalid_char = 0;
-    if (!WideCharToMultiByte (___lc_codepage_func(),
-                             0 /* Is this correct flag? */,
-                             &w, 1, &c, 1, NULL, &invalid_char)
-        || invalid_char)
-      return EOF;
+  unsigned cp = ___lc_codepage_func();

-    return (unsigned char) c;
+  /* "C" locale */
+  if (cp == 0)
+    return wc < 128 ? wc : EOF;
+
+  wchar_t w = wc;
+  char c;
+  int invalid_char = 0;
+
+  /* Do not use WC_NO_BEST_FIT_CHARS, CRT's wctob uses best-fit conversion */
+  if (!WideCharToMultiByte (cp, 0, &w, 1, &c, 1, NULL, &invalid_char)
+      || invalid_char)
+    return EOF;
+
+  return (unsigned char) c;
 }
--
2.46.1.windows.1
#define __USE_MINGW_ANSI_STDIO 0

#include <fcntl.h>
#include <io.h>
#include <locale.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>

/*
 * Print every byte value in 0x00-0xFF that btowc() converts to a printable
 * wide character under the requested locale.
 *
 * argv[1] (optional) is the locale string passed to setlocale(); "C" is used
 * when it is omitted.  Returns 0 on success, 1 if the locale cannot be set.
 */
int main (int argc, char **argv) {
        const char *locale = argc > 1 ? argv[1] : "C";

        /* An unknown locale would otherwise silently leave the old one active. */
        if (setlocale (LC_ALL, locale) == NULL) {
                fprintf (stderr, "setlocale failed for locale \"%s\"\n", locale);
                return 1;
        }

        /* Switch stdout to UTF-8 text mode for the wide-character output below. */
        _setmode (_fileno (stdout), _O_U8TEXT);

        wprintf (L"CP: %u\n", ___lc_codepage_func());

        /* Inclusive upper bound: 0xFF is a valid byte and must be tested too. */
        for (unsigned c = 0; c <= 0xFF; ++c) {
                /* Keep the result as wint_t so the WEOF comparison is exact. */
                wint_t wc = btowc ((int) c);
                if (wc != WEOF && iswprint (wc)) {
                        /* %lc takes a wint_t in both MSVC and ISO C wprintf. */
                        wprintf (L"%.2X: %lc\n", c, wc);
                }
        }

        return 0;
}
_______________________________________________
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public

Reply via email to