This internal function is used to implement the C95 functions mbrlen, mbrtowc and mbsrtowcs. This makes a difference for crtdll.dll, where `___lc_codepage_func` parses the return value of setlocale(LC_ALL, NULL) and converts the code page part to an `int`. With this change, `mbsrtowcs` only needs to call `___lc_codepage_func` and `___mb_cur_max_func` once each.
I wasn't sure where to declare __mingw_mbrtowc_cp; I don't feel like creating an internal header file just for one function, so I declared it in wchar.h. If we're good with these patches, I'll send similar patches for `wcrtomb` and `wcsrtombs`. I just pushed them to my GitHub fork to run CI tests[1]. - Kirill Makurin [1] https://github.com/maiddaisuki/mingw-w64/actions/runs/18244326038
From 6faece58c5c89255f803387e2d9ebe322fade097 Mon Sep 17 00:00:00 2001 From: Kirill Makurin <[email protected]> Date: Sat, 4 Oct 2025 21:18:50 +0900 Subject: [PATCH 1/3] crt: add internal function __mingw_mbrtowc_cp This function is internally called from mbrlen, mbrtowc and mbsrtowcs functions. Previous implementation of mbsrtowcs was calling mbrtowc, which internally was calling ___lc_codepage_func. Implementation of ___lc_codepage_func for crtdll.dll is quite expensive as it parses return value of setlocale(LC_ALL, NULL). Using internal __mingw_mbrtowc_cp instead mbrtowc allows mbsrtowcs call both ___lc_codepage_func and ___mb_cur_max_func only once. Signed-off-by: Kirill Makurin <[email protected]> --- mingw-w64-crt/misc/mbrlen.c | 11 +++++- mingw-w64-crt/misc/mbrtowc.c | 61 ++++++++++++++++++++++++++-------- mingw-w64-crt/misc/mbsrtowcs.c | 9 +++-- mingw-w64-headers/crt/wchar.h | 3 ++ 4 files changed, 66 insertions(+), 18 deletions(-) diff --git a/mingw-w64-crt/misc/mbrlen.c b/mingw-w64-crt/misc/mbrlen.c index 7b57f5753..5bddaf53f 100644 --- a/mingw-w64-crt/misc/mbrlen.c +++ b/mingw-w64-crt/misc/mbrlen.c @@ -3,6 +3,8 @@ * This file is part of the mingw-w64 runtime package. * No warranty is given; refer to the file DISCLAIMER.PD within this package. 
*/ +#include <locale.h> +#include <stdlib.h> #include <wchar.h> size_t mbrlen ( @@ -15,5 +17,12 @@ size_t mbrlen ( static mbstate_t state_mbrlen = {0}; state = &state_mbrlen; } - return mbrtowc (NULL, mbs, count, state); + + /* Code page used by current locale */ + unsigned cp = ___lc_codepage_func (); + + /* Maximum character length used by current locale */ + int mb_cur_max = ___mb_cur_max_func (); + + return __mingw_mbrtowc_cp (NULL, mbs, count, state, cp, mb_cur_max); } diff --git a/mingw-w64-crt/misc/mbrtowc.c b/mingw-w64-crt/misc/mbrtowc.c index a6933a0cb..c4f7556cb 100644 --- a/mingw-w64-crt/misc/mbrtowc.c +++ b/mingw-w64-crt/misc/mbrtowc.c @@ -11,18 +11,36 @@ #define WIN32_LEAN_AND_MEAN #include <windows.h> -size_t mbrtowc ( +/** + * __mingw_mbrtowc_cp is internal implementation for C95 functions mbrlen, + * mbrtowc and mbsrtowcs. + * + * In order to perform conversion we need the following information: + * + * - code page used by active locale (which can be a thread locale for + * msvcr80.dll and later); obtained by calling ___lc_codepage_func + * + * - maximum character length in used code page; obtained by calling + * ___mb_cur_max_func + * + * - for double-byte code pages, we need to recognize leading bytes in order + * to correctly convert multibyte characters; this can be done with Win32 + * function IsDBCSLeadByteEx or CRT function isleadbyte + * + * crtdll.dll's ___lc_codepage_func is quite expensive as it obtains this + * information by parsing return value of setlocale(LC_CTYPE, NULL). Using + * __mingw_mbrtowc_cp allows mbsrtowcs call both ___lc_codepage_func and + * ___mb_cur_max_func only once. 
+ */ + +size_t __mingw_mbrtowc_cp ( wchar_t *__restrict__ wc, const char *__restrict__ mbs, size_t count, - mbstate_t *__restrict__ state + mbstate_t *__restrict__ state, + unsigned cp, + int mb_cur_max ) { - /* Use private `mbstate_t` if caller did not supply one */ - if (state == NULL) { - static mbstate_t state_mbrtowc = {0}; - state = &state_mbrtowc; - } - /** * Calling mbrtowc (..., NULL, ..., state) is equivalent to * @@ -44,12 +62,6 @@ size_t mbrtowc ( return (size_t) -2; } - /* Code page used by current locale */ - unsigned cp = ___lc_codepage_func (); - - /* Maximum character length used by current locale */ - int mb_cur_max = ___mb_cur_max_func (); - /* Treat `state` as an array of bytes */ union { mbstate_t state; @@ -141,3 +153,24 @@ einval: errno = EINVAL; return (size_t) -1; } + +size_t mbrtowc ( + wchar_t *__restrict__ wc, + const char *__restrict__ mbs, + size_t count, + mbstate_t *__restrict__ state +) { + /* Use private `mbstate_t` if caller did not supply one */ + if (state == NULL) { + static mbstate_t state_mbrtowc = {0}; + state = &state_mbrtowc; + } + + /* Code page used by current locale */ + unsigned cp = ___lc_codepage_func (); + + /* Maximum character length used by current locale */ + int mb_cur_max = ___mb_cur_max_func (); + + return __mingw_mbrtowc_cp (wc, mbs, count, state, cp, mb_cur_max); +} diff --git a/mingw-w64-crt/misc/mbsrtowcs.c b/mingw-w64-crt/misc/mbsrtowcs.c index e7e4105bb..1c3ccec6c 100644 --- a/mingw-w64-crt/misc/mbsrtowcs.c +++ b/mingw-w64-crt/misc/mbsrtowcs.c @@ -4,8 +4,8 @@ * No warranty is given; refer to the file DISCLAIMER.PD within this package. 
*/ #include <locale.h> -#include <wchar.h> #include <stdlib.h> +#include <wchar.h> size_t mbsrtowcs ( wchar_t *wcs, @@ -29,12 +29,15 @@ size_t mbsrtowcs ( /* Next multibyte character to convert */ const char *mbc = *mbs; + /* Code page used by current locale */ + unsigned cp = ___lc_codepage_func (); + /* Maximum character length in `cp` */ int mb_cur_max = ___mb_cur_max_func(); while (1) { - const size_t length = mbrtowc ( - &wc, mbc, mb_cur_max, &conversion_state + const size_t length = __mingw_mbrtowc_cp ( + &wc, mbc, mb_cur_max, &conversion_state, cp, mb_cur_max ); /* Conversion failed */ diff --git a/mingw-w64-headers/crt/wchar.h b/mingw-w64-headers/crt/wchar.h index 898d0e821..1bb3abe5b 100644 --- a/mingw-w64-headers/crt/wchar.h +++ b/mingw-w64-headers/crt/wchar.h @@ -1206,6 +1206,9 @@ __MINGW_ASM_CALL(__mingw_vsnwprintf); #endif typedef wchar_t _Wint_t; +#ifndef _UCRT + size_t __cdecl __mingw_mbrtowc_cp(wchar_t * __restrict__ _DstCh,const char * __restrict__ _SrcCh,size_t _SizeInBytes,mbstate_t * __restrict__ _State, unsigned _Cp, int _MbCurMax); +#endif wint_t __cdecl btowc(int); int __cdecl mbsinit(const mbstate_t *ps); size_t __cdecl mbrlen(const char * __restrict__ _Ch,size_t _SizeInBytes,mbstate_t * __restrict__ _State); -- 2.51.0.windows.1
From 725ff47bfaa5762a214ed79194d5eb162c7fa1bf Mon Sep 17 00:00:00 2001 From: Kirill Makurin <[email protected]> Date: Sat, 4 Oct 2025 21:28:27 +0900 Subject: [PATCH 2/3] crt: move definition of __mingw_mbrtowc_cp to a separate file Signed-off-by: Kirill Makurin <[email protected]> --- mingw-w64-crt/Makefile.am | 1 + mingw-w64-crt/misc/__mingw_mbrtowc_cp.c | 155 ++++++++++++++++++++++++ mingw-w64-crt/misc/mbrtowc.c | 147 ---------------------- 3 files changed, 156 insertions(+), 147 deletions(-) create mode 100644 mingw-w64-crt/misc/__mingw_mbrtowc_cp.c diff --git a/mingw-w64-crt/Makefile.am b/mingw-w64-crt/Makefile.am index 439f60ea5..389f956ec 100644 --- a/mingw-w64-crt/Makefile.am +++ b/mingw-w64-crt/Makefile.am @@ -167,6 +167,7 @@ src_libws2_32=libsrc/ws2_32.c \ # Files included in all libmsvcr*.a src_msvcrt_common=\ + misc/__mingw_mbrtowc_cp.c \ misc/_onexit.c \ misc/mbrlen.c \ misc/mbrtowc.c \ diff --git a/mingw-w64-crt/misc/__mingw_mbrtowc_cp.c b/mingw-w64-crt/misc/__mingw_mbrtowc_cp.c new file mode 100644 index 000000000..9972d2a19 --- /dev/null +++ b/mingw-w64-crt/misc/__mingw_mbrtowc_cp.c @@ -0,0 +1,155 @@ +/** + * This file has no copyright assigned and is placed in the Public Domain. + * This file is part of the mingw-w64 runtime package. + * No warranty is given; refer to the file DISCLAIMER.PD within this package. + */ +#include <errno.h> +#include <locale.h> +#include <stdlib.h> +#include <wchar.h> + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +/** + * __mingw_mbrtowc_cp is internal implementation for C95 functions mbrlen, + * mbrtowc and mbsrtowcs. 
+ * + * In order to perform conversion we need the following information: + * + * - code page used by active locale (which can be a thread locale for + * msvcr80.dll and later); obtained by calling ___lc_codepage_func + * + * - maximum character length in used code page; obtained by calling + * ___mb_cur_max_func + * + * - for double-byte code pages, we need to recognize leading bytes in order + * to correctly convert multibyte characters; this can be done with Win32 + * function IsDBCSLeadByteEx or CRT function isleadbyte + * + * crtdll.dll's ___lc_codepage_func is quite expensive as it obtains this + * information by parsing return value of setlocale(LC_CTYPE, NULL). Using + * __mingw_mbrtowc_cp allows mbsrtowcs call both ___lc_codepage_func and + * ___mb_cur_max_func only once. + */ + +size_t __mingw_mbrtowc_cp ( + wchar_t *__restrict__ wc, + const char *__restrict__ mbs, + size_t count, + mbstate_t *__restrict__ state, + unsigned cp, + int mb_cur_max +) { + /** + * Calling mbrtowc (..., NULL, ..., state) is equivalent to + * + * mbrtowc (NULL, "", 1, state) + */ + if (mbs == NULL) { + wc = NULL; + mbs = ""; + count = 1; + } + + /* Detect invalid conversion state */ + if ((unsigned) *state > 0xFF) { + goto einval; + } + + /* Both ISO C and POSIX do not mention this case */ + if (count == 0) { + return (size_t) -2; + } + + /* Treat `state` as an array of bytes */ + union { + mbstate_t state; + char bytes[4]; + } conversion_state = {.state = *state}; + + /* For SBCS code pages `state` must always be in initial state */ + if (mb_cur_max == 1 && conversion_state.bytes[0]) { + goto einval; + } + + /* Handle "C" locale */ + if (cp == 0) { + if (wc != NULL) { + *wc = (unsigned char) mbs[0]; + } + return !!mbs[0]; + } + + /* Length of potential multibyte character */ + int length = 1; + + /* Number of bytes consumed from `mbs` */ + int bytes_consumed = 0; + + if (conversion_state.bytes[0]) { + conversion_state.bytes[1] = mbs[0]; + bytes_consumed = 1; + length = 2; + } 
else if (mb_cur_max == 2 && isleadbyte ((unsigned char) mbs[0])) { + conversion_state.bytes[0] = mbs[0]; + + /* We need to examine mbs[1] */ + if (count < 2) { + *state = conversion_state.state; + return (size_t) -2; + } + + conversion_state.bytes[1] = mbs[1]; + bytes_consumed = 2; + length = 2; + } else { + conversion_state.bytes[0] = mbs[0]; + bytes_consumed = 1; + } + + /* Store terminating '\0' */ + if (conversion_state.bytes[0] == '\0') { + if (wc != NULL) { + *wc = L'\0'; + } + + /* Set `state` to initial conversion state */ + *state = 0; + + return 0; + } + + /* Truncated multibyte character */ + if (length == 2 && conversion_state.bytes[1] == '\0') { + goto eilseq; + } + + /* Converted wide character */ + wchar_t wcOut = WEOF; + + int ret = MultiByteToWideChar ( + cp, MB_ERR_INVALID_CHARS, conversion_state.bytes, length, &wcOut, 1 + ); + + if (ret != 1) { + goto eilseq; + } + + if (wc != NULL) { + *wc = wcOut; + } + + /* Set `state` to initial conversion state */ + *state = 0; + + return bytes_consumed; + +eilseq: + errno = EILSEQ; + return (size_t) -1; + +einval: + errno = EINVAL; + return (size_t) -1; +} diff --git a/mingw-w64-crt/misc/mbrtowc.c b/mingw-w64-crt/misc/mbrtowc.c index c4f7556cb..195adf63d 100644 --- a/mingw-w64-crt/misc/mbrtowc.c +++ b/mingw-w64-crt/misc/mbrtowc.c @@ -3,157 +3,10 @@ * This file is part of the mingw-w64 runtime package. * No warranty is given; refer to the file DISCLAIMER.PD within this package. */ -#include <errno.h> #include <locale.h> #include <stdlib.h> #include <wchar.h> -#define WIN32_LEAN_AND_MEAN -#include <windows.h> - -/** - * __mingw_mbrtowc_cp is internal implementation for C95 functions mbrlen, - * mbrtowc and mbsrtowcs. 
- * - * In order to perform conversion we need the following information: - * - * - code page used by active locale (which can be a thread locale for - * msvcr80.dll and later); obtained by calling ___lc_codepage_func - * - * - maximum character length in used code page; obtained by calling - * ___mb_cur_max_func - * - * - for double-byte code pages, we need to recognize leading bytes in order - * to correctly convert multibyte characters; this can be done with Win32 - * function IsDBCSLeadByteEx or CRT function isleadbyte - * - * crtdll.dll's ___lc_codepage_func is quite expensive as it obtains this - * information by parsing return value of setlocale(LC_CTYPE, NULL). Using - * __mingw_mbrtowc_cp allows mbsrtowcs call both ___lc_codepage_func and - * ___mb_cur_max_func only once. - */ - -size_t __mingw_mbrtowc_cp ( - wchar_t *__restrict__ wc, - const char *__restrict__ mbs, - size_t count, - mbstate_t *__restrict__ state, - unsigned cp, - int mb_cur_max -) { - /** - * Calling mbrtowc (..., NULL, ..., state) is equivalent to - * - * mbrtowc (NULL, "", 1, state) - */ - if (mbs == NULL) { - wc = NULL; - mbs = ""; - count = 1; - } - - /* Detect invalid conversion state */ - if ((unsigned) *state > 0xFF) { - goto einval; - } - - /* Both ISO C and POSIX do not mention this case */ - if (count == 0) { - return (size_t) -2; - } - - /* Treat `state` as an array of bytes */ - union { - mbstate_t state; - char bytes[4]; - } conversion_state = {.state = *state}; - - /* For SBCS code pages `state` must always be in initial state */ - if (mb_cur_max == 1 && conversion_state.bytes[0]) { - goto einval; - } - - /* Handle "C" locale */ - if (cp == 0) { - if (wc != NULL) { - *wc = (unsigned char) mbs[0]; - } - return !!mbs[0]; - } - - /* Length of potential multibyte character */ - int length = 1; - - /* Number of bytes consumed from `mbs` */ - int bytes_consumed = 0; - - if (conversion_state.bytes[0]) { - conversion_state.bytes[1] = mbs[0]; - bytes_consumed = 1; - length = 2; - } 
else if (mb_cur_max == 2 && isleadbyte ((unsigned char) mbs[0])) { - conversion_state.bytes[0] = mbs[0]; - - /* We need to examine mbs[1] */ - if (count < 2) { - *state = conversion_state.state; - return (size_t) -2; - } - - conversion_state.bytes[1] = mbs[1]; - bytes_consumed = 2; - length = 2; - } else { - conversion_state.bytes[0] = mbs[0]; - bytes_consumed = 1; - } - - /* Store terminating '\0' */ - if (conversion_state.bytes[0] == '\0') { - if (wc != NULL) { - *wc = L'\0'; - } - - /* Set `state` to initial conversion state */ - *state = 0; - - return 0; - } - - /* Truncated multibyte character */ - if (length == 2 && conversion_state.bytes[1] == '\0') { - goto eilseq; - } - - /* Converted wide character */ - wchar_t wcOut = WEOF; - - int ret = MultiByteToWideChar ( - cp, MB_ERR_INVALID_CHARS, conversion_state.bytes, length, &wcOut, 1 - ); - - if (ret != 1) { - goto eilseq; - } - - if (wc != NULL) { - *wc = wcOut; - } - - /* Set `state` to initial conversion state */ - *state = 0; - - return bytes_consumed; - -eilseq: - errno = EILSEQ; - return (size_t) -1; - -einval: - errno = EINVAL; - return (size_t) -1; -} - size_t mbrtowc ( wchar_t *__restrict__ wc, const char *__restrict__ mbs, -- 2.51.0.windows.1
_______________________________________________ Mingw-w64-public mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/mingw-w64-public
