Module Name:    src
Committed By:   riastradh
Date:           Tue Aug 20 17:43:09 UTC 2024

Modified Files:
        src/lib/libc/locale: mbrtoc32.c mbrtoc32.h
        src/tests/lib/libc/locale: t_mbrtoc16.c t_mbrtoc8.c

Log Message:
mbrtoc32(3): Use conversion state to handle shift sequences.

PR lib/58618: mbrtocN(3) fails to keep shift state

To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 src/lib/libc/locale/mbrtoc32.c
cvs rdiff -u -r1.1 -r1.2 src/lib/libc/locale/mbrtoc32.h
cvs rdiff -u -r1.2 -r1.3 src/tests/lib/libc/locale/t_mbrtoc16.c \
    src/tests/lib/libc/locale/t_mbrtoc8.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files: Index: src/lib/libc/locale/mbrtoc32.c diff -u src/lib/libc/locale/mbrtoc32.c:1.7 src/lib/libc/locale/mbrtoc32.c:1.8 --- src/lib/libc/locale/mbrtoc32.c:1.7 Sun Aug 18 20:06:05 2024 +++ src/lib/libc/locale/mbrtoc32.c Tue Aug 20 17:43:09 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: mbrtoc32.c,v 1.7 2024/08/18 20:06:05 rillig Exp $ */ +/* $NetBSD: mbrtoc32.c,v 1.8 2024/08/20 17:43:09 riastradh Exp $ */ /*- * Copyright (c) 2024 The NetBSD Foundation, Inc. @@ -52,7 +52,7 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: mbrtoc32.c,v 1.7 2024/08/18 20:06:05 rillig Exp $"); +__RCSID("$NetBSD: mbrtoc32.c,v 1.8 2024/08/20 17:43:09 riastradh Exp $"); #include "namespace.h" @@ -102,10 +102,17 @@ mbrtoc32_l(char32_t *restrict pc32, cons mbstate_t *restrict ps, locale_t restrict loc) { static mbstate_t psbuf; - struct mbrtoc32state *S; struct _citrus_iconv *iconv = NULL; - size_t len; + wchar_t wc; + mbstate_t wcrtombstate = {0}; + char mb[MB_LEN_MAX]; + int mblen; + char utf32le[MB_LEN_MAX]; + const char *src; + char *dst; + size_t srcleft, dstleft, inval; char32_t c32; + size_t len; int error, errno_save; /* @@ -141,11 +148,6 @@ mbrtoc32_l(char32_t *restrict pc32, cons } /* - * Get the private conversion state. - */ - S = (struct mbrtoc32state *)(void *)ps; - - /* * If input length is zero, the result is always incomplete by * definition. Don't bother with iconv -- we'd have to * disentangle truncated outputs. @@ -156,12 +158,6 @@ mbrtoc32_l(char32_t *restrict pc32, cons } /* - * Reset the destination buffer if this is the initial state. - */ - if (S->dstleft == 0) - S->dstleft = sizeof(S->dstbuf); - - /* * Open an iconv handle to convert locale-dependent multibyte * input to UTF-32LE. */ @@ -173,47 +169,55 @@ mbrtoc32_l(char32_t *restrict pc32, cons } /* - * Try to iconv a minimal prefix. If we succeed, set len to - * the length consumed and goto ok. + * Consume the next locale-dependent wide character. If no + * wide character can be obtained, stop here. 
+ */ + len = mbrtowc_l(&wc, s, n, ps, loc); + switch (len) { + case 0: /* NUL */ + if (pc32) + *pc32 = 0; + goto out; + case (size_t)-2: /* still incomplete after n bytes */ + case (size_t)-1: /* error */ + goto out; + default: /* consumed len bytes of input */ + break; + } + + /* + * We consumed a wide character from the input. Convert it to + * a multibyte sequence _in the initial conversion state_, so + * we can pass that through iconv to get a Unicode scalar + * value. */ - for (len = 0; len < MIN(n, sizeof(S->srcbuf) - S->nsrc);) { - const char *src = S->srcbuf; - size_t srcleft; - char *dst = S->dstbuf + sizeof(S->dstbuf) - S->dstleft; - size_t inval; - - S->srcbuf[S->nsrc++] = s[len++]; - srcleft = S->nsrc; - - error = _citrus_iconv_convert(iconv, - &src, &srcleft, - &dst, &S->dstleft, - _CITRUS_ICONV_F_HIDE_INVALID, &inval); - if (error != EINVAL) { - if (error == 0) - break; - errno = error; - len = (size_t)-1; - goto out; - } + if ((mblen = wcrtomb_l(mb, wc, &wcrtombstate, loc)) == -1) { + len = (size_t)-1; + goto out; } /* - * If it is still incomplete after trying the whole input - * buffer, return (size_t)-2 and let the caller try again. + * Convert the multibyte sequence to UTF-16LE. */ + src = mb; + srcleft = (size_t)mblen; + dst = utf32le; + dstleft = sizeof(utf32le); + error = _citrus_iconv_convert(iconv, &src, &srcleft, &dst, &dstleft, + _CITRUS_ICONV_F_HIDE_INVALID, &inval); if (error) { - len = (size_t)-2; + errno = error; + len = (size_t)-1; goto out; } /* - * Successfully converted a minimal byte sequence, which should - * produce exactly one UTF-32 code unit, encoded in - * little-endian, representing a code point. Get the code + * Successfully converted the multibyte sequence to UTF-16LE, + * which should produce exactly one UTF-32 code unit, encoded + * in little-endian, representing a code point. Get the code * point. */ - c32 = le32dec(S->dstbuf); + c32 = le32dec(utf32le); /* * Reject surrogate code points. 
We only deal in scalar @@ -245,11 +249,7 @@ mbrtoc32_l(char32_t *restrict pc32, cons */ errno = errno_save; -out: if (len != (size_t)-2) { - S->nsrc = 0; - memset(S, 0, sizeof(*S)); /* paranoia */ - } - errno_save = errno; +out: errno_save = errno; _citrus_iconv_close(iconv); errno = errno_save; return len; Index: src/lib/libc/locale/mbrtoc32.h diff -u src/lib/libc/locale/mbrtoc32.h:1.1 src/lib/libc/locale/mbrtoc32.h:1.2 --- src/lib/libc/locale/mbrtoc32.h:1.1 Thu Aug 15 14:16:33 2024 +++ src/lib/libc/locale/mbrtoc32.h Tue Aug 20 17:43:09 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: mbrtoc32.h,v 1.1 2024/08/15 14:16:33 riastradh Exp $ */ +/* $NetBSD: mbrtoc32.h,v 1.2 2024/08/20 17:43:09 riastradh Exp $ */ /*- * Copyright (c) 2024 The NetBSD Foundation, Inc. @@ -29,14 +29,12 @@ #ifndef LIB_LIBC_LOCALE_MBRTOC32_H_ #define LIB_LIBC_LOCALE_MBRTOC32_H_ -#include <limits.h> -#include <uchar.h> - struct mbrtoc32state { - char srcbuf[MB_LEN_MAX]; - size_t nsrc; - char dstbuf[4]; - size_t dstleft; + /* + * XXX This needs to match the maximum size of any conversion + * state actually used by mbrtowc_l. + */ + char dummy; }; #endif /* LIB_LIBC_LOCALE_MBRTOC32_H_ */ Index: src/tests/lib/libc/locale/t_mbrtoc16.c diff -u src/tests/lib/libc/locale/t_mbrtoc16.c:1.2 src/tests/lib/libc/locale/t_mbrtoc16.c:1.3 --- src/tests/lib/libc/locale/t_mbrtoc16.c:1.2 Mon Aug 19 16:24:05 2024 +++ src/tests/lib/libc/locale/t_mbrtoc16.c Tue Aug 20 17:43:09 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: t_mbrtoc16.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $ */ +/* $NetBSD: t_mbrtoc16.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $ */ /*- * Copyright (c) 2002 Tim J. Robbins @@ -33,7 +33,7 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: t_mbrtoc16.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $"); +__RCSID("$NetBSD: t_mbrtoc16.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $"); #include <errno.h> #include <inttypes.h> @@ -171,22 +171,16 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te /* Incomplete character sequence (shift sequence only). 
*/ memset(&s, 0, sizeof(s)); c16 = 0; - atf_tc_expect_fail("PR lib/58618:" - " mbrtocN(3) fails to keep shift state"); ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(J", 3, &s)), (size_t)-2, "n=%zu", n); - atf_tc_expect_pass(); ATF_CHECK_EQ_MSG(c16, 0, "c16=U+%04"PRIx16, (uint16_t)c16); /* Same as above, but complete (U+00A5 YEN SIGN). */ memset(&s, 0, sizeof(s)); c16 = 0; - atf_tc_expect_fail("PR lib/58618:" - " mbrtocN(3) fails to keep shift state"); ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(J\x5c", 4, &s)), 4, "n=%zu", n); ATF_CHECK_EQ_MSG(c16, 0xa5, "c16=U+%04"PRIx16, (uint16_t)c16); - atf_tc_expect_pass(); /* Test restarting behaviour. */ memset(&s, 0, sizeof(s)); @@ -194,11 +188,8 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(", 2, &s)), (size_t)-2, "n=%zu", n); ATF_CHECK_EQ_MSG(c16, 0, "c16=U+%04"PRIx16, (uint16_t)c16); - atf_tc_expect_fail("PR lib/58618:" - " mbrtocN(3) fails to keep shift state"); ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "J\x5c", 2, &s)), 2, "n=%zu", n); ATF_CHECK_EQ_MSG(c16, 0xa5, "c16=U+%04"PRIx16, (uint16_t)c16); - atf_tc_expect_pass(); /* * Test shift sequence state in various increments: @@ -215,8 +206,6 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te "n=%zu", n); ATF_CHECK_EQ_MSG(c16, L'A', "c16=U+%04"PRIx16, (uint16_t)c16); c16 = 0; - atf_tc_expect_fail("PR lib/58618:" - " mbrtocN(3) fails to keep shift state"); ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(J", 3, &s)), (size_t)-2, "n=%zu", n); ATF_CHECK_EQ_MSG(c16, 0, "c16=U+%04"PRIx16, (uint16_t)c16); @@ -240,7 +229,6 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x22\x1b(B\x00", 5, &s)), 1, "n=%zu", n); ATF_CHECK_EQ_MSG(c16, 0x30a2, "c16=U+%04"PRIx16, (uint16_t)c16); - atf_tc_expect_pass(); c16 = 0; ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(", 2, &s)), (size_t)-2, "n=%zu", n); Index: src/tests/lib/libc/locale/t_mbrtoc8.c diff -u src/tests/lib/libc/locale/t_mbrtoc8.c:1.2 src/tests/lib/libc/locale/t_mbrtoc8.c:1.3 --- 
src/tests/lib/libc/locale/t_mbrtoc8.c:1.2 Mon Aug 19 16:24:05 2024 +++ src/tests/lib/libc/locale/t_mbrtoc8.c Tue Aug 20 17:43:09 2024 @@ -1,4 +1,4 @@ -/* $NetBSD: t_mbrtoc8.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $ */ +/* $NetBSD: t_mbrtoc8.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $ */ /*- * Copyright (c) 2002 Tim J. Robbins @@ -33,7 +33,7 @@ */ #include <sys/cdefs.h> -__RCSID("$NetBSD: t_mbrtoc8.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $"); +__RCSID("$NetBSD: t_mbrtoc8.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $"); #include <errno.h> #include <inttypes.h> @@ -172,25 +172,19 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes /* Incomplete character sequence (shift sequence only). */ memset(&s, 0, sizeof(s)); c8 = 0; - atf_tc_expect_fail("PR lib/58618:" - " mbrtocN(3) fails to keep shift state"); ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(J", 3, &s)), (size_t)-2, "n=%zu", n); - atf_tc_expect_pass(); ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8); /* Same as above, but complete (U+00A5 YEN SIGN). */ memset(&s, 0, sizeof(s)); c8 = 0; - atf_tc_expect_fail("PR lib/58618:" - " mbrtocN(3) fails to keep shift state"); ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(J\x5c", 4, &s)), 4, "n=%zu", n); ATF_CHECK_EQ_MSG(c8, 0xc2, "c8=0x%"PRIx8, (uint8_t)c8); ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, "n=%zu", n); ATF_CHECK_EQ_MSG(c8, 0xa5, "c8=0x%"PRIx8, (uint8_t)c8); - atf_tc_expect_pass(); /* Test restarting behaviour. 
*/ memset(&s, 0, sizeof(s)); @@ -198,14 +192,11 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(", 2, &s)), (size_t)-2, "n=%zu", n); ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8); - atf_tc_expect_fail("PR lib/58618:" - " mbrtocN(3) fails to keep shift state"); ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "J\x5c", 2, &s)), 2, "n=%zu", n); ATF_CHECK_EQ_MSG(c8, 0xc2, "c8=0x%"PRIx8, (uint8_t)c8); ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3, "n=%zu", n); ATF_CHECK_EQ_MSG(c8, 0xa5, "c8=0x%"PRIx8, (uint8_t)c8); - atf_tc_expect_pass(); /* * Test shift sequence state in various increments: @@ -221,8 +212,6 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "A\x1b(J", 4, &s)), 1, "n=%zu", n); ATF_CHECK_EQ_MSG(c8, 'A', "c8=0x%"PRIx8, (uint8_t)c8); c8 = 0; - atf_tc_expect_fail("PR lib/58618:" - " mbrtocn(3) fails to keep shift state"); ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(J", 3, &s)), (size_t)-2, "n=%zu", n); ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8); @@ -264,7 +253,6 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(", 2, &s)), (size_t)-3, "n=%zu", n); ATF_CHECK_EQ_MSG(c8, 0xa2, "c8=0x%"PRIx8, (uint8_t)c8); - atf_tc_expect_pass(); c8 = 0; ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(", 2, &s)), (size_t)-2, "n=%zu", n);