Module Name:    src
Committed By:   riastradh
Date:           Tue Aug 20 17:43:09 UTC 2024

Modified Files:
        src/lib/libc/locale: mbrtoc32.c mbrtoc32.h
        src/tests/lib/libc/locale: t_mbrtoc16.c t_mbrtoc8.c

Log Message:
mbrtoc32(3): Use conversion state to handle shift sequences.

PR lib/58618: mbrtocN(3) fails to keep shift state


To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 src/lib/libc/locale/mbrtoc32.c
cvs rdiff -u -r1.1 -r1.2 src/lib/libc/locale/mbrtoc32.h
cvs rdiff -u -r1.2 -r1.3 src/tests/lib/libc/locale/t_mbrtoc16.c \
    src/tests/lib/libc/locale/t_mbrtoc8.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/lib/libc/locale/mbrtoc32.c
diff -u src/lib/libc/locale/mbrtoc32.c:1.7 src/lib/libc/locale/mbrtoc32.c:1.8
--- src/lib/libc/locale/mbrtoc32.c:1.7	Sun Aug 18 20:06:05 2024
+++ src/lib/libc/locale/mbrtoc32.c	Tue Aug 20 17:43:09 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: mbrtoc32.c,v 1.7 2024/08/18 20:06:05 rillig Exp $	*/
+/*	$NetBSD: mbrtoc32.c,v 1.8 2024/08/20 17:43:09 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -52,7 +52,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: mbrtoc32.c,v 1.7 2024/08/18 20:06:05 rillig Exp $");
+__RCSID("$NetBSD: mbrtoc32.c,v 1.8 2024/08/20 17:43:09 riastradh Exp $");
 
 #include "namespace.h"
 
@@ -102,10 +102,17 @@ mbrtoc32_l(char32_t *restrict pc32, cons
     mbstate_t *restrict ps, locale_t restrict loc)
 {
 	static mbstate_t psbuf;
-	struct mbrtoc32state *S;
 	struct _citrus_iconv *iconv = NULL;
-	size_t len;
+	wchar_t wc;
+	mbstate_t wcrtombstate = {0};
+	char mb[MB_LEN_MAX];
+	int mblen;
+	char utf32le[MB_LEN_MAX];
+	const char *src;
+	char *dst;
+	size_t srcleft, dstleft, inval;
 	char32_t c32;
+	size_t len;
 	int error, errno_save;
 
 	/*
@@ -141,11 +148,6 @@ mbrtoc32_l(char32_t *restrict pc32, cons
 	}
 
 	/*
-	 * Get the private conversion state.
-	 */
-	S = (struct mbrtoc32state *)(void *)ps;
-
-	/*
 	 * If input length is zero, the result is always incomplete by
 	 * definition.  Don't bother with iconv -- we'd have to
 	 * disentangle truncated outputs.
@@ -156,12 +158,6 @@ mbrtoc32_l(char32_t *restrict pc32, cons
 	}
 
 	/*
-	 * Reset the destination buffer if this is the initial state.
-	 */
-	if (S->dstleft == 0)
-		S->dstleft = sizeof(S->dstbuf);
-
-	/*
 	 * Open an iconv handle to convert locale-dependent multibyte
 	 * input to UTF-32LE.
 	 */
@@ -173,47 +169,55 @@ mbrtoc32_l(char32_t *restrict pc32, cons
 	}
 
 	/*
-	 * Try to iconv a minimal prefix.  If we succeed, set len to
-	 * the length consumed and goto ok.
+	 * Consume the next locale-dependent wide character.  If no
+	 * wide character can be obtained, stop here.
+	 */
+	len = mbrtowc_l(&wc, s, n, ps, loc);
+	switch (len) {
+	case 0:			/* NUL */
+		if (pc32)
+			*pc32 = 0;
+		goto out;
+	case (size_t)-2:	/* still incomplete after n bytes */
+	case (size_t)-1:	/* error */
+		goto out;
+	default:		/* consumed len bytes of input */
+		break;
+	}
+
+	/*
+	 * We consumed a wide character from the input.  Convert it to
+	 * a multibyte sequence _in the initial conversion state_, so
+	 * we can pass that through iconv to get a Unicode scalar
+	 * value.
 	 */
-	for (len = 0; len < MIN(n, sizeof(S->srcbuf) - S->nsrc);) {
-		const char *src = S->srcbuf;
-		size_t srcleft;
-		char *dst = S->dstbuf + sizeof(S->dstbuf) - S->dstleft;
-		size_t inval;
-
-		S->srcbuf[S->nsrc++] = s[len++];
-		srcleft = S->nsrc;
-
-		error = _citrus_iconv_convert(iconv,
-		    &src, &srcleft,
-		    &dst, &S->dstleft,
-		    _CITRUS_ICONV_F_HIDE_INVALID, &inval);
-		if (error != EINVAL) {
-			if (error == 0)
-				break;
-			errno = error;
-			len = (size_t)-1;
-			goto out;
-		}
+	if ((mblen = wcrtomb_l(mb, wc, &wcrtombstate, loc)) == -1) {
+		len = (size_t)-1;
+		goto out;
 	}
 
 	/*
-	 * If it is still incomplete after trying the whole input
-	 * buffer, return (size_t)-2 and let the caller try again.
+	 * Convert the multibyte sequence to UTF-32LE.
 	 */
+	src = mb;
+	srcleft = (size_t)mblen;
+	dst = utf32le;
+	dstleft = sizeof(utf32le);
+	error = _citrus_iconv_convert(iconv, &src, &srcleft, &dst, &dstleft,
+	    _CITRUS_ICONV_F_HIDE_INVALID, &inval);
 	if (error) {
-		len = (size_t)-2;
+		errno = error;
+		len = (size_t)-1;
 		goto out;
 	}
 
 	/*
-	 * Successfully converted a minimal byte sequence, which should
-	 * produce exactly one UTF-32 code unit, encoded in
-	 * little-endian, representing a code point.  Get the code
+	 * Successfully converted the multibyte sequence to UTF-32LE,
+	 * which should produce exactly one UTF-32 code unit, encoded
+	 * in little-endian, representing a code point.  Get the code
 	 * point.
 	 */
-	c32 = le32dec(S->dstbuf);
+	c32 = le32dec(utf32le);
 
 	/*
 	 * Reject surrogate code points.  We only deal in scalar
@@ -245,11 +249,7 @@ mbrtoc32_l(char32_t *restrict pc32, cons
 	 */
 	errno = errno_save;
 
-out:	if (len != (size_t)-2) {
-		S->nsrc = 0;
-		memset(S, 0, sizeof(*S)); /* paranoia */
-	}
-	errno_save = errno;
+out:	errno_save = errno;
 	_citrus_iconv_close(iconv);
 	errno = errno_save;
 	return len;

Index: src/lib/libc/locale/mbrtoc32.h
diff -u src/lib/libc/locale/mbrtoc32.h:1.1 src/lib/libc/locale/mbrtoc32.h:1.2
--- src/lib/libc/locale/mbrtoc32.h:1.1	Thu Aug 15 14:16:33 2024
+++ src/lib/libc/locale/mbrtoc32.h	Tue Aug 20 17:43:09 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: mbrtoc32.h,v 1.1 2024/08/15 14:16:33 riastradh Exp $	*/
+/*	$NetBSD: mbrtoc32.h,v 1.2 2024/08/20 17:43:09 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2024 The NetBSD Foundation, Inc.
@@ -29,14 +29,12 @@
 #ifndef	LIB_LIBC_LOCALE_MBRTOC32_H_
 #define	LIB_LIBC_LOCALE_MBRTOC32_H_
 
-#include <limits.h>
-#include <uchar.h>
-
 struct mbrtoc32state {
-	char			srcbuf[MB_LEN_MAX];
-	size_t			nsrc;
-	char			dstbuf[4];
-	size_t			dstleft;
+	/*
+	 * XXX This needs to match the maximum size of any conversion
+	 * state actually used by mbrtowc_l.
+	 */
+	char		dummy;
 };
 
 #endif	/* LIB_LIBC_LOCALE_MBRTOC32_H_ */

Index: src/tests/lib/libc/locale/t_mbrtoc16.c
diff -u src/tests/lib/libc/locale/t_mbrtoc16.c:1.2 src/tests/lib/libc/locale/t_mbrtoc16.c:1.3
--- src/tests/lib/libc/locale/t_mbrtoc16.c:1.2	Mon Aug 19 16:24:05 2024
+++ src/tests/lib/libc/locale/t_mbrtoc16.c	Tue Aug 20 17:43:09 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: t_mbrtoc16.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $	*/
+/*	$NetBSD: t_mbrtoc16.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2002 Tim J. Robbins
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: t_mbrtoc16.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $");
+__RCSID("$NetBSD: t_mbrtoc16.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $");
 
 #include <errno.h>
 #include <inttypes.h>
@@ -171,22 +171,16 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te
 	/* Incomplete character sequence (shift sequence only). */
 	memset(&s, 0, sizeof(s));
 	c16 = 0;
-	atf_tc_expect_fail("PR lib/58618:"
-	    " mbrtocN(3) fails to keep shift state");
 	ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(J", 3, &s)), (size_t)-2,
 	    "n=%zu", n);
-	atf_tc_expect_pass();
 	ATF_CHECK_EQ_MSG(c16, 0, "c16=U+%04"PRIx16, (uint16_t)c16);
 
 	/* Same as above, but complete (U+00A5 YEN SIGN). */
 	memset(&s, 0, sizeof(s));
 	c16 = 0;
-	atf_tc_expect_fail("PR lib/58618:"
-	    " mbrtocN(3) fails to keep shift state");
 	ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(J\x5c", 4, &s)), 4,
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c16, 0xa5, "c16=U+%04"PRIx16, (uint16_t)c16);
-	atf_tc_expect_pass();
 
 	/* Test restarting behaviour. */
 	memset(&s, 0, sizeof(s));
@@ -194,11 +188,8 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te
 	ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(", 2, &s)), (size_t)-2,
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c16, 0, "c16=U+%04"PRIx16, (uint16_t)c16);
-	atf_tc_expect_fail("PR lib/58618:"
-	    " mbrtocN(3) fails to keep shift state");
 	ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "J\x5c", 2, &s)), 2, "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c16, 0xa5, "c16=U+%04"PRIx16, (uint16_t)c16);
-	atf_tc_expect_pass();
 
 	/*
 	 * Test shift sequence state in various increments:
@@ -215,8 +206,6 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c16, L'A', "c16=U+%04"PRIx16, (uint16_t)c16);
 	c16 = 0;
-	atf_tc_expect_fail("PR lib/58618:"
-	    " mbrtocN(3) fails to keep shift state");
 	ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(J", 3, &s)), (size_t)-2,
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c16, 0, "c16=U+%04"PRIx16, (uint16_t)c16);
@@ -240,7 +229,6 @@ ATF_TC_BODY(mbrtoc16_iso2022jp_locale_te
 	ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x22\x1b(B\x00", 5, &s)), 1,
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c16, 0x30a2, "c16=U+%04"PRIx16, (uint16_t)c16);
-	atf_tc_expect_pass();
 	c16 = 0;
 	ATF_CHECK_EQ_MSG((n = mbrtoc16(&c16, "\x1b(", 2, &s)), (size_t)-2,
 	    "n=%zu", n);
Index: src/tests/lib/libc/locale/t_mbrtoc8.c
diff -u src/tests/lib/libc/locale/t_mbrtoc8.c:1.2 src/tests/lib/libc/locale/t_mbrtoc8.c:1.3
--- src/tests/lib/libc/locale/t_mbrtoc8.c:1.2	Mon Aug 19 16:24:05 2024
+++ src/tests/lib/libc/locale/t_mbrtoc8.c	Tue Aug 20 17:43:09 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: t_mbrtoc8.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $	*/
+/*	$NetBSD: t_mbrtoc8.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2002 Tim J. Robbins
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: t_mbrtoc8.c,v 1.2 2024/08/19 16:24:05 riastradh Exp $");
+__RCSID("$NetBSD: t_mbrtoc8.c,v 1.3 2024/08/20 17:43:09 riastradh Exp $");
 
 #include <errno.h>
 #include <inttypes.h>
@@ -172,25 +172,19 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes
 	/* Incomplete character sequence (shift sequence only). */
 	memset(&s, 0, sizeof(s));
 	c8 = 0;
-	atf_tc_expect_fail("PR lib/58618:"
-	    " mbrtocN(3) fails to keep shift state");
 	ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(J", 3, &s)), (size_t)-2,
 	    "n=%zu", n);
-	atf_tc_expect_pass();
 	ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8);
 
 	/* Same as above, but complete (U+00A5 YEN SIGN). */
 	memset(&s, 0, sizeof(s));
 	c8 = 0;
-	atf_tc_expect_fail("PR lib/58618:"
-	    " mbrtocN(3) fails to keep shift state");
 	ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(J\x5c", 4, &s)), 4,
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c8, 0xc2, "c8=0x%"PRIx8, (uint8_t)c8);
 	ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c8, 0xa5, "c8=0x%"PRIx8, (uint8_t)c8);
-	atf_tc_expect_pass();
 
 	/* Test restarting behaviour. */
 	memset(&s, 0, sizeof(s));
@@ -198,14 +192,11 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes
 	ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(", 2, &s)), (size_t)-2,
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8);
-	atf_tc_expect_fail("PR lib/58618:"
-	    " mbrtocN(3) fails to keep shift state");
 	ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "J\x5c", 2, &s)), 2, "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c8, 0xc2, "c8=0x%"PRIx8, (uint8_t)c8);
 	ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "", 0, &s)), (size_t)-3,
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c8, 0xa5, "c8=0x%"PRIx8, (uint8_t)c8);
-	atf_tc_expect_pass();
 
 	/*
 	 * Test shift sequence state in various increments:
@@ -221,8 +212,6 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes
 	ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "A\x1b(J", 4, &s)), 1, "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c8, 'A', "c8=0x%"PRIx8, (uint8_t)c8);
 	c8 = 0;
-	atf_tc_expect_fail("PR lib/58618:"
-	    " mbrtocn(3) fails to keep shift state");
 	ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(J", 3, &s)), (size_t)-2,
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c8, 0, "c8=0x%"PRIx8, (uint8_t)c8);
@@ -264,7 +253,6 @@ ATF_TC_BODY(mbrtoc8_iso2022jp_locale_tes
 	ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(", 2, &s)), (size_t)-3,
 	    "n=%zu", n);
 	ATF_CHECK_EQ_MSG(c8, 0xa2, "c8=0x%"PRIx8, (uint8_t)c8);
-	atf_tc_expect_pass();
 	c8 = 0;
 	ATF_CHECK_EQ_MSG((n = mbrtoc8(&c8, "\x1b(", 2, &s)), (size_t)-2,
 	    "n=%zu", n);

Reply via email to