Running a testdir of the modules
  mbrtowc mbrlen mbslen mbsstr mbmemcasecoll
on NetBSD 10 shows a couple of test failures:


FAIL: test-mbmemcasecoll-3.sh
=============================

../../gltests/test-mbmemcasecmp.h:432: assertion 'my_casecmp (input1, countof (input1), input2, countof (input2)) == 0' failed
[1]   Abort trap (core dumped) LC_ALL="${testlocale}" ${CHECKER} ./test-mbmem...
FAIL test-mbmemcasecoll-3.sh (exit status: 134)

FAIL: test-mbslen.sh
====================

../../gltests/test-mbslen.c:62: assertion 'mbslen ("\341") == 1' failed
[1]   Abort trap (core dumped) LC_ALL="${testlocale}" ${CHECKER} ./test-mbsle...
FAIL test-mbslen.sh (exit status: 134)

FAIL: test-mbsstr2.sh
=====================

../../gltests/test-mbsstr2.c:127: assertion 'result == input + 1' failed
[1]   Abort trap (core dumped) LC_ALL="${testlocale}" ${CHECKER} ./test-mbsst...
FAIL test-mbsstr2.sh (exit status: 134)


This patch fixes them by doing the mbrtowc processing in UTF-8 locales ourselves.


2026-06-02  Bruno Haible  <[email protected]>

        mbrtowc, mbrlen: Work around a NetBSD bug in UTF-8 locales.
        * m4/mbrtowc.m4 (gl_MBRTOWC_INVALID_UTF8): New macro.
        (gl_FUNC_MBRTOWC): Invoke it. Define MBRTOWC_INVALID_UTF8_BUG if mbrtowc
        does not recognize some invalid UTF-8 byte sequences.
        * lib/mbrtowc.c (is_locale_utf8, is_locale_utf8_cached): Define also if
        MBRTOWC_INVALID_UTF8_BUG.
        (rpl_mbrtowc): Handle UTF-8 locales specially also on NetBSD.
        * tests/test-mbrtowc.c (main): Add more test cases for the UTF-8
        encoding.
        * tests/test-mbrlen.c (main): Likewise.
        * doc/posix-functions/mbrtowc.texi: Mention the NetBSD bug.
        * doc/posix-functions/mbrlen.texi: Likewise.

diff --git a/doc/posix-functions/mbrlen.texi b/doc/posix-functions/mbrlen.texi
index 07ece94fee..acc4b52f0b 100644
--- a/doc/posix-functions/mbrlen.texi
+++ b/doc/posix-functions/mbrlen.texi
@@ -36,6 +36,10 @@
 character, on some platforms:
 HP-UX 11.11, Solaris 11 2010-11.
 @item
+This function returns @code{(size_t) -2} instead of @code{(size_t) -1}
+for some invalid byte sequences on some platforms:
+NetBSD 10.
+@item
 This function may not return 0 when parsing the NUL character on some platforms:
 Solaris 9.
 @end itemize
diff --git a/doc/posix-functions/mbrtowc.texi b/doc/posix-functions/mbrtowc.texi
index 9666c58d48..b5d155cd2b 100644
--- a/doc/posix-functions/mbrtowc.texi
+++ b/doc/posix-functions/mbrtowc.texi
@@ -42,6 +42,10 @@
 character, on some platforms:
 HP-UX 11.11, Solaris 11 2010-11, mingw, MSVC 14.
 @item
+This function returns @code{(size_t) -2} instead of @code{(size_t) -1}
+for some invalid byte sequences on some platforms:
+NetBSD 10.
+@item
 This function may not return 0 when parsing the NUL character on some platforms:
 Solaris 9.
 @end itemize
diff --git a/lib/mbrtowc.c b/lib/mbrtowc.c
index f533b554f3..48316a068a 100644
--- a/lib/mbrtowc.c
+++ b/lib/mbrtowc.c
@@ -83,7 +83,7 @@ mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 #  include <locale.h>
 # endif
 
-# if (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__)
+# if MBRTOWC_INVALID_UTF8_BUG || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 
&& !__UCLIBC__)
 
 /* Returns 1 if the current locale is an UTF-8 locale, 0 otherwise.  */
 static inline int
@@ -110,7 +110,8 @@ is_locale_utf8_cached (void)
 size_t
 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 {
-# if MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG || 
(GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2)
+# if (MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG || 
MBRTOWC_INVALID_UTF8_BUG \
+      || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2))
   if (s == NULL)
     {
       pwc = NULL;
@@ -119,24 +120,26 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, 
mbstate_t *ps)
     }
 # endif
 
-# if (MBRTOC32_EMPTY_INPUT_BUG || _GL_SMALL_WCHAR_T \
+# if (MBRTOC32_EMPTY_INPUT_BUG || MBRTOWC_INVALID_UTF8_BUG || 
_GL_SMALL_WCHAR_T \
       || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__))
   if (n == 0)
     return (size_t) -2;
 # endif
 
-# if (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__)
+# if MBRTOWC_INVALID_UTF8_BUG || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 
&& !__UCLIBC__)
   /* Optimize the frequent case of an UTF-8 locale.
      Since here we are in the !GNULIB_defined_mbstate_t case, i.e. we use
      the system's mbstate_t type and have to provide interoperability with
      the system's mbsinit() function, this requires knowledge about how the
      system's UTF-8 mbrtowc() function stores the state.  This knowledge is
-     platform-specific.  For simplicity, we handle only glibc systems.  */
+     platform-specific.  For simplicity, we handle only glibc and NetBSD
+     systems.  */
   if (is_locale_utf8_cached ())
     {
       static mbstate_t internal_state;
       if (ps == NULL)
         ps = &internal_state;
+      #if __GLIBC__ >= 2
       /* Structure of mbstate_t =
          { int __count; union { wint_t __wch; char __wchb[4]; } __value; }
          (see glibc/iconv/gconv_simple.c function utf8_internal_loop):
@@ -145,10 +148,25 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, 
mbstate_t *ps)
          entire byte sequence.
          __value.__wch is the already inferrable bits of the character, of
          the form (x << (r*6)) when r bytes are still expected.  */
+      #endif
+      #ifdef __NetBSD__
+      /* Structure of mbstate_t =
+         union { int64_t __mbstateL; char __mbstate8[128]; }
+         (see src/lib/libc/citrus/modules/citrus_utf8.c):
+         { void *header; char ch[6]; int chlen; },
+         i.e. ch[0..5] is __mbstate8[sizeof(void*)+0..sizeof(void*)+5],
+              chlen is __mbstate8[sizeof(void*)+8..sizeof(void*)+11].  */
+      #endif
 
       /* Here n > 0.  */
 
-      size_t nstate = ps->__count & 7;
+      size_t nstate;
+      #if __GLIBC__ >= 2
+      nstate = ps->__count & 7;
+      #endif
+      #ifdef __NetBSD__
+      nstate = *(int *) &ps->__mbstate8[sizeof (void *) + 8];
+      #endif
       char buf[4];
       const char *p;
       size_t m;
@@ -160,6 +178,7 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, 
mbstate_t *ps)
         }
       else
         {
+          #if __GLIBC__ >= 2
           size_t t = ps->__count >> 8; /* total expected number of bytes */
           if (t > nstate && t <= 4)
             {
@@ -181,6 +200,18 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, 
mbstate_t *ps)
               errno = EINVAL;
               return (size_t)(-1);
             }
+          #endif
+          #ifdef __NetBSD__
+          buf[0] = ps->__mbstate8[sizeof (void *) + 0];
+          if (nstate >= 2)
+            {
+              buf[1] = ps->__mbstate8[sizeof (void *) + 1];
+              if (nstate >= 3)
+                {
+                  buf[2] = ps->__mbstate8[sizeof (void *) + 2];
+                }
+            }
+          #endif
           p = buf;
           m = nstate;
           buf[m++] = s[0];
@@ -206,12 +237,18 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, 
mbstate_t *ps)
       if (nstate >= (res > 0 ? res : 1))
         abort ();
       res -= nstate;
+      #if __GLIBC__ >= 2
       ps->__count = 0;
+      #endif
+      #ifdef __NetBSD__
+      *(int *) &ps->__mbstate8[sizeof (void *) + 8] = 0;
+      #endif
       return res;
 
      incomplete:
       /* Here 0 < m < 4.  */
       {
+        #if __GLIBC__ >= 2
         unsigned char c = (unsigned char) p[0];
         if (c < 0xE0)
           {
@@ -233,6 +270,19 @@ rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, 
mbstate_t *ps)
               | (m > 1 ? ((unsigned char) p[1] & 0x3F) << 12 : 0)
               | (m > 2 ? ((unsigned char) p[2] & 0x3F) << 6 : 0);
           }
+        #endif
+        #ifdef __NetBSD__
+        *(int *) &ps->__mbstate8[sizeof (void *) + 8] = m;
+        ps->__mbstate8[sizeof (void *) + 0] = p[0];
+        if (m > 1)
+          {
+            ps->__mbstate8[sizeof (void *) + 1] = p[1];
+            if (m > 2)
+              {
+                ps->__mbstate8[sizeof (void *) + 2] = p[2];
+              }
+          }
+        #endif
       }
       return (size_t)(-2);
 
diff --git a/m4/mbrtowc.m4 b/m4/mbrtowc.m4
index 381b22dd21..fdc05da3a1 100644
--- a/m4/mbrtowc.m4
+++ b/m4/mbrtowc.m4
@@ -1,5 +1,5 @@
 # mbrtowc.m4
-# serial 49
+# serial 50
 dnl Copyright (C) 2001-2002, 2004-2005, 2008-2026 Free Software Foundation,
 dnl Inc.
 dnl This file is free software; the Free Software Foundation
@@ -38,6 +38,7 @@ AC_DEFUN([gl_FUNC_MBRTOWC]
       gl_MBRTOWC_STORES_INCOMPLETE
       gl_MBRTOWC_EMPTY_INPUT
       gl_MBRTOWC_C_LOCALE
+      gl_MBRTOWC_INVALID_UTF8
       case "$gl_cv_func_mbrtowc_null_arg1" in
         *yes) ;;
         *) AC_DEFINE([MBRTOWC_NULL_ARG1_BUG], [1],
@@ -81,6 +82,13 @@ AC_DEFUN([gl_FUNC_MBRTOWC]
            REPLACE_MBRTOWC=1
            ;;
       esac
+      case "$gl_cv_func_mbrtowc_invalid_UTF8" in
+        *yes) ;;
+        *) AC_DEFINE([MBRTOWC_INVALID_UTF8_BUG], [1],
+             [Define if the mbrtowc function does not recognize some invalid 
UTF-8 byte sequences.])
+           REPLACE_MBRTOWC=1
+           ;;
+      esac
     fi
   fi
   if test $REPLACE_MBSTATE_T = 1; then
@@ -700,6 +708,65 @@ AC_DEFUN([gl_MBRTOWC_C_LOCALE]
     ])
 ])
 
+dnl Test whether mbrtowc recognizes invalid UTF-8 byte sequences.
+
+AC_DEFUN([gl_MBRTOWC_INVALID_UTF8],
+[
+  AC_REQUIRE([gt_LOCALE_EN_UTF8])
+  AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles
+  AC_CACHE_CHECK([whether mbrtowc recognizes invalid UTF-8],
+    [gl_cv_func_mbrtowc_invalid_UTF8],
+    [
+      dnl Initial guess, used when cross-compiling or when no suitable locale
+      dnl is present.
+changequote(,)dnl
+      case "$host_os" in
+                 # Guess no on NetBSD.
+        netbsd*) gl_cv_func_mbrtowc_invalid_UTF8="guessing no" ;;
+                 # Guess yes otherwise.
+        *)       gl_cv_func_mbrtowc_invalid_UTF8="guessing yes" ;;
+      esac
+changequote([,])dnl
+      if test "$LOCALE_EN_UTF8" != none; then
+        AC_RUN_IFELSE(
+          [AC_LANG_SOURCE([[
+#include <locale.h>
+#include <string.h>
+#include <wchar.h>
+int main ()
+{
+  if (setlocale (LC_ALL, "$LOCALE_EN_UTF8") != NULL)
+    {
+      int result = 0;
+      /* This test fails on NetBSD 10.  */
+      {
+        mbstate_t state;
+        wchar_t wc;
+
+        memset (&state, '\0', sizeof (mbstate_t));
+        if (mbrtowc (&wc, "\340x", 2, &state) != (size_t)(-1))
+          result |= 1;
+      }
+      /* This test fails on NetBSD 10.  */
+      {
+        mbstate_t state;
+        wchar_t wc;
+
+        memset (&state, '\0', sizeof (mbstate_t));
+        if (mbrtowc (&wc, "\360x\360", 3, &state) != (size_t)(-1))
+          result |= 2;
+      }
+      return result;
+    }
+  return 0;
+}]])],
+          [gl_cv_func_mbrtowc_invalid_UTF8=yes],
+          [gl_cv_func_mbrtowc_invalid_UTF8=no],
+          [:])
+      fi
+    ])
+])
+
 # Prerequisites of lib/mbrtowc.c and lib/lc-charset-dispatch.c.
 AC_DEFUN([gl_PREREQ_MBRTOWC], [
   AC_REQUIRE([AC_C_INLINE])
diff --git a/tests/test-mbrlen.c b/tests/test-mbrlen.c
index 77e0f0ea35..b38173ded6 100644
--- a/tests/test-mbrlen.c
+++ b/tests/test-mbrlen.c
@@ -209,6 +209,17 @@ main (int argc, char *argv[])
           ASSERT (ret == 1);
           ASSERT (mbsinit (&state));
         }
+        /* Test recognition of invalid byte sequences.  */
+        {
+          memset (&state, 0, sizeof (mbstate_t));
+          ret = mbrlen ("\340x", 2, &state);
+          ASSERT (ret == (size_t)(-1));
+        }
+        {
+          memset (&state, 0, sizeof (mbstate_t));
+          ret = mbrlen ("\360x\360", 3, &state);
+          ASSERT (ret == (size_t)(-1));
+        }
         return test_exit_status;
 
       case '4':
diff --git a/tests/test-mbrtowc.c b/tests/test-mbrtowc.c
index 3b10e9daed..a5d0741ef2 100644
--- a/tests/test-mbrtowc.c
+++ b/tests/test-mbrtowc.c
@@ -297,6 +297,19 @@ main (int argc, char *argv[])
             ASSERT (wctob (wc) == EOF);
             ASSERT (mbsinit (&state));
           }
+        /* Test recognition of invalid byte sequences.  */
+        {
+          memset (&state, 0, sizeof (mbstate_t));
+          wc = (wchar_t) {0xBADFACE};
+          ret = mbrtowc (&wc, "\340x", 2, &state);
+          ASSERT (ret == (size_t)(-1));
+        }
+        {
+          memset (&state, 0, sizeof (mbstate_t));
+          wc = (wchar_t) {0xBADFACE};
+          ret = mbrtowc (&wc, "\360x\360", 3, &state);
+          ASSERT (ret == (size_t)(-1));
+        }
         return test_exit_status;
 
       case '4':




Reply via email to