This is a naive implementation of the UTF-8 validation algorithm, which
could definitely be optimized. But it's faster than using
std::codecvt_utf8 and checking its result, which is the only existing
code in the library that can do this.

As the TODO suggests, we could do the UTF-8 to UTF-16 conversion at the
same time. But that is only needed for Windows and, as I said in the 1/2
email, the output for Windows seems to be broken currently anyway and I
can't test it properly.

-- >8 --

libstdc++-v3/ChangeLog:

        * include/bits/locale_conv.h (__to_valid_utf8): New function.
        * include/std/ostream (vprint_unicode): Use it.
        * include/std/print (vprint_unicode): Use it.
---
 libstdc++-v3/include/bits/locale_conv.h | 104 ++++++++++++++++++++++++
 libstdc++-v3/include/std/ostream        |  74 +++++++++++------
 libstdc++-v3/include/std/print          |   8 +-
 3 files changed, 160 insertions(+), 26 deletions(-)

diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h
index 284142a360a..f6ade1d0395 100644
--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -624,6 +624,110 @@ _GLIBCXX_END_NAMESPACE_CXX11
       bool                     _M_always_noconv;
     };
 
+#if __cplusplus >= 202002L
+  template<typename _CharT = char> // _CharT is not used in the body yet (see TODO).
+  bool // true if __s needed no changes, false if it was rewritten
+  __to_valid_utf8(string& __s) // Replace each ill-formed UTF-8 sequence in __s.
+  {
+    // TODO if _CharT is wchar_t then transcode at the same time.
+
+    unsigned __seen = 0, __needed = 0; // continuation bytes consumed / expected
+    unsigned char __lo_bound = 0x80, __hi_bound = 0xBF; // range accepted for the next continuation byte
+    size_t __errors = 0; // number of 0xFF markers written into __s
+
+    auto __q = __s.data(), __eoq = __q + __s.size(); // scan and mark __s in place
+    while (__q != __eoq)
+      {
+       unsigned char __byte = *__q;
+       if (__needed == 0) // not inside a multibyte sequence
+         {
+           if (__byte <= 0x7F)      // 0x00 to 0x7F
+             {
+               while (++__q != __eoq && (unsigned char)*__q <= 0x7F)
+                 { } // Fast forward to the next non-ASCII character.
+               continue; // __q already points at the next byte to examine
+             }
+           else if (__byte < 0xC2) // 0x80 to 0xC1: rejected as a lead byte
+             {
+               *__q = 0xFF; // mark for replacement in the final pass
+               ++__errors;
+             }
+           else if (__byte <= 0xDF) // 0xC2 to 0xDF
+             {
+               __needed = 1;
+             }
+           else if (__byte <= 0xEF) // 0xE0 to 0xEF
+             {
+               if (__byte == 0xE0)
+                 __lo_bound = 0xA0; // after 0xE0 the first continuation byte must be >= 0xA0
+               else if (__byte == 0xED)
+                 __hi_bound = 0x9F; // after 0xED the first continuation byte must be <= 0x9F
+
+               __needed = 2;
+             }
+           else if (__byte <= 0xF4) // 0xF0 to 0xF4
+             {
+               if (__byte == 0xF0)
+                 __lo_bound = 0x90; // after 0xF0 the first continuation byte must be >= 0x90
+               else if (__byte == 0xF4)
+                 __hi_bound = 0x8F; // after 0xF4 the first continuation byte must be <= 0x8F
+
+               __needed = 3;
+             }
+           else // 0xF5 to 0xFF: rejected as a lead byte
+             {
+               *__q = 0xFF; // mark for replacement in the final pass
+               ++__errors;
+             }
+         }
+       else // inside a multibyte sequence: a continuation byte is expected
+         {
+           if (__byte < __lo_bound || __byte > __hi_bound)
+             {
+               *(__q - __seen - 1) = 0xFF; // mark the lead byte for replacement
+               __builtin_memset(__q - __seen, 0xFE, __seen); // mark consumed continuation bytes for removal
+               ++__errors;
+               __needed = __seen = 0;
+               __lo_bound = 0x80;
+               __hi_bound = 0xBF;
+               continue; // Reprocess the current character.
+             }
+
+           __lo_bound = 0x80; // narrowed bounds apply to the first continuation byte only
+           __hi_bound = 0xBF;
+           ++__seen;
+           if (__seen == __needed) // sequence complete
+             __needed = __seen = 0;
+         }
+       __q++;
+      }
+
+    if (__needed)
+      {
+       // The string ends with an incomplete multibyte sequence.
+       if (__seen)
+         __s.resize(__s.size() - __seen); // drop the trailing continuation bytes
+       __s.back() = 0xFF; // the lead byte is now last; mark it for replacement
+       ++__errors;
+      }
+
+    if (__errors == 0)
+      return true; // nothing was marked, so __s is unchanged
+
+    string __s2;
+    __s2.reserve(__s.size() + __errors * 2); // room for each 0xFF marker to grow by two bytes
+    for (unsigned char __byte : __s)
+      {
+       if (__byte == 0xFF)
+         __s2 += "\uFFFD"; // NOTE(review): encoded in the execution charset; assumes that is UTF-8 - confirm
+       else if (__byte != 0xFE) // 0xFE markers are dropped
+         __s2 += (char)__byte;
+      }
+    __s = std::move(__s2);
+    return false; // __s was rewritten
+  }
+#endif // C++20
+
   /// @} group locales
 
 _GLIBCXX_END_NAMESPACE_VERSION
diff --git a/libstdc++-v3/include/std/ostream b/libstdc++-v3/include/std/ostream
index e81c39a7c80..760aaa206da 100644
--- a/libstdc++-v3/include/std/ostream
+++ b/libstdc++-v3/include/std/ostream
@@ -917,42 +917,68 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   inline void
   vprint_unicode(ostream& __os, string_view __fmt, format_args __args)
   {
-    // TODO: diagnose invalid UTF-8 code units
-#ifdef _WIN32
-    int __fd_for_console(std::streambuf*);
-    void __write_utf16_to_console(int, string);
-
-    // If stream refers to a terminal convert to UTF-16 and use WriteConsoleW.
-    if (int __fd = __fd_for_console(__os.rdbuf()); __fd >= 0)
+    ostream::sentry __cerb(__os); // constructed before formatting, for both output paths
+    if (__cerb)
       {
-       ostream::sentry __cerb(__os);
-       if (__cerb)
+       string __out = std::vformat(__fmt, __args); // outside the __try blocks, so vformat exceptions are not caught here
+       std::__to_valid_utf8(__out); // modifies __out in place; the bool result is ignored
+
+#ifdef _WIN32
+       int __fd_for_console(std::streambuf*);
+       void __write_utf16_to_console(int, string); // NOTE(review): declared void but its result is tested below - confirm return type
+
+       // If stream refers to a terminal output UTF-16 using WriteConsoleW.
+       if (int __fd = __fd_for_console(__os.rdbuf()); __fd >= 0)
           {
-           string __out = std::vformat(__fmt, __args);
             ios_base::iostate __err = ios_base::goodbit;
             __try
-             {
-               if (__os.rdbuf()->pubsync() == -1)
-                 __err = ios::badbit;
-               else if (__write_utf16_to_console(__fd, __out))
-                 __err = ios::badbit;
-             }
+           {
+             if (__os.rdbuf()->pubsync() == -1) // sync the stream buffer before writing to the console
+               __err = ios::badbit;
+             else if (__write_utf16_to_console(__fd, __out))
+               __err = ios::badbit;
+           }
             __catch(const __cxxabiv1::__forced_unwind&)
-             {
-               __os._M_setstate(ios_base::badbit);
-               __throw_exception_again;
-             }
+           {
+             __os._M_setstate(ios_base::badbit);
+             __throw_exception_again;
+           }
             __catch(...)
-             { __os._M_setstate(ios_base::badbit); }
+           { __os._M_setstate(ios_base::badbit); }
 
             if (__err)
               __os.setstate(__err);
+           return; // console path handled the output; skip the stream write below
           }
-      }
 #endif
-    std::vprint_nonunicode(__os, __fmt, __args);
-  }
 
+       __try
+         {
+           const streamsize __w = __os.width();
+           const streamsize __n = __out.size();
+           if (__w > __n) // output is shorter than width(): fill before or after the text
+             {
+               const bool __left
+                 = (__os.flags() & ios_base::adjustfield) == ios_base::left;
+               if (!__left)
+                 std::__ostream_fill(__os, __w - __n);
+               if (__os.good())
+                 std::__ostream_write(__os, __out.data(), __n);
+               if (__left && __os.good())
+                 std::__ostream_fill(__os, __w - __n);
+             }
+           else
+             std::__ostream_write(__os, __out.data(), __n);
+         }
+       __catch(const __cxxabiv1::__forced_unwind&)
+         {
+           __os._M_setstate(ios_base::badbit);
+           __throw_exception_again; // forced unwinding is rethrown after setting badbit
+         }
+       __catch(...)
+         { __os._M_setstate(ios_base::badbit); }
+      }
+  }
 
   template<typename... _Args>
     inline void
diff --git a/libstdc++-v3/include/std/print b/libstdc++-v3/include/std/print
index 75e78841247..096b97b1ef7 100644
--- a/libstdc++-v3/include/std/print
+++ b/libstdc++-v3/include/std/print
@@ -62,7 +62,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   inline void
   vprint_unicode(FILE* __stream, string_view __fmt, format_args __args)
   {
-    // TODO: diagnose invalid UTF-8 code units
+    string __out = std::vformat(__fmt, __args);
+    std::__to_valid_utf8(__out);
+
 #ifdef _WIN32
     int __fd_for_console(FILE*);
     void __write_utf16_to_console(int, string);
@@ -82,7 +84,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        _GLIBCXX_THROW_OR_ABORT(system_error(__e, "std::vprint_unicode"));
       }
 #endif
-    std::vprint_nonunicode(__stream, __fmt, __args);
+
+    if (std::fwrite(__out.data(), 1, __out.size(), __stream) != __out.size())
+      __throw_system_error(EIO);
   }
 
   template<typename... _Args>
-- 
2.41.0

Reply via email to