On Tue, Oct 14, 2025 at 2:57 PM Tomasz Kaminski <[email protected]> wrote:

>
>
>
> On Tue, Oct 14, 2025 at 2:53 PM Jonathan Wakely <[email protected]>
> wrote:
>
>> On Wed, 10 Sep 2025 at 15:53 +0200, Tomasz Kamiński wrote:
>> >This patch implements _Escaping_sink that stores characters in a local
>> (stack)
>> >buffer. When the buffer is full, the range of characters is escaped and
>> written
>> >to the underlying sink.
>> >
>> >To support the above, the __write_escaped_unicode_part function is defined.
>> >It takes __str and __prev_esc by reference. The __prev_esc value is
>> updated based
>> >on the last character written. If the buffer ends with an incomplete
>> code point
>> >sequence, __str is left non-empty and the characters are not written.
>> >_Escaping_sink then copies these characters to the front of the buffer to
>> >reconstruct the full code point.
>> >
>> >__formatter_str::_M_format_range now uses _Escaping_sink to escape any
>> >non-contiguous character sequences.
>> >
>> >This addresses PR119820 by removing the code constructing string
>> >completely.
>> >
>> >       PR libstdc++/119820
>> >
>> >libstdc++-v3/ChangeLog:
>> >
>> >       * include/std/format (__format::__write_escape_seqs)
>> >       (__format::_Escaping_sink): Define.
>> >       (__format::__write_escaped_unicode_part): Extract from
>> >       __format::__write_escaped_unicode.
>> >       (__format::__write_escaped_unicode): Forward to
>> >       __write_escaped_unicode_part.
>> >       (__formatter_str::_M_format_range): Use _Escaping_sink.
>> >       * testsuite/std/format/ranges/string.cc: New tests for
>> >       characters whose code points are split across buffers and
>> >       for escaping. Invoke test_padding.
>> >---
>> >v2 just updates the patch description.
>> >
>> > libstdc++-v3/include/std/format               | 197 +++++++++++++-----
>> > .../testsuite/std/format/ranges/string.cc     |  89 ++++++++
>> > 2 files changed, 231 insertions(+), 55 deletions(-)
>> >
>> >diff --git a/libstdc++-v3/include/std/format
>> b/libstdc++-v3/include/std/format
>> >index d6a2170e45d..347f9f0a479 100644
>> >--- a/libstdc++-v3/include/std/format
>> >+++ b/libstdc++-v3/include/std/format
>> >@@ -105,6 +105,7 @@ namespace __format
>> >   template<typename _CharT> class _Sink;
>> >   template<typename _CharT> class _Fixedbuf_sink;
>> >   template<typename _Out, typename _CharT> class _Padding_sink;
>> >+  template<typename _Out, typename _CharT> class _Escaping_sink;
>> >
>> >   // Output iterator that writes to a type-erase character sink.
>> >   template<typename _CharT>
>> >@@ -1066,6 +1067,17 @@ namespace __format
>> >       return ++__out;
>> >     }
>> >
>> >+  template<typename _Out, typename _CharT>
>> >+    _Out
>> >+    __write_escape_seqs(_Out __out, basic_string_view<_CharT> __units)
>> >+    {
>> >+      using _UChar = make_unsigned_t<_CharT>;
>> >+      for (_CharT __c : __units)
>> >+      __out = __format::__write_escape_seq(
>> >+                __out, static_cast<_UChar>(__c),
>> _Escapes<_CharT>::_S_x());
>>
>> This is always a _Sink_iter, so we don't need to pass
>> std::move(__out), right?
>>
> Yes, but that is the case everywhere, so I will just add std::move here for
> consistency.
>
The other functions for writing escape sequences do not use std::move
either, so I will leave it
as is. Neither _Sink_iter nor _Drop_iter needs to be moved.

>
>> OK for trunk.
>>
>>
>> >+      return __out;
>> >+    }
>> >+
>> >   template<typename _Out, typename _CharT>
>> >     _Out
>> >     __write_escaped_char(_Out __out, _CharT __c)
>> >@@ -1124,12 +1136,10 @@ namespace __format
>> >
>> >   template<typename _CharT, typename _Out>
>> >     _Out
>> >-    __write_escaped_unicode(_Out __out,
>> >-                          basic_string_view<_CharT> __str,
>> >-                          _Term_char __term)
>> >+    __write_escaped_unicode_part(_Out __out, basic_string_view<_CharT>&
>> __str,
>> >+                               bool& __prev_esc, _Term_char __term)
>> >     {
>> >       using _Str_view = basic_string_view<_CharT>;
>> >-      using _UChar = make_unsigned_t<_CharT>;
>> >       using _Esc = _Escapes<_CharT>;
>> >
>> >       static constexpr char32_t __replace = U'\uFFFD';
>> >@@ -1143,10 +1153,10 @@ namespace __format
>> >       }();
>> >
>> >       __unicode::_Utf_view<char32_t, _Str_view> __v(std::move(__str));
>> >+      __str = {};
>> >+
>> >       auto __first = __v.begin();
>> >       auto const __last = __v.end();
>> >-
>> >-      bool __prev_esc = true;
>> >       while (__first != __last)
>> >       {
>> >         bool __esc_ascii = false;
>> >@@ -1185,15 +1195,32 @@ namespace __format
>> >           __out = __format::__write_escaped_char(__out,
>> *__first.base());
>> >         else if (__esc_unicode)
>> >           __out = __format::__write_escape_seq(__out, *__first,
>> _Esc::_S_u());
>> >-        else // __esc_replace
>> >-          for (_CharT __c : _Str_view(__first.base(),
>> __first._M_units()))
>> >-            __out = __format::__write_escape_seq(__out,
>> >-
>>  static_cast<_UChar>(__c),
>> >-                                                 _Esc::_S_x());
>> >+        // __esc_replace
>> >+        else if (_Str_view __units(__first.base(), __first._M_units());
>> >+                 __units.end() != __last.base())
>> >+          __out = __format::__write_escape_seqs(__out, __units);
>> >+        else
>> >+          {
>> >+            __str = __units;
>> >+            return __out;
>> >+          }
>> >+
>> >         __prev_esc = true;
>> >         ++__first;
>> >-
>> >       }
>> >+
>> >+      return __out;
>> >+    }
>> >+
>> >+  template<typename _CharT, typename _Out>
>> >+    _Out
>> >+    __write_escaped_unicode(_Out __out, basic_string_view<_CharT> __str,
>> >+                          _Term_char __term)
>> >+    {
>> >+      bool __prev_escape = true;
>> >+      __out = __format::__write_escaped_unicode_part(__out, __str,
>> >+                                                   __prev_escape,
>> __term);
>> >+      __out = __format::__write_escape_seqs(__out, __str);
>> >       return __out;
>> >     }
>> >
>> >@@ -1412,55 +1439,28 @@ namespace __format
>> >                                size_t(ranges::distance(__rg)));
>> >             return format(__str, __fc);
>> >           }
>> >-        else if (!_M_spec._M_debug)
>> >+        else
>> >           {
>> >+            auto __handle_debug = [this, &__rg]<typename _NOut>(_NOut
>> __nout)
>> >+              {
>> >+                if (!_M_spec._M_debug)
>> >+                  return ranges::copy(__rg, std::move(__nout)).out;
>> >+
>> >+                _Escaping_sink<_NOut, _CharT>
>> >+                  __sink(std::move(__nout), _Term_quote);
>> >+                ranges::copy(__rg, __sink.out());
>> >+                return __sink._M_finish();
>> >+              };
>> >+
>> >             const size_t __padwidth = _M_spec._M_get_width(__fc);
>> >             if (__padwidth == 0 && _M_spec._M_prec_kind == _WP_none)
>> >-              return ranges::copy(__rg, __fc.out()).out;
>> >+              return __handle_debug(__fc.out());
>> >
>> >-            _Padding_sink<_Out, _CharT> __sink(__fc.out(), __padwidth,
>> >-
>>  _M_spec._M_get_precision(__fc));
>> >-            ranges::copy(__rg, __sink.out());
>> >+            _Padding_sink<_Out, _CharT>
>> >+              __sink(__fc.out(), __padwidth,
>> _M_spec._M_get_precision(__fc));
>> >+            __handle_debug(__sink.out());
>> >             return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill);
>> >           }
>> >-        else if constexpr (ranges::forward_range<_Rg> ||
>> ranges::sized_range<_Rg>)
>> >-          {
>> >-            const size_t __n(ranges::distance(__rg));
>> >-            size_t __w = __n;
>> >-            if constexpr
>> (!__unicode::__literal_encoding_is_unicode<_CharT>())
>> >-              if (size_t __max = _M_spec._M_get_precision(__fc); __n >
>> __max)
>> >-                __w == __max;
>> >-
>> >-            if (__w <= __format::__stackbuf_size<_CharT>)
>> >-              {
>> >-                _CharT __buf[__format::__stackbuf_size<_CharT>];
>> >-                ranges::copy_n(ranges::begin(__rg), __w, __buf);
>> >-                return _M_format_escaped(_String_view(__buf, __n),
>> __fc);
>> >-              }
>> >-            else if constexpr (ranges::random_access_range<_Rg>)
>> >-              {
>> >-                ranges::iterator_t<_Rg> __first = ranges::begin(__rg);
>> >-                ranges::subrange __sub(__first, __first + __w);
>> >-                return _M_format_escaped(_String(from_range, __sub),
>> __fc);
>> >-              }
>> >-            else if (__w <= __n)
>> >-              {
>> >-                ranges::subrange __sub(
>> >-                  counted_iterator(ranges::begin(__rg), __w),
>> >-                  default_sentinel);
>> >-                return _M_format_escaped(_String(from_range, __sub),
>> __fc);
>> >-              }
>> >-            else if constexpr (ranges::sized_range<_Rg>)
>> >-              return _M_format_escaped(_String(from_range, __rg), __fc);
>> >-            else
>> >-              {
>> >-                // N.B. preserve the computed size
>> >-                ranges::subrange __sub(__rg, __n);
>> >-                return _M_format_escaped(_String(from_range, __sub),
>> __fc);
>> >-              }
>> >-          }
>> >-        else
>> >-          return _M_format_escaped(_String(from_range, __rg), __fc);
>> >       }
>> >
>> >       constexpr void
>> >@@ -3915,6 +3915,93 @@ namespace __format
>> >       }
>> >     };
>> >
>> >+  template<typename _Out, typename _CharT>
>> >+    class _Escaping_sink : public _Buf_sink<_CharT>
>> >+    {
>> >+      using _Esc = _Escapes<_CharT>;
>> >+
>> >+      _Out _M_out;
>> >+      _Term_char _M_term : 2;
>> >+      unsigned _M_prev_escape : 1;
>> >+      unsigned _M_out_discards : 1;
>> >+
>> >+      void
>> >+      _M_sync_discarding()
>> >+      {
>> >+      if constexpr (is_same_v<_Out, _Sink_iter<_CharT>>)
>> >+        _M_out_discards = _M_out._M_discarding();
>> >+      }
>> >+
>> >+      void
>> >+      _M_write()
>> >+      {
>> >+      span<_CharT> __bytes = this->_M_used();
>> >+      basic_string_view<_CharT> __str(__bytes.data(), __bytes.size());
>> >+
>> >+      size_t __rem = 0;
>> >+      if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
>> >+        {
>> >+          bool __prev_escape = _M_prev_escape;
>> >+          _M_out = __format::__write_escaped_unicode_part(
>> >+                     std::move(_M_out), __str, __prev_escape, _M_term);
>> >+          _M_prev_escape = __prev_escape;
>> >+
>> >+          __rem = __str.size();
>> >+          if (__rem > 0 && __str.data() != this->_M_buf) [[unlikely]]
>> >+            ranges::move(__str, this->_M_buf);
>> >+        }
>> >+      else
>> >+        _M_out = __format::__write_escaped_ascii(
>> >+                    std::move(_M_out), __str, _M_term);
>> >+
>> >+      this->_M_reset(this->_M_buf, __rem);
>> >+      _M_sync_discarding();
>> >+      }
>> >+
>> >+      void
>> >+      _M_overflow() override
>> >+      {
>> >+      if (_M_out_discards)
>> >+        this->_M_rewind();
>> >+      else
>> >+        _M_write();
>> >+      }
>> >+
>> >+      bool
>> >+      _M_discarding() const override
>> >+      { return _M_out_discards; }
>> >+
>> >+    public:
>> >+      [[__gnu__::__always_inline__]]
>> >+      explicit
>> >+      _Escaping_sink(_Out __out, _Term_char __term)
>> >+      : _M_out(std::move(__out)), _M_term(__term),
>> >+      _M_prev_escape(true), _M_out_discards(false)
>> >+      {
>> >+      _M_out = __format::__write(std::move(_M_out),
>> _Esc::_S_term(_M_term));
>> >+      _M_sync_discarding();
>> >+      }
>> >+
>> >+      _Out
>> >+      _M_finish()
>> >+      {
>> >+      if (_M_out_discards)
>> >+        return std::move(_M_out);
>> >+
>> >+      if (!this->_M_used().empty())
>> >+      {
>> >+        _M_write();
>> >+        if constexpr
>> (__unicode::__literal_encoding_is_unicode<_CharT>())
>> >+          if (auto __rem = this->_M_used(); !__rem.empty())
>> >+            {
>> >+              basic_string_view<_CharT> __str(__rem.data(),
>> __rem.size());
>> >+              _M_out = __format::__write_escape_seqs(std::move(_M_out),
>> __str);
>> >+            }
>> >+      }
>> >+      return __format::__write(std::move(_M_out),
>> _Esc::_S_term(_M_term));
>> >+      }
>> >+    };
>> >+
>> >   enum class _Arg_t : unsigned char {
>> >     _Arg_none, _Arg_bool, _Arg_c, _Arg_i, _Arg_u, _Arg_ll, _Arg_ull,
>> >     _Arg_flt, _Arg_dbl, _Arg_ldbl, _Arg_str, _Arg_sv, _Arg_ptr,
>> _Arg_handle,
>> >diff --git a/libstdc++-v3/testsuite/std/format/ranges/string.cc
>> b/libstdc++-v3/testsuite/std/format/ranges/string.cc
>> >index 99e5eaf411f..a7d584f8e42 100644
>> >--- a/libstdc++-v3/testsuite/std/format/ranges/string.cc
>> >+++ b/libstdc++-v3/testsuite/std/format/ranges/string.cc
>> >@@ -279,6 +279,93 @@ void test_padding()
>> >   VERIFY( strip_prefix(resv, 46, '*') );
>> >   VERIFY( strip_quotes(resv) );
>> >   VERIFY( resv == in );
>> >+
>> >+  // width is 5, size is 15
>> >+  in = "\u2160\u2161\u2162\u2163\u2164";
>> >+  in += in; // width is 10, size is 30
>> >+  in += in; // width is 20, size is 60
>> >+  in += in; // width is 40, size is 120
>> >+  in += in; // width is 80, size is 240
>> >+  in += in; // width is 160, size is 480
>> >+
>> >+  lc.assign_range(in);
>> >+
>> >+  resv = res = std::format("{:s}", lc);
>> >+  VERIFY( resv == in );
>> >+
>> >+  resv = res = std::format("{:*>10s}", lc);
>> >+  VERIFY( resv == in );
>> >+
>> >+  resv = res = std::format("{:*>200s}", lc);
>> >+  VERIFY( strip_prefix(resv, 40, '*') );
>> >+  VERIFY( resv == in );
>> >+
>> >+  resv = res = std::format("{:?s}", lc);
>> >+  VERIFY( strip_quotes(resv) );
>> >+  VERIFY( resv == in );
>> >+
>> >+  resv = res = std::format("{:*>10?s}", lc);
>> >+  VERIFY( strip_quotes(resv) );
>> >+  VERIFY( resv == in );
>> >+
>> >+  resv = res = std::format("{:*>200?s}", lc);
>> >+  VERIFY( strip_prefix(resv, 38, '*') );
>> >+  VERIFY( strip_quotes(resv) );
>> >+  VERIFY( resv == in );
>> >+}
>> >+
>> >+void test_escaping()
>> >+{
>> >+  std::string res;
>> >+  std::string_view resv;
>> >+
>> >+  const std::string_view input =
>> >+    "\t\n\r\\\""
>> >+    "\u008a"     // Cc, Control,             Line Tabulation Set,
>> >+    "\u00ad"     // Cf, Format,              Soft Hyphen
>> >+    "\u1d3d"     // Lm, Modifier letter,     Modifier Letter Capital Ou
>> >+    "\u00a0"     // Zs, Space Separator,     No-Break Space (NBSP)
>> >+    "\u2029"     // Zp, Paragraph Separator, Paragraph Separator
>> >+    "\U0001f984" // So, Other Symbol,        Unicorn Face
>> >+  ;
>> >+  const std::string_view output =
>> >+   R"(\t\n\r\\\")"
>> >+   R"(\u{8a})"
>> >+   R"(\u{ad})"
>> >+   "\u1d3d"
>> >+   R"(\u{a0})"
>> >+   R"(\u{2029})"
>> >+   "\U0001f984";
>> >+
>> >+  std::forward_list<char> lc(std::from_range, input);
>> >+  resv = res = std::format("{:s}", lc);
>> >+  VERIFY( resv == input );
>> >+  resv = res = std::format("{:?s}", lc);
>> >+  VERIFY( strip_quotes(resv) );
>> >+  VERIFY( resv == output );
>> >+
>> >+  // width is 5, size is 15
>> >+  std::string in = "\u2160\u2161\u2162\u2163\u2164";
>> >+  in += in; // width is 10, size is 30
>> >+  in += in; // width is 20, size is 60
>> >+  in += in; // width is 40, size is 120
>> >+  in += in; // width is 80, size is 240
>> >+  in += in; // width is 160, size is 480
>> >+  std::string_view inv = in;
>> >+
>> >+  // last charcter is incomplete
>> >+  lc.assign_range(inv.substr(0, 479));
>> >+
>> >+  // non-debug format, chars copied as is
>> >+  resv = res = std::format("{:s}", lc);
>> >+  VERIFY( resv == inv.substr(0, 479) );
>> >+
>> >+  // debug-format, incomplete code-point sequence is copied
>> >+  resv = res = std::format("{:?s}", lc);
>> >+  VERIFY( strip_quotes(resv) );
>> >+  VERIFY( resv.substr(0, 477) == inv.substr(0, 477) );
>> >+  resv.remove_prefix(477);
>> >+  VERIFY( resv == R"(\x{e2}\x{85})" );
>> > }
>> >
>> > int main()
>> >@@ -287,4 +374,6 @@ int main()
>> >   test_outputs<char>();
>> >   test_outputs<wchar_t>();
>> >   test_nested();
>> >+  test_padding();
>> >+  test_escaping();
>> > }
>> >--
>> >2.51.0
>> >
>> >
>>
>>

Reply via email to