On Tue, Oct 14, 2025 at 2:57 PM Tomasz Kaminski <[email protected]> wrote:
> > > > On Tue, Oct 14, 2025 at 2:53 PM Jonathan Wakely <[email protected]> > wrote: > >> On Wed, 10 Sep 2025 at 15:53 +0200, Tomasz Kamiński wrote: >> >This patch implements _Escaping_sink that stores characters in a local >> (stack) >> >buffer. When the buffer is full, the range of characters is escaped and >> written >> >to the underlying sink. >> > >> >To support the above, the __write_escaped_unicode_part function is defined. >> >It takes __str and __prev_esc by reference. The __prev_esc value is >> updated based >> >on the last character written. If the buffer ends with an incomplete >> code point >> >sequence, __str is left non-empty and the characters are not written. >> >_Escaping_sink then copies these characters to the front of the buffer to >> >reconstruct the full code point. >> > >> >__formatter_str::_M_format_range now uses _Escaping_sink to escape any >> >non-contiguous character sequences. >> > >> >This addresses PR119820 by removing the code constructing string >> >completely. >> > >> > PR libstdc++/119820 >> > >> >libstdc++-v3/ChangeLog: >> > >> > * include/std/format (__format::__write_escape_seqs) >> > (__format::_Escaping_sink): Define. >> > (__format::__write_escaped_unicode_part): Extract from >> > __format::__write_escaped_unicode. >> > (__format::__write_escaped_unicode): Forward to >> > __write_escaped_unicode_part. >> > (__formatter_str::_M_format_range): Use _Escaping_sink. >> > * testsuite/std/format/ranges/string.cc: New tests for >> > characters whose code points are split across buffers, and for >> > escaping. Invoke test_padding. >> >--- >> >v2 just updates the patch description. 
>> > >> > libstdc++-v3/include/std/format | 197 +++++++++++++----- >> > .../testsuite/std/format/ranges/string.cc | 89 ++++++++ >> > 2 files changed, 231 insertions(+), 55 deletions(-) >> > >> >diff --git a/libstdc++-v3/include/std/format >> b/libstdc++-v3/include/std/format >> >index d6a2170e45d..347f9f0a479 100644 >> >--- a/libstdc++-v3/include/std/format >> >+++ b/libstdc++-v3/include/std/format >> >@@ -105,6 +105,7 @@ namespace __format >> > template<typename _CharT> class _Sink; >> > template<typename _CharT> class _Fixedbuf_sink; >> > template<typename _Out, typename _CharT> class _Padding_sink; >> >+ template<typename _Out, typename _CharT> class _Escaping_sink; >> > >> > // Output iterator that writes to a type-erase character sink. >> > template<typename _CharT> >> >@@ -1066,6 +1067,17 @@ namespace __format >> > return ++__out; >> > } >> > >> >+ template<typename _Out, typename _CharT> >> >+ _Out >> >+ __write_escape_seqs(_Out __out, basic_string_view<_CharT> __units) >> >+ { >> >+ using _UChar = make_unsigned_t<_CharT>; >> >+ for (_CharT __c : __units) >> >+ __out = __format::__write_escape_seq( >> >+ __out, static_cast<_UChar>(__c), >> _Escapes<_CharT>::_S_x()); >> >> This is always a _Sink_iter, so we don't need to pass >> std::move(__out), right? >> > Yes, but that is the case everywhere, so I will just add std::move here for > consistency. > Other functions for writing escaped sequences also do not do moves, so I will leave it as is. Neither _Sink_iter nor _Drop_iter needs a move. > >> OK for trunk. 
>> >> >> >+ return __out; >> >+ } >> >+ >> > template<typename _Out, typename _CharT> >> > _Out >> > __write_escaped_char(_Out __out, _CharT __c) >> >@@ -1124,12 +1136,10 @@ namespace __format >> > >> > template<typename _CharT, typename _Out> >> > _Out >> >- __write_escaped_unicode(_Out __out, >> >- basic_string_view<_CharT> __str, >> >- _Term_char __term) >> >+ __write_escaped_unicode_part(_Out __out, basic_string_view<_CharT>& >> __str, >> >+ bool& __prev_esc, _Term_char __term) >> > { >> > using _Str_view = basic_string_view<_CharT>; >> >- using _UChar = make_unsigned_t<_CharT>; >> > using _Esc = _Escapes<_CharT>; >> > >> > static constexpr char32_t __replace = U'\uFFFD'; >> >@@ -1143,10 +1153,10 @@ namespace __format >> > }(); >> > >> > __unicode::_Utf_view<char32_t, _Str_view> __v(std::move(__str)); >> >+ __str = {}; >> >+ >> > auto __first = __v.begin(); >> > auto const __last = __v.end(); >> >- >> >- bool __prev_esc = true; >> > while (__first != __last) >> > { >> > bool __esc_ascii = false; >> >@@ -1185,15 +1195,32 @@ namespace __format >> > __out = __format::__write_escaped_char(__out, >> *__first.base()); >> > else if (__esc_unicode) >> > __out = __format::__write_escape_seq(__out, *__first, >> _Esc::_S_u()); >> >- else // __esc_replace >> >- for (_CharT __c : _Str_view(__first.base(), >> __first._M_units())) >> >- __out = __format::__write_escape_seq(__out, >> >- >> static_cast<_UChar>(__c), >> >- _Esc::_S_x()); >> >+ // __esc_replace >> >+ else if (_Str_view __units(__first.base(), __first._M_units()); >> >+ __units.end() != __last.base()) >> >+ __out = __format::__write_escape_seqs(__out, __units); >> >+ else >> >+ { >> >+ __str = __units; >> >+ return __out; >> >+ } >> >+ >> > __prev_esc = true; >> > ++__first; >> >- >> > } >> >+ >> >+ return __out; >> >+ } >> >+ >> >+ template<typename _CharT, typename _Out> >> >+ _Out >> >+ __write_escaped_unicode(_Out __out, basic_string_view<_CharT> __str, >> >+ _Term_char __term) >> >+ { >> >+ bool __prev_escape 
= true; >> >+ __out = __format::__write_escaped_unicode_part(__out, __str, >> >+ __prev_escape, >> __term); >> >+ __out = __format::__write_escape_seqs(__out, __str); >> > return __out; >> > } >> > >> >@@ -1412,55 +1439,28 @@ namespace __format >> > size_t(ranges::distance(__rg))); >> > return format(__str, __fc); >> > } >> >- else if (!_M_spec._M_debug) >> >+ else >> > { >> >+ auto __handle_debug = [this, &__rg]<typename _NOut>(_NOut >> __nout) >> >+ { >> >+ if (!_M_spec._M_debug) >> >+ return ranges::copy(__rg, std::move(__nout)).out; >> >+ >> >+ _Escaping_sink<_NOut, _CharT> >> >+ __sink(std::move(__nout), _Term_quote); >> >+ ranges::copy(__rg, __sink.out()); >> >+ return __sink._M_finish(); >> >+ }; >> >+ >> > const size_t __padwidth = _M_spec._M_get_width(__fc); >> > if (__padwidth == 0 && _M_spec._M_prec_kind == _WP_none) >> >- return ranges::copy(__rg, __fc.out()).out; >> >+ return __handle_debug(__fc.out()); >> > >> >- _Padding_sink<_Out, _CharT> __sink(__fc.out(), __padwidth, >> >- >> _M_spec._M_get_precision(__fc)); >> >- ranges::copy(__rg, __sink.out()); >> >+ _Padding_sink<_Out, _CharT> >> >+ __sink(__fc.out(), __padwidth, >> _M_spec._M_get_precision(__fc)); >> >+ __handle_debug(__sink.out()); >> > return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill); >> > } >> >- else if constexpr (ranges::forward_range<_Rg> || >> ranges::sized_range<_Rg>) >> >- { >> >- const size_t __n(ranges::distance(__rg)); >> >- size_t __w = __n; >> >- if constexpr >> (!__unicode::__literal_encoding_is_unicode<_CharT>()) >> >- if (size_t __max = _M_spec._M_get_precision(__fc); __n > >> __max) >> >- __w == __max; >> >- >> >- if (__w <= __format::__stackbuf_size<_CharT>) >> >- { >> >- _CharT __buf[__format::__stackbuf_size<_CharT>]; >> >- ranges::copy_n(ranges::begin(__rg), __w, __buf); >> >- return _M_format_escaped(_String_view(__buf, __n), >> __fc); >> >- } >> >- else if constexpr (ranges::random_access_range<_Rg>) >> >- { >> >- ranges::iterator_t<_Rg> __first = 
ranges::begin(__rg); >> >- ranges::subrange __sub(__first, __first + __w); >> >- return _M_format_escaped(_String(from_range, __sub), >> __fc); >> >- } >> >- else if (__w <= __n) >> >- { >> >- ranges::subrange __sub( >> >- counted_iterator(ranges::begin(__rg), __w), >> >- default_sentinel); >> >- return _M_format_escaped(_String(from_range, __sub), >> __fc); >> >- } >> >- else if constexpr (ranges::sized_range<_Rg>) >> >- return _M_format_escaped(_String(from_range, __rg), __fc); >> >- else >> >- { >> >- // N.B. preserve the computed size >> >- ranges::subrange __sub(__rg, __n); >> >- return _M_format_escaped(_String(from_range, __sub), >> __fc); >> >- } >> >- } >> >- else >> >- return _M_format_escaped(_String(from_range, __rg), __fc); >> > } >> > >> > constexpr void >> >@@ -3915,6 +3915,93 @@ namespace __format >> > } >> > }; >> > >> >+ template<typename _Out, typename _CharT> >> >+ class _Escaping_sink : public _Buf_sink<_CharT> >> >+ { >> >+ using _Esc = _Escapes<_CharT>; >> >+ >> >+ _Out _M_out; >> >+ _Term_char _M_term : 2; >> >+ unsigned _M_prev_escape : 1; >> >+ unsigned _M_out_discards : 1; >> >+ >> >+ void >> >+ _M_sync_discarding() >> >+ { >> >+ if constexpr (is_same_v<_Out, _Sink_iter<_CharT>>) >> >+ _M_out_discards = _M_out._M_discarding(); >> >+ } >> >+ >> >+ void >> >+ _M_write() >> >+ { >> >+ span<_CharT> __bytes = this->_M_used(); >> >+ basic_string_view<_CharT> __str(__bytes.data(), __bytes.size()); >> >+ >> >+ size_t __rem = 0; >> >+ if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>()) >> >+ { >> >+ bool __prev_escape = _M_prev_escape; >> >+ _M_out = __format::__write_escaped_unicode_part( >> >+ std::move(_M_out), __str, __prev_escape, _M_term); >> >+ _M_prev_escape = __prev_escape; >> >+ >> >+ __rem = __str.size(); >> >+ if (__rem > 0 && __str.data() != this->_M_buf) [[unlikely]] >> >+ ranges::move(__str, this->_M_buf); >> >+ } >> >+ else >> >+ _M_out = __format::__write_escaped_ascii( >> >+ std::move(_M_out), __str, _M_term); >> >+ 
>> >+ this->_M_reset(this->_M_buf, __rem); >> >+ _M_sync_discarding(); >> >+ } >> >+ >> >+ void >> >+ _M_overflow() override >> >+ { >> >+ if (_M_out_discards) >> >+ this->_M_rewind(); >> >+ else >> >+ _M_write(); >> >+ } >> >+ >> >+ bool >> >+ _M_discarding() const override >> >+ { return _M_out_discards; } >> >+ >> >+ public: >> >+ [[__gnu__::__always_inline__]] >> >+ explicit >> >+ _Escaping_sink(_Out __out, _Term_char __term) >> >+ : _M_out(std::move(__out)), _M_term(__term), >> >+ _M_prev_escape(true), _M_out_discards(false) >> >+ { >> >+ _M_out = __format::__write(std::move(_M_out), >> _Esc::_S_term(_M_term)); >> >+ _M_sync_discarding(); >> >+ } >> >+ >> >+ _Out >> >+ _M_finish() >> >+ { >> >+ if (_M_out_discards) >> >+ return std::move(_M_out); >> >+ >> >+ if (!this->_M_used().empty()) >> >+ { >> >+ _M_write(); >> >+ if constexpr >> (__unicode::__literal_encoding_is_unicode<_CharT>()) >> >+ if (auto __rem = this->_M_used(); !__rem.empty()) >> >+ { >> >+ basic_string_view<_CharT> __str(__rem.data(), >> __rem.size()); >> >+ _M_out = __format::__write_escape_seqs(std::move(_M_out), >> __str); >> >+ } >> >+ } >> >+ return __format::__write(std::move(_M_out), >> _Esc::_S_term(_M_term)); >> >+ } >> >+ }; >> >+ >> > enum class _Arg_t : unsigned char { >> > _Arg_none, _Arg_bool, _Arg_c, _Arg_i, _Arg_u, _Arg_ll, _Arg_ull, >> > _Arg_flt, _Arg_dbl, _Arg_ldbl, _Arg_str, _Arg_sv, _Arg_ptr, >> _Arg_handle, >> >diff --git a/libstdc++-v3/testsuite/std/format/ranges/string.cc >> b/libstdc++-v3/testsuite/std/format/ranges/string.cc >> >index 99e5eaf411f..a7d584f8e42 100644 >> >--- a/libstdc++-v3/testsuite/std/format/ranges/string.cc >> >+++ b/libstdc++-v3/testsuite/std/format/ranges/string.cc >> >@@ -279,6 +279,93 @@ void test_padding() >> > VERIFY( strip_prefix(resv, 46, '*') ); >> > VERIFY( strip_quotes(resv) ); >> > VERIFY( resv == in ); >> >+ >> >+ // width is 5, size is 15 >> >+ in = "\u2160\u2161\u2162\u2163\u2164"; >> >+ in += in; // width is 10, size is 30 >> >+ in += 
in; // width is 20, size is 60 >> >+ in += in; // width is 40, size is 120 >> >+ in += in; // width is 80, size is 240 >> >+ in += in; // width is 160, size is 480 >> >+ >> >+ lc.assign_range(in); >> >+ >> >+ resv = res = std::format("{:s}", lc); >> >+ VERIFY( resv == in ); >> >+ >> >+ resv = res = std::format("{:*>10s}", lc); >> >+ VERIFY( resv == in ); >> >+ >> >+ resv = res = std::format("{:*>200s}", lc); >> >+ VERIFY( strip_prefix(resv, 40, '*') ); >> >+ VERIFY( resv == in ); >> >+ >> >+ resv = res = std::format("{:?s}", lc); >> >+ VERIFY( strip_quotes(resv) ); >> >+ VERIFY( resv == in ); >> >+ >> >+ resv = res = std::format("{:*>10?s}", lc); >> >+ VERIFY( strip_quotes(resv) ); >> >+ VERIFY( resv == in ); >> >+ >> >+ resv = res = std::format("{:*>200?s}", lc); >> >+ VERIFY( strip_prefix(resv, 38, '*') ); >> >+ VERIFY( strip_quotes(resv) ); >> >+ VERIFY( resv == in ); >> >+} >> >+ >> >+void test_escaping() >> >+{ >> >+ std::string res; >> >+ std::string_view resv; >> >+ >> >+ const std::string_view input = >> >+ "\t\n\r\\\"" >> >+ "\u008a" // Cc, Control, Line Tabulation Set, >> >+ "\u00ad" // Cf, Format, Soft Hyphen >> >+ "\u1d3d" // Lm, Modifier letter, Modifier Letter Capital Ou >> >+ "\u00a0" // Zs, Space Separator, No-Break Space (NBSP) >> >+ "\u2029" // Zp, Paragraph Separator, Paragraph Separator >> >+ "\U0001f984" // So, Other Symbol, Unicorn Face >> >+ ; >> >+ const std::string_view output = >> >+ R"(\t\n\r\\\")" >> >+ R"(\u{8a})" >> >+ R"(\u{ad})" >> >+ "\u1d3d" >> >+ R"(\u{a0})" >> >+ R"(\u{2029})" >> >+ "\U0001f984"; >> >+ >> >+ std::forward_list<char> lc(std::from_range, input); >> >+ resv = res = std::format("{:s}", lc); >> >+ VERIFY( resv == input ); >> >+ resv = res = std::format("{:?s}", lc); >> >+ VERIFY( strip_quotes(resv) ); >> >+ VERIFY( resv == output ); >> >+ >> >+ // width is 5, size is 15 >> >+ std::string in = "\u2160\u2161\u2162\u2163\u2164"; >> >+ in += in; // width is 10, size is 30 >> >+ in += in; // width is 20, size is 60 >> >+ in 
+= in; // width is 40, size is 120 >> >+ in += in; // width is 80, size is 240 >> >+ in += in; // width is 160, size is 480 >> >+ std::string_view inv = in; >> >+ >> >+ // last character is incomplete >> >+ lc.assign_range(inv.substr(0, 479)); >> >+ >> >+ // non-debug format, chars copied as is >> >+ resv = res = std::format("{:s}", lc); >> >+ VERIFY( resv == inv.substr(0, 479) ); >> >+ >> >+ // debug-format, incomplete code-point sequence is escaped >> >+ resv = res = std::format("{:?s}", lc); >> >+ VERIFY( strip_quotes(resv) ); >> >+ VERIFY( resv.substr(0, 477) == inv.substr(0, 477) ); >> >+ resv.remove_prefix(477); >> >+ VERIFY( resv == R"(\x{e2}\x{85})" ); >> > } >> > >> > int main() >> >@@ -287,4 +374,6 @@ int main() >> > test_outputs<char>(); >> > test_outputs<wchar_t>(); >> > test_nested(); >> >+ test_padding(); >> >+ test_escaping(); >> > } >> >-- >> >2.51.0 >> > >> > >> >>
