Re: optimize basic_string

Jonathan Wakely Mon, 25 Nov 2024 09:26:05 -0800

On 24/11/24 01:42 +0100, Jan Hubicka wrote:

Hi,
another common source of unnecesary throw_bad_alloc calls is 
basic_string::_M_create.
   basic_string<_CharT, _Traits, _Alloc>::
   _M_create(size_type& __capacity, size_type __old_capacity)
   {
     // _GLIBCXX_RESOLVE_LIB_DEFECTS
     // 83.  String::npos vs. string::max_size()
     if (__capacity > max_size())
        std::__throw_length_error(__N("basic_string::_M_create"));


     // The below implements an exponential growth policy, necessary to
     // meet amortized linear time requirements of the library: see
     // http://gcc.gnu.org/ml/libstdc++/2001-07/msg00085.html.
     if (__capacity > __old_capacity && __capacity < 2 * __old_capacity)
        {
          __capacity = 2 * __old_capacity;
          // Never allocate a string bigger than max_size.
          if (__capacity > max_size())
            __capacity = max_size();
        }

     // NB: Need an array of char_type[__capacity], plus a terminating
     // null char_type() element.
     return _S_allocate(_M_get_allocator(), __capacity + 1);
   }

The code makes it visible that string is never bigger then max_size, however + 1
makes it one byte bigger than maximal size allowed by allocator.  I believe
this is by miscounting the 0 at the end of string in max_size function:

-      { return (_Alloc_traits::max_size(_M_get_allocator()) - 1) / 2; }
+      { return _Alloc_traits::max_size(_M_get_allocator()) / 2 - 1; }


Ah yes, good catch. That code was copied from ext/sso_string_base.h
where it has been present since r0-76546-g3ad7074772808f in 2006!

Path also adds __builtin_unreachable to express value ranges of capacity and 
length.
This makes it possible to optimize out throw when copying strings as in the 
testcase.

std::string
test (std::string &a)
{
        return a;
}

This now yields the following:

struct string test (struct string & a)
{
 size_type __siz;
 struct string & _2(D);
 char[16] * _4;
 char * _5;
 char * _7;
 char * prephitmp_10;
 char * pretmp_11;
 char _15;
 char * _18;
 char * _27;
 long unsigned int _32;

 <bb 2> [local count: 1073741824]:
 _4 = &MEM[(struct basic_string *)_2(D)].D.32970._M_local_buf;
 MEM[(struct _Alloc_hider *)_2(D)]._M_p = _4;
 _5 = MEM[(const struct basic_string *)a_3(D)]._M_dataplus._M_p;
 __siz_6 = MEM[(const struct basic_string *)a_3(D)]._M_string_length;
 if (__siz_6 > 15)
   goto <bb 3>; [33.00%]
 else
   goto <bb 4>; [67.00%]

 <bb 3> [local count: 354334800]:
 _32 = __siz_6 + 1;
 _27 = operator new (_32);
 MEM[(struct basic_string *)_2(D)]._M_dataplus._M_p = _27;
 MEM[(struct basic_string *)_2(D)].D.32970._M_allocated_capacity = __siz_6;
 goto <bb 8>; [100.00%]

 <bb 4> [local count: 719407024]:
 if (__siz_6 == 1)
   goto <bb 5>; [34.00%]
 else
   goto <bb 7>; [66.00%]

 <bb 5> [local count: 353024841]:
 _15 = MEM[(const char_type &)_5];
 MEM[(char_type &)_2(D) + 16] = _15;

 <bb 6> [local count: 350316931]:
 goto <bb 9>; [100.00%]

 <bb 7> [local count: 474808633]:
 if (__siz_6 == 0)
   goto <bb 6>; [73.78%]
 else
   goto <bb 8>; [26.22%]

 <bb 8> [local count: 334966574]:
 # _7 = PHI <_4(7), _27(3)>
 __builtin_memcpy (_7, _5, __siz_6);
 pretmp_11 = MEM[(const struct basic_string *)_2(D)]._M_dataplus._M_p;

 <bb 9> [local count: 1038308344]:
 # prephitmp_10 = PHI <_4(6), pretmp_11(8)>
 MEM[(struct basic_string *)_2(D)]._M_string_length = __siz_6;
 _18 = prephitmp_10 + __siz_6;
 MEM[(char_type &)_18] = 0;
 return _2(D);

}

.text segment is reduced from 213 bytes to 192 (saving 2 conditionals
and 2 calls to throw)

I find it funny that the code special cases a.size () == 1 to copy single byte
and a.size () == 0 to bypass memcpy call.  I would say it is job of middle-end
to optimize memcpy reasonably even for small blocks.  Was this based on some 
data
showing that many strings have size of 0 and 1 which is not visible to
compiler?


That was added as _M_copy in 2004 by r0-62838-gec61e852bc917b and when
I implemented the new std::__cxx11::basic_string I copied it, only
renaming it to _S_copy because that's our convention for static member
functions.

There's no explanation in the ChangeLog (obviously, because ChangeLog
files are useless at telling you *why* something happened), but this
comment was added:

+      // When __n = 1 way faster than the general multichar
+      // traits_type::copy/move/assign.

Back in 2004 char_traits<char>::copy called memcpy (not
__builtin_memcpy) and maybe it was badly optimized.

I think we can probably drop that "optimization" for __n == 1.

regtested x86_64-linux, OK?

Honza

libstdc++-v3/ChangeLog:

        * include/bits/basic_string.h:

gcc/testsuite/ChangeLog:

        * g++.dg/tree-ssa/string-1.C: New test.

diff --git a/gcc/testsuite/g++.dg/tree-ssa/string-1.C 
b/gcc/testsuite/g++.dg/tree-ssa/string-1.C
new file mode 100644
index 00000000000..d38c23a7628
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/string-1.C
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -std=c++20 -fdump-tree-optimized" } */
+#include <string>
+std::string
+test (std::string &a)
+{
+       return a;
+}
+/* { dg-final { scan-tree-dump-not "throw" "optimized" } } */
diff --git a/libstdc++-v3/include/bits/basic_string.h 
b/libstdc++-v3/include/bits/basic_string.h
index f5b320099b1..d754deb28ed 100644
--- a/libstdc++-v3/include/bits/basic_string.h
+++ b/libstdc++-v3/include/bits/basic_string.h
@@ -1079,20 +1079,30 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
      _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
      size_type
      size() const _GLIBCXX_NOEXCEPT
-      { return _M_string_length; }
+      {
+       size_type __siz = _M_string_length;


Please use __sz for these variables, as that's consistent with names
used elsewhere.

+       if (__siz > max_size ())
+         __builtin_unreachable ();
+       return __siz;
+      }

      ///  Returns the number of characters in the string, not including any
      ///  null-termination.
      _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
      size_type
      length() const _GLIBCXX_NOEXCEPT
-      { return _M_string_length; }
+      {
+       size_type __siz = _M_string_length;
+       if (__siz > max_size ())
+         __builtin_unreachable ();
+       return __siz;


Should this just call size() instead of repeating the definition?

+      }

      ///  Returns the size() of the largest possible %string.
      _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
      size_type
      max_size() const _GLIBCXX_NOEXCEPT
-      { return (_Alloc_traits::max_size(_M_get_allocator()) - 1) / 2; }
+      { return _Alloc_traits::max_size(_M_get_allocator()) / 2 - 1; }

      /**
       *  @brief  Resizes the %string to the specified number of characters.
@@ -1184,8 +1194,11 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
      size_type
      capacity() const _GLIBCXX_NOEXCEPT
      {
-       return _M_is_local() ? size_type(_S_local_capacity)
-                            : _M_allocated_capacity;
+       size_t __siz = _M_is_local() ? size_type(_S_local_capacity)


size_type __sz = ...

+                                    : _M_allocated_capacity;
+       if (__siz < _S_local_capacity || __siz > max_size () + 1)


Is this +1 correct?

I think it should be simply `__siz > max_size()`

The largest allocation allowed is max_size()+1 but the value stored in
_M_allocated_capacity will be just max_size() without the +1.

+         __builtin_unreachable ();
+       return __siz;
      }

      /**

Re: optimize basic_string

Reply via email to