Hi,
this patch speeds up the push_back at -O3 significantly by making the
reallocation to be inlined by default.  _M_realloc_insert is general
insertion that takes iterator pointing to location where the value
should be inserted.  As such it contains code to move other entries around
that is quite large.

Since appending to the end of array is common operation, I think we should
have specialized code for that.  Sadly it is really hard to work out this
from IPA passes, since we basically care whether the iterator points to
the same place as the end pointer, which are both passed by reference.
This is inter-procedural value numbering that is quite out of reach.

I also added extra check making it clear that the new length of the vector
is non-zero.  This saves extra conditionals.  Again it is quite hard case
since _M_check_len seem to be able to return 0 if its parameter is 0.
This never happens here, but we are not able to propagate this early nor
at IPA stage.

Would it be OK to duplciate code as this?  The resulting code is still not quite
optimal.

Regtested on x86_64-linux, OK?

Honza

void std::vector<pair_t>::_M_realloc_append<const pair_t&> (struct vector * 
const this, const struct pair_t & __args#0)
{
  struct _Guard __guard;
  struct pair_t * __new_finish;
  struct pair_t * __old_finish;
  struct pair_t * __old_start;
  struct _Vector_impl * _1;
  long unsigned int _2;
  struct pair_t * _3;
  struct pair_t * _4;
  long int _5;
  long int _6;
  long unsigned int _7;
  long unsigned int _8;
  struct pair_t * _9;
  const size_type _13;
  struct pair_t * _16;
  struct _Vector_impl * _18;
  long int _27;
  long unsigned int _34;

  <bb 2> [local count: 1073741824]:
  _13 = std::vector<pair_t>::_M_check_len (this_11(D), 1, 
"vector::_M_realloc_append");
  if (_13 == 0)
    goto <bb 3>; [0.00%]
  else
    goto <bb 4>; [100.00%]

  <bb 3> [count: 0]:
  __builtin_unreachable ();

  <bb 4> [local count: 1073741824]:
  __old_start_14 = this_11(D)->D.26060._M_impl.D.25361._M_start;
  __old_finish_15 = this_11(D)->D.26060._M_impl.D.25361._M_finish;
  _27 = __old_finish_15 - __old_start_14;
  _18 = &MEM[(struct _Vector_base *)this_11(D)]._M_impl;
  _16 = std::__new_allocator<pair_t>::allocate (_18, _13, 0B);
  _1 = &this_11(D)->D.26060._M_impl;
  __guard ={v} {CLOBBER};
  __guard._M_alloc = _1;
  _2 = (long unsigned int) _27;
  _3 = _16 + _2;
  *_3 = *__args#0_17(D);
  if (_27 > 0)
    goto <bb 5>; [41.48%]
  else
    goto <bb 6>; [58.52%]

  <bb 5> [local count: 445388112]:
  __builtin_memmove (_16, __old_start_14, _2);

  <bb 6> [local count: 1073741824]:
  _34 = _2 + 8;
  __new_finish_19 = _16 + _34;
  __guard._M_storage = __old_start_14;
  _4 = this_11(D)->D.26060._M_impl.D.25361._M_end_of_storage;
  _5 = _4 - __old_start_14;
  _6 = _5 /[ex] 8;
  _7 = (long unsigned int) _6;
  __guard._M_len = _7;
  std::vector<pair_t>::_M_realloc_append<const pair_t&>(const 
pair_t&)::_Guard::~_Guard (&__guard);
  __guard ={v} {CLOBBER(eol)};
  this_11(D)->D.26060._M_impl.D.25361._M_start = _16;
  this_11(D)->D.26060._M_impl.D.25361._M_finish = __new_finish_19;
  _8 = _13 * 8;
  _9 = _16 + _8;
  this_11(D)->D.26060._M_impl.D.25361._M_end_of_storage = _9;
  return;

}

Notice that memmove can be memcopy and the test whether block size is non-zero 
is useless.


libstdc++-v3/ChangeLog:

        PR libstdc++/110287
        * include/bits/stl_vector.h (_M_realloc_append): New member function.
        (push_back): Use it.
        * include/bits/vector.tcc: (emplace_back): Use it.
        (_M_realloc_insert): Let compiler know that new vector size is non-zero.
        (_M_realloc_append): New member function.

diff --git a/libstdc++-v3/include/bits/stl_vector.h 
b/libstdc++-v3/include/bits/stl_vector.h
index 5e18f6eedce..973f4d7e2e9 100644
--- a/libstdc++-v3/include/bits/stl_vector.h
+++ b/libstdc++-v3/include/bits/stl_vector.h
@@ -1288,7 +1288,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
            _GLIBCXX_ASAN_ANNOTATE_GREW(1);
          }
        else
-         _M_realloc_insert(end(), __x);
+         _M_realloc_append(__x);
       }
 
 #if __cplusplus >= 201103L
@@ -1822,6 +1822,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 
       void
       _M_realloc_insert(iterator __position, const value_type& __x);
+
+      void
+      _M_realloc_append(const value_type& __x);
 #else
       // A value_type object constructed with _Alloc_traits::construct()
       // and destroyed with _Alloc_traits::destroy().
@@ -1871,6 +1874,11 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
        void
        _M_realloc_insert(iterator __position, _Args&&... __args);
 
+      template<typename... _Args>
+       _GLIBCXX20_CONSTEXPR
+       void
+       _M_realloc_append(_Args&&... __args);
+
       // Either move-construct at the end, or forward to _M_insert_aux.
       _GLIBCXX20_CONSTEXPR
       iterator
diff --git a/libstdc++-v3/include/bits/vector.tcc 
b/libstdc++-v3/include/bits/vector.tcc
index 80631d1e2a1..1306676e795 100644
--- a/libstdc++-v3/include/bits/vector.tcc
+++ b/libstdc++-v3/include/bits/vector.tcc
@@ -120,7 +120,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
            _GLIBCXX_ASAN_ANNOTATE_GREW(1);
          }
        else
-         _M_realloc_insert(end(), std::forward<_Args>(__args)...);
+         _M_realloc_append(std::forward<_Args>(__args)...);
 #if __cplusplus > 201402L
        return back();
 #endif
@@ -459,6 +459,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
 #endif
     {
       const size_type __len = _M_check_len(1u, "vector::_M_realloc_insert");
+      if (__len <= 0)
+       __builtin_unreachable ();
       pointer __old_start = this->_M_impl._M_start;
       pointer __old_finish = this->_M_impl._M_finish;
       const size_type __elems_before = __position - begin();
@@ -571,6 +573,129 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
       this->_M_impl._M_end_of_storage = __new_start + __len;
     }
 
+#if __cplusplus >= 201103L
+  template<typename _Tp, typename _Alloc>
+    template<typename... _Args>
+      _GLIBCXX20_CONSTEXPR
+      void
+      vector<_Tp, _Alloc>::
+      _M_realloc_append(_Args&&... __args)
+#else
+  template<typename _Tp, typename _Alloc>
+    void
+    vector<_Tp, _Alloc>::
+    _M_realloc_append(const _Tp& __x)
+#endif
+    {
+      const size_type __len = _M_check_len(1u, "vector::_M_realloc_append");
+      if (__len <= 0)
+       __builtin_unreachable ();
+      pointer __old_start = this->_M_impl._M_start;
+      pointer __old_finish = this->_M_impl._M_finish;
+      const size_type __elems = end() - begin();
+      pointer __new_start(this->_M_allocate(__len));
+      pointer __new_finish(__new_start);
+
+      // RAII guard for allocated storage.
+      struct _Guard
+      {
+       pointer _M_storage;         // Storage to deallocate
+       size_type _M_len;
+       _Tp_alloc_type& _M_alloc;
+
+       _GLIBCXX20_CONSTEXPR
+       _Guard(pointer __s, size_type __l, _Tp_alloc_type& __a)
+       : _M_storage(__s), _M_len(__l), _M_alloc(__a)
+       { }
+
+       _GLIBCXX20_CONSTEXPR
+       ~_Guard()
+       {
+         if (_M_storage)
+           __gnu_cxx::__alloc_traits<_Tp_alloc_type>::
+             deallocate(_M_alloc, _M_storage, _M_len);
+       }
+
+      private:
+       _Guard(const _Guard&);
+      };
+
+      {
+       _Guard __guard(__new_start, __len, _M_impl);
+
+       // The order of the three operations is dictated by the C++11
+       // case, where the moves could alter a new element belonging
+       // to the existing vector.  This is an issue only for callers
+       // taking the element by lvalue ref (see last bullet of C++11
+       // [res.on.arguments]).
+
+       // If this throws, the existing elements are unchanged.
+#if __cplusplus >= 201103L
+       _Alloc_traits::construct(this->_M_impl,
+                                std::__to_address(__new_start + __elems),
+                                std::forward<_Args>(__args)...);
+#else
+       _Alloc_traits::construct(this->_M_impl,
+                                __new_start + __elems,
+                                __x);
+#endif
+
+#if __cplusplus >= 201103L
+       if _GLIBCXX17_CONSTEXPR (_S_use_relocate())
+         {
+           // Relocation cannot throw.
+           __new_finish = _S_relocate(__old_start, __old_finish,
+                                      __new_start, _M_get_Tp_allocator());
+           ++__new_finish;
+         }
+       else
+#endif
+         {
+           // RAII type to destroy initialized elements.
+           struct _Guard_elts
+           {
+             pointer _M_first, _M_last;  // Elements to destroy
+             _Tp_alloc_type& _M_alloc;
+
+             _GLIBCXX20_CONSTEXPR
+             _Guard_elts(pointer __elt, _Tp_alloc_type& __a)
+             : _M_first(__elt), _M_last(__elt + 1), _M_alloc(__a)
+             { }
+
+             _GLIBCXX20_CONSTEXPR
+             ~_Guard_elts()
+             { std::_Destroy(_M_first, _M_last, _M_alloc); }
+
+           private:
+             _Guard_elts(const _Guard_elts&);
+           };
+
+           // Guard the new element so it will be destroyed if anything throws.
+           _Guard_elts __guard_elts(__new_start + __elems, _M_impl);
+
+           __new_finish = std::__uninitialized_move_if_noexcept_a(
+                            __old_start, __old_finish,
+                            __new_start, _M_get_Tp_allocator());
+
+           ++__new_finish;
+           // Guard everything before the new element too.
+           __guard_elts._M_first = __new_start;
+
+           // New storage has been fully initialized, destroy the old elements.
+           __guard_elts._M_first = __old_start;
+           __guard_elts._M_last = __old_finish;
+         }
+       __guard._M_storage = __old_start;
+       __guard._M_len = this->_M_impl._M_end_of_storage - __old_start;
+      }
+      // deallocate should be called before assignments to _M_impl,
+      // to avoid call-clobbering
+
+      this->_M_impl._M_start = __new_start;
+      this->_M_impl._M_finish = __new_finish;
+      this->_M_impl._M_end_of_storage = __new_start + __len;
+    }
+
   template<typename _Tp, typename _Alloc>
     _GLIBCXX20_CONSTEXPR
     void

Reply via email to