Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE

Michael Riesch Fri, 15 Mar 2019 01:17:46 -0700

Hello Christoph and Gael,

I tried to extract the relevant assembly and attached it. The file eigen_forceinline.txt contains the critical part generated by default; the file eigen_inline.txt is the same but with EIGEN_STRONG_INLINE set to "inline". The compiler options are in both cases -O3 -g -DNDEBUG -fPIC -xHost -qopenmp -std=gnu++11


Thanks and regards,
Michael

PS.:


    > I think "Science" is the best fit, since mbsolve [1] "is an open-source
    > solver tool for the Maxwell-Bloch equations, which are used to model
    > light-matter interaction in nonlinear optics."

    Done!

Thank you :-)

#pragma omp for schedule(static)
  837c9b:       41 ff cf                dec    %r15d
  837c9e:       bb 01 00 00 00          mov    $0x1,%ebx
  837ca3:       44 89 ac 24 20 01 00    mov    %r13d,0x120(%rsp)
  837caa:       00
  837cab:       48 8d 3d 22 77 37 00    lea    0x377722(%rip),%rdi        # 
baf3d4 <.2.22385_2_kmpc_loc_struct_pack.1092>
  837cb2:       44 89 bc 24 24 01 00    mov    %r15d,0x124(%rsp)
  837cb9:       00
  837cba:       ba 22 00 00 00          mov    $0x22,%edx
  837cbf:       44 89 ac 24 28 01 00    mov    %r13d,0x128(%rsp)
  837cc6:       00
  837cc7:       89 9c 24 2c 01 00 00    mov    %ebx,0x12c(%rsp)
  837cce:       48 83 c4 e0             add    $0xffffffffffffffe0,%rsp
  837cd2:       48 8d 84 24 4c 01 00    lea    0x14c(%rsp),%rax
  837cd9:       00
  837cda:       48 8d 8c 24 48 01 00    lea    0x148(%rsp),%rcx
  837ce1:       00
  837ce2:       4c 8d 84 24 40 01 00    lea    0x140(%rsp),%r8
  837ce9:       00
  837cea:       48 89 04 24             mov    %rax,(%rsp)
  837cee:       4c 8d 8c 24 44 01 00    lea    0x144(%rsp),%r9
  837cf5:       00
  837cf6:       89 5c 24 08             mov    %ebx,0x8(%rsp)
  837cfa:       89 5c 24 10             mov    %ebx,0x10(%rsp)
  837cfe:       8b 70 3c                mov    0x3c(%rax),%esi
  837d01:       e8 3a 24 db ff          callq  5ea140 
<__kmpc_for_static_init_4u@plt>
  837d06:       48 83 c4 20             add    $0x20,%rsp
  837d0a:       8b 84 24 20 01 00 00    mov    0x120(%rsp),%eax
  837d11:       8b 94 24 24 01 00 00    mov    0x124(%rsp),%edx
  837d18:       41 3b c7                cmp    %r15d,%eax
  837d1b:       0f 87 ca 00 00 00       ja     837deb 
<mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbab>
  837d21:       41 3b d7                cmp    %r15d,%edx
  837d24:       44 0f 42 fa             cmovb  %edx,%r15d
            for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
  837d28:       41 3b c7                cmp    %r15d,%eax
  837d2b:       0f 87 ba 00 00 00       ja     837deb 
<mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbab>
#pragma omp for schedule(static)
  837d31:       44 2b f8                sub    %eax,%r15d
  837d34:       44 89 eb                mov    %r13d,%ebx
  837d37:       44 89 b4 24 f8 00 00    mov    %r14d,0xf8(%rsp)
  837d3e:       00
  837d3f:       41 ff c7                inc    %r15d
  837d42:       41 89 c6                mov    %eax,%r14d
                unsigned int mat_idx = m_mat_indices[i];
  837d45:       4d 8b 04 24             mov    (%r12),%r8
            for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
  837d49:       45 8d 2c 1e             lea    (%r14,%rbx,1),%r13d
  837d4d:       4d 63 ed                movslq %r13d,%r13
  EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
  837d50:       48 8d 3c 24             lea    (%rsp),%rdi
  837d54:       48 8d 74 24 40          lea    0x40(%rsp),%rsi
                unsigned int mat_idx = m_mat_indices[i];
  837d59:       49 8b 90 b8 00 00 00    mov    0xb8(%r8),%rdx
      { return *(this->_M_impl._M_start + __n); }
  837d60:       4d 8b 98 d8 00 00 00    mov    0xd8(%r8),%r11
                    (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
  837d67:       4d 8b 48 60             mov    0x60(%r8),%r9
  837d6b:       4f 8d 54 6d 00          lea    0x0(%r13,%r13,2),%r10
                unsigned int mat_idx = m_mat_indices[i];
  837d70:       42 8b 0c aa             mov    (%rdx,%r13,4),%ecx
  837d74:       48 69 d1 38 02 00 00    imul   $0x238,%rcx,%rdx
                    (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
  837d7b:       4f 8d 2c d1             lea    (%r9,%r10,8),%r13
    }

    //    static inline void
    static void
    update_density(const sim_constants &sc, density& d, real e, real *p_t) {
        d = sc.A1 * (d + sc.d_in) - sc.d_in;
  837d7f:       4c 89 6e 08             mov    %r13,0x8(%rsi)
  837d83:       49 8d 8c 13 08 02 00    lea    0x208(%r11,%rdx,1),%rcx
  837d8a:       00
  837d8b:       48 89 4e 10             mov    %rcx,0x10(%rsi)
  837d8f:       48 89 4e 20             mov    %rcx,0x20(%rsi)
  837d93:       49 8d 94 13 90 01 00    lea    0x190(%r11,%rdx,1),%rdx
  837d9a:       00
  837d9b:       48 89 16                mov    %rdx,(%rsi)
  837d9e:       e8 dd 96 db ff          callq  5f1480 
<Eigen::internal::binary_evaluator<Eigen::CwiseBinaryOp<Eigen::internal::scalar_difference_op<double,
 double>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, 
Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, 
Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 
1> const>, 0> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const>, 
Eigen::internal::IndexBased, Eigen::internal::IndexBased, double, 
double>::binary_evaluator(Eigen::CwiseBinaryOp<Eigen::internal::scalar_difference_op<double,
 double>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, 
Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, 
Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 
1> const>, 0> const, Eigen::Matrix<double, 3, 1, 0, 3, 1> const> const&)@plt>
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
  837da3:       48 8b 54 24 08          mov    0x8(%rsp),%rdx
#pragma omp for schedule(static)
  837da8:       ff c3                   inc    %ebx
  837daa:       48 8b 4c 24 30          mov    0x30(%rsp),%rcx
  837daf:       41 3b df                cmp    %r15d,%ebx
  return _mm_loadu_pd(from);
  837db2:       c5 f9 10 02             vmovupd (%rdx),%xmm0
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const 
Packet2d& b) { return _mm_sub_pd(a,b); }
  837db6:       c5 f9 5c 09             vsubpd (%rcx),%xmm0,%xmm1
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& 
from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
  837dba:       c4 c1 79 11 4d 00       vmovupd %xmm1,0x0(%r13)
  837dc0:       48 8b 74 24 08          mov    0x8(%rsp),%rsi
  837dc5:       48 8b 7c 24 30          mov    0x30(%rsp),%rdi
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const 
LhsScalar& a, const RhsScalar& b) const { return a - b; }
  837dca:       c5 fb 10 56 10          vmovsd 0x10(%rsi),%xmm2
  837dcf:       c5 eb 5c 5f 10          vsubsd 0x10(%rdi),%xmm2,%xmm3
  837dd4:       c4 c1 7b 11 5d 10       vmovsd %xmm3,0x10(%r13)
  837dda:       0f 82 65 ff ff ff       jb     837d45 
<mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xb05>
  837de0:       44 8b b4 24 f8 00 00    mov    0xf8(%rsp),%r14d
  837de7:       00
  837de8:       45 33 ed                xor    %r13d,%r13d
  837deb:       48 8d 3d e2 75 37 00    lea    0x3775e2(%rip),%rdi        # 
baf3d4 <.2.22385_2_kmpc_loc_struct_pack.1092>
  837df2:       8b b4 24 68 01 00 00    mov    0x168(%rsp),%esi
  837df9:       e8 92 a7 de ff          callq  622590 
<__kmpc_for_static_fini@plt>

#pragma omp for schedule(static)
  8468db:       41 ff cf                dec    %r15d
  8468de:       bb 01 00 00 00          mov    $0x1,%ebx
  8468e3:       44 89 ac 24 18 01 00    mov    %r13d,0x118(%rsp)
  8468ea:       00
  8468eb:       48 8d 3d a2 ab 37 00    lea    0x37aba2(%rip),%rdi        # 
bc1494 <.2.22438_2_kmpc_loc_struct_pack.1251>
  8468f2:       44 89 bc 24 1c 01 00    mov    %r15d,0x11c(%rsp)
  8468f9:       00
  8468fa:       ba 22 00 00 00          mov    $0x22,%edx
  8468ff:       44 89 ac 24 20 01 00    mov    %r13d,0x120(%rsp)
  846906:       00
  846907:       89 9c 24 24 01 00 00    mov    %ebx,0x124(%rsp)
  84690e:       48 83 c4 e0             add    $0xffffffffffffffe0,%rsp
  846912:       48 8d 84 24 44 01 00    lea    0x144(%rsp),%rax
  846919:       00
  84691a:       48 8d 8c 24 40 01 00    lea    0x140(%rsp),%rcx
  846921:       00
  846922:       4c 8d 84 24 38 01 00    lea    0x138(%rsp),%r8
  846929:       00
  84692a:       48 89 04 24             mov    %rax,(%rsp)
  84692e:       4c 8d 8c 24 3c 01 00    lea    0x13c(%rsp),%r9
  846935:       00
  846936:       89 5c 24 08             mov    %ebx,0x8(%rsp)
  84693a:       89 5c 24 10             mov    %ebx,0x10(%rsp)
  84693e:       8b 70 3c                mov    0x3c(%rax),%esi
  846941:       e8 7a a0 da ff          callq  5f09c0 
<__kmpc_for_static_init_4u@plt>
  846946:       48 83 c4 20             add    $0x20,%rsp
  84694a:       8b 84 24 18 01 00 00    mov    0x118(%rsp),%eax
  846951:       8b 94 24 1c 01 00 00    mov    0x11c(%rsp),%edx
  846958:       41 3b c7                cmp    %r15d,%eax
  84695b:       0f 87 e0 00 00 00       ja     846a41 
<mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbb1>
  846961:       41 3b d7                cmp    %r15d,%edx
  846964:       44 0f 42 fa             cmovb  %edx,%r15d
            for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
  846968:       41 3b c7                cmp    %r15d,%eax
  84696b:       0f 87 d0 00 00 00       ja     846a41 
<mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xbb1>
#pragma omp for schedule(static)
  846971:       44 2b f8                sub    %eax,%r15d
  846974:       44 89 eb                mov    %r13d,%ebx
  846977:       41 ff c7                inc    %r15d
    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
  84697a:       44 89 bc 24 e8 00 00    mov    %r15d,0xe8(%rsp)
  846981:       00
  846982:       41 89 c7                mov    %eax,%r15d
  846985:       44 89 b4 24 f0 00 00    mov    %r14d,0xf0(%rsp)
  84698c:       00
                unsigned int mat_idx = m_mat_indices[i];
  84698d:       4d 8b 04 24             mov    (%r12),%r8
            for (int i = 0; i < m_scenario->get_num_gridpoints(); i++) {
  846991:       45 8d 2c 1f             lea    (%r15,%rbx,1),%r13d
  846995:       4d 63 ed                movslq %r13d,%r13
  SrcEvaluatorType srcEvaluator(src);
  846998:       48 8d 7c 24 18          lea    0x18(%rsp),%rdi
  84699d:       48 89 7f f0             mov    %rdi,-0x10(%rdi)
    call_dense_assignment_loop(dst, src, func);
  8469a1:       48 8d 74 24 68          lea    0x68(%rsp),%rsi
                unsigned int mat_idx = m_mat_indices[i];
  8469a6:       4d 8b b0 b8 00 00 00    mov    0xb8(%r8),%r14
      { return *(this->_M_impl._M_start + __n); }
  8469ad:       4d 8b 98 d8 00 00 00    mov    0xd8(%r8),%r11
                    (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
  8469b4:       4d 8b 48 60             mov    0x60(%r8),%r9
  8469b8:       4f 8d 54 6d 00          lea    0x0(%r13,%r13,2),%r10
                unsigned int mat_idx = m_mat_indices[i];
  8469bd:       43 8b 0c ae             mov    (%r14,%r13,4),%ecx
  8469c1:       48 69 d1 38 02 00 00    imul   $0x238,%rcx,%rdx
    }

    //    static inline void
    static void
    update_density(const sim_constants &sc, density& d, real e, real *p_t) {
        d = sc.A1 * (d + sc.d_in) - sc.d_in;
  8469c8:       4d 8d b4 13 08 02 00    lea    0x208(%r11,%rdx,1),%r14
  8469cf:       00
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), 
internal::assign_op<typename Dst::Scalar,Scalar>());
  8469d0:       4c 89 76 10             mov    %r14,0x10(%rsi)
  8469d4:       49 8d 94 13 90 01 00    lea    0x190(%r11,%rdx,1),%rdx
  8469db:       00
  8469dc:       48 89 16                mov    %rdx,(%rsi)
  8469df:       48 8d 94 24 68 01 00    lea    0x168(%rsp),%rdx
  8469e6:       00
                    (m_sim_consts_qm[mat_idx], m_d[i], m_e[i], &m_p[i]);
  8469e7:       4f 8d 2c d1             lea    (%r9,%r10,8),%r13
  8469eb:       4c 89 6e 08             mov    %r13,0x8(%rsi)
  8469ef:       e8 6c cc da ff          callq  5f3660 <void 
Eigen::internal::call_dense_assignment_loop<Eigen::Matrix<double, 3, 1, 0, 3, 
1>, Eigen::Product<Eigen::Matrix<double, 3, 3, 0, 3, 3>, 
Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, double>, 
Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 1, 0, 3, 
1> const>, 1>, Eigen::internal::assign_op<double, double> 
>(Eigen::Matrix<double, 3, 1, 0, 3, 1>&, Eigen::Product<Eigen::Matrix<double, 
3, 3, 0, 3, 3>, Eigen::CwiseBinaryOp<Eigen::internal::scalar_sum_op<double, 
double>, Eigen::Matrix<double, 3, 1, 0, 3, 1> const, Eigen::Matrix<double, 3, 
1, 0, 3, 1> const>, 1> const&, Eigen::internal::assign_op<double, double> 
const&)@plt>
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
  8469f4:       48 8b 54 24 08          mov    0x8(%rsp),%rdx
#pragma omp for schedule(static)
  8469f9:       ff c3                   inc    %ebx
  SrcEvaluatorType srcEvaluator(src);
  8469fb:       4c 89 74 24 30          mov    %r14,0x30(%rsp)
  return _mm_loadu_pd(from);
  846a00:       c5 f9 10 02             vmovupd (%rdx),%xmm0
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const 
Packet2d& b) { return _mm_sub_pd(a,b); }
  846a04:       c4 c1 79 5c 0e          vsubpd (%r14),%xmm0,%xmm1
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& 
from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
  846a09:       c4 c1 79 11 4d 00       vmovupd %xmm1,0x0(%r13)
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
  846a0f:       48 8b 4c 24 08          mov    0x8(%rsp),%rcx
  846a14:       48 8b 74 24 30          mov    0x30(%rsp),%rsi
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const 
LhsScalar& a, const RhsScalar& b) const { return a - b; }
  846a19:       c5 fb 10 51 10          vmovsd 0x10(%rcx),%xmm2
  846a1e:       c5 eb 5c 5e 10          vsubsd 0x10(%rsi),%xmm2,%xmm3
  846a23:       c4 c1 7b 11 5d 10       vmovsd %xmm3,0x10(%r13)
  846a29:       3b 9c 24 e8 00 00 00    cmp    0xe8(%rsp),%ebx
  846a30:       0f 82 57 ff ff ff       jb     84698d 
<mbsolve::solver_openmp_fdtd<2u, mbsolve::lindblad_cvr_rodr>::run() const+0xafd>
  846a36:       44 8b b4 24 f0 00 00    mov    0xf0(%rsp),%r14d
  846a3d:       00
  846a3e:       45 33 ed                xor    %r13d,%r13d
  846a41:       48 8d 3d 4c aa 37 00    lea    0x37aa4c(%rip),%rdi        # 
bc1494 <.2.22438_2_kmpc_loc_struct_pack.1251>
  846a48:       8b b4 24 60 01 00 00    mov    0x160(%rsp),%esi
  846a4f:       e8 9c 24 de ff          callq  628ef0 
<__kmpc_for_static_fini@plt>

Re: [eigen] Performance difference icc <-> gcc, EIGEN_STRONG_INLINE

Reply via email to