On Mon, Sep 2, 2019 at 9:19 AM Gero Peterhoff <g.peterh...@t-online.de> wrote:
>
> Hello gcc team,
> I have a big problem with code generation for conditions and loop control and 
> in general.
>
> 1) Once a function is defined as inline, all attributes and pragmas are 
> ignored. There is therefore no control at all for loops within inline 
> functions.
> 2) Only (almost) always conditional jumps are generated:
> - even mini-functions do not produce cmov
> - the jump control itself is completely inefficient and therefore slow, since 
> conditional jumps are not summarized
> - The functionality is always moved to the end of a function and must return 
> to the exit point.
> All this means that a lot of codebloats are generated. It jumps around in 
> confusion. This kill any jump prediction.
> An Example for swap bits in a integer:
>
> #define bitsof(Arg)     std::size_t(sizeof(Arg) * CHAR_BIT)
> using numeric_type      = std::uint8_t;
>
> template <typename Type, typename Bits>
> constexpr bool  is_bits(const Type&, const Bits& bits)  noexcept
> {
>         return bits < bitsof(Type);
> }
>
> template <bool Numeric, typename... Args>
> constexpr bool  all_of(const Args... args)      noexcept
> {
>         static_assert
>         (
>                 sizeof...(Args) <= 
> std::size_t{std::numeric_limits<numeric_type>::max()}        &&
>                 sizeof...(Args) > 0,
>                 "invalid argument count"
>         );
>
>         if constexpr (Numeric)
>                 return (numeric_type(bool(args)) + ...) == 
> numeric_type(sizeof...(Args));
>         else
>                 return (bool(args) && ...);
> }
>
> template <size_t Test, typename Type, typename Bit>
> constexpr Type  swap(const Type& arg, const Bit& bit1, const Bit& bit2) 
> noexcept
> {
>         //      Test 0 = default
>         //      Test 1 = fold
>         //      Test 2 = numeric
>
>         using unsigned_type = std::make_unsigned_t<Type>;
>
>         auto work = [&]() constexpr noexcept -> Type
>         {
>                 unsigned_type
>                         res = arg;
>
>                 res = ((res>>bit1) ^ (res>>bit2)) & unsigned_type{1};
>                 res = (res<<bit1) | (res<<bit2);
>                 res ^= arg;
>                 return res;
>         };
>
>         if constexpr (Test == 2)
>                 return all_of<true>(bit1!=bit2, is_bits(arg, bit1), 
> is_bits(arg, bit2)) ? work() : arg;
>         else if constexpr (Test == 1)
>                 return all_of<false>(bit1!=bit2, is_bits(arg, bit1), 
> is_bits(arg, bit2)) ? work() : arg;
>         else
>                 return (bit1!=bit2 && is_bits(arg, bit1) && is_bits(arg, 
> bit2)) ? work() : arg;
> }

Couldn't get it to compile even with some fiddling.  Pleas always
provide testcases
that can be compiled without editing.

> generates:
>
> 1) default:
> 2 jumps to different targets: 20->28, 26->30
> return back 4e->28
>
> 0000000000000000 <unsigned int silent::bits::swap<0ul, unsigned int, unsigned 
> char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
>     0:  49 89 d0                mov    %rdx,%r8
>     3:  0f b6 16                movzbl (%rsi),%edx
>     6:  41 0f b6 08             movzbl (%r8),%ecx
>     a:  44 8b 07                mov    (%rdi),%r8d
>     d:  80 fa 1f                cmp    $0x1f,%dl
>    10:  0f 96 c0                setbe  %al
>    13:  38 d1                   cmp    %dl,%cl
>    15:  0f b6 f9                movzbl %cl,%edi
>    18:  40 0f 95 c6             setne  %sil
>    1c:  21 f0                   and    %esi,%eax
>    1e:  a8 01                   test   $0x1,%al
>    20:  74 06                   je     28 <unsigned int 
> silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, 
> unsigned char const&, unsigned char const&)+0x28>
>    22:  66 83 ff 1f             cmp    $0x1f,%di
>    26:  76 08                   jbe    30 <unsigned int 
> silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, 
> unsigned char const&, unsigned char const&)+0x30>
>    28:  44 89 c0                mov    %r8d,%eax
>    2b:  c3                      retq
>    2c:  0f 1f 40 00             nopl   0x0(%rax)
>    30:  c4 c2 6b f7 c0          shrx   %edx,%r8d,%eax
>    35:  c4 c2 73 f7 f0          shrx   %ecx,%r8d,%esi
>    3a:  31 f0                   xor    %esi,%eax
>    3c:  83 e0 01                and    $0x1,%eax
>    3f:  c4 e2 69 f7 d0          shlx   %edx,%eax,%edx
>    44:  c4 e2 71 f7 c8          shlx   %ecx,%eax,%ecx
>    49:  09 ca                   or     %ecx,%edx
>    4b:  41 31 d0                xor    %edx,%r8d
>    4e:  eb d8                   jmp    28 <unsigned int 
> silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, 
> unsigned char const&, unsigned char const&)+0x28>
>
>
> 2) fold-expr:
> 3 jumps with one target: 11,17,1d->3d
> ~25% faster than 1)
>
> 0000000000000000 <unsigned int silent::bits::swap<1ul, unsigned int, unsigned 
> char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
>     0:  0f b6 12                movzbl (%rdx),%edx
>     3:  0f b6 0e                movzbl (%rsi),%ecx
>     6:  44 8b 07                mov    (%rdi),%r8d
>     9:  0f b6 f2                movzbl %dl,%esi
>     c:  89 c8                   mov    %ecx,%eax
>     e:  66 39 ce                cmp    %cx,%si
>    11:  74 2a                   je     3d <unsigned int 
> silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, 
> unsigned char const&, unsigned char const&)+0x3d>
>    13:  66 83 f9 1f             cmp    $0x1f,%cx
>    17:  77 24                   ja     3d <unsigned int 
> silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, 
> unsigned char const&, unsigned char const&)+0x3d>
>    19:  66 83 fe 1f             cmp    $0x1f,%si
>    1d:  77 1e                   ja     3d <unsigned int 
> silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, 
> unsigned char const&, unsigned char const&)+0x3d>
>    1f:  c4 c2 7b f7 c8          shrx   %eax,%r8d,%ecx
>    24:  c4 c2 6b f7 f0          shrx   %edx,%r8d,%esi
>    29:  31 f1                   xor    %esi,%ecx
>    2b:  83 e1 01                and    $0x1,%ecx
>    2e:  c4 e2 79 f7 c1          shlx   %eax,%ecx,%eax
>    33:  c4 e2 69 f7 d1          shlx   %edx,%ecx,%edx
>    38:  09 d0                   or     %edx,%eax
>    3a:  41 31 c0                xor    %eax,%r8d
>    3d:  44 89 c0                mov    %r8d,%eax
>    40:  c3                      retq
>
>
> 3) numeric
> calculate condiditions by numeric:
> 1 jump to 1 target: 29->49
> ~50% faster than 1)
>
> 0000000000000000 <unsigned int silent::bits::swap<2ul, unsigned int, unsigned 
> char>(unsigned int const&, unsigned char const&, unsigned char const&)>:
>     0:  0f b6 0a                movzbl (%rdx),%ecx
>     3:  0f b6 16                movzbl (%rsi),%edx
>     6:  31 c0                   xor    %eax,%eax
>     8:  44 8b 07                mov    (%rdi),%r8d
>     b:  80 f9 1f                cmp    $0x1f,%cl
>     e:  0f 96 c0                setbe  %al
>    11:  31 f6                   xor    %esi,%esi
>    13:  80 fa 1f                cmp    $0x1f,%dl
>    16:  40 0f 96 c6             setbe  %sil
>    1a:  01 f0                   add    %esi,%eax
>    1c:  31 f6                   xor    %esi,%esi
>    1e:  38 ca                   cmp    %cl,%dl
>    20:  40 0f 95 c6             setne  %sil
>    24:  01 f0                   add    %esi,%eax
>    26:  83 f8 03                cmp    $0x3,%eax
>    29:  75 1e                   jne    49 <unsigned int 
> silent::bits::swapx<2ul, unsigned int, unsigned char>(unsigned int const&, 
> unsigned char const&, unsigned char const&)+0x49>
>    2b:  c4 c2 6b f7 c0          shrx   %edx,%r8d,%eax
>    30:  c4 c2 73 f7 f0          shrx   %ecx,%r8d,%esi
>    35:  31 f0                   xor    %esi,%eax
>    37:  83 e0 01                and    $0x1,%eax
>    3a:  c4 e2 69 f7 d0          shlx   %edx,%eax,%edx
>    3f:  c4 e2 71 f7 c8          shlx   %ecx,%eax,%ecx
>    44:  09 ca                   or     %ecx,%edx
>    46:  41 31 d0                xor    %edx,%r8d
>    49:  44 89 c0                mov    %r8d,%eax
>    4c:  c3                      retq
>
>
> but clang can more tricky with adc:
> (code by godbolt)
>
>         mov     rax, rdi
>         xor     ecx, ecx
>          cmp     sil, dl
>          setne   cl
>          cmp     sil, 64
>          adc     ecx, 0
>          cmp     dl, 64
>          adc     ecx, 0
>          cmp     ecx, 3
>          jne     .LBB0_2
>          shrx    rcx, rax, rsi
>          shrx    rdi, rax, rdx
>          xor     edi, ecx
>          and     edi, 1
>          shlx    rcx, rdi, rsi
>          shlx    rdx, rdi, rdx
>          or      rdx, rcx
>          xor     rax, rdx
> .LBB0_2:
>          ret
>
> I do not understand why you still insist on outdated function attributes, 
> rather than on optimization/loop control within a function.
>
> best regards
> Gero

Reply via email to