https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113701

--- Comment #2 from Uroš Bizjak <ubizjak at gmail dot com> ---
(In reply to Uroš Bizjak from comment #0)
> Following testcase:
> 
> --cut here--
> typedef unsigned __int128 U;
> 
> U f0 (U x, U y) { return x + y; }
> U f1 (U x, U y) { return x - y; }
> 
> U f2 (U x, U y) { return x | y; }
> 
> int f3 (U x, U y) { return x == y; }
> int f4 (U x, U y) { return x < y; }
> --cut here--
> 
> shows some issues with __int128 parameter passing.
> 
> gcc -O2:
> 
> f0:
>         movq    %rdx, %rax
>         movq    %rcx, %rdx
>         addq    %rdi, %rax
>         adcq    %rsi, %rdx
>         ret
> 
> f1:
>         xchgq   %rdi, %rsi
>         movq    %rdx, %r8
>         movq    %rsi, %rax
>         movq    %rdi, %rdx
>         subq    %r8, %rax
>         sbbq    %rcx, %rdx
>         ret
> 
> f2:
>         xchgq   %rdi, %rsi
>         movq    %rdx, %rax
>         movq    %rcx, %rdx
>         orq     %rsi, %rax
>         orq     %rdi, %rdx
>         ret
> 
> f3:
>         xchgq   %rdi, %rsi
>         movq    %rdx, %r8
>         movq    %rcx, %rax
>         movq    %rsi, %rdx
>         movq    %rdi, %rcx
>         xorq    %rax, %rcx
>         xorq    %r8, %rdx
>         xorl    %eax, %eax
>         orq     %rcx, %rdx
>         sete    %al
>         ret
> 
> f4:
>         xorl    %eax, %eax
>         cmpq    %rdx, %rdi
>         sbbq    %rcx, %rsi
>         setc    %al
>         ret
> 
> Functions f0 and f4 are now optimal.
> 
> Functions f1, f2 and f3 emit extra XCHG, but the swap should be propagated
> to MOV instructions instead.
> 
> The most problematic function is f3, which regressed noticeably from gcc-12.3:

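The following patch to the *cmp<dwi>_doubleword splitter fixes the f3 regression by no longer forcing the SUBREG pieces into pseudos: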

--cut here--
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index bac0a6ade67..02fed16db72 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1632,11 +1632,6 @@ (define_insn_and_split "*cmp<dwi>_doubleword"
              (set (match_dup 4) (ior:DWIH (match_dup 4) (match_dup 5)))])]
 {
   split_double_mode (<DWI>mode, &operands[0], 2, &operands[0], &operands[2]);
-  /* Placing the SUBREG pieces in pseudos helps reload.  */
-  for (int i = 0; i < 4; i++)
-    if (SUBREG_P (operands[i]))
-      operands[i] = force_reg (<MODE>mode, operands[i]);
-
   operands[4] = gen_reg_rtx (<MODE>mode);

   /* Special case comparisons against -1.  */
--cut here--

gcc -O2 with the patch applied (the extra MOVs in f3 are gone; the XCHG remains):

f3:
        xchgq   %rdi, %rsi
        xorl    %eax, %eax
        xorq    %rsi, %rdx
        xorq    %rdi, %rcx
        orq     %rcx, %rdx
        sete    %al
        ret

Reply via email to