[gcc r15-6215] i386: Add vec_fm{addsub,subadd}v2sf4 patterns [PR116979]

Jakub Jelinek via Gcc-cvs Fri, 13 Dec 2024 01:33:14 -0800

https://gcc.gnu.org/g:99b9dfaff66ca6edd534bcf0e7b943a6f816c9bf


commit r15-6215-g99b9dfaff66ca6edd534bcf0e7b943a6f816c9bf
Author: Jakub Jelinek <ja...@redhat.com>
Date:   Fri Dec 13 10:31:04 2024 +0100

    i386: Add vec_fm{addsub,subadd}v2sf4 patterns [PR116979]
    
    As mentioned in the PR, the addition of vec_addsubv2sf3 expander caused
    the testcase to be vectorized and no longer to use fma.
    The following patch adds new expanders so that it can be vectorized
    again with the alternating add/sub fma instructions.
    
    There is a bug on the SLP cost computation side which causes some
    scalar multiplication costs not to be counted, but I think the patch
    is desirable even before that is fixed; for now the testcase just
    uses -fvect-cost-model=unlimited.
    
    2024-12-13  Jakub Jelinek  <ja...@redhat.com>
    
            PR target/116979
            * config/i386/mmx.md (vec_fmaddsubv2sf4, vec_fmsubaddv2sf4): New
            define_expand patterns.
    
            * gcc.target/i386/pr116979.c: New test.

Diff:
---
 gcc/config/i386/mmx.md                   | 48 ++++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr116979.c | 24 ++++++++++++++++
 2 files changed, 72 insertions(+)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 4daaa2baa25d..d9725a4e5764 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1132,6 +1132,54 @@
   DONE;
 })
 
+(define_expand "vec_fmaddsubv2sf4"
+  [(match_operand:V2SF 0 "register_operand")
+   (match_operand:V2SF 1 "nonimmediate_operand")
+   (match_operand:V2SF 2 "nonimmediate_operand")
+   (match_operand:V2SF 3 "nonimmediate_operand")]
+  "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+   && TARGET_MMX_WITH_SSE
+   && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V4SFmode);
+  rtx op2 = gen_reg_rtx (V4SFmode);
+  rtx op1 = gen_reg_rtx (V4SFmode);
+  rtx op0 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (gen_movq_v2sf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v2sf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v2sf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_vec_fmaddsubv4sf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode));
+  DONE;
+})
+
+(define_expand "vec_fmsubaddv2sf4"
+  [(match_operand:V2SF 0 "register_operand")
+   (match_operand:V2SF 1 "nonimmediate_operand")
+   (match_operand:V2SF 2 "nonimmediate_operand")
+   (match_operand:V2SF 3 "nonimmediate_operand")]
+  "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512VL)
+   && TARGET_MMX_WITH_SSE
+   && ix86_partial_vec_fp_math"
+{
+  rtx op3 = gen_reg_rtx (V4SFmode);
+  rtx op2 = gen_reg_rtx (V4SFmode);
+  rtx op1 = gen_reg_rtx (V4SFmode);
+  rtx op0 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (gen_movq_v2sf_to_sse (op3, operands[3]));
+  emit_insn (gen_movq_v2sf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v2sf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_vec_fmsubaddv4sf4 (op0, op1, op2, op3));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode));
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel single-precision floating point comparisons
diff --git a/gcc/testsuite/gcc.target/i386/pr116979.c b/gcc/testsuite/gcc.target/i386/pr116979.c
new file mode 100644
index 000000000000..0d2a958af493
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116979.c
@@ -0,0 +1,24 @@
+/* PR target/116979 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfma -fvect-cost-model=unlimited" } */
+/* { dg-final { scan-assembler "vfmaddsub(?:132|213|231)pd" } } */
+/* { dg-final { scan-assembler "vfmaddsub(?:132|213|231)ps" { target { ! ia32 } } } } */
+
+struct S { __complex__ float f; };
+struct T { __complex__ double f; };
+
+struct S
+foo (const struct S *a, const struct S *b)
+{
+  struct S r;
+  r.f = a->f * b->f;
+  return r;
+}
+
+struct T
+bar (const struct T *a, const struct T *b)
+{
+  struct T r;
+  r.f = a->f * b->f;
+  return r;
+}

[gcc r15-6215] i386: Add vec_fm{addsub,subadd}v2sf4 patterns [PR116979]

Reply via email to