[gcc r15-6660] AArch64: Block combine_and_move from creating FP literal loads

Wilco Dijkstra via Gcc-cvs Tue, 07 Jan 2025 10:12:52 -0800

https://gcc.gnu.org/g:45d306a835cb3f865a897dc7c04efbe1f9f46c28


commit r15-6660-g45d306a835cb3f865a897dc7c04efbe1f9f46c28
Author: Wilco Dijkstra <wilco.dijks...@arm.com>
Date:   Fri Nov 1 14:44:56 2024 +0000

    AArch64: Block combine_and_move from creating FP literal loads
    
    The IRA combine_and_move pass runs if the scheduler is disabled and aggressively
    combines moves.  The movsf/df patterns allow all FP immediates since they rely
    on a split pattern.  However splits do not happen during IRA, so the result is
    extra literal loads.  To avoid this, split early during expand and block
    creation of FP immediates that need this split.  Mark a few testcases that
    rely on late splitting as xfail.
    
    double f(void) { return 128.0; }
    
    -O2 -fno-schedule-insns gives:
    
            adrp    x0, .LC0
            ldr     d0, [x0, #:lo12:.LC0]
            ret
    
    After patch:
    
            mov     x0, 4638707616191610880
            fmov    d0, x0
            ret
    
    Passes bootstrap & regress, OK for commit?
    
    gcc:
            * config/aarch64/aarch64.md (movhf_aarch64): Use aarch64_valid_fp_move.
            (movsf_aarch64): Likewise.
            (movdf_aarch64): Likewise.
            * config/aarch64/aarch64.cc (aarch64_valid_fp_move): New function.
            * config/aarch64/aarch64-protos.h (aarch64_valid_fp_move): Likewise.
    
    gcc/testsuite:
            * gcc.target/aarch64/dbl_mov_immediate_1.c: Add xfail for -0.0.
            * gcc.target/aarch64/fmul_scvtf_1.c: Fixup test cases, add xfail,
            reduce duplication.

Diff:
---
 gcc/config/aarch64/aarch64-protos.h                |  1 +
 gcc/config/aarch64/aarch64.cc                      | 30 ++++++++++
 gcc/config/aarch64/aarch64.md                      | 50 ++++++++--------
 .../gcc.target/aarch64/dbl_mov_immediate_1.c       |  8 +--
 gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c    | 68 +++++++---------------
 5 files changed, 78 insertions(+), 79 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 70a134f0365b..fa7bc8029be0 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -857,6 +857,7 @@ opt_machine_mode aarch64_v64_mode (scalar_mode);
 opt_machine_mode aarch64_v128_mode (scalar_mode);
 opt_machine_mode aarch64_full_sve_mode (scalar_mode);
 bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
+bool aarch64_valid_fp_move (rtx, rtx, machine_mode);
 bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
 bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
                                            HOST_WIDE_INT);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d6a8e4c20952..3e700ed41e97 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -11299,6 +11299,36 @@ aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
   return aarch64_simd_valid_mov_imm (v_op);
 }
 
+/* Return TRUE if DST and SRC with mode MODE is a valid fp move.  */
+bool
+aarch64_valid_fp_move (rtx dst, rtx src, machine_mode mode)
+{
+  if (!TARGET_FLOAT)
+    return false;
+
+  if (aarch64_reg_or_fp_zero (src, mode))
+    return true;
+
+  if (!register_operand (dst, mode))
+    return false;
+
+  if (MEM_P (src))
+    return true;
+
+  if (!DECIMAL_FLOAT_MODE_P (mode))
+    {
+      if (aarch64_can_const_movi_rtx_p (src, mode)
+         || aarch64_float_const_representable_p (src)
+         || aarch64_float_const_zero_rtx_p (src))
+       return true;
+
+      /* Block FP immediates which are split during expand.  */
+      if (aarch64_float_const_rtx_p (src))
+       return false;
+    }
+
+  return can_create_pseudo_p ();
+}
 
 /* Return the fixed registers used for condition codes.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6758a1db1173..0ed3c93b379e 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1760,14 +1760,33 @@
         && ! (GET_CODE (operands[1]) == CONST_DOUBLE
              && aarch64_float_const_zero_rtx_p (operands[1])))
       operands[1] = force_reg (<MODE>mode, operands[1]);
+
+    if (!DECIMAL_FLOAT_MODE_P (<MODE>mode)
+       && GET_CODE (operands[1]) == CONST_DOUBLE
+       && can_create_pseudo_p ()
+       && !aarch64_can_const_movi_rtx_p (operands[1], <MODE>mode)
+       && !aarch64_float_const_representable_p (operands[1])
+       && !aarch64_float_const_zero_rtx_p (operands[1])
+       &&  aarch64_float_const_rtx_p (operands[1]))
+      {
+       unsigned HOST_WIDE_INT ival;
+       bool res = aarch64_reinterpret_float_as_int (operands[1], &ival);
+       gcc_assert (res);
+
+       machine_mode intmode
+         = int_mode_for_size (GET_MODE_BITSIZE (<MODE>mode), 0).require ();
+       rtx tmp = gen_reg_rtx (intmode);
+       emit_move_insn (tmp, gen_int_mode (ival, intmode));
+       emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
+       DONE;
+      }
   }
 )
 
 (define_insn "*mov<mode>_aarch64"
   [(set (match_operand:HFBF 0 "nonimmediate_operand")
        (match_operand:HFBF 1 "general_operand"))]
-  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
-    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
+  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
   {@ [ cons: =0 , 1   ; attrs: type , arch  ]
      [ w        , Y   ; neon_move   , simd  ] movi\t%0.4h, #0
      [ w        , ?rY ; f_mcr       , fp16  ] fmov\t%h0, %w1
@@ -1790,8 +1809,7 @@
 (define_insn "*mov<mode>_aarch64"
   [(set (match_operand:SFD 0 "nonimmediate_operand")
        (match_operand:SFD 1 "general_operand"))]
-  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
-    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
+  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
   {@ [ cons: =0 , 1   ; attrs: type , arch  ]
      [ w        , Y   ; neon_move   , simd  ] movi\t%0.2s, #0
      [ w        , ?rY ; f_mcr       , *     ] fmov\t%s0, %w1
@@ -1811,8 +1829,7 @@
 (define_insn "*mov<mode>_aarch64"
   [(set (match_operand:DFD 0 "nonimmediate_operand")
        (match_operand:DFD 1 "general_operand"))]
-  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
-    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
+  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
   {@ [ cons: =0 , 1   ; attrs: type , arch  ]
      [ w        , Y   ; neon_move   , simd  ] movi\t%d0, #0
      [ w        , ?rY ; f_mcr       , *     ] fmov\t%d0, %x1
@@ -1829,27 +1846,6 @@
   }
 )
 
-(define_split
-  [(set (match_operand:GPF_HF 0 "nonimmediate_operand")
-       (match_operand:GPF_HF 1 "const_double_operand"))]
-  "can_create_pseudo_p ()
-   && !aarch64_can_const_movi_rtx_p (operands[1], <MODE>mode)
-   && !aarch64_float_const_representable_p (operands[1])
-   && !aarch64_float_const_zero_rtx_p (operands[1])
-   &&  aarch64_float_const_rtx_p (operands[1])"
-  [(const_int 0)]
-  {
-    unsigned HOST_WIDE_INT ival;
-    if (!aarch64_reinterpret_float_as_int (operands[1], &ival))
-      FAIL;
-
-    rtx tmp = gen_reg_rtx (<FCVT_TARGET>mode);
-    emit_move_insn (tmp, gen_int_mode (ival, <FCVT_TARGET>mode));
-    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
-    DONE;
-  }
-)
-
 (define_insn "*mov<mode>_aarch64"
   [(set (match_operand:TFD 0
         "nonimmediate_operand" "=w,w,?r ,w ,?r,w,?w,w,m,?r,m ,m")
diff --git a/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c b/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c
index 4838f74d052e..8332035d80b9 100644
--- a/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c
@@ -48,8 +48,8 @@ double d4(void)
 
 /* { dg-final { scan-assembler-times "mov\tx\[0-9\]+, 25838523252736"       1 } } */
 /* { dg-final { scan-assembler-times "movk\tx\[0-9\]+, 0x40fe, lsl 48"      1 } } */
-/* { dg-final { scan-assembler-times "mov\tx\[0-9\]+, -9223372036854775808" 0 } } */
-/* { dg-final { scan-assembler-times {movi\tv[0-9]+.4s, #?0} 1 } } */
-/* { dg-final { scan-assembler-times {fneg\tv[0-9]+.2d, v[0-9]+.2d} 1 } } */
-/* { dg-final { scan-assembler-times "fmov\td\[0-9\]+, x\[0-9\]+"           1 } } */
+/* { dg-final { scan-assembler-times "mov\tx\[0-9\]+, -9223372036854775808" 0 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {movi\tv[0-9]+.4s, #?0} 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {fneg\tv[0-9]+.2d, v[0-9]+.2d} 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]+, x\[0-9\]+"           1 { xfail *-*-* } } } */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c b/gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c
index 8bfe06ac3e61..198ba0a574e8 100644
--- a/gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c
@@ -45,53 +45,29 @@ dulfoo##__a (unsigned long long x)          \
   return ((double) x)/(1lu << __a);            \
 }
 
-FUNC_DEFS (4)
-       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#4" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#4" 1 } } */
-       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#4" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#4" 1 } } */
+FUNC_DEFS (2)
+       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#2" 1 } } */
+       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#2" 1 } } */
+       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#2" 1 } } */
+       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#2" 1 } } */
 
-FUNC_DEFD (4)
-       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#4" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#4" 1 } } */
-       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#4" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#4" 1 } } */
-
-FUNC_DEFS (8)
-       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#8" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#8" 1 } } */
-       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#8" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#8" 1 } } */
-
-FUNC_DEFD (8)
-       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#8" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#8" 1 } } */
-       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#8" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#8" 1 } } */
-
-FUNC_DEFS (16)
-       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#16" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#16" 1 } } */
-       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#16" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#16" 1 } } */
-
-FUNC_DEFD (16)
-       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#16" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#16" 1 } } */
-       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#16" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#16" 1 } } */
+FUNC_DEFD (2)
+       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#2" 1 } } */
+       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#2" 1 } } */
+       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#2" 1 } } */
+       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#2" 1 } } */
 
 FUNC_DEFS (32)
-       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#32" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#32" 1 } } */
-       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#32" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#32" 1 } } */
+       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#32" 1 { xfail *-*-* } } } */
+       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#32" 1 { xfail *-*-* } } } */
+       /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#32" 1 { xfail *-*-* } } } */
+       /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#32" 1 { xfail *-*-* } } } */
 
 FUNC_DEFD (32)
-       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#32" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#32" 1 } } */
-       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#32" 1 } } */
-       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#32" 1 } } */
+       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#32" 1 { xfail *-*-* } } } */
+       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#32" 1 { xfail *-*-* } } } */
+       /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#32" 1 { xfail *-*-* } } } */
+       /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#32" 1 { xfail *-*-* } } } */
 
 #define FUNC_TESTS(__a, __b)                                   \
 do                                                             \
@@ -126,14 +102,10 @@ main (void)
 
        for (i = 0; i < 32; i ++)
        {
-               FUNC_TESTS (4, i);
-               FUNC_TESTS (8, i);
-               FUNC_TESTS (16, i);
+               FUNC_TESTS (2, i);
                FUNC_TESTS (32, i);
 
-               FUNC_TESTD (4, i);
-               FUNC_TESTD (8, i);
-               FUNC_TESTD (16, i);
+               FUNC_TESTD (2, i);
                FUNC_TESTD (32, i);
        }
        return 0;

[gcc r15-6660] AArch64: Block combine_and_move from creating FP literal loads

Reply via email to