Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> The early scheduler takes up ~33% of the total build time, however it doesn't
> provide a meaningful performance gain.  This is partly because modern OoO 
> cores
> need far less scheduling, partly because the scheduler tends to create many
> unnecessary spills by increasing register pressure.  Building applications
> 56% faster is far more useful than ~0.1% improvement on SPEC, so switch off
> early scheduling on AArch64.  Codesize reduces by ~0.2%.
>
> The combine_and_move pass runs if the scheduler is disabled and aggressively
> combines moves.  The movsf/df patterns allow all FP immediates since they
> rely on a split pattern, however splits do not happen this late.  To fix this,
> use a more accurate check that blocks creation of literal loads during
> combine_and_move.  Fix various tests that depend on scheduling by explicitly
> adding -fschedule-insns.
>
> Passes bootstrap & regress, OK for commit?

I'm in favour of this.  Obviously the numbers are what count, but
also from first principles:

- I can't remember the last time a scheduling model was added to the port.

- We've (consciously) never added scheduling types for SVE.

- It doesn't make logical sense to schedule for Neoverse V3 (say)
  as thought it were a Cortex A57.

So at this point, it seems better for scheduling to be opt-in rather
than opt-out.  (That is, we can switch to a tune-based default if
anyone does add a new scheduling model in future.)

Let's see what others think.

Please split the md changes out into a separate pre-patch though.

What do you think about disabling late scheduling as well?

Thanks,
Richard

> gcc/ChangeLog:
>         * common/config/aarch64/aarch64-common.cc: Switch off fschedule_insns.
>         * config/aarch64/aarch64.md (movhf_aarch64): Use 
> aarch64_valid_fp_move.
>         (movsf_aarch64): Likewise.
>         (movdf_aarch64): Likewise.
>         * config/aarch64/aarch64.cc (aarch64_valid_fp_move): New function.
>         * config/aarch64/aarch64-protos.h (aarch64_valid_fp_move): Likewise.
>
> gcc/testsuite/ChangeLog:
>         * testsuite/gcc.target/aarch64/ldp_aligned.c: Fix test.
>         * testsuite/gcc.target/aarch64/ldp_always.c: Likewise.
>         * testsuite/gcc.target/aarch64/ldp_stp_10.c: Add -fschedule-insns.
>         * testsuite/gcc.target/aarch64/ldp_stp_12.c: Likewise.
>         * testsuite/gcc.target/aarch64/ldp_stp_13.c: Remove test.
>         * testsuite/gcc.target/aarch64/ldp_stp_21.c: Add -fschedule-insns.
>         * testsuite/gcc.target/aarch64/ldp_stp_8.c: Likewise.
>         * testsuite/gcc.target/aarch64/ldp_vec_v2sf.c: Likewise.
>         * testsuite/gcc.target/aarch64/ldp_vec_v2si.c: Likewise.
>         * testsuite/gcc.target/aarch64/test_frame_16.c: Fix test.
>         * testsuite/gcc.target/aarch64/sve/vcond_12.c: Add -fschedule-insns.
>         * testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c: Likewise.
>
> ---
>
> diff --git a/gcc/common/config/aarch64/aarch64-common.cc 
> b/gcc/common/config/aarch64/aarch64-common.cc
> index 
> 2bfc597e333b6018970a9ee6e370a66b6d0960ef..845747e31e821c2f3970fd39ea70f046eddbe920
>  100644
> --- a/gcc/common/config/aarch64/aarch64-common.cc
> +++ b/gcc/common/config/aarch64/aarch64-common.cc
> @@ -54,6 +54,8 @@ static const struct default_options 
> aarch_option_optimization_table[] =
>      { OPT_LEVELS_ALL, OPT_fomit_frame_pointer, NULL, 0 },
>      /* Enable -fsched-pressure by default when optimizing.  */
>      { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
> +    /* Disable early scheduling due to high compile-time overheads.  */
> +    { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
>      /* Enable redundant extension instructions removal at -O2 and higher.  */
>      { OPT_LEVELS_2_PLUS, OPT_free, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_mearly_ra_, NULL, AARCH64_EARLY_RA_ALL },
> diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index 
> 250c5b96a21ea1c969a0e77e420525eec90e4de4..b30329d7f85f5b962dca43cf12ca938898425874
>  100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -758,6 +758,7 @@ bool aarch64_advsimd_struct_mode_p (machine_mode mode);
>  opt_machine_mode aarch64_vq_mode (scalar_mode);
>  opt_machine_mode aarch64_full_sve_mode (scalar_mode);
>  bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
> +bool aarch64_valid_fp_move (rtx, rtx, machine_mode);
>  bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
>  bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
>                                             HOST_WIDE_INT);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> 2647293f7cf020378dacc37b7bfbccc856573e44..965ec18412a6486e6ac4ff2e4a7d742bf61e5d75
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -11223,6 +11223,36 @@ aarch64_can_const_movi_rtx_p (rtx x, machine_mode 
> mode)
>    return aarch64_simd_valid_mov_imm (v_op);
>  }
>
> +/* Return TRUE if DST and SRC with mode MODE is a valid fp move.  */
> +bool
> +aarch64_valid_fp_move (rtx dst, rtx src, machine_mode mode)
> +{
> +  if (!TARGET_FLOAT)
> +    return false;
> +
> +  if (aarch64_reg_or_fp_zero (src, mode))
> +    return true;
> +
> +  if (!register_operand (dst, mode))
> +    return false;
> +
> +  if (MEM_P (src))
> +    return true;
> +
> +  if (!DECIMAL_FLOAT_MODE_P (mode))
> +    {
> +      if (aarch64_can_const_movi_rtx_p (src, mode)
> +         || aarch64_float_const_representable_p (src)
> +         || aarch64_float_const_zero_rtx_p (src))
> +       return true;
> +
> +      /* This requires a split which is only allowed before regalloc.  */
> +      if (aarch64_float_const_rtx_p (src))
> +       return can_create_pseudo_p () && !ira_in_progress;
> +    }
> +
> +  return can_create_pseudo_p ();
> +}
>
>  /* Return the fixed registers used for condition codes.  */
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 
> 20956fc49d8232763b127629ded17037ad7d7960..5d3fa9628952031f52474291e160b957d774b011
>  100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1644,8 +1644,7 @@ (define_expand "mov<mode>"
>  (define_insn "*mov<mode>_aarch64"
>    [(set (match_operand:HFBF 0 "nonimmediate_operand")
>         (match_operand:HFBF 1 "general_operand"))]
> -  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> -    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
> +  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
>    {@ [ cons: =0 , 1   ; attrs: type , arch  ]
>       [ w        , Y   ; neon_move   , simd  ] movi\t%0.4h, #0
>       [ w        , ?rY ; f_mcr       , fp16  ] fmov\t%h0, %w1
> @@ -1668,8 +1667,7 @@ (define_insn "*mov<mode>_aarch64"
>  (define_insn "*mov<mode>_aarch64"
>    [(set (match_operand:SFD 0 "nonimmediate_operand")
>         (match_operand:SFD 1 "general_operand"))]
> -  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> -    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
> +  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
>    {@ [ cons: =0 , 1   ; attrs: type , arch  ]
>       [ w        , Y   ; neon_move   , simd  ] movi\t%0.2s, #0
>       [ w        , ?rY ; f_mcr       , *     ] fmov\t%s0, %w1
> @@ -1689,8 +1687,7 @@ (define_insn "*mov<mode>_aarch64"
>  (define_insn "*mov<mode>_aarch64"
>    [(set (match_operand:DFD 0 "nonimmediate_operand")
>         (match_operand:DFD 1 "general_operand"))]
> -  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> -    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
> +  "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)"
>    {@ [ cons: =0 , 1   ; attrs: type , arch  ]
>       [ w        , Y   ; neon_move   , simd  ] movi\t%d0, #0
>       [ w        , ?rY ; f_mcr       , *     ] fmov\t%d0, %x1
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c 
> b/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
> index 
> 75495d71df28235b2bb2dc634c3e5121d398bac2..8ec2b0392b80d4c0d8b47a512ba291e3bade3be3
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
> @@ -14,25 +14,11 @@ TYPE ldp_aligned_##TYPE(char* ptr){ \
>      return a_0 + a_1; \
>  }
>
> -#define LDP_TEST_ADJUST_ALIGNED(TYPE) \
> -TYPE ldp_aligned_adjust_##TYPE(char* ptr){ \
> -    TYPE a_0, a_1, a_2, a_3, a_4; \
> -    TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \
> -    a_0 = arr[100]; \
> -    a_1 = arr[101]; \
> -    a_2 = arr[102]; \
> -    a_3 = arr[103]; \
> -    a_4 = arr[110]; \
> -    return a_0 + a_1 + a_2 + a_3 + a_4; \
> -}
> -
>  LDP_TEST_ALIGNED(int32_t);
>  LDP_TEST_ALIGNED(int64_t);
>  LDP_TEST_ALIGNED(v4si);
> -LDP_TEST_ADJUST_ALIGNED(int32_t);
> -LDP_TEST_ADJUST_ALIGNED(int64_t);
>
> -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 3 } } */
> -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 3 } } */
> +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */
> +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 1 } } */
>  /* { dg-final { scan-assembler-times "ldp\tq\[0-9\]+, q\[0-9\]" 1 } } */
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_always.c 
> b/gcc/testsuite/gcc.target/aarch64/ldp_always.c
> index 
> 9cada57db8947e8ace4ad0bdacc14c80ee0fe9b5..5ffb98a886ecb659bb5c7a5e7ef013cacd14ffb7
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_always.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_always.c
> @@ -24,43 +24,14 @@ TYPE ldp_unaligned_##TYPE(char* ptr){ \
>      return a_0 + a_1; \
>  }
>
> -#define LDP_TEST_ADJUST_ALIGNED(TYPE) \
> -TYPE ldp_aligned_adjust_##TYPE(char* ptr){ \
> -    TYPE a_0, a_1, a_2, a_3, a_4; \
> -    TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \
> -    a_0 = arr[100]; \
> -    a_1 = arr[101]; \
> -    a_2 = arr[102]; \
> -    a_3 = arr[103]; \
> -    a_4 = arr[110]; \
> -    return a_0 + a_1 + a_2 + a_3 + a_4; \
> -}
> -
> -#define LDP_TEST_ADJUST_UNALIGNED(TYPE) \
> -TYPE ldp_unaligned_adjust_##TYPE(char* ptr){ \
> -    TYPE a_0, a_1, a_2, a_3, a_4; \
> -    TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \
> -    TYPE *a = arr+1; \
> -    a_0 = a[100]; \
> -    a_1 = a[101]; \
> -    a_2 = a[102]; \
> -    a_3 = a[103]; \
> -    a_4 = a[110]; \
> -    return a_0 + a_1 + a_2 + a_3 + a_4; \
> -}
> -
>  LDP_TEST_ALIGNED(int32_t);
>  LDP_TEST_ALIGNED(int64_t);
>  LDP_TEST_ALIGNED(v4si);
>  LDP_TEST_UNALIGNED(int32_t);
>  LDP_TEST_UNALIGNED(int64_t);
>  LDP_TEST_UNALIGNED(v4si);
> -LDP_TEST_ADJUST_ALIGNED(int32_t);
> -LDP_TEST_ADJUST_ALIGNED(int64_t);
> -LDP_TEST_ADJUST_UNALIGNED(int32_t);
> -LDP_TEST_ADJUST_UNALIGNED(int64_t);
>
> -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 6 } } */
> -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 6 } } */
> +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 2 } } */
> +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 2 } } */
>  /* { dg-final { scan-assembler-times "ldp\tq\[0-9\]+, q\[0-9\]" 2 } } */
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c 
> b/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c
> index 
> 31f392901d2ca9e9e31cb20735fdf86eb040ee88..ac4828af76175388aa0112458476b02064c4e8fc
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
>  int
>  load (int *arr)
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c 
> b/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c
> index 
> 718e82b53f0ccfd09a19afa26ebdb88654359e33..495e199270a60f797a8de21bbe6b8a771f927f23
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
>  void
>  store_offset (int *array, int x, int y)
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c 
> b/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c
> deleted file mode 100644
> index 
> 9cc3942f153773e8ffe9bcaf07f6b32dc0d5f95e..0000000000000000000000000000000000000000
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c
> +++ /dev/null
> @@ -1,18 +0,0 @@
> -/* { dg-do compile } */
> -/* { dg-options "-O2 -mabi=ilp32" } */
> -
> -long long
> -load_long (long long int *arr)
> -{
> -  return arr[400] << 1 + arr[401] << 1 + arr[403] << 1 + arr[404] << 1;
> -}
> -
> -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, " 2 } } */
> -
> -int
> -load (int *arr)
> -{
> -  return arr[527] << 1 + arr[400] << 1 + arr[401] << 1 + arr[528] << 1;
> -}
> -
> -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]+, " 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c 
> b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
> index 
> d54c322ce860688de734721718a9c57185d4be63..ac7bc164840ddff765fe599c525aa1d62f217401
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
>  #pragma GCC target "+nosimd+fp"
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c 
> b/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c
> index 
> b25678323b85046d4a320d534be24aee429274b8..2adf151491b76fbdae8382852feefd810ab3611a
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
>  typedef float __attribute__ ((vector_size (8))) fvec;
>  typedef int __attribute__ ((vector_size (8))) ivec;
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c 
> b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
> index 
> fbdae1c6cff1aef40db644361381ce511f0be64a..7a87fe7dd0a4715230733e25acd791dcd082f360
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
>  typedef float __attribute__((vector_size(8))) vec;
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c 
> b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
> index 
> 7714cd6cd9e8fa7dc1febf484d6726d44c246408..068f53e28ce5c5d1e60105a7c2b4001fa96f5153
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
>  typedef int __attribute__((vector_size(8))) vec;
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
> index 
> 41ad0bcea00f287757dd510b21915decafbc48c1..14eacce09c0585ec2132cd5dd185626e051ca588
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2" } */
> +/* { dg-options "-O2 -fschedule-insns" } */
>
>  #include <arm_sve.h>
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c
> index 
> de650bf39e27b5cdb0f06d04b5d7948b3cc94a54..59dcc0abecf57455bb43ba47a65a2bfd3eae1929
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
> +/* { dg-options "-O2 -ftree-vectorize -ffast-math -fschedule-insns" } */
>
>  #include <stdint.h>
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c 
> b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> index 
> 28f3826adadd5eaa6486659e4d6b6d7c5960b9d2..0f67458f71856afc54741960e0ac045ad5447395
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c
> @@ -17,7 +17,7 @@ double vararg_outgoing (int x1, ...)
>    double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 = 
> x1 * 6;
>    __builtin_va_list vl;
>    __builtin_va_start (vl, x1);
> -  outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1));
> +  outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1), REP8 (1));
>    __builtin_va_end (vl);
>    return a1 + a2 + a3 + a4 + a5 + a6;
>  }

Reply via email to