Wilco Dijkstra <wilco.dijks...@arm.com> writes: > The early scheduler takes up ~33% of the total build time, however it doesn't > provide a meaningful performance gain. This is partly because modern OoO > cores > need far less scheduling, partly because the scheduler tends to create many > unnecessary spills by increasing register pressure. Building applications > 56% faster is far more useful than ~0.1% improvement on SPEC, so switch off > early scheduling on AArch64. Codesize reduces by ~0.2%. > > The combine_and_move pass runs if the scheduler is disabled and aggressively > combines moves. The movsf/df patterns allow all FP immediates since they > rely on a split pattern, however splits do not happen this late. To fix this, > use a more accurate check that blocks creation of literal loads during > combine_and_move. Fix various tests that depend on scheduling by explicitly > adding -fschedule-insns. > > Passes bootstrap & regress, OK for commit?
I'm in favour of this. Obviously the numbers are what count, but also from first principles: - I can't remember the last time a scheduling model was added to the port. - We've (consciously) never added scheduling types for SVE. - It doesn't make logical sense to schedule for Neoverse V3 (say) as thought it were a Cortex A57. So at this point, it seems better for scheduling to be opt-in rather than opt-out. (That is, we can switch to a tune-based default if anyone does add a new scheduling model in future.) Let's see what others think. Please split the md changes out into a separate pre-patch though. What do you think about disabling late scheduling as well? Thanks, Richard > gcc/ChangeLog: > * common/config/aarch64/aarch64-common.cc: Switch off fschedule_insns. > * config/aarch64/aarch64.md (movhf_aarch64): Use > aarch64_valid_fp_move. > (movsf_aarch64): Likewise. > (movdf_aarch64): Likewise. > * config/aarch64/aarch64.cc (aarch64_valid_fp_move): New function. > * config/aarch64/aarch64-protos.h (aarch64_valid_fp_move): Likewise. > > gcc/testsuite/ChangeLog: > * testsuite/gcc.target/aarch64/ldp_aligned.c: Fix test. > * testsuite/gcc.target/aarch64/ldp_always.c: Likewise. > * testsuite/gcc.target/aarch64/ldp_stp_10.c: Add -fschedule-insns. > * testsuite/gcc.target/aarch64/ldp_stp_12.c: Likewise. > * testsuite/gcc.target/aarch64/ldp_stp_13.c: Remove test. > * testsuite/gcc.target/aarch64/ldp_stp_21.c: Add -fschedule-insns. > * testsuite/gcc.target/aarch64/ldp_stp_8.c: Likewise. > * testsuite/gcc.target/aarch64/ldp_vec_v2sf.c: Likewise. > * testsuite/gcc.target/aarch64/ldp_vec_v2si.c: Likewise. > * testsuite/gcc.target/aarch64/test_frame_16.c: Fix test. > * testsuite/gcc.target/aarch64/sve/vcond_12.c: Add -fschedule-insns. > * testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c: Likewise. > > --- > > diff --git a/gcc/common/config/aarch64/aarch64-common.cc > b/gcc/common/config/aarch64/aarch64-common.cc > index > 2bfc597e333b6018970a9ee6e370a66b6d0960ef..845747e31e821c2f3970fd39ea70f046eddbe920 > 100644 > --- a/gcc/common/config/aarch64/aarch64-common.cc > +++ b/gcc/common/config/aarch64/aarch64-common.cc > @@ -54,6 +54,8 @@ static const struct default_options > aarch_option_optimization_table[] = > { OPT_LEVELS_ALL, OPT_fomit_frame_pointer, NULL, 0 }, > /* Enable -fsched-pressure by default when optimizing. */ > { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 }, > + /* Disable early scheduling due to high compile-time overheads. */ > + { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 }, > /* Enable redundant extension instructions removal at -O2 and higher. */ > { OPT_LEVELS_2_PLUS, OPT_free, NULL, 1 }, > { OPT_LEVELS_2_PLUS, OPT_mearly_ra_, NULL, AARCH64_EARLY_RA_ALL }, > diff --git a/gcc/config/aarch64/aarch64-protos.h > b/gcc/config/aarch64/aarch64-protos.h > index > 250c5b96a21ea1c969a0e77e420525eec90e4de4..b30329d7f85f5b962dca43cf12ca938898425874 > 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -758,6 +758,7 @@ bool aarch64_advsimd_struct_mode_p (machine_mode mode); > opt_machine_mode aarch64_vq_mode (scalar_mode); > opt_machine_mode aarch64_full_sve_mode (scalar_mode); > bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode); > +bool aarch64_valid_fp_move (rtx, rtx, machine_mode); > bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT); > bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT, > HOST_WIDE_INT); > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index > 2647293f7cf020378dacc37b7bfbccc856573e44..965ec18412a6486e6ac4ff2e4a7d742bf61e5d75 > 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -11223,6 +11223,36 @@ aarch64_can_const_movi_rtx_p (rtx x, machine_mode > mode) > return aarch64_simd_valid_mov_imm (v_op); > } > > +/* Return TRUE if DST and SRC with mode MODE is a valid fp move. */ > +bool > +aarch64_valid_fp_move (rtx dst, rtx src, machine_mode mode) > +{ > + if (!TARGET_FLOAT) > + return false; > + > + if (aarch64_reg_or_fp_zero (src, mode)) > + return true; > + > + if (!register_operand (dst, mode)) > + return false; > + > + if (MEM_P (src)) > + return true; > + > + if (!DECIMAL_FLOAT_MODE_P (mode)) > + { > + if (aarch64_can_const_movi_rtx_p (src, mode) > + || aarch64_float_const_representable_p (src) > + || aarch64_float_const_zero_rtx_p (src)) > + return true; > + > + /* This requires a split which is only allowed before regalloc. */ > + if (aarch64_float_const_rtx_p (src)) > + return can_create_pseudo_p () && !ira_in_progress; > + } > + > + return can_create_pseudo_p (); > +} > > /* Return the fixed registers used for condition codes. */ > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index > 20956fc49d8232763b127629ded17037ad7d7960..5d3fa9628952031f52474291e160b957d774b011 > 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -1644,8 +1644,7 @@ (define_expand "mov<mode>" > (define_insn "*mov<mode>_aarch64" > [(set (match_operand:HFBF 0 "nonimmediate_operand") > (match_operand:HFBF 1 "general_operand"))] > - "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode) > - || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))" > + "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)" > {@ [ cons: =0 , 1 ; attrs: type , arch ] > [ w , Y ; neon_move , simd ] movi\t%0.4h, #0 > [ w , ?rY ; f_mcr , fp16 ] fmov\t%h0, %w1 > @@ -1668,8 +1667,7 @@ (define_insn "*mov<mode>_aarch64" > (define_insn "*mov<mode>_aarch64" > [(set (match_operand:SFD 0 "nonimmediate_operand") > (match_operand:SFD 1 "general_operand"))] > - "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode) > - || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))" > + "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)" > {@ [ cons: =0 , 1 ; attrs: type , arch ] > [ w , Y ; neon_move , simd ] movi\t%0.2s, #0 > [ w , ?rY ; f_mcr , * ] fmov\t%s0, %w1 > @@ -1689,8 +1687,7 @@ (define_insn "*mov<mode>_aarch64" > (define_insn "*mov<mode>_aarch64" > [(set (match_operand:DFD 0 "nonimmediate_operand") > (match_operand:DFD 1 "general_operand"))] > - "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode) > - || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))" > + "aarch64_valid_fp_move (operands[0], operands[1], <MODE>mode)" > {@ [ cons: =0 , 1 ; attrs: type , arch ] > [ w , Y ; neon_move , simd ] movi\t%d0, #0 > [ w , ?rY ; f_mcr , * ] fmov\t%d0, %x1 > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c > b/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c > index > 75495d71df28235b2bb2dc634c3e5121d398bac2..8ec2b0392b80d4c0d8b47a512ba291e3bade3be3 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_aligned.c > @@ -14,25 +14,11 @@ TYPE ldp_aligned_##TYPE(char* ptr){ \ > return a_0 + a_1; \ > } > > -#define LDP_TEST_ADJUST_ALIGNED(TYPE) \ > -TYPE ldp_aligned_adjust_##TYPE(char* ptr){ \ > - TYPE a_0, a_1, a_2, a_3, a_4; \ > - TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \ > - a_0 = arr[100]; \ > - a_1 = arr[101]; \ > - a_2 = arr[102]; \ > - a_3 = arr[103]; \ > - a_4 = arr[110]; \ > - return a_0 + a_1 + a_2 + a_3 + a_4; \ > -} > - > LDP_TEST_ALIGNED(int32_t); > LDP_TEST_ALIGNED(int64_t); > LDP_TEST_ALIGNED(v4si); > -LDP_TEST_ADJUST_ALIGNED(int32_t); > -LDP_TEST_ADJUST_ALIGNED(int64_t); > > -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 3 } } */ > -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 3 } } */ > +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */ > +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 1 } } */ > /* { dg-final { scan-assembler-times "ldp\tq\[0-9\]+, q\[0-9\]" 1 } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_always.c > b/gcc/testsuite/gcc.target/aarch64/ldp_always.c > index > 9cada57db8947e8ace4ad0bdacc14c80ee0fe9b5..5ffb98a886ecb659bb5c7a5e7ef013cacd14ffb7 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_always.c > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_always.c > @@ -24,43 +24,14 @@ TYPE ldp_unaligned_##TYPE(char* ptr){ \ > return a_0 + a_1; \ > } > > -#define LDP_TEST_ADJUST_ALIGNED(TYPE) \ > -TYPE ldp_aligned_adjust_##TYPE(char* ptr){ \ > - TYPE a_0, a_1, a_2, a_3, a_4; \ > - TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \ > - a_0 = arr[100]; \ > - a_1 = arr[101]; \ > - a_2 = arr[102]; \ > - a_3 = arr[103]; \ > - a_4 = arr[110]; \ > - return a_0 + a_1 + a_2 + a_3 + a_4; \ > -} > - > -#define LDP_TEST_ADJUST_UNALIGNED(TYPE) \ > -TYPE ldp_unaligned_adjust_##TYPE(char* ptr){ \ > - TYPE a_0, a_1, a_2, a_3, a_4; \ > - TYPE *arr = (TYPE*) ((uintptr_t)ptr & ~(2 * 8 * _Alignof(TYPE) - 1)); \ > - TYPE *a = arr+1; \ > - a_0 = a[100]; \ > - a_1 = a[101]; \ > - a_2 = a[102]; \ > - a_3 = a[103]; \ > - a_4 = a[110]; \ > - return a_0 + a_1 + a_2 + a_3 + a_4; \ > -} > - > LDP_TEST_ALIGNED(int32_t); > LDP_TEST_ALIGNED(int64_t); > LDP_TEST_ALIGNED(v4si); > LDP_TEST_UNALIGNED(int32_t); > LDP_TEST_UNALIGNED(int64_t); > LDP_TEST_UNALIGNED(v4si); > -LDP_TEST_ADJUST_ALIGNED(int32_t); > -LDP_TEST_ADJUST_ALIGNED(int64_t); > -LDP_TEST_ADJUST_UNALIGNED(int32_t); > -LDP_TEST_ADJUST_UNALIGNED(int64_t); > > -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 6 } } */ > -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 6 } } */ > +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 2 } } */ > +/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]" 2 } } */ > /* { dg-final { scan-assembler-times "ldp\tq\[0-9\]+, q\[0-9\]" 2 } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c > index > 31f392901d2ca9e9e31cb20735fdf86eb040ee88..ac4828af76175388aa0112458476b02064c4e8fc > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_10.c > @@ -1,4 +1,4 @@ > -/* { dg-options "-O2" } */ > +/* { dg-options "-O2 -fschedule-insns" } */ > > int > load (int *arr) > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c > index > 718e82b53f0ccfd09a19afa26ebdb88654359e33..495e199270a60f797a8de21bbe6b8a771f927f23 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_12.c > @@ -1,4 +1,4 @@ > -/* { dg-options "-O2" } */ > +/* { dg-options "-O2 -fschedule-insns" } */ > > void > store_offset (int *array, int x, int y) > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c > deleted file mode 100644 > index > 9cc3942f153773e8ffe9bcaf07f6b32dc0d5f95e..0000000000000000000000000000000000000000 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_13.c > +++ /dev/null > @@ -1,18 +0,0 @@ > -/* { dg-do compile } */ > -/* { dg-options "-O2 -mabi=ilp32" } */ > - > -long long > -load_long (long long int *arr) > -{ > - return arr[400] << 1 + arr[401] << 1 + arr[403] << 1 + arr[404] << 1; > -} > - > -/* { dg-final { scan-assembler-times "ldp\tx\[0-9\]+, x\[0-9\]+, " 2 } } */ > - > -int > -load (int *arr) > -{ > - return arr[527] << 1 + arr[400] << 1 + arr[401] << 1 + arr[528] << 1; > -} > - > -/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]+, " 2 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c > index > d54c322ce860688de734721718a9c57185d4be63..ac7bc164840ddff765fe599c525aa1d62f217401 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c > @@ -1,4 +1,4 @@ > -/* { dg-options "-O2" } */ > +/* { dg-options "-O2 -fschedule-insns" } */ > > #pragma GCC target "+nosimd+fp" > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c > index > b25678323b85046d4a320d534be24aee429274b8..2adf151491b76fbdae8382852feefd810ab3611a > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_8.c > @@ -1,4 +1,4 @@ > -/* { dg-options "-O2" } */ > +/* { dg-options "-O2 -fschedule-insns" } */ > > typedef float __attribute__ ((vector_size (8))) fvec; > typedef int __attribute__ ((vector_size (8))) ivec; > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c > b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c > index > fbdae1c6cff1aef40db644361381ce511f0be64a..7a87fe7dd0a4715230733e25acd791dcd082f360 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2" } */ > +/* { dg-options "-O2 -fschedule-insns" } */ > > typedef float __attribute__((vector_size(8))) vec; > > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c > b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c > index > 7714cd6cd9e8fa7dc1febf484d6726d44c246408..068f53e28ce5c5d1e60105a7c2b4001fa96f5153 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2" } */ > +/* { dg-options "-O2 -fschedule-insns" } */ > > typedef int __attribute__((vector_size(8))) vec; > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c > index > 41ad0bcea00f287757dd510b21915decafbc48c1..14eacce09c0585ec2132cd5dd185626e051ca588 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2" } */ > +/* { dg-options "-O2 -fschedule-insns" } */ > > #include <arm_sve.h> > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c > b/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c > index > de650bf39e27b5cdb0f06d04b5d7948b3cc94a54..59dcc0abecf57455bb43ba47a65a2bfd3eae1929 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_12.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ > +/* { dg-options "-O2 -ftree-vectorize -ffast-math -fschedule-insns" } */ > > #include <stdint.h> > > diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > index > 28f3826adadd5eaa6486659e4d6b6d7c5960b9d2..0f67458f71856afc54741960e0ac045ad5447395 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_16.c > @@ -17,7 +17,7 @@ double vararg_outgoing (int x1, ...) > double a1 = x1, a2 = x1 * 2, a3 = x1 * 3, a4 = x1 * 4, a5 = x1 * 5, a6 = > x1 * 6; > __builtin_va_list vl; > __builtin_va_start (vl, x1); > - outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1)); > + outgoing (vl, a1, a2, a3, a4, a5, a6, REP64 (1), REP8 (1)); > __builtin_va_end (vl); > return a1 + a2 + a3 + a4 + a5 + a6; > }