Hi, On ThunderX, a load (or store) pair instruction that accesses two words (32 bits each) is slower in some cases than two separate loads/stores. On some internal benchmarks, avoiding these pairs provides a 2-5% improvement.
This patch disables the formation of load/store pairs for SImode when tuning for ThunderX. I used the tuning flags route so that it can be overridden if needed later on, or reused if someone else wants the same behavior for their core. OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions. Thanks, Andrew Pinski ChangeLog: * config/aarch64/aarch64-tuning-flags.def (slow_ldpw): New tuning option. * config/aarch64/aarch64.c (thunderx_tunings): Enable AARCH64_EXTRA_TUNE_SLOW_LDPW. (aarch64_operands_ok_for_ldpstp): Return false if AARCH64_EXTRA_TUNE_SLOW_LDPW is set, the mode is SImode and we are not optimizing for size. (aarch64_operands_adjust_ok_for_ldpstp): Likewise.
Index: gcc/config/aarch64/aarch64-tuning-flags.def =================================================================== --- gcc/config/aarch64/aarch64-tuning-flags.def (revision 239150) +++ gcc/config/aarch64/aarch64-tuning-flags.def (working copy) @@ -29,3 +29,4 @@ AARCH64_TUNE_ to give an enum name. */ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS) +AARCH64_EXTRA_TUNING_OPTION ("slow_ldpw", SLOW_LDPW) Index: gcc/config/aarch64/aarch64.c =================================================================== --- gcc/config/aarch64/aarch64.c (revision 239150) +++ gcc/config/aarch64/aarch64.c (working copy) @@ -712,7 +712,7 @@ 0, /* max_case_values. */ 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_SLOW_LDPW) /* tune_flags. */ }; static const struct tune_params xgene1_tunings = @@ -13574,6 +13574,11 @@ enum reg_class rclass_1, rclass_2; rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2; + if (mode == SImode + && AARCH64_EXTRA_TUNE_SLOW_LDPW + && !optimize_size) + return false; + if (load) { mem_1 = operands[1]; @@ -13673,6 +13678,11 @@ rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4; rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4; + if (mode == SImode + && AARCH64_EXTRA_TUNE_SLOW_LDPW + && !optimize_size) + return false; + if (load) { reg_1 = operands[0];