Version v2 uses movsi/di for GOT accesses until after reload as suggested. This caused worse spilling; however, improving the costs of GOT accesses resulted in better codesize and performance gains:
Improve GOT addressing by treating the instructions as a pair. This reduces register pressure and improves code quality significantly. SPECINT2017 improves by 0.30% with -fPIC and codesize is 0.7% smaller. Perlbench has 0.9% smaller codesize, 1.5% fewer executed instructions and is 1.8% faster on Neoverse N1. Passes bootstrap and regress. OK for commit? ChangeLog: 2021-05-21 Wilco Dijkstra <wdijk...@arm.com> * config/aarch64/aarch64.md (movsi): Split GOT accesses after reload. (movdi): Likewise. * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Delay splitting of GOT accesses until after reload. (aarch64_rtx_costs): Set rematerialization cost for GOT accesses. (aarch_macro_fusion_pair_p): Fuse GOT accesses. --- diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 641c83b479e76cbcc75b299eb7ae5f634d9db7cd..75b3caa94dd8a52342bbddbfcb73ab06a7418907 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -3615,6 +3615,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm, case SYMBOL_SMALL_GOT_4G: { + /* Don't split into ADRP/LDR until after reload - this improves + CSE and rematerialization of GOT accesses. */ + if (!reload_completed) + { + emit_insn (gen_rtx_SET (dest, imm)); + return; + } + /* In ILP32, the mode of dest can be either SImode or DImode, while the got entry is always of SImode size. The mode of dest depends on how dest is used: if dest is assigned to a @@ -13460,6 +13468,14 @@ cost_plus: *cost += COSTS_N_INSNS (1); if (speed) *cost += 2 * extra_cost->alu.arith; + + /* Set a low rematerialization cost for GOT accesses - this blocks + them from being spilled and reduces register pressure. */
+ if (aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC + && aarch64_classify_symbol (x, 0) == SYMBOL_SMALL_GOT_4G) + *cost = COSTS_N_INSNS (1) / 2; + + return true; } else if (aarch64_cmodel == AARCH64_CMODEL_TINY || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC) @@ -19930,6 +19946,11 @@ aarch64_mov_operand_p (rtx x, machine_mode mode) return aarch64_simd_valid_immediate (x, NULL); } + /* GOT accesses are split after regalloc. */ + if (SYMBOL_REF_P (x) + && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G) + return true; + x = strip_salt (x); if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x)) return true; @@ -23746,6 +23767,24 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) } } + /* Always treat GOT accesses as a pair to ensure they can be easily + identified and optimized in linkers. */ + if (simple_sets_p) + { + /* We're trying to match: + prev (adrp) == (set (reg r1) (high (symbol_ref ("SYM")))) + curr (add) == (set (reg r0) + (unspec [(mem (lo_sum (reg r1) (symbol_ref ("SYM"))))] + UNSPEC_GOTSMALLPIC)) */ + + if (satisfies_constraint_Ush (SET_SRC (prev_set)) + && REG_P (SET_DEST (prev_set)) + && REG_P (SET_DEST (curr_set)) + && GET_CODE (SET_SRC (curr_set)) == UNSPEC + && XINT (SET_SRC (curr_set), 1) == UNSPEC_GOTSMALLPIC) + return true; + } + if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK)) { diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index abfd84526745d029ad4953eabad6dd17b159a218..2527c96576a78f2071da20721143a27adeb1551b 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1283,8 +1283,11 @@ (define_insn_and_split "*movsi_aarch64" fmov\\t%w0, %s1 fmov\\t%s0, %s1 * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);" - "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode) - && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" + "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), SImode)
+ && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))) + || (reload_completed + && (aarch64_classify_symbolic_expression (operands[1]) + == SYMBOL_SMALL_GOT_4G))" [(const_int 0)] "{ aarch64_expand_mov_immediate (operands[0], operands[1]); @@ -1319,8 +1322,11 @@ (define_insn_and_split "*movdi_aarch64" fmov\\t%x0, %d1 fmov\\t%d0, %d1 * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);" - "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode)) - && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" + "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), DImode) + && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))) + || (reload_completed + && (aarch64_classify_symbolic_expression (operands[1]) + == SYMBOL_SMALL_GOT_4G))" [(const_int 0)] "{ aarch64_expand_mov_immediate (operands[0], operands[1]);