Version v2 keeps GOT accesses as plain movsi/movdi until after reload, as
suggested. On its own this caused worse spilling, but improving the cost of
GOT accesses resulted in better codesize and performance gains:

Improve GOT addressing by treating the instructions as a pair.  This reduces
register pressure and improves code quality significantly.  SPECINT2017 improves
by 0.30% with -fPIC and codesize is 0.7% smaller.  Perlbench has 0.9% smaller
codesize, 1.5% fewer executed instructions and is 1.8% faster on Neoverse N1.

Passes bootstrap and regress. OK for commit?

ChangeLog:
2021-05-21  Wilco Dijkstra  <wdijk...@arm.com>

        * config/aarch64/aarch64.md (*movsi_aarch64): Split GOT accesses
        after reload.
        (*movdi_aarch64): Likewise.
        * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Delay
        splitting of GOT accesses until after reload.
        (aarch64_rtx_costs): Set rematerialization cost for GOT accesses.
        (aarch64_mov_operand_p): Accept SYMBOL_SMALL_GOT_4G symbol references.
        (aarch_macro_fusion_pair_p): Fuse GOT accesses.

---

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 641c83b479e76cbcc75b299eb7ae5f634d9db7cd..75b3caa94dd8a52342bbddbfcb73ab06a7418907 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3615,6 +3615,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 
     case SYMBOL_SMALL_GOT_4G:
       {
+       /* Don't split into ADRP/LDR until after reload - this improves
+          CSE and rematerialization of GOT accesses.  */
+       if (!reload_completed)
+         {
+           emit_insn (gen_rtx_SET (dest, imm));
+           return;
+         }
+
        /* In ILP32, the mode of dest can be either SImode or DImode,
           while the got entry is always of SImode size.  The mode of
           dest depends on how dest is used: if dest is assigned to a
@@ -13460,6 +13468,14 @@ cost_plus:
          *cost += COSTS_N_INSNS (1);
          if (speed)
            *cost += 2 * extra_cost->alu.arith;
+
+         /* Set a low rematerialization cost for GOT accesses - this blocks
+            them from being spilled and reduces register pressure.  */
+         if (aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC
+             && aarch64_classify_symbol (x, 0) == SYMBOL_SMALL_GOT_4G)
+           *cost = COSTS_N_INSNS (1) / 2;
+
+         return true;
        }
       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
               || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
@@ -19930,6 +19946,11 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
       return aarch64_simd_valid_immediate (x, NULL);
     }
 
+  /* GOT accesses are split after regalloc.  */
+  if (SYMBOL_REF_P (x)
+      && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
+    return true;
+
   x = strip_salt (x);
   if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
     return true;
@@ -23746,6 +23767,24 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
         }
     }
 
+  /* Always treat GOT accesses as a pair so they can be easily
+     identified and optimized by the linker.  */
+  if (simple_sets_p)
+    {
+      /*  We're trying to match:
+         prev (adrp) == (set (reg r1) (high (symbol_ref ("SYM"))))
+         curr (add) == (set (reg r0)
+                       (unspec [(mem (lo_sum (reg r1) (symbol_ref ("SYM"))))]
+                        UNSPEC_GOTSMALLPIC))  */
+
+      if (satisfies_constraint_Ush (SET_SRC (prev_set))
+         && REG_P (SET_DEST (prev_set))
+         && REG_P (SET_DEST (curr_set))
+         && GET_CODE (SET_SRC (curr_set)) == UNSPEC
+         && XINT (SET_SRC (curr_set), 1) == UNSPEC_GOTSMALLPIC)
+       return true;
+    }
+
   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
     {
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index abfd84526745d029ad4953eabad6dd17b159a218..2527c96576a78f2071da20721143a27adeb1551b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1283,8 +1283,11 @@ (define_insn_and_split "*movsi_aarch64"
    fmov\\t%w0, %s1
    fmov\\t%s0, %s1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);"
-  "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
SImode)
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
SImode)
+    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
+    || (reload_completed
+       && (aarch64_classify_symbolic_expression (operands[1])
+           == SYMBOL_SMALL_GOT_4G))"
    [(const_int 0)]
    "{
        aarch64_expand_mov_immediate (operands[0], operands[1]);
@@ -1319,8 +1322,11 @@ (define_insn_and_split "*movdi_aarch64"
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
-   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
DImode))
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
DImode)
+    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
+    || (reload_completed
+       && (aarch64_classify_symbolic_expression (operands[1])
+           == SYMBOL_SMALL_GOT_4G))"
    [(const_int 0)]
    "{
        aarch64_expand_mov_immediate (operands[0], operands[1]);
