ping

From: Wilco Dijkstra
Sent: 04 June 2021 14:44
To: Richard Sandiford <richard.sandif...@arm.com>
Cc: Kyrylo Tkachov <kyrylo.tkac...@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>
Subject: [PATCH v3] AArch64: Improve GOT addressing 
 
Hi Richard,

This merges the v1 and v2 patches and removes the spurious MEM from
ldr_got_small_si/di. It has been rebased on top of [1], and the performance
gain has now doubled.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2021-June/571708.html

Improve GOT addressing by treating the ADRP+LDR instructions as a pair: the GOT
access is kept as a single move until after reload, which reduces register
pressure and improves code quality significantly.  With -fPIC, SPECINT2017
improves by 0.6% and codesize is 0.73% smaller.  Perlbench has 0.9% smaller
codesize, executes 1.5% fewer instructions and is 1.8% faster on Neoverse N1.
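
For reference, the small-model PIC GOT access in question is the usual
two-instruction sequence below (a sketch using a hypothetical symbol "foo",
not output produced by this patch):

    adrp    x0, :got:foo                 // page address of foo's GOT entry
    ldr     x0, [x0, :got_lo12:foo]      // load foo's address from the GOT

Keeping both instructions in a single insn until after reload lets the
register allocator CSE and rematerialize the whole pair instead of keeping
the ADRP result live, and the linker can still relax the pair (see the md
comment in the patch below).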

Passes bootstrap and regress. OK for commit?

ChangeLog:
2021-06-04  Wilco Dijkstra  <wdijk...@arm.com>

        * config/aarch64/aarch64.md (movsi): Split GOT accesses after reload.
        (movdi): Likewise.
        (ldr_got_small_<mode>): Remove MEM and LO_SUM, emit ADRP+LDR GOT
        sequence.
        (ldr_got_small_sidi): Likewise.
        * config/aarch64/aarch64.c (aarch64_load_symref_appropriately): Delay
        splitting of GOT accesses until after reload. Remove tmp_reg and MEM.
        (aarch64_print_operand): Correctly print got_lo12 in L specifier.
        (aarch64_rtx_costs): Set rematerialization cost for GOT accesses.
        (aarch64_mov_operand_p): Make GOT accesses valid move operands.

---

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 08245827daa3f8199b29031e754244c078f0f500..11ea33c70fb06194fadfe94322fdfa098e5320fc 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3615,6 +3615,14 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
 
     case SYMBOL_SMALL_GOT_4G:
       {
+       /* Use movdi for GOT accesses until after reload - this improves
+          CSE and rematerialization.  */
+       if (!reload_completed)
+         {
+           emit_insn (gen_rtx_SET (dest, imm));
+           return;
+         }
+
         /* In ILP32, the mode of dest can be either SImode or DImode,
            while the got entry is always of SImode size.  The mode of
            dest depends on how dest is used: if dest is assigned to a
@@ -3624,34 +3632,21 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm,
            patterns here (two patterns for ILP32).  */
 
         rtx insn;
-       rtx mem;
-       rtx tmp_reg = dest;
         machine_mode mode = GET_MODE (dest);
 
-       if (can_create_pseudo_p ())
-         tmp_reg = gen_reg_rtx (mode);
-
-       emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
         if (mode == ptr_mode)
           {
             if (mode == DImode)
-             insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
+             insn = gen_ldr_got_small_di (dest, imm);
             else
-             insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
-
-           mem = XVECEXP (SET_SRC (insn), 0, 0);
+             insn = gen_ldr_got_small_si (dest, imm);
           }
         else
           {
             gcc_assert (mode == Pmode);
-
-           insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
-           mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
+           insn = gen_ldr_got_small_sidi (dest, imm);
           }
 
-       gcc_assert (MEM_P (mem));
-       MEM_READONLY_P (mem) = 1;
-       MEM_NOTRAP_P (mem) = 1;
         emit_insn (insn);
         return;
       }
@@ -11019,7 +11014,7 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       switch (aarch64_classify_symbolic_expression (x))
         {
         case SYMBOL_SMALL_GOT_4G:
-         asm_fprintf (asm_out_file, ":lo12:");
+         asm_fprintf (asm_out_file, ":got_lo12:");
           break;
 
         case SYMBOL_SMALL_TLSGD:
@@ -13452,6 +13447,12 @@ cost_plus:
 
     case SYMBOL_REF:
       *cost = 0;
+
+      /* Use a separate rematerialization cost for GOT accesses.  */
+      if (aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC
+         && aarch64_classify_symbol (x, 0) == SYMBOL_SMALL_GOT_4G)
+       *cost = COSTS_N_INSNS (1) / 2;
+
       return true;
 
     case HIGH:
@@ -19907,6 +19908,11 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
       return aarch64_simd_valid_immediate (x, NULL);
     }
 
+  /* GOT accesses are valid moves until after regalloc.  */
+  if (SYMBOL_REF_P (x)
+      && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
+    return true;
+
   x = strip_salt (x);
   if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
     return true;
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index abfd84526745d029ad4953eabad6dd17b159a218..30effca6f3562f6870a6cc8097750e63bb0d424d 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1283,8 +1283,11 @@ (define_insn_and_split "*movsi_aarch64"
    fmov\\t%w0, %s1
    fmov\\t%s0, %s1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);"
-  "CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
SImode)
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
SImode)
+    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
+    || (reload_completed
+       && (aarch64_classify_symbolic_expression (operands[1])
+           == SYMBOL_SMALL_GOT_4G))"
    [(const_int 0)]
    "{
        aarch64_expand_mov_immediate (operands[0], operands[1]);
@@ -1319,8 +1322,11 @@ (define_insn_and_split "*movdi_aarch64"
    fmov\\t%x0, %d1
    fmov\\t%d0, %d1
    * return aarch64_output_scalar_simd_mov_immediate (operands[1], DImode);"
-   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
DImode))
-    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+   "(CONST_INT_P (operands[1]) && !aarch64_move_imm (INTVAL (operands[1]), 
DImode)
+    && REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0])))
+    || (reload_completed
+       && (aarch64_classify_symbolic_expression (operands[1])
+           == SYMBOL_SMALL_GOT_4G))"
    [(const_int 0)]
    "{
        aarch64_expand_mov_immediate (operands[0], operands[1]);
@@ -6703,27 +6709,26 @@ (define_insn "add_losym_<mode>"
   [(set_attr "type" "alu_imm")]
 )
 
+;;; GOT accesses use a single insn so linkers can relax GOT relocations.
 (define_insn "ldr_got_small_<mode>"
   [(set (match_operand:PTR 0 "register_operand" "=r")
-       (unspec:PTR [(mem:PTR (lo_sum:PTR
-                             (match_operand:PTR 1 "register_operand" "r")
-                             (match_operand:PTR 2 "aarch64_valid_symref" "S")))]
+       (unspec:PTR [(match_operand:PTR 1 "aarch64_valid_symref" "S")]
                     UNSPEC_GOTSMALLPIC))]
   ""
-  "ldr\\t%<w>0, [%1, #:got_lo12:%c2]"
-  [(set_attr "type" "load_<ldst_sz>")]
+  "adrp\\t%0, %A1\;ldr\\t%<w>0, [%0, %L1]"
+  [(set_attr "type" "load_<ldst_sz>")
+   (set_attr "length" "8")]
 )
 
 (define_insn "ldr_got_small_sidi"
   [(set (match_operand:DI 0 "register_operand" "=r")
         (zero_extend:DI
-        (unspec:SI [(mem:SI (lo_sum:DI
-                            (match_operand:DI 1 "register_operand" "r")
-                            (match_operand:DI 2 "aarch64_valid_symref" "S")))]
+        (unspec:SI [(match_operand:DI 1 "aarch64_valid_symref" "S")]
                     UNSPEC_GOTSMALLPIC)))]
   "TARGET_ILP32"
-  "ldr\\t%w0, [%1, #:got_lo12:%c2]"
-  [(set_attr "type" "load_4")]
+  "adrp\\t%0, %A1\;ldr\\t%w0, [%0, %L1]"
+  [(set_attr "type" "load_4")
+   (set_attr "length" "8")]
 )
 
 (define_insn "ldr_got_small_28k_<mode>"
