[PATCH 1/3] Power10: Add PCREL_OPT load support. This patch adds support for optimizing Power10 loads of an external variable. Instead of always loading the address of the variable and then doing a subsequent load using that address, it lets the linker eliminate the separate address load when possible.
I have built compilers with and without this set of 3 patches, doing a bootstrap build and make check. There were no regressions, and the new tests passed. Can I check these patches into the master branch for GCC? Because this is new functionality, I do not intend to backport these patches to GCC 10 at this time. gcc/ 2020-08-18 Michael Meissner <meiss...@linux.ibm.com> * config.gcc (powerpc*-*-*): Add pcrel-opt.o. (rs6000*-*-*): Add pcrel-opt.o. * config/rs6000/pcrel-opt.c: New file. * config/rs6000/pcrel-opt.md: New file. * config/rs6000/predicates.md (d_form_memory): New predicate. * config/rs6000/rs6000-cpus.def (OTHER_POWER10_MASKS): Add -mpcrel-opt. (POWERPC_MASKS): Add -mpcrel-opt. * config/rs6000/rs6000-passes.def: Add PCREL_OPT pass. * config/rs6000/rs6000-protos.h (reg_to_non_prefixed): New declaration. (make_pass_pcrel_opt): New declaration. * config/rs6000/rs6000.c (rs6000_option_override_internal): Add support for -mpcrel-opt. (rs6000_delegitimize_address): Add support for PCREL_OPT addresses. (print_operand, 'r' case): New operand for PCREL_OPT. (rs6000_opt_masks): Add -mpcrel-opt. (rs6000_asm_output_opcode): Reset flag to emit the initial 'p' after use. * config/rs6000/rs6000.md (loads_extern_addr attribute): New attribute. (isa attribute): Add pcrel_opt sub-case. (enabled attribute): Add support for pcrel_opt. (pcrel_extern_addr): Set loads_extern_addr attribute. (toplevel): Include pcrel-opt.md. * config/rs6000/rs6000.opt (-mpcrel-opt): New debug option. * config/rs6000/t-rs6000 (pcrel-opt.o): Add build rule. (MD_INCLUDES): Add pcrel-opt.md. 
--- gcc/config.gcc | 6 +- gcc/config/rs6000/pcrel-opt.c | 656 ++++++++++++++++++++++++++++++++++++ gcc/config/rs6000/pcrel-opt.md | 248 ++++++++++++++ gcc/config/rs6000/predicates.md | 23 ++ gcc/config/rs6000/rs6000-cpus.def | 2 + gcc/config/rs6000/rs6000-passes.def | 8 + gcc/config/rs6000/rs6000-protos.h | 2 + gcc/config/rs6000/rs6000.c | 40 ++- gcc/config/rs6000/rs6000.md | 14 +- gcc/config/rs6000/rs6000.opt | 4 + gcc/config/rs6000/t-rs6000 | 7 +- 11 files changed, 1001 insertions(+), 9 deletions(-) create mode 100644 gcc/config/rs6000/pcrel-opt.c create mode 100644 gcc/config/rs6000/pcrel-opt.md diff --git a/gcc/config.gcc b/gcc/config.gcc index 2370368..605d743 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -505,7 +505,7 @@ or1k*-*-*) ;; powerpc*-*-*) cpu_type=rs6000 - extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o rs6000-call.o" + extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o rs6000-call.o pcrel-opt.o" extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h" extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h" extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h" @@ -520,6 +520,7 @@ powerpc*-*-*) esac extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt" target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.c \$(srcdir)/config/rs6000/rs6000-call.c" + target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/pcrel-opt.c" ;; pru-*-*) cpu_type=pru @@ -531,8 +532,9 @@ riscv*) ;; rs6000*-*-*) extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt" - extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o rs6000-call.o" + extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o rs6000-call.o pcrel-opt.o" target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/rs6000-logue.c \$(srcdir)/config/rs6000/rs6000-call.c" + target_gtfiles="$target_gtfiles \$(srcdir)/config/rs6000/pcrel-opt.c" ;; sparc*-*-*) cpu_type=sparc diff --git 
a/gcc/config/rs6000/pcrel-opt.c b/gcc/config/rs6000/pcrel-opt.c new file mode 100644 index 0000000..10b4bc4 --- /dev/null +++ b/gcc/config/rs6000/pcrel-opt.c @@ -0,0 +1,656 @@ +/* Subroutines used to support the pc-relative linker optimization. + Copyright (C) 2020 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +/* This file implements an RTL pass that looks for pc-relative loads of the + address of an external variable using the PCREL_GOT relocation and a single + load that uses that external address. If that is found we create the + PCREL_OPT relocation to possibly convert: + + pld addr_reg,var@pcrel@got(0),1 + + <possibly other insns that do not use 'addr_reg' or 'data_reg'> + + lwz data_reg,0(addr_reg) + + into: + + plwz data_reg,var@pcrel(0),1 + + <possibly other insns that do not use 'addr_reg' or 'data_reg'> + + nop + + If the variable is not defined in the main program or the code using it is + not in the main program, the linker puts the address in the .got section and + does: + + .section .got + .Lvar_got: + .dword var + + .section .text + pld addr_reg,.Lvar_got@pcrel(0),1 + + <possibly other insns that do not use 'addr_reg' or 'data_reg'> + + lwz data_reg,0(addr_reg) + + We only look for a single usage in the basic block where the external + address is loaded. 
Multiple uses or references in another basic block will + force us to not use the PCREL_OPT relocation. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "expmed.h" +#include "optabs.h" +#include "recog.h" +#include "df.h" +#include "tm_p.h" +#include "ira.h" +#include "print-tree.h" +#include "varasm.h" +#include "explow.h" +#include "expr.h" +#include "output.h" +#include "tree-pass.h" +#include "rtx-vector-builder.h" +#include "print-rtl.h" +#include "insn-attr.h" +#include "insn-codes.h" + + +// Maximum number of insns to scan between the load address and the load that +// uses that address. This can be bumped up if desired. If the insns are far +// enough away, the PCREL_OPT optimization probably does not help, since the +// load of the external address has probably completed by the time we do the +// load of the variable at that address. +const int MAX_PCREL_OPT_INSNS = 10; + +/* Next PCREL_OPT label number. */ +static unsigned int pcrel_opt_next_num; + +/* Various counters. */ +static struct { + unsigned long extern_addrs; + unsigned long loads; + unsigned long load_separation[MAX_PCREL_OPT_INSNS+1]; +} counters; + + +// Optimize a PC-relative load address to be used in a load. +// +// If the sequence of insns is safe to use the PCREL_OPT optimization (i.e. no +// additional references to the address register, the address register dies at +// the load, and no references to the load), convert insns of the form: +// +// (set (reg:DI addr) +// (symbol_ref:DI "ext_symbol")) +// +// ... +// +// (set (reg:<MODE> value) +// (mem:<MODE> (reg:DI addr))) +// +// into: +// +// (parallel [(set (reg:DI addr) +// (unspec:<MODE> [(symbol_ref:DI "ext_symbol") +// (const_int label_num) +// (const_int 0)] +// UNSPEC_PCREL_OPT_LD_ADDR)) +// (set (reg:DI data) +// (unspec:DI [(const_int 0)] +// UNSPEC_PCREL_OPT_LD_ADDR))]) +// +// ... 
+// +// (parallel [(set (reg:<MODE>) +// (unspec:<MODE> [(mem:<MODE> (reg:DI addr)) +// (reg:DI data) +// (const_int label_num)] +// UNSPEC_PCREL_OPT_LD_RELOC)) +// (clobber (reg:DI addr))]) +// +// If the register being loaded is the same register that was used to hold the +// external address, we generate the following insn instead: +// +// (set (reg:DI data) +// (unspec:DI [(symbol_ref:DI "ext_symbol") +// (const_int label_num) +// (const_int 1)] +// UNSPEC_PCREL_OPT_LD_ADDR)) +// +// In the first insn, we set both the address of the external variable, and +// mark that the variable being loaded both are created in that insn, and are +// consumed in the second insn. It doesn't matter what mode the register that +// we will ultimately do the load into, so we use DImode. We just need to mark +// that both registers may be set in the first insn, and will be used in the +// second insn. +// +// The UNSPEC_PCREL_OPT_LD_ADDR insn will generate the load address plus +// a definition of a label (.Lpcrel<n>), while the UNSPEC_PCREL_OPT_LD_RELOC +// insn will generate the .reloc to tell the linker to tie the load address and +// load using that address together. +// +// pld b,ext_symbol@got@pcrel(0),1 +// .Lpcrel1: +// +// ... +// +// .reloc .Lpcrel1-8,R_PPC64_PCREL_OPT,.-(.Lpcrel1-8) +// lwz r,0(b) +// +// If ext_symbol is defined in another object file in the main program and we +// are linking the main program, the linker will convert the above instructions +// to: +// +// plwz r,ext_symbol@got@pcrel(0),1 +// +// ... +// +// nop +// +// Return true if the PCREL_OPT load optimization succeeded. 
+ +static bool +do_pcrel_opt_load (rtx_insn *addr_insn, // insn loading address + rtx_insn *load_insn) // insn using address +{ + rtx addr_set = PATTERN (addr_insn); + rtx addr_reg = SET_DEST (addr_set); + rtx addr_symbol = SET_SRC (addr_set); + rtx load_set = single_set (load_insn); + rtx reg = SET_DEST (load_set); + rtx mem = SET_SRC (load_set); + machine_mode reg_mode = GET_MODE (reg); + machine_mode mem_mode = GET_MODE (mem); + rtx mem_inner = mem; + unsigned int reg_regno = reg_or_subregno (reg); + + // LWA is a DS format instruction, but LWZ is a D format instruction. We use + // DImode for the mode to force checking whether the bottom 2 bits are 0. + // However FPR and vector registers uses the LFIWAX instruction which is + // indexed only. + if (GET_CODE (mem) == SIGN_EXTEND && GET_MODE (XEXP (mem, 0)) == SImode) + { + if (!INT_REGNO_P (reg_regno)) + return false; + + mem_inner = XEXP (mem, 0); + mem_mode = DImode; + } + + else if (GET_CODE (mem) == SIGN_EXTEND + || GET_CODE (mem) == ZERO_EXTEND + || GET_CODE (mem) == FLOAT_EXTEND) + { + mem_inner = XEXP (mem, 0); + mem_mode = GET_MODE (mem_inner); + } + + if (!MEM_P (mem_inner)) + return false; + + // If this is LFIWAX or similar instructions that are indexed only, we can't + // do the optimization. + enum non_prefixed_form non_prefixed = reg_to_non_prefixed (reg, mem_mode); + if (non_prefixed == NON_PREFIXED_X) + return false; + + // The optimization will only work on non-prefixed offsettable loads. + rtx addr = XEXP (mem_inner, 0); + enum insn_form iform = address_to_insn_form (addr, mem_mode, non_prefixed); + if (iform != INSN_FORM_BASE_REG + && iform != INSN_FORM_D + && iform != INSN_FORM_DS + && iform != INSN_FORM_DQ) + return false; + + // Allocate a new PC-relative label, and update the load external address + // insn. 
+ // + // (parallel [(set (reg load) + // (unspec [(symbol_ref addr_symbol) + // (const_int label_num) + // (const_int 0)] + // UNSPEC_PCREL_OPT_LD_ADDR)) + // (set (reg addr) + // (unspec [(const_int 0)] + // UNSPEC_PCREL_OPT_LD_ADDR))]) + + ++pcrel_opt_next_num; + unsigned int addr_regno = reg_or_subregno (addr_reg); + rtx label_num = GEN_INT (pcrel_opt_next_num); + rtx reg_di = gen_rtx_REG (DImode, reg_regno); + + PATTERN (addr_insn) + = ((addr_regno != reg_regno) + ? gen_pcrel_opt_ld_addr (addr_reg, addr_symbol, label_num, reg_di) + : gen_pcrel_opt_ld_addr_same_reg (addr_reg, addr_symbol, label_num)); + + // Revalidate the insn, backing out of the optimization if the insn is not + // supported. + INSN_CODE (addr_insn) = recog (PATTERN (addr_insn), addr_insn, 0); + if (INSN_CODE (addr_insn) < 0) + { + PATTERN (addr_insn) = addr_set; + INSN_CODE (addr_insn) = recog (PATTERN (addr_insn), addr_insn, 0); + return false; + } + + // Update the load insn. If the mem had a sign/zero/float extend, add that + // also after doing the UNSPEC. Add an explicit clobber of the external + // address register just to make it clear that the address register dies. + // + // (parallel [(set (reg:<MODE> data) + // (unspec:<MODE> [(mem (addr_reg) + // (reg:DI data) + // (const_int label_num)] + // UNSPEC_PCREL_OPT_LD_RELOC)) + // (clobber (reg:DI addr_reg))]) + + rtvec v_load = gen_rtvec (3, mem_inner, reg_di, label_num); + rtx new_load = gen_rtx_UNSPEC (GET_MODE (mem_inner), v_load, + UNSPEC_PCREL_OPT_LD_RELOC); + + if (GET_CODE (mem) != GET_CODE (mem_inner)) + new_load = gen_rtx_fmt_e (GET_CODE (mem), reg_mode, new_load); + + rtx old_load_set = PATTERN (load_insn); + rtx new_load_set = gen_rtx_SET (reg, new_load); + rtx load_clobber = gen_rtx_CLOBBER (VOIDmode, + (addr_regno == reg_regno + ? 
gen_rtx_SCRATCH (Pmode) + : addr_reg)); + PATTERN (load_insn) + = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, new_load_set, load_clobber)); + + // Revalidate the insn, backing out of the optimization if the insn is not + // supported. + + INSN_CODE (load_insn) = recog (PATTERN (load_insn), load_insn, 0); + if (INSN_CODE (load_insn) < 0) + { + PATTERN (addr_insn) = addr_set; + INSN_CODE (addr_insn) = recog (PATTERN (addr_insn), addr_insn, 0); + + PATTERN (load_insn) = old_load_set; + INSN_CODE (load_insn) = recog (PATTERN (load_insn), load_insn, 0); + return false; + } + + return true; +} + + +/* Given an insn, find the next insn in the basic block. Stop if we find a the + end of a basic block, such as a label, call or jump, and return NULL. */ + +static rtx_insn * +next_active_insn_in_basic_block (rtx_insn *insn) +{ + insn = NEXT_INSN (insn); + + while (insn != NULL_RTX) + { + /* If the basic block ends or there is a jump of some kind, exit the + loop. */ + if (CALL_P (insn) + || JUMP_P (insn) + || JUMP_TABLE_DATA_P (insn) + || LABEL_P (insn) + || BARRIER_P (insn)) + return NULL; + + /* If this is a real insn, return it. */ + if (!insn->deleted () + && NONJUMP_INSN_P (insn) + && GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) + return insn; + + /* Loop for USE, CLOBBER, DEBUG_INSN, NOTEs. */ + insn = NEXT_INSN (insn); + } + + return NULL; +} + + +// Validate that a load is actually a single instruction that can be optimized +// with the PCREL_OPT optimization. + +static bool +is_single_instruction (rtx_insn *insn, rtx reg) +{ + if (!REG_P (reg) && !SUBREG_P (reg)) + return false; + + if (get_attr_length (insn) != 4) + return false; + + // _Decimal128 and IBM extended double are always multiple instructions. 
+ machine_mode mode = GET_MODE (reg); + if (mode == TFmode && !TARGET_IEEEQUAD) + return false; + + if (mode == TDmode || mode == IFmode) + return false; + + // Don't optimize PLQ/PSTQ instructions + unsigned int regno = reg_or_subregno (reg); + unsigned int size = GET_MODE_SIZE (mode); + if (size >= 16 && !VSX_REGNO_P (regno)) + return false; + + return true; +} + + +// Given an insn with that loads up a base register with the address of an +// external symbol, see if we can optimize it with the PCREL_OPT optimization. + +static void +do_pcrel_opt_addr (rtx_insn *addr_insn) +{ + int num_insns = 0; + + // Do some basic validation. + rtx addr_set = PATTERN (addr_insn); + if (GET_CODE (addr_set) != SET) + return; + + rtx addr_reg = SET_DEST (addr_set); + rtx addr_symbol = SET_SRC (addr_set); + + if (!base_reg_operand (addr_reg, Pmode) + || !pcrel_external_address (addr_symbol, Pmode)) + return; + + rtx_insn *insn = addr_insn; + bool looping = true; + bool had_load = false; // whether intermediate insns had a load + bool had_store = false; // whether intermediate insns had a store + bool is_load = false; // whether the current insn is a load + bool is_store = false; // whether the current insn is a store + + // Check the following insns and see if it is a load or store that uses the + // external address. If we can't do the optimization, just return. + while (looping) + { + is_load = is_store = false; + + // Don't allow too many insns between the load of the external address + // and the eventual load or store. 
+ if (++num_insns >= MAX_PCREL_OPT_INSNS) + return; + + insn = next_active_insn_in_basic_block (insn); + if (!insn) + return; + + // See if the current insn is a load or store + switch (get_attr_type (insn)) + { + // While load of the external address is a 'load' for scheduling + // purposes, it should be safe to allow loading other external + // addresses between the load of the external address we are + // currently looking at and the load or store using that address. + case TYPE_LOAD: + if (get_attr_loads_extern_addr (insn) == LOADS_EXTERN_ADDR_YES) + break; + /* fall through */ + + case TYPE_FPLOAD: + case TYPE_VECLOAD: + is_load = true; + break; + + case TYPE_STORE: + case TYPE_FPSTORE: + case TYPE_VECSTORE: + is_store = true; + break; + + // Don't do the optimization through atomic operations. + case TYPE_LOAD_L: + case TYPE_STORE_C: + case TYPE_HTM: + case TYPE_HTMSIMPLE: + return; + + default: + break; + } + + // If the external addresss register was referenced, it must also die in + // the same insn. + if (reg_referenced_p (addr_reg, PATTERN (insn))) + { + if (!dead_or_set_p (insn, addr_reg)) + return; + + looping = false; + } + + // If it dies by being set without being referenced, exit. + else if (dead_or_set_p (insn, addr_reg)) + return; + + // If it isn't the insn we want, remember if there were loads or stores. + else + { + had_load |= is_load; + had_store |= is_store; + } + } + + // If the insn does not use the external address, or the external address + // register does not die at this insn, we can't do the optimization. + if (!reg_referenced_p (addr_reg, PATTERN (insn)) + || !dead_or_set_p (insn, addr_reg)) + return; + + rtx set = single_set (insn); + if (!set) + return; + + // Optimize loads + if (is_load) + { + // If there were any stores in the insns between loading the external + // address and doing the load, turn off the optimization. 
+ if (had_store) + return; + + rtx reg = SET_DEST (set); + if (!is_single_instruction (insn, reg)) + return; + + rtx mem = SET_SRC (set); + switch (GET_CODE (mem)) + { + case MEM: + break; + + case SIGN_EXTEND: + case ZERO_EXTEND: + case FLOAT_EXTEND: + if (!MEM_P (XEXP (mem, 0))) + return; + break; + + default: + return; + } + + // If the register being loaded was used or set between the load of + // the external address and the load using the address, we can't do + // the optimization. + if (reg_used_between_p (reg, addr_insn, insn) + || reg_set_between_p (reg, addr_insn, insn)) + return; + + // Process the load in detail + if (do_pcrel_opt_load (addr_insn, insn)) + { + counters.loads++; + counters.load_separation[num_insns-1]++; + } + } + + return; +} + + +// Optimize pcrel external variable references + +static unsigned int +do_pcrel_opt_pass (function *fun) +{ + basic_block bb; + rtx_insn *insn, *curr_insn = 0; + + memset ((char *) &counters, '\0', sizeof (counters)); + + // Dataflow analysis for use-def chains. + df_set_flags (DF_RD_PRUNE_DEAD_DEFS); + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); + df_note_add_problem (); + df_analyze (); + df_set_flags (DF_DEFER_INSN_RESCAN | DF_LR_RUN_DCE); + + // Look at each basic block to see if there is a load of an external + // variable's external address, and a single load using that external + // address. 
+ FOR_ALL_BB_FN (bb, fun) + { + FOR_BB_INSNS_SAFE (bb, insn, curr_insn) + { + if (NONJUMP_INSN_P (insn) && single_set (insn) + && get_attr_loads_extern_addr (insn) == LOADS_EXTERN_ADDR_YES) + { + counters.extern_addrs++; + do_pcrel_opt_addr (insn); + } + } + } + + df_remove_problem (df_chain); + df_process_deferred_rescans (); + df_set_flags (DF_RD_PRUNE_DEAD_DEFS | DF_LR_RUN_DCE); + df_chain_add_problem (DF_UD_CHAIN); + df_note_add_problem (); + df_analyze (); + + if (dump_file) + { + if (!counters.extern_addrs) + fprintf (dump_file, "\nNo external symbols were referenced\n"); + + else + { + fprintf (dump_file, + "\n# of loads of an address of an external symbol = %lu\n", + counters.extern_addrs); + + if (!counters.loads) + fprintf (dump_file, + "\nNo PCREL_OPT load optimizations were done\n"); + + else + { + fprintf (dump_file, "# of PCREL_OPT loads = %lu\n", + counters.loads); + + fprintf (dump_file, "# of adjacent PCREL_OPT loads = %lu\n", + counters.load_separation[0]); + + for (int i = 1; i < MAX_PCREL_OPT_INSNS; i++) + { + if (counters.load_separation[i]) + fprintf (dump_file, + "# of PCREL_OPT loads separated by %d insn%s = %lu\n", + i, (i == 1) ? 
"" : "s", + counters.load_separation[i]); + } + } + } + + fprintf (dump_file, "\n"); + } + + return 0; +} + + +// Optimize pc-relative references for the new PCREL_OPT pass +const pass_data pass_data_pcrel_opt = +{ + RTL_PASS, // type + "pcrel_opt", // name + OPTGROUP_NONE, // optinfo_flags + TV_NONE, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + TODO_df_finish, // todo_flags_finish +}; + +// Pass data structures +class pcrel_opt : public rtl_opt_pass +{ +public: + pcrel_opt (gcc::context *ctxt) + : rtl_opt_pass (pass_data_pcrel_opt, ctxt) + {} + + ~pcrel_opt (void) + {} + + // opt_pass methods: + virtual bool gate (function *) + { + return (TARGET_PCREL && TARGET_PCREL_OPT && optimize); + } + + virtual unsigned int execute (function *fun) + { + return do_pcrel_opt_pass (fun); + } + + opt_pass *clone () + { + return new pcrel_opt (m_ctxt); + } +}; + +rtl_opt_pass * +make_pass_pcrel_opt (gcc::context *ctxt) +{ + return new pcrel_opt (ctxt); +} diff --git a/gcc/config/rs6000/pcrel-opt.md b/gcc/config/rs6000/pcrel-opt.md new file mode 100644 index 0000000..00a3bc4 --- /dev/null +++ b/gcc/config/rs6000/pcrel-opt.md @@ -0,0 +1,248 @@ +;; Machine description for the PCREL_OPT optimization. +;; Copyright (C) 2020 Free Software Foundation, Inc. +;; Contributed by Michael Meissner (meiss...@linux.ibm.com) + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. + +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. 
+ +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + +;; Support for the PCREL_OPT optimization. PCREL_OPT looks for instances where +;; an external variable is used only once, either for reading or for writing. +;; +;; If we are optimizing a single read, normally the code would look like: +;; +;; (set (reg:DI <ptr>) +;; (symbol_ref:DI "<extern_addr>")) # <data> is currently dead +;; +;; ... # insns do not need to be adjacent +;; +;; (set (reg:SI <data>) +;; (mem:SI (reg:DI <ptr>))) # <ptr> dies with this insn +;; +;; We transform this into: +;; +;; (parallel [(set (reg:DI <ptr>) +;; (unspec:DI [(symbol_ref:DI <extern_addr>) +;; (const_int <marker>)] +;; UNSPEC_PCREL_OPT_LD_ADDR)) +;; (set (reg:DI <data>) +;; (unspec:DI [(const_int 0)] +;; UNSPEC_PCREL_OPT_LD_ADDR))]) +;; +;; ... +;; +;; (parallel [(set (reg:SI <data>) +;; (unspec:SI [(mem:SI (reg:DI <ptr>)) +;; (reg:DI <data>) +;; (const_int <marker>)] +;; UNSPEC_PCREL_OPT_LD_RELOC)) +;; (clobber (reg:DI <ptr>))]) +;; +;; The marker is an integer constant that links the load of the external +;; address to the load of the actual variable. +;; +;; In the first insn, we set the address of the external variable and also mark +;; the register that will hold the loaded value as set, so both are created in +;; that insn and consumed in the second insn. The mode of the register we will +;; ultimately load into does not matter, so we use DImode. We just need to mark +;; that both registers may be set in the first insn, and will be used in the +;; second insn. +;; +;; Since we use UNSPEC's and link both the register holding the external +;; address and the value being loaded, it should prevent other passes from +;; modifying it. +;; +;; If the register being loaded is the same as the base register, we use an +;; alternate form of the insns. 
+;; +;; (set (reg:DI <data_ptr>) +;; (unspec:DI [(symbol_ref:DI <extern_addr>) +;; (const_int <marker>)] +;; UNSPEC_PCREL_OPT_LD_ADDR_SAME_REG)) +;; +;; ... +;; +;; (parallel [(set (reg:SI <data>) +;; (unspec:SI [(mem:SI (reg:DI <ptr>)) +;; (reg:DI <data>) +;; (const_int <marker>)] +;; UNSPEC_PCREL_OPT_LD)) +;; (clobber (reg:DI <ptr>))]) + +(define_c_enum "unspec" + [UNSPEC_PCREL_OPT_LD_ADDR + UNSPEC_PCREL_OPT_LD_ADDR_SAME_REG + UNSPEC_PCREL_OPT_LD_RELOC]) + +;; Modes that are supported for PCREL_OPT +(define_mode_iterator PO [QI HI SI DI TI SF DF KF + V1TI V2DI V4SI V8HI V16QI V2DF V4SF + (TF "TARGET_FLOAT128_TYPE && TARGET_IEEEQUAD")]) + +;; Vector modes for PCREL_OPT +(define_mode_iterator PO_VECT [TI KF V1TI V2DI V4SI V8HI V16QI V2DF V4SF + (TF "TARGET_FLOAT128_TYPE && TARGET_IEEEQUAD")]) + +;; Insn for loading the external address, where the register being loaded is not +;; the same as the register being loaded with the data. +(define_insn "pcrel_opt_ld_addr" + [(set (match_operand:DI 0 "base_reg_operand" "=&b,&b") + (unspec:DI [(match_operand:DI 1 "pcrel_external_address") + (match_operand 2 "const_int_operand" "n,n")] + UNSPEC_PCREL_OPT_LD_ADDR)) + (set (match_operand:DI 3 "gpc_reg_operand" "=r,wa") + (unspec:DI [(const_int 0)] + UNSPEC_PCREL_OPT_LD_ADDR))] + "TARGET_PCREL_OPT + && reg_or_subregno (operands[0]) != reg_or_subregno (operands[3])" + "ld %0,%a1\n.Lpcrel%2:" + [(set_attr "prefixed" "yes") + (set_attr "type" "load") + (set_attr "isa" "pcrel_opt") + (set_attr "loads_extern_addr" "yes")]) + +;; Alternate form of loading up the external address that is the same register +;; as the final load. 
+(define_insn "pcrel_opt_ld_addr_same_reg" + [(set (match_operand:DI 0 "base_reg_operand" "=b") + (unspec:DI [(match_operand:DI 1 "pcrel_external_address") + (match_operand 2 "const_int_operand" "n")] + UNSPEC_PCREL_OPT_LD_ADDR_SAME_REG))] + "TARGET_PCREL_OPT" + "ld %0,%a1\n.Lpcrel%2:" + [(set_attr "prefixed" "yes") + (set_attr "type" "load") + (set_attr "isa" "pcrel_opt") + (set_attr "loads_extern_addr" "yes")]) + +;; PCREL_OPT modes that are optimized for loading or storing GPRs. +(define_mode_iterator PO_GPR [QI HI SI DI SF DF]) + +(define_mode_attr PO_GPR_LD [(QI "lbz") + (HI "lhz") + (SI "lwz") + (SF "lwz") + (DI "ld") + (DF "ld")]) + +;; PCREL_OPT load operation of GPRs. Operand 4 (the register used to hold the +;; address of the external symbol) is SCRATCH if the same register is used for +;; the normal load. +(define_insn "*pcrel_opt_ld<mode>_gpr" + [(parallel [(set (match_operand:PO_GPR 0 "int_reg_operand" "+r") + (unspec:PO_GPR [(match_operand:PO_GPR 1 "d_form_memory" "o") + (match_operand:DI 2 "int_reg_operand" "0") + (match_operand 3 "const_int_operand" "n")] + UNSPEC_PCREL_OPT_LD_RELOC)) + (clobber (match_scratch:DI 4 "=bX"))])] + "TARGET_PCREL_OPT + && (GET_CODE (operands[4]) == SCRATCH + || reg_mentioned_p (operands[4], operands[1]))" + "%r3<PO_GPR_LD> %0,%1" + [(set_attr "type" "load") + (set_attr "isa" "pcrel_opt")]) + +;; PCREL_OPT load with sign/zero extension +(define_insn "*pcrel_opt_ldsi_<u><mode>_gpr" + [(set (match_operand:EXTSI 0 "int_reg_operand" "+r") + (any_extend:EXTSI + (unspec:SI [(match_operand:SI 1 "d_form_memory" "o") + (match_operand:DI 2 "int_reg_operand" "0") + (match_operand 3 "const_int_operand" "n")] + UNSPEC_PCREL_OPT_LD_RELOC))) + (clobber (match_scratch:DI 4 "=bX"))] + "TARGET_PCREL_OPT" + "%r3lw<az> %0,%1" + [(set_attr "type" "load") + (set_attr "isa" "pcrel_opt")]) + +(define_insn "*pcrel_opt_ldhi_<u><mode>_gpr" + [(set (match_operand:EXTHI 0 "int_reg_operand" "+r") + (any_extend:EXTHI + (unspec:HI [(match_operand:HI 1 
"d_form_memory" "o") + (match_operand:DI 2 "int_reg_operand" "0") + (match_operand 3 "const_int_operand" "n")] + UNSPEC_PCREL_OPT_LD_RELOC))) + (clobber (match_scratch:DI 4 "=bX"))] + "TARGET_PCREL_OPT" + "%r3lh<az> %0,%1" + [(set_attr "type" "load") + (set_attr "isa" "pcrel_opt")]) + +(define_insn "*pcrel_opt_ldqi_u<mode>_gpr" + [(set (match_operand:EXTQI 0 "int_reg_operand" "+r") + (zero_extend:EXTQI + (unspec:QI [(match_operand:QI 1 "d_form_memory" "o") + (match_operand:DI 2 "int_reg_operand" "0") + (match_operand 3 "const_int_operand" "n")] + UNSPEC_PCREL_OPT_LD_RELOC))) + (clobber (match_scratch:DI 4 "=bX"))] + "TARGET_PCREL_OPT" + "%r3lbz %0,%1" + [(set_attr "type" "load") + (set_attr "isa" "pcrel_opt")]) + +;; Scalar types that can be optimized by loading them into floating point +;; or Altivec registers. +(define_mode_iterator PO_FP [DI DF SF]) + +;; Load instructions to load up scalar floating point or 64-bit integer values +;; into floating point registers or Altivec registers. +(define_mode_attr PO_FPR_LD [(DI "lfd") (DF "lfd") (SF "lfs")]) +(define_mode_attr PO_AVX_LD [(DI "lxsd") (DF "lxsd") (SF "lxssp")]) + +;; PCREL_OPT load operation of scalar DF/DI/SF into vector registers. +(define_insn "*pcrel_opt_ld<mode>_vsx" + [(set (match_operand:PO_FP 0 "vsx_register_operand" "+d,v") + (unspec:PO_FP [(match_operand:PO_FP 1 "d_form_memory" "o,o") + (match_operand:DI 2 "vsx_register_operand" "0,0") + (match_operand 3 "const_int_operand" "n,n")] + UNSPEC_PCREL_OPT_LD_RELOC)) + (clobber (match_operand:DI 4 "base_reg_operand" "=b,b"))] + "TARGET_PCREL_OPT" + "@ + %r3<PO_FPR_LD> %0,%1 + %r3<PO_AVX_LD> %0,%1" + [(set_attr "type" "fpload") + (set_attr "isa" "pcrel_opt")]) + +;; PCREL_OPT optimization extending SFmode to DFmode via a load. 
+(define_insn "*pcrel_opt_ldsf_df" + [(set (match_operand:DF 0 "vsx_register_operand" "+d,v") + (float_extend:DF + (unspec:SF [(match_operand:SF 1 "d_form_memory" "o,o") + (match_operand:DI 2 "vsx_register_operand" "0,0") + (match_operand 3 "const_int_operand" "n,n")] + UNSPEC_PCREL_OPT_LD_RELOC))) + (clobber (match_operand:DI 4 "base_reg_operand" "=b,b"))] + "TARGET_PCREL_OPT" + "@ + %r3lfs %0,%1 + %r3lxssp %0,%1" + [(set_attr "type" "fpload") + (set_attr "isa" "pcrel_opt")]) + +;; PCREL_OPT load operation of vector/float128 types into vector registers. +(define_insn "*pcrel_opt_ld<mode>" + [(set (match_operand:PO_VECT 0 "vsx_register_operand" "+wa") + (unspec:PO_VECT [(match_operand:PO_VECT 1 "d_form_memory" "o") + (match_operand:DI 2 "vsx_register_operand" "0") + (match_operand 3 "const_int_operand" "n")] + UNSPEC_PCREL_OPT_LD_RELOC)) + (clobber (match_operand:DI 4 "base_reg_operand" "=b"))] + "TARGET_PCREL_OPT" + "%r3lxv %x0,%1" + [(set_attr "type" "vecload") + (set_attr "isa" "pcrel_opt")]) diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 2709e46..38ae9cd 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -1876,3 +1876,26 @@ (define_predicate "prefixed_memory" { return address_is_prefixed (XEXP (op, 0), mode, NON_PREFIXED_DEFAULT); }) + +;; Return true if the operand is a valid memory operand with an offsettable +;; address that could be merged with the load of a PC-relative external address +;; with the PCREL_OPT optimization. We don't check here whether or not the +;; offset needs to be used in a DS-FORM (bottom 2 bits 0) or DQ-FORM (bottom 4 +;; bits 0) instruction. 
+(define_predicate "d_form_memory" + (match_code "mem") +{ + if (!memory_operand (op, mode)) + return false; + + rtx addr = XEXP (op, 0); + + if (REG_P (addr) || SUBREG_P (addr)) + return true; + + if (GET_CODE (addr) != PLUS) + return false; + + return (base_reg_operand (XEXP (addr, 0), Pmode) + && satisfies_constraint_I (XEXP (addr, 1))); +}) diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def index 8d2c1ff..d3f72d7 100644 --- a/gcc/config/rs6000/rs6000-cpus.def +++ b/gcc/config/rs6000/rs6000-cpus.def @@ -78,6 +78,7 @@ /* Flags that need to be turned off if -mno-power10. */ #define OTHER_POWER10_MASKS (OPTION_MASK_MMA \ | OPTION_MASK_PCREL \ + | OPTION_MASK_PCREL_OPT \ | OPTION_MASK_PREFIXED) #define ISA_3_1_MASKS_SERVER (ISA_3_0_MASKS_SERVER \ @@ -142,6 +143,7 @@ | OPTION_MASK_P9_MISC \ | OPTION_MASK_P9_VECTOR \ | OPTION_MASK_PCREL \ + | OPTION_MASK_PCREL_OPT \ | OPTION_MASK_POPCNTB \ | OPTION_MASK_POPCNTD \ | OPTION_MASK_POWERPC64 \ diff --git a/gcc/config/rs6000/rs6000-passes.def b/gcc/config/rs6000/rs6000-passes.def index 5164c52..9b93fc7 100644 --- a/gcc/config/rs6000/rs6000-passes.def +++ b/gcc/config/rs6000/rs6000-passes.def @@ -24,4 +24,12 @@ along with GCC; see the file COPYING3. If not see REPLACE_PASS (PASS, INSTANCE, TGT_PASS) */ + /* Pass to add the appropriate vector swaps on power8 little endian systems. + The power8 does not have instructions that automaticaly do the byte swaps + for loads and stores. */ INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps); + + /* Pass to do the PCREL_OPT optimization that combines the load of an + external symbol's address along with a single load or store using that + address as a base register. 
*/ + INSERT_PASS_AFTER (pass_sched2, 1, pass_pcrel_opt); diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 28e859f..517713a 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -189,6 +189,7 @@ enum non_prefixed_form { extern enum insn_form address_to_insn_form (rtx, machine_mode, enum non_prefixed_form); +extern enum non_prefixed_form reg_to_non_prefixed (rtx, machine_mode); extern bool prefixed_load_p (rtx_insn *); extern bool prefixed_store_p (rtx_insn *); extern bool prefixed_paddi_p (rtx_insn *); @@ -305,6 +306,7 @@ namespace gcc { class context; } class rtl_opt_pass; extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *); +extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *); extern bool rs6000_sum_of_two_registers_p (const_rtx expr); extern bool rs6000_quadword_masked_address_p (const_rtx exp); extern rtx rs6000_gen_lvx (enum machine_mode, rtx, rtx); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index fe93cf6..6877de5 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1175,7 +1175,6 @@ static bool rs6000_secondary_reload_move (enum rs6000_reg_type, machine_mode, secondary_reload_info *, bool); -static enum non_prefixed_form reg_to_non_prefixed (rtx reg, machine_mode mode); rtl_opt_pass *make_pass_analyze_swaps (gcc::context*); /* Hash table stuff for keeping track of TOC entries. 
*/ @@ -4316,6 +4315,14 @@ rs6000_option_override_internal (bool global_init_p) rs6000_isa_flags &= ~OPTION_MASK_MMA; } + if (!TARGET_PCREL && TARGET_PCREL_OPT) + { + if ((rs6000_isa_flags_explicit & OPTION_MASK_PCREL_OPT) != 0) + error ("%qs requires %qs", "-mpcrel-opt", "-mpcrel"); + + rs6000_isa_flags &= ~OPTION_MASK_PCREL_OPT; + } + if (TARGET_DEBUG_REG || TARGET_DEBUG_TARGET) rs6000_print_isa_options (stderr, 0, "after subtarget", rs6000_isa_flags); @@ -8515,7 +8522,10 @@ rs6000_delegitimize_address (rtx orig_x) { rtx x, y, offset; - if (GET_CODE (orig_x) == UNSPEC && XINT (orig_x, 1) == UNSPEC_FUSION_GPR) + if (GET_CODE (orig_x) == UNSPEC + && (XINT (orig_x, 1) == UNSPEC_FUSION_GPR + || XINT (orig_x, 1) == UNSPEC_PCREL_OPT_LD_ADDR + || XINT (orig_x, 1) == UNSPEC_PCREL_OPT_LD_ADDR_SAME_REG)) orig_x = XVECEXP (orig_x, 0, 0); orig_x = delegitimize_mem_from_attrs (orig_x); @@ -13197,6 +13207,19 @@ print_operand (FILE *file, rtx x, int code) fprintf (file, "%d", 128 >> (REGNO (x) - CR0_REGNO)); return; + case 'r': + /* X is a label number for the PCREL_OPT optimization. Emit the .reloc + to enable this optimization, unless the value is 0. */ + gcc_assert (CONST_INT_P (x)); + if (UINTVAL (x) != 0) + { + unsigned int label_num = UINTVAL (x); + fprintf (file, + ".reloc .Lpcrel%u-8,R_PPC64_PCREL_OPT,.-(.Lpcrel%u-8)\n\t", + label_num, label_num); + } + return; + case 's': /* Low 5 bits of 32 - value */ if (! 
INT_P (x)) @@ -23244,6 +23267,7 @@ static struct rs6000_opt_mask const rs6000_opt_masks[] = { "mulhw", OPTION_MASK_MULHW, false, true }, { "multiple", OPTION_MASK_MULTIPLE, false, true }, { "pcrel", OPTION_MASK_PCREL, false, true }, + { "pcrel-opt", OPTION_MASK_PCREL_OPT, false, true }, { "popcntb", OPTION_MASK_POPCNTB, false, true }, { "popcntd", OPTION_MASK_POPCNTD, false, true }, { "power8-fusion", OPTION_MASK_P8_FUSION, false, true }, @@ -25368,7 +25392,7 @@ is_lfs_stfs_insn (rtx_insn *insn) /* Helper function to take a REG and a MODE and turn it into the non-prefixed instruction format (D/DS/DQ) used for offset memory. */ -static enum non_prefixed_form +enum non_prefixed_form reg_to_non_prefixed (rtx reg, machine_mode mode) { /* If it isn't a register, use the defaults. */ @@ -25591,7 +25615,15 @@ void rs6000_asm_output_opcode (FILE *stream) { if (next_insn_prefixed_p) - fprintf (stream, "p"); + { + fprintf (stream, "p"); + + /* Reset flag in case there are separate insn lines in the sequence, so + the 'p' is only emitted for the first line. This shows up when we are + doing the PCREL_OPT optimization, in that the label created with %r<n> + would have a leading 'p' printed. */ + next_insn_prefixed_p = false; + } return; } diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 43b620a..d9dd25f 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -292,6 +292,10 @@ (define_attr "prefixed" "no,yes" (const_string "no"))) +;; Whether an insn loads an external address for the PCREL_OPT optimization. +(define_attr "loads_extern_addr" "no,yes" + (const_string "no")) + ;; Return the number of real hardware instructions in a combined insn. If it ;; is 0, just use the length / 4. (define_attr "num_insns" "" (const_int 0)) @@ -323,7 +327,7 @@ (define_attr "cpu" (const (symbol_ref "(enum attr_cpu) rs6000_tune"))) ;; The ISA we implement.
-(define_attr "isa" "any,p5,p6,p7,p7v,p8v,p9,p9v,p9kf,p9tf,p10" +(define_attr "isa" "any,p5,p6,p7,p7v,p8v,p9,p9v,p9kf,p9tf,p10,pcrel_opt" (const_string "any")) ;; Is this alternative enabled for the current CPU/ISA/etc.? @@ -371,6 +375,10 @@ (define_attr "enabled" "" (and (eq_attr "isa" "p10") (match_test "TARGET_POWER10")) (const_int 1) + + (and (eq_attr "isa" "pcrel_opt") + (match_test "TARGET_PCREL_OPT")) + (const_int 1) ] (const_int 0))) ;; If this instruction is microcoded on the CELL processor @@ -10226,7 +10234,8 @@ (define_insn "*pcrel_extern_addr" "TARGET_PCREL" "ld %0,%a1" [(set_attr "prefixed" "yes") - (set_attr "type" "load")]) + (set_attr "type" "load") + (set_attr "loads_extern_addr" "yes")]) ;; TOC register handling. @@ -14900,3 +14909,4 @@ (define_insn "*cmpeqb_internal" (include "dfp.md") (include "crypto.md") (include "htm.md") +(include "pcrel-opt.md") diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt index 9d3e740..22d3af4 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -582,6 +582,10 @@ mpcrel Target Report Mask(PCREL) Var(rs6000_isa_flags) Generate (do not generate) pc-relative memory addressing. +mpcrel-opt +Target Undocumented Mask(PCREL_OPT) Var(rs6000_isa_flags) +Generate (do not generate) pc-relative memory optimizations for externals. + mmma Target Report Mask(MMA) Var(rs6000_isa_flags) Generate (do not generate) MMA instructions. 
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index 1ddb572..a617276 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -23,6 +23,10 @@ TM_H += $(srcdir)/config/rs6000/rs6000-cpus.def TM_H += $(srcdir)/config/rs6000/rs6000-modes.h PASSES_EXTRA += $(srcdir)/config/rs6000/rs6000-passes.def +pcrel-opt.o: $(srcdir)/config/rs6000/pcrel-opt.c + $(COMPILE) $< + $(POSTCOMPILE) + rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.c $(COMPILE) $< $(POSTCOMPILE) @@ -86,4 +90,5 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \ $(srcdir)/config/rs6000/mma.md \ $(srcdir)/config/rs6000/crypto.md \ $(srcdir)/config/rs6000/htm.md \ - $(srcdir)/config/rs6000/dfp.md + $(srcdir)/config/rs6000/dfp.md \ + $(srcdir)/config/rs6000/pcrel-opt.md -- 1.8.3.1 -- Michael Meissner, IBM IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA email: meiss...@linux.ibm.com, phone: +1 (978) 899-4797