> -----Original Message----- > From: Gcc-patches <gcc-patches- > bounces+kyrylo.tkachov=arm....@gcc.gnu.org> On Behalf Of Richard > Sandiford via Gcc-patches > Sent: Monday, November 14, 2022 11:11 AM > To: Philipp Tomsich <philipp.toms...@vrull.eu> > Cc: gcc-patches@gcc.gnu.org; JiangNing Liu > <jiangning....@amperecomputing.com>; Christoph Muellner > <christoph.muell...@vrull.eu> > Subject: Re: [PATCH] aarch64: Add support for Ampere-1A (- > mcpu=ampere1a) CPU > > Philipp Tomsich <philipp.toms...@vrull.eu> writes: > > This patch adds support for Ampere-1A CPU: > > - recognize the name of the core and provide detection for -mcpu=native, > > - updated extra_costs, > > - adds a new fusion pair for (A+B+1 and A-B-1). > > > > Ampere-1A and Ampere-1 have more timing difference than the extra > > costs indicate, but these don't propagate through to the headline > > items in our extra costs (e.g. the change in latency for scalar sqrt > > doesn't have a corresponding table entry). > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add > ampere1a. > > * config/aarch64/aarch64-cost-tables.h: Add ampere1a_extra_costs. > > * config/aarch64/aarch64-fusion-pairs.def (AARCH64_FUSION_PAIR): > > Define a new fusion pair for A+B+1/A-B-1 (i.e., add/subtract two > > registers and then +1/-1). > > * config/aarch64/aarch64-tune.md: Regenerate. > > * config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): > Implement > > idiom-matcher for the new fusion pair. 
> > OK except for a minor formatting nit: > > > > > Signed-off-by: Philipp Tomsich <philipp.toms...@vrull.eu> > > --- > > > > gcc/config/aarch64/aarch64-cores.def | 1 + > > gcc/config/aarch64/aarch64-cost-tables.h | 107 ++++++++++++++++++++ > > gcc/config/aarch64/aarch64-fusion-pairs.def | 1 + > > gcc/config/aarch64/aarch64-tune.md | 2 +- > > gcc/config/aarch64/aarch64.cc | 63 ++++++++++++ > > 5 files changed, 173 insertions(+), 1 deletion(-) > > > > diff --git a/gcc/config/aarch64/aarch64-cores.def > b/gcc/config/aarch64/aarch64-cores.def > > index d2671778928..aead587cec1 100644 > > --- a/gcc/config/aarch64/aarch64-cores.def > > +++ b/gcc/config/aarch64/aarch64-cores.def > > @@ -70,6 +70,7 @@ AARCH64_CORE("thunderxt83", thunderxt83, > thunderx, V8A, (CRC, CRYPTO), thu > > > > /* Ampere Computing ('\xC0') cores. */ > > AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, > SHA3), ampere1, 0xC0, 0xac3, -1) > > +AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, > AES, SHA3, MEMTAG), ampere1a, 0xC0, 0xac4, -1) > > /* Do not swap around "emag" and "xgene1", > > this order is required to handle variant correctly. */ > > AARCH64_CORE("emag", emag, xgene1, V8A, (CRC, CRYPTO), > emag, 0x50, 0x000, 3) > > diff --git a/gcc/config/aarch64/aarch64-cost-tables.h > b/gcc/config/aarch64/aarch64-cost-tables.h > > index 760d7b30368..48522606fbe 100644 > > --- a/gcc/config/aarch64/aarch64-cost-tables.h > > +++ b/gcc/config/aarch64/aarch64-cost-tables.h > > @@ -775,4 +775,111 @@ const struct cpu_cost_table > ampere1_extra_costs = > > } > > }; > > > > +const struct cpu_cost_table ampere1a_extra_costs = > > +{ > > + /* ALU */ > > + { > > + 0, /* arith. */ > > + 0, /* logical. */ > > + 0, /* shift. */ > > + COSTS_N_INSNS (1), /* shift_reg. */ > > + 0, /* arith_shift. */ > > + COSTS_N_INSNS (1), /* arith_shift_reg. */ > > + 0, /* log_shift. */ > > + COSTS_N_INSNS (1), /* log_shift_reg. */ > > + 0, /* extend. */ > > + COSTS_N_INSNS (1), /* extend_arith. 
*/ > > + 0, /* bfi. */ > > + 0, /* bfx. */ > > + 0, /* clz. */ > > + 0, /* rev. */ > > + 0, /* non_exec. */ > > + true /* non_exec_costs_exec. */ > > + }, > > + { > > + /* MULT SImode */ > > + { > > + COSTS_N_INSNS (3), /* simple. */ > > + COSTS_N_INSNS (3), /* flag_setting. */ > > + COSTS_N_INSNS (3), /* extend. */ > > + COSTS_N_INSNS (4), /* add. */ > > + COSTS_N_INSNS (4), /* extend_add. */ > > + COSTS_N_INSNS (19) /* idiv. */ > > + }, > > + /* MULT DImode */ > > + { > > + COSTS_N_INSNS (3), /* simple. */ > > + 0, /* flag_setting (N/A). */ > > + COSTS_N_INSNS (3), /* extend. */ > > + COSTS_N_INSNS (4), /* add. */ > > + COSTS_N_INSNS (4), /* extend_add. */ > > + COSTS_N_INSNS (35) /* idiv. */ > > + } > > + }, > > + /* LD/ST */ > > + { > > + COSTS_N_INSNS (4), /* load. */ > > + COSTS_N_INSNS (4), /* load_sign_extend. */ > > + 0, /* ldrd (n/a). */ > > + 0, /* ldm_1st. */ > > + 0, /* ldm_regs_per_insn_1st. */ > > + 0, /* ldm_regs_per_insn_subsequent. */ > > + COSTS_N_INSNS (5), /* loadf. */ > > + COSTS_N_INSNS (5), /* loadd. */ > > + COSTS_N_INSNS (5), /* load_unaligned. */ > > + 0, /* store. */ > > + 0, /* strd. */ > > + 0, /* stm_1st. */ > > + 0, /* stm_regs_per_insn_1st. */ > > + 0, /* stm_regs_per_insn_subsequent. */ > > + COSTS_N_INSNS (2), /* storef. */ > > + COSTS_N_INSNS (2), /* stored. */ > > + COSTS_N_INSNS (2), /* store_unaligned. */ > > + COSTS_N_INSNS (3), /* loadv. */ > > + COSTS_N_INSNS (3) /* storev. */ > > + }, > > + { > > + /* FP SFmode */ > > + { > > + COSTS_N_INSNS (25), /* div. */ > > + COSTS_N_INSNS (4), /* mult. */ > > + COSTS_N_INSNS (4), /* mult_addsub. */ > > + COSTS_N_INSNS (4), /* fma. */ > > + COSTS_N_INSNS (4), /* addsub. */ > > + COSTS_N_INSNS (2), /* fpconst. */ > > + COSTS_N_INSNS (4), /* neg. */ > > + COSTS_N_INSNS (4), /* compare. */ > > + COSTS_N_INSNS (4), /* widen. */ > > + COSTS_N_INSNS (4), /* narrow. */ > > + COSTS_N_INSNS (4), /* toint. */ > > + COSTS_N_INSNS (4), /* fromint. */ > > + COSTS_N_INSNS (4) /* roundint. 
*/ > > + }, > > + /* FP DFmode */ > > + { > > + COSTS_N_INSNS (34), /* div. */ > > + COSTS_N_INSNS (5), /* mult. */ > > + COSTS_N_INSNS (5), /* mult_addsub. */ > > + COSTS_N_INSNS (5), /* fma. */ > > + COSTS_N_INSNS (5), /* addsub. */ > > + COSTS_N_INSNS (2), /* fpconst. */ > > + COSTS_N_INSNS (5), /* neg. */ > > + COSTS_N_INSNS (5), /* compare. */ > > + COSTS_N_INSNS (5), /* widen. */ > > + COSTS_N_INSNS (5), /* narrow. */ > > + COSTS_N_INSNS (6), /* toint. */ > > + COSTS_N_INSNS (6), /* fromint. */ > > + COSTS_N_INSNS (5) /* roundint. */ > > + } > > + }, > > + /* Vector */ > > + { > > + COSTS_N_INSNS (3), /* alu. */ > > + COSTS_N_INSNS (3), /* mult. */ > > + COSTS_N_INSNS (2), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > + } > > +}; > > + > > #endif > > diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def > b/gcc/config/aarch64/aarch64-fusion-pairs.def > > index c064fb9b85d..3990f77c127 100644 > > --- a/gcc/config/aarch64/aarch64-fusion-pairs.def > > +++ b/gcc/config/aarch64/aarch64-fusion-pairs.def > > @@ -36,5 +36,6 @@ AARCH64_FUSION_PAIR ("cmp+branch", > CMP_BRANCH) > > AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC) > > AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH) > > AARCH64_FUSION_PAIR ("alu+cbz", ALU_CBZ) > > +AARCH64_FUSION_PAIR ("A+B+1/A-B-1", ADDSUB_2REG_CONST1) These names can be used from the command line through the developer-only -moverride option. I think maybe "addsub_2reg_const1" would be a better name here? 
(I wonder if '/' can cause trouble in some shells) Thanks, Kyrill > > > > #undef AARCH64_FUSION_PAIR > > diff --git a/gcc/config/aarch64/aarch64-tune.md > b/gcc/config/aarch64/aarch64-tune.md > > index 22ec1be5a4c..b7d6fc8cc88 100644 > > --- a/gcc/config/aarch64/aarch64-tune.md > > +++ b/gcc/config/aarch64/aarch64-tune.md > > @@ -1,5 +1,5 @@ > > ;; -*- buffer-read-only: t -*- > > ;; Generated automatically by gentune.sh from aarch64-cores.def > > (define_attr "tune" > > - > "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thun > derx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunder > xt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,t > hunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa > 76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,co > rtexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeo > ntx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,ts > v110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cort > exa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa7 > 5cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715 > ,cortexx2,neoversen2,demeter,neoversev2" > > + > "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thun > derx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunder > xt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm > 1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa > 76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,corte > xa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx > 2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95m > m,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cor > texa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa5 > 3,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,c > 
ortexa715,cortexx2,neoversen2,demeter,neoversev2" > > (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > > index d1f979ebcf8..1f7777b5214 100644 > > --- a/gcc/config/aarch64/aarch64.cc > > +++ b/gcc/config/aarch64/aarch64.cc > > @@ -1921,6 +1921,43 @@ static const struct tune_params > ampere1_tunings = > > &ampere1_prefetch_tune > > }; > > > > +static const struct tune_params ampere1a_tunings = > > +{ > > + &ampere1a_extra_costs, > > + &generic_addrcost_table, > > + &generic_regmove_cost, > > + &ampere1_vector_cost, > > + &generic_branch_cost, > > + &generic_approx_modes, > > + SVE_NOT_IMPLEMENTED, /* sve_width */ > > + { 4, /* load_int. */ > > + 4, /* store_int. */ > > + 4, /* load_fp. */ > > + 4, /* store_fp. */ > > + 4, /* load_pred. */ > > + 4 /* store_pred. */ > > + }, /* memmov_cost. */ > > + 4, /* issue_rate */ > > + (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC | > > + AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK | > > + AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ | > > + AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ | > > + AARCH64_FUSE_ADDSUB_2REG_CONST1), > > + /* fusible_ops */ > > + "32", /* function_align. */ > > + "4", /* jump_align. */ > > + "32:16", /* loop_align. */ > > + 2, /* int_reassoc_width. */ > > + 4, /* fp_reassoc_width. */ > > + 2, /* vec_reassoc_width. */ > > + 2, /* min_div_recip_mul_sf. */ > > + 2, /* min_div_recip_mul_df. */ > > + 0, /* max_case_values. */ > > + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ > > + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. 
*/ > > + &ampere1_prefetch_tune > > +}; > > + > > static const advsimd_vec_cost neoversev1_advsimd_vector_cost = > > { > > 2, /* int_stmt_cost */ > > @@ -25539,6 +25576,32 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, > rtx_insn *curr) > > } > > } > > > > + /* Fuse A+B+1 and A-B-1 */ > > + if (simple_sets_p && aarch64_fusion_enabled_p > (AARCH64_FUSE_ADDSUB_2REG_CONST1)) > > Line exceeds 80 chars. > > Thanks, > Richard > > > + { > > + /* We're trying to match: > > + prev == (set (r0) (plus (r0) (r1))) > > + curr == (set (r0) (plus (r0) (const_int 1))) > > + or: > > + prev == (set (r0) (minus (r0) (r1))) > > + curr == (set (r0) (plus (r0) (const_int -1))) */ > > + > > + rtx prev_src = SET_SRC (prev_set); > > + rtx curr_src = SET_SRC (curr_set); > > + > > + int polarity = 1; > > + if (GET_CODE (prev_src) == MINUS) > > + polarity = -1; > > + > > + if (GET_CODE (curr_src) == PLUS > > + && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == > MINUS) > > + && CONST_INT_P (XEXP (curr_src, 1)) > > + && INTVAL (XEXP (curr_src, 1)) == polarity > > + && REG_P (XEXP (curr_src, 0)) > > + && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0))) > > + return true; > > + } > > + > > return false; > > }
RE: [PATCH] aarch64: Add support for Ampere-1A (-mcpu=ampere1a) CPU
Kyrylo Tkachov via Gcc-patches Mon, 14 Nov 2022 03:26:54 -0800
- [PATCH] aarch64: Add support for Ampere-... Philipp Tomsich
- Re: [PATCH] aarch64: Add support fo... Richard Sandiford via Gcc-patches
- RE: [PATCH] aarch64: Add suppor... Kyrylo Tkachov via Gcc-patches