Re: [PATCH 2/2] AArch64: Enable dispatch scheduling for Olympus core.

Andrew Pinski Thu, 23 Oct 2025 22:25:15 -0700

On Thu, Oct 23, 2025 at 7:24 AM Jennifer Schmitz <[email protected]> wrote:
>
> This patch enables dispatch scheduling for the NVIDIA Olympus core.
> The dispatch constraints are based on the Olympus CPU Core Software
> Optimization Guide
> (https://docs.nvidia.com/olympus-cpu-core-software-optimization-guide-dp12531-001v0-7.pdf).
>
> The patch was bootstrapped and tested on aarch64-linux-gnu, no regression.
> OK for trunk?
>
> Signed-off-by: Jennifer Schmitz <[email protected]>
>
> gcc/
>         * config/aarch64/aarch64.md: Include olympus.md.
>         * config/aarch64/olympus.md: New file.
>         * config/aarch64/tuning_models/olympus.h: Add dispatch
>         constraints and enable dispatch scheduling.
> ---
>  gcc/config/aarch64/aarch64.md              |   1 +
>  gcc/config/aarch64/olympus.md              | 199 +++++++++++
>  gcc/config/aarch64/tuning_models/olympus.h | 363 ++++++++++++++++++++-
>  3 files changed, 561 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/config/aarch64/olympus.md
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 98c65a74c8e..8aef3858a79 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -686,6 +686,7 @@
>
>  ;; Dispatch scheduling
>  (include "neoversev2.md")
> +(include "olympus.md")
>
>  ;; -------------------------------------------------------------------
>  ;; Jumps and other miscellaneous insns
> diff --git a/gcc/config/aarch64/olympus.md b/gcc/config/aarch64/olympus.md
> new file mode 100644
> index 00000000000..22b12016ffd
> --- /dev/null
> +++ b/gcc/config/aarch64/olympus.md
> @@ -0,0 +1,199 @@
> +;; Instruction attribute for dispatch scheduling for NVIDIA Olympus.
> +;; Copyright The GNU Toolchain Authors.
> +;;
> +;; This file is part of GCC.
> +;;
> +;; GCC is free software; you can redistribute it and/or modify it
> +;; under the terms of the GNU General Public License as published by
> +;; the Free Software Foundation; either version 3, or (at your option)
> +;; any later version.
> +;;
> +;; GCC is distributed in the hope that it will be useful, but
> +;; WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;; General Public License for more details.
> +;;
> +;; You should have received a copy of the GNU General Public License
> +;; along with GCC; see the file COPYING3.  If not see
> +;; <http://www.gnu.org/licenses/>.
> +
> +;; Attribute that groups other instruction attributes into dispatch groups
> +;; for the Olympus core. Dispatch groups are groups of pipelines for which
> +;; the SWOG specifies a dispatch constraint. For example: Because the SWOG
> +;; contains a dispatch constraint for the V12 pipelines, there is an attribute
> +;; value "v12" that groups instructions that are processed by the V1 and V2
> +;; pipelines.
> +;; Values that contain a "_" represent combinations of dispatch groups.
> +;; For example, there are dispatch constraints for the M and V pipelines. The
> +;; value "m_v" groups instructions that utilize the M as well as the
> +;; V pipelines, such that both dispatch constraints apply.
> +
> +(define_attr "olympus_dispatch"
> +  "none,b,i,m,m0,l,v,v0,v03,v12,v45,v0123,m_v,l_v,m_l,m_v0123,v_v0123,\
> +   l_v03,sa_d,sa_v0123,sa_v_v0123"
> +  (cond [(eq_attr "type" "branch,call")
> +        (const_string "b")
> +        (eq_attr "type" "adr,adc_reg,alu_ext,alu_imm,alu_sreg,alus_ext,\
> +         alus_imm,alus_shift_imm,alus_sreg,clz,csel,extend,logic_imm,\
> +         logic_reg,logics_imm,logics_reg,mov_imm,mov_reg,rbit,rev")
> +        (const_string "i")
> +        (ior
> +          (eq_attr "type" "bfm,bfx,crc,f_mrc,logic_shift_imm,\
> +           logics_shift_imm,memtag,mul,neon_from_gp,neon_from_gp_q,\
> +           rotate_imm,shift_reg,smull,sdiv,udiv,umull")
> +          (eq_attr "autodetect_type" "alu_shift_asr_op2,alu_shift_lsl_op2,\
> +           alu_shift_lsr_op2")
> +          (eq_attr "sve_type" "sve_pred_cnt_ctrl,sve_pred_cnt_scalar,\
> +           sve_pred_logical,sve_pred_misc"))
> +        (const_string "m")
> +        (eq_attr "sve_type" "sve_ffr")
> +        (const_string "m0")
> +        (ior
> +          (eq_attr "type" "f_loadd,f_loads,load_4,load_8,load_16,\
> +           neon_ldp,neon_ldp_q,neon_load1_1reg,neon_load1_1reg_q,\
> +           neon_load1_2reg,neon_load1_2reg_q,neon_load1_3reg,\
> +           neon_load1_3reg_q,neon_load1_4reg,neon_load1_4reg_q,\
> +           neon_load1_all_lanes")
> +          (eq_attr "sve_type" "sve_load_1reg"))
> +        (const_string "l")
> +        (ior
> +          (eq_attr "type" 
> "crypto_aese,crypto_aesmc,crypto_pmull,faddd,fadds,\
> +           fccmpd,fccmps,fcmpd,fcmps,fcsel,fconstd,fconsts,fmuld,fmuls,\
> +           ffarithd,ffariths,fmacd,fmacs,f_mcr,f_minmaxd,f_minmaxs,fmov,\
> +           f_rintd,f_rints,neon_abs,neon_abs_q,\
> +           neon_add,neon_add_halve,neon_add_halve_narrow_q,neon_add_halve_q,\
> +           neon_add_long,neon_add_q,neon_add_widen,neon_abd,neon_abd_long,\
> +           neon_abd_q,neon_arith_acc,neon_arith_acc_q,neon_bsl,neon_bsl_q,\
> +           neon_cls,neon_cls_q,neon_compare,neon_compare_q,\
> +           neon_compare_zero,neon_compare_zero_q,neon_cnt,neon_cnt_q,\
> +           neon_dup,neon_dup_q,neon_ext,neon_ext_q,neon_fcadd,neon_fcmla,\
> +           neon_fp_abs_d,neon_fp_abs_d_q,neon_fp_abs_s,neon_fp_abs_s_q,\
> +           neon_fp_abd_d,neon_fp_abd_d_q,neon_fp_abd_s,neon_fp_abd_s_q,\
> +           neon_fp_addsub_d,neon_fp_addsub_d_q,neon_fp_addsub_s,\
> +           neon_fp_addsub_s_q,neon_fp_compare_d,neon_fp_compare_d_q,\
> +           neon_fp_compare_s,neon_fp_compare_s_q,neon_fp_mla_d,\
> +           neon_fp_mla_d_q,neon_fp_mla_d_scalar_q,neon_fp_mla_s,\
> +           neon_fp_mla_s_q,neon_fp_mla_s_scalar,neon_fp_mla_s_scalar_q,\
> +           neon_fp_minmax_d,neon_fp_minmax_d_q,neon_fp_minmax_s,\
> +           neon_fp_minmax_s_q,neon_fp_mul_d,neon_fp_mul_d_q,neon_fp_mul_s,\
> +           neon_fp_mul_s_q,neon_fp_mul_s_scalar,neon_fp_mul_s_scalar_q,\
> +           neon_fp_mul_d_scalar_q,neon_fp_neg_s,neon_fp_neg_s_q,\
> +           neon_fp_neg_d,neon_fp_neg_d_q,neon_fp_recps_d,\
> +           neon_fp_recps_d_q,neon_fp_recps_s,neon_fp_recps_s_q,\
> +           neon_fp_reduc_add_d,neon_fp_reduc_add_d_q,neon_fp_reduc_add_s,\
> +           neon_fp_reduc_add_s_q,neon_fp_reduc_minmax_d,\
> +           neon_fp_reduc_minmax_d_q,neon_fp_reduc_minmax_s,\
> +           neon_fp_reduc_minmax_s_q,neon_fp_rsqrts_d,neon_fp_rsqrts_d_q,\
> +           neon_fp_rsqrts_s,neon_fp_rsqrts_s_q,neon_logic,neon_logic_q,\
> +           neon_minmax,neon_minmax_q,neon_move,neon_move_narrow_q,\
> +           neon_move_q,neon_neg,neon_neg_q,neon_permute,neon_permute_q,\
> +           
> neon_qabs,neon_qabs_q,neon_qadd,neon_qadd_q,neon_qneg,neon_qneg_q,\
> +           neon_qsub,neon_qsub_q,neon_rev,neon_rev_q,neon_rbit,neon_rbit_q,\
> +           neon_sat_shift_imm,neon_sat_shift_imm_narrow_q,\
> +           neon_sat_shift_imm_q,neon_sat_shift_reg,neon_sat_shift_reg_q,\
> +           neon_shift_acc,neon_shift_acc_q,neon_shift_imm,\
> +           neon_shift_imm_long,neon_shift_imm_narrow_q,neon_shift_imm_q,\
> +           neon_shift_reg,neon_shift_reg_q,neon_sub,neon_sub_halve,\
> +           neon_sub_halve_narrow_q,neon_sub_halve_q,neon_sub_long,\
> +           neon_sub_q,neon_sub_widen,neon_tbl1,neon_tbl1_q,neon_tbl2,\
> +           neon_tbl2_q,neon_tbl3,neon_tbl3_q,neon_tbl4,neon_tbl4_q,\
> +           neon_tst,neon_tst_q,neon_zip,neon_zip_q")
> +          (eq_attr "sve_type" "sve_fp_arith,sve_fp_misc,\
> +           sve_fp_mul,sve_fp_reduc,sve_int_accum,sve_int_dot,sve_int_extend,\
> +           sve_int_general,sve_int_pmul,sve_int_shift"))
> +        (const_string "v")
> +        (ior
> +          (eq_attr "type" 
> "crypto_sha1_fast,crypto_sha1_slow,crypto_sha1_xor,\
> +           crypto_sha256_fast,crypto_sha256_slow,crypto_sha3,crypto_sha512,\
> +           crypto_sm4")
> +          (eq_attr "sve_type" "sve_crypto_sha3"))
> +        (const_string "v0")
> +        (ior
> +          (eq_attr "type" "fccmpd,fccmps,fcmpd,fcmps,neon_fp_to_int_d,\
> +           neon_fp_to_int_d_q,neon_fp_to_int_s,neon_fp_to_int_s_q,\
> +           neon_to_gp,neon_to_gp_q")
> +          (eq_attr "sve_type" "sve_fp_assoc_add,sve_fp_cmp"))
> +        (const_string "v03")
> +        (ior
> +          (eq_attr "type" "fdivd,fdivs,fsqrtd,fsqrts,neon_fp_div_d,\
> +           neon_fp_div_d_q,neon_fp_div_s,neon_fp_div_s_q,neon_fp_sqrt_d,\
> +           neon_fp_sqrt_d_q,neon_fp_sqrt_s,neon_fp_sqrt_s_q")
> +          (eq_attr "sve_type" "sve_fp_div,sve_fp_exp,sve_fp_sqrt,\
> +           sve_int_extract,sve_int_bit_perm"))
> +        (const_string "v12")
> +        (eq_attr "sve_type" "sve_int_div")
> +        (const_string "v45")
> +        (ior
> +          (eq_attr "type" "crypto_sm3,f_cvt,f_cvtf2i,f_cvti2f,f_rintd,\
> +           f_rints,mla,neon_fp_cvt_narrow_d_q,neon_fp_cvt_narrow_s_q,\
> +           neon_fp_cvt_widen_h,neon_fp_cvt_widen_s,\
> +           neon_fp_recpe_d,neon_fp_recpe_d_q,neon_fp_recpe_s,\
> +           neon_fp_recpe_s_q,neon_fp_recpx_d,neon_fp_recpx_d_q,\
> +           neon_fp_recpx_s,neon_fp_recpx_s_q,neon_fp_round_d,\
> +           neon_fp_round_d_q,neon_fp_round_s,neon_fp_round_s_q,\
> +           neon_fp_rsqrte_d,neon_fp_rsqrte_d_q,neon_fp_rsqrte_s,\
> +           neon_fp_rsqrte_s_q,neon_int_to_fp_s,\
> +           neon_int_to_fp_s_q,neon_int_to_fp_d,neon_int_to_fp_d_q,\
> +           neon_mla_b,neon_mla_b_long,neon_mla_b_q,neon_mla_h,\
> +           neon_mla_h_long,neon_mla_h_q,neon_mla_h_scalar,\
> +           neon_mla_h_scalar_q,neon_mla_h_scalar_long,neon_mla_s,\
> +           neon_mla_s_long,neon_mla_s_q,neon_mla_s_scalar,\
> +           neon_mla_s_scalar_q,neon_mla_s_scalar_long,neon_mul_b,\
> +           neon_mul_b_long,neon_mul_b_q,neon_mul_d_long,neon_mul_h,\
> +           neon_mul_h_q,neon_mul_h_long,neon_mul_h_scalar,\
> +           neon_mul_h_scalar_long,neon_mul_h_scalar_q,neon_mul_s,\
> +           neon_mul_s_scalar_q,neon_mul_s_q,neon_mul_s_long,\
> +           neon_mul_s_scalar,neon_mul_s_scalar_long,neon_reduc_add,\
> +           neon_reduc_add_long,neon_reduc_add_q,neon_reduc_minmax,\
> +           neon_reduc_minmax_q,neon_sat_mla_b_long,neon_sat_mla_h_long,\
> +           neon_sat_mla_h_scalar_long,neon_sat_mla_s_long,\
> +           neon_sat_mla_s_scalar_long,neon_sat_mul_b,neon_sat_mul_b_q,\
> +           neon_sat_mul_b_long,neon_sat_mul_h,neon_sat_mul_h_q,\
> +           neon_sat_mul_h_long,neon_sat_mul_h_scalar,\
> +           neon_sat_mul_h_scalar_q,neon_sat_mul_h_scalar_long,\
> +           neon_sat_mul_s,neon_sat_mul_s_q,neon_sat_mul_s_long,\
> +           neon_sat_mul_s_scalar,neon_sat_mul_s_scalar_q,\
> +           neon_sat_mul_s_scalar_long,smlal,umlal")
> +          (eq_attr "sve_type" "sve_fp_cvt,sve_fp_log,sve_int_cvt,\
> +           sve_int_mul,sve_int_recip_est"))
> +        (const_string "v0123")
> +        (eq_attr "type" "neon_ins,neon_ins_q")
> +        (const_string "m_v")
> +        (ior
> +          (eq_attr "type" "neon_load1_one_lane,neon_load1_one_lane_q,\
> +           neon_load2_2reg,neon_load2_2reg_q,neon_load2_all_lanes,\
> +           neon_load2_all_lanes_q,neon_load2_one_lane,neon_load3_3reg,\
> +           neon_load3_3reg_q,neon_load3_all_lanes,neon_load3_all_lanes_q,\
> +           neon_load3_one_lane,neon_load4_4reg,neon_load4_4reg_q,\
> +           neon_load4_all_lanes,neon_load4_all_lanes_q,neon_load4_one_lane")
> +          (eq_attr "sve_type" "sve_load_2reg,sve_load_3reg,sve_load_4reg"))
> +        (const_string "l_v")
> +        (eq_attr "sve_type" "sve_load_pred,sve_pred_vec")
> +        (const_string "m_l")
> +        (eq_attr "sve_type" "sve_int_cmp_set,sve_int_index,sve_int_match")
> +        (const_string "m_v0123")
> +        (eq_attr "sve_type" "sve_int_reduc")
> +        (const_string "v_v0123")
> +        (eq_attr "sve_type" "sve_gatherload_32,sve_gatherload_64")
> +        (const_string "l_v03")
> +        (ior
> +          (eq_attr "type" "store_4,store_8,store_16")
> +          (eq_attr "sve_type" "sve_store_pred"))
> +        (const_string "sa_d")
> +        (ior
> +          (eq_attr "type" "f_stored,f_stores,neon_stp,neon_stp_q,\
> +           neon_store1_1reg,neon_store1_1reg_q,neon_store1_2reg,\
> +           neon_store1_2reg_q,neon_store1_3reg,neon_store1_3reg_q,\
> +           neon_store1_4reg,neon_store1_4reg_q")
> +          (eq_attr "sve_type" "sve_store_1reg"))
> +        (const_string "sa_v0123")
> +        (ior
> +          (eq_attr "type" "neon_store1_one_lane,neon_store1_one_lane_q,\
> +           neon_store2_2reg,neon_store2_2reg_q,neon_store2_one_lane,\
> +           neon_store2_one_lane_q,neon_store3_3reg,neon_store3_3reg_q,\
> +           neon_store3_one_lane,neon_store3_one_lane_q,neon_store4_4reg,\
> +           neon_store4_4reg_q,neon_store4_one_lane,neon_store4_one_lane_q")
> +          (eq_attr "sve_type" "sve_store_2reg,sve_store_3reg,sve_store_4reg,\
> +           sve_scatterstore_32,sve_scatterstore_64"))
> +        (const_string "sa_v_v0123")]
> +        (const_string "none")))
> \ No newline at end of file
> diff --git a/gcc/config/aarch64/tuning_models/olympus.h b/gcc/config/aarch64/tuning_models/olympus.h
> index d19aca8c323..404d79307df 100644
> --- a/gcc/config/aarch64/tuning_models/olympus.h
> +++ b/gcc/config/aarch64/tuning_models/olympus.h
> @@ -21,6 +21,8 @@
>  #define GCC_AARCH64_H_OLYMPUS
>
>  #include "generic.h"
> +#include "../aarch64-sched-dispatch.h"
> +#include "vec.h"
>
>  static struct cpu_regmove_cost olympus_regmove_cost =
>  {
> @@ -169,6 +171,362 @@ static cpu_prefetch_tune olympus_prefetch_tune =
>    -1                   /* default_opt_level  */
>  };
>
> +/* Olympus dispatch constraint types.  */
> +enum olympus_dispatch_constraint_type
> +{
> +  OLYMPUS_TOTAL_SLOTS, /* total slots  */
> +  OLYMPUS_M_PIPE,      /* m pipelines  */
> +  OLYMPUS_M0_PIPE,     /* m0 pipeline  */
> +  OLYMPUS_BRANCH_PIPE, /* branch pipelines  */
> +  OLYMPUS_L_SA_PIPE,   /* l, sa pipelines  */
> +  OLYMPUS_SA_PIPE,     /* sa pipelines  */
> +  OLYMPUS_V_PIPE,      /* v pipelines  */
> +  OLYMPUS_V0123_PIPE,  /* v0, v1, v2, v3 pipelines  */
> +  OLYMPUS_V03_PIPE,    /* v0, v3 pipelines  */
> +  OLYMPUS_V12_PIPE,    /* v1, v2 pipelines  */
> +  OLYMPUS_V45_PIPE,    /* v4, v5 pipelines  */
> +  OLYMPUS_V0_PIPE      /* v0 pipeline  */
> +};
> +
> +/* Olympus dispatch constraints for instruction scheduling.
> +   The numbers are based on the Olympus CPU Core SWOG, section 4.1.  */
> +static const int olympus_dispatch_max_slots[] = {
> +  10, /* total slots  */
> +  6,  /* m pipelines  */
> +  3,  /* m0 pipeline  */
> +  3,  /* branch pipelines  */
> +  8,  /* l, sa pipelines  */
> +  4,  /* sa pipelines  */
> +  6,  /* v pipelines  */
> +  4,  /* v0, v1, v2, v3 pipelines  */
> +  4,  /* v0, v3 pipelines  */
> +  4,  /* v1, v2 pipelines  */
> +  2,  /* v4, v5 pipelines  */
> +  2   /* v0 pipeline  */
> +};
> +
> +/* Olympus dispatch constraint callback function.
> +   Determines which constraints apply to an instruction and how many slots
> +   it requires.  Returns a vec of (constraint_index, slots_required) pairs.  */
> +static vec<std::pair<int, int>>
> +olympus_dispatch_constraint_callback (rtx_insn *insn)
> +{
> +  auto dispatch_group = get_attr_olympus_dispatch (insn);
> +  vec<std::pair<int, int>> constraints = vNULL;
> +
> +  /* In addition to deducting slots from the specific pipeline types required
> +     by an instruction, we keep track of the total number of slots required.
> +     There are different cases how total_slots is derived from the specific
> +     pipeline slots:
> +     Case 1: Single top-level pipeline type used
> +       Example groups: OLYMPUS_DISPATCH_B, OLYMPUS_DISPATCH_V_V0123
> +       Total_slots is equal to the number of slots for the top-level
> +       pipeline type.
> +       Example: Assume an instruction in the OLYMPUS_DISPATCH_V_V0123
> +       dispatch group is executed as 2 MOps: 1 utilizing any V pipeline and
> +       1 utilizing a V0123 pipeline.  It requires 1 slot in the
> +       OLYMPUS_V0123_PIPE constraint and a total of 2 slots in the
> +       OLYMPUS_V_PIPE constraint, because the V0123 pipelines are a subset of
> +       the V pipelines. Total_slots is 2.
> +     Case 2: Multiple top-level pipeline types used
> +       Example groups: OLYMPUS_DISPATCH_M_V, OLYMPUS_DISPATCH_SA_V_V0123
> +       Total_slots is equal to the sum of the slots for the top-level
> +       pipeline types.  */
> +  int total_slots = 1;
> +
> +  switch (dispatch_group)
> +    {
> +    case OLYMPUS_DISPATCH_NONE:
> +    case OLYMPUS_DISPATCH_I:
> +      break;
> +
> +    case OLYMPUS_DISPATCH_B:
> +      constraints.safe_push ({OLYMPUS_BRANCH_PIPE, 1});
> +      break;
> +
> +    case OLYMPUS_DISPATCH_M:
> +      constraints.safe_push ({OLYMPUS_M_PIPE, 1});
> +      break;
> +
> +    case OLYMPUS_DISPATCH_M0:
> +      constraints.safe_push ({OLYMPUS_M_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_M0_PIPE, 1});
> +      break;
> +
> +    case OLYMPUS_DISPATCH_L:
> +      {
> +       auto type_attr = get_attr_type (insn);
> +       int l_slots = 1;
> +       if (type_attr == TYPE_NEON_LDP_Q
> +           || type_attr == TYPE_NEON_LOAD1_2REG_Q
> +           || type_attr == TYPE_NEON_LOAD1_3REG
> +           || type_attr == TYPE_NEON_LOAD1_4REG)
> +         l_slots = 2;
> +       else if (type_attr == TYPE_NEON_LOAD1_3REG_Q)
> +         l_slots = 3;
> +       else if (type_attr == TYPE_NEON_LOAD1_4REG_Q)
> +         l_slots = 4;
> +       constraints.safe_push ({OLYMPUS_L_SA_PIPE, l_slots});
> +       total_slots = l_slots;
> +      }
> +      break;
> +
> +    case OLYMPUS_DISPATCH_V:
> +      {
> +       auto type_attr = get_attr_type (insn);
> +       int v_slots = 1;
> +       if (type_attr == TYPE_NEON_TBL3
> +           || type_attr == TYPE_NEON_FP_REDUC_MINMAX_D
> +           || type_attr == TYPE_NEON_FP_REDUC_MINMAX_S
> +           || get_attr_sve_type (insn) == SVE_TYPE_SVE_FP_REDUC)
> +         v_slots = 2;
> +       else if (type_attr == TYPE_NEON_TBL4
> +                || type_attr == TYPE_NEON_FP_REDUC_MINMAX_D_Q
> +                || type_attr == TYPE_NEON_FP_REDUC_MINMAX_S_Q)
> +         v_slots = 3;
> +       constraints.safe_push ({OLYMPUS_V_PIPE, v_slots});
> +       total_slots = v_slots;
> +      }
> +      break;
> +
> +    case OLYMPUS_DISPATCH_V0:
> +      constraints.safe_push ({OLYMPUS_V_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_V0123_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_V03_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_V0_PIPE, 1});
> +      break;
> +
> +    case OLYMPUS_DISPATCH_V03:
> +      constraints.safe_push ({OLYMPUS_V_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_V0123_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_V03_PIPE, 1});
> +      break;
> +
> +    case OLYMPUS_DISPATCH_V12:
> +      {
> +       auto sve_type_attr = get_attr_sve_type (insn);
> +       int slots = (sve_type_attr == SVE_TYPE_SVE_INT_BIT_PERM) ? 2 : 1;
> +       constraints.safe_push ({OLYMPUS_V_PIPE, slots});
> +       constraints.safe_push ({OLYMPUS_V0123_PIPE, slots});
> +       constraints.safe_push ({OLYMPUS_V12_PIPE, slots});
> +       total_slots = slots;
> +      }
> +      break;
> +
> +    case OLYMPUS_DISPATCH_V45:
> +      constraints.safe_push ({OLYMPUS_V_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_V45_PIPE, 1});
> +      break;
> +
> +    case OLYMPUS_DISPATCH_V0123:
> +      constraints.safe_push ({OLYMPUS_V_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_V0123_PIPE, 1});
> +      break;
> +
> +    case OLYMPUS_DISPATCH_M_V:
> +      constraints.safe_push ({OLYMPUS_M_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_V_PIPE, 1});
> +      total_slots = 2;
> +      break;
> +
> +    case OLYMPUS_DISPATCH_L_V:
> +      {
> +       auto type_attr = get_attr_type (insn);
> +       auto sve_type_attr = get_attr_sve_type (insn);
> +       int l_sa_slots = 1;
> +       int v_slots = 1;
> +       if (type_attr == TYPE_NEON_LOAD2_2REG
> +           || type_attr == TYPE_NEON_LOAD2_ALL_LANES
> +           || type_attr == TYPE_NEON_LOAD2_ALL_LANES_Q
> +           || type_attr == TYPE_NEON_LOAD2_ONE_LANE)
> +         v_slots = 2;
> +       else if (type_attr == TYPE_NEON_LOAD2_2REG_Q
> +                || sve_type_attr == SVE_TYPE_SVE_LOAD_2REG)
> +         {
> +           l_sa_slots = 2;
> +           v_slots = 2;
> +         }
> +       else if (type_attr == TYPE_NEON_LOAD3_3REG
> +                || type_attr == TYPE_NEON_LOAD3_ALL_LANES
> +                || type_attr == TYPE_NEON_LOAD3_ONE_LANE
> +                || sve_type_attr == SVE_TYPE_SVE_LOAD_3REG)
> +         {
> +           l_sa_slots = 2;
> +           v_slots = 3;
> +         }
> +       else if (type_attr == TYPE_NEON_LOAD3_3REG_Q)
> +         {
> +           l_sa_slots = 3;
> +           v_slots = 3;
> +         }
> +       else if (type_attr == TYPE_NEON_LOAD4_4REG
> +                || type_attr == TYPE_NEON_LOAD4_ALL_LANES
> +                || type_attr == TYPE_NEON_LOAD4_ONE_LANE)
> +         {
> +           l_sa_slots = 2;
> +           v_slots = 4;
> +         }
> +       else if (type_attr == TYPE_NEON_LOAD4_4REG_Q
> +                || sve_type_attr == SVE_TYPE_SVE_LOAD_4REG)
> +         {
> +           l_sa_slots = 4;
> +           v_slots = 6;
> +         }
> +       constraints.safe_push ({OLYMPUS_L_SA_PIPE, l_sa_slots});
> +       constraints.safe_push ({OLYMPUS_V_PIPE, v_slots});
> +       total_slots = l_sa_slots + v_slots;
> +      }
> +      break;
> +
> +    case OLYMPUS_DISPATCH_M_L:
> +      constraints.safe_push ({OLYMPUS_M_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_L_SA_PIPE, 1});
> +      total_slots = 2;
> +      break;
> +
> +    case OLYMPUS_DISPATCH_M_V0123:
> +      constraints.safe_push ({OLYMPUS_M_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_V_PIPE, 1});
> +      constraints.safe_push ({OLYMPUS_V0123_PIPE, 1});
> +      total_slots = 2;
> +      break;
> +
> +    case OLYMPUS_DISPATCH_V_V0123:
> +      constraints.safe_push ({OLYMPUS_V_PIPE, 2});
> +      constraints.safe_push ({OLYMPUS_V0123_PIPE, 1});
> +      total_slots = 2;
> +      break;
> +
> +    case OLYMPUS_DISPATCH_L_V03:
> +      {
> +       auto sve_type_attr = get_attr_sve_type (insn);
> +       int l_slots = 1;
> +       if (sve_type_attr == SVE_TYPE_SVE_GATHERLOAD_32)
> +         l_slots = 4;
> +       else if (sve_type_attr == SVE_TYPE_SVE_GATHERLOAD_64)
> +         l_slots = 2;
> +       constraints.safe_push ({OLYMPUS_L_SA_PIPE, l_slots});
> +       constraints.safe_push ({OLYMPUS_V_PIPE, 1});
> +       constraints.safe_push ({OLYMPUS_V0123_PIPE, 1});
> +       constraints.safe_push ({OLYMPUS_V03_PIPE, 1});
> +       total_slots = l_slots + 1;
> +      }
> +      break;
> +
> +    case OLYMPUS_DISPATCH_SA_D:
> +      constraints.safe_push ({OLYMPUS_SA_PIPE, 1});
> +      break;
> +
> +    case OLYMPUS_DISPATCH_SA_V0123:
> +      {
> +       /* According to the note in section 4.1 of the SWOG, MOps using the
> +          V0123 pipeline do not count towards the limits, when those MOps
> +          are in the same instruction as a MOp in the SA pipeline.  That is
> +          why total_slots is set to the number of slots for the SA pipelines,
> +          disregarding the slots for the V0123 pipelines.  */
> +       auto type_attr = get_attr_type (insn);
> +       if (type_attr == TYPE_NEON_STORE1_3REG
> +           || type_attr == TYPE_NEON_STORE1_3REG_Q
> +           || type_attr == TYPE_NEON_STORE1_4REG
> +           || type_attr == TYPE_NEON_STORE1_4REG_Q)
> +         {
> +           constraints.safe_push ({OLYMPUS_SA_PIPE, 2});
> +           constraints.safe_push ({OLYMPUS_V_PIPE, 2});
> +           constraints.safe_push ({OLYMPUS_V0123_PIPE, 2});
> +           total_slots = 2;
> +         }
> +       else
> +         {
> +           constraints.safe_push ({OLYMPUS_SA_PIPE, 1});
> +           constraints.safe_push ({OLYMPUS_V_PIPE, 1});
> +           constraints.safe_push ({OLYMPUS_V0123_PIPE, 1});
> +           total_slots = 1;
> +         }
> +      }
> +      break;
> +
> +    case OLYMPUS_DISPATCH_SA_V_V0123:
> +      {
> +       auto type_attr = get_attr_type (insn);
> +       auto sve_type_attr = get_attr_sve_type (insn);
> +       int sa_slots = 1;
> +       int v_slots = 2;
> +       int v0123_slots = 1;
> +       if (type_attr == TYPE_NEON_STORE2_2REG_Q
> +           || type_attr == TYPE_NEON_STORE4_ONE_LANE
> +           || type_attr == TYPE_NEON_STORE4_ONE_LANE_Q)
> +         v_slots = 3;
> +       else if (type_attr == TYPE_NEON_STORE3_3REG
> +                || type_attr == TYPE_NEON_STORE3_ONE_LANE
> +                || type_attr == TYPE_NEON_STORE3_ONE_LANE_Q
> +                || sve_type_attr == SVE_TYPE_SVE_STORE_2REG)
> +         {
> +           sa_slots = 2;
> +           v_slots = 4;
> +           v0123_slots = 2;
> +         }
> +       else if (type_attr == TYPE_NEON_STORE3_3REG_Q)
> +         {
> +           sa_slots = 2;
> +           v_slots = 5;
> +           v0123_slots = 2;
> +         }
> +       else if (type_attr == TYPE_NEON_STORE4_4REG)
> +         v_slots = 5;
> +       else if (type_attr == TYPE_NEON_STORE4_4REG_Q)
> +         {
> +           sa_slots = 2;
> +           v_slots = 6;
> +           v0123_slots = 2;
> +         }
> +       else if (sve_type_attr == SVE_TYPE_SVE_STORE_3REG)
> +         {
> +           sa_slots = 3;
> +           v_slots = 6;
> +           v0123_slots = 3;
> +         }
> +       else if (sve_type_attr == SVE_TYPE_SVE_STORE_4REG)
> +         {
> +           sa_slots = 4;
> +           v_slots = 6;
> +           v0123_slots = 4;
> +         }
> +       else if (sve_type_attr == SVE_TYPE_SVE_SCATTERSTORE_32)
> +         {
> +           sa_slots = 4;
> +           v_slots = 5;
> +           v0123_slots = 4;
> +         }
> +       else if (sve_type_attr == SVE_TYPE_SVE_SCATTERSTORE_64)
> +         {
> +           sa_slots = 2;
> +           v_slots = 4;
> +           v0123_slots = 3;
> +         }
> +       constraints.safe_push ({OLYMPUS_SA_PIPE, sa_slots});
> +       constraints.safe_push ({OLYMPUS_V_PIPE, v_slots});
> +       constraints.safe_push ({OLYMPUS_V0123_PIPE, v0123_slots});
> +       /* We disregard slots from the V0123 pipelines in total_slots when
> +          the instruction also uses the SA pipelines, see comment in
> +          OLYMPUS_DISPATCH_SA_V0123.  */
> +       total_slots = sa_slots + (v_slots - v0123_slots);
> +      }
> +      break;
> +    }
> +
> +  /* Add total slots constraint  */
> +  constraints.safe_push ({OLYMPUS_TOTAL_SLOTS, total_slots});


Note I think we should change the callback API here and pass in a vec
that the callback adds to.
The main reason is that this vec has maybe 5 elements at most, so having
an auto_vec with room for 5 elements would speed things up slightly,
depending on how often this callback is called.
Otherwise, at least in this function, you could reserve space for at
least 4 elements, and that would still reduce how many times malloc is
called. Yes, this might waste 12 bytes, but wasting 12 bytes might be
faster than calling malloc each time safe_push is called, especially
since it is always called at least once at the end.

A similar thing should be done for neoversev2_dispatch_constraint_callback too.

Otherwise LGTM.

Thanks,
Andrew Pinski

> +
> +  return constraints;
> +}
> +
> +/* Olympus dispatch constraints configuration. */
> +static const struct dispatch_constraint_info 
> olympus_dispatch_constraint_info = {
> +  olympus_dispatch_max_slots,  /* max_slots */
> +  ARRAY_SIZE (olympus_dispatch_max_slots),  /* num_constraints */
> +  olympus_dispatch_constraint_callback  /* callback */
> +};
> +
>  static struct tune_params olympus_tunings =
>  {
>    &cortexa76_extra_costs,
> @@ -201,11 +559,12 @@ static struct tune_params olympus_tunings =
>    (AARCH64_EXTRA_TUNE_BASE
>     | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
>     | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
> -   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
> +   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
> +   | AARCH64_EXTRA_TUNE_DISPATCH_SCHED),       /* tune_flags.  */
>    &olympus_prefetch_tune,
>    AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
>    AARCH64_LDP_STP_POLICY_ALWAYS,   /* stp_policy_model.  */
> -  nullptr      /* dispatch_constraints.  */
> +  &olympus_dispatch_constraint_info    /* dispatch_constraints.  */
>  };
>
>  #endif /* GCC_AARCH64_H_OLYMPUS.  */
> --
> 2.34.1

Re: [PATCH 2/2] AArch64: Enable dispatch scheduling for Olympus core.

Reply via email to