Hi, I reworked the patch to use a tuning flag instead of explicitly checking for the CPU flavor. I will soon send an update for it which won't use the static variable anymore and will instead use the SCHED API.
I would first like to get some comments on the current version. Regards, Kai
Jim Wilson <jim.wil...@linaro.org> Kai Tietz <kai.ti...@linaro.org> * config/aarch64/aarch64.c (aarch64_sched_reorder): Implement TARGET_SCHED_REORDER hook. (aarch64_variable_issue): Implement TARGET_SCHED_VARIABLE_ISSUE hook. (qdf24xx_): Add AARCH64_EXTRA_TUNE_SCHED_MICRO_OPS tune flag. (TARGET_SCHED_REORDER): Define. (TARGET_SCHED_VARIABLE_ISSUE): Likewise. * config/aarch64/falkor.md (falkor_variable_issue): New. * config/aarch64/aarch64-tuning-flags.def (SCHED_MICRO_OPS): New flag. Index: trunk/gcc/config/aarch64/aarch64.c =================================================================== --- trunk.orig/gcc/config/aarch64/aarch64.c +++ trunk/gcc/config/aarch64/aarch64.c @@ -955,7 +955,7 @@ static const struct tune_params qdf24xx_ &generic_branch_cost, &generic_approx_modes, 4, /* memmov_cost */ - 4, /* issue_rate */ + 8, /* issue_rate */ (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */ "16", /* function_align. */ @@ -968,7 +968,7 @@ static const struct tune_params qdf24xx_ 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */ + AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS | AARCH64_EXTRA_TUNE_SCHED_MICRO_OPS, /* tune_flags. */ &qdf24xx_prefetch_tune }; @@ -18037,6 +18037,109 @@ aarch64_run_selftests (void) #endif /* #if CHECKING_P */ +/* The number of micro ops left over after issuing the last instruction in a + cycle. This is subtracted from the next cycle before we start issuing insns. + This is initialized to 0 at the start of every basic block. */ +static int leftover_uops = 0; + +/* Implement TARGET_SCHED_REORDER. 
*/ + +static int +aarch64_sched_reorder (FILE *file, int verbose, + rtx_insn **ready ATTRIBUTE_UNUSED, + int *n_readyp ATTRIBUTE_UNUSED, + int clock) +{ + int can_issue_more = aarch64_sched_issue_rate (); + + if ((aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_SCHED_MICRO_OPS) != 0) + { + /* The start of a basic block. */ + if (clock == 0) + { + if (leftover_uops && file && (verbose > 3)) + fprintf (file, ";;\tLeftover uops ignored at bb start.\n"); + + leftover_uops = 0; + } + + /* Account for issue slots left over from previous cycle. This value + can be larger than the number of issue slots per cycle, so we need + to check it here before scheduling any instructions. */ + else if (leftover_uops) + { + can_issue_more -= leftover_uops; + + if (file && (verbose > 3)) + { + fprintf (file, ";;\tUse %d issue slots for leftover uops.\n", + leftover_uops); + fprintf (file, ";;\t%d issue slots left.\n", can_issue_more); + } + + leftover_uops = 0; + + if (can_issue_more < 0) + { + leftover_uops = 0 - can_issue_more; + can_issue_more = 0; + + if (file && (verbose > 3)) + { + fprintf (file, ";;skipping issue cycle.\n"); + fprintf (file, ";;\t%d uops left over.\n", leftover_uops); + } + } + } + } + + return can_issue_more; +} + +/* Implement TARGET_SCHED_VARIABLE_ISSUE. */ + +static int +aarch64_variable_issue (FILE *file, int verbose, + rtx_insn *insn, int more) +{ + if (GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) + { + if ((aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_SCHED_MICRO_OPS) == 0) + more -= 1; + else + { + /* There is for now just falkor target supporting scheduling + of micro operations. Therefore we don't need to check. 
*/ + int issue_slots = get_attr_falkor_variable_issue (insn); + more -= issue_slots; + + if (file && (verbose > 3)) + { + fprintf (file, ";;\tInsn takes %d issue slots.\n", issue_slots); + fprintf (file, ";;\t%d issue slots left.\n", more); + } + + /* We schedule an instruction first, and then subtract issue slots, + which means the result can be negative. We carry the extra over + to the next cycle. */ + + if (more < 0) + { + leftover_uops = 0 - more; + more = 0; + + if (file && (verbose > 3)) + fprintf (file, ";;\t%d uops left over.\n", leftover_uops); + } + } + } + + return more; +} + #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST aarch64_address_cost @@ -18265,6 +18368,12 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_SCHED_ISSUE_RATE #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate +#undef TARGET_SCHED_REORDER +#define TARGET_SCHED_REORDER aarch64_sched_reorder + +#undef TARGET_SCHED_VARIABLE_ISSUE +#define TARGET_SCHED_VARIABLE_ISSUE aarch64_variable_issue + #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ aarch64_sched_first_cycle_multipass_dfa_lookahead Index: trunk/gcc/config/aarch64/falkor.md =================================================================== --- trunk.orig/gcc/config/aarch64/falkor.md +++ trunk/gcc/config/aarch64/falkor.md @@ -685,3 +685,24 @@ (define_bypass 3 "falkor_afp_5_vxvy_mul,falkor_afp_5_vxvy_mla,falkor_afp_5_vxvy_vxvy_mul,falkor_afp_5_vxvy_vxvy_mla,falkor_afp_6_vxvy_mul,falkor_afp_6_vxvy_mla,falkor_afp_6_vxvy_vxvy_mul,falkor_afp_6_vxvy_vxvy_mla,falkor_fpdt_5_vxvy_mul,falkor_fpdt_5_vxvy_mla,falkor_fpdt_6_vxvy_mul,falkor_fpdt_6_vxvy_mla" "falkor_afp_5_vxvy_mla,falkor_afp_5_vxvy_vxvy_mla,falkor_afp_6_vxvy_mla,falkor_afp_6_vxvy_vxvy_mla,falkor_fpdt_5_vxvy_mla,falkor_fpdt_6_vxvy_mla") + + +(define_attr "falkor_variable_issue" "" + (cond [ +;; A64 Instructions + (eq_attr "type" 
"neon_fp_neg_s_q,neon_fp_neg_d_q,neon_fp_abs_s_q,neon_fp_abs_d_q,neon_fp_minmax_s_q,neon_fp_minmax_d_q,neon_fp_compare_s_q,neon_fp_compare_d_q,neon_fp_round_s_q,neon_fp_round_d_q,neon_fp_abd_s_q,neon_fp_abd_d_q,neon_fp_addsub_s_q,neon_fp_addsub_d_q,neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q,neon_fp_to_int_s_q,neon_fp_to_int_d_q,neon_int_to_fp_s_q,neon_int_to_fp_d_q,neon_fp_mla_d_q,neon_fp_mla_d_scalar_q,neon_fp_div_s,neon_fp_div_d,neon_fp_sqrt_s,neon_fp_sqrt_d,neon_shift_imm_long,neon_add_q,neon_reduc_add_q,neon_logic_q,neon_neg_q,neon_sub_q,neon_add_halve_q,neon_sub_halve_q,neon_shift_imm_q,neon_shift_reg_q,neon_minmax_q,neon_abs_q,neon_compare_q,neon_compare_zero_q,neon_tst_q,neon_reduc_add_long,neon_shift_acc_q,neon_reduc_add_acc_q,neon_abd_q,neon_abd_long,neon_qadd_q,neon_qsub_q,neon_qabs_q,neon_qneg_q,neon_sat_shift_imm_q,neon_sat_shift_reg_q,neon_mul_b_q,neon_mul_h_q,neon_mul_s_q,neon_mul_h_scalar_q,neon_mul_s_scalar_q,neon_sat_mul_b_q,neon_sat_mul_h_q,neon_sat_mul_s_q,neon_mul_b_long,neon_mul_h_long,neon_mul_s_long,neon_mul_d_long,neon_mul_h_scalar_long,neon_mul_s_scalar_long,neon_sat_mul_b_long,neon_sat_mul_h_long,neon_sat_mul_s_long,neon_sat_mul_h_scalar_q,neon_sat_mul_s_scalar_q,neon_sat_mul_h_scalar_long,neon_sat_mul_s_scalar_long,neon_mla_b_q,neon_mla_h_q,neon_mla_s_q,neon_mla_h_scalar_q,neon_mla_s_scalar_q,neon_mla_b_long,neon_mla_h_long,neon_mla_s_long,neon_mla_h_scalar_long,neon_mla_s_scalar_long,neon_sat_mla_b_long,neon_sat_mla_h_long,neon_sat_mla_s_long,neon_sat_mla_h_scalar_long,neon_sat_mla_s_scalar_long,neon_add_halve_narrow_q,neon_sub_halve_narrow_q,neon_arith_acc,neon_load1_2reg,neon_load2_2reg,neon_load2_all_lanes,neon_load1_2reg_q,neon_load2_2reg_q,neon_load2_all_lanes_q,neon_load3_one_lane,neon_load4_one_lane,neon_ldp,neon_ldp_q,neon_from_gp_q,neon_bsl_q,neon_dup_q,neon_ext_q,neon_move_q,neon_rev_q,neon_tbl1_q,neon_permute_q,neon_cls_q,neon_cnt_q,neon_rbit_q,neon_tbl2,neon_fp_recpe_s_q,neon_fp_recpe_d_q,neon_fp_rsqrte_s_q,neon_fp_rsqrte_d_q
,neon_fp_recps_s_q,neon_fp_recps_d_q,neon_fp_rsqrts_d_q,neon_store1_1reg,neon_store1_1reg_q,neon_store1_one_lane,neon_store1_one_lane_q,neon_store1_2reg,neon_store2_2reg,neon_store2_one_lane,neon_store2_one_lane_q,neon_stp,crypto_sha1_xor,crypto_sha256_fast,crypto_sha1_slow,crypto_sha256_slow,crypto_aese,f_stores,f_stored,fdivs,fdivd,sdiv,udiv") + (const_int 2) + (eq_attr "type" "neon_fp_cvt_narrow_s_q,neon_fp_cvt_narrow_d_q,neon_load1_3reg,neon_load3_3reg,neon_load3_all_lanes,neon_load1_3reg_q,neon_load3_3reg_q,neon_load3_all_lanes_q,neon_tbl2_q,neon_tbl3") + (const_int 3) + (eq_attr "type" "neon_fp_div_s_q,neon_fp_div_d_q,neon_fp_sqrt_s_q,neon_fp_sqrt_d_q,neon_add_widen,neon_sub_widen,neon_arith_acc_q,neon_load1_4reg,neon_load4_4reg,neon_load1_4reg_q,neon_load4_4reg_q,neon_load4_all_lanes,neon_load4_all_lanes_q,neon_tbl3_q,neon_tbl4,neon_store1_2reg_q,neon_store1_3reg,neon_store1_4reg,neon_store2_2reg_q,neon_store3_3reg,neon_store4_4reg,neon_store3_one_lane,neon_store3_one_lane_q,neon_store4_one_lane,neon_store4_one_lane_q,neon_stp_q") + (const_int 4) + (eq_attr "type" "neon_tbl4_q") + (const_int 5) + (eq_attr "type" "neon_store1_3reg_q,neon_store3_3reg_q") + (const_int 6) + (eq_attr "type" "neon_store1_4reg_q,neon_store4_4reg_q") + (const_int 8) + (eq_attr "type" "multiple") + (const_int 2) + ] + (const_int 1))) Index: trunk/gcc/config/aarch64/aarch64-tuning-flags.def =================================================================== --- trunk.orig/gcc/config/aarch64/aarch64-tuning-flags.def +++ trunk/gcc/config/aarch64/aarch64-tuning-flags.def @@ -46,4 +46,7 @@ AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS) +/* Enable micro-op scheduling instead of default instruction one. */ +AARCH64_EXTRA_TUNING_OPTION ("sched_microops", SCHED_MICRO_OPS) + #undef AARCH64_EXTRA_TUNING_OPTION