On Wed Sep 13, 2023 at 6:24 AM AEST, Glenn Miles wrote: > This commit continues adding support for the Branch History > Rolling Buffer (BHRB) as is provided starting with the P8 > processor and continuing with its successors. This commit > is limited to the recording and filtering of taken branches. > > The following changes were made: > > - Added a BHRB buffer for storing branch instruction and > target addresses for taken branches > - Renamed gen_update_cfar to gen_update_branch_history and > added a 'target' parameter to hold the branch target > address and 'inst_type' parameter to use for filtering > - Added a combination of jit-time and run-time checks to > gen_update_branch_history for determining if a branch > should be recorded > - Added TCG code to gen_update_branch_history that stores > data to the BHRB and updates the BHRB offset. > - Added BHRB resource initialization and reset functions > - Enabled functionality for P8, P9 and P10 processors. > > Signed-off-by: Glenn Miles <mil...@linux.vnet.ibm.com> > --- > target/ppc/cpu.h | 18 +++- > target/ppc/cpu_init.c | 41 ++++++++- > target/ppc/helper_regs.c | 32 +++++++ > target/ppc/helper_regs.h | 1 + > target/ppc/power8-pmu.c | 2 + > target/ppc/power8-pmu.h | 7 ++ > target/ppc/translate.c | 114 +++++++++++++++++++++++-- > target/ppc/translate/branch-impl.c.inc | 2 +- > 8 files changed, 205 insertions(+), 12 deletions(-) > > diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h > index 20ae1466a5..bda1afb700 100644 > --- a/target/ppc/cpu.h > +++ b/target/ppc/cpu.h > @@ -454,8 +454,9 @@ FIELD(MSR, LE, MSR_LE, 1) > #define MMCR2_UREG_MASK (MMCR2_FC1P0 | MMCR2_FC2P0 | MMCR2_FC3P0 | \ > MMCR2_FC4P0 | MMCR2_FC5P0 | MMCR2_FC6P0) > > -#define MMCRA_BHRBRD PPC_BIT(26) /* BHRB Recording Disable */ > - > +#define MMCRA_BHRBRD PPC_BIT(26) /* BHRB Recording Disable */
Fold this tidying into patch 1. > +#define MMCRA_IFM_MASK PPC_BITMASK(32, 33) /* BHRB Instruction Filtering */ > +#define MMCRA_IFM_SHIFT PPC_BIT_NR(33) > > #define MMCR1_EVT_SIZE 8 > /* extract64() does a right shift before extracting */ > @@ -682,6 +683,8 @@ enum { > POWERPC_FLAG_SMT = 0x00400000, > /* Using "LPAR per core" mode (as opposed to per-thread) > */ > POWERPC_FLAG_SMT_1LPAR = 0x00800000, > + /* Has BHRB */ > + POWERPC_FLAG_BHRB = 0x01000000, > }; Interesting question of which patch to add different flags. I'm strongly in add when you add code that uses them like this one, but it's a matter of taste and not always practical to be an absolute rule. I don't mind too much what others do, but maybe this and the pcc->flags init should go in patch 1 since that's adding flags that aren't yet used? > > /* > @@ -1110,6 +1113,9 @@ DEXCR_ASPECT(PHIE, 6) > #define PPC_CPU_OPCODES_LEN 0x40 > #define PPC_CPU_INDIRECT_OPCODES_LEN 0x20 > > +#define BHRB_MAX_NUM_ENTRIES_LOG2 (5) > +#define BHRB_MAX_NUM_ENTRIES (1 << BHRB_MAX_NUM_ENTRIES_LOG2) > + > struct CPUArchState { > /* Most commonly used resources during translated code execution first */ > target_ulong gpr[32]; /* general purpose registers */ > @@ -1196,6 +1202,14 @@ struct CPUArchState { > int dcache_line_size; > int icache_line_size; > > + /* Branch History Rolling Buffer (BHRB) resources */ > + target_ulong bhrb_num_entries; > + target_ulong bhrb_base; > + target_ulong bhrb_filter; > + target_ulong bhrb_offset; > + target_ulong bhrb_offset_mask; > + uint64_t bhrb[BHRB_MAX_NUM_ENTRIES]; Put these under ifdef TARGET_PPC64? > + > /* These resources are used during exception processing */ > /* CPU model definition */ > target_ulong msr_mask; > diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c > index 568f9c3b88..19d7505a73 100644 > --- a/target/ppc/cpu_init.c > +++ b/target/ppc/cpu_init.c > @@ -6100,6 +6100,28 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data) > pcc->l1_icache_size = 0x8000; > } > > +static void bhrb_init_state(CPUPPCState *env, target_long num_entries_log2) > +{ > + if (env->flags & POWERPC_FLAG_BHRB) { > + if (num_entries_log2 > BHRB_MAX_NUM_ENTRIES_LOG2) { > + num_entries_log2 = BHRB_MAX_NUM_ENTRIES_LOG2; > + } > + env->bhrb_num_entries = 1 << num_entries_log2; > + env->bhrb_base = (target_long)&env->bhrb[0]; > + env->bhrb_offset_mask = (env->bhrb_num_entries * sizeof(uint64_t)) - > 1; > + } > +} > + > +static void bhrb_reset_state(CPUPPCState *env) > +{ > + if (env->flags & POWERPC_FLAG_BHRB) { > + env->bhrb_offset = 0; > + env->bhrb_filter = 0; > + memset(env->bhrb, 0, sizeof(env->bhrb)); > + } > +} > + > +#define POWER8_BHRB_ENTRIES_LOG2 5 > static void init_proc_POWER8(CPUPPCState *env) > { > /* Common Registers */ > @@ -6141,6 +6163,8 @@ static void init_proc_POWER8(CPUPPCState *env) > env->dcache_line_size = 128; > env->icache_line_size = 128; > > + bhrb_init_state(env, POWER8_BHRB_ENTRIES_LOG2); > + > /* Allocate hardware IRQ controller */ > init_excp_POWER8(env); > ppcPOWER7_irq_init(env_archcpu(env)); > @@ -6241,7 +6265,8 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data) > pcc->flags = POWERPC_FLAG_VRE | POWERPC_FLAG_SE | > POWERPC_FLAG_BE | POWERPC_FLAG_PMM | > POWERPC_FLAG_BUS_CLK | POWERPC_FLAG_CFAR | > - POWERPC_FLAG_VSX | POWERPC_FLAG_TM; > + POWERPC_FLAG_VSX | POWERPC_FLAG_TM | > + POWERPC_FLAG_BHRB; > pcc->l1_dcache_size = 0x8000; > pcc->l1_icache_size = 0x8000; > } > @@ -6265,6 +6290,7 @@ static struct ppc_radix_page_info > POWER9_radix_page_info = { > }; > #endif /* CONFIG_USER_ONLY */ > > +#define POWER9_BHRB_ENTRIES_LOG2 5 > static void init_proc_POWER9(CPUPPCState *env) > { > /* Common Registers */ > @@ -6315,6 +6341,8 @@ static void init_proc_POWER9(CPUPPCState *env) > env->dcache_line_size = 128; > env->icache_line_size = 128; > > + bhrb_init_state(env, POWER9_BHRB_ENTRIES_LOG2); > + > /* Allocate hardware IRQ controller */ > init_excp_POWER9(env); > ppcPOWER9_irq_init(env_archcpu(env)); > @@ -6434,7 +6462,8 @@ POWERPC_FAMILY(POWER9)(ObjectClass *oc, void *data) > pcc->flags = POWERPC_FLAG_VRE | POWERPC_FLAG_SE | > POWERPC_FLAG_BE | POWERPC_FLAG_PMM | > POWERPC_FLAG_BUS_CLK | POWERPC_FLAG_CFAR | > - POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV; > + POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV | > + POWERPC_FLAG_BHRB; > pcc->l1_dcache_size = 0x8000; > pcc->l1_icache_size = 0x8000; > } > @@ -6458,6 +6487,7 @@ static struct ppc_radix_page_info > POWER10_radix_page_info = { > }; > #endif /* !CONFIG_USER_ONLY */ > > +#define POWER10_BHRB_ENTRIES_LOG2 5 > static void init_proc_POWER10(CPUPPCState *env) > { > /* Common Registers */ > @@ -6505,6 +6535,8 @@ static void init_proc_POWER10(CPUPPCState *env) > env->dcache_line_size = 128; > env->icache_line_size = 128; > > + bhrb_init_state(env, POWER10_BHRB_ENTRIES_LOG2); > + > /* Allocate hardware IRQ controller */ > init_excp_POWER10(env); > ppcPOWER9_irq_init(env_archcpu(env)); > @@ -6610,7 +6642,8 @@ POWERPC_FAMILY(POWER10)(ObjectClass *oc, void *data) > pcc->flags = POWERPC_FLAG_VRE | POWERPC_FLAG_SE | > POWERPC_FLAG_BE | POWERPC_FLAG_PMM | > POWERPC_FLAG_BUS_CLK | POWERPC_FLAG_CFAR | > - POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV; > + POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV | > + POWERPC_FLAG_BHRB; > pcc->l1_dcache_size = 0x8000; > pcc->l1_icache_size = 0x8000; > } > @@ -7178,6 +7211,8 @@ static void ppc_cpu_reset_hold(Object *obj) > } > env->spr[i] = spr->default_value; > } > + > + bhrb_reset_state(env); > } > > #ifndef CONFIG_USER_ONLY > diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c > index 4ff054063d..68b27196d9 100644 > --- a/target/ppc/helper_regs.c > +++ b/target/ppc/helper_regs.c > @@ -752,3 +752,35 @@ void register_usprgh_sprs(CPUPPCState *env) > &spr_read_ureg, SPR_NOACCESS, > 0x00000000); > } > + > +void hreg_bhrb_filter_update(CPUPPCState *env) > +{ > + target_long ifm; > + > + if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_PMAE)) { > + /* disable recording to BHRB */ > + env->bhrb_filter = BHRB_TYPE_NORECORD; > + return; > + } > + > + ifm = (env->spr[SPR_POWER_MMCRA] & MMCRA_IFM_MASK) >> MMCRA_IFM_SHIFT; > + switch (ifm) { > + case 0: > + /* record all branches */ > + env->bhrb_filter = -1; > + break; > + case 1: > + /* only record calls (LK = 1) */ > + env->bhrb_filter = BHRB_TYPE_CALL; > + break; > + case 2: > + /* only record indirect branches */ > + env->bhrb_filter = BHRB_TYPE_INDIRECT; > + break; > + case 3: > + /* only record conditional branches */ > + env->bhrb_filter = BHRB_TYPE_COND; > + break; > + } > +} > + > diff --git a/target/ppc/helper_regs.h b/target/ppc/helper_regs.h > index 8196c1346d..ab6aba1c24 100644 > --- a/target/ppc/helper_regs.h > +++ b/target/ppc/helper_regs.h > @@ -25,6 +25,7 @@ void hreg_compute_hflags(CPUPPCState *env); > void hreg_update_pmu_hflags(CPUPPCState *env); > void cpu_interrupt_exittb(CPUState *cs); > int hreg_store_msr(CPUPPCState *env, target_ulong value, int alter_hv); > +void hreg_bhrb_filter_update(CPUPPCState *env); > > #ifdef CONFIG_USER_ONLY > static inline void check_tlb_flush(CPUPPCState *env, bool global) { } > diff --git a/target/ppc/power8-pmu.c b/target/ppc/power8-pmu.c > index 6f5d4e1256..18149a15b2 100644 > --- a/target/ppc/power8-pmu.c > +++ b/target/ppc/power8-pmu.c > @@ -261,6 +261,7 @@ void helper_store_mmcr0(CPUPPCState *env, target_ulong > value) > env->spr[SPR_POWER_MMCR0] = value; > > pmu_mmcr01a_updated(env); > + hreg_bhrb_filter_update(env); > > /* Update cycle overflow timers with the current MMCR0 state */ > pmu_update_overflow_timers(env); > @@ -280,6 +281,7 @@ void helper_store_mmcrA(CPUPPCState *env, uint64_t value) > env->spr[SPR_POWER_MMCRA] = value; > > pmu_mmcr01a_updated(env); > + hreg_bhrb_filter_update(env); > } > Can these just be moved into pmu_mmcr01a_updated? AFAIKS they only depend on MMCR0/A. > target_ulong helper_read_pmc(CPUPPCState *env, uint32_t sprn) > diff --git a/target/ppc/power8-pmu.h b/target/ppc/power8-pmu.h > index 87fa8c9334..a887094045 100644 > --- a/target/ppc/power8-pmu.h > +++ b/target/ppc/power8-pmu.h > @@ -17,6 +17,13 @@ > > #define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL > > +#define BHRB_TYPE_NORECORD 0x00 > +#define BHRB_TYPE_CALL 0x01 > +#define BHRB_TYPE_INDIRECT 0x02 > +#define BHRB_TYPE_COND 0x04 > +#define BHRB_TYPE_OTHER 0x08 > +#define BHRB_TYPE_XL_FORM 0x10 > + > void cpu_ppc_pmu_init(CPUPPCState *env); > void pmu_mmcr01a_updated(CPUPPCState *env); > #else > diff --git a/target/ppc/translate.c b/target/ppc/translate.c > index d93fbd4574..7824475f54 100644 > --- a/target/ppc/translate.c > +++ b/target/ppc/translate.c > @@ -177,6 +177,7 @@ struct DisasContext { > #if defined(TARGET_PPC64) > bool sf_mode; > bool has_cfar; > + bool has_bhrb; > #endif > bool fpu_enabled; > bool altivec_enabled; > @@ -4104,12 +4105,100 @@ static void gen_rvwinkle(DisasContext *ctx) > } > #endif /* #if defined(TARGET_PPC64) */ > > -static inline void gen_update_cfar(DisasContext *ctx, target_ulong nip) > + > +static inline void gen_update_branch_history(DisasContext *ctx, > + target_ulong nip, > + TCGv target, > + target_long inst_type) > { > #if defined(TARGET_PPC64) > + TCGv t0; > + TCGv t1; > + TCGv t2; > + TCGv t3; > + TCGLabel *no_update; > + > if (ctx->has_cfar) { > tcg_gen_movi_tl(cpu_cfar, nip); > } > + > + if (!ctx->has_bhrb || inst_type == BHRB_TYPE_NORECORD) { > + return; > + } > + > + /* ISA 3.1 adds the PMCRA[BRHBRD] and problem state checks */ > + if ((ctx->insns_flags2 & PPC2_ISA310) && (ctx->mmcra_bhrbrd || > !ctx->pr)) { > + return; > + } > + > + /* Check for BHRB "frozen" conditions */ > + if (ctx->mmcr0_fcpc) { > + if (ctx->mmcr0_fcp) { > + if ((ctx->hv) && (ctx->pr)) { > + return; > + } > + } else if (!(ctx->hv) && (ctx->pr)) { > + return; > + } > + } else if ((ctx->mmcr0_fcp) && (ctx->pr)) { > + return; > + } > + > + t0 = tcg_temp_new(); > + t1 = tcg_temp_new(); > + t2 = tcg_temp_new(); > + t3 = tcg_temp_new(); > + no_update = gen_new_label(); > + > + /* check for bhrb filtering */ > + tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUPPCState, bhrb_filter)); > + tcg_gen_andi_tl(t0, t0, inst_type); > + tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, no_update); > + > + /* load bhrb base address into t2 */ > + tcg_gen_ld_tl(t2, cpu_env, offsetof(CPUPPCState, bhrb_base)); > + > + /* load current bhrb_offset into t0 */ > + tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUPPCState, bhrb_offset)); > + > + /* load a BHRB offset mask into t3 */ > + tcg_gen_ld_tl(t3, cpu_env, offsetof(CPUPPCState, bhrb_offset_mask)); > + > + /* add t2 to t0 to get address of bhrb entry */ > + tcg_gen_add_tl(t1, t0, t2); > + > + /* store nip into bhrb at bhrb_offset */ > + tcg_gen_st_i64(tcg_constant_i64(nip), (TCGv_ptr)t1, 0); > + > + /* add 8 to current bhrb_offset */ > + tcg_gen_addi_tl(t0, t0, 8); > + > + /* apply offset mask */ > + tcg_gen_and_tl(t0, t0, t3); For some cases where the value remains live for a time and not reused, maybe use a more descriptive name? I know existing code is pretty lazy with names (and I share some blame). t0,t2,t3 could be base, offset, mask respectively, and t1 could be tmp that is used for filter and address generation. Just a suggestion? > + > + if (inst_type & BHRB_TYPE_XL_FORM) { > + /* Also record the target address for XL-Form branches */ > + > + /* add t2 to t0 to get address of bhrb entry */ > + tcg_gen_add_tl(t1, t0, t2); > + > + /* Set the 'T' bit for target entries */ > + tcg_gen_ori_tl(t2, target, 0x2); > + > + /* Store target address in bhrb */ > + tcg_gen_st_tl(t2, (TCGv_ptr)t1, 0); > + > + /* add 8 to current bhrb_offset */ > + tcg_gen_addi_tl(t0, t0, 8); > + > + /* apply offset mask */ > + tcg_gen_and_tl(t0, t0, t3); > + } I would say this is bordering on warranting a new function since it mostly duplicates previous block. You could have it take base, mask, offset and value to store, and have it return new offset. > + > + /* save updated bhrb_offset for next time */ > + tcg_gen_st_tl(t0, cpu_env, offsetof(CPUPPCState, bhrb_offset)); > + > + gen_set_label(no_update); > #endif > } It all looks pretty good otherwise. I do worry about POWER8/9 which do not have BHRB disable bit. How much do they slow down I wonder? Thanks, Nick