On 28.08.14 19:14, Paolo Bonzini wrote: > PowerPC TCG flushes the TLB on every IR/DR change, which basically > means on every user<->kernel context switch. Use the 6-element > TLB array as a cache, where each MMU index is mapped to a different > state of the IR/DR/PR/HV bits. > > This brings the number of TLB flushes down from ~900000 to ~50000 > for starting up the Debian installer, which is in line with x86 > and gives a ~10% performance improvement. > > Signed-off-by: Paolo Bonzini <pbonz...@redhat.com> > --- > cputlb.c | 19 +++++++++++++++++ > hw/ppc/spapr_hcall.c | 6 +++++- > include/exec/exec-all.h | 5 +++++ > target-ppc/cpu.h | 4 +++- > target-ppc/excp_helper.c | 6 +----- > target-ppc/helper_regs.h | 52 > +++++++++++++++++++++++++++++++-------------- > target-ppc/translate_init.c | 5 +++++ > 7 files changed, 74 insertions(+), 23 deletions(-) > > diff --git a/cputlb.c b/cputlb.c > index afd3705..17e1b03 100644 > --- a/cputlb.c > +++ b/cputlb.c > @@ -67,6 +67,25 @@ void tlb_flush(CPUState *cpu, int flush_global) > tlb_flush_count++; > } > > +void tlb_flush_idx(CPUState *cpu, int mmu_idx) > +{ > + CPUArchState *env = cpu->env_ptr; > + > +#if defined(DEBUG_TLB) > + printf("tlb_flush_idx %d:\n", mmu_idx); > +#endif > + /* must reset current TB so that interrupts cannot modify the > + links while we are modifying them */ > + cpu->current_tb = NULL; > + > + memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[mmu_idx])); > + memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache)); > + > + env->tlb_flush_addr = -1; > + env->tlb_flush_mask = 0; > + tlb_flush_count++; > +} > + > static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr) > { > if (addr == (tlb_entry->addr_read & > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c > index 467858c..b95961c 100644 > --- a/hw/ppc/spapr_hcall.c > +++ b/hw/ppc/spapr_hcall.c > @@ -556,13 +556,17 @@ static target_ulong h_cede(PowerPCCPU *cpu, > sPAPREnvironment *spapr, > { > CPUPPCState *env = &cpu->env; > CPUState *cs = CPU(cpu); > + bool flush; > > env->msr |= (1ULL << MSR_EE); > - hreg_compute_hflags(env); > + flush = hreg_compute_hflags(env); > if (!cpu_has_work(cs)) { > cs->halted = 1; > cs->exception_index = EXCP_HLT; > cs->exit_request = 1; > + } else if (flush) { > + cs->interrupt_request |= CPU_INTERRUPT_EXITTB; > + cs->exit_request = 1;
Can this ever happen? > } > return H_SUCCESS; > } > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h > index 5e5d86e..629a550 100644 > --- a/include/exec/exec-all.h > +++ b/include/exec/exec-all.h > @@ -100,6 +100,7 @@ void tcg_cpu_address_space_init(CPUState *cpu, > AddressSpace *as); > /* cputlb.c */ > void tlb_flush_page(CPUState *cpu, target_ulong addr); > void tlb_flush(CPUState *cpu, int flush_global); > +void tlb_flush_idx(CPUState *cpu, int mmu_idx); > void tlb_set_page(CPUState *cpu, target_ulong vaddr, > hwaddr paddr, int prot, > int mmu_idx, target_ulong size); > @@ -112,6 +113,10 @@ static inline void tlb_flush_page(CPUState *cpu, > target_ulong addr) > static inline void tlb_flush(CPUState *cpu, int flush_global) > { > } > + > +static inline void tlb_flush_idx(CPUState *cpu, int mmu_idx) > +{ > +} > #endif > > #define CODE_GEN_ALIGN 16 /* must be >= of the size of a icache > line */ > diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h > index b64c652..c1cb27f 100644 > --- a/target-ppc/cpu.h > +++ b/target-ppc/cpu.h > @@ -922,7 +922,7 @@ struct ppc_segment_page_sizes { > > > /*****************************************************************************/ > /* The whole PowerPC CPU context */ > -#define NB_MMU_MODES 3 > +#define NB_MMU_MODES 6 > > #define PPC_CPU_OPCODES_LEN 0x40 > > @@ -1085,6 +1085,8 @@ struct CPUPPCState { > target_ulong hflags; /* hflags is a MSR & HFLAGS_MASK */ > target_ulong hflags_nmsr; /* specific hflags, not coming from MSR */ > int mmu_idx; /* precomputed MMU index to speed up mem accesses */ > + uint32_t mmu_msr[NB_MMU_MODES]; /* ir/dr/hv/pr values for TLBs */ > + int mmu_fifo; /* for replacement in mmu_msr */ > > /* Power management */ > int (*check_pow)(CPUPPCState *env); > diff --git a/target-ppc/excp_helper.c b/target-ppc/excp_helper.c > index be71590..bf25d44 100644 > --- a/target-ppc/excp_helper.c > +++ b/target-ppc/excp_helper.c > @@ -623,9 +623,6 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int > excp_model, int excp) > > if (env->spr[SPR_LPCR] & LPCR_AIL) { > new_msr |= (1 << MSR_IR) | (1 << MSR_DR); > - } else if (msr & ((1 << MSR_IR) | (1 << MSR_DR))) { > - /* If we disactivated any translation, flush TLBs */ > - tlb_flush(cs, 1); > } > > #ifdef TARGET_PPC64 > @@ -678,8 +675,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int > excp_model, int excp) > if ((env->mmu_model == POWERPC_MMU_BOOKE) || > (env->mmu_model == POWERPC_MMU_BOOKE206)) { > /* XXX: The BookE changes address space when switching modes, > - we should probably implement that as different MMU indexes, > - but for the moment we do it the slow way and flush all. */ > + TODO: still needed?!? */ > tlb_flush(cs, 1); > } > } > diff --git a/target-ppc/helper_regs.h b/target-ppc/helper_regs.h > index 271fddf..291f9c1 100644 > --- a/target-ppc/helper_regs.h > +++ b/target-ppc/helper_regs.h > @@ -39,17 +39,38 @@ static inline void hreg_swap_gpr_tgpr(CPUPPCState *env) > env->tgpr[3] = tmp; > } > > -static inline void hreg_compute_mem_idx(CPUPPCState *env) > +static inline bool hreg_compute_mem_idx(CPUPPCState *env) > { > - /* Precompute MMU index */ > - if (msr_pr == 0 && msr_hv != 0) { > - env->mmu_idx = 2; > - } else { > - env->mmu_idx = 1 - msr_pr; > + CPUState *cs = CPU(ppc_env_get_cpu(env)); > + int msr = env->msr; > + int i; > + > + if (!tcg_enabled()) { > + return false; > + } > + > + msr &= (1 << MSR_IR) | (1 << MSR_DR) | (1 << MSR_PR) | MSR_HVB; > + if (msr_pr == 1) { > + msr &= ~MSR_HVB; > } > + > + for (i = 0; i < NB_MMU_MODES; i++) { > + if (env->mmu_msr[i] == msr) { > + env->mmu_idx = i; > + return false; > + } > + } > + > + /* Use a new index with FIFO replacement. */ > + i = (env->mmu_fifo == NB_MMU_MODES - 1 ? 0 : env->mmu_fifo + 1); > + env->mmu_fifo = i; > + env->mmu_msr[i] = msr; > + env->mmu_idx = i; > + tlb_flush_idx(cs, i); > + return true; > } Ok, so this basically changes the semantics of mmu_idx from a static array with predefined meanings to a dynamic array with runtime changing semantics. The first thing that comes to mind here is why we're not just extending the existing array? After all, we have 4 bits -> 16 states minus one for PR+HV. Can our existing logic not deal with this? Second thing I'm failing to grasp still is that in the previous patch you're changing ctx.mem_idx into to different static semantics. But that mem_idx gets passed to our ld/st helpers which again boils down to the mem_idx above, no? So aren't we accessing random unrelated mmu contexts now? There's a good chance I'm not fully grasping something here :). Alex