Attaching the performance data as an Excel spreadsheet; it could not be sent together with the patch.
On Thu, Jan 23, 2014 at 1:49 PM, Xin Tong <trent.t...@gmail.com> wrote:
> This patch adds a victim TLB to the QEMU system mode TLB.
>
> QEMU system mode page table walks are expensive. Measurements taken by
> running qemu-system-x86_64 under Intel PIN show that a TLB miss and a
> walk of the guest Linux OS's 4-level page tables take ~450 x86
> instructions on average.
>
> The QEMU system mode TLB is implemented as a directly-mapped hash table.
> This structure suffers from conflict misses. Increasing the
> associativity of the TLB may not be the solution to conflict misses, as
> all the ways may have to be walked in serial.
>
> A victim TLB is a TLB used to hold translations evicted from the
> primary TLB upon replacement. The victim TLB lies between the main TLB
> and its refill path. The victim TLB is of greater associativity (fully
> associative in this patch). It takes longer to look up the victim TLB,
> but it is likely still cheaper than a full page table walk. The memory
> translation path is changed as follows:
>
> Before Victim TLB:
> 1. Inline TLB lookup.
> 2. Exit code cache on TLB miss.
> 3. Check for unaligned, IO accesses.
> 4. TLB refill.
> 5. Do the memory access.
> 6. Return to code cache.
>
> After Victim TLB:
> 1. Inline TLB lookup.
> 2. Exit code cache on TLB miss.
> 3. Check for unaligned, IO accesses.
> 4. Victim TLB lookup.
> 5. If the victim TLB misses, TLB refill.
> 6. Do the memory access.
> 7. Return to code cache.
>
> The advantage is that the victim TLB adds associativity to a directly
> mapped TLB, and thus potentially fewer page table walks, while still
> keeping the time taken to flush it within reasonable limits. However,
> placing the victim TLB before the refill path lengthens the refill path
> itself, as the victim TLB is consulted before the TLB refill. The
> performance results demonstrate that the pros outweigh the cons.
>
> Attached are performance results taken on the SPECINT2006 train
> datasets, a kernel boot, and the qemu configure script, on an
> Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine. In summary, the
> victim TLB improves the performance of qemu-system-x86_64 by 10.7% on
> average on SPECINT2006, with the highest improvement of 25.4% in
> 464.h264ref. The victim TLB does not cause a performance degradation in
> any of the measured benchmarks. Furthermore, the implemented victim TLB
> is architecture independent and is expected to benefit other
> architectures in QEMU as well.
>
> Although there are measurement fluctuations, the performance
> improvement is very significant and by no means within the range of
> measurement noise.
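For readers who want the changed path in one self-contained place, below is a minimal, stand-alone C99 sketch of the lookup order described above. It is an illustration only: the names (tlb_entry, translate, page_table_walk, tlb_flush_all) and the flattened types are mine, not QEMU's, and the page table walk is stubbed out.

#include <stdint.h>
#include <string.h>

#define TLB_SIZE    256  /* direct-mapped primary TLB */
#define VTLB_SIZE   8    /* small, fully associative victim TLB */
#define PAGE_BITS   12
#define PAGE_MASK64 (~(uint64_t)((1 << PAGE_BITS) - 1))

typedef struct {
    uint64_t vaddr;   /* tag: virtual page address, all-ones when invalid */
    uint64_t addend;  /* what a hit returns (host offset in QEMU) */
} tlb_entry;

static tlb_entry tlb[TLB_SIZE];
static tlb_entry vtlb[VTLB_SIZE];
static unsigned vtlb_next;  /* FIFO replacement pointer for the victim TLB */

/* call once at start; mirrors the memset(..., -1, ...) done on tlb_flush */
static void tlb_flush_all(void)
{
    memset(tlb, -1, sizeof(tlb));
    memset(vtlb, -1, sizeof(vtlb));
    vtlb_next = 0;
}

/* stand-in for the expensive guest page table walk (~450 host insns) */
static tlb_entry page_table_walk(uint64_t vaddr)
{
    tlb_entry e = { vaddr & PAGE_MASK64, 0 /* pretend identity mapping */ };
    return e;
}

static uint64_t translate(uint64_t vaddr)
{
    uint64_t page = vaddr & PAGE_MASK64;
    unsigned idx = (vaddr >> PAGE_BITS) & (TLB_SIZE - 1);
    int v;

    /* 1. direct-mapped lookup (done inline in generated code in QEMU) */
    if (tlb[idx].vaddr == page) {
        return tlb[idx].addend + vaddr;
    }

    /* 2. probe the victim TLB before resorting to a page table walk */
    for (v = VTLB_SIZE - 1; v >= 0; --v) {
        if (vtlb[v].vaddr == page) {
            /* hit: swap the victim entry with the conflicting primary entry */
            tlb_entry tmp = tlb[idx];
            tlb[idx] = vtlb[v];
            vtlb[v] = tmp;
            return tlb[idx].addend + vaddr;
        }
    }

    /* 3. miss everywhere: evict the displaced primary entry into the
     *    victim TLB (FIFO), then walk the page table and refill */
    vtlb[vtlb_next++ % VTLB_SIZE] = tlb[idx];
    tlb[idx] = page_table_walk(vaddr);
    return tlb[idx].addend + vaddr;
}

int main(void)
{
    tlb_flush_all();
    uint64_t host = translate(0x7f0040001234ULL);  /* cold: walk + refill */
    host = translate(0x7f0040001234ULL);           /* hot: primary TLB hit */
    (void)host;
    return 0;
}

The FIFO eviction in step 3 corresponds to the vtlb_index++ % CPU_VTLB_SIZE replacement done in tlb_set_page() in the patch below.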
>
> Reviewed-by: Richard Henderson <r...@twiddle.net>
> Signed-off-by: Xin Tong <trent.t...@gmail.com>
>
> ---
>  cputlb.c                        | 50 +++++++++++++++++++++++++-
>  include/exec/cpu-defs.h         | 16 ++++++---
>  include/exec/exec-all.h         |  2 ++
>  include/exec/softmmu_template.h | 80 ++++++++++++++++++++++++++++++++++++++---
>  4 files changed, 138 insertions(+), 10 deletions(-)
>
> diff --git a/cputlb.c b/cputlb.c
> index b533f3f..03a048a 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -34,6 +34,22 @@
>  /* statistics */
>  int tlb_flush_count;
>
> +/* swap the 2 given TLB entries as well as their corresponding IOTLB */
> +inline void swap_tlb(CPUTLBEntry *te, CPUTLBEntry *se, hwaddr *iote,
> +                     hwaddr *iose)
> +{
> +    hwaddr iotmp;
> +    CPUTLBEntry t;
> +    /* swap iotlb */
> +    iotmp = *iote;
> +    *iote = *iose;
> +    *iose = iotmp;
> +    /* swap tlb */
> +    memcpy(&t, te, sizeof(CPUTLBEntry));
> +    memcpy(te, se, sizeof(CPUTLBEntry));
> +    memcpy(se, &t, sizeof(CPUTLBEntry));
> +}
> +
>  /* NOTE:
>   * If flush_global is true (the usual case), flush all tlb entries.
>   * If flush_global is false, flush (at least) all tlb entries not
> @@ -58,8 +74,10 @@ void tlb_flush(CPUArchState *env, int flush_global)
>      cpu->current_tb = NULL;
>
>      memset(env->tlb_table, -1, sizeof(env->tlb_table));
> +    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
>      memset(env->tb_jmp_cache, 0, sizeof(env->tb_jmp_cache));
>
> +    env->vtlb_index = 0;
>      env->tlb_flush_addr = -1;
>      env->tlb_flush_mask = 0;
>      tlb_flush_count++;
> @@ -106,6 +124,14 @@ void tlb_flush_page(CPUArchState *env, target_ulong addr)
>          tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
>      }
>
> +    /* check whether there are entries that need to be flushed in the vtlb */
> +    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
> +        unsigned int k;
> +        for (k = 0; k < CPU_VTLB_SIZE; k++) {
> +            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
> +        }
> +    }
> +
>      tb_flush_jmp_cache(env, addr);
>  }
>
> @@ -170,6 +196,11 @@ void cpu_tlb_reset_dirty_all(ram_addr_t start1, ram_addr_t length)
>                  tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i],
>                                        start1, length);
>              }
> +
> +            for (i = 0; i < CPU_VTLB_SIZE; i++) {
> +                tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i],
> +                                      start1, length);
> +            }
>          }
>      }
>  }
> @@ -193,6 +224,13 @@ void tlb_set_dirty(CPUArchState *env, target_ulong vaddr)
>      for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
>          tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr);
>      }
> +
> +    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
> +        unsigned int k;
> +        for (k = 0; k < CPU_VTLB_SIZE; k++) {
> +            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
> +        }
> +    }
>  }
>
>  /* Our TLB does not support large pages, so remember the area covered by
> @@ -264,8 +302,18 @@ void tlb_set_page(CPUArchState *env, target_ulong vaddr,
>                                              prot, &address);
>
>      index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
> -    env->iotlb[mmu_idx][index] = iotlb - vaddr;
>      te = &env->tlb_table[mmu_idx][index];
> +
> +    /* do not discard the translation in te, evict it into a victim tlb */
> +    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
> +    env->tlb_v_table[mmu_idx][vidx].addr_read = te->addr_read;
> +    env->tlb_v_table[mmu_idx][vidx].addr_write = te->addr_write;
> +    env->tlb_v_table[mmu_idx][vidx].addr_code = te->addr_code;
> +    env->tlb_v_table[mmu_idx][vidx].addend = te->addend;
> +    env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
> +
> +    /* refill the tlb */
> +    env->iotlb[mmu_idx][index] = iotlb - vaddr;
>      te->addend = addend - vaddr;
>      if (prot & PAGE_READ) {
>          te->addr_read = address;
> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
> index 01cd8c7..18d5f0d 100644
> --- a/include/exec/cpu-defs.h
> +++ b/include/exec/cpu-defs.h
> @@ -72,8 +72,10 @@ typedef uint64_t target_ulong;
>  #define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE)
>
>  #if !defined(CONFIG_USER_ONLY)
> -#define CPU_TLB_BITS 8
> -#define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
> +#define CPU_TLB_BITS 8
> +#define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
> +/* use a fully associative victim tlb */
> +#define CPU_VTLB_SIZE 8
>
>  #if HOST_LONG_BITS == 32 && TARGET_LONG_BITS == 32
>  #define CPU_TLB_ENTRY_BITS 4
> @@ -103,12 +105,16 @@ typedef struct CPUTLBEntry {
>
>  QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
>
> +/* The meaning of the MMU modes is defined in the target code. */
>  #define CPU_COMMON_TLB \
>      /* The meaning of the MMU modes is defined in the target code. */  \
> -    CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
> -    hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                           \
> +    CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
> +    CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
> +    hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                           \
> +    hwaddr iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                        \
>      target_ulong tlb_flush_addr;                                        \
> -    target_ulong tlb_flush_mask;
> +    target_ulong tlb_flush_mask;                                        \
> +    target_ulong vtlb_index;                                            \
>
>  #else
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index ea90b64..7e88b08 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -102,6 +102,8 @@ void tlb_set_page(CPUArchState *env, target_ulong vaddr,
>                    hwaddr paddr, int prot,
>                    int mmu_idx, target_ulong size);
>  void tb_invalidate_phys_addr(hwaddr addr);
> +/* swap the 2 given tlb entries as well as their iotlb */
> +void swap_tlb(CPUTLBEntry *te, CPUTLBEntry *se, hwaddr *iote, hwaddr *iose);
>  #else
>  static inline void tlb_flush_page(CPUArchState *env, target_ulong addr)
>  {
> diff --git a/include/exec/softmmu_template.h b/include/exec/softmmu_template.h
> index c6a5440..fe11343 100644
> --- a/include/exec/softmmu_template.h
> +++ b/include/exec/softmmu_template.h
> @@ -141,6 +141,7 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>      target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      uintptr_t haddr;
>      DATA_TYPE res;
> +    int vtlb_idx;
>
>      /* Adjust the given return address.  */
>      retaddr -= GETPC_ADJ;
> @@ -153,7 +154,24 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>              do_unaligned_access(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        /* we are about to do a page table walk. our last hope is the
> +         * victim tlb. try to refill from the victim tlb before walking the
> +         * page table. */
> +        for (vtlb_idx = CPU_VTLB_SIZE - 1; vtlb_idx >= 0; --vtlb_idx) {
> +            if (env->tlb_v_table[mmu_idx][vtlb_idx].ADDR_READ
> +                == (addr & TARGET_PAGE_MASK)) {
> +                /* found entry in victim tlb */
> +                swap_tlb(&env->tlb_table[mmu_idx][index],
> +                         &env->tlb_v_table[mmu_idx][vtlb_idx],
> +                         &env->iotlb[mmu_idx][index],
> +                         &env->iotlb_v[mmu_idx][vtlb_idx]);
> +                break;
> +            }
> +        }
> +        /* miss victim tlb */
> +        if (vtlb_idx < 0) {
> +            tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      }
>
> @@ -223,6 +241,7 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>      target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      uintptr_t haddr;
>      DATA_TYPE res;
> +    int vtlb_idx;
>
>      /* Adjust the given return address.  */
>      retaddr -= GETPC_ADJ;
> @@ -235,7 +254,24 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>              do_unaligned_access(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        /* we are about to do a page table walk. our last hope is the
> +         * victim tlb. try to refill from the victim tlb before walking the
> +         * page table. */
> +        for (vtlb_idx = CPU_VTLB_SIZE - 1; vtlb_idx >= 0; --vtlb_idx) {
> +            if (env->tlb_v_table[mmu_idx][vtlb_idx].ADDR_READ
> +                == (addr & TARGET_PAGE_MASK)) {
> +                /* found entry in victim tlb */
> +                swap_tlb(&env->tlb_table[mmu_idx][index],
> +                         &env->tlb_v_table[mmu_idx][vtlb_idx],
> +                         &env->iotlb[mmu_idx][index],
> +                         &env->iotlb_v[mmu_idx][vtlb_idx]);
> +                break;
> +            }
> +        }
> +        /* miss victim tlb */
> +        if (vtlb_idx < 0) {
> +            tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      }
>
> @@ -342,6 +378,7 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>      int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
>      target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      uintptr_t haddr;
> +    int vtlb_idx;
>
>      /* Adjust the given return address.  */
>      retaddr -= GETPC_ADJ;
> @@ -354,7 +391,24 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              do_unaligned_access(env, addr, 1, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(env, addr, 1, mmu_idx, retaddr);
> +        /* we are about to do a page table walk. our last hope is the
> +         * victim tlb. try to refill from the victim tlb before walking the
> +         * page table. */
> +        for (vtlb_idx = CPU_VTLB_SIZE - 1; vtlb_idx >= 0; --vtlb_idx) {
> +            if (env->tlb_v_table[mmu_idx][vtlb_idx].addr_write
> +                == (addr & TARGET_PAGE_MASK)) {
> +                /* found entry in victim tlb */
> +                swap_tlb(&env->tlb_table[mmu_idx][index],
> +                         &env->tlb_v_table[mmu_idx][vtlb_idx],
> +                         &env->iotlb[mmu_idx][index],
> +                         &env->iotlb_v[mmu_idx][vtlb_idx]);
> +                break;
> +            }
> +        }
> +        /* miss victim tlb */
> +        if (vtlb_idx < 0) {
> +            tlb_fill(env, addr, 1, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      }
>
> @@ -418,6 +472,7 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>      int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
>      target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      uintptr_t haddr;
> +    int vtlb_idx;
>
>      /* Adjust the given return address.  */
>      retaddr -= GETPC_ADJ;
> @@ -430,7 +485,24 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              do_unaligned_access(env, addr, 1, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(env, addr, 1, mmu_idx, retaddr);
> +        /* we are about to do a page table walk. our last hope is the
> +         * victim tlb. try to refill from the victim tlb before walking the
> +         * page table. */
> +        for (vtlb_idx = CPU_VTLB_SIZE - 1; vtlb_idx >= 0; --vtlb_idx) {
> +            if (env->tlb_v_table[mmu_idx][vtlb_idx].addr_write
> +                == (addr & TARGET_PAGE_MASK)) {
> +                /* found entry in victim tlb */
> +                swap_tlb(&env->tlb_table[mmu_idx][index],
> +                         &env->tlb_v_table[mmu_idx][vtlb_idx],
> +                         &env->iotlb[mmu_idx][index],
> +                         &env->iotlb_v[mmu_idx][vtlb_idx]);
> +                break;
> +            }
> +        }
> +        /* miss victim tlb */
> +        if (vtlb_idx < 0) {
> +            tlb_fill(env, addr, 1, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      }
>
> --
> 1.8.3.2
>
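One note for reviewers following the softmmu_template.h hunks: the probe-and-swap sequence is repeated verbatim in all four load/store helpers. While reading the diff, it may help to think of it as a helper along the following lines. This is only a hypothetical sketch written against the structures the patch adds (tlb_v_table, iotlb_v, swap_tlb); it is not part of the patch and not a standalone program.

/* Hypothetical consolidation of the probe repeated in the four helpers.
 * Returns true if the victim TLB held a translation for 'page' and that
 * entry has been swapped into main TLB slot 'index' for 'mmu_idx'.
 * 'elt_ofs' selects addr_read/addr_write/addr_code, mirroring the
 * ADDR_READ token used in the templates. */
static inline bool victim_tlb_hit(CPUArchState *env, int mmu_idx, int index,
                                  size_t elt_ofs, target_ulong page)
{
    int vidx;
    for (vidx = CPU_VTLB_SIZE - 1; vidx >= 0; --vidx) {
        CPUTLBEntry *vtlb = &env->tlb_v_table[mmu_idx][vidx];
        target_ulong cmp = *(target_ulong *)((uintptr_t)vtlb + elt_ofs);
        if (cmp == page) {
            /* found entry in victim tlb: promote it to the main TLB */
            swap_tlb(&env->tlb_table[mmu_idx][index], vtlb,
                     &env->iotlb[mmu_idx][index],
                     &env->iotlb_v[mmu_idx][vidx]);
            return true;
        }
    }
    return false;  /* caller falls back to tlb_fill() */
}

In this form the load helpers would call it with offsetof(CPUTLBEntry, ADDR_READ) and the store helpers with offsetof(CPUTLBEntry, addr_write), passing addr & TARGET_PAGE_MASK as the page tag.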
vtlb.xlsx
Description: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet