Attaching the performance data as an Excel spreadsheet; it could not be sent together with the patch.
On Thu, Jan 23, 2014 at 1:49 PM, Xin Tong <trent.t...@gmail.com> wrote:
> This patch adds a victim TLB to the QEMU system mode TLB.
>
> QEMU system mode page table walks are expensive. Measurements taken by
> running qemu-system-x86_64 under Intel PIN show that a TLB miss and a
> walk of the guest Linux OS's 4-level page tables take ~450 x86
> instructions on average.
>
> The QEMU system mode TLB is implemented as a directly-mapped hash table.
> This structure suffers from conflict misses. Increasing the
> associativity of the TLB may not be the solution to conflict misses, as
> all the ways may have to be walked in serial.
>
> A victim TLB is a TLB used to hold translations evicted from the
> primary TLB upon replacement. The victim TLB lies between the main TLB
> and its refill path. The victim TLB is of greater associativity (fully
> associative in this patch). It takes longer to look up the victim TLB,
> but it is likely still cheaper than a full page table walk. The memory
> translation path is changed as follows:
>
> Before Victim TLB:
> 1. Inline TLB lookup.
> 2. Exit code cache on TLB miss.
> 3. Check for unaligned, IO accesses.
> 4. TLB refill.
> 5. Do the memory access.
> 6. Return to code cache.
>
> After Victim TLB:
> 1. Inline TLB lookup.
> 2. Exit code cache on TLB miss.
> 3. Check for unaligned, IO accesses.
> 4. Victim TLB lookup.
> 5. If the victim TLB misses, TLB refill.
> 6. Do the memory access.
> 7. Return to code cache.
>
> The advantage is that the victim TLB adds associativity to a directly
> mapped TLB, and thus potentially fewer page table walks, while still
> keeping the time taken to flush it within reasonable limits. However,
> placing the victim TLB before the refill path lengthens the refill path
> itself, as the victim TLB is consulted before the TLB refill. The
> performance results demonstrate that the pros outweigh the cons.
>
> Attached are performance results taken on the SPECINT2006 train
> datasets, a kernel boot, and the qemu configure script, on an
> Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine. In summary, the
> victim TLB improves the performance of qemu-system-x86_64 by 10.7% on
> average on SPECINT2006, with the highest improvement of 25.4% in
> 464.h264ref. The victim TLB does not cause a performance degradation in
> any of the measured benchmarks. Furthermore, the implemented victim TLB
> is architecture independent and is expected to benefit other
> architectures in QEMU as well.
>
> Although there are measurement fluctuations, the performance
> improvement is very significant and by no means within the range of
> measurement noise.
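For readers who want the changed path in one self-contained place, below is a minimal, stand-alone C99 sketch of the lookup order described above. It is an illustration only: the names (tlb_entry, translate, page_table_walk, tlb_flush_all) and the flattened types are mine, not QEMU's, and the page table walk is stubbed out.

#include <stdint.h>
#include <string.h>

#define TLB_SIZE    256  /* direct-mapped primary TLB */
#define VTLB_SIZE   8    /* small, fully associative victim TLB */
#define PAGE_BITS   12
#define PAGE_MASK64 (~(uint64_t)((1 << PAGE_BITS) - 1))

typedef struct {
    uint64_t vaddr;   /* tag: virtual page address, all-ones when invalid */
    uint64_t addend;  /* what a hit returns (host offset in QEMU) */
} tlb_entry;

static tlb_entry tlb[TLB_SIZE];
static tlb_entry vtlb[VTLB_SIZE];
static unsigned vtlb_next;  /* FIFO replacement pointer for the victim TLB */

/* call once at start; mirrors the memset(..., -1, ...) done on tlb_flush */
static void tlb_flush_all(void)
{
    memset(tlb, -1, sizeof(tlb));
    memset(vtlb, -1, sizeof(vtlb));
    vtlb_next = 0;
}

/* stand-in for the expensive guest page table walk (~450 host insns) */
static tlb_entry page_table_walk(uint64_t vaddr)
{
    tlb_entry e = { vaddr & PAGE_MASK64, 0 /* pretend identity mapping */ };
    return e;
}

static uint64_t translate(uint64_t vaddr)
{
    uint64_t page = vaddr & PAGE_MASK64;
    unsigned idx = (vaddr >> PAGE_BITS) & (TLB_SIZE - 1);
    int v;

    /* 1. direct-mapped lookup (done inline in generated code in QEMU) */
    if (tlb[idx].vaddr == page) {
        return tlb[idx].addend + vaddr;
    }

    /* 2. probe the victim TLB before resorting to a page table walk */
    for (v = VTLB_SIZE - 1; v >= 0; --v) {
        if (vtlb[v].vaddr == page) {
            /* hit: swap the victim entry with the conflicting primary entry */
            tlb_entry tmp = tlb[idx];
            tlb[idx] = vtlb[v];
            vtlb[v] = tmp;
            return tlb[idx].addend + vaddr;
        }
    }

    /* 3. miss everywhere: evict the displaced primary entry into the
     *    victim TLB (FIFO), then walk the page table and refill */
    vtlb[vtlb_next++ % VTLB_SIZE] = tlb[idx];
    tlb[idx] = page_table_walk(vaddr);
    return tlb[idx].addend + vaddr;
}

int main(void)
{
    tlb_flush_all();
    uint64_t host = translate(0x7f0040001234ULL);  /* cold: walk + refill */
    host = translate(0x7f0040001234ULL);           /* hot: primary TLB hit */
    (void)host;
    return 0;
}

The FIFO eviction in step 3 corresponds to the vtlb_index++ % CPU_VTLB_SIZE replacement done in tlb_set_page() in the patch below.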
>
> Reviewed-by: Richard Henderson <r...@twiddle.net>
> Signed-off-by: Xin Tong <trent.t...@gmail.com>
>
> ---
>  cputlb.c                        | 50 +++++++++++++++++++++++++-
>  include/exec/cpu-defs.h         | 16 ++++++---
>  include/exec/exec-all.h         |  2 ++
>  include/exec/softmmu_template.h | 80 ++++++++++++++++++++++++++++++++++++++---
>  4 files changed, 138 insertions(+), 10 deletions(-)
>
> diff --git a/cputlb.c b/cputlb.c
> index b533f3f..03a048a 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -34,6 +34,22 @@
>  /* statistics */
>  int tlb_flush_count;
>
> +/* swap the 2 given TLB entries as well as their corresponding IOTLB */
> +inline void swap_tlb(CPUTLBEntry *te, CPUTLBEntry *se, hwaddr *iote,
> +                     hwaddr *iose)
> +{
> +    hwaddr iotmp;
> +    CPUTLBEntry t;
> +    /* swap iotlb */
> +    iotmp = *iote;
> +    *iote = *iose;
> +    *iose = iotmp;
> +    /* swap tlb */
> +    memcpy(&t, te, sizeof(CPUTLBEntry));
> +    memcpy(te, se, sizeof(CPUTLBEntry));
> +    memcpy(se, &t, sizeof(CPUTLBEntry));
> +}
> +
>  /* NOTE:
>   * If flush_global is true (the usual case), flush all tlb entries.
>   * If flush_global is false, flush (at least) all tlb entries not
> @@ -58,8 +74,10 @@ void tlb_flush(CPUArchState *env, int flush_global)
>      cpu->current_tb = NULL;
>
>      memset(env->tlb_table, -1, sizeof(env->tlb_table));
> +    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
>      memset(env->tb_jmp_cache, 0, sizeof(env->tb_jmp_cache));
>
> +    env->vtlb_index = 0;
>      env->tlb_flush_addr = -1;
>      env->tlb_flush_mask = 0;
>      tlb_flush_count++;
> @@ -106,6 +124,14 @@ void tlb_flush_page(CPUArchState *env, target_ulong addr)
>          tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
>      }
>
> +    /* check whether there are entries that need to be flushed in the vtlb */
> +    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
> +        unsigned int k;
> +        for (k = 0; k < CPU_VTLB_SIZE; k++) {
> +            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
> +        }
> +    }
> +
>      tb_flush_jmp_cache(env, addr);
>  }
>
> @@ -170,6 +196,11 @@ void cpu_tlb_reset_dirty_all(ram_addr_t start1, ram_addr_t length)
>                  tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i],
>                                        start1, length);
>              }
> +
> +            for (i = 0; i < CPU_VTLB_SIZE; i++) {
> +                tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i],
> +                                      start1, length);
> +            }
>          }
>      }
>  }
> @@ -193,6 +224,13 @@ void tlb_set_dirty(CPUArchState *env, target_ulong vaddr)
>      for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
>          tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr);
>      }
> +
> +    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
> +        unsigned int k;
> +        for (k = 0; k < CPU_VTLB_SIZE; k++) {
> +            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
> +        }
> +    }
>  }
>
>  /* Our TLB does not support large pages, so remember the area covered by
> @@ -264,8 +302,18 @@ void tlb_set_page(CPUArchState *env, target_ulong vaddr,
>                                              prot, &address);
>
>      index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
> -    env->iotlb[mmu_idx][index] = iotlb - vaddr;
>      te = &env->tlb_table[mmu_idx][index];
> +
> +    /* do not discard the translation in te, evict it into a victim tlb */
> +    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
> +    env->tlb_v_table[mmu_idx][vidx].addr_read = te->addr_read;
> +    env->tlb_v_table[mmu_idx][vidx].addr_write = te->addr_write;
> +    env->tlb_v_table[mmu_idx][vidx].addr_code = te->addr_code;
> +    env->tlb_v_table[mmu_idx][vidx].addend = te->addend;
> +    env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
> +
> +    /* refill the tlb */
> +    env->iotlb[mmu_idx][index] = iotlb - vaddr;
>      te->addend = addend - vaddr;
>      if (prot & PAGE_READ) {
>          te->addr_read = address;
> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
> index 01cd8c7..18d5f0d 100644
> --- a/include/exec/cpu-defs.h
> +++ b/include/exec/cpu-defs.h
> @@ -72,8 +72,10 @@ typedef uint64_t target_ulong;
>  #define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE)
>
>  #if !defined(CONFIG_USER_ONLY)
> -#define CPU_TLB_BITS 8
> -#define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
> +#define CPU_TLB_BITS 8
> +#define CPU_TLB_SIZE (1 << CPU_TLB_BITS)
> +/* use a fully associative victim tlb */
> +#define CPU_VTLB_SIZE 8
>
>  #if HOST_LONG_BITS == 32 && TARGET_LONG_BITS == 32
>  #define CPU_TLB_ENTRY_BITS 4
> @@ -103,12 +105,16 @@ typedef struct CPUTLBEntry {
>
>  QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
>
> +/* The meaning of the MMU modes is defined in the target code. */
>  #define CPU_COMMON_TLB \
>      /* The meaning of the MMU modes is defined in the target code. */  \
> -    CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
> -    hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                           \
> +    CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
> +    CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
> +    hwaddr iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                           \
> +    hwaddr iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                        \
>      target_ulong tlb_flush_addr;                                        \
> -    target_ulong tlb_flush_mask;
> +    target_ulong tlb_flush_mask;                                        \
> +    target_ulong vtlb_index;                                            \
>
>  #else
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index ea90b64..7e88b08 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -102,6 +102,8 @@ void tlb_set_page(CPUArchState *env, target_ulong vaddr,
>                    hwaddr paddr, int prot,
>                    int mmu_idx, target_ulong size);
>  void tb_invalidate_phys_addr(hwaddr addr);
> +/* swap the 2 given tlb entries as well as their iotlb */
> +void swap_tlb(CPUTLBEntry *te, CPUTLBEntry *se, hwaddr *iote, hwaddr *iose);
>  #else
>  static inline void tlb_flush_page(CPUArchState *env, target_ulong addr)
>  {
> diff --git a/include/exec/softmmu_template.h b/include/exec/softmmu_template.h
> index c6a5440..fe11343 100644
> --- a/include/exec/softmmu_template.h
> +++ b/include/exec/softmmu_template.h
> @@ -141,6 +141,7 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>      target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      uintptr_t haddr;
>      DATA_TYPE res;
> +    int vtlb_idx;
>
>      /* Adjust the given return address.  */
>      retaddr -= GETPC_ADJ;
> @@ -153,7 +154,24 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>              do_unaligned_access(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        /* we are about to do a page table walk. our last hope is the
> +         * victim tlb. try to refill from the victim tlb before walking the
> +         * page table. */
> +        for (vtlb_idx = CPU_VTLB_SIZE - 1; vtlb_idx >= 0; --vtlb_idx) {
> +            if (env->tlb_v_table[mmu_idx][vtlb_idx].ADDR_READ
> +                == (addr & TARGET_PAGE_MASK)) {
> +                /* found entry in victim tlb */
> +                swap_tlb(&env->tlb_table[mmu_idx][index],
> +                         &env->tlb_v_table[mmu_idx][vtlb_idx],
> +                         &env->iotlb[mmu_idx][index],
> +                         &env->iotlb_v[mmu_idx][vtlb_idx]);
> +                break;
> +            }
> +        }
> +        /* miss victim tlb */
> +        if (vtlb_idx < 0) {
> +            tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      }
>
> @@ -223,6 +241,7 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>      target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      uintptr_t haddr;
>      DATA_TYPE res;
> +    int vtlb_idx;
>
>      /* Adjust the given return address.  */
>      retaddr -= GETPC_ADJ;
> @@ -235,7 +254,24 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr, int mmu_idx,
>              do_unaligned_access(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        /* we are about to do a page table walk. our last hope is the
> +         * victim tlb. try to refill from the victim tlb before walking the
> +         * page table. */
> +        for (vtlb_idx = CPU_VTLB_SIZE - 1; vtlb_idx >= 0; --vtlb_idx) {
> +            if (env->tlb_v_table[mmu_idx][vtlb_idx].ADDR_READ
> +                == (addr & TARGET_PAGE_MASK)) {
> +                /* found entry in victim tlb */
> +                swap_tlb(&env->tlb_table[mmu_idx][index],
> +                         &env->tlb_v_table[mmu_idx][vtlb_idx],
> +                         &env->iotlb[mmu_idx][index],
> +                         &env->iotlb_v[mmu_idx][vtlb_idx]);
> +                break;
> +            }
> +        }
> +        /* miss victim tlb */
> +        if (vtlb_idx < 0) {
> +            tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
>      }
>
> @@ -342,6 +378,7 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>      int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
>      target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      uintptr_t haddr;
> +    int vtlb_idx;
>
>      /* Adjust the given return address.  */
>      retaddr -= GETPC_ADJ;
> @@ -354,7 +391,24 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              do_unaligned_access(env, addr, 1, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(env, addr, 1, mmu_idx, retaddr);
> +        /* we are about to do a page table walk. our last hope is the
> +         * victim tlb. try to refill from the victim tlb before walking the
> +         * page table. */
> +        for (vtlb_idx = CPU_VTLB_SIZE - 1; vtlb_idx >= 0; --vtlb_idx) {
> +            if (env->tlb_v_table[mmu_idx][vtlb_idx].addr_write
> +                == (addr & TARGET_PAGE_MASK)) {
> +                /* found entry in victim tlb */
> +                swap_tlb(&env->tlb_table[mmu_idx][index],
> +                         &env->tlb_v_table[mmu_idx][vtlb_idx],
> +                         &env->iotlb[mmu_idx][index],
> +                         &env->iotlb_v[mmu_idx][vtlb_idx]);
> +                break;
> +            }
> +        }
> +        /* miss victim tlb */
> +        if (vtlb_idx < 0) {
> +            tlb_fill(env, addr, 1, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      }
>
> @@ -418,6 +472,7 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>      int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
>      target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      uintptr_t haddr;
> +    int vtlb_idx;
>
>      /* Adjust the given return address.  */
>      retaddr -= GETPC_ADJ;
> @@ -430,7 +485,24 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              do_unaligned_access(env, addr, 1, mmu_idx, retaddr);
>          }
>  #endif
> -        tlb_fill(env, addr, 1, mmu_idx, retaddr);
> +        /* we are about to do a page table walk. our last hope is the
> +         * victim tlb. try to refill from the victim tlb before walking the
> +         * page table. */
> +        for (vtlb_idx = CPU_VTLB_SIZE - 1; vtlb_idx >= 0; --vtlb_idx) {
> +            if (env->tlb_v_table[mmu_idx][vtlb_idx].addr_write
> +                == (addr & TARGET_PAGE_MASK)) {
> +                /* found entry in victim tlb */
> +                swap_tlb(&env->tlb_table[mmu_idx][index],
> +                         &env->tlb_v_table[mmu_idx][vtlb_idx],
> +                         &env->iotlb[mmu_idx][index],
> +                         &env->iotlb_v[mmu_idx][vtlb_idx]);
> +                break;
> +            }
> +        }
> +        /* miss victim tlb */
> +        if (vtlb_idx < 0) {
> +            tlb_fill(env, addr, 1, mmu_idx, retaddr);
> +        }
>          tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
>      }
>
> --
> 1.8.3.2
>
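One note for reviewers following the softmmu_template.h hunks: the probe-and-swap sequence is repeated verbatim in all four load/store helpers. While reading the diff, it may help to think of it as a helper along the following lines. This is only a hypothetical sketch written against the structures the patch adds (tlb_v_table, iotlb_v, swap_tlb); it is not part of the patch and not a standalone program.

/* Hypothetical consolidation of the probe repeated in the four helpers.
 * Returns true if the victim TLB held a translation for 'page' and that
 * entry has been swapped into main TLB slot 'index' for 'mmu_idx'.
 * 'elt_ofs' selects addr_read/addr_write/addr_code, mirroring the
 * ADDR_READ token used in the templates. */
static inline bool victim_tlb_hit(CPUArchState *env, int mmu_idx, int index,
                                  size_t elt_ofs, target_ulong page)
{
    int vidx;
    for (vidx = CPU_VTLB_SIZE - 1; vidx >= 0; --vidx) {
        CPUTLBEntry *vtlb = &env->tlb_v_table[mmu_idx][vidx];
        target_ulong cmp = *(target_ulong *)((uintptr_t)vtlb + elt_ofs);
        if (cmp == page) {
            /* found entry in victim tlb: promote it to the main TLB */
            swap_tlb(&env->tlb_table[mmu_idx][index], vtlb,
                     &env->iotlb[mmu_idx][index],
                     &env->iotlb_v[mmu_idx][vidx]);
            return true;
        }
    }
    return false;  /* caller falls back to tlb_fill() */
}

In this form the load helpers would call it with offsetof(CPUTLBEntry, ADDR_READ) and the store helpers with offsetof(CPUTLBEntry, addr_write), passing addr & TARGET_PAGE_MASK as the page tag.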
vtlb.xlsx
Description: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet