Re: [Qemu-devel] [PATCH v7 4/7] exec: [tcg] Use different TBs according to the vCPU's dynamic tracing state

2017-06-01 Thread Emilio G. Cota
On Fri, Jan 13, 2017 at 21:48:35 +0100, Lluís Vilanova wrote:
>  9 files changed, 54 insertions(+), 11 deletions(-)
> 
> diff --git a/cpu-exec.c b/cpu-exec.c
> index 4188fed3c6..36709cba1f 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -261,6 +261,7 @@ struct tb_desc {
>  CPUArchState *env;
>  tb_page_addr_t phys_page1;
>  uint32_t flags;
> +TRACE_QHT_VCPU_DSTATE_TYPE trace_vcpu_dstate;
(snip)
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 57cd978578..ae74f61ea2 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -200,6 +200,10 @@ static inline void tlb_flush_by_mmuidx(CPUState *cpu, 
> ...)
>  #define USE_DIRECT_JUMP
>  #endif
>  
> +/**
> + * TranslationBlock:
> + * @trace_vcpu_dstate: Per-vCPU dynamic tracing state used to generate this 
> TB.
> + */
>  struct TranslationBlock {
>  target_ulong pc;   /* simulated PC corresponding to this block (EIP + CS 
> base) */
>  target_ulong cs_base; /* CS base for this block */
> @@ -215,6 +219,7 @@ struct TranslationBlock {
>  #define CF_IGNORE_ICOUNT 0x4 /* Do not generate icount code */
>  
>  uint16_t invalid;
> +TRACE_QHT_VCPU_DSTATE_TYPE trace_vcpu_dstate;
(snip)
> --- a/include/exec/tb-hash-xx.h
> +++ b/include/exec/tb-hash-xx.h
> @@ -35,6 +35,7 @@
>  #define EXEC_TB_HASH_XX_H
>  
>  #include "qemu/bitops.h"
> +#include "qemu-common.h"
>  
>  #define PRIME32_1   2654435761U
>  #define PRIME32_2   2246822519U
> @@ -49,7 +50,8 @@
>   * contiguous in memory.
>   */
>  static inline
> -uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e)
> +uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e,
> +   TRACE_QHT_VCPU_DSTATE_TYPE f)

I find this typedef unnecessary. Why not use u32 everywhere?
If ever we need more bits, then we'll add additional u32's here
as well.

Also, including above qemu-common.h goes against the spirit of
keeping this file (as well as tb-hash.h) free of external
dependences. (originally tb-hash.h's contents were in exec-all.h)

If we're worried about forgetting to update the hash function,
we could have a compile-time check + a comment elsewhere
(e.g. translate-all.c).

>  {
>  uint32_t v1 = TB_HASH_XX_SEED + PRIME32_1 + PRIME32_2;
>  uint32_t v2 = TB_HASH_XX_SEED + PRIME32_2;
> @@ -83,6 +85,10 @@ uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t 
> e)
Right here you should also do:

@@ -78,7 +78,7 @@ uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e)
 v4 *= PRIME32_1;

 h32 = rol32(v1, 1) + rol32(v2, 7) + rol32(v3, 12) + rol32(v4, 18);
-h32 += 20;
+h32 += 24;

to take into account the newly added parameter.

>  h32 += e * PRIME32_3;
>  h32  = rol32(h32, 17) * PRIME32_4;
>  
> +QEMU_BUILD_BUG_ON(sizeof(TRACE_QHT_VCPU_DSTATE_TYPE) != 
> sizeof(uint32_t));
> +h32 += f * PRIME32_3;
> +h32  = rol32(h32, 17) * PRIME32_4;

If we get rid of the typedef, this compile-time check will go away too.

(snip)
> diff --git a/include/qemu-common.h b/include/qemu-common.h
> index 1430390eb6..73a6fe 100644
> --- a/include/qemu-common.h
> +++ b/include/qemu-common.h
> @@ -151,4 +151,7 @@ void page_size_init(void);
>   * returned. */
>  bool dump_in_progress(void);
>  
> +/* Use a macro to allow safe changes to its size in the future */
> +#define TRACE_QHT_VCPU_DSTATE_TYPE uint32_t
> +
(snip)
> diff --git a/translate-all.c b/translate-all.c
> index 29ccb9e546..6e1b1d474c 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -54,6 +54,7 @@
>  #include "exec/tb-hash.h"
>  #include "translate-all.h"
>  #include "qemu/bitmap.h"
> +#include "qemu/error-report.h"
>  #include "qemu/timer.h"
>  #include "exec/log.h"
>  
> @@ -813,6 +814,12 @@ static void tb_htable_init(void)
>  {
>  unsigned int mode = QHT_MODE_AUTO_RESIZE;
>  
> +/* Ensure TB hash function covers the bitmap size */
> +if (DIV_ROUND_UP(trace_get_vcpu_event_count(), BITS_PER_BYTE) >
> +sizeof(TRACE_QHT_VCPU_DSTATE_TYPE)) {
> +error_report("too many 'vcpu' events for the TB hash function");
> +}
> +

This is a better place to do the above check, I think.

Thanks,

Emilio



[Qemu-devel] [PATCH v7 4/7] exec: [tcg] Use different TBs according to the vCPU's dynamic tracing state

2017-01-13 Thread Lluís Vilanova
Every vCPU now uses a separate set of TBs for each set of dynamic
tracing event state values. Each set of TBs can be used by any number of
vCPUs to maximize TB reuse when vCPUs have the same tracing state.

This feature is later used by tracetool to optimize tracing of guest
code events.

The maximum number of TB sets is defined as 2^E, where E is the number
of events that have the 'vcpu' property (their state is stored in
CPUState->trace_dstate).

For this to work, a change on the dynamic tracing state of a vCPU will
force it to flush its virtual TB cache (which is only indexed by
address), and fall back to the physical TB cache (which now contains the
vCPU's dynamic tracing state as part of the hashing function).

Signed-off-by: Lluís Vilanova 
Reviewed-by: Richard Henderson 
---
 cpu-exec.c|   22 +-
 include/exec/exec-all.h   |5 +
 include/exec/tb-hash-xx.h |8 +++-
 include/exec/tb-hash.h|5 +++--
 include/qemu-common.h |3 +++
 tests/qht-bench.c |2 +-
 trace/control-target.c|1 +
 trace/control.h   |3 +++
 translate-all.c   |   16 ++--
 9 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index 4188fed3c6..36709cba1f 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -261,6 +261,7 @@ struct tb_desc {
 CPUArchState *env;
 tb_page_addr_t phys_page1;
 uint32_t flags;
+TRACE_QHT_VCPU_DSTATE_TYPE trace_vcpu_dstate;
 };
 
 static bool tb_cmp(const void *p, const void *d)
@@ -272,6 +273,7 @@ static bool tb_cmp(const void *p, const void *d)
 tb->page_addr[0] == desc->phys_page1 &&
 tb->cs_base == desc->cs_base &&
 tb->flags == desc->flags &&
+tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
 !atomic_read(>invalid)) {
 /* check next page if needed */
 if (tb->page_addr[1] == -1) {
@@ -293,7 +295,8 @@ static bool tb_cmp(const void *p, const void *d)
 static TranslationBlock *tb_htable_lookup(CPUState *cpu,
   target_ulong pc,
   target_ulong cs_base,
-  uint32_t flags)
+  uint32_t flags,
+  uint32_t trace_vcpu_dstate)
 {
 tb_page_addr_t phys_pc;
 struct tb_desc desc;
@@ -302,10 +305,11 @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu,
 desc.env = (CPUArchState *)cpu->env_ptr;
 desc.cs_base = cs_base;
 desc.flags = flags;
+desc.trace_vcpu_dstate = trace_vcpu_dstate;
 desc.pc = pc;
 phys_pc = get_page_addr_code(desc.env, pc);
 desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
-h = tb_hash_func(phys_pc, pc, flags);
+h = tb_hash_func(phys_pc, pc, flags, trace_vcpu_dstate);
 return qht_lookup(_ctx.tb_ctx.htable, tb_cmp, , h);
 }
 
@@ -317,16 +321,24 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
 TranslationBlock *tb;
 target_ulong cs_base, pc;
 uint32_t flags;
+unsigned long trace_vcpu_dstate_bitmap;
+TRACE_QHT_VCPU_DSTATE_TYPE trace_vcpu_dstate;
 bool have_tb_lock = false;
 
+bitmap_copy(_vcpu_dstate_bitmap, cpu->trace_dstate,
+trace_get_vcpu_event_count());
+memcpy(_vcpu_dstate, _vcpu_dstate_bitmap,
+   sizeof(trace_vcpu_dstate));
+
 /* we record a subset of the CPU state. It will
always be the same before a given translated block
is executed. */
 cpu_get_tb_cpu_state(env, , _base, );
 tb = atomic_rcu_read(>tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
 if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
- tb->flags != flags)) {
-tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+ tb->flags != flags ||
+ tb->trace_vcpu_dstate != trace_vcpu_dstate)) {
+tb = tb_htable_lookup(cpu, pc, cs_base, flags, trace_vcpu_dstate);
 if (!tb) {
 
 /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
@@ -340,7 +352,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
 /* There's a chance that our desired tb has been translated while
  * taking the locks so we check again inside the lock.
  */
-tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+tb = tb_htable_lookup(cpu, pc, cs_base, flags, trace_vcpu_dstate);
 if (!tb) {
 /* if no translated code available, then translate it now */
 tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 57cd978578..ae74f61ea2 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -200,6 +200,10 @@ static inline void tlb_flush_by_mmuidx(CPUState *cpu, ...)
 #define USE_DIRECT_JUMP
 #endif
 
+/**
+ * TranslationBlock:
+ *