Re: [Qemu-devel] [RFC v3 12/13] softmmu_llsc_template.h: move to multithreading

2015-07-17 Thread Alex Bennée

Alvise Rigo <a.r...@virtualopensystems.com> writes:

 Update the TCG LL/SC instructions to work in multi-threaded mode.

 The basic idea remains untouched, but the whole mechanism is improved to
 make use of the callback support to query TLB flush requests and the
 rendezvous callback to synchronize all the currently running vCPUs.

 In essence, if a vCPU wants to LL to a page which is not already set as
 EXCL, it will arrange a rendezvous with all the vCPUs that are executing
 a TB and request a TLB flush for *all* the vCPUs.
 Doing so, we make sure that:
 - the running vCPUs do not touch the EXCL page while the requesting vCPU
   is marking the page as EXCL for the transaction
 - all the vCPUs will have the EXCL flag in the TLB entry for that
   specific page *before* entering the next TB

 Suggested-by: Jani Kokkonen <jani.kokko...@huawei.com>
 Suggested-by: Claudio Fontana <claudio.font...@huawei.com>
 Signed-off-by: Alvise Rigo <a.r...@virtualopensystems.com>
 ---
  cputlb.c                |  2 +
  include/exec/cpu-defs.h |  4 ++
  softmmu_llsc_template.h | 97 -
  3 files changed, 69 insertions(+), 34 deletions(-)

 diff --git a/cputlb.c b/cputlb.c
 index 66df41a..0566e0f 100644
 --- a/cputlb.c
 +++ b/cputlb.c
 @@ -30,6 +30,8 @@
  #include "exec/ram_addr.h"
  #include "tcg/tcg.h"
  
 +#include "sysemu/cpus.h"
 +
  void qemu_mutex_lock_iothread(void);
  void qemu_mutex_unlock_iothread(void);
  
 diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
 index c73a75f..40742b3 100644
 --- a/include/exec/cpu-defs.h
 +++ b/include/exec/cpu-defs.h
 @@ -169,5 +169,9 @@ typedef struct CPUIOTLBEntry {
  /* Used for atomic instruction translation. */  \
  bool ll_sc_context; \
  hwaddr excl_protected_hwaddr;   \
 +/* Used to carry the stcond result and also as a flag to mark a
 + * normal store access made by a stcond. */ \
 +int excl_succeeded; \
 +
  
  #endif
 diff --git a/softmmu_llsc_template.h b/softmmu_llsc_template.h
 index 81e9d8e..4105e72 100644
 --- a/softmmu_llsc_template.h
 +++ b/softmmu_llsc_template.h
 @@ -54,7 +54,21 @@
   (TARGET_PAGE_MASK | TLB_INVALID_MASK));                                  \
  })                                                                         \
  
 -#define EXCLUSIVE_RESET_ADDR ULLONG_MAX
 +#define is_read_tlb_entry_set(env, page, index)                            \
 +({                                                                         \
 +    (addr & TARGET_PAGE_MASK)                                              \
 +     == ((env->tlb_table[mmu_idx][index].addr_read) &                      \
 +     (TARGET_PAGE_MASK | TLB_INVALID_MASK));                               \
 +})                                                                         \
 +
 +/* Whenever an SC operation fails, we add a small delay to reduce the
 + * concurrency among the atomic instruction emulation code. Without this delay,
 + * in a very congested situation where plain stores make all the pending LLs
 + * fail, the code could reach a stall in which all the SCs happen to fail.
 + * TODO: make the delay dynamic according to the SC failure rate.
 + */
 +#define TCG_ATOMIC_INSN_EMUL_DELAY 100

I'd be tempted to split out this sort of chicanery into a separate patch. 

  
  WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
                                  TCGMemOpIdx oi, uintptr_t retaddr)
 @@ -65,35 +79,58 @@ WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
      hwaddr hw_addr;
      unsigned mmu_idx = get_mmuidx(oi);
  
 -    /* Use the proper load helper from cpu_ldst.h */
 -    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
 -
 -    /* The last legacy access ensures that the TLB and IOTLB entry for 'addr'
 -     * have been created. */
      index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
 +    if (!is_read_tlb_entry_set(env, addr, index)) {
 +        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
 +    }
  
      /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
       * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
      hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
  
      /* Set the exclusive-protected hwaddr. */
 -    env->excl_protected_hwaddr = hw_addr;
 -    env->ll_sc_context = true;
 +    qemu_mutex_lock(&tcg_excl_access_lock);
 +    if (cpu_physical_memory_excl_is_dirty(hw_addr) && !exit_flush_request) {
 +        exit_flush_request = 1;
  
 -    /* No need to mask hw_addr with TARGET_PAGE_MASK since
 -     * cpu_physical_memory_excl_is_dirty() will take care of that. */
 -    if (cpu_physical_memory_excl_is_dirty(hw_addr)) {
 -
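
For readers following the thread, the flow the commit message describes can be
summed up in a few lines of C. This is only a self-contained model with local
stand-in helpers (rendezvous_all_vcpus, flush_tlb_all_vcpus, ll_protect_page),
not the series' actual API:

/* Self-contained model of the LL-side handshake described above; the helpers
 * below are local stand-ins, not QEMU's real API. */
#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

typedef uint64_t hwaddr;

static bool page_excl_dirty = true;     /* page not yet tracked as EXCL */

static void rendezvous_all_vcpus(void)  /* stand-in: park the running vCPUs */
{
    puts("rendezvous: every running vCPU stops at the end of its TB");
}

static void flush_tlb_all_vcpus(void)   /* stand-in: global TLB flush request */
{
    puts("flush: each vCPU refills its TLB entry for the page with TLB_EXCL");
}

/* LL to a page: if it is not EXCL yet, synchronize all vCPUs and flush their
 * TLBs so none of them can store to the page without taking the slow path. */
static void ll_protect_page(hwaddr hw_addr)
{
    if (page_excl_dirty) {
        rendezvous_all_vcpus();
        flush_tlb_all_vcpus();
        page_excl_dirty = false;        /* page is now EXCL-protected */
    }
    printf("page 0x%" PRIx64 " is EXCL, the loaded value can be returned\n",
           hw_addr);
}

int main(void)
{
    ll_protect_page(0x1000);
    return 0;
}

The ordering is the whole point: the requesting vCPU only marks the page once
the other vCPUs are stopped, and every vCPU sees TLB_EXCL for the page before
it enters its next TB.
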

Re: [Qemu-devel] [RFC v3 12/13] softmmu_llsc_template.h: move to multithreading

2015-07-17 Thread alvise rigo
On Fri, Jul 17, 2015 at 5:27 PM, Alex Bennée <alex.ben...@linaro.org> wrote:

 Alvise Rigo <a.r...@virtualopensystems.com> writes:

 [...]

 +/* Whenever an SC operation fails, we add a small delay to reduce the
 + * concurrency among the atomic instruction emulation code. Without this delay,
 + * in a very congested situation where plain stores make all the pending LLs
 + * fail, the code could reach a stall in which all the SCs happen to fail.
 + * TODO: make the delay dynamic according to the SC failure rate.
 + */
 +#define TCG_ATOMIC_INSN_EMUL_DELAY 100

 I'd be tempted to split out this sort of chicanery into a separate patch.

OK, I think it's a good idea since it's not strictly required.

Regards,
alvise
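
The hunks quoted in this thread only introduce the TCG_ATOMIC_INSN_EMUL_DELAY
constant; how it is consumed is not visible here. As a rough, self-contained
illustration of the kind of back-off the comment describes (the microsecond
unit and the nanosleep() call are assumptions, not the patch's code):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define TCG_ATOMIC_INSN_EMUL_DELAY 100   /* from the patch; unit assumed to be us */

static bool store_conditional(void)      /* stand-in SC that always fails */
{
    return false;
}

int main(void)
{
    if (!store_conditional()) {
        /* An SC just failed: wait a little before letting the guest retry,
         * so competing vCPUs can complete their own LL/SC pairs instead of
         * every SC failing forever. */
        struct timespec ts = { 0, TCG_ATOMIC_INSN_EMUL_DELAY * 1000L };
        nanosleep(&ts, NULL);
        puts("SC failed, delayed before retry");
    }
    return 0;
}
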




[Qemu-devel] [RFC v3 12/13] softmmu_llsc_template.h: move to multithreading

2015-07-10 Thread Alvise Rigo
Update the TCG LL/SC instructions to work in multi-threaded mode.

The basic idea remains untouched, but the whole mechanism is improved to
make use of the callback support to query TLB flush requests and the
rendezvous callback to synchronize all the currently running vCPUs.

In essence, if a vCPU wants to LL to a page which is not already set as
EXCL, it will arrange a rendezvous with all the vCPUs that are executing
a TB and request a TLB flush for *all* the vCPUs.
Doing so, we make sure that:
- the running vCPUs do not touch the EXCL page while the requesting vCPU
  is marking the page as EXCL for the transaction
- all the vCPUs will have the EXCL flag in the TLB entry for that
  specific page *before* entering the next TB

Suggested-by: Jani Kokkonen <jani.kokko...@huawei.com>
Suggested-by: Claudio Fontana <claudio.font...@huawei.com>
Signed-off-by: Alvise Rigo <a.r...@virtualopensystems.com>
---
 cputlb.c                |  2 +
 include/exec/cpu-defs.h |  4 ++
 softmmu_llsc_template.h | 97 -
 3 files changed, 69 insertions(+), 34 deletions(-)

diff --git a/cputlb.c b/cputlb.c
index 66df41a..0566e0f 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -30,6 +30,8 @@
 #include "exec/ram_addr.h"
 #include "tcg/tcg.h"
 
+#include "sysemu/cpus.h"
+
 void qemu_mutex_lock_iothread(void);
 void qemu_mutex_unlock_iothread(void);
 
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index c73a75f..40742b3 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -169,5 +169,9 @@ typedef struct CPUIOTLBEntry {
 /* Used for atomic instruction translation. */  \
 bool ll_sc_context; \
 hwaddr excl_protected_hwaddr;   \
+/* Used to carry the stcond result and also as a flag to mark a
+ * normal store access made by a stcond. */ \
+int excl_succeeded; \
+
 
 #endif
diff --git a/softmmu_llsc_template.h b/softmmu_llsc_template.h
index 81e9d8e..4105e72 100644
--- a/softmmu_llsc_template.h
+++ b/softmmu_llsc_template.h
@@ -54,7 +54,21 @@
  (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \
 })   \
 
-#define EXCLUSIVE_RESET_ADDR ULLONG_MAX
+#define is_read_tlb_entry_set(env, page, index)                              \
+({                                                                            \
+    (addr & TARGET_PAGE_MASK)                                                 \
+     == ((env->tlb_table[mmu_idx][index].addr_read) &                         \
+     (TARGET_PAGE_MASK | TLB_INVALID_MASK));                                  \
+})                                                                            \
+
+/* Whenever an SC operation fails, we add a small delay to reduce the
+ * concurrency among the atomic instruction emulation code. Without this delay,
+ * in a very congested situation where plain stores make all the pending LLs
+ * fail, the code could reach a stall in which all the SCs happen to fail.
+ * TODO: make the delay dynamic according to the SC failure rate.
+ */
+#define TCG_ATOMIC_INSN_EMUL_DELAY 100
 
 WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
                                 TCGMemOpIdx oi, uintptr_t retaddr)
@@ -65,35 +79,58 @@ WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
     hwaddr hw_addr;
     unsigned mmu_idx = get_mmuidx(oi);
 
-    /* Use the proper load helper from cpu_ldst.h */
-    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
-
-    /* The last legacy access ensures that the TLB and IOTLB entry for 'addr'
-     * have been created. */
     index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    if (!is_read_tlb_entry_set(env, addr, index)) {
+        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
+    }
 
     /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
      * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
     hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
 
     /* Set the exclusive-protected hwaddr. */
-    env->excl_protected_hwaddr = hw_addr;
-    env->ll_sc_context = true;
+    qemu_mutex_lock(&tcg_excl_access_lock);
+    if (cpu_physical_memory_excl_is_dirty(hw_addr) && !exit_flush_request) {
+        exit_flush_request = 1;
 
-    /* No need to mask hw_addr with TARGET_PAGE_MASK since
-     * cpu_physical_memory_excl_is_dirty() will take care of that. */
-    if (cpu_physical_memory_excl_is_dirty(hw_addr)) {
-        cpu_physical_memory_clear_excl_dirty(hw_addr);
+        qemu_mutex_unlock(&tcg_excl_access_lock);
+
+        cpu_exit_init_rendezvous();
 
-        /* Invalidate the TLB entry for the other processors. The next TLB
-         * entries for this page will have the TLB_EXCL flag set. */
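
The rewritten ldlink prologue above no longer goes through a legacy load
helper to populate the TLB entry; it probes the read entry directly and calls
tlb_fill() only on a miss. A stripped-down, self-contained model of that check
(the constants and structures are local stand-ins, not QEMU's):

/* Minimal model of the "probe the TLB read entry, fill only on miss" step. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TARGET_PAGE_BITS 12
#define TARGET_PAGE_MASK (~((1u << TARGET_PAGE_BITS) - 1))
#define TLB_INVALID_MASK (1u << 3)
#define CPU_TLB_SIZE     256

typedef struct { uint32_t addr_read; } TLBEntry;

static TLBEntry tlb[CPU_TLB_SIZE];

static bool is_read_tlb_entry_set(uint32_t addr, unsigned index)
{
    /* Same shape as the macro in the patch: the entry matches if its page
     * address equals addr's page and the invalid bit is clear. */
    return (addr & TARGET_PAGE_MASK) ==
           (tlb[index].addr_read & (TARGET_PAGE_MASK | TLB_INVALID_MASK));
}

static void tlb_fill(uint32_t addr, unsigned index)   /* stand-in refill */
{
    tlb[index].addr_read = addr & TARGET_PAGE_MASK;
}

int main(void)
{
    uint32_t addr = 0x1234;
    unsigned index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);

    if (!is_read_tlb_entry_set(addr, index)) {
        tlb_fill(addr, index);                         /* miss: refill entry */
    }
    printf("entry valid: %d\n", is_read_tlb_entry_set(addr, index));
    return 0;
}
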