Re: [Qemu-devel] [RFC v3 12/13] softmmu_llsc_template.h: move to multithreading
Alvise Rigo a.r...@virtualopensystems.com writes: Update the TCG LL/SC instructions to work in multi-threading. The basic idea remains untouched, but the whole mechanism is improved to make use of the callback support to query TLB flush requests and the rendezvous callback to synchronize all the currently running vCPUs. In essence, if a vCPU wants to LL to a page which is not already set as EXCL, it will arrange a rendezvous with all the vCPUs that are executing a TB and query a TLB flush for *all* the vCPUs. Doing so, we make sure that: - the running vCPUs do not touch the EXCL page while the requesting vCPU is setting the transaction to EXCL of the page - all the vCPUs will have the EXCL flag in the TLB entry for that specific page *before* entering the next TB Suggested-by: Jani Kokkonen jani.kokko...@huawei.com Suggested-by: Claudio Fontana claudio.font...@huawei.com Signed-off-by: Alvise Rigo a.r...@virtualopensystems.com --- cputlb.c| 2 + include/exec/cpu-defs.h | 4 ++ softmmu_llsc_template.h | 97 - 3 files changed, 69 insertions(+), 34 deletions(-) diff --git a/cputlb.c b/cputlb.c index 66df41a..0566e0f 100644 --- a/cputlb.c +++ b/cputlb.c @@ -30,6 +30,8 @@ #include exec/ram_addr.h #include tcg/tcg.h +#include sysemu/cpus.h + void qemu_mutex_lock_iothread(void); void qemu_mutex_unlock_iothread(void); diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index c73a75f..40742b3 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -169,5 +169,9 @@ typedef struct CPUIOTLBEntry { /* Used for atomic instruction translation. */ \ bool ll_sc_context; \ hwaddr excl_protected_hwaddr; \ +/* Used to carry the stcond result and also as a flag to flag a + * normal store access made by a stcond. 
*/ \ +int excl_succeeded; \ + #endif diff --git a/softmmu_llsc_template.h b/softmmu_llsc_template.h index 81e9d8e..4105e72 100644 --- a/softmmu_llsc_template.h +++ b/softmmu_llsc_template.h @@ -54,7 +54,21 @@ & (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \ }) \ -#define EXCLUSIVE_RESET_ADDR ULLONG_MAX +#define is_read_tlb_entry_set(env, page, index) \ +({ \ +(addr & TARGET_PAGE_MASK) \ + == ((env->tlb_table[mmu_idx][index].addr_read) \ + & (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \ +}) \ + +/* Whenever a SC operation fails, we add a small delay to reduce the + * concurrency among the atomic instruction emulation code. Without this delay, + * in very congested situation where plain stores make all the pending LLs + * fail, the code could reach a stalling situation in which all the SCs happen + * to fail. + * TODO: make the delay dynamic according with the SC failing rate. + * */ +#define TCG_ATOMIC_INSN_EMUL_DELAY 100 I'd be tempted to split out this sort of chicanery into a separate patch. WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) @@ -65,35 +79,58 @@ WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr, hwaddr hw_addr; unsigned mmu_idx = get_mmuidx(oi); -/* Use the proper load helper from cpu_ldst.h */ -ret = helper_ld_legacy(env, addr, mmu_idx, retaddr); - -/* The last legacy access ensures that the TLB and IOTLB entry for 'addr' - * have been created. */ index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); +if (!is_read_tlb_entry_set(env, addr, index)) { +tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr); +} /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat) * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */ hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr; /* Set the exclusive-protected hwaddr. 
*/ -env->excl_protected_hwaddr = hw_addr; -env->ll_sc_context = true; +qemu_mutex_lock(&tcg_excl_access_lock); +if (cpu_physical_memory_excl_is_dirty(hw_addr) && !exit_flush_request) { +exit_flush_request = 1; -/* No need to mask hw_addr with TARGET_PAGE_MASK since - * cpu_physical_memory_excl_is_dirty() will take care of that. */ -if (cpu_physical_memory_excl_is_dirty(hw_addr)) { -
Re: [Qemu-devel] [RFC v3 12/13] softmmu_llsc_template.h: move to multithreading
On Fri, Jul 17, 2015 at 5:27 PM, Alex Bennée alex.ben...@linaro.org wrote: Alvise Rigo a.r...@virtualopensystems.com writes: Update the TCG LL/SC instructions to work in multi-threading. The basic idea remains untouched, but the whole mechanism is improved to make use of the callback support to query TLB flush requests and the rendezvous callback to synchronize all the currently running vCPUs. In essence, if a vCPU wants to LL to a page which is not already set as EXCL, it will arrange a rendezvous with all the vCPUs that are executing a TB and query a TLB flush for *all* the vCPUs. Doing so, we make sure that: - the running vCPUs do not touch the EXCL page while the requesting vCPU is setting the transaction to EXCL of the page - all the vCPUs will have the EXCL flag in the TLB entry for that specific page *before* entering the next TB Suggested-by: Jani Kokkonen jani.kokko...@huawei.com Suggested-by: Claudio Fontana claudio.font...@huawei.com Signed-off-by: Alvise Rigo a.r...@virtualopensystems.com --- cputlb.c| 2 + include/exec/cpu-defs.h | 4 ++ softmmu_llsc_template.h | 97 - 3 files changed, 69 insertions(+), 34 deletions(-) diff --git a/cputlb.c b/cputlb.c index 66df41a..0566e0f 100644 --- a/cputlb.c +++ b/cputlb.c @@ -30,6 +30,8 @@ #include exec/ram_addr.h #include tcg/tcg.h +#include sysemu/cpus.h + void qemu_mutex_lock_iothread(void); void qemu_mutex_unlock_iothread(void); diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index c73a75f..40742b3 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -169,5 +169,9 @@ typedef struct CPUIOTLBEntry { /* Used for atomic instruction translation. */ \ bool ll_sc_context; \ hwaddr excl_protected_hwaddr; \ +/* Used to carry the stcond result and also as a flag to flag a + * normal store access made by a stcond. 
*/ \ +int excl_succeeded; \ + #endif diff --git a/softmmu_llsc_template.h b/softmmu_llsc_template.h index 81e9d8e..4105e72 100644 --- a/softmmu_llsc_template.h +++ b/softmmu_llsc_template.h @@ -54,7 +54,21 @@ & (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \ }) \ -#define EXCLUSIVE_RESET_ADDR ULLONG_MAX +#define is_read_tlb_entry_set(env, page, index) \ +({ \ +(addr & TARGET_PAGE_MASK) \ + == ((env->tlb_table[mmu_idx][index].addr_read) \ + & (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \ +}) \ + +/* Whenever a SC operation fails, we add a small delay to reduce the + * concurrency among the atomic instruction emulation code. Without this delay, + * in very congested situation where plain stores make all the pending LLs + * fail, the code could reach a stalling situation in which all the SCs happen + * to fail. + * TODO: make the delay dynamic according with the SC failing rate. + * */ +#define TCG_ATOMIC_INSN_EMUL_DELAY 100 I'd be tempted to split out this sort of chicanery into a separate patch. OK, I think it's a good idea since it's not strictly required. Regards, alvise WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) @@ -65,35 +79,58 @@ WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr, hwaddr hw_addr; unsigned mmu_idx = get_mmuidx(oi); -/* Use the proper load helper from cpu_ldst.h */ -ret = helper_ld_legacy(env, addr, mmu_idx, retaddr); - -/* The last legacy access ensures that the TLB and IOTLB entry for 'addr' - * have been created. */ index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); +if (!is_read_tlb_entry_set(env, addr, index)) { +tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr); +} /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat) * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */ hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr; /* Set the exclusive-protected hwaddr. 
*/ -env->excl_protected_hwaddr = hw_addr; -env->ll_sc_context = true; +qemu_mutex_lock(&tcg_excl_access_lock); +if (cpu_physical_memory_excl_is_dirty(hw_addr) && !exit_flush_request) { +exit_flush_request = 1; -/* No need to mask hw_addr with TARGET_PAGE_MASK since - *
[Qemu-devel] [RFC v3 12/13] softmmu_llsc_template.h: move to multithreading
Update the TCG LL/SC instructions to work in multi-threading. The basic idea remains untouched, but the whole mechanism is improved to make use of the callback support to query TLB flush requests and the rendezvous callback to synchronize all the currently running vCPUs. In essence, if a vCPU wants to LL to a page which is not already set as EXCL, it will arrange a rendezvous with all the vCPUs that are executing a TB and query a TLB flush for *all* the vCPUs. Doing so, we make sure that: - the running vCPUs do not touch the EXCL page while the requesting vCPU is setting the transaction to EXCL of the page - all the vCPUs will have the EXCL flag in the TLB entry for that specific page *before* entering the next TB Suggested-by: Jani Kokkonen jani.kokko...@huawei.com Suggested-by: Claudio Fontana claudio.font...@huawei.com Signed-off-by: Alvise Rigo a.r...@virtualopensystems.com --- cputlb.c| 2 + include/exec/cpu-defs.h | 4 ++ softmmu_llsc_template.h | 97 - 3 files changed, 69 insertions(+), 34 deletions(-) diff --git a/cputlb.c b/cputlb.c index 66df41a..0566e0f 100644 --- a/cputlb.c +++ b/cputlb.c @@ -30,6 +30,8 @@ #include exec/ram_addr.h #include tcg/tcg.h +#include sysemu/cpus.h + void qemu_mutex_lock_iothread(void); void qemu_mutex_unlock_iothread(void); diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index c73a75f..40742b3 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -169,5 +169,9 @@ typedef struct CPUIOTLBEntry { /* Used for atomic instruction translation. */ \ bool ll_sc_context; \ hwaddr excl_protected_hwaddr; \ +/* Used to carry the stcond result and also as a flag to flag a + * normal store access made by a stcond. 
*/ \ +int excl_succeeded; \ + #endif diff --git a/softmmu_llsc_template.h b/softmmu_llsc_template.h index 81e9d8e..4105e72 100644 --- a/softmmu_llsc_template.h +++ b/softmmu_llsc_template.h @@ -54,7 +54,21 @@ & (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \ }) \ -#define EXCLUSIVE_RESET_ADDR ULLONG_MAX +#define is_read_tlb_entry_set(env, page, index) \ +({ \ +(addr & TARGET_PAGE_MASK)\ + == ((env->tlb_table[mmu_idx][index].addr_read) \ + & (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \ +}) \ + +/* Whenever a SC operation fails, we add a small delay to reduce the + * concurrency among the atomic instruction emulation code. Without this delay, + * in very congested situation where plain stores make all the pending LLs + * fail, the code could reach a stalling situation in which all the SCs happen + * to fail. + * TODO: make the delay dynamic according with the SC failing rate. + * */ +#define TCG_ATOMIC_INSN_EMUL_DELAY 100 WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) @@ -65,35 +79,58 @@ WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr, hwaddr hw_addr; unsigned mmu_idx = get_mmuidx(oi); -/* Use the proper load helper from cpu_ldst.h */ -ret = helper_ld_legacy(env, addr, mmu_idx, retaddr); - -/* The last legacy access ensures that the TLB and IOTLB entry for 'addr' - * have been created. */ index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); +if (!is_read_tlb_entry_set(env, addr, index)) { +tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr); +} /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat) * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */ hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr; /* Set the exclusive-protected hwaddr. 
*/ -env->excl_protected_hwaddr = hw_addr; -env->ll_sc_context = true; +qemu_mutex_lock(&tcg_excl_access_lock); +if (cpu_physical_memory_excl_is_dirty(hw_addr) && !exit_flush_request) { +exit_flush_request = 1; -/* No need to mask hw_addr with TARGET_PAGE_MASK since - * cpu_physical_memory_excl_is_dirty() will take care of that. */ -if (cpu_physical_memory_excl_is_dirty(hw_addr)) { -cpu_physical_memory_clear_excl_dirty(hw_addr); +qemu_mutex_unlock(&tcg_excl_access_lock); + +cpu_exit_init_rendezvous(); -/* Invalidate the TLB entry for the other processors. The next TLB - * entries for this page will have the TLB_EXCL flag set. */