On Fri, Jul 17, 2015 at 5:27 PM, Alex Bennée <alex.ben...@linaro.org> wrote:
>
> Alvise Rigo <a.r...@virtualopensystems.com> writes:
>
>> Update the TCG LL/SC instructions to work in multi-threading.
>>
>> The basic idea remains untouched, but the whole mechanism is improved to
>> make use of the callback support to query TLB flush requests and the
>> rendezvous callback to synchronize all the currently running vCPUs.
>>
>> In essence, if a vCPU wants to LL to a page which is not already set as
>> EXCL, it will arrange a rendezvous with all the vCPUs that are executing
>> a TB and query a TLB flush for *all* the vCPUs.
>> Doing so, we make sure that:
>> - the running vCPUs do not touch the EXCL page while the requesting vCPU
>>   is setting the transaction to EXCL of the page
>> - all the vCPUs will have the EXCL flag in the TLB entry for that
>>   specific page *before* entering the next TB
>>
>> Suggested-by: Jani Kokkonen <jani.kokko...@huawei.com>
>> Suggested-by: Claudio Fontana <claudio.font...@huawei.com>
>> Signed-off-by: Alvise Rigo <a.r...@virtualopensystems.com>
>> ---
>>  cputlb.c                |  2 +
>>  include/exec/cpu-defs.h |  4 ++
>>  softmmu_llsc_template.h | 97 ++++++++++++++++++++++++++++++++-----------------
>>  3 files changed, 69 insertions(+), 34 deletions(-)
>>
>> diff --git a/cputlb.c b/cputlb.c
>> index 66df41a..0566e0f 100644
>> --- a/cputlb.c
>> +++ b/cputlb.c
>> @@ -30,6 +30,8 @@
>>  #include "exec/ram_addr.h"
>>  #include "tcg/tcg.h"
>>
>> +#include "sysemu/cpus.h"
>> +
>>  void qemu_mutex_lock_iothread(void);
>>  void qemu_mutex_unlock_iothread(void);
>>
>> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
>> index c73a75f..40742b3 100644
>> --- a/include/exec/cpu-defs.h
>> +++ b/include/exec/cpu-defs.h
>> @@ -169,5 +169,9 @@ typedef struct CPUIOTLBEntry {
>>      /* Used for atomic instruction translation. */ \
>>      bool ll_sc_context; \
>>      hwaddr excl_protected_hwaddr; \
>> +    /* Used to carry the stcond result and also as a flag to flag a
>> +     * normal store access made by a stcond. */ \
>> +    int excl_succeeded; \
>> +
>>
>>  #endif
>> diff --git a/softmmu_llsc_template.h b/softmmu_llsc_template.h
>> index 81e9d8e..4105e72 100644
>> --- a/softmmu_llsc_template.h
>> +++ b/softmmu_llsc_template.h
>> @@ -54,7 +54,21 @@
>>      (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \
>>  }) \
>>
>> -#define EXCLUSIVE_RESET_ADDR ULLONG_MAX
>> +#define is_read_tlb_entry_set(env, page, index) \
>> +({ \
>> +    (addr & TARGET_PAGE_MASK) \
>> +      == ((env->tlb_table[mmu_idx][index].addr_read) & \
>> +            (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \
>> +}) \
>> +
>> +/* Whenever a SC operation fails, we add a small delay to reduce the
>> + * concurrency among the atomic instruction emulation code. Without this delay,
>> + * in very congested situation where plain stores make all the pending LLs
>> + * fail, the code could reach a stalling situation in which all the SCs happen
>> + * to fail.
>> + * TODO: make the delay dynamic according with the SC failing rate.
>> + * */
>> +#define TCG_ATOMIC_INSN_EMUL_DELAY 100
>
> I'd be tempted to split out this sort of chicanery into a separate patch.
OK, I think it's a good idea since it's not strictly required.

Regards,
alvise

>
>>  WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
>>                                  TCGMemOpIdx oi, uintptr_t retaddr)
>> @@ -65,35 +79,58 @@ WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
>>      hwaddr hw_addr;
>>      unsigned mmu_idx = get_mmuidx(oi);
>>
>> -    /* Use the proper load helper from cpu_ldst.h */
>> -    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
>> -
>> -    /* The last legacy access ensures that the TLB and IOTLB entry for 'addr'
>> -     * have been created. */
>>      index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
>> +    if (!is_read_tlb_entry_set(env, addr, index)) {
>> +        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
>> +    }
>>
>>      /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
>>       * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
>>      hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
>>
>>      /* Set the exclusive-protected hwaddr. */
>> -    env->excl_protected_hwaddr = hw_addr;
>> -    env->ll_sc_context = true;
>> +    qemu_mutex_lock(&tcg_excl_access_lock);
>> +    if (cpu_physical_memory_excl_is_dirty(hw_addr) && !exit_flush_request) {
>> +        exit_flush_request = 1;
>>
>> -    /* No need to mask hw_addr with TARGET_PAGE_MASK since
>> -     * cpu_physical_memory_excl_is_dirty() will take care of that. */
>> -    if (cpu_physical_memory_excl_is_dirty(hw_addr)) {
>> -        cpu_physical_memory_clear_excl_dirty(hw_addr);
>> +        qemu_mutex_unlock(&tcg_excl_access_lock);
>> +
>> +        cpu_exit_init_rendezvous();
>>
>> -        /* Invalidate the TLB entry for the other processors. The next TLB
>> -         * entries for this page will have the TLB_EXCL flag set. */
>>          CPU_FOREACH(cpu) {
>> -            if (cpu != current_cpu) {
>> -                tlb_flush(cpu, 1);
>> +            if ((cpu->thread_id != qemu_get_thread_id())) {
>> +                if (!cpu->pending_tlb_flush) {
>> +                    /* Flush the TLB cache before executing the next TB. */
>> +                    cpu->pending_tlb_flush = 1;
>> +                    cpu_exit_callback_add(cpu, cpu_exit_tlb_flush_all_cb, NULL);
>> +                }
>> +                if (cpu->tcg_executing) {
>> +                    /* We want to wait all the vCPUs that are running in this
>> +                     * exact moment.
>> +                     * Add a callback to be executed as soon as the vCPU exits
>> +                     * from the current TB. Force it to exit. */
>> +                    cpu_exit_rendezvous_add_attendee(cpu);
>> +                    qemu_cpu_kick_thread(cpu);
>> +                }
>>              }
>>          }
>> +
>> +        cpu_exit_rendezvous_wait_others(ENV_GET_CPU(env));
>> +
>> +        exit_flush_request = 0;
>> +
>> +        qemu_mutex_lock(&tcg_excl_access_lock);
>> +        cpu_physical_memory_clear_excl_dirty(hw_addr);
>>      }
>>
>> +    env->ll_sc_context = true;
>> +
>> +    /* Use the proper load helper from cpu_ldst.h */
>> +    env->excl_protected_hwaddr = hw_addr;
>> +    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
>> +
>> +    qemu_mutex_unlock(&tcg_excl_access_lock);
>> +
>>      /* For this vCPU, just update the TLB entry, no need to flush. */
>>      env->tlb_table[mmu_idx][index].addr_write |= TLB_EXCL;
>>
>> @@ -106,7 +143,6 @@ WORD_TYPE helper_le_stcond_name(CPUArchState *env, target_ulong addr,
>>  {
>>      WORD_TYPE ret;
>>      int index;
>> -    hwaddr hw_addr;
>>      unsigned mmu_idx = get_mmuidx(oi);
>>
>>      /* If the TLB entry is not the right one, create it. */
>> @@ -115,29 +151,22 @@ WORD_TYPE helper_le_stcond_name(CPUArchState *env, target_ulong addr,
>>          tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
>>      }
>>
>> -    /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
>> -     * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
>> -    hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
>> -
>> -    if (!env->ll_sc_context) {
>> -        /* No LoakLink has been set, the StoreCond has to fail. */
>> -        return 1;
>> -    }
>> -
>>      env->ll_sc_context = 0;
>>
>> -    if (cpu_physical_memory_excl_is_dirty(hw_addr)) {
>> -        /* Another vCPU has accessed the memory after the LoadLink. */
>> -        ret = 1;
>> -    } else {
>> -        helper_st_legacy(env, addr, val, mmu_idx, retaddr);
>> +    /* We set it preventively to true to distinguish the following legacy
>> +     * access as one made by the store conditional wrapper. If the store
>> +     * conditional does not succeed, the value will be set to 0.*/
>> +    env->excl_succeeded = 1;
>> +    helper_st_legacy(env, addr, val, mmu_idx, retaddr);
>>
>> -        /* The StoreConditional succeeded */
>> +    if (env->excl_succeeded) {
>> +        env->excl_succeeded = 0;
>>          ret = 0;
>> +    } else {
>> +        g_usleep(TCG_ATOMIC_INSN_EMUL_DELAY);
>> +        ret = 1;
>>      }
>>
>> -    env->tlb_table[mmu_idx][index].addr_write &= ~TLB_EXCL;
>> -    env->excl_protected_hwaddr = EXCLUSIVE_RESET_ADDR;
>>      /* It's likely that the page will be used again for exclusive accesses,
>>       * for this reason we don't flush any TLB cache at the price of some
>>       * additional slow paths and we don't set the page bit as dirty.
>
> --
> Alex Bennée
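
For readers following the load-link slow path above: it boils down to a barrier-style rendezvous. The initiating vCPU registers every other vCPU that is currently inside a TB, kicks them out of the execution loop, and only proceeds (clearing the EXCL dirty bit and completing the load) once all of them have exited and are therefore guaranteed to refill their TLBs, and see TLB_EXCL, before running another TB. Below is a minimal, self-contained pthread sketch of that waiting pattern, not the series' implementation: every name in it (Rendezvous, rendezvous_begin, rendezvous_add_attendee, ...) is invented for illustration, whereas the patch itself relies on cpu_exit_init_rendezvous(), cpu_exit_rendezvous_add_attendee(), cpu_exit_rendezvous_wait_others() and the per-CPU exit callbacks.

#include <pthread.h>

/* One rendezvous shared by all vCPU threads (illustrative only). */
typedef struct Rendezvous {
    pthread_mutex_t lock;
    pthread_cond_t done;
    int attendees;              /* vCPUs we still have to wait for */
} Rendezvous;

static Rendezvous rv = {
    .lock = PTHREAD_MUTEX_INITIALIZER,
    .done = PTHREAD_COND_INITIALIZER,
};

/* Initiator: start a new rendezvous with no attendees registered yet. */
static void rendezvous_begin(Rendezvous *r)
{
    pthread_mutex_lock(&r->lock);
    r->attendees = 0;
    pthread_mutex_unlock(&r->lock);
}

/* Initiator: register one vCPU that is currently executing a TB,
 * called just before kicking that vCPU out of the execution loop. */
static void rendezvous_add_attendee(Rendezvous *r)
{
    pthread_mutex_lock(&r->lock);
    r->attendees++;
    pthread_mutex_unlock(&r->lock);
}

/* Attendee: signal that this vCPU has left its current TB. */
static void rendezvous_attendee_left(Rendezvous *r)
{
    pthread_mutex_lock(&r->lock);
    if (--r->attendees == 0) {
        pthread_cond_broadcast(&r->done);
    }
    pthread_mutex_unlock(&r->lock);
}

/* Initiator: block until every registered attendee has left its TB, i.e.
 * until every running vCPU will go through its exit callback (and flush
 * its TLB) before executing another TB. */
static void rendezvous_wait_others(Rendezvous *r)
{
    pthread_mutex_lock(&r->lock);
    while (r->attendees > 0) {
        pthread_cond_wait(&r->done, &r->lock);
    }
    pthread_mutex_unlock(&r->lock);
}

The exit_flush_request flag in the patch plays the role of allowing only one such rendezvous at a time, and the TCG_ATOMIC_INSN_EMUL_DELAY back-off on the store-conditional side (the g_usleep() call when excl_succeeded is cleared) exists precisely because this scheme can make competing vCPUs keep invalidating each other's LL/SC pairs; a short sleep before retrying breaks that livelock, as the comment added by the patch explains.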