On Fri, Jul 17, 2015 at 5:27 PM, Alex Bennée <alex.ben...@linaro.org> wrote:
>
> Alvise Rigo <a.r...@virtualopensystems.com> writes:
>
>> Update the TCG LL/SC instructions to work in multi-threading.
>>
>> The basic idea remains untouched, but the whole mechanism is improved to
>> make use of the callback support to query TLB flush requests and the
>> rendezvous callback to synchronize all the currently running vCPUs.
>>
>> In essence, if a vCPU wants to LL to a page which is not already set as
>> EXCL, it will arrange a rendezvous with all the vCPUs that are executing
>> a TB and query a TLB flush for *all* the vCPUs.
>> Doing so, we make sure that:
>> - the running vCPUs do not touch the EXCL page while the requesting vCPU
>>   is setting the transaction to EXCL of the page
>> - all the vCPUs will have the EXCL flag in the TLB entry for that
>>   specific page *before* entering the next TB
>>
>> Suggested-by: Jani Kokkonen <jani.kokko...@huawei.com>
>> Suggested-by: Claudio Fontana <claudio.font...@huawei.com>
>> Signed-off-by: Alvise Rigo <a.r...@virtualopensystems.com>
>> ---
>>  cputlb.c                |  2 +
>>  include/exec/cpu-defs.h |  4 ++
>>  softmmu_llsc_template.h | 97 ++++++++++++++++++++++++++++++++-----------------
>>  3 files changed, 69 insertions(+), 34 deletions(-)
>>
>> diff --git a/cputlb.c b/cputlb.c
>> index 66df41a..0566e0f 100644
>> --- a/cputlb.c
>> +++ b/cputlb.c
>> @@ -30,6 +30,8 @@
>>  #include "exec/ram_addr.h"
>>  #include "tcg/tcg.h"
>>
>> +#include "sysemu/cpus.h"
>> +
>>  void qemu_mutex_lock_iothread(void);
>>  void qemu_mutex_unlock_iothread(void);
>>
>> diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
>> index c73a75f..40742b3 100644
>> --- a/include/exec/cpu-defs.h
>> +++ b/include/exec/cpu-defs.h
>> @@ -169,5 +169,9 @@ typedef struct CPUIOTLBEntry {
>>      /* Used for atomic instruction translation. */ \
>>      bool ll_sc_context; \
>>      hwaddr excl_protected_hwaddr; \
>> +    /* Used to carry the stcond result and also as a flag to flag a
>> +     * normal store access made by a stcond. */ \
>> +    int excl_succeeded; \
>> +
>>
>>  #endif
>> diff --git a/softmmu_llsc_template.h b/softmmu_llsc_template.h
>> index 81e9d8e..4105e72 100644
>> --- a/softmmu_llsc_template.h
>> +++ b/softmmu_llsc_template.h
>> @@ -54,7 +54,21 @@
>>      (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \
>>  }) \
>>
>> -#define EXCLUSIVE_RESET_ADDR ULLONG_MAX
>> +#define is_read_tlb_entry_set(env, page, index) \
>> +({ \
>> +    (addr & TARGET_PAGE_MASK) \
>> +      == ((env->tlb_table[mmu_idx][index].addr_read) & \
>> +            (TARGET_PAGE_MASK | TLB_INVALID_MASK)); \
>> +}) \
>> +
>> +/* Whenever a SC operation fails, we add a small delay to reduce the
>> + * concurrency among the atomic instruction emulation code. Without this delay,
>> + * in very congested situation where plain stores make all the pending LLs
>> + * fail, the code could reach a stalling situation in which all the SCs happen
>> + * to fail.
>> + * TODO: make the delay dynamic according with the SC failing rate.
>> + * */
>> +#define TCG_ATOMIC_INSN_EMUL_DELAY 100
>
> I'd be tempted to split out this sort of chicanery into a separate patch.
OK, I think it's a good idea since it's not strictly required.

Regards,
alvise

>
>>  WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
>>                                  TCGMemOpIdx oi, uintptr_t retaddr)
>> @@ -65,35 +79,58 @@ WORD_TYPE helper_le_ldlink_name(CPUArchState *env, target_ulong addr,
>>      hwaddr hw_addr;
>>      unsigned mmu_idx = get_mmuidx(oi);
>>
>> -    /* Use the proper load helper from cpu_ldst.h */
>> -    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
>> -
>> -    /* The last legacy access ensures that the TLB and IOTLB entry for 'addr'
>> -     * have been created. */
>>      index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
>> +    if (!is_read_tlb_entry_set(env, addr, index)) {
>> +        tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
>> +    }
>>
>>      /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
>>       * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
>>      hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
>>
>>      /* Set the exclusive-protected hwaddr. */
>> -    env->excl_protected_hwaddr = hw_addr;
>> -    env->ll_sc_context = true;
>> +    qemu_mutex_lock(&tcg_excl_access_lock);
>> +    if (cpu_physical_memory_excl_is_dirty(hw_addr) && !exit_flush_request) {
>> +        exit_flush_request = 1;
>>
>> -    /* No need to mask hw_addr with TARGET_PAGE_MASK since
>> -     * cpu_physical_memory_excl_is_dirty() will take care of that. */
>> -    if (cpu_physical_memory_excl_is_dirty(hw_addr)) {
>> -        cpu_physical_memory_clear_excl_dirty(hw_addr);
>> +        qemu_mutex_unlock(&tcg_excl_access_lock);
>> +
>> +        cpu_exit_init_rendezvous();
>>
>> -        /* Invalidate the TLB entry for the other processors. The next TLB
>> -         * entries for this page will have the TLB_EXCL flag set. */
>>          CPU_FOREACH(cpu) {
>> -            if (cpu != current_cpu) {
>> -                tlb_flush(cpu, 1);
>> +            if ((cpu->thread_id != qemu_get_thread_id())) {
>> +                if (!cpu->pending_tlb_flush) {
>> +                    /* Flush the TLB cache before executing the next TB. */
>> +                    cpu->pending_tlb_flush = 1;
>> +                    cpu_exit_callback_add(cpu, cpu_exit_tlb_flush_all_cb, NULL);
>> +                }
>> +                if (cpu->tcg_executing) {
>> +                    /* We want to wait all the vCPUs that are running in this
>> +                     * exact moment.
>> +                     * Add a callback to be executed as soon as the vCPU exits
>> +                     * from the current TB. Force it to exit. */
>> +                    cpu_exit_rendezvous_add_attendee(cpu);
>> +                    qemu_cpu_kick_thread(cpu);
>> +                }
>>              }
>>          }
>> +
>> +        cpu_exit_rendezvous_wait_others(ENV_GET_CPU(env));
>> +
>> +        exit_flush_request = 0;
>> +
>> +        qemu_mutex_lock(&tcg_excl_access_lock);
>> +        cpu_physical_memory_clear_excl_dirty(hw_addr);
>>      }
>>
>> +    env->ll_sc_context = true;
>> +
>> +    /* Use the proper load helper from cpu_ldst.h */
>> +    env->excl_protected_hwaddr = hw_addr;
>> +    ret = helper_ld_legacy(env, addr, mmu_idx, retaddr);
>> +
>> +    qemu_mutex_unlock(&tcg_excl_access_lock);
>> +
>>      /* For this vCPU, just update the TLB entry, no need to flush. */
>>      env->tlb_table[mmu_idx][index].addr_write |= TLB_EXCL;
>>
>> @@ -106,7 +143,6 @@ WORD_TYPE helper_le_stcond_name(CPUArchState *env, target_ulong addr,
>>  {
>>      WORD_TYPE ret;
>>      int index;
>> -    hwaddr hw_addr;
>>      unsigned mmu_idx = get_mmuidx(oi);
>>
>>      /* If the TLB entry is not the right one, create it. */
>> @@ -115,29 +151,22 @@ WORD_TYPE helper_le_stcond_name(CPUArchState *env, target_ulong addr,
>>          tlb_fill(ENV_GET_CPU(env), addr, MMU_DATA_STORE, mmu_idx, retaddr);
>>      }
>>
>> -    /* hw_addr = hwaddr of the page (i.e. section->mr->ram_addr + xlat)
>> -     * plus the offset (i.e. addr & ~TARGET_PAGE_MASK) */
>> -    hw_addr = (env->iotlb[mmu_idx][index].addr & TARGET_PAGE_MASK) + addr;
>> -
>> -    if (!env->ll_sc_context) {
>> -        /* No LoakLink has been set, the StoreCond has to fail. */
>> -        return 1;
>> -    }
>> -
>>      env->ll_sc_context = 0;
>>
>> -    if (cpu_physical_memory_excl_is_dirty(hw_addr)) {
>> -        /* Another vCPU has accessed the memory after the LoadLink. */
>> -        ret = 1;
>> -    } else {
>> -        helper_st_legacy(env, addr, val, mmu_idx, retaddr);
>> +    /* We set it preventively to true to distinguish the following legacy
>> +     * access as one made by the store conditional wrapper. If the store
>> +     * conditional does not succeed, the value will be set to 0.*/
>> +    env->excl_succeeded = 1;
>> +    helper_st_legacy(env, addr, val, mmu_idx, retaddr);
>>
>> -        /* The StoreConditional succeeded */
>> +    if (env->excl_succeeded) {
>> +        env->excl_succeeded = 0;
>>          ret = 0;
>> +    } else {
>> +        g_usleep(TCG_ATOMIC_INSN_EMUL_DELAY);
>> +        ret = 1;
>>      }
>>
>> -    env->tlb_table[mmu_idx][index].addr_write &= ~TLB_EXCL;
>> -    env->excl_protected_hwaddr = EXCLUSIVE_RESET_ADDR;
>>      /* It's likely that the page will be used again for exclusive accesses,
>>       * for this reason we don't flush any TLB cache at the price of some
>>       * additional slow paths and we don't set the page bit as dirty.
>
> --
> Alex Bennée
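
For readers following the load-link slow path above: it boils down to a barrier-style rendezvous. The initiating vCPU registers every other vCPU that is currently inside a TB, kicks them out of the execution loop, and only proceeds (clearing the EXCL dirty bit and completing the load) once all of them have exited and are therefore guaranteed to refill their TLBs, and see TLB_EXCL, before running another TB. Below is a minimal, self-contained pthread sketch of that waiting pattern, not the series' implementation: every name in it (Rendezvous, rendezvous_begin, rendezvous_add_attendee, ...) is invented for illustration, whereas the patch itself relies on cpu_exit_init_rendezvous(), cpu_exit_rendezvous_add_attendee(), cpu_exit_rendezvous_wait_others() and the per-CPU exit callbacks.

#include <pthread.h>

/* One rendezvous shared by all vCPU threads (illustrative only). */
typedef struct Rendezvous {
    pthread_mutex_t lock;
    pthread_cond_t done;
    int attendees;              /* vCPUs we still have to wait for */
} Rendezvous;

static Rendezvous rv = {
    .lock = PTHREAD_MUTEX_INITIALIZER,
    .done = PTHREAD_COND_INITIALIZER,
};

/* Initiator: start a new rendezvous with no attendees registered yet. */
static void rendezvous_begin(Rendezvous *r)
{
    pthread_mutex_lock(&r->lock);
    r->attendees = 0;
    pthread_mutex_unlock(&r->lock);
}

/* Initiator: register one vCPU that is currently executing a TB,
 * called just before kicking that vCPU out of the execution loop. */
static void rendezvous_add_attendee(Rendezvous *r)
{
    pthread_mutex_lock(&r->lock);
    r->attendees++;
    pthread_mutex_unlock(&r->lock);
}

/* Attendee: signal that this vCPU has left its current TB. */
static void rendezvous_attendee_left(Rendezvous *r)
{
    pthread_mutex_lock(&r->lock);
    if (--r->attendees == 0) {
        pthread_cond_broadcast(&r->done);
    }
    pthread_mutex_unlock(&r->lock);
}

/* Initiator: block until every registered attendee has left its TB, i.e.
 * until every running vCPU will go through its exit callback (and flush
 * its TLB) before executing another TB. */
static void rendezvous_wait_others(Rendezvous *r)
{
    pthread_mutex_lock(&r->lock);
    while (r->attendees > 0) {
        pthread_cond_wait(&r->done, &r->lock);
    }
    pthread_mutex_unlock(&r->lock);
}

The exit_flush_request flag in the patch plays the role of allowing only one such rendezvous at a time, and the TCG_ATOMIC_INSN_EMUL_DELAY back-off on the store-conditional side (the g_usleep() call when excl_succeeded is cleared) exists precisely because this scheme can make competing vCPUs keep invalidating each other's LL/SC pairs; a short sleep before retrying breaks that livelock, as the comment added by the patch explains.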