On Fri, Jul 17, 2015 at 2:51 PM, Alex Bennée <alex.ben...@linaro.org> wrote: > > Alvise Rigo <a.r...@virtualopensystems.com> writes: > >> Implement strex and ldrex instruction relying on TCG's qemu_ldlink and >> qemu_stcond. For the time being only the 32bit instructions are supported. >> >> Suggested-by: Jani Kokkonen <jani.kokko...@huawei.com> >> Suggested-by: Claudio Fontana <claudio.font...@huawei.com> >> Signed-off-by: Alvise Rigo <a.r...@virtualopensystems.com> >> --- >> target-arm/translate.c | 87 ++++++++++++++++++++++++++++++++++- >> tcg/arm/tcg-target.c | 121 >> +++++++++++++++++++++++++++++++++++++------------ >> 2 files changed, 178 insertions(+), 30 deletions(-) >> >> diff --git a/target-arm/translate.c b/target-arm/translate.c >> index 80302cd..0366c76 100644 >> --- a/target-arm/translate.c >> +++ b/target-arm/translate.c >> @@ -72,6 +72,8 @@ static TCGv_i64 cpu_exclusive_test; >> static TCGv_i32 cpu_exclusive_info; >> #endif >> >> +static TCGv_i32 cpu_ll_sc_context; >> + >> /* FIXME: These should be removed. */ >> static TCGv_i32 cpu_F0s, cpu_F1s; >> static TCGv_i64 cpu_F0d, cpu_F1d; >> @@ -103,6 +105,8 @@ void arm_translate_init(void) >> offsetof(CPUARMState, exclusive_addr), "exclusive_addr"); >> cpu_exclusive_val = tcg_global_mem_new_i64(TCG_AREG0, >> offsetof(CPUARMState, exclusive_val), "exclusive_val"); >> + cpu_ll_sc_context = tcg_global_mem_new_i32(TCG_AREG0, >> + offsetof(CPUARMState, ll_sc_context), "ll_sc_context"); >> #ifdef CONFIG_USER_ONLY >> cpu_exclusive_test = tcg_global_mem_new_i64(TCG_AREG0, >> offsetof(CPUARMState, exclusive_test), "exclusive_test"); >> @@ -961,6 +965,18 @@ DO_GEN_ST(8, MO_UB) >> DO_GEN_ST(16, MO_TEUW) >> DO_GEN_ST(32, MO_TEUL) >> >> +/* Load/Store exclusive generators (always unsigned) */ >> +static inline void gen_aa32_ldex32(TCGv_i32 val, TCGv_i32 addr, int index) >> +{ >> + tcg_gen_qemu_ldlink_i32(val, addr, index, MO_TEUL | MO_EXCL); >> +} >> + >> +static inline void gen_aa32_stex32(TCGv_i32 is_dirty, TCGv_i32 val, >> + TCGv_i32 addr, int index) >> +{ >> + tcg_gen_qemu_stcond_i32(is_dirty, val, addr, index, MO_TEUL | MO_EXCL); >> +} >> + >> static inline void gen_set_pc_im(DisasContext *s, target_ulong val) >> { >> tcg_gen_movi_i32(cpu_R[15], val); >> @@ -7427,6 +7443,26 @@ static void gen_load_exclusive(DisasContext *s, int >> rt, int rt2, >> store_reg(s, rt, tmp); >> } >> >> +static void gen_load_exclusive_multi(DisasContext *s, int rt, int rt2, >> + TCGv_i32 addr, int size) >> +{ >> + TCGv_i32 tmp = tcg_temp_new_i32(); >> + >> + switch (size) { >> + case 0: >> + case 1: >> + abort(); >> + case 2: >> + gen_aa32_ldex32(tmp, addr, get_mem_index(s)); >> + break; >> + case 3: >> + default: >> + abort(); >> + } >> + >> + store_reg(s, rt, tmp); >> +} >> + >> static void gen_clrex(DisasContext *s) >> { >> gen_helper_atomic_clear(cpu_env); >> @@ -7460,6 +7496,52 @@ static void gen_store_exclusive(DisasContext *s, int >> rd, int rt, int rt2, >> tcg_temp_free_i64(val); >> tcg_temp_free_i32(tmp_size); >> } >> + >> +static void gen_store_exclusive_multi(DisasContext *s, int rd, int rt, int >> rt2, >> + TCGv_i32 addr, int size) >> +{ >> + TCGv_i32 tmp; >> + TCGv_i32 is_dirty; >> + TCGLabel *done_label; >> + TCGLabel *fail_label; >> + >> + fail_label = gen_new_label(); >> + done_label = gen_new_label(); >> + >> + tmp = tcg_temp_new_i32(); >> + is_dirty = tcg_temp_new_i32(); >> + >> + /* Fail if we are not in LL/SC context. 
*/ >> + tcg_gen_brcondi_i32(TCG_COND_NE, cpu_ll_sc_context, 1, fail_label); >> + >> + tmp = load_reg(s, rt); >> + switch (size) { >> + case 0: >> + case 1: >> + abort(); >> + break; >> + case 2: >> + gen_aa32_stex32(is_dirty, tmp, addr, get_mem_index(s)); >> + break; >> + case 3: >> + default: >> + abort(); >> + } >> + >> + tcg_temp_free_i32(tmp); >> + >> + /* Check if the store conditional has to fail. */ >> + tcg_gen_brcondi_i32(TCG_COND_EQ, is_dirty, 1, fail_label); >> + tcg_temp_free_i32(is_dirty); >> + >> + tcg_temp_free_i32(tmp); >> + >> + tcg_gen_movi_i32(cpu_R[rd], 0); /* is_dirty = 0 */ >> + tcg_gen_br(done_label); >> + gen_set_label(fail_label); >> + tcg_gen_movi_i32(cpu_R[rd], 1); /* is_dirty = 1 */ >> + gen_set_label(done_label); >> +} >> #endif >> >> /* gen_srs: >> @@ -8308,7 +8390,7 @@ static void disas_arm_insn(DisasContext *s, unsigned >> int insn) >> } else if (insn & (1 << 20)) { >> switch (op1) { >> case 0: /* ldrex */ >> - gen_load_exclusive(s, rd, 15, addr, 2); >> + gen_load_exclusive_multi(s, rd, 15, addr, >> 2); >> break; >> case 1: /* ldrexd */ >> gen_load_exclusive(s, rd, rd + 1, addr, 3); >> @@ -8326,7 +8408,8 @@ static void disas_arm_insn(DisasContext *s, unsigned >> int insn) >> rm = insn & 0xf; >> switch (op1) { >> case 0: /* strex */ >> - gen_store_exclusive(s, rd, rm, 15, addr, 2); >> + gen_store_exclusive_multi(s, rd, rm, 15, >> + addr, 2); >> break; >> case 1: /* strexd */ >> gen_store_exclusive(s, rd, rm, rm + 1, >> addr, 3); >> diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c >> index ae2ec7a..f2b69a0 100644 >> --- a/tcg/arm/tcg-target.c >> +++ b/tcg/arm/tcg-target.c >> @@ -1069,6 +1069,17 @@ static void * const qemu_ld_helpers[16] = { >> [MO_BESL] = helper_be_ldul_mmu, >> }; > > So I'm guessing we'll be implementing this for every TCG backend?
Correct. According to the architecture's atomic instructions, a subset of these helpers will be implemented. > >> >> +/* LoadLink helpers, only unsigned. Use the macro below to access them. */ >> +static void * const qemu_ldex_helpers[16] = { >> + [MO_LEUL] = helper_le_ldlinkul_mmu, >> +}; >> + >> +#define LDEX_HELPER(mem_op) \ >> +({ \ >> + assert(mem_op & MO_EXCL); \ >> + qemu_ldex_helpers[((int)mem_op - MO_EXCL)]; \ >> +}) >> + >> /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr, >> * uintxx_t val, int mmu_idx, uintptr_t >> ra) >> */ >> @@ -1082,6 +1093,19 @@ static void * const qemu_st_helpers[16] = { >> [MO_BEQ] = helper_be_stq_mmu, >> }; >> >> +/* StoreConditional helpers. Use the macro below to access them. */ >> +static void * const qemu_stex_helpers[16] = { >> + [MO_LEUL] = helper_le_stcondl_mmu, >> +}; >> + >> +#define STEX_HELPER(mem_op) \ >> +({ \ >> + assert(mem_op & MO_EXCL); \ >> + qemu_stex_helpers[(int)mem_op - MO_EXCL]; \ >> +}) > > Can the lookup not be merged with the existing ldst_helpers? Sure. > >> + >> + >> + >> /* Helper routines for marshalling helper function arguments into >> * the correct registers and stack. >> * argreg is where we want to put this argument, arg is the argument itself. >> @@ -1222,13 +1246,14 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg >> addrlo, TCGReg addrhi, >> path for a load or store, so that we can later generate the correct >> helper code. */ >> static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi, >> - TCGReg datalo, TCGReg datahi, TCGReg addrlo, >> - TCGReg addrhi, tcg_insn_unit *raddr, >> - tcg_insn_unit *label_ptr) >> + TCGReg llsc_success, TCGReg datalo, >> + TCGReg datahi, TCGReg addrlo, TCGReg addrhi, >> + tcg_insn_unit *raddr, tcg_insn_unit >> *label_ptr) >> { >> TCGLabelQemuLdst *label = new_ldst_label(s); >> >> label->is_ld = is_ld; >> + label->llsc_success = llsc_success; >> label->oi = oi; >> label->datalo_reg = datalo; >> label->datahi_reg = datahi; >> @@ -1259,12 +1284,16 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, >> TCGLabelQemuLdst *lb) >> /* For armv6 we can use the canonical unsigned helpers and minimize >> icache usage. For pre-armv6, use the signed helpers since we do >> not have a single insn sign-extend. */ >> - if (use_armv6_instructions) { >> - func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]; >> + if (opc & MO_EXCL) { >> + func = LDEX_HELPER(opc); >> } else { >> - func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)]; >> - if (opc & MO_SIGN) { >> - opc = MO_UL; >> + if (use_armv6_instructions) { >> + func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]; >> + } else { >> + func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)]; >> + if (opc & MO_SIGN) { >> + opc = MO_UL; >> + } >> } >> } >> tcg_out_call(s, func); >> @@ -1336,8 +1365,15 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, >> TCGLabelQemuLdst *lb) >> argreg = tcg_out_arg_imm32(s, argreg, oi); >> argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14); >> >> - /* Tail-call to the helper, which will return to the fast path. */ >> - tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]); >> + if (opc & MO_EXCL) { >> + tcg_out_call(s, STEX_HELPER(opc)); >> + /* Save the output of the StoreConditional */ >> + tcg_out_mov_reg(s, COND_AL, lb->llsc_success, TCG_REG_R0); >> + tcg_out_goto(s, COND_AL, lb->raddr); >> + } else { >> + /* Tail-call to the helper, which will return to the fast path. 
*/ >> + tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & (MO_BSWAP | >> MO_SIZE)]); >> + } >> } >> #endif /* SOFTMMU */ >> >> @@ -1461,7 +1497,8 @@ static inline void tcg_out_qemu_ld_direct(TCGContext >> *s, TCGMemOp opc, >> } >> } >> >> -static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64) >> +static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64, >> + bool isLoadLink) >> { >> TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused)); >> TCGMemOpIdx oi; >> @@ -1484,13 +1521,20 @@ static void tcg_out_qemu_ld(TCGContext *s, const >> TCGArg *args, bool is64) >> addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, >> 1); >> >> /* This a conditional BL only to load a pointer within this opcode into >> LR >> - for the slow path. We will not be using the value for a tail call. >> */ >> - label_ptr = s->code_ptr; >> - tcg_out_bl_noaddr(s, COND_NE); >> + for the slow path. We will not be using the value for a tail call. >> + In the context of a LoadLink instruction, we don't check the TLB but >> we >> + always follow the slow path. */ >> + if (isLoadLink) { >> + label_ptr = s->code_ptr; >> + tcg_out_bl_noaddr(s, COND_AL); >> + } else { >> + label_ptr = s->code_ptr; >> + tcg_out_bl_noaddr(s, COND_NE); > > This seems a little redundant, we could set label_ptr outside the > isLoadLink check as it will be the same in both cases. However if we are > always taking the slow path for exclusive accesses then why should we > generate a tcg_out_tlb_read()? Do we need some side effects from calling > it for the slow path? Good point, we can avoid the TLB read if we are doing a SC/LL. In fact, the first thing we do in softmmu_llsc_template.h is to update the corresponding TLB entry if necessary. Thank you, alvise > >> >> - tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend); >> + tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend); >> + } >> >> - add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi, >> + add_qemu_ldst_label(s, true, oi, 0, datalo, datahi, addrlo, addrhi, >> s->code_ptr, label_ptr); >> #else /* !CONFIG_SOFTMMU */ >> if (GUEST_BASE) { >> @@ -1592,9 +1636,11 @@ static inline void tcg_out_qemu_st_direct(TCGContext >> *s, TCGMemOp opc, >> } >> } >> >> -static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) >> +static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64, >> + bool isStoreCond) >> { >> TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused)); >> + TCGReg llsc_success; >> TCGMemOpIdx oi; >> TCGMemOp opc; >> #ifdef CONFIG_SOFTMMU >> @@ -1603,6 +1649,9 @@ static void tcg_out_qemu_st(TCGContext *s, const >> TCGArg *args, bool is64) >> tcg_insn_unit *label_ptr; >> #endif >> >> + /* The stcond variant has one more param */ >> + llsc_success = (isStoreCond ? *args++ : 0); >> + >> datalo = *args++; >> datahi = (is64 ? 
*args++ : 0); >> addrlo = *args++; >> @@ -1612,16 +1661,24 @@ static void tcg_out_qemu_st(TCGContext *s, const >> TCGArg *args, bool is64) >> >> #ifdef CONFIG_SOFTMMU >> mem_index = get_mmuidx(oi); >> - addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, >> 0); >> >> - tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend); >> + if (isStoreCond) { >> + /* Always follow the slow-path for an exclusive access */ >> + label_ptr = s->code_ptr; >> + tcg_out_bl_noaddr(s, COND_AL); >> + } else { >> + addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, >> + mem_index, 0); >> >> - /* The conditional call must come last, as we're going to return here. >> */ >> - label_ptr = s->code_ptr; >> - tcg_out_bl_noaddr(s, COND_NE); >> + tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, >> addend); >> >> - add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi, >> - s->code_ptr, label_ptr); >> + /* The conditional call must come last, as we're going to return >> here.*/ >> + label_ptr = s->code_ptr; >> + tcg_out_bl_noaddr(s, COND_NE); >> + } >> + >> + add_qemu_ldst_label(s, false, oi, llsc_success, datalo, datahi, addrlo, >> + addrhi, s->code_ptr, label_ptr); > > Indeed this does what I commented about above ;-) > >> #else /* !CONFIG_SOFTMMU */ >> if (GUEST_BASE) { >> tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, GUEST_BASE); >> @@ -1864,16 +1921,22 @@ static inline void tcg_out_op(TCGContext *s, >> TCGOpcode opc, >> break; >> >> case INDEX_op_qemu_ld_i32: >> - tcg_out_qemu_ld(s, args, 0); >> + tcg_out_qemu_ld(s, args, 0, 0); >> + break; >> + case INDEX_op_qemu_ldlink_i32: >> + tcg_out_qemu_ld(s, args, 0, 1); /* LoadLink */ >> break; >> case INDEX_op_qemu_ld_i64: >> - tcg_out_qemu_ld(s, args, 1); >> + tcg_out_qemu_ld(s, args, 1, 0); >> break; >> case INDEX_op_qemu_st_i32: >> - tcg_out_qemu_st(s, args, 0); >> + tcg_out_qemu_st(s, args, 0, 0); >> + break; >> + case INDEX_op_qemu_stcond_i32: >> + tcg_out_qemu_st(s, args, 0, 1); /* StoreConditional */ >> break; >> case INDEX_op_qemu_st_i64: >> - tcg_out_qemu_st(s, args, 1); >> + tcg_out_qemu_st(s, args, 1, 0); >> break; >> >> case INDEX_op_bswap16_i32: >> @@ -1957,8 +2020,10 @@ static const TCGTargetOpDef arm_op_defs[] = { >> >> #if TARGET_LONG_BITS == 32 >> { INDEX_op_qemu_ld_i32, { "r", "l" } }, >> + { INDEX_op_qemu_ldlink_i32, { "r", "l" } }, >> { INDEX_op_qemu_ld_i64, { "r", "r", "l" } }, >> { INDEX_op_qemu_st_i32, { "s", "s" } }, >> + { INDEX_op_qemu_stcond_i32, { "r", "s", "s" } }, >> { INDEX_op_qemu_st_i64, { "s", "s", "s" } }, >> #else >> { INDEX_op_qemu_ld_i32, { "r", "l", "l" } }, > > -- > Alex Bennée
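
To make the "merge the lookup" point above concrete, here is one possible shape for folding the exclusive entries into the existing tables. This is only a sketch: it assumes MO_EXCL is the next free memop bit above MO_BSWAP, so the tables can simply be doubled in size and indexed directly; only a couple of representative entries are shown, and the ldlink/stcond helper names are the ones this series introduces.

    /* Sketch: let the MO_EXCL bit select the LL/SC entry directly, so the
     * separate qemu_ldex_helpers/qemu_stex_helpers arrays and the
     * LDEX_HELPER()/STEX_HELPER() macros are no longer needed. */
    static void * const qemu_ld_helpers[32] = {
        [MO_LEUL]           = helper_le_ldul_mmu,
        [MO_BEUL]           = helper_be_ldul_mmu,
        /* ... remaining non-exclusive entries as today ... */
        [MO_EXCL | MO_LEUL] = helper_le_ldlinkul_mmu,   /* LoadLink */
    };

    static void * const qemu_st_helpers[32] = {
        [MO_LEUL]           = helper_le_stl_mmu,
        /* ... remaining non-exclusive entries as today ... */
        [MO_EXCL | MO_LEUL] = helper_le_stcondl_mmu,    /* StoreConditional */
    };

    /* The slow-path lookup then becomes a single expression (the pre-armv6
     * signed variant would mask MO_SSIZE instead, as today): */
    func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE | MO_EXCL)];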
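
On the TLB read: dropping tcg_out_tlb_read() for the exclusive case would make the load side mirror the store-conditional side above. Roughly (untested sketch, same structure as the strex path already in the patch):

    /* For a LoadLink, skip the TLB probe and always branch to the slow
     * path; the LL helper refills the TLB entry itself if needed, as
     * described above for softmmu_llsc_template.h.  The normal fast path
     * is unchanged. */
    if (isLoadLink) {
        label_ptr = s->code_ptr;
        tcg_out_bl_noaddr(s, COND_AL);
    } else {
        addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE,
                                  mem_index, 1);
        label_ptr = s->code_ptr;
        tcg_out_bl_noaddr(s, COND_NE);
        tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend);
    }

    add_qemu_ldst_label(s, true, oi, 0, datalo, datahi, addrlo, addrhi,
                        s->code_ptr, label_ptr);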