Richard Henderson <r...@twiddle.net> writes:

> From: "Emilio G. Cota" <c...@braap.org>
>
> Emulating LL/SC with cmpxchg is not correct, since it can
> suffer from the ABA problem. Portable parallel code, however,
> is written assuming only cmpxchg--and not LL/SC--is available.
> This means that in practice emulating LL/SC with cmpxchg is
> a viable alternative.
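(Aside for readers who have not run into the ABA caveat before: here is a
minimal, purely illustrative sketch -- none of this is QEMU code, and all
names are made up -- of why a value-comparing cmpxchg can succeed where a
real exclusive monitor would fail:

    /* Illustrative only: emulated LDREX/STREX via a value compare. */
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint32_t mem = 42;   /* the guest memory word */
    static uint32_t exclusive_val;      /* value remembered at LDREX time */

    /* Emulated LDREX: remember the loaded value. */
    static uint32_t emu_ldrex(void)
    {
        exclusive_val = atomic_load(&mem);
        return exclusive_val;
    }

    /* Emulated STREX: succeeds iff the word still holds the remembered
     * value.  Returns 0 on success, 1 on failure (ARM convention). */
    static int emu_strex(uint32_t newval)
    {
        uint32_t expected = exclusive_val;
        return atomic_compare_exchange_strong(&mem, &expected, newval) ? 0 : 1;
    }

    int main(void)
    {
        uint32_t old = emu_ldrex();     /* LDREX sees 42 */

        /* Another thread changes the word 42 -> 7 -> 42 in between.
         * A real exclusive monitor would make the STREX fail; the value
         * comparison cannot see the intermediate write, so it succeeds. */
        atomic_store(&mem, 7);
        atomic_store(&mem, 42);

        printf("STREX %s\n", emu_strex(old + 1) ? "failed" : "succeeded");
        return 0;
    }

Code written against cmpxchg alone never relies on catching such
intermediate writes, which is why the emulation is fine in practice.)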
>
> The appended emulates LL/SC pairs in ARM with cmpxchg helpers.
> This works in both user and system mode. In usermode, it avoids
> pausing all other CPUs to perform the LL/SC pair. The subsequent
> performance and scalability improvement is significant, as the
> plots below show. They plot the throughput of atomic_add-bench
> compiled for ARM and executed on a 64-core x86 machine.
>
> Hi-res plots: http://imgur.com/a/aNQpB
>
> [Four ASCII plots elided: atomic_add-bench throughput (1000000 ops/thread)
>  vs. number of threads, 0-64, for the ranges [0,1], [0,2], [0,128] and
>  [0,1024], each comparing "cmpxchg" against "master".]
>
> [rth: Enforce alignment for ldrexd.]
>
> Reviewed-by: Alex Bennée <alex.ben...@linaro.org>
> Signed-off-by: Emilio G. Cota <c...@braap.org>
> Message-Id: <1467054136-10430-23-git-send-email-c...@braap.org>
> Signed-off-by: Richard Henderson <r...@twiddle.net>
> ---
>  target-arm/translate.c | 136 ++++++++++++++++---------------------------------
>  1 file changed, 45 insertions(+), 91 deletions(-)
>
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index f745c37..e0e29d9 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -7683,45 +7683,31 @@ static void gen_logicq_cc(TCGv_i32 lo, TCGv_i32 hi)
>
>  /* Load/Store exclusive instructions are implemented by remembering
>     the value/address loaded, and seeing if these are the same
> -   when the store is performed. This should be sufficient to implement
> +   when the store is performed.  This should be sufficient to implement
>     the architecturally mandated semantics, and avoids having to monitor
> -   regular stores.
> -
> -   In system emulation mode only one CPU will be running at once, so
> -   this sequence is effectively atomic.  In user emulation mode we
> -   throw an exception and handle the atomic operation elsewhere. */
> +   regular stores.  The compare vs the remembered value is done during
> +   the cmpxchg operation, but we must compare the addresses manually. */
>  static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
>                                 TCGv_i32 addr, int size)
>  {
>      TCGv_i32 tmp = tcg_temp_new_i32();
> +    TCGMemOp opc = size | MO_ALIGN | s->be_data;
>
>      s->is_ldex = true;
>
> -    switch (size) {
> -    case 0:
> -        gen_aa32_ld8u(s, tmp, addr, get_mem_index(s));
> -        break;
> -    case 1:
> -        gen_aa32_ld16ua(s, tmp, addr, get_mem_index(s));
> -        break;
> -    case 2:
> -    case 3:
> -        gen_aa32_ld32ua(s, tmp, addr, get_mem_index(s));
> -        break;
> -    default:
> -        abort();
> -    }
> -
>      if (size == 3) {
>          TCGv_i32 tmp2 = tcg_temp_new_i32();
> -        TCGv_i32 tmp3 = tcg_temp_new_i32();
> +        TCGv_i64 t64 = tcg_temp_new_i64();
> +
> +        gen_aa32_ld_i64(s, t64, addr, get_mem_index(s), opc);
> +        tcg_gen_mov_i64(cpu_exclusive_val, t64);
> +        tcg_gen_extr_i64_i32(tmp, tmp2, t64);
> +        tcg_temp_free_i64(t64);
>
> -        tcg_gen_addi_i32(tmp2, addr, 4);
> -        gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s));
> +        store_reg(s, rt2, tmp2);
>          tcg_temp_free_i32(tmp2);
> -        tcg_gen_concat_i32_i64(cpu_exclusive_val, tmp, tmp3);
> -        store_reg(s, rt2, tmp3);

store_reg() magically frees the tmp it is passed, which means this trips up
TCG when debugging is turned on.
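For anyone not familiar with the helper: store_reg() in
target-arm/translate.c takes ownership of the temp and frees it itself, so
the caller must not free (or keep using) it afterwards. Roughly the
following shape, paraphrased rather than quoted from the source:

    static void store_reg(DisasContext *s, int reg, TCGv_i32 var)
    {
        /* ... special handling for writes to the PC elided ... */
        tcg_gen_mov_i32(cpu_R[reg], var);
        tcg_temp_free_i32(var);   /* store_reg() owns and releases var */
    }

So the new "store_reg(s, rt2, tmp2);" followed by the pre-existing
"tcg_temp_free_i32(tmp2);" ends up freeing tmp2 twice, which the TCG temp
accounting complains about when debugging is turned on.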
>      } else {
> +        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), opc);
>          tcg_gen_extu_i32_i64(cpu_exclusive_val, tmp);
>      }
>
> @@ -7734,23 +7720,15 @@ static void gen_clrex(DisasContext *s)
>      tcg_gen_movi_i64(cpu_exclusive_addr, -1);
>  }
>
> -#ifdef CONFIG_USER_ONLY
>  static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>                                  TCGv_i32 addr, int size)
>  {
> -    tcg_gen_extu_i32_i64(cpu_exclusive_test, addr);
> -    tcg_gen_movi_i32(cpu_exclusive_info,
> -                     size | (rd << 4) | (rt << 8) | (rt2 << 12));
> -    gen_exception_internal_insn(s, 4, EXCP_STREX);
> -}
> -#else
> -static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
> -                                TCGv_i32 addr, int size)
> -{
> -    TCGv_i32 tmp;
> -    TCGv_i64 val64, extaddr;
> +    TCGv_i32 t0, t1, t2;
> +    TCGv_i64 extaddr;
> +    TCGv taddr;
>      TCGLabel *done_label;
>      TCGLabel *fail_label;
> +    TCGMemOp opc = size | MO_ALIGN | s->be_data;
>
>      /* if (env->exclusive_addr == addr && env->exclusive_val == [addr]) {
>           [addr] = {Rt};
> @@ -7765,69 +7743,45 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
>      tcg_gen_brcond_i64(TCG_COND_NE, extaddr, cpu_exclusive_addr, fail_label);
>      tcg_temp_free_i64(extaddr);
>
> -    tmp = tcg_temp_new_i32();
> -    switch (size) {
> -    case 0:
> -        gen_aa32_ld8u(s, tmp, addr, get_mem_index(s));
> -        break;
> -    case 1:
> -        gen_aa32_ld16u(s, tmp, addr, get_mem_index(s));
> -        break;
> -    case 2:
> -    case 3:
> -        gen_aa32_ld32u(s, tmp, addr, get_mem_index(s));
> -        break;
> -    default:
> -        abort();
> -    }
> -
> -    val64 = tcg_temp_new_i64();
> +    taddr = gen_aa32_addr(s, addr, opc);
> +    t0 = tcg_temp_new_i32();
> +    t1 = load_reg(s, rt);
>      if (size == 3) {
> -        TCGv_i32 tmp2 = tcg_temp_new_i32();
> -        TCGv_i32 tmp3 = tcg_temp_new_i32();
> -        tcg_gen_addi_i32(tmp2, addr, 4);
> -        gen_aa32_ld32u(s, tmp3, tmp2, get_mem_index(s));
> -        tcg_temp_free_i32(tmp2);
> -        tcg_gen_concat_i32_i64(val64, tmp, tmp3);
> -        tcg_temp_free_i32(tmp3);
> -    } else {
> -        tcg_gen_extu_i32_i64(val64, tmp);
> -    }
> -    tcg_temp_free_i32(tmp);
> +        TCGv_i64 o64 = tcg_temp_new_i64();
> +        TCGv_i64 n64 = tcg_temp_new_i64();
>
> -    tcg_gen_brcond_i64(TCG_COND_NE, val64, cpu_exclusive_val, fail_label);
> -    tcg_temp_free_i64(val64);
> +        t2 = load_reg(s, rt2);
> +        tcg_gen_concat_i32_i64(n64, t1, t2);
> +        tcg_temp_free_i32(t2);
> +        gen_aa32_frob64(s, n64);
>
> -    tmp = load_reg(s, rt);
> -    switch (size) {
> -    case 0:
> -        gen_aa32_st8(s, tmp, addr, get_mem_index(s));
> -        break;
> -    case 1:
> -        gen_aa32_st16(s, tmp, addr, get_mem_index(s));
> -        break;
> -    case 2:
> -    case 3:
> -        gen_aa32_st32(s, tmp, addr, get_mem_index(s));
> -        break;
> -    default:
> -        abort();
> -    }
> -    tcg_temp_free_i32(tmp);
> -    if (size == 3) {
> -        tcg_gen_addi_i32(addr, addr, 4);
> -        tmp = load_reg(s, rt2);
> -        gen_aa32_st32(s, tmp, addr, get_mem_index(s));
> -        tcg_temp_free_i32(tmp);
> +        tcg_gen_atomic_cmpxchg_i64(o64, taddr, cpu_exclusive_val, n64,
> +                                   get_mem_index(s), opc);
> +        tcg_temp_free_i64(n64);
> +
> +        gen_aa32_frob64(s, o64);
> +        tcg_gen_setcond_i64(TCG_COND_NE, o64, o64, cpu_exclusive_val);
> +        tcg_gen_extrl_i64_i32(t0, o64);
> +
> +        tcg_temp_free_i64(o64);
> +    } else {
> +        t2 = tcg_temp_new_i32();
> +        tcg_gen_extrl_i64_i32(t2, cpu_exclusive_val);
> +        tcg_gen_atomic_cmpxchg_i32(t0, taddr, t2, t1, get_mem_index(s), opc);
> +        tcg_gen_setcond_i32(TCG_COND_NE, t0, t0, t2);
> +        tcg_temp_free_i32(t2);
>      }
> -    tcg_gen_movi_i32(cpu_R[rd], 0);
> +    tcg_temp_free_i32(t1);
> +    tcg_temp_free(taddr);
> +    tcg_gen_mov_i32(cpu_R[rd], t0);
> +    tcg_temp_free_i32(t0);
>      tcg_gen_br(done_label);
> +
>      gen_set_label(fail_label);
>      tcg_gen_movi_i32(cpu_R[rd], 1);
>      gen_set_label(done_label);
>      tcg_gen_movi_i64(cpu_exclusive_addr, -1);
>  }
> -#endif
>
>  /* gen_srs:
>   * @env: CPUARMState

--
Alex Bennée