Richard Henderson <r...@twiddle.net> writes: > From: "Emilio G. Cota" <c...@braap.org> > > It's been superseded by the atomic helpers. > > The use of the atomic helpers provides a significant performance and > scalability > improvement. Below is the result of running the atomic_add-test > microbenchmark with: > $ x86_64-linux-user/qemu-x86_64 tests/atomic_add-bench -o 5000000 -r $r -n $n > , where $n is the number of threads and $r is the allowed range for the > additions. > > The scenarios measured are: > - atomic: implements x86' ADDL with the atomic_add helper (i.e. this patchset) > - cmpxchg: implement x86' ADDL with a TCG loop using the cmpxchg helper > - master: before this patchset > > Results sorted in ascending range, i.e. descending degree of contention. > Y axis is Throughput in Mops/s. Tests are run on an AMD machine with 64 > Opteron 6376 cores. > > atomic_add-bench: 5000000 ops/thread, [0,1] range > > 25 ++---------+----------+---------+----------+----------+----------+---++ > + atomic +-E--+ + + + + + | > |cmpxchg +-H--+ | > 20 +Emaster +-N--+ ++ > || | > |++ | > || | > 15 +++ ++ > |N| | > |+| | > 10 ++| ++ > |+|+ | > | | -+E+------ +++ ---+E+------+E+------+E+-----+E+------+E| > |+E+E+- +++ +E+------+E+-- | > 5 ++|+ ++ > |+N+H+--- +++ | > ++++N+--+H++----+++ + +++ --++H+------+H+------+H++----+H+---+--- | > 0 ++---------+-----H----+---H-----+----------+----------+----------+---H+ > 0 10 20 30 40 50 60 > Number of threads > > atomic_add-bench: 5000000 ops/thread, [0,2] range > > 25 ++---------+----------+---------+----------+----------+----------+---++ > ++atomic +-E--+ + + + + + | > |cmpxchg +-H--+ | > 20 ++master +-N--+ ++ > |E| | > |++ | > ||E | > 15 ++| ++ > |N|| | > |+|| ---+E+------+E+-----+E+------+E| > 10 ++| | ---+E+------+E+-----+E+--- +++ +++ > ||H+E+--+E+-- | > |+++++ | > | || | > 5 ++|+H+-- +++ ++ > |+N+ - ---+H+------+H+------ | > + +N+--+H++----+H+---+--+H+----++H+--- + + +H+---+--+H| > 0 ++---------+----------+---------+----------+----------+----------+---++ > 0 10 20 30 40 50 60 > Number of threads > > atomic_add-bench: 5000000 ops/thread, [0,8] range > > 40 ++---------+----------+---------+----------+----------+----------+---++ > ++atomic +-E--+ + + + + + | > 35 +cmpxchg +-H--+ ++ > | master +-N--+ ---+E+------+E+------+E+-----+E+------+E| > 30 ++| ---+E+-- +++ ++ > | | -+E+--- | > 25 ++E ---- +++ ++ > |+++++ -+E+ | > 20 +E+ E-- +++ ++ > |H|+++ | > |+| +H+------- | > 15 ++H+ ---+++ +H+------ ++ > |N++H+-- +++--- +H+------++| > 10 ++ +++ - +++ ---+H+ +++ +H+ > | | +H+-----+H+------+H+-- | > 5 ++| +++ ++ > ++N+N+--+N++ + + + + + | > 0 ++---------+----------+---------+----------+----------+----------+---++ > 0 10 20 30 40 50 60 > Number of threads > > atomic_add-bench: 5000000 ops/thread, [0,128] range > > 160 ++---------+---------+----------+---------+----------+----------+---++ > + atomic +-E--+ + + + + + | > 140 +cmpxchg +-H--+ +++ +++ ++ > | master +-N--+ E--------E------+E+------++| > 120 ++ --| | +++ E+ > | -- +++ +++ ++| > 100 ++ - ++ > | +++- +++ ++| > 80 ++ -+E+ -+H+------+H+------H--------++ > | ---- ---- +++ H| > | ---+E+-----+E+- ---+H+ ++| > 60 ++ +E+--- +++ ---+H+--- ++ > | --+++ ---+H+-- | > 40 ++ +E+-+H+--- ++ > | +H+ | > 20 +EE+ ++ > +N+ + + + + + + | > 0 ++N-N---N--+---------+----------+---------+----------+----------+---++ > 0 10 20 30 40 50 60 > Number of threads > > atomic_add-bench: 5000000 ops/thread, [0,1024] range > > 350 ++---------+---------+----------+---------+----------+----------+---++ > + atomic +-E--+ + + + + + | > 300 +cmpxchg +-H--+ +++ > | master +-N--+ +++ || > | +++ | ----E| > 250 ++ | ----E---- ++ > | ----E--- | ---+H| > 200 ++ -+E+--- +++ ---+H+--- ++ > | ---- -+H+-- | > | +E+ +++ ---- +++ | > 150 ++ ---+++ ---+H+- ++ > | --- -+H+-- | > 100 ++ ---+E+ ---- +++ ++ > | +++ ---+E+-----+H+- | > | -+E+------+H+-- | > 50 ++ +E+ ++ > +EE+ + + + + + + | > 0 ++N-N---N--+---------+----------+---------+----------+----------+---++ > 0 10 20 30 40 50 60 > Number of threads > > hi-res: http://imgur.com/a/fMRmq > > For master I stopped measuring master after 8 threads, because there is little > point in measuring the well-known performance collapse of a contended lock. > > Signed-off-by: Emilio G. Cota <c...@braap.org> > Message-Id: <1467054136-10430-21-git-send-email-c...@braap.org> > Signed-off-by: Richard Henderson <r...@twiddle.net> > --- > target-i386/helper.h | 2 -- > target-i386/mem_helper.c | 33 --------------------------------- > target-i386/translate.c | 15 --------------- > 3 files changed, 50 deletions(-) > > diff --git a/target-i386/helper.h b/target-i386/helper.h > index 729d4b6..4e859eb 100644 > --- a/target-i386/helper.h > +++ b/target-i386/helper.h > @@ -1,8 +1,6 @@ > DEF_HELPER_FLAGS_4(cc_compute_all, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int) > DEF_HELPER_FLAGS_4(cc_compute_c, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int) > > -DEF_HELPER_0(lock, void) > -DEF_HELPER_0(unlock, void) > DEF_HELPER_3(write_eflags, void, env, tl, i32) > DEF_HELPER_1(read_eflags, tl, env) > DEF_HELPER_2(divb_AL, void, env, tl) > diff --git a/target-i386/mem_helper.c b/target-i386/mem_helper.c > index 8b72fda..f8d2a1e 100644 > --- a/target-i386/mem_helper.c > +++ b/target-i386/mem_helper.c > @@ -25,39 +25,6 @@ > #include "qemu/int128.h" > #include "tcg.h" > > -/* broken thread support */ > - > -#if defined(CONFIG_USER_ONLY) > -QemuMutex global_cpu_lock; > - > -void helper_lock(void) > -{ > - qemu_mutex_lock(&global_cpu_lock); > -} > - > -void helper_unlock(void) > -{ > - qemu_mutex_unlock(&global_cpu_lock); > -} > - > -void helper_lock_init(void) > -{ > - qemu_mutex_init(&global_cpu_lock); > -} > -#else > -void helper_lock(void) > -{ > -} > - > -void helper_unlock(void) > -{ > -} > - > -void helper_lock_init(void) > -{ > -} > -#endif > - > void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0) > { > uintptr_t ra = GETPC(); > diff --git a/target-i386/translate.c b/target-i386/translate.c > index b8c5dde..755a18f 100644 > --- a/target-i386/translate.c > +++ b/target-i386/translate.c > @@ -4537,10 +4537,6 @@ static target_ulong disas_insn(CPUX86State *env, > DisasContext *s, > s->aflag = aflag; > s->dflag = dflag; > > - /* lock generation */ > - if (prefixes & PREFIX_LOCK) > - gen_helper_lock(); > - > /* now check op code */ > reswitch: > switch(b) { > @@ -8203,20 +8199,11 @@ static target_ulong disas_insn(CPUX86State *env, > DisasContext *s, > default: > goto unknown_op; > } > - /* lock generation */ > - if (s->prefix & PREFIX_LOCK) > - gen_helper_unlock(); > return s->pc; > illegal_op: > - if (s->prefix & PREFIX_LOCK) > - gen_helper_unlock(); > - /* XXX: ensure that no lock was generated */ > gen_illegal_opcode(s); > return s->pc; > unknown_op: > - if (s->prefix & PREFIX_LOCK) > - gen_helper_unlock(); > - /* XXX: ensure that no lock was generated */ > gen_unknown_opcode(env, s); > return s->pc; > } > @@ -8308,8 +8295,6 @@ void tcg_x86_init(void) > offsetof(CPUX86State, bnd_regs[i].ub), > bnd_regu_names[i]); > } > - > - helper_lock_init(); > } > > /* generate intermediate code for basic block 'tb'. */
Reviewed-by: Alex Bennée <alex.ben...@linaro.org> I also looked over the rest of the target-i386 stuff and it seems fine to me but I'm not really an x86 expert. You can treat that as a r-b if you want if no one with more x86-fu want to look at it. -- Alex Bennée