Richard Henderson <r...@twiddle.net> writes: > From: "Emilio G. Cota" <c...@braap.org> > > With this microbenchmark we can measure the overhead of emulating atomic > instructions with a configurable degree of contention. > > The benchmark spawns $n threads, each performing $o atomic ops (additions) > in a loop. Each atomic operation is performed on a different cache line > (assuming lines are 64b long) that is randomly selected from a range [0, $r). > > [ Note: each $foo corresponds to a -foo flag ] > > Signed-off-by: Emilio G. Cota <c...@braap.org> > Signed-off-by: Richard Henderson <r...@twiddle.net> > Message-Id: <1467054136-10430-20-git-send-email-c...@braap.org> > --- > tests/.gitignore | 1 + > tests/Makefile.include | 4 +- > tests/atomic_add-bench.c | 180 > +++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 184 insertions(+), 1 deletion(-) > create mode 100644 tests/atomic_add-bench.c > > diff --git a/tests/.gitignore b/tests/.gitignore > index dbb5263..ec3137a 100644 > --- a/tests/.gitignore > +++ b/tests/.gitignore > @@ -1,3 +1,4 @@ > +atomic_add-bench > check-qdict > check-qfloat > check-qint > diff --git a/tests/Makefile.include b/tests/Makefile.include > index 14be491..e1957ed 100644 > --- a/tests/Makefile.include > +++ b/tests/Makefile.include > @@ -421,7 +421,8 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o > tests/check-qdict.o \ > tests/test-opts-visitor.o tests/test-qmp-event.o \ > tests/rcutorture.o tests/test-rcu-list.o \ > tests/test-qdist.o \ > - tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o > + tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \ > + tests/atomic_add-bench.o > > $(test-obj-y): QEMU_INCLUDES += -Itests > QEMU_CFLAGS += -I$(SRC_PATH)/tests > @@ -465,6 +466,7 @@ tests/test-qdist$(EXESUF): tests/test-qdist.o > $(test-util-obj-y) > tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y) > tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) > $(test-util-obj-y) > 
tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y) > +tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o > $(test-util-obj-y)
This probably more properly lives in tests/tcg/generic or some such, but that needs tcg/tests to be rehabilitated into the build system, so at least here it gets built. > > tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \ > hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\ > diff --git a/tests/atomic_add-bench.c b/tests/atomic_add-bench.c > new file mode 100644 > index 0000000..5bbecf6 > --- /dev/null > +++ b/tests/atomic_add-bench.c I wonder if it would be worth renaming this to atomic-bench and adding the other atomic operations to the benchmark? I know that, given the current helper overhead, it's unlikely to show much difference between the ops, but if we move to backend support for the tcg atomics it would be a useful tool to have. > @@ -0,0 +1,180 @@ > +#include "qemu/osdep.h" > +#include "qemu/thread.h" > +#include "qemu/host-utils.h" > +#include "qemu/processor.h" > + > +struct thread_info { > + uint64_t r; > +} QEMU_ALIGNED(64); > + > +struct count { > + unsigned long val; > +} QEMU_ALIGNED(64); > + > +static QemuThread *threads; > +static struct thread_info *th_info; > +static unsigned int n_threads = 1; > +static unsigned int n_ready_threads; > +static struct count *counts; > +static unsigned long n_ops = 10000; > +static double duration; > +static unsigned int range = 1; > +static bool test_start; > + > +static const char commands_string[] = > + " -n = number of threads\n" > + " -o = number of ops per thread\n" > + " -r = range (will be rounded up to pow2)"; > + > +static void usage_complete(char *argv[]) > +{ > + fprintf(stderr, "Usage: %s [options]\n", argv[0]); > + fprintf(stderr, "options:\n%s\n", commands_string); > +} > + > +/* > + * From: https://en.wikipedia.org/wiki/Xorshift > + * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only > + * guaranteed to be >= INT_MAX).
> + */ > +static uint64_t xorshift64star(uint64_t x) > +{ > + x ^= x >> 12; /* a */ > + x ^= x << 25; /* b */ > + x ^= x >> 27; /* c */ > + return x * UINT64_C(2685821657736338717); > +} > + > +static void *thread_func(void *arg) > +{ > + struct thread_info *info = arg; > + unsigned long i; > + > + atomic_inc(&n_ready_threads); > + while (!atomic_mb_read(&test_start)) { > + cpu_relax(); > + } > + > + for (i = 0; i < n_ops; i++) { > + unsigned int index; > + > + info->r = xorshift64star(info->r); > + index = info->r & (range - 1); > + atomic_inc(&counts[index].val); > + } > + return NULL; > +} > + > +static inline > +uint64_t ts_subtract(const struct timespec *a, const struct timespec *b) > +{ > + uint64_t ns; > + > + ns = (b->tv_sec - a->tv_sec) * 1000000000ULL; > + ns += (b->tv_nsec - a->tv_nsec); > + return ns; > +} > + > +static void run_test(void) > +{ > + unsigned int i; > + struct timespec ts_start, ts_end; > + > + while (atomic_read(&n_ready_threads) != n_threads) { > + cpu_relax(); > + } > + atomic_mb_set(&test_start, true); > + > + clock_gettime(CLOCK_MONOTONIC, &ts_start); > + for (i = 0; i < n_threads; i++) { > + qemu_thread_join(&threads[i]); > + } > + clock_gettime(CLOCK_MONOTONIC, &ts_end); > + duration = ts_subtract(&ts_start, &ts_end) / 1e9; > +} > + > +static void create_threads(void) > +{ > + unsigned int i; > + > + threads = g_new(QemuThread, n_threads); > + th_info = g_new(struct thread_info, n_threads); > + counts = qemu_memalign(64, sizeof(*counts) * range); This fails on my setup as AFAICT qemu_memalign doesn't give you zeroed memory. I added a memset after to zero it out. 
> + > + for (i = 0; i < n_threads; i++) { > + struct thread_info *info = &th_info[i]; > + > + info->r = (i + 1) ^ time(NULL); > + qemu_thread_create(&threads[i], NULL, thread_func, info, > + QEMU_THREAD_JOINABLE); > + } > +} > + > +static void pr_params(void) > +{ > + printf("Parameters:\n"); > + printf(" # of threads: %u\n", n_threads); > + printf(" n_ops: %lu\n", n_ops); > + printf(" ops' range: %u\n", range); > +} > + > +static void pr_stats(void) > +{ > + unsigned long long val = 0; > + unsigned int i; > + double tx; > + > + for (i = 0; i < range; i++) { > + val += counts[i].val; > + } > + assert(val == n_threads * n_ops); Again, while I was testing, this failed due to the above. It would probably also be worth reporting the failure condition for the test, so my current hacky patch looks like: modified tests/atomic_add-bench.c @@ -100,6 +100,7 @@ static void create_threads(void) threads = g_new(QemuThread, n_threads); th_info = g_new(struct thread_info, n_threads); counts = qemu_memalign(64, sizeof(*counts) * range); + memset(counts, 0, sizeof(*counts) * range); for (i = 0; i < n_threads; i++) { struct thread_info *info = &th_info[i]; @@ -118,22 +119,29 @@ static void pr_params(void) printf(" ops' range: %u\n", range); } -static void pr_stats(void) +static int pr_stats(void) { - unsigned long long val = 0; + unsigned long long target_val, val = 0; unsigned int i; double tx; for (i = 0; i < range; i++) { val += counts[i].val; } - assert(val == n_threads * n_ops); + + target_val = (n_threads * n_ops); + if (val != target_val) { + printf("Bad total: %llu vs %llu\n", val, target_val); + return -1; + }; tx = val / duration / 1e6; printf("Results:\n"); printf("Duration: %.2f s\n", duration); printf(" Throughput: %.2f Mops/s\n", tx); printf(" Throughput/thread: %.2f Mops/s/thread\n", tx / n_threads); + + return 0; } static void parse_args(int argc, char *argv[]) @@ -175,6 +183,5 @@ int main(int argc, char *argv[]) pr_params(); create_threads(); run_test(); - pr_stats(); -
return 0; + return pr_stats(); } -- Alex Bennée