'__sync' built-in functions are deprecated, should use the '__atomic' built-in instead. the sync built-in functions are full barriers, while atomic built-in functions offer less restrictive one-way barriers, which help performance.
Here is the example test result on TX2: sudo ./arm64-armv8a-linuxapp-gcc/app/test -l 112-139 \ -n 4 --socket-mem=1024,1024 -- -i RTE>>distributor_perf_autotest *** distributor_perf_autotest without this patch *** ==== Cache line switch test === Time for 33554432 iterations = 1519202730 ticks Ticks per iteration = 45 *** distributor_perf_autotest with this patch *** ==== Cache line switch test === Time for 33554432 iterations = 1251715496 ticks Ticks per iteration = 37 Less ticks needed for the cache line switch test. It got 17% of performance improvement. Signed-off-by: Phil Yang <phil.y...@arm.com> Reviewed-by: Gavin Hu <gavin...@arm.com> Reviewed-by: Ruifeng Wang <ruifeng.w...@arm.com> Reviewed-by: Joyce Kong <joyce.k...@arm.com> Reviewed-by: Dharmik Thakkar <dharmik.thak...@arm.com> --- app/test/test_distributor.c | 18 +++++++++++++++++- app/test/test_distributor_perf.c | 7 ++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/app/test/test_distributor.c b/app/test/test_distributor.c index 98919ec..ddab08d 100644 --- a/app/test/test_distributor.c +++ b/app/test/test_distributor.c @@ -62,9 +62,14 @@ handle_work(void *arg) struct worker_params *wp = arg; struct rte_distributor *db = wp->dist; unsigned int count = 0, num = 0; - unsigned int id = __sync_fetch_and_add(&worker_idx, 1); int i; +#ifdef RTE_USE_C11_MEM_MODEL + unsigned int id = __atomic_fetch_add(&worker_idx, 1, __ATOMIC_RELAXED); +#else + unsigned int id = __sync_fetch_and_add(&worker_idx, 1); +#endif + for (i = 0; i < 8; i++) buf[i] = NULL; num = rte_distributor_get_pkt(db, id, buf, buf, num); @@ -270,7 +275,12 @@ handle_work_with_free_mbufs(void *arg) unsigned int count = 0; unsigned int i; unsigned int num = 0; + +#ifdef RTE_USE_C11_MEM_MODEL + unsigned int id = __atomic_fetch_add(&worker_idx, 1, __ATOMIC_RELAXED); +#else unsigned int id = __sync_fetch_and_add(&worker_idx, 1); +#endif for (i = 0; i < 8; i++) buf[i] = NULL; @@ -343,7 +353,13 @@ handle_work_for_shutdown_test(void *arg) unsigned int total = 0; unsigned int i; unsigned int returned = 0; + +#ifdef RTE_USE_C11_MEM_MODEL + const unsigned int id = __atomic_fetch_add(&worker_idx, 1, + __ATOMIC_RELAXED); +#else const unsigned int id = __sync_fetch_and_add(&worker_idx, 1); +#endif num = rte_distributor_get_pkt(d, id, buf, buf, num); diff --git a/app/test/test_distributor_perf.c b/app/test/test_distributor_perf.c index edf1998..9367460 100644 --- a/app/test/test_distributor_perf.c +++ b/app/test/test_distributor_perf.c @@ -111,9 +111,14 @@ handle_work(void *arg) unsigned int count = 0; unsigned int num = 0; int i; - unsigned int id = __sync_fetch_and_add(&worker_idx, 1); struct rte_mbuf *buf[8] __rte_cache_aligned; +#ifdef RTE_USE_C11_MEM_MODEL + unsigned int id = __atomic_fetch_add(&worker_idx, 1, __ATOMIC_RELAXED); +#else + unsigned int id = __sync_fetch_and_add(&worker_idx, 1); +#endif + for (i = 0; i < 8; i++) buf[i] = NULL; -- 2.7.4