The current implementation uses '__sync' built-ins to synchronize
statistics within worker threads. '__sync' built-in functions are
full barriers, which hurts performance, so add per-worker packet
statistics instead.

Per-worker statistics output is enabled with the --insight-worker option.

For example:
sudo examples/packet_ordering/arm64-armv8a-linuxapp-gcc/packet_ordering \
-l 112-115 --socket-mem=1024,1024 -n 4 -- -p 0x03 --insight-worker

RX thread stats:
 - Pkts rxd:                            226539223
 - Pkts enqd to workers ring:           226539223

Worker thread stats on core [113]:
 - Pkts deqd from workers ring:         77557888
 - Pkts enqd to tx ring:                77557888
 - Pkts enq to tx failed:               0

Worker thread stats on core [114]:
 - Pkts deqd from workers ring:         148981335
 - Pkts enqd to tx ring:                148981335
 - Pkts enq to tx failed:               0

Worker thread stats:
 - Pkts deqd from workers ring:         226539223
 - Pkts enqd to tx ring:                226539223
 - Pkts enq to tx failed:               0

TX stats:
 - Pkts deqd from tx ring:              226539223
 - Ro Pkts transmitted:                 226539168
 - Ro Pkts tx failed:                   0
 - Pkts transmitted w/o reorder:        0
 - Pkts tx failed w/o reorder:          0

Suggested-by: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>
Signed-off-by: Phil Yang <phil.y...@arm.com>
Reviewed-by: Gavin Hu <gavin...@arm.com>
---
 doc/guides/sample_app_ug/packet_ordering.rst |  4 ++-
 examples/packet_ordering/main.c              | 50 +++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/doc/guides/sample_app_ug/packet_ordering.rst 
b/doc/guides/sample_app_ug/packet_ordering.rst
index 7cfcf3f..9c5ceea 100644
--- a/doc/guides/sample_app_ug/packet_ordering.rst
+++ b/doc/guides/sample_app_ug/packet_ordering.rst
@@ -43,7 +43,7 @@ The application execution command line is:
 
 .. code-block:: console
 
-    ./test-pipeline [EAL options] -- -p PORTMASK [--disable-reorder]
+    ./packet_ordering [EAL options] -- -p PORTMASK [--disable-reorder] 
[--insight-worker]
 
 The -c EAL CPU_COREMASK option has to contain at least 3 CPU cores.
 The first CPU core in the core mask is the master core and would be assigned to
@@ -56,3 +56,5 @@ then the other pair from 2 to 3 and from 3 to 2, having [0,1] 
and [2,3] pairs.
 
 The disable-reorder long option does, as its name implies, disable the 
reordering
 of traffic, which should help evaluate reordering performance impact.
+
+The insight-worker long option enables output of the packet statistics of each 
worker thread.
diff --git a/examples/packet_ordering/main.c b/examples/packet_ordering/main.c
index 149bfdd..8145074 100644
--- a/examples/packet_ordering/main.c
+++ b/examples/packet_ordering/main.c
@@ -31,6 +31,7 @@
 
 unsigned int portmask;
 unsigned int disable_reorder;
+unsigned int insight_worker;
 volatile uint8_t quit_signal;
 
 static struct rte_mempool *mbuf_pool;
@@ -71,6 +72,14 @@ volatile struct app_stats {
        } tx __rte_cache_aligned;
 } app_stats;
 
+/* per worker lcore stats */
+struct wkr_stats_per {
+               uint64_t deq_pkts;
+               uint64_t enq_pkts;
+               uint64_t enq_failed_pkts;
+} __rte_cache_aligned;
+
+static struct wkr_stats_per wkr_stats[RTE_MAX_LCORE] = {0};
 /**
  * Get the last enabled lcore ID
  *
@@ -152,6 +161,7 @@ parse_args(int argc, char **argv)
        char *prgname = argv[0];
        static struct option lgopts[] = {
                {"disable-reorder", 0, 0, 0},
+               {"insight-worker", 0, 0, 0},
                {NULL, 0, 0, 0}
        };
 
@@ -175,6 +185,11 @@ parse_args(int argc, char **argv)
                                printf("reorder disabled\n");
                                disable_reorder = 1;
                        }
+                       if (!strcmp(lgopts[option_index].name,
+                                               "insight-worker")) {
+                               printf("print all worker statistics\n");
+                               insight_worker = 1;
+                       }
                        break;
                default:
                        print_usage(prgname);
@@ -319,6 +334,11 @@ print_stats(void)
 {
        uint16_t i;
        struct rte_eth_stats eth_stats;
+       unsigned int lcore_id, last_lcore_id, master_lcore_id, end_w_lcore_id;
+
+       last_lcore_id   = get_last_lcore_id();
+       master_lcore_id = rte_get_master_lcore();
+       end_w_lcore_id  = get_previous_lcore_id(last_lcore_id);
 
        printf("\nRX thread stats:\n");
        printf(" - Pkts rxd:                            %"PRIu64"\n",
@@ -326,6 +346,26 @@ print_stats(void)
        printf(" - Pkts enqd to workers ring:           %"PRIu64"\n",
                                                app_stats.rx.enqueue_pkts);
 
+       for (lcore_id = 0; lcore_id <= end_w_lcore_id; lcore_id++) {
+               if (insight_worker
+                       && rte_lcore_is_enabled(lcore_id)
+                       && lcore_id != master_lcore_id) {
+                       printf("\nWorker thread stats on core [%u]:\n",
+                                       lcore_id);
+                       printf(" - Pkts deqd from workers ring:         
%"PRIu64"\n",
+                                       wkr_stats[lcore_id].deq_pkts);
+                       printf(" - Pkts enqd to tx ring:                
%"PRIu64"\n",
+                                       wkr_stats[lcore_id].enq_pkts);
+                       printf(" - Pkts enq to tx failed:               
%"PRIu64"\n",
+                                       wkr_stats[lcore_id].enq_failed_pkts);
+               }
+
+               app_stats.wkr.dequeue_pkts += wkr_stats[lcore_id].deq_pkts;
+               app_stats.wkr.enqueue_pkts += wkr_stats[lcore_id].enq_pkts;
+               app_stats.wkr.enqueue_failed_pkts +=
+                       wkr_stats[lcore_id].enq_failed_pkts;
+       }
+
        printf("\nWorker thread stats:\n");
        printf(" - Pkts deqd from workers ring:         %"PRIu64"\n",
                                                app_stats.wkr.dequeue_pkts);
@@ -432,13 +472,14 @@ worker_thread(void *args_ptr)
        struct rte_mbuf *burst_buffer[MAX_PKTS_BURST] = { NULL };
        struct rte_ring *ring_in, *ring_out;
        const unsigned xor_val = (nb_ports > 1);
+       unsigned int core_id = rte_lcore_id();
 
        args = (struct worker_thread_args *) args_ptr;
        ring_in  = args->ring_in;
        ring_out = args->ring_out;
 
        RTE_LOG(INFO, REORDERAPP, "%s() started on lcore %u\n", __func__,
-                                                       rte_lcore_id());
+                                                       core_id);
 
        while (!quit_signal) {
 
@@ -448,7 +489,7 @@ worker_thread(void *args_ptr)
                if (unlikely(burst_size == 0))
                        continue;
 
-               __sync_fetch_and_add(&app_stats.wkr.dequeue_pkts, burst_size);
+               wkr_stats[core_id].deq_pkts += burst_size;
 
                /* just do some operation on mbuf */
                for (i = 0; i < burst_size;)
@@ -457,11 +498,10 @@ worker_thread(void *args_ptr)
                /* enqueue the modified mbufs to workers_to_tx ring */
                ret = rte_ring_enqueue_burst(ring_out, (void *)burst_buffer,
                                burst_size, NULL);
-               __sync_fetch_and_add(&app_stats.wkr.enqueue_pkts, ret);
+               wkr_stats[core_id].enq_pkts += ret;
                if (unlikely(ret < burst_size)) {
                        /* Return the mbufs to their respective pool, dropping 
packets */
-                       __sync_fetch_and_add(&app_stats.wkr.enqueue_failed_pkts,
-                                       (int)burst_size - ret);
+                       wkr_stats[core_id].enq_failed_pkts += burst_size - ret;
                        pktmbuf_free_bulk(&burst_buffer[ret], burst_size - ret);
                }
        }
-- 
2.7.4

Reply via email to