On Mon, Jun 22, 2026 at 12:11 AM Morten Brørup <[email protected]>
wrote:
>
> Added graph node profiling stats, build time configurable by enabling
> RTE_GRAPH_PROFILE in rte_config.h.
>
> Signed-off-by: Morten Brørup <[email protected]>
> ---
> v5:
> * Added stats for a half burst and a full burst.
> v4:
> * Added documentation. (AI)
> * Added more comments. (AI)
> * Improved dump. (AI)
> * Debug shows both cycles/call and cycles/obj.
> v3:
> * Debug shows cycles/obj instead of cycles/call.
> * Fixed missing --in-reply-to.
> v2:
> * Fixed indentation.
> ---
> config/rte_config.h | 1 +
> doc/guides/prog_guide/graph_lib.rst | 1 +
> lib/graph/graph_debug.c | 57 ++++++++++++++++++++++++++++-
> lib/graph/node.c | 2 +
> lib/graph/rte_graph_worker_common.h | 33 +++++++++++++++--
Please update app/test/test_graph.c to validate this feature.
> 5 files changed, 90 insertions(+), 4 deletions(-)
>
> diff --git a/config/rte_config.h b/config/rte_config.h
> index 0447cdf2ad..1942c1b1ec 100644
> --- a/config/rte_config.h
> +++ b/config/rte_config.h
> @@ -106,6 +106,7 @@
> /* rte_graph defines */
> #define RTE_GRAPH_BURST_SIZE 256
> #define RTE_LIBRTE_GRAPH_STATS 1
> +/* RTE_GRAPH_PROFILE is not set */
>
> /****** driver defines ********/
>
> diff --git a/doc/guides/prog_guide/graph_lib.rst
> b/doc/guides/prog_guide/graph_lib.rst
> index 8dd49c19d2..bc36042296 100644
> --- a/doc/guides/prog_guide/graph_lib.rst
> +++ b/doc/guides/prog_guide/graph_lib.rst
> @@ -49,6 +49,7 @@ Performance tuning parameters
> RTE_GRAPH_BURST_SIZE config option.
> The testing shows, on x86 and arm64 servers, The sweet spot is 256 burst
> size. While on arm64 embedded SoCs, it is either 64 or 128.
> +- Enable the ``RTE_GRAPH_PROFILE`` config option for more profiling details.
> - Disable node statistics (using ``RTE_LIBRTE_GRAPH_STATS`` config option)
> if not needed.
>
> diff --git a/lib/graph/graph_debug.c b/lib/graph/graph_debug.c
> index e3b8cccdc1..7c97e23748 100644
> --- a/lib/graph/graph_debug.c
> +++ b/lib/graph/graph_debug.c
> @@ -92,7 +92,62 @@ rte_graph_obj_dump(FILE *f, struct rte_graph *g, bool all)
> fprintf(f, " total_sched_fail=%" PRId64 "\n",
> n->dispatch.total_sched_fail);
> }
> - fprintf(f, " total_calls=%" PRId64 "\n",
> n->total_calls);
> + fprintf(f, " total_calls=%" PRIu64 "\n",
> n->total_calls);
> + if (rte_graph_has_stats_feature()) {
> + fprintf(f, " total_cycles=%" PRIu64 ", avg
> cycles/call=%.1f\n",
> + n->total_cycles,
> + n->total_calls == 0 ? (double)0 :
> + (double)n->total_cycles /
> (double)n->total_calls);
> + }
> +#ifdef RTE_GRAPH_PROFILE
Please introduce rte_graph_has_profile_feature(), similar to
rte_graph_has_stats_feature(), to reduce #ifdef clutter as much as possible.
> + uint64_t calls = n->usage_stats[0].calls;
> + fprintf(f, " objs[0]\n");
> + fprintf(f, " calls=%" PRIu64 ", cycles=%" PRIu64 ",
> avg cycles/call=%.1f\n",
> + calls,
>
> diff --git a/lib/graph/rte_graph_worker_common.h
> b/lib/graph/rte_graph_worker_common.h
> index 4ab53a533e..0d8039575d 100644
> --- a/lib/graph/rte_graph_worker_common.h
> +++ b/lib/graph/rte_graph_worker_common.h
> @@ -144,12 +144,26 @@ struct __rte_cache_aligned rte_node {
> rte_node_process_t process; /**< Process function. */
> uint64_t process_u64;
> };
> + /** Fast path area cache line 3. */
> +#ifdef RTE_GRAPH_PROFILE
> + struct {
> + uint64_t calls; /**< Calls processing resp. 0 or
> 1 objects. */
> + uint64_t cycles; /**< Cycles spent processing
> resp. 0 or 1 objects. */
> + } usage_stats[2]; /**< Usage when this node processed 0
> or 1 objects. */
> + uint64_t full_burst_calls; /**< Calls processing a full
> burst of objects. */
> + uint64_t full_burst_cycles; /**< Cycles spent processing a
> full burst of objects. */
> + uint64_t half_burst_calls; /**< Calls processing a half
> burst of objects. */
> + uint64_t half_burst_cycles; /**< Cycles spent processing a
> half burst of objects. */
> + /** Fast path area cache line 4. */
> +#endif
Does this break the ABI?
> alignas(RTE_CACHE_LINE_MIN_SIZE) struct rte_node *nodes[];
> /**< Next nodes. */
> };
> };
>