>Replace the per-object rte_node_enqueue_x1() calls with batched
>rte_node_enqueue() calls. The function now tracks runs of consecutive
>objects going to the same edge and flushes them in bulk.
>
>When all objects go to the same edge and come from the node's own
>buffer (objs == node->objs), use rte_node_next_stream_move() which
>swaps pointers instead of copying.
>
>Signed-off-by: Robin Jarry <[email protected]>
>---
> lib/graph/rte_graph_worker_common.h | 15 +++++++++++++--
> 1 file changed, 13 insertions(+), 2 deletions(-)
>
>diff --git a/lib/graph/rte_graph_worker_common.h 
>b/lib/graph/rte_graph_worker_common.h
>index 4ab53a533e4c..7fda67c07169 100644
>--- a/lib/graph/rte_graph_worker_common.h
>+++ b/lib/graph/rte_graph_worker_common.h
>@@ -432,10 +432,21 @@ static inline void
> rte_node_enqueue_next(struct rte_graph *graph, struct rte_node *node,
>                      rte_edge_t *nexts, void **objs, uint16_t nb_objs)
> {
>+       rte_edge_t last = nexts[0];
>+       uint16_t run_start = 0;
>        uint16_t i;
>
>-       for (i = 0; i < nb_objs; i++)
>-               rte_node_enqueue_x1(graph, node, nexts[i], objs[i]);
>+       for (i = 1; i < nb_objs; i++) {
>+               if (nexts[i] != last) {

We could probably use SIMD for the comparison here; a 128-bit vector would process 8 nexts at a time.
It might not be worth it, though, if the nexts values are mostly distinct.

>+                       rte_node_enqueue(graph, node, last, &objs[run_start], 
>i - run_start);
>+                       run_start = i;
>+                       last = nexts[i];
>+               }
>+       }
>+       if (run_start == 0 && objs == node->objs)
>+               rte_node_next_stream_move(graph, node, last);
>+       else
>+               rte_node_enqueue(graph, node, last, &objs[run_start], nb_objs 
>- run_start);
> }
>
> /**
--
2.52.0


Reply via email to