> From: Andrew Rybchenko [mailto:[email protected]]
> Sent: Tuesday, 17 February 2026 07.37
>
> On 2/16/26 6:23 PM, Morten Brørup wrote:
> > De-inline unlikely code paths, for smaller footprint.
>
> The idea is interesting and makes sense to me. But could you share
> performance figures, so we know the impact?
>
> >
> > Signed-off-by: Morten Brørup <[email protected]>
> > ---
> > v3:
> > * New functions are called from inline code, so make them
> >   experimental instead of internal.
> > v2:
> > * Removed review functions.
> > * Changed #if 0 to #if AVOID_RTE_MEMCPY.
> > ---
> >  lib/mempool/rte_mempool.c | 112 ++++++++++++++++++++
> >  lib/mempool/rte_mempool.h | 212 ++++++++++++++++++++------------------
> >  2 files changed, 223 insertions(+), 101 deletions(-)
> >
> > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > index 3042d94c14..30dce3a2fd 100644
> > --- a/lib/mempool/rte_mempool.c
> > +++ b/lib/mempool/rte_mempool.c
> > @@ -1016,6 +1016,118 @@ rte_mempool_create(const char *name, unsigned n, unsigned elt_size,
> >  	return NULL;
> >  }
> >
> > +/* internal */
> > +RTE_EXPORT_EXPERIMENTAL_SYMBOL(_rte_mempool_do_generic_put_more, 26.03)
> > +void
> > +_rte_mempool_do_generic_put_more(struct rte_mempool *mp, void * const *obj_table,
> > +	unsigned int n, struct rte_mempool_cache *cache)
> > +{
>
> I'd add comments explaining why stats are not updated by this
> function. It is a drawback of the solution, so at the very least
> comments should be added to make it clear. The stats update would be
> very easy to lose in future changes.
>
> > +	__rte_assume(cache->flushthresh <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> > +	__rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> > +	__rte_assume(cache->len <= cache->flushthresh);
> > +	__rte_assume(cache->len + n > cache->flushthresh);
> > +	if (likely(n <= cache->flushthresh)) {
> > +		uint32_t len;
> > +		void **cache_objs;
> > +
> > +		/*
> > +		 * The cache is big enough for the objects, but - as detected by
> > +		 * rte_mempool_do_generic_put() - has insufficient room for them.
> > +		 * Flush the cache to make room for the objects.
> > +		 */
> > +		len = cache->len;
> > +		cache_objs = &cache->objs[0];
> > +		cache->len = n;
> > +		rte_mempool_ops_enqueue_bulk(mp, cache_objs, len);
> > +
> > +		/* Add the objects to the cache. */
> > +#ifdef AVOID_RTE_MEMCPY /* Simple alternative to rte_memcpy(). */
>
> I'd not mix the introduction of AVOID_RTE_MEMCPY with the other goals
> of the patch. If AVOID_RTE_MEMCPY is really useful, it could be added
> separately and appropriately motivated.
>
> > +		for (uint32_t index = 0; index < n; index++)
> > +			*cache_objs++ = *obj_table++;
> > +#else
> > +		rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
> > +#endif
> > +
> > +		return;
> > +	}
> > +
> > +	/* The request itself is too big for the cache. Push objects directly to the backend. */
> > +	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
> > +}
> > +
> > +/* internal */
> > +RTE_EXPORT_EXPERIMENTAL_SYMBOL(_rte_mempool_do_generic_get_more, 26.03)
> > +int
> > +_rte_mempool_do_generic_get_more(struct rte_mempool *mp, void **obj_table,
> > +	unsigned int n, struct rte_mempool_cache *cache)
> > +{
> > +	int ret;
> > +	unsigned int remaining;
> > +	uint32_t index, len;
> > +	void **cache_objs;
> > +
> > +	/* Use the cache as much as we have to return hot objects first. */
> > +	__rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> > +	len = cache->len;
> > +	remaining = n - len;
> > +	cache_objs = &cache->objs[len];
> > +	cache->len = 0;
> > +	for (index = 0; index < len; index++)
> > +		*obj_table++ = *--cache_objs;
> > +
> > +	/* Dequeue below would overflow mem allocated for cache? */
> > +	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
> > +		goto driver_dequeue;
> > +
> > +	/* Fill the cache from the backend; fetch size + remaining objects. */
> > +	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
> > +			cache->size + remaining);
> > +	if (unlikely(ret < 0)) {
> > +		/*
> > +		 * We are buffer constrained, and not able to fetch all that.
> > +		 * Do not fill the cache, just satisfy the remaining part of
> > +		 * the request directly from the backend.
> > +		 */
> > +		goto driver_dequeue;
> > +	}
> > +
> > +	/* Satisfy the remaining part of the request from the filled cache. */
> > +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> > +	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> > +
> > +	__rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> > +	__rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> > +	cache_objs = &cache->objs[cache->size + remaining];
> > +	cache->len = cache->size;
> > +	for (index = 0; index < remaining; index++)
> > +		*obj_table++ = *--cache_objs;
> > +
> > +	return 0;
> > +
> > +driver_dequeue:
> > +
> > +	/* Get remaining objects directly from the backend. */
> > +	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
> > +
> > +	if (unlikely(ret < 0)) {
> > +		cache->len = n - remaining;
> > +		/*
> > +		 * No further action is required to roll the first part
> > +		 * of the request back into the cache, as objects in
> > +		 * the cache are intact.
> > +		 */
> > +
> > +		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> > +		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
> > +	} else {
> > +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> > +		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> > +		__rte_assume(ret == 0);
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> >  /* Return the number of entries in the mempool */
> >  RTE_EXPORT_SYMBOL(rte_mempool_avail_count)
> >  unsigned int
> > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > index 7989d7a475..c6df285194 100644
> > --- a/lib/mempool/rte_mempool.h
> > +++ b/lib/mempool/rte_mempool.h
> > @@ -1370,8 +1370,31 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
> >  	cache->len = 0;
> >  }
> >
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * @internal
> > + * Put several objects back in the mempool, more than the cache has room for; used internally.
> > + *
> > + * @param mp
> > + *   A pointer to the mempool structure.
> > + * @param obj_table
> > + *   A pointer to a table of void * pointers (objects).
> > + * @param n
> > + *   The number of objects to store back in the mempool, must be
> > + *   strictly positive.
> > + * @param cache
> > + *   A pointer to a mempool cache structure.
> > + */
> > +__rte_experimental
> > +void
> > +_rte_mempool_do_generic_put_more(struct rte_mempool *mp, void * const *obj_table,
> > +	unsigned int n, struct rte_mempool_cache *cache);
> > +
> >  /**
> >   * @internal Put several objects back in the mempool; used internally.
> > + *
> >   * @param mp
> >   *   A pointer to the mempool structure.
> >   * @param obj_table
> > @@ -1388,9 +1411,16 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> >  {
> >  	void **cache_objs;
> >
> > -	/* No cache provided? */
> > -	if (unlikely(cache == NULL))
> > -		goto driver_enqueue;
> > +	if (unlikely(cache == NULL)) {
>
> The patch summary says this is about de-inlining unlikely code, but
> you still have it inlined here. Maybe it would be more consistent to
> handle this case in the de-inlined code as well.
>
> > +		/* No cache. Push objects directly to the backend. */
> > +		/* Increment stats now, adding in mempool always succeeds. */
> > +		RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> > +		RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> > +
> > +		rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
> > +
> > +		return;
> > +	}
> >
> >  	/* Increment stats now, adding in mempool always succeeds. */
> >  	RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_bulk, 1);
> > @@ -1403,35 +1433,43 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> >  		/* Sufficient room in the cache for the objects. */
> >  		cache_objs = &cache->objs[cache->len];
> >  		cache->len += n;
> > -	} else if (n <= cache->flushthresh) {
> > +
> > +cache_enqueue:
> > +#ifdef AVOID_RTE_MEMCPY /* Simple alternative to rte_memcpy(). */
> >  		/*
> > -		 * The cache is big enough for the objects, but - as detected by
> > -		 * the comparison above - has insufficient room for them.
> > -		 * Flush the cache to make room for the objects.
> > +		 * Add the objects to the cache.
> > +		 * If the request size is known at build time,
> > +		 * the compiler unrolls the fixed length copy loop.
> >  		 */
> > -		cache_objs = &cache->objs[0];
> > -		rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);
> > -		cache->len = n;
> > -	} else {
> > -		/* The request itself is too big for the cache. */
> > -		goto driver_enqueue_stats_incremented;
> > -	}
> > -
> > -	/* Add the objects to the cache. */
> > -	rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
> > +		for (uint32_t index = 0; index < n; index++)
> > +			*cache_objs++ = *obj_table++;
> > +#else
> > +		/* Add the objects to the cache. */
> > +		rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);
> > +#endif
> >
> > -	return;
> > +		return;
> > +	}
> >
> > -driver_enqueue:
> > +	if (__rte_constant(n) && likely(n <= cache->flushthresh)) {
> > +		uint32_t len;
> >
> > -	/* increment stat now, adding in mempool always success */
> > -	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> > -	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> > +		/*
> > +		 * The cache is big enough for the objects, but - as detected
> > +		 * above - has insufficient room for them.
> > +		 * Flush the cache to make room for the objects.
> > +		 */
> > +		len = cache->len;
> > +		cache_objs = &cache->objs[0];
> > +		cache->len = n;
> > +		rte_mempool_ops_enqueue_bulk(mp, cache_objs, len);
> >
> > -driver_enqueue_stats_incremented:
> > +		/* Add the objects to the cache. */
> > +		goto cache_enqueue;
> > +	}
> >
> > -	/* push objects to the backend */
> > -	rte_mempool_ops_enqueue_bulk(mp, obj_table, n);
> > +	/* Insufficient room in the cache for the objects. */
> > +	_rte_mempool_do_generic_put_more(mp, obj_table, n, cache);
> >  }
> >
> >
> > @@ -1498,8 +1536,33 @@ rte_mempool_put(struct rte_mempool *mp, void *obj)
> >  	rte_mempool_put_bulk(mp, &obj, 1);
> >  }
> >
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice.
> > + *
> > + * @internal
> > + * Get several objects from the mempool, more than held in the cache; used internally.
> > + *
> > + * @param mp
> > + *   A pointer to the mempool structure.
> > + * @param obj_table
> > + *   A pointer to a table of void * pointers (objects).
> > + * @param n
> > + *   The number of objects to get, must be strictly positive.
> > + * @param cache
> > + *   A pointer to a mempool cache structure.
> > + * @return
> > + *   - 0: Success.
> > + *   - <0: Error; code of driver dequeue function.
> > + */
> > +__rte_experimental
> > +int
> > +_rte_mempool_do_generic_get_more(struct rte_mempool *mp, void **obj_table,
> > +	unsigned int n, struct rte_mempool_cache *cache);
> > +
> >  /**
> >   * @internal Get several objects from the mempool; used internally.
> > + *
> >   * @param mp
> >   *   A pointer to the mempool structure.
> >   * @param obj_table
> > @@ -1516,26 +1579,36 @@ static __rte_always_inline int
> >  rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> >  		unsigned int n, struct rte_mempool_cache *cache)
> >  {
> > -	int ret;
> > -	unsigned int remaining;
> > -	uint32_t index, len;
> > -	void **cache_objs;
> > -
> > -	/* No cache provided? */
> >  	if (unlikely(cache == NULL)) {
>
> The patch summary says this is about de-inlining unlikely code, but
> you still have it inlined here.
>
> > -		remaining = n;
> > -		goto driver_dequeue;
> > -	}
> > +		int ret;
> > +
> > +		/* No cache. Get objects directly from the backend. */
> > +		ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, n);
> > +
> > +		if (unlikely(ret < 0)) {
> > +			RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> > +			RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
> > +		} else {
> > +			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> > +			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> > +			__rte_assume(ret == 0);
> > +		}
> >
> > -	/* The cache is a stack, so copy will be in reverse order. */
> > -	cache_objs = &cache->objs[cache->len];
> > +		return ret;
> > +	}
> >
> >  	__rte_assume(cache->len <= RTE_MEMPOOL_CACHE_MAX_SIZE * 2);
> >  	if (likely(n <= cache->len)) {
> > +		uint32_t index;
> > +		void **cache_objs;
> > +
> >  		/* The entire request can be satisfied from the cache. */
> >  		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> >  		RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> >
> > +		/* The cache is a stack, so copy will be in reverse order. */
> > +		cache_objs = &cache->objs[cache->len];
> > +
> >  		/*
> >  		 * If the request size is known at build time,
> >  		 * the compiler unrolls the fixed length copy loop.
> > @@ -1547,71 +1620,8 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> >  		return 0;
> >  	}
> >
> > -	/* Use the cache as much as we have to return hot objects first. */
> > -	len = cache->len;
> > -	remaining = n - len;
> > -	cache->len = 0;
> > -	for (index = 0; index < len; index++)
> > -		*obj_table++ = *--cache_objs;
> > -
> > -	/* Dequeue below would overflow mem allocated for cache? */
> > -	if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))
> > -		goto driver_dequeue;
> > -
> > -	/* Fill the cache from the backend; fetch size + remaining objects. */
> > -	ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,
> > -			cache->size + remaining);
> > -	if (unlikely(ret < 0)) {
> > -		/*
> > -		 * We are buffer constrained, and not able to fetch all that.
> > -		 * Do not fill the cache, just satisfy the remaining part of
> > -		 * the request directly from the backend.
> > -		 */
> > -		goto driver_dequeue;
> > -	}
> > -
> > -	/* Satisfy the remaining part of the request from the filled cache. */
> > -	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> > -	RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> > -
> > -	__rte_assume(cache->size <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> > -	__rte_assume(remaining <= RTE_MEMPOOL_CACHE_MAX_SIZE);
> > -	cache_objs = &cache->objs[cache->size + remaining];
> > -	cache->len = cache->size;
> > -	for (index = 0; index < remaining; index++)
> > -		*obj_table++ = *--cache_objs;
> > -
> > -	return 0;
> > -
> > -driver_dequeue:
> > -
> > -	/* Get remaining objects directly from the backend. */
> > -	ret = rte_mempool_ops_dequeue_bulk(mp, obj_table, remaining);
> > -
> > -	if (unlikely(ret < 0)) {
> > -		if (likely(cache != NULL)) {
> > -			cache->len = n - remaining;
> > -			/*
> > -			 * No further action is required to roll the first part
> > -			 * of the request back into the cache, as objects in
> > -			 * the cache are intact.
> > -			 */
> > -		}
> > -
> > -		RTE_MEMPOOL_STAT_ADD(mp, get_fail_bulk, 1);
> > -		RTE_MEMPOOL_STAT_ADD(mp, get_fail_objs, n);
> > -	} else {
> > -		if (likely(cache != NULL)) {
> > -			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);
> > -			RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);
> > -		} else {
> > -			RTE_MEMPOOL_STAT_ADD(mp, get_success_bulk, 1);
> > -			RTE_MEMPOOL_STAT_ADD(mp, get_success_objs, n);
> > -		}
> > -		__rte_assume(ret == 0);
> > -	}
> > -
> > -	return ret;
> > +	/* The entire request cannot be satisfied from the cache. */
> > +	return _rte_mempool_do_generic_get_more(mp, obj_table, n, cache);
> >  }
> >
> >  /**
Good feedback, Andrew. I'll mark the patch as changes requested, and
follow up with a new version later.
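
For the stats point, a comment along these lines at the top of
_rte_mempool_do_generic_put_more() could capture the intent (a rough
sketch for the next version, wording not final):

	/*
	 * Note: This function deliberately does not update the put
	 * statistics. The inline caller, rte_mempool_do_generic_put(),
	 * has already incremented put_bulk and put_objs before branching
	 * here, so updating them again would count the operation twice.
	 */

-Morten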

