I haven't looked into the details yet, but I have a quick two-part question inline below.
> @@ -345,12 +345,20 @@ ci_txq_release_all_mbufs(struct ci_tx_queue *txq, bool use_ctx)
> return;
>
> if (!txq->vector_tx) {
> - for (uint16_t i = 0; i < txq->nb_tx_desc; i++) {
> - if (txq->sw_ring[i].mbuf != NULL) {
You changed this loop to operate only on not-yet-cleaned descriptors.
Here comes the first part of my question:
You removed the NULL check for txq->sw_ring[i].mbuf, thereby assuming that it
is never NULL for not-yet-cleaned descriptors.
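If NULL entries are in fact possible in that range (the removed check suggests
the original author expected them), I would have expected the new loops to keep
the check, roughly like this (untested sketch, reusing the names from your
patch; note that rte_pktmbuf_free_seg() itself does not tolerate a NULL
argument):

	uint16_t i = start;
	if (end < i) {
		for (; i < nb_desc; i++)
			if (txq->sw_ring[i].mbuf != NULL)
				rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
		i = 0;
	}
	for (; i < end; i++)
		if (txq->sw_ring[i].mbuf != NULL)
			rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
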
> + /* Free mbufs from (last_desc_cleaned + 1) to (tx_tail - 1). */
> + const uint16_t start = (txq->last_desc_cleaned + 1) % txq->nb_tx_desc;
> + const uint16_t nb_desc = txq->nb_tx_desc;
> + const uint16_t end = txq->tx_tail;
> +
> + uint16_t i = start;
> + if (end < i) {
> + for (; i < nb_desc; i++)
> rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
> - txq->sw_ring[i].mbuf = NULL;
> - }
> + i = 0;
> }
> + for (; i < end; i++)
> + rte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);
> + memset(txq->sw_ring, 0, sizeof(txq->sw_ring[0]) * nb_desc);
> return;
> }
>
> diff --git a/drivers/net/intel/common/tx_scalar_fns.h b/drivers/net/intel/common/tx_scalar_fns.h
> index 82dc54438f..47ddcf411b 100644
> --- a/drivers/net/intel/common/tx_scalar_fns.h
> +++ b/drivers/net/intel/common/tx_scalar_fns.h
> @@ -30,16 +30,60 @@ ci_tx_xmit_cleanup(struct ci_tx_queue *txq)
> const uint16_t rs_idx = (last_desc_cleaned == nb_tx_desc - 1) ? 0 :
> (last_desc_cleaned + 1) >> txq->log2_rs_thresh;
> - uint16_t desc_to_clean_to = (rs_idx << txq->log2_rs_thresh) + (txq->tx_rs_thresh - 1);
> + const uint16_t dd_idx = txq->rs_last_id[rs_idx];
> + const uint16_t first_to_clean = rs_idx << txq->log2_rs_thresh;
>
> /* Check if descriptor is done - all drivers use 0xF as done value in bits 3:0 */
> - if ((txd[txq->rs_last_id[rs_idx]].cmd_type_offset_bsz & rte_cpu_to_le_64(CI_TXD_QW1_DTYPE_M)) !=
> + if ((txd[dd_idx].cmd_type_offset_bsz & rte_cpu_to_le_64(CI_TXD_QW1_DTYPE_M)) !=
> rte_cpu_to_le_64(CI_TX_DESC_DTYPE_DESC_DONE))
> /* Descriptor not yet processed by hardware */
> return -1;
>
> + /* DD bit is set, descriptors are done. Now free the mbufs. */
> + /* Note: nb_tx_desc is guaranteed to be a multiple of tx_rs_thresh,
> + * validated during queue setup. This means cleanup never wraps around
> + * the ring within a single burst (e.g., ring=256, rs_thresh=32 gives
> + * bursts of 0-31, 32-63, ..., 224-255).
> + */
> + const uint16_t nb_to_clean = txq->tx_rs_thresh;
> + struct ci_tx_entry *sw_ring = txq->sw_ring;
> +
> + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> + /* FAST_FREE path: mbufs are already reset, just return to pool */
> + uint16_t nb_free = 0;
> +
> + /* Get cached mempool pointer, or cache it on first use */
> + struct rte_mempool *mp =
> + likely(txq->fast_free_mp != (void *)UINTPTR_MAX) ?
> + txq->fast_free_mp :
> + (txq->fast_free_mp = sw_ring[dd_idx].mbuf->pool);
> +
> + /* Pack non-NULL mbufs in-place at start of sw_ring range.
Here is the second part of my question:
How can they (sw_ring[X].mbuf) be NULL here, when they cannot be NULL in
ci_txq_release_all_mbufs()?
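If the answer is that they can never be NULL for a completed burst, then the
packing loop below looks redundant, and I would expect the FAST_FREE branch to
reduce to a plain gather plus one bulk put, along these lines (untested sketch;
free_buf and its CI_TX_MAX_RS_THRESH bound are hypothetical names, sized to
hold at least tx_rs_thresh pointers):

	struct rte_mbuf *free_buf[CI_TX_MAX_RS_THRESH];

	/* Gather into a flat array of pointers before the bulk put. */
	for (uint16_t i = 0; i < nb_to_clean; i++)
		free_buf[i] = sw_ring[first_to_clean + i].mbuf;
	rte_mempool_put_bulk(mp, (void **)free_buf, nb_to_clean);
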
> + * No modulo needed in loop since we're guaranteed not to wrap.
> + */
> + for (uint16_t i = 0; i < nb_to_clean; i++) {
> + struct rte_mbuf *m = sw_ring[first_to_clean + i].mbuf;
> + if (m != NULL) {
> + /* Pack into sw_ring at packed position */
> + sw_ring[first_to_clean + nb_free].mbuf = m;
> + nb_free++;
> + }
> + }
> +
> + /* Bulk return to mempool using packed sw_ring entries directly */
> + if (nb_free > 0)
> + rte_mempool_put_bulk(mp, (void **)&sw_ring[first_to_clean].mbuf, nb_free);
> + } else {
> + /* Non-FAST_FREE path: use prefree_seg for refcount checks */
> + for (uint16_t i = 0; i < nb_to_clean; i++) {
> + struct rte_mbuf *m = sw_ring[first_to_clean + i].mbuf;
> + if (m != NULL)
> + rte_pktmbuf_free_seg(m);
> + }
> + }
> +
> /* Update the txq to reflect the last descriptor that was cleaned */
> - txq->last_desc_cleaned = desc_to_clean_to;
> + txq->last_desc_cleaned = first_to_clean + txq->tx_rs_thresh - 1;
> txq->nb_tx_free += txq->tx_rs_thresh;
>
> return 0;
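
In other words: either "not yet cleaned implies mbuf != NULL" is an invariant,
in which case the NULL checks here look like dead code, or it is not, in which
case the unchecked frees in ci_txq_release_all_mbufs() look unsafe. Which one
is it?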