On Tue, Feb 24, 2026 at 01:17:24PM +0530, Shaiq Wani wrote:
> Add AVX2 vectorized split queue Rx path.
> In case some CPUs don't support AVX512, enable AVX2 for them to
> get better per-core performance.
> 
> Signed-off-by: Shaiq Wani <[email protected]>
> ---
>  drivers/net/intel/idpf/idpf_common_device.h   |   1 +
>  drivers/net/intel/idpf/idpf_common_rxtx.c     |  59 ++++++++
>  drivers/net/intel/idpf/idpf_common_rxtx.h     |   5 +
>  .../net/intel/idpf/idpf_common_rxtx_avx2.c    | 138 ++++++++++++++++++
>  .../net/intel/idpf/idpf_common_rxtx_avx512.c  |  56 -------
>  5 files changed, 203 insertions(+), 56 deletions(-)
> 
> diff --git a/drivers/net/intel/idpf/idpf_common_device.h 
> b/drivers/net/intel/idpf/idpf_common_device.h
> index bbc969c734..1424046a16 100644
> --- a/drivers/net/intel/idpf/idpf_common_device.h
> +++ b/drivers/net/intel/idpf/idpf_common_device.h
> @@ -70,6 +70,7 @@ enum idpf_rx_func_type {
>       IDPF_RX_SINGLEQ,
>       IDPF_RX_SINGLEQ_SCATTERED,
>       IDPF_RX_SINGLEQ_AVX2,
> +     IDPF_RX_AVX2,
>       IDPF_RX_AVX512,
>       IDPF_RX_SINGLEQ_AVX512,
>       IDPF_RX_MAX
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c 
> b/drivers/net/intel/idpf/idpf_common_rxtx.c
> index b8f6418d4a..ead31fd0f8 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
> @@ -253,6 +253,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq)
>       cq->expected_gen_id = 1;
>  }
>  
> +RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common)
> +void
> +idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
> +{
> +     struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
> +     volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
> +     uint16_t rx_id;
> +     int i;
> +
> +     rxdp += rx_bufq->rxrearm_start;
> +
> +     /* Pull 'n' more MBUFs into the software ring */
> +     if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
> +                     (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) {
> +             if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
> +                             rx_bufq->nb_rx_desc) {
> +                     for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
> +                             rxp[i] = &rx_bufq->fake_mbuf;
> +                             rxdp[i] = (union virtchnl2_rx_buf_desc){0};
> +                     }
> +             }
> +             
> rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
> +                     IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed);
> +             return;
> +     }
> +
> +     /* Initialize the mbufs in vector, process 8 mbufs in one loop */
> +     for (i = 0; i < IDPF_RXQ_REARM_THRESH;
> +                     i += 8, rxp += 8, rxdp += 8) {
> +             rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + 
> RTE_PKTMBUF_HEADROOM;
> +             rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + 
> RTE_PKTMBUF_HEADROOM;
> +             rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + 
> RTE_PKTMBUF_HEADROOM;
> +             rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + 
> RTE_PKTMBUF_HEADROOM;
> +             rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + 
> RTE_PKTMBUF_HEADROOM;
> +             rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + 
> RTE_PKTMBUF_HEADROOM;
> +             rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + 
> RTE_PKTMBUF_HEADROOM;
> +             rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + 
> RTE_PKTMBUF_HEADROOM;
> +     }
> +
> +     rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
> +     if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
> +             rx_bufq->rxrearm_start = 0;
> +
> +     rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
> +
> +     rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
> +                          (rx_bufq->nb_rx_desc - 1) : 
> (rx_bufq->rxrearm_start - 1));
> +
> +     /* Update the tail pointer on the NIC */
> +     IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
> +}
> +
>  RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset)
>  void
>  idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq)
> @@ -1506,6 +1558,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
>                       .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
>                       .simd_width = RTE_VECT_SIMD_256,
>                       .single_queue = true}},
> +     [IDPF_RX_AVX2] = {
> +             .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
> +             .info = "Split AVX2 Vector",
> +             .features = {
> +                     .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
> +                     .simd_width = RTE_VECT_SIMD_256,
> +                     }},
>  #ifdef CC_AVX512_SUPPORT
>       [IDPF_RX_AVX512] = {
>               .pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h 
> b/drivers/net/intel/idpf/idpf_common_rxtx.h
> index 914cab0f25..256e9ff54c 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx.h
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
> @@ -197,6 +197,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue 
> *txq);
>  __rte_internal
>  void idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq);
>  __rte_internal
> +void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq);
> +__rte_internal
>  void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq);
>  __rte_internal
>  void idpf_qc_rx_queue_release(void *rxq);
> @@ -249,6 +251,9 @@ __rte_internal
>  uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf 
> **tx_pkts,
>                                        uint16_t nb_pkts);
>  __rte_internal
> +uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
> +                                  uint16_t nb_pkts);
> +__rte_internal
>  uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf 
> **rx_pkts,
>                         uint16_t nb_pkts);
>  __rte_internal
> diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c 
> b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> index e228b72fa5..c2f41db9f6 100644
> --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
> @@ -482,6 +482,144 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct 
> rte_mbuf **rx_pkts, uint16
>       return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
>  }
>  
> +RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2)
> +uint16_t
> +idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t 
> nb_pkts)
> +{
> +     struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq;
> +     const uint32_t *ptype_tbl = queue->adapter->ptype_tbl;
> +     struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail];
> +     volatile union virtchnl2_rx_desc *rxdp =
> +             (volatile union virtchnl2_rx_desc *)queue->rx_ring + 
> queue->rx_tail;
> +     const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, 
> queue->mbuf_initializer);
> +     uint64_t head_gen;
> +     uint16_t received = 0;
> +     int i;
> +
> +     /* Shuffle mask: picks fields from each 16-byte descriptor pair into the
> +      * layout that will be merged into mbuf->rearm_data candidates.
> +      */
> +     const __m256i shuf = _mm256_set_epi8(
> +             /* high 128 bits (desc 3 then desc 2 lanes) */
> +             0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
> +             0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF,
> +             /* low 128 bits (desc 1 then desc 0 lanes) */
> +             0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4,
> +             0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF
> +     );
> +
> +     /* mask that clears bits 14 and 15 of the packet length word  */
> +     const __m256i len_mask = _mm256_set_epi32(
> +             0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff,
> +             0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff
> +     );
> +
> +     const __m256i ptype_mask = 
> _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M);
> +
> +     rte_prefetch0(rxdp);
> +     nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */
> +
> +     if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
> +             idpf_splitq_rearm_common(queue->bufq2);
> +
> +     /* head gen check */
> +     head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> +     if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> +              VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id)
> +             return 0;
> +
> +     for (i = nb_pkts; i >= IDPF_VPMD_DESCS_PER_LOOP; i -= 
> IDPF_VPMD_DESCS_PER_LOOP) {
> +             rxdp -= IDPF_VPMD_DESCS_PER_LOOP;
> +
> +             uint64_t g3 = rxdp[3].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> +             uint64_t g2 = rxdp[2].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> +             uint64_t g1 = rxdp[1].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> +             uint64_t g0 = rxdp[0].flex_adv_nic_3_wb.pktlen_gen_bufq_id;
> +
> +             /* Extract DD bits */
> +             bool dd3 = (g3 & 1ULL) != 0ULL;
> +             bool dd2 = (g2 & 1ULL) != 0ULL;
> +             bool dd1 = (g1 & 1ULL) != 0ULL;
> +             bool dd0 = (g0 & 1ULL) != 0ULL;
> +
> +             /* Extract generation bits */
> +             uint64_t gen3 = (g3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> +                                                     
> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> +             uint64_t gen2 = (g2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> +                                                     
> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> +             uint64_t gen1 = (g1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> +                                                     
> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> +             uint64_t gen0 = (g0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) &
> +                                                     
> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M;
> +
> +             /* Validate descriptors */
> +             bool valid3 = dd3 && (gen3 == queue->expected_gen_id);
> +             bool valid2 = dd2 && (gen2 == queue->expected_gen_id);
> +             bool valid1 = dd1 && (gen1 == queue->expected_gen_id);
> +             bool valid0 = dd0 && (gen0 == queue->expected_gen_id);
> +
> +             if (!(valid0 && valid1 && valid2 && valid3))
> +                     break;
> +
This looks wrong. It means that if, for example, one packet is received by
the NIC and then the link goes down, that one packet will never actually
be received by software.


Driver datapaths must always be able to receive single packets or whatever
number is available. However - based on past precedent - they are allowed
to request that the input buffer is a multiple of 4 or 8, but they cannot
require that packets are received in bursts of a given multiple. This will
never pass any RFC2544 test, unless, by coincidence, the number of sent
packets is a multiple of 4.

/Bruce

Reply via email to