On 9/26/2025 10:54 AM, Shaiq Wani wrote:
Some CPUs don't support AVX512, so enable AVX2 for them to get better
per-core performance.
In the single queue model, the same descriptor queue is used by SW to
post descriptors to the device and by the device to report completed
descriptors back to SW, whereas the split queue model separates them
into different queues for parallel processing and improved performance.
Signed-off-by: Shaiq Wani <[email protected]>
---
Hi Shaiq,
Bruce has already provided some feedback, so I'm only going to comment
on things that weren't yet covered.
<snip>
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c
index a2b8c372d6..57753180a2 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.c
@@ -1656,6 +1656,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_256,
.extra.single_queue = true}},
+ [IDPF_RX_AVX2] = {
+ .pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
+ .info = "Split AVX2 Vector",
+ .features = {
+ .rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
+ .simd_width = RTE_VECT_SIMD_256,
+ }},
The indentation is different from surrounding code.
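For reference, reflowing the new entry to match the entries above would
look something like this (same fields, purely cosmetic - sketch only):

	[IDPF_RX_AVX2] = {
		.pkt_burst = idpf_dp_splitq_recv_pkts_avx2,
		.info = "Split AVX2 Vector",
		.features = {
			.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
			.simd_width = RTE_VECT_SIMD_256}},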
#ifdef CC_AVX512_SUPPORT
[IDPF_RX_AVX512] = {
.pkt_burst = idpf_dp_splitq_recv_pkts_avx512,
@@ -1663,7 +1670,7 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = {
.features = {
.rx_offloads = IDPF_RX_VECTOR_OFFLOADS,
.simd_width = RTE_VECT_SIMD_512}},
- [IDPF_RX_SINGLQ_AVX512] = {
+ [IDPF_RX_SINGLEQ_AVX512] = {
.pkt_burst = idpf_dp_singleq_recv_pkts_avx512,
.info = "Single AVX512 Vector",
.features = {
<snip>
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h
index 3bc3323af4..3a9af06c86 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx.h
+++ b/drivers/net/intel/idpf/idpf_common_rxtx.h
@@ -252,6 +252,9 @@ __rte_internal
uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
__rte_internal
+uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
+__rte_internal
uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
__rte_internal
diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
index 21c8f79254..b00f85ce78 100644
--- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c
@@ -482,6 +482,248 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16
return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts);
}
+static __rte_always_inline void
+idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring;
+	struct rte_mbuf **rxep = &rx_bufq->sw_ring[rx_bufq->rxrearm_start];
+
+	rxdp += rx_bufq->rxrearm_start;
+
+	/* Try to bulk allocate mbufs from mempool */
+	if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp,
+				    rxep,
+				    IDPF_RXQ_REARM_THRESH) < 0) {
+		if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= rx_bufq->nb_rx_desc) {
+			__m128i zero_dma = _mm_setzero_si128();
+
+			for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+				rxep[i] = &rx_bufq->fake_mbuf;
+				_mm_storeu_si128((__m128i *)(uintptr_t)&rxdp[i], zero_dma);
+			}
+		}
+		rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed,
+					      IDPF_RXQ_REARM_THRESH,
+					      rte_memory_order_relaxed);
+		return;
+	}
+
+	__m128i headroom = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM);
+
+	for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2, rxdp += 2) {
+		struct rte_mbuf *mb0 = rxep[0];
+		struct rte_mbuf *mb1 = rxep[1];
+
+		__m128i buf_addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+		__m128i buf_addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+		__m128i dma_addr0 = _mm_unpackhi_epi64(buf_addr0, buf_addr0);
+		__m128i dma_addr1 = _mm_unpackhi_epi64(buf_addr1, buf_addr1);
+
+		dma_addr0 = _mm_add_epi64(dma_addr0, headroom);
+		dma_addr1 = _mm_add_epi64(dma_addr1, headroom);
+
+		rxdp[0].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr0);
+		rxdp[1].split_rd.pkt_addr = _mm_cvtsi128_si64(dma_addr1);
+	}
+
+	rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+	if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc)
+		rx_bufq->rxrearm_start = 0;
+
+	rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ?
+			   (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1));
+
+	IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id);
All of this code looks really familiar. Perhaps porting IDPF to use the
'common' infrastructure should have been a prerequisite for this work,
because I believe we already have pretty much identical Rx rearm code
there (see net/intel/common/rx_vec_x86.h - there are already scalar as
well as SSE/AVX2/AVX512 implementations of rearm that are 32- and
64-bit compatible). Perhaps the only problem is that they use a
"common descriptor format" rather than the virtchnl2 format used by
IDPF, but as far as I can tell the two are pretty much identical?

I realize it's a lot more work than this patch, but I really don't
think adding code we know we will eventually deduplicate is good
practice.
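As a rough sketch of where I'd like this to end up (not a tested patch,
and the helper/enum names below are from memory, so treat them as an
assumption rather than the actual API): once the driver is ported to
the common struct ci_rx_queue, the driver-local rearm copy above should
reduce to a call into the common helper, e.g.

	/* sketch only - reuse the common AVX2 rearm path instead of
	 * carrying a driver-local copy of it
	 */
	if (rxq->rxrearm_nb > IDPF_RXQ_REARM_THRESH)
		ci_rxq_rearm(rxq, CI_RX_VEC_LEVEL_AVX2);

and similarly for the SSE/AVX512 Rx paths.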
--
Thanks,
Anatoly