Add AVX2 vectorized split queue Rx path. For CPUs that do not support AVX512, enable the AVX2 path so they get better per-core performance.
Signed-off-by: Shaiq Wani <[email protected]> --- drivers/net/intel/idpf/idpf_common_device.h | 1 + drivers/net/intel/idpf/idpf_common_rxtx.c | 59 ++++++++ drivers/net/intel/idpf/idpf_common_rxtx.h | 5 + .../net/intel/idpf/idpf_common_rxtx_avx2.c | 138 ++++++++++++++++++ .../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ------- 5 files changed, 203 insertions(+), 56 deletions(-) diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h index bbc969c734..1424046a16 100644 --- a/drivers/net/intel/idpf/idpf_common_device.h +++ b/drivers/net/intel/idpf/idpf_common_device.h @@ -70,6 +70,7 @@ enum idpf_rx_func_type { IDPF_RX_SINGLEQ, IDPF_RX_SINGLEQ_SCATTERED, IDPF_RX_SINGLEQ_AVX2, + IDPF_RX_AVX2, IDPF_RX_AVX512, IDPF_RX_SINGLEQ_AVX512, IDPF_RX_MAX diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c index b8f6418d4a..ead31fd0f8 100644 --- a/drivers/net/intel/idpf/idpf_common_rxtx.c +++ b/drivers/net/intel/idpf/idpf_common_rxtx.c @@ -253,6 +253,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq) cq->expected_gen_id = 1; } +RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common) +void +idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq) +{ + struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start]; + volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring; + uint16_t rx_id; + int i; + + rxdp += rx_bufq->rxrearm_start; + + /* Pull 'n' more MBUFs into the software ring */ + if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp, + (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) { + if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= + rx_bufq->nb_rx_desc) { + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) { + rxp[i] = &rx_bufq->fake_mbuf; + rxdp[i] = (union virtchnl2_rx_buf_desc){0}; + } + } + rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed, + IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed); + return; + } + + /* Initialize the mbufs in vector, process 8 mbufs in 
one loop */ + for (i = 0; i < IDPF_RXQ_REARM_THRESH; + i += 8, rxp += 8, rxdp += 8) { + rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM; + } + + rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH; + if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc) + rx_bufq->rxrearm_start = 0; + + rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH; + + rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ? + (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1)); + + /* Update the tail pointer on the NIC */ + IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id); +} + RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset) void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq) @@ -1506,6 +1558,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = { .rx_offloads = IDPF_RX_VECTOR_OFFLOADS, .simd_width = RTE_VECT_SIMD_256, .single_queue = true}}, + [IDPF_RX_AVX2] = { + .pkt_burst = idpf_dp_splitq_recv_pkts_avx2, + .info = "Split AVX2 Vector", + .features = { + .rx_offloads = IDPF_RX_VECTOR_OFFLOADS, + .simd_width = RTE_VECT_SIMD_256, + }}, #ifdef CC_AVX512_SUPPORT [IDPF_RX_AVX512] = { .pkt_burst = idpf_dp_splitq_recv_pkts_avx512, diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h index 914cab0f25..256e9ff54c 100644 --- a/drivers/net/intel/idpf/idpf_common_rxtx.h +++ b/drivers/net/intel/idpf/idpf_common_rxtx.h @@ -197,6 +197,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq); __rte_internal void 
idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq); __rte_internal +void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq); +__rte_internal void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq); __rte_internal void idpf_qc_rx_queue_release(void *rxq); @@ -249,6 +251,9 @@ __rte_internal uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); __rte_internal +uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); +__rte_internal uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); __rte_internal diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c index e228b72fa5..c2f41db9f6 100644 --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c @@ -482,6 +482,144 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16 return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts); } +RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2) +uint16_t +idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq; + const uint32_t *ptype_tbl = queue->adapter->ptype_tbl; + struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail]; + volatile union virtchnl2_rx_desc *rxdp = + (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail; + const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer); + uint64_t head_gen; + uint16_t received = 0; + int i; + + /* Shuffle mask: picks fields from each 16-byte descriptor pair into the + * layout that will be merged into mbuf->rearm_data candidates. 
+ */ + const __m256i shuf = _mm256_set_epi8( + /* high 128 bits (desc 3 then desc 2 lanes) */ + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4, + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF, + /* low 128 bits (desc 1 then desc 0 lanes) */ + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4, + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF + ); + + /* mask that clears bits 14 and 15 of the packet length word */ + const __m256i len_mask = _mm256_set_epi32( + 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff + ); + + const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M); + + rte_prefetch0(rxdp); + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, 4); /* 4 desc per AVX2 iteration */ + + if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH) + idpf_splitq_rearm_common(queue->bufq2); + + /* head gen check */ + head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id; + if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) & + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id) + return 0; + + for (i = nb_pkts; i >= IDPF_VPMD_DESCS_PER_LOOP; i -= IDPF_VPMD_DESCS_PER_LOOP) { + rxdp -= IDPF_VPMD_DESCS_PER_LOOP; + + uint64_t g3 = rxdp[3].flex_adv_nic_3_wb.pktlen_gen_bufq_id; + uint64_t g2 = rxdp[2].flex_adv_nic_3_wb.pktlen_gen_bufq_id; + uint64_t g1 = rxdp[1].flex_adv_nic_3_wb.pktlen_gen_bufq_id; + uint64_t g0 = rxdp[0].flex_adv_nic_3_wb.pktlen_gen_bufq_id; + + /* Extract DD bits */ + bool dd3 = (g3 & 1ULL) != 0ULL; + bool dd2 = (g2 & 1ULL) != 0ULL; + bool dd1 = (g1 & 1ULL) != 0ULL; + bool dd0 = (g0 & 1ULL) != 0ULL; + + /* Extract generation bits */ + uint64_t gen3 = (g3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) & + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M; + uint64_t gen2 = (g2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) & + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M; + uint64_t gen1 = (g1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) & + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M; + uint64_t gen0 = (g0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) & + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M; + + /* Validate 
descriptors */ + bool valid3 = dd3 && (gen3 == queue->expected_gen_id); + bool valid2 = dd2 && (gen2 == queue->expected_gen_id); + bool valid1 = dd1 && (gen1 == queue->expected_gen_id); + bool valid0 = dd0 && (gen0 == queue->expected_gen_id); + + if (!(valid0 && valid1 && valid2 && valid3)) + break; + + /* copy mbuf pointers */ + memcpy(&rx_pkts[i - IDPF_VPMD_DESCS_PER_LOOP], + &sw_ring[i - IDPF_VPMD_DESCS_PER_LOOP], + sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP); + + __m128i d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3])); + __m128i d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2])); + __m128i d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1])); + __m128i d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0])); + + __m256i d23 = _mm256_set_m128i(d3, d2); + __m256i d01 = _mm256_set_m128i(d1, d0); + + /* mask length and shuffle to build mbuf rearm data */ + __m256i desc01 = _mm256_and_si256(d01, len_mask); + __m256i desc23 = _mm256_and_si256(d23, len_mask); + __m256i mb10 = _mm256_shuffle_epi8(desc01, shuf); + __m256i mb32 = _mm256_shuffle_epi8(desc23, shuf); + + /* Extract ptypes */ + __m256i pt10 = _mm256_and_si256(d01, ptype_mask); + __m256i pt32 = _mm256_and_si256(d23, ptype_mask); + + uint16_t ptype0 = (uint16_t)_mm256_extract_epi16(pt10, 1); + uint16_t ptype1 = (uint16_t)_mm256_extract_epi16(pt10, 9); + uint16_t ptype2 = (uint16_t)_mm256_extract_epi16(pt32, 1); + uint16_t ptype3 = (uint16_t)_mm256_extract_epi16(pt32, 9); + + mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype1], 2); + mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype0], 0); + mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype3], 2); + mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype2], 0); + + /* Build rearm data for each mbuf */ + __m256i rearm0 = _mm256_permute2f128_si256(mbuf_init, mb10, 0x20); + __m256i rearm1 = _mm256_blend_epi32(mbuf_init, mb10, 0xF0); + __m256i rearm2 = _mm256_permute2f128_si256(mbuf_init, mb32, 0x20); + __m256i 
rearm3 = _mm256_blend_epi32(mbuf_init, mb32, 0xF0); + + /* Write out mbuf rearm data */ + _mm256_storeu_si256((__m256i *)&rx_pkts[i - 1]->rearm_data, rearm3); + _mm256_storeu_si256((__m256i *)&rx_pkts[i - 2]->rearm_data, rearm2); + _mm256_storeu_si256((__m256i *)&rx_pkts[i - 3]->rearm_data, rearm1); + _mm256_storeu_si256((__m256i *)&rx_pkts[i - 4]->rearm_data, rearm0); + + received += IDPF_VPMD_DESCS_PER_LOOP; + } + + queue->rx_tail += received; + queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0); + queue->rx_tail &= (queue->nb_rx_desc - 1); + if ((queue->rx_tail & 1) == 1 && received > 1) { + queue->rx_tail--; + received--; + } + queue->bufq2->rxrearm_nb += received; + return received; +} + static inline void idpf_singleq_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags) diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c index fe870617bc..eda5f929cf 100644 --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c @@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts, return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts); } -static __rte_always_inline void -idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq) -{ - struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start]; - volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring; - uint16_t rx_id; - int i; - - rxdp += rx_bufq->rxrearm_start; - - /* Pull 'n' more MBUFs into the software ring */ - if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp, - (void *)rxp, - IDPF_RXQ_REARM_THRESH) < 0) { - if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= - rx_bufq->nb_rx_desc) { - __m128i dma_addr0; - - dma_addr0 = _mm_setzero_si128(); - for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) { - rxp[i] = &rx_bufq->fake_mbuf; - _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]), - dma_addr0); - } - } - 
rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed, - IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed); - return; - } - - /* Initialize the mbufs in vector, process 8 mbufs in one loop */ - for (i = 0; i < IDPF_RXQ_REARM_THRESH; - i += 8, rxp += 8, rxdp += 8) { - rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM; - } - - rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH; - if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc) - rx_bufq->rxrearm_start = 0; - - rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH; - - rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ? - (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1)); - - /* Update the tail pointer on the NIC */ - IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id); -} - static __rte_always_inline void idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq) { -- 2.34.1

