Add AVX2 vectorized split queue Rx path. For CPUs that do not support AVX512, enable the AVX2 path so they still get better per-core performance.
Signed-off-by: Shaiq Wani <[email protected]> --- drivers/net/intel/idpf/idpf_common_device.h | 1 + drivers/net/intel/idpf/idpf_common_rxtx.c | 59 +++++++ drivers/net/intel/idpf/idpf_common_rxtx.h | 5 + .../net/intel/idpf/idpf_common_rxtx_avx2.c | 151 ++++++++++++++++++ .../net/intel/idpf/idpf_common_rxtx_avx512.c | 56 ------- 5 files changed, 216 insertions(+), 56 deletions(-) diff --git a/drivers/net/intel/idpf/idpf_common_device.h b/drivers/net/intel/idpf/idpf_common_device.h index bbc969c734..1424046a16 100644 --- a/drivers/net/intel/idpf/idpf_common_device.h +++ b/drivers/net/intel/idpf/idpf_common_device.h @@ -70,6 +70,7 @@ enum idpf_rx_func_type { IDPF_RX_SINGLEQ, IDPF_RX_SINGLEQ_SCATTERED, IDPF_RX_SINGLEQ_AVX2, + IDPF_RX_AVX2, IDPF_RX_AVX512, IDPF_RX_SINGLEQ_AVX512, IDPF_RX_MAX diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.c b/drivers/net/intel/idpf/idpf_common_rxtx.c index b8f6418d4a..ead31fd0f8 100644 --- a/drivers/net/intel/idpf/idpf_common_rxtx.c +++ b/drivers/net/intel/idpf/idpf_common_rxtx.c @@ -253,6 +253,58 @@ idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq) cq->expected_gen_id = 1; } +RTE_EXPORT_INTERNAL_SYMBOL(idpf_splitq_rearm_common) +void +idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq) +{ + struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start]; + volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring; + uint16_t rx_id; + int i; + + rxdp += rx_bufq->rxrearm_start; + + /* Pull 'n' more MBUFs into the software ring */ + if (rte_mbuf_raw_alloc_bulk(rx_bufq->mp, + (void *)rxp, IDPF_RXQ_REARM_THRESH) < 0) { + if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= + rx_bufq->nb_rx_desc) { + for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) { + rxp[i] = &rx_bufq->fake_mbuf; + rxdp[i] = (union virtchnl2_rx_buf_desc){0}; + } + } + rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed, + IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed); + return; + } + + /* Initialize the mbufs in vector, process 8 mbufs in 
one loop */ + for (i = 0; i < IDPF_RXQ_REARM_THRESH; + i += 8, rxp += 8, rxdp += 8) { + rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM; + rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM; + } + + rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH; + if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc) + rx_bufq->rxrearm_start = 0; + + rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH; + + rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ? + (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1)); + + /* Update the tail pointer on the NIC */ + IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id); +} + RTE_EXPORT_INTERNAL_SYMBOL(idpf_qc_single_tx_queue_reset) void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq) @@ -1506,6 +1558,13 @@ const struct ci_rx_path_info idpf_rx_path_infos[] = { .rx_offloads = IDPF_RX_VECTOR_OFFLOADS, .simd_width = RTE_VECT_SIMD_256, .single_queue = true}}, + [IDPF_RX_AVX2] = { + .pkt_burst = idpf_dp_splitq_recv_pkts_avx2, + .info = "Split AVX2 Vector", + .features = { + .rx_offloads = IDPF_RX_VECTOR_OFFLOADS, + .simd_width = RTE_VECT_SIMD_256, + }}, #ifdef CC_AVX512_SUPPORT [IDPF_RX_AVX512] = { .pkt_burst = idpf_dp_splitq_recv_pkts_avx512, diff --git a/drivers/net/intel/idpf/idpf_common_rxtx.h b/drivers/net/intel/idpf/idpf_common_rxtx.h index 914cab0f25..256e9ff54c 100644 --- a/drivers/net/intel/idpf/idpf_common_rxtx.h +++ b/drivers/net/intel/idpf/idpf_common_rxtx.h @@ -197,6 +197,8 @@ void idpf_qc_split_tx_descq_reset(struct ci_tx_queue *txq); __rte_internal void 
idpf_qc_split_tx_complq_reset(struct ci_tx_queue *cq); __rte_internal +void idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq); +__rte_internal void idpf_qc_single_tx_queue_reset(struct ci_tx_queue *txq); __rte_internal void idpf_qc_rx_queue_release(void *rxq); @@ -249,6 +251,9 @@ __rte_internal uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); __rte_internal +uint16_t idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); +__rte_internal uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); __rte_internal diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c index e228b72fa5..0122c82951 100644 --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx2.c @@ -482,6 +482,157 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts, uint16 return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts); } +RTE_EXPORT_INTERNAL_SYMBOL(idpf_dp_splitq_recv_pkts_avx2) +uint16_t +idpf_dp_splitq_recv_pkts_avx2(void *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + struct idpf_rx_queue *queue = (struct idpf_rx_queue *)rxq; + const uint32_t *ptype_tbl = queue->adapter->ptype_tbl; + struct rte_mbuf **sw_ring = &queue->bufq2->sw_ring[queue->rx_tail]; + volatile union virtchnl2_rx_desc *rxdp = + (volatile union virtchnl2_rx_desc *)queue->rx_ring + queue->rx_tail; + const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, queue->mbuf_initializer); + uint64_t head_gen; + uint16_t received = 0; + int i; + + /* Shuffle mask: picks fields from each 16-byte descriptor pair into the + * layout that will be merged into mbuf->rearm_data candidates. 
+ */ + const __m256i shuf = _mm256_set_epi8( + /* high 128 bits (desc 3 then desc 2 lanes) */ + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4, + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF, + /* low 128 bits (desc 1 then desc 0 lanes) */ + 0xFF, 0xFF, 0xFF, 0xFF, 11, 10, 5, 4, + 0xFF, 0xFF, 5, 4, 0xFF, 0xFF, 0xFF, 0xFF + ); + + /* mask that clears bits 14 and 15 of the packet length word */ + const __m256i len_mask = _mm256_set_epi32( + 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffff3fff, 0xffffffff + ); + + const __m256i ptype_mask = _mm256_set1_epi16(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M); + + rte_prefetch0(rxdp); + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IDPF_VPMD_DESCS_PER_LOOP); + + if (queue->bufq2->rxrearm_nb > IDPF_RXQ_REARM_THRESH) + idpf_splitq_rearm_common(queue->bufq2); + + /* check if there is at least one packet available */ + head_gen = rxdp->flex_adv_nic_3_wb.pktlen_gen_bufq_id; + if (((head_gen >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) & + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) != queue->expected_gen_id) + return 0; + + for (i = 0; i < nb_pkts; + i += IDPF_VPMD_DESCS_PER_LOOP, + rxdp += IDPF_VPMD_DESCS_PER_LOOP) { + uint16_t pktlen_gen0, pktlen_gen1, pktlen_gen2, pktlen_gen3; + uint8_t stat0, stat1, stat2, stat3; + bool valid0, valid1, valid2, valid3; + uint16_t burst; + uint16_t ptype0, ptype1, ptype2, ptype3; + __m128i d0, d1, d2, d3; + __m256i d01, d23, desc01, desc23; + __m256i mb10, mb32, pt10, pt32; + __m256i rearm0, rearm1, rearm2, rearm3; + + /* copy mbuf pointers (harmless for invalid descs) */ + memcpy(&rx_pkts[i], &sw_ring[i], + sizeof(rx_pkts[0]) * IDPF_VPMD_DESCS_PER_LOOP); + d3 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[3])); + rte_compiler_barrier(); + d2 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[2])); + rte_compiler_barrier(); + d1 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[1])); + rte_compiler_barrier(); + d0 = _mm_load_si128(RTE_CAST_PTR(const __m128i *, &rxdp[0])); + + d23 = _mm256_set_m128i(d3, d2); 
+ d01 = _mm256_set_m128i(d1, d0); + + /* mask length and shuffle to build mbuf rearm data */ + desc01 = _mm256_and_si256(d01, len_mask); + desc23 = _mm256_and_si256(d23, len_mask); + mb10 = _mm256_shuffle_epi8(desc01, shuf); + mb32 = _mm256_shuffle_epi8(desc23, shuf); + + /* Extract ptypes */ + pt10 = _mm256_and_si256(d01, ptype_mask); + pt32 = _mm256_and_si256(d23, ptype_mask); + + ptype0 = (uint16_t)_mm256_extract_epi16(pt10, 1); + ptype1 = (uint16_t)_mm256_extract_epi16(pt10, 9); + ptype2 = (uint16_t)_mm256_extract_epi16(pt32, 1); + ptype3 = (uint16_t)_mm256_extract_epi16(pt32, 9); + + mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype1], 2); + mb10 = _mm256_insert_epi32(mb10, (int)ptype_tbl[ptype0], 0); + mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype3], 2); + mb32 = _mm256_insert_epi32(mb32, (int)ptype_tbl[ptype2], 0); + + /* Build rearm data for each mbuf */ + rearm0 = _mm256_permute2f128_si256(mbuf_init, mb10, 0x20); + rearm1 = _mm256_blend_epi32(mbuf_init, mb10, 0xF0); + rearm2 = _mm256_permute2f128_si256(mbuf_init, mb32, 0x20); + rearm3 = _mm256_blend_epi32(mbuf_init, mb32, 0xF0); + + /* Write out mbuf rearm data */ + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm0); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm1); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm2); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm3); + + /* Extract DD and generation bits from the already-loaded + * descriptor data (d0-d3) */ + stat0 = (uint8_t)_mm_extract_epi8(d0, 1); + stat1 = (uint8_t)_mm_extract_epi8(d1, 1); + stat2 = (uint8_t)_mm_extract_epi8(d2, 1); + stat3 = (uint8_t)_mm_extract_epi8(d3, 1); + + pktlen_gen0 = (uint16_t)_mm_extract_epi16(d0, 2); + pktlen_gen1 = (uint16_t)_mm_extract_epi16(d1, 2); + pktlen_gen2 = (uint16_t)_mm_extract_epi16(d2, 2); + pktlen_gen3 = (uint16_t)_mm_extract_epi16(d3, 2); + + valid0 = (stat0 & 1) && + (((pktlen_gen0 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) & 
+ VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id); + valid1 = (stat1 & 1) && + (((pktlen_gen1 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) & + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id); + valid2 = (stat2 & 1) && + (((pktlen_gen2 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) & + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id); + valid3 = (stat3 & 1) && + (((pktlen_gen3 >> VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_S) & + VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M) == queue->expected_gen_id); + + /* count valid descriptors (holes are impossible because + * descriptors are read in reverse order while the NIC + * completes them in forward order) + */ + burst = valid0 + valid1 + valid2 + valid3; + received += burst; + if (burst != IDPF_VPMD_DESCS_PER_LOOP) + break; + } + + queue->rx_tail += received; + queue->expected_gen_id ^= ((queue->rx_tail & queue->nb_rx_desc) != 0); + queue->rx_tail &= (queue->nb_rx_desc - 1); + if ((queue->rx_tail & 1) == 1 && received > 1) { + queue->rx_tail--; + received--; + } + queue->bufq2->rxrearm_nb += received; + return received; +} + static inline void idpf_singleq_vtx1(volatile struct ci_tx_desc *txdp, struct rte_mbuf *pkt, uint64_t flags) diff --git a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c index fe870617bc..eda5f929cf 100644 --- a/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c +++ b/drivers/net/intel/idpf/idpf_common_rxtx_avx512.c @@ -540,62 +540,6 @@ idpf_dp_singleq_recv_pkts_avx512(void *rx_queue, struct rte_mbuf **rx_pkts, return _idpf_singleq_recv_raw_pkts_avx512(rx_queue, rx_pkts, nb_pkts); } -static __rte_always_inline void -idpf_splitq_rearm_common(struct idpf_rx_queue *rx_bufq) -{ - struct rte_mbuf **rxp = &rx_bufq->sw_ring[rx_bufq->rxrearm_start]; - volatile union virtchnl2_rx_buf_desc *rxdp = rx_bufq->rx_ring; - uint16_t rx_id; - int i; - - rxdp += rx_bufq->rxrearm_start; - - /* Pull 'n' more MBUFs into the software ring */ - if 
(rte_mbuf_raw_alloc_bulk(rx_bufq->mp, - (void *)rxp, - IDPF_RXQ_REARM_THRESH) < 0) { - if (rx_bufq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >= - rx_bufq->nb_rx_desc) { - __m128i dma_addr0; - - dma_addr0 = _mm_setzero_si128(); - for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) { - rxp[i] = &rx_bufq->fake_mbuf; - _mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[i]), - dma_addr0); - } - } - rte_atomic_fetch_add_explicit(&rx_bufq->rx_stats.mbuf_alloc_failed, - IDPF_RXQ_REARM_THRESH, rte_memory_order_relaxed); - return; - } - - /* Initialize the mbufs in vector, process 8 mbufs in one loop */ - for (i = 0; i < IDPF_RXQ_REARM_THRESH; - i += 8, rxp += 8, rxdp += 8) { - rxdp[0].split_rd.pkt_addr = rxp[0]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[1].split_rd.pkt_addr = rxp[1]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[2].split_rd.pkt_addr = rxp[2]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[3].split_rd.pkt_addr = rxp[3]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[4].split_rd.pkt_addr = rxp[4]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[5].split_rd.pkt_addr = rxp[5]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[6].split_rd.pkt_addr = rxp[6]->buf_iova + RTE_PKTMBUF_HEADROOM; - rxdp[7].split_rd.pkt_addr = rxp[7]->buf_iova + RTE_PKTMBUF_HEADROOM; - } - - rx_bufq->rxrearm_start += IDPF_RXQ_REARM_THRESH; - if (rx_bufq->rxrearm_start >= rx_bufq->nb_rx_desc) - rx_bufq->rxrearm_start = 0; - - rx_bufq->rxrearm_nb -= IDPF_RXQ_REARM_THRESH; - - rx_id = (uint16_t)((rx_bufq->rxrearm_start == 0) ? - (rx_bufq->nb_rx_desc - 1) : (rx_bufq->rxrearm_start - 1)); - - /* Update the tail pointer on the NIC */ - IDPF_PCI_REG_WRITE(rx_bufq->qrx_tail, rx_id); -} - static __rte_always_inline void idpf_splitq_rearm(struct idpf_rx_queue *rx_bufq) { -- 2.34.1

