From: Jie Liu <[email protected]> Add AVX512 vector data path for Rx and Tx burst functions. The decision to use AVX512 is based on: 1. CPU hardware flags (AVX512F, AVX512BW). 2. Compiler support (CC_AVX512_SUPPORT). 3. Max SIMD bitwidth configuration.
Performance shows approximately X% improvement in small packet forwarding scenarios. Signed-off-by: Jie Liu <[email protected]> --- drivers/net/sxe2/meson.build | 24 + drivers/net/sxe2/sxe2_ethdev.c | 2 +- drivers/net/sxe2/sxe2_txrx.c | 91 ++- drivers/net/sxe2/sxe2_txrx_vec.c | 46 +- drivers/net/sxe2/sxe2_txrx_vec.h | 18 +- drivers/net/sxe2/sxe2_txrx_vec_avx512.c | 897 ++++++++++++++++++++++++ 6 files changed, 1063 insertions(+), 15 deletions(-) create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_avx512.c diff --git a/drivers/net/sxe2/meson.build b/drivers/net/sxe2/meson.build index 3df57aee8c..30f2c7d816 100644 --- a/drivers/net/sxe2/meson.build +++ b/drivers/net/sxe2/meson.build @@ -15,6 +15,30 @@ includes += include_directories('../../common/sxe2') if arch_subdir == 'x86' sources += files('sxe2_txrx_vec_sse.c') + + sxe2_avx512_cpu_support =( + cc.get_define('__AVX512F__', args: machine_args) != '' and + cc.get_define('__AVX512BW__', args: machine_args) != '') + + sxe2_avx512_cc_support = ( + not machine_args.contains('-mno-avx512f') and + cc.has_argument('-mavx512f') and + cc.has_argument('-mavx512bw')) + + if sxe2_avx512_cpu_support == true or sxe2_avx512_cc_support == true + cflags += ['-DCC_AVX512_SUPPORT'] + avx512_args = [cflags, '-mavx512f', '-mavx512bw'] + if cc.has_argument('-march=skylake-avx512') + avx512_args += '-march=skylake-avx512' + endif + sxe2_avx512_lib = static_library('sxe2_avx512_lib', 'sxe2_txrx_vec_avx512.c', + dependencies: [static_rte_ethdev, + static_rte_kvargs, static_rte_hash, + static_rte_security, static_rte_cryptodev, static_rte_bus_pci], + include_directories: includes, + c_args: avx512_args) + objs += sxe2_avx512_lib.extract_objects('sxe2_txrx_vec_avx512.c') + endif endif sources += files( diff --git a/drivers/net/sxe2/sxe2_ethdev.c b/drivers/net/sxe2/sxe2_ethdev.c index 8d66e5d8c5..e0f7002138 100644 --- a/drivers/net/sxe2/sxe2_ethdev.c +++ b/drivers/net/sxe2/sxe2_ethdev.c @@ -891,7 +891,7 @@ static int32_t sxe2_eth_pmd_probe_pf(struct sxe2_common_device *cdev, static int32_t sxe2_parse_eth_devargs(struct rte_device *dev, struct rte_eth_devargs *eth_da) { - int ret = 0; + int32_t ret = 0; if (dev->devargs == NULL) return 0; diff --git a/drivers/net/sxe2/sxe2_txrx.c b/drivers/net/sxe2/sxe2_txrx.c index 8bd5f2eca4..ee70a2a431 100644 --- a/drivers/net/sxe2/sxe2_txrx.c +++ b/drivers/net/sxe2/sxe2_txrx.c @@ -158,6 +158,19 @@ void sxe2_tx_mode_func_set(struct rte_eth_dev *dev) if (ret == 0 && rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { tx_mode_flags = vec_flags; +#ifdef RTE_ARCH_X86 + if ((rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512) && + (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) && + (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)) { +#ifdef CC_AVX512_SUPPORT + tx_mode_flags |= SXE2_TX_MODE_VEC_AVX512; +#else + PMD_LOG_INFO(TX, "AVX512 is not supported in build env."); +#endif + } + if ((tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) == 0) + tx_mode_flags |= SXE2_TX_MODE_VEC_SSE; +#endif if (tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) { ret = sxe2_tx_queues_vec_prepare(dev); if (ret != 0) @@ -173,7 +186,6 @@ void sxe2_tx_mode_func_set(struct rte_eth_dev *dev) tx_mode_flags = adapter->q_ctxt.tx_mode_flags; } -#ifdef RTE_ARCH_X86 if (tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) { if (tx_mode_flags & SXE2_TX_MODE_VEC_OFFLOAD) { dev->tx_pkt_prepare = sxe2_tx_pkts_prepare; @@ -182,6 +194,27 @@ void sxe2_tx_mode_func_set(struct rte_eth_dev *dev) dev->tx_pkt_prepare = NULL; dev->tx_pkt_burst = sxe2_tx_pkts_vec_sse_simple; } + } + if (tx_mode_flags & SXE2_TX_MODE_VEC_SET_MASK) { + dev->tx_pkt_prepare = NULL; +#ifdef RTE_ARCH_X86 + if (tx_mode_flags & SXE2_TX_MODE_VEC_AVX512) { +#ifdef CC_AVX512_SUPPORT + if (tx_mode_flags & SXE2_TX_MODE_VEC_OFFLOAD) { + dev->tx_pkt_prepare = sxe2_tx_pkts_prepare; + dev->tx_pkt_burst = sxe2_tx_pkts_vec_avx512; + } else { + dev->tx_pkt_burst = sxe2_tx_pkts_vec_avx512_simple; + } +#endif + } else { + if (tx_mode_flags & SXE2_TX_MODE_VEC_OFFLOAD) { + dev->tx_pkt_prepare = sxe2_tx_pkts_prepare; + dev->tx_pkt_burst = sxe2_tx_pkts_vec_sse; + } else { + dev->tx_pkt_burst = sxe2_tx_pkts_vec_sse_simple; + } + } } else { #endif if (tx_mode_flags & SXE2_TX_MODE_SIMPLE_BATCH) { @@ -202,8 +235,16 @@ static const struct { } sxe2_tx_burst_infos[] = { { sxe2_tx_pkts, "Scalar" }, #ifdef RTE_ARCH_X86 - { sxe2_tx_pkts_vec_sse, "Vector SSE" }, - { sxe2_tx_pkts_vec_sse_simple, "Vector SSE Simple" }, +#ifdef CC_AVX512_SUPPORT + { sxe2_tx_pkts_vec_avx512, + "Vector AVX512" }, + { sxe2_tx_pkts_vec_avx512_simple, + "Vector AVX512 Simple" }, +#endif + { sxe2_tx_pkts_vec_sse, + "Vector SSE" }, + { sxe2_tx_pkts_vec_sse_simple, + "Vector SSE Simple" }, #endif }; @@ -289,6 +330,20 @@ void sxe2_rx_mode_func_set(struct rte_eth_dev *dev) if (ret == 0 && rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { rx_mode_flags = vec_flags; +#ifdef RTE_ARCH_X86 + if ((rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512) && + (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) && + (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)) { +#ifdef CC_AVX512_SUPPORT + rx_mode_flags |= SXE2_RX_MODE_VEC_AVX512; +#else + PMD_LOG_INFO(RX, "AVX512 support detected but not enabled"); +#endif + } + if ((rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) == 0 && + rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) + rx_mode_flags |= SXE2_RX_MODE_VEC_SSE; +#endif if ((rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) != 0) { ret = sxe2_rx_queues_vec_prepare(dev); if (ret != 0) @@ -302,7 +357,16 @@ void sxe2_rx_mode_func_set(struct rte_eth_dev *dev) #ifdef RTE_ARCH_X86 if (rx_mode_flags & SXE2_RX_MODE_VEC_SET_MASK) { - dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_sse_offload; + if (rx_mode_flags & SXE2_RX_MODE_VEC_AVX512) { +#ifdef CC_AVX512_SUPPORT + if (rx_mode_flags & SXE2_RX_MODE_VEC_OFFLOAD) + dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_avx512_offload; + else + dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_avx512; +#endif + } else { + dev->rx_pkt_burst = sxe2_rx_pkts_scattered_vec_sse_offload; + } return; } #endif @@ -316,19 +380,30 @@ static const struct { eth_rx_burst_t rx_burst; const char *info; } sxe2_rx_burst_infos[] = { - { sxe2_rx_pkts_scattered, "Scalar Scattered" }, - { sxe2_rx_pkts_scattered_split, "Scalar Scattered split" }, + { sxe2_rx_pkts_scattered, + "Scalar Scattered" }, + { sxe2_rx_pkts_scattered_split, + "Scalar Scattered split" }, #ifdef RTE_ARCH_X86 - { sxe2_rx_pkts_scattered_vec_sse_offload, "Vector SSE Scattered" }, +#ifdef CC_AVX512_SUPPORT + { sxe2_rx_pkts_scattered_vec_avx512, + "Vector AVX512 Scattered" }, + { sxe2_rx_pkts_scattered_vec_avx512_offload, + "Offload Vector AVX512 Scattered" }, +#endif + { sxe2_rx_pkts_scattered_vec_sse_offload, + "Vector SSE Scattered" }, #endif }; int32_t sxe2_rx_burst_mode_get(struct rte_eth_dev *dev, - __rte_unused uint16_t queue_id, struct rte_eth_burst_mode *mode) + __rte_unused uint16_t queue_id, + struct rte_eth_burst_mode *mode) { eth_rx_burst_t pkt_burst = dev->rx_pkt_burst; int32_t ret = -EINVAL; uint32_t i, size; + size = RTE_DIM(sxe2_rx_burst_infos); for (i = 0; i < size; ++i) { if (pkt_burst == sxe2_rx_burst_infos[i].rx_burst) { diff --git a/drivers/net/sxe2/sxe2_txrx_vec.c b/drivers/net/sxe2/sxe2_txrx_vec.c index 8df4954d86..cf004f5eb2 100644 --- a/drivers/net/sxe2/sxe2_txrx_vec.c +++ b/drivers/net/sxe2/sxe2_txrx_vec.c @@ -165,16 +165,54 @@ static void sxe2_tx_queue_mbufs_release_vec(struct sxe2_tx_queue *txq) return; } i = txq->next_dd - (txq->rs_thresh - 1); - buffer = txq->buffer_ring; - if (txq->next_use < i) { - for ( ; i < txq->ring_depth; ++i) { +#ifdef CC_AVX512_SUPPORT + struct rte_eth_dev *dev; + struct sxe2_tx_buffer_vec *buffer_vec; + + dev = &rte_eth_devices[txq->port_id]; + + if (dev->tx_pkt_burst == sxe2_tx_pkts_vec_avx512 || + dev->tx_pkt_burst == sxe2_tx_pkts_vec_avx512_simple) { + buffer_vec = (struct sxe2_tx_buffer_vec *)txq->buffer_ring; + + if (txq->next_use < i) { + for ( ; i < txq->ring_depth; ++i) { + if (buffer_vec[i].mbuf != NULL) { + rte_pktmbuf_free_seg(buffer_vec[i].mbuf); + buffer_vec[i].mbuf = NULL; + } + } + i = 0; + } + for ( ; i < txq->next_use; ++i) { + if (buffer_vec[i].mbuf != NULL) { + rte_pktmbuf_free_seg(buffer_vec[i].mbuf); + buffer_vec[i].mbuf = NULL; + } + } + } else { +#endif + buffer = txq->buffer_ring; + buffer = txq->buffer_ring; + if (txq->next_use < i) { + for ( ; i < txq->ring_depth; ++i) { + if (buffer[i].mbuf != NULL) { + rte_pktmbuf_free_seg(buffer[i].mbuf); + buffer[i].mbuf = NULL; + } + } + i = 0; + } + for (; i < txq->next_use; ++i) { if (buffer[i].mbuf != NULL) { rte_pktmbuf_free_seg(buffer[i].mbuf); buffer[i].mbuf = NULL; } } - i = 0; +#ifdef CC_AVX512_SUPPORT } +#endif + for (; i < txq->next_use; ++i) { if (buffer[i].mbuf != NULL) { rte_pktmbuf_free_seg(buffer[i].mbuf); diff --git a/drivers/net/sxe2/sxe2_txrx_vec.h b/drivers/net/sxe2/sxe2_txrx_vec.h index 4aef93d140..62a5b1f3f5 100644 --- a/drivers/net/sxe2/sxe2_txrx_vec.h +++ b/drivers/net/sxe2/sxe2_txrx_vec.h @@ -10,15 +10,19 @@ #define SXE2_RX_MODE_VEC_SIMPLE RTE_BIT32(0) #define SXE2_RX_MODE_VEC_OFFLOAD RTE_BIT32(1) #define SXE2_RX_MODE_VEC_SSE RTE_BIT32(2) +#define SXE2_RX_MODE_VEC_AVX512 RTE_BIT32(4) #define SXE2_RX_MODE_BATCH_ALLOC RTE_BIT32(10) #define SXE2_RX_MODE_VEC_SET_MASK (SXE2_RX_MODE_VEC_SIMPLE | \ - SXE2_RX_MODE_VEC_OFFLOAD | SXE2_RX_MODE_VEC_SSE) + SXE2_RX_MODE_VEC_OFFLOAD | SXE2_RX_MODE_VEC_SSE | \ + SXE2_RX_MODE_VEC_AVX512) #define SXE2_TX_MODE_VEC_SIMPLE RTE_BIT32(0) #define SXE2_TX_MODE_VEC_OFFLOAD RTE_BIT32(1) #define SXE2_TX_MODE_VEC_SSE RTE_BIT32(2) +#define SXE2_TX_MODE_VEC_AVX512 RTE_BIT32(4) #define SXE2_TX_MODE_SIMPLE_BATCH RTE_BIT32(10) #define SXE2_TX_MODE_VEC_SET_MASK (SXE2_TX_MODE_VEC_SIMPLE | \ - SXE2_TX_MODE_VEC_OFFLOAD | SXE2_TX_MODE_VEC_SSE) + SXE2_TX_MODE_VEC_OFFLOAD | SXE2_TX_MODE_VEC_SSE | \ + SXE2_TX_MODE_VEC_AVX512) #define SXE2_TX_VEC_NO_SUPPORT_OFFLOAD ( \ RTE_ETH_TX_OFFLOAD_MULTI_SEGS | \ RTE_ETH_TX_OFFLOAD_QINQ_INSERT | \ @@ -53,6 +57,16 @@ uint16_t sxe2_tx_pkts_vec_sse(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_ uint16_t sxe2_tx_pkts_vec_sse_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); uint16_t sxe2_rx_pkts_scattered_vec_sse_offload(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts); +uint16_t sxe2_tx_pkts_vec_avx512_simple(void *tx_queue, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts); +uint16_t sxe2_tx_pkts_vec_avx512(void *tx_queue, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts); +uint16_t sxe2_tx_pkts_vec_avx512_ctx_offload(void *tx_queue, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts); +uint16_t sxe2_rx_pkts_scattered_vec_avx512(void *rx_queue, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts); +uint16_t sxe2_rx_pkts_scattered_vec_avx512_offload(void *rx_queue, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts); #endif int32_t __rte_cold sxe2_tx_vec_support_check(struct rte_eth_dev *dev, uint32_t *vec_flags); int32_t __rte_cold sxe2_tx_queues_vec_prepare(struct rte_eth_dev *dev); diff --git a/drivers/net/sxe2/sxe2_txrx_vec_avx512.c b/drivers/net/sxe2/sxe2_txrx_vec_avx512.c new file mode 100644 index 0000000000..2aec8037dd --- /dev/null +++ b/drivers/net/sxe2/sxe2_txrx_vec_avx512.c @@ -0,0 +1,897 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (C), 2025, Wuxi Stars Micro System Technologies Co., Ltd. + */ + +#ifndef SXE2_TEST +#include <rte_vect.h> + +#include "sxe2_ethdev.h" +#include "sxe2_common_log.h" +#include "sxe2_queue.h" +#include "sxe2_txrx_vec.h" +#include "sxe2_txrx_vec_common.h" +#include "sxe2_vsi.h" + +static __rte_always_inline int32_t sxe2_tx_bufs_free_vec_avx512(struct sxe2_tx_queue *txq) +{ + struct sxe2_tx_buffer_vec *buffer; + struct rte_mbuf *mbuf; + struct rte_mbuf *mbuf_free_arr[SXE2_TX_FREE_BUFFER_SIZE_MAX_VEC]; + struct rte_mempool *mp; + struct rte_mempool_cache *cache; + void **cache_objs; + uint32_t copied; + uint32_t i; + int32_t ret; + uint16_t rs_thresh; + uint16_t free_num; + + if (rte_cpu_to_le_64(SXE2_TX_DESC_DTYPE_DESC_DONE) != + (txq->desc_ring[txq->next_dd].wb.dd & + rte_cpu_to_le_64(SXE2_TX_DESC_DTYPE_MASK))) { + ret = 0; + goto l_end; + } + + rs_thresh = txq->rs_thresh; + + buffer = (struct sxe2_tx_buffer_vec *)txq->buffer_ring; + buffer += txq->next_dd - (rs_thresh - 1); + + if ((txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) && + (rs_thresh & 31) == 0) { + mp = buffer[0].mbuf->pool; + cache = rte_mempool_default_cache(mp, rte_lcore_id()); + + if (cache == NULL || cache->len) + goto normal; + + if (rs_thresh > RTE_MEMPOOL_CACHE_MAX_SIZE) { + (void)rte_mempool_ops_enqueue_bulk(mp, (void *)buffer, rs_thresh); + goto done; + } + cache_objs = &cache->objs[cache->len]; + + copied = 0; + while (copied < rs_thresh) { + const __m512i objs0 = _mm512_loadu_si512(&buffer[copied]); + const __m512i objs1 = _mm512_loadu_si512(&buffer[copied + 8]); + const __m512i objs2 = _mm512_loadu_si512(&buffer[copied + 16]); + const __m512i objs3 = _mm512_loadu_si512(&buffer[copied + 24]); + + _mm512_storeu_si512(&cache_objs[copied], objs0); + _mm512_storeu_si512(&cache_objs[copied + 8], objs1); + _mm512_storeu_si512(&cache_objs[copied + 16], objs2); + _mm512_storeu_si512(&cache_objs[copied + 24], objs3); + copied += 32; + } + cache->len += rs_thresh; + + if (cache->len >= cache->flushthresh) { + (void)rte_mempool_ops_enqueue_bulk(mp, + &cache->objs[cache->size], cache->len - cache->size); + cache->len = cache->size; + } + goto done; + } + +normal: + mbuf = rte_pktmbuf_prefree_seg(buffer[0].mbuf); + + if (likely(mbuf)) { + mbuf_free_arr[0] = mbuf; + free_num = 1; + + for (i = 1; i < rs_thresh; ++i) { + mbuf = rte_pktmbuf_prefree_seg(buffer[i].mbuf); + + if (likely(mbuf)) { + if (likely(mbuf->pool == mbuf_free_arr[0]->pool)) { + mbuf_free_arr[free_num] = mbuf; + free_num++; + } else { + rte_mempool_put_bulk(mbuf_free_arr[0]->pool, + (void *)mbuf_free_arr, free_num); + + mbuf_free_arr[0] = mbuf; + free_num = 1; + } + } + } + + rte_mempool_put_bulk(mbuf_free_arr[0]->pool, + (void *)mbuf_free_arr, free_num); + } else { + for (i = 1; i < rs_thresh; ++i) { + mbuf = rte_pktmbuf_prefree_seg(buffer[i].mbuf); + if (mbuf != NULL) + rte_mempool_put(mbuf->pool, mbuf); + } + } + +done: + txq->desc_free_num += txq->rs_thresh; + txq->next_dd += txq->rs_thresh; + if (txq->next_dd >= txq->ring_depth) + txq->next_dd = txq->rs_thresh - 1; + ret = rs_thresh; + +l_end: + return ret; +} + +static __rte_always_inline void +sxe2_tx_desc_fill_one_avx512(volatile union sxe2_tx_data_desc *desc, struct rte_mbuf *pkt, + uint64_t desc_cmd, bool with_offloads) +{ + __m128i data_desc; + uint64_t desc_qw1; + uint32_t desc_offset; + + desc_qw1 = (SXE2_TX_DESC_DTYPE_DATA | + ((uint64_t)desc_cmd) << SXE2_TX_DATA_DESC_CMD_SHIFT | + ((uint64_t)pkt->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT); + desc_offset = SXE2_TX_DATA_DESC_MACLEN_VAL(pkt->l2_len); + desc_qw1 |= ((uint64_t)desc_offset) << SXE2_TX_DATA_DESC_OFFSET_SHIFT; + if (with_offloads) + sxe2_tx_desc_fill_offloads(pkt, &desc_qw1); + + data_desc = _mm_set_epi64x(desc_qw1, rte_pktmbuf_iova(pkt)); + + _mm_store_si128(RTE_CAST_PTR(__m128i *, desc), data_desc); +} + +static __rte_always_inline +void sxe2_tx_desc_fill_avx512(volatile union sxe2_tx_data_desc *desc, struct rte_mbuf **pkts, + uint16_t pkts_num, uint64_t desc_cmd, bool with_offloads) +{ + __m512i desc_group; + uint64_t desc0_qw1; + uint64_t desc1_qw1; + uint64_t desc2_qw1; + uint64_t desc3_qw1; + + const uint64_t desc_qw1_com = (SXE2_TX_DESC_DTYPE_DATA | + ((uint64_t)desc_cmd) << SXE2_TX_DATA_DESC_CMD_SHIFT); + uint32_t desc_offset[4] = {0}; + + while (pkts_num > 3) { + desc3_qw1 = desc_qw1_com | + ((uint64_t)pkts[3]->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT; + + desc_offset[3] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[3]->l2_len); + desc3_qw1 |= ((uint64_t)desc_offset[3]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT; + if (with_offloads) + sxe2_tx_desc_fill_offloads(pkts[3], &desc3_qw1); + + desc2_qw1 = desc_qw1_com | + ((uint64_t)pkts[2]->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT; + desc_offset[2] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[2]->l2_len); + desc2_qw1 |= ((uint64_t)desc_offset[2]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT; + if (with_offloads) + sxe2_tx_desc_fill_offloads(pkts[2], &desc2_qw1); + + desc1_qw1 = (desc_qw1_com | + ((uint64_t)pkts[1]->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT); + desc_offset[1] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[1]->l2_len); + desc1_qw1 |= ((uint64_t)desc_offset[1]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT; + if (with_offloads) + sxe2_tx_desc_fill_offloads(pkts[1], &desc1_qw1); + + desc0_qw1 = (desc_qw1_com | + ((uint64_t)pkts[0]->data_len) << SXE2_TX_DATA_DESC_BUF_SZ_SHIFT); + desc_offset[0] = SXE2_TX_DATA_DESC_MACLEN_VAL(pkts[0]->l2_len); + desc0_qw1 |= ((uint64_t)desc_offset[0]) << SXE2_TX_DATA_DESC_OFFSET_SHIFT; + if (with_offloads) + sxe2_tx_desc_fill_offloads(pkts[0], &desc0_qw1); + + desc_group = + _mm512_set_epi64(desc3_qw1, rte_pktmbuf_iova(pkts[3]), + desc2_qw1, rte_pktmbuf_iova(pkts[2]), + desc1_qw1, rte_pktmbuf_iova(pkts[1]), + desc0_qw1, rte_pktmbuf_iova(pkts[0])); + + _mm512_storeu_si512(RTE_CAST_PTR(void *, desc), desc_group); + + pkts_num -= 4; + desc += 4; + pkts += 4; + } + + while (pkts_num) { + sxe2_tx_desc_fill_one_avx512(desc, *pkts, desc_cmd, with_offloads); + + pkts_num--; + desc++; + pkts++; + } +} + +static __rte_always_inline void +sxe2_tx_pkts_mbuf_fill_avx512(struct sxe2_tx_buffer_vec *buffer, + struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +{ + uint16_t i; + + for (i = 0; i < nb_pkts; ++i) + buffer[i].mbuf = tx_pkts[i]; +} + +static __rte_always_inline uint16_t +sxe2_tx_pkts_vec_avx512_batch(struct sxe2_tx_queue *txq, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts, bool with_offloads) +{ + volatile union sxe2_tx_data_desc *desc; + struct sxe2_tx_buffer_vec *buffer; + uint16_t next_use; + uint16_t res_num; + uint16_t tx_num; + + if (txq->desc_free_num < txq->free_thresh) + (void)sxe2_tx_bufs_free_vec_avx512(txq); + + nb_pkts = RTE_MIN(txq->desc_free_num, nb_pkts); + if (unlikely(nb_pkts == 0)) { + PMD_LOG_DEBUG(TX, "Tx pkts avx512 batch: may not enough free desc, " + "free_desc=%u, need_tx_pkts=%u", + txq->desc_free_num, nb_pkts); + goto l_end; + } + tx_num = nb_pkts; + + next_use = txq->next_use; + desc = &txq->desc_ring[next_use]; + buffer = (struct sxe2_tx_buffer_vec *)txq->buffer_ring; + buffer += next_use; + + txq->desc_free_num -= nb_pkts; + + res_num = txq->ring_depth - txq->next_use; + + if (tx_num >= res_num) { + sxe2_tx_pkts_mbuf_fill_avx512(buffer, tx_pkts, res_num); + + sxe2_tx_desc_fill_avx512(desc, tx_pkts, res_num, + SXE2_TX_DATA_DESC_CMD_EOP, with_offloads); + tx_pkts += (res_num - 1); + desc += (res_num - 1); + + sxe2_tx_desc_fill_one_avx512(desc, *tx_pkts++, + (SXE2_TX_DATA_DESC_CMD_EOP | SXE2_TX_DATA_DESC_CMD_RS), + with_offloads); + + tx_num -= res_num; + + next_use = 0; + txq->next_rs = txq->rs_thresh - 1; + desc = txq->desc_ring; + buffer = (struct sxe2_tx_buffer_vec *)txq->buffer_ring; + } + + sxe2_tx_pkts_mbuf_fill_avx512(buffer, tx_pkts, tx_num); + + sxe2_tx_desc_fill_avx512(desc, tx_pkts, tx_num, + SXE2_TX_DATA_DESC_CMD_EOP, with_offloads); + + next_use += tx_num; + if (next_use > txq->next_rs) { + txq->desc_ring[txq->next_rs].read.type_cmd_off_bsz_l2t |= + rte_cpu_to_le_64(SXE2_TX_DATA_DESC_CMD_RS_MASK); + + txq->next_rs += txq->rs_thresh; + } + txq->next_use = next_use; + + SXE2_PCI_REG_WRITE_WC(txq->tdt_reg_addr, next_use); + PMD_LOG_DEBUG(TX, "port_id=%u queue_id=%u next_use=%u send_pkts=%u", + txq->port_id, txq->queue_id, next_use, nb_pkts); +l_end: + return nb_pkts; +} + +static __rte_always_inline uint16_t +sxe2_tx_pkts_vec_avx512_common(struct sxe2_tx_queue *txq, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts, bool with_offloads) +{ + uint16_t tx_done_num = 0; + uint16_t tx_once_num; + uint16_t tx_need_num; + + while (nb_pkts) { + tx_need_num = RTE_MIN(nb_pkts, txq->rs_thresh); + tx_once_num = + sxe2_tx_pkts_vec_avx512_batch(txq, tx_pkts + tx_done_num, + tx_need_num, with_offloads); + nb_pkts -= tx_once_num; + tx_done_num += tx_once_num; + if (tx_once_num < tx_need_num) + break; + } + + return tx_done_num; +} + +uint16_t sxe2_tx_pkts_vec_avx512_simple(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +{ + return sxe2_tx_pkts_vec_avx512_common((struct sxe2_tx_queue *)tx_queue, + tx_pkts, nb_pkts, false); +} + +uint16_t sxe2_tx_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) +{ + return sxe2_tx_pkts_vec_avx512_common((struct sxe2_tx_queue *)tx_queue, + tx_pkts, nb_pkts, true); +} + +static inline void sxe2_rx_queue_rearm_avx512(struct sxe2_rx_queue *rxq) +{ + volatile union sxe2_rx_desc *desc; + struct rte_mbuf **buffer; + struct rte_mbuf *mbuf0, *mbuf1; + __m128i dma_addr0, dma_addr1; + __m128i virt_addr0, virt_addr1; + __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM); + int32_t ret; + uint16_t i; + uint16_t new_tail; + + buffer = &rxq->buffer_ring[rxq->realloc_start]; + desc = &rxq->desc_ring[rxq->realloc_start]; + + ret = rte_mempool_get_bulk(rxq->mb_pool, (void *)buffer, SXE2_RX_REARM_THRESH_VEC); + if (ret != 0) { + if ((rxq->realloc_num + SXE2_RX_REARM_THRESH_VEC) >= rxq->ring_depth) { + dma_addr0 = _mm_setzero_si128(); + for (i = 0; i < SXE2_RX_NUM_PER_LOOP_AVX; ++i) { + buffer[i] = &rxq->fake_mbuf; + _mm_store_si128(RTE_CAST_PTR(__m128i *, &desc[i].read), + dma_addr0); + } + } + + rxq->vsi->adapter->dev_info.dev_data->rx_mbuf_alloc_failed += + SXE2_RX_REARM_THRESH_VEC; + goto l_end; + } + + for (i = 0; i < SXE2_RX_REARM_THRESH_VEC; i += 2, buffer += 2) { + mbuf0 = buffer[0]; + mbuf1 = buffer[1]; + +#if RTE_IOVA_IN_MBUF + + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) != + offsetof(struct rte_mbuf, buf_addr) + 8); +#endif + virt_addr0 = _mm_loadu_si128((__m128i *)&mbuf0->buf_addr); + virt_addr1 = _mm_loadu_si128((__m128i *)&mbuf1->buf_addr); + +#if RTE_IOVA_IN_MBUF + + dma_addr0 = _mm_unpackhi_epi64(virt_addr0, virt_addr0); + dma_addr1 = _mm_unpackhi_epi64(virt_addr1, virt_addr1); +#else + + dma_addr0 = _mm_unpacklo_epi64(virt_addr0, virt_addr0); + dma_addr1 = _mm_unpacklo_epi64(virt_addr1, virt_addr1); +#endif + + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); + dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room); + + _mm_store_si128(RTE_CAST_PTR(__m128i *, &desc++->read), dma_addr0); + _mm_store_si128(RTE_CAST_PTR(__m128i *, &desc++->read), dma_addr1); + } + + rxq->realloc_start += SXE2_RX_REARM_THRESH_VEC; + if (rxq->realloc_start >= rxq->ring_depth) + rxq->realloc_start = 0; + rxq->realloc_num -= SXE2_RX_REARM_THRESH_VEC; + + new_tail = (rxq->realloc_start == 0) ? (rxq->ring_depth - 1) : + (rxq->realloc_start - 1); + + SXE2_PCI_REG_WRITE_WC(rxq->rdt_reg_addr, new_tail); + +l_end: + return; +} + +static __rte_always_inline uint16_t +sxe2_rx_pkts_common_vec_avx512(struct sxe2_rx_queue *rxq, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts, uint8_t *split_rxe_flags, + uint8_t *umbcast_flags, bool do_offload) +{ + const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; + const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0, rxq->mbuf_init_value); + struct rte_mbuf **buffer; + volatile union sxe2_rx_desc *desc; + __m512i mbufs4_7; + __m512i mbufs0_3; + __m256i mbufs6_7; + __m256i mbufs4_5; + __m256i mbufs2_3; + __m256i mbufs0_1; + uint32_t bit_num = 0; + uint16_t done_num = 0; + uint16_t i = 0; + uint16_t j = 0; + + buffer = &rxq->buffer_ring[rxq->processing_idx]; + desc = &rxq->desc_ring[rxq->processing_idx]; + + rte_prefetch0(desc); + + nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, SXE2_RX_NUM_PER_LOOP_AVX); + + if (rxq->realloc_num > SXE2_RX_REARM_THRESH_VEC) + sxe2_rx_queue_rearm_avx512(rxq); + + if (0 == (rte_le_to_cpu_64(desc->wb.status_err_ptype_len) & SXE2_RX_DESC_STATUS_DD_MASK)) + goto l_end; + + const __m512i crc_adjust = + _mm512_set4_epi32(0, -rxq->crc_len, -rxq->crc_len, 0); + + const __m256i dd_mask = _mm256_set1_epi32(1); + + const __m512i rvp_shuf_mask = + _mm512_set4_epi32((7 << 24) | (6 << 16) | (5 << 8) | 4, + (3 << 24) | (2 << 16) | (13 << 8) | 12, + (0xFFU << 24) | (0xFF << 16) | (13 << 8) | 12, + 0xFFFFFFFF); + + const __m128i eop_shuf_mask = + _mm_set_epi8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 8, 0, 10, 2, 12, 4, 14, 6); + + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); + + for (i = 0; i < nb_pkts; i += SXE2_RX_NUM_PER_LOOP_AVX, + desc += SXE2_RX_NUM_PER_LOOP_AVX) { + _mm256_storeu_si256((void *)&rx_pkts[i], + _mm256_loadu_si256((void *)&buffer[i])); +#ifdef RTE_ARCH_X86_64 + _mm256_storeu_si256((void *)&rx_pkts[i + 4], + _mm256_loadu_si256((void *)&buffer[i + 4])); +#endif + + const __m128i desc7 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 7)); + rte_compiler_barrier(); + const __m128i desc6 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 6)); + rte_compiler_barrier(); + const __m128i desc5 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 5)); + rte_compiler_barrier(); + const __m128i desc4 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 4)); + rte_compiler_barrier(); + const __m128i desc3 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 3)); + rte_compiler_barrier(); + const __m128i desc2 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 2)); + rte_compiler_barrier(); + const __m128i desc1 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 1)); + rte_compiler_barrier(); + const __m128i desc0 = _mm_loadu_si128(RTE_CAST_PTR(const __m128i *, desc + 0)); + + const __m256i descs6_7 = + _mm256_inserti128_si256(_mm256_castsi128_si256(desc6), desc7, 1); + const __m256i descs4_5 = + _mm256_inserti128_si256(_mm256_castsi128_si256(desc4), desc5, 1); + const __m256i descs2_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(desc2), desc3, 1); + const __m256i descs0_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(desc0), desc1, 1); + + const __m512i descs4_7 = + _mm512_inserti64x4(_mm512_castsi256_si512(descs4_5), descs6_7, 1); + const __m512i descs0_3 = + _mm512_inserti64x4(_mm512_castsi256_si512(descs0_1), descs2_3, 1); + + if (split_rxe_flags != NULL) { + for (j = 0; j < SXE2_RX_NUM_PER_LOOP_AVX; j++) + rte_mbuf_prefetch_part2(rx_pkts[i + j]); + } + + mbufs4_7 = _mm512_shuffle_epi8(descs4_7, rvp_shuf_mask); + mbufs0_3 = _mm512_shuffle_epi8(descs0_3, rvp_shuf_mask); + + mbufs4_7 = _mm512_add_epi32(mbufs4_7, crc_adjust); + mbufs0_3 = _mm512_add_epi32(mbufs0_3, crc_adjust); + + const __m512i ptype_mask = _mm512_set1_epi64(SXE2_RX_FLEX_DESC_PTYPE_M << + SXE2_RX_FLEX_DESC_PTYPE_S); + + __m512i ptypes4_7 = _mm512_and_si512(descs4_7, ptype_mask); + __m512i ptypes0_3 = _mm512_and_si512(descs0_3, ptype_mask); + + const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1); + const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0); + const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1); + const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0); + + const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 13); + const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 5); + const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 13); + const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 5); + const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 13); + const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 5); + const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 13); + const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 5); + + const __m512i ptype_mask4_7 = + _mm512_set_epi32(0, 0, 0, ptype_tbl[ptype7], + 0, 0, 0, ptype_tbl[ptype6], + 0, 0, 0, ptype_tbl[ptype5], + 0, 0, 0, ptype_tbl[ptype4]); + const __m512i ptype_mask0_3 = + _mm512_set_epi32(0, 0, 0, ptype_tbl[ptype3], + 0, 0, 0, ptype_tbl[ptype2], + 0, 0, 0, ptype_tbl[ptype1], + 0, 0, 0, ptype_tbl[ptype0]); + + mbufs4_7 = _mm512_or_si512(mbufs4_7, ptype_mask4_7); + mbufs0_3 = _mm512_or_si512(mbufs0_3, ptype_mask0_3); + + mbufs6_7 = _mm512_extracti64x4_epi64(mbufs4_7, 1); + mbufs4_5 = _mm512_extracti64x4_epi64(mbufs4_7, 0); + mbufs2_3 = _mm512_extracti64x4_epi64(mbufs0_3, 1); + mbufs0_1 = _mm512_extracti64x4_epi64(mbufs0_3, 0); + + const __m512i staterr_per_mask = + _mm512_set_epi32(0x17, 0x1F, 0x07, 0x0F, + 0x13, 0x1B, 0x03, 0x0B, + 0x16, 0x1E, 0x06, 0x0E, + 0x12, 0x1A, 0x02, 0x0A); + __m512i qw1_0_7 = _mm512_permutex2var_epi32(descs4_7, + staterr_per_mask, + descs0_3); + + __m256i staterrs0_7 = _mm512_extracti64x4_epi64(qw1_0_7, 0); + + __m256i stu_len0_7 = _mm512_extracti64x4_epi64(qw1_0_7, 1); + __m256i mbuf_flags = _mm256_setzero_si256(); + + if (do_offload) { + const __m256i desc_flags_mask = _mm256_set1_epi32(0xC0001C04); + const __m256i desc_flags_rss_mask = _mm256_set1_epi32(0x20000000); + const __m256i vlan_flags = + _mm256_set_epi8(0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, RTE_MBUF_F_RX_VLAN | + RTE_MBUF_F_RX_VLAN_STRIPPED, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, RTE_MBUF_F_RX_VLAN | + RTE_MBUF_F_RX_VLAN_STRIPPED, + 0, 0, 0, 0); + + const __m256i rss_flags = + _mm256_set_epi8(0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, RTE_MBUF_F_RX_RSS_HASH, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, RTE_MBUF_F_RX_RSS_HASH, + 0, 0, 0, 0); + + const __m256i cksum_flags = + _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, + 0, + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + 0, 0, 0, 0, 0, 0, 0, 0, + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_GOOD | + RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1), + ((RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1)); + + const __m256i cksum_mask = + _mm256_set1_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK | + RTE_MBUF_F_RX_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK | + RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD); + + const __m256i vlan_mask = + _mm256_set1_epi32(RTE_MBUF_F_RX_VLAN | + RTE_MBUF_F_RX_VLAN_STRIPPED); + + __m256i tmp_flags; + __m256i descs_flags = _mm256_and_si256(staterrs0_7, desc_flags_mask); + stu_len0_7 = _mm256_and_si256(stu_len0_7, desc_flags_rss_mask); + + tmp_flags = _mm256_shuffle_epi8(vlan_flags, descs_flags); + mbuf_flags = _mm256_and_si256(tmp_flags, vlan_mask); + + descs_flags = _mm256_srli_epi32(descs_flags, 10); + tmp_flags = _mm256_shuffle_epi8(cksum_flags, descs_flags); + tmp_flags = _mm256_slli_epi32(tmp_flags, 1); + tmp_flags = _mm256_and_si256(tmp_flags, cksum_mask); + mbuf_flags = _mm256_or_si256(mbuf_flags, tmp_flags); + + descs_flags = _mm256_srli_epi32(stu_len0_7, 27); + tmp_flags = _mm256_shuffle_epi8(rss_flags, descs_flags); + mbuf_flags = _mm256_or_si256(mbuf_flags, tmp_flags); + +#ifndef RTE_LIBRTE_SXE2_16BYTE_RX_DESC + if (rxq->fnav_enable) { + __m256i fnav_vld0_3, fnav_vld4_7; + __m256i fnav_vld0_7; + __m256i v_zeros, v_ffff, v_u32_one; + const __m256i fdir_flags = + _mm256_set1_epi32(RTE_MBUF_F_RX_FDIR | + RTE_MBUF_F_RX_FDIR_ID); + fnav_vld0_3 = _mm256_unpacklo_epi32(descs2_3, descs0_1); + fnav_vld4_7 = _mm256_unpacklo_epi32(descs6_7, descs4_5); + + fnav_vld0_7 = _mm256_unpacklo_epi64(fnav_vld4_7, fnav_vld0_3); + + fnav_vld0_7 = _mm256_slli_epi32(fnav_vld0_7, 26); + fnav_vld0_7 = _mm256_srli_epi32(fnav_vld0_7, 31); + + v_zeros = _mm256_setzero_si256(); + v_ffff = _mm256_cmpeq_epi32(v_zeros, v_zeros); + v_u32_one = _mm256_srli_epi32(v_ffff, 31); + + tmp_flags = _mm256_cmpeq_epi32(fnav_vld0_7, v_u32_one); + + tmp_flags = _mm256_and_si256(tmp_flags, fdir_flags); + + mbuf_flags = _mm256_or_si256(mbuf_flags, tmp_flags); + + rx_pkts[i + 0]->hash.fdir.hi = desc[0].wb.fd_filter_id; + rx_pkts[i + 1]->hash.fdir.hi = desc[1].wb.fd_filter_id; + rx_pkts[i + 2]->hash.fdir.hi = desc[2].wb.fd_filter_id; + rx_pkts[i + 3]->hash.fdir.hi = desc[3].wb.fd_filter_id; + rx_pkts[i + 4]->hash.fdir.hi = desc[4].wb.fd_filter_id; + rx_pkts[i + 5]->hash.fdir.hi = desc[5].wb.fd_filter_id; + rx_pkts[i + 6]->hash.fdir.hi = desc[6].wb.fd_filter_id; + rx_pkts[i + 7]->hash.fdir.hi = desc[7].wb.fd_filter_id; + } +#endif + } + + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != + offsetof(struct rte_mbuf, rearm_data) + 8); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rx_descriptor_fields1) != + offsetof(struct rte_mbuf, rearm_data) + 16); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != + RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); + + __m256i rearm_arr[8]; + + rearm_arr[6] = _mm256_blend_epi32(mbuf_init, + _mm256_slli_si256(mbuf_flags, 8), 0x04); + rearm_arr[4] = _mm256_blend_epi32(mbuf_init, + _mm256_slli_si256(mbuf_flags, 4), 0x04); + rearm_arr[2] = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04); + rearm_arr[0] = _mm256_blend_epi32(mbuf_init, + _mm256_srli_si256(mbuf_flags, 4), 0x04); + + rearm_arr[6] = _mm256_permute2f128_si256(rearm_arr[6], mbufs6_7, 0x20); + rearm_arr[4] = _mm256_permute2f128_si256(rearm_arr[4], mbufs4_5, 0x20); + rearm_arr[2] = _mm256_permute2f128_si256(rearm_arr[2], mbufs2_3, 0x20); + rearm_arr[0] = _mm256_permute2f128_si256(rearm_arr[0], mbufs0_1, 0x20); + + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, rearm_arr[6]); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, rearm_arr[4]); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm_arr[2]); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm_arr[0]); + + const __m256i tmp_mbuf_flags = + _mm256_castsi128_si256(_mm256_extracti128_si256(mbuf_flags, 1)); + + rearm_arr[7] = _mm256_blend_epi32(mbuf_init, + _mm256_slli_si256(tmp_mbuf_flags, 8), 4); + rearm_arr[5] = _mm256_blend_epi32(mbuf_init, + _mm256_slli_si256(tmp_mbuf_flags, 4), 4); + rearm_arr[3] = _mm256_blend_epi32(mbuf_init, tmp_mbuf_flags, 4); + rearm_arr[1] = _mm256_blend_epi32(mbuf_init, + _mm256_srli_si256(tmp_mbuf_flags, 4), 4); + + rearm_arr[7] = _mm256_blend_epi32(rearm_arr[7], mbufs6_7, 0XF0); + rearm_arr[5] = _mm256_blend_epi32(rearm_arr[5], mbufs4_5, 0XF0); + rearm_arr[3] = _mm256_blend_epi32(rearm_arr[3], mbufs2_3, 0XF0); + rearm_arr[1] = _mm256_blend_epi32(rearm_arr[1], mbufs0_1, 0XF0); + + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, rearm_arr[7]); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, rearm_arr[5]); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm_arr[3]); + _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm_arr[1]); + + if (umbcast_flags) { + const __m256i umbcast_mask = + _mm256_set1_epi32(SXE2_RX_DESC_STATUS_UMBCAST_MASK); + __m256i umbcast_bits_256 = + _mm256_and_si256(staterrs0_7, umbcast_mask); + + umbcast_bits_256 = _mm256_srli_epi32(umbcast_bits_256, 24); + __m128i umbcast_bits_128 = + _mm_packs_epi32(_mm256_castsi256_si128(umbcast_bits_256), + _mm256_extractf128_si256(umbcast_bits_256, 1)); + + umbcast_bits_128 = _mm_shuffle_epi8(umbcast_bits_128, eop_shuf_mask); + + *(uint64_t *)umbcast_flags = _mm_cvtsi128_si64(umbcast_bits_128); + umbcast_flags += SXE2_RX_NUM_PER_LOOP_AVX; + } + + if (split_rxe_flags) { + const __m256i eop_rxe_mask = + _mm256_set1_epi32(SXE2_RX_DESC_STATUS_EOP_MASK | + SXE2_RX_DESC_ERROR_RXE_MASK | + SXE2_RX_DESC_ERROR_OVERSIZE_MASK); + const __m128i eop_mask_128 = + _mm_set1_epi16(SXE2_RX_DESC_STATUS_EOP_MASK); + const __m128i rxe_mask_128 = + _mm_set1_epi16(SXE2_RX_DESC_ERROR_RXE_MASK | + SXE2_RX_DESC_ERROR_OVERSIZE_MASK); + + const __m256i tmp_stats = _mm256_and_si256(staterrs0_7, eop_rxe_mask); + + const __m128i eop_rxe_bits = _mm_packs_epi32 + (_mm256_castsi256_si128(tmp_stats), + _mm256_extractf128_si256(tmp_stats, 1)); + + __m128i not_eop_bits = _mm_andnot_si128(eop_rxe_bits, eop_mask_128); + + not_eop_bits = + _mm_or_si128(not_eop_bits, + _mm_srli_epi16(_mm_and_si128(eop_rxe_bits, + rxe_mask_128), + 7)); + + not_eop_bits = _mm_shuffle_epi8(not_eop_bits, eop_shuf_mask); + + *(uint64_t *)split_rxe_flags = _mm_cvtsi128_si64(not_eop_bits); + split_rxe_flags += SXE2_RX_NUM_PER_LOOP_AVX; + } + + staterrs0_7 = _mm256_and_si256(staterrs0_7, dd_mask); + + staterrs0_7 = _mm256_packs_epi32(staterrs0_7, _mm256_setzero_si256()); + + bit_num = rte_popcount64 + (_mm_cvtsi128_si64(_mm256_extracti128_si256(staterrs0_7, 1))); + bit_num += rte_popcount64 + (_mm_cvtsi128_si64(_mm256_castsi256_si128(staterrs0_7))); + done_num += bit_num; + + if (bit_num != SXE2_RX_NUM_PER_LOOP_AVX) + break; + } + + rxq->processing_idx += done_num; + rxq->processing_idx &= (rxq->ring_depth - 1); + if ((rxq->processing_idx & 1) == 1 && done_num > 1) { + rxq->processing_idx--; + done_num--; + } + rxq->realloc_num += done_num; + +l_end: + PMD_LOG_DEBUG(RX, "port_id=%u queue_id=%u last_id=%u recv_pkts=%d", + rxq->port_id, rxq->queue_id, rxq->processing_idx, done_num); + return done_num; +} + +static __rte_always_inline uint16_t +sxe2_rx_pkts_scattered_batch_vec_avx512(struct sxe2_rx_queue *rxq, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts, bool do_offload) +{ + const uint64_t *split_rxe_flags64; + uint8_t split_rxe_flags[SXE2_RX_PKTS_BURST_BATCH_NUM_VEC] = {0}; + uint8_t umbcast_flags[SXE2_RX_PKTS_BURST_BATCH_NUM_VEC] = {0}; + uint16_t rx_done_num; + uint16_t rx_pkt_done_num; + + rx_pkt_done_num = 0; + + if (rxq->vsi->adapter->devargs.sw_stats_en) { + rx_done_num = sxe2_rx_pkts_common_vec_avx512(rxq, rx_pkts, + nb_pkts, split_rxe_flags, + umbcast_flags, do_offload); + } else { + rx_done_num = sxe2_rx_pkts_common_vec_avx512(rxq, rx_pkts, + nb_pkts, split_rxe_flags, + NULL, do_offload); + } + if (rx_done_num == 0) + goto l_end; + + if (!rxq->vsi->adapter->devargs.sw_stats_en) { + split_rxe_flags64 = (uint64_t *)split_rxe_flags; + + if (rxq->pkt_first_seg == NULL && + !split_rxe_flags64[0] && !split_rxe_flags64[1] && + !split_rxe_flags64[2] && !split_rxe_flags64[3]) { + rx_pkt_done_num = rx_done_num; + goto l_end; + } + + if (rxq->pkt_first_seg == NULL) { + while (rx_pkt_done_num < rx_done_num && + split_rxe_flags[rx_pkt_done_num] == 0) + rx_pkt_done_num++; + + if (rx_pkt_done_num == rx_done_num) + goto l_end; + + rxq->pkt_first_seg = rx_pkts[rx_pkt_done_num]; + } + } + + rx_pkt_done_num += sxe2_rx_pkts_refactor(rxq, &rx_pkts[rx_pkt_done_num], + rx_done_num - rx_pkt_done_num, &split_rxe_flags[rx_pkt_done_num], + &umbcast_flags[rx_pkt_done_num]); + +l_end: + + return rx_pkt_done_num; +} + +static __rte_always_inline uint16_t +sxe2_rx_pkts_scattered_common_vec_avx512(void *rx_queue, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts, bool offload) +{ + uint16_t done_num = 0; + uint16_t once_num = 0; + + while (nb_pkts > SXE2_RX_PKTS_BURST_BATCH_NUM) { + once_num = sxe2_rx_pkts_scattered_batch_vec_avx512(rx_queue, rx_pkts + done_num, + SXE2_RX_PKTS_BURST_BATCH_NUM, offload); + + done_num += once_num; + nb_pkts -= once_num; + + if (once_num < SXE2_RX_PKTS_BURST_BATCH_NUM) + goto end; + } + + done_num += sxe2_rx_pkts_scattered_batch_vec_avx512(rx_queue, + rx_pkts + done_num, nb_pkts, offload); + +end: + return done_num; +} + +uint16_t sxe2_rx_pkts_scattered_vec_avx512(void *rx_queue, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + return sxe2_rx_pkts_scattered_common_vec_avx512(rx_queue, + rx_pkts, nb_pkts, false); +} + +uint16_t sxe2_rx_pkts_scattered_vec_avx512_offload(void *rx_queue, + struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + return sxe2_rx_pkts_scattered_common_vec_avx512(rx_queue, + rx_pkts, nb_pkts, true); +} + +#endif -- 2.47.3

