sxe2: support AVX512 vectorized path for Rx and Tx

Stephen Hemminger Mon, 08 Jun 2026 13:56:44 -0700

On Mon,  8 Jun 2026 15:42:38 +0800
[email protected] wrote:

>  
> -struct sxe2_drv_queue_caps {
> +struct __rte_aligned(4) __rte_packed_begin sxe2_drv_queue_caps {
>       uint16_t queues_cnt;
>       uint16_t base_idx_in_pf;
> -};
> +} __rte_packed_end;


I don't see the point of packed and aligned. This structure is already embedded
in other struct. The alignment will be right.
Packed should be reserved for where you are avoiding padding.

Bottom line: drivers should not use packed except for hardware registers,
network headers.  You probably could just use existing structures here
and elsewhere; with static_assert() added in the vector code to make
sure future changes don't break assumptions.

> diff --git a/drivers/net/sxe2/sxe2_ethdev.c b/drivers/net/sxe2/sxe2_ethdev.c
> index 8d66e5d8c5..e0f7002138 100644
> --- a/drivers/net/sxe2/sxe2_ethdev.c
> +++ b/drivers/net/sxe2/sxe2_ethdev.c
> @@ -891,7 +891,7 @@ static int32_t sxe2_eth_pmd_probe_pf(struct 
> sxe2_common_device *cdev,
>  static int32_t sxe2_parse_eth_devargs(struct rte_device *dev,
>                         struct rte_eth_devargs *eth_da)
>  {
> -     int ret = 0;
> +     int32_t ret = 0;
>  
>       if (dev->devargs == NULL)
>               return 0;

Hmm. This raises the question why did sxe2 clone the model from other drivers 
using vdpa
but change the return to int32_t.

I think you are making things unnecessarily complex here:

$ git grep probe_t | grep int
drivers/bus/auxiliary/bus_auxiliary_driver.h:typedef int 
(rte_auxiliary_probe_t)(struct rte_auxiliary_driver *drv,
drivers/bus/cdx/bus_cdx_driver.h:typedef int (rte_cdx_probe_t)(struct 
rte_cdx_driver *, struct rte_cdx_device *);
drivers/bus/dpaa/bus_dpaa_driver.h:typedef int (*rte_dpaa_probe_t)(struct 
rte_dpaa_driver *dpaa_drv,
drivers/bus/fslmc/bus_fslmc_driver.h:typedef int (*rte_dpaa2_probe_t)(struct 
rte_dpaa2_driver *dpaa2_drv,
drivers/bus/ifpga/bus_ifpga_driver.h:typedef int (afu_probe_t)(struct 
rte_afu_device *);
drivers/bus/pci/bus_pci_driver.h:typedef int (rte_pci_probe_t)(struct 
rte_pci_driver *, struct rte_pci_device *);
drivers/bus/platform/bus_platform_driver.h:typedef int 
(rte_platform_probe_t)(struct rte_platform_device *pdev);
drivers/bus/uacce/bus_uacce_driver.h:typedef int (rte_uacce_probe_t)(struct 
rte_uacce_driver *, struct rte_uacce_device *);
drivers/bus/vdev/bus_vdev_driver.h:typedef int (rte_vdev_probe_t)(struct 
rte_vdev_device *dev);
drivers/bus/vmbus/bus_vmbus_driver.h:typedef int (vmbus_probe_t)(struct 
rte_vmbus_driver *,
drivers/common/mlx5/mlx5_common.h:typedef int 
(mlx5_class_driver_probe_t)(struct mlx5_common_device *cdev,
drivers/common/nfp/nfp_common_pci.h:typedef int 
(nfp_class_driver_probe_t)(struct rte_pci_device *dev);
drivers/common/sxe2/sxe2_common.h:typedef int32_t 
(sxe2_class_driver_probe_t)(struct sxe2_common_device *scdev,
lib/eal/include/bus_driver.h:typedef int (*rte_bus_probe_t)(struct rte_bus 
*bus);

No other driver uses int32...

> @@ -315,19 +370,30 @@ static const struct {
>       eth_rx_burst_t rx_burst;
>       const char *info;
>  } sxe2_rx_burst_infos[] = {
> -     { sxe2_rx_pkts_scattered,          "Scalar Scattered" },
> -     { sxe2_rx_pkts_scattered_split,          "Scalar Scattered split" },
> +     { sxe2_rx_pkts_scattered,
> +           "Scalar Scattered" },
> +     { sxe2_rx_pkts_scattered_split,
> +           "Scalar Scattered split" },
>  #ifdef RTE_ARCH_X86
> -     { sxe2_rx_pkts_scattered_vec_sse_offload,      "Vector SSE Scattered" },
> +#ifdef CC_AVX512_SUPPORT
> +     { sxe2_rx_pkts_scattered_vec_avx512,
> +           "Vector AVX512 Scattered" },
> +     { sxe2_rx_pkts_scattered_vec_avx512_offload,
> +           "Offload Vector AVX512 Scattered" },
> +#endif
> +     { sxe2_rx_pkts_scattered_vec_sse_offload,
> +           "Vector SSE Scattered" },
>  #endif
>  };

The table looked better before with longer lines.
The DPDK coding style allows lines up to 100 characters; why not use it

>  int32_t sxe2_rx_burst_mode_get(struct rte_eth_dev *dev,
> -                     __rte_unused uint16_t queue_id, struct 
> rte_eth_burst_mode *mode)
> +                            __rte_unused uint16_t queue_id,
> +                            struct rte_eth_burst_mode *mode)
>  {
>       eth_rx_burst_t pkt_burst = dev->rx_pkt_burst;
>       int32_t ret = -EINVAL;
>       uint32_t i, size;
> +
>       size = RTE_DIM(sxe2_rx_burst_infos);
>       for (i = 0; i < size; ++i) {
>               if (pkt_burst == sxe2_rx_burst_infos[i].rx_burst) {

The old code was fine, no need to change this. Either indentation style is OK.
Prefer not to have non-related changes.


> +static __rte_always_inline void
> +sxe2_tx_pkts_mbuf_fill_avx512(struct sxe2_tx_buffer_vec *buffer,
> +     struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> +{

Please only use always_inline where it is absolutely necessary.
Dont fight with the compiler.

> +static __rte_always_inline int32_t sxe2_tx_bufs_free_vec_avx512(struct 
> sxe2_tx_queue *txq)
> +{
> +     struct sxe2_tx_buffer_vec *buffer;
> +     struct rte_mbuf *mbuf;
> +     struct rte_mbuf *mbuf_free_arr[SXE2_TX_FREE_BUFFER_SIZE_MAX_VEC];
> +     struct rte_mempool *mp;
> +     struct rte_mempool_cache *cache;
> +     void **cache_objs;
> +     uint32_t copied;
> +     uint32_t i;
> +     int32_t ret;
> +     uint16_t rs_thresh;
> +     uint16_t free_num;
> +
> +     if (rte_cpu_to_le_64(SXE2_TX_DESC_DTYPE_DESC_DONE) !=
> +             (txq->desc_ring[txq->next_dd].wb.dd &
> +                     rte_cpu_to_le_64(SXE2_TX_DESC_DTYPE_MASK))) {
> +             ret = 0;
> +             goto l_end;
> +     }
> +
> +     rs_thresh = txq->rs_thresh;
> +
> +     buffer = (struct sxe2_tx_buffer_vec *)txq->buffer_ring;
> +     buffer += txq->next_dd - (rs_thresh - 1);
> +
> +     if ((txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) &&
> +                     (rs_thresh & 31) == 0) {
> +             mp = buffer[0].mbuf->pool;
> +             cache = rte_mempool_default_cache(mp, rte_lcore_id());
> +
> +             if (cache == NULL || cache->len)
> +                     goto normal;
> +
> +             if (rs_thresh > RTE_MEMPOOL_CACHE_MAX_SIZE) {
> +                     (void)rte_mempool_ops_enqueue_bulk(mp, (void *)buffer, 
> rs_thresh);
> +                     goto done;
> +             }
> +             cache_objs = &cache->objs[cache->len];

Directly using cache is going to be brittle and likely get broken by other
coming changes to mempool cache.

Re: [PATCH v13 01/20] net/sxe2: support AVX512 vectorized path for Rx and Tx

Reply via email to