The additional annotations clutter the code. How big a performance hit is it to disable for whole driver? Or just use memcpy instead of rte_memcpy?
On Mon, Nov 12, 2018, 4:01 PM Thomas Monjalon <tho...@monjalon.net wrote: > A bug was found when the inline function mlx5_tx_complete() > is optimized with AVX512F instructions. It corrupts an offset > in the instructions vmovdqu8 of the AVX2 version of rte_mov128(), > used in rte_memcpy(), which is called in rte_mempool_put_bulk(). > > All the above functions are inline. So the workaround is > to disable AVX512F optimization for the functions calling the > top-level function of this call stack, i.e. mlx5_tx_complete(). > All GCC versions supporting AVX512 are supposed to be affected. > > The root cause is not identified yet. It may be thought that > more related bugs may happen in other functions. > That's why the initial workaround was to disable AVX512F globally. > This patch takes the risk of applying the workaround only for the > functions known to be affected, in order to preserve the optimization > everywhere else. > > Bugzilla ID: 97 > Fixes: 8d07c82b239f ("mk: disable gcc AVX512F support") > > Signed-off-by: Thomas Monjalon <tho...@monjalon.net> > --- > drivers/net/mlx5/mlx5_rxtx.c | 10 +++++----- > drivers/net/mlx5/mlx5_rxtx_vec.c | 4 ++-- > drivers/net/mlx5/mlx5_rxtx_vec_sse.h | 2 +- > drivers/net/mlx5/mlx5_utils.h | 11 +++++++++++ > mk/rte.cpuflags.mk | 5 ----- > 5 files changed, 19 insertions(+), 13 deletions(-) > > diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c > index 6eceea5fe..c08f63299 100644 > --- a/drivers/net/mlx5/mlx5_rxtx.c > +++ b/drivers/net/mlx5/mlx5_rxtx.c > @@ -403,7 +403,7 @@ inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf > *buf, > * @return > * The status of the tx descriptor. > */ > -int > +int MLX5_WORKAROUND_GCC_BUG_AVX512F > mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset) > { > struct mlx5_txq_data *txq = tx_queue; > @@ -537,7 +537,7 @@ mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t > rx_queue_id) > * @return > * Number of packets successfully transmitted (<= pkts_n). > */ > -uint16_t > +uint16_t MLX5_WORKAROUND_GCC_BUG_AVX512F > mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) > { > struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; > @@ -978,7 +978,7 @@ mlx5_mpw_close(struct mlx5_txq_data *txq, struct > mlx5_mpw *mpw) > * @return > * Number of packets successfully transmitted (<= pkts_n). > */ > -uint16_t > +uint16_t MLX5_WORKAROUND_GCC_BUG_AVX512F > mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) > { > struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; > @@ -1196,7 +1196,7 @@ mlx5_mpw_inline_close(struct mlx5_txq_data *txq, > struct mlx5_mpw *mpw) > * @return > * Number of packets successfully transmitted (<= pkts_n). > */ > -uint16_t > +uint16_t MLX5_WORKAROUND_GCC_BUG_AVX512F > mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, > uint16_t pkts_n) > { > @@ -1725,7 +1725,7 @@ txq_burst_empw(struct mlx5_txq_data *txq, struct > rte_mbuf **pkts, > * @return > * Number of packets successfully transmitted (<= pkts_n). > */ > -uint16_t > +uint16_t MLX5_WORKAROUND_GCC_BUG_AVX512F > mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t > pkts_n) > { > struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c > b/drivers/net/mlx5/mlx5_rxtx_vec.c > index 340292add..da9f30f16 100644 > --- a/drivers/net/mlx5/mlx5_rxtx_vec.c > +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c > @@ -104,7 +104,7 @@ txq_calc_offload(struct rte_mbuf **pkts, uint16_t > pkts_n, uint8_t *cs_flags, > * @return > * Number of packets successfully transmitted (<= pkts_n). > */ > -uint16_t > +uint16_t MLX5_WORKAROUND_GCC_BUG_AVX512F > mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, > uint16_t pkts_n) > { > @@ -137,7 +137,7 @@ mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf > **pkts, > * @return > * Number of packets successfully transmitted (<= pkts_n). > */ > -uint16_t > +uint16_t MLX5_WORKAROUND_GCC_BUG_AVX512F > mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) > { > struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq; > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h > b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h > index e0f95f923..399fd39c5 100644 > --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h > +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h > @@ -89,7 +89,7 @@ txq_wr_dseg_v(struct mlx5_txq_data *txq, __m128i *dseg, > * @return > * Number of packets successfully transmitted (<= pkts_n). > */ > -static uint16_t > +static uint16_t MLX5_WORKAROUND_GCC_BUG_AVX512F > txq_scatter_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, > uint16_t pkts_n) > { > diff --git a/drivers/net/mlx5/mlx5_utils.h b/drivers/net/mlx5/mlx5_utils.h > index 886f60e61..10503f3f0 100644 > --- a/drivers/net/mlx5/mlx5_utils.h > +++ b/drivers/net/mlx5/mlx5_utils.h > @@ -15,6 +15,17 @@ > > #include "mlx5_defs.h" > > +/* > + * GCC bug workaround for rte_memcpy broken when optimized for AVX512. > + * Details are in https://bugs.dpdk.org/show_bug.cgi?id=97 > + * AVX512F is disabled for affected functions (calling mlx5_tx_complete). > + */ > +#if defined(__clang__) || defined(__INTEL_COMPILER) > +#define MLX5_WORKAROUND_GCC_BUG_AVX512F > +#else > +#define MLX5_WORKAROUND_GCC_BUG_AVX512F > __attribute__((target("no-avx512f"))) > +#endif > + > /* Bit-field manipulation. */ > #define BITFIELD_DECLARE(bf, type, size) \ > type bf[(((size_t)(size) / (sizeof(type) * CHAR_BIT)) + \ > diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk > index c3291b17a..43ed84155 100644 > --- a/mk/rte.cpuflags.mk > +++ b/mk/rte.cpuflags.mk > @@ -68,11 +68,6 @@ endif > ifneq ($(filter $(AUTO_CPUFLAGS),__AVX512F__),) > ifeq ($(CONFIG_RTE_ENABLE_AVX512),y) > CPUFLAGS += AVX512F > -else > -# disable AVX512F support of gcc as a workaround for Bug 97 > -ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) > -MACHINE_CFLAGS += -mno-avx512f > -endif > endif > endif > > -- > 2.19.0 > >