Hi,
> -----Original Message-----
> From: Viacheslav Ovsiienko <viachesl...@mellanox.com>
> Sent: Friday, November 8, 2019 5:08 PM
> To: dev@dpdk.org
> Cc: Matan Azrad <ma...@mellanox.com>; Raslan Darawsheh
> <rasl...@mellanox.com>; Ori Kam <or...@mellanox.com>;
> sta...@dpdk.org
> Subject: [PATCH v4] net/mlx5: control transmit doorbell register mapping
> 
> The rdma core library can map doorbell register in two ways, depending on
> the environment variable "MLX5_SHUT_UP_BF":
> 
>   - as regular cached memory, the variable is either missing or
>     set to zero. This type of mapping may cause the significant
>     doorbell register writing latency and requires explicit
>     memory write barrier to mitigate this issue and prevent
>     write combining.
> 
>   - as non-cached memory, the variable is present and set to
>     not "0" value. This type of mapping may cause performance
>     impact under heavy loading conditions but the explicit write
>     memory barrier is not required and it may improve core
>     performance.
> 
> A new devarg, "tx_db_nc", is introduced. If this parameter is set to zero,
> the doorbell register is forced to be mapped to cached memory and requires
> an explicit memory barrier after writing to it. If "tx_db_nc" is set to a
> non-zero value, the doorbell will be mapped as non-cached memory, not
> requiring the memory barrier. If "tx_db_nc" is missing, the behaviour will
> be defined by the presence of "MLX5_SHUT_UP_BF" in the environment. If the
> variable is missing, the default value of zero will be set for ARM64 hosts
> and one for others.
> 
> At run time the code checks the mapping type and issues the memory
> barrier after writing to the Tx doorbell register if it is needed. The
> mapping type is extracted directly from the uar_mmap_offset field in the
> queue properties.
> 
> Fixes: 18a1c20044c0 ("net/mlx5: implement Tx burst template")
> Cc: sta...@dpdk.org
> 
> Signed-off-by: Viacheslav Ovsiienko <viachesl...@mellanox.com>
> Acked-by: Matan Azrad <ma...@mellanox.com>
> 
> ---
> It would be nice to have this fix in 19.08.1+
> 
> v4: rebase on top
> v3:
> http://patches.dpdk.org/patch/62773/
>     default tx_db_nc values are changed
> v2:
> http://patches.dpdk.org/patch/62739/
> 
>  doc/guides/nics/mlx5.rst     | 23 +++++++++++
>  drivers/net/mlx5/Makefile    |  5 +++
>  drivers/net/mlx5/meson.build |  2 +
>  drivers/net/mlx5/mlx5.c      | 90
> ++++++++++++++++++++++++++++++++++++++------
>  drivers/net/mlx5/mlx5.h      |  1 +
>  drivers/net/mlx5/mlx5_defs.h | 16 ++++++++  drivers/net/mlx5/mlx5_rxtx.c
> | 17 ++++++++-  drivers/net/mlx5/mlx5_rxtx.h |  1 +
> drivers/net/mlx5/mlx5_txq.c  | 27 ++++++++++++-
>  9 files changed, 169 insertions(+), 13 deletions(-)
> 
> diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index
> 3651e82..5fd313c 100644
> --- a/doc/guides/nics/mlx5.rst
> +++ b/doc/guides/nics/mlx5.rst
> @@ -552,6 +552,29 @@ Run-time configuration
>    Also, if minimal data inlining is requested by non-zero ``txq_inline_min``
>    option or reported by the NIC, the eMPW feature is disengaged.
> 
> +- ``tx_db_nc`` parameter [int]
> +
> +  The rdma core library can map doorbell register in two ways,
> + depending on the  environment variable "MLX5_SHUT_UP_BF":
> +
> +  - As regular cached memory, if the variable is either missing or set to zero.
> +  - As non-cached memory, if the variable is present and set to a value other than "0".
> +
> +  The type of mapping may slightly affect the Tx performance; the optimal
> +  choice strongly depends on the host architecture and should be determined
> +  empirically.
> +
> +  If ``tx_db_nc`` is either omitted or set to zero, the doorbell is
> + forced to be  mapped to regular memory, the PMD will perform the extra
> + write memory barrier  after writing to doorbell, it might increase the
> + needed CPU clocks per packet  to send, but latency might be improved.
> +
> +  If ``tx_db_nc`` is set to not zero, the doorbell is forced to be
> + mapped to  non cached memory, the PMD will not perform the extra write
> + memory barrier  after writing to doorbell, on some architectures it
> + might improve the  performance.
> +
> +  The default ``tx_db_nc`` value is zero for ARM64 hosts and one for others.
> +
>  - ``tx_vec_en`` parameter [int]
> 
>    A nonzero value enables Tx vector on ConnectX-5, ConnectX-6, ConnectX-6
> DX diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index
> d01fa73..5b79631 100644
> --- a/drivers/net/mlx5/Makefile
> +++ b/drivers/net/mlx5/Makefile
> @@ -206,6 +206,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-
> config-h.sh
>               func mlx5dv_dr_action_create_flow_meter \
>               $(AUTOCONF_OUTPUT)
>       $Q sh -- '$<' '$@' \
> +             HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD \
> +             infiniband/mlx5dv.h \
> +             enum MLX5_MMAP_GET_NC_PAGES_CMD \
> +             $(AUTOCONF_OUTPUT)
> +     $Q sh -- '$<' '$@' \
>               HAVE_ETHTOOL_LINK_MODE_25G \
>               /usr/include/linux/ethtool.h \
>               enum ETHTOOL_LINK_MODE_25000baseCR_Full_BIT \ diff --
> git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build index
> 511f5b7..05fadf6 100644
> --- a/drivers/net/mlx5/meson.build
> +++ b/drivers/net/mlx5/meson.build
> @@ -134,6 +134,8 @@ if build
>               'mlx5dv_dr_action_create_dest_devx_tir' ],
>               [ 'HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER',
> 'infiniband/mlx5dv.h',
>               'mlx5dv_dr_action_create_flow_meter' ],
> +             [ 'HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD',
> 'infiniband/mlx5dv.h',
> +             'MLX5_MMAP_GET_NC_PAGES_CMD' ],
>               [ 'HAVE_MLX5DV_DR', 'infiniband/mlx5dv.h',
>               'MLX5DV_DR_DOMAIN_TYPE_NIC_RX' ],
>               [ 'HAVE_MLX5DV_DR_ESWITCH', 'infiniband/mlx5dv.h', diff --
> git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> 9a2c711..276087d 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -96,6 +96,12 @@
>  #define MLX5_TXQ_MPW_EN "txq_mpw_en"
> 
>  /*
> + * Device parameter to force doorbell register mapping
> + * to non-cached region eliminating the extra write memory barrier.
> + */
> +#define MLX5_TX_DB_NC "tx_db_nc"
> +
> +/*
>   * Device parameter to include 2 dsegs in the title WQEBB.
>   * Deprecated, ignored.
>   */
> @@ -421,6 +427,37 @@ struct mlx5_flow_id_pool *  }  #endif /*
> HAVE_IBV_FLOW_DV_SUPPORT */
> 
> +static int
> +mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
> +{
> +     char *env;
> +     int value;
> +
> +     assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> +     /* Get environment variable to store. */
> +     env = getenv(MLX5_SHUT_UP_BF);
> +     value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
> +     if (config->dbnc == MLX5_ARG_UNSET)
> +             setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT,
> 1);
> +     else
> +             setenv(MLX5_SHUT_UP_BF, config->dbnc ? "1" : "0", 1);
> +     return value;
> +}
> +
> +static void
> +mlx5_restore_doorbell_mapping_env(const struct mlx5_dev_config
> *config,
> +                               int value)
> +{
> +     assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
> +     if (config->dbnc == MLX5_ARG_UNSET)
> +             return;
> +     /* Restore the original environment variable state. */
> +     if (value == MLX5_ARG_UNSET)
> +             unsetenv(MLX5_SHUT_UP_BF);
> +     else
> +             setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); }
> +
>  /**
>   * Allocate shared IB device context. If there is multiport device the
>   * master and representors will share this context, if there is single @@ -
> 434,22 +471,26 @@ struct mlx5_flow_id_pool *
>   *
>   * @param[in] spawn
>   *   Pointer to the IB device attributes (name, port, etc).
> + * @param[in] config
> + *   Pointer to device configuration structure.
>   *
>   * @return
>   *   Pointer to mlx5_ibv_shared object on success,
>   *   otherwise NULL and rte_errno is set.
>   */
>  static struct mlx5_ibv_shared *
> -mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn)
> +mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn,
> +                     const struct mlx5_dev_config *config)
>  {
>       struct mlx5_ibv_shared *sh;
> +     int dbmap_env;
>       int err = 0;
>       uint32_t i;
>  #ifdef HAVE_IBV_FLOW_DV_SUPPORT
>       struct mlx5_devx_tis_attr tis_attr = { 0 };  #endif
> 
> -assert(spawn);
> +     assert(spawn);
>       /* Secondary process should not create the shared context. */
>       assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
>       pthread_mutex_lock(&mlx5_ibv_list_mutex);
> @@ -472,16 +513,31 @@ struct mlx5_flow_id_pool *
>               rte_errno  = ENOMEM;
>               goto exit;
>       }
> +     /*
> +      * Configure environment variable "MLX5_SHUT_UP_BF"
> +      * before the device creation. The rdma_core library
> +      * checks the variable at device creation and
> +      * stores the result internally.
> +      */
> +     dbmap_env = mlx5_config_doorbell_mapping_env(config);
>       /* Try to open IB device with DV first, then usual Verbs. */
>       errno = 0;
>       sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
>       if (sh->ctx) {
>               sh->devx = 1;
>               DRV_LOG(DEBUG, "DevX is supported");
> +             /* The device is created, no need for environment. */
> +             mlx5_restore_doorbell_mapping_env(config, dbmap_env);
>       } else {
> +             /* The environment variable is still configured. */
>               sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
> +             err = errno ? errno : ENODEV;
> +             /*
> +              * The environment variable is not needed anymore,
> +              * all device creation attempts are completed.
> +              */
> +             mlx5_restore_doorbell_mapping_env(config, dbmap_env);
>               if (!sh->ctx) {
> -                     err = errno ? errno : ENODEV;
>                       goto error;
>               }
>               DRV_LOG(DEBUG, "DevX is NOT supported"); @@ -1300,6
> +1356,8 @@ struct mlx5_flow_id_pool *
>               DRV_LOG(WARNING, "%s: deprecated parameter, ignored",
> key);
>       } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
>               config->mps = !!tmp;
> +     } else if (strcmp(MLX5_TX_DB_NC, key) == 0) {
> +             config->dbnc = !!tmp;
>       } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
>               DRV_LOG(WARNING, "%s: deprecated parameter, ignored",
> key);
>       } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { @@ -
> 1373,6 +1431,7 @@ struct mlx5_flow_id_pool *
>               MLX5_TXQ_MPW_EN,
>               MLX5_TXQ_MPW_HDR_DSEG_EN,
>               MLX5_TXQ_MAX_INLINE_LEN,
> +             MLX5_TX_DB_NC,
>               MLX5_TX_VEC_EN,
>               MLX5_RX_VEC_EN,
>               MLX5_L3_VXLAN_EN,
> @@ -1938,7 +1997,20 @@ struct mlx5_flow_id_pool *
>               eth_dev->tx_pkt_burst =
> mlx5_select_tx_function(eth_dev);
>               return eth_dev;
>       }
> -     sh = mlx5_alloc_shared_ibctx(spawn);
> +     /*
> +      * Some parameters ("tx_db_nc" in particular) are needed in
> +      * advance to create dv/verbs device context. We process the
> +      * devargs here to get them, and later process devargs again
> +      * to override some hardware settings.
> +      */
> +     err = mlx5_args(&config, dpdk_dev->devargs);
> +     if (err) {
> +             err = rte_errno;
> +             DRV_LOG(ERR, "failed to process device arguments: %s",
> +                     strerror(rte_errno));
> +             goto error;
> +     }
> +     sh = mlx5_alloc_shared_ibctx(spawn, &config);
>       if (!sh)
>               return NULL;
>       config.devx = sh->devx;
> @@ -2180,13 +2252,8 @@ struct mlx5_flow_id_pool *
>               }
>               own_domain_id = 1;
>       }
> -     err = mlx5_args(&config, dpdk_dev->devargs);
> -     if (err) {
> -             err = rte_errno;
> -             DRV_LOG(ERR, "failed to process device arguments: %s",
> -                     strerror(rte_errno));
> -             goto error;
> -     }
> +     /* Override some values set by hardware configuration. */
> +     mlx5_args(&config, dpdk_dev->devargs);
>       err = mlx5_dev_check_sibling_config(priv, &config);
>       if (err)
>               goto error;
> @@ -3031,6 +3098,7 @@ struct mlx5_flow_id_pool *
>       dev_config = (struct mlx5_dev_config){
>               .hw_padding = 0,
>               .mps = MLX5_ARG_UNSET,
> +             .dbnc = MLX5_ARG_UNSET,
>               .rx_vec_en = 1,
>               .txq_inline_max = MLX5_ARG_UNSET,
>               .txq_inline_min = MLX5_ARG_UNSET,
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index
> e8148ce..1a92c10 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -267,6 +267,7 @@ struct mlx5_dev_config {
>               /* Rx queue count threshold to enable MPRQ. */
>       } mprq; /* Configurations for Multi-Packet RQ. */
>       int mps; /* Multi-packet send supported mode. */
> +     int dbnc; /* Skip doorbell register write barrier. */
>       unsigned int flow_prio; /* Number of flow priorities. */
>       enum modify_reg flow_mreg_c[MLX5_MREG_C_NUM];
>       /* Availibility of mreg_c's. */
> diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
> index 0ef532f..03060aa 100644
> --- a/drivers/net/mlx5/mlx5_defs.h
> +++ b/drivers/net/mlx5/mlx5_defs.h
> @@ -123,6 +123,22 @@
>  #define MLX5_UAR_PAGE_NUM_MAX 64
>  #define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) -
> 1)
> 
> +/* Fields of memory mapping type in offset parameter of mmap() */
> +#define MLX5_UAR_MMAP_CMD_SHIFT 8 #define
> MLX5_UAR_MMAP_CMD_MASK 0xff
> +
> +/* Environment variable to control the doorbell register mapping. */
> +#define MLX5_SHUT_UP_BF "MLX5_SHUT_UP_BF"
> +#if defined(RTE_ARCH_ARM64)
> +#define MLX5_SHUT_UP_BF_DEFAULT "0"
> +#else
> +#define MLX5_SHUT_UP_BF_DEFAULT "1"
> +#endif
> +
> +#ifndef HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD #define
> +MLX5_MMAP_GET_NC_PAGES_CMD 3 #endif
> +
>  /* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */
> #define MLX5_MPRQ_STRIDE_NUM_N 6U
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 8bc0542..1ea5960 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -4754,8 +4754,23 @@ enum mlx5_txcmp_code {
>        * to improve latencies. The pure software related data treatment
>        * can be completed after doorbell. Tx CQEs for this SQ are
>        * processed in this thread only by the polling.
> +      *
> +      * The rdma core library can map doorbell register in two ways,
> +      * depending on the environment variable "MLX5_SHUT_UP_BF":
> +      *
> +      * - as regular cached memory, the variable is either missing or
> +      *   set to zero. This type of mapping may cause the significant
> +      *   doorbell register writing latency and requires explicit
> +      *   memory write barrier to mitigate this issue and prevent
> +      *   write combining.
> +      *
> +      * - as non-cached memory, the variable is present and set to
> +      *   not "0" value. This type of mapping may cause performance
> +      *   impact under heavy loading conditions but the explicit write
> +      *   memory barrier is not required and it may improve core
> +      *   performance.
>        */
> -     mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0);
> +     mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc);
>       /* Not all of the mbufs may be stored into elts yet. */
>       part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent -
> loc.pkts_copy;
>       if (!MLX5_TXOFF_CONFIG(INLINE) && part) { diff --git
> a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index
> 559d225..88c50aa 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -287,6 +287,7 @@ struct mlx5_txq_data {
>       /* When set TX offload for tunneled packets are supported. */
>       uint16_t swp_en:1; /* Whether SW parser is enabled. */
>       uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */
> +     uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */
>       uint16_t inlen_send; /* Ordinary send data inline size. */
>       uint16_t inlen_empw; /* eMPW max packet size to inline. */
>       uint16_t inlen_mode; /* Minimal data length to inline. */ diff --git
> a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index
> 97991f0..a0d6164 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -18,6 +18,7 @@
>  #pragma GCC diagnostic ignored "-Wpedantic"
>  #endif
>  #include <infiniband/verbs.h>
> +#include <infiniband/mlx5dv.h>
>  #ifdef PEDANTIC
>  #pragma GCC diagnostic error "-Wpedantic"
>  #endif
> @@ -302,6 +303,28 @@
>  }
> 
>  /**
> + * Configure the doorbell register non-cached attribute.
> + *
> + * @param txq_ctrl
> + *   Pointer to Tx queue control structure.
> + * @param page_size
> + *   System page size.
> + */
> +static void
> +txq_uar_ncattr_init(struct mlx5_txq_ctrl *txq_ctrl, size_t page_size) {
> +     unsigned int cmd;
> +
> +     txq_ctrl->txq.db_nc = 0;
> +     /* Check the doorbell register mapping type. */
> +     cmd = txq_ctrl->uar_mmap_offset / page_size;
> +     cmd >>= MLX5_UAR_MMAP_CMD_SHIFT;
> +     cmd &= MLX5_UAR_MMAP_CMD_MASK;
> +     if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD)
> +             txq_ctrl->txq.db_nc = 1;
> +}
> +
> +/**
>   * Initialize Tx UAR registers for primary process.
>   *
>   * @param txq_ctrl
> @@ -312,9 +335,9 @@
>  {
>       struct mlx5_priv *priv = txq_ctrl->priv;
>       struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv));
> +     const size_t page_size = sysconf(_SC_PAGESIZE);
>  #ifndef RTE_ARCH_64
>       unsigned int lock_idx;
> -     const size_t page_size = sysconf(_SC_PAGESIZE);
>  #endif
> 
>       if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) @@ -322,6 +345,7
> @@
>       assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
>       assert(ppriv);
>       ppriv->uar_table[txq_ctrl->txq.idx] = txq_ctrl->bf_reg;
> +     txq_uar_ncattr_init(txq_ctrl, page_size);
>  #ifndef RTE_ARCH_64
>       /* Assign an UAR lock according to UAR page number */
>       lock_idx = (txq_ctrl->uar_mmap_offset / page_size) & @@ -375,6
> +399,7 @@
>       }
>       addr = RTE_PTR_ADD(addr, offset);
>       ppriv->uar_table[txq->idx] = addr;
> +     txq_uar_ncattr_init(txq_ctrl, page_size);
>       return 0;
>  }
> 
> --
> 1.8.3.1


Patch applied to next-net-mlx,

Kindest regards,
Raslan Darawsheh

Reply via email to