Hi, > -----Original Message----- > From: Viacheslav Ovsiienko <viachesl...@mellanox.com> > Sent: Friday, November 8, 2019 5:08 PM > To: dev@dpdk.org > Cc: Matan Azrad <ma...@mellanox.com>; Raslan Darawsheh > <rasl...@mellanox.com>; Ori Kam <or...@mellanox.com>; > sta...@dpdk.org > Subject: [PATCH v4] net/mlx5: control transmit doorbell register mapping > > The rdma core library can map doorbell register in two ways, depending on > the environment variable "MLX5_SHUT_UP_BF": > > - as regular cached memory, the variable is either missing or > set to zero. This type of mapping may cause the significant > doorbell register writing latency and requires explicit > memory write barrier to mitigate this issue and prevent > write combining. > > - as non-cached memory, the variable is present and set to > not "0" value. This type of mapping may cause performance > impact under heavy loading conditions but the explicit write > memory barrier is not required and it may improve core > performance. > > The new devarg is introduced "tx_db_nc", if this parameter is set to zero, the > doorbell register is forced to be mapped to cached memory and requires > explicit memory barrier after writing to. If "tx_db_nc" is set to non-zero > value > the doorbell will be mapped as non-cached memory, not requiring the > memory barrier. If "tx_db_nc" is missing the behaviour will be defined by > presence of "MLX5_SHUT_UP_BF" in environment. If variable is missed the > default value zero will be set for ARM64 hosts and one for others. > > In run time the code checks the mapping type and provides the memory > barrier after writing to tx doorbell register if it is needed. The mapping > type is > extracted directly from the uar_mmap_offset field in the queue properties. 
> > Fixes: 18a1c20044c0 ("net/mlx5: implement Tx burst template") > Cc: sta...@dpdk.org > > Signed-off-by: Viacheslav Ovsiienko <viachesl...@mellanox.com> > Acked-by: Matan Azrad <ma...@mellanox.com> > > --- > It would be nice to have this fix in 19.08.1+ > > v4: rebase on top > v3: > https://eur03.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatch > es.dpdk.org%2Fpatch%2F62773%2F&data=02%7C01%7Crasland%40mell > anox.com%7Ce26268efc564403f944e08d7645d71aa%7Ca652971c7d2e4d9ba6 > a4d149256f461b%7C0%7C0%7C637088224824292194&sdata=qYY7BVPW > MZHjRpoaxgtUg2vJzuEBG3bSXd042sLsCuw%3D&reserved=0 > default tx_db_nc values are changed > v2: > https://eur03.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpatch > es.dpdk.org%2Fpatch%2F62739%2F&data=02%7C01%7Crasland%40mell > anox.com%7Ce26268efc564403f944e08d7645d71aa%7Ca652971c7d2e4d9ba6 > a4d149256f461b%7C0%7C0%7C637088224824292194&sdata=ND76ZF3M > kjPOh8qkDxYswzovfZ3dXY8u6UzY%2Fm9%2FH4c%3D&reserved=0 > > doc/guides/nics/mlx5.rst | 23 +++++++++++ > drivers/net/mlx5/Makefile | 5 +++ > drivers/net/mlx5/meson.build | 2 + > drivers/net/mlx5/mlx5.c | 90 > ++++++++++++++++++++++++++++++++++++++------ > drivers/net/mlx5/mlx5.h | 1 + > drivers/net/mlx5/mlx5_defs.h | 16 ++++++++ drivers/net/mlx5/mlx5_rxtx.c > | 17 ++++++++- drivers/net/mlx5/mlx5_rxtx.h | 1 + > drivers/net/mlx5/mlx5_txq.c | 27 ++++++++++++- > 9 files changed, 169 insertions(+), 13 deletions(-) > > diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index > 3651e82..5fd313c 100644 > --- a/doc/guides/nics/mlx5.rst > +++ b/doc/guides/nics/mlx5.rst > @@ -552,6 +552,29 @@ Run-time configuration > Also, if minimal data inlining is requested by non-zero ``txq_inline_min`` > option or reported by the NIC, the eMPW feature is disengaged. 
> > +- ``tx_db_nc`` parameter [int] > + > + The rdma core library can map doorbell register in two ways, > + depending on the environment variable "MLX5_SHUT_UP_BF": > + > + - As regular cached memory, if the variable is either missing or set to > zero. > + - As non-cached memory, if the variable is present and set to not "0" > value. > + > + The type of mapping may slightly affect the Tx performance; the > + optimal choice strongly depends on the host architecture and should be > determined experimentally. > + > + If ``tx_db_nc`` is set to zero, the doorbell is > + forced to be mapped to regular memory and the PMD will perform the extra > + write memory barrier after writing to doorbell; it might increase the > + needed CPU clocks per packet to send, but latency might be improved. > + > + If ``tx_db_nc`` is set to not zero, the doorbell is forced to be > + mapped to non-cached memory and the PMD will not perform the extra write > + memory barrier after writing to doorbell; on some architectures it > + might improve the performance. > + > + The default ``tx_db_nc`` value is zero for ARM64 hosts and one for others. 
> + > - ``tx_vec_en`` parameter [int] > > A nonzero value enables Tx vector on ConnectX-5, ConnectX-6, ConnectX-6 > DX diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile index > d01fa73..5b79631 100644 > --- a/drivers/net/mlx5/Makefile > +++ b/drivers/net/mlx5/Makefile > @@ -206,6 +206,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto- > config-h.sh > func mlx5dv_dr_action_create_flow_meter \ > $(AUTOCONF_OUTPUT) > $Q sh -- '$<' '$@' \ > + HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD \ > + infiniband/mlx5dv.h \ > + enum MLX5_MMAP_GET_NC_PAGES_CMD \ > + $(AUTOCONF_OUTPUT) > + $Q sh -- '$<' '$@' \ > HAVE_ETHTOOL_LINK_MODE_25G \ > /usr/include/linux/ethtool.h \ > enum ETHTOOL_LINK_MODE_25000baseCR_Full_BIT \ diff -- > git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build index > 511f5b7..05fadf6 100644 > --- a/drivers/net/mlx5/meson.build > +++ b/drivers/net/mlx5/meson.build > @@ -134,6 +134,8 @@ if build > 'mlx5dv_dr_action_create_dest_devx_tir' ], > [ 'HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER', > 'infiniband/mlx5dv.h', > 'mlx5dv_dr_action_create_flow_meter' ], > + [ 'HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD', > 'infiniband/mlx5dv.h', > + 'MLX5_MMAP_GET_NC_PAGES_CMD' ], > [ 'HAVE_MLX5DV_DR', 'infiniband/mlx5dv.h', > 'MLX5DV_DR_DOMAIN_TYPE_NIC_RX' ], > [ 'HAVE_MLX5DV_DR_ESWITCH', 'infiniband/mlx5dv.h', diff -- > git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index > 9a2c711..276087d 100644 > --- a/drivers/net/mlx5/mlx5.c > +++ b/drivers/net/mlx5/mlx5.c > @@ -96,6 +96,12 @@ > #define MLX5_TXQ_MPW_EN "txq_mpw_en" > > /* > + * Device parameter to force doorbell register mapping > + * to non-cached region eliminating the extra write memory barrier. > + */ > +#define MLX5_TX_DB_NC "tx_db_nc" > + > +/* > * Device parameter to include 2 dsegs in the title WQEBB. > * Deprecated, ignored. 
> */ > @@ -421,6 +427,37 @@ struct mlx5_flow_id_pool * } #endif /* > HAVE_IBV_FLOW_DV_SUPPORT */ > > +static int > +mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config) > +{ > + char *env; > + int value; > + > + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); > + /* Get environment variable to store. */ > + env = getenv(MLX5_SHUT_UP_BF); > + value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET; > + if (config->dbnc == MLX5_ARG_UNSET) > + setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, > 1); > + else > + setenv(MLX5_SHUT_UP_BF, config->dbnc ? "1" : "0", 1); > + return value; > +} > + > +static void > +mlx5_restore_doorbell_mapping_env(const struct mlx5_dev_config > *config, > + int value) > +{ > + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); > + if (config->dbnc == MLX5_ARG_UNSET) > + return; > + /* Restore the original environment variable state. */ > + if (value == MLX5_ARG_UNSET) > + unsetenv(MLX5_SHUT_UP_BF); > + else > + setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1); } > + > /** > * Allocate shared IB device context. If there is multiport device the > * master and representors will share this context, if there is single @@ - > 434,22 +471,26 @@ struct mlx5_flow_id_pool * > * > * @param[in] spawn > * Pointer to the IB device attributes (name, port, etc). > + * @param[in] config > + * Pointer to device configuration structure. > * > * @return > * Pointer to mlx5_ibv_shared object on success, > * otherwise NULL and rte_errno is set. > */ > static struct mlx5_ibv_shared * > -mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn) > +mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn, > + const struct mlx5_dev_config *config) > { > struct mlx5_ibv_shared *sh; > + int dbmap_env; > int err = 0; > uint32_t i; > #ifdef HAVE_IBV_FLOW_DV_SUPPORT > struct mlx5_devx_tis_attr tis_attr = { 0 }; #endif > > -assert(spawn); > + assert(spawn); > /* Secondary process should not create the shared context. 
*/ > assert(rte_eal_process_type() == RTE_PROC_PRIMARY); > pthread_mutex_lock(&mlx5_ibv_list_mutex); > @@ -472,16 +513,31 @@ struct mlx5_flow_id_pool * > rte_errno = ENOMEM; > goto exit; > } > + /* > + * Configure environment variable "MLX5_BF_SHUT_UP" > + * before the device creation. The rdma_core library > + * checks the variable at device creation and > + * stores the result internally. > + */ > + dbmap_env = mlx5_config_doorbell_mapping_env(config); > /* Try to open IB device with DV first, then usual Verbs. */ > errno = 0; > sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev); > if (sh->ctx) { > sh->devx = 1; > DRV_LOG(DEBUG, "DevX is supported"); > + /* The device is created, no need for environment. */ > + mlx5_restore_doorbell_mapping_env(config, dbmap_env); > } else { > + /* The environment variable is still configured. */ > sh->ctx = mlx5_glue->open_device(spawn->ibv_dev); > + err = errno ? errno : ENODEV; > + /* > + * The environment variable is not needed anymore, > + * all device creation attempts are completed. > + */ > + mlx5_restore_doorbell_mapping_env(config, dbmap_env); > if (!sh->ctx) { > - err = errno ? 
errno : ENODEV; > goto error; > } > DRV_LOG(DEBUG, "DevX is NOT supported"); @@ -1300,6 > +1356,8 @@ struct mlx5_flow_id_pool * > DRV_LOG(WARNING, "%s: deprecated parameter, ignored", > key); > } else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) { > config->mps = !!tmp; > + } else if (strcmp(MLX5_TX_DB_NC, key) == 0) { > + config->dbnc = !!tmp; > } else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) { > DRV_LOG(WARNING, "%s: deprecated parameter, ignored", > key); > } else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) { @@ - > 1373,6 +1431,7 @@ struct mlx5_flow_id_pool * > MLX5_TXQ_MPW_EN, > MLX5_TXQ_MPW_HDR_DSEG_EN, > MLX5_TXQ_MAX_INLINE_LEN, > + MLX5_TX_DB_NC, > MLX5_TX_VEC_EN, > MLX5_RX_VEC_EN, > MLX5_L3_VXLAN_EN, > @@ -1938,7 +1997,20 @@ struct mlx5_flow_id_pool * > eth_dev->tx_pkt_burst = > mlx5_select_tx_function(eth_dev); > return eth_dev; > } > - sh = mlx5_alloc_shared_ibctx(spawn); > + /* > + * Some parameters ("tx_db_nc" in particularly) are needed in > + * advance to create dv/verbs device context. We proceed the > + * devargs here to get ones, and later proceed devargs again > + * to override some hardware settings. > + */ > + err = mlx5_args(&config, dpdk_dev->devargs); > + if (err) { > + err = rte_errno; > + DRV_LOG(ERR, "failed to process device arguments: %s", > + strerror(rte_errno)); > + goto error; > + } > + sh = mlx5_alloc_shared_ibctx(spawn, &config); > if (!sh) > return NULL; > config.devx = sh->devx; > @@ -2180,13 +2252,8 @@ struct mlx5_flow_id_pool * > } > own_domain_id = 1; > } > - err = mlx5_args(&config, dpdk_dev->devargs); > - if (err) { > - err = rte_errno; > - DRV_LOG(ERR, "failed to process device arguments: %s", > - strerror(rte_errno)); > - goto error; > - } > + /* Override some values set by hardware configuration. 
*/ > + mlx5_args(&config, dpdk_dev->devargs); > err = mlx5_dev_check_sibling_config(priv, &config); > if (err) > goto error; > @@ -3031,6 +3098,7 @@ struct mlx5_flow_id_pool * > dev_config = (struct mlx5_dev_config){ > .hw_padding = 0, > .mps = MLX5_ARG_UNSET, > + .dbnc = MLX5_ARG_UNSET, > .rx_vec_en = 1, > .txq_inline_max = MLX5_ARG_UNSET, > .txq_inline_min = MLX5_ARG_UNSET, > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h index > e8148ce..1a92c10 100644 > --- a/drivers/net/mlx5/mlx5.h > +++ b/drivers/net/mlx5/mlx5.h > @@ -267,6 +267,7 @@ struct mlx5_dev_config { > /* Rx queue count threshold to enable MPRQ. */ > } mprq; /* Configurations for Multi-Packet RQ. */ > int mps; /* Multi-packet send supported mode. */ > + int dbnc; /* Skip doorbell register write barrier. */ > unsigned int flow_prio; /* Number of flow priorities. */ > enum modify_reg flow_mreg_c[MLX5_MREG_C_NUM]; > /* Availibility of mreg_c's. */ > diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h > index 0ef532f..03060aa 100644 > --- a/drivers/net/mlx5/mlx5_defs.h > +++ b/drivers/net/mlx5/mlx5_defs.h > @@ -123,6 +123,22 @@ > #define MLX5_UAR_PAGE_NUM_MAX 64 > #define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) - > 1) > > +/* Fields of memory mapping type in offset parameter of mmap() */ > +#define MLX5_UAR_MMAP_CMD_SHIFT 8 #define > MLX5_UAR_MMAP_CMD_MASK 0xff > + > +/* Environment variable to control the doorbell register mapping. */ > +#define MLX5_SHUT_UP_BF "MLX5_SHUT_UP_BF" > +#if defined(RTE_ARCH_ARM64) > +#define MLX5_SHUT_UP_BF_DEFAULT "0" > +#else > +#define MLX5_SHUT_UP_BF_DEFAULT "1" > +#endif > + > +#ifndef HAVE_MLX5DV_MMAP_GET_NC_PAGES_CMD #define > +MLX5_MMAP_GET_NC_PAGES_CMD 3 #endif > + > /* Log 2 of the default number of strides per WQE for Multi-Packet RQ. 
*/ > #define MLX5_MPRQ_STRIDE_NUM_N 6U > > diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c > index 8bc0542..1ea5960 100644 > --- a/drivers/net/mlx5/mlx5_rxtx.c > +++ b/drivers/net/mlx5/mlx5_rxtx.c > @@ -4754,8 +4754,23 @@ enum mlx5_txcmp_code { > * to improve latencies. The pure software related data treatment > * can be completed after doorbell. Tx CQEs for this SQ are > * processed in this thread only by the polling. > + * > + * The rdma core library can map doorbell register in two ways, > + * depending on the environment variable "MLX5_SHUT_UP_BF": > + * > + * - as regular cached memory, the variable is either missing or > + * set to zero. This type of mapping may cause the significant > + * doorbell register writing latency and requires explicit > + * memory write barrier to mitigate this issue and prevent > + * write combining. > + * > + * - as non-cached memory, the variable is present and set to > + * not "0" value. This type of mapping may cause performance > + * impact under heavy loading conditions but the explicit write > + * memory barrier is not required and it may improve core > + * performance. > */ > - mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, 0); > + mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc); > /* Not all of the mbufs may be stored into elts yet. */ > part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - > loc.pkts_copy; > if (!MLX5_TXOFF_CONFIG(INLINE) && part) { diff --git > a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index > 559d225..88c50aa 100644 > --- a/drivers/net/mlx5/mlx5_rxtx.h > +++ b/drivers/net/mlx5/mlx5_rxtx.h > @@ -287,6 +287,7 @@ struct mlx5_txq_data { > /* When set TX offload for tunneled packets are supported. */ > uint16_t swp_en:1; /* Whether SW parser is enabled. */ > uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */ > + uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */ > uint16_t inlen_send; /* Ordinary send data inline size. 
*/ > uint16_t inlen_empw; /* eMPW max packet size to inline. */ > uint16_t inlen_mode; /* Minimal data length to inline. */ diff --git > a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c index > 97991f0..a0d6164 100644 > --- a/drivers/net/mlx5/mlx5_txq.c > +++ b/drivers/net/mlx5/mlx5_txq.c > @@ -18,6 +18,7 @@ > #pragma GCC diagnostic ignored "-Wpedantic" > #endif > #include <infiniband/verbs.h> > +#include <infiniband/mlx5dv.h> > #ifdef PEDANTIC > #pragma GCC diagnostic error "-Wpedantic" > #endif > @@ -302,6 +303,28 @@ > } > > /** > + * Configure the doorbell register non-cached attribute. > + * > + * @param txq_ctrl > + * Pointer to Tx queue control structure. > + * @param page_size > + * System page size > + */ > +static void > +txq_uar_ncattr_init(struct mlx5_txq_ctrl *txq_ctrl, size_t page_size) { > + unsigned int cmd; > + > + txq_ctrl->txq.db_nc = 0; > + /* Check the doorbell register mapping type. */ > + cmd = txq_ctrl->uar_mmap_offset / page_size; > + cmd >>= MLX5_UAR_MMAP_CMD_SHIFT; > + cmd &= MLX5_UAR_MMAP_CMD_MASK; > + if (cmd == MLX5_MMAP_GET_NC_PAGES_CMD) > + txq_ctrl->txq.db_nc = 1; > +} > + > +/** > * Initialize Tx UAR registers for primary process. 
> * > * @param txq_ctrl > @@ -312,9 +335,9 @@ > { > struct mlx5_priv *priv = txq_ctrl->priv; > struct mlx5_proc_priv *ppriv = MLX5_PROC_PRIV(PORT_ID(priv)); > + const size_t page_size = sysconf(_SC_PAGESIZE); > #ifndef RTE_ARCH_64 > unsigned int lock_idx; > - const size_t page_size = sysconf(_SC_PAGESIZE); > #endif > > if (txq_ctrl->type != MLX5_TXQ_TYPE_STANDARD) @@ -322,6 +345,7 > @@ > assert(rte_eal_process_type() == RTE_PROC_PRIMARY); > assert(ppriv); > ppriv->uar_table[txq_ctrl->txq.idx] = txq_ctrl->bf_reg; > + txq_uar_ncattr_init(txq_ctrl, page_size); > #ifndef RTE_ARCH_64 > /* Assign an UAR lock according to UAR page number */ > lock_idx = (txq_ctrl->uar_mmap_offset / page_size) & @@ -375,6 > +399,7 @@ > } > addr = RTE_PTR_ADD(addr, offset); > ppriv->uar_table[txq->idx] = addr; > + txq_uar_ncattr_init(txq_ctrl, page_size); > return 0; > } > > -- > 1.8.3.1
Patch applied to next-net-mlx. Kindest regards, Raslan Darawsheh