Peer-to-peer DMA between an mlx5 NIC and a foreign PCIe endpoint (typically a GPU or a vfio-pci passthrough device) traverses the host PCIe fabric. The endpoint exporting the dma-buf knows which PCIe TLP Processing Hint (TPH) Steering Tag (ST) yields the best placement for the traffic it will sink: per-endpoint hint selection lets the root complex or switch direct DMA to a specific cache slice / NUMA node, cutting cross-socket snoop traffic and DRAM pressure under sustained p2p workloads.
Until now the mlx5 importer had no way to learn the exporter's chosen Steering Tag, so dma-buf MRs were registered without TPH and ran with the default (no-hint) routing. With dma_buf_get_pci_tph() in place, wire up mlx5_ib to query that metadata at MR registration time for p2p access and use it to program requester-side TPH on the outbound mkey. If the exporter has no metadata, fall back to the existing no-TPH path so behavior for non-TPH-aware exporters is unchanged. Use mlx5_st_alloc_index_by_tag() to translate exporter-provided steering tags into local ST entries when table mode is active, and add mlx5_st_get_index() for DMAH-backed flows that already carry an ST index. For TPH-backed FRMRs, keep the extra ST-table reference tied to MR lifetime rather than pooled mkey lifetime: acquire the reference before MR creation and release it when the MR is returned to the pool or the backing mkey is destroyed, while leaving the generic FRMR pool core unchanged. Import the DMA_BUF namespace for the new dma_buf_get_pci_tph() call so modular mlx5_ib builds link cleanly.
Signed-off-by: Zhiping Zhang <[email protected]> --- drivers/infiniband/hw/mlx5/main.c | 1 + drivers/infiniband/hw/mlx5/mr.c | 103 +++++++++++++++++- .../net/ethernet/mellanox/mlx5/core/lib/st.c | 49 +++++++-- include/linux/mlx5/driver.h | 13 ++ 4 files changed, 157 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 02809114fc79..a2b497f6b16b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -60,6 +60,7 @@ MODULE_AUTHOR("Eli Cohen <[email protected]>"); MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) IB driver"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_IMPORT_NS("DMA_BUF"); struct mlx5_ib_event_work { struct work_struct work; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index e6b74955d95d..7aced3f55456 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -39,6 +39,7 @@ #include <linux/delay.h> #include <linux/dma-buf.h> #include <linux/dma-resv.h> +#include <linux/pci-tph.h> #include <rdma/frmr_pools.h> #include <rdma/ib_umem_odp.h> #include "dm.h" @@ -167,6 +168,32 @@ static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev, #define MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK GENMASK_ULL(23, 16) #define MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK GENMASK_ULL(15, 0) +static int mlx5_ib_get_frmr_st_handle_ref(struct mlx5_ib_dev *dev, + u16 st_index) +{ + if (st_index == MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) + return 0; + + return mlx5_st_get_index(dev->mdev, st_index); +} + +static void mlx5_ib_put_st_index_ref(struct mlx5_ib_dev *dev, u16 st_index) +{ + if (st_index == MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) + return; + + mlx5_st_dealloc_index(dev->mdev, st_index); +} + +static void mlx5_ib_put_frmr_st_handle_ref(struct mlx5_ib_dev *dev, + u64 kernel_vendor_key) +{ + u16 st_index = FIELD_GET(MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK, + kernel_vendor_key); + + 
mlx5_ib_put_st_index_ref(dev, st_index); +} + static struct mlx5_ib_mr * _mlx5_frmr_pool_alloc(struct mlx5_ib_dev *dev, struct ib_umem *umem, int access_flags, int access_mode, @@ -218,7 +245,9 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 0 : MLX5_FRMR_POOLS_KEY_ACCESS_MODE_KSM_MASK, .num_dma_blocks = ndescs, - .kernel_vendor_key = 0, /* no PH and no ST index */ + .kernel_vendor_key = + FIELD_PREP(MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK, + MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX), }; struct mlx5_ib_mr *mr; int ret; @@ -335,6 +364,7 @@ static int mlx5r_build_frmr_key(struct ib_device *device, get_unchangeable_access_flags(dev, in->access_flags); out->vendor_key = in->vendor_key; out->num_dma_blocks = in->num_dma_blocks; + out->kernel_vendor_key = in->kernel_vendor_key; return 0; } @@ -557,6 +587,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, mr->ibmr.pd = pd; mr->access_flags = access_flags; mr->page_shift = order_base_2(page_size); + mr->ibmr.frmr.key.kernel_vendor_key = + FIELD_PREP(MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK, st_index) | + FIELD_PREP(MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK, ph); inlen = MLX5_ST_SZ_BYTES(create_mkey_in); if (populate) @@ -755,6 +788,12 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { + err = mlx5_ib_get_frmr_st_handle_ref(dev, st_index); + if (err) { + ib_umem_release(umem); + return ERR_PTR(err); + } + mr = alloc_cacheable_mr(pd, umem, iova, access_flags, MLX5_MKC_ACCESS_MODE_MTT, st_index, ph); @@ -769,6 +808,8 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { + if (xlt_with_umr) + mlx5_ib_put_st_index_ref(dev, st_index); ib_umem_release(umem); return ERR_CAST(mr); } @@ -903,6 +944,52 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { .invalidate_mappings = 
mlx5_ib_dmabuf_invalidate_cb, }; +static void get_pci_tph_mr_dmabuf(struct mlx5_ib_dev *dev, struct dma_buf *dmabuf, + u16 *st_index, u8 *ph) +{ + u16 local_st_index; + u16 steering_tag; + u8 local_ph; + bool extended; + int ret; + + switch (pcie_tph_enabled_req_type(dev->mdev->pdev)) { + case PCI_TPH_REQ_TPH_ONLY: + extended = false; + break; + case PCI_TPH_REQ_EXT_TPH: + extended = true; + break; + default: + return; + } + + dma_resv_lock(dmabuf->resv, NULL); + ret = dma_buf_get_pci_tph(dmabuf, extended, &steering_tag, &local_ph); + dma_resv_unlock(dmabuf->resv); + if (ret) { + if (ret != -EOPNOTSUPP) + mlx5_ib_dbg(dev, "get_pci_tph failed (%d)\n", ret); + return; + } + + ret = mlx5_st_alloc_index_by_tag(dev->mdev, steering_tag, + &local_st_index); + if (ret) { + mlx5_ib_dbg(dev, "st_alloc_index_by_tag failed (%d)\n", ret); + return; + } + + *st_index = local_st_index; + *ph = local_ph; +} + +static void mlx5_ib_mr_put_frmr_st_handle_ref(struct mlx5_ib_mr *mr) +{ + mlx5_ib_put_frmr_st_handle_ref(mr_to_mdev(mr), + mr->ibmr.frmr.key.kernel_vendor_key); +} + static struct ib_mr * reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, u64 offset, u64 length, u64 virt_addr, @@ -945,12 +1032,22 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, ph = dmah->ph; if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS)) st_index = mdmah->st_index; + + err = mlx5_ib_get_frmr_st_handle_ref(dev, st_index); + if (err) { + ib_umem_release(&umem_dmabuf->umem); + return ERR_PTR(err); + } + } else { + get_pci_tph_mr_dmabuf(dev, umem_dmabuf->attach->dmabuf, + &st_index, &ph); } mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, access_flags, access_mode, st_index, ph); if (IS_ERR(mr)) { + mlx5_ib_put_st_index_ref(dev, st_index); ib_umem_release(&umem_dmabuf->umem); return ERR_CAST(mr); } @@ -1405,6 +1502,7 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr) if (mr->ibmr.frmr.pool) { if (!mlx5_umr_revoke_mr_with_lock(mr)) { 
ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr); + mlx5_ib_mr_put_frmr_st_handle_ref(mr); return 0; } } @@ -1432,6 +1530,9 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr) if (mr->ibmr.frmr.pool && !ret) ib_frmr_pool_drop(&mr->ibmr); + if (!ret) + mlx5_ib_mr_put_frmr_st_handle_ref(mr); + return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c index 7cedc348790d..877b37b4e639 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c @@ -92,23 +92,18 @@ void mlx5_st_destroy(struct mlx5_core_dev *dev) kfree(st); } -int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, - unsigned int cpu_uid, u16 *st_index) +int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag, + u16 *st_index) { struct mlx5_st_idx_data *idx_data; struct mlx5_st *st = dev->st; unsigned long index; u32 xa_id; - u16 tag; - int ret; + int ret = 0; if (!st) return -EOPNOTSUPP; - ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag); - if (ret) - return ret; - if (st->direct_mode) { *st_index = tag; return 0; @@ -152,8 +147,46 @@ int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, mutex_unlock(&st->lock); return ret; } +EXPORT_SYMBOL_GPL(mlx5_st_alloc_index_by_tag); + +int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, + unsigned int cpu_uid, u16 *st_index) +{ + u16 tag; + int ret; + + ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag); + if (ret) + return ret; + + return mlx5_st_alloc_index_by_tag(dev, tag, st_index); +} EXPORT_SYMBOL_GPL(mlx5_st_alloc_index); +int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index) +{ + struct mlx5_st_idx_data *idx_data; + struct mlx5_st *st = dev->st; + int ret = 0; + + if (!st) + return -EOPNOTSUPP; + + if (st->direct_mode) + return 0; + + mutex_lock(&st->lock); + idx_data = xa_load(&st->idx_xa, st_index); + if 
(WARN_ON_ONCE(!idx_data)) + ret = -EINVAL; + else + refcount_inc(&idx_data->usecount); + mutex_unlock(&st->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_st_get_index); + int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index) { struct mlx5_st_idx_data *idx_data; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b1871c0821d0..b3295b338267 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1182,10 +1182,23 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type u64 length, u16 uid, phys_addr_t addr, u32 obj_id); #ifdef CONFIG_PCIE_TPH +int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag, + u16 *st_index); +int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index); int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, unsigned int cpu_uid, u16 *st_index); int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index); #else +static inline int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, + u16 tag, u16 *st_index) +{ + return -EOPNOTSUPP; +} + +static inline int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index) +{ + return -EOPNOTSUPP; +} static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, unsigned int cpu_uid, u16 *st_index) -- 2.53.0-Meta
