Query dma-buf TPH metadata when registering a dma-buf MR for peer-to-
peer access and translate the returned steering tag into an mlx5 ST
index. Keep the DMAH path as the first priority and only fall back to
dma-buf metadata when no DMAH is supplied.

Split the existing mlx5_st_alloc_index() into mlx5_st_alloc_index_by_tag()
plus a tag-from-cpu wrapper so the dma-buf path can allocate an ST
index directly from a raw steering tag without going through the
per-CPU table. mlx5_st_alloc_index_by_tag() explicitly initialises
'ret' so the duplicate-tag fast path doesn't return an uninitialised
value, which would otherwise be observed by callers when an MR
re-uses a tag that already has an ST index allocated.

For TPH-backed FRMRs, the extra ST-table reference belongs to the
hardware mkey handle, not the transient MR object. Add mlx5_st_get_index()
and extend the FRMR pool API so ib_frmr_pool_pop() reports whether a
handle was reused and destroy_frmrs() receives the pool key. The DMAH
and dma-buf paths take a provisional ST ref before pool lookup; reuse
drops that provisional ref immediately, while newly created handles
keep it and release it only when the FRMR handle is actually destroyed,
either directly or through FRMR pool aging/cleanup.

Also decode the PH bits stored in kernel_vendor_key when recreating
pooled mkeys so the programmed requester hint matches the pool key.

Signed-off-by: Zhiping Zhang <[email protected]>
---
 drivers/infiniband/core/frmr_pools.c          |  20 ++-
 drivers/infiniband/hw/mlx5/mr.c               | 124 +++++++++++++++++-
 .../net/ethernet/mellanox/mlx5/core/lib/st.c  |  49 +++++--
 include/linux/mlx5/driver.h                   |  12 ++
 include/rdma/frmr_pools.h                     |   5 +-
 5 files changed, 191 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c
index 5e992ff3d7cf..61a77847118e 100644
--- a/drivers/infiniband/core/frmr_pools.c
+++ b/drivers/infiniband/core/frmr_pools.c
@@ -92,7 +92,8 @@ static void destroy_all_handles_in_queue(struct ib_device *device,
        u32 count;
 
        while (pop_frmr_handles_page(pool, queue, &page, &count)) {
-               pools->pool_ops->destroy_frmrs(device, page->handles, count);
+               pools->pool_ops->destroy_frmrs(device, &pool->key,
+                                              page->handles, count);
                kfree(page);
        }
 }
@@ -136,7 +137,8 @@ static bool age_pinned_pool(struct ib_device *device, struct ib_frmr_pool *pool)
        spin_unlock(&pool->lock);
 
        if (destroyed)
-               pools->pool_ops->destroy_frmrs(device, handles, destroyed);
+               pools->pool_ops->destroy_frmrs(device, &pool->key, handles,
+                                              destroyed);
        kfree(handles);
        return has_work;
 }
@@ -453,9 +455,11 @@ int ib_frmr_pools_set_pinned(struct ib_device *device, struct ib_frmr_key *key,
 }
 
 static int get_frmr_from_pool(struct ib_device *device,
-                             struct ib_frmr_pool *pool, struct ib_mr *mr)
+                             struct ib_frmr_pool *pool, struct ib_mr *mr,
+                             bool *reused)
 {
        struct ib_frmr_pools *pools = device->frmr_pools;
+       bool local_reused = false;
        u32 handle;
        int err;
 
@@ -464,6 +468,7 @@ static int get_frmr_from_pool(struct ib_device *device,
                if (pool->inactive_queue.ci > 0) {
                        handle = pop_handle_from_queue_locked(
                                &pool->inactive_queue);
+                       local_reused = true;
                } else {
                        spin_unlock(&pool->lock);
                        err = pools->pool_ops->create_frmrs(device, &pool->key,
@@ -474,6 +479,7 @@ static int get_frmr_from_pool(struct ib_device *device,
                }
        } else {
                handle = pop_handle_from_queue_locked(&pool->queue);
+               local_reused = true;
        }
 
        pool->in_use++;
@@ -484,6 +490,8 @@ static int get_frmr_from_pool(struct ib_device *device,
 
        mr->frmr.pool = pool;
        mr->frmr.handle = handle;
+       if (reused)
+               *reused = local_reused;
 
        return 0;
 }
@@ -493,10 +501,12 @@ static int get_frmr_from_pool(struct ib_device *device,
  *
  * @device: The device to pop the FRMR handle from.
  * @mr: The MR to pop the FRMR handle from.
+ * @reused: Optional output that reports whether the returned handle was
+ *         reused from the pool instead of freshly created.
  *
  * Returns 0 on success, negative error code on failure.
  */
-int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr)
+int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr, bool *reused)
 {
        struct ib_frmr_pools *pools = device->frmr_pools;
        struct ib_frmr_pool *pool;
@@ -509,7 +519,7 @@ int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr)
                        return PTR_ERR(pool);
        }
 
-       return get_frmr_from_pool(device, pool, mr);
+       return get_frmr_from_pool(device, pool, mr, reused);
 }
 EXPORT_SYMBOL(ib_frmr_pool_pop);
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 3b6da45061a5..b56df39d3385 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -38,6 +38,7 @@
 #include <linux/delay.h>
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
+#include <linux/pci-tph.h>
 #include <rdma/frmr_pools.h>
 #include <rdma/ib_umem_odp.h>
 #include "dm.h"
@@ -167,12 +168,39 @@ static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
 #define MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK 0xFF0000
 #define MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK 0xFFFF
 
+static int mlx5_ib_get_frmr_st_handle_ref(struct mlx5_ib_dev *dev,
+                                         u16 st_index)
+{
+       if (st_index == MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+               return 0;
+
+       return mlx5_st_get_index(dev->mdev, st_index);
+}
+
+static void mlx5_ib_put_st_index_ref(struct mlx5_ib_dev *dev, u16 st_index)
+{
+       if (st_index == MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+               return;
+
+       mlx5_st_dealloc_index(dev->mdev, st_index);
+}
+
+static void mlx5_ib_put_frmr_st_handle_ref(struct mlx5_ib_dev *dev,
+                                          u64 kernel_vendor_key)
+{
+       u16 st_index = kernel_vendor_key &
+                      MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK;
+
+       mlx5_ib_put_st_index_ref(dev, st_index);
+}
+
 static struct mlx5_ib_mr *
 _mlx5_frmr_pool_alloc(struct mlx5_ib_dev *dev, struct ib_umem *umem,
                      int access_flags, int access_mode,
                      unsigned long page_size, u16 st_index, u8 ph)
 {
        struct mlx5_ib_mr *mr;
+       bool reused = false;
        int err;
 
        mr = kzalloc_obj(*mr);
@@ -195,11 +223,14 @@ _mlx5_frmr_pool_alloc(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 
        mr->ibmr.frmr.key.kernel_vendor_key =
                st_index | (ph << MLX5_FRMR_POOLS_KERNEL_KEY_PH_SHIFT);
-       err = ib_frmr_pool_pop(&dev->ib_dev, &mr->ibmr);
+       err = ib_frmr_pool_pop(&dev->ib_dev, &mr->ibmr, &reused);
        if (err) {
                kfree(mr);
                return ERR_PTR(err);
        }
+       if (reused)
+               mlx5_ib_put_frmr_st_handle_ref(
+                       dev, mr->ibmr.frmr.key.kernel_vendor_key);
        mr->mmkey.key = mr->ibmr.frmr.handle;
        init_waitqueue_head(&mr->mmkey.wait);
 
@@ -229,7 +260,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
        init_waitqueue_head(&mr->mmkey.wait);
 
        mr->ibmr.frmr.key = key;
-       ret = ib_frmr_pool_pop(&dev->ib_dev, &mr->ibmr);
+       ret = ib_frmr_pool_pop(&dev->ib_dev, &mr->ibmr, NULL);
        if (ret) {
                kfree(mr);
                return ERR_PTR(ret);
@@ -273,7 +304,8 @@ static int mlx5r_create_mkeys(struct ib_device *device, struct ib_frmr_key *key,
 
        st_index = key->kernel_vendor_key &
                   MLX5_FRMR_POOLS_KERNEL_KEY_ST_INDEX_MASK;
-       ph = key->kernel_vendor_key & MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK;
+       ph = (key->kernel_vendor_key & MLX5_FRMR_POOLS_KERNEL_KEY_PH_MASK) >>
+            MLX5_FRMR_POOLS_KERNEL_KEY_PH_SHIFT;
        if (ph) {
                /* Normalize ph: swap MLX5_IB_NO_PH for 0 */
                if (ph == MLX5_IB_NO_PH)
@@ -299,7 +331,8 @@ static int mlx5r_create_mkeys(struct ib_device *device, struct ib_frmr_key *key,
        return err;
 }
 
-static void mlx5r_destroy_mkeys(struct ib_device *device, u32 *handles,
+static void mlx5r_destroy_mkeys(struct ib_device *device,
+                               const struct ib_frmr_key *key, u32 *handles,
                                unsigned int count)
 {
        struct mlx5_ib_dev *dev = to_mdev(device);
@@ -311,6 +344,9 @@ static void mlx5r_destroy_mkeys(struct ib_device *device, u32 *handles,
                        pr_warn_ratelimited(
                                "mlx5_ib: failed to destroy mkey %d: %d",
                                handles[i], err);
+               else
+                       mlx5_ib_put_frmr_st_handle_ref(dev,
+                                                      key->kernel_vendor_key);
        }
 }
 
@@ -333,6 +369,7 @@ static int mlx5r_build_frmr_key(struct ib_device *device,
                get_unchangeable_access_flags(dev, in->access_flags);
        out->vendor_key = in->vendor_key;
        out->num_dma_blocks = in->num_dma_blocks;
+       out->kernel_vendor_key = in->kernel_vendor_key;
 
        return 0;
 }
@@ -753,6 +790,12 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
 
        xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
        if (xlt_with_umr) {
+               err = mlx5_ib_get_frmr_st_handle_ref(dev, st_index);
+               if (err) {
+                       ib_umem_release(umem);
+                       return ERR_PTR(err);
+               }
+
                mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
                                        MLX5_MKC_ACCESS_MODE_MTT,
                                        st_index, ph);
@@ -767,6 +810,8 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
                mutex_unlock(&dev->slow_path_mutex);
        }
        if (IS_ERR(mr)) {
+               if (xlt_with_umr)
+                       mlx5_ib_put_st_index_ref(dev, st_index);
                ib_umem_release(umem);
                return ERR_CAST(mr);
        }
@@ -899,6 +944,65 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
        .invalidate_mappings = mlx5_ib_dmabuf_invalidate_cb,
 };
 
+/*
+ * Query TPH metadata from @dmabuf and translate the raw steering tag into
+ * an mlx5 ST index. On success *@st_index is updated with a provisional
+ * reference for a candidate FRMR handle and *@ph is updated to the dma-buf's
+ * processing hint. Callers that fail to allocate a handle, or that reuse an
+ * existing pooled handle, must drop the provisional ST reference. On any
+ * failure *@st_index and *@ph are left untouched, so the caller's no-TPH
+ * defaults stand.
+ *
+ * @dmabuf must already be referenced by the caller (e.g. via the umem's
+ * attachment); using it directly, instead of re-resolving the user's fd
+ * here, avoids a dup2() TOCTOU between umem creation and TPH lookup.
+ */
+static void get_tph_mr_dmabuf(struct mlx5_ib_dev *dev, struct dma_buf *dmabuf,
+                             u16 *st_index, u8 *ph)
+{
+       u16 local_st_index;
+       u16 steering_tag;
+       u8 local_ph;
+       bool extended;
+       int ret;
+
+       if (!dmabuf->ops->get_tph)
+               return;
+
+       switch (pcie_tph_enabled_req_type(dev->mdev->pdev)) {
+       case PCI_TPH_REQ_TPH_ONLY:
+               extended = false;
+               break;
+       case PCI_TPH_REQ_EXT_TPH:
+               extended = true;
+               break;
+       default:
+               return;
+       }
+
+       ret = dmabuf->ops->get_tph(dmabuf, extended, &steering_tag, &local_ph);
+       if (ret) {
+               mlx5_ib_dbg(dev, "get_tph failed (%d)\n", ret);
+               return;
+       }
+
+       ret = mlx5_st_alloc_index_by_tag(dev->mdev, steering_tag,
+                                        &local_st_index);
+       if (ret) {
+               mlx5_ib_dbg(dev, "st_alloc_index_by_tag failed (%d)\n", ret);
+               return;
+       }
+
+       *st_index = local_st_index;
+       *ph = local_ph;
+}
+
+static void mlx5_ib_mr_put_frmr_st_handle_ref(struct mlx5_ib_mr *mr)
+{
+       mlx5_ib_put_frmr_st_handle_ref(mr_to_mdev(mr),
+                                      mr->ibmr.frmr.key.kernel_vendor_key);
+}
+
 static struct ib_mr *
 reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
                   u64 offset, u64 length, u64 virt_addr,
@@ -941,12 +1045,22 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
                ph = dmah->ph;
                if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
                        st_index = mdmah->st_index;
+
+               err = mlx5_ib_get_frmr_st_handle_ref(dev, st_index);
+               if (err) {
+                       ib_umem_release(&umem_dmabuf->umem);
+                       return ERR_PTR(err);
+               }
+       } else {
+               get_tph_mr_dmabuf(dev, umem_dmabuf->attach->dmabuf,
+                                 &st_index, &ph);
        }
 
        mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
                                access_flags, access_mode,
                                st_index, ph);
        if (IS_ERR(mr)) {
+               mlx5_ib_put_st_index_ref(dev, st_index);
                ib_umem_release(&umem_dmabuf->umem);
                return ERR_CAST(mr);
        }
@@ -1400,6 +1514,8 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr *mr)
                dma_resv_unlock(
                        to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
        }
+       if (!ret)
+               mlx5_ib_mr_put_frmr_st_handle_ref(mr);
        return ret;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
index 7cedc348790d..877b37b4e639 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
@@ -92,23 +92,18 @@ void mlx5_st_destroy(struct mlx5_core_dev *dev)
        kfree(st);
 }
 
-int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
-                       unsigned int cpu_uid, u16 *st_index)
+int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
+                              u16 *st_index)
 {
        struct mlx5_st_idx_data *idx_data;
        struct mlx5_st *st = dev->st;
        unsigned long index;
        u32 xa_id;
-       u16 tag;
-       int ret;
+       int ret = 0;
 
        if (!st)
                return -EOPNOTSUPP;
 
-       ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
-       if (ret)
-               return ret;
-
        if (st->direct_mode) {
                *st_index = tag;
                return 0;
@@ -152,8 +147,46 @@ int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
        mutex_unlock(&st->lock);
        return ret;
 }
+EXPORT_SYMBOL_GPL(mlx5_st_alloc_index_by_tag);
+
+int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
+                       unsigned int cpu_uid, u16 *st_index)
+{
+       u16 tag;
+       int ret;
+
+       ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
+       if (ret)
+               return ret;
+
+       return mlx5_st_alloc_index_by_tag(dev, tag, st_index);
+}
 EXPORT_SYMBOL_GPL(mlx5_st_alloc_index);
 
+int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index)
+{
+       struct mlx5_st_idx_data *idx_data;
+       struct mlx5_st *st = dev->st;
+       int ret = 0;
+
+       if (!st)
+               return -EOPNOTSUPP;
+
+       if (st->direct_mode)
+               return 0;
+
+       mutex_lock(&st->lock);
+       idx_data = xa_load(&st->idx_xa, st_index);
+       if (WARN_ON_ONCE(!idx_data))
+               ret = -EINVAL;
+       else
+               refcount_inc(&idx_data->usecount);
+       mutex_unlock(&st->lock);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(mlx5_st_get_index);
+
 int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
 {
        struct mlx5_st_idx_data *idx_data;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 04b96c5abb57..0480b5c4f189 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1166,10 +1166,22 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type
                           u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
 
 #ifdef CONFIG_PCIE_TPH
+int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev, u16 tag,
+                              u16 *st_index);
+int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index);
 int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
                        unsigned int cpu_uid, u16 *st_index);
 int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index);
 #else
+static inline int mlx5_st_alloc_index_by_tag(struct mlx5_core_dev *dev,
+                                            u16 tag, u16 *st_index)
+{
+       return -EOPNOTSUPP;
+}
+static inline int mlx5_st_get_index(struct mlx5_core_dev *dev, u16 st_index)
+{
+       return -EOPNOTSUPP;
+}
 static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev,
                                      enum tph_mem_type mem_type,
                                      unsigned int cpu_uid, u16 *st_index)
diff --git a/include/rdma/frmr_pools.h b/include/rdma/frmr_pools.h
index af1b88801fa4..a08d2b2cf9f3 100644
--- a/include/rdma/frmr_pools.h
+++ b/include/rdma/frmr_pools.h
@@ -24,7 +24,8 @@ struct ib_frmr_key {
 struct ib_frmr_pool_ops {
        int (*create_frmrs)(struct ib_device *device, struct ib_frmr_key *key,
                            u32 *handles, u32 count);
-       void (*destroy_frmrs)(struct ib_device *device, u32 *handles,
+       void (*destroy_frmrs)(struct ib_device *device,
+                             const struct ib_frmr_key *key, u32 *handles,
                              u32 count);
        int (*build_key)(struct ib_device *device, const struct ib_frmr_key *in,
                         struct ib_frmr_key *out);
@@ -33,7 +34,7 @@ struct ib_frmr_pool_ops {
 int ib_frmr_pools_init(struct ib_device *device,
                       const struct ib_frmr_pool_ops *pool_ops);
 void ib_frmr_pools_cleanup(struct ib_device *device);
-int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr);
+int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr, bool *reused);
 int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr);
 
 #endif /* FRMR_POOLS_H */
-- 
2.53.0-Meta

Reply via email to