On 5/26/2026 5:43 PM, Zhiping Zhang wrote:
Query dma-buf TPH metadata when registering a dma-buf MR for peer-to-
peer access and translate the returned steering tag into an mlx5 ST
index. Keep the DMAH path as the first priority and only fall back to
DMA-buf metadata when no DMAH is supplied.
Track per-MR ownership of the allocated ST index and release it on MR
setup failure, destroy, and FRMR-pool reuse. Release the ST index before
the MR is pushed back into the FRMR pool, and free mlx5_st_idx_data when
its refcount reaches zero so repeated allocation/deallocation does not
leak memory.
Signed-off-by: Zhiping Zhang <[email protected]>
---
drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 ++
drivers/infiniband/hw/mlx5/mr.c | 86 ++++++++++++++++++-
.../net/ethernet/mellanox/mlx5/core/lib/st.c | 28 ++++--
include/linux/mlx5/driver.h | 7 ++
4 files changed, 115 insertions(+), 12 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h
b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index e156dc4d7529..4ab867392267 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -721,6 +721,12 @@ struct mlx5_ib_mr {
u8 revoked :1;
/* Indicates previous dmabuf page fault occurred */
u8 dmabuf_faulted:1;
+ /* Set when the MR owns dmabuf_st_index and must
+ * release it via mlx5_st_dealloc_index() once the
+ * firmware mkey is no longer referencing it.
+ */
The mkey's ST value is kept after revoke, regardless of ST index alloc and
dealloc. Mkeys are kept in the FRMR pool for future reuse even if their ST
index is currently stale.
+ u8 dmabuf_st_owned:1;
+ u16 dmabuf_st_index;
The st_index can be read from the FRMR pool key, so there is no need to store it again.
struct mlx5_ib_mkey null_mmkey;
};
};
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 3b6da45061a5..8059b5e4da97 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -38,6 +38,7 @@
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
+#include <linux/pci-tph.h>
#include <rdma/frmr_pools.h>
#include <rdma/ib_umem_odp.h>
#include "dm.h"
@@ -46,6 +47,8 @@
#include "data_direct.h"
#include "dmah.h"
+MODULE_IMPORT_NS("DMA_BUF");
+
static int mkey_max_umr_order(struct mlx5_ib_dev *dev)
{
if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
@@ -899,6 +902,63 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops
= {
.invalidate_mappings = mlx5_ib_dmabuf_invalidate_cb,
};
+/*
+ * Query TPH metadata from @dmabuf and translate the raw steering tag into
+ * an mlx5 ST index. No value is returned; results come back through
+ * *@st_index and *@ph. On success the caller owns *@st_index and must
+ * release it with mlx5_st_dealloc_index() once the mkey stops using it.
+ * On failure *@ph is reset to MLX5_IB_NO_PH (early exits touch nothing).
+ *
+ * @dmabuf must already be referenced by the caller (e.g. via the umem's
+ * attachment) so we don't re-resolve the user's fd here and avoid a
+ * dup2() TOCTOU between umem creation and TPH lookup.
+ */
+static void get_tph_mr_dmabuf(struct mlx5_ib_dev *dev, struct dma_buf *dmabuf,
+ u16 *st_index, u8 *ph)
+{
+ u8 req_type;
+ u16 steering_tag;
+ u8 st_width;
+ int ret;
+
+ if (!dmabuf->ops->get_tph)
+ return; /* exporter has no get_tph op: outputs stay as the caller set them */
+
+ req_type = pcie_tph_enabled_req_type(dev->mdev->pdev);
+ switch (req_type) {
+ case PCI_TPH_REQ_TPH_ONLY:
+ st_width = 8; /* tag width passed to get_tph for this requester type */
+ break;
+ case PCI_TPH_REQ_EXT_TPH:
+ st_width = 16; /* tag width passed to get_tph for this requester type */
+ break;
+ default:
+ return; /* any other requester type: skip the TPH lookup entirely */
+ }
+
+ ret = dmabuf->ops->get_tph(dmabuf, &steering_tag, ph, st_width);
+ if (ret) {
+ mlx5_ib_dbg(dev, "get_tph failed (%d)\n", ret);
+ *ph = MLX5_IB_NO_PH; /* discard whatever get_tph may have written to *ph */
+ return;
+ }
+
+ ret = mlx5_st_alloc_index_by_tag(dev->mdev, steering_tag, st_index);
+ if (ret) {
+ *ph = MLX5_IB_NO_PH; /* no ST index was obtained, so drop the PH hint too */
+ mlx5_ib_dbg(dev, "st_alloc_index_by_tag failed (%d)\n", ret);
+ }
+}
+
+static void mlx5_ib_mr_put_dmabuf_st(struct mlx5_ib_mr *mr)
+{ /* Release the ST index recorded in @mr, if @mr is flagged as owning one. */
+ if (mr->umem && mr->dmabuf_st_owned) { /* skip MRs without a umem or without ownership */
+ mlx5_st_dealloc_index(mr_to_mdev(mr)->mdev,
+ mr->dmabuf_st_index);
+ mr->dmabuf_st_owned = 0; /* clearing the flag makes a repeated call a no-op */
+ }
+}
+
static struct ib_mr *
reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
u64 offset, u64 length, u64 virt_addr,
@@ -941,16 +1001,26 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device
*dma_device,
ph = dmah->ph;
if (dmah->valid_fields & BIT(IB_DMAH_CPU_ID_EXISTS))
st_index = mdmah->st_index;
+ } else {
+ get_tph_mr_dmabuf(dev, umem_dmabuf->attach->dmabuf,
+ &st_index, &ph);
}
mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
access_flags, access_mode,
st_index, ph);
if (IS_ERR(mr)) {
+ if (!dmah && st_index !=
MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX)
+ mlx5_st_dealloc_index(dev->mdev, st_index);
ib_umem_release(&umem_dmabuf->umem);
return ERR_CAST(mr);
}
+ if (!dmah && st_index != MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX) {
+ mr->dmabuf_st_index = st_index;
+ mr->dmabuf_st_owned = 1;
+ }
+
mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
@@ -1377,9 +1447,17 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr
*mr)
bool is_odp = is_odp_mr(mr);
int ret;
- if (mr->ibmr.frmr.pool && !mlx5_umr_revoke_mr_with_lock(mr) &&
- !ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr))
- return 0;
+ if (mr->ibmr.frmr.pool && !mlx5_umr_revoke_mr_with_lock(mr)) {
+ /*
+ * The mkey has been revoked: firmware no longer references
+ * dmabuf_st_index, so release it before this mr can re-enter
+ * the FRMR cache for reuse by another registration.
+ */
+ mlx5_ib_mr_put_dmabuf_st(mr);
+
+ if (!ib_frmr_pool_push(mr->ibmr.device, &mr->ibmr))
+ return 0;
+ }
The Sashiko comment on the previous version of this series was wrong about
the concept of FRMR pools and their reuse of mkeys.
Please move the ST put operation outside the mkey cleanup flow.
if (is_odp)
mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
@@ -1400,6 +1478,8 @@ static int mlx5r_handle_mkey_cleanup(struct mlx5_ib_mr
*mr)
dma_resv_unlock(
to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
}
+ if (!ret)
+ mlx5_ib_mr_put_dmabuf_st(mr);
return ret;
}