Register reset_notify and resume_notify callbacks so the RDMA driver
is informed when the MANA service undergoes a reset cycle.
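On reset notification:
- Acquire reset_rwsem write lock to serialize with resource creation
- Walk every tracked ucontext and invalidate firmware handles for
all PD, CQ, WQ, QP, and MR resources (set to INVALID_MANA_HANDLE),
remove the CQ callbacks, and mark the ucontext doorbell as
INVALID_DOORBELL
- Release reset_rwsem write lock, unblocking new resource creation
- Revoke user doorbell mappings with rdma_user_mmap_disassociate()
- Dispatch IB_EVENT_PORT_ERR for each port so userspace
(e.g. DPDK) learns about the reset
On resume notification:
- Dispatch IB_EVENT_PORT_ACTIVE for each port so userspace learns
that the ports are back up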
Resource creation paths (alloc_ucontext, alloc_pd, create_cq, create_wq,
create_qp for RAW_PACKET, reg_user_mr) acquire reset_rwsem read lock to
ensure handles are not invalidated while being set up.
Signed-off-by: Long Li <[email protected]>
---
drivers/infiniband/hw/mana/cq.c | 15 ++-
drivers/infiniband/hw/mana/device.c | 103 ++++++++++++++++++
drivers/infiniband/hw/mana/main.c | 9 ++
drivers/infiniband/hw/mana/mana_ib.h | 2 +
drivers/infiniband/hw/mana/mr.c | 4 +
drivers/infiniband/hw/mana/qp.c | 5 +
drivers/infiniband/hw/mana/wq.c | 4 +
drivers/net/ethernet/microsoft/mana/mana_en.c | 14 ++-
include/net/mana/gdma.h | 6 +
9 files changed, 155 insertions(+), 7 deletions(-)
diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
index 89cf60987ff5..b054684b8de7 100644
--- a/drivers/infiniband/hw/mana/cq.c
+++ b/drivers/infiniband/hw/mana/cq.c
@@ -41,13 +41,17 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct
ib_cq_init_attr *attr,
ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
return -EINVAL;
}
+ }
+
+ down_read(&mdev->reset_rwsem);
+ if (udata) {
cq->cqe = attr->cqe;
err = mana_ib_create_queue(mdev, ucmd.buf_addr, cq->cqe *
COMP_ENTRY_SIZE,
&cq->queue);
if (err) {
ibdev_dbg(ibdev, "Failed to create queue for create cq,
%d\n", err);
- return err;
+ goto err_unlock;
}
mana_ucontext = rdma_udata_to_drv_context(udata, struct
mana_ib_ucontext,
@@ -56,14 +60,15 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct
ib_cq_init_attr *attr,
} else {
if (attr->cqe > U32_MAX / COMP_ENTRY_SIZE / 2 + 1) {
ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
- return -EINVAL;
+ err = -EINVAL;
+ goto err_unlock;
}
buf_size = MANA_PAGE_ALIGN(roundup_pow_of_two(attr->cqe *
COMP_ENTRY_SIZE));
cq->cqe = buf_size / COMP_ENTRY_SIZE;
err = mana_ib_create_kernel_queue(mdev, buf_size, GDMA_CQ,
&cq->queue);
if (err) {
ibdev_dbg(ibdev, "Failed to create kernel queue for
create cq, %d\n", err);
- return err;
+ goto err_unlock;
}
doorbell = mdev->gdma_dev->doorbell;
}
@@ -105,6 +110,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct
ib_cq_init_attr *attr,
mutex_unlock(&mana_ucontext->lock);
}
+ up_read(&mdev->reset_rwsem);
return 0;
err_remove_cq_cb:
@@ -113,7 +119,8 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct
ib_cq_init_attr *attr,
mana_ib_gd_destroy_cq(mdev, cq);
err_destroy_queue:
mana_ib_destroy_queue(mdev, &cq->queue);
-
+err_unlock:
+ up_read(&mdev->reset_rwsem);
return err;
}
diff --git a/drivers/infiniband/hw/mana/device.c
b/drivers/infiniband/hw/mana/device.c
index 149e8d4d5b8e..081be31563ca 100644
--- a/drivers/infiniband/hw/mana/device.c
+++ b/drivers/infiniband/hw/mana/device.c
@@ -103,6 +103,7 @@ static int mana_ib_netdev_event(struct notifier_block *this,
netdev_put(ndev, &dev->dev_tracker);
return NOTIFY_OK;
+
default:
return NOTIFY_DONE;
}
@@ -110,6 +111,93 @@ static int mana_ib_netdev_event(struct notifier_block
*this,
return NOTIFY_DONE;
}
+/*
+ * Reset cleanup: invalidate firmware handles for all tracked user objects.
+ *
+ * Called during service reset BEFORE dispatching IB_EVENT_PORT_ERR to
+ * user-mode.
+ *
+ * Only invalidates FW handles — does NOT free kernel resources (umem, queues)
+ * or remove objects from lists. The IB core's destroy callbacks handle full
+ * resource teardown when user-space closes the uverbs FD or
+ * ib_unregister_device() is called. The destroy callbacks skip FW commands
+ * when the handle is already INVALID_MANA_HANDLE.
+ *
+ * For CQs, also removes the CQ callback to prevent stale completions.
+ */
+static void mana_ib_reset_notify(void *ctx)
+{
+ struct mana_ib_dev *mdev = ctx;
+ struct mana_ib_ucontext *uctx;
+ struct mana_ib_qp *qp;
+ struct mana_ib_wq *wq;
+ struct mana_ib_cq *cq;
+ struct mana_ib_mr *mr;
+ struct mana_ib_pd *pd;
+ struct ib_event ibev;
+ int i;
+
+ down_write(&mdev->reset_rwsem);
+
+ ibdev_dbg(&mdev->ib_dev, "reset cleanup starting\n");
+
+ mutex_lock(&mdev->ucontext_lock);
+ list_for_each_entry(uctx, &mdev->ucontext_list, dev_list) {
+ mutex_lock(&uctx->lock);
+
+ list_for_each_entry(qp, &uctx->qp_list, ucontext_list)
+ qp->qp_handle = INVALID_MANA_HANDLE;
+
+ list_for_each_entry(wq, &uctx->wq_list, ucontext_list)
+ wq->rx_object = INVALID_MANA_HANDLE;
+
+ list_for_each_entry(cq, &uctx->cq_list, ucontext_list) {
+ mana_ib_remove_cq_cb(mdev, cq);
+ cq->cq_handle = INVALID_MANA_HANDLE;
+ }
+
+ list_for_each_entry(mr, &uctx->mr_list, ucontext_list)
+ mr->mr_handle = INVALID_MANA_HANDLE;
+
+ list_for_each_entry(pd, &uctx->pd_list, ucontext_list)
+ pd->pd_handle = INVALID_MANA_HANDLE;
+
+ uctx->doorbell = INVALID_DOORBELL;
+
+ mutex_unlock(&uctx->lock);
+ }
+ mutex_unlock(&mdev->ucontext_lock);
+
+ up_write(&mdev->reset_rwsem);
+
+ /* Revoke user doorbell mappings so userspace cannot ring
+ * stale doorbells after firmware handles are invalidated.
+ */
+ rdma_user_mmap_disassociate(&mdev->ib_dev);
+
+ /* Notify userspace (e.g. DPDK) that the port is down */
+ for (i = 0; i < mdev->ib_dev.phys_port_cnt; i++) {
+ ibev.device = &mdev->ib_dev;
+ ibev.element.port_num = i + 1;
+ ibev.event = IB_EVENT_PORT_ERR;
+ ib_dispatch_event(&ibev);
+ }
+}
+
+static void mana_ib_resume_notify(void *ctx)
+{
+ struct mana_ib_dev *dev = ctx;
+ struct ib_event ibev;
+ int i;
+
+ for (i = 0; i < dev->ib_dev.phys_port_cnt; i++) {
+ ibev.device = &dev->ib_dev;
+ ibev.element.port_num = i + 1;
+ ibev.event = IB_EVENT_PORT_ACTIVE;
+ ib_dispatch_event(&ibev);
+ }
+}
+
static int mana_ib_probe(struct auxiliary_device *adev,
const struct auxiliary_device_id *id)
{
@@ -134,6 +222,7 @@ static int mana_ib_probe(struct auxiliary_device *adev,
xa_init_flags(&dev->qp_table_wq, XA_FLAGS_LOCK_IRQ);
mutex_init(&dev->ucontext_lock);
INIT_LIST_HEAD(&dev->ucontext_list);
+ init_rwsem(&dev->reset_rwsem);
if (mana_ib_is_rnic(dev)) {
dev->ib_dev.phys_port_cnt = 1;
@@ -216,6 +305,15 @@ static int mana_ib_probe(struct auxiliary_device *adev,
dev_set_drvdata(&adev->dev, dev);
+ /* ETH device persists across reset — use callback for cleanup.
+ * RNIC device is removed/re-added, so its cleanup happens in remove.
+ */
+ if (!mana_ib_is_rnic(dev)) {
+ mdev->reset_notify = mana_ib_reset_notify;
+ mdev->resume_notify = mana_ib_resume_notify;
+ mdev->reset_notify_ctx = dev;
+ }
+
return 0;
deallocate_pool:
@@ -242,6 +340,11 @@ static void mana_ib_remove(struct auxiliary_device *adev)
if (mana_ib_is_rnic(dev))
mana_drain_gsi_sqs(dev);
+ if (!mana_ib_is_rnic(dev)) {
+ dev->gdma_dev->reset_notify = NULL;
+ dev->gdma_dev->resume_notify = NULL;
+ dev->gdma_dev->reset_notify_ctx = NULL;
+ }
ib_unregister_device(&dev->ib_dev);
dma_pool_destroy(dev->av_pool);
if (mana_ib_is_rnic(dev)) {
diff --git a/drivers/infiniband/hw/mana/main.c
b/drivers/infiniband/hw/mana/main.c
index f739e6da5435..61ce30aa9cb2 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -81,6 +81,8 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata
*udata)
dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
gc = mdev_to_gc(dev);
+ down_read(&dev->reset_rwsem);
+
mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req),
sizeof(resp));
@@ -98,6 +100,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata
*udata)
if (!err)
err = -EPROTO;
+ up_read(&dev->reset_rwsem);
return err;
}
@@ -118,6 +121,7 @@ int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata
*udata)
mutex_unlock(&mana_ucontext->lock);
}
+ up_read(&dev->reset_rwsem);
return 0;
}
@@ -230,10 +234,13 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
gc = mdev_to_gc(mdev);
+ down_read(&mdev->reset_rwsem);
+
/* Allocate a doorbell page index */
ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page);
if (ret) {
ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret);
+ up_read(&mdev->reset_rwsem);
return ret;
}
@@ -252,6 +259,8 @@ int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
list_add_tail(&ucontext->dev_list, &mdev->ucontext_list);
mutex_unlock(&mdev->ucontext_lock);
+ up_read(&mdev->reset_rwsem);
+
return 0;
}
diff --git a/drivers/infiniband/hw/mana/mana_ib.h
b/drivers/infiniband/hw/mana/mana_ib.h
index ce5c6c030fb2..29201cf3274c 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -86,6 +86,8 @@ struct mana_ib_dev {
/* Protects ucontext_list */
struct mutex ucontext_lock;
struct list_head ucontext_list;
+ /* Serializes resource create callbacks vs reset cleanup */
+ struct rw_semaphore reset_rwsem;
};
struct mana_ib_wq {
diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
index 559bb4f7c31d..7189ccd41576 100644
--- a/drivers/infiniband/hw/mana/mr.c
+++ b/drivers/infiniband/hw/mana/mr.c
@@ -141,6 +141,8 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64
start, u64 length,
if (!mr)
return ERR_PTR(-ENOMEM);
+ down_read(&dev->reset_rwsem);
+
mr->umem = ib_umem_get(ibdev, start, length, access_flags);
if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem);
@@ -195,6 +197,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64
start, u64 length,
mutex_unlock(&mana_ucontext->lock);
}
+ up_read(&dev->reset_rwsem);
return &mr->ibmr;
err_dma_region:
@@ -204,6 +207,7 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64
start, u64 length,
ib_umem_release(mr->umem);
err_free:
+ up_read(&dev->reset_rwsem);
kfree(mr);
return ERR_PTR(err);
}
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 315bc54d8ae6..d590aca9b93a 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -701,12 +701,16 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct
ib_qp_init_attr *attr,
struct ib_udata *udata)
{
struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
+ struct mana_ib_dev *mdev =
+ container_of(ibqp->device, struct mana_ib_dev, ib_dev);
int err;
INIT_LIST_HEAD(&qp->ucontext_list);
switch (attr->qp_type) {
case IB_QPT_RAW_PACKET:
+ down_read(&mdev->reset_rwsem);
+
/* When rwq_ind_tbl is used, it's for creating WQs for RSS */
if (attr->rwq_ind_tbl)
err = mana_ib_create_qp_rss(ibqp, ibqp->pd, attr,
@@ -724,6 +728,7 @@ int mana_ib_create_qp(struct ib_qp *ibqp, struct
ib_qp_init_attr *attr,
mutex_unlock(&mana_ucontext->lock);
}
+ up_read(&mdev->reset_rwsem);
return err;
case IB_QPT_RC:
return mana_ib_create_rc_qp(ibqp, ibqp->pd, attr, udata);
diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c
index 1af9869933aa..67b757cf30f9 100644
--- a/drivers/infiniband/hw/mana/wq.c
+++ b/drivers/infiniband/hw/mana/wq.c
@@ -31,6 +31,8 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr);
+ down_read(&mdev->reset_rwsem);
+
err = mana_ib_create_queue(mdev, ucmd.wq_buf_addr, ucmd.wq_buf_size,
&wq->queue);
if (err) {
ibdev_dbg(&mdev->ib_dev,
@@ -52,9 +54,11 @@ struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
mutex_unlock(&mana_ucontext->lock);
}
+ up_read(&mdev->reset_rwsem);
return &wq->ibwq;
err_free_wq:
+ up_read(&mdev->reset_rwsem);
kfree(wq);
return ERR_PTR(err);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
b/drivers/net/ethernet/microsoft/mana/mana_en.c
index ea71de39f996..3493b36426f7 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -3659,15 +3659,19 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
}
}
- err = add_adev(gd, "eth");
+ if (!resuming)
+ err = add_adev(gd, "eth");
INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
schedule_delayed_work(&ac->gf_stats_work, MANA_GF_STATS_PERIOD);
-
out:
if (err) {
mana_remove(gd, false);
} else {
+ /* Notify IB layer that ports are back up after reset */
+ if (resuming && gd->resume_notify)
+ gd->resume_notify(gd->reset_notify_ctx);
+
dev_dbg(dev, "gd=%p, id=%u, num_ports=%d, type=%u,
instance=%u\n",
gd, gd->dev_id.as_uint32, ac->num_ports,
gd->dev_id.type, gd->dev_id.instance);
@@ -3691,9 +3695,13 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
cancel_delayed_work_sync(&ac->gf_stats_work);
/* adev currently doesn't support suspending, always remove it */
- if (gd->adev)
+ if (gd->adev && !suspending)
remove_adev(gd);
+ /* Notify IB layer before tearing down net devices during reset */
+ if (suspending && gd->reset_notify)
+ gd->reset_notify(gd->reset_notify_ctx);
+
for (i = 0; i < ac->num_ports; i++) {
ndev = ac->ports[i];
if (!ndev) {
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index ec17004b10c0..9187c5b4d0d1 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -249,6 +249,12 @@ struct gdma_dev {
struct auxiliary_device *adev;
bool is_suspended;
bool rdma_teardown;
+
+ /* Called by mana_remove() during reset to notify IB layer */
+ void (*reset_notify)(void *ctx);
+ /* Called by mana_probe() during resume to notify IB layer */
+ void (*resume_notify)(void *ctx);
+ void *reset_notify_ctx;
};
/* MANA_PAGE_SIZE is the DMA unit */
--
2.43.0