Wire an MSI-X vector to a dedicated EQ so the mlx5 driver supports
send_msi().

Each EQ can be linked to an MSI-X vector, and the CQ can be set up
to deliver an event to the EQ. Thus, when everything is armed, an
RDMA WRITE posted to the QP generates a CQE, which generates an
EQE, which generates an MSI-X.

To keep things simple this just re-uses all the existing QPs and
CQs, so they generate single MSIs during memcpy.

send_msi() drains any accumulated MSI EQ events from prior memcpy
completions (which also re-arms the EQ), arms the CQ, posts a small
signaled RDMA Write, then polls the CQ to consume the resulting CQE
(avoiding stale completions on subsequent test cycles).

Assisted-by: Claude:claude-opus-4.6
Signed-off-by: Jason Gunthorpe <[email protected]>
---
 .../selftests/vfio/lib/drivers/mlx5/mlx5.c    | 165 +++++++++++++++++-
 .../selftests/vfio/lib/drivers/mlx5/mlx5_hw.h |   6 +
 2 files changed, 168 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
index e5e75adb253166..c8388aabb8c672 100644
--- a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
+++ b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
@@ -57,17 +57,23 @@ struct mlx5st_device {
        /* CQ */
        u32 cqn;
        u32 cq_ci;
+       u32 cq_arm_sn;
 
        /* UAR */
        u32 uar_page;
        void __iomem *uar_base;
        unsigned int uar_bf_offset;
 
-       /* EQ */
+       /* EQ (cmd/pages events — polled, not interrupt-driven) */
        u32 eqn;
        u32 eq_cons_index;
        bool have_eq;
 
+       /* MSI EQ (CQ completion events — fires MSI-X) */
+       u32 msi_eqn;
+       u32 msi_eq_cons_index;
+       bool have_msi_eq;
+
        /* Async pages slot state */
        bool pages_slot_in_use;
        bool pages_slot_is_reclaim;
@@ -91,6 +97,10 @@ struct mlx5st_device {
        bool fl_supported;
        u8 log_max_msg;
 
+       /* Buffers used by send_msi() to trigger an interrupt */
+       u64 send_msi_src;
+       u64 send_msi_dst;
+
        /*
         * HW-visible DMA buffers below — device reads/writes via DMA.
         */
@@ -113,6 +123,9 @@ struct mlx5st_device {
        /* EQ does not support page_offset */
        struct mlx5st_eqe eq_buf[EQ_NENT] __aligned(MLX5_HW_PAGE_SIZE);
 
+       /* MSI EQ buffer — CQ completions generate EQEs here -> MSI-X */
+       struct mlx5st_eqe msi_eq_buf[MSI_EQ_NENT] __aligned(MLX5_HW_PAGE_SIZE);
+
        u8 fw_pages[MAX_FW_PAGES][MLX5_HW_PAGE_SIZE]
                __aligned(MLX5_HW_PAGE_SIZE);
 };
@@ -135,6 +148,9 @@ static_assert(offsetof(struct mlx5st_device, qp_dbrec) % 64 == 0,
 static_assert(offsetof(struct mlx5st_device, eq_buf) %
                              MLX5_HW_PAGE_SIZE == 0,
              "eq_buf must be page-aligned");
+static_assert(offsetof(struct mlx5st_device, msi_eq_buf) %
+                             MLX5_HW_PAGE_SIZE == 0,
+             "msi_eq_buf must be page-aligned");
 static_assert(offsetof(struct mlx5st_device, fw_pages) %
                              MLX5_HW_PAGE_SIZE == 0,
              "fw_pages must be page-aligned");
@@ -1013,6 +1029,85 @@ static void mlx5st_process_events(struct mlx5st_device *dev)
                mlx5st_eq_update_ci(dev, cc, 0);
 }
 
+/*
+ * MSI EQ — dedicated EQ for CQ completion events that fires MSI-X.
+ * Separate from the cmd/pages EQ so that only CQ completions (from
+ * send_msi or memcpy) trigger the interrupt vector.
+ */
+
+static void mlx5st_msi_eq_drain(struct mlx5st_device *dev)
+{
+       u32 cc = 0;
+       u32 val;
+
+       while (cc < MSI_EQ_NENT) {
+               u32 ci = dev->msi_eq_cons_index + cc;
+               struct mlx5st_eqe *eqe =
+                       &dev->msi_eq_buf[ci % MSI_EQ_NENT];
+
+               if (MLX5_GET_ONCE(eqe, eqe, owner) != !!(ci & MSI_EQ_NENT))
+                       break;
+               cc++;
+       }
+
+       /* Update consumer index and re-arm for next interrupt */
+       dev->msi_eq_cons_index += cc;
+       val = (dev->msi_eq_cons_index & 0xffffff) | (dev->msi_eqn << 24);
+       iowrite32be(val, (u8 __iomem *)dev->uar_base + MLX5_EQ_DOORBELL_OFFSET);
+}
+
+static void mlx5st_create_msi_eq(struct mlx5st_device *dev)
+{
+       struct vfio_pci_device *device = dev->device;
+       u64 in[MLX5_ST_SZ_QW(create_eq_in) + 1] = {};
+       u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {};
+       struct mlx5_ifc_eqc_bits *eqc;
+       unsigned int i;
+       __be64 *pas;
+
+       /* Initialize EQE owner bits */
+       for (i = 0; i < MSI_EQ_NENT; i++) {
+               struct mlx5st_eqe *eqe = &dev->msi_eq_buf[i];
+
+               MLX5_SET_ONCE(eqe, eqe, owner, 1);
+       }
+
+       MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
+
+       /*
+        * No event_bitmask — completion events are routed to this EQ via
+        * the CQ's c_eqn field, not through CREATE_EQ subscription.
+        */
+       eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry);
+       MLX5_SET(eqc, eqc, log_eq_size, LOG_MSI_EQ_SIZE);
+       MLX5_SET(eqc, eqc, uar_page, dev->uar_page);
+       MLX5_SET(eqc, eqc, intr, MSI_VECTOR);
+       pas = MLX5_ADDR_OF(create_eq_in, in, pas);
+       VFIO_ASSERT_EQ(mlx5st_fill_pas(device, dev->msi_eq_buf, pas), 0u);
+       MLX5_SET(eqc, eqc, log_page_size, 0);
+
+       mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+
+       dev->msi_eqn = MLX5_GET(create_eq_out, out, eq_number);
+       dev->msi_eq_cons_index = 0;
+       dev->have_msi_eq = true;
+       mlx5st_msi_eq_drain(dev);
+
+       dev_dbg(device,
+                "Created MSI EQ: eqn=%u, %d entries (COMP), vector=%d\n",
+                dev->msi_eqn, MSI_EQ_NENT, MSI_VECTOR);
+}
+
+static void mlx5st_destroy_msi_eq(struct mlx5st_device *dev)
+{
+       u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {};
+       u32 in[MLX5_ST_SZ_DW(destroy_eq_in)] = {};
+
+       MLX5_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ);
+       MLX5_SET(destroy_eq_in, in, eq_number, dev->msi_eqn);
+       mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
 /*
  * HCA init / teardown
  */
@@ -1369,7 +1464,7 @@ static void mlx5st_create_cq(struct mlx5st_device *dev)
        cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
        MLX5_SET(cqc, cqc, log_cq_size, LOG_CQ_SIZE);
        MLX5_SET(cqc, cqc, uar_page, dev->uar_page);
-       MLX5_SET(cqc, cqc, c_eqn_or_apu_element, dev->eqn);
+       MLX5_SET(cqc, cqc, c_eqn_or_apu_element, dev->msi_eqn);
        MLX5_SET(cqc, cqc, cqe_sz, 0);
        pas = MLX5_ADDR_OF(create_cq_in, in, pas);
        MLX5_SET(cqc, cqc, page_offset, mlx5st_fill_pas(device, dev->cq_buf, pas));
@@ -1394,6 +1489,30 @@ static void mlx5st_destroy_cq(struct mlx5st_device *dev)
        mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
 
+/*
+ * Arm CQ for event generation.  The CQ event delivery state machine is
+ * single-shot: after generating one EQE the CQ enters "Fired" state and
+ * won't generate another until re-armed via ARM_NEXT.  Both the CQ doorbell
+ * record and the UAR CQ doorbell register must be written.
+ */
+static void mlx5st_arm_cq(struct mlx5st_device *dev)
+{
+       u32 sn = dev->cq_arm_sn & 3;
+       u32 ci = dev->cq_ci & 0xffffff;
+       u64 doorbell;
+
+       /* Update CQ doorbell record arm word */
+       WRITE_ONCE(dev->cq_dbrec.send_counter,
+                  cpu_to_be32(sn << 28 | ci));
+
+       /* Ring CQ doorbell register, iowrite has an internal dma_wmb() */
+       doorbell = ((u64)(sn << 28 | ci) << 32) | dev->cqn;
+       iowrite64be(doorbell,
+                   (u8 __iomem *)dev->uar_base + MLX5_CQ_DOORBELL_OFFSET);
+
+       dev->cq_arm_sn++;
+}
+
 /*
  * QP create/destroy
  */
@@ -1650,6 +1769,7 @@ static void mlx5st_teardown_datapath(struct mlx5st_device *dev)
        }
        dev->sq_pi = 0;
        dev->sq_ci = 0;
+       dev->cq_arm_sn = 0;
        memset(&dev->qp_dbrec, 0, sizeof(dev->qp_dbrec));
        memset(&dev->cq_dbrec, 0, sizeof(dev->cq_dbrec));
 }
@@ -1691,6 +1811,34 @@ static int mlx5st_memcpy_wait(struct vfio_pci_device *device)
        return ret;
 }
 
+/*
+ * send_msi callback — trigger CQE -> EQE -> MSI-X via a small RDMA Write.
+ *
+ * Both the CQ and MSI EQ use single-shot arming: the CQ must be armed so the
+ * CQE generates an EQE, and the MSI EQ must be armed so the EQE fires MSI-X.
+ */
+static void mlx5st_send_msi(struct vfio_pci_device *device)
+{
+       struct mlx5st_device *dev = to_mlx5st(device);
+
+       /* Drain accumulated MSI EQ events and re-arm for next interrupt */
+       mlx5st_msi_eq_drain(dev);
+
+       /* Arm CQ so the next CQE generates an EQE on the MSI EQ */
+       mlx5st_arm_cq(dev);
+
+       /* Post a signaled RDMA Write to trigger CQE -> EQE -> MSI-X */
+       mlx5st_post_rdma_write(dev,
+                              to_iova(device, &dev->send_msi_src),
+                              dev->global_lkey,
+                              to_iova(device, &dev->send_msi_dst),
+                              dev->global_rkey,
+                              sizeof(dev->send_msi_src), true);
+
+       /* Consume the CQE to avoid stale completions */
+       VFIO_ASSERT_EQ(mlx5st_poll_cq(dev, MLX5ST_MEMCPY_TIMEOUT_MS), 0);
+}
+
 /*
  * Driver ops callbacks
  */
@@ -1721,8 +1869,13 @@ static void mlx5st_init(struct vfio_pci_device *device)
        mlx5st_alloc_pd(dev);
        mlx5st_create_mkey(dev);
 
+       /* MSI EQ must be created before CQ so CQ can reference its eqn */
+       mlx5st_create_msi_eq(dev);
        mlx5st_setup_datapath(dev);
 
+       vfio_pci_msix_enable(device, MSI_VECTOR, 1);
+       device->driver.msi = MSI_VECTOR;
+
        device->driver.max_memcpy_size = 1ULL << dev->log_max_msg;
        device->driver.max_memcpy_count = SQ_WQE_CNT - 1;
 
@@ -1733,8 +1886,14 @@ static void mlx5st_remove(struct vfio_pci_device *device)
 {
        struct mlx5st_device *dev = to_mlx5st(device);
 
+       vfio_pci_msix_disable(device);
        mlx5st_teardown_datapath(dev);
 
+       if (dev->have_msi_eq) {
+               mlx5st_destroy_msi_eq(dev);
+               dev->have_msi_eq = false;
+       }
+
        dev_dbg(device, "teardown: destroy_mkey\n");
        if (dev->mkey_index) {
                mlx5st_destroy_mkey(dev);
@@ -1765,5 +1924,5 @@ struct vfio_pci_driver_ops mlx5st_ops = {
        .remove = mlx5st_remove,
        .memcpy_start = mlx5st_memcpy_start,
        .memcpy_wait = mlx5st_memcpy_wait,
-       .send_msi = NULL,
+       .send_msi = mlx5st_send_msi,
 };
diff --git a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h
index a2506ec8a19523..2c451e411ec13f 100644
--- a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h
+++ b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5_hw.h
@@ -80,6 +80,9 @@ struct mlx5st_dbrec {
 #define MLX5_BF_OFFSET 0x800
 #define MLX5_BF_SIZE 0x100
 
+/* CQ doorbell offset within UAR page */
+#define MLX5_CQ_DOORBELL_OFFSET 0x20
+
 /* EQ doorbell offset within UAR page */
 #define MLX5_EQ_DOORBELL_OFFSET 0x40
 
@@ -94,6 +97,9 @@ struct mlx5st_dbrec {
 #define LOG_CQ_SIZE 4
 #define EQ_NENT 64
 #define LOG_EQ_SIZE 6
+#define MSI_EQ_NENT 16
+#define LOG_MSI_EQ_SIZE 4
+#define MSI_VECTOR 0
 
 #define MAX_FW_PAGES 8192
 #define MAX_FW_PAGES_PER_CMD 512
-- 
2.43.0


Reply via email to