Complete the mlx5 driver by adding CQ/QP creation, QP state
transitions, WQE posting, CQ polling, and the
memcpy_start/memcpy_wait callbacks. After this patch the driver is
functional for DMA tests.

The data path implements RDMA Write self-loopback via an RC QP with
force-loopback.  WQEs are posted to a 16-entry send queue with an
NC doorbell, and completions are polled from a 16-entry CQ.

Assisted-by: Claude:claude-opus-4.6
Signed-off-by: Jason Gunthorpe <[email protected]>
---
 .../selftests/vfio/lib/drivers/mlx5/mlx5.c    | 359 +++++++++++++++++-
 1 file changed, 357 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
index 804801cc564e7a..e5e75adb253166 100644
--- a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
+++ b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c
@@ -1343,6 +1343,354 @@ static void mlx5st_destroy_mkey(struct mlx5st_device *dev)
        mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
 
+/*
+ * CQ create/destroy
+ */
+
+/*
+ * Create the completion queue.
+ *
+ * Stamps every CQE in dev->cq_buf with opcode 0xF and owner=1 and then issues
+ * CREATE_CQ, pointing the device at dev->cq_buf and at dev->cq_dbrec as the
+ * doorbell record.  With dev->cq_ci reset to 0, mlx5st_poll_cq_batch() expects
+ * owner 0 on its first pass, so the pre-stamped entries do not look like
+ * completions.  The CQ number returned by the command is stored in dev->cqn.
+ */
+static void mlx5st_create_cq(struct mlx5st_device *dev)
+{
+       struct vfio_pci_device *device = dev->device;
+       u64 in[MLX5_ST_SZ_QW(create_cq_in) + 1] = {};
+       u32 out[MLX5_ST_SZ_DW(create_cq_out)] = {};
+       struct mlx5_ifc_cqc_bits *cqc;
+       unsigned int i;
+       __be64 *pas;
+
+       /* Initialize CQEs before CREATE_CQ: opcode=0xF, owner=1 */
+       for (i = 0; i < CQ_CQE_CNT; i++) {
+               struct mlx5st_cqe64 *cqe = &dev->cq_buf[i];
+
+               MLX5_SET(cqe64, cqe, opcode, 0xF);
+               MLX5_SET_ONCE(cqe64, cqe, owner, 1);
+       }
+
+       MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ);
+
+       cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+       MLX5_SET(cqc, cqc, log_cq_size, LOG_CQ_SIZE);
+       MLX5_SET(cqc, cqc, uar_page, dev->uar_page);
+       MLX5_SET(cqc, cqc, c_eqn_or_apu_element, dev->eqn);
+       MLX5_SET(cqc, cqc, cqe_sz, 0);
+       pas = MLX5_ADDR_OF(create_cq_in, in, pas);
+       /* mlx5st_fill_pas() writes the PAS entry and returns the page offset */
+       MLX5_SET(cqc, cqc, page_offset,
+                mlx5st_fill_pas(device, dev->cq_buf, pas));
+       MLX5_SET(cqc, cqc, log_page_size, 0);
+       MLX5_SET64(cqc, cqc, dbr_addr, to_iova(device, &dev->cq_dbrec));
+
+       mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+
+       dev->cqn = MLX5_GET(create_cq_out, out, cqn);
+       dev->cq_ci = 0;
+       dev_dbg(device, "Created CQ: cqn=%u, %d entries\n", dev->cqn,
+                CQ_CQE_CNT);
+}
+
+/* Issue DESTROY_CQ for the completion queue identified by dev->cqn. */
+static void mlx5st_destroy_cq(struct mlx5st_device *dev)
+{
+       u32 rsp[MLX5_ST_SZ_DW(destroy_cq_out)] = {};
+       u32 cmd[MLX5_ST_SZ_DW(destroy_cq_in)] = {};
+
+       MLX5_SET(destroy_cq_in, cmd, cqn, dev->cqn);
+       MLX5_SET(destroy_cq_in, cmd, opcode, MLX5_CMD_OP_DESTROY_CQ);
+
+       mlx5st_cmd_exec(dev, cmd, sizeof(cmd), rsp, sizeof(rsp));
+}
+
+/*
+ * QP create/destroy
+ */
+
+/*
+ * Issue CREATE_QP for an RC QP.
+ *
+ * Send and receive completions are both directed at dev->cqn, the send queue
+ * is backed by dev->sq_buf and the doorbell record is dev->qp_dbrec.  The QP
+ * number returned by the command is stored in dev->qpn and the send producer
+ * index dev->sq_pi is reset to 0.
+ */
+static void mlx5st_create_qp(struct mlx5st_device *dev)
+{
+       struct vfio_pci_device *device = dev->device;
+       u64 cmd[MLX5_ST_SZ_QW(create_qp_in) + 1] = {};
+       u32 rsp[MLX5_ST_SZ_DW(create_qp_out)] = {};
+       struct mlx5_ifc_qpc_bits *ctx;
+       __be64 *pages;
+
+       ctx = MLX5_ADDR_OF(create_qp_in, cmd, qpc);
+       pages = MLX5_ADDR_OF(create_qp_in, cmd, pas);
+
+       MLX5_SET(create_qp_in, cmd, opcode, MLX5_CMD_OP_CREATE_QP);
+
+       /* Transport type and protection/doorbell resources */
+       MLX5_SET(qpc, ctx, st, MLX5_QPC_ST_RC);
+       MLX5_SET(qpc, ctx, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
+       MLX5_SET(qpc, ctx, pd, dev->pdn);
+       MLX5_SET(qpc, ctx, uar_page, dev->uar_page);
+
+       /* One CQ serves both directions */
+       MLX5_SET(qpc, ctx, cqn_snd, dev->cqn);
+       MLX5_SET(qpc, ctx, cqn_rcv, dev->cqn);
+
+       /* Queue geometry */
+       MLX5_SET(qpc, ctx, log_sq_size, LOG_SQ_SIZE);
+       MLX5_SET(qpc, ctx, log_msg_max, dev->log_max_msg);
+       MLX5_SET(qpc, ctx, rq_type, 0x3);
+       MLX5_SET(qpc, ctx, ts_format, 1);
+
+       /* Work queue buffer and doorbell record addresses */
+       MLX5_SET(qpc, ctx, page_offset,
+                mlx5st_fill_pas(device, dev->sq_buf, pages));
+       MLX5_SET(qpc, ctx, log_page_size, 0);
+       MLX5_SET64(qpc, ctx, dbr_addr, to_iova(device, &dev->qp_dbrec));
+
+       mlx5st_cmd_exec(dev, cmd, sizeof(cmd), rsp, sizeof(rsp));
+
+       dev->sq_pi = 0;
+       dev->qpn = MLX5_GET(create_qp_out, rsp, qpn);
+       dev_dbg(device, "Created QP: qpn=%u, RC, sq=%d wqes\n", dev->qpn,
+                SQ_WQE_CNT);
+}
+
+/* Issue DESTROY_QP for the queue pair identified by dev->qpn. */
+static void mlx5st_destroy_qp(struct mlx5st_device *dev)
+{
+       u32 rsp[MLX5_ST_SZ_DW(destroy_qp_out)] = {};
+       u32 cmd[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
+
+       MLX5_SET(destroy_qp_in, cmd, qpn, dev->qpn);
+       MLX5_SET(destroy_qp_in, cmd, opcode, MLX5_CMD_OP_DESTROY_QP);
+
+       mlx5st_cmd_exec(dev, cmd, sizeof(cmd), rsp, sizeof(rsp));
+}
+
+/*
+ * QP state transitions
+ */
+
+/*
+ * Issue RST2INIT_QP for dev->qpn, selecting port 1 and setting the rre and
+ * rwe bits in the QP context.
+ */
+static void mlx5st_qp_rst2init(struct mlx5st_device *dev)
+{
+       u32 rsp[MLX5_ST_SZ_DW(rst2init_qp_out)] = {};
+       u32 cmd[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
+       struct mlx5_ifc_qpc_bits *ctx;
+
+       MLX5_SET(rst2init_qp_in, cmd, qpn, dev->qpn);
+       MLX5_SET(rst2init_qp_in, cmd, opcode, MLX5_CMD_OP_RST2INIT_QP);
+
+       ctx = MLX5_ADDR_OF(rst2init_qp_in, cmd, qpc);
+       MLX5_SET(qpc, ctx, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
+       MLX5_SET(qpc, ctx, primary_address_path.vhca_port_num, 1);
+       MLX5_SET(qpc, ctx, rwe, 1);
+       MLX5_SET(qpc, ctx, rre, 1);
+
+       mlx5st_cmd_exec(dev, cmd, sizeof(cmd), rsp, sizeof(rsp));
+       dev_dbg(dev->device, "QP RST->INIT\n");
+}
+
+/*
+ * Issue INIT2RTR_QP for dev->qpn.  The QP is connected to itself
+ * (remote_qpn == dev->qpn) with the fl bit set on the primary address path.
+ */
+static void mlx5st_qp_init2rtr(struct mlx5st_device *dev)
+{
+       u32 rsp[MLX5_ST_SZ_DW(init2rtr_qp_out)] = {};
+       u32 cmd[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
+       struct mlx5_ifc_qpc_bits *ctx;
+
+       MLX5_SET(init2rtr_qp_in, cmd, qpn, dev->qpn);
+       MLX5_SET(init2rtr_qp_in, cmd, opcode, MLX5_CMD_OP_INIT2RTR_QP);
+
+       ctx = MLX5_ADDR_OF(init2rtr_qp_in, cmd, qpc);
+
+       /* Self-loopback addressing */
+       MLX5_SET(qpc, ctx, remote_qpn, dev->qpn);
+       MLX5_SET(qpc, ctx, primary_address_path.fl, 1);
+       MLX5_SET(qpc, ctx, primary_address_path.vhca_port_num, 1);
+
+       /* Transfer parameters */
+       MLX5_SET(qpc, ctx, log_msg_max, dev->log_max_msg);
+       MLX5_SET(qpc, ctx, mtu, 3);
+       MLX5_SET(qpc, ctx, min_rnr_nak, 12);
+
+       mlx5st_cmd_exec(dev, cmd, sizeof(cmd), rsp, sizeof(rsp));
+       dev_dbg(dev->device, "QP INIT->RTR (fl=1)\n");
+}
+
+/*
+ * Issue RTR2RTS_QP for dev->qpn, programming the retry counts and the ACK
+ * timeout of the primary address path.
+ */
+static void mlx5st_qp_rtr2rts(struct mlx5st_device *dev)
+{
+       u32 rsp[MLX5_ST_SZ_DW(rtr2rts_qp_out)] = {};
+       u32 cmd[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
+       struct mlx5_ifc_qpc_bits *ctx;
+
+       MLX5_SET(rtr2rts_qp_in, cmd, qpn, dev->qpn);
+       MLX5_SET(rtr2rts_qp_in, cmd, opcode, MLX5_CMD_OP_RTR2RTS_QP);
+
+       ctx = MLX5_ADDR_OF(rtr2rts_qp_in, cmd, qpc);
+       MLX5_SET(qpc, ctx, primary_address_path.ack_timeout, 14);
+       MLX5_SET(qpc, ctx, rnr_retry, 7);
+       MLX5_SET(qpc, ctx, retry_count, 7);
+       MLX5_SET(qpc, ctx, log_ack_req_freq, 0);
+
+       mlx5st_cmd_exec(dev, cmd, sizeof(cmd), rsp, sizeof(rsp));
+       dev_dbg(dev->device, "QP RTR->RTS\n");
+}
+
+/*
+ * Post RDMA Write WQE
+ *
+ * Builds one RDMA Write work request in the send queue slot selected by
+ * dev->sq_pi % SQ_WQE_CNT, advances dev->sq_pi, publishes the new producer
+ * index in the QP doorbell record and rings the UAR doorbell.
+ *
+ * @src_addr/@src_lkey describe the local source buffer, @dst_addr/@dst_rkey
+ * the remote destination and @length the byte count.  When @signaled is true
+ * the WQE requests a completion (MLX5_WQE_CE_CQE_ALWAYS).
+ *
+ * No queue-full check is performed here; the slot is overwritten
+ * unconditionally.
+ */
+static void mlx5st_post_rdma_write(struct mlx5st_device *dev, u64 src_addr,
+                                   u32 src_lkey, u64 dst_addr, u32 dst_rkey,
+                                   u32 length, bool signaled)
+{
+       struct mlx5st_send_wqe *wqe;
+       unsigned int idx;
+
+       idx = dev->sq_pi % SQ_WQE_CNT;
+       wqe = &dev->sq_buf[idx];
+
+       /* Control segment */
+       memset(wqe, 0, sizeof(*wqe));
+       MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, opcode, MLX5_OPCODE_RDMA_WRITE);
+       MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, wqe_index, dev->sq_pi);
+       MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, qp_or_sq, dev->qpn);
+       MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, ds, MLX5_RDMA_WRITE_DS);
+       if (signaled)
+               MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, ce, MLX5_WQE_CE_CQE_ALWAYS);
+
+       /* Remote address segment: where the data is written */
+       MLX5_SET64(wqe_raddr_seg, &wqe->raddr, raddr, dst_addr);
+       MLX5_SET(wqe_raddr_seg, &wqe->raddr, rkey, dst_rkey);
+
+       /* Data segment: where the data is read from */
+       MLX5_SET(wqe_data_seg, &wqe->data, byte_count, length);
+       MLX5_SET(wqe_data_seg, &wqe->data, lkey, src_lkey);
+       MLX5_SET64(wqe_data_seg, &wqe->data, addr, src_addr);
+
+       dev->sq_pi++;
+
+       /* Ensure WQE is visible to device before doorbell record */
+       dma_wmb();
+
+       /* Only the low 16 bits of the producer index are published */
+       WRITE_ONCE(dev->qp_dbrec.send_counter,
+                  cpu_to_be32(dev->sq_pi & 0xffff));
+
+       /*
+        * Ring doorbell: write first 8 bytes of ctrl to UAR BF register,
+        * iowrite has an internal dma_wmb() so the doorbell record will be
+        * visible.
+        */
+       iowrite64be(be64_to_cpu(*(__be64 *)wqe),
+                   (u8 __iomem *)dev->uar_base + dev->uar_bf_offset);
+       /* Alternate between the two BF register offsets on each post */
+       dev->uar_bf_offset ^= MLX5_BF_SIZE;
+}
+
+/*
+ * Poll CQ
+ *
+ * Consume up to @max_cqe successful send completions from the CQ.
+ *
+ * A CQE is taken as valid when its owner bit equals the parity of the current
+ * pass over the ring, (cq_ci >> LOG_CQ_SIZE) & 1.  Every consumed CQE advances
+ * dev->cq_ci and the CQ doorbell record; a MLX5_CQE_REQ completion also sets
+ * dev->sq_ci to the CQE's wqe_counter + 1.
+ *
+ * Returns the number of MLX5_CQE_REQ completions consumed (0 when the CQ is
+ * empty), or -1 after consuming an error CQE or one with an unexpected
+ * opcode.
+ */
+static int mlx5st_poll_cq_batch(struct mlx5st_device *dev,
+                               unsigned int max_cqe)
+{
+       unsigned int polled = 0;
+
+       while (polled < max_cqe) {
+               unsigned int idx = dev->cq_ci % CQ_CQE_CNT;
+               struct mlx5st_cqe64 *cqe = &dev->cq_buf[idx];
+               u32 wqe_counter = 0;
+               u8 owner, opcode;
+
+               owner = MLX5_GET_ONCE(cqe64, cqe, owner);
+               if (owner != ((dev->cq_ci >> LOG_CQ_SIZE) & 1))
+                       break;
+
+               /* Read the rest of the CQE only after the owner bit */
+               dma_rmb();
+
+               /*
+                * Pull everything needed out of the CQE before the consumer
+                * index is published below; once it is published the device
+                * may reuse the slot.
+                */
+               opcode = MLX5_GET(cqe64, cqe, opcode);
+               if (opcode == MLX5_CQE_REQ) {
+                       wqe_counter = MLX5_GET(cqe64, cqe, wqe_counter);
+               } else if (opcode == MLX5_CQE_REQ_ERR ||
+                          opcode == MLX5_CQE_RESP_ERR) {
+                       dev_dbg(dev->device,
+                               "CQE error: opcode=0x%x syndrome=0x%x vendor=0x%x\n",
+                               opcode,
+                               MLX5_GET(cqe64, cqe, error_syndrome.syndrome),
+                               MLX5_GET(cqe64, cqe,
+                                        error_syndrome.vendor_error_syndrome));
+               } else {
+                       dev_err(dev->device, "CQE unexpected opcode=0x%x\n",
+                               opcode);
+               }
+
+               /* Hand the slot back: the doorbell record holds 24 bits */
+               dev->cq_ci++;
+               WRITE_ONCE(dev->cq_dbrec.recv_counter,
+                          cpu_to_be32(dev->cq_ci & 0xffffff));
+
+               if (opcode != MLX5_CQE_REQ)
+                       return -1;
+
+               dev->sq_ci = (u16)(wqe_counter + 1);
+               polled++;
+       }
+
+       return polled;
+}
+
+/*
+ * Busy-poll the CQ for a single send completion.
+ *
+ * Between polls, pending events are processed when the device has an EQ
+ * (dev->have_eq).  Returns 0 once a completion has been consumed, or -1 if
+ * mlx5st_poll_cq_batch() reports an error or more than @timeout_ms
+ * milliseconds of CLOCK_MONOTONIC time have elapsed.
+ */
+static int mlx5st_poll_cq(struct mlx5st_device *dev, unsigned int timeout_ms)
+{
+       struct timespec t0, t1;
+       unsigned int waited_ms;
+       int n;
+
+       clock_gettime(CLOCK_MONOTONIC, &t0);
+       do {
+               n = mlx5st_poll_cq_batch(dev, 1);
+               if (n)
+                       return n < 0 ? -1 : 0;
+
+               if (dev->have_eq)
+                       mlx5st_process_events(dev);
+
+               clock_gettime(CLOCK_MONOTONIC, &t1);
+               waited_ms = (t1.tv_sec - t0.tv_sec) * 1000 +
+                           (t1.tv_nsec - t0.tv_nsec) / 1000000;
+       } while (waited_ms <= timeout_ms);
+
+       dev_err(dev->device, "CQ poll timeout after %u ms\n", timeout_ms);
+       return -1;
+}
+
+/*
+ * Data path setup/teardown helpers
+ */
+
+/*
+ * Bring up the data path: create the CQ, create the QP that completes into
+ * it, then walk the QP through RST->INIT->RTR->RTS.  The CQ is created first
+ * because mlx5st_create_qp() programs dev->cqn into the QP context.
+ */
+static void mlx5st_setup_datapath(struct mlx5st_device *dev)
+{
+       mlx5st_create_cq(dev);
+       mlx5st_create_qp(dev);
+       mlx5st_qp_rst2init(dev);
+       mlx5st_qp_init2rtr(dev);
+       mlx5st_qp_rtr2rts(dev);
+}
+
+/*
+ * Tear down the data path: destroy the QP and then the CQ if they exist
+ * (non-zero dev->qpn / dev->cqn), then clear the queue numbers, the send
+ * queue indices and both doorbell records.
+ */
+static void mlx5st_teardown_datapath(struct mlx5st_device *dev)
+{
+       /* The QP goes first, then the CQ it completes into */
+       if (dev->qpn)
+               mlx5st_destroy_qp(dev);
+       dev->qpn = 0;
+
+       if (dev->cqn)
+               mlx5st_destroy_cq(dev);
+       dev->cqn = 0;
+
+       memset(&dev->cq_dbrec, 0, sizeof(dev->cq_dbrec));
+       memset(&dev->qp_dbrec, 0, sizeof(dev->qp_dbrec));
+       dev->sq_ci = 0;
+       dev->sq_pi = 0;
+}
+
+/*
+ * memcpy callbacks
+ */
+
+#define MLX5ST_MEMCPY_TIMEOUT_MS 60000
+
+/*
+ * memcpy_start callback: post @count RDMA Writes, each copying @size bytes
+ * from @src to @dst.  Only the final write is signaled, so a single CQE
+ * marks the completion of the whole batch.
+ */
+static void mlx5st_memcpy_start(struct vfio_pci_device *device,
+                                iova_t src, iova_t dst, u64 size, u64 count)
+{
+       struct mlx5st_device *dev = to_mlx5st(device);
+       u64 remaining;
+
+       for (remaining = count; remaining; remaining--)
+               mlx5st_post_rdma_write(dev, src, dev->global_lkey, dst,
+                                      dev->global_rkey, size,
+                                      remaining == 1);
+}
+
+/*
+ * memcpy_wait callback: wait up to MLX5ST_MEMCPY_TIMEOUT_MS for the signaled
+ * completion of the batch posted by mlx5st_memcpy_start().
+ *
+ * Returns the result of mlx5st_poll_cq(): 0 on success, non-zero on failure.
+ */
+static int mlx5st_memcpy_wait(struct vfio_pci_device *device)
+{
+       struct mlx5st_device *dev = to_mlx5st(device);
+       int ret = mlx5st_poll_cq(dev, MLX5ST_MEMCPY_TIMEOUT_MS);
+
+       if (!ret)
+               return 0;
+
+       /*
+        * A failed poll (CQE error or timeout) is handled by rebuilding the
+        * whole data path, since a CQE error leaves the QP in error state
+        * and later operations need a working QP.
+        */
+       mlx5st_teardown_datapath(dev);
+       mlx5st_setup_datapath(dev);
+
+       return ret;
+}
+
 /*
  * Driver ops callbacks
  */
@@ -1373,6 +1721,11 @@ static void mlx5st_init(struct vfio_pci_device *device)
        mlx5st_alloc_pd(dev);
        mlx5st_create_mkey(dev);
 
+       mlx5st_setup_datapath(dev);
+
+       device->driver.max_memcpy_size = 1ULL << dev->log_max_msg;
+       device->driver.max_memcpy_count = SQ_WQE_CNT - 1;
+
        dev_dbg(device, "mlx5 driver initialized\n");
 }
 
@@ -1380,6 +1733,8 @@ static void mlx5st_remove(struct vfio_pci_device *device)
 {
        struct mlx5st_device *dev = to_mlx5st(device);
 
+       mlx5st_teardown_datapath(dev);
+
        dev_dbg(device, "teardown: destroy_mkey\n");
        if (dev->mkey_index) {
                mlx5st_destroy_mkey(dev);
@@ -1408,7 +1763,7 @@ struct vfio_pci_driver_ops mlx5st_ops = {
        .probe = mlx5st_probe,
        .init = mlx5st_init,
        .remove = mlx5st_remove,
-       .memcpy_start = NULL,
-       .memcpy_wait = NULL,
+       .memcpy_start = mlx5st_memcpy_start,
+       .memcpy_wait = mlx5st_memcpy_wait,
        .send_msi = NULL,
 };
-- 
2.43.0


Reply via email to