From: xiongweimin <[email protected]>

This commit adds memory management support to the virtio RDMA driver:

1. Port link layer identification
   - Reports Ethernet as the link layer (vrdma_port_link_layer)

2. Memory Region scatter-gather mapping
   - Implements a two-level page table for efficient large-MR handling
     (vrdma_set_page); see the indexing sketch after this list
   - Adds SG-list-to-MR mapping with device notification (vrdma_map_mr_sg)

3. User-space memory mapping
   - Supports mmap() for CQ/QP resources (vrdma_mmap)
   - Handles vring descriptors, user buffers, and fast doorbells
   - Implements mmap entry cleanup (vrdma_mmap_free)
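
   For reference, looking up the i-th page address in the two-level
   table decomposes as follows (a sketch using the patch's identifiers,
   not verbatim driver code):

       addr = mr->pages_k[i / VIRTIO_RDMA_PAGE_PER_TBL]
                         [i % VIRTIO_RDMA_PAGE_PER_TBL];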

Key features:
- Efficient two-level page table for MRs (512 entries per second-level table)
- Virtio command for backend MR mapping notification
- Unified mmap handling for CQ/QP with size validation (see the userspace
  sketch below)
- Support for fast doorbell mapping
- Error handling in all code paths
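
As a usage illustration, a QP mapping with the fast doorbell enabled is a
single contiguous mmap() covering vring, user buffer, and doorbell page, in
that order. A hypothetical userspace sketch (sizes come from the driver's
create response; the identifiers here are illustrative, not part of this
patch):

    size_t len = vq_size + ubuf_size + page_size;
    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
                   uverbs_fd, qp_mmap_offset);
    /* p: vring descriptors; p + vq_size: user buffer; doorbell page last */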

Signed-off-by: Xiong Weimin <[email protected]>
---
 .../infiniband/hw/virtio/vrdma_dev_api.h      |  13 +
 .../drivers/infiniband/hw/virtio/vrdma_ib.c   | 257 ++++++++++++++++++
 2 files changed, 270 insertions(+)

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index da99f1f32..84dc05a96 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -200,6 +200,19 @@ struct vrdma_cmd_del_gid {
        __u32 port_num;
 };
 
+struct vrdma_cmd_map_mr_sg {
+       __u32 mrn;
+       __u32 npages;
+       __u64 start;
+       __u64 length;
+
+       __u64 pages;
+};
+
+struct vrdma_rsp_map_mr_sg {
+       __u32 npages;
+};
+
 #define VRDMA_CTRL_OK  0
 #define VRDMA_CTRL_ERR 1
 
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index b4c16ddbb..738935e3d 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -12,6 +12,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_addr.h>
+#include <linux/mm_types.h>
 
 #include "vrdma.h"
 #include "vrdma_dev.h"
@@ -21,6 +22,8 @@
 #include "vrdma_mmap.h"
 #include "vrdma_queue.h"
 
+#define VIRTIO_RDMA_PAGE_PER_TBL 512
+
 /**
  * cmd_str - String representation of virtio RDMA control commands
  *
@@ -1677,6 +1680,255 @@ static int vrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
        return 0;
 }
 
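+/* Report a fixed 1.0.0 firmware version; nothing is queried from the device. */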
+static void vrdma_get_fw_ver_str(struct ib_device *device, char *str)
+{
+       snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d\n", 1, 0, 0);
+}
+
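+/* The virtio RDMA device always presents Ethernet as the port link layer. */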
+static enum rdma_link_layer vrdma_port_link_layer(struct ib_device *ibdev,
+                                                u32 port)
+{
+       return IB_LINK_LAYER_ETHERNET;
+}
+
+/**
+ * vrdma_set_page - Callback to collect page addresses from a scatterlist
+ * @ibmr:      Memory region being mapped
+ * @addr:      DMA address of the current page
+ *
+ * This function is called by ib_sg_to_pages() for each page in the SG list.
+ * It stores the address into a two-level page table:
+ *   - Level 1: Array of pointers to L2 tables
+ *   - Level 2: Each table holds up to 512 page addresses
+ *
+ * The layout allows efficient handling of large MRs without one huge array.
+ *
+ * Context: Called from ib_sg_to_pages(); does not sleep.
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if the number of pages exceeds the pre-allocated limit
+ */
+static int vrdma_set_page(struct ib_mr *ibmr, u64 addr)
+{
+       struct vrdma_mr *mr = to_vmr(ibmr);
+
+       if (mr->npages >= mr->max_pages) {
+               pr_debug("vRDMA: too many pages for MR (max=%u)\n", mr->max_pages);
+               return -ENOMEM;
+       }
+
+       /* Two-level indexing: [L1 index][L2 offset] */
+       mr->pages_k[mr->npages / VIRTIO_RDMA_PAGE_PER_TBL]
+                  [mr->npages % VIRTIO_RDMA_PAGE_PER_TBL] = addr;
+       mr->npages++;
+       return 0;
+}
+
+/**
+ * vrdma_map_mr_sg - Map an SG list into the MR page table and notify the device
+ * @ibmr:      The memory region to map
+ * @sg:                Scatterlist describing user/kernel memory chunks
+ * @sg_nents:  Number of entries in @sg
+ * @sg_offset: Offset into the first sg element; passed to ib_sg_to_pages()
+ *
+ * This function:
+ *   1. Walks the SG list via ib_sg_to_pages()
+ *   2. Populates the software page table using vrdma_set_page()
+ *   3. Sends VIRTIO_RDMA_CMD_MAP_MR_SG to tell the backend the IOVA range and page list
+ *
+ * Note: The actual DMA mapping was already done during ib_umem_get() or
+ *       get_dma_mr(). This only sets up hardware-visible metadata.
+ *
+ * Context: Can sleep (called in process context).
+ * Return:
+ * * Number of successfully mapped sg entries (>0)
+ * * Negative errno on failure
+ */
+static int vrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+                          int sg_nents, unsigned int *sg_offset)
+{
+       struct vrdma_dev *vdev = to_vdev(ibmr->device);
+       struct vrdma_mr *mr = to_vmr(ibmr);
+       struct vrdma_cmd_map_mr_sg *cmd;
+       struct vrdma_rsp_map_mr_sg *rsp;
+       struct scatterlist in, out;
+       int mapped, rc;
+
+       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+       if (!cmd)
+               return -ENOMEM;
+
+       rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+       if (!rsp) {
+               kfree(cmd);
+               return -ENOMEM;
+       }
+
+       /* Reset the page counter before traversal */
+       mr->npages = 0;
+
+       /* Use the RDMA core helper to walk the SG list and call vrdma_set_page() per page */
+       mapped = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, vrdma_set_page);
+       if (mapped < 0) {
+               dev_err(&vdev->vdev->dev, "Failed to map SG to pages: %d\n", mapped);
+               kfree(rsp);
+               kfree(cmd);
+               return mapped;
+       }
+
+       /* Prepare the command notifying the device */
+       cmd->mrn = mr->mr_handle;
+       cmd->start = ibmr->iova;
+       cmd->length = ibmr->length;
+       cmd->npages = mr->npages;
+       cmd->pages = mr->dma_pages; /* Pre-DMA-mapped array of page addresses */
+
+       sg_init_one(&in, cmd, sizeof(*cmd));
+       sg_init_one(&out, rsp, sizeof(*rsp));
+
+       /* Notify the backend about the new mapping */
+       rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_MAP_MR_SG, &in, &out);
+       if (rc) {
+               dev_err(&vdev->vdev->dev,
+                       "VIRTIO_RDMA_CMD_MAP_MR_SG failed for mrn=0x%x, err=%d\n",
+                       mr->mr_handle, rc);
+               rc = -EIO;
+               goto out_free;
+       }
+
+       /* Success: return the number of processed sg entries */
+       kfree(rsp);
+       kfree(cmd);
+       return mapped;
+
+out_free:
+       kfree(rsp);
+       kfree(cmd);
+       return rc;
+}
+
+/**
+ * vrdma_mmap - Map device memory (vring, ubuf, doorbell) into user space
+ * @ctx:       User's RDMA context
+ * @vma:       VMA describing the mapping request
+ *
+ * Maps memory regions associated with QP/CQ virtqueues into user space.
+ * Supports three components:
+ *   - vring descriptors (shared ring buffer)
+ *   - user buffer (optional data exchange area)
+ *   - fast doorbell page (if enabled)
+ *
+ * Uses PFN-based remapping for normal memory and I/O remapping for doorbells.
+ *
+ * Context: Called during mmap() in process context.
+ * Return:
+ * * 0 on success
+ * * -EINVAL for invalid parameters or layout mismatch
+ * * Negative errno if remapping fails
+ */
+int vrdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+       struct vrdma_ucontext *uctx = to_vucontext(ctx);
+       size_t requested_size = vma->vm_end - vma->vm_start;
+       struct rdma_user_mmap_entry *rdma_entry;
+       struct vrdma_user_mmap_entry *entry;
+       int rc;
+
+       /* Must be page-aligned */
+       if (vma->vm_start & (PAGE_SIZE - 1)) {
+               pr_warn("vRDMA: mmap start not page aligned: %#lx\n", vma->vm_start);
+               return -EINVAL;
+       }
+
+       /* Look up the registered mmap entry */
+       rdma_entry = rdma_user_mmap_entry_get(&uctx->ibucontext, vma);
+       if (!rdma_entry) {
+               pr_err("vRDMA: mmap lookup failed: pgoff=%lu size=%zu\n",
+                      vma->vm_pgoff, requested_size);
+               return -EINVAL;
+       }
+       entry = to_ventry(rdma_entry);
+
+       switch (entry->mmap_type) {
+       case VRDMA_MMAP_CQ:
+       case VRDMA_MMAP_QP:
+       {
+               unsigned long vq_size = PAGE_ALIGN(vring_size(virtqueue_get_vring_size(entry->vq),
+                                                             SMP_CACHE_BYTES));
+               unsigned long total_size = vq_size + entry->ubuf_size;
+
+               if (uctx->dev->fast_doorbell && entry->mmap_type == VRDMA_MMAP_QP)
+                       total_size += PAGE_SIZE;
+
+               if (requested_size != total_size) {
+                       pr_warn("vRDMA: mmap size mismatch: got=%zu, expected=%lu\n",
+                               requested_size, total_size);
+                       rc = -EINVAL;
+                       goto out_put;
+               }
+
+               /* Map vring descriptor table */
+               rc = remap_pfn_range(vma, vma->vm_start,
+                                    page_to_pfn(virt_to_page(virtqueue_get_vring(entry->vq)->desc)),
+                                    vq_size, vma->vm_page_prot);
+               if (rc) {
+                       pr_warn("vRDMA: remap vring failed: %d\n", rc);
+                       goto out_put;
+               }
+
+               /* Map user buffer (shared data region) */
+               rc = remap_pfn_range(vma, vma->vm_start + vq_size,
+                                    page_to_pfn(virt_to_page(entry->user_buf)),
+                                    entry->ubuf_size, vma->vm_page_prot);
+               if (rc) {
+                       pr_warn("vRDMA: remap ubuf failed: %d\n", rc);
+                       goto out_put;
+               }
+
+               /* Optionally map fast doorbell register (QP only) */
+               if (uctx->dev->fast_doorbell && entry->mmap_type == VRDMA_MMAP_QP) {
+                       unsigned long db_addr = vma->vm_start + vq_size + entry->ubuf_size;
+                       struct virtqueue *vq = entry->vq;
+
+                       rc = io_remap_pfn_range(vma, db_addr,
+                                               vmalloc_to_pfn(vq->priv),
+                                               PAGE_SIZE, vma->vm_page_prot);
+                       if (rc) {
+                               pr_warn("vRDMA: remap doorbell failed: %d\n", rc);
+                               goto out_put;
+                       }
+               }
+
+               break;
+       }
+       default:
+               pr_err("vRDMA: invalid mmap type %d\n", entry->mmap_type);
+               rc = -EINVAL;
+               goto out_put;
+       }
+
+       /* Success */
+       rdma_user_mmap_entry_put(rdma_entry);
+       return 0;
+
+out_put:
+       rdma_user_mmap_entry_put(rdma_entry);
+       return rc;
+}
+
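+/*
+ * mmap_free callback: invoked by the RDMA core once the last reference to
+ * the mmap entry is dropped; release the driver-private entry wrapper.
+ */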
+void vrdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
+{
+       struct vrdma_user_mmap_entry *entry = to_ventry(rdma_entry);
+
+       kfree(entry);
+}
+
 static const struct ib_device_ops vrdma_dev_ops = {
        .owner = THIS_MODULE,
        .uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -1701,6 +1953,11 @@ static const struct ib_device_ops vrdma_dev_ops = {
        .dealloc_ucontext = vrdma_dealloc_ucontext,
        .create_ah = vrdma_create_ah,
        .destroy_ah = vrdma_destroy_ah,
+       .get_dev_fw_str = vrdma_get_fw_ver_str,
+       .get_link_layer = vrdma_port_link_layer,
+       .map_mr_sg = vrdma_map_mr_sg,
+       .mmap = vrdma_mmap,
+       .mmap_free = vrdma_mmap_free,
 };
 
 /**
-- 
2.43.0
