From: xiongweimin <[email protected]>

Add memory management support to the virtio RDMA driver:

1. Port link layer identification
   - Reports Ethernet as the link layer (vrdma_port_link_layer)

2. Memory region scatter-gather mapping
   - Implements a two-level page table for efficient handling of large
     MRs (vrdma_set_page)
   - Adds SG-list-to-MR mapping with device notification
     (vrdma_map_mr_sg)

3. User-space memory mapping
   - Supports mmap() for CQ/QP resources (vrdma_mmap); see the layout
     sketch below
   - Handles vring descriptors, user buffers, and fast doorbells
   - Implements mmap entry cleanup (vrdma_mmap_free)
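As an illustration (not part of this patch), a user-space consumer is
expected to map a QP region with a single mmap() call sized to cover
the vring, the user buffer and, when fast doorbells are enabled, one
extra page. All identifiers below are hypothetical:

    size_t total = vq_size + ubuf_size;  /* sizes from the provider ABI */
    if (fast_doorbell)
        total += page_size;              /* trailing doorbell page */
    void *base = mmap(NULL, total, PROT_READ | PROT_WRITE,
                      MAP_SHARED, uctx_fd, qp_mmap_offset);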
Key features:
- Efficient two-level page table for MRs (512 entries per level)
- Virtio command for backend MR mapping notification
- Unified mmap handling for CQ/QP with size validation
- Fast doorbell mapping support
- Error handling in all code paths

Signed-off-by: Xiong Weimin <[email protected]>
---
 .../infiniband/hw/virtio/vrdma_dev_api.h      |  13 +
 .../drivers/infiniband/hw/virtio/vrdma_ib.c   | 250 ++++++++++++++++++
 2 files changed, 263 insertions(+)

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index da99f1f32..84dc05a96 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -200,6 +200,19 @@ struct vrdma_cmd_del_gid {
 	__u32 port_num;
 };
 
+struct vrdma_cmd_map_mr_sg {
+	__u32 mrn;
+	__u32 npages;
+	__u64 start;
+	__u64 length;
+
+	__u64 pages;
+};
+
+struct vrdma_rsp_map_mr_sg {
+	__u32 npages;
+};
+
 #define VRDMA_CTRL_OK	0
 #define VRDMA_CTRL_ERR	1
 
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index b4c16ddbb..738935e3d 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -12,6 +12,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_addr.h>
+#include <linux/mm_types.h>
 
 #include "vrdma.h"
 #include "vrdma_dev.h"
@@ -21,6 +22,8 @@
 #include "vrdma_mmap.h"
 #include "vrdma_queue.h"
 
+#define VIRTIO_RDMA_PAGE_PER_TBL	512
+
 /**
  * cmd_str - String representation of virtio RDMA control commands
  *
@@ -1677,6 +1680,248 @@ static int vrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
 	return 0;
 }
 
+static void vrdma_get_fw_ver_str(struct ib_device *device, char *str)
+{
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", 1, 0, 0);
+}
+
+static enum rdma_link_layer vrdma_port_link_layer(struct ib_device *ibdev,
+						  u32 port)
+{
+	return IB_LINK_LAYER_ETHERNET;
+}
+
+/**
+ * vrdma_set_page - Callback to collect physical pages from a scatterlist
+ * @ibmr: Memory region being mapped
+ * @addr: Physical address of the current page
+ *
+ * Called by ib_sg_to_pages() for each page in the SG list. Stores the
+ * physical address in a two-level page table:
+ * - Level 1: array of pointers to level-2 tables
+ * - Level 2: each table holds up to 512 page addresses
+ *
+ * This layout allows efficient DMA mapping of large MRs without
+ * allocating one huge array.
+ *
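+ * Example (illustrative): with 512 entries per level-2 table, page
+ * number 1000 is stored at pages_k[1][488], since 1000 / 512 == 1
+ * and 1000 % 512 == 488.
+ *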
+ * Context: Called from ib_sg_to_pages(); does not sleep.
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if the number of pages exceeds the pre-allocated limit
+ */
+static int vrdma_set_page(struct ib_mr *ibmr, u64 addr)
+{
+	struct vrdma_mr *mr = to_vmr(ibmr);
+
+	if (mr->npages >= mr->max_pages) {
+		pr_debug("vRDMA: too many pages for MR (max=%u)\n",
+			 mr->max_pages);
+		return -ENOMEM;
+	}
+
+	/* Two-level indexing: [L1 index][L2 offset] */
+	mr->pages_k[mr->npages / VIRTIO_RDMA_PAGE_PER_TBL]
+		   [mr->npages % VIRTIO_RDMA_PAGE_PER_TBL] = addr;
+	mr->npages++;
+	return 0;
+}
+
+/**
+ * vrdma_map_mr_sg - Map an SG list into the MR page table and notify device
+ * @ibmr: The memory region to map
+ * @sg: Scatterlist describing user/kernel memory chunks
+ * @sg_nents: Number of entries in @sg
+ * @sg_offset: Optional offset into the first sg element, passed through
+ *             to ib_sg_to_pages()
+ *
+ * This function:
+ * 1. Walks the SG list via ib_sg_to_pages()
+ * 2. Populates the software page table using vrdma_set_page()
+ * 3. Sends VIRTIO_RDMA_CMD_MAP_MR_SG to inform the backend about the
+ *    IOVA range and page list
+ *
+ * Note: The actual DMA mapping was already done during ib_umem_get() or
+ * get_dma_mr(); this only sets up hardware-visible metadata.
+ *
+ * Context: Can sleep (called in process context).
+ * Return:
+ * * Number of successfully mapped sg entries (> 0)
+ * * Negative errno on failure
+ */
+static int vrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+			   int sg_nents, unsigned int *sg_offset)
+{
+	struct vrdma_dev *vdev = to_vdev(ibmr->device);
+	struct vrdma_mr *mr = to_vmr(ibmr);
+	struct vrdma_cmd_map_mr_sg *cmd;
+	struct vrdma_rsp_map_mr_sg *rsp;
+	struct scatterlist in, out;
+	int mapped, rc;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp) {
+		kfree(cmd);
+		return -ENOMEM;
+	}
+
+	/* Reset the page counter before traversal */
+	mr->npages = 0;
+
+	/* Use the RDMA core helper to walk the SG list and call
+	 * vrdma_set_page() per page.
+	 */
+	mapped = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, vrdma_set_page);
+	if (mapped < 0) {
+		dev_err(&vdev->vdev->dev, "Failed to map SG to pages: %d\n",
+			mapped);
+		goto out_free;
+	}
+
+	/* Prepare the command for device notification */
+	cmd->mrn = mr->mr_handle;
+	cmd->start = ibmr->iova;
+	cmd->length = ibmr->length;
+	cmd->npages = mr->npages;
+	cmd->pages = mr->dma_pages;	/* Pre-DMA-mapped array of page addrs */
+
+	sg_init_one(&in, cmd, sizeof(*cmd));
+	sg_init_one(&out, rsp, sizeof(*rsp));
+
+	/* Notify the backend about the new mapping */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_MAP_MR_SG, &in, &out);
+	if (rc) {
+		dev_err(&vdev->vdev->dev,
+			"VIRTIO_RDMA_CMD_MAP_MR_SG failed for mrn=0x%x, err=%d\n",
+			mr->mr_handle, rc);
+		mapped = -EIO;
+	}
+
+out_free:
+	kfree(rsp);
+	kfree(cmd);
+	return mapped;	/* number of mapped SG entries, or negative errno */
+}
+
+/**
+ * vrdma_mmap - Map device memory (vring, ubuf, doorbell) into user space
+ * @ctx: User's RDMA context
+ * @vma: VMA describing the mapping request
+ *
+ * Maps memory regions associated with QP/CQ virtqueues into user space.
+ * Three components are supported:
+ * - vring descriptors (shared ring buffer)
+ * - user buffer (optional data exchange area)
+ * - fast doorbell page (if enabled)
+ *
+ * Uses PFN-based remapping for normal memory and I/O remapping for
+ * doorbells.
+ *
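+ * Layout sketch as seen by user space (offsets from vma->vm_start;
+ * vq_size and ubuf_size as computed below):
+ *
+ *   [0, vq_size)                         vring descriptor pages
+ *   [vq_size, vq_size + ubuf_size)       user buffer
+ *   [vq_size + ubuf_size, + PAGE_SIZE)   fast doorbell (QP only, if enabled)
+ *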
+ * Context: Called during mmap() in process context.
+ * Return:
+ * * 0 on success
+ * * -EINVAL for invalid parameters or a layout mismatch
+ * * Negative errno propagated from the remap helpers
+ */
+int vrdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+	struct vrdma_ucontext *uctx = to_vucontext(ctx);
+	size_t requested_size = vma->vm_end - vma->vm_start;
+	struct rdma_user_mmap_entry *rdma_entry;
+	struct vrdma_user_mmap_entry *entry;
+	int rc;
+
+	/* Must be page-aligned */
+	if (vma->vm_start & (PAGE_SIZE - 1)) {
+		pr_warn("vRDMA: mmap start not page aligned: %#lx\n",
+			vma->vm_start);
+		return -EINVAL;
+	}
+
+	/* Look up the registered mmap entry */
+	rdma_entry = rdma_user_mmap_entry_get(&uctx->ibucontext, vma);
+	if (!rdma_entry) {
+		pr_err("vRDMA: mmap lookup failed: pgoff=%lu size=%zu\n",
+		       vma->vm_pgoff, requested_size);
+		return -EINVAL;
+	}
+	entry = to_ventry(rdma_entry);
+
+	switch (entry->mmap_type) {
+	case VRDMA_MMAP_CQ:
+	case VRDMA_MMAP_QP:
+	{
+		unsigned long vq_size =
+			PAGE_ALIGN(vring_size(virtqueue_get_vring_size(entry->vq),
+					      SMP_CACHE_BYTES));
+		unsigned long total_size = vq_size + entry->ubuf_size;
+
+		if (uctx->dev->fast_doorbell &&
+		    entry->mmap_type == VRDMA_MMAP_QP)
+			total_size += PAGE_SIZE;
+
+		if (requested_size != total_size) {
+			pr_warn("vRDMA: mmap size mismatch: got=%zu, expected=%lu\n",
+				requested_size, total_size);
+			rc = -EINVAL;
+			goto out_put;
+		}
+
+		/* Map the vring descriptor table */
+		rc = remap_pfn_range(vma, vma->vm_start,
+				     page_to_pfn(virt_to_page(virtqueue_get_vring(entry->vq)->desc)),
+				     vq_size, vma->vm_page_prot);
+		if (rc) {
+			pr_warn("vRDMA: remap vring failed: %d\n", rc);
+			goto out_put;
+		}
+
+		/* Map the user buffer (shared data region) */
+		rc = remap_pfn_range(vma, vma->vm_start + vq_size,
+				     page_to_pfn(virt_to_page(entry->user_buf)),
+				     entry->ubuf_size, vma->vm_page_prot);
+		if (rc) {
+			pr_warn("vRDMA: remap ubuf failed: %d\n", rc);
+			goto out_put;
+		}
+
+		/* Optionally map the fast doorbell register (QP only) */
+		if (uctx->dev->fast_doorbell &&
+		    entry->mmap_type == VRDMA_MMAP_QP) {
+			unsigned long db_addr = vma->vm_start + vq_size +
+						entry->ubuf_size;
+			struct virtqueue *vq = entry->vq;
+
+			rc = io_remap_pfn_range(vma, db_addr,
+						vmalloc_to_pfn(vq->priv),
+						PAGE_SIZE, vma->vm_page_prot);
+			if (rc) {
+				pr_warn("vRDMA: remap doorbell failed: %d\n", rc);
+				goto out_put;
+			}
+		}
+
+		break;
+	}
+	default:
+		pr_err("vRDMA: invalid mmap type %d\n", entry->mmap_type);
+		rc = -EINVAL;
+		goto out_put;
+	}
+
+	/* Success */
+	rdma_user_mmap_entry_put(rdma_entry);
+	return 0;
+
+out_put:
+	rdma_user_mmap_entry_put(rdma_entry);
+	return rc;
+}
+
+void vrdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
+{
+	struct vrdma_user_mmap_entry *entry = to_ventry(rdma_entry);
+
+	kfree(entry);
+}
+
 static const struct ib_device_ops vrdma_dev_ops = {
 	.owner = THIS_MODULE,
 	.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -1701,6 +1946,11 @@ static const struct ib_device_ops vrdma_dev_ops = {
 	.dealloc_ucontext = vrdma_dealloc_ucontext,
 	.create_ah = vrdma_create_ah,
 	.destroy_ah = vrdma_destroy_ah,
+	.get_dev_fw_str = vrdma_get_fw_ver_str,
+	.get_link_layer = vrdma_port_link_layer,
+	.map_mr_sg = vrdma_map_mr_sg,
+	.mmap = vrdma_mmap,
+	.mmap_free = vrdma_mmap_free,
 };
 
 /**
-- 
2.43.0
