From: xiongweimin <[email protected]>

This commit adds foundational resource management capabilities to the
vhost-user RDMA driver, enabling full RDMA operations:

1. Memory Region (MR) Management:
   - DMA MR registration via GET_DMA_MR
   - Two-level page table for large scatter-gather lists
   - CREATE_MR/DEREG_MR backend command flow
   - Atomic command execution with virtqueue

2. Global Identifier (GID) Management:
   - ADD_GID/DEL_GID backend commands
   - RoCE v1/v2 GID type support
   - Port-based GID table operations

3. User Context (ucontext) Support:
   - Allocation and deallocation hooks
   - Device association for future PD/CQ/MR management

4. Address Handle (AH) Management:
   - RoCE-specific AH creation/validation
   - Unicast GRH enforcement
   - Device-wide AH limit tracking

Key technical features:
- MRs support both DMA-direct and user-backed registrations
- Page table optimized for large scatter-gather lists
- GID operations integrate with RDMA core notifications
- AHs store full address vectors for packet construction
- Resource limits enforced via atomic counters

Signed-off-by: Xiong Weimin <[email protected]>
---
 .../infiniband/hw/virtio/vrdma_dev_api.h      |  40 ++
 .../drivers/infiniband/hw/virtio/vrdma_ib.c   | 600 ++++++++++++++++++
 .../drivers/infiniband/hw/virtio/vrdma_ib.h   |  80 +++
 3 files changed, 720 insertions(+)

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index d1db1bea4..da99f1f32 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -160,6 +160,46 @@ struct vrdma_cmd_destroy_qp {
 	__u32 qpn;
 };
 
+struct vrdma_cmd_get_dma_mr {
+	__u32 pdn;
+	__u32 access_flags;
+};
+
+struct vrdma_rsp_get_dma_mr {
+	__u32 mrn;
+	__u32 lkey;
+	__u32 rkey;
+};
+
+struct vrdma_cmd_create_mr {
+	__u32 pdn;
+	__u32 access_flags;
+
+	__u32 max_num_sg;
+};
+
+struct vrdma_rsp_create_mr {
+	__u32 mrn;
+	__u32 lkey;
+	__u32 rkey;
+};
+
+struct vrdma_cmd_dereg_mr {
+	__u32 mrn;
+};
+
+struct vrdma_cmd_add_gid {
+	__u8 gid[16];
+	__u32 gid_type;
+	__u16 index;
+	__u32 port_num;
+};
+
+struct vrdma_cmd_del_gid {
+	__u16 index;
+	__u32 port_num;
+};
+
 #define VRDMA_CTRL_OK 0
 #define VRDMA_CTRL_ERR 1
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index f1f53314f..b4c16ddbb 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -1086,6 +1086,597 @@ static int vrdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 	return rc;
 }
 
+/**
+ * vrdma_get_dma_mr - Get a DMA memory region (direct-access MR without user backing)
+ * @pd: Protection Domain to associate this MR with
+ * @flags: Access permissions (IB_ACCESS_LOCAL_WRITE, IB_ACCESS_REMOTE_READ, etc.)
+ *
+ * This function creates a special type of Memory Region (MR) that covers
+ * DMA-capable kernel memory, typically used for zero-copy or kernel-space
+ * registrations that have no user buffer behind them.
+ *
+ * It issues the VIRTIO_RDMA_CMD_GET_DMA_MR command to the backend device,
+ * which returns:
+ * - An MR handle (mrn)
+ * - A Local Key (lkey)
+ * - A Remote Key (rkey)
+ *
+ * Unlike MRs registered through the reg_user_mr() verb, this MR is not
+ * backed by user-space virtual memory (i.e., there is no ib_umem). It is
+ * typically used for device-specific buffers, scratch memory, or control
+ * structures.
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * &mr->ibmr on success
+ * * ERR_PTR(-ENOMEM) if memory allocation fails
+ * * ERR_PTR(rc) if the device command fails
+ */
+static struct ib_mr *vrdma_get_dma_mr(struct ib_pd *pd, int flags)
+{
+	struct vrdma_dev *vdev = to_vdev(pd->device);
+	struct vrdma_mr *mr;
+	struct vrdma_cmd_get_dma_mr *cmd;
+	struct vrdma_rsp_get_dma_mr *rsp;
+	struct scatterlist in, out;
+	int rc;
+
+	/* Allocate software MR structure */
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd) {
+		rc = -ENOMEM;
+		goto err_free_mr;
+	}
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp) {
+		rc = -ENOMEM;
+		goto err_free_cmd;
+	}
+
+	/* Prepare command parameters */
+	cmd->pdn = to_vpd(pd)->pd_handle;
+	cmd->access_flags = flags;
+
+	sg_init_one(&in, cmd, sizeof(*cmd));
+	sg_init_one(&out, rsp, sizeof(*rsp));
+
+	/* Send GET_DMA_MR command to device */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_GET_DMA_MR, &in, &out);
+	if (rc) {
+		dev_err(&vdev->vdev->dev,
+			"GET_DMA_MR command failed: %d\n", rc);
+		goto err_free_rsp;
+	}
+
+	/* Initialize MR fields from response */
+	mr->mr_handle = rsp->mrn;
+	mr->ibmr.lkey = rsp->lkey;
+	mr->ibmr.rkey = rsp->rkey;
+	mr->ibmr.pd = pd;
+	mr->ibmr.device = pd->device;
+	mr->ibmr.type = IB_MR_TYPE_MEM_REG;	/* Standard memory registration */
+
+	/* No backing user memory */
+	mr->umem = NULL;
+	mr->iova = 0;
+	mr->size = 0;
+	mr->pages = NULL;
+	mr->pages_k = NULL;
+	mr->dma_pages = 0;
+	mr->npages = 0;
+	mr->max_pages = 0;
+
+	/* Clean up command/response buffers */
+	kfree(cmd);
+	kfree(rsp);
+
+	return &mr->ibmr;
+
+err_free_rsp:
+	kfree(rsp);
+err_free_cmd:
+	kfree(cmd);
+err_free_mr:
+	kfree(mr);
+	return ERR_PTR(rc);
+}
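+
+/*
+ * Illustrative sketch, not part of this patch: the RDMA core invokes the
+ * get_dma_mr() verb from ib_alloc_pd() when a device does not advertise
+ * IB_DEVICE_LOCAL_DMA_LKEY, and stores the resulting lkey in
+ * pd->local_dma_lkey. A kernel ULP posting from a DMA-mapped buffer then
+ * uses that key roughly as follows (dma_addr/len are hypothetical):
+ *
+ *	struct ib_sge sge = {
+ *		.addr   = dma_addr,		// from dma_map_single()
+ *		.length = len,
+ *		.lkey   = pd->local_dma_lkey,	// backed by this DMA MR
+ *	};
+ */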
+
+/**
+ * vrdma_init_page_tbl - Initialize a two-level page table for MR management
+ * @dev: vRDMA device pointer
+ * @npages: Maximum number of data pages this table can map
+ * @pages_dma: Output: L1 table whose entries hold the DMA addresses of the L2 tables
+ * @dma_pages_p: Output: DMA address of the L1 table itself
+ *
+ * This function sets up the two-level page table structure used in Memory
+ * Region (MR) registration to support scatter-gather I/O. The layout is:
+ *
+ * L1 (Level 1): A single DMA-coherent page holding pointers to L2 tables.
+ *               Its DMA address is passed to hardware via a WQE or command.
+ *
+ * L2 (Level 2): An array of pages, each holding up to 512 x 8-byte DMA
+ *               addresses (for a 4KB page size). Each L2 table maps part
+ *               of the S/G list.
+ *
+ * Example:
+ *   npages = 1024 => needs 1024 / 512 = 2 L2 tables
+ *
+ * Return:
+ * Kernel virtual address of the tracking array (pages_k), which stores the
+ * virtual addresses of the L2 tables for later cleanup.
+ * On failure, returns NULL after freeing everything allocated so far.
+ */
+static uint64_t **vrdma_init_page_tbl(struct vrdma_dev *dev,
+				      unsigned int npages,
+				      uint64_t ***pages_dma,
+				      dma_addr_t *dma_pages_p)
+{
+	unsigned int nl2 = (npages == 0) ? 0 : (npages + 511) / 512; /* ceil(npages / 512) */
+	uint64_t **l1_table;	/* L1: stores DMA addrs of L2s (device-readable) */
+	uint64_t **l1_table_k;	/* L1: stores kernel vaddrs of L2s (for free) */
+	dma_addr_t l1_dma_addr;
+	dma_addr_t l2_dma_addr;
+	int i;
+
+	/* Allocate L1 table: must be DMA-coherent because the device reads it */
+	l1_table = dma_alloc_coherent(dev->vdev->dev.parent, PAGE_SIZE,
+				      &l1_dma_addr, GFP_KERNEL);
+	if (!l1_table)
+		return NULL;
+
+	/* Allocate kernel-space shadow array to track L2 virtual addresses */
+	l1_table_k = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!l1_table_k)
+		goto err_free_l1_table;
+
+	/* Allocate each L2 table (DMA-coherent, one per 512 entries) */
+	for (i = 0; i < nl2; i++) {
+		l1_table_k[i] = dma_alloc_coherent(dev->vdev->dev.parent,
+						   PAGE_SIZE, &l2_dma_addr,
+						   GFP_KERNEL);
+		if (!l1_table_k[i])
+			goto err_free_l2_tables;
+
+		l1_table[i] = (uint64_t *)l2_dma_addr; /* Device sees the DMA address */
+	}
+
+	/* Output parameters */
+	*pages_dma = l1_table;		/* Device-visible L1 (with DMA pointers) */
+	*dma_pages_p = l1_dma_addr;	/* DMA address of the L1 table */
+
+	return l1_table_k;		/* Return kernel view for later cleanup */
+
+err_free_l2_tables:
+	/* Roll back any successfully allocated L2 tables */
+	while (--i >= 0) {
+		dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE,
+				  l1_table_k[i], (dma_addr_t)l1_table[i]);
+	}
+	kfree(l1_table_k);
+
+err_free_l1_table:
+	dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, l1_table,
+			  l1_dma_addr);
+
+	return NULL;
+}
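+
+/*
+ * Illustrative sketch, not part of this patch: with 4KB pages each L2
+ * table holds 512 entries, so the slot for data page n is found with
+ * shift/mask arithmetic:
+ *
+ *	l2  = n >> 9;		// n / 512: which L2 table
+ *	idx = n & 0x1ff;	// n % 512: slot within that table
+ *	pages_k[l2][idx] = page_dma_addr;
+ *
+ * The single-page L1 table likewise holds 512 pointers, bounding one MR
+ * at 512 * 512 = 262144 pages, i.e. 1 GiB of 4KB pages.
+ */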
+
+/**
+ * vrdma_free_page_tbl - Free a two-level page table
+ * @dev: vRDMA device
+ * @pages_k: Return value from vrdma_init_page_tbl (kernel L2 pointers)
+ * @pages: L1 table with DMA addresses (the pages_dma output)
+ * @dma_pages: DMA address of the L1 table
+ * @npages: Number of pages the table was sized for
+ *
+ * Frees the L1 table and all L2 page tables allocated by vrdma_init_page_tbl.
+ */
+static void vrdma_free_page_tbl(struct vrdma_dev *dev,
+				uint64_t **pages_k,
+				uint64_t **pages,
+				dma_addr_t dma_pages,
+				unsigned int npages)
+{
+	unsigned int nl2 = (npages == 0) ? 0 : (npages + 511) / 512;
+	int i;
+
+	if (!pages_k || !pages)
+		return;
+
+	/*
+	 * Free all L2 tables. pages[i] already holds the DMA address handed
+	 * out by dma_alloc_coherent(), so pass it back directly; translating
+	 * it with virt_to_phys() would be wrong, since it is not a kernel
+	 * virtual address.
+	 */
+	for (i = 0; i < nl2; i++) {
+		if (pages_k[i])
+			dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE,
+					  pages_k[i], (dma_addr_t)pages[i]);
+	}
+
+	/* Free L1 tracking array */
+	kfree(pages_k);
+
+	/* Free L1 DMA table */
+	dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, pages, dma_pages);
+}
+
+/**
+ * vrdma_alloc_mr - Allocate a multi-segment Memory Region (MR) with page tables
+ * @pd: Protection Domain to associate the MR with
+ * @mr_type: Type of MR (must be IB_MR_TYPE_MEM_REG)
+ * @max_num_sg: Maximum number of scatter/gather entries supported by this MR
+ *
+ * This function allocates a software MR structure and reserves a hardware MR
+ * context on the backend vRDMA device. It prepares a two-level page table
+ * (L1/L2) able to hold up to @max_num_sg pages, which is filled in later
+ * during registration (e.g., via ib_map_mr_sg()).
+ *
+ * The allocated MR is not yet backed by any actual memory - it serves as a
+ * container for future page population, as used by fast-registration (FRWR)
+ * consumers.
+ *
+ * Command flow:
+ * - Sends VIRTIO_RDMA_CMD_CREATE_MR to the device
+ * - Receives mr_handle, lkey and rkey in the response
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * &mr->ibmr on success
+ * * ERR_PTR(-EINVAL) for an unsupported MR type
+ * * ERR_PTR(-ENOMEM) if memory allocation fails
+ * * ERR_PTR(rc) if the device command fails
+ */
+static struct ib_mr *vrdma_alloc_mr(struct ib_pd *pd,
+				    enum ib_mr_type mr_type,
+				    u32 max_num_sg)
+{
+	struct vrdma_dev *vdev = to_vdev(pd->device);
+	struct vrdma_mr *mr;
+	struct vrdma_cmd_create_mr *cmd;
+	struct vrdma_rsp_create_mr *rsp;
+	struct scatterlist in, out;
+	int rc;
+
+	/* Only support standard memory registration */
+	if (mr_type != IB_MR_TYPE_MEM_REG)
+		return ERR_PTR(-EINVAL);
+
+	/* Allocate software MR structure */
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd) {
+		rc = -ENOMEM;
+		goto err_free_mr;
+	}
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp) {
+		rc = -ENOMEM;
+		goto err_free_cmd;
+	}
+
+	/*
+	 * Allocate the two-level page table for S/G support.
+	 * Each L2 table holds PAGE_SIZE / sizeof(u64) entries;
+	 * the L1 table points to the L2s.
+	 */
+	mr->pages_k = vrdma_init_page_tbl(vdev, max_num_sg,
+					  &mr->pages, &mr->dma_pages);
+	if (!mr->pages_k) {
+		dev_err(&vdev->vdev->dev,
+			"Failed to allocate page table for %u S/G entries\n",
+			max_num_sg);
+		rc = -ENOMEM;
+		goto err_free_rsp;
+	}
+
+	mr->max_pages = max_num_sg;
+	mr->npages = 0;
+	mr->umem = NULL;	/* No user memory backing at this stage */
+	mr->iova = 0;
+	mr->size = 0;
+
+	/* Prepare command */
+	cmd->pdn = to_vpd(pd)->pd_handle;
+	cmd->max_num_sg = max_num_sg;
+
+	sg_init_one(&in, cmd, sizeof(*cmd));
+	sg_init_one(&out, rsp, sizeof(*rsp));
+
+	/* Send CREATE_MR command to the backend device */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_CREATE_MR, &in, &out);
+	if (rc) {
+		dev_err(&vdev->vdev->dev, "CREATE_MR failed: %d\n", rc);
+		goto err_free_page_tbl;
+	}
+
+	/* Initialize MR metadata from the response */
+	mr->mr_handle = rsp->mrn;
+	mr->ibmr.lkey = rsp->lkey;
+	mr->ibmr.rkey = rsp->rkey;
+	mr->ibmr.pd = pd;
+	mr->ibmr.device = &vdev->ib_dev;
+	mr->ibmr.type = IB_MR_TYPE_MEM_REG;
+
+	/* Clean up command/response buffers */
+	kfree(cmd);
+	kfree(rsp);
+
+	return &mr->ibmr;
+
+err_free_page_tbl:
+	vrdma_free_page_tbl(vdev, mr->pages_k, mr->pages, mr->dma_pages,
+			    max_num_sg);
+err_free_rsp:
+	kfree(rsp);
+err_free_cmd:
+	kfree(cmd);
+err_free_mr:
+	kfree(mr);
+	return ERR_PTR(rc);
+}
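+
+/*
+ * Illustrative sketch, not part of this patch: a kernel consumer drives
+ * this verb through the core fast-registration API; the page-table slots
+ * reserved here are filled in when the core maps an S/G list into the MR
+ * (the driver-side map_mr_sg/set_page hooks are added separately):
+ *
+ *	struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 64);
+ *	int n = ib_map_mr_sg(mr, sgl, sg_nents, NULL, PAGE_SIZE);
+ *	// ... post an IB_WR_REG_MR work request, use mr->lkey/rkey ...
+ *	ib_dereg_mr(mr);
+ */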
+
+/**
+ * vrdma_dereg_mr - Deregister and destroy a Memory Region (MR)
+ * @ibmr: The IB memory region to deregister
+ * @udata: User data (optional, for user-space MRs)
+ *
+ * This function unregisters a previously allocated MR from the vRDMA device.
+ * It performs the following steps:
+ * 1. Sends the VIRTIO_RDMA_CMD_DEREG_MR command to the backend device
+ * 2. Frees the software page tables (L1/L2) used for scatter-gather mapping
+ * 3. Releases user memory (if any) via ib_umem_release()
+ * 4. Frees the local metadata (struct vrdma_mr)
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if the command buffer cannot be allocated
+ * * -EIO if device communication fails
+ */
+static int vrdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+	struct vrdma_dev *vdev = to_vdev(ibmr->device);
+	struct vrdma_mr *mr = to_vmr(ibmr);
+	struct vrdma_cmd_dereg_mr *cmd;
+	struct scatterlist in;
+	int rc;
+
+	/* Allocate command buffer */
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	/* Prepare command */
+	cmd->mrn = mr->mr_handle;
+	sg_init_one(&in, cmd, sizeof(*cmd));
+
+	/* Notify hardware to release the MR context */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_DEREG_MR, &in, NULL);
+	if (rc) {
+		dev_err(&vdev->vdev->dev,
+			"VIRTIO_RDMA_CMD_DEREG_MR failed for mrn=0x%x, err=%d\n",
+			mr->mr_handle, rc);
+		rc = -EIO;
+		goto out_free_cmd;
+	}
+
+	/* Free the two-level page table used for S/G entries */
+	vrdma_free_page_tbl(vdev, mr->pages_k, mr->pages, mr->dma_pages,
+			    mr->max_pages);
+
+	/* Release user memory if present */
+	if (mr->umem)
+		ib_umem_release(mr->umem);
+
+	/* Free the software MR structure itself (step 4 above) */
+	kfree(mr);
+
+	kfree(cmd);
+	return 0;
+
+out_free_cmd:
+	kfree(cmd);
+	return rc;
+}
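+
+/*
+ * Illustrative sketch, not part of this patch: each verb above funnels
+ * through vrdma_exec_verbs_cmd(), whose body lies outside this hunk. A
+ * typical virtio control-queue implementation of such a helper looks
+ * roughly like this (the ctrl_vq, ctrl_done and cmd_hdr names are
+ * hypothetical):
+ *
+ *	struct scatterlist hdr, *sgs[3] = { &hdr, in, out };
+ *
+ *	sg_init_one(&hdr, &cmd_hdr, sizeof(cmd_hdr));
+ *	virtqueue_add_sgs(vdev->ctrl_vq, sgs, 2, out ? 1 : 0,
+ *			  &cmd_hdr, GFP_KERNEL);
+ *	virtqueue_kick(vdev->ctrl_vq);
+ *	wait_for_completion(&vdev->ctrl_done);	// woken from the vq callback
+ *	return cmd_hdr.status == VRDMA_CTRL_OK ? 0 : -EIO;
+ */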
+
+/**
+ * vrdma_add_gid - Add a GID (Global Identifier) entry to the hardware
+ * @attr: GID attribute containing port, index, GID value, and GID type
+ * @context: Pointer to store driver-specific context (unused in vRDMA)
+ *
+ * This callback is invoked by the RDMA core when a GID table entry is
+ * added, either manually via sysfs or automatically when an IP address is
+ * assigned to the associated netdev.
+ *
+ * The function sends VIRTIO_RDMA_CMD_ADD_GID to the backend device to
+ * register the GID at the specified index and port, allowing the device
+ * to use it for RoCE traffic (e.g., as the source GID in a GRH).
+ *
+ * Note: @context is unused since vRDMA maintains no additional per-GID
+ * software state.
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if allocation fails
+ * * -EIO if the device command fails
+ */
+static int vrdma_add_gid(const struct ib_gid_attr *attr, void **context)
+{
+	struct vrdma_dev *vdev = to_vdev(attr->device);
+	struct vrdma_cmd_add_gid *cmd;
+	struct scatterlist in;
+	int rc;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	/* Fill command parameters */
+	memcpy(cmd->gid, attr->gid.raw, sizeof(cmd->gid));
+	cmd->index = attr->index;
+	cmd->port_num = attr->port_num;
+	cmd->gid_type = attr->gid_type;	/* e.g., IB_GID_TYPE_ROCE or IB_GID_TYPE_ROCE_UDP_ENCAP */
+
+	sg_init_one(&in, cmd, sizeof(*cmd));
+
+	/* Send command to backend */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_ADD_GID, &in, NULL);
+	if (rc)
+		dev_err(&vdev->vdev->dev,
+			"ADD_GID failed: port=%u index=%u type=%d, err=%d\n",
+			attr->port_num, attr->index, attr->gid_type, rc);
+
+	kfree(cmd);
+	return rc ? -EIO : 0;
+}
+
+/**
+ * vrdma_del_gid - Remove a GID entry from the hardware
+ * @attr: GID attribute specifying which GID to delete (by index/port)
+ * @context: Driver-specific context (passed from add_gid; unused here)
+ *
+ * This callback is invoked when a GID is removed from the GID table. It
+ * tells the backend device to invalidate the GID mapping at the given
+ * index.
+ *
+ * The @context pointer is ignored because vRDMA keeps no per-GID software
+ * state.
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if allocation fails
+ * * -EIO if the device command fails
+ */
+static int vrdma_del_gid(const struct ib_gid_attr *attr, void **context)
+{
+	struct vrdma_dev *vdev = to_vdev(attr->device);
+	struct vrdma_cmd_del_gid *cmd;
+	struct scatterlist in;
+	int rc;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	/* Only index and port are needed to identify the GID */
+	cmd->index = attr->index;
+	cmd->port_num = attr->port_num;
+
+	sg_init_one(&in, cmd, sizeof(*cmd));
+
+	/* Send command to backend */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_DEL_GID, &in, NULL);
+	if (rc)
+		dev_err(&vdev->vdev->dev,
+			"DEL_GID failed: port=%u index=%u, err=%d\n",
+			attr->port_num, attr->index, rc);
+
+	kfree(cmd);
+	return rc ? -EIO : 0;
+}
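+
+/*
+ * Illustrative note, not part of this patch: the RDMA core derives the GID
+ * entries that reach these callbacks from the addresses of the associated
+ * netdev. For RoCE v2, an IPv4 address travels as an IPv4-mapped IPv6 GID,
+ * e.g. 192.0.2.1 becomes:
+ *
+ *	gid.raw = 00 00 00 00 00 00 00 00 00 00 ff ff c0 00 02 01
+ *
+ * letting the backend rebuild IP header fields for routed RoCE v2 traffic,
+ * while RoCE v1 GIDs are MAC-derived link-local addresses used only in
+ * the GRH.
+ */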
+
+static int vrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
+{
+	struct vrdma_ucontext *vuc = to_vucontext(uctx);
+
+	/* Associate the context with its device for later PD/CQ/MR management */
+	vuc->dev = to_vdev(uctx->device);
+
+	return 0;
+}
+
+static void vrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+	/* No per-context state to tear down yet */
+}
+
+/**
+ * vrdma_create_ah - Create an Address Handle (AH) for RoCE communication
+ * @ibah: IB address handle to initialize
+ * @init_attr: AH initialization attributes
+ * @udata: User data (unused in vRDMA)
+ *
+ * This function creates a software-only Address Handle (AH), which
+ * describes the remote destination for UD QP sends. Since this is a
+ * virtualized driver, no hardware command is sent; instead, the AH context
+ * is stored locally in struct vrdma_ah for later use during packet
+ * construction.
+ *
+ * The AH must:
+ * - Be of RoCE type
+ * - Contain a GRH (Global Routing Header)
+ * - Not be multicast (currently unsupported)
+ *
+ * The function also enforces the device limit on the number of active AHs
+ * via an atomic counter.
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * 0 on success
+ * * -EINVAL if attributes are invalid
+ * * -ENOMEM if the AH limit is exceeded
+ */
+static int vrdma_create_ah(struct ib_ah *ibah,
+			   struct rdma_ah_init_attr *init_attr,
+			   struct ib_udata *udata)
+{
+	struct vrdma_dev *vdev = to_vdev(ibah->device);
+	struct vrdma_ah *ah = to_vah(ibah);
+	const struct ib_global_route *grh;
+	u32 port_num = rdma_ah_get_port_num(init_attr->ah_attr);
+
+	/* Must have GRH enabled */
+	if (!(rdma_ah_get_ah_flags(init_attr->ah_attr) & IB_AH_GRH))
+		return -EINVAL;
+
+	grh = rdma_ah_read_grh(init_attr->ah_attr);
+
+	/* Only support RoCE AHs with a unicast destination GID */
+	if (init_attr->ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE)
+		return -EINVAL;
+
+	if (rdma_is_multicast_addr((struct in6_addr *)grh->dgid.raw)) {
+		dev_dbg(&vdev->vdev->dev, "Multicast GID not supported in AH\n");
+		return -EINVAL;
+	}
+
+	/* Enforce the max_ah limit with a bounded atomic increment */
+	if (!atomic_add_unless(&vdev->num_ah, 1, vdev->ib_dev.attrs.max_ah)) {
+		dev_dbg(&vdev->vdev->dev, "Exceeded max number of AHs (%u)\n",
+			vdev->ib_dev.attrs.max_ah);
+		return -ENOMEM;
+	}
+
+	/* Initialize the AV (Address Vector) with the relevant fields */
+	ah->av.port = port_num;
+	ah->av.pdn = to_vpd(ibah->pd)->pd_handle;	/* Protection Domain Number */
+	ah->av.gid_index = grh->sgid_index;		/* Source GID table index */
+	ah->av.hop_limit = grh->hop_limit;
+
+	/* 8-bit traffic class in bits [27:20], 20-bit flow label in [19:0] */
+	ah->av.sl_tclass_flowlabel = (u32)(grh->traffic_class << 20) |
+				     (grh->flow_label & 0xfffff);
+
+	memcpy(ah->av.dgid, grh->dgid.raw, sizeof(ah->av.dgid));	/* 128-bit Dest GID */
+	memcpy(ah->av.dmac, init_attr->ah_attr->roce.dmac, ETH_ALEN);	/* Next-hop MAC */
+
+	return 0;
+}
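+
+/*
+ * Illustrative sketch, not part of this patch: a worked example of the
+ * sl_tclass_flowlabel packing above, for traffic_class = 0x60 (DSCP CS3)
+ * and flow_label = 0x12345:
+ *
+ *	(0x60 << 20) | 0x12345 = 0x06000000 | 0x00012345 = 0x06012345
+ *
+ * Bits [27:20] carry the traffic class and bits [19:0] the flow label,
+ * matching the IPv6 header layout the backend rebuilds from the AV.
+ */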
+
+/**
+ * vrdma_destroy_ah - Destroy an Address Handle
+ * @ibah: The IB address handle to destroy
+ * @flags: Destroy flags (e.g., RDMA_DESTROY_AH_SLEEPABLE; unused here)
+ *
+ * This callback releases the software state associated with an AH.
+ * It decrements the per-device AH counter to allow new AH creation.
+ *
+ * No hardware interaction is needed since AHs are purely software
+ * constructs in this virtio-rdma implementation.
+ *
+ * Context: May be called from atomic context; must not sleep unless
+ * @flags contains RDMA_DESTROY_AH_SLEEPABLE.
+ * Return: Always returns 0 (success).
+ */
+static int vrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
+{
+	struct vrdma_dev *vdev = to_vdev(ibah->device);
+
+	atomic_dec(&vdev->num_ah);
+
+	return 0;
+}
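+
+/*
+ * Illustrative note, not part of this patch: the AH accounting relies on
+ * atomic_add_unless() refusing the increment once the counter has reached
+ * attrs.max_ah, so concurrent creators can never overshoot the limit:
+ *
+ *	if (!atomic_add_unless(&vdev->num_ah, 1, max_ah))
+ *		return -ENOMEM;		// counter already at max_ah
+ *	...
+ *	atomic_dec(&vdev->num_ah);	// destroy path releases the slot
+ */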
+
 static const struct ib_device_ops vrdma_dev_ops = {
 	.owner = THIS_MODULE,
 	.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -1101,6 +1692,15 @@ static const struct ib_device_ops vrdma_dev_ops = {
 	.dealloc_pd = vrdma_dealloc_pd,
 	.create_qp = vrdma_create_qp,
 	.destroy_qp = vrdma_destroy_qp,
+	.get_dma_mr = vrdma_get_dma_mr,
+	.alloc_mr = vrdma_alloc_mr,
+	.dereg_mr = vrdma_dereg_mr,
+	.add_gid = vrdma_add_gid,
+	.del_gid = vrdma_del_gid,
+	.alloc_ucontext = vrdma_alloc_ucontext,
+	.dealloc_ucontext = vrdma_dealloc_ucontext,
+	.create_ah = vrdma_create_ah,
+	.destroy_ah = vrdma_destroy_ah,
 };
 
 /**
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
index ba88599c8..6759c4349 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
@@ -11,6 +11,8 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/vrdma_abi.h>
 
+#include "vrdma_abi.h"
+
 #define VRDMA_COMM_TIMEOUT 1000000
 
 enum vrdma_type {
@@ -130,6 +132,11 @@ struct vrdma_ucontext {
 	struct vrdma_dev *dev;
 };
 
+struct vrdma_ah {
+	struct ib_ah ibah;
+	struct vrdma_av av;
+};
+
 /**
  * struct vrdma_qp - Virtual RDMA Queue Pair (QP) private data
  *
@@ -166,6 +173,64 @@ struct vrdma_qp {
 	struct vrdma_user_mmap_entry *rq_entry;	/* Mmap entry for RQ buffer */
 };
 
+/**
+ * struct vrdma_mr - Software state of a Virtio-RDMA Memory Region (MR)
+ * @ibmr: InfiniBand core MR object (contains rkey, lkey, etc.)
+ * @umem: User memory descriptor from ib_umem_get(); holds the page
+ *        list and a reference to the user VMA
+ * @mr_handle: Handle returned by the backend device for this MR
+ * @iova: I/O virtual address (start of the mapped region)
+ * @size: Total size of the memory region in bytes
+ * @pages: Level 1 (L1) page table: an array holding the DMA addresses
+ *         of the level 2 (L2) page tables, as seen by the device.
+ *         The driver uses it to manage the scatter-gather layout.
+ * @pages_k: Array of kernel virtual addresses of the L2 page tables,
+ *           used to free them correctly during cleanup.
+ * @dma_pages: DMA address of the L1 (first-level) page table, passed
+ *             to the device in a command or WQE.
+ * @npages: Number of valid pages in the memory region
+ * @max_pages: Maximum number of pages the current page table
+ *             allocation can hold (set when the MR is created)
+ *
+ * This structure represents a registered memory region in the vRDMA driver.
+ * It supports large memory registrations using a two-level page table design:
+ *
+ * L1 Page Table (contiguous, DMA-mapped):
+ *   Contains pointers to multiple L2 tables (each L2 = one page).
+ *
+ * L2 Page Tables:
+ *   Each stores up to N DMA addresses (physical page addresses).
+ *
+ * The layout allows efficient hardware access while keeping kernel
+ * allocations manageable for large mappings (up to 1 GiB of 4K pages
+ * per MR with a single-page L1 table).
+ *
+ * Example layout for 4K pages and 512 entries per L2 table:
+ *
+ *   L1 (dma_pages) -> [L2_0] -> [DMA_ADDR_A, ..., DMA_ADDR_Z]
+ *                     [L2_1] -> [DMA_ADDR_X, ..., DMA_ADDR_Y]
+ *                     ...
+ *
+ * Used during:
+ * - MR allocation and registration (the alloc_mr/reg_user_mr paths)
+ * - SEND/WRITE/READ operations with remote access
+ * - MR invalidation and cleanup in vrdma_dereg_mr()
+ */
+struct vrdma_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+
+	u32 mr_handle;
+	u64 iova;
+	u64 size;
+
+	u64 **pages;		/* L1: array of L2 table DMA addresses (device view) */
+	u64 **pages_k;		/* L1: array of L2 table kernel virtual addresses */
+	dma_addr_t dma_pages;	/* DMA address of the L1 table itself */
+
+	u32 npages;
+	u32 max_pages;
+};
+
 static inline struct vrdma_cq *to_vcq(struct ib_cq *ibcq)
 {
 	return container_of(ibcq, struct vrdma_cq, ibcq);
@@ -181,6 +246,21 @@ static inline struct vrdma_qp *to_vqp(struct ib_qp *ibqp)
 	return container_of(ibqp, struct vrdma_qp, ibqp);
 }
 
+static inline struct vrdma_mr *to_vmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct vrdma_mr, ibmr);
+}
+
+static inline struct vrdma_ucontext *to_vucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct vrdma_ucontext, ibucontext);
+}
+
+static inline struct vrdma_ah *to_vah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct vrdma_ah, ibah);
+}
+
 int vrdma_register_ib_device(struct vrdma_dev *vrdev);
 void vrdma_unregister_ib_device(struct vrdma_dev *vrdev);
-- 
2.43.0
