From: Manish Honap <[email protected]>

Wire the CXL DPA range up as a VFIO demand-paged region so that QEMU
can mmap guest device memory directly. Faults call vmf_insert_pfn() to
insert one PFN at a time rather than mapping the full range upfront.

CXL region lifecycle:

- The CXL memory region is registered with the VFIO layer during
  vfio_pci_open_device().
- mmap() establishes the VMA with vm_ops but inserts no PTEs.
- Each guest page fault calls vfio_cxl_region_vm_fault(), which checks
  region_active and inserts a single PFN with vmf_insert_pfn().
- On device reset, vfio_cxl_zap_region_locked() clears region_active
  and calls unmap_mapping_range() to invalidate all DPA PTEs while
  memory_lock is held for writing.
- Faults racing with reset see region_active == false and return
  VM_FAULT_SIGBUS.
- vfio_cxl_reactivate_region() restores region_active after a
  successful hardware reset.

Also integrate the zap/reactivate calls into vfio_pci_ioctl_reset() so
that FLR correctly invalidates DPA mappings and restores them on
success.
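For illustration, the expected userspace flow looks roughly like this
(a sketch only: idx, region_size and device_fd stand in for values
userspace obtains via VFIO_DEVICE_GET_REGION_INFO; 40 is
VFIO_PCI_OFFSET_SHIFT):

	#include <sys/mman.h>

	/* Map the whole DPA window; no PTEs are inserted yet. */
	off_t off = (off_t)idx << 40;	/* VFIO_PCI_OFFSET_SHIFT */
	void *dpa = mmap(NULL, region_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED, device_fd, off);

	/* The first touch of each page faults into
	 * vfio_cxl_region_vm_fault(), which inserts one PFN. */
	if (dpa != MAP_FAILED)
		((volatile unsigned char *)dpa)[0];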
Co-developed-by: Zhi Wang <[email protected]>
Signed-off-by: Zhi Wang <[email protected]>
Signed-off-by: Manish Honap <[email protected]>
---
 drivers/vfio/pci/cxl/vfio_cxl_core.c | 203 ++++++++++++++++++++++++++++
 drivers/vfio/pci/cxl/vfio_cxl_emu.c  |   2 +-
 drivers/vfio/pci/cxl/vfio_cxl_priv.h |   3 +
 drivers/vfio/pci/vfio_pci_core.c     |  11 ++
 drivers/vfio/pci/vfio_pci_priv.h     |   6 +
 5 files changed, 224 insertions(+), 1 deletion(-)
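Note on the fault-time offset math, for reference (illustrative,
assuming 4K pages, i.e. PAGE_SHIFT == 12, and VFIO_PCI_OFFSET_SHIFT ==
40): vmf->pgoff carries the device region index in its high bits, so
the handler masks those off to recover the page offset within the DPA
window:

	/* bits [27:0] of pgoff: page within the region (40 - 12 = 28) */
	pgoff = vmf->pgoff & ((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	pfn   = PHYS_PFN(cxl->region_hpa) + pgoff;	/* DPA page -> host PFN */
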
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index 30b365b91903..19d3dc205f99 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -435,4 +435,207 @@ void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
 	vfio_cxl_destroy_cxl_region(cxl);
 }
 
+static vm_fault_t vfio_cxl_region_vm_fault(struct vm_fault *vmf)
+{
+	struct vfio_pci_region *region = vmf->vma->vm_private_data;
+	struct vfio_pci_cxl_state *cxl = region->data;
+	unsigned long pgoff;
+	unsigned long pfn;
+
+	if (!READ_ONCE(cxl->region_active))
+		return VM_FAULT_SIGBUS;
+
+	pgoff = vmf->pgoff &
+		((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+	if (pgoff >= (cxl->region_size >> PAGE_SHIFT))
+		return VM_FAULT_SIGBUS;
+
+	pfn = PHYS_PFN(cxl->region_hpa) + pgoff;
+
+	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
+}
+
+static const struct vm_operations_struct vfio_cxl_region_vm_ops = {
+	.fault = vfio_cxl_region_vm_fault,
+};
+
+static int vfio_cxl_region_mmap(struct vfio_pci_core_device *vdev,
+				struct vfio_pci_region *region,
+				struct vm_area_struct *vma)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	u64 req_len, pgoff, end;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+		return -EINVAL;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
+	    (vma->vm_flags & VM_READ))
+		return -EPERM;
+
+	if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
+	    (vma->vm_flags & VM_WRITE))
+		return -EPERM;
+
+	pgoff = vma->vm_pgoff &
+		((1UL << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
+	    check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
+		return -EOVERFLOW;
+
+	if (end > cxl->region_size)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+			  VM_DONTEXPAND | VM_DONTDUMP);
+
+	vma->vm_ops = &vfio_cxl_region_vm_ops;
+	vma->vm_private_data = region;
+
+	return 0;
+}
+
+/*
+ * vfio_cxl_zap_region_locked - Invalidate all DPA region PTEs.
+ *
+ * Must be called with vdev->memory_lock held for writing. Clears
+ * region_active before zapping so that a fault racing the zap sees the
+ * inactive state and returns VM_FAULT_SIGBUS rather than re-inserting
+ * a PFN for a stale mapping.
+ */
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+	int i;
+
+	lockdep_assert_held_write(&vdev->memory_lock);
+
+	if (!cxl)
+		return;
+
+	WRITE_ONCE(cxl->region_active, false);
+
+	/*
+	 * vfio_pci_zap_and_down_write_memory_lock() only zaps the BAR
+	 * mappings; the DPA region lives in the device-specific region
+	 * space, so tear down its PTEs explicitly. Recover the device
+	 * region index by finding the region whose data is our state.
+	 */
+	for (i = 0; i < vdev->num_regions; i++) {
+		if (vdev->region[i].data == cxl) {
+			unmap_mapping_range(vdev->vdev.inode->i_mapping,
+					    VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_NUM_REGIONS + i),
+					    cxl->region_size, true);
+			break;
+		}
+	}
+}
+
+/*
+ * vfio_cxl_reactivate_region - Re-enable DPA region after successful reset.
+ *
+ * Must be called with vdev->memory_lock held for writing. Re-reads the
+ * HDM decoder state from hardware (FLR cleared it) and sets region_active
+ * so that subsequent I/O to the region is permitted again.
+ */
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+	lockdep_assert_held_write(&vdev->memory_lock);
+
+	if (!cxl)
+		return;
+	/*
+	 * Re-initialise the emulated HDM comp_reg_virt[] from hardware.
+	 * After FLR the decoder registers read as zero; mirror that in
+	 * the emulated state so QEMU sees a clean slate.
+	 */
+	vfio_cxl_reinit_comp_regs(cxl);
+
+	/*
+	 * Only re-enable the DPA mmap if the hardware has actually
+	 * re-committed decoder 0 after FLR. Read the COMMITTED bit from the
+	 * freshly-re-snapshotted comp_reg_virt[] so we check the post-FLR
+	 * hardware state, not stale pre-reset state.
+	 *
+	 * If COMMITTED is 0 (slow firmware re-commit path), leave
+	 * region_active=false. Guest faults will return VM_FAULT_SIGBUS
+	 * until the decoder is re-committed and the region is re-enabled.
+	 */
+	if (cxl->precommitted && cxl->comp_reg_virt) {
+		/*
+		 * Read CTRL via the full CXL.mem-relative index: hdm_reg_offset
+		 * (now CXL.mem-relative) plus the within-HDM-block offset.
+		 */
+		u32 ctrl = le32_to_cpu(*hdm_reg_ptr(cxl,
+				CXL_HDM_DECODER0_CTRL_OFFSET(0)));
+
+		if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)
+			WRITE_ONCE(cxl->region_active, true);
+	}
+}
+
+static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
+				  char __user *buf, size_t count, loff_t *ppos,
+				  bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_pci_cxl_state *cxl = core_dev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	if (!count || pos >= cxl->region_size)
+		return 0;
+
+	/*
+	 * Guard against access after a failed reset (region_active=false)
+	 * or a release race (region_vaddr=NULL). Either condition means
+	 * the memremap'd window is no longer valid; touching it would produce
+	 * a Synchronous External Abort. Return -EIO so the caller gets a
+	 * clean error rather than a kernel oops.
+	 */
+	if (!READ_ONCE(cxl->region_active) || !cxl->region_vaddr)
+		return -EIO;
+
+	count = min(count, (size_t)(cxl->region_size - pos));
+
+	if (iswrite) {
+		if (copy_from_user(cxl->region_vaddr + pos, buf, count))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(buf, cxl->region_vaddr + pos, count))
+			return -EFAULT;
+	}
+
+	return count;
+}
+
+static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
+				    struct vfio_pci_region *region)
+{
+	struct vfio_pci_cxl_state *cxl = region->data;
+
+	/*
+	 * Deactivate the region before removing user mappings so that any
+	 * fault handler racing the release returns VM_FAULT_SIGBUS rather
+	 * than inserting a PFN into an unmapped region.
+	 */
+	WRITE_ONCE(cxl->region_active, false);
+
+	if (cxl->region_vaddr) {
+		memunmap(cxl->region_vaddr);
+		cxl->region_vaddr = NULL;
+	}
+}
+
+static const struct vfio_pci_regops vfio_cxl_regops = {
+	.rw = vfio_cxl_region_rw,
+	.mmap = vfio_cxl_region_mmap,
+	.release = vfio_cxl_region_release,
+};
+
 MODULE_IMPORT_NS("CXL");
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_emu.c b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
index 11195e8c21d7..781328a79b43 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_emu.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_emu.c
@@ -33,7 +33,7 @@
  * +0x1c: (reserved)
  */
 
-static inline __le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off)
+__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off)
 {
 	/*
 	 * hdm_off is a byte offset within the HDM decoder block.
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
index 72a0d7d7e183..3458768445af 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -33,6 +33,7 @@ struct vfio_pci_cxl_state {
 	u8 comp_reg_bar;
 	bool cache_capable;
 	bool precommitted;
+	bool region_active;
 };
 
 /* Register access sizes */
@@ -96,4 +97,6 @@ int vfio_cxl_create_cxl_region(struct vfio_pci_cxl_state *cxl,
 			       resource_size_t size);
 void vfio_cxl_destroy_cxl_region(struct vfio_pci_cxl_state *cxl);
 
+__le32 *hdm_reg_ptr(struct vfio_pci_cxl_state *cxl, u32 hdm_off);
+
 #endif /* __LINUX_VFIO_CXL_PRIV_H */
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index b7364178e23d..48e0274c19aa 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1223,6 +1223,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
 
 	vfio_pci_zap_and_down_write_memory_lock(vdev);
 
+	/* Zap CXL DPA region PTEs before hardware reset clears HDM state */
+	vfio_cxl_zap_region_locked(vdev);
+
 	/*
 	 * This function can be invoked while the power state is non-D0. If
 	 * pci_try_reset_function() has been called while the power state is
@@ -1236,6 +1239,14 @@
 	vfio_pci_dma_buf_move(vdev, true);
 
 	ret = pci_try_reset_function(vdev->pdev);
+
+	/*
+	 * Re-enable DPA region if reset succeeded; fault handler will
+	 * re-insert PFNs on next access without requiring a new mmap.
+	 */
+	if (!ret)
+		vfio_cxl_reactivate_region(vdev);
+
 	if (__vfio_pci_memory_enabled(vdev))
 		vfio_pci_dma_buf_move(vdev, false);
 	up_write(&vdev->memory_lock);
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 1082ba43bafe..726063b6ff70 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -145,6 +145,8 @@ static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
 
 void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev);
 void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev);
+void vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev);
 
 #else
 
@@ -152,6 +154,10 @@ static inline void
 vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev) { }
 static inline void
 vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_zap_region_locked(struct vfio_pci_core_device *vdev) { }
+static inline void
+vfio_cxl_reactivate_region(struct vfio_pci_core_device *vdev) { }
 
 #endif /* CONFIG_VFIO_CXL_CORE */
-- 
2.25.1

