The RFC attempts to implement the IOMMUFD support on PPC64 by adding new iommu_ops for paging domain. The existing platform domain continues to be the default domain for in-kernel use. The domain ownership transfer ensures the reset of iommu states for the new paging domain and in-kernel usage.
On PPC64, IOVA ranges are based on the type of the DMA window and their properties. Currently, there is no way to expose the attributes of the non-default 64-bit DMA window, which the platform supports. The platform allows the operating system to select the starting offset(at 4GiB or 512PiB default offset), pagesize and window size for the non-default 64-bit DMA window. For example, with VFIO, this is handled via VFIO_IOMMU_SPAPR_TCE_GET_INFO and VFIO_IOMMU_SPAPR_TCE_CREATE|REMOVE ioctls. While I am exploring the ways to expose and configure these DMA window attributes as per user input, any suggestions in this regard will be very helpful. Currently existing vfio type1 specific vfio-compat driver even with this patch will not work for PPC64. I believe we need to have a separate "vfio-spapr-compat" driver to make it work. So brief list of current open problems and ongoing reworks: - Second DMA window support as mentioned above. - KVM support. - EEH support. - The vfio compat driver for the spapr tce iommu. - Multiple devices (multifunction, same/different iommu group checks, SRIOV VF assignment) support. - Race conditions, device plug/unplug. - self|tests. The patch currently works for single device and exposes only the default DMA window of 1GB to the user. It has been tested for both PowerNV and pSeries machine tce iommu backends. The testing was done using a Qemu[1] and TCG guest having a NVME device passthrough. One can use the command like below to try: qemu-system-ppc64 -machine pseries -accel tcg \ -device spapr-pci-host-bridge,index=1,id=pci.1,ddw=off \ -device vfio-pci,host=<hostdev>,id=hostdev0,\ bus=pci.1.0,addr=0x1,iommufd=iommufd0 \ -object iommufd,id=iommufd0 <...> ... root:localhost# mount /dev/nvme0n1 /mnt root:localhost# ls /mnt ... The current patch is based on linux kernel 6.19-rc6 tree. Signed-off-by: Shivaprasad G Bhat <[email protected]> References: 1 : https://github.com/shivaprasadbhat/qemu/tree/iommufd-wip --- arch/powerpc/include/asm/iommu.h | 2 arch/powerpc/kernel/iommu.c | 181 +++++++++++++++++++++++++ arch/powerpc/platforms/powernv/pci-ioda-tce.c | 4 - arch/powerpc/platforms/powernv/pci-ioda.c | 4 - arch/powerpc/platforms/powernv/pci.h | 2 arch/powerpc/platforms/pseries/iommu.c | 6 - drivers/vfio/Kconfig | 4 - 7 files changed, 190 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index eafdd63cd6c4..1dc72fbb89e7 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -46,7 +46,7 @@ struct iommu_table_ops { long index, long npages, unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs); + unsigned long attrs, bool is_phys); #ifdef CONFIG_IOMMU_API /* * Exchanges existing TCE with new TCE plus direction bits; diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 0ce71310b7d9..e6543480c461 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -365,7 +365,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* Put the TCEs in the HW table */ build_fail = tbl->it_ops->set(tbl, entry, npages, (unsigned long)page & - IOMMU_PAGE_MASK(tbl), direction, attrs); + IOMMU_PAGE_MASK(tbl), direction, attrs, false); /* tbl->it_ops->set() only returns non-zero for transient errors. * Clean up the table bitmap in this case and return @@ -539,7 +539,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, /* Insert into HW table */ build_fail = tbl->it_ops->set(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK(tbl), - direction, attrs); + direction, attrs, false); if(unlikely(build_fail)) goto failure; @@ -1201,7 +1201,15 @@ spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain, * also sets the dma_api ops */ table_group = iommu_group_get_iommudata(grp); + + if (old && old->type == IOMMU_DOMAIN_DMA) { + ret = table_group->ops->unset_window(table_group, 0); + if (ret) + goto exit; + } + ret = table_group->ops->take_ownership(table_group, dev); +exit: iommu_group_put(grp); return ret; @@ -1260,6 +1268,167 @@ static struct iommu_group *spapr_tce_iommu_device_group(struct device *dev) return hose->controller_ops.device_group(hose, pdev); } +struct ppc64_domain { + struct iommu_domain domain; + struct device *device; /* Make it a list */ + struct iommu_table *table; + spinlock_t list_lock; + struct rcu_head rcu; +}; + +static struct ppc64_domain *to_ppc64_domain(struct iommu_domain *dom) +{ + return container_of(dom, struct ppc64_domain, domain); +} + +static void spapr_tce_domain_free(struct iommu_domain *domain) +{ + struct ppc64_domain *ppc64_domain = to_ppc64_domain(domain); + + kfree(ppc64_domain); +} + +static const struct iommu_ops spapr_tce_iommu_ops; +static struct iommu_domain *spapr_tce_domain_alloc_paging(struct device *dev) +{ + struct iommu_group *grp = iommu_group_get(dev); + struct iommu_table_group *table_group; + struct ppc64_domain *ppc64_domain; + struct iommu_table *ptbl; + int ret = -1; + + table_group = iommu_group_get_iommudata(grp); + ppc64_domain = kzalloc(sizeof(*ppc64_domain), GFP_KERNEL); + if (!ppc64_domain) + return NULL; + + /* Just the default window hardcode for now */ + ret = table_group->ops->create_table(table_group, 0, 0xc, 0x40000000, 1, &ptbl); + iommu_tce_table_get(ptbl); + ppc64_domain->table = ptbl; /* REVISIT: Single device for now */ + if (!ppc64_domain->table) { + kfree(ppc64_domain); + iommu_tce_table_put(ptbl); + iommu_group_put(grp); + return NULL; + } + + table_group->ops->set_window(table_group, 0, ptbl); + iommu_group_put(grp); + + ppc64_domain->domain.pgsize_bitmap = SZ_4K; + ppc64_domain->domain.geometry.force_aperture = true; + ppc64_domain->domain.geometry.aperture_start = 0; + ppc64_domain->domain.geometry.aperture_end = 0x40000000; /*default window */ + ppc64_domain->domain.ops = spapr_tce_iommu_ops.default_domain_ops; + + spin_lock_init(&ppc64_domain->list_lock); + + return &ppc64_domain->domain; +} + +static size_t spapr_tce_iommu_unmap_pages(struct iommu_domain *domain, + unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) +{ + struct ppc64_domain *ppc64_domain = to_ppc64_domain(domain); + struct iommu_table *tbl = ppc64_domain->table; + unsigned long pgshift = __ffs(pgsize); + size_t size = pgcount << pgshift; + size_t mapped = 0; + unsigned int tcenum; + int mask; + + if (pgsize != SZ_4K) + return -EINVAL; + + size = PAGE_ALIGN(size); + + mask = IOMMU_PAGE_MASK(tbl); + tcenum = iova >> tbl->it_page_shift; + + tbl->it_ops->clear(tbl, tcenum, pgcount); + + mapped = pgsize * pgcount; + + return mapped; +} + +static phys_addr_t spapr_tce_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) +{ + struct ppc64_domain *ppc64_domain = to_ppc64_domain(domain); + struct iommu_table *tbl = ppc64_domain->table; + phys_addr_t paddr, rpn, tceval; + unsigned int tcenum; + + tcenum = iova >> tbl->it_page_shift; + tceval = tbl->it_ops->get(tbl, tcenum); + + /* Ignore the direction bits */ + rpn = tceval >> tbl->it_page_shift; + paddr = rpn << tbl->it_page_shift; + + return paddr; +} + +static int spapr_tce_iommu_map_pages(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) +{ + struct ppc64_domain *ppc64_domain = to_ppc64_domain(domain); + enum dma_data_direction direction = DMA_BIDIRECTIONAL; + struct iommu_table *tbl = ppc64_domain->table; + unsigned long pgshift = __ffs(pgsize); + size_t size = pgcount << pgshift; + unsigned int tcenum; + int ret; + + if (pgsize != SZ_4K) + return -EINVAL; + + if (iova < ppc64_domain->domain.geometry.aperture_start || + (iova + size - 1) > ppc64_domain->domain.geometry.aperture_end) + return -EINVAL; + + if (!IS_ALIGNED(iova | paddr, pgsize)) + return -EINVAL; + + if (!(prot & IOMMU_WRITE)) + direction = DMA_FROM_DEVICE; + + if (!(prot & IOMMU_READ)) + direction = DMA_TO_DEVICE; + + size = PAGE_ALIGN(size); + tcenum = iova >> tbl->it_page_shift; + + /* Put the TCEs in the HW table */ + ret = tbl->it_ops->set(tbl, tcenum, pgcount, + paddr, direction, 0, true); + if (!ret && mapped) + *mapped = pgsize; + + return 0; +} + +static int spapr_tce_iommu_attach_device(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) +{ + struct ppc64_domain *ppc64_domain = to_ppc64_domain(domain); + + /* REVISIT */ + if (!domain) + return 0; + + /* REVISIT: Check table group, list handling */ + ppc64_domain->device = dev; + + return 0; +} + + static const struct iommu_ops spapr_tce_iommu_ops = { .default_domain = &spapr_tce_platform_domain, .blocked_domain = &spapr_tce_blocked_domain, @@ -1267,6 +1436,14 @@ static const struct iommu_ops spapr_tce_iommu_ops = { .probe_device = spapr_tce_iommu_probe_device, .release_device = spapr_tce_iommu_release_device, .device_group = spapr_tce_iommu_device_group, + .domain_alloc_paging = spapr_tce_domain_alloc_paging, + .default_domain_ops = &(const struct iommu_domain_ops) { + .attach_dev = spapr_tce_iommu_attach_device, + .map_pages = spapr_tce_iommu_map_pages, + .unmap_pages = spapr_tce_iommu_unmap_pages, + .iova_to_phys = spapr_tce_iommu_iova_to_phys, + .free = spapr_tce_domain_free, + } }; static struct attribute *spapr_tce_iommu_attrs[] = { diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index e96324502db0..8800bf86d17a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -123,10 +123,10 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs) + unsigned long attrs, bool is_phys) { u64 proto_tce = iommu_direction_to_tce_perm(direction); - u64 rpn = __pa(uaddr) >> tbl->it_page_shift; + u64 rpn = !is_phys ? __pa(uaddr) >> tbl->it_page_shift : uaddr >> tbl->it_page_shift; long i; if (proto_tce & TCE_PCI_WRITE) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b0c1d9d16fb5..610146a63e3b 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1241,10 +1241,10 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs) + unsigned long attrs, bool is_phys) { int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, - attrs); + attrs, is_phys); if (!ret) pnv_pci_ioda2_tce_invalidate(tbl, index, npages); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 42075501663b..3579ecd55d00 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -300,7 +300,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs); + unsigned long attrs, bool is_phys); extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index eec333dd2e59..8c6f9f18e462 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -122,7 +122,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group, static int tce_build_pSeries(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs) + unsigned long attrs, bool false) { u64 proto_tce; __be64 *tcep; @@ -250,7 +250,7 @@ static DEFINE_PER_CPU(__be64 *, tce_page); static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs) + unsigned long attrs, bool is_phys) { u64 rc = 0; u64 proto_tce; @@ -287,7 +287,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, __this_cpu_write(tce_page, tcep); } - rpn = __pa(uaddr) >> tceshift; + rpn = !is_phys ? __pa(uaddr) >> tceshift : uaddr >> tceshift; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index ceae52fd7586..9929aa78a5da 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -4,7 +4,7 @@ menuconfig VFIO select IOMMU_API depends on IOMMUFD || !IOMMUFD select INTERVAL_TREE - select VFIO_GROUP if SPAPR_TCE_IOMMU || IOMMUFD=n + select VFIO_GROUP if IOMMUFD=n select VFIO_DEVICE_CDEV if !VFIO_GROUP select VFIO_CONTAINER if IOMMUFD=n help @@ -16,7 +16,7 @@ menuconfig VFIO if VFIO config VFIO_DEVICE_CDEV bool "Support for the VFIO cdev /dev/vfio/devices/vfioX" - depends on IOMMUFD && !SPAPR_TCE_IOMMU + depends on IOMMUFD default !VFIO_GROUP help The VFIO device cdev is another way for userspace to get device
