[PATCH v3 10/24] powerpc/powernv/ioda2: Rework IOMMU ownership control
At the moment the iommu_table struct has a set_bypass() which enables/ disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code which calls this callback when external IOMMU users such as VFIO are about to get over a PHB. The set_bypass() callback is not really an iommu_table function but IOMMU/PE function. This introduces a powerpc_iommu_ops struct and adds a set_ownership() callback to it which is called when an external user takes control over the IOMMU. This renames set_bypass() to set_ownership() as it is not necessarily just enabling bypassing, it can be something else/more so let's give it more generic name. The bool parameter is inverted. The callback is implemented for IODA2 only. This replaces iommu_take_ownership()/iommu_release_ownership() calls with the callback calls and it is up to the platform code to call iommu_take_ownership()/iommu_release_ownership() if needed. Next patches will remove these calls from IODA2 code. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h | 18 +-- arch/powerpc/kernel/iommu.c | 53 +++ arch/powerpc/platforms/powernv/pci-ioda.c | 30 - drivers/vfio/vfio_iommu_spapr_tce.c | 19 --- 4 files changed, 90 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 4fe..ba16aa0 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -92,7 +92,6 @@ struct iommu_table { unsigned long it_page_shift;/* table iommu page size */ struct powerpc_iommu *it_iommu; struct iommu_table_ops *it_ops; - void (*set_bypass)(struct iommu_table *tbl, bool enable); }; /* Pure 2^n version of get_order */ @@ -127,11 +126,24 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, #define POWERPC_IOMMU_MAX_TABLES 1 +struct powerpc_iommu; + +struct powerpc_iommu_ops { + /* +* Switches ownership from the kernel itself to an external +* user. While onwership is enabled, the kernel cannot use IOMMU +* for itself. 
+*/ + void (*set_ownership)(struct powerpc_iommu *iommu, + bool enable); +}; + struct powerpc_iommu { #ifdef CONFIG_IOMMU_API struct iommu_group *group; #endif struct iommu_table tables[POWERPC_IOMMU_MAX_TABLES]; + struct powerpc_iommu_ops *ops; }; #ifdef CONFIG_IOMMU_API @@ -219,8 +231,8 @@ extern unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry); extern void iommu_flush_tce(struct iommu_table *tbl); -extern int iommu_take_ownership(struct iommu_table *tbl); -extern void iommu_release_ownership(struct iommu_table *tbl); +extern int iommu_take_ownership(struct powerpc_iommu *iommu); +extern void iommu_release_ownership(struct powerpc_iommu *iommu); #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 407d0d6..9d06425 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1022,7 +1022,7 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, } EXPORT_SYMBOL_GPL(iommu_tce_build); -int iommu_take_ownership(struct iommu_table *tbl) +static int iommu_table_take_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; int ret = 0; @@ -1047,19 +1047,36 @@ int iommu_take_ownership(struct iommu_table *tbl) spin_unlock(>pools[i].lock); spin_unlock_irqrestore(>large_pool.lock, flags); - /* -* Disable iommu bypass, otherwise the user can DMA to all of -* our physical memory via the bypass window instead of just -* the pages that has been explicitly mapped into the iommu -*/ - if (!ret && tbl->set_bypass) - tbl->set_bypass(tbl, false); - - return ret; + return 0; +} + +static void iommu_table_release_ownership(struct iommu_table *tbl); + +int iommu_take_ownership(struct powerpc_iommu *iommu) +{ + int i, j, rc = 0; + + for (i = 0; i < POWERPC_IOMMU_MAX_TABLES; ++i) { + struct iommu_table *tbl = >tables[i]; + + if (!tbl->it_map) + continue; + + rc = iommu_table_take_ownership(tbl); + if (rc) { + for (j = 0; j < 
i; ++j) + iommu_table_release_ownership( + >tables[j]); + + return rc; + } + } + + return 0; } EXPORT_SYMBOL_GPL(iommu_take_ownership); -void iommu_release_ownership(struct iommu_table *tbl) +static void iommu_table_release_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it
[PATCH v3 14/24] vfio: powerpc/spapr: Register memory
The existing implementation accounts the whole DMA window in the locked_vm counter which is going to be even worse with multiple containers and huge DMA windows. This introduces 2 ioctls to register/unregister DMA memory which receive user space address and size of the memory region which needs to be pinned/unpinned and counted in locked_vm. If any memory region was registered, all subsequent DMA map requests should address already pinned memory. If no memory was registered, then the amount of memory required for a single default memory will be accounted when the container is enabled and every map/unmap will pin/unpin a page. Dynamic DMA window and in-kernel acceleration will require memory to be registered in order to work. The accounting is done per VFIO container. When the support of multiple groups per container is added, we will have accurate locked_vm accounting. Signed-off-by: Alexey Kardashevskiy --- drivers/vfio/vfio_iommu_spapr_tce.c | 333 include/uapi/linux/vfio.h | 29 2 files changed, 331 insertions(+), 31 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 8256275..d0987ae 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -86,8 +86,169 @@ struct tce_container { struct mutex lock; struct iommu_group *grp; bool enabled; + struct list_head mem_list; }; +struct tce_memory { + struct list_head next; + struct rcu_head rcu; + __u64 vaddr; + __u64 size; + __u64 pfns[]; +}; + +static void tce_unpin_pages(struct tce_container *container, + struct tce_memory *mem, __u64 vaddr, __u64 size) +{ + __u64 off; + struct page *page = NULL; + + + for (off = 0; off < size; off += PAGE_SIZE) { + if (!mem->pfns[off >> PAGE_SHIFT]) + continue; + + page = pfn_to_page(mem->pfns[off >> PAGE_SHIFT]); + if (!page) + continue; + + put_page(page); + mem->pfns[off >> PAGE_SHIFT] = 0; + } +} + +static void release_tce_memory(struct rcu_head *head) +{ + struct tce_memory *mem = 
container_of(head, struct tce_memory, rcu); + + kfree(mem); +} + +static void tce_do_unregister_pages(struct tce_container *container, + struct tce_memory *mem) +{ + tce_unpin_pages(container, mem, mem->vaddr, mem->size); + decrement_locked_vm(mem->size); + list_del_rcu(>next); + call_rcu_sched(>rcu, release_tce_memory); +} + +static long tce_unregister_pages(struct tce_container *container, + __u64 vaddr, __u64 size) +{ + struct tce_memory *mem, *memtmp; + + if (container->enabled) + return -EBUSY; + + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) + return -EINVAL; + + list_for_each_entry_safe(mem, memtmp, >mem_list, next) { + if ((mem->vaddr == vaddr) && (mem->size == size)) { + tce_do_unregister_pages(container, mem); + return 0; + } + } + + return -ENOENT; +} + +static long tce_pin_pages(struct tce_container *container, + struct tce_memory *mem, __u64 vaddr, __u64 size) +{ + __u64 off; + struct page *page = NULL; + + for (off = 0; off < size; off += PAGE_SIZE) { + if (1 != get_user_pages_fast(vaddr + off, + 1/* pages */, 1/* iswrite */, )) { + tce_unpin_pages(container, mem, vaddr, off); + return -EFAULT; + } + + mem->pfns[off >> PAGE_SHIFT] = page_to_pfn(page); + } + + return 0; +} + +static long tce_register_pages(struct tce_container *container, + __u64 vaddr, __u64 size) +{ + long ret; + struct tce_memory *mem; + + if (container->enabled) + return -EBUSY; + + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) || + ((vaddr + size) < vaddr)) + return -EINVAL; + + /* Any overlap with registered chunks? */ + rcu_read_lock(); + list_for_each_entry_rcu(mem, >mem_list, next) { + if ((mem->vaddr < (vaddr + size)) && + (vaddr < (mem->vaddr + mem->size))) { + ret = -EBUSY; + goto unlock_exit; + } + } + + ret = try_increment_locked_vm(size >> PAGE_SHIFT); + if (ret) + goto unlock_exit; + + mem = kzalloc(sizeof(*mem) + (size >> (PAGE_SHIFT - 3)), GFP_KERNEL); + if (!mem) + goto unlock_exit; + + if (tce_pin_pages(container
[PATCH v3 19/24] powerpc/powernv: Implement multilevel TCE tables
This adds multi-level TCE tables support to pnv_pci_ioda2_create_table() and pnv_pci_ioda2_free_table() callbacks. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h | 4 + arch/powerpc/platforms/powernv/pci-ioda.c | 125 +++--- arch/powerpc/platforms/powernv/pci.c | 19 + 3 files changed, 122 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index cc26eca..283f70f 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -85,6 +85,8 @@ struct iommu_pool { struct iommu_table { unsigned long it_busno; /* Bus number this table belongs to */ unsigned long it_size; /* Size of iommu table in entries */ + unsigned long it_indirect_levels; + unsigned long it_level_size; unsigned long it_offset;/* Offset into global table */ unsigned long it_base; /* mapped address of tce table */ unsigned long it_index; /* which iommu table this is */ @@ -133,6 +135,8 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, #define POWERPC_IOMMU_MAX_TABLES 1 +#define POWERPC_IOMMU_DEFAULT_LEVELS 1 + struct powerpc_iommu; struct powerpc_iommu_ops { diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 1f725d4..f542819 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1295,16 +1295,79 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } +static void pnv_free_tce_table(unsigned long addr, unsigned size, + unsigned level) +{ + addr &= ~(TCE_PCI_READ | TCE_PCI_WRITE); + + if (level) { + long i; + u64 *tmp = (u64 *) addr; + + for (i = 0; i < size; ++i) { + unsigned long hpa = be64_to_cpu(tmp[i]); + + if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE))) + continue; + + pnv_free_tce_table((unsigned long) __va(hpa), + size, level - 1); + } + } + + free_pages(addr, get_order(size << 3)); +} + +static __be64 
*pnv_alloc_tce_table(int nid, + unsigned shift, unsigned levels, unsigned long *left) +{ + struct page *tce_mem = NULL; + __be64 *addr, *tmp; + unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT; + unsigned long chunk = 1UL << shift, i; + + tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); + if (!tce_mem) { + pr_err("Failed to allocate a TCE memory\n"); + return NULL; + } + + if (!*left) + return NULL; + + addr = page_address(tce_mem); + memset(addr, 0, chunk); + + --levels; + if (!levels) { + /* This is last level, actual TCEs */ + *left -= min(*left, chunk); + return addr; + } + + for (i = 0; i < (chunk >> 3); ++i) { + /* We allocated required TCEs, mark the rest "page fault" */ + if (!*left) { + addr[i] = cpu_to_be64(0); + continue; + } + + tmp = pnv_alloc_tce_table(nid, shift, levels, left); + addr[i] = cpu_to_be64(__pa(tmp) | + TCE_PCI_READ | TCE_PCI_WRITE); + } + + return addr; +} + static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe, - __u32 page_shift, __u32 window_shift, + __u32 page_shift, __u32 window_shift, __u32 levels, struct iommu_table *tbl) { int nid = pe->phb->hose->node; - struct page *tce_mem = NULL; void *addr; - unsigned long tce_table_size; - int64_t rc; - unsigned order; + unsigned long tce_table_size, left; + unsigned shift; if ((page_shift != 12) && (page_shift != 16) && (page_shift != 24)) return -EINVAL; @@ -1312,20 +1375,27 @@ static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe, if ((1ULL << window_shift) > memory_hotplug_max()) return -EINVAL; + if (!levels || (levels > 5)) + return -EINVAL; + tce_table_size = (1ULL << (window_shift - page_shift)) * 8; tce_table_size = max(0x1000UL, tce_table_size); /* Allocate TCE table */ - order = get_order(tce_table_size); +#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u)) + shift = ROUND_UP(window_shift - page_shift, levels) / levels; + shift += 3; + shift = max_t(unsigned, shift,
[PATCH v3 20/24] powerpc/powernv: Change prototypes to receive iommu
This changes few functions to receive a powerpc_iommu pointer rather than PE as they are going to be a part of upcoming powerpc_iommu_ops callback set. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda.c | 13 - 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index f542819..29bd7a4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1360,10 +1360,12 @@ static __be64 *pnv_alloc_tce_table(int nid, return addr; } -static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe, +static long pnv_pci_ioda2_create_table(struct powerpc_iommu *iommu, __u32 page_shift, __u32 window_shift, __u32 levels, struct iommu_table *tbl) { + struct pnv_ioda_pe *pe = container_of(iommu, struct pnv_ioda_pe, + iommu); int nid = pe->phb->hose->node; void *addr; unsigned long tce_table_size, left; @@ -1419,9 +1421,11 @@ static void pnv_pci_ioda2_free_table(struct iommu_table *tbl) iommu_reset_table(tbl, "ioda2"); } -static long pnv_pci_ioda2_set_window(struct pnv_ioda_pe *pe, +static long pnv_pci_ioda2_set_window(struct powerpc_iommu *iommu, struct iommu_table *tbl) { + struct pnv_ioda_pe *pe = container_of(iommu, struct pnv_ioda_pe, + iommu); struct pnv_phb *phb = pe->phb; const __be64 *swinvp; int64_t rc; @@ -1554,12 +1558,11 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* The PE will reserve all possible 32-bits space */ pe->tce32_seg = 0; - end = (1 << ilog2(phb->ioda.m32_pci_base)); pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", end); - rc = pnv_pci_ioda2_create_table(pe, IOMMU_PAGE_SHIFT_4K, + rc = pnv_pci_ioda2_create_table(>iommu, IOMMU_PAGE_SHIFT_4K, ilog2(phb->ioda.m32_pci_base), POWERPC_IOMMU_DEFAULT_LEVELS, tbl); if (rc) { @@ -1571,7 +1574,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe->iommu.tables[0].it_iommu = >iommu; pe->iommu.ops = _pci_ioda2_ops; 
- rc = pnv_pci_ioda2_set_window(pe, tbl); + rc = pnv_pci_ioda2_set_window(>iommu, tbl); if (rc) { pe_err(pe, "Failed to configure 32-bit TCE table," " err %ld\n", rc); -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 22/24] powerpc/iommu: Get rid of ownership helpers
iommu_take_ownership/iommu_release_ownership used to be used to mark bits in iommu_table::it_map. Since the IOMMU tables are recreated for VFIO, it_map is always NULL. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h | 2 - arch/powerpc/kernel/iommu.c | 96 2 files changed, 98 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 8393822..33009f9 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -272,8 +272,6 @@ extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, enum dma_data_direction direction); extern void iommu_flush_tce(struct iommu_table *tbl); -extern int iommu_take_ownership(struct powerpc_iommu *iommu); -extern void iommu_release_ownership(struct powerpc_iommu *iommu); #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 5f87076..6987115 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1007,102 +1007,6 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, } EXPORT_SYMBOL_GPL(iommu_tce_xchg); -static int iommu_table_take_ownership(struct iommu_table *tbl) -{ - unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; - int ret = 0; - - /* -* VFIO does not control TCE entries allocation and the guest -* can write new TCEs on top of existing ones so iommu_tce_build() -* must be able to release old pages. This functionality -* requires exchange() callback defined so if it is not -* implemented, we disallow taking ownership over the table. 
-*/ - if (!tbl->it_ops->exchange) - return -EINVAL; - - spin_lock_irqsave(>large_pool.lock, flags); - for (i = 0; i < tbl->nr_pools; i++) - spin_lock(>pools[i].lock); - - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); - - if (!bitmap_empty(tbl->it_map, tbl->it_size)) { - pr_err("iommu_tce: it_map is not empty"); - ret = -EBUSY; - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); - } else { - memset(tbl->it_map, 0xff, sz); - } - - for (i = 0; i < tbl->nr_pools; i++) - spin_unlock(>pools[i].lock); - spin_unlock_irqrestore(>large_pool.lock, flags); - - return 0; -} - -static void iommu_table_release_ownership(struct iommu_table *tbl); - -int iommu_take_ownership(struct powerpc_iommu *iommu) -{ - int i, j, rc = 0; - - for (i = 0; i < POWERPC_IOMMU_MAX_TABLES; ++i) { - struct iommu_table *tbl = >tables[i]; - - if (!tbl->it_map) - continue; - - rc = iommu_table_take_ownership(tbl); - if (rc) { - for (j = 0; j < i; ++j) - iommu_table_release_ownership( - >tables[j]); - - return rc; - } - } - - return 0; -} -EXPORT_SYMBOL_GPL(iommu_take_ownership); - -static void iommu_table_release_ownership(struct iommu_table *tbl) -{ - unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; - - spin_lock_irqsave(>large_pool.lock, flags); - for (i = 0; i < tbl->nr_pools; i++) - spin_lock(>pools[i].lock); - - memset(tbl->it_map, 0, sz); - - /* Restore bit#0 set by iommu_init_table() */ - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); - - for (i = 0; i < tbl->nr_pools; i++) - spin_unlock(>pools[i].lock); - spin_unlock_irqrestore(>large_pool.lock, flags); -} - -extern void iommu_release_ownership(struct powerpc_iommu *iommu) -{ - int i; - - for (i = 0; i < POWERPC_IOMMU_MAX_TABLES; ++i) { - struct iommu_table *tbl = >tables[i]; - - if (tbl->it_map) - iommu_table_release_ownership(tbl); - } -} -EXPORT_SYMBOL_GPL(iommu_release_ownership); - int iommu_add_device(struct device *dev) { struct iommu_table *tbl; -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe 
linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 13/24] powerpc/pseries/lpar: Enable VFIO
The previous patch introduced iommu_table_ops::exchange() callback which effectively disabled VFIO on pseries. This implements exchange() for pseries/lpar so VFIO can work in nested guests. Since exchange() callback returns an old TCE, it has to call H_GET_TCE for every TCE being put to the table so VFIO performance in guests running under PR KVM is expected to be slower than in guests running under HV KVM or bare metal hosts. Signed-off-by: Alexey Kardashevskiy --- Changes: v5: * added global lock for xchg operations * added missing be64_to_cpu(oldtce) --- arch/powerpc/platforms/pseries/iommu.c | 44 -- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index f537e6e..a903a27 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -137,14 +137,25 @@ static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, + unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { u64 rc = 0; u64 proto_tce, tce; u64 rpn; - int ret = 0; + int ret = 0, i = 0; long tcenum_start = tcenum, npages_start = npages; + static spinlock_t get_tces_lock; + static bool get_tces_lock_initialized; + + if (old_tces) { + if (!get_tces_lock_initialized) { + spin_lock_init(_tces_lock); + get_tces_lock_initialized = true; + } + spin_lock(_tces_lock); + } rpn = __pa(uaddr) >> TCE_SHIFT; proto_tce = TCE_PCI_READ; @@ -153,6 +164,14 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, while (npages--) { tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + if (old_tces) { + unsigned long oldtce = 0; + + plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, + ); + old_tces[i] = be64_to_cpu(oldtce); + i++; + } rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce); if (unlikely(rc == 
H_NOT_ENOUGH_RESOURCES)) { @@ -173,13 +192,18 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, tcenum++; rpn++; } + + if (old_tces) + spin_unlock(_tces_lock); + return ret; } static DEFINE_PER_CPU(__be64 *, tce_page); -static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +static int tce_xchg_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, +unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { @@ -194,6 +218,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + old_tces, direction, attrs); } @@ -210,6 +235,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if (!tcep) { local_irq_restore(flags); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + old_tces, direction, attrs); } __this_cpu_write(tce_page, tcep); @@ -231,6 +257,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, for (l = 0; l < limit; l++) { tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); rpn++; + if (old_tces) + plpar_tce_get((u64)tbl->it_index, + (u64)(tcenum + l) << 12, + _tces[tcenum + l]); } rc = plpar_tce_put_indirect((u64)tbl->it_index, @@ -261,6 +291,15 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } +static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +long npages, uns
[PATCH v3 21/24] powerpc/powernv/ioda: Define and implement DMA table/window management callbacks
This extends powerpc_iommu_ops by a set of callbacks to support dynamic DMA windows management. query() returns IOMMU capabilities such as default DMA window address and supported number of DMA windows and TCE table levels. create_table() creates a TCE table with specific parameters. For now it receives powerpc_iommu to know nodeid in order to allocate TCE table memory closer to the PHB. The exact format of allocated multi-level table might be also specific to the PHB model (not the case now though). set_window() sets the window at specified TVT index on PHB. unset_window() unsets the window from specified TVT. free_table() frees the memory occupied by a table. The purpose of this separation is that we need to be able to create one table and assign it to a set of PHB. This way we can support multiple IOMMU groups in one VFIO container and make use of VFIO on SPAPR closer to the way it works on x86. This uses new helpers to remove the default TCE table if the ownership is being taken and create it otherwise. So once an external user (such as VFIO) obtained the ownership over a group, it does not have any DMA windows, neither default 32bit nor bypass window. The external user is expected to unprogram DMA windows on PHBs before returning ownership back to the kernel. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h | 31 ++ arch/powerpc/platforms/powernv/pci-ioda.c | 98 ++- 2 files changed, 113 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 283f70f..8393822 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -147,12 +147,43 @@ struct powerpc_iommu_ops { */ void (*set_ownership)(struct powerpc_iommu *iommu, bool enable); + + long (*create_table)(struct powerpc_iommu *iommu, + int num, + __u32 page_shift, + __u32 window_shift, + __u32 levels, + struct iommu_table *tbl); + long (*set_window)(struct powerpc_iommu *iommu, + int num, + struct iommu_table *tblnew); + long (*unset_window)(struct powerpc_iommu *iommu, + int num); + void (*free_table)(struct iommu_table *tbl); }; +/* Page size flags for ibm,query-pe-dma-window */ +#define DDW_PGSIZE_4K 0x01 +#define DDW_PGSIZE_64K 0x02 +#define DDW_PGSIZE_16M 0x04 +#define DDW_PGSIZE_32M 0x08 +#define DDW_PGSIZE_64M 0x10 +#define DDW_PGSIZE_128M 0x20 +#define DDW_PGSIZE_256M 0x40 +#define DDW_PGSIZE_16G 0x80 +#define DDW_PGSIZE_MASK 0xFF + struct powerpc_iommu { #ifdef CONFIG_IOMMU_API struct iommu_group *group; #endif + /* Some key properties of IOMMU */ + __u32 tce32_start; + __u32 tce32_size; + __u32 windows_supported; + __u32 levels; + __u32 flags; + struct iommu_table tables[POWERPC_IOMMU_MAX_TABLES]; struct powerpc_iommu_ops *ops; }; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 29bd7a4..cf63ebb 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1360,7 +1360,7 @@ static __be64 *pnv_alloc_tce_table(int nid, return addr; } -static long pnv_pci_ioda2_create_table(struct powerpc_iommu *iommu, +static long pnv_pci_ioda2_create_table(struct powerpc_iommu *iommu, int num, __u32 page_shift, __u32 window_shift, __u32 levels, struct iommu_table *tbl) 
{ @@ -1388,8 +1388,8 @@ static long pnv_pci_ioda2_create_table(struct powerpc_iommu *iommu, shift = ROUND_UP(window_shift - page_shift, levels) / levels; shift += 3; shift = max_t(unsigned, shift, IOMMU_PAGE_SHIFT_4K); - pr_info("Creating TCE table %08llx, %d levels, TCE table size = %lx\n", - 1ULL << window_shift, levels, 1UL << shift); + pr_info("Creating TCE table #%d %08llx, %d levels, TCE table size = %lx\n", + num, 1ULL << window_shift, levels, 1UL << shift); tbl->it_level_size = 1ULL << (shift - 3); left = tce_table_size; @@ -1400,11 +1400,10 @@ static long pnv_pci_ioda2_create_table(struct powerpc_iommu *iommu, tbl->it_indirect_levels = levels - 1; /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, - page_shift); + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, + num ? pe->tce_bypass_base : 0, page_shift); tbl->it_ops = _ioda2_iommu_ops; - iommu_init_table(tbl, nid); return 0; } @@ -1421,8 +1420
[PATCH v3 24/24] vfio: powerpc/spapr: Support Dynamic DMA windows
This adds create/remove window ioctls to create and remove DMA windows. This changes VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional information such as a number of supported windows and maximum number levels of TCE tables. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h| 2 +- drivers/vfio/vfio_iommu_spapr_tce.c | 137 +++- include/uapi/linux/vfio.h | 24 ++- 3 files changed, 160 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 33009f9..7ca1c8c 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -133,7 +133,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); -#define POWERPC_IOMMU_MAX_TABLES 1 +#define POWERPC_IOMMU_MAX_TABLES 2 #define POWERPC_IOMMU_DEFAULT_LEVELS 1 diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 8bcafb7..d3a1cc9 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -300,6 +300,20 @@ static struct iommu_table *spapr_tce_find_table( return ret; } +static int spapr_tce_find_free_table(struct tce_container *container) +{ + int i; + + for (i = 0; i < POWERPC_IOMMU_MAX_TABLES; ++i) { + struct iommu_table *tbl = >tables[i]; + + if (!tbl->it_size) + return i; + } + + return -1; +} + static unsigned long tce_default_winsize(struct tce_container *container) { struct tce_iommu_group *tcegrp; @@ -594,7 +608,7 @@ static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { struct tce_container *container = iommu_data; - unsigned long minsz; + unsigned long minsz, ddwsz; long ret; switch (cmd) { @@ -636,6 +650,15 @@ static long tce_iommu_ioctl(void *iommu_data, info.dma32_window_start = iommu->tce32_start; info.dma32_window_size = iommu->tce32_size; + info.windows_supported = iommu->windows_supported; + 
info.levels = iommu->levels; + info.flags = iommu->flags; + + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, + levels); + + if (info.argsz == ddwsz) + minsz = ddwsz; if (copy_to_user((void __user *)arg, , minsz)) return -EFAULT; @@ -800,6 +823,118 @@ static long tce_iommu_ioctl(void *iommu_data, return ret; } + case VFIO_IOMMU_SPAPR_TCE_CREATE: { + struct vfio_iommu_spapr_tce_create create; + struct powerpc_iommu *iommu; + struct tce_iommu_group *tcegrp; + int num; + + if (!tce_preregistered(container)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, + start_addr); + + if (copy_from_user(, (void __user *)arg, minsz)) + return -EFAULT; + + if (create.argsz < minsz) + return -EINVAL; + + if (create.flags) + return -EINVAL; + + num = spapr_tce_find_free_table(container); + if (num < 0) + return -ENOSYS; + + tcegrp = list_first_entry(>group_list, + struct tce_iommu_group, next); + iommu = iommu_group_get_iommudata(tcegrp->grp); + + ret = iommu->ops->create_table(iommu, num, + create.page_shift, create.window_shift, + create.levels, + >tables[num]); + if (ret) + return ret; + + list_for_each_entry(tcegrp, >group_list, next) { + struct powerpc_iommu *iommutmp = + iommu_group_get_iommudata(tcegrp->grp); + + if (WARN_ON_ONCE(iommutmp->ops != iommu->ops)) + return -EFAULT; + + ret = iommu->ops->set_window(iommutmp, num, + >tables[num]); + if (ret) + return ret; + } + + create.start_addr = + container->tables[num].it_offset << + container->tables[num].it_page_shift; + + if (copy_to_user((void __user *)arg, , mins
[PATCH v3 23/24] vfio/spapr: Enable multiple groups in a container
Signed-off-by: Alexey Kardashevskiy --- drivers/vfio/vfio_iommu_spapr_tce.c | 243 +++- 1 file changed, 155 insertions(+), 88 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index d0987ae..8bcafb7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -84,9 +84,15 @@ static void decrement_locked_vm(long npages) */ struct tce_container { struct mutex lock; - struct iommu_group *grp; bool enabled; struct list_head mem_list; + struct iommu_table tables[POWERPC_IOMMU_MAX_TABLES]; + struct list_head group_list; +}; + +struct tce_iommu_group { + struct list_head next; + struct iommu_group *grp; }; struct tce_memory { @@ -265,17 +271,21 @@ static bool tce_check_page_size(struct page *page, unsigned page_shift) return false; } +static inline bool tce_groups_attached(struct tce_container *container) +{ + return !list_empty(>group_list); +} + static struct iommu_table *spapr_tce_find_table( struct tce_container *container, phys_addr_t ioba) { long i; struct iommu_table *ret = NULL; - struct powerpc_iommu *iommu = iommu_group_get_iommudata(container->grp); mutex_lock(>lock); for (i = 0; i < POWERPC_IOMMU_MAX_TABLES; ++i) { - struct iommu_table *tbl = >tables[i]; + struct iommu_table *tbl = >tables[i]; unsigned long entry = ioba >> tbl->it_page_shift; unsigned long start = tbl->it_offset; unsigned long end = start + tbl->it_size; @@ -290,13 +300,31 @@ static struct iommu_table *spapr_tce_find_table( return ret; } +static unsigned long tce_default_winsize(struct tce_container *container) +{ + struct tce_iommu_group *tcegrp; + struct powerpc_iommu *iommu; + + if (!tce_groups_attached(container)) + return 0; + + tcegrp = list_first_entry(>group_list, + struct tce_iommu_group, next); + if (!tcegrp) + return 0; + + iommu = iommu_group_get_iommudata(tcegrp->grp); + if (!iommu) + return 0; + + return iommu->tce32_size; +} + static int tce_iommu_enable(struct tce_container *container) { int ret = 
0; - struct powerpc_iommu *iommu; - struct iommu_table *tbl; - if (!container->grp) + if (!tce_groups_attached(container)) return -ENXIO; if (container->enabled) @@ -328,12 +356,8 @@ static int tce_iommu_enable(struct tce_container *container) * KVM agnostic. */ if (!tce_preregistered(container)) { - iommu = iommu_group_get_iommudata(container->grp); - if (!iommu) - return -EFAULT; - - tbl = >tables[0]; - ret = try_increment_locked_vm(IOMMU_TABLE_PAGES(tbl)); + ret = try_increment_locked_vm( + tce_default_winsize(container) >> PAGE_SHIFT); if (ret) return ret; } @@ -343,27 +367,23 @@ static int tce_iommu_enable(struct tce_container *container) return ret; } +static int tce_iommu_clear(struct tce_container *container, + struct iommu_table *tbl, + unsigned long entry, unsigned long pages); + static void tce_iommu_disable(struct tce_container *container) { - struct powerpc_iommu *iommu; - struct iommu_table *tbl; - if (!container->enabled) return; container->enabled = false; - if (!container->grp || !current->mm) + if (!current->mm) return; - if (!tce_preregistered(container)) { - iommu = iommu_group_get_iommudata(container->grp); - if (!iommu) - return; - - tbl = >tables[0]; - decrement_locked_vm(IOMMU_TABLE_PAGES(tbl)); - } + if (!tce_preregistered(container)) + decrement_locked_vm( + tce_default_winsize(container) >> PAGE_SHIFT); } static void *tce_iommu_open(unsigned long arg) @@ -381,20 +401,44 @@ static void *tce_iommu_open(unsigned long arg) mutex_init(>lock); INIT_LIST_HEAD_RCU(>mem_list); + INIT_LIST_HEAD_RCU(>group_list); return container; } static void tce_iommu_release(void *iommu_data) { + int i; + struct powerpc_iommu *iommu; + struct tce_iommu_group *tcegrp; struct tce_container *container = iommu_data; struct tce_memory *mem, *memtmp; + struct powerpc_iommu_ops *iommuops = NULL; - WARN_ON(container->grp); tce_iommu_disable(container); - if
[PATCH v3 15/24] powerpc/powernv/ioda2: Rework iommu_table creation
This moves iommu_table creation to the beginning. This is a mechanical patch. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda.c | 31 +-- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 6d279d5..ebfea0a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1393,27 +1393,31 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, addr = page_address(tce_mem); memset(addr, 0, tce_table_size); + /* Setup iommu */ + pe->iommu.tables[0].it_iommu = >iommu; + + /* Setup linux iommu table */ + tbl = >iommu.tables[0]; + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, + IOMMU_PAGE_SHIFT_4K); + + tbl->it_ops = _ioda2_iommu_ops; + iommu_init_table(tbl, phb->hose->node); + pe->iommu.ops = _pci_ioda2_ops; + /* * Map TCE table through TVT. The TVE index is the PE number * shifted by 1 bit for 32-bits DMA space. 
*/ rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - pe->pe_number << 1, 1, __pa(addr), - tce_table_size, 0x1000); + pe->pe_number << 1, 1, __pa(tbl->it_base), + tbl->it_size << 3, 1ULL << tbl->it_page_shift); if (rc) { pe_err(pe, "Failed to configure 32-bit TCE table," " err %ld\n", rc); goto fail; } - /* Setup iommu */ - pe->iommu.tables[0].it_iommu = >iommu; - - /* Setup linux iommu table */ - tbl = >iommu.tables[0]; - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, - IOMMU_PAGE_SHIFT_4K); - /* OPAL variant of PHB3 invalidated TCEs */ swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); if (swinvp) { @@ -1427,14 +1431,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, 8); tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); } - tbl->it_ops = _ioda2_iommu_ops; - iommu_init_table(tbl, phb->hose->node); - pe->iommu.ops = _pci_ioda2_ops; + iommu_register_group(>iommu, phb->hose->global_number, pe->pe_number); if (pe->pdev) - set_iommu_table_base_and_group(>pdev->dev, tbl); + set_iommu_table_base_and_group(>pdev->dev, + >iommu.tables[0]); else pnv_ioda_setup_bus_dma(pe, pe->pbus, true); -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 01/24] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver
This moves page pinning (get_user_pages_fast()/put_page()) code out of the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs to as the platform code does not deal with page pinning. This makes iommu_take_ownership()/iommu_release_ownership() deal with the IOMMU table bitmap only. This removes page unpinning from iommu_take_ownership() as the actual TCE table might contain garbage and doing put_page() on it is undefined behaviour. Besides the last part, the rest of the patch is mechanical. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h| 6 --- arch/powerpc/kernel/iommu.c | 68 --- drivers/vfio/vfio_iommu_spapr_tce.c | 91 +++-- 3 files changed, 78 insertions(+), 87 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 9cfa370..45b07f6 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -191,16 +191,10 @@ extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long hwaddr, enum dma_data_direction direction); extern unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry); -extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages); -extern int iommu_put_tce_user_mode(struct iommu_table *tbl, - unsigned long entry, unsigned long tce); extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); extern void iommu_release_ownership(struct iommu_table *tbl); -extern enum dma_data_direction iommu_tce_direction(unsigned long tce); - #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 5d3968c..456acb1 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -903,19 +903,6 @@ void iommu_register_group(struct iommu_table *tbl, kfree(name); } -enum dma_data_direction iommu_tce_direction(unsigned long tce) -{ - if 
((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE)) - return DMA_BIDIRECTIONAL; - else if (tce & TCE_PCI_READ) - return DMA_TO_DEVICE; - else if (tce & TCE_PCI_WRITE) - return DMA_FROM_DEVICE; - else - return DMA_NONE; -} -EXPORT_SYMBOL_GPL(iommu_tce_direction); - void iommu_flush_tce(struct iommu_table *tbl) { /* Flush/invalidate TLB caches if necessary */ @@ -991,30 +978,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) } EXPORT_SYMBOL_GPL(iommu_clear_tce); -int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages) -{ - unsigned long oldtce; - struct page *page; - - for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) - continue; - - page = pfn_to_page(oldtce >> PAGE_SHIFT); - WARN_ON(!page); - if (page) { - if (oldtce & TCE_PCI_WRITE) - SetPageDirty(page); - put_page(page); - } - } - - return 0; -} -EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); - /* * hwaddr is a kernel virtual address here (0xc... bazillion), * tce_build converts it to a physical address. 
@@ -1044,35 +1007,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, } EXPORT_SYMBOL_GPL(iommu_tce_build); -int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, - unsigned long tce) -{ - int ret; - struct page *page = NULL; - unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; - enum dma_data_direction direction = iommu_tce_direction(tce); - - ret = get_user_pages_fast(tce & PAGE_MASK, 1, - direction != DMA_TO_DEVICE, ); - if (unlikely(ret != 1)) { - /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n", - tce, entry << tbl->it_page_shift, ret); */ - return -EFAULT; - } - hwaddr = (unsigned long) page_address(page) + offset; - - ret = iommu_tce_build(tbl, entry, hwaddr, direction); - if (ret) - put_page(page); - - if (ret < 0) - pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n", - __func__, entry << tbl->it_page_shift, tce, ret); - - return ret; -} -EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); - int iommu_take_ownership(struct iommu_table *tbl) { unsigned long sz = (tbl->it_size + 7) >> 3; @@
[PATCH v3 06/24] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table
This adds an iommu_table_ops struct and puts a pointer to it into the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush callbacks from ppc_md to the new struct where they really belong to. This adds the requirement for @it_ops to be initialized before calling iommu_init_table() to make sure that we do not leave any IOMMU table with iommu_table_ops uninitialized. This is not a parameter of iommu_init_table() though as there will be cases when iommu_init_table() will not be called on TCE tables used by VFIO. This does s/tce_build/set/, s/tce_free/clear/ and removes "tce_" redundant prefixes. This removes tce_xxx_rm handlers from ppc_md but does not add them to iommu_table_ops as this will be done later if we decide to support TCE hypercalls in real mode. For pSeries, this always uses tce_buildmulti_pSeriesLP/ tce_freemulti_pSeriesLP. This changes multi callback to fall back to tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not present. The reason for this is we still have to support "multitce=off" boot parameter in disable_multitce() and we do not want to walk through all IOMMU tables in the system and replace "multi" callbacks with single ones. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h| 17 +++ arch/powerpc/include/asm/machdep.h | 25 arch/powerpc/kernel/iommu.c | 46 +++-- arch/powerpc/kernel/vio.c | 5 arch/powerpc/platforms/cell/iommu.c | 8 +++-- arch/powerpc/platforms/pasemi/iommu.c | 7 +++-- arch/powerpc/platforms/powernv/pci-ioda.c | 2 ++ arch/powerpc/platforms/powernv/pci-p5ioc2.c | 1 + arch/powerpc/platforms/powernv/pci.c| 23 --- arch/powerpc/platforms/powernv/pci.h| 1 + arch/powerpc/platforms/pseries/iommu.c | 34 +++-- arch/powerpc/sysdev/dart_iommu.c| 12 12 files changed, 93 insertions(+), 88 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 45b07f6..eb5822d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -43,6 +43,22 @@ extern int iommu_is_off; extern int iommu_force_on; +struct iommu_table_ops { + int (*set)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs); + void (*clear)(struct iommu_table *tbl, + long index, long npages); + unsigned long (*get)(struct iommu_table *tbl, long index); + void (*flush)(struct iommu_table *tbl); +}; + +/* These are used by VIO */ +extern struct iommu_table_ops iommu_table_lpar_multi_ops; +extern struct iommu_table_ops iommu_table_pseries_ops; + /* * IOMAP_MAX_ORDER defines the largest contiguous block * of dma space we can get. 
IOMAP_MAX_ORDER = 13 @@ -77,6 +93,7 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif + struct iommu_table_ops *it_ops; void (*set_bypass)(struct iommu_table *tbl, bool enable); }; diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index c8175a3..2abe744 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -65,31 +65,6 @@ struct machdep_calls { * destroyed as well */ void(*hpte_clear_all)(void); - int (*tce_build)(struct iommu_table *tbl, -long index, -long npages, -unsigned long uaddr, -enum dma_data_direction direction, -struct dma_attrs *attrs); - void(*tce_free)(struct iommu_table *tbl, - long index, - long npages); - unsigned long (*tce_get)(struct iommu_table *tbl, - long index); - void(*tce_flush)(struct iommu_table *tbl); - - /* _rm versions are for real mode use only */ - int (*tce_build_rm)(struct iommu_table *tbl, -long index, -long npages, -unsigned long uaddr, -enum dma_data_direction direction, -struct dma_attrs *attrs); - void(*tce_free_rm)(struct iommu_table *tbl, - long index, - long npages); - void(*tce_flush_rm)(struct iommu_table *tbl); - void __iomem * (*ioremap)(phys_a
[PATCH v3 17/24] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window
This is a part of moving DMA window programming to an iommu_ops callback. This is a mechanical patch. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda.c | 84 --- 1 file changed, 56 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 95d9119..1f725d4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1351,6 +1351,57 @@ static void pnv_pci_ioda2_free_table(struct iommu_table *tbl) memset(tbl, 0, sizeof(struct iommu_table)); } +static long pnv_pci_ioda2_set_window(struct pnv_ioda_pe *pe, + struct iommu_table *tbl) +{ + struct pnv_phb *phb = pe->phb; + const __be64 *swinvp; + int64_t rc; + const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; + const __u64 win_size = tbl->it_size << tbl->it_page_shift; + + pe_info(pe, "Setting up window at %llx..%llx pagesize=0x%x tablesize=0x%lx\n", + start_addr, start_addr + win_size - 1, + 1UL << tbl->it_page_shift, tbl->it_size << 3); + + pe->iommu.tables[0] = *tbl; + tbl = >iommu.tables[0]; + tbl->it_iommu = >iommu; + + /* +* Map TCE table through TVT. The TVE index is the PE number +* shifted by 1 bit for 32-bits DMA space. +*/ + rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + pe->pe_number << 1, 1, __pa(tbl->it_base), + tbl->it_size << 3, 1ULL << tbl->it_page_shift); + if (rc) { + pe_err(pe, "Failed to configure TCE table, err %ld\n", rc); + goto fail; + } + + /* OPAL variant of PHB3 invalidated TCEs */ + swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); + if (swinvp) { + /* We need a couple more fields -- an address and a data +* to or. Since the bus is only printed out on table free +* errors, and on the first pass the data will be a relative +* bus number, print that out instead. 
+*/ + pe->tce_inval_reg_phys = be64_to_cpup(swinvp); + tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, + 8); + tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + } + + return 0; +fail: + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + + return rc; +} + static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { uint16_t window_id = (pe->pe_number << 1 ) + 1; @@ -1421,7 +1472,6 @@ static struct powerpc_iommu_ops pnv_pci_ioda2_ops = { static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - const __be64 *swinvp; unsigned int end; struct iommu_table *tbl = >iommu.tables[0]; int64_t rc; @@ -1448,31 +1498,14 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe->iommu.tables[0].it_iommu = >iommu; pe->iommu.ops = _pci_ioda2_ops; - /* -* Map TCE table through TVT. The TVE index is the PE number -* shifted by 1 bit for 32-bits DMA space. -*/ - rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - pe->pe_number << 1, 1, __pa(tbl->it_base), - tbl->it_size << 3, 1ULL << tbl->it_page_shift); + rc = pnv_pci_ioda2_set_window(pe, tbl); if (rc) { pe_err(pe, "Failed to configure 32-bit TCE table," " err %ld\n", rc); - goto fail; - } - - /* OPAL variant of PHB3 invalidated TCEs */ - swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); - if (swinvp) { - /* We need a couple more fields -- an address and a data -* to or. Since the bus is only printed out on table free -* errors, and on the first pass the data will be a relative -* bus number, print that out instead. -*/ - pe->tce_inval_reg_phys = be64_to_cpup(swinvp); - tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, - 8); - tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + pnv_pci_ioda2_free_table(tbl); + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + return; } iommu_register_group
[PATCH v3 16/24] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_create_table
This is a part of moving TCE table allocation into an iommu_ops callback to support multiple IOMMU groups per one VFIO container. This is a mechanical patch. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda.c | 88 +++ 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index ebfea0a..95d9119 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1295,6 +1295,62 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } +static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe, + __u32 page_shift, __u32 window_shift, + struct iommu_table *tbl) +{ + int nid = pe->phb->hose->node; + struct page *tce_mem = NULL; + void *addr; + unsigned long tce_table_size; + int64_t rc; + unsigned order; + + if ((page_shift != 12) && (page_shift != 16) && (page_shift != 24)) + return -EINVAL; + + if ((1ULL << window_shift) > memory_hotplug_max()) + return -EINVAL; + + tce_table_size = (1ULL << (window_shift - page_shift)) * 8; + tce_table_size = max(0x1000UL, tce_table_size); + + /* Allocate TCE table */ + order = get_order(tce_table_size); + + tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); + if (!tce_mem) { + pr_err("Failed to allocate a TCE memory, order=%d\n", order); + rc = -ENOMEM; + goto fail; + } + addr = page_address(tce_mem); + memset(addr, 0, tce_table_size); + + /* Setup linux iommu table */ + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, + page_shift); + + tbl->it_ops = _ioda2_iommu_ops; + iommu_init_table(tbl, nid); + + return 0; +fail: + if (tce_mem) + __free_pages(tce_mem, get_order(tce_table_size)); + + return rc; +} + +static void pnv_pci_ioda2_free_table(struct iommu_table *tbl) +{ + if (!tbl->it_size) + return; + + free_pages(tbl->it_base, get_order(tbl->it_size << 3)); + memset(tbl, 0, sizeof(struct 
iommu_table)); +} + static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { uint16_t window_id = (pe->pe_number << 1 ) + 1; @@ -1365,11 +1421,9 @@ static struct powerpc_iommu_ops pnv_pci_ioda2_ops = { static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - struct page *tce_mem = NULL; - void *addr; const __be64 *swinvp; - struct iommu_table *tbl; - unsigned int tce_table_size, end; + unsigned int end; + struct iommu_table *tbl = >iommu.tables[0]; int64_t rc; /* We shouldn't already have a 32-bit DMA associated */ @@ -1378,31 +1432,20 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* The PE will reserve all possible 32-bits space */ pe->tce32_seg = 0; + end = (1 << ilog2(phb->ioda.m32_pci_base)); - tce_table_size = (end / 0x1000) * 8; pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", end); - /* Allocate TCE table */ - tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, - get_order(tce_table_size)); - if (!tce_mem) { - pe_err(pe, "Failed to allocate a 32-bit TCE memory\n"); - goto fail; + rc = pnv_pci_ioda2_create_table(pe, IOMMU_PAGE_SHIFT_4K, + ilog2(phb->ioda.m32_pci_base), tbl); + if (rc) { + pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); + return; } - addr = page_address(tce_mem); - memset(addr, 0, tce_table_size); /* Setup iommu */ pe->iommu.tables[0].it_iommu = >iommu; - - /* Setup linux iommu table */ - tbl = >iommu.tables[0]; - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, - IOMMU_PAGE_SHIFT_4K); - - tbl->it_ops = _ioda2_iommu_ops; - iommu_init_table(tbl, phb->hose->node); pe->iommu.ops = _pci_ioda2_ops; /* @@ -1447,8 +1490,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, fail: if (pe->tce32_seg >= 0) pe->tce32_seg = -1; - if (tce_mem) - __free_pages(tce_mem, get_order(tce_table_size)); + pnv_pci_ioda2_free_table(tbl); } static void pnv_ioda_setup_dma(struct pnv_phb *phb) -- 2.0.0 -- To unsubscribe from this list: send the line "
[PATCH v3 08/24] powerpc/spapr: vfio: Switch from iommu_table to new powerpc_iommu
Modern IBM POWERPC systems support multiple (currently two) TCE tables per IOMMU group (a.k.a. PE). This adds a powerpc_iommu container for TCE tables. Right now just one table is supported. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h| 18 ++-- arch/powerpc/kernel/eeh.c | 2 +- arch/powerpc/kernel/iommu.c | 34 arch/powerpc/platforms/powernv/pci-ioda.c | 37 +--- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 16 ++-- arch/powerpc/platforms/powernv/pci.c| 2 +- arch/powerpc/platforms/powernv/pci.h| 4 +- arch/powerpc/platforms/pseries/iommu.c | 9 +- drivers/vfio/vfio_iommu_spapr_tce.c | 131 9 files changed, 170 insertions(+), 83 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 335e3d4..4fe 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -90,9 +90,7 @@ struct iommu_table { struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ -#ifdef CONFIG_IOMMU_API - struct iommu_group *it_group; -#endif + struct powerpc_iommu *it_iommu; struct iommu_table_ops *it_ops; void (*set_bypass)(struct iommu_table *tbl, bool enable); }; @@ -126,13 +124,23 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); + +#define POWERPC_IOMMU_MAX_TABLES 1 + +struct powerpc_iommu { #ifdef CONFIG_IOMMU_API -extern void iommu_register_group(struct iommu_table *tbl, + struct iommu_group *group; +#endif + struct iommu_table tables[POWERPC_IOMMU_MAX_TABLES]; +}; + +#ifdef CONFIG_IOMMU_API +extern void iommu_register_group(struct powerpc_iommu *iommu, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); #else -static inline void iommu_register_group(struct iommu_table *tbl, +static inline 
void iommu_register_group(struct powerpc_iommu *iommu, int pci_domain_number, unsigned long pe_num) { diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index e1b6d8e..319eae3 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1360,7 +1360,7 @@ static int dev_has_iommu_table(struct device *dev, void *data) return 0; tbl = get_iommu_table_base(dev); - if (tbl && tbl->it_group) { + if (tbl && tbl->it_iommu) { *ppdev = pdev; return 1; } diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 2f7e92b..952939f 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -712,17 +712,20 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) struct iommu_table *iommu_table_alloc(int node) { - struct iommu_table *tbl; + struct powerpc_iommu *iommu; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); + iommu = kzalloc_node(sizeof(struct powerpc_iommu), GFP_KERNEL, + node); + iommu->tables[0].it_iommu = iommu; - return tbl; + return >tables[0]; } void iommu_free_table(struct iommu_table *tbl, const char *node_name) { unsigned long bitmap_sz; unsigned int order; + struct powerpc_iommu *iommu = tbl->it_iommu; if (!tbl || !tbl->it_map) { printk(KERN_ERR "%s: expected TCE map for %s\n", __func__, @@ -738,9 +741,9 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) clear_bit(0, tbl->it_map); #ifdef CONFIG_IOMMU_API - if (tbl->it_group) { - iommu_group_put(tbl->it_group); - BUG_ON(tbl->it_group); + if (iommu->group) { + iommu_group_put(iommu->group); + BUG_ON(iommu->group); } #endif @@ -756,7 +759,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) free_pages((unsigned long) tbl->it_map, order); /* free table */ - kfree(tbl); + kfree(iommu); } /* Creates TCEs for a user provided buffer. 
The user buffer must be @@ -888,11 +891,12 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, */ static void group_release(void *iommu_data) { - struct iommu_table *tbl = iommu_data; - tbl->it_group = NULL; + struct powerpc_iommu *iommu = iommu_data; + + iommu->group = NULL; } -void iommu
[PATCH v3 03/24] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE
Normally a bitmap from the iommu_table is used to track what TCE entry is in use. Since we are going to use iommu_table without its locks and do xchg() instead, it becomes essential not to put bits which are not implied in the direction flag. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson --- arch/powerpc/platforms/powernv/pci.c | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 4945e87..9ec7d68 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -589,19 +589,27 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; +static unsigned long pnv_dmadir_to_flags(enum dma_data_direction direction) +{ + switch (direction) { + case DMA_BIDIRECTIONAL: + case DMA_FROM_DEVICE: + return TCE_PCI_READ | TCE_PCI_WRITE; + case DMA_TO_DEVICE: + return TCE_PCI_READ; + default: + return 0; + } +} + static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs, bool rm) { - u64 proto_tce; + u64 proto_tce = pnv_dmadir_to_flags(direction); __be64 *tcep, *tces; u64 rpn; - proto_tce = TCE_PCI_READ; // Read allowed - - if (direction != DMA_TO_DEVICE) - proto_tce |= TCE_PCI_WRITE; - tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; rpn = __pa(uaddr) >> tbl->it_page_shift; -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 15/24] powerpc/powernv/ioda2: Rework iommu_table creation
This moves iommu_table creation to the beginning. This is a mechanical patch. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 31 +-- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 6d279d5..ebfea0a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1393,27 +1393,31 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, addr = page_address(tce_mem); memset(addr, 0, tce_table_size); + /* Setup iommu */ + pe-iommu.tables[0].it_iommu = pe-iommu; + + /* Setup linux iommu table */ + tbl = pe-iommu.tables[0]; + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, + IOMMU_PAGE_SHIFT_4K); + + tbl-it_ops = pnv_ioda2_iommu_ops; + iommu_init_table(tbl, phb-hose-node); + pe-iommu.ops = pnv_pci_ioda2_ops; + /* * Map TCE table through TVT. The TVE index is the PE number * shifted by 1 bit for 32-bits DMA space. 
*/ rc = opal_pci_map_pe_dma_window(phb-opal_id, pe-pe_number, - pe-pe_number 1, 1, __pa(addr), - tce_table_size, 0x1000); + pe-pe_number 1, 1, __pa(tbl-it_base), + tbl-it_size 3, 1ULL tbl-it_page_shift); if (rc) { pe_err(pe, Failed to configure 32-bit TCE table, err %ld\n, rc); goto fail; } - /* Setup iommu */ - pe-iommu.tables[0].it_iommu = pe-iommu; - - /* Setup linux iommu table */ - tbl = pe-iommu.tables[0]; - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, - IOMMU_PAGE_SHIFT_4K); - /* OPAL variant of PHB3 invalidated TCEs */ swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); if (swinvp) { @@ -1427,14 +1431,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, 8); tbl-it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); } - tbl-it_ops = pnv_ioda2_iommu_ops; - iommu_init_table(tbl, phb-hose-node); - pe-iommu.ops = pnv_pci_ioda2_ops; + iommu_register_group(pe-iommu, phb-hose-global_number, pe-pe_number); if (pe-pdev) - set_iommu_table_base_and_group(pe-pdev-dev, tbl); + set_iommu_table_base_and_group(pe-pdev-dev, + pe-iommu.tables[0]); else pnv_ioda_setup_bus_dma(pe, pe-pbus, true); -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 04/24] vfio: powerpc/spapr: Use it_page_size
This makes use of the it_page_size from the iommu_table struct as page size can differ. This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code as recently introduced IOMMU_PAGE_XXX macros do not include IOMMU_PAGE_SHIFT. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Reviewed-by: David Gibson da...@gibson.dropbear.id.au --- drivers/vfio/vfio_iommu_spapr_tce.c | 26 +- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 99b98fa..c596053 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -97,7 +97,7 @@ static int tce_iommu_enable(struct tce_container *container) * enforcing the limit based on the max that the guest can map. */ down_write(current-mm-mmap_sem); - npages = (tbl-it_size IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; + npages = (tbl-it_size tbl-it_page_shift) PAGE_SHIFT; locked = current-mm-locked_vm + npages; lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; if (locked lock_limit !capable(CAP_IPC_LOCK)) { @@ -126,7 +126,7 @@ static void tce_iommu_disable(struct tce_container *container) down_write(current-mm-mmap_sem); current-mm-locked_vm -= (container-tbl-it_size - IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; + container-tbl-it_page_shift) PAGE_SHIFT; up_write(current-mm-mmap_sem); } @@ -232,7 +232,7 @@ static long tce_iommu_build(struct tce_container *container, tce, ret); break; } - tce += IOMMU_PAGE_SIZE_4K; + tce += IOMMU_PAGE_SIZE(tbl); } if (ret) @@ -277,8 +277,8 @@ static long tce_iommu_ioctl(void *iommu_data, if (info.argsz minsz) return -EINVAL; - info.dma32_window_start = tbl-it_offset IOMMU_PAGE_SHIFT_4K; - info.dma32_window_size = tbl-it_size IOMMU_PAGE_SHIFT_4K; + info.dma32_window_start = tbl-it_offset tbl-it_page_shift; + info.dma32_window_size = tbl-it_size tbl-it_page_shift; info.flags = 0; if (copy_to_user((void __user *)arg, info, minsz)) @@ -308,8 +308,8 @@ static long tce_iommu_ioctl(void *iommu_data, 
VFIO_DMA_MAP_FLAG_WRITE)) return -EINVAL; - if ((param.size ~IOMMU_PAGE_MASK_4K) || - (param.vaddr ~IOMMU_PAGE_MASK_4K)) + if ((param.size ~IOMMU_PAGE_MASK(tbl)) || + (param.vaddr ~IOMMU_PAGE_MASK(tbl))) return -EINVAL; /* iova is checked by the IOMMU API */ @@ -324,8 +324,8 @@ static long tce_iommu_ioctl(void *iommu_data, return ret; ret = tce_iommu_build(container, tbl, - param.iova IOMMU_PAGE_SHIFT_4K, - tce, param.size IOMMU_PAGE_SHIFT_4K); + param.iova tbl-it_page_shift, + tce, param.size tbl-it_page_shift); iommu_flush_tce(tbl); @@ -351,17 +351,17 @@ static long tce_iommu_ioctl(void *iommu_data, if (param.flags) return -EINVAL; - if (param.size ~IOMMU_PAGE_MASK_4K) + if (param.size ~IOMMU_PAGE_MASK(tbl)) return -EINVAL; ret = iommu_tce_clear_param_check(tbl, param.iova, 0, - param.size IOMMU_PAGE_SHIFT_4K); + param.size tbl-it_page_shift); if (ret) return ret; ret = tce_iommu_clear(container, tbl, - param.iova IOMMU_PAGE_SHIFT_4K, - param.size IOMMU_PAGE_SHIFT_4K); + param.iova tbl-it_page_shift, + param.size tbl-it_page_shift); iommu_flush_tce(tbl); return ret; -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 05/24] vfio: powerpc/spapr: Move locked_vm accounting to helpers
There moves locked pages accounting to helpers. Later they will be reused for Dynamic DMA windows (DDW). While we are here, update the comment explaining why RLIMIT_MEMLOCK might be required to be bigger than the guest RAM. This also prints pid of the current process in pr_warn/pr_debug. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 72 +++-- 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index c596053..29d5708 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -29,6 +29,47 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); +#define IOMMU_TABLE_PAGES(tbl) \ + (((tbl)-it_size (tbl)-it_page_shift) PAGE_SHIFT) + +static long try_increment_locked_vm(long npages) +{ + long ret = 0, locked, lock_limit; + + if (!current || !current-mm) + return -ESRCH; /* process exited */ + + down_write(current-mm-mmap_sem); + locked = current-mm-locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; + if (locked lock_limit !capable(CAP_IPC_LOCK)) { + pr_warn([%d] RLIMIT_MEMLOCK (%ld) exceeded\n, + current-pid, rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + current-mm-locked_vm += npages; + } + pr_debug([%d] RLIMIT_MEMLOCK+ %ld pages\n, current-pid, + current-mm-locked_vm); + up_write(current-mm-mmap_sem); + + return ret; +} + +static void decrement_locked_vm(long npages) +{ + if (!current || !current-mm) + return; /* process exited */ + + down_write(current-mm-mmap_sem); + if (npages current-mm-locked_vm) + npages = current-mm-locked_vm; + current-mm-locked_vm -= npages; + pr_debug([%d] RLIMIT_MEMLOCK- %ld pages\n, current-pid, + current-mm-locked_vm); + up_write(current-mm-mmap_sem); +} + /* * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation * @@ -66,8 +107,6 @@ static bool tce_check_page_size(struct page *page, unsigned page_shift) static int 
tce_iommu_enable(struct tce_container *container) { int ret = 0; - unsigned long locked, lock_limit, npages; - struct iommu_table *tbl = container-tbl; if (!container-tbl) return -ENXIO; @@ -95,21 +134,19 @@ static int tce_iommu_enable(struct tce_container *container) * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, * that would effectively kill the guest at random points, much better * enforcing the limit based on the max that the guest can map. +* +* Unfortunately at the moment it counts whole tables, no matter how +* much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups +* each with 2GB DMA window, 8GB will be counted here. The reason for +* this is that we cannot tell here the amount of RAM used by the guest +* as this information is only available from KVM and VFIO is +* KVM agnostic. */ - down_write(current-mm-mmap_sem); - npages = (tbl-it_size tbl-it_page_shift) PAGE_SHIFT; - locked = current-mm-locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; - if (locked lock_limit !capable(CAP_IPC_LOCK)) { - pr_warn(RLIMIT_MEMLOCK (%ld) exceeded\n, - rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - } else { + ret = try_increment_locked_vm(IOMMU_TABLE_PAGES(container-tbl)); + if (ret) + return ret; - current-mm-locked_vm += npages; - container-enabled = true; - } - up_write(current-mm-mmap_sem); + container-enabled = true; return ret; } @@ -124,10 +161,7 @@ static void tce_iommu_disable(struct tce_container *container) if (!container-tbl || !current-mm) return; - down_write(current-mm-mmap_sem); - current-mm-locked_vm -= (container-tbl-it_size - container-tbl-it_page_shift) PAGE_SHIFT; - up_write(current-mm-mmap_sem); + decrement_locked_vm(IOMMU_TABLE_PAGES(container-tbl)); } static void *tce_iommu_open(unsigned long arg) -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please 
read the FAQ at http://www.tux.org/lkml/
[PATCH v3 11/24] powerpc/powernv/ioda/ioda2: Rework tce_build()/tce_free()
The pnv_pci_ioda_tce_invalidate() helper invalidates TCE cache. It is supposed to be called on IODA1/2 and not called on p5ioc2. It receives start and end host addresses of TCE table. This approach makes it possible to get pnv_pci_ioda_tce_invalidate() unintentionally called on p5ioc2. Another issue is that IODA2 needs PCI addresses to invalidate the cache and those can be calculated from host addresses but since we are going to implement multi-level TCE tables, calculating PCI address from a host address might get either tricky or ugly as TCE table remains flat on PCI bus but not in RAM. This defines separate iommu_table_ops callbacks for p5ioc2 and IODA1/2 PHBs. They all call common pnv_tce_build/pnv_tce_free/pnv_tce_get helpers but call PHB specific TCE invalidation helper (when needed). This changes pnv_pci_ioda2_tce_invalidate() to receives TCE index and number of pages which are PCI addresses shifted by IOMMU page shift. The patch is pretty mechanical and behaviour is not expected to change. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 92 ++--- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 8 ++- arch/powerpc/platforms/powernv/pci.c| 76 +--- arch/powerpc/platforms/powernv/pci.h| 7 ++- 4 files changed, 110 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index a33a116..dfc56fc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1041,18 +1041,20 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, } } -static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, -struct iommu_table *tbl, -__be64 *startp, __be64 *endp, bool rm) +static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, + unsigned long index, unsigned long npages, bool rm) { + struct pnv_ioda_pe *pe = container_of(tbl-it_iommu, + struct pnv_ioda_pe, iommu); __be64 __iomem *invalidate = rm ? 
(__be64 __iomem *)pe-tce_inval_reg_phys : (__be64 __iomem *)tbl-it_index; unsigned long start, end, inc; const unsigned shift = tbl-it_page_shift; - start = __pa(startp); - end = __pa(endp); + start = __pa((__be64 *)tbl-it_base + index - tbl-it_offset); + end = __pa((__be64 *)tbl-it_base + index - tbl-it_offset + + npages - 1); /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ if (tbl-it_busno) { @@ -1088,10 +1090,40 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, */ } -static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, -struct iommu_table *tbl, -__be64 *startp, __be64 *endp, bool rm) +static int pnv_ioda1_tce_build_vm(struct iommu_table *tbl, long index, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) { + long ret = pnv_tce_build(tbl, index, npages, uaddr, direction, + attrs); + + if (!ret (tbl-it_type TCE_PCI_SWINV_CREATE)) + pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false); + + return ret; +} + +static void pnv_ioda1_tce_free_vm(struct iommu_table *tbl, long index, + long npages) +{ + pnv_tce_free(tbl, index, npages); + + if (tbl-it_type TCE_PCI_SWINV_FREE) + pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false); +} + +struct iommu_table_ops pnv_ioda1_iommu_ops = { + .set = pnv_ioda1_tce_build_vm, + .clear = pnv_ioda1_tce_free_vm, + .get = pnv_tce_get, +}; + +static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, + unsigned long index, unsigned long npages, bool rm) +{ + struct pnv_ioda_pe *pe = container_of(tbl-it_iommu, + struct pnv_ioda_pe, iommu); unsigned long start, end, inc; __be64 __iomem *invalidate = rm ? 
(__be64 __iomem *)pe-tce_inval_reg_phys : @@ -1104,9 +1136,9 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, end = start; /* Figure out the start, end and step */ - inc = tbl-it_offset + (((u64)startp - tbl-it_base) / sizeof(u64)); + inc = tbl-it_offset + index / sizeof(u64); start |= (inc shift); - inc = tbl-it_offset + (((u64)endp - tbl-it_base) / sizeof(u64)); + inc = tbl-it_offset + (index + npages - 1) / sizeof(u64); end |= (inc shift); inc = (0x1ull shift); mb(); @@ -1120,19 +1152,35 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe
[PATCH v3 02/24] vfio: powerpc/iommu: Check that TCE page size is equal to it_page_size
This checks that the TCE table page size is not bigger that the size of a page we just pinned and going to put its physical address to the table. Otherwise the hardware gets unwanted access to physical memory between the end of the actual page and the end of the aligned up TCE page. Since compound_order() and compound_head() work correctly on non-huge pages, there is no need for additional check whether the page is huge. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v5: * check is done for all page sizes now, not just for huge pages * failed check returns EFAULT now (was EINVAL) * moved the check to VFIO SPAPR IOMMU driver --- drivers/vfio/vfio_iommu_spapr_tce.c | 22 ++ 1 file changed, 22 insertions(+) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index dc4a886..99b98fa 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -47,6 +47,22 @@ struct tce_container { bool enabled; }; +static bool tce_check_page_size(struct page *page, unsigned page_shift) +{ + unsigned shift; + + /* +* Check that the TCE table granularity is not bigger than the size of +* a page we just found. Otherwise the hardware can get access to +* a bigger memory chunk that it should. +*/ + shift = PAGE_SHIFT + compound_order(compound_head(page)); + if (shift = page_shift) + return true; + + return false; +} + static int tce_iommu_enable(struct tce_container *container) { int ret = 0; @@ -199,6 +215,12 @@ static long tce_iommu_build(struct tce_container *container, ret = -EFAULT; break; } + + if (!tce_check_page_size(page, tbl-it_page_shift)) { + ret = -EFAULT; + break; + } + hva = (unsigned long) page_address(page) + (tce IOMMU_PAGE_MASK(tbl) ~PAGE_MASK); -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 09/24] powerpc/iommu: Fix IOMMU ownership control functions
This adds missing locks in iommu_take_ownership()/ iommu_release_ownership(). This marks all pages busy in iommu_table::it_map in order to catch errors if there is an attempt to use this table while ownership over it is taken. This only clears TCE content if there is no page marked busy in it_map. Clearing must be done outside of the table locks as iommu_clear_tce() called from iommu_clear_tces_and_put_pages() does this. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Note: we might want to get rid of it as this patchset removes it_map from tables passed to VFIO. Changes: v5: * do not store bit#0 value, it has to be set for zero-based table anyway * removed test_and_clear_bit * only disable bypass if succeeded --- arch/powerpc/kernel/iommu.c | 31 +-- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 952939f..407d0d6 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1024,33 +1024,48 @@ EXPORT_SYMBOL_GPL(iommu_tce_build); int iommu_take_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl-it_size + 7) 3; + unsigned long flags, i, sz = (tbl-it_size + 7) 3; + int ret = 0; + + spin_lock_irqsave(tbl-large_pool.lock, flags); + for (i = 0; i tbl-nr_pools; i++) + spin_lock(tbl-pools[i].lock); if (tbl-it_offset == 0) clear_bit(0, tbl-it_map); if (!bitmap_empty(tbl-it_map, tbl-it_size)) { pr_err(iommu_tce: it_map is not empty); - return -EBUSY; + ret = -EBUSY; + if (tbl-it_offset == 0) + set_bit(0, tbl-it_map); + } else { + memset(tbl-it_map, 0xff, sz); } - memset(tbl-it_map, 0xff, sz); + for (i = 0; i tbl-nr_pools; i++) + spin_unlock(tbl-pools[i].lock); + spin_unlock_irqrestore(tbl-large_pool.lock, flags); /* * Disable iommu bypass, otherwise the user can DMA to all of * our physical memory via the bypass window instead of just * the pages that has been explicitly mapped into the iommu */ - if (tbl-set_bypass) + if (!ret tbl-set_bypass) 
tbl-set_bypass(tbl, false); - return 0; + return ret; } EXPORT_SYMBOL_GPL(iommu_take_ownership); void iommu_release_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl-it_size + 7) 3; + unsigned long flags, i, sz = (tbl-it_size + 7) 3; + + spin_lock_irqsave(tbl-large_pool.lock, flags); + for (i = 0; i tbl-nr_pools; i++) + spin_lock(tbl-pools[i].lock); memset(tbl-it_map, 0, sz); @@ -1058,6 +1073,10 @@ void iommu_release_ownership(struct iommu_table *tbl) if (tbl-it_offset == 0) set_bit(0, tbl-it_map); + for (i = 0; i tbl-nr_pools; i++) + spin_unlock(tbl-pools[i].lock); + spin_unlock_irqrestore(tbl-large_pool.lock, flags); + /* The kernel owns the device now, we can restore the iommu bypass */ if (tbl-set_bypass) tbl-set_bypass(tbl, true); -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 07/24] powerpc/iommu: Introduce iommu_table_alloc() helper
This replaces multiple calls of kzalloc_node() with a new iommu_table_alloc() helper. Right now it calls kzalloc_node() but later it will be modified to allocate a powerpc_iommu struct with a single iommu_table in it. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 1 + arch/powerpc/kernel/iommu.c| 9 + arch/powerpc/platforms/powernv/pci.c | 2 +- arch/powerpc/platforms/pseries/iommu.c | 12 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index eb5822d..335e3d4 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -117,6 +117,7 @@ static inline void *get_iommu_table_base(struct device *dev) return dev-archdata.dma_data.iommu_table_base; } +extern struct iommu_table *iommu_table_alloc(int node); /* Frees table for an individual device node */ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c51ad3e..2f7e92b 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -710,6 +710,15 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) return tbl; } +struct iommu_table *iommu_table_alloc(int node) +{ + struct iommu_table *tbl; + + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); + + return tbl; +} + void iommu_free_table(struct iommu_table *tbl, const char *node_name) { unsigned long bitmap_sz; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index c4782b1..bbe529b 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -693,7 +693,7 @@ static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose) hose-dn-full_name); return NULL; } - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, hose-node); + tbl = iommu_table_alloc(hose-node); if (WARN_ON(!tbl)) 
return NULL; pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)), diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 1aa1815..bc14299 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -617,8 +617,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) pci-phb-dma_window_size = 0x800ul; pci-phb-dma_window_base_cur = 0x800ul; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - pci-phb-node); + tbl = iommu_table_alloc(pci-phb-node); iommu_table_setparms(pci-phb, dn, tbl); tbl-it_ops = iommu_table_pseries_ops; @@ -669,8 +668,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) pdn-full_name, ppci-iommu_table); if (!ppci-iommu_table) { - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - ppci-phb-node); + tbl = iommu_table_alloc(ppci-phb-node); iommu_table_setparms_lpar(ppci-phb, pdn, tbl, dma_window); tbl-it_ops = iommu_table_lpar_multi_ops; ppci-iommu_table = iommu_init_table(tbl, ppci-phb-node); @@ -697,8 +695,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) struct pci_controller *phb = PCI_DN(dn)-phb; pr_debug( -- first child, no bridge. 
Allocating iommu table.\n); - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - phb-node); + tbl = iommu_table_alloc(phb-node); iommu_table_setparms(phb, dn, tbl); tbl-it_ops = iommu_table_pseries_ops; PCI_DN(dn)-iommu_table = iommu_init_table(tbl, phb-node); @@ -1120,8 +1117,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pci = PCI_DN(pdn); if (!pci-iommu_table) { - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - pci-phb-node); + tbl = iommu_table_alloc(pci-phb-node); iommu_table_setparms_lpar(pci-phb, pdn, tbl, dma_window); tbl-it_ops = iommu_table_lpar_multi_ops; pci-iommu_table = iommu_init_table(tbl, pci-phb-node); -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 08/24] powerpc/spapr: vfio: Switch from iommu_table to new powerpc_iommu
Modern IBM POWERPC systems support multiple (currently two) TCE tables per IOMMU group (a.k.a. PE). This adds a powerpc_iommu container for TCE tables. Right now just one table is supported. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h| 18 ++-- arch/powerpc/kernel/eeh.c | 2 +- arch/powerpc/kernel/iommu.c | 34 arch/powerpc/platforms/powernv/pci-ioda.c | 37 +--- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 16 ++-- arch/powerpc/platforms/powernv/pci.c| 2 +- arch/powerpc/platforms/powernv/pci.h| 4 +- arch/powerpc/platforms/pseries/iommu.c | 9 +- drivers/vfio/vfio_iommu_spapr_tce.c | 131 9 files changed, 170 insertions(+), 83 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 335e3d4..4fe 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -90,9 +90,7 @@ struct iommu_table { struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ -#ifdef CONFIG_IOMMU_API - struct iommu_group *it_group; -#endif + struct powerpc_iommu *it_iommu; struct iommu_table_ops *it_ops; void (*set_bypass)(struct iommu_table *tbl, bool enable); }; @@ -126,13 +124,23 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); + +#define POWERPC_IOMMU_MAX_TABLES 1 + +struct powerpc_iommu { #ifdef CONFIG_IOMMU_API -extern void iommu_register_group(struct iommu_table *tbl, + struct iommu_group *group; +#endif + struct iommu_table tables[POWERPC_IOMMU_MAX_TABLES]; +}; + +#ifdef CONFIG_IOMMU_API +extern void iommu_register_group(struct powerpc_iommu *iommu, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); #else -static inline void iommu_register_group(struct iommu_table *tbl, 
+static inline void iommu_register_group(struct powerpc_iommu *iommu, int pci_domain_number, unsigned long pe_num) { diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index e1b6d8e..319eae3 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1360,7 +1360,7 @@ static int dev_has_iommu_table(struct device *dev, void *data) return 0; tbl = get_iommu_table_base(dev); - if (tbl tbl-it_group) { + if (tbl tbl-it_iommu) { *ppdev = pdev; return 1; } diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 2f7e92b..952939f 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -712,17 +712,20 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) struct iommu_table *iommu_table_alloc(int node) { - struct iommu_table *tbl; + struct powerpc_iommu *iommu; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); + iommu = kzalloc_node(sizeof(struct powerpc_iommu), GFP_KERNEL, + node); + iommu-tables[0].it_iommu = iommu; - return tbl; + return iommu-tables[0]; } void iommu_free_table(struct iommu_table *tbl, const char *node_name) { unsigned long bitmap_sz; unsigned int order; + struct powerpc_iommu *iommu = tbl-it_iommu; if (!tbl || !tbl-it_map) { printk(KERN_ERR %s: expected TCE map for %s\n, __func__, @@ -738,9 +741,9 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) clear_bit(0, tbl-it_map); #ifdef CONFIG_IOMMU_API - if (tbl-it_group) { - iommu_group_put(tbl-it_group); - BUG_ON(tbl-it_group); + if (iommu-group) { + iommu_group_put(iommu-group); + BUG_ON(iommu-group); } #endif @@ -756,7 +759,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) free_pages((unsigned long) tbl-it_map, order); /* free table */ - kfree(tbl); + kfree(iommu); } /* Creates TCEs for a user provided buffer. 
The user buffer must be @@ -888,11 +891,12 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, */ static void group_release(void *iommu_data) { - struct iommu_table *tbl = iommu_data; - tbl-it_group = NULL; + struct powerpc_iommu *iommu = iommu_data; + + iommu-group = NULL; } -void iommu_register_group(struct iommu_table *tbl, +void iommu_register_group(struct
[PATCH v3 03/24] powerpc/powernv: Do not set read flag if direction==DMA_NONE
Normally a bitmap from the iommu_table is used to track what TCE entry is in use. Since we are going to use iommu_table without its locks and do xchg() instead, it becomes essential not to put bits which are not implied in the direction flag. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Reviewed-by: David Gibson da...@gibson.dropbear.id.au --- arch/powerpc/platforms/powernv/pci.c | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 4945e87..9ec7d68 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -589,19 +589,27 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; +static unsigned long pnv_dmadir_to_flags(enum dma_data_direction direction) +{ + switch (direction) { + case DMA_BIDIRECTIONAL: + case DMA_FROM_DEVICE: + return TCE_PCI_READ | TCE_PCI_WRITE; + case DMA_TO_DEVICE: + return TCE_PCI_READ; + default: + return 0; + } +} + static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs, bool rm) { - u64 proto_tce; + u64 proto_tce = pnv_dmadir_to_flags(direction); __be64 *tcep, *tces; u64 rpn; - proto_tce = TCE_PCI_READ; // Read allowed - - if (direction != DMA_TO_DEVICE) - proto_tce |= TCE_PCI_WRITE; - tces = tcep = ((__be64 *)tbl-it_base) + index - tbl-it_offset; rpn = __pa(uaddr) tbl-it_page_shift; -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 01/24] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver
This moves page pinning (get_user_pages_fast()/put_page()) code out of the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs to as the platform code does not deal with page pinning. This makes iommu_take_ownership()/iommu_release_ownership() deal with the IOMMU table bitmap only. This removes page unpinning from iommu_take_ownership() as the actual TCE table might contain garbage and doing put_page() on it is undefined behaviour. Besides the last part, the rest of the patch is mechanical. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h| 6 --- arch/powerpc/kernel/iommu.c | 68 --- drivers/vfio/vfio_iommu_spapr_tce.c | 91 +++-- 3 files changed, 78 insertions(+), 87 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 9cfa370..45b07f6 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -191,16 +191,10 @@ extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long hwaddr, enum dma_data_direction direction); extern unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry); -extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages); -extern int iommu_put_tce_user_mode(struct iommu_table *tbl, - unsigned long entry, unsigned long tce); extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); extern void iommu_release_ownership(struct iommu_table *tbl); -extern enum dma_data_direction iommu_tce_direction(unsigned long tce); - #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 5d3968c..456acb1 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -903,19 +903,6 @@ void iommu_register_group(struct iommu_table *tbl, kfree(name); } -enum dma_data_direction iommu_tce_direction(unsigned long 
tce) -{ - if ((tce TCE_PCI_READ) (tce TCE_PCI_WRITE)) - return DMA_BIDIRECTIONAL; - else if (tce TCE_PCI_READ) - return DMA_TO_DEVICE; - else if (tce TCE_PCI_WRITE) - return DMA_FROM_DEVICE; - else - return DMA_NONE; -} -EXPORT_SYMBOL_GPL(iommu_tce_direction); - void iommu_flush_tce(struct iommu_table *tbl) { /* Flush/invalidate TLB caches if necessary */ @@ -991,30 +978,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) } EXPORT_SYMBOL_GPL(iommu_clear_tce); -int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages) -{ - unsigned long oldtce; - struct page *page; - - for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) - continue; - - page = pfn_to_page(oldtce PAGE_SHIFT); - WARN_ON(!page); - if (page) { - if (oldtce TCE_PCI_WRITE) - SetPageDirty(page); - put_page(page); - } - } - - return 0; -} -EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); - /* * hwaddr is a kernel virtual address here (0xc... bazillion), * tce_build converts it to a physical address. 
@@ -1044,35 +1007,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, } EXPORT_SYMBOL_GPL(iommu_tce_build); -int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, - unsigned long tce) -{ - int ret; - struct page *page = NULL; - unsigned long hwaddr, offset = tce IOMMU_PAGE_MASK(tbl) ~PAGE_MASK; - enum dma_data_direction direction = iommu_tce_direction(tce); - - ret = get_user_pages_fast(tce PAGE_MASK, 1, - direction != DMA_TO_DEVICE, page); - if (unlikely(ret != 1)) { - /* pr_err(iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n, - tce, entry tbl-it_page_shift, ret); */ - return -EFAULT; - } - hwaddr = (unsigned long) page_address(page) + offset; - - ret = iommu_tce_build(tbl, entry, hwaddr, direction); - if (ret) - put_page(page); - - if (ret 0) - pr_err(iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n, - __func__, entry tbl-it_page_shift, tce, ret); - - return ret; -} -EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); - int iommu_take_ownership(struct iommu_table *tbl) { unsigned long sz = (tbl-it_size + 7) 3; @@ -1086,7 +1020,6 @@ int iommu_take_ownership(struct iommu_table *tbl) } memset(tbl
[PATCH v3 06/24] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table
This adds an iommu_table_ops struct and puts a pointer to it into the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush callbacks from ppc_md to the new struct where they really belong. This adds the requirement for @it_ops to be initialized before calling iommu_init_table() to make sure that we do not leave any IOMMU table with iommu_table_ops uninitialized. This is not a parameter of iommu_init_table() though as there will be cases when iommu_init_table() will not be called on TCE tables used by VFIO. This does s/tce_build/set/, s/tce_free/clear/ and removes redundant tce_ prefixes. This removes tce_xxx_rm handlers from ppc_md but does not add them to iommu_table_ops as this will be done later if we decide to support TCE hypercalls in real mode. For pSeries, this always uses tce_buildmulti_pSeriesLP/tce_freemulti_pSeriesLP. This changes the multi callback to fall back to tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not present. The reason for this is we still have to support multitce=off boot parameter in disable_multitce() and we do not want to walk through all IOMMU tables in the system and replace multi callbacks with single ones. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h| 17 +++ arch/powerpc/include/asm/machdep.h | 25 arch/powerpc/kernel/iommu.c | 46 +++-- arch/powerpc/kernel/vio.c | 5 arch/powerpc/platforms/cell/iommu.c | 8 +++-- arch/powerpc/platforms/pasemi/iommu.c | 7 +++-- arch/powerpc/platforms/powernv/pci-ioda.c | 2 ++ arch/powerpc/platforms/powernv/pci-p5ioc2.c | 1 + arch/powerpc/platforms/powernv/pci.c| 23 --- arch/powerpc/platforms/powernv/pci.h| 1 + arch/powerpc/platforms/pseries/iommu.c | 34 +++-- arch/powerpc/sysdev/dart_iommu.c| 12 12 files changed, 93 insertions(+), 88 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 45b07f6..eb5822d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -43,6 +43,22 @@ extern int iommu_is_off; extern int iommu_force_on; +struct iommu_table_ops { + int (*set)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs); + void (*clear)(struct iommu_table *tbl, + long index, long npages); + unsigned long (*get)(struct iommu_table *tbl, long index); + void (*flush)(struct iommu_table *tbl); +}; + +/* These are used by VIO */ +extern struct iommu_table_ops iommu_table_lpar_multi_ops; +extern struct iommu_table_ops iommu_table_pseries_ops; + /* * IOMAP_MAX_ORDER defines the largest contiguous block * of dma space we can get. 
IOMAP_MAX_ORDER = 13 @@ -77,6 +93,7 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif + struct iommu_table_ops *it_ops; void (*set_bypass)(struct iommu_table *tbl, bool enable); }; diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index c8175a3..2abe744 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -65,31 +65,6 @@ struct machdep_calls { * destroyed as well */ void(*hpte_clear_all)(void); - int (*tce_build)(struct iommu_table *tbl, -long index, -long npages, -unsigned long uaddr, -enum dma_data_direction direction, -struct dma_attrs *attrs); - void(*tce_free)(struct iommu_table *tbl, - long index, - long npages); - unsigned long (*tce_get)(struct iommu_table *tbl, - long index); - void(*tce_flush)(struct iommu_table *tbl); - - /* _rm versions are for real mode use only */ - int (*tce_build_rm)(struct iommu_table *tbl, -long index, -long npages, -unsigned long uaddr, -enum dma_data_direction direction, -struct dma_attrs *attrs); - void(*tce_free_rm)(struct iommu_table *tbl, - long index, - long npages); - void(*tce_flush_rm)(struct iommu_table *tbl); - void __iomem * (*ioremap)(phys_addr_t addr, unsigned long size
[PATCH v3 17/24] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window
This is a part of moving DMA window programming to an iommu_ops callback. This is a mechanical patch. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 84 --- 1 file changed, 56 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 95d9119..1f725d4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1351,6 +1351,57 @@ static void pnv_pci_ioda2_free_table(struct iommu_table *tbl) memset(tbl, 0, sizeof(struct iommu_table)); } +static long pnv_pci_ioda2_set_window(struct pnv_ioda_pe *pe, + struct iommu_table *tbl) +{ + struct pnv_phb *phb = pe-phb; + const __be64 *swinvp; + int64_t rc; + const __u64 start_addr = tbl-it_offset tbl-it_page_shift; + const __u64 win_size = tbl-it_size tbl-it_page_shift; + + pe_info(pe, Setting up window at %llx..%llx pagesize=0x%x tablesize=0x%lx\n, + start_addr, start_addr + win_size - 1, + 1UL tbl-it_page_shift, tbl-it_size 3); + + pe-iommu.tables[0] = *tbl; + tbl = pe-iommu.tables[0]; + tbl-it_iommu = pe-iommu; + + /* +* Map TCE table through TVT. The TVE index is the PE number +* shifted by 1 bit for 32-bits DMA space. +*/ + rc = opal_pci_map_pe_dma_window(phb-opal_id, pe-pe_number, + pe-pe_number 1, 1, __pa(tbl-it_base), + tbl-it_size 3, 1ULL tbl-it_page_shift); + if (rc) { + pe_err(pe, Failed to configure TCE table, err %ld\n, rc); + goto fail; + } + + /* OPAL variant of PHB3 invalidated TCEs */ + swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); + if (swinvp) { + /* We need a couple more fields -- an address and a data +* to or. Since the bus is only printed out on table free +* errors, and on the first pass the data will be a relative +* bus number, print that out instead. 
+*/ + pe-tce_inval_reg_phys = be64_to_cpup(swinvp); + tbl-it_index = (unsigned long)ioremap(pe-tce_inval_reg_phys, + 8); + tbl-it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + } + + return 0; +fail: + if (pe-tce32_seg = 0) + pe-tce32_seg = -1; + + return rc; +} + static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { uint16_t window_id = (pe-pe_number 1 ) + 1; @@ -1421,7 +1472,6 @@ static struct powerpc_iommu_ops pnv_pci_ioda2_ops = { static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - const __be64 *swinvp; unsigned int end; struct iommu_table *tbl = pe-iommu.tables[0]; int64_t rc; @@ -1448,31 +1498,14 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe-iommu.tables[0].it_iommu = pe-iommu; pe-iommu.ops = pnv_pci_ioda2_ops; - /* -* Map TCE table through TVT. The TVE index is the PE number -* shifted by 1 bit for 32-bits DMA space. -*/ - rc = opal_pci_map_pe_dma_window(phb-opal_id, pe-pe_number, - pe-pe_number 1, 1, __pa(tbl-it_base), - tbl-it_size 3, 1ULL tbl-it_page_shift); + rc = pnv_pci_ioda2_set_window(pe, tbl); if (rc) { pe_err(pe, Failed to configure 32-bit TCE table, err %ld\n, rc); - goto fail; - } - - /* OPAL variant of PHB3 invalidated TCEs */ - swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); - if (swinvp) { - /* We need a couple more fields -- an address and a data -* to or. Since the bus is only printed out on table free -* errors, and on the first pass the data will be a relative -* bus number, print that out instead. 
-*/ - pe-tce_inval_reg_phys = be64_to_cpup(swinvp); - tbl-it_index = (unsigned long)ioremap(pe-tce_inval_reg_phys, - 8); - tbl-it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + pnv_pci_ioda2_free_table(tbl); + if (pe-tce32_seg = 0) + pe-tce32_seg = -1; + return; } iommu_register_group(pe-iommu, phb-hose-global_number, @@ -1486,11 +1519,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* Also create a bypass window */ pnv_pci_ioda2_setup_bypass_pe(phb, pe); - return; -fail: - if (pe-tce32_seg = 0) - pe-tce32_seg = -1
[PATCH v3 16/24] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_create_table
This is a part of moving TCE table allocation into an iommu_ops callback to support multiple IOMMU groups per one VFIO container. This is a mechanical patch. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 88 +++ 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index ebfea0a..95d9119 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1295,6 +1295,62 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } +static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe, + __u32 page_shift, __u32 window_shift, + struct iommu_table *tbl) +{ + int nid = pe-phb-hose-node; + struct page *tce_mem = NULL; + void *addr; + unsigned long tce_table_size; + int64_t rc; + unsigned order; + + if ((page_shift != 12) (page_shift != 16) (page_shift != 24)) + return -EINVAL; + + if ((1ULL window_shift) memory_hotplug_max()) + return -EINVAL; + + tce_table_size = (1ULL (window_shift - page_shift)) * 8; + tce_table_size = max(0x1000UL, tce_table_size); + + /* Allocate TCE table */ + order = get_order(tce_table_size); + + tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); + if (!tce_mem) { + pr_err(Failed to allocate a TCE memory, order=%d\n, order); + rc = -ENOMEM; + goto fail; + } + addr = page_address(tce_mem); + memset(addr, 0, tce_table_size); + + /* Setup linux iommu table */ + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, + page_shift); + + tbl-it_ops = pnv_ioda2_iommu_ops; + iommu_init_table(tbl, nid); + + return 0; +fail: + if (tce_mem) + __free_pages(tce_mem, get_order(tce_table_size)); + + return rc; +} + +static void pnv_pci_ioda2_free_table(struct iommu_table *tbl) +{ + if (!tbl-it_size) + return; + + free_pages(tbl-it_base, get_order(tbl-it_size 3)); + memset(tbl, 0, sizeof(struct iommu_table)); 
+} + static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { uint16_t window_id = (pe-pe_number 1 ) + 1; @@ -1365,11 +1421,9 @@ static struct powerpc_iommu_ops pnv_pci_ioda2_ops = { static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - struct page *tce_mem = NULL; - void *addr; const __be64 *swinvp; - struct iommu_table *tbl; - unsigned int tce_table_size, end; + unsigned int end; + struct iommu_table *tbl = pe-iommu.tables[0]; int64_t rc; /* We shouldn't already have a 32-bit DMA associated */ @@ -1378,31 +1432,20 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* The PE will reserve all possible 32-bits space */ pe-tce32_seg = 0; + end = (1 ilog2(phb-ioda.m32_pci_base)); - tce_table_size = (end / 0x1000) * 8; pe_info(pe, Setting up 32-bit TCE table at 0..%08x\n, end); - /* Allocate TCE table */ - tce_mem = alloc_pages_node(phb-hose-node, GFP_KERNEL, - get_order(tce_table_size)); - if (!tce_mem) { - pe_err(pe, Failed to allocate a 32-bit TCE memory\n); - goto fail; + rc = pnv_pci_ioda2_create_table(pe, IOMMU_PAGE_SHIFT_4K, + ilog2(phb-ioda.m32_pci_base), tbl); + if (rc) { + pe_err(pe, Failed to create 32-bit TCE table, err %ld, rc); + return; } - addr = page_address(tce_mem); - memset(addr, 0, tce_table_size); /* Setup iommu */ pe-iommu.tables[0].it_iommu = pe-iommu; - - /* Setup linux iommu table */ - tbl = pe-iommu.tables[0]; - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, - IOMMU_PAGE_SHIFT_4K); - - tbl-it_ops = pnv_ioda2_iommu_ops; - iommu_init_table(tbl, phb-hose-node); pe-iommu.ops = pnv_pci_ioda2_ops; /* @@ -1447,8 +1490,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, fail: if (pe-tce32_seg = 0) pe-tce32_seg = -1; - if (tce_mem) - __free_pages(tce_mem, get_order(tce_table_size)); + pnv_pci_ioda2_free_table(tbl); } static void pnv_ioda_setup_dma(struct pnv_phb *phb) -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a 
message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org
[PATCH v3 14/24] vfio: powerpc/spapr: Register memory
The existing implementation accounts the whole DMA window in the locked_vm counter which is going to be even worse with multiple containers and huge DMA windows. This introduces 2 ioctls to register/unregister DMA memory which receive user space address and size of the memory region which needs to be pinned/unpinned and counted in locked_vm. If any memory region was registered, all subsequent DMA map requests should address already pinned memory. If no memory was registered, then the amount of memory required for a single default memory will be accounted when the container is enabled and every map/unmap will pin/unpin a page. Dynamic DMA window and in-kernel acceleration will require memory to be registered in order to work. The accounting is done per VFIO container. When the support of multiple groups per container is added, we will have accurate locked_vm accounting. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 333 include/uapi/linux/vfio.h | 29 2 files changed, 331 insertions(+), 31 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 8256275..d0987ae 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -86,8 +86,169 @@ struct tce_container { struct mutex lock; struct iommu_group *grp; bool enabled; + struct list_head mem_list; }; +struct tce_memory { + struct list_head next; + struct rcu_head rcu; + __u64 vaddr; + __u64 size; + __u64 pfns[]; +}; + +static void tce_unpin_pages(struct tce_container *container, + struct tce_memory *mem, __u64 vaddr, __u64 size) +{ + __u64 off; + struct page *page = NULL; + + + for (off = 0; off size; off += PAGE_SIZE) { + if (!mem-pfns[off PAGE_SHIFT]) + continue; + + page = pfn_to_page(mem-pfns[off PAGE_SHIFT]); + if (!page) + continue; + + put_page(page); + mem-pfns[off PAGE_SHIFT] = 0; + } +} + +static void release_tce_memory(struct rcu_head *head) +{ + struct tce_memory *mem = 
container_of(head, struct tce_memory, rcu); + + kfree(mem); +} + +static void tce_do_unregister_pages(struct tce_container *container, + struct tce_memory *mem) +{ + tce_unpin_pages(container, mem, mem-vaddr, mem-size); + decrement_locked_vm(mem-size); + list_del_rcu(mem-next); + call_rcu_sched(mem-rcu, release_tce_memory); +} + +static long tce_unregister_pages(struct tce_container *container, + __u64 vaddr, __u64 size) +{ + struct tce_memory *mem, *memtmp; + + if (container-enabled) + return -EBUSY; + + if ((vaddr ~PAGE_MASK) || (size ~PAGE_MASK)) + return -EINVAL; + + list_for_each_entry_safe(mem, memtmp, container-mem_list, next) { + if ((mem-vaddr == vaddr) (mem-size == size)) { + tce_do_unregister_pages(container, mem); + return 0; + } + } + + return -ENOENT; +} + +static long tce_pin_pages(struct tce_container *container, + struct tce_memory *mem, __u64 vaddr, __u64 size) +{ + __u64 off; + struct page *page = NULL; + + for (off = 0; off size; off += PAGE_SIZE) { + if (1 != get_user_pages_fast(vaddr + off, + 1/* pages */, 1/* iswrite */, page)) { + tce_unpin_pages(container, mem, vaddr, off); + return -EFAULT; + } + + mem-pfns[off PAGE_SHIFT] = page_to_pfn(page); + } + + return 0; +} + +static long tce_register_pages(struct tce_container *container, + __u64 vaddr, __u64 size) +{ + long ret; + struct tce_memory *mem; + + if (container-enabled) + return -EBUSY; + + if ((vaddr ~PAGE_MASK) || (size ~PAGE_MASK) || + ((vaddr + size) vaddr)) + return -EINVAL; + + /* Any overlap with registered chunks? 
*/ + rcu_read_lock(); + list_for_each_entry_rcu(mem, container-mem_list, next) { + if ((mem-vaddr (vaddr + size)) + (vaddr (mem-vaddr + mem-size))) { + ret = -EBUSY; + goto unlock_exit; + } + } + + ret = try_increment_locked_vm(size PAGE_SHIFT); + if (ret) + goto unlock_exit; + + mem = kzalloc(sizeof(*mem) + (size (PAGE_SHIFT - 3)), GFP_KERNEL); + if (!mem) + goto unlock_exit; + + if (tce_pin_pages(container, mem, vaddr, size)) + goto free_exit; + + mem-vaddr = vaddr; + mem-size = size; + + list_add_rcu(mem
[PATCH v3 19/24] powerpc/powernv: Implement multilevel TCE tables
This adds multi-level TCE tables support to pnv_pci_ioda2_create_table() and pnv_pci_ioda2_free_table() callbacks. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 4 + arch/powerpc/platforms/powernv/pci-ioda.c | 125 +++--- arch/powerpc/platforms/powernv/pci.c | 19 + 3 files changed, 122 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index cc26eca..283f70f 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -85,6 +85,8 @@ struct iommu_pool { struct iommu_table { unsigned long it_busno; /* Bus number this table belongs to */ unsigned long it_size; /* Size of iommu table in entries */ + unsigned long it_indirect_levels; + unsigned long it_level_size; unsigned long it_offset;/* Offset into global table */ unsigned long it_base; /* mapped address of tce table */ unsigned long it_index; /* which iommu table this is */ @@ -133,6 +135,8 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, #define POWERPC_IOMMU_MAX_TABLES 1 +#define POWERPC_IOMMU_DEFAULT_LEVELS 1 + struct powerpc_iommu; struct powerpc_iommu_ops { diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 1f725d4..f542819 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1295,16 +1295,79 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } +static void pnv_free_tce_table(unsigned long addr, unsigned size, + unsigned level) +{ + addr = ~(TCE_PCI_READ | TCE_PCI_WRITE); + + if (level) { + long i; + u64 *tmp = (u64 *) addr; + + for (i = 0; i size; ++i) { + unsigned long hpa = be64_to_cpu(tmp[i]); + + if (!(hpa (TCE_PCI_READ | TCE_PCI_WRITE))) + continue; + + pnv_free_tce_table((unsigned long) __va(hpa), + size, level - 1); + } + } + + free_pages(addr, get_order(size 3)); +} + +static 
__be64 *pnv_alloc_tce_table(int nid, + unsigned shift, unsigned levels, unsigned long *left) +{ + struct page *tce_mem = NULL; + __be64 *addr, *tmp; + unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT; + unsigned long chunk = 1UL shift, i; + + tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); + if (!tce_mem) { + pr_err(Failed to allocate a TCE memory\n); + return NULL; + } + + if (!*left) + return NULL; + + addr = page_address(tce_mem); + memset(addr, 0, chunk); + + --levels; + if (!levels) { + /* This is last level, actual TCEs */ + *left -= min(*left, chunk); + return addr; + } + + for (i = 0; i (chunk 3); ++i) { + /* We allocated required TCEs, mark the rest page fault */ + if (!*left) { + addr[i] = cpu_to_be64(0); + continue; + } + + tmp = pnv_alloc_tce_table(nid, shift, levels, left); + addr[i] = cpu_to_be64(__pa(tmp) | + TCE_PCI_READ | TCE_PCI_WRITE); + } + + return addr; +} + static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe, - __u32 page_shift, __u32 window_shift, + __u32 page_shift, __u32 window_shift, __u32 levels, struct iommu_table *tbl) { int nid = pe-phb-hose-node; - struct page *tce_mem = NULL; void *addr; - unsigned long tce_table_size; - int64_t rc; - unsigned order; + unsigned long tce_table_size, left; + unsigned shift; if ((page_shift != 12) (page_shift != 16) (page_shift != 24)) return -EINVAL; @@ -1312,20 +1375,27 @@ static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe, if ((1ULL window_shift) memory_hotplug_max()) return -EINVAL; + if (!levels || (levels 5)) + return -EINVAL; + tce_table_size = (1ULL (window_shift - page_shift)) * 8; tce_table_size = max(0x1000UL, tce_table_size); /* Allocate TCE table */ - order = get_order(tce_table_size); +#define ROUND_UP(x, n) (((x) + (n) - 1u) ~((n) - 1u)) + shift = ROUND_UP(window_shift - page_shift, levels) / levels; + shift += 3; + shift = max_t(unsigned, shift, IOMMU_PAGE_SHIFT_4K); + pr_info(Creating TCE table %08llx, %d levels, TCE table size = %lx\n, + 1ULL
[PATCH v3 20/24] powerpc/powernv: Change prototypes to receive iommu
This changes few functions to receive a powerpc_iommu pointer rather than PE as they are going to be a part of upcoming powerpc_iommu_ops callback set. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 13 - 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index f542819..29bd7a4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1360,10 +1360,12 @@ static __be64 *pnv_alloc_tce_table(int nid, return addr; } -static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe, +static long pnv_pci_ioda2_create_table(struct powerpc_iommu *iommu, __u32 page_shift, __u32 window_shift, __u32 levels, struct iommu_table *tbl) { + struct pnv_ioda_pe *pe = container_of(iommu, struct pnv_ioda_pe, + iommu); int nid = pe-phb-hose-node; void *addr; unsigned long tce_table_size, left; @@ -1419,9 +1421,11 @@ static void pnv_pci_ioda2_free_table(struct iommu_table *tbl) iommu_reset_table(tbl, ioda2); } -static long pnv_pci_ioda2_set_window(struct pnv_ioda_pe *pe, +static long pnv_pci_ioda2_set_window(struct powerpc_iommu *iommu, struct iommu_table *tbl) { + struct pnv_ioda_pe *pe = container_of(iommu, struct pnv_ioda_pe, + iommu); struct pnv_phb *phb = pe-phb; const __be64 *swinvp; int64_t rc; @@ -1554,12 +1558,11 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* The PE will reserve all possible 32-bits space */ pe-tce32_seg = 0; - end = (1 ilog2(phb-ioda.m32_pci_base)); pe_info(pe, Setting up 32-bit TCE table at 0..%08x\n, end); - rc = pnv_pci_ioda2_create_table(pe, IOMMU_PAGE_SHIFT_4K, + rc = pnv_pci_ioda2_create_table(pe-iommu, IOMMU_PAGE_SHIFT_4K, ilog2(phb-ioda.m32_pci_base), POWERPC_IOMMU_DEFAULT_LEVELS, tbl); if (rc) { @@ -1571,7 +1574,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe-iommu.tables[0].it_iommu = pe-iommu; pe-iommu.ops = 
pnv_pci_ioda2_ops; - rc = pnv_pci_ioda2_set_window(pe, tbl); + rc = pnv_pci_ioda2_set_window(pe-iommu, tbl); if (rc) { pe_err(pe, Failed to configure 32-bit TCE table, err %ld\n, rc); -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 12/24] powerpc/iommu/powernv: Release replaced TCE
At the moment writing new TCE value to the IOMMU table fails with EBUSY if there is a valid entry already. However PAPR specification allows the guest to write new TCE value without clearing it first. Another problem this patch is addressing is the use of pool locks for external IOMMU users such as VFIO. The pool locks are to protect DMA page allocator rather than entries and since the host kernel does not control what pages are in use, there is no point in pool locks and exchange()+put_page(oldtce) is sufficient to avoid possible races. This adds an exchange() callback to iommu_table_ops which does the same thing as set() plus it returns replaced TCE(s) so the caller can release the pages afterwards. This implements exchange() for IODA2 only. This adds a requirement for a platform to have exchange() implemented so from now on IODA2 is the only supported PHB for VFIO-SPAPR. This replaces iommu_tce_build() and iommu_clear_tce() with a single iommu_tce_xchg(). Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 13 +--- arch/powerpc/kernel/iommu.c | 50 +++ arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++ arch/powerpc/platforms/powernv/pci.c | 22 ++ arch/powerpc/platforms/powernv/pci.h | 4 +++ drivers/vfio/vfio_iommu_spapr_tce.c | 36 ++ 6 files changed, 92 insertions(+), 49 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index ba16aa0..bf26d47 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -49,6 +49,12 @@ struct iommu_table_ops { unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs); + int (*exchange)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + unsigned long *old_tces, + enum dma_data_direction direction, + struct dma_attrs *attrs); void (*clear)(struct iommu_table *tbl, long index, long npages); unsigned long (*get)(struct iommu_table *tbl, long index); @@ -225,10 +231,9 @@ extern int 
iommu_tce_clear_param_check(struct iommu_table *tbl, unsigned long npages); extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); -extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction); -extern unsigned long iommu_clear_tce(struct iommu_table *tbl, - unsigned long entry); +extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, + unsigned long hwaddr, unsigned long *oldtce, + enum dma_data_direction direction); extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct powerpc_iommu *iommu); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 9d06425..26feaff 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -974,44 +974,18 @@ int iommu_tce_put_param_check(struct iommu_table *tbl, } EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) -{ - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - - spin_lock((pool-lock)); - - oldtce = tbl-it_ops-get(tbl, entry); - if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) - tbl-it_ops-clear(tbl, entry, 1); - else - oldtce = 0; - - spin_unlock((pool-lock)); - - return oldtce; -} -EXPORT_SYMBOL_GPL(iommu_clear_tce); - /* * hwaddr is a kernel virtual address here (0xc... bazillion), * tce_build converts it to a physical address. 
*/ -int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction) +long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, + unsigned long hwaddr, unsigned long *oldtce, + enum dma_data_direction direction) { - int ret = -EBUSY; - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); + long ret; - spin_lock((pool-lock)); - - oldtce = tbl-it_ops-get(tbl, entry); - /* Add new entry if it is not busy */ - if (!(oldtce (TCE_PCI_WRITE | TCE_PCI_READ))) - ret = tbl-it_ops-set(tbl, entry, 1, hwaddr, direction, NULL); - - spin_unlock((pool-lock)); + ret = tbl-it_ops-exchange(tbl, entry, 1, hwaddr, oldtce, + direction, NULL); /* if (unlikely(ret)) pr_err(iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret
[PATCH v3 18/24] powerpc/iommu: Split iommu_free_table into 2 helpers
The iommu_free_table helper release memory it is using (the TCE table and @it_map) and release the iommu_table struct as well. We might not want the very last step as we store iommu_table in parent structures. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 1 + arch/powerpc/kernel/iommu.c | 57 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index bf26d47..cc26eca 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -122,6 +122,7 @@ static inline void *get_iommu_table_base(struct device *dev) extern struct iommu_table *iommu_table_alloc(int node); /* Frees table for an individual device node */ +extern void iommu_reset_table(struct iommu_table *tbl, const char *node_name); extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); /* Initializes an iommu_table based in values set in the passed-in diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 26feaff..5f87076 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -721,24 +721,46 @@ struct iommu_table *iommu_table_alloc(int node) return iommu-tables[0]; } +void iommu_reset_table(struct iommu_table *tbl, const char *node_name) +{ + if (!tbl) + return; + + if (tbl-it_map) { + unsigned long bitmap_sz; + unsigned int order; + + /* +* In case we have reserved the first bit, we should not emit +* the warning below. 
+*/ + if (tbl-it_offset == 0) + clear_bit(0, tbl-it_map); + + /* verify that table contains no entries */ + if (!bitmap_empty(tbl-it_map, tbl-it_size)) + pr_warn(%s: Unexpected TCEs for %s\n, __func__, + node_name); + + /* calculate bitmap size in bytes */ + bitmap_sz = BITS_TO_LONGS(tbl-it_size) * sizeof(unsigned long); + + /* free bitmap */ + order = get_order(bitmap_sz); + free_pages((unsigned long) tbl-it_map, order); + } + + memset(tbl, 0, sizeof(*tbl)); +} + void iommu_free_table(struct iommu_table *tbl, const char *node_name) { - unsigned long bitmap_sz; - unsigned int order; struct powerpc_iommu *iommu = tbl-it_iommu; - if (!tbl || !tbl-it_map) { - printk(KERN_ERR %s: expected TCE map for %s\n, __func__, - node_name); + if (!tbl) return; - } - /* -* In case we have reserved the first bit, we should not emit -* the warning below. -*/ - if (tbl-it_offset == 0) - clear_bit(0, tbl-it_map); + iommu_reset_table(tbl, node_name); #ifdef CONFIG_IOMMU_API if (iommu-group) { @@ -747,17 +769,6 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) } #endif - /* verify that table contains no entries */ - if (!bitmap_empty(tbl-it_map, tbl-it_size)) - pr_warn(%s: Unexpected TCEs for %s\n, __func__, node_name); - - /* calculate bitmap size in bytes */ - bitmap_sz = BITS_TO_LONGS(tbl-it_size) * sizeof(unsigned long); - - /* free bitmap */ - order = get_order(bitmap_sz); - free_pages((unsigned long) tbl-it_map, order); - /* free table */ kfree(iommu); } -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 10/24] powerpc/powernv/ioda2: Rework IOMMU ownership control
At the moment the iommu_table struct has a set_bypass() which enables/ disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code which calls this callback when external IOMMU users such as VFIO are about to get over a PHB. The set_bypass() callback is not really an iommu_table function but IOMMU/PE function. This introduces a powerpc_iommu_ops struct and adds a set_ownership() callback to it which is called when an external user takes control over the IOMMU. This renames set_bypass() to set_ownership() as it is not necessarily just enabling bypassing, it can be something else/more so let's give it more generic name. The bool parameter is inverted. The callback is implemented for IODA2 only. This replaces iommu_take_ownership()/iommu_release_ownership() calls with the callback calls and it is up to the platform code to call iommu_take_ownership()/iommu_release_ownership() if needed. Next patches will remove these calls from IODA2 code. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 18 +-- arch/powerpc/kernel/iommu.c | 53 +++ arch/powerpc/platforms/powernv/pci-ioda.c | 30 - drivers/vfio/vfio_iommu_spapr_tce.c | 19 --- 4 files changed, 90 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 4fe..ba16aa0 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -92,7 +92,6 @@ struct iommu_table { unsigned long it_page_shift;/* table iommu page size */ struct powerpc_iommu *it_iommu; struct iommu_table_ops *it_ops; - void (*set_bypass)(struct iommu_table *tbl, bool enable); }; /* Pure 2^n version of get_order */ @@ -127,11 +126,24 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, #define POWERPC_IOMMU_MAX_TABLES 1 +struct powerpc_iommu; + +struct powerpc_iommu_ops { + /* +* Switches ownership from the kernel itself to an external +* user. 
While ownership is enabled, the kernel cannot use IOMMU +* for itself. +*/ + void (*set_ownership)(struct powerpc_iommu *iommu, + bool enable); +}; + struct powerpc_iommu { #ifdef CONFIG_IOMMU_API struct iommu_group *group; #endif struct iommu_table tables[POWERPC_IOMMU_MAX_TABLES]; + struct powerpc_iommu_ops *ops; }; #ifdef CONFIG_IOMMU_API @@ -219,8 +231,8 @@ extern unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry); extern void iommu_flush_tce(struct iommu_table *tbl); -extern int iommu_take_ownership(struct iommu_table *tbl); -extern void iommu_release_ownership(struct iommu_table *tbl); +extern int iommu_take_ownership(struct powerpc_iommu *iommu); +extern void iommu_release_ownership(struct powerpc_iommu *iommu); #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 407d0d6..9d06425 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1022,7 +1022,7 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, } EXPORT_SYMBOL_GPL(iommu_tce_build); -int iommu_take_ownership(struct iommu_table *tbl) +static int iommu_table_take_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl-it_size + 7) 3; int ret = 0; @@ -1047,19 +1047,36 @@ int iommu_take_ownership(struct iommu_table *tbl) spin_unlock(tbl-pools[i].lock); spin_unlock_irqrestore(tbl-large_pool.lock, flags); - /* -* Disable iommu bypass, otherwise the user can DMA to all of -* our physical memory via the bypass window instead of just -* the pages that has been explicitly mapped into the iommu -*/ - if (!ret tbl-set_bypass) - tbl-set_bypass(tbl, false); - - return ret; + return 0; +} + +static void iommu_table_release_ownership(struct iommu_table *tbl); + +int iommu_take_ownership(struct powerpc_iommu *iommu) +{ + int i, j, rc = 0; + + for (i = 0; i POWERPC_IOMMU_MAX_TABLES; ++i) { + struct iommu_table *tbl = iommu-tables[i]; + + if (!tbl-it_map) + continue; + 
+ rc = iommu_table_take_ownership(tbl); + if (rc) { + for (j = 0; j i; ++j) + iommu_table_release_ownership( + iommu-tables[j]); + + return rc; + } + } + + return 0; } EXPORT_SYMBOL_GPL(iommu_take_ownership); -void iommu_release_ownership(struct iommu_table *tbl) +static void iommu_table_release_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl-it_size + 7) 3; @@ -1076,10 +1093,18 @@ void
[PATCH v3 24/24] vfio: powerpc/spapr: Support Dynamic DMA windows
This adds create/remove window ioctls to create and remove DMA windows. This changes VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional information such as a number of supported windows and maximum number levels of TCE tables. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h| 2 +- drivers/vfio/vfio_iommu_spapr_tce.c | 137 +++- include/uapi/linux/vfio.h | 24 ++- 3 files changed, 160 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 33009f9..7ca1c8c 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -133,7 +133,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); -#define POWERPC_IOMMU_MAX_TABLES 1 +#define POWERPC_IOMMU_MAX_TABLES 2 #define POWERPC_IOMMU_DEFAULT_LEVELS 1 diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 8bcafb7..d3a1cc9 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -300,6 +300,20 @@ static struct iommu_table *spapr_tce_find_table( return ret; } +static int spapr_tce_find_free_table(struct tce_container *container) +{ + int i; + + for (i = 0; i POWERPC_IOMMU_MAX_TABLES; ++i) { + struct iommu_table *tbl = container-tables[i]; + + if (!tbl-it_size) + return i; + } + + return -1; +} + static unsigned long tce_default_winsize(struct tce_container *container) { struct tce_iommu_group *tcegrp; @@ -594,7 +608,7 @@ static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { struct tce_container *container = iommu_data; - unsigned long minsz; + unsigned long minsz, ddwsz; long ret; switch (cmd) { @@ -636,6 +650,15 @@ static long tce_iommu_ioctl(void *iommu_data, info.dma32_window_start = iommu-tce32_start; info.dma32_window_size = iommu-tce32_size; + info.windows_supported = 
iommu-windows_supported; + info.levels = iommu-levels; + info.flags = iommu-flags; + + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, + levels); + + if (info.argsz == ddwsz) + minsz = ddwsz; if (copy_to_user((void __user *)arg, info, minsz)) return -EFAULT; @@ -800,6 +823,118 @@ static long tce_iommu_ioctl(void *iommu_data, return ret; } + case VFIO_IOMMU_SPAPR_TCE_CREATE: { + struct vfio_iommu_spapr_tce_create create; + struct powerpc_iommu *iommu; + struct tce_iommu_group *tcegrp; + int num; + + if (!tce_preregistered(container)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, + start_addr); + + if (copy_from_user(create, (void __user *)arg, minsz)) + return -EFAULT; + + if (create.argsz minsz) + return -EINVAL; + + if (create.flags) + return -EINVAL; + + num = spapr_tce_find_free_table(container); + if (num 0) + return -ENOSYS; + + tcegrp = list_first_entry(container-group_list, + struct tce_iommu_group, next); + iommu = iommu_group_get_iommudata(tcegrp-grp); + + ret = iommu-ops-create_table(iommu, num, + create.page_shift, create.window_shift, + create.levels, + container-tables[num]); + if (ret) + return ret; + + list_for_each_entry(tcegrp, container-group_list, next) { + struct powerpc_iommu *iommutmp = + iommu_group_get_iommudata(tcegrp-grp); + + if (WARN_ON_ONCE(iommutmp-ops != iommu-ops)) + return -EFAULT; + + ret = iommu-ops-set_window(iommutmp, num, + container-tables[num]); + if (ret) + return ret; + } + + create.start_addr = + container-tables[num].it_offset + container-tables[num].it_page_shift; + + if (copy_to_user((void __user *)arg, create, minsz
[PATCH v3 23/24] vfio/spapr: Enable multiple groups in a container
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 243 +++- 1 file changed, 155 insertions(+), 88 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index d0987ae..8bcafb7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -84,9 +84,15 @@ static void decrement_locked_vm(long npages) */ struct tce_container { struct mutex lock; - struct iommu_group *grp; bool enabled; struct list_head mem_list; + struct iommu_table tables[POWERPC_IOMMU_MAX_TABLES]; + struct list_head group_list; +}; + +struct tce_iommu_group { + struct list_head next; + struct iommu_group *grp; }; struct tce_memory { @@ -265,17 +271,21 @@ static bool tce_check_page_size(struct page *page, unsigned page_shift) return false; } +static inline bool tce_groups_attached(struct tce_container *container) +{ + return !list_empty(container-group_list); +} + static struct iommu_table *spapr_tce_find_table( struct tce_container *container, phys_addr_t ioba) { long i; struct iommu_table *ret = NULL; - struct powerpc_iommu *iommu = iommu_group_get_iommudata(container-grp); mutex_lock(container-lock); for (i = 0; i POWERPC_IOMMU_MAX_TABLES; ++i) { - struct iommu_table *tbl = iommu-tables[i]; + struct iommu_table *tbl = container-tables[i]; unsigned long entry = ioba tbl-it_page_shift; unsigned long start = tbl-it_offset; unsigned long end = start + tbl-it_size; @@ -290,13 +300,31 @@ static struct iommu_table *spapr_tce_find_table( return ret; } +static unsigned long tce_default_winsize(struct tce_container *container) +{ + struct tce_iommu_group *tcegrp; + struct powerpc_iommu *iommu; + + if (!tce_groups_attached(container)) + return 0; + + tcegrp = list_first_entry(container-group_list, + struct tce_iommu_group, next); + if (!tcegrp) + return 0; + + iommu = iommu_group_get_iommudata(tcegrp-grp); + if (!iommu) + return 0; + + return iommu-tce32_size; +} + static int 
tce_iommu_enable(struct tce_container *container) { int ret = 0; - struct powerpc_iommu *iommu; - struct iommu_table *tbl; - if (!container-grp) + if (!tce_groups_attached(container)) return -ENXIO; if (container-enabled) @@ -328,12 +356,8 @@ static int tce_iommu_enable(struct tce_container *container) * KVM agnostic. */ if (!tce_preregistered(container)) { - iommu = iommu_group_get_iommudata(container-grp); - if (!iommu) - return -EFAULT; - - tbl = iommu-tables[0]; - ret = try_increment_locked_vm(IOMMU_TABLE_PAGES(tbl)); + ret = try_increment_locked_vm( + tce_default_winsize(container) PAGE_SHIFT); if (ret) return ret; } @@ -343,27 +367,23 @@ static int tce_iommu_enable(struct tce_container *container) return ret; } +static int tce_iommu_clear(struct tce_container *container, + struct iommu_table *tbl, + unsigned long entry, unsigned long pages); + static void tce_iommu_disable(struct tce_container *container) { - struct powerpc_iommu *iommu; - struct iommu_table *tbl; - if (!container-enabled) return; container-enabled = false; - if (!container-grp || !current-mm) + if (!current-mm) return; - if (!tce_preregistered(container)) { - iommu = iommu_group_get_iommudata(container-grp); - if (!iommu) - return; - - tbl = iommu-tables[0]; - decrement_locked_vm(IOMMU_TABLE_PAGES(tbl)); - } + if (!tce_preregistered(container)) + decrement_locked_vm( + tce_default_winsize(container) PAGE_SHIFT); } static void *tce_iommu_open(unsigned long arg) @@ -381,20 +401,44 @@ static void *tce_iommu_open(unsigned long arg) mutex_init(container-lock); INIT_LIST_HEAD_RCU(container-mem_list); + INIT_LIST_HEAD_RCU(container-group_list); return container; } static void tce_iommu_release(void *iommu_data) { + int i; + struct powerpc_iommu *iommu; + struct tce_iommu_group *tcegrp; struct tce_container *container = iommu_data; struct tce_memory *mem, *memtmp; + struct powerpc_iommu_ops *iommuops = NULL; - WARN_ON(container-grp); tce_iommu_disable(container); - if (container-grp
[PATCH v3 13/24] powerpc/pseries/lpar: Enable VFIO
The previous patch introduced iommu_table_ops::exchange() callback which effectively disabled VFIO on pseries. This implements exchange() for pseries/lpar so VFIO can work in nested guests. Since exchange() callback returns an old TCE, it has to call H_GET_TCE for every TCE being put to the table so VFIO performance in guests running under PR KVM is expected to be slower than in guests running under HV KVM or bare metal hosts. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v5: * added global lock for xchg operations * added missing be64_to_cpu(oldtce) --- arch/powerpc/platforms/pseries/iommu.c | 44 -- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index f537e6e..a903a27 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -137,14 +137,25 @@ static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, + unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { u64 rc = 0; u64 proto_tce, tce; u64 rpn; - int ret = 0; + int ret = 0, i = 0; long tcenum_start = tcenum, npages_start = npages; + static spinlock_t get_tces_lock; + static bool get_tces_lock_initialized; + + if (old_tces) { + if (!get_tces_lock_initialized) { + spin_lock_init(get_tces_lock); + get_tces_lock_initialized = true; + } + spin_lock(get_tces_lock); + } rpn = __pa(uaddr) TCE_SHIFT; proto_tce = TCE_PCI_READ; @@ -153,6 +164,14 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, while (npages--) { tce = proto_tce | (rpn TCE_RPN_MASK) TCE_RPN_SHIFT; + if (old_tces) { + unsigned long oldtce = 0; + + plpar_tce_get((u64)tbl-it_index, (u64)tcenum 12, + oldtce); + old_tces[i] = be64_to_cpu(oldtce); + i++; + } rc = plpar_tce_put((u64)tbl-it_index, (u64)tcenum 12, tce); if (unlikely(rc == 
H_NOT_ENOUGH_RESOURCES)) { @@ -173,13 +192,18 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, tcenum++; rpn++; } + + if (old_tces) + spin_unlock(get_tces_lock); + return ret; } static DEFINE_PER_CPU(__be64 *, tce_page); -static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +static int tce_xchg_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, +unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { @@ -194,6 +218,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + old_tces, direction, attrs); } @@ -210,6 +235,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if (!tcep) { local_irq_restore(flags); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + old_tces, direction, attrs); } __this_cpu_write(tce_page, tcep); @@ -231,6 +257,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, for (l = 0; l limit; l++) { tcep[l] = cpu_to_be64(proto_tce | (rpn TCE_RPN_MASK) TCE_RPN_SHIFT); rpn++; + if (old_tces) + plpar_tce_get((u64)tbl-it_index, + (u64)(tcenum + l) 12, + old_tces[tcenum + l]); } rc = plpar_tce_put_indirect((u64)tbl-it_index, @@ -261,6 +291,15 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } +static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +long npages, unsigned long uaddr, +enum
[PATCH v3 21/24] powerpc/powernv/ioda: Define and implement DMA table/window management callbacks
This extends powerpc_iommu_ops by a set of callbacks to support dynamic DMA windows management. query() returns IOMMU capabilities such as default DMA window address and supported number of DMA windows and TCE table levels. create_table() creates a TCE table with specific parameters. For now it receives powerpc_iommu to know nodeid in order to allocate TCE table memory closer to the PHB. The exact format of allocated multi-level table might be also specific to the PHB model (not the case now though). set_window() sets the window at specified TVT index on PHB. unset_window() unsets the window from specified TVT. free_table() frees the memory occupied by a table. The purpose of this separation is that we need to be able to create one table and assign it to a set of PHB. This way we can support multiple IOMMU groups in one VFIO container and make use of VFIO on SPAPR closer to the way it works on x86. This uses new helpers to remove the default TCE table if the ownership is being taken and create it otherwise. So once an external user (such as VFIO) obtained the ownership over a group, it does not have any DMA windows, neither default 32bit nor bypass window. The external user is expected to unprogram DMA windows on PHBs before returning ownership back to the kernel. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 31 ++ arch/powerpc/platforms/powernv/pci-ioda.c | 98 ++- 2 files changed, 113 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 283f70f..8393822 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -147,12 +147,43 @@ struct powerpc_iommu_ops { */ void (*set_ownership)(struct powerpc_iommu *iommu, bool enable); + + long (*create_table)(struct powerpc_iommu *iommu, + int num, + __u32 page_shift, + __u32 window_shift, + __u32 levels, + struct iommu_table *tbl); + long (*set_window)(struct powerpc_iommu *iommu, + int num, + struct iommu_table *tblnew); + long (*unset_window)(struct powerpc_iommu *iommu, + int num); + void (*free_table)(struct iommu_table *tbl); }; +/* Page size flags for ibm,query-pe-dma-window */ +#define DDW_PGSIZE_4K 0x01 +#define DDW_PGSIZE_64K 0x02 +#define DDW_PGSIZE_16M 0x04 +#define DDW_PGSIZE_32M 0x08 +#define DDW_PGSIZE_64M 0x10 +#define DDW_PGSIZE_128M 0x20 +#define DDW_PGSIZE_256M 0x40 +#define DDW_PGSIZE_16G 0x80 +#define DDW_PGSIZE_MASK 0xFF + struct powerpc_iommu { #ifdef CONFIG_IOMMU_API struct iommu_group *group; #endif + /* Some key properties of IOMMU */ + __u32 tce32_start; + __u32 tce32_size; + __u32 windows_supported; + __u32 levels; + __u32 flags; + struct iommu_table tables[POWERPC_IOMMU_MAX_TABLES]; struct powerpc_iommu_ops *ops; }; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 29bd7a4..cf63ebb 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1360,7 +1360,7 @@ static __be64 *pnv_alloc_tce_table(int nid, return addr; } -static long pnv_pci_ioda2_create_table(struct powerpc_iommu *iommu, +static long pnv_pci_ioda2_create_table(struct powerpc_iommu *iommu, int num, __u32 page_shift, __u32 window_shift, __u32 levels, struct 
iommu_table *tbl) { @@ -1388,8 +1388,8 @@ static long pnv_pci_ioda2_create_table(struct powerpc_iommu *iommu, shift = ROUND_UP(window_shift - page_shift, levels) / levels; shift += 3; shift = max_t(unsigned, shift, IOMMU_PAGE_SHIFT_4K); - pr_info(Creating TCE table %08llx, %d levels, TCE table size = %lx\n, - 1ULL window_shift, levels, 1UL shift); + pr_info(Creating TCE table #%d %08llx, %d levels, TCE table size = %lx\n, + num, 1ULL window_shift, levels, 1UL shift); tbl-it_level_size = 1ULL (shift - 3); left = tce_table_size; @@ -1400,11 +1400,10 @@ static long pnv_pci_ioda2_create_table(struct powerpc_iommu *iommu, tbl-it_indirect_levels = levels - 1; /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, - page_shift); + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, + num ? pe-tce_bypass_base : 0, page_shift); tbl-it_ops = pnv_ioda2_iommu_ops; - iommu_init_table(tbl, nid); return 0; } @@ -1421,8 +1420,18 @@ static void pnv_pci_ioda2_free_table(struct iommu_table
[PATCH v3 22/24] powerpc/iommu: Get rid of ownership helpers
iommu_take_ownership/iommu_release_ownership used to be used to mark bits in iommu_table::it_map. Since the IOMMU tables are recreated for VFIO, it_map is always NULL. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 2 - arch/powerpc/kernel/iommu.c | 96 2 files changed, 98 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 8393822..33009f9 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -272,8 +272,6 @@ extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, enum dma_data_direction direction); extern void iommu_flush_tce(struct iommu_table *tbl); -extern int iommu_take_ownership(struct powerpc_iommu *iommu); -extern void iommu_release_ownership(struct powerpc_iommu *iommu); #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 5f87076..6987115 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1007,102 +1007,6 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, } EXPORT_SYMBOL_GPL(iommu_tce_xchg); -static int iommu_table_take_ownership(struct iommu_table *tbl) -{ - unsigned long flags, i, sz = (tbl-it_size + 7) 3; - int ret = 0; - - /* -* VFIO does not control TCE entries allocation and the guest -* can write new TCEs on top of existing ones so iommu_tce_build() -* must be able to release old pages. This functionality -* requires exchange() callback defined so if it is not -* implemented, we disallow taking ownership over the table. 
-*/ - if (!tbl-it_ops-exchange) - return -EINVAL; - - spin_lock_irqsave(tbl-large_pool.lock, flags); - for (i = 0; i tbl-nr_pools; i++) - spin_lock(tbl-pools[i].lock); - - if (tbl-it_offset == 0) - clear_bit(0, tbl-it_map); - - if (!bitmap_empty(tbl-it_map, tbl-it_size)) { - pr_err(iommu_tce: it_map is not empty); - ret = -EBUSY; - if (tbl-it_offset == 0) - set_bit(0, tbl-it_map); - } else { - memset(tbl-it_map, 0xff, sz); - } - - for (i = 0; i tbl-nr_pools; i++) - spin_unlock(tbl-pools[i].lock); - spin_unlock_irqrestore(tbl-large_pool.lock, flags); - - return 0; -} - -static void iommu_table_release_ownership(struct iommu_table *tbl); - -int iommu_take_ownership(struct powerpc_iommu *iommu) -{ - int i, j, rc = 0; - - for (i = 0; i POWERPC_IOMMU_MAX_TABLES; ++i) { - struct iommu_table *tbl = iommu-tables[i]; - - if (!tbl-it_map) - continue; - - rc = iommu_table_take_ownership(tbl); - if (rc) { - for (j = 0; j i; ++j) - iommu_table_release_ownership( - iommu-tables[j]); - - return rc; - } - } - - return 0; -} -EXPORT_SYMBOL_GPL(iommu_take_ownership); - -static void iommu_table_release_ownership(struct iommu_table *tbl) -{ - unsigned long flags, i, sz = (tbl-it_size + 7) 3; - - spin_lock_irqsave(tbl-large_pool.lock, flags); - for (i = 0; i tbl-nr_pools; i++) - spin_lock(tbl-pools[i].lock); - - memset(tbl-it_map, 0, sz); - - /* Restore bit#0 set by iommu_init_table() */ - if (tbl-it_offset == 0) - set_bit(0, tbl-it_map); - - for (i = 0; i tbl-nr_pools; i++) - spin_unlock(tbl-pools[i].lock); - spin_unlock_irqrestore(tbl-large_pool.lock, flags); -} - -extern void iommu_release_ownership(struct powerpc_iommu *iommu) -{ - int i; - - for (i = 0; i POWERPC_IOMMU_MAX_TABLES; ++i) { - struct iommu_table *tbl = iommu-tables[i]; - - if (tbl-it_map) - iommu_table_release_ownership(tbl); - } -} -EXPORT_SYMBOL_GPL(iommu_release_ownership); - int iommu_add_device(struct device *dev) { struct iommu_table *tbl; -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe 
linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 00/24] powerpc/iommu/vfio: Enable Dynamic DMA windows
This enables PAPR defined feature called Dynamic DMA windows (DDW). Each Partitionable Endpoint (IOMMU group) has a separate DMA window on a PCI bus where devices are allowed to perform DMA. By default there is 1 or 2GB window allocated at the host boot time and these windows are used when an IOMMU group is passed to the userspace (guest). These windows are mapped at zero offset on a PCI bus. Hi-speed devices may suffer from limited size of this window. On the host side a TCE bypass mode is enabled on POWER8 CPU which implements direct mapping of the host memory to a PCI bus at 1<<59. For the guest, PAPR defines a DDW RTAS API which allows the pseries guest to query the hypervisor if it supports DDW and what are the parameters of possible windows. Currently POWER8 supports 2 DMA windows per PE - already mentioned and used small 32bit window and 64bit window which can only start from 1<<59 and can support various page sizes. This patchset reworks PPC IOMMU code and adds necessary structures to extend it to support big windows. When the guest detects the feature and the PE is capable of 64bit DMA, it does: 1. query to hypervisor about number of available windows and page masks; 2. creates a window with the biggest possible page size (current guests can do 64K or 16MB TCEs); 3. maps the entire guest RAM via H_PUT_TCE* hypercalls 4. switches dma_ops to direct_dma_ops on the selected PE. Once this is done, H_PUT_TCE is not called anymore and the guest gets maximum performance. Changes: v3: * (!) 
redesigned the whole thing * multiple IOMMU groups per PHB - one PHB is needed for VFIO in the guest - no problems with locked_vm counting; also we save memory on actual tables * guest RAM preregistration is required for DDW * PEs (IOMMU groups) are passed to VFIO with no DMA windows at all so we do not bother with iommu_table::it_map anymore * added multilevel TCE tables support to support really huge guests v2: * added missing __pa() in powerpc/powernv: Release replaced TCE * reposted to make some noise Alexey Kardashevskiy (24): vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver vfio: powerpc/iommu: Check that TCE page size is equal to it_page_size powerpc/powernv: Do not set read flag if direction==DMA_NONE vfio: powerpc/spapr: Use it_page_size vfio: powerpc/spapr: Move locked_vm accounting to helpers powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table powerpc/iommu: Introduce iommu_table_alloc() helper powerpc/spapr: vfio: Switch from iommu_table to new powerpc_iommu powerpc/iommu: Fix IOMMU ownership control functions powerpc/powernv/ioda2: Rework IOMMU ownership control powerpc/powernv/ioda/ioda2: Rework tce_build()/tce_free() powerpc/iommu/powernv: Release replaced TCE powerpc/pseries/lpar: Enable VFIO vfio: powerpc/spapr: Register memory poweppc/powernv/ioda2: Rework iommu_table creation powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_create_table powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window powerpc/iommu: Split iommu_free_table into 2 helpers powerpc/powernv: Implement multilevel TCE tables powerpc/powernv: Change prototypes to receive iommu powerpc/powernv/ioda: Define and implement DMA table/window management callbacks powerpc/iommu: Get rid of ownership helpers vfio/spapr: Enable multiple groups in a container vfio: powerpc/spapr: Support Dynamic DMA windows arch/powerpc/include/asm/iommu.h| 107 +++- arch/powerpc/include/asm/machdep.h | 25 - arch/powerpc/kernel/eeh.c | 2 +- arch/powerpc/kernel/iommu.c | 
282 +++-- arch/powerpc/kernel/vio.c | 5 + arch/powerpc/platforms/cell/iommu.c | 8 +- arch/powerpc/platforms/pasemi/iommu.c | 7 +- arch/powerpc/platforms/powernv/pci-ioda.c | 470 --- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 21 +- arch/powerpc/platforms/powernv/pci.c| 130 +++-- arch/powerpc/platforms/powernv/pci.h| 14 +- arch/powerpc/platforms/pseries/iommu.c | 99 +++- arch/powerpc/sysdev/dart_iommu.c| 12 +- drivers/vfio/vfio_iommu_spapr_tce.c | 874 include/uapi/linux/vfio.h | 53 +- 15 files changed, 1584 insertions(+), 525 deletions(-) -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 13/13] vfio: powerpc/spapr: Enable Dynamic DMA windows
On 09/23/2014 11:56 PM, Alex Williamson wrote: > On Tue, 2014-09-23 at 13:01 +1000, Alexey Kardashevskiy wrote: >> This defines and implements VFIO IOMMU API which lets the userspace >> create and remove DMA windows. >> >> This updates VFIO_IOMMU_SPAPR_TCE_GET_INFO to return the number of >> available windows and page mask. >> >> This adds VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE >> to allow the user space to create and remove window(s). >> >> The VFIO IOMMU driver does basic sanity checks and calls corresponding >> SPAPR TCE functions. At the moment only IODA2 (POWER8 PCI host bridge) >> implements them. >> >> This advertises VFIO_IOMMU_SPAPR_TCE_FLAG_DDW capability via >> VFIO_IOMMU_SPAPR_TCE_GET_INFO. >> >> This calls platform DDW reset() callback when IOMMU is being disabled >> to reset the DMA configuration to its original state. >> >> Signed-off-by: Alexey Kardashevskiy >> --- >> drivers/vfio/vfio_iommu_spapr_tce.c | 135 >> ++-- >> include/uapi/linux/vfio.h | 25 ++- >> 2 files changed, 153 insertions(+), 7 deletions(-) >> >> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c >> b/drivers/vfio/vfio_iommu_spapr_tce.c >> index 0dccbc4..b518891 100644 >> --- a/drivers/vfio/vfio_iommu_spapr_tce.c >> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c >> @@ -190,18 +190,25 @@ static void tce_iommu_disable(struct tce_container >> *container) >> >> container->enabled = false; >> >> -if (!container->grp || !current->mm) >> +if (!container->grp) >> return; >> >> data = iommu_group_get_iommudata(container->grp); >> if (!data || !data->iommu_owner || !data->ops->get_table) >> return; >> >> -tbl = data->ops->get_table(data, 0); >> -if (!tbl) >> -return; >> +if (current->mm) { >> +tbl = data->ops->get_table(data, 0); >> +if (tbl) >> +decrement_locked_vm(tbl); >> >> -decrement_locked_vm(tbl); >> +tbl = data->ops->get_table(data, 1); >> +if (tbl) >> +decrement_locked_vm(tbl); >> +} >> + >> +if (data->ops->reset) >> +data->ops->reset(data); >> } >> >> static void 
*tce_iommu_open(unsigned long arg) >> @@ -243,7 +250,7 @@ static long tce_iommu_ioctl(void *iommu_data, >> unsigned int cmd, unsigned long arg) >> { >> struct tce_container *container = iommu_data; >> -unsigned long minsz; >> +unsigned long minsz, ddwsz; >> long ret; >> >> switch (cmd) { >> @@ -288,6 +295,28 @@ static long tce_iommu_ioctl(void *iommu_data, >> info.dma32_window_size = tbl->it_size << tbl->it_page_shift; >> info.flags = 0; >> >> +ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, >> +page_size_mask); >> + >> +if (info.argsz == ddwsz) { > >> = > >> +if (data->ops->query && data->ops->create && >> +data->ops->remove) { >> +info.flags |= VFIO_IOMMU_SPAPR_TCE_FLAG_DDW; > > I think you want to set this flag regardless of whether the user has > provided space for it. A valid use model is to call with the minimum > size and look at the flags to determine if it needs to be called again > with a larger size. > >> + >> +ret = data->ops->query(data, >> +_windows, >> +_available, >> +_size_mask); >> +if (ret) >> +return ret; >> +} else { >> +info.current_windows = 0; >> +info.windows_available = 0; >> +info.page_size_mask = 0; >> +} >> +minsz = ddwsz; > > It's not really any longer the min size, is it? > >> +} >> + >> if (copy_to_user((void __user *)arg, , minsz)) >> return -EFAULT; >> >> @@ -412,12 +441,106 @
Re: [PATCH v2 13/13] vfio: powerpc/spapr: Enable Dynamic DMA windows
On 09/23/2014 11:56 PM, Alex Williamson wrote: On Tue, 2014-09-23 at 13:01 +1000, Alexey Kardashevskiy wrote: This defines and implements VFIO IOMMU API which lets the userspace create and remove DMA windows. This updates VFIO_IOMMU_SPAPR_TCE_GET_INFO to return the number of available windows and page mask. This adds VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE to allow the user space to create and remove window(s). The VFIO IOMMU driver does basic sanity checks and calls corresponding SPAPR TCE functions. At the moment only IODA2 (POWER8 PCI host bridge) implements them. This advertises VFIO_IOMMU_SPAPR_TCE_FLAG_DDW capability via VFIO_IOMMU_SPAPR_TCE_GET_INFO. This calls platform DDW reset() callback when IOMMU is being disabled to reset the DMA configuration to its original state. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 135 ++-- include/uapi/linux/vfio.h | 25 ++- 2 files changed, 153 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 0dccbc4..b518891 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -190,18 +190,25 @@ static void tce_iommu_disable(struct tce_container *container) container-enabled = false; -if (!container-grp || !current-mm) +if (!container-grp) return; data = iommu_group_get_iommudata(container-grp); if (!data || !data-iommu_owner || !data-ops-get_table) return; -tbl = data-ops-get_table(data, 0); -if (!tbl) -return; +if (current-mm) { +tbl = data-ops-get_table(data, 0); +if (tbl) +decrement_locked_vm(tbl); -decrement_locked_vm(tbl); +tbl = data-ops-get_table(data, 1); +if (tbl) +decrement_locked_vm(tbl); +} + +if (data-ops-reset) +data-ops-reset(data); } static void *tce_iommu_open(unsigned long arg) @@ -243,7 +250,7 @@ static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { struct tce_container *container = iommu_data; -unsigned long 
minsz; +unsigned long minsz, ddwsz; long ret; switch (cmd) { @@ -288,6 +295,28 @@ static long tce_iommu_ioctl(void *iommu_data, info.dma32_window_size = tbl-it_size tbl-it_page_shift; info.flags = 0; +ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, +page_size_mask); + +if (info.argsz == ddwsz) { = +if (data-ops-query data-ops-create +data-ops-remove) { +info.flags |= VFIO_IOMMU_SPAPR_TCE_FLAG_DDW; I think you want to set this flag regardless of whether the user has provided space for it. A valid use model is to call with the minimum size and look at the flags to determine if it needs to be called again with a larger size. + +ret = data-ops-query(data, +info.current_windows, +info.windows_available, +info.page_size_mask); +if (ret) +return ret; +} else { +info.current_windows = 0; +info.windows_available = 0; +info.page_size_mask = 0; +} +minsz = ddwsz; It's not really any longer the min size, is it? +} + if (copy_to_user((void __user *)arg, info, minsz)) return -EFAULT; @@ -412,12 +441,106 @@ static long tce_iommu_ioctl(void *iommu_data, tce_iommu_disable(container); mutex_unlock(container-lock); return 0; + case VFIO_EEH_PE_OP: if (!container-grp) return -ENODEV; return vfio_spapr_iommu_eeh_ioctl(container-grp, cmd, arg); + +case VFIO_IOMMU_SPAPR_TCE_CREATE: { +struct vfio_iommu_spapr_tce_create create; +struct spapr_tce_iommu_group *data; +struct iommu_table *tbl; + +if (WARN_ON(!container-grp)) redux previous comment on this warning +return -ENXIO; + +data = iommu_group_get_iommudata(container-grp); + +minsz = offsetofend(struct vfio_iommu_spapr_tce_create, +start_addr); + +if (copy_from_user(create, (void __user *)arg, minsz
[PATCH v3] powerpc/iommu/ddw: Fix endianness
rtas_call() accepts and returns values in CPU endianness. The ddw_query_response and ddw_create_response structs members are defined and treated as BE but as they are passed to rtas_call() as (u32 *) and they get byteswapped automatically, the data is CPU-endian. This fixes ddw_query_response and ddw_create_response definitions and use. of_read_number() is designed to work with device tree cells - it assumes the input is big-endian and returns data in CPU-endian. However due to the ddw_create_response struct fix, create.addr_hi/lo are already CPU-endian so do not byteswap them. ddw_avail is a pointer to the "ibm,ddw-applicable" property which contains 3 cells which are big-endian as it is a device tree. rtas_call() accepts a RTAS token in CPU-endian. This makes use of of_property_read_u32_array to byte swap and avoid the need for a number of be32_to_cpu calls. Cc: sta...@vger.kernel.org # v3.13 Cc: Benjamin Herrenschmidt Reviewed-by: Anton Blanchard [aik: folded Anton's patch with of_property_read_u32_array] Signed-off-by: Alexey Kardashevskiy --- Changes: v3: * of_property_read_u32_array() is used for ddw_avail[] v2: * updated commit log * fixed definition of ddw_query_response and ddw_create_response --- arch/powerpc/platforms/pseries/iommu.c | 51 +++--- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 4642d6a..de1ec54 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -329,16 +329,16 @@ struct direct_window { /* Dynamic DMA Window support */ struct ddw_query_response { - __be32 windows_available; - __be32 largest_available_block; - __be32 page_size; - __be32 migration_capable; + u32 windows_available; + u32 largest_available_block; + u32 page_size; + u32 migration_capable; }; struct ddw_create_response { - __be32 liobn; - __be32 addr_hi; - __be32 addr_lo; + u32 liobn; + u32 addr_hi; + u32 addr_lo; }; static 
LIST_HEAD(direct_window_list); @@ -725,16 +725,18 @@ static void remove_ddw(struct device_node *np, bool remove_prop) { struct dynamic_dma_window_prop *dwp; struct property *win64; - const u32 *ddw_avail; + u32 ddw_avail[3]; u64 liobn; - int len, ret = 0; + int ret = 0; + + ret = of_property_read_u32_array(np, "ibm,ddw-applicable", +_avail[0], 3); - ddw_avail = of_get_property(np, "ibm,ddw-applicable", ); win64 = of_find_property(np, DIRECT64_PROPNAME, NULL); if (!win64) return; - if (!ddw_avail || len < 3 * sizeof(u32) || win64->length < sizeof(*dwp)) + if (ret || win64->length < sizeof(*dwp)) goto delprop; dwp = win64->value; @@ -872,8 +874,9 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ - ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr, - BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, + cfg_addr, BUID_HI(buid), BUID_LO(buid), + page_shift, window_shift); } while (rtas_busy_delay(ret)); dev_info(>dev, "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " @@ -910,7 +913,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) int page_shift; u64 dma_addr, max_addr; struct device_node *dn; - const u32 *uninitialized_var(ddw_avail); + u32 ddw_avail[3]; struct direct_window *window; struct property *win64; struct dynamic_dma_window_prop *ddwprop; @@ -942,8 +945,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) * for the given node in that order. 
* the property is actually in the parent, not the PE */ - ddw_avail = of_get_property(pdn, "ibm,ddw-applicable", ); - if (!ddw_avail || len < 3 * sizeof(u32)) + ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable", +_avail[0], 3); + if (ret) goto out_failed; /* @@ -966,11 +970,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_dbg(>dev, "no free dynamic windows"); goto out_failed; } - if (be32_to_cpu(query.page_size) & 4) { + if (query.page_size & 4) { page_shift = 24; /* 16MB */ - } else if (be32_to_cpu(query.page_size) & 2) { + } else if (quer
[PATCH v3] powerpc/iommu/ddw: Fix endianness
rtas_call() accepts and returns values in CPU endianness. The ddw_query_response and ddw_create_response structs members are defined and treated as BE but as they are passed to rtas_call() as (u32 *) and they get byteswapped automatically, the data is CPU-endian. This fixes ddw_query_response and ddw_create_response definitions and use. of_read_number() is designed to work with device tree cells - it assumes the input is big-endian and returns data in CPU-endian. However due to the ddw_create_response struct fix, create.addr_hi/lo are already CPU-endian so do not byteswap them. ddw_avail is a pointer to the ibm,ddw-applicable property which contains 3 cells which are big-endian as it is a device tree. rtas_call() accepts a RTAS token in CPU-endian. This makes use of of_property_read_u32_array to byte swap and avoid the need for a number of be32_to_cpu calls. Cc: sta...@vger.kernel.org # v3.13 Cc: Benjamin Herrenschmidt b...@kernel.crashing.org Reviewed-by: Anton Blanchard an...@samba.org [aik: folded Anton's patch with of_property_read_u32_array] Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v3: * of_property_read_u32_array() is used for ddw_avail[] v2: * updated commit log * fixed definition of ddw_query_response and ddw_create_response --- arch/powerpc/platforms/pseries/iommu.c | 51 +++--- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 4642d6a..de1ec54 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -329,16 +329,16 @@ struct direct_window { /* Dynamic DMA Window support */ struct ddw_query_response { - __be32 windows_available; - __be32 largest_available_block; - __be32 page_size; - __be32 migration_capable; + u32 windows_available; + u32 largest_available_block; + u32 page_size; + u32 migration_capable; }; struct ddw_create_response { - __be32 liobn; - __be32 addr_hi; - __be32 addr_lo; + u32 
liobn; + u32 addr_hi; + u32 addr_lo; }; static LIST_HEAD(direct_window_list); @@ -725,16 +725,18 @@ static void remove_ddw(struct device_node *np, bool remove_prop) { struct dynamic_dma_window_prop *dwp; struct property *win64; - const u32 *ddw_avail; + u32 ddw_avail[3]; u64 liobn; - int len, ret = 0; + int ret = 0; + + ret = of_property_read_u32_array(np, ibm,ddw-applicable, +ddw_avail[0], 3); - ddw_avail = of_get_property(np, ibm,ddw-applicable, len); win64 = of_find_property(np, DIRECT64_PROPNAME, NULL); if (!win64) return; - if (!ddw_avail || len 3 * sizeof(u32) || win64-length sizeof(*dwp)) + if (ret || win64-length sizeof(*dwp)) goto delprop; dwp = win64-value; @@ -872,8 +874,9 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ - ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr, - BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, + cfg_addr, BUID_HI(buid), BUID_LO(buid), + page_shift, window_shift); } while (rtas_busy_delay(ret)); dev_info(dev-dev, ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d @@ -910,7 +913,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) int page_shift; u64 dma_addr, max_addr; struct device_node *dn; - const u32 *uninitialized_var(ddw_avail); + u32 ddw_avail[3]; struct direct_window *window; struct property *win64; struct dynamic_dma_window_prop *ddwprop; @@ -942,8 +945,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) * for the given node in that order. 
* the property is actually in the parent, not the PE */ - ddw_avail = of_get_property(pdn, ibm,ddw-applicable, len); - if (!ddw_avail || len 3 * sizeof(u32)) + ret = of_property_read_u32_array(pdn, ibm,ddw-applicable, +ddw_avail[0], 3); + if (ret) goto out_failed; /* @@ -966,11 +970,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_dbg(dev-dev, no free dynamic windows); goto out_failed; } - if (be32_to_cpu(query.page_size) 4) { + if (query.page_size 4) { page_shift = 24; /* 16MB */ - } else if (be32_to_cpu(query.page_size) 2) { + } else if (query.page_size 2) { page_shift = 16; /* 64kB
Re: [PATCH v2 03/13] powerpc/spapr: vfio: Implement spapr_tce_iommu_ops
On 09/24/2014 06:42 AM, Alex Williamson wrote: > On Tue, 2014-09-23 at 13:00 +1000, Alexey Kardashevskiy wrote: >> Modern IBM POWERPC systems support multiple IOMMU tables per PE >> so we need a more reliable way (compared to container_of()) to get >> a PE pointer from the iommu_table struct pointer used in IOMMU functions. >> >> At the moment IOMMU group data points to an iommu_table struct. This >> introduces a spapr_tce_iommu_group struct which keeps an iommu_owner >> and a spapr_tce_iommu_ops struct. For IODA, iommu_owner is a pointer to >> the pnv_ioda_pe struct, for others it is still a pointer to >> the iommu_table struct. The ops structs correspond to the type which >> iommu_owner points to. >> >> This defines a get_table() callback which returns an iommu_table >> by its number. >> >> As the IOMMU group data pointer points to variable type instead of >> iommu_table, VFIO SPAPR TCE driver is updated to use the new type. >> This changes the tce_container struct to store iommu_group instead of >> iommu_table. >> >> So, it was: >> - iommu_table points to iommu_group via iommu_table::it_group; >> - iommu_group points to iommu_table via iommu_group_get_iommudata(); >> >> now it is: >> - iommu_table points to iommu_group via iommu_table::it_group; >> - iommu_group points to spapr_tce_iommu_group via >> iommu_group_get_iommudata(); >> - spapr_tce_iommu_group points to either (depending on .get_table()): >> - iommu_table; >> - pnv_ioda_pe; >> >> This uses pnv_ioda1_iommu_get_table for both IODA1&2 but IODA2 will >> have own pnv_ioda2_iommu_get_table soon and pnv_ioda1_iommu_get_table >> will only be used for IODA1. 
>> >> Signed-off-by: Alexey Kardashevskiy >> --- >> arch/powerpc/include/asm/iommu.h| 6 ++ >> arch/powerpc/include/asm/tce.h | 13 +++ >> arch/powerpc/kernel/iommu.c | 35 ++- >> arch/powerpc/platforms/powernv/pci-ioda.c | 31 +- >> arch/powerpc/platforms/powernv/pci-p5ioc2.c | 1 + >> arch/powerpc/platforms/powernv/pci.c| 2 +- >> arch/powerpc/platforms/pseries/iommu.c | 10 +- >> drivers/vfio/vfio_iommu_spapr_tce.c | 148 >> ++-- >> 8 files changed, 208 insertions(+), 38 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/iommu.h >> b/arch/powerpc/include/asm/iommu.h >> index 42632c7..84ee339 100644 >> --- a/arch/powerpc/include/asm/iommu.h >> +++ b/arch/powerpc/include/asm/iommu.h >> @@ -108,13 +108,19 @@ extern void iommu_free_table(struct iommu_table *tbl, >> const char *node_name); >> */ >> extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, >> int nid); >> + >> +struct spapr_tce_iommu_ops; >> #ifdef CONFIG_IOMMU_API >> extern void iommu_register_group(struct iommu_table *tbl, >> + void *iommu_owner, >> + struct spapr_tce_iommu_ops *ops, >> int pci_domain_number, unsigned long pe_num); >> extern int iommu_add_device(struct device *dev); >> extern void iommu_del_device(struct device *dev); >> #else >> static inline void iommu_register_group(struct iommu_table *tbl, >> +void *iommu_owner, >> +struct spapr_tce_iommu_ops *ops, >> int pci_domain_number, >> unsigned long pe_num) >> { >> diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h >> index 743f36b..9f159eb 100644 >> --- a/arch/powerpc/include/asm/tce.h >> +++ b/arch/powerpc/include/asm/tce.h >> @@ -50,5 +50,18 @@ >> #define TCE_PCI_READ0x1 /* read from PCI >> allowed */ >> #define TCE_VB_WRITE0x1 /* write from VB >> allowed */ >> >> +struct spapr_tce_iommu_group; >> + >> +struct spapr_tce_iommu_ops { >> +struct iommu_table *(*get_table)( >> +struct spapr_tce_iommu_group *data, >> +int num); >> +}; >> + >> +struct spapr_tce_iommu_group { >> +void *iommu_owner; >> 
+struct spapr_tce_iommu_ops *ops; >> +}; >> + >> #endif /* __KERNEL__ */ >> #endif /* _ASM_POWERPC_TCE_H */ >> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c >> index b378f78..
Re: [PATCH v2 03/13] powerpc/spapr: vfio: Implement spapr_tce_iommu_ops
On 09/24/2014 06:42 AM, Alex Williamson wrote: On Tue, 2014-09-23 at 13:00 +1000, Alexey Kardashevskiy wrote: Modern IBM POWERPC systems support multiple IOMMU tables per PE so we need a more reliable way (compared to container_of()) to get a PE pointer from the iommu_table struct pointer used in IOMMU functions. At the moment IOMMU group data points to an iommu_table struct. This introduces a spapr_tce_iommu_group struct which keeps an iommu_owner and a spapr_tce_iommu_ops struct. For IODA, iommu_owner is a pointer to the pnv_ioda_pe struct, for others it is still a pointer to the iommu_table struct. The ops structs correspond to the type which iommu_owner points to. This defines a get_table() callback which returns an iommu_table by its number. As the IOMMU group data pointer points to variable type instead of iommu_table, VFIO SPAPR TCE driver is updated to use the new type. This changes the tce_container struct to store iommu_group instead of iommu_table. So, it was: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to iommu_table via iommu_group_get_iommudata(); now it is: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to spapr_tce_iommu_group via iommu_group_get_iommudata(); - spapr_tce_iommu_group points to either (depending on .get_table()): - iommu_table; - pnv_ioda_pe; This uses pnv_ioda1_iommu_get_table for both IODA12 but IODA2 will have own pnv_ioda2_iommu_get_table soon and pnv_ioda1_iommu_get_table will only be used for IODA1. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h| 6 ++ arch/powerpc/include/asm/tce.h | 13 +++ arch/powerpc/kernel/iommu.c | 35 ++- arch/powerpc/platforms/powernv/pci-ioda.c | 31 +- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 1 + arch/powerpc/platforms/powernv/pci.c| 2 +- arch/powerpc/platforms/pseries/iommu.c | 10 +- drivers/vfio/vfio_iommu_spapr_tce.c | 148 ++-- 8 files changed, 208 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 42632c7..84ee339 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -108,13 +108,19 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); + +struct spapr_tce_iommu_ops; #ifdef CONFIG_IOMMU_API extern void iommu_register_group(struct iommu_table *tbl, + void *iommu_owner, + struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); #else static inline void iommu_register_group(struct iommu_table *tbl, +void *iommu_owner, +struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num) { diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index 743f36b..9f159eb 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -50,5 +50,18 @@ #define TCE_PCI_READ0x1 /* read from PCI allowed */ #define TCE_VB_WRITE0x1 /* write from VB allowed */ +struct spapr_tce_iommu_group; + +struct spapr_tce_iommu_ops { +struct iommu_table *(*get_table)( +struct spapr_tce_iommu_group *data, +int num); +}; + +struct spapr_tce_iommu_group { +void *iommu_owner; +struct spapr_tce_iommu_ops *ops; +}; + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TCE_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 
b378f78..1c5dae7 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -878,24 +878,53 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, */ static void group_release(void *iommu_data) { -struct iommu_table *tbl = iommu_data; -tbl-it_group = NULL; +kfree(iommu_data); } +static struct iommu_table *spapr_tce_default_get_table( +struct spapr_tce_iommu_group *data, int num) +{ +struct iommu_table *tbl = data-iommu_owner; + +switch (num) { +case 0: +if (tbl-it_size) +return tbl; +/* fallthru */ +default: +return NULL; +} +} + +static struct spapr_tce_iommu_ops spapr_tce_default_ops = { +.get_table = spapr_tce_default_get_table +}; + void
[PATCH v2 01/13] powerpc/iommu: Check that TCE page size is equal to it_page_size
This checks that the TCE table page size is not bigger that the size of a page we just pinned and going to put its physical address to the table. Otherwise the hardware gets unwanted access to physical memory between the end of the actual page and the end of the aligned up TCE page. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/kernel/iommu.c | 28 +--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index a10642a..b378f78 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -1059,16 +1060,37 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, tce, entry << tbl->it_page_shift, ret); */ return -EFAULT; } + + /* +* Check that the TCE table granularity is not bigger than the size of +* a page we just found. Otherwise the hardware can get access to +* a bigger memory chunk that it should. +*/ + if (PageHuge(page)) { + struct page *head = compound_head(page); + long shift = PAGE_SHIFT + compound_order(head); + + if (shift < tbl->it_page_shift) { + ret = -EINVAL; + goto put_page_exit; + } + + } + hwaddr = (unsigned long) page_address(page) + offset; ret = iommu_tce_build(tbl, entry, hwaddr, direction); if (ret) - put_page(page); + goto put_page_exit; - if (ret < 0) - pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n", + return 0; + +put_page_exit: + pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n", __func__, entry << tbl->it_page_shift, tce, ret); + put_page(page); + return ret; } EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 02/13] powerpc/powernv: Make invalidate() a callback
At the moment pnv_pci_ioda_tce_invalidate() gets the PE pointer via container_of(tbl). Since we are going to have to add Dynamic DMA windows and that means having 2 IOMMU tables per PE, this is not going to work. This implements pnv_pci_ioda(1|2)_tce_invalidate as a pnv_ioda_pe callback. This adds a pnv_iommu_table wrapper around iommu_table and stores a pointer to PE there. PNV's ppc_md.tce_build() call uses this to find PE and do the invalidation. This will be used later for Dynamic DMA windows too. This registers invalidate() callbacks for IODA1 and IODA2: - pnv_pci_ioda1_tce_invalidate; - pnv_pci_ioda2_tce_invalidate. Signed-off-by: Alexey Kardashevskiy --- Changes: v4: * changed commit log to explain why this change is needed --- arch/powerpc/platforms/powernv/pci-ioda.c | 35 --- arch/powerpc/platforms/powernv/pci.c | 31 --- arch/powerpc/platforms/powernv/pci.h | 13 +++- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index df241b1..136e765 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -857,7 +857,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = >ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(>dev) != _iommu_ops); - set_iommu_table_base_and_group(>dev, >tce32_table); + set_iommu_table_base_and_group(>dev, >tce32.table); } static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, @@ -884,7 +884,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, } else { dev_info(>dev, "Using 32-bit DMA via iommu\n"); set_dma_ops(>dev, _iommu_ops); - set_iommu_table_base(>dev, >tce32_table); + set_iommu_table_base(>dev, >tce32.table); } *pdev->dev.dma_mask = dma_mask; return 0; @@ -899,9 +899,9 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, list_for_each_entry(dev, >devices, bus_list) { if (add_to_iommu_group) set_iommu_table_base_and_group(>dev, - 
>tce32_table); + >tce32.table); else - set_iommu_table_base(>dev, >tce32_table); + set_iommu_table_base(>dev, >tce32.table); if (dev->subordinate) pnv_ioda_setup_bus_dma(pe, dev->subordinate, @@ -988,19 +988,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, } } -void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, -__be64 *startp, __be64 *endp, bool rm) -{ - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); - struct pnv_phb *phb = pe->phb; - - if (phb->type == PNV_PHB_IODA1) - pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); - else - pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); -} - static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe, unsigned int base, unsigned int segs) @@ -1058,9 +1045,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } /* Setup linux iommu table */ - tbl = >tce32_table; + tbl = >tce32.table; pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, base << 28, IOMMU_PAGE_SHIFT_4K); + pe->tce32.pe = pe; + pe->tce32.invalidate_fn = pnv_pci_ioda1_tce_invalidate; /* OPAL variant of P7IOC SW invalidated TCEs */ swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); @@ -1097,7 +1086,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) { struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); + tce32.table); uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1142,10 +1131,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pe->tce_bypass_base = 1ull << 59; /* Install set_bypass callback for VFIO */ - pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; + pe->tce32.table.set_bypass = pnv_pci_ioda2_set_bypass; /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(>tce32_tabl
[PATCH v2 03/13] powerpc/spapr: vfio: Implement spapr_tce_iommu_ops
Modern IBM POWERPC systems support multiple IOMMU tables per PE so we need a more reliable way (compared to container_of()) to get a PE pointer from the iommu_table struct pointer used in IOMMU functions. At the moment IOMMU group data points to an iommu_table struct. This introduces a spapr_tce_iommu_group struct which keeps an iommu_owner and a spapr_tce_iommu_ops struct. For IODA, iommu_owner is a pointer to the pnv_ioda_pe struct, for others it is still a pointer to the iommu_table struct. The ops structs correspond to the type which iommu_owner points to. This defines a get_table() callback which returns an iommu_table by its number. As the IOMMU group data pointer points to variable type instead of iommu_table, VFIO SPAPR TCE driver is updated to use the new type. This changes the tce_container struct to store iommu_group instead of iommu_table. So, it was: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to iommu_table via iommu_group_get_iommudata(); now it is: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to spapr_tce_iommu_group via iommu_group_get_iommudata(); - spapr_tce_iommu_group points to either (depending on .get_table()): - iommu_table; - pnv_ioda_pe; This uses pnv_ioda1_iommu_get_table for both IODA1&2 but IODA2 will have own pnv_ioda2_iommu_get_table soon and pnv_ioda1_iommu_get_table will only be used for IODA1. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h| 6 ++ arch/powerpc/include/asm/tce.h | 13 +++ arch/powerpc/kernel/iommu.c | 35 ++- arch/powerpc/platforms/powernv/pci-ioda.c | 31 +- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 1 + arch/powerpc/platforms/powernv/pci.c| 2 +- arch/powerpc/platforms/pseries/iommu.c | 10 +- drivers/vfio/vfio_iommu_spapr_tce.c | 148 ++-- 8 files changed, 208 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 42632c7..84ee339 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -108,13 +108,19 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); + +struct spapr_tce_iommu_ops; #ifdef CONFIG_IOMMU_API extern void iommu_register_group(struct iommu_table *tbl, +void *iommu_owner, +struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); #else static inline void iommu_register_group(struct iommu_table *tbl, + void *iommu_owner, + struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num) { diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index 743f36b..9f159eb 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -50,5 +50,18 @@ #define TCE_PCI_READ 0x1 /* read from PCI allowed */ #define TCE_VB_WRITE 0x1 /* write from VB allowed */ +struct spapr_tce_iommu_group; + +struct spapr_tce_iommu_ops { + struct iommu_table *(*get_table)( + struct spapr_tce_iommu_group *data, + int num); +}; + +struct spapr_tce_iommu_group { + void *iommu_owner; + struct spapr_tce_iommu_ops *ops; +}; + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TCE_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 
b378f78..1c5dae7 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -878,24 +878,53 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, */ static void group_release(void *iommu_data) { - struct iommu_table *tbl = iommu_data; - tbl->it_group = NULL; + kfree(iommu_data); } +static struct iommu_table *spapr_tce_default_get_table( + struct spapr_tce_iommu_group *data, int num) +{ + struct iommu_table *tbl = data->iommu_owner; + + switch (num) { + case 0: + if (tbl->it_size) + return tbl; + /* fallthru */ + default: + return NULL; + } +} + +static struct spapr_tce_iommu_ops spapr_tce_default_ops = { + .get_table = spapr_tce_default_get_table +}; + void iommu_register_group(struct iommu_table *tbl, + void *iommu_owner, struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigne
[PATCH v2 04/13] powerpc/powernv: Convert/move set_bypass() callback to take_ownership()
At the moment the iommu_table struct has a set_bypass() which enables/ disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code which calls this callback when external IOMMU users such as VFIO are about to get over a PHB. Since the set_bypass() is not really an iommu_table function but PE's function, and we have an ops struct per IOMMU owner, let's move set_bypass() to the spapr_tce_iommu_ops struct. As arch/powerpc/kernel/iommu.c is more about POWERPC IOMMU tables and has very little to do with PEs, this moves take_ownership() calls to the VFIO SPAPR TCE driver. This renames set_bypass() to take_ownership() as it is not necessarily just enabling bypassing, it can be something else/more so let's give it a generic name. The bool parameter is inverted. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Gavin Shan --- arch/powerpc/include/asm/iommu.h | 1 - arch/powerpc/include/asm/tce.h| 2 ++ arch/powerpc/kernel/iommu.c | 12 arch/powerpc/platforms/powernv/pci-ioda.c | 20 drivers/vfio/vfio_iommu_spapr_tce.c | 16 5 files changed, 30 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 84ee339..2b0b01d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -77,7 +77,6 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif - void (*set_bypass)(struct iommu_table *tbl, bool enable); }; /* Pure 2^n version of get_order */ diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index 9f159eb..e6355f9 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -56,6 +56,8 @@ struct spapr_tce_iommu_ops { struct iommu_table *(*get_table)( struct spapr_tce_iommu_group *data, int num); + void (*take_ownership)(struct spapr_tce_iommu_group *data, + bool enable); }; struct spapr_tce_iommu_group { diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 1c5dae7..c2c8d9d 100644 --- 
a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1139,14 +1139,6 @@ int iommu_take_ownership(struct iommu_table *tbl) memset(tbl->it_map, 0xff, sz); iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); - /* -* Disable iommu bypass, otherwise the user can DMA to all of -* our physical memory via the bypass window instead of just -* the pages that has been explicitly mapped into the iommu -*/ - if (tbl->set_bypass) - tbl->set_bypass(tbl, false); - return 0; } EXPORT_SYMBOL_GPL(iommu_take_ownership); @@ -1161,10 +1153,6 @@ void iommu_release_ownership(struct iommu_table *tbl) /* Restore bit#0 set by iommu_init_table() */ if (tbl->it_offset == 0) set_bit(0, tbl->it_map); - - /* The kernel owns the device now, we can restore the iommu bypass */ - if (tbl->set_bypass) - tbl->set_bypass(tbl, true); } EXPORT_SYMBOL_GPL(iommu_release_ownership); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2d32a1c..8cb2f31 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1105,10 +1105,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } -static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32.table); uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1136,7 +1134,7 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) * host side. 
*/ if (pe->pdev) - set_iommu_table_base(>pdev->dev, tbl); + set_iommu_table_base(>pdev->dev, >tce32.table); else pnv_ioda_setup_bus_dma(pe, pe->pbus, false); } @@ -1152,15 +1150,21 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; - /* Install set_bypass callback for VFIO */ - pe->tce32.table.set_bypass = pnv_pci_ioda2_set_bypass; - /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(>tce32.table, true); + pnv_pci_ioda2_set_bypass(pe, true); +} + +static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *d
[PATCH v2 10/13] powerpc/powernv: Implement Dynamic DMA windows (DDW) for IODA
SPAPR defines an interface to create additional DMA windows dynamically. "Dynamically" means that the window is not allocated before the guest even started, the guest can request it later. In practice, existing linux guests check for the capability and if it is there, they create and map a DMA window as big as the entire guest RAM. This adds 4 callbacks to the spapr_tce_iommu_ops struct: 1. query - ibm,query-pe-dma-window - returns number/size of windows which can be created (one, any page size); 2. create - ibm,create-pe-dma-window - creates a window; 3. remove - ibm,remove-pe-dma-window - removes a window; removing the default 32bit window is not allowed by this patch, this will be added later if needed; 4. reset - ibm,reset-pe-dma-window - reset the DMA windows configuration to the default state; as the default window cannot be removed, it only removes the additional window if it was created. The next patch will add corresponding ioctls to VFIO SPAPR TCE driver to provide necessary support to the userspace. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/tce.h| 22 + arch/powerpc/platforms/powernv/pci-ioda.c | 159 +- arch/powerpc/platforms/powernv/pci.h | 1 + 3 files changed, 181 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index e6355f9..23b0362 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -58,6 +58,28 @@ struct spapr_tce_iommu_ops { int num); void (*take_ownership)(struct spapr_tce_iommu_group *data, bool enable); + + /* Dynamic DMA window */ + /* Page size flags for ibm,query-pe-dma-window */ +#define DDW_PGSIZE_4K 0x01 +#define DDW_PGSIZE_64K 0x02 +#define DDW_PGSIZE_16M 0x04 +#define DDW_PGSIZE_32M 0x08 +#define DDW_PGSIZE_64M 0x10 +#define DDW_PGSIZE_128M 0x20 +#define DDW_PGSIZE_256M 0x40 +#define DDW_PGSIZE_16G 0x80 + long (*query)(struct spapr_tce_iommu_group *data, + __u32 *current_windows, + __u32 *windows_available, + __u32 *page_size_mask); + long (*create)(struct spapr_tce_iommu_group *data, + __u32 page_shift, + __u32 window_shift, + struct iommu_table **ptbl); + long (*remove)(struct spapr_tce_iommu_group *data, + struct iommu_table *tbl); + long (*reset)(struct spapr_tce_iommu_group *data); }; struct spapr_tce_iommu_group { diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 296f49b..a6318cb 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1154,6 +1154,26 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pnv_pci_ioda2_set_bypass(pe, true); } +static struct iommu_table *pnv_ioda2_iommu_get_table( + struct spapr_tce_iommu_group *data, + int num) +{ + struct pnv_ioda_pe *pe = data->iommu_owner; + + switch (num) { + case 0: + if (pe->tce32.table.it_size) + return >tce32.table; + return NULL; + case 1: + if (pe->tce64.table.it_size) + return >tce64.table; + return NULL; + default: + return NULL; + } +} + static void 
pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, bool enable) { @@ -1162,9 +1182,146 @@ static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, pnv_pci_ioda2_set_bypass(pe, !enable); } +static long pnv_pci_ioda2_ddw_query(struct spapr_tce_iommu_group *data, + __u32 *current_windows, + __u32 *windows_available, __u32 *page_size_mask) +{ + struct pnv_ioda_pe *pe = data->iommu_owner; + + *windows_available = 2; + *current_windows = 0; + if (pe->tce32.table.it_size) { + --*windows_available; + ++*current_windows; + } + if (pe->tce64.table.it_size) { + --*windows_available; + ++*current_windows; + } + *page_size_mask = + DDW_PGSIZE_4K | + DDW_PGSIZE_64K | + DDW_PGSIZE_16M; + + return 0; +} + +static long pnv_pci_ioda2_ddw_create(struct spapr_tce_iommu_group *data, + __u32 page_shift, __u32 window_shift, + struct iommu_table **ptbl) +{ + struct pnv_ioda_pe *pe = data->iommu_owner; + struct pnv_phb *phb = pe->phb; + struct page *tce_mem = NULL; + void *addr; + long ret; + unsigned long tce_table_size = + (1ULL << (window_sh
[PATCH v2 07/13] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE
Normally a bitmap from the iommu_table is used to track what TCE entry is in use. Since we are going to use iommu_table without its locks and do xchg() instead, it becomes essential not to put bits which are not implied in the direction flag. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index deddcad..ab79e2d 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -628,10 +628,18 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, __be64 *tcep, *tces; u64 rpn; - proto_tce = TCE_PCI_READ; // Read allowed - - if (direction != DMA_TO_DEVICE) - proto_tce |= TCE_PCI_WRITE; + switch (direction) { + case DMA_BIDIRECTIONAL: + case DMA_FROM_DEVICE: + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; + break; + case DMA_TO_DEVICE: + proto_tce = TCE_PCI_READ; + break; + default: + proto_tce = 0; + break; + } tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; rpn = __pa(uaddr) >> tbl->it_page_shift; -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 13/13] vfio: powerpc/spapr: Enable Dynamic DMA windows
This defines and implements VFIO IOMMU API which lets the userspace create and remove DMA windows. This updates VFIO_IOMMU_SPAPR_TCE_GET_INFO to return the number of available windows and page mask. This adds VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE to allow the user space to create and remove window(s). The VFIO IOMMU driver does basic sanity checks and calls corresponding SPAPR TCE functions. At the moment only IODA2 (POWER8 PCI host bridge) implements them. This advertises VFIO_IOMMU_SPAPR_TCE_FLAG_DDW capability via VFIO_IOMMU_SPAPR_TCE_GET_INFO. This calls platform DDW reset() callback when IOMMU is being disabled to reset the DMA configuration to its original state. Signed-off-by: Alexey Kardashevskiy --- drivers/vfio/vfio_iommu_spapr_tce.c | 135 ++-- include/uapi/linux/vfio.h | 25 ++- 2 files changed, 153 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 0dccbc4..b518891 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -190,18 +190,25 @@ static void tce_iommu_disable(struct tce_container *container) container->enabled = false; - if (!container->grp || !current->mm) + if (!container->grp) return; data = iommu_group_get_iommudata(container->grp); if (!data || !data->iommu_owner || !data->ops->get_table) return; - tbl = data->ops->get_table(data, 0); - if (!tbl) - return; + if (current->mm) { + tbl = data->ops->get_table(data, 0); + if (tbl) + decrement_locked_vm(tbl); - decrement_locked_vm(tbl); + tbl = data->ops->get_table(data, 1); + if (tbl) + decrement_locked_vm(tbl); + } + + if (data->ops->reset) + data->ops->reset(data); } static void *tce_iommu_open(unsigned long arg) @@ -243,7 +250,7 @@ static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { struct tce_container *container = iommu_data; - unsigned long minsz; + unsigned long minsz, ddwsz; long ret; switch (cmd) { @@ -288,6 +295,28 @@ 
static long tce_iommu_ioctl(void *iommu_data, info.dma32_window_size = tbl->it_size << tbl->it_page_shift; info.flags = 0; + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, + page_size_mask); + + if (info.argsz == ddwsz) { + if (data->ops->query && data->ops->create && + data->ops->remove) { + info.flags |= VFIO_IOMMU_SPAPR_TCE_FLAG_DDW; + + ret = data->ops->query(data, + _windows, + _available, + _size_mask); + if (ret) + return ret; + } else { + info.current_windows = 0; + info.windows_available = 0; + info.page_size_mask = 0; + } + minsz = ddwsz; + } + if (copy_to_user((void __user *)arg, , minsz)) return -EFAULT; @@ -412,12 +441,106 @@ static long tce_iommu_ioctl(void *iommu_data, tce_iommu_disable(container); mutex_unlock(>lock); return 0; + case VFIO_EEH_PE_OP: if (!container->grp) return -ENODEV; return vfio_spapr_iommu_eeh_ioctl(container->grp, cmd, arg); + + case VFIO_IOMMU_SPAPR_TCE_CREATE: { + struct vfio_iommu_spapr_tce_create create; + struct spapr_tce_iommu_group *data; + struct iommu_table *tbl; + + if (WARN_ON(!container->grp)) + return -ENXIO; + + data = iommu_group_get_iommudata(container->grp); + + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, + start_addr); + + if (copy_from_user(, (void __user *)arg, minsz)) + return -EFAULT; + + if (create.argsz < minsz) + return -EINVAL; + + if (create.flags) + return -EINVAL; + + if (!data->ops->create || !data->iommu_owner) +
[PATCH v2 09/13] powerpc/pseries/lpar: Enable VFIO
The previous patch introduced iommu_table_ops::exchange() callback which effectively disabled VFIO on pseries. This implements exchange() for pseries/lpar so VFIO can work in nested guests. Since exchange() callback returns an old TCE, it has to call H_GET_TCE for every TCE being put to the table so VFIO performance in guests running under PR KVM is expected to be slower than in guests running under HV KVM or bare metal hosts. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/pseries/iommu.c | 25 +++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 9a7364f..ae15b5a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -138,13 +138,14 @@ static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, + unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { u64 rc = 0; u64 proto_tce, tce; u64 rpn; - int ret = 0; + int ret = 0, i = 0; long tcenum_start = tcenum, npages_start = npages; rpn = __pa(uaddr) >> TCE_SHIFT; @@ -154,6 +155,9 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, while (npages--) { tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + if (old_tces) + plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, + _tces[i++]); rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { @@ -179,8 +183,9 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, static DEFINE_PER_CPU(__be64 *, tce_page); -static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +static int tce_xchg_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, +unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { @@ -195,6 
+200,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + old_tces, direction, attrs); } @@ -211,6 +217,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if (!tcep) { local_irq_restore(flags); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + old_tces, direction, attrs); } __get_cpu_var(tce_page) = tcep; @@ -232,6 +239,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, for (l = 0; l < limit; l++) { tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); rpn++; + if (old_tces) + plpar_tce_get((u64)tbl->it_index, + (u64)(tcenum + l) << 12, + _tces[tcenum + l]); } rc = plpar_tce_put_indirect((u64)tbl->it_index, @@ -262,6 +273,15 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } +static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +long npages, unsigned long uaddr, +enum dma_data_direction direction, +struct dma_attrs *attrs) +{ + return tce_xchg_pSeriesLP(tbl, tcenum, npages, uaddr, NULL, + direction, attrs); +} + static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) { u64 rc; @@ -637,6 +657,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) struct iommu_table_ops iommu_table_lpar_multi_ops = { .set = tce_buildmulti_pSeriesLP, + .exchange = tce_xchg_pSeriesLP, .clear = tce_freemulti_pSeriesLP, .get = tce_get_pSeriesLP }; -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body o
[PATCH v2 08/13] powerpc/powernv: Release replaced TCE
At the moment writing new TCE value to the IOMMU table fails with EBUSY if there is a valid entry already. However PAPR specification allows the guest to write new TCE value without clearing it first. Another problem this patch is addressing is the use of pool locks for external IOMMU users such as VFIO. The pool locks are to protect DMA page allocator rather than entries and since the host kernel does not control what pages are in use, there is no point in pool locks and exchange()+put_page(oldtce) is sufficient to avoid possible races. This adds an exchange() callback to iommu_table_ops which does the same thing as set() plus it returns replaced TCE(s) so the caller can release the pages afterwards. This makes iommu_tce_build() put pages returned by exchange(). This replaces iommu_clear_tce() with iommu_tce_build which now can call exchange() with TCE==NULL (i.e. clear). This preserves permission bits in TCE in iommu_put_tce_user_mode(). This removes use of pool locks for external IOMMU uses. This disables external IOMMU use (i.e. VFIO) for IOMMUs which do not implement exchange() callback. Therefore the "powernv" platform is the only supported one after this patch. 
Signed-off-by: Alexey Kardashevskiy --- Changes: v2: * added missing __pa() for TCE which was read from the table --- arch/powerpc/include/asm/iommu.h | 8 +++-- arch/powerpc/kernel/iommu.c | 62 arch/powerpc/platforms/powernv/pci.c | 40 +++ 3 files changed, 67 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index c725e4a..8e0537d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -49,6 +49,12 @@ struct iommu_table_ops { unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs); + int (*exchange)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + unsigned long *old_tces, + enum dma_data_direction direction, + struct dma_attrs *attrs); void (*clear)(struct iommu_table *tbl, long index, long npages); unsigned long (*get)(struct iommu_table *tbl, long index); @@ -209,8 +215,6 @@ extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long hwaddr, enum dma_data_direction direction); -extern unsigned long iommu_clear_tce(struct iommu_table *tbl, - unsigned long entry); extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages); extern int iommu_put_tce_user_mode(struct iommu_table *tbl, diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 678fee8..39ccce7 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1006,43 +1006,11 @@ int iommu_tce_put_param_check(struct iommu_table *tbl, } EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) -{ - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - - spin_lock(&(pool->lock)); - - oldtce = tbl->it_ops->get(tbl, entry); - if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) - 
tbl->it_ops->clear(tbl, entry, 1); - else - oldtce = 0; - - spin_unlock(&(pool->lock)); - - return oldtce; -} -EXPORT_SYMBOL_GPL(iommu_clear_tce); - int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages) { - unsigned long oldtce; - struct page *page; - for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) - continue; - - page = pfn_to_page(oldtce >> PAGE_SHIFT); - WARN_ON(!page); - if (page) { - if (oldtce & TCE_PCI_WRITE) - SetPageDirty(page); - put_page(page); - } + iommu_tce_build(tbl, entry, 0, DMA_NONE); } return 0; @@ -1056,18 +1024,19 @@ EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long hwaddr, enum dma_data_direction direction) { - int ret = -EBUSY; + int ret; unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - spin_lock(&(pool->lock)); + ret = tbl->it_ops->exchange(t
[PATCH v2 11/13] vfio: powerpc/spapr: Move locked_vm accounting to helpers
There moves locked pages accounting to helpers. Later they will be reused for Dynamic DMA windows (DDW). While we are here, update the comment explaining why RLIMIT_MEMLOCK might be required to be bigger than the guest RAM. Signed-off-by: Alexey Kardashevskiy --- drivers/vfio/vfio_iommu_spapr_tce.c | 71 +++-- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 1c1a9c4..c9fac97 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -29,6 +29,46 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); +static long try_increment_locked_vm(struct iommu_table *tbl) +{ + long ret = 0, locked, lock_limit, npages; + + if (!current || !current->mm) + return -ESRCH; /* process exited */ + + npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + + down_write(>mm->mmap_sem); + locked = current->mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + current->mm->locked_vm += npages; + } + up_write(>mm->mmap_sem); + + return ret; +} + +static void decrement_locked_vm(struct iommu_table *tbl) +{ + long npages; + + if (!current || !current->mm) + return; /* process exited */ + + npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + + down_write(>mm->mmap_sem); + if (npages > current->mm->locked_vm) + npages = current->mm->locked_vm; + current->mm->locked_vm -= npages; + up_write(>mm->mmap_sem); +} + /* * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation * @@ -86,7 +126,6 @@ static void tce_iommu_take_ownership_notify(struct spapr_tce_iommu_group *data, static int tce_iommu_enable(struct tce_container *container) { int ret = 0; - unsigned long locked, lock_limit, npages; struct iommu_table *tbl; struct spapr_tce_iommu_group 
*data; @@ -120,24 +159,23 @@ static int tce_iommu_enable(struct tce_container *container) * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, * that would effectively kill the guest at random points, much better * enforcing the limit based on the max that the guest can map. +* +* Unfortunately at the moment it counts whole tables, no matter how +* much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups +* each with 2GB DMA window, 8GB will be counted here. The reason for +* this is that we cannot tell here the amount of RAM used by the guest +* as this information is only available from KVM and VFIO is +* KVM agnostic. */ tbl = data->ops->get_table(data, 0); if (!tbl) return -ENXIO; - down_write(>mm->mmap_sem); - npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; - locked = current->mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", - rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - } else { - current->mm->locked_vm += npages; - container->enabled = true; - } - up_write(>mm->mmap_sem); + ret = try_increment_locked_vm(tbl); + if (ret) + return ret; + + container->enabled = true; return ret; } @@ -163,10 +201,7 @@ static void tce_iommu_disable(struct tce_container *container) if (!tbl) return; - down_write(>mm->mmap_sem); - current->mm->locked_vm -= (tbl->it_size << - IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; - up_write(>mm->mmap_sem); + decrement_locked_vm(tbl); } static void *tce_iommu_open(unsigned long arg) -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 05/13] powerpc/iommu: Fix IOMMU ownership control functions
This adds missing locks in iommu_take_ownership()/ iommu_release_ownership(). This marks all pages busy in iommu_table::it_map in order to catch errors if there is an attempt to use this table while ownership over it is taken. This only clears TCE content if there is no page marked busy in it_map. Clearing must be done outside of the table locks as iommu_clear_tce() called from iommu_clear_tces_and_put_pages() does this. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/kernel/iommu.c | 36 +--- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c2c8d9d..cd80867 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1126,33 +1126,55 @@ EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); int iommu_take_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl->it_size + 7) >> 3; + unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; + int ret = 0, bit0 = 0; + + spin_lock_irqsave(>large_pool.lock, flags); + for (i = 0; i < tbl->nr_pools; i++) + spin_lock(>pools[i].lock); if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + bit0 = test_and_clear_bit(0, tbl->it_map); if (!bitmap_empty(tbl->it_map, tbl->it_size)) { pr_err("iommu_tce: it_map is not empty"); - return -EBUSY; + ret = -EBUSY; + if (bit0) + set_bit(0, tbl->it_map); + } else { + memset(tbl->it_map, 0xff, sz); } - memset(tbl->it_map, 0xff, sz); - iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + for (i = 0; i < tbl->nr_pools; i++) + spin_unlock(>pools[i].lock); + spin_unlock_irqrestore(>large_pool.lock, flags); - return 0; + if (!ret) + iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, + tbl->it_size); + return ret; } EXPORT_SYMBOL_GPL(iommu_take_ownership); void iommu_release_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl->it_size + 7) >> 3; + unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + + 
spin_lock_irqsave(>large_pool.lock, flags); + for (i = 0; i < tbl->nr_pools; i++) + spin_lock(>pools[i].lock); + memset(tbl->it_map, 0, sz); /* Restore bit#0 set by iommu_init_table() */ if (tbl->it_offset == 0) set_bit(0, tbl->it_map); + + for (i = 0; i < tbl->nr_pools; i++) + spin_unlock(>pools[i].lock); + spin_unlock_irqrestore(>large_pool.lock, flags); } EXPORT_SYMBOL_GPL(iommu_release_ownership); -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 12/13] vfio: powerpc/spapr: Use it_page_size
This makes use of the it_page_size from the iommu_table struct as page size can differ. This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code as recently introduced IOMMU_PAGE_XXX macros do not include IOMMU_PAGE_SHIFT. Signed-off-by: Alexey Kardashevskiy --- drivers/vfio/vfio_iommu_spapr_tce.c | 36 ++-- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index c9fac97..0dccbc4 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -36,7 +36,7 @@ static long try_increment_locked_vm(struct iommu_table *tbl) if (!current || !current->mm) return -ESRCH; /* process exited */ - npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; down_write(>mm->mmap_sem); locked = current->mm->locked_vm + npages; @@ -60,7 +60,7 @@ static void decrement_locked_vm(struct iommu_table *tbl) if (!current || !current->mm) return; /* process exited */ - npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; down_write(>mm->mmap_sem); if (npages > current->mm->locked_vm) @@ -284,8 +284,8 @@ static long tce_iommu_ioctl(void *iommu_data, if (info.argsz < minsz) return -EINVAL; - info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K; - info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K; + info.dma32_window_start = tbl->it_offset << tbl->it_page_shift; + info.dma32_window_size = tbl->it_size << tbl->it_page_shift; info.flags = 0; if (copy_to_user((void __user *)arg, , minsz)) @@ -318,10 +318,6 @@ static long tce_iommu_ioctl(void *iommu_data, VFIO_DMA_MAP_FLAG_WRITE)) return -EINVAL; - if ((param.size & ~IOMMU_PAGE_MASK_4K) || - (param.vaddr & ~IOMMU_PAGE_MASK_4K)) - return -EINVAL; - /* iova is checked by the IOMMU API */ tce = param.vaddr; if (param.flags & VFIO_DMA_MAP_FLAG_READ) @@ -334,21 +330,25 
@@ static long tce_iommu_ioctl(void *iommu_data, return -ENXIO; BUG_ON(!tbl->it_group); + if ((param.size & ~IOMMU_PAGE_MASK(tbl)) || + (param.vaddr & ~IOMMU_PAGE_MASK(tbl))) + return -EINVAL; + ret = iommu_tce_put_param_check(tbl, param.iova, tce); if (ret) return ret; - for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) { + for (i = 0; i < (param.size >> tbl->it_page_shift); ++i) { ret = iommu_put_tce_user_mode(tbl, - (param.iova >> IOMMU_PAGE_SHIFT_4K) + i, + (param.iova >> tbl->it_page_shift) + i, tce); if (ret) break; - tce += IOMMU_PAGE_SIZE_4K; + tce += IOMMU_PAGE_SIZE(tbl); } if (ret) iommu_clear_tces_and_put_pages(tbl, - param.iova >> IOMMU_PAGE_SHIFT_4K, i); + param.iova >> tbl->it_page_shift, i); iommu_flush_tce(tbl); @@ -379,23 +379,23 @@ static long tce_iommu_ioctl(void *iommu_data, if (param.flags) return -EINVAL; - if (param.size & ~IOMMU_PAGE_MASK_4K) - return -EINVAL; - tbl = spapr_tce_find_table(container, data, param.iova); if (!tbl) return -ENXIO; + if (param.size & ~IOMMU_PAGE_MASK(tbl)) + return -EINVAL; + BUG_ON(!tbl->it_group); ret = iommu_tce_clear_param_check(tbl, param.iova, 0, - param.size >> IOMMU_PAGE_SHIFT_4K); + param.size >> tbl->it_page_shift); if (ret) return ret; ret = iommu_clear_tces_and_put_pages(tbl, - param.iova >> IOMMU_PAGE_SHIFT_4K, - param.size >
[PATCH v2 06/13] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table
This adds an iommu_table_ops struct and puts a pointer to it into the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush callbacks from ppc_md to the new struct where they really belong. This adds an extra @ops parameter to iommu_init_table() to make sure that we do not leave any IOMMU table without iommu_table_ops. @it_ops is initialized in the very beginning as iommu_init_table() calls iommu_table_clear() and the latter uses callbacks already. This does s/tce_build/set/, s/tce_free/clear/ and removes "tce_" prefixes for better readability. This removes tce_xxx_rm handlers from ppc_md as well but does not add them to iommu_table_ops, this will be done later if we decide to support TCE hypercalls in real mode. This always uses tce_buildmulti_pSeriesLP/tce_freemulti_pSeriesLP as callbacks for pseries. This changes "multi" callbacks to fall back to tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not present. The reason for this is we still have to support "multitce=off" boot parameter in disable_multitce() and we do not want to walk through all IOMMU tables in the system and replace "multi" callbacks with single ones. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h| 20 +++- arch/powerpc/include/asm/machdep.h | 25 --- arch/powerpc/kernel/iommu.c | 50 - arch/powerpc/kernel/vio.c | 5 ++- arch/powerpc/platforms/cell/iommu.c | 9 -- arch/powerpc/platforms/pasemi/iommu.c | 8 +++-- arch/powerpc/platforms/powernv/pci-ioda.c | 4 +-- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 +- arch/powerpc/platforms/powernv/pci.c| 24 -- arch/powerpc/platforms/powernv/pci.h| 1 + arch/powerpc/platforms/pseries/iommu.c | 42 +--- arch/powerpc/sysdev/dart_iommu.c| 13 12 files changed, 102 insertions(+), 102 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 2b0b01d..c725e4a 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -43,6 +43,22 @@ extern int iommu_is_off; extern int iommu_force_on; +struct iommu_table_ops { + int (*set)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs); + void (*clear)(struct iommu_table *tbl, + long index, long npages); + unsigned long (*get)(struct iommu_table *tbl, long index); + void (*flush)(struct iommu_table *tbl); +}; + +/* These are used by VIO */ +extern struct iommu_table_ops iommu_table_lpar_multi_ops; +extern struct iommu_table_ops iommu_table_pseries_ops; + /* * IOMAP_MAX_ORDER defines the largest contiguous block * of dma space we can get. 
IOMAP_MAX_ORDER = 13 @@ -77,6 +93,7 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif + struct iommu_table_ops *it_ops; }; /* Pure 2^n version of get_order */ @@ -106,7 +123,8 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); * structure */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, - int nid); + int nid, + struct iommu_table_ops *ops); struct spapr_tce_iommu_ops; #ifdef CONFIG_IOMMU_API diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index b125cea..1fc824d 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -65,31 +65,6 @@ struct machdep_calls { * destroyed as well */ void(*hpte_clear_all)(void); - int (*tce_build)(struct iommu_table *tbl, -long index, -long npages, -unsigned long uaddr, -enum dma_data_direction direction, -struct dma_attrs *attrs); - void(*tce_free)(struct iommu_table *tbl, - long index, - long npages); - unsigned long (*tce_get)(struct iommu_table *tbl, - long index); - void(*tce_flush)(struct iommu_table *tbl); - - /* _rm versions are for real mode use only */ - int (*tce_build_rm)(struct iommu_table *tbl, -long index, -long npages, -unsigned long uaddr, -
[PATCH v2 00/13] powerpc/iommu/vfio: Enable Dynamic DMA windows
This enables PAPR defined feature called Dynamic DMA windows (DDW). Each Partitionable Endpoint (IOMMU group) has a separate DMA window on a PCI bus where devices are allowed to perform DMA. By default there is 1 or 2GB window allocated at the host boot time and these windows are used when an IOMMU group is passed to the userspace (guest). These windows are mapped at zero offset on a PCI bus. Hi-speed devices may suffer from limited size of this window. On the host side a TCE bypass mode is enabled on POWER8 CPU which implements direct mapping of the host memory to a PCI bus at 1<<59. For the guest, PAPR defines a DDW RTAS API which allows the pseries guest to query the hypervisor if it supports DDW and what are the parameters of possible windows. Currently POWER8 supports 2 DMA windows per PE - already mentioned and used small 32bit window and 64bit window which can only start from 1<<59 and can support various page sizes. This patchset reworks PPC IOMMU code and adds necessary structures to extend it to support big windows. When the guest detects the feature and the PE is capable of 64bit DMA, it does: 1. query to hypervisor about number of available windows and page masks; 2. creates a window with the biggest possible page size (current guests can do 64K or 16MB TCEs); 3. maps the entire guest RAM via H_PUT_TCE* hypercalls 4. switches dma_ops to direct_dma_ops on the selected PE. Once this is done, H_PUT_TCE is not called anymore and the guest gets maximum performance. Please comment. Thanks! 
Changes: v2: * added missing __pa() in "powerpc/powernv: Release replaced TCE" * reposted to make some noise :) Alexey Kardashevskiy (13): powerpc/iommu: Check that TCE page size is equal to it_page_size powerpc/powernv: Make invalidate() a callback powerpc/spapr: vfio: Implement spapr_tce_iommu_ops powerpc/powernv: Convert/move set_bypass() callback to take_ownership() powerpc/iommu: Fix IOMMU ownership control functions powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table powerpc/powernv: Do not set "read" flag if direction==DMA_NONE powerpc/powernv: Release replaced TCE powerpc/pseries/lpar: Enable VFIO powerpc/powernv: Implement Dynamic DMA windows (DDW) for IODA vfio: powerpc/spapr: Move locked_vm accounting to helpers vfio: powerpc/spapr: Use it_page_size vfio: powerpc/spapr: Enable Dynamic DMA windows arch/powerpc/include/asm/iommu.h| 35 ++- arch/powerpc/include/asm/machdep.h | 25 -- arch/powerpc/include/asm/tce.h | 37 +++ arch/powerpc/kernel/iommu.c | 213 +-- arch/powerpc/kernel/vio.c | 5 +- arch/powerpc/platforms/cell/iommu.c | 9 +- arch/powerpc/platforms/pasemi/iommu.c | 8 +- arch/powerpc/platforms/powernv/pci-ioda.c | 233 +++-- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 4 +- arch/powerpc/platforms/powernv/pci.c| 113 +--- arch/powerpc/platforms/powernv/pci.h| 15 +- arch/powerpc/platforms/pseries/iommu.c | 77 -- arch/powerpc/sysdev/dart_iommu.c| 13 +- drivers/vfio/vfio_iommu_spapr_tce.c | 384 +++- include/uapi/linux/vfio.h | 25 +- 15 files changed, 925 insertions(+), 271 deletions(-) -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] powerpc/iommu/ddw: Fix endianness
rtas_call() accepts and returns values in CPU endianness. The ddw_query_response and ddw_create_response structs members are defined and treated as BE but as they are passed to rtas_call() as (u32 *) and they get byteswapped automatically, the data is actually CPU-endian. This fixes ddw_query_response and ddw_create_response definitions and use. of_read_number() is designed to work with device tree cells - it assumes the input is big-endian and returns data in CPU-endian. However due to the ddw_create_response struct fix, create.addr_hi/lo are already CPU-endian so do not byteswap them. ddw_avail is a pointer to the "ibm,ddw-applicable" property which contains 3 cells which are big-endian as it is a device tree. rtas_call() accepts a RTAS token in CPU-endian. This converts RTAS tokens from big-endian to CPU-endian. Since every token is used once till guest is rebooted, there is no much sense in caching RTAS tokens in CPU-endian. Signed-off-by: Alexey Kardashevskiy --- Changes: v2: * updated commit log * fixed definition of ddw_query_response and ddw_create_response --- arch/powerpc/platforms/pseries/iommu.c | 38 ++ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 4642d6a..f052cc8 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -329,16 +329,16 @@ struct direct_window { /* Dynamic DMA Window support */ struct ddw_query_response { - __be32 windows_available; - __be32 largest_available_block; - __be32 page_size; - __be32 migration_capable; + u32 windows_available; + u32 largest_available_block; + u32 page_size; + u32 migration_capable; }; struct ddw_create_response { - __be32 liobn; - __be32 addr_hi; - __be32 addr_lo; + u32 liobn; + u32 addr_hi; + u32 addr_lo; }; static LIST_HEAD(direct_window_list); @@ -750,7 +750,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop) pr_debug("%s successfully cleared 
tces in window.\n", np->full_name); - ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + ret = rtas_call(be32_to_cpu(ddw_avail[2]), 1, 1, NULL, liobn); if (ret) pr_warning("%s: failed to remove direct window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", @@ -841,7 +841,7 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, cfg_addr = edev->pe_config_addr; buid = edev->phb->buid; - ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, + ret = rtas_call(be32_to_cpu(ddw_avail[0]), 3, 5, (u32 *)query, cfg_addr, BUID_HI(buid), BUID_LO(buid)); dev_info(>dev, "ibm,query-pe-dma-windows(%x) %x %x %x" " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid), @@ -872,8 +872,9 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ - ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr, - BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + ret = rtas_call(be32_to_cpu(ddw_avail[1]), 5, 4, (u32 *)create, + cfg_addr, BUID_HI(buid), BUID_LO(buid), + page_shift, window_shift); } while (rtas_busy_delay(ret)); dev_info(>dev, "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " @@ -966,11 +967,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_dbg(>dev, "no free dynamic windows"); goto out_failed; } - if (be32_to_cpu(query.page_size) & 4) { + if (query.page_size & 4) { page_shift = 24; /* 16MB */ - } else if (be32_to_cpu(query.page_size) & 2) { + } else if (query.page_size & 2) { page_shift = 16; /* 64kB */ - } else if (be32_to_cpu(query.page_size) & 1) { + } else if (query.page_size & 1) { page_shift = 12; /* 4kB */ } else { dev_dbg(>dev, "no supported direct page size in mask %x", @@ -980,7 +981,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) /* verify the window * number of ptes will map the partition */ /* check largest block * page size > max memory hotplug addr */ max_addr = memory_hotplug_max(); - if 
(be32_to_cpu(query.largest_available_block) < (max_addr >> page_shift)) { + if (query.largest_available_block < (max_addr >> page_shift)) {
[PATCH v2] powerpc/iommu/ddw: Fix endianness
rtas_call() accepts and returns values in CPU endianness. The ddw_query_response and ddw_create_response structs members are defined and treated as BE but as they are passed to rtas_call() as (u32 *) and they get byteswapped automatically, the data is actually CPU-endian. This fixes ddw_query_response and ddw_create_response definitions and use. of_read_number() is designed to work with device tree cells - it assumes the input is big-endian and returns data in CPU-endian. However due to the ddw_create_response struct fix, create.addr_hi/lo are already CPU-endian so do not byteswap them. ddw_avail is a pointer to the ibm,ddw-applicable property which contains 3 cells which are big-endian as it is a device tree. rtas_call() accepts a RTAS token in CPU-endian. This converts RTAS tokens from big-endian to CPU-endian. Since every token is used once till guest is rebooted, there is no much sense in caching RTAS tokens in CPU-endian. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v2: * updated commit log * fixed definition of ddw_query_response and ddw_create_response --- arch/powerpc/platforms/pseries/iommu.c | 38 ++ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 4642d6a..f052cc8 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -329,16 +329,16 @@ struct direct_window { /* Dynamic DMA Window support */ struct ddw_query_response { - __be32 windows_available; - __be32 largest_available_block; - __be32 page_size; - __be32 migration_capable; + u32 windows_available; + u32 largest_available_block; + u32 page_size; + u32 migration_capable; }; struct ddw_create_response { - __be32 liobn; - __be32 addr_hi; - __be32 addr_lo; + u32 liobn; + u32 addr_hi; + u32 addr_lo; }; static LIST_HEAD(direct_window_list); @@ -750,7 +750,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop) pr_debug(%s 
successfully cleared tces in window.\n, np-full_name); - ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + ret = rtas_call(be32_to_cpu(ddw_avail[2]), 1, 1, NULL, liobn); if (ret) pr_warning(%s: failed to remove direct window: rtas returned %d to ibm,remove-pe-dma-window(%x) %llx\n, @@ -841,7 +841,7 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, cfg_addr = edev-pe_config_addr; buid = edev-phb-buid; - ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, + ret = rtas_call(be32_to_cpu(ddw_avail[0]), 3, 5, (u32 *)query, cfg_addr, BUID_HI(buid), BUID_LO(buid)); dev_info(dev-dev, ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n, ddw_avail[0], cfg_addr, BUID_HI(buid), @@ -872,8 +872,9 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ - ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr, - BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + ret = rtas_call(be32_to_cpu(ddw_avail[1]), 5, 4, (u32 *)create, + cfg_addr, BUID_HI(buid), BUID_LO(buid), + page_shift, window_shift); } while (rtas_busy_delay(ret)); dev_info(dev-dev, ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d @@ -966,11 +967,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_dbg(dev-dev, no free dynamic windows); goto out_failed; } - if (be32_to_cpu(query.page_size) 4) { + if (query.page_size 4) { page_shift = 24; /* 16MB */ - } else if (be32_to_cpu(query.page_size) 2) { + } else if (query.page_size 2) { page_shift = 16; /* 64kB */ - } else if (be32_to_cpu(query.page_size) 1) { + } else if (query.page_size 1) { page_shift = 12; /* 4kB */ } else { dev_dbg(dev-dev, no supported direct page size in mask %x, @@ -980,7 +981,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) /* verify the window * number of ptes will map the partition */ /* check largest block * page size max memory hotplug addr */ max_addr = memory_hotplug_max(); - if 
(be32_to_cpu(query.largest_available_block) < (max_addr >> page_shift)) { + if (query.largest_available_block < (max_addr >> page_shift)) { dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u %llu-sized pages\n", max_addr, query.largest_available_block, 1ULL
[PATCH v2 00/13] powerpc/iommu/vfio: Enable Dynamic DMA windows
This enables PAPR defined feature called Dynamic DMA windows (DDW). Each Partitionable Endpoint (IOMMU group) has a separate DMA window on a PCI bus where devices are allowed to perform DMA. By default there is 1 or 2GB window allocated at the host boot time and these windows are used when an IOMMU group is passed to the userspace (guest). These windows are mapped at zero offset on a PCI bus. Hi-speed devices may suffer from limited size of this window. On the host side a TCE bypass mode is enabled on POWER8 CPU which implements direct mapping of the host memory to a PCI bus at 1<<59. For the guest, PAPR defines a DDW RTAS API which allows the pseries guest to query the hypervisor if it supports DDW and what are the parameters of possible windows. Currently POWER8 supports 2 DMA windows per PE - already mentioned and used small 32bit window and 64bit window which can only start from 1<<59 and can support various page sizes. This patchset reworks PPC IOMMU code and adds necessary structures to extend it to support big windows. When the guest detects the feature and the PE is capable of 64bit DMA, it does: 1. query to hypervisor about number of available windows and page masks; 2. creates a window with the biggest possible page size (current guests can do 64K or 16MB TCEs); 3. maps the entire guest RAM via H_PUT_TCE* hypercalls 4. switches dma_ops to direct_dma_ops on the selected PE. Once this is done, H_PUT_TCE is not called anymore and the guest gets maximum performance. Please comment. Thanks! 
Changes: v2: * added missing __pa() in powerpc/powernv: Release replaced TCE * reposted to make some noise :) Alexey Kardashevskiy (13): powerpc/iommu: Check that TCE page size is equal to it_page_size powerpc/powernv: Make invalidate() a callback powerpc/spapr: vfio: Implement spapr_tce_iommu_ops powerpc/powernv: Convert/move set_bypass() callback to take_ownership() powerpc/iommu: Fix IOMMU ownership control functions powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table powerpc/powernv: Do not set read flag if direction==DMA_NONE powerpc/powernv: Release replaced TCE powerpc/pseries/lpar: Enable VFIO powerpc/powernv: Implement Dynamic DMA windows (DDW) for IODA vfio: powerpc/spapr: Move locked_vm accounting to helpers vfio: powerpc/spapr: Use it_page_size vfio: powerpc/spapr: Enable Dynamic DMA windows arch/powerpc/include/asm/iommu.h| 35 ++- arch/powerpc/include/asm/machdep.h | 25 -- arch/powerpc/include/asm/tce.h | 37 +++ arch/powerpc/kernel/iommu.c | 213 +-- arch/powerpc/kernel/vio.c | 5 +- arch/powerpc/platforms/cell/iommu.c | 9 +- arch/powerpc/platforms/pasemi/iommu.c | 8 +- arch/powerpc/platforms/powernv/pci-ioda.c | 233 +++-- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 4 +- arch/powerpc/platforms/powernv/pci.c| 113 +--- arch/powerpc/platforms/powernv/pci.h| 15 +- arch/powerpc/platforms/pseries/iommu.c | 77 -- arch/powerpc/sysdev/dart_iommu.c| 13 +- drivers/vfio/vfio_iommu_spapr_tce.c | 384 +++- include/uapi/linux/vfio.h | 25 +- 15 files changed, 925 insertions(+), 271 deletions(-) -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 06/13] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table
This adds a iommu_table_ops struct and puts pointer to it into the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush callbacks from ppc_md to the new struct where they really belong to. This adds an extra @ops parameter to iommu_init_table() to make sure that we do not leave any IOMMU table without iommu_table_ops. @it_ops is initialized in the very beginning as iommu_init_table() calls iommu_table_clear() and the latter uses callbacks already. This does s/tce_build/set/, s/tce_free/clear/ and removes tce_ prefixes for better readability. This removes tce_xxx_rm handlers from ppc_md as well but does not add them to iommu_table_ops, this will be done later if we decide to support TCE hypercalls in real mode. This always uses tce_buildmulti_pSeriesLP/tce_buildmulti_pSeriesLP as callbacks for pseries. This changes multi callbacks to fall back to tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not present. The reason for this is we still have to support multitce=off boot parameter in disable_multitce() and we do not want to walk through all IOMMU tables in the system and replace multi callbacks with single ones. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h| 20 +++- arch/powerpc/include/asm/machdep.h | 25 --- arch/powerpc/kernel/iommu.c | 50 - arch/powerpc/kernel/vio.c | 5 ++- arch/powerpc/platforms/cell/iommu.c | 9 -- arch/powerpc/platforms/pasemi/iommu.c | 8 +++-- arch/powerpc/platforms/powernv/pci-ioda.c | 4 +-- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 +- arch/powerpc/platforms/powernv/pci.c| 24 -- arch/powerpc/platforms/powernv/pci.h| 1 + arch/powerpc/platforms/pseries/iommu.c | 42 +--- arch/powerpc/sysdev/dart_iommu.c| 13 12 files changed, 102 insertions(+), 102 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 2b0b01d..c725e4a 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -43,6 +43,22 @@ extern int iommu_is_off; extern int iommu_force_on; +struct iommu_table_ops { + int (*set)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs); + void (*clear)(struct iommu_table *tbl, + long index, long npages); + unsigned long (*get)(struct iommu_table *tbl, long index); + void (*flush)(struct iommu_table *tbl); +}; + +/* These are used by VIO */ +extern struct iommu_table_ops iommu_table_lpar_multi_ops; +extern struct iommu_table_ops iommu_table_pseries_ops; + /* * IOMAP_MAX_ORDER defines the largest contiguous block * of dma space we can get. 
IOMAP_MAX_ORDER = 13 @@ -77,6 +93,7 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif + struct iommu_table_ops *it_ops; }; /* Pure 2^n version of get_order */ @@ -106,7 +123,8 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); * structure */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, - int nid); + int nid, + struct iommu_table_ops *ops); struct spapr_tce_iommu_ops; #ifdef CONFIG_IOMMU_API diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index b125cea..1fc824d 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -65,31 +65,6 @@ struct machdep_calls { * destroyed as well */ void(*hpte_clear_all)(void); - int (*tce_build)(struct iommu_table *tbl, -long index, -long npages, -unsigned long uaddr, -enum dma_data_direction direction, -struct dma_attrs *attrs); - void(*tce_free)(struct iommu_table *tbl, - long index, - long npages); - unsigned long (*tce_get)(struct iommu_table *tbl, - long index); - void(*tce_flush)(struct iommu_table *tbl); - - /* _rm versions are for real mode use only */ - int (*tce_build_rm)(struct iommu_table *tbl, -long index, -long npages, -unsigned long uaddr, -enum dma_data_direction
[PATCH v2 05/13] powerpc/iommu: Fix IOMMU ownership control functions
This adds missing locks in iommu_take_ownership()/ iommu_release_ownership(). This marks all pages busy in iommu_table::it_map in order to catch errors if there is an attempt to use this table while ownership over it is taken. This only clears TCE content if there is no page marked busy in it_map. Clearing must be done outside of the table locks as iommu_clear_tce() called from iommu_clear_tces_and_put_pages() does this. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 36 +--- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c2c8d9d..cd80867 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1126,33 +1126,55 @@ EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); int iommu_take_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl-it_size + 7) 3; + unsigned long flags, i, sz = (tbl-it_size + 7) 3; + int ret = 0, bit0 = 0; + + spin_lock_irqsave(tbl-large_pool.lock, flags); + for (i = 0; i tbl-nr_pools; i++) + spin_lock(tbl-pools[i].lock); if (tbl-it_offset == 0) - clear_bit(0, tbl-it_map); + bit0 = test_and_clear_bit(0, tbl-it_map); if (!bitmap_empty(tbl-it_map, tbl-it_size)) { pr_err(iommu_tce: it_map is not empty); - return -EBUSY; + ret = -EBUSY; + if (bit0) + set_bit(0, tbl-it_map); + } else { + memset(tbl-it_map, 0xff, sz); } - memset(tbl-it_map, 0xff, sz); - iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); + for (i = 0; i tbl-nr_pools; i++) + spin_unlock(tbl-pools[i].lock); + spin_unlock_irqrestore(tbl-large_pool.lock, flags); - return 0; + if (!ret) + iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, + tbl-it_size); + return ret; } EXPORT_SYMBOL_GPL(iommu_take_ownership); void iommu_release_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl-it_size + 7) 3; + unsigned long flags, i, sz = (tbl-it_size + 7) 3; iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); + + 
spin_lock_irqsave(tbl-large_pool.lock, flags); + for (i = 0; i tbl-nr_pools; i++) + spin_lock(tbl-pools[i].lock); + memset(tbl-it_map, 0, sz); /* Restore bit#0 set by iommu_init_table() */ if (tbl-it_offset == 0) set_bit(0, tbl-it_map); + + for (i = 0; i tbl-nr_pools; i++) + spin_unlock(tbl-pools[i].lock); + spin_unlock_irqrestore(tbl-large_pool.lock, flags); } EXPORT_SYMBOL_GPL(iommu_release_ownership); -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 12/13] vfio: powerpc/spapr: Use it_page_size
This makes use of the it_page_size from the iommu_table struct as page size can differ. This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code as recently introduced IOMMU_PAGE_XXX macros do not include IOMMU_PAGE_SHIFT. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 36 ++-- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index c9fac97..0dccbc4 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -36,7 +36,7 @@ static long try_increment_locked_vm(struct iommu_table *tbl) if (!current || !current-mm) return -ESRCH; /* process exited */ - npages = (tbl-it_size IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; + npages = (tbl-it_size tbl-it_page_shift) PAGE_SHIFT; down_write(current-mm-mmap_sem); locked = current-mm-locked_vm + npages; @@ -60,7 +60,7 @@ static void decrement_locked_vm(struct iommu_table *tbl) if (!current || !current-mm) return; /* process exited */ - npages = (tbl-it_size IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; + npages = (tbl-it_size tbl-it_page_shift) PAGE_SHIFT; down_write(current-mm-mmap_sem); if (npages current-mm-locked_vm) @@ -284,8 +284,8 @@ static long tce_iommu_ioctl(void *iommu_data, if (info.argsz minsz) return -EINVAL; - info.dma32_window_start = tbl-it_offset IOMMU_PAGE_SHIFT_4K; - info.dma32_window_size = tbl-it_size IOMMU_PAGE_SHIFT_4K; + info.dma32_window_start = tbl-it_offset tbl-it_page_shift; + info.dma32_window_size = tbl-it_size tbl-it_page_shift; info.flags = 0; if (copy_to_user((void __user *)arg, info, minsz)) @@ -318,10 +318,6 @@ static long tce_iommu_ioctl(void *iommu_data, VFIO_DMA_MAP_FLAG_WRITE)) return -EINVAL; - if ((param.size ~IOMMU_PAGE_MASK_4K) || - (param.vaddr ~IOMMU_PAGE_MASK_4K)) - return -EINVAL; - /* iova is checked by the IOMMU API */ tce = param.vaddr; if (param.flags VFIO_DMA_MAP_FLAG_READ) @@ -334,21 +330,25 @@ static long 
tce_iommu_ioctl(void *iommu_data, return -ENXIO; BUG_ON(!tbl-it_group); + if ((param.size ~IOMMU_PAGE_MASK(tbl)) || + (param.vaddr ~IOMMU_PAGE_MASK(tbl))) + return -EINVAL; + ret = iommu_tce_put_param_check(tbl, param.iova, tce); if (ret) return ret; - for (i = 0; i (param.size IOMMU_PAGE_SHIFT_4K); ++i) { + for (i = 0; i (param.size tbl-it_page_shift); ++i) { ret = iommu_put_tce_user_mode(tbl, - (param.iova IOMMU_PAGE_SHIFT_4K) + i, + (param.iova tbl-it_page_shift) + i, tce); if (ret) break; - tce += IOMMU_PAGE_SIZE_4K; + tce += IOMMU_PAGE_SIZE(tbl); } if (ret) iommu_clear_tces_and_put_pages(tbl, - param.iova IOMMU_PAGE_SHIFT_4K, i); + param.iova tbl-it_page_shift, i); iommu_flush_tce(tbl); @@ -379,23 +379,23 @@ static long tce_iommu_ioctl(void *iommu_data, if (param.flags) return -EINVAL; - if (param.size ~IOMMU_PAGE_MASK_4K) - return -EINVAL; - tbl = spapr_tce_find_table(container, data, param.iova); if (!tbl) return -ENXIO; + if (param.size ~IOMMU_PAGE_MASK(tbl)) + return -EINVAL; + BUG_ON(!tbl-it_group); ret = iommu_tce_clear_param_check(tbl, param.iova, 0, - param.size IOMMU_PAGE_SHIFT_4K); + param.size tbl-it_page_shift); if (ret) return ret; ret = iommu_clear_tces_and_put_pages(tbl, - param.iova IOMMU_PAGE_SHIFT_4K, - param.size IOMMU_PAGE_SHIFT_4K); + param.iova tbl-it_page_shift, + param.size tbl-it_page_shift); iommu_flush_tce(tbl); return ret; -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel
[PATCH v2 09/13] powerpc/pseries/lpar: Enable VFIO
The previous patch introduced iommu_table_ops::exchange() callback which effectively disabled VFIO on pseries. This implements exchange() for pseries/lpar so VFIO can work in nested guests. Since exchaange() callback returns an old TCE, it has to call H_GET_TCE for every TCE being put to the table so VFIO performance in guests running under PR KVM is expected to be slower than in guests running under HV KVM or bare metal hosts. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/pseries/iommu.c | 25 +++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 9a7364f..ae15b5a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -138,13 +138,14 @@ static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, + unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { u64 rc = 0; u64 proto_tce, tce; u64 rpn; - int ret = 0; + int ret = 0, i = 0; long tcenum_start = tcenum, npages_start = npages; rpn = __pa(uaddr) TCE_SHIFT; @@ -154,6 +155,9 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, while (npages--) { tce = proto_tce | (rpn TCE_RPN_MASK) TCE_RPN_SHIFT; + if (old_tces) + plpar_tce_get((u64)tbl-it_index, (u64)tcenum 12, + old_tces[i++]); rc = plpar_tce_put((u64)tbl-it_index, (u64)tcenum 12, tce); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { @@ -179,8 +183,9 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, static DEFINE_PER_CPU(__be64 *, tce_page); -static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +static int tce_xchg_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, +unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { @@ -195,6 
+200,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + old_tces, direction, attrs); } @@ -211,6 +217,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if (!tcep) { local_irq_restore(flags); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + old_tces, direction, attrs); } __get_cpu_var(tce_page) = tcep; @@ -232,6 +239,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, for (l = 0; l limit; l++) { tcep[l] = cpu_to_be64(proto_tce | (rpn TCE_RPN_MASK) TCE_RPN_SHIFT); rpn++; + if (old_tces) + plpar_tce_get((u64)tbl-it_index, + (u64)(tcenum + l) 12, + old_tces[tcenum + l]); } rc = plpar_tce_put_indirect((u64)tbl-it_index, @@ -262,6 +273,15 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } +static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +long npages, unsigned long uaddr, +enum dma_data_direction direction, +struct dma_attrs *attrs) +{ + return tce_xchg_pSeriesLP(tbl, tcenum, npages, uaddr, NULL, + direction, attrs); +} + static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) { u64 rc; @@ -637,6 +657,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) struct iommu_table_ops iommu_table_lpar_multi_ops = { .set = tce_buildmulti_pSeriesLP, + .exchange = tce_xchg_pSeriesLP, .clear = tce_freemulti_pSeriesLP, .get = tce_get_pSeriesLP }; -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http
[PATCH v2 08/13] powerpc/powernv: Release replaced TCE
At the moment writing new TCE value to the IOMMU table fails with EBUSY if there is a valid entry already. However PAPR specification allows the guest to write new TCE value without clearing it first. Another problem this patch is addressing is the use of pool locks for external IOMMU users such as VFIO. The pool locks are to protect DMA page allocator rather than entries and since the host kernel does not control what pages are in use, there is no point in pool locks and exchange()+put_page(oldtce) is sufficient to avoid possible races. This adds an exchange() callback to iommu_table_ops which does the same thing as set() plus it returns replaced TCE(s) so the caller can release the pages afterwards. This makes iommu_tce_build() put pages returned by exchange(). This replaces iommu_clear_tce() with iommu_tce_build which now can call exchange() with TCE==NULL (i.e. clear). This preserves permission bits in TCE in iommu_put_tce_user_mode(). This removes use of pool locks for external IOMMU uses. This disables external IOMMU use (i.e. VFIO) for IOMMUs which do not implement exchange() callback. Therefore the powernv platform is the only supported one after this patch. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v2: * added missing __pa() for TCE which was read from the table --- arch/powerpc/include/asm/iommu.h | 8 +++-- arch/powerpc/kernel/iommu.c | 62 arch/powerpc/platforms/powernv/pci.c | 40 +++ 3 files changed, 67 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index c725e4a..8e0537d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -49,6 +49,12 @@ struct iommu_table_ops { unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs); + int (*exchange)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + unsigned long *old_tces, + enum dma_data_direction direction, + struct dma_attrs *attrs); void (*clear)(struct iommu_table *tbl, long index, long npages); unsigned long (*get)(struct iommu_table *tbl, long index); @@ -209,8 +215,6 @@ extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long hwaddr, enum dma_data_direction direction); -extern unsigned long iommu_clear_tce(struct iommu_table *tbl, - unsigned long entry); extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages); extern int iommu_put_tce_user_mode(struct iommu_table *tbl, diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 678fee8..39ccce7 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1006,43 +1006,11 @@ int iommu_tce_put_param_check(struct iommu_table *tbl, } EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) -{ - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - - spin_lock((pool-lock)); - - oldtce = tbl-it_ops-get(tbl, entry); - if (oldtce (TCE_PCI_WRITE | 
TCE_PCI_READ)) - tbl-it_ops-clear(tbl, entry, 1); - else - oldtce = 0; - - spin_unlock((pool-lock)); - - return oldtce; -} -EXPORT_SYMBOL_GPL(iommu_clear_tce); - int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages) { - unsigned long oldtce; - struct page *page; - for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) - continue; - - page = pfn_to_page(oldtce PAGE_SHIFT); - WARN_ON(!page); - if (page) { - if (oldtce TCE_PCI_WRITE) - SetPageDirty(page); - put_page(page); - } + iommu_tce_build(tbl, entry, 0, DMA_NONE); } return 0; @@ -1056,18 +1024,19 @@ EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long hwaddr, enum dma_data_direction direction) { - int ret = -EBUSY; + int ret; unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - spin_lock((pool-lock)); + ret = tbl-it_ops-exchange(tbl, entry, 1, hwaddr, oldtce, + direction, NULL
[PATCH v2 11/13] vfio: powerpc/spapr: Move locked_vm accounting to helpers
There moves locked pages accounting to helpers. Later they will be reused for Dynamic DMA windows (DDW). While we are here, update the comment explaining why RLIMIT_MEMLOCK might be required to be bigger than the guest RAM. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 71 +++-- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 1c1a9c4..c9fac97 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -29,6 +29,46 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); +static long try_increment_locked_vm(struct iommu_table *tbl) +{ + long ret = 0, locked, lock_limit, npages; + + if (!current || !current-mm) + return -ESRCH; /* process exited */ + + npages = (tbl-it_size IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; + + down_write(current-mm-mmap_sem); + locked = current-mm-locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; + if (locked lock_limit !capable(CAP_IPC_LOCK)) { + pr_warn(RLIMIT_MEMLOCK (%ld) exceeded\n, + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + current-mm-locked_vm += npages; + } + up_write(current-mm-mmap_sem); + + return ret; +} + +static void decrement_locked_vm(struct iommu_table *tbl) +{ + long npages; + + if (!current || !current-mm) + return; /* process exited */ + + npages = (tbl-it_size IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; + + down_write(current-mm-mmap_sem); + if (npages current-mm-locked_vm) + npages = current-mm-locked_vm; + current-mm-locked_vm -= npages; + up_write(current-mm-mmap_sem); +} + /* * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation * @@ -86,7 +126,6 @@ static void tce_iommu_take_ownership_notify(struct spapr_tce_iommu_group *data, static int tce_iommu_enable(struct tce_container *container) { int ret = 0; - unsigned long locked, lock_limit, npages; struct iommu_table *tbl; struct spapr_tce_iommu_group 
*data; @@ -120,24 +159,23 @@ static int tce_iommu_enable(struct tce_container *container) * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, * that would effectively kill the guest at random points, much better * enforcing the limit based on the max that the guest can map. +* +* Unfortunately at the moment it counts whole tables, no matter how +* much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups +* each with 2GB DMA window, 8GB will be counted here. The reason for +* this is that we cannot tell here the amount of RAM used by the guest +* as this information is only available from KVM and VFIO is +* KVM agnostic. */ tbl = data-ops-get_table(data, 0); if (!tbl) return -ENXIO; - down_write(current-mm-mmap_sem); - npages = (tbl-it_size IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; - locked = current-mm-locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; - if (locked lock_limit !capable(CAP_IPC_LOCK)) { - pr_warn(RLIMIT_MEMLOCK (%ld) exceeded\n, - rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - } else { - current-mm-locked_vm += npages; - container-enabled = true; - } - up_write(current-mm-mmap_sem); + ret = try_increment_locked_vm(tbl); + if (ret) + return ret; + + container-enabled = true; return ret; } @@ -163,10 +201,7 @@ static void tce_iommu_disable(struct tce_container *container) if (!tbl) return; - down_write(current-mm-mmap_sem); - current-mm-locked_vm -= (tbl-it_size - IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; - up_write(current-mm-mmap_sem); + decrement_locked_vm(tbl); } static void *tce_iommu_open(unsigned long arg) -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 13/13] vfio: powerpc/spapr: Enable Dynamic DMA windows
This defines and implements VFIO IOMMU API which lets the userspace create and remove DMA windows. This updates VFIO_IOMMU_SPAPR_TCE_GET_INFO to return the number of available windows and page mask. This adds VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE to allow the user space to create and remove window(s). The VFIO IOMMU driver does basic sanity checks and calls corresponding SPAPR TCE functions. At the moment only IODA2 (POWER8 PCI host bridge) implements them. This advertises VFIO_IOMMU_SPAPR_TCE_FLAG_DDW capability via VFIO_IOMMU_SPAPR_TCE_GET_INFO. This calls platform DDW reset() callback when IOMMU is being disabled to reset the DMA configuration to its original state. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 135 ++-- include/uapi/linux/vfio.h | 25 ++- 2 files changed, 153 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 0dccbc4..b518891 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -190,18 +190,25 @@ static void tce_iommu_disable(struct tce_container *container) container-enabled = false; - if (!container-grp || !current-mm) + if (!container-grp) return; data = iommu_group_get_iommudata(container-grp); if (!data || !data-iommu_owner || !data-ops-get_table) return; - tbl = data-ops-get_table(data, 0); - if (!tbl) - return; + if (current-mm) { + tbl = data-ops-get_table(data, 0); + if (tbl) + decrement_locked_vm(tbl); - decrement_locked_vm(tbl); + tbl = data-ops-get_table(data, 1); + if (tbl) + decrement_locked_vm(tbl); + } + + if (data-ops-reset) + data-ops-reset(data); } static void *tce_iommu_open(unsigned long arg) @@ -243,7 +250,7 @@ static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { struct tce_container *container = iommu_data; - unsigned long minsz; + unsigned long minsz, ddwsz; long ret; switch (cmd) { @@ -288,6 +295,28 @@ static 
long tce_iommu_ioctl(void *iommu_data, info.dma32_window_size = tbl-it_size tbl-it_page_shift; info.flags = 0; + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, + page_size_mask); + + if (info.argsz == ddwsz) { + if (data-ops-query data-ops-create + data-ops-remove) { + info.flags |= VFIO_IOMMU_SPAPR_TCE_FLAG_DDW; + + ret = data-ops-query(data, + info.current_windows, + info.windows_available, + info.page_size_mask); + if (ret) + return ret; + } else { + info.current_windows = 0; + info.windows_available = 0; + info.page_size_mask = 0; + } + minsz = ddwsz; + } + if (copy_to_user((void __user *)arg, info, minsz)) return -EFAULT; @@ -412,12 +441,106 @@ static long tce_iommu_ioctl(void *iommu_data, tce_iommu_disable(container); mutex_unlock(container-lock); return 0; + case VFIO_EEH_PE_OP: if (!container-grp) return -ENODEV; return vfio_spapr_iommu_eeh_ioctl(container-grp, cmd, arg); + + case VFIO_IOMMU_SPAPR_TCE_CREATE: { + struct vfio_iommu_spapr_tce_create create; + struct spapr_tce_iommu_group *data; + struct iommu_table *tbl; + + if (WARN_ON(!container-grp)) + return -ENXIO; + + data = iommu_group_get_iommudata(container-grp); + + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, + start_addr); + + if (copy_from_user(create, (void __user *)arg, minsz)) + return -EFAULT; + + if (create.argsz minsz) + return -EINVAL; + + if (create.flags) + return -EINVAL; + + if (!data-ops-create || !data-iommu_owner) + return -ENOSYS; + + BUG_ON(!data || !data-ops || !data-ops-remove
[PATCH v2 07/13] powerpc/powernv: Do not set read flag if direction==DMA_NONE
Normally a bitmap from the iommu_table is used to track what TCE entry is in use. Since we are going to use iommu_table without its locks and do xchg() instead, it becomes essential not to put bits which are not implied in the direction flag. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index deddcad..ab79e2d 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -628,10 +628,18 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, __be64 *tcep, *tces; u64 rpn; - proto_tce = TCE_PCI_READ; // Read allowed - - if (direction != DMA_TO_DEVICE) - proto_tce |= TCE_PCI_WRITE; + switch (direction) { + case DMA_BIDIRECTIONAL: + case DMA_FROM_DEVICE: + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; + break; + case DMA_TO_DEVICE: + proto_tce = TCE_PCI_READ; + break; + default: + proto_tce = 0; + break; + } tces = tcep = ((__be64 *)tbl-it_base) + index - tbl-it_offset; rpn = __pa(uaddr) tbl-it_page_shift; -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 10/13] powerpc/powernv: Implement Dynamic DMA windows (DDW) for IODA
SPAPR defines an interface to create additional DMA windows dynamically. Dynamically means that the window is not allocated before the guest even started, the guest can request it later. In practice, existing linux guests check for the capability and if it is there, they create and map a DMA window as big as the entire guest RAM. This adds 4 callbacks to the spapr_tce_iommu_ops struct: 1. query - ibm,query-pe-dma-window - returns number/size of windows which can be created (one, any page size); 2. create - ibm,create-pe-dma-window - creates a window; 3. remove - ibm,remove-pe-dma-window - removes a window; removing the default 32bit window is not allowed by this patch, this will be added later if needed; 4. reset - ibm,reset-pe-dma-window - reset the DMA windows configuration to the default state; as the default window cannot be removed, it only removes the additional window if it was created. The next patch will add corresponding ioctls to VFIO SPAPR TCE driver to provide necessary support to the userspace. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/tce.h| 22 + arch/powerpc/platforms/powernv/pci-ioda.c | 159 +- arch/powerpc/platforms/powernv/pci.h | 1 + 3 files changed, 181 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index e6355f9..23b0362 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -58,6 +58,28 @@ struct spapr_tce_iommu_ops { int num); void (*take_ownership)(struct spapr_tce_iommu_group *data, bool enable); + + /* Dynamic DMA window */ + /* Page size flags for ibm,query-pe-dma-window */ +#define DDW_PGSIZE_4K 0x01 +#define DDW_PGSIZE_64K 0x02 +#define DDW_PGSIZE_16M 0x04 +#define DDW_PGSIZE_32M 0x08 +#define DDW_PGSIZE_64M 0x10 +#define DDW_PGSIZE_128M 0x20 +#define DDW_PGSIZE_256M 0x40 +#define DDW_PGSIZE_16G 0x80 + long (*query)(struct spapr_tce_iommu_group *data, + __u32 *current_windows, + __u32 *windows_available, + __u32 *page_size_mask); + long (*create)(struct spapr_tce_iommu_group *data, + __u32 page_shift, + __u32 window_shift, + struct iommu_table **ptbl); + long (*remove)(struct spapr_tce_iommu_group *data, + struct iommu_table *tbl); + long (*reset)(struct spapr_tce_iommu_group *data); }; struct spapr_tce_iommu_group { diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 296f49b..a6318cb 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1154,6 +1154,26 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pnv_pci_ioda2_set_bypass(pe, true); } +static struct iommu_table *pnv_ioda2_iommu_get_table( + struct spapr_tce_iommu_group *data, + int num) +{ + struct pnv_ioda_pe *pe = data-iommu_owner; + + switch (num) { + case 0: + if (pe-tce32.table.it_size) + return pe-tce32.table; + return NULL; + case 1: + if (pe-tce64.table.it_size) + return pe-tce64.table; + return NULL; + default: + return NULL; + } +} 
+ static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, bool enable) { @@ -1162,9 +1182,146 @@ static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, pnv_pci_ioda2_set_bypass(pe, !enable); } +static long pnv_pci_ioda2_ddw_query(struct spapr_tce_iommu_group *data, + __u32 *current_windows, + __u32 *windows_available, __u32 *page_size_mask) +{ + struct pnv_ioda_pe *pe = data-iommu_owner; + + *windows_available = 2; + *current_windows = 0; + if (pe-tce32.table.it_size) { + --*windows_available; + ++*current_windows; + } + if (pe-tce64.table.it_size) { + --*windows_available; + ++*current_windows; + } + *page_size_mask = + DDW_PGSIZE_4K | + DDW_PGSIZE_64K | + DDW_PGSIZE_16M; + + return 0; +} + +static long pnv_pci_ioda2_ddw_create(struct spapr_tce_iommu_group *data, + __u32 page_shift, __u32 window_shift, + struct iommu_table **ptbl) +{ + struct pnv_ioda_pe *pe = data-iommu_owner; + struct pnv_phb *phb = pe-phb; + struct page *tce_mem = NULL; + void *addr; + long ret; + unsigned long tce_table_size = + (1ULL (window_shift - page_shift)) * 8; + unsigned
[PATCH v2 04/13] powerpc/powernv: Convert/move set_bypass() callback to take_ownership()
At the moment the iommu_table struct has a set_bypass() which enables/ disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code which calls this callback when external IOMMU users such as VFIO are about to get over a PHB. Since the set_bypass() is not really an iommu_table function but PE's function, and we have an ops struct per IOMMU owner, let's move set_bypass() to the spapr_tce_iommu_ops struct. As arch/powerpc/kernel/iommu.c is more about POWERPC IOMMU tables and has very little to do with PEs, this moves take_ownership() calls to the VFIO SPAPR TCE driver. This renames set_bypass() to take_ownership() as it is not necessarily just enabling bypassing, it can be something else/more so let's give it a generic name. The bool parameter is inverted. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/iommu.h | 1 - arch/powerpc/include/asm/tce.h| 2 ++ arch/powerpc/kernel/iommu.c | 12 arch/powerpc/platforms/powernv/pci-ioda.c | 20 drivers/vfio/vfio_iommu_spapr_tce.c | 16 5 files changed, 30 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 84ee339..2b0b01d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -77,7 +77,6 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif - void (*set_bypass)(struct iommu_table *tbl, bool enable); }; /* Pure 2^n version of get_order */ diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index 9f159eb..e6355f9 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -56,6 +56,8 @@ struct spapr_tce_iommu_ops { struct iommu_table *(*get_table)( struct spapr_tce_iommu_group *data, int num); + void (*take_ownership)(struct spapr_tce_iommu_group *data, + bool enable); }; struct spapr_tce_iommu_group { diff --git a/arch/powerpc/kernel/iommu.c 
b/arch/powerpc/kernel/iommu.c index 1c5dae7..c2c8d9d 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1139,14 +1139,6 @@ int iommu_take_ownership(struct iommu_table *tbl) memset(tbl-it_map, 0xff, sz); iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); - /* -* Disable iommu bypass, otherwise the user can DMA to all of -* our physical memory via the bypass window instead of just -* the pages that has been explicitly mapped into the iommu -*/ - if (tbl-set_bypass) - tbl-set_bypass(tbl, false); - return 0; } EXPORT_SYMBOL_GPL(iommu_take_ownership); @@ -1161,10 +1153,6 @@ void iommu_release_ownership(struct iommu_table *tbl) /* Restore bit#0 set by iommu_init_table() */ if (tbl-it_offset == 0) set_bit(0, tbl-it_map); - - /* The kernel owns the device now, we can restore the iommu bypass */ - if (tbl-set_bypass) - tbl-set_bypass(tbl, true); } EXPORT_SYMBOL_GPL(iommu_release_ownership); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2d32a1c..8cb2f31 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1105,10 +1105,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } -static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32.table); uint16_t window_id = (pe-pe_number 1 ) + 1; int64_t rc; @@ -1136,7 +1134,7 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) * host side. 
*/ if (pe-pdev) - set_iommu_table_base(pe-pdev-dev, tbl); + set_iommu_table_base(pe-pdev-dev, pe-tce32.table); else pnv_ioda_setup_bus_dma(pe, pe-pbus, false); } @@ -1152,15 +1150,21 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, /* TVE #1 is selected by PCI address bit 59 */ pe-tce_bypass_base = 1ull 59; - /* Install set_bypass callback for VFIO */ - pe-tce32.table.set_bypass = pnv_pci_ioda2_set_bypass; - /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(pe-tce32.table, true); + pnv_pci_ioda2_set_bypass(pe, true); +} + +static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, +bool
[PATCH v2 02/13] powerpc/powernv: Make invalidate() a callback
At the moment pnv_pci_ioda_tce_invalidate() gets the PE pointer via container_of(tbl). Since we are going to have to add Dynamic DMA windows and that means having 2 IOMMU tables per PE, this is not going to work. This implements pnv_pci_ioda(1|2)_tce_invalidate as a pnv_ioda_pe callback. This adds a pnv_iommu_table wrapper around iommu_table and stores a pointer to PE there. PNV's ppc_md.tce_build() call uses this to find PE and do the invalidation. This will be used later for Dynamic DMA windows too. This registers invalidate() callbacks for IODA1 and IODA2: - pnv_pci_ioda1_tce_invalidate; - pnv_pci_ioda2_tce_invalidate. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v4: * changed commit log to explain why this change is needed --- arch/powerpc/platforms/powernv/pci-ioda.c | 35 --- arch/powerpc/platforms/powernv/pci.c | 31 --- arch/powerpc/platforms/powernv/pci.h | 13 +++- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index df241b1..136e765 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -857,7 +857,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = phb-ioda.pe_array[pdn-pe_number]; WARN_ON(get_dma_ops(pdev-dev) != dma_iommu_ops); - set_iommu_table_base_and_group(pdev-dev, pe-tce32_table); + set_iommu_table_base_and_group(pdev-dev, pe-tce32.table); } static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, @@ -884,7 +884,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, } else { dev_info(pdev-dev, Using 32-bit DMA via iommu\n); set_dma_ops(pdev-dev, dma_iommu_ops); - set_iommu_table_base(pdev-dev, pe-tce32_table); + set_iommu_table_base(pdev-dev, pe-tce32.table); } *pdev-dev.dma_mask = dma_mask; return 0; @@ -899,9 +899,9 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, list_for_each_entry(dev, bus-devices, bus_list) { if 
(add_to_iommu_group) set_iommu_table_base_and_group(dev-dev, - pe-tce32_table); + pe-tce32.table); else - set_iommu_table_base(dev-dev, pe-tce32_table); + set_iommu_table_base(dev-dev, pe-tce32.table); if (dev-subordinate) pnv_ioda_setup_bus_dma(pe, dev-subordinate, @@ -988,19 +988,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, } } -void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, -__be64 *startp, __be64 *endp, bool rm) -{ - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); - struct pnv_phb *phb = pe-phb; - - if (phb-type == PNV_PHB_IODA1) - pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); - else - pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); -} - static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe, unsigned int base, unsigned int segs) @@ -1058,9 +1045,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } /* Setup linux iommu table */ - tbl = pe-tce32_table; + tbl = pe-tce32.table; pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, base 28, IOMMU_PAGE_SHIFT_4K); + pe-tce32.pe = pe; + pe-tce32.invalidate_fn = pnv_pci_ioda1_tce_invalidate; /* OPAL variant of P7IOC SW invalidated TCEs */ swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); @@ -1097,7 +1086,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) { struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); + tce32.table); uint16_t window_id = (pe-pe_number 1 ) + 1; int64_t rc; @@ -1142,10 +1131,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pe-tce_bypass_base = 1ull 59; /* Install set_bypass callback for VFIO */ - pe-tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; + pe-tce32.table.set_bypass = pnv_pci_ioda2_set_bypass; /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(pe-tce32_table, true); + 
pnv_pci_ioda2_set_bypass(&pe->tce32.table, true); } static void
[PATCH v2 03/13] powerpc/spapr: vfio: Implement spapr_tce_iommu_ops
Modern IBM POWERPC systems support multiple IOMMU tables per PE so we need a more reliable way (compared to container_of()) to get a PE pointer from the iommu_table struct pointer used in IOMMU functions. At the moment IOMMU group data points to an iommu_table struct. This introduces a spapr_tce_iommu_group struct which keeps an iommu_owner and a spapr_tce_iommu_ops struct. For IODA, iommu_owner is a pointer to the pnv_ioda_pe struct, for others it is still a pointer to the iommu_table struct. The ops structs correspond to the type which iommu_owner points to. This defines a get_table() callback which returns an iommu_table by its number. As the IOMMU group data pointer points to a variable type instead of iommu_table, VFIO SPAPR TCE driver is updated to use the new type. This changes the tce_container struct to store iommu_group instead of iommu_table. So, it was: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to iommu_table via iommu_group_get_iommudata(); now it is: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to spapr_tce_iommu_group via iommu_group_get_iommudata(); - spapr_tce_iommu_group points to either (depending on .get_table()): - iommu_table; - pnv_ioda_pe; This uses pnv_ioda1_iommu_get_table for both IODA1 and IODA2 but IODA2 will have its own pnv_ioda2_iommu_get_table soon and pnv_ioda1_iommu_get_table will only be used for IODA1. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h| 6 ++ arch/powerpc/include/asm/tce.h | 13 +++ arch/powerpc/kernel/iommu.c | 35 ++- arch/powerpc/platforms/powernv/pci-ioda.c | 31 +- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 1 + arch/powerpc/platforms/powernv/pci.c| 2 +- arch/powerpc/platforms/pseries/iommu.c | 10 +- drivers/vfio/vfio_iommu_spapr_tce.c | 148 ++-- 8 files changed, 208 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 42632c7..84ee339 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -108,13 +108,19 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); + +struct spapr_tce_iommu_ops; #ifdef CONFIG_IOMMU_API extern void iommu_register_group(struct iommu_table *tbl, +void *iommu_owner, +struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); #else static inline void iommu_register_group(struct iommu_table *tbl, + void *iommu_owner, + struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num) { diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index 743f36b..9f159eb 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -50,5 +50,18 @@ #define TCE_PCI_READ 0x1 /* read from PCI allowed */ #define TCE_VB_WRITE 0x1 /* write from VB allowed */ +struct spapr_tce_iommu_group; + +struct spapr_tce_iommu_ops { + struct iommu_table *(*get_table)( + struct spapr_tce_iommu_group *data, + int num); +}; + +struct spapr_tce_iommu_group { + void *iommu_owner; + struct spapr_tce_iommu_ops *ops; +}; + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TCE_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c 
index b378f78..1c5dae7 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -878,24 +878,53 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, */ static void group_release(void *iommu_data) { - struct iommu_table *tbl = iommu_data; - tbl-it_group = NULL; + kfree(iommu_data); } +static struct iommu_table *spapr_tce_default_get_table( + struct spapr_tce_iommu_group *data, int num) +{ + struct iommu_table *tbl = data-iommu_owner; + + switch (num) { + case 0: + if (tbl-it_size) + return tbl; + /* fallthru */ + default: + return NULL; + } +} + +static struct spapr_tce_iommu_ops spapr_tce_default_ops = { + .get_table = spapr_tce_default_get_table +}; + void iommu_register_group(struct iommu_table *tbl, + void *iommu_owner, struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long
[PATCH v2 01/13] powerpc/iommu: Check that TCE page size is equal to it_page_size
This checks that the TCE table page size is not bigger that the size of a page we just pinned and going to put its physical address to the table. Otherwise the hardware gets unwanted access to physical memory between the end of the actual page and the end of the aligned up TCE page. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 28 +--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index a10642a..b378f78 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -38,6 +38,7 @@ #include linux/pci.h #include linux/iommu.h #include linux/sched.h +#include linux/hugetlb.h #include asm/io.h #include asm/prom.h #include asm/iommu.h @@ -1059,16 +1060,37 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, tce, entry tbl-it_page_shift, ret); */ return -EFAULT; } + + /* +* Check that the TCE table granularity is not bigger than the size of +* a page we just found. Otherwise the hardware can get access to +* a bigger memory chunk that it should. 
+*/ + if (PageHuge(page)) { + struct page *head = compound_head(page); + long shift = PAGE_SHIFT + compound_order(head); + + if (shift tbl-it_page_shift) { + ret = -EINVAL; + goto put_page_exit; + } + + } + hwaddr = (unsigned long) page_address(page) + offset; ret = iommu_tce_build(tbl, entry, hwaddr, direction); if (ret) - put_page(page); + goto put_page_exit; - if (ret 0) - pr_err(iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n, + return 0; + +put_page_exit: + pr_err(iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n, __func__, entry tbl-it_page_shift, tce, ret); + put_page(page); + return ret; } EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); -- 2.0.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] powerpc/iommu/ddw: Fix endianness
On 09/15/2014 07:18 PM, Michael Ellerman wrote: > On Mon, 2014-09-15 at 18:41 +1000, Alexey Kardashevskiy wrote: >> On 09/09/2014 09:22 PM, Alexey Kardashevskiy wrote: >>> rtas_call() accepts and returns values in CPU endianness. > > Sounds right. > >>> of_read_number() accepts big-endian values but create.addr_hi/lo returned >>> by rtas_call() are in CPU endiannes. > > Also sounds right. > >>> The dynamic_dma_window_prop struct defines all members as BE so let's >>> make it true. > > It does. But why does it do that? It seems to be allocated and setup in > enable_ddw() and then the only place I see it used is in > tce_setrange_multi_pSeriesLP()/tce_clearrange_multi_pSeriesLP(), which both > unpack it again. What am I missing? I do not know why they are BE. I just know that create_ddw() returns ddw_create_response struct which members are declared as BE but they are not as rtas_call() already made them CPU-endian. May be rtas_call() must not be used for structs. Or these structs must be fixed to be CPU endian. Cannot choose what/how to fix here. Sure I still can miss something here and it is all correct and I have to fix QEMU. >>> struct dynamic_dma_window_prop { >>> __be32 liobn; /* tce table number */ >>> __be64 dma_base; /* address hi,lo */ >>> __be32 tce_shift; /* ilog2(tce_page_size) */ >>> __be32 window_shift; /* ilog2(tce_window_size) */ >>> }; > > We do read them from the device tree in find_existing_ddw_windows(), but if > that's the only place then the conversion to cpu endian should happen there. enable_ddw() kmalloc's ddwprop which is of this dynamic_dma_window_prop type. Then enable_ddw() initializes properties of that ddwprop thing but does it incorrectly. Then it calls walk_system_ram_range() which eventually calls tce_setrange_multi_pSeriesLP() to map every single page of guest's ram, the dynamic_dma_window_prop struct pointer is a void* argument of that callback. 
find_existing_ddw_windows() handles something called "linux,direct64-ddr-window-info" which is not a part of DDW at all and it is not from PAPR and this patch is not about it. >>> diff --git a/arch/powerpc/platforms/pseries/iommu.c >>> b/arch/powerpc/platforms/pseries/iommu.c >>> index 7c1d77c..700020a 100644 >>> --- a/arch/powerpc/platforms/pseries/iommu.c >>> +++ b/arch/powerpc/platforms/pseries/iommu.c >>> @@ -750,7 +750,7 @@ static void remove_ddw(struct device_node *np, bool >>> remove_prop) >>> pr_debug("%s successfully cleared tces in window.\n", >>> np->full_name); >>> >>> - ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); >>> + ret = rtas_call(be32_to_cpu(ddw_avail[2]), 1, 1, NULL, liobn); > > The conversion should happen once where ever ddw_avail comes out of the device > tree, rather than everywhere it's used. ddw_avail is a pointer to device tree property value: ddw_avail = of_get_property(np, "ibm,ddw-applicable", ); This contains 3 tokens, only 2 of them are actually used by the DDW code in its current state and each of them is used just once in the guest's lifetime. Older guest kernels would use the "reset" extension token but again - only once. I fail to see the point in caching CPU-endian values of these tokens. > > cheers > > > -- Alexey -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] powerpc/iommu/ddw: Fix endianness
On 09/09/2014 09:22 PM, Alexey Kardashevskiy wrote: > rtas_call() accepts and returns values in CPU endianness. > > of_read_number() accepts big-endian values but create.addr_hi/lo returned > by rtas_call() are in CPU endiannes. > > The dynamic_dma_window_prop struct defines all members as BE so let's > make it true. > > struct dynamic_dma_window_prop { > __be32 liobn; /* tce table number */ > __be64 dma_base; /* address hi,lo */ > __be32 tce_shift; /* ilog2(tce_page_size) */ > __be32 window_shift; /* ilog2(tce_window_size) */ > }; > > Cc: Benjamin Herrenschmidt > Cc: Alexander Graf > Signed-off-by: Alexey Kardashevskiy Ping, anyone? > --- > arch/powerpc/platforms/pseries/iommu.c | 24 +--- > 1 file changed, 13 insertions(+), 11 deletions(-) > > diff --git a/arch/powerpc/platforms/pseries/iommu.c > b/arch/powerpc/platforms/pseries/iommu.c > index 7c1d77c..700020a 100644 > --- a/arch/powerpc/platforms/pseries/iommu.c > +++ b/arch/powerpc/platforms/pseries/iommu.c > @@ -750,7 +750,7 @@ static void remove_ddw(struct device_node *np, bool > remove_prop) > pr_debug("%s successfully cleared tces in window.\n", >np->full_name); > > - ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); > + ret = rtas_call(be32_to_cpu(ddw_avail[2]), 1, 1, NULL, liobn); > if (ret) > pr_warning("%s: failed to remove direct window: rtas returned " > "%d to ibm,remove-pe-dma-window(%x) %llx\n", > @@ -842,7 +842,7 @@ static int query_ddw(struct pci_dev *dev, const u32 > *ddw_avail, > cfg_addr = edev->pe_config_addr; > buid = edev->phb->buid; > > - ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, > + ret = rtas_call(be32_to_cpu(ddw_avail[0]), 3, 5, (u32 *)query, > cfg_addr, BUID_HI(buid), BUID_LO(buid)); > dev_info(>dev, "ibm,query-pe-dma-windows(%x) %x %x %x" > " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid), > @@ -874,8 +874,9 @@ static int create_ddw(struct pci_dev *dev, const u32 > *ddw_avail, > > do { > /* extra outputs are LIOBN and dma-addr (hi, lo) */ > - ret = 
rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr, > - BUID_HI(buid), BUID_LO(buid), page_shift, > window_shift); > + ret = rtas_call(be32_to_cpu(ddw_avail[1]), 5, 4, (u32 *)create, > + cfg_addr, BUID_HI(buid), BUID_LO(buid), > + page_shift, window_shift); > } while (rtas_busy_delay(ret)); > dev_info(>dev, > "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " > @@ -972,11 +973,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct > device_node *pdn) > dev_dbg(>dev, "no free dynamic windows"); > goto out_failed; > } > - if (be32_to_cpu(query.page_size) & 4) { > + if (query.page_size & 4) { > page_shift = 24; /* 16MB */ > - } else if (be32_to_cpu(query.page_size) & 2) { > + } else if (query.page_size & 2) { > page_shift = 16; /* 64kB */ > - } else if (be32_to_cpu(query.page_size) & 1) { > + } else if (query.page_size & 1) { > page_shift = 12; /* 4kB */ > } else { > dev_dbg(>dev, "no supported direct page size in mask %x", > @@ -987,7 +988,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct > device_node *pdn) > /* verify the window * number of ptes will map the partition */ > /* check largest block * page size > max memory hotplug addr */ > max_addr = memory_hotplug_max(); > - if (be32_to_cpu(query.largest_available_block) < (max_addr >> > page_shift)) { > + if (query.largest_available_block < (max_addr >> page_shift)) { > dev_dbg(>dev, "can't map partiton max 0x%llx with %u " > "%llu-sized pages\n", max_addr, > query.largest_available_block, > 1ULL << page_shift); > @@ -1014,8 +1015,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct > device_node *pdn) > if (ret != 0) > goto out_free_prop; > > - ddwprop->liobn = create.liobn; > - ddwprop->dma_base = cpu_to_be64(of_read_number(_hi, 2)); > + ddwprop->liobn = cpu_to_be32(create.liobn); > + ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) | > +
Re: [PATCH] powerpc/iommu/ddw: Fix endianness
On 09/09/2014 09:22 PM, Alexey Kardashevskiy wrote: rtas_call() accepts and returns values in CPU endianness. of_read_number() accepts big-endian values but create.addr_hi/lo returned by rtas_call() are in CPU endiannes. The dynamic_dma_window_prop struct defines all members as BE so let's make it true. struct dynamic_dma_window_prop { __be32 liobn; /* tce table number */ __be64 dma_base; /* address hi,lo */ __be32 tce_shift; /* ilog2(tce_page_size) */ __be32 window_shift; /* ilog2(tce_window_size) */ }; Cc: Benjamin Herrenschmidt b...@kernel.crashing.org Cc: Alexander Graf ag...@suse.de Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Ping, anyone? --- arch/powerpc/platforms/pseries/iommu.c | 24 +--- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 7c1d77c..700020a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -750,7 +750,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop) pr_debug(%s successfully cleared tces in window.\n, np-full_name); - ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + ret = rtas_call(be32_to_cpu(ddw_avail[2]), 1, 1, NULL, liobn); if (ret) pr_warning(%s: failed to remove direct window: rtas returned %d to ibm,remove-pe-dma-window(%x) %llx\n, @@ -842,7 +842,7 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, cfg_addr = edev-pe_config_addr; buid = edev-phb-buid; - ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, + ret = rtas_call(be32_to_cpu(ddw_avail[0]), 3, 5, (u32 *)query, cfg_addr, BUID_HI(buid), BUID_LO(buid)); dev_info(dev-dev, ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n, ddw_avail[0], cfg_addr, BUID_HI(buid), @@ -874,8 +874,9 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ - ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr, - BUID_HI(buid), 
BUID_LO(buid), page_shift, window_shift); + ret = rtas_call(be32_to_cpu(ddw_avail[1]), 5, 4, (u32 *)create, + cfg_addr, BUID_HI(buid), BUID_LO(buid), + page_shift, window_shift); } while (rtas_busy_delay(ret)); dev_info(dev-dev, ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d @@ -972,11 +973,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_dbg(dev-dev, no free dynamic windows); goto out_failed; } - if (be32_to_cpu(query.page_size) 4) { + if (query.page_size 4) { page_shift = 24; /* 16MB */ - } else if (be32_to_cpu(query.page_size) 2) { + } else if (query.page_size 2) { page_shift = 16; /* 64kB */ - } else if (be32_to_cpu(query.page_size) 1) { + } else if (query.page_size 1) { page_shift = 12; /* 4kB */ } else { dev_dbg(dev-dev, no supported direct page size in mask %x, @@ -987,7 +988,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) /* verify the window * number of ptes will map the partition */ /* check largest block * page size max memory hotplug addr */ max_addr = memory_hotplug_max(); - if (be32_to_cpu(query.largest_available_block) (max_addr page_shift)) { + if (query.largest_available_block (max_addr page_shift)) { dev_dbg(dev-dev, can't map partiton max 0x%llx with %u %llu-sized pages\n, max_addr, query.largest_available_block, 1ULL page_shift); @@ -1014,8 +1015,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (ret != 0) goto out_free_prop; - ddwprop-liobn = create.liobn; - ddwprop-dma_base = cpu_to_be64(of_read_number(create.addr_hi, 2)); + ddwprop-liobn = cpu_to_be32(create.liobn); + ddwprop-dma_base = cpu_to_be64(((u64)create.addr_hi 32) | + create.addr_lo); ddwprop-tce_shift = cpu_to_be32(page_shift); ddwprop-window_shift = cpu_to_be32(len); @@ -1048,7 +1050,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) list_add(window-list, direct_window_list); spin_unlock(direct_window_list_lock); - dma_addr = of_read_number(create.addr_hi, 2); + dma_addr = 
be64_to_cpu(ddwprop->dma_base); goto out_unlock; out_free_window: -- Alexey -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] powerpc/iommu/ddw: Fix endianness
On 09/15/2014 07:18 PM, Michael Ellerman wrote: On Mon, 2014-09-15 at 18:41 +1000, Alexey Kardashevskiy wrote: On 09/09/2014 09:22 PM, Alexey Kardashevskiy wrote: rtas_call() accepts and returns values in CPU endianness. Sounds right. of_read_number() accepts big-endian values but create.addr_hi/lo returned by rtas_call() are in CPU endiannes. Also sounds right. The dynamic_dma_window_prop struct defines all members as BE so let's make it true. It does. But why does it do that? It seems to be allocated and setup in enable_ddw() and then the only place I see it used is in tce_setrange_multi_pSeriesLP()/tce_clearrange_multi_pSeriesLP(), which both unpack it again. What am I missing? I do not know why they are BE. I just know that create_ddw() returns ddw_create_response struct which members are declared as BE but they are not as rtas_call() already made them CPU-endian. May be rtas_call() must not be used for structs. Or these structs must be fixed to be CPU endian. Cannot choose what/how to fix here. Sure I still can miss something here and it is all correct and I have to fix QEMU. struct dynamic_dma_window_prop { __be32 liobn; /* tce table number */ __be64 dma_base; /* address hi,lo */ __be32 tce_shift; /* ilog2(tce_page_size) */ __be32 window_shift; /* ilog2(tce_window_size) */ }; We do read them from the device tree in find_existing_ddw_windows(), but if that's the only place then the conversion to cpu endian should happen there. enable_ddw() kmalloc's ddwprop which is of this dynamic_dma_window_prop type. Then enable_ddw() initializes properties of that ddwprop thing but does it incorrectly. Then it calls walk_system_ram_range() which eventually calls tce_setrange_multi_pSeriesLP() to map every single page of guest's ram, the dynamic_dma_window_prop struct pointer is a void* argument of that callback. 
find_existing_ddw_windows() handles something called linux,direct64-ddr-window-info which is not a part of DDW at all and it is not from PAPR and this patch is not about it. diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 7c1d77c..700020a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -750,7 +750,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop) pr_debug(%s successfully cleared tces in window.\n, np-full_name); - ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + ret = rtas_call(be32_to_cpu(ddw_avail[2]), 1, 1, NULL, liobn); The conversion should happen once where ever ddw_avail comes out of the device tree, rather than everywhere it's used. ddw_avail is a pointer to device tree property value: ddw_avail = of_get_property(np, ibm,ddw-applicable, len); This contains 3 tokens, only 2 of them are actually used by the DDW code in its current state and each of them is used just once in the guest's lifetime. Older guest kernels would use the reset extension token but again - only once. I fail to see the point in caching CPU-endian values of these tokens. cheers -- Alexey -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] powerpc/iommu/ddw: Fix endianness
rtas_call() accepts and returns values in CPU endianness. of_read_number() accepts big-endian values but create.addr_hi/lo returned by rtas_call() are in CPU endiannes. The dynamic_dma_window_prop struct defines all members as BE so let's make it true. struct dynamic_dma_window_prop { __be32 liobn; /* tce table number */ __be64 dma_base; /* address hi,lo */ __be32 tce_shift; /* ilog2(tce_page_size) */ __be32 window_shift; /* ilog2(tce_window_size) */ }; Cc: Benjamin Herrenschmidt Cc: Alexander Graf Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/pseries/iommu.c | 24 +--- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 7c1d77c..700020a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -750,7 +750,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop) pr_debug("%s successfully cleared tces in window.\n", np->full_name); - ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + ret = rtas_call(be32_to_cpu(ddw_avail[2]), 1, 1, NULL, liobn); if (ret) pr_warning("%s: failed to remove direct window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", @@ -842,7 +842,7 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, cfg_addr = edev->pe_config_addr; buid = edev->phb->buid; - ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, + ret = rtas_call(be32_to_cpu(ddw_avail[0]), 3, 5, (u32 *)query, cfg_addr, BUID_HI(buid), BUID_LO(buid)); dev_info(>dev, "ibm,query-pe-dma-windows(%x) %x %x %x" " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid), @@ -874,8 +874,9 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ - ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr, - BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + ret = rtas_call(be32_to_cpu(ddw_avail[1]), 5, 4, (u32 *)create, + cfg_addr, 
BUID_HI(buid), BUID_LO(buid), + page_shift, window_shift); } while (rtas_busy_delay(ret)); dev_info(>dev, "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " @@ -972,11 +973,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_dbg(>dev, "no free dynamic windows"); goto out_failed; } - if (be32_to_cpu(query.page_size) & 4) { + if (query.page_size & 4) { page_shift = 24; /* 16MB */ - } else if (be32_to_cpu(query.page_size) & 2) { + } else if (query.page_size & 2) { page_shift = 16; /* 64kB */ - } else if (be32_to_cpu(query.page_size) & 1) { + } else if (query.page_size & 1) { page_shift = 12; /* 4kB */ } else { dev_dbg(>dev, "no supported direct page size in mask %x", @@ -987,7 +988,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) /* verify the window * number of ptes will map the partition */ /* check largest block * page size > max memory hotplug addr */ max_addr = memory_hotplug_max(); - if (be32_to_cpu(query.largest_available_block) < (max_addr >> page_shift)) { + if (query.largest_available_block < (max_addr >> page_shift)) { dev_dbg(>dev, "can't map partiton max 0x%llx with %u " "%llu-sized pages\n", max_addr, query.largest_available_block, 1ULL << page_shift); @@ -1014,8 +1015,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (ret != 0) goto out_free_prop; - ddwprop->liobn = create.liobn; - ddwprop->dma_base = cpu_to_be64(of_read_number(_hi, 2)); + ddwprop->liobn = cpu_to_be32(create.liobn); + ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) | + create.addr_lo); ddwprop->tce_shift = cpu_to_be32(page_shift); ddwprop->window_shift = cpu_to_be32(len); @@ -1048,7 +1050,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) list_add(>list, _window_list); spin_unlock(_window_list_lock); - dma_addr = of_read_number(_hi, 2); + dma_addr = be64_to_cpu(ddwprop->dma_base); goto out_unlock; out_free_window: -- 2
[PATCH] powerpc/iommu/ddw: Fix endianness
rtas_call() accepts and returns values in CPU endianness. of_read_number() accepts big-endian values but create.addr_hi/lo returned by rtas_call() are in CPU endiannes. The dynamic_dma_window_prop struct defines all members as BE so let's make it true. struct dynamic_dma_window_prop { __be32 liobn; /* tce table number */ __be64 dma_base; /* address hi,lo */ __be32 tce_shift; /* ilog2(tce_page_size) */ __be32 window_shift; /* ilog2(tce_window_size) */ }; Cc: Benjamin Herrenschmidt b...@kernel.crashing.org Cc: Alexander Graf ag...@suse.de Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/pseries/iommu.c | 24 +--- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 7c1d77c..700020a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -750,7 +750,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop) pr_debug(%s successfully cleared tces in window.\n, np-full_name); - ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + ret = rtas_call(be32_to_cpu(ddw_avail[2]), 1, 1, NULL, liobn); if (ret) pr_warning(%s: failed to remove direct window: rtas returned %d to ibm,remove-pe-dma-window(%x) %llx\n, @@ -842,7 +842,7 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, cfg_addr = edev-pe_config_addr; buid = edev-phb-buid; - ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, + ret = rtas_call(be32_to_cpu(ddw_avail[0]), 3, 5, (u32 *)query, cfg_addr, BUID_HI(buid), BUID_LO(buid)); dev_info(dev-dev, ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n, ddw_avail[0], cfg_addr, BUID_HI(buid), @@ -874,8 +874,9 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ - ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr, - BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + ret = 
rtas_call(be32_to_cpu(ddw_avail[1]), 5, 4, (u32 *)create, + cfg_addr, BUID_HI(buid), BUID_LO(buid), + page_shift, window_shift); } while (rtas_busy_delay(ret)); dev_info(dev-dev, ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d @@ -972,11 +973,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_dbg(dev-dev, no free dynamic windows); goto out_failed; } - if (be32_to_cpu(query.page_size) 4) { + if (query.page_size 4) { page_shift = 24; /* 16MB */ - } else if (be32_to_cpu(query.page_size) 2) { + } else if (query.page_size 2) { page_shift = 16; /* 64kB */ - } else if (be32_to_cpu(query.page_size) 1) { + } else if (query.page_size 1) { page_shift = 12; /* 4kB */ } else { dev_dbg(dev-dev, no supported direct page size in mask %x, @@ -987,7 +988,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) /* verify the window * number of ptes will map the partition */ /* check largest block * page size max memory hotplug addr */ max_addr = memory_hotplug_max(); - if (be32_to_cpu(query.largest_available_block) (max_addr page_shift)) { + if (query.largest_available_block (max_addr page_shift)) { dev_dbg(dev-dev, can't map partiton max 0x%llx with %u %llu-sized pages\n, max_addr, query.largest_available_block, 1ULL page_shift); @@ -1014,8 +1015,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (ret != 0) goto out_free_prop; - ddwprop-liobn = create.liobn; - ddwprop-dma_base = cpu_to_be64(of_read_number(create.addr_hi, 2)); + ddwprop-liobn = cpu_to_be32(create.liobn); + ddwprop-dma_base = cpu_to_be64(((u64)create.addr_hi 32) | + create.addr_lo); ddwprop-tce_shift = cpu_to_be32(page_shift); ddwprop-window_shift = cpu_to_be32(len); @@ -1048,7 +1050,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) list_add(window-list, direct_window_list); spin_unlock(direct_window_list_lock); - dma_addr = of_read_number(create.addr_hi, 2); + dma_addr = be64_to_cpu(ddwprop-dma_base); goto out_unlock; 
out_free_window: -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 01/13] powerpc/iommu: Check that TCE page size is equal to it_page_size
This checks that the TCE table page size is not bigger that the size of a page we just pinned and going to put its physical address to the table. Otherwise the hardware gets unwanted access to physical memory between the end of the actual page and the end of the aligned up TCE page. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/kernel/iommu.c | 28 +--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index a10642a..b378f78 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -1059,16 +1060,37 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, tce, entry << tbl->it_page_shift, ret); */ return -EFAULT; } + + /* +* Check that the TCE table granularity is not bigger than the size of +* a page we just found. Otherwise the hardware can get access to +* a bigger memory chunk that it should. +*/ + if (PageHuge(page)) { + struct page *head = compound_head(page); + long shift = PAGE_SHIFT + compound_order(head); + + if (shift < tbl->it_page_shift) { + ret = -EINVAL; + goto put_page_exit; + } + + } + hwaddr = (unsigned long) page_address(page) + offset; ret = iommu_tce_build(tbl, entry, hwaddr, direction); if (ret) - put_page(page); + goto put_page_exit; - if (ret < 0) - pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n", + return 0; + +put_page_exit: + pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n", __func__, entry << tbl->it_page_shift, tce, ret); + put_page(page); + return ret; } EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 02/13] powerpc/powernv: Make invalidate() a callback
At the moment pnv_pci_ioda_tce_invalidate() gets the PE pointer via container_of(tbl). Since we are going to have to add Dynamic DMA windows and that means having 2 IOMMU tables per PE, this is not going to work. This implements pnv_pci_ioda(1|2)_tce_invalidate as a pnv_ioda_pe callback. This adds a pnv_iommu_table wrapper around iommu_table and stores a pointer to PE there. PNV's ppc_md.tce_build() call uses this to find PE and do the invalidation. This will be used later for Dynamic DMA windows too. This registers invalidate() callbacks for IODA1 and IODA2: - pnv_pci_ioda1_tce_invalidate; - pnv_pci_ioda2_tce_invalidate. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda.c | 35 --- arch/powerpc/platforms/powernv/pci.c | 31 --- arch/powerpc/platforms/powernv/pci.h | 13 +++- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index df241b1..136e765 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -857,7 +857,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = >ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(>dev) != _iommu_ops); - set_iommu_table_base_and_group(>dev, >tce32_table); + set_iommu_table_base_and_group(>dev, >tce32.table); } static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, @@ -884,7 +884,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, } else { dev_info(>dev, "Using 32-bit DMA via iommu\n"); set_dma_ops(>dev, _iommu_ops); - set_iommu_table_base(>dev, >tce32_table); + set_iommu_table_base(>dev, >tce32.table); } *pdev->dev.dma_mask = dma_mask; return 0; @@ -899,9 +899,9 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, list_for_each_entry(dev, >devices, bus_list) { if (add_to_iommu_group) set_iommu_table_base_and_group(>dev, - >tce32_table); + >tce32.table); else - set_iommu_table_base(>dev, 
>tce32_table); + set_iommu_table_base(>dev, >tce32.table); if (dev->subordinate) pnv_ioda_setup_bus_dma(pe, dev->subordinate, @@ -988,19 +988,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, } } -void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, -__be64 *startp, __be64 *endp, bool rm) -{ - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); - struct pnv_phb *phb = pe->phb; - - if (phb->type == PNV_PHB_IODA1) - pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); - else - pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); -} - static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe, unsigned int base, unsigned int segs) @@ -1058,9 +1045,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } /* Setup linux iommu table */ - tbl = >tce32_table; + tbl = >tce32.table; pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, base << 28, IOMMU_PAGE_SHIFT_4K); + pe->tce32.pe = pe; + pe->tce32.invalidate_fn = pnv_pci_ioda1_tce_invalidate; /* OPAL variant of P7IOC SW invalidated TCEs */ swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); @@ -1097,7 +1086,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) { struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); + tce32.table); uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1142,10 +1131,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pe->tce_bypass_base = 1ull << 59; /* Install set_bypass callback for VFIO */ - pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; + pe->tce32.table.set_bypass = pnv_pci_ioda2_set_bypass; /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(>tce32_table, true); + pnv_pci_ioda2_set_bypass(>tce32.table, tru
[PATCH 00/13] powerpc/iommu/vfio: Enable Dynamic DMA windows
This enables PAPR defined feature called Dynamic DMA windows (DDW). Each Partitionable Endpoint (IOMMU group) has a separate DMA window on a PCI bus where devices are allows to perform DMA. By default there is 1 or 2GB window allocated at the host boot time and these windows are used when an IOMMU group is passed to the userspace (guest). These windows are mapped at zero offset on a PCI bus. Hi-speed devices may suffer from limited size of this window. On the host side a TCE bypass mode is enabled on POWER8 CPU which implements direct mapping of the host memory to a PCI bus at 1<<59. For the guest, PAPR defines a DDW RTAS API which allows the pseries guest to query the hypervisor if it supports DDW and what are the parameters of possible windows. Currently POWER8 supports 2 DMA windows per PE - already mentioned and used small 32bit window and 64bit window which can only start from 1<<59 and can support various page sizes. This patchset reworks PPC IOMMU code and adds necessary structures to extend it to support big windows. When the guest detectes the feature and the PE is capable of 64bit DMA, it does: 1. query to hypervisor about number of available windows and page masks; 2. creates a window with the biggest possible page size (current guests can do 64K or 16MB TCEs); 3. maps the entire guest RAM via H_PUT_TCE* hypercalls 4. switches dma_ops to direct_dma_ops on the selected PE. Once this is done, H_PUT_TCE is not called anymore and the guest gets maximum performance. Please comment. Thanks! 
Alexey Kardashevskiy (13): powerpc/iommu: Check that TCE page size is equal to it_page_size powerpc/powernv: Make invalidate() a callback powerpc/spapr: vfio: Implement spapr_tce_iommu_ops powerpc/powernv: Convert/move set_bypass() callback to take_ownership() powerpc/iommu: Fix IOMMU ownership control functions powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table powerpc/powernv: Do not set "read" flag if direction==DMA_NONE powerpc/powernv: Release replaced TCE powerpc/pseries/lpar: Enable VFIO powerpc/powernv: Implement Dynamic DMA windows (DDW) for IODA vfio: powerpc/spapr: Move locked_vm accounting to helpers vfio: powerpc/spapr: Use it_page_size vfio: powerpc/spapr: Enable Dynamic DMA windows arch/powerpc/include/asm/iommu.h| 35 ++- arch/powerpc/include/asm/machdep.h | 25 -- arch/powerpc/include/asm/tce.h | 37 +++ arch/powerpc/kernel/iommu.c | 213 +-- arch/powerpc/kernel/vio.c | 5 +- arch/powerpc/platforms/cell/iommu.c | 9 +- arch/powerpc/platforms/pasemi/iommu.c | 8 +- arch/powerpc/platforms/powernv/pci-ioda.c | 233 +++-- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 4 +- arch/powerpc/platforms/powernv/pci.c| 113 +--- arch/powerpc/platforms/powernv/pci.h| 15 +- arch/powerpc/platforms/pseries/iommu.c | 77 -- arch/powerpc/sysdev/dart_iommu.c| 13 +- drivers/vfio/vfio_iommu_spapr_tce.c | 384 +++- include/uapi/linux/vfio.h | 25 +- 15 files changed, 925 insertions(+), 271 deletions(-) -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 08/13] powerpc/powernv: Release replaced TCE
At the moment writing new TCE value to the IOMMU table fails with EBUSY if there is a valid entry already. However PAPR specification allows the guest to write new TCE value without clearing it first. Another problem this patch is addressing is the use of pool locks for external IOMMU users such as VFIO. The pool locks are to protect DMA page allocator rather than entries and since the host kernel does not control what pages are in use, there is no point in pool locks and exchange()+put_page(oldtce) is sufficient to avoid possible races. This adds an exchange() callback to iommu_table_ops which does the same thing as set() plus it returns replaced TCE(s) so the caller can release the pages afterwards. This makes iommu_tce_build() put pages returned by exchange(). This replaces iommu_clear_tce() with iommu_tce_build which now can call exchange() with TCE==NULL (i.e. clear). This preserves permission bits in TCE in iommu_put_tce_user_mode(). This removes use of pool locks for external IOMMU uses. This disables external IOMMU use (i.e. VFIO) for IOMMUs which do not implement exchange() callback. Therefore the "powernv" platform is the only supported one after this patch. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h | 8 +++-- arch/powerpc/kernel/iommu.c | 62 arch/powerpc/platforms/powernv/pci.c | 40 +++ 3 files changed, 67 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index c725e4a..8e0537d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -49,6 +49,12 @@ struct iommu_table_ops { unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs); + int (*exchange)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + unsigned long *old_tces, + enum dma_data_direction direction, + struct dma_attrs *attrs); void (*clear)(struct iommu_table *tbl, long index, long npages); unsigned long (*get)(struct iommu_table *tbl, long index); @@ -209,8 +215,6 @@ extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long hwaddr, enum dma_data_direction direction); -extern unsigned long iommu_clear_tce(struct iommu_table *tbl, - unsigned long entry); extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages); extern int iommu_put_tce_user_mode(struct iommu_table *tbl, diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 678fee8..39ccce7 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1006,43 +1006,11 @@ int iommu_tce_put_param_check(struct iommu_table *tbl, } EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) -{ - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - - spin_lock(&(pool->lock)); - - oldtce = tbl->it_ops->get(tbl, entry); - if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) - tbl->it_ops->clear(tbl, entry, 1); - else - oldtce = 0; - - 
spin_unlock(&(pool->lock)); - - return oldtce; -} -EXPORT_SYMBOL_GPL(iommu_clear_tce); - int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages) { - unsigned long oldtce; - struct page *page; - for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) - continue; - - page = pfn_to_page(oldtce >> PAGE_SHIFT); - WARN_ON(!page); - if (page) { - if (oldtce & TCE_PCI_WRITE) - SetPageDirty(page); - put_page(page); - } + iommu_tce_build(tbl, entry, 0, DMA_NONE); } return 0; @@ -1056,18 +1024,19 @@ EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long hwaddr, enum dma_data_direction direction) { - int ret = -EBUSY; + int ret; unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - spin_lock(&(pool->lock)); + ret = tbl->it_ops->exchange(tbl, entry, 1, hwaddr, , + direction, NULL); -
[PATCH 04/13] powerpc/powernv: Convert/move set_bypass() callback to take_ownership()
At the moment the iommu_table struct has a set_bypass() which enables/ disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code which calls this callback when external IOMMU users such as VFIO are about to get over a PHB. Since the set_bypass() is not really an iommu_table function but PE's function, and we have an ops struct per IOMMU owner, let's move set_bypass() to the spapr_tce_iommu_ops struct. As arch/powerpc/kernel/iommu.c is more about POWERPC IOMMU tables and has very little to do with PEs, this moves take_ownership() calls to the VFIO SPAPR TCE driver. This renames set_bypass() to take_ownership() as it is not necessarily just enabling bypassing, it can be something else/more so let's give it a generic name. The bool parameter is inverted. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Gavin Shan --- arch/powerpc/include/asm/iommu.h | 1 - arch/powerpc/include/asm/tce.h| 2 ++ arch/powerpc/kernel/iommu.c | 12 arch/powerpc/platforms/powernv/pci-ioda.c | 20 drivers/vfio/vfio_iommu_spapr_tce.c | 16 5 files changed, 30 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 84ee339..2b0b01d 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -77,7 +77,6 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif - void (*set_bypass)(struct iommu_table *tbl, bool enable); }; /* Pure 2^n version of get_order */ diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index 9f159eb..e6355f9 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -56,6 +56,8 @@ struct spapr_tce_iommu_ops { struct iommu_table *(*get_table)( struct spapr_tce_iommu_group *data, int num); + void (*take_ownership)(struct spapr_tce_iommu_group *data, + bool enable); }; struct spapr_tce_iommu_group { diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 1c5dae7..c2c8d9d 100644 --- 
a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1139,14 +1139,6 @@ int iommu_take_ownership(struct iommu_table *tbl) memset(tbl->it_map, 0xff, sz); iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); - /* -* Disable iommu bypass, otherwise the user can DMA to all of -* our physical memory via the bypass window instead of just -* the pages that has been explicitly mapped into the iommu -*/ - if (tbl->set_bypass) - tbl->set_bypass(tbl, false); - return 0; } EXPORT_SYMBOL_GPL(iommu_take_ownership); @@ -1161,10 +1153,6 @@ void iommu_release_ownership(struct iommu_table *tbl) /* Restore bit#0 set by iommu_init_table() */ if (tbl->it_offset == 0) set_bit(0, tbl->it_map); - - /* The kernel owns the device now, we can restore the iommu bypass */ - if (tbl->set_bypass) - tbl->set_bypass(tbl, true); } EXPORT_SYMBOL_GPL(iommu_release_ownership); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2d32a1c..8cb2f31 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1105,10 +1105,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } -static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32.table); uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1136,7 +1134,7 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) * host side. 
*/ if (pe->pdev) - set_iommu_table_base(>pdev->dev, tbl); + set_iommu_table_base(>pdev->dev, >tce32.table); else pnv_ioda_setup_bus_dma(pe, pe->pbus, false); } @@ -1152,15 +1150,21 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; - /* Install set_bypass callback for VFIO */ - pe->tce32.table.set_bypass = pnv_pci_ioda2_set_bypass; - /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(>tce32.table, true); + pnv_pci_ioda2_set_bypass(pe, true); +} + +static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *d
[PATCH 07/13] powerpc/powernv: Do not set "read" flag if direction==DMA_NONE
Normally a bitmap from the iommu_table is used to track what TCE entry is in use. Since we are going to use iommu_table without its locks and do xchg() instead, it becomes essential not to put bits which are not implied in the direction flag. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index deddcad..ab79e2d 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -628,10 +628,18 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, __be64 *tcep, *tces; u64 rpn; - proto_tce = TCE_PCI_READ; // Read allowed - - if (direction != DMA_TO_DEVICE) - proto_tce |= TCE_PCI_WRITE; + switch (direction) { + case DMA_BIDIRECTIONAL: + case DMA_FROM_DEVICE: + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; + break; + case DMA_TO_DEVICE: + proto_tce = TCE_PCI_READ; + break; + default: + proto_tce = 0; + break; + } tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; rpn = __pa(uaddr) >> tbl->it_page_shift; -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 09/13] powerpc/pseries/lpar: Enable VFIO
The previous patch introduced iommu_table_ops::exchange() callback which effectively disabled VFIO on pseries. This implements exchange() for pseries/lpar so VFIO can work in nested guests. Since exchange() callback returns an old TCE, it has to call H_GET_TCE for every TCE being put to the table so VFIO performance in guests running under PR KVM is expected to be slower than in guests running under HV KVM or bare metal hosts. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/pseries/iommu.c | 25 +++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 9a7364f..ae15b5a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -138,13 +138,14 @@ static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, + unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { u64 rc = 0; u64 proto_tce, tce; u64 rpn; - int ret = 0; + int ret = 0, i = 0; long tcenum_start = tcenum, npages_start = npages; rpn = __pa(uaddr) >> TCE_SHIFT; @@ -154,6 +155,9 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, while (npages--) { tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + if (old_tces) + plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, + _tces[i++]); rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { @@ -179,8 +183,9 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, static DEFINE_PER_CPU(__be64 *, tce_page); -static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +static int tce_xchg_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, +unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { @@ -195,6 
+200,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + old_tces, direction, attrs); } @@ -211,6 +217,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if (!tcep) { local_irq_restore(flags); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + old_tces, direction, attrs); } __get_cpu_var(tce_page) = tcep; @@ -232,6 +239,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, for (l = 0; l < limit; l++) { tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); rpn++; + if (old_tces) + plpar_tce_get((u64)tbl->it_index, + (u64)(tcenum + l) << 12, + _tces[tcenum + l]); } rc = plpar_tce_put_indirect((u64)tbl->it_index, @@ -262,6 +273,15 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } +static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +long npages, unsigned long uaddr, +enum dma_data_direction direction, +struct dma_attrs *attrs) +{ + return tce_xchg_pSeriesLP(tbl, tcenum, npages, uaddr, NULL, + direction, attrs); +} + static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) { u64 rc; @@ -637,6 +657,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) struct iommu_table_ops iommu_table_lpar_multi_ops = { .set = tce_buildmulti_pSeriesLP, + .exchange = tce_xchg_pSeriesLP, .clear = tce_freemulti_pSeriesLP, .get = tce_get_pSeriesLP }; -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body o
[PATCH 06/13] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table
This adds a iommu_table_ops struct and puts pointer to it into the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush callbacks from ppc_md to the new struct where they really belong to. This adds an extra @ops parameter to iommu_init_table() to make sure that we do not leave any IOMMU table without iommu_table_ops. @it_ops is initialized in the very beginning as iommu_init_table() calls iommu_table_clear() and the latter uses callbacks already. This does s/tce_build/set/, s/tce_free/clear/ and removes "tce_" prefixes for better readability. This removes tce_xxx_rm handlers from ppc_md as well but does not add them to iommu_table_ops, this will be done later if we decide to support TCE hypercalls in real mode. This always uses tce_buildmulti_pSeriesLP/tce_buildmulti_pSeriesLP as callbacks for pseries. This changes "multi" callbacks to fall back to tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not present. The reason for this is we still have to support "multitce=off" boot parameter in disable_multitce() and we do not want to walk through all IOMMU tables in the system and replace "multi" callbacks with single ones. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h| 20 +++- arch/powerpc/include/asm/machdep.h | 25 --- arch/powerpc/kernel/iommu.c | 50 - arch/powerpc/kernel/vio.c | 5 ++- arch/powerpc/platforms/cell/iommu.c | 9 -- arch/powerpc/platforms/pasemi/iommu.c | 8 +++-- arch/powerpc/platforms/powernv/pci-ioda.c | 4 +-- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 +- arch/powerpc/platforms/powernv/pci.c| 24 -- arch/powerpc/platforms/powernv/pci.h| 1 + arch/powerpc/platforms/pseries/iommu.c | 42 +--- arch/powerpc/sysdev/dart_iommu.c| 13 12 files changed, 102 insertions(+), 102 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 2b0b01d..c725e4a 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -43,6 +43,22 @@ extern int iommu_is_off; extern int iommu_force_on; +struct iommu_table_ops { + int (*set)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs); + void (*clear)(struct iommu_table *tbl, + long index, long npages); + unsigned long (*get)(struct iommu_table *tbl, long index); + void (*flush)(struct iommu_table *tbl); +}; + +/* These are used by VIO */ +extern struct iommu_table_ops iommu_table_lpar_multi_ops; +extern struct iommu_table_ops iommu_table_pseries_ops; + /* * IOMAP_MAX_ORDER defines the largest contiguous block * of dma space we can get. 
IOMAP_MAX_ORDER = 13 @@ -77,6 +93,7 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif + struct iommu_table_ops *it_ops; }; /* Pure 2^n version of get_order */ @@ -106,7 +123,8 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); * structure */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, - int nid); + int nid, + struct iommu_table_ops *ops); struct spapr_tce_iommu_ops; #ifdef CONFIG_IOMMU_API diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index b125cea..1fc824d 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -65,31 +65,6 @@ struct machdep_calls { * destroyed as well */ void(*hpte_clear_all)(void); - int (*tce_build)(struct iommu_table *tbl, -long index, -long npages, -unsigned long uaddr, -enum dma_data_direction direction, -struct dma_attrs *attrs); - void(*tce_free)(struct iommu_table *tbl, - long index, - long npages); - unsigned long (*tce_get)(struct iommu_table *tbl, - long index); - void(*tce_flush)(struct iommu_table *tbl); - - /* _rm versions are for real mode use only */ - int (*tce_build_rm)(struct iommu_table *tbl, -long index, -long npages, -unsigned long uaddr, -
[PATCH 10/13] powerpc/powernv: Implement Dynamic DMA windows (DDW) for IODA
SPAPR defines an interface to create additional DMA windows dynamically. "Dynamically" means that the window is not allocated before the guest even started, the guest can request it later. In practice, existing linux guests check for the capability and if it is there, they create and map a DMA window as big as the entire guest RAM. This adds 4 callbacks to the spapr_tce_iommu_ops struct: 1. query - ibm,query-pe-dma-window - returns number/size of windows which can be created (one, any page size); 2. create - ibm,create-pe-dma-window - creates a window; 3. remove - ibm,remove-pe-dma-window - removes a window; removing the default 32bit window is not allowed by this patch, this will be added later if needed; 4. reset - ibm,reset-pe-dma-window - reset the DMA windows configuration to the default state; as the default window cannot be removed, it only removes the additional window if it was created. The next patch will add corresponding ioctls to VFIO SPAPR TCE driver to provide necessary support to the userspace. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/tce.h| 22 + arch/powerpc/platforms/powernv/pci-ioda.c | 159 +- arch/powerpc/platforms/powernv/pci.h | 1 + 3 files changed, 181 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index e6355f9..23b0362 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -58,6 +58,28 @@ struct spapr_tce_iommu_ops { int num); void (*take_ownership)(struct spapr_tce_iommu_group *data, bool enable); + + /* Dynamic DMA window */ + /* Page size flags for ibm,query-pe-dma-window */ +#define DDW_PGSIZE_4K 0x01 +#define DDW_PGSIZE_64K 0x02 +#define DDW_PGSIZE_16M 0x04 +#define DDW_PGSIZE_32M 0x08 +#define DDW_PGSIZE_64M 0x10 +#define DDW_PGSIZE_128M 0x20 +#define DDW_PGSIZE_256M 0x40 +#define DDW_PGSIZE_16G 0x80 + long (*query)(struct spapr_tce_iommu_group *data, + __u32 *current_windows, + __u32 *windows_available, + __u32 *page_size_mask); + long (*create)(struct spapr_tce_iommu_group *data, + __u32 page_shift, + __u32 window_shift, + struct iommu_table **ptbl); + long (*remove)(struct spapr_tce_iommu_group *data, + struct iommu_table *tbl); + long (*reset)(struct spapr_tce_iommu_group *data); }; struct spapr_tce_iommu_group { diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 296f49b..a6318cb 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1154,6 +1154,26 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pnv_pci_ioda2_set_bypass(pe, true); } +static struct iommu_table *pnv_ioda2_iommu_get_table( + struct spapr_tce_iommu_group *data, + int num) +{ + struct pnv_ioda_pe *pe = data->iommu_owner; + + switch (num) { + case 0: + if (pe->tce32.table.it_size) + return >tce32.table; + return NULL; + case 1: + if (pe->tce64.table.it_size) + return >tce64.table; + return NULL; + default: + return NULL; + } +} + static void 
pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, bool enable) { @@ -1162,9 +1182,146 @@ static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, pnv_pci_ioda2_set_bypass(pe, !enable); } +static long pnv_pci_ioda2_ddw_query(struct spapr_tce_iommu_group *data, + __u32 *current_windows, + __u32 *windows_available, __u32 *page_size_mask) +{ + struct pnv_ioda_pe *pe = data->iommu_owner; + + *windows_available = 2; + *current_windows = 0; + if (pe->tce32.table.it_size) { + --*windows_available; + ++*current_windows; + } + if (pe->tce64.table.it_size) { + --*windows_available; + ++*current_windows; + } + *page_size_mask = + DDW_PGSIZE_4K | + DDW_PGSIZE_64K | + DDW_PGSIZE_16M; + + return 0; +} + +static long pnv_pci_ioda2_ddw_create(struct spapr_tce_iommu_group *data, + __u32 page_shift, __u32 window_shift, + struct iommu_table **ptbl) +{ + struct pnv_ioda_pe *pe = data->iommu_owner; + struct pnv_phb *phb = pe->phb; + struct page *tce_mem = NULL; + void *addr; + long ret; + unsigned long tce_table_size = + (1ULL << (window_sh
[PATCH 03/13] powerpc/spapr: vfio: Implement spapr_tce_iommu_ops
Modern IBM POWERPC systems support multiple IOMMU tables per PE so we need a more reliable way (compared to container_of()) to get a PE pointer from the iommu_table struct pointer used in IOMMU functions. At the moment IOMMU group data points to an iommu_table struct. This introduces a spapr_tce_iommu_group struct which keeps an iommu_owner and a spapr_tce_iommu_ops struct. For IODA, iommu_owner is a pointer to the pnv_ioda_pe struct, for others it is still a pointer to the iommu_table struct. The ops structs correspond to the type which iommu_owner points to. This defines a get_table() callback which returns an iommu_table by its number. As the IOMMU group data pointer points to variable type instead of iommu_table, VFIO SPAPR TCE driver is updated to use the new type. This changes the tce_container struct to store iommu_group instead of iommu_table. So, it was: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to iommu_table via iommu_group_get_iommudata(); now it is: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to spapr_tce_iommu_group via iommu_group_get_iommudata(); - spapr_tce_iommu_group points to either (depending on .get_table()): - iommu_table; - pnv_ioda_pe; This uses pnv_ioda1_iommu_get_table for both IODA1&2 but IODA2 will have own pnv_ioda2_iommu_get_table soon and pnv_ioda1_iommu_get_table will only be used for IODA1. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h| 6 ++ arch/powerpc/include/asm/tce.h | 13 +++ arch/powerpc/kernel/iommu.c | 35 ++- arch/powerpc/platforms/powernv/pci-ioda.c | 31 +- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 1 + arch/powerpc/platforms/powernv/pci.c| 2 +- arch/powerpc/platforms/pseries/iommu.c | 10 +- drivers/vfio/vfio_iommu_spapr_tce.c | 148 ++-- 8 files changed, 208 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 42632c7..84ee339 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -108,13 +108,19 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); + +struct spapr_tce_iommu_ops; #ifdef CONFIG_IOMMU_API extern void iommu_register_group(struct iommu_table *tbl, +void *iommu_owner, +struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); #else static inline void iommu_register_group(struct iommu_table *tbl, + void *iommu_owner, + struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num) { diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index 743f36b..9f159eb 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -50,5 +50,18 @@ #define TCE_PCI_READ 0x1 /* read from PCI allowed */ #define TCE_VB_WRITE 0x1 /* write from VB allowed */ +struct spapr_tce_iommu_group; + +struct spapr_tce_iommu_ops { + struct iommu_table *(*get_table)( + struct spapr_tce_iommu_group *data, + int num); +}; + +struct spapr_tce_iommu_group { + void *iommu_owner; + struct spapr_tce_iommu_ops *ops; +}; + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TCE_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 
b378f78..1c5dae7 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -878,24 +878,53 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, */ static void group_release(void *iommu_data) { - struct iommu_table *tbl = iommu_data; - tbl->it_group = NULL; + kfree(iommu_data); } +static struct iommu_table *spapr_tce_default_get_table( + struct spapr_tce_iommu_group *data, int num) +{ + struct iommu_table *tbl = data->iommu_owner; + + switch (num) { + case 0: + if (tbl->it_size) + return tbl; + /* fallthru */ + default: + return NULL; + } +} + +static struct spapr_tce_iommu_ops spapr_tce_default_ops = { + .get_table = spapr_tce_default_get_table +}; + void iommu_register_group(struct iommu_table *tbl, + void *iommu_owner, struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigne
[PATCH 12/13] vfio: powerpc/spapr: Use it_page_size
This makes use of the it_page_size from the iommu_table struct as page size can differ. This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code as recently introduced IOMMU_PAGE_XXX macros do not include IOMMU_PAGE_SHIFT. Signed-off-by: Alexey Kardashevskiy --- drivers/vfio/vfio_iommu_spapr_tce.c | 36 ++-- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index c9fac97..0dccbc4 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -36,7 +36,7 @@ static long try_increment_locked_vm(struct iommu_table *tbl) if (!current || !current->mm) return -ESRCH; /* process exited */ - npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; down_write(>mm->mmap_sem); locked = current->mm->locked_vm + npages; @@ -60,7 +60,7 @@ static void decrement_locked_vm(struct iommu_table *tbl) if (!current || !current->mm) return; /* process exited */ - npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT; down_write(>mm->mmap_sem); if (npages > current->mm->locked_vm) @@ -284,8 +284,8 @@ static long tce_iommu_ioctl(void *iommu_data, if (info.argsz < minsz) return -EINVAL; - info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K; - info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K; + info.dma32_window_start = tbl->it_offset << tbl->it_page_shift; + info.dma32_window_size = tbl->it_size << tbl->it_page_shift; info.flags = 0; if (copy_to_user((void __user *)arg, , minsz)) @@ -318,10 +318,6 @@ static long tce_iommu_ioctl(void *iommu_data, VFIO_DMA_MAP_FLAG_WRITE)) return -EINVAL; - if ((param.size & ~IOMMU_PAGE_MASK_4K) || - (param.vaddr & ~IOMMU_PAGE_MASK_4K)) - return -EINVAL; - /* iova is checked by the IOMMU API */ tce = param.vaddr; if (param.flags & VFIO_DMA_MAP_FLAG_READ) @@ -334,21 +330,25 
@@ static long tce_iommu_ioctl(void *iommu_data, return -ENXIO; BUG_ON(!tbl->it_group); + if ((param.size & ~IOMMU_PAGE_MASK(tbl)) || + (param.vaddr & ~IOMMU_PAGE_MASK(tbl))) + return -EINVAL; + ret = iommu_tce_put_param_check(tbl, param.iova, tce); if (ret) return ret; - for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) { + for (i = 0; i < (param.size >> tbl->it_page_shift); ++i) { ret = iommu_put_tce_user_mode(tbl, - (param.iova >> IOMMU_PAGE_SHIFT_4K) + i, + (param.iova >> tbl->it_page_shift) + i, tce); if (ret) break; - tce += IOMMU_PAGE_SIZE_4K; + tce += IOMMU_PAGE_SIZE(tbl); } if (ret) iommu_clear_tces_and_put_pages(tbl, - param.iova >> IOMMU_PAGE_SHIFT_4K, i); + param.iova >> tbl->it_page_shift, i); iommu_flush_tce(tbl); @@ -379,23 +379,23 @@ static long tce_iommu_ioctl(void *iommu_data, if (param.flags) return -EINVAL; - if (param.size & ~IOMMU_PAGE_MASK_4K) - return -EINVAL; - tbl = spapr_tce_find_table(container, data, param.iova); if (!tbl) return -ENXIO; + if (param.size & ~IOMMU_PAGE_MASK(tbl)) + return -EINVAL; + BUG_ON(!tbl->it_group); ret = iommu_tce_clear_param_check(tbl, param.iova, 0, - param.size >> IOMMU_PAGE_SHIFT_4K); + param.size >> tbl->it_page_shift); if (ret) return ret; ret = iommu_clear_tces_and_put_pages(tbl, - param.iova >> IOMMU_PAGE_SHIFT_4K, - param.size >
[PATCH 13/13] vfio: powerpc/spapr: Enable Dynamic DMA windows
This defines and implements VFIO IOMMU API which lets the userspace create and remove DMA windows. This updates VFIO_IOMMU_SPAPR_TCE_GET_INFO to return the number of available windows and page mask. This adds VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE to allow the user space to create and remove window(s). The VFIO IOMMU driver does basic sanity checks and calls corresponding SPAPR TCE functions. At the moment only IODA2 (POWER8 PCI host bridge) implements them. This advertises VFIO_IOMMU_SPAPR_TCE_FLAG_DDW capability via VFIO_IOMMU_SPAPR_TCE_GET_INFO. This calls platform DDW reset() callback when IOMMU is being disabled to reset the DMA configuration to its original state. Signed-off-by: Alexey Kardashevskiy --- drivers/vfio/vfio_iommu_spapr_tce.c | 135 ++-- include/uapi/linux/vfio.h | 25 ++- 2 files changed, 153 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 0dccbc4..b518891 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -190,18 +190,25 @@ static void tce_iommu_disable(struct tce_container *container) container->enabled = false; - if (!container->grp || !current->mm) + if (!container->grp) return; data = iommu_group_get_iommudata(container->grp); if (!data || !data->iommu_owner || !data->ops->get_table) return; - tbl = data->ops->get_table(data, 0); - if (!tbl) - return; + if (current->mm) { + tbl = data->ops->get_table(data, 0); + if (tbl) + decrement_locked_vm(tbl); - decrement_locked_vm(tbl); + tbl = data->ops->get_table(data, 1); + if (tbl) + decrement_locked_vm(tbl); + } + + if (data->ops->reset) + data->ops->reset(data); } static void *tce_iommu_open(unsigned long arg) @@ -243,7 +250,7 @@ static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { struct tce_container *container = iommu_data; - unsigned long minsz; + unsigned long minsz, ddwsz; long ret; switch (cmd) { @@ -288,6 +295,28 @@ 
static long tce_iommu_ioctl(void *iommu_data, info.dma32_window_size = tbl->it_size << tbl->it_page_shift; info.flags = 0; + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, + page_size_mask); + + if (info.argsz == ddwsz) { + if (data->ops->query && data->ops->create && + data->ops->remove) { + info.flags |= VFIO_IOMMU_SPAPR_TCE_FLAG_DDW; + + ret = data->ops->query(data, + _windows, + _available, + _size_mask); + if (ret) + return ret; + } else { + info.current_windows = 0; + info.windows_available = 0; + info.page_size_mask = 0; + } + minsz = ddwsz; + } + if (copy_to_user((void __user *)arg, , minsz)) return -EFAULT; @@ -412,12 +441,106 @@ static long tce_iommu_ioctl(void *iommu_data, tce_iommu_disable(container); mutex_unlock(>lock); return 0; + case VFIO_EEH_PE_OP: if (!container->grp) return -ENODEV; return vfio_spapr_iommu_eeh_ioctl(container->grp, cmd, arg); + + case VFIO_IOMMU_SPAPR_TCE_CREATE: { + struct vfio_iommu_spapr_tce_create create; + struct spapr_tce_iommu_group *data; + struct iommu_table *tbl; + + if (WARN_ON(!container->grp)) + return -ENXIO; + + data = iommu_group_get_iommudata(container->grp); + + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, + start_addr); + + if (copy_from_user(, (void __user *)arg, minsz)) + return -EFAULT; + + if (create.argsz < minsz) + return -EINVAL; + + if (create.flags) + return -EINVAL; + + if (!data->ops->create || !data->iommu_owner) +
[PATCH 05/13] powerpc/iommu: Fix IOMMU ownership control functions
This adds missing locks in iommu_take_ownership()/ iommu_release_ownership(). This marks all pages busy in iommu_table::it_map in order to catch errors if there is an attempt to use this table while ownership over it is taken. This only clears TCE content if there is no page marked busy in it_map. Clearing must be done outside of the table locks as iommu_clear_tce() called from iommu_clear_tces_and_put_pages() does this. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/kernel/iommu.c | 36 +--- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c2c8d9d..cd80867 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1126,33 +1126,55 @@ EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); int iommu_take_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl->it_size + 7) >> 3; + unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; + int ret = 0, bit0 = 0; + + spin_lock_irqsave(>large_pool.lock, flags); + for (i = 0; i < tbl->nr_pools; i++) + spin_lock(>pools[i].lock); if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + bit0 = test_and_clear_bit(0, tbl->it_map); if (!bitmap_empty(tbl->it_map, tbl->it_size)) { pr_err("iommu_tce: it_map is not empty"); - return -EBUSY; + ret = -EBUSY; + if (bit0) + set_bit(0, tbl->it_map); + } else { + memset(tbl->it_map, 0xff, sz); } - memset(tbl->it_map, 0xff, sz); - iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + for (i = 0; i < tbl->nr_pools; i++) + spin_unlock(>pools[i].lock); + spin_unlock_irqrestore(>large_pool.lock, flags); - return 0; + if (!ret) + iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, + tbl->it_size); + return ret; } EXPORT_SYMBOL_GPL(iommu_take_ownership); void iommu_release_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl->it_size + 7) >> 3; + unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + + 
spin_lock_irqsave(>large_pool.lock, flags); + for (i = 0; i < tbl->nr_pools; i++) + spin_lock(>pools[i].lock); + memset(tbl->it_map, 0, sz); /* Restore bit#0 set by iommu_init_table() */ if (tbl->it_offset == 0) set_bit(0, tbl->it_map); + + for (i = 0; i < tbl->nr_pools; i++) + spin_unlock(>pools[i].lock); + spin_unlock_irqrestore(>large_pool.lock, flags); } EXPORT_SYMBOL_GPL(iommu_release_ownership); -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 11/13] vfio: powerpc/spapr: Move locked_vm accounting to helpers
This moves locked pages accounting to helpers. Later they will be reused for Dynamic DMA windows (DDW). While we are here, update the comment explaining why RLIMIT_MEMLOCK might be required to be bigger than the guest RAM. Signed-off-by: Alexey Kardashevskiy --- drivers/vfio/vfio_iommu_spapr_tce.c | 71 +++-- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 1c1a9c4..c9fac97 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -29,6 +29,46 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); +static long try_increment_locked_vm(struct iommu_table *tbl) +{ + long ret = 0, locked, lock_limit, npages; + + if (!current || !current->mm) + return -ESRCH; /* process exited */ + + npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + + down_write(>mm->mmap_sem); + locked = current->mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + current->mm->locked_vm += npages; + } + up_write(>mm->mmap_sem); + + return ret; +} + +static void decrement_locked_vm(struct iommu_table *tbl) +{ + long npages; + + if (!current || !current->mm) + return; /* process exited */ + + npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + + down_write(>mm->mmap_sem); + if (npages > current->mm->locked_vm) + npages = current->mm->locked_vm; + current->mm->locked_vm -= npages; + up_write(>mm->mmap_sem); +} + /* * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation * @@ -86,7 +126,6 @@ static void tce_iommu_take_ownership_notify(struct spapr_tce_iommu_group *data, static int tce_iommu_enable(struct tce_container *container) { int ret = 0; - unsigned long locked, lock_limit, npages; struct iommu_table *tbl; struct spapr_tce_iommu_group 
*data; @@ -120,24 +159,23 @@ static int tce_iommu_enable(struct tce_container *container) * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, * that would effectively kill the guest at random points, much better * enforcing the limit based on the max that the guest can map. +* +* Unfortunately at the moment it counts whole tables, no matter how +* much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups +* each with 2GB DMA window, 8GB will be counted here. The reason for +* this is that we cannot tell here the amount of RAM used by the guest +* as this information is only available from KVM and VFIO is +* KVM agnostic. */ tbl = data->ops->get_table(data, 0); if (!tbl) return -ENXIO; - down_write(>mm->mmap_sem); - npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; - locked = current->mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", - rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - } else { - current->mm->locked_vm += npages; - container->enabled = true; - } - up_write(>mm->mmap_sem); + ret = try_increment_locked_vm(tbl); + if (ret) + return ret; + + container->enabled = true; return ret; } @@ -163,10 +201,7 @@ static void tce_iommu_disable(struct tce_container *container) if (!tbl) return; - down_write(>mm->mmap_sem); - current->mm->locked_vm -= (tbl->it_size << - IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; - up_write(>mm->mmap_sem); + decrement_locked_vm(tbl); } static void *tce_iommu_open(unsigned long arg) -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 11/13] vfio: powerpc/spapr: Move locked_vm accounting to helpers
This moves locked pages accounting to helpers. Later they will be reused for Dynamic DMA windows (DDW). While we are here, update the comment explaining why RLIMIT_MEMLOCK might be required to be bigger than the guest RAM. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 71 +++-- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 1c1a9c4..c9fac97 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -29,6 +29,46 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); +static long try_increment_locked_vm(struct iommu_table *tbl) +{ + long ret = 0, locked, lock_limit, npages; + + if (!current || !current->mm) + return -ESRCH; /* process exited */ + + npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + + down_write(&current->mm->mmap_sem); + locked = current->mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + current->mm->locked_vm += npages; + } + up_write(&current->mm->mmap_sem); + + return ret; +} + +static void decrement_locked_vm(struct iommu_table *tbl) +{ + long npages; + + if (!current || !current->mm) + return; /* process exited */ + + npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + + down_write(&current->mm->mmap_sem); + if (npages > current->mm->locked_vm) + npages = current->mm->locked_vm; + current->mm->locked_vm -= npages; + up_write(&current->mm->mmap_sem); +} + /* * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation * @@ -86,7 +126,6 @@ static void tce_iommu_take_ownership_notify(struct spapr_tce_iommu_group *data, static int tce_iommu_enable(struct tce_container *container) { int ret = 0; - unsigned long locked, lock_limit, npages; struct iommu_table *tbl; struct spapr_tce_iommu_group
*data; @@ -120,24 +159,23 @@ static int tce_iommu_enable(struct tce_container *container) * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, * that would effectively kill the guest at random points, much better * enforcing the limit based on the max that the guest can map. +* +* Unfortunately at the moment it counts whole tables, no matter how +* much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups +* each with 2GB DMA window, 8GB will be counted here. The reason for +* this is that we cannot tell here the amount of RAM used by the guest +* as this information is only available from KVM and VFIO is +* KVM agnostic. */ tbl = data->ops->get_table(data, 0); if (!tbl) return -ENXIO; - down_write(&current->mm->mmap_sem); - npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; - locked = current->mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", - rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - } else { - current->mm->locked_vm += npages; - container->enabled = true; - } - up_write(&current->mm->mmap_sem); + ret = try_increment_locked_vm(tbl); + if (ret) + return ret; + + container->enabled = true; return ret; } @@ -163,10 +201,7 @@ static void tce_iommu_disable(struct tce_container *container) if (!tbl) return; - down_write(&current->mm->mmap_sem); - current->mm->locked_vm -= (tbl->it_size << - IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; - up_write(&current->mm->mmap_sem); + decrement_locked_vm(tbl); } static void *tce_iommu_open(unsigned long arg) -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 05/13] powerpc/iommu: Fix IOMMU ownership control functions
This adds missing locks in iommu_take_ownership()/ iommu_release_ownership(). This marks all pages busy in iommu_table::it_map in order to catch errors if there is an attempt to use this table while ownership over it is taken. This only clears TCE content if there is no page marked busy in it_map. Clearing must be done outside of the table locks as iommu_clear_tce() called from iommu_clear_tces_and_put_pages() does this. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 36 +--- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c2c8d9d..cd80867 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1126,33 +1126,55 @@ EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); int iommu_take_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl->it_size + 7) >> 3; + unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; + int ret = 0, bit0 = 0; + + spin_lock_irqsave(&tbl->large_pool.lock, flags); + for (i = 0; i < tbl->nr_pools; i++) + spin_lock(&tbl->pools[i].lock); if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + bit0 = test_and_clear_bit(0, tbl->it_map); if (!bitmap_empty(tbl->it_map, tbl->it_size)) { pr_err("iommu_tce: it_map is not empty"); - return -EBUSY; + ret = -EBUSY; + if (bit0) + set_bit(0, tbl->it_map); + } else { + memset(tbl->it_map, 0xff, sz); } - memset(tbl->it_map, 0xff, sz); - iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + for (i = 0; i < tbl->nr_pools; i++) + spin_unlock(&tbl->pools[i].lock); + spin_unlock_irqrestore(&tbl->large_pool.lock, flags); - return 0; + if (!ret) + iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, + tbl->it_size); + return ret; } EXPORT_SYMBOL_GPL(iommu_take_ownership); void iommu_release_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl->it_size + 7) >> 3; + unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + + 
spin_lock_irqsave(&tbl->large_pool.lock, flags); + for (i = 0; i < tbl->nr_pools; i++) + spin_lock(&tbl->pools[i].lock); + memset(tbl->it_map, 0, sz); /* Restore bit#0 set by iommu_init_table() */ if (tbl->it_offset == 0) set_bit(0, tbl->it_map); + + for (i = 0; i < tbl->nr_pools; i++) + spin_unlock(&tbl->pools[i].lock); + spin_unlock_irqrestore(&tbl->large_pool.lock, flags); } EXPORT_SYMBOL_GPL(iommu_release_ownership); -- 2.0.0 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 13/13] vfio: powerpc/spapr: Enable Dynamic DMA windows
This defines and implements VFIO IOMMU API which lets the userspace create and remove DMA windows. This updates VFIO_IOMMU_SPAPR_TCE_GET_INFO to return the number of available windows and page mask. This adds VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE to allow the user space to create and remove window(s). The VFIO IOMMU driver does basic sanity checks and calls corresponding SPAPR TCE functions. At the moment only IODA2 (POWER8 PCI host bridge) implements them. This advertises VFIO_IOMMU_SPAPR_TCE_FLAG_DDW capability via VFIO_IOMMU_SPAPR_TCE_GET_INFO. This calls platform DDW reset() callback when IOMMU is being disabled to reset the DMA configuration to its original state. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 135 ++-- include/uapi/linux/vfio.h | 25 ++- 2 files changed, 153 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 0dccbc4..b518891 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -190,18 +190,25 @@ static void tce_iommu_disable(struct tce_container *container) container->enabled = false; - if (!container->grp || !current->mm) + if (!container->grp) return; data = iommu_group_get_iommudata(container->grp); if (!data || !data->iommu_owner || !data->ops->get_table) return; - tbl = data->ops->get_table(data, 0); - if (!tbl) - return; + if (current->mm) { + tbl = data->ops->get_table(data, 0); + if (tbl) + decrement_locked_vm(tbl); - decrement_locked_vm(tbl); + tbl = data->ops->get_table(data, 1); + if (tbl) + decrement_locked_vm(tbl); + } + + if (data->ops->reset) + data->ops->reset(data); } static void *tce_iommu_open(unsigned long arg) @@ -243,7 +250,7 @@ static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { struct tce_container *container = iommu_data; - unsigned long minsz; + unsigned long minsz, ddwsz; long ret; switch (cmd) { @@ -288,6 +295,28 @@ static 
long tce_iommu_ioctl(void *iommu_data, info.dma32_window_size = tbl->it_size << tbl->it_page_shift; info.flags = 0; + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, + page_size_mask); + + if (info.argsz == ddwsz) { + if (data->ops->query && data->ops->create && + data->ops->remove) { + info.flags |= VFIO_IOMMU_SPAPR_TCE_FLAG_DDW; + + ret = data->ops->query(data, + &info.current_windows, + &info.windows_available, + &info.page_size_mask); + if (ret) + return ret; + } else { + info.current_windows = 0; + info.windows_available = 0; + info.page_size_mask = 0; + } + minsz = ddwsz; + } + if (copy_to_user((void __user *)arg, &info, minsz)) return -EFAULT; @@ -412,12 +441,106 @@ static long tce_iommu_ioctl(void *iommu_data, tce_iommu_disable(container); mutex_unlock(&container->lock); return 0; + case VFIO_EEH_PE_OP: if (!container->grp) return -ENODEV; return vfio_spapr_iommu_eeh_ioctl(container->grp, cmd, arg); + + case VFIO_IOMMU_SPAPR_TCE_CREATE: { + struct vfio_iommu_spapr_tce_create create; + struct spapr_tce_iommu_group *data; + struct iommu_table *tbl; + + if (WARN_ON(!container->grp)) + return -ENXIO; + + data = iommu_group_get_iommudata(container->grp); + + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, + start_addr); + + if (copy_from_user(&create, (void __user *)arg, minsz)) + return -EFAULT; + + if (create.argsz < minsz) + return -EINVAL; + + if (create.flags) + return -EINVAL; + + if (!data->ops->create || !data->iommu_owner) + return -ENOSYS; + + BUG_ON(!data || !data->ops || !data->ops->remove