[PATCH 1/2] powerpc/powernv: Fix IOMMU table for VFIO dev
On PHB3, PCI devices can bypass IOMMU for DMA access. If we pass through one PCI device, whose hose driver ever enable the bypass mode, pdev-dev.archdata.dma_data.iommu_table_base isn't IOMMU table. However, EEH needs access the IOMMU table when the device is owned by guest. The patch fixes pdev-dev.archdata.dma_data.iommu_table when passing through the device to guest in pnv_pci_ioda2_set_bypass(). Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 30 +- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index de19ede..93fd815 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -494,14 +494,22 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, return 0; } -static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, + struct pci_bus *bus, + bool add_to_iommu_group) { struct pci_dev *dev; list_for_each_entry(dev, bus-devices, bus_list) { - set_iommu_table_base_and_group(dev-dev, pe-tce32_table); + if (add_to_iommu_group) + set_iommu_table_base_and_group(dev-dev, + pe-tce32_table); + else + set_iommu_table_base(dev-dev, pe-tce32_table); + if (dev-subordinate) - pnv_ioda_setup_bus_dma(pe, dev-subordinate); + pnv_ioda_setup_bus_dma(pe, dev-subordinate, + add_to_iommu_group); } } @@ -677,7 +685,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, if (pe-pdev) set_iommu_table_base_and_group(pe-pdev-dev, tbl); else - pnv_ioda_setup_bus_dma(pe, pe-pbus); + pnv_ioda_setup_bus_dma(pe, pe-pbus, true); return; fail: @@ -713,11 +721,15 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) 0); /* -* We might want to reset the DMA ops of all devices on -* this PE. 
However in theory, that shouldn't be necessary -* as this is used for VFIO/KVM pass-through and the device -* hasn't yet been returned to its kernel driver +* EEH needs the mapping between IOMMU table and group +* of those VFIO/KVM pass-through devices. We can postpone +* resetting DMA ops until the DMA mask is configured in +* host side. */ + if (pe-pdev) + set_iommu_table_base(pe-pdev-dev, tbl); + else + pnv_ioda_setup_bus_dma(pe, pe-pbus, false); } if (rc) pe_err(pe, OPAL error %lld configuring bypass window\n, rc); @@ -805,7 +817,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (pe-pdev) set_iommu_table_base_and_group(pe-pdev-dev, tbl); else - pnv_ioda_setup_bus_dma(pe, pe-pbus); + pnv_ioda_setup_bus_dma(pe, pe-pbus, true); /* Also create a bypass window */ pnv_pci_ioda2_setup_bypass_pe(phb, pe); -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/2] powerpc/eeh: Fetch IOMMU table in reliable way
Function eeh_iommu_group_to_pe() iterates each PCI device to check the binding IOMMU group with get_iommu_table_base(), which possibly fetches pdev-dev.archdata.dma_data.dma_offset. It's (0x1 59) for bypass cases. The patch fixes the issue by iterating devices hooked to the IOMMU group and fetch IOMMU table there. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/kernel/eeh.c | 33 ++--- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 18c40fd..4de2103 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -27,6 +27,7 @@ #include linux/init.h #include linux/list.h #include linux/pci.h +#include linux/iommu.h #include linux/proc_fs.h #include linux/rbtree.h #include linux/reboot.h @@ -1178,6 +1179,24 @@ out: } EXPORT_SYMBOL(eeh_dev_release); +static int dev_has_iommu_table(struct device *dev, void *data) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_dev **ppdev = data; + struct iommu_table *tbl; + + if (!dev) + return 0; + + tbl = get_iommu_table_base(dev); + if (tbl tbl-it_group) { + *ppdev = pdev; + return 1; + } + + return 0; +} + /** * eeh_iommu_group_to_pe - Convert IOMMU group to EEH PE * @group: IOMMU group @@ -1186,24 +1205,16 @@ EXPORT_SYMBOL(eeh_dev_release); */ struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group) { - struct iommu_table *tbl; struct pci_dev *pdev = NULL; struct eeh_dev *edev; - bool found = false; + int ret; /* No IOMMU group ? */ if (!group) return NULL; - /* No PCI device ? */ - for_each_pci_dev(pdev) { - tbl = get_iommu_table_base(pdev-dev); - if (tbl tbl-it_group == group) { - found = true; - break; - } - } - if (!found) + ret = iommu_group_for_each_dev(group, pdev, dev_has_iommu_table); + if (!ret || !pdev) return NULL; /* No EEH device or PE ? */ -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 0/2] Bug fix for VFIO EEH
Those 2 patches are bug fixes for VFIO EEH support, which isn't merged yet though all reviewers gave their ack. So I'm sending this to avoid a revert or something like that. The problem is that dma_offset/iommu_table_base share the same memory location. When disabling bypass mode, we failed to restore iommu_table_base. EEH is utilizing that to translate IOMMU group ID to PE. The patches fix the issue. Another issue is that we're searching all online PCI devices for translating IOMMU group ID to PE. That's incorrect since we're uncertain whether one specific device (except those in the current IOMMU group) is running in bypassed mode or not. So we should search only the current IOMMU group. It should be applied on top of the unmerged VFIO EEH support patchset: http://patchwork.ozlabs.org/patch/357665/ Gavin Shan (2): powerpc/powernv: Fix IOMMU table for VFIO dev powerpc/eeh: Fetch IOMMU table in reliable way arch/powerpc/kernel/eeh.c | 33 --- arch/powerpc/platforms/powernv/pci-ioda.c | 30 +++- 2 files changed, 43 insertions(+), 20 deletions(-) -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v5 2/2] [BUGFIX] kprobes: Fix Failed to find blacklist error on ia64 and ppc64
On Tue, 2014-07-15 at 13:19 +1000, Michael Ellerman wrote: Signed-off-by: Masami Hiramatsu masami.hiramatsu...@hitachi.com Reported-by: Tony Luck tony.l...@gmail.com Tested-by: Tony Luck tony.l...@intel.com Cc: Michael Ellerman m...@ellerman.id.au Tested-by: Michael Ellerman m...@ellerman.id.au Acked-by: Michael Ellerman m...@ellerman.id.au (for powerpc) Ben, can you take this in your tree? Acked-by: Benjamin Herrenschmidt b...@kernel.crashing.org That looks more like generic material. Do we have a kprobes maintainer ? Andrew, do you want to take this ? I'm happy to put it in powerpc and send it to Linus tomorrow if nobody cares :-) Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] ppc/xmon: use isxdigit/isspace/isalnum from ctype.h
Use linux/ctype.h instead of defining custom versions of isxdigit/isspace/isalnum. Signed-off-by: Vincent Bernat vinc...@bernat.im --- arch/powerpc/xmon/xmon.c | 12 +--- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d199bfa2f1fa..c0c31a47c469 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -24,6 +24,7 @@ #include linux/interrupt.h #include linux/irq.h #include linux/bug.h +#include linux/ctype.h #include asm/ptrace.h #include asm/string.h @@ -177,14 +178,6 @@ extern void xmon_leave(void); #define GETWORD(v) (((v)[0] 24) + ((v)[1] 16) + ((v)[2] 8) + (v)[3]) #endif -#define isxdigit(c)(('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'f') \ -|| ('A' = (c) (c) = 'F')) -#define isalnum(c) (('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'z') \ -|| ('A' = (c) (c) = 'Z')) -#define isspace(c) (c == ' ' || c == '\t' || c == 10 || c == 13 || c == 0) - static char *help_string = \ Commands:\n\ bshow breakpoints\n\ @@ -2121,9 +2114,6 @@ static void dump_pacas(void) } #endif -#define isxdigit(c)(('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'f') \ -|| ('A' = (c) (c) = 'F')) static void dump(void) { -- 2.0.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
RE: [PATCH] ppc/xmon: use isxdigit/isspace/isalnum from ctype.h
From: Vincent Bernat Use linux/ctype.h instead of defining custom versions of isxdigit/isspace/isalnum. ... -#define isspace(c) (c == ' ' || c == '\t' || c == 10 || c == 13 || c == 0) That is different from the version in linux/ctype.h, especially for 'c == 0', but probably also for vertical tab and form feed. David ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 01/16] powerpc/iommu: Fix comments with it_page_shift
There is a couple of commented debug prints which still use IOMMU_PAGE_SHIFT() which is not defined for POWERPC anymore, replace them with it_page_shift. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 88e3ec6..f84f799 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1037,7 +1037,7 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, /* if (unlikely(ret)) pr_err(iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n, - __func__, hwaddr, entry IOMMU_PAGE_SHIFT(tbl), + __func__, hwaddr, entry tbl-it_page_shift, hwaddr, ret); */ return ret; @@ -1056,7 +1056,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, direction != DMA_TO_DEVICE, page); if (unlikely(ret != 1)) { /* pr_err(iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n, - tce, entry IOMMU_PAGE_SHIFT(tbl), ret); */ + tce, entry tbl-it_page_shift, ret); */ return -EFAULT; } hwaddr = (unsigned long) page_address(page) + offset; -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 02/16] KVM: PPC: Use RCU when adding to arch.spapr_tce_tables
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 54cf9bc..516f2ee 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -131,7 +131,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kvm_get_kvm(kvm); mutex_lock(kvm-lock); - list_add(stt-list, kvm-arch.spapr_tce_tables); + list_add_rcu(stt-list, kvm-arch.spapr_tce_tables); mutex_unlock(kvm-lock); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 00/16] powernv: vfio: Add Dynamic DMA windows (DDW)
This prepares existing upstream kernel for DDW (Dynamic DMA windows) and adds actual DDW support for VFIO. This patchset does not contain any in-kernel acceleration stuff. This patchset does not enable DDW for emulated devices. Alexey Kardashevskiy (16): powerpc/iommu: Fix comments with it_page_shift KVM: PPC: Use RCU when adding to arch.spapr_tce_tables powerpc/powernv: Use it_page_shift for TCE invalidation powerpc/powernv: Use it_page_shift in TCE build powerpc/powernv: Add a page size parameter to pnv_pci_setup_iommu_table() powerpc/powernv: Make invalidate() callback an iommu_table callback powerpc/spapr: vfio: Implement spapr_tce_iommu_ops powerpc/powernv: Convert/move set_bypass() callback to take_ownership() powerpc/iommu: Fix IOMMU ownership control functions powerpc/iommu: Fix missing permission bits in iommu_put_tce_user_mode() powerpc/iommu: Extend ppc_md.tce_build(_rm) to return old TCE values powerpc/powernv: Return non-zero TCE from pnv_tce_build powerpc/iommu: Implement put_page() if TCE had non-zero value powerpc/powernv: Implement Dynamic DMA windows (DDW) for IODA vfio: Use it_page_size vfio: powerpc: Enable Dynamic DMA windows arch/powerpc/include/asm/iommu.h| 11 +- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/tce.h | 36 arch/powerpc/kernel/iommu.c | 95 +++--- arch/powerpc/kvm/book3s_64_vio.c| 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 253 ++--- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 4 +- arch/powerpc/platforms/powernv/pci.c| 60 -- arch/powerpc/platforms/powernv/pci.h| 4 +- arch/powerpc/platforms/pseries/iommu.c | 17 +- arch/powerpc/sysdev/dart_iommu.c| 1 + drivers/vfio/vfio_iommu_spapr_tce.c | 280 include/uapi/linux/vfio.h | 37 +++- 13 files changed, 679 insertions(+), 123 deletions(-) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 06/16] powerpc/powernv: Make invalidate() callback an iommu_table callback
This implements pnv_pci_ioda(1|2)_tce_invalidate as a callback of iommu_table to simplify code structure. The callbacks receive iommu_table only and cast it to PE, the specific callback knows how. This registers invalidate() callbacks for IODA1 and IODA2: - pnv_pci_ioda1_tce_invalidate; - pnv_pci_ioda2_tce_invalidate_32. There will be another pnv_pci_ioda2_tce_invalidate_64() callback for huge DMA windows. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 4 arch/powerpc/platforms/powernv/pci-ioda.c | 19 +-- arch/powerpc/platforms/powernv/pci.c | 27 +++ 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 42632c7..d8fb3fa 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -60,6 +60,9 @@ struct iommu_pool { spinlock_t lock; } cacheline_aligned_in_smp; +typedef void (*iommu_invalidate_fn)(struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm); + struct iommu_table { unsigned long it_busno; /* Bus number this table belongs to */ unsigned long it_size; /* Size of iommu table in entries */ @@ -77,6 +80,7 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif + iommu_invalidate_fn invalidate; void (*set_bypass)(struct iommu_table *tbl, bool enable); }; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 9f28e18..48e2358 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -505,10 +505,11 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) } } -static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, -struct iommu_table *tbl, +static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, __be64 *startp, __be64 *endp, bool rm) { + struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, + tce32_table); __be64 __iomem 
*invalidate = rm ? (__be64 __iomem *)pe-tce_inval_reg_phys : (__be64 __iomem *)tbl-it_index; @@ -584,17 +585,13 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, } } -void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, -__be64 *startp, __be64 *endp, bool rm) +static void pnv_pci_ioda2_tce_invalidate_32(struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm) { struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); - struct pnv_phb *phb = pe-phb; + tce32_table); - if (phb-type == PNV_PHB_IODA1) - pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); - else - pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); + pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); } static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, @@ -657,6 +654,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, tbl = pe-tce32_table; pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, base 28, IOMMU_PAGE_SHIFT_4K); + tbl-invalidate = pnv_pci_ioda1_tce_invalidate; /* OPAL variant of P7IOC SW invalidated TCEs */ swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); @@ -788,6 +786,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, tbl = pe-tce32_table; pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, IOMMU_PAGE_SHIFT_4K); + tbl-invalidate = pnv_pci_ioda2_tce_invalidate_32; /* OPAL variant of PHB3 invalidated TCEs */ swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 4dff552..1ab0f62 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -550,6 +550,23 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; +static void pnv_tce_invalidate(struct iommu_table *tbl, __be64 *startp, + __be64 *endp, bool rm) +{ + /* +* Some implementations won't cache invalid TCEs and thus may not +* need that flush. 
We'll probably turn it_type into a bit mask +* of flags if that becomes the case +*/ + if (!(tbl-it_type TCE_PCI_SWINV_FREE)) + return; + + if (!tbl-invalidate) + return; + + tbl-invalidate(tbl, startp, endp, rm); +} + static int pnv_tce_build(struct
[PATCH v1 07/16] powerpc/spapr: vfio: Implement spapr_tce_iommu_ops
Modern IBM POWERPC systems support multiple IOMMU tables per PHB so we need a more reliable way (compared to container_of()) to get a PE pointer from the iommu_table struct pointer used in IOMMU functions. At the moment IOMMU group data points to an iommu_table struct. This introduces a spapr_tce_iommu_group struct which keeps an iommu_owner and a spapr_tce_iommu_ops struct. For IODA, iommu_owner is a pointer to the pnv_ioda_pe struct, for others it is still a pointer to the iommu_table struct. The ops structs correspond to the type which iommu_owner points to. At the moment a get_table() callback is the only one. It returns an iommu_table for a bus address. As the IOMMU group data pointer points to variable type instead of iommu_table, VFIO SPAPR TCE driver is fixed to use new type. This changes the tce_container struct to keep iommu_group instead of iommu_table. So, it was: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to iommu_table via iommu_group_get_iommudata(); now it is: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to spapr_tce_iommu_group via iommu_group_get_iommudata(); - spapr_tce_iommu_group points to either (depending on .get_table()): - iommu_table; - pnv_ioda_pe; Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h| 6 ++ arch/powerpc/include/asm/tce.h | 13 arch/powerpc/kernel/iommu.c | 31 +++- arch/powerpc/platforms/powernv/pci-ioda.c | 37 - arch/powerpc/platforms/powernv/pci-p5ioc2.c | 1 + arch/powerpc/platforms/powernv/pci.c| 2 +- arch/powerpc/platforms/pseries/iommu.c | 10 ++- drivers/vfio/vfio_iommu_spapr_tce.c | 112 +--- 8 files changed, 177 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index d8fb3fa..fb2c884 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -112,13 +112,19 @@ extern void iommu_free_table(struct iommu_table *tbl, 
const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); + +struct spapr_tce_iommu_ops; #ifdef CONFIG_IOMMU_API extern void iommu_register_group(struct iommu_table *tbl, +void *iommu_owner, +struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); #else static inline void iommu_register_group(struct iommu_table *tbl, + void *iommu_owner, + struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num) { diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index 743f36b..a697681 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -50,5 +50,18 @@ #define TCE_PCI_READ 0x1 /* read from PCI allowed */ #define TCE_VB_WRITE 0x1 /* write from VB allowed */ +struct spapr_tce_iommu_group; + +struct spapr_tce_iommu_ops { + struct iommu_table *(*get_table)( + struct spapr_tce_iommu_group *data, + phys_addr_t addr); +}; + +struct spapr_tce_iommu_group { + void *iommu_owner; + struct spapr_tce_iommu_ops *ops; +}; + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TCE_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index f84f799..b207332 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -877,24 +877,49 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, */ static void group_release(void *iommu_data) { - struct iommu_table *tbl = iommu_data; - tbl-it_group = NULL; + kfree(iommu_data); } +static struct iommu_table *spapr_tce_get_default_table( + struct spapr_tce_iommu_group *data, phys_addr_t addr) +{ + struct iommu_table *tbl = data-iommu_owner; + + if ((addr tbl-it_page_shift) tbl-it_size) + return tbl; + + return NULL; +} + +static struct spapr_tce_iommu_ops spapr_tce_default_ops = { + .get_table = spapr_tce_get_default_table +}; + void iommu_register_group(struct 
iommu_table *tbl, + void *iommu_owner, struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num) { struct iommu_group *grp; char *name; + struct spapr_tce_iommu_group *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return; + +
[PATCH v1 05/16] powerpc/powernv: Add a page size parameter to pnv_pci_setup_iommu_table()
Since a TCE page size can be other than 4K, make it configurable for P5IOC2 and IODA PHBs. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 5 +++-- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 ++- arch/powerpc/platforms/powernv/pci.c| 6 +++--- arch/powerpc/platforms/powernv/pci.h| 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 40f968e..9f28e18 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -656,7 +656,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, /* Setup linux iommu table */ tbl = pe-tce32_table; pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, - base 28); + base 28, IOMMU_PAGE_SHIFT_4K); /* OPAL variant of P7IOC SW invalidated TCEs */ swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); @@ -786,7 +786,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* Setup linux iommu table */ tbl = pe-tce32_table; - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0); + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, + IOMMU_PAGE_SHIFT_4K); /* OPAL variant of PHB3 invalidated TCEs */ swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index e3807d6..94ce348 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -172,7 +172,8 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, /* Setup TCEs */ phb-dma_dev_setup = pnv_pci_p5ioc2_dma_dev_setup; pnv_pci_setup_iommu_table(phb-p5ioc2.iommu_table, - tce_mem, tce_size, 0); + tce_mem, tce_size, 0, + IOMMU_PAGE_SHIFT_4K); } void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c index b6cb996..4dff552 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -628,11 +628,11 @@ static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages) void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, - u64 dma_offset) + u64 dma_offset, unsigned page_shift) { tbl-it_blocksize = 16; tbl-it_base = (unsigned long)tce_mem; - tbl-it_page_shift = IOMMU_PAGE_SHIFT_4K; + tbl-it_page_shift = page_shift; tbl-it_offset = dma_offset tbl-it_page_shift; tbl-it_index = 0; tbl-it_size = tce_size 3; @@ -657,7 +657,7 @@ static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose) if (WARN_ON(!tbl)) return NULL; pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)), - be32_to_cpup(sizep), 0); + be32_to_cpup(sizep), 0, IOMMU_PAGE_SHIFT_4K); iommu_init_table(tbl, hose-node); iommu_register_group(tbl, pci_domain_nr(hose-bus), 0); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 676232c..6f5ff69 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -198,7 +198,7 @@ int pnv_pci_cfg_write(struct device_node *dn, int where, int size, u32 val); extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, - u64 dma_offset); + u64 dma_offset, unsigned page_shift); extern void pnv_pci_init_p5ioc2_hub(struct device_node *np); extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 03/16] powerpc/powernv: Use it_page_shift for TCE invalidation
This fixes IODA1/2 to use it_page_shift as it may be bigger than 4K. This changes the involved constant values to use ull modifier. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 16 +--- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index de19ede..40f968e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -513,15 +513,16 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, (__be64 __iomem *)pe-tce_inval_reg_phys : (__be64 __iomem *)tbl-it_index; unsigned long start, end, inc; + const unsigned shift = tbl-it_page_shift; start = __pa(startp); end = __pa(endp); /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ if (tbl-it_busno) { - start = 12; - end = 12; - inc = 128 12; + start = shift; + end = shift; + inc = 128ull shift; start |= tbl-it_busno; end |= tbl-it_busno; } else if (tbl-it_type TCE_PCI_SWINV_PAIR) { @@ -559,18 +560,19 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe-tce_inval_reg_phys : (__be64 __iomem *)tbl-it_index; + const unsigned shift = tbl-it_page_shift; /* We'll invalidate DMA address in PE scope */ - start = 0x2ul 60; + start = 0x2ull 60; start |= (pe-pe_number 0xFF); end = start; /* Figure out the start, end and step */ inc = tbl-it_offset + (((u64)startp - tbl-it_base) / sizeof(u64)); - start |= (inc 12); + start |= (inc shift); inc = tbl-it_offset + (((u64)endp - tbl-it_base) / sizeof(u64)); - end |= (inc 12); - inc = (0x1ul 12); + end |= (inc shift); + inc = (0x1ull shift); mb(); while (start = end) { -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 08/16] powerpc/powernv: Convert/move set_bypass() callback to take_ownership()
At the moment the iommu_table struct has a set_bypass() which enables/ disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code which calls this callback when external IOMMU users such as VFIO are about to get over a PHB. Since the set_bypass() is not really an iommu_table function but PE's function, and we have an ops struct per IOMMU owner, let's move set_bypass() to the spapr_tce_iommu_ops struct. As arch/powerpc/kernel/iommu.c is more about POWERPC IOMMU tables and has very little to do with PEs, this moves take_ownership() calls to the VFIO SPAPR TCE driver. This renames set_bypass() to take_ownership() as it is not necessarily just enabling bypassing, it can be something else/more so let's give it a generic name. The bool parameter is inverted. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 1 - arch/powerpc/include/asm/tce.h| 2 ++ arch/powerpc/kernel/iommu.c | 12 arch/powerpc/platforms/powernv/pci-ioda.c | 17 ++--- drivers/vfio/vfio_iommu_spapr_tce.c | 16 5 files changed, 28 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index fb2c884..00205cb 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -81,7 +81,6 @@ struct iommu_table { struct iommu_group *it_group; #endif iommu_invalidate_fn invalidate; - void (*set_bypass)(struct iommu_table *tbl, bool enable); }; /* Pure 2^n version of get_order */ diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index a697681..c3d295d 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -56,6 +56,8 @@ struct spapr_tce_iommu_ops { struct iommu_table *(*get_table)( struct spapr_tce_iommu_group *data, phys_addr_t addr); + void (*take_ownership)(struct spapr_tce_iommu_group *data, + bool enable); }; struct spapr_tce_iommu_group { diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 
b207332..d9494b2 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1113,14 +1113,6 @@ int iommu_take_ownership(struct iommu_table *tbl) memset(tbl-it_map, 0xff, sz); iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); - /* -* Disable iommu bypass, otherwise the user can DMA to all of -* our physical memory via the bypass window instead of just -* the pages that has been explicitly mapped into the iommu -*/ - if (tbl-set_bypass) - tbl-set_bypass(tbl, false); - return 0; } EXPORT_SYMBOL_GPL(iommu_take_ownership); @@ -1135,10 +1127,6 @@ void iommu_release_ownership(struct iommu_table *tbl) /* Restore bit#0 set by iommu_init_table() */ if (tbl-it_offset == 0) set_bit(0, tbl-it_map); - - /* The kernel owns the device now, we can restore the iommu bypass */ - if (tbl-set_bypass) - tbl-set_bypass(tbl, true); } EXPORT_SYMBOL_GPL(iommu_release_ownership); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 8152e30..b5e757b 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -715,10 +715,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } -static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); uint16_t window_id = (pe-pe_number 1 ) + 1; int64_t rc; @@ -758,16 +756,21 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, /* TVE #1 is selected by PCI address bit 59 */ pe-tce_bypass_base = 1ull 59; - /* Install set_bypass callback for VFIO */ - pe-tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; - /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(pe-tce32_table, true); + pnv_pci_ioda2_set_bypass(pe, true); +} +static void pnv_ioda2_take_ownership(struct 
spapr_tce_iommu_group *data, +bool enable) +{ + struct pnv_ioda_pe *pe = data-iommu_owner; + + pnv_pci_ioda2_set_bypass(pe, !enable); } static struct spapr_tce_iommu_ops pnv_pci_ioda2_ops = { .get_table = pnv_ioda1_iommu_get_table, + .take_ownership = pnv_ioda2_take_ownership, }; static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index
[PATCH v1 04/16] powerpc/powernv: Use it_page_shift in TCE build
This makes use of iommu_table::it_page_shift instead of TCE_SHIFT and TCE_RPN_SHIFT hardcoded values. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index f91a4e5..b6cb996 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -564,10 +564,11 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, proto_tce |= TCE_PCI_WRITE; tces = tcep = ((__be64 *)tbl-it_base) + index - tbl-it_offset; - rpn = __pa(uaddr) TCE_SHIFT; + rpn = __pa(uaddr) tbl-it_page_shift; while (npages--) - *(tcep++) = cpu_to_be64(proto_tce | (rpn++ TCE_RPN_SHIFT)); + *(tcep++) = cpu_to_be64(proto_tce | + (rpn++ tbl-it_page_shift)); /* Some implementations won't cache invalid TCEs and thus may not * need that flush. We'll probably turn it_type into a bit mask -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 11/16] powerpc/iommu: Extend ppc_md.tce_build(_rm) to return old TCE values
The tce_build/tce_build_rm callbacks are used to implement H_PUT_TCE/etc hypercalls. The PAPR spec does not allow to fail if the TCE is not empty. However we cannot just overwrite the existing TCE value with the new one as we still have to do page counting. This adds an optional @old_tces return parameter. If it is not NULL, it must point to an array of @npages size where the callbacks will store old TCE values. Since tce_build receives virtual addresses, the old_tces array will contain virtual addresses as well. As this patch is mechanical, no change in behaviour is expected. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/machdep.h | 2 ++ arch/powerpc/kernel/iommu.c| 8 +--- arch/powerpc/platforms/powernv/pci.c | 13 - arch/powerpc/platforms/pseries/iommu.c | 7 +-- arch/powerpc/sysdev/dart_iommu.c | 1 + 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index f92b0b5..f11596c 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -69,6 +69,7 @@ struct machdep_calls { long index, long npages, unsigned long uaddr, +unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs); void(*tce_free)(struct iommu_table *tbl, @@ -83,6 +84,7 @@ struct machdep_calls { long index, long npages, unsigned long uaddr, +long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs); void(*tce_free_rm)(struct iommu_table *tbl, diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 01ac319..ae57910 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -324,7 +324,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* Put the TCEs in the HW table */ build_fail = ppc_md.tce_build(tbl, entry, npages, (unsigned long)page - IOMMU_PAGE_MASK(tbl), direction, attrs); + IOMMU_PAGE_MASK(tbl), NULL, direction, + attrs); /* 
ppc_md.tce_build() only returns non-zero for transient errors. * Clean up the table bitmap in this case and return @@ -497,7 +498,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, /* Insert into HW table */ build_fail = ppc_md.tce_build(tbl, entry, npages, vaddr IOMMU_PAGE_MASK(tbl), - direction, attrs); + NULL, direction, attrs); if(unlikely(build_fail)) goto failure; @@ -1056,7 +1057,8 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, oldtce = ppc_md.tce_get(tbl, entry); /* Add new entry if it is not busy */ if (!(oldtce (TCE_PCI_WRITE | TCE_PCI_READ))) - ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL); + ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, NULL, + direction, NULL); spin_unlock((pool-lock)); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 09287c7..e002c66 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -568,7 +568,8 @@ static void pnv_tce_invalidate(struct iommu_table *tbl, __be64 *startp, } static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, -unsigned long uaddr, enum dma_data_direction direction, +unsigned long uaddr, unsigned long *old_tces, +enum dma_data_direction direction, struct dma_attrs *attrs, bool rm) { u64 proto_tce; @@ -593,12 +594,12 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, } static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, + unsigned long uaddr, unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { - return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, - false); +
[PATCH v1 12/16] powerpc/powernv: Return non-zero TCE from pnv_tce_build
This returns old TCE values to the caller if requested. The caller is expected to call put_page() for them. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index e002c66..a9165a5 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -575,6 +575,7 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, u64 proto_tce; __be64 *tcep, *tces; u64 rpn; + long i; proto_tce = TCE_PCI_READ; // Read allowed @@ -584,9 +585,13 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, tces = tcep = ((__be64 *)tbl-it_base) + index - tbl-it_offset; rpn = __pa(uaddr) tbl-it_page_shift; - while (npages--) - *(tcep++) = cpu_to_be64(proto_tce | - (rpn++ tbl-it_page_shift)); + for (i = 0; i npages; i++) { + unsigned long oldtce = xchg(tcep, cpu_to_be64(proto_tce | + (rpn++ tbl-it_page_shift))); + if (old_tces) + old_tces[i] = (unsigned long) __va(oldtce); + tcep++; + } pnv_tce_invalidate(tbl, tces, tcep - 1, rm); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 13/16] powerpc/iommu: Implement put_page() if TCE had non-zero value
Guests might put new TCEs without clearing them first and the PAPR spec allows that. This adds put_page() for TCEs which we just replaced. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index ae57910..25fda58 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1054,11 +1054,11 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, spin_lock((pool-lock)); - oldtce = ppc_md.tce_get(tbl, entry); - /* Add new entry if it is not busy */ - if (!(oldtce (TCE_PCI_WRITE | TCE_PCI_READ))) - ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, NULL, - direction, NULL); + ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, oldtce, + direction, NULL); + + if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) + put_page(pfn_to_page(__pa(oldtce))); spin_unlock((pool-lock)); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 10/16] powerpc/iommu: Fix missing permission bits in iommu_put_tce_user_mode()
This adds missing permission bits to the translated TCE. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index da04561..01ac319 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1085,6 +1085,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, return -EFAULT; } hwaddr = (unsigned long) page_address(page) + offset; + hwaddr |= tce (TCE_PCI_READ | TCE_PCI_WRITE); ret = iommu_tce_build(tbl, entry, hwaddr, direction); if (ret) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 09/16] powerpc/iommu: Fix IOMMU ownership control functions
This adds missing locks in iommu_take_ownership()/ iommu_release_ownership(). This marks all pages busy in iommu_table::it_map in order to catch errors if there is an attempt to use this table while ownership over it is taken. This only clears TCE content if there is no page marked busy in it_map. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 37 ++--- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index d9494b2..da04561 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1100,33 +1100,56 @@ EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); int iommu_take_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl-it_size + 7) 3; + unsigned long flags, i, sz = (tbl-it_size + 7) 3; + int ret = 0, bit0 = 0; + + spin_lock_irqsave(tbl-large_pool.lock, flags); + for (i = 0; i tbl-nr_pools; i++) + spin_lock(tbl-pools[i].lock); if (tbl-it_offset == 0) - clear_bit(0, tbl-it_map); + bit0 = test_and_clear_bit(0, tbl-it_map); if (!bitmap_empty(tbl-it_map, tbl-it_size)) { pr_err(iommu_tce: it_map is not empty); - return -EBUSY; + ret = -EBUSY; + if (bit0) + set_bit(0, tbl-it_map); + } else { + memset(tbl-it_map, 0xff, sz); } - memset(tbl-it_map, 0xff, sz); - iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); + if (!ret) + iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, + tbl-it_size); - return 0; + for (i = 0; i tbl-nr_pools; i++) + spin_unlock(tbl-pools[i].lock); + spin_unlock_irqrestore(tbl-large_pool.lock, flags); + + return ret; } EXPORT_SYMBOL_GPL(iommu_take_ownership); void iommu_release_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl-it_size + 7) 3; + unsigned long flags, i, sz = (tbl-it_size + 7) 3; iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); + + spin_lock_irqsave(tbl-large_pool.lock, flags); + for (i = 0; i tbl-nr_pools; i++) + spin_lock(tbl-pools[i].lock); + memset(tbl-it_map, 0, 
sz); /* Restore bit#0 set by iommu_init_table() */ if (tbl-it_offset == 0) set_bit(0, tbl-it_map); + + for (i = 0; i tbl-nr_pools; i++) + spin_unlock(tbl-pools[i].lock); + spin_unlock_irqrestore(tbl-large_pool.lock, flags); } EXPORT_SYMBOL_GPL(iommu_release_ownership); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 16/16] vfio: powerpc: Enable Dynamic DMA windows
This defines and implements VFIO IOMMU API required to support Dynamic DMA windows defined in the SPAPR specification. The ioctl handlers implement host-side part of corresponding RTAS calls: - VFIO_IOMMU_SPAPR_TCE_QUERY - ibm,query-pe-dma-window; - VFIO_IOMMU_SPAPR_TCE_CREATE - ibm,create-pe-dma-window; - VFIO_IOMMU_SPAPR_TCE_REMOVE - ibm,remove-pe-dma-window; - VFIO_IOMMU_SPAPR_TCE_RESET - ibm,reset-pe-dma-window. The VFIO IOMMU driver does basic sanity checks and calls corresponding SPAPR TCE functions. At the moment only IODA2 (POWER8 PCI host bridge) implements them. This advertises VFIO_IOMMU_SPAPR_TCE_FLAG_DDW capability via VFIO_IOMMU_SPAPR_TCE_GET_INFO. This calls reset() when IOMMU is being disabled (happens when VFIO stops using it). Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 1 + drivers/vfio/vfio_iommu_spapr_tce.c | 132 +- include/uapi/linux/vfio.h | 37 - 3 files changed, 168 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 25a4f0e..63aa697 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -869,6 +869,7 @@ static long pnv_pci_ioda2_ddw_create(struct spapr_tce_iommu_group *data, tbl64-invalidate = pnv_pci_ioda2_tce_invalidate_64; /* Copy invalidate register address */ + tbl64-it_group = pe-tce32_table.it_group; tbl64-it_index = pe-tce32_table.it_index; tbl64-it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE | TCE_PCI_SWINV_PAIR; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 808c7d3..8f992de 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -124,13 +124,20 @@ static void tce_iommu_disable(struct tce_container *container) container-enabled = false; - if (!container-grp || !current-mm) + if (!container-grp) return; data = iommu_group_get_iommudata(container-grp); if 
(!data || !data-iommu_owner || !data-ops-get_table) return; + /* Try resetting, there might have been a 64bit window */ + if (data-ops-reset) + data-ops-reset(data); + + if (!current-mm) + return; + tbl = data-ops-get_table(data, 0); if (!tbl) return; @@ -213,6 +220,8 @@ static long tce_iommu_ioctl(void *iommu_data, info.dma32_window_start = tbl-it_offset tbl-it_page_shift; info.dma32_window_size = tbl-it_size tbl-it_page_shift; info.flags = 0; + if (data-ops-query data-ops-create data-ops-remove) + info.flags |= VFIO_IOMMU_SPAPR_TCE_FLAG_DDW; if (copy_to_user((void __user *)arg, info, minsz)) return -EFAULT; @@ -338,6 +347,127 @@ static long tce_iommu_ioctl(void *iommu_data, tce_iommu_disable(container); mutex_unlock(container-lock); return 0; + + case VFIO_IOMMU_SPAPR_TCE_QUERY: { + struct vfio_iommu_spapr_tce_query query; + struct spapr_tce_iommu_group *data; + + if (WARN_ON(!container-grp)) + return -ENXIO; + + data = iommu_group_get_iommudata(container-grp); + + minsz = offsetofend(struct vfio_iommu_spapr_tce_query, + page_size_mask); + + if (copy_from_user(query, (void __user *)arg, minsz)) + return -EFAULT; + + if (query.argsz minsz) + return -EINVAL; + + if (!data-ops-query || !data-iommu_owner) + return -ENOSYS; + + ret = data-ops-query(data, + query.windows_available, + query.page_size_mask); + + if (copy_to_user((void __user *)arg, query, minsz)) + return -EFAULT; + + return 0; + } + case VFIO_IOMMU_SPAPR_TCE_CREATE: { + struct vfio_iommu_spapr_tce_create create; + struct spapr_tce_iommu_group *data; + struct iommu_table *tbl; + + if (WARN_ON(!container-grp)) + return -ENXIO; + + data = iommu_group_get_iommudata(container-grp); + + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, + start_addr); + + if (copy_from_user(create, (void __user *)arg, minsz)) + return -EFAULT; + + if (create.argsz minsz) + return -EINVAL; + + if (!data-ops-create || !data-iommu_owner) +
[PATCH v1 14/16] powerpc/powernv: Implement Dynamic DMA windows (DDW) for IODA
SPAPR defines an interface to create additional DMA windows dynamically. Dynamically means that the window is not allocated at the guest start and the guest can request it later. In practice, existing linux guests check for the capability and if it is there, they create+map one big DMA window as big as the entire guest RAM. SPAPR defines 4 RTAS calls for this feature which userspace implements. This adds 4 callbacks into the spapr_tce_iommu_ops struct: 1. query - ibm,query-pe-dma-window - returns number/size of windows which can be created (one, any page size); 2. create - ibm,create-pe-dma-window - creates a window; 3. remove - ibm,remove-pe-dma-window - removes a window; only additional window created by create() can be removed, the default 32bit window cannot be removed as guests do not expect new windows to start from zero; 4. reset - ibm,reset-pe-dma-window - reset the DMA windows configuration to the default state; now it only removes the additional window if it was created. The next patch will add corresponding ioctls to VFIO SPAPR TCE driver to pass RTAS call from the userspace to the IODA code. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/tce.h| 21 arch/powerpc/platforms/powernv/pci-ioda.c | 160 +- arch/powerpc/platforms/powernv/pci.h | 2 + 3 files changed, 182 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index c3d295d..68f6575 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -58,6 +58,27 @@ struct spapr_tce_iommu_ops { phys_addr_t addr); void (*take_ownership)(struct spapr_tce_iommu_group *data, bool enable); + + /* Dynamic DMA window */ + /* Page size flags for ibm,query-pe-dma-window */ +#define DDW_PGSIZE_4K 0x01 +#define DDW_PGSIZE_64K 0x02 +#define DDW_PGSIZE_16M 0x04 +#define DDW_PGSIZE_32M 0x08 +#define DDW_PGSIZE_64M 0x10 +#define DDW_PGSIZE_128M 0x20 +#define DDW_PGSIZE_256M 0x40 +#define DDW_PGSIZE_16G 0x80 + long (*query)(struct spapr_tce_iommu_group *data, + __u32 *windows_available, + __u32 *page_size_mask); + long (*create)(struct spapr_tce_iommu_group *data, + __u32 page_shift, + __u32 window_shift, + struct iommu_table **ptbl); + long (*remove)(struct spapr_tce_iommu_group *data, + struct iommu_table *tbl); + long (*reset)(struct spapr_tce_iommu_group *data); }; struct spapr_tce_iommu_group { diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b5e757b..25a4f0e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -595,6 +595,15 @@ static void pnv_pci_ioda2_tce_invalidate_32(struct iommu_table *tbl, pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); } +static void pnv_pci_ioda2_tce_invalidate_64(struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm) +{ + struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, + tce64_table); + + pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); +} + static bool pnv_pci_ioda_check_addr(struct iommu_table *tbl, __u64 start_addr) { unsigned long 
entry = start_addr tbl-it_page_shift; @@ -760,6 +769,21 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pnv_pci_ioda2_set_bypass(pe, true); } +static struct iommu_table *pnv_ioda2_iommu_get_table( + struct spapr_tce_iommu_group *data, + phys_addr_t addr) +{ + struct pnv_ioda_pe *pe = data-iommu_owner; + + if (pnv_pci_ioda_check_addr(pe-tce64_table, addr)) + return pe-tce64_table; + + if (pnv_pci_ioda_check_addr(pe-tce32_table, addr)) + return pe-tce32_table; + + return NULL; +} + static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, bool enable) { @@ -768,9 +792,143 @@ static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, pnv_pci_ioda2_set_bypass(pe, !enable); } +static long pnv_pci_ioda2_ddw_query(struct spapr_tce_iommu_group *data, + __u32 *windows_available, __u32 *page_size_mask) +{ + struct pnv_ioda_pe *pe = data-iommu_owner; + + if (pe-tce64_active) { + *page_size_mask = 0; + *windows_available = 0; + } else { + *page_size_mask = + DDW_PGSIZE_4K | + DDW_PGSIZE_64K | + DDW_PGSIZE_16M | + DDW_PGSIZE_32M | + DDW_PGSIZE_64M | + DDW_PGSIZE_128M | +
[PATCH v1 3/7] powerpc/iommu: Clean up IOMMU API
The iommu_tce_direction() function is not used from outside iommu.c so make it static. The iommu_clear_tce() is not used anymore at all so remove it. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 4 arch/powerpc/kernel/iommu.c | 22 +- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 1c9b346..2f420c28 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -195,8 +195,6 @@ extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long *hpas, unsigned long npages, bool realmode); -extern unsigned long iommu_clear_tce(struct iommu_table *tbl, - unsigned long entry); extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages, bool realmode); @@ -207,7 +205,5 @@ extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); extern void iommu_release_ownership(struct iommu_table *tbl); -extern enum dma_data_direction iommu_tce_direction(unsigned long tce); - #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index dd68569..259ddb5 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -929,7 +929,7 @@ void iommu_register_group(struct iommu_table *tbl, kfree(name); } -enum dma_data_direction iommu_tce_direction(unsigned long tce) +static enum dma_data_direction iommu_tce_direction(unsigned long tce) { if ((tce TCE_PCI_READ) (tce TCE_PCI_WRITE)) return DMA_BIDIRECTIONAL; @@ -940,7 +940,6 @@ enum dma_data_direction iommu_tce_direction(unsigned long tce) else return DMA_NONE; } -EXPORT_SYMBOL_GPL(iommu_tce_direction); void iommu_flush_tce(struct iommu_table *tbl) { @@ -998,25 +997,6 @@ int 
iommu_tce_put_param_check(struct iommu_table *tbl, } EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) -{ - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - - spin_lock((pool-lock)); - - oldtce = ppc_md.tce_get(tbl, entry); - if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) - ppc_md.tce_free(tbl, entry, 1); - else - oldtce = 0; - - spin_unlock((pool-lock)); - - return oldtce; -} -EXPORT_SYMBOL_GPL(iommu_clear_tce); - int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages, bool realmode) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 1/7] powerpc/iommu: Change prototypes for realmode support
This is a mechanical patch to add an extra realmode parameter to iommu_clear_tces_and_put_pages() and iommu_tce_build() helpers. This changes iommu_tce_build() to receive multiple page addresses at once as in the future we want to save on locks and TCE flushes in realmode. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 5 +++-- arch/powerpc/kernel/iommu.c | 15 +-- arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++- drivers/vfio/vfio_iommu_spapr_tce.c | 6 -- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 00205cb..1c9b346 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -194,11 +194,12 @@ extern int iommu_tce_clear_param_check(struct iommu_table *tbl, extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction); + unsigned long *hpas, unsigned long npages, bool realmode); extern unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry); extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages); + unsigned long entry, unsigned long pages, + bool realmode); extern int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, unsigned long tce); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 25fda58..8771b73 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1018,7 +1018,8 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) EXPORT_SYMBOL_GPL(iommu_clear_tce); int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages) + unsigned long entry, unsigned long pages, + bool realmode) { unsigned long oldtce; struct page *page; @@ 
-1046,15 +1047,16 @@ EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); * tce_build converts it to a physical address. */ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction) + unsigned long *hpas, unsigned long npages, bool realmode) { int ret = -EBUSY; unsigned long oldtce; struct iommu_pool *pool = get_pool(tbl, entry); + enum dma_data_direction direction = iommu_tce_direction(*hpas); spin_lock((pool-lock)); - ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, oldtce, + ret = ppc_md.tce_build(tbl, entry, 1, *hpas, oldtce, direction, NULL); if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) @@ -1089,7 +1091,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, hwaddr = (unsigned long) page_address(page) + offset; hwaddr |= tce (TCE_PCI_READ | TCE_PCI_WRITE); - ret = iommu_tce_build(tbl, entry, hwaddr, direction); + ret = iommu_tce_build(tbl, entry, hwaddr, 1, direction); if (ret) put_page(page); @@ -1124,7 +1126,7 @@ int iommu_take_ownership(struct iommu_table *tbl) if (!ret) iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, - tbl-it_size); + tbl-it_size, false); for (i = 0; i tbl-nr_pools; i++) spin_unlock(tbl-pools[i].lock); @@ -1138,7 +1140,8 @@ void iommu_release_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl-it_size + 7) 3; - iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); + iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size, + false); spin_lock_irqsave(tbl-large_pool.lock, flags); for (i = 0; i tbl-nr_pools; i++) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 63aa697..2d65a7d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -895,7 +895,8 @@ static long pnv_pci_ioda2_ddw_remove(struct spapr_tce_iommu_group *data, pr_info(Removing huge 64bit DMA window\n); - iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, 
tbl-it_size); + iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size, + false); pe-tce64_active = false; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 8f992de..ff1b29e 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -283,7 +283,8 @@
[PATCH v1 0/7] powerpc/iommu: kvm: Enable MultiTCE support
This prepares upstream kernel for in-kernel acceleration of TCE hypercalls (H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE). This implements acceleration for both real and virtual modes. As it requires gup() for real mode to parse TCE list page, this implements gup() for realmode. This only accelerates emulated PCI and VIO devices. DDW is not affected. This was made on top of [PATCH v1 00/16] powernv: vfio: Add Dynamic DMA windows (DDW) Alexey Kardashevskiy (7): powerpc/iommu: Change prototypes for realmode support powerpc/iommu: Support real mode powerpc/iommu: Clean up IOMMU API KVM: PPC: Replace SPAPR_TCE_SHIFT with IOMMU_PAGE_SHIFT_4K KVM: PPC: Move reusable bits of H_PUT_TCE handler to helpers KVM: PPC: Add kvmppc_find_tce_table() KVM: PPC: Add support for multiple-TCE hcalls Documentation/virtual/kvm/api.txt | 26 +++ arch/powerpc/include/asm/iommu.h | 9 +- arch/powerpc/include/asm/kvm_book3s_64.h | 2 - arch/powerpc/include/asm/kvm_host.h | 30 +++ arch/powerpc/include/asm/kvm_ppc.h| 16 ++ arch/powerpc/kernel/iommu.c | 140 +++- arch/powerpc/kvm/book3s_64_vio.c | 177 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 343 ++ arch/powerpc/kvm/book3s_hv.c | 30 ++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 +- arch/powerpc/kvm/book3s_pr.c | 4 + arch/powerpc/kvm/book3s_pr_papr.c | 35 +++ arch/powerpc/kvm/powerpc.c| 3 + arch/powerpc/platforms/powernv/pci-ioda.c | 3 +- drivers/vfio/vfio_iommu_spapr_tce.c | 6 +- 15 files changed, 720 insertions(+), 108 deletions(-) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 2/7] powerpc/iommu: Support real mode
The TCE tables handling differs for real (MMU off) and virtual modes (MMU on) so additional set of realmode-capable TCE callbacks has been added to ppc_md: * tce_build_rm * tce_free_rm * tce_flush_rm This makes use of new ppc_md calls in iommu_clear_tces_and_put_pages. This changes iommu_tce_build() to handle multiple pages at once under the same lock. tce_flush() is called once per call. This adds a memory barrier after flushing TCE table changes. This removes comment about hwaddr as now it is an array called hpas and hpa is descriptive enough acronym. This does not clear TCE for a huge page in real mode and passes handling of this to virtual mode. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 107 +--- 1 file changed, 81 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 8771b73..dd68569 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1021,53 +1021,108 @@ int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages, bool realmode) { - unsigned long oldtce; - struct page *page; + int i, ret = 0, to_free = 0; - for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) + if (realmode !ppc_md.tce_free_rm) + return -EAGAIN; + + for (i = 0; i pages; ++i) { + unsigned long oldtce = ppc_md.tce_get(tbl, entry + i); + + if (!(oldtce (TCE_PCI_WRITE | TCE_PCI_READ))) continue; - page = pfn_to_page(oldtce PAGE_SHIFT); - WARN_ON(!page); - if (page) { - if (oldtce TCE_PCI_WRITE) - SetPageDirty(page); - put_page(page); + if (realmode) { + struct page *pg = realmode_pfn_to_page( + oldtce PAGE_SHIFT); + if (!pg) { + ret = -EAGAIN; + } else if (PageCompound(pg)) { + ret = -EAGAIN; + } else { + if (oldtce TCE_PCI_WRITE) + SetPageDirty(pg); + if (!put_page_unless_one(pg)) + ret = -EAGAIN; + } + } else { + struct page *pg = pfn_to_page(oldtce PAGE_SHIFT); + + if (!pg) { + ret = -EAGAIN; + } 
else { + if (oldtce TCE_PCI_WRITE) + SetPageDirty(pg); + put_page(pg); + } } + if (ret) + break; + to_free = i + 1; } - return 0; + if (to_free) { + if (realmode) + ppc_md.tce_free_rm(tbl, entry, to_free); + else + ppc_md.tce_free(tbl, entry, to_free); + + if (realmode ppc_md.tce_flush_rm) + ppc_md.tce_flush_rm(tbl); + else if (!realmode ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + } + + /* Make sure updates are seen by hardware */ + mb(); + + return ret; } EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); -/* - * hwaddr is a kernel virtual address here (0xc... bazillion), - * tce_build converts it to a physical address. - */ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long *hpas, unsigned long npages, bool realmode) { - int ret = -EBUSY; - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - enum dma_data_direction direction = iommu_tce_direction(*hpas); + int i, ret = 0; - spin_lock((pool-lock)); + if (realmode !ppc_md.tce_build_rm) + return -EAGAIN; - ret = ppc_md.tce_build(tbl, entry, 1, *hpas, oldtce, - direction, NULL); + for (i = 0; i npages; ++i) { + unsigned long hva = (unsigned long) __va(hpas[i]); + enum dma_data_direction dir = iommu_tce_direction(hva); + unsigned long oldtce = 0; - if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) - put_page(pfn_to_page(__pa(oldtce))); + if (realmode) { + ret = ppc_md.tce_build_rm(tbl, entry + i, 1, + hva, oldtce, dir, NULL); + if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) { +
[PATCH v1 4/7] KVM: PPC: Replace SPAPR_TCE_SHIFT with IOMMU_PAGE_SHIFT_4K
SPAPR_TCE_SHIFT is used in few places only and since IOMMU_PAGE_SHIFT_4K can be easily used instead, remove SPAPR_TCE_SHIFT. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/kvm_book3s_64.h | 2 -- arch/powerpc/kvm/book3s_64_vio.c | 3 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 5 +++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index fddb72b..4f7dcf6 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) } #endif -#define SPAPR_TCE_SHIFT12 - #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ extern unsigned long kvm_rma_pages; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 516f2ee..e9bcb13 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -36,12 +36,13 @@ #include asm/ppc-opcode.h #include asm/kvm_host.h #include asm/udbg.h +#include asm/iommu.h #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) static long kvmppc_stt_npages(unsigned long window_size) { - return ALIGN((window_size SPAPR_TCE_SHIFT) + return ALIGN((window_size IOMMU_PAGE_SHIFT_4K) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 89e96b3..2624a01 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -35,6 +35,7 @@ #include asm/ppc-opcode.h #include asm/kvm_host.h #include asm/udbg.h +#include asm/iommu.h #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) @@ -52,7 +53,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, list_for_each_entry(stt, kvm-arch.spapr_tce_tables, list) { if (stt-liobn == liobn) { - unsigned long idx = ioba 
>> IOMMU_PAGE_SHIFT_4K; struct page *page; u64 *tbl; @@ -84,7 +85,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, list_for_each_entry(stt, kvm->arch.spapr_tce_tables, list) { if (stt->liobn == liobn) { - unsigned long idx = ioba >> SPAPR_TCE_SHIFT; + unsigned long idx = ioba >> IOMMU_PAGE_SHIFT_4K; struct page *page; u64 *tbl; -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 5/7] KVM: PPC: Move reusable bits of H_PUT_TCE handler to helpers
Upcoming multi-tce support (H_PUT_TCE_INDIRECT/H_STUFF_TCE hypercalls) will validate TCE (not to have unexpected bits) and IO address (to be within the DMA window boundaries). This introduces helpers to validate TCE and IO address. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/kvm_ppc.h | 4 ++ arch/powerpc/kvm/book3s_64_vio_hv.c | 117 2 files changed, 109 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 9c89cdd..26e6e1a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -127,6 +127,10 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages); +extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, + unsigned long tce); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 2624a01..ab3f50f 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -36,9 +36,102 @@ #include asm/kvm_host.h #include asm/udbg.h #include asm/iommu.h +#include asm/tce.h #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +/* + * Validates IO address. 
+ * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages) +{ + unsigned long mask = (1 IOMMU_PAGE_SHIFT_4K) - 1; + unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; + unsigned long size = stt-window_size IOMMU_PAGE_SHIFT_4K; + + if ((ioba mask) || (size + npages = idx)) + return H_PARAMETER; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); + +/* + * Validates TCE address. + * At the moment flags and page mask are validated. + * As the host kernel does not access those addresses (just puts them + * to the table and user space is supposed to process them), we can skip + * checking other things (such as TCE is a guest RAM address or the page + * was actually allocated). + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) +{ + unsigned long mask = ((1 IOMMU_PAGE_SHIFT_4K) - 1) + ~(TCE_PCI_WRITE | TCE_PCI_READ); + + if (tce mask) + return H_PARAMETER; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_validate); + +/* Note on the use of page_address() in real mode, + * + * It is safe to use page_address() in real mode on ppc64 because + * page_address() is always defined as lowmem_page_address() + * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetial + * operation and does not access page struct. + * + * Theoretically page_address() could be defined different + * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL + * should be enabled. + * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64, + * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only + * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP + * is not expected to be enabled on ppc32, page_address() + * is safe for ppc32 as well. 
+ * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +static u64 *kvmppc_page_address(struct page *page) +{ +#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL) +#error TODO: fix to avoid page_address() here +#endif + return (u64 *) page_address(page); +} + +/* + * Handles TCE requests for emulated devices. + * Puts guest TCE values to the table and expects user space to convert them. + * Called in both real and virtual modes. + * Cannot fail so kvmppc_tce_validate must be called before it. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, + unsigned long idx, unsigned long tce) +{ + struct page *page; + u64 *tbl; + + page = stt-pages[idx / TCES_PER_PAGE]; + tbl = kvmppc_page_address(page); + + tbl[idx % TCES_PER_PAGE] = tce; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_put); + /* WARNING: This will be called in real-mode on HV KVM and virtual * mode on PR KVM */ @@ -54,20 +147,19 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
[PATCH v1 6/7] KVM: PPC: Add kvmppc_find_tce_table()
This adds a common helper to search for a kvmppc_spapr_tce_table by LIOBN. This makes H_PUT_TCE and H_GET_TCE handler use this new helper. The helper will be also used in H_PUT_TCE_INDIRECT and H_STUFF_TCE handlers. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio_hv.c | 79 - 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ab3f50f..79406f1 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -40,6 +40,20 @@ #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(struct kvm *kvm, + unsigned long liobn) +{ + struct kvmppc_spapr_tce_table *stt; + + list_for_each_entry(stt, kvm-arch.spapr_tce_tables, list) { + if (stt-liobn == liobn) + return stt; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(kvmppc_find_tce_table); + /* * Validates IO address. * @@ -138,62 +152,55 @@ EXPORT_SYMBOL_GPL(kvmppc_tce_put); long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvm *kvm = vcpu-kvm; struct kvmppc_spapr_tce_table *stt; + long ret; + unsigned long idx; /* udbg_printf(H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n, */ /* liobn, ioba, tce); */ - list_for_each_entry(stt, kvm-arch.spapr_tce_tables, list) { - if (stt-liobn == liobn) { - unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; - /* udbg_printf(H_PUT_TCE: liobn 0x%lx = stt=%p window_size=0x%x\n, */ - /* liobn, stt, stt-window_size); */ - long ret = kvmppc_ioba_validate(stt, ioba, 1); + stt = kvmppc_find_tce_table(vcpu-kvm, liobn); + if (!stt) + return H_TOO_HARD; - if (ret) - return ret; + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret) + return ret; - ret = kvmppc_tce_validate(stt, tce); - if (ret) - return ret; + ret = kvmppc_tce_validate(stt, tce); + if (ret) + return ret; - kvmppc_tce_put(stt, idx, tce); + idx = ioba IOMMU_PAGE_SHIFT_4K; + 
kvmppc_tce_put(stt, idx, tce); - return H_SUCCESS; - } - } - - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { - struct kvm *kvm = vcpu-kvm; struct kvmppc_spapr_tce_table *stt; + long ret; + unsigned long idx; + struct page *page; + u64 *tbl; - list_for_each_entry(stt, kvm-arch.spapr_tce_tables, list) { - if (stt-liobn == liobn) { - unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; - struct page *page; - u64 *tbl; - long ret = kvmppc_ioba_validate(stt, ioba, 1); + stt = kvmppc_find_tce_table(vcpu-kvm, liobn); + if (!stt) + return H_TOO_HARD; - if (ret) - return ret; + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret) + return ret; - page = stt-pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); + idx = ioba IOMMU_PAGE_SHIFT_4K; + page = stt-pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); - vcpu-arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; - return H_SUCCESS; - } - } + vcpu-arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 7/7] KVM: PPC: Add support for multiple-TCE hcalls
This adds real and virtual mode handlers for the H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls for user space emulated devices such as IBMVIO devices or emulated PCI. These calls allow adding multiple entries (up to 512) into the TCE table in one call which saves time on transition between kernel and user space. This adds a tce_tmp_hpas cache to kvm_vcpu_arch to save valid TCEs (copied from user and verified) before writing the whole list into the TCE table. This cache will be utilized more in the upcoming VFIO/IOMMU support to continue TCE list processing in the virtual mode in the case if the real mode handler failed for some reason. This adds kvmppc_spapr_tce_init() and kvmppc_spapr_tce_free() helpers to allocate and free the tce_tmp_hpas cache. This adds a function to convert a guest physical address to a host virtual address in order to parse a TCE list from H_PUT_TCE_INDIRECT. This caches tce_rm_list_pg TCE list page pointer for situation when the real mode handler managed to reference the list page and then PTE changed and real mode handler could not dereference the page. The cached page pointer is dereferenced in virtual mode. This implements the KVM_CAP_PPC_MULTITCE capability. When present, the kernel will try handling H_PUT_TCE_INDIRECT and H_STUFF_TCE. If they can not be handled by the kernel, they are passed on to the user space. The user space still has to have an implementation for these. Both HV and PR-syle KVM are supported. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changelog: v12: * used RCU for kvm-arch.spapr_tce_tables v11: * added kvm_vcpu_arch::tce_rm_list_pg to cache page struct pointer referenced-and-not-dereferenced in real mode * kvmppc_spapr_tce_init/kvmppc_spapr_tce_free called from PR code now too * removed get_page/put_page from virtual mode handler' * srcu_read_lock(vcpu-kvm-srcu) now protects entire kvmppc_h_put_tce_indirect (virtual mode handler for H_PUT_TCE_INDIRECT) v10: * kvmppc_find_tce_table() changed to take kvm* instead of vcpu* v8: * fixed warnings from check_patch.pl 2013/08/01 (v7): * realmode_get_page/realmode_put_page use was replaced with get_page_unless_zero/put_page_unless_one 2013/07/11: * addressed many, many comments from maintainers 2013/07/06: * fixed number of wrong get_page()/put_page() calls 2013/06/27: * fixed clear of BUSY bit in kvmppc_lookup_pte() * H_PUT_TCE_INDIRECT does realmode_get_page() now * KVM_CAP_SPAPR_MULTITCE now depends on CONFIG_PPC_BOOK3S_64 * updated doc 2013/06/05: * fixed mistype about IBMVIO in the commit message * updated doc and moved it to another section * changed capability number 2013/05/21: * added kvm_vcpu_arch::tce_tmp * removed cleanup if put_indirect failed, instead we do not even start writing to TCE table if we cannot get TCEs from the user and they are invalid * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce and kvmppc_emulated_validate_tce (for the previous item) * fixed bug with failthrough for H_IPI * removed all get_user() from real mode handlers * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public) --- Documentation/virtual/kvm/api.txt | 26 + arch/powerpc/include/asm/kvm_host.h | 30 ++ arch/powerpc/include/asm/kvm_ppc.h | 12 +++ arch/powerpc/kvm/book3s_64_vio.c| 174 +++- arch/powerpc/kvm/book3s_64_vio_hv.c | 168 +- arch/powerpc/kvm/book3s_hv.c| 30 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 +- arch/powerpc/kvm/book3s_pr.c| 4 + 
arch/powerpc/kvm/book3s_pr_papr.c | 35 +++ arch/powerpc/kvm/powerpc.c | 3 + 10 files changed, 478 insertions(+), 8 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0fe3649..e1c72bf 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2494,6 +2494,32 @@ calls by the guest for that service will be passed to userspace to be handled. +4.87 KVM_CAP_PPC_MULTITCE + +Capability: KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This capability means the kernel is capable of handling hypercalls +H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user +space. This significantly accelerates DMA operations for PPC KVM guests. +User space should expect that its handlers for these hypercalls +are not going to be called if user space previously registered LIOBN +in KVM (via KVM_CREATE_SPAPR_TCE or similar calls). + +In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, +user space might have to advertise it for the guest. For example, +IBM pSeries (sPAPR) guest starts using them if hcall-multi-tce is +present in the ibm,hypertas-functions device-tree property. + +The hypercalls mentioned above may or may not be processed successfully +in the kernel based fast path. If they can not be handled by the kernel, +they will get passed on to user
[PATCH v1 15/16] vfio: Use it_page_size
This makes use of the it_page_size from the iommu_table struct as page size can differ. This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code as recently introduced IOMMU_PAGE_XXX macros do not include IOMMU_PAGE_SHIFT. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index ff2bb92..808c7d3 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -97,7 +97,7 @@ static int tce_iommu_enable(struct tce_container *container) return -ENXIO; down_write(current-mm-mmap_sem); - npages = (tbl-it_size IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; + npages = (tbl-it_size tbl-it_page_shift) PAGE_SHIFT; locked = current-mm-locked_vm + npages; lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; if (locked lock_limit !capable(CAP_IPC_LOCK)) { @@ -137,7 +137,7 @@ static void tce_iommu_disable(struct tce_container *container) down_write(current-mm-mmap_sem); current-mm-locked_vm -= (tbl-it_size - IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; + tbl-it_page_shift) PAGE_SHIFT; up_write(current-mm-mmap_sem); } @@ -210,8 +210,8 @@ static long tce_iommu_ioctl(void *iommu_data, if (info.argsz minsz) return -EINVAL; - info.dma32_window_start = tbl-it_offset IOMMU_PAGE_SHIFT_4K; - info.dma32_window_size = tbl-it_size IOMMU_PAGE_SHIFT_4K; + info.dma32_window_start = tbl-it_offset tbl-it_page_shift; + info.dma32_window_size = tbl-it_size tbl-it_page_shift; info.flags = 0; if (copy_to_user((void __user *)arg, info, minsz)) @@ -264,17 +264,17 @@ static long tce_iommu_ioctl(void *iommu_data, if (ret) return ret; - for (i = 0; i (param.size IOMMU_PAGE_SHIFT_4K); ++i) { + for (i = 0; i (param.size tbl-it_page_shift); ++i) { ret = iommu_put_tce_user_mode(tbl, - (param.iova IOMMU_PAGE_SHIFT_4K) + i, + (param.iova tbl-it_page_shift) + i, tce); if (ret) break; - tce += IOMMU_PAGE_SIZE_4K; + 
tce += IOMMU_PAGE_SIZE(tbl); } if (ret) iommu_clear_tces_and_put_pages(tbl, - param.iova IOMMU_PAGE_SHIFT_4K, i); + param.iova tbl-it_page_shift, i); iommu_flush_tce(tbl); @@ -315,13 +315,13 @@ static long tce_iommu_ioctl(void *iommu_data, BUG_ON(!tbl-it_group); ret = iommu_tce_clear_param_check(tbl, param.iova, 0, - param.size IOMMU_PAGE_SHIFT_4K); + param.size tbl-it_page_shift); if (ret) return ret; ret = iommu_clear_tces_and_put_pages(tbl, - param.iova IOMMU_PAGE_SHIFT_4K, - param.size IOMMU_PAGE_SHIFT_4K); + param.iova tbl-it_page_shift, + param.size tbl-it_page_shift); iommu_flush_tce(tbl); return ret; -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 00/13] powerpc: kvm: Enable in-kernel acceleration for VFIO
This enables in-kernel acceleration of TCE hypercalls (H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE). This implements acceleration for both real and virtual modes. This was made on top of both: [PATCH v1 00/16] powernv: vfio: Add Dynamic DMA windows (DDW) [PATCH v1 0/7] powerpc/iommu: kvm: Enable MultiTCE support Alexey Kardashevskiy (13): KVM: PPC: Account TCE pages in locked_vm KVM: PPC: Rework kvmppc_spapr_tce_table to support variable page size KVM: PPC: Enable IOMMU_API for KVM_BOOK3S_64 permanently KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_64 capability number KVM: PPC: Add @offset to kvmppc_spapr_tce_table KVM: PPC: Add support for 64bit TCE windows KVM: PPC: Add hugepage support for IOMMU in-kernel handling KVM: PPC: Add page_shift support for in-kernel H_PUT_TCE/etc handlers KVM: PPC: Fix kvmppc_gpa_to_hva_and_get() to return host physical address KVM: PPC: Associate IOMMU group with guest copy of TCE table KVM: PPC: vfio kvm device: support spapr tce KVM: PPC: Add support for IOMMU in-kernel handling Documentation/virtual/kvm/api.txt | 51 Documentation/virtual/kvm/devices/vfio.txt | 20 +- arch/powerpc/include/asm/kvm_host.h| 41 ++- arch/powerpc/include/asm/kvm_ppc.h | 9 +- arch/powerpc/include/uapi/asm/kvm.h| 9 + arch/powerpc/kernel/iommu.c| 6 +- arch/powerpc/kvm/Kconfig | 2 + arch/powerpc/kvm/Makefile | 3 + arch/powerpc/kvm/book3s_64_vio.c | 389 +++-- arch/powerpc/kvm/book3s_64_vio_hv.c| 177 - arch/powerpc/kvm/book3s_hv.c | 3 + arch/powerpc/kvm/powerpc.c | 25 +- include/uapi/linux/kvm.h | 12 + virt/kvm/vfio.c| 69 + 14 files changed, 775 insertions(+), 41 deletions(-) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 01/13] KVM: PPC: Account TCE pages in locked_vm
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c | 35 ++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 2137836..4ca33f1 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -73,18 +73,48 @@ static long kvmppc_stt_npages(unsigned long window_size) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } +/* + * Checks ulimit in order not to let the user space to pin all + * available memory for TCE tables. + */ +static long kvmppc_account_memlimit(long npages) +{ + unsigned long ret = 0, locked, lock_limit; + + if (!current-mm) + return -ESRCH; /* process exited */ + + down_write(current-mm-mmap_sem); + locked = current-mm-locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; + if (locked lock_limit !capable(CAP_IPC_LOCK)) { + pr_warn(RLIMIT_MEMLOCK (%ld) exceeded\n, + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + current-mm-locked_vm += npages; + } + up_write(current-mm-mmap_sem); + + return ret; +} + static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) { struct kvm *kvm = stt-kvm; int i; + long npages = kvmppc_stt_npages(stt-window_size); mutex_lock(kvm-lock); list_del(stt-list); - for (i = 0; i kvmppc_stt_npages(stt-window_size); i++) + for (i = 0; i npages; i++) __free_page(stt-pages[i]); + kfree(stt); mutex_unlock(kvm-lock); + kvmppc_account_memlimit(-(npages + 1)); + kvm_put_kvm(kvm); } @@ -140,6 +170,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, } npages = kvmppc_stt_npages(args-window_size); + ret = kvmppc_account_memlimit(npages + 1); + if (ret) + goto fail; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 02/13] KVM: PPC: Rework kvmppc_spapr_tce_table to support variable page size
At the moment the kvmppc_spapr_tce_table struct can only describe 4GB windows which is not enough for big DMA windows. This replaces window_size (in bytes, 4GB max) with page_shift (32bit) and size (64bit, in pages). Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/kvm_host.h | 3 ++- arch/powerpc/kvm/book3s_64_vio.c| 17 + arch/powerpc/kvm/book3s_64_vio_hv.c | 3 +-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index c37fee2..d3a154c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -180,7 +180,8 @@ struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; u64 liobn; - u32 window_size; + u32 page_shift; + u64 size; /* in pages */ struct page *pages[0]; }; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 4ca33f1..f2c8e4d 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -67,10 +67,9 @@ void kvmppc_spapr_tce_free(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvmppc_spapr_tce_free); -static long kvmppc_stt_npages(unsigned long window_size) +static long kvmppc_stt_npages(unsigned long size) { - return ALIGN((window_size IOMMU_PAGE_SHIFT_4K) -* sizeof(u64), PAGE_SIZE) / PAGE_SIZE; + return ALIGN(size * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } /* @@ -103,7 +102,7 @@ static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) { struct kvm *kvm = stt-kvm; int i; - long npages = kvmppc_stt_npages(stt-window_size); + long npages = kvmppc_stt_npages(stt-size); mutex_lock(kvm-lock); list_del(stt-list); @@ -123,7 +122,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct kvmppc_spapr_tce_table *stt = vma-vm_file-private_data; struct page *page; - if (vmf-pgoff = kvmppc_stt_npages(stt-window_size)) + if (vmf-pgoff = kvmppc_stt_npages(stt-size)) return VM_FAULT_SIGBUS; page = 
stt-pages[vmf-pgoff]; @@ -159,7 +158,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args) { struct kvmppc_spapr_tce_table *stt = NULL; - long npages; + long npages, size; int ret = -ENOMEM; int i; @@ -169,7 +168,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EBUSY; } - npages = kvmppc_stt_npages(args-window_size); + size = args-window_size IOMMU_PAGE_SHIFT_4K; + npages = kvmppc_stt_npages(size); ret = kvmppc_account_memlimit(npages + 1); if (ret) goto fail; @@ -180,7 +180,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; stt-liobn = args-liobn; - stt-window_size = args-window_size; + stt-page_shift = IOMMU_PAGE_SHIFT_4K; + stt-size = size; stt-kvm = kvm; for (i = 0; i npages; i++) { diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 79a39bb..fadfacb 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -73,9 +73,8 @@ long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, { unsigned long mask = (1 IOMMU_PAGE_SHIFT_4K) - 1; unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; - unsigned long size = stt-window_size IOMMU_PAGE_SHIFT_4K; - if ((ioba mask) || (size + npages = idx)) + if ((ioba mask) || (stt-size + npages = idx)) return H_PARAMETER; return H_SUCCESS; -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 04/13] KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number
This adds a capability number for in-kernel support for VFIO on SPAPR platform. The capability will tell the user space whether in-kernel handlers of H_PUT_TCE can handle VFIO-targeted requests or not. If not, the user space must not attempt allocating a TCE table in the host kernel via the KVM_CREATE_SPAPR_TCE KVM ioctl because in that case TCE requests will not be passed to the user space, which is the desired action in such a situation. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e11d8f1..3048c86 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -758,6 +758,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_VM_ATTRIBUTES 101 #define KVM_CAP_ARM_PSCI_0_2 102 #define KVM_CAP_PPC_FIXUP_HCALL 103 +#define KVM_CAP_SPAPR_TCE_VFIO 104 #ifdef KVM_CAP_IRQ_ROUTING -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 05/13] KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_64 capability number
This adds a capability number for 64-bit TCE table support. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3048c86..65c2689 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -759,6 +759,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PSCI_0_2 102 #define KVM_CAP_PPC_FIXUP_HCALL 103 #define KVM_CAP_SPAPR_TCE_VFIO 104 +#define KVM_CAP_SPAPR_TCE_64 105 #ifdef KVM_CAP_IRQ_ROUTING -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 03/13] KVM: PPC: Enable IOMMU_API for KVM_BOOK3S_64 permanently
It does not make much sense to have KVM in book3s-64 and not to have IOMMU bits for PCI pass through support as it costs little and allows VFIO to function on book3s KVM. Having IOMMU_API always enabled makes it unnecessary to have a lot of #ifdef IOMMU_API in arch/powerpc/kvm/book3s_64_vio*. With those ifdef's we could have only user space emulated devices accelerated (but not VFIO) which do not seem to be very useful. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index d7a16ac6..301fa6b 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -63,6 +63,7 @@ config KVM_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE + select SPAPR_TCE_IOMMU if IOMMU_SUPPORT ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 07/13] KVM: PPC: Add support for 64bit TCE windows
The existing KVM_CREATE_SPAPR_TCE only supports 32bit windows which is not enough for directly mapped windows as the guest can get more than 4GB. This adds KVM_CREATE_SPAPR_TCE_64 ioctl and advertises it via KVM_CAP_SPAPR_TCE_64 capability. Since 64bit windows are to support Dynamic DMA windows (DDW), let's add @bus_offset and @page_shift which are also required by DDW. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Documentation/virtual/kvm/api.txt | 51 + arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/include/uapi/asm/kvm.h | 9 +++ arch/powerpc/kvm/book3s_64_vio.c| 10 +--- arch/powerpc/kvm/powerpc.c | 25 +- include/uapi/linux/kvm.h| 2 ++ 6 files changed, 94 insertions(+), 5 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e1c72bf..b4695ea 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2520,6 +2520,57 @@ an implementation for these despite the in kernel acceleration. This capability is always enabled. +4.88 KVM_CREATE_SPAPR_TCE_64 + +Capability: KVM_CAP_SPAPR_TCE_64 +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_64 (in) +Returns: file descriptor for manipulating the created TCE table + +This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit +windows. + +This creates a virtual TCE (translation control entry) table, which +is an IOMMU for PAPR-style virtual I/O. It is used to translate +logical addresses used in virtual I/O into guest physical addresses, +and provides a scatter/gather capability for PAPR virtual I/O. + +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ + __u32 flags; +}; + + +!!! FIXME !!! + + +The liobn field gives the logical IO bus number for which to create a +TCE table. 
The window_size field specifies the size of the DMA window +which this TCE table will translate - the table will contain one 64 +bit TCE entry for every IOMMU page. The bus_offset field tells where +this window is mapped on the IO bus. The page_shift field tells the size +of the pages in this window (for example, 10, 16, 24 for 4K, 64K, 16MB +page sizes respectively). The flags field is not used at the moment +but provides the room for extensions. + +When the guest issues an H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE hcall +on a liobn for which a TCE table has been created using this ioctl(), +the kernel will handle it in real or virtual mode, updating the TCE table. +If liobn has not been registered with this ioctl, H_PUT_TCE/etc calls +will cause a vm exit and must be handled by userspace. + +The return value is a file descriptor which can be passed to mmap(2) +to map the created TCE table into userspace. This lets userspace read +the entries written by kernel-handled H_PUT_TCE calls, and also lets +userspace update the TCE table directly which is useful in some +circumstances. + + 5. 
The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index b84ed80..e0a68ef 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -128,7 +128,7 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern int kvmppc_spapr_tce_init(struct kvm_vcpu *vcpu); extern void kvmppc_spapr_tce_free(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args); + struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( struct kvm *kvm, unsigned long liobn); extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 2bc4a94..4452f6e 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -333,6 +333,15 @@ struct kvm_create_spapr_tce { __u32 window_size; }; +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ + __u32 flags; +}; + /* for KVM_ALLOCATE_RMA */ struct kvm_allocate_rma { __u64 rma_size; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index f2c8e4d..2c6ab20 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -155,20 +155,23 @@ static const struct file_operations kvm_spapr_tce_fops = { }; long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args) + struct
[PATCH v1 06/13] KVM: PPC: Add @offset to kvmppc_spapr_tce_table
This enables guest visible TCE tables to start from non-zero offset on a bus. This will be used for VFIO support. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_64_vio_hv.c | 5 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d3a154c..ed96b09 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -181,6 +181,7 @@ struct kvmppc_spapr_tce_table { struct kvm *kvm; u64 liobn; u32 page_shift; + u64 offset; /* in pages */ u64 size; /* in pages */ struct page *pages[0]; }; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index fadfacb..a3a6597 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -74,7 +74,8 @@ long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long mask = (1 IOMMU_PAGE_SHIFT_4K) - 1; unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; - if ((ioba mask) || (stt-size + npages = idx)) + if ((ioba mask) || (idx stt-offset) || + (stt-offset + stt-size + npages = idx)) return H_PARAMETER; return H_SUCCESS; @@ -146,6 +147,7 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, struct page *page; u64 *tbl; + idx -= stt-offset; page = stt-pages[idx / TCES_PER_PAGE]; tbl = kvmppc_page_address(page); @@ -351,6 +353,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, return ret; idx = ioba IOMMU_PAGE_SHIFT_4K; + idx -= stt-offset; page = stt-pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 08/13] KVM: PPC: Add hugepage support for IOMMU in-kernel handling
This adds special support for huge pages (16MB) in real mode. The reference counting cannot be easily done for such pages in real mode (when MMU is off) so this adds a hash table of huge pages. It is populated in virtual mode and get_page is called just once per huge page. Real mode handlers check if the requested page is in the hash table, then no reference counting is done, otherwise an exit to virtual mode happens. The hash table is released at KVM exit. This defines kvmppc_spapr_iommu_hugepage hash table entry and adds it to kvm_arch. This adds kvmppc_iommu_hugepages_init() and kvmppc_iommu_hugepages_cleanup() helpers. The latter puts cached pages. This fixes iommu_clear_tces_and_put_pages() not to put huge pages as this is to be done by kvmppc_iommu_hugepages_cleanup(). This implements a real mode kvmppc_rm_hugepage_gpa_to_hpa() helper to find a hash entry and a virtual mode kvmppc_iommu_hugepage_try_add() helper to add one. At the moment the fastest card available for tests uses up to 9 huge pages so walking through this hash table does not cost much. However this can change and we may want to optimize this. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v11: * moved hashtables from IOMMU to KVM 2013/07/12: * removed multiple #ifdef IOMMU_API as IOMMU_API is always enabled for KVM_BOOK3S_64 2013/06/27: * list of huge pages replaced with hashtable for better performance * spinlock removed from real mode and only protects insertion of new huge pages descriptors into the hashtable 2013/06/05: * fixed compile error when CONFIG_IOMMU_API=n 2013/05/20: * the real mode handler now searches for a huge page by gpa (used to be pte) * the virtual mode handler prints warning if it is called twice for the same huge page as the real mode handler is expected to fail just once - when a huge page is not in the list yet. 
* the huge page is refcounted twice - when added to the hugepage list and when used in the virtual mode hcall handler (can be optimized but it will make the patch less nice). --- arch/powerpc/include/asm/kvm_host.h | 34 +++ arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/kernel/iommu.c | 6 +- arch/powerpc/kvm/book3s_64_vio.c| 116 +++- arch/powerpc/kvm/book3s_64_vio_hv.c | 25 arch/powerpc/kvm/book3s_hv.c| 3 + 6 files changed, 183 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ed96b09..8a3b465 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -31,6 +31,7 @@ #include linux/list.h #include linux/atomic.h #include linux/tracepoint.h +#include linux/hashtable.h #include asm/kvm_asm.h #include asm/processor.h #include asm/page.h @@ -191,6 +192,36 @@ struct kvm_rma_info { unsigned long base_pfn; }; +/* + * The KVM guest can be backed with 16MB pages. + * In this case, we cannot do page counting from the real mode + * as the compound pages are used - they are linked in a list + * with pointers as virtual addresses which are inaccessible + * in real mode. + * + * To address the issue, here is what we do: + * + * 1) add a hashtable per KVM, each entry is kvmppc_spapr_iommu_hugepage + * and describes gpa-to-hpa mapping; + * 2) in real mode, if gpa is in the hash table, use the cached hpa; + * otherwise pass the request to virtual mode; + * 3) in virtual mode, check if gpa is in the hash table and use cached + * hpa; otherwise translate gpa to hpa and reference the page. + * + * hpa of every used hugepage will be cached in the hash table + * and referenced just once. Pages are released at KVM exit. 
+ */ +#define KVMPPC_SPAPR_HUGEPAGE_HASH(gpa)hash_32(gpa 24, 32) +#define KVMPPC_SPAPR_HUGEPAGE_BUCKETS 64 + +struct kvmppc_spapr_iommu_hugepage { + struct hlist_node hash_node; + unsigned long gpa; /* Guest physical address */ + unsigned long hpa; /* Host physical address */ + struct page *page; /* page struct of the very first subpage */ + unsigned long size; /* Huge page size (always 16MB at the moment) */ +}; + /* XICS components, defined in book3s_xics.c */ struct kvmppc_xics; struct kvmppc_icp; @@ -266,6 +297,9 @@ struct kvm_arch { #ifdef CONFIG_PPC_BOOK3S_64 struct list_head spapr_tce_tables; struct list_head rtas_tokens; + DECLARE_HASHTABLE(hugepages_hash_tab, + ilog2(KVMPPC_SPAPR_HUGEPAGE_BUCKETS)); + spinlock_t hugepages_write_lock; #endif #ifdef CONFIG_KVM_MPIC struct openpic *mpic; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e0a68ef..86f5015 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -127,6 +127,8 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern int kvmppc_spapr_tce_init(struct kvm_vcpu *vcpu); extern void
[PATCH v1 09/13] KVM: PPC: Add page_shift support for in-kernel H_PUT_TCE/etc handlers
Recently introduced KVM_CREATE_SPAPR_TCE_64 added page_shift. This makes use of it in kvmppc_tce_put(). This changes kvmppc_tce_put() to take an TCE index rather than IO address. This does not change the existing behaviour and will be utilized later by Dynamic DMA windows which support 64K and 16MB page sizes. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c| 8 arch/powerpc/kvm/book3s_64_vio_hv.c | 16 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 2648d88..8250521 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -371,7 +371,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, if (ret) return ret; - kvmppc_tce_put(stt, ioba IOMMU_PAGE_SHIFT_4K, tce); + kvmppc_tce_put(stt, ioba stt-page_shift, tce); return H_SUCCESS; } @@ -436,7 +436,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, } for (i = 0; i npages; ++i) - kvmppc_tce_put(stt, (ioba IOMMU_PAGE_SHIFT_4K) + i, + kvmppc_tce_put(stt, (ioba stt-page_shift) + i, vcpu-arch.tce_tmp_hpas[i]); unlock_exit: @@ -465,8 +465,8 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, if (ret || (tce_value (TCE_PCI_WRITE | TCE_PCI_READ))) return H_PARAMETER; - for (i = 0; i npages; ++i, ioba += IOMMU_PAGE_SIZE_4K) - kvmppc_tce_put(stt, ioba IOMMU_PAGE_SHIFT_4K, tce_value); + for (i = 0; i npages; ++i, ioba += (1 stt-page_shift)) + kvmppc_tce_put(stt, ioba stt-page_shift, tce_value); return H_SUCCESS; } diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 6c0b95d..99bac58 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -71,8 +71,8 @@ EXPORT_SYMBOL_GPL(kvmppc_find_tce_table); long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long ioba, unsigned long npages) { - unsigned long mask = (1 IOMMU_PAGE_SHIFT_4K) - 1; - unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; + 
unsigned long mask = (1 stt-page_shift) - 1; + unsigned long idx = ioba stt-page_shift; if ((ioba mask) || (idx stt-offset) || (stt-offset + stt-size + npages = idx)) @@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); */ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) { - unsigned long mask = ((1 IOMMU_PAGE_SHIFT_4K) - 1) + unsigned long mask = ((1 stt-page_shift) - 1) ~(TCE_PCI_WRITE | TCE_PCI_READ); if (tce mask) @@ -271,7 +271,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret) return ret; - idx = ioba IOMMU_PAGE_SHIFT_4K; + idx = ioba stt-page_shift; kvmppc_tce_put(stt, idx, tce); return H_SUCCESS; @@ -323,7 +323,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, } for (i = 0; i npages; ++i) - kvmppc_tce_put(stt, (ioba IOMMU_PAGE_SHIFT_4K) + i, + kvmppc_tce_put(stt, (ioba stt-page_shift) + i, vcpu-arch.tce_tmp_hpas[i]); put_page_exit: @@ -354,8 +354,8 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, if (ret || (tce_value (TCE_PCI_WRITE | TCE_PCI_READ))) return H_PARAMETER; - for (i = 0; i npages; ++i, ioba += IOMMU_PAGE_SIZE_4K) - kvmppc_tce_put(stt, ioba IOMMU_PAGE_SHIFT_4K, tce_value); + for (i = 0; i npages; ++i, ioba += (1 stt-page_shift)) + kvmppc_tce_put(stt, ioba stt-page_shift, tce_value); return H_SUCCESS; } @@ -377,7 +377,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret) return ret; - idx = ioba IOMMU_PAGE_SHIFT_4K; + idx = ioba stt-page_shift; idx -= stt-offset; page = stt-pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 10/13] KVM: PPC: Fix kvmppc_gpa_to_hva_and_get() to return host physical address
The existing support of emulated devices does not need to calculate a host physical address as the translation is performed by the userspace. The upcoming support of VFIO needs it as it stores host physical addresses in the real hardware TCE table which hardware uses during DMA transfer. This translation could be done using page struct object which is returned by kvmppc_gpa_to_hva_and_get(). However kvmppc_gpa_to_hva_and_get() does not return valid page struct for huge pages to avoid possible bugs with excessive page releases. This extends kvmppc_gpa_to_hva_and_get() to return a physical page address. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8250521..573fd6d 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -321,7 +321,7 @@ fail: * and returns ERROR_ADDR if failed. */ static void __user *kvmppc_gpa_to_hva_and_get(struct kvm_vcpu *vcpu, - unsigned long gpa, struct page **pg) + unsigned long gpa, struct page **pg, unsigned long *phpa) { unsigned long hva, gfn = gpa PAGE_SHIFT; struct kvm_memory_slot *memslot; @@ -337,6 +337,10 @@ static void __user *kvmppc_gpa_to_hva_and_get(struct kvm_vcpu *vcpu, if (get_user_pages_fast(hva PAGE_MASK, 1, is_write, pg) != 1) return ERROR_ADDR; + if (phpa) + *phpa = __pa((unsigned long) page_address(*pg)) | + (hva ~PAGE_MASK); + /* * Check if this GPA is taken care of by the hash table. 
* If this is the case, do not show the caller page struct @@ -404,7 +404,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, return ret; idx = srcu_read_lock(&vcpu->kvm->srcu); - tces = kvmppc_gpa_to_hva_and_get(vcpu, tce_list, NULL); + tces = kvmppc_gpa_to_hva_and_get(vcpu, tce_list, NULL, NULL); if (tces == ERROR_ADDR) { ret = H_TOO_HARD; goto unlock_exit; } -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 12/13] KVM: PPC: vfio kvm device: support spapr tce
In addition to the external VFIO user API, a VFIO KVM device has been introduced recently. sPAPR TCE IOMMU is para-virtualized and the guest does map/unmap via hypercalls which take a logical bus id (LIOBN) as a target IOMMU identifier. LIOBNs are made up, advertised to the guest system and linked to IOMMU groups by the user space. In order to enable acceleration for IOMMU operations in KVM, we need to tell KVM the information about LIOBN-to-group mapping. For that, a new KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN parameter is added. It accepts a pair of a VFIO group fd and LIOBN. KVM uses kvm_vfio_find_group_by_liobn() once per KVM run and caches the result in kvm_arch. iommu_group_put() for all groups is called at KVM finish in the SPAPR TCE (will be added in KVM enablement patch). Before notifying KVM about new link, this check the group for being registered with KVM device in order to release them at unexpected KVM finish. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v5: * added lock in search function * changed callback function type name v4: * fixed few bugs * changed kvm_vfio_find_group_by_liobn() to return informative errors v3: * total rework * added a release callback into kvm_vfio_find_group_by_liobn so now the user of the API can get a notification if the group is about to disappear --- Documentation/virtual/kvm/devices/vfio.txt | 20 - arch/powerpc/kvm/Kconfig | 1 + arch/powerpc/kvm/Makefile | 3 ++ include/uapi/linux/kvm.h | 8 virt/kvm/vfio.c| 69 ++ 5 files changed, 99 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt index ef51740..eaf0f5e 100644 --- a/Documentation/virtual/kvm/devices/vfio.txt +++ b/Documentation/virtual/kvm/devices/vfio.txt @@ -16,7 +16,23 @@ Groups: KVM_DEV_VFIO_GROUP attributes: KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. 
+ KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. -For each, kvm_device_attr.addr points to an int32_t file descriptor -for the VFIO group. + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN: sets a liobn for a VFIO group + kvm_device_attr.addr points to a struct: + struct kvm_vfio_spapr_tce_liobn { + __u32 argsz; + __s32 fd; + __u32 liobn; + __u64 start_addr; + }; + where + @argsz is the size of kvm_vfio_spapr_tce_liobn; + @fd is a file descriptor for a VFIO group; + @liobn is a logical bus id to be associated with the group; + @start_addr is a DMA window offset on the IO (PCI) bus. diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 301fa6b..f708e61 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -64,6 +64,7 @@ config KVM_BOOK3S_64 select KVM select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE select SPAPR_TCE_IOMMU if IOMMU_SUPPORT + select KVM_VFIO if VFIO ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. 
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index ce569b6..d55c097 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -97,6 +97,9 @@ endif kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o +kvm-book3s_64-objs-$(CONFIG_KVM_VFIO) += \ + $(KVM)/vfio.o \ + kvm-book3s_64-module-objs += \ $(KVM)/kvm_main.o \ $(KVM)/eventfd.o \ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3beb542..c1ad9b7 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -949,9 +949,17 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN 3 #define KVM_DEV_TYPE_ARM_VGIC_V2 5 #define KVM_DEV_TYPE_FLIC 6 +struct kvm_vfio_spapr_tce_liobn { + __u32 argsz; + __s32 fd; + __u32 liobn; + __u64 start_addr; +}; + /* * ioctls for VM fds */ diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index ba1a93f..43a224b 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -19,6 +19,10 @@ #include linux/uaccess.h #include linux/vfio.h +#ifdef CONFIG_SPAPR_TCE_IOMMU +#include asm/kvm_ppc.h +#endif + struct kvm_vfio_group { struct list_head node; struct vfio_group
[PATCH v1 11/13] KVM: PPC: Associate IOMMU group with guest copy of TCE table
The existing in-kernel TCE table for emulated devices contains guest physical addresses which are accesses by emulated devices. Since we need to keep this information for VFIO devices too in order to implement H_GET_TCE, we are reusing it. This adds iommu_group* and iommu_table* pointers to kvmppc_spapr_tce_table. This adds kvm_spapr_tce_attach_iommu_group() helper to initialize the pointers. This puts the group when guest copy of TCE table is destroyed which happens when TCE table fd is closed. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/include/asm/kvm_ppc.h | 5 + arch/powerpc/kvm/book3s_64_vio.c| 28 3 files changed, 35 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8a3b465..8d8eee9 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -184,6 +184,8 @@ struct kvmppc_spapr_tce_table { u32 page_shift; u64 offset; /* in pages */ u64 size; /* in pages */ + struct iommu_table *tbl; + struct iommu_group *refgrp;/* reference counting only */ struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 86f5015..92be7f5 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -129,6 +129,11 @@ extern int kvmppc_spapr_tce_init(struct kvm_vcpu *vcpu); extern void kvmppc_spapr_tce_free(struct kvm_vcpu *vcpu); extern void kvmppc_iommu_hugepages_init(struct kvm_arch *ka); extern void kvmppc_iommu_hugepages_cleanup(struct kvm_arch *ka); +struct iommu_group; +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, + unsigned long liobn, + phys_addr_t start_addr, + struct iommu_group *grp); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
b/arch/powerpc/kvm/book3s_64_vio.c index 573fd6d..b7de38e 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -28,6 +28,7 @@ #include linux/hugetlb.h #include linux/list.h #include linux/anon_inodes.h +#include linux/iommu.h #include asm/tlbflush.h #include asm/kvm_ppc.h @@ -205,6 +206,10 @@ static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) mutex_lock(kvm-lock); list_del(stt-list); + + if (stt-refgrp) + iommu_group_put(stt-refgrp); + for (i = 0; i npages; i++) __free_page(stt-pages[i]); @@ -253,6 +258,29 @@ static const struct file_operations kvm_spapr_tce_fops = { .release= kvm_spapr_tce_release, }; +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, + unsigned long liobn, + phys_addr_t start_addr, + struct iommu_group *grp) +{ + struct kvmppc_spapr_tce_table *stt = NULL; + + /* Check this LIOBN hasn't been previously allocated */ + list_for_each_entry(stt, kvm-arch.spapr_tce_tables, list) { + if (stt-liobn == liobn) { + struct spapr_tce_iommu_group *data; + + data = iommu_group_get_iommudata(grp); + BUG_ON(!data); + stt-tbl = data-ops-get_table(data, start_addr); + stt-refgrp = grp; + return 0; + } + } + + return -ENODEV; +} + long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args) { -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 13/13] KVM: PPC: Add support for IOMMU in-kernel handling
This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE requests targeted at an IOMMU TCE table without passing them to user space which saves time on switching to user space and back. Both real and virtual modes are supported. The kernel tries to handle a TCE request in the real mode, if fails it passes the request to the virtual mode to complete the operation. If the virtual mode handler fails, the request is passed to user space. The first user of this is VFIO on POWER. Trampolines to the VFIO external user API functions are required for this patch. This adds a SPAPR TCE IOMMU KVM device to associate a logical bus number (LIOBN) with a VFIO IOMMU group fd and enable in-kernel handling of map/unmap requests. The device supports a single attribute which is a struct with LIOBN and IOMMU fd. When the attribute is set, the device establishes the connection between KVM and VFIO. Tests show that this patch increases transmission speed from 220MB/s to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card). 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v12: * reworked for the latest VFIO KVM device v11: * removed VFIO_IOMMU capability * fixed comments from Gleb * added @type to kvmppc_spapr_tce_table struct and split it into 2 parts (emulated, iommu) v10: * all IOMMU TCE links are handled by one KVM device now * KVM device has its own list of TCE descriptors * the search-by-liobn function was extended to search through emulated and IOMMU lists v9: * KVM_CAP_SPAPR_TCE_IOMMU ioctl to KVM replaced with SPAPR TCE IOMMU KVM device * release_spapr_tce_table() is not shared between different TCE types * reduced the patch size by moving KVM device bits and VFIO external API trampolines to separate patches * moved documentation from Documentation/virtual/kvm/api.txt to Documentation/virtual/kvm/devices/spapr_tce_iommu.txt v8: * fixed warnings from check_patch.pl 2013/07/11: * removed multiple #ifdef IOMMU_API as IOMMU_API is always enabled for KVM_BOOK3S_64 * kvmppc_gpa_to_hva_and_get also returns host phys address. Not much sense for this here but the next patch for hugepages support will use it more. 2013/07/06: * added realmode arch_spin_lock to protect TCE table from races in real and virtual modes * POWERPC IOMMU API is changed to support real mode * iommu_take_ownership and iommu_release_ownership are protected by iommu_table's locks * VFIO external user API use rewritten * multiple small fixes 2013/06/27: * tce_list page is referenced now in order to protect it from accident invalidation during H_PUT_TCE_INDIRECT execution * added use of the external user VFIO API 2013/06/05: * changed capability number * changed ioctl number * update the doc article number 2013/05/20: * removed get_user() from real mode handlers * kvm_vcpu_arch::tce_tmp usage extended. 
Now real mode handler puts there translated TCEs, tries realmode_get_page() on those and if it fails, it passes control over the virtual mode handler which tries to finish the request handling * kvmppc_lookup_pte() now does realmode_get_page() protected by BUSY bit on a page * The only reason to pass the request to user mode now is when the user mode did not register TCE table in the kernel, in all other cases the virtual mode handler is expected to do the job Conflicts: arch/powerpc/include/asm/kvm_host.h arch/powerpc/kvm/book3s_64_vio.c --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_64_vio.c| 177 ++-- arch/powerpc/kvm/book3s_64_vio_hv.c | 130 ++ 3 files changed, 298 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8d8eee9..6056114 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -726,6 +726,7 @@ struct kvm_vcpu_arch { */ } tce_rm_fail; /* failed stage of request processing */ struct page *tce_rm_list_pg;/* unreferenced page from realmode */ + unsigned long tce_tmp_num; /* valid entries number */ #endif #if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) || \ defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index b7de38e..90e7ad1 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -21,7 +21,6 @@ #include linux/string.h #include linux/kvm.h #include linux/kvm_host.h - #include linux/highmem.h #include linux/gfp.h #include linux/slab.h @@ -29,6 +28,8 @@ #include linux/list.h #include linux/anon_inodes.h #include linux/iommu.h +#include linux/module.h +#include linux/file.h #include asm/tlbflush.h #include asm/kvm_ppc.h @@ -347,6 +348,8 @@ fail: * * If pg!=NULL, tries to increase page counter via get_user_pages_fast() * and returns ERROR_ADDR if failed. + * + * if pg!=NULLphpa!=NULL,
Re: [PATCH v1 01/13] KVM: PPC: Account TCE pages in locked_vm
On 07/15/2014 07:25 PM, Alexey Kardashevskiy wrote: Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Just realized this should go to powernv: vfio: Add Dynamic DMA windows (DDW). And neither patchset accounts DDW in locked_vm, need to decide how... --- arch/powerpc/kvm/book3s_64_vio.c | 35 ++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 2137836..4ca33f1 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -73,18 +73,48 @@ static long kvmppc_stt_npages(unsigned long window_size) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } +/* + * Checks ulimit in order not to let the user space to pin all + * available memory for TCE tables. + */ +static long kvmppc_account_memlimit(long npages) +{ + unsigned long ret = 0, locked, lock_limit; + + if (!current-mm) + return -ESRCH; /* process exited */ + + down_write(current-mm-mmap_sem); + locked = current-mm-locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; + if (locked lock_limit !capable(CAP_IPC_LOCK)) { + pr_warn(RLIMIT_MEMLOCK (%ld) exceeded\n, + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + current-mm-locked_vm += npages; + } + up_write(current-mm-mmap_sem); + + return ret; +} + static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) { struct kvm *kvm = stt-kvm; int i; + long npages = kvmppc_stt_npages(stt-window_size); mutex_lock(kvm-lock); list_del(stt-list); - for (i = 0; i kvmppc_stt_npages(stt-window_size); i++) + for (i = 0; i npages; i++) __free_page(stt-pages[i]); + kfree(stt); mutex_unlock(kvm-lock); + kvmppc_account_memlimit(-(npages + 1)); + kvm_put_kvm(kvm); } @@ -140,6 +170,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, } npages = kvmppc_stt_npages(args-window_size); + ret = kvmppc_account_memlimit(npages + 1); + if (ret) + goto fail; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); -- Alexey ___ Linuxppc-dev 
mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 00/12] vfio: pci: Enable DDW and in-kernel acceleration
This makes use of kernel patchsets: [PATCH v1 00/16] powernv: vfio: Add Dynamic DMA windows (DDW) [PATCH v1 0/7] powerpc/iommu: kvm: Enable MultiTCE support [PATCH v1 00/13] powerpc: kvm: Enable in-kernel acceleration for VFIO I am posting it for reference here, reviews are still welcome but not required :) Alexey Kardashevskiy (12): spapr_iommu: Disable in-kernel IOMMU tables for 4GB windows spapr_pci: Make find_phb()/find_dev() public spapr_iommu: Make spapr_tce_find_by_liobn() public linux headers update for DDW spapr_rtas: Add Dynamic DMA windows (DDW) RTAS calls support spapr: Add ddw machine option spapr_pci: Enable DDW spapr_pci_vfio: Enable DDW vfio: Enable DDW ioctls to VFIO IOMMU driver headers: update for KVM_CAP_SPAPR_TCE_64 and VFIO KVM device target-ppc: kvm: make use of KVM_CREATE_SPAPR_TCE_64 vfio: Enable in-kernel acceleration via VFIO KVM device hw/misc/vfio.c| 45 ++ hw/ppc/Makefile.objs | 3 + hw/ppc/spapr.c| 15 ++ hw/ppc/spapr_iommu.c | 6 +- hw/ppc/spapr_pci.c| 84 +-- hw/ppc/spapr_pci_vfio.c | 95 hw/ppc/spapr_rtas_ddw.c | 296 ++ include/hw/misc/vfio.h| 5 + include/hw/pci-host/spapr.h | 25 include/hw/ppc/spapr.h| 8 +- linux-headers/asm-mips/kvm_para.h | 6 +- linux-headers/asm-powerpc/kvm.h | 9 ++ linux-headers/linux/kvm.h | 12 ++ linux-headers/linux/kvm_para.h| 3 + linux-headers/linux/vfio.h| 37 - target-ppc/kvm.c | 47 -- target-ppc/kvm_ppc.h | 10 +- trace-events | 4 + vl.c | 4 + 19 files changed, 683 insertions(+), 31 deletions(-) create mode 100644 hw/ppc/spapr_rtas_ddw.c -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 05/12] spapr_rtas: Add Dynamic DMA windows (DDW) RTAS calls support
spapr_pci_vfio: Support dynamic DMA window This adds support for Dynamic DMA Windows (DDW) option defined by the SPAPR specification which allows to have additional DMA windows besides the default and small one which can only handle 4K pages and which should completely fit into first 32bit of PCI address space which makes it a less than perfect solution for high-speed PCI devices. The existing implementation of DDW in the guest tries to create one huge DMA window with 64K or 16MB pages and map the entire guest RAM to it. If this operation succeeds, the guest switches to dma_direct_ops and never calls TCE hypercalls (H_PUT_TCE,...). This enables VFIO devices to use the entire RAM and not spend time on mapping/unmapping. This adds 4 RTAS handlers: * ibm,query-pe-dma-window * ibm,create-pe-dma-window * ibm,remove-pe-dma-window * ibm,reset-pe-dma-window These are registered from qapi_init callback. This adds @ddw_supported property to sPAPRPHBState to enable DDW feature. This adds @ddw_reset_supported property to sPAPRPHBState to enable DDW reset extension (TODO: debug). This bumps migration descriptor version as there are 2 new properties. This adds a notifier for VFIO to provide path for calling DDW-related ioctls via VFIO container fd. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Reset is not implemented yet and it is questionable if we really want it. 
--- hw/ppc/Makefile.objs| 3 + hw/ppc/spapr_rtas_ddw.c | 296 include/hw/pci-host/spapr.h | 18 +++ include/hw/ppc/spapr.h | 7 +- trace-events| 4 + 5 files changed, 326 insertions(+), 2 deletions(-) create mode 100644 hw/ppc/spapr_rtas_ddw.c diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index edd44d0..9773294 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -7,6 +7,9 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy) obj-y += spapr_pci_vfio.o endif +ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES), yy) +obj-y += spapr_rtas_ddw.o +endif # PowerPC 4xx boards obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o obj-y += ppc4xx_pci.o diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c new file mode 100644 index 000..943af2c --- /dev/null +++ b/hw/ppc/spapr_rtas_ddw.c @@ -0,0 +1,296 @@ +/* + * QEMU sPAPR Dynamic DMA windows support + * + * Copyright (c) 2014 Alexey Kardashevskiy, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see http://www.gnu.org/licenses/. 
+ */ + +#include hw/ppc/spapr.h +#include hw/pci-host/spapr.h +#include trace.h + +static inline uint32_t spapr_iommu_fixmask(uint32_t cur_mask, + struct ppc_one_seg_page_size *sps, + uint32_t query_mask, + int shift, + uint32_t add_mask) +{ +if ((sps-page_shift == shift) (query_mask add_mask)) { +cur_mask |= add_mask; +} +return cur_mask; +} + +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu, + sPAPREnvironment *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ +CPUPPCState *env = cpu-env; +sPAPRPHBState *sphb; +sPAPRPHBClass *spc; +uint64_t buid; +uint32_t addr, pgmask = 0; +uint32_t windows_available = 0, page_size_mask = 0; +long ret, i; + +if ((nargs != 3) || (nret != 5)) { +goto param_error_exit; +} + +buid = ((uint64_t)rtas_ld(args, 1) 32) | rtas_ld(args, 2); +addr = rtas_ld(args, 0); +sphb = spapr_pci_find_phb(spapr, buid); +if (!sphb) { +goto param_error_exit; +} + +spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb); +if (!spc-ddw_query) { +goto hw_error_exit; +} + +ret = spc-ddw_query(sphb, windows_available, page_size_mask); +trace_spapr_iommu_ddw_query(buid, addr, windows_available, +page_size_mask, pgmask, ret); +if (ret) { +goto hw_error_exit; +} + +/* DBG! */ +if (!(page_size_mask DDW_PGSIZE_16M)) { +goto hw_error_exit; +} + +/* Work out
[PATCH QEMU 02/12] spapr_pci: Make find_phb()/find_dev() public
This makes find_phb()/find_dev() public and changed its names to spapr_pci_find_phb()/spapr_pci_find_dev() as they are going to be used from other parts of QEMU such as VFIO DDW (dynamic DMA window) or VFIO PCI error injection or VFIO EEH handling - in all these cases there are RTAS calls which are addressed to BUID+config_addr in IEEE1275 format. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_pci.c | 22 +++--- include/hw/pci-host/spapr.h | 4 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 9ed39a9..230b59c 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -47,7 +47,7 @@ #define RTAS_TYPE_MSI 1 #define RTAS_TYPE_MSIX 2 -static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid) +sPAPRPHBState *spapr_pci_find_phb(sPAPREnvironment *spapr, uint64_t buid) { sPAPRPHBState *sphb; @@ -61,10 +61,10 @@ static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid) return NULL; } -static PCIDevice *find_dev(sPAPREnvironment *spapr, uint64_t buid, - uint32_t config_addr) +PCIDevice *spapr_pci_find_dev(sPAPREnvironment *spapr, uint64_t buid, + uint32_t config_addr) { -sPAPRPHBState *sphb = find_phb(spapr, buid); +sPAPRPHBState *sphb = spapr_pci_find_phb(spapr, buid); PCIHostState *phb = PCI_HOST_BRIDGE(sphb); int bus_num = (config_addr 16) 0xFF; int devfn = (config_addr 8) 0xFF; @@ -95,7 +95,7 @@ static void finish_read_pci_config(sPAPREnvironment *spapr, uint64_t buid, return; } -pci_dev = find_dev(spapr, buid, addr); +pci_dev = spapr_pci_find_dev(spapr, buid, addr); addr = rtas_pci_cfgaddr(addr); if (!pci_dev || (addr % size) || (addr = pci_config_size(pci_dev))) { @@ -162,7 +162,7 @@ static void finish_write_pci_config(sPAPREnvironment *spapr, uint64_t buid, return; } -pci_dev = find_dev(spapr, buid, addr); +pci_dev = spapr_pci_find_dev(spapr, buid, addr); addr = rtas_pci_cfgaddr(addr); if (!pci_dev || (addr % size) || (addr = pci_config_size(pci_dev))) { @@ 
-281,9 +281,9 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr, } /* Fins sPAPRPHBState */ -phb = find_phb(spapr, buid); +phb = spapr_pci_find_phb(spapr, buid); if (phb) { -pdev = find_dev(spapr, buid, config_addr); +pdev = spapr_pci_find_dev(spapr, buid, config_addr); } if (!phb || !pdev) { rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); @@ -377,9 +377,9 @@ static void rtas_ibm_query_interrupt_source_number(PowerPCCPU *cpu, spapr_pci_msi *msi; /* Find sPAPRPHBState */ -phb = find_phb(spapr, buid); +phb = spapr_pci_find_phb(spapr, buid); if (phb) { -pdev = find_dev(spapr, buid, config_addr); +pdev = spapr_pci_find_dev(spapr, buid, config_addr); } if (!phb || !pdev) { rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); @@ -553,7 +553,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) return; } -if (find_phb(spapr, sphb-buid)) { +if (spapr_pci_find_phb(spapr, sphb-buid)) { error_setg(errp, PCI host bridges must have unique BUIDs); return; } diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 32f0aa7..14c2ab0 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -122,4 +122,8 @@ void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr); void spapr_pci_rtas_init(void); +sPAPRPHBState *spapr_pci_find_phb(sPAPREnvironment *spapr, uint64_t buid); +PCIDevice *spapr_pci_find_dev(sPAPREnvironment *spapr, uint64_t buid, + uint32_t config_addr); + #endif /* __HW_SPAPR_PCI_H__ */ -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 04/12] linux headers update for DDW
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- linux-headers/linux/vfio.h | 37 - 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 26c218e..f0aa97d 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -448,13 +448,48 @@ struct vfio_iommu_type1_dma_unmap { */ struct vfio_iommu_spapr_tce_info { __u32 argsz; - __u32 flags;/* reserved for future use */ + __u32 flags; +#define VFIO_IOMMU_SPAPR_TCE_FLAG_DDW 1 /* Support dynamic windows */ __u32 dma32_window_start; /* 32 bit window start (bytes) */ __u32 dma32_window_size;/* 32 bit window size (bytes) */ }; #define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) +/* + * Dynamic DMA windows + */ +struct vfio_iommu_spapr_tce_query { + __u32 argsz; + /* out */ + __u32 windows_available; + __u32 page_size_mask; +}; +#define VFIO_IOMMU_SPAPR_TCE_QUERY _IO(VFIO_TYPE, VFIO_BASE + 17) + +struct vfio_iommu_spapr_tce_create { + __u32 argsz; + /* in */ + __u32 page_shift; + __u32 window_shift; + /* out */ + __u64 start_addr; + +}; +#define VFIO_IOMMU_SPAPR_TCE_CREATE_IO(VFIO_TYPE, VFIO_BASE + 18) + +struct vfio_iommu_spapr_tce_remove { + __u32 argsz; + /* in */ + __u64 start_addr; +}; +#define VFIO_IOMMU_SPAPR_TCE_REMOVE_IO(VFIO_TYPE, VFIO_BASE + 19) + +struct vfio_iommu_spapr_tce_reset { + __u32 argsz; +}; +#define VFIO_IOMMU_SPAPR_TCE_RESET _IO(VFIO_TYPE, VFIO_BASE + 20) + /* * */ #endif /* VFIO_H */ -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 03/12] spapr_iommu: Make spapr_tce_find_by_liobn() public
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_iommu.c | 2 +- include/hw/ppc/spapr.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 36f5d27..588d442 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -40,7 +40,7 @@ enum sPAPRTCEAccess { static QLIST_HEAD(spapr_tce_tables, sPAPRTCETable) spapr_tce_tables; -static sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn) +sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn) { sPAPRTCETable *tcet; diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index bbba51a..9c5686e 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -467,6 +467,7 @@ struct sPAPRTCETable { QLIST_ENTRY(sPAPRTCETable) list; }; +sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn); void spapr_events_init(sPAPREnvironment *spapr); void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq); int spapr_h_cas_compose_response(target_ulong addr, target_ulong size); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 06/12] spapr: Add ddw machine option
This option will enable Dynamic DMA windows (DDW) support for pseries machine. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr.c | 15 +++ vl.c | 4 2 files changed, 19 insertions(+) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index d01978f..fec295b 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -100,6 +100,7 @@ struct sPAPRMachineState { /* public */ char *kvm_type; +bool ddw_supported; }; sPAPREnvironment *spapr; @@ -1570,10 +1571,24 @@ static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp) sm-kvm_type = g_strdup(value); } +static bool spapr_machine_get_ddw(Object *obj, Error **errp) +{ +sPAPRMachineState *sms = SPAPR_MACHINE(obj); +return sms-ddw_supported; +} + +static void spapr_machine_set_ddw(Object *obj, bool value, Error **errp) +{ +sPAPRMachineState *sms = SPAPR_MACHINE(obj); +sms-ddw_supported = value; +} + static void spapr_machine_initfn(Object *obj) { object_property_add_str(obj, kvm-type, spapr_get_kvm_type, spapr_set_kvm_type, NULL); +object_property_add_bool(obj, ddw, spapr_machine_get_ddw, + spapr_machine_set_ddw, NULL); } static void spapr_machine_class_init(ObjectClass *oc, void *data) diff --git a/vl.c b/vl.c index 6e084c2..a615fb1 100644 --- a/vl.c +++ b/vl.c @@ -383,6 +383,10 @@ static QemuOptsList qemu_machine_opts = { .name = kvm-type, .type = QEMU_OPT_STRING, .help = Specifies the KVM virtualization mode (HV, PR), +}, { +.name = ddw, +.type = QEMU_OPT_BOOL, +.help = Enable Dynamic DMA windows support (pseries only), },{ .name = PC_MACHINE_MAX_RAM_BELOW_4G, .type = QEMU_OPT_SIZE, -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 01/12] spapr_iommu: Disable in-kernel IOMMU tables for 4GB windows
The existing KVM_CREATE_SPAPR_TCE ioctl only supports 4G windows max. We are going to add huge DMA windows support so this will create small window and unexpectedly fail later. This disables KVM_CREATE_SPAPR_TCE for windows bigger than 4GB. Since those windows are normally mapped at the boot time, there will be no performance impact. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index f6e32a4..36f5d27 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -113,11 +113,11 @@ static MemoryRegionIOMMUOps spapr_iommu_ops = { static int spapr_tce_table_realize(DeviceState *dev) { sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev); +uint64_t window_size = tcet-nb_table tcet-page_shift; -if (kvm_enabled()) { +if (kvm_enabled() !(window_size 32)) { tcet-table = kvmppc_create_spapr_tce(tcet-liobn, - tcet-nb_table - tcet-page_shift, + window_size, tcet-fd, tcet-vfio_accel); } -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 08/12] spapr_pci_vfio: Enable DDW
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_pci_vfio.c | 73 + 1 file changed, 73 insertions(+) diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index d3bddf2..b72aff0 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -71,6 +71,75 @@ static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp) spapr_tce_get_iommu(tcet)); } +static int spapr_pci_vfio_ddw_query(sPAPRPHBState *sphb, +uint32_t *windows_available, +uint32_t *page_size_mask) +{ +sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb); +struct vfio_iommu_spapr_tce_query query = { .argsz = sizeof(query) }; +int ret; + +ret = vfio_container_ioctl(sphb-iommu_as, svphb-iommugroupid, + VFIO_IOMMU_SPAPR_TCE_QUERY, query); +if (ret) { +return ret; +} + +*windows_available = query.windows_available; +*page_size_mask = query.page_size_mask; + +return ret; +} + +static int spapr_pci_vfio_ddw_create(sPAPRPHBState *sphb, uint32_t page_shift, + uint32_t window_shift, uint32_t liobn, + sPAPRTCETable **ptcet) +{ +sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb); +struct vfio_iommu_spapr_tce_create create = { +.argsz = sizeof(create), +.page_shift = page_shift, +.window_shift = window_shift, +.start_addr = 0 +}; +int ret; + +ret = vfio_container_ioctl(sphb-iommu_as, svphb-iommugroupid, + VFIO_IOMMU_SPAPR_TCE_CREATE, create); +if (ret) { +return ret; +} + +*ptcet = spapr_tce_new_table(DEVICE(sphb), liobn, create.start_addr, + page_shift, 1 (window_shift - page_shift), + true); +memory_region_add_subregion(sphb-iommu_root, (*ptcet)-bus_offset, +spapr_tce_get_iommu(*ptcet)); + +return ret; +} + +static int spapr_pci_vfio_ddw_remove(sPAPRPHBState *sphb, sPAPRTCETable *tcet) +{ +sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb); +struct vfio_iommu_spapr_tce_remove remove = { +.argsz = sizeof(remove), +.start_addr = tcet-bus_offset +}; + +return vfio_container_ioctl(sphb-iommu_as, svphb-iommugroupid, +VFIO_IOMMU_SPAPR_TCE_REMOVE, 
remove); +} + +static int spapr_pci_vfio_ddw_reset(sPAPRPHBState *sphb) +{ +sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb); +struct vfio_iommu_spapr_tce_reset reset = { .argsz = sizeof(reset) }; + +return vfio_container_ioctl(sphb-iommu_as, svphb-iommugroupid, +VFIO_IOMMU_SPAPR_TCE_RESET, reset); +} + static void spapr_phb_vfio_reset(DeviceState *qdev) { /* Do nothing */ @@ -84,6 +153,10 @@ static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data) dc-props = spapr_phb_vfio_properties; dc-reset = spapr_phb_vfio_reset; spc-finish_realize = spapr_phb_vfio_finish_realize; +spc-ddw_query = spapr_pci_vfio_ddw_query; +spc-ddw_create = spapr_pci_vfio_ddw_create; +spc-ddw_remove = spapr_pci_vfio_ddw_remove; +spc-ddw_reset = spapr_pci_vfio_ddw_reset; } static const TypeInfo spapr_phb_vfio_info = { -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 07/12] spapr_pci: Enable DDW
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_pci.c | 62 + include/hw/pci-host/spapr.h | 3 +++ 2 files changed, 65 insertions(+) diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 230b59c..038a485 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -22,6 +22,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include sysemu/sysemu.h #include hw/hw.h #include hw/pci/pci.h #include hw/pci/msi.h @@ -781,6 +782,42 @@ static const char *spapr_phb_root_bus_path(PCIHostState *host_bridge, return sphb-dtbusname; } +static int spapr_pci_ddw_query(sPAPRPHBState *sphb, + uint32_t *windows_available, + uint32_t *page_size_mask) +{ +*windows_available = 1; +*page_size_mask = DDW_PGSIZE_16M; + +return 0; +} + +static int spapr_pci_ddw_create(sPAPRPHBState *sphb, uint32_t page_shift, +uint32_t window_shift, uint32_t liobn, +sPAPRTCETable **ptcet) +{ +*ptcet = spapr_tce_new_table(DEVICE(sphb), liobn, SPAPR_PCI_TCE64_START, + page_shift, 1 (window_shift - page_shift), + true); +if (!*ptcet) { +return -1; +} +memory_region_add_subregion(sphb-iommu_root, (*ptcet)-bus_offset, +spapr_tce_get_iommu(*ptcet)); + +return 0; +} + +static int spapr_pci_ddw_remove(sPAPRPHBState *sphb, sPAPRTCETable *tcet) +{ +return 0; +} + +static int spapr_pci_ddw_reset(sPAPRPHBState *sphb) +{ +return 0; +} + static void spapr_phb_class_init(ObjectClass *klass, void *data) { PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass); @@ -795,6 +832,10 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data) set_bit(DEVICE_CATEGORY_BRIDGE, dc-categories); dc-cannot_instantiate_with_device_add_yet = false; spc-finish_realize = spapr_phb_finish_realize; +spc-ddw_query = spapr_pci_ddw_query; +spc-ddw_create = spapr_pci_ddw_create; +spc-ddw_remove = spapr_pci_ddw_remove; +spc-ddw_reset = spapr_pci_ddw_reset; } static const TypeInfo spapr_phb_info = { @@ -878,6 +919,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, 
uint32_t interrupt_map_mask[] = { cpu_to_be32(b_d(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)}; uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7]; +uint32_t ddw_applicable[] = { +RTAS_IBM_QUERY_PE_DMA_WINDOW, +RTAS_IBM_CREATE_PE_DMA_WINDOW, +RTAS_IBM_REMOVE_PE_DMA_WINDOW +}; +uint32_t ddw_extensions[] = { 1, RTAS_IBM_RESET_PE_DMA_WINDOW }; +sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(phb); +QemuOpts *machine_opts = qemu_get_machine_opts(); /* Start populating the FDT */ sprintf(nodename, pci@% PRIx64, phb-buid); @@ -907,6 +956,19 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop_cell(fdt, bus_off, ibm,pci-config-space-type, 0x1)); _FDT(fdt_setprop_cell(fdt, bus_off, ibm,pe-total-#msi, XICS_IRQS)); +/* Dynamic DMA window */ +if (qemu_opt_get_bool(machine_opts, ddw, true) +spc-ddw_query spc-ddw_create spc-ddw_remove) { +_FDT(fdt_setprop(fdt, bus_off, ibm,ddw-applicable, ddw_applicable, + sizeof(ddw_applicable))); + +if (spc-ddw_reset) { +/* When enabled, the guest will remove the default 32bit window */ +_FDT(fdt_setprop(fdt, bus_off, ibm,ddw-extensions, + ddw_extensions, sizeof(ddw_extensions))); +} +} + /* Build the interrupt-map, this must matches what is done * in pci_spapr_map_irq */ diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 119d326..f494cbb 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -125,6 +125,9 @@ struct sPAPRPHBVFIOState { #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x8000ULL +/* Default 64bit dynamic window offset */ +#define SPAPR_PCI_TCE64_START0x8000ULL + static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb, int pin) { return xics_get_qirq(spapr-icp, phb-lsi_table[pin].irq); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 11/12] target-ppc: kvm: make use of KVM_CREATE_SPAPR_TCE_64
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_iommu.c | 7 --- target-ppc/kvm.c | 47 --- target-ppc/kvm_ppc.h | 10 +++--- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 588d442..1710595 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -113,11 +113,12 @@ static MemoryRegionIOMMUOps spapr_iommu_ops = { static int spapr_tce_table_realize(DeviceState *dev) { sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev); -uint64_t window_size = tcet-nb_table tcet-page_shift; -if (kvm_enabled() !(window_size 32)) { +if (kvm_enabled()) { tcet-table = kvmppc_create_spapr_tce(tcet-liobn, - window_size, + tcet-nb_table, + tcet-bus_offset, + tcet-page_shift, tcet-fd, tcet-vfio_accel); } diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 42718f7..cfc2599 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -62,6 +62,7 @@ static int cap_booke_sregs; static int cap_ppc_smt; static int cap_ppc_rma; static int cap_spapr_tce; +static int cap_spapr_tce_64; static int cap_spapr_multitce; static int cap_spapr_vfio; static int cap_hior; @@ -101,6 +102,7 @@ int kvm_arch_init(KVMState *s) cap_ppc_smt = kvm_check_extension(s, KVM_CAP_PPC_SMT); cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA); cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE); +cap_spapr_tce_64 = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_64); cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE); cap_spapr_vfio = false; cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG); @@ -1655,13 +1657,10 @@ bool kvmppc_spapr_use_multitce(void) return cap_spapr_multitce; } -void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd, - bool vfio_accel) +void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_shift, + uint64_t bus_offset, uint32_t page_shift, + int *pfd, bool vfio_accel) { -struct kvm_create_spapr_tce args = { -.liobn = liobn, -.window_size = window_size, -}; long len; int fd; 
void *table; @@ -1674,14 +1673,40 @@ void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd, return NULL; } -fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE, args); -if (fd 0) { -fprintf(stderr, KVM: Failed to create TCE table for liobn 0x%x\n, -liobn); +if (cap_spapr_tce_64) { +struct kvm_create_spapr_tce_64 args = { +.liobn = liobn, +.page_shift = page_shift, +.offset = bus_offset page_shift, +.size = window_shift, +.flags = 0 +}; +fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE_64, args); +if (fd 0) { +fprintf(stderr, +KVM: Failed to create TCE64 table for liobn 0x%x\n, +liobn); +return NULL; +} +} else if (cap_spapr_tce) { +struct kvm_create_spapr_tce args = { +.liobn = liobn, +.window_size = window_shift page_shift, +}; +if (((window_shift page_shift) != args.window_size) || bus_offset) { +return NULL; +} +fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE, args); +if (fd 0) { +fprintf(stderr, KVM: Failed to create TCE table for liobn 0x%x\n, +liobn); +return NULL; +} +} else { return NULL; } -len = (window_size / SPAPR_TCE_PAGE_SIZE) * sizeof(uint64_t); +len = window_shift * sizeof(uint64_t); /* FIXME: round this up to page size */ table = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h index d9516e7..154f434 100644 --- a/target-ppc/kvm_ppc.h +++ b/target-ppc/kvm_ppc.h @@ -33,8 +33,9 @@ int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu); #ifndef CONFIG_USER_ONLY off_t kvmppc_alloc_rma(void **rma); bool kvmppc_spapr_use_multitce(void); -void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd, - bool vfio_accel); +void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_shift, + uint64_t bus_offset, uint32_t page_shift, + int *pfd, bool vfio_accel); int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size); int kvmppc_reset_htab(int shift_hint); uint64_t kvmppc_rma_size(uint64_t
[PATCH QEMU 12/12] vfio: Enable in-kernel acceleration via VFIO KVM device
TCE hypercalls (H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE) use a logical bus number (LIOBN) to identify which TCE table the request is addressed to. However VFIO kernel driver operates with IOMMU group IDs and has no idea about which LIOBN corresponds to which group. If the host kernel supports in-kernel acceleration for TCE calls, we have to provide the LIOBN to IOMMU mapping information. This makes use of a VFIO KVM device's KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN attribute to set the link between LIOBN and IOMMU group. The vfio_container_spapr_set_liobn() helper is implemented completely in vfio.c because kvm_vfio_spapr_tce_liobn needs a group fd and we do not want to share resources likes that outside vfio.c. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/misc/vfio.c | 41 + hw/ppc/spapr_iommu.c| 1 + hw/ppc/spapr_pci_vfio.c | 22 ++ include/hw/misc/vfio.h | 5 + 4 files changed, 69 insertions(+) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index e7b4d6e..6e9919a 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -4450,3 +4450,44 @@ int vfio_container_ioctl(AddressSpace *as, int32_t groupid, return vfio_container_do_ioctl(as, groupid, req, param); } + +int vfio_container_spapr_set_liobn(AddressSpace *as, + int32_t groupid, + uint64_t liobn, + uint64_t start_addr) +{ +#ifdef CONFIG_KVM +VFIOGroup *group; +int ret; +struct kvm_vfio_spapr_tce_liobn param = { +.argsz = sizeof(param), +.liobn = liobn, +.start_addr = start_addr +}; +struct kvm_device_attr attr = { +.group = KVM_DEV_VFIO_GROUP, +.attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN, +.addr = (uint64_t)(unsigned long)param, +}; + +if (vfio_kvm_device_fd 0) { +return 0; +} + +group = vfio_get_group(groupid, as); +if (!group) { +return -1; +} + +param.fd = group-fd; +ret = ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, attr); +if (ret) { +error_report(vfio: failed to setup liobn for a group: %s, + strerror(errno)); +} + +return ret; +#else +return 0; +#endif +} diff --git a/hw/ppc/spapr_iommu.c 
b/hw/ppc/spapr_iommu.c index 1710595..3c2a9c9 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -126,6 +126,7 @@ static int spapr_tce_table_realize(DeviceState *dev) if (!tcet-table) { size_t table_size = tcet-nb_table * sizeof(uint64_t); tcet-table = g_malloc0(table_size); +tcet-vfio_accel = false; } trace_spapr_iommu_new_table(tcet-liobn, tcet, tcet-table, tcet-fd); diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index b72aff0..06b4e02 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -21,6 +21,7 @@ #include hw/pci-host/spapr.h #include linux/vfio.h #include hw/misc/vfio.h +#include qemu/error-report.h static Property spapr_phb_vfio_properties[] = { DEFINE_PROP_INT32(iommu, sPAPRPHBVFIOState, iommugroupid, -1), @@ -69,6 +70,17 @@ static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp) /* Register default 32bit DMA window */ memory_region_add_subregion(sphb-iommu_root, tcet-bus_offset, spapr_tce_get_iommu(tcet)); + +if (!tcet-vfio_accel) { +return; +} +ret = vfio_container_spapr_set_liobn(svphb-phb.iommu_as, + svphb-iommugroupid, + tcet-liobn, + tcet-bus_offset); +if (ret) { +error_report(spapr-vfio: failed to create link to IOMMU); +} } static int spapr_pci_vfio_ddw_query(sPAPRPHBState *sphb, @@ -116,6 +128,16 @@ static int spapr_pci_vfio_ddw_create(sPAPRPHBState *sphb, uint32_t page_shift, memory_region_add_subregion(sphb-iommu_root, (*ptcet)-bus_offset, spapr_tce_get_iommu(*ptcet)); +if (!(*ptcet)-vfio_accel) { +return 0; +} +ret = vfio_container_spapr_set_liobn(sphb-iommu_as, svphb-iommugroupid, + liobn, (*ptcet)-bus_offset); +if (ret) { +error_report(spapr-vfio: failed to create link to IOMMU); +ret = 0; +} + return ret; } diff --git a/include/hw/misc/vfio.h b/include/hw/misc/vfio.h index 0b26cd8..8f248e2 100644 --- a/include/hw/misc/vfio.h +++ b/include/hw/misc/vfio.h @@ -6,4 +6,9 @@ extern int vfio_container_ioctl(AddressSpace *as, int32_t groupid, int req, void *param); +extern int 
vfio_container_spapr_set_liobn(AddressSpace *as, + int32_t groupid, + uint64_t liobn, + uint64_t
[PATCH QEMU 10/12] headers: update for KVM_CAP_SPAPR_TCE_64 and VFIO KVM device
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- linux-headers/asm-mips/kvm_para.h | 6 +- linux-headers/asm-powerpc/kvm.h | 9 + linux-headers/linux/kvm.h | 12 linux-headers/linux/kvm_para.h| 3 +++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/linux-headers/asm-mips/kvm_para.h b/linux-headers/asm-mips/kvm_para.h index 14fab8f..dbb2464 100644 --- a/linux-headers/asm-mips/kvm_para.h +++ b/linux-headers/asm-mips/kvm_para.h @@ -1 +1,5 @@ -#include asm-generic/kvm_para.h +#ifndef _ASM_MIPS_KVM_PARA_H +#define _ASM_MIPS_KVM_PARA_H + + +#endif /* _ASM_MIPS_KVM_PARA_H */ diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h index 2bc4a94..39325bf 100644 --- a/linux-headers/asm-powerpc/kvm.h +++ b/linux-headers/asm-powerpc/kvm.h @@ -333,6 +333,15 @@ struct kvm_create_spapr_tce { __u32 window_size; }; +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ + __u32 flags; +}; + /* for KVM_ALLOCATE_RMA */ struct kvm_allocate_rma { __u64 rma_size; diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index f5d2c38..fd728d3 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -758,6 +758,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_VM_ATTRIBUTES 101 #define KVM_CAP_ARM_PSCI_0_2 102 #define KVM_CAP_PPC_FIXUP_HCALL 103 +#define KVM_CAP_SPAPR_TCE_VFIO 104 +#define KVM_CAP_SPAPR_TCE_64 105 #ifdef KVM_CAP_IRQ_ROUTING @@ -947,9 +949,17 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN 3 #define KVM_DEV_TYPE_ARM_VGIC_V2 5 #define KVM_DEV_TYPE_FLIC 6 +struct kvm_vfio_spapr_tce_liobn { + __u32 argsz; + __s32 fd; + __u32 liobn; + __u64 start_addr; +}; + /* * ioctls for VM fds */ @@ -1031,6 +1041,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_PPC_ALLOC_HTAB */ #define 
KVM_PPC_ALLOCATE_HTAB_IOWR(KVMIO, 0xa7, __u32) #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) +#define KVM_CREATE_SPAPR_TCE_64 _IOW(KVMIO, 0xa8, \ + struct kvm_create_spapr_tce_64) /* Available with KVM_CAP_RMA */ #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) /* Available with KVM_CAP_PPC_HTAB_FD */ diff --git a/linux-headers/linux/kvm_para.h b/linux-headers/linux/kvm_para.h index 2dff783..e61661e 100644 --- a/linux-headers/linux/kvm_para.h +++ b/linux-headers/linux/kvm_para.h @@ -20,6 +20,9 @@ #define KVM_HC_FEATURES3 #define KVM_HC_PPC_MAP_MAGIC_PAGE 4 #define KVM_HC_KICK_CPU5 +#define KVM_HC_MIPS_GET_CLOCK_FREQ 6 +#define KVM_HC_MIPS_EXIT_VM7 +#define KVM_HC_MIPS_CONSOLE_OUTPUT 8 /* * hypercalls use architecture specific -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 09/12] vfio: Enable DDW ioctls to VFIO IOMMU driver
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/misc/vfio.c | 4 1 file changed, 4 insertions(+) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 0b9eba0..e7b4d6e 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -4437,6 +4437,10 @@ int vfio_container_ioctl(AddressSpace *as, int32_t groupid, switch (req) { case VFIO_CHECK_EXTENSION: case VFIO_IOMMU_SPAPR_TCE_GET_INFO: +case VFIO_IOMMU_SPAPR_TCE_QUERY: +case VFIO_IOMMU_SPAPR_TCE_CREATE: +case VFIO_IOMMU_SPAPR_TCE_REMOVE: +case VFIO_IOMMU_SPAPR_TCE_RESET: break; default: /* Return an error on unknown requests */ -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] ppc/xmon: use isxdigit/isspace/isalnum from ctype.h
❦ 15 juillet 2014 08:55 GMT, David Laight david.lai...@aculab.com : Use linux/ctype.h instead of defining custom versions of isxdigit/isspace/isalnum. ... -#define isspace(c) (c == ' ' || c == '\t' || c == 10 || c == 13 || c == 0) That is different from the version in linux/ctype.h Especially for 'c == 0', but probably also vertical tab and form feed. OK. Looking more carefully, the one in ctype.h is 9-13 (11 is vertical tab, 12 is form feed), 32 and 160 (non-breaking space, not ASCII). For isxdigit, this is the same. For isalnum, the one in ctype.h does accept non ASCII chars from 223. Also, in xmon.c, isxdigit is defined twice. -- Parenthesise to avoid ambiguity. - The Elements of Programming Style (Kernighan Plauger) ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: Re: [PATCH v5 2/2] [BUGFIX] kprobes: Fix "Failed to find blacklist" error on ia64 and ppc64
(2014/07/15 16:16), Benjamin Herrenschmidt wrote: On Tue, 2014-07-15 at 13:19 +1000, Michael Ellerman wrote: Signed-off-by: Masami Hiramatsu masami.hiramatsu...@hitachi.com Reported-by: Tony Luck tony.l...@gmail.com Tested-by: Tony Luck tony.l...@intel.com Cc: Michael Ellerman m...@ellerman.id.au Tested-by: Michael Ellerman m...@ellerman.id.au Acked-by: Michael Ellerman m...@ellerman.id.au (for powerpc) Ben, can you take this in your tree? Acked-by: Benjamin Herrenschmidt b...@kernel.crashing.org That looks more like generic material. Do we have a kprobes maintainer ? Andrew, do you want to take this ? Yeah, I usually use Ingo's tip tree for kprobes maintenance. Ingo, could you pull this as urgent-for-linus patch? I'm happy to put it in powerpc and send it to Linus tomorrow if nobody cares :-) Thank you! -- Masami HIRAMATSU Software Platform Research Dept. Linux Technology Research Center Hitachi, Ltd., Yokohama Research Laboratory E-mail: masami.hiramatsu...@hitachi.com ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: Move bad_stack() below the fwnmi_data_area
At the moment the allmodconfig build is failing because we run out of space between altivec_assist() at 0x5700 and the fwnmi_data_area at 0x7000. Fixing it permanently will take some more work, but a quick fix is to move bad_stack() below the fwnmi_data_area. That gives us just enough room with everything enabled. bad_stack() is called from the common exception handlers, but it's a non-conditional branch, so we have plenty of scope to move it further way. Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/kernel/exceptions-64s.S | 120 +-- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b859b3665be6..647d6c75ed62 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -953,66 +953,6 @@ ppc64_runlatch_on_trampoline: b __ppc64_runlatch_on /* - * Here we have detected that the kernel stack pointer is bad. - * R9 contains the saved CR, r13 points to the paca, - * r10 contains the (bad) kernel stack pointer, - * r11 and r12 contain the saved SRR0 and SRR1. - * We switch to using an emergency stack, save the registers there, - * and call kernel_bad_stack(), which panics. 
- */ -bad_stack: - ld r1,PACAEMERGSP(r13) - subir1,r1,64+INT_FRAME_SIZE - std r9,_CCR(r1) - std r10,GPR1(r1) - std r11,_NIP(r1) - std r12,_MSR(r1) - mfspr r11,SPRN_DAR - mfspr r12,SPRN_DSISR - std r11,_DAR(r1) - std r12,_DSISR(r1) - mflrr10 - mfctr r11 - mfxer r12 - std r10,_LINK(r1) - std r11,_CTR(r1) - std r12,_XER(r1) - SAVE_GPR(0,r1) - SAVE_GPR(2,r1) - ld r10,EX_R3(r3) - std r10,GPR3(r1) - SAVE_GPR(4,r1) - SAVE_4GPRS(5,r1) - ld r9,EX_R9(r3) - ld r10,EX_R10(r3) - SAVE_2GPRS(9,r1) - ld r9,EX_R11(r3) - ld r10,EX_R12(r3) - ld r11,EX_R13(r3) - std r9,GPR11(r1) - std r10,GPR12(r1) - std r11,GPR13(r1) -BEGIN_FTR_SECTION - ld r10,EX_CFAR(r3) - std r10,ORIG_GPR3(r1) -END_FTR_SECTION_IFSET(CPU_FTR_CFAR) - SAVE_8GPRS(14,r1) - SAVE_10GPRS(22,r1) - lhz r12,PACA_TRAP_SAVE(r13) - std r12,_TRAP(r1) - addir11,r1,INT_FRAME_SIZE - std r11,0(r1) - li r12,0 - std r12,0(r11) - ld r2,PACATOC(r13) - ld r11,exception_marker@toc(r2) - std r12,RESULT(r1) - std r11,STACK_FRAME_OVERHEAD-16(r1) -1: addir3,r1,STACK_FRAME_OVERHEAD - bl kernel_bad_stack - b 1b - -/* * Here r13 points to the paca, r9 contains the saved CR, * SRR0 and SRR1 are saved in r11 and r12, * r9 - r13 are saved in paca-exgen. @@ -1636,3 +1576,63 @@ handle_dabr_fault: li r5,SIGSEGV bl bad_page_fault b ret_from_except + +/* + * Here we have detected that the kernel stack pointer is bad. + * R9 contains the saved CR, r13 points to the paca, + * r10 contains the (bad) kernel stack pointer, + * r11 and r12 contain the saved SRR0 and SRR1. + * We switch to using an emergency stack, save the registers there, + * and call kernel_bad_stack(), which panics. 
+ */ +bad_stack: + ld r1,PACAEMERGSP(r13) + subir1,r1,64+INT_FRAME_SIZE + std r9,_CCR(r1) + std r10,GPR1(r1) + std r11,_NIP(r1) + std r12,_MSR(r1) + mfspr r11,SPRN_DAR + mfspr r12,SPRN_DSISR + std r11,_DAR(r1) + std r12,_DSISR(r1) + mflrr10 + mfctr r11 + mfxer r12 + std r10,_LINK(r1) + std r11,_CTR(r1) + std r12,_XER(r1) + SAVE_GPR(0,r1) + SAVE_GPR(2,r1) + ld r10,EX_R3(r3) + std r10,GPR3(r1) + SAVE_GPR(4,r1) + SAVE_4GPRS(5,r1) + ld r9,EX_R9(r3) + ld r10,EX_R10(r3) + SAVE_2GPRS(9,r1) + ld r9,EX_R11(r3) + ld r10,EX_R12(r3) + ld r11,EX_R13(r3) + std r9,GPR11(r1) + std r10,GPR12(r1) + std r11,GPR13(r1) +BEGIN_FTR_SECTION + ld r10,EX_CFAR(r3) + std r10,ORIG_GPR3(r1) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + SAVE_8GPRS(14,r1) + SAVE_10GPRS(22,r1) + lhz r12,PACA_TRAP_SAVE(r13) + std r12,_TRAP(r1) + addir11,r1,INT_FRAME_SIZE + std r11,0(r1) + li r12,0 + std r12,0(r11) + ld r2,PACATOC(r13) + ld r11,exception_marker@toc(r2) + std r12,RESULT(r1) + std r11,STACK_FRAME_OVERHEAD-16(r1) +1: addir3,r1,STACK_FRAME_OVERHEAD + bl kernel_bad_stack + b 1b -- 1.9.1 ___ Linuxppc-dev mailing
[PATCH 1/3] powerpc: Update comments in irqflags.h
The comment on TRACE_ENABLE_INTS is incorrect, and appears to have always been incorrect since the code was merged. It probably came from an original out-of-tree patch. Replace it with something that's correct. Also propagate the message to RECONCILE_IRQ_STATE(), because it's potentially subtle. Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/include/asm/irqflags.h | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/irqflags.h b/arch/powerpc/include/asm/irqflags.h index e20eb95429a8..f2149066fe5d 100644 --- a/arch/powerpc/include/asm/irqflags.h +++ b/arch/powerpc/include/asm/irqflags.h @@ -32,9 +32,8 @@ #endif /* - * Most of the CPU's IRQ-state tracing is done from assembly code; we - * have to call a C function so call a wrapper that saves all the - * C-clobbered registers. + * These are calls to C code, so the caller must be prepared for volatiles to + * be clobbered. */ #define TRACE_ENABLE_INTS TRACE_WITH_FRAME_BUFFER(trace_hardirqs_on) #define TRACE_DISABLE_INTS TRACE_WITH_FRAME_BUFFER(trace_hardirqs_off) @@ -42,6 +41,9 @@ /* * This is used by assembly code to soft-disable interrupts first and * reconcile irq state. + * + * NB: This may call C code, so the caller must be prepared for volatiles to + * be clobbered. */ #define RECONCILE_IRQ_STATE(__rA, __rB)\ lbz __rA,PACASOFTIRQEN(r13);\ -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/3] powerpc: Document register clobbering in EXCEPTION_COMMON()
Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/include/asm/exception-64s.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 8f35cd7d59cc..066c15cd2837 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -532,6 +532,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL) .globl label##_common; \ label##_common:\ EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ + /* Volatile regs are potentially clobbered here */ \ additions; \ addir3,r1,STACK_FRAME_OVERHEAD; \ bl hdlr; \ -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 3/3] powerpc: Remove misleading DISABLE_INTS
DISABLE_INTS has a long and storied history, but for some time now it has not actually disabled interrupts. For the open-coded exception handlers, just stop using it, instead call RECONCILE_IRQ_STATE directly. This has the benefit of removing a level of indirection, and making it clear that r10 r11 are used at that point. For the addition case we still need a macro, so rename it to clarify what it actually does. Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/include/asm/exception-64s.h | 11 +++ arch/powerpc/kernel/exceptions-64s.S | 28 ++-- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 066c15cd2837..13a63379e496 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -513,8 +513,11 @@ label##_relon_hv: \ * runlatch, etc... */ -/* Exception addition: Hard disable interrupts */ -#define DISABLE_INTS RECONCILE_IRQ_STATE(r10,r11) +/* + * This addition reconciles our actual IRQ state with the various software + * flags that track it. This may call C code. 
+ */ +#define ADD_RECONCILE RECONCILE_IRQ_STATE(r10,r11) #define ADD_NVGPRS \ bl save_nvgprs @@ -540,7 +543,7 @@ label##_common: \ #define STD_EXCEPTION_COMMON(trap, label, hdlr)\ EXCEPTION_COMMON(trap, label, hdlr, ret_from_except,\ -ADD_NVGPRS;DISABLE_INTS) +ADD_NVGPRS;ADD_RECONCILE) /* * Like STD_EXCEPTION_COMMON, but for exceptions that can occur @@ -549,7 +552,7 @@ label##_common: \ */ #define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr)\ EXCEPTION_COMMON(trap, label, hdlr, ret_from_except_lite, \ -FINISH_NAP;DISABLE_INTS;RUNLATCH_ON) +FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON) /* * When the idle code in power4_idle puts the CPU into NAP mode, diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a7d36b19221d..03a54ef03049 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1057,7 +1057,7 @@ data_access_common: mfspr r10,SPRN_DSISR stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN) - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) @@ -1073,7 +1073,7 @@ h_data_storage_common: stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN) bl save_nvgprs - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) addir3,r1,STACK_FRAME_OVERHEAD bl unknown_exception b ret_from_except @@ -1082,7 +1082,7 @@ h_data_storage_common: .globl instruction_access_common instruction_access_common: EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN) - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,_NIP(r1) andis. 
r4,r12,0x5820 @@ -1146,7 +1146,7 @@ slb_miss_fault: unrecov_user_slb: EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN) - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) bl save_nvgprs 1: addir3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception @@ -1169,7 +1169,7 @@ machine_check_common: stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC) FINISH_NAP - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) std r3,_DAR(r1) @@ -1192,7 +1192,7 @@ alignment_common: std r3,_DAR(r1) std r4,_DSISR(r1) bl save_nvgprs - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) addir3,r1,STACK_FRAME_OVERHEAD bl alignment_exception b ret_from_except @@ -1202,7 +1202,7 @@ alignment_common: program_check_common: EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) bl save_nvgprs - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) addir3,r1,STACK_FRAME_OVERHEAD bl program_check_exception b ret_from_except @@ -1213,7 +1213,7 @@ fp_unavailable_common: EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN) bne 1f /* if from user, just load it up */ bl save_nvgprs - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) addir3,r1,STACK_FRAME_OVERHEAD bl kernel_fp_unavailable_exception
Re: [PATCH 1/6] powerpc/powernv: Enable M64 aperture for PHB3
On Tue, Jul 15, 2014 at 10:55:25AM +0800, Wei Yang wrote: On Thu, Jul 10, 2014 at 09:53:41PM +0800, Guo Chao wrote: This patch enable M64 aperatus for PHB3. We already had platform hook (ppc_md.pcibios_window_alignment) to affect the PCI resource assignment done in PCI core so that each PE's M32 resource was built on basis of M32 segment size. Similarly, we're using that for M64 assignment on basis of M64 segment size. * We're using last M64 BAR to cover M64 aperatus, and it's shared by all 256 PEs. * We don't support P7IOC yet. However, some function callbacks are added to (struct pnv_phb) so that we can reuse them on P7IOC in future. * PE, corresponding to PCI bus with large M64 BAR device attached, might span multiple M64 segments. We introduce compound PE to cover the case. The compound PE is a list of PEs and the master PE is used as before. The slave PEs are just for MMIO isolation. Signed-off-by: Guo Chao y...@linux.vnet.ibm.com --- arch/powerpc/include/asm/opal.h | 8 +- arch/powerpc/platforms/powernv/pci-ioda.c | 284 -- arch/powerpc/platforms/powernv/pci.h | 20 +++ 3 files changed, 297 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 0da1dbd..ae885cc 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -340,6 +340,12 @@ enum OpalMveEnableAction { OPAL_ENABLE_MVE = 1 }; +enum OpalM64EnableAction { + OPAL_DISABLE_M64 = 0, + OPAL_ENABLE_M64_SPLIT = 1, + OPAL_ENABLE_M64_NON_SPLIT = 2 +}; + enum OpalPciResetScope { OPAL_PHB_COMPLETE = 1, OPAL_PCI_LINK = 2, OPAL_PHB_ERROR = 3, OPAL_PCI_HOT_RESET = 4, OPAL_PCI_FUNDAMENTAL_RESET = 5, @@ -768,7 +774,7 @@ int64_t opal_pci_set_phb_mem_window(uint64_t phb_id, uint16_t window_type, uint16_t window_num, uint64_t starting_real_address, uint64_t starting_pci_address, - uint16_t segment_size); + uint64_t size); int64_t opal_pci_map_pe_mmio_window(uint64_t phb_id, uint16_t pe_number, uint16_t window_type, uint16_t window_num, 
uint16_t segment_num); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index de19ede..851e615 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -36,6 +36,7 @@ #include asm/tce.h #include asm/xics.h #include asm/debug.h +#include asm/firmware.h #include powernv.h #include pci.h @@ -82,6 +83,12 @@ static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) : : r (val), r (paddr) : memory); } +static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) +{ + return ((flags (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) == + (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); +} + static int pnv_ioda_alloc_pe(struct pnv_phb *phb) { unsigned long pe; @@ -106,6 +113,243 @@ static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) clear_bit(pe, phb-ioda.pe_alloc); } +/* The default M64 BAR is shared by all PEs */ +static int pnv_ioda2_init_m64(struct pnv_phb *phb) +{ + const char *desc; + struct resource *r; + s64 rc; + + /* Configure the default M64 BAR */ + rc = opal_pci_set_phb_mem_window(phb-opal_id, + OPAL_M64_WINDOW_TYPE, + phb-ioda.m64_bar_idx, + phb-ioda.m64_base, + 0, /* unused */ + phb-ioda.m64_size); + if (rc != OPAL_SUCCESS) { + desc = configuring; + goto fail; + } + + /* Enable the default M64 BAR */ + rc = opal_pci_phb_mmio_enable(phb-opal_id, + OPAL_M64_WINDOW_TYPE, + phb-ioda.m64_bar_idx, + OPAL_ENABLE_M64_SPLIT); + if (rc != OPAL_SUCCESS) { + desc = enabling; + goto fail; + } + + /* Mark the M64 BAR assigned */ + set_bit(phb-ioda.m64_bar_idx, phb-ioda.m64_bar_alloc); + + /* + * Strip off the segment used by the reserved PE, which is + * expected to be 0 or last one of PE capabicity. 
+ */ + r = phb-hose-mem_resources[1]; + if (phb-ioda.reserved_pe == 0) + r-start += phb-ioda.m64_segsize; + else if (phb-ioda.reserved_pe == (phb-ioda.total_pe - 1)) + r-end -= phb-ioda.m64_segsize; + else + pr_warn( Cannot strip M64 segment for reserved PE#%d\n, + phb-ioda.reserved_pe); + + return 0; + +fail: + pr_warn( Failure %lld %s M64 BAR#%d\n, +
[PATCH] ppc/xmon: use isspace/isxdigit/isalnum from linux/ctype.h
isxdigit() macro definition is the same. isalnum() from linux/ctype.h will accept additional latin non-ASCII characters. This is harmless since this macro is used in scanhex() which parses user input. isspace() from linux/ctype.h will accept vertical tab and form feed but not NULL. The use of this macro is modified to accept NULL as well. Additional characters are harmless since this macro is also only used in scanhex(). Signed-off-by: Vincent Bernat vinc...@bernat.im --- arch/powerpc/xmon/xmon.c | 14 ++ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d199bfa2f1fa..55d9b48774b7 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -24,6 +24,7 @@ #include linux/interrupt.h #include linux/irq.h #include linux/bug.h +#include linux/ctype.h #include asm/ptrace.h #include asm/string.h @@ -177,14 +178,6 @@ extern void xmon_leave(void); #define GETWORD(v) (((v)[0] 24) + ((v)[1] 16) + ((v)[2] 8) + (v)[3]) #endif -#define isxdigit(c)(('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'f') \ -|| ('A' = (c) (c) = 'F')) -#define isalnum(c) (('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'z') \ -|| ('A' = (c) (c) = 'Z')) -#define isspace(c) (c == ' ' || c == '\t' || c == 10 || c == 13 || c == 0) - static char *help_string = \ Commands:\n\ bshow breakpoints\n\ @@ -2121,9 +2114,6 @@ static void dump_pacas(void) } #endif -#define isxdigit(c)(('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'f') \ -|| ('A' = (c) (c) = 'F')) static void dump(void) { @@ -2526,7 +2516,7 @@ scanhex(unsigned long *vp) int i; for (i=0; i63; i++) { c = inchar(); - if (isspace(c)) { + if (isspace(c) || c == '\0') { termch = c; break; } -- 2.0.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/3] powerpc: Add machine_early_initcall()
Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/include/asm/machdep.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index f92b0b54e921..5c7e74ddee4c 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -366,6 +366,7 @@ static inline void log_error(char *buf, unsigned int err_type, int fatal) } \ __define_initcall(__machine_initcall_##mach##_##fn, id); +#define machine_early_initcall(mach, fn) __define_machine_initcall(mach, fn, early) #define machine_core_initcall(mach, fn) __define_machine_initcall(mach, fn, 1) #define machine_core_initcall_sync(mach, fn) __define_machine_initcall(mach, fn, 1s) #define machine_postcore_initcall(mach, fn)__define_machine_initcall(mach, fn, 2) -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/3] powerpc/powernv: Switch powernv drivers to use machine_xxx_initcall()
A lot of the code in platforms/powernv is using non-machine initcalls. That means if a kernel built with powernv support runs on another platform, for example pseries, the initcalls will still run. That is usually OK, because the initcalls will check for something in the device tree or elsewhere before doing anything, so on other platforms they will usually just return. But it's fishy for powernv code to be running on other platforms, so switch them all to be machine initcalls. If we want any of them to run on other platforms in future they should move to sysdev. Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/platforms/powernv/eeh-powernv.c| 6 +- arch/powerpc/platforms/powernv/opal-async.c | 3 ++- arch/powerpc/platforms/powernv/opal-lpc.c | 2 +- arch/powerpc/platforms/powernv/opal-memory-errors.c | 3 ++- arch/powerpc/platforms/powernv/opal-xscom.c | 2 +- arch/powerpc/platforms/powernv/opal.c | 9 + arch/powerpc/platforms/powernv/pci.c| 3 +-- arch/powerpc/platforms/powernv/rng.c| 2 +- 8 files changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 56a206f32f77..998bcc18a491 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -398,9 +398,6 @@ static int __init eeh_powernv_init(void) { int ret = -EINVAL; - if (!machine_is(powernv)) - return ret; - ret = eeh_ops_register(powernv_eeh_ops); if (!ret) pr_info(EEH: PowerNV platform initialized\n); @@ -409,5 +406,4 @@ static int __init eeh_powernv_init(void) return ret; } - -early_initcall(eeh_powernv_init); +machine_early_initcall(powernv, eeh_powernv_init); diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 32e2adfa5320..e462ab947d16 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -20,6 +20,7 @@ #include linux/wait.h #include 
linux/gfp.h #include linux/of.h +#include asm/machdep.h #include asm/opal.h #define N_ASYNC_COMPLETIONS64 @@ -201,4 +202,4 @@ out_opal_node: out: return err; } -subsys_initcall(opal_async_comp_init); +machine_subsys_initcall(powernv, opal_async_comp_init); diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index f04b4d8aca5a..ad4b31df779a 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -324,7 +324,7 @@ static int opal_lpc_init_debugfs(void) rc |= opal_lpc_debugfs_create_type(root, fw, OPAL_LPC_FW); return rc; } -device_initcall(opal_lpc_init_debugfs); +machine_device_initcall(powernv, opal_lpc_init_debugfs); #endif /* CONFIG_DEBUG_FS */ void opal_lpc_init(void) diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c index b17a34b695ef..43db2136dbff 100644 --- a/arch/powerpc/platforms/powernv/opal-memory-errors.c +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -27,6 +27,7 @@ #include linux/mm.h #include linux/slab.h +#include asm/machdep.h #include asm/opal.h #include asm/cputable.h @@ -143,4 +144,4 @@ static int __init opal_mem_err_init(void) } return 0; } -subsys_initcall(opal_mem_err_init); +machine_subsys_initcall(powernv, opal_mem_err_init); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 4cd2ea6c0dbe..7634d1c62299 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -130,4 +130,4 @@ static int opal_xscom_init(void) scom_init(opal_scom_controller); return 0; } -arch_initcall(opal_xscom_init); +machine_arch_initcall(powernv, opal_xscom_init); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 199975613fe9..6ef2e5c5bc64 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -22,6 
+22,8 @@ #include linux/kobject.h #include linux/delay.h #include linux/memblock.h + +#include asm/machdep.h #include asm/opal.h #include asm/firmware.h #include asm/mce.h @@ -200,8 +202,7 @@ static int __init opal_register_exception_handlers(void) return 0; } - -early_initcall(opal_register_exception_handlers); +machine_early_initcall(powernv, opal_register_exception_handlers); int opal_notifier_register(struct notifier_block *nb) { @@ -368,7 +369,7 @@ static int __init opal_message_init(void) } return 0; } -early_initcall(opal_message_init); +machine_early_initcall(powernv, opal_message_init); int opal_get_chars(uint32_t vtermno, char *buf, int count) { @@ -630,7 +631,7 @@
[PATCH 3/3] powerpc/pseries: Switch pseries drivers to use machine_xxx_initcall()
A lot of the code in platforms/pseries is using non-machine initcalls. That means if a kernel built with pseries support runs on another platform, for example powernv, the initcalls will still run. Most of these cases are OK, though sometimes only due to luck. Some were having more effect: * hcall_inst_init - Checking FW_FEATURE_LPAR which is set on ps3 celleb. * mobility_sysfs_init - created sysfs files unconditionally - but no effect due to ENOSYS from rtas_ibm_suspend_me() * apo_pm_init - created sysfs, allows write - nothing checks the value written to though * alloc_dispatch_log_kmem_cache - creating kmem_cache on non-pseries machines Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/platforms/pseries/dtl.c | 3 ++- arch/powerpc/platforms/pseries/eeh_pseries.c | 8 ++-- arch/powerpc/platforms/pseries/hvCall_inst.c | 2 +- arch/powerpc/platforms/pseries/mobility.c| 3 ++- arch/powerpc/platforms/pseries/msi.c | 3 +-- arch/powerpc/platforms/pseries/power.c | 5 +++-- arch/powerpc/platforms/pseries/ras.c | 2 +- arch/powerpc/platforms/pseries/reconfig.c| 5 + arch/powerpc/platforms/pseries/rng.c | 2 +- arch/powerpc/platforms/pseries/setup.c | 2 +- arch/powerpc/platforms/pseries/suspend.c | 5 ++--- 11 files changed, 17 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 7d61498e45c0..1062f71f5a85 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -29,6 +29,7 @@ #include asm/lppaca.h #include asm/debug.h #include asm/plpar_wrappers.h +#include asm/machdep.h struct dtl { struct dtl_entry*buf; @@ -391,4 +392,4 @@ err_remove_dir: err: return rc; } -arch_initcall(dtl_init); +machine_arch_initcall(pseries, dtl_init); diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 0bec0c02c5e7..476a5d8b0b36 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ 
b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -743,10 +743,7 @@ static struct eeh_ops pseries_eeh_ops = { */ static int __init eeh_pseries_init(void) { - int ret = -EINVAL; - - if (!machine_is(pseries)) - return ret; + int ret; ret = eeh_ops_register(pseries_eeh_ops); if (!ret) @@ -757,5 +754,4 @@ static int __init eeh_pseries_init(void) return ret; } - -early_initcall(eeh_pseries_init); +machine_early_initcall(pseries, eeh_pseries_init); diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index cf4e7736e4f1..19f15310facf 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -162,4 +162,4 @@ static int __init hcall_inst_init(void) return 0; } -__initcall(hcall_inst_init); +machine_device_initcall(pseries, hcall_inst_init); diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index bde7ebad3949..d146fef038b8 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -18,6 +18,7 @@ #include linux/delay.h #include linux/slab.h +#include asm/machdep.h #include asm/rtas.h #include pseries.h @@ -362,4 +363,4 @@ static int __init mobility_sysfs_init(void) return rc; } -device_initcall(mobility_sysfs_init); +machine_device_initcall(pseries, mobility_sysfs_init); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 0c882e83c4ce..b3486dd041df 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -532,5 +532,4 @@ static int rtas_msi_init(void) return 0; } -arch_initcall(rtas_msi_init); - +machine_arch_initcall(pseries, rtas_msi_init); diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c index 6d6266236446..c26eadde434c 100644 --- a/arch/powerpc/platforms/pseries/power.c +++ b/arch/powerpc/platforms/pseries/power.c @@ -25,6 +25,7 @@ #include 
linux/string.h #include linux/errno.h #include linux/init.h +#include asm/machdep.h unsigned long rtas_poweron_auto; /* default and normal state is 0 */ @@ -71,11 +72,11 @@ static int __init pm_init(void) return -ENOMEM; return sysfs_create_group(power_kobj, attr_group); } -core_initcall(pm_init); +machine_core_initcall(pseries, pm_init); #else static int __init apo_pm_init(void) { return (sysfs_create_file(power_kobj, auto_poweron_attr.attr)); } -__initcall(apo_pm_init); +machine_device_initcall(pseries, apo_pm_init); #endif diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 9c5778e6ed4b..dff05b9eb946 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@
Re: bit fields data tearing
On 07/13/2014 06:25 PM, Benjamin Herrenschmidt wrote: On Sun, 2014-07-13 at 09:15 -0400, Peter Hurley wrote: I'm not sure I understand your point here, Ben. Suppose that two different spinlocks are used independently to protect r-m-w access to adjacent data. In Oleg's example, suppose spinlock 1 is used for access to the bitfield and spinlock 2 is used for access to freeze_stop. What would prevent an accidental write to freeze_stop from the kt_1 thread? My point was to be wary of bitfields in general because access to them is always R-M-W, never atomic and that seems to escape people regularly :-) (Among other problems such as endian etc...) As for Oleg's example, it *should* have worked because the bitfield and the adjacent freeze_stop should have been accessed using load/stores that don't actually overlap, but the compiler bug causes the bitfield access to not properly use the basic type of the bitfield, but escalate to a full 64-bit R-M-W instead, thus incorrectly R-M-W'ing the field next door. Yeah, ok, so just a generic heads-up about non-atomicity of bitfields, and not something specific to Oleg's example. Thanks. Jonathan Corbet wrote a LWN article about this back in 2012: http://lwn.net/Articles/478657/ I guess it's fixed in gcc 4.8, but too bad there's not a workaround for earlier compilers (akin to -fstrict_volatile_bitfields without requiring the volatile keyword). Regards, Peter Hurley ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3] arm64, ia64, ppc, s390, sh, tile, um, x86, mm: Remove default gate area
On Sun, Jul 13, 2014 at 1:01 PM, Andy Lutomirski l...@amacapital.net wrote: The core mm code will provide a default gate area based on FIXADDR_USER_START and FIXADDR_USER_END if !defined(__HAVE_ARCH_GATE_AREA) defined(AT_SYSINFO_EHDR). This default is only useful for ia64. arm64, ppc, s390, sh, tile, 64-bit UML, and x86_32 have their own code just to disable it. arm, 32-bit UML, and x86_64 have gate areas, but they have their own implementations. This gets rid of the default and moves the code into ia64. This should save some code on architectures without a gate area: it's now possible to inline the gate_area functions in the default case. Can one of you pull this somewhere? Otherwise I can put it somewhere stable and ask for -next inclusion, but that seems like overkill for a single patch. --Andy ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: thp: Add write barrier after updating the valid bit
With hugepages, we store the hpte valid information in the pte page whose address is stored in the second half of the PMD. Use a write barrier to make sure that clearing pmd busy bit and updating hpte valid info are ordered properly. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable-ppc64.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index eb9261024f51..558beb760062 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -394,6 +394,12 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, unsigned int index, unsigned int hidx) { hpte_slot_array[index] = hidx 4 | 0x1 3; + /* +* The hpte valid is stored in the pgtable whose address is in the +* second half of the PMD. Order this against clearing of the busy bit in +* huge pmd. +*/ + smp_wmb(); } struct page *realmode_pfn_to_page(unsigned long pfn); -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/2] powerpc: thp: invalidate old 64K based hash page mapping before insert
If we changed base page size of the segment, either via sub_page_protect or via remap_4k_pfn, we do a demote_segment which doesn't flush the hash table entries. We do that when inserting a new hash pte by checking the _PAGE_COMBO flag. We missed to do that when inserting hash for a new 16MB page. Add the same. This patch mark the 4k base page size 16MB hugepage via _PAGE_COMBO. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/mm/hugepage-hash64.c | 66 +++ 1 file changed, 66 insertions(+) diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 826893fcb3a7..28d1b8b93674 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -18,6 +18,56 @@ #include linux/mm.h #include asm/machdep.h +static void flush_hash_hugepage(unsigned long vsid, unsigned long addr, + pmd_t *pmdp, unsigned int psize, int ssize) +{ + int i, max_hpte_count, valid; + unsigned long s_addr = addr; + unsigned char *hpte_slot_array; + unsigned long hidx, shift, vpn, hash, slot; + + hpte_slot_array = get_hpte_slot_array(pmdp); + /* +* IF we try to do a HUGE PTE update after a withdraw is done. +* we will find the below NULL. 
This happens when we do +* split_huge_page_pmd +*/ + if (!hpte_slot_array) + return; + + if (ppc_md.hugepage_invalidate) + return ppc_md.hugepage_invalidate(vsid, addr, hpte_slot_array, + psize, ssize); + /* +* No bluk hpte removal support, invalidate each entry +*/ + shift = mmu_psize_defs[psize].shift; + max_hpte_count = HPAGE_PMD_SIZE shift; + for (i = 0; i max_hpte_count; i++) { + /* +* 8 bits per each hpte entries +* 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] +*/ + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx _PTEIDX_GROUP_IX; + ppc_md.hpte_invalidate(slot, vpn, psize, + MMU_PAGE_16M, ssize, 0); + } +} + + int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, pmd_t *pmdp, unsigned long trap, int local, int ssize, unsigned int psize) @@ -85,6 +135,15 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, vpn = hpt_vpn(ea, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); hpte_slot_array = get_hpte_slot_array(pmdp); + if (psize == MMU_PAGE_4K) { + /* +* invalidate the old hpte entry if we have that mapped via 64K +* base page size. This is because demote_segment won't flush +* hash page table entries. +*/ + if (!(old_pmd _PAGE_COMBO)) + flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, ssize); + } valid = hpte_valid(hpte_slot_array, index); if (valid) { @@ -172,6 +231,13 @@ repeat: mark_hpte_slot_valid(hpte_slot_array, index, slot); } /* +* Mark the pte with _PAGE_COMBO, if we are trying to hash it with +* base page size 4k. 
+*/ + if (psize == MMU_PAGE_4K) + new_pmd |= _PAGE_COMBO; + + /* * No need to use ldarx/stdcx here */ *pmdp = __pmd(new_pmd ~_PAGE_BUSY); -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: thp: Add write barrier after updating the valid bit
With hugepages, we store the hpte valid information in the pte page whose address is stored in the second half of the PMD. Use a write barrier to make sure that clearing pmd busy bit and updating hpte valid info are ordered properly. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable-ppc64.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index eb9261024f51..558beb760062 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -394,6 +394,12 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, unsigned int index, unsigned int hidx) { hpte_slot_array[index] = hidx 4 | 0x1 3; + /* +* The hpte valid is stored in the pgtable whose address is in the +* second half of the PMD. Order this against clearing of the busy bit in +* huge pmd. +*/ + smp_wmb(); } struct page *realmode_pfn_to_page(unsigned long pfn); -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: subpage_protect: Increase the array size to take care of 64TB
We now support TASK_SIZE of 64TB, hence the array should be 8. Fixes the below crash: Unable to handle kernel paging request for data at address 0x000100bd Faulting instruction address: 0xc004f914 cpu 0x13: Vector: 300 (Data Access) at [c00fea75fa90] pc: c004f914: .sys_subpage_prot+0x2d4/0x5c0 lr: c004fb5c: .sys_subpage_prot+0x51c/0x5c0 sp: c00fea75fd10 msr: 90009032 dar: 100bd dsisr: 4000 current = 0xc00fea6ae490 paca= 0xcfb8ab00 softe: 0irq_happened: 0x00 pid = 8237, comm = a.out enter ? for help [c00fea75fe30] c000a164 syscall_exit+0x0/0x98 --- Exception: c00 (System Call) at 3fff89737004 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/mmu-hash64.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 807014dde821..c2b4dcf23d03 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -22,6 +22,7 @@ */ #include asm/pgtable-ppc64.h #include asm/bug.h +#include asm/processor.h /* * Segment table @@ -496,7 +497,7 @@ extern void slb_set_size(u16 size); */ struct subpage_prot_table { unsigned long maxaddr; /* only addresses this are protected */ - unsigned int **protptrs[2]; + unsigned int **protptrs[(TASK_SIZE_USER64 43)]; unsigned int *low_prot[4]; }; -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/2] powerpc: thp: don't recompute vsid and ssize in loop on invalidate
The segment identifier and segment size will remain the same in the loop, So we can compute it outside. We also change the hugepage_invalidate interface so that we can use it the later patch Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/machdep.h| 6 +++--- arch/powerpc/mm/hash_native_64.c | 19 +-- arch/powerpc/mm/pgtable_64.c | 24 arch/powerpc/platforms/pseries/lpar.c | 20 ++-- 4 files changed, 26 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index f92b0b54e921..8dcb721d03d8 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -57,10 +57,10 @@ struct machdep_calls { void(*hpte_removebolted)(unsigned long ea, int psize, int ssize); void(*flush_hash_range)(unsigned long number, int local); - void(*hugepage_invalidate)(struct mm_struct *mm, + void(*hugepage_invalidate)(unsigned long vsid, + unsigned long addr, unsigned char *hpte_slot_array, - unsigned long addr, int psize); - + int psize, int ssize); /* special for kexec, to be called in real mode, linear mapping is * destroyed as well */ void(*hpte_clear_all)(void); diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index cf1d325eae8b..fb89d7695a9a 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -412,18 +412,18 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, local_irq_restore(flags); } -static void native_hugepage_invalidate(struct mm_struct *mm, +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, unsigned char *hpte_slot_array, - unsigned long addr, int psize) + int psize, int ssize) { - int ssize = 0, i; - int lock_tlbie; + int i, lock_tlbie; struct hash_pte *hptep; int actual_psize = MMU_PAGE_16M; unsigned int max_hpte_count, valid; unsigned long flags, s_addr = addr; unsigned long hpte_v, want_v, shift; - unsigned long hidx, vpn = 
0, vsid, hash, slot; + unsigned long hidx, vpn = 0, hash, slot; shift = mmu_psize_defs[psize].shift; max_hpte_count = 1U (PMD_SHIFT - shift); @@ -437,15 +437,6 @@ static void native_hugepage_invalidate(struct mm_struct *mm, /* get the vpn */ addr = s_addr + (i * (1ul shift)); - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm-context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - vpn = hpt_vpn(addr, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); if (hidx _PTEIDX_SECONDARY) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index f6ce1f111f5b..ac8c0754a4e9 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -745,12 +745,21 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, if (!hpte_slot_array) return; - /* get the base page size */ + /* get the base page size,vsid and segment size */ psize = get_slice_psize(mm, s_addr); + if (!is_kernel_addr(s_addr)) { + ssize = user_segment_size(s_addr); + vsid = get_vsid(mm-context.id, s_addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(s_addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } if (ppc_md.hugepage_invalidate) - return ppc_md.hugepage_invalidate(mm, hpte_slot_array, - s_addr, psize); + return ppc_md.hugepage_invalidate(vsid, s_addr, + hpte_slot_array, + psize, ssize); /* * No bluk hpte removal support, invalidate each entry */ @@ -768,15 +777,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, /* get the vpn */ addr = s_addr + (i * (1ul shift)); - if (!is_kernel_addr(addr)) {
Re: bit fields data tearing
On 07/15/2014 06:54 AM, Peter Hurley wrote: Jonathan Corbet wrote a LWN article about this back in 2012: http://lwn.net/Articles/478657/ I guess it's fixed in gcc 4.8, but too bad there's not a workaround for earlier compilers (akin to -fstrict_volatile_bitfields without requiring the volatile keyword) From the gcc pr, it looks like the patch was backported to 4.7. But we didn't fix it in versions earlier than that. r~ ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: subpage_protect: Increase the array size to take care of 64TB
Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com writes: We now support TASK_SIZE of 16TB, hence the array should be 8. [reviewer's correction: "16TB" here should read "64TB"] Fixes the below crash: -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] ppc/xmon: use isspace/isxdigit/isalnum from linux/ctype.h
On Tue, 2014-07-15 at 13:43 +0200, Vincent Bernat wrote: isxdigit() macro definition is the same. isalnum() from linux/ctype.h will accept additional latin non-ASCII characters. This is harmless since this macro is used in scanhex() which parses user input. isspace() from linux/ctype.h will accept vertical tab and form feed but not NULL. The use of this macro is modified to accept NULL as well. Additional characters are harmless since this macro is also only used in scanhex(). I don't think we care about \0 ... Paul, care to chime in ? After all, you wrote that stuff a century or two ago... :) Cheers, Ben. Signed-off-by: Vincent Bernat vinc...@bernat.im --- arch/powerpc/xmon/xmon.c | 14 ++ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d199bfa2f1fa..55d9b48774b7 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -24,6 +24,7 @@ #include linux/interrupt.h #include linux/irq.h #include linux/bug.h +#include linux/ctype.h #include asm/ptrace.h #include asm/string.h @@ -177,14 +178,6 @@ extern void xmon_leave(void); #define GETWORD(v) (((v)[0] 24) + ((v)[1] 16) + ((v)[2] 8) + (v)[3]) #endif -#define isxdigit(c) (('0' = (c) (c) = '9') \ - || ('a' = (c) (c) = 'f') \ - || ('A' = (c) (c) = 'F')) -#define isalnum(c) (('0' = (c) (c) = '9') \ - || ('a' = (c) (c) = 'z') \ - || ('A' = (c) (c) = 'Z')) -#define isspace(c) (c == ' ' || c == '\t' || c == 10 || c == 13 || c == 0) - static char *help_string = \ Commands:\n\ b show breakpoints\n\ @@ -2121,9 +2114,6 @@ static void dump_pacas(void) } #endif -#define isxdigit(c) (('0' = (c) (c) = '9') \ - || ('a' = (c) (c) = 'f') \ - || ('A' = (c) (c) = 'F')) static void dump(void) { @@ -2526,7 +2516,7 @@ scanhex(unsigned long *vp) int i; for (i=0; i63; i++) { c = inchar(); - if (isspace(c)) { + if (isspace(c) || c == '\0') { termch = c; break; } ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org 
https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 3/3] powerpc/pseries: Switch pseries drivers to use machine_xxx_initcall()
A lot of the code in platforms/pseries is using non-machine initcalls. That means if a kernel built with pseries support runs on another platform, for example powernv, the initcalls will still run. Most of these cases are OK, though sometimes only due to luck. Some were having more effect: * hcall_inst_init - Checking FW_FEATURE_LPAR which is set on ps3 celleb. * mobility_sysfs_init - created sysfs files unconditionally - but no effect due to ENOSYS from rtas_ibm_suspend_me() * apo_pm_init - created sysfs, allows write - nothing checks the value written to though * alloc_dispatch_log_kmem_cache - creating kmem_cache on non-pseries machines Signed-off-by: Michael Ellerman m...@ellerman.id.au --- v2: Add missing includes of machdep.h arch/powerpc/platforms/pseries/dtl.c | 3 ++- arch/powerpc/platforms/pseries/eeh_pseries.c | 8 ++-- arch/powerpc/platforms/pseries/hvCall_inst.c | 3 ++- arch/powerpc/platforms/pseries/mobility.c| 3 ++- arch/powerpc/platforms/pseries/msi.c | 4 ++-- arch/powerpc/platforms/pseries/power.c | 5 +++-- arch/powerpc/platforms/pseries/ras.c | 2 +- arch/powerpc/platforms/pseries/reconfig.c| 5 + arch/powerpc/platforms/pseries/rng.c | 2 +- arch/powerpc/platforms/pseries/setup.c | 2 +- arch/powerpc/platforms/pseries/suspend.c | 5 ++--- 11 files changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 7d61498e45c0..1062f71f5a85 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -29,6 +29,7 @@ #include asm/lppaca.h #include asm/debug.h #include asm/plpar_wrappers.h +#include asm/machdep.h struct dtl { struct dtl_entry*buf; @@ -391,4 +392,4 @@ err_remove_dir: err: return rc; } -arch_initcall(dtl_init); +machine_arch_initcall(pseries, dtl_init); diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 0bec0c02c5e7..476a5d8b0b36 100644 --- 
a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -743,10 +743,7 @@ static struct eeh_ops pseries_eeh_ops = { */ static int __init eeh_pseries_init(void) { - int ret = -EINVAL; - - if (!machine_is(pseries)) - return ret; + int ret; ret = eeh_ops_register(pseries_eeh_ops); if (!ret) @@ -757,5 +754,4 @@ static int __init eeh_pseries_init(void) return ret; } - -early_initcall(eeh_pseries_init); +machine_early_initcall(pseries, eeh_pseries_init); diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index cf4e7736e4f1..4575f0c9e521 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -27,6 +27,7 @@ #include asm/firmware.h #include asm/cputable.h #include asm/trace.h +#include asm/machdep.h DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); @@ -162,4 +163,4 @@ static int __init hcall_inst_init(void) return 0; } -__initcall(hcall_inst_init); +machine_device_initcall(pseries, hcall_inst_init); diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index bde7ebad3949..d146fef038b8 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -18,6 +18,7 @@ #include linux/delay.h #include linux/slab.h +#include asm/machdep.h #include asm/rtas.h #include pseries.h @@ -362,4 +363,4 @@ static int __init mobility_sysfs_init(void) return rc; } -device_initcall(mobility_sysfs_init); +machine_device_initcall(pseries, mobility_sysfs_init); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 0c882e83c4ce..18ff4626d74e 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -16,6 +16,7 @@ #include asm/rtas.h #include asm/hw_irq.h #include asm/ppc-pci.h +#include asm/machdep.h static int query_token, change_token; @@ -532,5 +533,4 @@ 
static int rtas_msi_init(void) return 0; } -arch_initcall(rtas_msi_init); - +machine_arch_initcall(pseries, rtas_msi_init); diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c index 6d6266236446..c26eadde434c 100644 --- a/arch/powerpc/platforms/pseries/power.c +++ b/arch/powerpc/platforms/pseries/power.c @@ -25,6 +25,7 @@ #include linux/string.h #include linux/errno.h #include linux/init.h +#include asm/machdep.h unsigned long rtas_poweron_auto; /* default and normal state is 0 */ @@ -71,11 +72,11 @@ static int __init pm_init(void) return -ENOMEM; return sysfs_create_group(power_kobj, attr_group); } -core_initcall(pm_init); +machine_core_initcall(pseries, pm_init); #else static int __init
Re: [PATCH v2] powerpc/pseries: dynamically added OF nodes need to call of_node_init
On Thu, Jul 10, 2014 at 1:59 PM, Nathan Fontenot nf...@linux.vnet.ibm.com wrote: On 07/10/2014 01:50 PM, Tyrel Datwyler wrote: Commit 75b57ecf9 refactored device tree nodes to use kobjects such that they can be exposed via /sysfs. A secondary commit 0829f6d1f furthered this rework by moving the kobect initialization logic out of of_node_add into its own of_node_init function. The inital commit removed the existing kref_init calls in the pseries dlpar code with the assumption kobject initialization would occur in of_node_add. The second commit had the side effect of triggering a BUG_ON during DLPAR, migration and suspend/resume operations as a result of dynamically added nodes being uninitialized. This patch fixes this by adding of_node_init calls in place of the previously removed kref_init calls. Fixes: 0829f6d1f69e (of: device_node kobject lifecycle fixes) Cc: sta...@vger.kernel.org Signed-off-by: Tyrel Datwyler tyr...@linux.vnet.ibm.com Acked-by: Nathan Fontenot nf...@linux.vnet.ibm.com Acked-by: Grant Likely grant.lik...@linaro.org Ben, are you going to take this or should I take it via my tree? g. 
--- V2: - included stable kernel list on Cc per comment by mpe arch/powerpc/platforms/pseries/dlpar.c| 1 + arch/powerpc/platforms/pseries/reconfig.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 022b38e..2d0b4d6 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -86,6 +86,7 @@ static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa, } of_node_set_flag(dn, OF_DYNAMIC); + of_node_init(dn); return dn; } diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 0435bb6..1c0a60d 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -69,6 +69,7 @@ static int pSeries_reconfig_add_node(const char *path, struct property *proplist np-properties = proplist; of_node_set_flag(np, OF_DYNAMIC); + of_node_init(np); np-parent = derive_parent(path); if (IS_ERR(np-parent)) { ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: OF_DYNAMIC node lifecycle
I've got another question about powerpc reconfiguration. I was looking at the dlpar_configure_connector() function in dlpar.c. I see that the function has the ability to process multiple nodes with additional sibling and child nodes. It appears to link them into a detached tree structure, and the function returns a pointer to the first node. All of the callers of that function then call dlpar_attach_node(), which calls of_attach_node(). However, of_attach_node() only handles a single node. It doesn't handle siblings or children. Is this a bug? Does the configure connector ever actually receive more than one node at once? g. On Fri, Jun 27, 2014 at 8:41 AM, Nathan Fontenot nf...@austin.ibm.com wrote: On 06/27/2014 07:41 AM, Grant Likely wrote: On Thu, 26 Jun 2014 15:01:49 -0500, Nathan Fontenot nf...@austin.ibm.com wrote: On 06/25/2014 03:24 PM, Grant Likely wrote: On Tue, 24 Jun 2014 15:10:55 -0500, Nathan Fontenot nf...@austin.ibm.com wrote: heh! I have often thought about adding reference counting to device tree properties. You horrible, horrible man. Yes. I are evil :) After looking again the work needed to add reference counts to properties would be huge. The few properties I am concerned with are specific to powerpc so perhaps just adding an arch specific lock around updating those properties would work. Which code/properties? I'd like to have a look myself. /ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory The property is updated in arch/powerpc/platforms/pseries/hotplug-memory.c:pseries_update_drconf_memory() Specifically, what do you need for the locking? Are you wanting to hold off additional changes while that function is executing? Pantelis is adding a mutex for device tree writers. Holding that mutex would prevent any changes from happening in the tree without affecting readers. Would that be sufficient? That would work. -Nathan ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 4/6] powerpc/eeh: Replace pr_warning() with pr_warn()
pr_warn() is equal to pr_warning(), but the former is a bit more formal. The patch replaces pr_warning() with pr_warn(). Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/kernel/eeh.c| 16 arch/powerpc/kernel/eeh_cache.c | 7 --- arch/powerpc/kernel/eeh_dev.c| 3 ++- arch/powerpc/kernel/eeh_driver.c | 16 arch/powerpc/kernel/eeh_pe.c | 3 ++- arch/powerpc/platforms/powernv/eeh-ioda.c| 12 ++-- arch/powerpc/platforms/powernv/eeh-powernv.c | 7 --- arch/powerpc/platforms/pseries/eeh_pseries.c | 28 ++-- 8 files changed, 48 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 123c151..dcc2a95 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -334,8 +334,8 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) /* Find the PHB PE */ phb_pe = eeh_phb_pe_get(pe-phb); if (!phb_pe) { - pr_warning(%s Can't find PE for PHB#%d\n, - __func__, pe-phb-global_number); + pr_warn(%s Can't find PE for PHB#%d\n, + __func__, pe-phb-global_number); return -EEXIST; } @@ -784,13 +784,13 @@ void eeh_save_bars(struct eeh_dev *edev) int __init eeh_ops_register(struct eeh_ops *ops) { if (!ops-name) { - pr_warning(%s: Invalid EEH ops name for %p\n, + pr_warn(%s: Invalid EEH ops name for %p\n, __func__, ops); return -EINVAL; } if (eeh_ops eeh_ops != ops) { - pr_warning(%s: EEH ops of platform %s already existing (%s)\n, + pr_warn(%s: EEH ops of platform %s already existing (%s)\n, __func__, eeh_ops-name, ops-name); return -EEXIST; } @@ -810,7 +810,7 @@ int __init eeh_ops_register(struct eeh_ops *ops) int __exit eeh_ops_unregister(const char *name) { if (!name || !strlen(name)) { - pr_warning(%s: Invalid EEH ops name\n, + pr_warn(%s: Invalid EEH ops name\n, __func__); return -EINVAL; } @@ -875,11 +875,11 @@ int eeh_init(void) /* call platform initialization function */ if (!eeh_ops) { - pr_warning(%s: Platform EEH operation not found\n, + pr_warn(%s: Platform EEH operation not found\n, __func__); return 
-EEXIST; } else if ((ret = eeh_ops-init())) { - pr_warning(%s: Failed to call platform init function (%d)\n, + pr_warn(%s: Failed to call platform init function (%d)\n, __func__, ret); return ret; } @@ -920,7 +920,7 @@ int eeh_init(void) if (eeh_enabled()) pr_info(EEH: PCI Enhanced I/O Error Handling Enabled\n); else - pr_warning(EEH: No capable adapters found\n); + pr_warn(EEH: No capable adapters found\n); return ret; } diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 3639bee..07d8a24 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -143,7 +143,7 @@ eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo, } else { if (dev != piar-pcidev || alo != piar-addr_lo || ahi != piar-addr_hi) { - pr_warning(PIAR: overlapping address range\n); + pr_warn(PIAR: overlapping address range\n); } return piar; } @@ -177,13 +177,14 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev *dev) dn = pci_device_to_OF_node(dev); if (!dn) { - pr_warning(PCI: no pci dn found for dev=%s\n, pci_name(dev)); + pr_warn(PCI: no pci dn found for dev=%s\n, + pci_name(dev)); return; } edev = of_node_to_eeh_dev(dn); if (!edev) { - pr_warning(PCI: no EEH dev found for dn=%s\n, + pr_warn(PCI: no EEH dev found for dn=%s\n, dn-full_name); return; } diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index 1efa28f..e5274ee 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -57,7 +57,8 @@ void *eeh_dev_init(struct device_node *dn, void *data) /* Allocate EEH device */ edev = kzalloc(sizeof(*edev), GFP_KERNEL); if (!edev) { - pr_warning(%s: out of memory\n, __func__); + pr_warn(%s: out of memory\n, + __func__); return
[PATCH 1/6] powerpc/eeh: Refactor EEH flag accessors
There are multiple global EEH flags. Almost each flag has its own accessor, which doesn't make sense. The patch refactors EEH flag accessors so that they look unified: eeh_add_flag(): Add EEH flag eeh_clear_flag(): Clear EEH flag eeh_has_flag(): Check if one specific flag has been set eeh_enabled():Check if EEH functionality has been enabled Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/eeh.h | 32 ++-- arch/powerpc/kernel/eeh.c| 20 - arch/powerpc/kernel/eeh_cache.c | 2 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 6 +++--- arch/powerpc/platforms/powernv/pci-ioda.c| 1 - arch/powerpc/platforms/pseries/eeh_pseries.c | 4 ++-- 6 files changed, 27 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 6e47894..ca8aada 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -206,36 +206,28 @@ extern int eeh_subsystem_flags; extern struct eeh_ops *eeh_ops; extern raw_spinlock_t confirm_error_lock; -static inline bool eeh_enabled(void) +static inline void eeh_add_flag(int flag) { - if ((eeh_subsystem_flags EEH_FORCE_DISABLED) || - !(eeh_subsystem_flags EEH_ENABLED)) - return false; - - return true; + eeh_subsystem_flags |= flag; } -static inline void eeh_set_enable(bool mode) +static inline void eeh_clear_flag(int flag) { - if (mode) - eeh_subsystem_flags |= EEH_ENABLED; - else - eeh_subsystem_flags = ~EEH_ENABLED; + eeh_subsystem_flags = ~flag; } -static inline void eeh_probe_mode_set(int flag) +static inline bool eeh_has_flag(int flag) { - eeh_subsystem_flags |= flag; +return !!(eeh_subsystem_flags flag); } -static inline int eeh_probe_mode_devtree(void) +static inline bool eeh_enabled(void) { - return (eeh_subsystem_flags EEH_PROBE_MODE_DEVTREE); -} + if (eeh_has_flag(EEH_FORCE_DISABLED) || + !eeh_has_flag(EEH_ENABLED)) + return false; -static inline int eeh_probe_mode_dev(void) -{ - return (eeh_subsystem_flags EEH_PROBE_MODE_DEV); + return true; 
} static inline void eeh_serialize_lock(unsigned long *flags) @@ -314,8 +306,6 @@ static inline bool eeh_enabled(void) return false; } -static inline void eeh_set_enable(bool mode) { } - static inline int eeh_init(void) { return 0; diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 4de2103..65a163f 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -142,7 +142,7 @@ static struct eeh_stats eeh_stats; static int __init eeh_setup(char *str) { if (!strcmp(str, off)) - eeh_subsystem_flags |= EEH_FORCE_DISABLED; + eeh_add_flag(EEH_FORCE_DISABLED); return 1; } @@ -252,7 +252,7 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity) * 0xFF's is always returned from PCI config space. */ if (!(pe-type EEH_PE_PHB)) { - if (eeh_probe_mode_devtree()) + if (eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); eeh_ops-configure_bridge(pe); eeh_pe_restore_bars(pe); @@ -303,7 +303,7 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) unsigned long flags; int ret; - if (!eeh_probe_mode_dev()) + if (!eeh_has_flag(EEH_PROBE_MODE_DEV)) return -EPERM; /* Find the PHB PE */ @@ -801,7 +801,7 @@ int __exit eeh_ops_unregister(const char *name) static int eeh_reboot_notifier(struct notifier_block *nb, unsigned long action, void *unused) { - eeh_set_enable(false); + eeh_clear_flag(EEH_ENABLED); return NOTIFY_DONE; } @@ -865,13 +865,13 @@ int eeh_init(void) return ret; /* Enable EEH for all adapters */ - if (eeh_probe_mode_devtree()) { + if (eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) { list_for_each_entry_safe(hose, tmp, hose_list, list_node) { phb = hose-dn; traverse_pci_devices(phb, eeh_ops-of_probe, NULL); } - } else if (eeh_probe_mode_dev()) { + } else if (eeh_has_flag(EEH_PROBE_MODE_DEV)) { list_for_each_entry_safe(hose, tmp, hose_list, list_node) pci_walk_bus(hose-bus, eeh_ops-dev_probe, NULL); @@ -923,7 +923,7 @@ void eeh_add_device_early(struct device_node *dn) * would delay the probe until late stage because * 
the PCI device isn't available this moment. */ - if (!eeh_probe_mode_devtree()) + if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) return;
[PATCH 3/6] powerpc/eeh: Reduce lines of log dump
The patch prints 4 PCIE or AER config registers each line, which is part of the EEH log so that it looks a bit more compact. Suggested-by: Benjamin Herrenschmidt b...@kernel.crashing.org Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/kernel/eeh.c | 37 +++-- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index aa33656..123c151 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -157,12 +157,13 @@ __setup(eeh=, eeh_setup); * This routine captures assorted PCI configuration space data, * and puts them into a buffer for RTAS error logging. */ -static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) +static size_t eeh_gather_pci_data(struct eeh_dev *edev, char *buf, size_t len) { struct device_node *dn = eeh_dev_to_of_node(edev); u32 cfg; - int cap, i; + int cap, i, j; int n = 0; + char buffer[128]; n += scnprintf(buf+n, len-n, %s\n, dn-full_name); pr_warn(EEH: of node=%s\n, dn-full_name); @@ -204,10 +205,22 @@ static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) n += scnprintf(buf+n, len-n, pci-e cap10:\n); pr_warn(EEH: PCI-E capabilities and status follow:\n); - for (i=0; i=8; i++) { + for (i=0, j=0; i=8; i++) { eeh_ops-read_config(dn, cap+4*i, 4, cfg); n += scnprintf(buf+n, len-n, %02x:%x\n, 4*i, cfg); - pr_warn(EEH: PCI-E %02x: %08x\n, i, cfg); + + if ((i % 4) == 0) { + memset(buffer, 0, sizeof(buffer)); + j = scnprintf(buffer, sizeof(buffer), + EEH: PCI-E %02x: %08x , + 4*i, cfg); + } else { + j += scnprintf(buffer+j, sizeof(buffer)-j, + %08x , cfg); + } + + if ((i % 4) == 3 || i = 8) + pr_warn(%s\n, buffer); } } @@ -217,10 +230,22 @@ static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) n += scnprintf(buf+n, len-n, pci-e AER:\n); pr_warn(EEH: PCI-E AER capability register set follows:\n); - for (i=0; i14; i++) { + for (i=0, j=0; i=13; i++) { eeh_ops-read_config(dn, cap+4*i, 4, 
cfg); n += scnprintf(buf+n, len-n, %02x:%x\n, 4*i, cfg); - pr_warn(EEH: PCI-E AER %02x: %08x\n, i, cfg); + + if ((i % 4) == 0) { + memset(buffer, 0, sizeof(buffer)); + j = scnprintf(buffer, sizeof(buffer), + EEH: PCI-E AER %02x: %08x , + 4*i, cfg); + } else { + j += scnprintf(buffer+j, sizeof(buffer)-j, + %08x , cfg); + } + + if ((i % 4) == 3 || i = 13) + pr_warn(%s\n, buffer); } } -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 0/6] EEH Cleanup
The patchset is EEH cleanup and expected to be merged during 3.17 window. The the patchset is expected to be applied after: |EEH support for guest |2 more bug fixes for EEH support for guest |M64 related EEH changes |2 bug fixes from Mike Qiu | +- The current patchset Except the following 2 patches, all patches are for cleanup: Refactoring EEH log, replacing pr_warning() with pr_warn(), reducing length of EEH log dump etc: PATCH[2/6]: We have to enable I/O path before collecting EEH log. Otherwise, 0xFF is always returned from PCI config of devices in frozen PE. the problem is only existing on PHB3. PATCH[6/6]: It's something related to EEH guest log retrieval. Currently, all PEs in one specific PHB are sharing diag-data blob for storing EEH log. It's possible for diag-data blob overwritten before being collected by guest. The patch introduce auxillary data for PE, which is maintained by backend. On PowerNV, that's used for EEH log. Gavin Shan (6): powerpc/eeh: Refactor EEH flag accessors powerpc/eeh: Selectively enable IO for error log powerpc/eeh: Reduce lines of log dump powerpc/eeh: Replace pr_warning() with pr_warn() powerpc/eeh: Make diag-data not endian dependent powerpc/eeh: Aux PE data for error log arch/powerpc/include/asm/eeh.h | 43 - arch/powerpc/include/asm/opal.h | 128 +-- arch/powerpc/kernel/eeh.c| 73 ++- arch/powerpc/kernel/eeh_cache.c | 9 +- arch/powerpc/kernel/eeh_dev.c| 3 +- arch/powerpc/kernel/eeh_driver.c | 16 ++-- arch/powerpc/kernel/eeh_pe.c | 29 +- arch/powerpc/platforms/powernv/eeh-ioda.c| 103 + arch/powerpc/platforms/powernv/eeh-powernv.c | 32 +-- arch/powerpc/platforms/powernv/pci-ioda.c| 1 - arch/powerpc/platforms/powernv/pci.c | 68 -- arch/powerpc/platforms/pseries/eeh_pseries.c | 32 +++ 12 files changed, 323 insertions(+), 214 deletions(-) -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 5/6] powerpc/eeh: Make diag-data not endian dependent
It's followup of commit ddf0322a (powerpc/powernv: Fix endianness problems in EEH). The patch helps to get non-endian-dependent diag-data. Cc: Guo Chao y...@linux.vnet.ibm.com Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/opal.h | 128 +++--- arch/powerpc/platforms/powernv/eeh-ioda.c | 51 +++- arch/powerpc/platforms/powernv/pci.c | 68 ++-- 3 files changed, 139 insertions(+), 108 deletions(-) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index edbfe1c..f0b5b40 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -520,40 +520,40 @@ enum { }; struct OpalIoP7IOCErrorData { - uint16_t type; + __be16 type; /* GEM */ - uint64_t gemXfir; - uint64_t gemRfir; - uint64_t gemRirqfir; - uint64_t gemMask; - uint64_t gemRwof; + __be64 gemXfir; + __be64 gemRfir; + __be64 gemRirqfir; + __be64 gemMask; + __be64 gemRwof; /* LEM */ - uint64_t lemFir; - uint64_t lemErrMask; - uint64_t lemAction0; - uint64_t lemAction1; - uint64_t lemWof; + __be64 lemFir; + __be64 lemErrMask; + __be64 lemAction0; + __be64 lemAction1; + __be64 lemWof; union { struct OpalIoP7IOCRgcErrorData { - uint64_t rgcStatus; /* 3E1C10 */ - uint64_t rgcLdcp; /* 3E1C18 */ + __be64 rgcStatus; /* 3E1C10 */ + __be64 rgcLdcp; /* 3E1C18 */ }rgc; struct OpalIoP7IOCBiErrorData { - uint64_t biLdcp0; /* 3C0100, 3C0118 */ - uint64_t biLdcp1; /* 3C0108, 3C0120 */ - uint64_t biLdcp2; /* 3C0110, 3C0128 */ - uint64_t biFenceStatus; /* 3C0130, 3C0130 */ + __be64 biLdcp0; /* 3C0100, 3C0118 */ + __be64 biLdcp1; /* 3C0108, 3C0120 */ + __be64 biLdcp2; /* 3C0110, 3C0128 */ + __be64 biFenceStatus; /* 3C0130, 3C0130 */ - uint8_t biDownbound; /* BI Downbound or Upbound */ + u8 biDownbound; /* BI Downbound or Upbound */ }bi; struct OpalIoP7IOCCiErrorData { - uint64_t ciPortStatus; /* 3Dn008 */ - uint64_t ciPortLdcp;/* 3Dn010 */ + __be64 ciPortStatus;/* 3Dn008 */ + __be64 ciPortLdcp; /* 3Dn010 */ - uint8_t ciPort;/* Index of CI port: 0/1 */ + 
u8 ciPort; /* Index of CI port: 0/1 */ }ci; }; }; @@ -585,60 +585,60 @@ struct OpalIoPhbErrorCommon { struct OpalIoP7IOCPhbErrorData { struct OpalIoPhbErrorCommon common; - uint32_t brdgCtl; + __be32 brdgCtl; // P7IOC utl regs - uint32_t portStatusReg; - uint32_t rootCmplxStatus; - uint32_t busAgentStatus; + __be32 portStatusReg; + __be32 rootCmplxStatus; + __be32 busAgentStatus; // P7IOC cfg regs - uint32_t deviceStatus; - uint32_t slotStatus; - uint32_t linkStatus; - uint32_t devCmdStatus; - uint32_t devSecStatus; + __be32 deviceStatus; + __be32 slotStatus; + __be32 linkStatus; + __be32 devCmdStatus; + __be32 devSecStatus; // cfg AER regs - uint32_t rootErrorStatus; - uint32_t uncorrErrorStatus; - uint32_t corrErrorStatus; - uint32_t tlpHdr1; - uint32_t tlpHdr2; - uint32_t tlpHdr3; - uint32_t tlpHdr4; - uint32_t sourceId; + __be32 rootErrorStatus; + __be32 uncorrErrorStatus; + __be32 corrErrorStatus; + __be32 tlpHdr1; + __be32 tlpHdr2; + __be32 tlpHdr3; + __be32 tlpHdr4; + __be32 sourceId; - uint32_t rsv3; + __be32 rsv3; // Record data about the call to allocate a buffer. - uint64_t errorClass; - uint64_t correlator; + __be64 errorClass; + __be64 correlator; //P7IOC MMIO Error Regs - uint64_t p7iocPlssr;// n120 - uint64_t p7iocCsr; // n110 - uint64_t lemFir;// nC00 - uint64_t lemErrorMask; // nC18 - uint64_t lemWOF;// nC40 - uint64_t phbErrorStatus;// nC80 - uint64_t phbFirstErrorStatus; // nC88 - uint64_t phbErrorLog0; // nCC0 - uint64_t phbErrorLog1; // nCC8 - uint64_t mmioErrorStatus; //
[PATCH 6/6] powerpc/eeh: Aux PE data for error log
The patch allows a PE (struct eeh_pe) instance to have auxiliary data, whose size is configurable on a per-platform basis. For PowerNV, the auxiliary data will be used to cache PHB diag-data for that PE (frozen PE or fenced PHB). In turn, we can retrieve the diag-data at any later point. It's useful for the case of VFIO PCI devices, where the error log should be cached and then retrieved by the guest at a later point. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/eeh.h | 2 ++ arch/powerpc/kernel/eeh_pe.c | 26 - arch/powerpc/platforms/powernv/eeh-ioda.c| 42 +++- arch/powerpc/platforms/powernv/eeh-powernv.c | 3 +- 4 files changed, 58 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 494c3ff..9983c3d 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -88,6 +88,7 @@ struct eeh_pe { int false_positives;/* Times of reported #ff's */ atomic_t pass_dev_cnt; /* Count of passed through devs */ struct eeh_pe *parent; /* Parent PE*/ + void *data; /* PE auxillary data*/ struct list_head child_list;/* Link PE to the child list*/ struct list_head edevs; /* Link list of EEH devices */ struct list_head child; /* Child PEs*/ @@ -248,6 +249,7 @@ static inline void eeh_serialize_unlock(unsigned long flags) #define EEH_MAX_ALLOWED_FREEZES 5 typedef void *(*eeh_traverse_func)(void *data, void *flag); +void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_get(struct eeh_dev *edev); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 77632ab..00e3844 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -32,9 +32,24 @@ #include <asm/pci-bridge.h> #include <asm/ppc-pci.h> +static int eeh_pe_aux_size = 0; static LIST_HEAD(eeh_phb_pe); /** + * eeh_set_pe_aux_size - Set PE auxillary data size + * @size: PE 
auxillary data size + * + * Set PE auxillary data size + */ +void eeh_set_pe_aux_size(int size) +{ + if (size < 0) + return; + + eeh_pe_aux_size = size; +} + +/** * eeh_pe_alloc - Allocate PE * @phb: PCI controller * @type: PE type @@ -44,9 +59,16 @@ static LIST_HEAD(eeh_phb_pe); static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) { struct eeh_pe *pe; + size_t alloc_size; + + alloc_size = sizeof(struct eeh_pe); + if (eeh_pe_aux_size) { + alloc_size = ALIGN(alloc_size, cache_line_size()); + alloc_size += eeh_pe_aux_size; + } /* Allocate PHB PE */ - pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL); + pe = kzalloc(alloc_size, GFP_KERNEL); if (!pe) return NULL; /* Initialize PHB PE */ @@ -56,6 +78,8 @@ static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) INIT_LIST_HEAD(&pe->child); INIT_LIST_HEAD(&pe->edevs); + pe->data = (void *)pe + ALIGN(sizeof(struct eeh_pe), + cache_line_size()); return pe; } diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index bccdf60..b4624cf 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -236,20 +236,16 @@ static int ioda_eeh_set_option(struct eeh_pe *pe, int option) return ret; } -static void ioda_eeh_phb_diag(struct pci_controller *hose) +static void ioda_eeh_phb_diag(struct eeh_pe *pe) { - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pe->phb->private_data; long rc; - rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, + rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data, PNV_PCI_DIAG_BUF_SIZE); - if (rc != OPAL_SUCCESS) { + if (rc != OPAL_SUCCESS) pr_warn("%s: Failed to get diag-data for PHB#%x (%ld)\n", - __func__, hose->global_number, rc); - return; - } - - pnv_pci_dump_phb_diag_data(hose, phb->diag.blob); + __func__, pe->phb->global_number, rc); } static int ioda_eeh_get_phb_state(struct eeh_pe *pe) @@ -282,7 +278,7 @@ 
EEH_STATE_DMA_ENABLED); } else if (!(pe->state & EEH_PE_ISOLATED)) { eeh_pe_state_mark(pe, EEH_PE_ISOLATED); - ioda_eeh_phb_diag(phb->hose); + ioda_eeh_phb_diag(pe); } return result; @@ -380,7 +376,7 @@ static int
[PATCH 2/6] powerpc/eeh: Selectively enable IO for error log
According to the experiment I did, PCI config access is blocked on P7IOC frozen PE by hardware, but PHB3 doesn't do that. That means we always get 0xFF's while dumping PCI config space of the frozen PE on P7IOC. We don't have the problem on PHB3. So we have to enable I/O prior to collecting error log. Otherwise, meaningless 0xFF's are always returned. The patch fixes it by EEH flag (EEH_ENABLE_IO_FOR_LOG), which is selectively set to indicate the case for: P7IOC on PowerNV platform, pSeries platform. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/eeh.h | 9 + arch/powerpc/kernel/eeh.c| 2 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 16 arch/powerpc/platforms/pseries/eeh_pseries.c | 2 +- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index ca8aada..494c3ff 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -34,10 +34,11 @@ struct device_node; #ifdef CONFIG_EEH /* EEH subsystem flags */ -#define EEH_ENABLED0x1 /* EEH enabled */ -#define EEH_FORCE_DISABLED 0x2 /* EEH disabled */ -#define EEH_PROBE_MODE_DEV 0x4 /* From PCI device */ -#define EEH_PROBE_MODE_DEVTREE 0x8 /* From device tree */ +#define EEH_ENABLED0x01/* EEH enabled */ +#define EEH_FORCE_DISABLED 0x02/* EEH disabled */ +#define EEH_PROBE_MODE_DEV 0x04/* From PCI device */ +#define EEH_PROBE_MODE_DEVTREE 0x08/* From device tree */ +#define EEH_ENABLE_IO_FOR_LOG 0x10/* Enable IO for log*/ /* * Delay for PE reset, all in ms diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 65a163f..aa33656 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -252,7 +252,7 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity) * 0xFF's is always returned from PCI config space. 
*/ if (!(pe->type & EEH_PE_PHB)) { - if (eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) + if (eeh_has_flag(EEH_ENABLE_IO_FOR_LOG)) eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); eeh_ops->configure_bridge(pe); eeh_pe_restore_bars(pe); diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index ba134ac..740c396 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -45,6 +45,9 @@ */ static int powernv_eeh_init(void) { + struct pci_controller *hose; + struct pnv_phb *phb; + /* We require OPALv3 */ if (!firmware_has_feature(FW_FEATURE_OPALv3)) { pr_warning("%s: OPALv3 is required !\n", __func__); @@ -54,6 +57,19 @@ static int powernv_eeh_init(void) /* Set probe mode */ eeh_add_flag(EEH_PROBE_MODE_DEV); + /* +* P7IOC blocks PCI config access to frozen PE, but PHB3 +* doesn't do that. So we have to selectively enable I/O +* prior to collecting error log. +*/ + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->model == PNV_PHB_MODEL_P7IOC) + eeh_add_flag(EEH_ENABLE_IO_FOR_LOG); + break; + } + return 0; } diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index f173705..1e15cdd 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -128,7 +128,7 @@ static int pseries_eeh_init(void) } /* Set EEH probe mode */ - eeh_add_flag(EEH_PROBE_MODE_DEVTREE); + eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); return 0; } -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev