Re: [PATCH] powerpc/powernv: Remove PCI_MSI ifdef checks
On Wed, 14 Nov 2018 at 16:50, Oliver O'Halloran wrote: > > CONFIG_PCI_MSI was made mandatory by commit a311e738b6d8 > ("powerpc/powernv: Make PCI non-optional") so the #ifdef > checks around CONFIG_PCI_MSI here can be removed entirely. > > Signed-off-by: Oliver O'Halloran Reviewed-by: Joel Stanley Can we do the same to the CONFIG_PCI_MSI in arch/powerpc/sysdev/xics/ics-opal.c ? Cheers, Joel
[PATCH] powerpc/powernv: Remove PCI_MSI ifdef checks
CONFIG_PCI_MSI was made mandatory by commit a311e738b6d8 ("powerpc/powernv: Make PCI non-optional") so the #ifdef checks around CONFIG_PCI_MSI here can be removed entirely. Signed-off-by: Oliver O'Halloran --- arch/powerpc/platforms/powernv/pci-ioda.c | 13 - arch/powerpc/platforms/powernv/pci.c | 2 -- arch/powerpc/platforms/powernv/pci.h | 2 -- 3 files changed, 17 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index cde710297a4e..90c5ea1e16f3 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -664,10 +664,6 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) return state; } -/* Currently those 2 are only used when MSIs are enabled, this will change - * but in the meantime, we need to protect them to avoid warnings - */ -#ifdef CONFIG_PCI_MSI struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); @@ -680,7 +676,6 @@ struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) return NULL; return >ioda.pe_array[pdn->pe_number]; } -#endif /* CONFIG_PCI_MSI */ static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, struct pnv_ioda_pe *parent, @@ -2837,7 +2832,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pnv_ioda_setup_bus_dma(pe, pe->pbus, true); } -#ifdef CONFIG_PCI_MSI int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) { struct pnv_phb *phb = container_of(chip, struct pnv_phb, @@ -2983,9 +2977,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); } -#else -static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } -#endif /* CONFIG_PCI_MSI */ #ifdef CONFIG_PCI_IOV static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) @@ -3699,10 +3690,8 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) static const struct 
pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, .dma_bus_setup = pnv_pci_dma_bus_setup, -#ifdef CONFIG_PCI_MSI .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, -#endif .enable_device_hook = pnv_pci_enable_device_hook, .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, @@ -3723,10 +3712,8 @@ static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, -#ifdef CONFIG_PCI_MSI .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, -#endif .enable_device_hook = pnv_pci_enable_device_hook, .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus= pnv_pci_reset_secondary_bus, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 13aef2323bbc..583fb2e64893 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -160,7 +160,6 @@ int pnv_pci_set_power_state(uint64_t id, uint8_t state, struct opal_msg *msg) } EXPORT_SYMBOL_GPL(pnv_pci_set_power_state); -#ifdef CONFIG_PCI_MSI int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); @@ -229,7 +228,6 @@ void pnv_teardown_msi_irqs(struct pci_dev *pdev) msi_bitmap_free_hwirqs(>msi_bmp, hwirq - phb->msi_base, 1); } } -#endif /* CONFIG_PCI_MSI */ /* Nicely print the contents of the PE State Tables (PEST). 
*/ static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 8b37b28e3831..99ac2bd0cf51 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -106,11 +106,9 @@ struct pnv_phb { struct dentry *dbgfs; #endif -#ifdef CONFIG_PCI_MSI unsigned intmsi_base; unsigned intmsi32_support; struct msi_bitmap msi_bmp; -#endif int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, unsigned int hwirq, unsigned int virq, unsigned int is_64, struct msi_msg *msg); -- 2.17.2
Re: [PATCH v2] PCI/MSI: Don't touch MSI bits when the PCI device is disconnected
On Tue, Nov 13, 2018 at 10:39:15PM +, alex_gagn...@dellteam.com wrote: > On 11/12/2018 11:02 PM, Bjorn Helgaas wrote: > > > > [EXTERNAL EMAIL] > > Please report any suspicious attachments, links, or requests for sensitive > > information. It looks like Dell's email system adds the above in such a way that the email quoting convention suggests that *I* wrote it, when I did not. > ... > > Do you think Linux observes the rule about not touching AER bits on > > FFS? I'm not sure it does. I'm not even sure what section of the > > spec is relevant. > > I haven't found any place where linux breaks this rule. I'm very > confident that, unless otherwise instructed, we follow this rule. Just to make sure we're on the same page, can you point me to this rule? I do see that OSPM must request control of AER using _OSC before it touches the AER registers. What I don't see is the connection between firmware-first and the AER registers. The closest I can find is the "Enabled" field in the HEST PCIe AER structures (ACPI v6.2, sec 18.3.2.4, .5, .6), where it says: If the field value is 1, indicates this error source is to be enabled. If the field value is 0, indicates that the error source is not to be enabled. If FIRMWARE_FIRST is set in the flags field, the Enabled field is ignored by the OSPM. AFAICT, Linux completely ignores the Enabled field in these structures. These structures also contain values the OS is apparently supposed to write to Device Control and several AER registers (in struct acpi_hest_aer_common). Linux ignores these as well. These seem like fairly serious omissions in Linux. > > The whole issue of firmware-first, the mechanism by which firmware > > gets control, the System Error enables in Root Port Root Control > > registers, etc., is very murky to me. Jon has a sort of similar issue > > with VMD where he needs to leave System Errors enabled instead of > > disabling them as we currently do. 
> > Well, OS gets control via _OSC method, and based on that it should > touch/not touch the AER bits. I agree so far. > The bits that get set/cleared come from _HPX method, _HPX tells us about some AER registers, Device Control, Link Control, and some bridge registers. It doesn't say anything about the Root Control register that Jon is concerned with. For firmware-first to work, firmware has to get control. How does it get control? How does OSPM know to either set up that mechanism or keep its mitts off something firmware set up before handoff? In Jon's VMD case, I think firmware-first relies on the System Error controlled by the Root Control register. Linux thinks it owns that, and I don't know how to learn otherwise. > and there's a more about the FFS described in ACPI spec. It > seems that if platform, wants to enable VMD, it should pass the correct > bits via _HPX. I'm curious to know in what new twisted way FFS doesn't > work as intended. Bjorn
Re: [PATCH kernel v3 07/22] powerpc/powernv/npu: Move OPAL calls away from context manipulation
> - /* > - * Setup the NPU context table for a particular GPU. These need to be > - * per-GPU as we need the tables to filter ATSDs when there are no > - * active contexts on a particular GPU. It is safe for these to be > - * called concurrently with destroy as the OPAL call takes appropriate > - * locks and refcounts on init/destroy. > - */ > - rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags, > - PCI_DEVID(gpdev->bus->number, gpdev->devfn)); > - if (rc < 0) > - return ERR_PTR(-ENOSPC); > - This will prevent any drivers from setting up contexts with different MSR values (which is what the flags argument is for) than a standard userspace context (MSR_DR | MSR_PR | MSR_HV). In practice this currently never happens and I'm unsure if that's ever likely to change. We should at least return an error if flags != (MSR_DR | MSR_PR | MSR_HV). > /* >* We store the npu pci device so we can more easily get at the >* associated npus. > @@ -755,9 +738,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev > *gpdev, if (npu_context->release_cb != cb || > npu_context->priv != priv) { > spin_unlock(_context_lock); > - opal_npu_destroy_context(nphb->opal_id, mm->context.id, > - PCI_DEVID(gpdev->bus->number, > - gpdev->devfn)); > return ERR_PTR(-EINVAL); > } > > @@ -783,9 +763,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev > *gpdev, > > if (rc) { > kfree(npu_context); > - opal_npu_destroy_context(nphb->opal_id, mm->context.id, > - PCI_DEVID(gpdev->bus->number, > - gpdev->devfn)); > return ERR_PTR(rc); > } > > @@ -838,7 +815,6 @@ void pnv_npu2_destroy_context(struct npu_context > *npu_context, struct pci_dev *gpdev) > { > int removed; > - struct pnv_phb *nphb; > struct npu *npu; > struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); > struct device_node *nvlink_dn; > @@ -847,10 +823,6 @@ void pnv_npu2_destroy_context(struct npu_context > *npu_context, if (WARN_ON(!npdev)) > return; > > - if (!firmware_has_feature(FW_FEATURE_OPAL)) > - return; > - 
> - nphb = pci_bus_to_host(npdev->bus)->private_data; > npu = npdev_to_npu(npdev); > if (!npu) > return; > @@ -859,8 +831,6 @@ void pnv_npu2_destroy_context(struct npu_context > *npu_context, _index))) > return; > WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL); > - opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id, > - PCI_DEVID(gpdev->bus->number, gpdev->devfn)); > spin_lock(_context_lock); > removed = kref_put(_context->kref, pnv_npu2_release_context); > spin_unlock(_context_lock); > @@ -892,9 +862,6 @@ int pnv_npu2_handle_fault(struct npu_context *context, > uintptr_t *ea, /* mmap_sem should be held so the struct_mm must be present > */ > struct mm_struct *mm = context->mm; > > - if (!firmware_has_feature(FW_FEATURE_OPAL)) > - return -ENODEV; > - > WARN_ON(!rwsem_is_locked(>mmap_sem)); > > for (i = 0; i < count; i++) { > @@ -923,15 +890,11 @@ int pnv_npu2_handle_fault(struct npu_context *context, > uintptr_t *ea, } > EXPORT_SYMBOL(pnv_npu2_handle_fault); > > -int pnv_npu2_init(struct pnv_phb *phb) > +int pnv_npu2_init(struct pci_controller *hose) > { > unsigned int i; > u64 mmio_atsd; > - struct device_node *dn; > - struct pci_dev *gpdev; > static int npu_index; > - uint64_t rc = 0; > - struct pci_controller *hose = phb->hose; > struct npu *npu; > int ret; > > @@ -940,18 +903,6 @@ int pnv_npu2_init(struct pnv_phb *phb) > return -ENOMEM; > > npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush"); > - for_each_child_of_node(phb->hose->dn, dn) { > - gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); > - if (gpdev) { > - rc = opal_npu_map_lpar(phb->opal_id, > - PCI_DEVID(gpdev->bus->number, gpdev->devfn), > - 0, 0); > - if (rc) > - dev_err(>dev, > - "Error %lld mapping device to LPAR\n", > - rc); > - } > - } > > for (i = 0; !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", > i, _atsd); i++) > @@ -981,3 +932,57 @@ int pnv_npu2_init(struct pnv_phb *phb) > > return
Re: linux-next: build warnings from Linus' tree
Hello Alan, On Tue, 12 Jun 2018 at 07:44, Stephen Rothwell wrote: > Building Linus' tree, today's linux-next build (powerpc ppc64_defconfig) > produced these warning: > > ld: warning: orphan section `.gnu.hash' from `linker stubs' being placed in > section `.gnu.hash'. > ld: warning: orphan section `.gnu.hash' from `linker stubs' being placed in > section `.gnu.hash'. > ld: warning: orphan section `.gnu.hash' from `linker stubs' being placed in > section `.gnu.hash'. > > This may just be because I have started building using the native Debian > gcc for the powerpc builds ... Do you know why we started creating these? If it's intentional, should we be including them in the same way as .hash sections? https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/kernel/vmlinux.lds.S#n282 .hash : AT(ADDR(.hash) - LOAD_OFFSET) { *(.hash) } Cheers, Joel
Re: [PATCH kernel v3 06/22] powerpc/powernv: Detach npu struct from pnv_phb
Hi Alexey, On Tuesday, 13 November 2018 7:28:07 PM AEDT Alexey Kardashevskiy wrote: > static struct npu *npdev_to_npu(struct pci_dev *npdev) > { > - struct pnv_phb *nphb; > + struct pci_controller *hose = pci_bus_to_host(npdev->bus); > + struct npu *npu; > > - nphb = pci_bus_to_host(npdev->bus)->private_data; > + list_for_each_entry(npu, _devices, next) This is called from the ATSD path which is (or at least has been) quite a performance critical path so searching through all the NPUs in a list may be problematic. I guess currently it won't make any practical difference as we only ever have 2 NPUs, but in future they may get divided into more logical NPUs. Would it be possible to store a back-pointer somewhere so we can avoid the lookup? > + if (hose == npu->hose) > + return npu; > > - return >npu; > + WARN_ON_ONCE(1); > + return NULL; > } > > /* Maximum number of nvlinks per npu */ > @@ -505,6 +531,9 @@ static void acquire_atsd_reg(struct npu_context > *npu_context, continue; > > npu = npdev_to_npu(npdev); > + if (!npu) > + continue; > + > mmio_atsd_reg[i].npu = npu; > mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); > while (mmio_atsd_reg[i].reg < 0) { > @@ -701,6 +730,8 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev > *gpdev, > > nphb = pci_bus_to_host(npdev->bus)->private_data; > npu = npdev_to_npu(npdev); > + if (!npu) > + return ERR_PTR(-ENODEV); > > /* >* Setup the NPU context table for a particular GPU. 
These need to be > @@ -821,6 +852,8 @@ void pnv_npu2_destroy_context(struct npu_context > *npu_context, > > nphb = pci_bus_to_host(npdev->bus)->private_data; > npu = npdev_to_npu(npdev); > + if (!npu) > + return; > nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); > if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", > _index))) > @@ -898,9 +931,15 @@ int pnv_npu2_init(struct pnv_phb *phb) > struct pci_dev *gpdev; > static int npu_index; > uint64_t rc = 0; > + struct pci_controller *hose = phb->hose; > + struct npu *npu; > + int ret; > > - phb->npu.nmmu_flush = > - of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); > + npu = kzalloc(sizeof(*npu), GFP_KERNEL); > + if (!npu) > + return -ENOMEM; > + > + npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush"); > for_each_child_of_node(phb->hose->dn, dn) { > gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); > if (gpdev) { > @@ -914,18 +953,31 @@ int pnv_npu2_init(struct pnv_phb *phb) > } > } > > - for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd", > + for (i = 0; !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", > i, _atsd); i++) > - phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32); > + npu->mmio_atsd_regs[i] = ioremap(mmio_atsd, 32); > > - pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i); > - phb->npu.mmio_atsd_count = i; > - phb->npu.mmio_atsd_usage = 0; > + pr_info("NPU%d: Found %d MMIO ATSD registers", hose->global_number, i); > + npu->mmio_atsd_count = i; > + npu->mmio_atsd_usage = 0; > npu_index++; > - if (WARN_ON(npu_index >= NV_MAX_NPUS)) > - return -ENOSPC; > + if (WARN_ON(npu_index >= NV_MAX_NPUS)) { > + ret = -ENOSPC; > + goto fail_exit; > + } > max_npu2_index = npu_index; > - phb->npu.index = npu_index; > + npu->index = npu_index; > + npu->hose = hose; > + > + list_add(>next, _devices); Guess we don't need any locking here as the list gets setup once during boot long before loading the driver and is never modified 
right? - Alistair > return 0; > + > +fail_exit: > + for (i = 0; i < npu->mmio_atsd_count; ++i) > + iounmap(npu->mmio_atsd_regs[i]); > + > + kfree(npu); > + > + return ret; > }
Re: [PATCH] powerpc/32: Include .branch_lt in data section
On Wed, Nov 14, 2018 at 01:32:18PM +1030, Joel Stanley wrote: > When building a 32 bit powerpc kernel with Binutils 2.31.1 this warning > is emitted: > > powerpc-linux-gnu-ld: warning: orphan section `.branch_lt' from > `arch/powerpc/kernel/head_44x.o' being placed in section `.branch_lt' > > As of binutils commit 2d7ad24e8726 ("Support PLT16 relocs against local > symbols")[1], 32 bit targets can produce .branch_lt sections in their > output. > > Include these symbols in the .data section as the ppc64 kernel does. > > [1] > https://sourceware.org/git/gitweb.cgi?p=binutils-gdb.git;a=commitdiff;h=2d7ad24e8726ba4c45c9e67be08223a146a837ce > Signed-off-by: Joel Stanley Reviewed-by: Alan Modra Looks fine to me. > --- > I wasn't sure where this should go or if the ordering matters. The usual answer is: "Look at where the section goes in the standard linker scripts." But that doesn't apply here. The section will be empty for a kernel build so it doesn't matter where it goes. > --- > arch/powerpc/kernel/vmlinux.lds.S | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/powerpc/kernel/vmlinux.lds.S > b/arch/powerpc/kernel/vmlinux.lds.S > index 434581bcd5b4..6d5fd1b95311 100644 > --- a/arch/powerpc/kernel/vmlinux.lds.S > +++ b/arch/powerpc/kernel/vmlinux.lds.S > @@ -313,6 +313,7 @@ SECTIONS > *(.sdata2) > *(.got.plt) *(.got) > *(.plt) > + *(.branch_lt) > } > #else > .data : AT(ADDR(.data) - LOAD_OFFSET) { > -- > 2.19.1 -- Alan Modra Australia Development Lab, IBM
Re: [PATCH kernel v3 05/22] powerpc/powernv/npu: Add helper to access struct npu for NPU device
Reviewed-by: Alistair Popple On Tuesday, 13 November 2018 7:28:06 PM AEDT Alexey Kardashevskiy wrote: > This step is to help removing the npu struct from pnv_phb so it > can be used by pseries as well. > > Signed-off-by: Alexey Kardashevskiy > Reviewed-by: David Gibson > --- > arch/powerpc/platforms/powernv/npu-dma.c | 22 -- > 1 file changed, 16 insertions(+), 6 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c > b/arch/powerpc/platforms/powernv/npu-dma.c index 91d488f..9f48831 100644 > --- a/arch/powerpc/platforms/powernv/npu-dma.c > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > @@ -327,6 +327,18 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct > pnv_ioda_pe *npe) return gpe; > } > > +/* > + * NPU2 ATS > + */ > +static struct npu *npdev_to_npu(struct pci_dev *npdev) > +{ > + struct pnv_phb *nphb; > + > + nphb = pci_bus_to_host(npdev->bus)->private_data; > + > + return >npu; > +} > + > /* Maximum number of nvlinks per npu */ > #define NV_MAX_LINKS 6 > > @@ -478,7 +490,6 @@ static void acquire_atsd_reg(struct npu_context > *npu_context, int i, j; > struct npu *npu; > struct pci_dev *npdev; > - struct pnv_phb *nphb; > > for (i = 0; i <= max_npu2_index; i++) { > mmio_atsd_reg[i].reg = -1; > @@ -493,8 +504,7 @@ static void acquire_atsd_reg(struct npu_context > *npu_context, if (!npdev) > continue; > > - nphb = pci_bus_to_host(npdev->bus)->private_data; > - npu = >npu; > + npu = npdev_to_npu(npdev); > mmio_atsd_reg[i].npu = npu; > mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); > while (mmio_atsd_reg[i].reg < 0) { > @@ -690,7 +700,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev > *gpdev, } > > nphb = pci_bus_to_host(npdev->bus)->private_data; > - npu = >npu; > + npu = npdev_to_npu(npdev); > > /* >* Setup the NPU context table for a particular GPU. 
These need to be > @@ -764,7 +774,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev > *gpdev, */ > WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev); > > - if (!nphb->npu.nmmu_flush) { > + if (!npu->nmmu_flush) { > /* >* If we're not explicitly flushing ourselves we need to mark >* the thread for global flushes > @@ -810,7 +820,7 @@ void pnv_npu2_destroy_context(struct npu_context > *npu_context, return; > > nphb = pci_bus_to_host(npdev->bus)->private_data; > - npu = >npu; > + npu = npdev_to_npu(npdev); > nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); > if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", > _index)))
Unable to handle kernel paging request for data at address 0xc000000001da0000
Running some container workloads on an IBM power9 server with the latest mainline (rc2) triggered this, [ 1283.894167] Unable to handle kernel paging request for data at address 0xc1da [ 1283.894215] Faulting instruction address: 0xc0487ab8 [ 1283.894223] Oops: Kernel access of bad area, sig: 11 [#1] [ 1283.894227] LE SMP NR_CPUS=2048 DEBUG_PAGEALLOC NUMA pSeries [ 1283.894235] Modules linked in: nf_conntrack_netlink xt_mark tun udp_diag tcp_diag inet_diag netlink_diag af_packet_diag unix_diag nft_chain_nat_ipv6 nf_nat_ipv6 xt_conntrack ipt_MASQUERADE nft_counter xt_comment nft_compat nft_chain_nat_ipv4 nf_nat_ipv4 nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables veth bridge stp llc nfnetlink overlay sg pseries_rng xfs libcrc32c sd_mod ibmvscsi ibmveth scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod [ 1283.894275] CPU: 0 PID: 226 Comm: kmemleak Kdump: loaded Not tainted 4.20.0-rc2+ #1 [ 1283.894281] NIP: c0487ab8 LR: c0487ab0 CTR: [ 1283.894287] REGS: c005e94f3970 TRAP: 0300 Not tainted (4.20.0-rc2+) [ 1283.894291] MSR: 80010280b033 CR: 24000822 XER: [ 1283.894302] CFAR: c0487a04 DAR: c1da DSISR: 4000 IRQMASK: 1 GPR00: c0487ab0 c005e94f3bf8 c1c93400 GPR04: c005d9380a90 0001 e7d83172 c005d9380a90 GPR08: 78811199 c005f1665ac8 c005d9380080 fce6b398 GPR12: c2de c01717b8 c005efa49ab8 GPR16: GPR20: c1b85220 GPR24: c0046a1d8848 c0046a1d1b88 GPR28: c1da0ff9 c0f3b620 c1b85108 c1da [ 1283.894352] NIP [c0487ab8] scan_block+0xa8/0x190 [ 1283.894357] LR [c0487ab0] scan_block+0xa0/0x190 [ 1283.894361] Call Trace: [ 1283.894365] [c005e94f3bf8] [c0487ab0] scan_block+0xa0/0x190 (unreliable) [ 1283.894372] [c005e94f3c58] [c0487c00] scan_large_block+0x60/0xa0 [ 1283.894378] [c005e94f3c98] [c048857c] kmemleak_scan+0x24c/0xa10 [ 1283.894384] [c005e94f3d48] [c0489960] kmemleak_scan_thread+0xec/0x12c [ 1283.894391] [c005e94f3db8] [c017196c] kthread+0x1bc/0x1d0 [ 1283.894397] [c005e94f3e28] [c000b860] ret_from_kernel_thread+0x5c/0x7c [ 1283.894402] Instruction dump: [ 
1283.894408] 7fa3eb78 4883847d 6000 6000 6000 6000 3bff0008 7fbcf840 [ 1283.894416] 409d00b8 4bfffeed 2fa3 409e00ac e93e0108 7fa91840 419dffdc [ 1283.894425] ---[ end trace 8e9576ac10ae13a1 ]--- [ 1283.897274] [ 1283.897288] Sending IPI to other CPUs [ 1285.021354] IPI complete [ 1285.042568] kexec: Starting switchover sequence.
Re: [PATCH] KVM: PPC: Book3S HV: fix handling for interrupted H_ENTER_NESTED
On Thu, 2018-11-08 at 21:27 -0600, Michael Roth wrote: > While running a nested guest VCPU on L0 via H_ENTER_NESTED hcall, a > pending signal in the L0 QEMU process can generate the following > sequence: > > ret0 = kvmppc_pseries_do_hcall() > ret1 = kvmhv_enter_nested_guest() > ret2 = kvmhv_run_single_vcpu() > if (ret2 == -EINTR) > return H_INTERRUPT > if (ret1 == H_INTERRUPT) > kvmppc_set_gpr(vcpu, 3, 0) > return -EINTR > /* skipped: */ > kvmppc_set_gpr(vcpu, 3, ret) > vcpu->arch.hcall_needed = 0 > return RESUME_GUEST > > which causes an exit to L0 userspace with ret0 == -EINTR. > > The intention seems to be to set the hcall return value to 0 (via > VCPU r3) so that L1 will see a successful return from H_ENTER_NESTED > once we resume executing the VCPU. However, because we don't set > vcpu->arch.hcall_needed = 0, we do the following once userspace > resumes execution via kvm_arch_vcpu_ioctl_run(): > > ... > } else if (vcpu->arch.hcall_needed) { > int i > > kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret); > for (i = 0; i < 9; ++i) >kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]); > vcpu->arch.hcall_needed = 0; > > since vcpu->arch.hcall_needed == 1 indicates that userspace should > have handled the hcall and stored the return value in > run->papr_hcall.ret. 
Since that's not the case here, we can get an > unexpected value in VCPU r3, which can result in > kvmhv_p9_guest_entry() reporting an unexpected trap value when it > returns from H_ENTER_NESTED, causing the following register dump to > console via subsequent call to kvmppc_handle_exit_hv() in L1: > > [ 350.612854] vcpu f9564cf8 (0): > [ 350.612915] pc = c013eb98 msr = 80009033 trap > = 1 > [ 350.613020] r 0 = c04b9044 r16 = > [ 350.613075] r 1 = c0007cffba30 r17 = > [ 350.613120] r 2 = c178c100 r18 = 7fffc24f3b50 > [ 350.613166] r 3 = c0007ef52480 r19 = 7fffc24fff58 > [ 350.613212] r 4 = r20 = 0a1e96ece9d0 > [ 350.613253] r 5 = 70616d00746f6f72 r21 = 0a1ea117c9b0 > [ 350.613295] r 6 = 0020 r22 = 0a1ea1184360 > [ 350.613338] r 7 = c000783be440 r23 = 0003 > [ 350.613380] r 8 = fffc r24 = 0a1e96e9e124 > [ 350.613423] r 9 = c0007ef52490 r25 = 07ff > [ 350.613469] r10 = 0004 r26 = c0007eb2f7a0 > [ 350.613513] r11 = b0616d0009eccdb2 r27 = c0007cffbb10 > [ 350.613556] r12 = c04b9000 r28 = c0007d83a2c0 > [ 350.613597] r13 = c1b0 r29 = c000783cdf68 > [ 350.613639] r14 = r30 = > [ 350.613681] r15 = r31 = c0007cffbbf0 > [ 350.613723] ctr = c04b9000 lr = c04b9044 > [ 350.613765] srr0 = 772f954dd48c srr1 = 8280f033 > [ 350.613808] sprg0 = sprg1 = c1b0 > [ 350.613859] sprg2 = 772f9565a280 sprg3 = > [ 350.613911] cr = 88002848 xer = 2004 dsisr = > 4200 > [ 350.613962] dar = 772f9539 > [ 350.614031] fault dar = c00244b278c0 dsisr = > [ 350.614073] SLB (0 entries): > [ 350.614157] lpcr = 004003d40413 sdr1 = > last_inst = > [ 350.614252] trap=0x1 | pc=0xc013eb98 | > msr=0x80009033 > > followed by L1's QEMU reporting the following before stopping > execution > of the nested guest: > > KVM: unknown exit, hardware reason 1 > NIP c013eb98 LR c04b9044 CTR c04b9000 XER > 2004 CPU#0 > MSR 80009033 HID0 HF 8000 > iidx 3 didx 3 > TB DECR > GPR00 c04b9044 c0007cffba30 c178c100 > c0007ef52480 > GPR04 70616d00746f6f72 0020 > c000783be440 > GPR08 fffc c0007ef52490 0004 > b0616d0009eccdb2 > 
GPR12 c04b9000 c1b0 > > GPR16 7fffc24f3b50 > 7fffc24fff58 > GPR20 0a1e96ece9d0 0a1ea117c9b0 0a1ea1184360 > 0003 > GPR24 0a1e96e9e124 07ff c0007eb2f7a0 > c0007cffbb10 > GPR28 c0007d83a2c0 c000783cdf68 > c0007cffbbf0 > CR 88002848 [ L L - - E L G L ] RES > >SRR0 772f954dd48c SRR1 8280f033PVR > 004e1202 VRSAVE > SPRG0 SPRG1 c1b0 SPRG2 > 772f9565a280 SPRG3 > SPRG4 SPRG5 SPRG6 > SPRG7 > HSRR0 HSRR1 >CFAR >LPCR
[PATCH] powerpc/32: Include .branch_lt in data section
When building a 32 bit powerpc kernel with Binutils 2.31.1 this warning is emitted: powerpc-linux-gnu-ld: warning: orphan section `.branch_lt' from `arch/powerpc/kernel/head_44x.o' being placed in section `.branch_lt' As of binutils commit 2d7ad24e8726 ("Support PLT16 relocs against local symbols")[1], 32 bit targets can produce .branch_lt sections in their output. Include these symbols in the .data section as the ppc64 kernel does. [1] https://sourceware.org/git/gitweb.cgi?p=binutils-gdb.git;a=commitdiff;h=2d7ad24e8726ba4c45c9e67be08223a146a837ce Signed-off-by: Joel Stanley --- I wasn't sure where this should go or if the ordering matters. --- arch/powerpc/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 434581bcd5b4..6d5fd1b95311 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -313,6 +313,7 @@ SECTIONS *(.sdata2) *(.got.plt) *(.got) *(.plt) + *(.branch_lt) } #else .data : AT(ADDR(.data) - LOAD_OFFSET) { -- 2.19.1
[PATCH v2] powerpc/math-emu: Update macros from gmp-6.1.2
The add_ssaaaa, sub_ddmmss, umul_ppmm and udiv_qrnnd macros originate from GMP's longlong.h. This was found when compiling with clang: arch/powerpc/math-emu/fnmsub.c:46:2: error: invalid use of a cast in a inline asm context requiring an l-value: remove the cast or build with -fheinous-gnu-extensions FP_ADD_D(R, T, B); ^ ... ./arch/powerpc/include/asm/sfp-machine.h:283:27: note: expanded from macro 'sub_ddmmss' : "=r" ((USItype)(sh)), \ ~~^~~ Segher points out: this was fixed in GCC over 16 years ago ( https://gcc.gnu.org/r56600 ), and in GMP (where it comes from) presumably before that. Update to the latest version in order to get rid of the invalid casts. The only functional change I noticed was this in udiv_qrnnd. __r1 = (n1) % __d1; __q1 = (n1) / __d1; Becomes this: __q1 = (n1) / __d1; __r1 = (n1) - __q1 * __d1; This is equivalent: instead of calculating the remainder using modulo, it uses the result of the integer division to subtract the 'whole' multiples of d1 from n1. Link: https://github.com/ClangBuiltLinux/linux/issues/260 Signed-off-by: Joel Stanley --- v1: https://lore.kernel.org/linuxppc-dev/20181102033713.31916-1-j...@jms.id.au/ v2: Instead of setting the -fheinous-gnu-extensions for clang, fix the code. arch/powerpc/include/asm/sfp-machine.h | 99 +- 1 file changed, 32 insertions(+), 67 deletions(-) diff --git a/arch/powerpc/include/asm/sfp-machine.h b/arch/powerpc/include/asm/sfp-machine.h index d89beaba26ff..a6353d9dd5ba 100644 --- a/arch/powerpc/include/asm/sfp-machine.h +++ b/arch/powerpc/include/asm/sfp-machine.h @@ -213,30 +213,18 @@ * respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow * (i.e. carry out) is not stored anywhere, and is lost. 
*/ -#define add_ss(sh, sl, ah, al, bh, bl) \ +#define add_ss(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (bh) && (bh) == 0) \ - __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ -: "=r" ((USItype)(sh)),\ - "=" ((USItype)(sl))\ -: "%r" ((USItype)(ah)),\ - "%r" ((USItype)(al)),\ - "rI" ((USItype)(bl))); \ -else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0) \ - __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ -: "=r" ((USItype)(sh)),\ - "=" ((USItype)(sl))\ -: "%r" ((USItype)(ah)),\ - "%r" ((USItype)(al)),\ - "rI" ((USItype)(bl))); \ + __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ +: "=r" (sh), "=" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \ +else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ + __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ +: "=r" (sh), "=" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \ else \ - __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ -: "=r" ((USItype)(sh)),\ - "=" ((USItype)(sl))\ -: "%r" ((USItype)(ah)),\ - "r" ((USItype)(bh)), \ - "%r" ((USItype)(al)),\ - "rI" ((USItype)(bl))); \ + __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ +: "=r" (sh), "=" (sl)\ +: "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \ } while (0) /* sub_ddmmss is used in op-2.h and udivmodti4.c and should be equivalent to @@ -248,44 +236,24 @@ * and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere, * and is lost. */ -#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (ah) && (ah) == 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ - : "=r" ((USItype)(sh)), \ -"="
Re: It looks like that wild_bctr on powerpc/fixes is still not compiling
Hi Gromero ;) Gustavo Romero writes: > Hi mpe, > > Even after the latest fix for the wild_bctr selftest I'm still getting the > following compilation (actually, an assembling error) because UL is not > understood by the assembler: > > BUILD_TARGET=/home/gromero/git/linux/tools/testing/selftests/powerpc/mm; > mkdir -p $BUILD_TARGET; make OUTPUT=$BUILD_TARGET -k -C mm all > make[1]: Entering directory > '/home/gromero/git/linux/tools/testing/selftests/powerpc/mm' > gcc -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"v4.20-rc1-8-g2c7645b"' > -I/home/gromero/git/linux/tools/testing/selftests/powerpc/include -m64 > wild_bctr.c ../harness.c -o > /home/gromero/git/linux/tools/testing/selftests/powerpc/mm/wild_bctr > /tmp/cctUajlx.s: Assembler messages: > /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' > /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' ... > /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' > /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' > ../../lib.mk:152: recipe for target > '/home/gromero/git/linux/tools/testing/selftests/powerpc/mm/wild_bctr' failed > make[1]: *** > [/home/gromero/git/linux/tools/testing/selftests/powerpc/mm/wild_bctr] Error 1 > make[1]: Target 'all' not remade because of errors. > make[1]: Leaving directory > '/home/gromero/git/linux/tools/testing/selftests/powerpc/mm' > Makefile:39: recipe for target 'mm' failed > make: *** [mm] Error 2 > > For: > git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git > (powerpc/fixes) > $ git describe > v4.20-rc1-8-g2c7645b > > This is gcc: > $ gcc --version > gcc (Ubuntu/IBM 5.4.0-6ubuntu1~16.04.10) 5.4.0 20160609 > > but it should not make a difference, so I'm wondering if anybody else is > getting the same error too... It seems to make a difference :) I've been building with Ubuntu 7.3.0-27ubuntu1~18.04 and it builds just fine. It looks like binutils 2.27 doesn't accept ULL but binutils 2.28 does. 
Ah yep, here: https://sourceware.org/git/gitweb.cgi?p=binutils-gdb.git;a=commit;h=86b80085c889cd388fa677a5ae9053fd4be3776c > The following trivial workaround can solve it by forcing a type promotion on > the compiler side whilst leaving the macro taken into the asm code without > the UL string: > > diff --git a/tools/testing/selftests/powerpc/mm/wild_bctr.c > b/tools/testing/selftests/powerpc/mm/wild_bctr.c > index 90469a9..d2772f4 100644 > --- a/tools/testing/selftests/powerpc/mm/wild_bctr.c > +++ b/tools/testing/selftests/powerpc/mm/wild_bctr.c > @@ -47,8 +47,9 @@ static int ok(void) > return 0; > } > > -#define REG_POISON 0x5a5aUL > -#define POISONED_REG(n)((REG_POISON << 48) | ((n) << 32) | > (REG_POISON << 16) | (n)) > +#define REG_POISON 0x5a5a > +#define POISONED_REG(n)(((REG_POISON+0UL) << 48) | ((n) << 32) | > ((REG_POISON+0UL) << 16) | (n)) > > static inline void poison_regs(void) > { > > > Should I contribute such a fix? Yes thanks. cheers
Re: [PATCH v2] PCI/MSI: Don't touch MSI bits when the PCI device is disconnected
On 11/13/2018 04:56 PM, Keith Busch wrote: > On Tue, Nov 13, 2018 at 10:39:15PM +, alex_gagn...@dellteam.com wrote: >> On 11/12/2018 11:02 PM, Bjorn Helgaas wrote: >>> The whole issue of firmware-first, the mechanism by which firmware >>> gets control, the System Error enables in Root Port Root Control >>> registers, etc., is very murky to me. Jon has a sort of similar issue >>> with VMD where he needs to leave System Errors enabled instead of >>> disabling them as we currently do. >> >> Well, OS gets control via _OSC method, and based on that it should >> touch/not touch the AER bits. The bits that get set/cleared come from >> _HPX method, and there's a more about the FFS described in ACPI spec. It >> seems that if platform, wants to enable VMD, it should pass the correct >> bits via _HPX. I'm curious to know in what new twisted way FFS doesn't >> work as intended. > > When VMD is enabled, the platform sees a VMD endpoint. It doesn't see > any of the root ports on that domain, so ACPI can't provide policies for > them nor AER registers for the platform to consider controlling. I'm not understanding the interdependency between RP AER settings and VMD. My understanding of VMD is quite rudimentary though, so I'll take your word for it. Alex
Re: [PATCH v2 1/2] Makefile: Export clang toolchain variables
On Mon, Nov 12, 2018 at 1:21 PM Joel Stanley wrote: > > The powerpc makefile will use these in its boot wrapper. > > Signed-off-by: Joel Stanley > --- Applied to linux-kbuild. Thanks! > Makefile | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/Makefile b/Makefile > index 09278330282d..840efe6eb54c 100644 > --- a/Makefile > +++ b/Makefile > @@ -495,6 +495,7 @@ endif > ifneq ($(GCC_TOOLCHAIN),) > CLANG_FLAGS+= --gcc-toolchain=$(GCC_TOOLCHAIN) > endif > +export CLANG_FLAGS > CLANG_FLAGS+= -no-integrated-as > KBUILD_CFLAGS += $(CLANG_FLAGS) > KBUILD_AFLAGS += $(CLANG_FLAGS) > -- > 2.19.1 > -- Best Regards Masahiro Yamada
Re: [PATCH v2 2/2] powerpc/boot: Set target when cross-compiling for clang
On Tue, Nov 13, 2018 at 4:03 AM Nick Desaulniers wrote: > > On Sun, Nov 11, 2018 at 8:21 PM Joel Stanley wrote: > > > > Clang needs to be told which target it is building for when cross > > compiling. > > Neat! This will help us for continuous integration. > https://github.com/ClangBuiltLinux/continuous-integration > > > > > Link: https://github.com/ClangBuiltLinux/linux/issues/259 > > Signed-off-by: Joel Stanley > > Assuming patch 1/2 gets fixed up (or is fine) and doesn't radically change: > Reviewed-by: Nick Desaulniers > Applied to linux-kbuild. Thanks! > > --- > > arch/powerpc/boot/Makefile | 5 + > > 1 file changed, 5 insertions(+) > > > > diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile > > index 39354365f54a..111f97b1ccec 100644 > > --- a/arch/powerpc/boot/Makefile > > +++ b/arch/powerpc/boot/Makefile > > @@ -55,6 +55,11 @@ BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAGS) > > -traditional -nostdinc > > > > BOOTARFLAGS:= -cr$(KBUILD_ARFLAGS) > > > > +ifdef CONFIG_CC_IS_CLANG > > +BOOTCFLAGS += $(CLANG_FLAGS) > > +BOOTAFLAGS += $(CLANG_FLAGS) > > +endif > > + > > ifdef CONFIG_DEBUG_INFO > > BOOTCFLAGS += -g > > endif > > -- > > 2.19.1 > > > > > -- > Thanks, > ~Nick Desaulniers -- Best Regards Masahiro Yamada
Re: [PATCH v2 2/2] kbuild: consolidate Clang compiler flags
On Tue, Nov 6, 2018 at 12:06 PM Masahiro Yamada wrote: > > Collect basic Clang options such as --target, --prefix, --gcc-toolchain, > -no-integrated-as into a single variable CLANG_FLAGS so that it can be > easily reused in other parts of Makefile. > > Signed-off-by: Masahiro Yamada > --- Applied to linux-kbuild. > Changes in v2: > - Use := flavor instead of = because $(CLANG_FLAGS) is expanded soon anyway > > Makefile | 13 ++--- > 1 file changed, 6 insertions(+), 7 deletions(-) > > diff --git a/Makefile b/Makefile > index da11700..e173a73 100644 > --- a/Makefile > +++ b/Makefile > @@ -487,18 +487,17 @@ endif > > ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),) > ifneq ($(CROSS_COMPILE),) > -CLANG_TARGET := --target=$(notdir $(CROSS_COMPILE:%-=%)) > +CLANG_FLAGS:= --target=$(notdir $(CROSS_COMPILE:%-=%)) > GCC_TOOLCHAIN_DIR := $(dir $(shell which $(LD))) > -CLANG_PREFIX := --prefix=$(GCC_TOOLCHAIN_DIR) > +CLANG_FLAGS+= --prefix=$(GCC_TOOLCHAIN_DIR) > GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..) > endif > ifneq ($(GCC_TOOLCHAIN),) > -CLANG_GCC_TC := --gcc-toolchain=$(GCC_TOOLCHAIN) > +CLANG_FLAGS+= --gcc-toolchain=$(GCC_TOOLCHAIN) > endif > -KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) $(CLANG_PREFIX) > -KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) $(CLANG_PREFIX) > -KBUILD_CFLAGS += -no-integrated-as > -KBUILD_AFLAGS += -no-integrated-as > +CLANG_FLAGS+= -no-integrated-as > +KBUILD_CFLAGS += $(CLANG_FLAGS) > +KBUILD_AFLAGS += $(CLANG_FLAGS) > endif > > RETPOLINE_CFLAGS_GCC := -mindirect-branch=thunk-extern > -mindirect-branch-register > -- > 2.7.4 > -- Best Regards Masahiro Yamada
Re: [PATCH v2 1/2] kbuild: add -no-integrated-as Clang option unconditionally
On Tue, Nov 6, 2018 at 12:06 PM Masahiro Yamada wrote: > > We are still a way off the Clang's integrated assembler support for > the kernel. Hence, -no-integrated-as is mandatory to build the kernel > with Clang. If you had an ancient version of Clang that does not > recognize this option, you would not be able to compile the kernel > anyway. > > Signed-off-by: Masahiro Yamada > --- Applied to linux-kbuild. > > Changes in v2: > - New patch > > Makefile | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/Makefile b/Makefile > index 93315eb..da11700 100644 > --- a/Makefile > +++ b/Makefile > @@ -497,8 +497,8 @@ CLANG_GCC_TC:= --gcc-toolchain=$(GCC_TOOLCHAIN) > endif > KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) $(CLANG_PREFIX) > KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) $(CLANG_PREFIX) > -KBUILD_CFLAGS += $(call cc-option, -no-integrated-as) > -KBUILD_AFLAGS += $(call cc-option, -no-integrated-as) > +KBUILD_CFLAGS += -no-integrated-as > +KBUILD_AFLAGS += -no-integrated-as > endif > > RETPOLINE_CFLAGS_GCC := -mindirect-branch=thunk-extern > -mindirect-branch-register > -- > 2.7.4 > -- Best Regards Masahiro Yamada
Re: [PATCH 05/17] mips: Remove support for BZIP2 and LZMA compressed kernel
On Tue, Nov 13, 2018 at 10:45:54PM +, Paul Burton wrote: > On Fri, Nov 09, 2018 at 08:02:52PM +0100, Adam Borowski wrote: > > @@ -122,7 +104,6 @@ $(obj)/vmlinux.its.S: $(addprefix > > $(srctree)/arch/mips/$(PLATFORM)/,$(ITS_INPUTS > > > > targets += vmlinux.its > > targets += vmlinux.gz.its > > -targets += vmlinux.bz2.its > > targets += vmlinux.lzmo.its > > targets += vmlinux.lzo.its > > It looks to me like this "vmlinux.lzmo.its" was a typo & ought to have > been vmlinux.lzma.its, and thus ought to be removed. Good catch! The whole series was bz2 only at first, grepping for lzma missed this. > Apart from that I'm fine with this in general: > > Acked-by: Paul Burton Thanks. Meow! -- ⢀⣴⠾⠻⢶⣦⠀ ⣾⠁⢰⠒⠀⣿⡁ “This is gonna be as easy as cheating on an ethics exam!” ⢿⡄⠘⠷⠚⠋⠀ -Cerise Brightmoon ⠈⠳⣄
Re: [PATCH v2] PCI/MSI: Don't touch MSI bits when the PCI device is disconnected
On Tue, Nov 13, 2018 at 10:39:15PM +, alex_gagn...@dellteam.com wrote: > On 11/12/2018 11:02 PM, Bjorn Helgaas wrote: > > The whole issue of firmware-first, the mechanism by which firmware > > gets control, the System Error enables in Root Port Root Control > > registers, etc., is very murky to me. Jon has a sort of similar issue > > with VMD where he needs to leave System Errors enabled instead of > > disabling them as we currently do. > > Well, OS gets control via _OSC method, and based on that it should > touch/not touch the AER bits. The bits that get set/cleared come from > _HPX method, and there's a more about the FFS described in ACPI spec. It > seems that if platform, wants to enable VMD, it should pass the correct > bits via _HPX. I'm curious to know in what new twisted way FFS doesn't > work as intended. When VMD is enabled, the platform sees a VMD endpoint. It doesn't see any of the root ports on that domain, so ACPI can't provide policies for them nor AER registers for the platform to consider controlling.
Re: [PATCH 05/17] mips: Remove support for BZIP2 and LZMA compressed kernel
Hi Adam, On Fri, Nov 09, 2018 at 08:02:52PM +0100, Adam Borowski wrote: > @@ -122,7 +104,6 @@ $(obj)/vmlinux.its.S: $(addprefix > $(srctree)/arch/mips/$(PLATFORM)/,$(ITS_INPUTS > > targets += vmlinux.its > targets += vmlinux.gz.its > -targets += vmlinux.bz2.its > targets += vmlinux.lzmo.its > targets += vmlinux.lzo.its It looks to me like this "vmlinux.lzmo.its" was a typo & ought to have been vmlinux.lzma.its, and thus ought to be removed. Apart from that I'm fine with this in general: Acked-by: Paul Burton Thanks, Paul
Re: [PATCH v2] PCI/MSI: Don't touch MSI bits when the PCI device is disconnected
On 11/12/2018 11:02 PM, Bjorn Helgaas wrote: > > [EXTERNAL EMAIL] > Please report any suspicious attachments, links, or requests for sensitive > information. > > > [+cc Jon, for related VMD firmware-first error enable issue] > > On Mon, Nov 12, 2018 at 08:05:41PM +, alex_gagn...@dellteam.com wrote: >> On 11/11/2018 11:50 PM, Oliver O'Halloran wrote: >>> On Thu, 2018-11-08 at 23:06 +, alex_gagn...@dellteam.com wrote: > But it's not the firmware that crashes. It's linux as a result of a fatal error message from the firmware. And we can't fix that because FFS handling requires that the system reboots [1]. >>> >>> Do we know the exact circumsances that result in firmware requesting a >>> reboot? If it happen on any PCIe error I don't see what we can do to >>> prevent that beyond masking UEs entirely (are we even allowed to do >>> that on FFS systems?). >> >> Pull a drive out at an angle, push two drives in at the same time, pull >> out a drive really slow. If an error is even reported to the OS depends >> on PD state, and proprietary mechanisms and logic in the HW and FW. OS >> is not supposed to mask errors (touch AER bits) on FFS. > > PD? Presence Detect > Do you think Linux observes the rule about not touching AER bits on > FFS? I'm not sure it does. I'm not even sure what section of the > spec is relevant. I haven't found any place where linux breaks this rule. I'm very confident that, unless otherwise instructed, we follow this rule. > The whole issue of firmware-first, the mechanism by which firmware > gets control, the System Error enables in Root Port Root Control > registers, etc., is very murky to me. Jon has a sort of similar issue > with VMD where he needs to leave System Errors enabled instead of > disabling them as we currently do. Well, OS gets control via _OSC method, and based on that it should touch/not touch the AER bits. The bits that get set/cleared come from _HPX method, and there's a more about the FFS described in ACPI spec. 
It seems that if platform, wants to enable VMD, it should pass the correct bits via _HPX. I'm curious to know in what new twisted way FFS doesn't work as intended. Alex > Bjorn > > [1] > https://lore.kernel.org/linux-pci/20181029210651.gb13...@bhelgaas-glaptop.roam.corp.google.com >
It looks like that wild_bctr on powerpc/fixes is still not compiling
Hi mpe, Even after the latest fix for the wild_bctr selftest I'm still getting the following compilation (actually, an assembling error) because UL is not understood by the assembler: BUILD_TARGET=/home/gromero/git/linux/tools/testing/selftests/powerpc/mm; mkdir -p $BUILD_TARGET; make OUTPUT=$BUILD_TARGET -k -C mm all make[1]: Entering directory '/home/gromero/git/linux/tools/testing/selftests/powerpc/mm' gcc -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"v4.20-rc1-8-g2c7645b"' -I/home/gromero/git/linux/tools/testing/selftests/powerpc/include -m64wild_bctr.c ../harness.c -o /home/gromero/git/linux/tools/testing/selftests/powerpc/mm/wild_bctr /tmp/cctUajlx.s: Assembler messages: /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: 
`UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' 
/tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' /tmp/cctUajlx.s:270: Error: syntax error; found `U', expected `,' /tmp/cctUajlx.s:270: Error: junk at end of line: `UL' ../../lib.mk:152: recipe for target '/home/gromero/git/linux/tools/testing/selftests/powerpc/mm/wild_bctr' failed make[1]: *** [/home/gromero/git/linux/tools/testing/selftests/powerpc/mm/wild_bctr] Error 1 make[1]: Target 'all' not remade because of errors. make[1]: Leaving directory '/home/gromero/git/linux/tools/testing/selftests/powerpc/mm' Makefile:39: recipe for target 'mm' failed make: *** [mm] Error 2 For: git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git (powerpc/fixes) $ git describe v4.20-rc1-8-g2c7645b This is gcc: $ gcc --version gcc (Ubuntu/IBM 5.4.0-6ubuntu1~16.04.10) 5.4.0 20160609 but it should not make a difference, so I'm wondering if anybody else is
[PATCH v2 2/2] dpaa_eth: add ethtool coalesce control
Allow ethtool control of the DPAA QMan portal interrupt coalescing settings. Signed-off-by: Madalin Bucur --- drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 49 ++ 1 file changed, 49 insertions(+) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c index 13d6e2272ece..4df366b05976 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c @@ -529,6 +529,53 @@ static int dpaa_get_ts_info(struct net_device *net_dev, return 0; } +static int dpaa_get_coalesce(struct net_device *dev, +struct ethtool_coalesce *c) +{ + struct qman_portal *portal; + u32 period; + u8 thresh; + + portal = qman_get_affine_portal(smp_processor_id()); + qman_portal_get_iperiod(portal, &period); + qman_dqrr_get_ithresh(portal, &thresh); + + c->rx_coalesce_usecs = period; + c->rx_max_coalesced_frames = thresh; + c->use_adaptive_rx_coalesce = false; + + return 0; +} + +static int dpaa_set_coalesce(struct net_device *dev, +struct ethtool_coalesce *c) +{ + const cpumask_t *cpus = qman_affine_cpus(); + struct qman_portal *portal; + u32 period; + u8 thresh; + int cpu; + int res; + + if (c->use_adaptive_rx_coalesce) + return -EINVAL; + + period = c->rx_coalesce_usecs; + thresh = c->rx_max_coalesced_frames; + + for_each_cpu(cpu, cpus) { + portal = qman_get_affine_portal(cpu); + res = qman_portal_set_iperiod(portal, period); + if (res) + return res; + res = qman_dqrr_set_ithresh(portal, thresh); + if (res) + return res; + } + + return 0; +} + const struct ethtool_ops dpaa_ethtool_ops = { .get_drvinfo = dpaa_get_drvinfo, .get_msglevel = dpaa_get_msglevel, @@ -545,4 +592,6 @@ const struct ethtool_ops dpaa_ethtool_ops = { .get_rxnfc = dpaa_get_rxnfc, .set_rxnfc = dpaa_set_rxnfc, .get_ts_info = dpaa_get_ts_info, + .get_coalesce = dpaa_get_coalesce, + .set_coalesce = dpaa_set_coalesce, }; -- 2.1.0
[PATCH v2 1/2] soc/qman: add return value to interrupt coalesce changing APIs
Check that the values received by the portal interrupt coalesce change APIs are in range. Signed-off-by: Madalin Bucur Signed-off-by: Roy Pledge --- drivers/soc/fsl/qbman/qman.c | 33 ++--- include/soc/fsl/qman.h | 8 ++-- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c index 5ce24718c2fd..5b9de224193c 100644 --- a/drivers/soc/fsl/qbman/qman.c +++ b/drivers/soc/fsl/qbman/qman.c @@ -36,6 +36,8 @@ #define MAX_IRQNAME16 /* big enough for "QMan portal %d" */ #define QMAN_POLL_LIMIT 32 #define QMAN_PIRQ_DQRR_ITHRESH 12 +#define QMAN_DQRR_IT_MAX 15 +#define QMAN_ITP_MAX 0xFFF #define QMAN_PIRQ_MR_ITHRESH 4 #define QMAN_PIRQ_IPERIOD 100 @@ -727,9 +729,15 @@ static inline void qm_dqrr_vdqcr_set(struct qm_portal *portal, u32 vdqcr) qm_out(portal, QM_REG_DQRR_VDQCR, vdqcr); } -static inline void qm_dqrr_set_ithresh(struct qm_portal *portal, u8 ithresh) +static inline int qm_dqrr_set_ithresh(struct qm_portal *portal, u8 ithresh) { + + if (ithresh > QMAN_DQRR_IT_MAX) + return -EINVAL; + qm_out(portal, QM_REG_DQRR_ITR, ithresh); + + return 0; } /* --- MR API --- */ @@ -1012,13 +1020,20 @@ static inline void put_affine_portal(void) static struct workqueue_struct *qm_portal_wq; -void qman_dqrr_set_ithresh(struct qman_portal *portal, u8 ithresh) +int qman_dqrr_set_ithresh(struct qman_portal *portal, u8 ithresh) { + int res; + if (!portal) - return; + return -EINVAL; + + res = qm_dqrr_set_ithresh(>p, ithresh); + if (res) + return res; - qm_dqrr_set_ithresh(>p, ithresh); portal->p.dqrr.ithresh = ithresh; + + return 0; } EXPORT_SYMBOL(qman_dqrr_set_ithresh); @@ -1036,10 +1051,14 @@ void qman_portal_get_iperiod(struct qman_portal *portal, u32 *iperiod) } EXPORT_SYMBOL(qman_portal_get_iperiod); -void qman_portal_set_iperiod(struct qman_portal *portal, u32 iperiod) +int qman_portal_set_iperiod(struct qman_portal *portal, u32 iperiod) { - if (portal) - qm_out(>p, QM_REG_ITPR, iperiod); + if (!portal || iperiod > 
QMAN_ITP_MAX) + return -EINVAL; + + qm_out(>p, QM_REG_ITPR, iperiod); + + return 0; } EXPORT_SYMBOL(qman_portal_set_iperiod); diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h index 56877660d5ba..5cc7af06c1ba 100644 --- a/include/soc/fsl/qman.h +++ b/include/soc/fsl/qman.h @@ -1205,8 +1205,10 @@ void qman_dqrr_get_ithresh(struct qman_portal *portal, u8 *ithresh); * qman_dqrr_set_ithresh - Set coalesce interrupt threshold * @portal: portal to set the new value on * @ithresh: new threshold value + * + * Returns 0 on success, or a negative error code. */ -void qman_dqrr_set_ithresh(struct qman_portal *portal, u8 ithresh); +int qman_dqrr_set_ithresh(struct qman_portal *portal, u8 ithresh); /** * qman_dqrr_get_iperiod - Get coalesce interrupt period @@ -1219,7 +1221,9 @@ void qman_portal_get_iperiod(struct qman_portal *portal, u32 *iperiod); * qman_dqrr_set_iperiod - Set coalesce interrupt period * @portal: portal to set the new value on * @ithresh: new period value + * + * Returns 0 on success, or a negative error code. */ -void qman_portal_set_iperiod(struct qman_portal *portal, u32 iperiod); +int qman_portal_set_iperiod(struct qman_portal *portal, u32 iperiod); #endif /* __FSL_QMAN_H */ -- 2.1.0
[PATCH v2 0/2] dpaa_eth: add ethtool coalesce control
Add control of the DPAA portal interrupt coalescing settings from ethtool. changes from v1: added range checking for the QMan APIs Madalin Bucur (2): soc/qman: add return value to interrupt coalesce changing APIs dpaa_eth: add ethtool coalesce control drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 49 ++ drivers/soc/fsl/qbman/qman.c | 33 +++ include/soc/fsl/qman.h | 8 +++- 3 files changed, 81 insertions(+), 9 deletions(-) -- 2.1.0
Re: [PATCH -next] powerpc64/ftrace: Drop pointless static qualifier in is_b_op()
YueHaibing wrote: There is no need to have the 'int offset' variable static since a new value is always assigned before it is used. Signed-off-by: YueHaibing --- arch/powerpc/kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 4bf051d..65248d4 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -107,7 +107,7 @@ static int is_b_op(unsigned int op) static unsigned long find_bl_target(unsigned long ip, unsigned int op) { - static int offset; + int offset; That seems to go all the way back to the original commit by Steven. Good catch. Reviewed-by: Naveen N. Rao - Naveen
[PATCH -next] powerpc64/ftrace: Drop pointless static qualifier in is_b_op()
There is no need to have the 'int offset' variable static since a new value is always assigned before it is used. Signed-off-by: YueHaibing --- arch/powerpc/kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 4bf051d..65248d4 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -107,7 +107,7 @@ static int is_b_op(unsigned int op) static unsigned long find_bl_target(unsigned long ip, unsigned int op) { - static int offset; + int offset; offset = (op & 0x03fc); /* make it signed */
Re: [v2] selftests/powerpc: Fix wild_bctr test to work on ppc64
On Mon, 2018-11-12 at 02:46:06 UTC, Michael Ellerman wrote: > The selftest I recently added to test branching to an out-of-bounds > NIP doesn't work on 64-bit big endian. It does fail but not in the > right way. That is it SEGVs trying to load from the opd at BAD_NIP, > but it never gets as far as branching to BAD_NIP. > > To fix it we need to create an opd which is reachable but which holds > the bad address. > > Fixes: b7683fc66eba ("selftests/powerpc: Add a test of wild bctr") > Signed-off-by: Michael Ellerman Applied to powerpc fixes. https://git.kernel.org/powerpc/c/2c7645b0f7d1014f2636393de7906c cheers
Re: powerpc/mm/64s: Fix preempt warning in slb_allocate_kernel()
On Thu, 2018-11-01 at 05:21:05 UTC, Michael Ellerman wrote: > With preempt enabled we see warnings in do_slb_fault(): > > BUG: using smp_processor_id() in preemptible [] code: > kworker/u33:0/98 > futex hash table entries: 4096 (order: 3, 524288 bytes) > caller is do_slb_fault+0x204/0x230 > CPU: 5 PID: 98 Comm: kworker/u33:0 Not tainted > 4.19.0-rc3-gcc-7.3.1-00022-g1936f094e164 #138 > Call Trace: > dump_stack+0xb4/0x104 (unreliable) > check_preemption_disabled+0x148/0x150 > do_slb_fault+0x204/0x230 > data_access_slb_common+0x138/0x180 > > This is caused by the get_paca() in slb_allocate_kernel(), which > includes a call to debug_smp_processor_id(). > > slb_allocate_kernel() can only be called from do_slb_fault(), and in > that path interrupts are hard disabled and so we can't be preempted, > but we can't update the preempt flags (in thread_info) because that > could cause an SLB fault. > > So just use local_paca which is safe and doesn't cause the warning. > > Fixes: 48e7b7695745 ("powerpc/64s/hash: Convert SLB miss handlers to C") > Signed-off-by: Michael Ellerman Applied to powerpc fixes. https://git.kernel.org/powerpc/c/c8b00bb742dd036388f37d019dbb9d cheers
Re: powerpc/io: Fix the IO workarounds code to work with Radix
On Wed, 2018-11-07 at 05:38:53 UTC, Michael Ellerman wrote: > Back in 2006 Ben added some workarounds for a misbehaviour in the > Spider IO bridge used on early Cell machines, see commit > 014da7ff47b5 ("[POWERPC] Cell "Spider" MMIO workarounds"). Later these > were made to be generic, ie. not tied specifically to Spider. > > The code stashes a token in the high bits (59-48) of virtual addresses > used for IO (eg. returned from ioremap()). This works fine when using > the Hash MMU, but when we're using the Radix MMU the bits used for the > token overlap with some of the bits of the virtual address. > > This is because the maximum virtual address is larger with Radix, up > to c00f, and in fact we use that high part of the address > range for ioremap(), see RADIX_KERN_IO_START. > > As it happens the bits that are used overlap with the bits that > differentiate an IO address vs a linear map address. If the resulting > address lies outside the linear mapping we will crash (see below), if > not we just corrupt memory. > > virtio-pci :00:00.0: Using 64-bit direct DMA at offset 800 > Unable to handle kernel paging request for data at address > 0xc0008014 > ... > CFAR: c0626b98 DAR: c0008014 DSISR: 4200 IRQMASK: 0 > GPR00: c06c54fc c0003e523378 c16de600 > GPR04: c00c8014 0007 0fff000a 0030 > > ... 
> NIP [c0626c5c] .iowrite8+0xec/0x100 > LR [c06c992c] .vp_reset+0x2c/0x90 > Call Trace: > .pci_bus_read_config_dword+0xc4/0x120 (unreliable) > .register_virtio_device+0x13c/0x1c0 > .virtio_pci_probe+0x148/0x1f0 > .local_pci_probe+0x68/0x140 > .pci_device_probe+0x164/0x220 > .really_probe+0x274/0x3b0 > .driver_probe_device+0x80/0x170 > .__driver_attach+0x14c/0x150 > .bus_for_each_dev+0xb8/0x130 > .driver_attach+0x34/0x50 > .bus_add_driver+0x178/0x2f0 > .driver_register+0x90/0x1a0 > .__pci_register_driver+0x6c/0x90 > .virtio_pci_driver_init+0x2c/0x40 > .do_one_initcall+0x64/0x280 > .kernel_init_freeable+0x36c/0x474 > .kernel_init+0x24/0x160 > .ret_from_kernel_thread+0x58/0x7c > > This hasn't been a problem because CONFIG_PPC_IO_WORKAROUNDS which > enables this code is usually not enabled. It is only enabled when it's > selected by PPC_CELL_NATIVE which is only selected by > PPC_IBM_CELL_BLADE and that in turn depends on BIG_ENDIAN. So in order > to hit the bug you need to build a big endian kernel, with IBM Cell > Blade support enabled, as well as Radix MMU support, and then boot > that on Power9 using Radix MMU. > > Still we can fix the bug, so let's do that. We simply use fewer bits > for the token, taking the union of the restrictions on the address > from both Hash and Radix, we end up with 8 bits we can use for the > token. The only user of the token is iowa_mem_find_bus() which only > supports 8 token values, so 8 bits is plenty for that. > > Fixes: 566ca99af026 ("powerpc/mm/radix: Add dummy radix_enabled()") > Signed-off-by: Michael Ellerman Applied to powerpc fixes. https://git.kernel.org/powerpc/c/43c6494fa1499912c8177e71450c02 cheers
Re: [PATCH] powerpc: Add KVM guest defconfig
Satheesh Rajendran writes: > On Mon, Nov 12, 2018 at 11:24:08PM +1100, Michael Ellerman wrote: >> Satheesh Rajendran writes: >> >> > On Thu, Nov 08, 2018 at 04:23:07PM -0200, Breno Leitao wrote: >> >> hi Satheesh, >> >> >> >> On 11/08/2018 03:08 AM, sathn...@linux.vnet.ibm.com wrote: >> >> > --- /dev/null >> >> > +++ b/arch/powerpc/configs/guest.config >> >> > @@ -0,0 +1,14 @@ >> >> > +CONFIG_VIRTIO_BLK=y >> >> > +CONFIG_VIRTIO_BLK_SCSI=y >> >> > +CONFIG_SCSI_VIRTIO=y >> >> > +CONFIG_VIRTIO_NET=y >> >> > +CONFIG_NET_FAILOVER=y >> >> > +CONFIG_VIRTIO_CONSOLE=y >> >> > +CONFIG_VIRTIO=y >> >> > +CONFIG_VIRTIO_PCI=y >> >> > +CONFIG_KVM_GUEST=y >> >> > +CONFIG_EPAPR_PARAVIRT=y >> >> > +CONFIG_XFS_FS=y >> >> >> >> Why a guest kernel needs to have XFS integrated in the core image? I am >> >> wondering if it is a requirement from another CONFIG_ option. >> > >> > Idea is to have a working config which would boot guest without initramfs, >> > other FS(like EXT4) is already integrated in the core image, >> > thought this would be helpful for distributions, which default XFS as root >> > disk. >> >> Maybe we should switch XFS_FS to Y in ppc64_defconfig ? > > Sure, makes sense, will send it for ppc64_defconfig instead. > Inaddition, Have few more symbols to be enabled for cgroups, > memhotplug,numa balancing. > I guess these symbols can also go to ppc64_defconfig itself?. > > i.e, > > CONFIG_CGROUP_SCHED=y > CONFIG_MEMCG=y > CONFIG_CGROUP_SCHED=y > CONFIG_CGROUP_FREEZER=y > CONFIG_CGROUP_DEVICE=y > CONFIG_CGROUP_CPUACCT=y > CONFIG_CGROUP_PERF=y > CONFIG_MEMORY_HOTPLUG=y > CONFIG_MEMORY_HOTREMOVE=y > CONFIG_NUMA_BALANCING=y Yeah I don't see why not, they're probably more or less required by modern distros. cheers
Re: [PATCH 1/2] um: remove -fno-unit-at-a-time workaround for pre-4.0 GCC
Am Montag, 12. November 2018, 03:35:19 CET schrieb Masahiro Yamada: > Commit cafa0010cd51 ("Raise the minimum required gcc version to 4.6") > bumped the minimum GCC version to 4.6 for all architectures. > > '$(call cc-option,-fno-unit-at-a-time)' is now dead code since > '$(cc-version) -lt 0400' is always false. > > Signed-off-by: Masahiro Yamada > --- > > arch/x86/Makefile.um | 8 ++-- > 1 file changed, 2 insertions(+), 6 deletions(-) > > diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um > index 91085a0..577976b 100644 > --- a/arch/x86/Makefile.um > +++ b/arch/x86/Makefile.um > @@ -26,12 +26,8 @@ cflags-y += $(call cc-option,-mpreferred-stack-boundary=2) > # an unresolved reference. > cflags-y += -ffreestanding > > -# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use > -# a lot more stack due to the lack of sharing of stacklots. Also, gcc > -# 4.3.0 needs -funit-at-a-time for extern inline functions. > -KBUILD_CFLAGS += $(shell if [ $(cc-version) -lt 0400 ] ; then \ > - echo $(call cc-option,-fno-unit-at-a-time); \ > - else echo $(call cc-option,-funit-at-a-time); fi ;) > +# gcc 4.3.0 needs -funit-at-a-time for extern inline functions. > +KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time) Acked-by: Richard Weinberger Thanks, //richard
Re: [PATCH] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN topology update
> -static void topology_work_fn(struct work_struct *work) > -{ > - rebuild_sched_domains(); > + if (changed) > + rebuild_sched_domains(); > } > static DECLARE_WORK(topology_work, topology_work_fn); > > @@ -1553,7 +1424,6 @@ void __init shared_proc_topology_init(void) > if (lppaca_shared_proc(get_lppaca())) { > bitmap_fill(cpumask_bits(_associativity_changes_mask), > nr_cpumask_bits); > - numa_update_cpu_topology(false); Shouldn't we be calling topology_schedule_update() here? > } > } > -- Thanks and Regards Srikar Dronamraju
[PATCH kernel v3 22/22] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] [10de:1db1] subdriver
POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not pluggable PCIe devices but implement PCIe links for config space and MMIO. In addition to that the GPUs are interconnected to each other and also have direct links to the P9 CPU. The links are NVLink2 and provide direct access to the system RAM for GPUs via NPU (an NVLink2 "proxy" on P9 chip). These systems also support ATS (address translation services) which is a part of the NVLink2 protocol. Such GPUs also share on-board RAM (16GB in tested config) to the system via the same NVLink2 so a CPU has cache-coherent access to a GPU RAM. This exports GPU RAM to the userspace as a new PCI region. This preregisters the new memory as device memory as it might be used for DMA. This inserts pfns from the fault handler as the GPU memory is not onlined until the NVIDIA driver is loaded and has trained the links, so doing this earlier produces low-level errors which we fence in the firmware so they do not hurt the host system, but it is still better to avoid them. This exports ATSD (Address Translation Shootdown) register of NPU which allows the guest to invalidate TLB. The register conveniently occupies a single 64k page. Since NPU maps the GPU memory, it has a "tgt" property (which is an abbreviated host system bus address) and tells the GPU its own system address. This exports the "tgt" as a capability to let the guest driver conglomerate the routing information so each GPU knows how to get directly to the other GPUs. This also adds the "tgt" capability to a GPU to allow the userspace to find out the NVLinks corresponding to a specific GPU. For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to know LPID (a logical partition ID or a KVM guest hardware ID in other words) and PID (a memory context ID of a userspace process, not to be confused with a linux pid). This assigns a GPU to LPID in the NPU and this is why this adds a listener for KVM on an IOMMU group. 
A PID comes via NVLink from a GPU and NPU uses a PID wildcard to pass it through. This requires coherent memory and ATSD to be available on the host as the GPU vendor only supports configurations with both features enabled and other configurations are known not to work. Because of this and because of the ways the features are advertised to the host system (which is a device tree with very platform specific properties), this requires enabled POWERNV platform. This hardcodes the NVLink2 support for specific vendor and device IDs as there is no reliable way of knowing about coherent memory and ATS support. The GPU has an unique vendor PCIe capability 0x23 but it was confirmed that it does not provide required information (and it is still undisclosed what it actually does). Signed-off-by: Alexey Kardashevskiy --- Changes: v3: * reworded the commit log about tgt * added tracepoints (do we want them enabled for entire vfio-pci?) * added code comments * added write|mmap flags to the new regions * auto enabled VFIO_PCI_NVLINK2 config option * added 'tgt' capability to a GPU so QEMU can recreate ibm,npu and ibm,gpu references; there are required by the NVIDIA driver * keep notifier registered only for short time --- drivers/vfio/pci/Makefile | 1 + drivers/vfio/pci/trace.h| 102 +++ drivers/vfio/pci/vfio_pci_private.h | 2 + include/uapi/linux/vfio.h | 26 ++ drivers/vfio/pci/vfio_pci.c | 39 ++- drivers/vfio/pci/vfio_pci_nvlink2.c | 433 drivers/vfio/pci/Kconfig| 6 + 7 files changed, 607 insertions(+), 2 deletions(-) create mode 100644 drivers/vfio/pci/trace.h create mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 76d8ec0..9662c06 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,5 +1,6 @@ vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o +vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o 
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/trace.h b/drivers/vfio/pci/trace.h new file mode 100644 index 000..b80d2d3 --- /dev/null +++ b/drivers/vfio/pci/trace.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * VFIO PCI mmap/mmap_fault tracepoints + * + * Copyright (C) 2018 IBM Corp. All rights reserved. + * Author: Alexey Kardashevskiy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vfio_pci + +#if !defined(_TRACE_VFIO_PCI_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VFIO_PCI_H + +#include + +TRACE_EVENT(vfio_pci_nvgpu_mmap_fault, + TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua, + vm_fault_t ret), + TP_ARGS(pdev, hpa,
[PATCH kernel v3 18/22] powerpc/powernv/npu: Add compound IOMMU groups
At the moment powernv registers an IOMMU group for each PE. There is an exception though - NPU (an emulated PCI bridge representing an NVLink); powernv attaches these bridges to the GPU IOMMU group which becomes a master. Now we have POWER9 systems with GPUs connected to each other directly, bypassing PCI. At the moment powernv does not control these links so it has to put such interconnected GPUs to the same IOMMU group which means that the old scheme with a GPU as a master won't work - there will be up to 3 GPUs in such group. This introduces a npu_comp struct which represents a compound IOMMU group made of multiple PEs. This converts the existing NVLink1 code to use the new scheme. From now on, each PE must have a valid iommu_table_group_ops which will either be called directly (a single PE group) or indirectly from a compound group. This moves IOMMU group registration for NPU-connected GPUs to npu-dma.c. For POWER8, this stores a new compound group pointer in a PE (so a GPU is still a master); for POWER9 the new group pointer is stored in an NPU. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/pci.h| 1 + arch/powerpc/platforms/powernv/pci.h | 7 + arch/powerpc/platforms/powernv/npu-dma.c | 286 -- arch/powerpc/platforms/powernv/pci-ioda.c | 173 +++-- 4 files changed, 308 insertions(+), 159 deletions(-) diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index baf2886..0c72f18 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -132,5 +132,6 @@ extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index); extern int pnv_npu2_init(struct pci_controller *hose); extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, unsigned long msr); +extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev); #endif /* __ASM_POWERPC_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index cf9f748..aef4bb5 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -62,6 +62,7 @@ struct pnv_ioda_pe { /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ struct iommu_table_group table_group; + struct npu_comp *npucomp; /* 64-bit TCE bypass region */ booltce_bypass_enabled; @@ -201,6 +202,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, + __u64 window_size, __u32 levels); extern int pnv_eeh_post_init(void); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, @@ -216,6 +219,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); extern struct 
pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); +extern struct iommu_table_group *pnv_try_setup_npu_table_group( + struct pnv_ioda_pe *pe); +extern struct iommu_table_group *pnv_npu_compound_attach( + struct pnv_ioda_pe *pe); /* pci-ioda-tce.c */ #define POWERNV_IOMMU_DEFAULT_LEVELS 1 diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 1792c7e..2231f4c 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -317,31 +317,6 @@ static struct iommu_table_group_ops pnv_pci_npu_ops = { .unset_window = pnv_npu_unset_window, .take_ownership = pnv_npu_take_ownership, }; - -struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) -{ - struct pnv_phb *phb = npe->phb; - struct pci_bus *pbus = phb->hose->bus; - struct pci_dev *npdev, *gpdev = NULL, *gptmp; - struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, ); - - if (!gpe || !gpdev) - return NULL; - - npe->table_group.ops = _pci_npu_ops; - - list_for_each_entry(npdev, >devices, bus_list) { - gptmp = pnv_pci_get_gpu_dev(npdev); - - if (gptmp != gpdev) - continue; - - pe_info(gpe, "Attached NPU %s\n", dev_name(>dev)); - iommu_group_add_device(gpe->table_group.group, >dev); - } - - return gpe; -} #endif /* !CONFIG_IOMMU_API */ /* @@ -349,6 +324,17 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) */ /* Maximum possible number of ATSD MMIO registers per NPU */ #define NV_NMMU_ATSD_REGS 8 +#define NV_NPU_MAX_PE_NUM 16 + +/* + * A compound
[PATCH kernel v3 14/22] powerpc/iommu_api: Move IOMMU groups setup to a single place
Registering new IOMMU groups and adding devices to them are separated in code and the latter is dug in the DMA setup code which it does not really belong to. This moves IOMMU groups setup to a separate helper which registers a group and adds devices as before. This does not make a difference as IOMMU groups are not used anyway; the only dependency here is that iommu_add_device() requires a valid pointer to an iommu_table (set by set_iommu_table_base()). To keep the old behaviour, this does not add new IOMMU groups for PEs with no DMA weight and also skips NVLINK bridges which do not have pci_controller_ops::setup_bridge (the normal way of adding PEs). Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda.c | 80 +++ 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index f36a802..7f4904a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1269,6 +1269,8 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus) pnv_ioda_setup_npu_PE(pdev); } +static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe); + static void pnv_pci_ioda_setup_PEs(void) { struct pci_controller *hose; @@ -1591,6 +1593,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) mutex_unlock(>ioda.pe_list_mutex); pnv_pci_ioda2_setup_dma_pe(phb, pe); + pnv_ioda_setup_bus_iommu_group(pe); } } @@ -1930,21 +1933,16 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev) return mask; } -static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, - struct pci_bus *bus, - bool add_to_group) +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) { struct pci_dev *dev; list_for_each_entry(dev, >devices, bus_list) { set_iommu_table_base(>dev, pe->table_group.tables[0]); set_dma_offset(>dev, pe->tce_bypass_base); - if (add_to_group) - iommu_add_device(>table_group, 
>dev); if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_bus_dma(pe, dev->subordinate, - add_to_group); + pnv_ioda_setup_bus_dma(pe, dev->subordinate); } } @@ -2374,7 +2372,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, iommu_init_table(tbl, phb->hose->node); if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + pnv_ioda_setup_bus_dma(pe, pe->pbus); return; fail: @@ -2607,7 +2605,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) pnv_pci_ioda2_set_bypass(pe, false); pnv_pci_ioda2_unset_window(>table_group, 0); if (pe->pbus) - pnv_ioda_setup_bus_dma(pe, pe->pbus, false); + pnv_ioda_setup_bus_dma(pe, pe->pbus); iommu_tce_table_put(tbl); } @@ -2618,7 +2616,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) pnv_pci_ioda2_setup_default_config(pe); if (pe->pbus) - pnv_ioda_setup_bus_dma(pe, pe->pbus, false); + pnv_ioda_setup_bus_dma(pe, pe->pbus); } static struct iommu_table_group_ops pnv_pci_ioda2_ops = { @@ -2735,12 +2733,68 @@ static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { .release_ownership = pnv_ioda2_release_ownership, }; +static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe, + struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, >devices, bus_list) { + iommu_add_device(>table_group, >dev); + + if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) + pnv_ioda_setup_bus_iommu_group_add_devices(pe, + dev->subordinate); + } +} + +static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe) +{ + if (!pnv_pci_ioda_pe_dma_weight(pe)) + return; + + iommu_register_group(>table_group, pe->phb->hose->global_number, + pe->pe_number); + + /* +* set_iommu_table_base(>pdev->dev, tbl) should have been called +* by now +*/ + if (pe->flags & PNV_IODA_PE_DEV) + iommu_add_device(>table_group, >pdev->dev); + else if (pe->flags & (PNV_IODA_PE_BUS | 
PNV_IODA_PE_BUS_ALL)) + pnv_ioda_setup_bus_iommu_group_add_devices(pe, pe->pbus); +} + static void pnv_pci_ioda_setup_iommu_api(void) { struct pci_controller *hose, *tmp; struct
[PATCH kernel v3 13/22] powerpc/powernv/pseries: Rework device adding to IOMMU groups
The powernv platform registers IOMMU groups and adds devices to them from the pci_controller_ops::setup_bridge() hook except in one case when virtual functions (SRIOV VFs) are added from a bus notifier. The pseries platform registers IOMMU groups from the pci_controller_ops::dma_bus_setup() hook and adds devices from the pci_controller_ops::dma_dev_setup() hook. The very same bus notifier used for powernv does not add devices for pseries though as __of_scan_bus() adds devices first, then it does the bus/dev DMA setup. Both platforms use iommu_add_device() which takes a device and expects it to have a valid IOMMU table struct with an iommu_table_group pointer which in turn points to the iommu_group struct (which represents an IOMMU group). Although the helper seems easy to use, it relies on some pre-existing device configuration and associated data structures which it does not really need. This simplifies iommu_add_device() to take the table_group pointer directly. Pseries already has a table_group pointer handy and the bus notifier is not used anyway. For powernv, this copies the existing bus notifier, makes it work for powernv only which means an easy way of getting to the table_group pointer. This was tested on VFs but should also support physical PCI hotplug. Since iommu_add_device() receives the table_group pointer directly, pseries does not do TCE cache invalidation (the hypervisor does) nor allow multiple groups per VFIO container (in other words sharing an IOMMU table between partitionable endpoints), this removes iommu_table_group_link from pseries. 
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson --- arch/powerpc/include/asm/iommu.h | 12 ++--- arch/powerpc/kernel/iommu.c | 58 ++- arch/powerpc/platforms/powernv/pci-ioda.c | 10 +--- arch/powerpc/platforms/powernv/pci.c | 43 - arch/powerpc/platforms/pseries/iommu.c| 46 +- 5 files changed, 74 insertions(+), 95 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index a8aeac0..e847ff6 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -215,9 +215,9 @@ struct iommu_table_group { extern void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num); -extern int iommu_add_device(struct device *dev); +extern int iommu_add_device(struct iommu_table_group *table_group, + struct device *dev); extern void iommu_del_device(struct device *dev); -extern int __init tce_iommu_bus_notifier_init(void); extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction); @@ -228,7 +228,8 @@ static inline void iommu_register_group(struct iommu_table_group *table_group, { } -static inline int iommu_add_device(struct device *dev) +static inline int iommu_add_device(struct iommu_table_group *table_group, + struct device *dev) { return 0; } @@ -236,11 +237,6 @@ static inline int iommu_add_device(struct device *dev) static inline void iommu_del_device(struct device *dev) { } - -static inline int __init tce_iommu_bus_notifier_init(void) -{ -return 0; -} #endif /* !CONFIG_IOMMU_API */ int dma_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 8ccfdd9..1e85168 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1076,11 +1076,8 @@ void iommu_release_ownership(struct iommu_table *tbl) } EXPORT_SYMBOL_GPL(iommu_release_ownership); -int iommu_add_device(struct 
device *dev) +int iommu_add_device(struct iommu_table_group *table_group, struct device *dev) { - struct iommu_table *tbl; - struct iommu_table_group_link *tgl; - /* * The sysfs entries should be populated before * binding IOMMU group. If sysfs entries isn't @@ -1096,32 +1093,10 @@ int iommu_add_device(struct device *dev) return -EBUSY; } - tbl = get_iommu_table_base(dev); - if (!tbl) { - pr_debug("%s: Skipping device %s with no tbl\n", -__func__, dev_name(dev)); - return 0; - } - - tgl = list_first_entry_or_null(>it_group_list, - struct iommu_table_group_link, next); - if (!tgl) { - pr_debug("%s: Skipping device %s with no group\n", -__func__, dev_name(dev)); - return 0; - } pr_debug("%s: Adding %s to iommu group %d\n", -__func__, dev_name(dev), -iommu_group_id(tgl->table_group->group)); +__func__, dev_name(dev), iommu_group_id(table_group->group)); - if (PAGE_SIZE <
[PATCH kernel v3 21/22] vfio_pci: Allow regions to add own capabilities
VFIO regions already support region capabilities with a limited set of fields. However the subdriver might have to report to the userspace additional bits. This adds an add_capability() hook to vfio_pci_regops. Signed-off-by: Alexey Kardashevskiy --- Changes: v3: * removed confusing rationale for the patch, the next patch makes use of it anyway --- drivers/vfio/pci/vfio_pci_private.h | 3 +++ drivers/vfio/pci/vfio_pci.c | 6 ++ 2 files changed, 9 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 86aab05..93c1738 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -62,6 +62,9 @@ struct vfio_pci_regops { int (*mmap)(struct vfio_pci_device *vdev, struct vfio_pci_region *region, struct vm_area_struct *vma); + int (*add_capability)(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, + struct vfio_info_cap *caps); }; struct vfio_pci_region { diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 4a6f7c0..6cb70cf 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -763,6 +763,12 @@ static long vfio_pci_ioctl(void *device_data, if (ret) return ret; + if (vdev->region[i].ops->add_capability) { + ret = vdev->region[i].ops->add_capability(vdev, + >region[i], ); + if (ret) + return ret; + } } } -- 2.17.1
[PATCH kernel v3 20/22] vfio_pci: Allow mapping extra regions
So far we only allowed mapping of MMIO BARs to the userspace. However there are GPUs with on-board coherent RAM accessible via side channels which we also want to map to the userspace. The first client for this is NVIDIA V100 GPU with NVLink2 direct links to a POWER9 NPU-enabled CPU; such GPUs have 16GB RAM which is coherently mapped to the system address space, we are going to export these as an extra PCI region. We already support extra PCI regions and this adds support for mapping them to the userspace. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson --- Changes: v2: * reverted one of mistakenly removed error checks --- drivers/vfio/pci/vfio_pci_private.h | 3 +++ drivers/vfio/pci/vfio_pci.c | 9 + 2 files changed, 12 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index cde3b5d..86aab05 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -59,6 +59,9 @@ struct vfio_pci_regops { size_t count, loff_t *ppos, bool iswrite); void(*release)(struct vfio_pci_device *vdev, struct vfio_pci_region *region); + int (*mmap)(struct vfio_pci_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma); }; struct vfio_pci_region { diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index fef5002..4a6f7c0 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1130,6 +1130,15 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) return -EINVAL; if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL; + if (index >= VFIO_PCI_NUM_REGIONS) { + int regnum = index - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_region *region = vdev->region + regnum; + + if (region && region->ops && region->ops->mmap && + (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) + return region->ops->mmap(vdev, region, vma); + return -EINVAL; + } if (index >= VFIO_PCI_ROM_REGION_INDEX) return -EINVAL; if 
(!vdev->bar_mmap_supported[index]) -- 2.17.1
[PATCH kernel v3 19/22] powerpc/powernv/npu: Add release_ownership hook
In order to make ATS work and translate addresses for arbitrary LPID and PID, we need to program an NPU with LPID and allow PID wildcard matching with a specific MSR mask. This implements a helper to assign a GPU to LPAR and program the NPU with a wildcard for PID and a helper to do clean-up. The helper takes MSR (only DR/HV/PR/SF bits are allowed) to program them into NPU2 for ATS checkout requests support. This exports pnv_npu2_unmap_lpar_dev() as following patches will use it from the VFIO driver. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/npu-dma.c | 47 1 file changed, 47 insertions(+) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 2231f4c..48adaa5 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -289,6 +289,7 @@ static void pnv_npu_take_ownership(struct iommu_table_group *table_group) table_group); struct pnv_phb *phb = npe->phb; int64_t rc; + struct pci_dev *gpdev = NULL; /* * Note: NPU has just a single TVE in the hardware which means that @@ -310,12 +311,28 @@ static void pnv_npu_take_ownership(struct iommu_table_group *table_group) return; } pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); + + get_gpu_pci_dev_and_pe(npe, ); + if (gpdev) + pnv_npu2_unmap_lpar_dev(gpdev); +} + +static void pnv_npu_release_ownership(struct iommu_table_group *table_group) +{ + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, + table_group); + struct pci_dev *gpdev = NULL; + + get_gpu_pci_dev_and_pe(npe, ); + if (gpdev) + pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV); } static struct iommu_table_group_ops pnv_pci_npu_ops = { .set_window = pnv_npu_set_window, .unset_window = pnv_npu_unset_window, .take_ownership = pnv_npu_take_ownership, + .release_ownership = pnv_npu_release_ownership, }; #endif /* !CONFIG_IOMMU_API */ @@ -1239,3 +1256,33 @@ void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long 
msr) ret); } } + +int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev) +{ + int ret; + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + struct pci_controller *hose = pci_bus_to_host(npdev->bus); + struct pnv_phb *nphb = hose->private_data; + + dev_dbg(>dev, "destroy context opalid=%llu\n", + nphb->opal_id); + ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + if (ret < 0) { + dev_err(>dev, "Failed to destroy context: %d\n", ret); + return ret; + } + + /* Set LPID to 0 anyway, just to be safe */ + dev_dbg(>dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id); + ret = opal_npu_map_lpar(nphb->opal_id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn), 0 /*LPID*/, + 0 /* LPCR bits */); + if (ret) + dev_err(>dev, "Error %d mapping device to LPAR\n", ret); + + opal_purge_cache(); + + return ret; +} +EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev); -- 2.17.1
[PATCH kernel v3 09/22] powerpc/pseries/iommu: Force default DMA window removal
It is quite common for a device to support more than 32bit but less than 64bit for DMA, for example, GPUs often support 42..50bits. However the pseries platform only allows huge DMA window (the one which allows the use of more than 2GB of DMA space) for 64bit-capable devices mostly because: 1. we may have 32bit and >32bit devices on the same IOMMU domain and we cannot place the new big window where the 32bit one is located; 2. the existing hardware only supports the second window at very high offset of 1<<59 == 0x0800.0000.0000.0000. So in order to allow 33..59bit DMA, we have to remove the default DMA window and place a huge one there instead. The PAPR spec says that the platform may decide not to use the default window and remove it using DDW RTAS calls. There are few possible ways for the platform to decide: 1. look at the device IDs and decide in advance that such and such devices are capable of more than 32bit DMA (powernv's sketchy bypass does something like this - it drops the default window if all devices on the PE are from the same vendor) - this is not great as it involves guessing because, unlike sketchy bypass, the GPU case involves 2 vendor ids and does not scale; 2. advertise 1 available DMA window in the hypervisor via ibm,query-pe-dma-window so the pseries platform could take it as a clue that if more bits for DMA are needed, it has to remove the default window - this is not great as it is an implicit clue rather than a direct instruction; 3. removing the default DMA window altogether is not really an option as PAPR mandates its presence at the guest boot time; 4. make the hypervisor explicitly tell the guest that the default window had better be removed so the guest does not have to think hard and can simply do what is requested and this is what this patch does. This makes use of the latter approach and exploits a new "qemu,dma-force-remove-default" flag in a vPHB. 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/pseries/iommu.c | 28 +++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 9ece42f..78473ac 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -54,6 +54,7 @@ #include "pseries.h" #define DDW_INVALID_OFFSET ((uint64_t)-1) +#define DDW_INVALID_LIOBN ((uint32_t)-1) static struct iommu_table_group *iommu_pseries_alloc_group(int node) { @@ -977,7 +978,8 @@ static LIST_HEAD(failed_ddw_pdn_list); * * returns the dma offset for use by dma_set_mask */ -static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn, + u32 default_liobn) { int len, ret; struct ddw_query_response query; @@ -1022,6 +1024,16 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (ret) goto out_failed; + /* +* The device tree has a request to force remove the default window, +* do this. +*/ + if (default_liobn != DDW_INVALID_LIOBN && (!ddw_avail[2] || + rtas_call(ddw_avail[2], 1, 1, NULL, default_liobn))) { + dev_dbg(>dev, "Could not remove window"); + goto out_failed; + } + /* * Query if there is a second window of size to map the * whole partition. 
Query returns number of windows, largest @@ -1212,7 +1224,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) pdev = to_pci_dev(dev); /* only attempt to use a new window if 64-bit DMA is requested */ - if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { + if (!disable_ddw && dma_mask > DMA_BIT_MASK(32)) { dn = pci_device_to_OF_node(pdev); dev_dbg(dev, "node is %pOF\n", dn); @@ -1229,7 +1241,17 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) break; } if (pdn && PCI_DN(pdn)) { - dma_offset = enable_ddw(pdev, pdn); + u32 liobn = DDW_INVALID_LIOBN; + int ret = of_device_is_compatible(pdn, "IBM,npu-vphb"); + + if (ret) { + dma_window = of_get_property(pdn, + "ibm,dma-window", NULL); + if (dma_window) + liobn = be32_to_cpu(dma_window[0]); + } + + dma_offset = enable_ddw(pdev, pdn, liobn); if (dma_offset != DDW_INVALID_OFFSET) { dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); set_dma_offset(dev, dma_offset); -- 2.17.1
[PATCH kernel v3 17/22] powerpc/powernv/npu: Convert NPU IOMMU helpers to iommu_table_group_ops
At the moment NPU IOMMU is manipulated directly from the IODA2 PCI PE code; PCI PE acts as a master to NPU PE. Soon we will have compound IOMMU groups with several PEs from several different PHB (such as interconnected GPUs and NPUs) so there will be no single master but a one big IOMMU group. This makes a first step and converts an NPU PE to a table group. This should cause no behavioral change. Note that pnv_npu_release_ownership() has never been implemented. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci.h | 5 arch/powerpc/platforms/powernv/npu-dma.c | 29 ++- arch/powerpc/platforms/powernv/pci-ioda.c | 17 +++-- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index ddb4f02..cf9f748 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -216,11 +216,6 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); -extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, - struct iommu_table *tbl); -extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); -extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); -extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); /* pci-ioda-tce.c */ #define POWERNV_IOMMU_DEFAULT_LEVELS 1 diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 4b60f43..1792c7e 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -121,9 +121,11 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, return pe; } -long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, +static long 
pnv_npu_set_window(struct iommu_table_group *table_group, int num, struct iommu_table *tbl) { + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, + table_group); struct pnv_phb *phb = npe->phb; int64_t rc; const unsigned long size = tbl->it_indirect_levels ? @@ -155,8 +157,10 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, return 0; } -long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num) +static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num) { + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, + table_group); struct pnv_phb *phb = npe->phb; int64_t rc; @@ -198,7 +202,8 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) if (!gpe) return; - rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]); + rc = pnv_npu_set_window(>table_group, 0, + gpe->table_group.tables[0]); /* * NVLink devices use the same TCE table configuration as @@ -223,7 +228,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev) return -EINVAL; - rc = pnv_npu_unset_window(npe, 0); + rc = pnv_npu_unset_window(>table_group, 0); if (rc != OPAL_SUCCESS) return rc; @@ -276,9 +281,12 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) } } +#ifdef CONFIG_IOMMU_API /* Switch ownership from platform code to external user (e.g. VFIO) */ -void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) +static void pnv_npu_take_ownership(struct iommu_table_group *table_group) { + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, + table_group); struct pnv_phb *phb = npe->phb; int64_t rc; @@ -289,7 +297,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) * if it was enabled at the moment of ownership change. 
*/ if (npe->table_group.tables[0]) { - pnv_npu_unset_window(npe, 0); + pnv_npu_unset_window(>table_group, 0); return; } @@ -304,6 +312,12 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); } +static struct iommu_table_group_ops pnv_pci_npu_ops = { + .set_window = pnv_npu_set_window, + .unset_window = pnv_npu_unset_window, + .take_ownership = pnv_npu_take_ownership, +}; + struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) { struct pnv_phb *phb = npe->phb; @@ -314,6 +328,8 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) if (!gpe || !gpdev) return NULL;
[PATCH kernel v3 08/22] powerpc/pseries/iommu: Allow dynamic window to start from zero
At the moment the kernel does not expect dynamic windows to ever start at zero on a PCI bus as PAPR requires the hypervisor to create a 32bit default window which starts from zero and the pseries kernel only creates additional windows. However PAPR permits removing the default window and creating another one instead, starting from zero as well. In fact, the kernel used to remove the default window after sha1 25ebc45b934 but this has been reverted later. Since there are devices capable of more than 32 bits for DMA but less than 50, and currently available hardware allows the second window only at 1<<59, we will need to be able to create bigger windows starting from zero. This does the initial preparation and should not cause any behavioral changes. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson --- arch/powerpc/platforms/pseries/iommu.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 06f0296..9ece42f 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -53,6 +53,8 @@ #include "pseries.h" +#define DDW_INVALID_OFFSET ((uint64_t)-1) + static struct iommu_table_group *iommu_pseries_alloc_group(int node) { struct iommu_table_group *table_group; @@ -844,7 +846,7 @@ static u64 find_existing_ddw(struct device_node *pdn) { struct direct_window *window; const struct dynamic_dma_window_prop *direct64; - u64 dma_addr = 0; + u64 dma_addr = DDW_INVALID_OFFSET; spin_lock(_window_list_lock); /* check if we already created a window and dupe that config if so */ @@ -992,7 +994,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) mutex_lock(_window_init_mutex); dma_addr = find_existing_ddw(pdn); - if (dma_addr != 0) + if (dma_addr != DDW_INVALID_OFFSET) goto out_unlock; /* @@ -1228,7 +1230,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) } if (pdn && PCI_DN(pdn)) { 
dma_offset = enable_ddw(pdev, pdn); - if (dma_offset != 0) { + if (dma_offset != DDW_INVALID_OFFSET) { dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); set_dma_offset(dev, dma_offset); set_dma_ops(dev, _nommu_ops); -- 2.17.1
[PATCH kernel v3 16/22] powerpc/powernv: Add purge cache OPAL call
Flushing caches using the dcbf instruction takes quite some time if we need to flush gigabytes (16GB takes more than 15s); OPAL just added a big hammer to flush all caches. This adds opal_purge_cache() which will be used later to flush caches for coherent GPU memory which might suddenly become unavailable if a GPU is reset and NVLink is not (re)trained. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/opal-api.h| 3 ++- arch/powerpc/include/asm/opal.h| 1 + arch/powerpc/platforms/powernv/opal.c | 1 + arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b..55bc640 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -210,7 +210,8 @@ #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165 #defineOPAL_NX_COPROC_INIT 167 -#define OPAL_LAST 167 +#define OPAL_CLEAR_CACHE 170 +#define OPAL_LAST 170 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index ff38664..7db576e 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -294,6 +294,7 @@ int opal_set_power_shift_ratio(u32 handle, int token, u32 psr); int opal_sensor_group_clear(u32 group_hndl, int token); int opal_sensor_group_enable(u32 group_hndl, int token, bool enable); int opal_nx_coproc_init(uint32_t chip_id, uint32_t ct); +int opal_purge_cache(void); s64 opal_signal_system_reset(s32 cpu); s64 opal_quiesce(u64 shutdown_type, s32 cpu); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index beed86f..44ce824 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -1113,3 +1113,4 @@ EXPORT_SYMBOL_GPL(opal_int_eoi); 
EXPORT_SYMBOL_GPL(opal_error_code); /* Export the below symbol for NX compression */ EXPORT_SYMBOL(opal_nx_coproc_init); +EXPORT_SYMBOL(opal_purge_cache); diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 2515282..5b886a6 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -331,3 +331,4 @@ OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); OPAL_CALL(opal_sensor_read_u64,OPAL_SENSOR_READ_U64); OPAL_CALL(opal_sensor_group_enable,OPAL_SENSOR_GROUP_ENABLE); OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); +OPAL_CALL(opal_purge_cache,OPAL_CLEAR_CACHE); -- 2.17.1
[PATCH kernel v3 07/22] powerpc/powernv/npu: Move OPAL calls away from context manipulation
When introduced, the NPU context init/destroy helpers called OPAL which enabled/disabled PID (a userspace memory context ID) filtering in an NPU per GPU; this was a requirement for P9 DD1.0. However, newer chip revisions added PID wildcard support, so there is no longer any need to call OPAL every time a new context is initialized. Also, since the PID wildcard support was added, skiboot does not clear wildcard entries in the NPU so these remain in the hardware until the system reboots. This moves LPID and wildcard programming to the PE setup code which executes once during the booting process so NPU2 context init/destroy won't need to do additional configuration. This removes the check for FW_FEATURE_OPAL as pnv_npu2_init_context/pnv_npu2_release_context/pnv_npu2_init do not call OPAL anymore. This moves pnv_npu2_init() declaration as pseries should be able to use it. This keeps pnv_npu2_map_lpar() in powernv as pseries is not allowed to call that. This exports pnv_npu2_map_lpar_dev() as the following patches will use it from the VFIO driver. While at it, replace redundant list_for_each_entry_safe() with a simpler list_for_each_entry(). 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/pci.h| 3 + arch/powerpc/platforms/powernv/pci.h | 2 +- arch/powerpc/platforms/powernv/npu-dma.c | 105 +++--- arch/powerpc/platforms/powernv/pci-ioda.c | 15 +++- 4 files changed, 71 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 2af9ded..baf2886 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -129,5 +129,8 @@ extern void pcibios_scan_phb(struct pci_controller *hose); extern struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev); extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index); +extern int pnv_npu2_init(struct pci_controller *hose); +extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, + unsigned long msr); #endif /* __ASM_POWERPC_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index f2d50974..ddb4f02 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -190,6 +190,7 @@ extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); extern void pnv_pci_init_npu_phb(struct device_node *np); extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np); +extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); @@ -220,7 +221,6 @@ extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); -extern int pnv_npu2_init(struct pnv_phb *phb); /* pci-ioda-tce.c */ #define POWERNV_IOMMU_DEFAULT_LEVELS 1 diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c index 9fc4e4e..4b60f43 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -698,7 +698,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, u32 nvlink_index; struct device_node *nvlink_dn; struct mm_struct *mm = current->mm; - struct pnv_phb *nphb; struct npu *npu; struct npu_context *npu_context; @@ -708,9 +707,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, */ struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); - if (!firmware_has_feature(FW_FEATURE_OPAL)) - return ERR_PTR(-ENODEV); - if (!npdev) /* No nvlink associated with this GPU device */ return ERR_PTR(-ENODEV); @@ -728,23 +724,10 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, return ERR_PTR(-EINVAL); } - nphb = pci_bus_to_host(npdev->bus)->private_data; npu = npdev_to_npu(npdev); if (!npu) return ERR_PTR(-ENODEV); - /* -* Setup the NPU context table for a particular GPU. These need to be -* per-GPU as we need the tables to filter ATSDs when there are no -* active contexts on a particular GPU. It is safe for these to be -* called concurrently with destroy as the OPAL call takes appropriate -* locks and refcounts on init/destroy. -*/ - rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags, - PCI_DEVID(gpdev->bus->number, gpdev->devfn)); - if (rc < 0) - return ERR_PTR(-ENOSPC); - /* * We store the npu
[PATCH kernel v3 15/22] powerpc/powernv: Reference iommu_table while it is linked to a group
The iommu_table pointer stored in iommu_table_group may get stale by accident, this adds referencing and removes a redundant comment about this. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 3 ++- arch/powerpc/platforms/powernv/pci-ioda.c | 4 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 7639b21..697449a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -368,6 +368,7 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, found = false; for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { if (table_group->tables[i] == tbl) { + iommu_tce_table_put(tbl); table_group->tables[i] = NULL; found = true; break; @@ -393,7 +394,7 @@ long pnv_pci_link_table_and_group(int node, int num, tgl->table_group = table_group; list_add_rcu(>next, >it_group_list); - table_group->tables[num] = tbl; + table_group->tables[num] = iommu_tce_table_get(tbl); return 0; } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7f4904a..7caf373 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2716,10 +2716,6 @@ static long pnv_pci_ioda2_npu_unset_window( static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) { - /* -* Detach NPU first as pnv_ioda2_take_ownership() will destroy -* the iommu_table if 32bit DMA is enabled. -*/ pnv_npu_take_ownership(gpe_table_group_to_npe(table_group)); pnv_ioda2_take_ownership(table_group); } -- 2.17.1
[PATCH kernel v3 04/22] powerpc/vfio/iommu/kvm: Do not pin device memory
This new memory does not have page structs as it is not plugged to the host so gup() will fail anyway. This adds 2 helpers: - mm_iommu_newdev() to preregister the "memory device" memory so the rest of API can still be used; - mm_iommu_is_devmem() to know if the physical address is one of thise new regions which we must avoid unpinning of. This adds @mm to tce_page_is_contained() and iommu_tce_xchg() to test if the memory is device memory to avoid pfn_to_page(). Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h | 5 +- arch/powerpc/include/asm/mmu_context.h | 5 ++ arch/powerpc/kernel/iommu.c| 9 ++- arch/powerpc/kvm/book3s_64_vio.c | 18 +++--- arch/powerpc/mm/mmu_context_iommu.c| 83 +++--- drivers/vfio/vfio_iommu_spapr_tce.c| 28 + 6 files changed, 116 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 35db0cb..a8aeac0 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -218,8 +218,9 @@ extern void iommu_register_group(struct iommu_table_group *table_group, extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); extern int __init tce_iommu_bus_notifier_init(void); -extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, - unsigned long *hpa, enum dma_data_direction *direction); +extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction); #else static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 2d6b00d..f0f9f3d 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -24,6 +24,9 @@ extern bool mm_iommu_preregistered(struct mm_struct *mm); extern long mm_iommu_new(struct mm_struct *mm, unsigned long ua, 
unsigned long entries, struct mm_iommu_table_group_mem_t **pmem); +extern long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua, + unsigned long entries, unsigned long dev_hpa, + struct mm_iommu_table_group_mem_t **pmem); extern long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_init(struct mm_struct *mm); @@ -39,6 +42,8 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa); extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua); +extern bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, + unsigned int pageshift); extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); #endif diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index f0dc680..8ccfdd9 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -47,6 +47,7 @@ #include #include #include +#include #define DBG(...) 
@@ -993,15 +994,17 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) } EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); -long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, - unsigned long *hpa, enum dma_data_direction *direction) +long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction) { long ret; ret = tbl->it_ops->exchange(tbl, entry, hpa, direction); if (!ret && ((*direction == DMA_FROM_DEVICE) || - (*direction == DMA_BIDIRECTIONAL))) + (*direction == DMA_BIDIRECTIONAL)) && + !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift)) SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); /* if (unlikely(ret)) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 62a8d03..532ab797 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -397,12 +397,13 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, return H_SUCCESS; } -static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) +static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, + unsigned long entry) { unsigned long hpa = 0; enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg(tbl, entry, , ); + iommu_tce_xchg(mm,
[PATCH kernel v3 12/22] powerpc/pseries: Remove IOMMU API support for non-LPAR systems
The pci_dma_bus_setup_pSeries and pci_dma_dev_setup_pSeries hooks are registered for the pseries platform which does not have FW_FEATURE_LPAR; these would be pre-powernv platforms which we never supported PCI pass through for anyway so remove it. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson --- Probably should remove all pseries-but-not-lpar code. --- arch/powerpc/platforms/pseries/iommu.c | 9 ++--- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index f818737..b045f28 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -648,7 +648,6 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci->phb, dn, tbl); tbl->it_ops = _table_pseries_ops; iommu_init_table(tbl, pci->phb->node); - iommu_register_group(pci->table_group, pci_domain_nr(bus), 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x8000ul; @@ -759,10 +758,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) iommu_table_setparms(phb, dn, tbl); tbl->it_ops = _table_pseries_ops; iommu_init_table(tbl, phb->node); - iommu_register_group(PCI_DN(dn)->table_group, - pci_domain_nr(phb->bus), 0); set_iommu_table_base(>dev, tbl); - iommu_add_device(>dev); return; } @@ -773,11 +769,10 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL) dn = dn->parent; - if (dn && PCI_DN(dn)) { + if (dn && PCI_DN(dn)) set_iommu_table_base(>dev, PCI_DN(dn)->table_group->tables[0]); - iommu_add_device(>dev); - } else + else printk(KERN_WARNING "iommu: Device %s has no iommu table\n", pci_name(dev)); } -- 2.17.1
[PATCH kernel v3 03/22] powerpc/mm/iommu: Make mm_iommu_new() fail on existing regions
Since we are going to have 2 different preregistering helpers, let's make it clear that mm_iommu_new() is only for normal memory (i.e. not device memory) and for existing areas mm_iommu_get() should be used instead. This removes the check for exact match as the check for overlap is enough now. Signed-off-by: Alexey Kardashevskiy --- Changes: v2: * remove the exact match check --- arch/powerpc/mm/mmu_context_iommu.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index babc6ad..580d89e 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -102,12 +102,6 @@ long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, list_for_each_entry_rcu(mem, >context.iommu_group_mem_list, next) { - if ((mem->ua == ua) && (mem->entries == entries)) { - ++mem->used; - *pmem = mem; - goto unlock_exit; - } - /* Overlap? */ if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && (ua < (mem->ua + -- 2.17.1
[PATCH kernel v3 11/22] powerpc/pseries/npu: Enable platform support
We already changed the NPU API for GPUs not to call OPAL, and the remaining bit is initializing NPU structures. This uses a new QEMU capability which marks NPU-enabled vPHBs as "IBM,npu-vphb" and initializes an NPU structure per vPHB. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/pseries/pci.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 41d8a4d..a50d5e4 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "pseries.h" #if 0 @@ -237,6 +238,8 @@ static void __init pSeries_request_regions(void) void __init pSeries_final_fixup(void) { + struct pci_controller *hose; + pSeries_request_regions(); eeh_probe_devices(); @@ -246,6 +249,9 @@ void __init pSeries_final_fixup(void) ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable; ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable; #endif + list_for_each_entry(hose, _list, list_node) + if (of_device_is_compatible(hose->dn, "IBM,npu-vphb")) + pnv_npu2_init(hose); } /* -- 2.17.1
[PATCH kernel v3 02/22] powerpc/mm/iommu/vfio_spapr_tce: Change mm_iommu_get to reference a region
Normally mm_iommu_get() is supposed to add a reference and mm_iommu_put() to remove it. However historically mm_iommu_find() does the referencing and mm_iommu_get() is doing allocation and referencing. We are going to add another helper to preregister device memory so instead of having mm_iommu_new() which pre-registers the normal memory and references the region, we need separate helpers for pre-registering and referencing. This renames: - mm_iommu_get to mm_iommu_new; - mm_iommu_find to mm_iommu_get. To make the mm_iommu_get name reflect what it is supposed to do, this changes mm_iommu_get() to reference the region so from now on for every mm_iommu_get() we need a matching mm_iommu_put(). Signed-off-by: Alexey Kardashevskiy --- Changes: v2: * merged 2 patches into one --- arch/powerpc/include/asm/mmu_context.h | 4 +-- arch/powerpc/mm/mmu_context_iommu.c| 13 ++--- drivers/vfio/vfio_iommu_spapr_tce.c| 37 +- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 0381394..2d6b00d 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -21,7 +21,7 @@ struct mm_iommu_table_group_mem_t; extern int isolate_lru_page(struct page *page);/* from internal.h */ extern bool mm_iommu_preregistered(struct mm_struct *mm); -extern long mm_iommu_get(struct mm_struct *mm, +extern long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, struct mm_iommu_table_group_mem_t **pmem); extern long mm_iommu_put(struct mm_struct *mm, @@ -32,7 +32,7 @@ extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, unsigned long ua, unsigned long size); extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm( struct mm_struct *mm, unsigned long ua, unsigned long size); -extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, +extern struct mm_iommu_table_group_mem_t *mm_iommu_get(struct 
mm_struct *mm, unsigned long ua, unsigned long entries); extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa); diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 1d5161f..babc6ad 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -89,7 +89,7 @@ bool mm_iommu_preregistered(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mm_iommu_preregistered); -long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, +long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, struct mm_iommu_table_group_mem_t **pmem) { struct mm_iommu_table_group_mem_t *mem; @@ -202,7 +202,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, return ret; } -EXPORT_SYMBOL_GPL(mm_iommu_get); +EXPORT_SYMBOL_GPL(mm_iommu_new); static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) { @@ -318,21 +318,26 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, return ret; } -struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, +struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries) { struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + mutex_lock(_list_mutex); + list_for_each_entry_rcu(mem, >context.iommu_group_mem_list, next) { if ((mem->ua == ua) && (mem->entries == entries)) { ret = mem; + ++mem->used; break; } } + mutex_unlock(_list_mutex); + return ret; } -EXPORT_SYMBOL_GPL(mm_iommu_find); +EXPORT_SYMBOL_GPL(mm_iommu_get); long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index ad63725..56db071 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -151,12 
+151,13 @@ static long tce_iommu_unregister_pages(struct tce_container *container, { struct mm_iommu_table_group_mem_t *mem; struct tce_iommu_prereg *tcemem; - bool found = false; + bool found; + long ret; if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) return -EINVAL; - mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT); + mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT); if (!mem) return -ENOENT; @@ -168,9 +169,13 @@ static long tce_iommu_unregister_pages(struct
[PATCH kernel v3 10/22] powerpc/pseries/iommu: Use memory@ nodes in max RAM address calculation
We might have memory@ nodes with "linux,usable-memory" set to zero (for example, to replicate powernv's behaviour for GPU coherent memory) which means that the memory needs an extra initialization but since it can be used afterwards, the pseries platform will try mapping it for DMA so the DMA window needs to cover those memory regions too. This walks through the memory nodes to find the highest RAM address to let a huge DMA window cover that too in case this memory gets onlined later. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/pseries/iommu.c | 43 +- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 78473ac..f818737 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -967,6 +967,47 @@ struct failed_ddw_pdn { static LIST_HEAD(failed_ddw_pdn_list); +static unsigned long read_n_cells(int n, const __be32 **buf) +{ + unsigned long result = 0; + + while (n--) { + result = (result << 32) | of_read_number(*buf, 1); + (*buf)++; + } + return result; +} + +static phys_addr_t ddw_memory_hotplug_max(void) +{ + phys_addr_t max_addr = memory_hotplug_max(); + struct device_node *memory; + + for_each_node_by_type(memory, "memory") { + unsigned long start, size; + int ranges, n_mem_addr_cells, n_mem_size_cells, len; + const __be32 *memcell_buf; + + memcell_buf = of_get_property(memory, "reg", ); + if (!memcell_buf || len <= 0) + continue; + + n_mem_addr_cells = of_n_addr_cells(memory); + n_mem_size_cells = of_n_size_cells(memory); + + /* ranges in cell */ + ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); + + /* these are order-sensitive, and modify the buffer pointer */ + start = read_n_cells(n_mem_addr_cells, _buf); + size = read_n_cells(n_mem_size_cells, _buf); + + max_addr = max_t(phys_addr_t, max_addr, start + size); + } + + return max_addr; +} + /* * If the PE supports dynamic dma windows, and there is 
space for a table * that can map all pages in a linear offset, then setup such a table, @@ -1067,7 +1108,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn, } /* verify the window * number of ptes will map the partition */ /* check largest block * page size > max memory hotplug addr */ - max_addr = memory_hotplug_max(); + max_addr = ddw_memory_hotplug_max(); if (query.largest_available_block < (max_addr >> page_shift)) { dev_dbg(>dev, "can't map partition max 0x%llx with %u " "%llu-sized pages\n", max_addr, query.largest_available_block, -- 2.17.1
[PATCH kernel v3 06/22] powerpc/powernv: Detach npu struct from pnv_phb
The powernv PCI code stores NPU data in the pnv_phb struct. The latter is referenced by pci_controller::private_data. We are going to have NPU2 support in the pseries platform as well but it does not store any private_data in in the pci_controller struct; and even if it did, it would be a different data structure. This adds a global list of NPUs so each platform can register and use these in the same fashion. As npdev_to_npu() may now fail, this checks the returned pointer. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci.h | 16 - arch/powerpc/platforms/powernv/npu-dma.c | 78 2 files changed, 65 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 2131373..f2d50974 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -8,9 +8,6 @@ struct pci_dn; -/* Maximum possible number of ATSD MMIO registers per NPU */ -#define NV_NMMU_ATSD_REGS 8 - enum pnv_phb_type { PNV_PHB_IODA1 = 0, PNV_PHB_IODA2 = 1, @@ -176,19 +173,6 @@ struct pnv_phb { unsigned intdiag_data_size; u8 *diag_data; - /* Nvlink2 data */ - struct npu { - int index; - __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS]; - unsigned int mmio_atsd_count; - - /* Bitmask for MMIO register usage */ - unsigned long mmio_atsd_usage; - - /* Do we need to explicitly flush the nest mmu? 
*/ - bool nmmu_flush; - } npu; - int p2p_target_count; }; diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 9f48831..9fc4e4e 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -330,13 +330,39 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) /* * NPU2 ATS */ +/* Maximum possible number of ATSD MMIO registers per NPU */ +#define NV_NMMU_ATSD_REGS 8 + +/* An NPU descriptor, valid for POWER9 only */ +struct npu { + int index; + __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS]; + unsigned int mmio_atsd_count; + + /* Bitmask for MMIO register usage */ + unsigned long mmio_atsd_usage; + + /* Do we need to explicitly flush the nest mmu? */ + bool nmmu_flush; + + struct list_head next; + + struct pci_controller *hose; +}; + +static LIST_HEAD(npu2_devices); + static struct npu *npdev_to_npu(struct pci_dev *npdev) { - struct pnv_phb *nphb; + struct pci_controller *hose = pci_bus_to_host(npdev->bus); + struct npu *npu; - nphb = pci_bus_to_host(npdev->bus)->private_data; + list_for_each_entry(npu, _devices, next) + if (hose == npu->hose) + return npu; - return >npu; + WARN_ON_ONCE(1); + return NULL; } /* Maximum number of nvlinks per npu */ @@ -505,6 +531,9 @@ static void acquire_atsd_reg(struct npu_context *npu_context, continue; npu = npdev_to_npu(npdev); + if (!npu) + continue; + mmio_atsd_reg[i].npu = npu; mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); while (mmio_atsd_reg[i].reg < 0) { @@ -701,6 +730,8 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, nphb = pci_bus_to_host(npdev->bus)->private_data; npu = npdev_to_npu(npdev); + if (!npu) + return ERR_PTR(-ENODEV); /* * Setup the NPU context table for a particular GPU. 
These need to be @@ -821,6 +852,8 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context, nphb = pci_bus_to_host(npdev->bus)->private_data; npu = npdev_to_npu(npdev); + if (!npu) + return; nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", _index))) @@ -898,9 +931,15 @@ int pnv_npu2_init(struct pnv_phb *phb) struct pci_dev *gpdev; static int npu_index; uint64_t rc = 0; + struct pci_controller *hose = phb->hose; + struct npu *npu; + int ret; - phb->npu.nmmu_flush = - of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); + npu = kzalloc(sizeof(*npu), GFP_KERNEL); + if (!npu) + return -ENOMEM; + + npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush"); for_each_child_of_node(phb->hose->dn, dn) { gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); if (gpdev) { @@ -914,18 +953,31 @@ int pnv_npu2_init(struct pnv_phb *phb)
[PATCH kernel v3 05/22] powerpc/powernv/npu: Add helper to access struct npu for NPU device
This step is to help remove the npu struct from pnv_phb so it can be used by pseries as well. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson --- arch/powerpc/platforms/powernv/npu-dma.c | 22 -- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 91d488f..9f48831 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -327,6 +327,18 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) return gpe; } +/* + * NPU2 ATS + */ +static struct npu *npdev_to_npu(struct pci_dev *npdev) +{ + struct pnv_phb *nphb; + + nphb = pci_bus_to_host(npdev->bus)->private_data; + + return &nphb->npu; +} + /* Maximum number of nvlinks per npu */ #define NV_MAX_LINKS 6 @@ -478,7 +490,6 @@ static void acquire_atsd_reg(struct npu_context *npu_context, int i, j; struct npu *npu; struct pci_dev *npdev; - struct pnv_phb *nphb; for (i = 0; i <= max_npu2_index; i++) { mmio_atsd_reg[i].reg = -1; @@ -493,8 +504,7 @@ static void acquire_atsd_reg(struct npu_context *npu_context, if (!npdev) continue; - nphb = pci_bus_to_host(npdev->bus)->private_data; - npu = &nphb->npu; + npu = npdev_to_npu(npdev); mmio_atsd_reg[i].npu = npu; mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); while (mmio_atsd_reg[i].reg < 0) { @@ -690,7 +700,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, } nphb = pci_bus_to_host(npdev->bus)->private_data; - npu = &nphb->npu; + npu = npdev_to_npu(npdev); /* * Setup the NPU context table for a particular GPU. 
These need to be @@ -764,7 +774,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, */ WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev); - if (!nphb->npu.nmmu_flush) { + if (!npu->nmmu_flush) { /* * If we're not explicitly flushing ourselves we need to mark * the thread for global flushes @@ -810,7 +820,7 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context, return; nphb = pci_bus_to_host(npdev->bus)->private_data; - npu = &nphb->npu; + npu = npdev_to_npu(npdev); nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", &nvlink_index))) -- 2.17.1
[PATCH kernel v3 01/22] powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2
The skiboot firmware has a hot reset handler which fences the NVIDIA V100 GPU RAM on Witherspoons and makes accesses no-op instead of throwing HMIs: https://github.com/open-power/skiboot/commit/fca2b2b839a67 Now we are going to pass V100 via VFIO which most certainly involves KVM guests which are often terminated without getting a chance to offline GPU RAM so we end up with a running machine with misconfigured memory. Accessing this memory produces hardware management interrupts (HMI) which bring the host down. To suppress HMIs, this wires up this hot reset hook to vfio_pci_disable() via pci_disable_device() which switches NPU2 to a safe mode and prevents HMIs. Signed-off-by: Alexey Kardashevskiy Acked-by: Alistair Popple --- Changes: v2: * updated the commit log --- arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 3d2d8fa..c78c204 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3676,6 +3676,15 @@ static void pnv_pci_release_device(struct pci_dev *pdev) pnv_ioda_release_pe(pe); } +static void pnv_npu_disable_device(struct pci_dev *pdev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); + struct eeh_pe *eehpe = edev ? edev->pe : NULL; + + if (eehpe && eeh_ops && eeh_ops->reset) + eeh_ops->reset(eehpe, EEH_RESET_HOT); +} + static void pnv_pci_ioda_shutdown(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; @@ -3720,6 +3729,7 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .reset_secondary_bus= pnv_pci_reset_secondary_bus, .dma_set_mask = pnv_npu_dma_set_mask, .shutdown = pnv_pci_ioda_shutdown, + .disable_device = pnv_npu_disable_device, }; static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = { -- 2.17.1
[PATCH kernel v3 00/22] powerpc/powernv/npu, vfio: NVIDIA V100 + P9 passthrough
This is for passing through NVIDIA V100 GPUs on POWER9 systems. 7/7 and https://github.com/aik/linux/commit/f41f5666d27b31c1 have the details of the hardware setup. This implements support for the NVIDIA V100 GPU with coherent memory and NPU/ATS support available in the POWER9 CPU. The aim is to support an unmodified vendor driver in the guest. This is pushed to github: https://github.com/aik/qemu/tree/nv2-stage4 The host and guest kernel tree is pushed on github as well: https://github.com/aik/linux/tree/nv2-stage4 Skiboot bits are here: https://github.com/aik/skiboot/tree/nv2-stage4 Please comment. Thanks. Alexey Kardashevskiy (22): powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2 powerpc/mm/iommu/vfio_spapr_tce: Change mm_iommu_get to reference a region powerpc/mm/iommu: Make mm_iommu_new() fail on existing regions powerpc/vfio/iommu/kvm: Do not pin device memory powerpc/powernv/npu: Add helper to access struct npu for NPU device powerpc/powernv: Detach npu struct from pnv_phb powerpc/powernv/npu: Move OPAL calls away from context manipulation powerpc/pseries/iommu: Allow dynamic window to start from zero powerpc/pseries/iommu: Force default DMA window removal powerpc/pseries/iommu: Use memory@ nodes in max RAM address calculation powerpc/pseries/npu: Enable platform support powerpc/pseries: Remove IOMMU API support for non-LPAR systems powerpc/powernv/pseries: Rework device adding to IOMMU groups powerpc/iommu_api: Move IOMMU groups setup to a single place powerpc/powernv: Reference iommu_table while it is linked to a group powerpc/powernv: Add purge cache OPAL call powerpc/powernv/npu: Convert NPU IOMMU helpers to iommu_table_group_ops powerpc/powernv/npu: Add compound IOMMU groups powerpc/powernv/npu: Add release_ownership hook vfio_pci: Allow mapping extra regions vfio_pci: Allow regions to add own capabilities vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] [10de:1db1] subdriver drivers/vfio/pci/Makefile | 1 + arch/powerpc/include/asm/iommu.h | 17 +- 
arch/powerpc/include/asm/mmu_context.h| 9 +- arch/powerpc/include/asm/opal-api.h | 3 +- arch/powerpc/include/asm/opal.h | 1 + arch/powerpc/include/asm/pci.h| 4 + arch/powerpc/platforms/powernv/pci.h | 30 +- drivers/vfio/pci/trace.h | 102 drivers/vfio/pci/vfio_pci_private.h | 8 + include/uapi/linux/vfio.h | 26 + arch/powerpc/kernel/iommu.c | 67 +-- arch/powerpc/kvm/book3s_64_vio.c | 18 +- arch/powerpc/mm/mmu_context_iommu.c | 100 +++- arch/powerpc/platforms/powernv/npu-dma.c | 531 +++--- arch/powerpc/platforms/powernv/opal.c | 1 + arch/powerpc/platforms/powernv/pci-ioda-tce.c | 3 +- arch/powerpc/platforms/powernv/pci-ioda.c | 229 arch/powerpc/platforms/powernv/pci.c | 43 +- arch/powerpc/platforms/pseries/iommu.c| 134 +++-- arch/powerpc/platforms/pseries/pci.c | 6 + drivers/vfio/pci/vfio_pci.c | 54 +- drivers/vfio/pci/vfio_pci_nvlink2.c | 433 ++ drivers/vfio/vfio_iommu_spapr_tce.c | 65 ++- .../powerpc/platforms/powernv/opal-wrappers.S | 1 + drivers/vfio/pci/Kconfig | 6 + 25 files changed, 1497 insertions(+), 395 deletions(-) create mode 100644 drivers/vfio/pci/trace.h create mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c -- 2.17.1