Re: [PATCH v2 26/35] powerpc/64: system call: Fix sparse warning about missing declaration
Le 26/11/2019 à 22:44, Luc Van Oostenryck a écrit : On Tue, Nov 26, 2019 at 09:13:40PM +0100, Michal Suchanek wrote: Sparse warns about missing declarations for these functions: +arch/powerpc/kernel/syscall_64.c:108:23: warning: symbol 'syscall_exit_prepare' was not declared. Should it be static? +arch/powerpc/kernel/syscall_64.c:18:6: warning: symbol 'system_call_exception' was not declared. Should it be static? +arch/powerpc/kernel/syscall_64.c:200:23: warning: symbol 'interrupt_exit_user_prepare' was not declared. Should it be static? +arch/powerpc/kernel/syscall_64.c:288:23: warning: symbol 'interrupt_exit_kernel_prepare' was not declared. Should it be static? Add declaration for them. I'm fine with this patch but, just FYI, lately people seem to prefer to add '__visible' to the function definition instead of creating such header files. AFAIU, that's not exactly the purpose of '__visible', see https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9add850c2 Christophe
Re: [Very RFC 42/46] powernv/pci: Don't clear pdn->pe_number in pnv_pci_release_device
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Nothing looks at it anymore. With a small extra step we can ditch it (compile tested): https://github.com/aik/linux/commit/14db7061d48220354e83f8e100ab0cc1b7181da4 > > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 12 > 1 file changed, 12 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index d3e375d71cdc..45d940730c30 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -3541,9 +3541,7 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) > > static void pnv_pci_release_device(struct pci_dev *pdev) > { > - struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); > struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); > - struct pci_dn *pdn = pci_get_pdn(pdev); > > /* The VF PE state is torn down when sriov_disable() is called */ > if (pdev->is_virtfn) > @@ -3560,16 +3558,6 @@ static void pnv_pci_release_device(struct pci_dev > *pdev) > if (pdev->is_physfn) > kfree(pdev->dev.archdata.iov_data); > > - /* > - * PCI hotplug can happen as part of EEH error recovery. The @pdn > - * isn't removed and added afterwards in this scenario. We should > - * set the PE number in @pdn to an invalid one. Otherwise, the PE's > - * device count is decreased on removing devices while failing to > - * be increased on adding devices. It leads to unbalanced PE's device > - * count and eventually make normal PCI hotplug path broken. > - */ > - pdn->pe_number = IODA_INVALID_PE; > - > WARN_ON(--pe->device_count < 0); > if (pe->device_count == 0) > pnv_ioda_release_pe(pe); > -- Alexey
[PATCH v3] platforms/powernv: Avoid re-registration of imc debugfs directory
export_imc_mode_and_cmd() function which creates the debugfs interface for imc-mode and imc-command, is invoked when each nest pmu units is registered. When the first nest pmu unit is registered, export_imc_mode_and_cmd() creates 'imc' directory under `/debug/powerpc/`. In the subsequent invocations debugfs_create_dir() function returns, since the directory already exists. The recent commit (debugfs: make error message a bit more verbose), throws a warning if we try to invoke `debugfs_create_dir()` with an already existing directory name. Address this warning by making the debugfs directory registration in the opal_imc_counters_probe() function, i.e invoke export_imc_mode_and_cmd() function from the probe function. Signed-off-by: Anju T Sudhakar --- Changes from v2 -> v3: * Invoke export_imc_mode_and_cmd(), which does the imc debugfs directory registration and deletion, from the probe fucntion. * Change the return type of imc_pmu_create() to get the control block address for nest units in the probe function * Remove unnecessary comments --- arch/powerpc/platforms/powernv/opal-imc.c | 39 +-- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index e04b206..3b4518f 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -59,10 +59,6 @@ static void export_imc_mode_and_cmd(struct device_node *node, imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); - /* -* Return here, either because 'imc' directory already exists, -* Or failed to create a new one. -*/ if (!imc_debugfs_parent) return; @@ -135,7 +131,6 @@ static int imc_get_mem_addr_nest(struct device_node *node, } pmu_ptr->imc_counter_mmaped = true; - export_imc_mode_and_cmd(node, pmu_ptr); kfree(base_addr_arr); kfree(chipid_arr); return 0; @@ -151,7 +146,7 @@ static int imc_get_mem_addr_nest(struct device_node *node, * and domain as the inputs. 
* Allocates memory for the struct imc_pmu, sets up its domain, size and offsets */ -static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) +static struct imc_pmu *imc_pmu_create(struct device_node *parent, int pmu_index, int domain) { int ret = 0; struct imc_pmu *pmu_ptr; @@ -159,27 +154,23 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) /* Return for unknown domain */ if (domain < 0) - return -EINVAL; + return NULL; /* memory for pmu */ pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL); if (!pmu_ptr) - return -ENOMEM; + return NULL; /* Set the domain */ pmu_ptr->domain = domain; ret = of_property_read_u32(parent, "size", &pmu_ptr->counter_mem_size); - if (ret) { - ret = -EINVAL; + if (ret) goto free_pmu; - } if (!of_property_read_u32(parent, "offset", &offset)) { - if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) { - ret = -EINVAL; + if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) goto free_pmu; - } } /* Function to register IMC pmu */ @@ -190,14 +181,14 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) if (pmu_ptr->domain == IMC_DOMAIN_NEST) kfree(pmu_ptr->mem_info); kfree(pmu_ptr); - return ret; + return NULL; } - return 0; + return pmu_ptr; free_pmu: kfree(pmu_ptr); - return ret; + return NULL; } static void disable_nest_pmu_counters(void) @@ -254,6 +245,7 @@ int get_max_nest_dev(void) static int opal_imc_counters_probe(struct platform_device *pdev) { struct device_node *imc_dev = pdev->dev.of_node; + struct imc_pmu *pmu; int pmu_count = 0, domain; bool core_imc_reg = false, thread_imc_reg = false; u32 type; @@ -269,6 +261,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev) } for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) { + pmu = NULL; if (of_property_read_u32(imc_dev, "type", &type)) { pr_warn("IMC Device without type property\n"); continue; @@ -293,9 +286,13 @@ static int opal_imc_counters_probe(struct platform_device *pdev) 
break; } - if (!imc_pmu_create(imc_dev, pmu_count, domain)) { - if (domain == IMC_DOMAIN_NEST) + pmu = imc_pmu_create(imc_dev, pmu_count, domain); +
Re: [Very RFC 41/46] powernv/eeh: Remove pdn setup for SR-IOV VFs
On 20/11/2019 12:28, Oliver O'Halloran wrote: > We don't need a pci_dn for the VF any more, so we can skip adding them. Excellent! Reviewed-by: Alexey Kardashevskiy > > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 16 > 1 file changed, 16 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index d111a50fbe68..d3e375d71cdc 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -1526,7 +1526,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, > u16 num_vfs) > for (vf_index = 0; vf_index < num_vfs; vf_index++) { > int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index); > int vf_bus = pci_iov_virtfn_bus(pdev, vf_index); > - struct pci_dn *vf_pdn; > > if (iov->m64_single_mode) > pe_num = iov->pe_num_map[vf_index]; > @@ -1558,15 +1557,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, > u16 num_vfs) > list_add_tail(&pe->list, &phb->ioda.pe_list); > mutex_unlock(&phb->ioda.pe_list_mutex); > > - /* associate this pe to it's pdn */ > - list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) { > - if (vf_pdn->busno == vf_bus && > - vf_pdn->devfn == vf_devfn) { > - vf_pdn->pe_number = pe_num; > - break; > - } > - } > - > pnv_pci_ioda2_setup_dma_pe(phb, pe); > #ifdef CONFIG_IOMMU_API > iommu_register_group(&pe->table_group, > @@ -1688,17 +1678,11 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 > num_vfs) > int pnv_pcibios_sriov_disable(struct pci_dev *pdev) > { > pnv_pci_sriov_disable(pdev); > - > - /* Release PCI data */ > - remove_sriov_vf_pdns(pdev); > return 0; > } > > int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) > { > - /* Allocate PCI data */ > - add_sriov_vf_pdns(pdev); > - > return pnv_pci_sriov_enable(pdev, num_vfs); > } > #endif /* CONFIG_PCI_IOV */ > -- Alexey
Re: [Very RFC 40/46] powernv/npu: Don't drop refcount when looking up GPU pci_devs
On 20/11/2019 12:28, Oliver O'Halloran wrote: > The comment here implies that we don't need to take a ref to the pci_dev > because the ioda_pe will always have one. This implies that the current > expection is that the pci_dev for an NPU device will *never* be torn > down since the ioda_pe having a ref to the device will prevent the > release function from being called. > > In other words, the desired behaviour here appears to be leaking a ref. > > Nice! There is a history: https://patchwork.ozlabs.org/patch/1088078/ We did not fix anything in particular then, we do not seem to be fixing anything now (in other words - we cannot test it in a normal natural way). I'd drop this one. > > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/npu-dma.c | 11 +++ > 1 file changed, 3 insertions(+), 8 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c > b/arch/powerpc/platforms/powernv/npu-dma.c > index 72d3749da02c..2eb6e6d45a98 100644 > --- a/arch/powerpc/platforms/powernv/npu-dma.c > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > @@ -28,15 +28,10 @@ static struct pci_dev *get_pci_dev(struct device_node *dn) > break; > > /* > - * pci_get_domain_bus_and_slot() increased the reference count of > - * the PCI device, but callers don't need that actually as the PE > - * already holds a reference to the device. Since callers aren't > - * aware of the reference count change, call pci_dev_put() now to > - * avoid leaks. > + * NB: for_each_pci_dev() elevates the pci_dev refcount. > + * Caller is responsible for dropping the ref when it's > + * finished with it. >*/ > - if (pdev) > - pci_dev_put(pdev); > - > return pdev; > } > > -- Alexey
Re: [Very RFC 39/46] powernv/npu: Avoid pci_dn when mapping device_node to a pci_dev
On 20/11/2019 12:28, Oliver O'Halloran wrote: > There's no need to use the pci_dn to find a device_node from a pci_dev. > Just search for the node pointed to by the pci_dev's of_node pointer. Reviewed-by: Alexey Kardashevskiy > > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/npu-dma.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c > b/arch/powerpc/platforms/powernv/npu-dma.c > index 68bfaef44862..72d3749da02c 100644 > --- a/arch/powerpc/platforms/powernv/npu-dma.c > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > @@ -21,11 +21,11 @@ > > static struct pci_dev *get_pci_dev(struct device_node *dn) > { > - struct pci_dn *pdn = PCI_DN(dn); > - struct pci_dev *pdev; > + struct pci_dev *pdev = NULL; > > - pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus), > -pdn->busno, pdn->devfn); > + for_each_pci_dev(pdev) > + if (pdev->dev.of_node == dn) > + break; > > /* >* pci_get_domain_bus_and_slot() increased the reference count of > -- Alexey
Re: Bug 205201 - Booting halts if Dawicontrol DC-2976 UW SCSI board installed, unless RAM size limited to 3500M
On Tue, Nov 26, 2019 at 05:40:26PM +0100, Christoph Hellwig wrote: > On Tue, Nov 26, 2019 at 12:26:38PM +0100, Christian Zigotzky wrote: > > Hello Christoph, > > > > The PCI TV card works with your patch! I was able to patch your Git kernel > > with the patch above. > > > > I haven't found any error messages in the dmesg yet. > > Thanks. Unfortunately this is a bit of a hack as we need to set > the mask based on runtime information like the magic FSL PCIe window. > Let me try to draft something better up, and thanks already for testing > this one! Maybe we'll simply force bottom up allocation before calling swiotlb_init()? Anyway, it's the last memblock allocation. diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 62f74b1b33bd..771e6cf7e2b9 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -286,14 +286,15 @@ void __init mem_init(void) /* * book3s is limited to 16 page sizes due to encoding this in * a 4-bit field for slices. */ BUILD_BUG_ON(MMU_PAGE_COUNT > 16); #ifdef CONFIG_SWIOTLB + memblock_set_bottom_up(true); swiotlb_init(0); #endif high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); set_max_mapnr(max_pfn); memblock_free_all(); -- Sincerely yours, Mike.
Re: [PATCH v11 1/7] mm: ksm: Export ksm_madvise()
On Tue, Nov 26, 2019 at 07:59:49PM -0800, Hugh Dickins wrote: > On Mon, 25 Nov 2019, Bharata B Rao wrote: > > > On PEF-enabled POWER platforms that support running of secure guests, > > secure pages of the guest are represented by device private pages > > in the host. Such pages needn't participate in KSM merging. This is > > achieved by using ksm_madvise() call which need to be exported > > since KVM PPC can be a kernel module. > > > > Signed-off-by: Bharata B Rao > > Acked-by: Paul Mackerras > > Cc: Andrea Arcangeli > > Cc: Hugh Dickins > > I can say > Acked-by: Hugh Dickins > to this one. > > But not to your 2/7 which actually makes use of it: because sadly it > needs down_write(&kvm->mm->mmap_sem) for the case when it switches off > VM_MERGEABLE in vma->vm_flags. That's frustrating, since I think it's > the only operation for which down_read() is not good enough. Oh ok! Thanks for pointing this out. > > I have no idea how contended that mmap_sem is likely to be, nor how > many to-be-secured pages that vma is likely to contain: you might find > it okay simply to go with it down_write throughout, or you might want > to start out with it down_read, and only restart with down_write (then > perhaps downgrade_write later) when you see VM_MERGEABLE is set. Using down_write throughout is not easy as we do migrate_vma_pages() from fault path (->migrate_to_ram()) too. Here we come with down_read already held. Starting with down_read and restarting with down_write if VM_MERGEABLE is set -- this also looks a bit difficult as we will have challenges with locking order if we release mmap_sem in between and re-acquire. So I think I will start with down_write in this particular case and will downgrade_write as soon as ksm_madvise() is complete. > > The crash you got (thanks for the link): that will be because your > migrate_vma_pages() had already been applied to a page that was > already being shared via KSM.
> > But if these secure pages are expected to be few and far between, > maybe you'd prefer to keep VM_MERGEABLE, and add per-page checks > of some kind into mm/ksm.c, to skip over these surprising hybrids. I did bail out from a few routines in mm/ksm.c with is_device_private_page(page) check, but that wasn't good enough and I encountered crashes in different code paths. Guess a bit more understanding of KSM internals would be required before retrying that. However since all the pages of the guest except for a few will be turned into secure pages early during boot, it appears better if secure guests don't participate in KSM merging at all. Regards, Bharata.
Re: [Very RFC 38/46] powerpc/pci-hotplug: Scan the whole bus when using PCI_PROBE_NORMAL
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Currently when using the normal (i.e not building pci_dev's from the DT > node) probe method we only scan the devfn corresponding to the first child > of the bridge's DT node. This doesn't make much sense to me, but it seems > to have worked so far. At a guess it seems to work because in a PCIe > environment the first downstream child will be at devfn 00.0. > > In any case it's completely broken when no pci_dn is available. Remove > the PCI_DN checking and scan each of the device number that might be on > the downstream bus. Then why not just use pci_scan_child_bus()? Thanks, > Cc: Benjamin Herrenschmidt > Signed-off-by: Oliver O'Halloran > --- > I'm not sure we should be using pci_scan_slot() directly here. Maybe > there's some insane legacy reason for it. > --- > arch/powerpc/kernel/pci-hotplug.c | 15 --- > 1 file changed, 4 insertions(+), 11 deletions(-) > > diff --git a/arch/powerpc/kernel/pci-hotplug.c > b/arch/powerpc/kernel/pci-hotplug.c > index d6a67f814983..85299c769768 100644 > --- a/arch/powerpc/kernel/pci-hotplug.c > +++ b/arch/powerpc/kernel/pci-hotplug.c > @@ -123,17 +123,10 @@ void pci_hp_add_devices(struct pci_bus *bus) > if (mode == PCI_PROBE_DEVTREE) { > /* use ofdt-based probe */ > of_rescan_bus(dn, bus); > - } else if (mode == PCI_PROBE_NORMAL && > -dn->child && PCI_DN(dn->child)) { > - /* > - * Use legacy probe. In the partial hotplug case, we > - * probably have grandchildren devices unplugged. So > - * we don't check the return value from pci_scan_slot() in > - * order for fully rescan all the way down to pick them up. > - * They can have been removed during partial hotplug. > - */ > - slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); > - pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); > + } else if (mode == PCI_PROBE_NORMAL) { > + for (slotno = 0; slotno < 255; slotno += 8) > + pci_scan_slot(bus, slotno); > + > max = bus->busn_res.start; > /* >* Scan bridges that are already configured. 
We don't touch > -- Alexey
Re: [Very RFC 36/46] powernv/npu: Remove open-coded PE lookup for GPU device
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/npu-dma.c | 13 ++--- > 1 file changed, 2 insertions(+), 11 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c > b/arch/powerpc/platforms/powernv/npu-dma.c > index b95b9e3c4c98..68bfaef44862 100644 > --- a/arch/powerpc/platforms/powernv/npu-dma.c > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > @@ -97,25 +97,16 @@ EXPORT_SYMBOL(pnv_pci_get_npu_dev); > static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, > struct pci_dev **gpdev) > { > - struct pnv_phb *phb; > - struct pci_controller *hose; > struct pci_dev *pdev; > struct pnv_ioda_pe *pe; > - struct pci_dn *pdn; > > pdev = pnv_pci_get_gpu_dev(npe->pdev); > if (!pdev) > return NULL; > > - pdn = pci_get_pdn(pdev); > - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) > - return NULL; > - > - hose = pci_bus_to_host(pdev->bus); > - phb = hose->private_data; > - pe = &phb->ioda.pe_array[pdn->pe_number]; > + pe = pnv_ioda_get_pe(pdev); > > - if (gpdev) > + if (pe && pdev) s/pdev/gpdev/ > *gpdev = pdev; > > return pe; > -- Alexey
Re: [PATCH v2 29/35] powerpc/perf: remove current_is_64bit()
Le 26/11/2019 à 21:13, Michal Suchanek a écrit : Since commit ed1cd6deb013 ("powerpc: Activate CONFIG_THREAD_INFO_IN_TASK") current_is_64bit() is equivalent to !is_32bit_task(). Remove the redundant function. Link: https://github.com/linuxppc/issues/issues/275 Link: https://lkml.org/lkml/2019/9/12/540 Fixes: linuxppc#275 Suggested-by: Christophe Leroy Signed-off-by: Michal Suchanek This change is already in powerpc/next, see https://github.com/linuxppc/linux/commit/42484d2c0f82b666292faf6668c77b49a3a04bc0 Christophe --- arch/powerpc/perf/callchain.c | 17 + 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index c84bbd4298a0..35d542515faf 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -284,16 +284,6 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, } } -static inline int current_is_64bit(void) -{ - /* -* We can't use test_thread_flag() here because we may be on an -* interrupt stack, and the thread flags don't get copied over -* from the thread_info on the main stack to the interrupt stack. -*/ - return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT); -} - #else /* CONFIG_PPC64 */ /* * On 32-bit we just access the address and let hash_page create a @@ -321,11 +311,6 @@ static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry { } -static inline int current_is_64bit(void) -{ - return 0; -} - static inline int valid_user_sp(unsigned long sp, int is_64) { if (!sp || (sp & 7) || sp > TASK_SIZE - 32) @@ -486,7 +471,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - if (current_is_64bit()) + if (!is_32bit_task()) perf_callchain_user_64(entry, regs); else perf_callchain_user_32(entry, regs);
Re: [Very RFC 35/46] powernv/pci: Remove open-coded PE lookup in pnv_pci_release_device
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 5 ++--- > 1 file changed, 2 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index 4f38652c7cd7..8525642b1256 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -3562,14 +3562,14 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe > *pe) > static void pnv_pci_release_device(struct pci_dev *pdev) > { > struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); > + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); > struct pci_dn *pdn = pci_get_pdn(pdev); > - struct pnv_ioda_pe *pe; > > /* The VF PE state is torn down when sriov_disable() is called */ > if (pdev->is_virtfn) > return; > > - if (!pdn || pdn->pe_number == IODA_INVALID_PE) > + if (WARN_ON(!pe)) Is that WARN_ON because there is always a PE - from upstream bridge or a reserved one? > return; > > /* > @@ -3588,7 +3588,6 @@ static void pnv_pci_release_device(struct pci_dev *pdev) >* be increased on adding devices. It leads to unbalanced PE's device >* count and eventually make normal PCI hotplug path broken. >*/ > - pe = &phb->ioda.pe_array[pdn->pe_number]; > pdn->pe_number = IODA_INVALID_PE; > > WARN_ON(--pe->device_count < 0); > -- Alexey
Re: [Very RFC 34/46] powernv/pci: Remove open-coded PE lookup in pnv_pci_enable_device_hook()
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy but better squash it. > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 7 +-- > 1 file changed, 1 insertion(+), 6 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index 7e88de18ead6..4f38652c7cd7 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -3382,7 +3382,6 @@ static resource_size_t > pnv_pci_iov_resource_alignment(struct pci_dev *pdev, > static bool pnv_pci_enable_device_hook(struct pci_dev *dev) > { > struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); > - struct pci_dn *pdn; > > /* The function is probably called while the PEs have >* not be created yet. For example, resource reassignment > @@ -3392,11 +3391,7 @@ static bool pnv_pci_enable_device_hook(struct pci_dev > *dev) > if (!phb->initialized) > return true; > > - pdn = pci_get_pdn(dev); > - if (!pdn || pdn->pe_number == IODA_INVALID_PE) > - return false; > - > - return true; > + return !!pnv_ioda_get_pe(dev); > } > > static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group, > -- Alexey
Re: [Very RFC 33/46] powernv/pci: Remove open-coded PE lookup in iommu notifier
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/pci.c | 9 - > 1 file changed, 4 insertions(+), 5 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci.c > b/arch/powerpc/platforms/powernv/pci.c > index 5b1f4677cdce..0eeea8652426 100644 > --- a/arch/powerpc/platforms/powernv/pci.c > +++ b/arch/powerpc/platforms/powernv/pci.c > @@ -943,23 +943,22 @@ static int pnv_tce_iommu_bus_notifier(struct > notifier_block *nb, > { > struct device *dev = data; > struct pci_dev *pdev; > - struct pci_dn *pdn; > struct pnv_ioda_pe *pe; > struct pnv_phb *phb; > > switch (action) { > case BUS_NOTIFY_ADD_DEVICE: > pdev = to_pci_dev(dev); > - pdn = pci_get_pdn(pdev); > phb = pci_bus_to_pnvhb(pdev->bus); > > WARN_ON_ONCE(!phb); > - if (!pdn || pdn->pe_number == IODA_INVALID_PE || !phb) > + if (!phb) > return 0; This check is weird - the function does not use @phb anymore, it would make more sense if pnv_ioda_get_pe() checked phb!=NULL. > > - pe = &phb->ioda.pe_array[pdn->pe_number]; > - if (!pe->table_group.group) > + pe = pnv_ioda_get_pe(pdev); > + if (!pe || !pe->table_group.group) > return 0; > + > iommu_add_device(&pe->table_group, dev); > return 0; > case BUS_NOTIFY_DEL_DEVICE: > -- Alexey
Re: [Very RFC 32/46] powernv/pci: Remove open-coded PE lookup in iommu_bypass_supported()
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy but honestly can be squashed into 31/46 or/and 33/46 or other similar patches. > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 6 ++ > 1 file changed, 2 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index 98d858999a2d..7e88de18ead6 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -1801,13 +1801,11 @@ static bool > pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, > u64 dma_mask) > { > struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); > - struct pci_dn *pdn = pci_get_pdn(pdev); > - struct pnv_ioda_pe *pe; > + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); > > - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) > + if (WARN_ON(!pe)) > return false; > > - pe = &phb->ioda.pe_array[pdn->pe_number]; > if (pe->tce_bypass_enabled) { > u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; > if (dma_mask >= top) > -- Alexey
Re: [PATCH v3 2/4] powerpc/fadump: reorganize /sys/kernel/fadump_* sysfs files
On 11/25/19 12:10 AM, Michal Suchánek wrote: > On Sat, Nov 16, 2019 at 08:07:29PM +0530, Sourabh Jain wrote: >> >> >> On 11/9/19 6:29 PM, Michal Suchánek wrote: >>> On Sat, Nov 09, 2019 at 05:53:37PM +0530, Sourabh Jain wrote: As the number of FADump sysfs files increases it is hard to manage all of them inside /sys/kernel directory. It's better to have all the FADump related sysfs files in a dedicated directory /sys/kernel/fadump. But in order to maintain the backward compatibility the /sys/kernel/fadump_* sysfs files are replicated inside /sys/kernel/fadump/ and eventually get removed in future. As the FADump sysfs files are now part of dedicated directory there is no need to prefix their name with fadump_, hence sysfs file names are also updated. For example fadump_enabled sysfs file is now referred as enabled. Also consolidate ABI documentation for all the FADump sysfs files in a single file Documentation/ABI/testing/sysfs-kernel-fadump. Signed-off-by: Sourabh Jain --- Documentation/ABI/testing/sysfs-kernel-fadump | 41 +++ arch/powerpc/kernel/fadump.c | 38 + arch/powerpc/platforms/powernv/opal-core.c| 10 +++-- 3 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-fadump diff --git a/Documentation/ABI/testing/sysfs-kernel-fadump b/Documentation/ABI/testing/sysfs-kernel-fadump new file mode 100644 index ..a77f1a5ba389 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-fadump @@ -0,0 +1,41 @@ +What: /sys/kernel/fadump/* +Date: Nov 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: + The /sys/kernel/fadump/* is a collection of FADump sysfs + file provide information about the configuration status + of Firmware Assisted Dump (FADump). + +What: /sys/kernel/fadump/enabled +Date: Nov 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Primarily used to identify whether the FADump is enabled in + the kernel or not. 
+User: Kdump service + +What: /sys/kernel/fadump/registered +Date: Nov 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read/write + Helps to control the dump collect feature from userspace. + Setting 1 to this file enables the system to collect the + dump and 0 to disable it. +User: Kdump service + +What: /sys/kernel/fadump/release_mem +Date: Nov 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: write only + This is a special sysfs file and only available when + the system is booted to capture the vmcore using FADump. + It is used to release the memory reserved by FADump to + save the crash dump. + +What: /sys/kernel/fadump/release_opalcore +Date: Nov 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: write only + The sysfs file is available when the system is booted to + collect the dump on OPAL based machine. It used to release + the memory used to collect the opalcore. diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index ed59855430b9..a9591def0c84 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1418,6 +1418,9 @@ static int fadump_region_show(struct seq_file *m, void *private) return 0; } +struct kobject *fadump_kobj; +EXPORT_SYMBOL_GPL(fadump_kobj); + static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem, 0200, NULL, fadump_release_memory_store); @@ -1428,6 +1431,16 @@ static struct kobj_attribute fadump_register_attr = __ATTR(fadump_registered, 0644, fadump_register_show, fadump_register_store); +static struct kobj_attribute release_attr = __ATTR(release_mem, + 0200, NULL, + fadump_release_memory_store); +static struct kobj_attribute enable_attr = __ATTR(enabled, + 0444, fadump_enabled_show, + NULL); +static struct kobj_attribute register_attr = __ATTR(registered, +
Re: [Very RFC 31/46] powernv/pci: Remove open-coded PE lookup in pnv_pci_ioda_dma_dev_setup()
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Use the helper to look up the pnv_ioda_pe for the device we're configuring DMA > for. In the VF case there's no need set pdn->pe_number since nothing looks at > it any more. > > Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 3 +-- > 1 file changed, 1 insertion(+), 2 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index d4b5ee926222..98d858999a2d 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -1709,10 +1709,9 @@ int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 > num_vfs) > > static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev > *pdev) > { > - struct pci_dn *pdn = pci_get_pdn(pdev); > struct pnv_ioda_pe *pe; > > - pe = &phb->ioda.pe_array[pdn->pe_number]; > + pe = pnv_ioda_get_pe(pdev); > WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); > pdev->dev.archdata.dma_offset = pe->tce_bypass_base; > set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); > -- Alexey
Re: [Very RFC 30/46] powernv/pci: Remove open-coded PE lookup in PELT-V teardown
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 12 +++- > 1 file changed, 7 insertions(+), 5 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index 5bd7c1b058da..d4b5ee926222 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -853,11 +853,13 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, > struct pnv_ioda_pe *pe) > > /* Release from all parents PELT-V */ > while (parent) { > - struct pci_dn *pdn = pci_get_pdn(parent); > - if (pdn && pdn->pe_number != IODA_INVALID_PE) { > - rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, > - pe->pe_number, > OPAL_REMOVE_PE_FROM_DOMAIN); > - /* XXX What to do in case of error ? */ May be print a warning, like a few lines below (in the code, not in the patch). Not important though if gcc does not complain about an unused returned value. Reviewed-by: Alexey Kardashevskiy > + struct pnv_ioda_pe *parent_pe = pnv_ioda_get_pe(parent); > + > + if (parent_pe) { > + rc = opal_pci_set_peltv(phb->opal_id, > + parent_pe->pe_number, > + pe->pe_number, > + OPAL_REMOVE_PE_FROM_DOMAIN); > } > parent = parent->bus->self; > } > -- Alexey
Re: [Very RFC 29/46] powernv/pci: Remove open-coded PE lookup in PELT-V setup
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 32 +-- > 1 file changed, 24 insertions(+), 8 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index 1c90feed233d..5bd7c1b058da 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -760,6 +760,11 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, > } > } > > + /* > + * Walk the bridges up to the root. Along the way mark this PE as > + * downstream of the bridge PE(s) so that errors upstream errors Too many "errors" in "errors upstream errors". Otherwise Reviewed-by: Alexey Kardashevskiy > + * also cause this PE to be frozen. > + */ > if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) > pdev = pe->pbus->self; > else if (pe->flags & PNV_IODA_PE_DEV) > @@ -768,16 +773,27 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, > else if (pe->flags & PNV_IODA_PE_VF) > pdev = pe->parent_dev; > #endif /* CONFIG_PCI_IOV */ > + > while (pdev) { > - struct pci_dn *pdn = pci_get_pdn(pdev); > - struct pnv_ioda_pe *parent; > + struct pnv_ioda_pe *parent = pnv_ioda_get_pe(pdev); > > - if (pdn && pdn->pe_number != IODA_INVALID_PE) { > - parent = &phb->ioda.pe_array[pdn->pe_number]; > - ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add); > - if (ret) > - return ret; > - } > + /* > + * FIXME: This is called from pcibios_setup_bridge(), which is > called > + * from the bottom (leaf) bridge to the root. This means that > this > + * doesn't actually setup the PELT-V entries since the PEs for > + * the bridges above assigned after this is run for the leaf. > + * > + * FIXMEFIXME: might not be true since moving PE configuration > + * into pcibios_bus_add_device(). 
> + */ > + if (!parent) > + break; > + > + WARN_ON(!parent || parent->pe_number == IODA_INVALID_PE); > + > + ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add); > + if (ret) > + return ret; > > pdev = pdev->bus->self; > } > -- Alexey
Re: [Very RFC 28/46] powernv/iov: Move SR-IOV PF state out of pci_dn
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Move the SR-IOV into a platform specific structure. I'm sure stashing all the > SR-IOV state in pci_dn seemed like a good idea at the time, but it results in > a > lot of powernv specifics being leaked out of the platform directory. > > Moving all the PHB3/4 specific M64 BAR wrangling into a PowerNV specific > structure helps to clarify the role of pci_dn and ensures that the platform > specifics stay that way. > > This will make the code easier to understand and modify since we don't need > to so much aboute PowerNV changes breaking pseries and EEH, and vis-a-vis. > > Signed-off-by: Oliver O'Halloran > --- > TODO: Remove all the sriov stuff from pci_dn. We can't do that yet because > the pseries SRIOV support was a giant hack that re-used some of the > previously powernv specific fields. > --- > arch/powerpc/include/asm/device.h | 3 + > arch/powerpc/platforms/powernv/pci-ioda.c | 199 -- > arch/powerpc/platforms/powernv/pci.h | 36 > 3 files changed, 148 insertions(+), 90 deletions(-) > > diff --git a/arch/powerpc/include/asm/device.h > b/arch/powerpc/include/asm/device.h > index 266542769e4b..4d8934db7ef5 100644 > --- a/arch/powerpc/include/asm/device.h > +++ b/arch/powerpc/include/asm/device.h > @@ -49,6 +49,9 @@ struct dev_archdata { > #ifdef CONFIG_CXL_BASE > struct cxl_context *cxl_ctx; > #endif > +#ifdef CONFIG_PCI_IOV > + void *iov_data; > +#endif > }; > > struct pdev_archdata { > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index a1c9315f3208..1c90feed233d 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -966,14 +966,15 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, > struct pnv_ioda_pe *pe) > #ifdef CONFIG_PCI_IOV > static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) > { > - struct pci_dn *pdn = pci_get_pdn(dev); > - int i; > struct resource *res, res2; > + struct 
pnv_iov_data *iov; > resource_size_t size; > u16 num_vfs; > + int i; > > if (!dev->is_physfn) > return -EINVAL; > + iov = pnv_iov_get(dev); > > /* >* "offset" is in VFs. The M64 windows are sized so that when they > @@ -983,7 +984,7 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, > int offset) >* separate PE, and changing the IOV BAR start address changes the >* range of PEs the VFs are in. >*/ > - num_vfs = pdn->num_vfs; > + num_vfs = iov->num_vfs; > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > res = &dev->resource[i + PCI_IOV_RESOURCES]; > if (!res->flags || !res->parent) > @@ -1029,19 +1030,19 @@ static int pnv_pci_vf_resource_shift(struct pci_dev > *dev, int offset) >num_vfs, offset); > > if (offset < 0) { > - devm_release_resource(&dev->dev, &pdn->holes[i]); > - memset(&pdn->holes[i], 0, sizeof(pdn->holes[i])); > + devm_release_resource(&dev->dev, &iov->holes[i]); > + memset(&iov->holes[i], 0, sizeof(iov->holes[i])); > } > > pci_update_resource(dev, i + PCI_IOV_RESOURCES); > > if (offset > 0) { > - pdn->holes[i].start = res2.start; > - pdn->holes[i].end = res2.start + size * offset - 1; > - pdn->holes[i].flags = IORESOURCE_BUS; > - pdn->holes[i].name = "pnv_iov_reserved"; > + iov->holes[i].start = res2.start; > + iov->holes[i].end = res2.start + size * offset - 1; > + iov->holes[i].flags = IORESOURCE_BUS; > + iov->holes[i].name = "pnv_iov_reserved"; > devm_request_resource(&dev->dev, res->parent, > - &pdn->holes[i]); > + &iov->holes[i]); > } > } > return 0; > @@ -1273,37 +1274,37 @@ static void pnv_pci_ioda_setup_PEs(void) > #ifdef CONFIG_PCI_IOV > static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) > { > + struct pnv_iov_data *iov; > struct pnv_phb*phb; > - struct pci_dn *pdn; > inti, j; > intm64_bars; > > phb = pci_bus_to_pnvhb(pdev->bus); > - pdn = pci_get_pdn(pdev); > + iov = pnv_iov_get(pdev); > > - if (pdn->m64_single_mode) > + if (iov->m64_single_mode) > m64_bars = num_vfs; > else > m64_bars = 1; > > for (i = 0; i < 
PCI_SRIOV_NUM_BARS; i++) > for (j = 0; j < m64_bars; j++) { > - if (pdn->m64_map[j][i] == IODA_INVALID_M64) > + if (iov->m64_m
Re: [PATCH v11 1/7] mm: ksm: Export ksm_madvise()
On Mon, 25 Nov 2019, Bharata B Rao wrote: > On PEF-enabled POWER platforms that support running of secure guests, > secure pages of the guest are represented by device private pages > in the host. Such pages needn't participate in KSM merging. This is > achieved by using ksm_madvise() call which need to be exported > since KVM PPC can be a kernel module. > > Signed-off-by: Bharata B Rao > Acked-by: Paul Mackerras > Cc: Andrea Arcangeli > Cc: Hugh Dickins I can say Acked-by: Hugh Dickins to this one. But not to your 2/7 which actually makes use of it: because sadly it needs down_write(&kvm->mm->mmap_sem) for the case when it switches off VM_MERGEABLE in vma->vm_flags. That's frustrating, since I think it's the only operation for which down_read() is not good enough. I have no idea how contended that mmap_sem is likely to be, nor how many to-be-secured pages that vma is likely to contain: you might find it okay simply to go with it down_write throughout, or you might want to start out with it down_read, and only restart with down_write (then perhaps downgrade_write later) when you see VM_MERGEABLE is set. The crash you got (thanks for the link): that will be because your migrate_vma_pages() had already been applied to a page that was already being shared via KSM. But if these secure pages are expected to be few and far between, maybe you'd prefer to keep VM_MERGEABLE, and add per-page checks of some kind into mm/ksm.c, to skip over these surprising hybrids. Hugh > --- > mm/ksm.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/mm/ksm.c b/mm/ksm.c > index dbee2eb4dd05..e45b02ad3f0b 100644 > --- a/mm/ksm.c > +++ b/mm/ksm.c > @@ -2478,6 +2478,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned > long start, > > return 0; > } > +EXPORT_SYMBOL_GPL(ksm_madvise); > > int __ksm_enter(struct mm_struct *mm) > { > -- > 2.21.0
Re: [Very RFC 27/46] powernv/pci: Clear reserved PE freezes
On 20/11/2019 12:28, Oliver O'Halloran wrote: > When we scan an empty slot the PHB gets an Unsupported Request from the > downstream bridge when there's no device present at that BDFN. Some older > PHBs (p7-IOC) don't allow further config space accesses while the PE is > frozen, so clear it here without bothering with the diagnostic log. This executes when EEH is not enabled (rather unsupported case) and the patch allegedly extends support of some P7 none of which was ever supported by the powernv platform, or was/is it? Thanks, > > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/pci.c | 13 + > 1 file changed, 13 insertions(+) > > diff --git a/arch/powerpc/platforms/powernv/pci.c > b/arch/powerpc/platforms/powernv/pci.c > index 36eea4bb514c..5b1f4677cdce 100644 > --- a/arch/powerpc/platforms/powernv/pci.c > +++ b/arch/powerpc/platforms/powernv/pci.c > @@ -642,6 +642,19 @@ static void pnv_pci_config_check_eeh(struct pnv_phb > *phb, u16 bdfn) > if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE || > fstate == OPAL_EEH_STOPPED_DMA_FREEZE || > fstate == OPAL_EEH_STOPPED_MMIO_DMA_FREEZE) { > + > + /* > + * Scanning an empty slot will result in a freeze on the > reserved PE. > + * > + * Some old and bad PHBs block config space access to frozen > PEs in > + * addition to MMIOs, so unfreeze it here. > + */ > + if (pe_no == phb->ioda.reserved_pe_idx) { > + phb->unfreeze_pe(phb, phb->ioda.reserved_pe_idx, > + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); > + return; > + } > + > /* >* If PHB supports compound PE, freeze it for >* consistency. > -- Alexey
Re: [Very RFC 26/46] powernv/pci: Remove pdn from pnv_pci_cfg_{read|write}
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Remove the use of pci_dn from the low-level config space access functions. > These are used by the eeh's config ops and the bus config ops that we > provide to the PCI core. > > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/eeh-powernv.c | 14 +++ > arch/powerpc/platforms/powernv/pci.c | 26 > arch/powerpc/platforms/powernv/pci.h | 6 ++--- > 3 files changed, 16 insertions(+), 30 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c > b/arch/powerpc/platforms/powernv/eeh-powernv.c > index 49a932ff092a..8a73bc7517c5 100644 > --- a/arch/powerpc/platforms/powernv/eeh-powernv.c > +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c > @@ -331,31 +331,25 @@ static inline bool pnv_eeh_cfg_blocked(struct eeh_dev > *edev) > static int pnv_eeh_read_config(struct eeh_dev *edev, > int where, int size, u32 *val) > { > - struct pci_dn *pdn = eeh_dev_to_pdn(edev); > - > - if (!pdn) > - return PCIBIOS_DEVICE_NOT_FOUND; > + struct pnv_phb *phb = edev->controller->private_data; > > if (pnv_eeh_cfg_blocked(edev)) { > *val = 0x; > return PCIBIOS_SET_FAILED; > } > > - return pnv_pci_cfg_read(pdn, where, size, val); > + return pnv_pci_cfg_read(phb, edev->bdfn, where, size, val); > } > > static int pnv_eeh_write_config(struct eeh_dev *edev, > int where, int size, u32 val) > { > - struct pci_dn *pdn = eeh_dev_to_pdn(edev); > - > - if (!pdn) > - return PCIBIOS_DEVICE_NOT_FOUND; > + struct pnv_phb *phb = edev->controller->private_data; > > if (pnv_eeh_cfg_blocked(edev)) > return PCIBIOS_SET_FAILED; > > - return pnv_pci_cfg_write(pdn, where, size, val); > + return pnv_pci_cfg_write(phb, edev->bdfn, where, size, val); > } > > static struct eeh_pe *pnv_eeh_pe_get_parent(struct pci_dev *pdev) > diff --git a/arch/powerpc/platforms/powernv/pci.c > b/arch/powerpc/platforms/powernv/pci.c > index 50142ff045ac..36eea4bb514c 100644 > --- a/arch/powerpc/platforms/powernv/pci.c > +++ 
b/arch/powerpc/platforms/powernv/pci.c > @@ -654,11 +654,9 @@ static void pnv_pci_config_check_eeh(struct pnv_phb > *phb, u16 bdfn) > } > } > > -int pnv_pci_cfg_read(struct pci_dn *pdn, > +int pnv_pci_cfg_read(struct pnv_phb *phb, u16 bdfn, >int where, int size, u32 *val) > { > - struct pnv_phb *phb = pdn->phb->private_data; > - u32 bdfn = (pdn->busno << 8) | pdn->devfn; > s64 rc; > > switch (size) { > @@ -685,19 +683,16 @@ int pnv_pci_cfg_read(struct pci_dn *pdn, > return PCIBIOS_FUNC_NOT_SUPPORTED; > } > > - pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n", > - __func__, pdn->busno, pdn->devfn, where, size, *val); > + pr_devel("%s: bdfn: %x +%x/%x -> %08x\n", > + __func__, bdfn, where, size, *val); > return PCIBIOS_SUCCESSFUL; > } > > -int pnv_pci_cfg_write(struct pci_dn *pdn, > +int pnv_pci_cfg_write(struct pnv_phb *phb, u16 bdfn, > int where, int size, u32 val) > { > - struct pnv_phb *phb = pdn->phb->private_data; > - u32 bdfn = (pdn->busno << 8) | pdn->devfn; > - > - pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n", > - __func__, pdn->busno, pdn->devfn, where, size, val); > + pr_devel("%s: bdfn: %x +%x/%x -> %08x\n", > + __func__, bdfn, where, size, val); > switch (size) { > case 1: > opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); > @@ -753,12 +748,11 @@ static int pnv_pci_read_config(struct pci_bus *bus, > if (!pdn) > return PCIBIOS_DEVICE_NOT_FOUND; > > - edev = pdn_to_eeh_dev(pdn); > + edev = pnv_eeh_find_edev(phb, bdfn); > if (!pnv_eeh_pre_cfg_check(edev)) > return PCIBIOS_DEVICE_NOT_FOUND; > > - ret = pnv_pci_cfg_read(pdn, where, size, val); > - phb = pdn->phb->private_data; > + ret = pnv_pci_cfg_read(phb, bdfn, where, size, val); > if (phb->flags & PNV_PHB_FLAG_EEH && edev) { > if (*val == EEH_IO_ERROR_VALUE(size) && > eeh_dev_check_failure(edev)) > @@ -784,11 +778,11 @@ static int pnv_pci_write_config(struct pci_bus *bus, > if (!pdn) > return PCIBIOS_DEVICE_NOT_FOUND; > > - edev = pdn_to_eeh_dev(pdn); > + edev = pnv_eeh_find_edev(phb, 
bdfn); > if (!pnv_eeh_pre_cfg_check(edev)) > return PCIBIOS_DEVICE_NOT_FOUND; > > - ret = pnv_pci_cfg_write(pdn, where, size, val); > + ret = pnv_pci_cfg_write(phb, bdfn, where, size, val); > > if (!(phb->flags & PNV_PHB_FLAG_EEH)) > pnv_pci_config_check_eeh(phb, bdfn); > diff --git a/arch/powerpc/platforms/powernv/pci.h > b/arch/powerpc/platforms/powernv/pci.h > index b
Re: [Very RFC 22/46] powernv/eeh: Allocate eeh_dev's when needed
On 25/11/2019 15:26, Oliver O'Halloran wrote: > On Mon, Nov 25, 2019 at 2:27 PM Alexey Kardashevskiy wrote: >> >> >> >> On 20/11/2019 12:28, Oliver O'Halloran wrote: >>> Have the PowerNV EEH backend allocate the eeh_dev if needed rather than >>> using >>> the one attached to the pci_dn. >> >> So that pci_dn attached one is leaked then? > > Sorta, the eeh_dev attached to the pci_dn is supposed to have the same > lifetime as the pci_dn it's attached to. Whatever frees the pci_dn > should also be freeing the eeh_dev, but I'm pretty sure the only > situation where that actually happens is when removing the pci_dn for > VFs. Oh, that's lovely. add_sriov_vf_pdns() calls eeh_dev_init() to allocate @edev but remove_sriov_vf_pdns() does kfree(edev) by itself. > It's bad. No sh*t :) > >>> This gets us most of the way towards decoupling >>> pci_dn from the PowerNV EEH code. >>> >>> Signed-off-by: Oliver O'Halloran >>> --- >>> We should probably be free()ing the eeh_dev somewhere. The pci_dev release >>> function is the right place for it. 
>>> --- >>> arch/powerpc/platforms/powernv/eeh-powernv.c | 22 >>> 1 file changed, 18 insertions(+), 4 deletions(-) >>> >>> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c >>> b/arch/powerpc/platforms/powernv/eeh-powernv.c >>> index 1cd80b35..7aba18e08996 100644 >>> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c >>> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c >>> @@ -366,10 +366,9 @@ static int pnv_eeh_write_config(struct eeh_dev *edev, >>> */ >>> static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev) >>> { >>> - struct pci_dn *pdn = pci_get_pdn(pdev); >>> - struct pci_controller *hose = pdn->phb; >>> - struct pnv_phb *phb = hose->private_data; >>> - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); >>> + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); >>> + struct pci_controller *hose = phb->hose; >>> + struct eeh_dev *edev; >>> uint32_t pcie_flags; >>> int ret; >>> int config_addr = (pdev->bus->number << 8) | (pdev->devfn); >>> @@ -415,12 +414,27 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct >>> pci_dev *pdev) >>> if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) >>> return NULL; >>> >>> + /* otherwise allocate and initialise a new eeh_dev */ >>> + edev = kzalloc(sizeof(*edev), GFP_KERNEL); >>> + if (!edev) { >>> + pr_err("%s: out of memory lol\n", __func__); >> >> "lol"? > > yeah lol "unprofessional" is the word for this ;) > > I am pretty sure we do not have to print anything if alloc failed >> as alloc prints an error anyway. Thanks, > > It does? Neat. Well, it is this: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/coding-style.rst#n878 === These generic allocation functions all emit a stack dump on failure when used without __GFP_NOWARN so there is no use in emitting an additional failure message when NULL is returned. === More than a printk. A small detail though. -- Alexey
Re: [PATCH 00/14] powerpc/vas: Page fault handling for user space NX requests
Hi Haren, On 27/11/19 12:00 pm, Haren Myneni wrote: Haren Myneni (14): powerpc/vas: Describe vas-port and interrupts properties Revert "powerpc/powernv: remove the unused vas_win_paste_addr and vas_win_id functions" powerpc/vas: Define nx_fault_stamp in coprocessor_request_block powerpc/vas: Setup IRQ mapping and register port for each window powerpc/vas: Setup fault window per VAS instance powerpc/VAS: Setup fault handler per VAS instance powerpc/vas: Read and process fault CRBs powerpc/vas: Take reference to PID and mm for user space windows powerpc/vas: Update CSB and notify process for fault CRBs powerpc/vas: Print CRB and FIFO values powerpc/vas: Do not use default credits for receive window powerpc/VAS: Return credits after handling fault powerpc/vas: Display process stuck message powerpc/vas: Free send window in VAS instance after credits returned In future, please send the patches in reply to the cover letter (and for series that don't have a cover letter, send patch 2 onwards as a reply to patch 1). You may want to consider using git send-email which automates all this for you. Thanks, -- Andrew Donnellan OzLabs, ADL Canberra a...@linux.ibm.com IBM Australia Limited
[PATCH 14/14] powerpc/vas: Free send window in VAS instance after credits returned
NX may still be processing requests while the window is being closed. Wait until all credits are returned and only then free the send window from the VAS instance. Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/vas-window.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 9ba354c..244952d7 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1319,14 +1319,14 @@ int vas_win_close(struct vas_window *window) unmap_paste_region(window); - clear_vinst_win(window); - poll_window_busy_state(window); unpin_close_window(window); poll_window_credits(window); + clear_vinst_win(window); + poll_window_castout(window); /* if send window, drop reference to matching receive window */ -- 1.8.3.1
[PATCH 13/14] powerpc/vas: Display process stuck message
Process can not close send window until all requests are processed. Means wait until window state is not busy and send credits are returned. Display debug message in case taking longer to close the window. Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/vas-window.c | 26 ++ 1 file changed, 26 insertions(+) diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 1c0788c..9ba354c 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1186,6 +1186,7 @@ static void poll_window_credits(struct vas_window *window) { u64 val; int creds, mode; + int count = 0; val = read_hvwc_reg(window, VREG(WINCTL)); if (window->tx_win) @@ -1204,10 +1205,25 @@ static void poll_window_credits(struct vas_window *window) creds = GET_FIELD(VAS_LRX_WCRED, val); } + /* +* Takes around few microseconds to complete all pending requests +* and return credits. +* TODO: Issue CRB Kill to stop all pending requests. Need only +* if there is a bug in NX or fault handling in kernel. +*/ if (creds < window->wcreds_max) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(msecs_to_jiffies(10)); + count++; + /* +* Process can not close send window until all credits are +* returned. +*/ + if (!(count % 1)) + pr_debug("%s() pid %d stuck? retries %d\n", __func__, + vas_window_pid(window), count); + goto retry; } } @@ -1221,6 +1237,7 @@ static void poll_window_busy_state(struct vas_window *window) { int busy; u64 val; + int count = 0; retry: val = read_hvwc_reg(window, VREG(WIN_STATUS)); @@ -1229,6 +1246,15 @@ static void poll_window_busy_state(struct vas_window *window) val = 0; set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(msecs_to_jiffies(5)); + count++; + /* +* Takes around 5 microseconds to process all pending +* requests. +*/ + if (!(count % 1)) + pr_debug("%s() pid %d stuck? 
retries %d\n", __func__, + vas_window_pid(window), count); + goto retry; } } -- 1.8.3.1
[PATCH 12/14] powerpc/VAS: Return credits after handling fault
NX expects OS to return credit for send window after processing each fault. Also credit has to be returned even for fault window. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/vas-fault.c | 9 + arch/powerpc/platforms/powernv/vas-window.c | 17 + arch/powerpc/platforms/powernv/vas.h| 1 + 3 files changed, 27 insertions(+) diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c index ad594c8..2a3ee9f 100644 --- a/arch/powerpc/platforms/powernv/vas-fault.c +++ b/arch/powerpc/platforms/powernv/vas-fault.c @@ -244,6 +244,10 @@ static void process_fault_crbs(struct vas_instance *vinst) memset(fifo, 0, CRB_SIZE); mutex_unlock(&vinst->mutex); + /* +* Return credit for the fault window. +*/ + vas_return_credit(vinst->fault_win, 0); pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d pending %d\n", vinst->vas_id, vinst->fault_fifo, fifo, vinst->fault_crbs, @@ -270,6 +274,11 @@ static void process_fault_crbs(struct vas_instance *vinst) } update_csb(window, crb); + /* +* Return credit for send window after processing +* fault CRB. +*/ + vas_return_credit(window, 1); } while (true); } diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index ca208a3..1c0788c 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1323,6 +1323,23 @@ int vas_win_close(struct vas_window *window) EXPORT_SYMBOL_GPL(vas_win_close); /* + * Return credit for the given window. + */ +void vas_return_credit(struct vas_window *window, bool tx) +{ + uint64_t val; + + val = 0ULL; + if (tx) { /* send window */ + val = SET_FIELD(VAS_TX_WCRED, val, 1); + write_hvwc_reg(window, VREG(TX_WCRED_ADDER), val); + } else { + val = SET_FIELD(VAS_LRX_WCRED, val, 1); + write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), val); + } +} + +/* * Return a system-wide unique window id for the window @win. 
*/ u32 vas_win_id(struct vas_window *win) diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 75bea1d..b8b90f3 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -421,6 +421,7 @@ struct vas_winctx { extern void vas_wakeup_fault_handler(int virq, void *arg); extern int vas_setup_fault_handler(struct vas_instance *vinst); extern void vas_cleanup_fault_handler(struct vas_instance *vinst); +extern void vas_return_credit(struct vas_window *window, bool tx); extern struct vas_window *vas_pswid_to_window(struct vas_instance *vinst, uint32_t pswid); -- 1.8.3.1
[PATCH 11/14] powerpc/vas: Do not use default credits for receive window
The system checkstops if the RxFIFO overruns with more requests than the maximum possible number of CRBs allowed in the FIFO at any time. So the max credits value (rxattr.wcreds_max) is set and is passed to vas_rx_win_open() by the driver. Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/vas-window.c | 4 ++-- arch/powerpc/platforms/powernv/vas.h| 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index ad3104c..ca208a3 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -782,7 +782,7 @@ static bool rx_win_args_valid(enum vas_cop_type cop, if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX) return false; - if (attr->wcreds_max > VAS_RX_WCREDS_MAX) + if (!attr->wcreds_max) return false; if (attr->nx_win) { @@ -888,7 +888,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, rxwin->nx_win = rxattr->nx_win; rxwin->user_win = rxattr->user_win; rxwin->cop = cop; - rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + rxwin->wcreds_max = rxattr->wcreds_max; init_winctx_for_rxwin(rxwin, rxattr, &winctx); init_winctx_regs(rxwin, &winctx); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 03a1c9f..75bea1d 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -101,11 +101,9 @@ /* * Initial per-process credits. * Max send window credits:4K-1 (12-bits in VAS_TX_WCRED) - * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED) * * TODO: Needs tuning for per-process credits */ -#define VAS_RX_WCREDS_MAX ((64 << 10) - 1) #define VAS_TX_WCREDS_MAX ((4 << 10) - 1) #define VAS_WCREDS_DEFAULT (1 << 10) -- 1.8.3.1
[PATCH 10/14] powerpc/vas: Print CRB and FIFO values
Dump FIFO values if could not find send window and print CRB for debugging. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/vas-fault.c | 40 ++ 1 file changed, 40 insertions(+) diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c index dd27649..ad594c8 100644 --- a/arch/powerpc/platforms/powernv/vas-fault.c +++ b/arch/powerpc/platforms/powernv/vas-fault.c @@ -36,6 +36,27 @@ void vas_wakeup_fault_handler(int virq, void *arg) wake_up(&vinst->fault_wq); } +static void dump_crb(struct coprocessor_request_block *crb) +{ + struct data_descriptor_entry *dde; + struct nx_fault_stamp *nx; + + dde = &crb->source; + pr_devel("SrcDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n", + be64_to_cpu(dde->address), be32_to_cpu(dde->length), + dde->count, dde->index, dde->flags); + + dde = &crb->target; + pr_devel("TgtDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n", + be64_to_cpu(dde->address), be32_to_cpu(dde->length), + dde->count, dde->index, dde->flags); + + nx = &crb->stamp.nx; + pr_devel("NX Stamp: PSWID 0x%x, FSA 0x%llx, flags 0x%x, FS 0x%x\n", + be32_to_cpu(nx->pswid), crb_nx_fault_addr(crb), + nx->flags, be32_to_cpu(nx->fault_status)); +} + static void notify_process(pid_t pid, u64 fault_addr) { int rc; @@ -154,6 +175,23 @@ static void update_csb(struct vas_window *window, } } +static void dump_fifo(struct vas_instance *vinst) +{ + int i; + unsigned long *fifo = vinst->fault_fifo; + + pr_err("Fault fifo size %d, max crbs %d, crb size %lu\n", + vinst->fault_fifo_size, + vinst->fault_fifo_size / CRB_SIZE, + sizeof(struct coprocessor_request_block)); + + pr_err("Fault FIFO Dump:\n"); + for (i = 0; i < 64; i += 4, fifo += 4) { + pr_err("[%.3d, %p]: 0x%.16lx 0x%.16lx 0x%.16lx 0x%.16lx\n", + i, fifo, *fifo, *(fifo+1), *(fifo+2), *(fifo+3)); + } +} + /* * Process CRBs that we receive on the fault window. 
*/ @@ -211,6 +249,7 @@ static void process_fault_crbs(struct vas_instance *vinst) vinst->fault_crbs, atomic_read(&vinst->pending_fault)); + dump_crb(crb); window = vas_pswid_to_window(vinst, crb_nx_pswid(crb)); if (IS_ERR(window)) { @@ -220,6 +259,7 @@ static void process_fault_crbs(struct vas_instance *vinst) * even clean it up (return credit). * But we should not get here. */ + dump_fifo(vinst); pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, fault_crbs %d, pending %d bad CRB?\n", vinst->vas_id, vinst->fault_fifo, fifo, crb_nx_pswid(crb), vinst->fault_crbs, -- 1.8.3.1
[PATCH 09/14] powerpc/vas: Update CSB and notify process for fault CRBs
For each fault CRB, update fault address in CRB (fault_storage_addr) and translation error status in CSB so that user space touch the fault address and resend the request. If the user space passed invalid CSB address send signal to process with SIGSEGV. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/vas-fault.c | 121 - 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c index 7a8b2b5..dd27649 100644 --- a/arch/powerpc/platforms/powernv/vas-fault.c +++ b/arch/powerpc/platforms/powernv/vas-fault.c @@ -36,6 +36,124 @@ void vas_wakeup_fault_handler(int virq, void *arg) wake_up(&vinst->fault_wq); } +static void notify_process(pid_t pid, u64 fault_addr) +{ + int rc; + struct kernel_siginfo info; + + memset(&info, 0, sizeof(info)); + + info.si_signo = SIGSEGV; + info.si_errno = EFAULT; + info.si_code = SEGV_MAPERR; + + info.si_addr = (void *)fault_addr; + rcu_read_lock(); + rc = kill_pid_info(SIGSEGV, &info, find_vpid(pid)); + rcu_read_unlock(); + + pr_devel("%s(): pid %d kill_proc_info() rc %d\n", __func__, pid, rc); +} + +/* + * Update the CSB to indicate a translation error. + * + * If the fault is in the CSB address itself or if we are unable to + * update the CSB, send a signal to the process, because we have no + * other way of notifying the user process. + * + * Remaining settings in the CSB are based on wait_for_csb() of + * NX-GZIP. + */ +static void update_csb(struct vas_window *window, + struct coprocessor_request_block *crb) +{ + int rc; + pid_t pid; + int task_exit = 0; + void __user *csb_addr; + struct task_struct *tsk; + struct coprocessor_status_block csb; + + /* +* NX user space windows can not be opened for task->mm=NULL +* and faults will not be generated for kernel requests. 
+*/ + if (!window->mm || !window->user_win) + return; + + csb_addr = (void *)__be64_to_cpu(crb->csb_addr); + + csb.cc = CSB_CC_TRANSLATION; + csb.ce = CSB_CE_TERMINATION; + csb.cs = 0; + csb.count = 0; + + /* +* Returns the fault address in CPU format since it is passed with +* signal. But if the user space expects BE format, need changes. +* i.e either kernel (here) or user should convert to CPU format. +* Not both! +*/ + csb.address = crb_nx_fault_addr(crb); + csb.flags = 0; + + use_mm(window->mm); + rc = copy_to_user(csb_addr, &csb, sizeof(csb)); + /* +* User space polls on csb.flags (first byte). So add barrier +* then copy first byte with csb flags update. +*/ + smp_mb(); + if (!rc) { + csb.flags = CSB_V; + rc = copy_to_user(csb_addr, &csb, sizeof(u8)); + } + unuse_mm(window->mm); + + /* Success */ + if (!rc) + return; + + /* +* User space passed invalid CSB address, Notify process with +* SEGV signal. +*/ + tsk = get_pid_task(window->pid, PIDTYPE_PID); + /* +* Send window will be closed after processing all NX requests +* and process exits after closing all windows. In multi-thread +* applications, thread may not exists, but does not close FD +* (means send window) upon exit. Parent thread (tgid) can use +* and close the window later. +*/ + if (tsk) { + if (tsk->flags & PF_EXITING) + task_exit = 1; + put_task_struct(tsk); + pid = vas_window_pid(window); + } else { + pid = vas_window_tgid(window); + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + return; + } + if (tsk->flags & PF_EXITING) + task_exit = 1; + rcu_read_unlock(); + } + + /* Do not notify if the task is exiting. */ + if (!task_exit) { + pr_err("Invalid CSB address 0x%p signalling pid(%d)\n", + csb_addr, pid); + notify_process(pid, (u64)csb_addr); + } +} + /* * Process CRBs that we receive on the fault window. */ @@ -97,7 +215,7 @@ static void process_fault_crbs(struct vas_instance *vinst) if (IS_ERR(window)) { /* -* What now? 
We got an interrupt about a specific send +* We got an interrupt about a specific send * window but we can't find that window and we can't * even clean it up (return credit).
[PATCH 08/14] powerpc/vas: Take reference to PID and mm for user space windows
Process close windows after its requests are completed. In multi-thread applications, child can open a window but release FD will not be called upon its exit. Parent thread will be closing it later upon its exit. The parent can also send NX requests with this window and NX can generate page faults. After kernel handles the page fault, send signal to process by using PID if CSB address is invalid. Parent thread will not receive signal since its PID is different from the one saved in vas_window. So use tgid in case if the task for the pid saved in window is not running and send signal to its parent. To prevent reusing the pid until the window closed, take reference to pid and task mm. Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/vas-debug.c | 2 +- arch/powerpc/platforms/powernv/vas-window.c | 44 ++--- arch/powerpc/platforms/powernv/vas.h| 14 - 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c index 09e63df..ef9a717 100644 --- a/arch/powerpc/platforms/powernv/vas-debug.c +++ b/arch/powerpc/platforms/powernv/vas-debug.c @@ -38,7 +38,7 @@ static int info_show(struct seq_file *s, void *private) seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop), window->tx_win ? 
"Send" : "Receive"); - seq_printf(s, "Pid : %d\n", window->pid); + seq_printf(s, "Pid : %d\n", vas_window_pid(window)); unlock: mutex_unlock(&vas_mutex); diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 7fc1542..ad3104c 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include "vas.h" @@ -887,8 +889,6 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, rxwin->user_win = rxattr->user_win; rxwin->cop = cop; rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; - if (rxattr->user_win) - rxwin->pid = task_pid_vnr(current); init_winctx_for_rxwin(rxwin, rxattr, &winctx); init_winctx_regs(rxwin, &winctx); @@ -1037,7 +1037,6 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, txwin->tx_win = 1; txwin->rxwin = rxwin; txwin->nx_win = txwin->rxwin->nx_win; - txwin->pid = attr->pid; txwin->user_win = attr->user_win; txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT; @@ -1079,6 +1078,34 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, goto free_window; } + if (txwin->user_win) { + /* +* Window opened by child thread may not be closed when +* it exits. So take reference to its pid and release it +* when the window is free by parent thread. +* Acquire a reference to the task's pid to make sure +* pid will not be re-used. +*/ + txwin->pid = get_task_pid(current, PIDTYPE_PID); + /* +* Acquire a reference to the task's mm. 
+*/ + txwin->mm = get_task_mm(current); + + if (txwin->mm) { + mmput(txwin->mm); + mmgrab(txwin->mm); + mm_context_add_copro(txwin->mm); + } else { + put_pid(txwin->pid); + pr_err("VAS: pid(%d): mm_struct is not found\n", + current->pid); + rc = -EPERM; + goto free_window; + } + txwin->tgid = task_tgid_vnr(current); + } + set_vinst_win(vinst, txwin); return txwin; @@ -1277,8 +1304,17 @@ int vas_win_close(struct vas_window *window) poll_window_castout(window); /* if send window, drop reference to matching receive window */ - if (window->tx_win) + if (window->tx_win) { + if (window->user_win) { + /* Drop references to pid and mm */ + put_pid(window->pid); + if (window->mm) { + mmdrop(window->mm); + mm_context_remove_copro(window->mm); + } + } put_rx_win(window->rxwin); + } vas_window_free(window); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index eb929c7..03a1c9f 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -343,7 +343,9 @@ struct vas_window { bool user_win; /* True if user s
[PATCH 07/14] powerpc/vas: Read and process fault CRBs
NX pastes CRB in fault FIFO and generates interrupt whenever faults on CRB. OS reads CRBs from fault FIFO and process them by setting faulting address in fault_storge_addr in CRB and update CSB. When CSB status is changed, process sends NX request after touching the fault address. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/vas-fault.c | 81 + arch/powerpc/platforms/powernv/vas-window.c | 51 ++ arch/powerpc/platforms/powernv/vas.h| 3 ++ 3 files changed, 135 insertions(+) diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c index c6c105c..7a8b2b5 100644 --- a/arch/powerpc/platforms/powernv/vas-fault.c +++ b/arch/powerpc/platforms/powernv/vas-fault.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "vas.h" @@ -36,6 +37,84 @@ void vas_wakeup_fault_handler(int virq, void *arg) } /* + * Process CRBs that we receive on the fault window. + */ +static void process_fault_crbs(struct vas_instance *vinst) +{ + void *fifo; + struct vas_window *window; + struct coprocessor_request_block buf; + struct coprocessor_request_block *crb; + u64 csb_addr; + + crb = &buf; + + /* +* VAS can interrupt with multiple page faults. So process all +* valid CRBs within fault FIFO until reaches invalid CRB. +* For valid CRBs, csb_addr should be valid address points to CSB +* section within CRB. After reading CRB entry, it is reset with +* 0's in fault FIFO. +* +* In case kernel receives another interrupt with different page +* fault and is processed by the previous handling, will be returned +* from this function when it sees invalid CRB (means 0's). +*/ + do { + mutex_lock(&vinst->mutex); + + /* +* Advance the fault fifo pointer to next CRB. +* Use CRB_SIZE rather than sizeof(*crb) since the latter is +* aligned to CRB_ALIGN (256) but the CRB written to by VAS is +* only CRB_SIZE in len. 
+*/ + fifo = vinst->fault_fifo + (vinst->fault_crbs * CRB_SIZE); + csb_addr = ((struct coprocessor_request_block *)fifo)->csb_addr; + + /* +* Return if reached invalid CRB. +*/ + if (!csb_addr) { + mutex_unlock(&vinst->mutex); + return; + } + + vinst->fault_crbs++; + if (vinst->fault_crbs == vinst->fault_fifo_size/CRB_SIZE) + vinst->fault_crbs = 0; + + memcpy(crb, fifo, CRB_SIZE); + memset(fifo, 0, CRB_SIZE); + mutex_unlock(&vinst->mutex); + + pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d pending %d\n", + vinst->vas_id, vinst->fault_fifo, fifo, + vinst->fault_crbs, + atomic_read(&vinst->pending_fault)); + + window = vas_pswid_to_window(vinst, crb_nx_pswid(crb)); + + if (IS_ERR(window)) { + /* +* What now? We got an interrupt about a specific send +* window but we can't find that window and we can't +* even clean it up (return credit). +* But we should not get here. +*/ + pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, fault_crbs %d, pending %d bad CRB?\n", + vinst->vas_id, vinst->fault_fifo, fifo, + crb_nx_pswid(crb), vinst->fault_crbs, + atomic_read(&vinst->pending_fault)); + + WARN_ON_ONCE(1); + return; + } + + } while (true); +} + +/* * Fault handler thread for each VAS instance and process fault CRBs. */ static int fault_handler_func(void *arg) @@ -54,6 +133,8 @@ static int fault_handler_func(void *arg) break; atomic_dec(&vinst->pending_fault); + process_fault_crbs(vinst); + } while (!kthread_should_stop()); return 0; diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 5f1faeb..7fc1542 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1294,3 +1294,54 @@ u32 vas_win_id(struct vas_window *win) return encode_pswid(win->vinst->vas_id, win->winid); } EXPORT_SYMBOL_GPL(vas_win_id); + +struct vas_window *vas_pswid_to_window(struct vas_instance *vinst, + uint32_t pswid) +{ + int winid; + struct vas_wind
[PATCH 06/14] powerpc/vas: Setup fault handler per VAS instance
Fault handler is created as kernel thread for each VAS instance and invoked whenever NX generates page fault. This thread reads CRBs from fault FIFO and process them. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/vas-fault.c | 54 ++ arch/powerpc/platforms/powernv/vas.c | 7 arch/powerpc/platforms/powernv/vas.h | 6 3 files changed, 67 insertions(+) diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c index a5e63a5..c6c105c 100644 --- a/arch/powerpc/platforms/powernv/vas-fault.c +++ b/arch/powerpc/platforms/powernv/vas-fault.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "vas.h" @@ -24,6 +25,54 @@ */ #define VAS_FAULT_WIN_FIFO_SIZE(4 << 20) +struct task_struct *fault_handler; + +void vas_wakeup_fault_handler(int virq, void *arg) +{ + struct vas_instance *vinst = arg; + + atomic_inc(&vinst->pending_fault); + wake_up(&vinst->fault_wq); +} + +/* + * Fault handler thread for each VAS instance and process fault CRBs. + */ +static int fault_handler_func(void *arg) +{ + struct vas_instance *vinst = (struct vas_instance *)arg; + + do { + if (signal_pending(current)) + flush_signals(current); + + wait_event_interruptible(vinst->fault_wq, + atomic_read(&vinst->pending_fault) || + kthread_should_stop()); + + if (kthread_should_stop()) + break; + + atomic_dec(&vinst->pending_fault); + } while (!kthread_should_stop()); + + return 0; +} + +/* + * Create a thread that processes the fault CRBs. + */ +int vas_setup_fault_handler(struct vas_instance *vinst) +{ + vinst->fault_handler = kthread_run(fault_handler_func, (void *)vinst, + "vas-fault-%u", vinst->vas_id); + + if (IS_ERR(vinst->fault_handler)) + return PTR_ERR(vinst->fault_handler); + + return 0; +} + /* * Fault window is opened per VAS instance. NX pastes fault CRB in fault * FIFO upon page faults. 
@@ -102,4 +151,9 @@ int vas_cleanup_fault_window(struct vas_instance *vinst) return rc; } + +void vas_cleanup_fault_handler(struct vas_instance *vinst) +{ + kthread_stop(vinst->fault_handler); +} #endif diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index dd0e06c..db2aca4 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -30,6 +30,7 @@ static irqreturn_t vas_irq_handler(int virq, void *data) struct vas_instance *vinst = data; pr_devel("VAS %d: virq %d\n", vinst->vas_id, virq); + vas_wakeup_fault_handler(virq, data); return IRQ_HANDLED; } @@ -54,6 +55,10 @@ static void vas_irq_fault_handle_setup(struct vas_instance *vinst) * for user space. */ rc = vas_setup_fault_window(vinst); + + if (!rc) + rc = vas_setup_fault_handler(vinst); + if (rc) { free_irq(vinst->virq, vinst); vinst->virq = 0; @@ -129,6 +134,8 @@ static int init_vas_instance(struct platform_device *pdev) } } + init_waitqueue_head(&vinst->fault_wq); + pr_devel("Initialized instance [%s, %d] paste_base 0x%llx paste_win_id_shift 0x%llx IRQ %d Port 0x%llx\n", pdev->name, vasid, vinst->paste_base_addr, vinst->paste_win_id_shift, vinst->virq, diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index e23fd69..ee284b3 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -317,6 +317,9 @@ struct vas_instance { int virq; int fault_fifo_size; void *fault_fifo; + atomic_t pending_fault; + wait_queue_head_t fault_wq; + struct task_struct *fault_handler; struct vas_window *fault_win; /* Fault window */ struct mutex mutex; @@ -414,6 +417,9 @@ struct vas_winctx { extern void vas_window_free_dbgdir(struct vas_window *win); extern int vas_setup_fault_window(struct vas_instance *vinst); extern int vas_cleanup_fault_window(struct vas_instance *vinst); +extern void vas_wakeup_fault_handler(int virq, void *arg); +extern int vas_setup_fault_handler(struct 
vas_instance *vinst); +extern void vas_cleanup_fault_handler(struct vas_instance *vinst); static inline void vas_log_write(struct vas_window *win, char *name, void *regptr, u64 val) -- 1.8.3.1
[PATCH 05/14] powerpc/vas: Setup fault window per VAS instance
Setup fault window for each VAS instance. When NX gets fault on request buffer, write fault CRBs in the corresponding fault FIFO and then sends an interrupt to the OS. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/vas-fault.c | 105 arch/powerpc/platforms/powernv/vas-window.c | 13 +++- arch/powerpc/platforms/powernv/vas.c| 12 arch/powerpc/platforms/powernv/vas.h| 6 ++ 5 files changed, 134 insertions(+), 4 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/vas-fault.c diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index a3ac964..74c2246 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -17,6 +17,6 @@ obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o -obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o +obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o vas-fault.o obj-$(CONFIG_OCXL_BASE)+= ocxl.o obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c new file mode 100644 index 000..a5e63a5 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-fault.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * VAS Fault handling. + * Copyright 2019, IBM Corporation + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include +#include +#include +#include +#include +#include + +#include "vas.h" + +/* + * The maximum FIFO size for fault window can be 8MB + * (VAS_RX_FIFO_SIZE_MAX). Using 4MB FIFO since each VAS + * instance will be having fault window. + * 8MB FIFO can be used if expects more faults for each VAS + * instance. + */ +#define VAS_FAULT_WIN_FIFO_SIZE(4 << 20) + +/* + * Fault window is opened per VAS instance. 
NX pastes fault CRB in fault + * FIFO upon page faults. + */ +int vas_setup_fault_window(struct vas_instance *vinst) +{ + struct vas_rx_win_attr attr; + + vinst->fault_fifo_size = VAS_FAULT_WIN_FIFO_SIZE; + vinst->fault_fifo = kzalloc(vinst->fault_fifo_size, GFP_KERNEL); + if (!vinst->fault_fifo) { + pr_err("Unable to alloc %d bytes for fault_fifo\n", + vinst->fault_fifo_size); + return -ENOMEM; + } + + vas_init_rx_win_attr(&attr, VAS_COP_TYPE_FAULT); + + attr.rx_fifo_size = vinst->fault_fifo_size; + attr.rx_fifo = vinst->fault_fifo; + + /* +* Max creds is based on number of CRBs can fit in the FIFO. +* (fault_fifo_size/CRB_SIZE). If 8MB FIFO is used, max creds +* will be 0x since the receive creds field is 16bits wide. +*/ + attr.wcreds_max = vinst->fault_fifo_size / CRB_SIZE; + attr.lnotify_lpid = 0; + attr.lnotify_pid = mfspr(SPRN_PID); + attr.lnotify_tid = mfspr(SPRN_PID); + + vinst->fault_win = vas_rx_win_open(vinst->vas_id, VAS_COP_TYPE_FAULT, + &attr); + + if (IS_ERR(vinst->fault_win)) { + pr_err("VAS: Error %ld opening FaultWin\n", + PTR_ERR(vinst->fault_win)); + kfree(vinst->fault_fifo); + return PTR_ERR(vinst->fault_win); + } + + pr_devel("VAS: Created FaultWin %d, LPID/PID/TID [%d/%d/%d]\n", + vinst->fault_win->winid, attr.lnotify_lpid, + attr.lnotify_pid, attr.lnotify_tid); + + return 0; +} + +/* + * We do not remove VAS instances. The following functions are needed + * when VAS hotplug is supported. + */ +#if 0 +/* + * Close the fault window and free the receive FIFO. + * + * TODO:vas_win_close() will block till pending requests are drained. + * The fault thread itself allocates the FIFO, opens the window + * and when done, closes the window and frees the FIFO. + * Are there any other race condition to watch for here or in + * vas_win_close()? 
+ * + */ +int vas_cleanup_fault_window(struct vas_instance *vinst) +{ + int rc; + + rc = vas_win_close(vinst->fault_win); + if (rc < 0) { + pr_err("VAS Fault handler %d: error %d closing window\n", + vinst->vas_id, rc); + } + + kfree(vinst->fault_fifo); + vinst->fault_fifo = NULL; + + return rc; +} +#endif diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index ad6be91..5f1faeb 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -383,7 +383,7 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) init_xlate_regs(window, winctx->u
[PATCH 04/14] powerpc/vas: Setup IRQ mapping and register port for each window
Read interrupt and port values from the device tree, setup IRQ mapping and register IRQ for each VAS instance. Set port value for each NX window. When NX sees a fault on CRB, kernel gets an interrupt and handles the fault. IRQ setup and fault handling is needed only for user space send windows. So for kernel requests, ignore if interrupts property is not available. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Haren Myneni --- arch/powerpc/platforms/powernv/vas-window.c | 14 ++ arch/powerpc/platforms/powernv/vas.c| 68 ++--- arch/powerpc/platforms/powernv/vas.h| 2 + 3 files changed, 78 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index ea5ca02..ad6be91 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -758,6 +758,8 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->min_scope = VAS_SCOPE_LOCAL; winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; + if (rxwin->vinst->virq) + winctx->irq_port = rxwin->vinst->irq_port; } static bool rx_win_args_valid(enum vas_cop_type cop, @@ -959,6 +961,8 @@ static void init_winctx_for_txwin(struct vas_window *txwin, winctx->tc_mode = txattr->tc_mode; winctx->min_scope = VAS_SCOPE_LOCAL; winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; + if (txwin->vinst->virq) + winctx->irq_port = txwin->vinst->irq_port; winctx->pswid = 0; } @@ -1050,6 +1054,16 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, } } else { /* +* Interrupt hanlder setup failed. Means NX can not generate +* fault for page fault. So not opening for user space tx +* window. +*/ + if (!vinst->virq) { + rc = -ENODEV; + goto free_window; + } + + /* * A user mapping must ensure that context switch issues * CP_ABORT for this thread. 
*/ diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index ed9cc6d..71bddaa 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include "vas.h" @@ -23,9 +25,33 @@ static DEFINE_PER_CPU(int, cpu_vas_id); +static irqreturn_t vas_irq_handler(int virq, void *data) +{ + struct vas_instance *vinst = data; + + pr_devel("VAS %d: virq %d\n", vinst->vas_id, virq); + + return IRQ_HANDLED; +} + +static void vas_irq_fault_handle_setup(struct vas_instance *vinst) +{ + int rc; + char devname[64]; + + snprintf(devname, sizeof(devname), "vas-inst-%d", vinst->vas_id); + rc = request_irq(vinst->virq, vas_irq_handler, 0, devname, vinst); + if (rc) { + pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n", + vinst->vas_id, vinst->virq, rc); + vinst->virq = 0; + } +} + static int init_vas_instance(struct platform_device *pdev) { - int rc, cpu, vasid; + int rc, cpu, vasid, nresources = 5; + uint64_t port; struct resource *res; struct vas_instance *vinst; struct device_node *dn = pdev->dev.of_node; @@ -36,7 +62,18 @@ static int init_vas_instance(struct platform_device *pdev) return -ENODEV; } - if (pdev->num_resources != 4) { + rc = of_property_read_u64(dn, "ibm,vas-port", &port); + if (rc) { + pr_err("No ibm,vas-port property for %s?\n", pdev->name); + /* No interrupts property */ + nresources = 4; + } + + /* +* interrupts property is available with 'ibm,vas-port' property. +* 4 Resources and 1 IRQ if interrupts property is available. 
+*/ + if (pdev->num_resources != nresources) { pr_err("Unexpected DT configuration for [%s, %d]\n", pdev->name, vasid); return -ENODEV; @@ -51,6 +88,7 @@ static int init_vas_instance(struct platform_device *pdev) mutex_init(&vinst->mutex); vinst->vas_id = vasid; vinst->pdev = pdev; + vinst->irq_port = port; res = &pdev->resource[0]; vinst->hvwc_bar_start = res->start; @@ -66,12 +104,23 @@ static int init_vas_instance(struct platform_device *pdev) pr_err("Bad 'paste_win_id_shift' in DT, %llx\n", res->end); goto free_vinst; } - vinst->paste_win_id_shift = 63 - res->end; - pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, " - "paste_win_id_shift 0x%llx\n", pdev->name, vasid,
[PATCH 03/14] powerpc/vas: Define nx_fault_stamp in coprocessor_request_block
Kernel sets fault address and status in CRB for NX page fault on user space address after processing page fault. User space gets the signal and handles the fault mentioned in CRB by bringing the page in to memory and send NX request again. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Haren Myneni --- arch/powerpc/include/asm/icswx.h | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/icswx.h b/arch/powerpc/include/asm/icswx.h index 9872f85..c071471 100644 --- a/arch/powerpc/include/asm/icswx.h +++ b/arch/powerpc/include/asm/icswx.h @@ -108,6 +108,21 @@ struct data_descriptor_entry { __be64 address; } __packed __aligned(DDE_ALIGN); +/* 4.3.2 NX-stamped Fault CRB */ + +#define NX_STAMP_ALIGN (0x10) + +#define NX_STAMP_ACCESS_MASK(0x01) +#define NX_STAMP_ACCESS_READ0 +#define NX_STAMP_ACCESS_WRITE 1 + +struct nx_fault_stamp { + __be64 fault_storage_addr; + __be16 reserved; + __u8 flags; + __u8 fault_status; + __be32 pswid; +} __packed __aligned(NX_STAMP_ALIGN); /* Chapter 6.5.2 Coprocessor-Request Block (CRB) */ @@ -135,11 +150,26 @@ struct coprocessor_request_block { struct coprocessor_completion_block ccb; - u8 reserved[48]; + union { + struct nx_fault_stamp nx; + u8 reserved[16]; + } stamp; + + u8 reserved[32]; struct coprocessor_status_block csb; } __packed __aligned(CRB_ALIGN); +#define crb_csb_addr(c)__be64_to_cpu(c->csb_addr) +#define crb_nx_fault_addr(c) __be64_to_cpu(c->stamp.nx.fault_storage_addr) +#define crb_nx_flags(c)c->stamp.nx.flags +#define crb_nx_fault_status(c) c->stamp.nx.fault_status + +static inline uint32_t crb_nx_pswid(struct coprocessor_request_block *crb) +{ + return __be32_to_cpu(crb->stamp.nx.pswid); +} + /* RFC02167 Initiate Coprocessor Instructions document * Chapter 8.2.1.1.1 RS -- 1.8.3.1
Re: [Very RFC 25/46] powernv/pci: Remove pdn from pnv_pci_config_check_eeh()
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Despite the name this function is generic PowerNV PCI code rather than > anything > EEH specific. Convert to take a phb and bdfn rather than a pci_dn. > > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/pci.c | 32 ++-- > 1 file changed, 21 insertions(+), 11 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci.c > b/arch/powerpc/platforms/powernv/pci.c > index 6170677bfdc7..50142ff045ac 100644 > --- a/arch/powerpc/platforms/powernv/pci.c > +++ b/arch/powerpc/platforms/powernv/pci.c > @@ -591,9 +591,15 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb > *phb, u32 pe_no) > spin_unlock_irqrestore(&phb->lock, flags); > } > > -static void pnv_pci_config_check_eeh(struct pci_dn *pdn) > +/* > + * This, very strangely named, function checks if a config access > + * caused an EEH and un-freezes the PE if it did. This is mainly > + * for the !CONFIG_EEH case where nothing is going to un-freeze > + * it for us. > + */ Rather than writing a comment like this, simply rename it to pnv_pci_cfg_check_and_unfreeze() or similar as you are changing callsites anyway. Thanks, > +static void pnv_pci_config_check_eeh(struct pnv_phb *phb, u16 bdfn) > { > - struct pnv_phb *phb = pdn->phb->private_data; > + struct pnv_ioda_pe *ioda_pe; > u8 fstate = 0; > __be16 pcierr = 0; > unsigned int pe_no; > @@ -604,10 +610,11 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) >* setup that yet. So all ER errors should be mapped to >* reserved PE. >*/ > - pe_no = pdn->pe_number; > - if (pe_no == IODA_INVALID_PE) { > + ioda_pe = __pnv_ioda_get_pe(phb, bdfn); > + if (ioda_pe) > + pe_no = ioda_pe->pe_number; > + else > pe_no = phb->ioda.reserved_pe_idx; > - } > > /* >* Fetch frozen state. 
If the PHB support compound PE, > @@ -629,7 +636,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) > } > > pr_devel(" -> EEH check, bdfn=%04x PE#%x fstate=%x\n", > - (pdn->busno << 8) | (pdn->devfn), pe_no, fstate); > + bdfn, pe_no, fstate); > > /* Clear the frozen state if applicable */ > if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE || > @@ -642,6 +649,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) > if (phb->freeze_pe) > phb->freeze_pe(phb, pe_no); > > + /* fish out the EEH log and send an EEH event. */ > pnv_pci_handle_eeh_config(phb, pe_no); > } > } > @@ -735,7 +743,8 @@ static int pnv_pci_read_config(struct pci_bus *bus, > int where, int size, u32 *val) > { > struct pci_dn *pdn; > - struct pnv_phb *phb; > + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); > + u16 bdfn = bus->number << 8 | devfn; > struct eeh_dev *edev; > int ret; > > @@ -755,7 +764,7 @@ static int pnv_pci_read_config(struct pci_bus *bus, > eeh_dev_check_failure(edev)) > return PCIBIOS_DEVICE_NOT_FOUND; > } else { > - pnv_pci_config_check_eeh(pdn); > + pnv_pci_config_check_eeh(phb, bdfn); > } > > return ret; > @@ -766,7 +775,8 @@ static int pnv_pci_write_config(struct pci_bus *bus, > int where, int size, u32 val) > { > struct pci_dn *pdn; > - struct pnv_phb *phb; > + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); > + u16 bdfn = bus->number << 8 | devfn; > struct eeh_dev *edev; > int ret; > > @@ -779,9 +789,9 @@ static int pnv_pci_write_config(struct pci_bus *bus, > return PCIBIOS_DEVICE_NOT_FOUND; > > ret = pnv_pci_cfg_write(pdn, where, size, val); > - phb = pdn->phb->private_data; > + > if (!(phb->flags & PNV_PHB_FLAG_EEH)) > - pnv_pci_config_check_eeh(pdn); > + pnv_pci_config_check_eeh(phb, bdfn); > > return ret; > } > -- Alexey
[PATCH 02/14] Revert "powerpc/powernv: remove the unused vas_win_paste_addr and vas_win_id functions"
This reverts commit 452d23c0f6bd97f2fd8a9691fee79b76040a0feb. User space send windows (NX GZIP compression) need vas_win_paste_addr() to mmap window paste address and vas_win_id() to get window ID when window address is given. Added vas_win_id() and vas_win_paste_addr() with: commit 61f3cca8cda97 ("powerpc/vas: Define vas_win_id()") commit 5676be2fb7035 ("powerpc/vas: Define vas_win_paste_addr()") Signed-off-by:Haren Myneni --- arch/powerpc/include/asm/vas.h | 10 ++ arch/powerpc/platforms/powernv/vas-window.c | 19 +++ arch/powerpc/platforms/powernv/vas.h| 20 3 files changed, 49 insertions(+) diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h index f93e6b0..da0b198 100644 --- a/arch/powerpc/include/asm/vas.h +++ b/arch/powerpc/include/asm/vas.h @@ -163,4 +163,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, */ int vas_paste_crb(struct vas_window *win, int offset, bool re); +/* + * Return a system-wide unique id for the VAS window @win. + */ +extern u32 vas_win_id(struct vas_window *win); + +/* + * Return the power bus paste address associated with @win so the caller + * can map that address into their address space. 
+ */ +extern u64 vas_win_paste_addr(struct vas_window *win); #endif /* __ASM_POWERPC_VAS_H */ diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 0c0d27d..ea5ca02 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -40,6 +40,16 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr); } +u64 vas_win_paste_addr(struct vas_window *win) +{ + u64 addr; + + compute_paste_address(win, &addr, NULL); + + return addr; +} +EXPORT_SYMBOL(vas_win_paste_addr); + static inline void get_hvwc_mmio_bar(struct vas_window *window, u64 *start, int *len) { @@ -1254,3 +1264,12 @@ int vas_win_close(struct vas_window *window) return 0; } EXPORT_SYMBOL_GPL(vas_win_close); + +/* + * Return a system-wide unique window id for the window @win. + */ +u32 vas_win_id(struct vas_window *win) +{ + return encode_pswid(win->vinst->vas_id, win->winid); +} +EXPORT_SYMBOL_GPL(vas_win_id); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 5574aec..9cc5251 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -444,6 +444,26 @@ static inline u64 read_hvwc_reg(struct vas_window *win, return in_be64(win->hvwc_map+reg); } +/* + * Encode/decode the Partition Send Window ID (PSWID) for a window in + * a way that we can uniquely identify any window in the system. i.e. + * we should be able to locate the 'struct vas_window' given the PSWID. + * + * BitsUsage + * 0:7 VAS id (8 bits) + * 8:15Unused, 0 (3 bits) + * 16:31 Window id (16 bits) + */ +static inline u32 encode_pswid(int vasid, int winid) +{ + u32 pswid = 0; + + pswid |= vasid << (31 - 7); + pswid |= winid; + + return pswid; +} + static inline void decode_pswid(u32 pswid, int *vasid, int *winid) { if (vasid) -- 1.8.3.1
[PATCH 01/14] powerpc/vas: Describe vas-port and interrupts properties
[PATCH 01/14] powerpc/vas: Describe vas-port and interrupts properties Signed-off-by: Haren Myneni --- Documentation/devicetree/bindings/powerpc/ibm,vas.txt | 5 + 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt index bf11d2f..12de08b 100644 --- a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt +++ b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt @@ -11,6 +11,8 @@ Required properties: window context start and length, OS/User window context start and length, "Paste address" start and length, "Paste window id" start bit and number of bits) +- ibm,vas-port : Port address for the interrupt. +- interrupts: IRQ value for each VAS instance and level. Example: @@ -18,5 +20,8 @@ Example: compatible = "ibm,vas", "ibm,power9-vas"; reg = <0x60191 0x200 0x60190 0x1 0x8 0x1 0x20 0x10>; name = "vas"; + interrupts = <0x1f 0>; + interrupt-parent = <&mpic>; ibm,vas-id = <0x1>; + ibm,vas-port = <0x601000100>; }; -- 1.8.3.1
[PATCH 00/14] powerpc/vas: Page fault handling for user space NX requests
Applications send compression / decompression requests to NX with COPY/PASTE instructions. When NX is processing these requests, it can hit a fault on the request buffer (page not in memory). It then issues an interrupt and pastes a fault CRB in the fault FIFO. NX expects the kernel to handle this fault and return credits for both the send and fault windows after processing. This patch series adds IRQ and fault window setup, and NX fault handling: - Read IRQ# from the "interrupts" property and configure an IRQ per VAS instance. - Set port# for each window so that an interrupt is generated when a fault is noticed. - Set up the fault window and the FIFO into which NX pastes fault CRBs. - Set up a fault handler (as a kernel thread) per VAS instance. - When receiving an interrupt, read CRBs from the fault FIFO and update the coprocessor_status_block (CSB) in the corresponding CRB with translation failure (CSB_CC_TRANSLATION). After issuing NX requests, the process polls on the CSB address. When it sees a translation error, it can touch the request buffer to bring the page into memory and reissue the NX request. - If copy_to_user fails on the user space CSB address, the OS sends a SEGV signal. Tested these patches with NX-GZIP support and will be posting this series soon. Patch 2: Revert 452d23c0f6bd97f2 - Using vas_win_id() and vas_win_paste_addr() Patch 3: Define nx_fault_stamp on which NX writes fault status for the fault CRB Patch 4: IRQ and port setup Patches 5 and 6: Set up fault window and fault handler for each VAS instance. The fault window is used by NX to paste fault CRBs in the FIFO. A kernel thread is created to handle faults on each VAS. Patches 7, 9 and 10: Read and process CRBs from fault FIFO and notify tasks by updating CSB or through signals. Patch 8: Take a reference to pid and mm so that the pid is not reused until the window is closed. Needed for multi-thread applications where a child can open a window that can be used by the parent later. Patches 11 and 12: Return credits for send and fault windows after handling faults. Patch 14: Fix closing send window after all credits are returned.
This issue happens only for user space requests. No page faults on kernel request buffer. Haren Myneni (14): powerpc/vas: Describe vas-port and interrupts properties Revert "powerpc/powernv: remove the unused vas_win_paste_addr and vas_win_id functions" powerpc/vas: Define nx_fault_stamp in coprocessor_request_block powerpc/vas: Setup IRQ mapping and register port for each window powerpc/vas: Setup fault window per VAS instance powerpc/VAS: Setup fault handler per VAS instance powerpc/vas: Read and process fault CRBs powerpc/vas: Take reference to PID and mm for user space windows powerpc/vas: Update CSB and notify process for fault CRBs powerpc/vas: Print CRB and FIFO values powerpc/vas: Do not use default credits for receive window powerpc/VAS: Return credits after handling fault powerpc/vas: Display process stuck message powerpc/vas: Free send window in VAS instance after credits returned .../devicetree/bindings/powerpc/ibm,vas.txt| 5 + arch/powerpc/include/asm/icswx.h | 32 +- arch/powerpc/include/asm/vas.h | 10 + arch/powerpc/platforms/powernv/Makefile| 2 +- arch/powerpc/platforms/powernv/vas-debug.c | 2 +- arch/powerpc/platforms/powernv/vas-fault.c | 408 + arch/powerpc/platforms/powernv/vas-window.c| 192 +- arch/powerpc/platforms/powernv/vas.c | 87 - arch/powerpc/platforms/powernv/vas.h | 54 ++- 9 files changed, 769 insertions(+), 23 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/vas-fault.c -- 1.8.3.1
Re: [Very RFC 24/46] powernv/pci: Make the pre-cfg EEH freeze check use eeh_dev rather than pci_dn
On 20/11/2019 12:28, Oliver O'Halloran wrote: > Squash another usage in preparation for making the config accessors pci_dn. > > Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy > --- > We might want to move this into eeh-powernv.c > --- > arch/powerpc/platforms/powernv/pci.c | 37 +--- > 1 file changed, 17 insertions(+), 20 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci.c > b/arch/powerpc/platforms/powernv/pci.c > index d36dde9777aa..6170677bfdc7 100644 > --- a/arch/powerpc/platforms/powernv/pci.c > +++ b/arch/powerpc/platforms/powernv/pci.c > @@ -708,30 +708,23 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, > } > > #if CONFIG_EEH > -static bool pnv_pci_cfg_check(struct pci_dn *pdn) > +bool pnv_eeh_pre_cfg_check(struct eeh_dev *edev) > { > - struct eeh_dev *edev = NULL; > - struct pnv_phb *phb = pdn->phb->private_data; > - > - /* EEH not enabled ? */ > - if (!(phb->flags & PNV_PHB_FLAG_EEH)) > + if (!edev || !edev->pe) > return true; > > - /* PE reset or device removed ? */ > - edev = pdn->edev; > - if (edev) { > - if (edev->pe && > - (edev->pe->state & EEH_PE_CFG_BLOCKED)) > - return false; > + /* PE in reset? */ > + if (edev->pe->state & EEH_PE_CFG_BLOCKED) > + return false; > > - if (edev->mode & EEH_DEV_REMOVED) > - return false; > - } > + /* Device removed? 
*/ > + if (edev->mode & EEH_DEV_REMOVED) > + return false; > > return true; > } > #else > -static inline pnv_pci_cfg_check(struct pci_dn *pdn) > +static inline pnv_pci_cfg_check(struct eeh_dev *edev) > { > return true; > } > @@ -743,6 +736,7 @@ static int pnv_pci_read_config(struct pci_bus *bus, > { > struct pci_dn *pdn; > struct pnv_phb *phb; > + struct eeh_dev *edev; > int ret; > > *val = 0x; > @@ -750,14 +744,15 @@ static int pnv_pci_read_config(struct pci_bus *bus, > if (!pdn) > return PCIBIOS_DEVICE_NOT_FOUND; > > - if (!pnv_pci_cfg_check(pdn)) > + edev = pdn_to_eeh_dev(pdn); > + if (!pnv_eeh_pre_cfg_check(edev)) > return PCIBIOS_DEVICE_NOT_FOUND; > > ret = pnv_pci_cfg_read(pdn, where, size, val); > phb = pdn->phb->private_data; > - if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) { > + if (phb->flags & PNV_PHB_FLAG_EEH && edev) { > if (*val == EEH_IO_ERROR_VALUE(size) && > - eeh_dev_check_failure(pdn->edev)) > + eeh_dev_check_failure(edev)) > return PCIBIOS_DEVICE_NOT_FOUND; > } else { > pnv_pci_config_check_eeh(pdn); > @@ -772,13 +767,15 @@ static int pnv_pci_write_config(struct pci_bus *bus, > { > struct pci_dn *pdn; > struct pnv_phb *phb; > + struct eeh_dev *edev; > int ret; > > pdn = pci_get_pdn_by_devfn(bus, devfn); > if (!pdn) > return PCIBIOS_DEVICE_NOT_FOUND; > > - if (!pnv_pci_cfg_check(pdn)) > + edev = pdn_to_eeh_dev(pdn); > + if (!pnv_eeh_pre_cfg_check(edev)) > return PCIBIOS_DEVICE_NOT_FOUND; > > ret = pnv_pci_cfg_write(pdn, where, size, val); > -- Alexey
[PATCH v3 2/2] powerpc/kvm/book3e: Replace current->mm by kvm->mm
Given that in kvm_create_vm() there is: kvm->mm = current->mm; And that on every kvm_*_ioctl we have: if (kvm->mm != current->mm) return -EIO; I see no reason to keep using current->mm instead of kvm->mm. By doing so, we would reduce the use of 'global' variables on code, relying more in the contents of kvm struct. Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/booke.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index be9a45874194..fd7bdb4f8f87 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -775,7 +775,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) debug = current->thread.debug; current->thread.debug = vcpu->arch.dbg_reg; - vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.pgdir = vcpu->kvm->mm->pgd; kvmppc_fix_ee_before_entry(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); -- 2.23.0
[PATCH v3 1/2] powerpc/kvm/book3s: Replace current->mm by kvm->mm
Given that in kvm_create_vm() there is: kvm->mm = current->mm; And that on every kvm_*_ioctl we have: if (kvm->mm != current->mm) return -EIO; I see no reason to keep using current->mm instead of kvm->mm. By doing so, we would reduce the use of 'global' variables on code, relying more in the contents of kvm struct. Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++-- arch/powerpc/kvm/book3s_64_vio.c| 10 ++ arch/powerpc/kvm/book3s_hv.c| 10 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d381526c5c9b..6c372f5c61b6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -284,7 +284,7 @@ static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, /* Protect linux PTE lookup from page table destruction */ rcu_read_lock_sched(); /* this disables preemption too */ ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, - current->mm->pgd, false, pte_idx_ret); + kvm->mm->pgd, false, pte_idx_ret); rcu_read_unlock_sched(); if (ret == H_TOO_HARD) { /* this can't happen */ @@ -573,7 +573,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, is_ci = false; pfn = 0; page = NULL; - mm = current->mm; + mm = kvm->mm; pte_size = PAGE_SIZE; writing = (dsisr & DSISR_ISSTORE) != 0; /* If writing != 0, then the HPTE must allow writing, if we get here */ diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 883a66e76638..ee6c103bb7d5 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -253,10 +253,11 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) } } + account_locked_vm(kvm->mm, + kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); + kvm_put_kvm(stt->kvm); - account_locked_vm(current->mm, - kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); call_rcu(&stt->rcu, 
release_spapr_tce_table); return 0; @@ -272,6 +273,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, { struct kvmppc_spapr_tce_table *stt = NULL; struct kvmppc_spapr_tce_table *siter; + struct mm_struct *mm = kvm->mm; unsigned long npages, size = args->size; int ret = -ENOMEM; @@ -280,7 +282,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EINVAL; npages = kvmppc_tce_pages(size); - ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true); + ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true); if (ret) return ret; @@ -326,7 +328,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kfree(stt); fail_acct: - account_locked_vm(current->mm, kvmppc_stt_pages(npages), false); + account_locked_vm(mm, kvmppc_stt_pages(npages), false); return ret; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ec5c0379296a..d3baa23396e6 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4263,7 +4263,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) user_vrsave = mfspr(SPRN_VRSAVE); vcpu->arch.wqp = &vcpu->arch.vcore->wq; - vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.pgdir = kvm->mm->pgd; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { @@ -4595,14 +4595,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Look up the VMA for the start of this memory slot */ hva = memslot->userspace_addr; - down_read(&current->mm->mmap_sem); - vma = find_vma(current->mm, hva); + down_read(&kvm->mm->mmap_sem); + vma = find_vma(kvm->mm, hva); if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) goto up_out; psize = vma_kernel_pagesize(vma); - up_read(&current->mm->mmap_sem); + up_read(&kvm->mm->mmap_sem); /* We can handle 4k, 64k or 16M pages in the VRMA */ if (psize >= 0x100) @@ -4635,7 +4635,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) return err; up_out: - up_read(&current->mm->mmap_sem); + up_read(&kvm->mm->mmap_sem); goto out_srcu; } -- 
2.23.0
[PATCH v3 0/2] Replace current->mm by kvm->mm on powerpc/kvm
Replace current->mm by kvm->mm on powerpc/kvm By replacing, we would reduce the use of 'global' current on code, relying more in the contents of kvm struct. On code, I found that in kvm_create_vm() there is: kvm->mm = current->mm; And that on every kvm_*_ioctl we have tests like that: if (kvm->mm != current->mm) return -EIO; So this change would be safe. --- Changes since v2: - Rebased on torvalds/master and updated the remaining patches. Changes since v1: - Fixes possible 'use after free' on kvm_spapr_tce_release (from v1) - Fixes possible 'use after free' on kvm_vm_ioctl_create_spapr_tce - Fixes undeclared variable error Leonardo Bras (2): powerpc/kvm/book3s: Replace current->mm by kvm->mm powerpc/kvm/book3e: Replace current->mm by kvm->mm arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++-- arch/powerpc/kvm/book3s_64_vio.c| 10 ++ arch/powerpc/kvm/book3s_hv.c| 10 +- arch/powerpc/kvm/booke.c| 2 +- 4 files changed, 14 insertions(+), 12 deletions(-) -- 2.23.0
Re: [PATCH v2] dma-mapping: treat dev->bus_dma_mask as a DMA limit
On 2019-11-26 6:51 pm, Nicolas Saenz Julienne wrote: On Mon, 2019-11-25 at 16:33 +, Robin Murphy wrote: On 25/11/2019 7:44 am, Christoph Hellwig wrote: On Sat, Nov 23, 2019 at 09:51:08AM -0700, Nathan Chancellor wrote: Just as an FYI, this introduces a warning on arm32 allyesconfig for me: I think the dma_limit argument to iommu_dma_alloc_iova should be a u64 and/or we need to use min_t and open code the zero exception. Robin, Nicolas - any opinions? Yeah, given that it's always held a mask I'm not entirely sure why it was ever a dma_addr_t rather than a u64. Unless anyone else is desperate to do it I'll get a cleanup patch ready for rc1. Sounds good to me too Robin, since I started the mess, I'll be happy to do it if it helps offloading some work from you. No worries - your change only exposed my original weird decision ;) On second look the patch was literally a trivial one-liner, so I've written it up already. Cheers, Robin.
Re: [PATCH v2 26/35] powerpc/64: system call: Fix sparse warning about missing declaration
On Tue, Nov 26, 2019 at 09:13:40PM +0100, Michal Suchanek wrote: > Sparse warns about missing declarations for these functions: > > +arch/powerpc/kernel/syscall_64.c:108:23: warning: symbol > 'syscall_exit_prepare' was not declared. Should it be static? > +arch/powerpc/kernel/syscall_64.c:18:6: warning: symbol > 'system_call_exception' was not declared. Should it be static? > +arch/powerpc/kernel/syscall_64.c:200:23: warning: symbol > 'interrupt_exit_user_prepare' was not declared. Should it be static? > +arch/powerpc/kernel/syscall_64.c:288:23: warning: symbol > 'interrupt_exit_kernel_prepare' was not declared. Should it be static? > > Add declaration for them. I'm fine with this patch but, just FYI, lately people seems to prefer to add '__visible' to the function definition instead of creating such header files. Best regards, -- Luc Van Oostenryck
Re: [PATCH net v2 0/4] ibmvnic: Harden device commands and queries
From: Thomas Falcon Date: Mon, 25 Nov 2019 17:12:52 -0600 > This patch series fixes some shortcomings with the current > VNIC device command implementation. The first patch fixes > the initialization of driver completion structures used > for device commands. Additionally, all waits for device > commands are bounded with a timeout in the event that the > device does not respond or becomes inoperable. Finally, > serialize queries to retain the integrity of device return > codes. > > Changes in v2: > > - included header comment for ibmvnic_wait_for_completion > - removed open-coded loop in patch 3/4, suggested by Jakub > - ibmvnic_wait_for_completion accepts timeout value in milliseconds >instead of jiffies > - timeout calculations cleaned up and completed before wait loop > - included missing mutex_destroy calls, suggested by Jakub > - included comment before mutex declaration Series applied, thanks.
Re: [PATCH v3] libfdt: define INT32_MAX and UINT32_MAX in libfdt_env.h
On Wed, Nov 13, 2019 at 04:12:02PM +0900, Masahiro Yamada wrote: > The DTC v1.5.1 added references to (U)INT32_MAX. > > This is no problem for user-space programs since defines > (U)INT32_MAX along with (u)int32_t. > > For the kernel space, libfdt_env.h needs to be adjusted before we > pull in the changes. > > In the kernel, we usually use s/u32 instead of (u)int32_t for the > fixed-width types. > > Accordingly, we already have S/U32_MAX for their max values. > So, we should not add (U)INT32_MAX to any more. > > Instead, add them to the in-kernel libfdt_env.h to compile the > latest libfdt. > > Signed-off-by: Masahiro Yamada > --- > > My initial plan was to change this in a series of 3 patches > since it is clean, and reduces the code. > > [1/3] https://lore.kernel.org/patchwork/patch/1147095/ > [2/3] https://lore.kernel.org/patchwork/patch/1147096/ > [3/3] https://lore.kernel.org/patchwork/patch/1147097/ > > 1/3 is stuck in the license bikeshed. > > For 2/3, I have not been able to get Ack from Russell. > > So, I chose a straight-forward fixup. > > > Changes in v3: > - Resend as a single patch > > arch/arm/boot/compressed/libfdt_env.h | 4 +++- > arch/powerpc/boot/libfdt_env.h| 2 ++ > include/linux/libfdt_env.h| 3 +++ > 3 files changed, 8 insertions(+), 1 deletion(-) Applied. Rob
[PATCH v2 35/35] MAINTAINERS: perf: Add pattern that matches ppc perf to the perf entry.
Signed-off-by: Michal Suchanek --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 66cc549ac327..853690adb36f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12842,6 +12842,8 @@ F: arch/*/kernel/*/perf_event*.c F: arch/*/kernel/*/*/perf_event*.c F: arch/*/include/asm/perf_event.h F: arch/*/kernel/perf_callchain.c +F: arch/*/perf/* +F: arch/*/perf/*/* F: arch/*/events/* F: arch/*/events/*/* F: tools/perf/ -- 2.23.0
[PATCH v2 34/35] powerpc/perf: split callchain.c by bitness
Building callchain.c with !COMPAT proved quite ugly with all the defines. Splitting out the 32bit and 64bit parts looks better. No code change intended. Signed-off-by: Michal Suchanek --- v6: - move current_is_64bit consolidetaion to earlier patch - move defines to the top of callchain_32.c - Makefile cleanup v8: - fix valid_user_sp --- arch/powerpc/perf/Makefile | 5 +- arch/powerpc/perf/callchain.c| 362 +-- arch/powerpc/perf/callchain.h| 20 ++ arch/powerpc/perf/callchain_32.c | 197 + arch/powerpc/perf/callchain_64.c | 178 +++ 5 files changed, 400 insertions(+), 362 deletions(-) create mode 100644 arch/powerpc/perf/callchain.h create mode 100644 arch/powerpc/perf/callchain_32.c create mode 100644 arch/powerpc/perf/callchain_64.c diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index c155dcbb8691..53d614e98537 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -1,6 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o +obj-$(CONFIG_PERF_EVENTS) += callchain.o callchain_$(BITS).o perf_regs.o +ifdef CONFIG_COMPAT +obj-$(CONFIG_PERF_EVENTS) += callchain_32.o +endif obj-$(CONFIG_PPC_PERF_CTRS)+= core-book3s.o bhrb.o obj64-$(CONFIG_PPC_PERF_CTRS) += ppc970-pmu.o power5-pmu.o \ diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index b9fc2f297f30..dd5051015008 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -15,11 +15,9 @@ #include #include #include -#ifdef CONFIG_COMPAT -#include "../kernel/ppc32.h" -#endif #include +#include "callchain.h" /* * Is sp valid as the address of the next kernel stack frame after prev_sp? @@ -102,364 +100,6 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re } } -static inline int valid_user_sp(unsigned long sp) -{ - bool is_64 = !is_32bit_task(); - - if (!sp || (sp & (is_64 ? 7 : 3)) || sp > STACK_TOP - (is_64 ? 
32 : 16)) - return 0; - return 1; -} - -#ifdef CONFIG_PPC64 -/* - * On 64-bit we don't want to invoke hash_page on user addresses from - * interrupt context, so if the access faults, we read the page tables - * to find which page (if any) is mapped and access it directly. - */ -static int read_user_stack_slow(void __user *ptr, void *buf, int nb) -{ - int ret = -EFAULT; - pgd_t *pgdir; - pte_t *ptep, pte; - unsigned shift; - unsigned long addr = (unsigned long) ptr; - unsigned long offset; - unsigned long pfn, flags; - void *kaddr; - - pgdir = current->mm->pgd; - if (!pgdir) - return -EFAULT; - - local_irq_save(flags); - ptep = find_current_mm_pte(pgdir, addr, NULL, &shift); - if (!ptep) - goto err_out; - if (!shift) - shift = PAGE_SHIFT; - - /* align address to page boundary */ - offset = addr & ((1UL << shift) - 1); - - pte = READ_ONCE(*ptep); - if (!pte_present(pte) || !pte_user(pte)) - goto err_out; - pfn = pte_pfn(pte); - if (!page_is_ram(pfn)) - goto err_out; - - /* no highmem to worry about here */ - kaddr = pfn_to_kaddr(pfn); - memcpy(buf, kaddr + offset, nb); - ret = 0; -err_out: - local_irq_restore(flags); - return ret; -} - -static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) -{ - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) || - ((unsigned long)ptr & 7)) - return -EFAULT; - - pagefault_disable(); - if (!__get_user_inatomic(*ret, ptr)) { - pagefault_enable(); - return 0; - } - pagefault_enable(); - - return read_user_stack_slow(ptr, ret, 8); -} - -/* - * 64-bit user processes use the same stack frame for RT and non-RT signals. 
- */ -struct signal_frame_64 { - chardummy[__SIGNAL_FRAMESIZE]; - struct ucontext uc; - unsigned long unused[2]; - unsigned inttramp[6]; - struct siginfo *pinfo; - void*puc; - struct siginfo info; - charabigap[288]; -}; - -static int is_sigreturn_64_address(unsigned long nip, unsigned long fp) -{ - if (nip == fp + offsetof(struct signal_frame_64, tramp)) - return 1; - if (vdso64_rt_sigtramp && current->mm->context.vdso_base && - nip == current->mm->context.vdso_base + vdso64_rt_sigtramp) - return 1; - return 0; -} - -/* - * Do some sanity checking on the signal frame pointed to by sp. - * We check the pinfo and puc pointers in the frame. - */ -static int sane_signal_64_frame(unsigned long sp) -{ - struct signal_frame_64 __user *sf; - unsigned long pinfo, puc; - - sf = (struct signal_fram
[PATCH v2 33/35] powerpc/64: Make COMPAT user-selectable disabled on littleendian by default.
On bigendian ppc64 it is common to have 32bit legacy binaries but much less so on littleendian. Signed-off-by: Michal Suchanek Reviewed-by: Christophe Leroy --- v3: make configurable --- arch/powerpc/Kconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3e56c9c2f16e..825528db2921 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -266,8 +266,9 @@ config PANIC_TIMEOUT default 180 config COMPAT - bool - default y if PPC64 + bool "Enable support for 32bit binaries" + depends on PPC64 + default y if !CPU_LITTLE_ENDIAN select COMPAT_BINFMT_ELF select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION -- 2.23.0
[PATCH v2 32/35] powerpc/64: make buildable without CONFIG_COMPAT
There are numerous references to 32bit functions in generic and 64bit code so ifdef them out. Signed-off-by: Michal Suchanek --- v2: - fix 32bit ifdef condition in signal.c - simplify the compat ifdef condition in vdso.c - 64bit is redundant - simplify the compat ifdef condition in callchain.c - 64bit is redundant v3: - use IS_ENABLED and maybe_unused where possible - do not ifdef declarations - clean up Makefile v4: - further makefile cleanup - simplify is_32bit_task conditions - avoid ifdef in condition by using return v5: - avoid unreachable code on 32bit - make is_current_64bit constant on !COMPAT - add stub perf_callchain_user_32 to avoid some ifdefs v6: - consolidate current_is_64bit v7: - remove leftover perf_callchain_user_32 stub from previous series version v8: - fix build again - too trigger-happy with stub removal - remove a vdso.c hunk that causes warning according to kbuild test robot v9: - removed current_is_64bit in previous patch v10: - rebase on top of 70ed86f4de5bd --- arch/powerpc/include/asm/thread_info.h | 4 ++-- arch/powerpc/kernel/Makefile | 6 +++--- arch/powerpc/kernel/entry_64.S | 2 ++ arch/powerpc/kernel/signal.c | 3 +-- arch/powerpc/kernel/syscall_64.c | 6 ++ arch/powerpc/kernel/vdso.c | 3 ++- arch/powerpc/perf/callchain.c | 8 +++- 7 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 8e1d0195ac36..c128d8a48ea3 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -144,10 +144,10 @@ static inline bool test_thread_local_flags(unsigned int flags) return (ti->local_flags & flags) != 0; } -#ifdef CONFIG_PPC64 +#ifdef CONFIG_COMPAT #define is_32bit_task()(test_thread_flag(TIF_32BIT)) #else -#define is_32bit_task()(1) +#define is_32bit_task()(IS_ENABLED(CONFIG_PPC32)) #endif #if defined(CONFIG_PPC64) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 45f1d5e54671..35874119b398 
100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -44,16 +44,16 @@ CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING endif obj-y := cputable.o ptrace.o syscalls.o \ - irq.o align.o signal_32.o pmc.o vdso.o \ + irq.o align.o signal_$(BITS).o pmc.o vdso.o \ process.o systbl.o idle.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ of_platform.o prom_parse.o -obj-$(CONFIG_PPC64)+= setup_64.o sys_ppc32.o \ - signal_64.o ptrace32.o \ +obj-$(CONFIG_PPC64)+= setup_64.o \ paca.o nvram_64.o firmware.o note.o \ syscall_64.o +obj-$(CONFIG_COMPAT) += sys_ppc32.o ptrace32.o signal_32.o obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 00173cc904ef..c339a984958f 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -52,8 +52,10 @@ SYS_CALL_TABLE: .tc sys_call_table[TC],sys_call_table +#ifdef CONFIG_COMPAT COMPAT_SYS_CALL_TABLE: .tc compat_sys_call_table[TC],compat_sys_call_table +#endif /* This value is used to mark exception frames on the stack. 
*/ exception_marker: diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index 60436432399f..61678cb0e6a1 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -247,7 +247,6 @@ static void do_signal(struct task_struct *tsk) sigset_t *oldset = sigmask_to_save(); struct ksignal ksig = { .sig = 0 }; int ret; - int is32 = is_32bit_task(); BUG_ON(tsk != current); @@ -277,7 +276,7 @@ static void do_signal(struct task_struct *tsk) rseq_signal_deliver(&ksig, tsk->thread.regs); - if (is32) { + if (is_32bit_task()) { if (ksig.ka.sa.sa_flags & SA_SIGINFO) ret = handle_rt_signal32(&ksig, oldset, tsk); else diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c index 62f44c3072f3..783deda66866 100644 --- a/arch/powerpc/kernel/syscall_64.c +++ b/arch/powerpc/kernel/syscall_64.c @@ -18,7 +18,6 @@ typedef long (*syscall_fn)(long, long, long, long, long, long); long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs) { - unsigned long ti_fla
[PATCH v2 31/35] powerpc/perf: consolidate valid_user_sp
Merge the 32bit and 64bit version. Halve the check constants on 32bit. Use STACK_TOP since it is defined. Passing is_64 is now redundant since is_32bit_task() is used to determine which callchain variant should be used. Use STACK_TOP and is_32bit_task() directly. This removes a page from the valid 32bit area on 64bit: #define TASK_SIZE_USER32 (0x0000000100000000UL - (1 * PAGE_SIZE)) #define STACK_TOP_USER32 TASK_SIZE_USER32 Signed-off-by: Michal Suchanek --- v8: new patch v11: simplify by using is_32bit_task() --- arch/powerpc/perf/callchain.c | 27 +++ 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index c6c4c609cc14..a22a19975a19 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -102,6 +102,15 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re } } +static inline int valid_user_sp(unsigned long sp) +{ + bool is_64 = !is_32bit_task(); + + if (!sp || (sp & (is_64 ? 7 : 3)) || sp > STACK_TOP - (is_64 ? 32 : 16)) + return 0; + return 1; +} + #ifdef CONFIG_PPC64 /* * On 64-bit we don't want to invoke hash_page on user addresses from @@ -165,13 +174,6 @@ static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) return read_user_stack_slow(ptr, ret, 8); } -static inline int valid_user_sp(unsigned long sp, int is_64) -{ - if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32) - return 0; - return 1; -} - /* * 64-bit user processes use the same stack frame for RT and non-RT signals. 
*/ @@ -230,7 +232,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, while (entry->nr < entry->max_stack) { fp = (unsigned long __user *) sp; - if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) + if (!valid_user_sp(sp) || read_user_stack_64(fp, &next_sp)) return; if (level > 0 && read_user_stack_64(&fp[2], &next_ip)) return; @@ -279,13 +281,6 @@ static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry { } -static inline int valid_user_sp(unsigned long sp, int is_64) -{ - if (!sp || (sp & 7) || sp > TASK_SIZE - 32) - return 0; - return 1; -} - #define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE #define sigcontext32 sigcontext #define mcontext32 mcontext @@ -428,7 +423,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, while (entry->nr < entry->max_stack) { fp = (unsigned int __user *) (unsigned long) sp; - if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) + if (!valid_user_sp(sp) || read_user_stack_32(fp, &next_sp)) return; if (level > 0 && read_user_stack_32(&fp[1], &next_ip)) return; -- 2.23.0
[PATCH v2 30/35] powerpc/perf: consolidate read_user_stack_32
There are two almost identical copies for 32bit and 64bit. The function is used only in 32bit code which will be split out in next patch so consolidate to one function. Signed-off-by: Michal Suchanek Reviewed-by: Christophe Leroy --- v6: new patch v8: move the consolidated function out of the ifdef block. --- arch/powerpc/perf/callchain.c | 59 +++ 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 35d542515faf..c6c4c609cc14 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -165,22 +165,6 @@ static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) return read_user_stack_slow(ptr, ret, 8); } -static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) -{ - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || - ((unsigned long)ptr & 3)) - return -EFAULT; - - pagefault_disable(); - if (!__get_user_inatomic(*ret, ptr)) { - pagefault_enable(); - return 0; - } - pagefault_enable(); - - return read_user_stack_slow(ptr, ret, 4); -} - static inline int valid_user_sp(unsigned long sp, int is_64) { if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32) @@ -285,25 +269,9 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, } #else /* CONFIG_PPC64 */ -/* - * On 32-bit we just access the address and let hash_page create a - * HPTE if necessary, so there is no need to fall back to reading - * the page tables. Since this is called at interrupt level, - * do_page_fault() won't treat a DSI as a page fault. 
- */ -static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) +static int read_user_stack_slow(void __user *ptr, void *buf, int nb) { - int rc; - - if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || - ((unsigned long)ptr & 3)) - return -EFAULT; - - pagefault_disable(); - rc = __get_user_inatomic(*ret, ptr); - pagefault_enable(); - - return rc; + return 0; } static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, @@ -326,6 +294,29 @@ static inline int valid_user_sp(unsigned long sp, int is_64) #endif /* CONFIG_PPC64 */ +/* + * On 32-bit we just access the address and let hash_page create a + * HPTE if necessary, so there is no need to fall back to reading + * the page tables. Since this is called at interrupt level, + * do_page_fault() won't treat a DSI as a page fault. + */ +static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) +{ + int rc; + + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || + ((unsigned long)ptr & 3)) + return -EFAULT; + + pagefault_disable(); + rc = __get_user_inatomic(*ret, ptr); + pagefault_enable(); + + if (IS_ENABLED(CONFIG_PPC64) && rc) + return read_user_stack_slow(ptr, ret, 4); + return rc; +} + /* * Layout for non-RT signal frames */ -- 2.23.0
[PATCH v2 29/35] powerpc/perf: remove current_is_64bit()
Since commit ed1cd6deb013 ("powerpc: Activate CONFIG_THREAD_INFO_IN_TASK") current_is_64bit() is equivalent to !is_32bit_task(). Remove the redundant function. Link: https://github.com/linuxppc/issues/issues/275 Link: https://lkml.org/lkml/2019/9/12/540 Fixes: linuxppc#275 Suggested-by: Christophe Leroy Signed-off-by: Michal Suchanek --- arch/powerpc/perf/callchain.c | 17 + 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index c84bbd4298a0..35d542515faf 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -284,16 +284,6 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, } } -static inline int current_is_64bit(void) -{ - /* -* We can't use test_thread_flag() here because we may be on an -* interrupt stack, and the thread flags don't get copied over -* from the thread_info on the main stack to the interrupt stack. -*/ - return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT); -} - #else /* CONFIG_PPC64 */ /* * On 32-bit we just access the address and let hash_page create a @@ -321,11 +311,6 @@ static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry { } -static inline int current_is_64bit(void) -{ - return 0; -} - static inline int valid_user_sp(unsigned long sp, int is_64) { if (!sp || (sp & 7) || sp > TASK_SIZE - 32) @@ -486,7 +471,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - if (current_is_64bit()) + if (!is_32bit_task()) perf_callchain_user_64(entry, regs); else perf_callchain_user_32(entry, regs); -- 2.23.0
[PATCH v2 28/35] powerpc: move common register copy functions from signal_32.c to signal.c
These functions are required for 64bit as well. Signed-off-by: Michal Suchanek Reviewed-by: Christophe Leroy --- arch/powerpc/kernel/signal.c| 141 arch/powerpc/kernel/signal_32.c | 140 --- 2 files changed, 141 insertions(+), 140 deletions(-) diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index e6c30cee6abf..60436432399f 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -18,12 +18,153 @@ #include #include #include +#include #include #include #include #include "signal.h" +#ifdef CONFIG_VSX +unsigned long copy_fpr_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NFPREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + buf[i] = task->thread.TS_FPR(i); + buf[i] = task->thread.fp_state.fpscr; + return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); +} + +unsigned long copy_fpr_from_user(struct task_struct *task, +void __user *from) +{ + u64 buf[ELF_NFPREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) + return 1; + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + task->thread.TS_FPR(i) = buf[i]; + task->thread.fp_state.fpscr = buf[i]; + + return 0; +} + +unsigned long copy_vsx_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < ELF_NVSRHALFREG; i++) + buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; + return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); +} + +unsigned long copy_vsx_from_user(struct task_struct *task, +void __user *from) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) + return 1; + for (i = 0; i < ELF_NVSRHALFREG ; i++) + task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + return 0; +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +unsigned long 
copy_ckfpr_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NFPREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + buf[i] = task->thread.TS_CKFPR(i); + buf[i] = task->thread.ckfp_state.fpscr; + return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); +} + +unsigned long copy_ckfpr_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NFPREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) + return 1; + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + task->thread.TS_CKFPR(i) = buf[i]; + task->thread.ckfp_state.fpscr = buf[i]; + + return 0; +} + +unsigned long copy_ckvsx_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < ELF_NVSRHALFREG; i++) + buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET]; + return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); +} + +unsigned long copy_ckvsx_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) + return 1; + for (i = 0; i < ELF_NVSRHALFREG ; i++) + task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + return 0; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +#else +inline unsigned long copy_fpr_to_user(void __user *to, + struct task_struct *task) +{ + return __copy_to_user(to, task->thread.fp_state.fpr, + ELF_NFPREG * sizeof(double)); +} + +inline unsigned long copy_fpr_from_user(struct task_struct *task, + void __user *from) +{ + return __copy_from_user(task->thread.fp_state.fpr, from, + ELF_NFPREG * sizeof(double)); +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +inline unsigned long copy_ckfpr_to_user(void __user *to, +struct task_struct *task) +{ + return __copy_to_user(to, task->thread.ckfp_state.fpr, + ELF_NFPREG * 
sizeof(double)); +} +
[PATCH v2 27/35] powerpc: Add back __ARCH_WANT_SYS_LLSEEK macro
This partially reverts commit caf6f9c8a326 ("asm-generic: Remove unneeded __ARCH_WANT_SYS_LLSEEK macro") When CONFIG_COMPAT is disabled on ppc64 the kernel does not build. There is resistance to both removing the llseek syscall from the 64bit syscall tables and building the llseek interface unconditionally. Link: https://lore.kernel.org/lkml/20190828151552.ga16...@infradead.org/ Link: https://lore.kernel.org/lkml/20190829214319.498c7de2@naga/ Signed-off-by: Michal Suchanek Reviewed-by: Arnd Bergmann --- arch/powerpc/include/asm/unistd.h | 1 + fs/read_write.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index b0720c7c3fcf..700fcdac2e3c 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -31,6 +31,7 @@ #define __ARCH_WANT_SYS_SOCKETCALL #define __ARCH_WANT_SYS_FADVISE64 #define __ARCH_WANT_SYS_GETPGRP +#define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT #define __ARCH_WANT_SYS_OLD_UNAME diff --git a/fs/read_write.c b/fs/read_write.c index 5bbf587f5bc1..89aa2701dbeb 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -331,7 +331,8 @@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned i } #endif -#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) +#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ + defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) -- 2.23.0
[PATCH v2 26/35] powerpc/64: system call: Fix sparse warning about missing declaration
Sparse warns about missing declarations for these functions: +arch/powerpc/kernel/syscall_64.c:108:23: warning: symbol 'syscall_exit_prepare' was not declared. Should it be static? +arch/powerpc/kernel/syscall_64.c:18:6: warning: symbol 'system_call_exception' was not declared. Should it be static? +arch/powerpc/kernel/syscall_64.c:200:23: warning: symbol 'interrupt_exit_user_prepare' was not declared. Should it be static? +arch/powerpc/kernel/syscall_64.c:288:23: warning: symbol 'interrupt_exit_kernel_prepare' was not declared. Should it be static? Add declaration for them. Signed-off-by: Michal Suchanek --- arch/powerpc/include/asm/asm-prototypes.h | 6 ++ arch/powerpc/kernel/syscall_64.c | 1 + 2 files changed, 7 insertions(+) diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 399ca63196e4..841746357833 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -96,6 +96,12 @@ ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s unsigned long __init early_init(unsigned long dt_ptr); void __init machine_init(u64 dt_ptr); #endif +#ifdef CONFIG_PPC64 +long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs); +notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs); +notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr); +notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr); +#endif long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low); diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c index d00cfc4a39a9..62f44c3072f3 100644 --- a/arch/powerpc/kernel/syscall_64.c +++ b/arch/powerpc/kernel/syscall_64.c @@ -1,4 +1,5 @@ #include +#include #include #include #include -- 2.23.0
[PATCH v2 25/35] powerpc/64s/exception: remove lite interrupt return
From: Nicholas Piggin The difference between lite and regular returns is that the regular case restores all NVGPRs, whereas the lite case skips that. This is quite clumsy though, most interrupts want the NVGPRs saved for debugging, not to modify in the caller, so the NVGPRs restore is not necessary most of the time. Restore NVGPRs explicitly for one case that requires it, and move everything else over to avoiding the restore unless the interrupt return demands it (e.g., handling a signal). Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/entry_64.S | 4 arch/powerpc/kernel/exceptions-64s.S | 21 +++-- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index b2e68f5ca8f7..00173cc904ef 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -452,10 +452,6 @@ _GLOBAL(fast_interrupt_return) .balign IFETCH_ALIGN_BYTES _GLOBAL(interrupt_return) - REST_NVGPRS(r1) - - .balign IFETCH_ALIGN_BYTES -_GLOBAL(interrupt_return_lite) ld r4,_MSR(r1) andi. 
r0,r4,MSR_PR beq kernel_interrupt_return diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 269edd1460be..1bccc869ebd3 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1507,7 +1507,7 @@ EXC_COMMON_BEGIN(hardware_interrupt_common) RUNLATCH_ON addir3,r1,STACK_FRAME_OVERHEAD bl do_IRQ - b interrupt_return_lite + b interrupt_return GEN_KVM hardware_interrupt @@ -1694,7 +1694,7 @@ EXC_COMMON_BEGIN(decrementer_common) RUNLATCH_ON addir3,r1,STACK_FRAME_OVERHEAD bl timer_interrupt - b interrupt_return_lite + b interrupt_return GEN_KVM decrementer @@ -1785,7 +1785,7 @@ EXC_COMMON_BEGIN(doorbell_super_common) #else bl unknown_exception #endif - b interrupt_return_lite + b interrupt_return GEN_KVM doorbell_super @@ -2183,7 +2183,7 @@ EXC_COMMON_BEGIN(h_doorbell_common) #else bl unknown_exception #endif - b interrupt_return_lite + b interrupt_return GEN_KVM h_doorbell @@ -2213,7 +2213,7 @@ EXC_COMMON_BEGIN(h_virt_irq_common) RUNLATCH_ON addir3,r1,STACK_FRAME_OVERHEAD bl do_IRQ - b interrupt_return_lite + b interrupt_return GEN_KVM h_virt_irq @@ -2260,7 +2260,7 @@ EXC_COMMON_BEGIN(performance_monitor_common) RUNLATCH_ON addir3,r1,STACK_FRAME_OVERHEAD bl performance_monitor_exception - b interrupt_return_lite + b interrupt_return GEN_KVM performance_monitor @@ -3013,7 +3013,7 @@ do_hash_page: cmpdi r3,0/* see if __hash_page succeeded */ /* Success */ - beq interrupt_return_lite /* Return from exception on success */ + beq interrupt_return/* Return from exception on success */ /* Error */ blt-13f @@ -3027,10 +3027,11 @@ do_hash_page: handle_page_fault: 11:andis. 
r0,r5,DSISR_DABRMATCH@h bne-handle_dabr_fault + bl save_nvgprs addir3,r1,STACK_FRAME_OVERHEAD bl do_page_fault cmpdi r3,0 - beq+interrupt_return_lite + beq+interrupt_return mr r5,r3 addir3,r1,STACK_FRAME_OVERHEAD ld r4,_DAR(r1) @@ -3045,9 +3046,9 @@ handle_dabr_fault: bl do_break /* * do_break() may have changed the NV GPRS while handling a breakpoint. -* If so, we need to restore them with their updated values. Don't use -* interrupt_return_lite here. +* If so, we need to restore them with their updated values. */ + REST_NVGPRS(r1) b interrupt_return -- 2.23.0
[PATCH v2 23/35] powerpc/64: system call implement the bulk of the logic in C
From: Nicholas Piggin System call entry and particularly exit code is beyond the limit of what is reasonable to implement in asm. This conversion moves all conditional branches out of the asm code, except for the case that all GPRs should be restored at exit. Null syscall test is about 5% faster after this patch, because the exit work is handled under local_irq_disable, and the hard mask and pending interrupt replay is handled after that, which avoids games with MSR. Signed-off-by: Nicholas Piggin [ms: add endian conversion for dtl_idx] Signed-off-by: Michal Suchanek v3: - Fix !KUAP build [mpe] - Fix BookE build/boot [mpe] - Don't trace irqs with MSR[RI]=0 - Don't allow syscall_exit_prepare to be ftraced, because function graph tracing which traces exits barfs after the IRQ state is prepared for kernel exit. - Fix BE syscall table to use normal function descriptors now that they are called from C. - Comment syscall_exit_prepare. --- arch/powerpc/include/asm/asm-prototypes.h | 11 - .../powerpc/include/asm/book3s/64/kup-radix.h | 14 +- arch/powerpc/include/asm/cputime.h| 24 ++ arch/powerpc/include/asm/hw_irq.h | 4 + arch/powerpc/include/asm/ptrace.h | 3 + arch/powerpc/include/asm/signal.h | 3 + arch/powerpc/include/asm/switch_to.h | 5 + arch/powerpc/include/asm/time.h | 3 + arch/powerpc/kernel/Makefile | 3 +- arch/powerpc/kernel/entry_64.S| 337 +++--- arch/powerpc/kernel/signal.h | 2 - arch/powerpc/kernel/syscall_64.c | 195 ++ arch/powerpc/kernel/systbl.S | 9 +- 13 files changed, 300 insertions(+), 313 deletions(-) create mode 100644 arch/powerpc/kernel/syscall_64.c diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 8561498e653c..399ca63196e4 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -103,14 +103,6 @@ long sys_switch_endian(void); notrace unsigned int __check_irq_replay(void); void notrace restore_interrupts(void); -/* ptrace */ -long 
do_syscall_trace_enter(struct pt_regs *regs); -void do_syscall_trace_leave(struct pt_regs *regs); - -/* process */ -void restore_math(struct pt_regs *regs); -void restore_tm_state(struct pt_regs *regs); - /* prom_init (OpenFirmware) */ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unsigned long pp, @@ -121,9 +113,6 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, void __init early_setup(unsigned long dt_ptr); void early_setup_secondary(void); -/* time */ -void accumulate_stolen_time(void); - /* misc runtime */ extern u64 __bswapdi2(u64); extern s64 __lshrdi3(s64, int); diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index f254de956d6a..07058edc5970 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -3,6 +3,7 @@ #define _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H #include +#include #define AMR_KUAP_BLOCK_READUL(0x4000) #define AMR_KUAP_BLOCK_WRITE UL(0x8000) @@ -56,7 +57,14 @@ #ifdef CONFIG_PPC_KUAP -#include +#include +#include + +static inline void kuap_check_amr(void) +{ + if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_RADIX_KUAP)) + WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED); +} /* * We support individually allowing read or write, but we don't support nesting @@ -101,6 +109,10 @@ static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)), "Bug: %s fault blocked by AMR!", is_write ? 
"Write" : "Read"); } +#else /* CONFIG_PPC_KUAP */ +static inline void kuap_check_amr(void) +{ +} #endif /* CONFIG_PPC_KUAP */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 2431b4ada2fa..c43614cffaac 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -60,6 +60,30 @@ static inline void arch_vtime_task_switch(struct task_struct *prev) } #endif +static inline void account_cpu_user_entry(void) +{ + unsigned long tb = mftb(); + struct cpu_accounting_data *acct = get_accounting(current); + + acct->utime += (tb - acct->starttime_user); + acct->starttime = tb; +} +static inline void account_cpu_user_exit(void) +{ + unsigned long tb = mftb(); + struct cpu_accounting_data *acct = get_accounting(current); + + acct->stime += (tb - acct->starttime); + acct->starttime_user = tb; +} + #endif /* __KERNEL__ */ +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +static inline void account_cpu_
[PATCH v2 24/35] powerpc/64s: interrupt return in C
From: Nicholas Piggin Implement the bulk of interrupt return logic in C. The asm return code must handle a few cases: restoring full GPRs, and emulating stack store. The asm return code is moved into 64e for now. The new logic has made allowance for 64e, but I don't have a full environment that works well to test it, and even booting in emulated qemu is not great for stress testing. 64e shouldn't be too far off working with this, given a bit more testing and auditing of the logic. This is slightly faster on a POWER9 (page fault speed increases about 1.1%), probably due to reduced mtmsrd. Signed-off-by: Nicholas Piggin [ms: Move the FP restore functions to restore_math. They are not used anywhere else and when restore_math is not built gcc warns about them being unused. Add asm/context_tracking.h include to exceptions-64e.S for SCHEDULE_USER definition.] Signed-off-by: Michal Suchanek --- .../powerpc/include/asm/book3s/64/kup-radix.h | 10 + arch/powerpc/include/asm/switch_to.h | 6 + arch/powerpc/kernel/entry_64.S| 475 -- arch/powerpc/kernel/exceptions-64e.S | 255 +- arch/powerpc/kernel/exceptions-64s.S | 119 ++--- arch/powerpc/kernel/process.c | 89 ++-- arch/powerpc/kernel/syscall_64.c | 157 +- arch/powerpc/kernel/vector.S | 2 +- 8 files changed, 623 insertions(+), 490 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index 07058edc5970..762afbed4762 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -60,6 +60,12 @@ #include #include +static inline void kuap_restore_amr(struct pt_regs *regs) +{ + if (mmu_has_feature(MMU_FTR_RADIX_KUAP)) + mtspr(SPRN_AMR, regs->kuap); +} + static inline void kuap_check_amr(void) { if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_RADIX_KUAP)) @@ -110,6 +116,10 @@ static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) "Bug: %s fault blocked by AMR!", is_write ? 
"Write" : "Read"); } #else /* CONFIG_PPC_KUAP */ +static inline void kuap_restore_amr(struct pt_regs *regs) +{ +} + static inline void kuap_check_amr(void) { } diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 476008bc3d08..b867b58b1093 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -23,7 +23,13 @@ extern void switch_booke_debug_regs(struct debug_reg *new_debug); extern int emulate_altivec(struct pt_regs *); +#ifdef CONFIG_PPC_BOOK3S_64 void restore_math(struct pt_regs *regs); +#else +static inline void restore_math(struct pt_regs *regs) +{ +} +#endif void restore_tm_state(struct pt_regs *regs); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 15bc2a872a76..b2e68f5ca8f7 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -279,7 +280,7 @@ flush_count_cache: * state of one is saved on its kernel stack. Then the state * of the other is restored from its kernel stack. The memory * management hardware is updated to the second process's state. - * Finally, we can return to the second process, via ret_from_except. + * Finally, we can return to the second process, via interrupt_return. * On entry, r3 points to the THREAD for the current task, r4 * points to the THREAD for the new task. * @@ -433,408 +434,150 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) addir1,r1,SWITCH_FRAME_SIZE blr - .align 7 -_GLOBAL(ret_from_except) - ld r11,_TRAP(r1) - andi. r0,r11,1 - bne ret_from_except_lite - REST_NVGPRS(r1) - -_GLOBAL(ret_from_except_lite) +#ifdef CONFIG_PPC_BOOK3S /* -* Disable interrupts so that current_thread_info()->flags -* can't change between when we test it and when we return -* from the interrupt. 
-*/ -#ifdef CONFIG_PPC_BOOK3E - wrteei 0 -#else - li r10,MSR_RI - mtmsrd r10,1 /* Update machine state */ -#endif /* CONFIG_PPC_BOOK3E */ +* If MSR EE/RI was never enabled, IRQs not reconciled, NVGPRs not +* touched, AMR not set, no exit work created, then this can be used. +*/ + .balign IFETCH_ALIGN_BYTES +_GLOBAL(fast_interrupt_return) + ld r4,_MSR(r1) + andi. r0,r4,MSR_PR + bne .Lfast_user_interrupt_return + andi. r0,r4,MSR_RI + bne+.Lfast_kernel_interrupt_return + addir3,r1,STACK_FRAME_OVERHEAD + bl unrecoverable_exception + b . /* should not get here */ - ld r9, PACA_THREAD_INFO(r13) - ld r3,_MSR(r1) -#ifdef CONFIG_PPC_BOOK3E
[PATCH v2 22/35] powerpc/64: system call remove non-volatile GPR save optimisation
From: Nicholas Piggin powerpc has an optimisation where interrupts avoid saving the non-volatile (or callee saved) registers to the interrupt stack frame if they are not required. Two problems with this are that an interrupt does not always know whether it will need non-volatiles; and if it does need them, they can only be saved from the entry-scoped asm code (because we don't control what the C compiler does with these registers). system calls are the most difficult: some system calls always require all registers (e.g., fork, to copy regs into the child). Sometimes registers are only required under certain conditions (e.g., tracing, signal delivery). These cases require ugly logic in the call chains (e.g., ppc_fork), and require a lot of logic to be implemented in asm. So remove the optimisation for system calls, and always save NVGPRs on entry. Modern high performance CPUs are not so sensitive, because the stores are dense in cache and can be hidden by other expensive work in the syscall path -- the null syscall selftests benchmark on POWER9 is not slowed (124.40ns before and 123.64ns after, i.e., within the noise). Other interrupts retain the NVGPR optimisation for now. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/entry_64.S | 72 +--- arch/powerpc/kernel/syscalls/syscall.tbl | 22 +--- 2 files changed, 28 insertions(+), 66 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 6467bdab8d40..5a3e0b5c9ad1 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -98,13 +98,14 @@ END_BTB_FLUSH_SECTION std r11,_XER(r1) std r11,_CTR(r1) std r9,GPR13(r1) + SAVE_NVGPRS(r1) mflrr10 /* * This clears CR0.SO (bit 28), which is the error indication on * return from this system call. 
*/ rldimi r2,r11,28,(63-28) - li r11,0xc01 + li r11,0xc00 std r10,_LINK(r1) std r11,_TRAP(r1) std r3,ORIG_GPR3(r1) @@ -323,7 +324,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Traced system call support */ .Lsyscall_dotrace: - bl save_nvgprs addir3,r1,STACK_FRAME_OVERHEAD bl do_syscall_trace_enter @@ -408,7 +408,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mtmsrd r10,1 #endif /* CONFIG_PPC_BOOK3E */ - bl save_nvgprs addir3,r1,STACK_FRAME_OVERHEAD bl do_syscall_trace_leave b ret_from_except @@ -442,62 +441,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) _ASM_NOKPROBE_SYMBOL(system_call_common); _ASM_NOKPROBE_SYMBOL(system_call_exit); -/* Save non-volatile GPRs, if not already saved. */ -_GLOBAL(save_nvgprs) - ld r11,_TRAP(r1) - andi. r0,r11,1 - beqlr- - SAVE_NVGPRS(r1) - clrrdi r0,r11,1 - std r0,_TRAP(r1) - blr -_ASM_NOKPROBE_SYMBOL(save_nvgprs); - - -/* - * The sigsuspend and rt_sigsuspend system calls can call do_signal - * and thus put the process into the stopped state where we might - * want to examine its user state with ptrace. Therefore we need - * to save all the nonvolatile registers (r14 - r31) before calling - * the C code. Similarly, fork, vfork and clone need the full - * register state on the stack so that it can be copied to the child. 
- */ - -_GLOBAL(ppc_fork) - bl save_nvgprs - bl sys_fork - b .Lsyscall_exit - -_GLOBAL(ppc_vfork) - bl save_nvgprs - bl sys_vfork - b .Lsyscall_exit - -_GLOBAL(ppc_clone) - bl save_nvgprs - bl sys_clone - b .Lsyscall_exit - -_GLOBAL(ppc_clone3) - bl save_nvgprs - bl sys_clone3 - b .Lsyscall_exit - -_GLOBAL(ppc32_swapcontext) - bl save_nvgprs - bl compat_sys_swapcontext - b .Lsyscall_exit - -_GLOBAL(ppc64_swapcontext) - bl save_nvgprs - bl sys_swapcontext - b .Lsyscall_exit - -_GLOBAL(ppc_switch_endian) - bl save_nvgprs - bl sys_switch_endian - b .Lsyscall_exit - _GLOBAL(ret_from_fork) bl schedule_tail REST_NVGPRS(r1) @@ -516,6 +459,17 @@ _GLOBAL(ret_from_kernel_thread) li r3,0 b .Lsyscall_exit +/* Save non-volatile GPRs, if not already saved. */ +_GLOBAL(save_nvgprs) + ld r11,_TRAP(r1) + andi. r0,r11,1 + beqlr- + SAVE_NVGPRS(r1) + clrrdi r0,r11,1 + std r0,_TRAP(r1) + blr +_ASM_NOKPROBE_SYMBOL(save_nvgprs); + #ifdef CONFIG_PPC_BOOK3S_64 #define FLUSH_COUNT_CACHE \ diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 43f736ed47f2..d899bcb5343e 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -9,7 +9,9 @@
[PATCH v2 21/35] powerpc/64s/exception: soft nmi interrupt should not use ret_from_except
From: Nicholas Piggin The soft nmi handler does not reconcile interrupt state, so it should not return via the normal ret_from_except path. Return like other NMIs, using the EXCEPTION_RESTORE_REGS macro. This becomes important when the scv interrupt is implemented, which must handle soft-masked interrupts that have r13 set to something other than the PACA -- returning to kernel in this case must restore r13. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 38bc66b95516..af1264cd005f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -2740,7 +2740,11 @@ EXC_COMMON_BEGIN(soft_nmi_common) bl save_nvgprs addir3,r1,STACK_FRAME_OVERHEAD bl soft_nmi_interrupt - b ret_from_except + /* Clear MSR_RI before setting SRR0 and SRR1. */ + li r9,0 + mtmsrd r9,1 + EXCEPTION_RESTORE_REGS hsrr=0 + RFI_TO_KERNEL #endif /* CONFIG_PPC_WATCHDOG */ -- 2.23.0
[PATCH v2 20/35] powerpc/64s/exception: only test KVM in SRR interrupts when PR KVM is supported
From: Nicholas Piggin Apart from SRESET, MCE, and syscall (hcall variant), the SRR type interrupts are not escalated to hypervisor mode, so delivered to the OS. When running PR KVM, the OS is the hypervisor, and the guest runs with MSR[PR]=1, so these interrupts must test if a guest was running when interrupted. These tests are required at the real-mode entry points because the PR KVM host runs with LPCR[AIL]=0. In HV KVM and nested HV KVM, the guest always receives these interrupts, so there is no need for the host to make this test. So remove the tests if PR KVM is not configured. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 65 ++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2f50587392aa..38bc66b95516 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -214,9 +214,36 @@ do_define_int n #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* - * If hv is possible, interrupts come into to the hv version - * of the kvmppc_interrupt code, which then jumps to the PR handler, - * kvmppc_interrupt_pr, if the guest is a PR guest. + * All interrupts which set HSRR registers, as well as SRESET and MCE and + * syscall when invoked with "sc 1" switch to MSR[HV]=1 (HVMODE) to be taken, + * so they all generally need to test whether they were taken in guest context. + * + * Note: SRESET and MCE may also be sent to the guest by the hypervisor, and be + * taken with MSR[HV]=0. + * + * Interrupts which set SRR registers (with the above exceptions) do not + * elevate to MSR[HV]=1 mode, though most can be taken when running with + * MSR[HV]=1 (e.g., bare metal kernel and userspace). So these interrupts do + * not need to test whether a guest is running because they get delivered to + * the guest directly, including nested HV KVM guests. 
+ * + * The exception is PR KVM, where the guest runs with MSR[PR]=1 and the host + * runs with MSR[HV]=0, so the host takes all interrupts on behalf of the + * guest. PR KVM runs with LPCR[AIL]=0 which causes interrupts to always be + * delivered to the real-mode entry point, therefore such interrupts only test + * KVM in their real mode handlers, and only when PR KVM is possible. + * + * Interrupts that are taken in MSR[HV]=0 and escalate to MSR[HV]=1 are always + * delivered in real-mode when the MMU is in hash mode because the MMU + * registers are not set appropriately to translate host addresses. In nested + * radix mode these can be delivered in virt-mode as the host translations are + * used implicitly (see: effective LPID, effective PID). + */ + +/* + * If an interrupt is taken while a guest is running, it is immediately routed + * to KVM to handle. If both HV and PR KVM arepossible, KVM interrupts go first + * to kvmppc_interrupt_hv, which handles the PR guest case. */ #define kvmppc_interrupt kvmppc_interrupt_hv #else @@ -1258,8 +1285,10 @@ INT_DEFINE_BEGIN(data_access) IVEC=0x300 IDAR=1 IDSISR=1 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE IKVM_SKIP=1 IKVM_REAL=1 +#endif INT_DEFINE_END(data_access) EXC_REAL_BEGIN(data_access, 0x300, 0x80) @@ -1306,8 +1335,10 @@ INT_DEFINE_BEGIN(data_access_slb) IAREA=PACA_EXSLB IRECONCILE=0 IDAR=1 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE IKVM_SKIP=1 IKVM_REAL=1 +#endif INT_DEFINE_END(data_access_slb) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) @@ -1357,7 +1388,9 @@ INT_DEFINE_BEGIN(instruction_access) IISIDE=1 IDAR=1 IDSISR=1 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE IKVM_REAL=1 +#endif INT_DEFINE_END(instruction_access) EXC_REAL_BEGIN(instruction_access, 0x400, 0x80) @@ -1396,7 +1429,9 @@ INT_DEFINE_BEGIN(instruction_access_slb) IRECONCILE=0 IISIDE=1 IDAR=1 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE IKVM_REAL=1 +#endif INT_DEFINE_END(instruction_access_slb) EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) @@ -1488,7 +1523,9 @@ 
INT_DEFINE_BEGIN(alignment) IVEC=0x600 IDAR=1 IDSISR=1 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE IKVM_REAL=1 +#endif INT_DEFINE_END(alignment) EXC_REAL_BEGIN(alignment, 0x600, 0x100) @@ -1518,7 +1555,9 @@ EXC_COMMON_BEGIN(alignment_common) */ INT_DEFINE_BEGIN(program_check) IVEC=0x700 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE IKVM_REAL=1 +#endif INT_DEFINE_END(program_check) EXC_REAL_BEGIN(program_check, 0x700, 0x100) @@ -1581,7 +1620,9 @@ EXC_COMMON_BEGIN(program_check_common) INT_DEFINE_BEGIN(fp_unavailable) IVEC=0x800 IRECONCILE=0 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE IKVM_REAL=1 +#endif INT_DEFINE_END(fp_unavailable) EXC_REAL_BEGIN(fp_unavailable, 0x800, 0x100) @@ -1643,7 +1684,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) INT_DEFINE_BEGIN(decrementer) IVEC=0x900
[PATCH v2 19/35] powerpc/64s/exception: add more comments for interrupt handlers
From: Nicholas Piggin A few of the non-standard handlers are left uncommented. Some more description could be added to some. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 391 --- 1 file changed, 353 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ef37d0ab6594..2f50587392aa 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -121,26 +121,26 @@ name: /* * Interrupt code generation macros */ -#define IVEC .L_IVEC_\name\() -#define IHSRR .L_IHSRR_\name\() -#define IHSRR_IF_HVMODE.L_IHSRR_IF_HVMODE_\name\() -#define IAREA .L_IAREA_\name\() -#define IVIRT .L_IVIRT_\name\() -#define IISIDE .L_IISIDE_\name\() -#define IDAR .L_IDAR_\name\() -#define IDSISR .L_IDSISR_\name\() -#define ISET_RI.L_ISET_RI_\name\() -#define IBRANCH_TO_COMMON .L_IBRANCH_TO_COMMON_\name\() -#define IREALMODE_COMMON .L_IREALMODE_COMMON_\name\() -#define IMASK .L_IMASK_\name\() -#define IKVM_SKIP .L_IKVM_SKIP_\name\() -#define IKVM_REAL .L_IKVM_REAL_\name\() +#define IVEC .L_IVEC_\name\()/* Interrupt vector address */ +#define IHSRR .L_IHSRR_\name\() /* Sets SRR or HSRR registers */ +#define IHSRR_IF_HVMODE.L_IHSRR_IF_HVMODE_\name\() /* HSRR if HV else SRR */ +#define IAREA .L_IAREA_\name\() /* PACA save area */ +#define IVIRT .L_IVIRT_\name\() /* Has virt mode entry point */ +#define IISIDE .L_IISIDE_\name\() /* Uses SRR0/1 not DAR/DSISR */ +#define IDAR .L_IDAR_\name\()/* Uses DAR (or SRR0) */ +#define IDSISR .L_IDSISR_\name\() /* Uses DSISR (or SRR1) */ +#define ISET_RI.L_ISET_RI_\name\() /* Run common code w/ MSR[RI]=1 */ +#define IBRANCH_TO_COMMON .L_IBRANCH_TO_COMMON_\name\() /* ENTRY branch to common */ +#define IREALMODE_COMMON .L_IREALMODE_COMMON_\name\() /* Common runs in realmode */ +#define IMASK .L_IMASK_\name\() /* IRQ soft-mask bit */ +#define IKVM_SKIP .L_IKVM_SKIP_\name\() /* Generate KVM skip handler */ +#define IKVM_REAL 
.L_IKVM_REAL_\name\() /* Real entry tests KVM */ #define __IKVM_REAL(name) .L_IKVM_REAL_ ## name -#define IKVM_VIRT .L_IKVM_VIRT_\name\() -#define ISTACK .L_ISTACK_\name\() +#define IKVM_VIRT .L_IKVM_VIRT_\name\() /* Virt entry tests KVM */ +#define ISTACK .L_ISTACK_\name\() /* Set regular kernel stack */ #define __ISTACK(name) .L_ISTACK_ ## name -#define IRECONCILE .L_IRECONCILE_\name\() -#define IKUAP .L_IKUAP_\name\() +#define IRECONCILE .L_IRECONCILE_\name\() /* Do RECONCILE_IRQ_STATE */ +#define IKUAP .L_IKUAP_\name\() /* Do KUAP lock */ #define INT_DEFINE_BEGIN(n)\ .macro int_define_ ## n name @@ -759,6 +759,39 @@ __start_interrupts: EXC_VIRT_NONE(0x4000, 0x100) +/** + * Interrupt 0x100 - System Reset Interrupt (SRESET aka NMI). + * This is a non-maskable, asynchronous interrupt always taken in real-mode. + * It is caused by: + * - Wake from power-saving state, on powernv. + * - An NMI from another CPU, triggered by firmware or hypercall. + * - As crash/debug signal injected from BMC, firmware or hypervisor. + * + * Handling: + * Power-save wakeup is the only performance critical path, so this is + * determined quickly as possible first. In this case volatile registers + * can be discarded and SPRs like CFAR don't need to be read. + * + * If not a powersave wakeup, then it's run as a regular interrupt, however + * it uses its own stack and PACA save area to preserve the regular kernel + * environment for debugging. + * + * This interrupt is not maskable, so triggering it when MSR[RI] is clear, + * or SCRATCH0 is in use, etc. may cause a crash. It's also not entirely + * correct to switch to virtual mode to run the regular interrupt handler + * because it might be interrupted when the MMU is in a bad state (e.g., SLB + * is clear). + * + * FWNMI: + * PAPR specifies a "fwnmi" facility which sends the sreset to a different + * entry point with a different register set up. Some hypervisors will + * send the sreset to 0x100 in the guest if it is not fwnmi capable. 
+ * + * KVM: + * Unlike most SRR interrupts, this may be taken by the host while executing + * in a guest, so a KVM test is required. KVM will pull the CPU out of guest + * mode and then raise the sreset. + */ INT_DEFINE_BEGIN(system_reset) IVEC=0x100 IAREA=PACA_EXNMI @@ -834,6 +867,7 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake) * Vectors for the FWNMI option. Share common code. */ TRAMP_REAL_BEGIN(system_reset_fwnmi) + /* XXX: fwnmi guest could run a nested/PR guest, so why no test? */ __IKVM_REAL(system_reset
[PATCH v2 18/35] powerpc/64s/exception: Clean up SRR specifiers
From: Nicholas Piggin Remove more magic numbers and replace with nicely named bools. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 68 +--- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9494403b9586..ef37d0ab6594 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -105,11 +105,6 @@ name: ori reg,reg,(ABS_ADDR(label))@l;\ addis reg,reg,(ABS_ADDR(label))@h -/* Exception register prefixes */ -#define EXC_HV_OR_STD 2 /* depends on HVMODE */ -#define EXC_HV 1 -#define EXC_STD0 - /* * Branch to label using its 0xC000 address. This results in instruction * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned @@ -128,6 +123,7 @@ name: */ #define IVEC .L_IVEC_\name\() #define IHSRR .L_IHSRR_\name\() +#define IHSRR_IF_HVMODE.L_IHSRR_IF_HVMODE_\name\() #define IAREA .L_IAREA_\name\() #define IVIRT .L_IVIRT_\name\() #define IISIDE .L_IISIDE_\name\() @@ -159,7 +155,10 @@ do_define_int n .error "IVEC not defined" .endif .ifndef IHSRR - IHSRR=EXC_STD + IHSRR=0 + .endif + .ifndef IHSRR_IF_HVMODE + IHSRR_IF_HVMODE=0 .endif .ifndef IAREA IAREA=PACA_EXGEN @@ -257,7 +256,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r9,IAREA+EX_R9(r13) ld r10,IAREA+EX_R10(r13) /* HSRR variants have the 0x2 bit added to their trap number */ - .if IHSRR == EXC_HV_OR_STD + .if IHSRR_IF_HVMODE BEGIN_FTR_SECTION ori r12,r12,(IVEC + 0x2) FTR_SECTION_ELSE @@ -278,7 +277,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r10,IAREA+EX_R10(r13) ld r11,IAREA+EX_R11(r13) ld r12,IAREA+EX_R12(r13) - .if IHSRR == EXC_HV_OR_STD + .if IHSRR_IF_HVMODE BEGIN_FTR_SECTION b kvmppc_skip_Hinterrupt FTR_SECTION_ELSE @@ -403,7 +402,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) stw r10,IAREA+EX_DSISR(r13) .endif - .if IHSRR == EXC_HV_OR_STD + .if IHSRR_IF_HVMODE BEGIN_FTR_SECTION mfspr r11,SPRN_HSRR0 /* save HSRR0 */ mfspr r12,SPRN_HSRR1 /* and 
HSRR1 */ @@ -485,7 +484,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_virt) .abort "Bad maskable vector" .endif - .if IHSRR == EXC_HV_OR_STD + .if IHSRR_IF_HVMODE BEGIN_FTR_SECTION bne masked_Hinterrupt FTR_SECTION_ELSE @@ -618,12 +617,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) * Restore all registers including H/SRR0/1 saved in a stack frame of a * standard exception. */ -.macro EXCEPTION_RESTORE_REGS hsrr +.macro EXCEPTION_RESTORE_REGS hsrr=0 /* Move original SRR0 and SRR1 into the respective regs */ ld r9,_MSR(r1) - .if \hsrr == EXC_HV_OR_STD - .error "EXC_HV_OR_STD Not implemented for EXCEPTION_RESTORE_REGS" - .endif .if \hsrr mtspr SPRN_HSRR1,r9 .else @@ -898,7 +894,7 @@ EXC_COMMON_BEGIN(system_reset_common) ld r10,SOFTE(r1) stb r10,PACAIRQSOFTMASK(r13) - EXCEPTION_RESTORE_REGS EXC_STD + EXCEPTION_RESTORE_REGS RFI_TO_USER_OR_KERNEL GEN_KVM system_reset @@ -952,7 +948,7 @@ TRAMP_REAL_BEGIN(machine_check_fwnmi) lhz r12,PACA_IN_MCE(r13); \ subir12,r12,1; \ sth r12,PACA_IN_MCE(r13); \ - EXCEPTION_RESTORE_REGS EXC_STD + EXCEPTION_RESTORE_REGS EXC_COMMON_BEGIN(machine_check_early_common) /* @@ -1321,7 +1317,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) INT_DEFINE_BEGIN(hardware_interrupt) IVEC=0x500 - IHSRR=EXC_HV_OR_STD + IHSRR_IF_HVMODE=1 IMASK=IRQS_DISABLED IKVM_REAL=1 IKVM_VIRT=1 @@ -1490,7 +1486,7 @@ EXC_COMMON_BEGIN(decrementer_common) INT_DEFINE_BEGIN(hdecrementer) IVEC=0x980 - IHSRR=EXC_HV + IHSRR=1 ISTACK=0 IRECONCILE=0 IKVM_REAL=1 @@ -1732,7 +1728,7 @@ EXC_COMMON_BEGIN(single_step_common) INT_DEFINE_BEGIN(h_data_storage) IVEC=0xe00 - IHSRR=EXC_HV + IHSRR=1 IDAR=1 IDSISR=1 IKVM_SKIP=1 @@ -1764,7 +1760,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX) INT_DEFINE_BEGIN(h_instr_storage) IVEC=0xe20 - IHSRR=EXC_HV + IHSRR=1 IKVM_REAL=1 IKVM_VIRT=1 INT_DEFINE_END(h_instr_storage) @@ -1787,7 +1783,7 @@ EXC_COMMON_BEGIN(h_instr_storage_common) INT_DEFINE_BEGIN(emulation_assist)
[PATCH v2 17/35] powerpc/64s/exception: re-inline some handlers
From: Nicholas Piggin The reduction in interrupt entry size allows some handlers to be re-inlined. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7a234e6d7bf5..9494403b9586 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1186,7 +1186,7 @@ INT_DEFINE_BEGIN(data_access) INT_DEFINE_END(data_access) EXC_REAL_BEGIN(data_access, 0x300, 0x80) - GEN_INT_ENTRY data_access, virt=0, ool=1 + GEN_INT_ENTRY data_access, virt=0 EXC_REAL_END(data_access, 0x300, 0x80) EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) GEN_INT_ENTRY data_access, virt=1 @@ -1216,7 +1216,7 @@ INT_DEFINE_BEGIN(data_access_slb) INT_DEFINE_END(data_access_slb) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) - GEN_INT_ENTRY data_access_slb, virt=0, ool=1 + GEN_INT_ENTRY data_access_slb, virt=0 EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) GEN_INT_ENTRY data_access_slb, virt=1 @@ -1472,7 +1472,7 @@ INT_DEFINE_BEGIN(decrementer) INT_DEFINE_END(decrementer) EXC_REAL_BEGIN(decrementer, 0x900, 0x80) - GEN_INT_ENTRY decrementer, virt=0, ool=1 + GEN_INT_ENTRY decrementer, virt=0 EXC_REAL_END(decrementer, 0x900, 0x80) EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) GEN_INT_ENTRY decrementer, virt=1 -- 2.23.0
[PATCH v2 16/35] powerpc/64s/exception: hdecrementer avoid touching the stack
From: Nicholas Piggin The hdec interrupt handler is reported to sometimes fire in Linux if KVM leaves it pending after a guest exits. This is harmless, so there is a no-op handler for it. The interrupt handler currently uses the regular kernel stack. Change this to avoid touching the stack entirely. This should be the last place where the regular Linux stack can be accessed with asynchronous interrupts (including PMI) soft-masked. It might be possible to take advantage of this invariant, e.g., to context switch the kernel stack SLB entry without clearing MSR[EE]. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/time.h | 1 - arch/powerpc/kernel/exceptions-64s.S | 25 - arch/powerpc/kernel/time.c | 9 - 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 08dbe3e6831c..e0107495c4de 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -24,7 +24,6 @@ extern struct clock_event_device decrementer_clockevent; extern void generic_calibrate_decr(void); -extern void hdec_interrupt(struct pt_regs *regs); /* Some sane defaults: 125 MHz timebase, 1GHz processor */ extern unsigned long ppc_proc_freq; diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9fa71d51ecf4..7a234e6d7bf5 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1491,6 +1491,8 @@ EXC_COMMON_BEGIN(decrementer_common) INT_DEFINE_BEGIN(hdecrementer) IVEC=0x980 IHSRR=EXC_HV + ISTACK=0 + IRECONCILE=0 IKVM_REAL=1 IKVM_VIRT=1 INT_DEFINE_END(hdecrementer) @@ -1502,11 +1504,24 @@ EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80) GEN_INT_ENTRY hdecrementer, virt=1 EXC_VIRT_END(hdecrementer, 0x4980, 0x80) EXC_COMMON_BEGIN(hdecrementer_common) - GEN_COMMON hdecrementer - bl save_nvgprs - addir3,r1,STACK_FRAME_OVERHEAD - bl hdec_interrupt - b ret_from_except + __GEN_COMMON_ENTRY hdecrementer + /* +* Hypervisor decrementer
interrupts not caught by the KVM test +* shouldn't occur but are sometimes left pending on exit from a KVM +* guest. We don't need to do anything to clear them, as they are +* edge-triggered. +* +* Be careful to avoid touching the kernel stack. +*/ + ld r10,PACA_EXGEN+EX_CTR(r13) + mtctr r10 + mtcrf 0x80,r9 + ld r9,PACA_EXGEN+EX_R9(r13) + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + ld r12,PACA_EXGEN+EX_R12(r13) + ld r13,PACA_EXGEN+EX_R13(r13) + HRFI_TO_KERNEL GEN_KVM hdecrementer diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308cd5..bebc8c440289 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -663,15 +663,6 @@ void timer_broadcast_interrupt(void) } #endif -/* - * Hypervisor decrementer interrupts shouldn't occur but are sometimes - * left pending on exit from a KVM guest. We don't need to do anything - * to clear them, as they are edge-triggered. - */ -void hdec_interrupt(struct pt_regs *regs) -{ -} - #ifdef CONFIG_SUSPEND static void generic_suspend_disable_irqs(void) { -- 2.23.0
[PATCH v2 15/35] powerpc/64s/exception: trim unused arguments from KVMTEST macro
From: Nicholas Piggin Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index abf26db36427..9fa71d51ecf4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -224,7 +224,7 @@ do_define_int n #define kvmppc_interrupt kvmppc_interrupt_pr #endif -.macro KVMTEST name, hsrr, n +.macro KVMTEST name lbz r10,HSTATE_IN_GUEST(r13) cmpwi r10,0 bne \name\()_kvm @@ -293,7 +293,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) .endm #else -.macro KVMTEST name, hsrr, n +.macro KVMTEST name .endm .macro GEN_KVM name .endm @@ -437,7 +437,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) DEFINE_FIXED_SYMBOL(\name\()_common_real) \name\()_common_real: .if IKVM_REAL - KVMTEST \name IHSRR IVEC + KVMTEST \name .endif ld r10,PACAKMSR(r13) /* get MSR value for kernel */ @@ -460,7 +460,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real) DEFINE_FIXED_SYMBOL(\name\()_common_virt) \name\()_common_virt: .if IKVM_VIRT - KVMTEST \name IHSRR IVEC + KVMTEST \name 1: .endif .endif /* IVIRT */ @@ -1595,7 +1595,7 @@ INT_DEFINE_END(system_call) GET_PACA(r13) std r10,PACA_EXGEN+EX_R10(r13) INTERRUPT_TO_KERNEL - KVMTEST system_call EXC_STD 0xc00 /* uses r10, branch to system_call_kvm */ + KVMTEST system_call /* uses r10, branch to system_call_kvm */ mfctr r9 #else mr r9,r13 -- 2.23.0
[PATCH v2 14/35] powerpc/64s/exception: remove the SPR saving patch code macros
From: Nicholas Piggin These are used infrequently enough they don't provide much help, so inline them. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 82 ++-- 1 file changed, 28 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 716a95ba814f..abf26db36427 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -110,46 +110,6 @@ name: #define EXC_HV 1 #define EXC_STD0 -/* - * PPR save/restore macros used in exceptions-64s.S - * Used for P7 or later processors - */ -#define SAVE_PPR(area, ra) \ -BEGIN_FTR_SECTION_NESTED(940) \ - ld ra,area+EX_PPR(r13);/* Read PPR from paca */\ - std ra,_PPR(r1);\ -END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,940) - -#define RESTORE_PPR_PACA(area, ra) \ -BEGIN_FTR_SECTION_NESTED(941) \ - ld ra,area+EX_PPR(r13);\ - mtspr SPRN_PPR,ra;\ -END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,941) - -/* - * Get an SPR into a register if the CPU has the given feature - */ -#define OPT_GET_SPR(ra, spr, ftr) \ -BEGIN_FTR_SECTION_NESTED(943) \ - mfspr ra,spr; \ -END_FTR_SECTION_NESTED(ftr,ftr,943) - -/* - * Set an SPR from a register if the CPU has the given feature - */ -#define OPT_SET_SPR(ra, spr, ftr) \ -BEGIN_FTR_SECTION_NESTED(943) \ - mtspr spr,ra; \ -END_FTR_SECTION_NESTED(ftr,ftr,943) - -/* - * Save a register to the PACA if the CPU has the given feature - */ -#define OPT_SAVE_REG_TO_PACA(offset, ra, ftr) \ -BEGIN_FTR_SECTION_NESTED(943) \ - std ra,offset(r13); \ -END_FTR_SECTION_NESTED(ftr,ftr,943) - /* * Branch to label using its 0xC000 address. 
This results in instruction * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned @@ -278,18 +238,18 @@ do_define_int n cmpwi r10,KVM_GUEST_MODE_SKIP beq 89f .else -BEGIN_FTR_SECTION_NESTED(947) +BEGIN_FTR_SECTION ld r10,IAREA+EX_CFAR(r13) std r10,HSTATE_CFAR(r13) -END_FTR_SECTION_NESTED(CPU_FTR_CFAR,CPU_FTR_CFAR,947) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) .endif ld r10,PACA_EXGEN+EX_CTR(r13) mtctr r10 -BEGIN_FTR_SECTION_NESTED(948) +BEGIN_FTR_SECTION ld r10,IAREA+EX_PPR(r13) std r10,HSTATE_PPR(r13) -END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r11,IAREA+EX_R11(r13) ld r12,IAREA+EX_R12(r13) std r12,HSTATE_SCRATCH0(r13) @@ -386,10 +346,14 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) SET_SCRATCH0(r13) /* save r13 */ GET_PACA(r13) std r9,IAREA+EX_R9(r13) /* save r9 */ - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR) +BEGIN_FTR_SECTION + mfspr r9,SPRN_PPR +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) HMT_MEDIUM std r10,IAREA+EX_R10(r13) /* save r10 - r12 */ - OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) +BEGIN_FTR_SECTION + mfspr r10,SPRN_CFAR +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) .if \ool .if !\virt b tramp_real_\name @@ -402,8 +366,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endif .endif - OPT_SAVE_REG_TO_PACA(IAREA+EX_PPR, r9, CPU_FTR_HAS_PPR) - OPT_SAVE_REG_TO_PACA(IAREA+EX_CFAR, r10, CPU_FTR_CFAR) +BEGIN_FTR_SECTION + std r9,IAREA+EX_PPR(r13) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) +BEGIN_FTR_SECTION + std r10,IAREA+EX_CFAR(r13) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) INTERRUPT_TO_KERNEL mfctr r10 std r10,IAREA+EX_CTR(r13) @@ -558,7 +526,10 @@ DEFINE_FIXED_SYMBOL(\name\()_common_virt) .endif beq 101f/* if from kernel mode */ ACCOUNT_CPU_USER_ENTRY(r13, r9, r10) - SAVE_PPR(IAREA, r9) +BEGIN_FTR_SECTION + ld r9,IAREA+EX_PPR(r13)/* Read PPR from paca */ + std r9,_PPR(r1) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 101: .else .if IKUAP @@ -598,10 +569,10 @@ 
DEFINE_FIXED_SYMBOL(\name\()_common_virt) std r10,_DSISR(r1)
[PATCH v2 13/35] powerpc/64s/exception: remove confusing IEARLY option
From: Nicholas Piggin Replace IEARLY=1 and IEARLY=2 with IBRANCH_TO_COMMON, which controls if the entry code branches to a common handler; and IREALMODE_COMMON, which controls whether the common handler should remain in real mode. These special cases no longer avoid loading the SRR registers; there is no point as most of them load the registers immediately anyway. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 48 ++-- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7db76e7be0aa..716a95ba814f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -174,7 +174,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define IDAR .L_IDAR_\name\() #define IDSISR .L_IDSISR_\name\() #define ISET_RI.L_ISET_RI_\name\() -#define IEARLY .L_IEARLY_\name\() +#define IBRANCH_TO_COMMON .L_IBRANCH_TO_COMMON_\name\() +#define IREALMODE_COMMON .L_IREALMODE_COMMON_\name\() #define IMASK .L_IMASK_\name\() #define IKVM_SKIP .L_IKVM_SKIP_\name\() #define IKVM_REAL .L_IKVM_REAL_\name\() @@ -218,8 +219,15 @@ do_define_int n .ifndef ISET_RI ISET_RI=1 .endif - .ifndef IEARLY - IEARLY=0 + .ifndef IBRANCH_TO_COMMON + IBRANCH_TO_COMMON=1 + .endif + .ifndef IREALMODE_COMMON + IREALMODE_COMMON=0 + .else + .if !
IBRANCH_TO_COMMON + .error "IREALMODE_COMMON=1 but IBRANCH_TO_COMMON=0" + .endif .endif .ifndef IMASK IMASK=0 @@ -353,6 +361,11 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) */ .macro GEN_BRANCH_TO_COMMON name, virt + .if IREALMODE_COMMON + LOAD_HANDLER(r10, \name\()_common) + mtctr r10 + bctr + .else .if \virt #ifndef CONFIG_RELOCATABLE b \name\()_common_virt @@ -366,6 +379,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) mtctr r10 bctr .endif + .endif .endm .macro GEN_INT_ENTRY name, virt, ool=0 @@ -421,11 +435,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) stw r10,IAREA+EX_DSISR(r13) .endif - .if IEARLY == 2 - /* nothing more */ - .elseif IEARLY - BRANCH_TO_C000(r11, \name\()_common) - .else .if IHSRR == EXC_HV_OR_STD BEGIN_FTR_SECTION mfspr r11,SPRN_HSRR0 /* save HSRR0 */ @@ -441,6 +450,8 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) mfspr r11,SPRN_SRR0 /* save SRR0 */ mfspr r12,SPRN_SRR1 /* and SRR1 */ .endif + + .if IBRANCH_TO_COMMON GEN_BRANCH_TO_COMMON \name \virt .endif @@ -926,6 +937,7 @@ INT_DEFINE_BEGIN(machine_check_early) IVEC=0x200 IAREA=PACA_EXMC IVIRT=0 /* no virt entry point */ + IREALMODE_COMMON=1 /* * MSR_RI is not enabled, because PACA_EXMC is being used, so a * nested machine check corrupts it. machine_check_common enables @@ -933,7 +945,6 @@ INT_DEFINE_BEGIN(machine_check_early) */ ISET_RI=0 ISTACK=0 - IEARLY=1 IDAR=1 IDSISR=1 IRECONCILE=0 @@ -973,9 +984,6 @@ TRAMP_REAL_BEGIN(machine_check_fwnmi) EXCEPTION_RESTORE_REGS EXC_STD EXC_COMMON_BEGIN(machine_check_early_common) - mfspr r11,SPRN_SRR0 - mfspr r12,SPRN_SRR1 - /* * Switch to mc_emergency stack and handle re-entrancy (we limit * the nested MCE upto level 4 to avoid stack overflow). 
@@ -1822,7 +1830,7 @@ EXC_COMMON_BEGIN(emulation_assist_common) INT_DEFINE_BEGIN(hmi_exception_early) IVEC=0xe60 IHSRR=EXC_HV - IEARLY=1 + IREALMODE_COMMON=1 ISTACK=0 IRECONCILE=0 IKUAP=0 /* We don't touch AMR here, we never go to virtual mode */ @@ -1842,8 +1850,6 @@ EXC_REAL_END(hmi_exception, 0xe60, 0x20) EXC_VIRT_NONE(0x4e60, 0x20) EXC_COMMON_BEGIN(hmi_exception_early_common) - mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* Save HSRR1 */ mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack for realmode */ subir1,r1,INT_FRAME_SIZE/* alloc stack frame*/ @@ -2169,29 +2175,23 @@ EXC_VIRT_NONE(0x5400, 0x100) INT_DEFINE_BEGIN(denorm_exception) IVEC=0x1500 IHSRR=EXC_HV - IEARLY=2 + IBRANCH_TO_COMMON=0 IKVM_REAL=1 INT_DEFINE_END(denorm_exception) EXC_REAL_BEGIN(denorm_exception, 0x1500, 0x100) GEN_INT_ENTRY denorm_exception, virt=0 #ifdef CONFIG_PPC_DENORMALISATION - mfspr r10,SPRN_HSRR1 - andis. r10,r10,(H
[PATCH v2 12/35] powerpc/64s/exception: move KVM test to common code
From: Nicholas Piggin This allows more code to be moved out of unrelocated regions. The system call KVMTEST is changed to be open-coded and remain in the tramp area to avoid having to move it to entry_64.S. The custom nature of the system call entry code means the hcall case can be made more streamlined than regular interrupt handlers. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S| 239 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 -- arch/powerpc/kvm/book3s_segment.S | 7 - 3 files changed, 119 insertions(+), 138 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index fbc3fbb293f7..7db76e7be0aa 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -44,7 +44,6 @@ * EXC_VIRT_BEGIN/END - virt (AIL), unrelocated exception vectors * TRAMP_REAL_BEGIN- real, unrelocated helpers (virt may call these) * TRAMP_VIRT_BEGIN- virt, unreloc helpers (in practice, real can use) - * TRAMP_KVM_BEGIN - KVM handlers, these are put into real, unrelocated * EXC_COMMON - After switching to virtual, relocated mode. 
*/ @@ -74,13 +73,6 @@ name: #define TRAMP_VIRT_BEGIN(name) \ FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name) -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER -#define TRAMP_KVM_BEGIN(name) \ - TRAMP_VIRT_BEGIN(name) -#else -#define TRAMP_KVM_BEGIN(name) -#endif - #define EXC_REAL_NONE(start, size) \ FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \ FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size) @@ -271,6 +263,9 @@ do_define_int n .endm .macro GEN_KVM name + .balign IFETCH_ALIGN_BYTES +\name\()_kvm: + .if IKVM_SKIP cmpwi r10,KVM_GUEST_MODE_SKIP beq 89f @@ -281,13 +276,18 @@ BEGIN_FTR_SECTION_NESTED(947) END_FTR_SECTION_NESTED(CPU_FTR_CFAR,CPU_FTR_CFAR,947) .endif + ld r10,PACA_EXGEN+EX_CTR(r13) + mtctr r10 BEGIN_FTR_SECTION_NESTED(948) ld r10,IAREA+EX_PPR(r13) std r10,HSTATE_PPR(r13) END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) - ld r10,IAREA+EX_R10(r13) + ld r11,IAREA+EX_R11(r13) + ld r12,IAREA+EX_R12(r13) std r12,HSTATE_SCRATCH0(r13) sldir12,r9,32 + ld r9,IAREA+EX_R9(r13) + ld r10,IAREA+EX_R10(r13) /* HSRR variants have the 0x2 bit added to their trap number */ .if IHSRR == EXC_HV_OR_STD BEGIN_FTR_SECTION @@ -300,29 +300,16 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .else ori r12,r12,(IVEC) .endif - -#ifdef CONFIG_RELOCATABLE - /* -* KVM requires __LOAD_FAR_HANDLER beause kvmppc_interrupt lives -* outside the head section. CONFIG_RELOCATABLE KVM expects CTR -* to be saved in HSTATE_SCRATCH1. 
-*/ - ld r9,IAREA+EX_CTR(r13) - std r9,HSTATE_SCRATCH1(r13) - __LOAD_FAR_HANDLER(r9, kvmppc_interrupt) - mtctr r9 - ld r9,IAREA+EX_R9(r13) - bctr -#else - ld r9,IAREA+EX_R9(r13) b kvmppc_interrupt -#endif - .if IKVM_SKIP 89:mtocrf 0x80,r9 + ld r10,PACA_EXGEN+EX_CTR(r13) + mtctr r10 ld r9,IAREA+EX_R9(r13) ld r10,IAREA+EX_R10(r13) + ld r11,IAREA+EX_R11(r13) + ld r12,IAREA+EX_R12(r13) .if IHSRR == EXC_HV_OR_STD BEGIN_FTR_SECTION b kvmppc_skip_Hinterrupt @@ -407,11 +394,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) mfctr r10 std r10,IAREA+EX_CTR(r13) mfcrr9 - - .if (!\virt && IKVM_REAL) || (\virt && IKVM_VIRT) - KVMTEST \name IHSRR IVEC - .endif - std r11,IAREA+EX_R11(r13) std r12,IAREA+EX_R12(r13) @@ -475,6 +457,10 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .macro __GEN_COMMON_ENTRY name DEFINE_FIXED_SYMBOL(\name\()_common_real) \name\()_common_real: + .if IKVM_REAL + KVMTEST \name IHSRR IVEC + .endif + ld r10,PACAKMSR(r13) /* get MSR value for kernel */ /* MSR[RI] is clear iff using SRR regs */ .if IHSRR == EXC_HV_OR_STD @@ -487,9 +473,17 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real) mtmsrd r10 .if IVIRT + .if IKVM_VIRT + b 1f /* skip the virt test coming from real */ + .endif + .balign IFETCH_ALIGN_BYTES DEFINE_FIXED_SYMBOL(\name\()_common_virt) \name\()_common_virt: + .if IKVM_VIRT + KVMTEST \name IHSRR IVEC +1: + .endif .endif /* IVIRT */ .endm @@ -848,8 +842,6 @@ END_FTR_SECTIO
[PATCH v2 11/35] powerpc/64s/exception: move soft-mask test to common code
From: Nicholas Piggin As well as moving code out of the unrelocated vectors, this allows the masked handlers to be moved to common code, and allows the soft_nmi handler to be generated more like a regular handler. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 106 +-- 1 file changed, 49 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5803ce3b9404..fbc3fbb293f7 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -411,36 +411,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .if (!\virt && IKVM_REAL) || (\virt && IKVM_VIRT) KVMTEST \name IHSRR IVEC .endif - .if IMASK - lbz r10,PACAIRQSOFTMASK(r13) - andi. r10,r10,IMASK - /* Associate vector numbers with bits in paca->irq_happened */ - .if IVEC == 0x500 || IVEC == 0xea0 - li r10,PACA_IRQ_EE - .elseif IVEC == 0x900 - li r10,PACA_IRQ_DEC - .elseif IVEC == 0xa00 || IVEC == 0xe80 - li r10,PACA_IRQ_DBELL - .elseif IVEC == 0xe60 - li r10,PACA_IRQ_HMI - .elseif IVEC == 0xf00 - li r10,PACA_IRQ_PMI - .else - .abort "Bad maskable vector" - .endif - - .if IHSRR == EXC_HV_OR_STD - BEGIN_FTR_SECTION - bne masked_Hinterrupt - FTR_SECTION_ELSE - bne masked_interrupt - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif IHSRR - bne masked_Hinterrupt - .else - bne masked_interrupt - .endif - .endif std r11,IAREA+EX_R11(r13) std r12,IAREA+EX_R12(r13) @@ -524,6 +494,37 @@ DEFINE_FIXED_SYMBOL(\name\()_common_virt) .endm .macro __GEN_COMMON_BODY name + .if IMASK + lbz r10,PACAIRQSOFTMASK(r13) + andi. 
r10,r10,IMASK + /* Associate vector numbers with bits in paca->irq_happened */ + .if IVEC == 0x500 || IVEC == 0xea0 + li r10,PACA_IRQ_EE + .elseif IVEC == 0x900 + li r10,PACA_IRQ_DEC + .elseif IVEC == 0xa00 || IVEC == 0xe80 + li r10,PACA_IRQ_DBELL + .elseif IVEC == 0xe60 + li r10,PACA_IRQ_HMI + .elseif IVEC == 0xf00 + li r10,PACA_IRQ_PMI + .else + .abort "Bad maskable vector" + .endif + + .if IHSRR == EXC_HV_OR_STD + BEGIN_FTR_SECTION + bne masked_Hinterrupt + FTR_SECTION_ELSE + bne masked_interrupt + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif IHSRR + bne masked_Hinterrupt + .else + bne masked_interrupt + .endif + .endif + .if ISTACK andi. r10,r12,MSR_PR /* See if coming from user */ mr r10,r1 /* Save r1 */ @@ -2343,18 +2344,10 @@ EXC_VIRT_NONE(0x5800, 0x100) #ifdef CONFIG_PPC_WATCHDOG -#define MASKED_DEC_HANDLER_LABEL 3f - -#define MASKED_DEC_HANDLER(_H) \ -3: /* soft-nmi */ \ - std r12,PACA_EXGEN+EX_R12(r13); \ - GET_SCRATCH0(r10); \ - std r10,PACA_EXGEN+EX_R13(r13); \ - mfspr r11,SPRN_SRR0; /* save SRR0 */ \ - mfspr r12,SPRN_SRR1; /* and SRR1 */ \ - LOAD_HANDLER(r10, soft_nmi_common); \ - mtctr r10;\ - bctr +INT_DEFINE_BEGIN(soft_nmi) + IVEC=0x900 + ISTACK=0 +INT_DEFINE_END(soft_nmi) /* * Branch to soft_nmi_interrupt using the emergency stack. The emergency @@ -2366,19 +2359,16 @@ EXC_VIRT_NONE(0x5800, 0x100) * and run it entirely with interrupts hard disabled. */ EXC_COMMON_BEGIN(soft_nmi_common) + mfspr r11,SPRN_SRR0 mr r10,r1 ld r1,PACAEMERGSP(r13) subir1,r1,INT_FRAME_SIZE - __ISTACK(decrementer)=0 - __GEN_COMMON_BODY decrementer + __GEN_COMMON_BODY soft_nmi bl save_nvgprs addir3,r1,STACK_FRAME_OVERHEAD bl soft_nmi_interrupt b ret_from_except -#else /* CONFIG_PPC_WATCHDOG */ -#define MASKED_DEC_HANDLER_LABEL 2f /* normal return */ -#define MASKED_DEC_HANDLER(_H) #endif /* CONFIG_PPC_WATCHDOG */ /* @@ -2397,7 +2387,6 @@ masked_Hinterrupt: .else mas
[PATCH v2 10/35] powerpc/64s/exception: move real->virt switch into the common handler
From: Nicholas Piggin The real mode interrupt entry points currently use rfid to branch to the common handler in virtual mode. This is a significant amount of code, and forces other code (notably the KVM test) to live in the real mode handler. In the interest of minimising the amount of code that runs unrelocated move the switch to virt mode into the common code, and do it with mtmsrd, which avoids clobbering SRRs (although the post-KVMTEST performance of real-mode interrupt handlers is not a big concern these days). This requires CTR to always be saved (real-mode needs to reach 0xc...) but that's not a huge impact these days. It could be optimized away in future. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/exception-64s.h | 4 - arch/powerpc/kernel/exceptions-64s.S | 251 ++- 2 files changed, 109 insertions(+), 146 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 33f4f72eb035..47bd4ea0837d 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -33,11 +33,7 @@ #include /* PACA save area size in u64 units (exgen, exmc, etc) */ -#if defined(CONFIG_RELOCATABLE) #define EX_SIZE10 -#else -#define EX_SIZE9 -#endif /* * maximum recursive depth of MCE exceptions diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b8588618cdc3..5803ce3b9404 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -32,16 +32,10 @@ #define EX_CCR 52 #define EX_CFAR56 #define EX_PPR 64 -#if defined(CONFIG_RELOCATABLE) #define EX_CTR 72 .if EX_SIZE != 10 .error "EX_SIZE is wrong" .endif -#else -.if EX_SIZE != 9 - .error "EX_SIZE is wrong" -.endif -#endif /* * Following are fixed section helper macros. 
@@ -124,22 +118,6 @@ name: #define EXC_HV 1 #define EXC_STD0 -#if defined(CONFIG_RELOCATABLE) -/* - * If we support interrupts with relocation on AND we're a relocatable kernel, - * we need to use CTR to get to the 2nd level handler. So, save/restore it - * when required. - */ -#define SAVE_CTR(reg, area)mfctr reg ; std reg,area+EX_CTR(r13) -#define GET_CTR(reg, area) ld reg,area+EX_CTR(r13) -#define RESTORE_CTR(reg, area) ld reg,area+EX_CTR(r13) ; mtctr reg -#else -/* ...else CTR is unused and in register. */ -#define SAVE_CTR(reg, area) -#define GET_CTR(reg, area) mfctr reg -#define RESTORE_CTR(reg, area) -#endif - /* * PPR save/restore macros used in exceptions-64s.S * Used for P7 or later processors @@ -199,6 +177,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define IVEC .L_IVEC_\name\() #define IHSRR .L_IHSRR_\name\() #define IAREA .L_IAREA_\name\() +#define IVIRT .L_IVIRT_\name\() #define IISIDE .L_IISIDE_\name\() #define IDAR .L_IDAR_\name\() #define IDSISR .L_IDSISR_\name\() @@ -232,6 +211,9 @@ do_define_int n .ifndef IAREA IAREA=PACA_EXGEN .endif + .ifndef IVIRT + IVIRT=1 + .endif .ifndef IISIDE IISIDE=0 .endif @@ -325,7 +307,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) * outside the head section. CONFIG_RELOCATABLE KVM expects CTR * to be saved in HSTATE_SCRATCH1. */ - mfctr r9 + ld r9,IAREA+EX_CTR(r13) std r9,HSTATE_SCRATCH1(r13) __LOAD_FAR_HANDLER(r9, kvmppc_interrupt) mtctr r9 @@ -362,101 +344,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endm #endif -.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri - ld r10,PACAKMSR(r13) /* get MSR value for kernel */ - .if ! 
\set_ri - xorir10,r10,MSR_RI /* Clear MSR_RI */ - .endif - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - mtspr SPRN_HSRR1,r10 - FTR_SECTION_ELSE - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mtspr SPRN_SRR1,r10 - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - mtspr SPRN_HSRR1,r10 - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mtspr SPRN_SRR1,r10 - .endif - LOAD_HANDLER(r10, \label\()) - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mtspr SPRN_HSRR0,r10 - HRFI_TO_KERNEL - FTR_SECTION_ELSE - mtspr SPRN_SRR0,
[PATCH v2 09/35] powerpc/64s/exception: Add ISIDE option
From: Nicholas Piggin Rather than using DAR=2 to select the i-side registers, add an explicit option. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 23 --- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index bef0c2eee7dc..b8588618cdc3 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -199,6 +199,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define IVEC .L_IVEC_\name\() #define IHSRR .L_IHSRR_\name\() #define IAREA .L_IAREA_\name\() +#define IISIDE .L_IISIDE_\name\() #define IDAR .L_IDAR_\name\() #define IDSISR .L_IDSISR_\name\() #define ISET_RI.L_ISET_RI_\name\() @@ -231,6 +232,9 @@ do_define_int n .ifndef IAREA IAREA=PACA_EXGEN .endif + .ifndef IISIDE + IISIDE=0 + .endif .ifndef IDAR IDAR=0 .endif @@ -542,7 +546,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) */ GET_SCRATCH0(r10) std r10,IAREA+EX_R13(r13) - .if IDAR == 1 + .if IDAR && !IISIDE .if IHSRR mfspr r10,SPRN_HDAR .else @@ -550,7 +554,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endif std r10,IAREA+EX_DAR(r13) .endif - .if IDSISR == 1 + .if IDSISR && !IISIDE .if IHSRR mfspr r10,SPRN_HDSISR .else @@ -625,16 +629,18 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) std r9,GPR11(r1) std r10,GPR12(r1) std r11,GPR13(r1) + .if IDAR - .if IDAR == 2 + .if IISIDE ld r10,_NIP(r1) .else ld r10,IAREA+EX_DAR(r13) .endif std r10,_DAR(r1) .endif + .if IDSISR - .if IDSISR == 2 + .if IISIDE ld r10,_MSR(r1) lis r11,DSISR_SRR1_MATCH_64S@h and r10,r10,r11 @@ -643,6 +649,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endif std r10,_DSISR(r1) .endif + BEGIN_FTR_SECTION_NESTED(66) ld r10,IAREA+EX_CFAR(r13) std r10,ORIG_GPR3(r1) @@ -1311,8 +1318,9 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) INT_DEFINE_BEGIN(instruction_access) IVEC=0x400 - IDAR=2 - IDSISR=2 + IISIDE=1 + IDAR=1 + IDSISR=1 
IKVM_REAL=1 INT_DEFINE_END(instruction_access) @@ -1341,7 +1349,8 @@ INT_DEFINE_BEGIN(instruction_access_slb) IVEC=0x480 IAREA=PACA_EXSLB IRECONCILE=0 - IDAR=2 + IISIDE=1 + IDAR=1 IKVM_REAL=1 INT_DEFINE_END(instruction_access_slb) -- 2.23.0
[PATCH v2 08/35] powerpc/64s/exception: Remove old INT_KVM_HANDLER
From: Nicholas Piggin Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 55 +--- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index f318869607db..bef0c2eee7dc 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -266,15 +266,6 @@ do_define_int n .endif .endm -.macro INT_KVM_HANDLER name, vec, hsrr, area, skip - TRAMP_KVM_BEGIN(\name\()_kvm) - KVM_HANDLER \vec, \hsrr, \area, \skip -.endm - -.macro GEN_KVM name - KVM_HANDLER IVEC, IHSRR, IAREA, IKVM_SKIP -.endm - #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* @@ -293,35 +284,35 @@ do_define_int n bne \name\()_kvm .endm -.macro KVM_HANDLER vec, hsrr, area, skip - .if \skip +.macro GEN_KVM name + .if IKVM_SKIP cmpwi r10,KVM_GUEST_MODE_SKIP beq 89f .else BEGIN_FTR_SECTION_NESTED(947) - ld r10,\area+EX_CFAR(r13) + ld r10,IAREA+EX_CFAR(r13) std r10,HSTATE_CFAR(r13) END_FTR_SECTION_NESTED(CPU_FTR_CFAR,CPU_FTR_CFAR,947) .endif BEGIN_FTR_SECTION_NESTED(948) - ld r10,\area+EX_PPR(r13) + ld r10,IAREA+EX_PPR(r13) std r10,HSTATE_PPR(r13) END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) - ld r10,\area+EX_R10(r13) + ld r10,IAREA+EX_R10(r13) std r12,HSTATE_SCRATCH0(r13) sldir12,r9,32 /* HSRR variants have the 0x2 bit added to their trap number */ - .if \hsrr == EXC_HV_OR_STD + .if IHSRR == EXC_HV_OR_STD BEGIN_FTR_SECTION - ori r12,r12,(\vec + 0x2) + ori r12,r12,(IVEC + 0x2) FTR_SECTION_ELSE - ori r12,r12,(\vec) + ori r12,r12,(IVEC) ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - ori r12,r12,(\vec + 0x2) + .elseif IHSRR + ori r12,r12,(IVEC+ 0x2) .else - ori r12,r12,(\vec) + ori r12,r12,(IVEC) .endif #ifdef CONFIG_RELOCATABLE @@ -334,25 +325,25 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) std r9,HSTATE_SCRATCH1(r13) __LOAD_FAR_HANDLER(r9, kvmppc_interrupt) mtctr r9 - ld r9,\area+EX_R9(r13) 
+ ld r9,IAREA+EX_R9(r13) bctr #else - ld r9,\area+EX_R9(r13) + ld r9,IAREA+EX_R9(r13) b kvmppc_interrupt #endif - .if \skip + .if IKVM_SKIP 89:mtocrf 0x80,r9 - ld r9,\area+EX_R9(r13) - ld r10,\area+EX_R10(r13) - .if \hsrr == EXC_HV_OR_STD + ld r9,IAREA+EX_R9(r13) + ld r10,IAREA+EX_R10(r13) + .if IHSRR == EXC_HV_OR_STD BEGIN_FTR_SECTION b kvmppc_skip_Hinterrupt FTR_SECTION_ELSE b kvmppc_skip_interrupt ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr + .elseif IHSRR b kvmppc_skip_Hinterrupt .else b kvmppc_skip_interrupt @@ -363,7 +354,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) #else .macro KVMTEST name, hsrr, n .endm -.macro KVM_HANDLER name, vec, hsrr, area, skip +.macro GEN_KVM name .endm #endif @@ -1640,6 +1631,12 @@ EXC_VIRT_NONE(0x4b00, 0x100) * without saving, though xer is not a good idea to use, as hardware may * interpret some bits so it may be costly to change them. */ +INT_DEFINE_BEGIN(system_call) + IVEC=0xc00 + IKVM_REAL=1 + IKVM_VIRT=1 +INT_DEFINE_END(system_call) + .macro SYSTEM_CALL virt #ifdef CONFIG_KVM_BOOK3S_64_HANDLER /* @@ -1733,7 +1730,7 @@ TRAMP_KVM_BEGIN(system_call_kvm) SET_SCRATCH0(r10) std r9,PACA_EXGEN+EX_R9(r13) mfcrr9 - KVM_HANDLER 0xc00, EXC_STD, PACA_EXGEN, 0 + GEN_KVM system_call #endif -- 2.23.0
[PATCH v2 07/35] powerpc/64s/exception: Remove old INT_COMMON macro
From: Nicholas Piggin Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 51 +--- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ba2dcd91aaaf..f318869607db 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -591,8 +591,8 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) * If stack=0, then the stack is already set in r1, and r1 is saved in r10. * PPR save and CPU accounting is not done for the !stack case (XXX why not?) */ -.macro INT_COMMON vec, area, stack, kaup, reconcile, dar, dsisr - .if \stack +.macro GEN_COMMON name + .if ISTACK andi. r10,r12,MSR_PR /* See if coming from user */ mr r10,r1 /* Save r1 */ subir1,r1,INT_FRAME_SIZE/* alloc frame on kernel stack */ @@ -609,54 +609,54 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) std r0,GPR0(r1) /* save r0 in stackframe*/ std r10,GPR1(r1)/* save r1 in stackframe*/ - .if \stack - .if \kaup + .if ISTACK + .if IKUAP kuap_save_amr_and_lock r9, r10, cr1, cr0 .endif beq 101f/* if from kernel mode */ ACCOUNT_CPU_USER_ENTRY(r13, r9, r10) - SAVE_PPR(\area, r9) + SAVE_PPR(IAREA, r9) 101: .else - .if \kaup + .if IKUAP kuap_save_amr_and_lock r9, r10, cr1 .endif .endif /* Save original regs values from save area to stack frame. 
*/ - ld r9,\area+EX_R9(r13) /* move r9, r10 to stackframe */ - ld r10,\area+EX_R10(r13) + ld r9,IAREA+EX_R9(r13) /* move r9, r10 to stackframe */ + ld r10,IAREA+EX_R10(r13) std r9,GPR9(r1) std r10,GPR10(r1) - ld r9,\area+EX_R11(r13)/* move r11 - r13 to stackframe */ - ld r10,\area+EX_R12(r13) - ld r11,\area+EX_R13(r13) + ld r9,IAREA+EX_R11(r13)/* move r11 - r13 to stackframe */ + ld r10,IAREA+EX_R12(r13) + ld r11,IAREA+EX_R13(r13) std r9,GPR11(r1) std r10,GPR12(r1) std r11,GPR13(r1) - .if \dar - .if \dar == 2 + .if IDAR + .if IDAR == 2 ld r10,_NIP(r1) .else - ld r10,\area+EX_DAR(r13) + ld r10,IAREA+EX_DAR(r13) .endif std r10,_DAR(r1) .endif - .if \dsisr - .if \dsisr == 2 + .if IDSISR + .if IDSISR == 2 ld r10,_MSR(r1) lis r11,DSISR_SRR1_MATCH_64S@h and r10,r10,r11 .else - lwz r10,\area+EX_DSISR(r13) + lwz r10,IAREA+EX_DSISR(r13) .endif std r10,_DSISR(r1) .endif BEGIN_FTR_SECTION_NESTED(66) - ld r10,\area+EX_CFAR(r13) + ld r10,IAREA+EX_CFAR(r13) std r10,ORIG_GPR3(r1) END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) - GET_CTR(r10, \area) + GET_CTR(r10, IAREA) std r10,_CTR(r1) std r2,GPR2(r1) /* save r2 in stackframe*/ SAVE_4GPRS(3, r1) /* save r3 - r6 in stackframe */ @@ -668,26 +668,22 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) mfspr r11,SPRN_XER/* save XER in stackframe */ std r10,SOFTE(r1) std r11,_XER(r1) - li r9,(\vec)+1 + li r9,(IVEC)+1 std r9,_TRAP(r1)/* set trap number */ li r10,0 ld r11,exception_marker@toc(r2) std r10,RESULT(r1) /* clear regs->result */ std r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame */ - .if \stack + .if ISTACK ACCOUNT_STOLEN_TIME .endif - .if \reconcile + .if IRECONCILE RECONCILE_IRQ_STATE(r10, r11) .endif .endm -.macro GEN_COMMON name - INT_COMMON IVEC, IAREA, ISTACK, IKUAP, IRECONCILE, IDAR, IDSISR -.endm - /* * Restore all registers including H/SRR0/1 saved in a stack frame of a * standard exception. 
@@ -2400,7 +2396,8 @@ EXC_COMMON_BEGIN(soft_nmi_common) mr r10,r1 ld r1,PACAEMERGSP(r13) subir1,r1,INT_FRAME_SIZE - INT_COMMON 0x900, PACA_EXGEN, 0, 1, 1, 0, 0 + __ISTACK(decrementer)=0 + GEN_COMMON decrementer bl save_nvgprs addir3,r1,STACK_FRAME_OVERHEAD bl soft_nmi_interrupt -- 2.23.0
[PATCH v2 06/35] powerpc/64s/exception: Remove old INT_ENTRY macro
From: Nicholas Piggin Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 68 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b5decc9a0cbf..ba2dcd91aaaf 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -482,13 +482,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) * - Fall through and continue executing in real, unrelocated mode. * This is done if early=2. */ -.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0 +.macro GEN_INT_ENTRY name, virt, ool=0 SET_SCRATCH0(r13) /* save r13 */ GET_PACA(r13) - std r9,\area\()+EX_R9(r13) /* save r9 */ + std r9,IAREA+EX_R9(r13) /* save r9 */ OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR) HMT_MEDIUM - std r10,\area\()+EX_R10(r13)/* save r10 - r12 */ + std r10,IAREA+EX_R10(r13) /* save r10 - r12 */ OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) .if \ool .if !\virt @@ -502,47 +502,47 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endif .endif - OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR) - OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR) + OPT_SAVE_REG_TO_PACA(IAREA+EX_PPR, r9, CPU_FTR_HAS_PPR) + OPT_SAVE_REG_TO_PACA(IAREA+EX_CFAR, r10, CPU_FTR_CFAR) INTERRUPT_TO_KERNEL - SAVE_CTR(r10, \area\()) + SAVE_CTR(r10, IAREA) mfcrr9 - .if \kvm - KVMTEST \name \hsrr \vec + .if (!\virt && IKVM_REAL) || (\virt && IKVM_VIRT) + KVMTEST \name IHSRR IVEC .endif - .if \bitmask + .if IMASK lbz r10,PACAIRQSOFTMASK(r13) - andi. r10,r10,\bitmask + andi. 
r10,r10,IMASK /* Associate vector numbers with bits in paca->irq_happened */ - .if \vec == 0x500 || \vec == 0xea0 + .if IVEC == 0x500 || IVEC == 0xea0 li r10,PACA_IRQ_EE - .elseif \vec == 0x900 + .elseif IVEC == 0x900 li r10,PACA_IRQ_DEC - .elseif \vec == 0xa00 || \vec == 0xe80 + .elseif IVEC == 0xa00 || IVEC == 0xe80 li r10,PACA_IRQ_DBELL - .elseif \vec == 0xe60 + .elseif IVEC == 0xe60 li r10,PACA_IRQ_HMI - .elseif \vec == 0xf00 + .elseif IVEC == 0xf00 li r10,PACA_IRQ_PMI .else .abort "Bad maskable vector" .endif - .if \hsrr == EXC_HV_OR_STD + .if IHSRR == EXC_HV_OR_STD BEGIN_FTR_SECTION bne masked_Hinterrupt FTR_SECTION_ELSE bne masked_interrupt ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr + .elseif IHSRR bne masked_Hinterrupt .else bne masked_interrupt .endif .endif - std r11,\area\()+EX_R11(r13) - std r12,\area\()+EX_R12(r13) + std r11,IAREA+EX_R11(r13) + std r12,IAREA+EX_R12(r13) /* * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI], @@ -550,47 +550,39 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) * not recoverable if they are live. */ GET_SCRATCH0(r10) - std r10,\area\()+EX_R13(r13) - .if \dar == 1 - .if \hsrr + std r10,IAREA+EX_R13(r13) + .if IDAR == 1 + .if IHSRR mfspr r10,SPRN_HDAR .else mfspr r10,SPRN_DAR .endif - std r10,\area\()+EX_DAR(r13) + std r10,IAREA+EX_DAR(r13) .endif - .if \dsisr == 1 - .if \hsrr + .if IDSISR == 1 + .if IHSRR mfspr r10,SPRN_HDSISR .else mfspr r10,SPRN_DSISR .endif - stw r10,\area\()+EX_DSISR(r13) + stw r10,IAREA+EX_DSISR(r13) .endif - .if \early == 2 + .if IEARLY == 2 /* nothing more */ - .elseif \early + .elseif IEARLY mfctr r10 /* save ctr, even for !RELOCATABLE */ BRANCH_TO_C000(r11, \name\()_common) .elseif !\virt - INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri + INT_SAVE_SRR_AND_JUMP \name\()_common, IHSRR, ISET_RI .else - INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr + INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, IHSRR .endif .if \ool .popsection
[PATCH v2 05/35] powerpc/64s/exception: Move all interrupt handlers to new style code gen macros
From: Nicholas Piggin Aside from label names and BUG line numbers, the generated code change is an additional HMI KVM handler added for the "late" KVM handler, because early and late HMI generation is achieved by defining two different interrupt types. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 556 --- 1 file changed, 418 insertions(+), 138 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 828fa4df15cf..b5decc9a0cbf 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -206,8 +206,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define IMASK .L_IMASK_\name\() #define IKVM_SKIP .L_IKVM_SKIP_\name\() #define IKVM_REAL .L_IKVM_REAL_\name\() +#define __IKVM_REAL(name) .L_IKVM_REAL_ ## name #define IKVM_VIRT .L_IKVM_VIRT_\name\() #define ISTACK .L_ISTACK_\name\() +#define __ISTACK(name) .L_ISTACK_ ## name #define IRECONCILE .L_IRECONCILE_\name\() #define IKUAP .L_IKUAP_\name\() @@ -570,7 +572,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) /* nothing more */ .elseif \early mfctr r10 /* save ctr, even for !RELOCATABLE */ - BRANCH_TO_C000(r11, \name\()_early_common) + BRANCH_TO_C000(r11, \name\()_common) .elseif !\virt INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri .else @@ -843,6 +845,19 @@ __start_interrupts: EXC_VIRT_NONE(0x4000, 0x100) +INT_DEFINE_BEGIN(system_reset) + IVEC=0x100 + IAREA=PACA_EXNMI + /* +* MSR_RI is not enabled, because PACA_EXNMI and nmi stack is +* being used, so a nested NMI exception would corrupt it. 
+*/ + ISET_RI=0 + ISTACK=0 + IRECONCILE=0 + IKVM_REAL=1 +INT_DEFINE_END(system_reset) + EXC_REAL_BEGIN(system_reset, 0x100, 0x100) #ifdef CONFIG_PPC_P7_NAP /* @@ -880,11 +895,8 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif - INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0, kvm=1 + GEN_INT_ENTRY system_reset, virt=0 /* -* MSR_RI is not enabled, because PACA_EXNMI and nmi stack is -* being used, so a nested NMI exception would corrupt it. -* * In theory, we should not enable relocation here if it was disabled * in SRR1, because the MMU may not be configured to support it (e.g., * SLB may have been cleared). In practice, there should only be a few @@ -893,7 +905,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) */ EXC_REAL_END(system_reset, 0x100, 0x100) EXC_VIRT_NONE(0x4100, 0x100) -INT_KVM_HANDLER system_reset 0x100, EXC_STD, PACA_EXNMI, 0 +TRAMP_KVM_BEGIN(system_reset_kvm) + GEN_KVM system_reset #ifdef CONFIG_PPC_P7_NAP TRAMP_REAL_BEGIN(system_reset_idle_wake) @@ -908,8 +921,8 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake) * Vectors for the FWNMI option. Share common code. 
*/ TRAMP_REAL_BEGIN(system_reset_fwnmi) - /* See comment at system_reset exception, don't turn on RI */ - INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0 + __IKVM_REAL(system_reset)=0 + GEN_INT_ENTRY system_reset, virt=0 #endif /* CONFIG_PPC_PSERIES */ @@ -929,7 +942,7 @@ EXC_COMMON_BEGIN(system_reset_common) mr r10,r1 ld r1,PACA_NMI_EMERG_SP(r13) subir1,r1,INT_FRAME_SIZE - INT_COMMON 0x100, PACA_EXNMI, 0, 1, 0, 0, 0 + GEN_COMMON system_reset bl save_nvgprs /* * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does @@ -971,23 +984,46 @@ EXC_COMMON_BEGIN(system_reset_common) RFI_TO_USER_OR_KERNEL -EXC_REAL_BEGIN(machine_check, 0x200, 0x100) - INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 +INT_DEFINE_BEGIN(machine_check_early) + IVEC=0x200 + IAREA=PACA_EXMC /* * MSR_RI is not enabled, because PACA_EXMC is being used, so a * nested machine check corrupts it. machine_check_common enables * MSR_RI. */ + ISET_RI=0 + ISTACK=0 + IEARLY=1 + IDAR=1 + IDSISR=1 + IRECONCILE=0 + IKUAP=0 /* We don't touch AMR here, we never go to virtual mode */ +INT_DEFINE_END(machine_check_early) + +INT_DEFINE_BEGIN(machine_check) + IVEC=0x200 + IAREA=PACA_EXMC + ISET_RI=0 + IDAR=1 + IDSISR=1 + IKVM_SKIP=1 + IKVM_REAL=1 +INT_DEFINE_END(machine_check) + +EXC_REAL_BEGIN(machine_check, 0x200, 0x100) + GEN_INT_ENTRY machine_check_early, virt=0 EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) #ifdef CONFIG_PPC_PSERIES TRAMP_REAL_BEGIN(machine_check_fwnmi) /* See comment at machine_check exception, don't turn on RI */ - INT_HANDLER machine_check, 0x200, ear
[PATCH v2 04/35] powerpc/64s/exception: Expand EXC_COMMON and EXC_COMMON_ASYNC macros
From: Nicholas Piggin These don't provide a large amount of code sharing. Removing them makes code easier to shuffle around. For example, some of the common instructions will be moved into the common code gen macro. No generated code change. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 160 --- 1 file changed, 117 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 0e39e98ef719..828fa4df15cf 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -757,28 +757,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define FINISH_NAP #endif -#define EXC_COMMON(name, realvec, hdlr) \ - EXC_COMMON_BEGIN(name); \ - INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ - bl save_nvgprs;\ - addir3,r1,STACK_FRAME_OVERHEAD; \ - bl hdlr; \ - b ret_from_except - -/* - * Like EXC_COMMON, but for exceptions that can occur in the idle task and - * therefore need the special idle handling (finish nap and runlatch) - */ -#define EXC_COMMON_ASYNC(name, realvec, hdlr) \ - EXC_COMMON_BEGIN(name); \ - INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ - FINISH_NAP; \ - RUNLATCH_ON;\ - addir3,r1,STACK_FRAME_OVERHEAD; \ - bl hdlr; \ - b ret_from_except_lite - - /* * There are a few constraints to be concerned with. * - Real mode exceptions code/data must be located at their physical location. 
@@ -1349,7 +1327,13 @@ EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) INT_KVM_HANDLER hardware_interrupt, 0x500, EXC_HV_OR_STD, PACA_EXGEN, 0 -EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) +EXC_COMMON_BEGIN(hardware_interrupt_common) + INT_COMMON 0x500, PACA_EXGEN, 1, 1, 1, 0, 0 + FINISH_NAP + RUNLATCH_ON + addir3,r1,STACK_FRAME_OVERHEAD + bl do_IRQ + b ret_from_except_lite EXC_REAL_BEGIN(alignment, 0x600, 0x100) @@ -1455,7 +1439,13 @@ EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED EXC_VIRT_END(decrementer, 0x4900, 0x80) INT_KVM_HANDLER decrementer, 0x900, EXC_STD, PACA_EXGEN, 0 -EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) +EXC_COMMON_BEGIN(decrementer_common) + INT_COMMON 0x900, PACA_EXGEN, 1, 1, 1, 0, 0 + FINISH_NAP + RUNLATCH_ON + addir3,r1,STACK_FRAME_OVERHEAD + bl timer_interrupt + b ret_from_except_lite EXC_REAL_BEGIN(hdecrementer, 0x980, 0x80) @@ -1465,7 +1455,12 @@ EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80) INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(hdecrementer, 0x4980, 0x80) INT_KVM_HANDLER hdecrementer, 0x980, EXC_HV, PACA_EXGEN, 0 -EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt) +EXC_COMMON_BEGIN(hdecrementer_common) + INT_COMMON 0x980, PACA_EXGEN, 1, 1, 1, 0, 0 + bl save_nvgprs + addir3,r1,STACK_FRAME_OVERHEAD + bl hdec_interrupt + b ret_from_except EXC_REAL_BEGIN(doorbell_super, 0xa00, 0x100) @@ -1475,11 +1470,17 @@ EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100) INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED EXC_VIRT_END(doorbell_super, 0x4a00, 0x100) INT_KVM_HANDLER doorbell_super, 0xa00, EXC_STD, PACA_EXGEN, 0 +EXC_COMMON_BEGIN(doorbell_super_common) + INT_COMMON 0xa00, PACA_EXGEN, 1, 1, 1, 0, 0 + FINISH_NAP + RUNLATCH_ON + addir3,r1,STACK_FRAME_OVERHEAD #ifdef 
CONFIG_PPC_DOORBELL -EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception) + bl doorbell_exception #else -EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception) + bl unknown_exception #endif + b ret_from_except_lite EXC_REAL_NONE(0xb00, 0x100) @@ -1623,7 +1624,12 @@ EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100) INT_HANDLER single_step, 0xd00, virt=1 EXC_VIRT_END(single_step, 0x4d00, 0x100) INT_KVM_HANDLER single_step, 0xd00, EXC_STD, PACA_EXGEN, 0 -EXC_COMMON(single_step_common, 0xd00, single_step_exception) +EXC_COMMON_BEGIN(single_step_common) + INT_COMMON 0xd00, PACA_EXGEN, 1, 1, 1, 0, 0 + bl save_nvgprs + ad
[PATCH v2 03/35] powerpc/64s/exception: Add GEN_KVM macro that uses INT_DEFINE parameters
From: Nicholas Piggin No generated code change. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 591ae2a73e18..0e39e98ef719 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -204,6 +204,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define ISET_RI.L_ISET_RI_\name\() #define IEARLY .L_IEARLY_\name\() #define IMASK .L_IMASK_\name\() +#define IKVM_SKIP .L_IKVM_SKIP_\name\() #define IKVM_REAL .L_IKVM_REAL_\name\() #define IKVM_VIRT .L_IKVM_VIRT_\name\() #define ISTACK .L_ISTACK_\name\() @@ -243,6 +244,9 @@ do_define_int n .ifndef IMASK IMASK=0 .endif + .ifndef IKVM_SKIP + IKVM_SKIP=0 + .endif .ifndef IKVM_REAL IKVM_REAL=0 .endif @@ -265,6 +269,10 @@ do_define_int n KVM_HANDLER \vec, \hsrr, \area, \skip .endm +.macro GEN_KVM name + KVM_HANDLER IVEC, IHSRR, IAREA, IKVM_SKIP +.endm + #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* @@ -1226,6 +1234,7 @@ INT_DEFINE_BEGIN(data_access) IVEC=0x300 IDAR=1 IDSISR=1 + IKVM_SKIP=1 IKVM_REAL=1 INT_DEFINE_END(data_access) @@ -1235,7 +1244,8 @@ EXC_REAL_END(data_access, 0x300, 0x80) EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) GEN_INT_ENTRY data_access, virt=1 EXC_VIRT_END(data_access, 0x4300, 0x80) -INT_KVM_HANDLER data_access, 0x300, EXC_STD, PACA_EXGEN, 1 +TRAMP_KVM_BEGIN(data_access_kvm) + GEN_KVM data_access EXC_COMMON_BEGIN(data_access_common) GEN_COMMON data_access ld r4,_DAR(r1) -- 2.23.0
[PATCH v2 02/35] powerpc/64s/exception: Add GEN_COMMON macro that uses INT_DEFINE parameters
From: Nicholas Piggin No generated code change. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 24 +--- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e6ad6e6cf65e..591ae2a73e18 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -206,6 +206,9 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define IMASK .L_IMASK_\name\() #define IKVM_REAL .L_IKVM_REAL_\name\() #define IKVM_VIRT .L_IKVM_VIRT_\name\() +#define ISTACK .L_ISTACK_\name\() +#define IRECONCILE .L_IRECONCILE_\name\() +#define IKUAP .L_IKUAP_\name\() #define INT_DEFINE_BEGIN(n)\ .macro int_define_ ## n name @@ -246,6 +249,15 @@ do_define_int n .ifndef IKVM_VIRT IKVM_VIRT=0 .endif + .ifndef ISTACK + ISTACK=1 + .endif + .ifndef IRECONCILE + IRECONCILE=1 + .endif + .ifndef IKUAP + IKUAP=1 + .endif .endm .macro INT_KVM_HANDLER name, vec, hsrr, area, skip @@ -670,6 +682,10 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) .endif .endm +.macro GEN_COMMON name + INT_COMMON IVEC, IAREA, ISTACK, IKUAP, IRECONCILE, IDAR, IDSISR +.endm + /* * Restore all registers including H/SRR0/1 saved in a stack frame of a * standard exception. @@ -1221,13 +1237,7 @@ EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) EXC_VIRT_END(data_access, 0x4300, 0x80) INT_KVM_HANDLER data_access, 0x300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON_BEGIN(data_access_common) - /* -* Here r13 points to the paca, r9 contains the saved CR, -* SRR0 and SRR1 are saved in r11 and r12, -* r9 - r13 are saved in paca->exgen. -* EX_DAR and EX_DSISR have saved DAR/DSISR -*/ - INT_COMMON 0x300, PACA_EXGEN, 1, 1, 1, 1, 1 + GEN_COMMON data_access ld r4,_DAR(r1) ld r5,_DSISR(r1) BEGIN_MMU_FTR_SECTION -- 2.23.0
[PATCH v2 01/35] powerpc/64s/exception: Introduce INT_DEFINE parameter block for code generation
From: Nicholas Piggin The code generation macro arguments are difficult to read, and defaults can't easily be used. This introduces a block where parameters can be set for interrupt handler code generation by the subsequent macros, and adds the first generation macro for interrupt entry. One interrupt handler is converted to the new macros to demonstrate the change; the rest will be converted all at once. No generated code change. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 77 ++-- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d0018dd17e0a..e6ad6e6cf65e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -193,6 +193,61 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtctr reg;\ bctr +/* + * Interrupt code generation macros + */ +#define IVEC .L_IVEC_\name\() +#define IHSRR .L_IHSRR_\name\() +#define IAREA .L_IAREA_\name\() +#define IDAR .L_IDAR_\name\() +#define IDSISR .L_IDSISR_\name\() +#define ISET_RI.L_ISET_RI_\name\() +#define IEARLY .L_IEARLY_\name\() +#define IMASK .L_IMASK_\name\() +#define IKVM_REAL .L_IKVM_REAL_\name\() +#define IKVM_VIRT .L_IKVM_VIRT_\name\() + +#define INT_DEFINE_BEGIN(n)\ +.macro int_define_ ## n name + +#define INT_DEFINE_END(n) \ +.endm ; \ +int_define_ ## n n ; \ +do_define_int n + +.macro do_define_int name + .ifndef IVEC + .error "IVEC not defined" + .endif + .ifndef IHSRR + IHSRR=EXC_STD + .endif + .ifndef IAREA + IAREA=PACA_EXGEN + .endif + .ifndef IDAR + IDAR=0 + .endif + .ifndef IDSISR + IDSISR=0 + .endif + .ifndef ISET_RI + ISET_RI=1 + .endif + .ifndef IEARLY + IEARLY=0 + .endif + .ifndef IMASK + IMASK=0 + .endif + .ifndef IKVM_REAL + IKVM_REAL=0 + .endif + .ifndef IKVM_VIRT + IKVM_VIRT=0 + .endif +.endm + .macro INT_KVM_HANDLER name, vec, hsrr, area, skip TRAMP_KVM_BEGIN(\name\()_kvm) KVM_HANDLER \vec, \hsrr, \area, \skip @@ -474,7 +529,7 @@ 
END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) */ GET_SCRATCH0(r10) std r10,\area\()+EX_R13(r13) - .if \dar + .if \dar == 1 .if \hsrr mfspr r10,SPRN_HDAR .else @@ -482,7 +537,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endif std r10,\area\()+EX_DAR(r13) .endif - .if \dsisr + .if \dsisr == 1 .if \hsrr mfspr r10,SPRN_HDSISR .else @@ -506,6 +561,14 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endif .endm +.macro GEN_INT_ENTRY name, virt, ool=0 + .if ! \virt + INT_HANDLER \name, IVEC, \ool, IEARLY, \virt, IHSRR, IAREA, ISET_RI, IDAR, IDSISR, IMASK, IKVM_REAL + .else + INT_HANDLER \name, IVEC, \ool, IEARLY, \virt, IHSRR, IAREA, ISET_RI, IDAR, IDSISR, IMASK, IKVM_VIRT + .endif +.endm + /* * On entry r13 points to the paca, r9-r13 are saved in the paca, * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and @@ -1143,12 +1206,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) bl unrecoverable_exception b . +INT_DEFINE_BEGIN(data_access) + IVEC=0x300 + IDAR=1 + IDSISR=1 + IKVM_REAL=1 +INT_DEFINE_END(data_access) EXC_REAL_BEGIN(data_access, 0x300, 0x80) - INT_HANDLER data_access, 0x300, ool=1, dar=1, dsisr=1, kvm=1 + GEN_INT_ENTRY data_access, virt=0, ool=1 EXC_REAL_END(data_access, 0x300, 0x80) EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) - INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1 + GEN_INT_ENTRY data_access, virt=1 EXC_VIRT_END(data_access, 0x4300, 0x80) INT_KVM_HANDLER data_access, 0x300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON_BEGIN(data_access_common) -- 2.23.0
[PATCH v2 00/35] exception cleanup, syscall in C and !COMPAT
Hello, This is a merge of https://patchwork.ozlabs.org/cover/1162376/ (except the last two experimental patches) and https://patchwork.ozlabs.org/patch/1162079/ rebased on top of master. There was a minor conflict in the Makefile in the latter series. Refreshed the patchset to fix a build error on ppc32 and ppc64e. Thanks Michal Michal Suchanek (10): powerpc/64: system call: Fix sparse warning about missing declaration powerpc: Add back __ARCH_WANT_SYS_LLSEEK macro powerpc: move common register copy functions from signal_32.c to signal.c powerpc/perf: remove current_is_64bit() powerpc/perf: consolidate read_user_stack_32 powerpc/perf: consolidate valid_user_sp powerpc/64: make buildable without CONFIG_COMPAT powerpc/64: Make COMPAT user-selectable disabled on littleendian by default. powerpc/perf: split callchain.c by bitness MAINTAINERS: perf: Add pattern that matches ppc perf to the perf entry. Nicholas Piggin (25): powerpc/64s/exception: Introduce INT_DEFINE parameter block for code generation powerpc/64s/exception: Add GEN_COMMON macro that uses INT_DEFINE parameters powerpc/64s/exception: Add GEN_KVM macro that uses INT_DEFINE parameters powerpc/64s/exception: Expand EXC_COMMON and EXC_COMMON_ASYNC macros powerpc/64s/exception: Move all interrupt handlers to new style code gen macros powerpc/64s/exception: Remove old INT_ENTRY macro powerpc/64s/exception: Remove old INT_COMMON macro powerpc/64s/exception: Remove old INT_KVM_HANDLER powerpc/64s/exception: Add ISIDE option powerpc/64s/exception: move real->virt switch into the common handler powerpc/64s/exception: move soft-mask test to common code powerpc/64s/exception: move KVM test to common code powerpc/64s/exception: remove confusing IEARLY option powerpc/64s/exception: remove the SPR saving patch code macros powerpc/64s/exception: trim unused arguments from KVMTEST macro powerpc/64s/exception: hdecrementer avoid touching the stack powerpc/64s/exception: re-inline some handlers powerpc/64s/exception: Clean up SRR 
specifiers powerpc/64s/exception: add more comments for interrupt handlers powerpc/64s/exception: only test KVM in SRR interrupts when PR KVM is supported powerpc/64s/exception: soft nmi interrupt should not use ret_from_except powerpc/64: system call remove non-volatile GPR save optimisation powerpc/64: system call implement the bulk of the logic in C powerpc/64s: interrupt return in C powerpc/64s/exception: remove lite interrupt return MAINTAINERS |2 + arch/powerpc/Kconfig |5 +- arch/powerpc/include/asm/asm-prototypes.h | 17 +- .../powerpc/include/asm/book3s/64/kup-radix.h | 24 +- arch/powerpc/include/asm/cputime.h| 24 + arch/powerpc/include/asm/exception-64s.h |4 - arch/powerpc/include/asm/hw_irq.h |4 + arch/powerpc/include/asm/ptrace.h |3 + arch/powerpc/include/asm/signal.h |3 + arch/powerpc/include/asm/switch_to.h | 11 + arch/powerpc/include/asm/thread_info.h|4 +- arch/powerpc/include/asm/time.h |4 +- arch/powerpc/include/asm/unistd.h |1 + arch/powerpc/kernel/Makefile |9 +- arch/powerpc/kernel/entry_64.S| 880 ++-- arch/powerpc/kernel/exceptions-64e.S | 255 ++- arch/powerpc/kernel/exceptions-64s.S | 1937 - arch/powerpc/kernel/process.c | 89 +- arch/powerpc/kernel/signal.c | 144 +- arch/powerpc/kernel/signal.h |2 - arch/powerpc/kernel/signal_32.c | 140 -- arch/powerpc/kernel/syscall_64.c | 349 +++ arch/powerpc/kernel/syscalls/syscall.tbl | 22 +- arch/powerpc/kernel/systbl.S |9 +- arch/powerpc/kernel/time.c|9 - arch/powerpc/kernel/vdso.c|3 +- arch/powerpc/kernel/vector.S |2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 - arch/powerpc/kvm/book3s_segment.S |7 - arch/powerpc/perf/Makefile|5 +- arch/powerpc/perf/callchain.c | 387 +--- arch/powerpc/perf/callchain.h | 20 + arch/powerpc/perf/callchain_32.c | 197 ++ arch/powerpc/perf/callchain_64.c | 178 ++ fs/read_write.c |3 +- 35 files changed, 2799 insertions(+), 1965 deletions(-) create mode 100644 arch/powerpc/kernel/syscall_64.c create mode 100644 arch/powerpc/perf/callchain.h create mode 100644 
arch/powerpc/perf/callchain_32.c create mode 100644 arch/powerpc/perf/callchain_64.c -- 2.23.0
Re: [PATCH v2] of: unittest: fix memory leak in attach_node_and_children
On Tue, 26 Nov 2019 02:48:04 +0100, Erhard Furtner wrote: > In attach_node_and_children memory is allocated for full_name via > kasprintf. If the condition of the 1st if is not met the function > returns early without freeing the memory. Add a kfree() to fix that. > > This has been detected with kmemleak: > Link: https://bugzilla.kernel.org/show_bug.cgi?id=205327 > > It looks like the leak was introduced by this commit: > Fixes: 5babefb7f7ab ("of: unittest: allow base devicetree to have symbol > metadata") > > Signed-off-by: Erhard Furtner > Reviewed-by: Michael Ellerman > Reviewed-by: Tyrel Datwyler > --- > Changes in v2: > - Make the commit message more clearer. > > drivers/of/unittest.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > Applied, thanks. Rob
Re: [PATCH v2] dma-mapping: treat dev->bus_dma_mask as a DMA limit
On Mon, 2019-11-25 at 16:33 +, Robin Murphy wrote: > On 25/11/2019 7:44 am, Christoph Hellwig wrote: > > On Sat, Nov 23, 2019 at 09:51:08AM -0700, Nathan Chancellor wrote: > > > Just as an FYI, this introduces a warning on arm32 allyesconfig for me: > > > > I think the dma_limit argument to iommu_dma_alloc_iova should be a u64 > > and/or we need to use min_t and open code the zero exception. > > > > Robin, Nicolas - any opinions? > > Yeah, given that it's always held a mask I'm not entirely sure why it > was ever a dma_addr_t rather than a u64. Unless anyone else is desperate > to do it I'll get a cleanup patch ready for rc1. Sounds good to me too Robin, since I started the mess, I'll be happy to do it if it helps offloading some work from you. Regards, Nicolas signature.asc Description: This is a digitally signed message part
Re: [PATCH 1/1] powerpc/kvm/book3s: Fixes possible 'use after release' of kvm
On Tue, Nov 26, 2019 at 02:52:12PM -0300, Leonardo Bras wrote: > Fixes a possible 'use after free' of kvm variable. > It does use mutex_unlock(&kvm->lock) after possible freeing a variable > with kvm_put_kvm(kvm). Moving the calls to kvm_put_kvm() to the end of the functions doesn't actually fix a use-after-free. In these flows, the reference being released is a borrowed reference that KVM takes on behalf of the entity it is creating, e.g. device, vcpu, or spapr tce. The caller of these create helpers must also hold its own reference to @kvm on top of the borrowed reference, i.e. these kvm_put_kvm() calls will never free @kvm (assuming there are no refcounting bugs elsewhere in KVM). If one of these kvm_put_kvm() calls did unexpectedly free @kvm (due to a bug somewhere else), KVM would still hit a use-after-free scenario as the caller still thinks @kvm is valid. Currently, this would only happen on a subsequent ioctl() on the caller's file descriptor (which holds a pointer to @kvm), as the callers of these functions don't directly dereference @kvm after the functions return. But, not dereferencing @kvm isn't deliberate or functionally required, it's just how the code happens to be written. The intent of adding kvm_put_kvm_no_destroy() was primarily to document that under no circumstance should the to-be-put reference be the *last* reference to @kvm. 
Moving the call to kvm_put_kvm{_no_destroy}() doesn't change that > Signed-off-by: Leonardo Bras > --- > arch/powerpc/kvm/book3s_64_vio.c | 3 +-- > virt/kvm/kvm_main.c | 8 > 2 files changed, 5 insertions(+), 6 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_vio.c > b/arch/powerpc/kvm/book3s_64_vio.c > index 5834db0a54c6..a402ead833b6 100644 > --- a/arch/powerpc/kvm/book3s_64_vio.c > +++ b/arch/powerpc/kvm/book3s_64_vio.c > @@ -316,14 +316,13 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > > if (ret >= 0) > list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); > - else > - kvm_put_kvm(kvm); > > mutex_unlock(&kvm->lock); > > if (ret >= 0) > return ret; > > + kvm_put_kvm(kvm); > kfree(stt); > fail_acct: > account_locked_vm(current->mm, kvmppc_stt_pages(npages), false); > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 13efc291b1c7..f37089b60d09 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -2744,10 +2744,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, > u32 id) > /* Now it's all set up, let userspace reach it */ > kvm_get_kvm(kvm); > r = create_vcpu_fd(vcpu); > - if (r < 0) { > - kvm_put_kvm(kvm); > + if (r < 0) > goto unlock_vcpu_destroy; > - } > > kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; > > @@ -2771,6 +2769,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, > u32 id) > mutex_lock(&kvm->lock); > kvm->created_vcpus--; > mutex_unlock(&kvm->lock); > + if (r < 0) > + kvm_put_kvm(kvm); > return r; > } > > @@ -3183,10 +3183,10 @@ static int kvm_ioctl_create_device(struct kvm *kvm, > kvm_get_kvm(kvm); > ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | > O_CLOEXEC); > if (ret < 0) { > - kvm_put_kvm(kvm); > mutex_lock(&kvm->lock); > list_del(&dev->vm_node); > mutex_unlock(&kvm->lock); > + kvm_put_kvm(kvm); > ops->destroy(dev); > return ret; > } > -- > 2.23.0 >
[PATCH 1/1] powerpc/kvm/book3s: Fixes possible 'use after release' of kvm
Fixes a possible 'use after free' of kvm variable. It does use mutex_unlock(&kvm->lock) after possible freeing a variable with kvm_put_kvm(kvm). Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/book3s_64_vio.c | 3 +-- virt/kvm/kvm_main.c | 8 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 5834db0a54c6..a402ead833b6 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -316,14 +316,13 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, if (ret >= 0) list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); - else - kvm_put_kvm(kvm); mutex_unlock(&kvm->lock); if (ret >= 0) return ret; + kvm_put_kvm(kvm); kfree(stt); fail_acct: account_locked_vm(current->mm, kvmppc_stt_pages(npages), false); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 13efc291b1c7..f37089b60d09 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2744,10 +2744,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); - if (r < 0) { - kvm_put_kvm(kvm); + if (r < 0) goto unlock_vcpu_destroy; - } kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; @@ -2771,6 +2769,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) mutex_lock(&kvm->lock); kvm->created_vcpus--; mutex_unlock(&kvm->lock); + if (r < 0) + kvm_put_kvm(kvm); return r; } @@ -3183,10 +3183,10 @@ static int kvm_ioctl_create_device(struct kvm *kvm, kvm_get_kvm(kvm); ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); if (ret < 0) { - kvm_put_kvm(kvm); mutex_lock(&kvm->lock); list_del(&dev->vm_node); mutex_unlock(&kvm->lock); + kvm_put_kvm(kvm); ops->destroy(dev); return ret; } -- 2.23.0
[PATCH v2 1/2] powerpc/hw_breakpoints: Rewrite 8xx breakpoints to allow any address range size.
Unlike standard powerpc, Powerpc 8xx doesn't have SPRN_DABR, but it has breakpoint support based on a set of comparators which allow more flexibility. Commit 4ad8622dc548 ("powerpc/8xx: Implement hw_breakpoint") implemented breakpoints by emulating the DABR behaviour. It did this by setting one comparator to match 4 bytes at breakpoint address and the other comparator to match 4 bytes at breakpoint address + 4. Rewrite 8xx hw_breakpoint to make breakpoints match all addresses defined by the breakpoint address and length by making full use of comparators. Now, comparator E is set to match any address greater than breakpoint address minus one. Comparator F is set to match any address lower than breakpoint address plus breakpoint length. Addresses are aligned to 32 bits. When the breakpoint range starts at address 0, the breakpoint is set to match comparator F only. When the breakpoint range ends at address 0xffffffff, the breakpoint is set to match comparator E only. Otherwise the breakpoint is set to match comparator E and F. At the same time, use register bit names instead of hardcoded values. 
Signed-off-by: Christophe Leroy Cc: Ravi Bangoria --- v2: rebased on today's powerpc/next ; added 32 bit alignment --- arch/powerpc/include/asm/hw_breakpoint.h | 4 +++ arch/powerpc/include/asm/reg_8xx.h | 14 arch/powerpc/kernel/hw_breakpoint.c | 15 + arch/powerpc/kernel/process.c| 57 4 files changed, 61 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index 27ac6f5d2891..f2f8d8aa8e3b 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -34,7 +34,11 @@ struct arch_hw_breakpoint { #define HW_BRK_TYPE_PRIV_ALL (HW_BRK_TYPE_USER | HW_BRK_TYPE_KERNEL | \ HW_BRK_TYPE_HYP) +#ifdef CONFIG_PPC_8xx +#define HW_BREAKPOINT_ALIGN 0x3 +#else #define HW_BREAKPOINT_ALIGN 0x7 +#endif #define DABR_MAX_LEN 8 #define DAWR_MAX_LEN 512 diff --git a/arch/powerpc/include/asm/reg_8xx.h b/arch/powerpc/include/asm/reg_8xx.h index 07df35ee8cbc..299ee7be0f67 100644 --- a/arch/powerpc/include/asm/reg_8xx.h +++ b/arch/powerpc/include/asm/reg_8xx.h @@ -35,7 +35,21 @@ #define SPRN_CMPE 152 #define SPRN_CMPF 153 #define SPRN_LCTRL1156 +#define LCTRL1_CTE_GT0xc000 +#define LCTRL1_CTF_LT0x1400 +#define LCTRL1_CRWE_RW 0x +#define LCTRL1_CRWE_RO 0x0004 +#define LCTRL1_CRWE_WO 0x000c +#define LCTRL1_CRWF_RW 0x +#define LCTRL1_CRWF_RO 0x0001 +#define LCTRL1_CRWF_WO 0x0003 #define SPRN_LCTRL2157 +#define LCTRL2_LW0EN 0x8000 +#define LCTRL2_LW0LA_E 0x +#define LCTRL2_LW0LA_F 0x0400 +#define LCTRL2_LW0LA_EandF 0x0800 +#define LCTRL2_LW0LADC 0x0200 +#define LCTRL2_SLW0EN0x0002 #ifdef CONFIG_PPC_8xx #define SPRN_ICTRL 158 #endif diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 58ce3d37c2a3..2462cd7c565c 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -160,6 +160,9 @@ static int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw) /* DAWR region can't cross 512 bytes boundary */ if 
((start_addr >> 9) != (end_addr >> 9)) return -EINVAL; + } else if (IS_ENABLED(CONFIG_PPC_8xx)) { + /* 8xx can setup a range without limitation */ + max_len = U16_MAX; } if (hw_len > max_len) @@ -328,13 +331,11 @@ int hw_breakpoint_handler(struct die_args *args) } info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ; - if (IS_ENABLED(CONFIG_PPC_8xx)) { - if (!dar_within_range(regs->dar, info)) - info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; - } else { - if (!stepping_handler(regs, bp, info)) - goto out; - } + if (!dar_within_range(regs->dar, info)) + info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; + + if (!IS_ENABLED(CONFIG_PPC_8xx) && !stepping_handler(regs, bp, info)) + goto out; /* * As a policy, the callback is invoked in a 'trigger-after-execute' diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 4df94b6e2f32..7fcf72e58826 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -740,28 +740,6 @@ static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) mtspr(SPRN_DABRX, dabrx); return 0; } -#elif defined(CONFIG_PPC_8xx) -static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) -{ - unsigned long addr = dabr & ~HW_BRK_TYPE_DABR
[PATCH v2 2/2] selftests/powerpc: enable range tests on 8xx in ptrace-hwbreak.c selftest
8xx is now able to support any range length so range tests can be enabled. Signed-off-by: Christophe Leroy --- v2: new --- tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c index 7deedbc16b0b..fc477dfe86a2 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c @@ -455,9 +455,8 @@ run_tests(pid_t child_pid, struct ppc_debug_info *dbginfo, bool dawr) if (dbginfo->features & PPC_DEBUG_FEATURE_DATA_BP_RANGE) { test_sethwdebug_exact(child_pid); - if (!is_8xx) - test_sethwdebug_range_aligned(child_pid); - if (dawr && !is_8xx) { + test_sethwdebug_range_aligned(child_pid); + if (dawr || is_8xx) { test_sethwdebug_range_unaligned(child_pid); test_sethwdebug_range_unaligned_dar(child_pid); test_sethwdebug_dawr_max_range(child_pid); -- 2.13.3
Re: Bug 205201 - Booting halts if Dawicontrol DC-2976 UW SCSI board installed, unless RAM size limited to 3500M
On Tue, Nov 26, 2019 at 12:26:38PM +0100, Christian Zigotzky wrote: > Hello Christoph, > > The PCI TV card works with your patch! I was able to patch your Git kernel > with the patch above. > > I haven't found any error messages in the dmesg yet. Thanks. Unfortunately this is a bit of a hack as we need to set the mask based on runtime information like the magic FSL PCIe window. Let me try to draft something better up, and thanks already for testing this one!
[PATCH] powernv/opal-sensor-groups: Add documentation for the sysfs interfaces
From: Shilpasri G Bhat Commit bf9571550f52 ("powerpc/powernv: Add support to clear sensor groups data") added a mechanism to clear sensor-group data via a sysfs interface. However, the ABI for that interface has not been documented. This patch documents the ABI for the sysfs interface for sensor-groups and clearing the sensor-groups. This patch was originally sent by Shilpasri G Bhat on the mailing list: https://lkml.org/lkml/2018/8/1/85 Signed-off-by: Shilpasri G Bhat Signed-off-by: Gautham R. Shenoy --- .../ABI/testing/sysfs-firmware-opal-sensor-groups | 21 + 1 file changed, 21 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups diff --git a/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups b/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups new file mode 100644 index 000..3a2dfe5 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups @@ -0,0 +1,21 @@ +What: /sys/firmware/opal/sensor_groups +Date: August 2017 +Contact: Linux for PowerPC mailing list +Description: Sensor groups directory for POWER9 powernv servers + + Each folder in this directory contains a sensor group + which are classified based on type of the sensor + like power, temperature, frequency, current, etc. They + can also indicate the group of sensors belonging to + different owners like CSM, Profiler, Job-Scheduler + +What: /sys/firmware/opal/sensor_groups//clear +Date: August 2017 +Contact: Linux for PowerPC mailing list +Description: Sysfs file to clear the min-max of all the sensors + belonging to the group. + + Writing 1 to this file will clear the minimum and + maximum values of all the sensors in the group. + In POWER9, the min-max of a sensor is the historical minimum + and maximum value of the sensor cached by OCC. -- 1.9.4
[PATCH v2] powerpc/8xx: Fix permanently mapped IMMR region.
When not using large TLBs, the IMMR region is still mapped as a whole block in the FIXMAP area. Properly report that the IMMR region is block-mapped even when not using large TLBs. Signed-off-by: Christophe Leroy --- v2: rebased on today's powerpc/next (this drops the change to mem.c which is already merged) --- arch/powerpc/mm/nohash/8xx.c | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 090af2d2d3e4..2c98078d2ede 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -21,33 +21,34 @@ extern int __map_without_ltlbs; static unsigned long block_mapped_ram; /* - * Return PA for this VA if it is in an area mapped with LTLBs. + * Return PA for this VA if it is in an area mapped with LTLBs or fixmap. * Otherwise, returns 0 */ phys_addr_t v_block_mapped(unsigned long va) { unsigned long p = PHYS_IMMR_BASE; - if (__map_without_ltlbs) - return 0; if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE) return p + va - VIRT_IMMR_BASE; + if (__map_without_ltlbs) + return 0; if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram) return __pa(va); return 0; } /* - * Return VA for a given PA mapped with LTLBs or 0 if not mapped + * Return VA for a given PA mapped with LTLBs or fixmap + * Return 0 if not mapped */ unsigned long p_block_mapped(phys_addr_t pa) { unsigned long p = PHYS_IMMR_BASE; - if (__map_without_ltlbs) - return 0; if (pa >= p && pa < p + IMMR_SIZE) return VIRT_IMMR_BASE + pa - p; + if (__map_without_ltlbs) + return 0; if (pa < block_mapped_ram) return (unsigned long)__va(pa); return 0; -- 2.13.3
[PATCH v4 16/16] powerpc/32s: Activate CONFIG_VMAP_STACK
A few changes to retrieve DAR and DSISR from struct regs instead of retrieving them directly, as they may have changed due to a TLB miss. Also modifies hash_page() and friends to work with virtual data addresses instead of physical ones. Same on load_up_fpu() and load_up_altivec(). Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/entry_32.S | 4 +++ arch/powerpc/kernel/fpu.S | 3 +++ arch/powerpc/kernel/head_32.S | 19 +++--- arch/powerpc/kernel/head_32.h | 4 ++- arch/powerpc/kernel/vector.S | 3 +++ arch/powerpc/mm/book3s32/hash_low.S| 46 +- arch/powerpc/mm/book3s32/mmu.c | 9 +-- arch/powerpc/platforms/Kconfig.cputype | 2 ++ 8 files changed, 67 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 00fcf954e742..1d3b152ee54f 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1365,7 +1365,11 @@ _GLOBAL(enter_rtas) lis r6,1f@ha/* physical return address for rtas */ addir6,r6,1f@l tophys(r6,r6) +#ifdef CONFIG_VMAP_STACK + mr r7, r1 +#else tophys(r7,r1) +#endif lwz r8,RTASENTRY(r4) lwz r4,RTASBASE(r4) mfmsr r9 diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 0bb991ddd264..3235a8da6af7 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -94,6 +94,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) /* enable use of FP after return */ #ifdef CONFIG_PPC32 mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ +#ifdef CONFIG_VMAP_STACK + tovirt(r5, r5) +#endif lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP/* enable FP for current */ or r9,r9,r4 diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 90ef355e958b..28391a408a22 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -272,14 +272,22 @@ __secondary_hold_acknowledge: */ . 
= 0x200 DO_KVM 0x200 +MachineCheck: EXCEPTION_PROLOG_0 +#ifdef CONFIG_VMAP_STACK + li r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ + mtmsr r11 +#endif #ifdef CONFIG_PPC_CHRP mfspr r11, SPRN_SPRG_THREAD +#ifdef CONFIG_VMAP_STACK + tovirt(r11, r11) +#endif lwz r11, RTAS_SP(r11) cmpwi cr1, r11, 0 bne cr1, 7f #endif /* CONFIG_PPC_CHRP */ - EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_1 rtas 7: EXCEPTION_PROLOG_2 addir3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_CHRP @@ -294,7 +302,7 @@ __secondary_hold_acknowledge: . = 0x300 DO_KVM 0x300 DataAccess: - EXCEPTION_PROLOG + EXCEPTION_PROLOG dar get_and_save_dar_dsisr_on_stack r4, r5, r11 BEGIN_MMU_FTR_SECTION #ifdef CONFIG_PPC_KUAP @@ -334,7 +342,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) . = 0x600 DO_KVM 0x600 Alignment: - EXCEPTION_PROLOG + EXCEPTION_PROLOG dar save_dar_dsisr_on_stack r4, r5, r11 addir3,r1,STACK_FRAME_OVERHEAD EXC_XFER_STD(0x600, alignment_exception) @@ -645,6 +653,11 @@ handle_page_fault_tramp_1: handle_page_fault_tramp_2: EXC_XFER_LITE(0x300, handle_page_fault) +#ifdef CONFIG_VMAP_STACK +stack_ovf_trampoline: + b stack_ovf +#endif + AltiVecUnavailable: EXCEPTION_PROLOG #ifdef CONFIG_ALTIVEC diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 283d4298d555..ae2c8e07e1d5 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -38,10 +38,12 @@ andi. 
r11, r11, MSR_PR .endm -.macro EXCEPTION_PROLOG_1 +.macro EXCEPTION_PROLOG_1 rtas #ifdef CONFIG_VMAP_STACK + .ifb\rtas li r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ mtmsr r11 + .endif subir11, r1, INT_FRAME_SIZE /* use r1 if kernel */ #else tophys(r11,r1) /* use tophys(r1) if kernel */ diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 8eb867dbad5f..25c14a0981bf 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -67,6 +67,9 @@ _GLOBAL(load_up_altivec) #ifdef CONFIG_PPC32 mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ orisr9,r9,MSR_VEC@h +#ifdef CONFIG_VMAP_STACK + tovirt(r5, r5) +#endif #else ld r4,PACACURRENT(r13) addir5,r4,THREAD/* Get THREAD */ diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S index 8bbbd9775c8a..c11b0a005196 100644 --- a/arch/powerpc/mm/book3s32/hash_low.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -25,6 +25,12 @@ #include #include +#ifdef CONFIG_VMAP_STA
[PATCH v4 14/16] powerpc/32s: reorganise DSI handler.
The part dedicated to handling hash_page() is fully unneeded for processors not having real hash pages like the 603. Let's enlarge the content of the feature fixup, and provide an alternative which jumps directly instead of getting NOPs. Also, in preparation of VMAP stacks, the end of DSI handler has moved to later in the code as it won't fit anymore once VMAP stacks are there. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/head_32.S | 31 +-- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 449625b4ff03..7ec780858299 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -295,24 +295,20 @@ __secondary_hold_acknowledge: DO_KVM 0x300 DataAccess: EXCEPTION_PROLOG - mfspr r10,SPRN_DSISR - stw r10,_DSISR(r11) + get_and_save_dar_dsisr_on_stack r4, r5, r11 +BEGIN_MMU_FTR_SECTION #ifdef CONFIG_PPC_KUAP - andis. r0,r10,(DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h + andis. r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h #else - andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h + andis. r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h #endif - bne 1f /* if not, try to put a PTE */ - mfspr r4,SPRN_DAR /* into the hash table */ - rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ -BEGIN_MMU_FTR_SECTION + bne handle_page_fault_tramp_2 /* if not, try to put a PTE */ + rlwinm r3, r5, 32 - 15, 21, 21 /* DSISR_STORE -> _PAGE_RW */ bl hash_page -END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) -1: lwz r5,_DSISR(r11) /* get DSISR value */ - mfspr r4,SPRN_DAR - stw r4, _DAR(r11) - EXC_XFER_LITE(0x300, handle_page_fault) - + b handle_page_fault_tramp_1 +FTR_SECTION_ELSE + b handle_page_fault_tramp_2 +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE) /* Instruction access exception. */ . = 0x400 @@ -642,6 +638,13 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) . 
= 0x3000 +handle_page_fault_tramp_1: + lwz r4, _DAR(r11) + lwz r5, _DSISR(r11) + /* fall through */ +handle_page_fault_tramp_2: + EXC_XFER_LITE(0x300, handle_page_fault) + AltiVecUnavailable: EXCEPTION_PROLOG #ifdef CONFIG_ALTIVEC -- 2.13.3
[PATCH v4 15/16] powerpc/32s: avoid crossing page boundary while changing SRR0/1.
Trying VMAP_STACK with KVM, vmlinux was not starting. This was due to SRR0 and SRR1 clobbered by an ISI due to the rfi being in a different page than the mtsrr0/1: c0003fe0 : c0003fe0: 38 83 00 54 addir4,r3,84 c0003fe4: 7c 60 00 a6 mfmsr r3 c0003fe8: 70 60 00 30 andi. r0,r3,48 c0003fec: 4d 82 00 20 beqlr c0003ff0: 7c 63 00 78 andcr3,r3,r0 c0003ff4: 7c 9a 03 a6 mtsrr0 r4 c0003ff8: 7c 7b 03 a6 mtsrr1 r3 c0003ffc: 7c 00 04 ac hwsync c0004000: 4c 00 00 64 rfi Align the 4 instruction block used to deactivate MMU to order 4, so that the block never crosses a page boundary. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/head_32.S | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 7ec780858299..90ef355e958b 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -917,6 +917,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) ori r4,r4,2f@l tophys(r4,r4) li r3,MSR_KERNEL & ~(MSR_IR|MSR_DR) + + .align 4 mtspr SPRN_SRR0,r4 mtspr SPRN_SRR1,r3 SYNC @@ -1058,6 +1060,8 @@ _ENTRY(update_bats) rlwinm r0, r6, 0, ~MSR_RI rlwinm r0, r0, 0, ~MSR_EE mtmsr r0 + + .align 4 mtspr SPRN_SRR0, r4 mtspr SPRN_SRR1, r3 SYNC @@ -1097,6 +1101,8 @@ mmu_off: andi. r0,r3,MSR_DR|MSR_IR /* MMU enabled? */ beqlr andcr3,r3,r0 + + .align 4 mtspr SPRN_SRR0,r4 mtspr SPRN_SRR1,r3 sync -- 2.13.3
[PATCH v4 13/16] powerpc/8xx: Enable CONFIG_VMAP_STACK
This patch enables CONFIG_VMAP_STACK. For that, a few changes are done in head_8xx.S. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/head_8xx.S | 34 -- arch/powerpc/platforms/Kconfig.cputype | 1 + 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 225e242ce1c5..fc6d4d10e298 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -127,7 +127,7 @@ instruction_counter: /* Machine check */ . = 0x200 MachineCheck: - EXCEPTION_PROLOG + EXCEPTION_PROLOG dar save_dar_dsisr_on_stack r4, r5, r11 li r6, RPN_PATTERN mtspr SPRN_DAR, r6/* Tag DAR, to be used in DTLB Error */ @@ -140,7 +140,7 @@ MachineCheck: /* Alignment exception */ . = 0x600 Alignment: - EXCEPTION_PROLOG + EXCEPTION_PROLOG dar save_dar_dsisr_on_stack r4, r5, r11 li r6, RPN_PATTERN mtspr SPRN_DAR, r6/* Tag DAR, to be used in DTLB Error */ @@ -457,20 +457,26 @@ InstructionTLBError: */ . = 0x1400 DataTLBError: - EXCEPTION_PROLOG_0 + EXCEPTION_PROLOG_0 dar mfspr r11, SPRN_DAR cmpwi cr1, r11, RPN_PATTERN beq-cr1, FixupDAR /* must be a buggy dcbX, icbi insn. */ DARFixed:/* Return from dcbx instruction bug workaround */ +#ifdef CONFIG_VMAP_STACK + li r11, RPN_PATTERN + mtspr SPRN_DAR, r11 /* Tag DAR, to be used in DTLB Error */ +#endif EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 + EXCEPTION_PROLOG_2 dar get_and_save_dar_dsisr_on_stack r4, r5, r11 andis. 
r10,r5,DSISR_NOHPTE@h beq+.Ldtlbie tlbie r4 .Ldtlbie: +#ifndef CONFIG_VMAP_STACK li r10,RPN_PATTERN mtspr SPRN_DAR,r10/* Tag DAR, to be used in DTLB Error */ +#endif /* 0x300 is DataAccess exception, needed by bad_page_fault() */ EXC_XFER_LITE(0x300, handle_page_fault) @@ -492,16 +498,20 @@ DARFixed:/* Return from dcbx instruction bug workaround */ */ do_databreakpoint: EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 + EXCEPTION_PROLOG_2 dar addir3,r1,STACK_FRAME_OVERHEAD mfspr r4,SPRN_BAR stw r4,_DAR(r11) +#ifdef CONFIG_VMAP_STACK + lwz r5,_DSISR(r11) +#else mfspr r5,SPRN_DSISR +#endif EXC_XFER_STD(0x1c00, do_break) . = 0x1c00 DataBreakpoint: - EXCEPTION_PROLOG_0 + EXCEPTION_PROLOG_0 dar mfspr r11, SPRN_SRR0 cmplwi cr1, r11, (.Ldtlbie - PAGE_OFFSET)@l cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l @@ -530,6 +540,11 @@ InstructionBreakpoint: EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_STD) EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_STD) +#ifdef CONFIG_VMAP_STACK +stack_ovf_trampoline: + b stack_ovf +#endif + . = 0x2000 /* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions @@ -650,7 +665,14 @@ FixupDAR:/* Entry point for dcbx workaround. */ 152: mfdar r11 mtctr r11 /* restore ctr reg from DAR */ +#ifdef CONFIG_VMAP_STACK + mfspr r11, SPRN_SPRG_THREAD + stw r10, DAR(r11) + mfspr r10, SPRN_DSISR + stw r10, DSISR(r11) +#else mtdar r10 /* save fault EA to DAR */ +#endif mfspr r10,SPRN_M_TW b DARFixed/* Go back to normal TLB handling */ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 1e352c2eea7a..f0583251e9a3 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -49,6 +49,7 @@ config PPC_8xx select PPC_HAVE_KUEP select PPC_HAVE_KUAP select PPC_MM_SLICES if HUGETLB_PAGE + select HAVE_ARCH_VMAP_STACK config 40x bool "AMCC 40x" -- 2.13.3
[PATCH v4 12/16] powerpc/8xx: split breakpoint exception
Breakpoint exception is big. Split it to support future growth on exception prolog. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/head_8xx.S | 19 ++- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 1e718e47fe3c..225e242ce1c5 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -490,14 +490,7 @@ DARFixed:/* Return from dcbx instruction bug workaround */ * support of breakpoints and such. Someday I will get around to * using them. */ - . = 0x1c00 -DataBreakpoint: - EXCEPTION_PROLOG_0 - mfspr r11, SPRN_SRR0 - cmplwi cr1, r11, (.Ldtlbie - PAGE_OFFSET)@l - cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l - beq-cr1, 11f - beq-cr7, 11f +do_databreakpoint: EXCEPTION_PROLOG_1 EXCEPTION_PROLOG_2 addir3,r1,STACK_FRAME_OVERHEAD @@ -505,7 +498,15 @@ DataBreakpoint: stw r4,_DAR(r11) mfspr r5,SPRN_DSISR EXC_XFER_STD(0x1c00, do_break) -11: + + . = 0x1c00 +DataBreakpoint: + EXCEPTION_PROLOG_0 + mfspr r11, SPRN_SRR0 + cmplwi cr1, r11, (.Ldtlbie - PAGE_OFFSET)@l + cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l + cror4*cr1+eq, 4*cr1+eq, 4*cr7+eq + bne cr1, do_databreakpoint mtcrr10 mfspr r10, SPRN_SPRG_SCRATCH0 mfspr r11, SPRN_SPRG_SCRATCH1 -- 2.13.3
[PATCH v4 06/16] powerpc/32: prepare for CONFIG_VMAP_STACK
To support CONFIG_VMAP_STACK, the kernel has to activate Data MMU Translation for accessing the stack. Before doing that it must save SRR0, SRR1 and also DAR and DSISR when relevant, in order to not lose them in case there is a Data TLB Miss once the translation is reactivated. This patch adds fields in thread struct for saving those registers. It prepares entry_32.S to handle exception entry with Data MMU Translation enabled and alters EXCEPTION_PROLOG macros to save SRR0, SRR1, DAR and DSISR then reenables Data MMU. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/processor.h | 6 ++ arch/powerpc/include/asm/thread_info.h | 5 ++ arch/powerpc/kernel/asm-offsets.c | 6 ++ arch/powerpc/kernel/entry_32.S | 7 +++ arch/powerpc/kernel/head_32.h | 101 + 5 files changed, 115 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index a9993e7a443b..92c02d15f117 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -163,6 +163,12 @@ struct thread_struct { #if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) unsigned long kuap; /* opened segments for user access */ #endif +#ifdef CONFIG_VMAP_STACK + unsigned long srr0; + unsigned long srr1; + unsigned long dar; + unsigned long dsisr; +#endif /* Debug Registers */ struct debug_reg debug; struct thread_fp_state fp_state; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 8e1d0195ac36..488d5c4670ff 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -10,10 +10,15 @@ #define _ASM_POWERPC_THREAD_INFO_H #include +#include #ifdef __KERNEL__ +#if defined(CONFIG_VMAP_STACK) && CONFIG_THREAD_SHIFT < PAGE_SHIFT +#define THREAD_SHIFT PAGE_SHIFT +#else #define THREAD_SHIFT CONFIG_THREAD_SHIFT +#endif #define THREAD_SIZE(1 << THREAD_SHIFT) diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c index 484f54dab247..782cbf489ab0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -127,6 +127,12 @@ int main(void) OFFSET(KSP_VSID, thread_struct, ksp_vsid); #else /* CONFIG_PPC64 */ OFFSET(PGDIR, thread_struct, pgdir); +#ifdef CONFIG_VMAP_STACK + OFFSET(SRR0, thread_struct, srr0); + OFFSET(SRR1, thread_struct, srr1); + OFFSET(DAR, thread_struct, dar); + OFFSET(DSISR, thread_struct, dsisr); +#endif #ifdef CONFIG_SPE OFFSET(THREAD_EVR0, thread_struct, evr[0]); OFFSET(THREAD_ACC, thread_struct, acc); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 317ad9df8ba8..2a26fe19f0b1 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -140,6 +140,9 @@ transfer_to_handler: stw r12,_CTR(r11) stw r2,_XER(r11) mfspr r12,SPRN_SPRG_THREAD +#ifdef CONFIG_VMAP_STACK + tovirt(r12, r12) +#endif beq 2f /* if from user, fix up THREAD.regs */ addir2, r12, -THREAD addir11,r1,STACK_FRAME_OVERHEAD @@ -195,7 +198,11 @@ transfer_to_handler: transfer_to_handler_cont: 3: mflrr9 +#ifdef CONFIG_VMAP_STACK + tovirt(r9, r9) +#else tovirt(r2, r2) /* set r2 to current */ +#endif lwz r11,0(r9) /* virtual address of handler */ lwz r9,4(r9)/* where to go when done */ #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index f19a1ab91fb5..59e775930be8 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -10,31 +10,57 @@ * We assume sprg3 has the physical address of the current * task's thread_struct. 
*/ -.macro EXCEPTION_PROLOG - EXCEPTION_PROLOG_0 +.macro EXCEPTION_PROLOG ext + EXCEPTION_PROLOG_0 \ext EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 + EXCEPTION_PROLOG_2 \ext .endm -.macro EXCEPTION_PROLOG_0 +.macro EXCEPTION_PROLOG_0 ext mtspr SPRN_SPRG_SCRATCH0,r10 mtspr SPRN_SPRG_SCRATCH1,r11 +#ifdef CONFIG_VMAP_STACK + mfspr r10, SPRN_SPRG_THREAD + .ifnb \ext + mfspr r11, SPRN_DAR + stw r11, DAR(r10) + mfspr r11, SPRN_DSISR + stw r11, DSISR(r10) + .endif + mfspr r11, SPRN_SRR0 + stw r11, SRR0(r10) +#endif mfspr r11, SPRN_SRR1 /* check whether user or kernel */ +#ifdef CONFIG_VMAP_STACK + stw r11, SRR1(r10) +#endif mfcrr10 andi. r11, r11, MSR_PR .endm .macro EXCEPTION_PROLOG_1 +#ifdef CONFIG_VMAP_STACK + li r11, MSR