[powerpc:next] BUILD SUCCESS 335aca5f65f1a39670944930131da5f2276f888f
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next branch HEAD: 335aca5f65f1a39670944930131da5f2276f888f Merge branch 'scv' support into next elapsed time: 951m configs tested: 74 configs skipped: 1 The following configs have been built successfully. More configs may be tested in the coming days. arm64allyesconfig arm64 defconfig arm64allmodconfig arm64 allnoconfig arm defconfig arm allyesconfig arm allmodconfig arm allnoconfig i386defconfig i386 allyesconfig i386 debian-10.3 i386 allnoconfig ia64 allmodconfig ia64defconfig ia64 allnoconfig ia64 allyesconfig m68k allmodconfig m68k allnoconfig m68k sun3_defconfig m68kdefconfig m68k allyesconfig nios2 defconfig nios2allyesconfig c6x allyesconfig c6x allnoconfig openrisc allyesconfig openriscdefconfig nds32 defconfig nds32 allnoconfig csky allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig h8300allmodconfig xtensa defconfig arc defconfig arc allyesconfig sh allmodconfig shallnoconfig microblazeallnoconfig mips allyesconfig mips allnoconfig mips allmodconfig pariscallnoconfig parisc defconfig parisc allyesconfig parisc allmodconfig powerpc defconfig powerpc allyesconfig powerpc rhel-kconfig powerpc allmodconfig powerpc allnoconfig riscvallyesconfig riscv allnoconfig riscv defconfig riscvallmodconfig s390 allyesconfig s390 allnoconfig s390 allmodconfig s390defconfig sparcallyesconfig sparc defconfig sparc64 allnoconfig sparc64 allmodconfig sparc64 defconfig sparc64 allyesconfig x86_64rhel-7.6-kselftests x86_64 rhel-8.3 x86_64 kexec x86_64 rhel x86_64lkp x86_64 fedora-25 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
Re: [PATCH v2 10/16] powerpc/powernv/pci: Refactor pnv_ioda_alloc_pe()
On 22/07/2020 16:57, Oliver O'Halloran wrote: > Rework the PE allocation logic to allow allocating blocks of PEs rather > than individually. We'll use this to allocate contigious blocks of PEs for > the SR-IOVs. > > This patch also adds code to pnv_ioda_alloc_pe() and pnv_ioda_reserve_pe() to > use the existing, but unused, phb->pe_alloc_mutex. Currently these functions > use atomic bit ops to release a currently allocated PE number. However, > the pnv_ioda_alloc_pe() wants to have exclusive access to the bit map while > scanning for hole large enough to accomodate the allocation size. > > Signed-off-by: Oliver O'Halloran > --- > v2: Add some details about the pe_alloc mutex and why we're using it. Reviewed-by: Alexey Kardashevskiy > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 41 ++- > arch/powerpc/platforms/powernv/pci.h | 2 +- > 2 files changed, 34 insertions(+), 9 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index 2d36a9ebf0e9..c9c25fb0783c 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -145,23 +145,45 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, > int pe_no) > return; > } > > + mutex_lock(>ioda.pe_alloc_mutex); > if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) > pr_debug("%s: PE %x was reserved on PHB#%x\n", >__func__, pe_no, phb->hose->global_number); > + mutex_unlock(>ioda.pe_alloc_mutex); > > pnv_ioda_init_pe(phb, pe_no); > } > > -struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) > +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count) > { > - long pe; > + struct pnv_ioda_pe *ret = NULL; > + int run = 0, pe, i; > > + mutex_lock(>ioda.pe_alloc_mutex); > + > + /* scan backwards for a run of @count cleared bits */ > for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) { > - if (!test_and_set_bit(pe, phb->ioda.pe_alloc)) > - return pnv_ioda_init_pe(phb, pe); > + if (test_bit(pe, phb->ioda.pe_alloc)) { > + run = 0; > + continue; > + } > + > + run++; > + if (run == count) > + break; > } > + if (run != count) > + goto out; > > - return NULL; > + for (i = pe; i < pe + count; i++) { > + set_bit(i, phb->ioda.pe_alloc); > + pnv_ioda_init_pe(phb, i); > + } > + ret = >ioda.pe_array[pe]; > + > +out: > + mutex_unlock(>ioda.pe_alloc_mutex); > + return ret; > } > > void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) > @@ -173,7 +195,10 @@ void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) > WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */ > kfree(pe->npucomp); > memset(pe, 0, sizeof(struct pnv_ioda_pe)); > + > + mutex_lock(>ioda.pe_alloc_mutex); > clear_bit(pe_num, phb->ioda.pe_alloc); > + mutex_unlock(>ioda.pe_alloc_mutex); > } > > /* The default M64 BAR is shared by all PEs */ > @@ -976,7 +1001,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct > pci_dev *dev) > if (pdn->pe_number != IODA_INVALID_PE) > return NULL; > > - pe = pnv_ioda_alloc_pe(phb); > + pe = pnv_ioda_alloc_pe(phb, 1); > if (!pe) { > pr_warn("%s: Not enough PE# available, disabling device\n", > pci_name(dev)); > @@ -1047,7 +1072,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct > pci_bus *bus, bool all) > > /* The PE number isn't pinned by M64 */ > if (!pe) > - pe = pnv_ioda_alloc_pe(phb); > + pe = pnv_ioda_alloc_pe(phb, 1); > > if (!pe) { > pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n", > @@ -3065,7 +3090,7 @@ static void __init pnv_pci_init_ioda_phb(struct > device_node *np, > pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx); > } else { > /* otherwise just allocate one */ > - root_pe = pnv_ioda_alloc_pe(phb); > + root_pe = pnv_ioda_alloc_pe(phb, 1); > phb->ioda.root_pe_idx = root_pe->pe_number; > } > > diff --git a/arch/powerpc/platforms/powernv/pci.h > b/arch/powerpc/platforms/powernv/pci.h > index 23fc5e391c7f..06431a452130 100644 > --- a/arch/powerpc/platforms/powernv/pci.h > +++ b/arch/powerpc/platforms/powernv/pci.h > @@ -224,7 +224,7 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct > pnv_ioda_pe *pe); > void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); > void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe); > > -struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb); > +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count); >
Re: [PATCH] powerpc/64s: Fix irq tracing corruption in interrupt/syscall return caused by perf interrupts
On 23/07/2020 20:29, Nicholas Piggin wrote: > Excerpts from Alexey Kardashevskiy's message of July 22, 2020 8:50 pm: >> >> >> On 22/07/2020 17:34, Nicholas Piggin wrote: >>> Alexey reports lockdep_assert_irqs_enabled() warnings when stress testing >>> perf, e.g., >>> >>> WARNING: CPU: 0 PID: 1556 at kernel/softirq.c:169 >>> __local_bh_enable_ip+0x258/0x270 >>> CPU: 0 PID: 1556 Comm: syz-executor >>> NIP: c01ec888 LR: c01ec884 CTR: c0ef0610 >>> REGS: c00022d4f8a0 TRAP: 0700 Not tainted (5.8.0-rc3-x) >>> MSR: 80029033 CR: 28008844 XER: 2004 >>> CFAR: c01dc1d0 IRQMASK: 0 >>> >>> The interesting thing is MSR[EE] and IRQMASK shows interrupts are enabled, >>> suggesting the current->hardirqs_enabled irq tracing state is going out of >>> sync >>> with the actual interrupt enable state. >>> >>> The cause is a window in interrupt/syscall return where irq tracing state >>> is being >>> adjusted for an irqs-enabled return while MSR[EE] is still enabled. A perf >>> interrupt hits and ends up calling trace_hardirqs_off() when restoring >>> interrupt flags to a disable state. >>> >>> Fix this by disabling perf interrupts as well while adjusting irq tracing >>> state. >>> >>> Add a debug check that catches the condition sooner. >>> >>> Fixes: 68b34588e202 ("powerpc/64/sycall: Implement syscall entry/exit logic >>> in C") >>> Reported-by: Alexey Kardashevskiy >>> Signed-off-by: Nicholas Piggin >>> --- >>> >>> I can reproduce similar symptoms and this patch fixes my test case, >>> still trying to confirm Alexey's test case or whether there's another >>> similar bug causing it. >> >> >> This does not fix my testcase. I applied this on top of 4fa640dc5230 >> ("Merge tag 'vfio-v5.8-rc7' of git://github.com/awilliam/linux-vfio into >> master") without any of my testing code, just to be clear. Sorry... > > Okay it seems to be a bigger problem and not actually caused by that > patch but was possible for lockdep hardirqs_enabled state to get out > of synch with the local_irq_disable() state before that too. Root > cause is similar -- perf interrupts hitting between updating the two > different bits of state. > > Not quite sure why Alexey's test wasn't hitting it before the patch, > but possibly the way masked interrupts get replayed. But I was able > to hit the problem with a different assertion. > > I think I have a fix, but it seems to be a generic irq tracing code > issue. So this patch can be dropped, and it's not an urgent issue for > the next release (it only triggers warns on rare occasions and only > when lockdep is enabled). I would still like to understand how the last curr->hardirq_disable_event misses the ftrace buffer and we end up in the original interrupted kernel code... -- Alexey
Re: [PATCH v2 14/14] powerpc/eeh: Move PE tree setup into the platform
On Fri, Jul 24, 2020 at 3:01 PM Alexey Kardashevskiy wrote: > > > > On 22/07/2020 14:26, Oliver O'Halloran wrote: > > The EEH core has a concept of a "PE tree" to support PowerNV. The PE tree > > follows the PCI bus structures because a reset asserted on an upstream > > bridge will be propagated to the downstream bridges. On pseries there's a > > 1-1 correspondence between what the guest sees are a PHB and a PE so the > > "tree" is really just a single node. > > > > Current the EEH core is reponsible for setting up this PE tree which it > > does by traversing the pci_dn tree. The structure of the pci_dn tree > > matches the bus tree on PowerNV which leads to the PE tree being "correct" > > this setup method doesn't make a whole lot of sense and it's actively > > confusing for the pseries case where it doesn't really do anything. > > > > We want to remove the dependence on pci_dn anyway so this patch move > > choosing where to insert a new PE into the platform code rather than > > being part of the generic EEH code. For PowerNV this simplifies the > > tree building logic and removes the use of pci_dn. For pseries we > > keep the existing logic. I'm not really convinced it does anything > > due to the 1-1 PE-to-PHB correspondence so every device under that > > PHB should be in the same PE, but I'd rather not remove it entirely > > until we've had a chance to look at it more deeply. > > > > Signed-off-by: Oliver O'Halloran > > Reviewed-by: Alexey Kardashevskiy > > --- > > v2: Reworked pseries PE setup slightly. NOT DONE YET. mostly done needs test > > So far it was looking good and now this :) > > When is it going to be done? Is this the broken stuff you mentioned > elsewhere? I am a dumb. I put those in there to remind myself what I have / haven't done when respinning a series. I added that before I tested it and forgot to remove the comment. > > > -- > Alexey
Re: [PATCH v2 01/14] powerpc/eeh: Remove eeh_dev_phb_init_dynamic()
On 22/07/2020 14:26, Oliver O'Halloran wrote: > This function is a one line wrapper around eeh_phb_pe_create() and despite > the name it doesn't create any eeh_dev structures. The "eeh_dev_phb_init_dynamic" name does not suggest anything really but the comment does. Reviewed-by: Alexey Kardashevskiy > Replace it with direct > calls to eeh_phb_pe_create() since that does what it says on the tin > and removes a layer of indirection. > > Signed-off-by: Oliver O'Halloran > --- > v2: Added sub prototype of eeh_phb_pe_create() for the !CONFIG_EEH case. > --- > arch/powerpc/include/asm/eeh.h | 3 ++- > arch/powerpc/kernel/eeh.c | 2 +- > arch/powerpc/kernel/eeh_dev.c | 13 - > arch/powerpc/kernel/of_platform.c | 4 ++-- > arch/powerpc/platforms/pseries/pci_dlpar.c | 2 +- > 5 files changed, 6 insertions(+), 18 deletions(-) > > diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h > index 964a54292b36..64487b88c569 100644 > --- a/arch/powerpc/include/asm/eeh.h > +++ b/arch/powerpc/include/asm/eeh.h > @@ -294,7 +294,6 @@ const char *eeh_pe_loc_get(struct eeh_pe *pe); > struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); > > struct eeh_dev *eeh_dev_init(struct pci_dn *pdn); > -void eeh_dev_phb_init_dynamic(struct pci_controller *phb); > void eeh_show_enabled(void); > int __init eeh_ops_register(struct eeh_ops *ops); > int __exit eeh_ops_unregister(const char *name); > @@ -370,6 +369,8 @@ void pseries_eeh_init_edev_recursive(struct pci_dn *pdn); > #else > static inline void pseries_eeh_add_device_early(struct pci_dn *pdn) { } > static inline void pseries_eeh_add_device_tree_early(struct pci_dn *pdn) { } > + > +static inline int eeh_phb_pe_create(struct pci_controller *phb) { return 0; } > #endif > > #ifdef CONFIG_PPC64 > diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c > index d407981dec76..859f76020256 100644 > --- a/arch/powerpc/kernel/eeh.c > +++ b/arch/powerpc/kernel/eeh.c > @@ -1096,7 +1096,7 @@ static int eeh_init(void) > > /* Initialize PHB PEs */ > list_for_each_entry_safe(hose, tmp, _list, list_node) > - eeh_dev_phb_init_dynamic(hose); > + eeh_phb_pe_create(hose); > > eeh_addr_cache_init(); > > diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c > index 7370185c7a05..8e159a12f10c 100644 > --- a/arch/powerpc/kernel/eeh_dev.c > +++ b/arch/powerpc/kernel/eeh_dev.c > @@ -52,16 +52,3 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) > > return edev; > } > - > -/** > - * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB > - * @phb: PHB > - * > - * Scan the PHB OF node and its child association, then create the > - * EEH devices accordingly > - */ > -void eeh_dev_phb_init_dynamic(struct pci_controller *phb) > -{ > - /* EEH PE for PHB */ > - eeh_phb_pe_create(phb); > -} > diff --git a/arch/powerpc/kernel/of_platform.c > b/arch/powerpc/kernel/of_platform.c > index 71a3f97dc988..f89376ff633e 100644 > --- a/arch/powerpc/kernel/of_platform.c > +++ b/arch/powerpc/kernel/of_platform.c > @@ -62,8 +62,8 @@ static int of_pci_phb_probe(struct platform_device *dev) > /* Init pci_dn data structures */ > pci_devs_phb_init_dynamic(phb); > > - /* Create EEH PEs for the PHB */ > - eeh_dev_phb_init_dynamic(phb); > + /* Create EEH PE for the PHB */ > + eeh_phb_pe_create(phb); > > /* Scan the bus */ > pcibios_scan_phb(phb); > diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c > b/arch/powerpc/platforms/pseries/pci_dlpar.c > index b3a38f5a6b68..f9ae17e8a0f4 100644 > --- a/arch/powerpc/platforms/pseries/pci_dlpar.c > +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c > @@ -34,7 +34,7 @@ struct pci_controller *init_phb_dynamic(struct device_node > *dn) > pci_devs_phb_init_dynamic(phb); > > /* Create EEH devices for the PHB */ > - eeh_dev_phb_init_dynamic(phb); > + eeh_phb_pe_create(phb); > > if (dn->child) > pseries_eeh_init_edev_recursive(PCI_DN(dn)); > -- Alexey
Re: [PATCH v2 14/14] powerpc/eeh: Move PE tree setup into the platform
On 22/07/2020 14:26, Oliver O'Halloran wrote: > The EEH core has a concept of a "PE tree" to support PowerNV. The PE tree > follows the PCI bus structures because a reset asserted on an upstream > bridge will be propagated to the downstream bridges. On pseries there's a > 1-1 correspondence between what the guest sees are a PHB and a PE so the > "tree" is really just a single node. > > Current the EEH core is reponsible for setting up this PE tree which it > does by traversing the pci_dn tree. The structure of the pci_dn tree > matches the bus tree on PowerNV which leads to the PE tree being "correct" > this setup method doesn't make a whole lot of sense and it's actively > confusing for the pseries case where it doesn't really do anything. > > We want to remove the dependence on pci_dn anyway so this patch move > choosing where to insert a new PE into the platform code rather than > being part of the generic EEH code. For PowerNV this simplifies the > tree building logic and removes the use of pci_dn. For pseries we > keep the existing logic. I'm not really convinced it does anything > due to the 1-1 PE-to-PHB correspondence so every device under that > PHB should be in the same PE, but I'd rather not remove it entirely > until we've had a chance to look at it more deeply. > > Signed-off-by: Oliver O'Halloran > Reviewed-by: Alexey Kardashevskiy > --- > v2: Reworked pseries PE setup slightly. NOT DONE YET. mostly done needs test So far it was looking good and now this :) When is it going to be done? Is this the broken stuff you mentioned elsewhere? -- Alexey
[powerpc:fixes-test] BUILD SUCCESS 590ce02bd148cd35721560c140e3759e39a6e56a
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git fixes-test branch HEAD: 590ce02bd148cd35721560c140e3759e39a6e56a powerpc/64s: Fix irq tracing corruption in interrupt/syscall return caused by perf interrupts elapsed time: 905m configs tested: 74 configs skipped: 1 The following configs have been built successfully. More configs may be tested in the coming days. arm defconfig arm allyesconfig arm allmodconfig arm allnoconfig arm64allyesconfig arm64 defconfig arm64allmodconfig arm64 allnoconfig i386 allyesconfig i386defconfig i386 debian-10.3 i386 allnoconfig ia64 allmodconfig ia64defconfig ia64 allnoconfig ia64 allyesconfig m68k allmodconfig m68k allnoconfig m68k sun3_defconfig m68kdefconfig m68k allyesconfig nds32 defconfig nds32 allnoconfig csky allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig h8300allmodconfig xtensa defconfig arc defconfig arc allyesconfig sh allmodconfig shallnoconfig microblazeallnoconfig nios2 defconfig nios2allyesconfig openriscdefconfig c6x allyesconfig c6x allnoconfig openrisc allyesconfig mips allyesconfig mips allnoconfig mips allmodconfig pariscallnoconfig parisc defconfig parisc allyesconfig parisc allmodconfig powerpc defconfig powerpc allyesconfig powerpc rhel-kconfig powerpc allmodconfig powerpc allnoconfig riscvallyesconfig riscv allnoconfig riscv defconfig riscvallmodconfig s390 allyesconfig s390 allnoconfig s390 allmodconfig s390defconfig sparcallyesconfig sparc defconfig sparc64 defconfig sparc64 allnoconfig sparc64 allyesconfig sparc64 allmodconfig x86_64rhel-7.6-kselftests x86_64 rhel-8.3 x86_64 kexec x86_64 rhel x86_64lkp x86_64 fedora-25 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
Re: [PATCH v5 4/7] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.
On Thu, Jul 23, 2020 at 01:07:21PM -0700, Ram Pai wrote: > The Ultravisor is expected to explicitly call H_SVM_PAGE_IN for all the > pages of the SVM before calling H_SVM_INIT_DONE. This causes a huge > delay in tranistioning the VM to SVM. The Ultravisor is only interested > in the pages that contain the kernel, initrd and other important data > structures. The rest contain throw-away content. > > However if not all pages are requested by the Ultravisor, the Hypervisor > continues to consider the GFNs corresponding to the non-requested pages > as normal GFNs. This can lead to data-corruption and undefined behavior. > > In H_SVM_INIT_DONE handler, move all the PFNs associated with the SVM's > GFNs to secure-PFNs. Skip the GFNs that are already Paged-in or Shared > or Paged-in followed by a Paged-out. > > Cc: Paul Mackerras > Cc: Benjamin Herrenschmidt > Cc: Michael Ellerman > Cc: Bharata B Rao > Cc: Aneesh Kumar K.V > Cc: Sukadev Bhattiprolu > Cc: Laurent Dufour > Cc: Thiago Jung Bauermann > Cc: David Gibson > Cc: Claudio Carvalho > Cc: kvm-...@vger.kernel.org > Cc: linuxppc-dev@lists.ozlabs.org > Signed-off-by: Ram Pai > --- > Documentation/powerpc/ultravisor.rst| 2 + > arch/powerpc/include/asm/kvm_book3s_uvmem.h | 2 + > arch/powerpc/kvm/book3s_hv_uvmem.c | 136 > +--- > 3 files changed, 127 insertions(+), 13 deletions(-) > > diff --git a/Documentation/powerpc/ultravisor.rst > b/Documentation/powerpc/ultravisor.rst > index a1c8c37..ba6b1bf 100644 > --- a/Documentation/powerpc/ultravisor.rst > +++ b/Documentation/powerpc/ultravisor.rst > @@ -934,6 +934,8 @@ Return values > * H_UNSUPPORTED if called from the wrong context (e.g. > from an SVM or before an H_SVM_INIT_START > hypercall). > + * H_STATE if the hypervisor could not successfully > +transition the VM to Secure VM. > > Description > ~~~ > diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h > b/arch/powerpc/include/asm/kvm_book3s_uvmem.h > index 9cb7d8b..f229ab5 100644 > --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h > +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h > @@ -23,6 +23,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm, > unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm); > void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, >struct kvm *kvm, bool skip_page_out); > +int kvmppc_uv_migrate_mem_slot(struct kvm *kvm, > + const struct kvm_memory_slot *memslot); I still don't see why this be a global function. You should be able to move around a few functions in book3s_hv_uvmem.c up/down and satisfy the calling order dependencies. Otherwise, Reviewed-by: Bharata B Rao
Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
On 23/07/2020 23:11, Nicholas Piggin wrote: > Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm: >> On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote: >> >>> diff --git a/arch/powerpc/include/asm/hw_irq.h >>> b/arch/powerpc/include/asm/hw_irq.h >>> index 3a0db7b0b46e..35060be09073 100644 >>> --- a/arch/powerpc/include/asm/hw_irq.h >>> +++ b/arch/powerpc/include/asm/hw_irq.h >>> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void) >>> #define powerpc_local_irq_pmu_save(flags) \ >>> do { \ >>> raw_local_irq_pmu_save(flags); \ >>> - trace_hardirqs_off(); \ >>> + if (!raw_irqs_disabled_flags(flags))\ >>> + trace_hardirqs_off(); \ >>> } while(0) >>> #define powerpc_local_irq_pmu_restore(flags) \ >>> do {\ >>> - if (raw_irqs_disabled_flags(flags)) { \ >>> - raw_local_irq_pmu_restore(flags); \ >>> - trace_hardirqs_off(); \ >>> - } else {\ >>> + if (!raw_irqs_disabled_flags(flags))\ >>> trace_hardirqs_on();\ >>> - raw_local_irq_pmu_restore(flags); \ >>> - } \ >>> + raw_local_irq_pmu_restore(flags); \ >>> } while(0) >> >> You shouldn't be calling lockdep from NMI context! > > After this patch it doesn't. > > trace_hardirqs_on/off implementation appears to expect to be called in NMI > context though, for some reason. > >> That is, I recently >> added suport for that on x86: >> >> https://lkml.kernel.org/r/20200623083721.155449...@infradead.org >> https://lkml.kernel.org/r/20200623083721.216740...@infradead.org >> >> But you need to be very careful on how you order things, as you can see >> the above relies on preempt_count() already having been incremented with >> NMI_MASK. > > Hmm. My patch seems simpler. And your patches fix my error while Peter's do not: IRQs not enabled as expected WARNING: CPU: 0 PID: 1377 at /home/aik/p/kernel/kernel/softirq.c:169 __local_bh_enable_ip+0x118/0x190 > > I don't know this stuff very well, I don't really understand what your patch > enables for x86 but at least it shouldn't be incompatible with this one > AFAIKS. > > Thanks, > Nick > -- Alexey
Re: [PATCH 15/15] powerpc/powernv/sriov: Make single PE mode a per-BAR setting
On Wed, Jul 22, 2020 at 8:06 PM Alexey Kardashevskiy wrote: > > >> Well, realistically the segment size should be 8MB to make this matter > >> (or the whole window 2GB) which does not seem to happen so it does not > >> matter. > > > > I'm not sure what you mean. > > I mean how can we possibly hit this case, what m64_segsize would the > platform have to trigger this. The whole check seems useless but whatever. Yeah maybe. IIRC some old P8 FSP systems had tiny M64 windows so it might have been an issue there. Maybe we can get rid of it., but I'd rather just leave the behaviour as-is for now.
Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
Hi Nicholas, I love your patch! Perhaps something to improve: [auto build test WARNING on linux/master] [also build test WARNING on powerpc/next linus/master v5.8-rc6 next-20200723] [cannot apply to tip/locking/core] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Nicholas-Piggin/lockdep-improve-current-hard-soft-irqs_enabled-synchronisation-with-actual-irq-state/20200723-185938 base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9ebcfadb0610322ac537dd7aa5d9cbc2b2894c68 config: i386-randconfig-s001-20200723 (attached as .config) compiler: gcc-9 (Debian 9.3.0-14) 9.3.0 reproduce: # apt-get install sparse # sparse version: v0.6.2-93-g4c6cbe55-dirty # save the attached .config to linux build tree make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=i386 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot sparse warnings: (new ones prefixed by >>) kernel/locking/spinlock.c:149:17: sparse: sparse: context imbalance in '_raw_spin_lock' - wrong count at exit kernel/locking/spinlock.c: note: in included file (through include/linux/preempt.h): >> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in >> '_raw_spin_lock_irqsave' - wrong count at exit >> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in >> '_raw_spin_lock_irq' - wrong count at exit kernel/locking/spinlock.c:173:17: sparse: sparse: context imbalance in '_raw_spin_lock_bh' - wrong count at exit kernel/locking/spinlock.c:181:17: sparse: sparse: context imbalance in '_raw_spin_unlock' - unexpected unlock kernel/locking/spinlock.c:189:17: sparse: sparse: context imbalance in '_raw_spin_unlock_irqrestore' - unexpected unlock kernel/locking/spinlock.c:197:17: sparse: sparse: context imbalance in '_raw_spin_unlock_irq' - unexpected unlock kernel/locking/spinlock.c:205:17: sparse: sparse: context imbalance in '_raw_spin_unlock_bh' - unexpected unlock kernel/locking/spinlock.c:221:17: sparse: sparse: context imbalance in '_raw_read_lock' - wrong count at exit >> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in >> '_raw_read_lock_irqsave' - wrong count at exit >> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in >> '_raw_read_lock_irq' - wrong count at exit kernel/locking/spinlock.c:245:17: sparse: sparse: context imbalance in '_raw_read_lock_bh' - wrong count at exit kernel/locking/spinlock.c:253:17: sparse: sparse: context imbalance in '_raw_read_unlock' - unexpected unlock kernel/locking/spinlock.c:261:17: sparse: sparse: context imbalance in '_raw_read_unlock_irqrestore' - unexpected unlock kernel/locking/spinlock.c:269:17: sparse: sparse: context imbalance in '_raw_read_unlock_irq' - unexpected unlock kernel/locking/spinlock.c:277:17: sparse: sparse: context imbalance in '_raw_read_unlock_bh' - unexpected unlock kernel/locking/spinlock.c:293:17: sparse: sparse: context imbalance in '_raw_write_lock' - wrong count at exit >> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in >> '_raw_write_lock_irqsave' - wrong count at exit >> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in >> '_raw_write_lock_irq' - wrong count at exit kernel/locking/spinlock.c:317:17: sparse: sparse: context imbalance in '_raw_write_lock_bh' - wrong count at exit kernel/locking/spinlock.c:325:17: sparse: sparse: context imbalance in '_raw_write_unlock' - unexpected unlock kernel/locking/spinlock.c:333:17: sparse: sparse: context imbalance in '_raw_write_unlock_irqrestore' - unexpected unlock kernel/locking/spinlock.c:341:17: sparse: sparse: context imbalance in '_raw_write_unlock_irq' - unexpected unlock kernel/locking/spinlock.c:349:17: sparse: sparse: context imbalance in '_raw_write_unlock_bh' - unexpected unlock kernel/locking/spinlock.c:358:17: sparse: sparse: context imbalance in '_raw_spin_lock_nested' - wrong count at exit >> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in >> '_raw_spin_lock_irqsave_nested' - wrong count at exit kernel/locking/spinlock.c:380:17: sparse: sparse: context imbalance in '_raw_spin_lock_nest_lock' - wrong count at exit -- kernel/trace/ring_buffer.c:699:32: sparse: sparse: incorrect type in return expression (different base types) @@ expected restricted __poll_t @@ got int @@ kernel/trace/ring_buffer.c:699:32: sparse: expected restricted __poll_t kernel/trace/ring_buffer.c:699:32: sparse: got int kernel/trace/ring_buffer.c: note: in included file (through include/linux/irqflags.h,
Re: [PATCH v5 7/7] KVM: PPC: Book3S HV: rework secure mem slot dropping
On Thu, Jul 23, 2020 at 01:07:24PM -0700, Ram Pai wrote: > From: Laurent Dufour > > When a secure memslot is dropped, all the pages backed in the secure > device (aka really backed by secure memory by the Ultravisor) > should be paged out to a normal page. Previously, this was > achieved by triggering the page fault mechanism which is calling > kvmppc_svm_page_out() on each pages. > > This can't work when hot unplugging a memory slot because the memory > slot is flagged as invalid and gfn_to_pfn() is then not trying to access > the page, so the page fault mechanism is not triggered. > > Since the final goal is to make a call to kvmppc_svm_page_out() it seems > simpler to call directly instead of triggering such a mechanism. This > way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a > memslot. > > Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock, > the call to __kvmppc_svm_page_out() is made. As > __kvmppc_svm_page_out needs the vma pointer to migrate the pages, > the VMA is fetched in a lazy way, to not trigger find_vma() all > the time. In addition, the mmap_sem is held in read mode during > that time, not in write mode since the virual memory layout is not > impacted, and kvm->arch.uvmem_lock prevents concurrent operation > on the secure device. > > Cc: Ram Pai > Cc: Bharata B Rao > Cc: Paul Mackerras > Signed-off-by: Ram Pai > [modified the changelog description] > Signed-off-by: Laurent Dufour > --- > arch/powerpc/kvm/book3s_hv_uvmem.c | 54 > ++ > 1 file changed, 37 insertions(+), 17 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c > b/arch/powerpc/kvm/book3s_hv_uvmem.c > index c772e92..daffa6e 100644 > --- a/arch/powerpc/kvm/book3s_hv_uvmem.c > +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c > @@ -632,35 +632,55 @@ static inline int kvmppc_svm_page_out(struct > vm_area_struct *vma, > * fault on them, do fault time migration to replace the device PTEs in > * QEMU page table with normal PTEs from newly allocated pages. > */ > -void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, > +void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, >struct kvm *kvm, bool skip_page_out) > { > int i; > struct kvmppc_uvmem_page_pvt *pvt; > - unsigned long pfn, uvmem_pfn; > - unsigned long gfn = free->base_gfn; > + struct page *uvmem_page; > + struct vm_area_struct *vma = NULL; > + unsigned long uvmem_pfn, gfn; > + unsigned long addr, end; > + > + mmap_read_lock(kvm->mm); > + > + addr = slot->userspace_addr; > + end = addr + (slot->npages * PAGE_SIZE); > > - for (i = free->npages; i; --i, ++gfn) { > - struct page *uvmem_page; > + gfn = slot->base_gfn; > + for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) { > + > + /* Fetch the VMA if addr is not in the latest fetched one */ > + if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) { > + vma = find_vma_intersection(kvm->mm, addr, end); > + if (!vma || > + vma->vm_start > addr || vma->vm_end < end) { > + pr_err("Can't find VMA for gfn:0x%lx\n", gfn); > + break; > + } There is a potential issue with the boundary condition check here which I discussed with Laurent yesterday. Guess he hasn't gotten around to look at it yet. Regards, Bharata.
Re: [v4] powerpc/perf: Initialize power10 PMU registers in cpu setup routine
Hi Athira, Thank you for the patch! Yet something to improve: [auto build test ERROR on powerpc/next] [also build test ERROR on v5.8-rc6 next-20200723] [cannot apply to mpe/next scottwood/next] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Athira-Rajeev/powerpc-perf-Initialize-power10-PMU-registers-in-cpu-setup-routine/20200723-153537 base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next config: powerpc-defconfig (attached as .config) compiler: powerpc64-linux-gcc (GCC) 9.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>): arch/powerpc/kernel/cpu_setup_power.S: Assembler messages: >> arch/powerpc/kernel/cpu_setup_power.S:244: Error: non-constant expression in >> ".if" statement >> arch/powerpc/kernel/cpu_setup_power.S:244: Error: non-constant expression in >> ".if" statement >> arch/powerpc/kernel/cpu_setup_power.S:243: Error: unsupported relocation >> against SPRN_MMCR3 vim +244 arch/powerpc/kernel/cpu_setup_power.S 14 15 /* Entry: r3 = crap, r4 = ptr to cputable entry 16 * 17 * Note that we can be called twice for pseudo-PVRs 18 */ 19 _GLOBAL(__setup_cpu_power7) 20 mflrr11 21 bl __init_hvmode_206 22 mtlrr11 23 beqlr 24 li r0,0 25 mtspr SPRN_LPID,r0 26 LOAD_REG_IMMEDIATE(r0, PCR_MASK) 27 mtspr SPRN_PCR,r0 28 mfspr r3,SPRN_LPCR 29 li r4,(LPCR_LPES1 >> LPCR_LPES_SH) 30 bl __init_LPCR_ISA206 31 mtlrr11 32 blr 33 34 _GLOBAL(__restore_cpu_power7) 35 mflrr11 36 mfmsr r3 37 rldicl. r0,r3,4,63 38 beqlr 39 li r0,0 40 mtspr SPRN_LPID,r0 41 LOAD_REG_IMMEDIATE(r0, PCR_MASK) 42 mtspr SPRN_PCR,r0 43 mfspr r3,SPRN_LPCR 44 li r4,(LPCR_LPES1 >> LPCR_LPES_SH) 45 bl __init_LPCR_ISA206 46 mtlrr11 47 blr 48 49 _GLOBAL(__setup_cpu_power8) 50 mflrr11 51 bl __init_FSCR 52 bl __init_PMU 53 bl __init_PMU_ISA207 54 bl __init_hvmode_206 55 mtlrr11 56 beqlr 57 li r0,0 58 mtspr SPRN_LPID,r0 59 LOAD_REG_IMMEDIATE(r0, PCR_MASK) 60 mtspr SPRN_PCR,r0 61 mfspr r3,SPRN_LPCR 62 ori r3, r3, LPCR_PECEDH 63 li r4,0 /* LPES = 0 */ 64 bl __init_LPCR_ISA206 65 bl __init_HFSCR 66 bl __init_PMU_HV 67 bl __init_PMU_HV_ISA207 68 mtlrr11 69 blr 70 71 _GLOBAL(__restore_cpu_power8) 72 mflrr11 73 bl __init_FSCR 74 bl __init_PMU 75 bl __init_PMU_ISA207 76 mfmsr r3 77 rldicl. r0,r3,4,63 78 mtlrr11 79 beqlr 80 li r0,0 81 mtspr SPRN_LPID,r0 82 LOAD_REG_IMMEDIATE(r0, PCR_MASK) 83 mtspr SPRN_PCR,r0 84 mfspr r3,SPRN_LPCR 85 ori r3, r3, LPCR_PECEDH 86 li r4,0 /* LPES = 0 */ 87 bl __init_LPCR_ISA206 88 bl __init_HFSCR 89 bl __init_PMU_HV 90 bl __init_PMU_HV_ISA207 91 mtlrr11 92 blr 93 94 _GLOBAL(__setup_cpu_power10) 95 mflrr11 96 bl __init_FSCR_power10 97 bl __init_PMU 98 bl __init_PMU_ISA31 99 b 1f 100 101 _GLOBAL(__setup_cpu_power9) 102 mflrr11 103 bl __init_FSCR 104 bl __init_PMU 105 1: bl __init_hvmode_206 106 mtlrr11 107 beqlr 108 li r0,0 109 mtspr SPRN_PSSCR,r0 110 mtspr SPRN_LPID,r0 111 mtspr SPRN_PID,r0 112 LOAD_REG_IMMEDIATE(r0, PCR_MASK) 11
Re: [PATCH 2/2] lockdep: warn on redundant or incorrect irq state changes
Hi Nicholas, I love your patch! Yet something to improve: [auto build test ERROR on linux/master] [also build test ERROR on powerpc/next linus/master v5.8-rc6] [cannot apply to tip/locking/core next-20200723] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Nicholas-Piggin/lockdep-improve-current-hard-soft-irqs_enabled-synchronisation-with-actual-irq-state/20200723-185938 base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9ebcfadb0610322ac537dd7aa5d9cbc2b2894c68 config: x86_64-randconfig-a002-20200723 (attached as .config) compiler: gcc-9 (Debian 9.3.0-14) 9.3.0 reproduce (this is a W=1 build): # save the attached .config to linux build tree make W=1 ARCH=x86_64 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>): kernel/locking/lockdep.c: In function 'lockdep_init': >> kernel/locking/lockdep.c:5673:9: error: 'struct task_struct' has no member >> named 'hardirqs_enabled' 5673 | current->hardirqs_enabled = 1; | ^~ >> kernel/locking/lockdep.c:5674:9: error: 'struct task_struct' has no member >> named 'softirqs_enabled' 5674 | current->softirqs_enabled = 1; | ^~ In file included from kernel/locking/lockdep.c:60: At top level: kernel/locking/lockdep_internals.h:64:28: warning: 'LOCKF_USED_IN_IRQ_READ' defined but not used [-Wunused-const-variable=] 64 | static const unsigned long LOCKF_USED_IN_IRQ_READ = |^~ In file included from kernel/locking/lockdep.c:60: kernel/locking/lockdep_internals.h:58:28: warning: 'LOCKF_ENABLED_IRQ_READ' defined but not used [-Wunused-const-variable=] 58 | static const unsigned long LOCKF_ENABLED_IRQ_READ = |^~ In file included from kernel/locking/lockdep.c:60: kernel/locking/lockdep_internals.h:52:28: warning: 'LOCKF_USED_IN_IRQ' defined but not used [-Wunused-const-variable=] 52 | static const unsigned long LOCKF_USED_IN_IRQ = |^ In file included from kernel/locking/lockdep.c:60: kernel/locking/lockdep_internals.h:46:28: warning: 'LOCKF_ENABLED_IRQ' defined but not used [-Wunused-const-variable=] 46 | static const unsigned long LOCKF_ENABLED_IRQ = |^ vim +5673 kernel/locking/lockdep.c 5667 5668 printk(" per task-struct memory footprint: %zu bytes\n", 5669 sizeof(((struct task_struct *)NULL)->held_locks)); 5670 5671 WARN_ON(irqs_disabled()); 5672 > 5673 current->hardirqs_enabled = 1; > 5674 current->softirqs_enabled = 1; 5675 } 5676 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org .config.gz Description: application/gzip
Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
Hi Nicholas, I love your patch! Yet something to improve: [auto build test ERROR on linux/master] [also build test ERROR on powerpc/next linus/master v5.8-rc6 next-20200723] [cannot apply to tip/locking/core] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Nicholas-Piggin/lockdep-improve-current-hard-soft-irqs_enabled-synchronisation-with-actual-irq-state/20200723-185938 base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9ebcfadb0610322ac537dd7aa5d9cbc2b2894c68 config: nds32-allyesconfig (attached as .config) compiler: nds32le-linux-gcc (GCC) 9.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=nds32 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>): In file included from include/asm-generic/bitops.h:14, from ./arch/nds32/include/generated/asm/bitops.h:1, from include/linux/bitops.h:29, from include/linux/kernel.h:12, from include/linux/list.h:9, from include/linux/rculist.h:10, from include/linux/pid.h:5, from include/linux/sched.h:14, from arch/nds32/kernel/asm-offsets.c:4: include/linux/spinlock_api_smp.h: In function '__raw_spin_lock_irq': >> include/linux/irqflags.h:158:31: error: implicit declaration of function >> 'arch_irqs_disabled'; did you mean 'raw_irqs_disabled'? >> [-Werror=implicit-function-declaration] 158 | #define raw_irqs_disabled() (arch_irqs_disabled()) | ^~ include/linux/irqflags.h:174:23: note: in expansion of macro 'raw_irqs_disabled' 174 | bool was_disabled = raw_irqs_disabled(); \ | ^ include/linux/spinlock_api_smp.h:126:2: note: in expansion of macro 'local_irq_disable' 126 | local_irq_disable(); | ^ cc1: some warnings being treated as errors make[2]: *** [scripts/Makefile.build:114: arch/nds32/kernel/asm-offsets.s] Error 1 make[2]: Target '__build' not remade because of errors. make[1]: *** [Makefile:1175: prepare0] Error 2 make[1]: Target 'prepare' not remade because of errors. make: *** [Makefile:185: __sub-make] Error 2 make: Target 'prepare' not remade because of errors. vim +158 include/linux/irqflags.h 81d68a96a398448 Steven Rostedt 2008-05-12 132 df9ee29270c11db David Howells 2010-10-07 133 /* df9ee29270c11db David Howells 2010-10-07 134 * Wrap the arch provided IRQ routines to provide appropriate checks. df9ee29270c11db David Howells 2010-10-07 135 */ df9ee29270c11db David Howells 2010-10-07 136 #define raw_local_irq_disable() arch_local_irq_disable() df9ee29270c11db David Howells 2010-10-07 137 #define raw_local_irq_enable() arch_local_irq_enable() df9ee29270c11db David Howells 2010-10-07 138 #define raw_local_irq_save(flags) \ df9ee29270c11db David Howells 2010-10-07 139 do { \ df9ee29270c11db David Howells 2010-10-07 140 typecheck(unsigned long, flags);\ df9ee29270c11db David Howells 2010-10-07 141 flags = arch_local_irq_save(); \ df9ee29270c11db David Howells 2010-10-07 142 } while (0) df9ee29270c11db David Howells 2010-10-07 143 #define raw_local_irq_restore(flags)\ df9ee29270c11db David Howells 2010-10-07 144 do { \ df9ee29270c11db David Howells 2010-10-07 145 typecheck(unsigned long, flags);\ df9ee29270c11db David Howells 2010-10-07 146 arch_local_irq_restore(flags); \ df9ee29270c11db David Howells 2010-10-07 147 } while (0) df9ee29270c11db David Howells 2010-10-07 148 #define raw_local_save_flags(flags) \ df9ee29270c11db David Howells 2010-10-07 149 do { \ df9ee29270c11db David Howells 2010-10-07 150 typecheck(unsigned long, flags);\ df9ee29270c11db David Howells 2010-10-07 151 flags = arch_local_save_flags();\ df9ee29270c11db David Howells 2010-10-07 152 } while (0) df9ee29270c11db David Howells 2010-10-07 153 #define raw_irqs_disabled_flags(flags)
[powerpc:merge] BUILD SUCCESS d6c13d397d6988ec3e6029cae9e80501364cf9cb
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git merge branch HEAD: d6c13d397d6988ec3e6029cae9e80501364cf9cb Automatic merge of 'master', 'next' and 'fixes' (2020-07-22 23:08) elapsed time: 2204m configs tested: 74 configs skipped: 1 The following configs have been built successfully. More configs may be tested in the coming days. arm defconfig arm allyesconfig arm allmodconfig arm allnoconfig arm64allyesconfig arm64 defconfig arm64allmodconfig arm64 allnoconfig i386defconfig i386 debian-10.3 i386 allnoconfig i386 allyesconfig ia64 allmodconfig ia64defconfig ia64 allnoconfig ia64 allyesconfig m68k allmodconfig m68k allnoconfig m68k sun3_defconfig m68kdefconfig m68k allyesconfig nios2 defconfig nios2allyesconfig openriscdefconfig c6x allyesconfig c6x allnoconfig openrisc allyesconfig nds32 defconfig nds32 allnoconfig csky allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig h8300allmodconfig xtensa defconfig arc defconfig arc allyesconfig sh allmodconfig shallnoconfig microblazeallnoconfig mips allyesconfig mips allnoconfig mips allmodconfig pariscallnoconfig parisc defconfig parisc allyesconfig parisc allmodconfig powerpc defconfig powerpc allyesconfig powerpc rhel-kconfig powerpc allmodconfig powerpc allnoconfig riscvallyesconfig riscv allnoconfig riscv defconfig riscvallmodconfig s390 allyesconfig s390 allnoconfig s390 allmodconfig s390defconfig sparcallyesconfig sparc defconfig sparc64 defconfig sparc64 allnoconfig sparc64 allyesconfig sparc64 allmodconfig x86_64rhel-7.6-kselftests x86_64 rhel-8.3 x86_64 kexec x86_64 rhel x86_64lkp x86_64 fedora-25 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
Re: [PATCH v2 2/3] powerpc/powernv/idle: save-restore DAWR0,DAWRX0 for P10
On Fri, 2020-07-10 at 10:52 +0530, Pratik Rajesh Sampat wrote: > Additional registers DAWR0, DAWRX0 may be lost on Power 10 for > stop levels < 4. > Therefore save the values of these SPRs before entering a "stop" > state and restore their values on wakeup. > > Signed-off-by: Pratik Rajesh Sampat > --- > arch/powerpc/platforms/powernv/idle.c | 10 ++ > 1 file changed, 10 insertions(+) > > diff --git a/arch/powerpc/platforms/powernv/idle.c > b/arch/powerpc/platforms/powernv/idle.c > index 19d94d021357..f2e2a6a4c274 100644 > --- a/arch/powerpc/platforms/powernv/idle.c > +++ b/arch/powerpc/platforms/powernv/idle.c > @@ -600,6 +600,8 @@ struct p9_sprs { > u64 iamr; > u64 amor; > u64 uamor; > + u64 dawr0; > + u64 dawrx0; > }; > > static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) > @@ -687,6 +689,10 @@ static unsigned long power9_idle_stop(unsigned long > psscr, bool mmu_on) > sprs.iamr = mfspr(SPRN_IAMR); > sprs.amor = mfspr(SPRN_AMOR); > sprs.uamor = mfspr(SPRN_UAMOR); > + if (cpu_has_feature(CPU_FTR_ARCH_31)) { Can you add a comment here saying even though DAWR0 is ARCH_30, it's only required to be saved on 31. Otherwise this looks pretty odd. > + sprs.dawr0 = mfspr(SPRN_DAWR0); > + sprs.dawrx0 = mfspr(SPRN_DAWRX0); > + } > > srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */ > > @@ -710,6 +716,10 @@ static unsigned long power9_idle_stop(unsigned long > psscr, bool mmu_on) > mtspr(SPRN_IAMR,sprs.iamr); > mtspr(SPRN_AMOR,sprs.amor); > mtspr(SPRN_UAMOR, sprs.uamor); > + if (cpu_has_feature(CPU_FTR_ARCH_31)) { > + mtspr(SPRN_DAWR0, sprs.dawr0); > + mtspr(SPRN_DAWRX0, sprs.dawrx0); > + } > > /* >* Workaround for POWER9 DD2.0, if we lost resources, the ERAT
[PATCH] powerpc/test_emulate_sstep: Fix build error
ppc64_book3e_allmodconfig fails with: arch/powerpc/lib/test_emulate_step.c: In function 'test_pld': arch/powerpc/lib/test_emulate_step.c:113:7: error: implicit declaration of function 'cpu_has_feature' 113 | if (!cpu_has_feature(CPU_FTR_ARCH_31)) { | ^~~ Add an include of cpu_has_feature.h to fix it. Fixes: b6b54b42722a ("powerpc/sstep: Add tests for prefixed integer load/stores") Signed-off-by: Michael Ellerman --- arch/powerpc/lib/test_emulate_step.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index 081b05480c47..d242e9f72e0c 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "emulate_step_test: " fmt #include +#include #include #include #include -- 2.25.1
Re: [PATCH v4 10/12] ppc64/kexec_file: prepare elfcore header for crashing kernel
Hari Bathini writes: > Prepare elf headers for the crashing kernel's core file using > crash_prepare_elf64_headers() and pass on this info to kdump > kernel by updating its command line with elfcorehdr parameter. > Also, add elfcorehdr location to reserve map to avoid it from > being stomped on while booting. > > Signed-off-by: Hari Bathini > Tested-by: Pingfan Liu Reviewed-by: Thiago Jung Bauermann -- Thiago Jung Bauermann IBM Linux Technology Center
Re: [PATCH v4 09/12] ppc64/kexec_file: setup backup region for kdump kernel
Hari Bathini writes: > Though kdump kernel boots from loaded address, the first 64K bytes > of it is copied down to real 0. So, setup a backup region to copy > the first 64K bytes of crashed kernel, in purgatory, before booting > into kdump kernel. Also, update reserve map with backup region and > crashed kernel's memory to avoid kdump kernel from accidentially > using that memory. > > Reported-by: kernel test robot > [lkp: In v1, purgatory() declaration was missing] > Signed-off-by: Hari Bathini Reviewed-by: Thiago Jung Bauermann Just one minor comment below: > @@ -1047,13 +1120,26 @@ int setup_new_fdt_ppc64(const struct kimage *image, > void *fdt, > goto out; > } > > - /* Ensure we don't touch crashed kernel's memory */ > - ret = fdt_add_mem_rsv(fdt, 0, crashk_res.start); > + /* > + * Ensure we don't touch crashed kernel's memory except the > + * first 64K of RAM, which will be backed up. > + */ > + ret = fdt_add_mem_rsv(fdt, BACKUP_SRC_SIZE, I know BACKUP_SRC_START is 0, but please forgive my pedantry when I say that I think it's clearer if the start address above is changed to BACKUP_SRC_START + BACKUP_SRC_SIZE... > + crashk_res.start - BACKUP_SRC_SIZE); > if (ret) { > pr_err("Error reserving crash memory: %s\n", > fdt_strerror(ret)); > goto out; > } > + > + /* Ensure backup region is not used by kdump/capture kernel */ > + ret = fdt_add_mem_rsv(fdt, image->arch.backup_start, > + BACKUP_SRC_SIZE); > + if (ret) { > + pr_err("Error reserving memory for backup: %s\n", > +fdt_strerror(ret)); > + goto out; > + } > } > > out: -- Thiago Jung Bauermann IBM Linux Technology Center
Re: [PATCH v4 06/12] ppc64/kexec_file: restrict memory usage of kdump kernel
Hari Bathini writes: > Kdump kernel, used for capturing the kernel core image, is supposed > to use only specific memory regions to avoid corrupting the image to > be captured. The regions are crashkernel range - the memory reserved > explicitly for kdump kernel, memory used for the tce-table, the OPAL > region and RTAS region as applicable. Restrict kdump kernel memory > to use only these regions by setting up usable-memory DT property. > Also, tell the kdump kernel to run at the loaded address by setting > the magic word at 0x5c. > > Signed-off-by: Hari Bathini > Tested-by: Pingfan Liu > --- > > v3 -> v4: > * Updated get_node_path() to be an iterative function instead of a > recursive one. > * Added comment explaining why low memory is added to kdump kernel's > usable memory ranges though it doesn't fall in crashkernel region. > * For correctness, added fdt_add_mem_rsv() for the low memory being > added to kdump kernel's usable memory ranges. Good idea. > * Fixed prop pointer update in add_usable_mem_property() and changed > duple to tuple as suggested by Thiago. > +/** > + * get_node_pathlen - Get the full path length of the given node. > + * @dn: Node. > + * > + * Also, counts '/' at the end of the path. > + * For example, /memory@0 will be "/memory@0/\0" => 11 bytes. Wouldn't this function return 10 in the case of /memory@0? Are you saying that it should count the \0 at the end too? it's not doing that, AFAICS. > + * > + * Returns the string length of the node's full path. > + */ Maybe it's me (by analogy with strlen()), but I would expect "string length" to not include the terminating \0. I suggest renaming the function to something like get_node_path_size() and do s/length/size/ in the comment above if it's supposed to count the terminating \0. > +static int get_node_pathlen(struct device_node *dn) > +{ > + int len = 0; > + > + if (!dn) > + return 0; > + > + while (dn) { > + len += strlen(dn->full_name) + 1; > + dn = dn->parent; > + } > + > + return len + 1; > +} > + > +/** > + * get_node_path - Get the full path of the given node. > + * @node: Device node. > + * > + * Allocates buffer for node path. The caller must free the buffer > + * after use. > + * > + * Returns buffer with path on success, NULL otherwise. > + */ > +static char *get_node_path(struct device_node *node) > +{ > + struct device_node *dn; > + int len, idx, nlen; > + char *path = NULL; > + char end_char; > + > + if (!node) > + goto err; > + > + /* > + * Get the path length first and use it to iteratively build the path > + * from node to root. > + */ > + len = get_node_pathlen(node); > + > + /* Allocate memory for node path */ > + path = kzalloc(ALIGN(len, 8), GFP_KERNEL); > + if (!path) > + goto err; > + > + /* > + * Iteratively update path from node to root by decrementing > + * index appropriately. > + * > + * Also, add %NUL at the end of node & '/' at the end of all its > + * parent nodes. > + */ > + dn = node; > + path[0] = '/'; > + idx = len - 1; Here, idx is pointing to the supposed '/' at the end of the node path ... > + end_char = '\0'; > + while (dn->parent) { > + path[--idx] = end_char; .. and in the first ireation, this is writing '\0' at a place which will be overwritten by the memcpy() below with the last character of dn->full_name. You need to start idx with len, not len - 1. > + end_char = '/'; > + > + nlen = strlen(dn->full_name); > + idx -= nlen; > + memcpy(path + idx, dn->full_name, nlen); > + > + dn = dn->parent; > + } > + > + return path; > +err: > + kfree(path); > + return NULL; > +} -- Thiago Jung Bauermann IBM Linux Technology Center
Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone
On Thu, 2020-07-23 at 01:21 -0400, Alex Ghiti wrote: > > works fine with huge pages, what is your problem there ? You rely on > > punching small-page size holes in there ? > > > > ARCH_HAS_STRICT_KERNEL_RWX prevents the use of a hugepage for the kernel > mapping in the direct mapping as it sets different permissions to > different part of the kernel (data, text..etc). Ah ok, that can be solved in a couple of ways... One is to use the linker script to ensure those sections are linked HUGE_PAGE_SIZE appart and moved appropriately by early boot code. One is to selectively degrade just those huge pages. I'm not familiar with the RiscV MMU (I should probably go have a look) but if it's a classic radix tree with huge pages at PUD/PMD level, then you could just degrade the one(s) that cross those boundaries. Cheers, Ben.
Re: [PATCH v4 04/12] ppc64/kexec_file: avoid stomping memory used by special regions
Hari Bathini writes: > crashkernel region could have an overlap with special memory regions > like opal, rtas, tce-table & such. These regions are referred to as > exclude memory ranges. Setup this ranges during image probe in order > to avoid them while finding the buffer for different kdump segments. > Override arch_kexec_locate_mem_hole() to locate a memory hole taking > these ranges into account. > > Signed-off-by: Hari Bathini Reviewed-by: Thiago Jung Bauermann -- Thiago Jung Bauermann IBM Linux Technology Center
Re: [PATCH v4 03/12] powerpc/kexec_file: add helper functions for getting memory ranges
Hari Bathini writes: > In kexec case, the kernel to be loaded uses the same memory layout as > the running kernel. So, passing on the DT of the running kernel would > be good enough. > > But in case of kdump, different memory ranges are needed to manage > loading the kdump kernel, booting into it and exporting the elfcore > of the crashing kernel. The ranges are exclude memory ranges, usable > memory ranges, reserved memory ranges and crash memory ranges. > > Exclude memory ranges specify the list of memory ranges to avoid while > loading kdump segments. Usable memory ranges list the memory ranges > that could be used for booting kdump kernel. Reserved memory ranges > list the memory regions for the loading kernel's reserve map. Crash > memory ranges list the memory ranges to be exported as the crashing > kernel's elfcore. > > Add helper functions for setting up the above mentioned memory ranges. > This helpers facilitate in understanding the subsequent changes better > and make it easy to setup the different memory ranges listed above, as > and when appropriate. > > Signed-off-by: Hari Bathini > Tested-by: Pingfan Liu Just one comment below, but regardless: Reviewed-by: Thiago Jung Bauermann > +/** > + * add_htab_mem_range - Adds htab range to the given memory ranges list, > + * if it exists > + * @mem_ranges: Range list to add the memory range to. > + * > + * Returns 0 on success, negative errno on error. > + */ > +int add_htab_mem_range(struct crash_mem **mem_ranges) > +{ > + if (!htab_address) > + return 0; > + > + return add_mem_range(mem_ranges, __pa(htab_address), htab_size_bytes); > +} I believe you need to surround this function with `#ifdef CONFIG_PPC_BOOK3S_64` and `#endif` to match what is done in . -- Thiago Jung Bauermann IBM Linux Technology Center
Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
On 7/23/20 3:58 PM, pet...@infradead.org wrote: On Thu, Jul 23, 2020 at 03:04:13PM -0400, Waiman Long wrote: On 7/23/20 2:47 PM, pet...@infradead.org wrote: On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote: BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch? I will have to update the patch to fix the reported 0-day test problem, but I want to collect other feedback before sending out v3. I want to say I hate it all, it adds instructions to a path we spend an aweful lot of time optimizing without really getting anything back for it. It does add some extra instruction that may slow it down slightly, but I don't agree that it gives nothing back. The cpu lock holder information can be useful in analyzing crash dumps and in some debugging situation. I think it can be useful in RHEL for this readon. How about an x86 config option to allow distros to decide if they want to have it enabled? I will make sure that it will have no performance degradation if the option is not enabled. Config knobs suck too; they create a maintenance burden (we get to make sure all the permutations works/build/etc..) and effectively nobody uses them, since world+dog uses what distros pick. Anyway, instead of adding a second per-cpu variable, can you see how horrible something like this is: unsigned char adds(unsigned char var, unsigned char val) { unsigned short sat = 0xff, tmp = var; asm ("addb %[val], %b[var];" "cmovc%[sat], %[var];" : [var] "+r" (tmp) : [val] "ir" (val), [sat] "r" (sat) ); return tmp; } Another thing to try is, instead of threading that lockval throughout the thing, simply: #define _Q_LOCKED_VAL this_cpu_read_stable(cpu_sat) or combined with the above #define _Q_LOCKED_VAL adds(this_cpu_read_stable(cpu_number), 2) and see if the compiler really makes a mess of things. Thanks for the suggestion. I will try that out. Cheers, Longman
Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
On Thu, Jul 23, 2020 at 09:58:55PM +0200, pet...@infradead.org wrote: > asm ("addb %[val], %b[var];" >"cmovc %[sat], %[var];" >: [var] "+r" (tmp) >: [val] "ir" (val), [sat] "r" (sat) >); "var" (operand 0) needs an earlyclobber ("sat" is read after "var" is written for the first time). Segher
Re: [PATCH] ASoC: fsl: Replace HTTP links with HTTPS ones
On Sat, 18 Jul 2020 13:12:09 +0200, Alexander A. Klimov wrote: > Rationale: > Reduces attack surface on kernel devs opening the links for MITM > as HTTPS traffic is much harder to manipulate. > > Deterministic algorithm: > For each file: > If not .svg: > For each line: > If doesn't contain `\bxmlns\b`: > For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: > If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: > If both the HTTP and HTTPS versions > return 200 OK and serve the same content: > Replace HTTP with HTTPS. Applied to https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next Thanks! [1/1] ASoC: fsl: Replace HTTP links with HTTPS ones commit: 1ce8f643ed875d754ff09bf2096dfac3b905ab80 All being well this means that it will be integrated into the linux-next tree (usually sometime in the next 24 hours) and sent to Linus during the next merge window (or sooner if it is a bug fix), however if problems are discovered then the patch may be dropped or reverted. You may get further e-mails resulting from automated or manual testing and review of the tree, please engage with people reporting problems and send followup patches addressing any issues that are reported if needed. If any updates are required or you are submitting further changes they should be sent as incremental updates against current git, existing patches will not be replaced. Please add any relevant lists and maintainers to the CCs when replying to this mail. Thanks, Mark
[PATCH v5 7/7] KVM: PPC: Book3S HV: rework secure mem slot dropping
From: Laurent Dufour When a secure memslot is dropped, all the pages backed in the secure device (aka really backed by secure memory by the Ultravisor) should be paged out to a normal page. Previously, this was achieved by triggering the page fault mechanism which is calling kvmppc_svm_page_out() on each pages. This can't work when hot unplugging a memory slot because the memory slot is flagged as invalid and gfn_to_pfn() is then not trying to access the page, so the page fault mechanism is not triggered. Since the final goal is to make a call to kvmppc_svm_page_out() it seems simpler to call directly instead of triggering such a mechanism. This way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a memslot. Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock, the call to __kvmppc_svm_page_out() is made. As __kvmppc_svm_page_out needs the vma pointer to migrate the pages, the VMA is fetched in a lazy way, to not trigger find_vma() all the time. In addition, the mmap_sem is held in read mode during that time, not in write mode since the virual memory layout is not impacted, and kvm->arch.uvmem_lock prevents concurrent operation on the secure device. Cc: Ram Pai Cc: Bharata B Rao Cc: Paul Mackerras Signed-off-by: Ram Pai [modified the changelog description] Signed-off-by: Laurent Dufour --- arch/powerpc/kvm/book3s_hv_uvmem.c | 54 ++ 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index c772e92..daffa6e 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -632,35 +632,55 @@ static inline int kvmppc_svm_page_out(struct vm_area_struct *vma, * fault on them, do fault time migration to replace the device PTEs in * QEMU page table with normal PTEs from newly allocated pages. */ -void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, +void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, struct kvm *kvm, bool skip_page_out) { int i; struct kvmppc_uvmem_page_pvt *pvt; - unsigned long pfn, uvmem_pfn; - unsigned long gfn = free->base_gfn; + struct page *uvmem_page; + struct vm_area_struct *vma = NULL; + unsigned long uvmem_pfn, gfn; + unsigned long addr, end; + + mmap_read_lock(kvm->mm); + + addr = slot->userspace_addr; + end = addr + (slot->npages * PAGE_SIZE); - for (i = free->npages; i; --i, ++gfn) { - struct page *uvmem_page; + gfn = slot->base_gfn; + for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) { + + /* Fetch the VMA if addr is not in the latest fetched one */ + if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) { + vma = find_vma_intersection(kvm->mm, addr, end); + if (!vma || + vma->vm_start > addr || vma->vm_end < end) { + pr_err("Can't find VMA for gfn:0x%lx\n", gfn); + break; + } + } mutex_lock(>arch.uvmem_lock); - if (!kvmppc_gfn_is_uvmem_pfn(gfn, kvm, _pfn)) { + + if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, _pfn)) { + uvmem_page = pfn_to_page(uvmem_pfn); + pvt = uvmem_page->zone_device_data; + pvt->skip_page_out = skip_page_out; + pvt->remove_gfn = true; + + if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE, + PAGE_SHIFT, kvm, pvt->gpa)) + pr_err("Can't page out gpa:0x%lx addr:0x%lx\n", + pvt->gpa, addr); + } else { + /* Remove the shared flag if any */ kvmppc_gfn_remove(gfn, kvm); - mutex_unlock(>arch.uvmem_lock); - continue; } - uvmem_page = pfn_to_page(uvmem_pfn); - pvt = uvmem_page->zone_device_data; - pvt->skip_page_out = skip_page_out; - pvt->remove_gfn = true; mutex_unlock(>arch.uvmem_lock); - - pfn = gfn_to_pfn(kvm, gfn); - if (is_error_noslot_pfn(pfn)) - continue; - kvm_release_pfn_clean(pfn); } + + mmap_read_unlock(kvm->mm); } unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm) -- 1.8.3.1
[PATCH v5 6/7] KVM: PPC: Book3S HV: move kvmppc_svm_page_out up
From: Laurent Dufour kvmppc_svm_page_out() will need to be called by kvmppc_uvmem_drop_pages() so move it upper in this file. Furthermore it will be interesting to call this function when already holding the kvm->arch.uvmem_lock, so prefix the original function with __ and remove the locking in it, and introduce a wrapper which call that function with the lock held. There is no functional change. Cc: Ram Pai Cc: Bharata B Rao Cc: Paul Mackerras Signed-off-by: Ram Pai Signed-off-by: Laurent Dufour --- arch/powerpc/kvm/book3s_hv_uvmem.c | 166 - 1 file changed, 90 insertions(+), 76 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index fd9ff6e..c772e92 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -535,6 +535,96 @@ unsigned long kvmppc_h_svm_init_done(struct kvm *kvm) } /* + * Provision a new page on HV side and copy over the contents + * from secure memory using UV_PAGE_OUT uvcall. + * Caller must held kvm->arch.uvmem_lock. + */ +static int __kvmppc_svm_page_out(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, unsigned long page_shift, + struct kvm *kvm, unsigned long gpa) +{ + unsigned long src_pfn, dst_pfn = 0; + struct migrate_vma mig; + struct page *dpage, *spage; + struct kvmppc_uvmem_page_pvt *pvt; + unsigned long pfn; + int ret = U_SUCCESS; + + memset(, 0, sizeof(mig)); + mig.vma = vma; + mig.start = start; + mig.end = end; + mig.src = _pfn; + mig.dst = _pfn; + mig.src_owner = _uvmem_pgmap; + + /* The requested page is already paged-out, nothing to do */ + if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL)) + return ret; + + ret = migrate_vma_setup(); + if (ret) + return -1; + + spage = migrate_pfn_to_page(*mig.src); + if (!spage || !(*mig.src & MIGRATE_PFN_MIGRATE)) + goto out_finalize; + + if (!is_zone_device_page(spage)) + goto out_finalize; + + dpage = alloc_page_vma(GFP_HIGHUSER, vma, start); + if (!dpage) { + ret = -1; + goto out_finalize; + } + + lock_page(dpage); + pvt = spage->zone_device_data; + pfn = page_to_pfn(dpage); + + /* +* This function is used in two cases: +* - When HV touches a secure page, for which we do UV_PAGE_OUT +* - When a secure page is converted to shared page, we *get* +* the page to essentially unmap the device page. In this +* case we skip page-out. +*/ + if (!pvt->skip_page_out) + ret = uv_page_out(kvm->arch.lpid, pfn << page_shift, + gpa, 0, page_shift); + + if (ret == U_SUCCESS) + *mig.dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED; + else { + unlock_page(dpage); + __free_page(dpage); + goto out_finalize; + } + + migrate_vma_pages(); + +out_finalize: + migrate_vma_finalize(); + return ret; +} + +static inline int kvmppc_svm_page_out(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long page_shift, + struct kvm *kvm, unsigned long gpa) +{ + int ret; + + mutex_lock(>arch.uvmem_lock); + ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa); + mutex_unlock(>arch.uvmem_lock); + + return ret; +} + +/* * Drop device pages that we maintain for the secure guest * * We first mark the pages to be skipped from UV_PAGE_OUT when there @@ -866,82 +956,6 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa, return ret; } -/* - * Provision a new page on HV side and copy over the contents - * from secure memory using UV_PAGE_OUT uvcall. - */ -static int kvmppc_svm_page_out(struct vm_area_struct *vma, - unsigned long start, - unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) -{ - unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; - struct page *dpage, *spage; - struct kvmppc_uvmem_page_pvt *pvt; - unsigned long pfn; - int ret = U_SUCCESS; - - memset(, 0, sizeof(mig)); - mig.vma = vma; - mig.start = start; - mig.end = end; - mig.src = _pfn; - mig.dst = _pfn; - mig.src_owner = _uvmem_pgmap; - - mutex_lock(>arch.uvmem_lock); - /* The requested page is already paged-out, nothing to do */ - if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL)) - goto out; - - ret = migrate_vma_setup(); - if (ret) -
[PATCH v5 5/7] KVM: PPC: Book3S HV: migrate hot plugged memory
From: Laurent Dufour When a memory slot is hot plugged to a SVM, PFNs associated with the GFNs in that slot must be migrated to the secure-PFNs, aka device-PFNs. Call kvmppc_uv_migrate_mem_slot() to accomplish this. Disable page-merge for all pages in the memory slot. Signed-off-by: Ram Pai [rearranged the code, and modified the commit log] Signed-off-by: Laurent Dufour --- arch/powerpc/include/asm/kvm_book3s_uvmem.h | 14 ++ arch/powerpc/kvm/book3s_hv.c| 10 ++ arch/powerpc/kvm/book3s_hv_uvmem.c | 23 +++ 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h index f229ab5..59c17ca 100644 --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h @@ -25,6 +25,10 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, struct kvm *kvm, bool skip_page_out); int kvmppc_uv_migrate_mem_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot); +int kvmppc_uvmem_memslot_create(struct kvm *kvm, + const struct kvm_memory_slot *new); +void kvmppc_uvmem_memslot_delete(struct kvm *kvm, + const struct kvm_memory_slot *old); #else static inline int kvmppc_uvmem_init(void) { @@ -84,5 +88,15 @@ static inline int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn) static inline void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, struct kvm *kvm, bool skip_page_out) { } + +static inline int kvmppc_uvmem_memslot_create(struct kvm *kvm, + const struct kvm_memory_slot *new) +{ + return H_UNSUPPORTED; +} + +static inline void kvmppc_uvmem_memslot_delete(struct kvm *kvm, + const struct kvm_memory_slot *old) { } + #endif /* CONFIG_PPC_UV */ #endif /* __ASM_KVM_BOOK3S_UVMEM_H__ */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d331b46..b1485ca 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4515,16 +4515,10 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, switch (change) { case KVM_MR_CREATE: - if (kvmppc_uvmem_slot_init(kvm, new)) - return; - uv_register_mem_slot(kvm->arch.lpid, -new->base_gfn << PAGE_SHIFT, -new->npages * PAGE_SIZE, -0, new->id); + kvmppc_uvmem_memslot_create(kvm, new); break; case KVM_MR_DELETE: - uv_unregister_mem_slot(kvm->arch.lpid, old->id); - kvmppc_uvmem_slot_free(kvm, old); + kvmppc_uvmem_memslot_delete(kvm, old); break; default: /* TODO: Handle KVM_MR_MOVE */ diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 43b3566..fd9ff6e 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -418,7 +418,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, return ret; } -static void kvmppc_uvmem_memslot_delete(struct kvm *kvm, +static void __kvmppc_uvmem_memslot_delete(struct kvm *kvm, const struct kvm_memory_slot *memslot) { uv_unregister_mem_slot(kvm->arch.lpid, memslot->id); @@ -426,7 +426,7 @@ static void kvmppc_uvmem_memslot_delete(struct kvm *kvm, kvmppc_memslot_page_merge(kvm, memslot, true); } -static int kvmppc_uvmem_memslot_create(struct kvm *kvm, +static int __kvmppc_uvmem_memslot_create(struct kvm *kvm, const struct kvm_memory_slot *memslot) { int ret = H_PARAMETER; @@ -478,7 +478,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) /* register the memslot */ slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { - ret = kvmppc_uvmem_memslot_create(kvm, memslot); + ret = __kvmppc_uvmem_memslot_create(kvm, memslot); if (ret) break; } @@ -488,7 +488,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) kvm_for_each_memslot(m, slots) { if (m == memslot) break; - kvmppc_uvmem_memslot_delete(kvm, memslot); + __kvmppc_uvmem_memslot_delete(kvm, memslot); } } @@ -1057,6 +1057,21 @@ int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn) return (ret == U_SUCCESS) ? RESUME_GUEST : -EFAULT; } +int kvmppc_uvmem_memslot_create(struct kvm *kvm, const struct kvm_memory_slot *new) +{ + int ret = __kvmppc_uvmem_memslot_create(kvm, new); + + if (!ret) + ret = kvmppc_uv_migrate_mem_slot(kvm,
[PATCH v5 4/7] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.
The Ultravisor is expected to explicitly call H_SVM_PAGE_IN for all the pages of the SVM before calling H_SVM_INIT_DONE. This causes a huge delay in tranistioning the VM to SVM. The Ultravisor is only interested in the pages that contain the kernel, initrd and other important data structures. The rest contain throw-away content. However if not all pages are requested by the Ultravisor, the Hypervisor continues to consider the GFNs corresponding to the non-requested pages as normal GFNs. This can lead to data-corruption and undefined behavior. In H_SVM_INIT_DONE handler, move all the PFNs associated with the SVM's GFNs to secure-PFNs. Skip the GFNs that are already Paged-in or Shared or Paged-in followed by a Paged-out. Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Bharata B Rao Cc: Aneesh Kumar K.V Cc: Sukadev Bhattiprolu Cc: Laurent Dufour Cc: Thiago Jung Bauermann Cc: David Gibson Cc: Claudio Carvalho Cc: kvm-...@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Ram Pai --- Documentation/powerpc/ultravisor.rst| 2 + arch/powerpc/include/asm/kvm_book3s_uvmem.h | 2 + arch/powerpc/kvm/book3s_hv_uvmem.c | 136 +--- 3 files changed, 127 insertions(+), 13 deletions(-) diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst index a1c8c37..ba6b1bf 100644 --- a/Documentation/powerpc/ultravisor.rst +++ b/Documentation/powerpc/ultravisor.rst @@ -934,6 +934,8 @@ Return values * H_UNSUPPORTED if called from the wrong context (e.g. from an SVM or before an H_SVM_INIT_START hypercall). + * H_STATE if the hypervisor could not successfully +transition the VM to Secure VM. Description ~~~ diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h index 9cb7d8b..f229ab5 100644 --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h @@ -23,6 +23,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm); void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, struct kvm *kvm, bool skip_page_out); +int kvmppc_uv_migrate_mem_slot(struct kvm *kvm, + const struct kvm_memory_slot *memslot); #else static inline int kvmppc_uvmem_init(void) { diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 1b2b029..43b3566 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -93,6 +93,7 @@ #include #include #include +#include static struct dev_pagemap kvmppc_uvmem_pgmap; static unsigned long *kvmppc_uvmem_bitmap; @@ -348,6 +349,41 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm, return false; } +/* + * starting from *gfn search for the next available GFN that is not yet + * transitioned to a secure GFN. return the value of that GFN in *gfn. If a + * GFN is found, return true, else return false + * + * Must be called with kvm->arch.uvmem_lock held. + */ +static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot *memslot, + struct kvm *kvm, unsigned long *gfn) +{ + struct kvmppc_uvmem_slot *p; + bool ret = false; + unsigned long i; + + list_for_each_entry(p, >arch.uvmem_pfns, list) + if (*gfn >= p->base_pfn && *gfn < p->base_pfn + p->nr_pfns) + break; + if (!p) + return ret; + /* +* The code below assumes, one to one correspondence between +* kvmppc_uvmem_slot and memslot. +*/ + for (i = *gfn; i < p->base_pfn + p->nr_pfns; i++) { + unsigned long index = i - p->base_pfn; + + if (!(p->pfns[index] & KVMPPC_GFN_FLAG_MASK)) { + *gfn = i; + ret = true; + break; + } + } + return ret; +} + static int kvmppc_memslot_page_merge(struct kvm *kvm, const struct kvm_memory_slot *memslot, bool merge) { @@ -462,12 +498,40 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) unsigned long kvmppc_h_svm_init_done(struct kvm *kvm) { + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int srcu_idx; + long ret = H_SUCCESS; + if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) return H_UNSUPPORTED; + /* migrate any unmoved normal pfn to device pfns*/ + srcu_idx = srcu_read_lock(>srcu); + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + ret = kvmppc_uv_migrate_mem_slot(kvm, memslot); + if (ret) { + /* +
[PATCH v5 3/7] KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
During the life of SVM, its GFNs transition through normal, secure and shared states. Since the kernel does not track GFNs that are shared, it is not possible to disambiguate a shared GFN from a GFN whose PFN has not yet been migrated to a secure-PFN. Also it is not possible to disambiguate a secure-GFN from a GFN whose GFN has been pagedout from the ultravisor. The ability to identify the state of a GFN is needed to skip migration of its PFN to secure-PFN during ESM transition. The code is re-organized to track the states of a GFN as explained below. 1. States of a GFN --- The GFN can be in one of the following states. (a) Secure - The GFN is secure. The GFN is associated with a Secure VM, the contents of the GFN is not accessible to the Hypervisor. This GFN can be backed by a secure-PFN, or can be backed by a normal-PFN with contents encrypted. The former is true when the GFN is paged-in into the ultravisor. The latter is true when the GFN is paged-out of the ultravisor. (b) Shared - The GFN is shared. The GFN is associated with a a secure VM. The contents of the GFN is accessible to Hypervisor. This GFN is backed by a normal-PFN and its content is un-encrypted. (c) Normal - The GFN is a normal. The GFN is associated with a normal VM. The contents of the GFN is accesible to the Hypervisor. Its content is never encrypted. 2. States of a VM. --- (a) Normal VM: A VM whose contents are always accessible to the hypervisor. All its GFNs are normal-GFNs. (b) Secure VM: A VM whose contents are not accessible to the hypervisor without the VM's consent. Its GFNs are either Shared-GFN or Secure-GFNs. (c) Transient VM: A Normal VM that is transitioning to secure VM. The transition starts on successful return of H_SVM_INIT_START, and ends on successful return of H_SVM_INIT_DONE. This transient VM, can have GFNs in any of the three states; i.e Secure-GFN, Shared-GFN, and Normal-GFN. The VM never executes in this state in supervisor-mode. 3. Memory slot State. -- The state of a memory slot mirrors the state of the VM the memory slot is associated with. 4. VM State transition. A VM always starts in Normal Mode. H_SVM_INIT_START moves the VM into transient state. During this time the Ultravisor may request some of its GFNs to be shared or secured. So its GFNs can be in one of the three GFN states. H_SVM_INIT_DONE moves the VM entirely from transient state to secure-state. At this point any left-over normal-GFNs are transitioned to Secure-GFN. H_SVM_INIT_ABORT moves the transient VM back to normal VM. All its GFNs are moved to Normal-GFNs. UV_TERMINATE transitions the secure-VM back to normal-VM. All the secure-GFN and shared-GFNs are tranistioned to normal-GFN Note: The contents of the normal-GFN is undefined at this point. 5. GFN state implementation: - Secure GFN is associated with a secure-PFN; also called uvmem_pfn, when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag set, and contains the value of the secure-PFN. It is associated with a normal-PFN; also called mem_pfn, when the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set. The value of the normal-PFN is not tracked. Shared GFN is associated with a normal-PFN. Its pfn[] has KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN is not tracked. Normal GFN is associated with normal-PFN. Its pfn[] has no flag set. The value of the normal-PFN is not tracked. 6. Life cycle of a GFN -- || Share | Unshare | SVM |H_SVM_INIT_DONE| ||operation |operation | abort/| | ||| | terminate | | - ||| | | | | Secure | Shared | Secure |Normal |Secure | ||| | | | | Shared | Shared | Secure |Normal |Shared | ||| | | | | Normal | Shared | Secure |Normal |Secure | -- 7. Life cycle of a VM | | start| H_SVM_ |H_SVM_ |H_SVM_ |UV_SVM_| | | VM |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE | | | | | | | | -
[PATCH v5 2/7] KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START
Page-merging of pages in memory-slots associated with a Secure VM, is disabled in H_SVM_PAGE_IN handler. This operation should have been done the much earlier; the moment the VM is initiated for secure-transition. Delaying this operation, increases the probability for those pages to acquire new references , making it impossible to migrate those pages in H_SVM_PAGE_IN handler. Disable page-migration in H_SVM_INIT_START handling. Reviewed-by: Bharata B Rao Signed-off-by: Ram Pai --- Documentation/powerpc/ultravisor.rst | 1 + arch/powerpc/kvm/book3s_hv_uvmem.c | 123 +-- 2 files changed, 89 insertions(+), 35 deletions(-) diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst index df136c8..a1c8c37 100644 --- a/Documentation/powerpc/ultravisor.rst +++ b/Documentation/powerpc/ultravisor.rst @@ -895,6 +895,7 @@ Return values One of the following values: * H_SUCCESS on success. +* H_STATEif the VM is not in a position to switch to secure. Description ~~~ diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index e6f76bc..533b608 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -211,10 +211,79 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm, return false; } +static int kvmppc_memslot_page_merge(struct kvm *kvm, + const struct kvm_memory_slot *memslot, bool merge) +{ + unsigned long gfn = memslot->base_gfn; + unsigned long end, start = gfn_to_hva(kvm, gfn); + int ret = 0; + struct vm_area_struct *vma; + int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE; + + if (kvm_is_error_hva(start)) + return H_STATE; + + end = start + (memslot->npages << PAGE_SHIFT); + + mmap_write_lock(kvm->mm); + do { + vma = find_vma_intersection(kvm->mm, start, end); + if (!vma) { + ret = H_STATE; + break; + } + ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, + merge_flag, >vm_flags); + if (ret) { + ret = H_STATE; + break; + } + start = vma->vm_end; + } while (end > vma->vm_end); + + mmap_write_unlock(kvm->mm); + return ret; +} + +static void kvmppc_uvmem_memslot_delete(struct kvm *kvm, + const struct kvm_memory_slot *memslot) +{ + uv_unregister_mem_slot(kvm->arch.lpid, memslot->id); + kvmppc_uvmem_slot_free(kvm, memslot); + kvmppc_memslot_page_merge(kvm, memslot, true); +} + +static int kvmppc_uvmem_memslot_create(struct kvm *kvm, + const struct kvm_memory_slot *memslot) +{ + int ret = H_PARAMETER; + + if (kvmppc_memslot_page_merge(kvm, memslot, false)) + return ret; + + if (kvmppc_uvmem_slot_init(kvm, memslot)) + goto out1; + + ret = uv_register_mem_slot(kvm->arch.lpid, + memslot->base_gfn << PAGE_SHIFT, + memslot->npages * PAGE_SIZE, + 0, memslot->id); + if (ret < 0) { + ret = H_PARAMETER; + goto out; + } + return 0; +out: + kvmppc_uvmem_slot_free(kvm, memslot); +out1: + kvmppc_memslot_page_merge(kvm, memslot, true); + return ret; +} + unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) { struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; + struct kvm_memory_slot *memslot, *m; int ret = H_SUCCESS; int srcu_idx; @@ -232,23 +301,24 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) return H_AUTHORITY; srcu_idx = srcu_read_lock(>srcu); + + /* register the memslot */ slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { - if (kvmppc_uvmem_slot_init(kvm, memslot)) { - ret = H_PARAMETER; - goto out; - } - ret = uv_register_mem_slot(kvm->arch.lpid, - memslot->base_gfn << PAGE_SHIFT, - memslot->npages * PAGE_SIZE, - 0, memslot->id); - if (ret < 0) { - kvmppc_uvmem_slot_free(kvm, memslot); - ret = H_PARAMETER; - goto out; + ret = kvmppc_uvmem_memslot_create(kvm, memslot); + if (ret) + break; + } + + if (ret) { + slots = kvm_memslots(kvm); + kvm_for_each_memslot(m, slots) { + if (m == memslot) +
[PATCH v5 1/7] KVM: PPC: Book3S HV: Fix function definition in book3s_hv_uvmem.c
Without this fix, git is confused. It generates wrong function context for code changes in subsequent patches. Weird, but true. Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Bharata B Rao Cc: Aneesh Kumar K.V Cc: Sukadev Bhattiprolu Cc: Laurent Dufour Cc: Thiago Jung Bauermann Cc: David Gibson Cc: Claudio Carvalho Cc: kvm-...@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Ram Pai --- arch/powerpc/kvm/book3s_hv_uvmem.c | 21 ++--- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 09d8119..e6f76bc 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -382,8 +382,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) * Alloc a PFN from private device memory pool and copy page from normal * memory to secure memory using UV_PAGE_IN uvcall. */ -static int -kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start, +static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long gpa, struct kvm *kvm, unsigned long page_shift, bool *downgrade) { @@ -450,8 +449,8 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) * In the former case, uses dev_pagemap_ops.migrate_to_ram handler * to unmap the device page from QEMU's page tables. */ -static unsigned long -kvmppc_share_page(struct kvm *kvm, unsigned long gpa, unsigned long page_shift) +static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa, + unsigned long page_shift) { int ret = H_PARAMETER; @@ -500,9 +499,9 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) * H_PAGE_IN_SHARED flag makes the page shared which means that the same * memory in is visible from both UV and HV. */ -unsigned long -kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa, -unsigned long flags, unsigned long page_shift) +unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa, + unsigned long flags, + unsigned long page_shift) { bool downgrade = false; unsigned long start, end; @@ -559,10 +558,10 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) * Provision a new page on HV side and copy over the contents * from secure memory using UV_PAGE_OUT uvcall. */ -static int -kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, - unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) +static int kvmppc_svm_page_out(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, unsigned long page_shift, + struct kvm *kvm, unsigned long gpa) { unsigned long src_pfn, dst_pfn = 0; struct migrate_vma mig; -- 1.8.3.1
[PATCH v5 0/7] Migrate non-migrated pages of a SVM.
The time to switch a VM to Secure-VM, increases by the size of the VM. A 100GB VM takes about 7minutes. This is unacceptable. This linear increase is caused by a suboptimal behavior by the Ultravisor and the Hypervisor. The Ultravisor unnecessarily migrates all the GFN of the VM from normal-memory to secure-memory. It has to just migrate the necessary and sufficient GFNs. However when the optimization is incorporated in the Ultravisor, the Hypervisor starts misbehaving. The Hypervisor has a inbuilt assumption that the Ultravisor will explicitly request to migrate, each and every GFN of the VM. If only necessary and sufficient GFNs are requested for migration, the Hypervisor continues to manage the remaining GFNs as normal GFNs. This leads to memory corruption; manifested consistently when the SVM reboots. The same is true, when a memory slot is hotplugged into a SVM. The Hypervisor expects the ultravisor to request migration of all GFNs to secure-GFN. But the hypervisor cannot handle any H_SVM_PAGE_IN requests from the Ultravisor, done in the context of UV_REGISTER_MEM_SLOT ucall. This problem manifests as random errors in the SVM, when a memory-slot is hotplugged. This patch series automatically migrates the non-migrated pages of a SVM, and thus solves the problem. Testing: Passed rigorous testing using various sized SVMs. Changelog: v5: . This patch series includes Laurent's fix for memory hotplug/unplug . drop pages first and then delete the memslot. Otherwise the memslot does not get cleanly deleted, causing problems during reboot. . recreatable through the following set of commands . device_add pc-dimm,id=dimm1,memdev=mem1 . device_del dimm1 . device_add pc-dimm,id=dimm1,memdev=mem1 Further incorporates comments from Bharata: . fix for off-by-one while disabling migration. . code-reorganized to maximize sharing in init_start path and in memory-hotplug path . locking adjustments in mass-page migration during H_SVM_INIT_DONE. . improved recovery on error paths. . additional comments in the code for better understanding. . removed the retry-on-migration-failure code. . re-added the initial patch that adjust some prototype to overcome a git problem, where it messes up the code context. Had accidently dropped the patch in the last version. v4: . Incorported Bharata's comments: - Optimization -- replace write mmap semaphore with read mmap semphore. - disable page-merge during memory hotplug. - rearranged the patches. consolidated the page-migration-retry logic in a single patch. v3: . Optimized the page-migration retry-logic. . Relax and relinquish the cpu regularly while bulk migrating the non-migrated pages. This issue was causing soft-lockups. Fixed it. . Added a new patch, to retry page-migration a couple of times before returning H_BUSY in H_SVM_PAGE_IN. This issue was seen a few times in a 24hour continuous reboot test of the SVMs. v2: . fixed a bug observed by Laurent. The state of the GFN's associated with Secure-VMs were not reset during memslot flush. . Re-organized the code, for easier review. . Better description of the patch series. v1: . fixed a bug observed by Bharata. Pages that where paged-in and later paged-out must also be skipped from migration during H_SVM_INIT_DONE. Laurent Dufour (3): KVM: PPC: Book3S HV: migrate hot plugged memory KVM: PPC: Book3S HV: move kvmppc_svm_page_out up KVM: PPC: Book3S HV: rework secure mem slot dropping Ram Pai (4): KVM: PPC: Book3S HV: Fix function definition in book3s_hv_uvmem.c KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs. Documentation/powerpc/ultravisor.rst| 3 + arch/powerpc/include/asm/kvm_book3s_uvmem.h | 16 + arch/powerpc/kvm/book3s_hv.c| 10 +- arch/powerpc/kvm/book3s_hv_uvmem.c | 690 +--- 4 files changed, 548 insertions(+), 171 deletions(-) -- 1.8.3.1
Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
On Thu, Jul 23, 2020 at 03:04:13PM -0400, Waiman Long wrote: > On 7/23/20 2:47 PM, pet...@infradead.org wrote: > > On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote: > > > BTW, do you have any comment on my v2 lock holder cpu info qspinlock > > > patch? > > > I will have to update the patch to fix the reported 0-day test problem, > > > but > > > I want to collect other feedback before sending out v3. > > I want to say I hate it all, it adds instructions to a path we spend an > > aweful lot of time optimizing without really getting anything back for > > it. > > It does add some extra instruction that may slow it down slightly, but I > don't agree that it gives nothing back. The cpu lock holder information can > be useful in analyzing crash dumps and in some debugging situation. I think > it can be useful in RHEL for this readon. How about an x86 config option to > allow distros to decide if they want to have it enabled? I will make sure > that it will have no performance degradation if the option is not enabled. Config knobs suck too; they create a maintenance burden (we get to make sure all the permutations works/build/etc..) and effectively nobody uses them, since world+dog uses what distros pick. Anyway, instead of adding a second per-cpu variable, can you see how horrible something like this is: unsigned char adds(unsigned char var, unsigned char val) { unsigned short sat = 0xff, tmp = var; asm ("addb %[val], %b[var];" "cmovc %[sat], %[var];" : [var] "+r" (tmp) : [val] "ir" (val), [sat] "r" (sat) ); return tmp; } Another thing to try is, instead of threading that lockval throughout the thing, simply: #define _Q_LOCKED_VAL this_cpu_read_stable(cpu_sat) or combined with the above #define _Q_LOCKED_VAL adds(this_cpu_read_stable(cpu_number), 2) and see if the compiler really makes a mess of things.
Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
On 7/23/20 2:47 PM, pet...@infradead.org wrote: On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote: BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch? I will have to update the patch to fix the reported 0-day test problem, but I want to collect other feedback before sending out v3. I want to say I hate it all, it adds instructions to a path we spend an aweful lot of time optimizing without really getting anything back for it. It does add some extra instruction that may slow it down slightly, but I don't agree that it gives nothing back. The cpu lock holder information can be useful in analyzing crash dumps and in some debugging situation. I think it can be useful in RHEL for this readon. How about an x86 config option to allow distros to decide if they want to have it enabled? I will make sure that it will have no performance degradation if the option is not enabled. Cheers, Longman
Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote: > BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch? > I will have to update the patch to fix the reported 0-day test problem, but > I want to collect other feedback before sending out v3. I want to say I hate it all, it adds instructions to a path we spend an aweful lot of time optimizing without really getting anything back for it. Will, how do you feel about it?
Re: [PATCH v2 0/3] powerpc/pseries: IPI doorbell improvements
On 6/30/20 1:50 PM, Nicholas Piggin wrote: > Since v1: > - Fixed SMP compile error. > - Fixed EPAPR / KVM_GUEST breakage. > - Expanded patch 3 changelog a bit. > > Thanks, > Nick I gave the patchset a try on KVM guests (P9 and P8) and PowerVM LPARs (P9). Looks good. Tested-by: Cédric Le Goater Thanks, C. > > Nicholas Piggin (3): > powerpc: inline doorbell sending functions > powerpc/pseries: Use doorbells even if XIVE is available > powerpc/pseries: Add KVM guest doorbell restrictions > > arch/powerpc/include/asm/dbell.h | 63 ++-- > arch/powerpc/include/asm/firmware.h | 6 +++ > arch/powerpc/include/asm/kvm_para.h | 26 ++-- > arch/powerpc/kernel/Makefile | 5 +-- > arch/powerpc/kernel/dbell.c | 55 > arch/powerpc/kernel/firmware.c | 19 + > arch/powerpc/platforms/pseries/smp.c | 62 +++ > 7 files changed, 134 insertions(+), 102 deletions(-) >
Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
On 7/23/20 10:00 AM, Peter Zijlstra wrote: On Thu, Jul 09, 2020 at 12:06:13PM -0400, Waiman Long wrote: We don't really need to do a pv_spinlocks_init() if pv_kick() isn't supported. Waiman, if you cannot explain how not having kick is a sane thing, what are you saying here? The current PPC paravirt spinlock code doesn't do any cpu kick. It does an equivalence of pv_wait by yielding the cpu to the lock holder only. The pv_spinlocks_init() is for setting up the hash table for doing pv_kick. If we don't need to do pv_kick, we don't need the hash table. I am not saying that pv_kick is not needed for the PPC environment. I was just trying to adapt the pvqspinlock code to such an environment first. Further investigation on how to implement some kind of pv_kick will be something that we may want to do as a follow on. BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch? I will have to update the patch to fix the reported 0-day test problem, but I want to collect other feedback before sending out v3. Cheers, Longman
Re: [PATCH 2/2] powerpc/64s: system call support for scv/rfscv instructions
Michael Ellerman a écrit : Nicholas Piggin writes: diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 2a39c716c343..b2bdc4de1292 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -257,6 +257,7 @@ #define PPC_INST_MFVSRD0x7c66 #define PPC_INST_MTVSRD0x7c000166 #define PPC_INST_SC0x4402 +#define PPC_INST_SCV 0x4401 ... @@ -411,6 +412,7 @@ ... +#define __PPC_LEV(l) (((l) & 0x7f) << 5) These conflicted and didn't seem to be used so I dropped them. diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 5abe98216dc2..161bfccbc309 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -3378,6 +3382,16 @@ int emulate_step(struct pt_regs *regs, struct ppc_inst instr) regs->msr = MSR_KERNEL; return 1; + case SYSCALL_VECTORED_0:/* scv 0 */ + regs->gpr[9] = regs->gpr[13]; + regs->gpr[10] = MSR_KERNEL; + regs->gpr[11] = regs->nip + 4; + regs->gpr[12] = regs->msr & MSR_MASK; + regs->gpr[13] = (unsigned long) get_paca(); + regs->nip = (unsigned long) _call_vectored_emulate; + regs->msr = MSR_KERNEL; + return 1; + This broke the ppc64e build: ld: arch/powerpc/lib/sstep.o:(.toc+0x0): undefined reference to `system_call_vectored_emulate' make[1]: *** [/home/michael/linux/Makefile:1139: vmlinux] Error 1 I wrapped it in #ifdef CONFIG_PPC64_BOOK3S. You mean CONFIG_PPC_BOOK3S_64 ? Christophe cheers
Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
Excerpts from Peter Zijlstra's message of July 24, 2020 12:59 am: > On Thu, Jul 23, 2020 at 11:11:03PM +1000, Nicholas Piggin wrote: >> Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm: >> > On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote: >> > >> >> diff --git a/arch/powerpc/include/asm/hw_irq.h >> >> b/arch/powerpc/include/asm/hw_irq.h >> >> index 3a0db7b0b46e..35060be09073 100644 >> >> --- a/arch/powerpc/include/asm/hw_irq.h >> >> +++ b/arch/powerpc/include/asm/hw_irq.h >> >> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void) >> >> #define powerpc_local_irq_pmu_save(flags)\ >> >>do { \ >> >> raw_local_irq_pmu_save(flags); \ >> >> - trace_hardirqs_off(); \ >> >> + if (!raw_irqs_disabled_flags(flags))\ >> >> + trace_hardirqs_off(); \ >> >> } while(0) >> >> #define powerpc_local_irq_pmu_restore(flags) \ >> >> do {\ >> >> - if (raw_irqs_disabled_flags(flags)) { \ >> >> - raw_local_irq_pmu_restore(flags); \ >> >> - trace_hardirqs_off(); \ >> >> - } else {\ >> >> + if (!raw_irqs_disabled_flags(flags))\ >> >> trace_hardirqs_on();\ >> >> - raw_local_irq_pmu_restore(flags); \ >> >> - } \ >> >> + raw_local_irq_pmu_restore(flags); \ >> >> } while(0) >> > >> > You shouldn't be calling lockdep from NMI context! >> >> After this patch it doesn't. > > You sure, trace_hardirqs_{on,off}() calls into lockdep. At least for irq enable/disable functions yes. NMI runs with interrupts disabled so these will never call trace_hardirqs_on/off after this patch. > (FWIW they're > also broken vs entry ordering, but that's another story). > >> trace_hardirqs_on/off implementation appears to expect to be called in NMI >> context though, for some reason. > > Hurpm, not sure.. I'll have to go grep arch code now :/ The generic NMI > code didn't touch that stuff. > > Argh, yes, there might be broken there... damn! I'll go frob around. > Thanks, Nick
Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks
Excerpts from Waiman Long's message of July 24, 2020 12:29 am: > On 7/23/20 9:30 AM, Nicholas Piggin wrote: >>> I would prefer to extract out the pending bit handling code out into a >>> separate helper function which can be overridden by the arch code >>> instead of breaking the slowpath into 2 pieces. >> You mean have the arch provide a queued_spin_lock_slowpath_pending >> function that the slow path calls? >> >> I would actually prefer the pending handling can be made inline in >> the queued_spin_lock function, especially with out-of-line locks it >> makes sense to put it there. >> >> We could ifdef out queued_spin_lock_slowpath_queue if it's not used, >> then __queued_spin_lock_slowpath_queue would be inlined into the >> caller so there would be no split? > > The pending code is an optimization for lightly contended locks. That is > why I think it is appropriate to extract it into a helper function and > mark it as such. > > You can certainly put the code in the arch's spin_lock code, you just > has to override the generic pending code by a null function. I see what you mean. I guess that would work fine. Thanks, Nick
Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
On Thu, Jul 23, 2020 at 11:11:03PM +1000, Nicholas Piggin wrote: > Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm: > > On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote: > > > >> diff --git a/arch/powerpc/include/asm/hw_irq.h > >> b/arch/powerpc/include/asm/hw_irq.h > >> index 3a0db7b0b46e..35060be09073 100644 > >> --- a/arch/powerpc/include/asm/hw_irq.h > >> +++ b/arch/powerpc/include/asm/hw_irq.h > >> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void) > >> #define powerpc_local_irq_pmu_save(flags) \ > >> do { \ > >>raw_local_irq_pmu_save(flags); \ > >> - trace_hardirqs_off(); \ > >> + if (!raw_irqs_disabled_flags(flags))\ > >> + trace_hardirqs_off(); \ > >>} while(0) > >> #define powerpc_local_irq_pmu_restore(flags) \ > >>do {\ > >> - if (raw_irqs_disabled_flags(flags)) { \ > >> - raw_local_irq_pmu_restore(flags); \ > >> - trace_hardirqs_off(); \ > >> - } else {\ > >> + if (!raw_irqs_disabled_flags(flags))\ > >>trace_hardirqs_on();\ > >> - raw_local_irq_pmu_restore(flags); \ > >> - } \ > >> + raw_local_irq_pmu_restore(flags); \ > >>} while(0) > > > > You shouldn't be calling lockdep from NMI context! > > After this patch it doesn't. You sure, trace_hardirqs_{on,off}() calls into lockdep. (FWIW they're also broken vs entry ordering, but that's another story). > trace_hardirqs_on/off implementation appears to expect to be called in NMI > context though, for some reason. Hurpm, not sure.. I'll have to go grep arch code now :/ The generic NMI code didn't touch that stuff. Argh, yes, there might be broken there... damn! I'll go frob around.
Re: [v3 12/15] powerpc/perf: Add support for outputting extended regs in perf intr_regs
Em Thu, Jul 23, 2020 at 11:14:16AM +0530, kajoljain escreveu: > > > On 7/21/20 11:32 AM, kajoljain wrote: > > > > > > On 7/17/20 8:08 PM, Athira Rajeev wrote: > >> From: Anju T Sudhakar > >> > >> Add support for perf extended register capability in powerpc. > >> The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the > >> PMU which support extended registers. The generic code define the mask > >> of extended registers as 0 for non supported architectures. > >> > >> Patch adds extended regs support for power9 platform by > >> exposing MMCR0, MMCR1 and MMCR2 registers. > >> > >> REG_RESERVED mask needs update to include extended regs. > >> `PERF_REG_EXTENDED_MASK`, contains mask value of the supported registers, > >> is defined at runtime in the kernel based on platform since the supported > >> registers may differ from one processor version to another and hence the > >> MASK value. > >> > >> with patch > >> -- > >> > >> available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 > >> r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26 > >> r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe > >> trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2 > >> > >> PERF_RECORD_SAMPLE(IP, 0x1): 4784/4784: 0 period: 1 addr: 0 > >> ... intr regs: mask 0x ABI 64-bit > >> r00xc012b77c > >> r10xc03fe5e03930 > >> r20xc1b0e000 > >> r30xc03fdcddf800 > >> r40xc03fc788 > >> r50x9c422724be > >> r60xc03fe5e03908 > >> r70xff63bddc8706 > >> r80x9e4 > >> r90x0 > >> r10 0x1 > >> r11 0x0 > >> r12 0xc01299c0 > >> r13 0xc03c4800 > >> r14 0x0 > >> r15 0x7fffdd8b8b00 > >> r16 0x0 > >> r17 0x7fffdd8be6b8 > >> r18 0x7e7076607730 > >> r19 0x2f > >> r20 0xc0001fc26c68 > >> r21 0xc0002041e4227e00 > >> r22 0xc0002018fb60 > >> r23 0x1 > >> r24 0xc03ffec4d900 > >> r25 0x8000 > >> r26 0x0 > >> r27 0x1 > >> r28 0x1 > >> r29 0xc1be1260 > >> r30 0x6008010 > >> r31 0xc03ffebb7218 > >> nip 0xc012b910 > >> msr 0x90009033 > >> orig_r3 0xc012b86c > >> ctr 0xc01299c0 > >> link 0xc012b77c > >> xer 0x0 > >> ccr 0x2800 > >> softe 0x1 > >> trap 0xf00 > >> dar 0x0 > >> dsisr 0x800 > >> sier 0x0 > >> mmcra 0x800 > >> mmcr0 0x82008090 > >> mmcr1 0x1e00 > >> mmcr2 0x0 > >> ... thread: perf:4784 > >> > >> Signed-off-by: Anju T Sudhakar > >> [Defined PERF_REG_EXTENDED_MASK at run time to add support for different > >> platforms ] > >> Signed-off-by: Athira Rajeev > >> Reviewed-by: Madhavan Srinivasan > >> --- > > > > Patch looks good to me. > > > > Reviewed-by: Kajol Jain > > Hi Arnaldo and Jiri, >Please let me know if you have any comments on these patches. Can you > pull/ack these > patches if they seems fine to you. Can you please clarify something here, I think I saw a kernel build bot complaint followed by a fix, in these cases I think, for reviewer's sake, that this would entail a v4 patchkit? One that has no such build issues? Or have I got something wrong? - Arnaldo
Re: [PATCHv3 1/2] powerpc/pseries: group lmb operation and memblock's
Pingfan Liu writes: > This patch prepares for the incoming patch which swaps the order of KOBJ_ > uevent and dt's updating. > > It has no functional effect, just groups lmb operation and memblock's in > order to insert dt updating operation easily, and makes it easier to > review. ... > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c > b/arch/powerpc/platforms/pseries/hotplug-memory.c > index 5d545b7..1a3ac3b 100644 > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c > @@ -355,7 +355,8 @@ static int dlpar_add_lmb(struct drmem_lmb *); > static int dlpar_remove_lmb(struct drmem_lmb *lmb) > { > unsigned long block_sz; > - int rc; > + phys_addr_t base_addr; > + int rc, nid; > > if (!lmb_is_removable(lmb)) > return -EINVAL; > @@ -364,17 +365,19 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) > if (rc) > return rc; > > + base_addr = lmb->base_addr; > + nid = lmb->nid; > block_sz = pseries_memory_block_size(); > > - __remove_memory(lmb->nid, lmb->base_addr, block_sz); > - > - /* Update memory regions for memory remove */ > - memblock_remove(lmb->base_addr, block_sz); > - > invalidate_lmb_associativity_index(lmb); > lmb_clear_nid(lmb); > lmb->flags &= ~DRCONF_MEM_ASSIGNED; > > + __remove_memory(nid, base_addr, block_sz); > + > + /* Update memory regions for memory remove */ > + memblock_remove(base_addr, block_sz); > + > return 0; > } I don't understand; the commit message should not claim this has no functional effect when it changes the order of operations like this. Maybe this is an improvement over the current behavior, but it's not explained why it would be.
Re: [PATCH v3 4/6] powerpc/64s: implement queued spinlocks and rwlocks
On Mon, Jul 06, 2020 at 02:35:38PM +1000, Nicholas Piggin wrote: > These have shown significantly improved performance and fairness when > spinlock contention is moderate to high on very large systems. > > [ Numbers hopefully forthcoming after more testing, but initial >results look good ] > > Thanks to the fast path, single threaded performance is not noticably > hurt. > > Signed-off-by: Nicholas Piggin > --- > arch/powerpc/Kconfig | 13 > arch/powerpc/include/asm/Kbuild | 2 ++ > arch/powerpc/include/asm/qspinlock.h | 25 +++ > arch/powerpc/include/asm/spinlock.h | 5 + > arch/powerpc/include/asm/spinlock_types.h | 5 + > arch/powerpc/lib/Makefile | 3 +++ > include/asm-generic/qspinlock.h | 2 ++ > 7 files changed, 55 insertions(+) > create mode 100644 arch/powerpc/include/asm/qspinlock.h > > diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig > index 24ac85c868db..17663ea57697 100644 > --- a/arch/powerpc/Kconfig > +++ b/arch/powerpc/Kconfig > @@ -146,6 +146,8 @@ config PPC > select ARCH_SUPPORTS_ATOMIC_RMW > select ARCH_USE_BUILTIN_BSWAP > select ARCH_USE_CMPXCHG_LOCKREF if PPC64 > + select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS > + select ARCH_USE_QUEUED_SPINLOCKSif PPC_QUEUED_SPINLOCKS > select ARCH_WANT_IPC_PARSE_VERSION > select ARCH_WEAK_RELEASE_ACQUIRE > select BINFMT_ELF > @@ -492,6 +494,17 @@ config HOTPLUG_CPU > > Say N if you are unsure. > > +config PPC_QUEUED_SPINLOCKS > + bool "Queued spinlocks" > + depends on SMP > + default "y" if PPC_BOOK3S_64 > + help > + Say Y here to use to use queued spinlocks which are more complex > + but give better salability and fairness on large SMP and NUMA ^ +c? Thanks Michal > + systems. > + > + If unsure, say "Y" if you have lots of cores, otherwise "N". > + > config ARCH_CPU_PROBE_RELEASE > def_bool y > depends on HOTPLUG_CPU > diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild > index dadbcf3a0b1e..1dd8b6adff5e 100644 > --- a/arch/powerpc/include/asm/Kbuild > +++ b/arch/powerpc/include/asm/Kbuild > @@ -6,5 +6,7 @@ generated-y += syscall_table_spu.h > generic-y += export.h > generic-y += local64.h > generic-y += mcs_spinlock.h > +generic-y += qrwlock.h > +generic-y += qspinlock.h > generic-y += vtime.h > generic-y += early_ioremap.h > diff --git a/arch/powerpc/include/asm/qspinlock.h > b/arch/powerpc/include/asm/qspinlock.h > new file mode 100644 > index ..c49e33e24edd > --- /dev/null > +++ b/arch/powerpc/include/asm/qspinlock.h > @@ -0,0 +1,25 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _ASM_POWERPC_QSPINLOCK_H > +#define _ASM_POWERPC_QSPINLOCK_H > + > +#include > + > +#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */ > + > +#define smp_mb__after_spinlock() smp_mb() > + > +static __always_inline int queued_spin_is_locked(struct qspinlock *lock) > +{ > + /* > + * This barrier was added to simple spinlocks by commit 51d7d5205d338, > + * but it should now be possible to remove it, asm arm64 has done with > + * commit c6f5d02b6a0f. > + */ > + smp_mb(); > + return atomic_read(>val); > +} > +#define queued_spin_is_locked queued_spin_is_locked > + > +#include > + > +#endif /* _ASM_POWERPC_QSPINLOCK_H */ > diff --git a/arch/powerpc/include/asm/spinlock.h > b/arch/powerpc/include/asm/spinlock.h > index 21357fe05fe0..434615f1d761 100644 > --- a/arch/powerpc/include/asm/spinlock.h > +++ b/arch/powerpc/include/asm/spinlock.h > @@ -3,7 +3,12 @@ > #define __ASM_SPINLOCK_H > #ifdef __KERNEL__ > > +#ifdef CONFIG_PPC_QUEUED_SPINLOCKS > +#include > +#include > +#else > #include > +#endif > > #endif /* __KERNEL__ */ > #endif /* __ASM_SPINLOCK_H */ > diff --git a/arch/powerpc/include/asm/spinlock_types.h > b/arch/powerpc/include/asm/spinlock_types.h > index 3906f52dae65..c5d742f18021 100644 > --- a/arch/powerpc/include/asm/spinlock_types.h > +++ b/arch/powerpc/include/asm/spinlock_types.h > @@ -6,6 +6,11 @@ > # error "please don't include this file directly" > #endif > > +#ifdef CONFIG_PPC_QUEUED_SPINLOCKS > +#include > +#include > +#else > #include > +#endif > > #endif > diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile > index 5e994cda8e40..d66a645503eb 100644 > --- a/arch/powerpc/lib/Makefile > +++ b/arch/powerpc/lib/Makefile > @@ -41,7 +41,10 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o > copypage_power7.o \ > obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ > memcpy_64.o memcpy_mcsafe_64.o > > +ifndef CONFIG_PPC_QUEUED_SPINLOCKS > obj64-$(CONFIG_SMP) += locks.o > +endif > + > obj64-$(CONFIG_ALTIVEC) += vmx-helper.o > obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \ >
Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks
On 7/23/20 9:30 AM, Nicholas Piggin wrote: I would prefer to extract out the pending bit handling code out into a separate helper function which can be overridden by the arch code instead of breaking the slowpath into 2 pieces. You mean have the arch provide a queued_spin_lock_slowpath_pending function that the slow path calls? I would actually prefer the pending handling can be made inline in the queued_spin_lock function, especially with out-of-line locks it makes sense to put it there. We could ifdef out queued_spin_lock_slowpath_queue if it's not used, then __queued_spin_lock_slowpath_queue would be inlined into the caller so there would be no split? The pending code is an optimization for lightly contended locks. That is why I think it is appropriate to extract it into a helper function and mark it as such. You can certainly put the code in the arch's spin_lock code, you just has to override the generic pending code by a null function. Cheers, Longman
Re: [RFC PATCH 4/5] powerpc/mm: Remove custom stack expansion checking
Hi Michael, > We have powerpc specific logic in our page fault handling to decide if > an access to an unmapped address below the stack pointer should expand > the stack VMA. > > The logic aims to prevent userspace from doing bad accesses below the > stack pointer. However as long as the stack is < 1MB in size, we allow > all accesses without further checks. Adding some debug I see that I > can do a full kernel build and LTP run, and not a single process has > used more than 1MB of stack. So for the majority of processes the > logic never even fires. > > We also recently found a nasty bug in this code which could cause > userspace programs to be killed during signal delivery. It went > unnoticed presumably because most processes use < 1MB of stack. > > The generic mm code has also grown support for stack guard pages since > this code was originally written, so the most heinous case of the > stack expanding into other mappings is now handled for us. > > Finally although some other arches have special logic in this path, > from what I can tell none of x86, arm64, arm and s390 impose any extra > checks other than those in expand_stack(). > > So drop our complicated logic and like other architectures just let > the stack expand as long as its within the rlimit. > I applied and tested this. While I wouldn't call my testing comprehensive, I have not been able to reproduce the crash with this patch applied. Kind regards, Daniel > Signed-off-by: Michael Ellerman > --- > arch/powerpc/mm/fault.c | 106 ++-- > 1 file changed, 5 insertions(+), 101 deletions(-) > > diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c > index ed01329dd12b..925a7231abb3 100644 > --- a/arch/powerpc/mm/fault.c > +++ b/arch/powerpc/mm/fault.c > @@ -42,39 +42,7 @@ > #include > #include > > -/* > - * Check whether the instruction inst is a store using > - * an update addressing form which will update r1. > - */ > -static bool store_updates_sp(struct ppc_inst inst) > -{ > - /* check for 1 in the rA field */ > - if (((ppc_inst_val(inst) >> 16) & 0x1f) != 1) > - return false; > - /* check major opcode */ > - switch (ppc_inst_primary_opcode(inst)) { > - case OP_STWU: > - case OP_STBU: > - case OP_STHU: > - case OP_STFSU: > - case OP_STFDU: > - return true; > - case OP_STD:/* std or stdu */ > - return (ppc_inst_val(inst) & 3) == 1; > - case OP_31: > - /* check minor opcode */ > - switch ((ppc_inst_val(inst) >> 1) & 0x3ff) { > - case OP_31_XOP_STDUX: > - case OP_31_XOP_STWUX: > - case OP_31_XOP_STBUX: > - case OP_31_XOP_STHUX: > - case OP_31_XOP_STFSUX: > - case OP_31_XOP_STFDUX: > - return true; > - } > - } > - return false; > -} > + > /* > * do_page_fault error handling helpers > */ > @@ -267,54 +235,6 @@ static bool bad_kernel_fault(struct pt_regs *regs, > unsigned long error_code, > return false; > } > > -static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, > - struct vm_area_struct *vma, unsigned int flags, > - bool *must_retry) > -{ > - /* > - * N.B. The POWER/Open ABI allows programs to access up to > - * 288 bytes below the stack pointer. > - * The kernel signal delivery code writes up to 4KB > - * below the stack pointer (r1) before decrementing it. > - * The exec code can write slightly over 640kB to the stack > - * before setting the user r1. Thus we allow the stack to > - * expand to 1MB without further checks. > - */ > - if (address + 0x10 < vma->vm_end) { > - struct ppc_inst __user *nip = (struct ppc_inst __user > *)regs->nip; > - /* get user regs even if this fault is in kernel mode */ > - struct pt_regs *uregs = current->thread.regs; > - if (uregs == NULL) > - return true; > - > - /* > - * A user-mode access to an address a long way below > - * the stack pointer is only valid if the instruction > - * is one which would update the stack pointer to the > - * address accessed if the instruction completed, > - * i.e. either stwu rs,n(r1) or stwux rs,r1,rb > - * (or the byte, halfword, float or double forms). > - * > - * If we don't check this then any write to the area > - * between the last mapped region and the stack will > - * expand the stack rather than segfaulting. > - */ > - if (address + 4096 >= uregs->gpr[1]) > - return false; > - > - if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) && > - access_ok(nip, sizeof(*nip)))
Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
Excerpts from Michael Ellerman's message of July 9, 2020 8:53 pm: > Nicholas Piggin writes: > >> Signed-off-by: Nicholas Piggin >> --- >> arch/powerpc/include/asm/paravirt.h | 28 >> arch/powerpc/include/asm/qspinlock.h | 66 +++ >> arch/powerpc/include/asm/qspinlock_paravirt.h | 7 ++ >> arch/powerpc/platforms/pseries/Kconfig| 5 ++ >> arch/powerpc/platforms/pseries/setup.c| 6 +- >> include/asm-generic/qspinlock.h | 2 + > > Another ack? > >> diff --git a/arch/powerpc/include/asm/paravirt.h >> b/arch/powerpc/include/asm/paravirt.h >> index 7a8546660a63..f2d51f929cf5 100644 >> --- a/arch/powerpc/include/asm/paravirt.h >> +++ b/arch/powerpc/include/asm/paravirt.h >> @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 >> yield_count) >> { >> ___bad_yield_to_preempted(); /* This would be a bug */ >> } >> + >> +extern void ___bad_yield_to_any(void); >> +static inline void yield_to_any(void) >> +{ >> +___bad_yield_to_any(); /* This would be a bug */ >> +} > > Why do we do that rather than just not defining yield_to_any() at all > and letting the build fail on that? > > There's a condition somewhere that we know will false at compile time > and drop the call before linking? Mainly so you could use it in if (IS_ENABLED()) blocks, but would still catch the (presumably buggy) case where something calls it without the option set. I think I had it arranged a different way that was using IS_ENABLED earlier and changed it but might as well keep it this way. > >> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h >> b/arch/powerpc/include/asm/qspinlock_paravirt.h >> new file mode 100644 >> index ..750d1b5e0202 >> --- /dev/null >> +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h >> @@ -0,0 +1,7 @@ >> +/* SPDX-License-Identifier: GPL-2.0-or-later */ >> +#ifndef __ASM_QSPINLOCK_PARAVIRT_H >> +#define __ASM_QSPINLOCK_PARAVIRT_H > > _ASM_POWERPC_QSPINLOCK_PARAVIRT_H please. > >> + >> +EXPORT_SYMBOL(__pv_queued_spin_unlock); > > Why's that in a header? Should that (eventually) go with the generic > implementation? Yeah the qspinlock_paravirt.h header is a bit weird and only gets included into kernel/locking/qspinlock.c >> diff --git a/arch/powerpc/platforms/pseries/Kconfig >> b/arch/powerpc/platforms/pseries/Kconfig >> index 24c18362e5ea..756e727b383f 100644 >> --- a/arch/powerpc/platforms/pseries/Kconfig >> +++ b/arch/powerpc/platforms/pseries/Kconfig >> @@ -25,9 +25,14 @@ config PPC_PSERIES >> select SWIOTLB >> default y >> >> +config PARAVIRT_SPINLOCKS >> +bool >> +default n > > default n is the default. > >> diff --git a/arch/powerpc/platforms/pseries/setup.c >> b/arch/powerpc/platforms/pseries/setup.c >> index 2db8469e475f..747a203d9453 100644 >> --- a/arch/powerpc/platforms/pseries/setup.c >> +++ b/arch/powerpc/platforms/pseries/setup.c >> @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void) >> if (firmware_has_feature(FW_FEATURE_LPAR)) { >> vpa_init(boot_cpuid); >> >> -if (lppaca_shared_proc(get_lppaca())) >> +if (lppaca_shared_proc(get_lppaca())) { >> static_branch_enable(_processor); >> +#ifdef CONFIG_PARAVIRT_SPINLOCKS >> +pv_spinlocks_init(); >> +#endif >> +} > > We could avoid the ifdef with this I think? Yes I think so. Thanks, Nick
Re: [PATCH v2 2/2] KVM: PPC: Book3S HV: rework secure mem slot dropping
Le 23/07/2020 à 14:32, Laurent Dufour a écrit : Le 23/07/2020 à 05:36, Bharata B Rao a écrit : On Tue, Jul 21, 2020 at 12:42:02PM +0200, Laurent Dufour wrote: When a secure memslot is dropped, all the pages backed in the secure device (aka really backed by secure memory by the Ultravisor) should be paged out to a normal page. Previously, this was achieved by triggering the page fault mechanism which is calling kvmppc_svm_page_out() on each pages. This can't work when hot unplugging a memory slot because the memory slot is flagged as invalid and gfn_to_pfn() is then not trying to access the page, so the page fault mechanism is not triggered. Since the final goal is to make a call to kvmppc_svm_page_out() it seems simpler to directly calling it instead of triggering such a mechanism. This way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a memslot. Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock, the call to __kvmppc_svm_page_out() is made. As __kvmppc_svm_page_out needs the vma pointer to migrate the pages, the VMA is fetched in a lazy way, to not trigger find_vma() all the time. In addition, the mmap_sem is help in read mode during that time, not in write mode since the virual memory layout is not impacted, and kvm->arch.uvmem_lock prevents concurrent operation on the secure device. Cc: Ram Pai Cc: Bharata B Rao Cc: Paul Mackerras Signed-off-by: Laurent Dufour ---  arch/powerpc/kvm/book3s_hv_uvmem.c | 54 --  1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 5a4b02d3f651..ba5c7c77cc3a 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -624,35 +624,55 @@ static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,   * fault on them, do fault time migration to replace the device PTEs in   * QEMU page table with normal PTEs from newly allocated pages.   */ -void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, +void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,   struct kvm *kvm, bool skip_page_out)  {  int i;  struct kvmppc_uvmem_page_pvt *pvt; -   unsigned long pfn, uvmem_pfn; -   unsigned long gfn = free->base_gfn; +   struct page *uvmem_page; +   struct vm_area_struct *vma = NULL; +   unsigned long uvmem_pfn, gfn; +   unsigned long addr, end; + +   mmap_read_lock(kvm->mm); + +   addr = slot->userspace_addr; We typically use gfn_to_hva() for that, but that won't work for a memslot that is already marked INVALID which is the case here. I think it is ok to access slot->userspace_addr here of an INVALID memslot, but just thought of explictly bringing this up. Which explicitly mentioned above in the patch's description: This can't work when hot unplugging a memory slot because the memory slot is flagged as invalid and gfn_to_pfn() is then not trying to access the page, so the page fault mechanism is not triggered. +   end = addr + (slot->npages * PAGE_SIZE); -   for (i = free->npages; i; --i, ++gfn) { -   struct page *uvmem_page; +   gfn = slot->base_gfn; +   for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) { + +   /* Fetch the VMA if addr is not in the latest fetched one */ +   if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) { +   vma = find_vma_intersection(kvm->mm, addr, end); +   if (!vma || +   vma->vm_start > addr || vma->vm_end < end) { +   pr_err("Can't find VMA for gfn:0x%lx\n", gfn); +   break; +   } +   } In Ram's series, kvmppc_memslot_page_merge() also walks the VMAs spanning the memslot, but it uses a different logic for the same. Why can't these two cases use the same method to walk the VMAs? Is there anything subtly different between the two cases? This is probably doable. At the time I wrote that patch, the kvmppc_memslot_page_merge() was not yet introduced AFAIR. This being said, I'd help a lot to factorize that code... I let Ram dealing with that ;) Indeed I don't think this is relevant, the loop in kvmppc_memslot_page_merge() deals with one call (to ksm_advise) per VMA, while this code is dealing with one call per page of the VMA, which completely different. I don't think merging the both will be a good idea. Cheers, Laurent.
Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
On Thu, Jul 09, 2020 at 12:06:13PM -0400, Waiman Long wrote: > We don't really need to do a pv_spinlocks_init() if pv_kick() isn't > supported. Waiman, if you cannot explain how not having kick is a sane thing, what are you saying here?
Re: [PATCH v2 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
On Thu, 23 Jul 2020 at 04:52, Jarkko Sakkinen wrote: > > On Thu, Jul 16, 2020 at 06:49:09PM +0200, Christophe Leroy wrote: > > Jarkko Sakkinen a écrit : > > > > > Rename module_alloc() to text_alloc() and module_memfree() to > > > text_memfree(), and move them to kernel/text.c, which is unconditionally > > > compiled to the kernel proper. This allows kprobes, ftrace and bpf to > > > allocate space for executable code without requiring to compile the > > > modules > > > support (CONFIG_MODULES=y) in. > > > > You are not changing enough in powerpc to have this work. > > On powerpc 32 bits (6xx), when STRICT_KERNEL_RWX is selected, the vmalloc > > space is set to NX (no exec) at segment level (ie by 256Mbytes zone) unless > > CONFIG_MODULES is selected. > > > > Christophe > > This has been deduced down to: > > https://lore.kernel.org/lkml/20200717030422.679972-1-jarkko.sakki...@linux.intel.com/ > > I.e. not intruding PPC anymore :-) > Ok, so after the elaborate discussion we had between Jessica, Russell, Peter, Will, Mark, you and myself, where we pointed out that a) a single text_alloc() abstraction for bpf, kprobes and ftrace does not fit other architectures very well, and b) that module_alloc() is not suitable as a default to base text_alloc() on, you went ahead and implemented that anyway, but only cc'ing Peter, akpm, Masami and the mm list this time? Sorry, but that is not how it works. Once people get pulled into a discussion, you cannot dismiss them or their feedback like that and go off and do your own thing anyway. Generic features like this are tricky to get right, and it will likely take many iterations and input from many different people.
Re: [PATCH 2/5] powerpc: Allow 4096 bytes of stack expansion for the signal frame
Hi Michael, Unfortunately, this patch doesn't completely solve the problem. Trying the original reproducer, I'm still able to trigger the crash even with this patch, although not 100% of the time. (If I turn ASLR off outside of tmux it reliably crashes, if I turn ASLR off _inside_ of tmux it reliably succeeds; all of this is on a serial console.) ./foo 1241000 & sleep 1; killall -USR1 foo; echo ok If I add some debugging information, I see that I'm getting address + 4096 = 7fed0fa0 gpr1 = 7fed1020 So address + 4096 is 0x80 bytes below the 4k window. I haven't been able to figure out why, gdb gives me a NIP in __kernel_sigtramp_rt64 but I don't know what to make of that. Kind regards, Daniel P.S. I don't know what your policy on linking to kernel bugzilla is, but if you want: Link: https://bugzilla.kernel.org/show_bug.cgi?id=205183 > Reported-by: Tom Lane > Signed-off-by: Michael Ellerman > --- > arch/powerpc/mm/fault.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c > index 641fc5f3d7dd..ed01329dd12b 100644 > --- a/arch/powerpc/mm/fault.c > +++ b/arch/powerpc/mm/fault.c > @@ -274,7 +274,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, > unsigned long address, > /* >* N.B. The POWER/Open ABI allows programs to access up to >* 288 bytes below the stack pointer. > - * The kernel signal delivery code writes up to about 1.5kB > + * The kernel signal delivery code writes up to 4KB >* below the stack pointer (r1) before decrementing it. >* The exec code can write slightly over 640kB to the stack >* before setting the user r1. Thus we allow the stack to > @@ -299,7 +299,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, > unsigned long address, >* between the last mapped region and the stack will >* expand the stack rather than segfaulting. >*/ > - if (address + 2048 >= uregs->gpr[1]) > + if (address + 4096 >= uregs->gpr[1]) > return false; > > if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) && > -- > 2.25.1
Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks
Excerpts from Waiman Long's message of July 22, 2020 12:36 am: > On 7/21/20 7:08 AM, Nicholas Piggin wrote: >> diff --git a/arch/powerpc/include/asm/qspinlock.h >> b/arch/powerpc/include/asm/qspinlock.h >> index b752d34517b3..26d8766a1106 100644 >> --- a/arch/powerpc/include/asm/qspinlock.h >> +++ b/arch/powerpc/include/asm/qspinlock.h >> @@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct qspinlock >> *lock) >> >> #else >> extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); >> +extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock); >> #endif >> >> static __always_inline void queued_spin_lock(struct qspinlock *lock) >> { >> -u32 val = 0; >> - >> -if (likely(atomic_try_cmpxchg_lock(>val, , _Q_LOCKED_VAL))) >> +atomic_t *a = >val; >> +u32 val; >> + >> +again: >> +asm volatile( >> +"1:\t" PPC_LWARX(%0,0,%1,1) " # queued_spin_lock >> \n" >> +: "=" (val) >> +: "r" (>counter) >> +: "memory"); >> + >> +if (likely(val == 0)) { >> +asm_volatile_goto( >> +" stwcx. %0,0,%1 >> \n" >> +" bne-%l[again] >> \n" >> +"\t"PPC_ACQUIRE_BARRIER " >> \n" >> +: >> +: "r"(_Q_LOCKED_VAL), "r" (>counter) >> +: "cr0", "memory" >> +: again ); >> return; >> - >> -queued_spin_lock_slowpath(lock, val); >> +} >> + >> +if (likely(val == _Q_LOCKED_VAL)) { >> +asm_volatile_goto( >> +" stwcx. %0,0,%1 >> \n" >> +" bne-%l[again] >> \n" >> +: >> +: "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r" (>counter) >> +: "cr0", "memory" >> +: again ); >> + >> +atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK)); >> +// clear_pending_set_locked(lock); >> +WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); >> +// lockevent_inc(lock_pending); >> +return; >> +} >> + >> +if (val == _Q_PENDING_VAL) { >> +int cnt = _Q_PENDING_LOOPS; >> +val = atomic_cond_read_relaxed(a, >> + (VAL != _Q_PENDING_VAL) || >> !cnt--); >> +if (!(val & ~_Q_LOCKED_MASK)) >> +goto again; >> +} >> +queued_spin_lock_slowpath_queue(lock); >> } >> #define queued_spin_lock queued_spin_lock >> > > I am fine with the arch code override some part of the generic code. Cool. >> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c >> index b9515fcc9b29..ebcc6f5d99d5 100644 >> --- a/kernel/locking/qspinlock.c >> +++ b/kernel/locking/qspinlock.c >> @@ -287,10 +287,14 @@ static __always_inline u32 >> __pv_wait_head_or_lock(struct qspinlock *lock, >> >> #ifdef CONFIG_PARAVIRT_SPINLOCKS >> #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath >> +#define queued_spin_lock_slowpath_queue >> native_queued_spin_lock_slowpath_queue >> #endif >> >> #endif /* _GEN_PV_LOCK_SLOWPATH */ >> >> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock); >> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock); >> + >> /** >>* queued_spin_lock_slowpath - acquire the queued spinlock >>* @lock: Pointer to queued spinlock structure >> @@ -314,12 +318,6 @@ static __always_inline u32 >> __pv_wait_head_or_lock(struct qspinlock *lock, >>*/ >> void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) >> { >> -struct mcs_spinlock *prev, *next, *node; >> -u32 old, tail; >> -int idx; >> - >> -BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); >> - >> if (pv_enabled()) >> goto pv_queue; >> >> @@ -397,6 +395,26 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, >> u32 val) >> queue: >> lockevent_inc(lock_slowpath); >> pv_queue: >> +__queued_spin_lock_slowpath_queue(lock); >> +} >> +EXPORT_SYMBOL(queued_spin_lock_slowpath); >> + >> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock) >> +{ >> +lockevent_inc(lock_slowpath); >> +__queued_spin_lock_slowpath_queue(lock); >> +} >> +EXPORT_SYMBOL(queued_spin_lock_slowpath_queue); >> + >> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock) >> +{ >> +struct mcs_spinlock *prev, *next, *node; >> +u32 old, tail; >> +u32 val; >> +int idx; >> + >> +BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); >> + >> node = this_cpu_ptr([0].mcs); >> idx = node->count++; >> tail = encode_tail(smp_processor_id(), idx); >> @@ -559,7 +577,6 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, >> u32 val) >> */ >>
Re: [PATCHv3 2/2] powerpc/pseries: update device tree before ejecting hotplug uevents
Pingfan Liu writes: > A bug is observed on pseries by taking the following steps on rhel: > -1. drmgr -c mem -r -q 5 > -2. echo c > /proc/sysrq-trigger > > And then, the failure looks like: > kdump: saving to /sysroot//var/crash/127.0.0.1-2020-01-16-02:06:14/ > kdump: saving vmcore-dmesg.txt > kdump: saving vmcore-dmesg.txt complete > kdump: saving vmcore > Checking for memory holes : [ 0.0 %] / > Checking for memory holes : [100.0 %] | > Excluding unnecessary pages : [100.0 %] \ > Copying data : [ 0.3 %] - > eta: 38s[ 44.337636] hash-mmu: mm: Hashing failure ! > EA=0x7fffba40 access=0x8004 current=makedumpfile > [ 44.337663] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 > psize 2 pte=0xc0005504 > [ 44.337677] hash-mmu: mm: Hashing failure ! EA=0x7fffba40 > access=0x8004 current=makedumpfile > [ 44.337692] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 > psize 2 pte=0xc0005504 > [ 44.337708] makedumpfile[469]: unhandled signal 7 at 7fffba40 nip > 7fffbbc4d7fc lr 00011356ca3c code 2 > [ 44.338548] Core dump to |/bin/false pipe failed > /lib/kdump-lib-initramfs.sh: line 98: 469 Bus error > $CORE_COLLECTOR /proc/vmcore > $_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/vmcore-incomplete > kdump: saving vmcore failed > > * Root cause * > After analyzing, it turns out that in the current implementation, > when hot-removing lmb, the KOBJ_REMOVE event ejects before the dt updating as > the code __remove_memory() comes before drmem_update_dt(). > So in kdump kernel, when read_from_oldmem() resorts to > pSeries_lpar_hpte_insert() to install hpte, but fails with -2 due to > non-exist pfn. And finally, low_hash_fault() raise SIGBUS to process, as it > can be observed "Bus error" > > From a viewpoint of listener and publisher, the publisher notifies the > listener before data is ready. This introduces a problem where udev > launches kexec-tools (due to KOBJ_REMOVE) and loads a stale dt before > updating. And in capture kernel, makedumpfile will access the memory based > on the stale dt info, and hit a SIGBUS error due to an un-existed lmb. > > * Fix * > In order to fix this issue, update dt before __remove_memory(), and > accordingly the same rule in hot-add path. > > This will introduce extra dt updating payload for each involved lmb when > hotplug. > But it should be fine since drmem_update_dt() is memory based operation and > hotplug is not a hot path. This is great analysis but the performance implications of the change are grave. The add/remove paths here are already O(n) where n is the quantity of memory assigned to the LP, this change would make it O(n^2): dlpar_memory_add_by_count for_each_drmem_lmb <-- dlpar_add_lmb drmem_update_dt(_v1|_v2) for_each_drmem_lmb <-- Memory add/remove isn't a hot path but quadratic runtime complexity isn't acceptable. Its current performance is bad enough that I have internal bugs open on it. Not to mention we leak memory every time drmem_update_dt is called because we can't safely free device tree properties :-( Also note that this sort of reverts (fixes?) 063b8b1251fd ("powerpc/pseries/memory-hotplug: Only update DT once per memory DLPAR request").
Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm: > On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote: > >> diff --git a/arch/powerpc/include/asm/hw_irq.h >> b/arch/powerpc/include/asm/hw_irq.h >> index 3a0db7b0b46e..35060be09073 100644 >> --- a/arch/powerpc/include/asm/hw_irq.h >> +++ b/arch/powerpc/include/asm/hw_irq.h >> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void) >> #define powerpc_local_irq_pmu_save(flags) \ >> do { \ >> raw_local_irq_pmu_save(flags); \ >> -trace_hardirqs_off(); \ >> +if (!raw_irqs_disabled_flags(flags))\ >> +trace_hardirqs_off(); \ >> } while(0) >> #define powerpc_local_irq_pmu_restore(flags)\ >> do {\ >> -if (raw_irqs_disabled_flags(flags)) { \ >> -raw_local_irq_pmu_restore(flags); \ >> -trace_hardirqs_off(); \ >> -} else {\ >> +if (!raw_irqs_disabled_flags(flags))\ >> trace_hardirqs_on();\ >> -raw_local_irq_pmu_restore(flags); \ >> -} \ >> +raw_local_irq_pmu_restore(flags); \ >> } while(0) > > You shouldn't be calling lockdep from NMI context! After this patch it doesn't. trace_hardirqs_on/off implementation appears to expect to be called in NMI context though, for some reason. > That is, I recently > added suport for that on x86: > > https://lkml.kernel.org/r/20200623083721.155449...@infradead.org > https://lkml.kernel.org/r/20200623083721.216740...@infradead.org > > But you need to be very careful on how you order things, as you can see > the above relies on preempt_count() already having been incremented with > NMI_MASK. Hmm. My patch seems simpler. I don't know this stuff very well, I don't really understand what your patch enables for x86 but at least it shouldn't be incompatible with this one AFAIKS. Thanks, Nick
Re: [PATCH v2 2/2] KVM: PPC: Book3S HV: rework secure mem slot dropping
Le 23/07/2020 à 05:36, Bharata B Rao a écrit : On Tue, Jul 21, 2020 at 12:42:02PM +0200, Laurent Dufour wrote: When a secure memslot is dropped, all the pages backed in the secure device (aka really backed by secure memory by the Ultravisor) should be paged out to a normal page. Previously, this was achieved by triggering the page fault mechanism which is calling kvmppc_svm_page_out() on each pages. This can't work when hot unplugging a memory slot because the memory slot is flagged as invalid and gfn_to_pfn() is then not trying to access the page, so the page fault mechanism is not triggered. Since the final goal is to make a call to kvmppc_svm_page_out() it seems simpler to directly calling it instead of triggering such a mechanism. This way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a memslot. Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock, the call to __kvmppc_svm_page_out() is made. As __kvmppc_svm_page_out needs the vma pointer to migrate the pages, the VMA is fetched in a lazy way, to not trigger find_vma() all the time. In addition, the mmap_sem is help in read mode during that time, not in write mode since the virual memory layout is not impacted, and kvm->arch.uvmem_lock prevents concurrent operation on the secure device. Cc: Ram Pai Cc: Bharata B Rao Cc: Paul Mackerras Signed-off-by: Laurent Dufour --- arch/powerpc/kvm/book3s_hv_uvmem.c | 54 -- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 5a4b02d3f651..ba5c7c77cc3a 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -624,35 +624,55 @@ static inline int kvmppc_svm_page_out(struct vm_area_struct *vma, * fault on them, do fault time migration to replace the device PTEs in * QEMU page table with normal PTEs from newly allocated pages. */ -void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, +void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, struct kvm *kvm, bool skip_page_out) { int i; struct kvmppc_uvmem_page_pvt *pvt; - unsigned long pfn, uvmem_pfn; - unsigned long gfn = free->base_gfn; + struct page *uvmem_page; + struct vm_area_struct *vma = NULL; + unsigned long uvmem_pfn, gfn; + unsigned long addr, end; + + mmap_read_lock(kvm->mm); + + addr = slot->userspace_addr; We typically use gfn_to_hva() for that, but that won't work for a memslot that is already marked INVALID which is the case here. I think it is ok to access slot->userspace_addr here of an INVALID memslot, but just thought of explictly bringing this up. Which explicitly mentioned above in the patch's description: This can't work when hot unplugging a memory slot because the memory slot is flagged as invalid and gfn_to_pfn() is then not trying to access the page, so the page fault mechanism is not triggered. + end = addr + (slot->npages * PAGE_SIZE); - for (i = free->npages; i; --i, ++gfn) { - struct page *uvmem_page; + gfn = slot->base_gfn; + for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) { + + /* Fetch the VMA if addr is not in the latest fetched one */ + if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) { + vma = find_vma_intersection(kvm->mm, addr, end); + if (!vma || + vma->vm_start > addr || vma->vm_end < end) { + pr_err("Can't find VMA for gfn:0x%lx\n", gfn); + break; + } + } In Ram's series, kvmppc_memslot_page_merge() also walks the VMAs spanning the memslot, but it uses a different logic for the same. Why can't these two cases use the same method to walk the VMAs? Is there anything subtly different between the two cases? This is probably doable. At the time I wrote that patch, the kvmppc_memslot_page_merge() was not yet introduced AFAIR. This being said, I'd help a lot to factorize that code... I let Ram dealing with that ;) Cheers, Laurent.
Re: [v4 4/5] KVM: PPC: Book3S HV: retry page migration before erroring-out
I am dropping this patch based on our conversation, where we agreed, we need to rootcause the migration failure. On Thu, Jul 23, 2020 at 11:43:44AM +0530, Bharata B Rao wrote: > On Fri, Jul 17, 2020 at 01:00:26AM -0700, Ram Pai wrote: > > @@ -812,7 +842,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, > > unsigned long gpa, > > struct vm_area_struct *vma; > > int srcu_idx; > > unsigned long gfn = gpa >> page_shift; > > - int ret; > > + int ret, repeat_count = REPEAT_COUNT; > > > > if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) > > return H_UNSUPPORTED; > > @@ -826,34 +856,44 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, > > unsigned long gpa, > > if (flags & H_PAGE_IN_SHARED) > > return kvmppc_share_page(kvm, gpa, page_shift); > > > > - ret = H_PARAMETER; > > srcu_idx = srcu_read_lock(>srcu); > > - mmap_read_lock(kvm->mm); > > > > - start = gfn_to_hva(kvm, gfn); > > - if (kvm_is_error_hva(start)) > > - goto out; > > - > > - mutex_lock(>arch.uvmem_lock); > > /* Fail the page-in request of an already paged-in page */ > > - if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL)) > > - goto out_unlock; > > + mutex_lock(>arch.uvmem_lock); > > + ret = kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL); > > + mutex_unlock(>arch.uvmem_lock); > > Same comment as for the prev patch. I don't think you can release > the lock here. > > > + if (ret) { > > + srcu_read_unlock(>srcu, srcu_idx); > > + return H_PARAMETER; > > + } > > > > - end = start + (1UL << page_shift); > > - vma = find_vma_intersection(kvm->mm, start, end); > > - if (!vma || vma->vm_start > start || vma->vm_end < end) > > - goto out_unlock; > > + do { > > + ret = H_PARAMETER; > > + mmap_read_lock(kvm->mm); > > > > - if (kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift, > > - true)) > > - goto out_unlock; > > + start = gfn_to_hva(kvm, gfn); > > + if (kvm_is_error_hva(start)) { > > + mmap_read_unlock(kvm->mm); > > + break; > > + } > > > > - ret = H_SUCCESS; > > + end = start + (1UL << page_shift); > > + vma = find_vma_intersection(kvm->mm, start, end); > > + if (!vma || vma->vm_start > start || vma->vm_end < end) { > > + mmap_read_unlock(kvm->mm); > > + break; > > + } > > + > > + mutex_lock(>arch.uvmem_lock); > > + ret = kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, > > page_shift, true); > > + mutex_unlock(>arch.uvmem_lock); > > + > > + mmap_read_unlock(kvm->mm); > > + } while (ret == -2 && repeat_count--); > > + > > + if (ret == -2) > > + ret = H_BUSY; > > > > -out_unlock: > > - mutex_unlock(>arch.uvmem_lock); > > -out: > > - mmap_read_unlock(kvm->mm); > > srcu_read_unlock(>srcu, srcu_idx); > > return ret; > > } > > -- > > 1.8.3.1 -- Ram Pai
Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote: > diff --git a/arch/powerpc/include/asm/hw_irq.h > b/arch/powerpc/include/asm/hw_irq.h > index 3a0db7b0b46e..35060be09073 100644 > --- a/arch/powerpc/include/asm/hw_irq.h > +++ b/arch/powerpc/include/asm/hw_irq.h > @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void) > #define powerpc_local_irq_pmu_save(flags)\ >do { \ > raw_local_irq_pmu_save(flags); \ > - trace_hardirqs_off(); \ > + if (!raw_irqs_disabled_flags(flags))\ > + trace_hardirqs_off(); \ > } while(0) > #define powerpc_local_irq_pmu_restore(flags) \ > do {\ > - if (raw_irqs_disabled_flags(flags)) { \ > - raw_local_irq_pmu_restore(flags); \ > - trace_hardirqs_off(); \ > - } else {\ > + if (!raw_irqs_disabled_flags(flags))\ > trace_hardirqs_on();\ > - raw_local_irq_pmu_restore(flags); \ > - } \ > + raw_local_irq_pmu_restore(flags); \ > } while(0) You shouldn't be calling lockdep from NMI context! That is, I recently added suport for that on x86: https://lkml.kernel.org/r/20200623083721.155449...@infradead.org https://lkml.kernel.org/r/20200623083721.216740...@infradead.org But you need to be very careful on how you order things, as you can see the above relies on preempt_count() already having been incremented with NMI_MASK.
Re: [v4 3/5] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.
On Thu, Jul 23, 2020 at 11:40:37AM +0530, Bharata B Rao wrote: > On Fri, Jul 17, 2020 at 01:00:25AM -0700, Ram Pai wrote: > > > > +int kvmppc_uv_migrate_mem_slot(struct kvm *kvm, > > + const struct kvm_memory_slot *memslot) > > Don't see any callers for this outside of this file, so why not static? > > > +{ > > + unsigned long gfn = memslot->base_gfn; > > + struct vm_area_struct *vma; > > + unsigned long start, end; > > + int ret = 0; > > + > > + while (kvmppc_next_nontransitioned_gfn(memslot, kvm, )) { > > So you checked the state of gfn under uvmem_lock above, but release > it too. > > > + > > + mmap_read_lock(kvm->mm); > > + start = gfn_to_hva(kvm, gfn); > > + if (kvm_is_error_hva(start)) { > > + ret = H_STATE; > > + goto next; > > + } > > + > > + end = start + (1UL << PAGE_SHIFT); > > + vma = find_vma_intersection(kvm->mm, start, end); > > + if (!vma || vma->vm_start > start || vma->vm_end < end) { > > + ret = H_STATE; > > + goto next; > > + } > > + > > + mutex_lock(>arch.uvmem_lock); > > + ret = kvmppc_svm_migrate_page(vma, start, end, > > + (gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false); > > What is the guarantee that the gfn is in the same earlier state when you do > do migration here? Are you worried about the case, where someother thread will sneak-in and migrate the GFN, and this migration request will become a duplicate one? That is theortically possible, though practically improbable. This transition is attempted only when there is one vcpu active in the VM. However, may be, we should not bake-in that assumption in this code. Will remove that assumption. RP
Re: [v4 2/5] KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
On Thu, Jul 23, 2020 at 10:18:30AM +0530, Bharata B Rao wrote: > On Fri, Jul 17, 2020 at 01:00:24AM -0700, Ram Pai wrote: > > pvt->gpa = gpa; ..snip.. > > pvt->kvm = kvm; > > @@ -524,6 +663,7 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, > > unsigned long gpa, > > uvmem_page = pfn_to_page(uvmem_pfn); > > pvt = uvmem_page->zone_device_data; > > pvt->skip_page_out = true; > > + pvt->remove_gfn = false; > > } > > > > retry: > > @@ -537,12 +677,16 @@ static unsigned long kvmppc_share_page(struct kvm > > *kvm, unsigned long gpa, > > uvmem_page = pfn_to_page(uvmem_pfn); > > pvt = uvmem_page->zone_device_data; > > pvt->skip_page_out = true; > > + pvt->remove_gfn = false; > > This is the case of making an already secure page as shared page. > A comment here as to why remove_gfn is set to false here will help. > > Also isn't it by default false? Is there a situation where it starts > out by default false, becomes true later and you are required to > explicitly mark it false here? It is by default false. And will be true when the GFN is released/invalidated through kvmppc_uvmem_drop_pages(). It is marked false explicitly here, just to be safe, and protect against any implicit changes. > > Otherwise, Reviewed-by: Bharata B Rao > Thanks for the review. RP
[PATCH 2/2] lockdep: warn on redundant or incorrect irq state changes
With the previous patch, lockdep hardirq state changes should not be redundant. Softirq state changes already follow that pattern. So warn on unexpected enable-when-enabled or disable-when-disabled conditions, to catch possible errors or sloppy patterns that could lead to similar bad behavior due to NMIs etc. Signed-off-by: Nicholas Piggin --- kernel/locking/lockdep.c | 80 +- kernel/locking/lockdep_internals.h | 4 -- kernel/locking/lockdep_proc.c | 10 +--- 3 files changed, 35 insertions(+), 59 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 29a8de4c50b9..138458fb2234 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3649,15 +3649,8 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; - if (unlikely(current->hardirqs_enabled)) { - /* -* Neither irq nor preemption are disabled here -* so this is racy by nature but losing one hit -* in a stat is not a big deal. -*/ - __debug_atomic_inc(redundant_hardirqs_on); + if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) return; - } /* * We're enabling irqs and according to our state above irqs weren't @@ -3695,15 +3688,8 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) if (unlikely(!debug_locks || curr->lockdep_recursion)) return; - if (curr->hardirqs_enabled) { - /* -* Neither irq nor preemption are disabled here -* so this is racy by nature but losing one hit -* in a stat is not a big deal. -*/ - __debug_atomic_inc(redundant_hardirqs_on); + if (DEBUG_LOCKS_WARN_ON(curr->hardirqs_enabled)) return; - } /* * We're enabling irqs and according to our state above irqs weren't @@ -3738,6 +3724,9 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (unlikely(!debug_locks || curr->lockdep_recursion)) return; + if (DEBUG_LOCKS_WARN_ON(!curr->hardirqs_enabled)) + return; + /* * So we're supposed to get called after you mask local IRQs, but for * some reason the hardware doesn't quite think you did a proper job. @@ -3745,17 +3734,13 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->hardirqs_enabled) { - /* -* We have done an ON -> OFF transition: -*/ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_off_events); - } else { - debug_atomic_inc(redundant_hardirqs_off); - } + /* +* We have done an ON -> OFF transition: +*/ + curr->hardirqs_enabled = 0; + curr->hardirq_disable_ip = ip; + curr->hardirq_disable_event = ++curr->irq_events; + debug_atomic_inc(hardirqs_off_events); } EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); @@ -3769,6 +3754,9 @@ void lockdep_softirqs_on(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + if (DEBUG_LOCKS_WARN_ON(curr->softirqs_enabled)) + return; + /* * We fancy IRQs being disabled here, see softirq.c, avoids * funny state and nesting things. @@ -3776,11 +3764,6 @@ void lockdep_softirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { - debug_atomic_inc(redundant_softirqs_on); - return; - } - current->lockdep_recursion++; /* * We'll do an OFF -> ON transition: @@ -3809,26 +3792,26 @@ void lockdep_softirqs_off(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + if (DEBUG_LOCKS_WARN_ON(!curr->softirqs_enabled)) + return; + /* * We fancy IRQs being disabled here, see softirq.c */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; - if (curr->softirqs_enabled) { - /* -* We have done an ON -> OFF transition: -*/ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; - debug_atomic_inc(softirqs_off_events); - /* -* Whoops, we wanted softirqs off, so why aren't they? -*/ -
[PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
If an interrupt is not masked by local_irq_disable (e.g., a powerpc perf interrupt), then it can hit in local_irq_enable() after trace_hardirqs_on() and before raw_local_irq_enable(). If that interrupt handler calls local_irq_save(), it will call trace_hardirqs_off() but the local_irq_restore() will not call trace_hardirqs_on() again because raw_irqs_disabled_flags(flags) is true. This can lead lockdep_assert_irqs_enabled() to trigger false positive warnings. Fix this by being careful to only enable and disable trace_hardirqs with the outer-most irq enable/disable. Reported-by: Alexey Kardashevskiy Signed-off-by: Nicholas Piggin --- I haven't tested on other architectures but I imagine NMIs in general might cause a similar problem. Other architectures might have to be updated for patch 2, but there's a lot of asm around interrupt/return, so I didn't have a very good lock. The warnings should be harmless enough and uncover most places that need updating. arch/powerpc/include/asm/hw_irq.h | 11 --- include/linux/irqflags.h | 29 ++--- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 3a0db7b0b46e..35060be09073 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void) #define powerpc_local_irq_pmu_save(flags) \ do { \ raw_local_irq_pmu_save(flags); \ - trace_hardirqs_off(); \ + if (!raw_irqs_disabled_flags(flags))\ + trace_hardirqs_off(); \ } while(0) #define powerpc_local_irq_pmu_restore(flags) \ do {\ - if (raw_irqs_disabled_flags(flags)) { \ - raw_local_irq_pmu_restore(flags); \ - trace_hardirqs_off(); \ - } else {\ + if (!raw_irqs_disabled_flags(flags))\ trace_hardirqs_on();\ - raw_local_irq_pmu_restore(flags); \ - } \ + raw_local_irq_pmu_restore(flags); \ } while(0) #else #define powerpc_local_irq_pmu_save(flags) \ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 6384d2813ded..571ee29ecefc 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -163,26 +163,33 @@ do { \ * if !TRACE_IRQFLAGS. */ #ifdef CONFIG_TRACE_IRQFLAGS -#define local_irq_enable() \ - do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0) -#define local_irq_disable() \ - do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0) +#define local_irq_enable() \ + do {\ + trace_hardirqs_on();\ + raw_local_irq_enable(); \ + } while (0) + +#define local_irq_disable()\ + do {\ + bool was_disabled = raw_irqs_disabled(); \ + raw_local_irq_disable();\ + if (!was_disabled) \ + trace_hardirqs_off(); \ + } while (0) + #define local_irq_save(flags) \ do {\ raw_local_irq_save(flags); \ - trace_hardirqs_off(); \ + if (!raw_irqs_disabled_flags(flags))\ + trace_hardirqs_off(); \ } while (0) #define local_irq_restore(flags) \ do {\ - if (raw_irqs_disabled_flags(flags)) { \ - raw_local_irq_restore(flags); \ - trace_hardirqs_off(); \ - } else {\ + if (!raw_irqs_disabled_flags(flags))\ trace_hardirqs_on();\ - raw_local_irq_restore(flags); \ - } \ + raw_local_irq_restore(flags); \ } while (0) #define safe_halt()\ -- 2.23.0
Re: [PATCH 0/7] powerpc/watchpoint: 2nd DAWR kvm enablement + selftests
On 7/23/20 3:50 PM, Ravi Bangoria wrote: Patch #1, #2 and #3 enables p10 2nd DAWR feature for Book3S kvm guest. DAWR is a hypervisor resource and thus H_SET_MODE hcall is used to set/unset it. A new case H_SET_MODE_RESOURCE_SET_DAWR1 is introduced in H_SET_MODE hcall for setting/unsetting 2nd DAWR. Also, new capability KVM_CAP_PPC_DAWR1 has been added to query 2nd DAWR support via kvm ioctl. This feature also needs to be enabled in Qemu to really use it. I'll reply link to qemu patches once I post them in qemu-devel mailing list. Qemu patches: https://lore.kernel.org/kvm/20200723104220.314671-1-ravi.bango...@linux.ibm.com
Re: [PATCH] powerpc/64s: Fix irq tracing corruption in interrupt/syscall return caused by perf interrupts
Excerpts from Alexey Kardashevskiy's message of July 22, 2020 8:50 pm: > > > On 22/07/2020 17:34, Nicholas Piggin wrote: >> Alexey reports lockdep_assert_irqs_enabled() warnings when stress testing >> perf, e.g., >> >> WARNING: CPU: 0 PID: 1556 at kernel/softirq.c:169 >> __local_bh_enable_ip+0x258/0x270 >> CPU: 0 PID: 1556 Comm: syz-executor >> NIP: c01ec888 LR: c01ec884 CTR: c0ef0610 >> REGS: c00022d4f8a0 TRAP: 0700 Not tainted (5.8.0-rc3-x) >> MSR: 80029033 CR: 28008844 XER: 2004 >> CFAR: c01dc1d0 IRQMASK: 0 >> >> The interesting thing is MSR[EE] and IRQMASK shows interrupts are enabled, >> suggesting the current->hardirqs_enabled irq tracing state is going out of >> sync >> with the actual interrupt enable state. >> >> The cause is a window in interrupt/syscall return where irq tracing state is >> being >> adjusted for an irqs-enabled return while MSR[EE] is still enabled. A perf >> interrupt hits and ends up calling trace_hardirqs_off() when restoring >> interrupt flags to a disable state. >> >> Fix this by disabling perf interrupts as well while adjusting irq tracing >> state. >> >> Add a debug check that catches the condition sooner. >> >> Fixes: 68b34588e202 ("powerpc/64/sycall: Implement syscall entry/exit logic >> in C") >> Reported-by: Alexey Kardashevskiy >> Signed-off-by: Nicholas Piggin >> --- >> >> I can reproduce similar symptoms and this patch fixes my test case, >> still trying to confirm Alexey's test case or whether there's another >> similar bug causing it. > > > This does not fix my testcase. I applied this on top of 4fa640dc5230 > ("Merge tag 'vfio-v5.8-rc7' of git://github.com/awilliam/linux-vfio into > master") without any of my testing code, just to be clear. Sorry... Okay it seems to be a bigger problem and not actually caused by that patch but was possible for lockdep hardirqs_enabled state to get out of synch with the local_irq_disable() state before that too. Root cause is similar -- perf interrupts hitting between updating the two different bits of state. Not quite sure why Alexey's test wasn't hitting it before the patch, but possibly the way masked interrupts get replayed. But I was able to hit the problem with a different assertion. I think I have a fix, but it seems to be a generic irq tracing code issue. So this patch can be dropped, and it's not an urgent issue for the next release (it only triggers warns on rare occasions and only when lockdep is enabled). Thanks, Nick
[PATCH 7/7] powerpc/selftests: Add selftest to test concurrent perf/ptrace events
ptrace and perf watchpoints can't co-exists if their address range overlaps. See commit 29da4f91c0c1 ("powerpc/watchpoint: Don't allow concurrent perf and ptrace events") for more detail. Add selftest for the same. Sample o/p: # ./ptrace-perf-hwbreak test: ptrace-perf-hwbreak tags: git_version:powerpc-5.8-7-118-g937fa174a15d-dirty perf cpu event -> ptrace thread event (Overlapping): Ok perf cpu event -> ptrace thread event (Non-overlapping): Ok perf thread event -> ptrace same thread event (Overlapping): Ok perf thread event -> ptrace same thread event (Non-overlapping): Ok perf thread event -> ptrace other thread event: Ok ptrace thread event -> perf kernel event: Ok ptrace thread event -> perf same thread event (Overlapping): Ok ptrace thread event -> perf same thread event (Non-overlapping): Ok ptrace thread event -> perf other thread event: Ok ptrace thread event -> perf cpu event (Overlapping): Ok ptrace thread event -> perf cpu event (Non-overlapping): Ok ptrace thread event -> perf same thread & cpu event (Overlapping): Ok ptrace thread event -> perf same thread & cpu event (Non-overlapping): Ok ptrace thread event -> perf other thread & cpu event: Ok success: ptrace-perf-hwbreak Signed-off-by: Ravi Bangoria --- .../selftests/powerpc/ptrace/.gitignore | 1 + .../testing/selftests/powerpc/ptrace/Makefile | 2 +- .../powerpc/ptrace/ptrace-perf-hwbreak.c | 659 ++ 3 files changed, 661 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore b/tools/testing/selftests/powerpc/ptrace/.gitignore index 0e96150b7c7e..eb75e5360e31 100644 --- a/tools/testing/selftests/powerpc/ptrace/.gitignore +++ b/tools/testing/selftests/powerpc/ptrace/.gitignore @@ -14,3 +14,4 @@ perf-hwbreak core-pkey ptrace-pkey ptrace-syscall +ptrace-perf-hwbreak diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 8d3f006c98cc..a500639da97a 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -2,7 +2,7 @@ TEST_GEN_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \ ptrace-tar ptrace-tm-tar ptrace-tm-spd-tar ptrace-vsx ptrace-tm-vsx \ ptrace-tm-spd-vsx ptrace-tm-spr ptrace-hwbreak ptrace-pkey core-pkey \ - perf-hwbreak ptrace-syscall + perf-hwbreak ptrace-syscall ptrace-perf-hwbreak top_srcdir = ../../../../.. include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c new file mode 100644 index ..6b8804a4942e --- /dev/null +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c @@ -0,0 +1,659 @@ +// SPDX-License-Identifier: GPL-2.0+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ptrace.h" + +char data[16]; + +/* Overlapping address range */ +volatile __u64 *ptrace_data1 = (__u64 *)[0]; +volatile __u64 *perf_data1 = (__u64 *)[4]; + +/* Non-overlapping address range */ +volatile __u64 *ptrace_data2 = (__u64 *)[0]; +volatile __u64 *perf_data2 = (__u64 *)[8]; + +static unsigned long pid_max_addr(void) +{ + FILE *fp; + char *line, *c; + char addr[100]; + size_t len = 0; + + fp = fopen("/proc/kallsyms", "r"); + if (!fp) { + printf("Failed to read /proc/kallsyms. Exiting..\n"); + exit(EXIT_FAILURE); + } + + while (getline(, , fp) != -1) { + if (!strstr(line, "pid_max") || strstr(line, "pid_max_max") || + strstr(line, "pid_max_min")) + continue; + + strncpy(addr, line, len < 100 ? len : 100); + c = strchr(addr, ' '); + *c = '\0'; + return strtoul(addr, , 16); + } + fclose(fp); + printf("Could not find pix_max. Exiting..\n"); + exit(EXIT_FAILURE); + return -1; +} + +static void perf_user_event_attr_set(struct perf_event_attr *attr, __u64 addr, __u64 len) +{ + memset(attr, 0, sizeof(struct perf_event_attr)); + attr->type = PERF_TYPE_BREAKPOINT; + attr->size = sizeof(struct perf_event_attr); + attr->bp_type= HW_BREAKPOINT_R; + attr->bp_addr= addr; + attr->bp_len = len; + attr->exclude_kernel = 1; + attr->exclude_hv = 1; +} + +static void perf_kernel_event_attr_set(struct perf_event_attr *attr) +{ + memset(attr, 0, sizeof(struct perf_event_attr)); + attr->type = PERF_TYPE_BREAKPOINT; + attr->size = sizeof(struct perf_event_attr); + attr->bp_type= HW_BREAKPOINT_R; + attr->bp_addr=
[PATCH 6/7] powerpc/selftests/perf-hwbreak: Add testcases for 2nd DAWR
Extend perf-hwbreak.c selftest to test multiple DAWRs. Also add testcase for testing 512 byte boundary removal. Sample o/p: # ./perf-hwbreak ... TESTED: Process specific, Two events, diff addr TESTED: Process specific, Two events, same addr TESTED: Process specific, Two events, diff addr, one is RO, other is WO TESTED: Process specific, Two events, same addr, one is RO, other is WO TESTED: Systemwide, Two events, diff addr TESTED: Systemwide, Two events, same addr TESTED: Systemwide, Two events, diff addr, one is RO, other is WO TESTED: Systemwide, Two events, same addr, one is RO, other is WO TESTED: Process specific, 512 bytes, unaligned success: perf_hwbreak Signed-off-by: Ravi Bangoria --- .../selftests/powerpc/ptrace/perf-hwbreak.c | 568 +- 1 file changed, 567 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c index bde475341c8a..5df08738884d 100644 --- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c @@ -21,8 +21,13 @@ #include #include #include +#include #include #include +#include +#include +#include +#include #include #include #include @@ -34,6 +39,12 @@ #define DAWR_LENGTH_MAX ((0x3f + 1) * 8) +int nprocs; + +static volatile int a = 10; +static volatile int b = 10; +static volatile char c[512 + 8] __attribute__((aligned(512))); + static void perf_event_attr_set(struct perf_event_attr *attr, __u32 type, __u64 addr, __u64 len, bool exclude_user) @@ -68,6 +79,76 @@ static int perf_process_event_open(__u32 type, __u64 addr, __u64 len) return syscall(__NR_perf_event_open, , getpid(), -1, -1, 0); } +static int perf_cpu_event_open(long cpu, __u32 type, __u64 addr, __u64 len) +{ + struct perf_event_attr attr; + + perf_event_attr_set(, type, addr, len, 0); + return syscall(__NR_perf_event_open, , -1, cpu, -1, 0); +} + +static void close_fds(int *fd, int n) +{ + int i; + + for (i = 0; i < n; i++) + close(fd[i]); +} + +static unsigned long read_fds(int *fd, int n) +{ + int i; + unsigned long c = 0; + unsigned long count = 0; + size_t res; + + for (i = 0; i < n; i++) { + res = read(fd[i], , sizeof(c)); + assert(res == sizeof(unsigned long long)); + count += c; + } + return count; +} + +static void reset_fds(int *fd, int n) +{ + int i; + + for (i = 0; i < n; i++) + ioctl(fd[i], PERF_EVENT_IOC_RESET); +} + +static void enable_fds(int *fd, int n) +{ + int i; + + for (i = 0; i < n; i++) + ioctl(fd[i], PERF_EVENT_IOC_ENABLE); +} + +static void disable_fds(int *fd, int n) +{ + int i; + + for (i = 0; i < n; i++) + ioctl(fd[i], PERF_EVENT_IOC_DISABLE); +} + +static int perf_systemwide_event_open(int *fd, __u32 type, __u64 addr, __u64 len) +{ + int i = 0; + + /* Assume online processors are 0 to nprocs for simplisity */ + for (i = 0; i < nprocs; i++) { + fd[i] = perf_cpu_event_open(i, type, addr, len); + if (fd[i] < 0) { + close_fds(fd, i); + return fd[i]; + } + } + return 0; +} + static inline bool breakpoint_test(int len) { int fd; @@ -261,11 +342,483 @@ static int runtest_dar_outside(void) return fail; } +static void multi_dawr_workload(void) +{ + a += 10; + b += 10; + c[512 + 1] += 'a'; +} + +static int test_process_multi_diff_addr(void) +{ + unsigned long long breaks1 = 0, breaks2 = 0; + int fd1, fd2; + char *desc = "Process specific, Two events, diff addr"; + size_t res; + + fd1 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64), (__u64)sizeof(a)); + if (fd1 < 0) { + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + fd2 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64), (__u64)sizeof(b)); + if (fd2 < 0) { + close(fd1); + perror("perf_process_event_open"); + exit(EXIT_FAILURE); + } + + ioctl(fd1, PERF_EVENT_IOC_RESET); + ioctl(fd2, PERF_EVENT_IOC_RESET); + ioctl(fd1, PERF_EVENT_IOC_ENABLE); + ioctl(fd2, PERF_EVENT_IOC_ENABLE); + multi_dawr_workload(); + ioctl(fd1, PERF_EVENT_IOC_DISABLE); + ioctl(fd2, PERF_EVENT_IOC_DISABLE); + + res = read(fd1, , sizeof(breaks1)); + assert(res == sizeof(unsigned long long)); + res = read(fd2, , sizeof(breaks2)); + assert(res == sizeof(unsigned long long)); + + close(fd1); + close(fd2); + + if (breaks1 != 2 || breaks2 != 2) { + printf("FAILED: %s: %lld != 2 || %lld
[PATCH 5/7] powerpc/selftests/perf-hwbreak: Coalesce event creation code
perf-hwbreak selftest opens hw-breakpoint event at multiple places for which it has same code repeated. Coalesce that code into a function. Signed-off-by: Ravi Bangoria --- .../selftests/powerpc/ptrace/perf-hwbreak.c | 78 +-- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c index c1f324afdbf3..bde475341c8a 100644 --- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c @@ -34,28 +34,46 @@ #define DAWR_LENGTH_MAX ((0x3f + 1) * 8) -static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid, - int cpu, int group_fd, - unsigned long flags) +static void perf_event_attr_set(struct perf_event_attr *attr, + __u32 type, __u64 addr, __u64 len, + bool exclude_user) { - attr->size = sizeof(*attr); - return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); + memset(attr, 0, sizeof(struct perf_event_attr)); + attr->type = PERF_TYPE_BREAKPOINT; + attr->size = sizeof(struct perf_event_attr); + attr->bp_type= type; + attr->bp_addr= addr; + attr->bp_len = len; + attr->exclude_kernel = 1; + attr->exclude_hv = 1; + attr->exclude_guest = 1; + attr->exclude_user = exclude_user; + attr->disabled = 1; } -static inline bool breakpoint_test(int len) +static int +perf_process_event_open_exclude_user(__u32 type, __u64 addr, __u64 len, bool exclude_user) { struct perf_event_attr attr; + + perf_event_attr_set(, type, addr, len, exclude_user); + return syscall(__NR_perf_event_open, , getpid(), -1, -1, 0); +} + +static int perf_process_event_open(__u32 type, __u64 addr, __u64 len) +{ + struct perf_event_attr attr; + + perf_event_attr_set(, type, addr, len, 0); + return syscall(__NR_perf_event_open, , getpid(), -1, -1, 0); +} + +static inline bool breakpoint_test(int len) +{ int fd; - /* setup counters */ - memset(, 0, sizeof(attr)); - attr.disabled = 1; - attr.type = PERF_TYPE_BREAKPOINT; - attr.bp_type = HW_BREAKPOINT_R; /* bp_addr can point anywhere but needs to be aligned */ - attr.bp_addr = (__u64)() & 0xf800; - attr.bp_len = len; - fd = sys_perf_event_open(, 0, -1, -1, 0); + fd = perf_process_event_open(HW_BREAKPOINT_R, (__u64)() & 0xf800, len); if (fd < 0) return false; close(fd); @@ -75,7 +93,6 @@ static inline bool dawr_supported(void) static int runtestsingle(int readwriteflag, int exclude_user, int arraytest) { int i,j; - struct perf_event_attr attr; size_t res; unsigned long long breaks, needed; int readint; @@ -94,19 +111,11 @@ static int runtestsingle(int readwriteflag, int exclude_user, int arraytest) if (arraytest) ptr = [0]; - /* setup counters */ - memset(, 0, sizeof(attr)); - attr.disabled = 1; - attr.type = PERF_TYPE_BREAKPOINT; - attr.bp_type = readwriteflag; - attr.bp_addr = (__u64)ptr; - attr.bp_len = sizeof(int); - if (arraytest) - attr.bp_len = DAWR_LENGTH_MAX; - attr.exclude_user = exclude_user; - break_fd = sys_perf_event_open(, 0, -1, -1, 0); + break_fd = perf_process_event_open_exclude_user(readwriteflag, (__u64)ptr, + arraytest ? DAWR_LENGTH_MAX : sizeof(int), + exclude_user); if (break_fd < 0) { - perror("sys_perf_event_open"); + perror("perf_process_event_open_exclude_user"); exit(1); } @@ -153,7 +162,6 @@ static int runtest_dar_outside(void) void *target; volatile __u16 temp16; volatile __u64 temp64; - struct perf_event_attr attr; int break_fd; unsigned long long breaks; int fail = 0; @@ -165,21 +173,11 @@ static int runtest_dar_outside(void) exit(EXIT_FAILURE); } - /* setup counters */ - memset(, 0, sizeof(attr)); - attr.disabled = 1; - attr.type = PERF_TYPE_BREAKPOINT; - attr.exclude_kernel = 1; - attr.exclude_hv = 1; - attr.exclude_guest = 1; - attr.bp_type = HW_BREAKPOINT_RW; /* watch middle half of target array */ - attr.bp_addr = (__u64)(target + 2); - attr.bp_len = 4; - break_fd = sys_perf_event_open(, 0, -1, -1, 0); + break_fd = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)(target + 2), 4); if (break_fd < 0) { free(target); - perror("sys_perf_event_open"); +
[PATCH 4/7] powerpc/selftests/ptrace-hwbreak: Add testcases for 2nd DAWR
Add selftests to test multiple active DAWRs with ptrace interface. Sample o/p: $ ./ptrace-hwbreak ... PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO, len: 6: Ok PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO, len: 6: Ok PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO, len: 6: Ok PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO, len: 6: Ok Signed-off-by: Ravi Bangoria --- .../selftests/powerpc/ptrace/ptrace-hwbreak.c | 79 +++ 1 file changed, 79 insertions(+) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c index fc477dfe86a2..65781f4035c1 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c @@ -185,6 +185,18 @@ static void test_workload(void) big_var[rand() % DAWR_MAX_LEN] = 'a'; else cvar = big_var[rand() % DAWR_MAX_LEN]; + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO test */ + gstruct.a[rand() % A_LEN] = 'a'; + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO test */ + cvar = gstruct.b[rand() % B_LEN]; + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO test */ + gstruct.a[rand() % A_LEN] = 'a'; + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO test */ + cvar = gstruct.a[rand() % A_LEN]; } static void check_success(pid_t child_pid, const char *name, const char *type, @@ -374,6 +386,69 @@ static void test_sethwdebug_range_aligned(pid_t child_pid) ptrace_delhwdebug(child_pid, wh); } +static void test_multi_sethwdebug_range(pid_t child_pid) +{ + struct ppc_hw_breakpoint info1, info2; + unsigned long wp_addr1, wp_addr2; + char *name1 = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED"; + char *name2 = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED"; + int len1, len2; + int wh1, wh2; + + wp_addr1 = (unsigned long) + wp_addr2 = (unsigned long) + len1 = A_LEN; + len2 = B_LEN; + get_ppc_hw_breakpoint(, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr1, len1); + get_ppc_hw_breakpoint(, PPC_BREAKPOINT_TRIGGER_READ, wp_addr2, len2); + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO test */ + wh1 = ptrace_sethwdebug(child_pid, ); + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO test */ + wh2 = ptrace_sethwdebug(child_pid, ); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success(child_pid, name1, "WO", wp_addr1, len1); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success(child_pid, name2, "RO", wp_addr2, len2); + + ptrace_delhwdebug(child_pid, wh1); + ptrace_delhwdebug(child_pid, wh2); +} + +static void test_multi_sethwdebug_range_dawr_overlap(pid_t child_pid) +{ + struct ppc_hw_breakpoint info1, info2; + unsigned long wp_addr1, wp_addr2; + char *name = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap"; + int len1, len2; + int wh1, wh2; + + wp_addr1 = (unsigned long) + wp_addr2 = (unsigned long) + len1 = A_LEN; + len2 = A_LEN; + get_ppc_hw_breakpoint(, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr1, len1); + get_ppc_hw_breakpoint(, PPC_BREAKPOINT_TRIGGER_READ, wp_addr2, len2); + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO test */ + wh1 = ptrace_sethwdebug(child_pid, ); + + /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO test */ + wh2 = ptrace_sethwdebug(child_pid, ); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success(child_pid, name, "WO", wp_addr1, len1); + + ptrace(PTRACE_CONT, child_pid, NULL, 0); + check_success(child_pid, name, "RO", wp_addr2, len2); + + ptrace_delhwdebug(child_pid, wh1); + ptrace_delhwdebug(child_pid, wh2); +} + static void test_sethwdebug_range_unaligned(pid_t child_pid) { struct ppc_hw_breakpoint info; @@ -460,6 +535,10 @@ run_tests(pid_t child_pid, struct ppc_debug_info *dbginfo, bool dawr) test_sethwdebug_range_unaligned(child_pid); test_sethwdebug_range_unaligned_dar(child_pid); test_sethwdebug_dawr_max_range(child_pid); + if (dbginfo->num_data_bps > 1) { + test_multi_sethwdebug_range(child_pid); + test_multi_sethwdebug_range_dawr_overlap(child_pid); + } } } } -- 2.26.2
[PATCH 1/7] powerpc/watchpoint/kvm: Rename current DAWR macros and variables
Power10 is introducing second DAWR. Use real register names (with suffix 0) from ISA for current macros and variables used by kvm. Signed-off-by: Ravi Bangoria --- Documentation/virt/kvm/api.rst| 4 +-- arch/powerpc/include/asm/kvm_host.h | 4 +-- arch/powerpc/include/uapi/asm/kvm.h | 4 +-- arch/powerpc/kernel/asm-offsets.c | 4 +-- arch/powerpc/kvm/book3s_hv.c | 32 +++ arch/powerpc/kvm/book3s_hv_nested.c | 8 +++--- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 20 +++--- tools/arch/powerpc/include/uapi/asm/kvm.h | 4 +-- 8 files changed, 40 insertions(+), 40 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 426f94582b7a..4dc18fe6a2bf 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2219,8 +2219,8 @@ registers, find a list below: PPC KVM_REG_PPC_BESCR 64 PPC KVM_REG_PPC_TAR 64 PPC KVM_REG_PPC_DPDES 64 - PPC KVM_REG_PPC_DAWR64 - PPC KVM_REG_PPC_DAWRX 64 + PPC KVM_REG_PPC_DAWR0 64 + PPC KVM_REG_PPC_DAWRX0 64 PPC KVM_REG_PPC_CIABR 64 PPC KVM_REG_PPC_IC 64 PPC KVM_REG_PPC_VTB 64 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7e2d061d0445..9aa3854f0e1e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -582,8 +582,8 @@ struct kvm_vcpu_arch { u32 ctrl; u32 dabrx; ulong dabr; - ulong dawr; - ulong dawrx; + ulong dawr0; + ulong dawrx0; ulong ciabr; ulong cfar; ulong ppr; diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 264e266a85bf..38d61b73f5ed 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -608,8 +608,8 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_BESCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa7) #define KVM_REG_PPC_TAR(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa8) #define KVM_REG_PPC_DPDES (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa9) -#define KVM_REG_PPC_DAWR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa) -#define KVM_REG_PPC_DAWRX (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab) +#define KVM_REG_PPC_DAWR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa) +#define KVM_REG_PPC_DAWRX0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab) #define KVM_REG_PPC_CIABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xac) #define KVM_REG_PPC_IC (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xad) #define KVM_REG_PPC_VTB(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xae) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6657dc6b2336..e76bffe348e1 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -547,8 +547,8 @@ int main(void) OFFSET(VCPU_CTRL, kvm_vcpu, arch.ctrl); OFFSET(VCPU_DABR, kvm_vcpu, arch.dabr); OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx); - OFFSET(VCPU_DAWR, kvm_vcpu, arch.dawr); - OFFSET(VCPU_DAWRX, kvm_vcpu, arch.dawrx); + OFFSET(VCPU_DAWR0, kvm_vcpu, arch.dawr0); + OFFSET(VCPU_DAWRX0, kvm_vcpu, arch.dawrx0); OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr); OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags); OFFSET(VCPU_DEC, kvm_vcpu, arch.dec); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 89afcc5f60ca..28200e4f5d27 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -778,8 +778,8 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, return H_UNSUPPORTED_FLAG_START; if (value2 & DABRX_HYP) return H_P4; - vcpu->arch.dawr = value1; - vcpu->arch.dawrx = value2; + vcpu->arch.dawr0 = value1; + vcpu->arch.dawrx0 = value2; return H_SUCCESS; case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE: /* KVM does not support mflags=2 (AIL=2) */ @@ -1724,11 +1724,11 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_VTB: *val = get_reg_val(id, vcpu->arch.vcore->vtb); break; - case KVM_REG_PPC_DAWR: - *val = get_reg_val(id, vcpu->arch.dawr); + case KVM_REG_PPC_DAWR0: + *val = get_reg_val(id, vcpu->arch.dawr0); break; - case KVM_REG_PPC_DAWRX: - *val = get_reg_val(id, vcpu->arch.dawrx); + case KVM_REG_PPC_DAWRX0: + *val = get_reg_val(id, vcpu->arch.dawrx0); break; case KVM_REG_PPC_CIABR: *val = get_reg_val(id, vcpu->arch.ciabr); @@ -1938,11
[PATCH 3/7] powerpc/watchpoint/kvm: Introduce new capability for 2nd DAWR
Introduce KVM_CAP_PPC_DAWR1 which can be used by Qemu to query whether kvm supports 2nd DAWR or not. Signed-off-by: Ravi Bangoria --- arch/powerpc/kvm/powerpc.c | 3 +++ include/uapi/linux/kvm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index dd7d141e33e8..f38380fd1fe9 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -676,6 +676,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) !kvmppc_hv_ops->enable_svm(NULL); break; #endif + case KVM_CAP_PPC_DAWR1: + r = cpu_has_feature(CPU_FTR_DAWR1); + break; default: r = 0; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4fdf30316582..2c3713d6526a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1031,6 +1031,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 +#define KVM_CAP_PPC_DAWR1 184 #ifdef KVM_CAP_IRQ_ROUTING -- 2.26.2
[PATCH 2/7] powerpc/watchpoint/kvm: Add infrastructure to support 2nd DAWR
kvm code assumes single DAWR everywhere. Add code to support 2nd DAWR. DAWR is a hypervisor resource and thus H_SET_MODE hcall is used to set/ unset it. Introduce new case H_SET_MODE_RESOURCE_SET_DAWR1 for 2nd DAWR. Also, kvm will support 2nd DAWR only if CPU_FTR_DAWR1 is set. Signed-off-by: Ravi Bangoria --- Documentation/virt/kvm/api.rst| 2 ++ arch/powerpc/include/asm/hvcall.h | 2 ++ arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/include/uapi/asm/kvm.h | 4 +++ arch/powerpc/kernel/asm-offsets.c | 2 ++ arch/powerpc/kvm/book3s_hv.c | 41 +++ arch/powerpc/kvm/book3s_hv_nested.c | 7 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 23 + tools/arch/powerpc/include/uapi/asm/kvm.h | 4 +++ 9 files changed, 87 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 4dc18fe6a2bf..7b1d16c2ad24 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2242,6 +2242,8 @@ registers, find a list below: PPC KVM_REG_PPC_PSSCR 64 PPC KVM_REG_PPC_DEC_EXPIRY 64 PPC KVM_REG_PPC_PTCR64 + PPC KVM_REG_PPC_DAWR1 64 + PPC KVM_REG_PPC_DAWRX1 64 PPC KVM_REG_PPC_TM_GPR0 64 ... PPC KVM_REG_PPC_TM_GPR3164 diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 33793444144c..03f401d7be41 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -538,6 +538,8 @@ struct hv_guest_state { s64 tb_offset; u64 dawr0; u64 dawrx0; + u64 dawr1; + u64 dawrx1; u64 ciabr; u64 hdec_expiry; u64 purr; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9aa3854f0e1e..bda839edd5fe 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -584,6 +584,8 @@ struct kvm_vcpu_arch { ulong dabr; ulong dawr0; ulong dawrx0; + ulong dawr1; + ulong dawrx1; ulong ciabr; ulong cfar; ulong ppr; diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 38d61b73f5ed..c5c0f128b46f 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -640,6 +640,10 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf) #define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0) +/* POWER10 registers. */ +#define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc1) +#define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc2) + /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index e76bffe348e1..ef2c0f3f5a7b 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -549,6 +549,8 @@ int main(void) OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx); OFFSET(VCPU_DAWR0, kvm_vcpu, arch.dawr0); OFFSET(VCPU_DAWRX0, kvm_vcpu, arch.dawrx0); + OFFSET(VCPU_DAWR1, kvm_vcpu, arch.dawr1); + OFFSET(VCPU_DAWRX1, kvm_vcpu, arch.dawrx1); OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr); OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags); OFFSET(VCPU_DEC, kvm_vcpu, arch.dec); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 28200e4f5d27..24575520b2ea 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -781,6 +781,20 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, vcpu->arch.dawr0 = value1; vcpu->arch.dawrx0 = value2; return H_SUCCESS; + case H_SET_MODE_RESOURCE_SET_DAWR1: + if (!kvmppc_power8_compatible(vcpu)) + return H_P2; + if (!ppc_breakpoint_available()) + return H_P2; + if (!cpu_has_feature(CPU_FTR_DAWR1)) + return H_P2; + if (mflags) + return H_UNSUPPORTED_FLAG_START; + if (value2 & DABRX_HYP) + return H_P4; + vcpu->arch.dawr1 = value1; + vcpu->arch.dawrx1 = value2; + return H_SUCCESS; case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE: /* KVM does not support mflags=2 (AIL=2) */ if (mflags != 0 && mflags != 3) @@ -1730,6 +1744,12 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_DAWRX0: *val = get_reg_val(id, vcpu->arch.dawrx0); break; + case KVM_REG_PPC_DAWR1: + *val = get_reg_val(id,
[PATCH 0/7] powerpc/watchpoint: 2nd DAWR kvm enablement + selftests
Patch #1, #2 and #3 enables p10 2nd DAWR feature for Book3S kvm guest. DAWR is a hypervisor resource and thus H_SET_MODE hcall is used to set/unset it. A new case H_SET_MODE_RESOURCE_SET_DAWR1 is introduced in H_SET_MODE hcall for setting/unsetting 2nd DAWR. Also, new capability KVM_CAP_PPC_DAWR1 has been added to query 2nd DAWR support via kvm ioctl. This feature also needs to be enabled in Qemu to really use it. I'll reply link to qemu patches once I post them in qemu-devel mailing list. Patch #4, #5, #6 and #7 adds selftests to test 2nd DAWR. Dependency: 1: p10 kvm base enablement https://lore.kernel.org/linuxppc-dev/20200602055325.6102-1-alist...@popple.id.au 2: 2nd DAWR powervm/baremetal enablement https://lore.kernel.org/linuxppc-dev/20200723090813.303838-1-ravi.bango...@linux.ibm.com 3: ptrace PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31 flag https://lore.kernel.org/linuxppc-dev/20200723093330.306341-1-ravi.bango...@linux.ibm.com Patches in this series applies fine on top of powerpc/next (9a77c4a0a125) plus above dependency patches. Ravi Bangoria (7): powerpc/watchpoint/kvm: Rename current DAWR macros and variables powerpc/watchpoint/kvm: Add infrastructure to support 2nd DAWR powerpc/watchpoint/kvm: Introduce new capability for 2nd DAWR powerpc/selftests/ptrace-hwbreak: Add testcases for 2nd DAWR powerpc/selftests/perf-hwbreak: Coalesce event creation code powerpc/selftests/perf-hwbreak: Add testcases for 2nd DAWR powerpc/selftests: Add selftest to test concurrent perf/ptrace events Documentation/virt/kvm/api.rst| 6 +- arch/powerpc/include/asm/hvcall.h | 2 + arch/powerpc/include/asm/kvm_host.h | 6 +- arch/powerpc/include/uapi/asm/kvm.h | 8 +- arch/powerpc/kernel/asm-offsets.c | 6 +- arch/powerpc/kvm/book3s_hv.c | 73 +- arch/powerpc/kvm/book3s_hv_nested.c | 15 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 43 +- arch/powerpc/kvm/powerpc.c| 3 + include/uapi/linux/kvm.h | 1 + tools/arch/powerpc/include/uapi/asm/kvm.h | 8 +- .../selftests/powerpc/ptrace/.gitignore | 1 + .../testing/selftests/powerpc/ptrace/Makefile | 2 +- .../selftests/powerpc/ptrace/perf-hwbreak.c | 646 +++-- .../selftests/powerpc/ptrace/ptrace-hwbreak.c | 79 +++ .../powerpc/ptrace/ptrace-perf-hwbreak.c | 659 ++ 16 files changed, 1476 insertions(+), 82 deletions(-) create mode 100644 tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c -- 2.26.2
[PATCH v2] powerpc/watchpoint/ptrace: Introduce PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31
PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31 can be used to determine whether we are running on an ISA 3.1 compliant machine. Which is needed to determine DAR behaviour, 512 byte boundary limit etc. This was requested by Pedro Miraglia Franco de Carvalho for extending watchpoint features in gdb. Note that availability of 2nd DAWR is independent of this flag and should be checked using ppc_debug_info->num_data_bps. Signed-off-by: Ravi Bangoria --- v1->v2: - Mention new flag in Documentaion/ as well. Documentation/powerpc/ptrace.rst | 1 + arch/powerpc/include/uapi/asm/ptrace.h| 1 + arch/powerpc/kernel/ptrace/ptrace-noadv.c | 5 - 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/powerpc/ptrace.rst b/Documentation/powerpc/ptrace.rst index 864d4b61..4d42290248cb 100644 --- a/Documentation/powerpc/ptrace.rst +++ b/Documentation/powerpc/ptrace.rst @@ -46,6 +46,7 @@ features will have bits indicating whether there is support for:: #define PPC_DEBUG_FEATURE_DATA_BP_RANGE 0x4 #define PPC_DEBUG_FEATURE_DATA_BP_MASK 0x8 #define PPC_DEBUG_FEATURE_DATA_BP_DAWR 0x10 + #define PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31 0x20 2. PTRACE_SETHWDEBUG diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h index f5f1ccc740fc..0a87bcd4300a 100644 --- a/arch/powerpc/include/uapi/asm/ptrace.h +++ b/arch/powerpc/include/uapi/asm/ptrace.h @@ -222,6 +222,7 @@ struct ppc_debug_info { #define PPC_DEBUG_FEATURE_DATA_BP_RANGE0x0004 #define PPC_DEBUG_FEATURE_DATA_BP_MASK 0x0008 #define PPC_DEBUG_FEATURE_DATA_BP_DAWR 0x0010 +#define PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31 0x0020 #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/kernel/ptrace/ptrace-noadv.c b/arch/powerpc/kernel/ptrace/ptrace-noadv.c index 697c7e4b5877..b2de874d650b 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-noadv.c +++ b/arch/powerpc/kernel/ptrace/ptrace-noadv.c @@ -52,8 +52,11 @@ void ppc_gethwdinfo(struct ppc_debug_info *dbginfo) dbginfo->sizeof_condition = 0; if (IS_ENABLED(CONFIG_HAVE_HW_BREAKPOINT)) { dbginfo->features = PPC_DEBUG_FEATURE_DATA_BP_RANGE; - if (dawr_enabled()) + if (dawr_enabled()) { dbginfo->features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR; + if (cpu_has_feature(CPU_FTR_ARCH_31)) + dbginfo->features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31; + } } else { dbginfo->features = 0; } -- 2.26.2
[PATCH v5 10/10] powerpc/watchpoint: Remove 512 byte boundary
Power10 has removed 512 bytes boundary from match criteria i.e. the watch range can cross 512 bytes boundary. Note: ISA 3.1 Book III 9.4 match criteria includes 512 byte limit but that is a documentation mistake and hopefully will be fixed in the next version of ISA. Though, ISA 3.1 change log mentions about removal of 512B boundary: Multiple DEAW: Added a second Data Address Watchpoint. [H]DAR is set to the first byte of overlap. 512B boundary is removed. Signed-off-by: Ravi Bangoria --- arch/powerpc/kernel/hw_breakpoint.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index c55e67bab271..1f4a1efa0074 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -418,8 +418,9 @@ static int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw) if (dawr_enabled()) { max_len = DAWR_MAX_LEN; - /* DAWR region can't cross 512 bytes boundary */ - if (ALIGN_DOWN(start_addr, SZ_512) != ALIGN_DOWN(end_addr - 1, SZ_512)) + /* DAWR region can't cross 512 bytes boundary on p10 predecessors */ + if (!cpu_has_feature(CPU_FTR_ARCH_31) && + (ALIGN_DOWN(start_addr, SZ_512) != ALIGN_DOWN(end_addr - 1, SZ_512))) return -EINVAL; } else if (IS_ENABLED(CONFIG_PPC_8xx)) { /* 8xx can setup a range without limitation */ -- 2.26.2
[PATCH v5 09/10] powerpc/watchpoint: Return available watchpoints dynamically
So far Book3S Powerpc supported only one watchpoint. Power10 is introducing 2nd DAWR. Enable 2nd DAWR support for Power10. Availability of 2nd DAWR will depend on CPU_FTR_DAWR1. Signed-off-by: Ravi Bangoria --- arch/powerpc/include/asm/cputable.h | 5 +++-- arch/powerpc/include/asm/hw_breakpoint.h | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 5583f2d08df7..fa1232c33ab9 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -629,9 +629,10 @@ enum { /* * Maximum number of hw breakpoint supported on powerpc. Number of - * breakpoints supported by actual hw might be less than this. + * breakpoints supported by actual hw might be less than this, which + * is decided at run time in nr_wp_slots(). */ -#define HBP_NUM_MAX1 +#define HBP_NUM_MAX2 #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index cb424799da0d..c89250b6ac34 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -9,6 +9,8 @@ #ifndef _PPC_BOOK3S_64_HW_BREAKPOINT_H #define _PPC_BOOK3S_64_HW_BREAKPOINT_H +#include + #ifdef __KERNEL__ struct arch_hw_breakpoint { unsigned long address; @@ -46,7 +48,7 @@ struct arch_hw_breakpoint { static inline int nr_wp_slots(void) { - return HBP_NUM_MAX; + return cpu_has_feature(CPU_FTR_DAWR1) ? 2 : 1; } #ifdef CONFIG_HAVE_HW_BREAKPOINT -- 2.26.2
[PATCH v5 08/10] powerpc/watchpoint: Guest support for 2nd DAWR hcall
2nd DAWR can be set/unset using H_SET_MODE hcall with resource value 5. Enable powervm guest support with that. This has no effect on kvm guest because kvm will return error if guest does hcall with resource value 5. Signed-off-by: Ravi Bangoria --- arch/powerpc/include/asm/hvcall.h | 1 + arch/powerpc/include/asm/machdep.h| 2 +- arch/powerpc/include/asm/plpar_wrappers.h | 5 + arch/powerpc/kernel/dawr.c| 2 +- arch/powerpc/platforms/pseries/setup.c| 7 +-- 5 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index b785e9f0071c..33793444144c 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -358,6 +358,7 @@ #define H_SET_MODE_RESOURCE_SET_DAWR0 2 #define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE3 #define H_SET_MODE_RESOURCE_LE 4 +#define H_SET_MODE_RESOURCE_SET_DAWR1 5 /* Values for argument to H_SIGNAL_SYS_RESET */ #define H_SIGNAL_SYS_RESET_ALL -1 diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 7bcb6a39..a90b892f0bfe 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -131,7 +131,7 @@ struct machdep_calls { unsigned long dabrx); /* Set DAWR for this platform, leave empty for default implementation */ - int (*set_dawr)(unsigned long dawr, + int (*set_dawr)(int nr, unsigned long dawr, unsigned long dawrx); #ifdef CONFIG_PPC32/* XXX for now */ diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index d12c3680d946..ece84a430701 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -315,6 +315,11 @@ static inline long plpar_set_watchpoint0(unsigned long dawr0, unsigned long dawr return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR0, dawr0, dawrx0); } +static inline long plpar_set_watchpoint1(unsigned long dawr1, unsigned long dawrx1) +{ + return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR1, dawr1, dawrx1); +} + static inline long plpar_signal_sys_reset(long cpu) { return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu); diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c index 500f52fa4711..cdc2dccb987d 100644 --- a/arch/powerpc/kernel/dawr.c +++ b/arch/powerpc/kernel/dawr.c @@ -37,7 +37,7 @@ int set_dawr(int nr, struct arch_hw_breakpoint *brk) dawrx |= (mrd & 0x3f) << (63 - 53); if (ppc_md.set_dawr) - return ppc_md.set_dawr(dawr, dawrx); + return ppc_md.set_dawr(nr, dawr, dawrx); if (nr == 0) { mtspr(SPRN_DAWR0, dawr); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 2db8469e475f..d516ee8eb7fc 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -831,12 +831,15 @@ static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx) return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx); } -static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx) +static int pseries_set_dawr(int nr, unsigned long dawr, unsigned long dawrx) { /* PAPR says we can't set HYP */ dawrx &= ~DAWRX_HYP; - return plpar_set_watchpoint0(dawr, dawrx); + if (nr == 0) + return plpar_set_watchpoint0(dawr, dawrx); + else + return plpar_set_watchpoint1(dawr, dawrx); } #define CMO_CHARACTERISTICS_TOKEN 44 -- 2.26.2
[PATCH v5 07/10] powerpc/watchpoint: Rename current H_SET_MODE DAWR macro
Current H_SET_MODE hcall macro name for setting/resetting DAWR0 is H_SET_MODE_RESOURCE_SET_DAWR. Add suffix 0 to macro name as well. Signed-off-by: Ravi Bangoria Reviewed-by: Jordan Niethe --- arch/powerpc/include/asm/hvcall.h | 2 +- arch/powerpc/include/asm/plpar_wrappers.h | 2 +- arch/powerpc/kvm/book3s_hv.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 43486e773bd6..b785e9f0071c 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -355,7 +355,7 @@ /* Values for 2nd argument to H_SET_MODE */ #define H_SET_MODE_RESOURCE_SET_CIABR 1 -#define H_SET_MODE_RESOURCE_SET_DAWR 2 +#define H_SET_MODE_RESOURCE_SET_DAWR0 2 #define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE3 #define H_SET_MODE_RESOURCE_LE 4 diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index 4293c5d2ddf4..d12c3680d946 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -312,7 +312,7 @@ static inline long plpar_set_ciabr(unsigned long ciabr) static inline long plpar_set_watchpoint0(unsigned long dawr0, unsigned long dawrx0) { - return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR, dawr0, dawrx0); + return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR0, dawr0, dawrx0); } static inline long plpar_signal_sys_reset(long cpu) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6bf66649ab92..7ad692c2d7c7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -764,7 +764,7 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, return H_P3; vcpu->arch.ciabr = value1; return H_SUCCESS; - case H_SET_MODE_RESOURCE_SET_DAWR: + case H_SET_MODE_RESOURCE_SET_DAWR0: if (!kvmppc_power8_compatible(vcpu)) return H_P2; if (!ppc_breakpoint_available()) -- 2.26.2
[PATCH v5 06/10] powerpc/watchpoint: Set CPU_FTR_DAWR1 based on pa-features bit
As per the PAPR, bit 0 of byte 64 in pa-features property indicates availability of 2nd DAWR registers. i.e. If this bit is set, 2nd DAWR is present, otherwise not. Host generally uses "cpu-features", which masks "pa-features". But "cpu-features" are still not used for guests and thus this change is mostly applicable for guests only. Signed-off-by: Ravi Bangoria Tested-by: Jordan Niethe --- arch/powerpc/kernel/prom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 033d43819ed8..01dda206d68e 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -175,6 +175,8 @@ static struct ibm_pa_feature { */ { .pabyte = 22, .pabit = 0, .cpu_features = CPU_FTR_TM_COMP, .cpu_user_ftrs2 = PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_HTM_NOSC_COMP }, + + { .pabyte = 64, .pabit = 0, .cpu_features = CPU_FTR_DAWR1 }, }; static void __init scan_features(unsigned long node, const unsigned char *ftrs, -- 2.26.2
[PATCH v5 05/10] powerpc/dt_cpu_ftrs: Add feature for 2nd DAWR
Add new device-tree feature for 2nd DAWR. If this feature is present, 2nd DAWR is supported, otherwise not. Signed-off-by: Ravi Bangoria --- arch/powerpc/include/asm/cputable.h | 3 ++- arch/powerpc/kernel/dt_cpu_ftrs.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index e506d429b1af..5583f2d08df7 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -214,6 +214,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_P9_TLBIE_ERAT_BUG LONG_ASM_CONST(0x0001) #define CPU_FTR_P9_RADIX_PREFETCH_BUG LONG_ASM_CONST(0x0002) #define CPU_FTR_ARCH_31 LONG_ASM_CONST(0x0004) +#define CPU_FTR_DAWR1 LONG_ASM_CONST(0x0008) #ifndef __ASSEMBLY__ @@ -478,7 +479,7 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \ - CPU_FTR_ARCH_31 | CPU_FTR_DAWR) + CPU_FTR_ARCH_31 | CPU_FTR_DAWR | CPU_FTR_DAWR1) #define CPU_FTRS_CELL (CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index ac650c233cd9..675b824038f9 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -649,6 +649,7 @@ static struct dt_cpu_feature_match __initdata {"wait-v3", feat_enable, 0}, {"prefix-instructions", feat_enable, 0}, {"matrix-multiply-assist", feat_enable_mma, 0}, + {"debug-facilities-v31", feat_enable, CPU_FTR_DAWR1}, }; static bool __initdata using_dt_cpu_ftrs; -- 2.26.2
[PATCH v5 04/10] powerpc/watchpoint: Enable watchpoint functionality on power10 guest
CPU_FTR_DAWR is by default enabled for host via CPU_FTRS_DT_CPU_BASE (controlled by CONFIG_PPC_DT_CPU_FTRS). But cpu-features device-tree node is not PAPR compatible and thus not yet used by kvm or pHyp guests. Enable watchpoint functionality on power10 guest (both kvm and powervm) by adding CPU_FTR_DAWR to CPU_FTRS_POWER10. Note that this change does not enable 2nd DAWR support. Signed-off-by: Ravi Bangoria Tested-by: Jordan Niethe --- arch/powerpc/include/asm/cputable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index bac2252c839e..e506d429b1af 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -478,7 +478,7 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \ - CPU_FTR_ARCH_31) + CPU_FTR_ARCH_31 | CPU_FTR_DAWR) #define CPU_FTRS_CELL (CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ -- 2.26.2
[PATCH v5 03/10] powerpc/watchpoint: Fix DAWR exception for CACHEOP
'ea' returned by analyse_instr() needs to be aligned down to cache block size for CACHEOP instructions. analyse_instr() does not set size for CACHEOP, thus size also needs to be calculated manually. Fixes: 27985b2a640e ("powerpc/watchpoint: Don't ignore extraneous exceptions blindly") Fixes: 74c6881019b7 ("powerpc/watchpoint: Prepare handler to handle more than one watchpoint") Signed-off-by: Ravi Bangoria --- arch/powerpc/kernel/hw_breakpoint.c | 21 - 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index a971e22aea81..c55e67bab271 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -538,7 +538,12 @@ static bool check_dawrx_constraints(struct pt_regs *regs, int type, if (OP_IS_LOAD(type) && !(info->type & HW_BRK_TYPE_READ)) return false; - if (OP_IS_STORE(type) && !(info->type & HW_BRK_TYPE_WRITE)) + /* +* The Cache Management instructions other than dcbz never +* cause a match. i.e. if type is CACHEOP, the instruction +* is dcbz, and dcbz is treated as Store. +*/ + if ((OP_IS_STORE(type) || type == CACHEOP) && !(info->type & HW_BRK_TYPE_WRITE)) return false; if (is_kernel_addr(regs->nip) && !(info->type & HW_BRK_TYPE_KERNEL)) @@ -601,6 +606,15 @@ static bool check_constraints(struct pt_regs *regs, struct ppc_inst instr, return false; } +static int cache_op_size(void) +{ +#ifdef __powerpc64__ + return ppc64_caches.l1d.block_size; +#else + return L1_CACHE_BYTES; +#endif +} + static void get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr, int *type, int *size, unsigned long *ea) { @@ -616,7 +630,12 @@ static void get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr, if (!(regs->msr & MSR_64BIT)) *ea &= 0xUL; #endif + *size = GETSIZE(op.type); + if (*type == CACHEOP) { + *size = cache_op_size(); + *ea &= ~(*size - 1); + } } static bool is_larx_stcx_instr(int type) -- 2.26.2
[PATCH v5 02/10] powerpc/watchpoint: Fix DAWR exception constraint
Pedro Miraglia Franco de Carvalho noticed that on p8/p9, DAR value is inconsistent with different type of load/store. Like for byte,word etc. load/stores, DAR is set to the address of the first byte of overlap between watch range and real access. But for quadword load/ store it's sometime set to the address of the first byte of real access whereas sometime set to the address of the first byte of overlap. This issue has been fixed in p10. In p10(ISA 3.1), DAR is always set to the address of the first byte of overlap. Commit 27985b2a640e ("powerpc/watchpoint: Don't ignore extraneous exceptions blindly") wrongly assumes that DAR is set to the address of the first byte of overlap for all load/stores on p8/p9 as well. Fix that. With the fix, we now rely on 'ea' provided by analyse_instr(). If analyse_instr() fails, generate event unconditionally on p8/p9, and on p10 generate event only if DAR is within a DAWR range. Note: 8xx is not affected. Fixes: 27985b2a640e ("powerpc/watchpoint: Don't ignore extraneous exceptions blindly") Fixes: 74c6881019b7 ("powerpc/watchpoint: Prepare handler to handle more than one watchpoint") Reported-by: Pedro Miraglia Franco de Carvalho Signed-off-by: Ravi Bangoria --- arch/powerpc/kernel/hw_breakpoint.c | 72 - 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 031e6defc08e..a971e22aea81 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -498,11 +498,11 @@ static bool dar_in_user_range(unsigned long dar, struct arch_hw_breakpoint *info return ((info->address <= dar) && (dar - info->address < info->len)); } -static bool dar_user_range_overlaps(unsigned long dar, int size, - struct arch_hw_breakpoint *info) +static bool ea_user_range_overlaps(unsigned long ea, int size, + struct arch_hw_breakpoint *info) { - return ((dar < info->address + info->len) && - (dar + size > info->address)); + return ((ea < info->address + info->len) && + (ea + size > info->address)); } static bool dar_in_hw_range(unsigned long dar, struct arch_hw_breakpoint *info) @@ -515,20 +515,22 @@ static bool dar_in_hw_range(unsigned long dar, struct arch_hw_breakpoint *info) return ((hw_start_addr <= dar) && (hw_end_addr > dar)); } -static bool dar_hw_range_overlaps(unsigned long dar, int size, - struct arch_hw_breakpoint *info) +static bool ea_hw_range_overlaps(unsigned long ea, int size, +struct arch_hw_breakpoint *info) { unsigned long hw_start_addr, hw_end_addr; hw_start_addr = ALIGN_DOWN(info->address, HW_BREAKPOINT_SIZE); hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE); - return ((dar < hw_end_addr) && (dar + size > hw_start_addr)); + return ((ea < hw_end_addr) && (ea + size > hw_start_addr)); } /* * If hw has multiple DAWR registers, we also need to check all * dawrx constraint bits to confirm this is _really_ a valid event. + * If type is UNKNOWN, but privilege level matches, consider it as + * a positive match. */ static bool check_dawrx_constraints(struct pt_regs *regs, int type, struct arch_hw_breakpoint *info) @@ -553,7 +555,8 @@ static bool check_dawrx_constraints(struct pt_regs *regs, int type, * including extraneous exception. Otherwise return false. */ static bool check_constraints(struct pt_regs *regs, struct ppc_inst instr, - int type, int size, struct arch_hw_breakpoint *info) + unsigned long ea, int type, int size, + struct arch_hw_breakpoint *info) { bool in_user_range = dar_in_user_range(regs->dar, info); bool dawrx_constraints; @@ -569,22 +572,27 @@ static bool check_constraints(struct pt_regs *regs, struct ppc_inst instr, } if (unlikely(ppc_inst_equal(instr, ppc_inst(0 { - if (in_user_range) - return true; + if (cpu_has_feature(CPU_FTR_ARCH_31) && + !dar_in_hw_range(regs->dar, info)) + return false; - if (dar_in_hw_range(regs->dar, info)) { - info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; - return true; - } - return false; + return true; } dawrx_constraints = check_dawrx_constraints(regs, type, info); - if (dar_user_range_overlaps(regs->dar, size, info)) + if (type == UNKNOWN) { + if (cpu_has_feature(CPU_FTR_ARCH_31) && + !dar_in_hw_range(regs->dar, info)) + return false; + return dawrx_constraints; + }
[PATCH v5 01/10] powerpc/watchpoint: Fix 512 byte boundary limit
Milton Miller reported that we are aligning start and end address to wrong size SZ_512M. It should be SZ_512. Fix that. While doing this change I also found a case where ALIGN() comparison fails. Within a given aligned range, ALIGN() of two addresses does not match when start address is pointing to the first byte and end address is pointing to any other byte except the first one. But that's not true for ALIGN_DOWN(). ALIGN_DOWN() of any two addresses within that range will always point to the first byte. So use ALIGN_DOWN() instead of ALIGN(). Fixes: e68ef121c1f4 ("powerpc/watchpoint: Use builtin ALIGN*() macros") Reported-by: Milton Miller Signed-off-by: Ravi Bangoria Tested-by: Jordan Niethe --- arch/powerpc/kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index daf0e1da..031e6defc08e 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -419,7 +419,7 @@ static int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw) if (dawr_enabled()) { max_len = DAWR_MAX_LEN; /* DAWR region can't cross 512 bytes boundary */ - if (ALIGN(start_addr, SZ_512M) != ALIGN(end_addr - 1, SZ_512M)) + if (ALIGN_DOWN(start_addr, SZ_512) != ALIGN_DOWN(end_addr - 1, SZ_512)) return -EINVAL; } else if (IS_ENABLED(CONFIG_PPC_8xx)) { /* 8xx can setup a range without limitation */ -- 2.26.2
[PATCH v5 00/10] powerpc/watchpoint: Enable 2nd DAWR on baremetal and powervm
Last series[1] was to add basic infrastructure support for more than one watchpoint on Book3S powerpc. This series actually enables the 2nd DAWR for baremetal and powervm. Kvm guest is still not supported. v4: https://lore.kernel.org/r/20200717040958.70561-1-ravi.bango...@linux.ibm.com v4->v5: - Using hardcoded values instead of macros HBP_NUM_ONE and HBP_NUM_TWO. Comment above HBP_NUM_MAX changed to explain it's value. - Included CPU_FTR_DAWR1 into CPU_FTRS_POWER10 - Using generic function feat_enable() instead of feat_enable_debug_facilities_v31() to enable CPU_FTR_DAWR1. - ISA still includes 512B boundary in match criteria. But that's a documentation mistake. Mentioned about this in the last patch. - Rebased to powerpc/next - Added Jordan's Reviewed-by/Tested-by tags [1]: https://lore.kernel.org/linuxppc-dev/20200514111741.97993-1-ravi.bango...@linux.ibm.com/ Ravi Bangoria (10): powerpc/watchpoint: Fix 512 byte boundary limit powerpc/watchpoint: Fix DAWR exception constraint powerpc/watchpoint: Fix DAWR exception for CACHEOP powerpc/watchpoint: Enable watchpoint functionality on power10 guest powerpc/dt_cpu_ftrs: Add feature for 2nd DAWR powerpc/watchpoint: Set CPU_FTR_DAWR1 based on pa-features bit powerpc/watchpoint: Rename current H_SET_MODE DAWR macro powerpc/watchpoint: Guest support for 2nd DAWR hcall powerpc/watchpoint: Return available watchpoints dynamically powerpc/watchpoint: Remove 512 byte boundary arch/powerpc/include/asm/cputable.h | 8 +- arch/powerpc/include/asm/hvcall.h | 3 +- arch/powerpc/include/asm/hw_breakpoint.h | 4 +- arch/powerpc/include/asm/machdep.h| 2 +- arch/powerpc/include/asm/plpar_wrappers.h | 7 +- arch/powerpc/kernel/dawr.c| 2 +- arch/powerpc/kernel/dt_cpu_ftrs.c | 1 + arch/powerpc/kernel/hw_breakpoint.c | 98 +++ arch/powerpc/kernel/prom.c| 2 + arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/platforms/pseries/setup.c| 7 +- 11 files changed, 91 insertions(+), 45 deletions(-) -- 2.26.2
[PATCH v3 00/10] Coregroup support on Powerpc
Changelog v2 -> v3: v2: https://lore.kernel.org/linuxppc-dev/20200721113814.32284-1-sri...@linux.vnet.ibm.com/t/#u powerpc/smp: Cache node for reuse Removed node caching part. Rewrote the Commit msg (Michael Ellerman) Renamed to powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES powerpc/smp: Enable small core scheduling sooner Rewrote changelog (Gautham) Renamed to powerpc/smp: Move topology fixups into a new function powerpc/smp: Create coregroup domain Add optimization for mask updation under coregroup_support Changelog v1 -> v2: v1: https://lore.kernel.org/linuxppc-dev/20200714043624.5648-1-sri...@linux.vnet.ibm.com/t/#u powerpc/smp: Merge Power9 topology with Power topology Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu) since cpu_smt_mask is only defined under CONFIG_SCHED_SMT powerpc/smp: Enable small core scheduling sooner Restored the previous info msg (Jordan) Moved big core topology fixup to fixup_topology (Gautham) powerpc/smp: Dont assume l2-cache to be superset of sibling Set cpumask after verifying l2-cache. (Gautham) powerpc/smp: Generalize 2nd sched domain Moved shared_cache topology fixup to fixup_topology (Gautham) Powerpc/numa: Detect support for coregroup Explained Coregroup in commit msg (Michael Ellerman) Powerpc/smp: Create coregroup domain Moved coregroup topology fixup to fixup_topology (Gautham) powerpc/smp: Implement cpu_to_coregroup_id Move coregroup_enabled before getting associativity (Gautham) powerpc/smp: Provide an ability to disable coregroup Patch dropped (Michael Ellerman) Cleanup of existing powerpc topologies and add coregroup support on Powerpc. Coregroup is a group of (subset of) cores of a DIE that share a resource. Patch 7 of this patch series: "Powerpc/numa: Detect support for coregroup" depends on https://lore.kernel.org/linuxppc-dev/20200707140644.7241-1-sri...@linux.vnet.ibm.com/t/#u However it should be easy to rebase the patch without the above patch. This patch series is based on top of current powerpc/next tree + the above patch. On Power 8 Systems -- $ tail /proc/cpuinfo processor : 255 cpu : POWER8 (architected), altivec supported clock : 3724.00MHz revision: 2.1 (pvr 004b 0201) timebase: 51200 platform: pSeries model : IBM,8408-E8E machine : CHRP IBM,8408-E8E MMU : Hash Before the patchset --- $ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name SMT DIE NUMA NUMA $ head /proc/schedstat version 15 timestamp 4295534931 cpu0 0 0 0 0 0 0 41389823338 17682779896 14117 domain0 ,,,,,,,00ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 domain1 ,,,,,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 domain2 ,,,,,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 domain3 ,,,,,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 cpu1 0 0 0 0 0 0 27087859050 152273672 10396 domain0 ,,,,,,,00ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 domain1 ,,,,,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 After the patchset -- $ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name SMT DIE NUMA NUMA $ head /proc/schedstat version 15 timestamp 4295534931 cpu0 0 0 0 0 0 0 41389823338 17682779896 14117 domain0 ,,,,,,,00ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 domain1 ,,,,,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 domain2 ,,,,,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 domain3 ,,,,,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 cpu1 0 0 0 0 0 0 27087859050 152273672 10396 domain0 ,,,,,,,00ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 domain1 ,,,,,,, 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 On Power 9 (with device-tree enablement to show coregroups). (hunks for mimicing a coregroup was posted at
[PATCH v3 06/10] powerpc/smp: Generalize 2nd sched domain
Currently "CACHE" domain happens to be the 2nd sched domain as per powerpc_topology. This domain will collapse if cpumask of l2-cache is same as SMT domain. However we could generalize this domain such that it could mean either be a "CACHE" domain or a "BIGCORE" domain. While setting up the "CACHE" domain, check if shared_cache is already set. Cc: linuxppc-dev Cc: LKML Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Anton Blanchard Cc: Oliver O'Halloran Cc: Nathan Lynch Cc: Michael Neuling Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valentin Schneider Cc: Jordan Niethe Signed-off-by: Srikar Dronamraju --- Changelog v1 -> v2: Moved shared_cache topology fixup to fixup_topology (Gautham) arch/powerpc/kernel/smp.c | 49 --- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index d997c7411664..6f5877f6496e 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); EXPORT_SYMBOL_GPL(has_big_cores); +enum { +#ifdef CONFIG_SCHED_SMT + smt_idx, +#endif + bigcore_idx, + die_idx, +}; + #define MAX_THREAD_LIST_SIZE 8 #define THREAD_GROUP_SHARE_L1 1 struct thread_groups { @@ -851,13 +859,7 @@ static int powerpc_shared_cache_flags(void) */ static const struct cpumask *shared_cache_mask(int cpu) { - if (shared_caches) - return cpu_l2_cache_mask(cpu); - - if (has_big_cores) - return cpu_smallcore_mask(cpu); - - return per_cpu(cpu_sibling_map, cpu); + return per_cpu(cpu_l2_cache_map, cpu); } #ifdef CONFIG_SCHED_SMT @@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int cpu) } #endif +static const struct cpumask *cpu_bigcore_mask(int cpu) +{ + return per_cpu(cpu_sibling_map, cpu); +} + static struct sched_domain_topology_level powerpc_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, #endif - { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, + { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) }, { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, }; @@ -1311,7 +1318,6 @@ static void add_cpu_to_masks(int cpu) void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); - struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask; mmgrab(_mm); current->active_mm = _mm; @@ -1337,14 +1343,20 @@ void start_secondary(void *unused) /* Update topology CPU masks */ add_cpu_to_masks(cpu); - if (has_big_cores) - sibling_mask = cpu_smallcore_mask; /* * Check for any shared caches. Note that this must be done on a * per-core basis because one core in the pair might be disabled. */ - if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu))) - shared_caches = true; + if (!shared_caches) { + struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask; + struct cpumask *mask = cpu_l2_cache_mask(cpu); + + if (has_big_cores) + sibling_mask = cpu_smallcore_mask; + + if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu))) + shared_caches = true; + } set_numa_node(numa_cpu_lookup_table[cpu]); set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); @@ -1372,10 +1384,19 @@ int setup_profiling_timer(unsigned int multiplier) static void fixup_topology(void) { + if (shared_caches) { + pr_info("Using shared cache scheduler topology\n"); + powerpc_topology[bigcore_idx].mask = shared_cache_mask; +#ifdef CONFIG_SCHED_DEBUG + powerpc_topology[bigcore_idx].name = "CACHE"; +#endif + powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags; + } + #ifdef CONFIG_SCHED_SMT if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); - powerpc_topology[0].mask = smallcore_smt_mask; + powerpc_topology[smt_idx].mask = smallcore_smt_mask; } #endif } -- 2.18.2
[PATCH v3 10/10] powerpc/smp: Implement cpu_to_coregroup_id
Lookup the coregroup id from the associativity array. If unable to detect the coregroup id, fallback on the core id. This way, ensure sched_domain degenerates and an extra sched domain is not created. Ideally this function should have been implemented in arch/powerpc/kernel/smp.c. However if its implemented in mm/numa.c, we don't need to find the primary domain again. If the device-tree mentions more than one coregroup, then kernel implements only the last or the smallest coregroup, which currently corresponds to the penultimate domain in the device-tree. Cc: linuxppc-dev Cc: LKML Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Anton Blanchard Cc: Oliver O'Halloran Cc: Nathan Lynch Cc: Michael Neuling Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valentin Schneider Cc: Jordan Niethe Reviewed-by : Gautham R. Shenoy Signed-off-by: Srikar Dronamraju --- Changelog v1 -> v2: Move coregroup_enabled before getting associativity (Gautham) arch/powerpc/mm/numa.c | 20 1 file changed, 20 insertions(+) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index ef8aa580da21..608e0fa61019 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1218,6 +1218,26 @@ int find_and_online_cpu_nid(int cpu) int cpu_to_coregroup_id(int cpu) { + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + int index; + + if (cpu < 0 || cpu > nr_cpu_ids) + return -1; + + if (!coregroup_enabled) + goto out; + + if (!firmware_has_feature(FW_FEATURE_VPHN)) + goto out; + + if (vphn_get_associativity(cpu, associativity)) + goto out; + + index = of_read_number(associativity, 1); + if (index > min_common_depth + 1) + return of_read_number([index - 1], 1); + +out: return cpu_to_core_id(cpu); } -- 2.18.2
[PATCH v3 09/10] powerpc/smp: Create coregroup domain
Add percpu coregroup maps and masks to create coregroup domain. If a coregroup doesn't exist, the coregroup domain will be degenerated in favour of SMT/CACHE domain. Cc: linuxppc-dev Cc: LKML Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Anton Blanchard Cc: Oliver O'Halloran Cc: Nathan Lynch Cc: Michael Neuling Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valentin Schneider Cc: Jordan Niethe Signed-off-by: Srikar Dronamraju --- Changelog v2 -> v3: Add optimization for mask updation under coregroup_support Changelog v1 -> v2: Moved coregroup topology fixup to fixup_topology (Gautham) arch/powerpc/include/asm/topology.h | 10 +++ arch/powerpc/kernel/smp.c | 44 + arch/powerpc/mm/numa.c | 5 3 files changed, 59 insertions(+) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index f0b6300e7dd3..6609174918ab 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -88,12 +88,22 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) extern int find_and_online_cpu_nid(int cpu); +extern int cpu_to_coregroup_id(int cpu); #else static inline int find_and_online_cpu_nid(int cpu) { return 0; } +static inline int cpu_to_coregroup_id(int cpu) +{ +#ifdef CONFIG_SMP + return cpu_to_core_id(cpu); +#else + return 0; +#endif +} + #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */ #include diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 7d8d44cbab11..1faedde3e406 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); @@ -91,6 +92,7 @@ enum { smt_idx, #endif bigcore_idx, + mc_idx, die_idx, }; @@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu) } #endif +static struct cpumask *cpu_coregroup_mask(int cpu) +{ + return per_cpu(cpu_coregroup_map, cpu); +} + +static bool has_coregroup_support(void) +{ + return coregroup_enabled; +} + +static const struct cpumask *cpu_mc_mask(int cpu) +{ + return cpu_coregroup_mask(cpu); +} + static const struct cpumask *cpu_bigcore_mask(int cpu) { return per_cpu(cpu_sibling_map, cpu); @@ -879,6 +896,7 @@ static struct sched_domain_topology_level powerpc_topology[] = { { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, #endif { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) }, + { cpu_mc_mask, SD_INIT_NAME(MC) }, { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, }; @@ -925,6 +943,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus) GFP_KERNEL, cpu_to_node(cpu)); zalloc_cpumask_var_node(_cpu(cpu_core_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); + if (has_coregroup_support()) + zalloc_cpumask_var_node(_cpu(cpu_coregroup_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + #ifdef CONFIG_NEED_MULTIPLE_NODES /* * numa_node_id() works after this. @@ -942,6 +964,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus) cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); + if (has_coregroup_support()) + cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid)); + init_big_cores(); if (has_big_cores) { cpumask_set_cpu(boot_cpuid, @@ -1233,6 +1258,8 @@ static void remove_cpu_from_masks(int cpu) set_cpus_unrelated(cpu, i, cpu_sibling_mask); if (has_big_cores) set_cpus_unrelated(cpu, i, cpu_smallcore_mask); + if (has_coregroup_support()) + set_cpus_unrelated(cpu, i, cpu_coregroup_mask); } } #endif @@ -1293,6 +1320,20 @@ static void add_cpu_to_masks(int cpu) add_cpu_to_smallcore_masks(cpu); update_mask_by_l2(cpu, cpu_l2_cache_mask); + if (has_coregroup_support()) { + int coregroup_id = cpu_to_coregroup_id(cpu); + + cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu)); + for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) { + int fcpu = cpu_first_thread_sibling(i); + + if (fcpu == first_thread) + set_cpus_related(cpu, i, cpu_coregroup_mask); +
[PATCH v3 08/10] powerpc/smp: Allocate cpumask only after searching thread group
If allocated earlier and the search fails, then cpumask need to be freed. However cpu_l1_cache_map can be allocated after we search thread group. Cc: linuxppc-dev Cc: LKML Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Anton Blanchard Cc: Oliver O'Halloran Cc: Nathan Lynch Cc: Michael Neuling Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valentin Schneider Cc: Jordan Niethe Reviewed-by: Gautham R. Shenoy Signed-off-by: Srikar Dronamraju --- arch/powerpc/kernel/smp.c | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index cbca4a8c3314..7d8d44cbab11 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -797,10 +797,6 @@ static int init_cpu_l1_cache_map(int cpu) if (err) goto out; - zalloc_cpumask_var_node(_cpu(cpu_l1_cache_map, cpu), - GFP_KERNEL, - cpu_to_node(cpu)); - cpu_group_start = get_cpu_thread_group_start(cpu, ); if (unlikely(cpu_group_start == -1)) { @@ -809,6 +805,9 @@ static int init_cpu_l1_cache_map(int cpu) goto out; } + zalloc_cpumask_var_node(_cpu(cpu_l1_cache_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + for (i = first_thread; i < first_thread + threads_per_core; i++) { int i_group_start = get_cpu_thread_group_start(i, ); -- 2.18.2
[PATCH v3 07/10] powerpc/numa: Detect support for coregroup
Add support for grouping cores based on the device-tree classification. - The last domain in the associativity domains always refers to the core. - If primary reference domain happens to be the penultimate domain in the associativity domains device-tree property, then there are no coregroups. However if its not a penultimate domain, then there are coregroups. There can be more than one coregroup. For now we would be interested in the last or the smallest coregroups. Cc: linuxppc-dev Cc: LKML Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Anton Blanchard Cc: Oliver O'Halloran Cc: Nathan Lynch Cc: Michael Neuling Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valentin Schneider Cc: Jordan Niethe Reviewed-by: Gautham R. Shenoy Signed-off-by: Srikar Dronamraju --- Changelog v1 -> v2: Explained Coregroup in commit msg (Michael Ellerman) arch/powerpc/include/asm/smp.h | 1 + arch/powerpc/kernel/smp.c | 1 + arch/powerpc/mm/numa.c | 34 +- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 49a25e2400f2..5bdc17a7049f 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -28,6 +28,7 @@ extern int boot_cpuid; extern int spinning_secondaries; extern u32 *cpu_to_phys_id; +extern bool coregroup_enabled; extern void cpu_die(void); extern int cpu_to_chip_id(int cpu); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 6f5877f6496e..cbca4a8c3314 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -74,6 +74,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; struct task_struct *secondary_current; bool has_big_cores; +bool coregroup_enabled; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index bc5b2e8112c8..3248160c0327 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -886,7 +886,9 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) static void __init find_possible_nodes(void) { struct device_node *rtas; - u32 numnodes, i; + const __be32 *domains; + int prop_length, max_nodes; + u32 i; if (!numa_enabled) return; @@ -895,25 +897,31 @@ static void __init find_possible_nodes(void) if (!rtas) return; - if (of_property_read_u32_index(rtas, "ibm,current-associativity-domains", - min_common_depth, )) { - /* -* ibm,current-associativity-domains is a fairly recent -* property. If it doesn't exist, then fallback on -* ibm,max-associativity-domains. Current denotes what the -* platform can support compared to max which denotes what the -* Hypervisor can support. -*/ - if (of_property_read_u32_index(rtas, "ibm,max-associativity-domains", - min_common_depth, )) + /* +* ibm,current-associativity-domains is a fairly recent property. If +* it doesn't exist, then fallback on ibm,max-associativity-domains. +* Current denotes what the platform can support compared to max +* which denotes what the Hypervisor can support. +*/ + domains = of_get_property(rtas, "ibm,current-associativity-domains", + _length); + if (!domains) { + domains = of_get_property(rtas, "ibm,max-associativity-domains", + _length); + if (!domains) goto out; } - for (i = 0; i < numnodes; i++) { + max_nodes = of_read_number([min_common_depth], 1); + for (i = 0; i < max_nodes; i++) { if (!node_possible(i)) node_set(i, node_possible_map); } + prop_length /= sizeof(int); + if (prop_length > min_common_depth + 2) + coregroup_enabled = 1; + out: of_node_put(rtas); } -- 2.18.2
[PATCH v3 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling
Current code assumes that cpumask of cpus sharing a l2-cache mask will always be a superset of cpu_sibling_mask. Lets stop that assumption. cpu_l2_cache_mask is a superset of cpu_sibling_mask if and only if shared_caches is set. Cc: linuxppc-dev Cc: LKML Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Anton Blanchard Cc: Oliver O'Halloran Cc: Nathan Lynch Cc: Michael Neuling Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valentin Schneider Cc: Jordan Niethe Signed-off-by: Srikar Dronamraju --- Changelog v1 -> v2: Set cpumask after verifying l2-cache. (Gautham) arch/powerpc/kernel/smp.c | 28 +++- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index da27f6909be1..d997c7411664 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1194,6 +1194,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int)) if (!l2_cache) return false; + cpumask_set_cpu(cpu, mask_fn(cpu)); for_each_cpu(i, cpu_online_mask) { /* * when updating the marks the current CPU has not been marked @@ -1276,29 +1277,30 @@ static void add_cpu_to_masks(int cpu) * add it to it's own thread sibling mask. */ cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(cpu)); for (i = first_thread; i < first_thread + threads_per_core; i++) if (cpu_online(i)) set_cpus_related(i, cpu, cpu_sibling_mask); add_cpu_to_smallcore_masks(cpu); - /* -* Copy the thread sibling mask into the cache sibling mask -* and mark any CPUs that share an L2 with this CPU. -*/ - for_each_cpu(i, cpu_sibling_mask(cpu)) - set_cpus_related(cpu, i, cpu_l2_cache_mask); update_mask_by_l2(cpu, cpu_l2_cache_mask); - /* -* Copy the cache sibling mask into core sibling mask and mark -* any CPUs on the same chip as this CPU. -*/ - for_each_cpu(i, cpu_l2_cache_mask(cpu)) - set_cpus_related(cpu, i, cpu_core_mask); + if (pkg_id == -1) { + struct cpumask *(*mask)(int) = cpu_sibling_mask; + + /* +* Copy the sibling mask into core sibling mask and +* mark any CPUs on the same chip as this CPU. +*/ + if (shared_caches) + mask = cpu_l2_cache_mask; + + for_each_cpu(i, mask(cpu)) + set_cpus_related(cpu, i, cpu_core_mask); - if (pkg_id == -1) return; + } for_each_cpu(i, cpu_online_mask) if (get_physical_package_id(i) == pkg_id) -- 2.18.2
[PATCH v3 04/10] powerpc/smp: Move topology fixups into a new function
Move topology fixup based on the platform attributes into its own function which is called just before set_sched_topology. Cc: linuxppc-dev Cc: LKML Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Anton Blanchard Cc: Oliver O'Halloran Cc: Nathan Lynch Cc: Michael Neuling Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valentin Schneider Cc: Jordan Niethe Signed-off-by: Srikar Dronamraju --- Changelog v2 -> v3: Rewrote changelog (Gautham) Renamed to powerpc/smp: Move topology fixups into a new function arch/powerpc/kernel/smp.c | 17 +++-- 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index a685915e5941..da27f6909be1 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1368,6 +1368,16 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } +static void fixup_topology(void) +{ +#ifdef CONFIG_SCHED_SMT + if (has_big_cores) { + pr_info("Big cores detected but using small core scheduling\n"); + powerpc_topology[0].mask = smallcore_smt_mask; + } +#endif +} + void __init smp_cpus_done(unsigned int max_cpus) { /* @@ -1381,12 +1391,7 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); -#ifdef CONFIG_SCHED_SMT - if (has_big_cores) { - pr_info("Big cores detected but using small core scheduling\n"); - powerpc_topology[0].mask = smallcore_smt_mask; - } -#endif + fixup_topology(); set_sched_topology(powerpc_topology); } -- 2.18.2
[PATCH v3 03/10] powerpc/smp: Move powerpc_topology above
Just moving the powerpc_topology description above. This will help in using functions in this file and avoid declarations. No other functional changes Cc: linuxppc-dev Cc: LKML Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Anton Blanchard Cc: Oliver O'Halloran Cc: Nathan Lynch Cc: Michael Neuling Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valentin Schneider Cc: Jordan Niethe Reviewed-by: Gautham R. Shenoy Signed-off-by: Srikar Dronamraju --- arch/powerpc/kernel/smp.c | 116 +++--- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 283a04e54f52..a685915e5941 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -818,6 +818,64 @@ static int init_cpu_l1_cache_map(int cpu) return err; } +static bool shared_caches; + +#ifdef CONFIG_SCHED_SMT +/* cpumask of CPUs with asymmetric SMT dependency */ +static int powerpc_smt_flags(void) +{ + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + flags |= SD_ASYM_PACKING; + } + return flags; +} +#endif + +/* + * P9 has a slightly odd architecture where pairs of cores share an L2 cache. + * This topology makes it *much* cheaper to migrate tasks between adjacent cores + * since the migrated task remains cache hot. We want to take advantage of this + * at the scheduler level so an extra topology level is required. + */ +static int powerpc_shared_cache_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} + +/* + * We can't just pass cpu_l2_cache_mask() directly because + * returns a non-const pointer and the compiler barfs on that. + */ +static const struct cpumask *shared_cache_mask(int cpu) +{ + if (shared_caches) + return cpu_l2_cache_mask(cpu); + + if (has_big_cores) + return cpu_smallcore_mask(cpu); + + return per_cpu(cpu_sibling_map, cpu); +} + +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *smallcore_smt_mask(int cpu) +{ + return cpu_smallcore_mask(cpu); +} +#endif + +static struct sched_domain_topology_level powerpc_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + static int init_big_cores(void) { int cpu; @@ -1247,8 +1305,6 @@ static void add_cpu_to_masks(int cpu) set_cpus_related(cpu, i, cpu_core_mask); } -static bool shared_caches; - /* Activate a secondary processor. */ void start_secondary(void *unused) { @@ -1312,62 +1368,6 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } -#ifdef CONFIG_SCHED_SMT -/* cpumask of CPUs with asymmetric SMT dependency */ -static int powerpc_smt_flags(void) -{ - int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; - - if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { - printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); - flags |= SD_ASYM_PACKING; - } - return flags; -} -#endif - -/* - * P9 has a slightly odd architecture where pairs of cores share an L2 cache. - * This topology makes it *much* cheaper to migrate tasks between adjacent cores - * since the migrated task remains cache hot. We want to take advantage of this - * at the scheduler level so an extra topology level is required. - */ -static int powerpc_shared_cache_flags(void) -{ - return SD_SHARE_PKG_RESOURCES; -} - -/* - * We can't just pass cpu_l2_cache_mask() directly because - * returns a non-const pointer and the compiler barfs on that. - */ -static const struct cpumask *shared_cache_mask(int cpu) -{ - if (shared_caches) - return cpu_l2_cache_mask(cpu); - - if (has_big_cores) - return cpu_smallcore_mask(cpu); - - return per_cpu(cpu_sibling_map, cpu); -} - -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *smallcore_smt_mask(int cpu) -{ - return cpu_smallcore_mask(cpu); -} -#endif - -static struct sched_domain_topology_level powerpc_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, -#endif - { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - void __init smp_cpus_done(unsigned int max_cpus) { /* -- 2.18.2
[PATCH v3 02/10] powerpc/smp: Merge Power9 topology with Power topology
A new sched_domain_topology_level was added just for Power9. However the same can be achieved by merging powerpc_topology with power9_topology and makes the code more simpler especially when adding a new sched domain. Cc: linuxppc-dev Cc: LKML Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Anton Blanchard Cc: Oliver O'Halloran Cc: Nathan Lynch Cc: Michael Neuling Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valentin Schneider Cc: Jordan Niethe Signed-off-by: Srikar Dronamraju --- Changelog v1 -> v2: Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu) since cpu_smt_mask is only defined under CONFIG_SCHED_SMT arch/powerpc/kernel/smp.c | 33 ++--- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index edf94ca64eea..283a04e54f52 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1313,7 +1313,7 @@ int setup_profiling_timer(unsigned int multiplier) } #ifdef CONFIG_SCHED_SMT -/* cpumask of CPUs with asymetric SMT dependancy */ +/* cpumask of CPUs with asymmetric SMT dependency */ static int powerpc_smt_flags(void) { int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; @@ -1326,14 +1326,6 @@ static int powerpc_smt_flags(void) } #endif -static struct sched_domain_topology_level powerpc_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, -#endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - /* * P9 has a slightly odd architecture where pairs of cores share an L2 cache. * This topology makes it *much* cheaper to migrate tasks between adjacent cores @@ -1351,7 +1343,13 @@ static int powerpc_shared_cache_flags(void) */ static const struct cpumask *shared_cache_mask(int cpu) { - return cpu_l2_cache_mask(cpu); + if (shared_caches) + return cpu_l2_cache_mask(cpu); + + if (has_big_cores) + return cpu_smallcore_mask(cpu); + + return per_cpu(cpu_sibling_map, cpu); } #ifdef CONFIG_SCHED_SMT @@ -1361,7 +1359,7 @@ static const struct cpumask *smallcore_smt_mask(int cpu) } #endif -static struct sched_domain_topology_level power9_topology[] = { +static struct sched_domain_topology_level powerpc_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, #endif @@ -1386,21 +1384,10 @@ void __init smp_cpus_done(unsigned int max_cpus) #ifdef CONFIG_SCHED_SMT if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); - power9_topology[0].mask = smallcore_smt_mask; powerpc_topology[0].mask = smallcore_smt_mask; } #endif - /* -* If any CPU detects that it's sharing a cache with another CPU then -* use the deeper topology that is aware of this sharing. -*/ - if (shared_caches) { - pr_info("Using shared cache scheduler topology\n"); - set_sched_topology(power9_topology); - } else { - pr_info("Using standard scheduler topology\n"); - set_sched_topology(powerpc_topology); - } + set_sched_topology(powerpc_topology); } #ifdef CONFIG_HOTPLUG_CPU -- 2.18.2
[PATCH v3 01/10] powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES
Fix a build warning in a non CONFIG_NEED_MULTIPLE_NODES "error: _numa_cpu_lookup_table_ undeclared" Cc: linuxppc-dev Cc: LKML Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Anton Blanchard Cc: Oliver O'Halloran Cc: Nathan Lynch Cc: Michael Neuling Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valentin Schneider Cc: Jordan Niethe Reviewed-by: Gautham R. Shenoy Signed-off-by: Srikar Dronamraju --- Changelog v2 -> v3: Removed node caching part. Rewrote the Commit msg (Michael Ellerman) Renamed to powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES arch/powerpc/kernel/smp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 73199470c265..edf94ca64eea 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -860,6 +860,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) GFP_KERNEL, cpu_to_node(cpu)); zalloc_cpumask_var_node(_cpu(cpu_core_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); +#ifdef CONFIG_NEED_MULTIPLE_NODES /* * numa_node_id() works after this. */ @@ -868,6 +869,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) set_cpu_numa_mem(cpu, local_memory_node(numa_cpu_lookup_table[cpu])); } +#endif } /* Init the cpumasks so the boot CPU is related to itself */ -- 2.18.2
Re: [v4] powerpc/perf: Initialize power10 PMU registers in cpu setup routine
On Thu, Jul 23, 2020 at 5:32 PM Athira Rajeev wrote: > > Initialize Monitor Mode Control Register 3 (MMCR3) > SPR which is new in power10. For PowerISA v3.1, BHRB disable > is controlled via Monitor Mode Control Register A (MMCRA) bit, > namely "BHRB Recording Disable (BHRBRD)". This patch also initializes > MMCRA BHRBRD to disable BHRB feature at boot for power10. > > Signed-off-by: Athira Rajeev Reviewed-by: Jordan Niethe > --- > Dependency: > - On power10 PMU base enablement series V3: > https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=190462 > > Changes from v3 -> v4 > - Addressed review comments from Jordan and Michael Ellerman. > This patch was initially part of Power10 PMU base enablement > series. Moving this as separate patch as suggested by Michael > Ellerman. Hence dependency of initial series Patch 7 which defines > MMCRA_BHRB_DISABLE. Addressed review comments from Jordan to make > sure existing PMU function (__INIT_PMU) will not overwrite ISA 3.1 > updates > > Changes from v2 -> v3 > - Addressed review comment from Michael Ellerman to > call PMU init from __setup_cpu_power10 > > arch/powerpc/kernel/cpu_setup_power.S | 19 +++ > 1 file changed, 15 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/kernel/cpu_setup_power.S > b/arch/powerpc/kernel/cpu_setup_power.S > index efdcfa7..3fa6eef 100644 > --- a/arch/powerpc/kernel/cpu_setup_power.S > +++ b/arch/powerpc/kernel/cpu_setup_power.S > @@ -94,13 +94,15 @@ _GLOBAL(__restore_cpu_power8) > _GLOBAL(__setup_cpu_power10) > mflrr11 > bl __init_FSCR_power10 > + bl __init_PMU > + bl __init_PMU_ISA31 > b 1f > > _GLOBAL(__setup_cpu_power9) > mflrr11 > bl __init_FSCR > -1: bl __init_PMU > - bl __init_hvmode_206 > + bl __init_PMU > +1: bl __init_hvmode_206 > mtlrr11 > beqlr > li r0,0 > @@ -124,13 +126,15 @@ _GLOBAL(__setup_cpu_power9) > _GLOBAL(__restore_cpu_power10) > mflrr11 > bl __init_FSCR_power10 > + bl __init_PMU > + bl __init_PMU_ISA31 > b 1f > > _GLOBAL(__restore_cpu_power9) > mflrr11 > bl __init_FSCR > -1: bl __init_PMU > - mfmsr r3 > + bl __init_PMU > +1: mfmsr r3 > rldicl. r0,r3,4,63 > mtlrr11 > beqlr > @@ -233,3 +237,10 @@ __init_PMU_ISA207: > li r5,0 > mtspr SPRN_MMCRS,r5 > blr > + > +__init_PMU_ISA31: > + li r5,0 > + mtspr SPRN_MMCR3,r5 > + LOAD_REG_IMMEDIATE(r5, MMCRA_BHRB_DISABLE) > + mtspr SPRN_MMCRA,r5 > + blr > -- > 1.8.3.1 >
[v4] powerpc/perf: Initialize power10 PMU registers in cpu setup routine
Initialize Monitor Mode Control Register 3 (MMCR3) SPR which is new in power10. For PowerISA v3.1, BHRB disable is controlled via Monitor Mode Control Register A (MMCRA) bit, namely "BHRB Recording Disable (BHRBRD)". This patch also initializes MMCRA BHRBRD to disable BHRB feature at boot for power10. Signed-off-by: Athira Rajeev --- Dependency: - On power10 PMU base enablement series V3: https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=190462 Changes from v3 -> v4 - Addressed review comments from Jordan and Michael Ellerman. This patch was initially part of Power10 PMU base enablement series. Moving this as separate patch as suggested by Michael Ellerman. Hence dependency of initial series Patch 7 which defines MMCRA_BHRB_DISABLE. Addressed review comments from Jordan to make sure existing PMU function (__INIT_PMU) will not overwrite ISA 3.1 updates Changes from v2 -> v3 - Addressed review comment from Michael Ellerman to call PMU init from __setup_cpu_power10 arch/powerpc/kernel/cpu_setup_power.S | 19 +++ 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index efdcfa7..3fa6eef 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -94,13 +94,15 @@ _GLOBAL(__restore_cpu_power8) _GLOBAL(__setup_cpu_power10) mflrr11 bl __init_FSCR_power10 + bl __init_PMU + bl __init_PMU_ISA31 b 1f _GLOBAL(__setup_cpu_power9) mflrr11 bl __init_FSCR -1: bl __init_PMU - bl __init_hvmode_206 + bl __init_PMU +1: bl __init_hvmode_206 mtlrr11 beqlr li r0,0 @@ -124,13 +126,15 @@ _GLOBAL(__setup_cpu_power9) _GLOBAL(__restore_cpu_power10) mflrr11 bl __init_FSCR_power10 + bl __init_PMU + bl __init_PMU_ISA31 b 1f _GLOBAL(__restore_cpu_power9) mflrr11 bl __init_FSCR -1: bl __init_PMU - mfmsr r3 + bl __init_PMU +1: mfmsr r3 rldicl. r0,r3,4,63 mtlrr11 beqlr @@ -233,3 +237,10 @@ __init_PMU_ISA207: li r5,0 mtspr SPRN_MMCRS,r5 blr + +__init_PMU_ISA31: + li r5,0 + mtspr SPRN_MMCR3,r5 + LOAD_REG_IMMEDIATE(r5, MMCRA_BHRB_DISABLE) + mtspr SPRN_MMCRA,r5 + blr -- 1.8.3.1
[powerpc:merge] BUILD SUCCESS c27fe454aff795023d2f3f90f41eb1a3104e614f
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git merge branch HEAD: c27fe454aff795023d2f3f90f41eb1a3104e614f Automatic merge of 'master', 'next' and 'fixes' (2020-07-21 00:00) elapsed time: 3913m configs tested: 111 configs skipped: 2 The following configs have been built successfully. More configs may be tested in the coming days. arm64allyesconfig arm64 defconfig arm64allmodconfig arm64 allnoconfig arm allyesconfig arm allmodconfig arm allnoconfig arm defconfig sh rsk7264_defconfig c6x defconfig c6xevmc6474_defconfig arm shannon_defconfig powerpc ppc64_defconfig arm footbridge_defconfig s390 alldefconfig s390 debug_defconfig arm pxa3xx_defconfig m68km5407c3_defconfig sh sdk7780_defconfig c6x dsk6455_defconfig arm h5000_defconfig i386 allyesconfig i386defconfig i386 debian-10.3 i386 allnoconfig ia64 allmodconfig ia64defconfig ia64 allnoconfig ia64 allyesconfig m68k allmodconfig m68k allnoconfig m68k sun3_defconfig m68kdefconfig m68k allyesconfig nios2 defconfig nios2allyesconfig openriscdefconfig c6x allyesconfig c6x allnoconfig openrisc allyesconfig nds32 defconfig nds32 allnoconfig csky allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig h8300allmodconfig xtensa defconfig arc defconfig arc allyesconfig sh allmodconfig shallnoconfig microblazeallnoconfig mips allyesconfig mips allnoconfig mips allmodconfig pariscallnoconfig parisc defconfig parisc allyesconfig parisc allmodconfig powerpc defconfig powerpc allyesconfig powerpc rhel-kconfig powerpc allmodconfig powerpc allnoconfig i386 randconfig-a001-20200719 i386 randconfig-a006-20200719 i386 randconfig-a002-20200719 i386 randconfig-a005-20200719 i386 randconfig-a003-20200719 i386 randconfig-a004-20200719 x86_64 randconfig-a014-20200720 x86_64 randconfig-a015-20200720 x86_64 randconfig-a016-20200720 x86_64 randconfig-a012-20200720 x86_64 randconfig-a013-20200720 x86_64 randconfig-a011-20200720 x86_64 randconfig-a005-20200719 x86_64 randconfig-a002-20200719 x86_64 randconfig-a006-20200719 x86_64 randconfig-a001-20200719 x86_64 randconfig-a003-20200719 x86_64 randconfig-a004-20200719 i386 randconfig-a015-20200719 i386 randconfig-a011-20200719 i386 randconfig-a016-20200719 i386 randconfig-a012-20200719 i386 randconfig-a013-20200719 i386 randconfig-a014-20200719 riscvallyesconfig riscv allnoconfig riscv defconfig riscvallmodconfig s390 allyesconfig s390 allnoconfig s390 allmodconfig s390defconfig sparcallyesconfig sparc