[powerpc:next] BUILD SUCCESS 335aca5f65f1a39670944930131da5f2276f888f

2020-07-23 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
next
branch HEAD: 335aca5f65f1a39670944930131da5f2276f888f  Merge branch 'scv' 
support into next

elapsed time: 951m

configs tested: 74
configs skipped: 1

The following configs have been built successfully.
More configs may be tested in the coming days.

arm64        allyesconfig
arm64        defconfig
arm64        allmodconfig
arm64        allnoconfig
arm          defconfig
arm          allyesconfig
arm          allmodconfig
arm          allnoconfig
i386         defconfig
i386         allyesconfig
i386         debian-10.3
i386         allnoconfig
ia64         allmodconfig
ia64         defconfig
ia64         allnoconfig
ia64         allyesconfig
m68k         allmodconfig
m68k         allnoconfig
m68k         sun3_defconfig
m68k         defconfig
m68k         allyesconfig
nios2        defconfig
nios2        allyesconfig
c6x          allyesconfig
c6x          allnoconfig
openrisc     allyesconfig
openrisc     defconfig
nds32        defconfig
nds32        allnoconfig
csky         allyesconfig
csky         defconfig
alpha        defconfig
alpha        allyesconfig
xtensa       allyesconfig
h8300        allyesconfig
h8300        allmodconfig
xtensa       defconfig
arc          defconfig
arc          allyesconfig
sh           allmodconfig
sh           allnoconfig
microblaze   allnoconfig
mips         allyesconfig
mips         allnoconfig
mips         allmodconfig
parisc       allnoconfig
parisc       defconfig
parisc       allyesconfig
parisc       allmodconfig
powerpc      defconfig
powerpc      allyesconfig
powerpc      rhel-kconfig
powerpc      allmodconfig
powerpc      allnoconfig
riscv        allyesconfig
riscv        allnoconfig
riscv        defconfig
riscv        allmodconfig
s390         allyesconfig
s390         allnoconfig
s390         allmodconfig
s390         defconfig
sparc        allyesconfig
sparc        defconfig
sparc64      allnoconfig
sparc64      allmodconfig
sparc64      defconfig
sparc64      allyesconfig
x86_64       rhel-7.6-kselftests
x86_64       rhel-8.3
x86_64       kexec
x86_64       rhel
x86_64       lkp
x86_64       fedora-25

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


Re: [PATCH v2 10/16] powerpc/powernv/pci: Refactor pnv_ioda_alloc_pe()

2020-07-23 Thread Alexey Kardashevskiy



On 22/07/2020 16:57, Oliver O'Halloran wrote:
> Rework the PE allocation logic to allow allocating blocks of PEs rather
> than individually. We'll use this to allocate contiguous blocks of PEs for
> the SR-IOVs.
> 
> This patch also adds code to pnv_ioda_alloc_pe() and pnv_ioda_reserve_pe() to
> use the existing, but unused, phb->pe_alloc_mutex. Currently these functions
> use atomic bit ops to release a currently allocated PE number. However,
> pnv_ioda_alloc_pe() wants to have exclusive access to the bit map while
> scanning for a hole large enough to accommodate the allocation size.
> 
> Signed-off-by: Oliver O'Halloran 
> ---
> v2: Add some details about the pe_alloc mutex and why we're using it.


Reviewed-by: Alexey Kardashevskiy 




> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 41 ++-
>  arch/powerpc/platforms/powernv/pci.h  |  2 +-
>  2 files changed, 34 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 2d36a9ebf0e9..c9c25fb0783c 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -145,23 +145,45 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, 
> int pe_no)
>   return;
>   }
>  
> + mutex_lock(&phb->ioda.pe_alloc_mutex);
>   if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
>   pr_debug("%s: PE %x was reserved on PHB#%x\n",
>__func__, pe_no, phb->hose->global_number);
> + mutex_unlock(&phb->ioda.pe_alloc_mutex);
>  
>   pnv_ioda_init_pe(phb, pe_no);
>  }
>  
> -struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
> +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
>  {
> - long pe;
> + struct pnv_ioda_pe *ret = NULL;
> + int run = 0, pe, i;
>  
> + mutex_lock(&phb->ioda.pe_alloc_mutex);
> +
> + /* scan backwards for a run of @count cleared bits */
>   for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
> - if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
> - return pnv_ioda_init_pe(phb, pe);
> + if (test_bit(pe, phb->ioda.pe_alloc)) {
> + run = 0;
> + continue;
> + }
> +
> + run++;
> + if (run == count)
> + break;
>   }
> + if (run != count)
> + goto out;
>  
> - return NULL;
> + for (i = pe; i < pe + count; i++) {
> + set_bit(i, phb->ioda.pe_alloc);
> + pnv_ioda_init_pe(phb, i);
> + }
> + ret = &phb->ioda.pe_array[pe];
> +
> +out:
> + mutex_unlock(&phb->ioda.pe_alloc_mutex);
> + return ret;
>  }
>  
>  void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
> @@ -173,7 +195,10 @@ void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
>   WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
>   kfree(pe->npucomp);
>   memset(pe, 0, sizeof(struct pnv_ioda_pe));
> +
> + mutex_lock(&phb->ioda.pe_alloc_mutex);
>   clear_bit(pe_num, phb->ioda.pe_alloc);
> + mutex_unlock(&phb->ioda.pe_alloc_mutex);
>  }
>  
>  /* The default M64 BAR is shared by all PEs */
> @@ -976,7 +1001,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct 
> pci_dev *dev)
>   if (pdn->pe_number != IODA_INVALID_PE)
>   return NULL;
>  
> - pe = pnv_ioda_alloc_pe(phb);
> + pe = pnv_ioda_alloc_pe(phb, 1);
>   if (!pe) {
>   pr_warn("%s: Not enough PE# available, disabling device\n",
>   pci_name(dev));
> @@ -1047,7 +1072,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct 
> pci_bus *bus, bool all)
>  
>   /* The PE number isn't pinned by M64 */
>   if (!pe)
> - pe = pnv_ioda_alloc_pe(phb);
> + pe = pnv_ioda_alloc_pe(phb, 1);
>  
>   if (!pe) {
>   pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
> @@ -3065,7 +3090,7 @@ static void __init pnv_pci_init_ioda_phb(struct 
> device_node *np,
>   pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
>   } else {
>   /* otherwise just allocate one */
> - root_pe = pnv_ioda_alloc_pe(phb);
> + root_pe = pnv_ioda_alloc_pe(phb, 1);
>   phb->ioda.root_pe_idx = root_pe->pe_number;
>   }
>  
> diff --git a/arch/powerpc/platforms/powernv/pci.h 
> b/arch/powerpc/platforms/powernv/pci.h
> index 23fc5e391c7f..06431a452130 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -224,7 +224,7 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct 
> pnv_ioda_pe *pe);
>  void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
>  void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
>  
> -struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb);
> +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count);
>  
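
As a quick illustration of the scan above, here is a small userspace model
of the backwards "run of @count clear bits" search that pnv_ioda_alloc_pe()
performs (plain C, not kernel code; the array size and test values are made
up for the example):

  #include <stdbool.h>
  #include <stdio.h>

  #define TOTAL_PE 16
  static bool pe_alloc[TOTAL_PE];          /* models phb->ioda.pe_alloc */

  /* Return the lowest PE of a free run of @count, scanning from the top, or -1. */
  static int find_free_run(int count)
  {
          int run = 0, pe;

          for (pe = TOTAL_PE - 1; pe >= 0; pe--) {
                  if (pe_alloc[pe]) {      /* an allocated bit breaks the run */
                          run = 0;
                          continue;
                  }
                  if (++run == count)
                          return pe;       /* run occupies pe .. pe + count - 1 */
          }
          return -1;                       /* no hole large enough */
  }

  int main(void)
  {
          pe_alloc[14] = true;                  /* carve a hole below the top */
          printf("%d\n", find_free_run(3));     /* prints 11: PEs 11..13 are free */
          return 0;
  }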

Re: [PATCH] powerpc/64s: Fix irq tracing corruption in interrupt/syscall return caused by perf interrupts

2020-07-23 Thread Alexey Kardashevskiy



On 23/07/2020 20:29, Nicholas Piggin wrote:
> Excerpts from Alexey Kardashevskiy's message of July 22, 2020 8:50 pm:
>>
>>
>> On 22/07/2020 17:34, Nicholas Piggin wrote:
>>> Alexey reports lockdep_assert_irqs_enabled() warnings when stress testing 
>>> perf, e.g.,
>>>
>>> WARNING: CPU: 0 PID: 1556 at kernel/softirq.c:169 
>>> __local_bh_enable_ip+0x258/0x270
>>> CPU: 0 PID: 1556 Comm: syz-executor
>>> NIP:  c01ec888 LR: c01ec884 CTR: c0ef0610
>>> REGS: c00022d4f8a0 TRAP: 0700   Not tainted  (5.8.0-rc3-x)
>>> MSR:  80029033   CR: 28008844  XER: 2004
>>> CFAR: c01dc1d0 IRQMASK: 0
>>>
>>> The interesting thing is MSR[EE] and IRQMASK shows interrupts are enabled,
>>> suggesting the current->hardirqs_enabled irq tracing state is going out of 
>>> sync
>>> with the actual interrupt enable state.
>>>
>>> The cause is a window in interrupt/syscall return where irq tracing state 
>>> is being
>>> adjusted for an irqs-enabled return while MSR[EE] is still enabled. A perf
>>> interrupt hits and ends up calling trace_hardirqs_off() when restoring
>>> interrupt flags to a disable state.
>>>
>>> Fix this by disabling perf interrupts as well while adjusting irq tracing 
>>> state.
>>>
>>> Add a debug check that catches the condition sooner.
>>>
>>> Fixes: 68b34588e202 ("powerpc/64/sycall: Implement syscall entry/exit logic 
>>> in C")
>>> Reported-by: Alexey Kardashevskiy 
>>> Signed-off-by: Nicholas Piggin 
>>> ---
>>>
>>> I can reproduce similar symptoms and this patch fixes my test case,
>>> still trying to confirm Alexey's test case or whether there's another
>>> similar bug causing it.
>>
>>
>> This does not fix my testcase. I applied this on top of 4fa640dc5230
>> ("Merge tag 'vfio-v5.8-rc7' of git://github.com/awilliam/linux-vfio into
>> master")  without any of my testing code, just to be clear. Sorry...
> 
> Okay, it seems to be a bigger problem and not actually caused by that
> patch; it was possible for the lockdep hardirqs_enabled state to get out
> of sync with the local_irq_disable() state before that too. The root
> cause is similar -- perf interrupts hitting between updating the two
> different bits of state.
> 
> Not quite sure why Alexey's test wasn't hitting it before the patch,
> but possibly the way masked interrupts get replayed. But I was able
> to hit the problem with a different assertion.
> 
> I think I have a fix, but it seems to be a generic irq tracing code
> issue. So this patch can be dropped, and it's not an urgent issue for
> the next release (it only triggers warns on rare occasions and only
> when lockdep is enabled).


I would still like to understand how the last
curr->hardirq_disable_event misses the ftrace buffer and we end up in
the original interrupted kernel code...


-- 
Alexey
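
To make the "two bits of state" point above concrete, here is a toy
userspace model (not kernel code, and deliberately oversimplified): the
hardware irq-enable state and lockdep's per-task flag are written at
different times, so a perf "NMI" landing between the two writes leaves them
disagreeing, which is the kind of inconsistency the warning above reports.

  #include <stdbool.h>
  #include <stdio.h>

  static bool hw_irqs_on = true;        /* models MSR[EE] */
  static bool traced_irqs_on = true;    /* models current->hardirqs_enabled */

  static void perf_nmi(void)
  {
          /* the NMI's flag restore records a disable in the tracing state */
          traced_irqs_on = false;
  }

  int main(void)
  {
          traced_irqs_on = true;        /* return path: tracing state updated first... */
          perf_nmi();                   /* ...NMI fires in the window... */
          hw_irqs_on = true;            /* ...then the real state is switched */

          /* prints hw=1 traced=0 -- the inconsistent pair lockdep trips over */
          printf("hw=%d traced=%d\n", hw_irqs_on, traced_irqs_on);
          return 0;
  }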


Re: [PATCH v2 14/14] powerpc/eeh: Move PE tree setup into the platform

2020-07-23 Thread Oliver O'Halloran
On Fri, Jul 24, 2020 at 3:01 PM Alexey Kardashevskiy  wrote:
>
>
>
> On 22/07/2020 14:26, Oliver O'Halloran wrote:
> > The EEH core has a concept of a "PE tree" to support PowerNV. The PE tree
> > follows the PCI bus structures because a reset asserted on an upstream
> > bridge will be propagated to the downstream bridges. On pseries there's a
> > 1-1 correspondence between what the guest sees as a PHB and a PE, so the
> > "tree" is really just a single node.
> >
> > Currently the EEH core is responsible for setting up this PE tree, which it
> > does by traversing the pci_dn tree. The structure of the pci_dn tree
> > matches the bus tree on PowerNV, which leads to the PE tree being "correct",
> > but this setup method doesn't make a whole lot of sense and it's actively
> > confusing for the pseries case where it doesn't really do anything.
> >
> > We want to remove the dependence on pci_dn anyway, so this patch moves
> > choosing where to insert a new PE into the platform code rather than
> > making it part of the generic EEH code. For PowerNV this simplifies the
> > tree building logic and removes the use of pci_dn. For pseries we
> > keep the existing logic. I'm not really convinced it does anything
> > due to the 1-1 PE-to-PHB correspondence, so every device under that
> > PHB should be in the same PE, but I'd rather not remove it entirely
> > until we've had a chance to look at it more deeply.
> >
> > Signed-off-by: Oliver O'Halloran 
> > Reviewed-by: Alexey Kardashevskiy 
> > ---
> > v2: Reworked pseries PE setup slightly. NOT DONE YET. mostly done needs test
>
> So far it was looking good and now this :)
>
> When is it going to be done? Is this the broken stuff you mentioned
> elsewhere?

I am dumb.

I put those in there to remind myself what I have / haven't done when
respinning a series. I added that before I tested it and forgot to
remove the comment.

>
>
> --
> Alexey


Re: [PATCH v2 01/14] powerpc/eeh: Remove eeh_dev_phb_init_dynamic()

2020-07-23 Thread Alexey Kardashevskiy



On 22/07/2020 14:26, Oliver O'Halloran wrote:
> This function is a one line wrapper around eeh_phb_pe_create() and despite
> the name it doesn't create any eeh_dev structures.

The "eeh_dev_phb_init_dynamic" name does not suggest anything really but
the comment does.


Reviewed-by: Alexey Kardashevskiy 



> Replace it with direct
> calls to eeh_phb_pe_create() since that does what it says on the tin
> and removes a layer of indirection.
> 
> Signed-off-by: Oliver O'Halloran 
> ---
> v2: Added stub prototype of eeh_phb_pe_create() for the !CONFIG_EEH case.
> ---
>  arch/powerpc/include/asm/eeh.h |  3 ++-
>  arch/powerpc/kernel/eeh.c  |  2 +-
>  arch/powerpc/kernel/eeh_dev.c  | 13 -
>  arch/powerpc/kernel/of_platform.c  |  4 ++--
>  arch/powerpc/platforms/pseries/pci_dlpar.c |  2 +-
>  5 files changed, 6 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 964a54292b36..64487b88c569 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -294,7 +294,6 @@ const char *eeh_pe_loc_get(struct eeh_pe *pe);
>  struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
>  
>  struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
> -void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
>  void eeh_show_enabled(void);
>  int __init eeh_ops_register(struct eeh_ops *ops);
>  int __exit eeh_ops_unregister(const char *name);
> @@ -370,6 +369,8 @@ void pseries_eeh_init_edev_recursive(struct pci_dn *pdn);
>  #else
>  static inline void pseries_eeh_add_device_early(struct pci_dn *pdn) { }
>  static inline void pseries_eeh_add_device_tree_early(struct pci_dn *pdn) { }
> +
> +static inline int eeh_phb_pe_create(struct pci_controller *phb) { return 0; }
>  #endif
>  
>  #ifdef CONFIG_PPC64
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index d407981dec76..859f76020256 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -1096,7 +1096,7 @@ static int eeh_init(void)
>  
>   /* Initialize PHB PEs */
>   list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
> - eeh_dev_phb_init_dynamic(hose);
> + eeh_phb_pe_create(hose);
>  
>   eeh_addr_cache_init();
>  
> diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
> index 7370185c7a05..8e159a12f10c 100644
> --- a/arch/powerpc/kernel/eeh_dev.c
> +++ b/arch/powerpc/kernel/eeh_dev.c
> @@ -52,16 +52,3 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
>  
>   return edev;
>  }
> -
> -/**
> - * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
> - * @phb: PHB
> - *
> - * Scan the PHB OF node and its child association, then create the
> - * EEH devices accordingly
> - */
> -void eeh_dev_phb_init_dynamic(struct pci_controller *phb)
> -{
> - /* EEH PE for PHB */
> - eeh_phb_pe_create(phb);
> -}
> diff --git a/arch/powerpc/kernel/of_platform.c 
> b/arch/powerpc/kernel/of_platform.c
> index 71a3f97dc988..f89376ff633e 100644
> --- a/arch/powerpc/kernel/of_platform.c
> +++ b/arch/powerpc/kernel/of_platform.c
> @@ -62,8 +62,8 @@ static int of_pci_phb_probe(struct platform_device *dev)
>   /* Init pci_dn data structures */
>   pci_devs_phb_init_dynamic(phb);
>  
> - /* Create EEH PEs for the PHB */
> - eeh_dev_phb_init_dynamic(phb);
> + /* Create EEH PE for the PHB */
> + eeh_phb_pe_create(phb);
>  
>   /* Scan the bus */
>   pcibios_scan_phb(phb);
> diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c 
> b/arch/powerpc/platforms/pseries/pci_dlpar.c
> index b3a38f5a6b68..f9ae17e8a0f4 100644
> --- a/arch/powerpc/platforms/pseries/pci_dlpar.c
> +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
> @@ -34,7 +34,7 @@ struct pci_controller *init_phb_dynamic(struct device_node 
> *dn)
>   pci_devs_phb_init_dynamic(phb);
>  
>   /* Create EEH devices for the PHB */
> - eeh_dev_phb_init_dynamic(phb);
> + eeh_phb_pe_create(phb);
>  
>   if (dn->child)
>   pseries_eeh_init_edev_recursive(PCI_DN(dn));
> 

-- 
Alexey


Re: [PATCH v2 14/14] powerpc/eeh: Move PE tree setup into the platform

2020-07-23 Thread Alexey Kardashevskiy



On 22/07/2020 14:26, Oliver O'Halloran wrote:
> The EEH core has a concept of a "PE tree" to support PowerNV. The PE tree
> follows the PCI bus structures because a reset asserted on an upstream
> bridge will be propagated to the downstream bridges. On pseries there's a
> 1-1 correspondence between what the guest sees as a PHB and a PE, so the
> "tree" is really just a single node.
> 
> Currently the EEH core is responsible for setting up this PE tree, which it
> does by traversing the pci_dn tree. The structure of the pci_dn tree
> matches the bus tree on PowerNV, which leads to the PE tree being "correct",
> but this setup method doesn't make a whole lot of sense and it's actively
> confusing for the pseries case where it doesn't really do anything.
> 
> We want to remove the dependence on pci_dn anyway, so this patch moves
> choosing where to insert a new PE into the platform code rather than
> making it part of the generic EEH code. For PowerNV this simplifies the
> tree building logic and removes the use of pci_dn. For pseries we
> keep the existing logic. I'm not really convinced it does anything
> due to the 1-1 PE-to-PHB correspondence, so every device under that
> PHB should be in the same PE, but I'd rather not remove it entirely
> until we've had a chance to look at it more deeply.
> 
> Signed-off-by: Oliver O'Halloran 
> Reviewed-by: Alexey Kardashevskiy 
> ---
> v2: Reworked pseries PE setup slightly. NOT DONE YET. mostly done needs test

So far it was looking good and now this :)

When is it going to be done? Is this the broken stuff you mentioned
elsewhere?


-- 
Alexey


[powerpc:fixes-test] BUILD SUCCESS 590ce02bd148cd35721560c140e3759e39a6e56a

2020-07-23 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
fixes-test
branch HEAD: 590ce02bd148cd35721560c140e3759e39a6e56a  powerpc/64s: Fix irq 
tracing corruption in interrupt/syscall return caused by perf interrupts

elapsed time: 905m

configs tested: 74
configs skipped: 1

The following configs have been built successfully.
More configs may be tested in the coming days.

arm          defconfig
arm          allyesconfig
arm          allmodconfig
arm          allnoconfig
arm64        allyesconfig
arm64        defconfig
arm64        allmodconfig
arm64        allnoconfig
i386         allyesconfig
i386         defconfig
i386         debian-10.3
i386         allnoconfig
ia64         allmodconfig
ia64         defconfig
ia64         allnoconfig
ia64         allyesconfig
m68k         allmodconfig
m68k         allnoconfig
m68k         sun3_defconfig
m68k         defconfig
m68k         allyesconfig
nds32        defconfig
nds32        allnoconfig
csky         allyesconfig
csky         defconfig
alpha        defconfig
alpha        allyesconfig
xtensa       allyesconfig
h8300        allyesconfig
h8300        allmodconfig
xtensa       defconfig
arc          defconfig
arc          allyesconfig
sh           allmodconfig
sh           allnoconfig
microblaze   allnoconfig
nios2        defconfig
nios2        allyesconfig
openrisc     defconfig
c6x          allyesconfig
c6x          allnoconfig
openrisc     allyesconfig
mips         allyesconfig
mips         allnoconfig
mips         allmodconfig
parisc       allnoconfig
parisc       defconfig
parisc       allyesconfig
parisc       allmodconfig
powerpc      defconfig
powerpc      allyesconfig
powerpc      rhel-kconfig
powerpc      allmodconfig
powerpc      allnoconfig
riscv        allyesconfig
riscv        allnoconfig
riscv        defconfig
riscv        allmodconfig
s390         allyesconfig
s390         allnoconfig
s390         allmodconfig
s390         defconfig
sparc        allyesconfig
sparc        defconfig
sparc64      defconfig
sparc64      allnoconfig
sparc64      allyesconfig
sparc64      allmodconfig
x86_64       rhel-7.6-kselftests
x86_64       rhel-8.3
x86_64       kexec
x86_64       rhel
x86_64       lkp
x86_64       fedora-25

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


Re: [PATCH v5 4/7] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.

2020-07-23 Thread Bharata B Rao
On Thu, Jul 23, 2020 at 01:07:21PM -0700, Ram Pai wrote:
> The Ultravisor is expected to explicitly call H_SVM_PAGE_IN for all the
> pages of the SVM before calling H_SVM_INIT_DONE. This causes a huge
> delay in transitioning the VM to SVM. The Ultravisor is only interested
> in the pages that contain the kernel, initrd and other important data
> structures. The rest contain throw-away content.
> 
> However if not all pages are requested by the Ultravisor, the Hypervisor
> continues to consider the GFNs corresponding to the non-requested pages
> as normal GFNs. This can lead to data-corruption and undefined behavior.
> 
> In H_SVM_INIT_DONE handler, move all the PFNs associated with the SVM's
> GFNs to secure-PFNs. Skip the GFNs that are already Paged-in or Shared
> or Paged-in followed by a Paged-out.
> 
> Cc: Paul Mackerras 
> Cc: Benjamin Herrenschmidt 
> Cc: Michael Ellerman 
> Cc: Bharata B Rao 
> Cc: Aneesh Kumar K.V 
> Cc: Sukadev Bhattiprolu 
> Cc: Laurent Dufour 
> Cc: Thiago Jung Bauermann 
> Cc: David Gibson 
> Cc: Claudio Carvalho 
> Cc: kvm-...@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Ram Pai 
> ---
>  Documentation/powerpc/ultravisor.rst|   2 +
>  arch/powerpc/include/asm/kvm_book3s_uvmem.h |   2 +
>  arch/powerpc/kvm/book3s_hv_uvmem.c  | 136 
> +---
>  3 files changed, 127 insertions(+), 13 deletions(-)
> 
> diff --git a/Documentation/powerpc/ultravisor.rst 
> b/Documentation/powerpc/ultravisor.rst
> index a1c8c37..ba6b1bf 100644
> --- a/Documentation/powerpc/ultravisor.rst
> +++ b/Documentation/powerpc/ultravisor.rst
> @@ -934,6 +934,8 @@ Return values
>   * H_UNSUPPORTED if called from the wrong context (e.g.
>   from an SVM or before an H_SVM_INIT_START
>   hypercall).
> + * H_STATE   if the hypervisor could not successfully
> +transition the VM to Secure VM.
>  
>  Description
>  ~~~
> diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h 
> b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
> index 9cb7d8b..f229ab5 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
> @@ -23,6 +23,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
>  unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm);
>  void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
>struct kvm *kvm, bool skip_page_out);
> +int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
> + const struct kvm_memory_slot *memslot);

I still don't see why this needs to be a global function. You should be able
to move around a few functions in book3s_hv_uvmem.c up/down and
satisfy the calling order dependencies.

Otherwise, Reviewed-by: Bharata B Rao 


Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-07-23 Thread Alexey Kardashevskiy



On 23/07/2020 23:11, Nicholas Piggin wrote:
> Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm:
>> On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote:
>>
>>> diff --git a/arch/powerpc/include/asm/hw_irq.h 
>>> b/arch/powerpc/include/asm/hw_irq.h
>>> index 3a0db7b0b46e..35060be09073 100644
>>> --- a/arch/powerpc/include/asm/hw_irq.h
>>> +++ b/arch/powerpc/include/asm/hw_irq.h
>>> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
>>>  #define powerpc_local_irq_pmu_save(flags)  \
>>>  do {   \
>>> raw_local_irq_pmu_save(flags);  \
>>> -   trace_hardirqs_off();   \
>>> +   if (!raw_irqs_disabled_flags(flags))\
>>> +   trace_hardirqs_off();   \
>>> } while(0)
>>>  #define powerpc_local_irq_pmu_restore(flags)   \
>>> do {\
>>> -   if (raw_irqs_disabled_flags(flags)) {   \
>>> -   raw_local_irq_pmu_restore(flags);   \
>>> -   trace_hardirqs_off();   \
>>> -   } else {\
>>> +   if (!raw_irqs_disabled_flags(flags))\
>>> trace_hardirqs_on();\
>>> -   raw_local_irq_pmu_restore(flags);   \
>>> -   }   \
>>> +   raw_local_irq_pmu_restore(flags);   \
>>> } while(0)
>>
>> You shouldn't be calling lockdep from NMI context!
> 
> After this patch it doesn't.
> 
> trace_hardirqs_on/off implementation appears to expect to be called in NMI 
> context though, for some reason.
> 
>> That is, I recently
>> added suport for that on x86:
>>
>>   https://lkml.kernel.org/r/20200623083721.155449...@infradead.org
>>   https://lkml.kernel.org/r/20200623083721.216740...@infradead.org
>>
>> But you need to be very careful on how you order things, as you can see
>> the above relies on preempt_count() already having been incremented with
>> NMI_MASK.
> 
> Hmm. My patch seems simpler.

And your patches fix my error while Peter's do not:


IRQs not enabled as expected
WARNING: CPU: 0 PID: 1377 at /home/aik/p/kernel/kernel/softirq.c:169
__local_bh_enable_ip+0x118/0x190


> 
> I don't know this stuff very well, I don't really understand what your patch 
> enables for x86 but at least it shouldn't be incompatible with this one 
> AFAIKS.
> 
> Thanks,
> Nick
> 

-- 
Alexey


Re: [PATCH 15/15] powerpc/powernv/sriov: Make single PE mode a per-BAR setting

2020-07-23 Thread Oliver O'Halloran
On Wed, Jul 22, 2020 at 8:06 PM Alexey Kardashevskiy  wrote:
>
> >> Well, realistically the segment size should be 8MB to make this matter
> >> (or the whole window 2GB) which does not seem to happen so it does not
> >> matter.
> >
> > I'm not sure what you mean.
>
> I mean how can we possibly hit this case, what m64_segsize would the
> platform have to trigger this. The whole check seems useless but whatever.

Yeah maybe.

IIRC some old P8 FSP systems had tiny M64 windows so it might have
been an issue there. Maybe we can get rid of it, but I'd rather just
leave the behaviour as-is for now.


Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-07-23 Thread kernel test robot
Hi Nicholas,

I love your patch! Perhaps something to improve:

[auto build test WARNING on linux/master]
[also build test WARNING on powerpc/next linus/master v5.8-rc6 next-20200723]
[cannot apply to tip/locking/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Nicholas-Piggin/lockdep-improve-current-hard-soft-irqs_enabled-synchronisation-with-actual-irq-state/20200723-185938
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
9ebcfadb0610322ac537dd7aa5d9cbc2b2894c68
config: i386-randconfig-s001-20200723 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-14) 9.3.0
reproduce:
# apt-get install sparse
# sparse version: v0.6.2-93-g4c6cbe55-dirty
# save the attached .config to linux build tree
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 


sparse warnings: (new ones prefixed by >>)

   kernel/locking/spinlock.c:149:17: sparse: sparse: context imbalance in 
'_raw_spin_lock' - wrong count at exit
   kernel/locking/spinlock.c: note: in included file (through 
include/linux/preempt.h):
>> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in 
>> '_raw_spin_lock_irqsave' - wrong count at exit
>> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in 
>> '_raw_spin_lock_irq' - wrong count at exit
   kernel/locking/spinlock.c:173:17: sparse: sparse: context imbalance in 
'_raw_spin_lock_bh' - wrong count at exit
   kernel/locking/spinlock.c:181:17: sparse: sparse: context imbalance in 
'_raw_spin_unlock' - unexpected unlock
   kernel/locking/spinlock.c:189:17: sparse: sparse: context imbalance in 
'_raw_spin_unlock_irqrestore' - unexpected unlock
   kernel/locking/spinlock.c:197:17: sparse: sparse: context imbalance in 
'_raw_spin_unlock_irq' - unexpected unlock
   kernel/locking/spinlock.c:205:17: sparse: sparse: context imbalance in 
'_raw_spin_unlock_bh' - unexpected unlock
   kernel/locking/spinlock.c:221:17: sparse: sparse: context imbalance in 
'_raw_read_lock' - wrong count at exit
>> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in 
>> '_raw_read_lock_irqsave' - wrong count at exit
>> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in 
>> '_raw_read_lock_irq' - wrong count at exit
   kernel/locking/spinlock.c:245:17: sparse: sparse: context imbalance in 
'_raw_read_lock_bh' - wrong count at exit
   kernel/locking/spinlock.c:253:17: sparse: sparse: context imbalance in 
'_raw_read_unlock' - unexpected unlock
   kernel/locking/spinlock.c:261:17: sparse: sparse: context imbalance in 
'_raw_read_unlock_irqrestore' - unexpected unlock
   kernel/locking/spinlock.c:269:17: sparse: sparse: context imbalance in 
'_raw_read_unlock_irq' - unexpected unlock
   kernel/locking/spinlock.c:277:17: sparse: sparse: context imbalance in 
'_raw_read_unlock_bh' - unexpected unlock
   kernel/locking/spinlock.c:293:17: sparse: sparse: context imbalance in 
'_raw_write_lock' - wrong count at exit
>> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in 
>> '_raw_write_lock_irqsave' - wrong count at exit
>> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in 
>> '_raw_write_lock_irq' - wrong count at exit
   kernel/locking/spinlock.c:317:17: sparse: sparse: context imbalance in 
'_raw_write_lock_bh' - wrong count at exit
   kernel/locking/spinlock.c:325:17: sparse: sparse: context imbalance in 
'_raw_write_unlock' - unexpected unlock
   kernel/locking/spinlock.c:333:17: sparse: sparse: context imbalance in 
'_raw_write_unlock_irqrestore' - unexpected unlock
   kernel/locking/spinlock.c:341:17: sparse: sparse: context imbalance in 
'_raw_write_unlock_irq' - unexpected unlock
   kernel/locking/spinlock.c:349:17: sparse: sparse: context imbalance in 
'_raw_write_unlock_bh' - unexpected unlock
   kernel/locking/spinlock.c:358:17: sparse: sparse: context imbalance in 
'_raw_spin_lock_nested' - wrong count at exit
>> arch/x86/include/asm/preempt.h:79:9: sparse: sparse: context imbalance in 
>> '_raw_spin_lock_irqsave_nested' - wrong count at exit
   kernel/locking/spinlock.c:380:17: sparse: sparse: context imbalance in 
'_raw_spin_lock_nest_lock' - wrong count at exit
--
   kernel/trace/ring_buffer.c:699:32: sparse: sparse: incorrect type in return 
expression (different base types) @@ expected restricted __poll_t @@ 
got int @@
   kernel/trace/ring_buffer.c:699:32: sparse: expected restricted __poll_t
   kernel/trace/ring_buffer.c:699:32: sparse: got int
   kernel/trace/ring_buffer.c: note: in included file (through 
include/linux/irqflags.h, 

Re: [PATCH v5 7/7] KVM: PPC: Book3S HV: rework secure mem slot dropping

2020-07-23 Thread Bharata B Rao
On Thu, Jul 23, 2020 at 01:07:24PM -0700, Ram Pai wrote:
> From: Laurent Dufour 
> 
> When a secure memslot is dropped, all the pages backed in the secure
> device (aka really backed by secure memory by the Ultravisor)
> should be paged out to a normal page. Previously, this was
> achieved by triggering the page fault mechanism, which calls
> kvmppc_svm_page_out() on each page.
> 
> This can't work when hot unplugging a memory slot because the memory
> slot is flagged as invalid and gfn_to_pfn() is then not trying to access
> the page, so the page fault mechanism is not triggered.
> 
> Since the final goal is to make a call to kvmppc_svm_page_out() it seems
> simpler to call directly instead of triggering such a mechanism. This
> way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a
> memslot.
> 
> Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock,
> the call to __kvmppc_svm_page_out() is made.  As
> __kvmppc_svm_page_out needs the vma pointer to migrate the pages,
> the VMA is fetched in a lazy way, to not trigger find_vma() all
> the time. In addition, the mmap_sem is held in read mode during
> that time, not in write mode since the virtual memory layout is not
> impacted, and kvm->arch.uvmem_lock prevents concurrent operation
> on the secure device.
> 
> Cc: Ram Pai 
> Cc: Bharata B Rao 
> Cc: Paul Mackerras 
> Signed-off-by: Ram Pai 
>   [modified the changelog description]
> Signed-off-by: Laurent Dufour 
> ---
>  arch/powerpc/kvm/book3s_hv_uvmem.c | 54 
> ++
>  1 file changed, 37 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
> b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index c772e92..daffa6e 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -632,35 +632,55 @@ static inline int kvmppc_svm_page_out(struct 
> vm_area_struct *vma,
>   * fault on them, do fault time migration to replace the device PTEs in
>   * QEMU page table with normal PTEs from newly allocated pages.
>   */
> -void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
> +void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
>struct kvm *kvm, bool skip_page_out)
>  {
>   int i;
>   struct kvmppc_uvmem_page_pvt *pvt;
> - unsigned long pfn, uvmem_pfn;
> - unsigned long gfn = free->base_gfn;
> + struct page *uvmem_page;
> + struct vm_area_struct *vma = NULL;
> + unsigned long uvmem_pfn, gfn;
> + unsigned long addr, end;
> +
> + mmap_read_lock(kvm->mm);
> +
> + addr = slot->userspace_addr;
> + end = addr + (slot->npages * PAGE_SIZE);
>  
> - for (i = free->npages; i; --i, ++gfn) {
> - struct page *uvmem_page;
> + gfn = slot->base_gfn;
> + for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) {
> +
> + /* Fetch the VMA if addr is not in the latest fetched one */
> + if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) {
> + vma = find_vma_intersection(kvm->mm, addr, end);
> + if (!vma ||
> + vma->vm_start > addr || vma->vm_end < end) {
> + pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
> + break;
> + }

There is a potential issue with the boundary condition check here
which I discussed with Laurent yesterday. Guess he hasn't gotten around
to looking at it yet.

Regards,
Bharata.


Re: [v4] powerpc/perf: Initialize power10 PMU registers in cpu setup routine

2020-07-23 Thread kernel test robot
Hi Athira,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on v5.8-rc6 next-20200723]
[cannot apply to mpe/next scottwood/next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Athira-Rajeev/powerpc-perf-Initialize-power10-PMU-registers-in-cpu-setup-routine/20200723-153537
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-defconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   arch/powerpc/kernel/cpu_setup_power.S: Assembler messages:
>> arch/powerpc/kernel/cpu_setup_power.S:244: Error: non-constant expression in 
>> ".if" statement
>> arch/powerpc/kernel/cpu_setup_power.S:244: Error: non-constant expression in 
>> ".if" statement
>> arch/powerpc/kernel/cpu_setup_power.S:243: Error: unsupported relocation 
>> against SPRN_MMCR3

vim +244 arch/powerpc/kernel/cpu_setup_power.S

14  
15  /* Entry: r3 = crap, r4 = ptr to cputable entry
16   *
17   * Note that we can be called twice for pseudo-PVRs
18   */
19  _GLOBAL(__setup_cpu_power7)
20  mflr    r11
21  bl  __init_hvmode_206
22  mtlr    r11
23  beqlr
24  li  r0,0
25  mtspr   SPRN_LPID,r0
26  LOAD_REG_IMMEDIATE(r0, PCR_MASK)
27  mtspr   SPRN_PCR,r0
28  mfspr   r3,SPRN_LPCR
29  li  r4,(LPCR_LPES1 >> LPCR_LPES_SH)
30  bl  __init_LPCR_ISA206
31  mtlr    r11
32  blr
33  
34  _GLOBAL(__restore_cpu_power7)
35  mflr    r11
36  mfmsr   r3
37  rldicl. r0,r3,4,63
38  beqlr
39  li  r0,0
40  mtspr   SPRN_LPID,r0
41  LOAD_REG_IMMEDIATE(r0, PCR_MASK)
42  mtspr   SPRN_PCR,r0
43  mfspr   r3,SPRN_LPCR
44  li  r4,(LPCR_LPES1 >> LPCR_LPES_SH)
45  bl  __init_LPCR_ISA206
46  mtlr    r11
47  blr
48  
49  _GLOBAL(__setup_cpu_power8)
50  mflr    r11
51  bl  __init_FSCR
52  bl  __init_PMU
53  bl  __init_PMU_ISA207
54  bl  __init_hvmode_206
55  mtlr    r11
56  beqlr
57  li  r0,0
58  mtspr   SPRN_LPID,r0
59  LOAD_REG_IMMEDIATE(r0, PCR_MASK)
60  mtspr   SPRN_PCR,r0
61  mfspr   r3,SPRN_LPCR
62  ori r3, r3, LPCR_PECEDH
63  li  r4,0 /* LPES = 0 */
64  bl  __init_LPCR_ISA206
65  bl  __init_HFSCR
66  bl  __init_PMU_HV
67  bl  __init_PMU_HV_ISA207
68  mtlr    r11
69  blr
70  
71  _GLOBAL(__restore_cpu_power8)
72  mflr    r11
73  bl  __init_FSCR
74  bl  __init_PMU
75  bl  __init_PMU_ISA207
76  mfmsr   r3
77  rldicl. r0,r3,4,63
78  mtlr    r11
79  beqlr
80  li  r0,0
81  mtspr   SPRN_LPID,r0
82  LOAD_REG_IMMEDIATE(r0, PCR_MASK)
83  mtspr   SPRN_PCR,r0
84  mfspr   r3,SPRN_LPCR
85  ori r3, r3, LPCR_PECEDH
86  li  r4,0 /* LPES = 0 */
87  bl  __init_LPCR_ISA206
88  bl  __init_HFSCR
89  bl  __init_PMU_HV
90  bl  __init_PMU_HV_ISA207
91  mtlr    r11
92  blr
93  
94  _GLOBAL(__setup_cpu_power10)
95  mflr    r11
96  bl  __init_FSCR_power10
97  bl  __init_PMU
98  bl  __init_PMU_ISA31
99  b   1f
   100  
   101  _GLOBAL(__setup_cpu_power9)
   102  mflr    r11
   103  bl  __init_FSCR
   104  bl  __init_PMU
   105  1:  bl  __init_hvmode_206
   106  mtlr    r11
   107  beqlr
   108  li  r0,0
   109  mtspr   SPRN_PSSCR,r0
   110  mtspr   SPRN_LPID,r0
   111  mtspr   SPRN_PID,r0
   112  LOAD_REG_IMMEDIATE(r0, PCR_MASK)
   11

Re: [PATCH 2/2] lockdep: warn on redundant or incorrect irq state changes

2020-07-23 Thread kernel test robot
Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on powerpc/next linus/master v5.8-rc6]
[cannot apply to tip/locking/core next-20200723]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Nicholas-Piggin/lockdep-improve-current-hard-soft-irqs_enabled-synchronisation-with-actual-irq-state/20200723-185938
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
9ebcfadb0610322ac537dd7aa5d9cbc2b2894c68
config: x86_64-randconfig-a002-20200723 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-14) 9.3.0
reproduce (this is a W=1 build):
# save the attached .config to linux build tree
make W=1 ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   kernel/locking/lockdep.c: In function 'lockdep_init':
>> kernel/locking/lockdep.c:5673:9: error: 'struct task_struct' has no member 
>> named 'hardirqs_enabled'
5673 |  current->hardirqs_enabled = 1;
 | ^~
>> kernel/locking/lockdep.c:5674:9: error: 'struct task_struct' has no member 
>> named 'softirqs_enabled'
5674 |  current->softirqs_enabled = 1;
 | ^~
   In file included from kernel/locking/lockdep.c:60:
   At top level:
   kernel/locking/lockdep_internals.h:64:28: warning: 'LOCKF_USED_IN_IRQ_READ' 
defined but not used [-Wunused-const-variable=]
  64 | static const unsigned long LOCKF_USED_IN_IRQ_READ =
 |^~
   In file included from kernel/locking/lockdep.c:60:
   kernel/locking/lockdep_internals.h:58:28: warning: 'LOCKF_ENABLED_IRQ_READ' 
defined but not used [-Wunused-const-variable=]
  58 | static const unsigned long LOCKF_ENABLED_IRQ_READ =
 |^~
   In file included from kernel/locking/lockdep.c:60:
   kernel/locking/lockdep_internals.h:52:28: warning: 'LOCKF_USED_IN_IRQ' 
defined but not used [-Wunused-const-variable=]
  52 | static const unsigned long LOCKF_USED_IN_IRQ =
 |^
   In file included from kernel/locking/lockdep.c:60:
   kernel/locking/lockdep_internals.h:46:28: warning: 'LOCKF_ENABLED_IRQ' 
defined but not used [-Wunused-const-variable=]
  46 | static const unsigned long LOCKF_ENABLED_IRQ =
 |^

vim +5673 kernel/locking/lockdep.c

  5667  
  5668  printk(" per task-struct memory footprint: %zu bytes\n",
  5669 sizeof(((struct task_struct *)NULL)->held_locks));
  5670  
  5671  WARN_ON(irqs_disabled());
  5672  
> 5673  current->hardirqs_enabled = 1;
> 5674  current->softirqs_enabled = 1;
  5675  }
  5676  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org




Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-07-23 Thread kernel test robot
Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on powerpc/next linus/master v5.8-rc6 next-20200723]
[cannot apply to tip/locking/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Nicholas-Piggin/lockdep-improve-current-hard-soft-irqs_enabled-synchronisation-with-actual-irq-state/20200723-185938
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
9ebcfadb0610322ac537dd7aa5d9cbc2b2894c68
config: nds32-allyesconfig (attached as .config)
compiler: nds32le-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
ARCH=nds32 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   In file included from include/asm-generic/bitops.h:14,
from ./arch/nds32/include/generated/asm/bitops.h:1,
from include/linux/bitops.h:29,
from include/linux/kernel.h:12,
from include/linux/list.h:9,
from include/linux/rculist.h:10,
from include/linux/pid.h:5,
from include/linux/sched.h:14,
from arch/nds32/kernel/asm-offsets.c:4:
   include/linux/spinlock_api_smp.h: In function '__raw_spin_lock_irq':
>> include/linux/irqflags.h:158:31: error: implicit declaration of function 
>> 'arch_irqs_disabled'; did you mean 'raw_irqs_disabled'? 
>> [-Werror=implicit-function-declaration]
 158 | #define raw_irqs_disabled()  (arch_irqs_disabled())
 |   ^~
   include/linux/irqflags.h:174:23: note: in expansion of macro 
'raw_irqs_disabled'
 174 |   bool was_disabled = raw_irqs_disabled(); \
 |   ^
   include/linux/spinlock_api_smp.h:126:2: note: in expansion of macro 
'local_irq_disable'
 126 |  local_irq_disable();
 |  ^
   cc1: some warnings being treated as errors
   make[2]: *** [scripts/Makefile.build:114: arch/nds32/kernel/asm-offsets.s] 
Error 1
   make[2]: Target '__build' not remade because of errors.
   make[1]: *** [Makefile:1175: prepare0] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [Makefile:185: __sub-make] Error 2
   make: Target 'prepare' not remade because of errors.

vim +158 include/linux/irqflags.h

81d68a96a398448 Steven Rostedt 2008-05-12  132  
df9ee29270c11db David Howells  2010-10-07  133  /*
df9ee29270c11db David Howells  2010-10-07  134   * Wrap the arch provided IRQ 
routines to provide appropriate checks.
df9ee29270c11db David Howells  2010-10-07  135   */
df9ee29270c11db David Howells  2010-10-07  136  #define raw_local_irq_disable() 
arch_local_irq_disable()
df9ee29270c11db David Howells  2010-10-07  137  #define raw_local_irq_enable()  
arch_local_irq_enable()
df9ee29270c11db David Howells  2010-10-07  138  #define 
raw_local_irq_save(flags)   \
df9ee29270c11db David Howells  2010-10-07  139  do {
\
df9ee29270c11db David Howells  2010-10-07  140  
typecheck(unsigned long, flags);\
df9ee29270c11db David Howells  2010-10-07  141  flags = 
arch_local_irq_save();  \
df9ee29270c11db David Howells  2010-10-07  142  } while (0)
df9ee29270c11db David Howells  2010-10-07  143  #define 
raw_local_irq_restore(flags)\
df9ee29270c11db David Howells  2010-10-07  144  do {
\
df9ee29270c11db David Howells  2010-10-07  145  
typecheck(unsigned long, flags);\
df9ee29270c11db David Howells  2010-10-07  146  
arch_local_irq_restore(flags);  \
df9ee29270c11db David Howells  2010-10-07  147  } while (0)
df9ee29270c11db David Howells  2010-10-07  148  #define 
raw_local_save_flags(flags) \
df9ee29270c11db David Howells  2010-10-07  149  do {
\
df9ee29270c11db David Howells  2010-10-07  150  
typecheck(unsigned long, flags);\
df9ee29270c11db David Howells  2010-10-07  151  flags = 
arch_local_save_flags();\
df9ee29270c11db David Howells  2010-10-07  152  } while (0)
df9ee29270c11db David Howells  2010-10-07  153  #define 
raw_irqs_disabled_flags(flags) 

[powerpc:merge] BUILD SUCCESS d6c13d397d6988ec3e6029cae9e80501364cf9cb

2020-07-23 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
merge
branch HEAD: d6c13d397d6988ec3e6029cae9e80501364cf9cb  Automatic merge of 
'master', 'next' and 'fixes' (2020-07-22 23:08)

elapsed time: 2204m

configs tested: 74
configs skipped: 1

The following configs have been built successfully.
More configs may be tested in the coming days.

arm          defconfig
arm          allyesconfig
arm          allmodconfig
arm          allnoconfig
arm64        allyesconfig
arm64        defconfig
arm64        allmodconfig
arm64        allnoconfig
i386         defconfig
i386         debian-10.3
i386         allnoconfig
i386         allyesconfig
ia64         allmodconfig
ia64         defconfig
ia64         allnoconfig
ia64         allyesconfig
m68k         allmodconfig
m68k         allnoconfig
m68k         sun3_defconfig
m68k         defconfig
m68k         allyesconfig
nios2        defconfig
nios2        allyesconfig
openrisc     defconfig
c6x          allyesconfig
c6x          allnoconfig
openrisc     allyesconfig
nds32        defconfig
nds32        allnoconfig
csky         allyesconfig
csky         defconfig
alpha        defconfig
alpha        allyesconfig
xtensa       allyesconfig
h8300        allyesconfig
h8300        allmodconfig
xtensa       defconfig
arc          defconfig
arc          allyesconfig
sh           allmodconfig
sh           allnoconfig
microblaze   allnoconfig
mips         allyesconfig
mips         allnoconfig
mips         allmodconfig
parisc       allnoconfig
parisc       defconfig
parisc       allyesconfig
parisc       allmodconfig
powerpc      defconfig
powerpc      allyesconfig
powerpc      rhel-kconfig
powerpc      allmodconfig
powerpc      allnoconfig
riscv        allyesconfig
riscv        allnoconfig
riscv        defconfig
riscv        allmodconfig
s390         allyesconfig
s390         allnoconfig
s390         allmodconfig
s390         defconfig
sparc        allyesconfig
sparc        defconfig
sparc64      defconfig
sparc64      allnoconfig
sparc64      allyesconfig
sparc64      allmodconfig
x86_64       rhel-7.6-kselftests
x86_64       rhel-8.3
x86_64       kexec
x86_64       rhel
x86_64       lkp
x86_64       fedora-25

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


Re: [PATCH v2 2/3] powerpc/powernv/idle: save-restore DAWR0,DAWRX0 for P10

2020-07-23 Thread Michael Neuling
On Fri, 2020-07-10 at 10:52 +0530, Pratik Rajesh Sampat wrote:
> Additional registers DAWR0, DAWRX0 may be lost on Power 10 for
> stop levels < 4.
> Therefore save the values of these SPRs before entering a  "stop"
> state and restore their values on wakeup.
> 
> Signed-off-by: Pratik Rajesh Sampat 
> ---
>  arch/powerpc/platforms/powernv/idle.c | 10 ++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/powernv/idle.c
> b/arch/powerpc/platforms/powernv/idle.c
> index 19d94d021357..f2e2a6a4c274 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -600,6 +600,8 @@ struct p9_sprs {
>   u64 iamr;
>   u64 amor;
>   u64 uamor;
> + u64 dawr0;
> + u64 dawrx0;
>  };
>  
>  static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
> @@ -687,6 +689,10 @@ static unsigned long power9_idle_stop(unsigned long
> psscr, bool mmu_on)
>   sprs.iamr   = mfspr(SPRN_IAMR);
>   sprs.amor   = mfspr(SPRN_AMOR);
>   sprs.uamor  = mfspr(SPRN_UAMOR);
> + if (cpu_has_feature(CPU_FTR_ARCH_31)) {

Can you add a comment here saying that even though DAWR0 is ARCH_30, it only
needs to be saved on ARCH_31? Otherwise this looks pretty odd.

> + sprs.dawr0 = mfspr(SPRN_DAWR0);
> + sprs.dawrx0 = mfspr(SPRN_DAWRX0);
> + }
>  
>   srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */
>  
> @@ -710,6 +716,10 @@ static unsigned long power9_idle_stop(unsigned long
> psscr, bool mmu_on)
>   mtspr(SPRN_IAMR,sprs.iamr);
>   mtspr(SPRN_AMOR,sprs.amor);
>   mtspr(SPRN_UAMOR,   sprs.uamor);
> + if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> + mtspr(SPRN_DAWR0, sprs.dawr0);
> + mtspr(SPRN_DAWRX0, sprs.dawrx0);
> + }
>  
>   /*
>* Workaround for POWER9 DD2.0, if we lost resources, the ERAT
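
A sketch of what Michael is asking for -- the comment wording here is mine,
not from the patch:

  if (cpu_has_feature(CPU_FTR_ARCH_31)) {
          /*
           * DAWR0/DAWRX0 exist on ISA v3.0 (ARCH_30) too, but only
           * POWER10 (ARCH_31) can lose them for stop levels < 4, so
           * only save/restore them there.
           */
          sprs.dawr0  = mfspr(SPRN_DAWR0);
          sprs.dawrx0 = mfspr(SPRN_DAWRX0);
  }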



[PATCH] powerpc/test_emulate_sstep: Fix build error

2020-07-23 Thread Michael Ellerman
ppc64_book3e_allmodconfig fails with:

  arch/powerpc/lib/test_emulate_step.c: In function 'test_pld':
  arch/powerpc/lib/test_emulate_step.c:113:7: error: implicit declaration of 
function 'cpu_has_feature'
113 |  if (!cpu_has_feature(CPU_FTR_ARCH_31)) {
|   ^~~

Add an include of cpu_has_feature.h to fix it.

Fixes: b6b54b42722a ("powerpc/sstep: Add tests for prefixed integer 
load/stores")
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/lib/test_emulate_step.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/lib/test_emulate_step.c 
b/arch/powerpc/lib/test_emulate_step.c
index 081b05480c47..d242e9f72e0c 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -8,6 +8,7 @@
 #define pr_fmt(fmt) "emulate_step_test: " fmt
 
 #include <linux/ptrace.h>
+#include <asm/cpu_has_feature.h>
 #include <asm/sstep.h>
 #include <asm/ppc-opcode.h>
 #include <asm/code-patching.h>
-- 
2.25.1



Re: [PATCH v4 10/12] ppc64/kexec_file: prepare elfcore header for crashing kernel

2020-07-23 Thread Thiago Jung Bauermann


Hari Bathini  writes:

> Prepare elf headers for the crashing kernel's core file using
> crash_prepare_elf64_headers() and pass on this info to kdump
> kernel by updating its command line with elfcorehdr parameter.
> Also, add elfcorehdr location to reserve map to avoid it from
> being stomped on while booting.
>
> Signed-off-by: Hari Bathini 
> Tested-by: Pingfan Liu 

Reviewed-by: Thiago Jung Bauermann 

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH v4 09/12] ppc64/kexec_file: setup backup region for kdump kernel

2020-07-23 Thread Thiago Jung Bauermann


Hari Bathini  writes:

> Though the kdump kernel boots from the loaded address, the first 64K bytes
> of it are copied down to real 0. So, set up a backup region to copy
> the first 64K bytes of the crashed kernel, in purgatory, before booting
> into the kdump kernel. Also, update the reserve map with the backup region
> and the crashed kernel's memory to avoid the kdump kernel accidentally
> using that memory.
>
> Reported-by: kernel test robot 
> [lkp: In v1, purgatory() declaration was missing]
> Signed-off-by: Hari Bathini 

Reviewed-by: Thiago Jung Bauermann 

Just one minor comment below:

> @@ -1047,13 +1120,26 @@ int setup_new_fdt_ppc64(const struct kimage *image, 
> void *fdt,
>   goto out;
>   }
>
> - /* Ensure we don't touch crashed kernel's memory */
> - ret = fdt_add_mem_rsv(fdt, 0, crashk_res.start);
> + /*
> +  * Ensure we don't touch crashed kernel's memory except the
> +  * first 64K of RAM, which will be backed up.
> +  */
> + ret = fdt_add_mem_rsv(fdt, BACKUP_SRC_SIZE,

I know BACKUP_SRC_START is 0, but please forgive my pedantry when I say
that I think it's clearer if the start address above is changed to
BACKUP_SRC_START + BACKUP_SRC_SIZE...

> +   crashk_res.start - BACKUP_SRC_SIZE);
>   if (ret) {
>   pr_err("Error reserving crash memory: %s\n",
>  fdt_strerror(ret));
>   goto out;
>   }
> +
> + /* Ensure backup region is not used by kdump/capture kernel */
> + ret = fdt_add_mem_rsv(fdt, image->arch.backup_start,
> +   BACKUP_SRC_SIZE);
> + if (ret) {
> + pr_err("Error reserving memory for backup: %s\n",
> +fdt_strerror(ret));
> + goto out;
> + }
>   }
>
>  out:

--
Thiago Jung Bauermann
IBM Linux Technology Center
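
Spelled out, Thiago's readability suggestion amounts to the following
(BACKUP_SRC_START is 0, so the reservation itself is unchanged -- this is
just a sketch of the suggested wording, not the applied patch):

  /* Ensure we don't touch crashed kernel's memory except the backup source */
  ret = fdt_add_mem_rsv(fdt, BACKUP_SRC_START + BACKUP_SRC_SIZE,
                        crashk_res.start - BACKUP_SRC_SIZE);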


Re: [PATCH v4 06/12] ppc64/kexec_file: restrict memory usage of kdump kernel

2020-07-23 Thread Thiago Jung Bauermann


Hari Bathini  writes:

> Kdump kernel, used for capturing the kernel core image, is supposed
> to use only specific memory regions to avoid corrupting the image to
> be captured. The regions are crashkernel range - the memory reserved
> explicitly for kdump kernel, memory used for the tce-table, the OPAL
> region and RTAS region as applicable. Restrict kdump kernel memory
> to use only these regions by setting up usable-memory DT property.
> Also, tell the kdump kernel to run at the loaded address by setting
> the magic word at 0x5c.
>
> Signed-off-by: Hari Bathini 
> Tested-by: Pingfan Liu 
> ---
>
> v3 -> v4:
> * Updated get_node_path() to be an iterative function instead of a
>   recursive one.
> * Added comment explaining why low memory is added to kdump kernel's
>   usable memory ranges though it doesn't fall in crashkernel region.
> * For correctness, added fdt_add_mem_rsv() for the low memory being
>   added to kdump kernel's usable memory ranges.

Good idea.

> * Fixed prop pointer update in add_usable_mem_property() and changed
>   duple to tuple as suggested by Thiago.



> +/**
> + * get_node_pathlen - Get the full path length of the given node.
> + * @dn:   Node.
> + *
> + * Also, counts '/' at the end of the path.
> + * For example, /memory@0 will be "/memory@0/\0" => 11 bytes.

Wouldn't this function return 10 in the case of /memory@0?
Are you saying that it should count the \0 at the end too? It's not
doing that, AFAICS.

> + *
> + * Returns the string length of the node's full path.
> + */

Maybe it's me (by analogy with strlen()), but I would expect "string
length" to not include the terminating \0. I suggest renaming the
function to something like get_node_path_size() and do s/length/size/ in
the comment above if it's supposed to count the terminating \0.

> +static int get_node_pathlen(struct device_node *dn)
> +{
> + int len = 0;
> +
> + if (!dn)
> + return 0;
> +
> + while (dn) {
> + len += strlen(dn->full_name) + 1;
> + dn = dn->parent;
> + }
> +
> + return len + 1;
> +}
> +
> +/**
> + * get_node_path - Get the full path of the given node.
> + * @node:  Device node.
> + *
> + * Allocates buffer for node path. The caller must free the buffer
> + * after use.
> + *
> + * Returns buffer with path on success, NULL otherwise.
> + */
> +static char *get_node_path(struct device_node *node)
> +{
> + struct device_node *dn;
> + int len, idx, nlen;
> + char *path = NULL;
> + char end_char;
> +
> + if (!node)
> + goto err;
> +
> + /*
> +  * Get the path length first and use it to iteratively build the path
> +  * from node to root.
> +  */
> + len = get_node_pathlen(node);
> +
> + /* Allocate memory for node path */
> + path = kzalloc(ALIGN(len, 8), GFP_KERNEL);
> + if (!path)
> + goto err;
> +
> + /*
> +  * Iteratively update path from node to root by decrementing
> +  * index appropriately.
> +  *
> +  * Also, add %NUL at the end of node & '/' at the end of all its
> +  * parent nodes.
> +  */
> + dn = node;
> + path[0] = '/';
> + idx = len - 1;

Here, idx is pointing to the supposed '/' at the end of the node
path ...

> + end_char = '\0';
> + while (dn->parent) {
> + path[--idx] = end_char;

... and in the first iteration, this is writing '\0' at a place which will be
overwritten by the memcpy() below with the last character of
dn->full_name. You need to start idx with len, not len - 1.

> + end_char = '/';
> +
> + nlen = strlen(dn->full_name);
> + idx -= nlen;
> + memcpy(path + idx, dn->full_name, nlen);
> +
> + dn = dn->parent;
> + }
> +
> + return path;
> +err:
> + kfree(path);
> + return NULL;
> +}

--
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

2020-07-23 Thread Benjamin Herrenschmidt
On Thu, 2020-07-23 at 01:21 -0400, Alex Ghiti wrote:
> > works fine with huge pages, what is your problem there ? You rely on
> > punching small-page size holes in there ?
> > 
> 
> ARCH_HAS_STRICT_KERNEL_RWX prevents the use of a hugepage for the kernel 
> mapping in the direct mapping as it sets different permissions to 
> different part of the kernel (data, text..etc).

Ah ok, that can be solved in a couple of ways...

One is to use the linker script to ensure those sections are linked
HUGE_PAGE_SIZE apart and moved appropriately by early boot code. The other
is to selectively degrade just those huge pages.

I'm not familiar with the RISC-V MMU (I should probably go have a look)
but if it's a classic radix tree with huge pages at PUD/PMD level, then
you could just degrade the one(s) that cross those boundaries.

Cheers,
Ben.




Re: [PATCH v4 04/12] ppc64/kexec_file: avoid stomping memory used by special regions

2020-07-23 Thread Thiago Jung Bauermann


Hari Bathini  writes:

> The crashkernel region could overlap with special memory regions
> like opal, rtas, tce-table & such. These regions are referred to as
> exclude memory ranges. Set up these ranges during image probe in order
> to avoid them while finding the buffer for different kdump segments.
> Override arch_kexec_locate_mem_hole() to locate a memory hole taking
> these ranges into account.
>
> Signed-off-by: Hari Bathini 

Reviewed-by: Thiago Jung Bauermann 

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH v4 03/12] powerpc/kexec_file: add helper functions for getting memory ranges

2020-07-23 Thread Thiago Jung Bauermann


Hari Bathini  writes:

> In kexec case, the kernel to be loaded uses the same memory layout as
> the running kernel. So, passing on the DT of the running kernel would
> be good enough.
>
> But in case of kdump, different memory ranges are needed to manage
> loading the kdump kernel, booting into it and exporting the elfcore
> of the crashing kernel. The ranges are exclude memory ranges, usable
> memory ranges, reserved memory ranges and crash memory ranges.
>
> Exclude memory ranges specify the list of memory ranges to avoid while
> loading kdump segments. Usable memory ranges list the memory ranges
> that could be used for booting kdump kernel. Reserved memory ranges
> list the memory regions for the loading kernel's reserve map. Crash
> memory ranges list the memory ranges to be exported as the crashing
> kernel's elfcore.
>
> Add helper functions for setting up the above mentioned memory ranges.
> This helpers facilitate in understanding the subsequent changes better
> and make it easy to setup the different memory ranges listed above, as
> and when appropriate.
>
> Signed-off-by: Hari Bathini 
> Tested-by: Pingfan Liu 

Just one comment below, but regardless:

Reviewed-by: Thiago Jung Bauermann 

> +/**
> + * add_htab_mem_range - Adds htab range to the given memory ranges list,
> + *  if it exists
> + * @mem_ranges: Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_htab_mem_range(struct crash_mem **mem_ranges)
> +{
> + if (!htab_address)
> + return 0;
> +
> + return add_mem_range(mem_ranges, __pa(htab_address), htab_size_bytes);
> +}

I believe you need to surround this function with `#ifdef
CONFIG_PPC_BOOK3S_64` and `#endif` to match what is done in
.
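
For instance, a minimal sketch of the suggested guard around the definition
(illustrative only; a matching stub would be needed wherever the helper is
referenced on configs that compile it out):

#ifdef CONFIG_PPC_BOOK3S_64
int add_htab_mem_range(struct crash_mem **mem_ranges)
{
	if (!htab_address)
		return 0;

	return add_mem_range(mem_ranges, __pa(htab_address), htab_size_bytes);
}
#endif /* CONFIG_PPC_BOOK3S_64 */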

--
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR

2020-07-23 Thread Waiman Long

On 7/23/20 3:58 PM, pet...@infradead.org wrote:

On Thu, Jul 23, 2020 at 03:04:13PM -0400, Waiman Long wrote:

On 7/23/20 2:47 PM, pet...@infradead.org wrote:

On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:

BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
I will have to update the patch to fix the reported 0-day test problem, but
I want to collect other feedback before sending out v3.

I want to say I hate it all, it adds instructions to a path we spend an
awful lot of time optimizing without really getting anything back for
it.

It does add some extra instructions that may slow it down slightly, but I
don't agree that it gives nothing back. The cpu lock holder information can
be useful in analyzing crash dumps and in some debugging situations. I think
it can be useful in RHEL for this reason. How about an x86 config option to
allow distros to decide if they want to have it enabled? I will make sure
that it will have no performance degradation if the option is not enabled.

Config knobs suck too; they create a maintenance burden (we get to make
sure all the permutations works/build/etc..) and effectively nobody uses
them, since world+dog uses what distros pick.

Anyway, instead of adding a second per-cpu variable, can you see how
horrible something like this is:

unsigned char adds(unsigned char var, unsigned char val)
{
unsigned short sat = 0xff, tmp = var;

asm ("addb %[val], %b[var];"
 "cmovc%[sat], %[var];"
 : [var] "+r" (tmp)
 : [val] "ir" (val), [sat] "r" (sat)
 );

return tmp;
}

Another thing to try is, instead of threading that lockval throughout
the thing, simply:

#define _Q_LOCKED_VAL   this_cpu_read_stable(cpu_sat)

or combined with the above

#define _Q_LOCKED_VAL   adds(this_cpu_read_stable(cpu_number), 2)

and see if the compiler really makes a mess of things.


Thanks for the suggestion. I will try that out.

Cheers,
Longman



Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR

2020-07-23 Thread Segher Boessenkool
On Thu, Jul 23, 2020 at 09:58:55PM +0200, pet...@infradead.org wrote:
>   asm ("addb  %[val], %b[var];"
>"cmovc %[sat], %[var];"
>: [var] "+r" (tmp)
>: [val] "ir" (val), [sat] "r" (sat)
>);

"var" (operand 0) needs an earlyclobber ("sat" is read after "var" is
written for the first time).
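
For example, an untested sketch of one way to apply that, making the output
operand earlyclobber so "sat" cannot share the register that "var" is written
into (x86-only, otherwise the same code as above):

unsigned char adds(unsigned char var, unsigned char val)
{
	unsigned short sat = 0xff, tmp = var;

	asm ("addb %[val], %b[var];"
	     "cmovc %[sat], %[var];"
	     : [var] "+&r" (tmp)
	     : [val] "ir" (val), [sat] "r" (sat)
	     );

	return tmp;
}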


Segher


Re: [PATCH] ASoC: fsl: Replace HTTP links with HTTPS ones

2020-07-23 Thread Mark Brown
On Sat, 18 Jul 2020 13:12:09 +0200, Alexander A. Klimov wrote:
> Rationale:
> Reduces attack surface on kernel devs opening the links for MITM
> as HTTPS traffic is much harder to manipulate.
> 
> Deterministic algorithm:
> For each file:
>   If not .svg:
> For each line:
>   If doesn't contain `\bxmlns\b`:
> For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
> If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
> If both the HTTP and HTTPS versions
> return 200 OK and serve the same content:
>   Replace HTTP with HTTPS.

Applied to

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next

Thanks!

[1/1] ASoC: fsl: Replace HTTP links with HTTPS ones
  commit: 1ce8f643ed875d754ff09bf2096dfac3b905ab80

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark


[PATCH v5 7/7] KVM: PPC: Book3S HV: rework secure mem slot dropping

2020-07-23 Thread Ram Pai
From: Laurent Dufour 

When a secure memslot is dropped, all the pages backed in the secure
device (aka really backed by secure memory by the Ultravisor)
should be paged out to a normal page. Previously, this was
achieved by triggering the page fault mechanism which calls
kvmppc_svm_page_out() on each page.

This can't work when hot unplugging a memory slot because the memory
slot is flagged as invalid and gfn_to_pfn() is then not trying to access
the page, so the page fault mechanism is not triggered.

Since the final goal is to make a call to kvmppc_svm_page_out() it seems
simpler to call directly instead of triggering such a mechanism. This
way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a
memslot.

Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock,
the call to __kvmppc_svm_page_out() is made.  As
__kvmppc_svm_page_out needs the vma pointer to migrate the pages,
the VMA is fetched in a lazy way, to not trigger find_vma() all
the time. In addition, the mmap_sem is held in read mode during
that time, not in write mode since the virtual memory layout is not
impacted, and kvm->arch.uvmem_lock prevents concurrent operation
on the secure device.

Cc: Ram Pai 
Cc: Bharata B Rao 
Cc: Paul Mackerras 
Signed-off-by: Ram Pai 
[modified the changelog description]
Signed-off-by: Laurent Dufour 
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 54 ++
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index c772e92..daffa6e 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -632,35 +632,55 @@ static inline int kvmppc_svm_page_out(struct 
vm_area_struct *vma,
  * fault on them, do fault time migration to replace the device PTEs in
  * QEMU page table with normal PTEs from newly allocated pages.
  */
-void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
+void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
 struct kvm *kvm, bool skip_page_out)
 {
int i;
struct kvmppc_uvmem_page_pvt *pvt;
-   unsigned long pfn, uvmem_pfn;
-   unsigned long gfn = free->base_gfn;
+   struct page *uvmem_page;
+   struct vm_area_struct *vma = NULL;
+   unsigned long uvmem_pfn, gfn;
+   unsigned long addr, end;
+
+   mmap_read_lock(kvm->mm);
+
+   addr = slot->userspace_addr;
+   end = addr + (slot->npages * PAGE_SIZE);
 
-   for (i = free->npages; i; --i, ++gfn) {
-   struct page *uvmem_page;
+   gfn = slot->base_gfn;
+   for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) {
+
+   /* Fetch the VMA if addr is not in the latest fetched one */
+   if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) {
+   vma = find_vma_intersection(kvm->mm, addr, end);
+   if (!vma ||
+   vma->vm_start > addr || vma->vm_end < end) {
+   pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
+   break;
+   }
+   }
 
	mutex_lock(&kvm->arch.uvmem_lock);
-   if (!kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+
+   if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+   uvmem_page = pfn_to_page(uvmem_pfn);
+   pvt = uvmem_page->zone_device_data;
+   pvt->skip_page_out = skip_page_out;
+   pvt->remove_gfn = true;
+
+   if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE,
+ PAGE_SHIFT, kvm, pvt->gpa))
+   pr_err("Can't page out gpa:0x%lx addr:0x%lx\n",
+  pvt->gpa, addr);
+   } else {
+   /* Remove the shared flag if any */
kvmppc_gfn_remove(gfn, kvm);
-   mutex_unlock(&kvm->arch.uvmem_lock);
-   continue;
}
 
-   uvmem_page = pfn_to_page(uvmem_pfn);
-   pvt = uvmem_page->zone_device_data;
-   pvt->skip_page_out = skip_page_out;
-   pvt->remove_gfn = true;
	mutex_unlock(&kvm->arch.uvmem_lock);
-
-   pfn = gfn_to_pfn(kvm, gfn);
-   if (is_error_noslot_pfn(pfn))
-   continue;
-   kvm_release_pfn_clean(pfn);
}
+
+   mmap_read_unlock(kvm->mm);
 }
 
 unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm)
-- 
1.8.3.1



[PATCH v5 6/7] KVM: PPC: Book3S HV: move kvmppc_svm_page_out up

2020-07-23 Thread Ram Pai
From: Laurent Dufour 

kvmppc_svm_page_out() will need to be called by kvmppc_uvmem_drop_pages()
so move it upper in this file.

Furthermore it will be interesting to call this function when already
holding the kvm->arch.uvmem_lock, so prefix the original function with __
and remove the locking in it, and introduce a wrapper which calls that
function with the lock held.

There is no functional change.

Cc: Ram Pai 
Cc: Bharata B Rao 
Cc: Paul Mackerras 
Signed-off-by: Ram Pai 
Signed-off-by: Laurent Dufour 
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 166 -
 1 file changed, 90 insertions(+), 76 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index fd9ff6e..c772e92 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -535,6 +535,96 @@ unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
 }
 
 /*
+ * Provision a new page on HV side and copy over the contents
+ * from secure memory using UV_PAGE_OUT uvcall.
+ * Caller must hold kvm->arch.uvmem_lock.
+ */
+static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
+   unsigned long start,
+   unsigned long end, unsigned long page_shift,
+   struct kvm *kvm, unsigned long gpa)
+{
+   unsigned long src_pfn, dst_pfn = 0;
+   struct migrate_vma mig;
+   struct page *dpage, *spage;
+   struct kvmppc_uvmem_page_pvt *pvt;
+   unsigned long pfn;
+   int ret = U_SUCCESS;
+
+   memset(&mig, 0, sizeof(mig));
+   mig.vma = vma;
+   mig.start = start;
+   mig.end = end;
+   mig.src = &src_pfn;
+   mig.dst = &dst_pfn;
+   mig.src_owner = &kvmppc_uvmem_pgmap;
+
+   /* The requested page is already paged-out, nothing to do */
+   if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
+   return ret;
+
+   ret = migrate_vma_setup(&mig);
+   if (ret)
+   return -1;
+
+   spage = migrate_pfn_to_page(*mig.src);
+   if (!spage || !(*mig.src & MIGRATE_PFN_MIGRATE))
+   goto out_finalize;
+
+   if (!is_zone_device_page(spage))
+   goto out_finalize;
+
+   dpage = alloc_page_vma(GFP_HIGHUSER, vma, start);
+   if (!dpage) {
+   ret = -1;
+   goto out_finalize;
+   }
+
+   lock_page(dpage);
+   pvt = spage->zone_device_data;
+   pfn = page_to_pfn(dpage);
+
+   /*
+* This function is used in two cases:
+* - When HV touches a secure page, for which we do UV_PAGE_OUT
+* - When a secure page is converted to shared page, we *get*
+*   the page to essentially unmap the device page. In this
+*   case we skip page-out.
+*/
+   if (!pvt->skip_page_out)
+   ret = uv_page_out(kvm->arch.lpid, pfn << page_shift,
+ gpa, 0, page_shift);
+
+   if (ret == U_SUCCESS)
+   *mig.dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
+   else {
+   unlock_page(dpage);
+   __free_page(dpage);
+   goto out_finalize;
+   }
+
+   migrate_vma_pages(&mig);
+
+out_finalize:
+   migrate_vma_finalize(&mig);
+   return ret;
+}
+
+static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long page_shift,
+ struct kvm *kvm, unsigned long gpa)
+{
+   int ret;
+
+   mutex_lock(&kvm->arch.uvmem_lock);
+   ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa);
+   mutex_unlock(&kvm->arch.uvmem_lock);
+
+   return ret;
+}
+
+/*
  * Drop device pages that we maintain for the secure guest
  *
  * We first mark the pages to be skipped from UV_PAGE_OUT when there
@@ -866,82 +956,6 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, 
unsigned long gpa,
return ret;
 }
 
-/*
- * Provision a new page on HV side and copy over the contents
- * from secure memory using UV_PAGE_OUT uvcall.
- */
-static int kvmppc_svm_page_out(struct vm_area_struct *vma,
-   unsigned long start,
-   unsigned long end, unsigned long page_shift,
-   struct kvm *kvm, unsigned long gpa)
-{
-   unsigned long src_pfn, dst_pfn = 0;
-   struct migrate_vma mig;
-   struct page *dpage, *spage;
-   struct kvmppc_uvmem_page_pvt *pvt;
-   unsigned long pfn;
-   int ret = U_SUCCESS;
-
-   memset(&mig, 0, sizeof(mig));
-   mig.vma = vma;
-   mig.start = start;
-   mig.end = end;
-   mig.src = &src_pfn;
-   mig.dst = &dst_pfn;
-   mig.src_owner = &kvmppc_uvmem_pgmap;
-
-   mutex_lock(&kvm->arch.uvmem_lock);
-   /* The requested page is already paged-out, nothing to do */
-   if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL))
-   goto out;
-
-   ret = migrate_vma_setup(&mig);
-   if (ret)
-   

[PATCH v5 5/7] KVM: PPC: Book3S HV: migrate hot plugged memory

2020-07-23 Thread Ram Pai
From: Laurent Dufour 

When a memory slot is hot plugged to a SVM, PFNs associated with the
GFNs in that slot must be migrated to the secure-PFNs, aka device-PFNs.

Call kvmppc_uv_migrate_mem_slot() to accomplish this.
Disable page-merge for all pages in the memory slot.

Signed-off-by: Ram Pai 
[rearranged the code, and modified the commit log]
Signed-off-by: Laurent Dufour 
---
 arch/powerpc/include/asm/kvm_book3s_uvmem.h | 14 ++
 arch/powerpc/kvm/book3s_hv.c| 10 ++
 arch/powerpc/kvm/book3s_hv_uvmem.c  | 23 +++
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h 
b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
index f229ab5..59c17ca 100644
--- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
+++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
@@ -25,6 +25,10 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot 
*free,
 struct kvm *kvm, bool skip_page_out);
 int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
+int kvmppc_uvmem_memslot_create(struct kvm *kvm,
+   const struct kvm_memory_slot *new);
+void kvmppc_uvmem_memslot_delete(struct kvm *kvm,
+   const struct kvm_memory_slot *old);
 #else
 static inline int kvmppc_uvmem_init(void)
 {
@@ -84,5 +88,15 @@ static inline int kvmppc_send_page_to_uv(struct kvm *kvm, 
unsigned long gfn)
 static inline void
 kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
struct kvm *kvm, bool skip_page_out) { }
+
+static inline int  kvmppc_uvmem_memslot_create(struct kvm *kvm,
+   const struct kvm_memory_slot *new)
+{
+   return H_UNSUPPORTED;
+}
+
+static inline void  kvmppc_uvmem_memslot_delete(struct kvm *kvm,
+   const struct kvm_memory_slot *old) { }
+
 #endif /* CONFIG_PPC_UV */
 #endif /* __ASM_KVM_BOOK3S_UVMEM_H__ */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d331b46..b1485ca 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4515,16 +4515,10 @@ static void kvmppc_core_commit_memory_region_hv(struct 
kvm *kvm,
 
switch (change) {
case KVM_MR_CREATE:
-   if (kvmppc_uvmem_slot_init(kvm, new))
-   return;
-   uv_register_mem_slot(kvm->arch.lpid,
-new->base_gfn << PAGE_SHIFT,
-new->npages * PAGE_SIZE,
-0, new->id);
+   kvmppc_uvmem_memslot_create(kvm, new);
break;
case KVM_MR_DELETE:
-   uv_unregister_mem_slot(kvm->arch.lpid, old->id);
-   kvmppc_uvmem_slot_free(kvm, old);
+   kvmppc_uvmem_memslot_delete(kvm, old);
break;
default:
/* TODO: Handle KVM_MR_MOVE */
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 43b3566..fd9ff6e 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -418,7 +418,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm,
return ret;
 }
 
-static void kvmppc_uvmem_memslot_delete(struct kvm *kvm,
+static void __kvmppc_uvmem_memslot_delete(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
 {
uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
@@ -426,7 +426,7 @@ static void kvmppc_uvmem_memslot_delete(struct kvm *kvm,
kvmppc_memslot_page_merge(kvm, memslot, true);
 }
 
-static int kvmppc_uvmem_memslot_create(struct kvm *kvm,
+static int __kvmppc_uvmem_memslot_create(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
 {
int ret = H_PARAMETER;
@@ -478,7 +478,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
/* register the memslot */
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
-   ret = kvmppc_uvmem_memslot_create(kvm, memslot);
+   ret = __kvmppc_uvmem_memslot_create(kvm, memslot);
if (ret)
break;
}
@@ -488,7 +488,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
kvm_for_each_memslot(m, slots) {
if (m == memslot)
break;
-   kvmppc_uvmem_memslot_delete(kvm, memslot);
+   __kvmppc_uvmem_memslot_delete(kvm, memslot);
}
}
 
@@ -1057,6 +1057,21 @@ int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned 
long gfn)
return (ret == U_SUCCESS) ? RESUME_GUEST : -EFAULT;
 }
 
+int kvmppc_uvmem_memslot_create(struct kvm *kvm, const struct kvm_memory_slot 
*new)
+{
+   int ret = __kvmppc_uvmem_memslot_create(kvm, new);
+
+   if (!ret)
+   ret = kvmppc_uv_migrate_mem_slot(kvm, 

[PATCH v5 4/7] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.

2020-07-23 Thread Ram Pai
The Ultravisor is expected to explicitly call H_SVM_PAGE_IN for all the
pages of the SVM before calling H_SVM_INIT_DONE. This causes a huge
delay in transitioning the VM to SVM. The Ultravisor is only interested
in the pages that contain the kernel, initrd and other important data
structures. The rest contain throw-away content.

However if not all pages are requested by the Ultravisor, the Hypervisor
continues to consider the GFNs corresponding to the non-requested pages
as normal GFNs. This can lead to data-corruption and undefined behavior.

In H_SVM_INIT_DONE handler, move all the PFNs associated with the SVM's
GFNs to secure-PFNs. Skip the GFNs that are already Paged-in or Shared
or Paged-in followed by a Paged-out.

Cc: Paul Mackerras 
Cc: Benjamin Herrenschmidt 
Cc: Michael Ellerman 
Cc: Bharata B Rao 
Cc: Aneesh Kumar K.V 
Cc: Sukadev Bhattiprolu 
Cc: Laurent Dufour 
Cc: Thiago Jung Bauermann 
Cc: David Gibson 
Cc: Claudio Carvalho 
Cc: kvm-...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ram Pai 
---
 Documentation/powerpc/ultravisor.rst|   2 +
 arch/powerpc/include/asm/kvm_book3s_uvmem.h |   2 +
 arch/powerpc/kvm/book3s_hv_uvmem.c  | 136 +---
 3 files changed, 127 insertions(+), 13 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst 
b/Documentation/powerpc/ultravisor.rst
index a1c8c37..ba6b1bf 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -934,6 +934,8 @@ Return values
* H_UNSUPPORTED if called from the wrong context (e.g.
from an SVM or before an H_SVM_INIT_START
hypercall).
+   * H_STATE   if the hypervisor could not successfully
+transition the VM to Secure VM.
 
 Description
 ~~~
diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h 
b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
index 9cb7d8b..f229ab5 100644
--- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
+++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
@@ -23,6 +23,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
 unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm);
 void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 struct kvm *kvm, bool skip_page_out);
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+   const struct kvm_memory_slot *memslot);
 #else
 static inline int kvmppc_uvmem_init(void)
 {
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 1b2b029..43b3566 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -93,6 +93,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static struct dev_pagemap kvmppc_uvmem_pgmap;
 static unsigned long *kvmppc_uvmem_bitmap;
@@ -348,6 +349,41 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, 
struct kvm *kvm,
return false;
 }
 
+/*
+ * starting from *gfn search for the next available GFN that is not yet
+ * transitioned to a secure GFN.  return the value of that GFN in *gfn.  If a
+ * GFN is found, return true, else return false
+ *
+ * Must be called with kvm->arch.uvmem_lock  held.
+ */
+static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot 
*memslot,
+   struct kvm *kvm, unsigned long *gfn)
+{
+   struct kvmppc_uvmem_slot *p;
+   bool ret = false;
+   unsigned long i;
+
+   list_for_each_entry(p, &kvm->arch.uvmem_pfns, list)
+   if (*gfn >= p->base_pfn && *gfn < p->base_pfn + p->nr_pfns)
+   break;
+   if (!p)
+   return ret;
+   /*
+* The code below assumes, one to one correspondence between
+* kvmppc_uvmem_slot and memslot.
+*/
+   for (i = *gfn; i < p->base_pfn + p->nr_pfns; i++) {
+   unsigned long index = i - p->base_pfn;
+
+   if (!(p->pfns[index] & KVMPPC_GFN_FLAG_MASK)) {
+   *gfn = i;
+   ret = true;
+   break;
+   }
+   }
+   return ret;
+}
+
 static int kvmppc_memslot_page_merge(struct kvm *kvm,
const struct kvm_memory_slot *memslot, bool merge)
 {
@@ -462,12 +498,40 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 
 unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
 {
+   struct kvm_memslots *slots;
+   struct kvm_memory_slot *memslot;
+   int srcu_idx;
+   long ret = H_SUCCESS;
+
if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
return H_UNSUPPORTED;
 
+   /* migrate any unmoved normal pfn to device pfns*/
+   srcu_idx = srcu_read_lock(&kvm->srcu);
+   slots = kvm_memslots(kvm);
+   kvm_for_each_memslot(memslot, slots) {
+   ret = kvmppc_uv_migrate_mem_slot(kvm, memslot);
+   if (ret) {
+   /*
+   

[PATCH v5 3/7] KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs

2020-07-23 Thread Ram Pai
During the life of SVM, its GFNs transition through normal, secure and
shared states. Since the kernel does not track GFNs that are shared, it
is not possible to disambiguate a shared GFN from a GFN whose PFN has
not yet been migrated to a secure-PFN. Also it is not possible to
disambiguate a secure-GFN from a GFN whose PFN has been paged-out from
the ultravisor.

The ability to identify the state of a GFN is needed to skip migration
of its PFN to secure-PFN during ESM transition.

The code is re-organized to track the states of a GFN as explained
below.


 1. States of a GFN
---
 The GFN can be in one of the following states.

 (a) Secure - The GFN is secure. The GFN is associated with
a Secure VM, the contents of the GFN is not accessible
to the Hypervisor.  This GFN can be backed by a secure-PFN,
or can be backed by a normal-PFN with contents encrypted.
The former is true when the GFN is paged-in into the
ultravisor. The latter is true when the GFN is paged-out
of the ultravisor.

 (b) Shared - The GFN is shared. The GFN is associated with
a secure VM. The contents of the GFN is accessible to
Hypervisor. This GFN is backed by a normal-PFN and its
content is un-encrypted.

 (c) Normal - The GFN is normal. The GFN is associated with
a normal VM. The contents of the GFN is accessible to
the Hypervisor. Its content is never encrypted.

 2. States of a VM.
---

 (a) Normal VM:  A VM whose contents are always accessible to
the hypervisor.  All its GFNs are normal-GFNs.

 (b) Secure VM: A VM whose contents are not accessible to the
hypervisor without the VM's consent.  Its GFNs are
either Shared-GFN or Secure-GFNs.

 (c) Transient VM: A Normal VM that is transitioning to secure VM.
The transition starts on successful return of
H_SVM_INIT_START, and ends on successful return
of H_SVM_INIT_DONE. This transient VM, can have GFNs
in any of the three states; i.e Secure-GFN, Shared-GFN,
and Normal-GFN. The VM never executes in this state
in supervisor-mode.

 3. Memory slot State.
--
The state of a memory slot mirrors the state of the
VM the memory slot is associated with.

 4. VM State transition.


  A VM always starts in Normal Mode.

  H_SVM_INIT_START moves the VM into transient state. During this
  time the Ultravisor may request some of its GFNs to be shared or
  secured. So its GFNs can be in one of the three GFN states.

  H_SVM_INIT_DONE moves the VM entirely from transient state to
  secure-state. At this point any left-over normal-GFNs are
  transitioned to Secure-GFN.

  H_SVM_INIT_ABORT moves the transient VM back to normal VM.
  All its GFNs are moved to Normal-GFNs.

  UV_TERMINATE transitions the secure-VM back to normal-VM. All
  the secure-GFN and shared-GFNs are tranistioned to normal-GFN
  Note: The contents of the normal-GFN is undefined at this point.

 5. GFN state implementation:
-

 Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
 when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
 set, and contains the value of the secure-PFN.
 It is associated with a normal-PFN; also called mem_pfn, when
 the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
 The value of the normal-PFN is not tracked.

 Shared GFN is associated with a normal-PFN. Its pfn[] has
 KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN
 is not tracked.

 Normal GFN is associated with normal-PFN. Its pfn[] has
 no flag set. The value of the normal-PFN is not tracked.

 6. Life cycle of a GFN

 --
 || Share  |  Unshare | SVM   |H_SVM_INIT_DONE|
 ||operation   |operation | abort/|   |
 |||  | terminate |   |
 -
 |||  |   |   |
 | Secure | Shared | Secure   |Normal |Secure |
 |||  |   |   |
 | Shared | Shared | Secure   |Normal |Shared |
 |||  |   |   |
 | Normal | Shared | Secure   |Normal |Secure |
 --
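
For illustration only, the table reads as a small transition function (the
state names here are invented for the sketch and are not the patch's
internal flag names):

enum gfn_state { GFN_NORMAL, GFN_SHARED, GFN_SECURE };

/* Columns of the table above, applied to a current state. */
static enum gfn_state gfn_share(enum gfn_state s)     { (void)s; return GFN_SHARED; }
static enum gfn_state gfn_unshare(enum gfn_state s)   { (void)s; return GFN_SECURE; }
static enum gfn_state gfn_abort(enum gfn_state s)     { (void)s; return GFN_NORMAL; }
static enum gfn_state gfn_init_done(enum gfn_state s)
{
	return (s == GFN_SHARED) ? GFN_SHARED : GFN_SECURE;
}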

 7. Life cycle of a VM

 
 | |  start|  H_SVM_  |H_SVM_   |H_SVM_ |UV_SVM_|
 | |  VM   |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE  |
 | |   |  | |   |   |
 - 

[PATCH v5 2/7] KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START

2020-07-23 Thread Ram Pai
Page-merging of pages in memory-slots associated with a Secure VM,
is disabled in H_SVM_PAGE_IN handler.

This operation should have been done much earlier; the moment the VM
is initiated for secure-transition. Delaying this operation increases
the probability for those pages to acquire new references, making it
impossible to migrate those pages in the H_SVM_PAGE_IN handler.

Disable page-merging in H_SVM_INIT_START handling.

Reviewed-by: Bharata B Rao 
Signed-off-by: Ram Pai 
---
 Documentation/powerpc/ultravisor.rst |   1 +
 arch/powerpc/kvm/book3s_hv_uvmem.c   | 123 +--
 2 files changed, 89 insertions(+), 35 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst 
b/Documentation/powerpc/ultravisor.rst
index df136c8..a1c8c37 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -895,6 +895,7 @@ Return values
 One of the following values:
 
* H_SUCCESS  on success.
+* H_STATEif the VM is not in a position to switch to secure.
 
 Description
 ~~~
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index e6f76bc..533b608 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -211,10 +211,79 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, 
struct kvm *kvm,
return false;
 }
 
+static int kvmppc_memslot_page_merge(struct kvm *kvm,
+   const struct kvm_memory_slot *memslot, bool merge)
+{
+   unsigned long gfn = memslot->base_gfn;
+   unsigned long end, start = gfn_to_hva(kvm, gfn);
+   int ret = 0;
+   struct vm_area_struct *vma;
+   int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE;
+
+   if (kvm_is_error_hva(start))
+   return H_STATE;
+
+   end = start + (memslot->npages << PAGE_SHIFT);
+
+   mmap_write_lock(kvm->mm);
+   do {
+   vma = find_vma_intersection(kvm->mm, start, end);
+   if (!vma) {
+   ret = H_STATE;
+   break;
+   }
+   ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+ merge_flag, &vma->vm_flags);
+   if (ret) {
+   ret = H_STATE;
+   break;
+   }
+   start = vma->vm_end;
+   } while (end > vma->vm_end);
+
+   mmap_write_unlock(kvm->mm);
+   return ret;
+}
+
+static void kvmppc_uvmem_memslot_delete(struct kvm *kvm,
+   const struct kvm_memory_slot *memslot)
+{
+   uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
+   kvmppc_uvmem_slot_free(kvm, memslot);
+   kvmppc_memslot_page_merge(kvm, memslot, true);
+}
+
+static int kvmppc_uvmem_memslot_create(struct kvm *kvm,
+   const struct kvm_memory_slot *memslot)
+{
+   int ret = H_PARAMETER;
+
+   if (kvmppc_memslot_page_merge(kvm, memslot, false))
+   return ret;
+
+   if (kvmppc_uvmem_slot_init(kvm, memslot))
+   goto out1;
+
+   ret = uv_register_mem_slot(kvm->arch.lpid,
+  memslot->base_gfn << PAGE_SHIFT,
+  memslot->npages * PAGE_SIZE,
+  0, memslot->id);
+   if (ret < 0) {
+   ret = H_PARAMETER;
+   goto out;
+   }
+   return 0;
+out:
+   kvmppc_uvmem_slot_free(kvm, memslot);
+out1:
+   kvmppc_memslot_page_merge(kvm, memslot, true);
+   return ret;
+}
+
 unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 {
struct kvm_memslots *slots;
-   struct kvm_memory_slot *memslot;
+   struct kvm_memory_slot *memslot, *m;
int ret = H_SUCCESS;
int srcu_idx;
 
@@ -232,23 +301,24 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
return H_AUTHORITY;
 
	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+   /* register the memslot */
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
-   if (kvmppc_uvmem_slot_init(kvm, memslot)) {
-   ret = H_PARAMETER;
-   goto out;
-   }
-   ret = uv_register_mem_slot(kvm->arch.lpid,
-  memslot->base_gfn << PAGE_SHIFT,
-  memslot->npages * PAGE_SIZE,
-  0, memslot->id);
-   if (ret < 0) {
-   kvmppc_uvmem_slot_free(kvm, memslot);
-   ret = H_PARAMETER;
-   goto out;
+   ret = kvmppc_uvmem_memslot_create(kvm, memslot);
+   if (ret)
+   break;
+   }
+
+   if (ret) {
+   slots = kvm_memslots(kvm);
+   kvm_for_each_memslot(m, slots) {
+   if (m == memslot)
+   

[PATCH v5 1/7] KVM: PPC: Book3S HV: Fix function definition in book3s_hv_uvmem.c

2020-07-23 Thread Ram Pai
Without this fix, git is confused. It generates wrong
function context for code changes in subsequent patches.
Weird, but true.

Cc: Paul Mackerras 
Cc: Benjamin Herrenschmidt 
Cc: Michael Ellerman 
Cc: Bharata B Rao 
Cc: Aneesh Kumar K.V 
Cc: Sukadev Bhattiprolu 
Cc: Laurent Dufour 
Cc: Thiago Jung Bauermann 
Cc: David Gibson 
Cc: Claudio Carvalho 
Cc: kvm-...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ram Pai 
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 09d8119..e6f76bc 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -382,8 +382,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long 
gpa, struct kvm *kvm)
  * Alloc a PFN from private device memory pool and copy page from normal
  * memory to secure memory using UV_PAGE_IN uvcall.
  */
-static int
-kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
+static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
   unsigned long end, unsigned long gpa, struct kvm *kvm,
   unsigned long page_shift, bool *downgrade)
 {
@@ -450,8 +449,8 @@ static struct page *kvmppc_uvmem_get_page(unsigned long 
gpa, struct kvm *kvm)
  * In the former case, uses dev_pagemap_ops.migrate_to_ram handler
  * to unmap the device page from QEMU's page tables.
  */
-static unsigned long
-kvmppc_share_page(struct kvm *kvm, unsigned long gpa, unsigned long page_shift)
+static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
+   unsigned long page_shift)
 {
 
int ret = H_PARAMETER;
@@ -500,9 +499,9 @@ static struct page *kvmppc_uvmem_get_page(unsigned long 
gpa, struct kvm *kvm)
  * H_PAGE_IN_SHARED flag makes the page shared which means that the same
  * memory in is visible from both UV and HV.
  */
-unsigned long
-kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
-unsigned long flags, unsigned long page_shift)
+unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
+   unsigned long flags,
+   unsigned long page_shift)
 {
bool downgrade = false;
unsigned long start, end;
@@ -559,10 +558,10 @@ static struct page *kvmppc_uvmem_get_page(unsigned long 
gpa, struct kvm *kvm)
  * Provision a new page on HV side and copy over the contents
  * from secure memory using UV_PAGE_OUT uvcall.
  */
-static int
-kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start,
-   unsigned long end, unsigned long page_shift,
-   struct kvm *kvm, unsigned long gpa)
+static int kvmppc_svm_page_out(struct vm_area_struct *vma,
+   unsigned long start,
+   unsigned long end, unsigned long page_shift,
+   struct kvm *kvm, unsigned long gpa)
 {
unsigned long src_pfn, dst_pfn = 0;
struct migrate_vma mig;
-- 
1.8.3.1



[PATCH v5 0/7] Migrate non-migrated pages of a SVM.

2020-07-23 Thread Ram Pai
The time to switch a VM to a Secure-VM increases with the size of the VM.
A 100GB VM takes about 7 minutes. This is unacceptable.  This linear
increase is caused by a suboptimal behavior by the Ultravisor and the
Hypervisor.  The Ultravisor unnecessarily migrates all the GFN of the
VM from normal-memory to secure-memory. It has to just migrate the
necessary and sufficient GFNs.

However when the optimization is incorporated in the Ultravisor, the
Hypervisor starts misbehaving. The Hypervisor has an inbuilt assumption
that the Ultravisor will explicitly request to migrate each and every
GFN of the VM. If only necessary and sufficient GFNs are requested for
migration, the Hypervisor continues to manage the remaining GFNs as
normal GFNs. This leads to memory corruption; manifested
consistently when the SVM reboots.

The same is true, when a memory slot is hotplugged into a SVM. The
Hypervisor expects the ultravisor to request migration of all GFNs to
secure-GFN.  But the hypervisor cannot handle any H_SVM_PAGE_IN
requests from the Ultravisor, done in the context of
UV_REGISTER_MEM_SLOT ucall.  This problem manifests as random errors
in the SVM, when a memory-slot is hotplugged.

This patch series automatically migrates the non-migrated pages of a
SVM, and thus solves the problem.

Testing: Passed rigorous testing using various sized SVMs.

Changelog:

v5:  .  This patch series includes Laurent's fix for memory hotplug/unplug
  . drop pages first and then delete the memslot. Otherwise
the memslot does not get cleanly deleted, causing
problems during reboot.
  . reproducible with the following set of commands
 . device_add pc-dimm,id=dimm1,memdev=mem1
 . device_del dimm1
 . device_add pc-dimm,id=dimm1,memdev=mem1
Further incorporates comments from Bharata:
. fix for off-by-one while disabling migration.
. code-reorganized to maximize sharing in init_start path
and in memory-hotplug path
. locking adjustments in mass-page migration during H_SVM_INIT_DONE.
. improved recovery on error paths.
. additional comments in the code for better understanding.
. removed the retry-on-migration-failure code.
. re-added the initial patch that adjusts some prototypes to overcome
   a git problem, where it messes up the code context. Had
accidentally dropped the patch in the last version.

v4:  .  Incorporated Bharata's comments:
- Optimization -- replace write mmap semaphore with read mmap semaphore.
- disable page-merge during memory hotplug.
- rearranged the patches. consolidated the page-migration-retry logic
in a single patch.

v3: . Optimized the page-migration retry-logic. 
. Relax and relinquish the cpu regularly while bulk migrating
the non-migrated pages. This issue was causing soft-lockups.
Fixed it.
. Added a new patch, to retry page-migration a couple of times
before returning H_BUSY in H_SVM_PAGE_IN. This issue was
seen a few times in a 24hour continuous reboot test of the SVMs.

v2: . fixed a bug observed by Laurent. The state of the GFN's associated
with Secure-VMs were not reset during memslot flush.
. Re-organized the code, for easier review.
. Better description of the patch series.

v1: . fixed a bug observed by Bharata. Pages that were paged-in and later
paged-out must also be skipped from migration during H_SVM_INIT_DONE.

Laurent Dufour (3):
  KVM: PPC: Book3S HV: migrate hot plugged memory
  KVM: PPC: Book3S HV: move kvmppc_svm_page_out up
  KVM: PPC: Book3S HV: rework secure mem slot dropping

Ram Pai (4):
  KVM: PPC: Book3S HV: Fix function definition in book3s_hv_uvmem.c
  KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START
  KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
  KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs
to secure-GFNs.

 Documentation/powerpc/ultravisor.rst|   3 +
 arch/powerpc/include/asm/kvm_book3s_uvmem.h |  16 +
 arch/powerpc/kvm/book3s_hv.c|  10 +-
 arch/powerpc/kvm/book3s_hv_uvmem.c  | 690 +---
 4 files changed, 548 insertions(+), 171 deletions(-)

-- 
1.8.3.1



Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR

2020-07-23 Thread peterz
On Thu, Jul 23, 2020 at 03:04:13PM -0400, Waiman Long wrote:
> On 7/23/20 2:47 PM, pet...@infradead.org wrote:
> > On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> > > BTW, do you have any comment on my v2 lock holder cpu info qspinlock 
> > > patch?
> > > I will have to update the patch to fix the reported 0-day test problem, 
> > > but
> > > I want to collect other feedback before sending out v3.
> > I want to say I hate it all, it adds instructions to a path we spend an
> > awful lot of time optimizing without really getting anything back for
> > it.
> 
> It does add some extra instructions that may slow it down slightly, but I
> don't agree that it gives nothing back. The cpu lock holder information can
> be useful in analyzing crash dumps and in some debugging situations. I think
> it can be useful in RHEL for this reason. How about an x86 config option to
> allow distros to decide if they want to have it enabled? I will make sure
> that it will have no performance degradation if the option is not enabled.

Config knobs suck too; they create a maintenance burden (we get to make
sure all the permutations works/build/etc..) and effectively nobody uses
them, since world+dog uses what distros pick.

Anyway, instead of adding a second per-cpu variable, can you see how
horrible something like this is:

unsigned char adds(unsigned char var, unsigned char val)
{
unsigned short sat = 0xff, tmp = var;

asm ("addb  %[val], %b[var];"
 "cmovc %[sat], %[var];"
 : [var] "+r" (tmp)
 : [val] "ir" (val), [sat] "r" (sat)
 );

return tmp;
}

Another thing to try is, instead of threading that lockval throughout
the thing, simply:

#define _Q_LOCKED_VAL   this_cpu_read_stable(cpu_sat)

or combined with the above

#define _Q_LOCKED_VAL   adds(this_cpu_read_stable(cpu_number), 2)

and see if the compiler really makes a mess of things.


Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR

2020-07-23 Thread Waiman Long

On 7/23/20 2:47 PM, pet...@infradead.org wrote:

On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:

BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
I will have to update the patch to fix the reported 0-day test problem, but
I want to collect other feedback before sending out v3.

I want to say I hate it all, it adds instructions to a path we spend an
awful lot of time optimizing without really getting anything back for
it.


It does add some extra instructions that may slow it down slightly, but I
don't agree that it gives nothing back. The cpu lock holder information
can be useful in analyzing crash dumps and in some debugging situations.
I think it can be useful in RHEL for this reason. How about an x86
config option to allow distros to decide if they want to have it 
enabled? I will make sure that it will have no performance degradation 
if the option is not enabled.


Cheers,
Longman




Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR

2020-07-23 Thread peterz
On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
> I will have to update the patch to fix the reported 0-day test problem, but
> I want to collect other feedback before sending out v3.

I want to say I hate it all, it adds instructions to a path we spend an
awful lot of time optimizing without really getting anything back for
it.

Will, how do you feel about it?


Re: [PATCH v2 0/3] powerpc/pseries: IPI doorbell improvements

2020-07-23 Thread Cédric Le Goater
On 6/30/20 1:50 PM, Nicholas Piggin wrote:
> Since v1:
> - Fixed SMP compile error.
> - Fixed EPAPR / KVM_GUEST breakage.
> - Expanded patch 3 changelog a bit.
> 
> Thanks,
> Nick

I gave the patchset a try on KVM guests (P9 and P8) and PowerVM LPARs (P9).

Looks good.  

Tested-by: Cédric Le Goater 

Thanks,

C. 

> 
> Nicholas Piggin (3):
>   powerpc: inline doorbell sending functions
>   powerpc/pseries: Use doorbells even if XIVE is available
>   powerpc/pseries: Add KVM guest doorbell restrictions
> 
>  arch/powerpc/include/asm/dbell.h | 63 ++--
>  arch/powerpc/include/asm/firmware.h  |  6 +++
>  arch/powerpc/include/asm/kvm_para.h  | 26 ++--
>  arch/powerpc/kernel/Makefile |  5 +--
>  arch/powerpc/kernel/dbell.c  | 55 
>  arch/powerpc/kernel/firmware.c   | 19 +
>  arch/powerpc/platforms/pseries/smp.c | 62 +++
>  7 files changed, 134 insertions(+), 102 deletions(-)
> 



Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR

2020-07-23 Thread Waiman Long

On 7/23/20 10:00 AM, Peter Zijlstra wrote:

On Thu, Jul 09, 2020 at 12:06:13PM -0400, Waiman Long wrote:

We don't really need to do a pv_spinlocks_init() if pv_kick() isn't
supported.

Waiman, if you cannot explain how not having kick is a sane thing, what
are you saying here?

The current PPC paravirt spinlock code doesn't do any cpu kick. It does 
the equivalent of pv_wait by yielding the cpu to the lock holder only.
The pv_spinlocks_init() is for setting up the hash table for doing 
pv_kick. If we don't need to do pv_kick, we don't need the hash table.


I am not saying that pv_kick is not needed for the PPC environment. I 
was just trying to adapt the pvqspinlock code to such an environment 
first. Further investigation on how to implement some kind of pv_kick 
will be something that we may want to do as a follow on.


BTW, do you have any comment on my v2 lock holder cpu info qspinlock 
patch? I will have to update the patch to fix the reported 0-day test 
problem, but I want to collect other feedback before sending out v3.


Cheers,
Longman



Re: [PATCH 2/2] powerpc/64s: system call support for scv/rfscv instructions

2020-07-23 Thread Christophe Leroy

Michael Ellerman  wrote:


Nicholas Piggin  writes:
diff --git a/arch/powerpc/include/asm/ppc-opcode.h  
b/arch/powerpc/include/asm/ppc-opcode.h

index 2a39c716c343..b2bdc4de1292 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -257,6 +257,7 @@
 #define PPC_INST_MFVSRD		0x7c000066
 #define PPC_INST_MTVSRD		0x7c000166
 #define PPC_INST_SC			0x44000002
+#define PPC_INST_SCV			0x44000001

...

@@ -411,6 +412,7 @@

...

+#define __PPC_LEV(l)   (((l) & 0x7f) << 5)


These conflicted and didn't seem to be used so I dropped them.


diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 5abe98216dc2..161bfccbc309 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -3378,6 +3382,16 @@ int emulate_step(struct pt_regs *regs,  
struct ppc_inst instr)

regs->msr = MSR_KERNEL;
return 1;

+   case SYSCALL_VECTORED_0:/* scv 0 */
+   regs->gpr[9] = regs->gpr[13];
+   regs->gpr[10] = MSR_KERNEL;
+   regs->gpr[11] = regs->nip + 4;
+   regs->gpr[12] = regs->msr & MSR_MASK;
+   regs->gpr[13] = (unsigned long) get_paca();
+   regs->nip = (unsigned long) &system_call_vectored_emulate;
+   regs->msr = MSR_KERNEL;
+   return 1;
+


This broke the ppc64e build:

  ld: arch/powerpc/lib/sstep.o:(.toc+0x0): undefined reference to  
`system_call_vectored_emulate'

  make[1]: *** [/home/michael/linux/Makefile:1139: vmlinux] Error 1

I wrapped it in #ifdef CONFIG_PPC64_BOOK3S.


You mean CONFIG_PPC_BOOK3S_64 ?

Christophe


cheers





Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-07-23 Thread Nicholas Piggin
Excerpts from Peter Zijlstra's message of July 24, 2020 12:59 am:
> On Thu, Jul 23, 2020 at 11:11:03PM +1000, Nicholas Piggin wrote:
>> Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm:
>> > On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote:
>> > 
>> >> diff --git a/arch/powerpc/include/asm/hw_irq.h 
>> >> b/arch/powerpc/include/asm/hw_irq.h
>> >> index 3a0db7b0b46e..35060be09073 100644
>> >> --- a/arch/powerpc/include/asm/hw_irq.h
>> >> +++ b/arch/powerpc/include/asm/hw_irq.h
>> >> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
>> >>  #define powerpc_local_irq_pmu_save(flags)\
>> >>do {   \
>> >>   raw_local_irq_pmu_save(flags);  \
>> >> - trace_hardirqs_off();   \
>> >> + if (!raw_irqs_disabled_flags(flags))\
>> >> + trace_hardirqs_off();   \
>> >>   } while(0)
>> >>  #define powerpc_local_irq_pmu_restore(flags) \
>> >>   do {\
>> >> - if (raw_irqs_disabled_flags(flags)) {   \
>> >> - raw_local_irq_pmu_restore(flags);   \
>> >> - trace_hardirqs_off();   \
>> >> - } else {\
>> >> + if (!raw_irqs_disabled_flags(flags))\
>> >>   trace_hardirqs_on();\
>> >> - raw_local_irq_pmu_restore(flags);   \
>> >> - }   \
>> >> + raw_local_irq_pmu_restore(flags);   \
>> >>   } while(0)
>> > 
>> > You shouldn't be calling lockdep from NMI context!
>> 
>> After this patch it doesn't.
> 
> You sure, trace_hardirqs_{on,off}() calls into lockdep.

At least for irq enable/disable functions yes. NMI runs with
interrupts disabled so these will never call trace_hardirqs_on/off
after this patch.

> (FWIW they're
> also broken vs entry ordering, but that's another story).
> 
>> trace_hardirqs_on/off implementation appears to expect to be called in NMI 
>> context though, for some reason.
> 
> Hurpm, not sure.. I'll have to go grep arch code now :/ The generic NMI
> code didn't touch that stuff.
> 
> Argh, yes, there might be broken there... damn! I'll go frob around.
> 

Thanks,
Nick


Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

2020-07-23 Thread Nicholas Piggin
Excerpts from Waiman Long's message of July 24, 2020 12:29 am:
> On 7/23/20 9:30 AM, Nicholas Piggin wrote:
>>> I would prefer to extract out the pending bit handling code out into a
>>> separate helper function which can be overridden by the arch code
>>> instead of breaking the slowpath into 2 pieces.
>> You mean have the arch provide a queued_spin_lock_slowpath_pending
>> function that the slow path calls?
>>
>> I would actually prefer the pending handling can be made inline in
>> the queued_spin_lock function, especially with out-of-line locks it
>> makes sense to put it there.
>>
>> We could ifdef out queued_spin_lock_slowpath_queue if it's not used,
>> then __queued_spin_lock_slowpath_queue would be inlined into the
>> caller so there would be no split?
> 
> The pending code is an optimization for lightly contended locks. That is 
> why I think it is appropriate to extract it into a helper function and 
> mark it as such.
> 
> You can certainly put the code in the arch's spin_lock code, you just 
> have to override the generic pending code with a null function.

I see what you mean. I guess that would work fine.

Thanks,
Nick


Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-07-23 Thread Peter Zijlstra
On Thu, Jul 23, 2020 at 11:11:03PM +1000, Nicholas Piggin wrote:
> Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm:
> > On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote:
> > 
> >> diff --git a/arch/powerpc/include/asm/hw_irq.h 
> >> b/arch/powerpc/include/asm/hw_irq.h
> >> index 3a0db7b0b46e..35060be09073 100644
> >> --- a/arch/powerpc/include/asm/hw_irq.h
> >> +++ b/arch/powerpc/include/asm/hw_irq.h
> >> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
> >>  #define powerpc_local_irq_pmu_save(flags) \
> >> do {   \
> >>raw_local_irq_pmu_save(flags);  \
> >> -  trace_hardirqs_off();   \
> >> +  if (!raw_irqs_disabled_flags(flags))\
> >> +  trace_hardirqs_off();   \
> >>} while(0)
> >>  #define powerpc_local_irq_pmu_restore(flags)  \
> >>do {\
> >> -  if (raw_irqs_disabled_flags(flags)) {   \
> >> -  raw_local_irq_pmu_restore(flags);   \
> >> -  trace_hardirqs_off();   \
> >> -  } else {\
> >> +  if (!raw_irqs_disabled_flags(flags))\
> >>trace_hardirqs_on();\
> >> -  raw_local_irq_pmu_restore(flags);   \
> >> -  }   \
> >> +  raw_local_irq_pmu_restore(flags);   \
> >>} while(0)
> > 
> > You shouldn't be calling lockdep from NMI context!
> 
> After this patch it doesn't.

You sure, trace_hardirqs_{on,off}() calls into lockdep. (FWIW they're
also broken vs entry ordering, but that's another story).

> trace_hardirqs_on/off implementation appears to expect to be called in NMI 
> context though, for some reason.

Hurpm, not sure.. I'll have to go grep arch code now :/ The generic NMI
code didn't touch that stuff.

Argh, yes, there might be broken there... damn! I'll go frob around.


Re: [v3 12/15] powerpc/perf: Add support for outputting extended regs in perf intr_regs

2020-07-23 Thread Arnaldo Carvalho de Melo
Em Thu, Jul 23, 2020 at 11:14:16AM +0530, kajoljain escreveu:
> 
> 
> On 7/21/20 11:32 AM, kajoljain wrote:
> > 
> > 
> > On 7/17/20 8:08 PM, Athira Rajeev wrote:
> >> From: Anju T Sudhakar 
> >>
> >> Add support for perf extended register capability in powerpc.
> >> The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the
> >> PMU which support extended registers. The generic code define the mask
> >> of extended registers as 0 for non supported architectures.
> >>
> >> Patch adds extended regs support for power9 platform by
> >> exposing MMCR0, MMCR1 and MMCR2 registers.
> >>
> >> REG_RESERVED mask needs update to include extended regs.
> >> `PERF_REG_EXTENDED_MASK`, contains mask value of the supported registers,
> >> is defined at runtime in the kernel based on platform since the supported
> >> registers may differ from one processor version to another and hence the
> >> MASK value.
> >>
> >> with patch
> >> --
> >>
> >> available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11
> >> r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26
> >> r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe
> >> trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2
> >>
> >> PERF_RECORD_SAMPLE(IP, 0x1): 4784/4784: 0 period: 1 addr: 0
> >> ... intr regs: mask 0x ABI 64-bit
> >>  r00xc012b77c
> >>  r10xc03fe5e03930
> >>  r20xc1b0e000
> >>  r30xc03fdcddf800
> >>  r40xc03fc788
> >>  r50x9c422724be
> >>  r60xc03fe5e03908
> >>  r70xff63bddc8706
> >>  r80x9e4
> >>  r90x0
> >>  r10   0x1
> >>  r11   0x0
> >>  r12   0xc01299c0
> >>  r13   0xc03c4800
> >>  r14   0x0
> >>  r15   0x7fffdd8b8b00
> >>  r16   0x0
> >>  r17   0x7fffdd8be6b8
> >>  r18   0x7e7076607730
> >>  r19   0x2f
> >>  r20   0xc0001fc26c68
> >>  r21   0xc0002041e4227e00
> >>  r22   0xc0002018fb60
> >>  r23   0x1
> >>  r24   0xc03ffec4d900
> >>  r25   0x8000
> >>  r26   0x0
> >>  r27   0x1
> >>  r28   0x1
> >>  r29   0xc1be1260
> >>  r30   0x6008010
> >>  r31   0xc03ffebb7218
> >>  nip   0xc012b910
> >>  msr   0x90009033
> >>  orig_r3 0xc012b86c
> >>  ctr   0xc01299c0
> >>  link  0xc012b77c
> >>  xer   0x0
> >>  ccr   0x2800
> >>  softe 0x1
> >>  trap  0xf00
> >>  dar   0x0
> >>  dsisr 0x800
> >>  sier  0x0
> >>  mmcra 0x800
> >>  mmcr0 0x82008090
> >>  mmcr1 0x1e00
> >>  mmcr2 0x0
> >>  ... thread: perf:4784
> >>
> >> Signed-off-by: Anju T Sudhakar 
> >> [Defined PERF_REG_EXTENDED_MASK at run time to add support for different 
> >> platforms ]
> >> Signed-off-by: Athira Rajeev 
> >> Reviewed-by: Madhavan Srinivasan 
> >> ---
> > 
> > Patch looks good to me.
> > 
> > Reviewed-by: Kajol Jain 
> 
> Hi Arnaldo and Jiri,
>Please let me know if you have any comments on these patches. Can you 
> pull/ack these
> patches if they seem fine to you.

Can you please clarify something here: I think I saw a kernel build bot
complaint followed by a fix. In these cases I think, for reviewers'
sake, that this would entail a v4 patchkit? One that has no such build
issues?

Or have I got something wrong?

- Arnaldo


Re: [PATCHv3 1/2] powerpc/pseries: group lmb operation and memblock's

2020-07-23 Thread Nathan Lynch
Pingfan Liu  writes:
> This patch prepares for the incoming patch which swaps the order of KOBJ_
> uevent and dt's updating.
>
> It has no functional effect, just groups lmb operation and memblock's in
> order to insert dt updating operation easily, and makes it easier to
> review.

...

> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 5d545b7..1a3ac3b 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -355,7 +355,8 @@ static int dlpar_add_lmb(struct drmem_lmb *);
>  static int dlpar_remove_lmb(struct drmem_lmb *lmb)
>  {
>   unsigned long block_sz;
> - int rc;
> + phys_addr_t base_addr;
> + int rc, nid;
>  
>   if (!lmb_is_removable(lmb))
>   return -EINVAL;
> @@ -364,17 +365,19 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
>   if (rc)
>   return rc;
>  
> + base_addr = lmb->base_addr;
> + nid = lmb->nid;
>   block_sz = pseries_memory_block_size();
>  
> - __remove_memory(lmb->nid, lmb->base_addr, block_sz);
> -
> - /* Update memory regions for memory remove */
> - memblock_remove(lmb->base_addr, block_sz);
> -
>   invalidate_lmb_associativity_index(lmb);
>   lmb_clear_nid(lmb);
>   lmb->flags &= ~DRCONF_MEM_ASSIGNED;
>  
> + __remove_memory(nid, base_addr, block_sz);
> +
> + /* Update memory regions for memory remove */
> + memblock_remove(base_addr, block_sz);
> +
>   return 0;
>  }

I don't understand; the commit message should not claim this has no
functional effect when it changes the order of operations like
this. Maybe this is an improvement over the current behavior, but it's
not explained why it would be.


Re: [PATCH v3 4/6] powerpc/64s: implement queued spinlocks and rwlocks

2020-07-23 Thread Michal Suchánek
On Mon, Jul 06, 2020 at 02:35:38PM +1000, Nicholas Piggin wrote:
> These have shown significantly improved performance and fairness when
> spinlock contention is moderate to high on very large systems.
> 
>  [ Numbers hopefully forthcoming after more testing, but initial
>results look good ]
> 
> Thanks to the fast path, single threaded performance is not noticably
> hurt.
> 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/Kconfig  | 13 
>  arch/powerpc/include/asm/Kbuild   |  2 ++
>  arch/powerpc/include/asm/qspinlock.h  | 25 +++
>  arch/powerpc/include/asm/spinlock.h   |  5 +
>  arch/powerpc/include/asm/spinlock_types.h |  5 +
>  arch/powerpc/lib/Makefile |  3 +++
>  include/asm-generic/qspinlock.h   |  2 ++
>  7 files changed, 55 insertions(+)
>  create mode 100644 arch/powerpc/include/asm/qspinlock.h
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 24ac85c868db..17663ea57697 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -146,6 +146,8 @@ config PPC
>   select ARCH_SUPPORTS_ATOMIC_RMW
>   select ARCH_USE_BUILTIN_BSWAP
>   select ARCH_USE_CMPXCHG_LOCKREF if PPC64
> + select ARCH_USE_QUEUED_RWLOCKS  if PPC_QUEUED_SPINLOCKS
> + select ARCH_USE_QUEUED_SPINLOCKSif PPC_QUEUED_SPINLOCKS
>   select ARCH_WANT_IPC_PARSE_VERSION
>   select ARCH_WEAK_RELEASE_ACQUIRE
>   select BINFMT_ELF
> @@ -492,6 +494,17 @@ config HOTPLUG_CPU
>  
> Say N if you are unsure.
>  
> +config PPC_QUEUED_SPINLOCKS
> + bool "Queued spinlocks"
> + depends on SMP
> + default "y" if PPC_BOOK3S_64
> + help
> +   Say Y here to use queued spinlocks which are more complex
> +   but give better salability and fairness on large SMP and NUMA
   ^ +c?
Thanks

Michal
> +   systems.
> +
> +   If unsure, say "Y" if you have lots of cores, otherwise "N".
> +
>  config ARCH_CPU_PROBE_RELEASE
>   def_bool y
>   depends on HOTPLUG_CPU
> diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
> index dadbcf3a0b1e..1dd8b6adff5e 100644
> --- a/arch/powerpc/include/asm/Kbuild
> +++ b/arch/powerpc/include/asm/Kbuild
> @@ -6,5 +6,7 @@ generated-y += syscall_table_spu.h
>  generic-y += export.h
>  generic-y += local64.h
>  generic-y += mcs_spinlock.h
> +generic-y += qrwlock.h
> +generic-y += qspinlock.h
>  generic-y += vtime.h
>  generic-y += early_ioremap.h
> diff --git a/arch/powerpc/include/asm/qspinlock.h 
> b/arch/powerpc/include/asm/qspinlock.h
> new file mode 100644
> index ..c49e33e24edd
> --- /dev/null
> +++ b/arch/powerpc/include/asm/qspinlock.h
> @@ -0,0 +1,25 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_POWERPC_QSPINLOCK_H
> +#define _ASM_POWERPC_QSPINLOCK_H
> +
> > +#include <asm-generic/qspinlock_types.h>
> +
> +#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */
> +
> +#define smp_mb__after_spinlock()   smp_mb()
> +
> +static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
> +{
> + /*
> +  * This barrier was added to simple spinlocks by commit 51d7d5205d338,
> +  * but it should now be possible to remove it, asm arm64 has done with
> +  * commit c6f5d02b6a0f.
> +  */
> + smp_mb();
> > + return atomic_read(&lock->val);
> +}
> +#define queued_spin_is_locked queued_spin_is_locked
> +
> > +#include <asm-generic/qspinlock.h>
> +
> +#endif /* _ASM_POWERPC_QSPINLOCK_H */
> diff --git a/arch/powerpc/include/asm/spinlock.h 
> b/arch/powerpc/include/asm/spinlock.h
> index 21357fe05fe0..434615f1d761 100644
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -3,7 +3,12 @@
>  #define __ASM_SPINLOCK_H
>  #ifdef __KERNEL__
>  
> +#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
> > +#include <asm/qspinlock.h>
> > +#include <asm/qrwlock.h>
> > +#else
> >  #include <asm/simple_spinlock.h>
> > +#endif
>  
>  #endif /* __KERNEL__ */
>  #endif /* __ASM_SPINLOCK_H */
> diff --git a/arch/powerpc/include/asm/spinlock_types.h 
> b/arch/powerpc/include/asm/spinlock_types.h
> index 3906f52dae65..c5d742f18021 100644
> --- a/arch/powerpc/include/asm/spinlock_types.h
> +++ b/arch/powerpc/include/asm/spinlock_types.h
> @@ -6,6 +6,11 @@
>  # error "please don't include this file directly"
>  #endif
>  
> +#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
> > +#include <asm-generic/qspinlock_types.h>
> > +#include <asm-generic/qrwlock_types.h>
> > +#else
> >  #include <asm/simple_spinlock_types.h>
> +#endif
>  
>  #endif
> diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
> index 5e994cda8e40..d66a645503eb 100644
> --- a/arch/powerpc/lib/Makefile
> +++ b/arch/powerpc/lib/Makefile
> @@ -41,7 +41,10 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
> copypage_power7.o \
>  obj64-y  += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
>  memcpy_64.o memcpy_mcsafe_64.o
>  
> +ifndef CONFIG_PPC_QUEUED_SPINLOCKS
>  obj64-$(CONFIG_SMP)  += locks.o
> +endif
> +
>  obj64-$(CONFIG_ALTIVEC)  += vmx-helper.o
>  obj64-$(CONFIG_KPROBES_SANITY_TEST)  += test_emulate_step.o \
>  

Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

2020-07-23 Thread Waiman Long

On 7/23/20 9:30 AM, Nicholas Piggin wrote:

I would prefer to extract out the pending bit handling code out into a
separate helper function which can be overridden by the arch code
instead of breaking the slowpath into 2 pieces.

You mean have the arch provide a queued_spin_lock_slowpath_pending
function that the slow path calls?

I would actually prefer the pending handling can be made inline in
the queued_spin_lock function, especially with out-of-line locks it
makes sense to put it there.

We could ifdef out queued_spin_lock_slowpath_queue if it's not used,
then __queued_spin_lock_slowpath_queue would be inlined into the
caller so there would be no split?


The pending code is an optimization for lightly contended locks. That is 
why I think it is appropriate to extract it into a helper function and 
mark it as such.


You can certainly put the code in the arch's spin_lock code, you just
have to override the generic pending code with a null function.


Cheers,
Longman



Re: [RFC PATCH 4/5] powerpc/mm: Remove custom stack expansion checking

2020-07-23 Thread Daniel Axtens
Hi Michael,

> We have powerpc specific logic in our page fault handling to decide if
> an access to an unmapped address below the stack pointer should expand
> the stack VMA.
>
> The logic aims to prevent userspace from doing bad accesses below the
> stack pointer. However as long as the stack is < 1MB in size, we allow
> all accesses without further checks. Adding some debug I see that I
> can do a full kernel build and LTP run, and not a single process has
> used more than 1MB of stack. So for the majority of processes the
> logic never even fires.
>
> We also recently found a nasty bug in this code which could cause
> userspace programs to be killed during signal delivery. It went
> unnoticed presumably because most processes use < 1MB of stack.
>
> The generic mm code has also grown support for stack guard pages since
> this code was originally written, so the most heinous case of the
> stack expanding into other mappings is now handled for us.
>
> Finally although some other arches have special logic in this path,
> from what I can tell none of x86, arm64, arm and s390 impose any extra
> checks other than those in expand_stack().
>
> So drop our complicated logic and like other architectures just let
> the stack expand as long as it's within the rlimit.
>

I applied and tested this. While I wouldn't call my testing
comprehensive, I have not been able to reproduce the crash with this
patch applied.

Kind regards,
Daniel


> Signed-off-by: Michael Ellerman 
> ---
>  arch/powerpc/mm/fault.c | 106 ++--
>  1 file changed, 5 insertions(+), 101 deletions(-)
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index ed01329dd12b..925a7231abb3 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -42,39 +42,7 @@
>  #include 
>  #include 
>  
> -/*
> - * Check whether the instruction inst is a store using
> - * an update addressing form which will update r1.
> - */
> -static bool store_updates_sp(struct ppc_inst inst)
> -{
> - /* check for 1 in the rA field */
> - if (((ppc_inst_val(inst) >> 16) & 0x1f) != 1)
> - return false;
> - /* check major opcode */
> - switch (ppc_inst_primary_opcode(inst)) {
> - case OP_STWU:
> - case OP_STBU:
> - case OP_STHU:
> - case OP_STFSU:
> - case OP_STFDU:
> - return true;
> - case OP_STD:/* std or stdu */
> - return (ppc_inst_val(inst) & 3) == 1;
> - case OP_31:
> - /* check minor opcode */
> - switch ((ppc_inst_val(inst) >> 1) & 0x3ff) {
> - case OP_31_XOP_STDUX:
> - case OP_31_XOP_STWUX:
> - case OP_31_XOP_STBUX:
> - case OP_31_XOP_STHUX:
> - case OP_31_XOP_STFSUX:
> - case OP_31_XOP_STFDUX:
> - return true;
> - }
> - }
> - return false;
> -}
> +
>  /*
>   * do_page_fault error handling helpers
>   */
> @@ -267,54 +235,6 @@ static bool bad_kernel_fault(struct pt_regs *regs, 
> unsigned long error_code,
>   return false;
>  }
>  
> -static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> - struct vm_area_struct *vma, unsigned int flags,
> - bool *must_retry)
> -{
> - /*
> -  * N.B. The POWER/Open ABI allows programs to access up to
> -  * 288 bytes below the stack pointer.
> -  * The kernel signal delivery code writes up to 4KB
> -  * below the stack pointer (r1) before decrementing it.
> -  * The exec code can write slightly over 640kB to the stack
> -  * before setting the user r1.  Thus we allow the stack to
> -  * expand to 1MB without further checks.
> -  */
> - if (address + 0x100000 < vma->vm_end) {
> - struct ppc_inst __user *nip = (struct ppc_inst __user 
> *)regs->nip;
> - /* get user regs even if this fault is in kernel mode */
> - struct pt_regs *uregs = current->thread.regs;
> - if (uregs == NULL)
> - return true;
> -
> - /*
> -  * A user-mode access to an address a long way below
> -  * the stack pointer is only valid if the instruction
> -  * is one which would update the stack pointer to the
> -  * address accessed if the instruction completed,
> -  * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
> -  * (or the byte, halfword, float or double forms).
> -  *
> -  * If we don't check this then any write to the area
> -  * between the last mapped region and the stack will
> -  * expand the stack rather than segfaulting.
> -  */
> - if (address + 4096 >= uregs->gpr[1])
> - return false;
> -
> - if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
> - access_ok(nip, sizeof(*nip))) 

Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR

2020-07-23 Thread Nicholas Piggin
Excerpts from Michael Ellerman's message of July 9, 2020 8:53 pm:
> Nicholas Piggin  writes:
> 
>> Signed-off-by: Nicholas Piggin 
>> ---
>>  arch/powerpc/include/asm/paravirt.h   | 28 
>>  arch/powerpc/include/asm/qspinlock.h  | 66 +++
>>  arch/powerpc/include/asm/qspinlock_paravirt.h |  7 ++
>>  arch/powerpc/platforms/pseries/Kconfig|  5 ++
>>  arch/powerpc/platforms/pseries/setup.c|  6 +-
>>  include/asm-generic/qspinlock.h   |  2 +
> 
> Another ack?
> 
>> diff --git a/arch/powerpc/include/asm/paravirt.h 
>> b/arch/powerpc/include/asm/paravirt.h
>> index 7a8546660a63..f2d51f929cf5 100644
>> --- a/arch/powerpc/include/asm/paravirt.h
>> +++ b/arch/powerpc/include/asm/paravirt.h
>> @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 
>> yield_count)
>>  {
>>  ___bad_yield_to_preempted(); /* This would be a bug */
>>  }
>> +
>> +extern void ___bad_yield_to_any(void);
>> +static inline void yield_to_any(void)
>> +{
>> +___bad_yield_to_any(); /* This would be a bug */
>> +}
> 
> Why do we do that rather than just not defining yield_to_any() at all
> and letting the build fail on that?
> 
> There's a condition somewhere that we know will false at compile time
> and drop the call before linking?

Mainly so you could use it in if (IS_ENABLED()) blocks, but would still
catch the (presumably buggy) case where something calls it without the
option set.

I think I had it arranged a different way that was using IS_ENABLED 
earlier and changed it but might as well keep it this way.

> 
>> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h 
>> b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> new file mode 100644
>> index ..750d1b5e0202
>> --- /dev/null
>> +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> @@ -0,0 +1,7 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#ifndef __ASM_QSPINLOCK_PARAVIRT_H
>> +#define __ASM_QSPINLOCK_PARAVIRT_H
> 
> _ASM_POWERPC_QSPINLOCK_PARAVIRT_H please.
> 
>> +
>> +EXPORT_SYMBOL(__pv_queued_spin_unlock);
> 
> Why's that in a header? Should that (eventually) go with the generic 
> implementation?

Yeah the qspinlock_paravirt.h header is a bit weird and only gets 
included into kernel/locking/qspinlock.c

>> diff --git a/arch/powerpc/platforms/pseries/Kconfig 
>> b/arch/powerpc/platforms/pseries/Kconfig
>> index 24c18362e5ea..756e727b383f 100644
>> --- a/arch/powerpc/platforms/pseries/Kconfig
>> +++ b/arch/powerpc/platforms/pseries/Kconfig
>> @@ -25,9 +25,14 @@ config PPC_PSERIES
>>  select SWIOTLB
>>  default y
>>  
>> +config PARAVIRT_SPINLOCKS
>> +bool
>> +default n
> 
> default n is the default.
> 
>> diff --git a/arch/powerpc/platforms/pseries/setup.c 
>> b/arch/powerpc/platforms/pseries/setup.c
>> index 2db8469e475f..747a203d9453 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
>>  if (firmware_has_feature(FW_FEATURE_LPAR)) {
>>  vpa_init(boot_cpuid);
>>  
>> -if (lppaca_shared_proc(get_lppaca()))
>> +if (lppaca_shared_proc(get_lppaca())) {
>>  static_branch_enable(_processor);
>> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
>> +pv_spinlocks_init();
>> +#endif
>> +}
> 
> We could avoid the ifdef with this I think?

Yes I think so.

Thanks,
Nick



Re: [PATCH v2 2/2] KVM: PPC: Book3S HV: rework secure mem slot dropping

2020-07-23 Thread Laurent Dufour

Le 23/07/2020 à 14:32, Laurent Dufour a écrit :

Le 23/07/2020 à 05:36, Bharata B Rao a écrit :

On Tue, Jul 21, 2020 at 12:42:02PM +0200, Laurent Dufour wrote:

When a secure memslot is dropped, all the pages backed in the secure device
(aka really backed by secure memory by the Ultravisor) should be paged out
to a normal page. Previously, this was achieved by triggering the page
fault mechanism which is calling kvmppc_svm_page_out() on each pages.

This can't work when hot unplugging a memory slot because the memory slot
is flagged as invalid and gfn_to_pfn() is then not trying to access the
page, so the page fault mechanism is not triggered.

Since the final goal is to make a call to kvmppc_svm_page_out() it seems
simpler to directly call it instead of triggering such a mechanism. This
way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a
memslot.

Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock,
the call to __kvmppc_svm_page_out() is made.
As __kvmppc_svm_page_out needs the vma pointer to migrate the pages, the
VMA is fetched in a lazy way, to not trigger find_vma() all the time. In
addition, the mmap_sem is held in read mode during that time, not in write
mode since the virtual memory layout is not impacted, and
kvm->arch.uvmem_lock prevents concurrent operation on the secure device.

Cc: Ram Pai 
Cc: Bharata B Rao 
Cc: Paul Mackerras 
Signed-off-by: Laurent Dufour 
---
  arch/powerpc/kvm/book3s_hv_uvmem.c | 54 --
  1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c

index 5a4b02d3f651..ba5c7c77cc3a 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -624,35 +624,55 @@ static inline int kvmppc_svm_page_out(struct 
vm_area_struct *vma,

   * fault on them, do fault time migration to replace the device PTEs in
   * QEMU page table with normal PTEs from newly allocated pages.
   */
-void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
+void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
   struct kvm *kvm, bool skip_page_out)
  {
  int i;
  struct kvmppc_uvmem_page_pvt *pvt;
-    unsigned long pfn, uvmem_pfn;
-    unsigned long gfn = free->base_gfn;
+    struct page *uvmem_page;
+    struct vm_area_struct *vma = NULL;
+    unsigned long uvmem_pfn, gfn;
+    unsigned long addr, end;
+
+    mmap_read_lock(kvm->mm);
+
+    addr = slot->userspace_addr;


We typically use gfn_to_hva() for that, but that won't work for a
memslot that is already marked INVALID which is the case here.
I think it is ok to access slot->userspace_addr here of an INVALID
memslot, but just thought of explicitly bringing this up.


Which is explicitly mentioned above in the patch's description:

This can't work when hot unplugging a memory slot because the memory slot
is flagged as invalid and gfn_to_pfn() is then not trying to access the
page, so the page fault mechanism is not triggered.




+    end = addr + (slot->npages * PAGE_SIZE);
-    for (i = free->npages; i; --i, ++gfn) {
-    struct page *uvmem_page;
+    gfn = slot->base_gfn;
+    for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) {
+
+    /* Fetch the VMA if addr is not in the latest fetched one */
+    if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) {
+    vma = find_vma_intersection(kvm->mm, addr, end);
+    if (!vma ||
+    vma->vm_start > addr || vma->vm_end < end) {
+    pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
+    break;
+    }
+    }


In Ram's series, kvmppc_memslot_page_merge() also walks the VMAs spanning
the memslot, but it uses a different logic for the same. Why can't these
two cases use the same method to walk the VMAs? Is there anything subtly
different between the two cases?


This is probably doable. At the time I wrote that patch, the 
kvmppc_memslot_page_merge() was not yet introduced AFAIR.


This being said, it'd help a lot to factorize that code... I'll let Ram deal with
that ;)


Indeed I don't think this is relevant, the loop in kvmppc_memslot_page_merge() 
deals with one call (to ksm_advise) per VMA, while this code is dealing with one 
call per page of the VMA, which is completely different.


I don't think merging the two would be a good idea.

Cheers,
Laurent.


Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR

2020-07-23 Thread Peter Zijlstra
On Thu, Jul 09, 2020 at 12:06:13PM -0400, Waiman Long wrote:
> We don't really need to do a pv_spinlocks_init() if pv_kick() isn't
> supported.

Waiman, if you cannot explain how not having kick is a sane thing, what
are you saying here?


Re: [PATCH v2 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper

2020-07-23 Thread Ard Biesheuvel
On Thu, 23 Jul 2020 at 04:52, Jarkko Sakkinen
 wrote:
>
> On Thu, Jul 16, 2020 at 06:49:09PM +0200, Christophe Leroy wrote:
> > Jarkko Sakkinen  a écrit :
> >
> > > Rename module_alloc() to text_alloc() and module_memfree() to
> > > text_memfree(), and move them to kernel/text.c, which is unconditionally
> > > compiled to the kernel proper. This allows kprobes, ftrace and bpf to
> > > allocate space for executable code without requiring to compile the 
> > > modules
> > > support (CONFIG_MODULES=y) in.
> >
> > You are not changing enough in powerpc to have this work.
> > On powerpc 32 bits (6xx), when STRICT_KERNEL_RWX is selected, the vmalloc
> > space is set to NX (no exec) at segment level (ie by 256Mbytes zone) unless
> > CONFIG_MODULES is selected.
> >
> > Christophe
>
> This has been deduced down to:
>
> https://lore.kernel.org/lkml/20200717030422.679972-1-jarkko.sakki...@linux.intel.com/
>
> I.e. not intruding PPC anymore :-)
>

Ok, so after the elaborate discussion we had between Jessica, Russell,
Peter, Will, Mark, you and myself, where we pointed out that
a) a single text_alloc() abstraction for bpf, kprobes and ftrace does
not fit other architectures very well, and
b) that module_alloc() is not suitable as a default to base text_alloc() on,

you went ahead and implemented that anyway, but only cc'ing Peter,
akpm, Masami and the mm list this time?

Sorry, but that is not how it works. Once people get pulled into a
discussion, you cannot dismiss them or their feedback like that and go
off and do your own thing anyway. Generic features like this are
tricky to get right, and it will likely take many iterations and input
from many different people.


Re: [PATCH 2/5] powerpc: Allow 4096 bytes of stack expansion for the signal frame

2020-07-23 Thread Daniel Axtens
Hi Michael,

Unfortunately, this patch doesn't completely solve the problem.

Trying the original reproducer, I'm still able to trigger the crash even
with this patch, although not 100% of the time. (If I turn ASLR off
outside of tmux it reliably crashes, if I turn ASLR off _inside_ of tmux
it reliably succeeds; all of this is on a serial console.)

./foo 1241000 & sleep 1; killall -USR1 foo; echo ok

If I add some debugging information, I see that I'm getting
address + 4096 = 7fed0fa0
gpr1 =   7fed1020

So address + 4096 is 0x80 bytes below the 4k window. I haven't been able
to figure out why, gdb gives me a NIP in __kernel_sigtramp_rt64 but I
don't know what to make of that.
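
For what it's worth, a trivial user-space check of the values as printed
above (assuming they are complete as reported) confirms the 0x80 byte gap,
i.e. the faulting address + 4096 still sits below gpr[1], so the relaxed
check in the patch does not cover this access:

/* Quick check of the reported numbers; purely illustrative. */
#include <stdio.h>

int main(void)
{
	unsigned long addr_plus_4k = 0x7fed0fa0UL;	/* "address + 4096" above */
	unsigned long gpr1 = 0x7fed1020UL;		/* "gpr1" above */

	printf("gap below r1: 0x%lx bytes\n", gpr1 - addr_plus_4k);	/* 0x80 */
	printf("address + 4096 >= gpr1: %d\n", addr_plus_4k >= gpr1);	/* 0 */
	return 0;
}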

Kind regards,
Daniel

P.S. I don't know what your policy on linking to kernel bugzilla is, but
if you want:

Link: https://bugzilla.kernel.org/show_bug.cgi?id=205183


> Reported-by: Tom Lane 
> Signed-off-by: Michael Ellerman 
> ---
>  arch/powerpc/mm/fault.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 641fc5f3d7dd..ed01329dd12b 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -274,7 +274,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, 
> unsigned long address,
>   /*
>* N.B. The POWER/Open ABI allows programs to access up to
>* 288 bytes below the stack pointer.
> -  * The kernel signal delivery code writes up to about 1.5kB
> +  * The kernel signal delivery code writes up to 4KB
>* below the stack pointer (r1) before decrementing it.
>* The exec code can write slightly over 640kB to the stack
>* before setting the user r1.  Thus we allow the stack to
> @@ -299,7 +299,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, 
> unsigned long address,
>* between the last mapped region and the stack will
>* expand the stack rather than segfaulting.
>*/
> - if (address + 2048 >= uregs->gpr[1])
> + if (address + 4096 >= uregs->gpr[1])
>   return false;
>  
>   if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
> -- 
> 2.25.1


Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

2020-07-23 Thread Nicholas Piggin
Excerpts from Waiman Long's message of July 22, 2020 12:36 am:
> On 7/21/20 7:08 AM, Nicholas Piggin wrote:
>> diff --git a/arch/powerpc/include/asm/qspinlock.h 
>> b/arch/powerpc/include/asm/qspinlock.h
>> index b752d34517b3..26d8766a1106 100644
>> --- a/arch/powerpc/include/asm/qspinlock.h
>> +++ b/arch/powerpc/include/asm/qspinlock.h
>> @@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct qspinlock 
>> *lock)
>>   
>>   #else
>>   extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
>> +extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>>   #endif
>>   
>>   static __always_inline void queued_spin_lock(struct qspinlock *lock)
>>   {
>> -u32 val = 0;
>> -
>> -if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
>> +atomic_t *a = &lock->val;
>> +u32 val;
>> +
>> +again:
>> +asm volatile(
>> +"1:\t"  PPC_LWARX(%0,0,%1,1) "  # queued_spin_lock  
>> \n"
>> +: "=&r" (val)
>> +: "r" (&a->counter)
>> +: "memory");
>> +
>> +if (likely(val == 0)) {
>> +asm_volatile_goto(
>> +"   stwcx.  %0,0,%1 
>> \n"
>> +"   bne-%l[again]   
>> \n"
>> +"\t"PPC_ACQUIRE_BARRIER "   
>> \n"
>> +:
>> +: "r"(_Q_LOCKED_VAL), "r" (&a->counter)
>> +: "cr0", "memory"
>> +: again );
>>  return;
>> -
>> -queued_spin_lock_slowpath(lock, val);
>> +}
>> +
>> +if (likely(val == _Q_LOCKED_VAL)) {
>> +asm_volatile_goto(
>> +"   stwcx.  %0,0,%1 
>> \n"
>> +"   bne-%l[again]   
>> \n"
>> +:
>> +: "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r" (&a->counter)
>> +: "cr0", "memory"
>> +: again );
>> +
>> +atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK));
>> +//  clear_pending_set_locked(lock);
>> +WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
>> +//  lockevent_inc(lock_pending);
>> +return;
>> +}
>> +
>> +if (val == _Q_PENDING_VAL) {
>> +int cnt = _Q_PENDING_LOOPS;
>> +val = atomic_cond_read_relaxed(a,
>> +   (VAL != _Q_PENDING_VAL) || 
>> !cnt--);
>> +if (!(val & ~_Q_LOCKED_MASK))
>> +goto again;
>> +}
>> +queued_spin_lock_slowpath_queue(lock);
>>   }
>>   #define queued_spin_lock queued_spin_lock
>>   
> 
> I am fine with the arch code override some part of the generic code.

Cool.

>> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
>> index b9515fcc9b29..ebcc6f5d99d5 100644
>> --- a/kernel/locking/qspinlock.c
>> +++ b/kernel/locking/qspinlock.c
>> @@ -287,10 +287,14 @@ static __always_inline u32  
>> __pv_wait_head_or_lock(struct qspinlock *lock,
>>   
>>   #ifdef CONFIG_PARAVIRT_SPINLOCKS
>>   #define queued_spin_lock_slowpath  native_queued_spin_lock_slowpath
>> +#define queued_spin_lock_slowpath_queue 
>> native_queued_spin_lock_slowpath_queue
>>   #endif
>>   
>>   #endif /* _GEN_PV_LOCK_SLOWPATH */
>>   
>> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>> +
>>   /**
>>* queued_spin_lock_slowpath - acquire the queued spinlock
>>* @lock: Pointer to queued spinlock structure
>> @@ -314,12 +318,6 @@ static __always_inline u32  
>> __pv_wait_head_or_lock(struct qspinlock *lock,
>>*/
>>   void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>>   {
>> -struct mcs_spinlock *prev, *next, *node;
>> -u32 old, tail;
>> -int idx;
>> -
>> -BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
>> -
>>  if (pv_enabled())
>>  goto pv_queue;
>>   
>> @@ -397,6 +395,26 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, 
>> u32 val)
>>   queue:
>>  lockevent_inc(lock_slowpath);
>>   pv_queue:
>> +__queued_spin_lock_slowpath_queue(lock);
>> +}
>> +EXPORT_SYMBOL(queued_spin_lock_slowpath);
>> +
>> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock)
>> +{
>> +lockevent_inc(lock_slowpath);
>> +__queued_spin_lock_slowpath_queue(lock);
>> +}
>> +EXPORT_SYMBOL(queued_spin_lock_slowpath_queue);
>> +
>> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock)
>> +{
>> +struct mcs_spinlock *prev, *next, *node;
>> +u32 old, tail;
>> +u32 val;
>> +int idx;
>> +
>> +BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
>> +
>>  node = this_cpu_ptr(&qnodes[0].mcs);
>>  idx = node->count++;
>>  tail = encode_tail(smp_processor_id(), idx);
>> @@ -559,7 +577,6 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, 
>> u32 val)
>>   */
>>  

Re: [PATCHv3 2/2] powerpc/pseries: update device tree before ejecting hotplug uevents

2020-07-23 Thread Nathan Lynch
Pingfan Liu  writes:
> A bug is observed on pseries by taking the following steps on rhel:
> -1. drmgr -c mem -r -q 5
> -2. echo c > /proc/sysrq-trigger
>
> And then, the failure looks like:
> kdump: saving to /sysroot//var/crash/127.0.0.1-2020-01-16-02:06:14/
> kdump: saving vmcore-dmesg.txt
> kdump: saving vmcore-dmesg.txt complete
> kdump: saving vmcore
>  Checking for memory holes : [  0.0 %] /  
>  Checking for memory holes : [100.0 %] |  
>  Excluding unnecessary pages   : [100.0 %] \  
>  Copying data  : [  0.3 %] -  
> eta: 38s[   44.337636] hash-mmu: mm: Hashing failure ! 
> EA=0x7fffba40 access=0x8004 current=makedumpfile
> [   44.337663] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 
> psize 2 pte=0xc0005504
> [   44.337677] hash-mmu: mm: Hashing failure ! EA=0x7fffba40 
> access=0x8004 current=makedumpfile
> [   44.337692] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 
> psize 2 pte=0xc0005504
> [   44.337708] makedumpfile[469]: unhandled signal 7 at 7fffba40 nip 
> 7fffbbc4d7fc lr 00011356ca3c code 2
> [   44.338548] Core dump to |/bin/false pipe failed
> /lib/kdump-lib-initramfs.sh: line 98:   469 Bus error   
> $CORE_COLLECTOR /proc/vmcore 
> $_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/vmcore-incomplete
> kdump: saving vmcore failed
>
> * Root cause *
>   After analyzing, it turns out that in the current implementation,
> when hot-removing an lmb, the KOBJ_REMOVE event is emitted before the dt is
> updated, as the code __remove_memory() comes before drmem_update_dt().
> So in the kdump kernel, read_from_oldmem() resorts to
> pSeries_lpar_hpte_insert() to install the hpte, but fails with -2 due to a
> non-existent pfn. And finally, low_hash_fault() raises SIGBUS to the process,
> which is observed as the "Bus error" above.
>
> From a viewpoint of listener and publisher, the publisher notifies the
> listener before data is ready.  This introduces a problem where udev
> launches kexec-tools (due to KOBJ_REMOVE) and loads a stale dt before
> updating. And in the capture kernel, makedumpfile will access the memory based
> on the stale dt info, and hit a SIGBUS error due to a non-existent lmb.
>
> * Fix *
>   In order to fix this issue, update the dt before __remove_memory(), and
> accordingly apply the same rule in the hot-add path.
>
> This will introduce extra dt updating payload for each involved lmb during
> hotplug.
> But it should be fine since drmem_update_dt() is a memory-based operation and
> hotplug is not a hot path.

This is great analysis but the performance implications of the change
are grave. The add/remove paths here are already O(n) where n is the
quantity of memory assigned to the LP, this change would make it O(n^2):

dlpar_memory_add_by_count
  for_each_drmem_lmb <--
dlpar_add_lmb
  drmem_update_dt(_v1|_v2)
for_each_drmem_lmb   <--

Memory add/remove isn't a hot path but quadratic runtime complexity
isn't acceptable. Its current performance is bad enough that I have
internal bugs open on it.
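
To make the shape of the problem concrete, here is a tiny user-space model
(illustrative only, not kernel code): the outer loop stands in for the
per-LMB add/remove and the inner walk for a per-LMB drmem_update_dt(),
which is what makes the whole request quadratic in the number of LMBs.

/* Toy model of the quadratic cost: N LMBs, and a full walk per LMB. */
#include <stdio.h>

#define N_LMBS 1024

int main(void)
{
	long steps = 0;

	for (int i = 0; i < N_LMBS; i++)		/* one dlpar_add_lmb() per LMB */
		for (int j = 0; j < N_LMBS; j++)	/* dt update walks every LMB  */
			steps++;

	printf("%ld steps for %d LMBs\n", steps, N_LMBS);	/* N_LMBS squared */
	return 0;
}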

Not to mention we leak memory every time drmem_update_dt is called
because we can't safely free device tree properties :-(

Also note that this sort of reverts (fixes?) 063b8b1251fd
("powerpc/pseries/memory-hotplug: Only update DT once per memory DLPAR
request").


Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-07-23 Thread Nicholas Piggin
Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm:
> On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote:
> 
>> diff --git a/arch/powerpc/include/asm/hw_irq.h 
>> b/arch/powerpc/include/asm/hw_irq.h
>> index 3a0db7b0b46e..35060be09073 100644
>> --- a/arch/powerpc/include/asm/hw_irq.h
>> +++ b/arch/powerpc/include/asm/hw_irq.h
>> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
>>  #define powerpc_local_irq_pmu_save(flags)   \
>>   do {   \
>>  raw_local_irq_pmu_save(flags);  \
>> -trace_hardirqs_off();   \
>> +if (!raw_irqs_disabled_flags(flags))\
>> +trace_hardirqs_off();   \
>>  } while(0)
>>  #define powerpc_local_irq_pmu_restore(flags)\
>>  do {\
>> -if (raw_irqs_disabled_flags(flags)) {   \
>> -raw_local_irq_pmu_restore(flags);   \
>> -trace_hardirqs_off();   \
>> -} else {\
>> +if (!raw_irqs_disabled_flags(flags))\
>>  trace_hardirqs_on();\
>> -raw_local_irq_pmu_restore(flags);   \
>> -}   \
>> +raw_local_irq_pmu_restore(flags);   \
>>  } while(0)
> 
> You shouldn't be calling lockdep from NMI context!

After this patch it doesn't.

trace_hardirqs_on/off implementation appears to expect to be called in NMI 
context though, for some reason.

> That is, I recently
> added suport for that on x86:
> 
>   https://lkml.kernel.org/r/20200623083721.155449...@infradead.org
>   https://lkml.kernel.org/r/20200623083721.216740...@infradead.org
> 
> But you need to be very careful on how you order things, as you can see
> the above relies on preempt_count() already having been incremented with
> NMI_MASK.

Hmm. My patch seems simpler.

I don't know this stuff very well; I don't really understand what your patch
enables for x86, but at least it shouldn't be incompatible with this one
AFAIKS.

Thanks,
Nick


Re: [PATCH v2 2/2] KVM: PPC: Book3S HV: rework secure mem slot dropping

2020-07-23 Thread Laurent Dufour

Le 23/07/2020 à 05:36, Bharata B Rao a écrit :

On Tue, Jul 21, 2020 at 12:42:02PM +0200, Laurent Dufour wrote:

When a secure memslot is dropped, all the pages backed in the secure device
(aka really backed by secure memory by the Ultravisor) should be paged out
to a normal page. Previously, this was achieved by triggering the page
fault mechanism which is calling kvmppc_svm_page_out() on each pages.

This can't work when hot unplugging a memory slot because the memory slot
is flagged as invalid and gfn_to_pfn() is then not trying to access the
page, so the page fault mechanism is not triggered.

Since the final goal is to make a call to kvmppc_svm_page_out() it seems
simpler to directly call it instead of triggering such a mechanism. This
way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a
memslot.

Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock,
the call to __kvmppc_svm_page_out() is made.
As __kvmppc_svm_page_out needs the vma pointer to migrate the pages, the
VMA is fetched in a lazy way, to not trigger find_vma() all the time. In
addition, the mmap_sem is held in read mode during that time, not in write
mode since the virtual memory layout is not impacted, and
kvm->arch.uvmem_lock prevents concurrent operation on the secure device.

Cc: Ram Pai 
Cc: Bharata B Rao 
Cc: Paul Mackerras 
Signed-off-by: Laurent Dufour 
---
  arch/powerpc/kvm/book3s_hv_uvmem.c | 54 --
  1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 5a4b02d3f651..ba5c7c77cc3a 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -624,35 +624,55 @@ static inline int kvmppc_svm_page_out(struct 
vm_area_struct *vma,
   * fault on them, do fault time migration to replace the device PTEs in
   * QEMU page table with normal PTEs from newly allocated pages.
   */
-void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
+void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
 struct kvm *kvm, bool skip_page_out)
  {
int i;
struct kvmppc_uvmem_page_pvt *pvt;
-   unsigned long pfn, uvmem_pfn;
-   unsigned long gfn = free->base_gfn;
+   struct page *uvmem_page;
+   struct vm_area_struct *vma = NULL;
+   unsigned long uvmem_pfn, gfn;
+   unsigned long addr, end;
+
+   mmap_read_lock(kvm->mm);
+
+   addr = slot->userspace_addr;


We typically use gfn_to_hva() for that, but that won't work for a
memslot that is already marked INVALID which is the case here.
I think it is ok to access slot->userspace_addr here of an INVALID
memslot, but just thought of explicitly bringing this up.


Which is explicitly mentioned above in the patch's description:

This can't work when hot unplugging a memory slot because the memory slot
is flagged as invalid and gfn_to_pfn() is then not trying to access the
page, so the page fault mechanism is not triggered.




+   end = addr + (slot->npages * PAGE_SIZE);
  
-	for (i = free->npages; i; --i, ++gfn) {

-   struct page *uvmem_page;
+   gfn = slot->base_gfn;
+   for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) {
+
+   /* Fetch the VMA if addr is not in the latest fetched one */
+   if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) {
+   vma = find_vma_intersection(kvm->mm, addr, end);
+   if (!vma ||
+   vma->vm_start > addr || vma->vm_end < end) {
+   pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
+   break;
+   }
+   }


In Ram's series, kvmppc_memslot_page_merge() also walks the VMAs spanning
the memslot, but it uses a different logic for the same. Why can't these
two cases use the same method to walk the VMAs? Is there anything subtly
different between the two cases?


This is probably doable. At the time I wrote that patch, the 
kvmppc_memslot_page_merge() was not yet introduced AFAIR.


This being said, it'd help a lot to factorize that code... I'll let Ram deal with
that ;)


Cheers,
Laurent.




Re: [v4 4/5] KVM: PPC: Book3S HV: retry page migration before erroring-out

2020-07-23 Thread Ram Pai
I am dropping this patch based on our conversation, where we agreed, we
need to rootcause the migration failure.

On Thu, Jul 23, 2020 at 11:43:44AM +0530, Bharata B Rao wrote:
> On Fri, Jul 17, 2020 at 01:00:26AM -0700, Ram Pai wrote:
> > @@ -812,7 +842,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, 
> > unsigned long gpa,
> > struct vm_area_struct *vma;
> > int srcu_idx;
> > unsigned long gfn = gpa >> page_shift;
> > -   int ret;
> > +   int ret, repeat_count = REPEAT_COUNT;
> >  
> > if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
> > return H_UNSUPPORTED;
> > @@ -826,34 +856,44 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, 
> > unsigned long gpa,
> > if (flags & H_PAGE_IN_SHARED)
> > return kvmppc_share_page(kvm, gpa, page_shift);
> >  
> > -   ret = H_PARAMETER;
> > srcu_idx = srcu_read_lock(&kvm->srcu);
> > -   mmap_read_lock(kvm->mm);
> >  
> > -   start = gfn_to_hva(kvm, gfn);
> > -   if (kvm_is_error_hva(start))
> > -   goto out;
> > -
> > -   mutex_lock(&kvm->arch.uvmem_lock);
> > /* Fail the page-in request of an already paged-in page */
> > -   if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL))
> > -   goto out_unlock;
> > +   mutex_lock(&kvm->arch.uvmem_lock);
> > +   ret = kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL);
> > +   mutex_unlock(&kvm->arch.uvmem_lock);
> 
> Same comment as for the prev patch. I don't think you can release
> the lock here.
> 
> > +   if (ret) {
> > +   srcu_read_unlock(&kvm->srcu, srcu_idx);
> > +   return H_PARAMETER;
> > +   }
> >  
> > -   end = start + (1UL << page_shift);
> > -   vma = find_vma_intersection(kvm->mm, start, end);
> > -   if (!vma || vma->vm_start > start || vma->vm_end < end)
> > -   goto out_unlock;
> > +   do {
> > +   ret = H_PARAMETER;
> > +   mmap_read_lock(kvm->mm);
> >  
> > -   if (kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift,
> > -   true))
> > -   goto out_unlock;
> > +   start = gfn_to_hva(kvm, gfn);
> > +   if (kvm_is_error_hva(start)) {
> > +   mmap_read_unlock(kvm->mm);
> > +   break;
> > +   }
> >  
> > -   ret = H_SUCCESS;
> > +   end = start + (1UL << page_shift);
> > +   vma = find_vma_intersection(kvm->mm, start, end);
> > +   if (!vma || vma->vm_start > start || vma->vm_end < end) {
> > +   mmap_read_unlock(kvm->mm);
> > +   break;
> > +   }
> > +
> > +   mutex_lock(&kvm->arch.uvmem_lock);
> > +   ret = kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, 
> > page_shift, true);
> > +   mutex_unlock(&kvm->arch.uvmem_lock);
> > +
> > +   mmap_read_unlock(kvm->mm);
> > +   } while (ret == -2 && repeat_count--);
> > +
> > +   if (ret == -2)
> > +   ret = H_BUSY;
> >  
> > -out_unlock:
> > -   mutex_unlock(&kvm->arch.uvmem_lock);
> > -out:
> > -   mmap_read_unlock(kvm->mm);
> > srcu_read_unlock(&kvm->srcu, srcu_idx);
> > return ret;
> >  }
> > -- 
> > 1.8.3.1

-- 
Ram Pai


Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-07-23 Thread Peter Zijlstra
On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote:

> diff --git a/arch/powerpc/include/asm/hw_irq.h 
> b/arch/powerpc/include/asm/hw_irq.h
> index 3a0db7b0b46e..35060be09073 100644
> --- a/arch/powerpc/include/asm/hw_irq.h
> +++ b/arch/powerpc/include/asm/hw_irq.h
> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
>  #define powerpc_local_irq_pmu_save(flags)\
>do {   \
>   raw_local_irq_pmu_save(flags);  \
> - trace_hardirqs_off();   \
> + if (!raw_irqs_disabled_flags(flags))\
> + trace_hardirqs_off();   \
>   } while(0)
>  #define powerpc_local_irq_pmu_restore(flags) \
>   do {\
> - if (raw_irqs_disabled_flags(flags)) {   \
> - raw_local_irq_pmu_restore(flags);   \
> - trace_hardirqs_off();   \
> - } else {\
> + if (!raw_irqs_disabled_flags(flags))\
>   trace_hardirqs_on();\
> - raw_local_irq_pmu_restore(flags);   \
> - }   \
> + raw_local_irq_pmu_restore(flags);   \
>   } while(0)

You shouldn't be calling lockdep from NMI context! That is, I recently
added suport for that on x86:

  https://lkml.kernel.org/r/20200623083721.155449...@infradead.org
  https://lkml.kernel.org/r/20200623083721.216740...@infradead.org

But you need to be very careful on how you order things, as you can see
the above relies on preempt_count() already having been incremented with
NMI_MASK.


Re: [v4 3/5] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.

2020-07-23 Thread Ram Pai
On Thu, Jul 23, 2020 at 11:40:37AM +0530, Bharata B Rao wrote:
> On Fri, Jul 17, 2020 at 01:00:25AM -0700, Ram Pai wrote:
> >  
> > +int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
> > +   const struct kvm_memory_slot *memslot)
> 
> Don't see any callers for this outside of this file, so why not static?
> 
> > +{
> > +   unsigned long gfn = memslot->base_gfn;
> > +   struct vm_area_struct *vma;
> > +   unsigned long start, end;
> > +   int ret = 0;
> > +
> > +   while (kvmppc_next_nontransitioned_gfn(memslot, kvm, &gfn)) {
> 
> So you checked the state of gfn under uvmem_lock above, but release
> it too.
> 
> > +
> > +   mmap_read_lock(kvm->mm);
> > +   start = gfn_to_hva(kvm, gfn);
> > +   if (kvm_is_error_hva(start)) {
> > +   ret = H_STATE;
> > +   goto next;
> > +   }
> > +
> > +   end = start + (1UL << PAGE_SHIFT);
> > +   vma = find_vma_intersection(kvm->mm, start, end);
> > +   if (!vma || vma->vm_start > start || vma->vm_end < end) {
> > +   ret = H_STATE;
> > +   goto next;
> > +   }
> > +
> > +   mutex_lock(&kvm->arch.uvmem_lock);
> > +   ret = kvmppc_svm_migrate_page(vma, start, end,
> > +   (gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
> 
> What is the guarantee that the gfn is in the same earlier state when you do
> do migration here?

Are you worried about the case where some other thread will sneak in and
migrate the GFN, and this migration request will become a duplicate one?

That is theoretically possible, though practically improbable. This
transition is attempted only when there is one vcpu active in the VM.

However, may be, we should not bake-in that assumption in this code.
Will remove that assumption.

RP


Re: [v4 2/5] KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs

2020-07-23 Thread Ram Pai
On Thu, Jul 23, 2020 at 10:18:30AM +0530, Bharata B Rao wrote:
> On Fri, Jul 17, 2020 at 01:00:24AM -0700, Ram Pai wrote:
> > pvt->gpa = gpa;
..snip..
> > pvt->kvm = kvm;
> > @@ -524,6 +663,7 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, 
> > unsigned long gpa,
> > uvmem_page = pfn_to_page(uvmem_pfn);
> > pvt = uvmem_page->zone_device_data;
> > pvt->skip_page_out = true;
> > +   pvt->remove_gfn = false;
> > }
> >  
> >  retry:
> > @@ -537,12 +677,16 @@ static unsigned long kvmppc_share_page(struct kvm 
> > *kvm, unsigned long gpa,
> > uvmem_page = pfn_to_page(uvmem_pfn);
> > pvt = uvmem_page->zone_device_data;
> > pvt->skip_page_out = true;
> > +   pvt->remove_gfn = false;
> 
> This is the case of making an already secure page a shared page.
> A comment here as to why remove_gfn is set to false here will help.
> 
> Also isn't it by default false? Is there a situation where it starts
> out by default false, becomes true later and you are required to
> explicitly mark it false here?

It is by default false. And will be true when the GFN is
released/invalidated through kvmppc_uvmem_drop_pages().

It is marked false explicitly here, just to be safe, and protect
against any implicit changes.

> 
> Otherwise, Reviewed-by: Bharata B Rao 
> 
Thanks for the review.

RP


[PATCH 2/2] lockdep: warn on redundant or incorrect irq state changes

2020-07-23 Thread Nicholas Piggin
With the previous patch, lockdep hardirq state changes should not be
redundant. Softirq state changes already follow that pattern.

So warn on unexpected enable-when-enabled or disable-when-disabled
conditions, to catch possible errors or sloppy patterns that could
lead to similar bad behavior due to NMIs etc.

Signed-off-by: Nicholas Piggin 
---
 kernel/locking/lockdep.c   | 80 +-
 kernel/locking/lockdep_internals.h |  4 --
 kernel/locking/lockdep_proc.c  | 10 +---
 3 files changed, 35 insertions(+), 59 deletions(-)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 29a8de4c50b9..138458fb2234 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3649,15 +3649,8 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
 
-   if (unlikely(current->hardirqs_enabled)) {
-   /*
-* Neither irq nor preemption are disabled here
-* so this is racy by nature but losing one hit
-* in a stat is not a big deal.
-*/
-   __debug_atomic_inc(redundant_hardirqs_on);
+   if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled))
return;
-   }
 
/*
 * We're enabling irqs and according to our state above irqs weren't
@@ -3695,15 +3688,8 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
if (unlikely(!debug_locks || curr->lockdep_recursion))
return;
 
-   if (curr->hardirqs_enabled) {
-   /*
-* Neither irq nor preemption are disabled here
-* so this is racy by nature but losing one hit
-* in a stat is not a big deal.
-*/
-   __debug_atomic_inc(redundant_hardirqs_on);
+   if (DEBUG_LOCKS_WARN_ON(curr->hardirqs_enabled))
return;
-   }
 
/*
 * We're enabling irqs and according to our state above irqs weren't
@@ -3738,6 +3724,9 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
if (unlikely(!debug_locks || curr->lockdep_recursion))
return;
 
+   if (DEBUG_LOCKS_WARN_ON(!curr->hardirqs_enabled))
+   return;
+
/*
 * So we're supposed to get called after you mask local IRQs, but for
 * some reason the hardware doesn't quite think you did a proper job.
@@ -3745,17 +3734,13 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
 
-   if (curr->hardirqs_enabled) {
-   /*
-* We have done an ON -> OFF transition:
-*/
-   curr->hardirqs_enabled = 0;
-   curr->hardirq_disable_ip = ip;
-   curr->hardirq_disable_event = ++curr->irq_events;
-   debug_atomic_inc(hardirqs_off_events);
-   } else {
-   debug_atomic_inc(redundant_hardirqs_off);
-   }
+   /*
+* We have done an ON -> OFF transition:
+*/
+   curr->hardirqs_enabled = 0;
+   curr->hardirq_disable_ip = ip;
+   curr->hardirq_disable_event = ++curr->irq_events;
+   debug_atomic_inc(hardirqs_off_events);
 }
 EXPORT_SYMBOL_GPL(lockdep_hardirqs_off);
 
@@ -3769,6 +3754,9 @@ void lockdep_softirqs_on(unsigned long ip)
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
 
+   if (DEBUG_LOCKS_WARN_ON(curr->softirqs_enabled))
+   return;
+
/*
 * We fancy IRQs being disabled here, see softirq.c, avoids
 * funny state and nesting things.
@@ -3776,11 +3764,6 @@ void lockdep_softirqs_on(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
 
-   if (curr->softirqs_enabled) {
-   debug_atomic_inc(redundant_softirqs_on);
-   return;
-   }
-
current->lockdep_recursion++;
/*
 * We'll do an OFF -> ON transition:
@@ -3809,26 +3792,26 @@ void lockdep_softirqs_off(unsigned long ip)
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
 
+   if (DEBUG_LOCKS_WARN_ON(!curr->softirqs_enabled))
+   return;
+
/*
 * We fancy IRQs being disabled here, see softirq.c
 */
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
 
-   if (curr->softirqs_enabled) {
-   /*
-* We have done an ON -> OFF transition:
-*/
-   curr->softirqs_enabled = 0;
-   curr->softirq_disable_ip = ip;
-   curr->softirq_disable_event = ++curr->irq_events;
-   debug_atomic_inc(softirqs_off_events);
-   /*
-* Whoops, we wanted softirqs off, so why aren't they?
-*/
-   

[PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-07-23 Thread Nicholas Piggin
If an interrupt is not masked by local_irq_disable (e.g., a powerpc perf
interrupt), then it can hit in local_irq_enable() after trace_hardirqs_on()
and before raw_local_irq_enable().

If that interrupt handler calls local_irq_save(), it will call
trace_hardirqs_off() but the local_irq_restore() will not call
trace_hardirqs_on() again because raw_irqs_disabled_flags(flags) is true.

This can lead lockdep_assert_irqs_enabled() to trigger false positive
warnings.

Fix this by being careful to only enable and disable trace_hardirqs with
the outer-most irq enable/disable.

Reported-by: Alexey Kardashevskiy 
Signed-off-by: Nicholas Piggin 
---

I haven't tested on other architectures but I imagine NMIs in general
might cause a similar problem.

Other architectures might have to be updated for patch 2, but there's
a lot of asm around interrupt/return, so I didn't have a very good
look. The warnings should be harmless enough and uncover most places
that need updating.
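
As an aside, the interleaving described in the changelog can be modelled
in a few lines of user-space C (purely illustrative, names made up), which
may help when reviewing the ordering:

/*
 * Toy model of the race: the "tracing" flag is updated separately from
 * the "hardware" flag, and an NMI-like event in between leaves it stale.
 */
#include <stdbool.h>
#include <stdio.h>

static bool hw_irqs_on;		/* models the raw MSR[EE] state         */
static bool lockdep_irqs_on;	/* models current->hardirqs_enabled     */

static void perf_interrupt(void)	/* NMI-like: can hit at any point */
{
	bool flags = hw_irqs_on;	/* local_irq_save(flags)          */
	hw_irqs_on = false;
	lockdep_irqs_on = false;	/* trace_hardirqs_off()           */
	/*
	 * local_irq_restore(flags): flags say "disabled", so the current
	 * code does not call trace_hardirqs_on() here.
	 */
	hw_irqs_on = flags;
}

int main(void)
{
	/* local_irq_enable(): tracing state first, hardware state second */
	lockdep_irqs_on = true;		/* trace_hardirqs_on()            */
	perf_interrupt();		/* hits in the window             */
	hw_irqs_on = true;		/* raw_local_irq_enable()         */

	printf("hw irqs on: %d, lockdep thinks: %d\n",
	       hw_irqs_on, lockdep_irqs_on);	/* 1 vs 0: out of sync */
	return 0;
}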

 arch/powerpc/include/asm/hw_irq.h | 11 ---
 include/linux/irqflags.h  | 29 ++---
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index 3a0db7b0b46e..35060be09073 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
 #define powerpc_local_irq_pmu_save(flags)  \
 do {   \
raw_local_irq_pmu_save(flags);  \
-   trace_hardirqs_off();   \
+   if (!raw_irqs_disabled_flags(flags))\
+   trace_hardirqs_off();   \
} while(0)
 #define powerpc_local_irq_pmu_restore(flags)   \
do {\
-   if (raw_irqs_disabled_flags(flags)) {   \
-   raw_local_irq_pmu_restore(flags);   \
-   trace_hardirqs_off();   \
-   } else {\
+   if (!raw_irqs_disabled_flags(flags))\
trace_hardirqs_on();\
-   raw_local_irq_pmu_restore(flags);   \
-   }   \
+   raw_local_irq_pmu_restore(flags);   \
} while(0)
 #else
 #define powerpc_local_irq_pmu_save(flags)  \
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 6384d2813ded..571ee29ecefc 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -163,26 +163,33 @@ do {  \
  * if !TRACE_IRQFLAGS.
  */
 #ifdef CONFIG_TRACE_IRQFLAGS
-#define local_irq_enable() \
-   do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0)
-#define local_irq_disable() \
-   do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)
+#define local_irq_enable() \
+   do {\
+   trace_hardirqs_on();\
+   raw_local_irq_enable(); \
+   } while (0)
+
+#define local_irq_disable()\
+   do {\
+   bool was_disabled = raw_irqs_disabled(); \
+   raw_local_irq_disable();\
+   if (!was_disabled)  \
+   trace_hardirqs_off();   \
+   } while (0)
+
 #define local_irq_save(flags)  \
do {\
raw_local_irq_save(flags);  \
-   trace_hardirqs_off();   \
+   if (!raw_irqs_disabled_flags(flags))\
+   trace_hardirqs_off();   \
} while (0)
 
 
 #define local_irq_restore(flags)   \
do {\
-   if (raw_irqs_disabled_flags(flags)) {   \
-   raw_local_irq_restore(flags);   \
-   trace_hardirqs_off();   \
-   } else {\
+   if (!raw_irqs_disabled_flags(flags))\
trace_hardirqs_on();\
-   raw_local_irq_restore(flags);   \
-   }   \
+   raw_local_irq_restore(flags);   \
} while (0)
 
 #define safe_halt()\
-- 
2.23.0



Re: [PATCH 0/7] powerpc/watchpoint: 2nd DAWR kvm enablement + selftests

2020-07-23 Thread Ravi Bangoria




On 7/23/20 3:50 PM, Ravi Bangoria wrote:

Patch #1, #2 and #3 enables p10 2nd DAWR feature for Book3S kvm guest. DAWR
is a hypervisor resource and thus H_SET_MODE hcall is used to set/unset it.
A new case H_SET_MODE_RESOURCE_SET_DAWR1 is introduced in H_SET_MODE hcall
for setting/unsetting 2nd DAWR. Also, new capability KVM_CAP_PPC_DAWR1 has
been added to query 2nd DAWR support via kvm ioctl.

This feature also needs to be enabled in Qemu to really use it. I'll reply with a
link to the qemu patches once I post them to the qemu-devel mailing list.


Qemu patches: 
https://lore.kernel.org/kvm/20200723104220.314671-1-ravi.bango...@linux.ibm.com


Re: [PATCH] powerpc/64s: Fix irq tracing corruption in interrupt/syscall return caused by perf interrupts

2020-07-23 Thread Nicholas Piggin
Excerpts from Alexey Kardashevskiy's message of July 22, 2020 8:50 pm:
> 
> 
> On 22/07/2020 17:34, Nicholas Piggin wrote:
>> Alexey reports lockdep_assert_irqs_enabled() warnings when stress testing 
>> perf, e.g.,
>> 
>> WARNING: CPU: 0 PID: 1556 at kernel/softirq.c:169 
>> __local_bh_enable_ip+0x258/0x270
>> CPU: 0 PID: 1556 Comm: syz-executor
>> NIP:  c01ec888 LR: c01ec884 CTR: c0ef0610
>> REGS: c00022d4f8a0 TRAP: 0700   Not tainted  (5.8.0-rc3-x)
>> MSR:  80029033   CR: 28008844  XER: 2004
>> CFAR: c01dc1d0 IRQMASK: 0
>> 
>> The interesting thing is MSR[EE] and IRQMASK shows interrupts are enabled,
>> suggesting the current->hardirqs_enabled irq tracing state is going out of 
>> sync
>> with the actual interrupt enable state.
>> 
>> The cause is a window in interrupt/syscall return where irq tracing state is 
>> being
>> adjusted for an irqs-enabled return while MSR[EE] is still enabled. A perf
>> interrupt hits and ends up calling trace_hardirqs_off() when restoring
>> interrupt flags to a disabled state.
>> 
>> Fix this by disabling perf interrupts as well while adjusting irq tracing 
>> state.
>> 
>> Add a debug check that catches the condition sooner.
>> 
>> Fixes: 68b34588e202 ("powerpc/64/sycall: Implement syscall entry/exit logic 
>> in C")
>> Reported-by: Alexey Kardashevskiy 
>> Signed-off-by: Nicholas Piggin 
>> ---
>> 
>> I can reproduce similar symptoms and this patch fixes my test case,
>> still trying to confirm Alexey's test case or whether there's another
>> similar bug causing it.
> 
> 
> This does not fix my testcase. I applied this on top of 4fa640dc5230
> ("Merge tag 'vfio-v5.8-rc7' of git://github.com/awilliam/linux-vfio into
> master")  without any of my testing code, just to be clear. Sorry...

Okay it seems to be a bigger problem and not actually caused by that
patch but was possible for lockdep hardirqs_enabled state to get out 
of synch with the local_irq_disable() state before that too. Root
cause is similar -- perf interrupts hitting between updating the two
different bits of state.

Not quite sure why Alexey's test wasn't hitting it before the patch,
but possibly the way masked interrupts get replayed. But I was able
to hit the problem with a different assertion.

I think I have a fix, but it seems to be a generic irq tracing code
issue. So this patch can be dropped, and it's not an urgent issue for
the next release (it only triggers warns on rare occasions and only
when lockdep is enabled).

Thanks,
Nick



[PATCH 7/7] powerpc/selftests: Add selftest to test concurrent perf/ptrace events

2020-07-23 Thread Ravi Bangoria
ptrace and perf watchpoints can't co-exist if their address range
overlaps. See commit 29da4f91c0c1 ("powerpc/watchpoint: Don't allow
concurrent perf and ptrace events") for more detail. Add selftest
for the same.

Sample o/p:
  # ./ptrace-perf-hwbreak
  test: ptrace-perf-hwbreak
  tags: git_version:powerpc-5.8-7-118-g937fa174a15d-dirty
  perf cpu event -> ptrace thread event (Overlapping): Ok
  perf cpu event -> ptrace thread event (Non-overlapping): Ok
  perf thread event -> ptrace same thread event (Overlapping): Ok
  perf thread event -> ptrace same thread event (Non-overlapping): Ok
  perf thread event -> ptrace other thread event: Ok
  ptrace thread event -> perf kernel event: Ok
  ptrace thread event -> perf same thread event (Overlapping): Ok
  ptrace thread event -> perf same thread event (Non-overlapping): Ok
  ptrace thread event -> perf other thread event: Ok
  ptrace thread event -> perf cpu event (Overlapping): Ok
  ptrace thread event -> perf cpu event (Non-overlapping): Ok
  ptrace thread event -> perf same thread & cpu event (Overlapping): Ok
  ptrace thread event -> perf same thread & cpu event (Non-overlapping): Ok
  ptrace thread event -> perf other thread & cpu event: Ok
  success: ptrace-perf-hwbreak

Signed-off-by: Ravi Bangoria 
---
 .../selftests/powerpc/ptrace/.gitignore   |   1 +
 .../testing/selftests/powerpc/ptrace/Makefile |   2 +-
 .../powerpc/ptrace/ptrace-perf-hwbreak.c  | 659 ++
 3 files changed, 661 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c

diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore 
b/tools/testing/selftests/powerpc/ptrace/.gitignore
index 0e96150b7c7e..eb75e5360e31 100644
--- a/tools/testing/selftests/powerpc/ptrace/.gitignore
+++ b/tools/testing/selftests/powerpc/ptrace/.gitignore
@@ -14,3 +14,4 @@ perf-hwbreak
 core-pkey
 ptrace-pkey
 ptrace-syscall
+ptrace-perf-hwbreak
diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile 
b/tools/testing/selftests/powerpc/ptrace/Makefile
index 8d3f006c98cc..a500639da97a 100644
--- a/tools/testing/selftests/powerpc/ptrace/Makefile
+++ b/tools/testing/selftests/powerpc/ptrace/Makefile
@@ -2,7 +2,7 @@
 TEST_GEN_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \
   ptrace-tar ptrace-tm-tar ptrace-tm-spd-tar ptrace-vsx 
ptrace-tm-vsx \
   ptrace-tm-spd-vsx ptrace-tm-spr ptrace-hwbreak ptrace-pkey 
core-pkey \
-  perf-hwbreak ptrace-syscall
+  perf-hwbreak ptrace-syscall ptrace-perf-hwbreak
 
 top_srcdir = ../../../../..
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c 
b/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c
new file mode 100644
index ..6b8804a4942e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c
@@ -0,0 +1,659 @@
+// SPDX-License-Identifier: GPL-2.0+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "ptrace.h"
+
+char data[16];
+
+/* Overlapping address range */
+volatile __u64 *ptrace_data1 = (__u64 *)&data[0];
+volatile __u64 *perf_data1 = (__u64 *)&data[4];
+
+/* Non-overlapping address range */
+volatile __u64 *ptrace_data2 = (__u64 *)&data[0];
+volatile __u64 *perf_data2 = (__u64 *)&data[8];
+
+static unsigned long pid_max_addr(void)
+{
+   FILE *fp;
+   char *line, *c;
+   char addr[100];
+   size_t len = 0;
+
+   fp = fopen("/proc/kallsyms", "r");
+   if (!fp) {
+   printf("Failed to read /proc/kallsyms. Exiting..\n");
+   exit(EXIT_FAILURE);
+   }
+
+   while (getline(&line, &len, fp) != -1) {
+   if (!strstr(line, "pid_max") || strstr(line, "pid_max_max") ||
+   strstr(line, "pid_max_min"))
+   continue;
+
+   strncpy(addr, line, len < 100 ? len : 100);
+   c = strchr(addr, ' ');
+   *c = '\0';
+   return strtoul(addr, &c, 16);
+   }
+   fclose(fp);
+   printf("Could not find pid_max. Exiting..\n");
+   exit(EXIT_FAILURE);
+   return -1;
+}
+
+static void perf_user_event_attr_set(struct perf_event_attr *attr, __u64 addr, 
__u64 len)
+{
+   memset(attr, 0, sizeof(struct perf_event_attr));
+   attr->type   = PERF_TYPE_BREAKPOINT;
+   attr->size   = sizeof(struct perf_event_attr);
+   attr->bp_type= HW_BREAKPOINT_R;
+   attr->bp_addr= addr;
+   attr->bp_len = len;
+   attr->exclude_kernel = 1;
+   attr->exclude_hv = 1;
+}
+
+static void perf_kernel_event_attr_set(struct perf_event_attr *attr)
+{
+   memset(attr, 0, sizeof(struct perf_event_attr));
+   attr->type   = PERF_TYPE_BREAKPOINT;
+   attr->size   = sizeof(struct perf_event_attr);
+   attr->bp_type= HW_BREAKPOINT_R;
+   attr->bp_addr= 

[PATCH 6/7] powerpc/selftests/perf-hwbreak: Add testcases for 2nd DAWR

2020-07-23 Thread Ravi Bangoria
Extend perf-hwbreak.c selftest to test multiple DAWRs. Also add
testcase for testing 512 byte boundary removal.

Sample o/p:
  # ./perf-hwbreak
  ...
  TESTED: Process specific, Two events, diff addr
  TESTED: Process specific, Two events, same addr
  TESTED: Process specific, Two events, diff addr, one is RO, other is WO
  TESTED: Process specific, Two events, same addr, one is RO, other is WO
  TESTED: Systemwide, Two events, diff addr
  TESTED: Systemwide, Two events, same addr
  TESTED: Systemwide, Two events, diff addr, one is RO, other is WO
  TESTED: Systemwide, Two events, same addr, one is RO, other is WO
  TESTED: Process specific, 512 bytes, unaligned
  success: perf_hwbreak

Signed-off-by: Ravi Bangoria 
---
 .../selftests/powerpc/ptrace/perf-hwbreak.c   | 568 +-
 1 file changed, 567 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c 
b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
index bde475341c8a..5df08738884d 100644
--- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
@@ -21,8 +21,13 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -34,6 +39,12 @@
 
 #define DAWR_LENGTH_MAX ((0x3f + 1) * 8)
 
+int nprocs;
+
+static volatile int a = 10;
+static volatile int b = 10;
+static volatile char c[512 + 8] __attribute__((aligned(512)));
+
 static void perf_event_attr_set(struct perf_event_attr *attr,
__u32 type, __u64 addr, __u64 len,
bool exclude_user)
@@ -68,6 +79,76 @@ static int perf_process_event_open(__u32 type, __u64 addr, 
__u64 len)
return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
 }
 
+static int perf_cpu_event_open(long cpu, __u32 type, __u64 addr, __u64 len)
+{
+   struct perf_event_attr attr;
+
+   perf_event_attr_set(&attr, type, addr, len, 0);
+   return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
+}
+
+static void close_fds(int *fd, int n)
+{
+   int i;
+
+   for (i = 0; i < n; i++)
+   close(fd[i]);
+}
+
+static unsigned long read_fds(int *fd, int n)
+{
+   int i;
+   unsigned long c = 0;
+   unsigned long count = 0;
+   size_t res;
+
+   for (i = 0; i < n; i++) {
+   res = read(fd[i], &c, sizeof(c));
+   assert(res == sizeof(unsigned long long));
+   count += c;
+   }
+   return count;
+}
+
+static void reset_fds(int *fd, int n)
+{
+   int i;
+
+   for (i = 0; i < n; i++)
+   ioctl(fd[i], PERF_EVENT_IOC_RESET);
+}
+
+static void enable_fds(int *fd, int n)
+{
+   int i;
+
+   for (i = 0; i < n; i++)
+   ioctl(fd[i], PERF_EVENT_IOC_ENABLE);
+}
+
+static void disable_fds(int *fd, int n)
+{
+   int i;
+
+   for (i = 0; i < n; i++)
+   ioctl(fd[i], PERF_EVENT_IOC_DISABLE);
+}
+
+static int perf_systemwide_event_open(int *fd, __u32 type, __u64 addr, __u64 
len)
+{
+   int i = 0;
+
+   /* Assume online processors are 0 to nprocs for simplicity */
+   for (i = 0; i < nprocs; i++) {
+   fd[i] = perf_cpu_event_open(i, type, addr, len);
+   if (fd[i] < 0) {
+   close_fds(fd, i);
+   return fd[i];
+   }
+   }
+   return 0;
+}
+
 static inline bool breakpoint_test(int len)
 {
int fd;
@@ -261,11 +342,483 @@ static int runtest_dar_outside(void)
return fail;
 }
 
+static void multi_dawr_workload(void)
+{
+   a += 10;
+   b += 10;
+   c[512 + 1] += 'a';
+}
+
+static int test_process_multi_diff_addr(void)
+{
+   unsigned long long breaks1 = 0, breaks2 = 0;
+   int fd1, fd2;
+   char *desc = "Process specific, Two events, diff addr";
+   size_t res;
+
+   fd1 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a));
+   if (fd1 < 0) {
+   perror("perf_process_event_open");
+   exit(EXIT_FAILURE);
+   }
+
+   fd2 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&b, (__u64)sizeof(b));
+   if (fd2 < 0) {
+   close(fd1);
+   perror("perf_process_event_open");
+   exit(EXIT_FAILURE);
+   }
+
+   ioctl(fd1, PERF_EVENT_IOC_RESET);
+   ioctl(fd2, PERF_EVENT_IOC_RESET);
+   ioctl(fd1, PERF_EVENT_IOC_ENABLE);
+   ioctl(fd2, PERF_EVENT_IOC_ENABLE);
+   multi_dawr_workload();
+   ioctl(fd1, PERF_EVENT_IOC_DISABLE);
+   ioctl(fd2, PERF_EVENT_IOC_DISABLE);
+
+   res = read(fd1, &breaks1, sizeof(breaks1));
+   assert(res == sizeof(unsigned long long));
+   res = read(fd2, &breaks2, sizeof(breaks2));
+   assert(res == sizeof(unsigned long long));
+
+   close(fd1);
+   close(fd2);
+
+   if (breaks1 != 2 || breaks2 != 2) {
+   printf("FAILED: %s: %lld != 2 || %lld 

[PATCH 5/7] powerpc/selftests/perf-hwbreak: Coalesce event creation code

2020-07-23 Thread Ravi Bangoria
The perf-hwbreak selftest opens hw-breakpoint events at multiple places,
repeating the same code each time. Coalesce that code into a function.

Signed-off-by: Ravi Bangoria 
---
 .../selftests/powerpc/ptrace/perf-hwbreak.c   | 78 +--
 1 file changed, 38 insertions(+), 40 deletions(-)

diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c 
b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
index c1f324afdbf3..bde475341c8a 100644
--- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
@@ -34,28 +34,46 @@
 
 #define DAWR_LENGTH_MAX ((0x3f + 1) * 8)
 
-static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
- int cpu, int group_fd,
- unsigned long flags)
+static void perf_event_attr_set(struct perf_event_attr *attr,
+   __u32 type, __u64 addr, __u64 len,
+   bool exclude_user)
 {
-   attr->size = sizeof(*attr);
-   return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+   memset(attr, 0, sizeof(struct perf_event_attr));
+   attr->type   = PERF_TYPE_BREAKPOINT;
+   attr->size   = sizeof(struct perf_event_attr);
+   attr->bp_type= type;
+   attr->bp_addr= addr;
+   attr->bp_len = len;
+   attr->exclude_kernel = 1;
+   attr->exclude_hv = 1;
+   attr->exclude_guest  = 1;
+   attr->exclude_user   = exclude_user;
+   attr->disabled   = 1;
 }
 
-static inline bool breakpoint_test(int len)
+static int
+perf_process_event_open_exclude_user(__u32 type, __u64 addr, __u64 len, bool 
exclude_user)
 {
struct perf_event_attr attr;
+
+   perf_event_attr_set(&attr, type, addr, len, exclude_user);
+   return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+}
+
+static int perf_process_event_open(__u32 type, __u64 addr, __u64 len)
+{
+   struct perf_event_attr attr;
+
+   perf_event_attr_set(&attr, type, addr, len, 0);
+   return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+}
+
+static inline bool breakpoint_test(int len)
+{
int fd;
 
-   /* setup counters */
-   memset(&attr, 0, sizeof(attr));
-   attr.disabled = 1;
-   attr.type = PERF_TYPE_BREAKPOINT;
-   attr.bp_type = HW_BREAKPOINT_R;
/* bp_addr can point anywhere but needs to be aligned */
-   attr.bp_addr = (__u64)(&fd) & 0xfffffffffffff800;
-   attr.bp_len = len;
-   fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+   fd = perf_process_event_open(HW_BREAKPOINT_R, (__u64)(&fd) & 0xfffffffffffff800, len);
if (fd < 0)
return false;
close(fd);
@@ -75,7 +93,6 @@ static inline bool dawr_supported(void)
 static int runtestsingle(int readwriteflag, int exclude_user, int arraytest)
 {
int i,j;
-   struct perf_event_attr attr;
size_t res;
unsigned long long breaks, needed;
int readint;
@@ -94,19 +111,11 @@ static int runtestsingle(int readwriteflag, int 
exclude_user, int arraytest)
if (arraytest)
ptr = [0];
 
-   /* setup counters */
-   memset(&attr, 0, sizeof(attr));
-   attr.disabled = 1;
-   attr.type = PERF_TYPE_BREAKPOINT;
-   attr.bp_type = readwriteflag;
-   attr.bp_addr = (__u64)ptr;
-   attr.bp_len = sizeof(int);
-   if (arraytest)
-   attr.bp_len = DAWR_LENGTH_MAX;
-   attr.exclude_user = exclude_user;
-   break_fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+   break_fd = perf_process_event_open_exclude_user(readwriteflag, 
(__u64)ptr,
+   arraytest ? DAWR_LENGTH_MAX : sizeof(int),
+   exclude_user);
if (break_fd < 0) {
-   perror("sys_perf_event_open");
+   perror("perf_process_event_open_exclude_user");
exit(1);
}
 
@@ -153,7 +162,6 @@ static int runtest_dar_outside(void)
void *target;
volatile __u16 temp16;
volatile __u64 temp64;
-   struct perf_event_attr attr;
int break_fd;
unsigned long long breaks;
int fail = 0;
@@ -165,21 +173,11 @@ static int runtest_dar_outside(void)
exit(EXIT_FAILURE);
}
 
-   /* setup counters */
-   memset(&attr, 0, sizeof(attr));
-   attr.disabled = 1;
-   attr.type = PERF_TYPE_BREAKPOINT;
-   attr.exclude_kernel = 1;
-   attr.exclude_hv = 1;
-   attr.exclude_guest = 1;
-   attr.bp_type = HW_BREAKPOINT_RW;
/* watch middle half of target array */
-   attr.bp_addr = (__u64)(target + 2);
-   attr.bp_len = 4;
-   break_fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+   break_fd = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)(target + 
2), 4);
if (break_fd < 0) {
free(target);
-   perror("sys_perf_event_open");
+ 

[PATCH 4/7] powerpc/selftests/ptrace-hwbreak: Add testcases for 2nd DAWR

2020-07-23 Thread Ravi Bangoria
Add selftests to test multiple active DAWRs with ptrace interface.

Sample o/p:
  $ ./ptrace-hwbreak
  ...
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO, len: 6: Ok

Signed-off-by: Ravi Bangoria 
---
 .../selftests/powerpc/ptrace/ptrace-hwbreak.c | 79 +++
 1 file changed, 79 insertions(+)

diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c 
b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
index fc477dfe86a2..65781f4035c1 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
@@ -185,6 +185,18 @@ static void test_workload(void)
big_var[rand() % DAWR_MAX_LEN] = 'a';
else
cvar = big_var[rand() % DAWR_MAX_LEN];
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO test */
+   gstruct.a[rand() % A_LEN] = 'a';
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO test */
+   cvar = gstruct.b[rand() % B_LEN];
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO test */
+   gstruct.a[rand() % A_LEN] = 'a';
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO test */
+   cvar = gstruct.a[rand() % A_LEN];
 }
 
 static void check_success(pid_t child_pid, const char *name, const char *type,
@@ -374,6 +386,69 @@ static void test_sethwdebug_range_aligned(pid_t child_pid)
ptrace_delhwdebug(child_pid, wh);
 }
 
+static void test_multi_sethwdebug_range(pid_t child_pid)
+{
+   struct ppc_hw_breakpoint info1, info2;
+   unsigned long wp_addr1, wp_addr2;
+   char *name1 = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED";
+   char *name2 = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED";
+   int len1, len2;
+   int wh1, wh2;
+
+   wp_addr1 = (unsigned long)&gstruct.a;
+   wp_addr2 = (unsigned long)&gstruct.b;
+   len1 = A_LEN;
+   len2 = B_LEN;
+   get_ppc_hw_breakpoint(&info1, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr1, len1);
+   get_ppc_hw_breakpoint(&info2, PPC_BREAKPOINT_TRIGGER_READ, wp_addr2, len2);
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO test */
+   wh1 = ptrace_sethwdebug(child_pid, &info1);
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO test */
+   wh2 = ptrace_sethwdebug(child_pid, &info2);
+
+   ptrace(PTRACE_CONT, child_pid, NULL, 0);
+   check_success(child_pid, name1, "WO", wp_addr1, len1);
+
+   ptrace(PTRACE_CONT, child_pid, NULL, 0);
+   check_success(child_pid, name2, "RO", wp_addr2, len2);
+
+   ptrace_delhwdebug(child_pid, wh1);
+   ptrace_delhwdebug(child_pid, wh2);
+}
+
+static void test_multi_sethwdebug_range_dawr_overlap(pid_t child_pid)
+{
+   struct ppc_hw_breakpoint info1, info2;
+   unsigned long wp_addr1, wp_addr2;
+   char *name = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap";
+   int len1, len2;
+   int wh1, wh2;
+
+   wp_addr1 = (unsigned long)&gstruct.a;
+   wp_addr2 = (unsigned long)&gstruct.a;
+   len1 = A_LEN;
+   len2 = A_LEN;
+   get_ppc_hw_breakpoint(&info1, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr1, len1);
+   get_ppc_hw_breakpoint(&info2, PPC_BREAKPOINT_TRIGGER_READ, wp_addr2, len2);
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO test */
+   wh1 = ptrace_sethwdebug(child_pid, &info1);
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO test */
+   wh2 = ptrace_sethwdebug(child_pid, &info2);
+
+   ptrace(PTRACE_CONT, child_pid, NULL, 0);
+   check_success(child_pid, name, "WO", wp_addr1, len1);
+
+   ptrace(PTRACE_CONT, child_pid, NULL, 0);
+   check_success(child_pid, name, "RO", wp_addr2, len2);
+
+   ptrace_delhwdebug(child_pid, wh1);
+   ptrace_delhwdebug(child_pid, wh2);
+}
+
 static void test_sethwdebug_range_unaligned(pid_t child_pid)
 {
struct ppc_hw_breakpoint info;
@@ -460,6 +535,10 @@ run_tests(pid_t child_pid, struct ppc_debug_info *dbginfo, 
bool dawr)
test_sethwdebug_range_unaligned(child_pid);
test_sethwdebug_range_unaligned_dar(child_pid);
test_sethwdebug_dawr_max_range(child_pid);
+   if (dbginfo->num_data_bps > 1) {
+   test_multi_sethwdebug_range(child_pid);
+   test_multi_sethwdebug_range_dawr_overlap(child_pid);
+   }
}
}
 }
-- 
2.26.2



[PATCH 1/7] powerpc/watchpoint/kvm: Rename current DAWR macros and variables

2020-07-23 Thread Ravi Bangoria
Power10 is introducing second DAWR. Use real register names (with
suffix 0) from ISA for current macros and variables used by kvm.

Signed-off-by: Ravi Bangoria 
---
 Documentation/virt/kvm/api.rst|  4 +--
 arch/powerpc/include/asm/kvm_host.h   |  4 +--
 arch/powerpc/include/uapi/asm/kvm.h   |  4 +--
 arch/powerpc/kernel/asm-offsets.c |  4 +--
 arch/powerpc/kvm/book3s_hv.c  | 32 +++
 arch/powerpc/kvm/book3s_hv_nested.c   |  8 +++---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 20 +++---
 tools/arch/powerpc/include/uapi/asm/kvm.h |  4 +--
 8 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 426f94582b7a..4dc18fe6a2bf 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -2219,8 +2219,8 @@ registers, find a list below:
   PPC KVM_REG_PPC_BESCR   64
   PPC KVM_REG_PPC_TAR 64
   PPC KVM_REG_PPC_DPDES   64
-  PPC KVM_REG_PPC_DAWR64
-  PPC KVM_REG_PPC_DAWRX   64
+  PPC KVM_REG_PPC_DAWR0   64
+  PPC KVM_REG_PPC_DAWRX0  64
   PPC KVM_REG_PPC_CIABR   64
   PPC KVM_REG_PPC_IC  64
   PPC KVM_REG_PPC_VTB 64
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 7e2d061d0445..9aa3854f0e1e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -582,8 +582,8 @@ struct kvm_vcpu_arch {
u32 ctrl;
u32 dabrx;
ulong dabr;
-   ulong dawr;
-   ulong dawrx;
+   ulong dawr0;
+   ulong dawrx0;
ulong ciabr;
ulong cfar;
ulong ppr;
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index 264e266a85bf..38d61b73f5ed 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -608,8 +608,8 @@ struct kvm_ppc_cpu_char {
 #define KVM_REG_PPC_BESCR  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa7)
 #define KVM_REG_PPC_TAR(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa8)
 #define KVM_REG_PPC_DPDES  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa9)
-#define KVM_REG_PPC_DAWR   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa)
-#define KVM_REG_PPC_DAWRX  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab)
+#define KVM_REG_PPC_DAWR0  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa)
+#define KVM_REG_PPC_DAWRX0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab)
 #define KVM_REG_PPC_CIABR  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xac)
 #define KVM_REG_PPC_IC (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xad)
 #define KVM_REG_PPC_VTB(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xae)
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 6657dc6b2336..e76bffe348e1 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -547,8 +547,8 @@ int main(void)
OFFSET(VCPU_CTRL, kvm_vcpu, arch.ctrl);
OFFSET(VCPU_DABR, kvm_vcpu, arch.dabr);
OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx);
-   OFFSET(VCPU_DAWR, kvm_vcpu, arch.dawr);
-   OFFSET(VCPU_DAWRX, kvm_vcpu, arch.dawrx);
+   OFFSET(VCPU_DAWR0, kvm_vcpu, arch.dawr0);
+   OFFSET(VCPU_DAWRX0, kvm_vcpu, arch.dawrx0);
OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr);
OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags);
OFFSET(VCPU_DEC, kvm_vcpu, arch.dec);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 89afcc5f60ca..28200e4f5d27 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -778,8 +778,8 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, 
unsigned long mflags,
return H_UNSUPPORTED_FLAG_START;
if (value2 & DABRX_HYP)
return H_P4;
-   vcpu->arch.dawr  = value1;
-   vcpu->arch.dawrx = value2;
+   vcpu->arch.dawr0  = value1;
+   vcpu->arch.dawrx0 = value2;
return H_SUCCESS;
case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
/* KVM does not support mflags=2 (AIL=2) */
@@ -1724,11 +1724,11 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, 
u64 id,
case KVM_REG_PPC_VTB:
*val = get_reg_val(id, vcpu->arch.vcore->vtb);
break;
-   case KVM_REG_PPC_DAWR:
-   *val = get_reg_val(id, vcpu->arch.dawr);
+   case KVM_REG_PPC_DAWR0:
+   *val = get_reg_val(id, vcpu->arch.dawr0);
break;
-   case KVM_REG_PPC_DAWRX:
-   *val = get_reg_val(id, vcpu->arch.dawrx);
+   case KVM_REG_PPC_DAWRX0:
+   *val = get_reg_val(id, vcpu->arch.dawrx0);
break;
case KVM_REG_PPC_CIABR:
*val = get_reg_val(id, vcpu->arch.ciabr);
@@ -1938,11 

[PATCH 3/7] powerpc/watchpoint/kvm: Introduce new capability for 2nd DAWR

2020-07-23 Thread Ravi Bangoria
Introduce KVM_CAP_PPC_DAWR1 which can be used by Qemu to query whether
kvm supports 2nd DAWR or not.
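
For reference, a minimal userspace sketch of such a query (assumes kernel
headers that already define KVM_CAP_PPC_DAWR1; error handling trimmed):

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  int main(void)
  {
          int kvm = open("/dev/kvm", O_RDWR);

          if (kvm < 0) {
                  perror("/dev/kvm");
                  return 1;
          }

          /* KVM_CHECK_EXTENSION returns > 0 if the capability is present */
          if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_DAWR1) > 0)
                  printf("2nd DAWR supported by kvm\n");
          else
                  printf("2nd DAWR not supported by kvm\n");
          return 0;
  }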

Signed-off-by: Ravi Bangoria 
---
 arch/powerpc/kvm/powerpc.c | 3 +++
 include/uapi/linux/kvm.h   | 1 +
 2 files changed, 4 insertions(+)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index dd7d141e33e8..f38380fd1fe9 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -676,6 +676,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
!kvmppc_hv_ops->enable_svm(NULL);
break;
 #endif
+   case KVM_CAP_PPC_DAWR1:
+   r = cpu_has_feature(CPU_FTR_DAWR1);
+   break;
default:
r = 0;
break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4fdf30316582..2c3713d6526a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1031,6 +1031,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_SECURE_GUEST 181
 #define KVM_CAP_HALT_POLL 182
 #define KVM_CAP_ASYNC_PF_INT 183
+#define KVM_CAP_PPC_DAWR1 184
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.26.2



[PATCH 2/7] powerpc/watchpoint/kvm: Add infrastructure to support 2nd DAWR

2020-07-23 Thread Ravi Bangoria
kvm code assumes single DAWR everywhere. Add code to support 2nd DAWR.
DAWR is a hypervisor resource and thus H_SET_MODE hcall is used to set/
unset it. Introduce new case H_SET_MODE_RESOURCE_SET_DAWR1 for 2nd DAWR.
Also, kvm will support 2nd DAWR only if CPU_FTR_DAWR1 is set.
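
For context, the guest-side call that lands in this new case is just another
H_SET_MODE invocation. A hedged sketch using the existing pseries wrapper
(the helper name below is made up; plpar_set_mode() is the existing
H_SET_MODE wrapper):

  #include <asm/hvcall.h>
  #include <asm/plpar_wrappers.h>

  /* Sketch only: how a pseries guest reaches the new DAWR1 case. */
  static long set_second_dawr(unsigned long dawr1, unsigned long dawrx1)
  {
          /* resource value 5 == H_SET_MODE_RESOURCE_SET_DAWR1 */
          return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR1, dawr1, dawrx1);
  }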

Signed-off-by: Ravi Bangoria 
---
 Documentation/virt/kvm/api.rst|  2 ++
 arch/powerpc/include/asm/hvcall.h |  2 ++
 arch/powerpc/include/asm/kvm_host.h   |  2 ++
 arch/powerpc/include/uapi/asm/kvm.h   |  4 +++
 arch/powerpc/kernel/asm-offsets.c |  2 ++
 arch/powerpc/kvm/book3s_hv.c  | 41 +++
 arch/powerpc/kvm/book3s_hv_nested.c   |  7 
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 23 +
 tools/arch/powerpc/include/uapi/asm/kvm.h |  4 +++
 9 files changed, 87 insertions(+)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 4dc18fe6a2bf..7b1d16c2ad24 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -2242,6 +2242,8 @@ registers, find a list below:
   PPC KVM_REG_PPC_PSSCR   64
   PPC KVM_REG_PPC_DEC_EXPIRY  64
   PPC KVM_REG_PPC_PTCR64
+  PPC KVM_REG_PPC_DAWR1   64
+  PPC KVM_REG_PPC_DAWRX1  64
   PPC KVM_REG_PPC_TM_GPR0 64
   ...
   PPC KVM_REG_PPC_TM_GPR3164
diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index 33793444144c..03f401d7be41 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -538,6 +538,8 @@ struct hv_guest_state {
s64 tb_offset;
u64 dawr0;
u64 dawrx0;
+   u64 dawr1;
+   u64 dawrx1;
u64 ciabr;
u64 hdec_expiry;
u64 purr;
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 9aa3854f0e1e..bda839edd5fe 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -584,6 +584,8 @@ struct kvm_vcpu_arch {
ulong dabr;
ulong dawr0;
ulong dawrx0;
+   ulong dawr1;
+   ulong dawrx1;
ulong ciabr;
ulong cfar;
ulong ppr;
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index 38d61b73f5ed..c5c0f128b46f 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -640,6 +640,10 @@ struct kvm_ppc_cpu_char {
 #define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf)
 #define KVM_REG_PPC_PTCR   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0)
 
+/* POWER10 registers. */
+#define KVM_REG_PPC_DAWR1  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc1)
+#define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc2)
+
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
  */
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index e76bffe348e1..ef2c0f3f5a7b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -549,6 +549,8 @@ int main(void)
OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx);
OFFSET(VCPU_DAWR0, kvm_vcpu, arch.dawr0);
OFFSET(VCPU_DAWRX0, kvm_vcpu, arch.dawrx0);
+   OFFSET(VCPU_DAWR1, kvm_vcpu, arch.dawr1);
+   OFFSET(VCPU_DAWRX1, kvm_vcpu, arch.dawrx1);
OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr);
OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags);
OFFSET(VCPU_DEC, kvm_vcpu, arch.dec);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 28200e4f5d27..24575520b2ea 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -781,6 +781,20 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, 
unsigned long mflags,
vcpu->arch.dawr0  = value1;
vcpu->arch.dawrx0 = value2;
return H_SUCCESS;
+   case H_SET_MODE_RESOURCE_SET_DAWR1:
+   if (!kvmppc_power8_compatible(vcpu))
+   return H_P2;
+   if (!ppc_breakpoint_available())
+   return H_P2;
+   if (!cpu_has_feature(CPU_FTR_DAWR1))
+   return H_P2;
+   if (mflags)
+   return H_UNSUPPORTED_FLAG_START;
+   if (value2 & DABRX_HYP)
+   return H_P4;
+   vcpu->arch.dawr1  = value1;
+   vcpu->arch.dawrx1 = value2;
+   return H_SUCCESS;
case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
/* KVM does not support mflags=2 (AIL=2) */
if (mflags != 0 && mflags != 3)
@@ -1730,6 +1744,12 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, 
u64 id,
case KVM_REG_PPC_DAWRX0:
*val = get_reg_val(id, vcpu->arch.dawrx0);
break;
+   case KVM_REG_PPC_DAWR1:
+   *val = get_reg_val(id, 

[PATCH 0/7] powerpc/watchpoint: 2nd DAWR kvm enablement + selftests

2020-07-23 Thread Ravi Bangoria
Patch #1, #2 and #3 enables p10 2nd DAWR feature for Book3S kvm guest. DAWR
is a hypervisor resource and thus H_SET_MODE hcall is used to set/unset it.
A new case H_SET_MODE_RESOURCE_SET_DAWR1 is introduced in H_SET_MODE hcall
for setting/unsetting 2nd DAWR. Also, new capability KVM_CAP_PPC_DAWR1 has
been added to query 2nd DAWR support via kvm ioctl.

This feature also needs to be enabled in Qemu to really use it. I'll reply
link to qemu patches once I post them in qemu-devel mailing list.

Patch #4, #5, #6 and #7 adds selftests to test 2nd DAWR.

Dependency:
  1: p10 kvm base enablement
 
https://lore.kernel.org/linuxppc-dev/20200602055325.6102-1-alist...@popple.id.au

  2: 2nd DAWR powervm/baremetal enablement
 
https://lore.kernel.org/linuxppc-dev/20200723090813.303838-1-ravi.bango...@linux.ibm.com

  3: ptrace PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31 flag
 
https://lore.kernel.org/linuxppc-dev/20200723093330.306341-1-ravi.bango...@linux.ibm.com

Patches in this series applies fine on top of powerpc/next (9a77c4a0a125)
plus above dependency patches.

Ravi Bangoria (7):
  powerpc/watchpoint/kvm: Rename current DAWR macros and variables
  powerpc/watchpoint/kvm: Add infrastructure to support 2nd DAWR
  powerpc/watchpoint/kvm: Introduce new capability for 2nd DAWR
  powerpc/selftests/ptrace-hwbreak: Add testcases for 2nd DAWR
  powerpc/selftests/perf-hwbreak: Coalesce event creation code
  powerpc/selftests/perf-hwbreak: Add testcases for 2nd DAWR
  powerpc/selftests: Add selftest to test concurrent perf/ptrace events

 Documentation/virt/kvm/api.rst|   6 +-
 arch/powerpc/include/asm/hvcall.h |   2 +
 arch/powerpc/include/asm/kvm_host.h   |   6 +-
 arch/powerpc/include/uapi/asm/kvm.h   |   8 +-
 arch/powerpc/kernel/asm-offsets.c |   6 +-
 arch/powerpc/kvm/book3s_hv.c  |  73 +-
 arch/powerpc/kvm/book3s_hv_nested.c   |  15 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  43 +-
 arch/powerpc/kvm/powerpc.c|   3 +
 include/uapi/linux/kvm.h  |   1 +
 tools/arch/powerpc/include/uapi/asm/kvm.h |   8 +-
 .../selftests/powerpc/ptrace/.gitignore   |   1 +
 .../testing/selftests/powerpc/ptrace/Makefile |   2 +-
 .../selftests/powerpc/ptrace/perf-hwbreak.c   | 646 +++--
 .../selftests/powerpc/ptrace/ptrace-hwbreak.c |  79 +++
 .../powerpc/ptrace/ptrace-perf-hwbreak.c  | 659 ++
 16 files changed, 1476 insertions(+), 82 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c

-- 
2.26.2



[PATCH v2] powerpc/watchpoint/ptrace: Introduce PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31

2020-07-23 Thread Ravi Bangoria
PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31 can be used to determine
whether we are running on an ISA 3.1 compliant machine, which is
needed to determine DAR behaviour, the 512 byte boundary limit, etc.
This was requested by Pedro Miraglia Franco de Carvalho for
extending watchpoint features in gdb. Note that availability of
2nd DAWR is independent of this flag and should be checked using
ppc_debug_info->num_data_bps.
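
A hedged sketch of how a debugger could consume the new flag
(PPC_PTRACE_GETHWDBGINFO and struct ppc_debug_info are the existing ptrace
interfaces; the child is assumed to be traced already):

  #include <stdio.h>
  #include <sys/types.h>
  #include <sys/ptrace.h>
  #include <asm/ptrace.h>

  static void show_dawr_caps(pid_t child)
  {
          struct ppc_debug_info dbginfo;

          if (ptrace(PPC_PTRACE_GETHWDBGINFO, child, NULL, &dbginfo)) {
                  perror("PPC_PTRACE_GETHWDBGINFO");
                  return;
          }

          /* ISA 3.1 DAR/boundary behaviour and number of DAWRs are independent */
          printf("ISA 3.1 DAWR semantics: %s\n",
                 (dbginfo.features & PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31) ? "yes" : "no");
          printf("watchpoints available : %u\n", dbginfo.num_data_bps);
  }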

Signed-off-by: Ravi Bangoria 
---
v1->v2:
 - Mention new flag in Documentaion/ as well.

 Documentation/powerpc/ptrace.rst  | 1 +
 arch/powerpc/include/uapi/asm/ptrace.h| 1 +
 arch/powerpc/kernel/ptrace/ptrace-noadv.c | 5 -
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/Documentation/powerpc/ptrace.rst b/Documentation/powerpc/ptrace.rst
index 864d4b61..4d42290248cb 100644
--- a/Documentation/powerpc/ptrace.rst
+++ b/Documentation/powerpc/ptrace.rst
@@ -46,6 +46,7 @@ features will have bits indicating whether there is support 
for::
   #define PPC_DEBUG_FEATURE_DATA_BP_RANGE  0x4
   #define PPC_DEBUG_FEATURE_DATA_BP_MASK   0x8
   #define PPC_DEBUG_FEATURE_DATA_BP_DAWR   0x10
+  #define PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31   0x20
 
 2. PTRACE_SETHWDEBUG
 
diff --git a/arch/powerpc/include/uapi/asm/ptrace.h 
b/arch/powerpc/include/uapi/asm/ptrace.h
index f5f1ccc740fc..0a87bcd4300a 100644
--- a/arch/powerpc/include/uapi/asm/ptrace.h
+++ b/arch/powerpc/include/uapi/asm/ptrace.h
@@ -222,6 +222,7 @@ struct ppc_debug_info {
 #define PPC_DEBUG_FEATURE_DATA_BP_RANGE	0x0000000000000004
 #define PPC_DEBUG_FEATURE_DATA_BP_MASK		0x0000000000000008
 #define PPC_DEBUG_FEATURE_DATA_BP_DAWR		0x0000000000000010
+#define PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31	0x0000000000000020
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/powerpc/kernel/ptrace/ptrace-noadv.c 
b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
index 697c7e4b5877..b2de874d650b 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-noadv.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
@@ -52,8 +52,11 @@ void ppc_gethwdinfo(struct ppc_debug_info *dbginfo)
dbginfo->sizeof_condition = 0;
if (IS_ENABLED(CONFIG_HAVE_HW_BREAKPOINT)) {
dbginfo->features = PPC_DEBUG_FEATURE_DATA_BP_RANGE;
-   if (dawr_enabled())
+   if (dawr_enabled()) {
dbginfo->features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR;
+   if (cpu_has_feature(CPU_FTR_ARCH_31))
+   dbginfo->features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31;
+   }
} else {
dbginfo->features = 0;
}
-- 
2.26.2



[PATCH v5 10/10] powerpc/watchpoint: Remove 512 byte boundary

2020-07-23 Thread Ravi Bangoria
Power10 has removed the 512 byte boundary from the match criteria, i.e. the
watch range can now cross a 512 byte boundary.

Note: ISA 3.1 Book III 9.4 match criteria includes 512 byte limit but that
is a documentation mistake and hopefully will be fixed in the next version
of ISA. Though, ISA 3.1 change log mentions about removal of 512B boundary:

  Multiple DEAW:
  Added a second Data Address Watchpoint. [H]DAR is
  set to the first byte of overlap. 512B boundary is
  removed.

Signed-off-by: Ravi Bangoria 
---
 arch/powerpc/kernel/hw_breakpoint.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/hw_breakpoint.c 
b/arch/powerpc/kernel/hw_breakpoint.c
index c55e67bab271..1f4a1efa0074 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -418,8 +418,9 @@ static int hw_breakpoint_validate_len(struct 
arch_hw_breakpoint *hw)
 
if (dawr_enabled()) {
max_len = DAWR_MAX_LEN;
-   /* DAWR region can't cross 512 bytes boundary */
-   if (ALIGN_DOWN(start_addr, SZ_512) != ALIGN_DOWN(end_addr - 1, SZ_512))
+   /* DAWR region can't cross 512 bytes boundary on p10 predecessors */
+   if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
+   (ALIGN_DOWN(start_addr, SZ_512) != ALIGN_DOWN(end_addr - 1, SZ_512)))
return -EINVAL;
} else if (IS_ENABLED(CONFIG_PPC_8xx)) {
/* 8xx can setup a range without limitation */
-- 
2.26.2



[PATCH v5 09/10] powerpc/watchpoint: Return available watchpoints dynamically

2020-07-23 Thread Ravi Bangoria
So far Book3S Powerpc supported only one watchpoint. Power10 is
introducing 2nd DAWR. Enable 2nd DAWR support for Power10.
Availability of 2nd DAWR will depend on CPU_FTR_DAWR1.

Signed-off-by: Ravi Bangoria 
---
 arch/powerpc/include/asm/cputable.h  | 5 +++--
 arch/powerpc/include/asm/hw_breakpoint.h | 4 +++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index 5583f2d08df7..fa1232c33ab9 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -629,9 +629,10 @@ enum {
 
 /*
  * Maximum number of hw breakpoint supported on powerpc. Number of
- * breakpoints supported by actual hw might be less than this.
+ * breakpoints supported by actual hw might be less than this, which
+ * is decided at run time in nr_wp_slots().
  */
-#define HBP_NUM_MAX	1
+#define HBP_NUM_MAX	2
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h 
b/arch/powerpc/include/asm/hw_breakpoint.h
index cb424799da0d..c89250b6ac34 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -9,6 +9,8 @@
 #ifndef _PPC_BOOK3S_64_HW_BREAKPOINT_H
 #define _PPC_BOOK3S_64_HW_BREAKPOINT_H
 
+#include 
+
 #ifdef __KERNEL__
 struct arch_hw_breakpoint {
unsigned long   address;
@@ -46,7 +48,7 @@ struct arch_hw_breakpoint {
 
 static inline int nr_wp_slots(void)
 {
-   return HBP_NUM_MAX;
+   return cpu_has_feature(CPU_FTR_DAWR1) ? 2 : 1;
 }
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-- 
2.26.2



[PATCH v5 08/10] powerpc/watchpoint: Guest support for 2nd DAWR hcall

2020-07-23 Thread Ravi Bangoria
2nd DAWR can be set/unset using H_SET_MODE hcall with resource value 5.
Enable powervm guest support with that. This has no effect on kvm guest
because kvm will return error if guest does hcall with resource value 5.

Signed-off-by: Ravi Bangoria 
---
 arch/powerpc/include/asm/hvcall.h | 1 +
 arch/powerpc/include/asm/machdep.h| 2 +-
 arch/powerpc/include/asm/plpar_wrappers.h | 5 +
 arch/powerpc/kernel/dawr.c| 2 +-
 arch/powerpc/platforms/pseries/setup.c| 7 +--
 5 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index b785e9f0071c..33793444144c 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -358,6 +358,7 @@
 #define H_SET_MODE_RESOURCE_SET_DAWR0  2
 #define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE3
 #define H_SET_MODE_RESOURCE_LE 4
+#define H_SET_MODE_RESOURCE_SET_DAWR1  5
 
 /* Values for argument to H_SIGNAL_SYS_RESET */
 #define H_SIGNAL_SYS_RESET_ALL -1
diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index 7bcb6a39..a90b892f0bfe 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -131,7 +131,7 @@ struct machdep_calls {
unsigned long dabrx);
 
/* Set DAWR for this platform, leave empty for default implementation */
-   int (*set_dawr)(unsigned long dawr,
+   int (*set_dawr)(int nr, unsigned long dawr,
unsigned long dawrx);
 
 #ifdef CONFIG_PPC32/* XXX for now */
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h 
b/arch/powerpc/include/asm/plpar_wrappers.h
index d12c3680d946..ece84a430701 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -315,6 +315,11 @@ static inline long plpar_set_watchpoint0(unsigned long 
dawr0, unsigned long dawr
return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR0, dawr0, dawrx0);
 }
 
+static inline long plpar_set_watchpoint1(unsigned long dawr1, unsigned long 
dawrx1)
+{
+   return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR1, dawr1, dawrx1);
+}
+
 static inline long plpar_signal_sys_reset(long cpu)
 {
return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu);
diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c
index 500f52fa4711..cdc2dccb987d 100644
--- a/arch/powerpc/kernel/dawr.c
+++ b/arch/powerpc/kernel/dawr.c
@@ -37,7 +37,7 @@ int set_dawr(int nr, struct arch_hw_breakpoint *brk)
dawrx |= (mrd & 0x3f) << (63 - 53);
 
if (ppc_md.set_dawr)
-   return ppc_md.set_dawr(dawr, dawrx);
+   return ppc_md.set_dawr(nr, dawr, dawrx);
 
if (nr == 0) {
mtspr(SPRN_DAWR0, dawr);
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 2db8469e475f..d516ee8eb7fc 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -831,12 +831,15 @@ static int pseries_set_xdabr(unsigned long dabr, unsigned 
long dabrx)
return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx);
 }
 
-static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx)
+static int pseries_set_dawr(int nr, unsigned long dawr, unsigned long dawrx)
 {
/* PAPR says we can't set HYP */
dawrx &= ~DAWRX_HYP;
 
-   return  plpar_set_watchpoint0(dawr, dawrx);
+   if (nr == 0)
+   return plpar_set_watchpoint0(dawr, dawrx);
+   else
+   return plpar_set_watchpoint1(dawr, dawrx);
 }
 
 #define CMO_CHARACTERISTICS_TOKEN 44
-- 
2.26.2



[PATCH v5 07/10] powerpc/watchpoint: Rename current H_SET_MODE DAWR macro

2020-07-23 Thread Ravi Bangoria
Current H_SET_MODE hcall macro name for setting/resetting DAWR0 is
H_SET_MODE_RESOURCE_SET_DAWR. Add suffix 0 to macro name as well.

Signed-off-by: Ravi Bangoria 
Reviewed-by: Jordan Niethe 
---
 arch/powerpc/include/asm/hvcall.h | 2 +-
 arch/powerpc/include/asm/plpar_wrappers.h | 2 +-
 arch/powerpc/kvm/book3s_hv.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index 43486e773bd6..b785e9f0071c 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -355,7 +355,7 @@
 
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR  1
-#define H_SET_MODE_RESOURCE_SET_DAWR   2
+#define H_SET_MODE_RESOURCE_SET_DAWR0  2
 #define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE3
 #define H_SET_MODE_RESOURCE_LE 4
 
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h 
b/arch/powerpc/include/asm/plpar_wrappers.h
index 4293c5d2ddf4..d12c3680d946 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -312,7 +312,7 @@ static inline long plpar_set_ciabr(unsigned long ciabr)
 
 static inline long plpar_set_watchpoint0(unsigned long dawr0, unsigned long 
dawrx0)
 {
-   return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR, dawr0, dawrx0);
+   return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR0, dawr0, dawrx0);
 }
 
 static inline long plpar_signal_sys_reset(long cpu)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6bf66649ab92..7ad692c2d7c7 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -764,7 +764,7 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, 
unsigned long mflags,
return H_P3;
vcpu->arch.ciabr  = value1;
return H_SUCCESS;
-   case H_SET_MODE_RESOURCE_SET_DAWR:
+   case H_SET_MODE_RESOURCE_SET_DAWR0:
if (!kvmppc_power8_compatible(vcpu))
return H_P2;
if (!ppc_breakpoint_available())
-- 
2.26.2



[PATCH v5 06/10] powerpc/watchpoint: Set CPU_FTR_DAWR1 based on pa-features bit

2020-07-23 Thread Ravi Bangoria
As per the PAPR, bit 0 of byte 64 in pa-features property indicates
availability of 2nd DAWR registers. i.e. If this bit is set, 2nd
DAWR is present, otherwise not. Host generally uses "cpu-features",
which masks "pa-features". But "cpu-features" are still not used for
guests and thus this change is mostly applicable for guests only.

Signed-off-by: Ravi Bangoria 
Tested-by: Jordan Niethe 
---
 arch/powerpc/kernel/prom.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 033d43819ed8..01dda206d68e 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -175,6 +175,8 @@ static struct ibm_pa_feature {
 */
{ .pabyte = 22, .pabit = 0, .cpu_features = CPU_FTR_TM_COMP,
  .cpu_user_ftrs2 = PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_HTM_NOSC_COMP 
},
+
+   { .pabyte = 64, .pabit = 0, .cpu_features = CPU_FTR_DAWR1 },
 };
 
 static void __init scan_features(unsigned long node, const unsigned char *ftrs,
-- 
2.26.2



[PATCH v5 05/10] powerpc/dt_cpu_ftrs: Add feature for 2nd DAWR

2020-07-23 Thread Ravi Bangoria
Add new device-tree feature for 2nd DAWR. If this feature is present,
2nd DAWR is supported, otherwise not.

Signed-off-by: Ravi Bangoria 
---
 arch/powerpc/include/asm/cputable.h | 3 ++-
 arch/powerpc/kernel/dt_cpu_ftrs.c   | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index e506d429b1af..5583f2d08df7 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -214,6 +214,7 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTR_P9_TLBIE_ERAT_BUG	LONG_ASM_CONST(0x0001000000000000)
 #define CPU_FTR_P9_RADIX_PREFETCH_BUG	LONG_ASM_CONST(0x0002000000000000)
 #define CPU_FTR_ARCH_31			LONG_ASM_CONST(0x0004000000000000)
+#define CPU_FTR_DAWR1			LONG_ASM_CONST(0x0008000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -478,7 +479,7 @@ static inline void cpu_feature_keys_init(void) { }
CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \
-   CPU_FTR_ARCH_31 | CPU_FTR_DAWR)
+   CPU_FTR_ARCH_31 | CPU_FTR_DAWR | CPU_FTR_DAWR1)
 #define CPU_FTRS_CELL  (CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index ac650c233cd9..675b824038f9 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -649,6 +649,7 @@ static struct dt_cpu_feature_match __initdata
{"wait-v3", feat_enable, 0},
{"prefix-instructions", feat_enable, 0},
{"matrix-multiply-assist", feat_enable_mma, 0},
+   {"debug-facilities-v31", feat_enable, CPU_FTR_DAWR1},
 };
 
 static bool __initdata using_dt_cpu_ftrs;
-- 
2.26.2



[PATCH v5 04/10] powerpc/watchpoint: Enable watchpoint functionality on power10 guest

2020-07-23 Thread Ravi Bangoria
CPU_FTR_DAWR is by default enabled for host via CPU_FTRS_DT_CPU_BASE
(controlled by CONFIG_PPC_DT_CPU_FTRS). But cpu-features device-tree
node is not PAPR compatible and thus not yet used by kvm or pHyp
guests. Enable watchpoint functionality on power10 guest (both kvm
and powervm) by adding CPU_FTR_DAWR to CPU_FTRS_POWER10. Note that
this change does not enable 2nd DAWR support.

Signed-off-by: Ravi Bangoria 
Tested-by: Jordan Niethe 
---
 arch/powerpc/include/asm/cputable.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index bac2252c839e..e506d429b1af 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -478,7 +478,7 @@ static inline void cpu_feature_keys_init(void) { }
CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \
-   CPU_FTR_ARCH_31)
+   CPU_FTR_ARCH_31 | CPU_FTR_DAWR)
 #define CPU_FTRS_CELL  (CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
-- 
2.26.2



[PATCH v5 03/10] powerpc/watchpoint: Fix DAWR exception for CACHEOP

2020-07-23 Thread Ravi Bangoria
'ea' returned by analyse_instr() needs to be aligned down to cache
block size for CACHEOP instructions. analyse_instr() does not set
size for CACHEOP, thus size also needs to be calculated manually.
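
A small standalone illustration of that alignment, with a made-up effective
address and an assumed 128-byte L1 data-cache block:

  #include <stdio.h>

  int main(void)
  {
          unsigned long ea = 0x12f4;      /* ea as reported by analyse_instr() */
          unsigned long size = 128;       /* assumed l1d block size */

          ea &= ~(size - 1);              /* dcbz really touches the whole block */
          printf("check watchpoints against %#lx..%#lx\n", ea, ea + size - 1);
          return 0;
  }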

Fixes: 27985b2a640e ("powerpc/watchpoint: Don't ignore extraneous exceptions 
blindly")
Fixes: 74c6881019b7 ("powerpc/watchpoint: Prepare handler to handle more than 
one watchpoint")
Signed-off-by: Ravi Bangoria 
---
 arch/powerpc/kernel/hw_breakpoint.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/hw_breakpoint.c 
b/arch/powerpc/kernel/hw_breakpoint.c
index a971e22aea81..c55e67bab271 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -538,7 +538,12 @@ static bool check_dawrx_constraints(struct pt_regs *regs, 
int type,
if (OP_IS_LOAD(type) && !(info->type & HW_BRK_TYPE_READ))
return false;
 
-   if (OP_IS_STORE(type) && !(info->type & HW_BRK_TYPE_WRITE))
+   /*
+* The Cache Management instructions other than dcbz never
+* cause a match. i.e. if type is CACHEOP, the instruction
+* is dcbz, and dcbz is treated as Store.
+*/
+   if ((OP_IS_STORE(type) || type == CACHEOP) && !(info->type & HW_BRK_TYPE_WRITE))
return false;
 
if (is_kernel_addr(regs->nip) && !(info->type & HW_BRK_TYPE_KERNEL))
@@ -601,6 +606,15 @@ static bool check_constraints(struct pt_regs *regs, struct 
ppc_inst instr,
return false;
 }
 
+static int cache_op_size(void)
+{
+#ifdef __powerpc64__
+   return ppc64_caches.l1d.block_size;
+#else
+   return L1_CACHE_BYTES;
+#endif
+}
+
 static void get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr,
 int *type, int *size, unsigned long *ea)
 {
@@ -616,7 +630,12 @@ static void get_instr_detail(struct pt_regs *regs, struct 
ppc_inst *instr,
if (!(regs->msr & MSR_64BIT))
*ea &= 0xUL;
 #endif
+
*size = GETSIZE(op.type);
+   if (*type == CACHEOP) {
+   *size = cache_op_size();
+   *ea &= ~(*size - 1);
+   }
 }
 
 static bool is_larx_stcx_instr(int type)
-- 
2.26.2



[PATCH v5 02/10] powerpc/watchpoint: Fix DAWR exception constraint

2020-07-23 Thread Ravi Bangoria
Pedro Miraglia Franco de Carvalho noticed that on p8/p9, DAR value is
inconsistent for different types of load/store. For byte, word,
etc. load/stores, DAR is set to the address of the first byte of
overlap between the watch range and the real access. But for quadword load/
stores it is sometimes set to the address of the first byte of the real
access and sometimes to the address of the first byte of the
overlap. This issue has been fixed in p10. In p10 (ISA 3.1), DAR is
always set to the address of the first byte of overlap. Commit 27985b2a640e
("powerpc/watchpoint: Don't ignore extraneous exceptions blindly")
wrongly assumes that DAR is set to the address of the first byte of
overlap for all load/stores on p8/p9 as well. Fix that. With the fix,
we now rely on 'ea' provided by analyse_instr(). If analyse_instr()
fails, generate event unconditionally on p8/p9, and on p10 generate
event only if DAR is within a DAWR range.

Note: 8xx is not affected.

Fixes: 27985b2a640e ("powerpc/watchpoint: Don't ignore extraneous exceptions 
blindly")
Fixes: 74c6881019b7 ("powerpc/watchpoint: Prepare handler to handle more than 
one watchpoint")
Reported-by: Pedro Miraglia Franco de Carvalho 
Signed-off-by: Ravi Bangoria 
---
 arch/powerpc/kernel/hw_breakpoint.c | 72 -
 1 file changed, 41 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/kernel/hw_breakpoint.c 
b/arch/powerpc/kernel/hw_breakpoint.c
index 031e6defc08e..a971e22aea81 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -498,11 +498,11 @@ static bool dar_in_user_range(unsigned long dar, struct 
arch_hw_breakpoint *info
return ((info->address <= dar) && (dar - info->address < info->len));
 }
 
-static bool dar_user_range_overlaps(unsigned long dar, int size,
-   struct arch_hw_breakpoint *info)
+static bool ea_user_range_overlaps(unsigned long ea, int size,
+  struct arch_hw_breakpoint *info)
 {
-   return ((dar < info->address + info->len) &&
-   (dar + size > info->address));
+   return ((ea < info->address + info->len) &&
+   (ea + size > info->address));
 }
 
 static bool dar_in_hw_range(unsigned long dar, struct arch_hw_breakpoint *info)
@@ -515,20 +515,22 @@ static bool dar_in_hw_range(unsigned long dar, struct 
arch_hw_breakpoint *info)
return ((hw_start_addr <= dar) && (hw_end_addr > dar));
 }
 
-static bool dar_hw_range_overlaps(unsigned long dar, int size,
- struct arch_hw_breakpoint *info)
+static bool ea_hw_range_overlaps(unsigned long ea, int size,
+struct arch_hw_breakpoint *info)
 {
unsigned long hw_start_addr, hw_end_addr;
 
hw_start_addr = ALIGN_DOWN(info->address, HW_BREAKPOINT_SIZE);
hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE);
 
-   return ((dar < hw_end_addr) && (dar + size > hw_start_addr));
+   return ((ea < hw_end_addr) && (ea + size > hw_start_addr));
 }
 
 /*
  * If hw has multiple DAWR registers, we also need to check all
  * dawrx constraint bits to confirm this is _really_ a valid event.
+ * If type is UNKNOWN, but privilege level matches, consider it as
+ * a positive match.
  */
 static bool check_dawrx_constraints(struct pt_regs *regs, int type,
struct arch_hw_breakpoint *info)
@@ -553,7 +555,8 @@ static bool check_dawrx_constraints(struct pt_regs *regs, 
int type,
  * including extraneous exception. Otherwise return false.
  */
 static bool check_constraints(struct pt_regs *regs, struct ppc_inst instr,
- int type, int size, struct arch_hw_breakpoint 
*info)
+ unsigned long ea, int type, int size,
+ struct arch_hw_breakpoint *info)
 {
bool in_user_range = dar_in_user_range(regs->dar, info);
bool dawrx_constraints;
@@ -569,22 +572,27 @@ static bool check_constraints(struct pt_regs *regs, 
struct ppc_inst instr,
}
 
if (unlikely(ppc_inst_equal(instr, ppc_inst(0 {
-   if (in_user_range)
-   return true;
+   if (cpu_has_feature(CPU_FTR_ARCH_31) &&
+   !dar_in_hw_range(regs->dar, info))
+   return false;
 
-   if (dar_in_hw_range(regs->dar, info)) {
-   info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
-   return true;
-   }
-   return false;
+   return true;
}
 
dawrx_constraints = check_dawrx_constraints(regs, type, info);
 
-   if (dar_user_range_overlaps(regs->dar, size, info))
+   if (type == UNKNOWN) {
+   if (cpu_has_feature(CPU_FTR_ARCH_31) &&
+   !dar_in_hw_range(regs->dar, info))
+   return false;
+
return dawrx_constraints;
+   }

[PATCH v5 01/10] powerpc/watchpoint: Fix 512 byte boundary limit

2020-07-23 Thread Ravi Bangoria
Milton Miller reported that we are aligning the start and end address to
the wrong size, SZ_512M; it should be SZ_512. Fix that.

While doing this change I also found a case where ALIGN() comparison
fails. Within a given aligned range, ALIGN() of two addresses does not
match when start address is pointing to the first byte and end address
is pointing to any other byte except the first one. But that's not true
for ALIGN_DOWN(). ALIGN_DOWN() of any two addresses within that range
will always point to the first byte. So use ALIGN_DOWN() instead of
ALIGN().
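
A small standalone illustration of the difference (the macros are re-declared
here with the same arithmetic the kernel uses; the addresses are made up,
both inside one 512-byte block):

  #include <stdio.h>

  #define ALIGN_DOWN(x, a)        ((x) & ~((a) - 1UL))
  #define ALIGN(x, a)             ALIGN_DOWN((x) + (a) - 1UL, (a))

  int main(void)
  {
          unsigned long start = 0x200, end = 0x208;

          /* ALIGN() of the two addresses differs... */
          printf("ALIGN:      %#lx vs %#lx\n",
                 ALIGN(start, 512UL), ALIGN(end - 1, 512UL));
          /* ...while ALIGN_DOWN() of both points at the first byte of the block */
          printf("ALIGN_DOWN: %#lx vs %#lx\n",
                 ALIGN_DOWN(start, 512UL), ALIGN_DOWN(end - 1, 512UL));
          return 0;
  }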

Fixes: e68ef121c1f4 ("powerpc/watchpoint: Use builtin ALIGN*() macros")
Reported-by: Milton Miller 
Signed-off-by: Ravi Bangoria 
Tested-by: Jordan Niethe 
---
 arch/powerpc/kernel/hw_breakpoint.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/hw_breakpoint.c 
b/arch/powerpc/kernel/hw_breakpoint.c
index daf0e1da..031e6defc08e 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -419,7 +419,7 @@ static int hw_breakpoint_validate_len(struct 
arch_hw_breakpoint *hw)
if (dawr_enabled()) {
max_len = DAWR_MAX_LEN;
/* DAWR region can't cross 512 bytes boundary */
-   if (ALIGN(start_addr, SZ_512M) != ALIGN(end_addr - 1, SZ_512M))
+   if (ALIGN_DOWN(start_addr, SZ_512) != ALIGN_DOWN(end_addr - 1, SZ_512))
return -EINVAL;
} else if (IS_ENABLED(CONFIG_PPC_8xx)) {
/* 8xx can setup a range without limitation */
-- 
2.26.2



[PATCH v5 00/10] powerpc/watchpoint: Enable 2nd DAWR on baremetal and powervm

2020-07-23 Thread Ravi Bangoria
Last series[1] was to add basic infrastructure support for more than
one watchpoint on Book3S powerpc. This series actually enables the 2nd 
DAWR for baremetal and powervm. Kvm guest is still not supported.

v4: https://lore.kernel.org/r/20200717040958.70561-1-ravi.bango...@linux.ibm.com

v4->v5:
 - Using hardcoded values instead of macros HBP_NUM_ONE and HBP_NUM_TWO.
   Comment above HBP_NUM_MAX changed to explain it's value.
 - Included CPU_FTR_DAWR1 into CPU_FTRS_POWER10
 - Using generic function feat_enable() instead of
   feat_enable_debug_facilities_v31() to enable CPU_FTR_DAWR1.
 - ISA still includes 512B boundary in match criteria. But that's a
   documentation mistake. Mentioned about this in the last patch.
 - Rebased to powerpc/next
 - Added Jordan's Reviewed-by/Tested-by tags

[1]: 
https://lore.kernel.org/linuxppc-dev/20200514111741.97993-1-ravi.bango...@linux.ibm.com/

Ravi Bangoria (10):
  powerpc/watchpoint: Fix 512 byte boundary limit
  powerpc/watchpoint: Fix DAWR exception constraint
  powerpc/watchpoint: Fix DAWR exception for CACHEOP
  powerpc/watchpoint: Enable watchpoint functionality on power10 guest
  powerpc/dt_cpu_ftrs: Add feature for 2nd DAWR
  powerpc/watchpoint: Set CPU_FTR_DAWR1 based on pa-features bit
  powerpc/watchpoint: Rename current H_SET_MODE DAWR macro
  powerpc/watchpoint: Guest support for 2nd DAWR hcall
  powerpc/watchpoint: Return available watchpoints dynamically
  powerpc/watchpoint: Remove 512 byte boundary

 arch/powerpc/include/asm/cputable.h   |  8 +-
 arch/powerpc/include/asm/hvcall.h |  3 +-
 arch/powerpc/include/asm/hw_breakpoint.h  |  4 +-
 arch/powerpc/include/asm/machdep.h|  2 +-
 arch/powerpc/include/asm/plpar_wrappers.h |  7 +-
 arch/powerpc/kernel/dawr.c|  2 +-
 arch/powerpc/kernel/dt_cpu_ftrs.c |  1 +
 arch/powerpc/kernel/hw_breakpoint.c   | 98 +++
 arch/powerpc/kernel/prom.c|  2 +
 arch/powerpc/kvm/book3s_hv.c  |  2 +-
 arch/powerpc/platforms/pseries/setup.c|  7 +-
 11 files changed, 91 insertions(+), 45 deletions(-)

-- 
2.26.2



[PATCH v3 00/10] Coregroup support on Powerpc

2020-07-23 Thread Srikar Dronamraju
Changelog v2 -> v3:
v2: 
https://lore.kernel.org/linuxppc-dev/20200721113814.32284-1-sri...@linux.vnet.ibm.com/t/#u

powerpc/smp: Cache node for reuse
Removed node caching part. Rewrote the Commit msg (Michael Ellerman)
Renamed to powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES

powerpc/smp: Enable small core scheduling sooner
Rewrote changelog (Gautham)
Renamed to powerpc/smp: Move topology fixups into  a new function

powerpc/smp: Create coregroup domain
Add optimization for mask updates under coregroup_support

Changelog v1 -> v2:
v1: 
https://lore.kernel.org/linuxppc-dev/20200714043624.5648-1-sri...@linux.vnet.ibm.com/t/#u

powerpc/smp: Merge Power9 topology with Power topology
Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu)
since cpu_smt_mask is only defined under CONFIG_SCHED_SMT

powerpc/smp: Enable small core scheduling sooner
Restored the previous info msg (Jordan)
Moved big core topology fixup to fixup_topology (Gautham)

powerpc/smp: Dont assume l2-cache to be superset of sibling
Set cpumask after verifying l2-cache. (Gautham)

powerpc/smp: Generalize 2nd sched domain
Moved shared_cache topology fixup to fixup_topology (Gautham)

Powerpc/numa: Detect support for coregroup
Explained Coregroup in commit msg (Michael Ellerman)

Powerpc/smp: Create coregroup domain
Moved coregroup topology fixup to fixup_topology (Gautham)

powerpc/smp: Implement cpu_to_coregroup_id
Move coregroup_enabled before getting associativity (Gautham)

powerpc/smp: Provide an ability to disable coregroup
Patch dropped (Michael Ellerman)

Cleanup of existing powerpc topologies and add coregroup support on
Powerpc. Coregroup is a group of (subset of) cores of a DIE that share
a resource.

Patch 7 of this patch series: "Powerpc/numa: Detect support for coregroup"
depends on
https://lore.kernel.org/linuxppc-dev/20200707140644.7241-1-sri...@linux.vnet.ibm.com/t/#u
However it should be easy to rebase the patch without the above patch.

This patch series is based on top of current powerpc/next tree + the
above patch.

On Power 8 Systems
--
$ tail /proc/cpuinfo
processor   : 255
cpu : POWER8 (architected), altivec supported
clock   : 3724.00MHz
revision: 2.1 (pvr 004b 0201)

timebase: 51200
platform: pSeries
model   : IBM,8408-E8E
machine : CHRP IBM,8408-E8E
MMU : Hash

Before the patchset
---
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
DIE
NUMA
NUMA
$ head /proc/schedstat
version 15
timestamp 4295534931
cpu0 0 0 0 0 0 0 41389823338 17682779896 14117
domain0 ,,,,,,,00ff 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 27087859050 152273672 10396
domain0 ,,,,,,,00ff 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

After the patchset
--
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
DIE
NUMA
NUMA
$ head /proc/schedstat
version 15
timestamp 4295534931
cpu0 0 0 0 0 0 0 41389823338 17682779896 14117
domain0 ,,,,,,,00ff 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 27087859050 152273672 10396
domain0 ,,,,,,,00ff 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

On Power 9 (with device-tree enablement to show coregroups).
(hunks for mimicing a coregroup was posted at

[PATCH v3 06/10] powerpc/smp: Generalize 2nd sched domain

2020-07-23 Thread Srikar Dronamraju
Currently "CACHE" domain happens to be the 2nd sched domain as per
powerpc_topology. This domain will collapse if cpumask of l2-cache is
same as SMT domain. However we could generalize this domain such that it
could mean either be a "CACHE" domain or a "BIGCORE" domain.

While setting up the "CACHE" domain, check if shared_cache is already
set.
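
For illustration only, a minimal userspace sketch of the "fix up one level
of a static topology table at boot" idea used below; the struct is a
simplified stand-in for sched_domain_topology_level, not the kernel API:

#include <stdio.h>

/* simplified stand-in for struct sched_domain_topology_level */
struct level {
        const char *name;
        int (*flags)(void);
};

static int cache_flags(void) { return 1; }      /* placeholder flags */

static struct level topology[] = {
        { "SMT",     NULL },
        { "BIGCORE", NULL },    /* generic 2nd level */
        { "DIE",     NULL },
};

static void fixup_topology(int shared_caches)
{
        /* turn the generic 2nd level into a CACHE level when caches are shared */
        if (shared_caches) {
                topology[1].name  = "CACHE";
                topology[1].flags = cache_flags;
        }
}

int main(void)
{
        fixup_topology(1);
        printf("2nd level: %s\n", topology[1].name);
        return 0;
}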

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Moved shared_cache topology fixup to fixup_topology (Gautham)

 arch/powerpc/kernel/smp.c | 49 ---
 1 file changed, 35 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index d997c7411664..6f5877f6496e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL_GPL(has_big_cores);
 
+enum {
+#ifdef CONFIG_SCHED_SMT
+   smt_idx,
+#endif
+   bigcore_idx,
+   die_idx,
+};
+
 #define MAX_THREAD_LIST_SIZE   8
 #define THREAD_GROUP_SHARE_L1   1
 struct thread_groups {
@@ -851,13 +859,7 @@ static int powerpc_shared_cache_flags(void)
  */
 static const struct cpumask *shared_cache_mask(int cpu)
 {
-   if (shared_caches)
-   return cpu_l2_cache_mask(cpu);
-
-   if (has_big_cores)
-   return cpu_smallcore_mask(cpu);
-
-   return per_cpu(cpu_sibling_map, cpu);
+   return per_cpu(cpu_l2_cache_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static const struct cpumask *cpu_bigcore_mask(int cpu)
+{
+   return per_cpu(cpu_sibling_map, cpu);
+}
+
 static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
-   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+   { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
 };
@@ -1311,7 +1318,6 @@ static void add_cpu_to_masks(int cpu)
 void start_secondary(void *unused)
 {
unsigned int cpu = smp_processor_id();
-   struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
 
mmgrab(&init_mm);
current->active_mm = &init_mm;
@@ -1337,14 +1343,20 @@ void start_secondary(void *unused)
/* Update topology CPU masks */
add_cpu_to_masks(cpu);
 
-   if (has_big_cores)
-   sibling_mask = cpu_smallcore_mask;
/*
 * Check for any shared caches. Note that this must be done on a
 * per-core basis because one core in the pair might be disabled.
 */
-   if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
-   shared_caches = true;
+   if (!shared_caches) {
+   struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
+   struct cpumask *mask = cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   sibling_mask = cpu_smallcore_mask;
+
+   if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
+   shared_caches = true;
+   }
 
set_numa_node(numa_cpu_lookup_table[cpu]);
set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
@@ -1372,10 +1384,19 @@ int setup_profiling_timer(unsigned int multiplier)
 
 static void fixup_topology(void)
 {
+   if (shared_caches) {
+   pr_info("Using shared cache scheduler topology\n");
+   powerpc_topology[bigcore_idx].mask = shared_cache_mask;
+#ifdef CONFIG_SCHED_DEBUG
+   powerpc_topology[bigcore_idx].name = "CACHE";
+#endif
+   powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
+   }
+
 #ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
-   powerpc_topology[0].mask = smallcore_smt_mask;
+   powerpc_topology[smt_idx].mask = smallcore_smt_mask;
}
 #endif
 }
-- 
2.18.2



[PATCH v3 10/10] powerpc/smp: Implement cpu_to_coregroup_id

2020-07-23 Thread Srikar Dronamraju
Look up the coregroup id from the associativity array.

If unable to detect the coregroup id, fall back on the core id. This
ensures the sched_domain degenerates and an extra sched domain is not
created.

Ideally this function should have been implemented in
arch/powerpc/kernel/smp.c. However if it's implemented in mm/numa.c, we
don't need to find the primary domain again.

If the device-tree mentions more than one coregroup, then the kernel
implements only the last or the smallest coregroup, which currently
corresponds to the penultimate domain in the device-tree.
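
For illustration only, a minimal userspace rendering of that lookup; the
associativity values and min_common_depth below are invented, and the real
code falls back to cpu_to_core_id() instead of returning -1:

#include <stdio.h>

static int min_common_depth = 2;        /* assumed primary domain index */

/* assoc[0] holds the number of entries that follow; the last entry is the core */
static int coregroup_id(const unsigned int *assoc)
{
        int index = assoc[0];

        /* a coregroup exists only if there is a level between the primary
         * domain and the core; the penultimate entry is the coregroup id */
        if (index > min_common_depth + 1)
                return assoc[index - 1];
        return -1;
}

int main(void)
{
        unsigned int assoc[] = { 4, 0, 8, 80, 85 };     /* made-up values */

        printf("coregroup id: %d\n", coregroup_id(assoc));
        return 0;
}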

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by : Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Move coregroup_enabled before getting associativity (Gautham)

 arch/powerpc/mm/numa.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index ef8aa580da21..608e0fa61019 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1218,6 +1218,26 @@ int find_and_online_cpu_nid(int cpu)
 
 int cpu_to_coregroup_id(int cpu)
 {
+   __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+   int index;
+
+   if (cpu < 0 || cpu > nr_cpu_ids)
+   return -1;
+
+   if (!coregroup_enabled)
+   goto out;
+
+   if (!firmware_has_feature(FW_FEATURE_VPHN))
+   goto out;
+
+   if (vphn_get_associativity(cpu, associativity))
+   goto out;
+
+   index = of_read_number(associativity, 1);
+   if (index > min_common_depth + 1)
+   return of_read_number(&associativity[index - 1], 1);
+
+out:
return cpu_to_core_id(cpu);
 }
 
-- 
2.18.2



[PATCH v3 09/10] powerpc/smp: Create coregroup domain

2020-07-23 Thread Srikar Dronamraju
Add percpu coregroup maps and masks to create the coregroup domain.
If a coregroup doesn't exist, the coregroup domain will degenerate
in favour of the SMT/CACHE domain.
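
Roughly (this is not the scheduler's actual degenerate check), a level
whose span matches a neighbouring level's is dropped at domain-build time;
a toy comparison with plain 64-bit masks and invented spans:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t cache = 0x00ff;        /* CACHE span for this CPU */
        uint64_t mc    = 0x00ff;        /* MC (coregroup) span: identical here */

        /* without coregroup information the MC span adds nothing new,
         * so the scheduler collapses the level instead of keeping it */
        printf("MC level kept: %s\n", mc == cache ? "no" : "yes");
        return 0;
}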

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v2 -> v3:
Add optimization for mask updation under coregroup_support

Changelog v1 -> v2:
Moved coregroup topology fixup to fixup_topology (Gautham)

 arch/powerpc/include/asm/topology.h | 10 +++
 arch/powerpc/kernel/smp.c   | 44 +
 arch/powerpc/mm/numa.c  |  5 
 3 files changed, 59 insertions(+)

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index f0b6300e7dd3..6609174918ab 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -88,12 +88,22 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
 extern int find_and_online_cpu_nid(int cpu);
+extern int cpu_to_coregroup_id(int cpu);
 #else
 static inline int find_and_online_cpu_nid(int cpu)
 {
return 0;
 }
 
+static inline int cpu_to_coregroup_id(int cpu)
+{
+#ifdef CONFIG_SMP
+   return cpu_to_core_id(cpu);
+#else
+   return 0;
+#endif
+}
+
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 7d8d44cbab11..1faedde3e406 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
 
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
@@ -91,6 +92,7 @@ enum {
smt_idx,
 #endif
bigcore_idx,
+   mc_idx,
die_idx,
 };
 
@@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static struct cpumask *cpu_coregroup_mask(int cpu)
+{
+   return per_cpu(cpu_coregroup_map, cpu);
+}
+
+static bool has_coregroup_support(void)
+{
+   return coregroup_enabled;
+}
+
+static const struct cpumask *cpu_mc_mask(int cpu)
+{
+   return cpu_coregroup_mask(cpu);
+}
+
 static const struct cpumask *cpu_bigcore_mask(int cpu)
 {
return per_cpu(cpu_sibling_map, cpu);
@@ -879,6 +896,7 @@ static struct sched_domain_topology_level powerpc_topology[] = {
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
+   { cpu_mc_mask, SD_INIT_NAME(MC) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
 };
@@ -925,6 +943,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
GFP_KERNEL, cpu_to_node(cpu));
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
+   if (has_coregroup_support())
+   zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
+   GFP_KERNEL, cpu_to_node(cpu));
+
 #ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * numa_node_id() works after this.
@@ -942,6 +964,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
+   if (has_coregroup_support())
+   cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
+
init_big_cores();
if (has_big_cores) {
cpumask_set_cpu(boot_cpuid,
@@ -1233,6 +1258,8 @@ static void remove_cpu_from_masks(int cpu)
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
+   if (has_coregroup_support())
+   set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
}
 }
 #endif
@@ -1293,6 +1320,20 @@ static void add_cpu_to_masks(int cpu)
add_cpu_to_smallcore_masks(cpu);
update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
+   if (has_coregroup_support()) {
+   int coregroup_id = cpu_to_coregroup_id(cpu);
+
+   cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
+   for_each_cpu_and(i, cpu_online_mask, cpu_cpu_mask(cpu)) {
+   int fcpu = cpu_first_thread_sibling(i);
+
+   if (fcpu == first_thread)
+   set_cpus_related(cpu, i, cpu_coregroup_mask);
+   

[PATCH v3 08/10] powerpc/smp: Allocate cpumask only after searching thread group

2020-07-23 Thread Srikar Dronamraju
If allocated earlier and the search fails, then the cpumask needs to be
freed. However cpu_l1_cache_map can be allocated after we search the
thread group.
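
The ordering follows the usual validate-first, allocate-last pattern so
the error paths need no cleanup; a trivial sketch (the names are
placeholders, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

static char *setup_map(int group_start)
{
        /* do every check that can fail before allocating anything */
        if (group_start == -1)
                return NULL;            /* nothing to free on this path */

        return calloc(1, 64);           /* allocation is the last step */
}

int main(void)
{
        char *map = setup_map(3);

        printf("allocated: %s\n", map ? "yes" : "no");
        free(map);
        return 0;
}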

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index cbca4a8c3314..7d8d44cbab11 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -797,10 +797,6 @@ static int init_cpu_l1_cache_map(int cpu)
if (err)
goto out;
 
-   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
-   GFP_KERNEL,
-   cpu_to_node(cpu));
-
cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
 
if (unlikely(cpu_group_start == -1)) {
@@ -809,6 +805,9 @@ static int init_cpu_l1_cache_map(int cpu)
goto out;
}
 
+   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
+   GFP_KERNEL, cpu_to_node(cpu));
+
for (i = first_thread; i < first_thread + threads_per_core; i++) {
int i_group_start = get_cpu_thread_group_start(i, &tg);
 
-- 
2.18.2



[PATCH v3 07/10] powerpc/numa: Detect support for coregroup

2020-07-23 Thread Srikar Dronamraju
Add support for grouping cores based on the device-tree classification.
- The last domain in the associativity domains always refers to the
core.
- If the primary reference domain happens to be the penultimate domain
in the associativity domains device-tree property, then there are no
coregroups. However if it is not the penultimate domain, then there are
coregroups. There can be more than one coregroup. For now we are only
interested in the last, i.e. the smallest, coregroup.
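
A minimal userspace sketch of that detection rule as used in the numa.c
hunk below; the property cells and min_common_depth are invented for
illustration:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        /* assumed cells of an associativity-domains property */
        unsigned int domains[] = { 5, 1, 2, 4, 4, 2 };
        int nr_cells = sizeof(domains) / sizeof(domains[0]);
        int min_common_depth = 2;       /* assumed primary reference domain */

        /* more cells than "primary domain + core" means coregroups exist */
        bool coregroup_enabled = nr_cells > min_common_depth + 2;

        printf("possible nodes: %u, coregroups: %s\n",
               domains[min_common_depth], coregroup_enabled ? "yes" : "no");
        return 0;
}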

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Explained Coregroup in commit msg (Michael Ellerman)

 arch/powerpc/include/asm/smp.h |  1 +
 arch/powerpc/kernel/smp.c  |  1 +
 arch/powerpc/mm/numa.c | 34 +-
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 49a25e2400f2..5bdc17a7049f 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -28,6 +28,7 @@
 extern int boot_cpuid;
 extern int spinning_secondaries;
 extern u32 *cpu_to_phys_id;
+extern bool coregroup_enabled;
 
 extern void cpu_die(void);
 extern int cpu_to_chip_id(int cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 6f5877f6496e..cbca4a8c3314 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -74,6 +74,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
 struct task_struct *secondary_current;
 bool has_big_cores;
+bool coregroup_enabled;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index bc5b2e8112c8..3248160c0327 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -886,7 +886,9 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
 static void __init find_possible_nodes(void)
 {
struct device_node *rtas;
-   u32 numnodes, i;
+   const __be32 *domains;
+   int prop_length, max_nodes;
+   u32 i;
 
if (!numa_enabled)
return;
@@ -895,25 +897,31 @@ static void __init find_possible_nodes(void)
if (!rtas)
return;
 
-   if (of_property_read_u32_index(rtas, "ibm,current-associativity-domains",
-   min_common_depth, &numnodes)) {
-   /*
-* ibm,current-associativity-domains is a fairly recent
-* property. If it doesn't exist, then fallback on
-* ibm,max-associativity-domains. Current denotes what the
-* platform can support compared to max which denotes what the
-* Hypervisor can support.
-*/
-   if (of_property_read_u32_index(rtas, "ibm,max-associativity-domains",
-   min_common_depth, &numnodes))
+   /*
+* ibm,current-associativity-domains is a fairly recent property. If
+* it doesn't exist, then fallback on ibm,max-associativity-domains.
+* Current denotes what the platform can support compared to max
+* which denotes what the Hypervisor can support.
+*/
+   domains = of_get_property(rtas, "ibm,current-associativity-domains",
+   &prop_length);
+   if (!domains) {
+   domains = of_get_property(rtas, "ibm,max-associativity-domains",
+   &prop_length);
+   if (!domains)
goto out;
}
 
-   for (i = 0; i < numnodes; i++) {
+   max_nodes = of_read_number(&domains[min_common_depth], 1);
+   for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
 
+   prop_length /= sizeof(int);
+   if (prop_length > min_common_depth + 2)
+   coregroup_enabled = 1;
+
 out:
of_node_put(rtas);
 }
-- 
2.18.2



[PATCH v3 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-07-23 Thread Srikar Dronamraju
Current code assumes that the cpumask of cpus sharing an l2-cache will
always be a superset of cpu_sibling_mask.

Let's stop that assumption. cpu_l2_cache_mask is a superset of
cpu_sibling_mask if and only if shared_caches is set.
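
As a toy illustration (plain 64-bit words instead of struct cpumask,
values invented), the superset relation that can no longer be assumed
unconditionally:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_superset(uint64_t big, uint64_t small)
{
        return (big & small) == small;
}

int main(void)
{
        uint64_t sibling = 0x000f;      /* threads of one small core */
        uint64_t l2      = 0x00ff;      /* L2 shared across the core pair */

        /* guaranteed true only when shared_caches is set; without it the
         * code must not seed cpu_l2_cache_mask from cpu_sibling_mask */
        printf("l2 superset of sibling: %d\n", is_superset(l2, sibling));
        return 0;
}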

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Set cpumask after verifying l2-cache. (Gautham)

 arch/powerpc/kernel/smp.c | 28 +++-
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index da27f6909be1..d997c7411664 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1194,6 +1194,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
if (!l2_cache)
return false;
 
+   cpumask_set_cpu(cpu, mask_fn(cpu));
for_each_cpu(i, cpu_online_mask) {
/*
 * when updating the marks the current CPU has not been marked
@@ -1276,29 +1277,30 @@ static void add_cpu_to_masks(int cpu)
 * add it to it's own thread sibling mask.
 */
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+   cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++)
if (cpu_online(i))
set_cpus_related(i, cpu, cpu_sibling_mask);
 
add_cpu_to_smallcore_masks(cpu);
-   /*
-* Copy the thread sibling mask into the cache sibling mask
-* and mark any CPUs that share an L2 with this CPU.
-*/
-   for_each_cpu(i, cpu_sibling_mask(cpu))
-   set_cpus_related(cpu, i, cpu_l2_cache_mask);
update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
-   /*
-* Copy the cache sibling mask into core sibling mask and mark
-* any CPUs on the same chip as this CPU.
-*/
-   for_each_cpu(i, cpu_l2_cache_mask(cpu))
-   set_cpus_related(cpu, i, cpu_core_mask);
+   if (pkg_id == -1) {
+   struct cpumask *(*mask)(int) = cpu_sibling_mask;
+
+   /*
+* Copy the sibling mask into core sibling mask and
+* mark any CPUs on the same chip as this CPU.
+*/
+   if (shared_caches)
+   mask = cpu_l2_cache_mask;
+
+   for_each_cpu(i, mask(cpu))
+   set_cpus_related(cpu, i, cpu_core_mask);
 
-   if (pkg_id == -1)
return;
+   }
 
for_each_cpu(i, cpu_online_mask)
if (get_physical_package_id(i) == pkg_id)
-- 
2.18.2



[PATCH v3 04/10] powerpc/smp: Move topology fixups into a new function

2020-07-23 Thread Srikar Dronamraju
Move topology fixup based on the platform attributes into its own
function which is called just before set_sched_topology.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v2 -> v3:
Rewrote changelog (Gautham)
Renamed to powerpc/smp: Move topology fixups into a new function

 arch/powerpc/kernel/smp.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index a685915e5941..da27f6909be1 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1368,6 +1368,16 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
 }
 
+static void fixup_topology(void)
+{
+#ifdef CONFIG_SCHED_SMT
+   if (has_big_cores) {
+   pr_info("Big cores detected but using small core scheduling\n");
+   powerpc_topology[0].mask = smallcore_smt_mask;
+   }
+#endif
+}
+
 void __init smp_cpus_done(unsigned int max_cpus)
 {
/*
@@ -1381,12 +1391,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
dump_numa_cpu_topology();
 
-#ifdef CONFIG_SCHED_SMT
-   if (has_big_cores) {
-   pr_info("Big cores detected but using small core scheduling\n");
-   powerpc_topology[0].mask = smallcore_smt_mask;
-   }
-#endif
+   fixup_topology();
set_sched_topology(powerpc_topology);
 }
 
-- 
2.18.2



[PATCH v3 03/10] powerpc/smp: Move powerpc_topology above

2020-07-23 Thread Srikar Dronamraju
Just move the powerpc_topology description above.
This will help in using functions defined in this file and avoid forward
declarations.

No other functional changes.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 116 +++---
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 283a04e54f52..a685915e5941 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -818,6 +818,64 @@ static int init_cpu_l1_cache_map(int cpu)
return err;
 }
 
+static bool shared_caches;
+
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymmetric SMT dependency */
+static int powerpc_smt_flags(void)
+{
+   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+
+   if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+   printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+   flags |= SD_ASYM_PACKING;
+   }
+   return flags;
+}
+#endif
+
+/*
+ * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
+ * This topology makes it *much* cheaper to migrate tasks between adjacent cores
+ * since the migrated task remains cache hot. We want to take advantage of this
+ * at the scheduler level so an extra topology level is required.
+ */
+static int powerpc_shared_cache_flags(void)
+{
+   return SD_SHARE_PKG_RESOURCES;
+}
+
+/*
+ * We can't just pass cpu_l2_cache_mask() directly because
+ * returns a non-const pointer and the compiler barfs on that.
+ */
+static const struct cpumask *shared_cache_mask(int cpu)
+{
+   if (shared_caches)
+   return cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   return cpu_smallcore_mask(cpu);
+
+   return per_cpu(cpu_sibling_map, cpu);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *smallcore_smt_mask(int cpu)
+{
+   return cpu_smallcore_mask(cpu);
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+   { NULL, },
+};
+
 static int init_big_cores(void)
 {
int cpu;
@@ -1247,8 +1305,6 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(cpu, i, cpu_core_mask);
 }
 
-static bool shared_caches;
-
 /* Activate a secondary processor. */
 void start_secondary(void *unused)
 {
@@ -1312,62 +1368,6 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
 }
 
-#ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymmetric SMT dependency */
-static int powerpc_smt_flags(void)
-{
-   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
-
-   if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-   printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-   flags |= SD_ASYM_PACKING;
-   }
-   return flags;
-}
-#endif
-
-/*
- * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
- * This topology makes it *much* cheaper to migrate tasks between adjacent cores
- * since the migrated task remains cache hot. We want to take advantage of this
- * at the scheduler level so an extra topology level is required.
- */
-static int powerpc_shared_cache_flags(void)
-{
-   return SD_SHARE_PKG_RESOURCES;
-}
-
-/*
- * We can't just pass cpu_l2_cache_mask() directly because
- * returns a non-const pointer and the compiler barfs on that.
- */
-static const struct cpumask *shared_cache_mask(int cpu)
-{
-   if (shared_caches)
-   return cpu_l2_cache_mask(cpu);
-
-   if (has_big_cores)
-   return cpu_smallcore_mask(cpu);
-
-   return per_cpu(cpu_sibling_map, cpu);
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *smallcore_smt_mask(int cpu)
-{
-   return cpu_smallcore_mask(cpu);
-}
-#endif
-
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-   { NULL, },
-};
-
 void __init smp_cpus_done(unsigned int max_cpus)
 {
/*
-- 
2.18.2



[PATCH v3 02/10] powerpc/smp: Merge Power9 topology with Power topology

2020-07-23 Thread Srikar Dronamraju
A new sched_domain_topology_level was added just for Power9. However the
same can be achieved by merging powerpc_topology with power9_topology,
which makes the code simpler, especially when adding a new sched
domain.
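
The shape of the merge, as a userspace sketch: a single mask callback that
branches at runtime instead of a second, nearly identical table; names and
return values are simplified stand-ins:

#include <stdio.h>

static int shared_caches;       /* assumed boot-time detection */
static int has_big_cores;

/* the single, merged callback picks the right span for the machine at
 * hand, so a power9-only topology table is no longer needed */
static const char *shared_cache_cpus(void)
{
        if (shared_caches)
                return "l2-cache mask";
        if (has_big_cores)
                return "small-core sibling mask";
        return "sibling mask";
}

int main(void)
{
        shared_caches = 1;
        printf("CACHE level spans: %s\n", shared_cache_cpus());
        return 0;
}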

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu)
since cpu_smt_mask is only defined under CONFIG_SCHED_SMT

 arch/powerpc/kernel/smp.c | 33 ++---
 1 file changed, 10 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index edf94ca64eea..283a04e54f52 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1313,7 +1313,7 @@ int setup_profiling_timer(unsigned int multiplier)
 }
 
 #ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymetric SMT dependancy */
+/* cpumask of CPUs with asymmetric SMT dependency */
 static int powerpc_smt_flags(void)
 {
int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
@@ -1326,14 +1326,6 @@ static int powerpc_smt_flags(void)
 }
 #endif
 
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-   { NULL, },
-};
-
 /*
  * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
  * This topology makes it *much* cheaper to migrate tasks between adjacent cores
@@ -1351,7 +1343,13 @@ static int powerpc_shared_cache_flags(void)
  */
 static const struct cpumask *shared_cache_mask(int cpu)
 {
-   return cpu_l2_cache_mask(cpu);
+   if (shared_caches)
+   return cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   return cpu_smallcore_mask(cpu);
+
+   return per_cpu(cpu_sibling_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -1361,7 +1359,7 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
-static struct sched_domain_topology_level power9_topology[] = {
+static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
@@ -1386,21 +1384,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
 #ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
-   power9_topology[0].mask = smallcore_smt_mask;
powerpc_topology[0].mask = smallcore_smt_mask;
}
 #endif
-   /*
-* If any CPU detects that it's sharing a cache with another CPU then
-* use the deeper topology that is aware of this sharing.
-*/
-   if (shared_caches) {
-   pr_info("Using shared cache scheduler topology\n");
-   set_sched_topology(power9_topology);
-   } else {
-   pr_info("Using standard scheduler topology\n");
-   set_sched_topology(powerpc_topology);
-   }
+   set_sched_topology(powerpc_topology);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
2.18.2



[PATCH v3 01/10] powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES

2020-07-23 Thread Srikar Dronamraju
Fix a build warning in a non CONFIG_NEED_MULTIPLE_NODES build:
"error: _numa_cpu_lookup_table_ undeclared"

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Anton Blanchard 
Cc: Oliver O'Halloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Gautham R Shenoy 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v2 -> v3:
Removed node caching part. Rewrote the Commit msg (Michael Ellerman)
Renamed to powerpc/smp: Fix a warning under !NEED_MULTIPLE_NODES

 arch/powerpc/kernel/smp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 73199470c265..edf94ca64eea 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -860,6 +860,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
GFP_KERNEL, cpu_to_node(cpu));
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
+#ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * numa_node_id() works after this.
 */
@@ -868,6 +869,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
set_cpu_numa_mem(cpu,
local_memory_node(numa_cpu_lookup_table[cpu]));
}
+#endif
}
 
/* Init the cpumasks so the boot CPU is related to itself */
-- 
2.18.2



Re: [v4] powerpc/perf: Initialize power10 PMU registers in cpu setup routine

2020-07-23 Thread Jordan Niethe
On Thu, Jul 23, 2020 at 5:32 PM Athira Rajeev
 wrote:
>
> Initialize Monitor Mode Control Register 3 (MMCR3)
> SPR which is new in power10. For PowerISA v3.1, BHRB disable
> is controlled via Monitor Mode Control Register A (MMCRA) bit,
> namely "BHRB Recording Disable (BHRBRD)". This patch also initializes
> MMCRA BHRBRD to disable BHRB feature at boot for power10.
>
> Signed-off-by: Athira Rajeev 
Reviewed-by: Jordan Niethe 
> ---
> Dependency:
> - On power10 PMU base enablement series V3:
>   https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=190462
>
> Changes from v3 -> v4
> - Addressed review comments from Jordan and Michael Ellerman.
>   This patch was initially part of Power10 PMU base enablement
>   series. Moving this as separate patch as suggested by Michael
>   Ellerman. Hence dependency of initial series Patch 7 which defines
>   MMCRA_BHRB_DISABLE. Addressed review comments from Jordan to make
>   sure existing PMU function (__INIT_PMU) will not overwrite ISA 3.1
>   updates
>
> Changes from v2 -> v3
> - Addressed review comment from Michael Ellerman to
>   call PMU init from __setup_cpu_power10
>
>  arch/powerpc/kernel/cpu_setup_power.S | 19 +++
>  1 file changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
> index efdcfa7..3fa6eef 100644
> --- a/arch/powerpc/kernel/cpu_setup_power.S
> +++ b/arch/powerpc/kernel/cpu_setup_power.S
> @@ -94,13 +94,15 @@ _GLOBAL(__restore_cpu_power8)
>  _GLOBAL(__setup_cpu_power10)
> mflr    r11
> bl  __init_FSCR_power10
> +   bl  __init_PMU
> +   bl  __init_PMU_ISA31
> b   1f
>
>  _GLOBAL(__setup_cpu_power9)
> mflr    r11
> bl  __init_FSCR
> -1: bl  __init_PMU
> -   bl  __init_hvmode_206
> +   bl  __init_PMU
> +1: bl  __init_hvmode_206
> mtlr    r11
> beqlr
> li  r0,0
> @@ -124,13 +126,15 @@ _GLOBAL(__setup_cpu_power9)
>  _GLOBAL(__restore_cpu_power10)
> mflr    r11
> bl  __init_FSCR_power10
> +   bl  __init_PMU
> +   bl  __init_PMU_ISA31
> b   1f
>
>  _GLOBAL(__restore_cpu_power9)
> mflr    r11
> bl  __init_FSCR
> -1: bl  __init_PMU
> -   mfmsr   r3
> +   bl  __init_PMU
> +1: mfmsr   r3
> rldicl. r0,r3,4,63
> mtlr    r11
> beqlr
> @@ -233,3 +237,10 @@ __init_PMU_ISA207:
> li  r5,0
> mtspr   SPRN_MMCRS,r5
> blr
> +
> +__init_PMU_ISA31:
> +   li  r5,0
> +   mtspr   SPRN_MMCR3,r5
> +   LOAD_REG_IMMEDIATE(r5, MMCRA_BHRB_DISABLE)
> +   mtspr   SPRN_MMCRA,r5
> +   blr
> --
> 1.8.3.1
>


[v4] powerpc/perf: Initialize power10 PMU registers in cpu setup routine

2020-07-23 Thread Athira Rajeev
Initialize Monitor Mode Control Register 3 (MMCR3)
SPR which is new in power10. For PowerISA v3.1, BHRB disable
is controlled via Monitor Mode Control Register A (MMCRA) bit,
namely "BHRB Recording Disable (BHRBRD)". This patch also initializes
MMCRA BHRBRD to disable BHRB feature at boot for power10.

Signed-off-by: Athira Rajeev 
---
Dependency:
- On power10 PMU base enablement series V3:
  https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=190462

Changes from v3 -> v4
- Addressed review comments from Jordan and Michael Ellerman.
  This patch was initially part of Power10 PMU base enablement
  series. Moving this as separate patch as suggested by Michael
  Ellerman. Hence dependency of initial series Patch 7 which defines
  MMCRA_BHRB_DISABLE. Addressed review comments from Jordan to make
  sure existing PMU function (__INIT_PMU) will not overwrite ISA 3.1
  updates

Changes from v2 -> v3
- Addressed review comment from Michael Ellerman to
  call PMU init from __setup_cpu_power10

 arch/powerpc/kernel/cpu_setup_power.S | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index efdcfa7..3fa6eef 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -94,13 +94,15 @@ _GLOBAL(__restore_cpu_power8)
 _GLOBAL(__setup_cpu_power10)
mflr    r11
bl  __init_FSCR_power10
+   bl  __init_PMU
+   bl  __init_PMU_ISA31
b   1f
 
 _GLOBAL(__setup_cpu_power9)
mflr    r11
bl  __init_FSCR
-1: bl  __init_PMU
-   bl  __init_hvmode_206
+   bl  __init_PMU
+1: bl  __init_hvmode_206
mtlr    r11
beqlr
li  r0,0
@@ -124,13 +126,15 @@ _GLOBAL(__setup_cpu_power9)
 _GLOBAL(__restore_cpu_power10)
mflr    r11
bl  __init_FSCR_power10
+   bl  __init_PMU
+   bl  __init_PMU_ISA31
b   1f
 
 _GLOBAL(__restore_cpu_power9)
mflr    r11
bl  __init_FSCR
-1: bl  __init_PMU
-   mfmsr   r3
+   bl  __init_PMU
+1: mfmsr   r3
rldicl. r0,r3,4,63
mtlr    r11
beqlr
@@ -233,3 +237,10 @@ __init_PMU_ISA207:
li  r5,0
mtspr   SPRN_MMCRS,r5
blr
+
+__init_PMU_ISA31:
+   li  r5,0
+   mtspr   SPRN_MMCR3,r5
+   LOAD_REG_IMMEDIATE(r5, MMCRA_BHRB_DISABLE)
+   mtspr   SPRN_MMCRA,r5
+   blr
-- 
1.8.3.1



[powerpc:merge] BUILD SUCCESS c27fe454aff795023d2f3f90f41eb1a3104e614f

2020-07-23 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
merge
branch HEAD: c27fe454aff795023d2f3f90f41eb1a3104e614f  Automatic merge of 
'master', 'next' and 'fixes' (2020-07-21 00:00)

elapsed time: 3913m

configs tested: 111
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

arm64allyesconfig
arm64   defconfig
arm64allmodconfig
arm64 allnoconfig
arm  allyesconfig
arm  allmodconfig
arm   allnoconfig
arm defconfig
sh  rsk7264_defconfig
c6x defconfig
c6xevmc6474_defconfig
arm shannon_defconfig
powerpc   ppc64_defconfig
arm  footbridge_defconfig
s390 alldefconfig
s390  debug_defconfig
arm  pxa3xx_defconfig
m68km5407c3_defconfig
sh  sdk7780_defconfig
c6x dsk6455_defconfig
arm   h5000_defconfig
i386 allyesconfig
i386defconfig
i386  debian-10.3
i386  allnoconfig
ia64 allmodconfig
ia64defconfig
ia64  allnoconfig
ia64 allyesconfig
m68k allmodconfig
m68k  allnoconfig
m68k   sun3_defconfig
m68kdefconfig
m68k allyesconfig
nios2   defconfig
nios2allyesconfig
openriscdefconfig
c6x  allyesconfig
c6x   allnoconfig
openrisc allyesconfig
nds32   defconfig
nds32 allnoconfig
csky allyesconfig
cskydefconfig
alpha   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
h8300allmodconfig
xtensa  defconfig
arc defconfig
arc  allyesconfig
sh   allmodconfig
shallnoconfig
microblazeallnoconfig
mips allyesconfig
mips  allnoconfig
mips allmodconfig
pariscallnoconfig
parisc  defconfig
parisc   allyesconfig
parisc   allmodconfig
powerpc defconfig
powerpc  allyesconfig
powerpc  rhel-kconfig
powerpc  allmodconfig
powerpc   allnoconfig
i386 randconfig-a001-20200719
i386 randconfig-a006-20200719
i386 randconfig-a002-20200719
i386 randconfig-a005-20200719
i386 randconfig-a003-20200719
i386 randconfig-a004-20200719
x86_64   randconfig-a014-20200720
x86_64   randconfig-a015-20200720
x86_64   randconfig-a016-20200720
x86_64   randconfig-a012-20200720
x86_64   randconfig-a013-20200720
x86_64   randconfig-a011-20200720
x86_64   randconfig-a005-20200719
x86_64   randconfig-a002-20200719
x86_64   randconfig-a006-20200719
x86_64   randconfig-a001-20200719
x86_64   randconfig-a003-20200719
x86_64   randconfig-a004-20200719
i386 randconfig-a015-20200719
i386 randconfig-a011-20200719
i386 randconfig-a016-20200719
i386 randconfig-a012-20200719
i386 randconfig-a013-20200719
i386 randconfig-a014-20200719
riscvallyesconfig
riscv allnoconfig
riscv   defconfig
riscvallmodconfig
s390 allyesconfig
s390  allnoconfig
s390 allmodconfig
s390defconfig
sparcallyesconfig
sparc  
