[powerpc:next-test] BUILD SUCCESS 824a2d10fcf429689cd20d7d36eeb24697466c9b

2020-04-01 Thread kbuild test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
next-test
branch HEAD: 824a2d10fcf429689cd20d7d36eeb24697466c9b  powerpc/perf: split 
callchain.c by bitness

elapsed time: 1004m

configs tested: 192
configs skipped: 0

The following configs have been built successfully.
More configs may be tested in the coming days.

arm  allmodconfig
arm   allnoconfig
arm  allyesconfig
arm64allmodconfig
arm64 allnoconfig
arm64allyesconfig
arm at91_dt_defconfig
arm   efm32_defconfig
arm  exynos_defconfig
armmulti_v5_defconfig
armmulti_v7_defconfig
armshmobile_defconfig
arm   sunxi_defconfig
arm64   defconfig
sparcallyesconfig
h8300h8300h-sim_defconfig
s390  allnoconfig
shallnoconfig
m68k   m5475evb_defconfig
c6x  allyesconfig
powerpc   ppc64_defconfig
ia64defconfig
powerpc defconfig
sparc64 defconfig
i386  allnoconfig
i386 alldefconfig
i386 allyesconfig
i386defconfig
ia64 alldefconfig
ia64 allmodconfig
ia64  allnoconfig
ia64 allyesconfig
nios2 3c120_defconfig
nios2 10m50_defconfig
c6xevmc6678_defconfig
xtensa  iss_defconfig
xtensa   common_defconfig
openrisc simple_smp_defconfig
openriscor1ksim_defconfig
alpha   defconfig
cskydefconfig
nds32 allnoconfig
nds32   defconfig
h8300 edosk2674_defconfig
h8300   h8s-sim_defconfig
m68k allmodconfig
m68k  multi_defconfig
m68k   sun3_defconfig
arc  allyesconfig
arc defconfig
microblaze  mmu_defconfig
microblazenommu_defconfig
powerpc   allnoconfig
powerpc  rhel-kconfig
mips   32r2_defconfig
mips 64r6el_defconfig
mips allmodconfig
mips  allnoconfig
mips allyesconfig
mips  fuloong2e_defconfig
mips  malta_kvm_defconfig
pariscallnoconfig
parisc   allyesconfig
pariscgeneric-32bit_defconfig
pariscgeneric-64bit_defconfig
x86_64   randconfig-a001-20200401
x86_64   randconfig-a002-20200401
x86_64   randconfig-a003-20200401
i386 randconfig-a001-20200401
i386 randconfig-a002-20200401
i386 randconfig-a003-20200401
alpharandconfig-a001-20200401
m68k randconfig-a001-20200401
mips randconfig-a001-20200401
nds32randconfig-a001-20200401
parisc   randconfig-a001-20200401
riscvrandconfig-a001-20200401
mips randconfig-a001-20200402
nds32randconfig-a001-20200402
m68k randconfig-a001-20200402
alpharandconfig-a001-20200402
parisc   randconfig-a001-20200402
riscvrandconfig-a001-20200402
microblaze   randconfig-a001-20200331
h8300randconfig-a001-20200331
nios2randconfig-a001-20200331
c6x  randconfig-a001-20200331
sparc64  randconfig-a001-20200331
c6x  randconfig-a001-20200401
h8300randconfig-a001-20200401
microblaze   randconfig-a001-20200401
nios2randconfig-a001-20200401
sparc64  randconfig-a001-20200401
c6x  randconfig-a001-20200402
h8300randconfig-a001-20200402
microblaze   randconfig-a001-20200402
nios2randconfig-a001-20200402
sparc64  randconfig-a001-20200402
s390 randconfig-a001-20200401
xtensa   randconfig-a001-20200401
csky randconfig-a001-20200401

[powerpc:merge] BUILD SUCCESS d0c12846a3a24cd6d68b608c866712bc7e471634

2020-04-01 Thread kbuild test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
merge
branch HEAD: d0c12846a3a24cd6d68b608c866712bc7e471634  Automatic merge of 
branch 'next' into merge

elapsed time: 879m

configs tested: 176
configs skipped: 0

The following configs have been built successfully.
More configs may be tested in the coming days.

arm  allmodconfig
arm   allnoconfig
arm  allyesconfig
arm64allmodconfig
arm64 allnoconfig
arm64allyesconfig
arm at91_dt_defconfig
arm   efm32_defconfig
arm  exynos_defconfig
armmulti_v5_defconfig
armmulti_v7_defconfig
armshmobile_defconfig
arm   sunxi_defconfig
arm64   defconfig
sparcallyesconfig
riscvnommu_virt_defconfig
ia64defconfig
powerpc defconfig
c6x  allyesconfig
powerpc   ppc64_defconfig
sparc64 defconfig
s390  allnoconfig
i386  allnoconfig
i386 alldefconfig
i386 allyesconfig
i386defconfig
ia64 allmodconfig
ia64  allnoconfig
ia64 allyesconfig
ia64 alldefconfig
c6xevmc6678_defconfig
nios2 10m50_defconfig
nios2 3c120_defconfig
openriscor1ksim_defconfig
openrisc simple_smp_defconfig
xtensa   common_defconfig
xtensa  iss_defconfig
alpha   defconfig
cskydefconfig
nds32 allnoconfig
nds32   defconfig
h8300 edosk2674_defconfig
h8300h8300h-sim_defconfig
h8300   h8s-sim_defconfig
m68k allmodconfig
m68k   m5475evb_defconfig
m68k  multi_defconfig
m68k   sun3_defconfig
arc  allyesconfig
arc defconfig
microblaze  mmu_defconfig
microblazenommu_defconfig
powerpc   allnoconfig
powerpc  rhel-kconfig
mips  fuloong2e_defconfig
mips  malta_kvm_defconfig
mips allyesconfig
mips 64r6el_defconfig
mips  allnoconfig
mips   32r2_defconfig
mips allmodconfig
pariscallnoconfig
parisc   allyesconfig
pariscgeneric-32bit_defconfig
pariscgeneric-64bit_defconfig
x86_64   randconfig-a001-20200401
x86_64   randconfig-a002-20200401
x86_64   randconfig-a003-20200401
i386 randconfig-a001-20200401
i386 randconfig-a002-20200401
i386 randconfig-a003-20200401
alpharandconfig-a001-20200401
m68k randconfig-a001-20200401
mips randconfig-a001-20200401
nds32randconfig-a001-20200401
parisc   randconfig-a001-20200401
riscvrandconfig-a001-20200401
mips randconfig-a001-20200402
c6x  randconfig-a001-20200401
h8300randconfig-a001-20200401
microblaze   randconfig-a001-20200401
nios2randconfig-a001-20200401
sparc64  randconfig-a001-20200401
c6x  randconfig-a001-20200402
h8300randconfig-a001-20200402
microblaze   randconfig-a001-20200402
nios2randconfig-a001-20200402
sparc64  randconfig-a001-20200402
csky randconfig-a001-20200401
openrisc randconfig-a001-20200401
s390 randconfig-a001-20200401
sh   randconfig-a001-20200401
xtensa   randconfig-a001-20200401
x86_64   randconfig-b001-20200402
x86_64   randconfig-b002-20200402
x86_64   randconfig-b003-20200402
i386 randconfig-b001-20200402
i386 randconfig-b002-20200402
i386 randconfig-b003-20200402
x86_64   randconfig-b001-20200401
x86_64   randconfig-b002-20200401

Re: [PATCH v2 1/1] ppc/crash: Skip spinlocks during crash

2020-04-01 Thread kbuild test robot
Hi Leonardo,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/locking/core]
[also build test ERROR on powerpc/next paulus-powerpc/kvm-ppc-next v5.6 
next-20200401]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:
https://github.com/0day-ci/linux/commits/Leonardo-Bras/ppc-crash-Skip-spinlocks-during-crash/20200327-105958
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
8bf6c677ddb9c922423ea3bf494fe7c508bfbb8c
config: powerpc-randconfig-a001-20200401 (attached as .config)
compiler: powerpc-linux-gcc (GCC) 9.3.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot 

All errors (new ones prefixed by >>):

   powerpc-linux-ld: arch/powerpc/kernel/traps.o: in function `arch_spin_lock':
>> arch/powerpc/include/asm/spinlock.h:147: undefined reference to 
>> `crash_skip_spinlock'
>> powerpc-linux-ld: arch/powerpc/include/asm/spinlock.h:147: undefined 
>> reference to `crash_skip_spinlock'
   powerpc-linux-ld: arch/powerpc/kernel/rtas.o: in function `arch_spin_lock':
>> arch/powerpc/include/asm/spinlock.h:147: undefined reference to 
>> `crash_skip_spinlock'
>> powerpc-linux-ld: arch/powerpc/include/asm/spinlock.h:147: undefined 
>> reference to `crash_skip_spinlock'
   powerpc-linux-ld: kernel/locking/lockdep.o: in function `arch_spin_lock':
>> arch/powerpc/include/asm/spinlock.h:147: undefined reference to 
>> `crash_skip_spinlock'
   powerpc-linux-ld: 
kernel/locking/lockdep.o:arch/powerpc/include/asm/spinlock.h:147: more 
undefined references to `crash_skip_spinlock' follow
>> pahole: .tmp_vmlinux.btf: No such file or directory
   powerpc-linux-objdump: '.tmp_vmlinux.btf': No such file
   powerpc-linux-objdump: '.tmp_vmlinux.btf': No such file
   powerpc-linux-objcopy: '.tmp_vmlinux.btf': No such file
   powerpc-linux-objcopy: --change-section-vma .BTF=0x never 
used
   powerpc-linux-objcopy: --change-section-lma .BTF=0x never 
used
   powerpc-linux-objcopy: '.btf.vmlinux.bin': No such file
   Failed to generate BTF for vmlinux
   Try to disable CONFIG_DEBUG_INFO_BTF

vim +147 arch/powerpc/include/asm/spinlock.h

   140  
   141  static inline void arch_spin_lock(arch_spinlock_t *lock)
   142  {
   143  while (1) {
   144  if (likely(__arch_spin_trylock(lock) == 0))
   145  break;
   146  do {
 > 147  if (unlikely(crash_skip_spinlock))
   148  return;
   149  HMT_low();
   150  if (is_shared_processor())
   151  splpar_spin_yield(lock);
   152  } while (unlikely(lock->slock != 0));
   153  HMT_medium();
   154  }
   155  }
   156  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


RE: [PATCH v4 03/25] powerpc/powernv: Map & release OpenCAPI LPC memory

2020-04-01 Thread Alastair D'Silva
> -Original Message-
> From: Dan Williams 
> Sent: Wednesday, 1 April 2020 7:49 PM
> To: Alastair D'Silva 
> Cc: Aneesh Kumar K . V ; Oliver O'Halloran
> ; Benjamin Herrenschmidt
> ; Paul Mackerras ; Michael
> Ellerman ; Frederic Barrat ;
> Andrew Donnellan ; Arnd Bergmann
> ; Greg Kroah-Hartman ;
> Vishal Verma ; Dave Jiang
> ; Ira Weiny ; Andrew Morton
> ; Mauro Carvalho Chehab
> ; David S. Miller ;
> Rob Herring ; Anton Blanchard ;
> Krzysztof Kozlowski ; Mahesh Salgaonkar
> ; Madhavan Srinivasan
> ; Cédric Le Goater ; Anju T
> Sudhakar ; Hari Bathini
> ; Thomas Gleixner ; Greg
> Kurz ; Nicholas Piggin ; Masahiro
> Yamada ; Alexey Kardashevskiy
> ; Linux Kernel Mailing List ;
> linuxppc-dev ; linux-nvdimm  nvd...@lists.01.org>; Linux MM 
> Subject: Re: [PATCH v4 03/25] powerpc/powernv: Map & release OpenCAPI
> LPC memory
> 
> On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva 
> wrote:
> >
> > This patch adds OPAL calls to powernv so that the OpenCAPI driver can
> > map & release LPC (Lowest Point of Coherency)  memory.
> >
> > Signed-off-by: Alastair D'Silva 
> > Reviewed-by: Andrew Donnellan 
> > ---
> >  arch/powerpc/include/asm/pnv-ocxl.h   |  2 ++
> >  arch/powerpc/platforms/powernv/ocxl.c | 43
> > +++
> >  2 files changed, 45 insertions(+)
> >
> > diff --git a/arch/powerpc/include/asm/pnv-ocxl.h
> > b/arch/powerpc/include/asm/pnv-ocxl.h
> > index 7de82647e761..560a19bb71b7 100644
> > --- a/arch/powerpc/include/asm/pnv-ocxl.h
> > +++ b/arch/powerpc/include/asm/pnv-ocxl.h
> > @@ -32,5 +32,7 @@ extern int
> pnv_ocxl_spa_remove_pe_from_cache(void
> > *platform_data, int pe_handle)
> >
> >  extern int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr);
> > extern void pnv_ocxl_free_xive_irq(u32 irq);
> > +u64 pnv_ocxl_platform_lpc_setup(struct pci_dev *pdev, u64 size); void
> > +pnv_ocxl_platform_lpc_release(struct pci_dev *pdev);
> >
> >  #endif /* _ASM_PNV_OCXL_H */
> > diff --git a/arch/powerpc/platforms/powernv/ocxl.c
> > b/arch/powerpc/platforms/powernv/ocxl.c
> > index 8c65aacda9c8..f13119a7c026 100644
> > --- a/arch/powerpc/platforms/powernv/ocxl.c
> > +++ b/arch/powerpc/platforms/powernv/ocxl.c
> > @@ -475,6 +475,49 @@ void pnv_ocxl_spa_release(void *platform_data)
> }
> > EXPORT_SYMBOL_GPL(pnv_ocxl_spa_release);
> >
> > +u64 pnv_ocxl_platform_lpc_setup(struct pci_dev *pdev, u64 size) {
> > +   struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> > +   struct pnv_phb *phb = hose->private_data;
> 
> Is calling the local variable 'hose' instead of 'host' on purpose?
> 

Yes, this follows the convention used in other functions in this file.

> > +   u32 bdfn = pci_dev_id(pdev);
> > +   __be64 base_addr_be64;
> > +   u64 base_addr;
> > +   int rc;
> > +
> > +   rc = opal_npu_mem_alloc(phb->opal_id, bdfn, size,
> &base_addr_be64);
> > +   if (rc) {
> > +   dev_warn(&pdev->dev,
> > +"OPAL could not allocate LPC memory, rc=%d\n", rc);
> > +   return 0;
> > +   }
> > +
> > +   base_addr = be64_to_cpu(base_addr_be64);
> > +
> > +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
> 
> With the proposed cleanup in patch2 the ifdef can be elided here.

Ok
> 
> > +   rc = check_hotplug_memory_addressable(base_addr >> PAGE_SHIFT,
> > + size >> PAGE_SHIFT);
> > +   if (rc)
> > +   return 0;
> 
> Is this an error worth logging if someone is wondering why their device is not
> showing up?
> 

Yes, I'll add a message.

> 
> > +#endif
> > +
> > +   return base_addr;
> > +}
> > +EXPORT_SYMBOL_GPL(pnv_ocxl_platform_lpc_setup);
> > +
> > +void pnv_ocxl_platform_lpc_release(struct pci_dev *pdev) {
> > +   struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> > +   struct pnv_phb *phb = hose->private_data;
> > +   u32 bdfn = pci_dev_id(pdev);
> > +   int rc;
> > +
> > +   rc = opal_npu_mem_release(phb->opal_id, bdfn);
> > +   if (rc)
> > +   dev_warn(&pdev->dev,
> > +"OPAL reported rc=%d when releasing LPC
> > +memory\n", rc); }
> EXPORT_SYMBOL_GPL(pnv_ocxl_platform_lpc_release);
> > +
> >  int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int
> > pe_handle)  {
> > struct spa_data *data = (struct spa_data *) platform_data;
> > --
> > 2.24.1
> >
> 
> 
> --
> This email has been checked for viruses by AVG.
> https://www.avg.com




RE: [PATCH v4 02/25] mm/memory_hotplug: Allow check_hotplug_memory_addressable to be called from drivers

2020-04-01 Thread Alastair D'Silva
> -Original Message-
> From: Dan Williams 
> Sent: Wednesday, 1 April 2020 7:48 PM
> To: Alastair D'Silva 
> Cc: Aneesh Kumar K . V ; Oliver O'Halloran
> ; Benjamin Herrenschmidt
> ; Paul Mackerras ; Michael
> Ellerman ; Frederic Barrat ;
> Andrew Donnellan ; Arnd Bergmann
> ; Greg Kroah-Hartman ;
> Vishal Verma ; Dave Jiang
> ; Ira Weiny ; Andrew Morton
> ; Mauro Carvalho Chehab
> ; David S. Miller ;
> Rob Herring ; Anton Blanchard ;
> Krzysztof Kozlowski ; Mahesh Salgaonkar
> ; Madhavan Srinivasan
> ; Cédric Le Goater ; Anju T
> Sudhakar ; Hari Bathini
> ; Thomas Gleixner ; Greg
> Kurz ; Nicholas Piggin ; Masahiro
> Yamada ; Alexey Kardashevskiy
> ; Linux Kernel Mailing List ;
> linuxppc-dev ; linux-nvdimm  nvd...@lists.01.org>; Linux MM 
> Subject: Re: [PATCH v4 02/25] mm/memory_hotplug: Allow
> check_hotplug_memory_addressable to be called from drivers
> 
> On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva 
> wrote:
> >
> > When setting up OpenCAPI connected persistent memory, the range check
> > may not be performed until quite late (or perhaps not at all, if the
> > user does not establish a DAX device).
> >
> > This patch makes the range check callable so we can perform the check
> > while probing the OpenCAPI Persistent Memory device.
> >
> > Signed-off-by: Alastair D'Silva 
> > Reviewed-by: Andrew Donnellan 
> > ---
> >  include/linux/memory_hotplug.h | 5 +
> >  mm/memory_hotplug.c| 4 ++--
> >  2 files changed, 7 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/memory_hotplug.h
> > b/include/linux/memory_hotplug.h index f4d59155f3d4..9a19ae0d7e31
> > 100644
> > --- a/include/linux/memory_hotplug.h
> > +++ b/include/linux/memory_hotplug.h
> > @@ -337,6 +337,11 @@ static inline void __remove_memory(int nid, u64
> > start, u64 size) {}  extern void set_zone_contiguous(struct zone
> > *zone);  extern void clear_zone_contiguous(struct zone *zone);
> >
> > +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
> > +int check_hotplug_memory_addressable(unsigned long pfn,
> > +unsigned long nr_pages); #endif
> > +/* CONFIG_MEMORY_HOTPLUG_SPARSE */
> 
> Let's move this to include/linux/memory.h with the other
> CONFIG_MEMORY_HOTPLUG_SPARSE declarations, and add a dummy
> implementation for the CONFIG_MEMORY_HOTPLUG_SPARSE=n case.
> 
> Also, this patch can be squashed with the next one, no need for it to be
> stand alone.
> 

Ok

> 
> > +
> >  extern void __ref free_area_init_core_hotplug(int nid);  extern int
> > __add_memory(int nid, u64 start, u64 size);  extern int add_memory(int
> > nid, u64 start, u64 size); diff --git a/mm/memory_hotplug.c
> > b/mm/memory_hotplug.c index 0a54ffac8c68..14945f033594 100644
> > --- a/mm/memory_hotplug.c
> > +++ b/mm/memory_hotplug.c
> > @@ -276,8 +276,8 @@ static int check_pfn_span(unsigned long pfn,
> unsigned long nr_pages,
> > return 0;
> >  }
> >
> > -static int check_hotplug_memory_addressable(unsigned long pfn,
> > -   unsigned long nr_pages)
> > +int check_hotplug_memory_addressable(unsigned long pfn,
> > +unsigned long nr_pages)
> >  {
> > const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
> >
> > --
> > 2.24.1
> >

-- 
Alastair D'Silva   mob: 0423 762 819
skype: alastair_dsilva msn: alast...@d-silva.org
blog: http://alastair.d-silva.orgTwitter: @EvilDeece
 




Re: [RFC PATCH 3/4] powerpc ppc-opcode: move ppc instuction encoding from test_emulate_step

2020-04-01 Thread Michael Ellerman
"Naveen N. Rao"  writes:
> Balamuruhan S wrote:
>> A few ppc instructions are encoded in test_emulate_step.c; consolidate them in
>> ppc-opcode.h and fix the redefinition errors in bpf_jit caused by this
>> consolidation.
>> Reuse the macros from ppc-opcode.h.
...
>> diff --git a/arch/powerpc/net/bpf_jit32.h b/arch/powerpc/net/bpf_jit32.h
>> index 4ec2a9f14f84..8a9f16a7262e 100644
>> --- a/arch/powerpc/net/bpf_jit32.h
>> +++ b/arch/powerpc/net/bpf_jit32.h
>> @@ -76,13 +76,13 @@ DECLARE_LOAD_FUNC(sk_load_byte_msh);
>>  else {  PPC_ADDIS(r, base, IMM_HA(i));\
>>  PPC_LBZ(r, r, IMM_L(i)); } } while(0)
>> 
>> -#define PPC_LD_OFFS(r, base, i) do { if ((i) < 32768) PPC_LD(r, base, i);   
>>   \
>> +#define _OFFS(r, base, i) do { if ((i) < 32768) EMIT(PPC_ENCODE_LD(r, base, 
>> i)); \
>  ^
> Should be PPC_LD_OFFS. For the next version, please also build ppc32 and 
> booke codebase to confirm that your changes in those areas are fine.
>
> PPC_ENCODE_* also looks quite verbose, so perhaps PPC_ENC_* might be 
> better. Otherwise, this patchset looks good to me and should help reuse 
> some of those macros, especially from the eBPF codebase.
>
> Michael,
> Can you let us know if this looks ok to you? Based on your feedback, we 
> will also update the eBPF codebase.

I didn't really like the first patch which does the mass renaming. It
creates a huge amount of churn.

I think I'd be happier if this series just did what it needs, and then
maybe at the end there's a patch to update all the existing names, which
I may or may not take.

As far as the naming, currently we have:

PPC_INST_FOO - just the opcode

PPC_FOO(x) - macro to encode the opcode with x and (usually) also emit a
.long and stringify.

And you need an in-between that gives you the full instruction but
without the .long and stringify, right?

So how about PPC_RAW_FOO() for just the numeric value, without the .long
and stringify.

We also seem to have a lot of PPC_INST_FOO's that are only ever used in
the PPC_INST macro. I'm inclined to fold those into the PPC_INST macro,
to avoid people accidentally using the PPC_INST version when they don't
mean to. But that's a separate issue.

cheers


Re: [PATCH v4 00/25] Add support for OpenCAPI Persistent Memory devices

2020-04-01 Thread Oliver O'Halloran
On Thu, Apr 2, 2020 at 2:42 PM Michael Ellerman  wrote:
>
> "Alastair D'Silva"  writes:
> >> -Original Message-
> >> From: Dan Williams 
> >>
> >> On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva 
> >> wrote:
> >> >
> >> > *snip*
> >> Are OPAL calls similar to ACPI DSMs? I.e. methods for the OS to invoke
> >> platform firmware services? What's Skiboot?
> >>
> >
> > Yes, OPAL is the interface to firmware for POWER. Skiboot is the 
> > open-source (and only) implementation of OPAL.
>
>   https://github.com/open-power/skiboot
>
> In particular the tokens for calls are defined here:
>
>   https://github.com/open-power/skiboot/blob/master/include/opal-api.h#L220
>
> And you can grep for the token to find the implementation:
>
>   https://github.com/open-power/skiboot/blob/master/hw/npu2-opencapi.c#L2328

I'm not sure I'd encourage anyone to read npu2-opencapi.c. I find it
hard enough to follow even with access to the workbooks.

There's an OPAL call API reference here:
http://open-power.github.io/skiboot/doc/opal-api/index.html

Oliver


RE: [PATCH v4 00/25] Add support for OpenCAPI Persistent Memory devices

2020-04-01 Thread Michael Ellerman
"Alastair D'Silva"  writes:
>> -Original Message-
>> From: Dan Williams 
>> 
>> On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva 
>> wrote:
>> >
>> > This series adds support for OpenCAPI Persistent Memory devices on
>> > bare metal (arch/powernv), exposing them as nvdimms so that we can
>> > make use of the existing infrastructure. There already exists a driver
>> > for the same devices abstracted through PowerVM (arch/pseries):
>> > arch/powerpc/platforms/pseries/papr_scm.c
>> >
>> > These devices are connected via OpenCAPI, and present as LPC (Lowest
>> Point of Coherency) memory to the system. Practically, that means that
>> memory on these cards could be treated as conventional, cache-coherent
>> memory.
>> >
>> > Since the devices are connected via OpenCAPI, they are not enumerated
>> via ACPI. Instead, OpenCAPI links present as pseudo-PCI bridges, with
>> devices below them.
>> >
>> > This series introduces a driver that exposes the memory on these cards as
>> nvdimms, with each card getting its own bus. This is somewhat complicated
>> by the fact that the cards do not have out of band persistent storage for
>> metadata, so 1 SECTION_SIZE's (see SPARSEMEM) worth of storage is carved
>> out of the top of the card storage to implement the ndctl_config_* calls.
>> 
>> Is it really tied to section-size? Can't that change based on the configured
>> page-size? It's not clear to me why that would be the choice, but I'll dig 
>> into
>> the implementation.
>> 
>
> I had tried using PAGE_SIZE, but ran into problems carving off just 1 page 
> and handing it to the kernel, while leaving the rest as pmem. That was a 
> while ago though, so maybe I should retry it.
>
>> > The driver is not responsible for configuring the NPU (NVLink Processing
>> Unit) BARs to map the LPC memory from the card into the system's physical
>> address space, instead, it requests this to be done via OPAL calls (typically
>> implemented by Skiboot).
>> 
>> Are OPAL calls similar to ACPI DSMs? I.e. methods for the OS to invoke
>> platform firmware services? What's Skiboot?
>> 
>
> Yes, OPAL is the interface to firmware for POWER. Skiboot is the open-source 
> (and only) implementation of OPAL.

  https://github.com/open-power/skiboot

In particular the tokens for calls are defined here:

  https://github.com/open-power/skiboot/blob/master/include/opal-api.h#L220

And you can grep for the token to find the implementation:

  https://github.com/open-power/skiboot/blob/master/hw/npu2-opencapi.c#L2328


cheers


Re: [PATCH v5 1/4] powerpc/papr_scm: Fetch nvdimm health information from PHYP

2020-04-01 Thread Dan Williams
On Tue, Mar 31, 2020 at 7:33 AM Vaibhav Jain  wrote:
>
> Implement support for fetching nvdimm health information via
> H_SCM_HEALTH hcall as documented in Ref[1]. The hcall returns a pair
> of 64-bit big-endian integers which are then stored in 'struct
> papr_scm_priv' and subsequently partially exposed to user-space via
> newly introduced dimm specific attribute 'papr_flags'. Also a new asm
> header named 'papr-scm.h' is added that describes the interface
> between PHYP and guest kernel.
>
> Following flags are reported via 'papr_flags' sysfs attribute contents
> of which are space separated string flags indicating various nvdimm
> states:
>
>  * "not_armed"  : Indicating that nvdimm contents wont survive a power
>cycle.

s/wont/will not/

>  * "save_fail"  : Indicating that nvdimm contents couldn't be flushed
>during last shutdown event.

In the nfit definition this description is "flush_fail". The
"save_fail" flag was specific to hybrid devices that don't have
persistent media and instead scuttle away data from DRAM to flash on
power-failure.

>  * "restore_fail": Indicating that nvdimm contents couldn't be restored
>during dimm initialization.
>  * "encrypted"  : Dimm contents are encrypted.

This does not seem like a health flag to me, have you considered the
libnvdimm security interface for this indicator?

>  * "smart_notify": There is health event for the nvdimm.

Are you also going to signal the sysfs attribute when this event happens?

>  * "scrubbed"   : Indicating that contents of the nvdimm have been
>scrubbed.

This one seems odd to me: what does it mean if it is not set? What does
it mean if a new scrub has been launched? Basically, is there value in
exposing this state?

>  * "locked" : Indicating that nvdimm contents cant be modified
>until next power cycle.

There is the generic NDD_LOCKED flag, can you use that? ...and in
general I wonder if we should try to unify all the common papr_scm and
nfit health flags in a generic location. It will already be the case
that ndctl needs to look somewhere papr-specific for this data; maybe it
all should have been generic from the beginning.


In any event, can you also add this content to a new
Documentation/ABI/testing/sysfs-bus-papr? See sysfs-bus-nfit for
comparison.

>
> [1]: commit 58b278f568f0 ("powerpc: Provide initial documentation for
> PAPR hcalls")
>
> Signed-off-by: Vaibhav Jain 
> ---
> Changelog:
>
> v4..v5 : None
>
> v3..v4 : None
>
> v2..v3 : Removed PAPR_SCM_DIMM_HEALTH_NON_CRITICAL as a condition for
>  NVDIMM unarmed [Aneesh]
>
> v1..v2 : New patch in the series.
> ---
>  arch/powerpc/include/asm/papr_scm.h   |  48 ++
>  arch/powerpc/platforms/pseries/papr_scm.c | 105 +-
>  2 files changed, 151 insertions(+), 2 deletions(-)
>  create mode 100644 arch/powerpc/include/asm/papr_scm.h
>
> diff --git a/arch/powerpc/include/asm/papr_scm.h 
> b/arch/powerpc/include/asm/papr_scm.h
> new file mode 100644
> index ..868d3360f56a
> --- /dev/null
> +++ b/arch/powerpc/include/asm/papr_scm.h
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * Structures and defines needed to manage nvdimms for spapr guests.
> + */
> +#ifndef _ASM_POWERPC_PAPR_SCM_H_
> +#define _ASM_POWERPC_PAPR_SCM_H_
> +
> +#include 
> +#include 
> +
> +/* DIMM health bitmap indicators */
> +/* SCM device is unable to persist memory contents */
> +#define PAPR_SCM_DIMM_UNARMED  PPC_BIT(0)
> +/* SCM device failed to persist memory contents */
> +#define PAPR_SCM_DIMM_SHUTDOWN_DIRTY   PPC_BIT(1)
> +/* SCM device contents are persisted from previous IPL */
> +#define PAPR_SCM_DIMM_SHUTDOWN_CLEAN   PPC_BIT(2)
> +/* SCM device contents are not persisted from previous IPL */
> +#define PAPR_SCM_DIMM_EMPTYPPC_BIT(3)
> +/* SCM device memory life remaining is critically low */
> +#define PAPR_SCM_DIMM_HEALTH_CRITICAL  PPC_BIT(4)
> +/* SCM device will be garded off next IPL due to failure */
> +#define PAPR_SCM_DIMM_HEALTH_FATAL PPC_BIT(5)
> +/* SCM contents cannot persist due to current platform health status */
> +#define PAPR_SCM_DIMM_HEALTH_UNHEALTHY PPC_BIT(6)
> +/* SCM device is unable to persist memory contents in certain conditions */
> +#define PAPR_SCM_DIMM_HEALTH_NON_CRITICAL  PPC_BIT(7)
> +/* SCM device is encrypted */
> +#define PAPR_SCM_DIMM_ENCRYPTEDPPC_BIT(8)
> +/* SCM device has been scrubbed and locked */
> +#define PAPR_SCM_DIMM_SCRUBBED_AND_LOCKED  PPC_BIT(9)
> +
> +/* Bits status indicators for health bitmap indicating unarmed dimm */
> +#define PAPR_SCM_DIMM_UNARMED_MASK (PAPR_SCM_DIMM_UNARMED |\
> +   PAPR_SCM_DIMM_HEALTH_UNHEALTHY)
> +
> +/* Bits status indicators for health bitmap indicating unflushed dimm */
> +#define 

Re: [PATCH v4 19/25] nvdimm/ocxl: Forward events to userspace

2020-04-01 Thread Dan Williams
On Tue, Mar 31, 2020 at 1:59 AM Alastair D'Silva  wrote:
>
> Some of the interrupts that the card generates are better handled
> by the userspace daemon, in particular:
> Controller Hardware/Firmware Fatal
> Controller Dump Available
> Error Log available
>
> This patch allows a userspace application to register an eventfd with
> the driver via SCM_IOCTL_EVENTFD to receive notifications of these
> interrupts.
>
> Userspace can then identify what events have occurred by calling
> SCM_IOCTL_EVENT_CHECK and checking against the SCM_IOCTL_EVENT_FOO
> masks.

The number of new ioctls in this driver is too high; it seems much of
this data can be exported via sysfs attributes, which are more
maintainable than ioctls. sysfs also has the ability to signal
events on sysfs attributes, see sysfs_notify_dirent.

Can you step back and review the ABI exposure of the driver and what
can be moved to sysfs? If you need to have bus specific attributes
ordered underneath the libnvdimm generic attributes you can create a
sysfs attribute subdirectory.

In general a roadmap document of all the proposed ABI is needed to
make sure it is both sufficient and necessary. See the libnvdimm
document that introduced the initial libnvdimm ABI:

https://www.kernel.org/doc/Documentation/nvdimm/nvdimm.txt

>
> Signed-off-by: Alastair D'Silva 
> ---
>  drivers/nvdimm/ocxl/main.c | 220 +
>  drivers/nvdimm/ocxl/ocxlpmem.h |   4 +
>  include/uapi/nvdimm/ocxlpmem.h |  12 ++
>  3 files changed, 236 insertions(+)
>
> diff --git a/drivers/nvdimm/ocxl/main.c b/drivers/nvdimm/ocxl/main.c
> index 0040fc09cceb..cb6cdc9eb899 100644
> --- a/drivers/nvdimm/ocxl/main.c
> +++ b/drivers/nvdimm/ocxl/main.c
> @@ -10,6 +10,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -301,8 +302,19 @@ static void free_ocxlpmem(struct ocxlpmem *ocxlpmem)
>  {
> int rc;
>
> +   // Disable doorbells
> +   (void)ocxl_global_mmio_set64(ocxlpmem->ocxl_afu, GLOBAL_MMIO_CHIEC,
> +OCXL_LITTLE_ENDIAN,
> +GLOBAL_MMIO_CHI_ALL);
> +
> free_minor(ocxlpmem);
>
> +   if (ocxlpmem->irq_addr[1])
> +   iounmap(ocxlpmem->irq_addr[1]);
> +
> +   if (ocxlpmem->irq_addr[0])
> +   iounmap(ocxlpmem->irq_addr[0]);
> +
> if (ocxlpmem->ocxl_context) {
> rc = ocxl_context_detach(ocxlpmem->ocxl_context);
> if (rc == -EBUSY)
> @@ -398,6 +410,11 @@ static int file_release(struct inode *inode, struct file 
> *file)
>  {
> struct ocxlpmem *ocxlpmem = file->private_data;
>
> +   if (ocxlpmem->ev_ctx) {
> +   eventfd_ctx_put(ocxlpmem->ev_ctx);
> +   ocxlpmem->ev_ctx = NULL;
> +   }
> +
> ocxlpmem_put(ocxlpmem);
> return 0;
>  }
> @@ -928,6 +945,52 @@ static int ioctl_controller_stats(struct ocxlpmem 
> *ocxlpmem,
> return rc;
>  }
>
> +static int ioctl_eventfd(struct ocxlpmem *ocxlpmem,
> +struct ioctl_ocxlpmem_eventfd __user *uarg)
> +{
> +   struct ioctl_ocxlpmem_eventfd args;
> +
> +   if (copy_from_user(&args, uarg, sizeof(args)))
> +   return -EFAULT;
> +
> +   if (ocxlpmem->ev_ctx)
> +   return -EBUSY;
> +
> +   ocxlpmem->ev_ctx = eventfd_ctx_fdget(args.eventfd);
> +   if (IS_ERR(ocxlpmem->ev_ctx))
> +   return PTR_ERR(ocxlpmem->ev_ctx);
> +
> +   return 0;
> +}
> +
> +static int ioctl_event_check(struct ocxlpmem *ocxlpmem, u64 __user *uarg)
> +{
> +   u64 val = 0;
> +   int rc;
> +   u64 chi = 0;
> +
> +   rc = ocxlpmem_chi(ocxlpmem, &chi);
> +   if (rc < 0)
> +   return rc;
> +
> +   if (chi & GLOBAL_MMIO_CHI_ELA)
> +   val |= IOCTL_OCXLPMEM_EVENT_ERROR_LOG_AVAILABLE;
> +
> +   if (chi & GLOBAL_MMIO_CHI_CDA)
> +   val |= IOCTL_OCXLPMEM_EVENT_CONTROLLER_DUMP_AVAILABLE;
> +
> +   if (chi & GLOBAL_MMIO_CHI_CFFS)
> +   val |= IOCTL_OCXLPMEM_EVENT_FIRMWARE_FATAL;
> +
> +   if (chi & GLOBAL_MMIO_CHI_CHFS)
> +   val |= IOCTL_OCXLPMEM_EVENT_HARDWARE_FATAL;
> +
> +   if (copy_to_user((u64 __user *)uarg, &val, sizeof(val)))
> +   return -EFAULT;
> +
> +   return rc;
> +}
> +
>  static long file_ioctl(struct file *file, unsigned int cmd, unsigned long 
> args)
>  {
> struct ocxlpmem *ocxlpmem = file->private_data;
> @@ -956,6 +1019,15 @@ static long file_ioctl(struct file *file, unsigned int 
> cmd, unsigned long args)
> rc = ioctl_controller_stats(ocxlpmem,
> (struct 
> ioctl_ocxlpmem_controller_stats __user *)args);
> break;
> +
> +   case IOCTL_OCXLPMEM_EVENTFD:
> +   rc = ioctl_eventfd(ocxlpmem,
> +  (struct ioctl_ocxlpmem_eventfd __user 
> *)args);
> +   break;
> +
> + 

Re: [PATCH v4 08/25] ocxl: Emit a log message showing how much LPC memory was detected

2020-04-01 Thread Joe Perches
On Wed, 2020-04-01 at 01:49 -0700, Dan Williams wrote:
> On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva  
> wrote:
> > This patch emits a message showing how much LPC memory & special purpose
> > memory was detected on an OCXL device.
[]
> > diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
[]
> > @@ -568,6 +568,10 @@ static int read_afu_lpc_memory_info(struct pci_dev 
> > *dev,
> > afu->special_purpose_mem_size =
> > total_mem_size - lpc_mem_size;
> > }
> > +
> > +   dev_info(&dev->dev, "Probed LPC memory of %#llx bytes and special 
> > purpose memory of %#llx bytes\n",
> > +afu->lpc_mem_size, afu->special_purpose_mem_size);
> 
> A patch for a single log message is too fine grained for my taste,
> let's squash this into another patch in the series.

Is the granularity of lpc_mem_size actually bytes?
Might this be better reported as KiB or something, using helper functions?

Maybe something like:

/*
 * si_val() - reduce a count to its mantissa in the largest exact 1024-based unit
 * @val: the value to scale (e.g. a size in bytes)
 *
 * Divides @val by 1024 for as long as it is an exact multiple of 1024 and a
 * larger suffix remains in "BKMGTPE" (so at most six times). Companion to
 * si_type(), which reports the suffix reached by the same walk.
 *
 * Return: the scaled value; 0 is returned unchanged.
 */
unsigned long si_val(unsigned long val)
{
	static const char suffixes[] = "BKMGTPE";
	unsigned int idx = 0;

	for (;;) {
		/* Stop once the value is no longer an exact multiple of 1024... */
		if (val & 1023)
			break;
		/* ...or when there is no larger suffix left to move to. */
		if (suffixes[idx + 1] == '\0')
			break;

		val >>= 10;
		idx++;
	}

	return val;
}

/*
 * si_type() - pick the unit suffix matching si_val() for a given value
 * @val: the value to classify (e.g. a size in bytes)
 *
 * Walks the suffix table "BKMGTPE" one step for every exact factor of 1024
 * in @val, stopping at the last suffix.
 *
 * Return: the suffix character ('B', 'K', 'M', 'G', 'T', 'P' or 'E').
 * A value of 0 is reported as 'B'.
 */
char si_type(unsigned long val)
{
	static const char units[] = "BKMGTPE";
	const char *unit = units;

	/*
	 * Zero is a multiple of every power of 1024, so the loop below would
	 * run to the end of the table and report 'E'. Report plain bytes.
	 */
	if (!val)
		return *unit;

	while (!(val & 1023) && unit[1]) {
		val >>= 10;
		unit++;
	}

	return *unit;
}

so this could be something like:

   dev_info(&dev->dev, "Probed LPC memory of %#llu%c and special purpose 
memory of %#llu%c\n",
si_val(afu->lpc_mem_size), si_type(afu->lpc_mem_size),
si_val(afu->special_purpose_mem_size), 
si_type(afu->special_purpose_mem_size));





Re: [PATCH v4 15/25] nvdimm/ocxl: Register a character device for userspace to interact with

2020-04-01 Thread Dan Williams
On Sun, Mar 29, 2020 at 10:53 PM Alastair D'Silva  wrote:
>
> This patch introduces a character device (/dev/ocxlpmemX) which further
> patches will use to interact with userspace, such as error logs,
> controller stats and card debug functionality.

This was asked earlier, but I'll reiterate, I do not see what
justifies an ocxlpmemX private device ABI vs routing through the
existing generic character ndbusX and nmemX character devices.

>
> Signed-off-by: Alastair D'Silva 
> ---
>  drivers/nvdimm/ocxl/main.c | 117 -
>  drivers/nvdimm/ocxl/ocxlpmem.h |   2 +
>  2 files changed, 117 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/nvdimm/ocxl/main.c b/drivers/nvdimm/ocxl/main.c
> index 8db573036423..9b85fcd3f1c9 100644
> --- a/drivers/nvdimm/ocxl/main.c
> +++ b/drivers/nvdimm/ocxl/main.c
> @@ -10,6 +10,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include "ocxlpmem.h"
> @@ -356,6 +357,67 @@ static int ocxlpmem_register(struct ocxlpmem *ocxlpmem)
> return device_register(>dev);
>  }
>
> +static void ocxlpmem_put(struct ocxlpmem *ocxlpmem)
> +{
> +   put_device(>dev);
> +}
> +
> +static struct ocxlpmem *ocxlpmem_get(struct ocxlpmem *ocxlpmem)
> +{
> +   return (!get_device(>dev)) ? NULL : ocxlpmem;
> +}
> +
> +static struct ocxlpmem *find_and_get_ocxlpmem(dev_t devno)
> +{
> +   struct ocxlpmem *ocxlpmem;
> +   int minor = MINOR(devno);
> +
> +   mutex_lock(_idr_lock);
> +   ocxlpmem = idr_find(_idr, minor);
> +   if (ocxlpmem)
> +   ocxlpmem_get(ocxlpmem);
> +   mutex_unlock(_idr_lock);
> +
> +   return ocxlpmem;
> +}
> +
> +static int file_open(struct inode *inode, struct file *file)
> +{
> +   struct ocxlpmem *ocxlpmem;
> +
> +   ocxlpmem = find_and_get_ocxlpmem(inode->i_rdev);
> +   if (!ocxlpmem)
> +   return -ENODEV;
> +
> +   file->private_data = ocxlpmem;
> +   return 0;
> +}
> +
> +static int file_release(struct inode *inode, struct file *file)
> +{
> +   struct ocxlpmem *ocxlpmem = file->private_data;
> +
> +   ocxlpmem_put(ocxlpmem);
> +   return 0;
> +}
> +
> +static const struct file_operations fops = {
> +   .owner  = THIS_MODULE,
> +   .open   = file_open,
> +   .release= file_release,
> +};
> +
> +/**
> + * create_cdev() - Create the chardev in /dev for the device
> + * @ocxlpmem: the SCM metadata
> + * Return: 0 on success, negative on failure
> + */
> +static int create_cdev(struct ocxlpmem *ocxlpmem)
> +{
> +   cdev_init(>cdev, );
> +   return cdev_add(>cdev, ocxlpmem->dev.devt, 1);
> +}
> +
>  /**
>   * ocxlpmem_remove() - Free an OpenCAPI persistent memory device
>   * @pdev: the PCI device information struct
> @@ -376,6 +438,13 @@ static void remove(struct pci_dev *pdev)
> if (ocxlpmem->nvdimm_bus)
> nvdimm_bus_unregister(ocxlpmem->nvdimm_bus);
>
> +   /*
> +* Remove the cdev early to prevent a race against userspace
> +* via the char dev
> +*/
> +   if (ocxlpmem->cdev.owner)
> +   cdev_del(>cdev);
> +
> device_unregister(>dev);
> }
>  }
> @@ -527,11 +596,18 @@ static int probe(struct pci_dev *pdev, const struct 
> pci_device_id *ent)
> goto err;
> }
>
> -   if (setup_command_metadata(ocxlpmem)) {
> +   rc = setup_command_metadata(ocxlpmem);
> +   if (rc) {
> dev_err(>dev, "Could not read command metadata\n");
> goto err;
> }
>
> +   rc = create_cdev(ocxlpmem);
> +   if (rc) {
> +   dev_err(>dev, "Could not create character device\n");
> +   goto err;
> +   }
> +
> elapsed = 0;
> timeout = ocxlpmem->readiness_timeout +
>   ocxlpmem->memory_available_timeout;
> @@ -599,6 +675,36 @@ static struct pci_driver pci_driver = {
> .shutdown = remove,
>  };
>
> +static int file_init(void)
> +{
> +   int rc;
> +
> +   rc = alloc_chrdev_region(_dev, 0, NUM_MINORS, "ocxlpmem");
> +   if (rc) {
> +   idr_destroy(_idr);
> +   pr_err("Unable to allocate OpenCAPI persistent memory major 
> number: %d\n",
> +  rc);
> +   return rc;
> +   }
> +
> +   ocxlpmem_class = class_create(THIS_MODULE, "ocxlpmem");
> +   if (IS_ERR(ocxlpmem_class)) {
> +   idr_destroy(_idr);
> +   pr_err("Unable to create ocxlpmem class\n");
> +   unregister_chrdev_region(ocxlpmem_dev, NUM_MINORS);
> +   return PTR_ERR(ocxlpmem_class);
> +   }
> +
> +   return 0;
> +}
> +
> +static void file_exit(void)
> +{
> +   class_destroy(ocxlpmem_class);
> +   unregister_chrdev_region(ocxlpmem_dev, NUM_MINORS);
> +   idr_destroy(_idr);
> +}
> +
>  static int __init 

Re: [PATCH v4 13/25] nvdimm/ocxl: Read the capability registers & wait for device ready

2020-04-01 Thread Dan Williams
On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva  wrote:
>
> This patch reads timeouts & firmware version from the controller, and
> uses those timeouts to wait for the controller to report that it is ready
> before handing the memory over to libnvdimm.
>
> Signed-off-by: Alastair D'Silva 
> ---
>  drivers/nvdimm/ocxl/Makefile|  2 +-
>  drivers/nvdimm/ocxl/main.c  | 85 +
>  drivers/nvdimm/ocxl/ocxlpmem.h  | 29 +
>  drivers/nvdimm/ocxl/ocxlpmem_internal.c | 19 ++
>  4 files changed, 134 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/nvdimm/ocxl/ocxlpmem_internal.c
>
> diff --git a/drivers/nvdimm/ocxl/Makefile b/drivers/nvdimm/ocxl/Makefile
> index e0e8ade1987a..bab97082e062 100644
> --- a/drivers/nvdimm/ocxl/Makefile
> +++ b/drivers/nvdimm/ocxl/Makefile
> @@ -4,4 +4,4 @@ ccflags-$(CONFIG_PPC_WERROR)+= -Werror
>
>  obj-$(CONFIG_OCXL_PMEM) += ocxlpmem.o
>
> -ocxlpmem-y := main.o
> \ No newline at end of file
> +ocxlpmem-y := main.o ocxlpmem_internal.o
> diff --git a/drivers/nvdimm/ocxl/main.c b/drivers/nvdimm/ocxl/main.c
> index c0066fedf9cc..be76acd33d74 100644
> --- a/drivers/nvdimm/ocxl/main.c
> +++ b/drivers/nvdimm/ocxl/main.c
> @@ -8,6 +8,7 @@
>
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -327,6 +328,50 @@ static void remove(struct pci_dev *pdev)
> }
>  }
>
> +/**
> + * read_device_metadata() - Retrieve config information from the AFU and 
> save it for future use
> + * @ocxlpmem: the device metadata
> + * Return: 0 on success, negative on failure
> + */
> +static int read_device_metadata(struct ocxlpmem *ocxlpmem)
> +{
> +   u64 val;
> +   int rc;
> +
> +   rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu, GLOBAL_MMIO_CCAP0,
> +OCXL_LITTLE_ENDIAN, &val);

This calling convention would seem to defeat the ability of sparse to
validate endian correctness. That's independent of this series, but I
wonder how does someone review why this argument is sometimes
OCXL_LITTLE_ENDIAN and sometimes OCXL_HOST_ENDIAN?

> +   if (rc)
> +   return rc;
> +
> +   ocxlpmem->scm_revision = val & 0x;
> +   ocxlpmem->read_latency = (val >> 32) & 0x;
> +   ocxlpmem->readiness_timeout = (val >> 48) & 0x0F;
> +   ocxlpmem->memory_available_timeout = val >> 52;

Maybe some macros to parse out these register fields?

> +
> +   rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu, GLOBAL_MMIO_CCAP1,
> +OCXL_LITTLE_ENDIAN, &val);
> +   if (rc)
> +   return rc;
> +
> +   ocxlpmem->max_controller_dump_size = val & 0x;
> +
> +   // Extract firmware version text
> +   rc = ocxl_global_mmio_read64(ocxlpmem->ocxl_afu, GLOBAL_MMIO_FWVER,
> +OCXL_HOST_ENDIAN,
> +(u64 *)ocxlpmem->fw_version);
> +   if (rc)
> +   return rc;
> +
> +   ocxlpmem->fw_version[8] = '\0';
> +
> +   dev_info(>dev,
> +"Firmware version '%s' SCM revision %d:%d\n",
> +ocxlpmem->fw_version, ocxlpmem->scm_revision >> 4,
> +ocxlpmem->scm_revision & 0x0F);

Does the driver need to be chatty here. If this data is relevant
should it appear in sysfs by default?

> +
> +   return 0;
> +}
> +
>  /**
>   * probe_function0() - Set up function 0 for an OpenCAPI persistent memory 
> device
>   * This is important as it enables templates higher than 0 across all other
> @@ -359,6 +404,9 @@ static int probe(struct pci_dev *pdev, const struct 
> pci_device_id *ent)
>  {
> struct ocxlpmem *ocxlpmem;
> int rc;
> +   u64 chi;
> +   u16 elapsed, timeout;
> +   bool ready = false;
>
> if (PCI_FUNC(pdev->devfn) == 0)
> return probe_function0(pdev);
> @@ -413,6 +461,43 @@ static int probe(struct pci_dev *pdev, const struct 
> pci_device_id *ent)
> goto err;
> }
>
> +   rc = read_device_metadata(ocxlpmem);
> +   if (rc) {
> +   dev_err(>dev, "Could not read metadata\n");
> +   goto err;
> +   }
> +
> +   elapsed = 0;
> +   timeout = ocxlpmem->readiness_timeout +
> + ocxlpmem->memory_available_timeout;
> +
> +   while (true) {
> +   rc = ocxlpmem_chi(ocxlpmem, &chi);
> +   ready = (chi & (GLOBAL_MMIO_CHI_CRDY | GLOBAL_MMIO_CHI_MA)) ==
> +   (GLOBAL_MMIO_CHI_CRDY | GLOBAL_MMIO_CHI_MA);
> +
> +   if (ready)
> +   break;
> +
> +   if (elapsed++ > timeout) {
> +   dev_err(>dev,
> +   "OpenCAPI Persistent Memory ready 
> timeout.\n");
> +
> +   if (!(chi & GLOBAL_MMIO_CHI_CRDY))
> +   dev_err(>dev,
> +   "controller is not 

Re: [PATCH v3 1/1] ppc/crash: Reset spinlocks during crash

2020-04-01 Thread Leonardo Bras
Hello Peter, 

On Wed, 2020-04-01 at 11:26 +0200, Peter Zijlstra wrote:
> You might want to add a note to your asm/spinlock.h that you rely on
> spin_unlock() unconditionally clearing a lock.
> 
> This isn't naturally true for all lock implementations. Consider ticket
> locks, doing a surplus unlock will wreck your lock state in that case.
> So anybody poking at the powerpc spinlock implementation had better know
> you rely on this.

Good idea. I will add this to my changes and generate a v4.

Thank you,


signature.asc
Description: This is a digitally signed message part


Re: [PATCH v4 03/16] powerpc: Use a datatype for instructions

2020-04-01 Thread Jordan Niethe
On Wed, Apr 1, 2020 at 9:32 PM Balamuruhan S  wrote:
>
> On Fri, 2020-03-20 at 16:17 +1100, Jordan Niethe wrote:
> > Currently unsigned ints are used to represent instructions on powerpc.
> > This has worked well as instructions have always been 4 byte words.
> > However, a future ISA version will introduce some changes to
> > instructions that mean this scheme will no longer work as well. This
> > change is Prefixed Instructions. A prefixed instruction is made up of a
> > word prefix followed by a word suffix to make an 8 byte double word
> > instruction. No matter the endianness of the system the prefix always
> > comes first. Prefixed instructions are only planned for powerpc64.
> >
> > Introduce a ppc_inst type to represent both prefixed and word
> > instructions on powerpc64 while keeping it possible to exclusively have
> > word instructions on powerpc32. A later patch will expand the type to
> > include prefixed instructions but for now just typedef it to a u32.
> >
> > Later patches will introduce helper functions and macros for
> > manipulating the instructions so that powerpc64 and powerpc32 might
> > maintain separate type definitions.
> >
> > Signed-off-by: Jordan Niethe 
> > ---
> >  arch/powerpc/include/asm/code-patching.h | 31 +--
> >  arch/powerpc/include/asm/inst.h  | 53 +++
> >  arch/powerpc/include/asm/sstep.h |  5 +-
> >  arch/powerpc/kernel/align.c  |  2 +-
> >  arch/powerpc/kernel/hw_breakpoint.c  |  3 +-
> >  arch/powerpc/kernel/kprobes.c|  2 +-
> >  arch/powerpc/kernel/mce_power.c  |  5 +-
> >  arch/powerpc/kernel/optprobes.c  | 10 ++--
> >  arch/powerpc/kernel/trace/ftrace.c   | 66 
> >  arch/powerpc/kvm/emulate_loadstore.c |  1 +
> >  arch/powerpc/lib/code-patching.c | 54 +--
> >  arch/powerpc/lib/sstep.c |  4 +-
> >  arch/powerpc/lib/test_emulate_step.c |  9 ++--
> >  arch/powerpc/xmon/xmon.c | 12 ++---
> >  14 files changed, 160 insertions(+), 97 deletions(-)
> >  create mode 100644 arch/powerpc/include/asm/inst.h
> >
> > diff --git a/arch/powerpc/include/asm/code-patching.h
> > b/arch/powerpc/include/asm/code-patching.h
> > index 898b54262881..cb5106f92d67 100644
> > --- a/arch/powerpc/include/asm/code-patching.h
> > +++ b/arch/powerpc/include/asm/code-patching.h
> > @@ -11,6 +11,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >
> >  /* Flags for create_branch:
> >   * "b"   == create_branch(addr, target, 0);
> > @@ -22,27 +23,27 @@
> >  #define BRANCH_ABSOLUTE  0x2
> >
> >  bool is_offset_in_branch_range(long offset);
> > -unsigned int create_branch(const unsigned int *addr,
> > +ppc_inst create_branch(const ppc_inst *addr,
> >  unsigned long target, int flags);
> > -unsigned int create_cond_branch(const unsigned int *addr,
> > +unsigned int create_cond_branch(const ppc_inst *addr,
> >   unsigned long target, int flags);
> > -int patch_branch(unsigned int *addr, unsigned long target, int flags);
> > -int patch_instruction(unsigned int *addr, unsigned int instr);
> > -int raw_patch_instruction(unsigned int *addr, unsigned int instr);
> > +int patch_branch(ppc_inst *addr, unsigned long target, int flags);
> > +int patch_instruction(ppc_inst *addr, ppc_inst instr);
>
> we need to handle this change for its user in epapr_paravirt.c,
Thanks, good catch.
>
> arch/powerpc/kernel/epapr_paravirt.c: In function 'early_init_dt_scan_epapr':
> arch/powerpc/kernel/epapr_paravirt.c:40:48: error: incompatible type for
> argument 2 of 'patch_instruction'
>40 |   patch_instruction(epapr_hypercall_start + i, inst);
>   |^~~~
>   ||
>   |u32 {aka unsigned int}
> In file included from arch/powerpc/kernel/epapr_paravirt.c:12:
> ./arch/powerpc/include/asm/code-patching.h:31:44: note: expected 'ppc_inst'
> {aka 'struct ppc_inst'} but argument is of type 'u32' {aka 'unsigned int'}
>31 | int patch_instruction(void *addr, ppc_inst instr);
>   |   ~^
> make[2]: *** [scripts/Makefile.build:268: 
> arch/powerpc/kernel/epapr_paravirt.o]
> Error 1
> make[1]: *** [scripts/Makefile.build:505: arch/powerpc/kernel] Error 2
> make: *** [Makefile:1683: arch/powerpc] Error 2
>
>
> -- Bala
>
> > +int raw_patch_instruction(ppc_inst *addr, ppc_inst instr);
> >
> >  static inline unsigned long patch_site_addr(s32 *site)
> >  {
> >   return (unsigned long)site + *site;
> >  }
> >
> > -static inline int patch_instruction_site(s32 *site, unsigned int instr)
> > +static inline int patch_instruction_site(s32 *site, ppc_inst instr)
> >  {
> > - return patch_instruction((unsigned int *)patch_site_addr(site), 
> > instr);
> > + return 

Re: linux-next: manual merge of the net-next tree with the powerpc tree

2020-04-01 Thread Stephen Rothwell
Hi all,

On Fri, 6 Mar 2020 10:21:58 +1100 Stephen Rothwell  
wrote:
>
> Today's linux-next merge of the net-next tree got a conflict in:
> 
>   fs/sysfs/group.c
> 
> between commit:
> 
>   9255782f7061 ("sysfs: Wrap __compat_only_sysfs_link_entry_to_kobj function 
> to change the symlink name")
> 
> from the powerpc tree and commit:
> 
>   303a42769c4c ("sysfs: add sysfs_group{s}_change_owner()")
> 
> from the net-next tree.
> 
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
> 
> -- 
> Cheers,
> Stephen Rothwell
> 
> diff --cc fs/sysfs/group.c
> index 1e2a096057bc,5afe0e7ff7cd..
> --- a/fs/sysfs/group.c
> +++ b/fs/sysfs/group.c
> @@@ -478,4 -457,118 +479,118 @@@ int compat_only_sysfs_link_entry_to_kob
>   kernfs_put(target);
>   return PTR_ERR_OR_ZERO(link);
>   }
>  -EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj);
>  +EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj);
> + 
> + static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
> +   const struct attribute_group *grp,
> +   struct iattr *newattrs)
> + {
> + struct kernfs_node *kn;
> + int error;
> + 
> + if (grp->attrs) {
> + struct attribute *const *attr;
> + 
> + for (attr = grp->attrs; *attr; attr++) {
> + kn = kernfs_find_and_get(grp_kn, (*attr)->name);
> + if (!kn)
> + return -ENOENT;
> + 
> + error = kernfs_setattr(kn, newattrs);
> + kernfs_put(kn);
> + if (error)
> + return error;
> + }
> + }
> + 
> + if (grp->bin_attrs) {
> + struct bin_attribute *const *bin_attr;
> + 
> + for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
> + kn = kernfs_find_and_get(grp_kn, 
> (*bin_attr)->attr.name);
> + if (!kn)
> + return -ENOENT;
> + 
> + error = kernfs_setattr(kn, newattrs);
> + kernfs_put(kn);
> + if (error)
> + return error;
> + }
> + }
> + 
> + return 0;
> + }
> + 
> + /**
> +  * sysfs_group_change_owner - change owner of an attribute group.
> +  * @kobj:   The kobject containing the group.
> +  * @grp:The attribute group.
> +  * @kuid:   new owner's kuid
> +  * @kgid:   new owner's kgid
> +  *
> +  * Returns 0 on success or error code on failure.
> +  */
> + int sysfs_group_change_owner(struct kobject *kobj,
> +  const struct attribute_group *grp, kuid_t kuid,
> +  kgid_t kgid)
> + {
> + struct kernfs_node *grp_kn;
> + int error;
> + struct iattr newattrs = {
> + .ia_valid = ATTR_UID | ATTR_GID,
> + .ia_uid = kuid,
> + .ia_gid = kgid,
> + };
> + 
> + if (!kobj->state_in_sysfs)
> + return -EINVAL;
> + 
> + if (grp->name) {
> + grp_kn = kernfs_find_and_get(kobj->sd, grp->name);
> + } else {
> + kernfs_get(kobj->sd);
> + grp_kn = kobj->sd;
> + }
> + if (!grp_kn)
> + return -ENOENT;
> + 
> + error = kernfs_setattr(grp_kn, &newattrs);
> + if (!error)
> + error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs);
> + 
> + kernfs_put(grp_kn);
> + 
> + return error;
> + }
> + EXPORT_SYMBOL_GPL(sysfs_group_change_owner);
> + 
> + /**
> +  * sysfs_groups_change_owner - change owner of a set of attribute groups.
> +  * @kobj:   The kobject containing the groups.
> +  * @groups: The attribute groups.
> +  * @kuid:   new owner's kuid
> +  * @kgid:   new owner's kgid
> +  *
> +  * Returns 0 on success or error code on failure.
> +  */
> + int sysfs_groups_change_owner(struct kobject *kobj,
> +   const struct attribute_group **groups,
> +   kuid_t kuid, kgid_t kgid)
> + {
> + int error = 0, i;
> + 
> + if (!kobj->state_in_sysfs)
> + return -EINVAL;
> + 
> + if (!groups)
> + return 0;
> + 
> + for (i = 0; groups[i]; i++) {
> + error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid);
> + if (error)
> + break;
> + }
> + 
> + return error;
> + }
> + EXPORT_SYMBOL_GPL(sysfs_groups_change_owner);

This is now a conflict between the powerpc tree and Linus' tree.

-- 
Cheers,
Stephen Rothwell


pgpbZM_67Wxtq.pgp
Description: OpenPGP digital signature


RE: [PATCH v4 01/25] powerpc/powernv: Add OPAL calls for LPC memory alloc/release

2020-04-01 Thread Alastair D'Silva
> -Original Message-
> From: Dan Williams 
> Sent: Wednesday, 1 April 2020 7:48 PM
> To: Alastair D'Silva 
> Cc: Aneesh Kumar K . V ; Oliver O'Halloran
> ; Benjamin Herrenschmidt
> ; Paul Mackerras ; Michael
> Ellerman ; Frederic Barrat ;
> Andrew Donnellan ; Arnd Bergmann
> ; Greg Kroah-Hartman ;
> Vishal Verma ; Dave Jiang
> ; Ira Weiny ; Andrew Morton
> ; Mauro Carvalho Chehab
> ; David S. Miller ;
> Rob Herring ; Anton Blanchard ;
> Krzysztof Kozlowski ; Mahesh Salgaonkar
> ; Madhavan Srinivasan
> ; Cédric Le Goater ; Anju T
> Sudhakar ; Hari Bathini
> ; Thomas Gleixner ; Greg
> Kurz ; Nicholas Piggin ; Masahiro
> Yamada ; Alexey Kardashevskiy
> ; Linux Kernel Mailing List ;
> linuxppc-dev ; linux-nvdimm  nvd...@lists.01.org>; Linux MM 
> Subject: Re: [PATCH v4 01/25] powerpc/powernv: Add OPAL calls for LPC
> memory alloc/release
> 
> On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva 
> wrote:
> >
> > Add OPAL calls for LPC memory alloc/release
> >
> 
> This seems to be referencing an existing api definition, can you include a
> pointer to the spec in case someone wanted to understand what these
> routines do? I suspect this is not allocating memory in the traditional sense 
> as
> much as it's allocating physical address space for a device to be mapped?
> 

These API calls were introduced in the following skiboot commit:
https://github.com/open-power/skiboot/commit/1a548857ce1f02f43585b326a891eed18a7b43b3

I'll add it to the description.

> 
> > Signed-off-by: Alastair D'Silva 
> > Acked-by: Andrew Donnellan 
> > Acked-by: Frederic Barrat 
> > ---
> >  arch/powerpc/include/asm/opal-api.h| 2 ++
> >  arch/powerpc/include/asm/opal.h| 2 ++
> >  arch/powerpc/platforms/powernv/opal-call.c | 2 ++
> >  3 files changed, 6 insertions(+)
> >
> > diff --git a/arch/powerpc/include/asm/opal-api.h
> > b/arch/powerpc/include/asm/opal-api.h
> > index c1f25a760eb1..9298e603001b 100644
> > --- a/arch/powerpc/include/asm/opal-api.h
> > +++ b/arch/powerpc/include/asm/opal-api.h
> > @@ -208,6 +208,8 @@
> >  #define OPAL_HANDLE_HMI2   166
> >  #defineOPAL_NX_COPROC_INIT 167
> >  #define OPAL_XIVE_GET_VP_STATE 170
> > +#define OPAL_NPU_MEM_ALLOC 171
> > +#define OPAL_NPU_MEM_RELEASE   172
> >  #define OPAL_MPIPL_UPDATE  173
> >  #define OPAL_MPIPL_REGISTER_TAG174
> >  #define OPAL_MPIPL_QUERY_TAG   175
> > diff --git a/arch/powerpc/include/asm/opal.h
> > b/arch/powerpc/include/asm/opal.h index 9986ac34b8e2..301fea46c7ca
> > 100644
> > --- a/arch/powerpc/include/asm/opal.h
> > +++ b/arch/powerpc/include/asm/opal.h
> > @@ -39,6 +39,8 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id,
> uint32_t bdfn,
> > uint64_t PE_handle);  int64_t
> > opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap,
> > uint64_t rate_phys, uint32_t size);
> > +int64_t opal_npu_mem_alloc(u64 phb_id, u32 bdfn, u64 size, __be64
> > +*bar); int64_t opal_npu_mem_release(u64 phb_id, u32 bdfn);
> >
> >  int64_t opal_console_write(int64_t term_number, __be64 *length,
> >const uint8_t *buffer); diff --git
> > a/arch/powerpc/platforms/powernv/opal-call.c
> > b/arch/powerpc/platforms/powernv/opal-call.c
> > index 5cd0f52d258f..f26e58b72c04 100644
> > --- a/arch/powerpc/platforms/powernv/opal-call.c
> > +++ b/arch/powerpc/platforms/powernv/opal-call.c
> > @@ -287,6 +287,8 @@ OPAL_CALL(opal_pci_set_pbcq_tunnel_bar,
> OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
> >  OPAL_CALL(opal_sensor_read_u64,
> OPAL_SENSOR_READ_U64);
> >  OPAL_CALL(opal_sensor_group_enable,
> OPAL_SENSOR_GROUP_ENABLE);
> >  OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT);
> > +OPAL_CALL(opal_npu_mem_alloc,  OPAL_NPU_MEM_ALLOC);
> > +OPAL_CALL(opal_npu_mem_release,
> OPAL_NPU_MEM_RELEASE);
> >  OPAL_CALL(opal_mpipl_update,   OPAL_MPIPL_UPDATE);
> >  OPAL_CALL(opal_mpipl_register_tag,
> OPAL_MPIPL_REGISTER_TAG);
> >  OPAL_CALL(opal_mpipl_query_tag,
> OPAL_MPIPL_QUERY_TAG);
> > --
> > 2.24.1
> >
> 
> 
> --
> This email has been checked for viruses by AVG.
> https://www.avg.com


-- 
Alastair D'Silva   mob: 0423 762 819
skype: alastair_dsilva msn: alast...@d-silva.org
blog: http://alastair.d-silva.orgTwitter: @EvilDeece



RE: [PATCH v4 00/25] Add support for OpenCAPI Persistent Memory devices

2020-04-01 Thread Alastair D'Silva


> -Original Message-
> From: Dan Williams 
> Sent: Wednesday, 1 April 2020 7:48 PM
> To: Alastair D'Silva 
> Cc: Aneesh Kumar K . V ; Oliver O'Halloran
> ; Benjamin Herrenschmidt
> ; Paul Mackerras ; Michael
> Ellerman ; Frederic Barrat ;
> Andrew Donnellan ; Arnd Bergmann
> ; Greg Kroah-Hartman ;
> Vishal Verma ; Dave Jiang
> ; Ira Weiny ; Andrew Morton
> ; Mauro Carvalho Chehab
> ; David S. Miller ;
> Rob Herring ; Anton Blanchard ;
> Krzysztof Kozlowski ; Mahesh Salgaonkar
> ; Madhavan Srinivasan
> ; Cédric Le Goater ; Anju T
> Sudhakar ; Hari Bathini
> ; Thomas Gleixner ; Greg
> Kurz ; Nicholas Piggin ; Masahiro
> Yamada ; Alexey Kardashevskiy
> ; Linux Kernel Mailing List ;
> linuxppc-dev ; linux-nvdimm  nvd...@lists.01.org>; Linux MM 
> Subject: Re: [PATCH v4 00/25] Add support for OpenCAPI Persistent Memory
> devices
> 
> On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva 
> wrote:
> >
> > This series adds support for OpenCAPI Persistent Memory devices on
> > bare metal (arch/powernv), exposing them as nvdimms so that we can
> > make use of the existing infrastructure. There already exists a driver
> > for the same devices abstracted through PowerVM (arch/pseries):
> > arch/powerpc/platforms/pseries/papr_scm.c
> >
> > These devices are connected via OpenCAPI, and present as LPC (lowest
> coherence point) memory to the system, practically, that means that
> memory on these cards could be treated as conventional, cache-coherent
> memory.
> >
> > Since the devices are connected via OpenCAPI, they are not enumerated
> via ACPI. Instead, OpenCAPI links present as pseudo-PCI bridges, with
> devices below them.
> >
> > This series introduces a driver that exposes the memory on these cards as
> nvdimms, with each card getting its own bus. This is somewhat complicated
> by the fact that the cards do not have out of band persistent storage for
> metadata, so 1 SECTION_SIZE's (see SPARSEMEM) worth of storage is carved
> out of the top of the card storage to implement the ndctl_config_* calls.
> 
> Is it really tied to section-size? Can't that change based on the configured
> page-size? It's not clear to me why that would be the choice, but I'll dig 
> into
> the implementation.
> 

I had tried using PAGE_SIZE, but ran into problems carving off just 1 page and 
handing it to the kernel, while leaving the rest as pmem. That was a while ago 
though, so maybe I should retry it.

> > The driver is not responsible for configuring the NPU (NVLink Processing
> Unit) BARs to map the LPC memory from the card into the system's physical
> address space, instead, it requests this to be done via OPAL calls (typically
> implemented by Skiboot).
> 
> Are OPAL calls similar to ACPI DSMs? I.e. methods for the OS to invoke
> platform firmware services? What's Skiboot?
> 

Yes, OPAL is the interface to firmware for POWER. Skiboot is the open-source 
(and only) implementation of OPAL.

> >
> > The series is structured as follows:
> >  - Required infrastructure changes & cleanup
> >  - A minimal driver implementation
> >  - Implementing additional features within the driver
> 
> Thanks for the intro and the changelog!
> 
> >
> > Changelog:
> > V4:
> >   - Rebase on next-20200320
> 
> Do you have dependencies on other material that's in -next? Otherwise -
> next is only a viable development baseline if you are going to merge through
> Andrew's tree.
> 
> >   - Bump copyright to 2020
> >   - Ensure all uapi headers use C89 compatible comments (missed
> ocxlpmem.h)
> >   - Move the driver back to drivers/nvdimm/ocxl, after confirmation
> > that this location is desirable
> >   - Rename ocxl.c to ocxlpmem.c (+ support files)
> >   - Rename all ocxl_pmem to ocxlpmem
> >   - Address checkpatch --strict issues
> >   - "powerpc/powernv: Add OPAL calls for LPC memory alloc/release"
> > - Pass base address as __be64
> >   - "ocxl: Tally up the LPC memory on a link & allow it to be mapped"
> > - Address checkpatch spacing warnings
> > - Reword blurb
> > - Reword size description for ocxl_link_add_lpc_mem()
> > - Add an early exit in ocxl_link_lpc_release() to avoid triggering
> >   bogus warnings if called after ocxl_link_lpc_map() fails
> >   - "powerpc/powernv: Add OPAL calls for LPC memory alloc/release"
> > - Reword blurb
> >   - "powerpc/powernv: Map & release OpenCAPI LPC memory"
> > - Reword blurb
> >   - Move minor_idr init from file_init() to ocxlpmem_init() (fixes runtime
> error
> > in "nvdimm: Add driver for OpenCAPI Persistent Memory")
> >   - Wrap long lines
> >   - "nvdimm: Add driver for OpenCAPI Storage Class Memory"
> > - Remove '+ 1' workaround from serial number->cookie assignment
> > - Drop out of memory message for ocxlpmem in probe()
> > - Fix leaks of ocxlpmem & ocxlpmem->ocxl_fn in probe()
> > - remove struct ocxlpmem_function0, it didn't value add
> > - factor out err_unregistered label in probe
> 

Re: [PATCH v2 1/1] vfio-pci/nvlink2: Allow fallback to ibm,mmio-atsd[0]

2020-04-01 Thread Alex Williamson
On Tue, 31 Mar 2020 15:12:46 +1100
Sam Bobroff  wrote:

> Older versions of skiboot only provide a single value in the device
> tree property "ibm,mmio-atsd", even when multiple Address Translation
> Shoot Down (ATSD) registers are present. This prevents NVLink2 devices
> (other than the first) from being used with vfio-pci because vfio-pci
> expects to be able to assign a dedicated ATSD register to each NVLink2
> device.
> 
> However, ATSD registers can be shared among devices. This change
> allows vfio-pci to fall back to sharing the register at index 0 if
> necessary.
> 
> Fixes: 7f92891778df ("vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] 
> subdriver")
> Signed-off-by: Sam Bobroff 
> ---
> Patch set v2:
> Patch 1/1: vfio-pci/nvlink2: Allow fallback to ibm,mmio-atsd[0]
> - Removed unnecessary warning.
> - Added Fixes tag.
> 
> Patch set v1:
> Patch 1/1: vfio-pci/nvlink2: Allow fallback to ibm,mmio-atsd[0]
> 
>  drivers/vfio/pci/vfio_pci_nvlink2.c | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)

Applied to vfio next branch for v5.7 with Alexey's review.  Thanks,

Alex

> diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c 
> b/drivers/vfio/pci/vfio_pci_nvlink2.c
> index f2983f0f84be..ae2af590e501 100644
> --- a/drivers/vfio/pci/vfio_pci_nvlink2.c
> +++ b/drivers/vfio/pci/vfio_pci_nvlink2.c
> @@ -420,8 +420,14 @@ int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
>  
>   if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
>   &mmio_atsd)) {
> - dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
> - mmio_atsd = 0;
> + if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0,
> + &mmio_atsd)) {
> + dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
> + mmio_atsd = 0;
> + } else {
> + dev_warn(&vdev->pdev->dev,
> +  "Using fallback ibm,mmio-atsd[0] for ATSD.\n");
> + }
>   }
>  
>   if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {



[PATCH v5 6/9] crypto/nx: Make enable code generic to add new GZIP compression type

2020-04-01 Thread Haren Myneni
(Sorry for reposting; the version number was missing from the subject.)

Make setup and enable code generic to support new GZIP compression type.
Changed nx842 reference to nx and moved some code to new functions.
Functionality is not changed except sparse warning fix - setting NULL
instead of 0 for per_cpu send window in nx_delete_coprocs().

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 drivers/crypto/nx/nx-common-powernv.c | 161 +-
 1 file changed, 101 insertions(+), 60 deletions(-)

diff --git a/drivers/crypto/nx/nx-common-powernv.c 
b/drivers/crypto/nx/nx-common-powernv.c
index f42881f..82dfa60 100644
--- a/drivers/crypto/nx/nx-common-powernv.c
+++ b/drivers/crypto/nx/nx-common-powernv.c
@@ -40,9 +40,9 @@ struct nx842_workmem {
char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
 } __packed __aligned(WORKMEM_ALIGN);
 
-struct nx842_coproc {
+struct nx_coproc {
unsigned int chip_id;
-   unsigned int ct;
+   unsigned int ct;/* Can be 842 or GZIP high/normal*/
unsigned int ci;/* Coprocessor instance, used with icswx */
struct {
struct vas_window *rxwin;
@@ -58,9 +58,15 @@ struct nx842_coproc {
 static DEFINE_PER_CPU(struct vas_window *, cpu_txwin);
 
 /* no cpu hotplug on powernv, so this list never changes after init */
-static LIST_HEAD(nx842_coprocs);
+static LIST_HEAD(nx_coprocs);
 static unsigned int nx842_ct;  /* used in icswx function */
 
+/*
+ * Using same values as in skiboot or coprocessor type representing
+ * in NX workbook.
+ */
+#define NX_CT_842  (3)
+
 static int (*nx842_powernv_exec)(const unsigned char *in,
unsigned int inlen, unsigned char *out,
unsigned int *outlenp, void *workmem, int fc);
@@ -666,15 +672,15 @@ static int nx842_powernv_decompress(const unsigned char 
*in, unsigned int inlen,
  wmem, CCW_FC_842_DECOMP_CRC);
 }
 
-static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
+static inline void nx_add_coprocs_list(struct nx_coproc *coproc,
int chipid)
 {
coproc->chip_id = chipid;
INIT_LIST_HEAD(>list);
-   list_add(>list, _coprocs);
+   list_add(>list, _coprocs);
 }
 
-static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
+static struct vas_window *nx_alloc_txwin(struct nx_coproc *coproc)
 {
struct vas_window *txwin = NULL;
struct vas_tx_win_attr txattr;
@@ -704,9 +710,9 @@ static struct vas_window *nx842_alloc_txwin(struct 
nx842_coproc *coproc)
  * cpu_txwin is used in copy/paste operation for each compression /
  * decompression request.
  */
-static int nx842_open_percpu_txwins(void)
+static int nx_open_percpu_txwins(void)
 {
-   struct nx842_coproc *coproc, *n;
+   struct nx_coproc *coproc, *n;
unsigned int i, chip_id;
 
for_each_possible_cpu(i) {
@@ -714,17 +720,18 @@ static int nx842_open_percpu_txwins(void)
 
chip_id = cpu_to_chip_id(i);
 
-   list_for_each_entry_safe(coproc, n, _coprocs, list) {
+   list_for_each_entry_safe(coproc, n, _coprocs, list) {
/*
 * Kernel requests use only high priority FIFOs. So
 * open send windows for these FIFOs.
+* GZIP is not supported in kernel right now.
 */
 
if (coproc->ct != VAS_COP_TYPE_842_HIPRI)
continue;
 
if (coproc->chip_id == chip_id) {
-   txwin = nx842_alloc_txwin(coproc);
+   txwin = nx_alloc_txwin(coproc);
if (IS_ERR(txwin))
return PTR_ERR(txwin);
 
@@ -743,13 +750,28 @@ static int nx842_open_percpu_txwins(void)
return 0;
 }
 
+static int __init nx_set_ct(struct nx_coproc *coproc, const char *priority,
+   int high, int normal)
+{
+   if (!strcmp(priority, "High"))
+   coproc->ct = high;
+   else if (!strcmp(priority, "Normal"))
+   coproc->ct = normal;
+   else {
+   pr_err("Invalid RxFIFO priority value\n");
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
-   int vasid, int *ct)
+   int vasid, int type, int *ct)
 {
struct vas_window *rxwin = NULL;
struct vas_rx_win_attr rxattr;
-   struct nx842_coproc *coproc;
u32 lpid, pid, tid, fifo_size;
+   struct nx_coproc *coproc;
u64 rx_fifo;
const char *priority;
int ret;
@@ -794,15 +816,12 @@ static int __init vas_cfg_coproc_info(struct device_node 
*dn, int 

[PATCH v9 06/13] powerpc/vas: Take reference to PID and mm for user space windows

2020-04-01 Thread Haren Myneni
(sorry reposting. version string missed)

When process opens a window, its pid and tgid will be saved in the
vas_window struct. This window will be closed when the process exits.
The kernel handles NX faults by updating the CSB, or by sending a SEGV
signal to the pid of the process if the user space CSB address is invalid.

In multi-thread applications, a window can be opened by a child thread,
but it will not be closed when this thread exits. It is expected that
the parent will clean up all resources including NX windows opened by
child threads. A child thread can send NX requests using this window
and could be killed before completion is reported. If the pid assigned
to this thread is reused while requests are pending, a failure SEGV
would be directed to the wrong place.

To prevent reusing the pid, take references to pid and mm when the window
is opened and release them when the window is closed. Then if the child
thread is not running, the SEGV signal will be sent to the thread group
leader (tgid).

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-debug.c  |  2 +-
 arch/powerpc/platforms/powernv/vas-window.c | 53 ++---
 arch/powerpc/platforms/powernv/vas.h|  9 -
 3 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-debug.c 
b/arch/powerpc/platforms/powernv/vas-debug.c
index 09e63df..ef9a717 100644
--- a/arch/powerpc/platforms/powernv/vas-debug.c
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -38,7 +38,7 @@ static int info_show(struct seq_file *s, void *private)
 
seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
window->tx_win ? "Send" : "Receive");
-   seq_printf(s, "Pid : %d\n", window->pid);
+   seq_printf(s, "Pid : %d\n", vas_window_pid(window));
 
 unlock:
mutex_unlock(_mutex);
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index dc46bf6..7054cd4 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -12,6 +12,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include "vas.h"
@@ -876,8 +878,6 @@ struct vas_window *vas_rx_win_open(int vasid, enum 
vas_cop_type cop,
rxwin->user_win = rxattr->user_win;
rxwin->cop = cop;
rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
-   if (rxattr->user_win)
-   rxwin->pid = task_pid_vnr(current);
 
init_winctx_for_rxwin(rxwin, rxattr, );
init_winctx_regs(rxwin, );
@@ -1027,7 +1027,6 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
txwin->tx_win = 1;
txwin->rxwin = rxwin;
txwin->nx_win = txwin->rxwin->nx_win;
-   txwin->pid = attr->pid;
txwin->user_win = attr->user_win;
txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
 
@@ -1059,8 +1058,43 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
goto free_window;
}
 
-   set_vinst_win(vinst, txwin);
+   if (txwin->user_win) {
+   /*
+* Window opened by a child thread may not be closed when
+* it exits. So take reference to its pid and release it
+* when the window is freed by the parent thread.
+* Acquire a reference to the task's pid to make sure
+* pid will not be re-used - needed only for multithread
+* applications.
+*/
+   txwin->pid = get_task_pid(current, PIDTYPE_PID);
+   /*
+* Acquire a reference to the task's mm.
+*/
+   txwin->mm = get_task_mm(current);
 
+   if (!txwin->mm) {
+   put_pid(txwin->pid);
+   pr_err("VAS: pid(%d): mm_struct is not found\n",
+   current->pid);
+   rc = -EPERM;
+   goto free_window;
+   }
+
+   mmgrab(txwin->mm);
+   mmput(txwin->mm);
+   mm_context_add_copro(txwin->mm);
+   /*
+* Process closes window during exit. In the case of
+* multithread application, the child thread can open
+* window and can exit without closing it. Expects parent
+* thread to use and close the window. So do not need
+* to take pid reference for parent thread.
+*/
+   txwin->tgid = find_get_pid(task_tgid_vnr(current));
+   }
+
+   set_vinst_win(vinst, txwin);
return txwin;
 
 free_window:
@@ -1257,8 +1291,17 @@ int vas_win_close(struct vas_window *window)
poll_window_castout(window);
 
/* if send window, drop reference to matching receive window */
-   if (window->tx_win)
+   if (window->tx_win) {

[PATCH v5 9/9] Documentation/powerpc: VAS API

2020-04-01 Thread Haren Myneni


Power9 introduced Virtual Accelerator Switchboard (VAS) which allows
user space to communicate with Nest Accelerator (NX) directly. But
kernel has to establish channel to NX for user space. This document
describes user space API that application can use to establish
communication channel.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 Documentation/powerpc/index.rst   |   1 +
 Documentation/powerpc/vas-api.rst | 292 ++
 2 files changed, 293 insertions(+)
 create mode 100644 Documentation/powerpc/vas-api.rst

diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst
index 0d45f0f..afe2d5e 100644
--- a/Documentation/powerpc/index.rst
+++ b/Documentation/powerpc/index.rst
@@ -30,6 +30,7 @@ powerpc
 syscall64-abi
 transactional_memory
 ultravisor
+vas-api
 
 .. only::  subproject and html
 
diff --git a/Documentation/powerpc/vas-api.rst 
b/Documentation/powerpc/vas-api.rst
new file mode 100644
index 000..1217c2f
--- /dev/null
+++ b/Documentation/powerpc/vas-api.rst
@@ -0,0 +1,292 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. _VAS-API:
+
+===================================================
+Virtual Accelerator Switchboard (VAS) userspace API
+===================================================
+
+Introduction
+============
+
+The Power9 processor introduced the Virtual Accelerator Switchboard (VAS),
+which allows both userspace and the kernel to communicate with the
+co-processor (hardware accelerator) referred to as the Nest Accelerator
+(NX). The NX unit comprises one or more hardware engines or co-processor
+types such as 842 compression, GZIP compression and encryption. On Power9,
+userspace applications have access only to the GZIP compression engine,
+which supports the ZLIB and GZIP compression algorithms in hardware.
+
+To communicate with NX, kernel has to establish a channel or window and
+then requests can be submitted directly without kernel involvement.
+Requests to the GZIP engine must be formatted as a co-processor Request
+Block (CRB) and these CRBs must be submitted to the NX using COPY/PASTE
+instructions to paste the CRB to hardware address that is associated with
+the engine's request queue.
+
+The GZIP engine provides two priority levels of requests: Normal and
+High. Only Normal requests are supported from userspace right now.
+
+This document explains userspace API that is used to interact with
+kernel to setup channel / window which can be used to send compression
+requests directly to NX accelerator.
+
+
+Overview
+========
+
+Application access to the GZIP engine is provided through
+/dev/crypto/nx-gzip device node implemented by the VAS/NX device driver.
+An application must open the /dev/crypto/nx-gzip device to obtain a file
+descriptor (fd). Then should issue VAS_TX_WIN_OPEN ioctl with this fd to
+establish connection to the engine. It means send window is opened on GZIP
+engine for this process. Once a connection is established, the application
+should use the mmap() system call to map the hardware address of engine's
+request queue into the application's virtual address space.
+
+The application can then submit one or more requests to the engine by
+using copy/paste instructions and pasting the CRBs to the virtual address
+(aka paste_address) returned by mmap(). User space can close the
+established connection or send window by closing the file descriptor
+(close(fd)) or upon process exit.
+
+Note that applications can send several requests with the same window or
+can establish multiple windows, but one window for each file descriptor.
+
+Following sections provide additional details and references about the
+individual steps.
+
+NX-GZIP Device Node
+===================
+
+There is one /dev/crypto/nx-gzip node in the system and it provides
+access to all GZIP engines in the system. The only valid operations on
+/dev/crypto/nx-gzip are:
+
+   * open() the device for read and write.
+   * issue VAS_TX_WIN_OPEN ioctl
+   * mmap() the engine's request queue into application's virtual
+ address space (i.e. get a paste_address for the co-processor
+ engine).
+   * close the device node.
+
+Other file operations on this device node are undefined.
+
+Note that the copy and paste operations go directly to the hardware and
+do not go through this device. Refer COPY/PASTE document for more
+details.
+
+Although a system may have several instances of the NX co-processor
+engines (typically, one per P9 chip) there is just one
+/dev/crypto/nx-gzip device node in the system. When the nx-gzip device
+node is opened, the kernel opens a send window on a suitable instance of
+the NX accelerator. It finds the CPU on which the user process is executing
+and determines the NX instance for the chip to which this CPU
+belongs.
+
+Applications may choose a specific instance of the NX co-processor using
+the vas_id field in the VAS_TX_WIN_OPEN ioctl as detailed below.
+
+A userspace 

[PATCH v5 8/9] crypto/nx: Remove 'pid' in vas_tx_win_attr struct

2020-04-01 Thread Haren Myneni


When window is opened, pid reference is taken for user space
windows. Not needed for kernel windows. So remove 'pid' in
vas_tx_win_attr struct.

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 arch/powerpc/include/asm/vas.h| 1 -
 drivers/crypto/nx/nx-common-powernv.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index e064953..994db6f 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -86,7 +86,6 @@ struct vas_tx_win_attr {
int wcreds_max;
int lpid;
int pidr;   /* hardware PID (from SPRN_PID) */
-   int pid;/* linux process id */
int pswid;
int rsvd_txbuf_count;
int tc_mode;
diff --git a/drivers/crypto/nx/nx-common-powernv.c 
b/drivers/crypto/nx/nx-common-powernv.c
index f570691..38333e4 100644
--- a/drivers/crypto/nx/nx-common-powernv.c
+++ b/drivers/crypto/nx/nx-common-powernv.c
@@ -692,7 +692,6 @@ static struct vas_window *nx_alloc_txwin(struct nx_coproc 
*coproc)
 */
vas_init_tx_win_attr(, coproc->ct);
txattr.lpid = 0;/* lpid is 0 for kernel requests */
-   txattr.pid = 0; /* pid is 0 for kernel requests */
 
/*
 * Open a VAS send window which is used to send request to NX.
-- 
1.8.3.1





[PATCH v5 7/9] crypto/nx: Enable and setup GZIP compression type

2020-04-01 Thread Haren Myneni


Changes to probe GZIP device-tree nodes, open RX windows and setup
GZIP compression type. No plans to provide GZIP usage in kernel right
now, but this patch enables GZIP for user space usage.

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 drivers/crypto/nx/nx-common-powernv.c | 43 ++-
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/drivers/crypto/nx/nx-common-powernv.c 
b/drivers/crypto/nx/nx-common-powernv.c
index 82dfa60..f570691 100644
--- a/drivers/crypto/nx/nx-common-powernv.c
+++ b/drivers/crypto/nx/nx-common-powernv.c
@@ -65,6 +65,7 @@ struct nx_coproc {
  * Using same values as in skiboot or coprocessor type representing
  * in NX workbook.
  */
+#define NX_CT_GZIP (2) /* on P9 and later */
 #define NX_CT_842  (3)
 
 static int (*nx842_powernv_exec)(const unsigned char *in,
@@ -819,6 +820,9 @@ static int __init vas_cfg_coproc_info(struct device_node 
*dn, int chip_id,
if (type == NX_CT_842)
ret = nx_set_ct(coproc, priority, VAS_COP_TYPE_842_HIPRI,
VAS_COP_TYPE_842);
+   else if (type == NX_CT_GZIP)
+   ret = nx_set_ct(coproc, priority, VAS_COP_TYPE_GZIP_HIPRI,
+   VAS_COP_TYPE_GZIP);
 
if (ret)
goto err_out;
@@ -867,12 +871,16 @@ static int __init vas_cfg_coproc_info(struct device_node 
*dn, int chip_id,
return ret;
 }
 
-static int __init nx_coproc_init(int chip_id, int ct_842)
+static int __init nx_coproc_init(int chip_id, int ct_842, int ct_gzip)
 {
int ret = 0;
 
if (opal_check_token(OPAL_NX_COPROC_INIT)) {
ret = opal_nx_coproc_init(chip_id, ct_842);
+
+   if (!ret)
+   ret = opal_nx_coproc_init(chip_id, ct_gzip);
+
if (ret) {
ret = opal_error_code(ret);
pr_err("Failed to initialize NX for chip(%d): %d\n",
@@ -902,8 +910,8 @@ static int __init find_nx_device_tree(struct device_node 
*dn, int chip_id,
 static int __init nx_powernv_probe_vas(struct device_node *pn)
 {
int chip_id, vasid, ret = 0;
+   int ct_842 = 0, ct_gzip = 0;
struct device_node *dn;
-   int ct_842 = 0;
 
chip_id = of_get_ibm_chip_id(pn);
if (chip_id < 0) {
@@ -920,19 +928,24 @@ static int __init nx_powernv_probe_vas(struct device_node 
*pn)
for_each_child_of_node(pn, dn) {
ret = find_nx_device_tree(dn, chip_id, vasid, NX_CT_842,
"ibm,p9-nx-842", _842);
+
+   if (!ret)
+   ret = find_nx_device_tree(dn, chip_id, vasid,
+   NX_CT_GZIP, "ibm,p9-nx-gzip", _gzip);
+
if (ret)
return ret;
}
 
-   if (!ct_842) {
-   pr_err("NX842 FIFO nodes are missing\n");
+   if (!ct_842 || !ct_gzip) {
+   pr_err("NX FIFO nodes are missing\n");
return -EINVAL;
}
 
/*
 * Initialize NX instance for both high and normal priority FIFOs.
 */
-   ret = nx_coproc_init(chip_id, ct_842);
+   ret = nx_coproc_init(chip_id, ct_842, ct_gzip);
 
return ret;
 }
@@ -1072,10 +1085,19 @@ static __init int nx_compress_powernv_init(void)
nx842_powernv_exec = nx842_exec_icswx;
} else {
/*
+* Register VAS user space API for NX GZIP so
+* that user space can use GZIP engine.
+* 842 compression is supported only in kernel.
+*/
+   ret = vas_register_coproc_api(THIS_MODULE);
+
+   /*
 * GZIP is not supported in kernel right now.
 * So open tx windows only for 842.
 */
-   ret = nx_open_percpu_txwins();
+   if (!ret)
+   ret = nx_open_percpu_txwins();
+
if (ret) {
nx_delete_coprocs();
return ret;
@@ -1096,6 +1118,15 @@ static __init int nx_compress_powernv_init(void)
 
 static void __exit nx_compress_powernv_exit(void)
 {
+   /*
+* GZIP engine is supported only in power9 or later and nx842_ct
+* is used on power8 (icswx).
+* VAS API for NX GZIP is registered during init for user space
+* use. So delete this API use for GZIP engine.
+*/
+   if (!nx842_ct)
+   vas_unregister_coproc_api();
+
crypto_unregister_alg(_powernv_alg);
 
nx_delete_coprocs();
-- 
1.8.3.1





[PATCH 6/9] crypto/nx: Make enable code generic to add new GZIP compression type

2020-04-01 Thread Haren Myneni


Make setup and enable code generic to support new GZIP compression type.
Changed nx842 reference to nx and moved some code to new functions.
Functionality is not changed except sparse warning fix - setting NULL
instead of 0 for per_cpu send window in nx_delete_coprocs().

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 drivers/crypto/nx/nx-common-powernv.c | 161 +-
 1 file changed, 101 insertions(+), 60 deletions(-)

diff --git a/drivers/crypto/nx/nx-common-powernv.c 
b/drivers/crypto/nx/nx-common-powernv.c
index f42881f..82dfa60 100644
--- a/drivers/crypto/nx/nx-common-powernv.c
+++ b/drivers/crypto/nx/nx-common-powernv.c
@@ -40,9 +40,9 @@ struct nx842_workmem {
char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
 } __packed __aligned(WORKMEM_ALIGN);
 
-struct nx842_coproc {
+struct nx_coproc {
unsigned int chip_id;
-   unsigned int ct;
+   unsigned int ct;/* Can be 842 or GZIP high/normal*/
unsigned int ci;/* Coprocessor instance, used with icswx */
struct {
struct vas_window *rxwin;
@@ -58,9 +58,15 @@ struct nx842_coproc {
 static DEFINE_PER_CPU(struct vas_window *, cpu_txwin);
 
 /* no cpu hotplug on powernv, so this list never changes after init */
-static LIST_HEAD(nx842_coprocs);
+static LIST_HEAD(nx_coprocs);
 static unsigned int nx842_ct;  /* used in icswx function */
 
+/*
+ * Using same values as in skiboot or coprocessor type representing
+ * in NX workbook.
+ */
+#define NX_CT_842  (3)
+
 static int (*nx842_powernv_exec)(const unsigned char *in,
unsigned int inlen, unsigned char *out,
unsigned int *outlenp, void *workmem, int fc);
@@ -666,15 +672,15 @@ static int nx842_powernv_decompress(const unsigned char 
*in, unsigned int inlen,
  wmem, CCW_FC_842_DECOMP_CRC);
 }
 
-static inline void nx842_add_coprocs_list(struct nx842_coproc *coproc,
+static inline void nx_add_coprocs_list(struct nx_coproc *coproc,
int chipid)
 {
coproc->chip_id = chipid;
INIT_LIST_HEAD(>list);
-   list_add(>list, _coprocs);
+   list_add(>list, _coprocs);
 }
 
-static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
+static struct vas_window *nx_alloc_txwin(struct nx_coproc *coproc)
 {
struct vas_window *txwin = NULL;
struct vas_tx_win_attr txattr;
@@ -704,9 +710,9 @@ static struct vas_window *nx842_alloc_txwin(struct 
nx842_coproc *coproc)
  * cpu_txwin is used in copy/paste operation for each compression /
  * decompression request.
  */
-static int nx842_open_percpu_txwins(void)
+static int nx_open_percpu_txwins(void)
 {
-   struct nx842_coproc *coproc, *n;
+   struct nx_coproc *coproc, *n;
unsigned int i, chip_id;
 
for_each_possible_cpu(i) {
@@ -714,17 +720,18 @@ static int nx842_open_percpu_txwins(void)
 
chip_id = cpu_to_chip_id(i);
 
-   list_for_each_entry_safe(coproc, n, _coprocs, list) {
+   list_for_each_entry_safe(coproc, n, _coprocs, list) {
/*
 * Kernel requests use only high priority FIFOs. So
 * open send windows for these FIFOs.
+* GZIP is not supported in kernel right now.
 */
 
if (coproc->ct != VAS_COP_TYPE_842_HIPRI)
continue;
 
if (coproc->chip_id == chip_id) {
-   txwin = nx842_alloc_txwin(coproc);
+   txwin = nx_alloc_txwin(coproc);
if (IS_ERR(txwin))
return PTR_ERR(txwin);
 
@@ -743,13 +750,28 @@ static int nx842_open_percpu_txwins(void)
return 0;
 }
 
+static int __init nx_set_ct(struct nx_coproc *coproc, const char *priority,
+   int high, int normal)
+{
+   if (!strcmp(priority, "High"))
+   coproc->ct = high;
+   else if (!strcmp(priority, "Normal"))
+   coproc->ct = normal;
+   else {
+   pr_err("Invalid RxFIFO priority value\n");
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
-   int vasid, int *ct)
+   int vasid, int type, int *ct)
 {
struct vas_window *rxwin = NULL;
struct vas_rx_win_attr rxattr;
-   struct nx842_coproc *coproc;
u32 lpid, pid, tid, fifo_size;
+   struct nx_coproc *coproc;
u64 rx_fifo;
const char *priority;
int ret;
@@ -794,15 +816,12 @@ static int __init vas_cfg_coproc_info(struct device_node 
*dn, int chip_id,
if (!coproc)
return -ENOMEM;
 

[PATCH v5 5/9] crypto/nx: Rename nx-842-powernv file name to nx-common-powernv

2020-04-01 Thread Haren Myneni


Rename nx-842-powernv.c to nx-common-powernv.c to add code for setup
and enable new GZIP compression type. The actual functionality is not
changed in this patch.

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 drivers/crypto/nx/Makefile|2 +-
 drivers/crypto/nx/nx-842-powernv.c| 1062 -
 drivers/crypto/nx/nx-common-powernv.c | 1062 +
 3 files changed, 1063 insertions(+), 1063 deletions(-)
 delete mode 100644 drivers/crypto/nx/nx-842-powernv.c
 create mode 100644 drivers/crypto/nx/nx-common-powernv.c

diff --git a/drivers/crypto/nx/Makefile b/drivers/crypto/nx/Makefile
index 015155d..bc89a20 100644
--- a/drivers/crypto/nx/Makefile
+++ b/drivers/crypto/nx/Makefile
@@ -15,4 +15,4 @@ obj-$(CONFIG_CRYPTO_DEV_NX_COMPRESS_PSERIES) += 
nx-compress-pseries.o nx-compres
 obj-$(CONFIG_CRYPTO_DEV_NX_COMPRESS_POWERNV) += nx-compress-powernv.o 
nx-compress.o
 nx-compress-objs := nx-842.o
 nx-compress-pseries-objs := nx-842-pseries.o
-nx-compress-powernv-objs := nx-842-powernv.o
+nx-compress-powernv-objs := nx-common-powernv.o
diff --git a/drivers/crypto/nx/nx-842-powernv.c 
b/drivers/crypto/nx/nx-842-powernv.c
deleted file mode 100644
index 8e63326..000
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ /dev/null
@@ -1,1062 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Driver for IBM PowerNV 842 compression accelerator
- *
- * Copyright (C) 2015 Dan Streetman, IBM Corp
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include "nx-842.h"
-
-#include 
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Dan Streetman ");
-MODULE_DESCRIPTION("842 H/W Compression driver for IBM PowerNV processors");
-MODULE_ALIAS_CRYPTO("842");
-MODULE_ALIAS_CRYPTO("842-nx");
-
-#define WORKMEM_ALIGN  (CRB_ALIGN)
-#define CSB_WAIT_MAX   (5000) /* ms */
-#define VAS_RETRIES(10)
-
-struct nx842_workmem {
-   /* Below fields must be properly aligned */
-   struct coprocessor_request_block crb; /* CRB_ALIGN align */
-   struct data_descriptor_entry ddl_in[DDL_LEN_MAX]; /* DDE_ALIGN align */
-   struct data_descriptor_entry ddl_out[DDL_LEN_MAX]; /* DDE_ALIGN align */
-   /* Above fields must be properly aligned */
-
-   ktime_t start;
-
-   char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
-} __packed __aligned(WORKMEM_ALIGN);
-
-struct nx842_coproc {
-   unsigned int chip_id;
-   unsigned int ct;
-   unsigned int ci;/* Coprocessor instance, used with icswx */
-   struct {
-   struct vas_window *rxwin;
-   int id;
-   } vas;
-   struct list_head list;
-};
-
-/*
- * Send the request to NX engine on the chip for the corresponding CPU
- * where the process is executing. Use with VAS function.
- */
-static DEFINE_PER_CPU(struct vas_window *, cpu_txwin);
-
-/* no cpu hotplug on powernv, so this list never changes after init */
-static LIST_HEAD(nx842_coprocs);
-static unsigned int nx842_ct;  /* used in icswx function */
-
-static int (*nx842_powernv_exec)(const unsigned char *in,
-   unsigned int inlen, unsigned char *out,
-   unsigned int *outlenp, void *workmem, int fc);
-
-/**
- * setup_indirect_dde - Setup an indirect DDE
- *
- * The DDE is setup with the the DDE count, byte count, and address of
- * first direct DDE in the list.
- */
-static void setup_indirect_dde(struct data_descriptor_entry *dde,
-  struct data_descriptor_entry *ddl,
-  unsigned int dde_count, unsigned int byte_count)
-{
-   dde->flags = 0;
-   dde->count = dde_count;
-   dde->index = 0;
-   dde->length = cpu_to_be32(byte_count);
-   dde->address = cpu_to_be64(nx842_get_pa(ddl));
-}
-
-/**
- * setup_direct_dde - Setup single DDE from buffer
- *
- * The DDE is setup with the buffer and length.  The buffer must be properly
- * aligned.  The used length is returned.
- * Returns:
- *   NSuccessfully set up DDE with N bytes
- */
-static unsigned int setup_direct_dde(struct data_descriptor_entry *dde,
-unsigned long pa, unsigned int len)
-{
-   unsigned int l = min_t(unsigned int, len, LEN_ON_PAGE(pa));
-
-   dde->flags = 0;
-   dde->count = 0;
-   dde->index = 0;
-   dde->length = cpu_to_be32(l);
-   dde->address = cpu_to_be64(pa);
-
-   return l;
-}
-
-/**
- * setup_ddl - Setup DDL from buffer
- *
- * Returns:
- *   0 Successfully set up DDL
- */
-static int setup_ddl(struct data_descriptor_entry *dde,
-struct data_descriptor_entry *ddl,
-unsigned char *buf, unsigned int len,
-bool in)
-{
-   unsigned long pa = nx842_get_pa(buf);
-   int i, ret, total_len = len;
-
-   if (!IS_ALIGNED(pa, DDE_BUFFER_ALIGN)) {
-   pr_debug("%s 

[PATCH v5 4/9] crypto/nx: Initialize coproc entry with kzalloc

2020-04-01 Thread Haren Myneni


coproc entry is initialized during NX probe on power9, but not on P8.
nx842_delete_coprocs() is used for both and frees receive window if it
is allocated. Getting crash for rmmod on P8 since coproc->vas.rxwin
is not initialized.

This patch replaces kmalloc with kzalloc in nx842_powernv_probe()

Signed-off-by: Haren Myneni 
Acked-by: Herbert Xu 
---
 drivers/crypto/nx/nx-842-powernv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/nx/nx-842-powernv.c 
b/drivers/crypto/nx/nx-842-powernv.c
index c037a24..8e63326 100644
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ b/drivers/crypto/nx/nx-842-powernv.c
@@ -922,7 +922,7 @@ static int __init nx842_powernv_probe(struct device_node 
*dn)
return -EINVAL;
}
 
-   coproc = kmalloc(sizeof(*coproc), GFP_KERNEL);
+   coproc = kzalloc(sizeof(*coproc), GFP_KERNEL);
if (!coproc)
return -ENOMEM;
 
-- 
1.8.3.1





[PATCH v5 3/9] powerpc/vas: Add VAS user space API

2020-04-01 Thread Haren Myneni


On power9, user space can send GZIP compression requests directly to NX
once kernel establishes NX channel / window with VAS. This patch provides
user space API which allows user space to establish channel using open
VAS_TX_WIN_OPEN ioctl, mmap and close operations.

Each window corresponds to a file descriptor and an application can open
multiple windows. After the device is opened, the application uses the
VAS_TX_WIN_OPEN ioctl to open a window on a specific VAS instance and the
mmap() system call to map the hardware address of the engine's request
queue into the application's virtual address space.

The application can then submit one or more requests to the
engine by using the copy/paste instructions and pasting the CRBs to
the virtual address (aka paste_address) returned by mmap().

Only the NX GZIP coprocessor type is supported right now; GZIP engine
access is provided via the /dev/crypto/nx-gzip device node.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/include/asm/vas.h  |  11 ++
 arch/powerpc/platforms/powernv/Makefile |   2 +-
 arch/powerpc/platforms/powernv/vas-api.c| 257 
 arch/powerpc/platforms/powernv/vas-window.c |   6 +-
 arch/powerpc/platforms/powernv/vas.h|   2 +
 5 files changed, 274 insertions(+), 4 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/vas-api.c

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index f93e6b0..e064953 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -163,4 +163,15 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
  */
 int vas_paste_crb(struct vas_window *win, int offset, bool re);
 
+/*
+ * Register / unregister coprocessor type to VAS API which will be exported
+ * to user space. Applications can use this API to open / close window
+ * which can be used to send / receive requests directly to coprocessor.
+ *
+ * Only NX GZIP coprocessor type is supported now, but this API can be
+ * used for others in future.
+ */
+int vas_register_coproc_api(struct module *mod);
+void vas_unregister_coproc_api(void);
+
 #endif /* __ASM_POWERPC_VAS_H */
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index 395789f..fe3f0fb 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_MEMORY_FAILURE)  += opal-memory-errors.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
-obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o vas-debug.o vas-fault.o
+obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o vas-debug.o vas-fault.o vas-api.o
 obj-$(CONFIG_OCXL_BASE)+= ocxl.o
 obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
 obj-$(CONFIG_PPC_SECURE_BOOT) += opal-secvar.o
diff --git a/arch/powerpc/platforms/powernv/vas-api.c 
b/arch/powerpc/platforms/powernv/vas-api.c
new file mode 100644
index 000..7d049af
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-api.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * VAS user space API for its accelerators (Only NX-GZIP is supported now)
+ * Copyright (C) 2019 Haren Myneni, IBM Corp
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "vas.h"
+
+/*
+ * The driver creates the device node that can be used as follows:
+ * For NX-GZIP
+ *
+ * fd = open("/dev/crypto/nx-gzip", O_RDWR);
+ * rc = ioctl(fd, VAS_TX_WIN_OPEN, );
+ * paste_addr = mmap(NULL, PAGE_SIZE, prot, MAP_SHARED, fd, 0ULL).
+ * vas_copy(, 0, 1);
+ * vas_paste(paste_addr, 0, 1);
+ * close(fd) or exit process to close window.
+ *
+ * where "vas_copy" and "vas_paste" are defined in copy-paste.h.
+ * copy/paste returns to the user space directly. So refer to the NX hardware
+ * documentation for exact copy/paste usage and completion / error
+ * conditions.
+ */
+
+static char*coproc_dev_name = "nx-gzip";
+
+/*
+ * Wrapper object for the nx-gzip device - there is just one instance of
+ * this node for the whole system.
+ */
+static struct coproc_dev {
+   struct cdev cdev;
+   struct device *device;
+   char *name;
+   dev_t devt;
+   struct class *class;
+} coproc_device;
+
+static char *coproc_devnode(struct device *dev, umode_t *mode)
+{
+   return kasprintf(GFP_KERNEL, "crypto/%s", dev_name(dev));
+}
+
+static int coproc_open(struct inode *inode, struct file *fp)
+{
+   /*
+* vas_window is allocated and assigned to fp->private_data
+* in ioctl. Nothing to do here for NX GZIP.
+*/
+   return 0;
+}
+
+static int coproc_ioc_tx_win_open(struct file *fp, unsigned long arg)
+{
+   void __user *uptr = (void __user *)arg;
+   struct vas_tx_win_attr txattr = {};
+   struct vas_tx_win_open_attr uattr;
+   struct vas_window *txwin;
+   int rc, vasid;
+
+   /*
+* One 

[PATCH v5 2/9] powerpc/vas: Define VAS_TX_WIN_OPEN ioctl API

2020-04-01 Thread Haren Myneni


Define the VAS_TX_WIN_OPEN ioctl interface for NX GZIP access
from user space. This interface is used to open GZIP send window and
mmap region which can be used by user space to send requests to NX
directly with copy/paste instructions.

Signed-off-by: Haren Myneni 
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |  1 +
 arch/powerpc/include/uapi/asm/vas-api.h| 22 ++
 2 files changed, 23 insertions(+)
 create mode 100644 arch/powerpc/include/uapi/asm/vas-api.h

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst 
b/Documentation/userspace-api/ioctl/ioctl-number.rst
index f759eda..f18accb 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -286,6 +286,7 @@ Code  Seq#Include File  
 Comments
 'v'   00-1F  linux/fs.h  conflict!
 'v'   00-0F  linux/sonypi.h  conflict!
 'v'   00-0F  media/v4l2-subdev.h conflict!
+'v'   20-27  arch/powerpc/include/uapi/asm/vas-api.hVAS API
 'v'   C0-FF  linux/meye.hconflict!
 'w'   allCERN SCI 
driver
 'y'   00-1F  packet 
based user level communications
diff --git a/arch/powerpc/include/uapi/asm/vas-api.h 
b/arch/powerpc/include/uapi/asm/vas-api.h
new file mode 100644
index 000..fe95d67
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/vas-api.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Copyright 2019 IBM Corp.
+ */
+
+#ifndef _UAPI_MISC_VAS_H
+#define _UAPI_MISC_VAS_H
+
+#include 
+
+#define VAS_MAGIC  'v'
+#define VAS_TX_WIN_OPEN_IOW(VAS_MAGIC, 0x20, struct 
vas_tx_win_open_attr)
+
+struct vas_tx_win_open_attr {
+   __u32   version;
+   __s16   vas_id; /* specific instance of vas or -1 for default */
+   __u16   reserved1;
+   __u64   flags;  /* Future use */
+   __u64   reserved2[6];
+};
+
+#endif /* _UAPI_MISC_VAS_H */
-- 
1.8.3.1





[PATCH v5 1/9] powerpc/vas: Initialize window attributes for GZIP coprocessor type

2020-04-01 Thread Haren Myneni


Initialize send and receive window attributes for GZIP high and
normal priority types.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-window.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index be900ad..d239c4b 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -817,7 +817,8 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, 
enum vas_cop_type cop)
 {
memset(rxattr, 0, sizeof(*rxattr));
 
-   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+   cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
rxattr->pin_win = true;
rxattr->nx_win = true;
rxattr->fault_win = false;
@@ -892,7 +893,8 @@ void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, 
enum vas_cop_type cop)
 {
memset(txattr, 0, sizeof(*txattr));
 
-   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+   if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+   cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
txattr->rej_no_credit = false;
txattr->rx_wcred_mode = true;
txattr->tx_wcred_mode = true;
@@ -976,9 +978,14 @@ static bool tx_win_args_valid(enum vas_cop_type cop,
if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
return false;
 
-   if (attr->user_win &&
-   (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count))
-   return false;
+   if (attr->user_win) {
+   if (attr->rsvd_txbuf_count)
+   return false;
+
+   if (cop != VAS_COP_TYPE_FTW && cop != VAS_COP_TYPE_GZIP &&
+   cop != VAS_COP_TYPE_GZIP_HIPRI)
+   return false;
+   }
 
return true;
 }
-- 
1.8.3.1





[PATCH v5 0/9] crypto/nx: Enable GZIP engine and provide userspace API

2020-04-01 Thread Haren Myneni


The Power9 processor supports the Virtual Accelerator Switchboard (VAS),
which allows kernel and userspace to send compression requests to the Nest
Accelerator (NX) directly. The NX unit comprises two 842 compression
engines and one GZIP engine. The Linux kernel already has in-kernel 842
compression support. This patch series adds GZIP compression support
from user space. The GZIP compression engine implements the ZLIB and
GZIP compression algorithms. There are no plans to add NX-GZIP compression
support in the kernel right now.

Applications can send requests to NX directly with COPY/PASTE
instructions. But kernel has to establish channel / window on NX-GZIP
device for the userspace. So userspace access to the GZIP engine is
provided through /dev/crypto/nx-gzip device with several operations.

An application must open this device to obtain a file descriptor (fd).
Using the fd, the application should issue the VAS_TX_WIN_OPEN ioctl to
establish a connection to the engine. Once the window is opened, it should
use the mmap() system call to map the hardware address of the engine's
request queue into the application's virtual address space. User space then
forms the request as a co-processor Request Block (CRB) and pastes this CRB
on the mapped HW address using COPY/PASTE instructions. The application can
poll on status flags (part of the CRB) with a timeout for request completion.

For VAS_TX_WIN_OPEN ioctl, if user space passes vas_id = -1 (struct
vas_tx_win_open_attr), kernel determines the VAS instance on the
corresponding chip based on the CPU on which the process is executing.
Otherwise, the specified VAS instance is used if application passes the
proper VAS instance (vas_id listed in /proc/device-tree/vas@*/ibm,vas_id).

Process can open multiple windows with different FDs or can send several
requests to NX on the same window at the same time.

A userspace library libnxz is available:
https://github.com/abalib/power-gzip

Applications that use inflate/deflate calls can link with libNXz and use
NX GZIP compression without any modification.

Tested the available 842 compression on power8 and power9 system to make
sure no regression and tested GZIP compression on power9 with tests
available in the above link.

Thanks to Bulent Abali for nxz library and tests development.

Changelog:

V2:
  - Move user space API code to powerpc as suggested. Also this API
can be extended to any other coprocessor type that VAS can support
in future. Example: Fast thread wakeup feature from VAS
  - Rebased to 5.6-rc3

V3:
  - Fix sparse warnings (patches 3&6)

V4:
  - Remove unused coproc_instid and add only window address in
fp->private_data.
  - Add NX User's manual and Copy/paste links in VAS API documentation
in patch and other changes as Daniel Axtens suggested

V5:
  - Added "NX Fault handling" section in VAS API documentation as Nick
suggested.
  - Documentation: mmap size should be PAGE_SIZE, as Daniel Axtens pointed out.

Haren Myneni (9):
  powerpc/vas: Initialize window attributes for GZIP coprocessor type
  powerpc/vas: Define VAS_TX_WIN_OPEN ioctl API
  powerpc/vas: Add VAS user space API
  crypto/nx: Initialize coproc entry with kzalloc
  crypto/nx: Rename nx-842-powernv file name to nx-common-powernv
  crypto/nx: Make enable code generic to add new GZIP compression type
  crypto/nx: Enable and setup GZIP compression type
  crypto/nx: Remove 'pid' in vas_tx_win_attr struct
  Documentation/powerpc: VAS API

 Documentation/powerpc/index.rst|1 +
 Documentation/powerpc/vas-api.rst  |  292 +
 Documentation/userspace-api/ioctl/ioctl-number.rst |1 +
 arch/powerpc/include/asm/vas.h |   12 +-
 arch/powerpc/include/uapi/asm/vas-api.h|   22 +
 arch/powerpc/platforms/powernv/Makefile|2 +-
 arch/powerpc/platforms/powernv/vas-api.c   |  257 +
 arch/powerpc/platforms/powernv/vas-window.c|   23 +-
 arch/powerpc/platforms/powernv/vas.h   |2 +
 drivers/crypto/nx/Makefile |2 +-
 drivers/crypto/nx/nx-842-powernv.c | 1062 --
 drivers/crypto/nx/nx-common-powernv.c  | 1133 
 12 files changed, 1736 insertions(+), 1073 deletions(-)
 create mode 100644 Documentation/powerpc/vas-api.rst
 create mode 100644 arch/powerpc/include/uapi/asm/vas-api.h
 create mode 100644 arch/powerpc/platforms/powernv/vas-api.c
 delete mode 100644 drivers/crypto/nx/nx-842-powernv.c
 create mode 100644 drivers/crypto/nx/nx-common-powernv.c

-- 
1.8.3.1





[PATCH v2] sched/core: fix illegal RCU from offline CPUs

2020-04-01 Thread Qian Cai
From: Peter Zijlstra 

In the CPU-offline process, it calls mmdrop() after idle entry and the
subsequent call to cpuhp_report_idle_dead(). Once execution passes the
call to rcu_report_dead(), RCU is ignoring the CPU, which results in
lockdep complaining when mmdrop() uses RCU from either memcg or
debugobjects below.

Fix it by cleaning up the active_mm state from BP instead. Every arch
which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit()
from AP. The only exception is parisc because it switches them to
&init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()),
but the patch will still work there because it calls mmgrab(&init_mm) in
smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu().

WARNING: suspicious RCU usage
-
kernel/workqueue.c:710 RCU or wq_pool_mutex should be held!

other info that might help us debug this:

RCU used illegally from offline CPU!
Call Trace:
 dump_stack+0xf4/0x164 (unreliable)
 lockdep_rcu_suspicious+0x140/0x164
 get_work_pool+0x110/0x150
 __queue_work+0x1bc/0xca0
 queue_work_on+0x114/0x120
 css_release+0x9c/0xc0
 percpu_ref_put_many+0x204/0x230
 free_pcp_prepare+0x264/0x570
 free_unref_page+0x38/0xf0
 __mmdrop+0x21c/0x2c0
 idle_task_exit+0x170/0x1b0
 pnv_smp_cpu_kill_self+0x38/0x2e0
 cpu_die+0x48/0x64
 arch_cpu_idle_dead+0x30/0x50
 do_idle+0x2f4/0x470
 cpu_startup_entry+0x38/0x40
 start_secondary+0x7a8/0xa80
 start_secondary_resume+0x10/0x14


Signed-off-by: Qian Cai 
---
 arch/powerpc/platforms/powernv/smp.c |  1 -
 include/linux/sched/mm.h |  2 ++
 kernel/cpu.c | 18 +-
 kernel/sched/core.c  |  5 +++--
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/smp.c 
b/arch/powerpc/platforms/powernv/smp.c
index 13e251699346..b2ba3e95bda7 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
/* Standard hot unplug procedure */
 
idle_task_exit();
-   current->active_mm = NULL; /* for sanity */
cpu = smp_processor_id();
DBG("CPU%d offline\n", cpu);
generic_set_cpu_dead(cpu);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c49257a3b510..a132d875d351 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
 }
 
+void mmdrop(struct mm_struct *mm);
+
 /*
  * This has to be called after a get_task_mm()/mmget_not_zero()
  * followed by taking the mmap_sem for writing before modifying the
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2371292f30b0..244d30544377 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,6 +3,7 @@
  *
  * This code is licenced under the GPL.
  */
+#include 
 #include 
 #include 
 #include 
@@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu)
return bringup_wait_for_ap(cpu);
 }
 
+static int finish_cpu(unsigned int cpu)
+{
+   struct task_struct *idle = idle_thread_get(cpu);
+   struct mm_struct *mm = idle->active_mm;
+
+   /*
+* idle_task_exit() will have switched to &init_mm, now
+* clean up any remaining active_mm state.
+*/
+   if (mm != _mm)
+   idle->active_mm = _mm;
+   mmdrop(mm);
+   return 0;
+}
+
 /*
  * Hotplug state machine related functions
  */
@@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_BRINGUP_CPU] = {
.name   = "cpu:bringup",
.startup.single = bringup_cpu,
-   .teardown.single= NULL,
+   .teardown.single= finish_cpu,
.cant_stop  = true,
},
/* Final state before CPU kills itself */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a2694ba82874..8787958339d5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6200,13 +6200,14 @@ void idle_task_exit(void)
struct mm_struct *mm = current->active_mm;
 
BUG_ON(cpu_online(smp_processor_id()));
+   BUG_ON(current != this_rq()->idle);
 
if (mm != _mm) {
switch_mm(mm, _mm, current);
-   current->active_mm = _mm;
finish_arch_post_lock_switch();
}
-   mmdrop(mm);
+
+   /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
 /*
-- 
2.21.0 (Apple Git-122.2)



Re: [PATCH v2] powerpc/pseries: Fix MCE handling on pseries

2020-04-01 Thread Ganesh

On 3/20/20 4:31 PM, Ganesh Goudar wrote:


MCE handling on pSeries platform fails as recent rework to use common
code for pSeries and PowerNV in machine check error handling tries to
access per-cpu variables in realmode. The per-cpu variables may be
outside the RMO region on pSeries platform and needs translation to be
enabled for access. Just moving these per-cpu variables into the RMO region
didn't help because we queue some work to workqueues in real mode, which
again tries to touch per-cpu variables. Also fwnmi_release_errinfo()
cannot be called when translation is not enabled.

This patch fixes this by enabling translation in the exception handler
when all required real mode handling is done. This change only affects
the pSeries platform.

Without this fix below kernel crash is seen on injecting
SLB multihit:

BUG: Unable to handle kernel data access on read at 0xc0027b205950
Faulting instruction address: 0xc003b7e0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in: mcetest_slb(OE+) af_packet(E) xt_tcpudp(E) ip6t_rpfilter(E) 
ip6t_REJECT(E) ipt_REJECT(E) xt_conntrack(E) ip_set(E) nfnetlink(E) 
ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) 
ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) nf_conntrack(E) 
nf_defrag_ipv6(E) nf_defrag_ipv4(E) iptable_mangle(E) iptable_raw(E) 
iptable_security(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) 
ip6_tables(E) iptable_filter(E) ip_tables(E) x_tables(E) xfs(E) ibmveth(E) 
vmx_crypto(E) gf128mul(E) uio_pdrv_genirq(E) uio(E) crct10dif_vpmsum(E) 
rtc_generic(E) btrfs(E) libcrc32c(E) xor(E) zstd_decompress(E) zstd_compress(E) 
raid6_pq(E) sr_mod(E) sd_mod(E) cdrom(E) ibmvscsi(E) scsi_transport_srp(E) 
crc32c_vpmsum(E) dm_mod(E) sg(E) scsi_mod(E)
CPU: 34 PID: 8154 Comm: insmod Kdump: loaded Tainted: G OE 5.5.0-mahesh #1
NIP: c003b7e0 LR: c00f2218 CTR: 
REGS: c7dcb960 TRAP: 0300 Tainted: G OE (5.5.0-mahesh)
MSR: 80001003  CR: 28002428 XER: 2004
CFAR: c00f2214 DAR: c0027b205950 DSISR: 4000 IRQMASK: 0
GPR00: c00f2218 c7dcbbf0 c1544800 c7dcbd70
GPR04: 0001 c7dcbc98 c00800d00258 c008011c
GPR08:  00030003 c1035950 0348
GPR12: 00027a1d c7f9c000 0558 
GPR16: 0540 c0080111 c00801110540 
GPR20: c022af10 c0025480fd70 c0080128 c0004bfbb300
GPR24: c1442330 c008080d c0080800 4009287a77000510
GPR28:  0002 c1033d30 0001
NIP [c003b7e0] save_mce_event+0x30/0x240
LR [c00f2218] pseries_machine_check_realmode+0x2c8/0x4f0
Call Trace:
Instruction dump:
3c4c0151 38429050 7c0802a6 6000 fbc1fff0 fbe1fff8 f821ffd1 3d42ffaf
3fc2ffaf e98d0030 394a1150 3bdef530 <7d6a62aa> 1d2b0048 2f8b0063 380b0001
---[ end trace 46fd63f36bbdd940 ]---

Fixes: 9ca766f9891d ("powerpc/64s/pseries: machine check convert to use common event 
code")
Reviewed-by: Mahesh Salgaonkar 
Reviewed-by: Nicholas Piggin 
Signed-off-by: Ganesh Goudar 


Hi mpe, Do you have any comments on this patch ?



[PATCH v9 13/13] powerpc/vas: Free send window in VAS instance after credits returned

2020-04-01 Thread Haren Myneni


NX may be processing requests while trying to close window. Wait until
all credits are returned and then free send window from VAS instance.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-window.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index c8644c3..be900ad 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1317,14 +1317,14 @@ int vas_win_close(struct vas_window *window)
 
unmap_paste_region(window);
 
-   clear_vinst_win(window);
-
poll_window_busy_state(window);
 
unpin_close_window(window);
 
poll_window_credits(window);
 
+   clear_vinst_win(window);
+
poll_window_castout(window);
 
/* if send window, drop reference to matching receive window */
-- 
1.8.3.1





[PATCH v9 12/13] powerpc/vas: Display process stuck message

2020-04-01 Thread Haren Myneni


A process cannot close a send window until all requests are processed.
This means waiting until the window state is not busy and send credits are
returned. Display debug messages in case it takes longer to close the
window.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-window.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 084e76b..c8644c3 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1182,6 +1182,7 @@ static void poll_window_credits(struct vas_window *window)
 {
u64 val;
int creds, mode;
+   int count = 0;
 
val = read_hvwc_reg(window, VREG(WINCTL));
if (window->tx_win)
@@ -1200,10 +1201,27 @@ static void poll_window_credits(struct vas_window 
*window)
creds = GET_FIELD(VAS_LRX_WCRED, val);
}
 
+   /*
+* Takes around few milliseconds to complete all pending requests
+* and return credits.
+* TODO: Scan fault FIFO and invalidate CRBs points to this window
+*   and issue CRB Kill to stop all pending requests. Need only
+*   if there is a bug in NX or fault handling in kernel.
+*/
if (creds < window->wcreds_max) {
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(msecs_to_jiffies(10));
+   count++;
+   /*
+* Process can not close send window until all credits are
+* returned.
+*/
+   if (!(count % 1))
+   pr_debug("VAS: pid %d stuck. Waiting for credits 
returned for Window(%d). creds %d, Retries %d\n",
+   vas_window_pid(window), window->winid,
+   creds, count);
+
goto retry;
}
 }
@@ -1217,6 +1235,7 @@ static void poll_window_busy_state(struct vas_window 
*window)
 {
int busy;
u64 val;
+   int count = 0;
 
 retry:
val = read_hvwc_reg(window, VREG(WIN_STATUS));
@@ -1225,6 +1244,15 @@ static void poll_window_busy_state(struct vas_window 
*window)
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(msecs_to_jiffies(5));
+   count++;
+   /*
+* Takes around few milliseconds to process all pending
+* requests.
+*/
+   if (!(count % 1))
+   pr_debug("VAS: pid %d stuck. Window (ID=%d) is in busy 
state. Retries %d\n",
+   vas_window_pid(window), window->winid, count);
+
goto retry;
}
 }
-- 
1.8.3.1





[PATCH v9 11/13] powerpc/vas: Do not use default credits for receive window

2020-04-01 Thread Haren Myneni


System checkstops if RxFIFO overruns with more requests than the
maximum possible number of CRBs allowed in FIFO at any time. So
max credits value (rxattr.wcreds_max) is set and is passed to
vas_rx_win_open() by the driver.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-window.c | 4 ++--
 arch/powerpc/platforms/powernv/vas.h| 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 33aaa7a..084e76b 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -772,7 +772,7 @@ static bool rx_win_args_valid(enum vas_cop_type cop,
if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX)
return false;
 
-   if (attr->wcreds_max > VAS_RX_WCREDS_MAX)
+   if (!attr->wcreds_max)
return false;
 
if (attr->nx_win) {
@@ -877,7 +877,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum 
vas_cop_type cop,
rxwin->nx_win = rxattr->nx_win;
rxwin->user_win = rxattr->user_win;
rxwin->cop = cop;
-   rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+   rxwin->wcreds_max = rxattr->wcreds_max;
 
init_winctx_for_rxwin(rxwin, rxattr, );
init_winctx_regs(rxwin, );
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index efdaa28..32b5261 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -101,11 +101,9 @@
 /*
  * Initial per-process credits.
  * Max send window credits:4K-1 (12-bits in VAS_TX_WCRED)
- * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED)
  *
  * TODO: Needs tuning for per-process credits
  */
-#define VAS_RX_WCREDS_MAX  ((64 << 10) - 1)
 #define VAS_TX_WCREDS_MAX  ((4 << 10) - 1)
 #define VAS_WCREDS_DEFAULT (1 << 10)
 
-- 
1.8.3.1





[PATCH v9 10/13] powerpc/vas: Print CRB and FIFO values

2020-04-01 Thread Haren Myneni


Dump FIFO entries if the send window could not be found, and print the CRB
for debugging.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-fault.c | 41 ++
 1 file changed, 41 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/vas-fault.c 
b/arch/powerpc/platforms/powernv/vas-fault.c
index b6bec64..25db70b 100644
--- a/arch/powerpc/platforms/powernv/vas-fault.c
+++ b/arch/powerpc/platforms/powernv/vas-fault.c
@@ -26,6 +26,28 @@
  */
 #define VAS_FAULT_WIN_FIFO_SIZE(4 << 20)
 
+static void dump_crb(struct coprocessor_request_block *crb)
+{
+   struct data_descriptor_entry *dde;
+   struct nx_fault_stamp *nx;
+
+   dde = >source;
+   pr_devel("SrcDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n",
+   be64_to_cpu(dde->address), be32_to_cpu(dde->length),
+   dde->count, dde->index, dde->flags);
+
+   dde = >target;
+   pr_devel("TgtDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n",
+   be64_to_cpu(dde->address), be32_to_cpu(dde->length),
+   dde->count, dde->index, dde->flags);
+
+   nx = >stamp.nx;
+   pr_devel("NX Stamp: PSWID 0x%x, FSA 0x%llx, flags 0x%x, FS 0x%x\n",
+   be32_to_cpu(nx->pswid),
+   be64_to_cpu(crb->stamp.nx.fault_storage_addr),
+   nx->flags, nx->fault_status);
+}
+
 /*
  * Update the CSB to indicate a translation error.
  *
@@ -148,6 +170,23 @@ static void update_csb(struct vas_window *window,
pid_vnr(pid), rc);
 }
 
+static void dump_fifo(struct vas_instance *vinst, void *entry)
+{
+   unsigned long *end = vinst->fault_fifo + vinst->fault_fifo_size;
+   unsigned long *fifo = entry;
+   int i;
+
+   pr_err("Fault fifo size %d, Max crbs %d\n", vinst->fault_fifo_size,
+   vinst->fault_fifo_size / CRB_SIZE);
+
+   /* Dump 10 CRB entries or until end of FIFO */
+   pr_err("Fault FIFO Dump:\n");
+   for (i = 0; i < 10*(CRB_SIZE/8) && fifo < end; i += 4, fifo += 4) {
+   pr_err("[%.3d, %p]: 0x%.16lx 0x%.16lx 0x%.16lx 0x%.16lx\n",
+   i, fifo, *fifo, *(fifo+1), *(fifo+2), *(fifo+3));
+   }
+}
+
 /*
  * Process valid CRBs in fault FIFO.
  * NX process user space requests, return credit and update the status
@@ -233,6 +272,7 @@ irqreturn_t vas_fault_thread_fn(int irq, void *data)
vinst->vas_id, vinst->fault_fifo, fifo,
vinst->fault_crbs);
 
+   dump_crb(crb);
window = vas_pswid_to_window(vinst,
be32_to_cpu(crb->stamp.nx.pswid));
 
@@ -245,6 +285,7 @@ irqreturn_t vas_fault_thread_fn(int irq, void *data)
 * But we should not get here.
 * TODO: Disable IRQ.
 */
+   dump_fifo(vinst, (void *)entry);
pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, 
fault_crbs %d bad CRB?\n",
vinst->vas_id, vinst->fault_fifo, fifo,
be32_to_cpu(crb->stamp.nx.pswid),
-- 
1.8.3.1





[PATCH v9 09/13] powerpc/vas: Return credits after handling fault

2020-04-01 Thread Haren Myneni


NX uses a credit mechanism to control the number of requests issued on
a specific window at any point in time. Only send windows and the fault
window use credits. When a request is issued on a given window,
a credit is taken. This credit is returned after that request is
processed. If no credits are available, NX returns RMA_Busy for a send
window and RMA_Reject for the fault window.

NX expects OS to return credit for send window after processing fault
CRB. Also credit has to be returned for fault window after handling
the fault.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-fault.c  |  9 
 arch/powerpc/platforms/powernv/vas-window.c | 36 +
 arch/powerpc/platforms/powernv/vas.h|  1 +
 3 files changed, 46 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/vas-fault.c 
b/arch/powerpc/platforms/powernv/vas-fault.c
index 354577d..b6bec64 100644
--- a/arch/powerpc/platforms/powernv/vas-fault.c
+++ b/arch/powerpc/platforms/powernv/vas-fault.c
@@ -224,6 +224,10 @@ irqreturn_t vas_fault_thread_fn(int irq, void *data)
memcpy(crb, fifo, CRB_SIZE);
entry->stamp.nx.pswid = cpu_to_be32(FIFO_INVALID_ENTRY);
entry->ccw |= cpu_to_be32(CCW0_INVALID);
+   /*
+* Return credit for the fault window.
+*/
+   vas_return_credit(vinst->fault_win, false);
 
pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d\n",
vinst->vas_id, vinst->fault_fifo, fifo,
@@ -249,6 +253,11 @@ irqreturn_t vas_fault_thread_fn(int irq, void *data)
WARN_ON_ONCE(1);
} else {
update_csb(window, crb);
+   /*
+* Return credit for send window after processing
+* fault CRB.
+*/
+   vas_return_credit(window, true);
}
}
 }
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 382fe25..33aaa7a 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1318,6 +1318,42 @@ int vas_win_close(struct vas_window *window)
 }
 EXPORT_SYMBOL_GPL(vas_win_close);
 
+/*
+ * Return credit for the given window.
+ * Send windows and fault window uses credit mechanism as follows:
+ *
+ * Send windows:
+ * - The default number of credits available for each send window is
+ *   1024. It means 1024 requests can be issued asynchronously at the
+ *   same time. If the credit is not available, that request will be
+ *   returned with RMA_Busy.
+ * - One credit is taken when NX request is issued.
+ * - This credit is returned after NX processed that request.
+ * - If NX encounters translation error, kernel will return the
+ *   credit on the specific send window after processing the fault CRB.
+ *
+ * Fault window:
+ * - The total number credits available is FIFO_SIZE/CRB_SIZE.
+ *   Means 4MB/128 in the current implementation. If credit is not
+ *   available, RMA_Reject is returned.
+ * - A credit is taken when NX pastes CRB in fault FIFO.
+ * - The kernel will return credit on fault window after reading entry
+ *   from fault FIFO.
+ */
+void vas_return_credit(struct vas_window *window, bool tx)
+{
+   uint64_t val;
+
+   val = 0ULL;
+   if (tx) { /* send window */
+   val = SET_FIELD(VAS_TX_WCRED, val, 1);
+   write_hvwc_reg(window, VREG(TX_WCRED_ADDER), val);
+   } else {
+   val = SET_FIELD(VAS_LRX_WCRED, val, 1);
+   write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), val);
+   }
+}
+
 struct vas_window *vas_pswid_to_window(struct vas_instance *vinst,
uint32_t pswid)
 {
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index 0af7912..efdaa28 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -436,6 +436,7 @@ struct vas_winctx {
 extern int vas_setup_fault_window(struct vas_instance *vinst);
 extern irqreturn_t vas_fault_thread_fn(int irq, void *data);
 extern irqreturn_t vas_fault_handler(int irq, void *dev_id);
+extern void vas_return_credit(struct vas_window *window, bool tx);
 extern struct vas_window *vas_pswid_to_window(struct vas_instance *vinst,
uint32_t pswid);
 
-- 
1.8.3.1





[PATCH v9 08/13] powerpc/vas: Update CSB and notify process for fault CRBs

2020-04-01 Thread Haren Myneni


Applications poll on the CSB for the status update after requests are
issued. NX processes these requests and updates the CSB with the status.
If it encounters a translation error, it pastes the CRB in the fault FIFO
and raises an interrupt. The kernel handles the fault by reading the CRB
from the fault FIFO and processing it.

For each fault CRB, update the fault address in the CRB (fault_storage_addr)
and the translation error status in the CSB so that user space can touch the
fault address and resend the request. If user space passed an invalid
CSB address, send a SIGSEGV signal to the process.

In the case of multi-thread applications, child thread may not be
available. So if the task is not running, send signal to tgid.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-fault.c | 126 -
 1 file changed, 125 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/vas-fault.c 
b/arch/powerpc/platforms/powernv/vas-fault.c
index 0da8358..354577d 100644
--- a/arch/powerpc/platforms/powernv/vas-fault.c
+++ b/arch/powerpc/platforms/powernv/vas-fault.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -26,6 +27,128 @@
 #define VAS_FAULT_WIN_FIFO_SIZE(4 << 20)
 
 /*
+ * Update the CSB to indicate a translation error.
+ *
+ * User space will be polling on CSB after the request is issued.
+ * If NX can handle the request without any issues, it updates CSB.
+ * Whereas if NX encounters page fault, the kernel will handle the
+ * fault and update CSB with translation error.
+ *
+ * If we are unable to update the CSB means copy_to_user failed due to
+ * invalid csb_addr, send a signal to the process.
+ */
+static void update_csb(struct vas_window *window,
+   struct coprocessor_request_block *crb)
+{
+   struct coprocessor_status_block csb;
+   struct kernel_siginfo info;
+   struct task_struct *tsk;
+   void __user *csb_addr;
+   struct pid *pid;
+   int rc;
+
+   /*
+* NX user space windows can not be opened for task->mm=NULL
+* and faults will not be generated for kernel requests.
+*/
+   if (WARN_ON_ONCE(!window->mm || !window->user_win))
+   return;
+
+   csb_addr = (void __user *)be64_to_cpu(crb->csb_addr);
+
+   memset(, 0, sizeof(csb));
+   csb.cc = CSB_CC_TRANSLATION;
+   csb.ce = CSB_CE_TERMINATION;
+   csb.cs = 0;
+   csb.count = 0;
+
+   /*
+* NX operates and returns in BE format as defined CRB struct.
+* So saves fault_storage_addr in BE as NX pastes in FIFO and
+* expects user space to convert to CPU format.
+*/
+   csb.address = crb->stamp.nx.fault_storage_addr;
+   csb.flags = 0;
+
+   pid = window->pid;
+   tsk = get_pid_task(pid, PIDTYPE_PID);
+   /*
+* Process closes send window after all pending NX requests are
+* completed. In multi-thread applications, a child thread can
+* open a window and can exit without closing it. May be some
+* requests are pending or this window can be used by other
+* threads later. We should handle faults if NX encounters
+* pages faults on these requests. Update CSB with translation
+* error and fault address. If csb_addr passed by user space is
+* invalid, send SEGV signal to pid saved in window. If the
+* child thread is not running, send the signal to tgid.
+* Parent thread (tgid) will close this window upon its exit.
+*
+* pid and mm references are taken when window is opened by
+* process (pid). So tgid is used only when child thread opens
+* a window and exits without closing it.
+*/
+   if (!tsk) {
+   pid = window->tgid;
+   tsk = get_pid_task(pid, PIDTYPE_PID);
+   /*
+* Parent thread (tgid) will be closing window when it
+* exits. So should not get here.
+*/
+   if (WARN_ON_ONCE(!tsk))
+   return;
+   }
+
+   /* Return if the task is exiting. */
+   if (tsk->flags & PF_EXITING) {
+   put_task_struct(tsk);
+   return;
+   }
+
+   use_mm(window->mm);
+   rc = copy_to_user(csb_addr, , sizeof(csb));
+   /*
+* User space polls on csb.flags (first byte). So add barrier
+* then copy first byte with csb flags update.
+*/
+   if (!rc) {
+   csb.flags = CSB_V;
+   /* Make sure update to csb.flags is visible now */
+   smp_mb();
+   rc = copy_to_user(csb_addr, , sizeof(u8));
+   }
+   unuse_mm(window->mm);
+   put_task_struct(tsk);
+
+   /* Success */
+   if (!rc)
+   return;
+
+   pr_debug("Invalid CSB address 0x%p signalling pid(%d)\n",
+   csb_addr, pid_vnr(pid));
+
+   

[PATCH v9 07/13] powerpc/vas: Setup thread IRQ handler per VAS instance

2020-04-01 Thread Haren Myneni


When NX encounters a translation error on a CRB or any request buffer,
it raises an interrupt on the CPU to handle the fault. It can raise one
interrupt for multiple faults. It expects the OS to handle these faults and
return credits for the fault window after processing them.

Setup thread IRQ handler and IRQ thread function per each VAS instance.
IRQ handler checks if the thread is already woken up and can handle new
faults. If so returns with IRQ_HANDLED, otherwise wake up thread to
process new faults.

The thread functions reads each CRB entry from fault FIFO until sees
invalid entry. After reading each CRB, determine the corresponding
send window using pswid (from CRB) and process fault CRB. Then
invalidate the entry and return credit. Processing fault CRB and
return credit is described in subsequent patches.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-fault.c  | 131 
 arch/powerpc/platforms/powernv/vas-window.c |  60 +
 arch/powerpc/platforms/powernv/vas.c|  23 -
 arch/powerpc/platforms/powernv/vas.h|   7 ++
 4 files changed, 220 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/vas-fault.c 
b/arch/powerpc/platforms/powernv/vas-fault.c
index 4044998..0da8358 100644
--- a/arch/powerpc/platforms/powernv/vas-fault.c
+++ b/arch/powerpc/platforms/powernv/vas-fault.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "vas.h"
@@ -25,6 +26,136 @@
 #define VAS_FAULT_WIN_FIFO_SIZE(4 << 20)
 
 /*
+ * Process valid CRBs in fault FIFO.
+ * NX process user space requests, return credit and update the status
+ * in CRB. If it encounters translation error when accessing CRB or
+ * request buffers, raises interrupt on the CPU to handle the fault.
+ * It takes credit on fault window, updates nx_fault_stamp in CRB with
+ * the following information and pastes CRB in fault FIFO.
+ *
+ * pswid - window ID of the window on which the request is sent.
+ * fault_storage_addr - fault address
+ *
+ * It can raise a single interrupt for multiple faults. Expects OS to
+ * process all valid faults and return credit for each fault on user
+ * space and fault windows. This fault FIFO control will be done with
+ * credit mechanism. NX can continuously paste CRBs until credits are not
+ * available on fault window. Otherwise, returns with RMA_reject.
+ *
+ * Total credits available on fault window: FIFO_SIZE(4MB)/CRBS_SIZE(128)
+ *
+ */
+irqreturn_t vas_fault_thread_fn(int irq, void *data)
+{
+   struct vas_instance *vinst = data;
+   struct coprocessor_request_block *crb, *entry;
+   struct coprocessor_request_block buf;
+   struct vas_window *window;
+   unsigned long flags;
+   void *fifo;
+
+   crb = 
+
+   /*
+* VAS can interrupt with multiple page faults. So process all
+* valid CRBs within fault FIFO until reaches invalid CRB.
+* We use CCW[0] and pswid to validate CRBs:
+*
+* CCW[0]   Reserved bit. When NX pastes CRB, CCW[0]=0
+*  OS sets this bit to 1 after reading CRB.
+* pswidNX assigns window ID. Set pswid to -1 after
+*  reading CRB from fault FIFO.
+*
+* We exit this function if no valid CRBs are available to process.
+* So acquire fault_lock and reset fifo_in_progress to 0 before
+* exit.
+* In case kernel receives another interrupt with different page
+* fault, interrupt handler returns with IRQ_HANDLED if
+* fifo_in_progress is set. Means these new faults will be
+* handled by the current thread. Otherwise set fifo_in_progress
+* and return IRQ_WAKE_THREAD to wake up thread.
+*/
+   while (true) {
+   spin_lock_irqsave(>fault_lock, flags);
+   /*
+* Advance the fault fifo pointer to next CRB.
+* Use CRB_SIZE rather than sizeof(*crb) since the latter is
+* aligned to CRB_ALIGN (256) but the CRB written to by VAS is
+* only CRB_SIZE in len.
+*/
+   fifo = vinst->fault_fifo + (vinst->fault_crbs * CRB_SIZE);
+   entry = fifo;
+
+   if ((entry->stamp.nx.pswid == cpu_to_be32(FIFO_INVALID_ENTRY))
+   || (entry->ccw & cpu_to_be32(CCW0_INVALID))) {
+   vinst->fifo_in_progress = 0;
+   spin_unlock_irqrestore(>fault_lock, flags);
+   return IRQ_HANDLED;
+   }
+
+   spin_unlock_irqrestore(>fault_lock, flags);
+   vinst->fault_crbs++;
+   if (vinst->fault_crbs == (vinst->fault_fifo_size / CRB_SIZE))
+   vinst->fault_crbs = 0;
+
+   memcpy(crb, fifo, CRB_SIZE);
+   entry->stamp.nx.pswid = cpu_to_be32(FIFO_INVALID_ENTRY);
+ 

[PATCH 06/13] powerpc/vas: Take reference to PID and mm for user space windows

2020-04-01 Thread Haren Myneni


When a process opens a window, its pid and tgid are saved in the
vas_window struct. This window will be closed when the process exits.
The kernel handles NX faults by updating the CSB, or by sending a SEGV
signal to the pid of the process if the userspace CSB address is invalid.

In multi-thread applications, a window can be opened by a child thread,
but it will not be closed when this thread exits. It is expected that
the parent will clean up all resources including NX windows opened by
child threads. A child thread can send NX requests using this window
and could be killed before completion is reported. If the pid assigned
to this thread is reused while requests are pending, a failure SEGV
would be directed to the wrong place.

To prevent reusing the pid, take references to pid and mm when the window
is opened and release them when the window is closed. Then if the child
thread is not running, the SEGV signal will be sent to the thread group
leader (tgid).

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-debug.c  |  2 +-
 arch/powerpc/platforms/powernv/vas-window.c | 53 ++---
 arch/powerpc/platforms/powernv/vas.h|  9 -
 3 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-debug.c 
b/arch/powerpc/platforms/powernv/vas-debug.c
index 09e63df..ef9a717 100644
--- a/arch/powerpc/platforms/powernv/vas-debug.c
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -38,7 +38,7 @@ static int info_show(struct seq_file *s, void *private)
 
seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
window->tx_win ? "Send" : "Receive");
-   seq_printf(s, "Pid : %d\n", window->pid);
+   seq_printf(s, "Pid : %d\n", vas_window_pid(window));
 
 unlock:
mutex_unlock(_mutex);
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index dc46bf6..7054cd4 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -12,6 +12,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include "vas.h"
@@ -876,8 +878,6 @@ struct vas_window *vas_rx_win_open(int vasid, enum 
vas_cop_type cop,
rxwin->user_win = rxattr->user_win;
rxwin->cop = cop;
rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
-   if (rxattr->user_win)
-   rxwin->pid = task_pid_vnr(current);
 
init_winctx_for_rxwin(rxwin, rxattr, );
init_winctx_regs(rxwin, );
@@ -1027,7 +1027,6 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
txwin->tx_win = 1;
txwin->rxwin = rxwin;
txwin->nx_win = txwin->rxwin->nx_win;
-   txwin->pid = attr->pid;
txwin->user_win = attr->user_win;
txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
 
@@ -1059,8 +1058,43 @@ struct vas_window *vas_tx_win_open(int vasid, enum 
vas_cop_type cop,
goto free_window;
}
 
-   set_vinst_win(vinst, txwin);
+   if (txwin->user_win) {
+   /*
+* Window opened by a child thread may not be closed when
+* it exits. So take reference to its pid and release it
+* when the window is free by parent thread.
+* Acquire a reference to the task's pid to make sure
+* pid will not be re-used - needed only for multithread
+* applications.
+*/
+   txwin->pid = get_task_pid(current, PIDTYPE_PID);
+   /*
+* Acquire a reference to the task's mm.
+*/
+   txwin->mm = get_task_mm(current);
 
+   if (!txwin->mm) {
+   put_pid(txwin->pid);
+   pr_err("VAS: pid(%d): mm_struct is not found\n",
+   current->pid);
+   rc = -EPERM;
+   goto free_window;
+   }
+
+   mmgrab(txwin->mm);
+   mmput(txwin->mm);
+   mm_context_add_copro(txwin->mm);
+   /*
+* Process closes window during exit. In the case of
+* multithread application, the child thread can open
+* window and can exit without closing it. Expects parent
+* thread to use and close the window. So do not need
+* to take pid reference for parent thread.
+*/
+   txwin->tgid = find_get_pid(task_tgid_vnr(current));
+   }
+
+   set_vinst_win(vinst, txwin);
return txwin;
 
 free_window:
@@ -1257,8 +1291,17 @@ int vas_win_close(struct vas_window *window)
poll_window_castout(window);
 
/* if send window, drop reference to matching receive window */
-   if (window->tx_win)
+   if (window->tx_win) {
+   if (window->user_win) {
+ 

[PATCH v9 05/13] powerpc/vas: Register NX with fault window ID and IRQ port value

2020-04-01 Thread Haren Myneni


For each user space send window, register NX with fault window ID
and port value so that NX paste CRBs in this fault FIFO when it
sees fault on the request buffer.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas-window.c | 15 +--
 arch/powerpc/platforms/powernv/vas.h| 15 +++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 1783fa9..dc46bf6 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -373,7 +373,7 @@ int init_winctx_regs(struct vas_window *window, struct 
vas_winctx *winctx)
init_xlate_regs(window, winctx->user_win);
 
val = 0ULL;
-   val = SET_FIELD(VAS_FAULT_TX_WIN, val, 0);
+   val = SET_FIELD(VAS_FAULT_TX_WIN, val, winctx->fault_win_id);
write_hvwc_reg(window, VREG(FAULT_TX_WIN), val);
 
/* In PowerNV, interrupts go to HV. */
@@ -748,6 +748,8 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
 
winctx->min_scope = VAS_SCOPE_LOCAL;
winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+   if (rxwin->vinst->virq)
+   winctx->irq_port = rxwin->vinst->irq_port;
 }
 
 static bool rx_win_args_valid(enum vas_cop_type cop,
@@ -944,13 +946,22 @@ static void init_winctx_for_txwin(struct vas_window 
*txwin,
winctx->lpid = txattr->lpid;
winctx->pidr = txattr->pidr;
winctx->rx_win_id = txwin->rxwin->winid;
+   /*
+* IRQ and fault window setup is successful. Set fault window
+* for the send window so that ready to handle faults.
+*/
+   if (txwin->vinst->virq)
+   winctx->fault_win_id = txwin->vinst->fault_win->winid;
 
winctx->dma_type = VAS_DMA_TYPE_INJECT;
winctx->tc_mode = txattr->tc_mode;
winctx->min_scope = VAS_SCOPE_LOCAL;
winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+   if (txwin->vinst->virq)
+   winctx->irq_port = txwin->vinst->irq_port;
 
-   winctx->pswid = 0;
+   winctx->pswid = txattr->pswid ? txattr->pswid :
+   encode_pswid(txwin->vinst->vas_id, txwin->winid);
 }
 
 static bool tx_win_args_valid(enum vas_cop_type cop,
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index 9c8e3f5..88d084d 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -467,6 +467,21 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
return in_be64(win->hvwc_map+reg);
 }
 
+/*
+ * Encode/decode the Partition Send Window ID (PSWID) for a window in
+ * a way that we can uniquely identify any window in the system. i.e.
+ * we should be able to locate the 'struct vas_window' given the PSWID.
+ *
+ * Bits    Usage
+ * 0:7     VAS id (8 bits)
+ * 8:15    Unused, 0 (8 bits)
+ * 16:31   Window id (16 bits)
+ */
+static inline u32 encode_pswid(int vasid, int winid)
+{
+   return ((u32)winid | (vasid << (31 - 7)));
+}
+
 static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
 {
if (vasid)
-- 
1.8.3.1





[PATCH v9 04/13] powerpc/vas: Setup fault window per VAS instance

2020-04-01 Thread Haren Myneni


Set up a fault window for each VAS instance. When NX gets a fault on a
request buffer, it pastes the fault CRB in the corresponding fault FIFO
and then raises an interrupt to the OS. The kernel handles this fault
and processes fault CRBs from this FIFO.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/Makefile |  2 +-
 arch/powerpc/platforms/powernv/vas-fault.c  | 77 +
 arch/powerpc/platforms/powernv/vas-window.c |  4 +-
 arch/powerpc/platforms/powernv/vas.c| 20 
 arch/powerpc/platforms/powernv/vas.h| 21 
 5 files changed, 121 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/vas-fault.c

diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index c0f8120..395789f 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_MEMORY_FAILURE)  += opal-memory-errors.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
-obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o vas-debug.o
+obj-$(CONFIG_PPC_VAS)  += vas.o vas-window.o vas-debug.o vas-fault.o
 obj-$(CONFIG_OCXL_BASE)+= ocxl.o
 obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
 obj-$(CONFIG_PPC_SECURE_BOOT) += opal-secvar.o
diff --git a/arch/powerpc/platforms/powernv/vas-fault.c 
b/arch/powerpc/platforms/powernv/vas-fault.c
new file mode 100644
index 000..4044998
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-fault.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VAS Fault handling.
+ * Copyright 2019, IBM Corporation
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vas.h"
+
+/*
+ * The maximum FIFO size for fault window can be 8MB
+ * (VAS_RX_FIFO_SIZE_MAX). Using 4MB FIFO since each VAS
+ * instance will be having fault window.
+ * 8MB FIFO can be used if expects more faults for each VAS
+ * instance.
+ */
+#define VAS_FAULT_WIN_FIFO_SIZE(4 << 20)
+
+/*
+ * Fault window is opened per VAS instance. NX pastes fault CRB in fault
+ * FIFO upon page faults.
+ */
+int vas_setup_fault_window(struct vas_instance *vinst)
+{
+   struct vas_rx_win_attr attr;
+
+   vinst->fault_fifo_size = VAS_FAULT_WIN_FIFO_SIZE;
+   vinst->fault_fifo = kzalloc(vinst->fault_fifo_size, GFP_KERNEL);
+   if (!vinst->fault_fifo) {
+   pr_err("Unable to alloc %d bytes for fault_fifo\n",
+   vinst->fault_fifo_size);
+   return -ENOMEM;
+   }
+
+   /*
+* Invalidate all CRB entries. NX pastes valid entry for each fault.
+*/
+   memset(vinst->fault_fifo, FIFO_INVALID_ENTRY, vinst->fault_fifo_size);
+   vas_init_rx_win_attr(, VAS_COP_TYPE_FAULT);
+
+   attr.rx_fifo_size = vinst->fault_fifo_size;
+   attr.rx_fifo = vinst->fault_fifo;
+
+   /*
+* Max creds is based on number of CRBs can fit in the FIFO.
+* (fault_fifo_size/CRB_SIZE). If 8MB FIFO is used, max creds
+* will be 0xffff since the receive creds field is 16bits wide.
+*/
+   attr.wcreds_max = vinst->fault_fifo_size / CRB_SIZE;
+   attr.lnotify_lpid = 0;
+   attr.lnotify_pid = mfspr(SPRN_PID);
+   attr.lnotify_tid = mfspr(SPRN_PID);
+
+   vinst->fault_win = vas_rx_win_open(vinst->vas_id, VAS_COP_TYPE_FAULT,
+   );
+
+   if (IS_ERR(vinst->fault_win)) {
+   pr_err("VAS: Error %ld opening FaultWin\n",
+   PTR_ERR(vinst->fault_win));
+   kfree(vinst->fault_fifo);
+   return PTR_ERR(vinst->fault_win);
+   }
+
+   pr_devel("VAS: Created FaultWin %d, LPID/PID/TID [%d/%d/%d]\n",
+   vinst->fault_win->winid, attr.lnotify_lpid,
+   attr.lnotify_pid, attr.lnotify_tid);
+
+   return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
b/arch/powerpc/platforms/powernv/vas-window.c
index 0c0d27d..1783fa9 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -827,9 +827,9 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, 
enum vas_cop_type cop)
rxattr->fault_win = true;
rxattr->notify_disable = true;
rxattr->rx_wcred_mode = true;
-   rxattr->tx_wcred_mode = true;
rxattr->rx_win_ord_mode = true;
-   rxattr->tx_win_ord_mode = true;
+   rxattr->rej_no_credit = true;
+   rxattr->tc_mode = VAS_THRESH_DISABLED;
} else if (cop == VAS_COP_TYPE_FTW) {
rxattr->user_win = true;
rxattr->intr_disable = true;
diff --git a/arch/powerpc/platforms/powernv/vas.c 
b/arch/powerpc/platforms/powernv/vas.c
index 

[PATCH v9 03/13] powerpc/vas: Alloc and setup IRQ and trigger port address

2020-04-01 Thread Haren Myneni


Allocate a xive irq on each chip with a vas instance. The NX coprocessor
raises a host CPU interrupt via vas if it encounters page fault on user
space request buffer. Subsequent patches register the trigger port with
the NX coprocessor, and create a vas fault handler for this interrupt
mapping.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/platforms/powernv/vas.c | 44 +++-
 arch/powerpc/platforms/powernv/vas.h |  2 ++
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas.c 
b/arch/powerpc/platforms/powernv/vas.c
index ed9cc6d..3303cfe 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "vas.h"
 
@@ -25,10 +26,12 @@
 
 static int init_vas_instance(struct platform_device *pdev)
 {
-   int rc, cpu, vasid;
-   struct resource *res;
-   struct vas_instance *vinst;
struct device_node *dn = pdev->dev.of_node;
+   struct vas_instance *vinst;
+   struct xive_irq_data *xd;
+   uint32_t chipid, hwirq;
+   struct resource *res;
+   int rc, cpu, vasid;
 
rc = of_property_read_u32(dn, "ibm,vas-id", );
if (rc) {
@@ -36,6 +39,12 @@ static int init_vas_instance(struct platform_device *pdev)
return -ENODEV;
}
 
+   rc = of_property_read_u32(dn, "ibm,chip-id", );
+   if (rc) {
+   pr_err("No ibm,chip-id property for %s?\n", pdev->name);
+   return -ENODEV;
+   }
+
if (pdev->num_resources != 4) {
pr_err("Unexpected DT configuration for [%s, %d]\n",
pdev->name, vasid);
@@ -69,9 +78,32 @@ static int init_vas_instance(struct platform_device *pdev)
 
vinst->paste_win_id_shift = 63 - res->end;
 
-   pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, "
-   "paste_win_id_shift 0x%llx\n", pdev->name, vasid,
-   vinst->paste_base_addr, vinst->paste_win_id_shift);
+   hwirq = xive_native_alloc_irq_on_chip(chipid);
+   if (!hwirq) {
+   pr_err("Inst%d: Unable to allocate global irq for chip %d\n",
+   vinst->vas_id, chipid);
+   return -ENOENT;
+   }
+
+   vinst->virq = irq_create_mapping(NULL, hwirq);
+   if (!vinst->virq) {
+   pr_err("Inst%d: Unable to map global irq %d\n",
+   vinst->vas_id, hwirq);
+   return -EINVAL;
+   }
+
+   xd = irq_get_handler_data(vinst->virq);
+   if (!xd) {
+   pr_err("Inst%d: Invalid virq %d\n",
+   vinst->vas_id, vinst->virq);
+   return -EINVAL;
+   }
+
+   vinst->irq_port = xd->trig_page;
+   pr_devel("Initialized instance [%s, %d] paste_base 0x%llx 
paste_win_id_shift 0x%llx IRQ %d Port 0x%llx\n",
+   pdev->name, vasid, vinst->paste_base_addr,
+   vinst->paste_win_id_shift, vinst->virq,
+   vinst->irq_port);
 
for_each_possible_cpu(cpu) {
if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn))
diff --git a/arch/powerpc/platforms/powernv/vas.h 
b/arch/powerpc/platforms/powernv/vas.h
index 5574aec..598608b 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -313,6 +313,8 @@ struct vas_instance {
u64 paste_base_addr;
u64 paste_win_id_shift;
 
+   u64 irq_port;
+   int virq;
struct mutex mutex;
struct vas_window *rxwin[VAS_COP_TYPE_MAX];
struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
-- 
1.8.3.1





[PATCH v9 02/13] powerpc/vas: Define nx_fault_stamp in coprocessor_request_block

2020-04-01 Thread Haren Myneni


The kernel sets the fault address and status in the CRB for an NX page
fault on a user space address after processing the page fault. User space
gets the signal and handles the fault mentioned in the CRB by bringing the
page into memory and sending the NX request again.

Signed-off-by: Sukadev Bhattiprolu 
Signed-off-by: Haren Myneni 
---
 arch/powerpc/include/asm/icswx.h | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/icswx.h b/arch/powerpc/include/asm/icswx.h
index 9872f85..965b1f3 100644
--- a/arch/powerpc/include/asm/icswx.h
+++ b/arch/powerpc/include/asm/icswx.h
@@ -108,6 +108,17 @@ struct data_descriptor_entry {
__be64 address;
 } __packed __aligned(DDE_ALIGN);
 
+/* 4.3.2 NX-stamped Fault CRB */
+
+#define NX_STAMP_ALIGN  (0x10)
+
+struct nx_fault_stamp {
+   __be64 fault_storage_addr;
+   __be16 reserved;
+   __u8   flags;
+   __u8   fault_status;
+   __be32 pswid;
+} __packed __aligned(NX_STAMP_ALIGN);
 
 /* Chapter 6.5.2 Coprocessor-Request Block (CRB) */
 
@@ -135,10 +146,15 @@ struct coprocessor_request_block {
 
struct coprocessor_completion_block ccb;
 
-   u8 reserved[48];
+   union {
+   struct nx_fault_stamp nx;
+   u8 reserved[16];
+   } stamp;
+
+   u8 reserved[32];
 
struct coprocessor_status_block csb;
-} __packed __aligned(CRB_ALIGN);
+} __packed;
 
 
 /* RFC02167 Initiate Coprocessor Instructions document
-- 
1.8.3.1





[PATCH v9 01/13] powerpc/xive: Define xive_native_alloc_irq_on_chip()

2020-04-01 Thread Haren Myneni


This function allocates IRQ on a specific chip. VAS needs per chip
IRQ allocation and will have IRQ handler per VAS instance.

Signed-off-by: Haren Myneni 
Reviewed-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/xive.h   | 9 -
 arch/powerpc/sysdev/xive/native.c | 6 +++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index 93f982db..d08ea11 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -5,6 +5,8 @@
 #ifndef _ASM_POWERPC_XIVE_H
 #define _ASM_POWERPC_XIVE_H
 
+#include 
+
 #define XIVE_INVALID_VP0x
 
 #ifdef CONFIG_PPC_XIVE
@@ -108,7 +110,6 @@ struct xive_q {
 int xive_native_populate_irq_data(u32 hw_irq,
  struct xive_irq_data *data);
 void xive_cleanup_irq_data(struct xive_irq_data *xd);
-u32 xive_native_alloc_irq(void);
 void xive_native_free_irq(u32 irq);
 int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
 
@@ -137,6 +138,12 @@ int xive_native_set_queue_state(u32 vp_id, uint32_t prio, 
u32 qtoggle,
u32 qindex);
 int xive_native_get_vp_state(u32 vp_id, u64 *out_state);
 bool xive_native_has_queue_state_support(void);
+extern u32 xive_native_alloc_irq_on_chip(u32 chip_id);
+
+static inline u32 xive_native_alloc_irq(void)
+{
+   return xive_native_alloc_irq_on_chip(OPAL_XIVE_ANY_CHIP);
+}
 
 #else
 
diff --git a/arch/powerpc/sysdev/xive/native.c 
b/arch/powerpc/sysdev/xive/native.c
index 0ff6b73..14d4406 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -279,12 +279,12 @@ static int xive_native_get_ipi(unsigned int cpu, struct 
xive_cpu *xc)
 }
 #endif /* CONFIG_SMP */
 
-u32 xive_native_alloc_irq(void)
+u32 xive_native_alloc_irq_on_chip(u32 chip_id)
 {
s64 rc;
 
for (;;) {
-   rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP);
+   rc = opal_xive_allocate_irq(chip_id);
if (rc != OPAL_BUSY)
break;
msleep(OPAL_BUSY_DELAY_MS);
@@ -293,7 +293,7 @@ u32 xive_native_alloc_irq(void)
return 0;
return rc;
 }
-EXPORT_SYMBOL_GPL(xive_native_alloc_irq);
+EXPORT_SYMBOL_GPL(xive_native_alloc_irq_on_chip);
 
 void xive_native_free_irq(u32 irq)
 {
-- 
1.8.3.1





[PATCH v9 00/13] powerpc/vas: Page fault handling for user space NX requests

2020-04-01 Thread Haren Myneni


On power9, the Virtual Accelerator Switchboard (VAS) allows user space or
the kernel to communicate with the Nest Accelerator (NX) directly using
COPY/PASTE instructions. NX provides various functionalities such as
compression, encryption, etc., but only compression (842 and GZIP formats)
is supported in the Linux kernel on power9.

842 compression driver (drivers/crypto/nx/nx-842-powernv.c)
is already included in Linux. Only GZIP support will be available from
user space.

Applications can issue GZIP compression / decompression requests to NX with
COPY/PASTE instructions. While NX is processing these requests, it can hit
a fault on the request buffer (page not in memory). It then issues an
interrupt and pastes the fault CRB in the fault FIFO. It expects the kernel
to handle this fault and return credits for both the send and fault windows
after processing.

This patch series adds IRQ and fault window setup, and NX fault handling:
- Alloc IRQ and trigger port address, and configure IRQ per VAS instance.
- Set port# for each window to generate an interrupt when noticed fault.
- Set fault window and FIFO on which NX paste fault CRB.
- Setup IRQ thread fault handler per VAS instance.
- When receiving an interrupt, Read CRBs from fault FIFO and update
  coprocessor_status_block (CSB) in the corresponding CRB with translation
  failure (CSB_CC_TRANSLATION). After issuing NX requests, process polls
  on CSB address. When it sees translation error, can touch the request
  buffer to bring the page in to memory and reissue NX request.
- If copy_to_user fails on user space CSB address, OS sends SEGV signal.

Tested these patches with NX-GZIP enable patches and posted them as separate
patch series.

Patch 1: Define alloc IRQ per chip which is needed to alloc IRQ per VAS
   instance.
Patch 2: Define nx_fault_stamp on which NX writes fault status for the fault
 CRB
Patch 3: Alloc and setup IRQ and trigger port address for each VAS instance
Patches 4 & 5: Setup fault window and register NX per each VAS instance. This
 window is used for NX to paste fault CRB in FIFO.
Patch 6: Take references to pid and mm so that the pid is not reused until
 the window is closed. Needed for multi-thread applications where a
 child can open a window that is later used by the parent.
Patch 7: Setup threaded IRQ handler per VAS
Patch 8: Process CRBs from fault FIFO and notify tasks by updating CSB or
 through signals.
Patches 9 & 11: Return credits for send and fault windows after handling
faults.
Patches 10 & 13: Dump FIFO / CRB data and messages for error conditions
Patch 14:Fix closing send window after all credits are returned. This issue
 happens only for user space requests. No page faults on kernel
 request buffer.

Changelog:

V2:
  - Use threaded IRQ instead of own kernel thread handler
  - Use pswid instead of user space CSB address to find valid CRB
  - Removed unused macros and other changes as suggested by Christoph Hellwig

V3:
  - Rebased to 5.5-rc2
  - Use struct pid * instead of pid_t for vas_window tgid
  - Code cleanup as suggested by Christoph Hellwig

V4:
  - Define xive alloc and get IRQ info based on chip ID and use these
functions for IRQ setup per VAS instance. It eliminates skiboot
dependency as suggested by Oliver.

V5:
  - Do not update CSB if the process is exiting (patch8)

V6:
  - Add interrupt handler instead of default one and return IRQ_HANDLED
if the fault handling thread is already in progress. (Patch7)
  - Use platform send window ID and CCW[0] bit to find valid CRB in
fault FIFO (Patch7).
  - Return fault address to user space in BE and other changes as
suggested by Michael Neuling. (patch8)
  - Rebased to 5.6-rc4

V7:
  - Fixed sparse warnings (patches 4, 9 and 10)

V8:
  - Moved mm_context_remove_copro() before mmdrop() (patch6)
  - Moved barrier before csb.flags store and add WARN_ON_ONCE() checks (patch8)

V9:
  - Rebased to 5.6
  - Changes based on Cedric's comments
- Removed "Define xive_native_alloc_get_irq_info()" patch and used
  irq_get_handler_data() (patch3)
  - Changes based on comments from Nicholas Piggin
- Moved "Taking PID reference" patch before setting VAS fault handler
  patch
- Removed mutex_lock/unlock (patch7)
- Other cleanup changes


Haren Myneni (13):
  powerpc/xive: Define xive_native_alloc_irq_on_chip()
  powerpc/vas: Define nx_fault_stamp in coprocessor_request_block
  powerpc/vas: Alloc and setup IRQ and trigger port address
  powerpc/vas: Setup fault window per VAS instance
  powerpc/vas: Register NX with fault window ID and IRQ port value
  powerpc/vas: Take reference to PID and mm for user space windows
  powerpc/vas: Setup thread IRQ handler per VAS instance
  powerpc/vas: Update CSB and notify process for fault CRBs
  powerpc/vas: Return credits after handling fault
  powerpc/vas: Print CRB and FIFO values
  powerpc/vas: Do not use default credits for receive window
  powerpc/vas: Display 

Re: [PATCH 0/2] powerpc: Remove support for ppc405/440 Xilinx platforms

2020-04-01 Thread Arnd Bergmann
On Tue, Mar 31, 2020 at 7:51 PM Segher Boessenkool
 wrote:
>
> On Tue, Mar 31, 2020 at 08:56:23AM +0200, Christophe Leroy wrote:
> > While we are at it, can we also remove the 601 ? This one is also full
> > of workarounds and diverges a bit from other 6xx.
> >
> > I'm unable to find its end of life date, but it was on the market in
> > 1994, so I guess it must be outdated by more than 10-15 yr old now ?
>
> There probably are still some people running Linux on 601 powermacs.

It could be marked as "BROKEN" for a year to find out for sure ;-)

Apparently there were only two or three models that are old enough to
have a 601 and new enough to run Linux with PCI and OF: 7200/8200
and 7500. These were sold for less than 18 months around 1996,
though one can still find them on eBay.

   Arnd


[PATCH v8 7/7] perf/tools/pmu-events/powerpc: Add hv_24x7 socket/chip level metric events

2020-04-01 Thread Kajol Jain
The hv_24x7 feature in IBM® POWER9™ processor-based servers provides the
facility to continuously collect large numbers of hardware performance
metrics efficiently and accurately.
This patch adds an hv_24x7 metric file for different socket/chip
resources.

Result:

power9 platform:

command:# ./perf stat --metric-only -M Memory_RD_BW_Chip -C 0 -I 1000

 1.96188  0.9  0.3
 2.000285720  0.5  0.1
 3.000424990  0.4  0.1

command:# ./perf stat --metric-only -M PowerBUS_Frequency -C 0 -I 1000

 1.979812.32.3
 2.0002917132.32.3
 3.0004217192.32.3
 4.0005509122.32.3

Signed-off-by: Kajol Jain 
---
 .../arch/powerpc/power9/nest_metrics.json | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json

diff --git a/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json 
b/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json
new file mode 100644
index ..c121e526442a
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json
@@ -0,0 +1,19 @@
+[
+{
+"MetricExpr": "(hv_24x7@PM_MCS01_128B_RD_DISP_PORT01\\,chip\\=?@ + 
hv_24x7@PM_MCS01_128B_RD_DISP_PORT23\\,chip\\=?@ + 
hv_24x7@PM_MCS23_128B_RD_DISP_PORT01\\,chip\\=?@ + 
hv_24x7@PM_MCS23_128B_RD_DISP_PORT23\\,chip\\=?@)",
+"MetricName": "Memory_RD_BW_Chip",
+"MetricGroup": "Memory_BW",
+"ScaleUnit": "1.6e-2MB"
+},
+{
+   "MetricExpr": "(hv_24x7@PM_MCS01_128B_WR_DISP_PORT01\\,chip\\=?@ + 
hv_24x7@PM_MCS01_128B_WR_DISP_PORT23\\,chip\\=?@ + 
hv_24x7@PM_MCS23_128B_WR_DISP_PORT01\\,chip\\=?@ + 
hv_24x7@PM_MCS23_128B_WR_DISP_PORT23\\,chip\\=?@ )",
+"MetricName": "Memory_WR_BW_Chip",
+"MetricGroup": "Memory_BW",
+"ScaleUnit": "1.6e-2MB"
+},
+{
+   "MetricExpr": "(hv_24x7@PM_PB_CYC\\,chip\\=?@ )",
+"MetricName": "PowerBUS_Frequency",
+"ScaleUnit": "2.5e-7GHz"
+}
+]
-- 
2.21.0



[PATCH v8 6/7] tools/perf: Enable Hz/hz printing for --metric-only option

2020-04-01 Thread Kajol Jain
Commit 54b5091606c18 ("perf stat: Implement --metric-only mode")
added function 'valid_only_metric()' which drops "Hz" or "hz",
if it is part of "ScaleUnit". This patch enables it since hv_24x7
supports a couple of frequency events.

Signed-off-by: Kajol Jain 
---
 tools/perf/util/stat-display.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 9e757d18d713..679aaa655824 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -237,8 +237,6 @@ static bool valid_only_metric(const char *unit)
if (!unit)
return false;
if (strstr(unit, "/sec") ||
-   strstr(unit, "hz") ||
-   strstr(unit, "Hz") ||
strstr(unit, "CPUs utilized"))
return false;
return true;
-- 
2.21.0



[PATCH v8 5/7] perf/tests/expr: Added test for runtime param in metric expression

2020-04-01 Thread Kajol Jain
Added test case for parsing  "?" in metric expression.

Signed-off-by: Kajol Jain 
---
 tools/perf/tests/expr.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c
index 516504cf0ea5..f9e8e5628836 100644
--- a/tools/perf/tests/expr.c
+++ b/tools/perf/tests/expr.c
@@ -59,6 +59,14 @@ int test__expr(struct test *t __maybe_unused, int subtest 
__maybe_unused)
TEST_ASSERT_VAL("find other", !strcmp(other[2], "BOZO"));
TEST_ASSERT_VAL("find other", other[3] == NULL);
 
+   TEST_ASSERT_VAL("find other",
+   expr__find_other("EVENT1\\,param\\=?@ + 
EVENT2\\,param\\=?@", NULL,
+  , _other, 3) == 0);
+   TEST_ASSERT_VAL("find other", num_other == 2);
+   TEST_ASSERT_VAL("find other", !strcmp(other[0], "EVENT1,param=3/"));
+   TEST_ASSERT_VAL("find other", !strcmp(other[1], "EVENT2,param=3/"));
+   TEST_ASSERT_VAL("find other", other[2] == NULL);
+
for (i = 0; i < num_other; i++)
zfree([i]);
free((void *)other);
-- 
2.21.0



[PATCH v8 4/7] perf/tools: Enhance JSON/metric infrastructure to handle "?"

2020-04-01 Thread Kajol Jain
This patch enhances the current metric infrastructure to handle "?" in a
metric expression. The "?" can be used for parameters whose value is not
known when the metric events are created and which can be replaced with
the proper value later, at runtime. It also adds the flexibility to create
multiple events out of a single metric event added in the json file.

The patch adds the function 'arch_get_runtimeparam', an arch-specific
function that returns the number of metric events that need to be created.
By default it returns 1.

This infrastructure is needed for hv_24x7 socket/chip level events.
"hv_24x7" chip level events need the specific chip-id for which the
data is requested. The function 'arch_get_runtimeparam' is implemented
in header.c and reads the number of sockets from the sysfs file
"sockets" under "/sys/devices/hv_24x7/interface/".

With this patch we basically create as many metric events
as defined by runtime_param.

For that, a loop is added in the function 'metricgroup__add_metric',
which creates multiple events at run time depending on the return value
of 'arch_get_runtimeparam' and merges those events into 'group_list'.

To achieve that, we pass this parameter value to the
`expr__find_other` function and replace the "?" present in the metric
expression with this value.

Our json file is going to contain a single metric event, out of
which we create multiple events.

To understand which data count belongs to which parameter value,
we also print the param value in the generic_metric function.

For example,
command:# ./perf stat  -M PowerBUS_Frequency -C 0 -I 1000
 1.000101867  9,356,933  hv_24x7/pm_pb_cyc,chip=0/ #  2.3 
GHz  PowerBUS_Frequency_0
 1.000101867  9,366,134  hv_24x7/pm_pb_cyc,chip=1/ #  2.3 
GHz  PowerBUS_Frequency_1
 2.000314878  9,365,868  hv_24x7/pm_pb_cyc,chip=0/ #  2.3 
GHz  PowerBUS_Frequency_0
 2.000314878  9,366,092  hv_24x7/pm_pb_cyc,chip=1/ #  2.3 
GHz  PowerBUS_Frequency_1

So, here _0 and _1 after PowerBUS_Frequency specify parameter value.

Signed-off-by: Kajol Jain 
---
 tools/perf/arch/powerpc/util/header.c |  8 
 tools/perf/tests/expr.c   |  8 
 tools/perf/util/expr.c| 11 ++-
 tools/perf/util/expr.h|  5 +++--
 tools/perf/util/expr.l| 27 +++---
 tools/perf/util/metricgroup.c | 28 ---
 tools/perf/util/metricgroup.h |  2 ++
 tools/perf/util/stat-shadow.c | 17 ++--
 8 files changed, 79 insertions(+), 27 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/header.c 
b/tools/perf/arch/powerpc/util/header.c
index 3b4cdfc5efd6..d4870074f14c 100644
--- a/tools/perf/arch/powerpc/util/header.c
+++ b/tools/perf/arch/powerpc/util/header.c
@@ -7,6 +7,8 @@
 #include 
 #include 
 #include "header.h"
+#include "metricgroup.h"
+#include 
 
 #define mfspr(rn)   ({unsigned long rval; \
 asm volatile("mfspr %0," __stringify(rn) \
@@ -44,3 +46,9 @@ get_cpuid_str(struct perf_pmu *pmu __maybe_unused)
 
return bufp;
 }
+
+int arch_get_runtimeparam(void)
+{
+   int count;
+   return sysfs__read_int("/devices/hv_24x7/interface/sockets", ) < 
0 ? 1 : count;
+}
diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c
index ea10fc4412c4..516504cf0ea5 100644
--- a/tools/perf/tests/expr.c
+++ b/tools/perf/tests/expr.c
@@ -10,7 +10,7 @@ static int test(struct expr_parse_ctx *ctx, const char *e, 
double val2)
 {
double val;
 
-   if (expr__parse(, ctx, e))
+   if (expr__parse(, ctx, e, 1))
TEST_ASSERT_VAL("parse test failed", 0);
TEST_ASSERT_VAL("unexpected value", val == val2);
return 0;
@@ -44,15 +44,15 @@ int test__expr(struct test *t __maybe_unused, int subtest 
__maybe_unused)
return ret;
 
p = "FOO/0";
-   ret = expr__parse(, , p);
+   ret = expr__parse(, , p, 1);
TEST_ASSERT_VAL("division by zero", ret == -1);
 
p = "BAR/";
-   ret = expr__parse(, , p);
+   ret = expr__parse(, , p, 1);
TEST_ASSERT_VAL("missing operand", ret == -1);
 
TEST_ASSERT_VAL("find other",
-   expr__find_other("FOO + BAR + BAZ + BOZO", "FOO", 
, _other) == 0);
+   expr__find_other("FOO + BAR + BAZ + BOZO", "FOO", 
, _other, 1) == 0);
TEST_ASSERT_VAL("find other", num_other == 3);
TEST_ASSERT_VAL("find other", !strcmp(other[0], "BAR"));
TEST_ASSERT_VAL("find other", !strcmp(other[1], "BAZ"));
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index c3382d58cf40..aa631e37ad1e 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -27,10 +27,11 @@ void expr__ctx_init(struct expr_parse_ctx *ctx)
 
 static int
 __expr__parse(double *val, struct expr_parse_ctx *ctx, const char *expr,
- int start)
+ int start, int runtime)
 {
struct 

[PATCH v8 3/7] perf/tools: Refactoring metricgroup__add_metric function

2020-04-01 Thread Kajol Jain
This patch refactors the metricgroup__add_metric function, moving
part of it into the new function __metricgroup__add_metric.
No logic change.

Signed-off-by: Kajol Jain 
---
 tools/perf/util/metricgroup.c | 60 ---
 1 file changed, 35 insertions(+), 25 deletions(-)

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 926449a7cdbf..7ad81c8177ea 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -485,6 +485,39 @@ static bool metricgroup__has_constraint(struct pmu_event 
*pe)
return false;
 }
 
+static int __metricgroup__add_metric(struct strbuf *events,
+   struct list_head *group_list, struct pmu_event *pe)
+{
+
+   const char **ids;
+   int idnum;
+   struct egroup *eg;
+
+   if (expr__find_other(pe->metric_expr, NULL, , ) < 0)
+   return -EINVAL;
+
+   if (events->len > 0)
+   strbuf_addf(events, ",");
+
+   if (metricgroup__has_constraint(pe))
+   metricgroup__add_metric_non_group(events, ids, idnum);
+   else
+   metricgroup__add_metric_weak_group(events, ids, idnum);
+
+   eg = malloc(sizeof(*eg));
+   if (!eg)
+   return -ENOMEM;
+
+   eg->ids = ids;
+   eg->idnum = idnum;
+   eg->metric_name = pe->metric_name;
+   eg->metric_expr = pe->metric_expr;
+   eg->metric_unit = pe->unit;
+   list_add_tail(>nd, group_list);
+
+   return 0;
+}
+
 static int metricgroup__add_metric(const char *metric, struct strbuf *events,
   struct list_head *group_list)
 {
@@ -504,35 +537,12 @@ static int metricgroup__add_metric(const char *metric, 
struct strbuf *events,
continue;
if (match_metric(pe->metric_group, metric) ||
match_metric(pe->metric_name, metric)) {
-   const char **ids;
-   int idnum;
-   struct egroup *eg;
 
pr_debug("metric expr %s for %s\n", pe->metric_expr, 
pe->metric_name);
 
-   if (expr__find_other(pe->metric_expr,
-NULL, , ) < 0)
-   continue;
-   if (events->len > 0)
-   strbuf_addf(events, ",");
-
-   if (metricgroup__has_constraint(pe))
-   metricgroup__add_metric_non_group(events, ids, 
idnum);
-   else
-   metricgroup__add_metric_weak_group(events, ids, 
idnum);
-
-   eg = malloc(sizeof(struct egroup));
-   if (!eg) {
-   ret = -ENOMEM;
+   ret = __metricgroup__add_metric(events, group_list, pe);
+   if (ret == -ENOMEM)
break;
-   }
-   eg->ids = ids;
-   eg->idnum = idnum;
-   eg->metric_name = pe->metric_name;
-   eg->metric_expr = pe->metric_expr;
-   eg->metric_unit = pe->unit;
-   list_add_tail(>nd, group_list);
-   ret = 0;
}
}
return ret;
-- 
2.21.0



[PATCH v8 2/7] perf expr: Add expr_scanner_ctx object

2020-04-01 Thread Kajol Jain
From: Jiri Olsa 

Adding expr_scanner_ctx object to hold user data
for the expr scanner. Currently it holds only
start_token, Kajol Jain will use it to hold 24x7
runtime param.

Signed-off-by: Jiri Olsa 
---
 tools/perf/util/expr.c |  6 --
 tools/perf/util/expr.h |  4 
 tools/perf/util/expr.l | 10 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index c8ccc548a585..c3382d58cf40 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -3,7 +3,6 @@
 #include 
 #include "expr.h"
 #include "expr-bison.h"
-#define YY_EXTRA_TYPE int
 #include "expr-flex.h"
 
 #ifdef PARSER_DEBUG
@@ -30,11 +29,14 @@ static int
 __expr__parse(double *val, struct expr_parse_ctx *ctx, const char *expr,
  int start)
 {
+   struct expr_scanner_ctx scanner_ctx = {
+   .start_token = start,
+   };
YY_BUFFER_STATE buffer;
void *scanner;
int ret;
 
-   ret = expr_lex_init_extra(start, );
+   ret = expr_lex_init_extra(_ctx, );
if (ret)
return ret;
 
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
index b9e53f2b5844..0938ad166ece 100644
--- a/tools/perf/util/expr.h
+++ b/tools/perf/util/expr.h
@@ -15,6 +15,10 @@ struct expr_parse_ctx {
struct expr_parse_id ids[MAX_PARSE_ID];
 };
 
+struct expr_scanner_ctx {
+   int start_token;
+};
+
 void expr__ctx_init(struct expr_parse_ctx *ctx);
 void expr__add_id(struct expr_parse_ctx *ctx, const char *id, double val);
 int expr__parse(double *final_val, struct expr_parse_ctx *ctx, const char 
*expr);
diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l
index eaad29243c23..2582c2464938 100644
--- a/tools/perf/util/expr.l
+++ b/tools/perf/util/expr.l
@@ -76,13 +76,13 @@ sym [0-9a-zA-Z_\.:@]+
 symbol {spec}*{sym}*{spec}*{sym}*
 
 %%
-   {
-   int start_token;
+   struct expr_scanner_ctx *sctx = expr_get_extra(yyscanner);
 
-   start_token = expr_get_extra(yyscanner);
+   {
+   int start_token = sctx->start_token;
 
-   if (start_token) {
-   expr_set_extra(NULL, yyscanner);
+   if (sctx->start_token) {
+   sctx->start_token = 0;
return start_token;
}
}
-- 
2.21.0



[PATCH v8 1/7] perf expr: Add expr_ prefix for parse_ctx and parse_id

2020-04-01 Thread Kajol Jain
From: Jiri Olsa 

Adding expr_ prefix for parse_ctx and parse_id,
to straighten out the expr* namespace.

There's no functional change.

Signed-off-by: Jiri Olsa 
---
 tools/perf/tests/expr.c   |  4 ++--
 tools/perf/util/expr.c| 10 +-
 tools/perf/util/expr.h| 12 ++--
 tools/perf/util/expr.y|  6 +++---
 tools/perf/util/stat-shadow.c |  2 +-
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c
index 28313e59d6f6..ea10fc4412c4 100644
--- a/tools/perf/tests/expr.c
+++ b/tools/perf/tests/expr.c
@@ -6,7 +6,7 @@
 #include 
 #include 
 
-static int test(struct parse_ctx *ctx, const char *e, double val2)
+static int test(struct expr_parse_ctx *ctx, const char *e, double val2)
 {
double val;
 
@@ -22,7 +22,7 @@ int test__expr(struct test *t __maybe_unused, int subtest 
__maybe_unused)
const char **other;
double val;
int i, ret;
-   struct parse_ctx ctx;
+   struct expr_parse_ctx ctx;
int num_other;
 
expr__ctx_init();
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index fd192ddf93c1..c8ccc548a585 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -11,7 +11,7 @@ extern int expr_debug;
 #endif
 
 /* Caller must make sure id is allocated */
-void expr__add_id(struct parse_ctx *ctx, const char *name, double val)
+void expr__add_id(struct expr_parse_ctx *ctx, const char *name, double val)
 {
int idx;
 
@@ -21,13 +21,13 @@ void expr__add_id(struct parse_ctx *ctx, const char *name, 
double val)
ctx->ids[idx].val = val;
 }
 
-void expr__ctx_init(struct parse_ctx *ctx)
+void expr__ctx_init(struct expr_parse_ctx *ctx)
 {
ctx->num_ids = 0;
 }
 
 static int
-__expr__parse(double *val, struct parse_ctx *ctx, const char *expr,
+__expr__parse(double *val, struct expr_parse_ctx *ctx, const char *expr,
  int start)
 {
YY_BUFFER_STATE buffer;
@@ -52,7 +52,7 @@ __expr__parse(double *val, struct parse_ctx *ctx, const char 
*expr,
return ret;
 }
 
-int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr)
+int expr__parse(double *final_val, struct expr_parse_ctx *ctx, const char 
*expr)
 {
return __expr__parse(final_val, ctx, expr, EXPR_PARSE) ? -1 : 0;
 }
@@ -75,7 +75,7 @@ int expr__find_other(const char *expr, const char *one, const 
char ***other,
 int *num_other)
 {
int err, i = 0, j = 0;
-   struct parse_ctx ctx;
+   struct expr_parse_ctx ctx;
 
expr__ctx_init();
err = __expr__parse(NULL, , expr, EXPR_OTHER);
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
index 9377538f4097..b9e53f2b5844 100644
--- a/tools/perf/util/expr.h
+++ b/tools/perf/util/expr.h
@@ -5,19 +5,19 @@
 #define EXPR_MAX_OTHER 20
 #define MAX_PARSE_ID EXPR_MAX_OTHER
 
-struct parse_id {
+struct expr_parse_id {
const char *name;
double val;
 };
 
-struct parse_ctx {
+struct expr_parse_ctx {
int num_ids;
-   struct parse_id ids[MAX_PARSE_ID];
+   struct expr_parse_id ids[MAX_PARSE_ID];
 };
 
-void expr__ctx_init(struct parse_ctx *ctx);
-void expr__add_id(struct parse_ctx *ctx, const char *id, double val);
-int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr);
+void expr__ctx_init(struct expr_parse_ctx *ctx);
+void expr__add_id(struct expr_parse_ctx *ctx, const char *id, double val);
+int expr__parse(double *final_val, struct expr_parse_ctx *ctx, const char 
*expr);
 int expr__find_other(const char *expr, const char *one, const char ***other,
int *num_other);
 
diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y
index 4720cbe79357..cd17486c1c5d 100644
--- a/tools/perf/util/expr.y
+++ b/tools/perf/util/expr.y
@@ -15,7 +15,7 @@
 %define api.pure full
 
 %parse-param { double *final_val }
-%parse-param { struct parse_ctx *ctx }
+%parse-param { struct expr_parse_ctx *ctx }
 %parse-param {void *scanner}
 %lex-param {void* scanner}
 
@@ -39,14 +39,14 @@
 
 %{
 static void expr_error(double *final_val __maybe_unused,
-  struct parse_ctx *ctx __maybe_unused,
+  struct expr_parse_ctx *ctx __maybe_unused,
   void *scanner,
   const char *s)
 {
pr_debug("%s\n", s);
 }
 
-static int lookup_id(struct parse_ctx *ctx, char *id, double *val)
+static int lookup_id(struct expr_parse_ctx *ctx, char *id, double *val)
 {
int i;
 
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 0fd713d3674f..402af3e8d287 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -729,7 +729,7 @@ static void generic_metric(struct perf_stat_config *config,
   struct runtime_stat *st)
 {
print_metric_t print_metric = out->print_metric;
-   struct parse_ctx pctx;
+   struct expr_parse_ctx pctx;
double 

[PATCH v8 0/7] powerpc/perf: Add json file metric support for the hv_24x7 socket/chip level events

2020-04-01 Thread Kajol Jain
This patchset adds json file metric support for the hv_24x7 socket/chip
level events. "hv_24x7" pmu interface events need system-dependent
parameters like socket/chip/core. For example, hv_24x7 chip level events
need the specific chip-id for which the data is requested to be added as
part of the pmu event.

So, to enable JSON file support for the "hv_24x7" interface, the patchset
reads the total number of sockets from sysfs under
"/sys/devices/hv_24x7/interface/".

Second patch of the patchset adds expr_scanner_ctx object to hold user
data for the expr scanner, which can be used to hold runtime parameter.

Patches 4 & 7 of the patchset handle the perf tool plumbing needed to
replace the "?" character in the metric expression with the proper value,
and add the hv_24x7 json metric file for different socket/chip resources.

The patch set also enables Hz/hz printing for the --metric-only option,
to print metric data for bus frequency.

Applied and tested all these patches cleanly on top of jiri's flex changes
with the changes done by Kan Liang for "Support metric group constraint"
patchset and made required changes.

Also apply this patch on top of the fix patch sent earlier
for printing the metric name in case of overlapping events.
https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/commit/?h=perf/core=37cd7f65bf71a48f25eeb6d9be5dacb20d008ea6

Changelog:
v7 -> v8
- Add test case for testing parsing of "?" in metric expression
- Rename variables to runtime

v6 -> v7
- Split patchset into two patch series, one for kernel changes and the
  other for tool side changes.
- Made changes suggested by Jiri, including: rather than reading the
  runtime parameter from the metric name, actually add it in structure
  egroup and metric_expr.
- As we don't need to read runtime parameter from metric name,
  now I am not appending it and rather just printing it in
  generic_metric function.

Kernel Side changes patch series: https://lkml.org/lkml/2020/3/27/58

v5 -> v6
- resolve compilation issue due to rearranging patch series.
- Rather than adding a new function to take care of the runtime param
  case in metricgroup__add_metric, use metricgroup__add_metric_param
  itself for that work.
- Address some suggested optimizations, like using the file path directly
  rather than adding a new macro in header.c
- Change commit message on patch where we are adding "?" support
  by adding simple example.

v4 -> v5
- Using sysfs__read_int instead of sysfs__read_ull while reading
  parameter value in powerpc/util/header.c file.

- Using asprintf rather than malloc and sprintf
  Suggested by Arnaldo Carvalho de Melo

- Break patch 6 from previous version to two patch,
  - One to add refactor current "metricgroup__add_metric" function
and another where actually "?" handling infra added.

- Add expr__runtimeparam as part of 'expr_scanner_ctx' struct
  rather then making it global variable. Thanks Jiri for
  adding this structure to hold user data for the expr scanner.

- Add runtime param as an argument to the functions 'expr__find_other'
  and 'expr__parse' and update references accordingly.

v3 -> v4
- Apply these patch on top of Kan liang changes.
  As suggested by Jiri.

v2 -> v3
- Remove the part that sets event_count to 0 in function
  'h_24x7_event_read', with a comment, rather than adding 0 to the
  event_count value.
  Suggested by: Sukadev Bhattiprolu

- Apply tool side changes require to replace "?" on Jiri's flex patch
  series and made all require changes to make it compatible with added
  flex change.

v1 -> v2
- Rename hv-24x7 metric json file as nest_metrics.json

Jiri Olsa (2):
  perf expr: Add expr_ prefix for parse_ctx and parse_id
  perf expr: Add expr_scanner_ctx object

Kajol Jain (5):
  perf/tools: Refactoring metricgroup__add_metric function
  perf/tools: Enhance JSON/metric infrastructure to handle "?"
  perf/tests/expr: Added test for runtime param in metric expression
  tools/perf: Enable Hz/hz printing for --metric-only option
  perf/tools/pmu-events/powerpc: Add hv_24x7 socket/chip level metric
events

 tools/perf/arch/powerpc/util/header.c |  8 ++
 .../arch/powerpc/power9/nest_metrics.json | 19 +
 tools/perf/tests/expr.c   | 20 +++--
 tools/perf/util/expr.c| 25 +++---
 tools/perf/util/expr.h| 19 +++--
 tools/perf/util/expr.l| 37 ++---
 tools/perf/util/expr.y|  6 +-
 tools/perf/util/metricgroup.c | 78 +--
 tools/perf/util/metricgroup.h |  2 +
 tools/perf/util/stat-display.c|  2 -
 tools/perf/util/stat-shadow.c | 19 +++--
 11 files changed, 164 insertions(+), 71 deletions(-)
 create mode 100644 tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json

-- 
2.21.0



Re: [PATCH v4 12/25] nvdimm/ocxl: Add register addresses & status values to the header

2020-04-01 Thread Dan Williams
On Sun, Mar 29, 2020 at 10:53 PM Alastair D'Silva  wrote:
>
> These values have been taken from the device specifications.

Link to specification?


Re: [PATCH v4 11/25] powerpc: Enable the OpenCAPI Persistent Memory driver for powernv_defconfig

2020-04-01 Thread Dan Williams
On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva  wrote:
>
> This patch enables the OpenCAPI Persistent Memory driver, as well
> as DAX support, for the 'powernv' defconfig.
>
> DAX is not a strict requirement for the functioning of the driver, but it
> is likely that a user will want to create a DAX device on top of their
> persistent memory device.
>
> Signed-off-by: Alastair D'Silva 
> Reviewed-by: Andrew Donnellan 
> ---
>  arch/powerpc/configs/powernv_defconfig | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/arch/powerpc/configs/powernv_defconfig 
> b/arch/powerpc/configs/powernv_defconfig
> index 71749377d164..921d77bbd3d2 100644
> --- a/arch/powerpc/configs/powernv_defconfig
> +++ b/arch/powerpc/configs/powernv_defconfig
> @@ -348,3 +348,8 @@ CONFIG_KVM_BOOK3S_64=m
>  CONFIG_KVM_BOOK3S_64_HV=m
>  CONFIG_VHOST_NET=m
>  CONFIG_PRINTK_TIME=y
> +CONFIG_ZONE_DEVICE=y
> +CONFIG_OCXL_PMEM=m
> +CONFIG_DEV_DAX=m
> +CONFIG_DEV_DAX_PMEM=m
> +CONFIG_FS_DAX=y

These options have dependencies. I think it would be better to implement
a top-level configuration question called something like
PERSISTENT_MEMORY_ALL that goes and selects all the bus providers and
infrastructure and lets other defaults follow along. For example,
CONFIG_DEV_DAX could grow a "default LIBNVDIMM" and then
CONFIG_DEV_DAX_PMEM would default on as well. If
CONFIG_PERSISTENT_MEMORY_ALL selected all the bus providers and
ZONE_DEVICE then the Kconfig system could prompt you to where the
dependencies are not satisfied.


Re: [PATCH v4 10/25] nvdimm: Add driver for OpenCAPI Persistent Memory

2020-04-01 Thread Dan Williams
On Wed, Apr 1, 2020 at 1:49 AM Dan Williams  wrote:
>
> On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva  
> wrote:
> >
> > This driver exposes LPC memory on OpenCAPI pmem cards
> > as an NVDIMM, allowing the existing nvram infrastructure
> > to be used.
> >
> > Namespace metadata is stored on the media itself, so
> > scm_reserve_metadata() maps 1 section's worth of PMEM storage
> > at the start to hold this. The rest of the PMEM range is registered
> > with libnvdimm as an nvdimm. ndctl_config_read/write/size() provide
> > callbacks to libnvdimm to access the metadata.
> >
> > Signed-off-by: Alastair D'Silva 
> > ---
> >  drivers/nvdimm/Kconfig |   2 +
> >  drivers/nvdimm/Makefile|   1 +
> >  drivers/nvdimm/ocxl/Kconfig|  15 ++
> >  drivers/nvdimm/ocxl/Makefile   |   7 +
> >  drivers/nvdimm/ocxl/main.c | 476 +
> >  drivers/nvdimm/ocxl/ocxlpmem.h |  23 ++
> >  6 files changed, 524 insertions(+)
> >  create mode 100644 drivers/nvdimm/ocxl/Kconfig
> >  create mode 100644 drivers/nvdimm/ocxl/Makefile
> >  create mode 100644 drivers/nvdimm/ocxl/main.c
> >  create mode 100644 drivers/nvdimm/ocxl/ocxlpmem.h
> >
> > diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
> > index b7d1eb38b27d..368328637182 100644
> > --- a/drivers/nvdimm/Kconfig
> > +++ b/drivers/nvdimm/Kconfig
> > @@ -131,4 +131,6 @@ config NVDIMM_TEST_BUILD
> >   core devm_memremap_pages() implementation and other
> >   infrastructure.
> >
> > +source "drivers/nvdimm/ocxl/Kconfig"
> > +
> >  endif
> > diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
> > index 29203f3d3069..bc02be11c794 100644
> > --- a/drivers/nvdimm/Makefile
> > +++ b/drivers/nvdimm/Makefile
> > @@ -33,3 +33,4 @@ libnvdimm-$(CONFIG_NVDIMM_KEYS) += security.o
> >  TOOLS := ../../tools
> >  TEST_SRC := $(TOOLS)/testing/nvdimm/test
> >  obj-$(CONFIG_NVDIMM_TEST_BUILD) += $(TEST_SRC)/iomap.o
> > +obj-$(CONFIG_LIBNVDIMM) += ocxl/
> > diff --git a/drivers/nvdimm/ocxl/Kconfig b/drivers/nvdimm/ocxl/Kconfig
> > new file mode 100644
> > index ..c5d927520920
> > --- /dev/null
> > +++ b/drivers/nvdimm/ocxl/Kconfig
> > @@ -0,0 +1,15 @@
> > +# SPDX-License-Identifier: GPL-2.0-only
> > +if LIBNVDIMM
> > +
> > +config OCXL_PMEM
> > +   tristate "OpenCAPI Persistent Memory"
> > +   depends on LIBNVDIMM && PPC_POWERNV && PCI && EEH && ZONE_DEVICE && 
> > OCXL
>
> Does OCXL_PMEM itself have any CONFIG_ZONE_DEVICE dependencies? That's
> more a function of CONFIG_DEV_DAX and CONFIG_FS_DAX. Doesn't OCXL
> already depend on CONFIG_PCI?
>
>
> > +   help
> > + Exposes devices that implement the OpenCAPI Storage Class Memory
> > + specification as persistent memory regions. You may also want
> > + DEV_DAX, DEV_DAX_PMEM & FS_DAX if you plan on using DAX devices
> > + stacked on top of this driver.
> > +
> > + Select N if unsure.
> > +
> > +endif
> > diff --git a/drivers/nvdimm/ocxl/Makefile b/drivers/nvdimm/ocxl/Makefile
> > new file mode 100644
> > index ..e0e8ade1987a
> > --- /dev/null
> > +++ b/drivers/nvdimm/ocxl/Makefile
> > @@ -0,0 +1,7 @@
> > +# SPDX-License-Identifier: GPL-2.0
> > +
> > +ccflags-$(CONFIG_PPC_WERROR)   += -Werror
> > +
> > +obj-$(CONFIG_OCXL_PMEM) += ocxlpmem.o
> > +
> > +ocxlpmem-y := main.o
> > \ No newline at end of file
> > diff --git a/drivers/nvdimm/ocxl/main.c b/drivers/nvdimm/ocxl/main.c
> > new file mode 100644
> > index ..c0066fedf9cc
> > --- /dev/null
> > +++ b/drivers/nvdimm/ocxl/main.c
> > @@ -0,0 +1,476 @@
> > +// SPDX-License-Identifier: GPL-2.0+
> > +// Copyright 2020 IBM Corp.
> > +
> > +/*
> > + * A driver for OpenCAPI devices that implement the Storage Class
> > + * Memory specification.
> > + */
> > +
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include "ocxlpmem.h"
> > +
> > +static const struct pci_device_id pci_tbl[] = {
> > +   { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0625), },
> > +   { }
> > +};
> > +
> > +MODULE_DEVICE_TABLE(pci, pci_tbl);
> > +
> > +#define NUM_MINORS 256 // Total to reserve
> > +
> > +static dev_t ocxlpmem_dev;
> > +static struct class *ocxlpmem_class;
> > +static struct mutex minors_idr_lock;
> > +static struct idr minors_idr;
> > +
> > +/**
> > + * ndctl_config_write() - Handle a ND_CMD_SET_CONFIG_DATA command from 
> > ndctl
> > + * @ocxlpmem: the device metadata
> > + * @command: the incoming data to write
> > + * Return: 0 on success, negative on failure
> > + */
> > +static int ndctl_config_write(struct ocxlpmem *ocxlpmem,
> > + struct nd_cmd_set_config_hdr *command)
> > +{
> > +   if (command->in_offset + command->in_length > LABEL_AREA_SIZE)
> > +   return -EINVAL;
> > +
> > +   memcpy_flushcache(ocxlpmem->metadata_addr + command->in_offset,
> > + command->in_buf, command->in_length);
> > +
> > +   return 0;
> > +}
> > +
> > +/**
> > + * 

Re: [PATCH v2 4/4] hugetlbfs: clean up command line processing

2020-04-01 Thread Randy Dunlap
On 4/1/20 11:38 AM, Mike Kravetz wrote:
> With all hugetlb page processing done in a single file clean up code.
> - Make code match desired semantics
>   - Update documentation with semantics
> - Make all warnings and errors messages start with 'HugeTLB:'.
> - Consistently name command line parsing routines.
> - Check for hugepages_supported() before processing parameters.
> - Add comments to code
>   - Describe some of the subtle interactions
>   - Describe semantics of command line arguments
> 
> Signed-off-by: Mike Kravetz 
> ---

Hi Mike,
One nit, please see below:

>  .../admin-guide/kernel-parameters.txt | 35 ---
>  Documentation/admin-guide/mm/hugetlbpage.rst  | 44 +
>  mm/hugetlb.c  | 96 +++
>  3 files changed, 142 insertions(+), 33 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index 1bd5454b5e5f..de653cfe1726 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -832,12 +832,15 @@
>   See also Documentation/networking/decnet.txt.
>  
>   default_hugepagesz=
> - [same as hugepagesz=] The size of the default
> - HugeTLB page size. This is the size represented by
> - the legacy /proc/ hugepages APIs, used for SHM, and
> - default size when mounting hugetlbfs filesystems.
> - Defaults to the default architecture's huge page size
> - if not specified.
> + [HW] The size of the default HugeTLB page size. This

Drop one "size" above?

> + is the size represented by the legacy /proc/ hugepages
> + APIs.  In addition, this is the default hugetlb size
> + used for shmget(), mmap() and mounting hugetlbfs
> + filesystems.  If not specified, defaults to the
> + architecture's default huge page size.  Huge page
> + sizes are architecture dependent.  See also
> + Documentation/admin-guide/mm/hugetlbpage.rst.
> + Format: size[KMG]



-- 
~Randy



[PATCH v2 0/4] Clean up hugetlb boot command line processing

2020-04-01 Thread Mike Kravetz
v2 -
   Fix build errors with patch 1 (Will)
   Change arch_hugetlb_valid_size arg to unsigned long and remove
 irrelevant 'extern' keyword (Christophe)
   Documentation and other misc changes (Randy, Christophe, Mina)
   Do not process command line options if !hugepages_supported()
 (Dave, but it sounds like we may want to make additional changes to
  hugepages_supported() for x86?  If that is needed I would prefer
  a separate patch.)

Longpeng(Mike) reported a weird message from hugetlb command line processing
and proposed a solution [1].  While the proposed patch does address the
specific issue, there are other related issues in command line processing.
As hugetlbfs evolved, updates to command line processing have been made to
meet immediate needs and not necessarily in a coordinated manner.  The result
is that some processing is done in arch specific code, some is done in arch
independent code and coordination is problematic.  Semantics can vary between
architectures.

The patch series does the following:
- Define arch specific arch_hugetlb_valid_size routine used to validate
  passed huge page sizes.
- Move hugepagesz= command line parsing out of arch specific code and into
  an arch independent routine.
- Clean up command line processing to follow desired semantics and
  document those semantics.

[1] https://lore.kernel.org/linux-mm/20200305033014.1152-1-longpe...@huawei.com

Mike Kravetz (4):
  hugetlbfs: add arch_hugetlb_valid_size
  hugetlbfs: move hugepagesz= parsing to arch independent code
  hugetlbfs: remove hugetlb_add_hstate() warning for existing hstate
  hugetlbfs: clean up command line processing

 .../admin-guide/kernel-parameters.txt |  35 +++--
 Documentation/admin-guide/mm/hugetlbpage.rst  |  44 ++
 arch/arm64/include/asm/hugetlb.h  |   2 +
 arch/arm64/mm/hugetlbpage.c   |  30 +---
 arch/powerpc/include/asm/hugetlb.h|   3 +
 arch/powerpc/mm/hugetlbpage.c |  30 ++--
 arch/riscv/include/asm/hugetlb.h  |   3 +
 arch/riscv/mm/hugetlbpage.c   |  24 +--
 arch/s390/include/asm/hugetlb.h   |   3 +
 arch/s390/mm/hugetlbpage.c|  24 +--
 arch/sparc/include/asm/hugetlb.h  |   3 +
 arch/sparc/mm/init_64.c   |  43 ++
 arch/x86/include/asm/hugetlb.h|   5 +
 arch/x86/mm/hugetlbpage.c |  23 +--
 include/linux/hugetlb.h   |   8 +-
 mm/hugetlb.c  | 141 ++
 16 files changed, 252 insertions(+), 169 deletions(-)

-- 
2.25.1



[PATCH v2 1/4] hugetlbfs: add arch_hugetlb_valid_size

2020-04-01 Thread Mike Kravetz
The architecture independent routine hugetlb_default_setup sets up
the default huge pages size.  It has no way to verify if the passed
value is valid, so it accepts it and attempts to validate at a later
time.  This requires undocumented cooperation between the arch specific
and arch independent code.

For architectures that support more than one huge page size, provide
a routine arch_hugetlb_valid_size to validate a huge page size.
hugetlb_default_setup can use this to validate passed values.

arch_hugetlb_valid_size will also be used in a subsequent patch to
move processing of the "hugepagesz=" in arch specific code to a common
routine in arch independent code.

Signed-off-by: Mike Kravetz 
---
 arch/arm64/include/asm/hugetlb.h   |  2 ++
 arch/arm64/mm/hugetlbpage.c| 17 +
 arch/powerpc/include/asm/hugetlb.h |  3 +++
 arch/powerpc/mm/hugetlbpage.c  | 20 +---
 arch/riscv/include/asm/hugetlb.h   |  3 +++
 arch/riscv/mm/hugetlbpage.c| 26 +-
 arch/s390/include/asm/hugetlb.h|  3 +++
 arch/s390/mm/hugetlbpage.c | 16 
 arch/sparc/include/asm/hugetlb.h   |  3 +++
 arch/sparc/mm/init_64.c| 24 
 arch/x86/include/asm/hugetlb.h |  5 +
 arch/x86/mm/hugetlbpage.c  | 17 +
 include/linux/hugetlb.h|  7 +++
 mm/hugetlb.c   | 15 ---
 14 files changed, 122 insertions(+), 39 deletions(-)

diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 2eb6c234d594..81606223494f 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -59,6 +59,8 @@ extern void huge_pte_clear(struct mm_struct *mm, unsigned 
long addr,
 extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *ptep, pte_t pte, unsigned long sz);
 #define set_huge_swap_pte_at set_huge_swap_pte_at
+bool __init arch_hugetlb_valid_size(unsigned long size);
+#define arch_hugetlb_valid_size arch_hugetlb_valid_size
 
 #include <asm-generic/hugetlb.h>
 
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index bbeb6a5a6ba6..069b96ee2aec 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -462,17 +462,26 @@ static int __init hugetlbpage_init(void)
 }
 arch_initcall(hugetlbpage_init);
 
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
 {
-   unsigned long ps = memparse(opt, &opt);
-
-   switch (ps) {
+   switch (size) {
 #ifdef CONFIG_ARM64_4K_PAGES
case PUD_SIZE:
 #endif
case CONT_PMD_SIZE:
case PMD_SIZE:
case CONT_PTE_SIZE:
+   return true;
+   }
+
+   return false;
+}
+
+static __init int setup_hugepagesz(char *opt)
+{
+   unsigned long ps = memparse(opt, &opt);
+
+   if (arch_hugetlb_valid_size(ps)) {
add_huge_page_size(ps);
return 1;
}
diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index bd6504c28c2f..19b453ee1431 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -64,6 +64,9 @@ static inline void arch_clear_hugepage_flags(struct page 
*page)
 {
 }
 
+#define arch_hugetlb_valid_size arch_hugetlb_valid_size
+bool __init arch_hugetlb_valid_size(unsigned long size);
+
 #include <asm-generic/hugetlb.h>
 
 #else /* ! CONFIG_HUGETLB_PAGE */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 33b3461d91e8..de54d2a37830 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -558,7 +558,7 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
return vma_kernel_pagesize(vma);
 }
 
-static int __init add_huge_page_size(unsigned long long size)
+bool __init arch_hugetlb_valid_size(unsigned long size)
 {
int shift = __ffs(size);
int mmu_psize;
@@ -566,20 +566,26 @@ static int __init add_huge_page_size(unsigned long long 
size)
/* Check that it is a page size supported by the hardware and
 * that it fits within pagetable and slice limits. */
if (size <= PAGE_SIZE || !is_power_of_2(size))
-   return -EINVAL;
+   return false;
 
mmu_psize = check_and_get_huge_psize(shift);
if (mmu_psize < 0)
-   return -EINVAL;
+   return false;
 
BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 
-   /* Return if huge page size has already been setup */
-   if (size_to_hstate(size))
-   return 0;
+   return true;
+}
 
-   hugetlb_add_hstate(shift - PAGE_SHIFT);
+static int __init add_huge_page_size(unsigned long long size)
+{
+   int shift = __ffs(size);
+
+   if (!arch_hugetlb_valid_size((unsigned long)size))
+   return -EINVAL;
 
+   if (!size_to_hstate(size))
+   hugetlb_add_hstate(shift - 

[PATCH v2 3/4] hugetlbfs: remove hugetlb_add_hstate() warning for existing hstate

2020-04-01 Thread Mike Kravetz
The routine hugetlb_add_hstate prints a warning if the hstate already
exists.  This was originally done as part of kernel command line
parsing.  If 'hugepagesz=' was specified more than once, the warning
pr_warn("hugepagesz= specified twice, ignoring\n");
would be printed.

Some architectures want to enable all huge page sizes.  They would
call hugetlb_add_hstate for all supported sizes.  However, this was
done after command line processing and as a result hstates could have
already been created for some sizes.  To make sure no warnings were
printed, there would often be code like:
if (!size_to_hstate(size))
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT)

The only time we want to print the warning is as the result of command
line processing.  So, remove the warning from hugetlb_add_hstate and
add it to the single arch independent routine processing "hugepagesz=".
After this, calls to size_to_hstate() in arch specific code can be
removed and hugetlb_add_hstate can be called without worrying about
warning messages.

Signed-off-by: Mike Kravetz 
---
 arch/arm64/mm/hugetlbpage.c   | 16 
 arch/powerpc/mm/hugetlbpage.c |  3 +--
 arch/riscv/mm/hugetlbpage.c   |  2 +-
 arch/sparc/mm/init_64.c   | 19 ---
 arch/x86/mm/hugetlbpage.c |  2 +-
 mm/hugetlb.c  |  9 ++---
 6 files changed, 17 insertions(+), 34 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index f706b821aba6..21fa98b51e00 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -441,22 +441,14 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma,
clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
 }
 
-static void __init add_huge_page_size(unsigned long size)
-{
-   if (size_to_hstate(size))
-   return;
-
-   hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
-}
-
 static int __init hugetlbpage_init(void)
 {
 #ifdef CONFIG_ARM64_4K_PAGES
-   add_huge_page_size(PUD_SIZE);
+   hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
 #endif
-   add_huge_page_size(CONT_PMD_SIZE);
-   add_huge_page_size(PMD_SIZE);
-   add_huge_page_size(CONT_PTE_SIZE);
+   hugetlb_add_hstate(CONT_PMD_SHIFT - PAGE_SHIFT);
+   hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+   hugetlb_add_hstate(CONT_PTE_SHIFT - PAGE_SHIFT);
 
return 0;
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 2c3fa0a7787b..4d5ed1093615 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -584,8 +584,7 @@ static int __init add_huge_page_size(unsigned long long 
size)
if (!arch_hugetlb_valid_size((unsigned long)size))
return -EINVAL;
 
-   if (!size_to_hstate(size))
-   hugetlb_add_hstate(shift - PAGE_SHIFT);
+   hugetlb_add_hstate(shift - PAGE_SHIFT);
return 0;
 }
 
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 4e5d7e9f0eef..932dadfdca54 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -26,7 +26,7 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 static __init int gigantic_pages_init(void)
 {
/* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */
-   if (IS_ENABLED(CONFIG_64BIT) && !size_to_hstate(1UL << PUD_SHIFT))
+   if (IS_ENABLED(CONFIG_64BIT))
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
return 0;
 }
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 4618f96fd30f..ae819a16d07a 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -325,23 +325,12 @@ static void __update_mmu_tsb_insert(struct mm_struct *mm, 
unsigned long tsb_inde
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
-static void __init add_huge_page_size(unsigned long size)
-{
-   unsigned int order;
-
-   if (size_to_hstate(size))
-   return;
-
-   order = ilog2(size) - PAGE_SHIFT;
-   hugetlb_add_hstate(order);
-}
-
 static int __init hugetlbpage_init(void)
 {
-   add_huge_page_size(1UL << HPAGE_64K_SHIFT);
-   add_huge_page_size(1UL << HPAGE_SHIFT);
-   add_huge_page_size(1UL << HPAGE_256MB_SHIFT);
-   add_huge_page_size(1UL << HPAGE_2GB_SHIFT);
+   hugetlb_add_hstate(HPAGE_64K_SHIFT - PAGE_SHIFT);
+   hugetlb_add_hstate(HPAGE_SHIFT - PAGE_SHIFT);
+   hugetlb_add_hstate(HPAGE_256MB_SHIFT - PAGE_SHIFT);
+   hugetlb_add_hstate(HPAGE_2GB_SHIFT - PAGE_SHIFT);
 
return 0;
 }
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 937d640a89e3..cf5781142716 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -195,7 +195,7 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 static __init int gigantic_pages_init(void)
 {
/* With compaction or CMA we can allocate gigantic pages at runtime */
-   if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << 
PUD_SHIFT))

[PATCH v2 4/4] hugetlbfs: clean up command line processing

2020-04-01 Thread Mike Kravetz
With all hugetlb page processing done in a single file, clean up the code.
- Make code match desired semantics
  - Update documentation with semantics
- Make all warning and error messages start with 'HugeTLB:'.
- Consistently name command line parsing routines.
- Check for hugepages_supported() before processing parameters.
- Add comments to code
  - Describe some of the subtle interactions
  - Describe semantics of command line arguments

Signed-off-by: Mike Kravetz 
---
 .../admin-guide/kernel-parameters.txt | 35 ---
 Documentation/admin-guide/mm/hugetlbpage.rst  | 44 +
 mm/hugetlb.c  | 96 +++
 3 files changed, 142 insertions(+), 33 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 1bd5454b5e5f..de653cfe1726 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -832,12 +832,15 @@
See also Documentation/networking/decnet.txt.
 
default_hugepagesz=
-   [same as hugepagesz=] The size of the default
-   HugeTLB page size. This is the size represented by
-   the legacy /proc/ hugepages APIs, used for SHM, and
-   default size when mounting hugetlbfs filesystems.
-   Defaults to the default architecture's huge page size
-   if not specified.
+   [HW] The size of the default HugeTLB page size. This
+   is the size represented by the legacy /proc/ hugepages
+   APIs.  In addition, this is the default hugetlb size
+   used for shmget(), mmap() and mounting hugetlbfs
+   filesystems.  If not specified, defaults to the
+   architecture's default huge page size.  Huge page
+   sizes are architecture dependent.  See also
+   Documentation/admin-guide/mm/hugetlbpage.rst.
+   Format: size[KMG]
 
deferred_probe_timeout=
[KNL] Debugging option to set a timeout in seconds for
@@ -1480,13 +1483,19 @@
If enabled, boot-time allocation of gigantic hugepages
is skipped.
 
-   hugepages=  [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
-   hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
-   On x86-64 and powerpc, this option can be specified
-   multiple times interleaved with hugepages= to reserve
-   huge pages of different sizes. Valid pages sizes on
-   x86-64 are 2M (when the CPU supports "pse") and 1G
-   (when the CPU supports the "pdpe1gb" cpuinfo flag).
+   hugepages=  [HW] Number of HugeTLB pages to allocate at boot.
+   If this follows hugepagesz (below), it specifies
+   the number of pages of hugepagesz to be allocated.
+   Format: <integer>
+   hugepagesz=
+   [HW] The size of the HugeTLB pages.  This is used in
+   conjunction with hugepages (above) to allocate huge
+   pages of a specific size at boot.  The pair
+   hugepagesz=X hugepages=Y can be specified once for
+   each supported huge page size. Huge page sizes are
+   architecture dependent.  See also
+   Documentation/admin-guide/mm/hugetlbpage.rst.
+   Format: size[KMG]
 
hung_task_panic=
[KNL] Should the hung task detector generate panics.
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst 
b/Documentation/admin-guide/mm/hugetlbpage.rst
index 1cc0bc78d10e..de340c586995 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -100,6 +100,50 @@ with a huge page size selection parameter 
"hugepagesz=<size>".  <size> must
 be specified in bytes with optional scale suffix [kKmMgG].  The default huge
 page size may be selected with the "default_hugepagesz=" boot parameter.
 
+Hugetlb boot command line parameter semantics
+hugepagesz - Specify a huge page size.  Used in conjunction with hugepages
+   parameter to preallocate a number of huge pages of the specified
+   size.  Hence, hugepagesz and hugepages are typically specified in
+   pairs such as:
+   hugepagesz=2M hugepages=512
+   hugepagesz can only be specified once on the command line for a
+   specific huge page size.  Valid huge page sizes are architecture
+   dependent.
+hugepages - Specify the number of huge pages to preallocate.  This typically
+   follows a valid hugepagesz parameter.  However, if hugepages 

[PATCH v2 2/4] hugetlbfs: move hugepagesz= parsing to arch independent code

2020-04-01 Thread Mike Kravetz
Now that architectures provide arch_hugetlb_valid_size(), parsing
of "hugepagesz=" can be done in architecture independent code.
Create a single routine to handle hugepagesz= parsing and remove
all arch specific routines.  We can also remove the interface
hugetlb_bad_size() as this is no longer used outside arch independent
code.

This also provides consistent behavior of hugetlbfs command line
options.  The hugepagesz= option should only be specified once for
a specific size, but some architectures allow multiple instances.
This appears to be more of an oversight when code was added by some
architectures to set up ALL huge pages sizes.

Signed-off-by: Mike Kravetz 
---
 arch/arm64/mm/hugetlbpage.c   | 15 ---
 arch/powerpc/mm/hugetlbpage.c | 15 ---
 arch/riscv/mm/hugetlbpage.c   | 16 
 arch/s390/mm/hugetlbpage.c| 18 --
 arch/sparc/mm/init_64.c   | 22 --
 arch/x86/mm/hugetlbpage.c | 16 
 include/linux/hugetlb.h   |  1 -
 mm/hugetlb.c  | 23 +--
 8 files changed, 17 insertions(+), 109 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 069b96ee2aec..f706b821aba6 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -476,18 +476,3 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 
return false;
 }
-
-static __init int setup_hugepagesz(char *opt)
-{
-   unsigned long ps = memparse(opt, &opt);
-
-   if (arch_hugetlb_valid_size(ps)) {
-   add_huge_page_size(ps);
-   return 1;
-   }
-
-   hugetlb_bad_size();
-   pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
-   return 0;
-}
-__setup("hugepagesz=", setup_hugepagesz);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index de54d2a37830..2c3fa0a7787b 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -589,21 +589,6 @@ static int __init add_huge_page_size(unsigned long long 
size)
return 0;
 }
 
-static int __init hugepage_setup_sz(char *str)
-{
-   unsigned long long size;
-
-   size = memparse(str, &str);
-
-   if (add_huge_page_size(size) != 0) {
-   hugetlb_bad_size();
-   pr_err("Invalid huge page size specified(%llu)\n", size);
-   }
-
-   return 1;
-}
-__setup("hugepagesz=", hugepage_setup_sz);
-
 static int __init hugetlbpage_init(void)
 {
bool configured = false;
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index da1f516bc451..4e5d7e9f0eef 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -22,22 +22,6 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
return false;
 }
 
-static __init int setup_hugepagesz(char *opt)
-{
-   unsigned long ps = memparse(opt, &opt);
-
-   if (arch_hugetlb_valid_size(ps)) {
-   hugetlb_add_hstate(ilog2(ps) - PAGE_SHIFT);
-   return 1;
-   }
-
-   hugetlb_bad_size();
-   pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20);
-   return 0;
-
-}
-__setup("hugepagesz=", setup_hugepagesz);
-
 #ifdef CONFIG_CONTIG_ALLOC
 static __init int gigantic_pages_init(void)
 {
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index ac25b207624c..242dfc0d462d 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -261,24 +261,6 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
return false;
 }
 
-static __init int setup_hugepagesz(char *opt)
-{
-   unsigned long size;
-   char *string = opt;
-
-   size = memparse(opt, &opt);
-   if (arch_hugetlb_valid_size(size)) {
-   hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
-   } else {
-   hugetlb_bad_size();
-   pr_err("hugepagesz= specifies an unsupported page size %s\n",
-   string);
-   return 0;
-   }
-   return 1;
-}
-__setup("hugepagesz=", setup_hugepagesz);
-
 static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 2bfe8e22b706..4618f96fd30f 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -397,28 +397,6 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 
return true;
 }
-
-static int __init setup_hugepagesz(char *string)
-{
-   unsigned long long hugepage_size;
-   int rc = 0;
-
-   hugepage_size = memparse(string, &string);
-
-   if (!arch_hugetlb_valid_size((unsigned long)hugepage_size)) {
-   hugetlb_bad_size();
-   pr_err("hugepagesz=%llu not supported by MMU.\n",
-   hugepage_size);
-   goto out;
-   }
-
-   

Re: [PATCHv4] powerpc/crashkernel: take "mem=" option into account

2020-04-01 Thread Hari Bathini



On 01/04/20 7:30 PM, Pingfan Liu wrote:
> "mem=" option is an easy way to put high pressure on memory during some
> test. Hence after applying the memory limit, instead of total mem, the
> actual usable memory should be considered when reserving mem for
> crashkernel. Otherwise the boot up may experience OOM issue.
> 
> E.g. it would reserve 4G prior to the change and 512M afterward, if passing
> crashkernel="2G-4G:384M,4G-16G:512M,16G-64G:1G,64G-128G:2G,128G-:4G", and
> mem=5G on a 256G machine.
> 
> This issue is powerpc specific because it puts higher priority on fadump
> and kdump reservation than on "mem=". Referring the following code:
> if (fadump_reserve_mem() == 0)
> reserve_crashkernel();
> ...
> /* Ensure that total memory size is page-aligned. */
> limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
> memblock_enforce_memory_limit(limit);
> 
> While on other arches, the effect of "mem=" takes a higher priority and pass
> through memblock_phys_mem_size() before calling reserve_crashkernel().
>> Signed-off-by: Pingfan Liu 
> To: linuxppc-dev@lists.ozlabs.org
> Cc: Hari Bathini 
> Cc: Michael Ellerman 
> Cc: ke...@lists.infradead.org
> ---
> v3 -> v4: fix total_mem_sz based on adjusted memory_limit


Thanks for the update.

Reviewed-by: Hari Bathini 



Re: [RFC PATCH 3/4] powerpc ppc-opcode: move ppc instruction encoding from test_emulate_step

2020-04-01 Thread Naveen N. Rao

Balamuruhan S wrote:

A few ppc instructions are encoded in test_emulate_step.c; consolidate them
into ppc-opcode.h and fix the redefinition errors in bpf_jit caused by this
consolidation.
Reuse the macros from ppc-opcode.h.

Signed-off-by: Balamuruhan S 
---
 arch/powerpc/include/asm/ppc-opcode.h |  34 ++
 arch/powerpc/lib/test_emulate_step.c  | 155 ++
 arch/powerpc/net/bpf_jit.h|   8 --
 arch/powerpc/net/bpf_jit32.h  |  10 +-
 arch/powerpc/net/bpf_jit64.h  |   4 +-
 arch/powerpc/net/bpf_jit_comp.c   |   2 +-
 arch/powerpc/net/bpf_jit_comp64.c |  14 +--
 7 files changed, 105 insertions(+), 122 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index ea5e0f864b20..2ed8a285f1ec 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -76,6 +76,9 @@
 #define__REGA0_R30 30
 #define__REGA0_R31 31

+#define IMM_L(i)   ((uintptr_t)(i) & 0xffff)
+#define IMM_DS(i)  ((uintptr_t)(i) & 0xfffc)
+
 /* opcode and xopcode for instructions */
 #define OP_TRAP 3
 #define OP_TRAP_64 2
@@ -614,6 +617,37 @@
___PPC_RA(vra) | \
___PPC_RB(vrb) | __PPC_RC21)

+#define PPC_ENCODE_LD(r, base, i) (PPC_INST_LD | ___PPC_RT(r) |   \
+   ___PPC_RA(base) | IMM_DS(i))
+#define PPC_ENCODE_LWZ(r, base, i)(PPC_INST_LWZ | ___PPC_RT(r) |  \
+   ___PPC_RA(base) | IMM_L(i))
+#define PPC_ENCODE_LWZX(t, a, b)  (PPC_INST_LWZX | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_ENCODE_STD(r, base, i)(PPC_INST_STD | ___PPC_RS(r) |  \
+   ___PPC_RA(base) | IMM_DS(i))
+#define PPC_ENCODE_STDCX(s, a, b) (PPC_INST_STDCX | ___PPC_RS(s) |\
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_ENCODE_LFSX(t, a, b)  (PPC_INST_LFSX | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_ENCODE_STFSX(s, a, b) (PPC_INST_STFSX | ___PPC_RS(s) |\
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_ENCODE_LFDX(t, a, b)  (PPC_INST_LFDX | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_ENCODE_STFDX(s, a, b) (PPC_INST_STFDX | ___PPC_RS(s) |\
+___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_ENCODE_LVX(t, a, b)   (PPC_INST_LVX | ___PPC_RT(t) |  \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_ENCODE_STVX(s, a, b)  (PPC_INST_STVX | ___PPC_RS(s) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_ENCODE_ADD(t, a, b)   (PPC_INST_ADD | ___PPC_RT(t) |  \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_ENCODE_ADD_DOT(t, a, b)   (PPC_INST_ADD | ___PPC_RT(t) |  \
+   ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
+#define PPC_ENCODE_ADDC(t, a, b)  (PPC_INST_ADDC | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_ENCODE_ADDC_DOT(t, a, b)  (PPC_INST_ADDC | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
+
 #define PPC_CP_ABORTstringify_in_c(.long PPC_ENCODE_CP_ABORT)
 #define PPC_COPY(a, b)  stringify_in_c(.long PPC_ENCODE_COPY(a, b))
 #define PPC_DARN(t, l)  stringify_in_c(.long PPC_ENCODE_DARN(t, l))
diff --git a/arch/powerpc/lib/test_emulate_step.c 
b/arch/powerpc/lib/test_emulate_step.c
index 53df4146dd32..45b485edfee1 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -12,49 +12,6 @@
 #include 
 #include 

-#define IMM_L(i)   ((uintptr_t)(i) & 0xffff)
-#define IMM_DS(i)  ((uintptr_t)(i) & 0xfffc)
-
-/*
- * Defined with TEST_ prefix so it does not conflict with other
- * definitions.
- */
-#define TEST_LD(r, base, i)(PPC_INST_LD | ___PPC_RT(r) |   \
-   ___PPC_RA(base) | IMM_DS(i))
-#define TEST_LWZ(r, base, i)   (PPC_INST_LWZ | ___PPC_RT(r) |  \
-   ___PPC_RA(base) | IMM_L(i))
-#define TEST_LWZX(t, a, b) (PPC_INST_LWZX | ___PPC_RT(t) | \
-   ___PPC_RA(a) | ___PPC_RB(b))
-#define TEST_STD(r, base, i)   (PPC_INST_STD | ___PPC_RS(r) |  \
-   ___PPC_RA(base) | IMM_DS(i))
-#define TEST_LDARX(t, a, b, eh)(PPC_INST_LDARX | ___PPC_RT(t) |
\
-   ___PPC_RA(a) | ___PPC_RB(b) |   \
-   __PPC_EH(eh))
-#define TEST_STDCX(s, a, b)

[PATCH 4.14 037/148] mm, slub: prevent kmalloc_node crashes and memory leaks

2020-04-01 Thread Greg Kroah-Hartman
From: Vlastimil Babka 

commit 0715e6c516f106ed553828a671d30ad9a3431536 upstream.

Sachin reports [1] a crash in SLUB __slab_alloc():

  BUG: Kernel NULL pointer dereference on read at 0x73b0
  Faulting instruction address: 0xc03d55f4
  Oops: Kernel access of bad area, sig: 11 [#1]
  LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in:
  CPU: 19 PID: 1 Comm: systemd Not tainted 5.6.0-rc2-next-20200218-autotest #1
  NIP:  c03d55f4 LR: c03d5b94 CTR: 
  REGS: c008b37836d0 TRAP: 0300   Not tainted  
(5.6.0-rc2-next-20200218-autotest)
  MSR:  80009033   CR: 24004844  XER: 
  CFAR: c000dec4 DAR: 73b0 DSISR: 4000 IRQMASK: 1
  GPR00: c03d5b94 c008b3783960 c155d400 c008b301f500
  GPR04: 0dc0 0002 c03443d8 c008bb398620
  GPR08: 0008ba2f 0001  
  GPR12: 24004844 c0001ec52a00  
  GPR16: c008a1b20048 c1595898 c1750c18 0002
  GPR20: c1750c28 c1624470 000fffe0 5deadbeef122
  GPR24: 0001 0dc0 0002 c03443d8
  GPR28: c008b301f500 c008bb398620  c00c02287180
  NIP ___slab_alloc+0x1f4/0x760
  LR __slab_alloc+0x34/0x60
  Call Trace:
___slab_alloc+0x334/0x760 (unreliable)
__slab_alloc+0x34/0x60
__kmalloc_node+0x110/0x490
kvmalloc_node+0x58/0x110
mem_cgroup_css_online+0x108/0x270
online_css+0x48/0xd0
cgroup_apply_control_enable+0x2ec/0x4d0
cgroup_mkdir+0x228/0x5f0
kernfs_iop_mkdir+0x90/0xf0
vfs_mkdir+0x110/0x230
do_mkdirat+0xb0/0x1a0
system_call+0x5c/0x68

This is a PowerPC platform with following NUMA topology:

  available: 2 nodes (0-1)
  node 0 cpus:
  node 0 size: 0 MB
  node 0 free: 0 MB
  node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
25 26 27 28 29 30 31
  node 1 size: 35247 MB
  node 1 free: 30907 MB
  node distances:
  node   0   1
0:  10  40
1:  40  10

  possible numa nodes: 0-31

This only happens with a mmotm patch "mm/memcontrol.c: allocate
shrinker_map on appropriate NUMA node" [2] which effectively calls
kmalloc_node for each possible node.  SLUB however only allocates
kmem_cache_node on online N_NORMAL_MEMORY nodes, and relies on
node_to_mem_node to return such valid node for other nodes since commit
a561ce00b09e ("slub: fall back to node_to_mem_node() node if allocating
on memoryless node").  This is however not true in this configuration
where the _node_numa_mem_ array is not initialized for nodes 0 and 2-31,
thus it contains zeroes and get_partial() ends up accessing
non-allocated kmem_cache_node.

A related issue was reported by Bharata (originally by Ramachandran) [3]
where a similar PowerPC configuration, but with mainline kernel without
patch [2] ends up allocating large amounts of pages by kmalloc-1k
kmalloc-512.  This seems to have the same underlying issue with
node_to_mem_node() not behaving as expected, and might probably also
lead to an infinite loop with CONFIG_SLUB_CPU_PARTIAL [4].

This patch should fix both issues by not relying on node_to_mem_node()
anymore and instead simply falling back to NUMA_NO_NODE, when
kmalloc_node(node) is attempted for a node that's not online, or has no
usable memory.  The "usable memory" condition is also changed from
node_present_pages() to N_NORMAL_MEMORY node state, as that is exactly
the condition that SLUB uses to allocate kmem_cache_node structures.
The check in get_partial() is removed completely, as the checks in
___slab_alloc() are now sufficient to prevent get_partial() being
reached with an invalid node.

[1] 
https://lore.kernel.org/linux-next/3381cd91-ab3d-4773-ba04-e7a072a63...@linux.vnet.ibm.com/
[2] 
https://lore.kernel.org/linux-mm/fff0e636-4c36-ed10-281c-8cdb0687c...@virtuozzo.com/
[3] https://lore.kernel.org/linux-mm/20200317092624.gb22...@in.ibm.com/
[4] 
https://lore.kernel.org/linux-mm/088b5996-faae-8a56-ef9c-5b567125a...@suse.cz/

Fixes: a561ce00b09e ("slub: fall back to node_to_mem_node() node if allocating 
on memoryless node")
Reported-by: Sachin Sant 
Reported-by: PUVICHAKRAVARTHY RAMACHANDRAN 
Signed-off-by: Vlastimil Babka 
Signed-off-by: Andrew Morton 
Tested-by: Sachin Sant 
Tested-by: Bharata B Rao 
Reviewed-by: Srikar Dronamraju 
Cc: Mel Gorman 
Cc: Michael Ellerman 
Cc: Michal Hocko 
Cc: Christopher Lameter 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Joonsoo Kim 
Cc: Pekka Enberg 
Cc: David Rientjes 
Cc: Kirill Tkhai 
Cc: Vlastimil Babka 
Cc: Nathan Lynch 
Cc: 
Link: http://lkml.kernel.org/r/20200320115533.9604-1-vba...@suse.cz
Debugged-by: Srikar Dronamraju 
Signed-off-by: Linus Torvalds 
Signed-off-by: Greg Kroah-Hartman 

---
 mm/slub.c |   26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

--- a/mm/slub.c
+++ b/mm/slub.c
@@ 

[PATCH 4.9 026/102] mm, slub: prevent kmalloc_node crashes and memory leaks

2020-04-01 Thread Greg Kroah-Hartman
From: Vlastimil Babka 

commit 0715e6c516f106ed553828a671d30ad9a3431536 upstream.

Sachin reports [1] a crash in SLUB __slab_alloc():

  BUG: Kernel NULL pointer dereference on read at 0x73b0
  Faulting instruction address: 0xc03d55f4
  Oops: Kernel access of bad area, sig: 11 [#1]
  LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in:
  CPU: 19 PID: 1 Comm: systemd Not tainted 5.6.0-rc2-next-20200218-autotest #1
  NIP:  c03d55f4 LR: c03d5b94 CTR: 
  REGS: c008b37836d0 TRAP: 0300   Not tainted  
(5.6.0-rc2-next-20200218-autotest)
  MSR:  80009033   CR: 24004844  XER: 
  CFAR: c000dec4 DAR: 73b0 DSISR: 4000 IRQMASK: 1
  GPR00: c03d5b94 c008b3783960 c155d400 c008b301f500
  GPR04: 0dc0 0002 c03443d8 c008bb398620
  GPR08: 0008ba2f 0001  
  GPR12: 24004844 c0001ec52a00  
  GPR16: c008a1b20048 c1595898 c1750c18 0002
  GPR20: c1750c28 c1624470 000fffe0 5deadbeef122
  GPR24: 0001 0dc0 0002 c03443d8
  GPR28: c008b301f500 c008bb398620  c00c02287180
  NIP ___slab_alloc+0x1f4/0x760
  LR __slab_alloc+0x34/0x60
  Call Trace:
___slab_alloc+0x334/0x760 (unreliable)
__slab_alloc+0x34/0x60
__kmalloc_node+0x110/0x490
kvmalloc_node+0x58/0x110
mem_cgroup_css_online+0x108/0x270
online_css+0x48/0xd0
cgroup_apply_control_enable+0x2ec/0x4d0
cgroup_mkdir+0x228/0x5f0
kernfs_iop_mkdir+0x90/0xf0
vfs_mkdir+0x110/0x230
do_mkdirat+0xb0/0x1a0
system_call+0x5c/0x68

This is a PowerPC platform with following NUMA topology:

  available: 2 nodes (0-1)
  node 0 cpus:
  node 0 size: 0 MB
  node 0 free: 0 MB
  node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
25 26 27 28 29 30 31
  node 1 size: 35247 MB
  node 1 free: 30907 MB
  node distances:
  node   0   1
0:  10  40
1:  40  10

  possible numa nodes: 0-31

This only happens with a mmotm patch "mm/memcontrol.c: allocate
shrinker_map on appropriate NUMA node" [2] which effectively calls
kmalloc_node for each possible node.  SLUB however only allocates
kmem_cache_node on online N_NORMAL_MEMORY nodes, and relies on
node_to_mem_node to return such valid node for other nodes since commit
a561ce00b09e ("slub: fall back to node_to_mem_node() node if allocating
on memoryless node").  This is however not true in this configuration
where the _node_numa_mem_ array is not initialized for nodes 0 and 2-31,
thus it contains zeroes and get_partial() ends up accessing
non-allocated kmem_cache_node.

A related issue was reported by Bharata (originally by Ramachandran) [3]
where a similar PowerPC configuration, but with mainline kernel without
patch [2] ends up allocating large amounts of pages by kmalloc-1k
kmalloc-512.  This seems to have the same underlying issue with
node_to_mem_node() not behaving as expected, and might probably also
lead to an infinite loop with CONFIG_SLUB_CPU_PARTIAL [4].

This patch should fix both issues by not relying on node_to_mem_node()
anymore and instead simply falling back to NUMA_NO_NODE, when
kmalloc_node(node) is attempted for a node that's not online, or has no
usable memory.  The "usable memory" condition is also changed from
node_present_pages() to N_NORMAL_MEMORY node state, as that is exactly
the condition that SLUB uses to allocate kmem_cache_node structures.
The check in get_partial() is removed completely, as the checks in
___slab_alloc() are now sufficient to prevent get_partial() being
reached with an invalid node.

[1] 
https://lore.kernel.org/linux-next/3381cd91-ab3d-4773-ba04-e7a072a63...@linux.vnet.ibm.com/
[2] 
https://lore.kernel.org/linux-mm/fff0e636-4c36-ed10-281c-8cdb0687c...@virtuozzo.com/
[3] https://lore.kernel.org/linux-mm/20200317092624.gb22...@in.ibm.com/
[4] 
https://lore.kernel.org/linux-mm/088b5996-faae-8a56-ef9c-5b567125a...@suse.cz/

Fixes: a561ce00b09e ("slub: fall back to node_to_mem_node() node if allocating 
on memoryless node")
Reported-by: Sachin Sant 
Reported-by: PUVICHAKRAVARTHY RAMACHANDRAN 
Signed-off-by: Vlastimil Babka 
Signed-off-by: Andrew Morton 
Tested-by: Sachin Sant 
Tested-by: Bharata B Rao 
Reviewed-by: Srikar Dronamraju 
Cc: Mel Gorman 
Cc: Michael Ellerman 
Cc: Michal Hocko 
Cc: Christopher Lameter 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Joonsoo Kim 
Cc: Pekka Enberg 
Cc: David Rientjes 
Cc: Kirill Tkhai 
Cc: Vlastimil Babka 
Cc: Nathan Lynch 
Cc: 
Link: http://lkml.kernel.org/r/20200320115533.9604-1-vba...@suse.cz
Debugged-by: Srikar Dronamraju 
Signed-off-by: Linus Torvalds 
Signed-off-by: Greg Kroah-Hartman 

---
 mm/slub.c |   26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

--- a/mm/slub.c
+++ b/mm/slub.c
@@ 

[PATCH 4.4 24/91] mm, slub: prevent kmalloc_node crashes and memory leaks

2020-04-01 Thread Greg Kroah-Hartman
From: Vlastimil Babka 

commit 0715e6c516f106ed553828a671d30ad9a3431536 upstream.

Sachin reports [1] a crash in SLUB __slab_alloc():

  BUG: Kernel NULL pointer dereference on read at 0x73b0
  Faulting instruction address: 0xc03d55f4
  Oops: Kernel access of bad area, sig: 11 [#1]
  LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in:
  CPU: 19 PID: 1 Comm: systemd Not tainted 5.6.0-rc2-next-20200218-autotest #1
  NIP:  c03d55f4 LR: c03d5b94 CTR: 
  REGS: c008b37836d0 TRAP: 0300   Not tainted  
(5.6.0-rc2-next-20200218-autotest)
  MSR:  80009033   CR: 24004844  XER: 
  CFAR: c000dec4 DAR: 73b0 DSISR: 4000 IRQMASK: 1
  GPR00: c03d5b94 c008b3783960 c155d400 c008b301f500
  GPR04: 0dc0 0002 c03443d8 c008bb398620
  GPR08: 0008ba2f 0001  
  GPR12: 24004844 c0001ec52a00  
  GPR16: c008a1b20048 c1595898 c1750c18 0002
  GPR20: c1750c28 c1624470 000fffe0 5deadbeef122
  GPR24: 0001 0dc0 0002 c03443d8
  GPR28: c008b301f500 c008bb398620  c00c02287180
  NIP ___slab_alloc+0x1f4/0x760
  LR __slab_alloc+0x34/0x60
  Call Trace:
___slab_alloc+0x334/0x760 (unreliable)
__slab_alloc+0x34/0x60
__kmalloc_node+0x110/0x490
kvmalloc_node+0x58/0x110
mem_cgroup_css_online+0x108/0x270
online_css+0x48/0xd0
cgroup_apply_control_enable+0x2ec/0x4d0
cgroup_mkdir+0x228/0x5f0
kernfs_iop_mkdir+0x90/0xf0
vfs_mkdir+0x110/0x230
do_mkdirat+0xb0/0x1a0
system_call+0x5c/0x68

This is a PowerPC platform with following NUMA topology:

  available: 2 nodes (0-1)
  node 0 cpus:
  node 0 size: 0 MB
  node 0 free: 0 MB
  node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
25 26 27 28 29 30 31
  node 1 size: 35247 MB
  node 1 free: 30907 MB
  node distances:
  node   0   1
0:  10  40
1:  40  10

  possible numa nodes: 0-31

This only happens with a mmotm patch "mm/memcontrol.c: allocate
shrinker_map on appropriate NUMA node" [2] which effectively calls
kmalloc_node for each possible node.  SLUB however only allocates
kmem_cache_node on online N_NORMAL_MEMORY nodes, and relies on
node_to_mem_node to return such valid node for other nodes since commit
a561ce00b09e ("slub: fall back to node_to_mem_node() node if allocating
on memoryless node").  This is however not true in this configuration
where the _node_numa_mem_ array is not initialized for nodes 0 and 2-31,
thus it contains zeroes and get_partial() ends up accessing
non-allocated kmem_cache_node.

A related issue was reported by Bharata (originally by Ramachandran) [3]
where a similar PowerPC configuration, but with mainline kernel without
patch [2] ends up allocating large amounts of pages by kmalloc-1k and
kmalloc-512.  This seems to have the same underlying issue with
node_to_mem_node() not behaving as expected, and might probably also
lead to an infinite loop with CONFIG_SLUB_CPU_PARTIAL [4].

This patch should fix both issues by not relying on node_to_mem_node()
anymore and instead simply falling back to NUMA_NO_NODE, when
kmalloc_node(node) is attempted for a node that's not online, or has no
usable memory.  The "usable memory" condition is also changed from
node_present_pages() to N_NORMAL_MEMORY node state, as that is exactly
the condition that SLUB uses to allocate kmem_cache_node structures.
The check in get_partial() is removed completely, as the checks in
___slab_alloc() are now sufficient to prevent get_partial() being
reached with an invalid node.

[1] 
https://lore.kernel.org/linux-next/3381cd91-ab3d-4773-ba04-e7a072a63...@linux.vnet.ibm.com/
[2] 
https://lore.kernel.org/linux-mm/fff0e636-4c36-ed10-281c-8cdb0687c...@virtuozzo.com/
[3] https://lore.kernel.org/linux-mm/20200317092624.gb22...@in.ibm.com/
[4] 
https://lore.kernel.org/linux-mm/088b5996-faae-8a56-ef9c-5b567125a...@suse.cz/

Fixes: a561ce00b09e ("slub: fall back to node_to_mem_node() node if allocating 
on memoryless node")
Reported-by: Sachin Sant 
Reported-by: PUVICHAKRAVARTHY RAMACHANDRAN 
Signed-off-by: Vlastimil Babka 
Signed-off-by: Andrew Morton 
Tested-by: Sachin Sant 
Tested-by: Bharata B Rao 
Reviewed-by: Srikar Dronamraju 
Cc: Mel Gorman 
Cc: Michael Ellerman 
Cc: Michal Hocko 
Cc: Christopher Lameter 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Joonsoo Kim 
Cc: Pekka Enberg 
Cc: David Rientjes 
Cc: Kirill Tkhai 
Cc: Vlastimil Babka 
Cc: Nathan Lynch 
Cc: 
Link: http://lkml.kernel.org/r/20200320115533.9604-1-vba...@suse.cz
Debugged-by: Srikar Dronamraju 
Signed-off-by: Linus Torvalds 
Signed-off-by: Greg Kroah-Hartman 

---
 mm/slub.c |   26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

--- a/mm/slub.c
+++ b/mm/slub.c
@@ 

Re: [RFC PATCH v2 1/1] powerpc/kernel: Enables memory hot-remove after reboot on pseries guests

2020-04-01 Thread Leonardo Bras
On Thu, 2020-03-05 at 20:32 -0300, Leonardo Bras wrote:
> ---
> The new flag was already proposed on Power Architecture documentation,
> and it's waiting for approval.
> 
> I would like to get your comments on this change, but it's still not
> ready for being merged.

New flag got approved on the documentation.
Please review this patch.


signature.asc
Description: This is a digitally signed message part


[PATCHv4] powerpc/crashkernel: take "mem=" option into account

2020-04-01 Thread Pingfan Liu
The "mem=" option is an easy way to put high pressure on memory during
testing. Hence, after applying the memory limit, the actual usable memory,
rather than the total memory, should be considered when reserving memory for
the crashkernel. Otherwise the boot may run into an OOM issue.

E.g. it would reserve 4G prior to the change and 512M afterward, if passing
crashkernel="2G-4G:384M,4G-16G:512M,16G-64G:1G,64G-128G:2G,128G-:4G", and
mem=5G on a 256G machine.

This issue is powerpc specific because it puts higher priority on fadump
and kdump reservation than on "mem=". Refer to the following code:
if (fadump_reserve_mem() == 0)
reserve_crashkernel();
...
/* Ensure that total memory size is page-aligned. */
limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
memblock_enforce_memory_limit(limit);

While on other arches, the effect of "mem=" takes a higher priority and passes
through memblock_phys_mem_size() before reserve_crashkernel() is called.

Signed-off-by: Pingfan Liu 
To: linuxppc-dev@lists.ozlabs.org
Cc: Hari Bathini 
Cc: Michael Ellerman 
Cc: ke...@lists.infradead.org
---
v3 -> v4: fix total_mem_sz based on adjusted memory_limit

 arch/powerpc/kexec/core.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index 078fe3d..56da5eb 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -115,11 +115,12 @@ void machine_kexec(struct kimage *image)

 void __init reserve_crashkernel(void)
 {
-   unsigned long long crash_size, crash_base;
+   unsigned long long crash_size, crash_base, total_mem_sz;
int ret;

+   total_mem_sz = memory_limit ? memory_limit : memblock_phys_mem_size();
/* use common parsing */
-   ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+   ret = parse_crashkernel(boot_command_line, total_mem_sz,
&crash_size, &crash_base);
if (ret == 0 && crash_size > 0) {
crashk_res.start = crash_base;
@@ -178,6 +179,7 @@ void __init reserve_crashkernel(void)
/* Crash kernel trumps memory limit */
if (memory_limit && memory_limit <= crashk_res.end) {
memory_limit = crashk_res.end + 1;
+   total_mem_sz = memory_limit;
printk("Adjusted memory limit for crashkernel, now 0x%llx\n",
   memory_limit);
}
@@ -186,7 +188,7 @@ void __init reserve_crashkernel(void)
"for crashkernel (System RAM: %ldMB)\n",
(unsigned long)(crash_size >> 20),
(unsigned long)(crashk_res.start >> 20),
-   (unsigned long)(memblock_phys_mem_size() >> 20));
+   (unsigned long)(total_mem_sz >> 20));

if (!memblock_is_region_memory(crashk_res.start, crash_size) ||
memblock_reserve(crashk_res.start, crash_size)) {
--
2.7.5



Re: [PATCH 10/16] powerpc: prefer __section and __printf from compiler_attributes.h

2020-04-01 Thread Miguel Ojeda
Hi Michael,

On Wed, Apr 1, 2020 at 2:53 PM Michael Ellerman
 wrote:
>
> On Mon, 2019-08-12 at 21:50:43 UTC, Nick Desaulniers wrote:
> > Reported-by: Sedat Dilek 
> > Suggested-by: Josh Poimboeuf 
> > Signed-off-by: Nick Desaulniers 
>
> Applied to powerpc next, thanks.

Missed this one from August, thanks Nick for this cleanup!

Michael, you already picked it up, but you may have my:

Acked-by: Miguel Ojeda 

Cheers,
Miguel


[PATCH -next] KVM: PPC: Book3S HV: remove redundant NULL check

2020-04-01 Thread Chen Zhou
Free function kfree() already does NULL check, so the additional
check is unnecessary, just remove it.

Signed-off-by: Chen Zhou 
---
 arch/powerpc/kvm/book3s_hv_nested.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index dc97e5b..cad3243 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -1416,8 +1416,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_run 
*run,
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
-   if (n_rmap)
-   kfree(n_rmap);
+   kfree(n_rmap);
if (ret == -EAGAIN)
ret = RESUME_GUEST; /* Let the guest try again */
 
-- 
2.7.4



Re: [PATCH] powerpc/64/tm: Don't let userspace set regs->trap via sigreturn

2020-04-01 Thread Michael Ellerman
On Wed, 2020-04-01 at 02:38:36 UTC, Michael Ellerman wrote:
> In restore_tm_sigcontexts() we take the trap value directly from the
> user sigcontext with no checking:
> 
>   err |= __get_user(regs->trap, &sc->gp_regs[PT_TRAP]);
> 
> This means we can be in the kernel with an arbitrary regs->trap value.
> 
> Although that's not immediately problematic, there is a risk we could
> trigger one of the uses of CHECK_FULL_REGS():
> 
>   #define CHECK_FULL_REGS(regs)   BUG_ON(regs->trap & 1)
> 
> It can also cause us to unnecessarily save non-volatile GPRs again in
> save_nvgprs(), which shouldn't be problematic but is still wrong.
> 
> It's also possible it could trick the syscall restart machinery, which
> relies on regs->trap not being == 0xc00 (see 9a81c16b5275 ("powerpc:
> fix double syscall restarts")), though I haven't been able to make
> that happen.
> 
> Finally it doesn't match the behaviour of the non-TM case, in
> restore_sigcontext() which zeroes regs->trap.
> 
> So change restore_tm_sigcontexts() to zero regs->trap.
> 
> This was discovered while testing Nick's upcoming rewrite of the
> syscall entry path. In that series the call to save_nvgprs() prior to
> signal handling (do_notify_resume()) is removed, which leaves the
> low-bit of regs->trap uncleared which can then trigger the FULL_REGS()
> WARNs in setup_tm_sigcontexts().
> 
> Fixes: 2b0a576d15e0 ("powerpc: Add new transactional memory state to the 
> signal context")
> Cc: sta...@vger.kernel.org # v3.9+
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/c7def7fbdeaa25feaa19caf4a27c5d10bd8789e4

cheers


Re: [PATCH] selftests/powerpc: Fix try-run when source tree is not writable

2020-04-01 Thread Michael Ellerman
On Fri, 2020-03-27 at 09:53:19 UTC, Michael Ellerman wrote:
> We added a usage of try-run to pmu/ebb/Makefile to detect if the
> toolchain supported the -no-pie option.
> 
> This fails if we build out-of-tree and the source tree is not
> writable, as try-run tries to write its temporary files to the current
> directory. That leads to the -no-pie option being silently dropped,
> which leads to broken executables with some toolchains.
> 
> If we remove the redirect to /dev/null in try-run, we see the error:
> 
>   make[3]: Entering directory '/linux/tools/testing/selftests/powerpc/pmu/ebb'
>   /usr/bin/ld: cannot open output file .54.tmp: Read-only file system
>   collect2: error: ld returned 1 exit status
>   make[3]: Nothing to be done for 'all'.
> 
> And looking with strace we see it's trying to use a file that's in the
> source tree:
> 
>   lstat("/linux/tools/testing/selftests/powerpc/pmu/ebb/.54.tmp", 
> 0x7c0f83c8)
> 
> We can fix it by setting TMPOUT to point to the $(OUTPUT) directory,
> and we can verify with strace it's now trying to write to the output
> directory:
> 
>   lstat("/output/kselftest/powerpc/pmu/ebb/.54.tmp", 0x7fffd1bf6bf8)
> 
> And also see that the -no-pie option is now correctly detected.
> 
> Fixes: 0695f8bca93e ("selftests/powerpc: Handle Makefile for unrecognized 
> option")
> Cc: sta...@vger.kernel.org # v5.5+
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/9686813f6e9d5568bc045de0be853411e44958c8

cheers


Re: [PATCH v3] powerpc: Make setjmp/longjmp signature standard

2020-04-01 Thread Michael Ellerman
On Mon, 2020-03-30 at 08:03:56 UTC, Clement Courbet wrote:
> Declaring setjmp()/longjmp() as taking longs makes the signature
> non-standard, and makes clang complain. In the past, this has been
> worked around by adding -ffreestanding to the compile flags.
> 
> The implementation looks like it only ever propagates the value
> (in longjmp) or sets it to 1 (in setjmp), and we only call longjmp
> with integer parameters.
> 
> This allows removing -ffreestanding from the compilation flags.
> 
> Context:
> https://lore.kernel.org/patchwork/patch/1214060
> https://lore.kernel.org/patchwork/patch/1216174
> 
> Signed-off-by: Clement Courbet 
> Reviewed-by: Nathan Chancellor 
> Tested-by: Nathan Chancellor 
> Cc: sta...@vger.kernel.org # v4.14+
> Fixes: c9029ef9c957 ("powerpc: Avoid clang warnings around setjmp and 
> longjmp")

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/c17eb4dca5a353a9dbbb8ad6934fe57af7165e91

cheers


Re: [PATCH v4] powerpc/pseries: Handle UE event for memcpy_mcsafe

2020-04-01 Thread Michael Ellerman
On Thu, 2020-03-26 at 18:49:16 UTC, Ganesh Goudar wrote:
> memcpy_mcsafe has been implemented for power machines and is used
> by the pmem infrastructure, so that a UE encountered during memcpy from
> pmem devices does not result in a panic; instead, the right error code
> is returned. The implementation expects the machine check handler to ignore
> the event and set nip to continue execution from the fixup code.
> 
> Appropriate changes have already been made to the powernv machine check
> handler; make similar changes to the pseries machine check handler to ignore
> the event and set nip to continue execution at the fixup entry if we
> hit a UE at an instruction with a fixup entry.
> 
> While we are at it, add a common function which searches the exception
> table entry and updates nip with the fixup address, so that any future common
> changes valid for both architectures can be made in this function.
> 
> powernv changes are made by
> commit 895e3dceeb97 ("powerpc/mce: Handle UE event for memcpy_mcsafe")
> 
> Reviewed-by: Mahesh Salgaonkar 
> Reviewed-by: Santosh S 
> Signed-off-by: Ganesh Goudar 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/efbc4303b255bb80ab1283794b36dd5fe1fb0ec3

cheers


Re: [PATCH v2] powerpc/boot: Delete unneeded .globl _zimage_start

2020-04-01 Thread Michael Ellerman
On Wed, 2020-03-25 at 16:42:57 UTC, Fangrui Song wrote:
> .globl sets the symbol binding to STB_GLOBAL while .weak sets the
> binding to STB_WEAK. GNU as let .weak override .globl since binutils-gdb
> 5ca547dc2399a0a5d9f20626d4bf5547c3ccfddd (1996). Clang integrated
> assembler let the last win but it may error in the future.
> 
> Since it is a convention that only one binding directive is used, just
> delete .globl.
> 
> Fixes: cd197ffcf10b "[POWERPC] zImage: Cleanup and improve zImage entry point"
> Fixes: ee9d21b3b358 "powerpc/boot: Ensure _zimage_start is a weak symbol"
> Link: https://github.com/ClangBuiltLinux/linux/issues/937
> Signed-off-by: Fangrui Song 
> Cc: Alan Modra 
> Cc: Joel Stanley 
> Cc: Michael Ellerman 
> Cc: Nick Desaulniers 
> Cc: Segher Boessenkool 
> Cc: clang-built-li...@googlegroups.com

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/968339fad422a58312f67718691b717dac45c399

cheers


Re: [PATCH v2] powerpc/64: mark emergency stacks valid to unwind

2020-04-01 Thread Michael Ellerman
On Wed, 2020-03-25 at 10:41:44 UTC, Nicholas Piggin wrote:
> Before:
> 
>   WARNING: CPU: 0 PID: 494 at arch/powerpc/kernel/irq.c:343
>   CPU: 0 PID: 494 Comm: a Tainted: GW
>   NIP:  c001ed2c LR: c0d13190 CTR: c003f910
>   REGS: c001fffd3870 TRAP: 0700   Tainted: GW
>   MSR:  80021003   CR: 28000488  XER: 
>   CFAR: c001ec90 IRQMASK: 0
>   GPR00: c0aeb12c c001fffd3b00 c12ba300 
>   GPR04:   00010bd207c8 6b00696e74657272
>   GPR08:    efbeadde
>   GPR12:  c14a  
>   GPR16:    
>   GPR20:    
>   GPR24:    00010bd207bc
>   GPR28:  c148a898  c0013f50
>   NIP [c001ed2c] arch_local_irq_restore.part.0+0xac/0x100
>   LR [c0d13190] _raw_spin_unlock_irqrestore+0x50/0xc0
>   Call Trace:
>   Instruction dump:
>   6000 7d2000a6 71298000 41820068 3922 7d210164 4b9c 6000
>   6000 7d2000a6 71298000 4c820020 <0fe0> 4e800020 6000 6000
> 
> After:
> 
>   WARNING: CPU: 0 PID: 499 at arch/powerpc/kernel/irq.c:343
>   CPU: 0 PID: 499 Comm: a Not tainted
>   NIP:  c001ed2c LR: c0d13210 CTR: c003f980
>   REGS: c001fffd3870 TRAP: 0700   Not tainted
>   MSR:  80021003   CR: 28000488  XER: 
>   CFAR: c001ec90 IRQMASK: 0
>   GPR00: c0aeb1ac c001fffd3b00 c12ba300 
>   GPR04:   0001347607c8 6b00696e74657272
>   GPR08:    efbeadde
>   GPR12:  c14a  
>   GPR16:    
>   GPR20:    
>   GPR24:    0001347607bc
>   GPR28:  c148a898  c0013f50
>   NIP [c001ed2c] arch_local_irq_restore.part.0+0xac/0x100
>   LR [c0d13210] _raw_spin_unlock_irqrestore+0x50/0xc0
>   Call Trace:
>   [c001fffd3b20] [c0aeb1ac] of_find_property+0x6c/0x90
>   [c001fffd3b70] [c0aeb1f0] of_get_property+0x20/0x40
>   [c001fffd3b90] [c0042cdc] rtas_token+0x3c/0x70
>   [c001fffd3bb0] [c00dc318] fwnmi_release_errinfo+0x28/0x70
>   [c001fffd3c10] [c00dcd8c] 
> pseries_machine_check_realmode+0x1dc/0x540
>   [c001fffd3cd0] [c003fe04] machine_check_early+0x54/0x70
>   [c001fffd3d00] [c0008384] machine_check_early_common+0x134/0x1f0
>   --- interrupt: 200 at 0x1347607c8
>   LR = 0x7fffafbd8328
>   Instruction dump:
>   6000 7d2000a6 71298000 41820068 3922 7d210164 4b9c 6000
>   6000 7d2000a6 71298000 4c820020 <0fe0> 4e800020 6000 6000
> 
> Signed-off-by: Nicholas Piggin 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/a2e366832f3f4d5e1b47b7c7f7c41977bd5100f4

cheers


Re: [PATCH] arch/powerpc/64: Avoid isync in flush_dcache_range

2020-04-01 Thread Michael Ellerman
On Fri, 2020-03-20 at 10:32:42 UTC, "Aneesh Kumar K.V" wrote:
> As per the ISA, an isync is only needed on instruction cache
> block invalidate. Remove it from dcache invalidate.
> 
> Signed-off-by: Aneesh Kumar K.V 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/233ba5461838a56c19600216f0919e7cd3aec40e

cheers


Re: [PATCH 1/2] powerpc/smp: Drop superfluous NULL check

2020-04-01 Thread Michael Ellerman
On Fri, 2020-03-13 at 11:20:19 UTC, Michael Ellerman wrote:
> We don't need the NULL check of np, the result is the same because the
> OF helpers cope with NULL, of_node_to_nid(NULL) == NUMA_NO_NODE (-1).
> 
> Signed-off-by: Michael Ellerman 

Series applied to powerpc next.

https://git.kernel.org/powerpc/c/4b4d181d63518334070a877ba789211bde77da9e

cheers


Re: [PATCH 1/4] powerpc/xive: Use XIVE_BAD_IRQ instead of zero to catch non configured IPIs

2020-04-01 Thread Michael Ellerman
On Fri, 2020-03-06 at 15:01:40 UTC, =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= wrote:
> When a CPU is brought up, an IPI number is allocated and recorded
> under the XIVE CPU structure. Invalid IPI numbers are tracked with
> interrupt number 0x0.
> 
> On the PowerNV platform, the interrupt number space starts at 0x10 and
> this works fine. However, on the sPAPR platform, it is possible to
> allocate the interrupt number 0x0 and this raises an issue when CPU 0
> is unplugged. The XIVE spapr driver tracks allocated interrupt numbers
> in a bitmask and it is not correctly updated when interrupt number 0x0
> is freed. It stays allocated and it is then impossible to reallocate.
> 
> Fix by using the XIVE_BAD_IRQ value instead of zero on both platforms.
> 
> Reported-by: David Gibson 
> Fixes: eac1e731b59e ("powerpc/xive: guest exploitation of the XIVE interrupt 
> controller")
> Cc: sta...@vger.kernel.org # v4.14+
> Signed-off-by: Cédric Le Goater 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/b1a504a6500df50e83b701b7946b34fce27ad8a3

cheers


Re: [PATCH 1/2] powerpc: Drop -fno-dwarf2-cfi-asm

2020-04-01 Thread Michael Ellerman
On Thu, 2020-03-05 at 14:35:29 UTC, "Naveen N. Rao" wrote:
> The original commit/discussion adding -fno-dwarf2-cfi-asm refers to
> R_PPC64_REL32 relocations not being handled by our module loader:
> http://lkml.kernel.org/r/20090224065112.ga6...@bombadil.infradead.org
> 
> However, that is now handled thanks to commit 9f751b82b491d
> ("powerpc/module: Add support for R_PPC64_REL32 relocations").
> 
> So, drop this flag from our Makefile.
> 
> Signed-off-by: Naveen N. Rao 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/c04868df38d8d6239ef0f36f45dbba2624e6a9cb

cheers


Re: [PATCH v5 01/13] powerpc: move ptrace into a subdirectory.

2020-04-01 Thread Michael Ellerman
On Fri, 2020-02-28 at 00:14:37 UTC, Christophe Leroy wrote:
> In order to allow splitting of ptrace depending on the
> different CONFIG_ options, create a subdirectory dedicated to
> ptrace and move ptrace.c and ptrace32.c into it.
> 
> Signed-off-by: Christophe Leroy 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/da9a1c10e2c7311e923210b6ccd9fbd1ac9132df

cheers


Re: [PATCH 1/2] powerpc/vmlinux.lds: Explicitly retain .gnu.hash

2020-04-01 Thread Michael Ellerman
On Thu, 2020-02-27 at 04:59:32 UTC, Michael Ellerman wrote:
> Relocatable kernel builds produce a warning about .gnu.hash being an
> orphan section:
> 
>   ld: warning: orphan section `.gnu.hash' from `linker stubs' being placed in 
> section `.gnu.hash'
> 
> If we try to discard it the build fails:
> 
>   ld -EL -m elf64lppc -pie --orphan-handling=warn --build-id -o
> .tmp_vmlinux1 -T ./arch/powerpc/kernel/vmlinux.lds --whole-archive
> arch/powerpc/kernel/head_64.o arch/powerpc/kernel/entry_64.o
> ...
> sound/built-in.a net/built-in.a virt/built-in.a --no-whole-archive
> --start-group lib/lib.a --end-group
>   ld: could not find section .gnu.hash
> 
> So add an entry to explicitly retain it, as we do for .hash.
> 
> Signed-off-by: Michael Ellerman 

Applied to powerpc next.

https://git.kernel.org/powerpc/c/ead983604c5a390f1e3ce085945b60e82f08dbbe

cheers


Re: [PATCH v3 01/32] powerpc/64s/exception: Introduce INT_DEFINE parameter block for code generation

2020-04-01 Thread Michael Ellerman
On Tue, 2020-02-25 at 17:35:10 UTC, Nicholas Piggin wrote:
> The code generation macro arguments are difficult to read, and
> defaults can't easily be used.
> 
> This introduces a block where parameters can be set for interrupt
> handler code generation by the subsequent macros, and adds the first
> generation macro for interrupt entry.
> 
> One interrupt handler is converted to the new macros to demonstrate
> the change, the rest will be converted all at once.
> 
> No generated code change.
> 
> Signed-off-by: Nicholas Piggin 

Patches 1-30 applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/a42a239db3262b8185cb1a07a9350392ef1439ca

cheers


Re: [PATCH 1/8] powerpc: Update MAINTAINERS

2020-04-01 Thread Michael Ellerman
On Mon, 2020-02-24 at 23:31:39 UTC, Michael Ellerman wrote:
> A while back Paul pointed out I'd been maintaining the tree more or
> less solo for over five years, so perhaps it's time to update the
> MAINTAINERS entry.
> 
> Ben & Paul still wrote most of the code, so keep them as Reviewers so
> they still get Cc'ed on things. But if you're wondering why your patch
> hasn't been merged that's my fault.
> 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Signed-off-by: Michael Ellerman 

Series applied to powerpc next.

https://git.kernel.org/powerpc/c/7703889e8ee1b318f632be7ba4d58d9962ecf34f

cheers


Re: [PATCH] powernv/opal-sensor-groups: Add documentation for the sysfs interfaces

2020-04-01 Thread Michael Ellerman
On Tue, 2019-11-26 at 13:51:14 UTC, "Gautham R. Shenoy" wrote:
> From: Shilpasri G Bhat 
> 
> Commit bf9571550f52 ("powerpc/powernv: Add support to clear sensor
> groups data") added a mechanism to clear sensor-group data via a sysfs
> interface. However, the ABI for that interface has not been
> documented.
> 
> This patch documents the ABI for the sysfs interface for sensor-groups
> and clearing the sensor-groups.
> 
> This patch was originally sent by Shilpasri G Bhat on the mailing list:
> https://lkml.org/lkml/2018/8/1/85
> 
> Signed-off-by: Shilpasri G Bhat 
> Signed-off-by: Gautham R. Shenoy 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/32377bd2cbb62e23ac0a1aaaf0048957c5fd9f02

cheers


Re: [PATCH 1/1] powerpc/cputable: Remove unnecessary copy of cpu_spec->oprofile_type

2020-04-01 Thread Michael Ellerman
On Sat, 2020-02-15 at 05:36:37 UTC, Leonardo Bras wrote:
> Before checking for cpu_type == NULL, this same copy happens, so doing
> it here will just write the same value to the t->oprofile_type
> again.
> 
> Remove the repeated copy, as it is unnecessary.
> 
> Signed-off-by: Leonardo Bras 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/41b8426fdb59218f56a6e3b3facd43a82816e3eb

cheers


Re: [PATCH] powerpc/32: drop unused ISA_DMA_THRESHOLD

2020-04-01 Thread Michael Ellerman
On Mon, 2019-11-25 at 09:20:33 UTC, Mike Rapoport wrote:
> From: Mike Rapoport 
> 
> The ISA_DMA_THRESHOLD variable is set by several platforms but never
> referenced.
> Remove it.
> 
> Signed-off-by: Mike Rapoport 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/b77afad84e1eedca03658ae1478ce5b8ed5aa18c

cheers


Re: [PATCH 10/16] powerpc: prefer __section and __printf from compiler_attributes.h

2020-04-01 Thread Michael Ellerman
On Mon, 2019-08-12 at 21:50:43 UTC, Nick Desaulniers wrote:
> Reported-by: Sedat Dilek 
> Suggested-by: Josh Poimboeuf 
> Signed-off-by: Nick Desaulniers 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/a7032637b54186e5649917679727d7feaec932b1

cheers


Re: [PATCH v4 6/6] pseries/sysfs: Minimise IPI noise while reading [idle_][s]purr

2020-04-01 Thread Gautham R Shenoy
Hello Naveen,


On Wed, Apr 01, 2020 at 03:28:48PM +0530, Naveen N. Rao wrote:
> Gautham R. Shenoy wrote:
> >From: "Gautham R. Shenoy" 
> >
 [..snip..]

> >-static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
> >-static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
> > static DEVICE_ATTR(pir, 0400, show_pir, NULL);
> > static DEVICE_ATTR(tscr, 0600, show_tscr, store_tscr);
> > #endif /* CONFIG_PPC64 */
> >@@ -761,22 +757,110 @@ static void create_svm_file(void)
> > }
> > #endif /* CONFIG_PPC_SVM */
> >
> >+#ifdef CONFIG_PPC64
> >+/*
> >+ * The duration (in ms) from the last IPI to the target CPU until
> >+ * which a cached value of purr, spurr, idle_purr, idle_spurr can be
> >+ * reported to the user on a corresponding sysfs file read. Beyond
> >+ * this duration, fresh values need to be obtained by sending IPIs to
> >+ * the target CPU when the sysfs files are read.
> >+ */
> >+static unsigned long util_stats_staleness_tolerance_ms = 10;
> 
> This is a nice optimization for our use in lparstat, though I have a concern
> below.
> 
> >+struct util_acct_stats {
> >+u64 latest_purr;
> >+u64 latest_spurr;
> >+#ifdef CONFIG_PPC_PSERIES
> >+u64 latest_idle_purr;
> >+u64 latest_idle_spurr;
> >+#endif
> 
> You can probably drop the 'latest_' prefix.


Sure.

> 
> >+unsigned long last_update_jiffies;
> >+};
> >+
> >+DEFINE_PER_CPU(struct util_acct_stats, util_acct_stats);
> 
> Per snowpatch, this should be static, and so should get_util_stats_ptr()
> below:
> https://openpower.xyz/job/snowpatch/job/snowpatch-linux-sparse/16601//artifact/linux/report.txt

Ok, will fix this in v5.

> 
> >+
> >+static void update_util_acct_stats(void *ptr)
> >+{
> >+struct util_acct_stats *stats = ptr;
> >+
> >+stats->latest_purr = mfspr(SPRN_PURR);
> >+stats->latest_spurr = mfspr(SPRN_SPURR);
> > #ifdef CONFIG_PPC_PSERIES
> >-static void read_idle_purr(void *val)
> >+stats->latest_idle_purr = read_this_idle_purr();
> >+stats->latest_idle_spurr = read_this_idle_spurr();
> >+#endif
> >+stats->last_update_jiffies = jiffies;
> >+}
> >+
> >+struct util_acct_stats *get_util_stats_ptr(int cpu)
> >+{
> >+struct util_acct_stats *stats = per_cpu_ptr(&util_acct_stats, cpu);
> >+unsigned long delta_jiffies;
> >+
> >+delta_jiffies = jiffies - stats->last_update_jiffies;
> >+
> >+/*
> >+ * If we have a recent enough data, reuse that instead of
> >+ * sending an IPI.
> >+ */
> >+if (jiffies_to_msecs(delta_jiffies) < util_stats_staleness_tolerance_ms)
> >+return stats;
> >+
> >+smp_call_function_single(cpu, update_util_acct_stats, stats, 1);
> >+return stats;
> >+}
> >+
> >+static ssize_t show_purr(struct device *dev,
> >+ struct device_attribute *attr, char *buf)
> > {
> >-u64 *ret = val;
> >+struct cpu *cpu = container_of(dev, struct cpu, dev);
> >+struct util_acct_stats *stats;
> >
> >-*ret = read_this_idle_purr();
> >+stats = get_util_stats_ptr(cpu->dev.id);
> >+return sprintf(buf, "%llx\n", stats->latest_purr);
> 
> This alters the behavior of the current sysfs purr file. I am not sure if it
> is reasonable to return the same PURR value across a 10ms window.


It does reduce it to a 10ms window. I am not sure if anyone samples PURR
etc. faster than that rate.

I measured how much time it takes to read the purr, spurr, idle_purr,
idle_spurr files back-to-back. It takes not more than 150us.  Will
lparstat read these values back-to-back? If so, we can reduce
the staleness_tolerance to something like 500us and still avoid extra
IPIs. If not, what is the maximum delay between the first sysfs file
read and the last sysfs file read?

>
> I wonder if we should introduce a sysctl interface to control thresholding.
> It can default to 0, which disables thresholding so that the existing
> behavior continues. Applications (lparstat) can optionally set it to suit
> their use.

We would be introducing 3 new sysfs interfaces that way instead of
two.

/sys/devices/system/cpu/purr_spurr_staleness
/sys/devices/system/cpu/cpuX/idle_purr
/sys/devices/system/cpu/cpuX/idle_spurr

I don't have a problem with this. Nathan, Michael, thoughts on this?


The alternative is to have a procfs interface, something like
/proc/powerpc/resource_util_stats

which gives a listing similar to /proc/stat, i.e

  CPUX

Even in this case, the values can be obtained in one-shot with a
single IPI and be printed in the row corresponding to the CPU.

> 
> - Naveen
> 

--
Thanks and Regards
gautham.


Re: [RFC WIP PATCH] powerpc/32: system call implement entry/exit logic in C

2020-04-01 Thread Christophe Leroy




Le 31/03/2020 à 17:22, Christophe Leroy a écrit :

This is a first attempt at porting the PPC64 syscall entry/exit logic in C to PPC32.
I've done the minimum to get it to work. I have not reworked the calls
to sys_fork() and friends, for instance.

For the time being, it seems to work more or less but:
- ping reports EINVAL on recvfrom
- strace shows NULL instead of strings in call like open() for instance.


The two problems above occur because system_call_exception()
doesn't set orig_gpr3, whereas DoSyscall() in entry_32.S does. Is that
only done on PPC32?


With the following line at the beginning of system_call_exception(), it
works perfectly:


regs->orig_gpr3 = r3;

I will now focus on performance to see if we can do something about it.

Christophe


- the performance is definitely bad

On an 8xx, null_syscall test is about 30% slower after this patch:
- Without the patch: 284 cycles
- With the patch: 371 cycles

@nick and others, any suggestion to fix and improve ?

Signed-off-by: Christophe Leroy 
---
  arch/powerpc/include/asm/book3s/32/kup.h  |  21 ++
  .../powerpc/include/asm/book3s/64/kup-radix.h |  12 +-
  arch/powerpc/include/asm/hw_irq.h |  15 +
  arch/powerpc/include/asm/kup.h|   2 +
  arch/powerpc/include/asm/nohash/32/kup-8xx.h  |  13 +
  arch/powerpc/kernel/Makefile  |   5 +-
  arch/powerpc/kernel/entry_32.S| 259 ++
  arch/powerpc/kernel/head_32.h |   3 +-
  .../kernel/{syscall_64.c => syscall.c}|  25 +-
  9 files changed, 102 insertions(+), 253 deletions(-)
  rename arch/powerpc/kernel/{syscall_64.c => syscall.c} (97%)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h 
b/arch/powerpc/include/asm/book3s/32/kup.h
index 3c0ba22dc360..c85bc5b56366 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -102,6 +102,27 @@ static inline void kuap_update_sr(u32 sr, u32 addr, u32 
end)
isync();/* Context sync required after mtsrin() */
  }
  
+static inline void kuap_restore(struct pt_regs *regs)

+{
+   u32 kuap = current->thread.kuap;
+   u32 addr = kuap & 0xf000;
+   u32 end = kuap << 28;
+
+   if (unlikely(!kuap))
+   return;
+
+   current->thread.kuap = 0;
+   kuap_update_sr(mfsrin(addr) & ~SR_KS, addr, end);   /* Clear Ks */
+}
+
+static inline void kuap_check(void)
+{
+   if (!IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
+   return;
+
+   WARN_ON_ONCE(current->thread.kuap != 0);
+}
+
  static __always_inline void allow_user_access(void __user *to, const void 
__user *from,
  u32 size, unsigned long dir)
  {
diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h 
b/arch/powerpc/include/asm/book3s/64/kup-radix.h
index 3bcef989a35d..1f2716a0dcd8 100644
--- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h
@@ -60,13 +60,13 @@
  #include 
  #include 
  
-static inline void kuap_restore_amr(struct pt_regs *regs)

+static inline void kuap_restore(struct pt_regs *regs)
  {
if (mmu_has_feature(MMU_FTR_RADIX_KUAP))
mtspr(SPRN_AMR, regs->kuap);
  }
  
-static inline void kuap_check_amr(void)

+static inline void kuap_check(void)
  {
if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && 
mmu_has_feature(MMU_FTR_RADIX_KUAP))
WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
@@ -141,14 +141,6 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long 
address, bool is_write)
(regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : 
AMR_KUAP_BLOCK_READ)),
"Bug: %s fault blocked by AMR!", is_write ? "Write" : 
"Read");
  }
-#else /* CONFIG_PPC_KUAP */
-static inline void kuap_restore_amr(struct pt_regs *regs)
-{
-}
-
-static inline void kuap_check_amr(void)
-{
-}
  #endif /* CONFIG_PPC_KUAP */
  
  #endif /* __ASSEMBLY__ */

diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index e0e71777961f..6ccf07de6665 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -321,6 +321,16 @@ static inline void arch_local_irq_disable(void)
mtmsr(mfmsr() & ~MSR_EE);
  }
  
+static inline void arch_local_recovery_disable(void)

+{
+   if (IS_ENABLED(CONFIG_BOOKE))
+   wrtee(0);
+   else if (IS_ENABLED(CONFIG_PPC_8xx))
+   wrtspr(SPRN_NRI);
+   else
+   mtmsr(mfmsr() & ~(MSR_EE | MSR_RI));
+}
+
  static inline void arch_local_irq_enable(void)
  {
if (IS_ENABLED(CONFIG_BOOKE))
@@ -343,6 +353,11 @@ static inline bool arch_irqs_disabled(void)
  
  #define hard_irq_disable()		arch_local_irq_disable()
  
+#define __hard_irq_enable()		arch_local_irq_enable()

+#define __hard_irq_disable()   arch_local_irq_disable()
+#define __hard_EE_RI_disable() arch_local_recovery_disable()
+#define 

Re: [PATCH v2 0/2] powerpc: Remove support for ppc405/440 Xilinx platforms

2020-04-01 Thread Michal Simek
On 01. 04. 20 12:38, Takashi Iwai wrote:
> On Wed, 01 Apr 2020 12:35:16 +0200,
> Michael Ellerman wrote:
>>
>> Michal Simek  writes:
>>> On 01. 04. 20 4:07, Michael Ellerman wrote:
 Michal Simek  writes:
> Hi,
>
> recently we wanted to update xilinx intc driver and we found that function
> which we wanted to remove is still wired by ancient Xilinx PowerPC
> platforms. Here is the thread about it.
> https://lore.kernel.org/linux-next/48d3232d-0f1d-42ea-3109-f44bbabfa...@xilinx.com/
>
> I have been talking about it internally and there is no interest in these
> platforms and they have also been orphaned for quite a long time. No one is
> really running/testing these platforms regularly, that's why I think it makes
> sense to remove them, along with the drivers which are specific to this platform.
>
> U-Boot support was removed in 2017 without anybody complaining about it
> https://github.com/Xilinx/u-boot-xlnx/commit/98f705c9cefdfdba62c069821bbba10273a0a8ed
>
> Based on current ppc/next.
>
> If anyone has any objection about it, please let me know.

 Thanks for taking the time to find all this code and remove it.

 I'm not going to take this series for v5.7, it was posted too close to
 the merge window, and doing so wouldn't give people much time to object,
 especially given people are distracted at the moment.

 I'm happy to take it for v5.8, assuming there's no major objections.
>>>
>>> Sure. Just to let you know Christophe Leroy included this patch in his
>>> series about ppc405 removal. It should be the same.
>>>
>>> If you don't want to take that alsa patch I can send it separately and
>>> this patch can be taken from his series. I don't really mind but please
>>> let me know what way you prefer.
>>
>> It's better to keep it all together, so I'm happy to take the alsa patch as
>> well, it's already been acked.
> 
> Sure, please go ahead.


g8.

Thanks,
Michal


Re: [PATCH v3 3/3] powerpc test_emulate_step: add testcases for divde[.] and divdeu[.] instructions

2020-04-01 Thread Balamuruhan S
On Wed, 2020-04-01 at 16:26 +0530, Naveen N. Rao wrote:
> Balamuruhan S wrote:
> > add testcases for divde, divde., divdeu, divdeu. emulated
> > instructions to cover a few scenarios,
> > * with same dividend and divisor to have undefined RT
> >   for divdeu[.]
> > * with divide by zero to have undefined RT for both
> >   divde[.] and divdeu[.]
> > * with negative dividend to cover -|divisor| < r <= 0 if
> >   the dividend is negative for divde[.]
> > * normal case with proper dividend and divisor for both
> >   divde[.] and divdeu[.]
> > 
> > Reviewed-by: Sandipan Das 
> > Signed-off-by: Balamuruhan S 
> > ---
> >  arch/powerpc/lib/test_emulate_step.c | 164
> > +++
> >  1 file changed, 164 insertions(+)
> > 
> > diff --git a/arch/powerpc/lib/test_emulate_step.c
> > b/arch/powerpc/lib/test_emulate_step.c
> > index 42347067739c..ffeb9b68a31d 100644
> > --- a/arch/powerpc/lib/test_emulate_step.c
> > +++ b/arch/powerpc/lib/test_emulate_step.c
> > @@ -53,6 +53,14 @@
> > ___PPC_RA(a) | ___PPC_RB(b))
> >  #define TEST_ADDC_DOT(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | 
> > \
> > ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
> > +#define TEST_DIVDE(t, a, b)(PPC_INST_DIVDE | ___PPC_RT(t) |
> > \
> > +   ___PPC_RA(a) | ___PPC_RB(b))
> > +#define TEST_DIVDE_DOT(t, a, b)(PPC_INST_DIVDE | ___PPC_RT(t) |
> > \
> > +   ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
> > +#define TEST_DIVDEU(t, a, b)   (PPC_INST_DIVDEU | ___PPC_RT(t) |   
> > \
> > +   ___PPC_RA(a) | ___PPC_RB(b))
> > +#define TEST_DIVDEU_DOT(t, a, b)(PPC_INST_DIVDEU | ___PPC_RT(t) |  \
> > +   ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
> > 
> >  #define MAX_SUBTESTS   16
> > 
> > @@ -837,6 +845,162 @@ static struct compute_test compute_tests[] = {
> > }
> > }
> > }
> > +   },
> > +   {
> > +   .mnemonic = "divde",
> > +   .subtests = {
> > +   {
> > +   .descr = "RA = LONG_MIN, RB = LONG_MIN",
> > +   .instr = TEST_DIVDE(20, 21, 22),
> > +   .regs = {
> > +   .gpr[21] = LONG_MIN,
> > +   .gpr[22] = LONG_MIN,
> > +   }
> > +   },
> > +   {
> > +   .descr = "RA = 1L, RB = 0",
> > +   .instr = TEST_DIVDE(20, 21, 22),
> > +   .flags = IGNORE_GPR(20),
> > +   .regs = {
> > +   .gpr[21] = 1L,
> > +   .gpr[22] = 0,
> > +   }
> > +   },
> > +   {
> > +   .descr = "RA = LONG_MIN, RB = LONG_MAX",
> > +   .instr = TEST_DIVDE(20, 21, 22),
> > +   .regs = {
> > +   .gpr[21] = LONG_MIN,
> > +   .gpr[22] = LONG_MAX,
> > +   }
> > +   }
> > +   }
> > +   },
> > +   {
> > +   .mnemonic = "divde.",
> > +   .subtests = {
> > +   {
> > +   .descr = "RA = LONG_MIN, RB = LONG_MIN",
> > +   .instr = TEST_DIVDE_DOT(20, 21, 22),
> > +   .regs = {
> > +   .gpr[21] = LONG_MIN,
> > +   .gpr[22] = LONG_MIN,
> > +   }
> > +   },
> > +   {
> > +   .descr = "RA = 1L, RB = 0",
> > +   .instr = TEST_DIVDE_DOT(20, 21, 22),
> > +   .flags = IGNORE_GPR(20),
> > +   .regs = {
> > +   .gpr[21] = 1L,
> > +   .gpr[22] = 0,
> > +   }
> > +   },
> > +   {
> > +   .descr = "RA = LONG_MIN, RB = LONG_MAX",
> > +   .instr = TEST_DIVDE_DOT(20, 21, 22),
> > +   .regs = {
> > +   .gpr[21] = LONG_MIN,
> > +   .gpr[22] = LONG_MAX,
> > +   }
> > +   }
> > +   }
> > +   },
> > +   {
> > +   .mnemonic = "divdeu",
> > +   .subtests = {
> > +   {
> > +   .descr = "RA = LONG_MIN, RB = LONG_MIN",
> > +   .instr = TEST_DIVDEU(20, 21, 22),
> > +  

Re: [PATCH v3 3/3] powerpc test_emulate_step: add testcases for divde[.] and divdeu[.] instructions

2020-04-01 Thread Naveen N. Rao

Balamuruhan S wrote:

add testcases for divde, divde., divdeu, divdeu. emulated
instructions to cover a few scenarios,
* with same dividend and divisor to have undefined RT
  for divdeu[.]
* with divide by zero to have undefined RT for both
  divde[.] and divdeu[.]
* with negative dividend to cover -|divisor| < r <= 0 if
  the dividend is negative for divde[.]
* normal case with proper dividend and divisor for both
  divde[.] and divdeu[.]

Reviewed-by: Sandipan Das 
Signed-off-by: Balamuruhan S 
---
 arch/powerpc/lib/test_emulate_step.c | 164 +++
 1 file changed, 164 insertions(+)

diff --git a/arch/powerpc/lib/test_emulate_step.c 
b/arch/powerpc/lib/test_emulate_step.c
index 42347067739c..ffeb9b68a31d 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -53,6 +53,14 @@
___PPC_RA(a) | ___PPC_RB(b))
 #define TEST_ADDC_DOT(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \
___PPC_RA(a) | ___PPC_RB(b) | 0x1)
+#define TEST_DIVDE(t, a, b)(PPC_INST_DIVDE | ___PPC_RT(t) |\
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define TEST_DIVDE_DOT(t, a, b)(PPC_INST_DIVDE | ___PPC_RT(t) |
\
+   ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
+#define TEST_DIVDEU(t, a, b)   (PPC_INST_DIVDEU | ___PPC_RT(t) |   \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define TEST_DIVDEU_DOT(t, a, b)(PPC_INST_DIVDEU | ___PPC_RT(t) |  \
+   ___PPC_RA(a) | ___PPC_RB(b) | 0x1)

 #define MAX_SUBTESTS   16

@@ -837,6 +845,162 @@ static struct compute_test compute_tests[] = {
}
}
}
+   },
+   {
+   .mnemonic = "divde",
+   .subtests = {
+   {
+   .descr = "RA = LONG_MIN, RB = LONG_MIN",
+   .instr = TEST_DIVDE(20, 21, 22),
+   .regs = {
+   .gpr[21] = LONG_MIN,
+   .gpr[22] = LONG_MIN,
+   }
+   },
+   {
+   .descr = "RA = 1L, RB = 0",
+   .instr = TEST_DIVDE(20, 21, 22),
+   .flags = IGNORE_GPR(20),
+   .regs = {
+   .gpr[21] = 1L,
+   .gpr[22] = 0,
+   }
+   },
+   {
+   .descr = "RA = LONG_MIN, RB = LONG_MAX",
+   .instr = TEST_DIVDE(20, 21, 22),
+   .regs = {
+   .gpr[21] = LONG_MIN,
+   .gpr[22] = LONG_MAX,
+   }
+   }
+   }
+   },
+   {
+   .mnemonic = "divde.",
+   .subtests = {
+   {
+   .descr = "RA = LONG_MIN, RB = LONG_MIN",
+   .instr = TEST_DIVDE_DOT(20, 21, 22),
+   .regs = {
+   .gpr[21] = LONG_MIN,
+   .gpr[22] = LONG_MIN,
+   }
+   },
+   {
+   .descr = "RA = 1L, RB = 0",
+   .instr = TEST_DIVDE_DOT(20, 21, 22),
+   .flags = IGNORE_GPR(20),
+   .regs = {
+   .gpr[21] = 1L,
+   .gpr[22] = 0,
+   }
+   },
+   {
+   .descr = "RA = LONG_MIN, RB = LONG_MAX",
+   .instr = TEST_DIVDE_DOT(20, 21, 22),
+   .regs = {
+   .gpr[21] = LONG_MIN,
+   .gpr[22] = LONG_MAX,
+   }
+   }
+   }
+   },
+   {
+   .mnemonic = "divdeu",
+   .subtests = {
+   {
+   .descr = "RA = LONG_MIN, RB = LONG_MIN",
+   .instr = TEST_DIVDEU(20, 21, 22),
+   .flags = IGNORE_GPR(20),
+   .regs = {
+   .gpr[21] = LONG_MIN,
+   .gpr[22] = LONG_MIN,

Re: [PATCH v2 0/2] powerpc: Remove support for ppc405/440 Xilinx platforms

2020-04-01 Thread Takashi Iwai
On Wed, 01 Apr 2020 12:35:16 +0200,
Michael Ellerman wrote:
> 
> Michal Simek  writes:
> > On 01. 04. 20 4:07, Michael Ellerman wrote:
> >> Michal Simek  writes:
> >>> Hi,
> >>>
> >>> recently we wanted to update xilinx intc driver and we found that function
> >>> which we wanted to remove is still wired by ancient Xilinx PowerPC
> >>> platforms. Here is the thread about it.
> >>> https://lore.kernel.org/linux-next/48d3232d-0f1d-42ea-3109-f44bbabfa...@xilinx.com/
> >>>
> >>> I have been talking about it internally and there is no interest in these
> >>> platforms and they have also been orphaned for quite a long time. No one is
> >>> really running/testing these platforms regularly, that's why I think it makes
> >>> sense to remove them, along with the drivers which are specific to this platform.
> >>>
> >>> U-Boot support was removed in 2017 without anybody complaining about it
> >>> https://github.com/Xilinx/u-boot-xlnx/commit/98f705c9cefdfdba62c069821bbba10273a0a8ed
> >>>
> >>> Based on current ppc/next.
> >>>
> >>> If anyone has any objection about it, please let me know.
> >> 
> >> Thanks for taking the time to find all this code and remove it.
> >> 
> >> I'm not going to take this series for v5.7, it was posted too close to
> >> the merge window, and doing so wouldn't give people much time to object,
> >> especially given people are distracted at the moment.
> >> 
> >> I'm happy to take it for v5.8, assuming there's no major objections.
> >
> > Sure. Just to let you know Christophe Leroy included this patch in his
> > series about ppc405 removal. It should be the same.
> >
> > If you don't want to take that alsa patch I can send it separately and
> > this patch can be taken from his series. I don't really mind but please
> > let me know what way you prefer.
> 
> It's better to keep it all together, so I'm happy to take the alsa patch as
> well, it's already been acked.

Sure, please go ahead.


thanks,

Takashi


Re: [PATCH v2 0/2] powerpc: Remove support for ppc405/440 Xilinx platforms

2020-04-01 Thread Michael Ellerman
Michal Simek  writes:
> On 01. 04. 20 4:07, Michael Ellerman wrote:
>> Michal Simek  writes:
>>> Hi,
>>>
>>> recently we wanted to update xilinx intc driver and we found that function
>>> which we wanted to remove is still wired by ancient Xilinx PowerPC
>>> platforms. Here is the thread about it.
>>> https://lore.kernel.org/linux-next/48d3232d-0f1d-42ea-3109-f44bbabfa...@xilinx.com/
>>>
>>> I have been talking about it internally and there is no interest in these
>>> platforms and they have also been orphaned for quite a long time. No one is
>>> really running/testing these platforms regularly, that's why I think it makes
>>> sense to remove them, along with the drivers which are specific to this platform.
>>>
>>> U-Boot support was removed in 2017 without anybody complaining about it
>>> https://github.com/Xilinx/u-boot-xlnx/commit/98f705c9cefdfdba62c069821bbba10273a0a8ed
>>>
>>> Based on current ppc/next.
>>>
>>> If anyone has any objection about it, please let me know.
>> 
>> Thanks for taking the time to find all this code and remove it.
>> 
>> I'm not going to take this series for v5.7, it was posted too close to
>> the merge window, and doing so wouldn't give people much time to object,
>> especially given people are distracted at the moment.
>> 
>> I'm happy to take it for v5.8, assuming there's no major objections.
>
> Sure. Just to let you know Christophe Leroy included this patch in his
> series about ppc405 removal. It should be the same.
>
> If you don't want to take that alsa patch I can send it separately and
> this patch can be taken from his series. I don't really mind but please
> let me know what way you prefer.

It's better to keep it all together, so I'm happy to take the alsa patch as
well, it's already been acked.

cheers


Re: [PATCH v4 03/16] powerpc: Use a datatype for instructions

2020-04-01 Thread Balamuruhan S
On Fri, 2020-03-20 at 16:17 +1100, Jordan Niethe wrote:
> Currently unsigned ints are used to represent instructions on powerpc.
> This has worked well as instructions have always been 4 byte words.
> However, a future ISA version will introduce some changes to
> instructions that mean this scheme will no longer work as well. This
> change is Prefixed Instructions. A prefixed instruction is made up of a
> word prefix followed by a word suffix to make an 8 byte double word
> instruction. No matter the endianness of the system the prefix always
> comes first. Prefixed instructions are only planned for powerpc64.
> 
> Introduce a ppc_inst type to represent both prefixed and word
> instructions on powerpc64 while keeping it possible to exclusively have
> word instructions on powerpc32. A later patch will expand the type to
> include prefixed instructions but for now just typedef it to a u32.
> 
> Later patches will introduce helper functions and macros for
> manipulating the instructions so that powerpc64 and powerpc32 might
> maintain separate type definitions.
> 
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/include/asm/code-patching.h | 31 +--
>  arch/powerpc/include/asm/inst.h  | 53 +++
>  arch/powerpc/include/asm/sstep.h |  5 +-
>  arch/powerpc/kernel/align.c  |  2 +-
>  arch/powerpc/kernel/hw_breakpoint.c  |  3 +-
>  arch/powerpc/kernel/kprobes.c|  2 +-
>  arch/powerpc/kernel/mce_power.c  |  5 +-
>  arch/powerpc/kernel/optprobes.c  | 10 ++--
>  arch/powerpc/kernel/trace/ftrace.c   | 66 
>  arch/powerpc/kvm/emulate_loadstore.c |  1 +
>  arch/powerpc/lib/code-patching.c | 54 +--
>  arch/powerpc/lib/sstep.c |  4 +-
>  arch/powerpc/lib/test_emulate_step.c |  9 ++--
>  arch/powerpc/xmon/xmon.c | 12 ++---
>  14 files changed, 160 insertions(+), 97 deletions(-)
>  create mode 100644 arch/powerpc/include/asm/inst.h
> 
> diff --git a/arch/powerpc/include/asm/code-patching.h
> b/arch/powerpc/include/asm/code-patching.h
> index 898b54262881..cb5106f92d67 100644
> --- a/arch/powerpc/include/asm/code-patching.h
> +++ b/arch/powerpc/include/asm/code-patching.h
> @@ -11,6 +11,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  /* Flags for create_branch:
>   * "b"   == create_branch(addr, target, 0);
> @@ -22,27 +23,27 @@
>  #define BRANCH_ABSOLUTE  0x2
>  
>  bool is_offset_in_branch_range(long offset);
> -unsigned int create_branch(const unsigned int *addr,
> +ppc_inst create_branch(const ppc_inst *addr,
>  unsigned long target, int flags);
> -unsigned int create_cond_branch(const unsigned int *addr,
> +unsigned int create_cond_branch(const ppc_inst *addr,
>   unsigned long target, int flags);
> -int patch_branch(unsigned int *addr, unsigned long target, int flags);
> -int patch_instruction(unsigned int *addr, unsigned int instr);
> -int raw_patch_instruction(unsigned int *addr, unsigned int instr);
> +int patch_branch(ppc_inst *addr, unsigned long target, int flags);
> +int patch_instruction(ppc_inst *addr, ppc_inst instr);

we need to handle this change for its user in epapr_paravirt.c,

arch/powerpc/kernel/epapr_paravirt.c: In function 'early_init_dt_scan_epapr':
arch/powerpc/kernel/epapr_paravirt.c:40:48: error: incompatible type for
argument 2 of 'patch_instruction'
   40 |   patch_instruction(epapr_hypercall_start + i, inst);
  |^~~~
  ||
  |u32 {aka unsigned int}
In file included from arch/powerpc/kernel/epapr_paravirt.c:12:
./arch/powerpc/include/asm/code-patching.h:31:44: note: expected 'ppc_inst'
{aka 'struct ppc_inst'} but argument is of type 'u32' {aka 'unsigned int'}
   31 | int patch_instruction(void *addr, ppc_inst instr);
  |   ~^
make[2]: *** [scripts/Makefile.build:268: arch/powerpc/kernel/epapr_paravirt.o]
Error 1
make[1]: *** [scripts/Makefile.build:505: arch/powerpc/kernel] Error 2
make: *** [Makefile:1683: arch/powerpc] Error 2


-- Bala

> +int raw_patch_instruction(ppc_inst *addr, ppc_inst instr);
>  
>  static inline unsigned long patch_site_addr(s32 *site)
>  {
>   return (unsigned long)site + *site;
>  }
>  
> -static inline int patch_instruction_site(s32 *site, unsigned int instr)
> +static inline int patch_instruction_site(s32 *site, ppc_inst instr)
>  {
> - return patch_instruction((unsigned int *)patch_site_addr(site), instr);
> + return patch_instruction((ppc_inst *)patch_site_addr(site), instr);
>  }
>  
>  static inline int patch_branch_site(s32 *site, unsigned long target, int
> flags)
>  {
> - return patch_branch((unsigned int *)patch_site_addr(site), target,
> flags);
> + return patch_branch((ppc_inst 

  1   2   >