date:20200910

Re: [PATCH 7/7] powerpc/32: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy





Le 10/09/2020 à 23:02, Cédric Le Goater a écrit :

CC  arch/powerpc/kernel/traps.o
../arch/powerpc/kernel/traps.c:1663:6: error: no previous prototype for 
‘stack_overflow_exception’ [-Werror=missing-prototypes]
  void stack_overflow_exception(struct pt_regs *regs)
   ^~~~



A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code the know what the commit does to fix the 
warning. Also the subject should be more explicit.




Cc: Christophe Leroy 


Reviewed-by: Christophe Leroy 



Fixes: 3978eb78517c ("powerpc/32: Add early stack overflow detection with VMAP 
stack.")
Signed-off-by: Cédric Le Goater 
---
  arch/powerpc/include/asm/asm-prototypes.h | 1 +


Note that asm-prototypes.h is not the right place for such a prototype, 
but that's probably for another cleanup patch. See discussion at 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/1463534212-4879-2-git-send-email-...@axtens.net/


Christophe



  1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index de14b1a34d56..4957119604c7 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -67,6 +67,7 @@ void single_step_exception(struct pt_regs *regs);
  void program_check_exception(struct pt_regs *regs);
  void alignment_exception(struct pt_regs *regs);
  void StackOverflow(struct pt_regs *regs);
+void stack_overflow_exception(struct pt_regs *regs);
  void kernel_fp_unavailable_exception(struct pt_regs *regs);
  void altivec_unavailable_exception(struct pt_regs *regs);
  void vsx_unavailable_exception(struct pt_regs *regs);

Re: [PATCH 6/7] powerpc/perf: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy





Le 10/09/2020 à 23:02, Cédric Le Goater a écrit :

CC  arch/powerpc/perf/imc-pmu.o
../arch/powerpc/perf/imc-pmu.c: In function ‘trace_imc_event_init’:
../arch/powerpc/perf/imc-pmu.c:1429:22: error: variable ‘target’ set but not 
used [-Werror=unused-but-set-variable]
   struct task_struct *target;
   ^~


A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code the know what the commit does to fix the 
warning. Also the subject should be more explicit.




Cc: Anju T Sudhakar 
Signed-off-by: Cédric Le Goater 


Reviewed-by: Christophe Leroy 


---
  arch/powerpc/perf/imc-pmu.c | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 62d0b54086f8..9ed4fcccf8a9 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1426,8 +1426,6 @@ static void trace_imc_event_del(struct perf_event *event, 
int flags)
  
  static int trace_imc_event_init(struct perf_event *event)

  {
-   struct task_struct *target;
-
if (event->attr.type != event->pmu->type)
return -ENOENT;
  
@@ -1458,7 +1456,6 @@ static int trace_imc_event_init(struct perf_event *event)

mutex_unlock(_global_refc.lock);
  
  	event->hw.idx = -1;

-   target = event->hw.target;
  
  	event->pmu->task_ctx_nr = perf_hw_context;

event->destroy = reset_global_refc;

Re: [PATCH 5/7] powerpc/powernv/pci: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy





Le 10/09/2020 à 23:02, Cédric Le Goater a écrit :

   CC  arch/powerpc/platforms/powernv/pci-ioda.o
../arch/powerpc/platforms/powernv/pci-ioda.c: In function 
‘pnv_ioda_configure_pe’:
../arch/powerpc/platforms/powernv/pci-ioda.c:897:18: error: variable ‘parent’ 
set but not used [-Werror=unused-but-set-variable]
   struct pci_dev *parent;
   ^~



A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code the know what the commit does to fix the 
warning. Also the subject should be more explicit.




Cc: Oliver O'Halloran 
Signed-off-by: Cédric Le Goater 


Reviewed-by: Christophe Leroy 


---
  arch/powerpc/platforms/powernv/pci-ioda.c | 8 
  1 file changed, 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 023a4f987bb2..2b4ceb5e6ce4 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -894,7 +894,6 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct 
pnv_ioda_pe *pe)
  
  int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)

  {
-   struct pci_dev *parent;
uint8_t bcomp, dcomp, fcomp;
long rc, rid_end, rid;
  
@@ -904,7 +903,6 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
  
  		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;

fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
-   parent = pe->pbus->self;
if (pe->flags & PNV_IODA_PE_BUS_ALL)
count = resource_size(>pbus->busn_res);
else
@@ -925,12 +923,6 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct 
pnv_ioda_pe *pe)
}
rid_end = pe->rid + (count << 8);
} else {
-#ifdef CONFIG_PCI_IOV
-   if (pe->flags & PNV_IODA_PE_VF)
-   parent = pe->parent_dev;
-   else
-#endif /* CONFIG_PCI_IOV */
-   parent = pe->pdev->bus->self;
bcomp = OpalPciBusAll;
dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;

Re: [PATCH 4/7] powerpc/xive: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy





Le 10/09/2020 à 23:02, Cédric Le Goater a écrit :

CC  arch/powerpc/sysdev/xive/common.o
../arch/powerpc/sysdev/xive/common.c:1568:6: error: no previous prototype for 
‘xive_debug_show_cpu’ [-Werror=missing-prototypes]
  void xive_debug_show_cpu(struct seq_file *m, int cpu)
   ^~~
../arch/powerpc/sysdev/xive/common.c:1602:6: error: no previous prototype for 
‘xive_debug_show_irq’ [-Werror=missing-prototypes]
  void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d)
   ^~~



A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code the know what the commit does to fix the 
warning. Also the subject should be more explicit.


There are two ways of fixing it:
- Add the missing prototype
- Make it static

You chose the second alternative, this needs to be told in the commit log.



Signed-off-by: Cédric Le Goater 


Reviewed-by: Christophe Leroy 


---
  arch/powerpc/sysdev/xive/common.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index f591be9f01f4..a80440af491a 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1565,7 +1565,7 @@ static int __init xive_off(char *arg)
  }
  __setup("xive=off", xive_off);
  
-void xive_debug_show_cpu(struct seq_file *m, int cpu)

+static void xive_debug_show_cpu(struct seq_file *m, int cpu)
  {
struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
  
@@ -1599,7 +1599,7 @@ void xive_debug_show_cpu(struct seq_file *m, int cpu)

seq_puts(m, "\n");
  }
  
-void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d)

+static void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct 
irq_data *d)
  {
struct irq_chip *chip = irq_data_get_irq_chip(d);
int rc;

Re: [PATCH 3/7] powerpc/sstep: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy





Le 10/09/2020 à 23:02, Cédric Le Goater a écrit :

../arch/powerpc/lib/sstep.c: In function ‘mlsd_8lsd_ea’:
../arch/powerpc/lib/sstep.c:225:3: error: suggest braces around empty body in 
an ‘if’ statement [-Werror=empty-body]
; /* Invalid form. Should already be checked for by caller! */
^


A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code the know what the commit does to fix the 
warning. Also the subject should be more explicit.






Cc: Jordan Niethe 
Signed-off-by: Cédric Le Goater 
---
  arch/powerpc/lib/sstep.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index caee8cc77e19..14572af16e55 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -221,8 +221,9 @@ static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned 
int instr,
; /* Leave ea as is */
else if (prefix_r && !ra)
ea += regs->nip;
-   else if (prefix_r && ra)
+   else if (prefix_r && ra) {
; /* Invalid form. Should already be checked for by caller! */
+   }


You can't do that. Now checkpatch will complain that you don't have 
braces on all legs of the if/else dance.


I think the last 'else if' should simply be removed entirely as it does 
nothing. Eventually, just leave the comment, something like:


	/* (prefix_r && ra) is Invalid form. Should already be checked for by 
caller! */


And if (prefix_r && ra) is not possible, then the previous if should 
just be 'if (prefx_r)'


Christophe


  
  	return ea;

  }

Re: [PATCH 2/7] powerpc/prom: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy





Le 10/09/2020 à 23:02, Cédric Le Goater a écrit :

arch/powerpc/kernel/prom.c: In function ‘early_reserve_mem’:
arch/powerpc/kernel/prom.c:625:10: error: variable ‘reserve_map’ set but not 
used [-Werror=unused-but-set-variable]
   __be64 *reserve_map;
   ^~~
cc1: all warnings being treated as errors


A small sentence explaining how this is fixes would be welcome, so that 
you don't need to read the code the know what the commit does to fix the 
warning. Also the subject should be more explicit.





Cc: Christophe Leroy 
Signed-off-by: Cédric Le Goater 
---
  arch/powerpc/kernel/prom.c | 51 +++---
  1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index d8a2fb87ba0c..4bae9ebc7d0b 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -622,11 +622,6 @@ static void __init early_reserve_mem_dt(void)
  
  static void __init early_reserve_mem(void)

  {
-   __be64 *reserve_map;
-
-   reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
-   fdt_off_mem_rsvmap(initial_boot_params));
-
/* Look for the new "reserved-regions" property in the DT */
early_reserve_mem_dt();
  
@@ -639,28 +634,34 @@ static void __init early_reserve_mem(void)

}
  #endif /* CONFIG_BLK_DEV_INITRD */
  
-#ifdef CONFIG_PPC32


Instead of such a big change, you could simply do the following in 
addition to the move of reserve_map allocation after it.


if (!IS_ENABLED(CONFIG_PPC32))
return;


-   /*
-* Handle the case where we might be booting from an old kexec
-* image that setup the mem_rsvmap as pairs of 32-bit values
-*/
-   if (be64_to_cpup(reserve_map) > 0xull) {
-   u32 base_32, size_32;
-   __be32 *reserve_map_32 = (__be32 *)reserve_map;
-
-   DBG("Found old 32-bit reserve map\n");
-
-   while (1) {
-   base_32 = be32_to_cpup(reserve_map_32++);
-   size_32 = be32_to_cpup(reserve_map_32++);
-   if (size_32 == 0)
-   break;
-   DBG("reserving: %x -> %x\n", base_32, size_32);
-   memblock_reserve(base_32, size_32);
+   if (IS_ENABLED(CONFIG_PPC32)) {
+   __be64 *reserve_map;
+
+   reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
+fdt_off_mem_rsvmap(initial_boot_params));
+
+   /*
+* Handle the case where we might be booting from an
+* old kexec image that setup the mem_rsvmap as pairs
+* of 32-bit values
+*/
+   if (be64_to_cpup(reserve_map) > 0xull) {
+   u32 base_32, size_32;
+   __be32 *reserve_map_32 = (__be32 *)reserve_map;
+
+   DBG("Found old 32-bit reserve map\n");
+
+   while (1) {
+   base_32 = be32_to_cpup(reserve_map_32++);
+   size_32 = be32_to_cpup(reserve_map_32++);
+   if (size_32 == 0)
+   break;
+   DBG("reserving: %x -> %x\n", base_32, size_32);
+   memblock_reserve(base_32, size_32);
+   }
+   return;
}
-   return;
}
-#endif
  }
  
  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM





Christophe

Re: [PATCH 1/7] powerpc/sysfs: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy





Le 10/09/2020 à 23:02, Cédric Le Goater a écrit :

arch/powerpc/kernel/sysfs.c: In function ‘sysfs_create_dscr_default’:
arch/powerpc/kernel/sysfs.c:228:7: error: variable ‘err’ set but not used 
[-Werror=unused-but-set-variable]
int err = 0;
^~~
cc1: all warnings being treated as errors


A small sentence explaining how this is fixes would be welcome, so that 
you don't need to read the code the know what the commit does to fix the 
warning. Even the subject should be more explicite, rather than saying 
"Fix W=1 compile warning", I think it should say something like "remove 
unused err variable"





Cc: Madhavan Srinivasan 
Signed-off-by: Cédric Le Goater 


Reviewed-by: Christophe Leroy 


---
  arch/powerpc/kernel/sysfs.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 46b4ebc33db7..821a3dc4c924 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -225,14 +225,13 @@ static DEVICE_ATTR(dscr_default, 0600,
  static void sysfs_create_dscr_default(void)
  {
if (cpu_has_feature(CPU_FTR_DSCR)) {
-   int err = 0;
int cpu;
  
  		dscr_default = spr_default_dscr;

for_each_possible_cpu(cpu)
paca_ptrs[cpu]->dscr_default = dscr_default;
  
-		err = device_create_file(cpu_subsys.dev_root, _attr_dscr_default);

+   device_create_file(cpu_subsys.dev_root, _attr_dscr_default);
}
  }
  #endif /* CONFIG_PPC64 */

Re: [PATCH v4 13/13] mm/debug_vm_pgtable: Avoid none pte in pte_clear_test

2020-09-10 Thread Aneesh Kumar K.V

Nathan Chancellor  writes:

> On Wed, Sep 02, 2020 at 05:12:22PM +0530, Aneesh Kumar K.V wrote:
>> pte_clear_tests operate on an existing pte entry. Make sure that
>> is not a none pte entry.
>> 
>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>>  mm/debug_vm_pgtable.c | 7 ---
>>  1 file changed, 4 insertions(+), 3 deletions(-)
>> 
>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>> index 9afa1354326b..c36530c69e33 100644
>> --- a/mm/debug_vm_pgtable.c
>> +++ b/mm/debug_vm_pgtable.c
>> @@ -542,9 +542,10 @@ static void __init pgd_populate_tests(struct mm_struct 
>> *mm, pgd_t *pgdp,
>>  #endif /* PAGETABLE_P4D_FOLDED */
>>  
>>  static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
>> -   unsigned long vaddr)
>> +   unsigned long pfn, unsigned long vaddr,
>> +   pgprot_t prot)
>>  {
>> -pte_t pte = ptep_get(ptep);
>> +pte_t pte = pfn_pte(pfn, prot);
>>  
>>  pr_debug("Validating PTE clear\n");
>>  pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
>> @@ -1049,7 +1050,7 @@ static int __init debug_vm_pgtable(void)
>>  
>>  ptl = pte_lockptr(mm, pmdp);
>>  spin_lock(ptl);
>> -pte_clear_tests(mm, ptep, vaddr);
>> +pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
>>  pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
>>  pte_unmap_unlock(ptep, ptl);
>>  
>> -- 
> This patch causes a panic at boot for RISC-V defconfig. The rootfs is here if 
> it is needed:
> https://github.com/ClangBuiltLinux/boot-utils/blob/3b21a5b71451742866349ba4f18638c5a754e660/images/riscv/rootfs.cpio.zst
>
> $ make -skj"$(nproc)" ARCH=riscv CROSS_COMPILE=riscv64-linux- O=out/riscv 
> distclean defconfig Image
>
> $ qemu-system-riscv64 -bios default -M virt -display none -initrd rootfs.cpio 
> -kernel Image -m 512m -nodefaults -serial mon:stdio
> ...
>
> OpenSBI v0.6
>_  _
>   / __ \  / |  _ \_   _|
>  | |  | |_ __   ___ _ __ | (___ | |_) || |
>  | |  | | '_ \ / _ \ '_ \ \___ \|  _ < | |
>  | |__| | |_) |  __/ | | |) | |_) || |_
>   \/| .__/ \___|_| |_|_/|/_|
> | |
> |_|
>
> Platform Name  : QEMU Virt Machine
> Platform HART Features : RV64ACDFIMSU
> Platform Max HARTs : 8
> Current Hart   : 0
> Firmware Base  : 0x8000
> Firmware Size  : 120 KB
> Runtime SBI Version: 0.2
>
> MIDELEG : 0x0222
> MEDELEG : 0xb109
> PMP0: 0x8000-0x8001 (A)
> PMP1: 0x-0x (A,R,W,X)
> [0.00] Linux version 5.9.0-rc4-next-20200910 
> (nathan@ubuntu-n2-xlarge-x86) (riscv64-linux-gcc (GCC) 10.2.0, GNU ld (GNU 
> Binutils) 2.35) #1 SMP Thu Sep 10 19:10:43 MST 2020
> ...
> [0.294593] NET: Registered protocol family 17
> [0.295781] 9pnet: Installing 9P2000 support
> [0.296153] Key type dns_resolver registered
> [0.296694] debug_vm_pgtable: [debug_vm_pgtable ]: Validating 
> architecture page table helpers
> [0.297635] Unable to handle kernel paging request at virtual address 
> 0a7fffe01dafefc8
> [0.298029] Oops [#1]
> [0.298153] Modules linked in:
> [0.298433] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
> 5.9.0-rc4-next-20200910 #1
> [0.298792] epc: ffe000205afc ra : ffe0008be0aa sp : 
> ffe01ae73d40
> [0.299078]  gp : ffe0010b9b48 tp : ffe01ae68000 t0 : 
> ffe008152000
> [0.299362]  t1 :  t2 :  s0 : 
> ffe01ae73d60
> [0.299648]  s1 : bffb a0 : 0a7fffe01dafefc8 a1 : 
> bffb
> [0.299948]  a2 : ffe0010a2698 a3 : 0001 a4 : 
> 0003
> [0.300231]  a5 : 0800 a6 : f080 a7 : 
> 1b642000
> [0.300521]  s2 : ffe0081517b8 s3 : ffe008150a80 s4 : 
> ffe01af3
> [0.300806]  s5 : ffe01f8ca9b8 s6 : ffe00815 s7 : 
> ffe0010bb100
> [0.301161]  s8 : ffe0010bb108 s9 : 00080202 s10: 
> ffe0010bb928
> [0.301481]  s11: 2008085b t3 :  t4 : 
> 
> [0.301722]  t5 :  t6 : ffe00815
> [0.301947] status: 0120 badaddr: 0a7fffe01dafefc8 cause: 
> 000f
> [0.302569] ---[ end trace 7ffb153d816164cf ]---
> [0.302797] note: swapper/0[1] exited with preempt_count 1
> [0.303101] Kernel panic - not

Re: [PATCH v1] pseries/hotplug-memory: hot-add: skip redundant LMB lookup

2020-09-10 Thread kernel test robot

Hi Scott,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on next-20200910]
[cannot apply to v5.9-rc4]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Scott-Cheloha/pseries-hotplug-memory-hot-add-skip-redundant-LMB-lookup/20200911-015744
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-randconfig-r034-20200911 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 
0448d11a06b451a63a8f60408fec613ad24801ba)
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# install powerpc cross compiling tool for clang build
# apt-get install binutils-powerpc-linux-gnu
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All warnings (new ones prefixed by >>):

>> arch/powerpc/mm/numa.c:433:5: warning: no previous prototype for function 
>> 'of_drconf_to_nid_single' [-Wmissing-prototypes]
   int of_drconf_to_nid_single(struct drmem_lmb *lmb)
   ^
   arch/powerpc/mm/numa.c:433:1: note: declare 'static' if the function is not 
intended to be used outside of this translation unit
   int of_drconf_to_nid_single(struct drmem_lmb *lmb)
   ^
   static 
   1 warning generated.

# 
https://github.com/0day-ci/linux/commit/e6847f3f58460841a28885578cc3547735cfa50c
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Scott-Cheloha/pseries-hotplug-memory-hot-add-skip-redundant-LMB-lookup/20200911-015744
git checkout e6847f3f58460841a28885578cc3547735cfa50c
vim +/of_drconf_to_nid_single +433 arch/powerpc/mm/numa.c

   428  
   429  /*
   430   * This is like of_node_to_nid_single() for memory represented in the
   431   * ibm,dynamic-reconfiguration-memory node.
   432   */
 > 433  int of_drconf_to_nid_single(struct drmem_lmb *lmb)
   434  {
   435  struct assoc_arrays aa = { .arrays = NULL };
   436  int default_nid = NUMA_NO_NODE;
   437  int nid = default_nid;
   438  int rc, index;
   439  
   440  if ((min_common_depth < 0) || !numa_enabled)
   441  return default_nid;
   442  
   443  rc = of_get_assoc_arrays();
   444  if (rc)
   445  return default_nid;
   446  
   447  if (min_common_depth <= aa.array_sz &&
   448  !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
   449  index = lmb->aa_index * aa.array_sz + min_common_depth 
- 1;
   450  nid = of_read_number([index], 1);
   451  
   452  if (nid == 0x || nid >= nr_node_ids)
   453  nid = default_nid;
   454  
   455  if (nid > 0) {
   456  index = lmb->aa_index * aa.array_sz;
   457  initialize_distance_lookup_table(nid,
   458  
[index]);
   459  }
   460  }
   461  
   462  return nid;
   463  }
   464  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip

[PATCH] powerpc/kasan: Fix CONFIG_KASAN_VMALLOC for 8xx

2020-09-10 Thread Christophe Leroy

Before the commit identified below, pages tables allocation was
performed after the allocation of final shadow area for linear memory.
But that commit switched the order, leading to page tables being
already allocated at the time 8xx kasan_init_shadow_8M() is called.
Due to this, kasan_init_shadow_8M() doesn't map the needed
shadow entries because there are already page tables.

kasan_init_shadow_8M() installs huge PMD entries instead of page
tables. We could at that time free the page tables, but there is no
point in creating page tables that get freed before being used.

Only book3s/32 hash needs early allocation of page tables. For other
variants, we can keep the initial order and create remaining page
tables after the allocation of final shadow memory for linear mem.

Move back the allocation of shadow page tables for
CONFIG_KASAN_VMALLOC into kasan_init() after the loop which creates
final shadow memory for linear mem.

Fixes: 41ea93cf7ba4 ("powerpc/kasan: Fix shadow pages allocation failure")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/kasan/kasan_init_32.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c 
b/arch/powerpc/mm/kasan/kasan_init_32.c
index fb294046e00e..929716ea21e9 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -127,8 +127,7 @@ void __init kasan_mmu_init(void)
 {
int ret;
 
-   if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ||
-   IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
+   if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, 
KASAN_SHADOW_END);
 
if (ret)
@@ -139,11 +138,11 @@ void __init kasan_mmu_init(void)
 void __init kasan_init(void)
 {
struct memblock_region *reg;
+   int ret;
 
for_each_memblock(memory, reg) {
phys_addr_t base = reg->base;
phys_addr_t top = min(base + reg->size, total_lowmem);
-   int ret;
 
if (base >= top)
continue;
@@ -153,6 +152,13 @@ void __init kasan_init(void)
panic("kasan: kasan_init_region() failed");
}
 
+   if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
+   ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, 
KASAN_SHADOW_END);
+
+   if (ret)
+   panic("kasan: kasan_init_shadow_page_tables() failed");
+   }
+
kasan_remap_early_shadow_ro();
 
clear_page(kasan_early_shadow_page);
-- 
2.25.0

Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-10 Thread Joe Perches

On Thu, 2020-09-10 at 15:21 +0100, Robin Murphy wrote:
> On 2020-09-09 21:06, Joe Perches wrote:
> > fallthrough to a separate case/default label break; isn't very readable.
> > 
> > Convert pseudo-keyword fallthrough; statements to a simple break; when
> > the next label is case or default and the only statement in the next
> > label block is break;
> > 
> > Found using:
> > 
> > $ grep-2.5.4 -rP --include=*.[ch] -n 
> > "fallthrough;(\s*(case\s+\w+|default)\s*:\s*){1,7}break;" *
> > 
> > Miscellanea:
> > 
> > o Move or coalesce a couple label blocks above a default: block.
> > 
> > Signed-off-by: Joe Perches 
> > ---
> > 
> > Compiled allyesconfig x86-64 only.
> > A few files for other arches were not compiled.
> > 
> 
> [...]
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
> > b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > index c192544e874b..743db1abec40 100644
> > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> > @@ -3777,7 +3777,7 @@ static int arm_smmu_device_hw_probe(struct 
> > arm_smmu_device *smmu)
> > switch (FIELD_GET(IDR0_TTF, reg)) {
> > case IDR0_TTF_AARCH32_64:
> > smmu->ias = 40;
> > -   fallthrough;
> > +   break;
> > case IDR0_TTF_AARCH64:
> > break;
> > default:
> 
> I have to say I don't really agree with the readability argument for 
> this one - a fallthrough is semantically correct here, since the first 
> case is a superset of the second. It just happens that anything we would 
> do for the common subset is implicitly assumed (there are other 
> potential cases we simply haven't added support for at the moment), thus 
> the second case is currently empty.
> This change actively obfuscates that distinction.

Then perhaps comments should be added to usefully
describe the mechanisms.

case IDR0_TTF_AARCH32_64:
smmu->ias = 40;
fallthrough;/* and still do the 64 bit processing */
case IDR0_TTF_AARCH64:
/* Nothing specific yet */
break;

> Robin.

[PATCH] KVM: PPC: Book3S HV: Do not allocate HPT for a nested guest

2020-09-10 Thread Fabiano Rosas

The current nested KVM code does not support HPT guests. This is
informed/enforced in some ways:

- Hosts < P9 will not be able to enable the nested HV feature;

- The nested hypervisor MMU capabilities will not contain
  KVM_CAP_PPC_MMU_HASH_V3;

- QEMU reflects the MMU capabilities in the
  'ibm,arch-vec-5-platform-support' device-tree property;

- The nested guest, at 'prom_parse_mmu_model' ignores the
  'disable_radix' kernel command line option if HPT is not supported;

- The KVM_PPC_CONFIGURE_V3_MMU ioctl will fail if trying to use HPT.

There is, however, still a way to start a HPT guest by using
max-compat-cpu=power8 at the QEMU machine options. This leads to the
guest being set to use hash after QEMU calls the KVM_PPC_ALLOCATE_HTAB
ioctl.

With the guest set to hash, the nested hypervisor goes through the
entry path that has no knowledge of nesting (kvmppc_run_vcpu) and
crashes when it tries to execute an hypervisor-privileged (mtspr
HDEC) instruction at __kvmppc_vcore_entry:

root@L1:~ $ qemu-system-ppc64 -machine pseries,max-cpu-compat=power8 ...


[  538.543303] CPU: 83 PID: 25185 Comm: CPU 0/KVM Not tainted 5.9.0-rc4 #1
[  538.543355] NIP:  c0080753f388 LR: c0080753f368 CTR: c01e5ec0
[  538.543417] REGS: c013e91e33b0 TRAP: 0700   Not tainted  (5.9.0-rc4)
[  538.543470] MSR:  82843033   CR: 
22422882  XER: 2004
[  538.543546] CFAR: c0080753f4b0 IRQMASK: 3
   GPR00: c008075397a0 c013e91e3640 c0080755e600 
8000
   GPR04:  c013eab19800 c01394de 
0043a054db72
   GPR08: 003b1652   
c008075502e0
   GPR12: c01e5ec0 c007ffa74200 c013eab19800 
0008
   GPR16:  c0139676c6c0 c1d23948 
c013e91e38b8
   GPR20: 0053  0001 

   GPR24: 0001 0001  
0001
   GPR28: 0001 0053 c013eab19800 
0001
[  538.544067] NIP [c0080753f388] __kvmppc_vcore_entry+0x90/0x104 [kvm_hv]
[  538.544121] LR [c0080753f368] __kvmppc_vcore_entry+0x70/0x104 [kvm_hv]
[  538.544173] Call Trace:
[  538.544196] [c013e91e3640] [c013e91e3680] 0xc013e91e3680 
(unreliable)
[  538.544260] [c013e91e3820] [c008075397a0] 
kvmppc_run_core+0xbc8/0x19d0 [kvm_hv]
[  538.544325] [c013e91e39e0] [c0080753d99c] 
kvmppc_vcpu_run_hv+0x404/0xc00 [kvm_hv]
[  538.544394] [c013e91e3ad0] [c008072da4fc] kvmppc_vcpu_run+0x34/0x48 
[kvm]
[  538.544472] [c013e91e3af0] [c008072d61b8] 
kvm_arch_vcpu_ioctl_run+0x310/0x420 [kvm]
[  538.544539] [c013e91e3b80] [c008072c7450] kvm_vcpu_ioctl+0x298/0x778 
[kvm]
[  538.544605] [c013e91e3ce0] [c04b8c2c] sys_ioctl+0x1dc/0xc90
[  538.544662] [c013e91e3dc0] [c002f9a4] 
system_call_exception+0xe4/0x1c0
[  538.544726] [c013e91e3e20] [c000d140] 
system_call_common+0xf0/0x27c
[  538.544787] Instruction dump:
[  538.544821] f86d1098 6000 6000 4899 e8ad0fe8 e8c500a0 e9264140 
75290002
[  538.544886] 7d1602a6 7cec42a6 40820008 7d0807b4 <7d164ba6> 7d083a14 f90d10a0 
480104fd
[  538.544953] ---[ end trace 74423e2b948c2e0c ]---

This patch makes the KVM_PPC_ALLOCATE_HTAB ioctl fail when running in
the nested hypervisor, causing QEMU to abort.

Reported-by: Satheesh Rajendran 
Signed-off-by: Fabiano Rosas 
---
 arch/powerpc/kvm/book3s_hv.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 4ba06a2a306c..764b6239ef72 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5250,6 +5250,12 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
case KVM_PPC_ALLOCATE_HTAB: {
u32 htab_order;
 
+   /* If we're a nested hypervisor, we currently only support 
radix */
+   if (kvmhv_on_pseries()) {
+   r = -EOPNOTSUPP;
+   break;
+   }
+
r = -EFAULT;
if (get_user(htab_order, (u32 __user *)argp))
break;
-- 
2.25.4

Re: [PATCH v4 13/13] mm/debug_vm_pgtable: Avoid none pte in pte_clear_test

2020-09-10 Thread Nathan Chancellor

On Wed, Sep 02, 2020 at 05:12:22PM +0530, Aneesh Kumar K.V wrote:
> pte_clear_tests operate on an existing pte entry. Make sure that
> is not a none pte entry.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 9afa1354326b..c36530c69e33 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -542,9 +542,10 @@ static void __init pgd_populate_tests(struct mm_struct 
> *mm, pgd_t *pgdp,
>  #endif /* PAGETABLE_P4D_FOLDED */
>  
>  static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
> -unsigned long vaddr)
> +unsigned long pfn, unsigned long vaddr,
> +pgprot_t prot)
>  {
> - pte_t pte = ptep_get(ptep);
> + pte_t pte = pfn_pte(pfn, prot);
>  
>   pr_debug("Validating PTE clear\n");
>   pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
> @@ -1049,7 +1050,7 @@ static int __init debug_vm_pgtable(void)
>  
>   ptl = pte_lockptr(mm, pmdp);
>   spin_lock(ptl);
> - pte_clear_tests(mm, ptep, vaddr);
> + pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
>   pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
>   pte_unmap_unlock(ptep, ptl);
>  
> -- 
> 2.26.2
> 
> 

This patch causes a panic at boot for RISC-V defconfig. The rootfs is here if 
it is needed:
https://github.com/ClangBuiltLinux/boot-utils/blob/3b21a5b71451742866349ba4f18638c5a754e660/images/riscv/rootfs.cpio.zst

$ make -skj"$(nproc)" ARCH=riscv CROSS_COMPILE=riscv64-linux- O=out/riscv 
distclean defconfig Image

$ qemu-system-riscv64 -bios default -M virt -display none -initrd rootfs.cpio 
-kernel Image -m 512m -nodefaults -serial mon:stdio
...

OpenSBI v0.6
   _  _
  / __ \  / |  _ \_   _|
 | |  | |_ __   ___ _ __ | (___ | |_) || |
 | |  | | '_ \ / _ \ '_ \ \___ \|  _ < | |
 | |__| | |_) |  __/ | | |) | |_) || |_
  \/| .__/ \___|_| |_|_/|/_|
| |
|_|

Platform Name  : QEMU Virt Machine
Platform HART Features : RV64ACDFIMSU
Platform Max HARTs : 8
Current Hart   : 0
Firmware Base  : 0x8000
Firmware Size  : 120 KB
Runtime SBI Version: 0.2

MIDELEG : 0x0222
MEDELEG : 0xb109
PMP0: 0x00008000-0x8001 (A)
PMP1: 0x-0x (A,R,W,X)
[0.00] Linux version 5.9.0-rc4-next-20200910 
(nathan@ubuntu-n2-xlarge-x86) (riscv64-linux-gcc (GCC) 10.2.0, GNU ld (GNU 
Binutils) 2.35) #1 SMP Thu Sep 10 19:10:43 MST 2020
...
[0.294593] NET: Registered protocol family 17
[0.295781] 9pnet: Installing 9P2000 support
[0.296153] Key type dns_resolver registered
[0.296694] debug_vm_pgtable: [debug_vm_pgtable ]: Validating 
architecture page table helpers
[0.297635] Unable to handle kernel paging request at virtual address 
0a7fffe01dafefc8
[0.298029] Oops [#1]
[0.298153] Modules linked in:
[0.298433] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
5.9.0-rc4-next-20200910 #1
[0.298792] epc: ffe000205afc ra : ffe0008be0aa sp : ffe01ae73d40
[0.299078]  gp : ffe0010b9b48 tp : ffe01ae68000 t0 : 
ffe008152000
[0.299362]  t1 :  t2 :  s0 : 
ffe01ae73d60
[0.299648]  s1 : bffb a0 : 0a7fffe01dafefc8 a1 : 
bffb
[0.299948]  a2 : ffe0010a2698 a3 : 0001 a4 : 
0003
[0.300231]  a5 : 0800 a6 : f080 a7 : 
1b642000
[0.300521]  s2 : ffe0081517b8 s3 : ffe008150a80 s4 : 
ffe01af3
[0.300806]  s5 : ffe01f8ca9b8 s6 : ffe00815 s7 : 
ffe0010bb100
[0.301161]  s8 : ffe0010bb108 s9 : 00080202 s10: 
ffe0010bb928
[0.301481]  s11: 2008085b t3 :  t4 : 

[0.301722]  t5 :  t6 : ffe00815
[0.301947] status: 0120 badaddr: 0a7fffe01dafefc8 cause: 
000f
[0.302569] ---[ end trace 7ffb153d816164cf ]---
[0.302797] note: swapper/0[1] exited with preempt_count 1
[0.303101] Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x000b
[0.303614] ---[ end Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x000b ]---

$ git bisect log
# bad: [7ce53e3a447bced7b85ed181c4d027e93c062e07] Add linux-next specific files 
for 20200910
# good: [34d4ddd359dbcdf6c5fb3f85a179243d7a1cb7f8] Merge tag 
'linux-kselftest-5.9-rc5' of 
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest
git bisect start '7ce53e3a447bced7b85ed181c4d027e93c062e07' 
'34d4ddd359dbcd

[PATCH] powerpc/ps3: make two symbols static

2020-09-10 Thread Jason Yan

This addresses the following sparse warning:

arch/powerpc/platforms/ps3/spu.c:451:33: warning: symbol
'spu_management_ps3_ops' was not declared. Should it be static?
arch/powerpc/platforms/ps3/spu.c:592:28: warning: symbol
'spu_priv1_ps3_ops' was not declared. Should it be static?

Reported-by: Hulk Robot 
Signed-off-by: Jason Yan 
---
 arch/powerpc/platforms/ps3/spu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c
index 1193c294b8d0..0c252478e556 100644
--- a/arch/powerpc/platforms/ps3/spu.c
+++ b/arch/powerpc/platforms/ps3/spu.c
@@ -448,7 +448,7 @@ static void ps3_disable_spu(struct spu_context *ctx)
ctx->ops->runcntl_stop(ctx);
 }
 
-const struct spu_management_ops spu_management_ps3_ops = {
+static const struct spu_management_ops spu_management_ps3_ops = {
.enumerate_spus = ps3_enumerate_spus,
.create_spu = ps3_create_spu,
.destroy_spu = ps3_destroy_spu,
@@ -589,7 +589,7 @@ static u64 resource_allocation_enable_get(struct spu *spu)
return 0; /* No support. */
 }
 
-const struct spu_priv1_ops spu_priv1_ps3_ops = {
+static const struct spu_priv1_ops spu_priv1_ps3_ops = {
.int_mask_and = int_mask_and,
.int_mask_or = int_mask_or,
.int_mask_set = int_mask_set,
-- 
2.25.4

Re: [PATCH] powerpc/powermac: Fix low_sleep_handler with KUAP and KUEP

2020-09-10 Thread Michael Ellerman

Christophe Leroy  writes:
> low_sleep_handler() has an hardcoded restore of segment registers
> that doesn't take KUAP and KUEP into account.
>
> Use head_32's load_segment_registers() routine instead.
>
> Signed-off-by: Christophe Leroy 
> Fixes: a68c31fc01ef ("powerpc/32s: Implement Kernel Userspace Access 
> Protection")
> Fixes: 31ed2b13c48d ("powerpc/32s: Implement Kernel Userspace Execution 
> Prevention.")
> Cc: sta...@vger.kernel.org
> ---
>  arch/powerpc/platforms/powermac/sleep.S | 9 +
>  1 file changed, 1 insertion(+), 8 deletions(-)

Doesn't build? pmac32_defconfig, gcc 9.3.0:

ld: arch/powerpc/platforms/powermac/sleep.o: in function `core99_wake_up':
(.text+0x25c): undefined reference to `load_segment_registers'

Missing _GLOBAL() presumably?

cheers

> diff --git a/arch/powerpc/platforms/powermac/sleep.S 
> b/arch/powerpc/platforms/powermac/sleep.S
> index f9a680fdd9c4..51bfdfe85058 100644
> --- a/arch/powerpc/platforms/powermac/sleep.S
> +++ b/arch/powerpc/platforms/powermac/sleep.S
> @@ -294,14 +294,7 @@ grackle_wake_up:
>* we do any r1 memory access as we are not sure they
>* are in a sane state above the first 256Mb region
>*/
> - li  r0,16   /* load up segment register values */
> - mtctr   r0  /* for context 0 */
> - lis r3,0x2000   /* Ku = 1, VSID = 0 */
> - li  r4,0
> -3:   mtsrin  r3,r4
> - addir3,r3,0x111 /* increment VSID */
> - addis   r4,r4,0x1000/* address of next segment */
> - bdnz3b
> + bl  load_segment_registers
>   sync
>   isync
>  
> -- 
> 2.25.0

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Jason Gunthorpe

On Thu, Sep 10, 2020 at 07:57:49PM +0200, Gerald Schaefer wrote:
> On Thu, 10 Sep 2020 10:02:33 -0300
> Jason Gunthorpe  wrote:
> 
> > On Thu, Sep 10, 2020 at 11:39:25AM +0200, Alexander Gordeev wrote:
> > 
> > > As Gerald mentioned, it is very difficult to explain in a clear way.
> > > Hopefully, one could make sense ot of it.
> > 
> > I would say the page table API requires this invariant:
> > 
> > pud = pud_offset(p4d, addr);
> > do {
> > WARN_ON(pud != pud_offset(p4d, addr);
> > next = pud_addr_end(addr, end);
> > } while (pud++, addr = next, addr != end);
> > 
> > ie pud++ is supposed to be a shortcut for 
> >   pud_offset(p4d, next)
> > 
> 
> Hmm, IIUC, all architectures with static folding will simply return
> the passed-in p4d pointer for pud_offset(p4d, addr), for 3-level
> pagetables.

It is probably moot now, but since other arch's don't crash they also
return pud_addr_end() == end so the loop only does one iteration.

ie pud == pud_offset(p4d, addr) for all iterations as the pud++ never
happens.

Which is what this addr_end patch does for s390..

Jason

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread John Hubbard


On 9/10/20 3:11 PM, Jason Gunthorpe wrote:

On Thu, Sep 10, 2020 at 02:22:37PM -0700, John Hubbard wrote:


Or am I way off here, and it really is possible (aside from the current
s390 situation) to observe something that "is no longer a page table"?


Yes, that is the issue. Remember there is no locking for GUP
fast. While a page table cannot be freed there is nothing preventing
the page table entry from being concurrently modified.



OK, then we are saying the same thing after all, good.


Without the stack variable it looks like this:

pud_t pud = READ_ONCE(*pudp);
if (!pud_present(pud))
 return
pmd_offset(pudp, address);

And pmd_offset() expands to

 return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);

Between the READ_ONCE(*pudp) and (*pud) inside pmd_offset() the value
of *pud can change, eg to !pud_present.

Then pud_page_vaddr(*pud) will crash. It is not use after free, it
is using data that has not been validated.



Right, that matches what I had in mind, too: you can still have a problem
even though you're in the same page table. I just wanted to confirm that
there's not some odd way to launch out into completely non-page-table
memory.


thanks,
--
John Hubbard
NVIDIA

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Jason Gunthorpe

On Thu, Sep 10, 2020 at 02:22:37PM -0700, John Hubbard wrote:

> Or am I way off here, and it really is possible (aside from the current
> s390 situation) to observe something that "is no longer a page table"?

Yes, that is the issue. Remember there is no locking for GUP
fast. While a page table cannot be freed there is nothing preventing
the page table entry from being concurrently modified.

Without the stack variable it looks like this:

   pud_t pud = READ_ONCE(*pudp);
   if (!pud_present(pud))
return
   pmd_offset(pudp, address);

And pmd_offset() expands to

return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);

Between the READ_ONCE(*pudp) and (*pud) inside pmd_offset() the value
of *pud can change, eg to !pud_present.

Then pud_page_vaddr(*pud) will crash. It is not use after free, it
is using data that has not been validated.

Jason

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Jason Gunthorpe

On Thu, Sep 10, 2020 at 12:32:05PM -0700, Linus Torvalds wrote:
> Yeah, I get hung up on naming sometimes. I don't tend to care much
> about private local variables ("i" is a perfectly fine variable name),
> but these kinds of somewhat subtle cross-architecture definitions I
> feel matter.

One of the first replys to this patch was to ask "when would I use
_orig vs normal", so you are not alone. The name should convey it..

So, I suggest pXX_offset_unlocked()

Since it is safe to call without the page table lock, while pXX_offset()
requires the page table lock to be held as the internal *pXX is a data
race otherwise.

Patch 1 might be OK for a stable backport, but to get to a clear
pXX_offset_unlocked() all the arches would want to be changed to
implement that API and the generic code would provide the wrapper:

#define pXX_offset(pXXp, address) pXX_offset_unlocked(pXXp, *(pXXp), address)

Arches would not have a *pXX inside their code.

Then we can talk about auditing call sites of pXX_offset and think
about using the _unlocked version in places where the page table lock
is not held.

For instance mm/pagewalk.c should be changed. So should
huge_pte_offset() and probably other places. These places might
already be exsting data-race bugs.

It is code-as-documentation indicating an unlocked page table walk.

Now it is not just a S390 story but a change that makes the data
concurrency much clearer, so I think I prefer this version to the
addr_end one too.

Regards,
Jason

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread John Hubbard


On 9/10/20 11:13 AM, Jason Gunthorpe wrote:

On Thu, Sep 10, 2020 at 10:35:38AM -0700, Linus Torvalds wrote:

On Thu, Sep 10, 2020 at 2:40 AM Alexander Gordeev
 wrote:


It is only gup_fast case that exposes the issue. It hits because
pointers to stack copies are passed to gup_pXd_range iterators, not
pointers to real page tables itself.


Can we possibly change fast-gup to not do the stack copies?

I'd actually rather do something like that, than the "addr_end" thing.



As you say, none of the other page table walking code does what the
GUP code does, and I don't think it's required.


As I understand it, the requirement is because fast-gup walks without
the page table spinlock, or mmap_sem held so it must READ_ONCE the
*pXX.

It then checks that it is a valid page table pointer, then calls
pXX_offset().

The arch implementation of pXX_offset() derefs again the passed pXX
pointer. So it defeats the READ_ONCE and the 2nd load could observe
something that is no longer a page table pointer and crash.


Just to be clear, though, that makes it sound a little wilder and
reckless than it really is, right?

Because actually, the page tables cannot be freed while gup_fast is
walking them, due to either IPI blocking during the walk, or the moral
equivalent (MMU_GATHER_RCU_TABLE_FREE) for non-IPI architectures. So the
pages tables can *change* underneath gup_fast, and for example pages can
be unmapped. But they remain valid page tables, it's just that their
contents are unstable. Even if pXd_none()==true.

Or am I way off here, and it really is possible (aside from the current
s390 situation) to observe something that "is no longer a page table"?


thanks,
--
John Hubbard
NVIDIA

[PATCH 4/7] powerpc/xive: Fix W=1 compile warning

2020-09-10 Thread Cédric Le Goater

CC  arch/powerpc/sysdev/xive/common.o
../arch/powerpc/sysdev/xive/common.c:1568:6: error: no previous prototype for 
‘xive_debug_show_cpu’ [-Werror=missing-prototypes]
 void xive_debug_show_cpu(struct seq_file *m, int cpu)
  ^~~
../arch/powerpc/sysdev/xive/common.c:1602:6: error: no previous prototype for 
‘xive_debug_show_irq’ [-Werror=missing-prototypes]
 void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d)
  ^~~

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index f591be9f01f4..a80440af491a 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1565,7 +1565,7 @@ static int __init xive_off(char *arg)
 }
 __setup("xive=off", xive_off);
 
-void xive_debug_show_cpu(struct seq_file *m, int cpu)
+static void xive_debug_show_cpu(struct seq_file *m, int cpu)
 {
struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
 
@@ -1599,7 +1599,7 @@ void xive_debug_show_cpu(struct seq_file *m, int cpu)
seq_puts(m, "\n");
 }
 
-void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d)
+static void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct 
irq_data *d)
 {
struct irq_chip *chip = irq_data_get_irq_chip(d);
int rc;
-- 
2.25.4

[PATCH 7/7] powerpc/32: Fix W=1 compile warning

2020-09-10 Thread Cédric Le Goater

CC  arch/powerpc/kernel/traps.o
../arch/powerpc/kernel/traps.c:1663:6: error: no previous prototype for 
‘stack_overflow_exception’ [-Werror=missing-prototypes]
 void stack_overflow_exception(struct pt_regs *regs)
  ^~~~

Cc: Christophe Leroy 
Fixes: 3978eb78517c ("powerpc/32: Add early stack overflow detection with VMAP 
stack.")
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/asm-prototypes.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index de14b1a34d56..4957119604c7 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -67,6 +67,7 @@ void single_step_exception(struct pt_regs *regs);
 void program_check_exception(struct pt_regs *regs);
 void alignment_exception(struct pt_regs *regs);
 void StackOverflow(struct pt_regs *regs);
+void stack_overflow_exception(struct pt_regs *regs);
 void kernel_fp_unavailable_exception(struct pt_regs *regs);
 void altivec_unavailable_exception(struct pt_regs *regs);
 void vsx_unavailable_exception(struct pt_regs *regs);
-- 
2.25.4

[PATCH 6/7] powerpc/perf: Fix W=1 compile warning

2020-09-10 Thread Cédric Le Goater

CC  arch/powerpc/perf/imc-pmu.o
../arch/powerpc/perf/imc-pmu.c: In function ‘trace_imc_event_init’:
../arch/powerpc/perf/imc-pmu.c:1429:22: error: variable ‘target’ set but not 
used [-Werror=unused-but-set-variable]
  struct task_struct *target;
  ^~

Cc: Anju T Sudhakar 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/perf/imc-pmu.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 62d0b54086f8..9ed4fcccf8a9 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1426,8 +1426,6 @@ static void trace_imc_event_del(struct perf_event *event, 
int flags)
 
 static int trace_imc_event_init(struct perf_event *event)
 {
-   struct task_struct *target;
-
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
@@ -1458,7 +1456,6 @@ static int trace_imc_event_init(struct perf_event *event)
mutex_unlock(_global_refc.lock);
 
event->hw.idx = -1;
-   target = event->hw.target;
 
event->pmu->task_ctx_nr = perf_hw_context;
event->destroy = reset_global_refc;
-- 
2.25.4

[PATCH 1/7] powerpc/sysfs: Fix W=1 compile warning

2020-09-10 Thread Cédric Le Goater

arch/powerpc/kernel/sysfs.c: In function ‘sysfs_create_dscr_default’:
arch/powerpc/kernel/sysfs.c:228:7: error: variable ‘err’ set but not used 
[-Werror=unused-but-set-variable]
   int err = 0;
   ^~~
cc1: all warnings being treated as errors

Cc: Madhavan Srinivasan 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kernel/sysfs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 46b4ebc33db7..821a3dc4c924 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -225,14 +225,13 @@ static DEVICE_ATTR(dscr_default, 0600,
 static void sysfs_create_dscr_default(void)
 {
if (cpu_has_feature(CPU_FTR_DSCR)) {
-   int err = 0;
int cpu;
 
dscr_default = spr_default_dscr;
for_each_possible_cpu(cpu)
paca_ptrs[cpu]->dscr_default = dscr_default;
 
-   err = device_create_file(cpu_subsys.dev_root, 
_attr_dscr_default);
+   device_create_file(cpu_subsys.dev_root, _attr_dscr_default);
}
 }
 #endif /* CONFIG_PPC64 */
-- 
2.25.4

[PATCH 0/7] powerpc: Fix a few W=1 compile warnings

2020-09-10 Thread Cédric Le Goater

Hello,

Here is a small contribution on improving compile with W=1. 

Thanks,

C.

Cédric Le Goater (7):
  powerpc/sysfs: Fix W=1 compile warning
  powerpc/prom: Fix W=1 compile warning
  powerpc/sstep: Fix W=1 compile warning
  powerpc/xive: Fix W=1 compile warning
  powerpc/powernv/pci: Fix W=1 compile warning
  powerpc/perf: Fix W=1 compile warning
  powerpc/32: Fix W=1 compile warning

 arch/powerpc/include/asm/asm-prototypes.h |  1 +
 arch/powerpc/kernel/prom.c| 51 ---
 arch/powerpc/kernel/sysfs.c   |  3 +-
 arch/powerpc/lib/sstep.c  |  3 +-
 arch/powerpc/perf/imc-pmu.c   |  3 --
 arch/powerpc/platforms/powernv/pci-ioda.c |  8 
 arch/powerpc/sysdev/xive/common.c |  4 +-
 7 files changed, 32 insertions(+), 41 deletions(-)

-- 
2.25.4

[PATCH 2/7] powerpc/prom: Fix W=1 compile warning

2020-09-10 Thread Cédric Le Goater

arch/powerpc/kernel/prom.c: In function ‘early_reserve_mem’:
arch/powerpc/kernel/prom.c:625:10: error: variable ‘reserve_map’ set but not 
used [-Werror=unused-but-set-variable]
  __be64 *reserve_map;
  ^~~
cc1: all warnings being treated as errors

Cc: Christophe Leroy 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kernel/prom.c | 51 +++---
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index d8a2fb87ba0c..4bae9ebc7d0b 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -622,11 +622,6 @@ static void __init early_reserve_mem_dt(void)
 
 static void __init early_reserve_mem(void)
 {
-   __be64 *reserve_map;
-
-   reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
-   fdt_off_mem_rsvmap(initial_boot_params));
-
/* Look for the new "reserved-regions" property in the DT */
early_reserve_mem_dt();
 
@@ -639,28 +634,34 @@ static void __init early_reserve_mem(void)
}
 #endif /* CONFIG_BLK_DEV_INITRD */
 
-#ifdef CONFIG_PPC32
-   /* 
-* Handle the case where we might be booting from an old kexec
-* image that setup the mem_rsvmap as pairs of 32-bit values
-*/
-   if (be64_to_cpup(reserve_map) > 0xull) {
-   u32 base_32, size_32;
-   __be32 *reserve_map_32 = (__be32 *)reserve_map;
-
-   DBG("Found old 32-bit reserve map\n");
-
-   while (1) {
-   base_32 = be32_to_cpup(reserve_map_32++);
-   size_32 = be32_to_cpup(reserve_map_32++);
-   if (size_32 == 0)
-   break;
-   DBG("reserving: %x -> %x\n", base_32, size_32);
-   memblock_reserve(base_32, size_32);
+   if (IS_ENABLED(CONFIG_PPC32)) {
+   __be64 *reserve_map;
+
+   reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
+fdt_off_mem_rsvmap(initial_boot_params));
+
+   /*
+* Handle the case where we might be booting from an
+* old kexec image that setup the mem_rsvmap as pairs
+* of 32-bit values
+*/
+   if (be64_to_cpup(reserve_map) > 0xull) {
+   u32 base_32, size_32;
+   __be32 *reserve_map_32 = (__be32 *)reserve_map;
+
+   DBG("Found old 32-bit reserve map\n");
+
+   while (1) {
+   base_32 = be32_to_cpup(reserve_map_32++);
+   size_32 = be32_to_cpup(reserve_map_32++);
+   if (size_32 == 0)
+   break;
+   DBG("reserving: %x -> %x\n", base_32, size_32);
+   memblock_reserve(base_32, size_32);
+   }
+   return;
}
-   return;
}
-#endif
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-- 
2.25.4

[PATCH 5/7] powerpc/powernv/pci: Fix W=1 compile warning

2020-09-10 Thread Cédric Le Goater

  CC  arch/powerpc/platforms/powernv/pci-ioda.o
../arch/powerpc/platforms/powernv/pci-ioda.c: In function 
‘pnv_ioda_configure_pe’:
../arch/powerpc/platforms/powernv/pci-ioda.c:897:18: error: variable ‘parent’ 
set but not used [-Werror=unused-but-set-variable]
  struct pci_dev *parent;
  ^~

Cc: Oliver O'Halloran 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 023a4f987bb2..2b4ceb5e6ce4 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -894,7 +894,6 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct 
pnv_ioda_pe *pe)
 
 int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
-   struct pci_dev *parent;
uint8_t bcomp, dcomp, fcomp;
long rc, rid_end, rid;
 
@@ -904,7 +903,6 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct 
pnv_ioda_pe *pe)
 
dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
-   parent = pe->pbus->self;
if (pe->flags & PNV_IODA_PE_BUS_ALL)
count = resource_size(>pbus->busn_res);
else
@@ -925,12 +923,6 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct 
pnv_ioda_pe *pe)
}
rid_end = pe->rid + (count << 8);
} else {
-#ifdef CONFIG_PCI_IOV
-   if (pe->flags & PNV_IODA_PE_VF)
-   parent = pe->parent_dev;
-   else
-#endif /* CONFIG_PCI_IOV */
-   parent = pe->pdev->bus->self;
bcomp = OpalPciBusAll;
dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
-- 
2.25.4

[PATCH 3/7] powerpc/sstep: Fix W=1 compile warning

2020-09-10 Thread Cédric Le Goater

../arch/powerpc/lib/sstep.c: In function ‘mlsd_8lsd_ea’:
../arch/powerpc/lib/sstep.c:225:3: error: suggest braces around empty body in 
an ‘if’ statement [-Werror=empty-body]
   ; /* Invalid form. Should already be checked for by caller! */
   ^

Cc: Jordan Niethe 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/lib/sstep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index caee8cc77e19..14572af16e55 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -221,8 +221,9 @@ static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned 
int instr,
; /* Leave ea as is */
else if (prefix_r && !ra)
ea += regs->nip;
-   else if (prefix_r && ra)
+   else if (prefix_r && ra) {
; /* Invalid form. Should already be checked for by caller! */
+   }
 
return ea;
 }
-- 
2.25.4

Re: [PATCH] kbuild: preprocess module linker script

2020-09-10 Thread Kees Cook

On Fri, Sep 04, 2020 at 10:31:21PM +0900, Masahiro Yamada wrote:
> There was a request to preprocess the module linker script like we do
> for the vmlinux one (https://lkml.org/lkml/2020/8/21/512).
> 
> The difference between vmlinux.lds and module.lds is that the latter
> is needed for external module builds, thus must be cleaned up by
> 'make mrproper' instead of 'make clean' (also, it must be created by
> 'make modules_prepare').
> 
> You cannot put it in arch/*/kernel/ because 'make clean' descends into
> it. I moved arch/*/kernel/module.lds to arch/*/include/asm/module.lds.h,
> which is included from scripts/module.lds.S.
> 
> scripts/module.lds is fine because 'make clean' keeps all the build
> artifacts under scripts/.
> 
> You can add arch-specific sections in .
> 
> Signed-off-by: Masahiro Yamada 

Reviewed-by: Kees Cook 

-- 
Kees Cook

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Linus Torvalds

On Thu, Sep 10, 2020 at 12:11 PM Gerald Schaefer
 wrote:
>
> That sounds a lot like the pXd_offset_orig() from Martins first approach
> in this thread:
> https://lore.kernel.org/linuxppc-dev/20190418100218.0a4afd51@mschwideX1/

I have to admit to finding that name horrible, but aside from that, yes.

I don't think "pXd_offset_orig()" makes any sense as a name. Yes,
"orig" may make sense as the variable name (as in "this was the
original value we read"), but a function name should describe what it
*does*, not what the arguments are.

Plus "original" doesn't make sense to me anyway, since we're not
modifying it. To me, "original" means that there's a final version
too, which this interface in no way implies. It's just "this is the
value we already read".

("orig" does make some sense in that fault path - because by
definition we *are* going to modify the page table entry, that's the
whole point of the fault - we need to do something to not keep
faulting. But here, we're not at all necessarily modifying the page
table contents, we're just following them and readign the values once)

Of course, I don't know what a better name would be to describe what
is actually going on, I'm just explaining why I hate that naming.

*Maybe* something like just "pXd_offset_value()" together with a
comment explaining that it's given the upper pXd pointer _and_ the
value behind it, and it needs to return the next level offset? I
dunno. "value" doesn't really seem horribly descriptive either, but at
least it doesn't feel actively misleading to me.

Yeah, I get hung up on naming sometimes. I don't tend to care much
about private local variables ("i" is a perfectly fine variable name),
but these kinds of somewhat subtle cross-architecture definitions I
feel matter.

   Linus

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Gerald Schaefer

On Thu, 10 Sep 2020 11:33:17 -0700
Linus Torvalds  wrote:

> On Thu, Sep 10, 2020 at 11:13 AM Jason Gunthorpe  wrote:
> >
> > So.. To change away from the stack option I think we'd have to pass
> > the READ_ONCE value to pXX_offset() as an extra argument instead of it
> > derefing the pointer internally.
> 
> Yeah, but I think that would actually be the better model than passing
> an address to a random stack location.
> 
> It's also effectively what we do in some other places, eg the whole
> logic with "orig" in the regular pte fault handling is basically doing
> unlocked loads of the pte, various decisions on that, and then doing a
> final "is this still the same pte" after it has gotten the page table
> lock.

That sounds a lot like the pXd_offset_orig() from Martins first approach
in this thread:
https://lore.kernel.org/linuxppc-dev/20190418100218.0a4afd51@mschwideX1/

It is also the "Patch 1" option from the start of this thread:
https://lore.kernel.org/lkml/20200828140314.8556-1-gerald.schae...@linux.ibm.com/

I guess I chose wrongly there, should have had more trust in Martins
approach, and not try so hard to do it like others...

So, maybe we can start over again, from that patch option. It would of
course also initially introduce some gup-specific helpers, like with
the other approach. It seemed harder to generalize when I thought
about it back then, but I guess it should not be a lot harder than
the _addr_end stuff.

Or, maybe this time, just not to risk Christian getting a heart attack,
we could go for the gup-specific helper first, so that we would at
least have a fix for the possible s390 data corruption. Jason, would
you agree that we send a new RFC, this time with pXd_offset_orig()
approach, and have that accepted as short-term fix?

Or would you rather also wait for some proper generic change? Have
lost that option from my radar, so cannot really judge how much more
effort it would be. I'm on vacation next week anyway, but Alexander
or Vasily (who did the option 1 patch) could look into this further.

Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-10 Thread Segher Boessenkool

On Thu, Sep 10, 2020 at 03:31:53PM +, David Laight wrote:
> > >   asm volatile ("" : "+r" (eax));
> > >   // So here eax must contain the value set by the "x" instructions.
> > 
> > No, the register eax will contain the value of the eax variable.  In the
> > asm; it might well be there before or after the asm as well, but none of
> > that is guaranteed.
> 
> Perhaps not 'guaranteed', but very unlikely to be wrong.
> It doesn't give gcc much scope for not generating the desired code.

Wanna bet?  :-)

Correct is correct.  Anything else is not.


Segher

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Linus Torvalds

On Thu, Sep 10, 2020 at 11:13 AM Jason Gunthorpe  wrote:
>
> So.. To change away from the stack option I think we'd have to pass
> the READ_ONCE value to pXX_offset() as an extra argument instead of it
> derefing the pointer internally.

Yeah, but I think that would actually be the better model than passing
an address to a random stack location.

It's also effectively what we do in some other places, eg the whole
logic with "orig" in the regular pte fault handling is basically doing
unlocked loads of the pte, various decisions on that, and then doing a
final "is this still the same pte" after it has gotten the page table
lock.

(And yes, those other pte fault handling cases are different, since
they _do_ hold the mmap lock, so they know the page *tables* are
stable, and it's only the last level that then gets re-checked against
the pte once the pte itself has also been stabilized with the page
table lock).

So I think it would actually be a better conceptual match to make the
page table walking interface be "here, this is the value I read once
carefully, and this is the address, now give me the next address".

The folded case would then just return the address it was given, and
the non-folded case would return the inner page table based on the
value.

I dunno. I don't actually feel all that strongly about this, so
whatever works, I guess.

Linus

Re: [PATCH v3] pseries/drmem: don't cache node id in drmem_lmb struct

2020-09-10 Thread Scott Cheloha

On Fri, Aug 21, 2020 at 10:33:10AM +0200, Laurent Dufour wrote:
> Le 11/08/2020 à 03:51, Scott Cheloha a écrit :
> > 
> > [...]
> > 
> > @@ -631,7 +638,7 @@ static int dlpar_memory_remove_by_ic(u32 
> > lmbs_to_remove, u32 drc_index)
> >   static int dlpar_add_lmb(struct drmem_lmb *lmb)
> >   {
> > unsigned long block_sz;
> > -   int rc;
> > +   int nid, rc;
> > 
> > if (lmb->flags & DRCONF_MEM_ASSIGNED)
> > return -EINVAL;
> > @@ -642,11 +649,13 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
> > return rc;
> > }
> > 
> > -   lmb_set_nid(lmb);
> > block_sz = memory_block_size_bytes();
> > 
> > +   /* Find the node id for this address. */
> > +   nid = memory_add_physaddr_to_nid(lmb->base_addr);
> 
> I think we could be more efficient here.
> Here is the call stack behind memory_add_physaddr_to_nid():
> 
>   memory_add_physaddr_to_nid(lmb->base_addr)
> hot_add_scn_to_nid()
>   if (of_find_node_by_path("/ibm,dynamic-reconfiguration-memory")) == 
> true*
>   then
> hot_add_drconf_scn_to_nid()
>   for_each_drmem_lmb() to find the LMB based on lmb->base_addr
> of_drconf_to_nid_single(found LMB)
>   use lmb->aa_index to get the nid.
> 
> * that test is necessarily true when called from dlpar_add_lmb()
> otherwise the call to update_lmb_associativity_index() would have
> failed earlier.
> 
> Basically, we have a LMB and we later walk all the LMBs to find that lmb
> again.  In the case of dlpar_add_lmb(), it would be more efficient to
> directly call of_drconf_to_nid_single(). That function is not exported
> from arch/powerpc/mm/numa.c but it may be good to export it through that
> patch.

I've posted a patch for this:

https://lore.kernel.org/linuxppc-dev/20200910175637.2865160-1-chel...@linux.ibm.com/T/#u

The speedup is nice, especially for LMBs at the end of the array.

Re: [PATCHv7 12/12] misc: pci_endpoint_test: Add driver data for Layerscape PCIe controllers

2020-09-10 Thread Rob Herring

On Tue, 11 Aug 2020 17:54:41 +0800, Zhiqiang Hou wrote:
> From: Hou Zhiqiang 
> 
> The commit 0a121f9bc3f5 ("misc: pci_endpoint_test: Use streaming DMA
> APIs for buffer allocation") changed to use streaming DMA APIs, however,
> dma_map_single() might not return a 4KB aligned address, so add the
> default_data as driver data for Layerscape PCIe controllers to make it
> 4KB aligned.
> 
> Signed-off-by: Hou Zhiqiang 
> ---
> V7:
>  - New patch.
> 
>  drivers/misc/pci_endpoint_test.c | 8 ++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 

Acked-by: Rob Herring

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Jason Gunthorpe

On Thu, Sep 10, 2020 at 10:35:38AM -0700, Linus Torvalds wrote:
> On Thu, Sep 10, 2020 at 2:40 AM Alexander Gordeev
>  wrote:
> >
> > It is only gup_fast case that exposes the issue. It hits because
> > pointers to stack copies are passed to gup_pXd_range iterators, not
> > pointers to real page tables itself.
> 
> Can we possibly change fast-gup to not do the stack copies?
>
> I'd actually rather do something like that, than the "addr_end" thing.

> As you say, none of the other page table walking code does what the
> GUP code does, and I don't think it's required.

As I understand it, the requirement is because fast-gup walks without
the page table spinlock, or mmap_sem held so it must READ_ONCE the
*pXX.

It then checks that it is a valid page table pointer, then calls
pXX_offset().

The arch implementation of pXX_offset() derefs again the passed pXX
pointer. So it defeats the READ_ONCE and the 2nd load could observe
something that is no longer a page table pointer and crash.

Passing it the address of the stack value is a way to force
pXX_offset() to use the READ_ONCE result which has already been tested
to be a page table pointer.

Other page walking code that holds the mmap_sem tends to use
pmd_trans_unstable() which solves this problem by injecting a
barrier. The load hidden in pte_offset() after a pmd_trans_unstable()
can't be re-ordered and will only see a page table entry under the
mmap_sem.

However, I think that logic would have been much clearer following the
GUP model of READ_ONCE vs extra reads and a hidden barrier. At least
it took me a long time to work it out :(

I also think there are real bugs here where places are reading *pXX
multiple times without locking the page table. One was found recently
in the wild in the huge tlb code IIRC.

The mm/pagewalk.c has these missing READ_ONCE bugs too.

So.. To change away from the stack option I think we'd have to pass
the READ_ONCE value to pXX_offset() as an extra argument instead of it
derefing the pointer internally.

Jason

Re: [PATCHv7 04/12] PCI: designware-ep: Modify MSI and MSIX CAP way of finding

2020-09-10 Thread Rob Herring

On Tue, Aug 11, 2020 at 05:54:33PM +0800, Zhiqiang Hou wrote:
> From: Xiaowei Bao 
> 
> Each PF of EP device should have its own MSI or MSIX capabitily
> struct, so create a dw_pcie_ep_func struct and move the msi_cap
> and msix_cap to this struct from dw_pcie_ep, and manage the PFs
> via a list.
> 
> Signed-off-by: Xiaowei Bao 
> Signed-off-by: Hou Zhiqiang 
> ---
> V7:
>  - Rebase the patch without functionality change.
> 
>  .../pci/controller/dwc/pcie-designware-ep.c   | 139 +++---
>  drivers/pci/controller/dwc/pcie-designware.h  |  18 ++-
>  2 files changed, 136 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
> b/drivers/pci/controller/dwc/pcie-designware-ep.c
> index 56bd1cd71f16..4680a51c49c0 100644
> --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> @@ -28,6 +28,19 @@ void dw_pcie_ep_init_notify(struct dw_pcie_ep *ep)
>  }
>  EXPORT_SYMBOL_GPL(dw_pcie_ep_init_notify);
>  
> +struct dw_pcie_ep_func *
> +dw_pcie_ep_get_func_from_ep(struct dw_pcie_ep *ep, u8 func_no)
> +{
> + struct dw_pcie_ep_func *ep_func;
> +
> + list_for_each_entry(ep_func, >func_list, list) {
> + if (ep_func->func_no == func_no)
> + return ep_func;
> + }
> +
> + return NULL;
> +}
> +
>  static unsigned int dw_pcie_ep_func_select(struct dw_pcie_ep *ep, u8 func_no)
>  {
>   unsigned int func_offset = 0;
> @@ -68,6 +81,47 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum 
> pci_barno bar)
>   __dw_pcie_ep_reset_bar(pci, func_no, bar, 0);
>  }
>  
> +static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie_ep *ep, u8 func_no,
> + u8 cap_ptr, u8 cap)
> +{
> + struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> + unsigned int func_offset = 0;
> + u8 cap_id, next_cap_ptr;
> + u16 reg;
> +
> + if (!cap_ptr)
> + return 0;
> +
> + func_offset = dw_pcie_ep_func_select(ep, func_no);
> +
> + reg = dw_pcie_readw_dbi(pci, func_offset + cap_ptr);
> + cap_id = (reg & 0x00ff);
> +
> + if (cap_id > PCI_CAP_ID_MAX)
> + return 0;
> +
> + if (cap_id == cap)
> + return cap_ptr;
> +
> + next_cap_ptr = (reg & 0xff00) >> 8;
> + return __dw_pcie_ep_find_next_cap(ep, func_no, next_cap_ptr, cap);
> +}
> +
> +static u8 dw_pcie_ep_find_capability(struct dw_pcie_ep *ep, u8 func_no, u8 
> cap)
> +{
> + struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> + unsigned int func_offset = 0;
> + u8 next_cap_ptr;
> + u16 reg;
> +
> + func_offset = dw_pcie_ep_func_select(ep, func_no);
> +
> + reg = dw_pcie_readw_dbi(pci, func_offset + PCI_CAPABILITY_LIST);
> + next_cap_ptr = (reg & 0x00ff);
> +
> + return __dw_pcie_ep_find_next_cap(ep, func_no, next_cap_ptr, cap);
> +}

These are almost the same as __dw_pcie_find_next_cap and 
dw_pcie_find_capability. Please modify them to take a function offset 
and work for both host and endpoints.

> +
>  static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 func_no,
>  struct pci_epf_header *hdr)
>  {
> @@ -257,13 +311,18 @@ static int dw_pcie_ep_get_msi(struct pci_epc *epc, u8 
> func_no)
>   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
>   u32 val, reg;
>   unsigned int func_offset = 0;
> + struct dw_pcie_ep_func *ep_func;
>  
> - if (!ep->msi_cap)
> + ep_func = dw_pcie_ep_get_func_from_ep(ep, func_no);
> + if (!ep_func)
> + return -EINVAL;
> +
> + if (!ep_func->msi_cap)
>   return -EINVAL;
>  
>   func_offset = dw_pcie_ep_func_select(ep, func_no);
>  
> - reg = ep->msi_cap + func_offset + PCI_MSI_FLAGS;
> + reg = ep_func->msi_cap + func_offset + PCI_MSI_FLAGS;
>   val = dw_pcie_readw_dbi(pci, reg);
>   if (!(val & PCI_MSI_FLAGS_ENABLE))
>   return -EINVAL;
> @@ -279,13 +338,18 @@ static int dw_pcie_ep_set_msi(struct pci_epc *epc, u8 
> func_no, u8 interrupts)
>   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
>   u32 val, reg;
>   unsigned int func_offset = 0;
> + struct dw_pcie_ep_func *ep_func;
> +
> + ep_func = dw_pcie_ep_get_func_from_ep(ep, func_no);
> + if (!ep_func)
> + return -EINVAL;
>  
> - if (!ep->msi_cap)
> + if (!ep_func->msi_cap)
>   return -EINVAL;
>  
>   func_offset = dw_pcie_ep_func_select(ep, func_no);
>  
> - reg = ep->msi_cap + func_offset + PCI_MSI_FLAGS;
> + reg = ep_func->msi_cap + func_offset + PCI_MSI_FLAGS;

If msi_cap is now per function, then shouldn't it already include 
'func_offset'?

>   val = dw_pcie_readw_dbi(pci, reg);
>   val &= ~PCI_MSI_FLAGS_QMASK;
>   val |= (interrupts << 1) & PCI_MSI_FLAGS_QMASK;
> @@ -302,13 +366,18 @@ static int dw_pcie_ep_get_msix(struct pci_epc *epc, u8 
> func_no)
>   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
>   u32 val, reg;
>   unsigned int

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Gerald Schaefer

On Thu, 10 Sep 2020 10:02:33 -0300
Jason Gunthorpe  wrote:

> On Thu, Sep 10, 2020 at 11:39:25AM +0200, Alexander Gordeev wrote:
> 
> > As Gerald mentioned, it is very difficult to explain in a clear way.
> > Hopefully, one could make sense ot of it.
> 
> I would say the page table API requires this invariant:
> 
> pud = pud_offset(p4d, addr);
> do {
>   WARN_ON(pud != pud_offset(p4d, addr);
> next = pud_addr_end(addr, end);
> } while (pud++, addr = next, addr != end);
> 
> ie pud++ is supposed to be a shortcut for 
>   pud_offset(p4d, next)
> 

Hmm, IIUC, all architectures with static folding will simply return
the passed-in p4d pointer for pud_offset(p4d, addr), for 3-level
pagetables. There is no difference for s390. For gup_fast, that p4d
pointer is not really a pointer to a value in a pagetable, but
to some local copy of such a value, and not just for s390.

So, pud = p4d = pointer to copy, and increasing that pud pointer
cannot be the same as pud_offset(p4d, next). I do see your point
however, at last I think :-) My problem is that I do not see where
we would have an s390-specific issue here. Maybe my understanding
of how it works for others with static folding is wrong. That
would explain my difficulties in getting your point...

> While S390 does not follow this. Fixing addr_end brings it into
> alignment by preventing pud++ from happening.

Exactly, only that nobody seems to follow it, IIUC. Fixing it up
with pXd_addr_end was my impression of what we need to do, in order to
have it work the same way as for others.

> The only currently known side effect is that gup_fast crashes, but it
> sure is an unexpected thing.

Well, from my understanding it feels more unexpected that something
that is supposed to be a pointer to an entry in a page table, really is
just a pointer to some copy somewhere.

Re: [PATCHv7 02/12] PCI: designware-ep: Add the doorbell mode of MSI-X in EP mode

2020-09-10 Thread Rob Herring

On Tue, Aug 11, 2020 at 05:54:31PM +0800, Zhiqiang Hou wrote:
> From: Xiaowei Bao 
> 
> Add the doorbell mode of MSI-X in DWC EP driver.
> 
> Signed-off-by: Xiaowei Bao 
> Reviewed-by: Andrew Murray 
> Signed-off-by: Hou Zhiqiang 
> ---
> V7:
>  - Rebase the patch without functionality change.
> 
>  drivers/pci/controller/dwc/pcie-designware-ep.c | 14 ++
>  drivers/pci/controller/dwc/pcie-designware.h| 12 
>  2 files changed, 26 insertions(+)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
> b/drivers/pci/controller/dwc/pcie-designware-ep.c
> index e5bd3a5ef380..e76b504ed465 100644
> --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> @@ -471,6 +471,20 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 
> func_no,
>   return 0;
>  }
>  
> +int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8 func_no,

return void. It never has an error.

It could also just be an inline function.

> +u16 interrupt_num)
> +{
> + struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> + u32 msg_data;
> +
> + msg_data = (func_no << PCIE_MSIX_DOORBELL_PF_SHIFT) |
> +(interrupt_num - 1);
> +
> + dw_pcie_writel_dbi(pci, PCIE_MSIX_DOORBELL, msg_data);
> +
> + return 0;
> +}
> +
>  int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
> u16 interrupt_num)
>  {
> diff --git a/drivers/pci/controller/dwc/pcie-designware.h 
> b/drivers/pci/controller/dwc/pcie-designware.h
> index 89f8271ec5ee..745b4938225a 100644
> --- a/drivers/pci/controller/dwc/pcie-designware.h
> +++ b/drivers/pci/controller/dwc/pcie-designware.h
> @@ -97,6 +97,9 @@
>  #define PCIE_MISC_CONTROL_1_OFF  0x8BC
>  #define PCIE_DBI_RO_WR_ENBIT(0)
>  
> +#define PCIE_MSIX_DOORBELL   0x948
> +#define PCIE_MSIX_DOORBELL_PF_SHIFT  24
> +
>  #define PCIE_PL_CHK_REG_CONTROL_STATUS   0xB20
>  #define PCIE_PL_CHK_REG_CHK_REG_STARTBIT(0)
>  #define PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS   BIT(1)
> @@ -434,6 +437,8 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 
> func_no,
>u8 interrupt_num);
>  int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
>u16 interrupt_num);
> +int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8 func_no,
> +u16 interrupt_num);
>  void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar);
>  #else
>  static inline void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)
> @@ -475,6 +480,13 @@ static inline int dw_pcie_ep_raise_msix_irq(struct 
> dw_pcie_ep *ep, u8 func_no,
>   return 0;
>  }
>  
> +static inline int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep,
> +  u8 func_no,
> +  u16 interrupt_num)
> +{
> + return 0;
> +}
> +
>  static inline void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno 
> bar)
>  {
>  }
> -- 
> 2.17.1
>

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Jason Gunthorpe

On Thu, Sep 10, 2020 at 07:07:57PM +0200, Gerald Schaefer wrote:

> I might have lost track a bit. Are we still talking about possible
> functional impacts of either our current pagetable walking with s390
> (apart from gup_fast), or the proposed generic change (for s390, or
> others?)?

I'm looking for an more understandable explanation what is wrong with
the S390 implementation.

If the page operations require the invariant I described then it is
quite easy to explain the problem and understand the solution.

Jason

[PATCH v1] pseries/hotplug-memory: hot-add: skip redundant LMB lookup

2020-09-10 Thread Scott Cheloha

During memory hot-add, dlpar_add_lmb() calls memory_add_physaddr_to_nid()
to determine which node id (nid) to use when later calling __add_memory().

This is wasteful.  On pseries, memory_add_physaddr_to_nid() finds an
appropriate nid for a given address by looking up the LMB containing the
address and then passing that LMB to of_drconf_to_nid_single() to get the
nid.  In dlpar_add_lmb() we get this address from the LMB itself.

In short, we have a pointer to an LMB and then we are searching for
that LMB *again* in order to find its nid.

If we call of_drconf_to_nid_single() directly from dlpar_add_lmb() we
can skip the redundant lookup.  The only error handling we need to
duplicate from memory_add_physaddr_to_nid() is the fallback to the
default nid when drconf_to_nid_single() returns -1 (NUMA_NO_NODE) or
an invalid nid.

Skipping the extra lookup makes hot-add operations faster, especially
on machines with many LMBs.

Consider an LPAR with 126976 LMBs.  In one test, hot-adding 126000
LMBs on an upatched kernel took ~3.5 hours while a patched kernel
completed the same operation in ~2 hours:

Unpatched (12450 seconds):
Sep  9 04:06:31 ltc-brazos1 drmgr[810169]: drmgr: -c mem -a -q 126000
Sep  9 04:06:31 ltc-brazos1 kernel: pseries-hotplug-mem: Attempting to hot-add 
126000 LMB(s)
[...]
Sep  9 07:34:01 ltc-brazos1 kernel: pseries-hotplug-mem: Memory at 2000 
(drc index 8002) was hot-added

Patched (7065 seconds):
Sep  8 21:49:57 ltc-brazos1 drmgr[877703]: drmgr: -c mem -a -q 126000
Sep  8 21:49:57 ltc-brazos1 kernel: pseries-hotplug-mem: Attempting to hot-add 
126000 LMB(s)
[...]
Sep  8 23:27:42 ltc-brazos1 kernel: pseries-hotplug-mem: Memory at 2000 
(drc index 8002) was hot-added

It should be noted that the speedup grows more substantial when
hot-adding LMBs at the end of the drconf range.  This is because we
are skipping a linear LMB search.

To see the distinction, consider smaller hot-add test on the same
LPAR.  A perf-stat run with 10 iterations showed that hot-adding 4096
LMBs completed less than 1 second faster on a patched kernel:

Unpatched:
 Performance counter stats for 'drmgr -c mem -a -q 4096' (10 runs):

104,753.42 msec task-clock#0.992 CPUs utilized  
  ( +-  0.55% )
 4,708  context-switches  #0.045 K/sec  
  ( +-  0.69% )
 2,444  cpu-migrations#0.023 K/sec  
  ( +-  1.25% )
   394  page-faults   #0.004 K/sec  
  ( +-  0.22% )
   445,902,503,057  cycles#4.257 GHz
  ( +-  0.55% )  (66.67%)
 8,558,376,740  stalled-cycles-frontend   #1.92% frontend cycles 
idle ( +-  0.88% )  (49.99%)
   300,346,181,651  stalled-cycles-backend#   67.36% backend cycles 
idle  ( +-  0.76% )  (50.01%)
   258,091,488,691  instructions  #0.58  insn per cycle
  #1.16  stalled cycles per 
insn  ( +-  0.22% )  (66.67%)
70,568,169,256  branches  #  673.660 M/sec  
  ( +-  0.17% )  (50.01%)
 3,100,725,426  branch-misses #4.39% of all branches
  ( +-  0.20% )  (49.99%)

   105.583 +- 0.589 seconds time elapsed  ( +-  0.56% )

Patched:
 Performance counter stats for 'drmgr -c mem -a -q 4096' (10 runs):

104,055.69 msec task-clock#0.993 CPUs utilized  
  ( +-  0.32% )
 4,606  context-switches  #0.044 K/sec  
  ( +-  0.20% )
 2,463  cpu-migrations#0.024 K/sec  
  ( +-  0.93% )
   394  page-faults   #0.004 K/sec  
  ( +-  0.25% )
   442,951,129,921  cycles#4.257 GHz
  ( +-  0.32% )  (66.66%)
 8,710,413,329  stalled-cycles-frontend   #1.97% frontend cycles 
idle ( +-  0.47% )  (50.06%)
   299,656,905,836  stalled-cycles-backend#   67.65% backend cycles 
idle  ( +-  0.39% )  (50.02%)
   252,731,168,193  instructions  #0.57  insn per cycle
  #1.19  stalled cycles per 
insn  ( +-  0.20% )  (66.66%)
68,902,851,121  branches  #  662.173 M/sec  
  ( +-  0.13% )  (49.94%)
 3,100,242,882  branch-misses #4.50% of all branches
  ( +-  0.15% )  (49.98%)

   104.829 +- 0.325 seconds time elapsed  ( +-  0.31% )

This is consistent.  An add-by-count hot-add operation adds LMBs
greedily, so LMBs near the start of the drconf range are considered
first.  On an otherwise idle LPAR with so many LMBs we would expect to
find the LMBs we need near the start of the drconf range, hence the
smaller speedup.

Signed-off-by: Scott Cheloha 
---

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Linus Torvalds

On Thu, Sep 10, 2020 at 2:40 AM Alexander Gordeev
 wrote:
>
> It is only gup_fast case that exposes the issue. It hits because
> pointers to stack copies are passed to gup_pXd_range iterators, not
> pointers to real page tables itself.

Can we possibly change fast-gup to not do the stack copies?

I'd actually rather do something like that, than the "addr_end" thing.

As you say, none of the other page table walking code does what the
GUP code does, and I don't think it's required.

The GUP code is kind of strange, I'm not quite sure why. Some of it
unusually came from the powerpc code that handled their special odd
hugepage model, and that may be why it's so different.

How painful would it be to just pass the pmd (etc) _pointers_ around,
rather than do the odd "take the address of local copies"?

  Linus

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Gerald Schaefer

On Thu, 10 Sep 2020 12:10:26 -0300
Jason Gunthorpe  wrote:

> On Thu, Sep 10, 2020 at 03:28:03PM +0200, Gerald Schaefer wrote:
> > On Thu, 10 Sep 2020 10:02:33 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > On Thu, Sep 10, 2020 at 11:39:25AM +0200, Alexander Gordeev wrote:
> > >   
> > > > As Gerald mentioned, it is very difficult to explain in a clear way.
> > > > Hopefully, one could make sense ot of it.
> > > 
> > > I would say the page table API requires this invariant:
> > > 
> > > pud = pud_offset(p4d, addr);
> > > do {
> > >   WARN_ON(pud != pud_offset(p4d, addr);
> > > next = pud_addr_end(addr, end);
> > > } while (pud++, addr = next, addr != end);
> > > 
> > > ie pud++ is supposed to be a shortcut for 
> > >   pud_offset(p4d, next)
> > > 
> > > While S390 does not follow this. Fixing addr_end brings it into
> > > alignment by preventing pud++ from happening.
> > > 
> > > The only currently known side effect is that gup_fast crashes, but it
> > > sure is an unexpected thing.  
> > 
> > It only is unexpected in a "top-level folding" world, see my other reply.
> > Consider it an optimization, which was possible because of how our dynamic
> > folding works, and e.g. because we can determine the correct pagetable
> > level from a pXd value in pXd_offset.  
> 
> No, I disagree. The page walker API the arch presents has to have well
> defined semantics. For instance, there is an effort to define tests
> and invarients for the page table accesses to bring this understanding
> and uniformity:
> 
>  mm/debug_vm_pgtable.c
> 
> If we fix S390 using the pX_addr_end() change then the above should be
> updated with an invariant to check it. I've added Anshuman for some
> thoughts..

We are very aware of those tests, and actually a big supporter of the
idea. Also part of the supported architectures already, and it has
already helped us find / fix some s390 oddities.

However, we did not see any issues wrt to our pagetable walking,
neither with the current version, nor with the new generic approach.
We do currently see other issues, Anshuman will know what I mean :-)

> For better or worse, that invariant does exclude arches from using
> other folding techniques.
> 
> The other solution would be to address the other side of != and adjust
> the pud++
> 
> eg replcae pud++ with something like:
>   pud = pud_next_entry(p4d, pud, next)
> 
> Such that:
>   pud_next_entry(p4d, pud, next) === pud_offset(p4d, next)
> 
> In which case the invarient changes to 'callers can never do pointer
> arithmetic on the result of pXX_offset()' which is a bit harder to
> enforce.

I might have lost track a bit. Are we still talking about possible
functional impacts of either our current pagetable walking with s390
(apart from gup_fast), or the proposed generic change (for s390, or
others?)?

Or is this rather some (other) generic issue / idea that you have,
in order to put "some more structure / enforcement" to generic
pagetable walkers?

Re: [PATCHv7 10/12] arm64: dts: layerscape: Add PCIe EP node for ls1088a

2020-09-10 Thread Rob Herring

On Tue, Aug 11, 2020 at 05:54:39PM +0800, Zhiqiang Hou wrote:
> From: Xiaowei Bao 
> 
> Add PCIe EP node for ls1088a to support EP mode.
> 
> Signed-off-by: Xiaowei Bao 
> Reviewed-by: Andrew Murray 
> Signed-off-by: Hou Zhiqiang 
> ---
> V7:
>  - Rebase the patch without functionality change.
> 
>  .../arm64/boot/dts/freescale/fsl-ls1088a.dtsi | 31 +++
>  1 file changed, 31 insertions(+)
> 
> diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi 
> b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
> index 169f4742ae3b..915592141f1b 100644
> --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
> +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
> @@ -499,6 +499,17 @@
>   status = "disabled";
>   };
>  
> + pcie_ep@340 {

pci-ep@...

> + compatible = "fsl,ls1088a-pcie-ep","fsl,ls-pcie-ep";
> + reg = <0x00 0x0340 0x0 0x0010
> +0x20 0x 0x8 0x>;
> + reg-names = "regs", "addr_space";
> + num-ib-windows = <24>;
> + num-ob-windows = <128>;
> + max-functions = /bits/ 8 <2>;
> + status = "disabled";
> + };
> +
>   pcie@350 {
>   compatible = "fsl,ls1088a-pcie";
>   reg = <0x00 0x0350 0x0 0x0010   /* controller 
> registers */
> @@ -525,6 +536,16 @@
>   status = "disabled";
>   };
>  
> + pcie_ep@350 {
> + compatible = "fsl,ls1088a-pcie-ep","fsl,ls-pcie-ep";
> + reg = <0x00 0x0350 0x0 0x0010
> +0x28 0x 0x8 0x>;
> + reg-names = "regs", "addr_space";
> + num-ib-windows = <6>;
> + num-ob-windows = <8>;
> + status = "disabled";
> + };
> +
>   pcie@360 {
>   compatible = "fsl,ls1088a-pcie";
>   reg = <0x00 0x0360 0x0 0x0010   /* controller 
> registers */
> @@ -551,6 +572,16 @@
>   status = "disabled";
>   };
>  
> + pcie_ep@360 {
> + compatible = "fsl,ls1088a-pcie-ep","fsl,ls-pcie-ep";
> + reg = <0x00 0x0360 0x0 0x0010
> +0x30 0x 0x8 0x>;
> + reg-names = "regs", "addr_space";
> + num-ib-windows = <6>;
> + num-ob-windows = <8>;
> + status = "disabled";
> + };
> +
>   smmu: iommu@500 {
>   compatible = "arm,mmu-500";
>   reg = <0 0x500 0 0x80>;
> -- 
> 2.17.1
>

Re: [PATCH] powerpc/papr_scm: Fix warning triggered by perf_stats_show()

2020-09-10 Thread Ira Weiny

On Thu, Sep 10, 2020 at 02:52:12PM +0530, Vaibhav Jain wrote:
> A warning is reported by the kernel in case perf_stats_show() returns
> an error code. The warning is of the form below:
> 
>  papr_scm ibm,persistent-memory:ibm,pmemory@4411:
> Failed to query performance stats, Err:-10
>  dev_attr_show: perf_stats_show+0x0/0x1c0 [papr_scm] returned bad count
>  fill_read_buffer: dev_attr_show+0x0/0xb0 returned bad count
> 
> On investigation it looks like that the compiler is silently truncating the
> return value of drc_pmem_query_stats() from 'long' to 'int', since the
> variable used to store the return code 'rc' is an 'int'. This
> truncated value is then returned back as a 'ssize_t' back from
> perf_stats_show() to 'dev_attr_show()' which thinks of it as a large
> unsigned number and triggers this warning..
> 
> To fix this we update the type of variable 'rc' from 'int' to
> 'ssize_t' that prevents the compiler from truncating the return value
> of drc_pmem_query_stats() and returning correct signed value back from
> perf_stats_show().
> 
> Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance
>stats from PHYP')
> Signed-off-by: Vaibhav Jain 
> ---
>  arch/powerpc/platforms/pseries/papr_scm.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index a88a707a608aa..9f00b61676ab9 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -785,7 +785,8 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor 
> *nd_desc,
>  static ssize_t perf_stats_show(struct device *dev,
>  struct device_attribute *attr, char *buf)
>  {
> - int index, rc;
> + int index;
> + ssize_t rc;

I'm not sure this is really fixing everything here.

drc_pmem_query_stats() can return negative errno's.  Why are those not checked
somewhere in perf_stats_show()?

It seems like all this fix is handling is a > 0 return value: 'ret[0]' from
line 289 in papr_scm.c...  Or something?

Worse yet drc_pmem_query_stats() is returning ssize_t which is a signed value.
Therefore, it should not be returning -errno.  I'm surprised the static
checkers did not catch that.

I believe I caught similar errors with a patch series before which did not pay
attention to variable types.

Please audit this code for these types of errors and ensure you are really
doing the correct thing when using the sysfs interface.  I'm pretty sure bad
things will eventually happen (if they are not already) if you return some
really big number to the sysfs core from *_show().

Ira

>   struct seq_buf s;
>   struct papr_scm_perf_stat *stat;
>   struct papr_scm_perf_stats *stats;
> -- 
> 2.26.2
>

Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-10 Thread Segher Boessenkool

On Wed, Sep 09, 2020 at 02:33:36PM -0700, Linus Torvalds wrote:
> On Wed, Sep 9, 2020 at 11:42 AM Segher Boessenkool
>  wrote:
> >
> > It will not work like this in GCC, no.  The LLVM people know about that.
> > I do not know why they insist on pushing this, being incompatible and
> > everything.
> 
> Umm. Since they'd be the ones supporting this, *gcc* would be the
> incompatible one, not clang.

This breaks the basic requirements of asm goto.

> So I'd phrase it differently. If gcc is planning on doing some
> different model for asm goto with outputs, that would be the
> incompatible case.

If we will do asm goto with outputs, the asm will still be a jump
instruction!  (It is not in LLVM!)

We probably *can* make asm goto have outputs (jump instructions can have
outputs just fine!  Just output reloads on jump instructions are hard,
because not always they are *possible*; but for asm goto it should be
fine).

Doing as LLVM does, and making the asm a "trapping" instruction, makes
it not a jump insn, and opens up whole new cans of worms (including
inferior code quality).  Since it has very different semantics, and we
might want to keep the semantics of asm goto as well anyway, this should
be called something different ("asm break" or "asm __anything" for
example).

It would be nice if they talked to us about it, too.  LLVM claims it
implements the GCC inline asm extension.  It already only is compatible
for the simplest of cases, but this would be much worse still :-(

> and honestly, (b) is actually inferior for the error cases, even if to
> a compiler person it might feel like the "RightThing(tm)" to do.
> Because when an exception happens, the outputs simply won't be
> initialized.

Sure, that is fine, and quite possible useful, but it is not the same as
asm goto.  asm goto is not some exception handling construct: it is a
jump instruction.

> Anyway, for either of those cases, the kernel won't care either way.
> We'll have to support the non-goto case for many years even if
> everybody were to magically implement it today, so it's not like this
> is a "you have to do it" thing.

Yes.

I'm just annoyed because of all the extra work created by people not
communicating.

Segher

RE: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-10 Thread David Laight




> -Original Message-
> From: Segher Boessenkool 
> Sent: 10 September 2020 16:21
> To: David Laight 
> Cc: 'Christophe Leroy' ; 'Linus Torvalds' 
>  foundation.org>; linux-arch ; Kees Cook 
> ; the
> arch/x86 maintainers ; Nick Desaulniers 
> ; Linux Kernel
> Mailing List ; Alexey Dobriyan 
> ; Luis Chamberlain
> ; Al Viro ; linux-fsdevel 
> ;
> linuxppc-dev ; Christoph Hellwig 
> Subject: Re: remove the last set_fs() in common code, and remove it for x86 
> and powerpc v3
> 
> On Thu, Sep 10, 2020 at 12:26:53PM +, David Laight wrote:
> > Actually this is pretty sound:
> > __label__ label;
> > register int eax asm ("eax");
> > // Ensure eax can't be reloaded from anywhere
> > // In particular it can't be reloaded after the asm goto line
> > asm volatile ("" : "=r" (eax));
> 
> This asm is fine.  It says it writes the "eax" variable, which lives in
> the eax register *in that asm* (so *not* guaranteed after it!).
> 
> > // Provided gcc doesn't save eax here...
> > asm volatile goto ("x" ::: "eax" : label);
> 
> So this is incorrect.

>From the other email:

> It is neither input nor output operand here!  Only *then* is a local
> register asm guaranteed to be in the given reg: as input or output to an
> inline asm.

Ok, so adding '"r" (eax)' to the input section helps a bit.

> > // ... and reload the saved value here.
> > // The input value here will be that modified by the 'asm goto'.
> > // Since this modifies eax it can't be moved before the 'asm goto'.
> > asm volatile ("" : "+r" (eax));
> > // So here eax must contain the value set by the "x" instructions.
> 
> No, the register eax will contain the value of the eax variable.  In the
> asm; it might well be there before or after the asm as well, but none of
> that is guaranteed.

Perhaps not 'guaranteed', but very unlikely to be wrong.
It doesn't give gcc much scope for not generating the desired code.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)

Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-10 Thread Segher Boessenkool

On Thu, Sep 10, 2020 at 12:26:53PM +, David Laight wrote:
> Actually this is pretty sound:
>   __label__ label;
>   register int eax asm ("eax");
>   // Ensure eax can't be reloaded from anywhere
>   // In particular it can't be reloaded after the asm goto line
>   asm volatile ("" : "=r" (eax));

This asm is fine.  It says it writes the "eax" variable, which lives in
the eax register *in that asm* (so *not* guaranteed after it!).

>   // Provided gcc doesn't save eax here...
>   asm volatile goto ("x" ::: "eax" : label);

So this is incorrect.

>   // ... and reload the saved value here.
>   // The input value here will be that modified by the 'asm goto'.
>   // Since this modifies eax it can't be moved before the 'asm goto'.
>   asm volatile ("" : "+r" (eax));
>   // So here eax must contain the value set by the "x" instructions.

No, the register eax will contain the value of the eax variable.  In the
asm; it might well be there before or after the asm as well, but none of
that is guaranteed.

Segher

Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-10 Thread Segher Boessenkool

On Thu, Sep 10, 2020 at 09:26:28AM +, David Laight wrote:
> From: Christophe Leroy
> > Sent: 10 September 2020 09:14
> > 
> > Le 10/09/2020 à 10:04, David Laight a écrit :
> > > From: Linus Torvalds
> > >> Sent: 09 September 2020 22:34
> > >> On Wed, Sep 9, 2020 at 11:42 AM Segher Boessenkool
> > >>  wrote:
> > >>>
> > >>> It will not work like this in GCC, no.  The LLVM people know about that.
> > >>> I do not know why they insist on pushing this, being incompatible and
> > >>> everything.
> > >>
> > >> Umm. Since they'd be the ones supporting this, *gcc* would be the
> > >> incompatible one, not clang.
> > >
> > > I had an 'interesting' idea.
> > >
> > > Can you use a local asm register variable as an input and output to
> > > an 'asm volatile goto' statement?
> > >
> > > Well you can - but is it guaranteed to work :-)
> > >
> > 
> > With gcc at least it should work according to
> > https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html
> > 
> > They even explicitely tell: "The only supported use for this feature is
> > to specify registers for input and output operands when calling Extended
> > asm "
> 
> A quick test isn't good
> 
> int bar(char *z)
> {
> __label__ label;
> register int eax asm ("eax") = 6;
> asm volatile goto (" mov $1, %%eax" ::: "eax" : label);
> 
> label:
> return eax;
> }

It is neither input nor output operand here!  Only *then* is a local
register asm guaranteed to be in the given reg: as input or output to an
inline asm.


Segher

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Jason Gunthorpe

On Thu, Sep 10, 2020 at 03:28:03PM +0200, Gerald Schaefer wrote:
> On Thu, 10 Sep 2020 10:02:33 -0300
> Jason Gunthorpe  wrote:
> 
> > On Thu, Sep 10, 2020 at 11:39:25AM +0200, Alexander Gordeev wrote:
> > 
> > > As Gerald mentioned, it is very difficult to explain in a clear way.
> > > Hopefully, one could make sense ot of it.  
> > 
> > I would say the page table API requires this invariant:
> > 
> > pud = pud_offset(p4d, addr);
> > do {
> > WARN_ON(pud != pud_offset(p4d, addr);
> > next = pud_addr_end(addr, end);
> > } while (pud++, addr = next, addr != end);
> > 
> > ie pud++ is supposed to be a shortcut for 
> >   pud_offset(p4d, next)
> > 
> > While S390 does not follow this. Fixing addr_end brings it into
> > alignment by preventing pud++ from happening.
> > 
> > The only currently known side effect is that gup_fast crashes, but it
> > sure is an unexpected thing.
> 
> It only is unexpected in a "top-level folding" world, see my other reply.
> Consider it an optimization, which was possible because of how our dynamic
> folding works, and e.g. because we can determine the correct pagetable
> level from a pXd value in pXd_offset.

No, I disagree. The page walker API the arch presents has to have well
defined semantics. For instance, there is an effort to define tests
and invarients for the page table accesses to bring this understanding
and uniformity:

 mm/debug_vm_pgtable.c

If we fix S390 using the pX_addr_end() change then the above should be
updated with an invariant to check it. I've added Anshuman for some
thoughts..

For better or worse, that invariant does exclude arches from using
other folding techniques.

The other solution would be to address the other side of != and adjust
the pud++

eg replcae pud++ with something like:
  pud = pud_next_entry(p4d, pud, next)

Such that:
  pud_next_entry(p4d, pud, next) === pud_offset(p4d, next)

In which case the invarient changes to 'callers can never do pointer
arithmetic on the result of pXX_offset()' which is a bit harder to
enforce.

Jason

[PATCH] soc: fsl: dpio: remove set but not used 'addr_cena'

2020-09-10 Thread Jason Yan

This addresses the following gcc warning with "make W=1":

drivers/soc/fsl/dpio/qbman-portal.c: In function
‘qbman_swp_enqueue_multiple_direct’:
drivers/soc/fsl/dpio/qbman-portal.c:650:11: warning: variable
‘addr_cena’ set but not used [-Wunused-but-set-variable]
  650 |  uint64_t addr_cena;
  |   ^

Reported-by: Hulk Robot 
Signed-off-by: Jason Yan 
---
 drivers/soc/fsl/dpio/qbman-portal.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/soc/fsl/dpio/qbman-portal.c 
b/drivers/soc/fsl/dpio/qbman-portal.c
index 0ab85bfb116f..659b4a570d5b 100644
--- a/drivers/soc/fsl/dpio/qbman-portal.c
+++ b/drivers/soc/fsl/dpio/qbman-portal.c
@@ -647,7 +647,6 @@ int qbman_swp_enqueue_multiple_direct(struct qbman_swp *s,
const uint32_t *cl = (uint32_t *)d;
uint32_t eqcr_ci, eqcr_pi, half_mask, full_mask;
int i, num_enqueued = 0;
-   uint64_t addr_cena;
 
spin_lock(>access_spinlock);
half_mask = (s->eqcr.pi_ci_mask>>1);
@@ -701,7 +700,6 @@ int qbman_swp_enqueue_multiple_direct(struct qbman_swp *s,
 
/* Flush all the cacheline without load/store in between */
eqcr_pi = s->eqcr.pi;
-   addr_cena = (size_t)s->addr_cena;
for (i = 0; i < num_enqueued; i++)
eqcr_pi++;
s->eqcr.pi = eqcr_pi & full_mask;
-- 
2.25.4

Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-10 Thread Robin Murphy


On 2020-09-09 21:06, Joe Perches wrote:

fallthrough to a separate case/default label break; isn't very readable.

Convert pseudo-keyword fallthrough; statements to a simple break; when
the next label is case or default and the only statement in the next
label block is break;

Found using:

$ grep-2.5.4 -rP --include=*.[ch] -n 
"fallthrough;(\s*(case\s+\w+|default)\s*:\s*){1,7}break;" *

Miscellanea:

o Move or coalesce a couple label blocks above a default: block.

Signed-off-by: Joe Perches 
---

Compiled allyesconfig x86-64 only.
A few files for other arches were not compiled.



[...]

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index c192544e874b..743db1abec40 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3777,7 +3777,7 @@ static int arm_smmu_device_hw_probe(struct 
arm_smmu_device *smmu)
switch (FIELD_GET(IDR0_TTF, reg)) {
case IDR0_TTF_AARCH32_64:
smmu->ias = 40;
-   fallthrough;
+   break;
case IDR0_TTF_AARCH64:
break;
default:


I have to say I don't really agree with the readability argument for 
this one - a fallthrough is semantically correct here, since the first 
case is a superset of the second. It just happens that anything we would 
do for the common subset is implicitly assumed (there are other 
potential cases we simply haven't added support for at the moment), thus 
the second case is currently empty.


This change actively obfuscates that distinction.

Robin.

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Gerald Schaefer

On Thu, 10 Sep 2020 10:02:33 -0300
Jason Gunthorpe  wrote:

> On Thu, Sep 10, 2020 at 11:39:25AM +0200, Alexander Gordeev wrote:
> 
> > As Gerald mentioned, it is very difficult to explain in a clear way.
> > Hopefully, one could make sense ot of it.  
> 
> I would say the page table API requires this invariant:
> 
> pud = pud_offset(p4d, addr);
> do {
>   WARN_ON(pud != pud_offset(p4d, addr);
> next = pud_addr_end(addr, end);
> } while (pud++, addr = next, addr != end);
> 
> ie pud++ is supposed to be a shortcut for 
>   pud_offset(p4d, next)
> 
> While S390 does not follow this. Fixing addr_end brings it into
> alignment by preventing pud++ from happening.
> 
> The only currently known side effect is that gup_fast crashes, but it
> sure is an unexpected thing.

It only is unexpected in a "top-level folding" world, see my other reply.
Consider it an optimization, which was possible because of how our dynamic
folding works, and e.g. because we can determine the correct pagetable
level from a pXd value in pXd_offset.

> This suggests another fix, which is to say that pud++ is undefined and
> pud_offset() must always be called, but I think that would cause worse
> codegen on all other archs.

There really is nothing to fix for s390 outside of gup_fast, or other
potential future READ_ONCE pagetable walkers. We do take the side-effect
of the generic change on all other pagetable walkers for s390, but it
really is rather a slight degradation than a fix.

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Gerald Schaefer

On Wed, 9 Sep 2020 15:03:24 -0300
Jason Gunthorpe  wrote:

> On Wed, Sep 09, 2020 at 07:25:34PM +0200, Gerald Schaefer wrote:
> > I actually had to draw myself a picture to get some hold of
> > this, or rather a walk-through with a certain pud-crossing
> > range in a folded 3-level scenario. Not sure if I would have
> > understood my explanation above w/o that, but I hope you can
> > make some sense out of it. Or draw yourself a picture :-)
> 
> What I don't understand is how does anything work with S390 today?

That is totally comprehensible :-)

> If the fix is only to change pxx_addr_end() then than generic code
> like mm/pagewalk.c will iterate over a *different list* of page table
> entries. 
> 
> It's choice of entries to look at is entirely driven by pxx_addr_end().
> 
> Which suggest to me that mm/pagewalk.c also doesn't work properly
> today on S390 and this issue is not really about stack variables?

I guess you are confused by the fact that the generic change will indeed
change the logic for _all_ pagetable walkers on s390, not just for
the gup_fast case. But that doesn't mean that they were doing it wrong
before, we simply can do it both ways. However, we probably should
make that (in theory useless) change more explicit.

Let's compare before and after for mm/pagewalk.c on s390, with 3-level
pagetables, range crossing 2 GB pud boundary.

* Before (with pXd_addr_end always using static 5-level PxD_SIZE):

walk_pgd_range()
-> pgd_addr_end() will use static 2^53 PGDIR_SIZE, range is not cropped,
  no iterations needed, passed over to next level

walk_p4d_range()
-> p4d_addr_end() will use static 2^42 P4D_SIZE, range still not cropped

walk_pud_range()
-> pud_addr_end() now we're cropping, with 2^31 PUD_SIZE, need two
  iterations for range crossing pud boundary, doing
  that right here on a pudp which is actually the
  previously passed-through pgdp/p4dp (pointing to
  correct pagetable entry)

* After (with dynamic pXd_addr_end using "correct" PxD_SIZE boundaries,
 should be similar to other archs static "top-level folding"):

walk_pgd_range()
-> pgd_addr_end() will now determine "correct" boundary based on pgd
  value, i.e. 2^31 PUD_SIZE, do cropping now, iteration
  will now happen here

walk_p4d/pud_range()
->  operate on cropped range, will not iterate, instead return to pgd level,
which will then use the same pointer for iteration as in the "Before"
case, but not on the same level.

IMHO, our "Before" logic is more efficient, and also feels more natural.
After all, it is not really necessary to return to pgd level, and it will
surely cost some extra instructions. We are willing to take that cost
for the sake of doing it in a more generic way, hoping that will reduce
future issues. E.g. you already mentioned that you have plans for using
the READ_ONCE logic also in other places, and that would be such a
"future issue".

> Fundamentally if pXX_offset() and pXX_addr_end() must be consistent
> together, if pXX_offset() is folded then pXX_addr_end() must cause a
> single iteration of that level.

well, that sounds correct in theory, but I guess it depends on "how
you fold it". E.g. what does "if pXX_offset() is folded" mean?
Take pgd_offset() for the 3-level case above. From our previous
"middle-level folding/iteration" perspective, I would say that
pgd/p4d are folded into pud, so if you say "if pgd_offset() is folded
then pgd_addr_end() must cause a single iteration of that level",
we were doing it all correctly, i.e only having single iteration
on pgd/p4d level. You could even say that all others are doing /
using it wrong :-)

Now take pgd_offset() from the "top-level folding/iteration".
Here you would say that p4d/pud are folded into pgd, which again
does not sound like the natural / most efficient way to me,
but IIUC this has to be how it works for all other archs with
(static) pagetable folding. Now you'd say "if pud/p4d_offset()
is folded then pud/p4d_addr_end() must cause a single iteration
of that level", and that would sound correct. At least until
you look more closely, because e.g. p4d_addr_end() in
include/asm-generic/pgtable-nop4d.h is simply this:
#define p4d_addr_end(addr, end) (end)

How can that cause a single iteration? It clearly won't, it only
works because the previous pgd_addr_end already cropped the range
so that there will be only single iterations for p4d/pud.

The more I think of it, the more it sounds like s390 "middle-level
folding/iteration" was doing it "the right way", and everybody else
was wrong, or at least not in an optimally efficient way :-) Might
also be that only we could do this because we can determine the
pagetable level from a pagetable entry value.

Anyway, if you are not yet confused enough, I recommend looking
at the other option we had in mind, for fixing the gup_fast issue.
See "Patch 1" from here:

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Jason Gunthorpe

On Thu, Sep 10, 2020 at 11:39:25AM +0200, Alexander Gordeev wrote:

> As Gerald mentioned, it is very difficult to explain in a clear way.
> Hopefully, one could make sense ot of it.

I would say the page table API requires this invariant:

pud = pud_offset(p4d, addr);
do {
WARN_ON(pud != pud_offset(p4d, addr);
next = pud_addr_end(addr, end);
} while (pud++, addr = next, addr != end);

ie pud++ is supposed to be a shortcut for 
  pud_offset(p4d, next)

While S390 does not follow this. Fixing addr_end brings it into
alignment by preventing pud++ from happening.

The only currently known side effect is that gup_fast crashes, but it
sure is an unexpected thing.

This suggests another fix, which is to say that pud++ is undefined and
pud_offset() must always be called, but I think that would cause worse
codegen on all other archs.

Jason

Re: [PATCH v2] powerpc/papr_scm: Limit the readability of 'perf_stats' sysfs attribute

2020-09-10 Thread Michael Ellerman

On Mon, 7 Sep 2020 16:35:40 +0530, Vaibhav Jain wrote:
> The newly introduced 'perf_stats' attribute uses the default access
> mode of 0444 letting non-root users access performance stats of an
> nvdimm and potentially force the kernel into issuing large number of
> expensive HCALLs. Since the information exposed by this attribute
> cannot be cached hence its better to ward of access to this attribute
> from users who don't need to access these performance statistics.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/papr_scm: Limit the readability of 'perf_stats' sysfs attribute
  https://git.kernel.org/powerpc/c/0460534b532e5518c657c7d6492b9337d975eaa3

cheers

Re: [PATCH kernel] powerpc/dma: Fix dma_map_ops::get_required_mask

2020-09-10 Thread Michael Ellerman

On Tue, 8 Sep 2020 11:51:06 +1000, Alexey Kardashevskiy wrote:
> There are 2 problems with it:
> 1. "<" vs expected "<<"
> 2. the shift number is an IOMMU page number mask, not an address mask
> as the IOMMU page shift is missing.
> 
> This did not hit us before f1565c24b596 ("powerpc: use the generic
> dma_ops_bypass mode") because we had there additional code to handle
> bypass mask so this chunk (almost?) never executed. However there
> were reports that aacraid does not work with "iommu=nobypass".
> After f1565c24b596, aacraid (and probably others which call
> dma_get_required_mask() before setting the mask) was unable to
> enable 64bit DMA and fall back to using IOMMU which was known not to work,
> one of the problems is double free of an IOMMU page.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/dma: Fix dma_map_ops::get_required_mask
  https://git.kernel.org/powerpc/c/437ef802e0adc9f162a95213a3488e8646e5fc03

cheers

Re: [PATCH v2] cpuidle-pseries: Fix CEDE latency conversion from tb to us

2020-09-10 Thread Michael Ellerman

On Thu, 3 Sep 2020 14:57:27 +0530, Gautham R. Shenoy wrote:
> commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
> CEDE(0)") sets the exit latency of CEDE(0) based on the latency values
> of the Extended CEDE states advertised by the platform. The values
> advertised by the platform are in timebase ticks. However the cpuidle
> framework requires the latency values in microseconds.
> 
> If the tb-ticks value advertised by the platform correspond to a value
> smaller than 1us, during the conversion from tb-ticks to microseconds,
> in the current code, the result becomes zero. This is incorrect as it
> puts a CEDE state on par with the snooze state.
> 
> [...]

Applied to powerpc/fixes.

[1/1] cpuidle: pseries: Fix CEDE latency conversion from tb to us
  https://git.kernel.org/powerpc/c/1d3ee7df009a46440c58508b8819213c09503acd

cheers

RE: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-10 Thread David Laight

From: David Laight
> Sent: 10 September 2020 10:26
...
> > > I had an 'interesting' idea.
> > >
> > > Can you use a local asm register variable as an input and output to
> > > an 'asm volatile goto' statement?
> > >
> > > Well you can - but is it guaranteed to work :-)
> > >
> >
> > With gcc at least it should work according to
> > https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html
> >
> > They even explicitely tell: "The only supported use for this feature is
> > to specify registers for input and output operands when calling Extended
> > asm "
> 
> A quick test isn't good
> 
> int bar(char *z)
> {
> __label__ label;
> register int eax asm ("eax") = 6;
> asm volatile goto (" mov $1, %%eax" ::: "eax" : label);
> label:
> return eax;
> }
> 
> 0040 :
>   40:   b8 01 00 00 00  mov$0x1,%eax
>   45:   b8 06 00 00 00  mov$0x6,%eax
>   4a:   c3  retq
> 
> although adding:
> asm volatile ("" : "+r" (eax));
> either side of the 'asm volatile goto' does fix it.

Actually this is pretty sound:
__label__ label;
register int eax asm ("eax");
// Ensure eax can't be reloaded from anywhere
// In particular it can't be reloaded after the asm goto line
asm volatile ("" : "=r" (eax));
// Provided gcc doesn't save eax here...
asm volatile goto ("x" ::: "eax" : label);
// ... and reload the saved value here.
// The input value here will be that modified by the 'asm goto'.
// Since this modifies eax it can't be moved before the 'asm goto'.
asm volatile ("" : "+r" (eax));
// So here eax must contain the value set by the "x" instructions.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)

Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-10 Thread Nicolas.Ferre

Joe,

Please drop this chunk: it's a successive controller version number 
which are all backward compatible with "fallthrough" on each case so 
removing from this last one makes it inconsistent.

In sort: NACK for atmel-mci.

Best regards,
   Nicolas


On 09/09/2020 at 22:06, Joe Perches wrote:
> diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c
> index 444bd3a0a922..8324312e4f42 100644
> --- a/drivers/mmc/host/atmel-mci.c
> +++ b/drivers/mmc/host/atmel-mci.c
> @@ -2435,7 +2435,7 @@ static void atmci_get_cap(struct atmel_mci *host)
>  case 0x100:
>  host->caps.has_bad_data_ordering = 0;
>  host->caps.need_reset_after_xfer = 0;
> -   fallthrough;
> +   break;
>  case 0x0:
>  break;
>  default:


-- 
Nicolas Ferre

Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-10 Thread Matthias Brugger





On 09/09/2020 22:06, Joe Perches wrote:

diff --git a/drivers/net/wireless/mediatek/mt7601u/dma.c 
b/drivers/net/wireless/mediatek/mt7601u/dma.c
index 09f931d4598c..778be26d329f 100644
--- a/drivers/net/wireless/mediatek/mt7601u/dma.c
+++ b/drivers/net/wireless/mediatek/mt7601u/dma.c
@@ -193,11 +193,11 @@ static void mt7601u_complete_rx(struct urb *urb)
case -ESHUTDOWN:
case -ENOENT:
return;
+   case 0:
+   break;
default:
dev_err_ratelimited(dev->dev, "rx urb failed: %d\n",
urb->status);
-   fallthrough;
-   case 0:
break;
}
  
@@ -238,11 +238,11 @@ static void mt7601u_complete_tx(struct urb *urb)

case -ESHUTDOWN:
case -ENOENT:
return;
+   case 0:
+   break;
default:
dev_err_ratelimited(dev->dev, "tx urb failed: %d\n",
urb->status);
-   fallthrough;
-   case 0:
break;
}


Reviewed-by: Matthias Brugger

Re: [PATCH v3 3/7] mm/memory_hotplug: prepare passing flags to add_memory() and friends

2020-09-10 Thread Pankaj Gupta

> We soon want to pass flags, e.g., to mark added System RAM resources.
> mergeable. Prepare for that.
>
> This patch is based on a similar patch by Oscar Salvador:
>
> https://lkml.kernel.org/r/20190625075227.15193-3-osalva...@suse.de
>
> Acked-by: Wei Liu 
> Reviewed-by: Juergen Gross  # Xen related part
> Cc: Andrew Morton 
> Cc: Michal Hocko 
> Cc: Dan Williams 
> Cc: Jason Gunthorpe 
> Cc: Pankaj Gupta 
> Cc: Baoquan He 
> Cc: Wei Yang 
> Cc: Michael Ellerman 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: "Rafael J. Wysocki" 
> Cc: Len Brown 
> Cc: Greg Kroah-Hartman 
> Cc: Vishal Verma 
> Cc: Dave Jiang 
> Cc: "K. Y. Srinivasan" 
> Cc: Haiyang Zhang 
> Cc: Stephen Hemminger 
> Cc: Wei Liu 
> Cc: Heiko Carstens 
> Cc: Vasily Gorbik 
> Cc: Christian Borntraeger 
> Cc: David Hildenbrand 
> Cc: "Michael S. Tsirkin" 
> Cc: Jason Wang 
> Cc: Boris Ostrovsky 
> Cc: Juergen Gross 
> Cc: Stefano Stabellini 
> Cc: "Oliver O'Halloran" 
> Cc: Pingfan Liu 
> Cc: Nathan Lynch 
> Cc: Libor Pechacek 
> Cc: Anton Blanchard 
> Cc: Leonardo Bras 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-a...@vger.kernel.org
> Cc: linux-nvd...@lists.01.org
> Cc: linux-hyp...@vger.kernel.org
> Cc: linux-s...@vger.kernel.org
> Cc: virtualizat...@lists.linux-foundation.org
> Cc: xen-de...@lists.xenproject.org
> Signed-off-by: David Hildenbrand 
> ---
>  arch/powerpc/platforms/powernv/memtrace.c   |  2 +-
>  arch/powerpc/platforms/pseries/hotplug-memory.c |  2 +-
>  drivers/acpi/acpi_memhotplug.c  |  3 ++-
>  drivers/base/memory.c   |  3 ++-
>  drivers/dax/kmem.c  |  2 +-
>  drivers/hv/hv_balloon.c |  2 +-
>  drivers/s390/char/sclp_cmd.c|  2 +-
>  drivers/virtio/virtio_mem.c |  2 +-
>  drivers/xen/balloon.c   |  2 +-
>  include/linux/memory_hotplug.h  | 16 
>  mm/memory_hotplug.c | 14 +++---
>  11 files changed, 30 insertions(+), 20 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
> b/arch/powerpc/platforms/powernv/memtrace.c
> index 13b369d2cc454..6828108486f83 100644
> --- a/arch/powerpc/platforms/powernv/memtrace.c
> +++ b/arch/powerpc/platforms/powernv/memtrace.c
> @@ -224,7 +224,7 @@ static int memtrace_online(void)
> ent->mem = 0;
> }
>
> -   if (add_memory(ent->nid, ent->start, ent->size)) {
> +   if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) {
> pr_err("Failed to add trace memory to node %d\n",
> ent->nid);
> ret += 1;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 0ea976d1cac47..e1c9fa0d730f5 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -615,7 +615,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
> nid = memory_add_physaddr_to_nid(lmb->base_addr);
>
> /* Add the memory */
> -   rc = __add_memory(nid, lmb->base_addr, block_sz);
> +   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE);
> if (rc) {
> invalidate_lmb_associativity_index(lmb);
> return rc;
> diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
> index e294f44a78504..2067c3bc55763 100644
> --- a/drivers/acpi/acpi_memhotplug.c
> +++ b/drivers/acpi/acpi_memhotplug.c
> @@ -207,7 +207,8 @@ static int acpi_memory_enable_device(struct 
> acpi_memory_device *mem_device)
> if (node < 0)
> node = memory_add_physaddr_to_nid(info->start_addr);
>
> -   result = __add_memory(node, info->start_addr, info->length);
> +   result = __add_memory(node, info->start_addr, info->length,
> + MHP_NONE);
>
> /*
>  * If the memory block has been used by the kernel, 
> add_memory()
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 4db3c660de831..b4c297dd04755 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -432,7 +432,8 @@ static ssize_t probe_store(struct device *dev, struct 
> device_attribute *attr,
>
> nid = memory_add_physaddr_to_nid(phys_addr);
> ret = __add_memory(nid, phys_addr,
> -  MIN_MEMORY_BLOCK_SIZE * sections_per_block);
> +  MIN_MEMORY_BLOCK_SIZE * sections_per_block,
> +  MHP_NONE);
>
> if (ret)
> goto out;
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index 7dcb2902e9b1b..896cb9444e727 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -95,7 +95,7 @@ int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>

Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-10 Thread Ilya Dryomov

On Wed, Sep 9, 2020 at 10:10 PM Joe Perches  wrote:
>
> fallthrough to a separate case/default label break; isn't very readable.
>
> Convert pseudo-keyword fallthrough; statements to a simple break; when
> the next label is case or default and the only statement in the next
> label block is break;
>
> Found using:
>
> $ grep-2.5.4 -rP --include=*.[ch] -n 
> "fallthrough;(\s*(case\s+\w+|default)\s*:\s*){1,7}break;" *
>
> Miscellanea:
>
> o Move or coalesce a couple label blocks above a default: block.
>
> Signed-off-by: Joe Perches 
> ---
>
> Compiled allyesconfig x86-64 only.
> A few files for other arches were not compiled.
>
>  arch/arm/mach-mmp/pm-pxa910.c |  2 +-
>  arch/arm64/kvm/handle_exit.c  |  2 +-
>  arch/mips/kernel/cpu-probe.c  |  2 +-
>  arch/mips/math-emu/cp1emu.c   |  2 +-
>  arch/s390/pci/pci.c   |  2 +-
>  crypto/tcrypt.c   |  4 ++--
>  drivers/ata/sata_mv.c |  2 +-
>  drivers/atm/lanai.c   |  2 +-
>  drivers/gpu/drm/i915/display/intel_sprite.c   |  2 +-
>  drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmi.c   |  2 +-
>  drivers/hid/wacom_wac.c   |  2 +-
>  drivers/i2c/busses/i2c-i801.c |  2 +-
>  drivers/infiniband/ulp/rtrs/rtrs-clt.c| 14 +++---
>  drivers/infiniband/ulp/rtrs/rtrs-srv.c|  6 +++---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  2 +-
>  drivers/irqchip/irq-vic.c |  4 ++--
>  drivers/md/dm.c   |  2 +-
>  drivers/media/dvb-frontends/drxd_hard.c   |  2 +-
>  drivers/media/i2c/ov5640.c|  2 +-
>  drivers/media/i2c/ov6650.c|  5 ++---
>  drivers/media/i2c/smiapp/smiapp-core.c|  2 +-
>  drivers/media/i2c/tvp5150.c   |  2 +-
>  drivers/media/pci/ddbridge/ddbridge-core.c|  2 +-
>  drivers/media/usb/cpia2/cpia2_core.c  |  2 +-
>  drivers/mfd/iqs62x.c  |  3 +--
>  drivers/mmc/host/atmel-mci.c  |  2 +-
>  drivers/mtd/nand/raw/nandsim.c|  2 +-
>  drivers/net/ethernet/intel/e1000e/phy.c   |  2 +-
>  drivers/net/ethernet/intel/fm10k/fm10k_pf.c   |  2 +-
>  drivers/net/ethernet/intel/i40e/i40e_adminq.c |  2 +-
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  2 +-
>  drivers/net/ethernet/intel/iavf/iavf_txrx.c   |  2 +-
>  drivers/net/ethernet/intel/igb/e1000_phy.c|  2 +-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c|  2 +-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  2 +-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c|  2 +-
>  drivers/net/ethernet/intel/ixgbevf/vf.c   |  2 +-
>  drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c |  2 +-
>  drivers/net/ethernet/qlogic/qed/qed_mcp.c |  2 +-
>  drivers/net/ethernet/sfc/falcon/farch.c   |  2 +-
>  drivers/net/ethernet/sfc/farch.c  |  2 +-
>  drivers/net/phy/adin.c|  3 +--
>  drivers/net/usb/pegasus.c |  4 ++--
>  drivers/net/usb/usbnet.c  |  2 +-
>  drivers/net/wireless/ath/ath5k/eeprom.c   |  2 +-
>  drivers/net/wireless/mediatek/mt7601u/dma.c   |  8 
>  drivers/nvme/host/core.c  | 12 ++--
>  drivers/pcmcia/db1xxx_ss.c|  4 ++--
>  drivers/power/supply/abx500_chargalg.c|  2 +-
>  drivers/power/supply/charger-manager.c|  2 +-
>  drivers/rtc/rtc-pcf85063.c|  2 +-
>  drivers/s390/scsi/zfcp_fsf.c  |  2 +-
>  drivers/scsi/aic7xxx/aic79xx_core.c   |  4 ++--
>  drivers/scsi/aic94xx/aic94xx_tmf.c|  2 +-
>  drivers/scsi/lpfc/lpfc_sli.c  |  2 +-
>  drivers/scsi/smartpqi/smartpqi_init.c |  2 +-
>  drivers/scsi/sr.c |  2 +-
>  drivers/tty/serial/sunsu.c|  2 +-
>  drivers/tty/serial/sunzilog.c |  2 +-
>  drivers/tty/vt/vt_ioctl.c |  2 +-
>  drivers/usb/dwc3/core.c   |  2 +-
>  drivers/usb/gadget/legacy/inode.c |  2 +-
>  drivers/usb/gadget/udc/pxa25x_udc.c

Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-10 Thread Steffen Maier


On 9/9/20 10:06 PM, Joe Perches wrote:

fallthrough to a separate case/default label break; isn't very readable.

Convert pseudo-keyword fallthrough; statements to a simple break; when
the next label is case or default and the only statement in the next
label block is break;

Found using:

$ grep-2.5.4 -rP --include=*.[ch] -n 
"fallthrough;(\s*(case\s+\w+|default)\s*:\s*){1,7}break;" *

Miscellanea:

o Move or coalesce a couple label blocks above a default: block.

Signed-off-by: Joe Perches 
---

Compiled allyesconfig x86-64 only.
A few files for other arches were not compiled.



  drivers/s390/scsi/zfcp_fsf.c  |  2 +-



  82 files changed, 109 insertions(+), 112 deletions(-)



diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index 140186fe1d1e..2741a07df692 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -2105,7 +2105,7 @@ static void zfcp_fsf_open_lun_handler(struct zfcp_fsf_req 
*req)
  
  	case FSF_PORT_HANDLE_NOT_VALID:

zfcp_erp_adapter_reopen(adapter, 0, "fsouh_1");
-   fallthrough;
+   break;
case FSF_LUN_ALREADY_OPEN:
break;
case FSF_PORT_BOXED:


Acked-by: Steffen Maier  # for zfcp


--
Mit freundlichen Gruessen / Kind regards
Steffen Maier

Linux on IBM Z Development

https://www.ibm.com/privacy/us/en/
IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Matthias Hartmann
Geschaeftsfuehrung: Dirk Wittkopp
Sitz der Gesellschaft: Boeblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294

Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-10 Thread Alexander Gordeev

On Wed, Sep 09, 2020 at 03:03:24PM -0300, Jason Gunthorpe wrote:
> On Wed, Sep 09, 2020 at 07:25:34PM +0200, Gerald Schaefer wrote:
> > I actually had to draw myself a picture to get some hold of
> > this, or rather a walk-through with a certain pud-crossing
> > range in a folded 3-level scenario. Not sure if I would have
> > understood my explanation above w/o that, but I hope you can
> > make some sense out of it. Or draw yourself a picture :-)
> 
> What I don't understand is how does anything work with S390 today?
> 
> If the fix is only to change pxx_addr_end() then than generic code
> like mm/pagewalk.c will iterate over a *different list* of page table
> entries. 
> 
> It's choice of entries to look at is entirely driven by pxx_addr_end().
> 
> Which suggest to me that mm/pagewalk.c also doesn't work properly
> today on S390 and this issue is not really about stack variables?
> 
> Fundamentally if pXX_offset() and pXX_addr_end() must be consistent
> together, if pXX_offset() is folded then pXX_addr_end() must cause a
> single iteration of that level.

Your observation is correct.

Another way to describe the problem is existing pXd_addr_end helpers
could be applied to mismatching levels on s390 (e.g p4d_addr_end
applied to pud or pgd_addr_end applied to p4d). As you noticed,
all *_pXd_range iterators could be called with address ranges that
exceed single pXd table.

However, when it happens with pointers to real page tables (passed to
*_pXd_range iterators) we still operate on valid tables, which just
(lucky for us) happened to be folded. Thus we still reference correct
table entries.

It is only gup_fast case that exposes the issue. It hits because
pointers to stack copies are passed to gup_pXd_range iterators, not
pointers to real page tables itself.

As Gerald mentioned, it is very difficult to explain in a clear way.
Hopefully, one could make sense ot of it.

> Jason

RE: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-10 Thread David Laight

From: Christophe Leroy
> Sent: 10 September 2020 09:14
> 
> Le 10/09/2020 à 10:04, David Laight a écrit :
> > From: Linus Torvalds
> >> Sent: 09 September 2020 22:34
> >> On Wed, Sep 9, 2020 at 11:42 AM Segher Boessenkool
> >>  wrote:
> >>>
> >>> It will not work like this in GCC, no.  The LLVM people know about that.
> >>> I do not know why they insist on pushing this, being incompatible and
> >>> everything.
> >>
> >> Umm. Since they'd be the ones supporting this, *gcc* would be the
> >> incompatible one, not clang.
> >
> > I had an 'interesting' idea.
> >
> > Can you use a local asm register variable as an input and output to
> > an 'asm volatile goto' statement?
> >
> > Well you can - but is it guaranteed to work :-)
> >
> 
> With gcc at least it should work according to
> https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html
> 
> They even explicitely tell: "The only supported use for this feature is
> to specify registers for input and output operands when calling Extended
> asm "

A quick test isn't good

int bar(char *z)
{
__label__ label;
register int eax asm ("eax") = 6;
asm volatile goto (" mov $1, %%eax" ::: "eax" : label);

label:
return eax;
}

0040 :
  40:   b8 01 00 00 00  mov$0x1,%eax
  45:   b8 06 00 00 00  mov$0x6,%eax
  4a:   c3  retq

although adding:
asm volatile ("" : "+r" (eax));
either side of the 'asm volatile goto' does fix it.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)

[PATCH] powerpc/papr_scm: Fix warning triggered by perf_stats_show()

2020-09-10 Thread Vaibhav Jain

A warning is reported by the kernel in case perf_stats_show() returns
an error code. The warning is of the form below:

 papr_scm ibm,persistent-memory:ibm,pmemory@4411:
  Failed to query performance stats, Err:-10
 dev_attr_show: perf_stats_show+0x0/0x1c0 [papr_scm] returned bad count
 fill_read_buffer: dev_attr_show+0x0/0xb0 returned bad count

On investigation it looks like that the compiler is silently truncating the
return value of drc_pmem_query_stats() from 'long' to 'int', since the
variable used to store the return code 'rc' is an 'int'. This
truncated value is then returned back as a 'ssize_t' back from
perf_stats_show() to 'dev_attr_show()' which thinks of it as a large
unsigned number and triggers this warning..

To fix this we update the type of variable 'rc' from 'int' to
'ssize_t' that prevents the compiler from truncating the return value
of drc_pmem_query_stats() and returning correct signed value back from
perf_stats_show().

Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance
   stats from PHYP')
Signed-off-by: Vaibhav Jain 
---
 arch/powerpc/platforms/pseries/papr_scm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index a88a707a608aa..9f00b61676ab9 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -785,7 +785,8 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor 
*nd_desc,
 static ssize_t perf_stats_show(struct device *dev,
   struct device_attribute *attr, char *buf)
 {
-   int index, rc;
+   int index;
+   ssize_t rc;
struct seq_buf s;
struct papr_scm_perf_stat *stat;
struct papr_scm_perf_stats *stats;
-- 
2.26.2

[PATCH v3 3/7] mm/memory_hotplug: prepare passing flags to add_memory() and friends

2020-09-10 Thread David Hildenbrand

We soon want to pass flags, e.g., to mark added System RAM resources.
mergeable. Prepare for that.

This patch is based on a similar patch by Oscar Salvador:

https://lkml.kernel.org/r/20190625075227.15193-3-osalva...@suse.de

Acked-by: Wei Liu 
Reviewed-by: Juergen Gross  # Xen related part
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Dan Williams 
Cc: Jason Gunthorpe 
Cc: Pankaj Gupta 
Cc: Baoquan He 
Cc: Wei Yang 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Greg Kroah-Hartman 
Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Wei Liu 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
Cc: David Hildenbrand 
Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: "Oliver O'Halloran" 
Cc: Pingfan Liu 
Cc: Nathan Lynch 
Cc: Libor Pechacek 
Cc: Anton Blanchard 
Cc: Leonardo Bras 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-a...@vger.kernel.org
Cc: linux-nvd...@lists.01.org
Cc: linux-hyp...@vger.kernel.org
Cc: linux-s...@vger.kernel.org
Cc: virtualizat...@lists.linux-foundation.org
Cc: xen-de...@lists.xenproject.org
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c   |  2 +-
 arch/powerpc/platforms/pseries/hotplug-memory.c |  2 +-
 drivers/acpi/acpi_memhotplug.c  |  3 ++-
 drivers/base/memory.c   |  3 ++-
 drivers/dax/kmem.c  |  2 +-
 drivers/hv/hv_balloon.c |  2 +-
 drivers/s390/char/sclp_cmd.c|  2 +-
 drivers/virtio/virtio_mem.c |  2 +-
 drivers/xen/balloon.c   |  2 +-
 include/linux/memory_hotplug.h  | 16 
 mm/memory_hotplug.c | 14 +++---
 11 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index 13b369d2cc454..6828108486f83 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -224,7 +224,7 @@ static int memtrace_online(void)
ent->mem = 0;
}
 
-   if (add_memory(ent->nid, ent->start, ent->size)) {
+   if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) {
pr_err("Failed to add trace memory to node %d\n",
ent->nid);
ret += 1;
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 0ea976d1cac47..e1c9fa0d730f5 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -615,7 +615,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
nid = memory_add_physaddr_to_nid(lmb->base_addr);
 
/* Add the memory */
-   rc = __add_memory(nid, lmb->base_addr, block_sz);
+   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index e294f44a78504..2067c3bc55763 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -207,7 +207,8 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
if (node < 0)
node = memory_add_physaddr_to_nid(info->start_addr);
 
-   result = __add_memory(node, info->start_addr, info->length);
+   result = __add_memory(node, info->start_addr, info->length,
+ MHP_NONE);
 
/*
 * If the memory block has been used by the kernel, add_memory()
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 4db3c660de831..b4c297dd04755 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -432,7 +432,8 @@ static ssize_t probe_store(struct device *dev, struct 
device_attribute *attr,
 
nid = memory_add_physaddr_to_nid(phys_addr);
ret = __add_memory(nid, phys_addr,
-  MIN_MEMORY_BLOCK_SIZE * sections_per_block);
+  MIN_MEMORY_BLOCK_SIZE * sections_per_block,
+  MHP_NONE);
 
if (ret)
goto out;
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 7dcb2902e9b1b..896cb9444e727 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -95,7 +95,7 @@ int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 * this as RAM automatically.
 */
rc = add_memory_driver_managed(numa_node, range.start,
-   range_len(), kmem_name);
+   range_len(),

Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-10 Thread Felipe Balbi

Hi,

Joe Perches  writes:
>  drivers/usb/dwc3/core.c   |  2 +-
>  drivers/usb/gadget/legacy/inode.c |  2 +-
>  drivers/usb/gadget/udc/pxa25x_udc.c   |  4 ++--
>  drivers/usb/phy/phy-fsl-usb.c |  2 +-

for the drivers above:

Acked-by: Felipe Balbi 

-- 
balbi


signature.asc
Description: PGP signature

Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-10 Thread Christophe Leroy





Le 10/09/2020 à 10:04, David Laight a écrit :

From: Linus Torvalds

Sent: 09 September 2020 22:34
On Wed, Sep 9, 2020 at 11:42 AM Segher Boessenkool
 wrote:


It will not work like this in GCC, no.  The LLVM people know about that.
I do not know why they insist on pushing this, being incompatible and
everything.


Umm. Since they'd be the ones supporting this, *gcc* would be the
incompatible one, not clang.


I had an 'interesting' idea.

Can you use a local asm register variable as an input and output to
an 'asm volatile goto' statement?

Well you can - but is it guaranteed to work :-)



With gcc at least it should work according to 
https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html


They even explicitely tell: "The only supported use for this feature is 
to specify registers for input and output operands when calling Extended 
asm "


Christophe

RE: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-10 Thread David Laight

From: Linus Torvalds
> Sent: 09 September 2020 22:34
> On Wed, Sep 9, 2020 at 11:42 AM Segher Boessenkool
>  wrote:
> >
> > It will not work like this in GCC, no.  The LLVM people know about that.
> > I do not know why they insist on pushing this, being incompatible and
> > everything.
> 
> Umm. Since they'd be the ones supporting this, *gcc* would be the
> incompatible one, not clang.

I had an 'interesting' idea.

Can you use a local asm register variable as an input and output to
an 'asm volatile goto' statement?

Well you can - but is it guaranteed to work :-)

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)

Re: [PATCH v2] powerpc: Warn about use of smt_snooze_delay

2020-09-10 Thread Michael Ellerman

Michael Ellerman  writes:
> On Tue, 30 Jun 2020 11:29:35 +0930, Joel Stanley wrote:
>> It's not done anything for a long time. Save the percpu variable, and
>> emit a warning to remind users to not expect it to do anything.
>> 
>> Fixes: 3fa8cad82b94 ("powerpc/pseries/cpuidle: smt-snooze-delay cleanup.")
>> Cc: sta...@vger.kernel.org # v3.14
>> Signed-off-by: Joel Stanley 
>> --
>> v2:
>>  Use pr_warn instead of WARN
>>  Reword and print proccess name with pid in message
>>  Leave CPU_FTR_SMT test in
>>  Add Fixes line
>> 
>> [...]
>
> Applied to powerpc/next.
>
> [1/1] powerpc: Warn about use of smt_snooze_delay
>   
> https://git.kernel.org/powerpc/c/a02f6d42357acf6e5de6ffc728e6e77faf3ad217

I applied v3 actually.

cheers

Re: [PATCH] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal

2020-09-10 Thread Michael Ellerman

Michael Ellerman  writes:
> On Tue, 28 Jul 2020 12:37:41 -0500, Nathan Lynch wrote:
>> The drmem lmb list can have hundreds of thousands of entries, and
>> unfortunately lookups take the form of linear searches. As long as
>> this is the case, traversals have the potential to monopolize the CPU
>> and provoke lockup reports, workqueue stalls, and the like unless
>> they explicitly yield.
>> 
>> Rather than placing cond_resched() calls within various
>> for_each_drmem_lmb() loop blocks in the code, put it in the iteration
>> expression of the loop macro itself so users can't omit it.
>
> Applied to powerpc/next.
>
> [1/1] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal
>   
> https://git.kernel.org/powerpc/c/9d6792ffe140240ae54c881cc4183f9acc24b4df

Some script gremlins here, I actually applied v3 (only once).

cheers

Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-10 Thread Wolfram Sang


> diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
> index e32ef3f01fe8..b13b1cbcac29 100644
> --- a/drivers/i2c/busses/i2c-i801.c
> +++ b/drivers/i2c/busses/i2c-i801.c
> @@ -1785,7 +1785,7 @@ static int i801_probe(struct pci_dev *dev, const struct 
> pci_device_id *id)
>   fallthrough;
>   case PCI_DEVICE_ID_INTEL_82801CA_3:
>   priv->features |= FEATURE_HOST_NOTIFY;
> - fallthrough;
> + break;
>   case PCI_DEVICE_ID_INTEL_82801BA_2:
>   case PCI_DEVICE_ID_INTEL_82801AB_3:
>   case PCI_DEVICE_ID_INTEL_82801AA_3:

I am not the maintainer (Jean is) but I suggest to drop this hunk. The
code is more complex with multiple 'fallthrough', so this change alone
actually makes the code inconsistent. A rework would need a seperate
patch.



signature.asc
Description: PGP signature

Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-10 Thread Mauro Carvalho Chehab

Em Wed, 09 Sep 2020 13:06:39 -0700
Joe Perches  escreveu:

> fallthrough to a separate case/default label break; isn't very readable.
> 
> Convert pseudo-keyword fallthrough; statements to a simple break; when
> the next label is case or default and the only statement in the next
> label block is break;
> 
> Found using:
> 
> $ grep-2.5.4 -rP --include=*.[ch] -n 
> "fallthrough;(\s*(case\s+\w+|default)\s*:\s*){1,7}break;" *
> 
> Miscellanea:
> 
> o Move or coalesce a couple label blocks above a default: block.
> 
> Signed-off-by: Joe Perches 
> ---
> 
> Compiled allyesconfig x86-64 only.
> A few files for other arches were not compiled.
> 


>  drivers/media/dvb-frontends/drxd_hard.c   |  2 +-
>  drivers/media/i2c/ov5640.c|  2 +-
>  drivers/media/i2c/ov6650.c|  5 ++---
>  drivers/media/i2c/smiapp/smiapp-core.c|  2 +-
>  drivers/media/i2c/tvp5150.c   |  2 +-
>  drivers/media/pci/ddbridge/ddbridge-core.c|  2 +-
>  drivers/media/usb/cpia2/cpia2_core.c  |  2 +-

For media drivers:

Reviewed-by: Mauro Carvalho Chehab 


Thanks,
Mauro

[PATCH 9/9] powerpc/eeh: Clean up PE addressing

2020-09-10 Thread Oliver O'Halloran

At some point in the distant past we only supported EEH on pseries. The
various EEH RTAS call use the "PE config address" as a handle to the PE
being manipulated so we need to find that address a PE.

There's three ways to determine the address of a PE starting from a device
inside of that PE: The old way, which requires traversing the DT until you
find a built-in device and finding its PE using the
ibm,read-slot-reset-state2 RTAS call. The new way, which requires using the
ibm,get-config-addr-info RTAS call to map the device address to a PE
address. And then there's the Linux way, which is "blindly assume the
device and PE addresses are the same." Naturally, PAPR doesn't recommend
(or even mention) that last one, but it's a technique
Linux on Power has used since the dawn of time.

Most (all?) modern pseries systems will provide the get-addr-info RTAS call
so Linux uses that for everything except for the initial ibm,eeh-set-option
RTAS call to enable EEH on the PE. The Linux way is still broken in that
case, but it seems to work so who maybe firmware have hacks to support it,
who knows. For systems that don't support the RTAS call we'll use the Linux
way as a fallback. For some reason we don't just use the fallback address
to initialise eeh_dev->addr and instead eeh_pe has two addresses and we'll
choose which one to use at runtime. This results in code that looks
something like:

config_addr = pe->config_addr;
if (pe->addr)
config_addr = pe->addr;

rtas_call(..., config_addr, ...);

In other words, if the result of the RTAS call is non-zero then Linux will
use that as the pe address. If not, it falls back to using the config_addr.
It's worth pointing out that both fields here used to be part of pci_dn and
ended up in eeh_pe after a while.

Storing both addresses in eeh_pe doesn't really make a whole lot of sense.
Why does the eeh_pe structure, which is platform independent, have two
addresses baked into it for the sake of a pseries platform quirk? Why
doesn't the pseries platform handle determining what the "correct" PE
address is during EEH initialisation for the device and just pass that
address to the EEH core? What does it even mean for a pe to have an
"address" if there's two of them?

There are no good answers to these questions; especially that last one. The
EEH core makes a token effort to support looking up a PE by either address
by having two arguments to eeh_pe_get(). However, a survey of all the
callers to eeh_pe_get() shows that all except one hard-code zero as the
config_addr argument. The only one that doesn't is in eeh_pe_tree_insert()a
which looks like this:

if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr)
return -EINVAL;

pe = eeh_pe_get(hose, edev->pe_config_addr, edev->bdfn);

The third argument (config_addr) is only used if the second (pe->addr)
argument is invalid. In this case that would require edev->pe_config_addr
to be zero and the EEH_VALID_PE_ZERO flag to be unset. The preceding check
ensure that can never be true so there is no situation where eeh_pe_get()
will search for a PE with the specified pe->config_addr.

Similarly, on pseries the EEH_VALID_PE_ZERO flag isn't set so the check
above also ensures that there will never be a PE with pe->addr == 0. As a
result all the logic to choose whether we pass pe->config_addr or pe->addr
to an RTAS call is also dead code. The pe->config_addr will never be used
since pe->addr must be non-zero. Otherwise it wouldn't be in the PE tree.

This patch tries to clean up this mess by:

1) Removing pe->config_addr
2) Removing the EEH_VALID_PE_ZERO flag
3) Removing the fallback address argument to eeh_pe_get().
4) Removing all the checks for pe->addr being zero in the pseries EEH code.

This leaves us with PE's only being identified by what's in their pe->addr
field and relying on the platform to ensure that eeh_dev's are only
inserted into the EEH tree if they're actually inside a PE.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/eeh.h   |  4 +-
 arch/powerpc/kernel/eeh.c|  2 +-
 arch/powerpc/kernel/eeh_pe.c | 46 +++-
 arch/powerpc/platforms/powernv/eeh-powernv.c | 16 ++-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 42 +++---
 5 files changed, 17 insertions(+), 93 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 85030c05e67e..dd6a4ac6c713 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -27,7 +27,6 @@ struct pci_dn;
 #define EEH_FORCE_DISABLED 0x02/* EEH disabled  */
 #define EEH_PROBE_MODE_DEV 0x04/* From PCI device   */
 #define EEH_PROBE_MODE_DEVTREE 0x08/* From device tree  */
-#define EEH_VALID_PE_ZERO  0x10/* PE#0 is valid */
 #define EEH_ENABLE_IO_FOR_LOG  0x20/*

[PATCH 8/9] powerpc/pseries/eeh: Allow zero to be a valid PE configuration address

2020-09-10 Thread Oliver O'Halloran

There's no real reason why zero can't be a valid PE configuration address.
Under qemu each sPAPR PHB (i.e. EEH supporting) has the passed-though
devices on bus zero, so the PE address of bus :00 should be zero.

However, all previous versions of Linux will reject that, so Qemu at least
goes out of it's way to avoid it. The Qemu implementation of
ibm,get-config-addr-info2 RTAS has the following comment:

> /*
>  * We always have PE address of form "00BB0001". "BB"
>  * represents the bus number of PE's primary bus.
>  */

So qemu puts a one into the register portion of the PE's config_addr to
avoid it being zero. The whole is pretty silly considering that RTAS will
return a negative error code if it can't map the device's config_addr to a
PE's.

Fix Linux to treat zero as a valid PE address. This shouldn't have any real
effects due to the Qemu hack mentioned above, and the fact that this has
worked historically on PowerVM means they never pass through devices on bus
zero.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/pseries/eeh_pseries.c | 38 +++-
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index c2ecc0db2f94..e42c026392aa 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -87,21 +87,20 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
  * pseries_eeh_get_pe_config_addr - Find the pe_config_addr for a device
  * @pdn: pci_dn of the input device
  *
- * Retrieve the assocated config address. Actually, there're 2 RTAS
- * function calls dedicated for the purpose. We need implement
- * it through the new function and then the old one. Besides,
- * you should make sure the config address is figured out from
- * FDT node before calling the function.
+ * The EEH RTAS calls use a tuple consisting of: (buid_hi, buid_lo,
+ * pe_config_addr) as a handle to a given PE. This function finds the
+ * pe_config_addr based on the device's config addr.
  *
- * It's notable that zero'ed return value means invalid PE config
- * address.
+ * Keep in mind that the pe_config_addr *might* be numerically identical to the
+ * device's config addr, but the two are conceptually distinct.
+ *
+ * Returns the pe_config_addr, or a negative error code.
  */
 static int pseries_eeh_get_pe_config_addr(struct pci_dn *pdn)
 {
int config_addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
struct pci_controller *phb = pdn->phb;
-   int ret = 0;
-   int rets[3];
+   int ret, rets[3];
 
if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
/*
@@ -112,16 +111,16 @@ static int pseries_eeh_get_pe_config_addr(struct pci_dn 
*pdn)
config_addr, BUID_HI(phb->buid),
BUID_LO(phb->buid), 1);
if (ret || (rets[0] == 0))
-   return 0;
+   return -ENOENT;
 
-   /* Retrieve the associated PE config address */
+   /* Retrieve the associated PE config address with function 0 */
ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
config_addr, BUID_HI(phb->buid),
BUID_LO(phb->buid), 0);
if (ret) {
pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
__func__, phb->global_number, config_addr);
-   return 0;
+   return -ENXIO;
}
 
return rets[0];
@@ -134,13 +133,20 @@ static int pseries_eeh_get_pe_config_addr(struct pci_dn 
*pdn)
if (ret) {
pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
__func__, phb->global_number, config_addr);
-   return 0;
+   return -ENXIO;
}
 
return rets[0];
}
 
-   return ret;
+   /*
+* PAPR does describe a process for finding the pe_config_addr that was
+* used before the ibm,get-config-addr-info calls were added. However,
+* I haven't found *any* systems that don't have that RTAS call
+* implemented. If you happen to find one that needs the old DT based
+* process, patches are welcome!
+*/
+   return -ENOENT;
 }
 
 /**
@@ -419,7 +425,7 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
 
/* first up, find the pe_config_addr for the PE containing the device */
addr = pseries_eeh_get_pe_config_addr(pdn);
-   if (addr == 0) {
+   if (addr < 0) {
eeh_edev_dbg(edev, "Unable to find pe_config_addr\n");
goto err;
}
@@ -901,7 +907,7 @@ static int __init eeh_pseries_init(void)
config_addr =

77 matches

Mail list logo