[powerpc:next-test 9/109] arch/powerpc/include/asm/paca.h:155:23: error: field has incomplete type 'struct tlb_core_data'

2023-08-21 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
next-test
head:   0f71dcfb4aef6043da6cc509e7a7f6a3ae87c12d
commit: 3a24ea0df83e32355d897a18bbd82e05986dcdc3 [9/109] powerpc/kuap: Use ASM 
feature fixups instead of static branches
config: powerpc64-randconfig-r012-20230822 
(https://download.01.org/0day-ci/archive/20230822/202308221352.nocgtdjp-...@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 
4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce: 
(https://download.01.org/0day-ci/archive/20230822/202308221352.nocgtdjp-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202308221352.nocgtdjp-...@intel.com/

All errors (new ones prefixed by >>):

   In file included from arch/powerpc/mm/nohash/kup.c:11:
   In file included from arch/powerpc/include/asm/kup.h:24:
   In file included from arch/powerpc/include/asm/nohash/kup-booke.h:6:
   In file included from arch/powerpc/include/asm/mmu.h:148:
   In file included from arch/powerpc/include/asm/percpu.h:20:
>> arch/powerpc/include/asm/paca.h:155:23: error: field has incomplete type 'struct tlb_core_data'
     155 |         struct tlb_core_data tcd;
         |                              ^
   arch/powerpc/include/asm/paca.h:139:9: note: forward declaration of 'struct tlb_core_data'
     139 |         struct tlb_core_data *tcd_ptr;
         |                ^
   arch/powerpc/include/asm/paca.h:291:36: warning: declaration of 'struct mm_struct' will not be visible outside of this function [-Wvisibility]
     291 | extern void copy_mm_to_paca(struct mm_struct *mm);
         |                             ^
   1 warning and 1 error generated.
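
The pairing of the error at line 155 with the note at line 139 is standard C
semantics: a pointer member may refer to a forward-declared (incomplete)
struct, but embedding the struct itself as a field requires the full
definition to be visible at that point (normally pulled in before paca.h,
which the include chain above no longer provides under this config). A
minimal standalone repro, not kernel code, which intentionally fails to
compile:

  struct tlb_core_data;                   /* forward declaration only */

  struct paca_like {                      /* hypothetical stand-in for paca_struct */
          struct tlb_core_data *tcd_ptr;  /* OK: pointer size is known */
          struct tlb_core_data tcd;       /* error: field has incomplete type */
  };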


vim +155 arch/powerpc/include/asm/paca.h

91c60b5b8209627 Benjamin Herrenschmidt 2009-06-02  131  
e0d68273d706953 Christophe Leroy       2022-09-19  132  #ifdef CONFIG_PPC_BOOK3E_64
016f8cf0d87bb2b Kevin Hao              2015-03-10  133  	u64 exgen[8] __aligned(0x40);
f67f4ef5fcdfdee Scott Wood             2011-06-22  134  	/* Keep pgd in the same cacheline as the start of extlb */
016f8cf0d87bb2b Kevin Hao              2015-03-10  135  	pgd_t *pgd __aligned(0x40); /* Current PGD */
f67f4ef5fcdfdee Scott Wood             2011-06-22  136  	pgd_t *kernel_pgd;	/* Kernel PGD */
28efc35fe68dacb Scott Wood             2013-10-11  137  
28efc35fe68dacb Scott Wood             2013-10-11  138  	/* Shared by all threads of a core -- points to tcd of first thread */
28efc35fe68dacb Scott Wood             2013-10-11  139  	struct tlb_core_data *tcd_ptr;
28efc35fe68dacb Scott Wood             2013-10-11  140  
609af38f8fc0f1d Scott Wood             2014-03-10  141  	/*
609af38f8fc0f1d Scott Wood             2014-03-10  142  	 * We can have up to 3 levels of reentrancy in the TLB miss handler,
609af38f8fc0f1d Scott Wood             2014-03-10  143  	 * in each of four exception levels (normal, crit, mcheck, debug).
609af38f8fc0f1d Scott Wood             2014-03-10  144  	 */
609af38f8fc0f1d Scott Wood             2014-03-10  145  	u64 extlb[12][EX_TLB_SIZE / sizeof(u64)];
dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23  146  	u64 exmc[8];	/* used for machine checks */
dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23  147  	u64 excrit[8];	/* used for crit interrupts */
dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23  148  	u64 exdbg[8];	/* used for debug interrupts */
dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23  149  
dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23  150  	/* Kernel stack pointers for use by special exceptions */
dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23  151  	void *mc_kstack;
dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23  152  	void *crit_kstack;
dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23  153  	void *dbg_kstack;
28efc35fe68dacb Scott Wood             2013-10-11  154  
28efc35fe68dacb Scott Wood             2013-10-11 @155  	struct tlb_core_data tcd;
e0d68273d706953 Christophe Leroy       2022-09-19  156  #endif /* CONFIG_PPC_BOOK3E_64 */
dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23  157  

:: The code at line 155 was first introduced by commit
:: 28efc35fe68dacbddc4b12c2fa8f2df1593a4ad3 powerpc/e6500: TLB miss handler 
with hardware tablewalk support

:: TO: Scott Wood 
:: CC: Scott Wood 

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


Re: [PATCH 1/9] ARM: Remove <asm/ide.h>

2023-08-21 Thread Damien Le Moal
On 8/18/23 01:07, Geert Uytterhoeven wrote:
> As of commit b7fb14d3ac63117e ("ide: remove the legacy ide driver") in
> v5.14, there are no more generic users of <asm/ide.h>.
> 
> Signed-off-by: Geert Uytterhoeven 

Looks good to me. All patches are reviewed or acked, except this one.
Can I get an ack from ARM folks?

> ---
>  arch/arm/include/asm/ide.h | 24 
>  1 file changed, 24 deletions(-)
>  delete mode 100644 arch/arm/include/asm/ide.h
> 
> diff --git a/arch/arm/include/asm/ide.h b/arch/arm/include/asm/ide.h
> deleted file mode 100644
> index a81e0b0d6747aa2f..
> --- a/arch/arm/include/asm/ide.h
> +++ /dev/null
> @@ -1,24 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0 */
> -/*
> - *  arch/arm/include/asm/ide.h
> - *
> - *  Copyright (C) 1994-1996  Linus Torvalds & authors
> - */
> -
> -/*
> - *  This file contains the ARM architecture specific IDE code.
> - */
> -
> -#ifndef __ASMARM_IDE_H
> -#define __ASMARM_IDE_H
> -
> -#ifdef __KERNEL__
> -
> -#define __ide_mm_insw(port,addr,len) readsw(port,addr,len)
> -#define __ide_mm_insl(port,addr,len) readsl(port,addr,len)
> -#define __ide_mm_outsw(port,addr,len)	writesw(port,addr,len)
> -#define __ide_mm_outsl(port,addr,len)	writesl(port,addr,len)
> -
> -#endif /* __KERNEL__ */
> -
> -#endif /* __ASMARM_IDE_H */

-- 
Damien Le Moal
Western Digital Research



Re: [PATCH] powerpc/perf: Convert fsl_emb notifier to state machine callbacks

2023-08-21 Thread Christophe Leroy


On 18/08/2023 at 10:59, Christophe Leroy wrote:
>   CC      arch/powerpc/perf/core-fsl-emb.o
> arch/powerpc/perf/core-fsl-emb.c:675:6: error: no previous prototype for 'hw_perf_event_setup' [-Werror=missing-prototypes]
>   675 | void hw_perf_event_setup(int cpu)
>       |      ^~~
> 
> Looks like fsl_emb was completely missed by commit 3f6da3905398 ("perf:
> Rework and fix the arch CPU-hotplug hooks")
> 
> So, apply same changes as commit 3f6da3905398 ("perf: Rework and fix
> the arch CPU-hotplug hooks") then commit 57ecde42cc74 ("powerpc/perf:
> Convert book3s notifier to state machine callbacks")
> 
> While at it, also fix following error:
> 
> arch/powerpc/perf/core-fsl-emb.c: In function 'perf_event_interrupt':
> arch/powerpc/perf/core-fsl-emb.c:648:13: error: variable 'found' set but not used [-Werror=unused-but-set-variable]
>   648 |         int found = 0;
>       |             ^
> 
> Fixes: 3f6da3905398 ("perf: Rework and fix the arch CPU-hotplug hooks")
> Cc: Peter Zijlstra 
> Cc: Thomas Gleixner 
> Signed-off-by: Christophe Leroy 
> Cc: Arnd Bergmann 

Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202308220658.or5hhfcd-...@intel.com/

> ---
>   arch/powerpc/perf/core-fsl-emb.c | 8 +---
>   1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/perf/core-fsl-emb.c 
> b/arch/powerpc/perf/core-fsl-emb.c
> index ee721f420a7b..1a53ab08447c 100644
> --- a/arch/powerpc/perf/core-fsl-emb.c
> +++ b/arch/powerpc/perf/core-fsl-emb.c
> @@ -645,7 +645,6 @@ static void perf_event_interrupt(struct pt_regs *regs)
>   struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
>   struct perf_event *event;
>   unsigned long val;
> - int found = 0;
>   
>   for (i = 0; i < ppmu->n_counter; ++i) {
>   event = cpuhw->event[i];
> @@ -654,7 +653,6 @@ static void perf_event_interrupt(struct pt_regs *regs)
>   if ((int)val < 0) {
>   if (event) {
>   /* event has overflowed */
> - found = 1;
>   record_and_restart(event, val, regs);
>   } else {
>   /*
> @@ -672,11 +670,13 @@ static void perf_event_interrupt(struct pt_regs *regs)
>   isync();
>   }
>   
> -void hw_perf_event_setup(int cpu)
> +static int fsl_emb_pmu_prepare_cpu(unsigned int cpu)
>   {
>   struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
>   
>   memset(cpuhw, 0, sizeof(*cpuhw));
> +
> + return 0;
>   }
>   
>   int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu)
> @@ -689,6 +689,8 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu)
>   pmu->name);
>   
>   perf_pmu_register(&fsl_emb_pmu, "cpu", PERF_TYPE_RAW);
> + cpuhp_setup_state(CPUHP_PERF_POWER, "perf/powerpc:prepare",
> +   fsl_emb_pmu_prepare_cpu, NULL);
>   
>   return 0;
>   }
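
For reference, the conversion the diff above performs follows the standard
cpuhp pattern; a minimal sketch with hypothetical names (my_prepare_cpu,
my_init), mirroring the calls used in the patch:

  #include <linux/cpuhotplug.h>

  /* Runs for each CPU before it comes online (and once per already-online
   * CPU at registration time), replacing the old hw_perf_event_setup() hook. */
  static int my_prepare_cpu(unsigned int cpu)
  {
          /* per-CPU initialisation goes here */
          return 0;
  }

  static int my_init(void)
  {
          /* NULL teardown callback: nothing to undo when a CPU goes down */
          return cpuhp_setup_state(CPUHP_PERF_POWER, "perf/powerpc:prepare",
                                   my_prepare_cpu, NULL);
  }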


Re: linux-next: build failure after merge of the mm tree

2023-08-21 Thread Darrick J. Wong
On Tue, Aug 22, 2023 at 02:34:06AM +0100, Matthew Wilcox wrote:
> On Tue, Aug 22, 2023 at 11:22:17AM +1000, Stephen Rothwell wrote:
> > Hi Matthew,
> > 
> > On Tue, 22 Aug 2023 02:11:44 +0100 Matthew Wilcox  
> > wrote:
> > >
> > > On Tue, Aug 22, 2023 at 09:55:37AM +1000, Stephen Rothwell wrote:
> > > > In file included from include/trace/trace_events.h:27,
> > > >  from include/trace/define_trace.h:102,
> > > >  from fs/xfs/xfs_trace.h:4428,
> > > >  from fs/xfs/xfs_trace.c:45:
> > > > include/linux/pgtable.h:8:25: error: initializer element is not constant
> > > > 8 | #define PMD_ORDER   (PMD_SHIFT - PAGE_SHIFT)  
> > > 
> > > Ummm.  PowerPC doesn't have a compile-time constant PMD size?
> > 
> > Yeah, you are not the first (or probably the last) to be caught by that.
> 
> I think this will do the trick.  Any comments?
> 
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 1904eaf7a2e9..d5a4e6c2dcd1 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -796,15 +796,6 @@ DEFINE_INODE_EVENT(xfs_inode_reclaiming);
>  DEFINE_INODE_EVENT(xfs_inode_set_need_inactive);
>  DEFINE_INODE_EVENT(xfs_inode_inactivating);
>  
> -/*
> - * ftrace's __print_symbolic requires that all enum values be wrapped in the
> - * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the 
> ftrace
> - * ring buffer.  Somehow this was only worth mentioning in the ftrace sample
> - * code.
> - */

Please leave this ^^^ comment, because the need for TRACE_DEFINE_ENUM to
make enums work in tracepoints is not at all obvious.

> -TRACE_DEFINE_ENUM(PMD_ORDER);
> -TRACE_DEFINE_ENUM(PUD_ORDER);
> -
>  TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
>  TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
>  
> @@ -823,13 +814,10 @@ TRACE_EVENT(xfs_filemap_fault,
>   __entry->order = order;
>   __entry->write_fault = write_fault;
>   ),
> - TP_printk("dev %d:%d ino 0x%llx %s write_fault %d",
> + TP_printk("dev %d:%d ino 0x%llx order:%u write_fault %d",

"order %u" to match the (non dev_t) style of the rest of the xfs
tracepoints.

--D

> MAJOR(__entry->dev), MINOR(__entry->dev),
> __entry->ino,
> -   __print_symbolic(__entry->order,
> - { 0,"PTE" },
> - { PMD_ORDER,"PMD" },
> - { PUD_ORDER,"PUD" }),
> +   __entry->order,
> __entry->write_fault)
>  )
>  
> 
> 


Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-21 Thread Hugh Dickins
On Mon, 21 Aug 2023, Jann Horn wrote:
> On Mon, Aug 21, 2023 at 9:51 PM Hugh Dickins  wrote:
> > Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private
> > shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp()
> > thought it had emptied: page lock on the huge page is enough to protect
> > against WP faults (which find the PTE has been cleared), but not enough
> > to protect against userfaultfd.  "BUG: Bad rss-counter state" followed.
> >
> > retract_page_tables() protects against this by checking !vma->anon_vma;
> > but we know that MADV_COLLAPSE needs to be able to work on private shmem
> > mappings, even those with an anon_vma prepared for another part of the
> > mapping; and we know that MADV_COLLAPSE needs to work on shared shmem
> > mappings which are userfaultfd_armed().  Whether it needs to work on
> > private shmem mappings which are userfaultfd_armed(), I'm not so sure:
> > but assume that it does.
> 
> I think we couldn't rely on anon_vma here anyway, since holding the
> mmap_lock in read mode doesn't prevent concurrent creation of an
> anon_vma?

We would have had to do the same as in retract_page_tables() (which
doesn't even have mmap_lock for read): recheck !vma->anon_vma after
finally acquiring ptlock.  But the !anon_vma limitation is certainly
not acceptable here anyway.

> 
> > Just for this case, take the pmd_lock() two steps earlier: not because
> > it gives any protection against this case itself, but because ptlock
> > nests inside it, and it's the dropping of ptlock which let the bug in.
> > In other cases, continue to minimize the pmd_lock() hold time.
> 
> Special-casing userfaultfd like this makes me a bit uncomfortable; but
> I also can't find anything other than userfaultfd that would insert
> pages into regions that are khugepaged-compatible, so I guess this
> works?

I'm as sure as I can be that it's solely because userfaultfd breaks
the usual rules here (and in fairness, IIRC Andrea did ask my permission
before making it behave that way on shmem, COWing without a source page).

Perhaps something else will want that same behaviour in future (it's
tempting, but difficult to guarantee correctness); for now, it is just
userfaultfd (but by saying "_armed" rather than "_missing", I'm half-
expecting uffd to add more such exceptional modes in future).

> 
> I guess an alternative would be to use a spin_trylock() instead of the
> current pmd_lock(), and if that fails, temporarily drop the page table
> lock and then restart from step 2 with both locks held - and at that
> point the page table scan should be fast since we expect it to usually
> be empty.

That's certainly a good idea, if collapse on userfaultfd_armed private
is anything of a common case (I doubt, but I don't know).  It may be a
better idea anyway (saving a drop and retake of ptlock).

I gave it a try, expecting to end up with something that would lead
me to say "I tried it, but it didn't work out well"; but actually it
looks okay to me.  I wouldn't say I prefer it, but it seems reasonable,
and no more complicated (as Peter rightly observes) than the original.

It's up to you and Peter, and whoever has strong feelings about it,
to choose between them: I don't mind (but I shall be sad if someone
demands that I indent that comment deeper - I'm not a fan of long
multi-line comments near column 80).
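
For readers following along, the trylock fallback described above looks
roughly like this; a sketch using the lock roles from
collapse_pte_mapped_thp() (pml for the pmd lock, ptl for the page table
lock, with ptl nesting inside pml), not the actual v2 diff:

  #include <linux/mm.h>        /* pmd_lockptr() */
  #include <linux/spinlock.h>

  /* Returns true if ptl was never dropped, so the earlier page-table scan
   * is still valid; returns false if the caller must go back and re-empty
   * the page table, since uffd may have installed PTEs while ptl was
   * dropped. */
  static bool lock_pmd_keeping_ptl(struct mm_struct *mm, pmd_t *pmd,
                                   spinlock_t *ptl)
  {
          spinlock_t *pml = pmd_lockptr(mm, pmd);

          if (spin_trylock(pml))
                  return true;

          spin_unlock(ptl);                  /* respect lock ordering */
          spin_lock(pml);
          spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
          return false;
  }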


[PATCH mm-unstable v2] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private
shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp()
thought it had emptied: page lock on the huge page is enough to protect
against WP faults (which find the PTE has been cleared), but not enough
to protect against userfaultfd.  "BUG: Bad rss-counter state" followed.

retract_page_tables() protects against this by checking !vma->anon_vma;
but we know that MADV_COLLAPSE needs to be able to work on private shmem
mappings, even those with an anon_vma prepared for another part of the
mapping; and we know that MADV_COLLAPSE needs to work on shared shmem
mappings which are userfaultfd_armed().  Whether it needs to work on
private shmem mappings which are userfaultfd_armed(), I'm not so sure:
but assume that it does.

Now trylock pmd lock without dropping ptlock (suggested by jannh): if
that fails, drop and retake ptlock around taking pmd lock, and just in
the uffd private case, go back to recheck and empty the page table.

Reported-by: Jann Horn 
Closes: 
https://lore.kernel.org/linux-mm/CAG48ez0FxiRC4d3VTu_a9h=rg5fw-kyd5rg5xo_rdbm0ltt...@mail.gmail.com/
Fixes: 1043173eb5eb ("mm/khugepaged: collapse_pte_mapped_thp() with 
mmap_read_lock()")
Signed-off-by: Hugh Dickins 
---
 mm/khugepaged.c | 39 +--
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 40d43eccdee8..ad1c571772fe 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1476,7 

[powerpc:next-test 9/109] arch/powerpc/include/asm/paca.h:291:36: warning: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration

2023-08-21 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
next-test
head:   0f71dcfb4aef6043da6cc509e7a7f6a3ae87c12d
commit: 3a24ea0df83e32355d897a18bbd82e05986dcdc3 [9/109] powerpc/kuap: Use ASM 
feature fixups instead of static branches
config: powerpc64-randconfig-r031-20230822 
(https://download.01.org/0day-ci/archive/20230822/202308221055.lw3uzjil-...@intel.com/config)
compiler: powerpc64-linux-gcc (GCC) 12.3.0
reproduce: 
(https://download.01.org/0day-ci/archive/20230822/202308221055.lw3uzjil-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202308221055.lw3uzjil-...@intel.com/

All warnings (new ones prefixed by >>):

   In file included from arch/powerpc/include/asm/percpu.h:20,
from arch/powerpc/include/asm/mmu.h:148,
from arch/powerpc/include/asm/nohash/kup-booke.h:6,
from arch/powerpc/include/asm/kup.h:24,
from arch/powerpc/mm/nohash/kup.c:11:
   arch/powerpc/include/asm/paca.h:155:30: error: field 'tcd' has incomplete type
     155 |         struct tlb_core_data tcd;
         |                              ^~~
>> arch/powerpc/include/asm/paca.h:291:36: warning: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration
     291 | extern void copy_mm_to_paca(struct mm_struct *mm);
         |                             ^


vim +291 arch/powerpc/include/asm/paca.h

91c60b5b820962 arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-06-02  131  
e0d68273d70695 arch/powerpc/include/asm/paca.h Christophe Leroy       2022-09-19  132  #ifdef CONFIG_PPC_BOOK3E_64
016f8cf0d87bb2 arch/powerpc/include/asm/paca.h Kevin Hao              2015-03-10  133  	u64 exgen[8] __aligned(0x40);
f67f4ef5fcdfde arch/powerpc/include/asm/paca.h Scott Wood             2011-06-22  134  	/* Keep pgd in the same cacheline as the start of extlb */
016f8cf0d87bb2 arch/powerpc/include/asm/paca.h Kevin Hao              2015-03-10  135  	pgd_t *pgd __aligned(0x40); /* Current PGD */
f67f4ef5fcdfde arch/powerpc/include/asm/paca.h Scott Wood             2011-06-22  136  	pgd_t *kernel_pgd;	/* Kernel PGD */
28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood             2013-10-11  137  
28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood             2013-10-11  138  	/* Shared by all threads of a core -- points to tcd of first thread */
28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood             2013-10-11  139  	struct tlb_core_data *tcd_ptr;
28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood             2013-10-11  140  
609af38f8fc0f1 arch/powerpc/include/asm/paca.h Scott Wood             2014-03-10  141  	/*
609af38f8fc0f1 arch/powerpc/include/asm/paca.h Scott Wood             2014-03-10  142  	 * We can have up to 3 levels of reentrancy in the TLB miss handler,
609af38f8fc0f1 arch/powerpc/include/asm/paca.h Scott Wood             2014-03-10  143  	 * in each of four exception levels (normal, crit, mcheck, debug).
609af38f8fc0f1 arch/powerpc/include/asm/paca.h Scott Wood             2014-03-10  144  	 */
609af38f8fc0f1 arch/powerpc/include/asm/paca.h Scott Wood             2014-03-10  145  	u64 extlb[12][EX_TLB_SIZE / sizeof(u64)];
dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23  146  	u64 exmc[8];	/* used for machine checks */
dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23  147  	u64 excrit[8];	/* used for crit interrupts */
dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23  148  	u64 exdbg[8];	/* used for debug interrupts */
dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23  149  
dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23  150  	/* Kernel stack pointers for use by special exceptions */
dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23  151  	void *mc_kstack;
dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23  152  	void *crit_kstack;
dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23  153  	void *dbg_kstack;
28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood             2013-10-11  154  
28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood             2013-10-11 @155  	struct tlb_core_data tcd;
e0d68273d70695 arch/powerpc/include/asm/paca.h Christophe Leroy       2022-09-19  156  #endif /* CONFIG_PPC_BOOK3E_64 */
dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23  157  
387e220a2e5e63 arch/powerpc/include/asm/paca.h Nicholas Piggin        2021-12-02  158  #ifdef 

Re: linux-next: build failure after merge of the mm tree

2023-08-21 Thread Matthew Wilcox
On Tue, Aug 22, 2023 at 11:22:17AM +1000, Stephen Rothwell wrote:
> Hi Matthew,
> 
> On Tue, 22 Aug 2023 02:11:44 +0100 Matthew Wilcox  wrote:
> >
> > On Tue, Aug 22, 2023 at 09:55:37AM +1000, Stephen Rothwell wrote:
> > > In file included from include/trace/trace_events.h:27,
> > >  from include/trace/define_trace.h:102,
> > >  from fs/xfs/xfs_trace.h:4428,
> > >  from fs/xfs/xfs_trace.c:45:
> > > include/linux/pgtable.h:8:25: error: initializer element is not constant
> > > 8 | #define PMD_ORDER   (PMD_SHIFT - PAGE_SHIFT)  
> > 
> > Ummm.  PowerPC doesn't have a compile-time constant PMD size?
> 
> Yeah, you are not the first (or probably the last) to be caught by that.

I think this will do the trick.  Any comments?

diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 1904eaf7a2e9..d5a4e6c2dcd1 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -796,15 +796,6 @@ DEFINE_INODE_EVENT(xfs_inode_reclaiming);
 DEFINE_INODE_EVENT(xfs_inode_set_need_inactive);
 DEFINE_INODE_EVENT(xfs_inode_inactivating);
 
-/*
- * ftrace's __print_symbolic requires that all enum values be wrapped in the
- * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
- * ring buffer.  Somehow this was only worth mentioning in the ftrace sample
- * code.
- */
-TRACE_DEFINE_ENUM(PMD_ORDER);
-TRACE_DEFINE_ENUM(PUD_ORDER);
-
 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
 
@@ -823,13 +814,10 @@ TRACE_EVENT(xfs_filemap_fault,
__entry->order = order;
__entry->write_fault = write_fault;
),
-   TP_printk("dev %d:%d ino 0x%llx %s write_fault %d",
+   TP_printk("dev %d:%d ino 0x%llx order:%u write_fault %d",
  MAJOR(__entry->dev), MINOR(__entry->dev),
  __entry->ino,
- __print_symbolic(__entry->order,
-   { 0,"PTE" },
-   { PMD_ORDER,"PMD" },
-   { PUD_ORDER,"PUD" }),
+ __entry->order,
  __entry->write_fault)
 )
 




Re: linux-next: build failure after merge of the mm tree

2023-08-21 Thread Stephen Rothwell
Hi Matthew,

On Tue, 22 Aug 2023 02:11:44 +0100 Matthew Wilcox  wrote:
>
> On Tue, Aug 22, 2023 at 09:55:37AM +1000, Stephen Rothwell wrote:
> > In file included from include/trace/trace_events.h:27,
> >  from include/trace/define_trace.h:102,
> >  from fs/xfs/xfs_trace.h:4428,
> >  from fs/xfs/xfs_trace.c:45:
> > include/linux/pgtable.h:8:25: error: initializer element is not constant
> > 8 | #define PMD_ORDER   (PMD_SHIFT - PAGE_SHIFT)  
> 
> Ummm.  PowerPC doesn't have a compile-time constant PMD size?

Yeah, you are not the first (or probably the last) to be caught by that.

-- 
Cheers,
Stephen Rothwell




Re: linux-next: build failure after merge of the mm tree

2023-08-21 Thread Matthew Wilcox
On Tue, Aug 22, 2023 at 09:55:37AM +1000, Stephen Rothwell wrote:
> In file included from include/trace/trace_events.h:27,
>  from include/trace/define_trace.h:102,
>  from fs/xfs/xfs_trace.h:4428,
>  from fs/xfs/xfs_trace.c:45:
> include/linux/pgtable.h:8:25: error: initializer element is not constant
> 8 | #define PMD_ORDER   (PMD_SHIFT - PAGE_SHIFT)

Ummm.  PowerPC doesn't have a compile-time constant PMD size?

arch/powerpc/include/asm/book3s/64/pgtable.h:#define PMD_SHIFT  (PAGE_SHIFT + 
PTE_INDEX_SIZE)
arch/powerpc/include/asm/book3s/64/pgtable.h:#define PTE_INDEX_SIZE  
__pte_index_size

That's really annoying.  I'll try to work around it.
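
The root cause, from the two lines quoted above: on book3s64 the MMU
geometry is chosen at boot, so PMD_SHIFT expands to a runtime variable and
PMD_ORDER is not a constant expression. A minimal sketch of the failure
mode (simplified from the real headers; intentionally does not compile):

  extern unsigned long __pte_index_size;   /* set at boot on book3s64 */

  #define PAGE_SHIFT 12
  #define PMD_SHIFT  (PAGE_SHIFT + __pte_index_size)
  #define PMD_ORDER  (PMD_SHIFT - PAGE_SHIFT)

  /* TRACE_DEFINE_ENUM ultimately needs a file-scope constant, roughly: */
  static const unsigned long pmd_order = PMD_ORDER;
  /* error: initializer element is not constant */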



[powerpc:next-test 9/109] arch/powerpc/include/asm/paca.h:291:36: warning: declaration of 'struct mm_struct' will not be visible outside of this function

2023-08-21 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
next-test
head:   0f71dcfb4aef6043da6cc509e7a7f6a3ae87c12d
commit: 3a24ea0df83e32355d897a18bbd82e05986dcdc3 [9/109] powerpc/kuap: Use ASM 
feature fixups instead of static branches
config: powerpc64-randconfig-r012-20230822 
(https://download.01.org/0day-ci/archive/20230822/202308220857.ufq2oaxm-...@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 
4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce: 
(https://download.01.org/0day-ci/archive/20230822/202308220857.ufq2oaxm-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202308220857.ufq2oaxm-...@intel.com/

All warnings (new ones prefixed by >>):

   In file included from arch/powerpc/mm/nohash/kup.c:11:
   In file included from arch/powerpc/include/asm/kup.h:24:
   In file included from arch/powerpc/include/asm/nohash/kup-booke.h:6:
   In file included from arch/powerpc/include/asm/mmu.h:148:
   In file included from arch/powerpc/include/asm/percpu.h:20:
   arch/powerpc/include/asm/paca.h:155:23: error: field has incomplete type 'struct tlb_core_data'
     155 |         struct tlb_core_data tcd;
         |                              ^
   arch/powerpc/include/asm/paca.h:139:9: note: forward declaration of 'struct tlb_core_data'
     139 |         struct tlb_core_data *tcd_ptr;
         |                ^
>> arch/powerpc/include/asm/paca.h:291:36: warning: declaration of 'struct mm_struct' will not be visible outside of this function [-Wvisibility]
     291 | extern void copy_mm_to_paca(struct mm_struct *mm);
         |                             ^
   1 warning and 1 error generated.
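
The line 291 warning is likewise ordinary C scoping: a struct tag first
mentioned inside a prototype's parameter list is scoped to that prototype
alone, so no caller can ever pass a compatible pointer. A minimal repro
(not kernel code; the second prototype's name is hypothetical):

  /* No declaration of struct mm_struct in scope here: */
  extern void copy_mm_to_paca(struct mm_struct *mm);
  /* warning: declaration of 'struct mm_struct' will not be visible
   * outside of this function */

  /* A forward declaration beforehand avoids it: */
  struct mm_struct;
  extern void copy_mm_to_paca_fixed(struct mm_struct *mm);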


vim +291 arch/powerpc/include/asm/paca.h

^1da177e4c3f41 include/asm-ppc64/paca.h        Linus Torvalds   2005-04-16  290  
54be0b9c7c9888 arch/powerpc/include/asm/paca.h Michael Ellerman 2018-10-02 @291  extern void copy_mm_to_paca(struct mm_struct *mm);
d2e60075a3d442 arch/powerpc/include/asm/paca.h Nicholas Piggin  2018-02-14  292  extern struct paca_struct **paca_ptrs;
1426d5a3bd0758 arch/powerpc/include/asm/paca.h Michael Ellerman 2010-01-28  293  extern void initialise_paca(struct paca_struct *new_paca, int cpu);
fc53b4202e61c7 arch/powerpc/include/asm/paca.h Matt Evans       2010-07-07  294  extern void setup_paca(struct paca_struct *new_paca);
59f577743d71bf arch/powerpc/include/asm/paca.h Nicholas Piggin  2018-02-14  295  extern void allocate_paca_ptrs(void);
59f577743d71bf arch/powerpc/include/asm/paca.h Nicholas Piggin  2018-02-14  296  extern void allocate_paca(int cpu);
1426d5a3bd0758 arch/powerpc/include/asm/paca.h Michael Ellerman 2010-01-28  297  extern void free_unused_pacas(void);
1426d5a3bd0758 arch/powerpc/include/asm/paca.h Michael Ellerman 2010-01-28  298  

:: The code at line 291 was first introduced by commit
:: 54be0b9c7c9888ebe63b89a31a17ee3df6a68d61 Revert "convert SLB miss 
handlers to C" and subsequent commits

:: TO: Michael Ellerman 
:: CC: Michael Ellerman 

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


linux-next: manual merge of the powerpc tree with the mm-stable tree

2023-08-21 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the powerpc tree got a conflict in:

  arch/powerpc/Kconfig

between commit:

  8d05554dca2a ("powerpc: mm: convert to GENERIC_IOREMAP")

from the mm-stable tree and commit:

  0ceef6e99cc3 ("powerpc/idle: Add support for nohlt")

from the powerpc tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/Kconfig
index 21edd664689e,c831e20cf40f..
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@@ -195,7 -194,7 +196,8 @@@ config PPC
select GENERIC_CPU_VULNERABILITIES  if PPC_BARRIER_NOSPEC
select GENERIC_EARLY_IOREMAP
select GENERIC_GETTIMEOFDAY
+   select GENERIC_IDLE_POLL_SETUP
 +  select GENERIC_IOREMAP
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
	select GENERIC_PCI_IOMAP	if PCI




Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-21 Thread Vladimir Oltean
On Mon, Aug 21, 2023 at 07:39:15PM -0400, Sean Anderson wrote:
> Well, I think we should take the opportunity to think about the hardware
> which exists and how we plan to model it. IMO grouping lanes into a
> single phy simplifies both the phy driver and the mac driver.

Ok, but ungrouped for backplane and grouped for !backplane? For the KR
link modes, parallel link training, with separate consumers per lanes in
a group, will be needed per lane.


Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-21 Thread Sean Anderson
On 8/21/23 18:48, Vladimir Oltean wrote:
> On Mon, Aug 21, 2023 at 05:06:46PM -0400, Sean Anderson wrote:
>> On 8/21/23 15:58, Vladimir Oltean wrote:
>> > On Mon, Aug 21, 2023 at 02:46:53PM -0400, Sean Anderson wrote:
>> >> After further review, it seems the reason 28g can get away without this
>> >> is because there's a one-to-one mapping between protocol controllers and
>> >> lanes. Unfortunately, that regularity is not present for 10g.
>> >> 
>> >> --Sean
>> > 
>> > There are some things I saw in your phy-fsl-lynx-10g.c driver and device
>> > tree bindings that I don't understand (the concept of lane groups)
>> 
>> Each lane group corresponds to a phy device (struct phy). For protocols
>> like PCIe or XAUI which can use multiple lanes, this lets the driver
>> coordinate configuring all lanes at once in the correct order.
> 
> For PCIe I admit that I don't know. I don't even know if there's any
> valid (current or future) use case for having the PCIe controller have a
> phandle to the SerDes. Everything else below in my reply assumes Ethernet.

One use case could be selecting PCIe/SATA as appropriate for an M.2
card. Another use case could be allocating lanes to different slots
based on card presence (e.g. 1 card use x4, 2+ cards use x1). I recently
bought a motherboard whose manual says

| PCI_E4 [normally x4] will run x1 speed when installing devices in PCI_E2/
| PCI_E3/ PCI_E5 slot [all x1].

which implies exactly this sort of design.

>> > and
>> > I'm not sure if they're related with what you're saying here, so if you
>> > could elaborate a bit more (ideally with an example) on the one-to-one
>> > mapping and the specific problems it causes, it would be great.
>> 
>> For e.g. the LS2088A, SerDes1 lanes H-A use SG1-8 and XFI1-8. SerDes2
>> lanes A-H use SG9-16 and XFI9-16. Each lane has its own controller, and
>> the mapping is 1-to-1. In the PCCRs, each controller uses the same
>> value, and is mapped in a regular way. So you can go directly from the
>> lane number to the right value to mask in the PCCR, with a very simple
>> translation scheme.
>> 
>> For e.g. the LS1046A, SerDes1 lane D uses XFI.9 (aka XFIA) and lane C
>> uses XFI.10 (aka XFIB). This is the opposite of how SerDes1 lanes A-D
>> use SGMII.9, .10, .5, and .6 (aka SGMIIA-D).
>> 
>> For e.g. the T4240, SerDes1 lanes A-H use sg1.5, .6, .10, .9, .1, .2,
>> .3, .4 (aka SGMII E, F, H, G, A, B, C, D).
>> 
>> For e.g. the B4860, SerDes lanes A uses sgmii1 or sgmii5 and B uses
>> sgmii2 or sgmii6. The MAC selected is determined based on the value
>> programmed into PCCR2.
> 
> It's so exceedingly unlikely that B4860 will gain support for anything
> new, that it's not even worth talking about it, or even considering it
> in the design of a new driver. Just forget about it.
> 
> Let's concentrate on the Layerscapes, and on the T series to the extent
> that we're not going out of our way to support them with a fairly simple
> design.

Well, these are just easy ways to show the oddities in the PCCRs. I
could have made the same points with the various Layerscapes.

> In the Lynx 10G block guide that I have, PCCR2 is a register that does
> something completely different from Ethernet. I'm not sure if B4860 has
> a Lynx 10G block and not something else.

T-series SoCs use a different PCCR layout than Layerscape, despite using
the same registers in the rest of the SerDes. The LS1021 also uses this
scheme, so it's not just T-series (but it's close enough). This is why I
had an option for different PCCR configuration functions in earlier
versions of this series, so if someone wanted they could easily add
T-series support.

>> While I appreciate that your hardware engineers did a better job for
>> 28g, many 10g serdes arbitrarily map lanes to protocol controllers.
>> I think the mapping is too irregular to tame, and it is better to say
>> "if you want this configuration, program this value".
> 
> Ok, but that's a lateral argument (or I'm not understanding the connection).

To restate, there's no systemic organization to the PCCRs. The driver
configuration should reflect this and allow arbitrary mappings.

> Some maintainers (Mark Brown for sure, from my personal experience) prefer
> that expert-level knowledge of hardware should be hardcoded into the kernel
> driver based on known stuff such as the SoC-specific compatible string.
> I certainly share the same view.
> 
> With your case, I think that argument is even more pertinent, because
> IIUC, the lane group protocols won't be put in the SoC .dtsi (so as to
> be written only once), but in the board device trees (since the
> available protocols invariably depend upon the board provisioning).
> So, non-expert board device tree writers will have to know what's with
> the PCCR stuff. Pretty brain-intensive.

The idea is to stick all the PCCR stuff in the SoC devicetree, and the
board-level devicetrees just set up the clocks and pick which MACs use
which phys. Have a look at patches 10 and 15 of this 

[powerpc:next-test 9/109] arch/powerpc/include/asm/paca.h:155:30: error: field 'tcd' has incomplete type

2023-08-21 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
next-test
head:   0f71dcfb4aef6043da6cc509e7a7f6a3ae87c12d
commit: 3a24ea0df83e32355d897a18bbd82e05986dcdc3 [9/109] powerpc/kuap: Use ASM 
feature fixups instead of static branches
config: powerpc64-randconfig-r036-20230822 
(https://download.01.org/0day-ci/archive/20230822/202308220708.nrf5auae-...@intel.com/config)
compiler: powerpc64-linux-gcc (GCC) 12.3.0
reproduce: 
(https://download.01.org/0day-ci/archive/20230822/202308220708.nrf5auae-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202308220708.nrf5auae-...@intel.com/

All errors (new ones prefixed by >>):

   In file included from arch/powerpc/include/asm/percpu.h:20,
from arch/powerpc/include/asm/mmu.h:148,
from arch/powerpc/include/asm/nohash/kup-booke.h:6,
from arch/powerpc/include/asm/kup.h:24,
from arch/powerpc/mm/nohash/kup.c:11:
>> arch/powerpc/include/asm/paca.h:155:30: error: field 'tcd' has incomplete type
     155 |         struct tlb_core_data tcd;
         |                              ^~~
   arch/powerpc/include/asm/paca.h:291:36: error: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration [-Werror]
     291 | extern void copy_mm_to_paca(struct mm_struct *mm);
         |                             ^
   cc1: all warnings being treated as errors


vim +/tcd +155 arch/powerpc/include/asm/paca.h

91c60b5b820962 Benjamin Herrenschmidt 2009-06-02  131  
e0d68273d70695 Christophe Leroy       2022-09-19  132  #ifdef CONFIG_PPC_BOOK3E_64
016f8cf0d87bb2 Kevin Hao              2015-03-10  133  	u64 exgen[8] __aligned(0x40);
f67f4ef5fcdfde Scott Wood             2011-06-22  134  	/* Keep pgd in the same cacheline as the start of extlb */
016f8cf0d87bb2 Kevin Hao              2015-03-10  135  	pgd_t *pgd __aligned(0x40); /* Current PGD */
f67f4ef5fcdfde Scott Wood             2011-06-22  136  	pgd_t *kernel_pgd;	/* Kernel PGD */
28efc35fe68dac Scott Wood             2013-10-11  137  
28efc35fe68dac Scott Wood             2013-10-11  138  	/* Shared by all threads of a core -- points to tcd of first thread */
28efc35fe68dac Scott Wood             2013-10-11  139  	struct tlb_core_data *tcd_ptr;
28efc35fe68dac Scott Wood             2013-10-11  140  
609af38f8fc0f1 Scott Wood             2014-03-10  141  	/*
609af38f8fc0f1 Scott Wood             2014-03-10  142  	 * We can have up to 3 levels of reentrancy in the TLB miss handler,
609af38f8fc0f1 Scott Wood             2014-03-10  143  	 * in each of four exception levels (normal, crit, mcheck, debug).
609af38f8fc0f1 Scott Wood             2014-03-10  144  	 */
609af38f8fc0f1 Scott Wood             2014-03-10  145  	u64 extlb[12][EX_TLB_SIZE / sizeof(u64)];
dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23  146  	u64 exmc[8];	/* used for machine checks */
dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23  147  	u64 excrit[8];	/* used for crit interrupts */
dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23  148  	u64 exdbg[8];	/* used for debug interrupts */
dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23  149  
dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23  150  	/* Kernel stack pointers for use by special exceptions */
dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23  151  	void *mc_kstack;
dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23  152  	void *crit_kstack;
dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23  153  	void *dbg_kstack;
28efc35fe68dac Scott Wood             2013-10-11  154  
28efc35fe68dac Scott Wood             2013-10-11 @155  	struct tlb_core_data tcd;
e0d68273d70695 Christophe Leroy       2022-09-19  156  #endif /* CONFIG_PPC_BOOK3E_64 */
dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23  157  

:: The code at line 155 was first introduced by commit
:: 28efc35fe68dacbddc4b12c2fa8f2df1593a4ad3 powerpc/e6500: TLB miss handler 
with hardware tablewalk support

:: TO: Scott Wood 
:: CC: Scott Wood 

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-21 Thread Vladimir Oltean
On Mon, Aug 21, 2023 at 05:06:46PM -0400, Sean Anderson wrote:
> On 8/21/23 15:58, Vladimir Oltean wrote:
> > On Mon, Aug 21, 2023 at 02:46:53PM -0400, Sean Anderson wrote:
> >> After further review, it seems the reason 28g can get away without this
> >> is because there's a one-to-one mapping between protocol controllers and
> >> lanes. Unfortunately, that regularity is not present for 10g.
> >> 
> >> --Sean
> > 
> > There are some things I saw in your phy-fsl-lynx-10g.c driver and device
> > tree bindings that I don't understand (the concept of lane groups)
> 
> Each lane group corresponds to a phy device (struct phy). For protocols
> like PCIe or XAUI which can use multiple lanes, this lets the driver
> coordinate configuring all lanes at once in the correct order.

For PCIe I admit that I don't know. I don't even know if there's any
valid (current or future) use case for having the PCIe controller have a
phandle to the SerDes. Everything else below in my reply assumes Ethernet.

> > and
> > I'm not sure if they're related with what you're saying here, so if you
> > could elaborate a bit more (ideally with an example) on the one-to-one
> > mapping and the specific problems it causes, it would be great.
> 
> For e.g. the LS2088A, SerDes1 lanes H-A use SG1-8 and XFI1-8. SerDes2
> lanes A-H use SG9-16 and XFI9-16. Each lane has its own controller, and
> the mapping is 1-to-1. In the PCCRs, each controller uses the same
> value, and is mapped in a regular way. So you can go directly from the
> lane number to the right value to mask in the PCCR, with a very simple
> translation scheme.
> 
> For e.g. the LS1046A, SerDes1 lane D uses XFI.9 (aka XFIA) and lane C
> uses XFI.10 (aka XFIB). This is the opposite of how SerDes1 lanes A-D
> use SGMII.9, .10, .5, and .6 (aka SGMIIA-D).
> 
> For e.g. the T4240, SerDes1 lanes A-H use sg1.5, .6, .10, .9, .1, .2,
> .3, .4 (aka SGMII E, F, H, G, A, B, C, D).
> 
> For e.g. the B4860, SerDes lanes A uses sgmii1 or sgmii5 and B uses
> sgmii2 or sgmii6. The MAC selected is determined based on the value
> programmed into PCCR2.

It's so exceedingly unlikely that B4860 will gain support for anything
new, that it's not even worth talking about it, or even considering it
in the design of a new driver. Just forget about it.

Let's concentrate on the Layerscapes, and on the T series to the extent
that we're not going out of our way to support them with a fairly simple
design.

In the Lynx 10G block guide that I have, PCCR2 is a register that does
something completely different from Ethernet. I'm not sure if B4860 has
a Lynx 10G block and not something else.

> While I appreciate that your hardware engineers did a better job for
> 28g, many 10g serdes arbitrarily map lanes to protocol controllers.
> I think the mapping is too irregular to tame, and it is better to say
> "if you want this configuration, program this value".

Ok, but that's a lateral argument (or I'm not understanding the connection).

Some maintainers (Mark Brown for sure, from my personal experience) prefer
that expert-level knowledge of hardware should be hardcoded into the kernel
driver based on known stuff such as the SoC-specific compatible string.
I certainly share the same view.

With your case, I think that argument is even more pertinent, because
IIUC, the lane group protocols won't be put in the SoC .dtsi (so as to
be written only once), but in the board device trees (since the
available protocols invariably depend upon the board provisioning).
So, non-expert board device tree writers will have to know what's with
the PCCR stuff. Pretty brain-intensive.

> > I may be off with my understanding of the regularity you are talking about,
> > but the LX2160 (and Lynx 28G block) also has multi-lane protocols like 40G,
> > 100G, assuming that's what you are talking about. I haven't started yet
> > working on those for the mtip_backplane driver, but I'm not currently
> > seeing a problem with the architecture where a phy_device represents a
> > single lane that's part of a multi-lane port, and not an entire group.
> 
> Resetting one lane in a group will reset the rest, which could confuse
> the driver. Additionally, treating the lanes as one phy lets us set the
> reset direction and first lane bits correctly.

Yeah, in theory that is probably correct, but in practice can't we hide
our head in the sand and say that the "phys" phandles have to have the
lanes in the same order as LNmGCR0[1STLANE] expects them (which is also
the "natural" order as the SoC RM describes it)? With a "for" loop
implementation in the MAC, that would work just fine 100% of the time.
Doing more intricate massaging of the "phys" in the consumer, and you're
just asking for trouble. My 2 cents.

Sure, it's the same kind of ask of a board device tree writer as "hey,
please give me a good PCCR value", but I honestly think that the head
scratching will be much less severe.

> > In my imagination, there are 2 cases:
> > - all 4 lanes are managed by 

Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-21 Thread Jann Horn
On Mon, Aug 21, 2023 at 9:51 PM Hugh Dickins  wrote:
> Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private
> shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp()
> thought it had emptied: page lock on the huge page is enough to protect
> against WP faults (which find the PTE has been cleared), but not enough
> to protect against userfaultfd.  "BUG: Bad rss-counter state" followed.
>
> retract_page_tables() protects against this by checking !vma->anon_vma;
> but we know that MADV_COLLAPSE needs to be able to work on private shmem
> mappings, even those with an anon_vma prepared for another part of the
> mapping; and we know that MADV_COLLAPSE needs to work on shared shmem
> mappings which are userfaultfd_armed().  Whether it needs to work on
> private shmem mappings which are userfaultfd_armed(), I'm not so sure:
> but assume that it does.

I think we couldn't rely on anon_vma here anyway, since holding the
mmap_lock in read mode doesn't prevent concurrent creation of an
anon_vma?

> Just for this case, take the pmd_lock() two steps earlier: not because
> it gives any protection against this case itself, but because ptlock
> nests inside it, and it's the dropping of ptlock which let the bug in.
> In other cases, continue to minimize the pmd_lock() hold time.

Special-casing userfaultfd like this makes me a bit uncomfortable; but
I also can't find anything other than userfaultfd that would insert
pages into regions that are khugepaged-compatible, so I guess this
works?

I guess an alternative would be to use a spin_trylock() instead of the
current pmd_lock(), and if that fails, temporarily drop the page table
lock and then restart from step 2 with both locks held - and at that
point the page table scan should be fast since we expect it to usually
be empty.
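
For context, the insertion that races with the collapse is an ordinary
UFFDIO_COPY on a registered range; a hedged userspace sketch (error
handling and uffd registration omitted, names illustrative):

  #include <linux/userfaultfd.h>
  #include <sys/ioctl.h>

  /* Install a present PTE at 'dst' in a uffd-registered private shmem
   * mapping -- exactly the PTE collapse_pte_mapped_thp() did not expect. */
  static long uffd_copy(int uffd, void *dst, void *src, unsigned long len)
  {
          struct uffdio_copy uc = {
                  .dst  = (unsigned long)dst,
                  .src  = (unsigned long)src,
                  .len  = len,
                  .mode = 0,
          };

          return ioctl(uffd, UFFDIO_COPY, &uc);
  }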


Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-21 Thread Peter Xu
On Mon, Aug 21, 2023 at 12:51:20PM -0700, Hugh Dickins wrote:
> Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private
> shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp()
> thought it had emptied: page lock on the huge page is enough to protect
> against WP faults (which find the PTE has been cleared), but not enough
> to protect against userfaultfd.  "BUG: Bad rss-counter state" followed.
> 
> retract_page_tables() protects against this by checking !vma->anon_vma;
> but we know that MADV_COLLAPSE needs to be able to work on private shmem
> mappings, even those with an anon_vma prepared for another part of the
> mapping; and we know that MADV_COLLAPSE needs to work on shared shmem
> mappings which are userfaultfd_armed().  Whether it needs to work on
> private shmem mappings which are userfaultfd_armed(), I'm not so sure:
> but assume that it does.
> 
> Just for this case, take the pmd_lock() two steps earlier: not because
> it gives any protection against this case itself, but because ptlock
> nests inside it, and it's the dropping of ptlock which let the bug in.
> In other cases, continue to minimize the pmd_lock() hold time.
> 
> Reported-by: Jann Horn 
> Closes: 
> https://lore.kernel.org/linux-mm/CAG48ez0FxiRC4d3VTu_a9h=rg5fw-kyd5rg5xo_rdbm0ltt...@mail.gmail.com/
> Fixes: 1043173eb5eb ("mm/khugepaged: collapse_pte_mapped_thp() with 
> mmap_read_lock()")
> Signed-off-by: Hugh Dickins 

The locking is indeed slightly complicated.. but I didn't spot anything
wrong.

Acked-by: Peter Xu 

Thanks,

-- 
Peter Xu



Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-21 Thread Sean Anderson
On 8/21/23 15:58, Vladimir Oltean wrote:
> On Mon, Aug 21, 2023 at 02:46:53PM -0400, Sean Anderson wrote:
>> After further review, it seems the reason 28g can get away without this
>> is because there's a one-to-one mapping between protocol controllers and
>> lanes. Unfortunately, that regularity is not present for 10g.
>> 
>> --Sean
> 
> There are some things I saw in your phy-fsl-lynx-10g.c driver and device
> tree bindings that I don't understand (the concept of lane groups)

Each lane group corresponds to a phy device (struct phy). For protocols
like PCIe or XAUI which can use multiple lanes, this lets the driver
coordinate configuring all lanes at once in the correct order.

> and
> I'm not sure if they're related with what you're saying here, so if you
> could elaborate a bit more (ideally with an example) on the one-to-one
> mapping and the specific problems it causes, it would be great.

For e.g. the LS2088A, SerDes1 lanes H-A use SG1-8 and XFI1-8. SerDes2
lanes A-H use SG9-16 and XFI9-16. Each lane has its own controller, and
the mapping is 1-to-1. In the PCCRs, each controller uses the same
value, and is mapped in a regular way. So you can go directly from the
lane number to the right value to mask in the PCCR, with a very simple
translation scheme.

For e.g. the LS1046A, SerDes1 lane D uses XFI.9 (aka XFIA) and lane C
uses XFI.10 (aka XFIB). This is the opposite of how SerDes1 lanes A-D
use SGMII.9, .10, .5, and .6 (aka SGMIIA-D).

For e.g. the T4240, SerDes1 lanes A-H use sg1.5, .6, .10, .9, .1, .2,
.3, .4 (aka SGMII E, F, H, G, A, B, C, D).

For e.g. the B4860, SerDes lanes A uses sgmii1 or sgmii5 and B uses
sgmii2 or sgmii6. The MAC selected is determined based on the value
programmed into PCCR2.

While I appreciate that your hardware engineers did a better job for
28g, many 10g serdes arbitrarily map lanes to protocol controllers.
I think the mapping is too irregular to tame, and it is better to say
"if you want this configuration, program this value".

> I may be off with my understanding of the regularity you are talking about,
> but the LX2160 (and Lynx 28G block) also has multi-lane protocols like 40G,
> 100G, assuming that's what you are talking about. I haven't started yet
> working on those for the mtip_backplane driver, but I'm not currently
> seeing a problem with the architecture where a phy_device represents a
> single lane that's part of a multi-lane port, and not an entire group.

Resetting one lane in a group will reset the rest, which could confuse
the driver. Additionally, treating the lanes as one phy lets us set the
reset direction and first lane bits correctly.

> In my imagination, there are 2 cases:
> - all 4 lanes are managed by the single dpaa2-mac consumer (which has 4
>   phandles, and iterates over them with a "for" loop)
> - each of the 4 lanes is managed by the respective backplane AN/LT core,
>   and thus, there's one phandle to each lane

By doing the grouping in the driver, we also simplify the consumer
implementation. The MAC can always use a single phy, without worrying
about the actual number of lanes. This matches the hardware, since the
MAC is going to talk XGMII (or whatever) to the protocol controller
anyway.

I think it will be a lot easier to add multi-lane support with this
method because it gives the driver more information about what's going
on. The driver can control the whole configuration/reset process and the
timing.
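
To make the contrast concrete, a sketch of the two consumer models using
the generic PHY API (function and variable names are illustrative):

  #include <linux/phy/phy.h>

  /* Per-lane model: the MAC holds one phy per lane and iterates. */
  static int mac_serdes_up(struct phy **lane, int num_lanes)
  {
          int i, err;

          for (i = 0; i < num_lanes; i++) {
                  err = phy_init(lane[i]);
                  if (err)
                          return err;
                  err = phy_power_on(lane[i]);
                  if (err)
                          return err;
          }
          return 0;
  }

  /* Grouped model: the serdes driver hides the lanes behind one phy, so
   * the MAC reduces to a single phy_init()/phy_power_on() pair and the
   * driver controls multi-lane ordering and reset internally. */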

> I sketched some dt-bindings for the second case here, so I guess it must
> be the first scenario that's somehow problematic?
> https://patchwork.kernel.org/project/netdevbpf/patch/20230817150644.3605105-9-vladimir.oltean@nxp.com/

Yes, no issues with the second case.

--Sean


Re: [PATCH v4 22/28] dt-bindings: net: Add the Lantiq PEF2256 E1/T1/J1 framer

2023-08-21 Thread Rob Herring
On Fri, Aug 18, 2023 at 06:39:16PM +0200, Christophe Leroy wrote:
> From: Herve Codina 
> 
> The Lantiq PEF2256 is a framer and line interface component designed to
> fulfill all required interfacing between an analog E1/T1/J1 line and the
> digital PCM system highway/H.100 bus.
> 
> Signed-off-by: Herve Codina 
> Signed-off-by: Christophe Leroy 
> ---
>  .../bindings/net/lantiq,pef2256.yaml  | 219 ++
>  1 file changed, 219 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/lantiq,pef2256.yaml
> 
> diff --git a/Documentation/devicetree/bindings/net/lantiq,pef2256.yaml 
> b/Documentation/devicetree/bindings/net/lantiq,pef2256.yaml
> new file mode 100644
> index ..72f6777afa3a
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/lantiq,pef2256.yaml
> @@ -0,0 +1,219 @@
> +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/net/lantiq,pef2256.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Lantiq PEF2256
> +
> +maintainers:
> +  - Herve Codina 
> +
> +description:
> +  The Lantiq PEF2256, also known as Infineon PEF2256 or FALC56, is a framer 
> and
> +  line interface component designed to fulfill all required interfacing 
> between
> +  an analog E1/T1/J1 line and the digital PCM system highway/H.100 bus.
> +
> +properties:
> +  compatible:
> +items:
> +  - const: lantiq,pef2256
> +
> +  reg:
> +maxItems: 1
> +
> +  clocks:
> +items:
> +  - description: Master clock
> +  - description: System Clock Receive
> +  - description: System Clock Transmit
> +
> +  clock-names:
> +items:
> +  - const: mclk
> +  - const: sclkr
> +  - const: sclkx
> +
> +  interrupts:
> +maxItems: 1
> +
> +  reset-gpios:
> +description:
> +  GPIO used to reset the device.
> +maxItems: 1
> +
> +  '#framer-cells':

Not a standard binding. Do you need provider specific variable number of 
cells?

> +const: 0
> +
> +  pinctrl:
> +$ref: /schemas/pinctrl/pinctrl.yaml#
> +additionalProperties: false
> +
> +patternProperties:
> +  '-pins$':
> +type: object
> +$ref: /schemas/pinctrl/pincfg-node.yaml#
> +additionalProperties: false
> +
> +properties:
> +  pins:
> +enum: [ RPA, RPB, RPC, RPD, XPA, XPB, XPC, XPD ]
> +
> +  function:
> +enum: [ SYPR, RFM, RFMB, RSIGM, RSIG, DLR, FREEZE, RFSP, LOS,
> +SYPX, XFMS, XSIG, TCLK, XMFB, XSIGM, DLX, XCLK, XLT,
> +GPI, GPOH, GPOL ]
> +
> +required:
> +  - pins
> +  - function
> +
> +  lantiq,data-rate-bps:
> +$ref: /schemas/types.yaml#/definitions/uint32
> +enum: [2048000, 4096000, 8192000, 16384000]
> +default: 2048000
> +description:
> +  Data rate (bit per seconds) on the system highway.
> +
> +  lantiq,clock-falling-edge:
> +$ref: /schemas/types.yaml#/definitions/flag
> +description:
> +  Data is sent on falling edge of the clock (and received on the rising
> +  edge). If 'clock-falling-edge' is not present, data is sent on the
> +  rising edge (and received on the falling edge).
> +
> +  lantiq,channel-phase:
> +$ref: /schemas/types.yaml#/definitions/uint32
> +enum: [0, 1, 2, 3, 4, 5, 6, 7]
> +default: 0
> +description:

Need '|' to preserve formatting

> +  The pef2256 delivers a full frame (32 8bit time-slots in E1 and 24 8bit
> +  time-slots 8 8bit signaling in E1/J1) every 125us. This lead to a data
> +  rate of 2048000 bit/s. When lantiq,data-rate-bps is more than 2048000
> +  bit/s, the data (all 32 8bit) present in the frame are interleave with
> +  unused time-slots. The lantiq,channel-phase property allows to set the
> +  correct alignment of the interleave mechanism.
> +  For instance, suppose lantiq,data-rate-bps = 8192000 (ie 4*2048000), 
> and
> +  lantiq,channel-phase = 2, the interleave schema with unused time-slots
> +  (nu) and used time-slots (XX) for TSi is
> +nu nu XX nu nu nu XX nu nu nu XX nu
> +<-- TSi --> <- TSi+1 -> <- TSi+2 ->
> +  With lantiq,data-rate-bps = 8192000, and lantiq,channel-phase = 1, the
> +  interleave schema is
> +nu XX nu nu nu XX nu nu nu XX nu nu
> +<-- TSi --> <- TSi+1 -> <- TSi+2 ->
> +  With lantiq,data-rate-bps = 4096000 (ie 2*2048000), and
> +  lantiq,channel-phase = 1, the interleave schema is
> +nuXXnuXXnuXX
> +<-- TSi --> <- TSi+1 -> <- TSi+2 ->
> +
> +patternProperties:
> +  '^codec(-([0-9]|[1-2][0-9]|3[0-1]))?$':
> +type: object
> +$ref: /schemas/sound/dai-common.yaml
> +unevaluatedProperties: false
> +description:
> +  Codec provided by the pef2256. This codec allows to use some of the PCM
> +  system highway time-slots as audio channels to transport audio data 
> over
> +  

Re: [PATCH v4 06/28] dt-bindings: net: Add support for QMC HDLC

2023-08-21 Thread Rob Herring
On Fri, Aug 18, 2023 at 06:39:00PM +0200, Christophe Leroy wrote:
> From: Herve Codina 
> 
> The QMC (QUICC mutichannel controller) is a controller present in some
> PowerQUICC SoC such as MPC885.
> The QMC HDLC uses the QMC controller to transfer HDLC data.
> 
> Additionally, a framer can be connected to the QMC HDLC.
> If present, this framer is the interface between the TDM bus used by the
> QMC HDLC and the E1/T1 line.
> The QMC HDLC can use this framer to get information about the E1/T1 line
> and configure the E1/T1 line.
> 
> Signed-off-by: Herve Codina 
> Signed-off-by: Christophe Leroy 
> ---
>  .../devicetree/bindings/net/fsl,qmc-hdlc.yaml | 46 +++
>  1 file changed, 46 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/fsl,qmc-hdlc.yaml
> 
> diff --git a/Documentation/devicetree/bindings/net/fsl,qmc-hdlc.yaml 
> b/Documentation/devicetree/bindings/net/fsl,qmc-hdlc.yaml
> new file mode 100644
> index ..13f3572f0feb
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/fsl,qmc-hdlc.yaml
> @@ -0,0 +1,46 @@
> +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/net/fsl,qmc-hdlc.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Freescale/NXP QUICC Multichannel Controller (QMC) HDLC
> +
> +maintainers:
> +  - Herve Codina 
> +
> +description: |

Don't need '|'

> +  The QMC HDLC uses a QMC (QUICC Multichannel Controller) channel to transfer
> +  HDLC data.
> +
> +properties:
> +  compatible:
> +const: fsl,qmc-hdlc
> +
> +  fsl,qmc-chan:
> +$ref: /schemas/types.yaml#/definitions/phandle-array
> +items:
> +  - items:
> +  - description: phandle to QMC node
> +  - description: Channel number
> +description:
> +  Should be a phandle/number pair: the phandle to the QMC node and the
> +  QMC channel to use.
> +
> +  framer:
> +$ref: /schemas/types.yaml#/definitions/phandle
> +description:
> +  phandle to the framer node

What's the framer? 

> +
> +required:
> +  - compatible
> +  - fsl,qmc-chan
> +
> +additionalProperties: false
> +
> +examples:
> +  - |
> +hdlc {
> +compatible = "fsl,qmc-hdlc";
> +fsl,qmc-chan = <&qmc 16>;

Where does this node live? 

QMC is this[1]? Why don't you just add the compatible to channel@10 in 
the QMC node?

Rob

[1] Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,cpm1-scc-qmc.yaml


Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-21 Thread Vladimir Oltean
On Mon, Aug 21, 2023 at 02:46:53PM -0400, Sean Anderson wrote:
> After further review, it seems the reason 28g can get away without this
> is because there's a one-to-one mapping between protocol controllers and
> lanes. Unfortunately, that regularity is not present for 10g.
> 
> --Sean

There are some things I saw in your phy-fsl-lynx-10g.c driver and device
tree bindings that I don't understand (the concept of lane groups), and
I'm not sure if they're related with what you're saying here, so if you
could elaborate a bit more (ideally with an example) on the one-to-one
mapping and the specific problems it causes, it would be great.

I may be off with my understanding of the regularity you are talking about,
but the LX2160 (and Lynx 28G block) also has multi-lane protocols like 40G,
100G, assuming that's what you are talking about. I haven't started yet
working on those for the mtip_backplane driver, but I'm not currently
seeing a problem with the architecture where a phy_device represents a
single lane that's part of a multi-lane port, and not an entire group.

In my imagination, there are 2 cases:
- all 4 lanes are managed by the single dpaa2-mac consumer (which has 4
  phandles, and iterates over them with a "for" loop)
- each of the 4 lanes is managed by the respective backplane AN/LT core,
  and thus, there's one phandle to each lane

I sketched some dt-bindings for the second case here, so I guess it must
be the first scenario that's somehow problematic?
https://patchwork.kernel.org/project/netdevbpf/patch/20230817150644.3605105-9-vladimir.olt...@nxp.com/


[PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-21 Thread Hugh Dickins
Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private
shmem mapping can add valid PTEs to a page table which
collapse_pte_mapped_thp() thought it had emptied: page lock on the
huge page is enough to protect
against WP faults (which find the PTE has been cleared), but not enough
to protect against userfaultfd.  "BUG: Bad rss-counter state" followed.

retract_page_tables() protects against this by checking !vma->anon_vma;
but we know that MADV_COLLAPSE needs to be able to work on private shmem
mappings, even those with an anon_vma prepared for another part of the
mapping; and we know that MADV_COLLAPSE needs to work on shared shmem
mappings which are userfaultfd_armed().  Whether it needs to work on
private shmem mappings which are userfaultfd_armed(), I'm not so sure:
but assume that it does.

Just for this case, take the pmd_lock() two steps earlier: not because
it gives any protection against this case itself, but because ptlock
nests inside it, and it's the dropping of ptlock which let the bug in.
In other cases, continue to minimize the pmd_lock() hold time.

Reported-by: Jann Horn 
Closes: 
https://lore.kernel.org/linux-mm/CAG48ez0FxiRC4d3VTu_a9h=rg5fw-kyd5rg5xo_rdbm0ltt...@mail.gmail.com/
Fixes: 1043173eb5eb ("mm/khugepaged: collapse_pte_mapped_thp() with 
mmap_read_lock()")
Signed-off-by: Hugh Dickins 
---
 mm/khugepaged.c | 38 +-
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 40d43eccdee8..d5650541083a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1476,7 +1476,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr,
struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
-   spinlock_t *pml, *ptl;
+   spinlock_t *pml = NULL, *ptl;
int nr_ptes = 0, result = SCAN_FAIL;
int i;
 
@@ -1572,9 +1572,25 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr,
haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start();
notified = true;
-   start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+
+   /*
+* pmd_lock covers a wider range than ptl, and (if split from mm's
+* page_table_lock) ptl nests inside pml. The less time we hold pml,
+* the better; but userfaultfd's mfill_atomic_pte() on a private VMA
+* inserts a valid as-if-COWed PTE without even looking up page cache.
+* So page lock of hpage does not protect from it, so we must not drop
+* ptl before pgt_pmd is removed, so uffd private needs pml taken now.
+*/
+   if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
+   pml = pmd_lock(mm, pmd);
+
+   start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
goto abort;
+   if (!pml)
+   spin_lock(ptl);
+   else if (ptl != pml)
+   spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 
/* step 2: clear page table and adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
@@ -1608,7 +1624,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr,
nr_ptes++;
}
 
-   pte_unmap_unlock(start_pte, ptl);
+   pte_unmap(start_pte);
+   if (!pml)
+   spin_unlock(ptl);
 
/* step 3: set proper refcount and mm_counters. */
if (nr_ptes) {
@@ -1616,12 +1634,12 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr,
add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
}
 
-   /* step 4: remove page table */
-
-   /* Huge page lock is still held, so page table must remain empty */
-   pml = pmd_lock(mm, pmd);
-   if (ptl != pml)
-   spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+   /* step 4: remove empty page table */
+   if (!pml) {
+   pml = pmd_lock(mm, pmd);
+   if (ptl != pml)
+   spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+   }
pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
pmdp_get_lockless_sync();
if (ptl != pml)
@@ -1648,6 +1666,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr,
}
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
+   if (pml && pml != ptl)
+   spin_unlock(pml);
if (notified)
mmu_notifier_invalidate_range_end();
 drop_hpage:
-- 
2.35.3
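
To summarize the locking protocol the patch arrives at for the uffd-armed
private case, in outline (a condensed reading of the diff above, not
additional code from it):

	pml = pmd_lock(mm, pmd);	/* taken early: ptlock nests inside pml */
	start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
	if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
	/* steps 2-3: clear PTEs, adjust rmap and mm_counters; ptl is never
	 * dropped, so a racing UFFDIO_COPY cannot slip a fresh PTE in */
	pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);	/* step 4 */
	if (ptl != pml)
		spin_unlock(ptl);
	spin_unlock(pml);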



Re: [RFC PATCH v11 28/29] KVM: selftests: Add basic selftest for guest_memfd()

2023-08-21 Thread Ackerley Tng
Sean Christopherson  writes:

> On Mon, Aug 07, 2023, Ackerley Tng wrote:
>> Sean Christopherson  writes:
>> 
>> > Add a selftest to verify the basic functionality of guest_memfd():
>> >
>> > 
>> 
>> Here's one more test:
>
> First off, thank you!  I greatly appreciate all the selftests work you (and
> others!) have been doing.
>
> For v2, can you please post a standalone patch?  My workflow barfs on 
> unrelated,
> inlined patches.  I'm guessing I can get b4 to play nice, but it's easier to 
> just
> yell at people :-)
>

Here's a standalone patch :)
https://lore.kernel.org/lkml/20230821194411.2165757-1-ackerley...@google.com/

> 


Re: [BUG] Re: [PATCH v3 10/13] mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()

2023-08-21 Thread Hugh Dickins
On Mon, 14 Aug 2023, Jann Horn wrote:
> On Wed, Jul 12, 2023 at 6:42 AM Hugh Dickins  wrote:
> > Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp().
> > It does need mmap_read_lock(), but it does not need mmap_write_lock(),
> > nor vma_start_write() nor i_mmap lock nor anon_vma lock.  All racing
> > paths are relying on pte_offset_map_lock() and pmd_lock(), so use those.
> 
> We can still have a racing userfaultfd operation at the "/* step 4:
> remove page table */" point that installs a new PTE before the page
> table is removed.

And you've been very polite not to remind me that this is exactly
what you warned me about, in connection with retract_page_tables(),
nearly three months ago:

https://lore.kernel.org/linux-mm/cag48ez0af1rf1apsjn9ycnfyfq4yqsd4gqb6f2wfhf7jmdi...@mail.gmail.com/

> 
> To reproduce, patch a delay into the kernel like this:
> 
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 9a6e0d507759..27cc8dfbf3a7 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -20,6 +20,7 @@
>  #include 
>  #include 
>  #include 
> +#include <linux/delay.h>
> 
>  #include 
>  #include 
> @@ -1617,6 +1618,11 @@ int collapse_pte_mapped_thp(struct mm_struct
> *mm, unsigned long addr,
> }
> 
> /* step 4: remove page table */
> +   if (strcmp(current->comm, "DELAYME") == 0) {
> +   pr_warn("%s: BEGIN DELAY INJECTION\n", __func__);
> +   mdelay(5000);
> +   pr_warn("%s: END DELAY INJECTION\n", __func__);
> +   }
> 
> /* Huge page lock is still held, so page table must remain empty */
> pml = pmd_lock(mm, pmd);
> 
> 
> And then run the attached reproducer against mm/mm-everything. You
> should get this in dmesg:
> 
> [  206.578096] BUG: Bad rss-counter state mm:0942ebea
> type:MM_ANONPAGES val:1

Very helpful, thank you Jann.

I got a bit distracted when I then found mm's recent addition of
UFFDIO_POISON: thought I needed to change both collapse_pte_mapped_thp()
and retract_page_tables() now to cope with mfill_atomic_pte_poison()
inserting into even a userfaultfd_armed shared VMA.

But eventually, on second thoughts, realized that's only inserting a pte
marker, invalid, so won't cause any actual trouble.  A little untidy,
to leave that behind in a supposedly empty page table about to be freed,
but not worth refactoring these functions to avoid a non-bug.

And though syzbot and JH may find some fun with it, I don't think any
real application would be insertng a PTE_MARKER_POISONED where a huge
page collapse is almost complete.

So I scaled back to a more proportionate fix, following.  Sorry, I've
slightly messed up applying the "DELAY INJECTION" patch above: not
intentional, honest!  (mdelay while holding the locks is still good.)

Hugh

Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

2023-08-21 Thread Sean Christopherson
On Mon, Aug 21, 2023, Ackerley Tng wrote:
> Sean Christopherson  writes:
> 
> > On Tue, Aug 15, 2023, Ackerley Tng wrote:
> >> Sean Christopherson  writes:
> >> > Nullifying the KVM pointer isn't sufficient, because without additional 
> >> > actions
> >> > userspace could extract data from a VM by deleting its memslots and then 
> >> > binding
> >> > the guest_memfd to an attacker controlled VM.  Or more likely with TDX 
> >> > and SNP,
> >> > induce badness by coercing KVM into mapping memory into a guest with the 
> >> > wrong
> >> > ASID/HKID.
> >> >
> >> > I can think of three ways to handle that:
> >> >
> >> >   (a) prevent a different VM from *ever* binding to the gmem instance
> >> >   (b) free/zero physical pages when unbinding
> >> >   (c) free/zero when binding to a different VM
> >> >
> >> > Option (a) is easy, but that pretty much defeats the purpose of 
> >> > decoupling
> >> > guest_memfd from a VM.
> >> >
> >> > Option (b) isn't hard to implement, but it screws up the lifecycle of 
> >> > the memory,
> >> > e.g. would require memory when a memslot is deleted.  That isn't 
> >> > necessarily a
> >> > deal-breaker, but it runs counter to how KVM memslots currently operate.  
> >> > Memslots
> >> > are basically just weird page tables, e.g. deleting a memslot doesn't 
> >> > have any
> >> > impact on the underlying data in memory.  TDX throws a wrench in this as 
> >> > removing
> >> > a page from the Secure EPT is effectively destructive to the data (can't 
> >> > be mapped
> >> > back in to the VM without zeroing the data), but IMO that's an oddity 
> >> > with TDX and
> >> > not necessarily something we want to carry over to other VM types.
> >> >
> >> > There would also be performance implications (probably a non-issue in 
> >> > practice),
> >> > and weirdness if/when we get to sharing, linking and/or mmap()ing gmem.  
> >> > E.g. what
> >> > should happen if the last memslot (binding) is deleted, but there are 
> >> > outstanding userspace
> >> > mappings?
> >> >
> >> > Option (c) is better from a lifecycle perspective, but it adds its own 
> >> > flavor of
> >> > complexity, e.g. the performant way to reclaim TDX memory requires the 
> >> > TDMR
> >> > (effectively the VM pointer), and so a deferred reclaim doesn't really 
> >> > work for
> >> > TDX.  And I'm pretty sure it *can't* work for SNP, because RMP entries 
> >> > must not
> >> > outlive the VM; KVM can't reuse an ASID if there are pages assigned to 
> >> > that ASID
> >> > in the RMP, i.e. until all memory belonging to the VM has been fully 
> >> > freed.

...

> I agree with you that nulling the KVM pointer is insufficient to keep
> host userspace out of the TCB. Among the three options (a) preventing a
> different VM (HKID/ASID) from binding to the gmem instance, or zeroing
> the memory either (b) on unbinding, or (c) on binding to another VM
> (HKID/ASID),
> 
> (a) sounds like adding a check issued to TDX/SNP upon binding and this
> check would just return OK for software-protected VMs (line of sight
> to removing host userspace from TCB).
> 
> Or, we could go further for software-protected VMs and add tracking in
> the inode to prevent the same inode from being bound to different
> "HKID/ASID"s, perhaps like this:
> 
> + On first binding, store the KVM pointer in the inode - not file (but
>   not hold a refcount)
> + On rebinding, check that the KVM matches the pointer in the inode
> + On intra-host migration, update the KVM pointer in the inode to allow
>   binding to the new struct kvm
> 
> I think you meant associating the file with a struct kvm at creation
> time as an implementation for (a), but technically since the inode is
> the representation of memory, tracking of struct kvm should be with the
> inode instead of the file.
> 
> (b) You're right that this messes up the lifecycle of the memory and
> wouldn't work with intra-host migration.
> 
> (c) sounds like doing the clearing on a check similar to that of (a)

Sort of, though it's much nastier, because it requires the "old" KVM instance to
be alive enough to support various operations.  I.e. we'd have to make stronger
guarantees about exactly when the handoff/transition could happen.

> If we track struct kvm with the inode, then I think (a), (b) and (c) can
> be independent of the refcounting method. What do you think?

No go.  Because again, the inode (physical memory) is coupled to the virtual 
machine
as a thing, not to a "struct kvm".  Or more concretely, the inode is coupled to 
an
ASID or an HKID, and there can be multiple "struct kvm" objects associated with 
a
single ASID.  And at some point in the future, I suspect we'll have multiple KVM
objects per HKID too.

The current SEV use case is for the migration helper, where two KVM objects 
share
a single ASID (the "real" VM and the helper).  I suspect TDX will end up with
similar behavior where helper "VMs" can use the HKID of the "real" VM.  For KVM,
that means multiple struct kvm objects being 

Re: [PATCH v4 21/28] net: wan: Add framer framework support

2023-08-21 Thread Jakub Kicinski
On Mon, 21 Aug 2023 05:19:22 + Christophe Leroy wrote:
> As I said in the cover letter, this series only fixes critical build 
> failures that happened when CONFIG_MODULES is set. The purpose was to 
> allow robots to perform their job up to the end. Other feedback and 
> comments will be taken into account by Hervé when he is back from holidays.

I missed this too, FTR this is unacceptable.

Quoting documentation:

  **Do not** post your patches just to run them through the checks.
  You must ensure that your patches are ready by testing them locally
  before posting to the mailing list. The patchwork build bot instance
  gets overloaded very easily and netdev@vger really doesn't need more
  traffic if we can help it.
  
See: 
https://www.kernel.org/doc/html/next/process/maintainer-netdev.html#patchwork-checks


Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-21 Thread Sean Anderson
On 8/21/23 14:13, Ioana Ciornei wrote:
> On Mon, Aug 21, 2023 at 01:45:44PM -0400, Sean Anderson wrote:
>> Well, we have two pieces of information we need
>> 
>> - What values do we need to program in the PCCRs to select a particular
>>   mode? This includes whether to e.g. set the KX bits.
>> - Implied by the above, what protocols are supported on which lanes?
>>   This is not strictly necessary, but will certainly solve a lot of
>>   headscratching.
>> 
>> This information varies between different socs, and different serdes on
>> the same socs. We can't really look at the RCW or the clocks and figure
>> out what we need to program. So what are our options?
>> 
>> - We can have a separate compatible for each serdes on each SoC (e.g.
>>   "fsl,lynx-10g-a"). This was rejected by the devicetree maintainers.
>> - We can have one compatible for each SoC, and determine the serdes
>>   based on the address. I would like to avoid this...
> 
> To me this really seems like a straightforward approach.

Indeed it would be straightforward, but what's the point of having a
devicetree in the first place then? We could just go back to being a
(non-dt) platform device.

>> - We can stick all the details which vary between serdes/socs into the
>>   device tree. This is very flexible, since supporting new SoCs is
>>   mostly a matter of adding a new compatible and writing a new
>>   devicetree. On the other hand, if you have a bug in your devicetree,
>>   it's not easy to fix it in the kernel.
>> - Just don't support protocol switching. The 28G driver does this, which
>>   is why it only has one compatible. However, supporting protocol
>>   switching is a core goal of this driver, so dropping support is not an
>>   option.
>> 
> 
> The Lynx 28G SerDes driver does support protocol switching.
> How did you arrive at the opposite conclusion?

Sorry, it's been a while and I just did a quick look-over, and noticed
there was no configuration for different SoCs.

After further review, it seems the reason 28g can get away without this
is because there's a one-to-one mapping between protocol controllers and
lanes. Unfortunately, that regularity is not present for 10g.

--Sean

> The initial commit on the driver is even part of a patch set named
> "dpaa2-mac: add support for changing the protocol at runtime". In
> upstream it only supports the 1G <-> 10G transition but I have some
> patches on the way to also support 25G.
> 
> Ioana


Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-21 Thread Vladimir Oltean
On Mon, Aug 21, 2023 at 09:13:49PM +0300, Ioana Ciornei wrote:
> > - We can have one compatible for each SoC, and determine the serdes
> >   based on the address. I would like to avoid this...
> 
> To me this really seems like a straightforward approach.

+1


Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-21 Thread Ioana Ciornei
On Mon, Aug 21, 2023 at 01:45:44PM -0400, Sean Anderson wrote:
> Well, we have two pieces of information we need
> 
> - What values do we need to program in the PCCRs to select a particular
>   mode? This includes whether to e.g. set the KX bits.
> - Implied by the above, what protocols are supported on which lanes?
>   This is not strictly necessary, but will certainly solve a lot of
>   headscratching.
> 
> This information varies between different socs, and different serdes on
> the same socs. We can't really look at the RCW or the clocks and figure
> out what we need to program. So what are our options?
> 
> - We can have a separate compatible for each serdes on each SoC (e.g.
>   "fsl,lynx-10g-a"). This was rejected by the devicetree maintainers.
> - We can have one compatible for each SoC, and determine the serdes
>   based on the address. I would like to avoid this...

To me this really seems like a straightforward approach.

> - We can stick all the details which vary between serdes/socs into the
>   device tree. This is very flexible, since supporting new SoCs is
>   mostly a matter of adding a new compatible and writing a new
>   devicetree. On the other hand, if you have a bug in your devicetree,
>   it's not easy to fix it in the kernel.
> - Just don't support protocol switching. The 28G driver does this, which
>   is why it only has one compatible. However, supporting protocol
>   switching is a core goal of this driver, so dropping support is not an
>   option.
> 

The Lynx 28G SerDes driver does support protocol switching.
How did you arrive at the opposite conclusion?

The initial commit on the driver is even part of a patch set named
"dpaa2-mac: add support for changing the protocol at runtime". In
upstream it only supports the 1G <-> 10G transition but I have some
patches on the way to also support 25G.

Ioana


Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-21 Thread Sean Anderson
On 8/21/23 08:49, Vladimir Oltean wrote:
> Hi Sean,
> 
> On Fri, Aug 11, 2023 at 07:36:37PM +0300, Vladimir Oltean wrote:
>> Let me explain that approach, because your mention of "swapping out the
>> bootloaders" makes it appear as if you are not visualising what I am
>> proposing.
>> 
>> The Lynx SerDes family has 2 PLLs, and more lanes (4 or 8). Each lane
>> uses one PLL or the other, to derive its protocol frequency. Through the
>> RCW, you provision the 2 PLL frequencies that may be used by the lanes
>> at runtime.
>> 
>> The Lynx 28G SerDes driver reads the PLL frequencies in
>> lynx_28g_pll_read_configuration(), and determines the interface modes
>> supportable by each PLL (this is used by phylink). But it never changes
>> those PLL frequencies, since that operation is practically impossible in
>> the general sense (PLLs are shared by multiple lanes, so changing a PLL
>> frequency disrupts all lanes that use it).
> 
> Is my high-level feedback clear and actionable to you? I am suggesting
> to keep the look and feel the same between the lynx-10g and lynx-28g
> drivers, and to not use "fsl,type" protocols listed in the device tree
> as the immutable source of information for populating mode->protos, but
> instead the current PLL frequency configuration. So this implies that I
> am requesting that the dt-bindings should not contain a listing of the
> supported protocols.

Well, we have two pieces of information we need

- What values do we need to program in the PCCRs to select a particular
  mode? This includes whether to e.g. set the KX bits.
- Implied by the above, what protocols are supported on which lanes?
  This is not strictly necessary, but will certainly solve a lot of
  headscratching.

This information varies between different socs, and different serdes on
the same socs. We can't really look at the RCW or the clocks and figure
out what we need to program. So what are our options?

- We can have a separate compatible for each serdes on each SoC (e.g.
  "fsl,lynx-10g-a"). This was rejected by the devicetree maintainers.
- We can have one compatible for each SoC, and determine the serdes
  based on the address. I would like to avoid this...
- We can stick all the details which vary between serdes/socs into the
  device tree. This is very flexible, since supporting new SoCs is
  mostly a matter of adding a new compatible and writing a new
  devicetree. On the other hand, if you have a bug in your devicetree,
  it's not easy to fix it in the kernel.
- Just don't support protocol switching. The 28G driver does this, which
  is why it only has one compatible. However, supporting protocol
  switching is a core goal of this driver, so dropping support is not an
  option.

I'm open to any other suggestions you have, but this was the best way I
could figure out to get this information to the driver in a way which
would be acceptable to the devicetree folks.

--Sean


Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

2023-08-21 Thread Ackerley Tng
Sean Christopherson  writes:

> On Tue, Aug 15, 2023, Ackerley Tng wrote:
>> Sean Christopherson  writes:
>>
>> >> I feel that memslots form a natural way of managing usage of the gmem
>> >> file. When a memslot is created, it is using the file; hence we take a
>> >> refcount on the gmem file, and as memslots are removed, we drop
>> >> refcounts on the gmem file.
>> >
>> > Yes and no.  It's definitely more natural *if* the goal is to allow 
>> > guest_memfd
>> > memory to exist without being attached to a VM.  But I'm not at all 
>> > convinced
>> > that we want to allow that, or that it has desirable properties.  With TDX 
>> > and
>> > SNP in particular, I'm pretty sure that allowing memory to outlive the VM 
>> > is
>> > very undesirable (more below).
>> >
>>
>> This is a little confusing, with the file/inode split in gmem where the
>> physical memory/data is attached to the inode and the file represents
>> the VM's view of that memory, won't the memory outlive the VM?
>
> Doh, I overloaded the term "VM".  By "VM" I meant the virtual machine as a 
> "thing"
> the rest of the world sees and interacts with, not the original "struct kvm" 
> object.
>
> Because yes, you're absolutely correct that the memory will outlive "struct 
> kvm",
> but it won't outlive the virtual machine, and specifically won't outlive the
> ASID (SNP) / HKID (TDX) to which it's bound.
>

Yup we agree on this now :) The memory should not outlive the ASID
(SNP) / HKID (TDX) to which it's bound.

>> This [1] POC was built based on that premise, that the gmem inode can be
>> linked to another file and handed off to another VM, to facilitate
>> intra-host migration, where the point is to save the work of rebuilding
>> the VM's memory in the destination VM.
>>
>> With this, the bindings don't outlive the VM, but the data/memory
>> does. I think this split design you proposed is really nice.
>>
>> >> The KVM pointer is shared among all the bindings in gmem’s xarray, and we 
>> >> can
>> >> enforce that a gmem file is used only with one VM:
>> >>
>> >> + When binding a memslot to the file, if a kvm pointer exists, it must
>> >>   be the same kvm as the one in this binding
>> >> + When the binding to the last memslot is removed from a file, NULL the
>> >>   kvm pointer.
>> >
>> > Nullifying the KVM pointer isn't sufficient, because without additional 
>> > actions
>> > userspace could extract data from a VM by deleting its memslots and then 
>> > binding
>> > the guest_memfd to an attacker controlled VM.  Or more likely with TDX and 
>> > SNP,
>> > induce badness by coercing KVM into mapping memory into a guest with the 
>> > wrong
>> > ASID/HKID.
>> >
>> > I can think of three ways to handle that:
>> >
>> >   (a) prevent a different VM from *ever* binding to the gmem instance
>> >   (b) free/zero physical pages when unbinding
>> >   (c) free/zero when binding to a different VM
>> >
>> > Option (a) is easy, but that pretty much defeats the purpose of decoupling
>> > guest_memfd from a VM.
>> >
>> > Option (b) isn't hard to implement, but it screws up the lifecycle of the 
>> > memory,
>> > e.g. would require memory when a memslot is deleted.  That isn't 
>> > necessarily a
>> > deal-breaker, but it runs counter to how KVM memslots currently operate.  
>> > Memslots
>> > are basically just weird page tables, e.g. deleting a memslot doesn't have 
>> > any
>> > impact on the underlying data in memory.  TDX throws a wrench in this as 
>> > removing
>> > a page from the Secure EPT is effectively destructive to the data (can't 
>> > be mapped
>> > back in to the VM without zeroing the data), but IMO that's an oddity with 
>> > TDX and
>> > not necessarily something we want to carry over to other VM types.
>> >
>> > There would also be performance implications (probably a non-issue in 
>> > practice),
>> > and weirdness if/when we get to sharing, linking and/or mmap()ing gmem.  
>> > E.g. what
>> > should happen if the last memslot (binding) is deleted, but there are 
>> > outstanding userspace
>> > mappings?
>> >
>> > Option (c) is better from a lifecycle perspective, but it adds its own 
>> > flavor of
>> > complexity, e.g. the performant way to reclaim TDX memory requires the TDMR
>> > (effectively the VM pointer), and so a deferred reclaim doesn't really 
>> > work for
>> > TDX.  And I'm pretty sure it *can't* work for SNP, because RMP entries 
>> > must not
>> > outlive the VM; KVM can't reuse an ASID if there are pages assigned to 
>> > that ASID
>> > in the RMP, i.e. until all memory belonging to the VM has been fully freed.
>> >
>>
>> If we are on the same page that the memory should outlive the VM but not
>> the bindings, then associating the gmem inode to a new VM should be a
>> feature and not a bug.
>>
>> What do we want to defend against here?
>>
>> (a) Malicious host VMM
>>
>> For a malicious host VMM to read guest memory (with TDX and SNP), it can
>> create a new VM with the same HKID/ASID as the victim VM, rebind the
>> gmem inode 

Re: [PATCH v5 0/3 RESEND] sed-opal: keyrings, discovery, revert, key store

2023-08-21 Thread Greg Joyce
On Wed, 2023-08-16 at 23:41 +0300, Jarkko Sakkinen wrote:
> On Wed Aug 16, 2023 at 10:45 PM EEST, Greg Joyce wrote:
> > It's been almost 4 weeks since the last resend and there haven't
> > been
> > any comments. Is there anything that needs to be changed for
> > acceptance?
> > 
> > Thanks for your input.
> > 
> > Greg
> > 
> > On Fri, 2023-07-21 at 16:15 -0500, gjo...@linux.vnet.ibm.com wrote:
> > > From: Greg Joyce 
> > > 
> > > This patchset has gone through numerous rounds of review and
> > > all comments/suggestions have been addressed. The reviews have
> > > covered all relevant areas including reviews by block and keyring
> > > developers as well as the SED Opal maintainer. The last
> > > patchset submission has not solicited any responses in the
> > > six weeks since it was last distributed. The changes are
> > > generally useful and ready for inclusion.
> > > 
> > > TCG SED Opal is a specification from The Trusted Computing Group
> > > that allows self encrypting storage devices (SED) to be locked at
> > > power on and require an authentication key to unlock the drive.
> > > 
> > > The current SED Opal implementation in the block driver
> > > requires that authentication keys be provided in an ioctl
> > > so that they can be presented to the underlying SED
> > > capable drive. Currently, the key is typically entered by
> > > a user with an application like sedutil or sedcli. While
> > > this process works, it does not lend itself to automation
> > > like unlock by a udev rule.
> > > 
> > > The SED block driver has been extended so it can alternatively
> > > obtain a key from a sed-opal kernel keyring. The SED ioctls
> > > will indicate the source of the key, either directly in the
> > > ioctl data or from the keyring.
> > > 
> > > Two new SED ioctls have also been added. These are:
> > >   1) IOC_OPAL_REVERT_LSP to revert LSP state
> > >   2) IOC_OPAL_DISCOVERY to discover drive capabilities/state
> > > 
> > > change log v5:
> > > - rebase to for-6.5/block
> > > 
> > > change log v4:
> > > - rebase to 6.3-rc7
> > > - replaced "255" magic number with U8_MAX
> > > 
> > > change log:
> > > - rebase to 6.x
> > > - added latest reviews
> > > - removed platform functions for persistent key storage
> > > - replaced key update logic with key_create_or_update()
> > > - minor bracing and padding changes
> > > - add error returns
> > > - opal_key structure is application provided but kernel
> > >   verified
> > > - added brief description of TCG SED Opal
> > > 
> > > 
> > > Greg Joyce (3):
> > >   block: sed-opal: Implement IOC_OPAL_DISCOVERY
> > >   block: sed-opal: Implement IOC_OPAL_REVERT_LSP
> > >   block: sed-opal: keyring support for SED keys
> > > 
> > >  block/Kconfig |   2 +
> > >  block/opal_proto.h|   4 +
> > >  block/sed-opal.c  | 252
> > > +-
> > >  include/linux/sed-opal.h  |   5 +
> > >  include/uapi/linux/sed-opal.h |  25 +++-
> > >  5 files changed, 282 insertions(+), 6 deletions(-)
> > > 
> > > 
> > > base-commit: 1341c7d2ccf42ed91aea80b8579d35bc1ea381e2
> 
> It all looks good to me, so I can give my ack to all patches:
> 
> Acked-by: Jarkko Sakkinen 
> 
> ... but should not probably go to my tree.
> 
> BR, Jarkko

Thanks for the ack Jarkko. Any thoughts on which tree it should go to?

Greg
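
For background on the keyring half of this series: a kernel keyring is
seeded from userspace with add_key(2), roughly as in the sketch below. The
sketch is illustrative only (the key type, description, and target keyring
that the sed-opal patches actually consult are defined by the series
itself, not here), but this mechanism is what makes udev-driven unlock
possible.

#include <keyutils.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Illustrative key name and payload only; link with -lkeyutils. */
	const char pw[] = "example-passphrase";
	key_serial_t id = add_key("user", "sed-opal-example",
				  pw, strlen(pw), KEY_SPEC_USER_KEYRING);

	if (id < 0) {
		perror("add_key");
		return 1;
	}
	printf("key %d added to the user keyring\n", id);
	return 0;
}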



[PATCH 2/2] powerpc/powernv: Use struct opal_prd_msg in more places

2023-08-21 Thread Michael Ellerman
Use the newly added struct opal_prd_msg in some other functions that
operate on opal_prd messages, rather than using other types.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/platforms/powernv/opal-prd.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-prd.c 
b/arch/powerpc/platforms/powernv/opal-prd.c
index 40e26e9f318f..327e2f76905d 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -221,8 +221,8 @@ static ssize_t opal_prd_write(struct file *file, const char 
__user *buf,
size_t count, loff_t *ppos)
 {
struct opal_prd_msg_header hdr;
+   struct opal_prd_msg *msg;
ssize_t size;
-   void *msg;
int rc;
 
size = sizeof(hdr);
@@ -254,12 +254,12 @@ static ssize_t opal_prd_write(struct file *file, const 
char __user *buf,
 
 static int opal_prd_release(struct inode *inode, struct file *file)
 {
-   struct opal_prd_msg_header msg;
+   struct opal_prd_msg msg;
 
-   msg.size = cpu_to_be16(sizeof(msg));
-   msg.type = OPAL_PRD_MSG_TYPE_FINI;
+   msg.header.size = cpu_to_be16(sizeof(msg));
+   msg.header.type = OPAL_PRD_MSG_TYPE_FINI;
 
-   opal_prd_msg((struct opal_prd_msg *)&msg);
+   opal_prd_msg(&msg);
 
atomic_xchg(&prd_usage, 0);
 
-- 
2.41.0



[PATCH 1/2] powerpc/powernv: Fix fortify source warnings in opal-prd.c

2023-08-21 Thread Michael Ellerman
As reported by Mahesh & Aneesh, opal_prd_msg_notifier() triggers a
FORTIFY_SOURCE warning:

  memcpy: detected field-spanning write (size 32) of single field "&item->msg" 
at arch/powerpc/platforms/powernv/opal-prd.c:355 (size 4)
  WARNING: CPU: 9 PID: 660 at arch/powerpc/platforms/powernv/opal-prd.c:355 
opal_prd_msg_notifier+0x174/0x188 [opal_prd]
  NIP opal_prd_msg_notifier+0x174/0x188 [opal_prd]
  LR  opal_prd_msg_notifier+0x170/0x188 [opal_prd]
  Call Trace:
opal_prd_msg_notifier+0x170/0x188 [opal_prd] (unreliable)
notifier_call_chain+0xc0/0x1b0
atomic_notifier_call_chain+0x2c/0x40
opal_message_notify+0xf4/0x2c0

This happens because the copy is targeting item->msg, which is only 4
bytes in size, even though the enclosing item was allocated with extra
space following the msg.

To fix the warning define struct opal_prd_msg with a union of the header
and a flex array, and have the memcpy target the flex array.

Reported-by: Aneesh Kumar K.V 
Reported-by: Mahesh Salgaonkar 
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/platforms/powernv/opal-prd.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-prd.c 
b/arch/powerpc/platforms/powernv/opal-prd.c
index 113bdb151f68..40e26e9f318f 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -24,13 +24,20 @@
 #include 
 
 
+struct opal_prd_msg {
+   union {
+   struct opal_prd_msg_header header;
+   DECLARE_FLEX_ARRAY(u8, data);
+   };
+};
+
 /*
  * The msg member must be at the end of the struct, as it's followed by the
  * message data.
  */
 struct opal_prd_msg_queue_item {
-   struct list_head            list;
-   struct opal_prd_msg_header  msg;
+   struct list_head    list;
+   struct opal_prd_msg msg;
 };
 
 static struct device_node *prd_node;
@@ -156,7 +163,7 @@ static ssize_t opal_prd_read(struct file *file, char __user 
*buf,
int rc;
 
/* we need at least a header's worth of data */
-   if (count < sizeof(item->msg))
+   if (count < sizeof(item->msg.header))
return -EINVAL;
 
if (*ppos)
@@ -186,7 +193,7 @@ static ssize_t opal_prd_read(struct file *file, char __user 
*buf,
return -EINTR;
}
 
-   size = be16_to_cpu(item->msg.size);
+   size = be16_to_cpu(item->msg.header.size);
if (size > count) {
err = -EINVAL;
goto err_requeue;
@@ -352,7 +359,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
if (!item)
return -ENOMEM;
 
-   memcpy(&item->msg, msg->params, msg_size);
+   memcpy(&item->msg.data, msg->params, msg_size);
 
spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
list_add_tail(&item->list, &opal_prd_msg_queue);
-- 
2.41.0
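
The union-plus-flexible-array pattern used above generalizes beyond the
kernel. Below is a minimal userspace sketch (GNU C, since it mirrors
roughly what the kernel's DECLARE_FLEX_ARRAY() macro expands to; it is not
code from the patch) showing why a fortified memcpy() is satisfied once
the copy targets the flex array instead of a small header field:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct hdr {
	unsigned short size;
	unsigned char type;
};

struct msg {
	union {
		struct hdr header;
		/* Roughly what DECLARE_FLEX_ARRAY(u8, data) expands to:
		 * the empty struct makes the flexible array a legal sole
		 * member of a struct inside the union. */
		struct {
			struct { } __empty_data;
			unsigned char data[];
		};
	};
};

int main(void)
{
	unsigned char wire[4 + 32];	/* header + payload, as received */
	struct msg *m = malloc(sizeof(wire));

	if (!m)
		return 1;
	memset(wire, 0xab, sizeof(wire));
	/* The copy targets the flex array, which aliases the header, so a
	 * fortified memcpy() sees an unbounded destination rather than a
	 * field-spanning write into a 4-byte header. */
	memcpy(m->data, wire, sizeof(wire));
	printf("type 0x%x\n", m->header.type);
	free(m);
	return 0;
}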



[PATCH] powerpc/powermac: Fix unused function warning

2023-08-21 Thread Michael Ellerman
Clang reports:
  arch/powerpc/platforms/powermac/feature.c:137:19: error: unused function 
'simple_feature_tweak'

It's only used inside the #ifndef CONFIG_PPC64 block, so move it in
there to fix the warning. While at it drop the inline, the compiler will
decide whether it should be inlined or not.

Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202308181501.ar5hmdwc-...@intel.com/
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/platforms/powermac/feature.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powermac/feature.c 
b/arch/powerpc/platforms/powermac/feature.c
index ed472b797e28..ae62d432db8b 100644
--- a/arch/powerpc/platforms/powermac/feature.c
+++ b/arch/powerpc/platforms/powermac/feature.c
@@ -134,8 +134,10 @@ static struct pmac_mb_def pmac_mb;
  * Here are the chip specific feature functions
  */
 
-static inline int simple_feature_tweak(struct device_node *node, int type,
-  int reg, u32 mask, int value)
+#ifndef CONFIG_PPC64
+
+static int simple_feature_tweak(struct device_node *node, int type, int reg,
+   u32 mask, int value)
 {
struct macio_chip*  macio;
unsigned long   flags;
@@ -154,8 +156,6 @@ static inline int simple_feature_tweak(struct device_node 
*node, int type,
return 0;
 }
 
-#ifndef CONFIG_PPC64
-
 static long ohare_htw_scc_enable(struct device_node *node, long param,
 long value)
 {
-- 
2.41.0
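
For context, the warning class is easy to reproduce: a static helper whose
only caller sits behind an #ifdef becomes unused in the other
configuration, and -Wunused-function fires. A minimal illustration (mine,
not from the patch; CONFIG_PPC64 here is just a stand-in macro):

/* demo.c; build with:
 *   cc -Wall -c demo.c                  (no warning: helper is used)
 *   cc -Wall -DCONFIG_PPC64 -c demo.c   (warning: unused function 'helper')
 */
static int helper(int x)
{
	return x * 2;
}

int entry(int x)
{
#ifndef CONFIG_PPC64
	x = helper(x);	/* the only call site, compiled out under PPC64 */
#endif
	return x;
}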



[PATCH rfc v2 02/10] arm64: mm: use try_vma_locked_page_fault()

2023-08-21 Thread Kefeng Wang
Use the new try_vma_locked_page_fault() helper to simplify the code, and
pass struct vm_fault to __do_page_fault() directly instead of each
variable independently. No functional change intended.

Signed-off-by: Kefeng Wang 
---
 arch/arm64/mm/fault.c | 60 ---
 1 file changed, 22 insertions(+), 38 deletions(-)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 2e5d1e238af9..2b7a1e610b3e 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -498,9 +498,8 @@ static void do_bad_area(unsigned long far, unsigned long 
esr,
 #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x02)
 
 static vm_fault_t __do_page_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long 
addr,
- unsigned int mm_flags, unsigned long vm_flags,
- struct pt_regs *regs)
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
 {
/*
 * Ok, we have a good vm_area for this memory access, so we can handle
@@ -508,9 +507,9 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm,
 * Check that the permissions on the VMA allow for the fault which
 * occurred.
 */
-   if (!(vma->vm_flags & vm_flags))
+   if (!(vma->vm_flags & vmf->vm_flags))
return VM_FAULT_BADACCESS;
-   return handle_mm_fault(vma, addr, mm_flags, regs);
+   return handle_mm_fault(vma, vmf->real_address, vmf->flags, vmf->regs);
 }
 
 static bool is_el0_instruction_abort(unsigned long esr)
@@ -533,10 +532,12 @@ static int __kprobes do_page_fault(unsigned long far, 
unsigned long esr,
const struct fault_info *inf;
struct mm_struct *mm = current->mm;
vm_fault_t fault;
-   unsigned long vm_flags;
-   unsigned int mm_flags = FAULT_FLAG_DEFAULT;
unsigned long addr = untagged_addr(far);
struct vm_area_struct *vma;
+   struct vm_fault vmf = {
+   .real_address = addr,
+   .flags = FAULT_FLAG_DEFAULT,
+   };
 
if (kprobe_page_fault(regs, esr))
return 0;
@@ -549,7 +550,7 @@ static int __kprobes do_page_fault(unsigned long far, 
unsigned long esr,
goto no_context;
 
if (user_mode(regs))
-   mm_flags |= FAULT_FLAG_USER;
+   vmf.flags |= FAULT_FLAG_USER;
 
/*
 * vm_flags tells us what bits we must have in vma->vm_flags
@@ -559,20 +560,20 @@ static int __kprobes do_page_fault(unsigned long far, 
unsigned long esr,
 */
if (is_el0_instruction_abort(esr)) {
/* It was exec fault */
-   vm_flags = VM_EXEC;
-   mm_flags |= FAULT_FLAG_INSTRUCTION;
+   vmf.vm_flags = VM_EXEC;
+   vmf.flags |= FAULT_FLAG_INSTRUCTION;
} else if (is_write_abort(esr)) {
/* It was write fault */
-   vm_flags = VM_WRITE;
-   mm_flags |= FAULT_FLAG_WRITE;
+   vmf.vm_flags = VM_WRITE;
+   vmf.flags |= FAULT_FLAG_WRITE;
} else {
/* It was read fault */
-   vm_flags = VM_READ;
+   vmf.vm_flags = VM_READ;
/* Write implies read */
-   vm_flags |= VM_WRITE;
+   vmf.vm_flags |= VM_WRITE;
/* If EPAN is absent then exec implies read */
if (!cpus_have_const_cap(ARM64_HAS_EPAN))
-   vm_flags |= VM_EXEC;
+   vmf.vm_flags |= VM_EXEC;
}
 
if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
@@ -587,26 +588,11 @@ static int __kprobes do_page_fault(unsigned long far, 
unsigned long esr,
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 
-   if (!(mm_flags & FAULT_FLAG_USER))
-   goto lock_mmap;
-
-   vma = lock_vma_under_rcu(mm, addr);
-   if (!vma)
-   goto lock_mmap;
-
-   if (!(vma->vm_flags & vm_flags)) {
-   vma_end_read(vma);
-   goto lock_mmap;
-   }
-   fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, 
regs);
-   if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
-   vma_end_read(vma);
-
-   if (!(fault & VM_FAULT_RETRY)) {
-   count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+   fault = try_vma_locked_page_fault(&vmf);
+   if (fault == VM_FAULT_NONE)
+   goto retry;
+   if (!(fault & VM_FAULT_RETRY))
goto done;
-   }
-   count_vm_vma_lock_event(VMA_LOCK_RETRY);
 
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
@@ -614,8 +600,6 @@ static int __kprobes do_page_fault(unsigned long far, 
unsigned long esr,
goto no_context;
return 0;
}
-lock_mmap:
-
 retry:
   

[PATCH rfc v2 04/10] s390: mm: use try_vma_locked_page_fault()

2023-08-21 Thread Kefeng Wang
Use the new try_vma_locked_page_fault() helper to simplify the code.
No functional change intended.

Signed-off-by: Kefeng Wang 
---
 arch/s390/mm/fault.c | 66 ++--
 1 file changed, 27 insertions(+), 39 deletions(-)

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 099c4824dd8a..fbbdebde6ea7 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -357,16 +357,18 @@ static noinline void do_fault_error(struct pt_regs *regs, 
vm_fault_t fault)
 static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 {
struct gmap *gmap;
-   struct task_struct *tsk;
-   struct mm_struct *mm;
struct vm_area_struct *vma;
enum fault_type type;
-   unsigned long address;
-   unsigned int flags;
+   struct mm_struct *mm = current->mm;
+   unsigned long address = get_fault_address(regs);
vm_fault_t fault;
bool is_write;
+   struct vm_fault vmf = {
+   .real_address = address,
+   .flags = FAULT_FLAG_DEFAULT,
+   .vm_flags = access,
+   };
 
-   tsk = current;
/*
 * The instruction that caused the program check has
 * been nullified. Don't signal single step via SIGTRAP.
@@ -376,8 +378,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, 
int access)
if (kprobe_page_fault(regs, 14))
return 0;
 
-   mm = tsk->mm;
-   address = get_fault_address(regs);
is_write = fault_is_write(regs);
 
/*
@@ -398,45 +398,33 @@ static inline vm_fault_t do_exception(struct pt_regs 
*regs, int access)
}
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-   flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
-   flags |= FAULT_FLAG_USER;
+   vmf.flags |= FAULT_FLAG_USER;
if (is_write)
-   access = VM_WRITE;
-   if (access == VM_WRITE)
-   flags |= FAULT_FLAG_WRITE;
-   if (!(flags & FAULT_FLAG_USER))
-   goto lock_mmap;
-   vma = lock_vma_under_rcu(mm, address);
-   if (!vma)
-   goto lock_mmap;
-   if (!(vma->vm_flags & access)) {
-   vma_end_read(vma);
-   goto lock_mmap;
-   }
-   fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, 
regs);
-   if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
-   vma_end_read(vma);
-   if (!(fault & VM_FAULT_RETRY)) {
-   count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
-   if (likely(!(fault & VM_FAULT_ERROR)))
-   fault = 0;
+   vmf.vm_flags = VM_WRITE;
+   if (vmf.vm_flags == VM_WRITE)
+   vmf.flags |= FAULT_FLAG_WRITE;
+
+   fault = try_vma_locked_page_fault(&vmf);
+   if (fault == VM_FAULT_NONE)
+   goto lock_mm;
+   if (!(fault & VM_FAULT_RETRY))
goto out;
-   }
-   count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
fault = VM_FAULT_SIGNAL;
goto out;
}
-lock_mmap:
+
+lock_mm:
mmap_read_lock(mm);
 
gmap = NULL;
if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
gmap = (struct gmap *) S390_lowcore.gmap;
current->thread.gmap_addr = address;
-   current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+   current->thread.gmap_write_flag = !!(vmf.flags & 
FAULT_FLAG_WRITE);
current->thread.gmap_int_code = regs->int_code & 0x;
address = __gmap_translate(gmap, address);
if (address == -EFAULT) {
@@ -444,7 +432,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, 
int access)
goto out_up;
}
if (gmap->pfault_enabled)
-   flags |= FAULT_FLAG_RETRY_NOWAIT;
+   vmf.flags |= FAULT_FLAG_RETRY_NOWAIT;
}
 
 retry:
@@ -466,7 +454,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, 
int access)
 * we can handle it..
 */
fault = VM_FAULT_BADACCESS;
-   if (unlikely(!(vma->vm_flags & access)))
+   if (unlikely(!(vma->vm_flags & vmf.vm_flags)))
goto out_up;
 
/*
@@ -474,10 +462,10 @@ static inline vm_fault_t do_exception(struct pt_regs 
*regs, int access)
 * make sure we exit gracefully rather than endlessly redo
 * the fault.
 */
-   fault = handle_mm_fault(vma, address, flags, regs);
+   fault = handle_mm_fault(vma, address, vmf.flags, regs);
if (fault_signal_pending(fault, regs)) {
fault = VM_FAULT_SIGNAL;
-   if (flags & FAULT_FLAG_RETRY_NOWAIT)
+   if (vmf.flags & FAULT_FLAG_RETRY_NOWAIT)
goto out_up;
 

[PATCH rfc v2 03/10] x86: mm: use try_vma_locked_page_fault()

2023-08-21 Thread Kefeng Wang
Use the new try_vma_locked_page_fault() helper to simplify the code.
No functional change intended.

Signed-off-by: Kefeng Wang 
---
 arch/x86/mm/fault.c | 55 +++--
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ab778eac1952..3edc9edc0b28 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1227,6 +1227,13 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long 
hw_error_code,
 }
 NOKPROBE_SYMBOL(do_kern_addr_fault);
 
+#ifdef CONFIG_PER_VMA_LOCK
+bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+   return access_error(vmf->fault_code, vma);
+}
+#endif
+
 /*
  * Handle faults in the user portion of the address space.  Nothing in here
  * should check X86_PF_USER without a specific justification: for almost
@@ -1241,13 +1248,13 @@ void do_user_addr_fault(struct pt_regs *regs,
unsigned long address)
 {
struct vm_area_struct *vma;
-   struct task_struct *tsk;
-   struct mm_struct *mm;
+   struct mm_struct *mm = current->mm;
vm_fault_t fault;
-   unsigned int flags = FAULT_FLAG_DEFAULT;
-
-   tsk = current;
-   mm = tsk->mm;
+   struct vm_fault vmf = {
+   .real_address = address,
+   .fault_code = error_code,
+   .flags = FAULT_FLAG_DEFAULT
+   };
 
if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == 
X86_PF_INSTR)) {
/*
@@ -1311,7 +1318,7 @@ void do_user_addr_fault(struct pt_regs *regs,
 */
if (user_mode(regs)) {
local_irq_enable();
-   flags |= FAULT_FLAG_USER;
+   vmf.flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
@@ -1326,11 +1333,11 @@ void do_user_addr_fault(struct pt_regs *regs,
 * maybe_mkwrite() can create a proper shadow stack PTE.
 */
if (error_code & X86_PF_SHSTK)
-   flags |= FAULT_FLAG_WRITE;
+   vmf.flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_WRITE)
-   flags |= FAULT_FLAG_WRITE;
+   vmf.flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
-   flags |= FAULT_FLAG_INSTRUCTION;
+   vmf.flags |= FAULT_FLAG_INSTRUCTION;
 
 #ifdef CONFIG_X86_64
/*
@@ -1350,26 +1357,11 @@ void do_user_addr_fault(struct pt_regs *regs,
}
 #endif
 
-   if (!(flags & FAULT_FLAG_USER))
-   goto lock_mmap;
-
-   vma = lock_vma_under_rcu(mm, address);
-   if (!vma)
-   goto lock_mmap;
-
-   if (unlikely(access_error(error_code, vma))) {
-   vma_end_read(vma);
-   goto lock_mmap;
-   }
-   fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, 
regs);
-   if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
-   vma_end_read(vma);
-
-   if (!(fault & VM_FAULT_RETRY)) {
-   count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+   fault = try_vma_locked_page_fault(&vmf);
+   if (fault == VM_FAULT_NONE)
+   goto retry;
+   if (!(fault & VM_FAULT_RETRY))
goto done;
-   }
-   count_vm_vma_lock_event(VMA_LOCK_RETRY);
 
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
@@ -1379,7 +1371,6 @@ void do_user_addr_fault(struct pt_regs *regs,
 ARCH_DEFAULT_PKEY);
return;
}
-lock_mmap:
 
 retry:
vma = lock_mm_and_find_vma(mm, address, regs);
@@ -1410,7 +1401,7 @@ void do_user_addr_fault(struct pt_regs *regs,
 * userland). The return to userland is identified whenever
 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
 */
-   fault = handle_mm_fault(vma, address, flags, regs);
+   fault = handle_mm_fault(vma, address, vmf.flags, regs);
 
if (fault_signal_pending(fault, regs)) {
/*
@@ -1434,7 +1425,7 @@ void do_user_addr_fault(struct pt_regs *regs,
 * that we made any progress. Handle this case first.
 */
if (unlikely(fault & VM_FAULT_RETRY)) {
-   flags |= FAULT_FLAG_TRIED;
+   vmf.flags |= FAULT_FLAG_TRIED;
goto retry;
}
 
-- 
2.27.0



[PATCH rfc v2 07/10] ARM: mm: try VMA lock-based page fault handling first

2023-08-21 Thread Kefeng Wang
Attempt VMA lock-based page fault handling first, and fall back
to the existing mmap_lock-based handling if that fails.

Signed-off-by: Kefeng Wang 
---
 arch/arm/Kconfig|  1 +
 arch/arm/mm/fault.c | 35 +--
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1a6a6eb48a15..8b6d4507ccee 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -34,6 +34,7 @@ config ARM
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE
+   select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_MEMTEST
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index fef62e4a9edd..d53bb028899a 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -242,8 +242,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct 
pt_regs *regs)
struct vm_area_struct *vma;
int sig, code;
vm_fault_t fault;
-   unsigned int flags = FAULT_FLAG_DEFAULT;
-   unsigned long vm_flags = VM_ACCESS_FLAGS;
+   struct vm_fault vmf = {
+   .real_address = addr,
+   .flags = FAULT_FLAG_DEFAULT,
+   .vm_flags = VM_ACCESS_FLAGS,
+   };
 
if (kprobe_page_fault(regs, fsr))
return 0;
@@ -261,15 +264,15 @@ do_page_fault(unsigned long addr, unsigned int fsr, 
struct pt_regs *regs)
goto no_context;
 
if (user_mode(regs))
-   flags |= FAULT_FLAG_USER;
+   vmf.flags |= FAULT_FLAG_USER;
 
if (is_write_fault(fsr)) {
-   flags |= FAULT_FLAG_WRITE;
-   vm_flags = VM_WRITE;
+   vmf.flags |= FAULT_FLAG_WRITE;
+   vmf.vm_flags = VM_WRITE;
}
 
if (fsr & FSR_LNX_PF) {
-   vm_flags = VM_EXEC;
+   vmf.vm_flags = VM_EXEC;
 
if (is_permission_fault(fsr) && !user_mode(regs))
die_kernel_fault("execution of memory",
@@ -278,6 +281,18 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct 
pt_regs *regs)
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 
+   fault = try_vma_locked_page_fault(&vmf);
+   if (fault == VM_FAULT_NONE)
+   goto retry;
+   if (!(fault & VM_FAULT_RETRY))
+   goto done;
+
+   if (fault_signal_pending(fault, regs)) {
+   if (!user_mode(regs))
+   goto no_context;
+   return 0;
+   }
+
 retry:
vma = lock_mm_and_find_vma(mm, addr, regs);
if (unlikely(!vma)) {
@@ -289,10 +304,10 @@ do_page_fault(unsigned long addr, unsigned int fsr, 
struct pt_regs *regs)
 * ok, we have a good vm_area for this memory access, check the
 * permissions on the VMA allow for the fault which occurred.
 */
-   if (!(vma->vm_flags & vm_flags))
+   if (!(vma->vm_flags & vmf.vm_flags))
fault = VM_FAULT_BADACCESS;
else
-   fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+   fault = handle_mm_fault(vma, addr & PAGE_MASK, vmf.flags, regs);
 
/* If we need to retry but a fatal signal is pending, handle the
 * signal first. We do not need to release the mmap_lock because
@@ -310,13 +325,13 @@ do_page_fault(unsigned long addr, unsigned int fsr, 
struct pt_regs *regs)
 
if (!(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_RETRY) {
-   flags |= FAULT_FLAG_TRIED;
+   vmf.flags |= FAULT_FLAG_TRIED;
goto retry;
}
}
 
mmap_read_unlock(mm);
-
+done:
/*
 * Handle the "normal" case first - VM_FAULT_MAJOR
 */
-- 
2.27.0



[PATCH rfc v2 08/10] loongarch: mm: cleanup __do_page_fault()

2023-08-21 Thread Kefeng Wang
Clean up __do_page_fault() by reusing the bad_area_nosemaphore and
bad_area labels.

Signed-off-by: Kefeng Wang 
---
 arch/loongarch/mm/fault.c | 48 +--
 1 file changed, 16 insertions(+), 32 deletions(-)

diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c
index e6376e3dce86..5d4c742c4bc5 100644
--- a/arch/loongarch/mm/fault.c
+++ b/arch/loongarch/mm/fault.c
@@ -157,18 +157,15 @@ static void __kprobes __do_page_fault(struct pt_regs 
*regs,
if (!user_mode(regs))
no_context(regs, write, address);
else
-   do_sigsegv(regs, write, address, si_code);
-   return;
+   goto bad_area_nosemaphore;
}
 
/*
 * If we're in an interrupt or have no user
 * context, we must not take the fault..
 */
-   if (faulthandler_disabled() || !mm) {
-   do_sigsegv(regs, write, address, si_code);
-   return;
-   }
+   if (faulthandler_disabled() || !mm)
+   goto bad_area_nosemaphore;
 
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
@@ -178,23 +175,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
vma = lock_mm_and_find_vma(mm, address, regs);
if (unlikely(!vma))
goto bad_area_nosemaphore;
-   goto good_area;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-   mmap_read_unlock(mm);
-bad_area_nosemaphore:
-   do_sigsegv(regs, write, address, si_code);
-   return;
 
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
si_code = SEGV_ACCERR;
 
if (write) {
@@ -235,22 +216,25 @@ static void __kprobes __do_page_fault(struct pt_regs 
*regs,
 */
goto retry;
}
+
+   mmap_read_unlock(mm);
+
if (unlikely(fault & VM_FAULT_ERROR)) {
-   mmap_read_unlock(mm);
-   if (fault & VM_FAULT_OOM) {
+   if (fault & VM_FAULT_OOM)
do_out_of_memory(regs, write, address);
-   return;
-   } else if (fault & VM_FAULT_SIGSEGV) {
-   do_sigsegv(regs, write, address, si_code);
-   return;
-   } else if (fault & 
(VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+   else if (fault & VM_FAULT_SIGSEGV)
+   goto bad_area_nosemaphore;
+   else if (fault & 
(VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE))
do_sigbus(regs, write, address, si_code);
-   return;
-   }
-   BUG();
+   else
+   BUG();
}
 
+   return;
+bad_area:
mmap_read_unlock(mm);
+bad_area_nosemaphore:
+   do_sigsegv(regs, write, address, si_code);
 }
 
 asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
-- 
2.27.0



[PATCH rfc v2 09/10] loongarch: mm: add access_error() helper

2023-08-21 Thread Kefeng Wang
Add an access_error() helper to check whether a vma is accessible,
which will be used in __do_page_fault() and later in the VMA
lock-based page fault handling.

Signed-off-by: Kefeng Wang 
---
 arch/loongarch/mm/fault.c | 30 --
 1 file changed, 20 insertions(+), 10 deletions(-)
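
The helper takes the fault flags rather than the raw "write" argument,
so FAULT_FLAG_WRITE must already be folded into flags before the check
runs; that is why the hunk below moves the "if (write)" test up. A
minimal sketch of the intended call site (not the exact hunk):

	if (write)
		flags |= FAULT_FLAG_WRITE;	/* must precede access_error() */

	if (access_error(flags, regs, address, vma))
		goto bad_area;			/* reported as SEGV_ACCERR */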

diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c
index 5d4c742c4bc5..2a45e9f3a485 100644
--- a/arch/loongarch/mm/fault.c
+++ b/arch/loongarch/mm/fault.c
@@ -126,6 +126,22 @@ static void __kprobes do_sigsegv(struct pt_regs *regs,
force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 }
 
+static inline bool access_error(unsigned int flags, struct pt_regs *regs,
+   unsigned long addr, struct vm_area_struct *vma)
+{
+   if (flags & FAULT_FLAG_WRITE) {
+   if (!(vma->vm_flags & VM_WRITE))
+   return true;
+   } else {
+   if (!(vma->vm_flags & VM_READ) && addr != exception_era(regs))
+   return true;
+   if (!(vma->vm_flags & VM_EXEC) && addr == exception_era(regs))
+   return true;
+   }
+
+   return false;
+}
+
 /*
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
@@ -169,6 +185,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
 
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
+   if (write)
+   flags |= FAULT_FLAG_WRITE;
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 retry:
@@ -178,16 +196,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
 
si_code = SEGV_ACCERR;
 
-   if (write) {
-   flags |= FAULT_FLAG_WRITE;
-   if (!(vma->vm_flags & VM_WRITE))
-   goto bad_area;
-   } else {
-   if (!(vma->vm_flags & VM_READ) && address != exception_era(regs))
-   goto bad_area;
-   if (!(vma->vm_flags & VM_EXEC) && address == exception_era(regs))
-   goto bad_area;
-   }
+   if (access_error(flags, regs, address, vma))
+   goto bad_area;
 
/*
 * If for any reason at all we couldn't handle the fault,
-- 
2.27.0



[PATCH rfc -next v2 00/10] mm: convert to generic VMA lock-based page fault

2023-08-21 Thread Kefeng Wang
Add a generic VMA lock-based page fault handler in the mm core and
convert architectures to use it, which eliminates the architectures'
duplicated code.

With it, we can avoid touching every architecture's code for each new
feature or bugfix; in the end, the feature is also enabled on ARM32
and LoongArch.

This is based on next-20230817, build tested only.
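
For orientation, the arch-side calling convention the series converges
on looks roughly like this (a minimal sketch distilled from the
per-arch patches; error paths and arch details omitted):

	struct vm_fault vmf = {
		.real_address = address,
		.regs = regs,
		.flags = FAULT_FLAG_DEFAULT,
	};

	fault = try_vma_locked_page_fault(&vmf);
	if (fault == VM_FAULT_NONE)
		goto retry;	/* VMA lock not attempted, take the mmap_lock path */
	if (!(fault & VM_FAULT_RETRY))
		goto done;	/* handled (or failed) under the VMA lock */
	/* otherwise check for a pending fatal signal, then retry with mmap_lock */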

v2:
- convert "int arch_vma_check_access()" to "bool arch_vma_access_error()";
  still use a __weak function for arch_vma_access_error(), which avoids
  having to declare access_error() in each architecture's
  (x86/powerpc/riscv/loongarch) header file.
- re-use struct vm_fault instead of adding a new struct vm_locked_fault,
  per Matthew Wilcox; add the necessary pt_regs/fault error code/vm flags
  to vm_fault since they could be used in arch_vma_access_error()
- add a special VM_FAULT_NONE and make try_vma_locked_page_fault()
  return vm_fault_t

Kefeng Wang (10):
  mm: add a generic VMA lock-based page fault handler
  arm64: mm: use try_vma_locked_page_fault()
  x86: mm: use try_vma_locked_page_fault()
  s390: mm: use try_vma_locked_page_fault()
  powerpc: mm: use try_vma_locked_page_fault()
  riscv: mm: use try_vma_locked_page_fault()
  ARM: mm: try VMA lock-based page fault handling first
  loongarch: mm: cleanup __do_page_fault()
  loongarch: mm: add access_error() helper
  loongarch: mm: try VMA lock-based page fault handling first

 arch/arm/Kconfig  |   1 +
 arch/arm/mm/fault.c   |  35 
 arch/arm64/mm/fault.c |  60 -
 arch/loongarch/Kconfig|   1 +
 arch/loongarch/mm/fault.c | 111 ++
 arch/powerpc/mm/fault.c   |  66 +++
 arch/riscv/mm/fault.c |  58 +---
 arch/s390/mm/fault.c  |  66 ++-
 arch/x86/mm/fault.c   |  55 ---
 include/linux/mm.h|  17 ++
 include/linux/mm_types.h  |   2 +
 mm/memory.c   |  39 ++
 12 files changed, 278 insertions(+), 233 deletions(-)

-- 
2.27.0



[PATCH rfc v2 05/10] powerpc: mm: use try_vma_locked_page_fault()

2023-08-21 Thread Kefeng Wang
Use new try_vma_locked_page_fault() helper to simplify code.
No functional change intended.

Signed-off-by: Kefeng Wang 
---
 arch/powerpc/mm/fault.c | 66 -
 1 file changed, 32 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b1723094d464..52f9546e020e 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -391,6 +391,22 @@ static int page_fault_is_bad(unsigned long err)
 #define page_fault_is_bad(__err)   ((__err) & DSISR_BAD_FAULT_32S)
 #endif
 
+#ifdef CONFIG_PER_VMA_LOCK
+bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+   int is_exec = TRAP(vmf->regs) == INTERRUPT_INST_STORAGE;
+   int is_write = page_fault_is_write(vmf->fault_code);
+
+   if (unlikely(access_pkey_error(is_write, is_exec,
+   (vmf->fault_code & DSISR_KEYFAULT), vma)))
+   return true;
+
+   if (unlikely(access_error(is_write, is_exec, vma)))
+   return true;
+   return false;
+}
+#endif
+
 /*
  * For 600- and 800-family processors, the error_code parameter is DSISR
  * for a data fault, SRR1 for an instruction fault.
@@ -407,12 +423,18 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
 {
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
-   unsigned int flags = FAULT_FLAG_DEFAULT;
int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE;
int is_user = user_mode(regs);
int is_write = page_fault_is_write(error_code);
vm_fault_t fault, major = 0;
bool kprobe_fault = kprobe_page_fault(regs, 11);
+   struct vm_fault vmf = {
+   .real_address = address,
+   .fault_code = error_code,
+   .regs = regs,
+   .flags = FAULT_FLAG_DEFAULT,
+   };
+
 
if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
return 0;
@@ -463,45 +485,21 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
 * mmap_lock held
 */
if (is_user)
-   flags |= FAULT_FLAG_USER;
+   vmf.flags |= FAULT_FLAG_USER;
if (is_write)
-   flags |= FAULT_FLAG_WRITE;
+   vmf.flags |= FAULT_FLAG_WRITE;
if (is_exec)
-   flags |= FAULT_FLAG_INSTRUCTION;
+   vmf.flags |= FAULT_FLAG_INSTRUCTION;
 
-   if (!(flags & FAULT_FLAG_USER))
-   goto lock_mmap;
-
-   vma = lock_vma_under_rcu(mm, address);
-   if (!vma)
-   goto lock_mmap;
-
-   if (unlikely(access_pkey_error(is_write, is_exec,
-  (error_code & DSISR_KEYFAULT), vma))) {
-   vma_end_read(vma);
-   goto lock_mmap;
-   }
-
-   if (unlikely(access_error(is_write, is_exec, vma))) {
-   vma_end_read(vma);
-   goto lock_mmap;
-   }
-
-   fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
-   if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
-   vma_end_read(vma);
-
-   if (!(fault & VM_FAULT_RETRY)) {
-   count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+   fault = try_vma_locked_page_fault(&vmf);
+   if (fault == VM_FAULT_NONE)
+   goto retry;
+   if (!(fault & VM_FAULT_RETRY))
goto done;
-   }
-   count_vm_vma_lock_event(VMA_LOCK_RETRY);
 
if (fault_signal_pending(fault, regs))
return user_mode(regs) ? 0 : SIGBUS;
 
-lock_mmap:
-
/* When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in the
 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -528,7 +526,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
 * make sure we exit gracefully rather than endlessly redo
 * the fault.
 */
-   fault = handle_mm_fault(vma, address, flags, regs);
+   fault = handle_mm_fault(vma, address, vmf.flags, regs);
 
major |= fault & VM_FAULT_MAJOR;
 
@@ -544,7 +542,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
 * case.
 */
if (unlikely(fault & VM_FAULT_RETRY)) {
-   flags |= FAULT_FLAG_TRIED;
+   vmf.flags |= FAULT_FLAG_TRIED;
goto retry;
}
 
-- 
2.27.0



[PATCH rfc v2 06/10] riscv: mm: use try_vma_locked_page_fault()

2023-08-21 Thread Kefeng Wang
Use new try_vma_locked_page_fault() helper to simplify code.
No functional change intended.

Signed-off-by: Kefeng Wang 
---
 arch/riscv/mm/fault.c | 58 ++-
 1 file changed, 24 insertions(+), 34 deletions(-)

diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 6115d7514972..b46129b636f2 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -215,6 +215,13 @@ static inline bool access_error(unsigned long cause, struct vm_area_struct *vma)
return false;
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+   return access_error(vmf->fault_code, vma);
+}
+#endif
+
 /*
  * This routine handles page faults.  It determines the address and the
  * problem, and then passes it off to one of the appropriate routines.
@@ -223,17 +230,16 @@ void handle_page_fault(struct pt_regs *regs)
 {
struct task_struct *tsk;
struct vm_area_struct *vma;
-   struct mm_struct *mm;
-   unsigned long addr, cause;
-   unsigned int flags = FAULT_FLAG_DEFAULT;
+   struct mm_struct *mm = current->mm;
+   unsigned long addr = regs->badaddr;
+   unsigned long cause = regs->cause;
int code = SEGV_MAPERR;
vm_fault_t fault;
-
-   cause = regs->cause;
-   addr = regs->badaddr;
-
-   tsk = current;
-   mm = tsk->mm;
+   struct vm_fault vmf = {
+   .real_address = addr,
+   .fault_code = cause,
+   .flags = FAULT_FLAG_DEFAULT,
+   };
 
if (kprobe_page_fault(regs, cause))
return;
@@ -268,7 +274,7 @@ void handle_page_fault(struct pt_regs *regs)
}
 
if (user_mode(regs))
-   flags |= FAULT_FLAG_USER;
+   vmf.flags |= FAULT_FLAG_USER;
 
if (!user_mode(regs) && addr < TASK_SIZE && unlikely(!(regs->status & SR_SUM))) {
if (fixup_exception(regs))
@@ -280,37 +286,21 @@ void handle_page_fault(struct pt_regs *regs)
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 
if (cause == EXC_STORE_PAGE_FAULT)
-   flags |= FAULT_FLAG_WRITE;
+   vmf.flags |= FAULT_FLAG_WRITE;
else if (cause == EXC_INST_PAGE_FAULT)
-   flags |= FAULT_FLAG_INSTRUCTION;
-   if (!(flags & FAULT_FLAG_USER))
-   goto lock_mmap;
-
-   vma = lock_vma_under_rcu(mm, addr);
-   if (!vma)
-   goto lock_mmap;
+   vmf.flags |= FAULT_FLAG_INSTRUCTION;
 
-   if (unlikely(access_error(cause, vma))) {
-   vma_end_read(vma);
-   goto lock_mmap;
-   }
-
-   fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
-   if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
-   vma_end_read(vma);
-
-   if (!(fault & VM_FAULT_RETRY)) {
-   count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+   fault = try_vma_locked_page_fault(&vmf);
+   if (fault == VM_FAULT_NONE)
+   goto retry;
+   if (!(fault & VM_FAULT_RETRY))
goto done;
-   }
-   count_vm_vma_lock_event(VMA_LOCK_RETRY);
 
if (fault_signal_pending(fault, regs)) {
if (!user_mode(regs))
no_context(regs, addr);
return;
}
-lock_mmap:
 
 retry:
vma = lock_mm_and_find_vma(mm, addr, regs);
@@ -337,7 +327,7 @@ void handle_page_fault(struct pt_regs *regs)
 * make sure we exit gracefully rather than endlessly redo
 * the fault.
 */
-   fault = handle_mm_fault(vma, addr, flags, regs);
+   fault = handle_mm_fault(vma, addr, vmf.flags, regs);
 
/*
 * If we need to retry but a fatal signal is pending, handle the
@@ -355,7 +345,7 @@ void handle_page_fault(struct pt_regs *regs)
return;
 
if (unlikely(fault & VM_FAULT_RETRY)) {
-   flags |= FAULT_FLAG_TRIED;
+   vmf.flags |= FAULT_FLAG_TRIED;
 
/*
 * No need to mmap_read_unlock(mm) as we would
-- 
2.27.0



[PATCH rfc v2 01/10] mm: add a generic VMA lock-based page fault handler

2023-08-21 Thread Kefeng Wang
ARCH_SUPPORTS_PER_VMA_LOCK is enabled by more and more architectures,
e.g. x86, arm64, powerpc, s390 and riscv. Those implementations are very
similar, which results in some duplicated code. Let's add a generic VMA
lock-based page fault handler, try_vma_locked_page_fault(), to eliminate
the duplication, which also makes it easy to support this on new
architectures.

Since different architectures use different ways to check whether a vma
is accessible, the struct pt_regs, the page fault error code and the vma
flags are added into struct vm_fault; the architecture's page fault code
can then re-use struct vm_fault to record and check vma accessibility
with its own implementation.

Signed-off-by: Kefeng Wang 
---
 include/linux/mm.h   | 17 +
 include/linux/mm_types.h |  2 ++
 mm/memory.c  | 39 +++
 3 files changed, 58 insertions(+)
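
The __weak default added below pairs with the new vmf->vm_flags field:
an architecture with no special access hook (ARM in this series, for
instance) fills in the vm_flags the access requires and lets the
default do the mask test. A hedged sketch of that caller side, assuming
a write fault:

	struct vm_fault vmf = {
		.real_address = addr,
		.regs = regs,
		.flags = FAULT_FLAG_DEFAULT | FAULT_FLAG_WRITE,
		.vm_flags = VM_WRITE,	/* what this access requires */
	};
	/* The weak default then rejects the VMA when none of the required
	 * flags are set: (vma->vm_flags & vmf->vm_flags) == 0. */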

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3f764e84e567..22a6f4c56ff3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -512,9 +512,12 @@ struct vm_fault {
pgoff_t pgoff;  /* Logical page offset based on vma */
unsigned long address;  /* Faulting virtual address - masked */
unsigned long real_address; /* Faulting virtual address - unmasked */
+   unsigned long fault_code;   /* Faulting error code during page fault */
+   struct pt_regs *regs;   /* The registers stored during page fault */
};
enum fault_flag flags;  /* FAULT_FLAG_xxx flags
 * XXX: should really be 'const' */
+   vm_flags_t vm_flags;/* VMA flags to be used for access checking */
pmd_t *pmd; /* Pointer to pmd entry matching
 * the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@@ -774,6 +777,9 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
 struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
  unsigned long address);
 
+bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf);
+vm_fault_t try_vma_locked_page_fault(struct vm_fault *vmf);
+
 #else /* CONFIG_PER_VMA_LOCK */
 
 static inline bool vma_start_read(struct vm_area_struct *vma)
@@ -801,6 +807,17 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
mmap_assert_locked(vmf->vma->vm_mm);
 }
 
+static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+   unsigned long address)
+{
+   return NULL;
+}
+
+static inline vm_fault_t try_vma_locked_page_fault(struct vm_fault *vmf)
+{
+   return VM_FAULT_NONE;
+}
+
 #endif /* CONFIG_PER_VMA_LOCK */
 
 extern const struct vm_operations_struct vma_dummy_vm_ops;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f5ba5b0bc836..702820cea3f9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1119,6 +1119,7 @@ typedef __bitwise unsigned int vm_fault_t;
  * fault. Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
  *
+ * @VM_FAULT_NONE: Special case, not starting to handle fault
  * @VM_FAULT_OOM:  Out Of Memory
  * @VM_FAULT_SIGBUS:   Bad access
  * @VM_FAULT_MAJOR:Page read from storage
@@ -1139,6 +1140,7 @@ typedef __bitwise unsigned int vm_fault_t;
  *
  */
 enum vm_fault_reason {
+   VM_FAULT_NONE   = (__force vm_fault_t)0x00,
VM_FAULT_OOM= (__force vm_fault_t)0x01,
VM_FAULT_SIGBUS = (__force vm_fault_t)0x02,
VM_FAULT_MAJOR  = (__force vm_fault_t)0x04,
diff --git a/mm/memory.c b/mm/memory.c
index 3b4aaa0d2fff..60fe35db5134 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5510,6 +5510,45 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
count_vm_vma_lock_event(VMA_LOCK_ABORT);
return NULL;
 }
+
+#ifdef CONFIG_PER_VMA_LOCK
+bool __weak arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+   return (vma->vm_flags & vmf->vm_flags) == 0;
+}
+#endif
+
+vm_fault_t try_vma_locked_page_fault(struct vm_fault *vmf)
+{
+   vm_fault_t fault = VM_FAULT_NONE;
+   struct vm_area_struct *vma;
+
+   if (!(vmf->flags & FAULT_FLAG_USER))
+   return fault;
+
+   vma = lock_vma_under_rcu(current->mm, vmf->real_address);
+   if (!vma)
+   return fault;
+
+   if (arch_vma_access_error(vma, vmf)) {
+   vma_end_read(vma);
+   return fault;
+   }
+
+   fault = handle_mm_fault(vma, vmf->real_address,
+   vmf->flags | FAULT_FLAG_VMA_LOCK, vmf->regs);
+
+   if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+   vma_end_read(vma);

Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-21 Thread Vladimir Oltean
Hi Sean,

On Fri, Aug 11, 2023 at 07:36:37PM +0300, Vladimir Oltean wrote:
> Let me explain that approach, because your mention of "swapping out the
> bootloaders" makes it appear as if you are not visualising what I am
> proposing.
> 
> The Lynx SerDes family has 2 PLLs, and more lanes (4 or 8). Each lane
> uses one PLL or the other, to derive its protocol frequency. Through the
> RCW, you provision the 2 PLL frequencies that may be used by the lanes
> at runtime.
> 
> The Lynx 28G SerDes driver reads the PLL frequencies in
> lynx_28g_pll_read_configuration(), and determines the interface modes
> supportable by each PLL (this is used by phylink). But it never changes
> those PLL frequencies, since that operation is practically impossible in
> the general sense (PLLs are shared by multiple lanes, so changing a PLL
> frequency disrupts all lanes that use it).

Is my high-level feedback clear and actionable to you? I am suggesting
to keep the look and feel the same between the lynx-10g and lynx-28g
drivers, and to not use "fsl,type" protocols listed in the device tree
as the immutable source of information for populating mode->protos, but
instead the current PLL frequency configuration. So this implies that I
am requesting that the dt-bindings should not contain a listing of the
supported protocols.


[PATCH rfc v2 10/10] loongarch: mm: try VMA lock-based page fault handling first

2023-08-21 Thread Kefeng Wang
Attempt VMA lock-based page fault handling first, and fall back
to the existing mmap_lock-based handling if that fails.

Signed-off-by: Kefeng Wang 
---
 arch/loongarch/Kconfig|  1 +
 arch/loongarch/mm/fault.c | 37 +++--
 2 files changed, 32 insertions(+), 6 deletions(-)
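
In short, the fast path this patch adds (sketched; mirrors the hunks
below):

	/* try_vma_locked_page_fault(&vmf) result handling:
	 *   VM_FAULT_NONE             -> goto retry (mmap_lock slow path)
	 *   !(fault & VM_FAULT_RETRY) -> goto done  (handled under VMA lock)
	 *   fatal signal pending      -> return, no_context() if kernel mode
	 */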

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 2b27b18a63af..6b821f621920 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -56,6 +56,7 @@ config LOONGARCH
select ARCH_SUPPORTS_LTO_CLANG
select ARCH_SUPPORTS_LTO_CLANG_THIN
select ARCH_SUPPORTS_NUMA_BALANCING
+   select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_QUEUED_RWLOCKS
diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c
index 2a45e9f3a485..f7ac3a14bb06 100644
--- a/arch/loongarch/mm/fault.c
+++ b/arch/loongarch/mm/fault.c
@@ -142,6 +142,13 @@ static inline bool access_error(unsigned int flags, struct pt_regs *regs,
return false;
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+   return access_error(vmf->flags, vmf->regs, vmf->real_address, vma);
+}
+#endif
+
 /*
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
@@ -151,11 +158,15 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
unsigned long write, unsigned long address)
 {
int si_code = SEGV_MAPERR;
-   unsigned int flags = FAULT_FLAG_DEFAULT;
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
struct vm_area_struct *vma = NULL;
vm_fault_t fault;
+   struct vm_fault vmf = {
+   .real_address = address,
+   .regs = regs,
+   .flags = FAULT_FLAG_DEFAULT,
+   };
 
if (kprobe_page_fault(regs, current->thread.trap_nr))
return;
@@ -184,11 +195,24 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
goto bad_area_nosemaphore;
 
if (user_mode(regs))
-   flags |= FAULT_FLAG_USER;
+   vmf.flags |= FAULT_FLAG_USER;
if (write)
-   flags |= FAULT_FLAG_WRITE;
+   vmf.flags |= FAULT_FLAG_WRITE;
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+   fault = try_vma_locked_page_fault(&vmf);
+   if (fault == VM_FAULT_NONE)
+   goto retry;
+   if (!(fault & VM_FAULT_RETRY))
+   goto done;
+
+   if (fault_signal_pending(fault, regs)) {
+   if (!user_mode(regs))
+   no_context(regs, write, address);
+   return;
+   }
+
 retry:
vma = lock_mm_and_find_vma(mm, address, regs);
if (unlikely(!vma))
@@ -196,7 +220,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
 
si_code = SEGV_ACCERR;
 
-   if (access_error(flags, regs, address, vma))
+   if (access_error(vmf.flags, regs, address, vma))
goto bad_area;
 
/*
@@ -204,7 +228,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
 * make sure we exit gracefully rather than endlessly redo
 * the fault.
 */
-   fault = handle_mm_fault(vma, address, flags, regs);
+   fault = handle_mm_fault(vma, address, vmf.flags, regs);
 
if (fault_signal_pending(fault, regs)) {
if (!user_mode(regs))
@@ -217,7 +241,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
return;
 
if (unlikely(fault & VM_FAULT_RETRY)) {
-   flags |= FAULT_FLAG_TRIED;
+   vmf.flags |= FAULT_FLAG_TRIED;
 
/*
 * No need to mmap_read_unlock(mm) as we would
@@ -229,6 +253,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
 
mmap_read_unlock(mm);
 
+done:
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
do_out_of_memory(regs, write, address);
-- 
2.27.0



Re: [PATCH v4 23/28] mfd: core: Ensure disabled devices are skipped without aborting

2023-08-21 Thread Lee Jones
On Fri, 18 Aug 2023, Christophe Leroy wrote:

> From: Herve Codina 
> 
> The loop searching for a matching device based on its compatible
> string is aborted when a matching disabled device is found.
> This abort prevents any further devices from being added once one
> disabled device is found.
> 
> Continuing the search for other devices instead of aborting on the
> first disabled one fixes the issue.
> 
> Fixes: 22380b65dc70 ("mfd: mfd-core: Ensure disabled devices are ignored 
> without error")
> Signed-off-by: Herve Codina 
> Reviewed-by: Christophe Leroy 
> Signed-off-by: Christophe Leroy 
> ---
>  drivers/mfd/mfd-core.c | 17 -
>  1 file changed, 12 insertions(+), 5 deletions(-)

This is too big of a change to be added this late in the cycle.

Pushing review until after v6.5 is out.
-- 
Lee Jones [李琼斯]


[powerpc:topic/ppc-kvm 5/9] arch/powerpc/kvm/book3s_hv_nestedv2.c:465:29: error: variable 'amor' set but not used

2023-08-21 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
topic/ppc-kvm
head:   a6a4c847287937c42ad33e053af968ed6ea91e13
commit: 6be7fd9b644945c2e4f9f071b9f8caa57f4e5590 [5/9] KVM: PPC: Add support 
for nestedv2 guests
config: powerpc-ppc64_defconfig 
(https://download.01.org/0day-ci/archive/20230821/202308211729.rxk20sco-...@intel.com/config)
compiler: powerpc64-linux-gcc (GCC) 12.3.0
reproduce: 
(https://download.01.org/0day-ci/archive/20230821/202308211729.rxk20sco-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202308211729.rxk20sco-...@intel.com/

All errors (new ones prefixed by >>):

   arch/powerpc/kvm/book3s_hv_nestedv2.c: In function 
'gs_msg_ops_vcpu_refresh_info':
>> arch/powerpc/kvm/book3s_hv_nestedv2.c:465:29: error: variable 'amor' set but 
>> not used [-Werror=unused-but-set-variable]
 465 | u64 amor;
 | ^~~~
   cc1: all warnings being treated as errors


vim +/amor +465 arch/powerpc/kvm/book3s_hv_nestedv2.c

   361  
   362  static int gs_msg_ops_vcpu_refresh_info(struct kvmppc_gs_msg *gsm,
   363  struct kvmppc_gs_buff *gsb)
   364  {
   365  struct kvmppc_gs_parser gsp = { 0 };
   366  struct kvmhv_nestedv2_io *io;
   367  struct kvmppc_gs_bitmap *valids;
   368  struct kvm_vcpu *vcpu;
   369  struct kvmppc_gs_elem *gse;
   370  vector128 v;
   371  int rc, i;
   372  u16 iden;
   373  
   374  vcpu = gsm->data;
   375  
   376  rc = kvmppc_gse_parse(&gsp, gsb);
   377  if (rc < 0)
   378  return rc;
   379  
   380  io = &vcpu->arch.nestedv2_io;
   381  valids = &io->valids;
   382  
   383  kvmppc_gsp_for_each(&gsp, iden, gse)
   384  {
   385  switch (iden) {
   386  case KVMPPC_GSID_DSCR:
   387  vcpu->arch.dscr = kvmppc_gse_get_u64(gse);
   388  break;
   389  case KVMPPC_GSID_MMCRA:
   390  vcpu->arch.mmcra = kvmppc_gse_get_u64(gse);
   391  break;
   392  case KVMPPC_GSID_HFSCR:
   393  vcpu->arch.hfscr = kvmppc_gse_get_u64(gse);
   394  break;
   395  case KVMPPC_GSID_PURR:
   396  vcpu->arch.purr = kvmppc_gse_get_u64(gse);
   397  break;
   398  case KVMPPC_GSID_SPURR:
   399  vcpu->arch.spurr = kvmppc_gse_get_u64(gse);
   400  break;
   401  case KVMPPC_GSID_AMR:
   402  vcpu->arch.amr = kvmppc_gse_get_u64(gse);
   403  break;
   404  case KVMPPC_GSID_UAMOR:
   405  vcpu->arch.uamor = kvmppc_gse_get_u64(gse);
   406  break;
   407  case KVMPPC_GSID_SIAR:
   408  vcpu->arch.siar = kvmppc_gse_get_u64(gse);
   409  break;
   410  case KVMPPC_GSID_SDAR:
   411  vcpu->arch.sdar = kvmppc_gse_get_u64(gse);
   412  break;
   413  case KVMPPC_GSID_IAMR:
   414  vcpu->arch.iamr = kvmppc_gse_get_u64(gse);
   415  break;
   416  case KVMPPC_GSID_DAWR0:
   417  vcpu->arch.dawr0 = kvmppc_gse_get_u64(gse);
   418  break;
   419  case KVMPPC_GSID_DAWR1:
   420  vcpu->arch.dawr1 = kvmppc_gse_get_u64(gse);
   421  break;
   422  case KVMPPC_GSID_DAWRX0:
   423  vcpu->arch.dawrx0 = kvmppc_gse_get_u32(gse);
   424  break;
   425  case KVMPPC_GSID_DAWRX1:
   426  vcpu->arch.dawrx1 = kvmppc_gse_get_u32(gse);
   427  break;
   428  case KVMPPC_GSID_CIABR:
   429  vcpu->arch.ciabr = kvmppc_gse_get_u64(gse);
   430  break;
   431  case KVMPPC_GSID_WORT:
   432  vcpu->arch.wort = kvmppc_gse_get_u32(gse);
   433  break;
   434  case KVMPPC_GSID_PPR:
   435  vcpu->arch.ppr = kvmppc_gse_get_u64(gse);
   436  break;
   437  case KVMPPC_GSID_PSPB:
   438  vcpu->arch.pspb = kvmppc_g

Re: [PATCH v4 21/28] net: wan: Add framer framework support

2023-08-21 Thread Linus Walleij
On Mon, Aug 21, 2023 at 7:19 AM Christophe Leroy
 wrote:
> On 20/08/2023 at 23:06, Linus Walleij wrote:
> > On Fri, Aug 18, 2023 at 6:41 PM Christophe Leroy
> >  wrote:
> >
> >> From: Herve Codina 
> >>
> >> A framer is a component in charge of an E1/T1 line interface.
> >> Connected usually to a TDM bus, it converts TDM frames to/from E1/T1
> >> frames. It also provides information related to the E1/T1 line.
> >>
> >> The framer framework provides a set of APIs for the framer drivers
> >> (framer provider) to create/destroy a framer and APIs for the framer
> >> users (framer consumer) to obtain a reference to the framer, and
> >> use the framer.
> >>
> >> This basic implementation provides a framer abstraction for:
> >>   - power on/off the framer
> >>   - get the framer status (line state)
> >>   - be notified on framer status changes
> >>   - get/set the framer configuration
> >>
> >> Signed-off-by: Herve Codina 
> >> Reviewed-by: Christophe Leroy 
> >> Signed-off-by: Christophe Leroy 
> >
> > I had these review comments, you must have missed them?
> > https://lore.kernel.org/netdev/cacrpkdzq9_f6+9csev1l_wgphhujfpayxmjjfjurzszrako...@mail.gmail.com/
> >
>
> As I said in the cover letter, this series only fixes critical build
> failures that happened when CONFIG_MODULES is set. The purpose was to
> allow robots to perform their job up to the end. Other feedback and
> comments will be taken into account by Hervé when he is back from holidays.

Ah I see. I completely missed this.

Yours,
Linus Walleij


Re: [PATCH v4 20/28] wan: qmc_hdlc: Add runtime timeslots changes support

2023-08-21 Thread Christophe JAILLET

On 18/08/2023 at 18:39, Christophe Leroy wrote:

From: Herve Codina 

QMC channels support runtime timeslot changes, but nothing is done in
the QMC HDLC driver to handle them.

Use the existing IFACE ioctl in order to configure the timeslots to use.

Signed-off-by: Herve Codina 
Reviewed-by: Christophe Leroy 
Signed-off-by: Christophe Leroy 
---


Hi,

a few nits below, should there be a v5.
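
For readers following the timeslot translation quoted below, a worked
example with hypothetical values:

	/* ts_mask_avail = 0b1010  -> timeslots 1 and 3 are usable
	 * slot_map      = 0b11    -> both logical slots requested
	 * dense bit 0 maps to avail bit 1, dense bit 1 to avail bit 3:
	 * ts_mask       = 0b1010
	 * hweight64(ts_mask) == hweight64(map), so nothing was dropped.
	 */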


  drivers/net/wan/fsl_qmc_hdlc.c | 169 -
  1 file changed, 168 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c
index 4f84ac5fc63e..4b8cb5761fd1 100644
--- a/drivers/net/wan/fsl_qmc_hdlc.c
+++ b/drivers/net/wan/fsl_qmc_hdlc.c
@@ -32,6 +32,7 @@ struct qmc_hdlc {
struct qmc_hdlc_desc tx_descs[8];
unsigned int tx_out;
struct qmc_hdlc_desc rx_descs[4];
+   u32 slot_map;
  };
  
  static inline struct qmc_hdlc *netdev_to_qmc_hdlc(struct net_device *netdev)

@@ -202,6 +203,162 @@ static netdev_tx_t qmc_hdlc_xmit(struct sk_buff *skb, struct net_device *netdev)
return NETDEV_TX_OK;
  }
  
+static int qmc_hdlc_xlate_slot_map(struct qmc_hdlc *qmc_hdlc,
+  u32 slot_map, struct qmc_chan_ts_info *ts_info)
+{
+   u64 ts_mask_avail;
+   unsigned int bit;
+   unsigned int i;
+   u64 ts_mask;
+   u64 map = 0;


This init looks useless.


+
+   /* Tx and Rx masks must be identical */
+   if (ts_info->rx_ts_mask_avail != ts_info->tx_ts_mask_avail) {
+   dev_err(qmc_hdlc->dev, "tx and rx available timeslots mismatch (0x%llx, 0x%llx)\n",
+   ts_info->rx_ts_mask_avail, ts_info->tx_ts_mask_avail);
+   return -EINVAL;
+   }
+
+   ts_mask_avail = ts_info->rx_ts_mask_avail;
+   ts_mask = 0;
+   map = slot_map;
+   bit = 0;
+   for (i = 0; i < 64; i++) {
+   if (ts_mask_avail & BIT_ULL(i)) {
+   if (map & BIT_ULL(bit))
+   ts_mask |= BIT_ULL(i);
+   bit++;
+   }
+   }
+
+   if (hweight64(ts_mask) != hweight64(map)) {
+   dev_err(qmc_hdlc->dev, "Cannot translate timeslots 0x%llx -> (0x%llx,0x%llx)\n",
+   map, ts_mask_avail, ts_mask);
+   return -EINVAL;
+   }
+
+   ts_info->tx_ts_mask = ts_mask;
+   ts_info->rx_ts_mask = ts_mask;
+   return 0;
+}
+
+static int qmc_hdlc_xlate_ts_info(struct qmc_hdlc *qmc_hdlc,
+ const struct qmc_chan_ts_info *ts_info, u32 *slot_map)
+{
+   u64 ts_mask_avail;
+   unsigned int bit;
+   unsigned int i;
+   u64 ts_mask;
+   u64 map = 0;


This init looks useless.


+
+   /* Tx and Rx masks must be identical */
+   if (ts_info->rx_ts_mask_avail != ts_info->tx_ts_mask_avail) {
+   dev_err(qmc_hdlc->dev, "tx and rx available timeslots mismatch (0x%llx, 0x%llx)\n",
+   ts_info->rx_ts_mask_avail, ts_info->tx_ts_mask_avail);
+   return -EINVAL;
+   }
+   if (ts_info->rx_ts_mask != ts_info->tx_ts_mask) {
+   dev_err(qmc_hdlc->dev, "tx and rx timeslots mismatch (0x%llx, 0x%llx)\n",
+   ts_info->rx_ts_mask, ts_info->tx_ts_mask);
+   return -EINVAL;
+   }
+
+   ts_mask_avail = ts_info->rx_ts_mask_avail;
+   ts_mask = ts_info->rx_ts_mask;
+   map = 0;
+   bit = 0;
+   for (i = 0; i < 64; i++) {
+   if (ts_mask_avail & BIT_ULL(i)) {
+   if (ts_mask & BIT_ULL(i))
+   map |= BIT_ULL(bit);
+   bit++;
+   }
+   }
+
+   if (hweight64(ts_mask) != hweight64(map)) {
+   dev_err(qmc_hdlc->dev, "Cannot translate timeslots (0x%llx,0x%llx) -> 0x%llx\n",
+   ts_mask_avail, ts_mask, map);
+   return -EINVAL;
+   }
+
+   if (map >= BIT_ULL(32)) {
+   dev_err(qmc_hdlc->dev, "Slot map out of 32bit (0x%llx,0x%llx) -> 0x%llx\n",
+   ts_mask_avail, ts_mask, map);
+   return -EINVAL;
+   }
+
+   *slot_map = map;
+   return 0;
+}


...


+static int qmc_hdlc_ioctl(struct net_device *netdev, struct if_settings *ifs)
+{
+   struct qmc_hdlc *qmc_hdlc = netdev_to_qmc_hdlc(netdev);
+   te1_settings te1;
+
+   switch (ifs->type) {
+   case IF_GET_IFACE:
+   ifs->type = IF_IFACE_E1;
+   if (ifs->size < sizeof(te1)) {
+   if (!ifs->size)
+   return 0; /* only type requested */
+
+   ifs->size = sizeof(te1); /* data size wanted */
+   return -ENOBUFS;
+   }
+
+   memset(&te1, 0, sizeof(te1));
+
+   /* Update slot_map */
+   te1.slot_map = qmc_hdlc->slot_map;
+
+   if 

Re: [PATCH v4 21/28] net: wan: Add framer framework support

2023-08-21 Thread Christophe JAILLET

On 18/08/2023 at 18:39, Christophe Leroy wrote:

From: Herve Codina 

A framer is a component in charge of an E1/T1 line interface.
Connected usually to a TDM bus, it converts TDM frames to/from E1/T1
frames. It also provides information related to the E1/T1 line.

The framer framework provides a set of APIs for the framer drivers
(framer provider) to create/destroy a framer and APIs for the framer
users (framer consumer) to obtain a reference to the framer, and
use the framer.

This basic implementation provides a framer abstraction for:
  - power on/off the framer
  - get the framer status (line state)
  - be notified on framer status changes
  - get/set the framer configuration

Signed-off-by: Herve Codina 
Reviewed-by: Christophe Leroy 
Signed-off-by: Christophe Leroy 
---


Hi,

should there be a V5, some nits below.

...


+int framer_power_off(struct framer *framer)
+{
+   int ret;
+
+   mutex_lock(&framer->mutex);
+   if (framer->power_count == 1 && framer->ops->power_off) {
+   ret =  framer->ops->power_off(framer);


 ~~
Useless extra space


+   if (ret < 0) {
+   dev_err(&framer->dev, "framer poweroff failed --> %d\n", ret);
+   mutex_unlock(&framer->mutex);
+   return ret;
+   }
+   }
+   --framer->power_count;
+   mutex_unlock(&framer->mutex);
+   framer_pm_runtime_put(framer);
+
+   if (framer->pwr)
+   regulator_disable(framer->pwr);
+
+   return 0;
+}


...


+struct framer *framer_create(struct device *dev, struct device_node *node,
+const struct framer_ops *ops)
+{
+   int ret;
+   int id;
+   struct framer *framer;
+
+   if (WARN_ON(!dev))
+   return ERR_PTR(-EINVAL);
+
+   /* get_status() is mandatory if the provider ask for polling status */
+   if (WARN_ON((ops->flags & FRAMER_FLAG_POLL_STATUS) && !ops->get_status))
+   return ERR_PTR(-EINVAL);
+
+   framer = kzalloc(sizeof(*framer), GFP_KERNEL);
+   if (!framer)
+   return ERR_PTR(-ENOMEM);
+
+   id = ida_simple_get(&framer_ida, 0, 0, GFP_KERNEL);


ida_alloc()?
(ida_simple_get() is deprecated)
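
A minimal sketch of the suggested replacement -- ida_alloc() with no
range arguments behaves like ida_simple_get(ida, 0, 0, gfp):

	id = ida_alloc(&framer_ida, GFP_KERNEL);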


+   if (id < 0) {
+   dev_err(dev, "unable to get id\n");
+   ret = id;
+   goto free_framer;
+   }
+
+   device_initialize(&framer->dev);
+   mutex_init(&framer->mutex);
+   INIT_WORK(&framer->notify_status_work, framer_notify_status_work);
+   INIT_DELAYED_WORK(&framer->polling_work, framer_polling_work);
+   BLOCKING_INIT_NOTIFIER_HEAD(&framer->notifier_list);
+
+   framer->dev.class = framer_class;
+   framer->dev.parent = dev;
+   framer->dev.of_node = node ? node : dev->of_node;
+   framer->id = id;
+   framer->ops = ops;
+
+   ret = dev_set_name(&framer->dev, "framer-%s.%d", dev_name(dev), id);
+   if (ret)
+   goto put_dev;
+
+   /* framer-supply */
+   framer->pwr = regulator_get_optional(&framer->dev, "framer");
+   if (IS_ERR(framer->pwr)) {
+   ret = PTR_ERR(framer->pwr);
+   if (ret == -EPROBE_DEFER)
+   goto put_dev;
+
+   framer->pwr = NULL;
+   }
+
+   ret = device_add(&framer->dev);
+   if (ret)
+   goto put_dev;
+
+   if (pm_runtime_enabled(dev)) {
+   pm_runtime_enable(&framer->dev);
+   pm_runtime_no_callbacks(&framer->dev);
+   }
+
+   return framer;
+
+put_dev:
+   put_device(&framer->dev);  /* calls framer_release() which frees resources */
+   return ERR_PTR(ret);
+
+free_framer:
+   kfree(framer);
+   return ERR_PTR(ret);
+}


...


+void framer_provider_of_unregister(struct framer_provider *framer_provider)
+{
+   mutex_lock(&framer_provider_mutex);
+   list_del(&framer_provider->list);
+   of_node_put(framer_provider->dev->of_node);
+   kfree(framer_provider);
+   mutex_unlock(&framer_provider_mutex);


If it makes sense, of_node_put() and kfree() could maybe be moved out
of the mutex, in order to match how things are done in
__framer_provider_of_register().



+}


...


+static void framer_release(struct device *dev)
+{
+   struct framer *framer;
+
+   framer = dev_to_framer(dev);
+   regulator_put(framer->pwr);
+   ida_simple_remove(&framer_ida, framer->id);


ida_free()?
(ida_simple_remove() is deprecated)
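
A minimal sketch of the matching replacement:

	ida_free(&framer_ida, framer->id);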


+   kfree(framer);
+}


...