[powerpc:next-test 9/109] arch/powerpc/include/asm/paca.h:155:23: error: field has incomplete type 'struct tlb_core_data'
tree: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test head: 0f71dcfb4aef6043da6cc509e7a7f6a3ae87c12d commit: 3a24ea0df83e32355d897a18bbd82e05986dcdc3 [9/109] powerpc/kuap: Use ASM feature fixups instead of static branches config: powerpc64-randconfig-r012-20230822 (https://download.01.org/0day-ci/archive/20230822/202308221352.nocgtdjp-...@intel.com/config) compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a) reproduce: (https://download.01.org/0day-ci/archive/20230822/202308221352.nocgtdjp-...@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202308221352.nocgtdjp-...@intel.com/ All errors (new ones prefixed by >>): In file included from arch/powerpc/mm/nohash/kup.c:11: In file included from arch/powerpc/include/asm/kup.h:24: In file included from arch/powerpc/include/asm/nohash/kup-booke.h:6: In file included from arch/powerpc/include/asm/mmu.h:148: In file included from arch/powerpc/include/asm/percpu.h:20: >> arch/powerpc/include/asm/paca.h:155:23: error: field has incomplete type >> 'struct tlb_core_data' 155 | struct tlb_core_data tcd; | ^ arch/powerpc/include/asm/paca.h:139:9: note: forward declaration of 'struct tlb_core_data' 139 | struct tlb_core_data *tcd_ptr; |^ arch/powerpc/include/asm/paca.h:291:36: warning: declaration of 'struct mm_struct' will not be visible outside of this function [-Wvisibility] 291 | extern void copy_mm_to_paca(struct mm_struct *mm); |^ 1 warning and 1 error generated. vim +155 arch/powerpc/include/asm/paca.h 91c60b5b8209627 Benjamin Herrenschmidt 2009-06-02 131 e0d68273d706953 Christophe Leroy 2022-09-19 132 #ifdef CONFIG_PPC_BOOK3E_64 016f8cf0d87bb2b Kevin Hao 2015-03-10 133 u64 exgen[8] __aligned(0x40); f67f4ef5fcdfdee Scott Wood 2011-06-22 134 /* Keep pgd in the same cacheline as the start of extlb */ 016f8cf0d87bb2b Kevin Hao 2015-03-10 135 pgd_t *pgd __aligned(0x40); /* Current PGD */ f67f4ef5fcdfdee Scott Wood 2011-06-22 136 pgd_t *kernel_pgd; /* Kernel PGD */ 28efc35fe68dacb Scott Wood 2013-10-11 137 28efc35fe68dacb Scott Wood 2013-10-11 138 /* Shared by all threads of a core -- points to tcd of first thread */ 28efc35fe68dacb Scott Wood 2013-10-11 139 struct tlb_core_data *tcd_ptr; 28efc35fe68dacb Scott Wood 2013-10-11 140 609af38f8fc0f1d Scott Wood 2014-03-10 141 /* 609af38f8fc0f1d Scott Wood 2014-03-10 142 * We can have up to 3 levels of reentrancy in the TLB miss handler, 609af38f8fc0f1d Scott Wood 2014-03-10 143 * in each of four exception levels (normal, crit, mcheck, debug). 609af38f8fc0f1d Scott Wood 2014-03-10 144 */ 609af38f8fc0f1d Scott Wood 2014-03-10 145 u64 extlb[12][EX_TLB_SIZE / sizeof(u64)]; dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23 146 u64 exmc[8]; /* used for machine checks */ dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23 147 u64 excrit[8]; /* used for crit interrupts */ dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23 148 u64 exdbg[8]; /* used for debug interrupts */ dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23 149 dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23 150 /* Kernel stack pointers for use by special exceptions */ dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23 151 void *mc_kstack; dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23 152 void *crit_kstack; dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23 153 void *dbg_kstack; 28efc35fe68dacb Scott Wood 2013-10-11 154 28efc35fe68dacb Scott Wood 2013-10-11 @155 struct tlb_core_data tcd; e0d68273d706953 Christophe Leroy 2022-09-19 156 #endif /* CONFIG_PPC_BOOK3E_64 */ dce6670aaa7efec Benjamin Herrenschmidt 2009-07-23 157 :: The code at line 155 was first introduced by commit :: 28efc35fe68dacbddc4b12c2fa8f2df1593a4ad3 powerpc/e6500: TLB miss handler with hardware tablewalk support :: TO: Scott Wood :: CC: Scott Wood -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
Re: [PATCH 1/9] ARM: Remove
On 8/18/23 01:07, Geert Uytterhoeven wrote: > As of commit b7fb14d3ac63117e ("ide: remove the legacy ide driver") in > v5.14, there are no more generic users of . > > Signed-off-by: Geert Uytterhoeven Looks good to me. All patches are reviewed or acked, except this one. Can I get an ack from arm folks ? > --- > arch/arm/include/asm/ide.h | 24 > 1 file changed, 24 deletions(-) > delete mode 100644 arch/arm/include/asm/ide.h > > diff --git a/arch/arm/include/asm/ide.h b/arch/arm/include/asm/ide.h > deleted file mode 100644 > index a81e0b0d6747aa2f.. > --- a/arch/arm/include/asm/ide.h > +++ /dev/null > @@ -1,24 +0,0 @@ > -/* SPDX-License-Identifier: GPL-2.0 */ > -/* > - * arch/arm/include/asm/ide.h > - * > - * Copyright (C) 1994-1996 Linus Torvalds & authors > - */ > - > -/* > - * This file contains the ARM architecture specific IDE code. > - */ > - > -#ifndef __ASMARM_IDE_H > -#define __ASMARM_IDE_H > - > -#ifdef __KERNEL__ > - > -#define __ide_mm_insw(port,addr,len) readsw(port,addr,len) > -#define __ide_mm_insl(port,addr,len) readsl(port,addr,len) > -#define __ide_mm_outsw(port,addr,len)writesw(port,addr,len) > -#define __ide_mm_outsl(port,addr,len)writesl(port,addr,len) > - > -#endif /* __KERNEL__ */ > - > -#endif /* __ASMARM_IDE_H */ -- Damien Le Moal Western Digital Research
Re: [PATCH] powerpc/perf: Convert fsl_emb notifier to state machine callbacks
Le 18/08/2023 à 10:59, Christophe Leroy a écrit : >CC arch/powerpc/perf/core-fsl-emb.o > arch/powerpc/perf/core-fsl-emb.c:675:6: error: no previous prototype for > 'hw_perf_event_setup' [-Werror=missing-prototypes] >675 | void hw_perf_event_setup(int cpu) >| ^~~ > > Looks like fsl_emb was completely missed by commit 3f6da3905398 ("perf: > Rework and fix the arch CPU-hotplug hooks") > > So, apply same changes as commit 3f6da3905398 ("perf: Rework and fix > the arch CPU-hotplug hooks") then commit 57ecde42cc74 ("powerpc/perf: > Convert book3s notifier to state machine callbacks") > > While at it, also fix following error: > > arch/powerpc/perf/core-fsl-emb.c: In function 'perf_event_interrupt': > arch/powerpc/perf/core-fsl-emb.c:648:13: error: variable 'found' set but not > used [-Werror=unused-but-set-variable] >648 | int found = 0; >| ^ > > Fixes: 3f6da3905398 ("perf: Rework and fix the arch CPU-hotplug hooks") > Cc: Peter Zijlstra > Cc: Thomas Gleixner > Signed-off-by: Christophe Leroy > Cc: Arnd Bergmann Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308220658.or5hhfcd-...@intel.com/ > --- > arch/powerpc/perf/core-fsl-emb.c | 8 +--- > 1 file changed, 5 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/perf/core-fsl-emb.c > b/arch/powerpc/perf/core-fsl-emb.c > index ee721f420a7b..1a53ab08447c 100644 > --- a/arch/powerpc/perf/core-fsl-emb.c > +++ b/arch/powerpc/perf/core-fsl-emb.c > @@ -645,7 +645,6 @@ static void perf_event_interrupt(struct pt_regs *regs) > struct cpu_hw_events *cpuhw = this_cpu_ptr(_hw_events); > struct perf_event *event; > unsigned long val; > - int found = 0; > > for (i = 0; i < ppmu->n_counter; ++i) { > event = cpuhw->event[i]; > @@ -654,7 +653,6 @@ static void perf_event_interrupt(struct pt_regs *regs) > if ((int)val < 0) { > if (event) { > /* event has overflowed */ > - found = 1; > record_and_restart(event, val, regs); > } else { > /* > @@ -672,11 +670,13 @@ static void perf_event_interrupt(struct pt_regs *regs) > isync(); > } > > -void hw_perf_event_setup(int cpu) > +static int fsl_emb_pmu_prepare_cpu(unsigned int cpu) > { > struct cpu_hw_events *cpuhw = _cpu(cpu_hw_events, cpu); > > memset(cpuhw, 0, sizeof(*cpuhw)); > + > + return 0; > } > > int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu) > @@ -689,6 +689,8 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu) > pmu->name); > > perf_pmu_register(_emb_pmu, "cpu", PERF_TYPE_RAW); > + cpuhp_setup_state(CPUHP_PERF_POWER, "perf/powerpc:prepare", > + fsl_emb_pmu_prepare_cpu, NULL); > > return 0; > }
Re: linux-next: build failure after merge of the mm tree
On Tue, Aug 22, 2023 at 02:34:06AM +0100, Matthew Wilcox wrote: > On Tue, Aug 22, 2023 at 11:22:17AM +1000, Stephen Rothwell wrote: > > Hi Matthew, > > > > On Tue, 22 Aug 2023 02:11:44 +0100 Matthew Wilcox > > wrote: > > > > > > On Tue, Aug 22, 2023 at 09:55:37AM +1000, Stephen Rothwell wrote: > > > > In file included from include/trace/trace_events.h:27, > > > > from include/trace/define_trace.h:102, > > > > from fs/xfs/xfs_trace.h:4428, > > > > from fs/xfs/xfs_trace.c:45: > > > > include/linux/pgtable.h:8:25: error: initializer element is not constant > > > > 8 | #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) > > > > > > Ummm. PowerPC doesn't have a compile-time constant PMD size? > > > > Yeah, you are not the first (or probably the last) to be caught by that. > > I think this will do the trick. Any comments? > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > index 1904eaf7a2e9..d5a4e6c2dcd1 100644 > --- a/fs/xfs/xfs_trace.h > +++ b/fs/xfs/xfs_trace.h > @@ -796,15 +796,6 @@ DEFINE_INODE_EVENT(xfs_inode_reclaiming); > DEFINE_INODE_EVENT(xfs_inode_set_need_inactive); > DEFINE_INODE_EVENT(xfs_inode_inactivating); > > -/* > - * ftrace's __print_symbolic requires that all enum values be wrapped in the > - * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the > ftrace > - * ring buffer. Somehow this was only worth mentioning in the ftrace sample > - * code. > - */ Please leave this ^^^ comment, because the need for TRACE_DEFINE_ENUM to make enums work in tracepoints is not at all obvious. > -TRACE_DEFINE_ENUM(PMD_ORDER); > -TRACE_DEFINE_ENUM(PUD_ORDER); > - > TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); > TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); > > @@ -823,13 +814,10 @@ TRACE_EVENT(xfs_filemap_fault, > __entry->order = order; > __entry->write_fault = write_fault; > ), > - TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", > + TP_printk("dev %d:%d ino 0x%llx order:%u write_fault %d", "order %u" to match the (non dev_t) style of the rest of the xfs tracepoints. --D > MAJOR(__entry->dev), MINOR(__entry->dev), > __entry->ino, > - __print_symbolic(__entry->order, > - { 0,"PTE" }, > - { PMD_ORDER,"PMD" }, > - { PUD_ORDER,"PUD" }), > + __entry->order, > __entry->write_fault) > ) > > >
Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd
On Mon, 21 Aug 2023, Jann Horn wrote: > On Mon, Aug 21, 2023 at 9:51 PM Hugh Dickins wrote: > > Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private > > shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp() > > thought it had emptied: page lock on the huge page is enough to protect > > against WP faults (which find the PTE has been cleared), but not enough > > to protect against userfaultfd. "BUG: Bad rss-counter state" followed. > > > > retract_page_tables() protects against this by checking !vma->anon_vma; > > but we know that MADV_COLLAPSE needs to be able to work on private shmem > > mappings, even those with an anon_vma prepared for another part of the > > mapping; and we know that MADV_COLLAPSE needs to work on shared shmem > > mappings which are userfaultfd_armed(). Whether it needs to work on > > private shmem mappings which are userfaultfd_armed(), I'm not so sure: > > but assume that it does. > > I think we couldn't rely on anon_vma here anyway, since holding the > mmap_lock in read mode doesn't prevent concurrent creation of an > anon_vma? We would have had to do the same as in retract_page_tables() (which doesn't even have mmap_lock for read): recheck !vma->anon_vma after finally acquiring ptlock. But the !anon_vma limitation is certainly not acceptable here anyway. > > > Just for this case, take the pmd_lock() two steps earlier: not because > > it gives any protection against this case itself, but because ptlock > > nests inside it, and it's the dropping of ptlock which let the bug in. > > In other cases, continue to minimize the pmd_lock() hold time. > > Special-casing userfaultfd like this makes me a bit uncomfortable; but > I also can't find anything other than userfaultfd that would insert > pages into regions that are khugepaged-compatible, so I guess this > works? I'm as sure as I can be that it's solely because userfaultfd breaks the usual rules here (and in fairness, IIRC Andrea did ask my permission before making it behave that way on shmem, COWing without a source page). Perhaps something else will want that same behaviour in future (it's tempting, but difficult to guarantee correctness); for now, it is just userfaultfd (but by saying "_armed" rather than "_missing", I'm half- expecting uffd to add more such exceptional modes in future). > > I guess an alternative would be to use a spin_trylock() instead of the > current pmd_lock(), and if that fails, temporarily drop the page table > lock and then restart from step 2 with both locks held - and at that > point the page table scan should be fast since we expect it to usually > be empty. That's certainly a good idea, if collapse on userfaultfd_armed private is anything of a common case (I doubt, but I don't know). It may be a better idea anyway (saving a drop and retake of ptlock). I gave it a try, expecting to end up with something that would lead me to say "I tried it, but it didn't work out well"; but actually it looks okay to me. I wouldn't say I prefer it, but it seems reasonable, and no more complicated (as Peter rightly observes) than the original. It's up to you and Peter, and whoever has strong feelings about it, to choose between them: I don't mind (but I shall be sad if someone demands that I indent that comment deeper - I'm not a fan of long multi-line comments near column 80). [PATCH mm-unstable v2] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp() thought it had emptied: page lock on the huge page is enough to protect against WP faults (which find the PTE has been cleared), but not enough to protect against userfaultfd. "BUG: Bad rss-counter state" followed. retract_page_tables() protects against this by checking !vma->anon_vma; but we know that MADV_COLLAPSE needs to be able to work on private shmem mappings, even those with an anon_vma prepared for another part of the mapping; and we know that MADV_COLLAPSE needs to work on shared shmem mappings which are userfaultfd_armed(). Whether it needs to work on private shmem mappings which are userfaultfd_armed(), I'm not so sure: but assume that it does. Now trylock pmd lock without dropping ptlock (suggested by jannh): if that fails, drop and retake ptlock around taking pmd lock, and just in the uffd private case, go back to recheck and empty the page table. Reported-by: Jann Horn Closes: https://lore.kernel.org/linux-mm/CAG48ez0FxiRC4d3VTu_a9h=rg5fw-kyd5rg5xo_rdbm0ltt...@mail.gmail.com/ Fixes: 1043173eb5eb ("mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()") Signed-off-by: Hugh Dickins --- mm/khugepaged.c | 39 +-- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 40d43eccdee8..ad1c571772fe 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1476,7
[powerpc:next-test 9/109] arch/powerpc/include/asm/paca.h:291:36: warning: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration
tree: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test head: 0f71dcfb4aef6043da6cc509e7a7f6a3ae87c12d commit: 3a24ea0df83e32355d897a18bbd82e05986dcdc3 [9/109] powerpc/kuap: Use ASM feature fixups instead of static branches config: powerpc64-randconfig-r031-20230822 (https://download.01.org/0day-ci/archive/20230822/202308221055.lw3uzjil-...@intel.com/config) compiler: powerpc64-linux-gcc (GCC) 12.3.0 reproduce: (https://download.01.org/0day-ci/archive/20230822/202308221055.lw3uzjil-...@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202308221055.lw3uzjil-...@intel.com/ All warnings (new ones prefixed by >>): In file included from arch/powerpc/include/asm/percpu.h:20, from arch/powerpc/include/asm/mmu.h:148, from arch/powerpc/include/asm/nohash/kup-booke.h:6, from arch/powerpc/include/asm/kup.h:24, from arch/powerpc/mm/nohash/kup.c:11: arch/powerpc/include/asm/paca.h:155:30: error: field 'tcd' has incomplete type 155 | struct tlb_core_data tcd; | ^~~ >> arch/powerpc/include/asm/paca.h:291:36: warning: 'struct mm_struct' declared >> inside parameter list will not be visible outside of this definition or >> declaration 291 | extern void copy_mm_to_paca(struct mm_struct *mm); |^ vim +291 arch/powerpc/include/asm/paca.h 91c60b5b820962 arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-06-02 131 e0d68273d70695 arch/powerpc/include/asm/paca.h Christophe Leroy 2022-09-19 132 #ifdef CONFIG_PPC_BOOK3E_64 016f8cf0d87bb2 arch/powerpc/include/asm/paca.h Kevin Hao 2015-03-10 133 u64 exgen[8] __aligned(0x40); f67f4ef5fcdfde arch/powerpc/include/asm/paca.h Scott Wood 2011-06-22 134 /* Keep pgd in the same cacheline as the start of extlb */ 016f8cf0d87bb2 arch/powerpc/include/asm/paca.h Kevin Hao 2015-03-10 135 pgd_t *pgd __aligned(0x40); /* Current PGD */ f67f4ef5fcdfde arch/powerpc/include/asm/paca.h Scott Wood 2011-06-22 136 pgd_t *kernel_pgd; /* Kernel PGD */ 28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood 2013-10-11 137 28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood 2013-10-11 138 /* Shared by all threads of a core -- points to tcd of first thread */ 28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood 2013-10-11 139 struct tlb_core_data *tcd_ptr; 28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood 2013-10-11 140 609af38f8fc0f1 arch/powerpc/include/asm/paca.h Scott Wood 2014-03-10 141 /* 609af38f8fc0f1 arch/powerpc/include/asm/paca.h Scott Wood 2014-03-10 142* We can have up to 3 levels of reentrancy in the TLB miss handler, 609af38f8fc0f1 arch/powerpc/include/asm/paca.h Scott Wood 2014-03-10 143* in each of four exception levels (normal, crit, mcheck, debug). 609af38f8fc0f1 arch/powerpc/include/asm/paca.h Scott Wood 2014-03-10 144*/ 609af38f8fc0f1 arch/powerpc/include/asm/paca.h Scott Wood 2014-03-10 145 u64 extlb[12][EX_TLB_SIZE / sizeof(u64)]; dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23 146 u64 exmc[8];/* used for machine checks */ dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23 147 u64 excrit[8]; /* used for crit interrupts */ dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23 148 u64 exdbg[8]; /* used for debug interrupts */ dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23 149 dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23 150 /* Kernel stack pointers for use by special exceptions */ dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23 151 void *mc_kstack; dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23 152 void *crit_kstack; dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23 153 void *dbg_kstack; 28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood 2013-10-11 154 28efc35fe68dac arch/powerpc/include/asm/paca.h Scott Wood 2013-10-11 @155 struct tlb_core_data tcd; e0d68273d70695 arch/powerpc/include/asm/paca.h Christophe Leroy 2022-09-19 156 #endif /* CONFIG_PPC_BOOK3E_64 */ dce6670aaa7efe arch/powerpc/include/asm/paca.h Benjamin Herrenschmidt 2009-07-23 157 387e220a2e5e63 arch/powerpc/include/asm/paca.h Nicholas Piggin 2021-12-02 158 #ifdef
Re: linux-next: build failure after merge of the mm tree
On Tue, Aug 22, 2023 at 11:22:17AM +1000, Stephen Rothwell wrote: > Hi Matthew, > > On Tue, 22 Aug 2023 02:11:44 +0100 Matthew Wilcox wrote: > > > > On Tue, Aug 22, 2023 at 09:55:37AM +1000, Stephen Rothwell wrote: > > > In file included from include/trace/trace_events.h:27, > > > from include/trace/define_trace.h:102, > > > from fs/xfs/xfs_trace.h:4428, > > > from fs/xfs/xfs_trace.c:45: > > > include/linux/pgtable.h:8:25: error: initializer element is not constant > > > 8 | #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) > > > > Ummm. PowerPC doesn't have a compile-time constant PMD size? > > Yeah, you are not the first (or probably the last) to be caught by that. I think this will do the trick. Any comments? diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 1904eaf7a2e9..d5a4e6c2dcd1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -796,15 +796,6 @@ DEFINE_INODE_EVENT(xfs_inode_reclaiming); DEFINE_INODE_EVENT(xfs_inode_set_need_inactive); DEFINE_INODE_EVENT(xfs_inode_inactivating); -/* - * ftrace's __print_symbolic requires that all enum values be wrapped in the - * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace - * ring buffer. Somehow this was only worth mentioning in the ftrace sample - * code. - */ -TRACE_DEFINE_ENUM(PMD_ORDER); -TRACE_DEFINE_ENUM(PUD_ORDER); - TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); @@ -823,13 +814,10 @@ TRACE_EVENT(xfs_filemap_fault, __entry->order = order; __entry->write_fault = write_fault; ), - TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", + TP_printk("dev %d:%d ino 0x%llx order:%u write_fault %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __print_symbolic(__entry->order, - { 0,"PTE" }, - { PMD_ORDER,"PMD" }, - { PUD_ORDER,"PUD" }), + __entry->order, __entry->write_fault) )
Re: linux-next: build failure after merge of the mm tree
Hi Matthew, On Tue, 22 Aug 2023 02:11:44 +0100 Matthew Wilcox wrote: > > On Tue, Aug 22, 2023 at 09:55:37AM +1000, Stephen Rothwell wrote: > > In file included from include/trace/trace_events.h:27, > > from include/trace/define_trace.h:102, > > from fs/xfs/xfs_trace.h:4428, > > from fs/xfs/xfs_trace.c:45: > > include/linux/pgtable.h:8:25: error: initializer element is not constant > > 8 | #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) > > Ummm. PowerPC doesn't have a compile-time constant PMD size? Yeah, you are not the first (or probably the last) to be caught by that. -- Cheers, Stephen Rothwell pgp3KpD7r5v3m.pgp Description: OpenPGP digital signature
Re: linux-next: build failure after merge of the mm tree
On Tue, Aug 22, 2023 at 09:55:37AM +1000, Stephen Rothwell wrote: > In file included from include/trace/trace_events.h:27, > from include/trace/define_trace.h:102, > from fs/xfs/xfs_trace.h:4428, > from fs/xfs/xfs_trace.c:45: > include/linux/pgtable.h:8:25: error: initializer element is not constant > 8 | #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) Ummm. PowerPC doesn't have a compile-time constant PMD size? arch/powerpc/include/asm/book3s/64/pgtable.h:#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) arch/powerpc/include/asm/book3s/64/pgtable.h:#define PTE_INDEX_SIZE __pte_index_size That's really annoying. I'll try to work around it.
[powerpc:next-test 9/109] arch/powerpc/include/asm/paca.h:291:36: warning: declaration of 'struct mm_struct' will not be visible outside of this function
tree: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test head: 0f71dcfb4aef6043da6cc509e7a7f6a3ae87c12d commit: 3a24ea0df83e32355d897a18bbd82e05986dcdc3 [9/109] powerpc/kuap: Use ASM feature fixups instead of static branches config: powerpc64-randconfig-r012-20230822 (https://download.01.org/0day-ci/archive/20230822/202308220857.ufq2oaxm-...@intel.com/config) compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a) reproduce: (https://download.01.org/0day-ci/archive/20230822/202308220857.ufq2oaxm-...@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202308220857.ufq2oaxm-...@intel.com/ All warnings (new ones prefixed by >>): In file included from arch/powerpc/mm/nohash/kup.c:11: In file included from arch/powerpc/include/asm/kup.h:24: In file included from arch/powerpc/include/asm/nohash/kup-booke.h:6: In file included from arch/powerpc/include/asm/mmu.h:148: In file included from arch/powerpc/include/asm/percpu.h:20: arch/powerpc/include/asm/paca.h:155:23: error: field has incomplete type 'struct tlb_core_data' 155 | struct tlb_core_data tcd; | ^ arch/powerpc/include/asm/paca.h:139:9: note: forward declaration of 'struct tlb_core_data' 139 | struct tlb_core_data *tcd_ptr; |^ >> arch/powerpc/include/asm/paca.h:291:36: warning: declaration of 'struct >> mm_struct' will not be visible outside of this function [-Wvisibility] 291 | extern void copy_mm_to_paca(struct mm_struct *mm); |^ 1 warning and 1 error generated. vim +291 arch/powerpc/include/asm/paca.h ^1da177e4c3f41 include/asm-ppc64/paca.hLinus Torvalds 2005-04-16 290 54be0b9c7c9888 arch/powerpc/include/asm/paca.h Michael Ellerman 2018-10-02 @291 extern void copy_mm_to_paca(struct mm_struct *mm); d2e60075a3d442 arch/powerpc/include/asm/paca.h Nicholas Piggin 2018-02-14 292 extern struct paca_struct **paca_ptrs; 1426d5a3bd0758 arch/powerpc/include/asm/paca.h Michael Ellerman 2010-01-28 293 extern void initialise_paca(struct paca_struct *new_paca, int cpu); fc53b4202e61c7 arch/powerpc/include/asm/paca.h Matt Evans 2010-07-07 294 extern void setup_paca(struct paca_struct *new_paca); 59f577743d71bf arch/powerpc/include/asm/paca.h Nicholas Piggin 2018-02-14 295 extern void allocate_paca_ptrs(void); 59f577743d71bf arch/powerpc/include/asm/paca.h Nicholas Piggin 2018-02-14 296 extern void allocate_paca(int cpu); 1426d5a3bd0758 arch/powerpc/include/asm/paca.h Michael Ellerman 2010-01-28 297 extern void free_unused_pacas(void); 1426d5a3bd0758 arch/powerpc/include/asm/paca.h Michael Ellerman 2010-01-28 298 :: The code at line 291 was first introduced by commit :: 54be0b9c7c9888ebe63b89a31a17ee3df6a68d61 Revert "convert SLB miss handlers to C" and subsequent commits :: TO: Michael Ellerman :: CC: Michael Ellerman -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
linux-next: manual merge of the powerpc tree with the mm-stable tree
Hi all, Today's linux-next merge of the powerpc tree got a conflict in: arch/powerpc/Kconfig between commit: 8d05554dca2a ("powerpc: mm: convert to GENERIC_IOREMAP") from the mm-stable tree and commit: 0ceef6e99cc3 ("powerpc/idle: Add support for nohlt") from the powerpc tree. I fixed it up (see below) and can carry the fix as necessary. This is now fixed as far as linux-next is concerned, but any non trivial conflicts should be mentioned to your upstream maintainer when your tree is submitted for merging. You may also want to consider cooperating with the maintainer of the conflicting tree to minimise any particularly complex conflicts. -- Cheers, Stephen Rothwell diff --cc arch/powerpc/Kconfig index 21edd664689e,c831e20cf40f.. --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@@ -195,7 -194,7 +196,8 @@@ config PP select GENERIC_CPU_VULNERABILITIES if PPC_BARRIER_NOSPEC select GENERIC_EARLY_IOREMAP select GENERIC_GETTIMEOFDAY + select GENERIC_IDLE_POLL_SETUP + select GENERIC_IOREMAP select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAPif PCI pgpgMePaKLYzL.pgp Description: OpenPGP digital signature
Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes
On Mon, Aug 21, 2023 at 07:39:15PM -0400, Sean Anderson wrote: > Well, I think we should take the opportunity to think about the hardware > which exists and how we plan to model it. IMO grouping lanes into a > single phy simplifies both the phy driver and the mac driver. Ok, but ungrouped for backplane and grouped for !backplane? For the KR link modes, parallel link training, with separate consumers per lanes in a group, will be needed per lane.
Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes
On 8/21/23 18:48, Vladimir Oltean wrote: > On Mon, Aug 21, 2023 at 05:06:46PM -0400, Sean Anderson wrote: >> On 8/21/23 15:58, Vladimir Oltean wrote: >> > On Mon, Aug 21, 2023 at 02:46:53PM -0400, Sean Anderson wrote: >> >> After further review, it seems the reason 28g can get away without this >> >> is because there's a one-to-one mapping between protocol controllers and >> >> lanes. Unfortunately, that regularity is not present for 10g. >> >> >> >> --Sean >> > >> > There are some things I saw in your phy-fsl-lynx-10g.c driver and device >> > tree bindings that I don't understand (the concept of lane groups) >> >> Each lane group corresponds to a phy device (struct phy). For protocols >> like PCIe or XAUI which can use multiple lanes, this lets the driver >> coordinate configuring all lanes at once in the correct order. > > For PCIe I admit that I don't know. I don't even know if there's any > valid (current or future) use case for having the PCIe controller have a > phandle to the SerDes. Everything else below in my reply assumes Ethernet. One use case could be selecting PCIe/SATA as appropriate for an M.2 card. Another use case could be allocating lanes to different slots based on card presence (e.g. 1 card use x4, 2+ cards use x1). I recently bought a motherboard whose manual says | PCI_E4 [normally x4] will run x1 speed when installing devices in PCI_E2/ | PCI_E3/ PCI_E5 slot [all x1]. which implies exactly this sort of design. >> > and >> > I'm not sure if they're related with what you're saying here, so if you >> > could elaborate a bit more (ideally with an example) on the one-to-one >> > mapping and the specific problems it causes, it would be great. >> >> For e.g. the LS2088A, SerDes1 lanes H-A use SG1-8 and XFI1-8. SerDes2 >> lanes A-H use SG9-16 and XFI9-16. Each lane has its own controller, and >> the mapping is 1-to-1. In the PCCRs, each controller uses the same >> value, and is mapped in a regular way. So you can go directly from the >> lane number to the right value to mask in the PCCR, with a very simple >> translation scheme. >> >> For e.g. the LS1046A, SerDes1 lane D uses XFI.9 (aka XFIA) and lane C >> uses XFI.10 (aka XFIB). This is the opposite of how SerDes1 lanes A-D >> use SGMII.9, .10, .5, and .6 (aka SGMIIA-D). >> >> For e.g. the T4240, SerDes1 lanes A-H use sg1.5, .6, .10, .9, .1, .2, >> .3, .4 (aka SGMII E, F, H, G, A, B, C, D). >> >> For e.g. the B4860, SerDes lanes A uses sgmii1 or sgmii5 and B uses >> sgmii2 or sgmii6. The MAC selected is determined based on the value >> programmed into PCCR2. > > It's so exceedingly unlikely that B4860 will gain support for anything > new, that it's not even worth talking about it, or even considering it > in the design of a new driver. Just forget about it. > > Let's concentrate on the Layerscapes, and on the T series to the extent > that we're not going out of our way to support them with a fairly simple > design. Well, these are just easy ways to show the oddities in the PCCRs. I could have made the same points with the various Layerscapes. > In the Lynx 10G block guide that I have, PCCR2 is a register that does > something completely different from Ethernet. I'm not sure if B4860 has > a Lynx 10G block and not something else. T-series SoCs use a different PCCR layout than Layerscape, despite using the same registers in the rest of the SerDes. The LS1021 also uses this scheme, so it's not just T-series (but it's close enough). This is why I had an option for different PCCR configuration functions in earlier versions of this series, so if someone wanted they could easily add T-series support. >> While I appreciate that your hardware engineers did a better job for >> 28g, many 10g serdes arbitrarily map lanes to protocol controllers. >> I think the mapping is too irregular to tame, and it is better to say >> "if you want this configuration, program this value". > > Ok, but that's a lateral argument (or I'm not understanding the connection). To restate, there's no systemic organization to the PCCRs. The driver configuration should reflect this and allow arbitrary mappings. > Some maintainers (Mark Brown for sure, from my personal experience) prefer > that expert-level knowledge of hardware should be hardcoded into the kernel > driver based on known stuff such as the SoC-specific compatible string. > I certainly share the same view. > > With your case, I think that argument is even more pertinent, because > IIUC, the lane group protocols won't be put in the SoC .dtsi (so as to > be written only once), but in the board device trees (since the > available protocols invariably depend upon the board provisioning). > So, non-expert board device tree writers will have to know what's with > the PCCR stuff. Pretty brain-intensive. The idea is to stick all the PCCR stuff in the SoC devicetree, and the board-level devicetrees just set up the clocks and pick which MACs use which phys. Have a look at patches 10 and 15 of this
[powerpc:next-test 9/109] arch/powerpc/include/asm/paca.h:155:30: error: field 'tcd' has incomplete type
tree: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test head: 0f71dcfb4aef6043da6cc509e7a7f6a3ae87c12d commit: 3a24ea0df83e32355d897a18bbd82e05986dcdc3 [9/109] powerpc/kuap: Use ASM feature fixups instead of static branches config: powerpc64-randconfig-r036-20230822 (https://download.01.org/0day-ci/archive/20230822/202308220708.nrf5auae-...@intel.com/config) compiler: powerpc64-linux-gcc (GCC) 12.3.0 reproduce: (https://download.01.org/0day-ci/archive/20230822/202308220708.nrf5auae-...@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202308220708.nrf5auae-...@intel.com/ All errors (new ones prefixed by >>): In file included from arch/powerpc/include/asm/percpu.h:20, from arch/powerpc/include/asm/mmu.h:148, from arch/powerpc/include/asm/nohash/kup-booke.h:6, from arch/powerpc/include/asm/kup.h:24, from arch/powerpc/mm/nohash/kup.c:11: >> arch/powerpc/include/asm/paca.h:155:30: error: field 'tcd' has incomplete >> type 155 | struct tlb_core_data tcd; | ^~~ arch/powerpc/include/asm/paca.h:291:36: error: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration [-Werror] 291 | extern void copy_mm_to_paca(struct mm_struct *mm); |^ cc1: all warnings being treated as errors vim +/tcd +155 arch/powerpc/include/asm/paca.h 91c60b5b820962 Benjamin Herrenschmidt 2009-06-02 131 e0d68273d70695 Christophe Leroy 2022-09-19 132 #ifdef CONFIG_PPC_BOOK3E_64 016f8cf0d87bb2 Kevin Hao 2015-03-10 133 u64 exgen[8] __aligned(0x40); f67f4ef5fcdfde Scott Wood 2011-06-22 134 /* Keep pgd in the same cacheline as the start of extlb */ 016f8cf0d87bb2 Kevin Hao 2015-03-10 135 pgd_t *pgd __aligned(0x40); /* Current PGD */ f67f4ef5fcdfde Scott Wood 2011-06-22 136 pgd_t *kernel_pgd; /* Kernel PGD */ 28efc35fe68dac Scott Wood 2013-10-11 137 28efc35fe68dac Scott Wood 2013-10-11 138 /* Shared by all threads of a core -- points to tcd of first thread */ 28efc35fe68dac Scott Wood 2013-10-11 139 struct tlb_core_data *tcd_ptr; 28efc35fe68dac Scott Wood 2013-10-11 140 609af38f8fc0f1 Scott Wood 2014-03-10 141 /* 609af38f8fc0f1 Scott Wood 2014-03-10 142* We can have up to 3 levels of reentrancy in the TLB miss handler, 609af38f8fc0f1 Scott Wood 2014-03-10 143* in each of four exception levels (normal, crit, mcheck, debug). 609af38f8fc0f1 Scott Wood 2014-03-10 144*/ 609af38f8fc0f1 Scott Wood 2014-03-10 145 u64 extlb[12][EX_TLB_SIZE / sizeof(u64)]; dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23 146 u64 exmc[8]; /* used for machine checks */ dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23 147 u64 excrit[8]; /* used for crit interrupts */ dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23 148 u64 exdbg[8]; /* used for debug interrupts */ dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23 149 dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23 150 /* Kernel stack pointers for use by special exceptions */ dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23 151 void *mc_kstack; dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23 152 void *crit_kstack; dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23 153 void *dbg_kstack; 28efc35fe68dac Scott Wood 2013-10-11 154 28efc35fe68dac Scott Wood 2013-10-11 @155 struct tlb_core_data tcd; e0d68273d70695 Christophe Leroy 2022-09-19 156 #endif /* CONFIG_PPC_BOOK3E_64 */ dce6670aaa7efe Benjamin Herrenschmidt 2009-07-23 157 :: The code at line 155 was first introduced by commit :: 28efc35fe68dacbddc4b12c2fa8f2df1593a4ad3 powerpc/e6500: TLB miss handler with hardware tablewalk support :: TO: Scott Wood :: CC: Scott Wood -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes
On Mon, Aug 21, 2023 at 05:06:46PM -0400, Sean Anderson wrote: > On 8/21/23 15:58, Vladimir Oltean wrote: > > On Mon, Aug 21, 2023 at 02:46:53PM -0400, Sean Anderson wrote: > >> After further review, it seems the reason 28g can get away without this > >> is because there's a one-to-one mapping between protocol controllers and > >> lanes. Unfortunately, that regularity is not present for 10g. > >> > >> --Sean > > > > There are some things I saw in your phy-fsl-lynx-10g.c driver and device > > tree bindings that I don't understand (the concept of lane groups) > > Each lane group corresponds to a phy device (struct phy). For protocols > like PCIe or XAUI which can use multiple lanes, this lets the driver > coordinate configuring all lanes at once in the correct order. For PCIe I admit that I don't know. I don't even know if there's any valid (current or future) use case for having the PCIe controller have a phandle to the SerDes. Everything else below in my reply assumes Ethernet. > > and > > I'm not sure if they're related with what you're saying here, so if you > > could elaborate a bit more (ideally with an example) on the one-to-one > > mapping and the specific problems it causes, it would be great. > > For e.g. the LS2088A, SerDes1 lanes H-A use SG1-8 and XFI1-8. SerDes2 > lanes A-H use SG9-16 and XFI9-16. Each lane has its own controller, and > the mapping is 1-to-1. In the PCCRs, each controller uses the same > value, and is mapped in a regular way. So you can go directly from the > lane number to the right value to mask in the PCCR, with a very simple > translation scheme. > > For e.g. the LS1046A, SerDes1 lane D uses XFI.9 (aka XFIA) and lane C > uses XFI.10 (aka XFIB). This is the opposite of how SerDes1 lanes A-D > use SGMII.9, .10, .5, and .6 (aka SGMIIA-D). > > For e.g. the T4240, SerDes1 lanes A-H use sg1.5, .6, .10, .9, .1, .2, > .3, .4 (aka SGMII E, F, H, G, A, B, C, D). > > For e.g. the B4860, SerDes lanes A uses sgmii1 or sgmii5 and B uses > sgmii2 or sgmii6. The MAC selected is determined based on the value > programmed into PCCR2. It's so exceedingly unlikely that B4860 will gain support for anything new, that it's not even worth talking about it, or even considering it in the design of a new driver. Just forget about it. Let's concentrate on the Layerscapes, and on the T series to the extent that we're not going out of our way to support them with a fairly simple design. In the Lynx 10G block guide that I have, PCCR2 is a register that does something completely different from Ethernet. I'm not sure if B4860 has a Lynx 10G block and not something else. > While I appreciate that your hardware engineers did a better job for > 28g, many 10g serdes arbitrarily map lanes to protocol controllers. > I think the mapping is too irregular to tame, and it is better to say > "if you want this configuration, program this value". Ok, but that's a lateral argument (or I'm not understanding the connection). Some maintainers (Mark Brown for sure, from my personal experience) prefer that expert-level knowledge of hardware should be hardcoded into the kernel driver based on known stuff such as the SoC-specific compatible string. I certainly share the same view. With your case, I think that argument is even more pertinent, because IIUC, the lane group protocols won't be put in the SoC .dtsi (so as to be written only once), but in the board device trees (since the available protocols invariably depend upon the board provisioning). So, non-expert board device tree writers will have to know what's with the PCCR stuff. Pretty brain-intensive. > > I may be off with my understanding of the regularity you are talking about, > > but the LX2160 (and Lynx 28G block) also has multi-lane protocols like 40G, > > 100G, assuming that's what you are talking about. I haven't started yet > > working on those for the mtip_backplane driver, but I'm not currently > > seeing a problem with the architecture where a phy_device represents a > > single lane that's part of a multi-lane port, and not an entire group. > > Resetting one lane in a group will reset the rest, which could confuse > the driver. Additionally, treating the lanes as one phy lets us set the > reset direction and first lane bits correctly. Yeah, in theory that is probably correct, but in practice can't we hide our head in the sand and say that the "phys" phandles have to have the lanes in the same order as LNmGCR0[1STLANE] expects them (which is also the "natural" order as the SoC RM describes it)? With a "for" loop implementation in the MAC, that would work just fine 100% of the time. Doing more intricate massaging of the "phys" in the consumer, and you're just asking for trouble. My 2 cents. Sure, it's the same kind of ask of a board device tree writer as "hey, please give me a good PCCR value", but I honestly think that the head scratching will be much less severe. > > In my imagination, there are 2 cases: > > - all 4 lanes are managed by
Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd
On Mon, Aug 21, 2023 at 9:51 PM Hugh Dickins wrote: > Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private > shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp() > thought it had emptied: page lock on the huge page is enough to protect > against WP faults (which find the PTE has been cleared), but not enough > to protect against userfaultfd. "BUG: Bad rss-counter state" followed. > > retract_page_tables() protects against this by checking !vma->anon_vma; > but we know that MADV_COLLAPSE needs to be able to work on private shmem > mappings, even those with an anon_vma prepared for another part of the > mapping; and we know that MADV_COLLAPSE needs to work on shared shmem > mappings which are userfaultfd_armed(). Whether it needs to work on > private shmem mappings which are userfaultfd_armed(), I'm not so sure: > but assume that it does. I think we couldn't rely on anon_vma here anyway, since holding the mmap_lock in read mode doesn't prevent concurrent creation of an anon_vma? > Just for this case, take the pmd_lock() two steps earlier: not because > it gives any protection against this case itself, but because ptlock > nests inside it, and it's the dropping of ptlock which let the bug in. > In other cases, continue to minimize the pmd_lock() hold time. Special-casing userfaultfd like this makes me a bit uncomfortable; but I also can't find anything other than userfaultfd that would insert pages into regions that are khugepaged-compatible, so I guess this works? I guess an alternative would be to use a spin_trylock() instead of the current pmd_lock(), and if that fails, temporarily drop the page table lock and then restart from step 2 with both locks held - and at that point the page table scan should be fast since we expect it to usually be empty.
Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd
On Mon, Aug 21, 2023 at 12:51:20PM -0700, Hugh Dickins wrote: > Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private > shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp() > thought it had emptied: page lock on the huge page is enough to protect > against WP faults (which find the PTE has been cleared), but not enough > to protect against userfaultfd. "BUG: Bad rss-counter state" followed. > > retract_page_tables() protects against this by checking !vma->anon_vma; > but we know that MADV_COLLAPSE needs to be able to work on private shmem > mappings, even those with an anon_vma prepared for another part of the > mapping; and we know that MADV_COLLAPSE needs to work on shared shmem > mappings which are userfaultfd_armed(). Whether it needs to work on > private shmem mappings which are userfaultfd_armed(), I'm not so sure: > but assume that it does. > > Just for this case, take the pmd_lock() two steps earlier: not because > it gives any protection against this case itself, but because ptlock > nests inside it, and it's the dropping of ptlock which let the bug in. > In other cases, continue to minimize the pmd_lock() hold time. > > Reported-by: Jann Horn > Closes: > https://lore.kernel.org/linux-mm/CAG48ez0FxiRC4d3VTu_a9h=rg5fw-kyd5rg5xo_rdbm0ltt...@mail.gmail.com/ > Fixes: 1043173eb5eb ("mm/khugepaged: collapse_pte_mapped_thp() with > mmap_read_lock()") > Signed-off-by: Hugh Dickins The locking is indeed slightly complicated.. but I didn't spot anything wrong. Acked-by: Peter Xu Thanks, -- Peter Xu
Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes
On 8/21/23 15:58, Vladimir Oltean wrote: > On Mon, Aug 21, 2023 at 02:46:53PM -0400, Sean Anderson wrote: >> After further review, it seems the reason 28g can get away without this >> is because there's a one-to-one mapping between protocol controllers and >> lanes. Unfortunately, that regularity is not present for 10g. >> >> --Sean > > There are some things I saw in your phy-fsl-lynx-10g.c driver and device > tree bindings that I don't understand (the concept of lane groups) Each lane group corresponds to a phy device (struct phy). For protocols like PCIe or XAUI which can use multiple lanes, this lets the driver coordinate configuring all lanes at once in the correct order. > and > I'm not sure if they're related with what you're saying here, so if you > could elaborate a bit more (ideally with an example) on the one-to-one > mapping and the specific problems it causes, it would be great. For e.g. the LS2088A, SerDes1 lanes H-A use SG1-8 and XFI1-8. SerDes2 lanes A-H use SG9-16 and XFI9-16. Each lane has its own controller, and the mapping is 1-to-1. In the PCCRs, each controller uses the same value, and is mapped in a regular way. So you can go directly from the lane number to the right value to mask in the PCCR, with a very simple translation scheme. For e.g. the LS1046A, SerDes1 lane D uses XFI.9 (aka XFIA) and lane C uses XFI.10 (aka XFIB). This is the opposite of how SerDes1 lanes A-D use SGMII.9, .10, .5, and .6 (aka SGMIIA-D). For e.g. the T4240, SerDes1 lanes A-H use sg1.5, .6, .10, .9, .1, .2, .3, .4 (aka SGMII E, F, H, G, A, B, C, D). For e.g. the B4860, SerDes lanes A uses sgmii1 or sgmii5 and B uses sgmii2 or sgmii6. The MAC selected is determined based on the value programmed into PCCR2. While I appreciate that your hardware engineers did a better job for 28g, many 10g serdes arbitrarily map lanes to protocol controllers. I think the mapping is too irregular to tame, and it is better to say "if you want this configuration, program this value". > I may be off with my understanding of the regularity you are talking about, > but the LX2160 (and Lynx 28G block) also has multi-lane protocols like 40G, > 100G, assuming that's what you are talking about. I haven't started yet > working on those for the mtip_backplane driver, but I'm not currently > seeing a problem with the architecture where a phy_device represents a > single lane that's part of a multi-lane port, and not an entire group. Resetting one lane in a group will reset the rest, which could confuse the driver. Additionally, treating the lanes as one phy lets us set the reset direction and first lane bits correctly. > In my imagination, there are 2 cases: > - all 4 lanes are managed by the single dpaa2-mac consumer (which has 4 > phandles, and iterates over them with a "for" loop) > - each of the 4 lanes is managed by the respective backplane AN/LT core, > and thus, there's one phandle to each lane By doing the grouping in the driver, we also simplify the consumer implementation. The MAC can always use a single phy, without worrying about the actual number of lanes. This matches the hardware, since the MAC is going to talk XGMII (or whatever) to the protocol controller anyway. I think it will be a lot easier to add multi-lane support with this method because it gives the driver more information about what's going on. The driver can control the whole configuration/reset process and the timing. > I sketched some dt-bindings for the second case here, so I guess it must > be the first scenario that's somehow problematic? > https://cas5-0-urlprotect.trendmicro.com:443/wis/clicktime/v1/query?url=https%3a%2f%2fpatchwork.kernel.org%2fproject%2fnetdevbpf%2fpatch%2f20230817150644.3605105%2d9%2dvladimir.oltean%40nxp.com%2f=9e644233-009e-4197-a266-5d9a85eb1148=d807158c60b7d2502abde8a2fc01f40662980862-cc1d5330d84af8fa40745b165a44849db50f8a67 Yes, no issues with the second case. --Sean
Re: [PATCH v4 22/28] dt-bindings: net: Add the Lantiq PEF2256 E1/T1/J1 framer
On Fri, Aug 18, 2023 at 06:39:16PM +0200, Christophe Leroy wrote: > From: Herve Codina > > The Lantiq PEF2256 is a framer and line interface component designed to > fulfill all required interfacing between an analog E1/T1/J1 line and the > digital PCM system highway/H.100 bus. > > Signed-off-by: Herve Codina > Signed-off-by: Christophe Leroy > --- > .../bindings/net/lantiq,pef2256.yaml | 219 ++ > 1 file changed, 219 insertions(+) > create mode 100644 Documentation/devicetree/bindings/net/lantiq,pef2256.yaml > > diff --git a/Documentation/devicetree/bindings/net/lantiq,pef2256.yaml > b/Documentation/devicetree/bindings/net/lantiq,pef2256.yaml > new file mode 100644 > index ..72f6777afa3a > --- /dev/null > +++ b/Documentation/devicetree/bindings/net/lantiq,pef2256.yaml > @@ -0,0 +1,219 @@ > +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) > +%YAML 1.2 > +--- > +$id: http://devicetree.org/schemas/net/lantiq,pef2256.yaml# > +$schema: http://devicetree.org/meta-schemas/core.yaml# > + > +title: Lantiq PEF2256 > + > +maintainers: > + - Herve Codina > + > +description: > + The Lantiq PEF2256, also known as Infineon PEF2256 or FALC56, is a framer > and > + line interface component designed to fulfill all required interfacing > between > + an analog E1/T1/J1 line and the digital PCM system highway/H.100 bus. > + > +properties: > + compatible: > +items: > + - const: lantiq,pef2256 > + > + reg: > +maxItems: 1 > + > + clocks: > +items: > + - description: Master clock > + - description: System Clock Receive > + - description: System Clock Transmit > + > + clock-names: > +items: > + - const: mclk > + - const: sclkr > + - const: sclkx > + > + interrupts: > +maxItems: 1 > + > + reset-gpios: > +description: > + GPIO used to reset the device. > +maxItems: 1 > + > + '#framer-cells': Not a standard binding. Do you need provider specific variable number of cells? > +const: 0 > + > + pinctrl: > +$ref: /schemas/pinctrl/pinctrl.yaml# > +additionalProperties: false > + > +patternProperties: > + '-pins$': > +type: object > +$ref: /schemas/pinctrl/pincfg-node.yaml# > +additionalProperties: false > + > +properties: > + pins: > +enum: [ RPA, RPB, RPC, RPD, XPA, XPB, XPC, XPD ] > + > + function: > +enum: [ SYPR, RFM, RFMB, RSIGM, RSIG, DLR, FREEZE, RFSP, LOS, > +SYPX, XFMS, XSIG, TCLK, XMFB, XSIGM, DLX, XCLK, XLT, > +GPI, GPOH, GPOL ] > + > +required: > + - pins > + - function > + > + lantiq,data-rate-bps: > +$ref: /schemas/types.yaml#/definitions/uint32 > +enum: [2048000, 4096000, 8192000, 16384000] > +default: 2048000 > +description: > + Data rate (bit per seconds) on the system highway. > + > + lantiq,clock-falling-edge: > +$ref: /schemas/types.yaml#/definitions/flag > +description: > + Data is sent on falling edge of the clock (and received on the rising > + edge). If 'clock-falling-edge' is not present, data is sent on the > + rising edge (and received on the falling edge). > + > + lantiq,channel-phase: > +$ref: /schemas/types.yaml#/definitions/uint32 > +enum: [0, 1, 2, 3, 4, 5, 6, 7] > +default: 0 > +description: Need '|' to preserve formatting > + The pef2256 delivers a full frame (32 8bit time-slots in E1 and 24 8bit > + time-slots 8 8bit signaling in E1/J1) every 125us. This lead to a data > + rate of 2048000 bit/s. When lantiq,data-rate-bps is more than 2048000 > + bit/s, the data (all 32 8bit) present in the frame are interleave with > + unused time-slots. The lantiq,channel-phase property allows to set the > + correct alignment of the interleave mechanism. > + For instance, suppose lantiq,data-rate-bps = 8192000 (ie 4*2048000), > and > + lantiq,channel-phase = 2, the interleave schema with unused time-slots > + (nu) and used time-slots (XX) for TSi is > +nu nu XX nu nu nu XX nu nu nu XX nu > +<-- TSi --> <- TSi+1 -> <- TSi+2 -> > + With lantiq,data-rate-bps = 8192000, and lantiq,channel-phase = 1, the > + interleave schema is > +nu XX nu nu nu XX nu nu nu XX nu nu > +<-- TSi --> <- TSi+1 -> <- TSi+2 -> > + With lantiq,data-rate-bps = 4096000 (ie 2*2048000), and > + lantiq,channel-phase = 1, the interleave schema is > +nuXXnuXXnuXX > +<-- TSi --> <- TSi+1 -> <- TSi+2 -> > + > +patternProperties: > + '^codec(-([0-9]|[1-2][0-9]|3[0-1]))?$': > +type: object > +$ref: /schemas/sound/dai-common.yaml > +unevaluatedProperties: false > +description: > + Codec provided by the pef2256. This codec allows to use some of the PCM > + system highway time-slots as audio channels to transport audio data > over > +
Re: [PATCH v4 06/28] dt-bindings: net: Add support for QMC HDLC
On Fri, Aug 18, 2023 at 06:39:00PM +0200, Christophe Leroy wrote: > From: Herve Codina > > The QMC (QUICC mutichannel controller) is a controller present in some > PowerQUICC SoC such as MPC885. > The QMC HDLC uses the QMC controller to transfer HDLC data. > > Additionally, a framer can be connected to the QMC HDLC. > If present, this framer is the interface between the TDM bus used by the > QMC HDLC and the E1/T1 line. > The QMC HDLC can use this framer to get information about the E1/T1 line > and configure the E1/T1 line. > > Signed-off-by: Herve Codina > Signed-off-by: Christophe Leroy > --- > .../devicetree/bindings/net/fsl,qmc-hdlc.yaml | 46 +++ > 1 file changed, 46 insertions(+) > create mode 100644 Documentation/devicetree/bindings/net/fsl,qmc-hdlc.yaml > > diff --git a/Documentation/devicetree/bindings/net/fsl,qmc-hdlc.yaml > b/Documentation/devicetree/bindings/net/fsl,qmc-hdlc.yaml > new file mode 100644 > index ..13f3572f0feb > --- /dev/null > +++ b/Documentation/devicetree/bindings/net/fsl,qmc-hdlc.yaml > @@ -0,0 +1,46 @@ > +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) > +%YAML 1.2 > +--- > +$id: http://devicetree.org/schemas/net/fsl,qmc-hdlc.yaml# > +$schema: http://devicetree.org/meta-schemas/core.yaml# > + > +title: Freescale/NXP QUICC Multichannel Controller (QMC) HDLC > + > +maintainers: > + - Herve Codina > + > +description: | Don't need '|' > + The QMC HDLC uses a QMC (QUICC Multichannel Controller) channel to transfer > + HDLC data. > + > +properties: > + compatible: > +const: fsl,qmc-hdlc > + > + fsl,qmc-chan: > +$ref: /schemas/types.yaml#/definitions/phandle-array > +items: > + - items: > + - description: phandle to QMC node > + - description: Channel number > +description: > + Should be a phandle/number pair. The phandle to QMC node and the QMC > + channel to use. > + > + framer: > +$ref: /schemas/types.yaml#/definitions/phandle > +description: > + phandle to the framer node What's the framer? > + > +required: > + - compatible > + - fsl,qmc-chan > + > +additionalProperties: false > + > +examples: > + - | > +hdlc { > +compatible = "fsl,qmc-hdlc"; > +fsl,qmc-chan = < 16>; Where does this node live? QMC is this[1]? Why don't you just add the compatible to channel@10 in the QMC node? Rob [1] Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,cpm1-scc-qmc.yaml
Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes
On Mon, Aug 21, 2023 at 02:46:53PM -0400, Sean Anderson wrote: > After further review, it seems the reason 28g can get away without this > is because there's a one-to-one mapping between protocol controllers and > lanes. Unfortunately, that regularity is not present for 10g. > > --Sean There are some things I saw in your phy-fsl-lynx-10g.c driver and device tree bindings that I don't understand (the concept of lane groups), and I'm not sure if they're related with what you're saying here, so if you could elaborate a bit more (ideally with an example) on the one-to-one mapping and the specific problems it causes, it would be great. I may be off with my understanding of the regularity you are talking about, but the LX2160 (and Lynx 28G block) also has multi-lane protocols like 40G, 100G, assuming that's what you are talking about. I haven't started yet working on those for the mtip_backplane driver, but I'm not currently seeing a problem with the architecture where a phy_device represents a single lane that's part of a multi-lane port, and not an entire group. In my imagination, there are 2 cases: - all 4 lanes are managed by the single dpaa2-mac consumer (which has 4 phandles, and iterates over them with a "for" loop) - each of the 4 lanes is managed by the respective backplane AN/LT core, and thus, there's one phandle to each lane I sketched some dt-bindings for the second case here, so I guess it must be the first scenario that's somehow problematic? https://patchwork.kernel.org/project/netdevbpf/patch/20230817150644.3605105-9-vladimir.olt...@nxp.com/
[PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd
Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp() thought it had emptied: page lock on the huge page is enough to protect against WP faults (which find the PTE has been cleared), but not enough to protect against userfaultfd. "BUG: Bad rss-counter state" followed. retract_page_tables() protects against this by checking !vma->anon_vma; but we know that MADV_COLLAPSE needs to be able to work on private shmem mappings, even those with an anon_vma prepared for another part of the mapping; and we know that MADV_COLLAPSE needs to work on shared shmem mappings which are userfaultfd_armed(). Whether it needs to work on private shmem mappings which are userfaultfd_armed(), I'm not so sure: but assume that it does. Just for this case, take the pmd_lock() two steps earlier: not because it gives any protection against this case itself, but because ptlock nests inside it, and it's the dropping of ptlock which let the bug in. In other cases, continue to minimize the pmd_lock() hold time. Reported-by: Jann Horn Closes: https://lore.kernel.org/linux-mm/CAG48ez0FxiRC4d3VTu_a9h=rg5fw-kyd5rg5xo_rdbm0ltt...@mail.gmail.com/ Fixes: 1043173eb5eb ("mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()") Signed-off-by: Hugh Dickins --- mm/khugepaged.c | 38 +- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 40d43eccdee8..d5650541083a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1476,7 +1476,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, struct page *hpage; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; - spinlock_t *pml, *ptl; + spinlock_t *pml = NULL, *ptl; int nr_ptes = 0, result = SCAN_FAIL; int i; @@ -1572,9 +1572,25 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); notified = true; - start_pte = pte_offset_map_lock(mm, pmd, haddr, ); + + /* +* pmd_lock covers a wider range than ptl, and (if split from mm's +* page_table_lock) ptl nests inside pml. The less time we hold pml, +* the better; but userfaultfd's mfill_atomic_pte() on a private VMA +* inserts a valid as-if-COWed PTE without even looking up page cache. +* So page lock of hpage does not protect from it, so we must not drop +* ptl before pgt_pmd is removed, so uffd private needs pml taken now. +*/ + if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) + pml = pmd_lock(mm, pmd); + + start_pte = pte_offset_map_nolock(mm, pmd, haddr, ); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; + if (!pml) + spin_lock(ptl); + else if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; @@ -1608,7 +1624,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, nr_ptes++; } - pte_unmap_unlock(start_pte, ptl); + pte_unmap(start_pte); + if (!pml) + spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ if (nr_ptes) { @@ -1616,12 +1634,12 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); } - /* step 4: remove page table */ - - /* Huge page lock is still held, so page table must remain empty */ - pml = pmd_lock(mm, pmd); - if (ptl != pml) - spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + /* step 4: remove empty page table */ + if (!pml) { + pml = pmd_lock(mm, pmd); + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); if (ptl != pml) @@ -1648,6 +1666,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, } if (start_pte) pte_unmap_unlock(start_pte, ptl); + if (pml && pml != ptl) + spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(); drop_hpage: -- 2.35.3
Re: [RFC PATCH v11 28/29] KVM: selftests: Add basic selftest for guest_memfd()
Sean Christopherson writes: > On Mon, Aug 07, 2023, Ackerley Tng wrote: >> Sean Christopherson writes: >> >> > Add a selftest to verify the basic functionality of guest_memfd(): >> > >> > >> >> Here's one more test: > > First off, thank you! I greatly appreciate all the selftests work you (and > others!) have been doing. > > For v2, can you please post a standalone patch? My workflow barfs on > unrelated, > inlined patches. I'm guessing I can get b4 to play nice, but it's easier to > just > yell at people :-) > Here's a standalone patch :) https://lore.kernel.org/lkml/20230821194411.2165757-1-ackerley...@google.com/ >
Re: [BUG] Re: [PATCH v3 10/13] mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()
On Mon, 14 Aug 2023, Jann Horn wrote: > On Wed, Jul 12, 2023 at 6:42 AM Hugh Dickins wrote: > > Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp(). > > It does need mmap_read_lock(), but it does not need mmap_write_lock(), > > nor vma_start_write() nor i_mmap lock nor anon_vma lock. All racing > > paths are relying on pte_offset_map_lock() and pmd_lock(), so use those. > > We can still have a racing userfaultfd operation at the "/* step 4: > remove page table */" point that installs a new PTE before the page > table is removed. And you've been very polite not to remind me that this is exactly what you warned me about, in connection with retract_page_tables(), nearly three months ago: https://lore.kernel.org/linux-mm/cag48ez0af1rf1apsjn9ycnfyfq4yqsd4gqb6f2wfhf7jmdi...@mail.gmail.com/ > > To reproduce, patch a delay into the kernel like this: > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index 9a6e0d507759..27cc8dfbf3a7 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -20,6 +20,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -1617,6 +1618,11 @@ int collapse_pte_mapped_thp(struct mm_struct > *mm, unsigned long addr, > } > > /* step 4: remove page table */ > + if (strcmp(current->comm, "DELAYME") == 0) { > + pr_warn("%s: BEGIN DELAY INJECTION\n", __func__); > + mdelay(5000); > + pr_warn("%s: END DELAY INJECTION\n", __func__); > + } > > /* Huge page lock is still held, so page table must remain empty */ > pml = pmd_lock(mm, pmd); > > > And then run the attached reproducer against mm/mm-everything. You > should get this in dmesg: > > [ 206.578096] BUG: Bad rss-counter state mm:0942ebea > type:MM_ANONPAGES val:1 Very helpful, thank you Jann. I got a bit distracted when I then found mm's recent addition of UFFDIO_POISON: thought I needed to change both collapse_pte_mapped_thp() and retract_page_tables() now to cope with mfill_atomic_pte_poison() inserting into even a userfaultfd_armed shared VMA. But eventually, on second thoughts, realized that's only inserting a pte marker, invalid, so won't cause any actual trouble. A little untidy, to leave that behind in a supposedly empty page table about to be freed, but not worth refactoring these functions to avoid a non-bug. And though syzbot and JH may find some fun with it, I don't think any real application would be insertng a PTE_MARKER_POISONED where a huge page collapse is almost complete. So I scaled back to a more proportionate fix, following. Sorry, I've slightly messed up applying the "DELAY INJECTION" patch above: not intentional, honest! (mdelay while holding the locks is still good.) Hugh
Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory
On Mon, Aug 21, 2023, Ackerley Tng wrote: > Sean Christopherson writes: > > > On Tue, Aug 15, 2023, Ackerley Tng wrote: > >> Sean Christopherson writes: > >> > Nullifying the KVM pointer isn't sufficient, because without additional > >> > actions > >> > userspace could extract data from a VM by deleting its memslots and then > >> > binding > >> > the guest_memfd to an attacker controlled VM. Or more likely with TDX > >> > and SNP, > >> > induce badness by coercing KVM into mapping memory into a guest with the > >> > wrong > >> > ASID/HKID. > >> > > >> > I can think of three ways to handle that: > >> > > >> > (a) prevent a different VM from *ever* binding to the gmem instance > >> > (b) free/zero physical pages when unbinding > >> > (c) free/zero when binding to a different VM > >> > > >> > Option (a) is easy, but that pretty much defeats the purpose of > >> > decopuling > >> > guest_memfd from a VM. > >> > > >> > Option (b) isn't hard to implement, but it screws up the lifecycle of > >> > the memory, > >> > e.g. would require memory when a memslot is deleted. That isn't > >> > necessarily a > >> > deal-breaker, but it runs counter to how KVM memlots currently operate. > >> > Memslots > >> > are basically just weird page tables, e.g. deleting a memslot doesn't > >> > have any > >> > impact on the underlying data in memory. TDX throws a wrench in this as > >> > removing > >> > a page from the Secure EPT is effectively destructive to the data (can't > >> > be mapped > >> > back in to the VM without zeroing the data), but IMO that's an oddity > >> > with TDX and > >> > not necessarily something we want to carry over to other VM types. > >> > > >> > There would also be performance implications (probably a non-issue in > >> > practice), > >> > and weirdness if/when we get to sharing, linking and/or mmap()ing gmem. > >> > E.g. what > >> > should happen if the last memslot (binding) is deleted, but there > >> > outstanding userspace > >> > mappings? > >> > > >> > Option (c) is better from a lifecycle perspective, but it adds its own > >> > flavor of > >> > complexity, e.g. the performant way to reclaim TDX memory requires the > >> > TDMR > >> > (effectively the VM pointer), and so a deferred relcaim doesn't really > >> > work for > >> > TDX. And I'm pretty sure it *can't* work for SNP, because RMP entries > >> > must not > >> > outlive the VM; KVM can't reuse an ASID if there are pages assigned to > >> > that ASID > >> > in the RMP, i.e. until all memory belonging to the VM has been fully > >> > freed. ... > I agree with you that nulling the KVM pointer is insufficient to keep > host userspace out of the TCB. Among the three options (a) preventing a > different VM (HKID/ASID) from binding to the gmem instance, or zeroing > the memory either (b) on unbinding, or (c) on binding to another VM > (HKID/ASID), > > (a) sounds like adding a check issued to TDX/SNP upon binding and this > check would just return OK for software-protected VMs (line of sight > to removing host userspace from TCB). > > Or, we could go further for software-protected VMs and add tracking in > the inode to prevent the same inode from being bound to different > "HKID/ASID"s, perhaps like this: > > + On first binding, store the KVM pointer in the inode - not file (but > not hold a refcount) > + On rebinding, check that the KVM matches the pointer in the inode > + On intra-host migration, update the KVM pointer in the inode to allow > binding to the new struct kvm > > I think you meant associating the file with a struct kvm at creation > time as an implementation for (a), but technically since the inode is > the representation of memory, tracking of struct kvm should be with the > inode instead of the file. > > (b) You're right that this messes up the lifecycle of the memory and > wouldn't work with intra-host migration. > > (c) sounds like doing the clearing on a check similar to that of (a) Sort of, though it's much nastier, because it requires the "old" KVM instance to be alive enough to support various operations. I.e. we'd have to make stronger guarantees about exactly when the handoff/transition could happen. > If we track struct kvm with the inode, then I think (a), (b) and (c) can > be independent of the refcounting method. What do you think? No go. Because again, the inode (physical memory) is coupled to the virtual machine as a thing, not to a "struct kvm". Or more concretely, the inode is coupled to an ASID or an HKID, and there can be multiple "struct kvm" objects associated with a single ASID. And at some point in the future, I suspect we'll have multiple KVM objects per HKID too. The current SEV use case is for the migration helper, where two KVM objects share a single ASID (the "real" VM and the helper). I suspect TDX will end up with similar behavior where helper "VMs" can use the HKID of the "real" VM. For KVM, that means multiple struct kvm objects being
Re: [PATCH v4 21/28] net: wan: Add framer framework support
On Mon, 21 Aug 2023 05:19:22 + Christophe Leroy wrote: > As I said in the cover letter, this series only fixes critical build > failures that happened when CONFIG_MODULES is set. The purpose was to > allow robots to perform their job up to the end. Other feedback and > comments will be taken into account by Hervé when he is back from holidays. I missed this too, FTR this is unacceptable. Quoting documentation: **Do not** post your patches just to run them through the checks. You must ensure that your patches are ready by testing them locally before posting to the mailing list. The patchwork build bot instance gets overloaded very easily and netdev@vger really doesn't need more traffic if we can help it. See: https://www.kernel.org/doc/html/next/process/maintainer-netdev.html#patchwork-checks
Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes
On 8/21/23 14:13, Ioana Ciornei wrote: > On Mon, Aug 21, 2023 at 01:45:44PM -0400, Sean Anderson wrote: >> Well, we have two pieces of information we need >> >> - What values do we need to program in the PCCRs to select a particular >> mode? This includes whether to e.g. set the KX bits. >> - Implied by the above, what protocols are supported on which lanes? >> This is not strictly necessary, but will certainly solve a lot of >> headscratching. >> >> This information varies between different socs, and different serdes on >> the same socs. We can't really look at the RCW or the clocks and figure >> out what we need to program. So what are our options? >> >> - We can have a separate compatible for each serdes on each SoC (e.g. >> "fsl,lynx-10g-a"). This was rejected by the devicetree maintainers. >> - We can have one compatible for each SoC, and determine the serdes >> based on the address. I would like to avoid this... > > To me this really seems like a straightforward approach. Indeed it would be straightforward, but what's the point of having a devicetree in the first place then? We could just go back to being a (non-dt) platform device. >> - We can stick all the details which vary between serdes/socs into the >> device tree. This is very flexible, since supporting new SoCs is >> mostly a matter of adding a new compatible and writing a new >> devicetree. On the other hand, if you have a bug in your devicetree, >> it's not easy to fix it in the kernel. >> - Just don't support protocol switching. The 28G driver does this, which >> is why it only has one compatible. However, supporting protocol >> switching is a core goal of this driver, so dropping support is not an >> option. >> > > The Lynx 28G SerDes driver does support protocol switching. > How did you arrive at the opposite conclusion? Sorry, it's been a while and I just did a quick look-over, and noticed there was no configuration for different SoCs. After further review, it seems the reason 28g can get away without this is because there's a one-to-one mapping between protocol controllers and lanes. Unfortunately, that regularity is not present for 10g. --Sean > The initial commit on the driver is even part of a patch set named > "dpaa2-mac: add support for changing the protocol at runtime". In > upstream it only supports the 1G <-> 10G transition but I have some > patches on the way to also support 25G. > > Ioana
Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes
On Mon, Aug 21, 2023 at 09:13:49PM +0300, Ioana Ciornei wrote: > > - We can have one compatible for each SoC, and determine the serdes > > based on the address. I would like to avoid this... > > To me this really seems like a straightforward approach. +1
Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes
On Mon, Aug 21, 2023 at 01:45:44PM -0400, Sean Anderson wrote: > Well, we have two pieces of information we need > > - What values do we need to program in the PCCRs to select a particular > mode? This includes whether to e.g. set the KX bits. > - Implied by the above, what protocols are supported on which lanes? > This is not strictly necessary, but will certainly solve a lot of > headscratching. > > This information varies between different socs, and different serdes on > the same socs. We can't really look at the RCW or the clocks and figure > out what we need to program. So what are our options? > > - We can have a separate compatible for each serdes on each SoC (e.g. > "fsl,lynx-10g-a"). This was rejected by the devicetree maintainers. > - We can have one compatible for each SoC, and determine the serdes > based on the address. I would like to avoid this... To me this really seems like a straightforward approach. > - We can stick all the details which vary between serdes/socs into the > device tree. This is very flexible, since supporting new SoCs is > mostly a matter of adding a new compatible and writing a new > devicetree. On the other hand, if you have a bug in your devicetree, > it's not easy to fix it in the kernel. > - Just don't support protocol switching. The 28G driver does this, which > is why it only has one compatible. However, supporting protocol > switching is a core goal of this driver, so dropping support is not an > option. > The Lynx 28G SerDes driver does support protocol switching. How did you arrive at the opposite conclusion? The initial commit on the driver is even part of a patch set named "dpaa2-mac: add support for changing the protocol at runtime". In upstream it only supports the 1G <-> 10G transition but I have some patches on the way to also support 25G. Ioana
Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes
On 8/21/23 08:49, Vladimir Oltean wrote: > Hi Sean, > > On Fri, Aug 11, 2023 at 07:36:37PM +0300, Vladimir Oltean wrote: >> Let me explain that approach, because your mention of "swapping out the >> bootloaders" makes it appear as if you are not visualising what I am >> proposing. >> >> The Lynx SerDes family has 2 PLLs, and more lanes (4 or 8). Each lane >> uses one PLL or the other, to derive its protocol frequency. Through the >> RCW, you provision the 2 PLL frequencies that may be used by the lanes >> at runtime. >> >> The Lynx 28G SerDes driver reads the PLL frequencies in >> lynx_28g_pll_read_configuration(), and determines the interface modes >> supportable by each PLL (this is used by phylink). But it never changes >> those PLL frequencies, since that operation is practically impossible in >> the general sense (PLLs are shared by multiple lanes, so changing a PLL >> frequency disrupts all lanes that use it). > > Is my high-level feedback clear and actionable to you? I am suggesting > to keep the look and feel the same between the lynx-10g and lynx-28g > drivers, and to not use "fsl,type" protocols listed in the device tree > as the immutable source of information for populating mode->protos, but > instead the current PLL frequency configuration. So this implies that I > am requesting that the dt-bindings should not contain a listing of the > supported protocols. Well, we have two pieces of information we need - What values do we need to program in the PCCRs to select a particular mode? This includes whether to e.g. set the KX bits. - Implied by the above, what protocols are supported on which lanes? This is not strictly necessary, but will certainly solve a lot of headscratching. This information varies between different socs, and different serdes on the same socs. We can't really look at the RCW or the clocks and figure out what we need to program. So what are our options? - We can have a separate compatible for each serdes on each SoC (e.g. "fsl,lynx-10g-a"). This was rejected by the devicetree maintainers. - We can have one compatible for each SoC, and determine the serdes based on the address. I would like to avoid this... - We can stick all the details which vary between serdes/socs into the device tree. This is very flexible, since supporting new SoCs is mostly a matter of adding a new compatible and writing a new devicetree. On the other hand, if you have a bug in your devicetree, it's not easy to fix it in the kernel. - Just don't support protocol switching. The 28G driver does this, which is why it only has one compatible. However, supporting protocol switching is a core goal of this driver, so dropping support is not an option. I'm open to any other suggestions you have, but this was the best way I could figure out to get this information to the driver in a way which would be acceptable to the devicetree folks. --Sean
Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory
Sean Christopherson writes: > On Tue, Aug 15, 2023, Ackerley Tng wrote: >> Sean Christopherson writes: >> >> >> I feel that memslots form a natural way of managing usage of the gmem >> >> file. When a memslot is created, it is using the file; hence we take a >> >> refcount on the gmem file, and as memslots are removed, we drop >> >> refcounts on the gmem file. >> > >> > Yes and no. It's definitely more natural *if* the goal is to allow >> > guest_memfd >> > memory to exist without being attached to a VM. But I'm not at all >> > convinced >> > that we want to allow that, or that it has desirable properties. With TDX >> > and >> > SNP in particuarly, I'm pretty sure that allowing memory to outlive the VM >> > is >> > very underisable (more below). >> > >> >> This is a little confusing, with the file/inode split in gmem where the >> physical memory/data is attached to the inode and the file represents >> the VM's view of that memory, won't the memory outlive the VM? > > Doh, I overloaded the term "VM". By "VM" I meant the virtual machine as a > "thing" > the rest of the world sees and interacts with, not the original "struct kvm" > object. > > Because yes, you're absolutely correct that the memory will outlive "struct > kvm", > but it won't outlive the virtual machine, and specifically won't outlive the > ASID (SNP) / HKID (TDX) to which it's bound. > Yup we agree on this now :) The memory should not outlive the the ASID (SNP) / HKID (TDX) to which it's bound. >> This [1] POC was built based on that premise, that the gmem inode can be >> linked to another file and handed off to another VM, to facilitate >> intra-host migration, where the point is to save the work of rebuilding >> the VM's memory in the destination VM. >> >> With this, the bindings don't outlive the VM, but the data/memory >> does. I think this split design you proposed is really nice. >> >> >> The KVM pointer is shared among all the bindings in gmem’s xarray, and we >> >> can >> >> enforce that a gmem file is used only with one VM: >> >> >> >> + When binding a memslot to the file, if a kvm pointer exists, it must >> >> be the same kvm as the one in this binding >> >> + When the binding to the last memslot is removed from a file, NULL the >> >> kvm pointer. >> > >> > Nullifying the KVM pointer isn't sufficient, because without additional >> > actions >> > userspace could extract data from a VM by deleting its memslots and then >> > binding >> > the guest_memfd to an attacker controlled VM. Or more likely with TDX and >> > SNP, >> > induce badness by coercing KVM into mapping memory into a guest with the >> > wrong >> > ASID/HKID. >> > >> > I can think of three ways to handle that: >> > >> > (a) prevent a different VM from *ever* binding to the gmem instance >> > (b) free/zero physical pages when unbinding >> > (c) free/zero when binding to a different VM >> > >> > Option (a) is easy, but that pretty much defeats the purpose of decopuling >> > guest_memfd from a VM. >> > >> > Option (b) isn't hard to implement, but it screws up the lifecycle of the >> > memory, >> > e.g. would require memory when a memslot is deleted. That isn't >> > necessarily a >> > deal-breaker, but it runs counter to how KVM memlots currently operate. >> > Memslots >> > are basically just weird page tables, e.g. deleting a memslot doesn't have >> > any >> > impact on the underlying data in memory. TDX throws a wrench in this as >> > removing >> > a page from the Secure EPT is effectively destructive to the data (can't >> > be mapped >> > back in to the VM without zeroing the data), but IMO that's an oddity with >> > TDX and >> > not necessarily something we want to carry over to other VM types. >> > >> > There would also be performance implications (probably a non-issue in >> > practice), >> > and weirdness if/when we get to sharing, linking and/or mmap()ing gmem. >> > E.g. what >> > should happen if the last memslot (binding) is deleted, but there >> > outstanding userspace >> > mappings? >> > >> > Option (c) is better from a lifecycle perspective, but it adds its own >> > flavor of >> > complexity, e.g. the performant way to reclaim TDX memory requires the TDMR >> > (effectively the VM pointer), and so a deferred relcaim doesn't really >> > work for >> > TDX. And I'm pretty sure it *can't* work for SNP, because RMP entries >> > must not >> > outlive the VM; KVM can't reuse an ASID if there are pages assigned to >> > that ASID >> > in the RMP, i.e. until all memory belonging to the VM has been fully freed. >> > >> >> If we are on the same page that the memory should outlive the VM but not >> the bindings, then associating the gmem inode to a new VM should be a >> feature and not a bug. >> >> What do we want to defend against here? >> >> (a) Malicious host VMM >> >> For a malicious host VMM to read guest memory (with TDX and SNP), it can >> create a new VM with the same HKID/ASID as the victim VM, rebind the >> gmem inode
Re: [PATCH v5 0/3 RESEND] sed-opal: keyrings, discovery, revert, key store
On Wed, 2023-08-16 at 23:41 +0300, Jarkko Sakkinen wrote: > On Wed Aug 16, 2023 at 10:45 PM EEST, Greg Joyce wrote: > > It's been almost 4 weeks since the last resend and there haven't > > been > > any comments. Is there anything that needs to be changed for > > acceptance? > > > > Thanks for your input. > > > > Greg > > > > On Fri, 2023-07-21 at 16:15 -0500, gjo...@linux.vnet.ibm.com wrote: > > > From: Greg Joyce > > > > > > This patchset has gone through numerous rounds of review and > > > all comments/suggetions have been addressed. The reviews have > > > covered all relevant areas including reviews by block and keyring > > > developers as well as the SED Opal maintainer. The last > > > patchset submission has not solicited any responses in the > > > six weeks since it was last distributed. The changes are > > > generally useful and ready for inclusion. > > > > > > TCG SED Opal is a specification from The Trusted Computing Group > > > that allows self encrypting storage devices (SED) to be locked at > > > power on and require an authentication key to unlock the drive. > > > > > > The current SED Opal implementation in the block driver > > > requires that authentication keys be provided in an ioctl > > > so that they can be presented to the underlying SED > > > capable drive. Currently, the key is typically entered by > > > a user with an application like sedutil or sedcli. While > > > this process works, it does not lend itself to automation > > > like unlock by a udev rule. > > > > > > The SED block driver has been extended so it can alternatively > > > obtain a key from a sed-opal kernel keyring. The SED ioctls > > > will indicate the source of the key, either directly in the > > > ioctl data or from the keyring. > > > > > > Two new SED ioctls have also been added. These are: > > > 1) IOC_OPAL_REVERT_LSP to revert LSP state > > > 2) IOC_OPAL_DISCOVERY to discover drive capabilities/state > > > > > > change log v5: > > > - rebase to for-6.5/block > > > > > > change log v4: > > > - rebase to 6.3-rc7 > > > - replaced "255" magic number with U8_MAX > > > > > > change log: > > > - rebase to 6.x > > > - added latest reviews > > > - removed platform functions for persistent key storage > > > - replaced key update logic with key_create_or_update() > > > - minor bracing and padding changes > > > - add error returns > > > - opal_key structure is application provided but kernel > > > verified > > > - added brief description of TCG SED Opal > > > > > > > > > Greg Joyce (3): > > > block: sed-opal: Implement IOC_OPAL_DISCOVERY > > > block: sed-opal: Implement IOC_OPAL_REVERT_LSP > > > block: sed-opal: keyring support for SED keys > > > > > > block/Kconfig | 2 + > > > block/opal_proto.h| 4 + > > > block/sed-opal.c | 252 > > > +- > > > include/linux/sed-opal.h | 5 + > > > include/uapi/linux/sed-opal.h | 25 +++- > > > 5 files changed, 282 insertions(+), 6 deletions(-) > > > > > > > > > base-commit: 1341c7d2ccf42ed91aea80b8579d35bc1ea381e2 > > I can give because it looks good to me to all patches: > > Acked-by: Jarkko Sakkinen > > ... but should not probably go to my tree. > > BR, Jarkko Thanks for the ack Jarkko. Any thoughts on which tree it should go to? Greg
[PATCH 2/2] powerpc/powernv: Use struct opal_prd_msg in more places
Use the newly added struct opal_prd_msg in some other functions that operate on opal_prd messages, rather than using other types. Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-prd.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index 40e26e9f318f..327e2f76905d 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -221,8 +221,8 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct opal_prd_msg_header hdr; + struct opal_prd_msg *msg; ssize_t size; - void *msg; int rc; size = sizeof(hdr); @@ -254,12 +254,12 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf, static int opal_prd_release(struct inode *inode, struct file *file) { - struct opal_prd_msg_header msg; + struct opal_prd_msg msg; - msg.size = cpu_to_be16(sizeof(msg)); - msg.type = OPAL_PRD_MSG_TYPE_FINI; + msg.header.size = cpu_to_be16(sizeof(msg)); + msg.header.type = OPAL_PRD_MSG_TYPE_FINI; - opal_prd_msg((struct opal_prd_msg *)); + opal_prd_msg(); atomic_xchg(_usage, 0); -- 2.41.0
[PATCH 1/2] powerpc/powernv: Fix fortify source warnings in opal-prd.c
As reported by Mahesh & Aneesh, opal_prd_msg_notifier() triggers a FORTIFY_SOURCE warning: memcpy: detected field-spanning write (size 32) of single field ">msg" at arch/powerpc/platforms/powernv/opal-prd.c:355 (size 4) WARNING: CPU: 9 PID: 660 at arch/powerpc/platforms/powernv/opal-prd.c:355 opal_prd_msg_notifier+0x174/0x188 [opal_prd] NIP opal_prd_msg_notifier+0x174/0x188 [opal_prd] LR opal_prd_msg_notifier+0x170/0x188 [opal_prd] Call Trace: opal_prd_msg_notifier+0x170/0x188 [opal_prd] (unreliable) notifier_call_chain+0xc0/0x1b0 atomic_notifier_call_chain+0x2c/0x40 opal_message_notify+0xf4/0x2c0 This happens because the copy is targetting item->msg, which is only 4 bytes in size, even though the enclosing item was allocated with extra space following the msg. To fix the warning define struct opal_prd_msg with a union of the header and a flex array, and have the memcpy target the flex array. Reported-by: Aneesh Kumar K.V Reported-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-prd.c | 17 - 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index 113bdb151f68..40e26e9f318f 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -24,13 +24,20 @@ #include +struct opal_prd_msg { + union { + struct opal_prd_msg_header header; + DECLARE_FLEX_ARRAY(u8, data); + }; +}; + /* * The msg member must be at the end of the struct, as it's followed by the * message data. */ struct opal_prd_msg_queue_item { - struct list_headlist; - struct opal_prd_msg_header msg; + struct list_headlist; + struct opal_prd_msg msg; }; static struct device_node *prd_node; @@ -156,7 +163,7 @@ static ssize_t opal_prd_read(struct file *file, char __user *buf, int rc; /* we need at least a header's worth of data */ - if (count < sizeof(item->msg)) + if (count < sizeof(item->msg.header)) return -EINVAL; if (*ppos) @@ -186,7 +193,7 @@ static ssize_t opal_prd_read(struct file *file, char __user *buf, return -EINTR; } - size = be16_to_cpu(item->msg.size); + size = be16_to_cpu(item->msg.header.size); if (size > count) { err = -EINVAL; goto err_requeue; @@ -352,7 +359,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb, if (!item) return -ENOMEM; - memcpy(>msg, msg->params, msg_size); + memcpy(>msg.data, msg->params, msg_size); spin_lock_irqsave(_prd_msg_queue_lock, flags); list_add_tail(>list, _prd_msg_queue); -- 2.41.0
[PATCH] powerpc/powermac: Fix unused function warning
Clang reports: arch/powerpc/platforms/powermac/feature.c:137:19: error: unused function 'simple_feature_tweak' It's only used inside the #ifndef CONFIG_PPC64 block, so move it in there to fix the warning. While at it drop the inline, the compiler will decide whether it should be inlined or not. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308181501.ar5hmdwc-...@intel.com/ Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powermac/feature.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index ed472b797e28..ae62d432db8b 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -134,8 +134,10 @@ static struct pmac_mb_def pmac_mb; * Here are the chip specific feature functions */ -static inline int simple_feature_tweak(struct device_node *node, int type, - int reg, u32 mask, int value) +#ifndef CONFIG_PPC64 + +static int simple_feature_tweak(struct device_node *node, int type, int reg, + u32 mask, int value) { struct macio_chip* macio; unsigned long flags; @@ -154,8 +156,6 @@ static inline int simple_feature_tweak(struct device_node *node, int type, return 0; } -#ifndef CONFIG_PPC64 - static long ohare_htw_scc_enable(struct device_node *node, long param, long value) { -- 2.41.0
[PATCH rfc v2 02/10] arm64: mm: use try_vma_locked_page_fault()
Use new try_vma_locked_page_fault() helper to simplify code, also pass struct vmf to __do_page_fault() directly instead of each independent variable. No functional change intended. Signed-off-by: Kefeng Wang --- arch/arm64/mm/fault.c | 60 --- 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 2e5d1e238af9..2b7a1e610b3e 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -498,9 +498,8 @@ static void do_bad_area(unsigned long far, unsigned long esr, #define VM_FAULT_BADACCESS ((__force vm_fault_t)0x02) static vm_fault_t __do_page_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, - unsigned int mm_flags, unsigned long vm_flags, - struct pt_regs *regs) + struct vm_area_struct *vma, + struct vm_fault *vmf) { /* * Ok, we have a good vm_area for this memory access, so we can handle @@ -508,9 +507,9 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm, * Check that the permissions on the VMA allow for the fault which * occurred. */ - if (!(vma->vm_flags & vm_flags)) + if (!(vma->vm_flags & vmf->vm_flags)) return VM_FAULT_BADACCESS; - return handle_mm_fault(vma, addr, mm_flags, regs); + return handle_mm_fault(vma, vmf->real_address, vmf->flags, vmf->regs); } static bool is_el0_instruction_abort(unsigned long esr) @@ -533,10 +532,12 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, const struct fault_info *inf; struct mm_struct *mm = current->mm; vm_fault_t fault; - unsigned long vm_flags; - unsigned int mm_flags = FAULT_FLAG_DEFAULT; unsigned long addr = untagged_addr(far); struct vm_area_struct *vma; + struct vm_fault vmf = { + .real_address = addr, + .flags = FAULT_FLAG_DEFAULT, + }; if (kprobe_page_fault(regs, esr)) return 0; @@ -549,7 +550,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto no_context; if (user_mode(regs)) - mm_flags |= FAULT_FLAG_USER; + vmf.flags |= FAULT_FLAG_USER; /* * vm_flags tells us what bits we must have in vma->vm_flags @@ -559,20 +560,20 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, */ if (is_el0_instruction_abort(esr)) { /* It was exec fault */ - vm_flags = VM_EXEC; - mm_flags |= FAULT_FLAG_INSTRUCTION; + vmf.vm_flags = VM_EXEC; + vmf.flags |= FAULT_FLAG_INSTRUCTION; } else if (is_write_abort(esr)) { /* It was write fault */ - vm_flags = VM_WRITE; - mm_flags |= FAULT_FLAG_WRITE; + vmf.vm_flags = VM_WRITE; + vmf.flags |= FAULT_FLAG_WRITE; } else { /* It was read fault */ - vm_flags = VM_READ; + vmf.vm_flags = VM_READ; /* Write implies read */ - vm_flags |= VM_WRITE; + vmf.vm_flags |= VM_WRITE; /* If EPAN is absent then exec implies read */ if (!cpus_have_const_cap(ARM64_HAS_EPAN)) - vm_flags |= VM_EXEC; + vmf.vm_flags |= VM_EXEC; } if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) { @@ -587,26 +588,11 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); - if (!(mm_flags & FAULT_FLAG_USER)) - goto lock_mmap; - - vma = lock_vma_under_rcu(mm, addr); - if (!vma) - goto lock_mmap; - - if (!(vma->vm_flags & vm_flags)) { - vma_end_read(vma); - goto lock_mmap; - } - fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); - if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) - vma_end_read(vma); - - if (!(fault & VM_FAULT_RETRY)) { - count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + fault = try_vma_locked_page_fault(); + if (fault == VM_FAULT_NONE) + goto retry; + if (!(fault & VM_FAULT_RETRY)) goto done; - } - count_vm_vma_lock_event(VMA_LOCK_RETRY); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { @@ -614,8 +600,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto no_context; return 0; } -lock_mmap: - retry:
[PATCH rfc v2 04/10] s390: mm: use try_vma_locked_page_fault()
Use new try_vma_locked_page_fault() helper to simplify code. No functional change intended. Signed-off-by: Kefeng Wang --- arch/s390/mm/fault.c | 66 ++-- 1 file changed, 27 insertions(+), 39 deletions(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 099c4824dd8a..fbbdebde6ea7 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -357,16 +357,18 @@ static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault) static inline vm_fault_t do_exception(struct pt_regs *regs, int access) { struct gmap *gmap; - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; enum fault_type type; - unsigned long address; - unsigned int flags; + struct mm_struct *mm = current->mm; + unsigned long address = get_fault_address(regs); vm_fault_t fault; bool is_write; + struct vm_fault vmf = { + .real_address = address, + .flags = FAULT_FLAG_DEFAULT, + .vm_flags = access, + }; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. @@ -376,8 +378,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) if (kprobe_page_fault(regs, 14)) return 0; - mm = tsk->mm; - address = get_fault_address(regs); is_write = fault_is_write(regs); /* @@ -398,45 +398,33 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) } perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) - flags |= FAULT_FLAG_USER; + vmf.flags |= FAULT_FLAG_USER; if (is_write) - access = VM_WRITE; - if (access == VM_WRITE) - flags |= FAULT_FLAG_WRITE; - if (!(flags & FAULT_FLAG_USER)) - goto lock_mmap; - vma = lock_vma_under_rcu(mm, address); - if (!vma) - goto lock_mmap; - if (!(vma->vm_flags & access)) { - vma_end_read(vma); - goto lock_mmap; - } - fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) - vma_end_read(vma); - if (!(fault & VM_FAULT_RETRY)) { - count_vm_vma_lock_event(VMA_LOCK_SUCCESS); - if (likely(!(fault & VM_FAULT_ERROR))) - fault = 0; + vmf.vm_flags = VM_WRITE; + if (vmf.vm_flags == VM_WRITE) + vmf.flags |= FAULT_FLAG_WRITE; + + fault = try_vma_locked_page_fault(); + if (fault == VM_FAULT_NONE) + goto lock_mm; + if (!(fault & VM_FAULT_RETRY)) goto out; - } - count_vm_vma_lock_event(VMA_LOCK_RETRY); + /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; goto out; } -lock_mmap: + +lock_mm: mmap_read_lock(mm); gmap = NULL; if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { gmap = (struct gmap *) S390_lowcore.gmap; current->thread.gmap_addr = address; - current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); + current->thread.gmap_write_flag = !!(vmf.flags & FAULT_FLAG_WRITE); current->thread.gmap_int_code = regs->int_code & 0x; address = __gmap_translate(gmap, address); if (address == -EFAULT) { @@ -444,7 +432,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out_up; } if (gmap->pfault_enabled) - flags |= FAULT_FLAG_RETRY_NOWAIT; + vmf.flags |= FAULT_FLAG_RETRY_NOWAIT; } retry: @@ -466,7 +454,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) * we can handle it.. */ fault = VM_FAULT_BADACCESS; - if (unlikely(!(vma->vm_flags & access))) + if (unlikely(!(vma->vm_flags & vmf.vm_flags))) goto out_up; /* @@ -474,10 +462,10 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, regs); + fault = handle_mm_fault(vma, address, vmf.flags, regs); if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; - if (flags & FAULT_FLAG_RETRY_NOWAIT) + if (vmf.flags & FAULT_FLAG_RETRY_NOWAIT) goto out_up;
[PATCH rfc v2 03/10] x86: mm: use try_vma_locked_page_fault()
Use new try_vma_locked_page_fault() helper to simplify code. No functional change intended. Signed-off-by: Kefeng Wang --- arch/x86/mm/fault.c | 55 +++-- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ab778eac1952..3edc9edc0b28 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1227,6 +1227,13 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, } NOKPROBE_SYMBOL(do_kern_addr_fault); +#ifdef CONFIG_PER_VMA_LOCK +bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return access_error(vmf->fault_code, vma); +} +#endif + /* * Handle faults in the user portion of the address space. Nothing in here * should check X86_PF_USER without a specific justification: for almost @@ -1241,13 +1248,13 @@ void do_user_addr_fault(struct pt_regs *regs, unsigned long address) { struct vm_area_struct *vma; - struct task_struct *tsk; - struct mm_struct *mm; + struct mm_struct *mm = current->mm; vm_fault_t fault; - unsigned int flags = FAULT_FLAG_DEFAULT; - - tsk = current; - mm = tsk->mm; + struct vm_fault vmf = { + .real_address = address, + .fault_code = error_code, + .flags = FAULT_FLAG_DEFAULT + }; if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { /* @@ -1311,7 +1318,7 @@ void do_user_addr_fault(struct pt_regs *regs, */ if (user_mode(regs)) { local_irq_enable(); - flags |= FAULT_FLAG_USER; + vmf.flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); @@ -1326,11 +1333,11 @@ void do_user_addr_fault(struct pt_regs *regs, * maybe_mkwrite() can create a proper shadow stack PTE. */ if (error_code & X86_PF_SHSTK) - flags |= FAULT_FLAG_WRITE; + vmf.flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_WRITE) - flags |= FAULT_FLAG_WRITE; + vmf.flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) - flags |= FAULT_FLAG_INSTRUCTION; + vmf.flags |= FAULT_FLAG_INSTRUCTION; #ifdef CONFIG_X86_64 /* @@ -1350,26 +1357,11 @@ void do_user_addr_fault(struct pt_regs *regs, } #endif - if (!(flags & FAULT_FLAG_USER)) - goto lock_mmap; - - vma = lock_vma_under_rcu(mm, address); - if (!vma) - goto lock_mmap; - - if (unlikely(access_error(error_code, vma))) { - vma_end_read(vma); - goto lock_mmap; - } - fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) - vma_end_read(vma); - - if (!(fault & VM_FAULT_RETRY)) { - count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + fault = try_vma_locked_page_fault(); + if (fault == VM_FAULT_NONE) + goto retry; + if (!(fault & VM_FAULT_RETRY)) goto done; - } - count_vm_vma_lock_event(VMA_LOCK_RETRY); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { @@ -1379,7 +1371,6 @@ void do_user_addr_fault(struct pt_regs *regs, ARCH_DEFAULT_PKEY); return; } -lock_mmap: retry: vma = lock_mm_and_find_vma(mm, address, regs); @@ -1410,7 +1401,7 @@ void do_user_addr_fault(struct pt_regs *regs, * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ - fault = handle_mm_fault(vma, address, flags, regs); + fault = handle_mm_fault(vma, address, vmf.flags, regs); if (fault_signal_pending(fault, regs)) { /* @@ -1434,7 +1425,7 @@ void do_user_addr_fault(struct pt_regs *regs, * that we made any progress. Handle this case first. */ if (unlikely(fault & VM_FAULT_RETRY)) { - flags |= FAULT_FLAG_TRIED; + vmf.flags |= FAULT_FLAG_TRIED; goto retry; } -- 2.27.0
[PATCH rfc v2 07/10] ARM: mm: try VMA lock-based page fault handling first
Attempt VMA lock-based page fault handling first, and fall back to the existing mmap_lock-based handling if that fails. Signed-off-by: Kefeng Wang --- arch/arm/Kconfig| 1 + arch/arm/mm/fault.c | 35 +-- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 1a6a6eb48a15..8b6d4507ccee 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -34,6 +34,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE + select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index fef62e4a9edd..d53bb028899a 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -242,8 +242,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) struct vm_area_struct *vma; int sig, code; vm_fault_t fault; - unsigned int flags = FAULT_FLAG_DEFAULT; - unsigned long vm_flags = VM_ACCESS_FLAGS; + struct vm_fault vmf = { + .real_address = addr, + .flags = FAULT_FLAG_DEFAULT, + .vm_flags = VM_ACCESS_FLAGS, + }; if (kprobe_page_fault(regs, fsr)) return 0; @@ -261,15 +264,15 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) goto no_context; if (user_mode(regs)) - flags |= FAULT_FLAG_USER; + vmf.flags |= FAULT_FLAG_USER; if (is_write_fault(fsr)) { - flags |= FAULT_FLAG_WRITE; - vm_flags = VM_WRITE; + vmf.flags |= FAULT_FLAG_WRITE; + vmf.vm_flags = VM_WRITE; } if (fsr & FSR_LNX_PF) { - vm_flags = VM_EXEC; + vmf.vm_flags = VM_EXEC; if (is_permission_fault(fsr) && !user_mode(regs)) die_kernel_fault("execution of memory", @@ -278,6 +281,18 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + fault = try_vma_locked_page_fault(); + if (fault == VM_FAULT_NONE) + goto retry; + if (!(fault & VM_FAULT_RETRY)) + goto done; + + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + goto no_context; + return 0; + } + retry: vma = lock_mm_and_find_vma(mm, addr, regs); if (unlikely(!vma)) { @@ -289,10 +304,10 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * ok, we have a good vm_area for this memory access, check the * permissions on the VMA allow for the fault which occurred. */ - if (!(vma->vm_flags & vm_flags)) + if (!(vma->vm_flags & vmf.vm_flags)) fault = VM_FAULT_BADACCESS; else - fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs); + fault = handle_mm_fault(vma, addr & PAGE_MASK, vmf.flags, regs); /* If we need to retry but a fatal signal is pending, handle the * signal first. We do not need to release the mmap_lock because @@ -310,13 +325,13 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (!(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + vmf.flags |= FAULT_FLAG_TRIED; goto retry; } } mmap_read_unlock(mm); - +done: /* * Handle the "normal" case first - VM_FAULT_MAJOR */ -- 2.27.0
[PATCH rfc v2 08/10] loongarch: mm: cleanup __do_page_fault()
Cleanup __do_page_fault() by reuse bad_area_nosemaphore and bad_area label. Signed-off-by: Kefeng Wang --- arch/loongarch/mm/fault.c | 48 +-- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c index e6376e3dce86..5d4c742c4bc5 100644 --- a/arch/loongarch/mm/fault.c +++ b/arch/loongarch/mm/fault.c @@ -157,18 +157,15 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, if (!user_mode(regs)) no_context(regs, write, address); else - do_sigsegv(regs, write, address, si_code); - return; + goto bad_area_nosemaphore; } /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (faulthandler_disabled() || !mm) { - do_sigsegv(regs, write, address, si_code); - return; - } + if (faulthandler_disabled() || !mm) + goto bad_area_nosemaphore; if (user_mode(regs)) flags |= FAULT_FLAG_USER; @@ -178,23 +175,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) goto bad_area_nosemaphore; - goto good_area; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - mmap_read_unlock(mm); -bad_area_nosemaphore: - do_sigsegv(regs, write, address, si_code); - return; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: si_code = SEGV_ACCERR; if (write) { @@ -235,22 +216,25 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, */ goto retry; } + + mmap_read_unlock(mm); + if (unlikely(fault & VM_FAULT_ERROR)) { - mmap_read_unlock(mm); - if (fault & VM_FAULT_OOM) { + if (fault & VM_FAULT_OOM) do_out_of_memory(regs, write, address); - return; - } else if (fault & VM_FAULT_SIGSEGV) { - do_sigsegv(regs, write, address, si_code); - return; - } else if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area_nosemaphore; + else if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, write, address, si_code); - return; - } - BUG(); + else + BUG(); } + return; +bad_area: mmap_read_unlock(mm); +bad_area_nosemaphore: + do_sigsegv(regs, write, address, si_code); } asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, -- 2.27.0
[PATCH rfc v2 09/10] loongarch: mm: add access_error() helper
Add access_error() to check whether vma could be accessible or not, which will be used __do_page_fault() and later vma locked based page fault. Signed-off-by: Kefeng Wang --- arch/loongarch/mm/fault.c | 30 -- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c index 5d4c742c4bc5..2a45e9f3a485 100644 --- a/arch/loongarch/mm/fault.c +++ b/arch/loongarch/mm/fault.c @@ -126,6 +126,22 @@ static void __kprobes do_sigsegv(struct pt_regs *regs, force_sig_fault(SIGSEGV, si_code, (void __user *)address); } +static inline bool access_error(unsigned int flags, struct pt_regs *regs, + unsigned long addr, struct vm_area_struct *vma) +{ + if (flags & FAULT_FLAG_WRITE) { + if (!(vma->vm_flags & VM_WRITE)) + return true; + } else { + if (!(vma->vm_flags & VM_READ) && addr != exception_era(regs)) + return true; + if (!(vma->vm_flags & VM_EXEC) && addr == exception_era(regs)) + return true; + } + + return false; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -169,6 +185,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, if (user_mode(regs)) flags |= FAULT_FLAG_USER; + if (write) + flags |= FAULT_FLAG_WRITE; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); retry: @@ -178,16 +196,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, si_code = SEGV_ACCERR; - if (write) { - flags |= FAULT_FLAG_WRITE; - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - } else { - if (!(vma->vm_flags & VM_READ) && address != exception_era(regs)) - goto bad_area; - if (!(vma->vm_flags & VM_EXEC) && address == exception_era(regs)) - goto bad_area; - } + if (access_error(flags, regs, vma)) + goto bad_area; /* * If for any reason at all we couldn't handle the fault, -- 2.27.0
[PATCH rfc -next v2 00/10] mm: convert to generic VMA lock-based page fault
Add a generic VMA lock-based page fault handler in mm core, and convert architectures to use it, which eliminate architectures's duplicated codes. With it, we can avoid multiple changes in architectures's code if we add new feature or bugfix, in the end, enable this feature on ARM32 and Loongarch. This is based on next-20230817, only built test. v2: - convert "int arch_vma_check_access()" to "bool arch_vma_access_error()" still use __weak function for arch_vma_access_error(), which avoid to declare access_error() in architecture's(x86/powerpc/riscv/loongarch) headfile. - re-use struct vm_fault instead of adding new struct vm_locked_fault, per Matthew Wilcox, add necessary pt_regs/fault error code/vm flags into vm_fault since they could be used in arch_vma_access_error() - add special VM_FAULT_NONE and make try_vma_locked_page_fault() to return vm_fault_t Kefeng Wang (10): mm: add a generic VMA lock-based page fault handler arm64: mm: use try_vma_locked_page_fault() x86: mm: use try_vma_locked_page_fault() s390: mm: use try_vma_locked_page_fault() powerpc: mm: use try_vma_locked_page_fault() riscv: mm: use try_vma_locked_page_fault() ARM: mm: try VMA lock-based page fault handling first loongarch: mm: cleanup __do_page_fault() loongarch: mm: add access_error() helper loongarch: mm: try VMA lock-based page fault handling first arch/arm/Kconfig | 1 + arch/arm/mm/fault.c | 35 arch/arm64/mm/fault.c | 60 - arch/loongarch/Kconfig| 1 + arch/loongarch/mm/fault.c | 111 ++ arch/powerpc/mm/fault.c | 66 +++ arch/riscv/mm/fault.c | 58 +--- arch/s390/mm/fault.c | 66 ++- arch/x86/mm/fault.c | 55 --- include/linux/mm.h| 17 ++ include/linux/mm_types.h | 2 + mm/memory.c | 39 ++ 12 files changed, 278 insertions(+), 233 deletions(-) -- 2.27.0
[PATCH rfc v2 05/10] powerpc: mm: use try_vma_locked_page_fault()
Use new try_vma_locked_page_fault() helper to simplify code. No functional change intended. Signed-off-by: Kefeng Wang --- arch/powerpc/mm/fault.c | 66 - 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b1723094d464..52f9546e020e 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -391,6 +391,22 @@ static int page_fault_is_bad(unsigned long err) #define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S) #endif +#ifdef CONFIG_PER_VMA_LOCK +bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int is_exec = TRAP(vmf->regs) == INTERRUPT_INST_STORAGE; + int is_write = page_fault_is_write(vmf->fault_code); + + if (unlikely(access_pkey_error(is_write, is_exec, + (vmf->fault_code & DSISR_KEYFAULT), vma))) + return true; + + if (unlikely(access_error(is_write, is_exec, vma))) + return true; + return false; +} +#endif + /* * For 600- and 800-family processors, the error_code parameter is DSISR * for a data fault, SRR1 for an instruction fault. @@ -407,12 +423,18 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; - unsigned int flags = FAULT_FLAG_DEFAULT; int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE; int is_user = user_mode(regs); int is_write = page_fault_is_write(error_code); vm_fault_t fault, major = 0; bool kprobe_fault = kprobe_page_fault(regs, 11); + struct vm_fault vmf = { + .real_address = address, + .fault_code = error_code, + .regs = regs, + .flags = FAULT_FLAG_DEFAULT, + }; + if (unlikely(debugger_fault_handler(regs) || kprobe_fault)) return 0; @@ -463,45 +485,21 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, * mmap_lock held */ if (is_user) - flags |= FAULT_FLAG_USER; + vmf.flags |= FAULT_FLAG_USER; if (is_write) - flags |= FAULT_FLAG_WRITE; + vmf.flags |= FAULT_FLAG_WRITE; if (is_exec) - flags |= FAULT_FLAG_INSTRUCTION; + vmf.flags |= FAULT_FLAG_INSTRUCTION; - if (!(flags & FAULT_FLAG_USER)) - goto lock_mmap; - - vma = lock_vma_under_rcu(mm, address); - if (!vma) - goto lock_mmap; - - if (unlikely(access_pkey_error(is_write, is_exec, - (error_code & DSISR_KEYFAULT), vma))) { - vma_end_read(vma); - goto lock_mmap; - } - - if (unlikely(access_error(is_write, is_exec, vma))) { - vma_end_read(vma); - goto lock_mmap; - } - - fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) - vma_end_read(vma); - - if (!(fault & VM_FAULT_RETRY)) { - count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + fault = try_vma_locked_page_fault(); + if (fault == VM_FAULT_NONE) + goto retry; + if (!(fault & VM_FAULT_RETRY)) goto done; - } - count_vm_vma_lock_event(VMA_LOCK_RETRY); if (fault_signal_pending(fault, regs)) return user_mode(regs) ? 0 : SIGBUS; -lock_mmap: - /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -528,7 +526,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, regs); + fault = handle_mm_fault(vma, address, vmf.flags, regs); major |= fault & VM_FAULT_MAJOR; @@ -544,7 +542,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, * case. */ if (unlikely(fault & VM_FAULT_RETRY)) { - flags |= FAULT_FLAG_TRIED; + vmf.flags |= FAULT_FLAG_TRIED; goto retry; } -- 2.27.0
[PATCH rfc v2 06/10] riscv: mm: use try_vma_locked_page_fault()
Use new try_vma_locked_page_fault() helper to simplify code. No functional change intended. Signed-off-by: Kefeng Wang --- arch/riscv/mm/fault.c | 58 ++- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 6115d7514972..b46129b636f2 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -215,6 +215,13 @@ static inline bool access_error(unsigned long cause, struct vm_area_struct *vma) return false; } +#ifdef CONFIG_PER_VMA_LOCK +bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return access_error(vmf->fault_code, vma); +} +#endif + /* * This routine handles page faults. It determines the address and the * problem, and then passes it off to one of the appropriate routines. @@ -223,17 +230,16 @@ void handle_page_fault(struct pt_regs *regs) { struct task_struct *tsk; struct vm_area_struct *vma; - struct mm_struct *mm; - unsigned long addr, cause; - unsigned int flags = FAULT_FLAG_DEFAULT; + struct mm_struct *mm = current->mm; + unsigned long addr = regs->badaddr; + unsigned long cause = regs->cause; int code = SEGV_MAPERR; vm_fault_t fault; - - cause = regs->cause; - addr = regs->badaddr; - - tsk = current; - mm = tsk->mm; + struct vm_fault vmf = { + .real_address = addr, + .fault_code = cause, + .flags = FAULT_FLAG_DEFAULT, + }; if (kprobe_page_fault(regs, cause)) return; @@ -268,7 +274,7 @@ void handle_page_fault(struct pt_regs *regs) } if (user_mode(regs)) - flags |= FAULT_FLAG_USER; + vmf.flags |= FAULT_FLAG_USER; if (!user_mode(regs) && addr < TASK_SIZE && unlikely(!(regs->status & SR_SUM))) { if (fixup_exception(regs)) @@ -280,37 +286,21 @@ void handle_page_fault(struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (cause == EXC_STORE_PAGE_FAULT) - flags |= FAULT_FLAG_WRITE; + vmf.flags |= FAULT_FLAG_WRITE; else if (cause == EXC_INST_PAGE_FAULT) - flags |= FAULT_FLAG_INSTRUCTION; - if (!(flags & FAULT_FLAG_USER)) - goto lock_mmap; - - vma = lock_vma_under_rcu(mm, addr); - if (!vma) - goto lock_mmap; + vmf.flags |= FAULT_FLAG_INSTRUCTION; - if (unlikely(access_error(cause, vma))) { - vma_end_read(vma); - goto lock_mmap; - } - - fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); - if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) - vma_end_read(vma); - - if (!(fault & VM_FAULT_RETRY)) { - count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + fault = try_vma_locked_page_fault(); + if (fault == VM_FAULT_NONE) + goto retry; + if (!(fault & VM_FAULT_RETRY)) goto done; - } - count_vm_vma_lock_event(VMA_LOCK_RETRY); if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) no_context(regs, addr); return; } -lock_mmap: retry: vma = lock_mm_and_find_vma(mm, addr, regs); @@ -337,7 +327,7 @@ void handle_page_fault(struct pt_regs *regs) * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, addr, flags, regs); + fault = handle_mm_fault(vma, addr, vmf.flags, regs); /* * If we need to retry but a fatal signal is pending, handle the @@ -355,7 +345,7 @@ void handle_page_fault(struct pt_regs *regs) return; if (unlikely(fault & VM_FAULT_RETRY)) { - flags |= FAULT_FLAG_TRIED; + vmf.flags |= FAULT_FLAG_TRIED; /* * No need to mmap_read_unlock(mm) as we would -- 2.27.0
[PATCH rfc v2 01/10] mm: add a generic VMA lock-based page fault handler
The ARCH_SUPPORTS_PER_VMA_LOCK are enabled by more and more architectures, eg, x86, arm64, powerpc and s390, and riscv, those implementation are very similar which results in some duplicated codes, let's add a generic VMA lock-based page fault handler try_to_vma_locked_page_fault() to eliminate them, and which also make us easy to support this on new architectures. Since different architectures use different way to check vma whether is accessable or not, the struct pt_regs, page fault error code and vma flags are added into struct vm_fault, then, the architecture's page fault code could re-use struct vm_fault to record and check vma accessable by each own implementation. Signed-off-by: Kefeng Wang --- include/linux/mm.h | 17 + include/linux/mm_types.h | 2 ++ mm/memory.c | 39 +++ 3 files changed, 58 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3f764e84e567..22a6f4c56ff3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -512,9 +512,12 @@ struct vm_fault { pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address - masked */ unsigned long real_address; /* Faulting virtual address - unmasked */ + unsigned long fault_code; /* Faulting error code during page fault */ + struct pt_regs *regs; /* The registers stored during page fault */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ + vm_flags_t vm_flags;/* VMA flags to be used for access checking */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching @@ -774,6 +777,9 @@ static inline void assert_fault_locked(struct vm_fault *vmf) struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); +bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf); +vm_fault_t try_vma_locked_page_fault(struct vm_fault *vmf); + #else /* CONFIG_PER_VMA_LOCK */ static inline bool vma_start_read(struct vm_area_struct *vma) @@ -801,6 +807,17 @@ static inline void assert_fault_locked(struct vm_fault *vmf) mmap_assert_locked(vmf->vma->vm_mm); } +static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + return NULL; +} + +static inline vm_fault_t try_vma_locked_page_fault(struct vm_fault *vmf) +{ + return VM_FAULT_NONE; +} + #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f5ba5b0bc836..702820cea3f9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1119,6 +1119,7 @@ typedef __bitwise unsigned int vm_fault_t; * fault. Used to decide whether a process gets delivered SIGBUS or * just gets major/minor fault counters bumped up. * + * @VM_FAULT_NONE: Special case, not starting to handle fault * @VM_FAULT_OOM: Out Of Memory * @VM_FAULT_SIGBUS: Bad access * @VM_FAULT_MAJOR:Page read from storage @@ -1139,6 +1140,7 @@ typedef __bitwise unsigned int vm_fault_t; * */ enum vm_fault_reason { + VM_FAULT_NONE = (__force vm_fault_t)0x00, VM_FAULT_OOM= (__force vm_fault_t)0x01, VM_FAULT_SIGBUS = (__force vm_fault_t)0x02, VM_FAULT_MAJOR = (__force vm_fault_t)0x04, diff --git a/mm/memory.c b/mm/memory.c index 3b4aaa0d2fff..60fe35db5134 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5510,6 +5510,45 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } + +#ifdef CONFIG_PER_VMA_LOCK +bool __weak arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return (vma->vm_flags & vmf->vm_flags) == 0; +} +#endif + +vm_fault_t try_vma_locked_page_fault(struct vm_fault *vmf) +{ + vm_fault_t fault = VM_FAULT_NONE; + struct vm_area_struct *vma; + + if (!(vmf->flags & FAULT_FLAG_USER)) + return fault; + + vma = lock_vma_under_rcu(current->mm, vmf->real_address); + if (!vma) + return fault; + + if (arch_vma_access_error(vma, vmf)) { + vma_end_read(vma); + return fault; + } + + fault = handle_mm_fault(vma, vmf->real_address, + vmf->flags | FAULT_FLAG_VMA_LOCK, vmf->regs); + + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) +
Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes
Hi Sean, On Fri, Aug 11, 2023 at 07:36:37PM +0300, Vladimir Oltean wrote: > Let me explain that approach, because your mention of "swapping out the > bootloaders" makes it appear as if you are not visualising what I am > proposing. > > The Lynx SerDes family has 2 PLLs, and more lanes (4 or 8). Each lane > uses one PLL or the other, to derive its protocol frequency. Through the > RCW, you provision the 2 PLL frequencies that may be used by the lanes > at runtime. > > The Lynx 28G SerDes driver reads the PLL frequencies in > lynx_28g_pll_read_configuration(), and determines the interface modes > supportable by each PLL (this is used by phylink). But it never changes > those PLL frequencies, since that operation is practically impossible in > the general sense (PLLs are shared by multiple lanes, so changing a PLL > frequency disrupts all lanes that use it). Is my high-level feedback clear and actionable to you? I am suggesting to keep the look and feel the same between the lynx-10g and lynx-28g drivers, and to not use "fsl,type" protocols listed in the device tree as the immutable source of information for populating mode->protos, but instead the current PLL frequency configuration. So this implies that I am requesting that the dt-bindings should not contain a listing of the supported protocols.
[PATCH rfc v2 10/10] loongarch: mm: try VMA lock-based page fault handling first
Attempt VMA lock-based page fault handling first, and fall back to the existing mmap_lock-based handling if that fails. Signed-off-by: Kefeng Wang --- arch/loongarch/Kconfig| 1 + arch/loongarch/mm/fault.c | 37 +++-- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 2b27b18a63af..6b821f621920 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -56,6 +56,7 @@ config LOONGARCH select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_QUEUED_RWLOCKS diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c index 2a45e9f3a485..f7ac3a14bb06 100644 --- a/arch/loongarch/mm/fault.c +++ b/arch/loongarch/mm/fault.c @@ -142,6 +142,13 @@ static inline bool access_error(unsigned int flags, struct pt_regs *regs, return false; } +#ifdef CONFIG_PER_VMA_LOCK +bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return access_error(vmf->flags, vmf->regs, vmf->real_address, vma); +} +#endif + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -151,11 +158,15 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, unsigned long address) { int si_code = SEGV_MAPERR; - unsigned int flags = FAULT_FLAG_DEFAULT; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; struct vm_area_struct *vma = NULL; vm_fault_t fault; + struct vm_fault vmf = { + .real_address = address, + .regs = regs, + .flags = FAULT_FLAG_DEFAULT, + }; if (kprobe_page_fault(regs, current->thread.trap_nr)) return; @@ -184,11 +195,24 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; if (user_mode(regs)) - flags |= FAULT_FLAG_USER; + vmf.flags |= FAULT_FLAG_USER; if (write) - flags |= FAULT_FLAG_WRITE; + vmf.flags |= FAULT_FLAG_WRITE; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + fault = try_vma_locked_page_fault(); + if (fault == VM_FAULT_NONE) + goto retry; + if (!(fault & VM_FAULT_RETRY)) + goto done; + + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + no_context(regs, write, address); + return; + } + retry: vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) @@ -196,7 +220,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, si_code = SEGV_ACCERR; - if (access_error(flags, regs, vma)) + if (access_error(vmf.flags, regs, address, vma)) goto bad_area; /* @@ -204,7 +228,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags, regs); + fault = handle_mm_fault(vma, address, vmf.flags, regs); if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) @@ -217,7 +241,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, return; if (unlikely(fault & VM_FAULT_RETRY)) { - flags |= FAULT_FLAG_TRIED; + vmf.flags |= FAULT_FLAG_TRIED; /* * No need to mmap_read_unlock(mm) as we would @@ -229,6 +253,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, mmap_read_unlock(mm); +done: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) do_out_of_memory(regs, write, address); -- 2.27.0
Re: [PATCH v4 23/28] mfd: core: Ensure disabled devices are skiped without aborting
On Fri, 18 Aug 2023, Christophe Leroy wrote: > From: Herve Codina > > The loop searching for a matching device based on its compatible > string is aborted when a matching disabled device is found. > This abort prevents to add devices as soon as one disabled device > is found. > > Continue searching for an other device instead of aborting on the > first disabled one fixes the issue. > > Fixes: 22380b65dc70 ("mfd: mfd-core: Ensure disabled devices are ignored > without error") > Signed-off-by: Herve Codina > Reviewed-by: Christophe Leroy > Signed-off-by: Christophe Leroy > --- > drivers/mfd/mfd-core.c | 17 - > 1 file changed, 12 insertions(+), 5 deletions(-) This is too big of a change to be added this late in the cycle. Pushing review until after v6.5 is out. -- Lee Jones [李琼斯]
[powerpc:topic/ppc-kvm 5/9] arch/powerpc/kvm/book3s_hv_nestedv2.c:465:29: error: variable 'amor' set but not used
tree: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git topic/ppc-kvm head: a6a4c847287937c42ad33e053af968ed6ea91e13 commit: 6be7fd9b644945c2e4f9f071b9f8caa57f4e5590 [5/9] KVM: PPC: Add support for nestedv2 guests config: powerpc-ppc64_defconfig (https://download.01.org/0day-ci/archive/20230821/202308211729.rxk20sco-...@intel.com/config) compiler: powerpc64-linux-gcc (GCC) 12.3.0 reproduce: (https://download.01.org/0day-ci/archive/20230821/202308211729.rxk20sco-...@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202308211729.rxk20sco-...@intel.com/ All errors (new ones prefixed by >>): arch/powerpc/kvm/book3s_hv_nestedv2.c: In function 'gs_msg_ops_vcpu_refresh_info': >> arch/powerpc/kvm/book3s_hv_nestedv2.c:465:29: error: variable 'amor' set but >> not used [-Werror=unused-but-set-variable] 465 | u64 amor; | ^~~~ cc1: all warnings being treated as errors vim +/amor +465 arch/powerpc/kvm/book3s_hv_nestedv2.c 361 362 static int gs_msg_ops_vcpu_refresh_info(struct kvmppc_gs_msg *gsm, 363 struct kvmppc_gs_buff *gsb) 364 { 365 struct kvmppc_gs_parser gsp = { 0 }; 366 struct kvmhv_nestedv2_io *io; 367 struct kvmppc_gs_bitmap *valids; 368 struct kvm_vcpu *vcpu; 369 struct kvmppc_gs_elem *gse; 370 vector128 v; 371 int rc, i; 372 u16 iden; 373 374 vcpu = gsm->data; 375 376 rc = kvmppc_gse_parse(, gsb); 377 if (rc < 0) 378 return rc; 379 380 io = >arch.nestedv2_io; 381 valids = >valids; 382 383 kvmppc_gsp_for_each(, iden, gse) 384 { 385 switch (iden) { 386 case KVMPPC_GSID_DSCR: 387 vcpu->arch.dscr = kvmppc_gse_get_u64(gse); 388 break; 389 case KVMPPC_GSID_MMCRA: 390 vcpu->arch.mmcra = kvmppc_gse_get_u64(gse); 391 break; 392 case KVMPPC_GSID_HFSCR: 393 vcpu->arch.hfscr = kvmppc_gse_get_u64(gse); 394 break; 395 case KVMPPC_GSID_PURR: 396 vcpu->arch.purr = kvmppc_gse_get_u64(gse); 397 break; 398 case KVMPPC_GSID_SPURR: 399 vcpu->arch.spurr = kvmppc_gse_get_u64(gse); 400 break; 401 case KVMPPC_GSID_AMR: 402 vcpu->arch.amr = kvmppc_gse_get_u64(gse); 403 break; 404 case KVMPPC_GSID_UAMOR: 405 vcpu->arch.uamor = kvmppc_gse_get_u64(gse); 406 break; 407 case KVMPPC_GSID_SIAR: 408 vcpu->arch.siar = kvmppc_gse_get_u64(gse); 409 break; 410 case KVMPPC_GSID_SDAR: 411 vcpu->arch.sdar = kvmppc_gse_get_u64(gse); 412 break; 413 case KVMPPC_GSID_IAMR: 414 vcpu->arch.iamr = kvmppc_gse_get_u64(gse); 415 break; 416 case KVMPPC_GSID_DAWR0: 417 vcpu->arch.dawr0 = kvmppc_gse_get_u64(gse); 418 break; 419 case KVMPPC_GSID_DAWR1: 420 vcpu->arch.dawr1 = kvmppc_gse_get_u64(gse); 421 break; 422 case KVMPPC_GSID_DAWRX0: 423 vcpu->arch.dawrx0 = kvmppc_gse_get_u32(gse); 424 break; 425 case KVMPPC_GSID_DAWRX1: 426 vcpu->arch.dawrx1 = kvmppc_gse_get_u32(gse); 427 break; 428 case KVMPPC_GSID_CIABR: 429 vcpu->arch.ciabr = kvmppc_gse_get_u64(gse); 430 break; 431 case KVMPPC_GSID_WORT: 432 vcpu->arch.wort = kvmppc_gse_get_u32(gse); 433 break; 434 case KVMPPC_GSID_PPR: 435 vcpu->arch.ppr = kvmppc_gse_get_u64(gse); 436 break; 437 case KVMPPC_GSID_PSPB: 438 vcpu->arch.pspb = kvmppc_g
Re: [PATCH v4 21/28] net: wan: Add framer framework support
On Mon, Aug 21, 2023 at 7:19 AM Christophe Leroy wrote: > Le 20/08/2023 à 23:06, Linus Walleij a écrit : > > On Fri, Aug 18, 2023 at 6:41 PM Christophe Leroy > > wrote: > > > >> From: Herve Codina > >> > >> A framer is a component in charge of an E1/T1 line interface. > >> Connected usually to a TDM bus, it converts TDM frames to/from E1/T1 > >> frames. It also provides information related to the E1/T1 line. > >> > >> The framer framework provides a set of APIs for the framer drivers > >> (framer provider) to create/destroy a framer and APIs for the framer > >> users (framer consumer) to obtain a reference to the framer, and > >> use the framer. > >> > >> This basic implementation provides a framer abstraction for: > >> - power on/off the framer > >> - get the framer status (line state) > >> - be notified on framer status changes > >> - get/set the framer configuration > >> > >> Signed-off-by: Herve Codina > >> Reviewed-by: Christophe Leroy > >> Signed-off-by: Christophe Leroy > > > > I had these review comments, you must have missed them? > > https://lore.kernel.org/netdev/cacrpkdzq9_f6+9csev1l_wgphhujfpayxmjjfjurzszrako...@mail.gmail.com/ > > > > As I said in the cover letter, this series only fixes critical build > failures that happened when CONFIG_MODULES is set. The purpose was to > allow robots to perform their job up to the end. Other feedback and > comments will be taken into account by Hervé when he is back from holidays. Ah I see. I completely missed this. Yours, Linus Walleij
Re: [PATCH v4 20/28] wan: qmc_hdlc: Add runtime timeslots changes support
Le 18/08/2023 à 18:39, Christophe Leroy a écrit : From: Herve Codina QMC channels support runtime timeslots changes but nothing is done at the QMC HDLC driver to handle these changes. Use existing IFACE ioctl in order to configure the timeslots to use. Signed-off-by: Herve Codina Reviewed-by: Christophe Leroy Signed-off-by: Christophe Leroy --- Hi, a few nits below, should there be a v5. drivers/net/wan/fsl_qmc_hdlc.c | 169 - 1 file changed, 168 insertions(+), 1 deletion(-) diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c index 4f84ac5fc63e..4b8cb5761fd1 100644 --- a/drivers/net/wan/fsl_qmc_hdlc.c +++ b/drivers/net/wan/fsl_qmc_hdlc.c @@ -32,6 +32,7 @@ struct qmc_hdlc { struct qmc_hdlc_desc tx_descs[8]; unsigned int tx_out; struct qmc_hdlc_desc rx_descs[4]; + u32 slot_map; }; static inline struct qmc_hdlc *netdev_to_qmc_hdlc(struct net_device *netdev) @@ -202,6 +203,162 @@ static netdev_tx_t qmc_hdlc_xmit(struct sk_buff *skb, struct net_device *netdev) return NETDEV_TX_OK; } +static int qmc_hdlc_xlate_slot_map(struct qmc_hdlc *qmc_hdlc, + u32 slot_map, struct qmc_chan_ts_info *ts_info) +{ + u64 ts_mask_avail; + unsigned int bit; + unsigned int i; + u64 ts_mask; + u64 map = 0; This init looks useless. + + /* Tx and Rx masks must be identical */ + if (ts_info->rx_ts_mask_avail != ts_info->tx_ts_mask_avail) { + dev_err(qmc_hdlc->dev, "tx and rx available timeslots mismatch (0x%llx, 0x%llx)\n", + ts_info->rx_ts_mask_avail, ts_info->tx_ts_mask_avail); + return -EINVAL; + } + + ts_mask_avail = ts_info->rx_ts_mask_avail; + ts_mask = 0; + map = slot_map; + bit = 0; + for (i = 0; i < 64; i++) { + if (ts_mask_avail & BIT_ULL(i)) { + if (map & BIT_ULL(bit)) + ts_mask |= BIT_ULL(i); + bit++; + } + } + + if (hweight64(ts_mask) != hweight64(map)) { + dev_err(qmc_hdlc->dev, "Cannot translate timeslots 0x%llx -> (0x%llx,0x%llx)\n", + map, ts_mask_avail, ts_mask); + return -EINVAL; + } + + ts_info->tx_ts_mask = ts_mask; + ts_info->rx_ts_mask = ts_mask; + return 0; +} + +static int qmc_hdlc_xlate_ts_info(struct qmc_hdlc *qmc_hdlc, + const struct qmc_chan_ts_info *ts_info, u32 *slot_map) +{ + u64 ts_mask_avail; + unsigned int bit; + unsigned int i; + u64 ts_mask; + u64 map = 0; This init looks useless. + + /* Tx and Rx masks must be identical */ + if (ts_info->rx_ts_mask_avail != ts_info->tx_ts_mask_avail) { + dev_err(qmc_hdlc->dev, "tx and rx available timeslots mismatch (0x%llx, 0x%llx)\n", + ts_info->rx_ts_mask_avail, ts_info->tx_ts_mask_avail); + return -EINVAL; + } + if (ts_info->rx_ts_mask != ts_info->tx_ts_mask) { + dev_err(qmc_hdlc->dev, "tx and rx timeslots mismatch (0x%llx, 0x%llx)\n", + ts_info->rx_ts_mask, ts_info->tx_ts_mask); + return -EINVAL; + } + + ts_mask_avail = ts_info->rx_ts_mask_avail; + ts_mask = ts_info->rx_ts_mask; + map = 0; + bit = 0; + for (i = 0; i < 64; i++) { + if (ts_mask_avail & BIT_ULL(i)) { + if (ts_mask & BIT_ULL(i)) + map |= BIT_ULL(bit); + bit++; + } + } + + if (hweight64(ts_mask) != hweight64(map)) { + dev_err(qmc_hdlc->dev, "Cannot translate timeslots (0x%llx,0x%llx) -> 0x%llx\n", + ts_mask_avail, ts_mask, map); + return -EINVAL; + } + + if (map >= BIT_ULL(32)) { + dev_err(qmc_hdlc->dev, "Slot map out of 32bit (0x%llx,0x%llx) -> 0x%llx\n", + ts_mask_avail, ts_mask, map); + return -EINVAL; + } + + *slot_map = map; + return 0; +} ... +static int qmc_hdlc_ioctl(struct net_device *netdev, struct if_settings *ifs) +{ + struct qmc_hdlc *qmc_hdlc = netdev_to_qmc_hdlc(netdev); + te1_settings te1; + + switch (ifs->type) { + case IF_GET_IFACE: + ifs->type = IF_IFACE_E1; + if (ifs->size < sizeof(te1)) { + if (!ifs->size) + return 0; /* only type requested */ + + ifs->size = sizeof(te1); /* data size wanted */ + return -ENOBUFS; + } + + memset(, 0, sizeof(te1)); + + /* Update slot_map */ + te1.slot_map = qmc_hdlc->slot_map; + + if
Re: [PATCH v4 21/28] net: wan: Add framer framework support
Le 18/08/2023 à 18:39, Christophe Leroy a écrit : From: Herve Codina A framer is a component in charge of an E1/T1 line interface. Connected usually to a TDM bus, it converts TDM frames to/from E1/T1 frames. It also provides information related to the E1/T1 line. The framer framework provides a set of APIs for the framer drivers (framer provider) to create/destroy a framer and APIs for the framer users (framer consumer) to obtain a reference to the framer, and use the framer. This basic implementation provides a framer abstraction for: - power on/off the framer - get the framer status (line state) - be notified on framer status changes - get/set the framer configuration Signed-off-by: Herve Codina Reviewed-by: Christophe Leroy Signed-off-by: Christophe Leroy --- Hi, should there be a V5, some nits below. ... +int framer_power_off(struct framer *framer) +{ + int ret; + + mutex_lock(>mutex); + if (framer->power_count == 1 && framer->ops->power_off) { + ret = framer->ops->power_off(framer); ~~ Useless extra space + if (ret < 0) { + dev_err(>dev, "framer poweroff failed --> %d\n", ret); + mutex_unlock(>mutex); + return ret; + } + } + --framer->power_count; + mutex_unlock(>mutex); + framer_pm_runtime_put(framer); + + if (framer->pwr) + regulator_disable(framer->pwr); + + return 0; +} ... +struct framer *framer_create(struct device *dev, struct device_node *node, +const struct framer_ops *ops) +{ + int ret; + int id; + struct framer *framer; + + if (WARN_ON(!dev)) + return ERR_PTR(-EINVAL); + + /* get_status() is mandatory if the provider ask for polling status */ + if (WARN_ON((ops->flags & FRAMER_FLAG_POLL_STATUS) && !ops->get_status)) + return ERR_PTR(-EINVAL); + + framer = kzalloc(sizeof(*framer), GFP_KERNEL); + if (!framer) + return ERR_PTR(-ENOMEM); + + id = ida_simple_get(_ida, 0, 0, GFP_KERNEL); ida_alloc()? (ida_simple_get() is deprecated) + if (id < 0) { + dev_err(dev, "unable to get id\n"); + ret = id; + goto free_framer; + } + + device_initialize(>dev); + mutex_init(>mutex); + INIT_WORK(>notify_status_work, framer_notify_status_work); + INIT_DELAYED_WORK(>polling_work, framer_polling_work); + BLOCKING_INIT_NOTIFIER_HEAD(>notifier_list); + + framer->dev.class = framer_class; + framer->dev.parent = dev; + framer->dev.of_node = node ? node : dev->of_node; + framer->id = id; + framer->ops = ops; + + ret = dev_set_name(>dev, "framer-%s.%d", dev_name(dev), id); + if (ret) + goto put_dev; + + /* framer-supply */ + framer->pwr = regulator_get_optional(>dev, "framer"); + if (IS_ERR(framer->pwr)) { + ret = PTR_ERR(framer->pwr); + if (ret == -EPROBE_DEFER) + goto put_dev; + + framer->pwr = NULL; + } + + ret = device_add(>dev); + if (ret) + goto put_dev; + + if (pm_runtime_enabled(dev)) { + pm_runtime_enable(>dev); + pm_runtime_no_callbacks(>dev); + } + + return framer; + +put_dev: + put_device(>dev); /* calls framer_release() which frees resources */ + return ERR_PTR(ret); + +free_framer: + kfree(framer); + return ERR_PTR(ret); +} ... +void framer_provider_of_unregister(struct framer_provider *framer_provider) +{ + mutex_lock(_provider_mutex); + list_del(_provider->list); + of_node_put(framer_provider->dev->of_node); + kfree(framer_provider); + mutex_unlock(_provider_mutex); If it make sense, of_node_put() and kfree() could maybe be out of the mutex, in order to match how things are done in __framer_provider_of_register(). +} ... +static void framer_release(struct device *dev) +{ + struct framer *framer; + + framer = dev_to_framer(dev); + regulator_put(framer->pwr); + ida_simple_remove(_ida, framer->id); ida_free()? (ida_simple_remove() is deprecated) + kfree(framer); +} ...