Re: [PATCH v2 2/2] powerpc/bpf: enable kfunc call

2024-02-12 Thread Christophe Leroy


Le 01/02/2024 à 18:12, Hari Bathini a écrit :
> With module addresses supported, override bpf_jit_supports_kfunc_call()
> to enable kfunc support. Module address offsets can be more than 32 bits
> long, so override bpf_jit_supports_far_kfunc_call() to enable 64-bit
> pointers.

What's the impact on PPC32? There are no 64-bit pointers on PPC32.

> 
> Signed-off-by: Hari Bathini 
> ---
> 
> * No changes since v1.
> 
> 
>   arch/powerpc/net/bpf_jit_comp.c | 10 ++
>   1 file changed, 10 insertions(+)
> 
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index 7b4103b4c929..f896a4213696 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -359,3 +359,13 @@ void bpf_jit_free(struct bpf_prog *fp)
>   
>   bpf_prog_unlock_free(fp);
>   }
> +
> +bool bpf_jit_supports_kfunc_call(void)
> +{
> + return true;
> +}
> +
> +bool bpf_jit_supports_far_kfunc_call(void)
> +{
> + return true;
> +}
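
For what it's worth, a minimal sketch of one way the PPC32 concern above could
be handled (illustrative only, not part of this patch): keep basic kfunc
support unconditional, but make the "far" (64-bit address) variant depend on
the 64-bit build.

    bool bpf_jit_supports_kfunc_call(void)
    {
            return true;
    }

    bool bpf_jit_supports_far_kfunc_call(void)
    {
            /* far (64-bit) kfunc addresses only make sense on 64-bit kernels */
            return IS_ENABLED(CONFIG_PPC64);
    }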


Re: [PATCH v2 1/2] powerpc/bpf: ensure module addresses are supported

2024-02-12 Thread Christophe Leroy


Le 01/02/2024 à 18:12, Hari Bathini a écrit :
> Currently, the BPF JIT code on powerpc assumes that all BPF functions and
> helpers are in kernel text. This is false for the kfunc case, as function
> addresses are mostly module addresses there. Ensure module addresses are
> supported to enable kfunc support.
> 
> Assume kernel text addresses for programs with no kfunc call, to keep the
> optimized instruction sequence in that case. Add a check to error out if
> this assumption ever changes in the future.
> 
> Signed-off-by: Hari Bathini 
> ---
> 
> Changes in v2:
> * Using bpf_prog_has_kfunc_call() to decide whether to use optimized
>instruction sequence or not as suggested by Naveen.
> 
> 
>   arch/powerpc/net/bpf_jit.h|   5 +-
>   arch/powerpc/net/bpf_jit_comp.c   |   4 +-
>   arch/powerpc/net/bpf_jit_comp32.c |   8 ++-
>   arch/powerpc/net/bpf_jit_comp64.c | 109 --
>   4 files changed, 97 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
> index cdea5dccaefe..fc56ee0ee9c5 100644
> --- a/arch/powerpc/net/bpf_jit.h
> +++ b/arch/powerpc/net/bpf_jit.h
> @@ -160,10 +160,11 @@ static inline void bpf_clear_seen_register(struct 
> codegen_context *ctx, int i)
>   }
>   
>   void bpf_jit_init_reg_mapping(struct codegen_context *ctx);
> -int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct 
> codegen_context *ctx, u64 func);
> +int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct 
> codegen_context *ctx, u64 func,
> +bool has_kfunc_call);
>   int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct 
> codegen_context *ctx,
>  u32 *addrs, int pass, bool extra_pass);
> -void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
> +void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx, bool 
> has_kfunc_call);
>   void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
>   void bpf_jit_realloc_regs(struct codegen_context *ctx);
>   int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int 
> tmp_reg, long exit_addr);
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index 0f9a21783329..7b4103b4c929 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -163,7 +163,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
>* update ctgtx.idx as it pretends to output instructions, then we can
>* calculate total size from idx.
>*/
> - bpf_jit_build_prologue(NULL, &cgctx);
> + bpf_jit_build_prologue(NULL, &cgctx, bpf_prog_has_kfunc_call(fp));
>   addrs[fp->len] = cgctx.idx * 4;
>   bpf_jit_build_epilogue(NULL, &cgctx);
>   
> @@ -192,7 +192,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
>   /* Now build the prologue, body code & epilogue for real. */
>   cgctx.idx = 0;
>   cgctx.alt_exit_addr = 0;
> - bpf_jit_build_prologue(code_base, &cgctx);
> + bpf_jit_build_prologue(code_base, &cgctx, bpf_prog_has_kfunc_call(fp));
>   if (bpf_jit_build_body(fp, code_base, fcode_base, &cgctx, addrs, pass,
>  extra_pass)) {
>   bpf_arch_text_copy(&fhdr->size, &hdr->size, sizeof(hdr->size));
> diff --git a/arch/powerpc/net/bpf_jit_comp32.c 
> b/arch/powerpc/net/bpf_jit_comp32.c
> index 2f39c50ca729..447747e51a58 100644
> --- a/arch/powerpc/net/bpf_jit_comp32.c
> +++ b/arch/powerpc/net/bpf_jit_comp32.c
> @@ -123,7 +123,7 @@ void bpf_jit_realloc_regs(struct codegen_context *ctx)
>   }
>   }
>   
> -void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
> +void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx, bool 
> has_kfunc_call)
>   {
>   int i;
>   
> @@ -201,7 +201,8 @@ void bpf_jit_build_epilogue(u32 *image, struct 
> codegen_context *ctx)
>   }
>   
>   /* Relative offset needs to be calculated based on final image location */
> -int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct 
> codegen_context *ctx, u64 func)
> +int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct 
> codegen_context *ctx, u64 func,
> +bool has_kfunc_call)
>   {
>   s32 rel = (s32)func - (s32)(fimage + ctx->idx);
>   
> @@ -1054,7 +1055,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
> u32 *fimage, struct code
>   EMIT(PPC_RAW_STW(bpf_to_ppc(BPF_REG_5), _R1, 
> 12));
>   }
>   
> - ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, 
> func_addr);
> + ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, 
> func_addr,
> +  
> bpf_prog_has_kfunc_call(fp));
>   if (ret)
>   return ret;
>   
> diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
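
To make the optimisation the commit message describes concrete, here is a
rough illustration (the two emit_* helpers below are hypothetical and used
only to show the idea; bpf_prog_has_kfunc_call() is the real helper used by
the patch): with no kfunc calls every callee is kernel text and a short
branch sequence is enough, otherwise the JIT must materialise a full 64-bit
address that may point into a module.

    /* illustration only -- both emit_* helpers are made up */
    if (bpf_prog_has_kfunc_call(fp))
            ret = emit_load_abs64_and_call(image, ctx, func_addr);   /* long sequence, handles module addresses */
    else
            ret = emit_branch_to_kernel_text(image, ctx, func_addr); /* short sequence, kernel text only */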

Re: [PATCH v5 5/5] sched: rename SD_SHARE_PKG_RESOURCES to SD_SHARE_LLC

2024-02-12 Thread Barry Song
On Tue, Feb 13, 2024 at 8:01 PM Barry Song <21cn...@gmail.com> wrote:
>
> Hi Alex, Valentin,
>
>
> On Sun, Feb 11, 2024 at 12:37 AM  wrote:
> >
> > From: Alex Shi 
> >
> > SD_CLUSTER shares CPU resources like LLC tags or L2 cache, which is
> > easily confused with SD_SHARE_PKG_RESOURCES. So let's specifically point
> > out what the latter shares: the LLC. That would reduce some confusion.
>
> On neither Jacobsville nor Kunpeng920 does CLUSTER seem to be the LLC.
> On Jacobsville, the cluster is the L2 cache while Jacobsville has an L3; on
> Kunpeng920, the cluster is the L3 tag. On Kunpeng920, 24 or 32 CPUs actually
> share one LLC, the whole L3, so the cluster is more like a middle-level cache.
>
> So I feel this patch isn't precise.

Sorry for my noise. I thought you were renaming cluster to LLC, but on a
second reading you are renaming the level after cluster, so my comment
was wrong. Please feel free to add:

Reviewed-by: Barry Song 

>
> >
> > Suggested-by: Valentin Schneider 
> > Signed-off-by: Alex Shi 
> > Cc: linux-ker...@vger.kernel.org
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Cc: Miaohe Lin 
> > Cc: Barry Song 
> > Cc: Mark Rutland 
> > Cc: Frederic Weisbecker 
> > Cc: Daniel Bristot de Oliveira 
> > Cc: Ben Segall 
> > Cc: Steven Rostedt 
> > Cc: Dietmar Eggemann 
> > Cc: Juri Lelli 
> > Cc: Ingo Molnar 
> > Cc: "Naveen N. Rao" 
> > Cc: "Aneesh Kumar K.V" 
> > Cc: Christophe Leroy 
> > Cc: "Gautham R. Shenoy" 
> > Cc: Yicong Yang 
> > Cc: Ricardo Neri 
> > Cc: Josh Poimboeuf 
> > Cc: Srikar Dronamraju 
> > Cc: Valentin Schneider 
> > Cc: Nicholas Piggin 
> > Cc: Michael Ellerman 
> > Reviewed-by: Valentin Schneider 
> > Reviewed-by: Ricardo Neri 
> > ---
> >  arch/powerpc/kernel/smp.c  |  6 +++---
> >  include/linux/sched/sd_flags.h |  4 ++--
> >  include/linux/sched/topology.h |  6 +++---
> >  kernel/sched/fair.c|  2 +-
> >  kernel/sched/topology.c| 28 ++--
> >  5 files changed, 23 insertions(+), 23 deletions(-)
> >
> > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> > index 693334c20d07..a60e4139214b 100644
> > --- a/arch/powerpc/kernel/smp.c
> > +++ b/arch/powerpc/kernel/smp.c
> > @@ -984,7 +984,7 @@ static bool shared_caches __ro_after_init;
> >  /* cpumask of CPUs with asymmetric SMT dependency */
> >  static int powerpc_smt_flags(void)
> >  {
> > -   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> > +   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
> >
> > if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
> > printk_once(KERN_INFO "Enabling Asymmetric SMT 
> > scheduling\n");
> > @@ -1010,9 +1010,9 @@ static __ro_after_init 
> > DEFINE_STATIC_KEY_FALSE(splpar_asym_pack);
> >  static int powerpc_shared_cache_flags(void)
> >  {
> > if (static_branch_unlikely(&splpar_asym_pack))
> > -   return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
> > +   return SD_SHARE_LLC | SD_ASYM_PACKING;
> >
> > -   return SD_SHARE_PKG_RESOURCES;
> > +   return SD_SHARE_LLC;
> >  }
> >
> >  static int powerpc_shared_proc_flags(void)
> > diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
> > index a8b28647aafc..b04a5d04dee9 100644
> > --- a/include/linux/sched/sd_flags.h
> > +++ b/include/linux/sched/sd_flags.h
> > @@ -117,13 +117,13 @@ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | 
> > SDF_NEEDS_GROUPS)
> >  SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
> >
> >  /*
> > - * Domain members share CPU package resources (i.e. caches)
> > + * Domain members share CPU Last Level Caches
> >   *
> >   * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer 
> > share
> >   *   the same cache(s).
> >   * NEEDS_GROUPS: Caches are shared between groups.
> >   */
> > -SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
> > +SD_FLAG(SD_SHARE_LLC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
> >
> >  /*
> >   * Only a single load balancing instance
> > diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> > index a6e04b4a21d7..191b122158fb 100644
> > --- a/include/linux/sched/topology.h
> > +++ b/include/linux/sched/topology.h
> > @@ -38,21 +38,21 @@ extern const struct sd_flag_debug sd_flag_debug[];
> >  #ifdef CONFIG_SCHED_SMT
> >  static inline int cpu_smt_flags(void)
> >  {
> > -   return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> > +   return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
> >  }
> >  #endif
> >
> >  #ifdef CONFIG_SCHED_CLUSTER
> >  static inline int cpu_cluster_flags(void)
> >  {
> > -   return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
> > +   return SD_CLUSTER | SD_SHARE_LLC;
> >  }
> >  #endif
> >
> >  #ifdef CONFIG_SCHED_MC
> >  static inline int cpu_core_flags(void)
> >  {
> > -   return SD_SHARE_PKG_RESOURCES;
> > +   return SD_SHARE_LLC;
> >  }
> >  #endif
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index cd1ec57c0b7b..da6c77d05d07 100644
> > --- a/kernel/sched/fair.c
> > 

Re: [PATCH v5 5/5] sched: rename SD_SHARE_PKG_RESOURCES to SD_SHARE_LLC

2024-02-12 Thread Barry Song
Hi Alex, Valentin,


On Sun, Feb 11, 2024 at 12:37 AM  wrote:
>
> From: Alex Shi 
>
> SD_CLUSTER shares CPU resources like LLC tags or L2 cache, which is
> easily confused with SD_SHARE_PKG_RESOURCES. So let's specifically point
> out what the latter shares: the LLC. That would reduce some confusion.

On neither Jacobsville nor Kunpeng920 does CLUSTER seem to be the LLC.
On Jacobsville, the cluster is the L2 cache while Jacobsville has an L3; on
Kunpeng920, the cluster is the L3 tag. On Kunpeng920, 24 or 32 CPUs actually
share one LLC, the whole L3, so the cluster is more like a middle-level cache.

So I feel this patch isn't precise.

>
> Suggested-by: Valentin Schneider 
> Signed-off-by: Alex Shi 
> Cc: linux-ker...@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Miaohe Lin 
> Cc: Barry Song 
> Cc: Mark Rutland 
> Cc: Frederic Weisbecker 
> Cc: Daniel Bristot de Oliveira 
> Cc: Ben Segall 
> Cc: Steven Rostedt 
> Cc: Dietmar Eggemann 
> Cc: Juri Lelli 
> Cc: Ingo Molnar 
> Cc: "Naveen N. Rao" 
> Cc: "Aneesh Kumar K.V" 
> Cc: Christophe Leroy 
> Cc: "Gautham R. Shenoy" 
> Cc: Yicong Yang 
> Cc: Ricardo Neri 
> Cc: Josh Poimboeuf 
> Cc: Srikar Dronamraju 
> Cc: Valentin Schneider 
> Cc: Nicholas Piggin 
> Cc: Michael Ellerman 
> Reviewed-by: Valentin Schneider 
> Reviewed-by: Ricardo Neri 
> ---
>  arch/powerpc/kernel/smp.c  |  6 +++---
>  include/linux/sched/sd_flags.h |  4 ++--
>  include/linux/sched/topology.h |  6 +++---
>  kernel/sched/fair.c|  2 +-
>  kernel/sched/topology.c| 28 ++--
>  5 files changed, 23 insertions(+), 23 deletions(-)
>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 693334c20d07..a60e4139214b 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -984,7 +984,7 @@ static bool shared_caches __ro_after_init;
>  /* cpumask of CPUs with asymmetric SMT dependency */
>  static int powerpc_smt_flags(void)
>  {
> -   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> +   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
>
> if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
> printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
> @@ -1010,9 +1010,9 @@ static __ro_after_init 
> DEFINE_STATIC_KEY_FALSE(splpar_asym_pack);
>  static int powerpc_shared_cache_flags(void)
>  {
> if (static_branch_unlikely(&splpar_asym_pack))
> -   return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
> +   return SD_SHARE_LLC | SD_ASYM_PACKING;
>
> -   return SD_SHARE_PKG_RESOURCES;
> +   return SD_SHARE_LLC;
>  }
>
>  static int powerpc_shared_proc_flags(void)
> diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
> index a8b28647aafc..b04a5d04dee9 100644
> --- a/include/linux/sched/sd_flags.h
> +++ b/include/linux/sched/sd_flags.h
> @@ -117,13 +117,13 @@ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | 
> SDF_NEEDS_GROUPS)
>  SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
>
>  /*
> - * Domain members share CPU package resources (i.e. caches)
> + * Domain members share CPU Last Level Caches
>   *
>   * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer 
> share
>   *   the same cache(s).
>   * NEEDS_GROUPS: Caches are shared between groups.
>   */
> -SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
> +SD_FLAG(SD_SHARE_LLC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
>
>  /*
>   * Only a single load balancing instance
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index a6e04b4a21d7..191b122158fb 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -38,21 +38,21 @@ extern const struct sd_flag_debug sd_flag_debug[];
>  #ifdef CONFIG_SCHED_SMT
>  static inline int cpu_smt_flags(void)
>  {
> -   return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> +   return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
>  }
>  #endif
>
>  #ifdef CONFIG_SCHED_CLUSTER
>  static inline int cpu_cluster_flags(void)
>  {
> -   return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
> +   return SD_CLUSTER | SD_SHARE_LLC;
>  }
>  #endif
>
>  #ifdef CONFIG_SCHED_MC
>  static inline int cpu_core_flags(void)
>  {
> -   return SD_SHARE_PKG_RESOURCES;
> +   return SD_SHARE_LLC;
>  }
>  #endif
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index cd1ec57c0b7b..da6c77d05d07 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10687,7 +10687,7 @@ static inline void calculate_imbalance(struct lb_env 
> *env, struct sd_lb_stats *s
>  */
> if (local->group_type == group_has_spare) {
> if ((busiest->group_type > group_fully_busy) &&
> -   !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
> +   !(env->sd->flags & SD_SHARE_LLC)) {
> /*
>  * If busiest is overloaded, try to fill spare
>  * capacity. This might end up 

Re: [PATCH] powerpc/pseries: fix accuracy of stolen time

2024-02-12 Thread Srikar Dronamraju
* Shrikanth Hegde  [2024-02-13 10:56:35]:

> The PowerVM hypervisor updates the VPA fields with stolen time data.
> It currently reports enqueue_dispatch_tb and ready_enqueue_tb for
> this purpose. In Linux these two fields are used to report the stolen time.
> 
> The VPA fields are updated at the TB frequency. On PowerPC it is mostly
> set at 512 MHz. Hence this needs a conversion to ns when reporting it
> back, as the rest of the kernel timings are in ns. This conversion is
> already handled by the tb_to_ns() function, so use that function to
> report accurate stolen time.
> 
> Observed this issue and used a Capped Shared Processor LPAR (SPLPAR) to
> simplify the experiments. In all these cases, a 100% VP load is run using
> a stress-ng workload. Values of stolen time are in percentages as reported
> by mpstat. With the patch, values are close to expected.
> 
>               6.8.rc1   +Patch
> 12EC/12VP        0.0      0.0
> 12EC/24VP       25.7     50.2
> 12EC/36VP       37.3     69.2
> 12EC/48VP       38.5     78.3
> 
> 
> Fixes: 0e8a63132800 ("powerpc/pseries: Implement 
> CONFIG_PARAVIRT_TIME_ACCOUNTING")
> Signed-off-by: Shrikanth Hegde 

Looks good to me.

Reviewed-by: Srikar Dronamraju 

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH] powerpc/pseries: fix accuracy of stolen time

2024-02-12 Thread Nicholas Piggin
On Tue Feb 13, 2024 at 3:26 PM AEST, Shrikanth Hegde wrote:
> The PowerVM hypervisor updates the VPA fields with stolen time data.
> It currently reports enqueue_dispatch_tb and ready_enqueue_tb for
> this purpose. In Linux these two fields are used to report the stolen time.
>
> The VPA fields are updated at the TB frequency. On PowerPC it is mostly
> set at 512 MHz. Hence this needs a conversion to ns when reporting it
> back, as the rest of the kernel timings are in ns. This conversion is
> already handled by the tb_to_ns() function, so use that function to
> report accurate stolen time.
>
> Observed this issue and used a Capped Shared Processor LPAR (SPLPAR) to
> simplify the experiments. In all these cases, a 100% VP load is run using
> a stress-ng workload. Values of stolen time are in percentages as reported
> by mpstat. With the patch, values are close to expected.
>
>               6.8.rc1   +Patch
> 12EC/12VP        0.0      0.0
> 12EC/24VP       25.7     50.2
> 12EC/36VP       37.3     69.2
> 12EC/48VP       38.5     78.3
>
>
> Fixes: 0e8a63132800 ("powerpc/pseries: Implement 
> CONFIG_PARAVIRT_TIME_ACCOUNTING")

Good find and fix. Paper bag for me.

I wonder why we didn't catch it in the first place. Maybe we
didn't understand the hypervisor's sharing algorithm and what
we expected it to report.

In any case this is right. The KVM implementation of the counters is
in TB, so that's fine.

Reviewed-by: Nicholas Piggin 

Thanks,
Nick

> Signed-off-by: Shrikanth Hegde 
> ---
>  arch/powerpc/platforms/pseries/lpar.c | 8 ++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/lpar.c 
> b/arch/powerpc/platforms/pseries/lpar.c
> index 4561667832ed..bdcc428e1c2b 100644
> --- a/arch/powerpc/platforms/pseries/lpar.c
> +++ b/arch/powerpc/platforms/pseries/lpar.c
> @@ -662,8 +662,12 @@ u64 pseries_paravirt_steal_clock(int cpu)
>  {
>   struct lppaca *lppaca = &lppaca_of(cpu);
>
> - return be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) +
> - be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb));
> + /*
> +  * VPA steal time counters are reported at TB frequency. Hence do a
> +  * conversion to ns before returning
> +  */
> + return tb_to_ns(be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) +
> +  be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb)));
>  }
>  #endif
>
> --
> 2.39.3



Re: [PATCH] powerpc/ftrace: Ignore ftrace locations in exit text sections

2024-02-12 Thread Naveen N Rao
On Mon, Feb 12, 2024 at 07:31:03PM +, Christophe Leroy wrote:
> 
> 
> Le 09/02/2024 à 08:59, Naveen N Rao a écrit :
> > diff --git a/arch/powerpc/include/asm/sections.h 
> > b/arch/powerpc/include/asm/sections.h
> > index ea26665f82cf..d389dcecdb0b 100644
> > --- a/arch/powerpc/include/asm/sections.h
> > +++ b/arch/powerpc/include/asm/sections.h
> > @@ -14,6 +14,7 @@ typedef struct func_desc func_desc_t;
> >   
> >   extern char __head_end[];
> >   extern char __srwx_boundary[];
> > +extern char _sexittext[], _eexittext[];
> 
> Should we try to at least use the same symbols as others, or best try to 
> move this into include/asm-generic/sections.h, just like inittext ?

I used this name based on what is used for init text start and end in 
the generic code: _sinittext and _einittext.

> 
> $ git grep exittext
> arch/arm64/include/asm/sections.h:extern char __exittext_begin[], 
> __exittext_end[];

Arm64 also uses the non-standard __inittext_begin/__inittext_end, so it 
looks to be something very specific to arm64.

I do agree it would be good to refactor and unify names across 
architectures.


- Naveen



[PATCH] powerpc/pseries: fix accuracy of stolen time

2024-02-12 Thread Shrikanth Hegde
The PowerVM hypervisor updates the VPA fields with stolen time data.
It currently reports enqueue_dispatch_tb and ready_enqueue_tb for
this purpose. In Linux these two fields are used to report the stolen time.

The VPA fields are updated at the TB frequency. On PowerPC it is mostly
set at 512 MHz. Hence this needs a conversion to ns when reporting it
back, as the rest of the kernel timings are in ns. This conversion is
already handled by the tb_to_ns() function, so use that function to
report accurate stolen time.

Observed this issue and used a Capped Shared Processor LPAR (SPLPAR) to
simplify the experiments. In all these cases, a 100% VP load is run using
a stress-ng workload. Values of stolen time are in percentages as reported
by mpstat. With the patch, values are close to expected.

              6.8.rc1   +Patch
12EC/12VP        0.0      0.0
12EC/24VP       25.7     50.2
12EC/36VP       37.3     69.2
12EC/48VP       38.5     78.3


Fixes: 0e8a63132800 ("powerpc/pseries: Implement 
CONFIG_PARAVIRT_TIME_ACCOUNTING")
Signed-off-by: Shrikanth Hegde 
---
 arch/powerpc/platforms/pseries/lpar.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index 4561667832ed..bdcc428e1c2b 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -662,8 +662,12 @@ u64 pseries_paravirt_steal_clock(int cpu)
 {
	struct lppaca *lppaca = &lppaca_of(cpu);

-   return be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) +
-   be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb));
+   /*
+* VPA steal time counters are reported at TB frequency. Hence do a
+* conversion to ns before returning
+*/
+   return tb_to_ns(be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) +
+be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb)));
 }
 #endif

--
2.39.3
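
For anyone wanting to sanity-check the numbers, here is a small standalone
sketch of the conversion the patch relies on (this is not the kernel's
tb_to_ns(), which uses a precomputed scale/shift; plain 128-bit arithmetic is
used here for clarity, and the 512 MHz timebase is the value assumed in the
commit message):

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    /* convert timebase ticks to nanoseconds: ns = ticks * 1e9 / tb_freq */
    static uint64_t tb_to_ns_sketch(uint64_t ticks, uint64_t tb_freq_hz)
    {
            return (uint64_t)(((__uint128_t)ticks * NSEC_PER_SEC) / tb_freq_hz);
    }

    int main(void)
    {
            uint64_t tb_freq = 512000000ULL;        /* 512 MHz timebase */
            uint64_t stolen_tb = 1024000000ULL;     /* two seconds worth of ticks */

            printf("%llu ticks -> %llu ns\n",
                   (unsigned long long)stolen_tb,
                   (unsigned long long)tb_to_ns_sketch(stolen_tb, tb_freq));
            return 0;
    }

Without the conversion, the raw tick count ends up under-reported as
nanoseconds by roughly a factor of two at this timebase frequency, which
matches the halved stolen-time percentages in the table above.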



Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Timothy Pearson



- Original Message -
> From: "Michael Ellerman" 
> To: "Timothy Pearson" , "Segher Boessenkool" 
> 
> Cc: "linuxppc-dev" 
> Sent: Monday, February 12, 2024 11:23:30 PM
> Subject: Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

> Timothy Pearson  writes:
>> - Original Message -
>>> From: "Segher Boessenkool" 
>>> To: "Timothy Pearson" 
>>> Cc: "linuxppc-dev" 
>>> Sent: Monday, February 12, 2024 12:23:22 PM
>>> Subject: Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions
>>
>>> On Mon, Feb 12, 2024 at 12:07:03PM -0600, Timothy Pearson wrote:
 > I have done it for *all* architectures some ten years ago.  Never found
 > any problem.
 
 That makes sense, what I mean by invasive is that we'd need buy-in from
 the other maintainers across all of the affected architectures.  Is that
 likely to occur?
>>> 
>>> I don't know.  Here is my PowerPC-specific patch, it's a bit older, it
>>> might not apply cleanly anymore, the changes needed should be obvious
>>> though:
>>> 
>>> 
>>> === 8< ===
>>> commit f16dfa5257eb14549ce22243fb2b465615085134
>>> Author: Segher Boessenkool 
>>> Date:   Sat May 3 03:48:06 2008 +0200
>>> 
>>>powerpc: Link vmlinux against libgcc.a
>>> 
>>> diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
>>> index b7212b619c52..0a2fac6ffc1c 100644
>>> --- a/arch/powerpc/Makefile
>>> +++ b/arch/powerpc/Makefile
>>> @@ -158,6 +158,9 @@ core-y  += 
>>> arch/powerpc/kernel/
>>> core-$(CONFIG_XMON)+= arch/powerpc/xmon/
>>> core-$(CONFIG_KVM) += arch/powerpc/kvm/
>>> 
>>> +LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
>>> +libs-y += $(LIBGCC)
>>> +
>>> drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/
>>> 
>>> # Default to zImage, override when needed
>>> === 8< ===
>>
>> OK.  PowerPC maintainers, how would you prefer to handle this?
> 
> I'll take the patch to add the functions for now. We can look into
> linking against libgcc as a future cleanup.

Sounds good.

 > There are better options than -Os, fwiw.  Some --param's give smaller
 > *and* faster kernels.  What exactly is best is heavily arch-dependent
 > though (as well as dependent on the application code, the kernel code in
 > this case) :-(
 
 I've been through this a few times, and -Os is the only option that makes
 things (just barely) fit unfortunately.
>>> 
>>> -O2 with appropriate inlining tuning beats -Os every day of the week,
>>> in my experience.
>>
>> On 6.6 it's 24MiB vs 40MiB, O2 vs. Os. :(
> 
> What compiler/config etc. are you using for that?

It's the kernel config that buildroot generates for skiroot -- I think a lot of 
the size difference is in some of the modules that we enable such as amdgpu, 
but haven't dug too deeply.  Once this firmware release is in beta (and 
therefore published publicly) I'll send over a link to the configs.

Thanks!


Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Michael Ellerman
Timothy Pearson  writes:
> - Original Message -
>> From: "Segher Boessenkool" 
>> To: "Timothy Pearson" 
>> Cc: "linuxppc-dev" 
>> Sent: Monday, February 12, 2024 12:23:22 PM
>> Subject: Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions
>
>> On Mon, Feb 12, 2024 at 12:07:03PM -0600, Timothy Pearson wrote:
>>> > I have done it for *all* architectures some ten years ago.  Never found
>>> > any problem.
>>> 
>>> That makes sense, what I mean by invasive is that we'd need buy-in from
>>> the other maintainers across all of the affected architectures.  Is that
>>> likely to occur?
>> 
>> I don't know.  Here is my PowerPC-specific patch, it's a bit older, it
>> might not apply cleanly anymore, the changes needed should be obvious
>> though:
>> 
>> 
>> === 8< ===
>> commit f16dfa5257eb14549ce22243fb2b465615085134
>> Author: Segher Boessenkool 
>> Date:   Sat May 3 03:48:06 2008 +0200
>> 
>>powerpc: Link vmlinux against libgcc.a
>> 
>> diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
>> index b7212b619c52..0a2fac6ffc1c 100644
>> --- a/arch/powerpc/Makefile
>> +++ b/arch/powerpc/Makefile
>> @@ -158,6 +158,9 @@ core-y  += 
>> arch/powerpc/kernel/
>> core-$(CONFIG_XMON)+= arch/powerpc/xmon/
>> core-$(CONFIG_KVM) += arch/powerpc/kvm/
>> 
>> +LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
>> +libs-y += $(LIBGCC)
>> +
>> drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/
>> 
>> # Default to zImage, override when needed
>> === 8< ===
>
> OK.  PowerPC maintainers, how would you prefer to handle this?

I'll take the patch to add the functions for now. We can look into
linking against libgcc as a future cleanup.

>>> > There are better options than -Os, fwiw.  Some --param's give smaller
>>> > *and* faster kernels.  What exactly is best is heavily arch-dependent
>>> > though (as well as dependent on the application code, the kernel code in
>>> > this case) :-(
>>> 
>>> I've been through this a few times, and -Os is the only option that makes
>>> things (just barely) fit unfortunately.
>> 
>> -O2 with appropriate inlining tuning beats -Os every day of the week,
>> in my experience.
>
> On 6.6 it's 24MiB vs 40MiB, O2 vs. Os. :(

What compiler/config etc. are you using for that?

I see almost no difference, though the defconfig (which uses -O2) is
actually smaller:

$ ls -l vmlinux.Os vmlinux.defconfig
-rwxr-xr-x. 1 michael michael 49936640 Feb 13 16:11 vmlinux.defconfig*
-rwxr-xr-x. 1 michael michael 50108392 Feb 13 16:14 vmlinux.Os*

cheers


Re: [PATCH] powerpc/ftrace: Ignore ftrace locations in exit text sections

2024-02-12 Thread Benjamin Gray
On Fri, 2024-02-09 at 13:29 +0530, Naveen N Rao wrote:
> Michael reported that we are seeing an ftrace bug on bootup when KASAN is
> enabled and we are using -fpatchable-function-entry:
> 
>     ftrace: allocating 47780 entries in 18 pages
>     ftrace-powerpc: 0xc20b3d5c: No module provided for non-
> kernel address
>     [ ftrace bug ]
>     ftrace faulted on modifying
>     [] 0xc20b3d5c
>     Initializing ftrace call sites
>     ftrace record flags: 0
>  (0)
>  expected tramp: c008cef4
>     [ cut here ]
>     WARNING: CPU: 0 PID: 0 at kernel/trace/ftrace.c:2180
> ftrace_bug+0x3c0/0x424
>     Modules linked in:
>     CPU: 0 PID: 0 Comm: swapper Not tainted 6.5.0-rc3-00120-
> g0f71dcfb4aef #860
>     Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw)
> 0x4e1202 0xf05 of:SLOF,HEAD hv:linux,kvm pSeries
>     NIP:  c03aa81c LR: c03aa818 CTR: 
>     REGS: c33cfab0 TRAP: 0700   Not tainted  (6.5.0-rc3-
> 00120-g0f71dcfb4aef)
>     MSR:  82021033   CR: 28028240 
> XER: 
>     CFAR: c02781a8 IRQMASK: 3
>     ...
>     NIP [c03aa81c] ftrace_bug+0x3c0/0x424
>     LR [c03aa818] ftrace_bug+0x3bc/0x424
>     Call Trace:
>  ftrace_bug+0x3bc/0x424 (unreliable)
>  ftrace_process_locs+0x5f4/0x8a0
>  ftrace_init+0xc0/0x1d0
>  start_kernel+0x1d8/0x484
> 
> With CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY=y and
> CONFIG_KASAN=y, the compiler emits nops in the functions that it
> generates for registering and unregistering global variables (unlike
> with -pg and -mprofile-kernel, where calls to _mcount() are not
> generated in those functions). Those functions then end up in INIT_TEXT
> and EXIT_TEXT respectively. We don't expect to see any profiled
> functions in EXIT_TEXT, so ftrace_init_nop() assumes that all addresses
> that aren't in the core kernel text belong to a module. Since these
> functions do not match that criterion, we see the above bug.
> 
> Address this by having ftrace ignore all locations in the text exit
> sections of vmlinux.
> 
> Fixes: 0f71dcfb4aef ("powerpc/ftrace: Add support for -fpatchable-
> function-entry")
> Cc: sta...@vger.kernel.org
> Reported-by: Michael Ellerman 
> Signed-off-by: Naveen N Rao 
> ---
>  arch/powerpc/include/asm/ftrace.h   |  9 +
>  arch/powerpc/include/asm/sections.h |  1 +
>  arch/powerpc/kernel/trace/ftrace.c  | 12 
>  arch/powerpc/kernel/vmlinux.lds.S   |  2 ++
>  4 files changed, 16 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/ftrace.h
> b/arch/powerpc/include/asm/ftrace.h
> index 1ebd2ca97f12..d6babd083202 100644
> --- a/arch/powerpc/include/asm/ftrace.h
> +++ b/arch/powerpc/include/asm/ftrace.h
> @@ -20,14 +20,7 @@
>  #ifndef __ASSEMBLY__
>  extern void _mcount(void);
>  
> -static inline unsigned long ftrace_call_adjust(unsigned long addr)
> -{
> - if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
> - addr += MCOUNT_INSN_SIZE;
> -
> - return addr;
> -}
> -
> +unsigned long ftrace_call_adjust(unsigned long addr);
>  unsigned long prepare_ftrace_return(unsigned long parent, unsigned
> long ip,
>       unsigned long sp);
>  
> diff --git a/arch/powerpc/include/asm/sections.h
> b/arch/powerpc/include/asm/sections.h
> index ea26665f82cf..d389dcecdb0b 100644
> --- a/arch/powerpc/include/asm/sections.h
> +++ b/arch/powerpc/include/asm/sections.h
> @@ -14,6 +14,7 @@ typedef struct func_desc func_desc_t;
>  
>  extern char __head_end[];
>  extern char __srwx_boundary[];
> +extern char _sexittext[], _eexittext[];
>  
>  /* Patch sites */
>  extern s32 patch__call_flush_branch_caches1;
> diff --git a/arch/powerpc/kernel/trace/ftrace.c
> b/arch/powerpc/kernel/trace/ftrace.c
> index 82010629cf88..b5efd8d7bc01 100644
> --- a/arch/powerpc/kernel/trace/ftrace.c
> +++ b/arch/powerpc/kernel/trace/ftrace.c
> @@ -27,10 +27,22 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #define  NUM_FTRACE_TRAMPS   2
>  static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];
>  
> +unsigned long ftrace_call_adjust(unsigned long addr)
> +{
> + if (addr >= (unsigned long)_sexittext && addr < (unsigned
> long)_eexittext)
> + return 0;
> +
> + if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
> + addr += MCOUNT_INSN_SIZE;
> +
> + return addr;
> +}
> +
>  static ppc_inst_t ftrace_create_branch_inst(unsigned long ip,
> unsigned long addr, int link)
>  {
>   ppc_inst_t op;
> diff --git a/arch/powerpc/kernel/vmlinux.lds.S
> b/arch/powerpc/kernel/vmlinux.lds.S
> index 1c5970df3233..9c376ae6857d 100644
> --- a/arch/powerpc/kernel/vmlinux.lds.S
> +++ b/arch/powerpc/kernel/vmlinux.lds.S
> @@ -281,7 +281,9 @@ SECTIONS
>    * to deal with references from __bug_table
>    */
>   .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
> +  

[PATCH] powerpc/code-patching: Disable KASAN in __patch_instructions()

2024-02-12 Thread Benjamin Gray
The memset/memcpy functions are by default instrumented by KASAN, which
complains about user memory access when using a poking page in
userspace.

Using a userspace address is expected though, so don't instrument with
KASAN for this function.

Signed-off-by: Benjamin Gray 

---

I tried to replace the memsetN calls with __memsetN, but we appear to
disable the non-instrumented variants of these when KASAN is enabled.
Christophe might you know more here?

The cost of just suppressing reports for this section shouldn't be too
relevant; KASAN detects the access, but exits before it starts preparing
the report itself. So it's just like any other KASAN instrumented
function for the most part.
---
 arch/powerpc/lib/code-patching.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index c6ab46156cda..24989594578a 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -3,6 +3,7 @@
  *  Copyright 2008 Michael Ellerman, IBM Corporation.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -377,6 +378,7 @@ static int __patch_instructions(u32 *patch_addr, u32 *code, 
size_t len, bool rep
unsigned long start = (unsigned long)patch_addr;
 
/* Repeat instruction */
+   kasan_disable_current();
if (repeat_instr) {
ppc_inst_t instr = ppc_inst_read(code);
 
@@ -392,6 +394,7 @@ static int __patch_instructions(u32 *patch_addr, u32 *code, 
size_t len, bool rep
} else {
memcpy(patch_addr, code, len);
}
+   kasan_enable_current();
 
smp_wmb();  /* smp write barrier */
flush_icache_range(start, start + len);
-- 
2.43.0
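
A simplified view of the mechanism being relied on here, from a reading of
include/linux/kasan.h (please check the tree for the authoritative
definitions): the disable/enable pair bumps a per-task depth counter that the
report path checks, so the accesses in between are still instrumented but
never reported.

    kasan_disable_current();        /* roughly: current->kasan_depth++ */
    memcpy(patch_addr, code, len);  /* user-address access is detected but not reported */
    kasan_enable_current();         /* roughly: current->kasan_depth-- */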



Re: [PATCH v15 2/5] crash: add a new kexec flag for hotplug support

2024-02-12 Thread Baoquan He
On 02/12/24 at 07:27pm, Sourabh Jain wrote:
> Hello Baoquan,
> 
> On 05/02/24 08:40, Baoquan He wrote:
> > Hi Sourabh,
> > 
..
> > > diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> > > index 802052d9c64b..7880d74dc5c4 100644
> > > --- a/include/linux/kexec.h
> > > +++ b/include/linux/kexec.h
> > > @@ -317,8 +317,8 @@ struct kimage {
> > >   /* If set, we are using file mode kexec syscall */
> > >   unsigned int file_mode:1;
> > >   #ifdef CONFIG_CRASH_HOTPLUG
> > > - /* If set, allow changes to elfcorehdr of kexec_load'd image */
> > > - unsigned int update_elfcorehdr:1;
> > > + /* If set, allow changes to kexec segments of kexec_load'd image */
> > The code comment doesn't reflect the usage of the flag.
> I should have updated the comment to indicate that this flag is for both
> system calls.
> More comments below.
> 
> > You set it too
> > when it's kexec_file_load. Speaking of this, I do wonder why you need
> > set it too for kexec_file_load,
> If we do this, one can just access image->hotplug_support to find the
> hotplug support for the currently loaded kdump image without bothering
> about which system call was used to load it.
> 
> > and why we have
> > arch_crash_hotplug_support(), then crash_check_hotplug_support() both of
> > which have the same effect.
> 
> arch_crash_hotplug_support(): This function processes the kexec flags and
> finds the hotplug support for the kdump image. Based on the return value
> of this function, the image->hotplug_support attribute is set.
> 
> Now, once the kdump image is loaded, we no longer have access to the kexec
> flags. Therefore, crash_check_hotplug_support() simply returns the value of
> image->hotplug_support when user space accesses the following sysfs files:
> /sys/devices/system/[cpu|memory]/crash_hotplug.
> 
> To keep things simple, I have introduced two functions: one function
> processes the kexec flags and determines the hotplug support for the image
> being loaded. The other function simply accesses image->hotplug_support and
> advertises CPU/memory hotplug support to userspace.

From the function names and their functionality, they seem to be
duplicated, even though the internal details differ. This could bring a
little confusion when reading the code. It's fine, we can refactor them
if needed in the future. So let's keep the patch as it is.
Thanks.

> 
> > 
> > > + unsigned int hotplug_support:1;
> > >   #endif
> > >   #ifdef ARCH_HAS_KIMAGE_ARCH
> > > @@ -396,9 +396,10 @@ bool kexec_load_permitted(int kexec_image_type);
> > >   /* List of defined/legal kexec flags */
> > >   #ifndef CONFIG_KEXEC_JUMP
> > > -#define KEXEC_FLAGS(KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR)
> > > +#define KEXEC_FLAGS(KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | 
> > > KEXEC_CRASH_HOTPLUG_SUPPORT)
> > >   #else
> > > -#define KEXEC_FLAGS(KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | 
> > > KEXEC_UPDATE_ELFCOREHDR)
> > > +#define KEXEC_FLAGS(KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | 
> > > KEXEC_UPDATE_ELFCOREHDR | \
> > > + KEXEC_CRASH_HOTPLUG_SUPPORT)
> > >   #endif
> > >   /* List of defined/legal kexec file flags */
> > > @@ -486,14 +487,18 @@ static inline void arch_kexec_pre_free_pages(void 
> > > *vaddr, unsigned int pages) {
> > >   static inline void arch_crash_handle_hotplug_event(struct kimage 
> > > *image, void *arg) { }
> > >   #endif
> > > -int crash_check_update_elfcorehdr(void);
> > > +int crash_check_hotplug_support(void);
> > > -#ifndef crash_hotplug_cpu_support
> > > -static inline int crash_hotplug_cpu_support(void) { return 0; }
> > > -#endif
> > > +#ifndef arch_crash_hotplug_support
> > > +static inline int arch_crash_hotplug_support(struct kimage *image, 
> > > unsigned long kexec_flags)
> > > +{
> > > -#ifndef crash_hotplug_memory_support
> > > -static inline int crash_hotplug_memory_support(void) { return 0; }
> > > +#ifdef CONFIG_KEXEC_FILE
> > > + if (image->file_mode)
> > > + return 1;
> > > +#endif
> > > + return kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT;
> > > +}
> > >   #endif
> > >   #ifndef crash_get_elfcorehdr_size
..
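
A bare-bones sketch of the two-step split being described (simplified and
illustrative only; the real crash_check_hotplug_support() in the series
presumably also takes the kexec lock before touching the image pointer):

    /* at load time: evaluate the flags once and cache the result */
    image->hotplug_support = arch_crash_hotplug_support(image, kexec_flags);

    /* later, from the sysfs handlers: only the cached value is consulted */
    int crash_check_hotplug_support(void)
    {
            return kexec_crash_image ? kexec_crash_image->hotplug_support : 0;
    }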



Re: [PATCH] powerpc/ftrace: Ignore ftrace locations in exit text sections

2024-02-12 Thread Michael Ellerman
Christophe Leroy  writes:
> Le 09/02/2024 à 08:59, Naveen N Rao a écrit :
>> Michael reported that we are seeing ftrace bug on bootup when KASAN is
>> enabled, and if we are using -fpatchable-function-entry:
>> 
...
>> diff --git a/arch/powerpc/include/asm/sections.h 
>> b/arch/powerpc/include/asm/sections.h
>> index ea26665f82cf..d389dcecdb0b 100644
>> --- a/arch/powerpc/include/asm/sections.h
>> +++ b/arch/powerpc/include/asm/sections.h
>> @@ -14,6 +14,7 @@ typedef struct func_desc func_desc_t;
>>   
>>   extern char __head_end[];
>>   extern char __srwx_boundary[];
>> +extern char _sexittext[], _eexittext[];
>
> Should we try to at least use the same symbols as others, or best try to 
> move this into include/asm-generic/sections.h, just like inittext ?
>
> $ git grep exittext
> arch/arm64/include/asm/sections.h:extern char __exittext_begin[], 
> __exittext_end[];
> arch/arm64/kernel/patching.c:   addr >= (unsigned 
> long)__exittext_begin &&
> arch/arm64/kernel/patching.c:   addr < (unsigned 
> long)__exittext_end;
> arch/arm64/kernel/vmlinux.lds.S:__exittext_begin = .;
> arch/arm64/kernel/vmlinux.lds.S:__exittext_end = .;
> arch/riscv/include/asm/sections.h:extern char __exittext_begin[], 
> __exittext_end[];
> arch/riscv/kernel/patch.c:static inline bool 
> is_kernel_exittext(uintptr_t addr)
> arch/riscv/kernel/patch.c:  addr >= 
> (uintptr_t)__exittext_begin &&
> arch/riscv/kernel/patch.c:  addr < (uintptr_t)__exittext_end;
> arch/riscv/kernel/patch.c:  if (core_kernel_text(uintaddr) || 
> is_kernel_exittext(uintaddr))
> arch/riscv/kernel/vmlinux-xip.lds.S:__exittext_begin = .;
> arch/riscv/kernel/vmlinux-xip.lds.S:__exittext_end = .;
> arch/riscv/kernel/vmlinux.lds.S:__exittext_begin = .;
> arch/riscv/kernel/vmlinux.lds.S:__exittext_end = .;

I'll change it to use __exittext_begin/end.

>> diff --git a/arch/powerpc/kernel/trace/ftrace.c 
>> b/arch/powerpc/kernel/trace/ftrace.c
>> index 82010629cf88..b5efd8d7bc01 100644
>> --- a/arch/powerpc/kernel/trace/ftrace.c
>> +++ b/arch/powerpc/kernel/trace/ftrace.c
>> @@ -27,10 +27,22 @@
>>   #include 
>>   #include 
>>   #include 
>> +#include 
>>   
>>   #defineNUM_FTRACE_TRAMPS   2
>>   static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];
>>   
>> +unsigned long ftrace_call_adjust(unsigned long addr)
>> +{
>> +if (addr >= (unsigned long)_sexittext && addr < (unsigned 
>> long)_eexittext)
>> +return 0;
>
> Then arm64 has a function called is_exit_text() and riscv has 
> is_kernel_exittext(). Can we refactor ?

I'd like to get the fix in and backported, so I'll take it as-is but
with the section names changed to match the other arches.

We can do further refactoring on top.

cheers
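
For reference, the kind of shared helper being discussed would presumably look
much like the arm64/riscv versions quoted above (a sketch, assuming the
generic __exittext_begin/__exittext_end names end up in a common header; not
an actual patch):

    extern char __exittext_begin[], __exittext_end[];

    static inline bool is_kernel_exittext(unsigned long addr)
    {
            return addr >= (unsigned long)__exittext_begin &&
                   addr <  (unsigned long)__exittext_end;
    }

ftrace_call_adjust() could then return 0 for any address where
is_kernel_exittext() is true, which is what the patch open-codes today.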


Re: [DMARC error][SPF error] Re: [PATCH v4 00/10] devm_led_classdev_register() usage problem

2024-02-12 Thread George Stark

Hello Andy

On 2/12/24 12:53, Andy Shevchenko wrote:

On Mon, Feb 12, 2024 at 1:52 AM George Stark  wrote:

I haven't lost hope for the devm_mutex thing and keep pinging those guys
from time to time.


I don't understand. According to the v4 thread, Christophe proposed how
the patch should look. What you need is to incorporate an updated
version into your series. Am I wrong?


We agreed that the effective way of implementing devm_mutex_init() is in
mutex.h, using a forward declaration of struct device.
The only inconvenient thing is that in mutex.h, mutex_init() is declared
after mutex_destroy(), so we would have to use the #ifdef
CONFIG_DEBUG_MUTEXES condition twice. Waiman Long proposed a great cleanup
patch [1] that eliminates the need to double the #ifdef. That patch was
reviewed a bit but is still unapplied (for nearly 2 months now). I'm still
trying to contact the mutex.h maintainers but there has been no feedback yet.


[1] 
https://lore.kernel.org/lkml/20231216013656.1382213-2-long...@redhat.com/T/#m795b230d662c1debb28463ad721ddba5b384340a
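
For context, a rough sketch of the forward-declaration approach mentioned
above (illustrative only; the names follow the proposal being discussed, not
a merged API):

    /* in mutex.h -- only a forward declaration is needed, not device.h */
    struct device;

    #ifdef CONFIG_DEBUG_MUTEXES
    /* debug builds need a real devm action to call mutex_destroy() */
    int devm_mutex_init(struct device *dev, struct mutex *lock);
    #else
    static inline int devm_mutex_init(struct device *dev, struct mutex *lock)
    {
            mutex_init(lock);
            return 0;   /* mutex_destroy() is a no-op without CONFIG_DEBUG_MUTEXES */
    }
    #endif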






Sure I can single out the fix-only patch I'll do it tomorrow.


I believe it can be handled without issuing it separately. `b4` tool
is capable of selective choices. It was rather Q to Lee if he can/want
to apply it right away.


Oh ok, that would be great.




On 2/9/24 20:11, Andy Shevchenko wrote:

On Thu, Dec 21, 2023 at 03:11:11PM +, Lee Jones wrote:

On Thu, 14 Dec 2023, George Stark wrote:


This patch series fixes the problem of devm_led_classdev_register() misuse.

The basic problem is described in [1]. In short, when devm_led_classdev_register()
is used, led_classdev_unregister() is called after the driver's remove() callback.
led_classdev_unregister() calls the driver's brightness_set callback, and that
callback may use resources which were already destroyed in the driver's remove().

After discussion with the maintainers [2] [3] we decided:
1) don't touch the LED subsystem core code and don't remove led_set_brightness()
   from it, but fix the drivers instead
2) don't use devm_led_classdev_unregister()

So the solution is to use devm wrappers for all resources the driver's
brightness_set() depends on, and to introduce a dedicated devm wrapper for
mutex, as it is an often-used resource.

[1] 
https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/
[2] 
https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/#mc132b9b350fa51931b4fcfe14705d9f06e91421f
[3] 
https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/#mdbf572a85c33f869a553caf986b6228bb65c8383


...


FYI: I'll conduct my review once the locking side is settled.


To reduce the burden, can you apply the first one? It's a fix.




--
Best regards
George


Re: [PATCH] powerpc/kasan: Limit KASAN thread size increase to 32KB

2024-02-12 Thread Benjamin Gray
Don't know why the previous mail went blank.

On Mon, 2024-02-12 at 17:42 +1100, Michael Ellerman wrote:
> KASAN is seen to increase stack usage, to the point that it was
> reported
> to lead to stack overflow on some 32-bit machines (see link).
> 
> To avoid overflows the stack size was doubled for KASAN builds in
> commit 3e8635fb2e07 ("powerpc/kasan: Force thread size increase with
> KASAN").
> 
> However with a 32KB stack size to begin with, the doubling leads to a
> 64KB stack, which causes build errors:
>   arch/powerpc/kernel/switch.S:249: Error: operand out of range
> (0xfe50 is not between 0x8000 and
> 0x7fff)
> 
> Although the asm could be reworked, in practice a 32KB stack seems
> sufficient even for KASAN builds - the additional usage seems to be
> in
> the 2-3KB range for a 64-bit KASAN build.
> 
> So only increase the stack for KASAN if the stack size is < 32KB.
> 
> Link:
> https://lore.kernel.org/linuxppc-dev/bug-207129-206...@https.bugzilla.kernel.org%2F/
> Reported-by: Spoorthy 
> Reported-by: Benjamin Gray 
> Fixes: 18f14afe2816 ("powerpc/64s: Increase default stack size to
> 32KB")
> Signed-off-by: Michael Ellerman 

Reviewed-by: Benjamin Gray 

> ---
>  arch/powerpc/include/asm/thread_info.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/thread_info.h
> b/arch/powerpc/include/asm/thread_info.h
> index bf5dde1a4114..15c5691dd218 100644
> --- a/arch/powerpc/include/asm/thread_info.h
> +++ b/arch/powerpc/include/asm/thread_info.h
> @@ -14,7 +14,7 @@
>  
>  #ifdef __KERNEL__
>  
> -#ifdef CONFIG_KASAN
> +#if defined(CONFIG_KASAN) && CONFIG_THREAD_SHIFT < 15
>  #define MIN_THREAD_SHIFT (CONFIG_THREAD_SHIFT + 1)
>  #else
>  #define MIN_THREAD_SHIFT CONFIG_THREAD_SHIFT



Re: [PATCH] powerpc/kasan: Limit KASAN thread size increase to 32KB

2024-02-12 Thread Benjamin Gray


Re: [PATCH v5 03/25] mm: Make pte_next_pfn() a wrapper around pte_advance_pfn()

2024-02-12 Thread Ryan Roberts
On 12/02/2024 14:29, David Hildenbrand wrote:
> On 12.02.24 15:10, Ryan Roberts wrote:
>> On 12/02/2024 12:14, David Hildenbrand wrote:
>>> On 02.02.24 09:07, Ryan Roberts wrote:
 The goal is to be able to advance a PTE by an arbitrary number of PFNs.
 So introduce a new API that takes a nr param.

 We are going to remove pte_next_pfn() and replace it with
 pte_advance_pfn(). As a first step, implement pte_next_pfn() as a
 wrapper around pte_advance_pfn() so that we can incrementally switch the
 architectures over. Once all arches are moved over, we will change all
 the core-mm callers to call pte_advance_pfn() directly and remove the
 wrapper.

 Signed-off-by: Ryan Roberts 
 ---
    include/linux/pgtable.h | 8 +++-
    1 file changed, 7 insertions(+), 1 deletion(-)

 diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
 index 5e7eaf8f2b97..815d92dcb96b 100644
 --- a/include/linux/pgtable.h
 +++ b/include/linux/pgtable.h
 @@ -214,9 +214,15 @@ static inline int pmd_dirty(pmd_t pmd)
        #ifndef pte_next_pfn
 +#ifndef pte_advance_pfn
 +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
 +{
 +    return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
 +}
 +#endif
    static inline pte_t pte_next_pfn(pte_t pte)
    {
 -    return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
 +    return pte_advance_pfn(pte, 1);
    }
    #endif
    
>>>
>>> I do wonder if we simply want to leave pte_next_pfn() around? Especially 
>>> patch
>>> #4, #6 don't really benefit from the change? So are the other set_ptes()
>>> implementations.
>>>
>>> That is, only convert all pte_next_pfn()->pte_advance_pfn(), and leave a
>>> pte_next_pfn() macro in place.
>>>
>>> Any downsides to that?
>>
>> The downside is just having multiple functions that effectively do the same
>> thing. Personally I think its cleaner and easier to understand the code with
>> just one generic function which we pass 1 to it where we only want to 
>> advance by
>> 1. In the end, there are only a couple of places where pte_advance_pfn(1) is
>> used, so doesn't really seem valuable to me to maintain a specialization.
> 
> Well, not really functions, just a macro. Like we have set_pte_at() 
> translating
> to set_ptes().
> 
> Arguably, we have more callers of set_pte_at().
> 
> "Easier to understand", I don't know. :)
> 
>>
>> Unless you feel strongly that we need to keep pte_next_pfn() then I'd prefer 
>> to
>> leave it as I've done in this series.
> 
> Well, it makes you patch set shorter and there is less code churn.
> 
> So personally, I'd just leave pte_next_pfn() in there. But whatever you 
> prefer,
> not the end of the world.

I thought about this a bit more and remembered that I'm the apprentice so I've
changed it as you suggested.



Re: [PATCH v5 19/25] arm64/mm: Wire up PTE_CONT for user mappings

2024-02-12 Thread Ryan Roberts
[...]

 +static inline bool mm_is_user(struct mm_struct *mm)
 +{
 +  /*
 +   * Don't attempt to apply the contig bit to kernel mappings, because
 +   * dynamically adding/removing the contig bit can cause page faults.
 +   * These racing faults are ok for user space, since they get serialized
 +   * on the PTL. But kernel mappings can't tolerate faults.
 +   */
 +  return mm != &init_mm;
 +}
>>>
>>> We also have the efi_mm as a non-user mm, though I don't think we manipulate
>>> that while it is live, and I'm not sure if that needs any special handling.
>>
>> Well we never need this function in the hot (order-0 folio) path, so I think 
>> I
>> could add a check for efi_mm here with performance implication. It's probably
>> safest to explicitly exclude it? What do you think?
> 
> Oops: This should have read "I think I could add a check for efi_mm here
> *without* performance implication"

It turns out that efi_mm is only defined when CONFIG_EFI is enabled. I can do 
this:

return mm != &init_mm && (!IS_ENABLED(CONFIG_EFI) || mm != &efi_mm);

Is that acceptable? This is my preference, but nothing else outside of efi
references this symbol currently.

Or perhaps I can convince myself that its safe to treat efi_mm like userspace.
There are a couple of things that need to be garanteed for it to be safe:

  - The PFNs of present ptes either need to have an associated struct page or
need to have the PTE_SPECIAL bit set (either pte_mkspecial() or
pte_mkdevmap())

  - Live mappings must either be static (no changes that could cause fold/unfold
while live) or the system must be able to tolerate a temporary fault

Mark suggests efi_mm is not manipulated while live, so that meets the latter
requirement, but I'm not sure about the former?

Thanks,
Ryan



[PATCH v2 5/5] powerpc: ibmebus: make ibmebus_bus_type const

2024-02-12 Thread Ricardo B. Marliere
Since commit d492cc2573a0 ("driver core: device.h: make struct
bus_type a const *"), the driver core can properly handle a constant
struct bus_type, so move the ibmebus_bus_type variable to be a constant
structure as well, placing it into read-only memory which cannot be
modified at runtime.

Cc: Greg Kroah-Hartman 
Suggested-by: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 
---
 arch/powerpc/include/asm/ibmebus.h   | 2 +-
 arch/powerpc/platforms/pseries/ibmebus.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/ibmebus.h 
b/arch/powerpc/include/asm/ibmebus.h
index 6f33253a364a..46fe406f461c 100644
--- a/arch/powerpc/include/asm/ibmebus.h
+++ b/arch/powerpc/include/asm/ibmebus.h
@@ -48,7 +48,7 @@
 
 struct platform_driver;
 
-extern struct bus_type ibmebus_bus_type;
+extern const struct bus_type ibmebus_bus_type;
 
 int ibmebus_register_driver(struct platform_driver *drv);
 void ibmebus_unregister_driver(struct platform_driver *drv);
diff --git a/arch/powerpc/platforms/pseries/ibmebus.c 
b/arch/powerpc/platforms/pseries/ibmebus.c
index 998e3aff2457..b401282727a4 100644
--- a/arch/powerpc/platforms/pseries/ibmebus.c
+++ b/arch/powerpc/platforms/pseries/ibmebus.c
@@ -55,7 +55,7 @@ static struct device ibmebus_bus_device = { /* fake "parent" 
device */
.init_name = "ibmebus",
 };
 
-struct bus_type ibmebus_bus_type;
+const struct bus_type ibmebus_bus_type;
 
 /* These devices will automatically be added to the bus during init */
 static const struct of_device_id ibmebus_matches[] __initconst = {
@@ -432,7 +432,7 @@ static int ibmebus_bus_modalias(const struct device *dev, 
struct kobj_uevent_env
return of_device_uevent_modalias(dev, env);
 }
 
-struct bus_type ibmebus_bus_type = {
+const struct bus_type ibmebus_bus_type = {
.name  = "ibmebus",
.uevent= ibmebus_bus_modalias,
.bus_groups = ibmbus_bus_groups,

-- 
2.43.0



[PATCH v2 4/5] powerpc: pmac: make macio_bus_type const

2024-02-12 Thread Ricardo B. Marliere
Since commit d492cc2573a0 ("driver core: device.h: make struct
bus_type a const *"), the driver core can properly handle a constant
struct bus_type, so move the macio_bus_type variable to be a constant
structure as well, placing it into read-only memory which cannot be
modified at runtime.

Cc: Greg Kroah-Hartman 
Suggested-by: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 
---
 arch/powerpc/include/asm/macio.h | 2 +-
 drivers/macintosh/macio_asic.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/macio.h b/arch/powerpc/include/asm/macio.h
index 3a07c62973aa..ab9608e63e40 100644
--- a/arch/powerpc/include/asm/macio.h
+++ b/arch/powerpc/include/asm/macio.h
@@ -6,7 +6,7 @@
 #include 
 #include 
 
-extern struct bus_type macio_bus_type;
+extern const struct bus_type macio_bus_type;
 
 /* MacIO device driver is defined later */
 struct macio_driver;
diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c
index a5ee8f736a8e..565f1e21ff7d 100644
--- a/drivers/macintosh/macio_asic.c
+++ b/drivers/macintosh/macio_asic.c
@@ -136,7 +136,7 @@ static int macio_device_modalias(const struct device *dev, 
struct kobj_uevent_en
 
 extern const struct attribute_group *macio_dev_groups[];
 
-struct bus_type macio_bus_type = {
+const struct bus_type macio_bus_type = {
.name   = "macio",
.match  = macio_bus_match,
.uevent = macio_device_modalias,

-- 
2.43.0



[PATCH v2 3/5] powerpc: mpic: make mpic_subsys const

2024-02-12 Thread Ricardo B. Marliere
Since commit d492cc2573a0 ("driver core: device.h: make struct
bus_type a const *"), the driver core can properly handle a constant
struct bus_type, so move the mpic_subsys variable to be a constant
structure as well, placing it into read-only memory which cannot be
modified at runtime.

Cc: Greg Kroah-Hartman 
Suggested-by: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 
---
 arch/powerpc/include/asm/mpic.h | 2 +-
 arch/powerpc/sysdev/mpic.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h
index 58353c5bd3fb..0c03a98986cd 100644
--- a/arch/powerpc/include/asm/mpic.h
+++ b/arch/powerpc/include/asm/mpic.h
@@ -336,7 +336,7 @@ struct mpic
 #endif
 };
 
-extern struct bus_type mpic_subsys;
+extern const struct bus_type mpic_subsys;
 
 /*
  * MPIC flags (passed to mpic_alloc)
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index dabbdd356664..d94cf36b0f65 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -49,7 +49,7 @@
 #define DBG(fmt...)
 #endif
 
-struct bus_type mpic_subsys = {
+const struct bus_type mpic_subsys = {
.name = "mpic",
.dev_name = "mpic",
 };

-- 
2.43.0



[PATCH v2 2/5] powerpc: vio: make vio_bus_type const

2024-02-12 Thread Ricardo B. Marliere
Since commit d492cc2573a0 ("driver core: device.h: make struct
bus_type a const *"), the driver core can properly handle a constant
struct bus_type, so move the vio_bus_type variable to be a constant
structure as well, placing it into read-only memory which cannot be
modified at runtime.

Cc: Greg Kroah-Hartman 
Suggested-by: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 
---
 arch/powerpc/include/asm/vio.h   | 2 +-
 arch/powerpc/platforms/pseries/vio.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/vio.h b/arch/powerpc/include/asm/vio.h
index cc9b787627ad..6faf2a931755 100644
--- a/arch/powerpc/include/asm/vio.h
+++ b/arch/powerpc/include/asm/vio.h
@@ -39,7 +39,7 @@
  */
 #define VIO_CMO_MIN_ENT 1562624
 
-extern struct bus_type vio_bus_type;
+extern const struct bus_type vio_bus_type;
 
 struct iommu_table;
 
diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index 6c58824190a2..90ff85c879bf 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1615,7 +1615,7 @@ static struct attribute *vio_cmo_dev_attrs[] = {
 };
 ATTRIBUTE_GROUPS(vio_cmo_dev);
 
-struct bus_type vio_bus_type = {
+const struct bus_type vio_bus_type = {
.name = "vio",
.dev_groups = vio_cmo_dev_groups,
.bus_groups = vio_bus_groups,
@@ -1634,7 +1634,7 @@ static struct attribute *vio_dev_attrs[] = {
 };
 ATTRIBUTE_GROUPS(vio_dev);
 
-struct bus_type vio_bus_type = {
+const struct bus_type vio_bus_type = {
.name = "vio",
.dev_groups = vio_dev_groups,
.uevent = vio_hotplug,

-- 
2.43.0



[PATCH v2 1/5] powerpc: vio: move device attributes into a new ifdef

2024-02-12 Thread Ricardo B. Marliere
In order to make the distinction of the vio_bus_type variable based on
CONFIG_PPC_SMLPAR more explicit, move the required structs into a new
ifdef block. This is needed in order to make vio_bus_type const, and
because the distinction is made explicit, there is no need to set the
fields within the vio_cmo_sysfs_init function.

Cc: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 
---
 arch/powerpc/platforms/pseries/vio.c | 59 +---
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index 2dc9cbc4bcd8..6c58824190a2 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -991,18 +991,6 @@ static DEVICE_ATTR_RO(cmo_allocated);
 static DEVICE_ATTR_RW(cmo_desired);
 static DEVICE_ATTR_RW(cmo_allocs_failed);
 
-static struct attribute *vio_cmo_dev_attrs[] = {
-   &dev_attr_name.attr,
-   &dev_attr_devspec.attr,
-   &dev_attr_modalias.attr,
-   &dev_attr_cmo_entitled.attr,
-   &dev_attr_cmo_allocated.attr,
-   &dev_attr_cmo_desired.attr,
-   &dev_attr_cmo_allocs_failed.attr,
-   NULL,
-};
-ATTRIBUTE_GROUPS(vio_cmo_dev);
-
 /* sysfs bus functions and data structures for CMO */
 
 #define viobus_cmo_rd_attr(name)\
@@ -1062,11 +1050,7 @@ static struct attribute *vio_bus_attrs[] = {
 };
 ATTRIBUTE_GROUPS(vio_bus);
 
-static void __init vio_cmo_sysfs_init(void)
-{
-   vio_bus_type.dev_groups = vio_cmo_dev_groups;
-   vio_bus_type.bus_groups = vio_bus_groups;
-}
+static void __init vio_cmo_sysfs_init(void) { }
 #else /* CONFIG_PPC_SMLPAR */
 int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
 void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
@@ -1584,14 +1568,6 @@ static ssize_t modalias_show(struct device *dev, struct 
device_attribute *attr,
 }
 static DEVICE_ATTR_RO(modalias);
 
-static struct attribute *vio_dev_attrs[] = {
-   &dev_attr_name.attr,
-   &dev_attr_devspec.attr,
-   &dev_attr_modalias.attr,
-   NULL,
-};
-ATTRIBUTE_GROUPS(vio_dev);
-
 void vio_unregister_device(struct vio_dev *viodev)
 {
device_unregister(&viodev->dev);
@@ -1626,6 +1602,38 @@ static int vio_hotplug(const struct device *dev, struct 
kobj_uevent_env *env)
return 0;
 }
 
+#ifdef CONFIG_PPC_SMLPAR
+static struct attribute *vio_cmo_dev_attrs[] = {
+   &dev_attr_name.attr,
+   &dev_attr_devspec.attr,
+   &dev_attr_modalias.attr,
+   &dev_attr_cmo_entitled.attr,
+   &dev_attr_cmo_allocated.attr,
+   &dev_attr_cmo_desired.attr,
+   &dev_attr_cmo_allocs_failed.attr,
+   NULL,
+};
+ATTRIBUTE_GROUPS(vio_cmo_dev);
+
+struct bus_type vio_bus_type = {
+   .name = "vio",
+   .dev_groups = vio_cmo_dev_groups,
+   .bus_groups = vio_bus_groups,
+   .uevent = vio_hotplug,
+   .match = vio_bus_match,
+   .probe = vio_bus_probe,
+   .remove = vio_bus_remove,
+   .shutdown = vio_bus_shutdown,
+};
+#else /* CONFIG_PPC_SMLPAR */
+static struct attribute *vio_dev_attrs[] = {
+   &dev_attr_name.attr,
+   &dev_attr_devspec.attr,
+   &dev_attr_modalias.attr,
+   NULL,
+};
+ATTRIBUTE_GROUPS(vio_dev);
+
 struct bus_type vio_bus_type = {
.name = "vio",
.dev_groups = vio_dev_groups,
@@ -1635,6 +1643,7 @@ struct bus_type vio_bus_type = {
.remove = vio_bus_remove,
.shutdown = vio_bus_shutdown,
 };
+#endif /* CONFIG_PPC_SMLPAR */
 
 /**
  * vio_get_attribute: - get attribute for virtual device

-- 
2.43.0



[PATCH v2 0/5] powerpc: struct bus_type cleanup

2024-02-12 Thread Ricardo B. Marliere
This series is part of an effort to clean up the users of the driver
core, as can be seen in many recent patches authored by Greg across the
tree (e.g. [1]). Patch 1/5 is a prerequisite to 2/5, but the others have
no dependency. They were built without warnings using bootlin's
powerpc64le-power8--glibc--stable-2023.11-1 toolchain.

---
[1]: 
https://lore.kernel.org/lkml/?q=f%3Agregkh%40linuxfoundation.org+s%3A%22make%22+and+s%3A%22const%22

Cc: Greg Kroah-Hartman 
Signed-off-by: Ricardo B. Marliere 

---
Changes in v2:
- Added a new patch to make macio_bus_type const.
- Improved changelogs to remove the word "Now".
- Fixed a build error: 
https://lore.kernel.org/oe-kbuild-all/202402102142.uphikeqw-...@intel.com/
- Link to v1: 
https://lore.kernel.org/r/20240209-bus_cleanup-powerpc2-v1-0-79a56dcae...@marliere.net

---
Ricardo B. Marliere (5):
  powerpc: vio: move device attributes into a new ifdef
  powerpc: vio: make vio_bus_type const
  powerpc: mpic: make mpic_subsys const
  powerpc: pmac: make macio_bus_type const
  powerpc: ibmebus: make ibmebus_bus_type const

 arch/powerpc/include/asm/ibmebus.h   |  2 +-
 arch/powerpc/include/asm/macio.h |  2 +-
 arch/powerpc/include/asm/mpic.h  |  2 +-
 arch/powerpc/include/asm/vio.h   |  2 +-
 arch/powerpc/platforms/pseries/ibmebus.c |  4 +--
 arch/powerpc/platforms/pseries/vio.c | 61 ++--
 arch/powerpc/sysdev/mpic.c   |  2 +-
 drivers/macintosh/macio_asic.c   |  2 +-
 8 files changed, 43 insertions(+), 34 deletions(-)
---
base-commit: 41bccc98fb7931d63d03f326a746ac4d429c1dd3
change-id: 20240209-bus_cleanup-powerpc2-498426fccb98

Best regards,
-- 
Ricardo B. Marliere 



Re: [PATCH 0/4] powerpc: struct bus_type cleanup

2024-02-12 Thread Ricardo B. Marliere
Please disregard this series, I will send a v2.

Thank you,
-   Ricardo.




Re: [PATCH] powerpc/ftrace: Ignore ftrace locations in exit text sections

2024-02-12 Thread Christophe Leroy


Le 09/02/2024 à 08:59, Naveen N Rao a écrit :
> Michael reported that we are seeing ftrace bug on bootup when KASAN is
> enabled, and if we are using -fpatchable-function-entry:
> 
>  ftrace: allocating 47780 entries in 18 pages
>  ftrace-powerpc: 0xc20b3d5c: No module provided for non-kernel 
> address
>  [ ftrace bug ]
>  ftrace faulted on modifying
>  [] 0xc20b3d5c
>  Initializing ftrace call sites
>  ftrace record flags: 0
>   (0)
>   expected tramp: c008cef4
>  [ cut here ]
>  WARNING: CPU: 0 PID: 0 at kernel/trace/ftrace.c:2180 
> ftrace_bug+0x3c0/0x424
>  Modules linked in:
>  CPU: 0 PID: 0 Comm: swapper Not tainted 6.5.0-rc3-00120-g0f71dcfb4aef 
> #860
>  Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 
> 0xf05 of:SLOF,HEAD hv:linux,kvm pSeries
>  NIP:  c03aa81c LR: c03aa818 CTR: 
>  REGS: c33cfab0 TRAP: 0700   Not tainted  
> (6.5.0-rc3-00120-g0f71dcfb4aef)
>  MSR:  82021033   CR: 28028240  XER: 
> 
>  CFAR: c02781a8 IRQMASK: 3
>  ...
>  NIP [c03aa81c] ftrace_bug+0x3c0/0x424
>  LR [c03aa818] ftrace_bug+0x3bc/0x424
>  Call Trace:
>   ftrace_bug+0x3bc/0x424 (unreliable)
>   ftrace_process_locs+0x5f4/0x8a0
>   ftrace_init+0xc0/0x1d0
>   start_kernel+0x1d8/0x484
> 
> With CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY=y and
> CONFIG_KASAN=y, compiler emits nops in functions that it generates for
> registering and unregistering global variables (unlike with -pg and
> -mprofile-kernel where calls to _mcount() are not generated in those
> functions). Those functions then end up in INIT_TEXT and EXIT_TEXT
> respectively. We don't expect to see any profiled functions in
> EXIT_TEXT, so ftrace_init_nop() assumes that all addresses that aren't
> in the core kernel text belongs to a module. Since these functions do
> not match that criteria, we see the above bug.
> 
> Address this by having ftrace ignore all locations in the text exit
> sections of vmlinux.
> 
> Fixes: 0f71dcfb4aef ("powerpc/ftrace: Add support for 
> -fpatchable-function-entry")
> Cc: sta...@vger.kernel.org
> Reported-by: Michael Ellerman 
> Signed-off-by: Naveen N Rao 
> ---
>   arch/powerpc/include/asm/ftrace.h   |  9 +
>   arch/powerpc/include/asm/sections.h |  1 +
>   arch/powerpc/kernel/trace/ftrace.c  | 12 
>   arch/powerpc/kernel/vmlinux.lds.S   |  2 ++
>   4 files changed, 16 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/ftrace.h 
> b/arch/powerpc/include/asm/ftrace.h
> index 1ebd2ca97f12..d6babd083202 100644
> --- a/arch/powerpc/include/asm/ftrace.h
> +++ b/arch/powerpc/include/asm/ftrace.h
> @@ -20,14 +20,7 @@
>   #ifndef __ASSEMBLY__
>   extern void _mcount(void);
>   
> -static inline unsigned long ftrace_call_adjust(unsigned long addr)
> -{
> - if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
> - addr += MCOUNT_INSN_SIZE;
> -
> - return addr;
> -}
> -
> +unsigned long ftrace_call_adjust(unsigned long addr);
>   unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
>   unsigned long sp);
>   
> diff --git a/arch/powerpc/include/asm/sections.h 
> b/arch/powerpc/include/asm/sections.h
> index ea26665f82cf..d389dcecdb0b 100644
> --- a/arch/powerpc/include/asm/sections.h
> +++ b/arch/powerpc/include/asm/sections.h
> @@ -14,6 +14,7 @@ typedef struct func_desc func_desc_t;
>   
>   extern char __head_end[];
>   extern char __srwx_boundary[];
> +extern char _sexittext[], _eexittext[];

Should we try to at least use the same symbols as others, or best try to 
move this into include/asm-generic/sections.h, just like inittext ?

$ git grep exittext
arch/arm64/include/asm/sections.h:extern char __exittext_begin[], __exittext_end[];
arch/arm64/kernel/patching.c:   addr >= (unsigned long)__exittext_begin &&
arch/arm64/kernel/patching.c:   addr < (unsigned long)__exittext_end;
arch/arm64/kernel/vmlinux.lds.S:__exittext_begin = .;
arch/arm64/kernel/vmlinux.lds.S:__exittext_end = .;
arch/riscv/include/asm/sections.h:extern char __exittext_begin[], __exittext_end[];
arch/riscv/kernel/patch.c:static inline bool is_kernel_exittext(uintptr_t addr)
arch/riscv/kernel/patch.c:  addr >= (uintptr_t)__exittext_begin &&
arch/riscv/kernel/patch.c:  addr < (uintptr_t)__exittext_end;
arch/riscv/kernel/patch.c:  if (core_kernel_text(uintaddr) || is_kernel_exittext(uintaddr))
arch/riscv/kernel/vmlinux-xip.lds.S:__exittext_begin = .;
arch/riscv/kernel/vmlinux-xip.lds.S:__exittext_end = .;
arch/riscv/kernel/vmlinux.lds.S:__exittext_begin = .;
arch/riscv/kernel/vmlinux.lds.S:__exittext_end = .;
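
For reference, the arm64/riscv pattern boils down to a small helper. A sketch
of what a powerpc equivalent could look like if it adopted the same
__exittext_begin/__exittext_end names (hypothetical code, assuming the
linker-script symbols are declared as on arm64):

static inline bool is_kernel_exittext(unsigned long addr)
{
	/* true if addr falls inside the .exit.text range of vmlinux */
	return addr >= (unsigned long)__exittext_begin &&
	       addr < (unsigned long)__exittext_end;
}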


>   
>   /* Patch sites */
>   extern s32 

Re: Powerpc: ps3av.c:(.text+0x19e8): undefined reference to `video_get_options'

2024-02-12 Thread Geert Uytterhoeven
On Mon, Feb 12, 2024 at 7:36 PM Naresh Kamboju
 wrote:
> I encountered the following build warnings/errors while compiling the powerpc
> kernel on Linux next-20240208 .. next-20240212 tag with clang toolchain.
>
> Reported-by: Linux Kernel Functional Testing 
>
> powerpc64le-linux-gnu-ld: drivers/ps3/ps3av.o: in function `ps3av_probe':
> ps3av.c:(.text+0x19e8): undefined reference to `video_get_options'
> make[3]: *** [/builds/linux/scripts/Makefile.vmlinux:37: vmlinux] Error 1
> make[3]: Target '__default' not remade because of errors.
>
> Links:
>  - 
> https://storage.tuxsuite.com/public/linaro/lkft/builds/2cFkli5H02fikrpga6PluAWLAMa/

https://lore.kernel.org/linuxppc-dev/43ed64aa-17b0-4d04-a1f3-a6e13f59a...@suse.de/T/#ma2e81d77ee4a708c75d09c4e46904072b3f7b70f

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: [PATCH v3 RESEND 3/6] bitmap: Make bitmap_onto() available to users

2024-02-12 Thread Yury Norov
On Mon, Feb 12, 2024 at 04:36:36PM +0200, Andy Shevchenko wrote:
> On Mon, Feb 12, 2024 at 03:20:22PM +0100, Herve Codina wrote:
> > On Mon, 12 Feb 2024 16:01:38 +0200
> > Andy Shevchenko  wrote:
> 
> ...
> 
> > Agree, the bitmap_onto() code is simpler to understand than its help.
> > 
> > I introduced bitmap_off() to be the "reverse" bitmap_onto() operations
> > and I preferred to avoid duplicating function that do the same things.
> > 
> > On my side, I initially didn't use the bitmap_*() functions and did the the
> > bits manipulation by hand.
> > During the review, it was suggested to use the bitmap_*() family and I 
> > followed
> > this suggestion.
> 
> I also would go this way, the problems I see with the current implementation 
> are:

Sure, opencoding and duplicating the functionality is always a bad
idea.

> - being related to NUMA (and as Rasmus once pointed out better to be there);

It's 'related to NUMA' only in the sense that NUMA code is its only user.
There is nothing NUMA-specific in the function itself.

Now that we've got a non-NUMA user, bitmap_onto() is not related
to NUMA anymore.

> - unclear naming, esp. proposed bitmap_off();

That I agree with. Scatter/gather from your last approach sounds better.
Do you plan to send a v2?

> - the quite hard to understand help text

Yes, we need a picture that would illustrate what actually happens
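
For instance, a minimal worked example (values invented here purely for
illustration, not taken from the patch):

	relmap = 0b11001	/* set bits: 0, 3, 4 */
	orig   = 0b00101	/* set bits: 0, 2    */

	bitmap_onto(dst, orig, relmap, 5):
		bit m of orig maps onto the m-th set bit of relmap,
		so bit 0 -> bit 0 and bit 2 -> bit 4, giving dst = 0b10001

	bitmap_off(result, dst, relmap, 5):
		the m-th set bit of relmap maps back to bit m,
		so bits 0 and 4 of dst give result = 0b00101 again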

> - atomicity when it's not needed (AFAICT).

Agree. A series of atomic ops is not atomic. For example

if (test_bit(n, map))
set_bit(m, map);

is not atomic as a whole. And this is what we do in bitmap_onto/off()
in a loop. This must be fixed by using the underscored versions.
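
A sketch of how that could look in the proposed function (the same loop as in
the patch, with the atomic set_bit() swapped for the non-atomic __set_bit()):

m = 0;
for_each_set_bit(n, relmap, bits) {
	if (test_bit(n, orig))
		__set_bit(m, dst);	/* non-atomic: the loop isn't atomic anyway */
	m++;
}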

> > I did tests to be sure that bitmap_onto() and bitmap_off() did
> > exactly the same things as my previous code did.
> 
> Yuri, what do you think about all this?

I think your scatter/gather is better than this onto/off in both naming and
implementation. If you send a v2 and it works for Herve, I'd
prefer scatter/gather. But we can live with onto/off as well.

Thanks,
Yury


Re: Powerpc: ps3av.c:(.text+0x19e8): undefined reference to `video_get_options'

2024-02-12 Thread Randy Dunlap



On 2/12/24 10:36, Naresh Kamboju wrote:
> I encountered the following build warnings/errors while compiling the powerpc
> kernel on Linux next-20240208 .. next-20240212 tag with clang toolchain.
> 
> Reported-by: Linux Kernel Functional Testing 
> 
> powerpc64le-linux-gnu-ld: drivers/ps3/ps3av.o: in function `ps3av_probe':
> ps3av.c:(.text+0x19e8): undefined reference to `video_get_options'
> make[3]: *** [/builds/linux/scripts/Makefile.vmlinux:37: vmlinux] Error 1
> make[3]: Target '__default' not remade because of errors.
> 
> Links:
>  - 
> https://storage.tuxsuite.com/public/linaro/lkft/builds/2cFkli5H02fikrpga6PluAWLAMa/
> 
> 
> --
> Linaro LKFT
> https://lkft.linaro.org
> 

Hi,
I posted a patch for this and Thomas Zimmermann says:
  The patch is now in drm-misc-next. 

https://lore.kernel.org/lkml/20240207161322.8073-1-rdun...@infradead.org/

thanks.
-- 
#Randy


Re: [PATCH v3 RESEND 4/6] bitmap: Introduce bitmap_off()

2024-02-12 Thread Yury Norov
On Mon, Feb 12, 2024 at 10:37:18AM -0800, Yury Norov wrote:
> On Mon, Feb 12, 2024 at 08:56:32AM +0100, Herve Codina wrote:
> > The bitmap_onto() function translates one bitmap relative to another but
> > no function are present to perform the reverse translation.
> > 
> > Introduce bitmap_off() to fill this hole.
> > 
> > Signed-off-by: Herve Codina 
> > ---
> >  include/linux/bitmap.h |  3 +++
> >  lib/bitmap.c   | 42 ++
> >  2 files changed, 45 insertions(+)
> > 
> > diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> > index 99451431e4d6..5ecfcbbc91f4 100644
> > --- a/include/linux/bitmap.h
> > +++ b/include/linux/bitmap.h
> > @@ -65,6 +65,7 @@ struct device;
> >   *  bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
> >   *  bitmap_bitremap(oldbit, old, new, nbits)newbit = map(old, 
> > new)(oldbit)
> >   *  bitmap_onto(dst, orig, relmap, nbits)   *dst = orig relative to 
> > relmap
> > + *  bitmap_off(dst, orig, relmap, nbits)*dst = bitmap_onto() 
> > reverse operation
> >   *  bitmap_fold(dst, orig, sz, nbits)   dst bits = orig bits mod sz
> >   *  bitmap_parse(buf, buflen, dst, nbits)   Parse bitmap dst from 
> > kernel buf
> >   *  bitmap_parse_user(ubuf, ulen, dst, nbits)   Parse bitmap dst from user 
> > buf
> > @@ -208,6 +209,8 @@ int bitmap_bitremap(int oldbit,
> > const unsigned long *old, const unsigned long *new, int bits);
> >  void bitmap_onto(unsigned long *dst, const unsigned long *orig,
> > const unsigned long *relmap, unsigned int bits);
> > +void bitmap_off(unsigned long *dst, const unsigned long *orig,
> > +   const unsigned long *relmap, unsigned int bits);
> >  void bitmap_fold(unsigned long *dst, const unsigned long *orig,
> > unsigned int sz, unsigned int nbits);
> >  
> > diff --git a/lib/bitmap.c b/lib/bitmap.c
> > index 2feccb5047dc..71343967335e 100644
> > --- a/lib/bitmap.c
> > +++ b/lib/bitmap.c
> > @@ -682,6 +682,48 @@ void bitmap_onto(unsigned long *dst, const unsigned 
> > long *orig,
> >  }
> >  EXPORT_SYMBOL(bitmap_onto);
> >  
> > +/**
> > + * bitmap_off - revert operation done by bitmap_onto()
> 
> This is definitely a bad name. I've no a better idea, but even
> bitmap_onto_revert() would be better.
> 
> > + * @dst: resulting translated bitmap
> > + * @orig: original untranslated bitmap
> > + * @relmap: bitmap relative to which translated
> > + * @bits: number of bits in each of these bitmaps
> > + *
> > + * Suppose onto computed using bitmap_onto(onto, src, relmap, n)
> > + * The operation bitmap_off(result, onto, relmap, n) leads to a
> > + * result equal or equivalent to src.
> 
> Agree with Rasmus. This should be well tested.
> 
> > + * The result can be 'equivalent' because bitmap_onto() and
> > + * bitmap_off() are not bijective.
> > + * The result and src values are equivalent in that sense that a
> > + * call to bitmap_onto(onto, src, relmap, n) and a call to
> > + * bitmap_onto(onto, result, relmap, n) will lead to the same onto
> > + * value.
> 
> Did you mean "a call to bitmap_onto(onto, src, relmap, n) and a
> call to bitmap_off(onto, result, relmap, n)"? 
> 
> I think the whole paragraph adds more confusion than explanations.
> If a new function is supposed to revert the result of some other
> function, I'd better focus on testing that it actually reverts as
> advertised, and keep description as brief as possible.
> 
> > + * If either of @orig or @relmap is empty (no set bits), then @dst
> > + * will be returned empty.
> 
> Is this an exception from the 'revert' policy? Doesn't look like that.
> So, what for mentioning this specific case?
> 
> > + * All bits in @dst not set by the above rule are cleared.
> 
> The above rule is about empty @orig and @relmap, not about setting
> bits. What did you mean here?
> 
> > + */
> > +void bitmap_off(unsigned long *dst, const unsigned long *orig,
> > +   const unsigned long *relmap, unsigned int bits)
> > +{
> > +   unsigned int n, m;  /* same meaning as in above comment */
> 
> In the above comment, n means the size of bitmaps, and m is not
> mentioned at all.
> 
> > +   if (dst == orig)/* following doesn't handle inplace mappings */
> > +   return;
> > +   bitmap_zero(dst, bits);
> 
> Can you add an empty line after 'return'.
> 
> > +   m = 0;
> > +   for_each_set_bit(n, relmap, bits) {
> > +   /* m == bitmap_pos_to_ord(relmap, n, bits) */
> 
> Don't think we need this comment here. If you want to underline that
> m tracks bit order, can you just give it a more explanatory name. For
> example, 'bit_order'.
> 
> > +   if (test_bit(n, orig))
> > +   set_bit(m, dst);
> > +   m++;

Forgot to mention - we need a __set_bit() and __test_bit(), because the
whole function is not atomic. This applies to the bitmap_onto() as
well. Can you please send a patch fixing it for bitmap_onto() in the

Re: [PATCH v3 RESEND 4/6] bitmap: Introduce bitmap_off()

2024-02-12 Thread Yury Norov
On Mon, Feb 12, 2024 at 08:56:32AM +0100, Herve Codina wrote:
> The bitmap_onto() function translates one bitmap relative to another but
> no function are present to perform the reverse translation.
> 
> Introduce bitmap_off() to fill this hole.
> 
> Signed-off-by: Herve Codina 
> ---
>  include/linux/bitmap.h |  3 +++
>  lib/bitmap.c   | 42 ++
>  2 files changed, 45 insertions(+)
> 
> diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
> index 99451431e4d6..5ecfcbbc91f4 100644
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -65,6 +65,7 @@ struct device;
>   *  bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
>   *  bitmap_bitremap(oldbit, old, new, nbits)newbit = map(old, 
> new)(oldbit)
>   *  bitmap_onto(dst, orig, relmap, nbits)   *dst = orig relative to 
> relmap
> + *  bitmap_off(dst, orig, relmap, nbits)*dst = bitmap_onto() reverse 
> operation
>   *  bitmap_fold(dst, orig, sz, nbits)   dst bits = orig bits mod sz
>   *  bitmap_parse(buf, buflen, dst, nbits)   Parse bitmap dst from kernel 
> buf
>   *  bitmap_parse_user(ubuf, ulen, dst, nbits)   Parse bitmap dst from user 
> buf
> @@ -208,6 +209,8 @@ int bitmap_bitremap(int oldbit,
>   const unsigned long *old, const unsigned long *new, int bits);
>  void bitmap_onto(unsigned long *dst, const unsigned long *orig,
>   const unsigned long *relmap, unsigned int bits);
> +void bitmap_off(unsigned long *dst, const unsigned long *orig,
> + const unsigned long *relmap, unsigned int bits);
>  void bitmap_fold(unsigned long *dst, const unsigned long *orig,
>   unsigned int sz, unsigned int nbits);
>  
> diff --git a/lib/bitmap.c b/lib/bitmap.c
> index 2feccb5047dc..71343967335e 100644
> --- a/lib/bitmap.c
> +++ b/lib/bitmap.c
> @@ -682,6 +682,48 @@ void bitmap_onto(unsigned long *dst, const unsigned long 
> *orig,
>  }
>  EXPORT_SYMBOL(bitmap_onto);
>  
> +/**
> + * bitmap_off - revert operation done by bitmap_onto()

This is definitely a bad name. I have no better idea, but even
bitmap_onto_revert() would be better.

> + * @dst: resulting translated bitmap
> + * @orig: original untranslated bitmap
> + * @relmap: bitmap relative to which translated
> + * @bits: number of bits in each of these bitmaps
> + *
> + * Suppose onto computed using bitmap_onto(onto, src, relmap, n)
> + * The operation bitmap_off(result, onto, relmap, n) leads to a
> + * result equal or equivalent to src.

Agree with Rasmus. This should be well tested.

> + * The result can be 'equivalent' because bitmap_onto() and
> + * bitmap_off() are not bijective.
> + * The result and src values are equivalent in that sense that a
> + * call to bitmap_onto(onto, src, relmap, n) and a call to
> + * bitmap_onto(onto, result, relmap, n) will lead to the same onto
> + * value.

Did you mean "a call to bitmap_onto(onto, src, relmap, n) and a
call to bitmap_off(onto, result, relmap, n)"? 

I think the whole paragraph adds more confusion than explanations.
If a new function is supposed to revert the result of some other
function, I'd better focus on testing that it actually reverts as
advertised, and keep description as brief as possible.

> + * If either of @orig or @relmap is empty (no set bits), then @dst
> + * will be returned empty.

Is this an exception from the 'revert' policy? Doesn't look like that.
So why mention this specific case?

> + * All bits in @dst not set by the above rule are cleared.

The above rule is about empty @orig and @relmap, not about setting
bits. What did you mean here?

> + */
> +void bitmap_off(unsigned long *dst, const unsigned long *orig,
> + const unsigned long *relmap, unsigned int bits)
> +{
> + unsigned int n, m;  /* same meaning as in above comment */

In the above comment, n means the size of bitmaps, and m is not
mentioned at all.

> + if (dst == orig)/* following doesn't handle inplace mappings */
> + return;
> + bitmap_zero(dst, bits);

Can you add an empty line after 'return'.

> + m = 0;
> + for_each_set_bit(n, relmap, bits) {
> + /* m == bitmap_pos_to_ord(relmap, n, bits) */

Don't think we need this comment here. If you want to underline that
m tracks bit order, can you just give it a more explanatory name. For
example, 'bit_order'.

> + if (test_bit(n, orig))
> + set_bit(m, dst);
> + m++;
> + }
> +}
> +EXPORT_SYMBOL(bitmap_off);
> +
>  #ifdef CONFIG_NUMA
>  /**
>   * bitmap_fold - fold larger bitmap into smaller, modulo specified size
> -- 
> 2.43.0


Powerpc: ps3av.c:(.text+0x19e8): undefined reference to `video_get_options'

2024-02-12 Thread Naresh Kamboju
I encountered the following build warnings/errors while compiling the powerpc
kernel on the Linux next-20240208 .. next-20240212 tags with the clang toolchain.

Reported-by: Linux Kernel Functional Testing 

powerpc64le-linux-gnu-ld: drivers/ps3/ps3av.o: in function `ps3av_probe':
ps3av.c:(.text+0x19e8): undefined reference to `video_get_options'
make[3]: *** [/builds/linux/scripts/Makefile.vmlinux:37: vmlinux] Error 1
make[3]: Target '__default' not remade because of errors.

Links:
 - 
https://storage.tuxsuite.com/public/linaro/lkft/builds/2cFkli5H02fikrpga6PluAWLAMa/


--
Linaro LKFT
https://lkft.linaro.org


Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Timothy Pearson



- Original Message -
> From: "Segher Boessenkool" 
> To: "Timothy Pearson" 
> Cc: "linuxppc-dev" 
> Sent: Monday, February 12, 2024 12:23:22 PM
> Subject: Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

> On Mon, Feb 12, 2024 at 12:07:03PM -0600, Timothy Pearson wrote:
>> > I have done it for *all* architectures some ten years ago.  Never found
>> > any problem.
>> 
>> That makes sense, what I mean by invasive is that we'd need buy-in from the
>> other
>> maintainers across all of the affected architectures.  Is that likely to 
>> occur?
> 
> I don't know.  Here is my PowerPC-specific patch, it's a bit older, it
> might not apply cleanly anymore, the changes needed should be obvious
> though:
> 
> 
> === 8< ===
> commit f16dfa5257eb14549ce22243fb2b465615085134
> Author: Segher Boessenkool 
> Date:   Sat May 3 03:48:06 2008 +0200
> 
>powerpc: Link vmlinux against libgcc.a
> 
> diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
> index b7212b619c52..0a2fac6ffc1c 100644
> --- a/arch/powerpc/Makefile
> +++ b/arch/powerpc/Makefile
> @@ -158,6 +158,9 @@ core-y  += 
> arch/powerpc/kernel/
> core-$(CONFIG_XMON)+= arch/powerpc/xmon/
> core-$(CONFIG_KVM) += arch/powerpc/kvm/
> 
> +LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
> +libs-y += $(LIBGCC)
> +
> drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/
> 
> # Default to zImage, override when needed
> === 8< ===

OK.  PowerPC maintainers, how would you prefer to handle this?

>> > There are better options than -Os, fwiw.  Some --param's give smaller
>> > *and* faster kernels.  What exactly is best is heavily arch-dependent
>> > though (as well as dependent on the application code, the kernel code in
>> > this case) :-(
>> 
>> I've been through this a few times, and -Os is the only option that makes
>> things (just barely) fit unfortunately.
> 
> -O2 with appropriate inlining tuning beats -Os every day of the week,
> in my experience.

On 6.6 it's 24MiB vs 40MiB, O2 vs. Os. :(


Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Segher Boessenkool
On Mon, Feb 12, 2024 at 12:07:03PM -0600, Timothy Pearson wrote:
> > I have done it for *all* architectures some ten years ago.  Never found
> > any problem.
> 
> That makes sense, what I mean by invasive is that we'd need buy-in from the 
> other
> maintainers across all of the affected architectures.  Is that likely to 
> occur?

I don't know.  Here is my PowerPC-specific patch, it's a bit older, it
might not apply cleanly anymore, the changes needed should be obvious
though:


=== 8< ===
commit f16dfa5257eb14549ce22243fb2b465615085134
Author: Segher Boessenkool 
Date:   Sat May 3 03:48:06 2008 +0200

powerpc: Link vmlinux against libgcc.a

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index b7212b619c52..0a2fac6ffc1c 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -158,6 +158,9 @@ core-y  += arch/powerpc/kernel/ 
 core-$(CONFIG_XMON)+= arch/powerpc/xmon/
 core-$(CONFIG_KVM) += arch/powerpc/kvm/
 
+LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
+libs-y += $(LIBGCC)
+
 drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/
 
 # Default to zImage, override when needed
=== 8< ===


> > There are better options than -Os, fwiw.  Some --param's give smaller
> > *and* faster kernels.  What exactly is best is heavily arch-dependent
> > though (as well as dependent on the application code, the kernel code in
> > this case) :-(
> 
> I've been through this a few times, and -Os is the only option that makes
> things (just barely) fit unfortunately.

-O2 with appropriate inlining tuning beats -Os every day of the week,
in my experience.


Segher


Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Timothy Pearson



- Original Message -
> From: "Segher Boessenkool" 
> To: "Timothy Pearson" 
> Cc: "linuxppc-dev" 
> Sent: Monday, February 12, 2024 11:59:06 AM
> Subject: Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

> On Mon, Feb 12, 2024 at 11:46:19AM -0600, Timothy Pearson wrote:
>> Interesting, that make sense.
>> 
>> How should we proceed from the current situation?  Bringing in libgcc seems
>> like a fairly invasive change,
> 
> I have done it for *all* architectures some ten years ago.  Never found
> any problem.

That makes sense. What I mean by invasive is that we'd need buy-in from the 
other
maintainers across all of the affected architectures.  Is that likely to occur?

>> should we merge this to fix the current bug
>> (cannot build ppc64 kernel in size-optimized mode) and start discussion on
>> bringing in libgcc as the long-term fix across multiple architectures?
>> 
>> My goal here is to not have to carry a downstream patch in perpetuity for
>> our embedded Linux firmware, which needs to be compiled in size-optimized
>> mode due to hardware Flash limitations.
> 
> There are better options than -Os, fwiw.  Some --param's give smaller
> *and* faster kernels.  What exactly is best is heavily arch-dependent
> though (as well as dependent on the application code, the kernel code in
> this case) :-(

I've been through this a few times, and -Os is the only option that makes
things (just barely) fit unfortunately.


Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Segher Boessenkool
On Mon, Feb 12, 2024 at 11:46:19AM -0600, Timothy Pearson wrote:
> Interesting, that make sense.
> 
> How should we proceed from the current situation?  Bringing in libgcc seems
> like a fairly invasive change,

I have done it for *all* architectures some ten years ago.  Never found
any problem.

> should we merge this to fix the current bug
> (cannot build ppc64 kernel in size-optimized mode) and start discussion on
> bringing in libgcc as the long-term fix across multiple architectures?
> 
> My goal here is to not have to carry a downstream patch in perpetuity for
> our embedded Linux firmware, which needs to be compiled in size-optimized
> mode due to hardware Flash limitations.

There are better options than -Os, fwiw.  Some --param's give smaller
*and* faster kernels.  What exactly is best is heavily arch-dependent
though (as well as dependent on the application code, the kernel code in
this case) :-(


Segher


Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Timothy Pearson



- Original Message -
> From: "Segher Boessenkool" 
> To: "Timothy Pearson" 
> Cc: "linuxppc-dev" 
> Sent: Monday, February 12, 2024 11:30:43 AM
> Subject: Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions
> 
> Long long time ago, linux-0.11 or something, it was discovered that some
> programmiing mistakes resulted in double-length divisions (64x64->64 on
> 32-bit systems, say).  Most architectures have no hardware support for
> that, x86 is one of those; so you need very expensive support routines
> to do that (_udivdi3 or _divdi3 in that case, ...ti3 on 64-bit archs).
> 
> So it was decided to not link to libgcc to avoid this.  But that means
> that all the extremely many other suppoort routines, more for some other
> archs, are also not there.  While it would have been much easier to just
> link to something that provides the _{u,}divdi3 symbol and then causes a
> forced linking error from that!
> 
> 
> Segher

Interesting, that makes sense.

How should we proceed from the current situation?  Bringing in libgcc seems
like a fairly invasive change. Should we merge this to fix the current bug
(cannot build ppc64 kernel in size-optimized mode) and start discussion on
bringing in libgcc as the long-term fix across multiple architectures?

My goal here is to not have to carry a downstream patch in perpetuity for
our embedded Linux firmware, which needs to be compiled in size-optimized
mode due to hardware Flash limitations.

Thanks!


Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Segher Boessenkool
On Mon, Feb 12, 2024 at 11:09:38AM -0600, Timothy Pearson wrote:
> There is existing code in the kernel right now to provide support functions 
> for gpr0 and altivec save/restore.  I don't know the full story here, but at 
> some point in the kernel's history it seems to have been decided to provide 
> the helper functions in lieu of linking libgcc directly.  If this is 
> incorrect, then I need to know that so I can rework the patch to enable libcc 
> and remove the existing support functions.
> 
> Is there anyone on-list that knows more of the history and decision-making 
> that went into the current state of the kernel here?

Long long time ago, linux-0.11 or something, it was discovered that some
programming mistakes resulted in double-length divisions (64x64->64 on
32-bit systems, say).  Most architectures have no hardware support for
that, x86 is one of those; so you need very expensive support routines
to do that (_udivdi3 or _divdi3 in that case, ...ti3 on 64-bit archs).

So it was decided to not link to libgcc to avoid this.  But that means
that all the extremely many other support routines, more for some other
archs, are also not there.  While it would have been much easier to just
link to something that provides the _{u,}divdi3 symbol and then causes a
forced linking error from that!
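
As a concrete illustration (hypothetical code, not from this thread): on a
32-bit target a plain 64-by-64 division is already enough to make GCC emit a
call into libgcc, which is why kernel code uses div_u64()/do_div() instead:

/* compiles to a call to the libgcc helper __udivdi3 on 32-bit targets */
unsigned long long div64_example(unsigned long long num, unsigned long long den)
{
	return num / den;
}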


Segher


[PATCH v2] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Timothy Pearson
When building the kernel in size optimized mode with the amdgpu module enabled,
gcc will begin referencing external gpr1 and fpu save/restore functions.  This
will then cause a linker failure as we do not link against libgcc which
normally contains those builtin functions.

Implement gpr1 and fpu save/restore functions per the PowerPC 64-bit ELFv2 ABI
documentation.

Tested on a Talos II with a WX7100 installed and running in DisplayCore mode.

Reported-by: kernel test robot 
Tested-by: Timothy Pearson 
Signed-off-by: Timothy Pearson 
---
 arch/powerpc/kernel/prom_init_check.sh |   4 +-
 arch/powerpc/lib/crtsavres.S   | 244 +
 scripts/mod/modpost.c  |   4 +
 3 files changed, 250 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/prom_init_check.sh 
b/arch/powerpc/kernel/prom_init_check.sh
index 69623b9045d5..76c5651e29d3 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -72,10 +72,10 @@ do
 
# ignore register save/restore funcitons
case $UNDEF in
-   _restgpr_*|_restgpr0_*|_rest32gpr_*)
+   _restgpr_*|_restgpr0_*|_restgpr1_*|_rest32gpr_*)
OK=1
;;
-   _savegpr_*|_savegpr0_*|_save32gpr_*)
+   _savegpr_*|_savegpr0_*|_restgpr0_*|_save32gpr_*)
OK=1
;;
esac
diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S
index 7e5e1c28e56a..6cd870aacd7f 100644
--- a/arch/powerpc/lib/crtsavres.S
+++ b/arch/powerpc/lib/crtsavres.S
@@ -3,6 +3,7 @@
  *
  *   Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc.
  *   Copyright 2008 Freescale Semiconductor, Inc.
+ *   Copyright 2024 Raptor Engineering, LLC
  *   Written By Michael Meissner
  *
  * Based on gcc/config/rs6000/crtsavres.asm from gcc
@@ -435,6 +436,127 @@ _restgpr0_31:
mtlr    r0
blr
 
+.globl _savegpr1_14
+_savegpr1_14:
+   std r14,-144(r12)
+.globl _savegpr1_15
+_savegpr1_15:
+   std r15,-136(r12)
+.globl _savegpr1_16
+_savegpr1_16:
+   std r16,-128(r12)
+.globl _savegpr1_17
+_savegpr1_17:
+   std r17,-120(r12)
+.globl _savegpr1_18
+_savegpr1_18:
+   std r18,-112(r12)
+.globl _savegpr1_19
+_savegpr1_19:
+   std r19,-104(r12)
+.globl _savegpr1_20
+_savegpr1_20:
+   std r20,-96(r12)
+.globl _savegpr1_21
+_savegpr1_21:
+   std r21,-88(r12)
+.globl _savegpr1_22
+_savegpr1_22:
+   std r22,-80(r12)
+.globl _savegpr1_23
+_savegpr1_23:
+   std r23,-72(r12)
+.globl _savegpr1_24
+_savegpr1_24:
+   std r24,-64(r12)
+.globl _savegpr1_25
+_savegpr1_25:
+   std r25,-56(r12)
+.globl _savegpr1_26
+_savegpr1_26:
+   std r26,-48(r12)
+.globl _savegpr1_27
+_savegpr1_27:
+   std r27,-40(r12)
+.globl _savegpr1_28
+_savegpr1_28:
+   std r28,-32(r12)
+.globl _savegpr1_29
+_savegpr1_29:
+   std r29,-24(r12)
+.globl _savegpr1_30
+_savegpr1_30:
+   std r30,-16(r12)
+.globl _savegpr1_31
+_savegpr1_31:
+   std r31,-8(r12)
+   std r0,16(r12)
+   blr
+
+.globl _restgpr1_14
+_restgpr1_14:
+   ld  r14,-144(r12)
+.globl _restgpr1_15
+_restgpr1_15:
+   ld  r15,-136(r12)
+.globl _restgpr1_16
+_restgpr1_16:
+   ld  r16,-128(r12)
+.globl _restgpr1_17
+_restgpr1_17:
+   ld  r17,-120(r12)
+.globl _restgpr1_18
+_restgpr1_18:
+   ld  r18,-112(r12)
+.globl _restgpr1_19
+_restgpr1_19:
+   ld  r19,-104(r12)
+.globl _restgpr1_20
+_restgpr1_20:
+   ld  r20,-96(r12)
+.globl _restgpr1_21
+_restgpr1_21:
+   ld  r21,-88(r12)
+.globl _restgpr1_22
+_restgpr1_22:
+   ld  r22,-80(r12)
+.globl _restgpr1_23
+_restgpr1_23:
+   ld  r23,-72(r12)
+.globl _restgpr1_24
+_restgpr1_24:
+   ld  r24,-64(r12)
+.globl _restgpr1_25
+_restgpr1_25:
+   ld  r25,-56(r12)
+.globl _restgpr1_26
+_restgpr1_26:
+   ld  r26,-48(r12)
+.globl _restgpr1_27
+_restgpr1_27:
+   ld  r27,-40(r12)
+.globl _restgpr1_28
+_restgpr1_28:
+   ld  r28,-32(r12)
+.globl _restgpr1_29
+_restgpr1_29:
+   ld  r0,16(r12)
+   ld  r29,-24(r12)
+   mtlr    r0
+   ld  r30,-16(r12)
+   ld  r31,-8(r12)
+   blr
+
+.globl _restgpr1_30
+_restgpr1_30:
+   ld  r30,-16(r12)
+.globl _restgpr1_31
+_restgpr1_31:
+   ld  r0,16(r12)
+   ld  r31,-8(r12)
+   mtlr    r0
+   blr
+
 #ifdef CONFIG_ALTIVEC
 /* Called with r0 pointing just beyond the end of the vector save area.  */
 
@@ -540,6 +662,128 @@ _restvr_31:
 
 #endif /* CONFIG_ALTIVEC */
 
+#ifdef CONFIG_PPC_FPU
+
+.globl _savefpr_14
+_savefpr_14:
+   stfd f14,-144(r1)
+.globl _savefpr_15
+_savefpr_15:
+   stfd f15,-136(r1)
+.globl _savefpr_16
+_savefpr_16:
+   stfd f16,-128(r1)
+.globl _savefpr_17
+_savefpr_17:
+   stfd f17,-120(r1)
+.globl _savefpr_18
+_savefpr_18:
+   stfd f18,-112(r1)
+.globl 

Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Timothy Pearson



- Original Message -
> From: "Segher Boessenkool" 
> To: "Timothy Pearson" 
> Cc: "linuxppc-dev" 
> Sent: Monday, February 12, 2024 11:02:07 AM
> Subject: Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

> On Mon, Feb 12, 2024 at 10:41:18AM -0600, Timothy Pearson wrote:
>> Implement gpr1 and fpu save/restore functions per the ABI v2 documentation.
> 
> There is no "ABI v2".  This is the ELFv2 ABI, it is a name, it is not a
> version 2 of anything (in fact, it is version 1 everywhere).

Apologies, I wasn't precise on the name.

> The same functions are needed and used in other ABIs, too.
> 
> But, why do this patch?  You just need
> 
> +LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
> 
> +libs-y += $(LIBGCC)
> 
> and nothing more.  It is required for proper functioning of GCC to link
> with the libgcc support library.

There is existing code in the kernel right now to provide support functions for 
gpr0 and altivec save/restore.  I don't know the full story here, but at some 
point in the kernel's history it seems to have been decided to provide the 
helper functions in lieu of linking libgcc directly.  If this is incorrect, 
then I need to know that so I can rework the patch to enable libgcc and remove 
the existing support functions.

Is there anyone on-list that knows more of the history and decision-making that 
went into the current state of the kernel here?

Thanks!


Re: [PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Segher Boessenkool
On Mon, Feb 12, 2024 at 10:41:18AM -0600, Timothy Pearson wrote:
> Implement gpr1 and fpu save/restore functions per the ABI v2 documentation.

There is no "ABI v2".  This is the ELFv2 ABI, it is a name, it is not a
version 2 of anything (in fact, it is version 1 everywhere).

The same functions are needed and used in other ABIs, too.

But, why do this patch?  You just need

+LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)

+libs-y += $(LIBGCC)

and nothing more.  It is required for proper functioning of GCC to link
with the libgcc support library.


Segher


[PATCH] powerpc: Add gpr1 and fpu save/restore functions

2024-02-12 Thread Timothy Pearson
When building the kernel in size optimized mode with the amdgpu module enabled,
gcc will begin referencing external gpr1 and fpu save/restore functions.  This
will then cause a linker failure as we do not link against libgcc which
normally contains those builtin functions.

Implement gpr1 and fpu save/restore functions per the ABI v2 documentation.

Tested on a Talos II with a WX7100 installed and running in DisplayCore mode.

Reported-by: kernel test robot 
Tested-by: Timothy Pearson 
Signed-off-by: Timothy Pearson 
---
 arch/powerpc/kernel/prom_init_check.sh |   4 +-
 arch/powerpc/lib/crtsavres.S   | 244 +
 scripts/mod/modpost.c  |   4 +
 3 files changed, 250 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/prom_init_check.sh 
b/arch/powerpc/kernel/prom_init_check.sh
index 69623b9045d5..76c5651e29d3 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -72,10 +72,10 @@ do
 
# ignore register save/restore funcitons
case $UNDEF in
-   _restgpr_*|_restgpr0_*|_rest32gpr_*)
+   _restgpr_*|_restgpr0_*|_restgpr1_*|_rest32gpr_*)
OK=1
;;
-   _savegpr_*|_savegpr0_*|_save32gpr_*)
+   _savegpr_*|_savegpr0_*|_restgpr0_*|_save32gpr_*)
OK=1
;;
esac
diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S
index 7e5e1c28e56a..6cd870aacd7f 100644
--- a/arch/powerpc/lib/crtsavres.S
+++ b/arch/powerpc/lib/crtsavres.S
@@ -3,6 +3,7 @@
  *
  *   Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc.
  *   Copyright 2008 Freescale Semiconductor, Inc.
+ *   Copyright 2024 Raptor Engineering, LLC
  *   Written By Michael Meissner
  *
  * Based on gcc/config/rs6000/crtsavres.asm from gcc
@@ -435,6 +436,127 @@ _restgpr0_31:
mtlr    r0
blr
 
+.globl _savegpr1_14
+_savegpr1_14:
+   std r14,-144(r12)
+.globl _savegpr1_15
+_savegpr1_15:
+   std r15,-136(r12)
+.globl _savegpr1_16
+_savegpr1_16:
+   std r16,-128(r12)
+.globl _savegpr1_17
+_savegpr1_17:
+   std r17,-120(r12)
+.globl _savegpr1_18
+_savegpr1_18:
+   std r18,-112(r12)
+.globl _savegpr1_19
+_savegpr1_19:
+   std r19,-104(r12)
+.globl _savegpr1_20
+_savegpr1_20:
+   std r20,-96(r12)
+.globl _savegpr1_21
+_savegpr1_21:
+   std r21,-88(r12)
+.globl _savegpr1_22
+_savegpr1_22:
+   std r22,-80(r12)
+.globl _savegpr1_23
+_savegpr1_23:
+   std r23,-72(r12)
+.globl _savegpr1_24
+_savegpr1_24:
+   std r24,-64(r12)
+.globl _savegpr1_25
+_savegpr1_25:
+   std r25,-56(r12)
+.globl _savegpr1_26
+_savegpr1_26:
+   std r26,-48(r12)
+.globl _savegpr1_27
+_savegpr1_27:
+   std r27,-40(r12)
+.globl _savegpr1_28
+_savegpr1_28:
+   std r28,-32(r12)
+.globl _savegpr1_29
+_savegpr1_29:
+   std r29,-24(r12)
+.globl _savegpr1_30
+_savegpr1_30:
+   std r30,-16(r12)
+.globl _savegpr1_31
+_savegpr1_31:
+   std r31,-8(r12)
+   std r0,16(r12)
+   blr
+
+.globl _restgpr1_14
+_restgpr1_14:
+   ld  r14,-144(r12)
+.globl _restgpr1_15
+_restgpr1_15:
+   ld  r15,-136(r12)
+.globl _restgpr1_16
+_restgpr1_16:
+   ld  r16,-128(r12)
+.globl _restgpr1_17
+_restgpr1_17:
+   ld  r17,-120(r12)
+.globl _restgpr1_18
+_restgpr1_18:
+   ld  r18,-112(r12)
+.globl _restgpr1_19
+_restgpr1_19:
+   ld  r19,-104(r12)
+.globl _restgpr1_20
+_restgpr1_20:
+   ld  r20,-96(r12)
+.globl _restgpr1_21
+_restgpr1_21:
+   ld  r21,-88(r12)
+.globl _restgpr1_22
+_restgpr1_22:
+   ld  r22,-80(r12)
+.globl _restgpr1_23
+_restgpr1_23:
+   ld  r23,-72(r12)
+.globl _restgpr1_24
+_restgpr1_24:
+   ld  r24,-64(r12)
+.globl _restgpr1_25
+_restgpr1_25:
+   ld  r25,-56(r12)
+.globl _restgpr1_26
+_restgpr1_26:
+   ld  r26,-48(r12)
+.globl _restgpr1_27
+_restgpr1_27:
+   ld  r27,-40(r12)
+.globl _restgpr1_28
+_restgpr1_28:
+   ld  r28,-32(r12)
+.globl _restgpr1_29
+_restgpr1_29:
+   ld  r0,16(r12)
+   ld  r29,-24(r12)
+   mtlr    r0
+   ld  r30,-16(r12)
+   ld  r31,-8(r12)
+   blr
+
+.globl _restgpr1_30
+_restgpr1_30:
+   ld  r30,-16(r12)
+.globl _restgpr1_31
+_restgpr1_31:
+   ld  r0,16(r12)
+   ld  r31,-8(r12)
+   mtlr    r0
+   blr
+
 #ifdef CONFIG_ALTIVEC
 /* Called with r0 pointing just beyond the end of the vector save area.  */
 
@@ -540,6 +662,128 @@ _restvr_31:
 
 #endif /* CONFIG_ALTIVEC */
 
+#ifdef CONFIG_PPC_FPU
+
+.globl _savefpr_14
+_savefpr_14:
+   stfd f14,-144(r1)
+.globl _savefpr_15
+_savefpr_15:
+   stfd f15,-136(r1)
+.globl _savefpr_16
+_savefpr_16:
+   stfd f16,-128(r1)
+.globl _savefpr_17
+_savefpr_17:
+   stfd f17,-120(r1)
+.globl _savefpr_18
+_savefpr_18:
+   stfd f18,-112(r1)
+.globl _savefpr_19

Re: [PATCH v5 22/25] mm: Add pte_batch_hint() to reduce scanning in folio_pte_batch()

2024-02-12 Thread David Hildenbrand

On 12.02.24 16:47, Ryan Roberts wrote:

On 12/02/2024 13:43, David Hildenbrand wrote:

On 02.02.24 09:07, Ryan Roberts wrote:

Some architectures (e.g. arm64) can tell from looking at a pte, if some
follow-on ptes also map contiguous physical memory with the same pgprot.
(for arm64, these are contpte mappings).

Take advantage of this knowledge to optimize folio_pte_batch() so that
it can skip these ptes when scanning to create a batch. By default, if
an arch does not opt-in, folio_pte_batch() returns a compile-time 1, so
the changes are optimized out and the behaviour is as before.

arm64 will opt-in to providing this hint in the next patch, which will
greatly reduce the cost of ptep_get() when scanning a range of contptes.

Tested-by: John Hubbard 
Signed-off-by: Ryan Roberts 
---
   include/linux/pgtable.h | 18 ++
   mm/memory.c | 20 +---
   2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 50f32cccbd92..cba31f177d27 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -212,6 +212,24 @@ static inline int pmd_dirty(pmd_t pmd)
   #define arch_flush_lazy_mmu_mode()    do {} while (0)
   #endif
   +#ifndef pte_batch_hint
+/**
+ * pte_batch_hint - Number of pages that can be added to batch without 
scanning.
+ * @ptep: Page table pointer for the entry.
+ * @pte: Page table entry.
+ *
+ * Some architectures know that a set of contiguous ptes all map the same
+ * contiguous memory with the same permissions. In this case, it can provide a
+ * hint to aid pte batching without the core code needing to scan every pte.


I think we might want to document here the expectation regarding
dirty/accessed bits. folio_pte_batch() will ignore dirty bits only with
FPB_IGNORE_DIRTY. But especially for arm64, it makes sense to ignore them
always when batching, because the dirty bit may target any pte part of the
cont-pte group either way.

Maybe something like:

"
An architecture implementation may only ignore the PTE accessed and dirty bits.
Further, it may only ignore the dirty bit if that bit is already not
maintained with precision per PTE inside the hinted batch, and ptep_get()
would already have to collect it from various PTEs.
"


I'm proposing to simplify this to:

"
An architecture implementation may ignore the PTE accessed state. Further, the
dirty state must apply atomically to all the PTEs described by the hint.
"

Which I think more accurately describes the requirement. Shout if you disagree.


I'm not 100% sure if the "must apply atomically" is clear without all of 
the cont-pte details and ptep_get(). But I fail to describe it in a 
better way.
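
For context, a sketch of the kind of hint an implementation can provide
(assumed to be along the lines of the arm64 patch later in the series;
CONT_PTES and pte_valid_cont() are arm64 names, and each pte_t is taken to
be 8 bytes):

#define pte_batch_hint pte_batch_hint
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
{
	if (!pte_valid_cont(pte))
		return 1;

	/* entries remaining in the contpte block this ptep falls in */
	return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1));
}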


It's all better compared to what we had before, so LGTM :)

--
Cheers,

David / dhildenb



Re: [PATCH v5 19/25] arm64/mm: Wire up PTE_CONT for user mappings

2024-02-12 Thread David Hildenbrand

On 12.02.24 16:34, Ryan Roberts wrote:

On 12/02/2024 15:26, David Hildenbrand wrote:

On 12.02.24 15:45, Ryan Roberts wrote:

On 12/02/2024 13:54, David Hildenbrand wrote:

If so, I wonder if we could instead do that comparison modulo the access/dirty
bits,


I think that would work - but will need to think a bit more on it.


and leave ptep_get_lockless() only reading a single entry?


I think we will need to do something a bit less fragile. ptep_get() does
collect
the access/dirty bits so its confusing if ptep_get_lockless() doesn't IMHO. So
we will likely want to rename the function and make its documentation explicit
that it does not return those bits.

ptep_get_lockless_noyoungdirty()? yuk... Any ideas?

Of course if I could convince you the current implementation is safe, I
might be
able to sidestep this optimization until a later date?


As discussed (and pointed out abive), there might be quite some callsites where
we don't really care about uptodate accessed/dirty bits -- where ptep_get() is
used nowadays.

One way to approach that I had in mind was having an explicit interface:

ptep_get()
ptep_get_uptodate()
ptep_get_lockless()
ptep_get_lockless_uptodate()


Yes, I like the direction of this. I guess we anticipate that call sites
requiring the "_uptodate" variant will be the minority so it makes sense to use
the current names for the "_not_uptodate" variants? But to do a slow migration,
it might be better/safer to have the weaker variant use the new name - that
would allow us to downgrade one at a time?


Yes, I was primarily struggling with names. Likely it makes sense to either have
two completely new function names, or use the new name only for the "faster but
less precise" variant.





Especially the last one might not be needed.

I've done a scan through the code and agree with Mark's original conclusions.
Additionally, huge_pte_alloc() (which isn't used for arm64) doesn't rely on
access/dirty info. So I think I could migrate everything to the weaker variant
fairly easily.



Futher, "uptodate" might not be the best choice because of PageUptodate() and
friends. But it's better than "youngdirty"/"noyoungdirty" IMHO.


Certainly agree with "noyoungdirty" being a horrible name. How about "_sync" /
"_nosync"?


I could live with

ptep_get_sync()
ptep_get_nosync()

with proper documentation :)


but could you live with:

ptep_get()
ptep_get_nosync()
ptep_get_lockless_nosync()

?

So leave the "slower, more precise" version with the existing name.


Sure.

--
Cheers,

David / dhildenb



Re: [PATCH v5 22/25] mm: Add pte_batch_hint() to reduce scanning in folio_pte_batch()

2024-02-12 Thread Ryan Roberts
On 12/02/2024 13:43, David Hildenbrand wrote:
> On 02.02.24 09:07, Ryan Roberts wrote:
>> Some architectures (e.g. arm64) can tell from looking at a pte, if some
>> follow-on ptes also map contiguous physical memory with the same pgprot.
>> (for arm64, these are contpte mappings).
>>
>> Take advantage of this knowledge to optimize folio_pte_batch() so that
>> it can skip these ptes when scanning to create a batch. By default, if
>> an arch does not opt-in, folio_pte_batch() returns a compile-time 1, so
>> the changes are optimized out and the behaviour is as before.
>>
>> arm64 will opt-in to providing this hint in the next patch, which will
>> greatly reduce the cost of ptep_get() when scanning a range of contptes.
>>
>> Tested-by: John Hubbard 
>> Signed-off-by: Ryan Roberts 
>> ---
>>   include/linux/pgtable.h | 18 ++
>>   mm/memory.c | 20 +---
>>   2 files changed, 31 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>> index 50f32cccbd92..cba31f177d27 100644
>> --- a/include/linux/pgtable.h
>> +++ b/include/linux/pgtable.h
>> @@ -212,6 +212,24 @@ static inline int pmd_dirty(pmd_t pmd)
>>   #define arch_flush_lazy_mmu_mode()    do {} while (0)
>>   #endif
>>   +#ifndef pte_batch_hint
>> +/**
>> + * pte_batch_hint - Number of pages that can be added to batch without 
>> scanning.
>> + * @ptep: Page table pointer for the entry.
>> + * @pte: Page table entry.
>> + *
>> + * Some architectures know that a set of contiguous ptes all map the same
>> + * contiguous memory with the same permissions. In this case, it can 
>> provide a
>> + * hint to aid pte batching without the core code needing to scan every pte.
> 
> I think we might want to document here the expectation regarding
> dirty/accessed bits. folio_pte_batch() will ignore dirty bits only with
> FPB_IGNORE_DIRTY. But especially for arm64, it makes sense to ignore them
> always when batching, because the dirty bit may target any pte part of the
> cont-pte group either way.
> 
> Maybe something like:
> 
> "
> An architecture implementation may only ignore the PTE accessed and dirty 
> bits.
> Further, it may only ignore the dirty bit if that bit is already not
> maintained with precision per PTE inside the hinted batch, and ptep_get()
> would already have to collect it from various PTEs.
> "

I'm proposing to simplify this to:

"
An architecture implementation may ignore the PTE accessed state. Further, the
dirty state must apply atomically to all the PTEs described by the hint.
"

Which I think more accurately describes the requirement. Shout if you disagree.

> 
> I think there are some more details to it, but I'm hoping something along
> the lines above is sufficient.
> 
> 
>> +
>>   #ifndef pte_advance_pfn
>>   static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
>>   {
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 65fbe4f886c1..902665b27702 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -988,16 +988,21 @@ static inline int folio_pte_batch(struct folio *folio,
>> unsigned long addr,
>>   {
>>   unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
>>   const pte_t *end_ptep = start_ptep + max_nr;
>> -    pte_t expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, 1),
>> flags);
>> -    pte_t *ptep = start_ptep + 1;
>> +    pte_t expected_pte = __pte_batch_clear_ignored(pte, flags);
>> +    pte_t *ptep = start_ptep;
>>   bool writable;
>> +    int nr;
>>     if (any_writable)
>>   *any_writable = false;
>>     VM_WARN_ON_FOLIO(!pte_present(pte), folio);
>>   -    while (ptep != end_ptep) {
>> +    nr = pte_batch_hint(ptep, pte);
>> +    expected_pte = pte_advance_pfn(expected_pte, nr);
>> +    ptep += nr;
>> +
> 
> *Maybe* it's easier to get when initializing expected_pte+ptep only once.
> 
> Like:
> 
> [...]
> pte_t expected_pte, *ptep;
> [...]
> 
> nr = pte_batch_hint(start_ptep, pte);
> expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
> ptep = start_ptep + nr;
> 
>> +    while (ptep < end_ptep) {
>>   pte = ptep_get(ptep);
>>   if (any_writable)
>>   writable = !!pte_write(pte);
>> @@ -1011,17 +1016,18 @@ static inline int folio_pte_batch(struct folio 
>> *folio,
>> unsigned long addr,
>>    * corner cases the next PFN might fall into a different
>>    * folio.
>>    */
>> -    if (pte_pfn(pte) == folio_end_pfn)
>> +    if (pte_pfn(pte) >= folio_end_pfn)
>>   break;
>>     if (any_writable)
>>   *any_writable |= writable;
>>   -    expected_pte = pte_advance_pfn(expected_pte, 1);
>> -    ptep++;
>> +    nr = pte_batch_hint(ptep, pte);
>> +    expected_pte = pte_advance_pfn(expected_pte, nr);
>> +    ptep += nr;
>>   }
>>   -    return ptep - start_ptep;
>> +    return min(ptep - start_ptep, max_nr);
>>   }
> 
> Acked-by: David 

Re: [PATCH v5 19/25] arm64/mm: Wire up PTE_CONT for user mappings

2024-02-12 Thread Ryan Roberts
On 12/02/2024 15:26, David Hildenbrand wrote:
> On 12.02.24 15:45, Ryan Roberts wrote:
>> On 12/02/2024 13:54, David Hildenbrand wrote:
> If so, I wonder if we could instead do that comparison modulo the 
> access/dirty
> bits,

 I think that would work - but will need to think a bit more on it.

> and leave ptep_get_lockless() only reading a single entry?

 I think we will need to do something a bit less fragile. ptep_get() does
 collect
 the access/dirty bits so its confusing if ptep_get_lockless() doesn't 
 IMHO. So
 we will likely want to rename the function and make its documentation 
 explicit
 that it does not return those bits.

 ptep_get_lockless_noyoungdirty()? yuk... Any ideas?

 Of course if I could convince you the current implementation is safe, I
 might be
 able to sidestep this optimization until a later date?
>>>
>>> As discussed (and pointed out above), there might be quite some callsites 
>>> where
>>> we don't really care about uptodate accessed/dirty bits -- where ptep_get() 
>>> is
>>> used nowadays.
>>>
>>> One way to approach that I had in mind was having an explicit interface:
>>>
>>> ptep_get()
>>> ptep_get_uptodate()
>>> ptep_get_lockless()
>>> ptep_get_lockless_uptodate()
>>
>> Yes, I like the direction of this. I guess we anticipate that call sites
>> requiring the "_uptodate" variant will be the minority so it makes sense to 
>> use
>> the current names for the "_not_uptodate" variants? But to do a slow 
>> migration,
>> it might be better/safer to have the weaker variant use the new name - that
>> would allow us to downgrade one at a time?
> 
> Yes, I was primarily struggling with names. Likely it makes sense to either 
> have
> two completely new function names, or use the new name only for the "faster 
> but
> less precise" variant.
> 
>>
>>>
>>> Especially the last one might not be needed.
>> I've done a scan through the code and agree with Mark's original conclusions.
>> Additionally, huge_pte_alloc() (which isn't used for arm64) doesn't rely on
>> access/dirty info. So I think I could migrate everything to the weaker 
>> variant
>> fairly easily.
>>
>>>
>>> Further, "uptodate" might not be the best choice because of PageUptodate() 
>>> and
>>> friends. But it's better than "youngdirty"/"noyoungdirty" IMHO.
>>
>> Certainly agree with "noyoungdirty" being a horrible name. How about "_sync" 
>> /
>> "_nosync"?
> 
> I could live with
> 
> ptep_get_sync()
> ptep_get_nosync()
> 
> with proper documentation :)

but could you live with:

ptep_get()
ptep_get_nosync()
ptep_get_lockless_nosync()

?

So leave the "slower, more precise" version with the existing name.
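
i.e. something like this rough sketch (names and exact semantics obviously
still up for discussion):

	/* precise: collects access/dirty, e.g. from the whole contpte block */
	pte_t ptep_get(pte_t *ptep);
	pte_t ptep_get_lockless(pte_t *ptep);

	/* may not collect access/dirty; callers must not rely on those bits */
	pte_t ptep_get_nosync(pte_t *ptep);
	pte_t ptep_get_lockless_nosync(pte_t *ptep);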

> 
> I don't think we use "_sync" / "_nosync" in the context of pte operations yet.
> 
> Well, there seems to be "__arm_v7s_pte_sync" in iommu code, but at least in 
> core
> code nothing jumped at me.
> 



Re: Re: [PATCH v2] powerpc: Avoid nmi_enter/nmi_exit in real mode interrupt.

2024-02-12 Thread Mahesh J Salgaonkar
On 2024-02-12 08:06:25 Mon, Christophe Leroy wrote:
> 
> 
> On 05/02/2024 at 06:36, Mahesh Salgaonkar wrote:
> > 
> > nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel
> > crash when invoked during real mode interrupt handling (e.g. early HMI/MCE
> > interrupt handler) if percpu allocation comes from vmalloc area.
> > 
> > Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI()
> > wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when
> > percpu allocation is from the embedded first chunk. However with
> > CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu
> > allocation can come from the vmalloc area.
> > 
> > With kernel command line "percpu_alloc=page" we can force percpu allocation
> > to come from vmalloc area and can see kernel crash in machine_check_early:
> > 
> > [1.215714] NIP [c0e49eb4] rcu_nmi_enter+0x24/0x110
> > [1.215717] LR [c00461a0] machine_check_early+0xf0/0x2c0
> > [1.215719] --- interrupt: 200
> > [1.215720] [c00fffd73180] [] 0x0 (unreliable)
> > [1.215722] [c00fffd731b0] [] 0x0
> > [1.215724] [c00fffd73210] [c0008364] 
> > machine_check_early_common+0x134/0x1f8
> > 
> > Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu
> > first chunk is not embedded.
> > 
> > Signed-off-by: Mahesh Salgaonkar 
> > ---
> > Changes in v2:
> > - Rebase to upstream master
> > - Use jump_labels, if CONFIG_JUMP_LABEL is enabled, to avoid redoing the
> >test at each interrupt entry.
> > - v1 is at 
> > https://lore.kernel.org/linuxppc-dev/164578465828.74956.6065296024817333750.stgit@jupiter/
> > ---
> >   arch/powerpc/include/asm/interrupt.h | 14 ++
> >   arch/powerpc/include/asm/percpu.h| 11 +++
> >   arch/powerpc/kernel/setup_64.c   | 12 
> >   3 files changed, 37 insertions(+)
> > 
> > diff --git a/arch/powerpc/include/asm/interrupt.h 
> > b/arch/powerpc/include/asm/interrupt.h
> > index a4196ab1d0167..3b4e17c23d9a9 100644
> > --- a/arch/powerpc/include/asm/interrupt.h
> > +++ b/arch/powerpc/include/asm/interrupt.h
> > @@ -336,6 +336,16 @@ static inline void interrupt_nmi_enter_prepare(struct 
> > pt_regs *regs, struct inte
> >  if (IS_ENABLED(CONFIG_KASAN))
> >  return;
> > 
> > +   /*
> > +* Likewise, do not use it in real mode if percpu first chunk is not
> > +* embedded. With CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there
> > +* are chances where percpu allocation can come from vmalloc area.
> > +*/
> > +#ifdef CONFIG_PPC64
> 
> Instead of adding this #ifdef in middle of code, could you define 
> is_embed_first_chunk as always 'true' when CONFIG_PPC64 is not defined ?

Will fix this in v3.
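
Something along these lines should work, I think (rough sketch only, reusing
the is_embed_first_chunk name from your suggestion; the actual v3 may look
different):

	/* arch/powerpc/include/asm/percpu.h -- sketch, needs <linux/jump_label.h> */
	#ifdef CONFIG_PPC64
	DECLARE_STATIC_KEY_FALSE(is_embed_first_chunk);
	#define percpu_embed_first_chunk()	static_branch_likely(&is_embed_first_chunk)
	#else
	/* no vmalloc-backed percpu first chunk to worry about here */
	#define percpu_embed_first_chunk()	true
	#endif

so that interrupt_nmi_enter_prepare() can test percpu_embed_first_chunk()
unconditionally, without the #ifdef CONFIG_PPC64 around it.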

[...]
> > diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> > index 2f19d5e944852..674b6e1bebe9a 100644
> > --- a/arch/powerpc/kernel/setup_64.c
> > +++ b/arch/powerpc/kernel/setup_64.c
> > @@ -834,6 +834,11 @@ static __init int pcpu_cpu_to_node(int cpu)
> > 
> >   unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
> >   EXPORT_SYMBOL(__per_cpu_offset);
> > +#ifdef CONFIG_JUMP_LABEL
> 
> Why this ifdef ? Even when CONFIG_JUMP_LABEL is not selected all this 
> should just work fine.

Yes you are right. I overlooked this. Will fix it in next revision.

Thanks for your review.

-- 
Mahesh J Salgaonkar


Re: [PATCH v5 19/25] arm64/mm: Wire up PTE_CONT for user mappings

2024-02-12 Thread Ryan Roberts
On 12/02/2024 12:59, Ryan Roberts wrote:
> On 12/02/2024 12:00, Mark Rutland wrote:
>> Hi Ryan,
>>
>> Overall this looks pretty good; I have a bunch of minor comments below, and a
>> bigger question on the way ptep_get_lockless() works.
> 
> OK great - thanks for the review. Let's see if I can answer them all...
> 
>>
>> On Fri, Feb 02, 2024 at 08:07:50AM +, Ryan Roberts wrote:
>>> With the ptep API sufficiently refactored, we can now introduce a new
>>> "contpte" API layer, which transparently manages the PTE_CONT bit for
>>> user mappings.
>>>
>>> In this initial implementation, only suitable batches of PTEs, set via
>>> set_ptes(), are mapped with the PTE_CONT bit. Any subsequent
>>> modification of individual PTEs will cause an "unfold" operation to
>>> repaint the contpte block as individual PTEs before performing the
>>> requested operation. While a modification of a single PTE could cause
>>> the block of PTEs to which it belongs to become eligible for "folding"
>>> into a contpte entry, "folding" is not performed in this initial
>>> implementation due to the costs of checking the requirements are met.
>>> Due to this, contpte mappings will degrade back to normal pte mappings
>>> over time if/when protections are changed. This will be solved in a
>>> future patch.
>>>
>>> Since a contpte block only has a single access and dirty bit, the
>>> semantic here changes slightly; when getting a pte (e.g. ptep_get())
>>> that is part of a contpte mapping, the access and dirty information are
>>> pulled from the block (so all ptes in the block return the same
>>> access/dirty info). When changing the access/dirty info on a pte (e.g.
>>> ptep_set_access_flags()) that is part of a contpte mapping, this change
>>> will affect the whole contpte block. This works fine in practice
>>> since we guarantee that only a single folio is mapped by a contpte
>>> block, and the core-mm tracks access/dirty information per folio.
>>>
>>> In order for the public functions, which used to be pure inline, to
>>> continue to be callable by modules, export all the contpte_* symbols
>>> that are now called by those public inline functions.
>>>
>>> The feature is enabled/disabled with the ARM64_CONTPTE Kconfig parameter
>>> at build time. It defaults to enabled as long as its dependency,
>>> TRANSPARENT_HUGEPAGE is also enabled. The core-mm depends upon
>>> TRANSPARENT_HUGEPAGE to be able to allocate large folios, so if it's not
>>> enabled, then there is no chance of meeting the physical contiguity
>>> requirement for contpte mappings.
>>>
>>> Tested-by: John Hubbard 
>>> Signed-off-by: Ryan Roberts 
>>> ---
>>>  arch/arm64/Kconfig   |   9 +
>>>  arch/arm64/include/asm/pgtable.h | 161 ++
>>>  arch/arm64/mm/Makefile   |   1 +
>>>  arch/arm64/mm/contpte.c  | 283 +++
>>>  4 files changed, 454 insertions(+)
>>>  create mode 100644 arch/arm64/mm/contpte.c
>>>
>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>> index d86d7f4758b5..1442e8ed95b6 100644
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -2230,6 +2230,15 @@ config UNWIND_PATCH_PAC_INTO_SCS
>>> select UNWIND_TABLES
>>> select DYNAMIC_SCS
>>>  
>>> +config ARM64_CONTPTE
>>> +   bool "Contiguous PTE mappings for user memory" if EXPERT
>>> +   depends on TRANSPARENT_HUGEPAGE
>>> +   default y
>>> +   help
>>> + When enabled, user mappings are configured using the PTE contiguous
>>> + bit, for any mappings that meet the size and alignment requirements.
>>> + This reduces TLB pressure and improves performance.
>>> +
>>>  endmenu # "Kernel Features"
>>>  
>>>  menu "Boot options"
>>> diff --git a/arch/arm64/include/asm/pgtable.h 
>>> b/arch/arm64/include/asm/pgtable.h
>>> index 7dc6b68ee516..34892a95403d 100644
>>> --- a/arch/arm64/include/asm/pgtable.h
>>> +++ b/arch/arm64/include/asm/pgtable.h
>>> @@ -133,6 +133,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t 
>>> phys)
>>>   */
>>>  #define pte_valid_not_user(pte) \
>>> ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | 
>>> PTE_UXN))
>>> +/*
>>> + * Returns true if the pte is valid and has the contiguous bit set.
>>> + */
>>> +#define pte_valid_cont(pte)(pte_valid(pte) && pte_cont(pte))
>>>  /*
>>>   * Could the pte be present in the TLB? We must check mm_tlb_flush_pending
>>>   * so that we don't erroneously return false for pages that have been
>>> @@ -1135,6 +1139,161 @@ void vmemmap_update_pte(unsigned long addr, pte_t 
>>> *ptep, pte_t pte);
>>>  #define vmemmap_update_pte vmemmap_update_pte
>>>  #endif
>>>  
>>> +#ifdef CONFIG_ARM64_CONTPTE
>>> +
>>> +/*
>>> + * The contpte APIs are used to transparently manage the contiguous bit in 
>>> ptes
>>> + * where it is possible and makes sense to do so. The PTE_CONT bit is 
>>> considered
>>> + * a private implementation detail of the public ptep API (see below).
>>> + */
>>> +extern void __contpte_try_unfold(struct 

Re: [PATCH v5 19/25] arm64/mm: Wire up PTE_CONT for user mappings

2024-02-12 Thread David Hildenbrand

On 12.02.24 15:45, Ryan Roberts wrote:

On 12/02/2024 13:54, David Hildenbrand wrote:

If so, I wonder if we could instead do that comparison modulo the access/dirty
bits,


I think that would work - but will need to think a bit more on it.


and leave ptep_get_lockless() only reading a single entry?


I think we will need to do something a bit less fragile. ptep_get() does collect
the access/dirty bits so it's confusing if ptep_get_lockless() doesn't IMHO. So
we will likely want to rename the function and make its documentation explicit
that it does not return those bits.

ptep_get_lockless_noyoungdirty()? yuk... Any ideas?

Of course if I could convince you the current implementation is safe, I might be
able to sidestep this optimization until a later date?


As discussed (and pointed out above), there might be quite some callsites where
we don't really care about uptodate accessed/dirty bits -- where ptep_get() is
used nowadays.

One way to approach that I had in mind was having an explicit interface:

ptep_get()
ptep_get_uptodate()
ptep_get_lockless()
ptep_get_lockless_uptodate()


Yes, I like the direction of this. I guess we anticipate that call sites
requiring the "_uptodate" variant will be the minority so it makes sense to use
the current names for the "_not_uptodate" variants? But to do a slow migration,
it might be better/safer to have the weaker variant use the new name - that
would allow us to downgrade one at a time?


Yes, I was primarily struggling with names. Likely it makes sense to 
either have two completely new function names, or use the new name only 
for the "faster but less precise" variant.






Especially the last one might not be needed.

I've done a scan through the code and agree with Mark's original conclusions.
Additionally, huge_pte_alloc() (which isn't used for arm64) doesn't rely on
access/dirty info. So I think I could migrate everything to the weaker variant
fairly easily.



Further, "uptodate" might not be the best choice because of PageUptodate() and
friends. But it's better than "youngdirty"/"noyoungdirty" IMHO.


Certainly agree with "noyoungdirty" being a horrible name. How about "_sync" /
"_nosync"?


I could live with

ptep_get_sync()
ptep_get_nosync()

with proper documentation :)

I don't think we use "_sync" / "_nosync" in the context of pte 
operations yet.


Well, there seems to be "__arm_v7s_pte_sync" in iommu code, but at least 
in core code nothing jumped at me.


--
Cheers,

David / dhildenb



Re: [PATCH v5 22/25] mm: Add pte_batch_hint() to reduce scanning in folio_pte_batch()

2024-02-12 Thread Ryan Roberts
On 12/02/2024 13:43, David Hildenbrand wrote:
> On 02.02.24 09:07, Ryan Roberts wrote:
>> Some architectures (e.g. arm64) can tell from looking at a pte, if some
>> follow-on ptes also map contiguous physical memory with the same pgprot.
>> (for arm64, these are contpte mappings).
>>
>> Take advantage of this knowledge to optimize folio_pte_batch() so that
>> it can skip these ptes when scanning to create a batch. By default, if
>> an arch does not opt-in, folio_pte_batch() returns a compile-time 1, so
>> the changes are optimized out and the behaviour is as before.
>>
>> arm64 will opt-in to providing this hint in the next patch, which will
>> greatly reduce the cost of ptep_get() when scanning a range of contptes.
>>
>> Tested-by: John Hubbard 
>> Signed-off-by: Ryan Roberts 
>> ---
>>   include/linux/pgtable.h | 18 ++
>>   mm/memory.c | 20 +---
>>   2 files changed, 31 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>> index 50f32cccbd92..cba31f177d27 100644
>> --- a/include/linux/pgtable.h
>> +++ b/include/linux/pgtable.h
>> @@ -212,6 +212,24 @@ static inline int pmd_dirty(pmd_t pmd)
>>   #define arch_flush_lazy_mmu_mode()    do {} while (0)
>>   #endif
>>   +#ifndef pte_batch_hint
>> +/**
>> + * pte_batch_hint - Number of pages that can be added to batch without 
>> scanning.
>> + * @ptep: Page table pointer for the entry.
>> + * @pte: Page table entry.
>> + *
>> + * Some architectures know that a set of contiguous ptes all map the same
>> + * contiguous memory with the same permissions. In this case, it can 
>> provide a
>> + * hint to aid pte batching without the core code needing to scan every pte.
> 
> I think we might want to document here the expectation regarding
> dirty/accessed bits. folio_pte_batch() will ignore dirty bits only with
> FPB_IGNORE_DIRTY. But especially for arm64, it makes sense to ignore them
> always when batching, because the dirty bit may target any pte part of the
> cont-pte group either way.
> 
> Maybe something like:
> 
> "
> An architecture implementation may only ignore the PTE accessed and dirty 
> bits.
> Further, it may only ignore the dirty bit if that bit is already not
> maintained with precision per PTE inside the hinted batch, and ptep_get()
> would already have to collect it from various PTEs.
> "

Yep, sounds good. I'll add it in next version.

> 
> I think there are some more details to it, but I'm hoping something along
> the lines above is sufficient.
> 
> 
>> +
>>   #ifndef pte_advance_pfn
>>   static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
>>   {
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 65fbe4f886c1..902665b27702 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -988,16 +988,21 @@ static inline int folio_pte_batch(struct folio *folio,
>> unsigned long addr,
>>   {
>>   unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
>>   const pte_t *end_ptep = start_ptep + max_nr;
>> -    pte_t expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, 1),
>> flags);
>> -    pte_t *ptep = start_ptep + 1;
>> +    pte_t expected_pte = __pte_batch_clear_ignored(pte, flags);
>> +    pte_t *ptep = start_ptep;
>>   bool writable;
>> +    int nr;
>>     if (any_writable)
>>   *any_writable = false;
>>     VM_WARN_ON_FOLIO(!pte_present(pte), folio);
>>   -    while (ptep != end_ptep) {
>> +    nr = pte_batch_hint(ptep, pte);
>> +    expected_pte = pte_advance_pfn(expected_pte, nr);
>> +    ptep += nr;
>> +
> 
> *Maybe* it's easier to get when initializing expected_pte+ptep only once.
> 
> Like:
> 
> [...]
> pte_t expected_pte, *ptep;
> [...]
> 
> nr = pte_batch_hint(start_ptep, pte);
> expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
> ptep = start_ptep + nr;

Yeah that works for me. Will change for next version.

> 
>> +    while (ptep < end_ptep) {
>>   pte = ptep_get(ptep);
>>   if (any_writable)
>>   writable = !!pte_write(pte);
>> @@ -1011,17 +1016,18 @@ static inline int folio_pte_batch(struct folio 
>> *folio,
>> unsigned long addr,
>>    * corner cases the next PFN might fall into a different
>>    * folio.
>>    */
>> -    if (pte_pfn(pte) == folio_end_pfn)
>> +    if (pte_pfn(pte) >= folio_end_pfn)
>>   break;
>>     if (any_writable)
>>   *any_writable |= writable;
>>   -    expected_pte = pte_advance_pfn(expected_pte, 1);
>> -    ptep++;
>> +    nr = pte_batch_hint(ptep, pte);
>> +    expected_pte = pte_advance_pfn(expected_pte, nr);
>> +    ptep += nr;
>>   }
>>   -    return ptep - start_ptep;
>> +    return min(ptep - start_ptep, max_nr);
>>   }
> 
> Acked-by: David Hildenbrand 

Thanks!

> 



Re: [PATCH v5 19/25] arm64/mm: Wire up PTE_CONT for user mappings

2024-02-12 Thread Ryan Roberts
On 12/02/2024 13:54, David Hildenbrand wrote:
>>> If so, I wonder if we could instead do that comparison modulo the 
>>> access/dirty
>>> bits,
>>
>> I think that would work - but will need to think a bit more on it.
>>
>>> and leave ptep_get_lockless() only reading a single entry?
>>
>> I think we will need to do something a bit less fragile. ptep_get() does 
>> collect
>> the access/dirty bits so it's confusing if ptep_get_lockless() doesn't IMHO. 
>> So
>> we will likely want to rename the function and make its documentation 
>> explicit
>> that it does not return those bits.
>>
>> ptep_get_lockless_noyoungdirty()? yuk... Any ideas?
>>
>> Of course if I could convince you the current implementation is safe, I 
>> might be
>> able to sidestep this optimization until a later date?
> 
> As discussed (and pointed out above), there might be quite some callsites 
> where
> we don't really care about uptodate accessed/dirty bits -- where ptep_get() is
> used nowadays.
> 
> One way to approach that I had in mind was having an explicit interface:
> 
> ptep_get()
> ptep_get_uptodate()
> ptep_get_lockless()
> ptep_get_lockless_uptodate()

Yes, I like the direction of this. I guess we anticipate that call sites
requiring the "_uptodate" variant will be the minority so it makes sense to use
the current names for the "_not_uptodate" variants? But to do a slow migration,
it might be better/safer to have the weaker variant use the new name - that
would allow us to downgrade one at a time?

> 
> Especially the last one might not be needed.

I've done a scan through the code and agree with Mark's original conclusions.
Additionally, huge_pte_alloc() (which isn't used for arm64) doesn't rely on
access/dirty info. So I think I could migrate everything to the weaker variant
fairly easily.

> 
> Further, "uptodate" might not be the best choice because of PageUptodate() and
> friends. But it's better than "youngdirty"/"noyoungdirty" IMHO.

Certainly agree with "noyoungdirty" being a horrible name. How about "_sync" /
"_nosync"?

> 
> Of course, any such changes require care and are better done one step at a 
> time
> separately.
> 

So I propose to introduce ptep_get_lockless_nosync() (name up for discussion)
and migrate all users to it, as part of this series. This will side-step Mark's
correctness concerns. We can add ptep_get_nosync() later and migrate slowly.

Shout if you think this is a bad plan.

Thanks,
Ryan




Re: [PATCH v3 RESEND 3/6] bitmap: Make bitmap_onto() available to users

2024-02-12 Thread Andy Shevchenko
On Mon, Feb 12, 2024 at 03:20:22PM +0100, Herve Codina wrote:
> On Mon, 12 Feb 2024 16:01:38 +0200
> Andy Shevchenko  wrote:

...

> Agree, the bitmap_onto() code is simpler to understand than its help.
> 
> I introduced bitmap_off() to be the "reverse" bitmap_onto() operations
> and I preferred to avoid duplicating a function that does the same thing.
> 
> On my side, I initially didn't use the bitmap_*() functions and did the
> bit manipulation by hand.
> During the review, it was suggested to use the bitmap_*() family and I 
> followed
> this suggestion.

I would also go this way; the problems I see with the current implementation 
are:
- being related to NUMA (and as Rasmus once pointed out better to be there);
- unclear naming, esp. proposed bitmap_off();
- the quite hard to understand help text
- atomicity when it's not needed (AFAICT).
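
On the helptext point, a concrete example may help; this is roughly what both
helpers compute for the same inputs (based on my reading of the current
bitmap_onto() helptext, so worth re-checking against the test cases):

	DECLARE_BITMAP(orig, 64);	/* bits 1, 3, 5, 7, 9, 11 set */
	DECLARE_BITMAP(relmap, 64);	/* bits 30..39 set */
	DECLARE_BITMAP(dst, 64);

	bitmap_onto(dst, orig, relmap, 64);
	/*
	 * dst now has bits 31, 33, 35, 37, 39 set; orig's bit 11 is dropped
	 * because relmap only has ten set bits.
	 */

	bitmap_scatter(dst, orig, relmap, 64);
	/* should produce the same dst -- that's exactly what needs confirming */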

> I did tests to be sure that bitmap_onto() and bitmap_off() did
> exactly the same things as my previous code did.

Yuri, what do you think about all this?

-- 
With Best Regards,
Andy Shevchenko




Re: [PATCH v5 03/25] mm: Make pte_next_pfn() a wrapper around pte_advance_pfn()

2024-02-12 Thread David Hildenbrand

On 12.02.24 15:10, Ryan Roberts wrote:

On 12/02/2024 12:14, David Hildenbrand wrote:

On 02.02.24 09:07, Ryan Roberts wrote:

The goal is to be able to advance a PTE by an arbitrary number of PFNs.
So introduce a new API that takes a nr param.

We are going to remove pte_next_pfn() and replace it with
pte_advance_pfn(). As a first step, implement pte_next_pfn() as a
wrapper around pte_advance_pfn() so that we can incrementally switch the
architectures over. Once all arches are moved over, we will change all
the core-mm callers to call pte_advance_pfn() directly and remove the
wrapper.

Signed-off-by: Ryan Roberts 
---
   include/linux/pgtable.h | 8 +++-
   1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5e7eaf8f2b97..815d92dcb96b 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -214,9 +214,15 @@ static inline int pmd_dirty(pmd_t pmd)
       #ifndef pte_next_pfn
+#ifndef pte_advance_pfn
+static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
+{
+    return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
+}
+#endif
   static inline pte_t pte_next_pfn(pte_t pte)
   {
-    return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+    return pte_advance_pfn(pte, 1);
   }
   #endif
   


I do wonder if we simply want to leave pte_next_pfn() around? Especially patch
#4, #6 don't really benefit from the change? So are the other set_ptes()
implementations.

That is, only convert all pte_next_pfn()->pte_advance_pfn(), and leave a
pte_next_pfn() macro in place.

Any downsides to that?


The downside is just having multiple functions that effectively do the same
thing. Personally I think its cleaner and easier to understand the code with
just one generic function which we pass 1 to it where we only want to advance by
1. In the end, there are only a couple of places where pte_advance_pfn(1) is
used, so doesn't really seem valuable to me to maintain a specialization.


Well, not really functions, just a macro. Like we have set_pte_at() 
translating to set_ptes().


Arguably, we have more callers of set_pte_at().

"Easier to understand", I don't know. :)



Unless you feel strongly that we need to keep pte_next_pfn() then I'd prefer to
leave it as I've done in this series.


Well, it makes you patch set shorter and there is less code churn.

So personally, I'd just leave pte_next_pfn() in there. But whatever you 
prefer, not the end of the world.


--
Cheers,

David / dhildenb



Re: [PATCH v3 RESEND 3/6] bitmap: Make bitmap_onto() available to users

2024-02-12 Thread Herve Codina
On Mon, 12 Feb 2024 16:01:38 +0200
Andy Shevchenko  wrote:

> On Mon, Feb 12, 2024 at 02:37:53PM +0100, Herve Codina wrote:
> > On Mon, 12 Feb 2024 14:27:16 +0200
> > Andy Shevchenko  wrote:  
> > > On Mon, Feb 12, 2024 at 08:56:31AM +0100, Herve Codina wrote:  
> > > > Currently bitmap_onto() is available only for the CONFIG_NUMA=y case,
> > > > while some users may benefit from it independently of NUMA
> > > > code.
> > > > 
> > > > Make it available to users by moving out of ifdeffery and exporting for
> > > > modules.
> > > 
> > > Wondering if you are trying to have something like
> > > https://lore.kernel.org/lkml/20230926052007.3917389-1-andriy.shevche...@linux.intel.com/
> > >   
> > 
> > Yes, it looks like.
> > Can you confirm that your bitmap_scatter() do the same operations as the
> > existing bitmap_onto() ?  
> 
> I have test cases to be 100% sure, but on the first glance, yes it does with
> the adjustment to the atomicity of the operations (though I do not understand
> why the original bitmap_onto() implementation needs to be atomic).
> 
> This actually gives a question if we should use your approach or mine.
> At least the help of bitmap_onto() is kinda hard to understand.

Agree, the bitmap_onto() code is simpler to understand than its help.

I introduced bitmap_off() to be the "reverse" bitmap_onto() operations
and I preferred to avoid duplicating a function that does the same thing.

On my side, I initially didn't use the bitmap_*() functions and did the
bit manipulation by hand.
During the review, it was suggested to use the bitmap_*() family and I followed
this suggestion. I did tests to be sure that bitmap_onto() and bitmap_off() did
exactly the same things as my previous code did.

> 
> > If so, your bitmap_gather() will match my bitmap_off() (patch 4 in this
> > series).  
> 
> Yes.
> 

Regards,
Hervé



Re: [PATCH] mm/hugetlb: Move page order check inside hugetlb_cma_reserve()

2024-02-12 Thread David Hildenbrand

On 09.02.24 06:42, Anshuman Khandual wrote:

All platforms could benefit from page order check against MAX_PAGE_ORDER
before allocating a CMA area for gigantic hugetlb pages. Let's move this
check from individual platforms to generic hugetlb.

Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: linux-arm-ker...@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux...@kvack.org
Cc: linux-ker...@vger.kernel.org
Signed-off-by: Anshuman Khandual 
---
This applies on v6.8-rc3
  
  arch/arm64/mm/hugetlbpage.c   | 7 ---

  arch/powerpc/mm/hugetlbpage.c | 4 +---
  mm/hugetlb.c  | 7 +++
  3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 8116ac599f80..6720ec8d50e7 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -45,13 +45,6 @@ void __init arm64_hugetlb_cma_reserve(void)
else
order = CONT_PMD_SHIFT - PAGE_SHIFT;
  
-	/*

-* HugeTLB CMA reservation is required for gigantic
-* huge pages which could not be allocated via the
-* page allocator. Just warn if there is any change
-* breaking this assumption.
-*/
-   WARN_ON(order <= MAX_PAGE_ORDER);
hugetlb_cma_reserve(order);
  }
  #endif /* CONFIG_CMA */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0a540b37aab6..16557d008eef 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -614,8 +614,6 @@ void __init gigantic_hugetlb_cma_reserve(void)
 */
order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
  
-	if (order) {

-   VM_WARN_ON(order <= MAX_PAGE_ORDER);
+   if (order)
hugetlb_cma_reserve(order);
-   }
  }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cf9c9b2906ea..345b3524df35 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7699,6 +7699,13 @@ void __init hugetlb_cma_reserve(int order)
bool node_specific_cma_alloc = false;
int nid;
  
+	/*

+* HugeTLB CMA reservation is required for gigantic
+* huge pages which could not be allocated via the
+* page allocator. Just warn if there is any change
+* breaking this assumption.
+*/
+   VM_WARN_ON(order <= MAX_PAGE_ORDER);
cma_reserve_called = true;
  
  	if (!hugetlb_cma_size)


Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb



Re: [PATCH v5 03/25] mm: Make pte_next_pfn() a wrapper around pte_advance_pfn()

2024-02-12 Thread Ryan Roberts
On 12/02/2024 12:14, David Hildenbrand wrote:
> On 02.02.24 09:07, Ryan Roberts wrote:
>> The goal is to be able to advance a PTE by an arbitrary number of PFNs.
>> So introduce a new API that takes a nr param.
>>
>> We are going to remove pte_next_pfn() and replace it with
>> pte_advance_pfn(). As a first step, implement pte_next_pfn() as a
>> wrapper around pte_advance_pfn() so that we can incrementally switch the
>> architectures over. Once all arches are moved over, we will change all
>> the core-mm callers to call pte_advance_pfn() directly and remove the
>> wrapper.
>>
>> Signed-off-by: Ryan Roberts 
>> ---
>>   include/linux/pgtable.h | 8 +++-
>>   1 file changed, 7 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>> index 5e7eaf8f2b97..815d92dcb96b 100644
>> --- a/include/linux/pgtable.h
>> +++ b/include/linux/pgtable.h
>> @@ -214,9 +214,15 @@ static inline int pmd_dirty(pmd_t pmd)
>>       #ifndef pte_next_pfn
>> +#ifndef pte_advance_pfn
>> +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
>> +{
>> +    return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
>> +}
>> +#endif
>>   static inline pte_t pte_next_pfn(pte_t pte)
>>   {
>> -    return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
>> +    return pte_advance_pfn(pte, 1);
>>   }
>>   #endif
>>   
> 
> I do wonder if we simply want to leave pte_next_pfn() around? Especially patch
> #4, #6 don't really benefit from the change? So are the other set_ptes()
> implementations.
> 
> That is, only convert all pte_next_pfn()->pte_advance_pfn(), and leave a
> pte_next_pfn() macro in place.
> 
> Any downsides to that? 

The downside is just having multiple functions that effectively do the same
thing. Personally I think its cleaner and easier to understand the code with
just one generic function which we pass 1 to it where we only want to advance by
1. In the end, there are only a couple of places where pte_advance_pfn(1) is
used, so doesn't really seem valuable to me to maintain a specialization.

Unless you feel strongly that we need to keep pte_next_pfn() then I'd prefer to
leave it as I've done in this series.

> This patch here would become:
> 
> #ifndef pte_advance_pfn
> static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
> {
> return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
> }
> #endif
> 
> #ifndef pte_next_pfn
> #define pte_next_pfn(pte) pte_advance_pfn(pte, 1)
> #endif
> 
> As you convert the three arches, make them define pte_advance_pfn and undefine
> pte_next_pfn. in the end, you can drop the #ifdef around pte_next_pfn here.
> 



Re: [PATCH v3 RESEND 3/6] bitmap: Make bitmap_onto() available to users

2024-02-12 Thread Andy Shevchenko
On Mon, Feb 12, 2024 at 02:37:53PM +0100, Herve Codina wrote:
> On Mon, 12 Feb 2024 14:27:16 +0200
> Andy Shevchenko  wrote:
> > On Mon, Feb 12, 2024 at 08:56:31AM +0100, Herve Codina wrote:
> > > Currently bitmap_onto() is available only for the CONFIG_NUMA=y case,
> > > while some users may benefit from it independently of NUMA
> > > code.  
> > > 
> > > Make it available to users by moving out of ifdeffery and exporting for
> > > modules.  
> > 
> > Wondering if you are trying to have something like
> > https://lore.kernel.org/lkml/20230926052007.3917389-1-andriy.shevche...@linux.intel.com/
> 
> Yes, it looks like.
> Can you confirm that your bitmap_scatter() do the same operations as the
> existing bitmap_onto() ?

I have test cases to be 100% sure, but on the first glance, yes it does with
the adjustment to the atomicity of the operations (though I do not understand
why the original bitmap_onto() implementation needs to be atomic).

This actually gives a question if we should use your approach or mine.
At least the help of bitmap_onto() is kinda hard to understand.

> If so, your bitmap_gather() will match my bitmap_off() (patch 4 in this
> series).

Yes.

-- 
With Best Regards,
Andy Shevchenko




Re: [PATCH v15 2/5] crash: add a new kexec flag for hotplug support

2024-02-12 Thread Sourabh Jain

Hello Baoquan,

On 05/02/24 08:40, Baoquan He wrote:

Hi Sourabh,

Thanks for the great work. There are some concerns, please see inline
comments.


Thank you :)



On 01/11/24 at 04:21pm, Sourabh Jain wrote:
..

Now, if the kexec tool sends KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag to
the kernel, it indicates to the kernel that all the required kexec
segments are skipped from SHA calculation and it is safe to update the kdump
image loaded using the kexec_load syscall.

So finally you add a new KEXEC_CRASH_HOTPLUG_SUPPORT flag, that's fine.

..

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 9bb6607e864e..e791129fdf6c 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -211,6 +211,9 @@ extern void kdump_nmi_shootdown_cpus(void);
  void arch_crash_handle_hotplug_event(struct kimage *image, void *arg);
  #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
  
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags);

+#define arch_crash_hotplug_support arch_crash_hotplug_support
+
  #ifdef CONFIG_HOTPLUG_CPU
  int arch_crash_hotplug_cpu_support(void);
  #define crash_hotplug_cpu_support arch_crash_hotplug_cpu_support

Then crash_hotplug_cpu_support is not needed any more on x86_64, and
crash_hotplug_memory_support(), if you remove their implementation in
arch/x86/kernel/crash.c, won't it cause building warning or error on x86?


Yeah, crash_hotplug_cpu_support and crash_hotplug_memory_support are
no longer required. My bad, I forgot to remove them.


diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 44744e9c68ec..293b54bff706 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -398,20 +398,16 @@ int crash_load_segments(struct kimage *image)
  #undef pr_fmt
  #define pr_fmt(fmt) "crash hp: " fmt
  
-/* These functions provide the value for the sysfs crash_hotplug nodes */

-#ifdef CONFIG_HOTPLUG_CPU
-int arch_crash_hotplug_cpu_support(void)
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags)
  {
-   return crash_check_update_elfcorehdr();
-}
-#endif
  
-#ifdef CONFIG_MEMORY_HOTPLUG

-int arch_crash_hotplug_memory_support(void)
-{
-   return crash_check_update_elfcorehdr();
-}
+#ifdef CONFIG_KEXEC_FILE
+   if (image->file_mode)
+   return 1;
  #endif
+   return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR ||
+   kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT);

Do we need to add some documentation explaining why there are two kexec flags
on x86_64, beyond what this patch log says?


Sure I will add a comment about it.
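
Something like this, perhaps (wording still to be polished):

	/*
	 * Two flags can indicate hotplug support for a kexec_load'd image on
	 * x86_64: the older KEXEC_UPDATE_ELFCOREHDR (kexec tools that only
	 * exclude the elfcorehdr from SHA verification) and the generic
	 * KEXEC_CRASH_HOTPLUG_SUPPORT. Accept either one to stay backward
	 * compatible with existing kexec tools.
	 */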




+}
  
  unsigned int arch_crash_get_elfcorehdr_size(void)

  {
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 548491de818e..2f411ddfbd8b 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -306,7 +306,7 @@ static ssize_t crash_hotplug_show(struct device *dev,
 struct device_attribute *attr,
 char *buf)
  {
-   return sysfs_emit(buf, "%d\n", crash_hotplug_cpu_support());
+   return sysfs_emit(buf, "%d\n", crash_check_hotplug_support());
  }
  static DEVICE_ATTR_ADMIN_RO(crash_hotplug);
  #endif
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 8a13babd826c..e70ab1d3428e 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -514,7 +514,7 @@ static DEVICE_ATTR_RW(auto_online_blocks);
  static ssize_t crash_hotplug_show(struct device *dev,
   struct device_attribute *attr, char *buf)
  {
-   return sysfs_emit(buf, "%d\n", crash_hotplug_memory_support());
+   return sysfs_emit(buf, "%d\n", crash_check_hotplug_support());
  }
  static DEVICE_ATTR_RO(crash_hotplug);
  #endif
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 802052d9c64b..7880d74dc5c4 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -317,8 +317,8 @@ struct kimage {
/* If set, we are using file mode kexec syscall */
unsigned int file_mode:1;
  #ifdef CONFIG_CRASH_HOTPLUG
-   /* If set, allow changes to elfcorehdr of kexec_load'd image */
-   unsigned int update_elfcorehdr:1;
+   /* If set, allow changes to kexec segments of kexec_load'd image */

The code comment doesn't reflect the usage of the flag.

I should have updated the comment to indicate that this flag is for both
system calls.

More comments below.


You set it too
when it's kexec_file_load. Speaking of this, I do wonder why you need to
set it too for kexec_file_load,

If we do this, one can just access image->hotplug_support to find hotplug
support for the currently loaded kdump image without bothering about which
system call was used to load the kdump image.


and why we have
arch_crash_hotplug_support(), then crash_check_hotplug_support() both of
which have the same effect.


arch_crash_hotplug_support(): This function processes the kexec flags 
and finds the

Re: [PATCH v5 19/25] arm64/mm: Wire up PTE_CONT for user mappings

2024-02-12 Thread David Hildenbrand

If so, I wonder if we could instead do that comparison modulo the access/dirty
bits,


I think that would work - but will need to think a bit more on it.


and leave ptep_get_lockless() only reading a single entry?


I think we will need to do something a bit less fragile. ptep_get() does collect
the access/dirty bits so it's confusing if ptep_get_lockless() doesn't IMHO. So
we will likely want to rename the function and make its documentation explicit
that it does not return those bits.

ptep_get_lockless_noyoungdirty()? yuk... Any ideas?

Of course if I could convince you the current implementation is safe, I might be
able to sidestep this optimization until a later date?


As discussed (and pointed out above), there might be quite some 
callsites where we don't really care about uptodate accessed/dirty bits 
-- where ptep_get() is used nowadays.


One way to approach that I had in mind was having an explicit interface:

ptep_get()
ptep_get_uptodate()
ptep_get_lockless()
ptep_get_lockless_uptodate()

Especially the last one might not be needed.

Further, "uptodate" might not be the best choice because of 
PageUptodate() and friends. But it's better than 
"youngdirty"/"noyoungdirty" IMHO.


Of course, any such changes require care and are better done one step at 
a time separately.


--
Cheers,

David / dhildenb



Re: [PATCH v5 23/25] arm64/mm: Implement pte_batch_hint()

2024-02-12 Thread David Hildenbrand

On 02.02.24 09:07, Ryan Roberts wrote:

When core code iterates over a range of ptes and calls ptep_get() for
each of them, if the range happens to cover contpte mappings, the number
of pte reads becomes amplified by a factor of the number of PTEs in a
contpte block. This is because for each call to ptep_get(), the
implementation must read all of the ptes in the contpte block to which
it belongs to gather the access and dirty bits.

This causes a hotspot for fork(), as well as operations that unmap
memory such as munmap(), exit and madvise(MADV_DONTNEED). Fortunately we
can fix this by implementing pte_batch_hint() which allows their
iterators to skip getting the contpte tail ptes when gathering the batch
of ptes to operate on. This results in the number of PTE reads returning
to 1 per pte.

Tested-by: John Hubbard 
Signed-off-by: Ryan Roberts 
---
  arch/arm64/include/asm/pgtable.h | 9 +
  1 file changed, 9 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index ad04adb7b87f..353ea67b5d75 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1220,6 +1220,15 @@ static inline void contpte_try_unfold(struct mm_struct 
*mm, unsigned long addr,
__contpte_try_unfold(mm, addr, ptep, pte);
  }
  
+#define pte_batch_hint pte_batch_hint

+static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
+{
+   if (!pte_valid_cont(pte))
+   return 1;
+
+   return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1));
+}
+
  /*
   * The below functions constitute the public API that arm64 presents to the
   * core-mm to manipulate PTE entries within their page tables (or at least 
this
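
For other readers, my understanding of the arithmetic (assuming 8-byte pte_t
entries, hence the ">> 3"):

	/* index of ptep within its CONT_PTES-aligned block ... */
	idx  = ((unsigned long)ptep >> 3) & (CONT_PTES - 1);
	/* ... so the hint is the number of entries up to the end of the block */
	hint = CONT_PTES - idx;

e.g. with CONT_PTES == 16, a ptep pointing at entry 5 of its block yields a
hint of 11.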



Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb



Re: [PATCH v5 22/25] mm: Add pte_batch_hint() to reduce scanning in folio_pte_batch()

2024-02-12 Thread David Hildenbrand

On 02.02.24 09:07, Ryan Roberts wrote:

Some architectures (e.g. arm64) can tell from looking at a pte, if some
follow-on ptes also map contiguous physical memory with the same pgprot.
(for arm64, these are contpte mappings).

Take advantage of this knowledge to optimize folio_pte_batch() so that
it can skip these ptes when scanning to create a batch. By default, if
an arch does not opt-in, folio_pte_batch() returns a compile-time 1, so
the changes are optimized out and the behaviour is as before.

arm64 will opt-in to providing this hint in the next patch, which will
greatly reduce the cost of ptep_get() when scanning a range of contptes.

Tested-by: John Hubbard 
Signed-off-by: Ryan Roberts 
---
  include/linux/pgtable.h | 18 ++
  mm/memory.c | 20 +---
  2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 50f32cccbd92..cba31f177d27 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -212,6 +212,24 @@ static inline int pmd_dirty(pmd_t pmd)
  #define arch_flush_lazy_mmu_mode()do {} while (0)
  #endif
  
+#ifndef pte_batch_hint

+/**
+ * pte_batch_hint - Number of pages that can be added to batch without 
scanning.
+ * @ptep: Page table pointer for the entry.
+ * @pte: Page table entry.
+ *
+ * Some architectures know that a set of contiguous ptes all map the same
+ * contiguous memory with the same permissions. In this case, it can provide a
+ * hint to aid pte batching without the core code needing to scan every pte.


I think we might want to document here the expectation regarding
dirty/accessed bits. folio_pte_batch() will ignore dirty bits only with
FPB_IGNORE_DIRTY. But especially for arm64, it makes sense to ignore them
always when batching, because the dirty bit may target any pte part of the
cont-pte group either way.

Maybe something like:

"
An architecture implementation may only ignore the PTE accessed and dirty bits.
Further, it may only ignore the dirty bit if that bit is already not
maintained with precision per PTE inside the hinted batch, and ptep_get()
would already have to collect it from various PTEs.
"

I think there are some more details to it, but I'm hoping something along
the lines above is sufficient.



+
  #ifndef pte_advance_pfn
  static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
  {
diff --git a/mm/memory.c b/mm/memory.c
index 65fbe4f886c1..902665b27702 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -988,16 +988,21 @@ static inline int folio_pte_batch(struct folio *folio, 
unsigned long addr,
  {
unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
const pte_t *end_ptep = start_ptep + max_nr;
-   pte_t expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, 1), 
flags);
-   pte_t *ptep = start_ptep + 1;
+   pte_t expected_pte = __pte_batch_clear_ignored(pte, flags);
+   pte_t *ptep = start_ptep;
bool writable;
+   int nr;
  
  	if (any_writable)

*any_writable = false;
  
  	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
  
-	while (ptep != end_ptep) {

+   nr = pte_batch_hint(ptep, pte);
+   expected_pte = pte_advance_pfn(expected_pte, nr);
+   ptep += nr;
+


*Maybe* it's easier to get when initializing expected_pte+ptep only once.

Like:

[...]
pte_t expected_pte, *ptep;
[...]

nr = pte_batch_hint(start_ptep, pte);
expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
ptep = start_ptep + nr;


+   while (ptep < end_ptep) {
pte = ptep_get(ptep);
if (any_writable)
writable = !!pte_write(pte);
@@ -1011,17 +1016,18 @@ static inline int folio_pte_batch(struct folio *folio, 
unsigned long addr,
 * corner cases the next PFN might fall into a different
 * folio.
 */
-   if (pte_pfn(pte) == folio_end_pfn)
+   if (pte_pfn(pte) >= folio_end_pfn)
break;
  
  		if (any_writable)

*any_writable |= writable;
  
-		expected_pte = pte_advance_pfn(expected_pte, 1);

-   ptep++;
+   nr = pte_batch_hint(ptep, pte);
+   expected_pte = pte_advance_pfn(expected_pte, nr);
+   ptep += nr;
}
  
-	return ptep - start_ptep;

+   return min(ptep - start_ptep, max_nr);
  }


Acked-by: David Hildenbrand 

--
Cheers,

David / dhildenb



Re: [PATCH v3 RESEND 3/6] bitmap: Make bitmap_onto() available to users

2024-02-12 Thread Herve Codina
Hi Andy,

On Mon, 12 Feb 2024 14:27:16 +0200
Andy Shevchenko  wrote:

> On Mon, Feb 12, 2024 at 08:56:31AM +0100, Herve Codina wrote:
> > Currently bitmap_onto() is available only for the CONFIG_NUMA=y case,
> > while some users may benefit from it independently of NUMA
> > code.  
> > 
> > Make it available to users by moving out of ifdeffery and exporting for
> > modules.  
> 
> Wondering if you are trying to have something like
> https://lore.kernel.org/lkml/20230926052007.3917389-1-andriy.shevche...@linux.intel.com/
> 

Yes, it looks like.
Can you confirm that your bitmap_scatter() do the same operations as the
existing bitmap_onto() ?

If so, your bitmap_gather() will match my bitmap_off() (patch 4 in this series).

Thanks,
Hervé

-- 
Hervé Codina, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com


Re: [PATCH v5 18/25] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

2024-02-12 Thread Ryan Roberts
On 12/02/2024 13:15, David Hildenbrand wrote:
> On 12.02.24 14:05, Ryan Roberts wrote:
>> On 12/02/2024 12:44, David Hildenbrand wrote:
>>> On 02.02.24 09:07, Ryan Roberts wrote:
 Split __flush_tlb_range() into __flush_tlb_range_nosync() +
 __flush_tlb_range(), in the same way as the existing flush_tlb_page()
 arrangement. This allows calling __flush_tlb_range_nosync() to elide the
 trailing DSB. Forthcoming "contpte" code will take advantage of this
 when clearing the young bit from a contiguous range of ptes.

 Tested-by: John Hubbard 
 Signed-off-by: Ryan Roberts 
 ---
    arch/arm64/include/asm/tlbflush.h | 13 +++--
    1 file changed, 11 insertions(+), 2 deletions(-)

 diff --git a/arch/arm64/include/asm/tlbflush.h
 b/arch/arm64/include/asm/tlbflush.h
 index 79e932a1bdf8..50a765917327 100644
 --- a/arch/arm64/include/asm/tlbflush.h
 +++ b/arch/arm64/include/asm/tlbflush.h
 @@ -422,7 +422,7 @@ do {    \
    #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
    __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false,
 kvm_lpa2_is_enabled());
    -static inline void __flush_tlb_range(struct vm_area_struct *vma,
 +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
     unsigned long start, unsigned long end,
     unsigned long stride, bool last_level,
     int tlb_level)
 @@ -456,10 +456,19 @@ static inline void __flush_tlb_range(struct
 vm_area_struct *vma,
    __flush_tlb_range_op(vae1is, start, pages, stride, asid,
     tlb_level, true, lpa2_is_enabled());
    -    dsb(ish);
    mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
    }
    +static inline void __flush_tlb_range(struct vm_area_struct *vma,
 + unsigned long start, unsigned long end,
 + unsigned long stride, bool last_level,
 + int tlb_level)
 +{
 +    __flush_tlb_range_nosync(vma, start, end, stride,
 + last_level, tlb_level);
 +    dsb(ish);
 +}
 +
    static inline void flush_tlb_range(struct vm_area_struct *vma,
   unsigned long start, unsigned long end)
    {
>>>
>>> You're now calling dsb() after 
>>> mmu_notifier_arch_invalidate_secondary_tlbs().
>>>
>>>
>>> In flush_tlb_mm(), we have the order
>>>
>>>  dsb(ish);
>>>  mmu_notifier_arch_invalidate_secondary_tlbs()
>>>
>>> In flush_tlb_page(), we have the effective order:
>>>
>>>  mmu_notifier_arch_invalidate_secondary_tlbs()
>>>  dsb(ish);
>>>
>>> In flush_tlb_range(), we used to have the order:
>>>
>>>  dsb(ish);
>>>  mmu_notifier_arch_invalidate_secondary_tlbs();
>>>
>>>
>>> So I *suspect* having that DSB before
>>> mmu_notifier_arch_invalidate_secondary_tlbs() is fine. Hopefully, nothing in
>>> there relies on that placement.
>>
>> Will spotted this against v3. My argument was that I was following the 
>> existing
>> pattern in flush_tlb_page(). Apparently that is not correct and needs 
>> changing,
>> but the conclusion was to leave my change as is for now, since it is 
>> consistent
>> and change them at a later date together.
> 
> Good, I think you should add a few words to the patch description ("ordering
> might be incorrect, but is in-line with __flush_tlb_page()"; will be resolved
> separately).
> 

ACK, will do. Thanks!



Re: [PATCH] powerpc/cputable: Add missing PPC_FEATURE_BOOKE on PPC64 Book-E

2024-02-12 Thread Christophe Leroy


On 07/02/2024 at 10:27, David Engraf wrote:
> 
> Commit e320a76db4b0 ("powerpc/cputable: Split cpu_specs[] out of cputable.h")
> moved the cpu_specs to separate header files. Previously PPC_FEATURE_BOOKE
> was enabled by CONFIG_PPC_BOOK3E_64. The definition in cpu_specs_e500mc.h
> for PPC64 no longer enables PPC_FEATURE_BOOKE.
> 
> This breaks user space reading the ELF hwcaps and expect PPC_FEATURE_BOOKE.
> Debugging an application with gdb is no longer working on e5500/e6500
> because the 64-bit detection relies on PPC_FEATURE_BOOKE for Book-E.
> 
> Fixes: e320a76db4b0 ("powerpc/cputable: Split cpu_specs[] out of cputable.h")
> Signed-off-by: David Engraf 

Reviewed-by: Christophe Leroy 

> ---
>   arch/powerpc/kernel/cpu_specs_e500mc.h | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/cpu_specs_e500mc.h 
> b/arch/powerpc/kernel/cpu_specs_e500mc.h
> index ceb06b109f831..2ae8e9a7b461c 100644
> --- a/arch/powerpc/kernel/cpu_specs_e500mc.h
> +++ b/arch/powerpc/kernel/cpu_specs_e500mc.h
> @@ -8,7 +8,8 @@
> 
>   #ifdef CONFIG_PPC64
>   #define COMMON_USER_BOOKE  (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \
> -PPC_FEATURE_HAS_FPU | PPC_FEATURE_64)
> +PPC_FEATURE_HAS_FPU | PPC_FEATURE_64 | \
> +PPC_FEATURE_BOOKE)
>   #else
>   #define COMMON_USER_BOOKE  (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \
>   PPC_FEATURE_BOOKE)
> --
> 2.40.1
> 
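
For anyone who wants to double-check from userspace, a quick test like the one
below (illustrative only; the hwcap value is copied from <asm/cputable.h>, so
please verify against your headers) should report the bit as set again once
the fix is applied:

	/* quick PPC_FEATURE_BOOKE check -- illustrative only */
	#include <stdio.h>
	#include <sys/auxv.h>

	#ifndef PPC_FEATURE_BOOKE
	#define PPC_FEATURE_BOOKE	0x00008000
	#endif

	int main(void)
	{
		unsigned long hwcap = getauxval(AT_HWCAP);

		printf("PPC_FEATURE_BOOKE is %s\n",
		       (hwcap & PPC_FEATURE_BOOKE) ? "set" : "missing");
		return 0;
	}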


Re: [PATCH v5 18/25] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

2024-02-12 Thread David Hildenbrand

On 12.02.24 14:05, Ryan Roberts wrote:

On 12/02/2024 12:44, David Hildenbrand wrote:

On 02.02.24 09:07, Ryan Roberts wrote:

Split __flush_tlb_range() into __flush_tlb_range_nosync() +
__flush_tlb_range(), in the same way as the existing flush_tlb_page()
arrangement. This allows calling __flush_tlb_range_nosync() to elide the
trailing DSB. Forthcoming "contpte" code will take advantage of this
when clearing the young bit from a contiguous range of ptes.

Tested-by: John Hubbard 
Signed-off-by: Ryan Roberts 
---
   arch/arm64/include/asm/tlbflush.h | 13 +++--
   1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/tlbflush.h
b/arch/arm64/include/asm/tlbflush.h
index 79e932a1bdf8..50a765917327 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -422,7 +422,7 @@ do {    \
   #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
   __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false,
kvm_lpa2_is_enabled());
   -static inline void __flush_tlb_range(struct vm_area_struct *vma,
+static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
    unsigned long start, unsigned long end,
    unsigned long stride, bool last_level,
    int tlb_level)
@@ -456,10 +456,19 @@ static inline void __flush_tlb_range(struct
vm_area_struct *vma,
   __flush_tlb_range_op(vae1is, start, pages, stride, asid,
    tlb_level, true, lpa2_is_enabled());
   -    dsb(ish);
   mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
   }
   +static inline void __flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long stride, bool last_level,
+ int tlb_level)
+{
+    __flush_tlb_range_nosync(vma, start, end, stride,
+ last_level, tlb_level);
+    dsb(ish);
+}
+
   static inline void flush_tlb_range(struct vm_area_struct *vma,
  unsigned long start, unsigned long end)
   {


You're now calling dsb() after mmu_notifier_arch_invalidate_secondary_tlbs().


In flush_tlb_mm(), we have the order

 dsb(ish);
 mmu_notifier_arch_invalidate_secondary_tlbs()

In flush_tlb_page(), we have the effective order:

 mmu_notifier_arch_invalidate_secondary_tlbs()
 dsb(ish);

In flush_tlb_range(), we used to have the order:

 dsb(ish);
 mmu_notifier_arch_invalidate_secondary_tlbs();


So I *suspect* having that DSB before
mmu_notifier_arch_invalidate_secondary_tlbs() is fine. Hopefully, nothing in
there relies on that placement.


Will spotted this against v3. My argument was that I was following the existing
pattern in flush_tlb_page(). Apparently that is not correct and needs changing,
but the conclusion was to leave my change as is for now, since it is consistent
and change them at a later date together.


Good, I think you should add a few words to the patch description 
("ordering might be incorrect, but is in-line with __flush_tlb_page()"; 
will be resolved separately).


--
Cheers,

David / dhildenb



Re: [PATCH v5 18/25] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

2024-02-12 Thread Ryan Roberts
On 12/02/2024 12:44, David Hildenbrand wrote:
> On 02.02.24 09:07, Ryan Roberts wrote:
>> Split __flush_tlb_range() into __flush_tlb_range_nosync() +
>> __flush_tlb_range(), in the same way as the existing flush_tlb_page()
>> arrangement. This allows calling __flush_tlb_range_nosync() to elide the
>> trailing DSB. Forthcoming "contpte" code will take advantage of this
>> when clearing the young bit from a contiguous range of ptes.
>>
>> Tested-by: John Hubbard 
>> Signed-off-by: Ryan Roberts 
>> ---
>>   arch/arm64/include/asm/tlbflush.h | 13 +++--
>>   1 file changed, 11 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/tlbflush.h
>> b/arch/arm64/include/asm/tlbflush.h
>> index 79e932a1bdf8..50a765917327 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -422,7 +422,7 @@ do {    \
>>   #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>>   __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false,
>> kvm_lpa2_is_enabled());
>>   -static inline void __flush_tlb_range(struct vm_area_struct *vma,
>> +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>>    unsigned long start, unsigned long end,
>>    unsigned long stride, bool last_level,
>>    int tlb_level)
>> @@ -456,10 +456,19 @@ static inline void __flush_tlb_range(struct
>> vm_area_struct *vma,
>>   __flush_tlb_range_op(vae1is, start, pages, stride, asid,
>>    tlb_level, true, lpa2_is_enabled());
>>   -    dsb(ish);
>>   mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
>>   }
>>   +static inline void __flush_tlb_range(struct vm_area_struct *vma,
>> + unsigned long start, unsigned long end,
>> + unsigned long stride, bool last_level,
>> + int tlb_level)
>> +{
>> +    __flush_tlb_range_nosync(vma, start, end, stride,
>> + last_level, tlb_level);
>> +    dsb(ish);
>> +}
>> +
>>   static inline void flush_tlb_range(struct vm_area_struct *vma,
>>  unsigned long start, unsigned long end)
>>   {
> 
> You're now calling dsb() after mmu_notifier_arch_invalidate_secondary_tlbs().
> 
> 
> In flush_tlb_mm(), we have the order
> 
> dsb(ish);   
> mmu_notifier_arch_invalidate_secondary_tlbs()
> 
> In flush_tlb_page(), we have the effective order:
> 
> mmu_notifier_arch_invalidate_secondary_tlbs()
> dsb(ish);
> 
> In flush_tlb_range(), we used to have the order:
> 
> dsb(ish);
> mmu_notifier_arch_invalidate_secondary_tlbs();
> 
> 
> So I *suspect* having that DSB before
> mmu_notifier_arch_invalidate_secondary_tlbs() is fine. Hopefully, nothing in
> there relies on that placement.

Will spotted this against v3. My argument was that I was following the existing
pattern in flush_tlb_page(). Apparently that is not correct and needs changing,
but the conclusion was to leave my change as is for now, since it is consistent,
and to change them all together at a later date.

https://lore.kernel.org/linux-arm-kernel/123a58b0-2ea6-4da3-9719-98ca55c80...@arm.com/
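
(For illustration of what the split enables; this is not code from the series
and the helper names are assumptions: a caller clearing the young bit over a
contiguous range can batch the per-pte work and skip the trailing DSB entirely
by using the _nosync variant.)

static int clear_young_batch_sketch(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    unsigned int nr)
{
	unsigned long start = addr;
	int young = 0;
	unsigned int i;

	for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE)
		young |= ptep_test_and_clear_young(vma, addr, ptep);

	/* Issue the TLBIs for the whole range, but no trailing dsb(ish). */
	__flush_tlb_range_nosync(vma, start, addr, PAGE_SIZE, true, 3);

	return young;
}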



> 
> Maybe worth spelling this out in the patch description
> 
> Reviewed-by: David Hildenbrand 
> 

Thanks!




Re: [PATCH v5 19/25] arm64/mm: Wire up PTE_CONT for user mappings

2024-02-12 Thread Ryan Roberts
On 12/02/2024 12:00, Mark Rutland wrote:
> Hi Ryan,
> 
> Overall this looks pretty good; I have a bunch of minor comments below, and a
> bigger question on the way ptep_get_lockless() works.

OK great - thanks for the review. Let's see if I can answer them all...

> 
> On Fri, Feb 02, 2024 at 08:07:50AM +, Ryan Roberts wrote:
>> With the ptep API sufficiently refactored, we can now introduce a new
>> "contpte" API layer, which transparently manages the PTE_CONT bit for
>> user mappings.
>>
>> In this initial implementation, only suitable batches of PTEs, set via
>> set_ptes(), are mapped with the PTE_CONT bit. Any subsequent
>> modification of individual PTEs will cause an "unfold" operation to
>> repaint the contpte block as individual PTEs before performing the
>> requested operation. While a modification of a single PTE could cause
>> the block of PTEs to which it belongs to become eligible for "folding"
>> into a contpte entry, "folding" is not performed in this initial
>> implementation due to the costs of checking the requirements are met.
>> Due to this, contpte mappings will degrade back to normal pte mappings
>> over time if/when protections are changed. This will be solved in a
>> future patch.
>>
>> Since a contpte block only has a single access and dirty bit, the
>> semantic here changes slightly; when getting a pte (e.g. ptep_get())
>> that is part of a contpte mapping, the access and dirty information are
>> pulled from the block (so all ptes in the block return the same
>> access/dirty info). When changing the access/dirty info on a pte (e.g.
>> ptep_set_access_flags()) that is part of a contpte mapping, this change
>> will affect the whole contpte block. This works fine in practice
>> since we guarantee that only a single folio is mapped by a contpte
>> block, and the core-mm tracks access/dirty information per folio.
>>
>> In order for the public functions, which used to be pure inline, to
>> continue to be callable by modules, export all the contpte_* symbols
>> that are now called by those public inline functions.
>>
>> The feature is enabled/disabled with the ARM64_CONTPTE Kconfig parameter
>> at build time. It defaults to enabled as long as its dependency,
>> TRANSPARENT_HUGEPAGE, is also enabled. The core-mm depends upon
>> TRANSPARENT_HUGEPAGE to be able to allocate large folios, so if it's not
>> enabled, then there is no chance of meeting the physical contiguity
>> requirement for contpte mappings.
>>
>> Tested-by: John Hubbard 
>> Signed-off-by: Ryan Roberts 
>> ---
>>  arch/arm64/Kconfig   |   9 +
>>  arch/arm64/include/asm/pgtable.h | 161 ++
>>  arch/arm64/mm/Makefile   |   1 +
>>  arch/arm64/mm/contpte.c  | 283 +++
>>  4 files changed, 454 insertions(+)
>>  create mode 100644 arch/arm64/mm/contpte.c
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index d86d7f4758b5..1442e8ed95b6 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -2230,6 +2230,15 @@ config UNWIND_PATCH_PAC_INTO_SCS
>>  select UNWIND_TABLES
>>  select DYNAMIC_SCS
>>  
>> +config ARM64_CONTPTE
>> +bool "Contiguous PTE mappings for user memory" if EXPERT
>> +depends on TRANSPARENT_HUGEPAGE
>> +default y
>> +help
>> +  When enabled, user mappings are configured using the PTE contiguous
>> +  bit, for any mappings that meet the size and alignment requirements.
>> +  This reduces TLB pressure and improves performance.
>> +
>>  endmenu # "Kernel Features"
>>  
>>  menu "Boot options"
>> diff --git a/arch/arm64/include/asm/pgtable.h 
>> b/arch/arm64/include/asm/pgtable.h
>> index 7dc6b68ee516..34892a95403d 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -133,6 +133,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t 
>> phys)
>>   */
>>  #define pte_valid_not_user(pte) \
>>  ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | 
>> PTE_UXN))
>> +/*
>> + * Returns true if the pte is valid and has the contiguous bit set.
>> + */
>> +#define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte))
>>  /*
>>   * Could the pte be present in the TLB? We must check mm_tlb_flush_pending
>>   * so that we don't erroneously return false for pages that have been
>> @@ -1135,6 +1139,161 @@ void vmemmap_update_pte(unsigned long addr, pte_t 
>> *ptep, pte_t pte);
>>  #define vmemmap_update_pte vmemmap_update_pte
>>  #endif
>>  
>> +#ifdef CONFIG_ARM64_CONTPTE
>> +
>> +/*
>> + * The contpte APIs are used to transparently manage the contiguous bit in 
>> ptes
>> + * where it is possible and makes sense to do so. The PTE_CONT bit is 
>> considered
>> + * a private implementation detail of the public ptep API (see below).
>> + */
>> +extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
>> +pte_t *ptep, pte_t pte);
>> +extern pte_t contpte_ptep_get(pte_t *ptep, pte_t 

Re: [PATCH v5 18/25] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

2024-02-12 Thread David Hildenbrand

On 02.02.24 09:07, Ryan Roberts wrote:

Split __flush_tlb_range() into __flush_tlb_range_nosync() +
__flush_tlb_range(), in the same way as the existing flush_tlb_page()
arrangement. This allows calling __flush_tlb_range_nosync() to elide the
trailing DSB. Forthcoming "contpte" code will take advantage of this
when clearing the young bit from a contiguous range of ptes.

Tested-by: John Hubbard 
Signed-off-by: Ryan Roberts 
---
  arch/arm64/include/asm/tlbflush.h | 13 +++--
  1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/tlbflush.h 
b/arch/arm64/include/asm/tlbflush.h
index 79e932a1bdf8..50a765917327 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -422,7 +422,7 @@ do {
\
  #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, 
kvm_lpa2_is_enabled());
  
-static inline void __flush_tlb_range(struct vm_area_struct *vma,

+static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
 unsigned long start, unsigned long end,
 unsigned long stride, bool last_level,
 int tlb_level)
@@ -456,10 +456,19 @@ static inline void __flush_tlb_range(struct 
vm_area_struct *vma,
__flush_tlb_range_op(vae1is, start, pages, stride, asid,
 tlb_level, true, lpa2_is_enabled());
  
-	dsb(ish);

mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
  }
  
+static inline void __flush_tlb_range(struct vm_area_struct *vma,

+unsigned long start, unsigned long end,
+unsigned long stride, bool last_level,
+int tlb_level)
+{
+   __flush_tlb_range_nosync(vma, start, end, stride,
+last_level, tlb_level);
+   dsb(ish);
+}
+
  static inline void flush_tlb_range(struct vm_area_struct *vma,
   unsigned long start, unsigned long end)
  {


You're now calling dsb() after 
mmu_notifier_arch_invalidate_secondary_tlbs().



In flush_tlb_mm(), we have the order

dsb(ish);   
mmu_notifier_arch_invalidate_secondary_tlbs()

In flush_tlb_page(), we have the effective order:

mmu_notifier_arch_invalidate_secondary_tlbs()
dsb(ish);

In flush_tlb_range(), we used to have the order:

dsb(ish);
mmu_notifier_arch_invalidate_secondary_tlbs();


So I *suspect* having that DSB before 
mmu_notifier_arch_invalidate_secondary_tlbs() is fine. Hopefully, 
nothing in there relies on that placement.


Maybe worth spelling this out in the patch description

Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb



Re: [PATCH v3 RESEND 3/6] bitmap: Make bitmap_onto() available to users

2024-02-12 Thread Andy Shevchenko
On Mon, Feb 12, 2024 at 08:56:31AM +0100, Herve Codina wrote:
> Currently, bitmap_onto() is available only in the CONFIG_NUMA=y case,
> while some users may benefit from it independently of the NUMA
> code.
> 
> Make it available to such users by moving it out of the ifdeffery and
> exporting it for modules.

Wondering if you are trying to have something like
https://lore.kernel.org/lkml/20230926052007.3917389-1-andriy.shevche...@linux.intel.com/

-- 
With Best Regards,
Andy Shevchenko




Re: [PATCH v3 RESEND 1/6] net: wan: Add support for QMC HDLC

2024-02-12 Thread Andy Shevchenko
On Mon, Feb 12, 2024 at 08:56:29AM +0100, Herve Codina wrote:
> The QMC HDLC driver provides support for HDLC using the QMC (QUICC
> Multichannel Controller) to transfer the HDLC data.

...

> +#include 
> +#include 
> +#include 

> +#include 
> +#include 

I do not see how these are being used, am I right?
What is missing OTOH is the mod_devicetable.h.

> +#include 
> +#include 

+ Blank line?

> +#include 

-- 
With Best Regards,
Andy Shevchenko




Re: [PATCH v5 03/25] mm: Make pte_next_pfn() a wrapper around pte_advance_pfn()

2024-02-12 Thread David Hildenbrand

On 02.02.24 09:07, Ryan Roberts wrote:

The goal is to be able to advance a PTE by an arbitrary number of PFNs.
So introduce a new API that takes a nr param.

We are going to remove pte_next_pfn() and replace it with
pte_advance_pfn(). As a first step, implement pte_next_pfn() as a
wrapper around pte_advance_pfn() so that we can incrementally switch the
architectures over. Once all arches are moved over, we will change all
the core-mm callers to call pte_advance_pfn() directly and remove the
wrapper.

Signed-off-by: Ryan Roberts 
---
  include/linux/pgtable.h | 8 +++-
  1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5e7eaf8f2b97..815d92dcb96b 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -214,9 +214,15 @@ static inline int pmd_dirty(pmd_t pmd)
  
  
  #ifndef pte_next_pfn

+#ifndef pte_advance_pfn
+static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
+{
+   return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
+}
+#endif
  static inline pte_t pte_next_pfn(pte_t pte)
  {
-   return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+   return pte_advance_pfn(pte, 1);
  }
  #endif
  


I do wonder if we simply want to leave pte_next_pfn() around? Especially
patches #4 and #6 don't really benefit from the change, and neither do the
other set_ptes() implementations.


That is, only convert all pte_next_pfn()->pte_advance_pfn(), and leave a
pte_next_pfn() macro in place.

Any downsides to that? This patch here would become:

#ifndef pte_advance_pfn
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
}
#endif

#ifndef pte_next_pfn
#define pte_next_pfn(pte) pte_advance_pfn(pte, 1)
#endif

As you convert the three arches, make them define pte_advance_pfn and 
undefine pte_next_pfn. In the end, you can drop the #ifdef around 
pte_next_pfn here.
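
For reference, a minimal sketch of the kind of core-mm use that motivates the
nr parameter (illustrative only; real batching code would also normalize the
access/dirty bits before comparing pte values):

static inline bool ptes_consecutive_sketch(pte_t *start_ptep, pte_t first_pte,
					   unsigned int nr)
{
	/* pte value we would expect nr pages further into the range */
	pte_t expected = pte_advance_pfn(first_pte, nr);

	return pte_same(ptep_get(start_ptep + nr), expected);
}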


--
Cheers,

David / dhildenb



Re: [PATCH v5 01/25] mm: Clarify the spec for set_ptes()

2024-02-12 Thread David Hildenbrand

On 02.02.24 09:07, Ryan Roberts wrote:

set_ptes() spec implies that it can only be used to set a present pte
because it interprets the PFN field to increment it. However,
set_pte_at() has been implemented on top of set_ptes() since set_ptes()
was introduced, and set_pte_at() allows setting a pte to a not-present
state. So clarify the spec to state that when nr==1, new state of pte
may be present or not present. When nr>1, new state of all ptes must be
present.

While we are at it, tighten the spec to set requirements around the
initial state of ptes; when nr==1 it may be either present or
not-present. But when nr>1 all ptes must initially be not-present. All
set_ptes() callsites already conform to this requirement. Stating it
explicitly is useful because it allows for a simplification to the
upcoming arm64 contpte implementation.

Signed-off-by: Ryan Roberts 
---
  include/linux/pgtable.h | 4 
  1 file changed, 4 insertions(+)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index f0feae7f89fb..5e7eaf8f2b97 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -229,6 +229,10 @@ static inline pte_t pte_next_pfn(pte_t pte)
   * @pte: Page table entry for the first page.
   * @nr: Number of pages to map.
   *
+ * When nr==1, initial state of pte may be present or not present, and new 
state
+ * may be present or not present. When nr>1, initial state of all ptes must be
+ * not present, and new state must be present.
+ *
   * May be overridden by the architecture, or the architecture can define
   * set_pte() and PFN_PTE_SHIFT.
   *
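
(Purely as an illustration of a call that satisfies the clarified rules, not
code from the series: mapping all pages of a large folio into a range of ptes
that are currently none.)

static void map_folio_sketch(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, struct folio *folio)
{
	unsigned int nr = folio_nr_pages(folio);
	pte_t pte = mk_pte(&folio->page, vma->vm_page_prot);

	/* nr > 1: all old ptes are not present, all new ptes are present */
	set_ptes(vma->vm_mm, addr, ptep, pte, nr);
}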


Acked-by: David Hildenbrand 

--
Cheers,

David / dhildenb



Re: [PATCH v5 19/25] arm64/mm: Wire up PTE_CONT for user mappings

2024-02-12 Thread Mark Rutland
Hi Ryan,

Overall this looks pretty good; I have a bunch of minor comments below, and a
bigger question on the way ptep_get_lockless() works.

On Fri, Feb 02, 2024 at 08:07:50AM +, Ryan Roberts wrote:
> With the ptep API sufficiently refactored, we can now introduce a new
> "contpte" API layer, which transparently manages the PTE_CONT bit for
> user mappings.
> 
> In this initial implementation, only suitable batches of PTEs, set via
> set_ptes(), are mapped with the PTE_CONT bit. Any subsequent
> modification of individual PTEs will cause an "unfold" operation to
> repaint the contpte block as individual PTEs before performing the
> requested operation. While, a modification of a single PTE could cause
> the block of PTEs to which it belongs to become eligible for "folding"
> into a contpte entry, "folding" is not performed in this initial
> implementation due to the costs of checking the requirements are met.
> Due to this, contpte mappings will degrade back to normal pte mappings
> over time if/when protections are changed. This will be solved in a
> future patch.
> 
> Since a contpte block only has a single access and dirty bit, the
> semantic here changes slightly; when getting a pte (e.g. ptep_get())
> that is part of a contpte mapping, the access and dirty information are
> pulled from the block (so all ptes in the block return the same
> access/dirty info). When changing the access/dirty info on a pte (e.g.
> ptep_set_access_flags()) that is part of a contpte mapping, this change
> will affect the whole contpte block. This works fine in practice
> since we guarantee that only a single folio is mapped by a contpte
> block, and the core-mm tracks access/dirty information per folio.
> 
> In order for the public functions, which used to be pure inline, to
> continue to be callable by modules, export all the contpte_* symbols
> that are now called by those public inline functions.
> 
> The feature is enabled/disabled with the ARM64_CONTPTE Kconfig parameter
> at build time. It defaults to enabled as long as its dependency,
> TRANSPARENT_HUGEPAGE, is also enabled. The core-mm depends upon
> TRANSPARENT_HUGEPAGE to be able to allocate large folios, so if it's not
> enabled, then there is no chance of meeting the physical contiguity
> requirement for contpte mappings.
> 
> Tested-by: John Hubbard 
> Signed-off-by: Ryan Roberts 
> ---
>  arch/arm64/Kconfig   |   9 +
>  arch/arm64/include/asm/pgtable.h | 161 ++
>  arch/arm64/mm/Makefile   |   1 +
>  arch/arm64/mm/contpte.c  | 283 +++
>  4 files changed, 454 insertions(+)
>  create mode 100644 arch/arm64/mm/contpte.c
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index d86d7f4758b5..1442e8ed95b6 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -2230,6 +2230,15 @@ config UNWIND_PATCH_PAC_INTO_SCS
>   select UNWIND_TABLES
>   select DYNAMIC_SCS
>  
> +config ARM64_CONTPTE
> + bool "Contiguous PTE mappings for user memory" if EXPERT
> + depends on TRANSPARENT_HUGEPAGE
> + default y
> + help
> +   When enabled, user mappings are configured using the PTE contiguous
> +   bit, for any mappings that meet the size and alignment requirements.
> +   This reduces TLB pressure and improves performance.
> +
>  endmenu # "Kernel Features"
>  
>  menu "Boot options"
> diff --git a/arch/arm64/include/asm/pgtable.h 
> b/arch/arm64/include/asm/pgtable.h
> index 7dc6b68ee516..34892a95403d 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -133,6 +133,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t 
> phys)
>   */
>  #define pte_valid_not_user(pte) \
>   ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | 
> PTE_UXN))
> +/*
> + * Returns true if the pte is valid and has the contiguous bit set.
> + */
> +#define pte_valid_cont(pte)  (pte_valid(pte) && pte_cont(pte))
>  /*
>   * Could the pte be present in the TLB? We must check mm_tlb_flush_pending
>   * so that we don't erroneously return false for pages that have been
> @@ -1135,6 +1139,161 @@ void vmemmap_update_pte(unsigned long addr, pte_t 
> *ptep, pte_t pte);
>  #define vmemmap_update_pte vmemmap_update_pte
>  #endif
>  
> +#ifdef CONFIG_ARM64_CONTPTE
> +
> +/*
> + * The contpte APIs are used to transparently manage the contiguous bit in 
> ptes
> + * where it is possible and makes sense to do so. The PTE_CONT bit is 
> considered
> + * a private implementation detail of the public ptep API (see below).
> + */
> +extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep, pte_t pte);
> +extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte);
> +extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep);
> +extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep, pte_t pte, unsigned int nr);
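
(The quoted declarations are truncated here. For orientation, the public
wrappers have roughly the following shape; this is a simplified sketch, with
__ptep_get() standing in for the underlying arch accessor.)

static inline pte_t ptep_get(pte_t *ptep)
{
	pte_t pte = __ptep_get(ptep);

	/* fast path: not a contpte mapping, nothing extra to gather */
	if (likely(!pte_valid_cont(pte)))
		return pte;

	/* slow path: collect access/dirty bits from the whole contpte block */
	return contpte_ptep_get(ptep, pte);
}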

Re: [PATCH v15 1/5] crash: forward memory_notify arg to arch crash hotplug handler

2024-02-12 Thread Sourabh Jain




On 05/02/24 08:41, Baoquan He wrote:

On 01/11/24 at 04:21pm, Sourabh Jain wrote:

In the event of memory hotplug or online/offline events, the crash
memory hotplug notifier `crash_memhp_notifier()` receives a
`memory_notify` object but doesn't forward that object to the
generic and architecture-specific crash hotplug handler.

The `memory_notify` object contains the starting PFN (Page Frame Number)
and the number of pages in the hot-removed memory. This information is
necessary for architectures like PowerPC to update/recreate the kdump
image, specifically `elfcorehdr`.

So update the function signature of `crash_handle_hotplug_event()` and
`arch_crash_handle_hotplug_event()` to accept the `memory_notify` object
as an argument from crash memory hotplug notifier.

Since no such object is available in the case of CPU hotplug event, the
crash CPU hotplug notifier `crash_cpuhp_online()` passes NULL to the
crash hotplug handler.


..

---
  arch/x86/include/asm/kexec.h |  2 +-
  arch/x86/kernel/crash.c  |  3 ++-
  include/linux/kexec.h|  2 +-
  kernel/crash_core.c  | 14 +++---
  4 files changed, 11 insertions(+), 10 deletions(-)

LGTM,

Acked-by: Baoquan He 


Thanks Baoquan He

- Sourabh




diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index c9f6a6c5de3c..9bb6607e864e 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -208,7 +208,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage 
*image);
  extern void kdump_nmi_shootdown_cpus(void);
  
  #ifdef CONFIG_CRASH_HOTPLUG

-void arch_crash_handle_hotplug_event(struct kimage *image);
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg);
  #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
  
  #ifdef CONFIG_HOTPLUG_CPU

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index b6b044356f1b..44744e9c68ec 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -428,10 +428,11 @@ unsigned int arch_crash_get_elfcorehdr_size(void)
  /**
   * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
   * @image: a pointer to kexec_crash_image
+ * @arg: struct memory_notify handler for memory hotplug case and NULL for CPU 
hotplug case.
   *
   * Prepare the new elfcorehdr and replace the existing elfcorehdr.
   */
-void arch_crash_handle_hotplug_event(struct kimage *image)
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
  {
void *elfbuf = NULL, *old_elfcorehdr;
unsigned long nr_mem_ranges;
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 400cb6c02176..802052d9c64b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -483,7 +483,7 @@ static inline void arch_kexec_pre_free_pages(void *vaddr, 
unsigned int pages) {
  #endif
  
  #ifndef arch_crash_handle_hotplug_event

-static inline void arch_crash_handle_hotplug_event(struct kimage *image) { }
+static inline void arch_crash_handle_hotplug_event(struct kimage *image, void 
*arg) { }
  #endif
  
  int crash_check_update_elfcorehdr(void);

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index d48315667752..ab1c8e79759d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -914,7 +914,7 @@ int crash_check_update_elfcorehdr(void)
   * list of segments it checks (since the elfcorehdr changes and thus
   * would require an update to purgatory itself to update the digest).
   */
-static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int 
cpu)
+static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int 
cpu, void *arg)
  {
struct kimage *image;
  
@@ -976,7 +976,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)

image->hp_action = hp_action;
  
  	/* Now invoke arch-specific update handler */

-   arch_crash_handle_hotplug_event(image);
+   arch_crash_handle_hotplug_event(image, arg);
  
  	/* No longer handling a hotplug event */

image->hp_action = KEXEC_CRASH_HP_NONE;
@@ -992,17 +992,17 @@ static void crash_handle_hotplug_event(unsigned int 
hp_action, unsigned int cpu)
crash_hotplug_unlock();
  }
  
-static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)

+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, 
void *arg)
  {
switch (val) {
case MEM_ONLINE:
crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
-   KEXEC_CRASH_HP_INVALID_CPU);
+   KEXEC_CRASH_HP_INVALID_CPU, arg);
break;
  
  	case MEM_OFFLINE:

crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY,
-   KEXEC_CRASH_HP_INVALID_CPU);
+   KEXEC_CRASH_HP_INVALID_CPU, arg);
break;
}
return NOTIFY_OK;
@@ -1015,13 +1015,13 @@ static struct notifier_block crash_memhp_nb = {
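
(A hypothetical sketch of how an architecture handler could consume the
forwarded argument; the body below is illustrative and not taken from the x86
or powerpc patches.)

void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
{
	struct memory_notify *mn = arg;	/* NULL for CPU hotplug events */

	if (mn) {
		phys_addr_t base = PFN_PHYS(mn->start_pfn);
		unsigned long size = mn->nr_pages * PAGE_SIZE;

		/* update/recreate the elfcorehdr to cover [base, base + size) */
	}
}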
  

Re: [PATCH v2 09/10] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing

2024-02-12 Thread David Hildenbrand

On 12.02.24 12:21, Ryan Roberts wrote:

On 12/02/2024 11:05, David Hildenbrand wrote:

On 12.02.24 11:56, David Hildenbrand wrote:

On 12.02.24 11:32, Ryan Roberts wrote:

On 12/02/2024 10:11, David Hildenbrand wrote:

Hi Ryan,


-static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
     {
-    struct mmu_gather_batch *batch;
-
-    for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
-    struct encoded_page **pages = batch->encoded_pages;
+    struct encoded_page **pages = batch->encoded_pages;
+    unsigned int nr, nr_pages;
     +    /*
+ * We might end up freeing a lot of pages. Reschedule on a regular
+ * basis to avoid soft lockups in configurations without full
+ * preemption enabled. The magic number of 512 folios seems to work.
+ */
+    if (!page_poisoning_enabled_static() && !want_init_on_free()) {


Is the performance win really worth 2 separate implementations keyed off this?
It seems a bit fragile, in case any other operations get added to free
which are
proportional to size in future. Why not just always do the conservative
version?


I really don't want to iterate over all entries on the "sane" common case. We
already do that two times:

a) free_pages_and_swap_cache()

b) release_pages()

Only the latter really is required, and I'm planning on removing the one in (a)
to move it into (b) as well.

So I keep it separate to keep any unnecessary overhead to the setups that are
already terribly slow.

No need to iterate a page full of entries if it can be easily avoided.
Especially, no need to degrade the common order-0 case.


Yeah, I understand all that. But given this is all coming from an array, (so
easy to prefetch?) and will presumably all fit in the cache for the common case,
at least, so it's hot for (a) and (b), does separating this out really make a
measurable performance difference? If yes then absolutely this optimization
makes sense. But if not, I think it's a bit questionable.


I primarily added it because

(a) we learned that each cycle counts during mmap() just like it does
during fork().

(b) Linus was similarly concerned about optimizing out another batching
walk in c47454823bd4 ("mm: mmu_gather: allow more than one batch of
delayed rmaps"):

"it needs to walk that array of pages while still holding the page table
lock, and our mmu_gather infrastructure allows for batching quite a lot
of pages.  We may have thousands on pages queued up for freeing, and we
wanted to walk only the last batch if we then added a dirty page to the
queue."

So if it matters enough for reducing the time we hold the page table
lock, it surely adds "some" overhead in general.




You're the boss though, so if your experience tells you this is necessary, then
I'm ok with that.


I did not do any measurements myself, I just did that intuitively as
above. After all, it's all pretty straight forward (keeping the existing
logic, we need a new one either way) and not that much code.

So unless there are strong opinions, I'd just leave the common case as
it was, and the odd case be special.


I think we can just reduce the code duplication easily:

diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index d175c0f1e2c8..99b3e9408aa0 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -91,18 +91,21 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct
vm_area_struct *vma)
  }
  #endif
  
-static void tlb_batch_pages_flush(struct mmu_gather *tlb)

-{
-    struct mmu_gather_batch *batch;
+/*
+ * We might end up freeing a lot of pages. Reschedule on a regular
+ * basis to avoid soft lockups in configurations without full
+ * preemption enabled. The magic number of 512 folios seems to work.
+ */
+#define MAX_NR_FOLIOS_PER_FREE    512
  
-    for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {

-    struct encoded_page **pages = batch->encoded_pages;
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
+{
+    struct encoded_page **pages = batch->encoded_pages;
+    unsigned int nr, nr_pages;
  
-    while (batch->nr) {

-    /*
- * limit free batch count when PAGE_SIZE > 4K
- */
-    unsigned int nr = min(512U, batch->nr);
+    while (batch->nr) {
+    if (!page_poisoning_enabled_static() && !want_init_on_free()) {
+    nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);
  
  /*

   * Make sure we cover page + nr_pages, and don't leave
@@ -111,14 +114,39 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
  if (unlikely(encoded_page_flags(pages[nr - 1]) &
   ENCODED_PAGE_BIT_NR_PAGES_NEXT))
  nr++;
+    } else {
+    /*
+ * With page poisoning and init_on_free, the time it
+ * takes to free memory grows proportionally with the
+ * actual memory size. Therefore, limit based on the
+
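
(The quoted diff is cut off above. For readability, a reconstructed sketch of
the resulting helper, based on the hunks quoted in this thread; details may
differ from the final patch.)

/*
 * We might end up freeing a lot of pages. Reschedule on a regular
 * basis to avoid soft lockups in configurations without full
 * preemption enabled.
 */
#define MAX_NR_FOLIOS_PER_FREE		512

static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
	struct encoded_page **pages = batch->encoded_pages;
	unsigned int nr, nr_pages;

	while (batch->nr) {
		if (!page_poisoning_enabled_static() && !want_init_on_free()) {
			/* Limit by folio fragments in the common, fast case. */
			nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);

			/* Don't split a page + nr_pages pair. */
			if (unlikely(encoded_page_flags(pages[nr - 1]) &
				     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
				nr++;
		} else {
			/*
			 * Freeing cost grows with the actual memory size, so
			 * limit by pages rather than folio fragments here.
			 */
			for (nr = 0, nr_pages = 0;
			     nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
			     nr++) {
				if (unlikely(encoded_page_flags(pages[nr]) &
					     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
					nr_pages += encoded_nr_pages(pages[++nr]);
				else
					nr_pages++;
			}
		}

		free_pages_and_swap_cache(pages, nr);
		pages += nr;
		batch->nr -= nr;

		cond_resched();
	}
}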

[PATCH] soc: fsl: dpio: fix kcalloc() argument order

2024-02-12 Thread Arnd Bergmann
From: Arnd Bergmann 

A previous bugfix added a call to kcalloc(), which starting in gcc-14
causes a harmless warning about the argument order:

drivers/soc/fsl/dpio/dpio-service.c: In function 
'dpaa2_io_service_enqueue_multiple_desc_fq':
drivers/soc/fsl/dpio/dpio-service.c:526:29: error: 'kcalloc' sizes specified 
with 'sizeof' in the earlier argument and not in the later argument 
[-Werror=calloc-transposed-args]
  526 | ed = kcalloc(sizeof(struct qbman_eq_desc), 32, GFP_KERNEL);
  | ^~
drivers/soc/fsl/dpio/dpio-service.c:526:29: note: earlier argument should 
specify number of elements, later size of each element

Since the two are only multiplied, the order does not change the
behavior, so just fix it now to shut up the compiler warning.

Fixes: 5c4a5999b245 ("soc: fsl: dpio: avoid stack usage warning")
Signed-off-by: Arnd Bergmann 
---
 drivers/soc/fsl/dpio/dpio-service.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/dpio/dpio-service.c 
b/drivers/soc/fsl/dpio/dpio-service.c
index 1d2b27e3ea63..b811446e0fa5 100644
--- a/drivers/soc/fsl/dpio/dpio-service.c
+++ b/drivers/soc/fsl/dpio/dpio-service.c
@@ -523,7 +523,7 @@ int dpaa2_io_service_enqueue_multiple_desc_fq(struct 
dpaa2_io *d,
struct qbman_eq_desc *ed;
int i, ret;
 
-   ed = kcalloc(sizeof(struct qbman_eq_desc), 32, GFP_KERNEL);
+   ed = kcalloc(32, sizeof(struct qbman_eq_desc), GFP_KERNEL);
if (!ed)
return -ENOMEM;
 
-- 
2.39.2



Re: [PATCH v2 09/10] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing

2024-02-12 Thread Ryan Roberts
On 12/02/2024 11:05, David Hildenbrand wrote:
> On 12.02.24 11:56, David Hildenbrand wrote:
>> On 12.02.24 11:32, Ryan Roberts wrote:
>>> On 12/02/2024 10:11, David Hildenbrand wrote:
 Hi Ryan,

>> -static void tlb_batch_pages_flush(struct mmu_gather *tlb)
>> +static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch 
>> *batch)
>>     {
>> -    struct mmu_gather_batch *batch;
>> -
>> -    for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
>> -    struct encoded_page **pages = batch->encoded_pages;
>> +    struct encoded_page **pages = batch->encoded_pages;
>> +    unsigned int nr, nr_pages;
>>     +    /*
>> + * We might end up freeing a lot of pages. Reschedule on a regular
>> + * basis to avoid soft lockups in configurations without full
>> + * preemption enabled. The magic number of 512 folios seems to work.
>> + */
>> +    if (!page_poisoning_enabled_static() && !want_init_on_free()) {
>
> Is the performance win really worth 2 separate implementations keyed off 
> this?
> It seems a bit fragile, in case any other operations get added to free
> which are
> proportional to size in future. Why not just always do the conservative
> version?

 I really don't want to iterate over all entries on the "sane" common case. 
 We
 already do that two times:

 a) free_pages_and_swap_cache()

 b) release_pages()

 Only the latter really is required, and I'm planning on removing the one 
 in (a)
 to move it into (b) as well.

 So I keep it separate to keep any unnecessary overhead to the setups that 
 are
 already terribly slow.

 No need to iterate a page full of entries if it can be easily avoided.
 Especially, no need to degrade the common order-0 case.
>>>
>>> Yeah, I understand all that. But given this is all coming from an array, (so
>>> easy to prefetch?) and will presumably all fit in the cache for the common 
>>> case,
>>> at least, so it's hot for (a) and (b), does separating this out really make a
>>> measurable performance difference? If yes then absolutely this optimization
>>> makes sense. But if not, I think it's a bit questionable.
>>
>> I primarily added it because
>>
>> (a) we learned that each cycle counts during mmap() just like it does
>> during fork().
>>
>> (b) Linus was similarly concerned about optimizing out another batching
>> walk in c47454823bd4 ("mm: mmu_gather: allow more than one batch of
>> delayed rmaps"):
>>
>> "it needs to walk that array of pages while still holding the page table
>> lock, and our mmu_gather infrastructure allows for batching quite a lot
>> of pages.  We may have thousands on pages queued up for freeing, and we
>> wanted to walk only the last batch if we then added a dirty page to the
>> queue."
>>
>> So if it matters enough for reducing the time we hold the page table
>> lock, it surely adds "some" overhead in general.
>>
>>
>>>
>>> You're the boss though, so if your experience tells you this is necessary, 
>>> then
>>> I'm ok with that.
>>
>> I did not do any measurements myself, I just did that intuitively as
>> above. After all, it's all pretty straight forward (keeping the existing
>> logic, we need a new one either way) and not that much code.
>>
>> So unless there are strong opinions, I'd just leave the common case as
>> it was, and the odd case be special.
> 
> I think we can just reduce the code duplication easily:
> 
> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index d175c0f1e2c8..99b3e9408aa0 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -91,18 +91,21 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct
> vm_area_struct *vma)
>  }
>  #endif
>  
> -static void tlb_batch_pages_flush(struct mmu_gather *tlb)
> -{
> -    struct mmu_gather_batch *batch;
> +/*
> + * We might end up freeing a lot of pages. Reschedule on a regular
> + * basis to avoid soft lockups in configurations without full
> + * preemption enabled. The magic number of 512 folios seems to work.
> + */
> +#define MAX_NR_FOLIOS_PER_FREE    512
>  
> -    for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
> -    struct encoded_page **pages = batch->encoded_pages;
> +static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
> +{
> +    struct encoded_page **pages = batch->encoded_pages;
> +    unsigned int nr, nr_pages;
>  
> -    while (batch->nr) {
> -    /*
> - * limit free batch count when PAGE_SIZE > 4K
> - */
> -    unsigned int nr = min(512U, batch->nr);
> +    while (batch->nr) {
> +    if (!page_poisoning_enabled_static() && !want_init_on_free()) {
> +    nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);
>  
>  /*
>   * Make sure we cover page + nr_pages, and don't leave
> @@ -111,14 +114,39 @@ static void tlb_batch_pages_flush(struct 

[PATCH] i2c: pasemi: split driver into two separate modules

2024-02-12 Thread Arnd Bergmann
From: Arnd Bergmann 

On powerpc, it is possible to compile test both the new apple (arm) and
old pasemi (powerpc) drivers for the i2c hardware at the same time,
which leads to a warning about linking the same object file twice:

scripts/Makefile.build:244: drivers/i2c/busses/Makefile: i2c-pasemi-core.o is 
added to multiple modules: i2c-apple i2c-pasemi

Rework the driver to have an explicit helper module, letting Kbuild
take care of whether this should be built-in or a loadable driver.

Fixes: 9bc5f4f660ff ("i2c: pasemi: Split pci driver to its own file")
Signed-off-by: Arnd Bergmann 
---
 drivers/i2c/busses/Makefile  | 6 ++
 drivers/i2c/busses/i2c-pasemi-core.c | 6 ++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index 3757b9391e60..aa0ee8ecd6f2 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -90,10 +90,8 @@ obj-$(CONFIG_I2C_NPCM)   += i2c-npcm7xx.o
 obj-$(CONFIG_I2C_OCORES)   += i2c-ocores.o
 obj-$(CONFIG_I2C_OMAP) += i2c-omap.o
 obj-$(CONFIG_I2C_OWL)  += i2c-owl.o
-i2c-pasemi-objs := i2c-pasemi-core.o i2c-pasemi-pci.o
-obj-$(CONFIG_I2C_PASEMI)   += i2c-pasemi.o
-i2c-apple-objs := i2c-pasemi-core.o i2c-pasemi-platform.o
-obj-$(CONFIG_I2C_APPLE)+= i2c-apple.o
+obj-$(CONFIG_I2C_PASEMI)   += i2c-pasemi-core.o i2c-pasemi-pci.o
+obj-$(CONFIG_I2C_APPLE)+= i2c-pasemi-core.o 
i2c-pasemi-platform.o
 obj-$(CONFIG_I2C_PCA_PLATFORM) += i2c-pca-platform.o
 obj-$(CONFIG_I2C_PNX)  += i2c-pnx.o
 obj-$(CONFIG_I2C_PXA)  += i2c-pxa.o
diff --git a/drivers/i2c/busses/i2c-pasemi-core.c 
b/drivers/i2c/busses/i2c-pasemi-core.c
index 7d54a9f34c74..bd8becbdeeb2 100644
--- a/drivers/i2c/busses/i2c-pasemi-core.c
+++ b/drivers/i2c/busses/i2c-pasemi-core.c
@@ -369,6 +369,7 @@ int pasemi_i2c_common_probe(struct pasemi_smbus *smbus)
 
return 0;
 }
+EXPORT_SYMBOL_GPL(pasemi_i2c_common_probe);
 
 irqreturn_t pasemi_irq_handler(int irq, void *dev_id)
 {
@@ -378,3 +379,8 @@ irqreturn_t pasemi_irq_handler(int irq, void *dev_id)
	complete(&smbus->irq_completion);
return IRQ_HANDLED;
 }
+EXPORT_SYMBOL_GPL(pasemi_irq_handler);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Olof Johansson ");
+MODULE_DESCRIPTION("PA Semi PWRficient SMBus driver");
-- 
2.39.2



Re: [PATCH v2 09/10] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing

2024-02-12 Thread David Hildenbrand

On 12.02.24 11:56, David Hildenbrand wrote:

On 12.02.24 11:32, Ryan Roberts wrote:

On 12/02/2024 10:11, David Hildenbrand wrote:

Hi Ryan,


-static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
    {
-    struct mmu_gather_batch *batch;
-
-    for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
-    struct encoded_page **pages = batch->encoded_pages;
+    struct encoded_page **pages = batch->encoded_pages;
+    unsigned int nr, nr_pages;
    +    /*
+ * We might end up freeing a lot of pages. Reschedule on a regular
+ * basis to avoid soft lockups in configurations without full
+ * preemption enabled. The magic number of 512 folios seems to work.
+ */
+    if (!page_poisoning_enabled_static() && !want_init_on_free()) {


Is the performance win really worth 2 separate implementations keyed off this?
It seems a bit fragile, in case any other operations get added to free which are
proportional to size in future. Why not just always do the conservative version?


I really don't want to iterate over all entries on the "sane" common case. We
already do that two times:

a) free_pages_and_swap_cache()

b) release_pages()

Only the latter really is required, and I'm planning on removing the one in (a)
to move it into (b) as well.

So I keep it separate to keep any unnecessary overhead to the setups that are
already terribly slow.

No need to iterate a page full of entries if it can be easily avoided.
Especially, no need to degrade the common order-0 case.


Yeah, I understand all that. But given this is all coming from an array, (so
easy to prefetch?) and will presumably all fit in the cache for the common case,
at least, so it's hot for (a) and (b), does separating this out really make a
measurable performance difference? If yes then absolutely this optimization
makes sense. But if not, I think it's a bit questionable.


I primarily added it because

(a) we learned that each cycle counts during mmap() just like it does
during fork().

(b) Linus was similarly concerned about optimizing out another batching
walk in c47454823bd4 ("mm: mmu_gather: allow more than one batch of
delayed rmaps"):

"it needs to walk that array of pages while still holding the page table
lock, and our mmu_gather infrastructure allows for batching quite a lot
of pages.  We may have thousands on pages queued up for freeing, and we
wanted to walk only the last batch if we then added a dirty page to the
queue."

So if it matters enough for reducing the time we hold the page table
lock, it surely adds "some" overhead in general.




You're the boss though, so if your experience tells you this is necessary, then
I'm ok with that.


I did not do any measurements myself, I just did that intuitively as
above. After all, it's all pretty straight forward (keeping the existing
logic, we need a new one either way) and not that much code.

So unless there are strong opinions, I'd just leave the common case as
it was, and the odd case be special.


I think we can just reduce the code duplication easily:

diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index d175c0f1e2c8..99b3e9408aa0 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -91,18 +91,21 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct 
vm_area_struct *vma)
 }
 #endif
 
-static void tlb_batch_pages_flush(struct mmu_gather *tlb)

-{
-   struct mmu_gather_batch *batch;
+/*
+ * We might end up freeing a lot of pages. Reschedule on a regular
+ * basis to avoid soft lockups in configurations without full
+ * preemption enabled. The magic number of 512 folios seems to work.
+ */
+#define MAX_NR_FOLIOS_PER_FREE 512
 
-	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {

-   struct encoded_page **pages = batch->encoded_pages;
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
+{
+   struct encoded_page **pages = batch->encoded_pages;
+   unsigned int nr, nr_pages;
 
-		while (batch->nr) {

-   /*
-* limit free batch count when PAGE_SIZE > 4K
-*/
-   unsigned int nr = min(512U, batch->nr);
+   while (batch->nr) {
+   if (!page_poisoning_enabled_static() && !want_init_on_free()) {
+   nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);
 
 			/*

 * Make sure we cover page + nr_pages, and don't leave
@@ -111,14 +114,39 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
if (unlikely(encoded_page_flags(pages[nr - 1]) &
 ENCODED_PAGE_BIT_NR_PAGES_NEXT))
nr++;
+   } else {
+   /*
+* With page poisoning and init_on_free, the time it
+* takes to free memory grows proportionally with the
+ 

Re: [PATCH v2 09/10] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing

2024-02-12 Thread David Hildenbrand

On 12.02.24 11:32, Ryan Roberts wrote:

On 12/02/2024 10:11, David Hildenbrand wrote:

Hi Ryan,


-static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
   {
-    struct mmu_gather_batch *batch;
-
-    for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
-    struct encoded_page **pages = batch->encoded_pages;
+    struct encoded_page **pages = batch->encoded_pages;
+    unsigned int nr, nr_pages;
   +    /*
+ * We might end up freeing a lot of pages. Reschedule on a regular
+ * basis to avoid soft lockups in configurations without full
+ * preemption enabled. The magic number of 512 folios seems to work.
+ */
+    if (!page_poisoning_enabled_static() && !want_init_on_free()) {


Is the performance win really worth 2 separate implementations keyed off this?
It seems a bit fragile, in case any other operations get added to free which are
proportional to size in future. Why not just always do the conservative version?


I really don't want to iterate over all entries on the "sane" common case. We
already do that two times:

a) free_pages_and_swap_cache()

b) release_pages()

Only the latter really is required, and I'm planning on removing the one in (a)
to move it into (b) as well.

So I keep it separate to keep any unnecessary overhead to the setups that are
already terribly slow.

No need to iterate a page full of entries if it can be easily avoided.
Especially, no need to degrade the common order-0 case.


Yeah, I understand all that. But given this is all coming from an array, (so
easy to prefetch?) and will presumably all fit in the cache for the common case,
at least, so it's hot for (a) and (b), does separating this out really make a
measurable performance difference? If yes then absolutely this optimization
makes sense. But if not, I think it's a bit questionable.


I primarily added it because

(a) we learned that each cycle counts during mmap() just like it does 
during fork().


(b) Linus was similarly concerned about optimizing out another batching 
walk in c47454823bd4 ("mm: mmu_gather: allow more than one batch of 
delayed rmaps"):


"it needs to walk that array of pages while still holding the page table 
lock, and our mmu_gather infrastructure allows for batching quite a lot 
of pages.  We may have thousands on pages queued up for freeing, and we 
wanted to walk only the last batch if we then added a dirty page to the 
queue."


So if it matters enough for reducing the time we hold the page table 
lock, it surely adds "some" overhead in general.





You're the boss though, so if your experience tells you this is necessary, then
I'm ok with that.


I did not do any measurements myself, I just did that intuitively as 
above. After all, it's all pretty straight forward (keeping the existing 
logic, we need a new one either way) and not that much code.


So unless there are strong opinions, I'd just leave the common case as 
it was, and the odd case be special.




By the way, Matthew had an RFC a while back that was doing some clever things
with batches further down the call chain (I think; from memory). Might be worth
taking a look at that if you are planning a follow up change to (a).



Do you have a pointer?






   while (batch->nr) {
-    /*
- * limit free batch count when PAGE_SIZE > 4K
- */
-    unsigned int nr = min(512U, batch->nr);
+    nr = min(512, batch->nr);


If any entries are for more than 1 page, nr_pages will also be encoded in the
batch, so effectively this could be limiting to 256 actual folios (half of 512).


Right, in the patch description I state "256 folio fragments". It's up to 512
folios (order-0).


Is it worth checking for ENCODED_PAGE_BIT_NR_PAGES_NEXT and limiting 
accordingly?


At least with 4k page size, we never have more than 510 (IIRC) entries per batch
page. So any such optimization would only matter for large page sizes, which I
don't think is worth it.


Yep; agreed.



Which exact optimization do you have in mind and would it really make a 
difference?


No, I don't think it would make any difference, performance-wise. I'm just
pointing out that in pathological cases you could end up with half the number of
pages being freed at a time.


Yes, I'll extend the patch description!







nit: You're using 512 magic number in 2 places now; perhaps make a macro?


I played 3 times with macro names (including just using something "intuitive"
like MAX_ORDER_NR_PAGES) but returned to just using 512.

That cond_resched() handling is just absolutely disgusting, one way or the 
other.

Do you have a good idea for a macro name?


MAX_NR_FOLIOS_PER_BATCH?
MAX_NR_FOLIOS_PER_FREE?

I don't think the name has to be perfect, because it's private to the c file; but
it ensures the 2 usages remain in sync if someone wants to change it in future.


Makes sense, I'll use something along those lines.








      

Re: [PATCH v2 09/10] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing

2024-02-12 Thread Ryan Roberts
On 12/02/2024 10:11, David Hildenbrand wrote:
> Hi Ryan,
> 
>>> -static void tlb_batch_pages_flush(struct mmu_gather *tlb)
>>> +static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
>>>   {
>>> -    struct mmu_gather_batch *batch;
>>> -
>>> -    for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
>>> -    struct encoded_page **pages = batch->encoded_pages;
>>> +    struct encoded_page **pages = batch->encoded_pages;
>>> +    unsigned int nr, nr_pages;
>>>   +    /*
>>> + * We might end up freeing a lot of pages. Reschedule on a regular
>>> + * basis to avoid soft lockups in configurations without full
>>> + * preemption enabled. The magic number of 512 folios seems to work.
>>> + */
>>> +    if (!page_poisoning_enabled_static() && !want_init_on_free()) {
>>
>> Is the performance win really worth 2 separate implementations keyed off 
>> this?
>> It seems a bit fragile, in case any other operations get added to free which 
>> are
>> proportional to size in future. Why not just always do the conservative 
>> version?
> 
> I really don't want to iterate over all entries on the "sane" common case. We
> already do that two times:
> 
> a) free_pages_and_swap_cache()
> 
> b) release_pages()
> 
> Only the latter really is required, and I'm planning on removing the one in 
> (a)
> to move it into (b) as well.
> 
> So I keep it separate to keep any unnecessary overhead to the setups that are
> already terribly slow.
> 
> No need to iterate a page full of entries if it can be easily avoided.
> Especially, no need to degrade the common order-0 case.

Yeah, I understand all that. But given this is all coming from an array, (so
easy to prefetch?) and will presumably all fit in the cache for the common case,
at least, so it's hot for (a) and (b), does separating this out really make a
measurable performance difference? If yes then absolutely this optimization
makes sense. But if not, I think it's a bit questionable.

You're the boss though, so if your experience tells you this is necessary, then
I'm ok with that.

By the way, Matthew had an RFC a while back that was doing some clever things
with batches further down the call chain (I think; from memory). Might be worth
taking a look at that if you are planning a follow up change to (a).

> 
>>
>>>   while (batch->nr) {
>>> -    /*
>>> - * limit free batch count when PAGE_SIZE > 4K
>>> - */
>>> -    unsigned int nr = min(512U, batch->nr);
>>> +    nr = min(512, batch->nr);
>>
>> If any entries are for more than 1 page, nr_pages will also be encoded in the
>> batch, so effectively this could be limiting to 256 actual folios (half of 
>> 512).
> 
> Right, in the patch description I state "256 folio fragments". It's up to 512
> folios (order-0).
> 
>> Is it worth checking for ENCODED_PAGE_BIT_NR_PAGES_NEXT and limiting 
>> accordingly?
> 
> At least with 4k page size, we never have more than 510 (IIRC) entries per 
> batch
> page. So any such optimization would only matter for large page sizes, which I
> don't think is worth it.

Yep; agreed.

> 
> Which exact optimization do you have in mind and would it really make a 
> difference?

No, I don't think it would make any difference, performance-wise. I'm just
pointing out that in pathological cases you could end up with half the number of
pages being freed at a time.

> 
>>
>> nit: You're using 512 magic number in 2 places now; perhaps make a macro?
> 
> I played 3 times with macro names (including just using something "intuitive"
> like MAX_ORDER_NR_PAGES) but returned to just using 512.
> 
> That cond_resched() handling is just absolutely disgusting, one way or the 
> other.
> 
> Do you have a good idea for a macro name?

MAX_NR_FOLIOS_PER_BATCH?
MAX_NR_FOLIOS_PER_FREE?

I don't think the name has to be perfect, because it's private to the c file; but
it ensures the 2 usages remain in sync if someone wants to change it in future.

> 
>>
>>>     /*
>>>    * Make sure we cover page + nr_pages, and don't leave
>>> @@ -119,6 +120,37 @@ static void tlb_batch_pages_flush(struct mmu_gather 
>>> *tlb)
>>>   cond_resched();
>>>   }
>>>   }
>>> +
>>> +    /*
>>> + * With page poisoning and init_on_free, the time it takes to free
>>> + * memory grows proportionally with the actual memory size. Therefore,
>>> + * limit based on the actual memory size and not the number of involved
>>> + * folios.
>>> + */
>>> +    while (batch->nr) {
>>> +    for (nr = 0, nr_pages = 0;
>>> + nr < batch->nr && nr_pages < 512; nr++) {
>>> +    if (unlikely(encoded_page_flags(pages[nr]) &
>>> + ENCODED_PAGE_BIT_NR_PAGES_NEXT))
>>> +    nr_pages += encoded_nr_pages(pages[++nr]);
>>> +    else
>>> +    nr_pages++;
>>> +    }
>>
>> I guess worst case here is freeing (511 + 8192) * 64K pages = ~544M. That's 
>> 

Re: [PATCH v2 09/10] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing

2024-02-12 Thread David Hildenbrand

Hi Ryan,


-static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
  {
-   struct mmu_gather_batch *batch;
-
-   for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
-   struct encoded_page **pages = batch->encoded_pages;
+   struct encoded_page **pages = batch->encoded_pages;
+   unsigned int nr, nr_pages;
  
+	/*

+* We might end up freeing a lot of pages. Reschedule on a regular
+* basis to avoid soft lockups in configurations without full
+* preemption enabled. The magic number of 512 folios seems to work.
+*/
+   if (!page_poisoning_enabled_static() && !want_init_on_free()) {


Is the performance win really worth 2 separate implementations keyed off this?
It seems a bit fragile, in case any other operations get added to free which are
proportional to size in future. Why not just always do the conservative version?


I really don't want to iterate over all entries on the "sane" common 
case. We already do that two times:


a) free_pages_and_swap_cache()

b) release_pages()

Only the latter really is required, and I'm planning on removing the one 
in (a) to move it into (b) as well.


So I keep it separate to keep any unnecessary overhead to the setups 
that are already terribly slow.


No need to iterate a page full of entries if it can be easily avoided. 
Especially, no need to degrade the common order-0 case.





while (batch->nr) {
-   /*
-* limit free batch count when PAGE_SIZE > 4K
-*/
-   unsigned int nr = min(512U, batch->nr);
+   nr = min(512, batch->nr);


If any entries are for more than 1 page, nr_pages will also be encoded in the
batch, so effectively this could be limiting to 256 actual folios (half of 512).


Right, in the patch description I state "256 folio fragments". It's up 
to 512 folios (order-0).



Is it worth checking for ENCODED_PAGE_BIT_NR_PAGES_NEXT and limiting 
accordingly?


At least with 4k page size, we never have more than 510 (IIRC) entries 
per batch page. So any such optimization would only matter for large 
page sizes, which I don't think is worth it.


Which exact optimization do you have in mind and would it really make a 
difference?




nit: You're using 512 magic number in 2 places now; perhaps make a macro?


I played 3 times with macro names (including just using something 
"intuitive" like MAX_ORDER_NR_PAGES) but returned to just using 512.


That cond_resched() handling is just absolutely disgusting, one way or 
the other.


Do you have a good idea for a macro name?



  
  			/*

 * Make sure we cover page + nr_pages, and don't leave
@@ -119,6 +120,37 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
cond_resched();
}
}
+
+   /*
+* With page poisoning and init_on_free, the time it takes to free
+* memory grows proportionally with the actual memory size. Therefore,
+* limit based on the actual memory size and not the number of involved
+* folios.
+*/
+   while (batch->nr) {
+   for (nr = 0, nr_pages = 0;
+nr < batch->nr && nr_pages < 512; nr++) {
+   if (unlikely(encoded_page_flags(pages[nr]) &
+ENCODED_PAGE_BIT_NR_PAGES_NEXT))
+   nr_pages += encoded_nr_pages(pages[++nr]);
+   else
+   nr_pages++;
+   }


I guess worst case here is freeing (511 + 8192) * 64K pages = ~544M. That's up
from the old limit of 512 * 64K = 32M, and 511 pages bigger than your statement
in the commit log. Are you comfortable with this? I guess the only alternative
is to start splitting a batch which would be really messy. I agree your approach
is preferable if 544M is acceptable.


Right, I have in the description:

"if we cannot even free a single MAX_ORDER page on a system without 
running into soft lockups, something else is already completely bogus.".


That would be 8192 pages on arm64. Anybody freeing a PMD-mapped THP 
would be in trouble already and should just reconsider life choices 
running such a machine.


We could have 511 more pages, yes. If 8192 don't trigger a soft-lockup, 
I am confident that 511 more pages won't make a difference.


But, if that ever is a problem, we can butcher this code as much as we 
want, because performance with poisoning/zeroing is already down the drain.


As you say, splitting even further is messy, so I rather avoid that 
unless really required.


--
Cheers,

David / dhildenb



Re: [DMARC error][SPF error] Re: [PATCH v4 00/10] devm_led_classdev_register() usage problem

2024-02-12 Thread Andy Shevchenko
On Mon, Feb 12, 2024 at 1:52 AM George Stark  wrote:
> I haven't lost hope for the devm_mutex thing and keep pinging those guys
> from time to time.

I don't understand. According to the v4 thread, Christophe proposed how
the patch should look. What you need is to incorporate an updated
version into your series. Am I wrong?

> Sure, I can single out the fix-only patch; I'll do it tomorrow.

I believe it can be handled without issuing it separately. The `b4` tool
is capable of picking patches selectively. It was rather a question to Lee
whether he can/wants to apply it right away.

> On 2/9/24 20:11, Andy Shevchenko wrote:
> > On Thu, Dec 21, 2023 at 03:11:11PM +, Lee Jones wrote:
> >> On Thu, 14 Dec 2023, George Stark wrote:
> >>
> >>> This patch series fixes the problem of devm_led_classdev_register 
> >>> misuse.
> >>>
> >>> The basic problem is described in [1]. In short, when 
> >>> devm_led_classdev_register()
> >>> is used, led_classdev_unregister() is called after the driver's remove() 
> >>> callback.
> >>> led_classdev_unregister() calls the driver's brightness_set callback, and that 
> >>> callback
> >>> may use resources which were already destroyed in the driver's remove().
> >>>
> >>> After discussion with maintainers [2] [3] we decided:
> >>> 1) don't touch led subsystem core code and don't remove 
> >>> led_set_brightness() from it
> >>> but fix drivers
> >>> 2) don't use devm_led_classdev_unregister
> >>>
> >>> So the solution is to use devm wrappers for all resources
> >>> driver's brightness_set() depends on. And introduce dedicated devm wrapper
> >>> for mutex as it's often used resource.
> >>>
> >>> [1] 
> >>> https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/
> >>> [2] 
> >>> https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/#mc132b9b350fa51931b4fcfe14705d9f06e91421f
> >>> [3] 
> >>> https://lore.kernel.org/lkml/8704539b-ed3b-44e6-aa82-586e2f895...@salutedevices.com/T/#mdbf572a85c33f869a553caf986b6228bb65c8383
> >
> > ...
> >
> >> FYI: I'll conduct my review once the locking side is settled.
> >
> > To reduce burden can you apply the first one? It's a fix.

-- 
With Best Regards,
Andy Shevchenko


Re: [PATCH v3 RESEND 4/6] bitmap: Introduce bitmap_off()

2024-02-12 Thread Rasmus Villemoes
On 12/02/2024 08.56, Herve Codina wrote:
> The bitmap_onto() function translates one bitmap relative to another but
> no function is present to perform the reverse translation.
> 
> Introduce bitmap_off() to fill this hole.
> 
> Signed-off-by: Herve Codina 
> ---
>  include/linux/bitmap.h |  3 +++
>  lib/bitmap.c   | 42 ++

This patch, or the next in the series, should include a diffstat
mentioning lib/test_bitmap.c. And please make sure that the tests
exercise both expected use as well as corner cases, so that the actual
expected behavior is documented in code and not just in prose (which may
be ambiguous), and so that behavior-changing refactorings will not go
unnoticed.
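
For example, a minimal round-trip case could look like the sketch below
(test name, values and placement are assumptions; the expectations follow
from the kernel-doc of the two helpers):

static void __init test_bitmap_onto_off(void)
{
	DECLARE_BITMAP(relmap, 64);
	DECLARE_BITMAP(src, 64);
	DECLARE_BITMAP(onto, 64);
	DECLARE_BITMAP(off, 64);

	bitmap_zero(relmap, 64);
	bitmap_set(relmap, 4, 4);	/* relmap = bits 4-7 ... */
	bitmap_set(relmap, 12, 4);	/* ... and bits 12-15 */

	bitmap_zero(src, 64);
	__set_bit(0, src);		/* src = bits 0 and 2 */
	__set_bit(2, src);

	bitmap_onto(onto, src, relmap, 64);	/* expect bits 4 and 6 set */
	bitmap_off(off, onto, relmap, 64);	/* expect src back */

	if (!bitmap_equal(off, src, 64))
		pr_err("bitmap_off() did not undo bitmap_onto()\n");
}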

Rasmus



Re: [PATCH v2 10/10] mm/memory: optimize unmap/zap with PTE-mapped THP

2024-02-12 Thread Ryan Roberts
On 09/02/2024 22:15, David Hildenbrand wrote:
> Similar to how we optimized fork(), let's implement PTE batching when
> consecutive (present) PTEs map consecutive pages of the same large
> folio.
> 
> Most infrastructure we need for batching (mmu gather, rmap) is already
> there. We only have to add get_and_clear_full_ptes() and
> clear_full_ptes(). Similarly, extend zap_install_uffd_wp_if_needed() to
> process a PTE range.
> 
> We won't bother sanity-checking the mapcount of all subpages, but only
> check the mapcount of the first subpage we process. If there is a real
> problem hiding somewhere, we can trigger it simply by using small
> folios, or when we zap single pages of a large folio. Ideally, we had
> that check in rmap code (including for delayed rmap), but then we cannot
> print the PTE. Let's keep it simple for now. If we ever have a cheap
> folio_mapcount(), we might just want to check for underflows there.
> 
> To keep small folios as fast as possible force inlining of a specialized
> variant using __always_inline with nr=1.
> 
> Signed-off-by: David Hildenbrand 

Reviewed-by: Ryan Roberts 
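
(For readers skimming the diff below: a hypothetical caller sketch -- not
code from this series -- of how the new helper is meant to be used once a
PTE-batching step has determined 'nr' consecutive PTEs of the same folio:)

static void zap_folio_ptes_sketch(struct mmu_gather *tlb,
				  struct vm_area_struct *vma,
				  struct folio *folio, unsigned long addr,
				  pte_t *pte, unsigned int nr)
{
	/* Clear all nr PTEs in one go; dirty/young bits are merged into the result. */
	pte_t ptent = get_and_clear_full_ptes(tlb->mm, addr, pte, nr, tlb->fullmm);

	if (pte_dirty(ptent))
		folio_mark_dirty(folio);
	if (pte_young(ptent) && vma_has_recency(vma))
		folio_mark_accessed(folio);
}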

> ---
>  include/linux/pgtable.h | 70 +++
>  mm/memory.c | 92 +
>  2 files changed, 136 insertions(+), 26 deletions(-)
> 
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index aab227e12493..49ab1f73b5c2 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -580,6 +580,76 @@ static inline pte_t ptep_get_and_clear_full(struct 
> mm_struct *mm,
>  }
>  #endif
>  
> +#ifndef get_and_clear_full_ptes
> +/**
> + * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
> + *the same folio, collecting dirty/accessed bits.
> + * @mm: Address space the pages are mapped into.
> + * @addr: Address the first page is mapped at.
> + * @ptep: Page table pointer for the first entry.
> + * @nr: Number of entries to clear.
> + * @full: Whether we are clearing a full mm.
> + *
> + * May be overridden by the architecture; otherwise, implemented as a simple
> + * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the
> + * returned PTE.
> + *
> + * Note that PTE bits in the PTE range besides the PFN can differ. For 
> example,
> + * some PTEs might be write-protected.
> + *
> + * Context: The caller holds the page table lock.  The PTEs map consecutive
> + * pages that belong to the same folio.  The PTEs are all in the same PMD.
> + */
> +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
> + unsigned long addr, pte_t *ptep, unsigned int nr, int full)
> +{
> + pte_t pte, tmp_pte;
> +
> + pte = ptep_get_and_clear_full(mm, addr, ptep, full);
> + while (--nr) {
> + ptep++;
> + addr += PAGE_SIZE;
> + tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full);
> + if (pte_dirty(tmp_pte))
> + pte = pte_mkdirty(pte);
> + if (pte_young(tmp_pte))
> + pte = pte_mkyoung(pte);
> + }
> + return pte;
> +}
> +#endif
> +
> +#ifndef clear_full_ptes
> +/**
> + * clear_full_ptes - Clear present PTEs that map consecutive pages of the 
> same
> + *folio.
> + * @mm: Address space the pages are mapped into.
> + * @addr: Address the first page is mapped at.
> + * @ptep: Page table pointer for the first entry.
> + * @nr: Number of entries to clear.
> + * @full: Whether we are clearing a full mm.
> + *
> + * May be overridden by the architecture; otherwise, implemented as a simple
> + * loop over ptep_get_and_clear_full().
> + *
> + * Note that PTE bits in the PTE range besides the PFN can differ. For 
> example,
> + * some PTEs might be write-protected.
> + *
> + * Context: The caller holds the page table lock.  The PTEs map consecutive
> + * pages that belong to the same folio.  The PTEs are all in the same PMD.
> + */
> +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep, unsigned int nr, int full)
> +{
> + for (;;) {
> + ptep_get_and_clear_full(mm, addr, ptep, full);
> + if (--nr == 0)
> + break;
> + ptep++;
> + addr += PAGE_SIZE;
> + }
> +}
> +#endif
>  
>  /*
>   * If two threads concurrently fault at the same page, the thread that
> diff --git a/mm/memory.c b/mm/memory.c
> index a3efc4da258a..3b8e56eb08a3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1515,7 +1515,7 @@ static inline bool zap_drop_file_uffd_wp(struct 
> zap_details *details)
>   */
>  static inline void
>  zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
> -   unsigned long addr, pte_t *pte,
> +   unsigned long addr, pte_t *pte, int nr,
> struct zap_details *details, pte_t pteval)
>  {
>   /* Zap on anonymous always means 

Re: [PATCH v2 09/10] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing

2024-02-12 Thread Ryan Roberts
On 09/02/2024 22:15, David Hildenbrand wrote:
> It's a pain that we have to handle cond_resched() in
> tlb_batch_pages_flush() manually and cannot simply handle it in
> release_pages() -- release_pages() can be called from atomic context.
> Well, in a perfect world we wouldn't have to make our code more complicated at all.
> 
> With page poisoning and init_on_free, we might now run into soft lockups
> when we free a lot of rather large folio fragments, because page freeing
> time then depends on the actual memory size we are freeing instead of on
> the number of folios that are involved.
> 
> In the absolute (unlikely) worst case, on arm64 with 64k we will be able
> to free up to 256 folio fragments that each span 512 MiB: zeroing out 128
> GiB does sound like it might take a while. But instead of ignoring this
> unlikely case, let's just handle it.
> 
> So, let's teach tlb_batch_pages_flush() that there are some
> configurations where page freeing is horribly slow, and let's reschedule
> more frequently -- similarly like we did for now before we had large folio
> fragments in there. Note that we might end up freeing only a single folio
> fragment at a time that might exceed the old 512 pages limit: but if we
> cannot even free a single MAX_ORDER page on a system without running into
> soft lockups, something else is already completely bogus.
> 
> In the future, we might want to detect if handling cond_resched() is
> required at all, and just not do any of that with full preemption enabled.
> 
> Signed-off-by: David Hildenbrand 
> ---
>  mm/mmu_gather.c | 50 -
>  1 file changed, 41 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index d175c0f1e2c8..2774044b5790 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -91,18 +91,19 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct 
> vm_area_struct *vma)
>  }
>  #endif
>  
> -static void tlb_batch_pages_flush(struct mmu_gather *tlb)
> +static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
>  {
> - struct mmu_gather_batch *batch;
> -
> - for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
> - struct encoded_page **pages = batch->encoded_pages;
> + struct encoded_page **pages = batch->encoded_pages;
> + unsigned int nr, nr_pages;
>  
> + /*
> +  * We might end up freeing a lot of pages. Reschedule on a regular
> +  * basis to avoid soft lockups in configurations without full
> +  * preemption enabled. The magic number of 512 folios seems to work.
> +  */
> + if (!page_poisoning_enabled_static() && !want_init_on_free()) {

Is the performance win really worth 2 separate implementations keyed off this?
It seems a bit fragile, in case any other operations get added to free which are
proportional to size in future. Why not just always do the conservative version?

>   while (batch->nr) {
> - /*
> -  * limit free batch count when PAGE_SIZE > 4K
> -  */
> - unsigned int nr = min(512U, batch->nr);
> + nr = min(512, batch->nr);

If any entries are for more than 1 page, nr_pages will also be encoded in the
batch, so effectively this could be limiting to 256 actual folios (half of 512).
Is it worth checking for ENCODED_PAGE_BIT_NR_PAGES_NEXT and limiting 
accordingly?

nit: You're using 512 magic number in 2 places now; perhaps make a macro?

>  
>   /*
>* Make sure we cover page + nr_pages, and don't leave
> @@ -119,6 +120,37 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
>   cond_resched();
>   }
>   }
> +
> + /*
> +  * With page poisoning and init_on_free, the time it takes to free
> +  * memory grows proportionally with the actual memory size. Therefore,
> +  * limit based on the actual memory size and not the number of involved
> +  * folios.
> +  */
> + while (batch->nr) {
> + for (nr = 0, nr_pages = 0;
> +  nr < batch->nr && nr_pages < 512; nr++) {
> + if (unlikely(encoded_page_flags(pages[nr]) &
> +  ENCODED_PAGE_BIT_NR_PAGES_NEXT))
> + nr_pages += encoded_nr_pages(pages[++nr]);
> + else
> + nr_pages++;
> + }

I guess worst case here is freeing (511 + 8192) * 64K pages = ~544M. That's up
from the old limit of 512 * 64K = 32M, and 511 pages bigger than your statement
in the commit log. Are you comfortable with this? I guess the only alternative
is to start splitting a batch which would be really messy. I agree your approach
is preferable if 544M is acceptable.

> +
> + free_pages_and_swap_cache(pages, nr);
> + pages += nr;
> + batch->nr -= nr;
> +
> + 

Re: [PATCH v2 08/10] mm/mmu_gather: add __tlb_remove_folio_pages()

2024-02-12 Thread David Hildenbrand

On 12.02.24 09:51, Ryan Roberts wrote:

On 09/02/2024 22:15, David Hildenbrand wrote:

Add __tlb_remove_folio_pages(), which will remove multiple consecutive
pages that belong to the same large folio, instead of only a single
page. We'll be using this function when optimizing unmapping/zapping of
large folios that are mapped by PTEs.

We're using the remaining spare bit in an encoded_page to indicate that
the next encoded page in an array contains actually shifted "nr_pages".
Teach swap/freeing code about putting multiple folio references, and
delayed rmap handling to remove page ranges of a folio.

This extension allows for still gathering almost as many small folios
as we used to (-1, because we have to prepare for a possibly bigger next
entry), but still allows for gathering consecutive pages that belong to the
same large folio.

Note that we don't pass the folio pointer, because it is not required for
now. Further, we don't support page_size != PAGE_SIZE, it won't be
required for simple PTE batching.

We have to provide a separate s390 implementation, but it's fairly
straight forward.

Another, more invasive and likely more expensive, approach would be to
use folio+range or a PFN range instead of page+nr_pages. But, we should
do that consistently for the whole mmu_gather. For now, let's keep it
simple and add "nr_pages" only.

Note that it is now possible to gather significantly more pages: In the
past, we were able to gather ~10000 pages, now we can also
gather ~5000 folio fragments that span multiple pages. A folio
fragment on x86-64 can be up to 512 pages (2 MiB THP) and on arm64 with
64k in theory 8192 pages (512 MiB THP). Gathering more memory is not
considered something we should worry about, especially because these are
already corner cases.

While we can gather more total memory, we won't free more folio
fragments. As long as page freeing time primarily only depends on the
number of involved folios, there is no effective change for !preempt
configurations. However, we'll adjust tlb_batch_pages_flush() separately to
handle corner cases where page freeing time grows proportionally with the
actual memory size.

Signed-off-by: David Hildenbrand 
---
  arch/s390/include/asm/tlb.h | 17 +++
  include/asm-generic/tlb.h   |  8 +
  include/linux/mm_types.h| 20 
  mm/mmu_gather.c | 61 +++--
  mm/swap.c   | 12 ++--
  mm/swap_state.c | 15 +++--
  6 files changed, 119 insertions(+), 14 deletions(-)

diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 48df896d5b79..e95b2c8081eb 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -26,6 +26,8 @@ void __tlb_remove_table(void *_table);
  static inline void tlb_flush(struct mmu_gather *tlb);
  static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, bool delay_rmap, int page_size);
+static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
+   struct page *page, unsigned int nr_pages, bool delay_rmap);
  
  #define tlb_flush tlb_flush

  #define pte_free_tlb pte_free_tlb
@@ -52,6 +54,21 @@ static inline bool __tlb_remove_page_size(struct mmu_gather 
*tlb,
return false;
  }
  
+static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,

+   struct page *page, unsigned int nr_pages, bool delay_rmap)
+{
+   struct encoded_page *encoded_pages[] = {
+   encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT),
+   encode_nr_pages(nr_pages),
+   };
+
+   VM_WARN_ON_ONCE(delay_rmap);
+   VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
+
+   free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages));
+   return false;
+}
+
  static inline void tlb_flush(struct mmu_gather *tlb)
  {
__tlb_flush_mm_lazy(tlb->mm);
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 95d60a4f468a..bd00dd238b79 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -69,6 +69,7 @@
   *
   *  - tlb_remove_page() / __tlb_remove_page()
   *  - tlb_remove_page_size() / __tlb_remove_page_size()
+ *  - __tlb_remove_folio_pages()
   *
   *__tlb_remove_page_size() is the basic primitive that queues a page for
   *freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
@@ -78,6 +79,11 @@
   *tlb_remove_page() and tlb_remove_page_size() imply the call to
   *tlb_flush_mmu() when required and has no return value.
   *
+ *__tlb_remove_folio_pages() is similar to __tlb_remove_page(), however,
+ *instead of removing a single page, remove the given number of consecutive
+ *pages that are all part of the same (large) folio: just like calling
+ *__tlb_remove_page() on each page individually.
+ *
   *  - tlb_change_page_size()
   *
   *call before __tlb_remove_page*() to set the current 

Re: [PATCH v2 08/10] mm/mmu_gather: add __tlb_remove_folio_pages()

2024-02-12 Thread Ryan Roberts
On 09/02/2024 22:15, David Hildenbrand wrote:
> Add __tlb_remove_folio_pages(), which will remove multiple consecutive
> pages that belong to the same large folio, instead of only a single
> page. We'll be using this function when optimizing unmapping/zapping of
> large folios that are mapped by PTEs.
> 
> We're using the remaining spare bit in an encoded_page to indicate that
> the next encoded page in an array contains actually shifted "nr_pages".
> Teach swap/freeing code about putting multiple folio references, and
> delayed rmap handling to remove page ranges of a folio.
> 
> This extension allows for still gathering almost as many small folios
> as we used to (-1, because we have to prepare for a possibly bigger next
> entry), but still allows for gathering consecutive pages that belong to the
> same large folio.
> 
> Note that we don't pass the folio pointer, because it is not required for
> now. Further, we don't support page_size != PAGE_SIZE, it won't be
> required for simple PTE batching.
> 
> We have to provide a separate s390 implementation, but it's fairly
> straight forward.
> 
> Another, more invasive and likely more expensive, approach would be to
> use folio+range or a PFN range instead of page+nr_pages. But, we should
> do that consistently for the whole mmu_gather. For now, let's keep it
> simple and add "nr_pages" only.
> 
> Note that it is now possible to gather significantly more pages: In the
> past, we were able to gather ~10000 pages, now we can also
> gather ~5000 folio fragments that span multiple pages. A folio
> fragment on x86-64 can be up to 512 pages (2 MiB THP) and on arm64 with
> 64k in theory 8192 pages (512 MiB THP). Gathering more memory is not
> considered something we should worry about, especially because these are
> already corner cases.
> 
> While we can gather more total memory, we won't free more folio
> fragments. As long as page freeing time primarily only depends on the
> number of involved folios, there is no effective change for !preempt
> configurations. However, we'll adjust tlb_batch_pages_flush() separately to
> handle corner cases where page freeing time grows proportionally with the
> actual memory size.
> 
> Signed-off-by: David Hildenbrand 
> ---
>  arch/s390/include/asm/tlb.h | 17 +++
>  include/asm-generic/tlb.h   |  8 +
>  include/linux/mm_types.h| 20 
>  mm/mmu_gather.c | 61 +++--
>  mm/swap.c   | 12 ++--
>  mm/swap_state.c | 15 +++--
>  6 files changed, 119 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
> index 48df896d5b79..e95b2c8081eb 100644
> --- a/arch/s390/include/asm/tlb.h
> +++ b/arch/s390/include/asm/tlb.h
> @@ -26,6 +26,8 @@ void __tlb_remove_table(void *_table);
>  static inline void tlb_flush(struct mmu_gather *tlb);
>  static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
>   struct page *page, bool delay_rmap, int page_size);
> +static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
> + struct page *page, unsigned int nr_pages, bool delay_rmap);
>  
>  #define tlb_flush tlb_flush
>  #define pte_free_tlb pte_free_tlb
> @@ -52,6 +54,21 @@ static inline bool __tlb_remove_page_size(struct 
> mmu_gather *tlb,
>   return false;
>  }
>  
> +static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
> + struct page *page, unsigned int nr_pages, bool delay_rmap)
> +{
> + struct encoded_page *encoded_pages[] = {
> + encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT),
> + encode_nr_pages(nr_pages),
> + };
> +
> + VM_WARN_ON_ONCE(delay_rmap);
> + VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
> +
> + free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages));
> + return false;
> +}
> +
>  static inline void tlb_flush(struct mmu_gather *tlb)
>  {
>   __tlb_flush_mm_lazy(tlb->mm);
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index 95d60a4f468a..bd00dd238b79 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -69,6 +69,7 @@
>   *
>   *  - tlb_remove_page() / __tlb_remove_page()
>   *  - tlb_remove_page_size() / __tlb_remove_page_size()
> + *  - __tlb_remove_folio_pages()
>   *
>   *__tlb_remove_page_size() is the basic primitive that queues a page for
>   *freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
> @@ -78,6 +79,11 @@
>   *tlb_remove_page() and tlb_remove_page_size() imply the call to
>   *tlb_flush_mmu() when required and has no return value.
>   *
> + *__tlb_remove_folio_pages() is similar to __tlb_remove_page(), however,
> + *instead of removing a single page, remove the given number of 
> consecutive
> + *pages that are all part of the same (large) folio: just like calling
> + *__tlb_remove_page() on each 

Re: [PATCH v2 01/10] mm/memory: factor out zapping of present pte into zap_present_pte()

2024-02-12 Thread Ryan Roberts
On 09/02/2024 22:15, David Hildenbrand wrote:
> Let's prepare for further changes by factoring out processing of present
> PTEs.
> 
> Signed-off-by: David Hildenbrand 

Reviewed-by: Ryan Roberts 

> ---
>  mm/memory.c | 94 ++---
>  1 file changed, 53 insertions(+), 41 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 7c3ca41a7610..5b0dc33133a6 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1532,13 +1532,61 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct 
> *vma,
>   pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
>  }
>  
> +static inline void zap_present_pte(struct mmu_gather *tlb,
> + struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
> + unsigned long addr, struct zap_details *details,
> + int *rss, bool *force_flush, bool *force_break)
> +{
> + struct mm_struct *mm = tlb->mm;
> + struct folio *folio = NULL;
> + bool delay_rmap = false;
> + struct page *page;
> +
> + page = vm_normal_page(vma, addr, ptent);
> + if (page)
> + folio = page_folio(page);
> +
> + if (unlikely(!should_zap_folio(details, folio)))
> + return;
> + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
> + arch_check_zapped_pte(vma, ptent);
> + tlb_remove_tlb_entry(tlb, pte, addr);
> + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
> + if (unlikely(!page)) {
> + ksm_might_unmap_zero_page(mm, ptent);
> + return;
> + }
> +
> + if (!folio_test_anon(folio)) {
> + if (pte_dirty(ptent)) {
> + folio_mark_dirty(folio);
> + if (tlb_delay_rmap(tlb)) {
> + delay_rmap = true;
> + *force_flush = true;
> + }
> + }
> + if (pte_young(ptent) && likely(vma_has_recency(vma)))
> + folio_mark_accessed(folio);
> + }
> + rss[mm_counter(folio)]--;
> + if (!delay_rmap) {
> + folio_remove_rmap_pte(folio, page, vma);
> + if (unlikely(page_mapcount(page) < 0))
> + print_bad_pte(vma, addr, ptent, page);
> + }
> + if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
> + *force_flush = true;
> + *force_break = true;
> + }
> +}
> +
>  static unsigned long zap_pte_range(struct mmu_gather *tlb,
>   struct vm_area_struct *vma, pmd_t *pmd,
>   unsigned long addr, unsigned long end,
>   struct zap_details *details)
>  {
> + bool force_flush = false, force_break = false;
>   struct mm_struct *mm = tlb->mm;
> - int force_flush = 0;
>   int rss[NR_MM_COUNTERS];
>   spinlock_t *ptl;
>   pte_t *start_pte;
> @@ -1555,7 +1603,7 @@ static unsigned long zap_pte_range(struct mmu_gather 
> *tlb,
>   arch_enter_lazy_mmu_mode();
>   do {
>   pte_t ptent = ptep_get(pte);
> - struct folio *folio = NULL;
> + struct folio *folio;
>   struct page *page;
>  
>   if (pte_none(ptent))
> @@ -1565,45 +1613,9 @@ static unsigned long zap_pte_range(struct mmu_gather 
> *tlb,
>   break;
>  
>   if (pte_present(ptent)) {
> - unsigned int delay_rmap;
> -
> - page = vm_normal_page(vma, addr, ptent);
> - if (page)
> - folio = page_folio(page);
> -
> - if (unlikely(!should_zap_folio(details, folio)))
> - continue;
> - ptent = ptep_get_and_clear_full(mm, addr, pte,
> - tlb->fullmm);
> - arch_check_zapped_pte(vma, ptent);
> - tlb_remove_tlb_entry(tlb, pte, addr);
> - zap_install_uffd_wp_if_needed(vma, addr, pte, details,
> -   ptent);
> - if (unlikely(!page)) {
> - ksm_might_unmap_zero_page(mm, ptent);
> - continue;
> - }
> -
> - delay_rmap = 0;
> - if (!folio_test_anon(folio)) {
> - if (pte_dirty(ptent)) {
> - folio_mark_dirty(folio);
> - if (tlb_delay_rmap(tlb)) {
> - delay_rmap = 1;
> - force_flush = 1;
> - }
> - }
> - if (pte_young(ptent) && 
> likely(vma_has_recency(vma)))
> - folio_mark_accessed(folio);
> -

Re: [PATCH v2] powerpc: Avoid nmi_enter/nmi_exit in real mode interrupt.

2024-02-12 Thread Christophe Leroy


On 05/02/2024 at 06:36, Mahesh Salgaonkar wrote:
> [You don't often get email from mah...@linux.ibm.com. Learn why this is 
> important at https://aka.ms/LearnAboutSenderIdentification ]
> 
> nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel
> crash when invoked during real mode interrupt handling (e.g. early HMI/MCE
> interrupt handler) if percpu allocation comes from vmalloc area.
> 
> Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI()
> wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when
> percpu allocation is from the embedded first chunk. However with
> CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu
> allocation can come from the vmalloc area.
> 
> With kernel command line "percpu_alloc=page" we can force percpu allocation
> to come from vmalloc area and can see kernel crash in machine_check_early:
> 
> [1.215714] NIP [c0e49eb4] rcu_nmi_enter+0x24/0x110
> [1.215717] LR [c00461a0] machine_check_early+0xf0/0x2c0
> [1.215719] --- interrupt: 200
> [1.215720] [c00fffd73180] [] 0x0 (unreliable)
> [1.215722] [c00fffd731b0] [] 0x0
> [1.215724] [c00fffd73210] [c0008364] 
> machine_check_early_common+0x134/0x1f8
> 
> Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu
> first chunk is not embedded.
> 
> Signed-off-by: Mahesh Salgaonkar 
> ---
> Changes in v2:
> - Rebase to upstream master
> - Use jump_labels, if CONFIG_JUMP_LABEL is enabled, to avoid redoing the
>test at each interrupt entry.
> - v1 is at 
> https://lore.kernel.org/linuxppc-dev/164578465828.74956.6065296024817333750.stgit@jupiter/
> ---
>   arch/powerpc/include/asm/interrupt.h | 14 ++
>   arch/powerpc/include/asm/percpu.h| 11 +++
>   arch/powerpc/kernel/setup_64.c   | 12 
>   3 files changed, 37 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/interrupt.h 
> b/arch/powerpc/include/asm/interrupt.h
> index a4196ab1d0167..3b4e17c23d9a9 100644
> --- a/arch/powerpc/include/asm/interrupt.h
> +++ b/arch/powerpc/include/asm/interrupt.h
> @@ -336,6 +336,16 @@ static inline void interrupt_nmi_enter_prepare(struct 
> pt_regs *regs, struct inte
>  if (IS_ENABLED(CONFIG_KASAN))
>  return;
> 
> +   /*
> +* Likewise, do not use it in real mode if percpu first chunk is not
> +* embedded. With CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there
> +* are chances where percpu allocation can come from vmalloc area.
> +*/
> +#ifdef CONFIG_PPC64

Instead of adding this #ifdef in middle of code, could you define 
is_embed_first_chunk as always 'true' when CONFIG_PPC64 is not defined ?
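
i.e. roughly something like this in asm/percpu.h (sketch of the suggestion
only, untested), so the test below needs no #ifdef around it:

#ifdef CONFIG_PPC64
DECLARE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
#define is_embed_first_chunk \
	(static_key_enabled(&__percpu_embed_first_chunk.key))
#else
/* No vmalloc'ed percpu first chunk to worry about without CONFIG_PPC64. */
#define is_embed_first_chunk	true
#endif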

> +   if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && 
> !is_embed_first_chunk)
> +   return;
> +#endif
> +
>  /* Otherwise, it should be safe to call it */
>  nmi_enter();
>   }
> @@ -351,6 +361,10 @@ static inline void interrupt_nmi_exit_prepare(struct 
> pt_regs *regs, struct inter
>  // no nmi_exit for a pseries hash guest taking a real mode 
> exception
>  } else if (IS_ENABLED(CONFIG_KASAN)) {
>  // no nmi_exit for KASAN in real mode
> +#ifdef CONFIG_PPC64

Same

> +   } else if (IS_ENABLED(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && 
> !is_embed_first_chunk) {
> +   // no nmi_exit if percpu first chunk is not embedded
> +#endif
>  } else {
>  nmi_exit();
>  }
> diff --git a/arch/powerpc/include/asm/percpu.h 
> b/arch/powerpc/include/asm/percpu.h
> index 8e5b7d0b851c6..6b4dce4e78d5f 100644
> --- a/arch/powerpc/include/asm/percpu.h
> +++ b/arch/powerpc/include/asm/percpu.h
> @@ -12,6 +12,17 @@
> 
>   #define __my_cpu_offset local_paca->data_offset
> 
> +#ifdef CONFIG_JUMP_LABEL
> +DECLARE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
> +
> +#define is_embed_first_chunk   \
> +   (static_key_enabled(&__percpu_embed_first_chunk.key))
> +
> +#else /* !CONFIG_JUMP_LABEL */
> +extern bool __percpu_embed_first_chunk;
> +#define is_embed_first_chunk   __percpu_embed_first_chunk
> +
> +#endif /* CONFIG_JUMP_LABEL */
>   #endif /* CONFIG_SMP */
>   #endif /* __powerpc64__ */
> 
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 2f19d5e944852..674b6e1bebe9a 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -834,6 +834,11 @@ static __init int pcpu_cpu_to_node(int cpu)
> 
>   unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
>   EXPORT_SYMBOL(__per_cpu_offset);
> +#ifdef CONFIG_JUMP_LABEL

Why this ifdef ? Even when CONFIG_JUMP_LABEL is not selected all this 
should just work fine.
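
/*
 * Background for the question above (editorial note, not part of the patch):
 * even with CONFIG_JUMP_LABEL disabled, static keys still compile and work --
 * static_key_enabled() then tests an atomic counter instead of relying on
 * runtime code patching -- so the unconditional
 *
 *	DEFINE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
 *
 * should be sufficient on its own.
 */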

> +DEFINE_STATIC_KEY_FALSE(__percpu_embed_first_chunk);
> +#else
> +bool __percpu_embed_first_chunk;
> +#endif
> 
>   void __init 

Re: [PATCH v3 RESEND 0/6] Add support for QMC HDLC

2024-02-12 Thread Herve Codina
Hi all,

I duplicated patches in this series :(
My bad, I made a mistake with 'git format-patch'.

Can you please consider only the "[PATCH v3 RESEND n/6]: xx" patches in this review.
The other patches ("RESEND PATCH v3") are the duplicated ones.

If that is OK, I will send a clean v4 series.
Of course, if some modifications are needed, I will also send a clean v4.

Let me know if a clean v4 is needed right now.

Sorry for this mistake.
Regards,
Hervé

On Mon, 12 Feb 2024 08:56:28 +0100
Herve Codina  wrote:

> Hi,
> 
> Note: Resent this v3 series with missing maintainers added in CC.
> 
> This series introduces the QMC HDLC support.
> 
> Patches were previously sent as part of a full feature series and were
> previously reviewed in that context:
> "Add support for QMC HDLC, framer infrastructure and PEF2256 framer" [1]
> 
> In order to ease the merge, the full feature series has been split and
> needed parts were merged in v6.8-rc1:
>  - "Prepare the PowerQUICC QMC and TSA for the HDLC QMC driver" [2]
>  - "Add support for framer infrastructure and PEF2256 framer" [3]
> 
> This series contains patches related to the QMC HDLC part (QMC HDLC
> driver):
>  - Introduce the QMC HDLC driver (patches 1 and 2)
>  - Add timeslots change support in QMC HDLC (patch 3)
>  - Add framer support as a framer consumer in QMC HDLC (patch 4)
> 
> Compared to the original full feature series, a modification was done on
> patch 3 in order to use a coherent prefix in the commit title.
> 
> I kept the patches unsquashed as they were previously sent and reviewed.
> Of course, I can squash them if needed.
> 
> Compared to the previous iteration:
>   
> https://lore.kernel.org/linux-kernel/20240130084035.115086-1-herve.cod...@bootlin.com/
> this v3 series:
> - Remove 'inline' function specifier from .c file.
> - Fixed a bug introduced in the previous iteration.
> - Remove one lock/unlock sequence in the QMC HDLC xmit path.
> - Use bitmap_from_u64().
> 
> Best regards,
> Hervé
> 
> [1]: 
> https://lore.kernel.org/linux-kernel/20231115144007.478111-1-herve.cod...@bootlin.com/
> [2]: 
> https://lore.kernel.org/linux-kernel/20231205152116.122512-1-herve.cod...@bootlin.com/
> [3]: 
> https://lore.kernel.org/linux-kernel/20231128132534.258459-1-herve.cod...@bootlin.com/
> 
> Changes v2 -> v3
>   - Patch 1
> Remove 'inline' function specifier from .c file.
> Fix a bug introduced when added WARN_ONCE(). The warn condition must
> be desc->skb (descriptor used) instead of !desc->skb.
> Remove a lock/unlock section locking the entire qmc_hdlc_xmit()
> function.
> 
>   - Patch 5
> Use bitmap_from_u64() everywhere instead of bitmap_from_arr32() and
> bitmap_from_arr64().
> 
> Changes v1 -> v2
>   - Patch 1
> Use the same qmc_hdlc initialisation in qmc_hcld_recv_complete()
> than the one present in qmc_hcld_xmit_complete().
> Use WARN_ONCE()
> 
>   - Patch 3 (new patch in v2)
> Make bitmap_onto() available to users
> 
>   - Patch 4 (new patch in v2)
> Introduce bitmap_off()
> 
>   - Patch 5 (patch 3 in v1)
> Use bitmap_*() functions
> 
>   - Patch 6 (patch 4 in v1)
> No changes
> 
> Changes compared to the full feature series:
>   - Patch 3
> Use 'net: wan: fsl_qmc_hdlc:' as commit title prefix
> 
> Patches extracted:
>   - Patch 1 : full feature series patch 7
>   - Patch 2 : full feature series patch 8
>   - Patch 3 : full feature series patch 20
>   - Patch 4 : full feature series patch 27
> 
> Herve Codina (6):
>   net: wan: Add support for QMC HDLC
>   MAINTAINERS: Add the Freescale QMC HDLC driver entry
>   bitmap: Make bitmap_onto() available to users
>   bitmap: Introduce bitmap_off()
>   net: wan: fsl_qmc_hdlc: Add runtime timeslots changes support
>   net: wan: fsl_qmc_hdlc: Add framer support
> 
>  MAINTAINERS|   7 +
>  drivers/net/wan/Kconfig|  12 +
>  drivers/net/wan/Makefile   |   1 +
>  drivers/net/wan/fsl_qmc_hdlc.c | 807 +
>  include/linux/bitmap.h |   3 +
>  lib/bitmap.c   |  45 +-
>  6 files changed, 874 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/net/wan/fsl_qmc_hdlc.c
> 



-- 
Hervé Codina, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com


[RESEND PATCH v3 6/6] net: wan: fsl_qmc_hdlc: Add framer support

2024-02-12 Thread Herve Codina
Add framer support in the fsl_qmc_hdlc driver in order to be able to
signal carrier changes to the network stack based on the framer status.
Also use this framer to provide information related to the E1/T1 line
interface on IF_GET_IFACE and configure the line interface according to
IF_IFACE_{E1,T1} information.

Signed-off-by: Herve Codina 
Reviewed-by: Christophe Leroy 
---
 drivers/net/wan/fsl_qmc_hdlc.c | 239 -
 1 file changed, 235 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c
index b25d918d5e4e..432b5111b106 100644
--- a/drivers/net/wan/fsl_qmc_hdlc.c
+++ b/drivers/net/wan/fsl_qmc_hdlc.c
@@ -9,6 +9,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -28,6 +29,9 @@ struct qmc_hdlc {
struct device *dev;
struct qmc_chan *qmc_chan;
struct net_device *netdev;
+   struct framer *framer;
+   spinlock_t carrier_lock; /* Protect carrier detection */
+   struct notifier_block nb;
bool is_crc32;
spinlock_t tx_lock; /* Protect tx descriptors */
struct qmc_hdlc_desc tx_descs[8];
@@ -41,6 +45,195 @@ static struct qmc_hdlc *netdev_to_qmc_hdlc(struct 
net_device *netdev)
return dev_to_hdlc(netdev)->priv;
 }
 
+static int qmc_hdlc_framer_set_carrier(struct qmc_hdlc *qmc_hdlc)
+{
+   struct framer_status framer_status;
+   unsigned long flags;
+   int ret;
+
+   if (!qmc_hdlc->framer)
+   return 0;
+
+   spin_lock_irqsave(&qmc_hdlc->carrier_lock, flags);
+
+   ret = framer_get_status(qmc_hdlc->framer, &framer_status);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "get framer status failed (%d)\n", ret);
+   goto end;
+   }
+   if (framer_status.link_is_on)
+   netif_carrier_on(qmc_hdlc->netdev);
+   else
+   netif_carrier_off(qmc_hdlc->netdev);
+
+end:
+   spin_unlock_irqrestore(&qmc_hdlc->carrier_lock, flags);
+   return ret;
+}
+
+static int qmc_hdlc_framer_notifier(struct notifier_block *nb, unsigned long 
action,
+   void *data)
+{
+   struct qmc_hdlc *qmc_hdlc = container_of(nb, struct qmc_hdlc, nb);
+   int ret;
+
+   if (action != FRAMER_EVENT_STATUS)
+   return NOTIFY_DONE;
+
+   ret = qmc_hdlc_framer_set_carrier(qmc_hdlc);
+   return ret ? NOTIFY_DONE : NOTIFY_OK;
+}
+
+static int qmc_hdlc_framer_start(struct qmc_hdlc *qmc_hdlc)
+{
+   struct framer_status framer_status;
+   int ret;
+
+   if (!qmc_hdlc->framer)
+   return 0;
+
+   ret = framer_power_on(qmc_hdlc->framer);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "framer power-on failed (%d)\n", ret);
+   return ret;
+   }
+
+   /* Be sure that get_status is supported */
+   ret = framer_get_status(qmc_hdlc->framer, &framer_status);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "get framer status failed (%d)\n", ret);
+   goto framer_power_off;
+   }
+
+   qmc_hdlc->nb.notifier_call = qmc_hdlc_framer_notifier;
+   ret = framer_notifier_register(qmc_hdlc->framer, &qmc_hdlc->nb);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "framer notifier register failed 
(%d)\n", ret);
+   goto framer_power_off;
+   }
+
+   return 0;
+
+framer_power_off:
+   framer_power_off(qmc_hdlc->framer);
+   return ret;
+}
+
+static void qmc_hdlc_framer_stop(struct qmc_hdlc *qmc_hdlc)
+{
+   if (!qmc_hdlc->framer)
+   return;
+
+   framer_notifier_unregister(qmc_hdlc->framer, &qmc_hdlc->nb);
+   framer_power_off(qmc_hdlc->framer);
+}
+
+static int qmc_hdlc_framer_set_iface(struct qmc_hdlc *qmc_hdlc, int if_iface,
+const te1_settings *te1)
+{
+   struct framer_config config;
+   int ret;
+
+   if (!qmc_hdlc->framer)
+   return 0;
+
+   ret = framer_get_config(qmc_hdlc->framer, &config);
+   if (ret)
+   return ret;
+
+   switch (if_iface) {
+   case IF_IFACE_E1:
+   config.iface = FRAMER_IFACE_E1;
+   break;
+   case IF_IFACE_T1:
+   config.iface = FRAMER_IFACE_T1;
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   switch (te1->clock_type) {
+   case CLOCK_DEFAULT:
+   /* Keep current value */
+   break;
+   case CLOCK_EXT:
+   config.clock_type = FRAMER_CLOCK_EXT;
+   break;
+   case CLOCK_INT:
+   config.clock_type = FRAMER_CLOCK_INT;
+   break;
+   default:
+   return -EINVAL;
+   }
+   config.line_clock_rate = te1->clock_rate;
+
+   return framer_set_config(qmc_hdlc->framer, &config);
+}
+
+static int qmc_hdlc_framer_get_iface(struct qmc_hdlc *qmc_hdlc, int *if_iface, 
te1_settings *te1)
+{
+   struct framer_config config;
+   int 

[RESEND PATCH v3 5/6] net: wan: fsl_qmc_hdlc: Add runtime timeslots changes support

2024-02-12 Thread Herve Codina
QMC channels support runtime timeslot changes, but nothing is done in
the QMC HDLC driver to handle these changes.

Use the existing IFACE ioctl in order to configure the timeslots to use.
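
(As an illustration, userspace could then select timeslots through the
generic HDLC SIOCWANDEV interface roughly as follows; the device name and
slot map are made up for this sketch:)

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/if.h>
#include <linux/hdlc/ioctl.h>
#include <linux/sockios.h>

int main(void)
{
	te1_settings te1 = {
		.clock_type = CLOCK_EXT,
		.slot_map = 0x0000fffe,		/* hypothetical: timeslots 1..15 */
	};
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "hdlc0", IFNAMSIZ - 1);	/* assumed device name */
	ifr.ifr_settings.type = IF_IFACE_E1;
	ifr.ifr_settings.size = sizeof(te1);
	ifr.ifr_settings.ifs_ifsu.te1 = &te1;

	if (ioctl(fd, SIOCWANDEV, &ifr))
		perror("SIOCWANDEV");
	return 0;
}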

Signed-off-by: Herve Codina 
Reviewed-by: Christophe Leroy 
Acked-by: Jakub Kicinski 
---
 drivers/net/wan/fsl_qmc_hdlc.c | 152 -
 1 file changed, 151 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c
index 835500910d1b..b25d918d5e4e 100644
--- a/drivers/net/wan/fsl_qmc_hdlc.c
+++ b/drivers/net/wan/fsl_qmc_hdlc.c
@@ -7,6 +7,7 @@
  * Author: Herve Codina 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -32,6 +33,7 @@ struct qmc_hdlc {
struct qmc_hdlc_desc tx_descs[8];
unsigned int tx_out;
struct qmc_hdlc_desc rx_descs[4];
+   u32 slot_map;
 };
 
 static struct qmc_hdlc *netdev_to_qmc_hdlc(struct net_device *netdev)
@@ -206,6 +208,144 @@ static netdev_tx_t qmc_hdlc_xmit(struct sk_buff *skb, 
struct net_device *netdev)
return ret;
 }
 
+static int qmc_hdlc_xlate_slot_map(struct qmc_hdlc *qmc_hdlc,
+  u32 slot_map, struct qmc_chan_ts_info 
*ts_info)
+{
+   DECLARE_BITMAP(ts_mask_avail, 64);
+   DECLARE_BITMAP(ts_mask, 64);
+   DECLARE_BITMAP(map, 64);
+
+   /* Tx and Rx available masks must be identical */
+   if (ts_info->rx_ts_mask_avail != ts_info->tx_ts_mask_avail) {
+   dev_err(qmc_hdlc->dev, "tx and rx available timeslots mismatch 
(0x%llx, 0x%llx)\n",
+   ts_info->rx_ts_mask_avail, ts_info->tx_ts_mask_avail);
+   return -EINVAL;
+   }
+
+   bitmap_from_u64(ts_mask_avail, ts_info->rx_ts_mask_avail);
+   bitmap_from_u64(map, slot_map);
+   bitmap_onto(ts_mask, map, ts_mask_avail, 64);
+
+   if (bitmap_weight(ts_mask, 64) != bitmap_weight(map, 64)) {
+   dev_err(qmc_hdlc->dev, "Cannot translate timeslots %*pb -> 
(%*pb, %*pb)\n",
+   64, map, 64, ts_mask_avail, 64, ts_mask);
+   return -EINVAL;
+   }
+
+   bitmap_to_arr64(&ts_info->tx_ts_mask, ts_mask, 64);
+   ts_info->rx_ts_mask = ts_info->tx_ts_mask;
+   return 0;
+}
+
+static int qmc_hdlc_xlate_ts_info(struct qmc_hdlc *qmc_hdlc,
+ const struct qmc_chan_ts_info *ts_info, u32 
*slot_map)
+{
+   DECLARE_BITMAP(ts_mask_avail, 64);
+   DECLARE_BITMAP(ts_mask, 64);
+   DECLARE_BITMAP(map, 64);
+   u32 array32[2];
+
+   /* Tx and Rx masks and available masks must be identical */
+   if (ts_info->rx_ts_mask_avail != ts_info->tx_ts_mask_avail) {
+   dev_err(qmc_hdlc->dev, "tx and rx available timeslots mismatch 
(0x%llx, 0x%llx)\n",
+   ts_info->rx_ts_mask_avail, ts_info->tx_ts_mask_avail);
+   return -EINVAL;
+   }
+   if (ts_info->rx_ts_mask != ts_info->tx_ts_mask) {
+   dev_err(qmc_hdlc->dev, "tx and rx timeslots mismatch (0x%llx, 
0x%llx)\n",
+   ts_info->rx_ts_mask, ts_info->tx_ts_mask);
+   return -EINVAL;
+   }
+
+   bitmap_from_u64(ts_mask_avail, ts_info->rx_ts_mask_avail);
+   bitmap_from_u64(ts_mask, ts_info->rx_ts_mask);
+   bitmap_off(map, ts_mask, ts_mask_avail, 64);
+
+   if (bitmap_weight(ts_mask, 64) != bitmap_weight(map, 64)) {
+   dev_err(qmc_hdlc->dev, "Cannot translate timeslots (%*pb, %*pb) 
-> %*pb\n",
+   64, ts_mask_avail, 64, ts_mask, 64, map);
+   return -EINVAL;
+   }
+
+   bitmap_to_arr32(array32, map, 64);
+   if (array32[1]) {
+   dev_err(qmc_hdlc->dev, "Slot map out of 32bit (%*pb, %*pb) -> 
%*pb\n",
+   64, ts_mask_avail, 64, ts_mask, 64, map);
+   return -EINVAL;
+   }
+
+   *slot_map = array32[0];
+   return 0;
+}
+
+static int qmc_hdlc_set_iface(struct qmc_hdlc *qmc_hdlc, int if_iface, const 
te1_settings *te1)
+{
+   struct qmc_chan_ts_info ts_info;
+   int ret;
+
+   ret = qmc_chan_get_ts_info(qmc_hdlc->qmc_chan, &ts_info);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "get QMC channel ts info failed %d\n", 
ret);
+   return ret;
+   }
+   ret = qmc_hdlc_xlate_slot_map(qmc_hdlc, te1->slot_map, &ts_info);
+   if (ret)
+   return ret;
+
+   ret = qmc_chan_set_ts_info(qmc_hdlc->qmc_chan, &ts_info);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "set QMC channel ts info failed %d\n", 
ret);
+   return ret;
+   }
+
+   qmc_hdlc->slot_map = te1->slot_map;
+
+   return 0;
+}
+
+static int qmc_hdlc_ioctl(struct net_device *netdev, struct if_settings *ifs)
+{
+   struct qmc_hdlc *qmc_hdlc = netdev_to_qmc_hdlc(netdev);
+   te1_settings te1;
+
+   switch (ifs->type) {
+   case IF_GET_IFACE:
+   ifs->type = IF_IFACE_E1;
+   if (ifs->size < 

[RESEND PATCH v3 3/6] bitmap: Make bitmap_onto() available to users

2024-02-12 Thread Herve Codina
Currently the bitmap_onto() is available only for CONFIG_NUMA=y case,
while some users may benefit out of it and being independent to NUMA
code.

Make it available to users by moving out of ifdeffery and exporting for
modules.

Signed-off-by: Herve Codina 
---
 lib/bitmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index 09522af227f1..2feccb5047dc 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -547,7 +547,6 @@ int bitmap_bitremap(int oldbit, const unsigned long *old,
 }
 EXPORT_SYMBOL(bitmap_bitremap);
 
-#ifdef CONFIG_NUMA
 /**
  * bitmap_onto - translate one bitmap relative to another
  * @dst: resulting translated bitmap
@@ -681,7 +680,9 @@ void bitmap_onto(unsigned long *dst, const unsigned long 
*orig,
m++;
}
 }
+EXPORT_SYMBOL(bitmap_onto);
 
+#ifdef CONFIG_NUMA
 /**
  * bitmap_fold - fold larger bitmap into smaller, modulo specified size
  * @dst: resulting smaller bitmap
-- 
2.43.0



[RESEND PATCH v3 1/6] net: wan: Add support for QMC HDLC

2024-02-12 Thread Herve Codina
The QMC HDLC driver provides support for HDLC using the QMC (QUICC
Multichannel Controller) to transfer the HDLC data.

Signed-off-by: Herve Codina 
Reviewed-by: Christophe Leroy 
Acked-by: Jakub Kicinski 
---
 drivers/net/wan/Kconfig|  12 +
 drivers/net/wan/Makefile   |   1 +
 drivers/net/wan/fsl_qmc_hdlc.c | 426 +
 3 files changed, 439 insertions(+)
 create mode 100644 drivers/net/wan/fsl_qmc_hdlc.c

diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig
index 7dda87756d3f..31ab2136cdf1 100644
--- a/drivers/net/wan/Kconfig
+++ b/drivers/net/wan/Kconfig
@@ -197,6 +197,18 @@ config FARSYNC
  To compile this driver as a module, choose M here: the
  module will be called farsync.
 
+config FSL_QMC_HDLC
+   tristate "Freescale QMC HDLC support"
+   depends on HDLC
+   depends on CPM_QMC
+   help
+ HDLC support using the Freescale QUICC Multichannel Controller (QMC).
+
+ To compile this driver as a module, choose M here: the
+ module will be called fsl_qmc_hdlc.
+
+ If unsure, say N.
+
 config FSL_UCC_HDLC
tristate "Freescale QUICC Engine HDLC support"
depends on HDLC
diff --git a/drivers/net/wan/Makefile b/drivers/net/wan/Makefile
index 8119b49d1da9..00e9b7ee1e01 100644
--- a/drivers/net/wan/Makefile
+++ b/drivers/net/wan/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_WANXL)   += wanxl.o
 obj-$(CONFIG_PCI200SYN)+= pci200syn.o
 obj-$(CONFIG_PC300TOO) += pc300too.o
 obj-$(CONFIG_IXP4XX_HSS)   += ixp4xx_hss.o
+obj-$(CONFIG_FSL_QMC_HDLC) += fsl_qmc_hdlc.o
 obj-$(CONFIG_FSL_UCC_HDLC) += fsl_ucc_hdlc.o
 obj-$(CONFIG_SLIC_DS26522) += slic_ds26522.o
 
diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c
new file mode 100644
index ..835500910d1b
--- /dev/null
+++ b/drivers/net/wan/fsl_qmc_hdlc.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Freescale QMC HDLC Device Driver
+ *
+ * Copyright 2023 CS GROUP France
+ *
+ * Author: Herve Codina 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct qmc_hdlc_desc {
+   struct net_device *netdev;
+   struct sk_buff *skb; /* NULL if the descriptor is not in use */
+   dma_addr_t dma_addr;
+   size_t dma_size;
+};
+
+struct qmc_hdlc {
+   struct device *dev;
+   struct qmc_chan *qmc_chan;
+   struct net_device *netdev;
+   bool is_crc32;
+   spinlock_t tx_lock; /* Protect tx descriptors */
+   struct qmc_hdlc_desc tx_descs[8];
+   unsigned int tx_out;
+   struct qmc_hdlc_desc rx_descs[4];
+};
+
+static struct qmc_hdlc *netdev_to_qmc_hdlc(struct net_device *netdev)
+{
+   return dev_to_hdlc(netdev)->priv;
+}
+
+static int qmc_hdlc_recv_queue(struct qmc_hdlc *qmc_hdlc, struct qmc_hdlc_desc 
*desc, size_t size);
+
+#define QMC_HDLC_RX_ERROR_FLAGS (QMC_RX_FLAG_HDLC_OVF | \
+QMC_RX_FLAG_HDLC_UNA | \
+QMC_RX_FLAG_HDLC_ABORT | \
+QMC_RX_FLAG_HDLC_CRC)
+
+static void qmc_hcld_recv_complete(void *context, size_t length, unsigned int 
flags)
+{
+   struct qmc_hdlc_desc *desc = context;
+   struct net_device *netdev = desc->netdev;
+   struct qmc_hdlc *qmc_hdlc = netdev_to_qmc_hdlc(netdev);
+   int ret;
+
+   dma_unmap_single(qmc_hdlc->dev, desc->dma_addr, desc->dma_size, 
DMA_FROM_DEVICE);
+
+   if (flags & QMC_HDLC_RX_ERROR_FLAGS) {
+   netdev->stats.rx_errors++;
+   if (flags & QMC_RX_FLAG_HDLC_OVF) /* Data overflow */
+   netdev->stats.rx_over_errors++;
+   if (flags & QMC_RX_FLAG_HDLC_UNA) /* bits received not multiple 
of 8 */
+   netdev->stats.rx_frame_errors++;
+   if (flags & QMC_RX_FLAG_HDLC_ABORT) /* Received an abort 
sequence */
+   netdev->stats.rx_frame_errors++;
+   if (flags & QMC_RX_FLAG_HDLC_CRC) /* CRC error */
+   netdev->stats.rx_crc_errors++;
+   kfree_skb(desc->skb);
+   } else {
+   netdev->stats.rx_packets++;
+   netdev->stats.rx_bytes += length;
+
+   skb_put(desc->skb, length);
+   desc->skb->protocol = hdlc_type_trans(desc->skb, netdev);
+   netif_rx(desc->skb);
+   }
+
+   /* Re-queue a transfer using the same descriptor */
+   ret = qmc_hdlc_recv_queue(qmc_hdlc, desc, desc->dma_size);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "queue recv desc failed (%d)\n", ret);
+   netdev->stats.rx_errors++;
+   }
+}
+
+static int qmc_hdlc_recv_queue(struct qmc_hdlc *qmc_hdlc, struct qmc_hdlc_desc 
*desc, size_t size)
+{
+   int ret;
+
+   desc->skb = dev_alloc_skb(size);
+   if (!desc->skb)
+   return -ENOMEM;
+
+   desc->dma_size = 

[PATCH v3 RESEND 6/6] net: wan: fsl_qmc_hdlc: Add framer support

2024-02-12 Thread Herve Codina
Add framer support in the fsl_qmc_hdlc driver in order to be able to
signal carrier changes to the network stack based on the framer status.
Also use this framer to provide information related to the E1/T1 line
interface on IF_GET_IFACE and configure the line interface according to
IF_IFACE_{E1,T1} information.

Signed-off-by: Herve Codina 
Reviewed-by: Christophe Leroy 
---
 drivers/net/wan/fsl_qmc_hdlc.c | 239 -
 1 file changed, 235 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wan/fsl_qmc_hdlc.c b/drivers/net/wan/fsl_qmc_hdlc.c
index b25d918d5e4e..432b5111b106 100644
--- a/drivers/net/wan/fsl_qmc_hdlc.c
+++ b/drivers/net/wan/fsl_qmc_hdlc.c
@@ -9,6 +9,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -28,6 +29,9 @@ struct qmc_hdlc {
struct device *dev;
struct qmc_chan *qmc_chan;
struct net_device *netdev;
+   struct framer *framer;
+   spinlock_t carrier_lock; /* Protect carrier detection */
+   struct notifier_block nb;
bool is_crc32;
spinlock_t tx_lock; /* Protect tx descriptors */
struct qmc_hdlc_desc tx_descs[8];
@@ -41,6 +45,195 @@ static struct qmc_hdlc *netdev_to_qmc_hdlc(struct 
net_device *netdev)
return dev_to_hdlc(netdev)->priv;
 }
 
+static int qmc_hdlc_framer_set_carrier(struct qmc_hdlc *qmc_hdlc)
+{
+   struct framer_status framer_status;
+   unsigned long flags;
+   int ret;
+
+   if (!qmc_hdlc->framer)
+   return 0;
+
+   spin_lock_irqsave(&qmc_hdlc->carrier_lock, flags);
+
+   ret = framer_get_status(qmc_hdlc->framer, &framer_status);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "get framer status failed (%d)\n", ret);
+   goto end;
+   }
+   if (framer_status.link_is_on)
+   netif_carrier_on(qmc_hdlc->netdev);
+   else
+   netif_carrier_off(qmc_hdlc->netdev);
+
+end:
+   spin_unlock_irqrestore(&qmc_hdlc->carrier_lock, flags);
+   return ret;
+}
+
+static int qmc_hdlc_framer_notifier(struct notifier_block *nb, unsigned long 
action,
+   void *data)
+{
+   struct qmc_hdlc *qmc_hdlc = container_of(nb, struct qmc_hdlc, nb);
+   int ret;
+
+   if (action != FRAMER_EVENT_STATUS)
+   return NOTIFY_DONE;
+
+   ret = qmc_hdlc_framer_set_carrier(qmc_hdlc);
+   return ret ? NOTIFY_DONE : NOTIFY_OK;
+}
+
+static int qmc_hdlc_framer_start(struct qmc_hdlc *qmc_hdlc)
+{
+   struct framer_status framer_status;
+   int ret;
+
+   if (!qmc_hdlc->framer)
+   return 0;
+
+   ret = framer_power_on(qmc_hdlc->framer);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "framer power-on failed (%d)\n", ret);
+   return ret;
+   }
+
+   /* Be sure that get_status is supported */
+   ret = framer_get_status(qmc_hdlc->framer, &framer_status);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "get framer status failed (%d)\n", ret);
+   goto framer_power_off;
+   }
+
+   qmc_hdlc->nb.notifier_call = qmc_hdlc_framer_notifier;
+   ret = framer_notifier_register(qmc_hdlc->framer, &qmc_hdlc->nb);
+   if (ret) {
+   dev_err(qmc_hdlc->dev, "framer notifier register failed 
(%d)\n", ret);
+   goto framer_power_off;
+   }
+
+   return 0;
+
+framer_power_off:
+   framer_power_off(qmc_hdlc->framer);
+   return ret;
+}
+
+static void qmc_hdlc_framer_stop(struct qmc_hdlc *qmc_hdlc)
+{
+   if (!qmc_hdlc->framer)
+   return;
+
+   framer_notifier_unregister(qmc_hdlc->framer, &qmc_hdlc->nb);
+   framer_power_off(qmc_hdlc->framer);
+}
+
+static int qmc_hdlc_framer_set_iface(struct qmc_hdlc *qmc_hdlc, int if_iface,
+const te1_settings *te1)
+{
+   struct framer_config config;
+   int ret;
+
+   if (!qmc_hdlc->framer)
+   return 0;
+
+   ret = framer_get_config(qmc_hdlc->framer, &config);
+   if (ret)
+   return ret;
+
+   switch (if_iface) {
+   case IF_IFACE_E1:
+   config.iface = FRAMER_IFACE_E1;
+   break;
+   case IF_IFACE_T1:
+   config.iface = FRAMER_IFACE_T1;
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   switch (te1->clock_type) {
+   case CLOCK_DEFAULT:
+   /* Keep current value */
+   break;
+   case CLOCK_EXT:
+   config.clock_type = FRAMER_CLOCK_EXT;
+   break;
+   case CLOCK_INT:
+   config.clock_type = FRAMER_CLOCK_INT;
+   break;
+   default:
+   return -EINVAL;
+   }
+   config.line_clock_rate = te1->clock_rate;
+
+   return framer_set_config(qmc_hdlc->framer, &config);
+}
+
+static int qmc_hdlc_framer_get_iface(struct qmc_hdlc *qmc_hdlc, int *if_iface, 
te1_settings *te1)
+{
+   struct framer_config config;
+   int 

[RESEND PATCH v3 4/6] bitmap: Introduce bitmap_off()

2024-02-12 Thread Herve Codina
The bitmap_onto() function translates one bitmap relative to another but
no function is present to perform the reverse translation.

Introduce bitmap_off() to fill this hole.

Signed-off-by: Herve Codina 
---
 include/linux/bitmap.h |  3 +++
 lib/bitmap.c   | 42 ++
 2 files changed, 45 insertions(+)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 99451431e4d6..5ecfcbbc91f4 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -65,6 +65,7 @@ struct device;
  *  bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
  *  bitmap_bitremap(oldbit, old, new, nbits)newbit = map(old, new)(oldbit)
  *  bitmap_onto(dst, orig, relmap, nbits)   *dst = orig relative to relmap
+ *  bitmap_off(dst, orig, relmap, nbits)*dst = bitmap_onto() reverse 
operation
  *  bitmap_fold(dst, orig, sz, nbits)   dst bits = orig bits mod sz
  *  bitmap_parse(buf, buflen, dst, nbits)   Parse bitmap dst from kernel 
buf
  *  bitmap_parse_user(ubuf, ulen, dst, nbits)   Parse bitmap dst from user buf
@@ -208,6 +209,8 @@ int bitmap_bitremap(int oldbit,
const unsigned long *old, const unsigned long *new, int bits);
 void bitmap_onto(unsigned long *dst, const unsigned long *orig,
const unsigned long *relmap, unsigned int bits);
+void bitmap_off(unsigned long *dst, const unsigned long *orig,
+   const unsigned long *relmap, unsigned int bits);
 void bitmap_fold(unsigned long *dst, const unsigned long *orig,
unsigned int sz, unsigned int nbits);
 
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 2feccb5047dc..71343967335e 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -682,6 +682,48 @@ void bitmap_onto(unsigned long *dst, const unsigned long 
*orig,
 }
 EXPORT_SYMBOL(bitmap_onto);
 
+/**
+ * bitmap_off - revert operation done by bitmap_onto()
+ * @dst: resulting translated bitmap
+ * @orig: original untranslated bitmap
+ * @relmap: bitmap relative to which translated
+ * @bits: number of bits in each of these bitmaps
+ *
+ * Suppose onto computed using bitmap_onto(onto, src, relmap, n)
+ * The operation bitmap_off(result, onto, relmap, n) leads to a
+ * result equal or equivalent to src.
+ *
+ * The result can be 'equivalent' because bitmap_onto() and
+ * bitmap_off() are not bijective.
+ * The result and src values are equivalent in that sense that a
+ * call to bitmap_onto(onto, src, relmap, n) and a call to
+ * bitmap_onto(onto, result, relmap, n) will lead to the same onto
+ * value.
+ *
+ * If either of @orig or @relmap is empty (no set bits), then @dst
+ * will be returned empty.
+ *
+ * All bits in @dst not set by the above rule are cleared.
+ */
+void bitmap_off(unsigned long *dst, const unsigned long *orig,
+   const unsigned long *relmap, unsigned int bits)
+{
+   unsigned int n, m;  /* same meaning as in above comment */
+
+   if (dst == orig)/* following doesn't handle inplace mappings */
+   return;
+   bitmap_zero(dst, bits);
+
+   m = 0;
+   for_each_set_bit(n, relmap, bits) {
+   /* m == bitmap_pos_to_ord(relmap, n, bits) */
+   if (test_bit(n, orig))
+   set_bit(m, dst);
+   m++;
+   }
+}
+EXPORT_SYMBOL(bitmap_off);
+
 #ifdef CONFIG_NUMA
 /**
  * bitmap_fold - fold larger bitmap into smaller, modulo specified size
-- 
2.43.0



[RESEND PATCH v3 0/6] Add support for QMC HDLC

2024-02-12 Thread Herve Codina
Hi,

Note: Resent this v3 series with missing maintainers added in CC.

This series introduces the QMC HDLC support.

Patches were previously sent as part of a full feature series and were
previously reviewed in that context:
"Add support for QMC HDLC, framer infrastructure and PEF2256 framer" [1]

In order to ease the merge, the full feature series has been split and
needed parts were merged in v6.8-rc1:
 - "Prepare the PowerQUICC QMC and TSA for the HDLC QMC driver" [2]
 - "Add support for framer infrastructure and PEF2256 framer" [3]

This series contains patches related to the QMC HDLC part (QMC HDLC
driver):
 - Introduce the QMC HDLC driver (patches 1 and 2)
 - Add timeslots change support in QMC HDLC (patch 3)
 - Add framer support as a framer consumer in QMC HDLC (patch 4)

Compared to the original full feature series, a modification was done on
patch 3 in order to use a coherent prefix in the commit title.

I kept the patches unsquashed as they were previously sent and reviewed.
Of course, I can squash them if needed.

Compared to the previous iteration:
  
https://lore.kernel.org/linux-kernel/20240130084035.115086-1-herve.cod...@bootlin.com/
this v3 series:
- Remove 'inline' function specifier from .c file.
- Fixed a bug introduced in the previous iteration.
- Remove one lock/unlock sequence in the QMC HDLC xmit path.
- Use bitmap_from_u64().

Best regards,
Hervé

[1]: 
https://lore.kernel.org/linux-kernel/20231115144007.478111-1-herve.cod...@bootlin.com/
[2]: 
https://lore.kernel.org/linux-kernel/20231205152116.122512-1-herve.cod...@bootlin.com/
[3]: 
https://lore.kernel.org/linux-kernel/20231128132534.258459-1-herve.cod...@bootlin.com/

Changes v2 -> v3
  - Patch 1
Remove 'inline' function specifier from .c file.
Fix a bug introduced when adding WARN_ONCE(). The warn condition must
be desc->skb (descriptor in use) instead of !desc->skb.
Remove a lock/unlock section locking the entire qmc_hdlc_xmit()
function.

  - Patch 5
Use bitmap_from_u64() everywhere instead of bitmap_from_arr32() and
bitmap_from_arr64().

Changes v1 -> v2
  - Patch 1
Use the same qmc_hdlc initialisation in qmc_hcld_recv_complete()
as the one present in qmc_hcld_xmit_complete().
Use WARN_ONCE()

  - Patch 3 (new patch in v2)
Make bitmap_onto() available to users

  - Patch 4 (new patch in v2)
Introduce bitmap_off()

  - Patch 5 (patch 3 in v1)
Use bitmap_*() functions

  - Patch 6 (patch 4 in v1)
No changes

Changes compared to the full feature series:
  - Patch 3
Use 'net: wan: fsl_qmc_hdlc:' as commit title prefix

Patches extracted:
  - Patch 1 : full feature series patch 7
  - Patch 2 : full feature series patch 8
  - Patch 3 : full feature series patch 20
  - Patch 4 : full feature series patch 27

Herve Codina (6):
  net: wan: Add support for QMC HDLC
  MAINTAINERS: Add the Freescale QMC HDLC driver entry
  bitmap: Make bitmap_onto() available to users
  bitmap: Introduce bitmap_off()
  net: wan: fsl_qmc_hdlc: Add runtime timeslots changes support
  net: wan: fsl_qmc_hdlc: Add framer support

 MAINTAINERS|   7 +
 drivers/net/wan/Kconfig|  12 +
 drivers/net/wan/Makefile   |   1 +
 drivers/net/wan/fsl_qmc_hdlc.c | 807 +
 include/linux/bitmap.h |   3 +
 lib/bitmap.c   |  45 +-
 6 files changed, 874 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/wan/fsl_qmc_hdlc.c

-- 
2.43.0



[PATCH v3 RESEND 4/6] bitmap: Introduce bitmap_off()

2024-02-12 Thread Herve Codina
The bitmap_onto() function translates one bitmap relative to another, but
no function is present to perform the reverse translation.

Introduce bitmap_off() to fill this hole.

Signed-off-by: Herve Codina 
---
 include/linux/bitmap.h |  3 +++
 lib/bitmap.c   | 42 ++
 2 files changed, 45 insertions(+)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 99451431e4d6..5ecfcbbc91f4 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -65,6 +65,7 @@ struct device;
  *  bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
  *  bitmap_bitremap(oldbit, old, new, nbits)newbit = map(old, new)(oldbit)
  *  bitmap_onto(dst, orig, relmap, nbits)   *dst = orig relative to relmap
+ *  bitmap_off(dst, orig, relmap, nbits)*dst = bitmap_onto() reverse operation
  *  bitmap_fold(dst, orig, sz, nbits)   dst bits = orig bits mod sz
 *  bitmap_parse(buf, buflen, dst, nbits)   Parse bitmap dst from kernel buf
  *  bitmap_parse_user(ubuf, ulen, dst, nbits)   Parse bitmap dst from user buf
@@ -208,6 +209,8 @@ int bitmap_bitremap(int oldbit,
const unsigned long *old, const unsigned long *new, int bits);
 void bitmap_onto(unsigned long *dst, const unsigned long *orig,
const unsigned long *relmap, unsigned int bits);
+void bitmap_off(unsigned long *dst, const unsigned long *orig,
+   const unsigned long *relmap, unsigned int bits);
 void bitmap_fold(unsigned long *dst, const unsigned long *orig,
unsigned int sz, unsigned int nbits);
 
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 2feccb5047dc..71343967335e 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -682,6 +682,48 @@ void bitmap_onto(unsigned long *dst, const unsigned long *orig,
 }
 EXPORT_SYMBOL(bitmap_onto);
 
+/**
+ * bitmap_off - revert operation done by bitmap_onto()
+ * @dst: resulting translated bitmap
+ * @orig: original untranslated bitmap
+ * @relmap: bitmap relative to which translated
+ * @bits: number of bits in each of these bitmaps
+ *
+ * Suppose onto was computed using bitmap_onto(onto, src, relmap, n).
+ * The operation bitmap_off(result, onto, relmap, n) leads to a
+ * result equal or equivalent to src.
+ *
+ * The result can be 'equivalent' because bitmap_onto() and
+ * bitmap_off() are not bijective.
+ * The result and src values are equivalent in the sense that a
+ * call to bitmap_onto(onto, src, relmap, n) and a call to
+ * bitmap_onto(onto, result, relmap, n) will lead to the same onto
+ * value.
+ *
+ * If either of @orig or @relmap is empty (no set bits), then @dst
+ * will be returned empty.
+ *
+ * All bits in @dst not set by the above rule are cleared.
+ */
+void bitmap_off(unsigned long *dst, const unsigned long *orig,
+   const unsigned long *relmap, unsigned int bits)
+{
+   unsigned int n, m;  /* same meaning as in above comment */
+
+   if (dst == orig)/* following doesn't handle inplace mappings */
+   return;
+   bitmap_zero(dst, bits);
+
+   m = 0;
+   for_each_set_bit(n, relmap, bits) {
+   /* m == bitmap_pos_to_ord(relmap, n, bits) */
+   if (test_bit(n, orig))
+   set_bit(m, dst);
+   m++;
+   }
+}
+EXPORT_SYMBOL(bitmap_off);
+
 #ifdef CONFIG_NUMA
 /**
  * bitmap_fold - fold larger bitmap into smaller, modulo specified size
-- 
2.43.0
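
As a complement to the round-trip sketch given earlier, the following
illustrative snippet (again not part of the patch, with a made-up function
name) shows why the kernel-doc says "equal or equivalent": src bits whose
ordinal exceeds the weight of @relmap are dropped by bitmap_onto(), so
bitmap_off() cannot recover them, yet the recovered bitmap still maps onto
the same value.

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/printk.h>

/* Hypothetical demo of the non-bijective case, not part of the series. */
static void bitmap_off_equivalence_demo(void)
{
	DECLARE_BITMAP(relmap, 16);
	DECLARE_BITMAP(src, 16);
	DECLARE_BITMAP(onto, 16);
	DECLARE_BITMAP(back, 16);
	DECLARE_BITMAP(onto2, 16);

	bitmap_zero(relmap, 16);
	bitmap_zero(src, 16);

	/* relmap has only two set bits, at positions 2 and 4 */
	__set_bit(2, relmap);
	__set_bit(4, relmap);

	/* src sets ordinals 0 and 5; ordinal 5 exceeds the weight of relmap */
	__set_bit(0, src);
	__set_bit(5, src);

	/* bitmap_onto() drops ordinal 5, so onto only has bit 2 set */
	bitmap_onto(onto, src, relmap, 16);

	/* back only has bit 0 set: equivalent to src, but not equal */
	bitmap_off(back, onto, relmap, 16);

	/* mapping back through bitmap_onto() gives the same onto value */
	bitmap_onto(onto2, back, relmap, 16);

	pr_info("equal to src: %d, same onto value: %d\n",
		bitmap_equal(src, back, 16), bitmap_equal(onto, onto2, 16));
}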


