Re: [PATCH 01/19] vdso: Consolidate vdso_calc_delta()

2024-03-08 Thread Christophe Leroy


Le 08/03/2024 à 14:14, Adrian Hunter a écrit :
> [Vous ne recevez pas souvent de courriers de adrian.hun...@intel.com. 
> Découvrez pourquoi ceci est important à 
> https://aka.ms/LearnAboutSenderIdentification ]
> 
> Consolidate vdso_calc_delta(), in preparation for further simplification.
> 
> Suggested-by: Thomas Gleixner 
> Signed-off-by: Adrian Hunter 
> ---
>   arch/powerpc/include/asm/vdso/gettimeofday.h | 17 ++---
>   arch/s390/include/asm/vdso/gettimeofday.h|  7 ++-
>   lib/vdso/gettimeofday.c  |  4 
>   3 files changed, 8 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h 
> b/arch/powerpc/include/asm/vdso/gettimeofday.h
> index f0a4cf01e85c..f4da8e18cdf3 100644
> --- a/arch/powerpc/include/asm/vdso/gettimeofday.h
> +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
> @@ -14,6 +14,8 @@
> 
>   #define VDSO_HAS_TIME  1
> 
> +#define VDSO_DELTA_NOMASK  1
> +
>   static __always_inline int do_syscall_2(const unsigned long _r0, const 
> unsigned long _r3,
>  const unsigned long _r4)
>   {
> @@ -105,21 +107,6 @@ static inline bool vdso_clocksource_ok(const struct 
> vdso_data *vd)
>   }
>   #define vdso_clocksource_ok vdso_clocksource_ok
> 
> -/*
> - * powerpc specific delta calculation.
> - *
> - * This variant removes the masking of the subtraction because the
> - * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
> - * which would result in a pointless operation. The compiler cannot
> - * optimize it away as the mask comes from the vdso data and is not compile
> - * time constant.
> - */

Please keep the comment. You can move it close to VDSO_DELTA_NOMASK

> -static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, 
> u32 mult)
> -{
> -   return (cycles - last) * mult;
> -}
> -#define vdso_calc_delta vdso_calc_delta
> -
>   #ifndef __powerpc64__
>   static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
>   {
> diff --git a/arch/s390/include/asm/vdso/gettimeofday.h 
> b/arch/s390/include/asm/vdso/gettimeofday.h
> index db84942eb78f..7937765ccfa5 100644
> --- a/arch/s390/include/asm/vdso/gettimeofday.h
> +++ b/arch/s390/include/asm/vdso/gettimeofday.h
> @@ -6,16 +6,13 @@
> 
>   #define VDSO_HAS_CLOCK_GETRES 1
> 
> +#define VDSO_DELTA_NOMASK 1
> +
>   #include 
>   #include 
>   #include 
>   #include 
> 
> -#define vdso_calc_delta __arch_vdso_calc_delta
> -static __always_inline u64 __arch_vdso_calc_delta(u64 cycles, u64 last, u64 
> mask, u32 mult)
> -{
> -   return (cycles - last) * mult;
> -}
> 
>   static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
>   {
> diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
> index ce2f69552003..042b95e8164d 100644
> --- a/lib/vdso/gettimeofday.c
> +++ b/lib/vdso/gettimeofday.c
> @@ -13,7 +13,11 @@
>   static __always_inline
>   u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
>   {
> +#ifdef VDSO_DELTA_NOMASK
> +   return (cycles - last) * mult;
> +#else
>  return ((cycles - last) & mask) * mult;
> +#endif

See 
https://docs.kernel.org/process/coding-style.html#conditional-compilation

You don't need #ifdefs here.

One solution is to define VDSO_DELTA_NOMASK to 0 in 
include/vdso/datapage.h after including asm/vdso/gettimeofday.h :

#ifndef VDSO_DELTA_NOMASK
#define VDSO_DELTA_NOMASK 0
#endif

Then

u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
{
if (VDSO_DELTA_NOMASK)
mask = ~0ULL;

return ((cycles - last) & mask) * mult;
}

or

u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
{
if (VDSO_DELTA_NOMASK)
return (cycles - last) * mult;

return ((cycles - last) & mask) * mult;
}




>   }
>   #endif
> 
> --
> 2.34.1
> 


[PATCH 3/3] tools/perf/arch/powerc: Add get_arch_regnum for powerpc

2024-03-08 Thread Athira Rajeev
The function get_dwarf_regnum() returns a DWARF register number
from a register name string. This calls arch specific function
get_arch_regnum to return register number for corresponding arch.
Add mappings for register name to register number in powerpc code:
arch/powerpc/util/dwarf-regs.c

Signed-off-by: Athira Rajeev 
---
 tools/perf/arch/powerpc/util/dwarf-regs.c | 29 +++
 1 file changed, 29 insertions(+)

diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c 
b/tools/perf/arch/powerpc/util/dwarf-regs.c
index 0c4f4caf53ac..d955e3e577ea 100644
--- a/tools/perf/arch/powerpc/util/dwarf-regs.c
+++ b/tools/perf/arch/powerpc/util/dwarf-regs.c
@@ -98,3 +98,32 @@ int regs_query_register_offset(const char *name)
return roff->ptregs_offset;
return -EINVAL;
 }
+
+struct dwarf_regs_idx {
+   const char *name;
+   int idx;
+};
+
+static const struct dwarf_regs_idx powerpc_regidx_table[] = {
+   { "r0", 0 }, { "r1", 1 }, { "r2", 2 }, { "r3", 3 }, { "r4", 4 },
+   { "r5", 5 }, { "r6", 6 }, { "r7", 7 }, { "r8", 8 }, { "r9", 9 },
+   { "r10", 10 }, { "r11", 11 }, { "r12", 12 }, { "r13", 13 }, { "r14", 14 
},
+   { "r15", 15 }, { "r16", 16 }, { "r17", 17 }, { "r18", 18 }, { "r19", 19 
},
+   { "r20", 20 }, { "r21", 21 }, { "r22", 22 }, { "r23", 23 }, { "r24", 24 
},
+   { "r25", 25 }, { "r26", 26 }, { "r27", 27 }, { "r27", 27 }, { "r28", 28 
},
+   { "r29", 29 }, { "r30", 30 }, { "r31", 31 },
+};
+
+int get_arch_regnum(const char *name)
+{
+   unsigned int i;
+
+   if (*name != 'r')
+   return -EINVAL;
+
+   for (i = 0; i < ARRAY_SIZE(powerpc_regidx_table); i++)
+   if (!strcmp(powerpc_regidx_table[i].name, name))
+   return powerpc_regidx_table[i].idx;
+
+   return -ENOENT;
+}
-- 
2.43.0



[PATCH 2/3] tools/erf/util/annotate: Set register_char and memory_ref_char for powerpc

2024-03-08 Thread Athira Rajeev
To identify if the instruction has any memory reference,
"memory_ref_char" field needs to be set for specific architecture.
Example memory instruction:
lwz r10,0(r9)

Here "(" is the memory_ref_char. Set this as part of arch->objdump

To get register number and access offset from the given instruction,
arch->objdump.register_char is used. In case of powerpc, the register
char is "r" (since reg names are r0 to r31). Hence set register_char
as "r".

Signed-off-by: Athira Rajeev 
---
 tools/perf/util/annotate.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index ac002d907d81..d69bd6edafcb 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -216,6 +216,11 @@ static struct arch architectures[] = {
{
.name = "powerpc",
.init = powerpc__annotate_init,
+   .objdump =  {
+   .comment_char = '#',
+   .register_char = 'r',
+   .memory_ref_char = '(',
+   },
},
{
.name = "riscv64",
-- 
2.43.0



[PATCH 1/3] tools/perf/arch/powerpc: Add load/store in powerpc annotate instructions for data type profling

2024-03-08 Thread Athira Rajeev
Add powerpc instruction nmemonic table to associate load/store
instructions with move_ops. mov_ops is used to identify mem_type
to associate instruction with data type and offset. Also initialize
and allocate arch specific fields for nr_instructions, instructions and
nr_instructions_allocate.

Signed-off-by: Athira Rajeev 
---
 .../perf/arch/powerpc/annotate/instructions.c | 66 +++
 1 file changed, 66 insertions(+)

diff --git a/tools/perf/arch/powerpc/annotate/instructions.c 
b/tools/perf/arch/powerpc/annotate/instructions.c
index a3f423c27cae..07af4442be38 100644
--- a/tools/perf/arch/powerpc/annotate/instructions.c
+++ b/tools/perf/arch/powerpc/annotate/instructions.c
@@ -1,6 +1,65 @@
 // SPDX-License-Identifier: GPL-2.0
 #include 
 
+/*
+ * powerpc instruction nmemonic table to associate load/store instructions with
+ * move_ops. mov_ops is used to identify mem_type to associate instruction with
+ * data type and offset.
+ */
+static struct ins powerpc__instructions[] = {
+   { .name = "lbz",.ops = _ops,  },
+   { .name = "lbzx",   .ops = _ops,  },
+   { .name = "lbzu",   .ops = _ops,  },
+   { .name = "lbzux",  .ops = _ops,  },
+   { .name = "lhz",.ops = _ops,  },
+   { .name = "lhzx",   .ops = _ops,  },
+   { .name = "lhzu",   .ops = _ops,  },
+   { .name = "lhzux",  .ops = _ops,  },
+   { .name = "lha",.ops = _ops,  },
+   { .name = "lhax",   .ops = _ops,  },
+   { .name = "lhau",   .ops = _ops,  },
+   { .name = "lhaux",  .ops = _ops,  },
+   { .name = "lwz",.ops = _ops,  },
+   { .name = "lwzx",   .ops = _ops,  },
+   { .name = "lwzu",   .ops = _ops,  },
+   { .name = "lwzux",  .ops = _ops,  },
+   { .name = "lwa",.ops = _ops,  },
+   { .name = "lwax",   .ops = _ops,  },
+   { .name = "lwaux",  .ops = _ops,  },
+   { .name = "ld", .ops = _ops,  },
+   { .name = "ldx",.ops = _ops,  },
+   { .name = "ldu",.ops = _ops,  },
+   { .name = "ldux",   .ops = _ops,  },
+   { .name = "stb",.ops = _ops,  },
+   { .name = "stbx",   .ops = _ops,  },
+   { .name = "stbu",   .ops = _ops,  },
+   { .name = "stbux",  .ops = _ops,  },
+   { .name = "sth",.ops = _ops,  },
+   { .name = "sthx",   .ops = _ops,  },
+   { .name = "sthu",   .ops = _ops,  },
+   { .name = "sthux",  .ops = _ops,  },
+   { .name = "stw",.ops = _ops,  },
+   { .name = "stwx",   .ops = _ops,  },
+   { .name = "stwu",   .ops = _ops,  },
+   { .name = "stwux",  .ops = _ops,  },
+   { .name = "std",.ops = _ops,  },
+   { .name = "stdx",   .ops = _ops,  },
+   { .name = "stdu",   .ops = _ops,  },
+   { .name = "stdux",  .ops = _ops,  },
+   { .name = "lhbrx",  .ops = _ops,  },
+   { .name = "sthbrx", .ops = _ops,  },
+   { .name = "lwbrx",  .ops = _ops,  },
+   { .name = "stwbrx", .ops = _ops,  },
+   { .name = "ldbrx",  .ops = _ops,  },
+   { .name = "stdbrx", .ops = _ops,  },
+   { .name = "lmw",.ops = _ops,  },
+   { .name = "stmw",   .ops = _ops,  },
+   { .name = "lswi",   .ops = _ops,  },
+   { .name = "lswx",   .ops = _ops,  },
+   { .name = "stswi",  .ops = _ops,  },
+   { .name = "stswx",  .ops = _ops,  },
+};
+
 static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, 
const char *name)
 {
int i;
@@ -52,6 +111,13 @@ static struct ins_ops 
*powerpc__associate_instruction_ops(struct arch *arch, con
 static int powerpc__annotate_init(struct arch *arch, char *cpuid 
__maybe_unused)
 {
if (!arch->initialized) {
+   arch->nr_instructions = ARRAY_SIZE(powerpc__instructions);
+   arch->instructions = calloc(arch->nr_instructions, 
sizeof(struct ins));
+   if (arch->instructions == NULL)
+   return -ENOMEM;
+
+   memcpy(arch->instructions, (struct ins *)powerpc__instructions, 
sizeof(struct ins) * arch->nr_instructions);
+   arch->nr_instructions_allocated = arch->nr_instructions;
arch->initialized = true;
arch->associate_instruction_ops = 
powerpc__associate_instruction_ops;
arch->objdump.comment_char  = '#';
-- 
2.43.0



[PATCH 0/3] Add data type profiling support for powerpc

2024-03-08 Thread Athira Rajeev
The patchset from Namhyung added support for data type profiling
in perf tool. This enabled support to associate PMU samples to data
types they refer using DWARF debug information. With the upstream
perf, currently it possible to run perf report or perf annotate to
view the data type information on x86.

This patchset includes changes need to enable data type profiling
support for powerpc. Main change are:
1. powerpc instruction nmemonic table to associate load/store
instructions with move_ops which is use to identify if instruction
is a memory access one.
2. To get register number and access offset from the given
instruction, code uses fields from "struct arch" -> objump.
Add entry for powerpc here.
3. A get_arch_regnum to return register number from the
register name string.

These three patches are the basic foundational patches.
With these changes, data types is getting identified for kernel
and user-space samples. There are still samples, which comes as
"unknown" and needs to be checked. We plan to get those addressed
in follow up. With the current patchset:

 ./perf record -a -e mem-loads sleep 1
 ./perf report -s type,typeoff --hierarchy --group --stdio
Snippet of logs:

 Total Lost Samples: 0

 Samples: 277  of events 'mem-loads, dummy:u'
 Event count (approx.): 149813

Overhead  Data Type / Data Type Offset
 ...  

65.93%   0.00% (unknown)
   65.93%   0.00% (unknown) +0 (no field)
 8.19%   0.00% struct vm_area_struct
8.19%   0.00% struct vm_area_struct +136 (vm_file)
 4.53%   0.00% struct rq
3.14%   0.00% struct rq +0 (__lock.raw_lock.val)
0.83%   0.00% struct rq +3216 (avg_irq.runnable_sum)
0.24%   0.00% struct rq +4 (nr_running)
0.14%   0.00% struct rq +12 (nr_preferred_running)
0.12%   0.00% struct rq +2760 (sd)
0.06%   0.00% struct rq +3368 (prev_steal_time_rq)
0.01%   0.00% struct rq +2592 (curr)
 3.53%   0.00% struct rb_node
3.53%   0.00% struct rb_node +0 (__rb_parent_color)
 3.43%   0.00% struct slab
3.43%   0.00% struct slab +32 (freelist)
 3.30%   0.00% unsigned int
3.30%   0.00% unsigned int +0 (no field)
 3.22%   0.00% struct vm_fault
3.22%   0.00% struct vm_fault +48 (pmd)
 2.55%   0.00% unsigned char
2.55%   0.00% unsigned char +0 (no field)
 1.06%   0.00% struct task_struct
1.06%   0.00% struct task_struct +4 (thread_info.cpu)
 0.92%   0.00% void*
0.92%   0.00% void* +0 (no field)
 0.74%   0.00% __int128 unsigned
0.74%   0.00% __int128 unsigned +8 (no field)
 0.59%   0.00% struct perf_event
0.54%   0.00% struct perf_event +552 (ctx)
0.04%   0.00% struct perf_event +152 (pmu)
 0.20%   0.00% struct sched_entity
0.20%   0.00% struct sched_entity +0 (load.weight)
 0.18%   0.00% struct cfs_rq
0.18%   0.00% struct cfs_rq +96 (curr)

Thanks
Athira

Athira Rajeev (3):
  tools/perf/arch/powerpc: Add load/store in powerpc annotate
instructions for data type profling
  tools/erf/util/annotate: Set register_char and memory_ref_char for
powerpc
  tools/perf/arch/powerc: Add get_arch_regnum for powerpc

 .../perf/arch/powerpc/annotate/instructions.c | 66 +++
 tools/perf/arch/powerpc/util/dwarf-regs.c | 29 
 tools/perf/util/annotate.c|  5 ++
 3 files changed, 100 insertions(+)

-- 
2.43.0



Re: [PATCH 1/3] tools/perf/arch/powerpc: Add load/store in powerpc annotate instructions for data type profling

2024-03-08 Thread Athira Rajeev
Hi All,

Please ignore this version. I made mistake in cover letter. I am re-posting the 
correct version now.
Sorry for the confusion

Thanks
Athira

> On 09-Mar-2024, at 11:21 AM, Athira Rajeev  
> wrote:
> 
> Add powerpc instruction nmemonic table to associate load/store
> instructions with move_ops. mov_ops is used to identify mem_type
> to associate instruction with data type and offset. Also initialize
> and allocate arch specific fields for nr_instructions, instructions and
> nr_instructions_allocate.
> 
> Signed-off-by: Athira Rajeev 
> ---
> .../perf/arch/powerpc/annotate/instructions.c | 66 +++
> 1 file changed, 66 insertions(+)
> 
> diff --git a/tools/perf/arch/powerpc/annotate/instructions.c 
> b/tools/perf/arch/powerpc/annotate/instructions.c
> index a3f423c27cae..07af4442be38 100644
> --- a/tools/perf/arch/powerpc/annotate/instructions.c
> +++ b/tools/perf/arch/powerpc/annotate/instructions.c
> @@ -1,6 +1,65 @@
> // SPDX-License-Identifier: GPL-2.0
> #include 
> 
> +/*
> + * powerpc instruction nmemonic table to associate load/store instructions 
> with
> + * move_ops. mov_ops is used to identify mem_type to associate instruction 
> with
> + * data type and offset.
> + */
> +static struct ins powerpc__instructions[] = {
> + { .name = "lbz", .ops = _ops,  },
> + { .name = "lbzx", .ops = _ops,  },
> + { .name = "lbzu", .ops = _ops,  },
> + { .name = "lbzux", .ops = _ops,  },
> + { .name = "lhz", .ops = _ops,  },
> + { .name = "lhzx", .ops = _ops,  },
> + { .name = "lhzu", .ops = _ops,  },
> + { .name = "lhzux", .ops = _ops,  },
> + { .name = "lha", .ops = _ops,  },
> + { .name = "lhax", .ops = _ops,  },
> + { .name = "lhau", .ops = _ops,  },
> + { .name = "lhaux", .ops = _ops,  },
> + { .name = "lwz", .ops = _ops,  },
> + { .name = "lwzx", .ops = _ops,  },
> + { .name = "lwzu", .ops = _ops,  },
> + { .name = "lwzux", .ops = _ops,  },
> + { .name = "lwa", .ops = _ops,  },
> + { .name = "lwax", .ops = _ops,  },
> + { .name = "lwaux", .ops = _ops,  },
> + { .name = "ld", .ops = _ops,  },
> + { .name = "ldx", .ops = _ops,  },
> + { .name = "ldu", .ops = _ops,  },
> + { .name = "ldux", .ops = _ops,  },
> + { .name = "stb", .ops = _ops,  },
> + { .name = "stbx", .ops = _ops,  },
> + { .name = "stbu", .ops = _ops,  },
> + { .name = "stbux", .ops = _ops,  },
> + { .name = "sth", .ops = _ops,  },
> + { .name = "sthx", .ops = _ops,  },
> + { .name = "sthu", .ops = _ops,  },
> + { .name = "sthux", .ops = _ops,  },
> + { .name = "stw", .ops = _ops,  },
> + { .name = "stwx", .ops = _ops,  },
> + { .name = "stwu", .ops = _ops,  },
> + { .name = "stwux", .ops = _ops,  },
> + { .name = "std", .ops = _ops,  },
> + { .name = "stdx", .ops = _ops,  },
> + { .name = "stdu", .ops = _ops,  },
> + { .name = "stdux", .ops = _ops,  },
> + { .name = "lhbrx", .ops = _ops,  },
> + { .name = "sthbrx", .ops = _ops,  },
> + { .name = "lwbrx", .ops = _ops,  },
> + { .name = "stwbrx", .ops = _ops,  },
> + { .name = "ldbrx", .ops = _ops,  },
> + { .name = "stdbrx", .ops = _ops,  },
> + { .name = "lmw", .ops = _ops,  },
> + { .name = "stmw", .ops = _ops,  },
> + { .name = "lswi", .ops = _ops,  },
> + { .name = "lswx", .ops = _ops,  },
> + { .name = "stswi", .ops = _ops,  },
> + { .name = "stswx", .ops = _ops,  },
> +};
> +
> static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, 
> const char *name)
> {
> int i;
> @@ -52,6 +111,13 @@ static struct ins_ops 
> *powerpc__associate_instruction_ops(struct arch *arch, con
> static int powerpc__annotate_init(struct arch *arch, char *cpuid 
> __maybe_unused)
> {
> if (!arch->initialized) {
> + arch->nr_instructions = ARRAY_SIZE(powerpc__instructions);
> + arch->instructions = calloc(arch->nr_instructions, sizeof(struct ins));
> + if (arch->instructions == NULL)
> + return -ENOMEM;
> +
> + memcpy(arch->instructions, (struct ins *)powerpc__instructions, 
> sizeof(struct ins) * arch->nr_instructions);
> + arch->nr_instructions_allocated = arch->nr_instructions;
> arch->initialized = true;
> arch->associate_instruction_ops = powerpc__associate_instruction_ops;
> arch->objdump.comment_char  = '#';
> -- 
> 2.43.0
> 



[PATCH 3/3] tools/perf/arch/powerc: Add get_arch_regnum for powerpc

2024-03-08 Thread Athira Rajeev
The function get_dwarf_regnum() returns a DWARF register number
from a register name string. This calls arch specific function
get_arch_regnum to return register number for corresponding arch.
Add mappings for register name to register number in powerpc code:
arch/powerpc/util/dwarf-regs.c

Signed-off-by: Athira Rajeev 
---
 tools/perf/arch/powerpc/util/dwarf-regs.c | 29 +++
 1 file changed, 29 insertions(+)

diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c 
b/tools/perf/arch/powerpc/util/dwarf-regs.c
index 0c4f4caf53ac..d955e3e577ea 100644
--- a/tools/perf/arch/powerpc/util/dwarf-regs.c
+++ b/tools/perf/arch/powerpc/util/dwarf-regs.c
@@ -98,3 +98,32 @@ int regs_query_register_offset(const char *name)
return roff->ptregs_offset;
return -EINVAL;
 }
+
+struct dwarf_regs_idx {
+   const char *name;
+   int idx;
+};
+
+static const struct dwarf_regs_idx powerpc_regidx_table[] = {
+   { "r0", 0 }, { "r1", 1 }, { "r2", 2 }, { "r3", 3 }, { "r4", 4 },
+   { "r5", 5 }, { "r6", 6 }, { "r7", 7 }, { "r8", 8 }, { "r9", 9 },
+   { "r10", 10 }, { "r11", 11 }, { "r12", 12 }, { "r13", 13 }, { "r14", 14 
},
+   { "r15", 15 }, { "r16", 16 }, { "r17", 17 }, { "r18", 18 }, { "r19", 19 
},
+   { "r20", 20 }, { "r21", 21 }, { "r22", 22 }, { "r23", 23 }, { "r24", 24 
},
+   { "r25", 25 }, { "r26", 26 }, { "r27", 27 }, { "r27", 27 }, { "r28", 28 
},
+   { "r29", 29 }, { "r30", 30 }, { "r31", 31 },
+};
+
+int get_arch_regnum(const char *name)
+{
+   unsigned int i;
+
+   if (*name != 'r')
+   return -EINVAL;
+
+   for (i = 0; i < ARRAY_SIZE(powerpc_regidx_table); i++)
+   if (!strcmp(powerpc_regidx_table[i].name, name))
+   return powerpc_regidx_table[i].idx;
+
+   return -ENOENT;
+}
-- 
2.43.0



[PATCH 2/3] tools/erf/util/annotate: Set register_char and memory_ref_char for powerpc

2024-03-08 Thread Athira Rajeev
To identify if the instruction has any memory reference,
"memory_ref_char" field needs to be set for specific architecture.
Example memory instruction:
lwz r10,0(r9)

Here "(" is the memory_ref_char. Set this as part of arch->objdump

To get register number and access offset from the given instruction,
arch->objdump.register_char is used. In case of powerpc, the register
char is "r" (since reg names are r0 to r31). Hence set register_char
as "r".

Signed-off-by: Athira Rajeev 
---
 tools/perf/util/annotate.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index ac002d907d81..d69bd6edafcb 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -216,6 +216,11 @@ static struct arch architectures[] = {
{
.name = "powerpc",
.init = powerpc__annotate_init,
+   .objdump =  {
+   .comment_char = '#',
+   .register_char = 'r',
+   .memory_ref_char = '(',
+   },
},
{
.name = "riscv64",
-- 
2.43.0



[PATCH 1/3] tools/perf/arch/powerpc: Add load/store in powerpc annotate instructions for data type profling

2024-03-08 Thread Athira Rajeev
Add powerpc instruction nmemonic table to associate load/store
instructions with move_ops. mov_ops is used to identify mem_type
to associate instruction with data type and offset. Also initialize
and allocate arch specific fields for nr_instructions, instructions and
nr_instructions_allocate.

Signed-off-by: Athira Rajeev 
---
 .../perf/arch/powerpc/annotate/instructions.c | 66 +++
 1 file changed, 66 insertions(+)

diff --git a/tools/perf/arch/powerpc/annotate/instructions.c 
b/tools/perf/arch/powerpc/annotate/instructions.c
index a3f423c27cae..07af4442be38 100644
--- a/tools/perf/arch/powerpc/annotate/instructions.c
+++ b/tools/perf/arch/powerpc/annotate/instructions.c
@@ -1,6 +1,65 @@
 // SPDX-License-Identifier: GPL-2.0
 #include 
 
+/*
+ * powerpc instruction nmemonic table to associate load/store instructions with
+ * move_ops. mov_ops is used to identify mem_type to associate instruction with
+ * data type and offset.
+ */
+static struct ins powerpc__instructions[] = {
+   { .name = "lbz",.ops = _ops,  },
+   { .name = "lbzx",   .ops = _ops,  },
+   { .name = "lbzu",   .ops = _ops,  },
+   { .name = "lbzux",  .ops = _ops,  },
+   { .name = "lhz",.ops = _ops,  },
+   { .name = "lhzx",   .ops = _ops,  },
+   { .name = "lhzu",   .ops = _ops,  },
+   { .name = "lhzux",  .ops = _ops,  },
+   { .name = "lha",.ops = _ops,  },
+   { .name = "lhax",   .ops = _ops,  },
+   { .name = "lhau",   .ops = _ops,  },
+   { .name = "lhaux",  .ops = _ops,  },
+   { .name = "lwz",.ops = _ops,  },
+   { .name = "lwzx",   .ops = _ops,  },
+   { .name = "lwzu",   .ops = _ops,  },
+   { .name = "lwzux",  .ops = _ops,  },
+   { .name = "lwa",.ops = _ops,  },
+   { .name = "lwax",   .ops = _ops,  },
+   { .name = "lwaux",  .ops = _ops,  },
+   { .name = "ld", .ops = _ops,  },
+   { .name = "ldx",.ops = _ops,  },
+   { .name = "ldu",.ops = _ops,  },
+   { .name = "ldux",   .ops = _ops,  },
+   { .name = "stb",.ops = _ops,  },
+   { .name = "stbx",   .ops = _ops,  },
+   { .name = "stbu",   .ops = _ops,  },
+   { .name = "stbux",  .ops = _ops,  },
+   { .name = "sth",.ops = _ops,  },
+   { .name = "sthx",   .ops = _ops,  },
+   { .name = "sthu",   .ops = _ops,  },
+   { .name = "sthux",  .ops = _ops,  },
+   { .name = "stw",.ops = _ops,  },
+   { .name = "stwx",   .ops = _ops,  },
+   { .name = "stwu",   .ops = _ops,  },
+   { .name = "stwux",  .ops = _ops,  },
+   { .name = "std",.ops = _ops,  },
+   { .name = "stdx",   .ops = _ops,  },
+   { .name = "stdu",   .ops = _ops,  },
+   { .name = "stdux",  .ops = _ops,  },
+   { .name = "lhbrx",  .ops = _ops,  },
+   { .name = "sthbrx", .ops = _ops,  },
+   { .name = "lwbrx",  .ops = _ops,  },
+   { .name = "stwbrx", .ops = _ops,  },
+   { .name = "ldbrx",  .ops = _ops,  },
+   { .name = "stdbrx", .ops = _ops,  },
+   { .name = "lmw",.ops = _ops,  },
+   { .name = "stmw",   .ops = _ops,  },
+   { .name = "lswi",   .ops = _ops,  },
+   { .name = "lswx",   .ops = _ops,  },
+   { .name = "stswi",  .ops = _ops,  },
+   { .name = "stswx",  .ops = _ops,  },
+};
+
 static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, 
const char *name)
 {
int i;
@@ -52,6 +111,13 @@ static struct ins_ops 
*powerpc__associate_instruction_ops(struct arch *arch, con
 static int powerpc__annotate_init(struct arch *arch, char *cpuid 
__maybe_unused)
 {
if (!arch->initialized) {
+   arch->nr_instructions = ARRAY_SIZE(powerpc__instructions);
+   arch->instructions = calloc(arch->nr_instructions, 
sizeof(struct ins));
+   if (arch->instructions == NULL)
+   return -ENOMEM;
+
+   memcpy(arch->instructions, (struct ins *)powerpc__instructions, 
sizeof(struct ins) * arch->nr_instructions);
+   arch->nr_instructions_allocated = arch->nr_instructions;
arch->initialized = true;
arch->associate_instruction_ops = 
powerpc__associate_instruction_ops;
arch->objdump.comment_char  = '#';
-- 
2.43.0



[PATCH 0/3] Add data type profiling support for powerpc

2024-03-08 Thread Athira Rajeev
From: Akanksha J N 

The patchset from Namhyung added support for data type profiling
in perf tool. This enabled support to associate PMU samples to data
types they refer using DWARF debug information. With the upstream
perf, currently it possible to run perf report or perf annotate to
view the data type information on x86.

This patchset includes changes need to enable data type profiling
support for powerpc. Main change are:
1. powerpc instruction nmemonic table to associate load/store
instructions with move_ops which is use to identify if instruction
is a memory access one.
2. To get register number and access offset from the given
instruction, code uses fields from "struct arch" -> objump.
Add entry for powerpc here.
3. A get_arch_regnum to return register number from the
register name string.

These three patches are the basic foundational patches.
With these changes, data types is getting identified for kernel
and user-space samples. There are still samples, which comes as
"unknown" and needs to be checked. We plan to get those addressed
in follow up. With the current patchset:

# ./perf record -a -e mem-loads sleep 1
# ./perf report -s type,typeoff --hierarchy --group --stdio
Snippet of logs:
#
# Total Lost Samples: 0
#
# Samples: 277  of events 'mem-loads, dummy:u'
# Event count (approx.): 149813
#
#Overhead  Data Type / Data Type Offset
# ...  
#
65.93%   0.00% (unknown)
   65.93%   0.00% (unknown) +0 (no field)
 8.19%   0.00% struct vm_area_struct
8.19%   0.00% struct vm_area_struct +136 (vm_file)
 4.53%   0.00% struct rq
3.14%   0.00% struct rq +0 (__lock.raw_lock.val)
0.83%   0.00% struct rq +3216 (avg_irq.runnable_sum)
0.24%   0.00% struct rq +4 (nr_running)
0.14%   0.00% struct rq +12 (nr_preferred_running)
0.12%   0.00% struct rq +2760 (sd)
0.06%   0.00% struct rq +3368 (prev_steal_time_rq)
0.01%   0.00% struct rq +2592 (curr)
 3.53%   0.00% struct rb_node
3.53%   0.00% struct rb_node +0 (__rb_parent_color)
 3.43%   0.00% struct slab
3.43%   0.00% struct slab +32 (freelist)
 3.30%   0.00% unsigned int
3.30%   0.00% unsigned int +0 (no field)
 3.22%   0.00% struct vm_fault
3.22%   0.00% struct vm_fault +48 (pmd)
 2.55%   0.00% unsigned char
2.55%   0.00% unsigned char +0 (no field)
 1.06%   0.00% struct task_struct
1.06%   0.00% struct task_struct +4 (thread_info.cpu)
 0.92%   0.00% void*
0.92%   0.00% void* +0 (no field)
 0.74%   0.00% __int128 unsigned
0.74%   0.00% __int128 unsigned +8 (no field)
 0.59%   0.00% struct perf_event
0.54%   0.00% struct perf_event +552 (ctx)
0.04%   0.00% struct perf_event +152 (pmu)
 0.20%   0.00% struct sched_entity
0.20%   0.00% struct sched_entity +0 (load.weight)
 0.18%   0.00% struct cfs_rq
0.18%   0.00% struct cfs_rq +96 (curr)

Thanks
Athira

Athira Rajeev (3):
  tools/perf/arch/powerpc: Add load/store in powerpc annotate
instructions for data type profling
  tools/erf/util/annotate: Set register_char and memory_ref_char for
powerpc
  tools/perf/arch/powerc: Add get_arch_regnum for powerpc

 .../perf/arch/powerpc/annotate/instructions.c | 66 +++
 tools/perf/arch/powerpc/util/dwarf-regs.c | 29 
 tools/perf/util/annotate.c|  5 ++
 3 files changed, 100 insertions(+)

-- 
2.43.0



Re: [PATCH 0/4] PCI: Consolidate TLP Log reading and printing

2024-03-08 Thread Bjorn Helgaas
On Tue, Feb 06, 2024 at 03:57:13PM +0200, Ilpo Järvinen wrote:
> This series consolidates AER & DPC TLP Log handling code. Helpers are
> added for reading and printing the TLP Log and the format is made to
> include E-E Prefixes in both cases (previously only one DPC RP PIO
> displayed the E-E Prefixes).
> 
> I'd appreciate if people familiar with ixgbe could check the error
> handling conversion within the driver is correct.
> 
> Ilpo Järvinen (4):
>   PCI/AER: Cleanup register variable
>   PCI: Generalize TLP Header Log reading

I applied these first two to pci/aer for v6.9, thanks, these are all
nice improvements!

I postponed the ixgbe part for now because I think we should get an
ack from those maintainers or just send it to them since it subtly
changes the error and device removal checking there.

>   PCI: Add TLP Prefix reading into pcie_read_tlp_log()
>   PCI: Create helper to print TLP Header and Prefix Log

I'll respond to these with some minor comments.

>  drivers/firmware/efi/cper.c   |  4 +-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 39 +++--
>  drivers/pci/ats.c |  2 +-
>  drivers/pci/pci.c | 79 +++
>  drivers/pci/pci.h |  2 +-
>  drivers/pci/pcie/aer.c| 28 ++-
>  drivers/pci/pcie/dpc.c| 31 
>  drivers/pci/probe.c   | 14 ++--
>  include/linux/aer.h   | 16 ++--
>  include/linux/pci.h   |  2 +-
>  include/ras/ras_event.h   | 10 +--
>  include/uapi/linux/pci_regs.h |  2 +
>  12 files changed, 145 insertions(+), 84 deletions(-)
> 
> -- 
> 2.39.2
> 


Re: [PATCH 1/2] powerpc/prom_init: Replace linux,sml-base/sml-size with linux,sml-log

2024-03-08 Thread Stefan Berger




On 3/8/24 15:57, Rob Herring wrote:

On Fri, Mar 08, 2024 at 07:23:35AM -0500, Stefan Berger wrote:



On 3/7/24 16:52, Rob Herring wrote:

On Thu, Mar 07, 2024 at 09:41:31PM +1100, Michael Ellerman wrote:

Stefan Berger  writes:

linux,sml-base holds the address of a buffer with the TPM log. This
buffer may become invalid after a kexec and therefore embed the whole TPM
log in linux,sml-log. This helps to protect the log since it is properly
carried across a kexec with both of the kexec syscalls.

Signed-off-by: Stefan Berger 
---
   arch/powerpc/kernel/prom_init.c | 8 ++--
   1 file changed, 2 insertions(+), 6 deletions(-)







Also adding the new linux,sml-log property should be accompanied by a
change to the device tree binding.

The syntax is not very obvious to me, but possibly something like?

diff --git a/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml 
b/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml
index 50a3fd31241c..cd75037948bc 100644
--- a/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml
+++ b/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml
@@ -74,8 +74,6 @@ required:
 - ibm,my-dma-window
 - ibm,my-drc-index
 - ibm,loc-code
-  - linux,sml-base
-  - linux,sml-size


Dropping required properties is an ABI break. If you drop them, an older
OS version won't work.


1) On PowerVM and KVM on Power these two properties were added in the Linux
code. I replaced the creation of these properties with creation of
linux,sml-log (1/2 in this series). I also replaced the handling of
these two (2/2 in this series) for these two platforms but leaving it for
powernv systems where the firmware creates these.


Okay, I guess your case is not a ABI break if the kernel is populating
it and the same kernel consumes it.

You failed to answer my question on using /reserved-memory. Again, why


I am not familiar with /reserved-memory and whether it is supported on 
the targeted platforms.




Re: [RFC] sched/eevdf: sched feature to dismiss lag on wakeup

2024-03-08 Thread Luis Machado
Hi Tobias,

On 2/28/24 16:10, Tobias Huschle wrote:
> The previously used CFS scheduler gave tasks that were woken up an
> enhanced chance to see runtime immediately by deducting a certain value
> from its vruntime on runqueue placement during wakeup.
> 
> This property was used by some, at least vhost, to ensure, that certain
> kworkers are scheduled immediately after being woken up. The EEVDF
> scheduler, does not support this so far. Instead, if such a woken up
> entitiy carries a negative lag from its previous execution, it will have
> to wait for the current time slice to finish, which affects the
> performance of the process expecting the immediate execution negatively.
> 
> To address this issue, implement EEVDF strategy #2 for rejoining
> entities, which dismisses the lag from previous execution and allows
> the woken up task to run immediately (if no other entities are deemed
> to be preferred for scheduling by EEVDF).
> 
> The vruntime is decremented by an additional value of 1 to make sure,
> that the woken up tasks gets to actually run. This is of course not
> following strategy #2 in an exact manner but guarantees the expected
> behavior for the scenario described above. Without the additional
> decrement, the performance goes south even more. So there are some
> side effects I could not get my head around yet.
> 
> Questions:
> 1. The kworker getting its negative lag occurs in the following scenario
>- kworker and a cgroup are supposed to execute on the same CPU
>- one task within the cgroup is executing and wakes up the kworker
>- kworker with 0 lag, gets picked immediately and finishes its
>  execution within ~5000ns
>- on dequeue, kworker gets assigned a negative lag
>Is this expected behavior? With this short execution time, I would
>expect the kworker to be fine.

That strikes me as a bit odd as well. Have you been able to determine how a 
negative lag
is assigned to the kworker after such a short runtime?

I was looking at a different thread 
(https://lore.kernel.org/lkml/20240226082349.302363-1-yu.c.c...@intel.com/) that
uncovers a potential overflow in the eligibility calculation. Though I don't 
think that is the case for this particular
vhost problem.


Re: [PATCH 1/2] powerpc/prom_init: Replace linux,sml-base/sml-size with linux,sml-log

2024-03-08 Thread Rob Herring
On Fri, Mar 08, 2024 at 07:23:35AM -0500, Stefan Berger wrote:
> 
> 
> On 3/7/24 16:52, Rob Herring wrote:
> > On Thu, Mar 07, 2024 at 09:41:31PM +1100, Michael Ellerman wrote:
> > > Stefan Berger  writes:
> > > > linux,sml-base holds the address of a buffer with the TPM log. This
> > > > buffer may become invalid after a kexec and therefore embed the whole 
> > > > TPM
> > > > log in linux,sml-log. This helps to protect the log since it is properly
> > > > carried across a kexec with both of the kexec syscalls.
> > > > 
> > > > Signed-off-by: Stefan Berger 
> > > > ---
> > > >   arch/powerpc/kernel/prom_init.c | 8 ++--
> > > >   1 file changed, 2 insertions(+), 6 deletions(-)
> > > > 
> 
> > 
> > 
> > > Also adding the new linux,sml-log property should be accompanied by a
> > > change to the device tree binding.
> > > 
> > > The syntax is not very obvious to me, but possibly something like?
> > > 
> > > diff --git a/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml 
> > > b/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml
> > > index 50a3fd31241c..cd75037948bc 100644
> > > --- a/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml
> > > +++ b/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml
> > > @@ -74,8 +74,6 @@ required:
> > > - ibm,my-dma-window
> > > - ibm,my-drc-index
> > > - ibm,loc-code
> > > -  - linux,sml-base
> > > -  - linux,sml-size
> > 
> > Dropping required properties is an ABI break. If you drop them, an older
> > OS version won't work.
> 
> 1) On PowerVM and KVM on Power these two properties were added in the Linux
> code. I replaced the creation of these properties with creation of
> linux,sml-log (1/2 in this series). I also replaced the handling of
> these two (2/2 in this series) for these two platforms but leaving it for
> powernv systems where the firmware creates these.

Okay, I guess your case is not a ABI break if the kernel is populating 
it and the same kernel consumes it. 

You failed to answer my question on using /reserved-memory. Again, why 
can't that be used? That is the standard way we prevent chunks of memory 
from being clobbered. There's already support for describing the TPM log 
that way anyways. The only reasoning I can see writing out a node for 
that is harder than just adding a property, but that's not a great 
argument IMO.


> 2) There is an example in the ibm,vtpm.yaml file that has both of these
> and the test case still passes the check when the two entries above are
> removed. I will post v2 with the changes to the DT bindings for
> linux,sml-log including an example for linux,sml-log. [The test cases fail,
> as expected, when an additional property is added, such as when
> linux,sml-base is added when linux,sml-log is there or linux,sml-log is
> added when linux,sml-base is there.]

Sure, removing a required property is never going to break the DT 
checks. What would break is a client (OS) version that only understands 
linux,sml-base and can no longer get the log assuming getting the log 
itself was required. 

Rob


Re: [v2 PATCH 0/3] arch: mm, vdso: consolidate PAGE_SIZE definition

2024-03-08 Thread Vincenzo Frascino



On 06/03/2024 14:14, Arnd Bergmann wrote:
> From: Arnd Bergmann 
> 
> Naresh noticed that the newly added usage of the PAGE_SIZE macro in
> include/vdso/datapage.h introduced a build regression. I had an older
> patch that I revived to have this defined through Kconfig rather than
> through including asm/page.h, which is not allowed in vdso code.
> 
> The vdso patch series now has a temporary workaround, but I still want to
> get this into v6.9 so we can place the hack with CONFIG_PAGE_SIZE
> in the vdso.
> 
> I've applied this to the asm-generic tree already, please let me know if
> there are still remaining issues. It's really close to the merge window
> already, so I'd probably give this a few more days before I send a pull
> request, or defer it to v6.10 if anything goes wrong.
> 
> Sorry for the delay, I was still waiting to resolve the m68k question,
> but there were no further replies in the end, so I kept my original
> version.
> 
> Changes from v1:
> 
>  - improve Kconfig help texts
>  - remove an extraneous line in hexagon
> 
>   Arnd
>

Thanks Arnd, looks good to me.

Reviewed-by: Vincenzo Frascino 


Re: [PATCH RFC 13/13] mm: Document pXd_leaf() API

2024-03-08 Thread Jason Gunthorpe
On Wed, Mar 06, 2024 at 06:41:47PM +0800, pet...@redhat.com wrote:
> From: Peter Xu 
> 
> There's one small section already, but since we're going to remove
> pXd_huge(), that comment may start to obsolete.
> 
> Rewrite that section with more information, hopefully with that the API is
> crystal clear on what it implies.
> 
> Signed-off-by: Peter Xu 
> ---
>  include/linux/pgtable.h | 24 +++-
>  1 file changed, 19 insertions(+), 5 deletions(-)

Reviewed-by: Jason Gunthorpe 

Jason


Re: [PATCH v13 09/16] media: uapi: Define audio sample format fourcc type

2024-03-08 Thread Hans Verkuil
On 08/03/2024 2:52 pm, Shengjiu Wang wrote:
> On Fri, Mar 8, 2024 at 8:06 PM Hans Verkuil  wrote:
>>
>> On 08/03/2024 12:52 pm, Shengjiu Wang wrote:
>>> On Fri, Mar 8, 2024 at 3:34 PM Hans Verkuil  wrote:

 Hi Shengjiu,

 After thinking it over I think this patch can be improved:

 On 26/02/2024 9:28 am, Shengjiu Wang wrote:
> The audio sample format definition is from alsa,
> the header file is include/uapi/sound/asound.h, but
> don't include this header file directly, because in
> user space, there is another copy in alsa-lib.
> There will be conflict in userspace for include
> videodev2.h & asound.h and asoundlib.h
>
> Here still use the fourcc format.
>
> Signed-off-by: Shengjiu Wang 
> ---
>  .../userspace-api/media/v4l/pixfmt-audio.rst  | 87 +++
>  .../userspace-api/media/v4l/pixfmt.rst|  1 +
>  drivers/media/v4l2-core/v4l2-ioctl.c  | 13 +++
>  include/uapi/linux/videodev2.h| 23 +
>  4 files changed, 124 insertions(+)
>  create mode 100644 Documentation/userspace-api/media/v4l/pixfmt-audio.rst
>
> diff --git a/Documentation/userspace-api/media/v4l/pixfmt-audio.rst 
> b/Documentation/userspace-api/media/v4l/pixfmt-audio.rst
> new file mode 100644
> index ..04b4a7fbd8f4
> --- /dev/null
> +++ b/Documentation/userspace-api/media/v4l/pixfmt-audio.rst
> @@ -0,0 +1,87 @@
> +.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later
> +
> +.. _pixfmt-audio:
> +
> +*
> +Audio Formats
> +*
> +
> +These formats are used for :ref:`audiomem2mem` interface only.

 Here you should also document that all these fourccs start with 'AU' and 
 are
 reserved for mappings of the snd_pcm_format_t type.

 Also document the v4l2_fourcc_to_audfmt define and the 
 v4l2_audfmt_to_fourcc
 define (see also below).
>>>
>>> How about below description?
>>> '
>>> All these fourccs starting with 'AU' are reserved for mappings
>>
>> All these fourccs -> All FourCCs
>>
>>> of the snd_pcm_format_t type.
>>>
>>> The v4l2_audfmt_to_fourcc() is defined to convert snd_pcm_format_t
>>
>> convert -> convert the
>>
>>> type to fourcc. The first character is 'A', the second character
>>
>> fourcc -> a FourCC
>>
>>> is 'U', the third character is ten's digit of snd_pcm_format_t,
>>> the fourth character is unit's digit of snd_pcm_format_t.
>>
>> "is 'U', and the remaining two characters is the snd_pcm_format_t
>> value in ASCII. Example: SNDRV_PCM_FORMAT_S16_LE (with value 2)
>> maps to 'AU02' and SNDRV_PCM_FORMAT_S24_3LE (with value 32) maps
>> to 'AU32'."
>>
>>>
>>> The v4l2_fourcc_to_audfmt() is defined to convert these fourccs to
>>
>> fourccs -> FourCCs
>>
>>> snd_pcm_format_t type.
>>
>> BTW, given the way snd_pcm_format_t is defined I am fairly certain
>> some casts are needed for the v4l2_audfmt_to_fourcc define.
>>
>> Regards,
>>
>> Hans
>>
>>> '
>>> Best regards
>>> Shengjiu Wang

> +
> +.. tabularcolumns:: |p{5.8cm}|p{1.2cm}|p{10.3cm}|
> +
> +.. cssclass:: longtable
> +
> +.. flat-table:: Audio Format
> +:header-rows:  1
> +:stub-columns: 0
> +:widths:   3 1 4
> +
> +* - Identifier
> +  - Code
> +  - Details
> +* .. _V4L2-AUDIO-FMT-S8:
> +
> +  - ``V4L2_AUDIO_FMT_S8``
> +  - 'S8'
> +  - Corresponds to SNDRV_PCM_FORMAT_S8 in ALSA
> +* .. _V4L2-AUDIO-FMT-S16-LE:
> +
> +  - ``V4L2_AUDIO_FMT_S16_LE``
> +  - 'S16_LE'
> +  - Corresponds to SNDRV_PCM_FORMAT_S16_LE in ALSA
> +* .. _V4L2-AUDIO-FMT-U16-LE:
> +
> +  - ``V4L2_AUDIO_FMT_U16_LE``
> +  - 'U16_LE'
> +  - Corresponds to SNDRV_PCM_FORMAT_U16_LE in ALSA
> +* .. _V4L2-AUDIO-FMT-S24-LE:
> +
> +  - ``V4L2_AUDIO_FMT_S24_LE``
> +  - 'S24_LE'
> +  - Corresponds to SNDRV_PCM_FORMAT_S24_LE in ALSA
> +* .. _V4L2-AUDIO-FMT-U24-LE:
> +
> +  - ``V4L2_AUDIO_FMT_U24_LE``
> +  - 'U24_LE'
> +  - Corresponds to SNDRV_PCM_FORMAT_U24_LE in ALSA
> +* .. _V4L2-AUDIO-FMT-S32-LE:
> +
> +  - ``V4L2_AUDIO_FMT_S32_LE``
> +  - 'S32_LE'
> +  - Corresponds to SNDRV_PCM_FORMAT_S32_LE in ALSA
> +* .. _V4L2-AUDIO-FMT-U32-LE:
> +
> +  - ``V4L2_AUDIO_FMT_U32_LE``
> +  - 'U32_LE'
> +  - Corresponds to SNDRV_PCM_FORMAT_U32_LE in ALSA
> +* .. _V4L2-AUDIO-FMT-FLOAT-LE:
> +
> +  - ``V4L2_AUDIO_FMT_FLOAT_LE``
> +  - 'FLOAT_LE'
> +  - Corresponds to SNDRV_PCM_FORMAT_FLOAT_LE in ALSA
> +* .. _V4L2-AUDIO-FMT-IEC958-SUBFRAME-LE:
> +
> +  - ``V4L2_AUDIO_FMT_IEC958_SUBFRAME_LE``
> +  - 'IEC958_SUBFRAME_LE'
> +  - Corresponds to 

Re: [PATCH v13 09/16] media: uapi: Define audio sample format fourcc type

2024-03-08 Thread Shengjiu Wang
On Fri, Mar 8, 2024 at 8:06 PM Hans Verkuil  wrote:
>
> On 08/03/2024 12:52 pm, Shengjiu Wang wrote:
> > On Fri, Mar 8, 2024 at 3:34 PM Hans Verkuil  wrote:
> >>
> >> Hi Shengjiu,
> >>
> >> After thinking it over I think this patch can be improved:
> >>
> >> On 26/02/2024 9:28 am, Shengjiu Wang wrote:
> >>> The audio sample format definition is from alsa,
> >>> the header file is include/uapi/sound/asound.h, but
> >>> don't include this header file directly, because in
> >>> user space, there is another copy in alsa-lib.
> >>> There will be conflict in userspace for include
> >>> videodev2.h & asound.h and asoundlib.h
> >>>
> >>> Here still use the fourcc format.
> >>>
> >>> Signed-off-by: Shengjiu Wang 
> >>> ---
> >>>  .../userspace-api/media/v4l/pixfmt-audio.rst  | 87 +++
> >>>  .../userspace-api/media/v4l/pixfmt.rst|  1 +
> >>>  drivers/media/v4l2-core/v4l2-ioctl.c  | 13 +++
> >>>  include/uapi/linux/videodev2.h| 23 +
> >>>  4 files changed, 124 insertions(+)
> >>>  create mode 100644 Documentation/userspace-api/media/v4l/pixfmt-audio.rst
> >>>
> >>> diff --git a/Documentation/userspace-api/media/v4l/pixfmt-audio.rst 
> >>> b/Documentation/userspace-api/media/v4l/pixfmt-audio.rst
> >>> new file mode 100644
> >>> index ..04b4a7fbd8f4
> >>> --- /dev/null
> >>> +++ b/Documentation/userspace-api/media/v4l/pixfmt-audio.rst
> >>> @@ -0,0 +1,87 @@
> >>> +.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later
> >>> +
> >>> +.. _pixfmt-audio:
> >>> +
> >>> +*
> >>> +Audio Formats
> >>> +*
> >>> +
> >>> +These formats are used for :ref:`audiomem2mem` interface only.
> >>
> >> Here you should also document that all these fourccs start with 'AU' and 
> >> are
> >> reserved for mappings of the snd_pcm_format_t type.
> >>
> >> Also document the v4l2_fourcc_to_audfmt define and the 
> >> v4l2_audfmt_to_fourcc
> >> define (see also below).
> >
> > How about below description?
> > '
> > All these fourccs starting with 'AU' are reserved for mappings
>
> All these fourccs -> All FourCCs
>
> > of the snd_pcm_format_t type.
> >
> > The v4l2_audfmt_to_fourcc() is defined to convert snd_pcm_format_t
>
> convert -> convert the
>
> > type to fourcc. The first character is 'A', the second character
>
> fourcc -> a FourCC
>
> > is 'U', the third character is ten's digit of snd_pcm_format_t,
> > the fourth character is unit's digit of snd_pcm_format_t.
>
> "is 'U', and the remaining two characters is the snd_pcm_format_t
> value in ASCII. Example: SNDRV_PCM_FORMAT_S16_LE (with value 2)
> maps to 'AU02' and SNDRV_PCM_FORMAT_S24_3LE (with value 32) maps
> to 'AU32'."
>
> >
> > The v4l2_fourcc_to_audfmt() is defined to convert these fourccs to
>
> fourccs -> FourCCs
>
> > snd_pcm_format_t type.
>
> BTW, given the way snd_pcm_format_t is defined I am fairly certain
> some casts are needed for the v4l2_audfmt_to_fourcc define.
>
> Regards,
>
> Hans
>
> > '
> > Best regards
> > Shengjiu Wang
> >>
> >>> +
> >>> +.. tabularcolumns:: |p{5.8cm}|p{1.2cm}|p{10.3cm}|
> >>> +
> >>> +.. cssclass:: longtable
> >>> +
> >>> +.. flat-table:: Audio Format
> >>> +:header-rows:  1
> >>> +:stub-columns: 0
> >>> +:widths:   3 1 4
> >>> +
> >>> +* - Identifier
> >>> +  - Code
> >>> +  - Details
> >>> +* .. _V4L2-AUDIO-FMT-S8:
> >>> +
> >>> +  - ``V4L2_AUDIO_FMT_S8``
> >>> +  - 'S8'
> >>> +  - Corresponds to SNDRV_PCM_FORMAT_S8 in ALSA
> >>> +* .. _V4L2-AUDIO-FMT-S16-LE:
> >>> +
> >>> +  - ``V4L2_AUDIO_FMT_S16_LE``
> >>> +  - 'S16_LE'
> >>> +  - Corresponds to SNDRV_PCM_FORMAT_S16_LE in ALSA
> >>> +* .. _V4L2-AUDIO-FMT-U16-LE:
> >>> +
> >>> +  - ``V4L2_AUDIO_FMT_U16_LE``
> >>> +  - 'U16_LE'
> >>> +  - Corresponds to SNDRV_PCM_FORMAT_U16_LE in ALSA
> >>> +* .. _V4L2-AUDIO-FMT-S24-LE:
> >>> +
> >>> +  - ``V4L2_AUDIO_FMT_S24_LE``
> >>> +  - 'S24_LE'
> >>> +  - Corresponds to SNDRV_PCM_FORMAT_S24_LE in ALSA
> >>> +* .. _V4L2-AUDIO-FMT-U24-LE:
> >>> +
> >>> +  - ``V4L2_AUDIO_FMT_U24_LE``
> >>> +  - 'U24_LE'
> >>> +  - Corresponds to SNDRV_PCM_FORMAT_U24_LE in ALSA
> >>> +* .. _V4L2-AUDIO-FMT-S32-LE:
> >>> +
> >>> +  - ``V4L2_AUDIO_FMT_S32_LE``
> >>> +  - 'S32_LE'
> >>> +  - Corresponds to SNDRV_PCM_FORMAT_S32_LE in ALSA
> >>> +* .. _V4L2-AUDIO-FMT-U32-LE:
> >>> +
> >>> +  - ``V4L2_AUDIO_FMT_U32_LE``
> >>> +  - 'U32_LE'
> >>> +  - Corresponds to SNDRV_PCM_FORMAT_U32_LE in ALSA
> >>> +* .. _V4L2-AUDIO-FMT-FLOAT-LE:
> >>> +
> >>> +  - ``V4L2_AUDIO_FMT_FLOAT_LE``
> >>> +  - 'FLOAT_LE'
> >>> +  - Corresponds to SNDRV_PCM_FORMAT_FLOAT_LE in ALSA
> >>> +* .. _V4L2-AUDIO-FMT-IEC958-SUBFRAME-LE:
> >>> +
> >>> +  - ``V4L2_AUDIO_FMT_IEC958_SUBFRAME_LE``
> >>> +  - 'IEC958_SUBFRAME_LE'
> >>> +  - Corresponds to SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE in ALSA
> >>> +* .. _V4L2-AUDIO-FMT-S24-3LE:
> >>> +

Re: [PATCH v9 07/10] PCI: dwc: ep: Remove "core_init_notifier" flag

2024-03-08 Thread Niklas Cassel
On Mon, Mar 04, 2024 at 02:52:19PM +0530, Manivannan Sadhasivam wrote:
> "core_init_notifier" flag is set by the glue drivers requiring refclk from
> the host to complete the DWC core initialization. Also, those drivers will
> send a notification to the EPF drivers once the initialization is fully
> completed using the pci_epc_init_notify() API. Only then, the EPF drivers
> will start functioning.
> 
> For the rest of the drivers generating refclk locally, EPF drivers will
> start functioning post binding with them. EPF drivers rely on the
> 'core_init_notifier' flag to differentiate between the drivers.
> Unfortunately, this creates two different flows for the EPF drivers.
> 
> So to avoid that, let's get rid of the "core_init_notifier" flag and follow
> a single initialization flow for the EPF drivers. This is done by calling
> the dw_pcie_ep_init_notify() from all glue drivers after the completion of
> dw_pcie_ep_init_registers() API. This will allow all the glue drivers to
> send the notification to the EPF drivers once the initialization is fully
> completed.
> 
> Only difference here is that, the drivers requiring refclk from host will
> send the notification once refclk is received, while others will send it
> during probe time itself.
> 
> Reviewed-by: Frank Li 
> Signed-off-by: Manivannan Sadhasivam 
> ---
> diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c 
> b/drivers/pci/endpoint/functions/pci-epf-test.c
> index 18c80002d3bd..fc0282b0d626 100644
> --- a/drivers/pci/endpoint/functions/pci-epf-test.c
> +++ b/drivers/pci/endpoint/functions/pci-epf-test.c
> @@ -927,21 +928,12 @@ static int pci_epf_test_bind(struct pci_epf *epf)
>   if (ret)
>   return ret;
>

Hello Mani,

Since you asked for testing, I gave your series a spin
(with a driver without .core_init_notifier).


There seems to be a problem that pci_epc_write_header() is never called.

Debugging this, it seems that .core_init in pci-epf-test is never called.

If I add debug prints in pci_epc_init_notify(), I see that it does not
notify a single EPF driver.

It appears that the patch in $subject will call pci_epc_init_notify()
at EPC driver .probe() time, and at that point in time, there are no
EPF drivers registered.

They get registered later, when doing the configfs write.


I would say that it is the following change that breaks things:

> - if (!core_init_notifier) {
> - ret = pci_epf_test_core_init(epf);
> - if (ret)
> - return ret;
> - }
> -

Since without this code, pci_epf_test_core_init() will no longer be called,
as there is currently no one that calls epf->core_init() for a EPF driver
after it has been bound. (For drivers that call dw_pcie_ep_init_notify() in
.probe())

I guess one way to solve this would be for the EPC core to keep track of
the current EPC "core state" (up/down). If the core is "up" at EPF .bind()
time, notify the EPF driver directly after .bind()?


Kind regards,
Niklas


[PATCH 10/19] timekeeping: Rename fast_tk_get_delta_ns() to __timekeeping_get_ns()

2024-03-08 Thread Adrian Hunter
Rename fast_tk_get_delta_ns() to __timekeeping_get_ns() to prepare for its
reuse as a general timekeeping helper function.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/timekeeping.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3375f0a6400f..63061332a75c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -390,7 +390,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct 
tk_read_base *tkr, u64 c
return timekeeping_delta_to_ns(tkr, delta);
 }
 
-static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr)
+static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
 {
u64 delta, cycles = tk_clock_read(tkr);
 
@@ -449,7 +449,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct 
tk_fast *tkf)
seq = raw_read_seqcount_latch(>seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
-   now += fast_tk_get_delta_ns(tkr);
+   now += __timekeeping_get_ns(tkr);
} while (raw_read_seqcount_latch_retry(>seq, seq));
 
return now;
@@ -565,7 +565,7 @@ static __always_inline u64 __ktime_get_real_fast(struct 
tk_fast *tkf, u64 *mono)
tkr = tkf->base + (seq & 0x01);
basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
-   delta = fast_tk_get_delta_ns(tkr);
+   delta = __timekeeping_get_ns(tkr);
} while (raw_read_seqcount_latch_retry(>seq, seq));
 
if (mono)
-- 
2.34.1



[PATCH 19/19] clocksource: Make watchdog and suspend-timing multiplication overflow safe

2024-03-08 Thread Adrian Hunter
Kernel timekeeping is designed to keep the change in cycles (since the last
timer interrupt) below max_cycles, which prevents multiplication overflow
when converting cycles to nanoseconds. However, if timer interrupts stop,
the clocksource_cyc2ns() calculation will eventually overflow.

Add protection against that. Simplify by folding together
clocksource_delta() and clocksource_cyc2ns() into cycles_to_nsec_safe().
Check against max_cycles, falling back to a slower higher precision
calculation.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/clocksource.c | 42 +++
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e5b260aa0e02..4d50d53ac719 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -20,6 +20,16 @@
 #include "tick-internal.h"
 #include "timekeeping_internal.h"
 
+static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 
end)
+{
+   u64 delta = clocksource_delta(end, start, cs->mask);
+
+   if (likely(delta < cs->max_cycles))
+   return clocksource_cyc2ns(delta, cs->mult, cs->shift);
+
+   return mul_u64_u32_shr(delta, cs->mult, cs->shift);
+}
+
 /**
  * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of 
clocks
  * @mult:  pointer to mult variable
@@ -222,8 +232,8 @@ enum wd_read_status {
 static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 
*csnow, u64 *wdnow)
 {
unsigned int nretries, max_retries;
-   u64 wd_end, wd_end2, wd_delta;
int64_t wd_delay, wd_seq_delay;
+   u64 wd_end, wd_end2;
 
max_retries = clocksource_get_max_watchdog_retry();
for (nretries = 0; nretries <= max_retries; nretries++) {
@@ -234,9 +244,7 @@ static enum wd_read_status cs_watchdog_read(struct 
clocksource *cs, u64 *csnow,
wd_end2 = watchdog->read(watchdog);
local_irq_enable();
 
-   wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
-   wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
- watchdog->shift);
+   wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
if (wd_delay <= WATCHDOG_MAX_SKEW) {
if (nretries > 1 || nretries >= max_retries) {
pr_warn("timekeeping watchdog on CPU%d: %s 
retried %d times before success\n",
@@ -254,8 +262,7 @@ static enum wd_read_status cs_watchdog_read(struct 
clocksource *cs, u64 *csnow,
 * report system busy, reinit the watchdog and skip the current
 * watchdog test.
 */
-   wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask);
-   wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, 
watchdog->shift);
+   wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
if (wd_seq_delay > WATCHDOG_MAX_SKEW/2)
goto skip_test;
}
@@ -366,8 +373,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
delta = (csnow_end - csnow_mid) & cs->mask;
if (delta < 0)
cpumask_set_cpu(cpu, _ahead);
-   delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
-   cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+   cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
if (cs_nsec > cs_nsec_max)
cs_nsec_max = cs_nsec;
if (cs_nsec < cs_nsec_min)
@@ -398,8 +404,8 @@ static inline void clocksource_reset_watchdog(void)
 
 static void clocksource_watchdog(struct timer_list *unused)
 {
-   u64 csnow, wdnow, cslast, wdlast, delta;
int64_t wd_nsec, cs_nsec, interval;
+   u64 csnow, wdnow, cslast, wdlast;
int next_cpu, reset_pending;
struct clocksource *cs;
enum wd_read_status read_ret;
@@ -456,12 +462,8 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
 
-   delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
-   wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
-watchdog->shift);
-
-   delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
-   cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+   wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
+   cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
wdlast = cs->wd_last; /* save these in case we print them */
cslast = cs->cs_last;
cs->cs_last = csnow;
@@ -832,7 +834,7 @@ void clocksource_start_suspend_timing(struct clocksource 
*cs, u64 start_cycles)
  */
 

[PATCH 18/19] timekeeping: Let timekeeping_cycles_to_ns() handle both under and overflow

2024-03-08 Thread Adrian Hunter
For the case !CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE, forego overflow
protection in the range (mask << 1) < delta <= mask, and interpret it
always as an inconsistency between CPU clock values. That allows
slightly neater code, and it is on a slow path so has no effect on
performance.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/timekeeping.c | 31 +--
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 111dfdbd488f..4e18db1819f8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -266,17 +266,14 @@ static inline u64 timekeeping_debug_get_ns(const struct 
tk_read_base *tkr)
 * Try to catch underflows by checking if we are seeing small
 * mask-relative negative values.
 */
-   if (unlikely((~delta & mask) < (mask >> 3))) {
+   if (unlikely((~delta & mask) < (mask >> 3)))
tk->underflow_seen = 1;
-   now = last;
-   }
 
-   /* Cap delta value to the max_cycles values to avoid mult overflows */
-   if (unlikely(delta > max)) {
+   /* Check for multiplication overflows */
+   if (unlikely(delta > max))
tk->overflow_seen = 1;
-   now = last + max;
-   }
 
+   /* timekeeping_cycles_to_ns() handles both under and overflow */
return timekeeping_cycles_to_ns(tkr, now);
 }
 #else
@@ -375,19 +372,17 @@ static inline u64 timekeeping_cycles_to_ns(const struct 
tk_read_base *tkr, u64 c
u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;
 
/*
-* This detects the case where the delta overflows the multiplication
-* with tkr->mult.
+* This detects both negative motion and the case where the delta
+* overflows the multiplication with tkr->mult.
 */
if (unlikely(delta > tkr->clock->max_cycles)) {
-   if (IS_ENABLED(CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE)) {
-   /*
-* Handle clocksource inconsistency between CPUs to 
prevent
-* time from going backwards by checking for the MSB of 
the
-* mask being set in the delta.
-*/
-   if (unlikely(delta & ~(mask >> 1)))
-   return tkr->xtime_nsec >> tkr->shift;
-   }
+   /*
+* Handle clocksource inconsistency between CPUs to prevent
+* time from going backwards by checking for the MSB of the
+* mask being set in the delta.
+*/
+   if (delta & ~(mask >> 1))
+   return tkr->xtime_nsec >> tkr->shift;
 
return delta_to_ns_safe(tkr, delta);
}
-- 
2.34.1



[PATCH 09/19] timekeeping: Move timekeeping helper functions

2024-03-08 Thread Adrian Hunter
Move timekeeping helper functions to prepare for their reuse.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/timekeeping.c | 34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b58dffc58a8f..3375f0a6400f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -381,6 +381,23 @@ static inline u64 timekeeping_delta_to_ns(const struct 
tk_read_base *tkr, u64 de
return nsec;
 }
 
+static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 
cycles)
+{
+   u64 delta;
+
+   /* calculate the delta since the last update_wall_time */
+   delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+   return timekeeping_delta_to_ns(tkr, delta);
+}
+
+static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr)
+{
+   u64 delta, cycles = tk_clock_read(tkr);
+
+   delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+   return timekeeping_delta_to_ns(tkr, delta);
+}
+
 static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
 {
u64 delta;
@@ -389,15 +406,6 @@ static inline u64 timekeeping_get_ns(const struct 
tk_read_base *tkr)
return timekeeping_delta_to_ns(tkr, delta);
 }
 
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 
cycles)
-{
-   u64 delta;
-
-   /* calculate the delta since the last update_wall_time */
-   delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
-   return timekeeping_delta_to_ns(tkr, delta);
-}
-
 /**
  * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
  * @tkr: Timekeeping readout base from which we take the update
@@ -431,14 +439,6 @@ static void update_fast_timekeeper(const struct 
tk_read_base *tkr,
memcpy(base + 1, base, sizeof(*base));
 }
 
-static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr)
-{
-   u64 delta, cycles = tk_clock_read(tkr);
-
-   delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
-   return timekeeping_delta_to_ns(tkr, delta);
-}
-
 static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 {
struct tk_read_base *tkr;
-- 
2.34.1



[PATCH 08/19] x86/vdso: Make delta calculation overflow safe

2024-03-08 Thread Adrian Hunter
Kernel timekeeping is designed to keep the change in cycles (since the last
timer interrupt) below max_cycles, which prevents multiplication overflow
when converting cycles to nanoseconds. However, if timer interrupts stop,
the calculation will eventually overflow.

Add protection against that. Select GENERIC_VDSO_OVERFLOW_PROTECT so that
max_cycles is made available in the VDSO data page. Check against
max_cycles, falling back to a slower higher precision calculation. Take
advantage of the opportunity to move masking and negative motion check
into the slow path.

The result is a calculation that has similar performance as before. Newer
machines showed performance benefit, whereas older Skylake-based hardware
such as Intel Kaby Lake was seen <1% worse.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/vdso/gettimeofday.h | 29 +---
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 720b96388191..200f80a36593 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -168,6 +168,7 @@ config X86
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
select GENERIC_VDSO_TIME_NS
+   select GENERIC_VDSO_OVERFLOW_PROTECT
select GUP_GET_PXX_LOW_HIGH if X86_PAE
select HARDIRQS_SW_RESEND
select HARDLOCKUP_CHECK_TIMESTAMP   if X86_64
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h 
b/arch/x86/include/asm/vdso/gettimeofday.h
index 5727dedd3549..0ef36190abe6 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -319,18 +319,31 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
  */
 static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 
cycles, u64 base)
 {
+   u64 delta = cycles - vd->cycle_last;
+
/*
+* Negative motion and deltas which can cause multiplication
+* overflow require special treatment. This check covers both as
+* negative motion is guaranteed to be greater than @vd::max_cycles
+* due to unsigned comparison.
+*
 * Due to the MSB/Sign-bit being used as invalid marker (see
-* arch_vdso_cycles_valid() above), the effective mask is S64_MAX.
+* arch_vdso_cycles_valid() above), the effective mask is S64_MAX,
+* but that case is also unlikely and will also take the unlikely path
+* here.
 */
-   u64 delta = (cycles - vd->cycle_last) & S64_MAX;
+   if (unlikely(delta > vd->max_cycles)) {
+   /*
+* Due to the above mentioned TSC wobbles, filter out
+* negative motion.  Per the above masking, the effective
+* sign bit is now bit 62.
+*/
+   if (delta & (1ULL << 62))
+   return base >> vd->shift;
 
-   /*
-* Due to the above mentioned TSC wobbles, filter out negative motion.
-* Per the above masking, the effective sign bit is now bit 62.
-*/
-   if (unlikely(delta & (1ULL << 62)))
-   return base >> vd->shift;
+   /* Handle multiplication overflow gracefully */
+   return mul_u64_u32_add_u64_shr(delta & S64_MAX, vd->mult, base, 
vd->shift);
+   }
 
return ((delta * vd->mult) + base) >> vd->shift;
 }
-- 
2.34.1



[PATCH 17/19] timekeeping: Make delta calculation overflow safe

2024-03-08 Thread Adrian Hunter
Kernel timekeeping is designed to keep the change in cycles (since the last
timer interrupt) below max_cycles, which prevents multiplication overflow
when converting cycles to nanoseconds. However, if timer interrupts stop,
the calculation will eventually overflow.

Add protection against that. In timekeeping_cycles_to_ns() calculation,
check against max_cycles, falling back to a slower higher precision
calculation. In timekeeping_forward_now(), process delta in chunks of at
most max_cycles.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/timekeeping.c | 40 ---
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d17484082e2c..111dfdbd488f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -364,19 +364,32 @@ static void tk_setup_internals(struct timekeeper *tk, 
struct clocksource *clock)
 }
 
 /* Timekeeper helper functions. */
+static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
+{
+   return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, 
tkr->shift);
+}
+
 static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 
cycles)
 {
/* Calculate the delta since the last update_wall_time() */
u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;
 
-   if (IS_ENABLED(CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE)) {
-   /*
-* Handle clocksource inconsistency between CPUs to prevent
-* time from going backwards by checking for the MSB of the
-* mask being set in the delta.
-*/
-   if (unlikely(delta & ~(mask >> 1)))
-   return tkr->xtime_nsec >> tkr->shift;
+   /*
+* This detects the case where the delta overflows the multiplication
+* with tkr->mult.
+*/
+   if (unlikely(delta > tkr->clock->max_cycles)) {
+   if (IS_ENABLED(CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE)) {
+   /*
+* Handle clocksource inconsistency between CPUs to 
prevent
+* time from going backwards by checking for the MSB of 
the
+* mask being set in the delta.
+*/
+   if (unlikely(delta & ~(mask >> 1)))
+   return tkr->xtime_nsec >> tkr->shift;
+   }
+
+   return delta_to_ns_safe(tkr, delta);
}
 
return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
@@ -789,10 +802,15 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last  = cycle_now;
 
-   tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
-   tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
+   while (delta > 0) {
+   u64 max = tk->tkr_mono.clock->max_cycles;
+   u64 incr = delta < max ? delta : max;
 
-   tk_normalize_xtime(tk);
+   tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult;
+   tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult;
+   tk_normalize_xtime(tk);
+   delta -= incr;
+   }
 }
 
 /**
-- 
2.34.1



[PATCH 16/19] timekeeping: Prepare timekeeping_cycles_to_ns() for overflow safety

2024-03-08 Thread Adrian Hunter
Open code clocksource_delta() in timekeeping_cycles_to_ns() so that
overflow safety can be added efficiently.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/timekeeping.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 749387f22f0f..d17484082e2c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -367,7 +367,17 @@ static void tk_setup_internals(struct timekeeper *tk, 
struct clocksource *clock)
 static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 
cycles)
 {
/* Calculate the delta since the last update_wall_time() */
-   u64 delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+   u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;
+
+   if (IS_ENABLED(CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE)) {
+   /*
+* Handle clocksource inconsistency between CPUs to prevent
+* time from going backwards by checking for the MSB of the
+* mask being set in the delta.
+*/
+   if (unlikely(delta & ~(mask >> 1)))
+   return tkr->xtime_nsec >> tkr->shift;
+   }
 
return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
 }
-- 
2.34.1



[PATCH 07/19] vdso: Make delta calculation overflow safe

2024-03-08 Thread Adrian Hunter
Kernel timekeeping is designed to keep the change in cycles (since the last
timer interrupt) below max_cycles, which prevents multiplication overflow
when converting cycles to nanoseconds. However, if timer interrupts stop,
the calculation will eventually overflow.

Add protection against that, enabled by config option
CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT. Check against max_cycles, falling
back to a slower higher precision calculation.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 lib/vdso/gettimeofday.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index 9fa90e0794c9..9c3a8d2440c9 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -13,6 +13,18 @@
 # define VDSO_DELTA_MASK(vd)   (vd->mask)
 #endif
 
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+static __always_inline bool vdso_delta_ok(const struct vdso_data *vd, u64 
delta)
+{
+   return delta < vd->max_cycles;
+}
+#else
+static __always_inline bool vdso_delta_ok(const struct vdso_data *vd, u64 
delta)
+{
+   return true;
+}
+#endif
+
 #ifndef vdso_shift_ns
 static __always_inline u64 vdso_shift_ns(u64 ns, u32 shift)
 {
@@ -28,7 +40,10 @@ static __always_inline u64 vdso_calc_ns(const struct 
vdso_data *vd, u64 cycles,
 {
u64 delta = (cycles - vd->cycle_last) & VDSO_DELTA_MASK(vd);
 
-   return vdso_shift_ns((delta * vd->mult) + base, vd->shift);
+   if (likely(vdso_delta_ok(vd, delta)))
+   return vdso_shift_ns((delta * vd->mult) + base, vd->shift);
+
+   return mul_u64_u32_add_u64_shr(delta, vd->mult, base, vd->shift);
 }
 #endif /* vdso_calc_ns */
 
-- 
2.34.1



[PATCH 06/19] vdso: Add vdso_data::max_cycles

2024-03-08 Thread Adrian Hunter
Add vdso_data::max_cycles in preparation to use it to detect potential
multiplication overflow.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 include/vdso/datapage.h | 4 
 kernel/time/vsyscall.c  | 6 ++
 2 files changed, 10 insertions(+)

diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
index 5d5c0b8efff2..6c3d67d6b758 100644
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -67,6 +67,7 @@ struct vdso_timestamp {
  * @seq:   timebase sequence counter
  * @clock_mode:clock mode
  * @cycle_last:timebase at clocksource init
+ * @max_cycles:maximum cycles which won't overflow 64bit 
multiplication
  * @mask:  clocksource mask
  * @mult:  clocksource multiplier
  * @shift: clocksource shift
@@ -98,6 +99,9 @@ struct vdso_data {
 
s32 clock_mode;
u64 cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+   u64 max_cycles;
+#endif
u64 mask;
u32 mult;
u32 shift;
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index f0d5062d9cbc..9193d6133e5d 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -22,10 +22,16 @@ static inline void update_vdso_data(struct vdso_data *vdata,
u64 nsec, sec;
 
vdata[CS_HRES_COARSE].cycle_last= tk->tkr_mono.cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+   vdata[CS_HRES_COARSE].max_cycles= 
tk->tkr_mono.clock->max_cycles;
+#endif
vdata[CS_HRES_COARSE].mask  = tk->tkr_mono.mask;
vdata[CS_HRES_COARSE].mult  = tk->tkr_mono.mult;
vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
vdata[CS_RAW].cycle_last= tk->tkr_raw.cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+   vdata[CS_RAW].max_cycles= tk->tkr_raw.clock->max_cycles;
+#endif
vdata[CS_RAW].mask  = tk->tkr_raw.mask;
vdata[CS_RAW].mult  = tk->tkr_raw.mult;
vdata[CS_RAW].shift = tk->tkr_raw.shift;
-- 
2.34.1



[PATCH 15/19] timekeeping: Fold in timekeeping_delta_to_ns()

2024-03-08 Thread Adrian Hunter
timekeeping_delta_to_ns() is now called only from
timekeeping_cycles_to_ns(), and it is not useful otherwise. Simplify by
folding it into timekeeping_cycles_to_ns().

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/timekeeping.c | 13 +
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1bbfe1ff8d24..749387f22f0f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -364,23 +364,12 @@ static void tk_setup_internals(struct timekeeper *tk, 
struct clocksource *clock)
 }
 
 /* Timekeeper helper functions. */
-
-static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 
delta)
-{
-   u64 nsec;
-
-   nsec = delta * tkr->mult + tkr->xtime_nsec;
-   nsec >>= tkr->shift;
-
-   return nsec;
-}
-
 static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 
cycles)
 {
/* Calculate the delta since the last update_wall_time() */
u64 delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
 
-   return timekeeping_delta_to_ns(tkr, delta);
+   return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
 }
 
 static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
-- 
2.34.1



[PATCH 14/19] timekeeping: Consolidate timekeeping helpers

2024-03-08 Thread Adrian Hunter
Consolidate timekeeping helpers, making use of timekeeping_cycles_to_ns()
in preference to directly using timekeeping_delta_to_ns().

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/timekeeping.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 618328cd4bc4..1bbfe1ff8d24 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -237,7 +237,9 @@ static void timekeeping_check_update(struct timekeeper *tk, 
u64 offset)
}
 }
 
-static inline u64 timekeeping_debug_get_delta(const struct tk_read_base *tkr)
+static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 
cycles);
+
+static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
 {
struct timekeeper *tk = _core.timekeeper;
u64 now, last, mask, max, delta;
@@ -266,22 +268,22 @@ static inline u64 timekeeping_debug_get_delta(const 
struct tk_read_base *tkr)
 */
if (unlikely((~delta & mask) < (mask >> 3))) {
tk->underflow_seen = 1;
-   delta = 0;
+   now = last;
}
 
/* Cap delta value to the max_cycles values to avoid mult overflows */
if (unlikely(delta > max)) {
tk->overflow_seen = 1;
-   delta = tkr->clock->max_cycles;
+   now = last + max;
}
 
-   return delta;
+   return timekeeping_cycles_to_ns(tkr, now);
 }
 #else
 static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
 {
 }
-static inline u64 timekeeping_debug_get_delta(const struct tk_read_base *tkr)
+static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
 {
BUG();
 }
@@ -389,7 +391,7 @@ static __always_inline u64 __timekeeping_get_ns(const 
struct tk_read_base *tkr)
 static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
 {
if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING))
-   return timekeeping_delta_to_ns(tkr, 
timekeeping_debug_get_delta(tkr));
+   return timekeeping_debug_get_ns(tkr);
 
return __timekeeping_get_ns(tkr);
 }
-- 
2.34.1



[PATCH 05/19] vdso: math64: Provide mul_u64_u32_add_u64_shr()

2024-03-08 Thread Adrian Hunter
Provide mul_u64_u32_add_u64_shr() which is a calculation that will be used
by timekeeping and VDSO.

Place #include  after #include  to allow
architecture-specific overrides, at least for the kernel.

Signed-off-by: Adrian Hunter 
---
 include/linux/math64.h |  2 +-
 include/vdso/math64.h  | 38 ++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/include/linux/math64.h b/include/linux/math64.h
index fd13622b2056..d34def7f9a8c 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -4,8 +4,8 @@
 
 #include 
 #include 
-#include 
 #include 
+#include 
 
 #if BITS_PER_LONG == 64
 
diff --git a/include/vdso/math64.h b/include/vdso/math64.h
index 7da703ee5561..22ae212f8b28 100644
--- a/include/vdso/math64.h
+++ b/include/vdso/math64.h
@@ -21,4 +21,42 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
return ret;
 }
 
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+#ifndef mul_u64_u32_add_u64_shr
+static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, 
unsigned int shift)
+{
+   return (u64)unsigned __int128)a * mul) + b) >> shift);
+}
+#endif /* mul_u64_u32_add_u64_shr */
+
+#else
+
+#ifndef mul_u64_u32_add_u64_shr
+#ifndef mul_u32_u32
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+   return (u64)a * b;
+}
+#define mul_u32_u32 mul_u32_u32
+#endif
+static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, 
unsigned int shift)
+{
+   u32 ah = a >> 32, al = a;
+   bool ovf;
+   u64 ret;
+
+   ovf = __builtin_add_overflow(mul_u32_u32(al, mul), b, );
+   ret >>= shift;
+   if (ovf && shift)
+   ret += 1ULL << (64 - shift);
+   if (ah)
+   ret += mul_u32_u32(ah, mul) << (32 - shift);
+
+   return ret;
+}
+#endif /* mul_u64_u32_add_u64_shr */
+
+#endif
+
 #endif /* __VDSO_MATH64_H */
-- 
2.34.1



[PATCH 04/19] math64: Tidy mul_u64_u32_shr()

2024-03-08 Thread Adrian Hunter
Put together declaration and initialization of local variables.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 include/linux/math64.h | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/include/linux/math64.h b/include/linux/math64.h
index bf74478926d4..fd13622b2056 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -179,16 +179,12 @@ static __always_inline u64 mul_u64_u64_shr(u64 a, u64 
mul, unsigned int shift)
 #ifndef mul_u64_u32_shr
 static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
 {
-   u32 ah, al;
+   u32 ah = a >> 32, al = a;
u64 ret;
 
-   al = a;
-   ah = a >> 32;
-
ret = mul_u32_u32(al, mul) >> shift;
if (ah)
ret += mul_u32_u32(ah, mul) << (32 - shift);
-
return ret;
 }
 #endif /* mul_u64_u32_shr */
-- 
2.34.1



[PATCH 13/19] timekeeping: Refactor timekeeping helpers

2024-03-08 Thread Adrian Hunter
Simplify use of timekeeping sanity checking, in preparation for
consolidating timekeeping helpers. This works towards eliminating
timekeeping_delta_to_ns() in favour of timekeeping_cycles_to_ns().

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/timekeeping.c | 20 ++--
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f81d675291e0..618328cd4bc4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -237,7 +237,7 @@ static void timekeeping_check_update(struct timekeeper *tk, 
u64 offset)
}
 }
 
-static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
+static inline u64 timekeeping_debug_get_delta(const struct tk_read_base *tkr)
 {
struct timekeeper *tk = _core.timekeeper;
u64 now, last, mask, max, delta;
@@ -281,17 +281,9 @@ static inline u64 timekeeping_get_delta(const struct 
tk_read_base *tkr)
 static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
 {
 }
-static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
+static inline u64 timekeeping_debug_get_delta(const struct tk_read_base *tkr)
 {
-   u64 cycle_now, delta;
-
-   /* read clocksource */
-   cycle_now = tk_clock_read(tkr);
-
-   /* calculate the delta since the last update_wall_time */
-   delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
-
-   return delta;
+   BUG();
 }
 #endif
 
@@ -396,10 +388,10 @@ static __always_inline u64 __timekeeping_get_ns(const 
struct tk_read_base *tkr)
 
 static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
 {
-   u64 delta;
+   if (IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING))
+   return timekeeping_delta_to_ns(tkr, 
timekeeping_debug_get_delta(tkr));
 
-   delta = timekeeping_get_delta(tkr);
-   return timekeeping_delta_to_ns(tkr, delta);
+   return __timekeeping_get_ns(tkr);
 }
 
 /**
-- 
2.34.1



[PATCH 03/19] vdso: Add CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT

2024-03-08 Thread Adrian Hunter
Add CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT in preparation to add
multiplication overflow protection to the VDSO time getter functions.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 lib/vdso/Kconfig | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
index d883ac299508..c46c2300517c 100644
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -30,4 +30,11 @@ config GENERIC_VDSO_TIME_NS
  Selected by architectures which support time namespaces in the
  VDSO
 
+config GENERIC_VDSO_OVERFLOW_PROTECT
+   bool
+   help
+ Select to add multiplication overflow protection to the VDSO
+ time getter functions for the price of an extra conditional
+ in the hotpath.
+
 endif
-- 
2.34.1



[PATCH 12/19] timekeeping: Reuse timekeeping_cycles_to_ns()

2024-03-08 Thread Adrian Hunter
Simplify __timekeeping_get_ns() by reusing timekeeping_cycles_to_ns().

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/timekeeping.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c698219b152d..f81d675291e0 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -391,10 +391,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct 
tk_read_base *tkr, u64 c
 
 static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
 {
-   u64 delta, cycles = tk_clock_read(tkr);
-
-   delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
-   return timekeeping_delta_to_ns(tkr, delta);
+   return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
 }
 
 static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
-- 
2.34.1



[PATCH 02/19] vdso: Consolidate nanoseconds calculation

2024-03-08 Thread Adrian Hunter
Consolidate nanoseconds calculation to simplify and reduce code
duplication.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 arch/x86/include/asm/vdso/gettimeofday.h | 17 +
 lib/vdso/gettimeofday.c  | 44 +++-
 2 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/vdso/gettimeofday.h 
b/arch/x86/include/asm/vdso/gettimeofday.h
index 8e048ca980df..5727dedd3549 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -300,7 +300,7 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
 #define vdso_cycles_ok arch_vdso_cycles_ok
 
 /*
- * x86 specific delta calculation.
+ * x86 specific calculation of nanoseconds for the current cycle count
  *
  * The regular implementation assumes that clocksource reads are globally
  * monotonic. The TSC can be slightly off across sockets which can cause
@@ -308,8 +308,8 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
  * jump.
  *
  * Therefore it needs to be verified that @cycles are greater than
- * @last. If not then use @last, which is the base time of the current
- * conversion period.
+ * @vd->cycles_last. If not then use @vd->cycles_last, which is the base
+ * time of the current conversion period.
  *
  * This variant also uses a custom mask because while the clocksource mask of
  * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses
@@ -317,25 +317,24 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
  * declares everything with the MSB/Sign-bit set as invalid. Therefore the
  * effective mask is S64_MAX.
  */
-static __always_inline
-u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 
cycles, u64 base)
 {
/*
 * Due to the MSB/Sign-bit being used as invalid marker (see
 * arch_vdso_cycles_valid() above), the effective mask is S64_MAX.
 */
-   u64 delta = (cycles - last) & S64_MAX;
+   u64 delta = (cycles - vd->cycle_last) & S64_MAX;
 
/*
 * Due to the above mentioned TSC wobbles, filter out negative motion.
 * Per the above masking, the effective sign bit is now bit 62.
 */
if (unlikely(delta & (1ULL << 62)))
-   return 0;
+   return base >> vd->shift;
 
-   return delta * mult;
+   return ((delta * vd->mult) + base) >> vd->shift;
 }
-#define vdso_calc_delta vdso_calc_delta
+#define vdso_calc_ns vdso_calc_ns
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index 042b95e8164d..9fa90e0794c9 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -5,20 +5,12 @@
 #include 
 #include 
 
-#ifndef vdso_calc_delta
-/*
- * Default implementation which works for all sane clocksources. That
- * obviously excludes x86/TSC.
- */
-static __always_inline
-u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
-{
+#ifndef vdso_calc_ns
+
 #ifdef VDSO_DELTA_NOMASK
-   return (cycles - last) * mult;
+# define VDSO_DELTA_MASK(vd)   U64_MAX
 #else
-   return ((cycles - last) & mask) * mult;
-#endif
-}
+# define VDSO_DELTA_MASK(vd)   (vd->mask)
 #endif
 
 #ifndef vdso_shift_ns
@@ -28,6 +20,18 @@ static __always_inline u64 vdso_shift_ns(u64 ns, u32 shift)
 }
 #endif
 
+/*
+ * Default implementation which works for all sane clocksources. That
+ * obviously excludes x86/TSC.
+ */
+static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 
cycles, u64 base)
+{
+   u64 delta = (cycles - vd->cycle_last) & VDSO_DELTA_MASK(vd);
+
+   return vdso_shift_ns((delta * vd->mult) + base, vd->shift);
+}
+#endif /* vdso_calc_ns */
+
 #ifndef __arch_vdso_hres_capable
 static inline bool __arch_vdso_hres_capable(void)
 {
@@ -53,10 +57,10 @@ static inline bool vdso_cycles_ok(u64 cycles)
 static __always_inline int do_hres_timens(const struct vdso_data *vdns, 
clockid_t clk,
  struct __kernel_timespec *ts)
 {
-   const struct vdso_data *vd;
const struct timens_offset *offs = >offset[clk];
const struct vdso_timestamp *vdso_ts;
-   u64 cycles, last, ns;
+   const struct vdso_data *vd;
+   u64 cycles, ns;
u32 seq;
s64 sec;
 
@@ -77,10 +81,7 @@ static __always_inline int do_hres_timens(const struct 
vdso_data *vdns, clockid_
cycles = __arch_get_hw_counter(vd->clock_mode, vd);
if (unlikely(!vdso_cycles_ok(cycles)))
return -1;
-   ns = vdso_ts->nsec;
-   last = vd->cycle_last;
-   ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
-   ns = vdso_shift_ns(ns, vd->shift);
+   ns = vdso_calc_ns(vd, cycles, vdso_ts->nsec);
sec = vdso_ts->sec;
} while (unlikely(vdso_read_retry(vd, seq)));
 
@@ -115,7 +116,7 @@ static 

[PATCH 01/19] vdso: Consolidate vdso_calc_delta()

2024-03-08 Thread Adrian Hunter
Consolidate vdso_calc_delta(), in preparation for further simplification.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 arch/powerpc/include/asm/vdso/gettimeofday.h | 17 ++---
 arch/s390/include/asm/vdso/gettimeofday.h|  7 ++-
 lib/vdso/gettimeofday.c  |  4 
 3 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h 
b/arch/powerpc/include/asm/vdso/gettimeofday.h
index f0a4cf01e85c..f4da8e18cdf3 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -14,6 +14,8 @@
 
 #define VDSO_HAS_TIME  1
 
+#define VDSO_DELTA_NOMASK  1
+
 static __always_inline int do_syscall_2(const unsigned long _r0, const 
unsigned long _r3,
const unsigned long _r4)
 {
@@ -105,21 +107,6 @@ static inline bool vdso_clocksource_ok(const struct 
vdso_data *vd)
 }
 #define vdso_clocksource_ok vdso_clocksource_ok
 
-/*
- * powerpc specific delta calculation.
- *
- * This variant removes the masking of the subtraction because the
- * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
- * which would result in a pointless operation. The compiler cannot
- * optimize it away as the mask comes from the vdso data and is not compile
- * time constant.
- */
-static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 
mult)
-{
-   return (cycles - last) * mult;
-}
-#define vdso_calc_delta vdso_calc_delta
-
 #ifndef __powerpc64__
 static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
 {
diff --git a/arch/s390/include/asm/vdso/gettimeofday.h 
b/arch/s390/include/asm/vdso/gettimeofday.h
index db84942eb78f..7937765ccfa5 100644
--- a/arch/s390/include/asm/vdso/gettimeofday.h
+++ b/arch/s390/include/asm/vdso/gettimeofday.h
@@ -6,16 +6,13 @@
 
 #define VDSO_HAS_CLOCK_GETRES 1
 
+#define VDSO_DELTA_NOMASK 1
+
 #include 
 #include 
 #include 
 #include 
 
-#define vdso_calc_delta __arch_vdso_calc_delta
-static __always_inline u64 __arch_vdso_calc_delta(u64 cycles, u64 last, u64 
mask, u32 mult)
-{
-   return (cycles - last) * mult;
-}
 
 static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
 {
diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index ce2f69552003..042b95e8164d 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -13,7 +13,11 @@
 static __always_inline
 u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
 {
+#ifdef VDSO_DELTA_NOMASK
+   return (cycles - last) * mult;
+#else
return ((cycles - last) & mask) * mult;
+#endif
 }
 #endif
 
-- 
2.34.1



[PATCH 00/19] timekeeping: Handle potential multiplication overflow

2024-03-08 Thread Adrian Hunter
Hi

Kernel timekeeping calculates a clock value by keeping a base value and
adding the number of nanoseconds since that time. Those nanoseconds are
calculated from the clocksource delta. Then periodically, the base value is
moved forwards (refer timekeeping_advance()) which is done by the local
timer interrupt handler. It is designed such that there will always be a
timer interrupt before the delta becomes big enough to overflow the 64-bit
multiplication used in the conversion of delta to nanoseconds (refer
timekeeping_delta_to_ns()). Obviously if timer interrupts are stopped, then
the multiplication does eventually overflow.

Timekeeping multiplication overflow results in a "time loop", typically
cycling about every 15 minutes with x86 TSC, for example starting at 10:00:

  10:00, 10:01, 10:02 ... 10:15, 10:00, 10:01, ... 10:15, 10:00, 10:01 ...

Because a VMM can deliberately stop timer interrupts for a guest, a virtual
machine can be exposed to this issue.

TDX maintains a monotonically increasing virtual TSC for a TDX guest, so
the overflow is allowing a backwards movement of timekeeping that would not
happen otherwise.

It is considered this could break security of cryptographic protocols that
rely on the timestamps for freshness / replay protection, and consequently
the kernel should prevent such a time loop.

Handle multiplication overflows by falling back to higher precision
calculation when the possibility of an overflow is detected.

Extend the facility also to VDSO, dependent on new config option
GENERIC_VDSO_OVERFLOW_PROTECT which is selected by x86 only, so other
architectures are not affected. The result is a calculation that has
similar performance as before. Most machines showed performance benefit,
except Skylake-based hardware such as Intel Kaby Lake which was seen <1%
worse.


Adrian Hunter (19):
  vdso: Consolidate vdso_calc_delta()
  vdso: Consolidate nanoseconds calculation
  vdso: Add CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
  math64: Tidy mul_u64_u32_shr()
  vdso: math64: Provide mul_u64_u32_add_u64_shr()
  vdso: Add vdso_data::max_cycles
  vdso: Make delta calculation overflow safe
  x86/vdso: Make delta calculation overflow safe
  timekeeping: Move timekeeping helper functions
  timekeeping: Rename fast_tk_get_delta_ns() to __timekeeping_get_ns()
  timekeeping: Tidy timekeeping_cycles_to_ns() slightly
  timekeeping: Reuse timekeeping_cycles_to_ns()
  timekeeping: Refactor timekeeping helpers
  timekeeping: Consolidate timekeeping helpers
  timekeeping: Fold in timekeeping_delta_to_ns()
  timekeeping: Prepare timekeeping_cycles_to_ns() for overflow safety
  timekeeping: Make delta calculation overflow safe
  timekeeping: Let timekeeping_cycles_to_ns() handle both under and overflow
  clocksource: Make watchdog and suspend-timing multiplication overflow safe

 arch/powerpc/include/asm/vdso/gettimeofday.h |  17 +
 arch/s390/include/asm/vdso/gettimeofday.h|   7 +-
 arch/x86/Kconfig |   1 +
 arch/x86/include/asm/vdso/gettimeofday.h |  42 +++
 include/linux/math64.h   |   8 +-
 include/vdso/datapage.h  |   4 +
 include/vdso/math64.h|  38 ++
 kernel/time/clocksource.c|  42 +--
 kernel/time/timekeeping.c| 106 ++-
 kernel/time/vsyscall.c   |   6 ++
 lib/vdso/Kconfig |   7 ++
 lib/vdso/gettimeofday.c  |  55 +-
 12 files changed, 199 insertions(+), 134 deletions(-)


Regards
Adrian


[PATCH 11/19] timekeeping: Tidy timekeeping_cycles_to_ns() slightly

2024-03-08 Thread Adrian Hunter
Put together declaration and initialization of the local variable 'delta'.

Suggested-by: Thomas Gleixner 
Signed-off-by: Adrian Hunter 
---
 kernel/time/timekeeping.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 63061332a75c..c698219b152d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -383,10 +383,9 @@ static inline u64 timekeeping_delta_to_ns(const struct 
tk_read_base *tkr, u64 de
 
 static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 
cycles)
 {
-   u64 delta;
+   /* Calculate the delta since the last update_wall_time() */
+   u64 delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
 
-   /* calculate the delta since the last update_wall_time */
-   delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
return timekeeping_delta_to_ns(tkr, delta);
 }
 
-- 
2.34.1



Re: [PATCH 1/2] powerpc/prom_init: Replace linux,sml-base/sml-size with linux,sml-log

2024-03-08 Thread Stefan Berger




On 3/7/24 16:52, Rob Herring wrote:

On Thu, Mar 07, 2024 at 09:41:31PM +1100, Michael Ellerman wrote:

Stefan Berger  writes:

linux,sml-base holds the address of a buffer with the TPM log. This
buffer may become invalid after a kexec and therefore embed the whole TPM
log in linux,sml-log. This helps to protect the log since it is properly
carried across a kexec with both of the kexec syscalls.

Signed-off-by: Stefan Berger 
---
  arch/powerpc/kernel/prom_init.c | 8 ++--
  1 file changed, 2 insertions(+), 6 deletions(-)







Also adding the new linux,sml-log property should be accompanied by a
change to the device tree binding.

The syntax is not very obvious to me, but possibly something like?

diff --git a/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml 
b/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml
index 50a3fd31241c..cd75037948bc 100644
--- a/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml
+++ b/Documentation/devicetree/bindings/tpm/ibm,vtpm.yaml
@@ -74,8 +74,6 @@ required:
- ibm,my-dma-window
- ibm,my-drc-index
- ibm,loc-code
-  - linux,sml-base
-  - linux,sml-size


Dropping required properties is an ABI break. If you drop them, an older
OS version won't work.


1) On PowerVM and KVM on Power these two properties were added in the 
Linux code. I replaced the creation of these properties with creation of 
linux,sml-log (1/2 in this series). I also replaced the handling of
these two (2/2 in this series) for these two platforms but leaving it 
for powernv systems where the firmware creates these.


https://elixir.bootlin.com/linux/latest/source/arch/powerpc/kernel/prom_init.c#L1959

2) There is an example in the ibm,vtpm.yaml file that has both of these
and the test case still passes the check when the two entries above are 
removed. I will post v2 with the changes to the DT bindings for 
linux,sml-log including an example for linux,sml-log. [The test cases 
fail, as expected, when an additional property is added, such as when 
linux,sml-base is added when linux,sml-log is there or linux,sml-log is 
added when linux,sml-base is there.]




  
  allOf:

- $ref: tpm-common.yaml#
diff --git a/Documentation/devicetree/bindings/tpm/tpm-common.yaml 
b/Documentation/devicetree/bindings/tpm/tpm-common.yaml
index 3c1241b2a43f..616604707c95 100644
--- a/Documentation/devicetree/bindings/tpm/tpm-common.yaml
+++ b/Documentation/devicetree/bindings/tpm/tpm-common.yaml
@@ -25,6 +25,11 @@ properties:
base address of reserved memory allocated for firmware event log
  $ref: /schemas/types.yaml#/definitions/uint64
  
+  linux,sml-log:


Why is this Linux specific?


I am not sure about the criteria when to prefix with 'linux,' and when 
not to. In this case I did it because it is created by Linux and because 
of existing linux,sml-base and linux,sml-size.


Re: [PATCH 2/2] tpm: of: If available Use linux,sml-log to get the log and its size

2024-03-08 Thread Stefan Berger




On 3/7/24 15:00, Jarkko Sakkinen wrote:

On Thu Mar 7, 2024 at 9:57 PM EET, Jarkko Sakkinen wrote:

in short summary: s/Use/use/

On Wed Mar 6, 2024 at 5:55 PM EET, Stefan Berger wrote:

If linux,sml-log is available use it to get the TPM log rather than the
pointer found in linux,sml-base. This resolves an issue on PowerVM and KVM
on Power where after a kexec the memory pointed to by linux,sml-base may
have been corrupted. Also, linux,sml-log has replaced linux,sml-base and
linux,sml-size on these two platforms.

Signed-off-by: Stefan Berger 


So shouldn't this have a fixed tag, or not?


In English: do we want this to be backported to stable kernel releases or not?


Ideally, yes. v3 will have 3 patches and all 3 of them will have to be 
backported *together* and not applied otherwise if any one of them 
fails. Can this be 'guaranteed'?




BR, Jarkko


Re: [PATCH v13 09/16] media: uapi: Define audio sample format fourcc type

2024-03-08 Thread Hans Verkuil
On 08/03/2024 12:52 pm, Shengjiu Wang wrote:
> On Fri, Mar 8, 2024 at 3:34 PM Hans Verkuil  wrote:
>>
>> Hi Shengjiu,
>>
>> After thinking it over I think this patch can be improved:
>>
>> On 26/02/2024 9:28 am, Shengjiu Wang wrote:
>>> The audio sample format definition is from alsa,
>>> the header file is include/uapi/sound/asound.h, but
>>> don't include this header file directly, because in
>>> user space, there is another copy in alsa-lib.
>>> There will be conflict in userspace for include
>>> videodev2.h & asound.h and asoundlib.h
>>>
>>> Here still use the fourcc format.
>>>
>>> Signed-off-by: Shengjiu Wang 
>>> ---
>>>  .../userspace-api/media/v4l/pixfmt-audio.rst  | 87 +++
>>>  .../userspace-api/media/v4l/pixfmt.rst|  1 +
>>>  drivers/media/v4l2-core/v4l2-ioctl.c  | 13 +++
>>>  include/uapi/linux/videodev2.h| 23 +
>>>  4 files changed, 124 insertions(+)
>>>  create mode 100644 Documentation/userspace-api/media/v4l/pixfmt-audio.rst
>>>
>>> diff --git a/Documentation/userspace-api/media/v4l/pixfmt-audio.rst 
>>> b/Documentation/userspace-api/media/v4l/pixfmt-audio.rst
>>> new file mode 100644
>>> index ..04b4a7fbd8f4
>>> --- /dev/null
>>> +++ b/Documentation/userspace-api/media/v4l/pixfmt-audio.rst
>>> @@ -0,0 +1,87 @@
>>> +.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later
>>> +
>>> +.. _pixfmt-audio:
>>> +
>>> +*
>>> +Audio Formats
>>> +*
>>> +
>>> +These formats are used for :ref:`audiomem2mem` interface only.
>>
>> Here you should also document that all these fourccs start with 'AU' and are
>> reserved for mappings of the snd_pcm_format_t type.
>>
>> Also document the v4l2_fourcc_to_audfmt define and the v4l2_audfmt_to_fourcc
>> define (see also below).
> 
> How about below description?
> '
> All these fourccs starting with 'AU' are reserved for mappings

All these fourccs -> All FourCCs

> of the snd_pcm_format_t type.
> 
> The v4l2_audfmt_to_fourcc() is defined to convert snd_pcm_format_t

convert -> convert the

> type to fourcc. The first character is 'A', the second character

fourcc -> a FourCC

> is 'U', the third character is ten's digit of snd_pcm_format_t,
> the fourth character is unit's digit of snd_pcm_format_t.

"is 'U', and the remaining two characters is the snd_pcm_format_t
value in ASCII. Example: SNDRV_PCM_FORMAT_S16_LE (with value 2)
maps to 'AU02' and SNDRV_PCM_FORMAT_S24_3LE (with value 32) maps
to 'AU32'."

> 
> The v4l2_fourcc_to_audfmt() is defined to convert these fourccs to

fourccs -> FourCCs

> snd_pcm_format_t type.

BTW, given the way snd_pcm_format_t is defined I am fairly certain
some casts are needed for the v4l2_audfmt_to_fourcc define.

Regards,

Hans

> '
> Best regards
> Shengjiu Wang
>>
>>> +
>>> +.. tabularcolumns:: |p{5.8cm}|p{1.2cm}|p{10.3cm}|
>>> +
>>> +.. cssclass:: longtable
>>> +
>>> +.. flat-table:: Audio Format
>>> +:header-rows:  1
>>> +:stub-columns: 0
>>> +:widths:   3 1 4
>>> +
>>> +* - Identifier
>>> +  - Code
>>> +  - Details
>>> +* .. _V4L2-AUDIO-FMT-S8:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_S8``
>>> +  - 'S8'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_S8 in ALSA
>>> +* .. _V4L2-AUDIO-FMT-S16-LE:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_S16_LE``
>>> +  - 'S16_LE'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_S16_LE in ALSA
>>> +* .. _V4L2-AUDIO-FMT-U16-LE:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_U16_LE``
>>> +  - 'U16_LE'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_U16_LE in ALSA
>>> +* .. _V4L2-AUDIO-FMT-S24-LE:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_S24_LE``
>>> +  - 'S24_LE'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_S24_LE in ALSA
>>> +* .. _V4L2-AUDIO-FMT-U24-LE:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_U24_LE``
>>> +  - 'U24_LE'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_U24_LE in ALSA
>>> +* .. _V4L2-AUDIO-FMT-S32-LE:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_S32_LE``
>>> +  - 'S32_LE'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_S32_LE in ALSA
>>> +* .. _V4L2-AUDIO-FMT-U32-LE:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_U32_LE``
>>> +  - 'U32_LE'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_U32_LE in ALSA
>>> +* .. _V4L2-AUDIO-FMT-FLOAT-LE:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_FLOAT_LE``
>>> +  - 'FLOAT_LE'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_FLOAT_LE in ALSA
>>> +* .. _V4L2-AUDIO-FMT-IEC958-SUBFRAME-LE:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_IEC958_SUBFRAME_LE``
>>> +  - 'IEC958_SUBFRAME_LE'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE in ALSA
>>> +* .. _V4L2-AUDIO-FMT-S24-3LE:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_S24_3LE``
>>> +  - 'S24_3LE'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_S24_3LE in ALSA
>>> +* .. _V4L2-AUDIO-FMT-U24-3LE:
>>> +
>>> +  - ``V4L2_AUDIO_FMT_U24_3LE``
>>> +  - 'U24_3LE'
>>> +  - Corresponds to SNDRV_PCM_FORMAT_U24_3LE in ALSA
>>> +* .. _V4L2-AUDIO-FMT-S20-3LE:
>>> +
>>> +  - 

Re: [PATCH v13 09/16] media: uapi: Define audio sample format fourcc type

2024-03-08 Thread Shengjiu Wang
On Fri, Mar 8, 2024 at 3:34 PM Hans Verkuil  wrote:
>
> Hi Shengjiu,
>
> After thinking it over I think this patch can be improved:
>
> On 26/02/2024 9:28 am, Shengjiu Wang wrote:
> > The audio sample format definition is from alsa,
> > the header file is include/uapi/sound/asound.h, but
> > don't include this header file directly, because in
> > user space, there is another copy in alsa-lib.
> > There will be conflict in userspace for include
> > videodev2.h & asound.h and asoundlib.h
> >
> > Here still use the fourcc format.
> >
> > Signed-off-by: Shengjiu Wang 
> > ---
> >  .../userspace-api/media/v4l/pixfmt-audio.rst  | 87 +++
> >  .../userspace-api/media/v4l/pixfmt.rst|  1 +
> >  drivers/media/v4l2-core/v4l2-ioctl.c  | 13 +++
> >  include/uapi/linux/videodev2.h| 23 +
> >  4 files changed, 124 insertions(+)
> >  create mode 100644 Documentation/userspace-api/media/v4l/pixfmt-audio.rst
> >
> > diff --git a/Documentation/userspace-api/media/v4l/pixfmt-audio.rst 
> > b/Documentation/userspace-api/media/v4l/pixfmt-audio.rst
> > new file mode 100644
> > index ..04b4a7fbd8f4
> > --- /dev/null
> > +++ b/Documentation/userspace-api/media/v4l/pixfmt-audio.rst
> > @@ -0,0 +1,87 @@
> > +.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later
> > +
> > +.. _pixfmt-audio:
> > +
> > +*
> > +Audio Formats
> > +*
> > +
> > +These formats are used for :ref:`audiomem2mem` interface only.
>
> Here you should also document that all these fourccs start with 'AU' and are
> reserved for mappings of the snd_pcm_format_t type.
>
> Also document the v4l2_fourcc_to_audfmt define and the v4l2_audfmt_to_fourcc
> define (see also below).

How about below description?
'
All these fourccs starting with 'AU' are reserved for mappings
of the snd_pcm_format_t type.

The v4l2_audfmt_to_fourcc() is defined to convert snd_pcm_format_t
type to fourcc. The first character is 'A', the second character
is 'U', the third character is ten's digit of snd_pcm_format_t,
the fourth character is unit's digit of snd_pcm_format_t.

The v4l2_fourcc_to_audfmt() is defined to convert these fourccs to
snd_pcm_format_t type.
'
Best regards
Shengjiu Wang
>
> > +
> > +.. tabularcolumns:: |p{5.8cm}|p{1.2cm}|p{10.3cm}|
> > +
> > +.. cssclass:: longtable
> > +
> > +.. flat-table:: Audio Format
> > +:header-rows:  1
> > +:stub-columns: 0
> > +:widths:   3 1 4
> > +
> > +* - Identifier
> > +  - Code
> > +  - Details
> > +* .. _V4L2-AUDIO-FMT-S8:
> > +
> > +  - ``V4L2_AUDIO_FMT_S8``
> > +  - 'S8'
> > +  - Corresponds to SNDRV_PCM_FORMAT_S8 in ALSA
> > +* .. _V4L2-AUDIO-FMT-S16-LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_S16_LE``
> > +  - 'S16_LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_S16_LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-U16-LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_U16_LE``
> > +  - 'U16_LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_U16_LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-S24-LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_S24_LE``
> > +  - 'S24_LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_S24_LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-U24-LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_U24_LE``
> > +  - 'U24_LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_U24_LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-S32-LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_S32_LE``
> > +  - 'S32_LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_S32_LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-U32-LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_U32_LE``
> > +  - 'U32_LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_U32_LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-FLOAT-LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_FLOAT_LE``
> > +  - 'FLOAT_LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_FLOAT_LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-IEC958-SUBFRAME-LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_IEC958_SUBFRAME_LE``
> > +  - 'IEC958_SUBFRAME_LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-S24-3LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_S24_3LE``
> > +  - 'S24_3LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_S24_3LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-U24-3LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_U24_3LE``
> > +  - 'U24_3LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_U24_3LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-S20-3LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_S20_3LE``
> > +  - 'S20_3LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_S24_3LE in ALSA
> > +* .. _V4L2-AUDIO-FMT-U20-3LE:
> > +
> > +  - ``V4L2_AUDIO_FMT_U20_3LE``
> > +  - 'U20_3LE'
> > +  - Corresponds to SNDRV_PCM_FORMAT_U20_3LE in ALSA
> > diff --git a/Documentation/userspace-api/media/v4l/pixfmt.rst 
> > b/Documentation/userspace-api/media/v4l/pixfmt.rst
> > index 11dab4a90630..2eb6fdd3b43d 100644
> > --- a/Documentation/userspace-api/media/v4l/pixfmt.rst
> > +++ 

RE: [PATCH v9 06/10] PCI: dwc: ep: Call dw_pcie_ep_init_registers() API directly from all glue drivers

2024-03-08 Thread Yoshihiro Shimoda
> From: Manivannan Sadhasivam, Sent: Monday, March 4, 2024 6:22 PM
> 
> Currently, dw_pcie_ep_init_registers() API is directly called by the glue
> drivers requiring active refclk from host. But for the other drivers, it is
> getting called implicitly by dw_pcie_ep_init(). This is due to the fact
> that this API initializes DWC EP specific registers and that requires an
> active refclk (either from host or generated locally by endpoint itsef).
> 
> But, this causes a discrepancy among the glue drivers. So to avoid this
> confusion, let's call this API directly from all glue drivers irrespective
> of refclk dependency. Only difference here is that the drivers requiring
> refclk from host will call this API only after the refclk is received and
> other drivers without refclk dependency will call this API right after
> dw_pcie_ep_init().
> 
> With this change, the check for 'core_init_notifier' flag can now be
> dropped from dw_pcie_ep_init() API. This will also allow us to remove the
> 'core_init_notifier' flag completely in the later commits.
> 
> Signed-off-by: Manivannan Sadhasivam 
> ---
>  drivers/pci/controller/dwc/pci-dra7xx.c   |  7 +++
>  drivers/pci/controller/dwc/pci-imx6.c |  8 
>  drivers/pci/controller/dwc/pci-keystone.c |  9 +
>  drivers/pci/controller/dwc/pci-layerscape-ep.c|  7 +++
>  drivers/pci/controller/dwc/pcie-artpec6.c | 13 -
>  drivers/pci/controller/dwc/pcie-designware-ep.c   | 22 --
>  drivers/pci/controller/dwc/pcie-designware-plat.c |  9 +
>  drivers/pci/controller/dwc/pcie-keembay.c | 16 +++-
>  drivers/pci/controller/dwc/pcie-rcar-gen4.c   | 12 +++-

Thank you for the patch! About pcie-rcar-gen4.c,

Reviewed-by: Yoshihiro Shimoda 

Best regards,
Yoshihiro Shimoda

>  drivers/pci/controller/dwc/pcie-uniphier-ep.c | 13 -
>  10 files changed, 90 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c 
> b/drivers/pci/controller/dwc/pci-dra7xx.c
> index 0e406677060d..395042b29ffc 100644
> --- a/drivers/pci/controller/dwc/pci-dra7xx.c
> +++ b/drivers/pci/controller/dwc/pci-dra7xx.c
> @@ -467,6 +467,13 @@ static int dra7xx_add_pcie_ep(struct dra7xx_pcie *dra7xx,
>   return ret;
>   }
> 
> + ret = dw_pcie_ep_init_registers(ep);
> + if (ret) {
> + dev_err(dev, "Failed to initialize DWC endpoint registers\n");
> + dw_pcie_ep_deinit(ep);
> + return ret;
> + }
> +
>   return 0;
>  }
> 
> diff --git a/drivers/pci/controller/dwc/pci-imx6.c 
> b/drivers/pci/controller/dwc/pci-imx6.c
> index dc2c036ab28c..bfcafa440ddb 100644
> --- a/drivers/pci/controller/dwc/pci-imx6.c
> +++ b/drivers/pci/controller/dwc/pci-imx6.c
> @@ -1136,6 +1136,14 @@ static int imx6_add_pcie_ep(struct imx6_pcie 
> *imx6_pcie,
>   dev_err(dev, "failed to initialize endpoint\n");
>   return ret;
>   }
> +
> + ret = dw_pcie_ep_init_registers(ep);
> + if (ret) {
> + dev_err(dev, "Failed to initialize DWC endpoint registers\n");
> + dw_pcie_ep_deinit(ep);
> + return ret;
> + }
> +
>   /* Start LTSSM. */
>   imx6_pcie_ltssm_enable(dev);
> 
> diff --git a/drivers/pci/controller/dwc/pci-keystone.c 
> b/drivers/pci/controller/dwc/pci-keystone.c
> index c0c62533a3f1..8392894ed286 100644
> --- a/drivers/pci/controller/dwc/pci-keystone.c
> +++ b/drivers/pci/controller/dwc/pci-keystone.c
> @@ -1286,6 +1286,13 @@ static int ks_pcie_probe(struct platform_device *pdev)
>   ret = dw_pcie_ep_init(>ep);
>   if (ret < 0)
>   goto err_get_sync;
> +
> + ret = dw_pcie_ep_init_registers(>ep);
> + if (ret < 0) {
> + dev_err(dev, "Failed to initialize DWC endpoint 
> registers\n");
> + goto err_ep_init;
> + }
> +
>   break;
>   default:
>   dev_err(dev, "INVALID device type %d\n", mode);
> @@ -1295,6 +1302,8 @@ static int ks_pcie_probe(struct platform_device *pdev)
> 
>   return 0;
> 
> +err_ep_init:
> + dw_pcie_ep_deinit(>ep);
>  err_get_sync:
>   pm_runtime_put(dev);
>   pm_runtime_disable(dev);
> diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
> b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> index 2e398494e7c0..b712fdd06549 100644
> --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> @@ -276,6 +276,13 @@ static int __init ls_pcie_ep_probe(struct 
> platform_device *pdev)
>   if (ret)
>   return ret;
> 
> + ret = dw_pcie_ep_init_registers(>ep);
> + if (ret) {
> + dev_err(dev, "Failed to initialize DWC endpoint registers\n");
> + dw_pcie_ep_deinit(>ep);
> + return ret;
> + }
> +
>   return 

RE: [PATCH v9 02/10] PCI: dwc: ep: Rename dw_pcie_ep_exit() to dw_pcie_ep_deinit()

2024-03-08 Thread Yoshihiro Shimoda
> From: Manivannan Sadhasivam, Sent: Monday, March 4, 2024 6:22 PM
> 
> dw_pcie_ep_exit() API is undoing what the dw_pcie_ep_init() API has done
> already (at least partly). But the API name dw_pcie_ep_exit() is not quite
> reflecting that. So let's rename it to dw_pcie_ep_deinit() to make the
> purpose of this API clear. This also aligns with the DWC host driver.
> 
> Reviewed-by: Frank Li 
> Signed-off-by: Manivannan Sadhasivam 

Thank you for the patch!

Reviewed-by: Yoshihiro Shimoda 

Best regards,
Yoshihiro Shimoda

> ---
>  drivers/pci/controller/dwc/pcie-designware-ep.c | 4 ++--
>  drivers/pci/controller/dwc/pcie-designware.h| 4 ++--
>  drivers/pci/controller/dwc/pcie-rcar-gen4.c | 2 +-
>  3 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
> b/drivers/pci/controller/dwc/pcie-designware-ep.c
> index d305f9b4cdfe..2b11290aab4c 100644
> --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> @@ -564,7 +564,7 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 
> func_no,
>   return 0;
>  }
> 
> -void dw_pcie_ep_exit(struct dw_pcie_ep *ep)
> +void dw_pcie_ep_deinit(struct dw_pcie_ep *ep)
>  {
>   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
>   struct pci_epc *epc = ep->epc;
> @@ -576,7 +576,7 @@ void dw_pcie_ep_exit(struct dw_pcie_ep *ep)
> 
>   pci_epc_mem_exit(epc);
>  }
> -EXPORT_SYMBOL_GPL(dw_pcie_ep_exit);
> +EXPORT_SYMBOL_GPL(dw_pcie_ep_deinit);
> 
>  static unsigned int dw_pcie_ep_find_ext_capability(struct dw_pcie *pci, int 
> cap)
>  {
> diff --git a/drivers/pci/controller/dwc/pcie-designware.h 
> b/drivers/pci/controller/dwc/pcie-designware.h
> index ab7431a37209..61465203bb60 100644
> --- a/drivers/pci/controller/dwc/pcie-designware.h
> +++ b/drivers/pci/controller/dwc/pcie-designware.h
> @@ -671,7 +671,7 @@ void dw_pcie_ep_linkup(struct dw_pcie_ep *ep);
>  int dw_pcie_ep_init(struct dw_pcie_ep *ep);
>  int dw_pcie_ep_init_complete(struct dw_pcie_ep *ep);
>  void dw_pcie_ep_init_notify(struct dw_pcie_ep *ep);
> -void dw_pcie_ep_exit(struct dw_pcie_ep *ep);
> +void dw_pcie_ep_deinit(struct dw_pcie_ep *ep);
>  int dw_pcie_ep_raise_intx_irq(struct dw_pcie_ep *ep, u8 func_no);
>  int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no,
>u8 interrupt_num);
> @@ -701,7 +701,7 @@ static inline void dw_pcie_ep_init_notify(struct 
> dw_pcie_ep *ep)
>  {
>  }
> 
> -static inline void dw_pcie_ep_exit(struct dw_pcie_ep *ep)
> +static inline void dw_pcie_ep_deinit(struct dw_pcie_ep *ep)
>  {
>  }
> 
> diff --git a/drivers/pci/controller/dwc/pcie-rcar-gen4.c 
> b/drivers/pci/controller/dwc/pcie-rcar-gen4.c
> index ac97d594ea47..9d9d22e367bb 100644
> --- a/drivers/pci/controller/dwc/pcie-rcar-gen4.c
> +++ b/drivers/pci/controller/dwc/pcie-rcar-gen4.c
> @@ -430,7 +430,7 @@ static int rcar_gen4_add_dw_pcie_ep(struct rcar_gen4_pcie 
> *rcar)
> 
>  static void rcar_gen4_remove_dw_pcie_ep(struct rcar_gen4_pcie *rcar)
>  {
> - dw_pcie_ep_exit(>dw.ep);
> + dw_pcie_ep_deinit(>dw.ep);
>   rcar_gen4_pcie_ep_deinit(rcar);
>  }
> 
> 
> --
> 2.25.1



RE: [PATCH v9 01/10] PCI: dwc: ep: Remove deinit() callback from struct dw_pcie_ep_ops

2024-03-08 Thread Yoshihiro Shimoda
Hello Manivannan,

> From: Manivannan Sadhasivam, Sent: Monday, March 4, 2024 6:22 PM
> 
> deinit() callback was solely introduced for the pcie-rcar-gen4 driver where
> it is used to do platform specific resource deallocation. And this callback
> is called right at the end of the dw_pcie_ep_exit() API. So it doesn't
> matter whether it is called within or outside of dw_pcie_ep_exit() API.
> 
> So let's remove this callback and directly call rcar_gen4_pcie_ep_deinit()
> in pcie-rcar-gen4 driver to do resource deallocation after the completion
> of dw_pcie_ep_exit() API in rcar_gen4_remove_dw_pcie_ep().
> 
> This simplifies the DWC layer.
> 
> Reviewed-by: Frank Li 
> Signed-off-by: Manivannan Sadhasivam 

Thank you for the patch!

Reviewed-by: Yoshihiro Shimoda 

Best regards,
Yoshihiro Shimoda

> ---
>  drivers/pci/controller/dwc/pcie-designware-ep.c |  9 +
>  drivers/pci/controller/dwc/pcie-designware.h|  1 -
>  drivers/pci/controller/dwc/pcie-rcar-gen4.c | 14 --
>  3 files changed, 9 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
> b/drivers/pci/controller/dwc/pcie-designware-ep.c
> index 5befed2dc02b..d305f9b4cdfe 100644
> --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> @@ -575,9 +575,6 @@ void dw_pcie_ep_exit(struct dw_pcie_ep *ep)
> epc->mem->window.page_size);
> 
>   pci_epc_mem_exit(epc);
> -
> - if (ep->ops->deinit)
> - ep->ops->deinit(ep);
>  }
>  EXPORT_SYMBOL_GPL(dw_pcie_ep_exit);
> 
> @@ -738,7 +735,7 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep)
>  ep->page_size);
>   if (ret < 0) {
>   dev_err(dev, "Failed to initialize address space\n");
> - goto err_ep_deinit;
> + return ret;
>   }
> 
>   ep->msi_mem = pci_epc_mem_alloc_addr(epc, >msi_mem_phys,
> @@ -775,10 +772,6 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep)
>  err_exit_epc_mem:
>   pci_epc_mem_exit(epc);
> 
> -err_ep_deinit:
> - if (ep->ops->deinit)
> - ep->ops->deinit(ep);
> -
>   return ret;
>  }
>  EXPORT_SYMBOL_GPL(dw_pcie_ep_init);
> diff --git a/drivers/pci/controller/dwc/pcie-designware.h 
> b/drivers/pci/controller/dwc/pcie-designware.h
> index 26dae4837462..ab7431a37209 100644
> --- a/drivers/pci/controller/dwc/pcie-designware.h
> +++ b/drivers/pci/controller/dwc/pcie-designware.h
> @@ -333,7 +333,6 @@ struct dw_pcie_rp {
>  struct dw_pcie_ep_ops {
>   void(*pre_init)(struct dw_pcie_ep *ep);
>   void(*init)(struct dw_pcie_ep *ep);
> - void(*deinit)(struct dw_pcie_ep *ep);
>   int (*raise_irq)(struct dw_pcie_ep *ep, u8 func_no,
>unsigned int type, u16 interrupt_num);
>   const struct pci_epc_features* (*get_features)(struct dw_pcie_ep *ep);
> diff --git a/drivers/pci/controller/dwc/pcie-rcar-gen4.c 
> b/drivers/pci/controller/dwc/pcie-rcar-gen4.c
> index e9166619b1f9..ac97d594ea47 100644
> --- a/drivers/pci/controller/dwc/pcie-rcar-gen4.c
> +++ b/drivers/pci/controller/dwc/pcie-rcar-gen4.c
> @@ -352,11 +352,8 @@ static void rcar_gen4_pcie_ep_init(struct dw_pcie_ep *ep)
>   dw_pcie_ep_reset_bar(pci, bar);
>  }
> 
> -static void rcar_gen4_pcie_ep_deinit(struct dw_pcie_ep *ep)
> +static void rcar_gen4_pcie_ep_deinit(struct rcar_gen4_pcie *rcar)
>  {
> - struct dw_pcie *dw = to_dw_pcie_from_ep(ep);
> - struct rcar_gen4_pcie *rcar = to_rcar_gen4_pcie(dw);
> -
>   writel(0, rcar->base + PCIEDMAINTSTSEN);
>   rcar_gen4_pcie_common_deinit(rcar);
>  }
> @@ -408,7 +405,6 @@ static unsigned int 
> rcar_gen4_pcie_ep_get_dbi2_offset(struct dw_pcie_ep *ep,
>  static const struct dw_pcie_ep_ops pcie_ep_ops = {
>   .pre_init = rcar_gen4_pcie_ep_pre_init,
>   .init = rcar_gen4_pcie_ep_init,
> - .deinit = rcar_gen4_pcie_ep_deinit,
>   .raise_irq = rcar_gen4_pcie_ep_raise_irq,
>   .get_features = rcar_gen4_pcie_ep_get_features,
>   .get_dbi_offset = rcar_gen4_pcie_ep_get_dbi_offset,
> @@ -418,18 +414,24 @@ static const struct dw_pcie_ep_ops pcie_ep_ops = {
>  static int rcar_gen4_add_dw_pcie_ep(struct rcar_gen4_pcie *rcar)
>  {
>   struct dw_pcie_ep *ep = >dw.ep;
> + int ret;
> 
>   if (!IS_ENABLED(CONFIG_PCIE_RCAR_GEN4_EP))
>   return -ENODEV;
> 
>   ep->ops = _ep_ops;
> 
> - return dw_pcie_ep_init(ep);
> + ret = dw_pcie_ep_init(ep);
> + if (ret)
> + rcar_gen4_pcie_ep_deinit(rcar);
> +
> + return ret;
>  }
> 
>  static void rcar_gen4_remove_dw_pcie_ep(struct rcar_gen4_pcie *rcar)
>  {
>   dw_pcie_ep_exit(>dw.ep);
> + rcar_gen4_pcie_ep_deinit(rcar);
>  }
> 
>  /* Common */
> 
> --
> 2.25.1



Re: [PATCH v9 06/10] PCI: dwc: ep: Call dw_pcie_ep_init_registers() API directly from all glue drivers

2024-03-08 Thread Niklas Cassel
On Fri, Mar 08, 2024 at 03:19:47PM +0530, Manivannan Sadhasivam wrote:
> > > > > @@ -467,6 +467,13 @@ static int dra7xx_add_pcie_ep(struct dra7xx_pcie 
> > > > > *dra7xx,
> > > > >   return ret;
> > > > >   }
> > > > >  
> > > > > + ret = dw_pcie_ep_init_registers(ep);
> > > > > + if (ret) {
> > > > 
> > > > Here you are using if (ret) to error check the return from
> > > > dw_pcie_ep_init_registers().
> > > > 
> > > > 
> > > > > index c0c62533a3f1..8392894ed286 100644
> > > > > --- a/drivers/pci/controller/dwc/pci-keystone.c
> > > > > +++ b/drivers/pci/controller/dwc/pci-keystone.c
> > > > > @@ -1286,6 +1286,13 @@ static int ks_pcie_probe(struct 
> > > > > platform_device *pdev)
> > > > >   ret = dw_pcie_ep_init(>ep);
> > > > >   if (ret < 0)
> > > > >   goto err_get_sync;
> > > > > +
> > > > > + ret = dw_pcie_ep_init_registers(>ep);
> > > > > + if (ret < 0) {
> > > > 
> > > > Here you are using if (ret < 0) to error check the return from
> > > > dw_pcie_ep_init_registers(). Please be consistent.
> > > > 
> > > 
> > > I maintained the consistency w.r.t individual drivers. Please check them
> > > individually.
> > > 
> > > If I maintain consistency w.r.t this patch, then the style will change 
> > > within
> > > the drivers.
> > 
> > Personally, I disagree with that.
> > 
> > All glue drivers should use the same way of checking dw_pcie_ep_init(),
> > depending on the kdoc of dw_pcie_ep_init().
> > 
> > If the kdoc for dw_pcie_ep_init() says returns 0 on success,
> > then I think that it is strictly more correct to do:
> > 
> > ret = dw_pcie_ep_init()
> > if (ret) {
> > 
> > }
> > 
> > And if a glue driver doesn't look like that, then I think we should change
> > them. (Same reasoning for dw_pcie_ep_init_registers().)
> > 
> > 
> > If you read code that looks like:
> > ret = dw_pcie_ep_init()
> > if (ret < 0) {
> > 
> > }
> > 
> > then you assume that is is a function with a kdoc that says it can return 0
> > or a positive value on success, e.g. a function that returns an index in an
> > array.
> > 
> 
> But if you read the same function from the individual drivers, it could 
> present
> a different opinion because the samantics is different than others.

Is there any glue driver where a positive result from dw_pcie_ep_init() is
considered valid?


> 
> I'm not opposed to keeping the API semantics consistent, but we have to take
> account of the drivers style as well.

kdoc > "driver style"
IMO, but you are the maintainer, I just offered my 50 cents :)


Kind regards,
Niklas


Re: [PATCH] powerpc: align memory_limit to 16MB in early_parse_mem

2024-03-08 Thread Aneesh Kumar K . V
Joel Savitz  writes:

> On 64-bit powerpc, usage of a non-16MB-aligned value for the mem= kernel
> cmdline parameter results in a system hang at boot.
>
> For example, using 'mem=4198400K' will always reproduce this issue.
>
> This patch fixes the problem by aligning any argument to mem= to 16MB
> corresponding with the large page size on powerpc.
>
> Fixes: 2babf5c2ec2f ("[PATCH] powerpc: Unify mem= handling")
> Co-developed-by: Gonzalo Siero 
> Signed-off-by: Gonzalo Siero 
> Signed-off-by: Joel Savitz 
> ---
>  arch/powerpc/kernel/prom.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index 0b5878c3125b..8cd3e2445d8a 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -82,8 +82,12 @@ static int __init early_parse_mem(char *p)
>  {
>   if (!p)
>   return 1;
> -
> +#ifdef CONFIG_PPC64
> + /* Align to 16 MB == size of ppc64 large page */
> + memory_limit = ALIGN(memparse(p, ), 0x100);
> +#else
>   memory_limit = PAGE_ALIGN(memparse(p, ));
> +#endif
>   DBG("memory limit = 0x%llx\n", memory_limit);
>  
>   return 0;
> -- 
> 2.43.0

Can you try this change?

commit bc55e1aa71f545cff31e1eccdb4a2e39df84
Author: Aneesh Kumar K.V (IBM) 
Date:   Fri Mar 8 14:45:26 2024 +0530

powerpc/mm: Align memory_limit value specified using mem= kernel parameter

The value specified for the memory limit is used to set a restriction on
memory usage. It is important to ensure that this restriction is within
the linear map kernel address space range. The hash page table
translation uses a 16MB page size to map the kernel linear map address
space. htab_bolt_mapping() function aligns down the size of the range
while mapping kernel linear address space. Since the memblock limit is
enforced very early during boot, before we can detect the type of memory
translation (radix vs hash), we align the memory limit value specified
as a kernel parameter to 16MB. This alignment value will work for both
hash and radix translations.

Signed-off-by: Aneesh Kumar K.V (IBM) 

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 0b5878c3125b..9bd965d35352 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -824,8 +824,11 @@ void __init early_init_devtree(void *params)
reserve_crashkernel();
early_reserve_mem();
 
-   /* Ensure that total memory size is page-aligned. */
-   limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
+   if (memory_limit > memblock_phys_mem_size())
+   memory_limit = 0;
+
+   /* Align down to 16 MB which is large page size with hash page 
translation */
+   limit = ALIGN_DOWN(memory_limit ?: memblock_phys_mem_size(), SZ_16M);
memblock_enforce_memory_limit(limit);
 
 #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_4K_PAGES)
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index e67effdba85c..d6410549e141 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -817,8 +817,8 @@ static void __init early_cmdline_parse(void)
opt += 4;
prom_memory_limit = prom_memparse(opt, (const char **));
 #ifdef CONFIG_PPC64
-   /* Align to 16 MB == size of ppc64 large page */
-   prom_memory_limit = ALIGN(prom_memory_limit, 0x100);
+   /* Align down to 16 MB which is large page size with hash page 
translation */
+   prom_memory_limit = ALIGN_DOWN(prom_memory_limit, SZ_16M);
 #endif
}
 


Re: [PATCH v9 08/10] PCI: dwc: ep: Add a generic dw_pcie_ep_linkdown() API to handle LINK_DOWN event

2024-03-08 Thread Niklas Cassel
On Fri, Mar 08, 2024 at 03:16:06PM +0530, Manivannan Sadhasivam wrote:
> On Fri, Mar 08, 2024 at 09:56:33AM +0100, Niklas Cassel wrote:
> > On Fri, Mar 08, 2024 at 11:11:52AM +0530, Manivannan Sadhasivam wrote:
> > > On Thu, Mar 07, 2024 at 10:43:19PM +0100, Niklas Cassel wrote:
> > > > On Mon, Mar 04, 2024 at 02:52:20PM +0530, Manivannan Sadhasivam wrote:
> > > > > The PCIe link can go to LINK_DOWN state in one of the following 
> > > > > scenarios:
> > > > > 
> > > > > 1. Fundamental (PERST#)/hot/warm reset
> > > > > 2. Link transition from L2/L3 to L0
> > > > > 
> > > > > In those cases, LINK_DOWN causes some non-sticky DWC registers to 
> > > > > loose the
> > > > > state (like REBAR, PTM_CAP etc...). So the drivers need to 
> > > > > reinitialize
> > > > > them to function properly once the link comes back again.
> > > > > 
> > > > > This is not a problem for drivers supporting PERST# IRQ, since they 
> > > > > can
> > > > > reinitialize the registers in the PERST# IRQ callback. But for the 
> > > > > drivers
> > > > > not supporting PERST#, there is no way they can reinitialize the 
> > > > > registers
> > > > > other than relying on LINK_DOWN IRQ received when the link goes down. 
> > > > > So
> > > > > let's add a DWC generic API dw_pcie_ep_linkdown() that reinitializes 
> > > > > the
> > > > > non-sticky registers and also notifies the EPF drivers about link 
> > > > > going
> > > > > down.
> > > > > 
> > > > > This API can also be used by the drivers supporting PERST# to handle 
> > > > > the
> > > > > scenario (2) mentioned above.
> > > > > 
> > > > > Signed-off-by: Manivannan Sadhasivam 
> > > > > 
> > > > > ---
> > > > >  drivers/pci/controller/dwc/pcie-designware-ep.c | 111 
> > > > > ++--
> > > > >  drivers/pci/controller/dwc/pcie-designware.h|   5 ++
> > > > >  2 files changed, 72 insertions(+), 44 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
> > > > > b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > > > > index 278bdc9b2269..fed4c2936c78 100644
> > > > > --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > > > > +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > > > > @@ -14,14 +14,6 @@
> > > > >  #include 
> > > > >  #include 
> > > > >  
> > > > > -void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)
> > > > > -{
> > > > > - struct pci_epc *epc = ep->epc;
> > > > > -
> > > > > - pci_epc_linkup(epc);
> > > > > -}
> > > > > -EXPORT_SYMBOL_GPL(dw_pcie_ep_linkup);
> > > > > -
> > > > >  void dw_pcie_ep_init_notify(struct dw_pcie_ep *ep)
> > > > >  {
> > > > >   struct pci_epc *epc = ep->epc;
> > > > > @@ -603,19 +595,56 @@ static unsigned int 
> > > > > dw_pcie_ep_find_ext_capability(struct dw_pcie *pci, int cap)
> > > > >   return 0;
> > > > >  }
> > > > >  
> > > > > +static void dw_pcie_ep_init_non_sticky_registers(struct dw_pcie *pci)
> > > > > +{
> > > > > + unsigned int offset, ptm_cap_base;
> > > > > + unsigned int nbars;
> > > > > + u32 reg, i;
> > > > > +
> > > > > + offset = dw_pcie_ep_find_ext_capability(pci, 
> > > > > PCI_EXT_CAP_ID_REBAR);
> > > > > + ptm_cap_base = dw_pcie_ep_find_ext_capability(pci, 
> > > > > PCI_EXT_CAP_ID_PTM);
> > > > > +
> > > > > + dw_pcie_dbi_ro_wr_en(pci);
> > > > > +
> > > > > + if (offset) {
> > > > > + reg = dw_pcie_readl_dbi(pci, offset + PCI_REBAR_CTRL);
> > > > > + nbars = (reg & PCI_REBAR_CTRL_NBAR_MASK) >>
> > > > > + PCI_REBAR_CTRL_NBAR_SHIFT;
> > > > > +
> > > > > + for (i = 0; i < nbars; i++, offset += PCI_REBAR_CTRL)
> > > > > + dw_pcie_writel_dbi(pci, offset + PCI_REBAR_CAP, 
> > > > > 0x0);
> > > > 
> > > > If you look at PCI_REBAR_CAP, you will see that it is sticky,
> > > > but you have to actually read the databook to see that:
> > > > 
> > > > "The RESBAR_CTRL_REG_BAR_SIZE field is automatically updated
> > > > when you write to RESBAR_CAP_REG_0_REG through the DBI."
> > > > 
> > > > So the reason why we need to write this register, even though
> > > > it is sticky, is to update the RESBAR_CTRL_REG_BAR_SIZE register,
> > > > which is not sticky :)
> > > > 
> > > > (Perhaps we should add that as a comment?)
> > > > 
> > > 
> > > Yeah, makes sense.
> > 
> > Note that I add a (unrelated) comment related to REBAR_CAP in this patch:
> > https://lore.kernel.org/linux-pci/20240307111520.3303774-1-cas...@kernel.org/T/#u
> > 
> > But once we move/add code to dw_pcie_ep_init_non_sticky_registers(), I think
> > that it might be a good "rule" to have a small comment for each write in
> > dw_pcie_ep_init_non_sticky_registers() which explains why the code should be
> > in dw_pcie_ep_init_non_sticky_registers() instead of 
> > dw_pcie_ep_init_registers(),
> > even if it just a small:
> > 
> > /* Field PCI_XXX_YYY.ZZZ is non-sticky */
> > writel_dbi(pci, offset + PCI_XXX_YYY, 0);
> > 
> 
> Why? The function name itself suggests that we are 

Re: [PATCH v9 06/10] PCI: dwc: ep: Call dw_pcie_ep_init_registers() API directly from all glue drivers

2024-03-08 Thread Manivannan Sadhasivam
On Fri, Mar 08, 2024 at 10:05:11AM +0100, Niklas Cassel wrote:
> On Fri, Mar 08, 2024 at 11:06:24AM +0530, Manivannan Sadhasivam wrote:
> > On Thu, Mar 07, 2024 at 09:36:56PM +0100, Niklas Cassel wrote:
> > > On Mon, Mar 04, 2024 at 02:52:18PM +0530, Manivannan Sadhasivam wrote:
> > > > Currently, dw_pcie_ep_init_registers() API is directly called by the 
> > > > glue
> > > > drivers requiring active refclk from host. But for the other drivers, 
> > > > it is
> > > > getting called implicitly by dw_pcie_ep_init(). This is due to the fact
> > > > that this API initializes DWC EP specific registers and that requires an
> > > > active refclk (either from host or generated locally by endpoint itsef).
> > > > 
> > > > But, this causes a discrepancy among the glue drivers. So to avoid this
> > > > confusion, let's call this API directly from all glue drivers 
> > > > irrespective
> > > > of refclk dependency. Only difference here is that the drivers requiring
> > > > refclk from host will call this API only after the refclk is received 
> > > > and
> > > > other drivers without refclk dependency will call this API right after
> > > > dw_pcie_ep_init().
> > > > 
> > > > With this change, the check for 'core_init_notifier' flag can now be
> > > > dropped from dw_pcie_ep_init() API. This will also allow us to remove 
> > > > the
> > > > 'core_init_notifier' flag completely in the later commits.
> > > > 
> > > > Signed-off-by: Manivannan Sadhasivam 
> > > > ---
> > > >  drivers/pci/controller/dwc/pci-dra7xx.c   |  7 +++
> > > >  drivers/pci/controller/dwc/pci-imx6.c |  8 
> > > >  drivers/pci/controller/dwc/pci-keystone.c |  9 +
> > > >  drivers/pci/controller/dwc/pci-layerscape-ep.c|  7 +++
> > > >  drivers/pci/controller/dwc/pcie-artpec6.c | 13 -
> > > >  drivers/pci/controller/dwc/pcie-designware-ep.c   | 22 
> > > > --
> > > >  drivers/pci/controller/dwc/pcie-designware-plat.c |  9 +
> > > >  drivers/pci/controller/dwc/pcie-keembay.c | 16 +++-
> > > >  drivers/pci/controller/dwc/pcie-rcar-gen4.c   | 12 +++-
> > > >  drivers/pci/controller/dwc/pcie-uniphier-ep.c | 13 -
> > > >  10 files changed, 90 insertions(+), 26 deletions(-)
> > > > 
> > > > diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c 
> > > > b/drivers/pci/controller/dwc/pci-dra7xx.c
> > > > index 0e406677060d..395042b29ffc 100644
> > > > --- a/drivers/pci/controller/dwc/pci-dra7xx.c
> > > > +++ b/drivers/pci/controller/dwc/pci-dra7xx.c
> > > > @@ -467,6 +467,13 @@ static int dra7xx_add_pcie_ep(struct dra7xx_pcie 
> > > > *dra7xx,
> > > > return ret;
> > > > }
> > > >  
> > > > +   ret = dw_pcie_ep_init_registers(ep);
> > > > +   if (ret) {
> > > 
> > > Here you are using if (ret) to error check the return from
> > > dw_pcie_ep_init_registers().
> > > 
> > > 
> > > > index c0c62533a3f1..8392894ed286 100644
> > > > --- a/drivers/pci/controller/dwc/pci-keystone.c
> > > > +++ b/drivers/pci/controller/dwc/pci-keystone.c
> > > > @@ -1286,6 +1286,13 @@ static int ks_pcie_probe(struct platform_device 
> > > > *pdev)
> > > > ret = dw_pcie_ep_init(>ep);
> > > > if (ret < 0)
> > > > goto err_get_sync;
> > > > +
> > > > +   ret = dw_pcie_ep_init_registers(>ep);
> > > > +   if (ret < 0) {
> > > 
> > > Here you are using if (ret < 0) to error check the return from
> > > dw_pcie_ep_init_registers(). Please be consistent.
> > > 
> > 
> > I maintained the consistency w.r.t individual drivers. Please check them
> > individually.
> > 
> > If I maintain consistency w.r.t this patch, then the style will change 
> > within
> > the drivers.
> 
> Personally, I disagree with that.
> 
> All glue drivers should use the same way of checking dw_pcie_ep_init(),
> depending on the kdoc of dw_pcie_ep_init().
> 
> If the kdoc for dw_pcie_ep_init() says returns 0 on success,
> then I think that it is strictly more correct to do:
> 
> ret = dw_pcie_ep_init()
> if (ret) {
>   
> }
> 
> And if a glue driver doesn't look like that, then I think we should change
> them. (Same reasoning for dw_pcie_ep_init_registers().)
> 
> 
> If you read code that looks like:
> ret = dw_pcie_ep_init()
> if (ret < 0) {
>   
> }
> 
> then you assume that is is a function with a kdoc that says it can return 0
> or a positive value on success, e.g. a function that returns an index in an
> array.
> 

But if you read the same function from the individual drivers, it could present
a different opinion because the samantics is different than others.

I'm not opposed to keeping the API semantics consistent, but we have to take
account of the drivers style as well.

- Mani

-- 
மணிவண்ணன் சதாசிவம்


Re: [PATCH v9 08/10] PCI: dwc: ep: Add a generic dw_pcie_ep_linkdown() API to handle LINK_DOWN event

2024-03-08 Thread Manivannan Sadhasivam
On Fri, Mar 08, 2024 at 09:56:33AM +0100, Niklas Cassel wrote:
> On Fri, Mar 08, 2024 at 11:11:52AM +0530, Manivannan Sadhasivam wrote:
> > On Thu, Mar 07, 2024 at 10:43:19PM +0100, Niklas Cassel wrote:
> > > On Mon, Mar 04, 2024 at 02:52:20PM +0530, Manivannan Sadhasivam wrote:
> > > > The PCIe link can go to LINK_DOWN state in one of the following 
> > > > scenarios:
> > > > 
> > > > 1. Fundamental (PERST#)/hot/warm reset
> > > > 2. Link transition from L2/L3 to L0
> > > > 
> > > > In those cases, LINK_DOWN causes some non-sticky DWC registers to loose 
> > > > the
> > > > state (like REBAR, PTM_CAP etc...). So the drivers need to reinitialize
> > > > them to function properly once the link comes back again.
> > > > 
> > > > This is not a problem for drivers supporting PERST# IRQ, since they can
> > > > reinitialize the registers in the PERST# IRQ callback. But for the 
> > > > drivers
> > > > not supporting PERST#, there is no way they can reinitialize the 
> > > > registers
> > > > other than relying on LINK_DOWN IRQ received when the link goes down. So
> > > > let's add a DWC generic API dw_pcie_ep_linkdown() that reinitializes the
> > > > non-sticky registers and also notifies the EPF drivers about link going
> > > > down.
> > > > 
> > > > This API can also be used by the drivers supporting PERST# to handle the
> > > > scenario (2) mentioned above.
> > > > 
> > > > Signed-off-by: Manivannan Sadhasivam 
> > > > ---
> > > >  drivers/pci/controller/dwc/pcie-designware-ep.c | 111 
> > > > ++--
> > > >  drivers/pci/controller/dwc/pcie-designware.h|   5 ++
> > > >  2 files changed, 72 insertions(+), 44 deletions(-)
> > > > 
> > > > diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
> > > > b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > > > index 278bdc9b2269..fed4c2936c78 100644
> > > > --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > > > +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > > > @@ -14,14 +14,6 @@
> > > >  #include 
> > > >  #include 
> > > >  
> > > > -void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)
> > > > -{
> > > > -   struct pci_epc *epc = ep->epc;
> > > > -
> > > > -   pci_epc_linkup(epc);
> > > > -}
> > > > -EXPORT_SYMBOL_GPL(dw_pcie_ep_linkup);
> > > > -
> > > >  void dw_pcie_ep_init_notify(struct dw_pcie_ep *ep)
> > > >  {
> > > > struct pci_epc *epc = ep->epc;
> > > > @@ -603,19 +595,56 @@ static unsigned int 
> > > > dw_pcie_ep_find_ext_capability(struct dw_pcie *pci, int cap)
> > > > return 0;
> > > >  }
> > > >  
> > > > +static void dw_pcie_ep_init_non_sticky_registers(struct dw_pcie *pci)
> > > > +{
> > > > +   unsigned int offset, ptm_cap_base;
> > > > +   unsigned int nbars;
> > > > +   u32 reg, i;
> > > > +
> > > > +   offset = dw_pcie_ep_find_ext_capability(pci, 
> > > > PCI_EXT_CAP_ID_REBAR);
> > > > +   ptm_cap_base = dw_pcie_ep_find_ext_capability(pci, 
> > > > PCI_EXT_CAP_ID_PTM);
> > > > +
> > > > +   dw_pcie_dbi_ro_wr_en(pci);
> > > > +
> > > > +   if (offset) {
> > > > +   reg = dw_pcie_readl_dbi(pci, offset + PCI_REBAR_CTRL);
> > > > +   nbars = (reg & PCI_REBAR_CTRL_NBAR_MASK) >>
> > > > +   PCI_REBAR_CTRL_NBAR_SHIFT;
> > > > +
> > > > +   for (i = 0; i < nbars; i++, offset += PCI_REBAR_CTRL)
> > > > +   dw_pcie_writel_dbi(pci, offset + PCI_REBAR_CAP, 
> > > > 0x0);
> > > 
> > > If you look at PCI_REBAR_CAP, you will see that it is sticky,
> > > but you have to actually read the databook to see that:
> > > 
> > > "The RESBAR_CTRL_REG_BAR_SIZE field is automatically updated
> > > when you write to RESBAR_CAP_REG_0_REG through the DBI."
> > > 
> > > So the reason why we need to write this register, even though
> > > it is sticky, is to update the RESBAR_CTRL_REG_BAR_SIZE register,
> > > which is not sticky :)
> > > 
> > > (Perhaps we should add that as a comment?)
> > > 
> > 
> > Yeah, makes sense.
> 
> Note that I add a (unrelated) comment related to REBAR_CAP in this patch:
> https://lore.kernel.org/linux-pci/20240307111520.3303774-1-cas...@kernel.org/T/#u
> 
> But once we move/add code to dw_pcie_ep_init_non_sticky_registers(), I think
> that it might be a good "rule" to have a small comment for each write in
> dw_pcie_ep_init_non_sticky_registers() which explains why the code should be
> in dw_pcie_ep_init_non_sticky_registers() instead of 
> dw_pcie_ep_init_registers(),
> even if it just a small:
> 
> /* Field PCI_XXX_YYY.ZZZ is non-sticky */
> writel_dbi(pci, offset + PCI_XXX_YYY, 0);
> 

Why? The function name itself suggests that we are reinitializing non-sticky
registers. So a comment for each write is overkill.

- Mani

-- 
மணிவண்ணன் சதாசிவம்


Re: [PATCH v9 07/10] PCI: dwc: ep: Remove "core_init_notifier" flag

2024-03-08 Thread Manivannan Sadhasivam
On Fri, Mar 08, 2024 at 09:48:07AM +0100, Niklas Cassel wrote:
> On Fri, Mar 08, 2024 at 11:08:29AM +0530, Manivannan Sadhasivam wrote:
> > On Thu, Mar 07, 2024 at 10:09:06PM +0100, Niklas Cassel wrote:
> > > On Mon, Mar 04, 2024 at 02:52:19PM +0530, Manivannan Sadhasivam wrote:
> > > > "core_init_notifier" flag is set by the glue drivers requiring refclk 
> > > > from
> > > > the host to complete the DWC core initialization. Also, those drivers 
> > > > will
> > > > send a notification to the EPF drivers once the initialization is fully
> > > > completed using the pci_epc_init_notify() API. Only then, the EPF 
> > > > drivers
> > > > will start functioning.
> > > > 
> > > > For the rest of the drivers generating refclk locally, EPF drivers will
> > > > start functioning post binding with them. EPF drivers rely on the
> > > > 'core_init_notifier' flag to differentiate between the drivers.
> > > > Unfortunately, this creates two different flows for the EPF drivers.
> > > > 
> > > > So to avoid that, let's get rid of the "core_init_notifier" flag and 
> > > > follow
> > > > a single initialization flow for the EPF drivers. This is done by 
> > > > calling
> > > > the dw_pcie_ep_init_notify() from all glue drivers after the completion 
> > > > of
> > > > dw_pcie_ep_init_registers() API. This will allow all the glue drivers to
> > > > send the notification to the EPF drivers once the initialization is 
> > > > fully
> > > > completed.
> > > > 
> > > > Only difference here is that, the drivers requiring refclk from host 
> > > > will
> > > > send the notification once refclk is received, while others will send it
> > > > during probe time itself.
> > > > 
> > > > Reviewed-by: Frank Li 
> > > > Signed-off-by: Manivannan Sadhasivam 
> > > > ---
> > > 
> > > You have removed the .core_init_notifier from EPC drivers,
> > > but the callback in EPF drivers is still called .core_init.
> > > 
> > > Yes, this was a confusing name even before this patch, but
> > > after this patch, it is probably even worse :)
> > > 
> > > The callback should be named from the perspective of EPF drivers IMO.
> > > .core_init sounds like a EPF driver should initialize the core.
> > > (But that is of course done by the EPC driver.)
> > > 
> > > The .link_up() callback name is better, the EPF driver is informed
> > > that the link is up.
> > > 
> > > Perhaps we could rename .core_init to .core_up ?
> > > 
> > > It tells the EPF drivers that the core is now up.
> > > (And the EPF driver can configure the BARs.)
> > > 
> > 
> > I don't disagree :) I thought about it but then decided to not extend the 
> > scope
> > of this series further. So saved that for next series.
> > 
> > But yeah, it is good to clean it up here itself.
> 
> If you intend to create a .core_deinit or .core_down (or whatever name
> you decide on), perhaps it is better to leave this cleanup to be part
> of that same series?
> 

I already added a patch. So let's do it here itself :)

- Mani

-- 
மணிவண்ணன் சதாசிவம்


Re: [PATCH] powerpc: align memory_limit to 16MB in early_parse_mem

2024-03-08 Thread Michael Ellerman
Joel Savitz  writes:
> On Fri, Mar 1, 2024 at 6:23 PM Michael Ellerman  wrote:
>> Joel Savitz  writes:
>> > On 64-bit powerpc, usage of a non-16MB-aligned value for the mem= kernel
>> > cmdline parameter results in a system hang at boot.
>>
>> Can you give us any more details on that? It might be a bug we can fix.
>
> The console freezes after the following output:
>
>   Booting a command list
>
> OF stdout device is: /vdevice/vty@3000
> Preparing to boot Linux version 6.8.0-rc6.memNOfix-00120-g87adedeba51a
> (r...@ibm-p9z-26-lp11.virt.pnr.lab.eng.rdu2.redhat.com) (gcc (GCC)
> 11.4.1 20231218 (Red Hat 11.4.1-3), GNU ld version 2.35.2-43.el9) #3
> SMP Fri Mar  1 10:45:45 EST 2024
> Detected machine type: 0101
> command line: 
> BOOT_IMAGE=(ieee1275//vdevice/v-scsi@3003/disk@8100,msdos2)/vmlinuz-6.8.0-rc6.memNOfix-00120-g87adedeba51a
> root=/dev/mapper/rhel_ibm--p9z--26--lp11-root ro
> crashkernel=2G-4G:384M,4G-16G:512M,16G-64G:1G,64G-128G:2G,128G-:4G
> rd.lvm.lv=rhel_ibm-p9z-26-lp11/root
> rd.lvm.lv=rhel_ibm-p9z-26-lp11/swap mem=4198400K
> Max number of cores passed to firmware: 256 (NR_CPUS = 2048)
> Calling ibm,client-architecture-support... done
> Ignoring mem=00010100 >= ram_top.
> memory layout at init:
>   memory_limit :  (16 MB aligned)
>   alloc_bottom : 114f
>   alloc_top: 2000
>   alloc_top_hi : 2000
>   rmo_top  : 2000
>   ram_top  : 2000
> instantiating rtas at 0x1ecb... done
> prom_hold_cpus: skipped
> copying OF device tree...
> Building dt strings...
> Building dt structure...
> Device tree strings 0x1150 -> 0x115017b7
> Device tree struct  0x1151 -> 0x1152
> Quiescing Open Firmware ...
> Booting Linux via __start() @ 0x0a6e ...

Thanks.

I haven't been able to reproduce this unfortunately, and I don't see the
bug. As Aneesh pointed out the code should be aligning later anyway.

Can you build a kernel with CONFIG_PPC_EARLY_DEBUG_LPAR=y and boot it
without the patch? That should hopefully give you some more output.

cheers


Re: [PATCH v9 06/10] PCI: dwc: ep: Call dw_pcie_ep_init_registers() API directly from all glue drivers

2024-03-08 Thread Niklas Cassel
On Fri, Mar 08, 2024 at 11:06:24AM +0530, Manivannan Sadhasivam wrote:
> On Thu, Mar 07, 2024 at 09:36:56PM +0100, Niklas Cassel wrote:
> > On Mon, Mar 04, 2024 at 02:52:18PM +0530, Manivannan Sadhasivam wrote:
> > > Currently, dw_pcie_ep_init_registers() API is directly called by the glue
> > > drivers requiring active refclk from host. But for the other drivers, it 
> > > is
> > > getting called implicitly by dw_pcie_ep_init(). This is due to the fact
> > > that this API initializes DWC EP specific registers and that requires an
> > > active refclk (either from host or generated locally by endpoint itsef).
> > > 
> > > But, this causes a discrepancy among the glue drivers. So to avoid this
> > > confusion, let's call this API directly from all glue drivers irrespective
> > > of refclk dependency. Only difference here is that the drivers requiring
> > > refclk from host will call this API only after the refclk is received and
> > > other drivers without refclk dependency will call this API right after
> > > dw_pcie_ep_init().
> > > 
> > > With this change, the check for 'core_init_notifier' flag can now be
> > > dropped from dw_pcie_ep_init() API. This will also allow us to remove the
> > > 'core_init_notifier' flag completely in the later commits.
> > > 
> > > Signed-off-by: Manivannan Sadhasivam 
> > > ---
> > >  drivers/pci/controller/dwc/pci-dra7xx.c   |  7 +++
> > >  drivers/pci/controller/dwc/pci-imx6.c |  8 
> > >  drivers/pci/controller/dwc/pci-keystone.c |  9 +
> > >  drivers/pci/controller/dwc/pci-layerscape-ep.c|  7 +++
> > >  drivers/pci/controller/dwc/pcie-artpec6.c | 13 -
> > >  drivers/pci/controller/dwc/pcie-designware-ep.c   | 22 
> > > --
> > >  drivers/pci/controller/dwc/pcie-designware-plat.c |  9 +
> > >  drivers/pci/controller/dwc/pcie-keembay.c | 16 +++-
> > >  drivers/pci/controller/dwc/pcie-rcar-gen4.c   | 12 +++-
> > >  drivers/pci/controller/dwc/pcie-uniphier-ep.c | 13 -
> > >  10 files changed, 90 insertions(+), 26 deletions(-)
> > > 
> > > diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c 
> > > b/drivers/pci/controller/dwc/pci-dra7xx.c
> > > index 0e406677060d..395042b29ffc 100644
> > > --- a/drivers/pci/controller/dwc/pci-dra7xx.c
> > > +++ b/drivers/pci/controller/dwc/pci-dra7xx.c
> > > @@ -467,6 +467,13 @@ static int dra7xx_add_pcie_ep(struct dra7xx_pcie 
> > > *dra7xx,
> > >   return ret;
> > >   }
> > >  
> > > + ret = dw_pcie_ep_init_registers(ep);
> > > + if (ret) {
> > 
> > Here you are using if (ret) to error check the return from
> > dw_pcie_ep_init_registers().
> > 
> > 
> > > index c0c62533a3f1..8392894ed286 100644
> > > --- a/drivers/pci/controller/dwc/pci-keystone.c
> > > +++ b/drivers/pci/controller/dwc/pci-keystone.c
> > > @@ -1286,6 +1286,13 @@ static int ks_pcie_probe(struct platform_device 
> > > *pdev)
> > >   ret = dw_pcie_ep_init(>ep);
> > >   if (ret < 0)
> > >   goto err_get_sync;
> > > +
> > > + ret = dw_pcie_ep_init_registers(>ep);
> > > + if (ret < 0) {
> > 
> > Here you are using if (ret < 0) to error check the return from
> > dw_pcie_ep_init_registers(). Please be consistent.
> > 
> 
> I maintained the consistency w.r.t individual drivers. Please check them
> individually.
> 
> If I maintain consistency w.r.t this patch, then the style will change within
> the drivers.

Personally, I disagree with that.

All glue drivers should use the same way of checking dw_pcie_ep_init(),
depending on the kdoc of dw_pcie_ep_init().

If the kdoc for dw_pcie_ep_init() says returns 0 on success,
then I think that it is strictly more correct to do:

ret = dw_pcie_ep_init()
if (ret) {

}

And if a glue driver doesn't look like that, then I think we should change
them. (Same reasoning for dw_pcie_ep_init_registers().)


If you read code that looks like:
ret = dw_pcie_ep_init()
if (ret < 0) {

}

then you assume that is is a function with a kdoc that says it can return 0
or a positive value on success, e.g. a function that returns an index in an
array.


Kind regards,
Niklas


Re: [PATCH v9 08/10] PCI: dwc: ep: Add a generic dw_pcie_ep_linkdown() API to handle LINK_DOWN event

2024-03-08 Thread Niklas Cassel
On Fri, Mar 08, 2024 at 11:11:52AM +0530, Manivannan Sadhasivam wrote:
> On Thu, Mar 07, 2024 at 10:43:19PM +0100, Niklas Cassel wrote:
> > On Mon, Mar 04, 2024 at 02:52:20PM +0530, Manivannan Sadhasivam wrote:
> > > The PCIe link can go to LINK_DOWN state in one of the following scenarios:
> > > 
> > > 1. Fundamental (PERST#)/hot/warm reset
> > > 2. Link transition from L2/L3 to L0
> > > 
> > > In those cases, LINK_DOWN causes some non-sticky DWC registers to loose 
> > > the
> > > state (like REBAR, PTM_CAP etc...). So the drivers need to reinitialize
> > > them to function properly once the link comes back again.
> > > 
> > > This is not a problem for drivers supporting PERST# IRQ, since they can
> > > reinitialize the registers in the PERST# IRQ callback. But for the drivers
> > > not supporting PERST#, there is no way they can reinitialize the registers
> > > other than relying on LINK_DOWN IRQ received when the link goes down. So
> > > let's add a DWC generic API dw_pcie_ep_linkdown() that reinitializes the
> > > non-sticky registers and also notifies the EPF drivers about link going
> > > down.
> > > 
> > > This API can also be used by the drivers supporting PERST# to handle the
> > > scenario (2) mentioned above.
> > > 
> > > Signed-off-by: Manivannan Sadhasivam 
> > > ---
> > >  drivers/pci/controller/dwc/pcie-designware-ep.c | 111 
> > > ++--
> > >  drivers/pci/controller/dwc/pcie-designware.h|   5 ++
> > >  2 files changed, 72 insertions(+), 44 deletions(-)
> > > 
> > > diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
> > > b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > > index 278bdc9b2269..fed4c2936c78 100644
> > > --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > > +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > > @@ -14,14 +14,6 @@
> > >  #include 
> > >  #include 
> > >  
> > > -void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)
> > > -{
> > > - struct pci_epc *epc = ep->epc;
> > > -
> > > - pci_epc_linkup(epc);
> > > -}
> > > -EXPORT_SYMBOL_GPL(dw_pcie_ep_linkup);
> > > -
> > >  void dw_pcie_ep_init_notify(struct dw_pcie_ep *ep)
> > >  {
> > >   struct pci_epc *epc = ep->epc;
> > > @@ -603,19 +595,56 @@ static unsigned int 
> > > dw_pcie_ep_find_ext_capability(struct dw_pcie *pci, int cap)
> > >   return 0;
> > >  }
> > >  
> > > +static void dw_pcie_ep_init_non_sticky_registers(struct dw_pcie *pci)
> > > +{
> > > + unsigned int offset, ptm_cap_base;
> > > + unsigned int nbars;
> > > + u32 reg, i;
> > > +
> > > + offset = dw_pcie_ep_find_ext_capability(pci, PCI_EXT_CAP_ID_REBAR);
> > > + ptm_cap_base = dw_pcie_ep_find_ext_capability(pci, PCI_EXT_CAP_ID_PTM);
> > > +
> > > + dw_pcie_dbi_ro_wr_en(pci);
> > > +
> > > + if (offset) {
> > > + reg = dw_pcie_readl_dbi(pci, offset + PCI_REBAR_CTRL);
> > > + nbars = (reg & PCI_REBAR_CTRL_NBAR_MASK) >>
> > > + PCI_REBAR_CTRL_NBAR_SHIFT;
> > > +
> > > + for (i = 0; i < nbars; i++, offset += PCI_REBAR_CTRL)
> > > + dw_pcie_writel_dbi(pci, offset + PCI_REBAR_CAP, 0x0);
> > 
> > If you look at PCI_REBAR_CAP, you will see that it is sticky,
> > but you have to actually read the databook to see that:
> > 
> > "The RESBAR_CTRL_REG_BAR_SIZE field is automatically updated
> > when you write to RESBAR_CAP_REG_0_REG through the DBI."
> > 
> > So the reason why we need to write this register, even though
> > it is sticky, is to update the RESBAR_CTRL_REG_BAR_SIZE register,
> > which is not sticky :)
> > 
> > (Perhaps we should add that as a comment?)
> > 
> 
> Yeah, makes sense.

Note that I add a (unrelated) comment related to REBAR_CAP in this patch:
https://lore.kernel.org/linux-pci/20240307111520.3303774-1-cas...@kernel.org/T/#u

But once we move/add code to dw_pcie_ep_init_non_sticky_registers(), I think
that it might be a good "rule" to have a small comment for each write in
dw_pcie_ep_init_non_sticky_registers() which explains why the code should be
in dw_pcie_ep_init_non_sticky_registers() instead of 
dw_pcie_ep_init_registers(),
even if it just a small:

/* Field PCI_XXX_YYY.ZZZ is non-sticky */
writel_dbi(pci, offset + PCI_XXX_YYY, 0);


Kind regards,
Niklas


Re: [PATCH v9 07/10] PCI: dwc: ep: Remove "core_init_notifier" flag

2024-03-08 Thread Niklas Cassel
On Fri, Mar 08, 2024 at 11:08:29AM +0530, Manivannan Sadhasivam wrote:
> On Thu, Mar 07, 2024 at 10:09:06PM +0100, Niklas Cassel wrote:
> > On Mon, Mar 04, 2024 at 02:52:19PM +0530, Manivannan Sadhasivam wrote:
> > > "core_init_notifier" flag is set by the glue drivers requiring refclk from
> > > the host to complete the DWC core initialization. Also, those drivers will
> > > send a notification to the EPF drivers once the initialization is fully
> > > completed using the pci_epc_init_notify() API. Only then, the EPF drivers
> > > will start functioning.
> > > 
> > > For the rest of the drivers generating refclk locally, EPF drivers will
> > > start functioning post binding with them. EPF drivers rely on the
> > > 'core_init_notifier' flag to differentiate between the drivers.
> > > Unfortunately, this creates two different flows for the EPF drivers.
> > > 
> > > So to avoid that, let's get rid of the "core_init_notifier" flag and 
> > > follow
> > > a single initialization flow for the EPF drivers. This is done by calling
> > > the dw_pcie_ep_init_notify() from all glue drivers after the completion of
> > > dw_pcie_ep_init_registers() API. This will allow all the glue drivers to
> > > send the notification to the EPF drivers once the initialization is fully
> > > completed.
> > > 
> > > Only difference here is that, the drivers requiring refclk from host will
> > > send the notification once refclk is received, while others will send it
> > > during probe time itself.
> > > 
> > > Reviewed-by: Frank Li 
> > > Signed-off-by: Manivannan Sadhasivam 
> > > ---
> > 
> > You have removed the .core_init_notifier from EPC drivers,
> > but the callback in EPF drivers is still called .core_init.
> > 
> > Yes, this was a confusing name even before this patch, but
> > after this patch, it is probably even worse :)
> > 
> > The callback should be named from the perspective of EPF drivers IMO.
> > .core_init sounds like a EPF driver should initialize the core.
> > (But that is of course done by the EPC driver.)
> > 
> > The .link_up() callback name is better, the EPF driver is informed
> > that the link is up.
> > 
> > Perhaps we could rename .core_init to .core_up ?
> > 
> > It tells the EPF drivers that the core is now up.
> > (And the EPF driver can configure the BARs.)
> > 
> 
> I don't disagree :) I thought about it but then decided to not extend the 
> scope
> of this series further. So saved that for next series.
> 
> But yeah, it is good to clean it up here itself.

If you intend to create a .core_deinit or .core_down (or whatever name
you decide on), perhaps it is better to leave this cleanup to be part
of that same series?


Kind regards,
Niklas


Re: [PATCH v4] powerpc: Avoid nmi_enter/nmi_exit in real mode interrupt.

2024-03-08 Thread Michael Ellerman
Aneesh Kumar K V  writes:
> On 3/7/24 5:13 PM, Michael Ellerman wrote:
>> Mahesh Salgaonkar  writes:
>>> nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel
>>> crash when invoked during real mode interrupt handling (e.g. early HMI/MCE
>>> interrupt handler) if percpu allocation comes from vmalloc area.
>>>
>>> Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI()
>>> wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when
>>> percpu allocation is from the embedded first chunk. However with
>>> CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu
>>> allocation can come from the vmalloc area.
>>>
>>> With kernel command line "percpu_alloc=page" we can force percpu allocation
>>> to come from vmalloc area and can see kernel crash in machine_check_early:
>>>
>>> [1.215714] NIP [c0e49eb4] rcu_nmi_enter+0x24/0x110
>>> [1.215717] LR [c00461a0] machine_check_early+0xf0/0x2c0
>>> [1.215719] --- interrupt: 200
>>> [1.215720] [c00fffd73180] [] 0x0 (unreliable)
>>> [1.215722] [c00fffd731b0] [] 0x0
>>> [1.215724] [c00fffd73210] [c0008364] 
>>> machine_check_early_common+0x134/0x1f8
>>>
>>> Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu
>>> first chunk is not embedded.
>> 
>> My system (powernv) doesn't even boot with percpu_alloc=page.
>
>
> Can you share the crash details?

Yes but it's not pretty :)

  [1.725257][  T714] systemd-journald[714]: Collecting audit messages is 
disabled.
  [1.729401][T1] systemd[1]: Finished 
systemd-tmpfiles-setup-dev.service - Create Static Device Nodes in /dev.
  [^[[0;32m  OK  ^[[0m] Finished ^[[0;1;39msystemd-tmpfiles-…reate Static 
Device Nodes in /dev.
  [1.773902][   C22] Disabling lock debugging due to kernel taint
  [1.773905][   C23] Oops: Machine check, sig: 7 [#1]
  [1.773911][   C23] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA 
PowerNV
  [1.773916][   C23] Modules linked in:
  [1.773920][   C23] CPU: 23 PID: 0 Comm: swapper/23 Tainted: G   M 
  6.8.0-rc7-02500-g23515c370cbb #1
  [1.773924][   C23] Hardware name: 8335-GTH POWER9 0x4e1202 
opal:skiboot-v6.5.3-35-g1851b2a06 PowerNV
  [1.773926][   C23] NIP:   LR:  CTR: 

  [1.773929][   C23] REGS: c00fffa6ef50 TRAP:    Tainted: G   M 
   (6.8.0-rc7-02500-g23515c370cbb)
  [1.773932][   C23] MSR:   <>  CR:   XER: 
  [1.773937][   C23] CFAR:  IRQMASK: 3 
  [1.773937][   C23] GPR00:  c00fffa6efe0 
c00fffa6efb0  
  [1.773937][   C23] GPR04: c003d8c0 c1f5f000 
 0103 
  [1.773937][   C23] GPR08: 0003 653a0d962a590300 
  
  [1.773937][   C23] GPR12: c00fffa6f280  
c00084a4  
  [1.773937][   C23] GPR16: 53474552  
c003d8c0 c00fffa6f280 
  [1.773937][   C23] GPR20: c1f5f000 c00fffa6f340 
c00fffa6f2e8  
  [1.773937][   C23] GPR24: 0007fecf c65bbb80 
00550102 c2172b20 
  [1.773937][   C23] GPR28:  53474552 
 c00c6d80 
  [1.773982][   C23] NIP [] 0x0
  [1.773988][   C23] LR [] 0x0
  [1.773990][   C23] Call Trace:
  [1.773991][   C23] [c00fffa6efe0] [c1f5f000] 
.TOC.+0x0/0xa1000 (unreliable)
  [1.773999][   C23] Code:      
        
   
  [1.774021][   C23] ---[ end trace  ]---

Something has gone badly wrong.

That was a test kernel with some other commits, but nothing that should
cause that. Removing percpu_alloc=page fix it.

It's based on fddff98e83b4b4d54470902ea0d520c4d423ca3b.

>> AFAIK the only reason we added support for it was to handle 4K kernels
>> with HPT. See commit eb553f16973a ("powerpc/64/mm: implement page
>> mapping percpu first chunk allocator").
>> 
>> So I wonder if we should change the Kconfig to only offer it as an
>> option in that case, and change the logic in setup_per_cpu_areas() to
>> only use it as a last resort.
>> 
>> I guess we probably still need this commit though, even if just for 4K HPT.
>> 
>>
> We have also observed some error when we have large gap between the start 
> memory of
> NUMA nodes. That made the percpu offset really large causing boot failures 
> even on 64K.

Yeah, I have vague memories of that :)

cheers