Re: [PATCH v2] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs

2021-08-10 Thread Laurent Vivier
On 07/08/2021 09:20, Cédric Le Goater wrote:
> On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at
> runtime. Today, the IPI is not created for such nodes, and hot-plugged
> CPUs use a bogus IPI, which leads to soft lockups.
> 
> We can not directly allocate and request the IPI on demand because
> bringup_cpu() is called under the IRQ sparse lock. The alternative is
> to allocate the IPIs for all possible nodes at startup and to request
> the mapping on demand when the first CPU of a node is brought up.
> 
> Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node")
> Cc: sta...@vger.kernel.org # v5.13
> Reported-by: Geetika Moolchandani 
> Cc: Srikar Dronamraju 
> Cc: Laurent Vivier 
> Signed-off-by: Cédric Le Goater 
> Message-Id: <20210629131542.743888-1-...@kaod.org>
> Signed-off-by: Cédric Le Goater 
> ---
>  arch/powerpc/sysdev/xive/common.c | 35 +--
>  1 file changed, 24 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/sysdev/xive/common.c 
> b/arch/powerpc/sysdev/xive/common.c
> index dbdbbc2f1dc5..943fd30095af 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domain;
>  static struct xive_ipi_desc {
>   unsigned int irq;
>   char name[16];
> + atomic_t started;
>  } *xive_ipis;
>  
>  /*
> @@ -1120,7 +1121,7 @@ static const struct irq_domain_ops 
> xive_ipi_irq_domain_ops = {
>   .alloc  = xive_ipi_irq_domain_alloc,
>  };
>  
> -static int __init xive_request_ipi(void)
> +static int __init xive_init_ipis(void)
>  {
>   struct fwnode_handle *fwnode;
>   struct irq_domain *ipi_domain;
> @@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void)
>   struct xive_ipi_desc *xid = &xive_ipis[node];
>   struct xive_ipi_alloc_info info = { node };
>  
> - /* Skip nodes without CPUs */
> - if (cpumask_empty(cpumask_of_node(node)))
> - continue;
> -
>   /*
>* Map one IPI interrupt per node for all cpus of that node.
>* Since the HW interrupt number doesn't have any meaning,
> @@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void)
>   xid->irq = ret;
>  
>   snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
> -
> - ret = request_irq(xid->irq, xive_muxed_ipi_action,
> -   IRQF_PERCPU | IRQF_NO_THREAD, xid->name, 
> NULL);
> -
> - WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
>   }
>  
>   return ret;
> @@ -1178,6 +1170,22 @@ static int __init xive_request_ipi(void)
>   return ret;
>  }
>  
> +static int __init xive_request_ipi(unsigned int cpu)
> +{
> + struct xive_ipi_desc *xid = &xive_ipis[early_cpu_to_node(cpu)];
> + int ret;
> +
> + if (atomic_inc_return(&xid->started) > 1)
> + return 0;
> +
> + ret = request_irq(xid->irq, xive_muxed_ipi_action,
> +   IRQF_PERCPU | IRQF_NO_THREAD,
> +   xid->name, NULL);
> +
> + WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
> + return ret;
> +}
> +
>  static int xive_setup_cpu_ipi(unsigned int cpu)
>  {
>   unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
> @@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned int cpu)
>   if (xc->hw_ipi != XIVE_BAD_IRQ)
>   return 0;
>  
> + /* Register the IPI */
> + xive_request_ipi(cpu);
> +
>   /* Grab an IPI from the backend, this will populate xc->hw_ipi */
>   if (xive_ops->get_ipi(cpu, xc))
>   return -EIO;
> @@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigned int cpu, 
> struct xive_cpu *xc)
>   if (xc->hw_ipi == XIVE_BAD_IRQ)
>   return;
>  
> + /* TODO: clear IPI mapping */
> +
>   /* Mask the IPI */
>   xive_do_source_set_mask(&xc->ipi_data, true);
>  
> @@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void)
>   smp_ops->cause_ipi = xive_cause_ipi;
>  
>   /* Register the IPI */
> - xive_request_ipi();
> + xive_init_ipis();
>  
>   /* Allocate and setup IPI for the boot CPU */
>   xive_setup_cpu_ipi(smp_processor_id());
> 

Tested-by: Laurent Vivier 
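
The key idiom in the patch above is the atomic "first caller wins" guard in
xive_request_ipi(): every CPU of a node bumps the counter, but only the
first one actually requests the interrupt. Below is a minimal user-space
sketch of the same pattern, using C11 atomics and hypothetical names (the
kernel code above uses atomic_inc_return() and request_irq() instead):

    #include <stdatomic.h>
    #include <stdio.h>

    struct ipi_desc {
            int irq;
            atomic_int started;     /* 0 until the first CPU of the node arrives */
    };

    /* Hypothetical stand-in for request_irq(); returns 0 on success. */
    static int request_node_irq(struct ipi_desc *xid)
    {
            printf("requesting IRQ %d\n", xid->irq);
            return 0;
    }

    /* Called once per CPU; only the first CPU of the node does the request. */
    static int setup_cpu(struct ipi_desc *xid)
    {
            if (atomic_fetch_add(&xid->started, 1) > 0)
                    return 0;       /* another CPU of this node got here first */
            return request_node_irq(xid);
    }

    int main(void)
    {
            struct ipi_desc node0 = { .irq = 17 };

            setup_cpu(&node0);      /* first CPU: requests the IRQ */
            setup_cpu(&node0);      /* later CPUs: no-op */
            return 0;
    }

atomic_fetch_add() returns the previous value, so exactly one caller sees 0
and performs the request, no matter how the CPUs race.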



[bug report] soc: fsl: qe: convert QE interrupt controller to platform_device

2021-08-10 Thread Dan Carpenter
Hello Maxim Kochetkov,

The patch be7ecbd240b2: "soc: fsl: qe: convert QE interrupt
controller to platform_device" from Aug 3, 2021, leads to the
following static checker warning:

drivers/soc/fsl/qe/qe_ic.c:438 qe_ic_init()
warn: unsigned 'qe_ic->virq_low' is never less than zero.

drivers/soc/fsl/qe/qe_ic.c
408 static int qe_ic_init(struct platform_device *pdev)
409 {
410 struct device *dev = &pdev->dev;
411 void (*low_handler)(struct irq_desc *desc);
412 void (*high_handler)(struct irq_desc *desc);
413 struct qe_ic *qe_ic;
414 struct resource *res;
415 struct device_node *node = pdev->dev.of_node;
416 
417 res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
418 if (res == NULL) {
419 dev_err(dev, "no memory resource defined\n");
420 return -ENODEV;
421 }
422 
423 qe_ic = devm_kzalloc(dev, sizeof(*qe_ic), GFP_KERNEL);
424 if (qe_ic == NULL)
425 return -ENOMEM;
426 
427 qe_ic->regs = devm_ioremap(dev, res->start, resource_size(res));
428 if (qe_ic->regs == NULL) {
429 dev_err(dev, "failed to ioremap() registers\n");
430 return -ENODEV;
431 }
432 
433 qe_ic->hc_irq = qe_ic_irq_chip;
434 
435 qe_ic->virq_high = platform_get_irq(pdev, 0);
436 qe_ic->virq_low = platform_get_irq(pdev, 1);
437 
--> 438 if (qe_ic->virq_low < 0) {
439 return -ENODEV;
440 }

Unsigned can't be less than zero.  It's weird that it doesn't check
qe_ic->virq_high as well.  Also remove the curly braces to make
checkpatch happy?

441 
442 if (qe_ic->virq_high != qe_ic->virq_low) {
443 low_handler = qe_ic_cascade_low;
444 high_handler = qe_ic_cascade_high;
445 } else {
446 low_handler = qe_ic_cascade_muxed_mpic;
447 high_handler = NULL;
448 }
449 
450 qe_ic->irqhost = irq_domain_add_linear(node, NR_QE_IC_INTS,
451                                        &qe_ic_host_ops, qe_ic);
452 if (qe_ic->irqhost == NULL) {
453 dev_err(dev, "failed to add irq domain\n");
454 return -ENODEV;
455 }
456 
457 qe_ic_write(qe_ic->regs, QEIC_CICR, 0);
458 
459 irq_set_handler_data(qe_ic->virq_low, qe_ic);
460 irq_set_chained_handler(qe_ic->virq_low, low_handler);
461 
462 if (qe_ic->virq_high && qe_ic->virq_high != qe_ic->virq_low) {
463 irq_set_handler_data(qe_ic->virq_high, qe_ic);
464 irq_set_chained_handler(qe_ic->virq_high, high_handler);
465 }
466 return 0;
467 }
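
For reference, the usual way to silence the "unsigned is never less than
zero" warning is to capture platform_get_irq()'s int return value in a
signed local before assigning it to the unsigned field. A sketch of that
shape (not Maxim's actual fix), replacing lines 435-440 above and covering
virq_high as well:

    int ret;

    ret = platform_get_irq(pdev, 0);
    if (ret < 0)
            return ret;
    qe_ic->virq_high = ret;

    ret = platform_get_irq(pdev, 1);
    if (ret < 0)
            return ret;
    qe_ic->virq_low = ret;

platform_get_irq() returns a negative errno on failure, so this also
propagates the real error code instead of the blanket -ENODEV.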

regards,
dan carpenter


Re: [PATCH v2 3/4] powerpc: Optimize register usage for dear register

2021-08-10 Thread Xiongwei Song
On Sat, Aug 7, 2021 at 2:58 PM Christophe Leroy
 wrote:
>
>
>
> Le 07/08/2021 à 03:02, sxwj...@me.com a écrit :
> > From: Xiongwei Song 
> >
> > Create an anonymous union for the dar and dear registers, so we can
> > reference dear to get the effective address when CONFIG_4xx=y or
> > CONFIG_BOOKE=y, and dar otherwise. This makes the code clearer.
> >
> > Signed-off-by: Xiongwei Song 
> > ---
> >   arch/powerpc/include/asm/ptrace.h   | 5 -
> >   arch/powerpc/kernel/process.c   | 2 +-
> >   arch/powerpc/kernel/ptrace/ptrace.c | 2 ++
> >   3 files changed, 7 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/ptrace.h 
> > b/arch/powerpc/include/asm/ptrace.h
> > index c252d04b1206..fa725e3238c2 100644
> > --- a/arch/powerpc/include/asm/ptrace.h
> > +++ b/arch/powerpc/include/asm/ptrace.h
> > @@ -43,7 +43,10 @@ struct pt_regs
> >   unsigned long mq;
> >   #endif
> >   unsigned long trap;
> > - unsigned long dar;
> > + union {
> > + unsigned long dar;
> > + unsigned long dear;
> > + };
> >   union {
> >   unsigned long dsisr;
> >   unsigned long esr;
> > diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> > index f74af8f9133c..50436b52c213 100644
> > --- a/arch/powerpc/kernel/process.c
> > +++ b/arch/powerpc/kernel/process.c
> > @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs)
> >   trap == INTERRUPT_DATA_STORAGE ||
> >   trap == INTERRUPT_ALIGNMENT) {
> >   if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE))
> > - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, 
> > regs->esr);
> > + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, 
> > regs->esr);
> >   else
> >   pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, 
> > regs->dsisr);
> >   }
> > diff --git a/arch/powerpc/kernel/ptrace/ptrace.c 
> > b/arch/powerpc/kernel/ptrace/ptrace.c
> > index a222fd4d6334..7c7093c17c45 100644
> > --- a/arch/powerpc/kernel/ptrace/ptrace.c
> > +++ b/arch/powerpc/kernel/ptrace/ptrace.c
> > @@ -373,6 +373,8 @@ void __init pt_regs_check(void)
> >offsetof(struct user_pt_regs, trap));
> >   BUILD_BUG_ON(offsetof(struct pt_regs, dar) !=
> >offsetof(struct user_pt_regs, dar));
> > + BUILD_BUG_ON(offsetof(struct pt_regs, dear) !=
> > +  offsetof(struct user_pt_regs, dar));
>
> dar and dear are the same, so checking the same thing a second time is 
> pointless.

Same reply as for patch 1.

Regards,
Xiongwei

>
> >   BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) !=
> >offsetof(struct user_pt_regs, dsisr));
> >   BUILD_BUG_ON(offsetof(struct pt_regs, esr) !=
> >
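
The effect of the anonymous union is that regs->dar and regs->dear name the
same storage, so either spelling can be used depending on which platform's
register the code is conceptually reading. A tiny standalone illustration of
the aliasing (simplified struct, not the real pt_regs):

    #include <assert.h>

    struct regs {
            unsigned long trap;
            union {
                    unsigned long dar;      /* classic name */
                    unsigned long dear;     /* 4xx/Book-E name */
            };
    };

    int main(void)
    {
            struct regs r;

            r.dear = 0xdeadbeef;            /* write through one name ... */
            assert(r.dar == 0xdeadbeef);    /* ... read through the other */
            return 0;
    }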


Re: [PATCH v2 1/4] powerpc: Optimize register usage for esr register

2021-08-10 Thread Xiongwei Song
On Sat, Aug 7, 2021 at 2:57 PM Christophe Leroy
 wrote:
>
>
>
> Le 07/08/2021 à 03:02, sxwj...@me.com a écrit :
> > From: Xiongwei Song 
> >
> > Create an anonymous union for the dsisr and esr registers, so we can
> > reference esr to get the exception detail when CONFIG_4xx=y or
> > CONFIG_BOOKE=y, and dsisr otherwise. This makes the code clearer.
> >
> > Signed-off-by: Xiongwei Song 
> > ---
> >   arch/powerpc/include/asm/ptrace.h  | 5 -
> >   arch/powerpc/kernel/process.c  | 2 +-
> >   arch/powerpc/kernel/ptrace/ptrace.c| 2 ++
> >   arch/powerpc/kernel/traps.c| 2 +-
> >   arch/powerpc/platforms/44x/machine_check.c | 4 ++--
> >   arch/powerpc/platforms/4xx/machine_check.c | 2 +-
> >   6 files changed, 11 insertions(+), 6 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/ptrace.h 
> > b/arch/powerpc/include/asm/ptrace.h
> > index 3e5d470a6155..c252d04b1206 100644
> > --- a/arch/powerpc/include/asm/ptrace.h
> > +++ b/arch/powerpc/include/asm/ptrace.h
> > @@ -44,7 +44,10 @@ struct pt_regs
> >   #endif
> >   unsigned long trap;
> >   unsigned long dar;
> > - unsigned long dsisr;
> > + union {
> > + unsigned long dsisr;
> > + unsigned long esr;
> > + };
> >   unsigned long result;
> >   };
> >   };
> > diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> > index 185beb290580..f74af8f9133c 100644
> > --- a/arch/powerpc/kernel/process.c
> > +++ b/arch/powerpc/kernel/process.c
> > @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs)
> >   trap == INTERRUPT_DATA_STORAGE ||
> >   trap == INTERRUPT_ALIGNMENT) {
> >   if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE))
> > - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, 
> > regs->dsisr);
> > + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, 
> > regs->esr);
> >   else
> >   pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, 
> > regs->dsisr);
> >   }
> > diff --git a/arch/powerpc/kernel/ptrace/ptrace.c 
> > b/arch/powerpc/kernel/ptrace/ptrace.c
> > index 0a0a33eb0d28..a222fd4d6334 100644
> > --- a/arch/powerpc/kernel/ptrace/ptrace.c
> > +++ b/arch/powerpc/kernel/ptrace/ptrace.c
> > @@ -375,6 +375,8 @@ void __init pt_regs_check(void)
> >offsetof(struct user_pt_regs, dar));
> >   BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) !=
> >offsetof(struct user_pt_regs, dsisr));
> > + BUILD_BUG_ON(offsetof(struct pt_regs, esr) !=
> > +  offsetof(struct user_pt_regs, dsisr));
>
> esr and dsisr are the same, so checking the same thing a second time is 
> pointless.

Hmm... I think it's better to check whether the changes to the pt_regs
structure are as expected.

Regards,
Xiongwei

>
> >   BUILD_BUG_ON(offsetof(struct pt_regs, result) !=
> >offsetof(struct user_pt_regs, result));
> >
> > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> > index dfbce527c98e..2164f5705a0b 100644
> > --- a/arch/powerpc/kernel/traps.c
> > +++ b/arch/powerpc/kernel/traps.c
> > @@ -562,7 +562,7 @@ static inline int check_io_access(struct pt_regs *regs)
> >   #ifdef CONFIG_PPC_ADV_DEBUG_REGS
> >   /* On 4xx, the reason for the machine check or program exception
> >  is in the ESR. */
> > -#define get_reason(regs) ((regs)->dsisr)
> > +#define get_reason(regs) ((regs)->esr)
> >   #define REASON_FP   ESR_FP
> >   #define REASON_ILLEGAL  (ESR_PIL | ESR_PUO)
> >   #define REASON_PRIVILEGED   ESR_PPR
> > diff --git a/arch/powerpc/platforms/44x/machine_check.c 
> > b/arch/powerpc/platforms/44x/machine_check.c
> > index a5c898bb9bab..5d19daacd78a 100644
> > --- a/arch/powerpc/platforms/44x/machine_check.c
> > +++ b/arch/powerpc/platforms/44x/machine_check.c
> > @@ -11,7 +11,7 @@
> >
> >   int machine_check_440A(struct pt_regs *regs)
> >   {
> > - unsigned long reason = regs->dsisr;
> > + unsigned long reason = regs->esr;
> >
> >   printk("Machine check in kernel mode.\n");
> >   if (reason & ESR_IMCP){
> > @@ -48,7 +48,7 @@ int machine_check_440A(struct pt_regs *regs)
> >   #ifdef CONFIG_PPC_47x
> >   int machine_check_47x(struct pt_regs *regs)
> >   {
> > - unsigned long reason = regs->dsisr;
> > + unsigned long reason = regs->esr;
> >   u32 mcsr;
> >
> >   printk(KERN_ERR "Machine check in kernel mode.\n");
> > diff --git a/arch/powerpc/platforms/4xx/machine_check.c 
> > b/arch/powerpc/platforms/4xx/machine_check.c
> > index a71c29892a91..a905da1d6f41 100644
> > --- a/arch/powerpc/platforms/4xx/machine_check.c
> > +++ b/arch/powerpc/platforms/4xx/machine_check.c
> > @@ -10,7 +10,7 @@
> >
> >   int machine_check_4xx(struct pt_regs *regs)
> >   {
> > - unsigned long 
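
On the BUILD_BUG_ON question above: because dsisr and esr sit in the same
union, both offsetof() checks compile down to the same comparison today,
which is Christophe's point; Xiongwei's counter-argument is that the second
check documents and enforces the intent that esr stays ABI-compatible with
user_pt_regs::dsisr even if the layout later changes. A standalone sketch of
the idiom using C11 static_assert (the kernel uses BUILD_BUG_ON instead):

    #include <assert.h>
    #include <stddef.h>

    struct user_regs { unsigned long dar; unsigned long dsisr; };

    struct kernel_regs {
            unsigned long dar;
            union {
                    unsigned long dsisr;
                    unsigned long esr;
            };
    };

    /* Both checks are identical today, but the second one keeps firing
     * if someone later moves esr out of the union. */
    static_assert(offsetof(struct kernel_regs, dsisr) ==
                  offsetof(struct user_regs, dsisr), "dsisr offset mismatch");
    static_assert(offsetof(struct kernel_regs, esr) ==
                  offsetof(struct user_regs, dsisr), "esr offset mismatch");

    int main(void) { return 0; }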

Re: [PATCH v4 0/5] nvmem: nintendo-otp: Add new driver for the Wii and Wii U OTP

2021-08-10 Thread Srinivas Kandagatla




On 01/08/2021 08:38, Emmanuel Gil Peyrot wrote:

The OTP is a read-only memory area which contains various keys and
signatures used to decrypt, encrypt or verify various pieces of storage.

Its size depends on the console, it is 128 bytes on the Wii and
1024 bytes on the Wii U (split into eight 128 bytes banks).

It can be used directly by writing into one register and reading from
the other one, without any additional synchronisation.

This series has been tested both on the Wii U (using my downstream
master-wiiu branch[1]) and on the Wii on mainline.

[1] https://gitlab.com/linkmauve/linux-wiiu/-/commits/master-wiiu

Changes since v1:
- Fixed the commit messages so they can be accepted by other email
   servers, sorry about that.

Changes since v2:
- Switched the dt binding documentation to YAML.
- Used more obvious register arithmetic, and tested that gcc (at -O1 and
   above) outputs the exact same rlwinm instructions for them.
- Use more #defines to make the code easier to read.
- Include some links to the reversed documentation.
- Avoid overlapping dt regions by changing the existing control@d800100
   node to end before the OTP registers, with some bigger dt refactoring
   left for a future series.

Changes since v3:
- Relicense the dt-binding documentation under GPLv2-only or
   BSD-2-clauses.

Emmanuel Gil Peyrot (5):
   nvmem: nintendo-otp: Add new driver for the Wii and Wii U OTP
   dt-bindings: nintendo-otp: Document the Wii and Wii U OTP support



Applied 1/5 and 2/5 to nvmem next,
rest of the patches should go via powerpc dts tree.

thanks,
--srini

   powerpc: wii.dts: Reduce the size of the control area
   powerpc: wii.dts: Expose the OTP on this platform
   powerpc: wii_defconfig: Enable OTP by default

  .../bindings/nvmem/nintendo-otp.yaml  |  44 +++
  arch/powerpc/boot/dts/wii.dts |  13 +-
  arch/powerpc/configs/wii_defconfig|   1 +
  drivers/nvmem/Kconfig |  11 ++
  drivers/nvmem/Makefile|   2 +
  drivers/nvmem/nintendo-otp.c  | 124 ++
  6 files changed, 194 insertions(+), 1 deletion(-)
  create mode 100644 Documentation/devicetree/bindings/nvmem/nintendo-otp.yaml
  create mode 100644 drivers/nvmem/nintendo-otp.c



Re: [PATCH v2 1/2] sched/topology: Skip updating masks for non-online nodes

2021-08-10 Thread Srikar Dronamraju
* Valentin Schneider  [2021-08-09 13:52:38]:

> On 09/08/21 12:22, Srikar Dronamraju wrote:
> > * Valentin Schneider  [2021-08-08 16:56:47]:
> >> Wait, doesn't the distance matrix (without any offline node) say
> >>
> >>   distance(0, 3) == 40
> >>
> >> ? We should have at the very least:
> >>
> >>   node   0   1   2   3
> >> 0:  10  20  ??  40
> >> 1:  20  20  ??  40
> >> 2:  ??  ??  ??  ??
> >> 3:  40  40  ??  10
> >>
> >
> > Before onlining node 3 and CPU 3 (node/CPU 0 and 1 are already online)
> > Note: Node 2-7 and CPU 2-7 are still offline.
> >
> > node   0   1   2   3
> >   0:  10  20  40  10
> >   1:  20  20  40  10
> >   2:  40  40  10  10
> >   3:  10  10  10  10
> >
> > NODE->mask(0) == 0
> > NODE->mask(1) == 1
> > NODE->mask(2) == 0
> > NODE->mask(3) == 0
> >
> > Note: This is with updating Node 2's distance as 40 for figuring out
> > the number of numa levels. Since we have all possible distances, we
> > don't update Node 3's distance, so it will be as if it's local to node 0.
> >
> > Now when Node 3 and CPU 3 are onlined
> > Note: Node 2, 3-7 and CPU 2, 3-7 are still offline.
> >
> > node   0   1   2   3
> >   0:  10  20  40  40
> >   1:  20  20  40  40
> >   2:  40  40  10  40
> >   3:  40  40  40  10
> >
> > NODE->mask(0) == 0
> > NODE->mask(1) == 1
> > NODE->mask(2) == 0
> > NODE->mask(3) == 0,3
> >
> > CPU 0 continues to be part of Node->mask(3) because when we online and
> > we find the right distance, there is no API to reset the numa mask of
> > 3 to remove CPU 0 from the numa masks.
> >
> > If we had an API to clear/set sched_domains_numa_masks[node][] when
> > the node state changes, we could probably plug-in to clear/set the
> > node masks whenever node state changes.
> >
> 
> Gotcha, this is now coming back to me...
> 
> [...]
> 
> >> Ok, so it looks like we really can't do without that part - even if we get
> >> "sensible" distance values for the online nodes, we can't divine values for
> >> the offline ones.
> >>
> >
> > Yes
> >
> 
> Argh, while your approach does take care of the masks, it leaves
> sched_numa_topology_type unchanged. You *can* force an update of it, but
> yuck :(
> 
> I got to the below...
> 

Yes, I completely missed that we should update sched_numa_topology_type.


> ---
> From: Srikar Dronamraju 
> Date: Thu, 1 Jul 2021 09:45:51 +0530
> Subject: [PATCH 1/1] sched/topology: Skip updating masks for non-online nodes
> 
> The scheduler currently expects NUMA node distances to be stable from init
> onwards, and as a consequence builds the related data structures
> once-and-for-all at init (see sched_init_numa()).
> 
> Unfortunately, on some architectures node distance is unreliable for
> offline nodes and may very well change upon onlining.
> 
> Skip over offline nodes during sched_init_numa(). Track nodes that have
> been onlined at least once, and trigger a build of a node's NUMA masks when
> it is first onlined post-init.
> 

Your version is much much better than mine.
And I have verified that it works as expected.


> Reported-by: Geetika Moolchandani 
> Signed-off-by: Srikar Dronamraju 
> Signed-off-by: Valentin Schneider 
> ---
>  kernel/sched/topology.c | 65 +
>  1 file changed, 65 insertions(+)
> 
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index b77ad49dc14f..cba95793a9b7 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1482,6 +1482,8 @@ int sched_max_numa_distance;
>  static int   *sched_domains_numa_distance;
>  static struct cpumask ***sched_domains_numa_masks;
>  int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
> +
> +static unsigned long __read_mostly *sched_numa_onlined_nodes;
>  #endif
> 
>  /*
> @@ -1833,6 +1835,16 @@ void sched_init_numa(void)
>   sched_domains_numa_masks[i][j] = mask;
> 
>   for_each_node(k) {
> + /*
> +  * Distance information can be unreliable for
> +  * offline nodes, defer building the node
> +  * masks to its bringup.
> +  * This relies on all unique distance values
> +  * still being visible at init time.
> +  */
> + if (!node_online(j))
> + continue;
> +
>   if (sched_debug() && (node_distance(j, k) != 
> node_distance(k, j)))
>   sched_numa_warn("Node-distance not 
> symmetric");
> 
> @@ -1886,6 +1898,53 @@ void sched_init_numa(void)
>   sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
> 
>   init_numa_topology_type();
> +
> + sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
> + if (!sched_numa_onlined_nodes)
> + 
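
The quoted patch is truncated here; per the description above, the remainder
allocates the sched_numa_onlined_nodes bitmap and builds a node's NUMA masks
the first time it comes online after init. A generic sketch of that "build
on first onlining" pattern, with hypothetical names rather than the actual
patch body:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_NODES 8

    static bool onlined_once[MAX_NODES];    /* stand-in for the bitmap */

    /* Hypothetical: per-node setup that must run once, post-init. */
    static void build_node_masks(int node)
    {
            printf("building NUMA masks for node %d\n", node);
    }

    /* Called whenever a CPU of @node comes online. */
    static void node_online_event(int node)
    {
            if (onlined_once[node])
                    return;                 /* masks already built */
            onlined_once[node] = true;
            build_node_masks(node);
    }

    int main(void)
    {
            node_online_event(3);           /* first onlining: builds masks */
            node_online_event(3);           /* subsequent: no-op */
            return 0;
    }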

Re: [PATCH v2] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs

2021-08-10 Thread Srikar Dronamraju
* Cédric Le Goater  [2021-08-07 09:20:57]:

> On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at
> runtime. Today, the IPI is not created for such nodes, and hot-plugged
> CPUs use a bogus IPI, which leads to soft lockups.
> 
> We can not directly allocate and request the IPI on demand because
> bringup_cpu() is called under the IRQ sparse lock. The alternative is
> to allocate the IPIs for all possible nodes at startup and to request
> the mapping on demand when the first CPU of a node is brought up.
> 

Thank you, this version too works for me.

Tested-by: Srikar Dronamraju 


> Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node")
> Cc: sta...@vger.kernel.org # v5.13
> Reported-by: Geetika Moolchandani 
> Cc: Srikar Dronamraju 
> Cc: Laurent Vivier 
> Signed-off-by: Cédric Le Goater 
> Message-Id: <20210629131542.743888-1-...@kaod.org>
> Signed-off-by: Cédric Le Goater 
> ---
>  arch/powerpc/sysdev/xive/common.c | 35 +--
>  1 file changed, 24 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/sysdev/xive/common.c 
> b/arch/powerpc/sysdev/xive/common.c
> index dbdbbc2f1dc5..943fd30095af 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domain;
>  static struct xive_ipi_desc {
>   unsigned int irq;
>   char name[16];
> + atomic_t started;
>  } *xive_ipis;
>  
>  /*
> @@ -1120,7 +1121,7 @@ static const struct irq_domain_ops 
> xive_ipi_irq_domain_ops = {
>   .alloc  = xive_ipi_irq_domain_alloc,
>  };
>  
> -static int __init xive_request_ipi(void)
> +static int __init xive_init_ipis(void)
>  {
>   struct fwnode_handle *fwnode;
>   struct irq_domain *ipi_domain;
> @@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void)
>   struct xive_ipi_desc *xid = &xive_ipis[node];
>   struct xive_ipi_alloc_info info = { node };
>  
> - /* Skip nodes without CPUs */
> - if (cpumask_empty(cpumask_of_node(node)))
> - continue;
> -
>   /*
>* Map one IPI interrupt per node for all cpus of that node.
>* Since the HW interrupt number doesn't have any meaning,
> @@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void)
>   xid->irq = ret;
>  
>   snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
> -
> - ret = request_irq(xid->irq, xive_muxed_ipi_action,
> -   IRQF_PERCPU | IRQF_NO_THREAD, xid->name, 
> NULL);
> -
> - WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
>   }
>  
>   return ret;
> @@ -1178,6 +1170,22 @@ static int __init xive_request_ipi(void)
>   return ret;
>  }
>  
> +static int __init xive_request_ipi(unsigned int cpu)
> +{
> + struct xive_ipi_desc *xid = &xive_ipis[early_cpu_to_node(cpu)];
> + int ret;
> +
> + if (atomic_inc_return(&xid->started) > 1)
> + return 0;
> +
> + ret = request_irq(xid->irq, xive_muxed_ipi_action,
> +   IRQF_PERCPU | IRQF_NO_THREAD,
> +   xid->name, NULL);
> +
> + WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
> + return ret;
> +}
> +
>  static int xive_setup_cpu_ipi(unsigned int cpu)
>  {
>   unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
> @@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned int cpu)
>   if (xc->hw_ipi != XIVE_BAD_IRQ)
>   return 0;
>  
> + /* Register the IPI */
> + xive_request_ipi(cpu);
> +
>   /* Grab an IPI from the backend, this will populate xc->hw_ipi */
>   if (xive_ops->get_ipi(cpu, xc))
>   return -EIO;
> @@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigned int cpu, 
> struct xive_cpu *xc)
>   if (xc->hw_ipi == XIVE_BAD_IRQ)
>   return;
>  
> + /* TODO: clear IPI mapping */
> +
>   /* Mask the IPI */
>   xive_do_source_set_mask(&xc->ipi_data, true);
>  
> @@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void)
>   smp_ops->cause_ipi = xive_cause_ipi;
>  
>   /* Register the IPI */
> - xive_request_ipi();
> + xive_init_ipis();
>  
>   /* Allocate and setup IPI for the boot CPU */
>   xive_setup_cpu_ipi(smp_processor_id());
> -- 
> 2.31.1
> 

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v2] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs

2021-08-10 Thread Cédric Le Goater
On 8/7/21 9:20 AM, Cédric Le Goater wrote:
> On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at
> runtime. Today, the IPI is not created for such nodes, and hot-plugged
> CPUs use a bogus IPI, which leads to soft lockups.
> 
> We can not directly allocate and request the IPI on demand because
> bringup_cpu() is called under the IRQ sparse lock. The alternative is
> to allocate the IPIs for all possible nodes at startup and to request
> the mapping on demand when the first CPU of a node is brought up.
> 
> Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node")
> Cc: sta...@vger.kernel.org # v5.13
> Reported-by: Geetika Moolchandani 
> Cc: Srikar Dronamraju 
> Cc: Laurent Vivier 
> Signed-off-by: Cédric Le Goater 
> Message-Id: <20210629131542.743888-1-...@kaod.org>
> Signed-off-by: Cédric Le Goater 
> ---
>  arch/powerpc/sysdev/xive/common.c | 35 +--
>  1 file changed, 24 insertions(+), 11 deletions(-)

I forgot to add that this version does break irqbalance anymore, since
Linux is not mapping interrupts of CPU-less nodes.

Anyhow, irqbalance is now fixed : 

  
https://github.com/Irqbalance/irqbalance/commit/a7f81483a95a94d6a62ca7fb999a090e01c0de9b

So v1 (plus irqbalance patch above) or v2 are safe to use. I do prefer 
v2.

Thanks to Laurent and Srikar for the extra tests,

C.

> diff --git a/arch/powerpc/sysdev/xive/common.c 
> b/arch/powerpc/sysdev/xive/common.c
> index dbdbbc2f1dc5..943fd30095af 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domain;
>  static struct xive_ipi_desc {
>   unsigned int irq;
>   char name[16];
> + atomic_t started;
>  } *xive_ipis;
>  
>  /*
> @@ -1120,7 +1121,7 @@ static const struct irq_domain_ops 
> xive_ipi_irq_domain_ops = {
>   .alloc  = xive_ipi_irq_domain_alloc,
>  };
>  
> -static int __init xive_request_ipi(void)
> +static int __init xive_init_ipis(void)
>  {
>   struct fwnode_handle *fwnode;
>   struct irq_domain *ipi_domain;
> @@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void)
>   struct xive_ipi_desc *xid = &xive_ipis[node];
>   struct xive_ipi_alloc_info info = { node };
>  
> - /* Skip nodes without CPUs */
> - if (cpumask_empty(cpumask_of_node(node)))
> - continue;
> -
>   /*
>* Map one IPI interrupt per node for all cpus of that node.
>* Since the HW interrupt number doesn't have any meaning,
> @@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void)
>   xid->irq = ret;
>  
>   snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
> -
> - ret = request_irq(xid->irq, xive_muxed_ipi_action,
> -   IRQF_PERCPU | IRQF_NO_THREAD, xid->name, 
> NULL);
> -
> - WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
>   }
>  
>   return ret;
> @@ -1178,6 +1170,22 @@ static int __init xive_request_ipi(void)
>   return ret;
>  }
>  
> +static int __init xive_request_ipi(unsigned int cpu)
> +{
> + struct xive_ipi_desc *xid = &xive_ipis[early_cpu_to_node(cpu)];
> + int ret;
> +
> + if (atomic_inc_return(&xid->started) > 1)
> + return 0;
> +
> + ret = request_irq(xid->irq, xive_muxed_ipi_action,
> +   IRQF_PERCPU | IRQF_NO_THREAD,
> +   xid->name, NULL);
> +
> + WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
> + return ret;
> +}
> +
>  static int xive_setup_cpu_ipi(unsigned int cpu)
>  {
>   unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
> @@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned int cpu)
>   if (xc->hw_ipi != XIVE_BAD_IRQ)
>   return 0;
>  
> + /* Register the IPI */
> + xive_request_ipi(cpu);
> +
>   /* Grab an IPI from the backend, this will populate xc->hw_ipi */
>   if (xive_ops->get_ipi(cpu, xc))
>   return -EIO;
> @@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigned int cpu, 
> struct xive_cpu *xc)
>   if (xc->hw_ipi == XIVE_BAD_IRQ)
>   return;
>  
> + /* TODO: clear IPI mapping */
> +
>   /* Mask the IPI */
>   xive_do_source_set_mask(&xc->ipi_data, true);
>  
> @@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void)
>   smp_ops->cause_ipi = xive_cause_ipi;
>  
>   /* Register the IPI */
> - xive_request_ipi();
> + xive_init_ipis();
>  
>   /* Allocate and setup IPI for the boot CPU */
>   xive_setup_cpu_ipi(smp_processor_id());
> 



[PATCH v4 5/6] sched/fair: Carve out logic to mark a group for asymmetric packing

2021-08-10 Thread Ricardo Neri
Create a separate function, sched_asym(). A subsequent changeset will
introduce logic to deal with SMT in conjunction with asymmetric
packing. Such logic will need the statistics of the scheduling
group provided as argument. Update them before calling sched_asym().

Cc: Aubrey Li 
Cc: Ben Segall 
Cc: Daniel Bristot de Oliveira 
Cc: Dietmar Eggemann 
Cc: Mel Gorman 
Cc: Quentin Perret 
Cc: Rafael J. Wysocki 
Cc: Srinivas Pandruvada 
Cc: Steven Rostedt 
Cc: Tim Chen 
Reviewed-by: Joel Fernandes (Google) 
Reviewed-by: Len Brown 
Co-developed-by: Peter Zijlstra (Intel) 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ricardo Neri 
---
Changes since v3:
  * Remove a redundant check for the local group in sched_asym().
(Dietmar)
  * Reworded commit message for clarity. (Len)

Changes since v2:
  * Introduced this patch.

Changes since v1:
  * N/A
---
 kernel/sched/fair.c | 20 +---
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ae3d2bc59d8d..dd411cefb63f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8531,6 +8531,13 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
 }
 
+static inline bool
+sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats 
*sgs,
+  struct sched_group *group)
+{
+   return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -8591,18 +8598,17 @@ static inline void update_sg_lb_stats(struct lb_env 
*env,
}
}
 
+   sgs->group_capacity = group->sgc->capacity;
+
+   sgs->group_weight = group->group_weight;
+
/* Check if dst CPU is idle and preferred to this group */
if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
-   env->idle != CPU_NOT_IDLE &&
-   sgs->sum_h_nr_running &&
-   sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
+   env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+   sched_asym(env, sds, sgs, group)) {
sgs->group_asym_packing = 1;
}
 
-   sgs->group_capacity = group->sgc->capacity;
-
-   sgs->group_weight = group->group_weight;
-
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
 
/* Computing avg_load makes sense only when group is overloaded */
-- 
2.17.1



[PATCH v4 1/6] x86/sched: Decrease further the priorities of SMT siblings

2021-08-10 Thread Ricardo Neri
When scheduling, it is better to prefer a separate physical core rather
than the SMT sibling of a high priority core. The existing formula to
compute priorities takes such fact in consideration. There may exist,
however, combinations of priorities (i.e., maximum frequencies) in which
the priority of high-numbered SMT siblings of high-priority cores collides
with the priority of low-numbered SMT siblings of low-priority cores.

Consider for instance an SMT2 system with CPUs [0, 1] with priority 60 and
[2, 3] with priority 30 (CPUs in brackets are SMT siblings). In such a case,
the resulting priorities would be [120, 60], [60, 30]. Thus, to ensure
that CPU2 has higher priority than CPU1, divide the raw priority by the
squared SMT iterator. The resulting priorities are [120, 30] and [60, 15].
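
A small standalone calculation reproducing the example above (priorities 60
and 30, SMT2), comparing the old divisor i with the new i * i:

    #include <stdio.h>

    int main(void)
    {
            int prios[] = { 60, 30 };       /* per-core max-frequency priorities */
            int smp_num_siblings = 2;       /* SMT2 */

            for (int c = 0; c < 2; c++) {
                    for (int i = 1; i <= smp_num_siblings; i++) {
                            int old_prio = prios[c] * smp_num_siblings / i;
                            int new_prio = prios[c] * smp_num_siblings / (i * i);

                            printf("core prio %d, sibling %d: old %d, new %d\n",
                                   prios[c], i, old_prio, new_prio);
                    }
            }
            return 0;
    }

With the squared divisor, the second sibling of the priority-60 core drops
to 30, below the 60 of the first CPU of the priority-30 core, which is
exactly the ordering the patch wants.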

Cc: Aubrey Li 
Cc: Ben Segall 
Cc: Daniel Bristot de Oliveira 
Cc: Dietmar Eggemann 
Cc: Joel Fernandes (Google) 
Cc: Mel Gorman 
Cc: Quentin Perret 
Cc: Rafael J. Wysocki 
Cc: Srinivas Pandruvada 
Cc: Steven Rostedt 
Cc: Tim Chen 
Originally-by: Len Brown 
Signed-off-by: Len Brown 
Signed-off-by: Ricardo Neri 
---
Changes since v3:
  * Introduced this patch

Changes since v2:
  * N/A

Changes since v1:
  * N/A
---
 arch/x86/kernel/itmt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 1afbdd1dd777..9ff480e94511 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -198,7 +198,7 @@ void sched_set_itmt_core_prio(int prio, int core_cpu)
 * of the priority chain and only used when
 * all other high priority cpus are out of capacity.
 */
-   smt_prio = prio * smp_num_siblings / i;
+   smt_prio = prio * smp_num_siblings / (i * i);
per_cpu(sched_core_priority, cpu) = smt_prio;
i++;
}
-- 
2.17.1



[PATCH v4 0/6] sched/fair: Fix load balancing of SMT siblings with ASYM_PACKING

2021-08-10 Thread Ricardo Neri
This is v4 of the series. v1, v2, and v3 patches and test results can be found
in [1], [2], and [3], respectively.

=== Problem statement ===
++ Load balancing ++

When using asymmetric packing, there exists CPU topologies with three
priority levels in which only a subset of the physical cores support SMT.
An instance of such topology is Intel Alderlake, a hybrid processor with
a mixture of Intel Core (with support for SMT) and Intel Atom CPUs.

On Alderlake, it is almost always beneficial to spread work by picking
first the Core CPUs, then the Atoms and at last the SMT siblings.

The current load balancer, however, does not behave as described when using
ASYM_PACKING. Instead, the load balancer will choose higher-priority CPUs 
(an Intel Core) over medium-priority CPUs (an Intel Atom), and subsequently
overflow the load to a low priority SMT sibling CPU. This leaves medium-
priority CPUs idle while low-priority CPUs are busy.

This patchset fixes this behavior by also checking the idle state of the
SMT siblings of both the CPU doing the load balance and the busiest
candidate group when deciding whether the destination CPUs can pull tasks
from the busiest CPU.

++ Rework ASYM_PACKING priorities with ITMT
We also reworked the priority computation of the SMT siblings to ensure
that higher-numbered SMT siblings are always low priority. The current
computation may lead to situations in which in some processors those
higher-numbered SMT siblings have the same priority as the Intel Atom
CPUs.

=== Testing ===
I ran a few benchmarks with and without this version of the patchset on
an Intel Alderlake system with 8 Intel Core (with SMT) and 8 Intel
Atom CPUs.

The baseline for the results is an unmodified v5.13-rc4 kernel. Results
show a comparative percentage of improvement (positive) or degradation
(negative). Each test case is repeated eight times, and the standard
deviation among repetitions is also documented.

Table 1 shows the results when using hardware-controlled performance
states (HWP), a common use case. The impact of the patches is overall
positive, with a few test cases showing slight degradation. hackbench is
especially difficult to assess as it shows a high degree of variability.

Thanks and BR,
Ricardo

ITMT: Intel Turbo Boost Max Technology 3.0


Changes since v3:
  * Reworked the ITMT priority computation to further reduce the priority
of SMT siblings (patch 1).
  * Clear scheduling group flags when a child scheduling level
degenerates (patch 2).
  * Removed arch-specific hooks (patch 6, PeterZ)
  * Removed redundant checks for the local group. (patch 5, Dietmar)
  * Removed redundant check for local idle CPUs and local group
utilization. (patch 6, Joel)
  * Reworded commit messages of patches 2, 3, 5, and 6 for clarity.
(Len, PeterZ)
  * Added Joel's Reviewed-by tag.
  * Unchanged patches: 4

Changes since v2:
  * Removed arch_sched_asym_prefer_early() and re-evaluation of the
candidate busiest group in update_sd_pick_busiest(). (PeterZ)
  * Introduced sched_group::flags to reflect the properties of CPUs
in the scheduling group. This helps to identify scheduling groups
whose CPUs are SMT siblings. (PeterZ)
  * Modified update_sg_lb_stats() to get statistics of the scheduling
domain as an argument. This provides it with the statistics of the
local domain. (PeterZ)
  * Introduced sched_asym() to decide if a busiest candidate group can
be marked for asymmetric packing.
  * Reworded patch 1 for clarity. (PeterZ)

Changes since v1:
  * Don't bailout in update_sd_pick_busiest() if dst_cpu cannot pull
tasks. Instead, reclassify the candidate busiest group, as it
may still be selected. (PeterZ)
  * Avoid an expensive and unnecessary call to cpumask_weight() when
determining if a sched_group is comprised of SMT siblings.
(PeterZ).
  * Updated test results using the v2 patches.


  Table 1. Test results of patches with HWP
===
hackbench
=
caseloadbaseline(std%)  compare%( std%)
process-pipegroup-1  1.00 (  1.45)   -2.99 (  4.35)
process-pipegroup-2  1.00 ( 18.32)   +2.14 (  8.33)
process-pipegroup-4  1.00 ( 17.27)  -10.68 ( 15.85)
process-pipegroup-8  1.00 ( 12.33)   +2.26 ( 13.28)
process-pipegroup-12 1.00 (  6.52)   -4.07 (  7.97)
process-pipegroup-16 1.00 (  9.70)   -7.71 (  6.01)
process-pipegroup-20 1.00 (  2.52)   -4.15 (  6.35)
process-pipegroup-24 1.00 (  4.84)   +1.04 (  4.60)
process-pipegroup-32 1.00 (  4.79)   +1.72 (  5.13)
process-pipegroup-48 1.00 (  6.77)   +4.68 (  4.24)
process-sockets group-1  1.00 (  1.89)   +0.53 (  3.48)
process-sockets group-2  1.00 (  

[PATCH v4 2/6] sched/topology: Introduce sched_group::flags

2021-08-10 Thread Ricardo Neri
There exist situations in which the load balance needs to know the
properties of the CPUs in a scheduling group. When using asymmetric
packing, for instance, the load balancer needs to know not only the
state of dst_cpu but also of its SMT siblings, if any.

Use the flags of the child scheduling domains to initialize scheduling
group flags. This will reflect the properties of the CPUs in the
group.

A subsequent changeset will make use of these new flags. No functional
changes are introduced.
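
The consumer of these flags appears in patch 6 of this series: with the
child domain's flags mirrored into the group, the load balancer can test for
SMT groups directly, e.g.:

    /* From asym_smt_can_pull_tasks() in patch 6: a group is an SMT group
     * when its child-derived flags include SD_SHARE_CPUCAPACITY. */
    local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
    sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;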

Cc: Aubrey Li 
Cc: Ben Segall 
Cc: Daniel Bristot de Oliveira 
Cc: Dietmar Eggemann 
Cc: Mel Gorman 
Cc: Quentin Perret 
Cc: Rafael J. Wysocki 
Cc: Srinivas Pandruvada 
Cc: Steven Rostedt 
Cc: Tim Chen 
Reviewed-by: Joel Fernandes (Google) 
Reviewed-by: Len Brown 
Originally-by: Peter Zijlstra (Intel) 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ricardo Neri 
---
Changes since v3:
  * Clear the flags of the scheduling groups of a domain if its child is
destroyed.
  * Minor rewording of the commit message.

Changes since v2:
  * Introduced this patch.

Changes since v1:
  * N/A
---
 kernel/sched/sched.h|  1 +
 kernel/sched/topology.c | 21 ++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a2c6f6ae6392..fb716e78974d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1799,6 +1799,7 @@ struct sched_group {
unsigned int    group_weight;
struct sched_group_capacity *sgc;
int asym_prefer_cpu;    /* CPU of highest priority in group */
+   int flags;
 
/*
 * The CPUs this group covers.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b77ad49dc14f..27c959de175d 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -716,8 +716,20 @@ cpu_attach_domain(struct sched_domain *sd, struct 
root_domain *rd, int cpu)
tmp = sd;
sd = sd->parent;
destroy_sched_domain(tmp);
-   if (sd)
+   if (sd) {
+   struct sched_group *sg = sd->groups;
+
+   /*
+* sched groups hold the flags of the child sched
+* domain for convenience. Clear such flags since
+* the child is being destroyed.
+*/
+   do {
+   sg->flags = 0;
+   } while (sg != sd->groups);
+
sd->child = NULL;
+   }
}
 
for (tmp = sd; tmp; tmp = tmp->parent)
@@ -916,10 +928,12 @@ build_group_from_child_sched_domain(struct sched_domain 
*sd, int cpu)
return NULL;
 
sg_span = sched_group_span(sg);
-   if (sd->child)
+   if (sd->child) {
cpumask_copy(sg_span, sched_domain_span(sd->child));
-   else
+   sg->flags = sd->child->flags;
+   } else {
cpumask_copy(sg_span, sched_domain_span(sd));
+   }
 
atomic_inc(&sg->ref);
return sg;
@@ -1169,6 +1183,7 @@ static struct sched_group *get_group(int cpu, struct 
sd_data *sdd)
if (child) {
cpumask_copy(sched_group_span(sg), sched_domain_span(child));
cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+   sg->flags = child->flags;
} else {
cpumask_set_cpu(cpu, sched_group_span(sg));
cpumask_set_cpu(cpu, group_balance_mask(sg));
-- 
2.17.1



[PATCH v4 4/6] sched/fair: Provide update_sg_lb_stats() with sched domain statistics

2021-08-10 Thread Ricardo Neri
Before deciding to pull tasks when using asymmetric packing of tasks,
on some architectures (e.g., x86) it is necessary to know not only the
state of dst_cpu but also of its SMT siblings. The decision to classify
a candidate busiest group as group_asym_packing is done in
update_sg_lb_stats(). Give this function access to the scheduling domain
statistics, which contains the statistics of the local group.

Cc: Aubrey Li 
Cc: Ben Segall 
Cc: Daniel Bristot de Oliveira 
Cc: Dietmar Eggemann 
Cc: Mel Gorman 
Cc: Quentin Perret 
Cc: Rafael J. Wysocki 
Cc: Srinivas Pandruvada 
Cc: Steven Rostedt 
Cc: Tim Chen 
Reviewed-by: Joel Fernandes (Google) 
Reviewed-by: Len Brown 
Originally-by: Peter Zijlstra (Intel) 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ricardo Neri 
---
Changes since v3:
  * None

Changes since v2:
  * Introduced this patch.

Changes since v1:
  * N/A
---
 kernel/sched/fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3b686e18a39b..ae3d2bc59d8d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8539,6 +8539,7 @@ group_type group_classify(unsigned int imbalance_pct,
  * @sg_status: Holds flag indicating the status of the sched_group
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
+ struct sd_lb_stats *sds,
  struct sched_group *group,
  struct sg_lb_stats *sgs,
  int *sg_status)
@@ -8547,7 +8548,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
memset(sgs, 0, sizeof(*sgs));
 
-   local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+   local_group = group == sds->local;
 
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
@@ -9110,7 +9111,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, 
struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
 
-   update_sg_lb_stats(env, sg, sgs, &sg_status);
+   update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
 
if (local_group)
goto next_group;
-- 
2.17.1



[PATCH v4 3/6] sched/fair: Optimize checking for group_asym_packing

2021-08-10 Thread Ricardo Neri
sched_asym_prefer() always returns false when called on the local group. By
checking local_group, we can avoid additional checks and invoking
sched_asym_prefer() when it is not needed. No functional changes are
introduced.

Cc: Aubrey Li 
Cc: Ben Segall 
Cc: Daniel Bristot de Oliveira 
Cc: Dietmar Eggemann 
Cc: Mel Gorman 
Cc: Quentin Perret 
Cc: Rafael J. Wysocki 
Cc: Srinivas Pandruvada 
Cc: Steven Rostedt 
Cc: Tim Chen 
Reviewed-by: Joel Fernandes (Google) 
Reviewed-by: Len Brown 
Signed-off-by: Ricardo Neri 
---
Changes since v3:
  * Further rewording of the commit message. (Len)

Changes since v2:
  * Reworded the commit message for clarity. (Peter Z)

Changes since v1:
  * None
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44e44c235f1f..3b686e18a39b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8591,7 +8591,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
}
 
/* Check if dst CPU is idle and preferred to this group */
-   if (env->sd->flags & SD_ASYM_PACKING &&
+   if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
env->idle != CPU_NOT_IDLE &&
sgs->sum_h_nr_running &&
sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
-- 
2.17.1



[PATCH v4 6/6] sched/fair: Consider SMT in ASYM_PACKING load balance

2021-08-10 Thread Ricardo Neri
When deciding to pull tasks in ASYM_PACKING, it is necessary not only to
check for the idle state of the destination CPU, dst_cpu, but also of
its SMT siblings.

If dst_cpu is idle but its SMT siblings are busy, performance suffers
if it pulls tasks from a medium priority CPU that does not have SMT
siblings.

Implement asym_smt_can_pull_tasks() to inspect the state of the SMT
siblings of both dst_cpu and the CPUs in the candidate busiest group.

Cc: Aubrey Li 
Cc: Ben Segall 
Cc: Daniel Bristot de Oliveira 
Cc: Dietmar Eggemann 
Cc: Mel Gorman 
Cc: Quentin Perret 
Cc: Rafael J. Wysocki 
Cc: Srinivas Pandruvada 
Cc: Steven Rostedt 
Cc: Tim Chen 
Reviewed-by: Joel Fernandes (Google) 
Reviewed-by: Len Brown 
Signed-off-by: Ricardo Neri 
---
Changes since v3:
  * Removed the arch_asym_check_smt_siblings() hook. Discussions with the
powerpc folks showed that this patch should not impact them. Also, more
recent powerpc processor no longer use asym_packing. (PeterZ)
  * Removed unnecessary local variable in asym_can_pull_tasks(). (Dietmar)
  * Removed unnecessary check for local CPUs when the local group has zero
utilization. (Joel)
  * Renamed asym_can_pull_tasks() as asym_smt_can_pull_tasks() to reflect
the fact that it deals with SMT cases.
  * Made asym_smt_can_pull_tasks() return false for !CONFIG_SCHED_SMT so
that callers can deal with non-SMT cases.

Changes since v2:
  * Reworded the commit message to reflect updates in code.
  * Corrected misrepresentation of dst_cpu as the CPU doing the load
balancing. (PeterZ)
  * Removed call to arch_asym_check_smt_siblings() as it is now called in
sched_asym().

Changes since v1:
  * Don't bailout in update_sd_pick_busiest() if dst_cpu cannot pull
tasks. Instead, reclassify the candidate busiest group, as it
may still be selected. (PeterZ)
  * Avoid an expensive and unnecessary call to cpumask_weight() when
determining if a sched_group is comprised of SMT siblings.
(PeterZ).
---
 kernel/sched/fair.c | 95 +
 1 file changed, 95 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd411cefb63f..8a1a2a43732c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8531,10 +8531,99 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
 }
 
+/**
+ * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull 
tasks
+ * @dst_cpu:   Destination CPU of the load balancing
+ * @sds:   Load-balancing data with statistics of the local group
+ * @sgs:   Load-balancing statistics of the candidate busiest group
+ * @sg:        The candidate busiest group
+ *
+ * Check the state of the SMT siblings of both @sds::local and @sg and decide
+ * if @dst_cpu can pull tasks. If @dst_cpu does not have SMT siblings, it can
+ * pull tasks if two or more of the SMT siblings of @sg are busy. If only one
+ * CPU in @sg is busy, pull tasks only if @dst_cpu has higher priority.
+ *
+ * If both @dst_cpu and @sg have SMT siblings, even the number of idle CPUs
+ * between @sds::local and @sg. Thus, pull tasks from @sg if the difference
+ * between the number of busy CPUs is 2 or more. If the difference is of 1,
+ * only pull if @dst_cpu has higher priority. If @sg does not have SMT siblings
+ * only pull tasks if all of the SMT siblings of @dst_cpu are idle and @sg
+ * has lower priority.
+ */
+static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
+   struct sg_lb_stats *sgs,
+   struct sched_group *sg)
+{
+#ifdef CONFIG_SCHED_SMT
+   bool local_is_smt, sg_is_smt;
+   int sg_busy_cpus;
+
+   local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
+   sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
+
+   sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
+
+   if (!local_is_smt) {
+   /*
+* If we are here, @dst_cpu is idle and does not have SMT
+* siblings. Pull tasks if candidate group has two or more
+* busy CPUs.
+*/
+   if (sg_is_smt && sg_busy_cpus >= 2)
+   return true;
+
+   /*
+* @dst_cpu does not have SMT siblings. @sg may have SMT
+* siblings and only one is busy. In such case, @dst_cpu
+* can help if it has higher priority and is idle.
+*/
+   return !sds->local_stat.group_util &&
+  sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+   }
+
+   /* @dst_cpu has SMT siblings. */
+
+   if (sg_is_smt) {
+   int local_busy_cpus = sds->local->group_weight -
+ sds->local_stat.idle_cpus;
+   int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
+
+   /* Local can always help to even the number of busy CPUs. */
+   

Re: [PATCH v2] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs

2021-08-10 Thread Cédric Le Goater
 
> I forgot to add that this version does break irqbalance anymore, since

... does not break ...

Sorry for the noise,

> Linux is not mapping interrupts of CPU-less nodes.
> 
> Anyhow, irqbalance is now fixed : 
> 
>   
> https://github.com/Irqbalance/irqbalance/commit/a7f81483a95a94d6a62ca7fb999a090e01c0de9b
> 
> So v1 (plus irqbalance patch above) or v2 are safe to use. I do prefer 
> v2.
> 
> Thanks to Laurent and Srikar for the extra tests,
> 
> C.


[powerpc:next-test 60/60] arch/powerpc/kernel/misc_32.S:240: Error: unrecognized opcode: `emit_warn_entry'

2021-08-10 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
next-test
head:   5e3aa4531ecc7febbfa18218145c903dab17e651
commit: 5e3aa4531ecc7febbfa18218145c903dab17e651 [60/60] powerpc/bug: Provide 
better flexibility to WARN_ON/__WARN_FLAGS() with asm goto
config: powerpc64-buildonly-randconfig-r001-20210810 (attached as .config)
compiler: powerpc-linux-gcc (GCC) 10.3.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/commit/?id=5e3aa4531ecc7febbfa18218145c903dab17e651
git remote add powerpc 
https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
git fetch --no-tags powerpc next-test
git checkout 5e3aa4531ecc7febbfa18218145c903dab17e651
# save the attached .config to linux build tree
mkdir build_dir
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-10.3.0 make.cross 
O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   arch/powerpc/kernel/misc_32.S: Assembler messages:
>> arch/powerpc/kernel/misc_32.S:240: Error: unrecognized opcode: 
>> `emit_warn_entry'


vim +240 arch/powerpc/kernel/misc_32.S

   218  
   219  /*
   220   * Copy a whole page.  We use the dcbz instruction on the destination
   221   * to reduce memory traffic (it eliminates the unnecessary reads of
   222   * the destination into cache).  This requires that the destination
   223   * is cacheable.
   224   */
   225  #define COPY_16_BYTES   \
   226  lwz r6,4(r4);   \
   227  lwz r7,8(r4);   \
   228  lwz r8,12(r4);  \
   229  lwzu r9,16(r4); \
   230  stw r6,4(r3);   \
   231  stw r7,8(r3);   \
   232  stw r8,12(r3);  \
   233  stwu r9,16(r3)
   234  
   235  _GLOBAL(copy_page)
   236  rlwinm  r5, r3, 0, L1_CACHE_BYTES - 1
   237  addi r3,r3,-4
   238  
   239  0:  twnei   r5, 0   /* WARN if r3 is not cache aligned */
 > 240  EMIT_WARN_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
   241  
   242  addi r4,r4,-4
   243  
   244  li  r5,4
   245  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org




Re: [PATCH 07/11] treewide: Replace the use of mem_encrypt_active() with prot_guest_has()

2021-08-10 Thread Tom Lendacky
On 8/10/21 1:45 PM, Kuppuswamy, Sathyanarayanan wrote:
> 
> 
> On 7/27/21 3:26 PM, Tom Lendacky wrote:
>> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
>> index de01903c3735..cafed6456d45 100644
>> --- a/arch/x86/kernel/head64.c
>> +++ b/arch/x86/kernel/head64.c
>> @@ -19,7 +19,7 @@
>>   #include 
>>   #include 
>>   #include 
>> -#include 
>> +#include 
>>   #include 
>>     #include 
>> @@ -285,7 +285,7 @@ unsigned long __head __startup_64(unsigned long
>> physaddr,
>>    * there is no need to zero it after changing the memory encryption
>>    * attribute.
>>    */
>> -    if (mem_encrypt_active()) {
>> +    if (prot_guest_has(PATTR_MEM_ENCRYPT)) {
>>   vaddr = (unsigned long)__start_bss_decrypted;
>>   vaddr_end = (unsigned long)__end_bss_decrypted;
> 
> 
> Since this change is specific to AMD, can you replace PATTR_MEM_ENCRYPT with
> prot_guest_has(PATTR_SME) || prot_guest_has(PATTR_SEV). It is not used in
> TDX.

This is a direct replacement for now. I think the change you're requesting
should be done as part of the TDX support patches so it's clear why it is
being changed.

But, wouldn't TDX still need to do something with this shared/unencrypted
area, though? Or since it is shared, there's actually nothing you need to
do (the bss decrypted section exists even if CONFIG_AMD_MEM_ENCRYPT is not
configured)?

Thanks,
Tom

> 


Re: [PATCH 07/11] treewide: Replace the use of mem_encrypt_active() with prot_guest_has()

2021-08-10 Thread Kuppuswamy, Sathyanarayanan




On 7/27/21 3:26 PM, Tom Lendacky wrote:

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index de01903c3735..cafed6456d45 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -19,7 +19,7 @@
  #include 
  #include 
  #include 
-#include 
+#include 
  #include 
  
  #include 

@@ -285,7 +285,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
 * there is no need to zero it after changing the memory encryption
 * attribute.
 */
-   if (mem_encrypt_active()) {
+   if (prot_guest_has(PATTR_MEM_ENCRYPT)) {
vaddr = (unsigned long)__start_bss_decrypted;
vaddr_end = (unsigned long)__end_bss_decrypted;



Since this change is specific to AMD, can you replace PATTR_MEM_ENCRYPT with
prot_guest_has(PATTR_SME) || prot_guest_has(PATTR_SEV). It is not used in
TDX.

--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer


Re: [PATCH 07/11] treewide: Replace the use of mem_encrypt_active() with prot_guest_has()

2021-08-10 Thread Kuppuswamy, Sathyanarayanan




On 8/10/21 12:48 PM, Tom Lendacky wrote:

On 8/10/21 1:45 PM, Kuppuswamy, Sathyanarayanan wrote:



On 7/27/21 3:26 PM, Tom Lendacky wrote:

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index de01903c3735..cafed6456d45 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -19,7 +19,7 @@
   #include 
   #include 
   #include 
-#include 
+#include 
   #include 
     #include 
@@ -285,7 +285,7 @@ unsigned long __head __startup_64(unsigned long
physaddr,
    * there is no need to zero it after changing the memory encryption
    * attribute.
    */
-    if (mem_encrypt_active()) {
+    if (prot_guest_has(PATTR_MEM_ENCRYPT)) {
   vaddr = (unsigned long)__start_bss_decrypted;
   vaddr_end = (unsigned long)__end_bss_decrypted;



Since this change is specific to AMD, can you replace PATTR_MEM_ENCRYPT with
prot_guest_has(PATTR_SME) || prot_guest_has(PATTR_SEV). It is not used in
TDX.


This is a direct replacement for now. I think the change you're requesting
should be done as part of the TDX support patches so it's clear why it is
being changed.


Ok. I will include it part of TDX changes.



But, wouldn't TDX still need to do something with this shared/unencrypted
area, though? Or since it is shared, there's actually nothing you need to
do (the bss decrypted section exists even if CONFIG_AMD_MEM_ENCRYPT is not
configured)?


Kirill had a requirement to turn on CONFIG_AMD_MEM_ENCRYPT for adding lazy
accept support in TDX guest kernel. Kirill, can you add details here?



Thanks,
Tom





--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer


[PATCH] powerpc/interrupt: Fix OOPS by not calling do_IRQ() from timer_interrupt()

2021-08-10 Thread Christophe Leroy
An interrupt handler shall not be called from another interrupt
handler, otherwise this leads to problems like the following:

Kernel attempted to write user page (afd4fa84) - exploit attempt? (uid: 
1000)
[ cut here ]
Bug: Write fault blocked by KUAP!
WARNING: CPU: 0 PID: 1617 at arch/powerpc/mm/fault.c:230 
do_page_fault+0x484/0x720
Modules linked in:
CPU: 0 PID: 1617 Comm: sshd Tainted: GW 
5.13.0-pmac-00010-g8393422eb77 #7
NIP:  c001b77c LR: c001b77c CTR: 
REGS: cb9e5bc0 TRAP: 0700   Tainted: GW  
(5.13.0-pmac-00010-g8393422eb77)
MSR:  00021032   CR: 24942424  XER: 

GPR00: c001b77c cb9e5c80 c1582c00 0021 3bff 085b 0027 
c8eb644c
GPR08: 0023    24942424 0063f8c8  
000186a0
GPR16: afd52dd4 afd52dd0 afd52dcc afd52dc8 0065a990 c07640c4 cb9e5e98 
cb9e5e90
GPR24: 0040 afd4fa96 0040 0200 c1fda6c0 afd4fa84 0300 
cb9e5cc0
NIP [c001b77c] do_page_fault+0x484/0x720
LR [c001b77c] do_page_fault+0x484/0x720
Call Trace:
[cb9e5c80] [c001b77c] do_page_fault+0x484/0x720 (unreliable)
[cb9e5cb0] [c000424c] DataAccess_virt+0xd4/0xe4
--- interrupt: 300 at __copy_tofrom_user+0x110/0x20c
NIP:  c001f9b4 LR: c03250a0 CTR: 0004
REGS: cb9e5cc0 TRAP: 0300   Tainted: GW  
(5.13.0-pmac-00010-g8393422eb77)
MSR:  9032   CR: 48028468  XER: 2000
DAR: afd4fa84 DSISR: 0a00
GPR00: 20726f6f cb9e5d80 c1582c00 0004 cb9e5e3a 0016 afd4fa80 

GPR08: 3835202d 72777872 2d78722d 0004 28028464 0063f8c8  
000186a0
GPR16: afd52dd4 afd52dd0 afd52dcc afd52dc8 0065a990 c07640c4 cb9e5e98 
cb9e5e90
GPR24: 0040 afd4fa96 0040 cb9e5e0c 0daa a000 cb9e5e98 
afd4fa56
NIP [c001f9b4] __copy_tofrom_user+0x110/0x20c
LR [c03250a0] _copy_to_iter+0x144/0x990
--- interrupt: 300
[cb9e5d80] [c03e89c0] n_tty_read+0xa4/0x598 (unreliable)
[cb9e5df0] [c03e2a0c] tty_read+0xdc/0x2b4
[cb9e5e80] [c0156bf8] vfs_read+0x274/0x340
[cb9e5f00] [c01571ac] ksys_read+0x70/0x118
[cb9e5f30] [c0016048] ret_from_syscall+0x0/0x28
--- interrupt: c00 at 0xa7855c88
NIP:  a7855c88 LR: a7855c5c CTR: 
REGS: cb9e5f40 TRAP: 0c00   Tainted: GW  
(5.13.0-pmac-00010-g8393422eb77)
MSR:  d032   CR: 2402446c  XER: 

GPR00: 0003 afd4ec70 a72137d0 000b afd4ecac 4000 0065a990 
0800
GPR08:  a7947930  0004 c15831b0 0063f8c8  
000186a0
GPR16: afd52dd4 afd52dd0 afd52dcc afd52dc8 0065a990 0065a9e0 0001 
0065fac0
GPR24:  0089 00664050  00668e30 a720c8dc a7943ff4 
0065f9b0
NIP [a7855c88] 0xa7855c88
LR [a7855c5c] 0xa7855c5c
--- interrupt: c00
Instruction dump:
3884aa88 38630178 48076861 807f0080 48042e45 2f83 419e0148 3c80c079
3c60c076 38841be4 386301c0 4801f705 <0fe0> 386b 4bfffe30 
3c80c06b
---[ end trace fd69b91a8046c2e5 ]---

Here the problem is that, by re-entering an exception handler,
kuap_save_and_lock() is called a second time, this time with KUAP
access locked, leading to regs->kuap being overwritten and hence
KUAP not being unlocked at exception exit as expected.

Do not call do_IRQ() from timer_interrupt() directly. Instead,
redefine do_IRQ() as a standard function named __do_IRQ(), and
call it from both the do_IRQ() and timer_interrupt() handlers.
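
In sketch form (not the verbatim hunks; the diff below is truncated in
this archive, the actual changes land in irq.c and time.c per the
diffstat):

void __do_IRQ(struct pt_regs *regs)
{
	/* former do_IRQ() body: switch to the IRQ stack and run the handler */
}

DEFINE_INTERRUPT_HANDLER_ASYNC(do_IRQ)
{
	__do_IRQ(regs);
}

/*
 * timer_interrupt() then calls __do_IRQ(regs) instead of do_IRQ(regs),
 * so the interrupt entry/exit wrapper (and kuap_save_and_lock()) runs
 * only once per exception.
 */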

Reported-by: Stan Johnson 
Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers")
Cc: sta...@vger.kernel.org
Cc: Nicholas Piggin 
Cc: Finn Thain 
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/interrupt.h | 3 +++
 arch/powerpc/include/asm/irq.h   | 2 +-
 arch/powerpc/kernel/irq.c| 7 ++-
 arch/powerpc/kernel/time.c   | 2 +-
 4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index fc4702bdd119..1e984a35a39f 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -590,6 +590,9 @@ DECLARE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode);
 
 DECLARE_INTERRUPT_HANDLER_ASYNC(TAUException);
 
+/* irq.c */
+DECLARE_INTERRUPT_HANDLER_ASYNC(do_IRQ);
+
 void __noreturn unrecoverable_exception(struct pt_regs *regs);
 
 void replay_system_reset(void);
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index 4982f3711fc3..2b3278534bc1 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -52,7 +52,7 @@ extern void *mcheckirq_ctx[NR_CPUS];
 extern void *hardirq_ctx[NR_CPUS];
 extern void *softirq_ctx[NR_CPUS];
 
-extern void 

[PATCH] powerpc/interrupt: Do not call single_step_exception() from other exceptions

2021-08-10 Thread Christophe Leroy
single_step_exception() is called by emulate_single_step(), which
is called from (at least) the alignment_exception() handler and the
program_check_exception() handler.

Redefine it as a regular __single_step_exception() which is called
by both single_step_exception() handler and emulate_single_step()
function.

Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers")
Cc: sta...@vger.kernel.org
Cc: Stan Johnson 
Cc: Nicholas Piggin 
Cc: Finn Thain 
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/traps.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index dfbce527c98e..d56254f05e17 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1104,7 +1104,7 @@ DEFINE_INTERRUPT_HANDLER(RunModeException)
_exception(SIGTRAP, regs, TRAP_UNK, 0);
 }
 
-DEFINE_INTERRUPT_HANDLER(single_step_exception)
+static void __single_step_exception(struct pt_regs *regs)
 {
clear_single_step(regs);
clear_br_trace(regs);
@@ -1121,6 +1121,11 @@ DEFINE_INTERRUPT_HANDLER(single_step_exception)
_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
 }
 
+DEFINE_INTERRUPT_HANDLER(single_step_exception)
+{
+   __single_step_exception(regs);
+}
+
 /*
  * After we have successfully emulated an instruction, we have to
  * check if the instruction was being single-stepped, and if so,
@@ -1130,7 +1135,7 @@ DEFINE_INTERRUPT_HANDLER(single_step_exception)
 static void emulate_single_step(struct pt_regs *regs)
 {
if (single_stepping(regs))
-   single_step_exception(regs);
+   __single_step_exception(regs);
 }
 
 static inline int __parse_fpscr(unsigned long fpscr)
-- 
2.25.0



Re: clang/ld.lld build fails with `can't create dynamic relocation R_PPC64_ADDR64 against local symbol in readonly segment`

2021-08-10 Thread Paul Menzel

Dear Linux folks,


Am 29.07.21 um 10:23 schrieb Paul Menzel:

I just wanted to make you aware that building Linux for ppc64le with 
clang/ld.lld fails with [1]:


     ld.lld: error: can't create dynamic relocation R_PPC64_ADDR64 
against symbol: empty_zero_page in readonly segment; recompile object 
files with -fPIC or pass '-Wl,-z,notext' to allow text relocations in 
the output

     >>> defined in arch/powerpc/kernel/head_64.o
     >>> referenced by 
arch/powerpc/kernel/head_64.o:(___ksymtab+empty_zero_page+0x0)


The patch below from one of the comments [2] fixes it.

--- i/arch/powerpc/Makefile
+++ w/arch/powerpc/Makefile
@@ -122,7 +122,7 @@ cflags-$(CONFIG_STACKPROTECTOR) += 
-mstack-protector-guard-reg=r2

  endif

  LDFLAGS_vmlinux-y := -Bstatic
-LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie
+LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie -z notext
  LDFLAGS_vmlinux    := $(LDFLAGS_vmlinux-y)
  LDFLAGS_vmlinux += $(call ld-option,--orphan-handling=warn)


Any comments on whether this is the right fix? The current Linux master 
branch still fails to build with `LLVM=1` on Ubuntu 21.04 without this change.
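
For reference, an illustrative way to reproduce (the toolchain prefix and
defconfig choice are assumptions, not from the report; the link failure is
expected without the change above):

# Illustrative clang/ld.lld build of a ppc64le kernel:
make ARCH=powerpc CROSS_COMPILE=powerpc64le-linux-gnu- LLVM=1 ppc64le_defconfig
make ARCH=powerpc CROSS_COMPILE=powerpc64le-linux-gnu- LLVM=1 -j"$(nproc)" vmlinux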



Kind regards,

Paul



[1]: https://github.com/ClangBuiltLinux/linux/issues/811
[2]: https://github.com/ClangBuiltLinux/linux/issues/811#issuecomment-568316320


[powerpc:next-test] BUILD REGRESSION 5e3aa4531ecc7febbfa18218145c903dab17e651

2021-08-10 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
next-test
branch HEAD: 5e3aa4531ecc7febbfa18218145c903dab17e651  powerpc/bug: Provide 
better flexibility to WARN_ON/__WARN_FLAGS() with asm goto

Error/Warning reports:

https://lore.kernel.org/linuxppc-dev/202108110231.pw76zfp3-...@intel.com

Error/Warning in current branch:

arch/powerpc/kernel/misc_32.S:240: Error: unrecognized opcode: `emit_warn_entry'

Error/Warning ids grouped by kconfigs:

gcc_recent_errors
`-- powerpc64-buildonly-randconfig-r001-20210810
`-- arch-powerpc-kernel-misc_32.S:Error:unrecognized-opcode:emit_warn_entry

elapsed time: 847m

configs tested: 92
configs skipped: 3

gcc tested configs:
arm defconfig
arm64allyesconfig
arm  allyesconfig
arm  allmodconfig
arm64   defconfig
sh   j2_defconfig
mips  rb532_defconfig
shdreamcast_defconfig
ia64 allmodconfig
ia64defconfig
ia64 allyesconfig
x86_64allnoconfig
m68k allmodconfig
m68kdefconfig
m68k allyesconfig
nios2   defconfig
arc  allyesconfig
nds32 allnoconfig
nds32   defconfig
nios2allyesconfig
cskydefconfig
alpha   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
arc defconfig
sh   allmodconfig
parisc  defconfig
s390 allyesconfig
s390 allmodconfig
parisc   allyesconfig
s390defconfig
i386 allyesconfig
sparcallyesconfig
sparc   defconfig
i386defconfig
mips allyesconfig
mips allmodconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a004-20210810
x86_64   randconfig-a006-20210810
x86_64   randconfig-a003-20210810
x86_64   randconfig-a005-20210810
x86_64   randconfig-a002-20210810
x86_64   randconfig-a001-20210810
i386 randconfig-a004-20210810
i386 randconfig-a002-20210810
i386 randconfig-a001-20210810
i386 randconfig-a003-20210810
i386 randconfig-a006-20210810
i386 randconfig-a005-20210810
i386 randconfig-a012-20210809
i386 randconfig-a015-20210809
i386 randconfig-a011-20210809
i386 randconfig-a013-20210809
i386 randconfig-a014-20210809
i386 randconfig-a016-20210809
i386 randconfig-a011-20210810
i386 randconfig-a015-20210810
i386 randconfig-a013-20210810
i386 randconfig-a014-20210810
i386 randconfig-a016-20210810
i386 randconfig-a012-20210810
riscvnommu_k210_defconfig
riscvallyesconfig
riscvnommu_virt_defconfig
riscv allnoconfig
riscv   defconfig
riscv  rv32_defconfig
riscvallmodconfig
x86_64rhel-8.3-kselftests
um   x86_64_defconfig
um i386_defconfig
x86_64   allyesconfig
x86_64  defconfig
x86_64   rhel-8.3
x86_64  kexec

clang tested configs:
x86_64   randconfig-c001-20210810
x86_64   randconfig-a002-20210808
x86_64   randconfig-a004-20210808
x86_64   randconfig-a006-20210808
x86_64   randconfig-a003-20210808
x86_64   randconfig-a001-20210808
x86_64   randconfig-a005-20210808
x86_64   randconfig-a013-20210810
x86_64   randconfig-a011-20210810
x86_64   randconfig-a012-20210810
x86_64   randconfig-a016-20210810
x86_64   randconfig-a014-20210810
x86_64   randconfig-a015-20210810

---
0-DAY CI Kernel Test Service

Re: [PATCH v2] powerpc/kprobes: Fix kprobe Oops happens in booke

2021-08-10 Thread Pu Lehui

Ping, this is a serious problem. All booke ppc will trigger an Oops when
performing kprobe-related operations.

On 2021/8/9 10:36, Pu Lehui wrote:

When using kprobes on a powerpc booke series processor, an Oops happens
as shown below:

/ # echo "p:myprobe do_nanosleep" > /sys/kernel/debug/tracing/kprobe_events
/ # echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
/ # sleep 1
[   50.076730] Oops: Exception in kernel mode, sig: 5 [#1]
[   50.077017] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500
[   50.077221] Modules linked in:
[   50.077462] CPU: 0 PID: 77 Comm: sleep Not tainted 
5.14.0-rc4-00022-g251a1524293d #21
[   50.077887] NIP:  c0b9c4e0 LR: c00ebecc CTR: 
[   50.078067] REGS: c3883de0 TRAP: 0700   Not tainted 
(5.14.0-rc4-00022-g251a1524293d)
[   50.078349] MSR:  00029000   CR: 24000228  XER: 2000
[   50.078675]
[   50.078675] GPR00: c00ebdf0 c3883e90 c313e300 c3883ea0 0001  
c3883ecc 0001
[   50.078675] GPR08: c100598c c00ea250 0004  24000222 102490c2 
bff4180c 101e60d4
[   50.078675] GPR16:  102454ac 0040 1024 10241100 102410f8 
1024 0050
[   50.078675] GPR24: 0002  c3883ea0 0001  c350 
3b9b8d50 
[   50.080151] NIP [c0b9c4e0] do_nanosleep+0x0/0x190
[   50.080352] LR [c00ebecc] hrtimer_nanosleep+0x14c/0x1e0
[   50.080638] Call Trace:
[   50.080801] [c3883e90] [c00ebdf0] hrtimer_nanosleep+0x70/0x1e0 (unreliable)
[   50.081110] [c3883f00] [c00ec004] sys_nanosleep_time32+0xa4/0x110
[   50.081336] [c3883f40] [c001509c] ret_from_syscall+0x0/0x28
[   50.081541] --- interrupt: c00 at 0x100a4d08
[   50.081749] NIP:  100a4d08 LR: 101b5234 CTR: 0003
[   50.081931] REGS: c3883f50 TRAP: 0c00   Not tainted 
(5.14.0-rc4-00022-g251a1524293d)
[   50.082183] MSR:  0002f902   CR: 24000222  XER: 
[   50.082457]
[   50.082457] GPR00: 00a2 bf980040 1024b4d0 bf980084 bf980084 6400 
00555345 fefefeff
[   50.082457] GPR08: 7f7f7f7f 101e 0069 0003 28000422 102490c2 
bff4180c 101e60d4
[   50.082457] GPR16:  102454ac 0040 1024 10241100 102410f8 
1024 0050
[   50.082457] GPR24: 0002 bf9803f4 1024   100039e0 
 102444e8
[   50.083789] NIP [100a4d08] 0x100a4d08
[   50.083917] LR [101b5234] 0x101b5234
[   50.084042] --- interrupt: c00
[   50.084238] Instruction dump:
[   50.084483] 4bfffc40 6000 6000 6000 9421fff0 39400402 914200c0 
38210010
[   50.084841] 4bfffc20    <7fe8> 7c0802a6 7c892378 
93c10048
[   50.085487] ---[ end trace f6fffe98e2fa8f3e ]---
[   50.085678]
Trace/breakpoint trap

There is no real mode on the booke arch and MMU translation is
always on. The corresponding MSR_IS/MSR_DS bits on booke are used
to switch the address space, not to indicate real mode.
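
For context, the relevant booke definitions look roughly like this
(paraphrased; assuming reg_booke.h still aliases the bits this way):

/*
 * Paraphrased from arch/powerpc/include/asm/reg_booke.h: the address
 * space select bits reuse the IR/DR bit positions, so the generic
 * "real mode" test actually reads IS/DS on booke.
 */
#define MSR_IS	MSR_IR	/* Instruction Space (same bit as MSR_IR) */
#define MSR_DS	MSR_DR	/* Data Space (same bit as MSR_DR) */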

Fixes: 21f8b2fa3ca5 ("powerpc/kprobes: Ignore traps that happened in real mode")
Signed-off-by: Pu Lehui 
---
v1->v2:
- use IS_ENABLED(CONFIG_BOOKE) as suggested by Michael Ellerman and
   Christophe Leroy
- update Oops log to make problem clear

  arch/powerpc/kernel/kprobes.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index cbc28d1a2e1b..7a7cd6bda53e 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -292,7 +292,8 @@ int kprobe_handler(struct pt_regs *regs)
if (user_mode(regs))
return 0;
  
-	if (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR))
+	if (!IS_ENABLED(CONFIG_BOOKE) &&
+	    (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR)))
return 0;
  
  	/*




[powerpc:fixes-test] BUILD SUCCESS 43e8f76006592cb1573a959aa287c45421066f9c

2021-08-10 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
fixes-test
branch HEAD: 43e8f76006592cb1573a959aa287c45421066f9c  powerpc/kprobes: Fix 
kprobe Oops happens in booke

elapsed time: 724m

configs tested: 83
configs skipped: 86

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm defconfig
arm64allyesconfig
arm  allyesconfig
arm  allmodconfig
arm64   defconfig
ia64 allmodconfig
ia64defconfig
ia64 allyesconfig
x86_64allnoconfig
m68k allmodconfig
m68kdefconfig
m68k allyesconfig
nios2   defconfig
arc  allyesconfig
nds32 allnoconfig
nds32   defconfig
nios2allyesconfig
cskydefconfig
alpha   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
arc defconfig
sh   allmodconfig
i386 allyesconfig
sparcallyesconfig
sparc   defconfig
i386defconfig
mips allyesconfig
mips allmodconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a004-20210810
x86_64   randconfig-a006-20210810
x86_64   randconfig-a003-20210810
x86_64   randconfig-a005-20210810
x86_64   randconfig-a002-20210810
x86_64   randconfig-a001-20210810
i386 randconfig-a004-20210810
i386 randconfig-a002-20210810
i386 randconfig-a001-20210810
i386 randconfig-a003-20210810
i386 randconfig-a006-20210810
i386 randconfig-a005-20210810
i386 randconfig-a012-20210809
i386 randconfig-a015-20210809
i386 randconfig-a011-20210809
i386 randconfig-a013-20210809
i386 randconfig-a014-20210809
i386 randconfig-a016-20210809
i386 randconfig-a011-20210810
i386 randconfig-a015-20210810
i386 randconfig-a013-20210810
i386 randconfig-a014-20210810
i386 randconfig-a016-20210810
i386 randconfig-a012-20210810
riscvnommu_k210_defconfig
riscvallyesconfig
riscvnommu_virt_defconfig
riscv allnoconfig
riscv   defconfig
riscv  rv32_defconfig
riscvallmodconfig
x86_64rhel-8.3-kselftests
um   x86_64_defconfig
um i386_defconfig
x86_64   allyesconfig
x86_64  defconfig
x86_64   rhel-8.3
x86_64  kexec

clang tested configs:
x86_64   randconfig-a002-20210808
x86_64   randconfig-a004-20210808
x86_64   randconfig-a006-20210808
x86_64   randconfig-a003-20210808
x86_64   randconfig-a001-20210808
x86_64   randconfig-a005-20210808
x86_64   randconfig-a013-20210810
x86_64   randconfig-a011-20210810
x86_64   randconfig-a012-20210810
x86_64   randconfig-a016-20210810
x86_64   randconfig-a014-20210810
x86_64   randconfig-a015-20210810

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


Re: [PATCH v7 5/6] powerpc/pseries: Add support for FORM2 associativity

2021-08-10 Thread Aneesh Kumar K.V
David Gibson  writes:

> On Mon, Aug 09, 2021 at 10:54:33AM +0530, Aneesh Kumar K.V wrote:
>> PAPR interface currently supports two different ways of communicating 
>> resource
>> grouping details to the OS. These are referred to as Form 0 and Form 1
>> associativity grouping. Form 0 is the older format and is now considered
>> deprecated. This patch adds another resource grouping named FORM2.
>> 
>> Signed-off-by: Daniel Henrique Barboza 
>> Signed-off-by: Aneesh Kumar K.V 
>
> LGTM, with the exception of some minor nits noted below.
...

> +
>> +	for (i = 0; i < max_numa_index; i++)
>> +		/* +1 skip the max_numa_index in the property */
>> +		numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
>> +
>> +
>> +if (numa_dist_table_length != max_numa_index * max_numa_index) {
>> +
>
> Stray extra whitespace line here.
>
>> +		WARN(1, "Wrong NUMA distance information\n");
>> +		/* consider everybody else just remote. */
>> +		for (i = 0; i < max_numa_index; i++) {
>> +			for (j = 0; j < max_numa_index; j++) {
>> +				int nodeA = numa_id_index_table[i];
>> +				int nodeB = numa_id_index_table[j];
>> +
>> +				if (nodeA == nodeB)
>> +					numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE;
>> +				else
>> +					numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE;
>> +			}
>> +		}
>
> I don't think it's necessarily a problem, but something to consider is
> that this fallback will initialize distance for *all* node IDs,
> whereas the normal path will only initialize it for nodes that are in
> the index table.  Since some later error checks key off whether
> certain fields in the distance table are initialized, is that the
> outcome you want?
>

When the device tree details are not correct, one possible way to
make progress is to consider everybody remote. With node hotplug
support, we used to check whether the distance table entry was
initialized. With the updated spec, we expect all possible NUMA node
distances to be available during boot.

-aneesh


Re: [powerpc:next-test 60/60] arch/powerpc/kernel/misc_32.S:240: Error: unrecognized opcode: `emit_warn_entry'

2021-08-10 Thread Christophe Leroy




Le 10/08/2021 à 20:10, kernel test robot a écrit :

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
next-test
head:   5e3aa4531ecc7febbfa18218145c903dab17e651
commit: 5e3aa4531ecc7febbfa18218145c903dab17e651 [60/60] powerpc/bug: Provide 
better flexibility to WARN_ON/__WARN_FLAGS() with asm goto
config: powerpc64-buildonly-randconfig-r001-20210810 (attached as .config)
compiler: powerpc-linux-gcc (GCC) 10.3.0
reproduce (this is a W=1 build):
 wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
 chmod +x ~/bin/make.cross
 # 
https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/commit/?id=5e3aa4531ecc7febbfa18218145c903dab17e651
 git remote add powerpc 
https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
 git fetch --no-tags powerpc next-test
 git checkout 5e3aa4531ecc7febbfa18218145c903dab17e651
 # save the attached .config to linux build tree
 mkdir build_dir
 COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-10.3.0 make.cross 
O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

arch/powerpc/kernel/misc_32.S: Assembler messages:

arch/powerpc/kernel/misc_32.S:240: Error: unrecognized opcode: `emit_warn_entry'


We are missing stubs when CONFIG_BUG is not selected.

The following should fix it:

diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index f36b872cb571..1ee0f22313ee 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -137,8 +137,11 @@ __label_warn_on:   
\
 #ifdef __ASSEMBLY__
 .macro EMIT_BUG_ENTRY addr,file,line,flags
 .endm
+.macro EMIT_WARN_ENTRY addr,file,line,flags
+.endm
 #else /* !__ASSEMBLY__ */
 #define _EMIT_BUG_ENTRY
+#define _EMIT_WARN_ENTRY
 #endif
 #endif /* CONFIG_BUG */



Re: [PATCH v2] powerpc/kprobes: Fix kprobe Oops happens in booke

2021-08-10 Thread Christophe Leroy




Le 11/08/2021 à 04:53, Pu Lehui a écrit :

Ping, this is a serious problem. All booke ppc will trigger an Oops when
performing kprobe-related operations.


As far as I can see it is in the fixes branch: 
https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/log/?h=fixes




On 2021/8/9 10:36, Pu Lehui wrote:

When using kprobes on a powerpc booke series processor, an Oops happens
as shown below:

/ # echo "p:myprobe do_nanosleep" > /sys/kernel/debug/tracing/kprobe_events
/ # echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
/ # sleep 1
[   50.076730] Oops: Exception in kernel mode, sig: 5 [#1]
[   50.077017] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500
[   50.077221] Modules linked in:
[   50.077462] CPU: 0 PID: 77 Comm: sleep Not tainted 
5.14.0-rc4-00022-g251a1524293d #21
[   50.077887] NIP:  c0b9c4e0 LR: c00ebecc CTR: 
[   50.078067] REGS: c3883de0 TRAP: 0700   Not tainted 
(5.14.0-rc4-00022-g251a1524293d)
[   50.078349] MSR:  00029000   CR: 24000228  XER: 2000
[   50.078675]
[   50.078675] GPR00: c00ebdf0 c3883e90 c313e300 c3883ea0 0001  
c3883ecc 0001
[   50.078675] GPR08: c100598c c00ea250 0004  24000222 102490c2 
bff4180c 101e60d4
[   50.078675] GPR16:  102454ac 0040 1024 10241100 102410f8 
1024 0050
[   50.078675] GPR24: 0002  c3883ea0 0001  c350 
3b9b8d50 
[   50.080151] NIP [c0b9c4e0] do_nanosleep+0x0/0x190
[   50.080352] LR [c00ebecc] hrtimer_nanosleep+0x14c/0x1e0
[   50.080638] Call Trace:
[   50.080801] [c3883e90] [c00ebdf0] hrtimer_nanosleep+0x70/0x1e0 (unreliable)
[   50.081110] [c3883f00] [c00ec004] sys_nanosleep_time32+0xa4/0x110
[   50.081336] [c3883f40] [c001509c] ret_from_syscall+0x0/0x28
[   50.081541] --- interrupt: c00 at 0x100a4d08
[   50.081749] NIP:  100a4d08 LR: 101b5234 CTR: 0003
[   50.081931] REGS: c3883f50 TRAP: 0c00   Not tainted 
(5.14.0-rc4-00022-g251a1524293d)
[   50.082183] MSR:  0002f902   CR: 24000222  XER: 
[   50.082457]
[   50.082457] GPR00: 00a2 bf980040 1024b4d0 bf980084 bf980084 6400 
00555345 fefefeff
[   50.082457] GPR08: 7f7f7f7f 101e 0069 0003 28000422 102490c2 
bff4180c 101e60d4
[   50.082457] GPR16:  102454ac 0040 1024 10241100 102410f8 
1024 0050
[   50.082457] GPR24: 0002 bf9803f4 1024   100039e0 
 102444e8
[   50.083789] NIP [100a4d08] 0x100a4d08
[   50.083917] LR [101b5234] 0x101b5234
[   50.084042] --- interrupt: c00
[   50.084238] Instruction dump:
[   50.084483] 4bfffc40 6000 6000 6000 9421fff0 39400402 914200c0 
38210010
[   50.084841] 4bfffc20    <7fe8> 7c0802a6 7c892378 
93c10048
[   50.085487] ---[ end trace f6fffe98e2fa8f3e ]---
[   50.085678]
Trace/breakpoint trap

There is no real mode on the booke arch and MMU translation is
always on. The corresponding MSR_IS/MSR_DS bits on booke are used
to switch the address space, not to indicate real mode.

Fixes: 21f8b2fa3ca5 ("powerpc/kprobes: Ignore traps that happened in real mode")
Signed-off-by: Pu Lehui 
---
v1->v2:
- use IS_ENABLED(CONFIG_BOOKE) as suggested by Michael Ellerman and
   Christophe Leroy
- update Oops log to make problem clear

  arch/powerpc/kernel/kprobes.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index cbc28d1a2e1b..7a7cd6bda53e 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -292,7 +292,8 @@ int kprobe_handler(struct pt_regs *regs)
  if (user_mode(regs))
  return 0;
-	if (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR))
+	if (!IS_ENABLED(CONFIG_BOOKE) &&
+	    (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR)))
  return 0;
  /*