Re: [PATCH v2] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs
On 07/08/2021 09:20, Cédric Le Goater wrote: > On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at > runtime. Today, the IPI is not created for such nodes, and hot-plugged > CPUs use a bogus IPI, which leads to soft lockups. > > We can not directly allocate and request the IPI on demand because > bringup_up() is called under the IRQ sparse lock. The alternative is > to allocate the IPIs for all possible nodes at startup and to request > the mapping on demand when the first CPU of a node is brought up. > > Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node") > Cc: sta...@vger.kernel.org # v5.13 > Reported-by: Geetika Moolchandani > Cc: Srikar Dronamraju > Cc: Laurent Vivier > Signed-off-by: Cédric Le Goater > Message-Id: <20210629131542.743888-1-...@kaod.org> > Signed-off-by: Cédric Le Goater > --- > arch/powerpc/sysdev/xive/common.c | 35 +-- > 1 file changed, 24 insertions(+), 11 deletions(-) > > diff --git a/arch/powerpc/sysdev/xive/common.c > b/arch/powerpc/sysdev/xive/common.c > index dbdbbc2f1dc5..943fd30095af 100644 > --- a/arch/powerpc/sysdev/xive/common.c > +++ b/arch/powerpc/sysdev/xive/common.c > @@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domain; > static struct xive_ipi_desc { > unsigned int irq; > char name[16]; > + atomic_t started; > } *xive_ipis; > > /* > @@ -1120,7 +1121,7 @@ static const struct irq_domain_ops > xive_ipi_irq_domain_ops = { > .alloc = xive_ipi_irq_domain_alloc, > }; > > -static int __init xive_request_ipi(void) > +static int __init xive_init_ipis(void) > { > struct fwnode_handle *fwnode; > struct irq_domain *ipi_domain; > @@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void) > struct xive_ipi_desc *xid = _ipis[node]; > struct xive_ipi_alloc_info info = { node }; > > - /* Skip nodes without CPUs */ > - if (cpumask_empty(cpumask_of_node(node))) > - continue; > - > /* >* Map one IPI interrupt per node for all cpus of that node. 
>* Since the HW interrupt number doesn't have any meaning, > @@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void) > xid->irq = ret; > > snprintf(xid->name, sizeof(xid->name), "IPI-%d", node); > - > - ret = request_irq(xid->irq, xive_muxed_ipi_action, > - IRQF_PERCPU | IRQF_NO_THREAD, xid->name, > NULL); > - > - WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); > } > > return ret; > @@ -1178,6 +1170,22 @@ static int __init xive_request_ipi(void) > return ret; > } > > +static int __init xive_request_ipi(unsigned int cpu) > +{ > + struct xive_ipi_desc *xid = _ipis[early_cpu_to_node(cpu)]; > + int ret; > + > + if (atomic_inc_return(>started) > 1) > + return 0; > + > + ret = request_irq(xid->irq, xive_muxed_ipi_action, > + IRQF_PERCPU | IRQF_NO_THREAD, > + xid->name, NULL); > + > + WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); > + return ret; > +} > + > static int xive_setup_cpu_ipi(unsigned int cpu) > { > unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu); > @@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned int cpu) > if (xc->hw_ipi != XIVE_BAD_IRQ) > return 0; > > + /* Register the IPI */ > + xive_request_ipi(cpu); > + > /* Grab an IPI from the backend, this will populate xc->hw_ipi */ > if (xive_ops->get_ipi(cpu, xc)) > return -EIO; > @@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigned int cpu, > struct xive_cpu *xc) > if (xc->hw_ipi == XIVE_BAD_IRQ) > return; > > + /* TODO: clear IPI mapping */ > + > /* Mask the IPI */ > xive_do_source_set_mask(>ipi_data, true); > > @@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void) > smp_ops->cause_ipi = xive_cause_ipi; > > /* Register the IPI */ > - xive_request_ipi(); > + xive_init_ipis(); > > /* Allocate and setup IPI for the boot CPU */ > xive_setup_cpu_ipi(smp_processor_id()); > Tested-by: Laurent Vivier
[bug report] soc: fsl: qe: convert QE interrupt controller to platform_device
Hello Maxim Kochetkov, The patch be7ecbd240b2: "soc: fsl: qe: convert QE interrupt controller to platform_device" from Aug 3, 2021, leads to the following static checker warning: drivers/soc/fsl/qe/qe_ic.c:438 qe_ic_init() warn: unsigned 'qe_ic->virq_low' is never less than zero. drivers/soc/fsl/qe/qe_ic.c 408 static int qe_ic_init(struct platform_device *pdev) 409 { 410 struct device *dev = >dev; 411 void (*low_handler)(struct irq_desc *desc); 412 void (*high_handler)(struct irq_desc *desc); 413 struct qe_ic *qe_ic; 414 struct resource *res; 415 struct device_node *node = pdev->dev.of_node; 416 417 res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 418 if (res == NULL) { 419 dev_err(dev, "no memory resource defined\n"); 420 return -ENODEV; 421 } 422 423 qe_ic = devm_kzalloc(dev, sizeof(*qe_ic), GFP_KERNEL); 424 if (qe_ic == NULL) 425 return -ENOMEM; 426 427 qe_ic->regs = devm_ioremap(dev, res->start, resource_size(res)); 428 if (qe_ic->regs == NULL) { 429 dev_err(dev, "failed to ioremap() registers\n"); 430 return -ENODEV; 431 } 432 433 qe_ic->hc_irq = qe_ic_irq_chip; 434 435 qe_ic->virq_high = platform_get_irq(pdev, 0); 436 qe_ic->virq_low = platform_get_irq(pdev, 1); 437 --> 438 if (qe_ic->virq_low < 0) { 439 return -ENODEV; 440 } Unsigned can't be less than zero. It's weird that it doesn't check qe_ic->virq_high as well. Also remove the curly braces to make checkpatch happy? 
441 442 if (qe_ic->virq_high != qe_ic->virq_low) { 443 low_handler = qe_ic_cascade_low; 444 high_handler = qe_ic_cascade_high; 445 } else { 446 low_handler = qe_ic_cascade_muxed_mpic; 447 high_handler = NULL; 448 } 449 450 qe_ic->irqhost = irq_domain_add_linear(node, NR_QE_IC_INTS, 451_ic_host_ops, qe_ic); 452 if (qe_ic->irqhost == NULL) { 453 dev_err(dev, "failed to add irq domain\n"); 454 return -ENODEV; 455 } 456 457 qe_ic_write(qe_ic->regs, QEIC_CICR, 0); 458 459 irq_set_handler_data(qe_ic->virq_low, qe_ic); 460 irq_set_chained_handler(qe_ic->virq_low, low_handler); 461 462 if (qe_ic->virq_high && qe_ic->virq_high != qe_ic->virq_low) { 463 irq_set_handler_data(qe_ic->virq_high, qe_ic); 464 irq_set_chained_handler(qe_ic->virq_high, high_handler); 465 } 466 return 0; 467 } regards, dan carpenter
Re: [PATCH v2 3/4] powerpc: Optimize register usage for dear register
On Sat, Aug 7, 2021 at 2:58 PM Christophe Leroy wrote: > > > > Le 07/08/2021 à 03:02, sxwj...@me.com a écrit : > > From: Xiongwei Song > > > > Create an anonymous union for dar and dear regsiters, we can reference > > dear to get the effective address when CONFIG_4xx=y or CONFIG_BOOKE=y. > > Otherwise, reference dar. This makes code more clear. > > > > Signed-off-by: Xiongwei Song > > --- > > arch/powerpc/include/asm/ptrace.h | 5 - > > arch/powerpc/kernel/process.c | 2 +- > > arch/powerpc/kernel/ptrace/ptrace.c | 2 ++ > > 3 files changed, 7 insertions(+), 2 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/ptrace.h > > b/arch/powerpc/include/asm/ptrace.h > > index c252d04b1206..fa725e3238c2 100644 > > --- a/arch/powerpc/include/asm/ptrace.h > > +++ b/arch/powerpc/include/asm/ptrace.h > > @@ -43,7 +43,10 @@ struct pt_regs > > unsigned long mq; > > #endif > > unsigned long trap; > > - unsigned long dar; > > + union { > > + unsigned long dar; > > + unsigned long dear; > > + }; > > union { > > unsigned long dsisr; > > unsigned long esr; > > diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c > > index f74af8f9133c..50436b52c213 100644 > > --- a/arch/powerpc/kernel/process.c > > +++ b/arch/powerpc/kernel/process.c > > @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) > > trap == INTERRUPT_DATA_STORAGE || > > trap == INTERRUPT_ALIGNMENT) { > > if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > > - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, > > regs->esr); > > + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, > > regs->esr); > > else > > pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, > > regs->dsisr); > > } > > diff --git a/arch/powerpc/kernel/ptrace/ptrace.c > > b/arch/powerpc/kernel/ptrace/ptrace.c > > index a222fd4d6334..7c7093c17c45 100644 > > --- a/arch/powerpc/kernel/ptrace/ptrace.c > > +++ b/arch/powerpc/kernel/ptrace/ptrace.c > > @@ -373,6 +373,8 @@ void __init pt_regs_check(void) > >offsetof(struct 
user_pt_regs, trap)); > > BUILD_BUG_ON(offsetof(struct pt_regs, dar) != > >offsetof(struct user_pt_regs, dar)); > > + BUILD_BUG_ON(offsetof(struct pt_regs, dear) != > > + offsetof(struct user_pt_regs, dar)); > > dar and dear are the same, so checking the same thing a second time is > pointless. Same reply as the patch 1. Regards, Xiongwei > > > BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != > >offsetof(struct user_pt_regs, dsisr)); > > BUILD_BUG_ON(offsetof(struct pt_regs, esr) != > >
Re: [PATCH v2 1/4] powerpc: Optimize register usage for esr register
On Sat, Aug 7, 2021 at 2:57 PM Christophe Leroy wrote: > > > > Le 07/08/2021 à 03:02, sxwj...@me.com a écrit : > > From: Xiongwei Song > > > > Create an anonymous union for dsisr and esr regsiters, we can reference > > esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. > > Otherwise, reference dsisr. This makes code more clear. > > > > Signed-off-by: Xiongwei Song > > --- > > arch/powerpc/include/asm/ptrace.h | 5 - > > arch/powerpc/kernel/process.c | 2 +- > > arch/powerpc/kernel/ptrace/ptrace.c| 2 ++ > > arch/powerpc/kernel/traps.c| 2 +- > > arch/powerpc/platforms/44x/machine_check.c | 4 ++-- > > arch/powerpc/platforms/4xx/machine_check.c | 2 +- > > 6 files changed, 11 insertions(+), 6 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/ptrace.h > > b/arch/powerpc/include/asm/ptrace.h > > index 3e5d470a6155..c252d04b1206 100644 > > --- a/arch/powerpc/include/asm/ptrace.h > > +++ b/arch/powerpc/include/asm/ptrace.h > > @@ -44,7 +44,10 @@ struct pt_regs > > #endif > > unsigned long trap; > > unsigned long dar; > > - unsigned long dsisr; > > + union { > > + unsigned long dsisr; > > + unsigned long esr; > > + }; > > unsigned long result; > > }; > > }; > > diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c > > index 185beb290580..f74af8f9133c 100644 > > --- a/arch/powerpc/kernel/process.c > > +++ b/arch/powerpc/kernel/process.c > > @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) > > trap == INTERRUPT_DATA_STORAGE || > > trap == INTERRUPT_ALIGNMENT) { > > if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > > - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, > > regs->dsisr); > > + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, > > regs->esr); > > else > > pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, > > regs->dsisr); > > } > > diff --git a/arch/powerpc/kernel/ptrace/ptrace.c > > b/arch/powerpc/kernel/ptrace/ptrace.c > > index 0a0a33eb0d28..a222fd4d6334 100644 > > --- 
a/arch/powerpc/kernel/ptrace/ptrace.c > > +++ b/arch/powerpc/kernel/ptrace/ptrace.c > > @@ -375,6 +375,8 @@ void __init pt_regs_check(void) > >offsetof(struct user_pt_regs, dar)); > > BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != > >offsetof(struct user_pt_regs, dsisr)); > > + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != > > + offsetof(struct user_pt_regs, dsisr)); > > esr and dsisr are the same, so checking the same thing a second time is > pointless. Hmm...it's better to check if the changes on pt_regs structure is expected I think. Regards, Xiongwei > > > BUILD_BUG_ON(offsetof(struct pt_regs, result) != > >offsetof(struct user_pt_regs, result)); > > > > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c > > index dfbce527c98e..2164f5705a0b 100644 > > --- a/arch/powerpc/kernel/traps.c > > +++ b/arch/powerpc/kernel/traps.c > > @@ -562,7 +562,7 @@ static inline int check_io_access(struct pt_regs *regs) > > #ifdef CONFIG_PPC_ADV_DEBUG_REGS > > /* On 4xx, the reason for the machine check or program exception > > is in the ESR. 
*/ > > -#define get_reason(regs) ((regs)->dsisr) > > +#define get_reason(regs) ((regs)->esr) > > #define REASON_FP ESR_FP > > #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) > > #define REASON_PRIVILEGED ESR_PPR > > diff --git a/arch/powerpc/platforms/44x/machine_check.c > > b/arch/powerpc/platforms/44x/machine_check.c > > index a5c898bb9bab..5d19daacd78a 100644 > > --- a/arch/powerpc/platforms/44x/machine_check.c > > +++ b/arch/powerpc/platforms/44x/machine_check.c > > @@ -11,7 +11,7 @@ > > > > int machine_check_440A(struct pt_regs *regs) > > { > > - unsigned long reason = regs->dsisr; > > + unsigned long reason = regs->esr; > > > > printk("Machine check in kernel mode.\n"); > > if (reason & ESR_IMCP){ > > @@ -48,7 +48,7 @@ int machine_check_440A(struct pt_regs *regs) > > #ifdef CONFIG_PPC_47x > > int machine_check_47x(struct pt_regs *regs) > > { > > - unsigned long reason = regs->dsisr; > > + unsigned long reason = regs->esr; > > u32 mcsr; > > > > printk(KERN_ERR "Machine check in kernel mode.\n"); > > diff --git a/arch/powerpc/platforms/4xx/machine_check.c > > b/arch/powerpc/platforms/4xx/machine_check.c > > index a71c29892a91..a905da1d6f41 100644 > > --- a/arch/powerpc/platforms/4xx/machine_check.c > > +++ b/arch/powerpc/platforms/4xx/machine_check.c > > @@ -10,7 +10,7 @@ > > > > int machine_check_4xx(struct pt_regs *regs) > > { > > - unsigned long
Re: [PATCH v4 0/5] nvmem: nintendo-otp: Add new driver for the Wii and Wii U OTP
On 01/08/2021 08:38, Emmanuel Gil Peyrot wrote: The OTP is a read-only memory area which contains various keys and signatures used to decrypt, encrypt or verify various pieces of storage. Its size depends on the console, it is 128 bytes on the Wii and 1024 bytes on the Wii U (split into eight 128 bytes banks). It can be used directly by writing into one register and reading from the other one, without any additional synchronisation. This series has been tested on both the Wii U (using my downstream master-wiiu branch[1]), as well as on the Wii on mainline. [1] https://gitlab.com/linkmauve/linux-wiiu/-/commits/master-wiiu Changes since v1: - Fixed the commit messages so they can be accepted by other email servers, sorry about that. Changes since v2: - Switched the dt binding documentation to YAML. - Used more obvious register arithmetic, and tested that gcc (at -O1 and above) outputs the exact same rlwinm instructions for them. - Use more #defines to make the code easier to read. - Include some links to the reversed documentation. - Avoid overlapping dt regions by changing the existing control@d800100 node to end before the OTP registers, with some bigger dt refactoring left for a future series. Changes since v3: - Relicense the dt-binding documentation under GPLv2-only or BSD-2-clauses. Emmanuel Gil Peyrot (5): nvmem: nintendo-otp: Add new driver for the Wii and Wii U OTP dt-bindings: nintendo-otp: Document the Wii and Wii U OTP support Applied 1/5 and 2/5 to nvmem next, rest of the patches should go via powerpc dts tree. 
thanks, --srini powerpc: wii.dts: Reduce the size of the control area powerpc: wii.dts: Expose the OTP on this platform powerpc: wii_defconfig: Enable OTP by default .../bindings/nvmem/nintendo-otp.yaml | 44 +++ arch/powerpc/boot/dts/wii.dts | 13 +- arch/powerpc/configs/wii_defconfig| 1 + drivers/nvmem/Kconfig | 11 ++ drivers/nvmem/Makefile| 2 + drivers/nvmem/nintendo-otp.c | 124 ++ 6 files changed, 194 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/nvmem/nintendo-otp.yaml create mode 100644 drivers/nvmem/nintendo-otp.c
Re: [PATCH v2 1/2] sched/topology: Skip updating masks for non-online nodes
* Valentin Schneider [2021-08-09 13:52:38]: > On 09/08/21 12:22, Srikar Dronamraju wrote: > > * Valentin Schneider [2021-08-08 16:56:47]: > >> Wait, doesn't the distance matrix (without any offline node) say > >> > >> distance(0, 3) == 40 > >> > >> ? We should have at the very least: > >> > >> node 0 1 2 3 > >> 0: 10 20 ?? 40 > >> 1: 20 20 ?? 40 > >> 2: ?? ?? ?? ?? > >> 3: 40 40 ?? 10 > >> > > > > Before onlining node 3 and CPU 3 (node/CPU 0 and 1 are already online) > > Note: Node 2-7 and CPU 2-7 are still offline. > > > > node 0 1 2 3 > > 0: 10 20 40 10 > > 1: 20 20 40 10 > > 2: 40 40 10 10 > > 3: 10 10 10 10 > > > > NODE->mask(0) == 0 > > NODE->mask(1) == 1 > > NODE->mask(2) == 0 > > NODE->mask(3) == 0 > > > > Note: This is with updating Node 2's distance as 40 for figuring out > > the number of numa levels. Since we have all possible distances, we > > dont update Node 3 distance, so it will be as if its local to node 0. > > > > Now when Node 3 and CPU 3 are onlined > > Note: Node 2, 3-7 and CPU 2, 3-7 are still offline. > > > > node 0 1 2 3 > > 0: 10 20 40 40 > > 1: 20 20 40 40 > > 2: 40 40 10 40 > > 3: 40 40 40 10 > > > > NODE->mask(0) == 0 > > NODE->mask(1) == 1 > > NODE->mask(2) == 0 > > NODE->mask(3) == 0,3 > > > > CPU 0 continues to be part of Node->mask(3) because when we online and > > we find the right distance, there is no API to reset the numa mask of > > 3 to remove CPU 0 from the numa masks. > > > > If we had an API to clear/set sched_domains_numa_masks[node][] when > > the node state changes, we could probably plug-in to clear/set the > > node masks whenever node state changes. > > > > Gotcha, this is now coming back to me... > > [...] > > >> Ok, so it looks like we really can't do without that part - even if we get > >> "sensible" distance values for the online nodes, we can't divine values for > >> the offline ones. > >> > > > > Yes > > > > Argh, while your approach does take care of the masks, it leaves > sched_numa_topology_type unchanged. 
You *can* force an update of it, but > yuck :( > > I got to the below... > Yes, I completely missed that we should update sched_numa_topology_type. > --- > From: Srikar Dronamraju > Date: Thu, 1 Jul 2021 09:45:51 +0530 > Subject: [PATCH 1/1] sched/topology: Skip updating masks for non-online nodes > > The scheduler currently expects NUMA node distances to be stable from init > onwards, and as a consequence builds the related data structures > once-and-for-all at init (see sched_init_numa()). > > Unfortunately, on some architectures node distance is unreliable for > offline nodes and may very well change upon onlining. > > Skip over offline nodes during sched_init_numa(). Track nodes that have > been onlined at least once, and trigger a build of a node's NUMA masks when > it is first onlined post-init. > Your version is much much better than mine. And I have verified that it works as expected. > Reported-by: Geetika Moolchandani > Signed-off-by: Srikar Dronamraju > Signed-off-by: Valentin Schneider > --- > kernel/sched/topology.c | 65 + > 1 file changed, 65 insertions(+) > > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > index b77ad49dc14f..cba95793a9b7 100644 > --- a/kernel/sched/topology.c > +++ b/kernel/sched/topology.c > @@ -1482,6 +1482,8 @@ int sched_max_numa_distance; > static int *sched_domains_numa_distance; > static struct cpumask***sched_domains_numa_masks; > int __read_mostlynode_reclaim_distance = RECLAIM_DISTANCE; > + > +static unsigned long __read_mostly *sched_numa_onlined_nodes; > #endif > > /* > @@ -1833,6 +1835,16 @@ void sched_init_numa(void) > sched_domains_numa_masks[i][j] = mask; > > for_each_node(k) { > + /* > + * Distance information can be unreliable for > + * offline nodes, defer building the node > + * masks to its bringup. > + * This relies on all unique distance values > + * still being visible at init time. 
> + */ > + if (!node_online(j)) > + continue; > + > if (sched_debug() && (node_distance(j, k) != > node_distance(k, j))) > sched_numa_warn("Node-distance not > symmetric"); > > @@ -1886,6 +1898,53 @@ void sched_init_numa(void) > sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; > > init_numa_topology_type(); > + > + sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL); > + if (!sched_numa_onlined_nodes) > +
Re: [PATCH v2] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs
* Cédric Le Goater [2021-08-07 09:20:57]: > On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at > runtime. Today, the IPI is not created for such nodes, and hot-plugged > CPUs use a bogus IPI, which leads to soft lockups. > > We can not directly allocate and request the IPI on demand because > bringup_up() is called under the IRQ sparse lock. The alternative is > to allocate the IPIs for all possible nodes at startup and to request > the mapping on demand when the first CPU of a node is brought up. > Thank you, this version too works for me. Tested-by: Srikar Dronamraju > Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node") > Cc: sta...@vger.kernel.org # v5.13 > Reported-by: Geetika Moolchandani > Cc: Srikar Dronamraju > Cc: Laurent Vivier > Signed-off-by: Cédric Le Goater > Message-Id: <20210629131542.743888-1-...@kaod.org> > Signed-off-by: Cédric Le Goater > --- > arch/powerpc/sysdev/xive/common.c | 35 +-- > 1 file changed, 24 insertions(+), 11 deletions(-) > > diff --git a/arch/powerpc/sysdev/xive/common.c > b/arch/powerpc/sysdev/xive/common.c > index dbdbbc2f1dc5..943fd30095af 100644 > --- a/arch/powerpc/sysdev/xive/common.c > +++ b/arch/powerpc/sysdev/xive/common.c > @@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domain; > static struct xive_ipi_desc { > unsigned int irq; > char name[16]; > + atomic_t started; > } *xive_ipis; > > /* > @@ -1120,7 +1121,7 @@ static const struct irq_domain_ops > xive_ipi_irq_domain_ops = { > .alloc = xive_ipi_irq_domain_alloc, > }; > > -static int __init xive_request_ipi(void) > +static int __init xive_init_ipis(void) > { > struct fwnode_handle *fwnode; > struct irq_domain *ipi_domain; > @@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void) > struct xive_ipi_desc *xid = _ipis[node]; > struct xive_ipi_alloc_info info = { node }; > > - /* Skip nodes without CPUs */ > - if (cpumask_empty(cpumask_of_node(node))) > - continue; > - > /* >* Map one IPI interrupt per node for all cpus of 
that node. >* Since the HW interrupt number doesn't have any meaning, > @@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void) > xid->irq = ret; > > snprintf(xid->name, sizeof(xid->name), "IPI-%d", node); > - > - ret = request_irq(xid->irq, xive_muxed_ipi_action, > - IRQF_PERCPU | IRQF_NO_THREAD, xid->name, > NULL); > - > - WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); > } > > return ret; > @@ -1178,6 +1170,22 @@ static int __init xive_request_ipi(void) > return ret; > } > > +static int __init xive_request_ipi(unsigned int cpu) > +{ > + struct xive_ipi_desc *xid = _ipis[early_cpu_to_node(cpu)]; > + int ret; > + > + if (atomic_inc_return(>started) > 1) > + return 0; > + > + ret = request_irq(xid->irq, xive_muxed_ipi_action, > + IRQF_PERCPU | IRQF_NO_THREAD, > + xid->name, NULL); > + > + WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); > + return ret; > +} > + > static int xive_setup_cpu_ipi(unsigned int cpu) > { > unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu); > @@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned int cpu) > if (xc->hw_ipi != XIVE_BAD_IRQ) > return 0; > > + /* Register the IPI */ > + xive_request_ipi(cpu); > + > /* Grab an IPI from the backend, this will populate xc->hw_ipi */ > if (xive_ops->get_ipi(cpu, xc)) > return -EIO; > @@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigned int cpu, > struct xive_cpu *xc) > if (xc->hw_ipi == XIVE_BAD_IRQ) > return; > > + /* TODO: clear IPI mapping */ > + > /* Mask the IPI */ > xive_do_source_set_mask(>ipi_data, true); > > @@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void) > smp_ops->cause_ipi = xive_cause_ipi; > > /* Register the IPI */ > - xive_request_ipi(); > + xive_init_ipis(); > > /* Allocate and setup IPI for the boot CPU */ > xive_setup_cpu_ipi(smp_processor_id()); > -- > 2.31.1 > -- Thanks and Regards Srikar Dronamraju
Re: [PATCH v2] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs
On 8/7/21 9:20 AM, Cédric Le Goater wrote: > On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at > runtime. Today, the IPI is not created for such nodes, and hot-plugged > CPUs use a bogus IPI, which leads to soft lockups. > > We can not directly allocate and request the IPI on demand because > bringup_up() is called under the IRQ sparse lock. The alternative is > to allocate the IPIs for all possible nodes at startup and to request > the mapping on demand when the first CPU of a node is brought up. > > Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node") > Cc: sta...@vger.kernel.org # v5.13 > Reported-by: Geetika Moolchandani > Cc: Srikar Dronamraju > Cc: Laurent Vivier > Signed-off-by: Cédric Le Goater > Message-Id: <20210629131542.743888-1-...@kaod.org> > Signed-off-by: Cédric Le Goater > --- > arch/powerpc/sysdev/xive/common.c | 35 +-- > 1 file changed, 24 insertions(+), 11 deletions(-) I forgot to add that this version does not break irqbalance anymore, since Linux is not mapping interrupts of CPU-less nodes. Anyhow, irqbalance is now fixed : https://github.com/Irqbalance/irqbalance/commit/a7f81483a95a94d6a62ca7fb999a090e01c0de9b So v1 (plus irqbalance patch above) or v2 are safe to use. I do prefer v2. Thanks to Laurent and Srikar for the extra tests, C. 
> diff --git a/arch/powerpc/sysdev/xive/common.c > b/arch/powerpc/sysdev/xive/common.c > index dbdbbc2f1dc5..943fd30095af 100644 > --- a/arch/powerpc/sysdev/xive/common.c > +++ b/arch/powerpc/sysdev/xive/common.c > @@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domain; > static struct xive_ipi_desc { > unsigned int irq; > char name[16]; > + atomic_t started; > } *xive_ipis; > > /* > @@ -1120,7 +1121,7 @@ static const struct irq_domain_ops > xive_ipi_irq_domain_ops = { > .alloc = xive_ipi_irq_domain_alloc, > }; > > -static int __init xive_request_ipi(void) > +static int __init xive_init_ipis(void) > { > struct fwnode_handle *fwnode; > struct irq_domain *ipi_domain; > @@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void) > struct xive_ipi_desc *xid = _ipis[node]; > struct xive_ipi_alloc_info info = { node }; > > - /* Skip nodes without CPUs */ > - if (cpumask_empty(cpumask_of_node(node))) > - continue; > - > /* >* Map one IPI interrupt per node for all cpus of that node. >* Since the HW interrupt number doesn't have any meaning, > @@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void) > xid->irq = ret; > > snprintf(xid->name, sizeof(xid->name), "IPI-%d", node); > - > - ret = request_irq(xid->irq, xive_muxed_ipi_action, > - IRQF_PERCPU | IRQF_NO_THREAD, xid->name, > NULL); > - > - WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); > } > > return ret; > @@ -1178,6 +1170,22 @@ static int __init xive_request_ipi(void) > return ret; > } > > +static int __init xive_request_ipi(unsigned int cpu) > +{ > + struct xive_ipi_desc *xid = _ipis[early_cpu_to_node(cpu)]; > + int ret; > + > + if (atomic_inc_return(>started) > 1) > + return 0; > + > + ret = request_irq(xid->irq, xive_muxed_ipi_action, > + IRQF_PERCPU | IRQF_NO_THREAD, > + xid->name, NULL); > + > + WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); > + return ret; > +} > + > static int xive_setup_cpu_ipi(unsigned int cpu) > { > unsigned int xive_ipi_irq = 
xive_ipi_cpu_to_irq(cpu); > @@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned int cpu) > if (xc->hw_ipi != XIVE_BAD_IRQ) > return 0; > > + /* Register the IPI */ > + xive_request_ipi(cpu); > + > /* Grab an IPI from the backend, this will populate xc->hw_ipi */ > if (xive_ops->get_ipi(cpu, xc)) > return -EIO; > @@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigned int cpu, > struct xive_cpu *xc) > if (xc->hw_ipi == XIVE_BAD_IRQ) > return; > > + /* TODO: clear IPI mapping */ > + > /* Mask the IPI */ > xive_do_source_set_mask(>ipi_data, true); > > @@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void) > smp_ops->cause_ipi = xive_cause_ipi; > > /* Register the IPI */ > - xive_request_ipi(); > + xive_init_ipis(); > > /* Allocate and setup IPI for the boot CPU */ > xive_setup_cpu_ipi(smp_processor_id()); >
[PATCH v4 5/6] sched/fair: Carve out logic to mark a group for asymmetric packing
Create a separate function, sched_asym(). A subsequent changeset will introduce logic to deal with SMT in conjunction with asymmetric packing. Such logic will need the statistics of the scheduling group provided as argument. Update them before calling sched_asym(). Cc: Aubrey Li Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Mel Gorman Cc: Quentin Perret Cc: Rafael J. Wysocki Cc: Srinivas Pandruvada Cc: Steven Rostedt Cc: Tim Chen Reviewed-by: Joel Fernandes (Google) Reviewed-by: Len Brown Co-developed-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ricardo Neri --- Changes since v3: * Remove a redundant check for the local group in sched_asym(). (Dietmar) * Reworded commit message for clarity. (Len) Changes since v2: * Introduced this patch. Changes since v1: * N/A --- kernel/sched/fair.c | 20 +--- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae3d2bc59d8d..dd411cefb63f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8531,6 +8531,13 @@ group_type group_classify(unsigned int imbalance_pct, return group_has_spare; } +static inline bool +sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. 
@@ -8591,18 +8598,17 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } + sgs->group_capacity = group->sgc->capacity; + + sgs->group_weight = group->group_weight; + /* Check if dst CPU is idle and preferred to this group */ if (!local_group && env->sd->flags & SD_ASYM_PACKING && - env->idle != CPU_NOT_IDLE && - sgs->sum_h_nr_running && - sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { + env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && + sched_asym(env, sds, sgs, group)) { sgs->group_asym_packing = 1; } - sgs->group_capacity = group->sgc->capacity; - - sgs->group_weight = group->group_weight; - sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); /* Computing avg_load makes sense only when group is overloaded */ -- 2.17.1
[PATCH v4 1/6] x86/sched: Decrease further the priorities of SMT siblings
When scheduling, it is better to prefer a separate physical core rather than the SMT sibling of a high priority core. The existing formula to compute priorities takes such fact in consideration. There may exist, however, combinations of priorities (i.e., maximum frequencies) in which the priority of high-numbered SMT siblings of high-priority cores collides with the priority of low-numbered SMT siblings of low-priority cores. Consider for instance an SMT2 system with CPUs [0, 1] with priority 60 and [2, 3] with priority 30 (CPUs in brackets are SMT siblings). In such a case, the resulting priorities would be [120, 60], [60, 30]. Thus, to ensure that CPU2 has higher priority than CPU1, divide the raw priority by the squared SMT iterator. The resulting priorities are [120, 30], [60, 15]. Cc: Aubrey Li Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Joel Fernandes (Google) Cc: Mel Gorman Cc: Quentin Perret Cc: Rafael J. Wysocki Cc: Srinivas Pandruvada Cc: Steven Rostedt Cc: Tim Chen Originally-by: Len Brown Signed-off-by: Len Brown Signed-off-by: Ricardo Neri --- Changes since v3: * Introduced this patch Changes since v2: * N/A Changes since v1: * N/A --- arch/x86/kernel/itmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1afbdd1dd777..9ff480e94511 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -198,7 +198,7 @@ void sched_set_itmt_core_prio(int prio, int core_cpu) * of the priority chain and only used when * all other high priority cpus are out of capacity. */ - smt_prio = prio * smp_num_siblings / i; + smt_prio = prio * smp_num_siblings / (i * i); per_cpu(sched_core_priority, cpu) = smt_prio; i++; } -- 2.17.1
[PATCH v4 0/6] sched/fair: Fix load balancing of SMT siblings with ASYM_PACKING
This is v4 of the series. v1, v2, and v3 patches and test results can be found in [1], [2], and [3], respectively. === Problem statement === ++ Load balancing ++ When using asymmetric packing, there exist CPU topologies with three priority levels in which only a subset of the physical cores support SMT. An instance of such topology is Intel Alderlake, a hybrid processor with a mixture of Intel Core (with support for SMT) and Intel Atom CPUs. On Alderlake, it is almost always beneficial to spread work by picking first the Core CPUs, then the Atoms and at last the SMT siblings. The current load balancer, however, does not behave as described when using ASYM_PACKING. Instead, the load balancer will choose higher-priority CPUs (an Intel Core) over medium-priority CPUs (an Intel Atom), and subsequently overflow the load to a low priority SMT sibling CPU. This leaves medium- priority CPUs idle while low-priority CPUs are busy. This patchset fixes this behavior by also checking the idle state of the SMT siblings of both the CPU doing the load balance and the busiest candidate group when deciding whether the destination CPUs can pull tasks from the busiest CPU. ++ Rework ASYM_PACKING priorities with ITMT We also reworked the priority computation of the SMT siblings to ensure that higher-numbered SMT siblings are always low priority. The current computation may lead to situations in which in some processors those higher-numbered SMT siblings have the same priority as the Intel Atom CPUs. === Testing === I ran a few benchmarks with and without this version of the patchset on an Intel Alderlake system with 8 Intel Core (with SMT) and 8 Intel Atom CPUs. The baseline for the results is an unmodified v5.13-rc4 kernel. Results show a comparative percentage of improvement (positive) or degradation (negative). Each test case is repeated eight times, and the standard deviation among repetitions is also documented. 
Table 1 shows the results when using hardware-controlled performance states (HWP), a common use case. The impact of the patches is overall positive with a few test cases showing slight degradation. hackbench is especially difficult to assess as it shows a high degree of variability. Thanks and BR, Ricardo ITMT: Intel Turbo Boost Max Technology 3.0 Changes since v3: * Reworked the ITMT priority computation to further reduce the priority of SMT siblings (patch 1). * Clear scheduling group flags when a child scheduling level degenerates (patch 2). * Removed arch-specific hooks (patch 6, PeterZ) * Removed redundant checks for the local group. (patch 5, Dietmar) * Removed redundant check for local idle CPUs and local group utilization. (patch 6, Joel) * Reworded commit messages of patches 2, 3, 5, and 6 for clarity. (Len, PeterZ) * Added Joel's Reviewed-by tag. * Unchanged patches: 4 Changes since v2: * Removed arch_sched_asym_prefer_early() and re-evaluation of the candidate busiest group in update_sd_pick_busiest(). (PeterZ) * Introduced sched_group::flags to reflect the properties of CPUs in the scheduling group. This helps to identify scheduling groups whose CPUs are SMT siblings. (PeterZ) * Modified update_sg_lb_stats() to get statistics of the scheduling domain as an argument. This provides it with the statistics of the local domain. (PeterZ) * Introduced sched_asym() to decide if a busiest candidate group can be marked for asymmetric packing. * Reworded patch 1 for clarity. (PeterZ) Changes since v1: * Don't bailout in update_sd_pick_busiest() if dst_cpu cannot pull tasks. Instead, reclassify the candidate busiest group, as it may still be selected. (PeterZ) * Avoid an expensive and unnecessary call to cpumask_weight() when determining if a sched_group is comprised of SMT siblings. (PeterZ). * Updated test results using the v2 patches. Table 1. 
Test results of patches with HWP === hackbench = caseloadbaseline(std%) compare%( std%) process-pipegroup-1 1.00 ( 1.45) -2.99 ( 4.35) process-pipegroup-2 1.00 ( 18.32) +2.14 ( 8.33) process-pipegroup-4 1.00 ( 17.27) -10.68 ( 15.85) process-pipegroup-8 1.00 ( 12.33) +2.26 ( 13.28) process-pipegroup-12 1.00 ( 6.52) -4.07 ( 7.97) process-pipegroup-16 1.00 ( 9.70) -7.71 ( 6.01) process-pipegroup-20 1.00 ( 2.52) -4.15 ( 6.35) process-pipegroup-24 1.00 ( 4.84) +1.04 ( 4.60) process-pipegroup-32 1.00 ( 4.79) +1.72 ( 5.13) process-pipegroup-48 1.00 ( 6.77) +4.68 ( 4.24) process-sockets group-1 1.00 ( 1.89) +0.53 ( 3.48) process-sockets group-2 1.00 (
[PATCH v4 2/6] sched/topology: Introduce sched_group::flags
There exist situations in which the load balance needs to know the properties of the CPUs in a scheduling group. When using asymmetric packing, for instance, the load balancer needs to know not only the state of dst_cpu but also of its SMT siblings, if any. Use the flags of the child scheduling domains to initialize scheduling group flags. This will reflect the properties of the CPUs in the group. A subsequent changeset will make use of these new flags. No functional changes are introduced. Cc: Aubrey Li Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Mel Gorman Cc: Quentin Perret Cc: Rafael J. Wysocki Cc: Srinivas Pandruvada Cc: Steven Rostedt Cc: Tim Chen Reviewed-by: Joel Fernandes (Google) Reviewed-by: Len Brown Originally-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ricardo Neri --- Changes since v3: * Clear the flags of the scheduling groups of a domain if its child is destroyed. * Minor rewording of the commit message. Changes since v2: * Introduced this patch. Changes since v1: * N/A --- kernel/sched/sched.h| 1 + kernel/sched/topology.c | 21 ++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a2c6f6ae6392..fb716e78974d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1799,6 +1799,7 @@ struct sched_group { unsigned intgroup_weight; struct sched_group_capacity *sgc; int asym_prefer_cpu;/* CPU of highest priority in group */ + int flags; /* * The CPUs this group covers. diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b77ad49dc14f..27c959de175d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -716,8 +716,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp = sd; sd = sd->parent; destroy_sched_domain(tmp); - if (sd) + if (sd) { + struct sched_group *sg = sd->groups; + + /* +* sched groups hold the flags of the child sched +* domain for convenience. 
Clear such flags since +* the child is being destroyed. +*/ + do { + sg->flags = 0; + } while (sg != sd->groups); + sd->child = NULL; + } } for (tmp = sd; tmp; tmp = tmp->parent) @@ -916,10 +928,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) return NULL; sg_span = sched_group_span(sg); - if (sd->child) + if (sd->child) { cpumask_copy(sg_span, sched_domain_span(sd->child)); - else + sg->flags = sd->child->flags; + } else { cpumask_copy(sg_span, sched_domain_span(sd)); + } atomic_inc(>ref); return sg; @@ -1169,6 +1183,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + sg->flags = child->flags; } else { cpumask_set_cpu(cpu, sched_group_span(sg)); cpumask_set_cpu(cpu, group_balance_mask(sg)); -- 2.17.1
[PATCH v4 4/6] sched/fair: Provide update_sg_lb_stats() with sched domain statistics
Before deciding to pull tasks when using asymmetric packing of tasks, on some architectures (e.g., x86) it is necessary to know not only the state of dst_cpu but also of its SMT siblings. The decision to classify a candidate busiest group as group_asym_packing is done in update_sg_lb_stats(). Give this function access to the scheduling domain statistics, which contains the statistics of the local group. Cc: Aubrey Li Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Mel Gorman Cc: Quentin Perret Cc: Rafael J. Wysocki Cc: Srinivas Pandruvada Cc: Steven Rostedt Cc: Tim Chen Reviewed-by: Joel Fernandes (Google) Reviewed-by: Len Brown Originally-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ricardo Neri --- Changes since v3: * None Changes since v2: * Introduced this patch. Changes since v1: * N/A --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b686e18a39b..ae3d2bc59d8d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8539,6 +8539,7 @@ group_type group_classify(unsigned int imbalance_pct, * @sg_status: Holds flag indicating the status of the sched_group */ static inline void update_sg_lb_stats(struct lb_env *env, + struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, int *sg_status) @@ -8547,7 +8548,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + local_group = group == sds->local; for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -9110,7 +9111,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, sgs, _status); + update_sg_lb_stats(env, sds, sg, sgs, _status); if (local_group) goto next_group; -- 2.17.1
[PATCH v4 3/6] sched/fair: Optimize checking for group_asym_packing
sched_asym_prefer() always returns false when called on the local group. By checking local_group, we can avoid additional checks and invoking sched_asym_prefer() when it is not needed. No functional changes are introduced. Cc: Aubrey Li Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Mel Gorman Cc: Quentin Perret Cc: Rafael J. Wysocki Cc: Srinivas Pandruvada Cc: Steven Rostedt Cc: Tim Chen Reviewed-by: Joel Fernandes (Google) Reviewed-by: Len Brown Signed-off-by: Ricardo Neri --- Changes since v3: * Further rewording of the commit message. (Len) Changes since v2: * Reworded the commit message for clarity. (Peter Z) Changes since v1: * None --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 44e44c235f1f..3b686e18a39b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8591,7 +8591,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, } /* Check if dst CPU is idle and preferred to this group */ - if (env->sd->flags & SD_ASYM_PACKING && + if (!local_group && env->sd->flags & SD_ASYM_PACKING && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { -- 2.17.1
[PATCH v4 6/6] sched/fair: Consider SMT in ASYM_PACKING load balance
When deciding to pull tasks in ASYM_PACKING, it is necessary not only to check for the idle state of the destination CPU, dst_cpu, but also of its SMT siblings. If dst_cpu is idle but its SMT siblings are busy, performance suffers if it pulls tasks from a medium priority CPU that does not have SMT siblings. Implement asym_smt_can_pull_tasks() to inspect the state of the SMT siblings of both dst_cpu and the CPUs in the candidate busiest group. Cc: Aubrey Li Cc: Ben Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Mel Gorman Cc: Quentin Perret Cc: Rafael J. Wysocki Cc: Srinivas Pandruvada Cc: Steven Rostedt Cc: Tim Chen Reviewed-by: Joel Fernandes (Google) Reviewed-by: Len Brown Signed-off-by: Ricardo Neri --- Changes since v3: * Removed the arch_asym_check_smt_siblings() hook. Discussions with the powerpc folks showed that this patch should not impact them. Also, more recent powerpc processor no longer use asym_packing. (PeterZ) * Removed unnecessary local variable in asym_can_pull_tasks(). (Dietmar) * Removed unnecessary check for local CPUs when the local group has zero utilization. (Joel) * Renamed asym_can_pull_tasks() as asym_smt_can_pull_tasks() to reflect the fact that it deals with SMT cases. * Made asym_smt_can_pull_tasks() return false for !CONFIG_SCHED_SMT so that callers can deal with non-SMT cases. Changes since v2: * Reworded the commit message to reflect updates in code. * Corrected misrepresentation of dst_cpu as the CPU doing the load balancing. (PeterZ) * Removed call to arch_asym_check_smt_siblings() as it is now called in sched_asym(). Changes since v1: * Don't bailout in update_sd_pick_busiest() if dst_cpu cannot pull tasks. Instead, reclassify the candidate busiest group, as it may still be selected. (PeterZ) * Avoid an expensive and unnecessary call to cpumask_weight() when determining if a sched_group is comprised of SMT siblings. (PeterZ). 
--- kernel/sched/fair.c | 95 + 1 file changed, 95 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd411cefb63f..8a1a2a43732c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8531,10 +8531,99 @@ group_type group_classify(unsigned int imbalance_pct, return group_has_spare; } +/** + * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks + * @dst_cpu: Destination CPU of the load balancing + * @sds: Load-balancing data with statistics of the local group + * @sgs: Load-balancing statistics of the candidate busiest group + * @sg:The candidate busiet group + * + * Check the state of the SMT siblings of both @sds::local and @sg and decide + * if @dst_cpu can pull tasks. If @dst_cpu does not have SMT siblings, it can + * pull tasks if two or more of the SMT siblings of @sg are busy. If only one + * CPU in @sg is busy, pull tasks only if @dst_cpu has higher priority. + * + * If both @dst_cpu and @sg have SMT siblings, even the number of idle CPUs + * between @sds::local and @sg. Thus, pull tasks from @sg if the difference + * between the number of busy CPUs is 2 or more. If the difference is of 1, + * only pull if @dst_cpu has higher priority. If @sg does not have SMT siblings + * only pull tasks if all of the SMT siblings of @dst_cpu are idle and @sg + * has lower priority. + */ +static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + struct sg_lb_stats *sgs, + struct sched_group *sg) +{ +#ifdef CONFIG_SCHED_SMT + bool local_is_smt, sg_is_smt; + int sg_busy_cpus; + + local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; + sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; + + sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; + + if (!local_is_smt) { + /* +* If we are here, @dst_cpu is idle and does not have SMT +* siblings. Pull tasks if candidate group has two or more +* busy CPUs. +*/ + if (sg_is_smt && sg_busy_cpus >= 2) + return true; + + /* +* @dst_cpu does not have SMT siblings. 
@sg may have SMT +* siblings and only one is busy. In such case, @dst_cpu +* can help if it has higher priority and is idle. +*/ + return !sds->local_stat.group_util && + sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + } + + /* @dst_cpu has SMT siblings. */ + + if (sg_is_smt) { + int local_busy_cpus = sds->local->group_weight - + sds->local_stat.idle_cpus; + int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; + + /* Local can always help to even the number busy CPUs. */ +
Re: [PATCH v2] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs
> I forgot to add that this version does break irqbalance anymore, since ... does not break ... Sorry for the noise, > Linux is not mapping interrupts of CPU-less nodes. > > Anyhow, irqbalance is now fixed : > > > https://github.com/Irqbalance/irqbalance/commit/a7f81483a95a94d6a62ca7fb999a090e01c0de9b > > So v1 (plus irqbalance patch above) or v2 are safe to use. I do prefer > v2. > > Thanks to Laurent and Srikar for the extra tests, > > C.
[powerpc:next-test 60/60] arch/powerpc/kernel/misc_32.S:240: Error: unrecognized opcode: `emit_warn_entry'
tree: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test head: 5e3aa4531ecc7febbfa18218145c903dab17e651 commit: 5e3aa4531ecc7febbfa18218145c903dab17e651 [60/60] powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto config: powerpc64-buildonly-randconfig-r001-20210810 (attached as .config) compiler: powerpc-linux-gcc (GCC) 10.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/commit/?id=5e3aa4531ecc7febbfa18218145c903dab17e651 git remote add powerpc https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git git fetch --no-tags powerpc next-test git checkout 5e3aa4531ecc7febbfa18218145c903dab17e651 # save the attached .config to linux build tree mkdir build_dir COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-10.3.0 make.cross O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/ If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>): arch/powerpc/kernel/misc_32.S: Assembler messages: >> arch/powerpc/kernel/misc_32.S:240: Error: unrecognized opcode: >> `emit_warn_entry' vim +240 arch/powerpc/kernel/misc_32.S 218 219 /* 220 * Copy a whole page. We use the dcbz instruction on the destination 221 * to reduce memory traffic (it eliminates the unnecessary reads of 222 * the destination into cache). This requires that the destination 223 * is cacheable. 
224 */ 225 #define COPY_16_BYTES \ 226 lwz r6,4(r4); \ 227 lwz r7,8(r4); \ 228 lwz r8,12(r4); \ 229 lwzur9,16(r4); \ 230 stw r6,4(r3); \ 231 stw r7,8(r3); \ 232 stw r8,12(r3); \ 233 stwur9,16(r3) 234 235 _GLOBAL(copy_page) 236 rlwinm r5, r3, 0, L1_CACHE_BYTES - 1 237 addir3,r3,-4 238 239 0: twnei r5, 0 /* WARN if r3 is not cache aligned */ > 240 EMIT_WARN_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING 241 242 addir4,r4,-4 243 244 li r5,4 245 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org .config.gz Description: application/gzip
Re: [PATCH 07/11] treewide: Replace the use of mem_encrypt_active() with prot_guest_has()
On 8/10/21 1:45 PM, Kuppuswamy, Sathyanarayanan wrote: > > > On 7/27/21 3:26 PM, Tom Lendacky wrote: >> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c >> index de01903c3735..cafed6456d45 100644 >> --- a/arch/x86/kernel/head64.c >> +++ b/arch/x86/kernel/head64.c >> @@ -19,7 +19,7 @@ >> #include >> #include >> #include >> -#include >> +#include >> #include >> #include >> @@ -285,7 +285,7 @@ unsigned long __head __startup_64(unsigned long >> physaddr, >> * there is no need to zero it after changing the memory encryption >> * attribute. >> */ >> - if (mem_encrypt_active()) { >> + if (prot_guest_has(PATTR_MEM_ENCRYPT)) { >> vaddr = (unsigned long)__start_bss_decrypted; >> vaddr_end = (unsigned long)__end_bss_decrypted; > > > Since this change is specific to AMD, can you replace PATTR_MEM_ENCRYPT with > prot_guest_has(PATTR_SME) || prot_guest_has(PATTR_SEV). It is not used in > TDX. This is a direct replacement for now. I think the change you're requesting should be done as part of the TDX support patches so it's clear why it is being changed. But, wouldn't TDX still need to do something with this shared/unencrypted area, though? Or since it is shared, there's actually nothing you need to do (the bss decrpyted section exists even if CONFIG_AMD_MEM_ENCRYPT is not configured)? Thanks, Tom >
Re: [PATCH 07/11] treewide: Replace the use of mem_encrypt_active() with prot_guest_has()
On 7/27/21 3:26 PM, Tom Lendacky wrote: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index de01903c3735..cafed6456d45 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -285,7 +285,7 @@ unsigned long __head __startup_64(unsigned long physaddr, * there is no need to zero it after changing the memory encryption * attribute. */ - if (mem_encrypt_active()) { + if (prot_guest_has(PATTR_MEM_ENCRYPT)) { vaddr = (unsigned long)__start_bss_decrypted; vaddr_end = (unsigned long)__end_bss_decrypted; Since this change is specific to AMD, can you replace PATTR_MEM_ENCRYPT with prot_guest_has(PATTR_SME) || prot_guest_has(PATTR_SEV). It is not used in TDX. -- Sathyanarayanan Kuppuswamy Linux Kernel Developer
Re: [PATCH 07/11] treewide: Replace the use of mem_encrypt_active() with prot_guest_has()
On 8/10/21 12:48 PM, Tom Lendacky wrote: On 8/10/21 1:45 PM, Kuppuswamy, Sathyanarayanan wrote: On 7/27/21 3:26 PM, Tom Lendacky wrote: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index de01903c3735..cafed6456d45 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -285,7 +285,7 @@ unsigned long __head __startup_64(unsigned long physaddr, * there is no need to zero it after changing the memory encryption * attribute. */ - if (mem_encrypt_active()) { + if (prot_guest_has(PATTR_MEM_ENCRYPT)) { vaddr = (unsigned long)__start_bss_decrypted; vaddr_end = (unsigned long)__end_bss_decrypted; Since this change is specific to AMD, can you replace PATTR_MEM_ENCRYPT with prot_guest_has(PATTR_SME) || prot_guest_has(PATTR_SEV). It is not used in TDX. This is a direct replacement for now. I think the change you're requesting should be done as part of the TDX support patches so it's clear why it is being changed. Ok. I will include it part of TDX changes. But, wouldn't TDX still need to do something with this shared/unencrypted area, though? Or since it is shared, there's actually nothing you need to do (the bss decrpyted section exists even if CONFIG_AMD_MEM_ENCRYPT is not configured)? Kirill had a requirement to turn on CONFIG_AMD_MEM_ENCRYPT for adding lazy accept support in TDX guest kernel. Kirill, can you add details here? Thanks, Tom -- Sathyanarayanan Kuppuswamy Linux Kernel Developer
[PATCH] powerpc/interrupt: Fix OOPS by not calling do_IRQ() from timer_interrupt()
An interrupt handler shall not be called from another interrupt handler otherwise this leads to problems like the following: Kernel attempted to write user page (afd4fa84) - exploit attempt? (uid: 1000) [ cut here ] Bug: Write fault blocked by KUAP! WARNING: CPU: 0 PID: 1617 at arch/powerpc/mm/fault.c:230 do_page_fault+0x484/0x720 Modules linked in: CPU: 0 PID: 1617 Comm: sshd Tainted: GW 5.13.0-pmac-00010-g8393422eb77 #7 NIP: c001b77c LR: c001b77c CTR: REGS: cb9e5bc0 TRAP: 0700 Tainted: GW (5.13.0-pmac-00010-g8393422eb77) MSR: 00021032 CR: 24942424 XER: GPR00: c001b77c cb9e5c80 c1582c00 0021 3bff 085b 0027 c8eb644c GPR08: 0023 24942424 0063f8c8 000186a0 GPR16: afd52dd4 afd52dd0 afd52dcc afd52dc8 0065a990 c07640c4 cb9e5e98 cb9e5e90 GPR24: 0040 afd4fa96 0040 0200 c1fda6c0 afd4fa84 0300 cb9e5cc0 NIP [c001b77c] do_page_fault+0x484/0x720 LR [c001b77c] do_page_fault+0x484/0x720 Call Trace: [cb9e5c80] [c001b77c] do_page_fault+0x484/0x720 (unreliable) [cb9e5cb0] [c000424c] DataAccess_virt+0xd4/0xe4 --- interrupt: 300 at __copy_tofrom_user+0x110/0x20c NIP: c001f9b4 LR: c03250a0 CTR: 0004 REGS: cb9e5cc0 TRAP: 0300 Tainted: GW (5.13.0-pmac-00010-g8393422eb77) MSR: 9032 CR: 48028468 XER: 2000 DAR: afd4fa84 DSISR: 0a00 GPR00: 20726f6f cb9e5d80 c1582c00 0004 cb9e5e3a 0016 afd4fa80 GPR08: 3835202d 72777872 2d78722d 0004 28028464 0063f8c8 000186a0 GPR16: afd52dd4 afd52dd0 afd52dcc afd52dc8 0065a990 c07640c4 cb9e5e98 cb9e5e90 GPR24: 0040 afd4fa96 0040 cb9e5e0c 0daa a000 cb9e5e98 afd4fa56 NIP [c001f9b4] __copy_tofrom_user+0x110/0x20c LR [c03250a0] _copy_to_iter+0x144/0x990 --- interrupt: 300 [cb9e5d80] [c03e89c0] n_tty_read+0xa4/0x598 (unreliable) [cb9e5df0] [c03e2a0c] tty_read+0xdc/0x2b4 [cb9e5e80] [c0156bf8] vfs_read+0x274/0x340 [cb9e5f00] [c01571ac] ksys_read+0x70/0x118 [cb9e5f30] [c0016048] ret_from_syscall+0x0/0x28 --- interrupt: c00 at 0xa7855c88 NIP: a7855c88 LR: a7855c5c CTR: REGS: cb9e5f40 TRAP: 0c00 Tainted: GW (5.13.0-pmac-00010-g8393422eb77) MSR: d032 CR: 2402446c XER: 
GPR00: 0003 afd4ec70 a72137d0 000b afd4ecac 4000 0065a990 0800 GPR08: a7947930 0004 c15831b0 0063f8c8 000186a0 GPR16: afd52dd4 afd52dd0 afd52dcc afd52dc8 0065a990 0065a9e0 0001 0065fac0 GPR24: 0089 00664050 00668e30 a720c8dc a7943ff4 0065f9b0 NIP [a7855c88] 0xa7855c88 LR [a7855c5c] 0xa7855c5c --- interrupt: c00 Instruction dump: 3884aa88 38630178 48076861 807f0080 48042e45 2f83 419e0148 3c80c079 3c60c076 38841be4 386301c0 4801f705 <0fe0> 386b 4bfffe30 3c80c06b ---[ end trace fd69b91a8046c2e5 ]--- Here the problem is that by re-enterring an exception handler, kuap_save_and_lock() is called a second time with this time KUAP access locked, leading to regs->kuap being overwritten hence KUAP not being unlocked at exception exit as expected. Do not call do_IRQ() from timer_interrupt() directly. Instead, redefine do_IRQ() as a standard function named __do_IRQ(), and call it from both do_IRQ() and time_interrupt() handlers. Reported-by: Stan Johnson Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers") Cc: sta...@vger.kernel.org Cc: Nicholas Piggin Cc: Finn Thain Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/interrupt.h | 3 +++ arch/powerpc/include/asm/irq.h | 2 +- arch/powerpc/kernel/irq.c| 7 ++- arch/powerpc/kernel/time.c | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index fc4702bdd119..1e984a35a39f 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -590,6 +590,9 @@ DECLARE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode); DECLARE_INTERRUPT_HANDLER_ASYNC(TAUException); +/* irq.c */ +DECLARE_INTERRUPT_HANDLER_ASYNC(do_IRQ); + void __noreturn unrecoverable_exception(struct pt_regs *regs); void replay_system_reset(void); diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 4982f3711fc3..2b3278534bc1 100644 --- a/arch/powerpc/include/asm/irq.h +++ 
b/arch/powerpc/include/asm/irq.h @@ -52,7 +52,7 @@ extern void *mcheckirq_ctx[NR_CPUS]; extern void *hardirq_ctx[NR_CPUS]; extern void *softirq_ctx[NR_CPUS]; -extern void
[PATCH] powerpc/interrupt: Do not call single_step_exception() from other exceptions
single_step_exception() is called by emulate_single_step() which is called from (at least) alignment exception() handler and program_check_exception() handler. Redefine it as a regular __single_step_exception() which is called by both single_step_exception() handler and emulate_single_step() function. Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers") Cc: sta...@vger.kernel.org Cc: Stan Johnson Cc: Nicholas Piggin Cc: Finn Thain Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/traps.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dfbce527c98e..d56254f05e17 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1104,7 +1104,7 @@ DEFINE_INTERRUPT_HANDLER(RunModeException) _exception(SIGTRAP, regs, TRAP_UNK, 0); } -DEFINE_INTERRUPT_HANDLER(single_step_exception) +static void __single_step_exception(struct pt_regs *regs) { clear_single_step(regs); clear_br_trace(regs); @@ -1121,6 +1121,11 @@ DEFINE_INTERRUPT_HANDLER(single_step_exception) _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip); } +DEFINE_INTERRUPT_HANDLER(single_step_exception) +{ + __single_step_exception(regs); +} + /* * After we have successfully emulated an instruction, we have to * check if the instruction was being single-stepped, and if so, @@ -1130,7 +1135,7 @@ DEFINE_INTERRUPT_HANDLER(single_step_exception) static void emulate_single_step(struct pt_regs *regs) { if (single_stepping(regs)) - single_step_exception(regs); + __single_step_exception(regs); } static inline int __parse_fpscr(unsigned long fpscr) -- 2.25.0
Re: clang/ld.lld build fails with `can't create dynamic relocation R_PPC64_ADDR64 against local symbol in readonly segment`
Dear Linux folks, Am 29.07.21 um 10:23 schrieb Paul Menzel: I just wanted to make you aware that building Linux for ppc64le with clang/lld.ld fails with [1]: ld.lld: error: can't create dynamic relocation R_PPC64_ADDR64 against symbol: empty_zero_page in readonly segment; recompile object files with -fPIC or pass '-Wl,-z,notext' to allow text relocations in the output >>> defined in arch/powerpc/kernel/head_64.o >>> referenced by arch/powerpc/kernel/head_64.o:(___ksymtab+empty_zero_page+0x0) The patch below from one of the comments [2] fixes it. --- i/arch/powerpc/Makefile +++ w/arch/powerpc/Makefile @@ -122,7 +122,7 @@ cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard-reg=r2 endif LDFLAGS_vmlinux-y := -Bstatic -LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie +LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie -z notext LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) LDFLAGS_vmlinux += $(call ld-option,--orphan-handling=warn) Any comments, if this is the right fix? Current Linux master branch still fails to build with `LLVM=1` on Ubuntu 21.04 without this change. Kind regards, Paul [1]: https://github.com/ClangBuiltLinux/linux/issues/811 [2]: https://github.com/ClangBuiltLinux/linux/issues/811#issuecomment-568316320
[powerpc:next-test] BUILD REGRESSION 5e3aa4531ecc7febbfa18218145c903dab17e651
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test branch HEAD: 5e3aa4531ecc7febbfa18218145c903dab17e651 powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto Error/Warning reports: https://lore.kernel.org/linuxppc-dev/202108110231.pw76zfp3-...@intel.com Error/Warning in current branch: arch/powerpc/kernel/misc_32.S:240: Error: unrecognized opcode: `emit_warn_entry' Error/Warning ids grouped by kconfigs: gcc_recent_errors `-- powerpc64-buildonly-randconfig-r001-20210810 `-- arch-powerpc-kernel-misc_32.S:Error:unrecognized-opcode:emit_warn_entry elapsed time: 847m configs tested: 92 configs skipped: 3 gcc tested configs: arm defconfig arm64allyesconfig arm allyesconfig arm allmodconfig arm64 defconfig sh j2_defconfig mips rb532_defconfig shdreamcast_defconfig ia64 allmodconfig ia64defconfig ia64 allyesconfig x86_64allnoconfig m68k allmodconfig m68kdefconfig m68k allyesconfig nios2 defconfig arc allyesconfig nds32 allnoconfig nds32 defconfig nios2allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig arc defconfig sh allmodconfig parisc defconfig s390 allyesconfig s390 allmodconfig parisc allyesconfig s390defconfig i386 allyesconfig sparcallyesconfig sparc defconfig i386defconfig mips allyesconfig mips allmodconfig powerpc allyesconfig powerpc allmodconfig powerpc allnoconfig x86_64 randconfig-a004-20210810 x86_64 randconfig-a006-20210810 x86_64 randconfig-a003-20210810 x86_64 randconfig-a005-20210810 x86_64 randconfig-a002-20210810 x86_64 randconfig-a001-20210810 i386 randconfig-a004-20210810 i386 randconfig-a002-20210810 i386 randconfig-a001-20210810 i386 randconfig-a003-20210810 i386 randconfig-a006-20210810 i386 randconfig-a005-20210810 i386 randconfig-a012-20210809 i386 randconfig-a015-20210809 i386 randconfig-a011-20210809 i386 randconfig-a013-20210809 i386 randconfig-a014-20210809 i386 randconfig-a016-20210809 i386 randconfig-a011-20210810 i386 
randconfig-a015-20210810 i386 randconfig-a013-20210810 i386 randconfig-a014-20210810 i386 randconfig-a016-20210810 i386 randconfig-a012-20210810 riscvnommu_k210_defconfig riscvallyesconfig riscvnommu_virt_defconfig riscv allnoconfig riscv defconfig riscv rv32_defconfig riscvallmodconfig x86_64rhel-8.3-kselftests um x86_64_defconfig um i386_defconfig x86_64 allyesconfig x86_64 defconfig x86_64 rhel-8.3 x86_64 kexec clang tested configs: x86_64 randconfig-c001-20210810 x86_64 randconfig-a002-20210808 x86_64 randconfig-a004-20210808 x86_64 randconfig-a006-20210808 x86_64 randconfig-a003-20210808 x86_64 randconfig-a001-20210808 x86_64 randconfig-a005-20210808 x86_64 randconfig-a013-20210810 x86_64 randconfig-a011-20210810 x86_64 randconfig-a012-20210810 x86_64 randconfig-a016-20210810 x86_64 randconfig-a014-20210810 x86_64 randconfig-a015-20210810 --- 0-DAY CI Kernel Test Service
Re: [PATCH v2] powerpc/kprobes: Fix kprobe Oops happens in booke
Ping, serious problem here. All booke ppc will trigger Oops when perform kprobes related operations. On 2021/8/9 10:36, Pu Lehui wrote: When using kprobe on powerpc booke series processor, Oops happens as show bellow: / # echo "p:myprobe do_nanosleep" > /sys/kernel/debug/tracing/kprobe_events / # echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable / # sleep 1 [ 50.076730] Oops: Exception in kernel mode, sig: 5 [#1] [ 50.077017] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500 [ 50.077221] Modules linked in: [ 50.077462] CPU: 0 PID: 77 Comm: sleep Not tainted 5.14.0-rc4-00022-g251a1524293d #21 [ 50.077887] NIP: c0b9c4e0 LR: c00ebecc CTR: [ 50.078067] REGS: c3883de0 TRAP: 0700 Not tainted (5.14.0-rc4-00022-g251a1524293d) [ 50.078349] MSR: 00029000 CR: 24000228 XER: 2000 [ 50.078675] [ 50.078675] GPR00: c00ebdf0 c3883e90 c313e300 c3883ea0 0001 c3883ecc 0001 [ 50.078675] GPR08: c100598c c00ea250 0004 24000222 102490c2 bff4180c 101e60d4 [ 50.078675] GPR16: 102454ac 0040 1024 10241100 102410f8 1024 0050 [ 50.078675] GPR24: 0002 c3883ea0 0001 c350 3b9b8d50 [ 50.080151] NIP [c0b9c4e0] do_nanosleep+0x0/0x190 [ 50.080352] LR [c00ebecc] hrtimer_nanosleep+0x14c/0x1e0 [ 50.080638] Call Trace: [ 50.080801] [c3883e90] [c00ebdf0] hrtimer_nanosleep+0x70/0x1e0 (unreliable) [ 50.081110] [c3883f00] [c00ec004] sys_nanosleep_time32+0xa4/0x110 [ 50.081336] [c3883f40] [c001509c] ret_from_syscall+0x0/0x28 [ 50.081541] --- interrupt: c00 at 0x100a4d08 [ 50.081749] NIP: 100a4d08 LR: 101b5234 CTR: 0003 [ 50.081931] REGS: c3883f50 TRAP: 0c00 Not tainted (5.14.0-rc4-00022-g251a1524293d) [ 50.082183] MSR: 0002f902 CR: 24000222 XER: [ 50.082457] [ 50.082457] GPR00: 00a2 bf980040 1024b4d0 bf980084 bf980084 6400 00555345 fefefeff [ 50.082457] GPR08: 7f7f7f7f 101e 0069 0003 28000422 102490c2 bff4180c 101e60d4 [ 50.082457] GPR16: 102454ac 0040 1024 10241100 102410f8 1024 0050 [ 50.082457] GPR24: 0002 bf9803f4 1024 100039e0 102444e8 [ 50.083789] NIP [100a4d08] 0x100a4d08 [ 50.083917] LR 
[101b5234] 0x101b5234 [ 50.084042] --- interrupt: c00 [ 50.084238] Instruction dump: [ 50.084483] 4bfffc40 6000 6000 6000 9421fff0 39400402 914200c0 38210010 [ 50.084841] 4bfffc20 <7fe8> 7c0802a6 7c892378 93c10048 [ 50.085487] ---[ end trace f6fffe98e2fa8f3e ]--- [ 50.085678] Trace/breakpoint trap There is no real mode for booke arch and the MMU translation is always on. The corresponding MSR_IS/MSR_DS bit in booke is used to switch the address space, but not for real mode judgment. Fixes: 21f8b2fa3ca5 ("powerpc/kprobes: Ignore traps that happened in real mode") Signed-off-by: Pu Lehui --- v1->v2: - use IS_ENABLED(CONFIG_BOOKE) as suggested by Michael Ellerman and Christophe Leroy - update Oops log to make problem clear arch/powerpc/kernel/kprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index cbc28d1a2e1b..7a7cd6bda53e 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -292,7 +292,8 @@ int kprobe_handler(struct pt_regs *regs) if (user_mode(regs)) return 0; - if (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR)) + if (!IS_ENABLED(CONFIG_BOOKE) && + (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR))) return 0; /*
[powerpc:fixes-test] BUILD SUCCESS 43e8f76006592cb1573a959aa287c45421066f9c
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git fixes-test branch HEAD: 43e8f76006592cb1573a959aa287c45421066f9c powerpc/kprobes: Fix kprobe Oops happens in booke elapsed time: 724m configs tested: 83 configs skipped: 86 The following configs have been built successfully. More configs may be tested in the coming days. gcc tested configs: arm defconfig arm64allyesconfig arm allyesconfig arm allmodconfig arm64 defconfig ia64 allmodconfig ia64defconfig ia64 allyesconfig x86_64allnoconfig m68k allmodconfig m68kdefconfig m68k allyesconfig nios2 defconfig arc allyesconfig nds32 allnoconfig nds32 defconfig nios2allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig arc defconfig sh allmodconfig i386 allyesconfig sparcallyesconfig sparc defconfig i386defconfig mips allyesconfig mips allmodconfig powerpc allyesconfig powerpc allmodconfig powerpc allnoconfig x86_64 randconfig-a004-20210810 x86_64 randconfig-a006-20210810 x86_64 randconfig-a003-20210810 x86_64 randconfig-a005-20210810 x86_64 randconfig-a002-20210810 x86_64 randconfig-a001-20210810 i386 randconfig-a004-20210810 i386 randconfig-a002-20210810 i386 randconfig-a001-20210810 i386 randconfig-a003-20210810 i386 randconfig-a006-20210810 i386 randconfig-a005-20210810 i386 randconfig-a012-20210809 i386 randconfig-a015-20210809 i386 randconfig-a011-20210809 i386 randconfig-a013-20210809 i386 randconfig-a014-20210809 i386 randconfig-a016-20210809 i386 randconfig-a011-20210810 i386 randconfig-a015-20210810 i386 randconfig-a013-20210810 i386 randconfig-a014-20210810 i386 randconfig-a016-20210810 i386 randconfig-a012-20210810 riscvnommu_k210_defconfig riscvallyesconfig riscvnommu_virt_defconfig riscv allnoconfig riscv defconfig riscv rv32_defconfig riscvallmodconfig x86_64rhel-8.3-kselftests um x86_64_defconfig um i386_defconfig x86_64 allyesconfig x86_64 defconfig x86_64 rhel-8.3 x86_64 kexec clang tested configs: x86_64 randconfig-a002-20210808 
x86_64 randconfig-a004-20210808 x86_64 randconfig-a006-20210808 x86_64 randconfig-a003-20210808 x86_64 randconfig-a001-20210808 x86_64 randconfig-a005-20210808 x86_64 randconfig-a013-20210810 x86_64 randconfig-a011-20210810 x86_64 randconfig-a012-20210810 x86_64 randconfig-a016-20210810 x86_64 randconfig-a014-20210810 x86_64 randconfig-a015-20210810 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
Re: [PATCH v7 5/6] powerpc/pseries: Add support for FORM2 associativity
David Gibson writes: > On Mon, Aug 09, 2021 at 10:54:33AM +0530, Aneesh Kumar K.V wrote: >> PAPR interface currently supports two different ways of communicating >> resource >> grouping details to the OS. These are referred to as Form 0 and Form 1 >> associativity grouping. Form 0 is the older format and is now considered >> deprecated. This patch adds another resource grouping named FORM2. >> >> Signed-off-by: Daniel Henrique Barboza >> Signed-off-by: Aneesh Kumar K.V > > LGTM, with the exception of some minor nits noted below. ... > + >> +for (i = 0; i < max_numa_index; i++) >> +/* +1 skip the max_numa_index in the property */ >> +numa_id_index_table[i] = of_read_number(_lookup_index[i + >> 1], 1); >> + >> + >> +if (numa_dist_table_length != max_numa_index * max_numa_index) { >> + > > Stray extra whitespace line here. > >> +WARN(1, "Wrong NUMA distance information\n"); >> +/* consider everybody else just remote. */ >> +for (i = 0; i < max_numa_index; i++) { >> +for (j = 0; j < max_numa_index; j++) { >> +int nodeA = numa_id_index_table[i]; >> +int nodeB = numa_id_index_table[j]; >> + >> +if (nodeA == nodeB) >> +numa_distance_table[nodeA][nodeB] = >> LOCAL_DISTANCE; >> +else >> +numa_distance_table[nodeA][nodeB] = >> REMOTE_DISTANCE; >> +} >> +} > > I don't think it's necessarily a problem, but something to consider is > that this fallback will initialize distance for *all* node IDs, > whereas the normal path will only initialize it for nodes that are in > the index table. Since some later error checks key off whether > certain fields in the distance table are initialized, is that the > outcome you want? > With the device tree details not correct, one of the possible way to make progress is to consider everybody remote. With new node hotplug support we used to check whether the distance table entry is initialized. With the updated spec, we expect all possible numa node distance to be available during boot. -aneesh
Re: [powerpc:next-test 60/60] arch/powerpc/kernel/misc_32.S:240: Error: unrecognized opcode: `emit_warn_entry'
Le 10/08/2021 à 20:10, kernel test robot a écrit : tree: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next-test head: 5e3aa4531ecc7febbfa18218145c903dab17e651 commit: 5e3aa4531ecc7febbfa18218145c903dab17e651 [60/60] powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto config: powerpc64-buildonly-randconfig-r001-20210810 (attached as .config) compiler: powerpc-linux-gcc (GCC) 10.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/commit/?id=5e3aa4531ecc7febbfa18218145c903dab17e651 git remote add powerpc https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git git fetch --no-tags powerpc next-test git checkout 5e3aa4531ecc7febbfa18218145c903dab17e651 # save the attached .config to linux build tree mkdir build_dir COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-10.3.0 make.cross O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/ If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>): arch/powerpc/kernel/misc_32.S: Assembler messages: arch/powerpc/kernel/misc_32.S:240: Error: unrecognized opcode: `emit_warn_entry' We are missing stubs when CONFIG_BUG is not selected. The following should fix it: diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index f36b872cb571..1ee0f22313ee 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -137,8 +137,11 @@ __label_warn_on: \ #ifdef __ASSEMBLY__ .macro EMIT_BUG_ENTRY addr,file,line,flags .endm +.macro EMIT_WARN_ENTRY addr,file,line,flags +.endm #else /* !__ASSEMBLY__ */ #define _EMIT_BUG_ENTRY +#define _EMIT_WARN_ENTRY #endif #endif /* CONFIG_BUG */
Re: [PATCH v2] powerpc/kprobes: Fix kprobe Oops happens in booke
Le 11/08/2021 à 04:53, Pu Lehui a écrit : Ping, serious problem here. All booke ppc will trigger Oops when perform kprobes related operations. As far as I can see it is in the fixes branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/log/?h=fixes On 2021/8/9 10:36, Pu Lehui wrote: When using kprobe on powerpc booke series processor, Oops happens as show bellow: / # echo "p:myprobe do_nanosleep" > /sys/kernel/debug/tracing/kprobe_events / # echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable / # sleep 1 [ 50.076730] Oops: Exception in kernel mode, sig: 5 [#1] [ 50.077017] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500 [ 50.077221] Modules linked in: [ 50.077462] CPU: 0 PID: 77 Comm: sleep Not tainted 5.14.0-rc4-00022-g251a1524293d #21 [ 50.077887] NIP: c0b9c4e0 LR: c00ebecc CTR: [ 50.078067] REGS: c3883de0 TRAP: 0700 Not tainted (5.14.0-rc4-00022-g251a1524293d) [ 50.078349] MSR: 00029000 CR: 24000228 XER: 2000 [ 50.078675] [ 50.078675] GPR00: c00ebdf0 c3883e90 c313e300 c3883ea0 0001 c3883ecc 0001 [ 50.078675] GPR08: c100598c c00ea250 0004 24000222 102490c2 bff4180c 101e60d4 [ 50.078675] GPR16: 102454ac 0040 1024 10241100 102410f8 1024 0050 [ 50.078675] GPR24: 0002 c3883ea0 0001 c350 3b9b8d50 [ 50.080151] NIP [c0b9c4e0] do_nanosleep+0x0/0x190 [ 50.080352] LR [c00ebecc] hrtimer_nanosleep+0x14c/0x1e0 [ 50.080638] Call Trace: [ 50.080801] [c3883e90] [c00ebdf0] hrtimer_nanosleep+0x70/0x1e0 (unreliable) [ 50.081110] [c3883f00] [c00ec004] sys_nanosleep_time32+0xa4/0x110 [ 50.081336] [c3883f40] [c001509c] ret_from_syscall+0x0/0x28 [ 50.081541] --- interrupt: c00 at 0x100a4d08 [ 50.081749] NIP: 100a4d08 LR: 101b5234 CTR: 0003 [ 50.081931] REGS: c3883f50 TRAP: 0c00 Not tainted (5.14.0-rc4-00022-g251a1524293d) [ 50.082183] MSR: 0002f902 CR: 24000222 XER: [ 50.082457] [ 50.082457] GPR00: 00a2 bf980040 1024b4d0 bf980084 bf980084 6400 00555345 fefefeff [ 50.082457] GPR08: 7f7f7f7f 101e 0069 0003 28000422 102490c2 bff4180c 101e60d4 [ 50.082457] GPR16: 
102454ac 0040 1024 10241100 102410f8 1024 0050 [ 50.082457] GPR24: 0002 bf9803f4 1024 100039e0 102444e8 [ 50.083789] NIP [100a4d08] 0x100a4d08 [ 50.083917] LR [101b5234] 0x101b5234 [ 50.084042] --- interrupt: c00 [ 50.084238] Instruction dump: [ 50.084483] 4bfffc40 6000 6000 6000 9421fff0 39400402 914200c0 38210010 [ 50.084841] 4bfffc20 <7fe8> 7c0802a6 7c892378 93c10048 [ 50.085487] ---[ end trace f6fffe98e2fa8f3e ]--- [ 50.085678] Trace/breakpoint trap There is no real mode for booke arch and the MMU translation is always on. The corresponding MSR_IS/MSR_DS bit in booke is used to switch the address space, but not for real mode judgment. Fixes: 21f8b2fa3ca5 ("powerpc/kprobes: Ignore traps that happened in real mode") Signed-off-by: Pu Lehui --- v1->v2: - use IS_ENABLED(CONFIG_BOOKE) as suggested by Michael Ellerman and Christophe Leroy - update Oops log to make problem clear arch/powerpc/kernel/kprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index cbc28d1a2e1b..7a7cd6bda53e 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -292,7 +292,8 @@ int kprobe_handler(struct pt_regs *regs) if (user_mode(regs)) return 0; - if (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR)) + if (!IS_ENABLED(CONFIG_BOOKE) && + (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR))) return 0; /*