Re: [PATCH v2 3/4] powerpc: Optimize register usage for dear register
Le 07/08/2021 à 03:02, sxwj...@me.com a écrit : From: Xiongwei Song Create an anonymous union for dar and dear regsiters, we can reference dear to get the effective address when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dar. This makes code more clear. Signed-off-by: Xiongwei Song --- arch/powerpc/include/asm/ptrace.h | 5 - arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index c252d04b1206..fa725e3238c2 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -43,7 +43,10 @@ struct pt_regs unsigned long mq; #endif unsigned long trap; - unsigned long dar; + union { + unsigned long dar; + unsigned long dear; + }; union { unsigned long dsisr; unsigned long esr; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index f74af8f9133c..50436b52c213 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index a222fd4d6334..7c7093c17c45 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -373,6 +373,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, trap)); BUILD_BUG_ON(offsetof(struct pt_regs, dar) != offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dear) != +offsetof(struct user_pt_regs, dar)); dar and dear are the same, so checking the same thing a second time is pointless. 
BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); BUILD_BUG_ON(offsetof(struct pt_regs, esr) !=
Re: [PATCH v2 1/4] powerpc: Optimize register usage for esr register
Le 07/08/2021 à 03:02, sxwj...@me.com a écrit : From: Xiongwei Song Create an anonymous union for dsisr and esr regsiters, we can reference esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dsisr. This makes code more clear. Signed-off-by: Xiongwei Song --- arch/powerpc/include/asm/ptrace.h | 5 - arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c| 2 ++ arch/powerpc/kernel/traps.c| 2 +- arch/powerpc/platforms/44x/machine_check.c | 4 ++-- arch/powerpc/platforms/4xx/machine_check.c | 2 +- 6 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 3e5d470a6155..c252d04b1206 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -44,7 +44,10 @@ struct pt_regs #endif unsigned long trap; unsigned long dar; - unsigned long dsisr; + union { + unsigned long dsisr; + unsigned long esr; + }; unsigned long result; }; }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 185beb290580..f74af8f9133c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 0a0a33eb0d28..a222fd4d6334 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -375,6 +375,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != 
+offsetof(struct user_pt_regs, dsisr)); esr and dsisr are the same, so checking the same thing a second time is pointless. BUILD_BUG_ON(offsetof(struct pt_regs, result) != offsetof(struct user_pt_regs, result)); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dfbce527c98e..2164f5705a0b 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -562,7 +562,7 @@ static inline int check_io_access(struct pt_regs *regs) #ifdef CONFIG_PPC_ADV_DEBUG_REGS /* On 4xx, the reason for the machine check or program exception is in the ESR. */ -#define get_reason(regs) ((regs)->dsisr) +#define get_reason(regs) ((regs)->esr) #define REASON_FP ESR_FP #define REASON_ILLEGAL(ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR diff --git a/arch/powerpc/platforms/44x/machine_check.c b/arch/powerpc/platforms/44x/machine_check.c index a5c898bb9bab..5d19daacd78a 100644 --- a/arch/powerpc/platforms/44x/machine_check.c +++ b/arch/powerpc/platforms/44x/machine_check.c @@ -11,7 +11,7 @@ int machine_check_440A(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; printk("Machine check in kernel mode.\n"); if (reason & ESR_IMCP){ @@ -48,7 +48,7 @@ int machine_check_440A(struct pt_regs *regs) #ifdef CONFIG_PPC_47x int machine_check_47x(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; u32 mcsr; printk(KERN_ERR "Machine check in kernel mode.\n"); diff --git a/arch/powerpc/platforms/4xx/machine_check.c b/arch/powerpc/platforms/4xx/machine_check.c index a71c29892a91..a905da1d6f41 100644 --- a/arch/powerpc/platforms/4xx/machine_check.c +++ b/arch/powerpc/platforms/4xx/machine_check.c @@ -10,7 +10,7 @@ int machine_check_4xx(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; if (reason & ESR_IMCP) { printk("Instruction");
Re: Debian SID kernel doesn't boot on PowerBook 3400c
On Fri, 6 Aug 2021, Stan Johnson wrote: > $ egrep '(CONFIG_PPC_KUAP|CONFIG_VMAP_STACK)' .config > CONFIG_PPC_KUAP=y > CONFIG_PPC_KUAP_DEBUG=y > CONFIG_VMAP_STACK=y > $ strings vmlinux | fgrep "Linux version" > Linux version 5.13.0-pmac-4-g63e3756d1bd ... > $ cp vmlinux ../vmlinux-5.13.0-pmac-4-g63e3756d1bd-1 > > 1) PB 3400c > vmlinux-5.13.0-pmac-4-g63e3756d1bd-1 > Boots, no errors logging in at (text) fb console. Logging in via ssh and > running "ls -Rail /usr/include" generated errors (and a hung ssh > session). Once errors started, they repeated for almost every command. > See pb3400c-63e3756d1bdf-1.txt. > > 2) Wallstreet > vmlinux-5.13.0-pmac-4-g63e3756d1bd-1 > X login failed, there were errors ("Oops: Kernel access of bad area", > "Oops: Exception in kernel mode"). Logging in via SSH, there were no > additional errors after running "ls -Rail /usr/include" -- the errors > did not escalate as they did on the PB 3400. > See Wallstreet-63e3756d1bdf-1.txt. > ... > $ egrep '(CONFIG_PPC_KUAP|CONFIG_VMAP_STACK)' .config > CONFIG_PPC_KUAP=y > CONFIG_PPC_KUAP_DEBUG=y > # CONFIG_VMAP_STACK is not set > $ strings vmlinux | fgrep "Linux version" > Linux version 5.13.0-pmac-4-g63e3756d1bd ... > $ cp vmlinux ../vmlinux-5.13.0-pmac-4-g63e3756d1bd-2 > > 3) PB 3400c > vmlinux-5.13.0-pmac-4-g63e3756d1bd-2 > Filesystem was corrupt from the previous test (probably from all the > errors during shutdown). After fixing the filesystem: > Boots, no errors logging in at (text) fb console. Logging in via ssh and > running "ls -Rail /usr/include" generated a few errors. There didn't > seem to be as many errors as in the previous test, there were a few > errors during shutdown but the shutdown was otherwise normal. > See pb3400c-63e3756d1bdf-2.txt. > > 4) Wallstreet > vmlinux-5.13.0-pmac-4-g63e3756d1bd-2 > X login worked, and there were no errors. There were no errors during > ssh access. > See Wallstreet-63e3756d1bdf-2.txt. > Thanks for collecting these results, Stan. 
Do you think that the successful result from test 4) could have been just chance? It appears that the bug affecting the Powerbook 3400 is unaffected by CONFIG_VMAP_STACK. Whereas the bug affecting the Powerbook G3 disappears when CONFIG_VMAP_STACK is disabled (assuming the result from 4 is reliable). Either way, these results reiterate that "Oops: Kernel access of bad area, sig: 11" was not entirely resolved by "powerpc/32s: Fix napping restore in data storage interrupt (DSI)".
Re: Debian SID kernel doesn't boot on PowerBook 3400c
On Fri, 6 Aug 2021, Christophe Leroy wrote: > > I have cooked a tentative fix for that KUAP stuff. > Could you try the branch 'bugtest' at https://github.com/chleroy/linux.git > Thanks, Christophe. Stan, please test the following build. $ git remote add chleroy-linux https://github.com/chleroy/linux.git -f -t bugtest ... $ git checkout chleroy-linux/bugtest HEAD is now at 63e3756d1bdf powerpc/interrupts: Also perform KUAP/KUEP lock and usertime accounting on NMI $ cp ../dot-config-powermac-5.13 .config $ scripts/config -e CONFIG_PPC_KUAP -e CONFIG_PPC_KUAP_DEBUG -e CONFIG_VMAP_STACK $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux $ egrep "CONFIG_PPC_KUAP|CONFIG_VMAP_STACK" .config $ strings vmlinux |grep "Linux version" If that kernel produces errors, I'd try a second build as well: $ scripts/config -e CONFIG_PPC_KUAP -e CONFIG_PPC_KUAP_DEBUG -d CONFIG_VMAP_STACK $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux $ egrep "CONFIG_PPC_KUAP|CONFIG_VMAP_STACK" .config $ strings vmlinux |grep "Linux version" Please boot using the same kernel parameters as last time and capture the serial console logs. In case we're still dealing with intermittent bugs it might be necessary to repeat these tests so I suggest you retain the vmlinux files.
[PATCH v2 4/4] powerpc/64e: Get dear offset with _DEAR macro
From: Xiongwei Song Use _DEAR to get the offset of dear register in pr_regs for 64e cpus. Signed-off-by: Xiongwei Song --- arch/powerpc/kernel/asm-offsets.c| 13 +++-- arch/powerpc/kernel/exceptions-64e.S | 8 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index f4ebc435fd78..8357d5fcd09e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -286,23 +286,16 @@ int main(void) STACK_PT_REGS_OFFSET(_CCR, ccr); STACK_PT_REGS_OFFSET(_XER, xer); STACK_PT_REGS_OFFSET(_DAR, dar); + STACK_PT_REGS_OFFSET(_DEAR, dear); STACK_PT_REGS_OFFSET(_DSISR, dsisr); STACK_PT_REGS_OFFSET(_ESR, esr); STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3); STACK_PT_REGS_OFFSET(RESULT, result); STACK_PT_REGS_OFFSET(_TRAP, trap); -#ifndef CONFIG_PPC64 - /* -* The PowerPC 400-class & Book-E processors have neither the DAR -* nor the DSISR SPRs. Hence, we overload them to hold the similar -* DEAR and ESR SPRs for such processors. For critical interrupts -* we use them to hold SRR0 and SRR1. 
-*/ - STACK_PT_REGS_OFFSET(_DEAR, dar); -#else /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC64 STACK_PT_REGS_OFFSET(SOFTE, softe); STACK_PT_REGS_OFFSET(_PPR, ppr); -#endif /* CONFIG_PPC64 */ +#endif #ifdef CONFIG_PPC_PKEY STACK_PT_REGS_OFFSET(STACK_REGS_AMR, amr); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index bf8c4c2f98ea..221e085e8c8c 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -545,7 +545,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -558,7 +558,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) li r15,0 mr r14,r10 - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -575,7 +575,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -1057,7 +1057,7 @@ bad_stack_book3e: std r11,_CCR(r1) mfspr r10,SPRN_DEAR mfspr r11,SPRN_ESR - std r10,_DAR(r1) + std r10,_DEAR(r1) std r11,_ESR(r1) std r0,GPR0(r1);/* save r0 in stackframe */ \ std r2,GPR2(r1);/* save r2 in stackframe */ \ -- 2.30.2
[PATCH v2 3/4] powerpc: Optimize register usage for dear register
From: Xiongwei Song Create an anonymous union for dar and dear regsiters, we can reference dear to get the effective address when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dar. This makes code more clear. Signed-off-by: Xiongwei Song --- arch/powerpc/include/asm/ptrace.h | 5 - arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index c252d04b1206..fa725e3238c2 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -43,7 +43,10 @@ struct pt_regs unsigned long mq; #endif unsigned long trap; - unsigned long dar; + union { + unsigned long dar; + unsigned long dear; + }; union { unsigned long dsisr; unsigned long esr; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index f74af8f9133c..50436b52c213 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index a222fd4d6334..7c7093c17c45 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -373,6 +373,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, trap)); BUILD_BUG_ON(offsetof(struct pt_regs, dar) != offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dear) != +offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); BUILD_BUG_ON(offsetof(struct pt_regs, esr) != -- 
2.30.2
[PATCH v2 2/4] powerpc/64e: Get esr offset with _ESR macro
From: Xiongwei Song Use _ESR to get the offset of esr register in pr_regs for 64e cpus. Signed-off-by: Xiongwei Song --- arch/powerpc/kernel/asm-offsets.c| 2 +- arch/powerpc/kernel/exceptions-64e.S | 10 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index a47eefa09bcb..f4ebc435fd78 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -287,6 +287,7 @@ int main(void) STACK_PT_REGS_OFFSET(_XER, xer); STACK_PT_REGS_OFFSET(_DAR, dar); STACK_PT_REGS_OFFSET(_DSISR, dsisr); + STACK_PT_REGS_OFFSET(_ESR, esr); STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3); STACK_PT_REGS_OFFSET(RESULT, result); STACK_PT_REGS_OFFSET(_TRAP, trap); @@ -298,7 +299,6 @@ int main(void) * we use them to hold SRR0 and SRR1. */ STACK_PT_REGS_OFFSET(_DEAR, dar); - STACK_PT_REGS_OFFSET(_ESR, dsisr); #else /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(SOFTE, softe); STACK_PT_REGS_OFFSET(_PPR, ppr); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 1401787b0b93..bf8c4c2f98ea 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -546,7 +546,7 @@ __end_interrupts: mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x300) @@ -559,7 +559,7 @@ __end_interrupts: li r15,0 mr r14,r10 std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x400) @@ -576,7 +576,7 @@ __end_interrupts: mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x600) @@ -587,7 +587,7 @@ __end_interrupts: NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM, PROLOG_ADDITION_1REG) mfspr r14,SPRN_ESR - std r14,_DSISR(r1) + std 
r14,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) EXCEPTION_COMMON(0x700) addir3,r1,STACK_FRAME_OVERHEAD @@ -1058,7 +1058,7 @@ bad_stack_book3e: mfspr r10,SPRN_DEAR mfspr r11,SPRN_ESR std r10,_DAR(r1) - std r11,_DSISR(r1) + std r11,_ESR(r1) std r0,GPR0(r1);/* save r0 in stackframe */ \ std r2,GPR2(r1);/* save r2 in stackframe */ \ SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */\ -- 2.30.2
[PATCH v2 1/4] powerpc: Optimize register usage for esr register
From: Xiongwei Song Create an anonymous union for dsisr and esr regsiters, we can reference esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dsisr. This makes code more clear. Signed-off-by: Xiongwei Song --- arch/powerpc/include/asm/ptrace.h | 5 - arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c| 2 ++ arch/powerpc/kernel/traps.c| 2 +- arch/powerpc/platforms/44x/machine_check.c | 4 ++-- arch/powerpc/platforms/4xx/machine_check.c | 2 +- 6 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 3e5d470a6155..c252d04b1206 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -44,7 +44,10 @@ struct pt_regs #endif unsigned long trap; unsigned long dar; - unsigned long dsisr; + union { + unsigned long dsisr; + unsigned long esr; + }; unsigned long result; }; }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 185beb290580..f74af8f9133c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 0a0a33eb0d28..a222fd4d6334 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -375,6 +375,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != +offsetof(struct user_pt_regs, dsisr)); 
BUILD_BUG_ON(offsetof(struct pt_regs, result) != offsetof(struct user_pt_regs, result)); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dfbce527c98e..2164f5705a0b 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -562,7 +562,7 @@ static inline int check_io_access(struct pt_regs *regs) #ifdef CONFIG_PPC_ADV_DEBUG_REGS /* On 4xx, the reason for the machine check or program exception is in the ESR. */ -#define get_reason(regs) ((regs)->dsisr) +#define get_reason(regs) ((regs)->esr) #define REASON_FP ESR_FP #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR diff --git a/arch/powerpc/platforms/44x/machine_check.c b/arch/powerpc/platforms/44x/machine_check.c index a5c898bb9bab..5d19daacd78a 100644 --- a/arch/powerpc/platforms/44x/machine_check.c +++ b/arch/powerpc/platforms/44x/machine_check.c @@ -11,7 +11,7 @@ int machine_check_440A(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; printk("Machine check in kernel mode.\n"); if (reason & ESR_IMCP){ @@ -48,7 +48,7 @@ int machine_check_440A(struct pt_regs *regs) #ifdef CONFIG_PPC_47x int machine_check_47x(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; u32 mcsr; printk(KERN_ERR "Machine check in kernel mode.\n"); diff --git a/arch/powerpc/platforms/4xx/machine_check.c b/arch/powerpc/platforms/4xx/machine_check.c index a71c29892a91..a905da1d6f41 100644 --- a/arch/powerpc/platforms/4xx/machine_check.c +++ b/arch/powerpc/platforms/4xx/machine_check.c @@ -10,7 +10,7 @@ int machine_check_4xx(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; if (reason & ESR_IMCP) { printk("Instruction"); -- 2.30.2
[PATCH v2 0/4] Some improvements on regs usage
From: Xiongwei Song When CONFIG_4xx=y or CONFIG_BOOKE=y, currently in code we reference dsisr to get interrupt reasons and reference dar to get exception address. However, in reference manuals, esr is used for interrupt reasons and dear is used for exception address, so the patchset changes dsisr -> esr, dar -> dear for CONFIG_4xx=y or CONFIG_BOOKE=y. Meanwhile, we use _ESR and _DEAR to get offsets of esr and dear on stack. v2: - Discard changes in arch/powerpc/mm/fault.c as Christophe and Michael suggested. - Discard changes in UAPI headers to avoid possible compile issue. v1: - https://lkml.org/lkml/2021/8/6/57 - https://lkml.org/lkml/2021/7/26/533 - https://lkml.org/lkml/2021/7/26/534 - https://lkml.org/lkml/2021/7/26/535 Xiongwei Song (4): powerpc: Optimize register usage for esr register powerpc/64e: Get esr offset with _ESR macro powerpc: Optimize register usage for dear register powerpc/64e: Get dear offset with _DEAR macro arch/powerpc/include/asm/ptrace.h | 10 -- arch/powerpc/kernel/asm-offsets.c | 15 --- arch/powerpc/kernel/exceptions-64e.S | 18 +- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c| 4 arch/powerpc/kernel/traps.c| 2 +- arch/powerpc/platforms/44x/machine_check.c | 4 ++-- arch/powerpc/platforms/4xx/machine_check.c | 2 +- 8 files changed, 30 insertions(+), 27 deletions(-) -- 2.30.2
Re: [PATCH v2 0/6] PCI: Drop duplicated tracking of a pci_dev's bound driver
On Fri, Aug 06, 2021 at 08:46:23AM +0200, Uwe Kleine-König wrote: > On Thu, Aug 05, 2021 at 06:42:34PM -0500, Bjorn Helgaas wrote: > > I looked at all the bus_type.probe() methods, it looks like pci_dev is > > not the only offender here. At least the following also have a driver > > pointer in the device struct: > > > > parisc_device.driver > > acpi_device.driver > > dio_dev.driver > > hid_device.driver > > pci_dev.driver > > pnp_dev.driver > > rio_dev.driver > > zorro_dev.driver > > Right, when I converted zorro_dev it was pointed out that the code was > copied from pci and the latter has the same construct. :-) > See > https://lore.kernel.org/r/20210730191035.1455248-5-u.kleine-koe...@pengutronix.de > for the patch, I don't find where pci was pointed out, maybe it was on > irc only. Oh, thanks! I looked to see if you'd done something similar elsewhere, but I missed this one. > > Looking through the places that care about pci_dev.driver (the ones > > updated by patch 5/6), many of them are ... a little dubious to begin > > with. A few need the "struct pci_error_handlers *err_handler" > > pointer, so that's probably legitimate. But many just need a name, > > and should probably be using dev_driver_string() instead. > > Yeah, I considered adding a function to get the driver name from a > pci_dev and a function to get the error handlers. Maybe it's an idea to > introduce these two and then use to_pci_driver(pdev->dev.driver) for the > few remaining users? Maybe doing that on top of my current series makes > sense to have a clean switch from pdev->driver to pdev->dev.driver?! 
I'd propose using dev_driver_string() for these places: eeh_driver_name() (could change callers to use dev_driver_string()) bcma_host_pci_probe() qm_alloc_uacce() hns3_get_drvinfo() prestera_pci_probe() mlxsw_pci_probe() nfp_get_drvinfo() ssb_pcihost_probe() The use in mpt_device_driver_register() looks unnecessary: it's only to get a struct pci_device_id *, which is passed to ->probe() functions that don't need it. The use in adf_enable_aer() looks wrong: it sets the err_handler pointer in one of the adf_driver structs. I think those structs should be basically immutable, and the drivers that call adf_enable_aer() from their .probe() methods should set ".err_handler = &adf_err_handler" in their static adf_driver definitions instead. I think that basically leaves these: uncore_pci_probe() # .id_table, custom driver "registration" match_id() # .id_table, arch/x86/kernel/probe_roms.c xhci_pci_quirks() # .id_table pci_error_handlers() # roll-your-own AER handling, drivers/misc/cxl/guest.c I think it would be fine to use to_pci_driver(pdev->dev.driver) for these few. Bjorn
RE: [PATCH v4] soc: fsl: qe: convert QE interrupt controller to platform_device
> -Original Message- > From: Maxim Kochetkov > Sent: Tuesday, August 3, 2021 6:36 AM > To: linuxppc-dev@lists.ozlabs.org > Cc: Qiang Zhao ; Leo Li ; > gre...@linuxfoundation.org; sarava...@google.com; linux-arm- > ker...@lists.infradead.org; linux-ker...@vger.kernel.org; Maxim Kochetkov > ; kernel test robot ; Dan Carpenter > > Subject: [PATCH v4] soc: fsl: qe: convert QE interrupt controller to > platform_device > > Since 5.13 QE's ucc nodes can't get interrupts from devicetree: > > ucc@2000 { > cell-index = <1>; > reg = <0x2000 0x200>; > interrupts = <32>; > interrupt-parent = <&qeic>; > }; > > Now fw_devlink expects driver to create and probe a struct device for > interrupt controller. > > So lets convert this driver to simple platform_device with probe(). > Also use platform_get_ and devm_ family function to get/allocate resources > and drop unused .compatible = "qeic". > > [1] - > https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.k > ernel.org%2Flkml%2FCAGETcx9PiX%3D%3DmLxB9PO8Myyk6u2vhPVwTMsA > 5NkD- > ywH5xhusw%40mail.gmail.com&data=04%7C01%7Cleoyang.li%40nxp.co > m%7C1833b32e26de4ed7ef7908d956728eae%7C686ea1d3bc2b4c6fa92cd99c5 > c301635%7C0%7C0%7C637635872281355718%7CUnknown%7CTWFpbGZsb3d > 8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0% > 3D%7C1000&sdata=HrivK73GYFAwygPz24JtO%2BTdkicCVYXOl3uywjOqS > %2BA%3D&reserved=0 > Fixes: e590474768f1 ("driver core: Set fw_devlink=on by default") > Fixes: ea718c699055 ("Revert "Revert "driver core: Set fw_devlink=on by > default""") > Signed-off-by: Maxim Kochetkov > Reported-by: kernel test robot > Reported-by: Dan Carpenter Applied to fix. Thanks. 
> --- > drivers/soc/fsl/qe/qe_ic.c | 75 ++ > 1 file changed, 44 insertions(+), 31 deletions(-) > > diff --git a/drivers/soc/fsl/qe/qe_ic.c b/drivers/soc/fsl/qe/qe_ic.c index > 3f711c1a0996..e710d554425d 100644 > --- a/drivers/soc/fsl/qe/qe_ic.c > +++ b/drivers/soc/fsl/qe/qe_ic.c > @@ -23,6 +23,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -404,41 +405,40 @@ static void qe_ic_cascade_muxed_mpic(struct > irq_desc *desc) > chip->irq_eoi(&desc->irq_data); > } > > -static void __init qe_ic_init(struct device_node *node) > +static int qe_ic_init(struct platform_device *pdev) > { > + struct device *dev = &pdev->dev; > void (*low_handler)(struct irq_desc *desc); > void (*high_handler)(struct irq_desc *desc); > struct qe_ic *qe_ic; > - struct resource res; > - u32 ret; > + struct resource *res; > + struct device_node *node = pdev->dev.of_node; > > - ret = of_address_to_resource(node, 0, &res); > - if (ret) > - return; > + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); > + if (res == NULL) { > + dev_err(dev, "no memory resource defined\n"); > + return -ENODEV; > + } > > - qe_ic = kzalloc(sizeof(*qe_ic), GFP_KERNEL); > + qe_ic = devm_kzalloc(dev, sizeof(*qe_ic), GFP_KERNEL); > if (qe_ic == NULL) > - return; > + return -ENOMEM; > > - qe_ic->irqhost = irq_domain_add_linear(node, NR_QE_IC_INTS, > -&qe_ic_host_ops, qe_ic); > - if (qe_ic->irqhost == NULL) { > - kfree(qe_ic); > - return; > + qe_ic->regs = devm_ioremap(dev, res->start, resource_size(res)); > + if (qe_ic->regs == NULL) { > + dev_err(dev, "failed to ioremap() registers\n"); > + return -ENODEV; > } > > - qe_ic->regs = ioremap(res.start, resource_size(&res)); > - > qe_ic->hc_irq = qe_ic_irq_chip; > > - qe_ic->virq_high = irq_of_parse_and_map(node, 0); > - qe_ic->virq_low = irq_of_parse_and_map(node, 1); > + qe_ic->virq_high = platform_get_irq(pdev, 0); > + qe_ic->virq_low = platform_get_irq(pdev, 1); > > - if (!qe_ic->virq_low) { > - printk(KERN_ERR "Failed to map QE_IC 
low IRQ\n"); > - kfree(qe_ic); > - return; > + if (qe_ic->virq_low < 0) { > + return -ENODEV; > } > + > if (qe_ic->virq_high != qe_ic->virq_low) { > low_handler = qe_ic_cascade_low; > high_handler = qe_ic_cascade_high; > @@ -447,6 +447,13 @@ static void __init qe_ic_init(struct device_node > *node) > high_handler = NULL; > } > > + qe_ic->irqhost = irq_domain_add_linear(node, NR_QE_IC_INTS, > +&qe_ic_host_ops, qe_ic); > + if (qe_ic->irqhost == NULL) { > + dev_err(dev, "failed to add irq domain\n"); > + return -ENODEV; > + } > + > qe_ic_write(qe_ic->regs, QEIC_CICR, 0); > > irq_set_handler_data(qe_ic->virq_low, qe_ic); @@ -456,20 +463,26 > @@ static void __init qe_ic_init(struct device_node
Re: [PATCH v4 2/5] dt-bindings: nintendo-otp: Document the Wii and Wii U OTP support
On Sun, 01 Aug 2021 09:38:19 +0200, Emmanuel Gil Peyrot wrote: > Both of these consoles use the exact same two registers, even at the > same address, but the Wii U has eight banks of 128 bytes memory while > the Wii only has one, hence the two compatible strings. > > Signed-off-by: Emmanuel Gil Peyrot > --- > .../bindings/nvmem/nintendo-otp.yaml | 44 +++ > 1 file changed, 44 insertions(+) > create mode 100644 Documentation/devicetree/bindings/nvmem/nintendo-otp.yaml > Reviewed-by: Rob Herring
Re: [PATCH v1 32/55] KVM: PPC: Book3S HV P9: Move vcpu register save/restore into functions
Nicholas Piggin writes: > This should be no functional difference but makes the caller easier > to read. > > Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas > --- > arch/powerpc/kvm/book3s_hv.c | 65 +++- > 1 file changed, 41 insertions(+), 24 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index c2c72875fca9..45211458ac05 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -4062,6 +4062,44 @@ static void store_spr_state(struct kvm_vcpu *vcpu) > vcpu->arch.ctrl = mfspr(SPRN_CTRLF); > } > > +/* Returns true if current MSR and/or guest MSR may have changed */ > +static bool load_vcpu_state(struct kvm_vcpu *vcpu, > +struct p9_host_os_sprs *host_os_sprs) > +{ > + bool ret = false; > + > + if (cpu_has_feature(CPU_FTR_TM) || > + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { > + kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); > + ret = true; > + } > + > + load_spr_state(vcpu, host_os_sprs); > + > + load_fp_state(&vcpu->arch.fp); > +#ifdef CONFIG_ALTIVEC > + load_vr_state(&vcpu->arch.vr); > +#endif > + mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); > + > + return ret; > +} > + > +static void store_vcpu_state(struct kvm_vcpu *vcpu) > +{ > + store_spr_state(vcpu); > + > + store_fp_state(&vcpu->arch.fp); > +#ifdef CONFIG_ALTIVEC > + store_vr_state(&vcpu->arch.vr); > +#endif > + vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); > + > + if (cpu_has_feature(CPU_FTR_TM) || > + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) > + kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); > +} > + > static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) > { > if (!cpu_has_feature(CPU_FTR_ARCH_31)) > @@ -4169,19 +4207,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > > vcpu_vpa_increment_dispatch(vcpu); > > - if (cpu_has_feature(CPU_FTR_TM) || > - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { > - kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); > - msr = mfmsr(); /* TM 
restore can update msr */ > - } > - > - load_spr_state(vcpu, &host_os_sprs); > - > - load_fp_state(&vcpu->arch.fp); > -#ifdef CONFIG_ALTIVEC > - load_vr_state(&vcpu->arch.vr); > -#endif > - mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); > + if (unlikely(load_vcpu_state(vcpu, &host_os_sprs))) > + msr = mfmsr(); /* MSR may have been updated */ > > switch_pmu_to_guest(vcpu, &host_os_sprs); > > @@ -4285,17 +4312,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > > switch_pmu_to_host(vcpu, &host_os_sprs); > > - store_spr_state(vcpu); > - > - store_fp_state(&vcpu->arch.fp); > -#ifdef CONFIG_ALTIVEC > - store_vr_state(&vcpu->arch.vr); > -#endif > - vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); > - > - if (cpu_has_feature(CPU_FTR_TM) || > - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) > - kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); > + store_vcpu_state(vcpu); > > vcpu_vpa_increment_dispatch(vcpu);
Re: [PATCH v1 31/55] KVM: PPC: Book3S HV P9: Juggle SPR switching around
Nicholas Piggin writes: > This juggles SPR switching on the entry and exit sides to be more > symmetric, which makes the next refactoring patch possible with no > functional change. > > Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas > --- > arch/powerpc/kvm/book3s_hv.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 56429b53f4dc..c2c72875fca9 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -4175,7 +4175,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > msr = mfmsr(); /* TM restore can update msr */ > } > > - switch_pmu_to_guest(vcpu, &host_os_sprs); > + load_spr_state(vcpu, &host_os_sprs); > > load_fp_state(&vcpu->arch.fp); > #ifdef CONFIG_ALTIVEC > @@ -4183,7 +4183,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > #endif > mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); > > - load_spr_state(vcpu, &host_os_sprs); > + switch_pmu_to_guest(vcpu, &host_os_sprs); > > if (kvmhv_on_pseries()) { > /* > @@ -4283,6 +4283,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > vcpu->arch.slb_max = 0; > } > > + switch_pmu_to_host(vcpu, &host_os_sprs); > + > store_spr_state(vcpu); > > store_fp_state(&vcpu->arch.fp); > @@ -4297,8 +4299,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > > vcpu_vpa_increment_dispatch(vcpu); > > - switch_pmu_to_host(vcpu, &host_os_sprs); > - > timer_rearm_host_dec(*tb); > > restore_p9_host_os_sprs(vcpu, &host_os_sprs);
Re: [PATCH v1 30/55] KVM: PPC: Book3S HV P9: Only execute mtSPR if the value changed
Nicholas Piggin writes: > Keep better track of the current SPR value in places where > they are to be loaded with a new context, to reduce expensive > mtSPR operations. > > -73 cycles (7354) POWER9 virt-mode NULL hcall > > Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas > --- > arch/powerpc/kvm/book3s_hv.c | 64 ++-- > 1 file changed, 39 insertions(+), 25 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 0d97138e6fa4..56429b53f4dc 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -4009,19 +4009,28 @@ static void switch_pmu_to_host(struct kvm_vcpu *vcpu, > } > } > > -static void load_spr_state(struct kvm_vcpu *vcpu) > +static void load_spr_state(struct kvm_vcpu *vcpu, > + struct p9_host_os_sprs *host_os_sprs) > { > - mtspr(SPRN_DSCR, vcpu->arch.dscr); > - mtspr(SPRN_IAMR, vcpu->arch.iamr); > - mtspr(SPRN_PSPB, vcpu->arch.pspb); > - mtspr(SPRN_FSCR, vcpu->arch.fscr); > mtspr(SPRN_TAR, vcpu->arch.tar); > mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); > mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); > mtspr(SPRN_BESCR, vcpu->arch.bescr); > - mtspr(SPRN_TIDR, vcpu->arch.tid); > - mtspr(SPRN_AMR, vcpu->arch.amr); > - mtspr(SPRN_UAMOR, vcpu->arch.uamor); > + > + if (!cpu_has_feature(CPU_FTR_ARCH_31)) > + mtspr(SPRN_TIDR, vcpu->arch.tid); > + if (host_os_sprs->iamr != vcpu->arch.iamr) > + mtspr(SPRN_IAMR, vcpu->arch.iamr); > + if (host_os_sprs->amr != vcpu->arch.amr) > + mtspr(SPRN_AMR, vcpu->arch.amr); > + if (vcpu->arch.uamor != 0) > + mtspr(SPRN_UAMOR, vcpu->arch.uamor); > + if (host_os_sprs->fscr != vcpu->arch.fscr) > + mtspr(SPRN_FSCR, vcpu->arch.fscr); > + if (host_os_sprs->dscr != vcpu->arch.dscr) > + mtspr(SPRN_DSCR, vcpu->arch.dscr); > + if (vcpu->arch.pspb != 0) > + mtspr(SPRN_PSPB, vcpu->arch.pspb); > > /* >* DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI] > @@ -4036,28 +4045,31 @@ static void load_spr_state(struct kvm_vcpu *vcpu) > > static void store_spr_state(struct 
kvm_vcpu *vcpu) > { > - vcpu->arch.ctrl = mfspr(SPRN_CTRLF); > - > - vcpu->arch.iamr = mfspr(SPRN_IAMR); > - vcpu->arch.pspb = mfspr(SPRN_PSPB); > - vcpu->arch.fscr = mfspr(SPRN_FSCR); > vcpu->arch.tar = mfspr(SPRN_TAR); > vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); > vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); > vcpu->arch.bescr = mfspr(SPRN_BESCR); > - vcpu->arch.tid = mfspr(SPRN_TIDR); > + > + if (!cpu_has_feature(CPU_FTR_ARCH_31)) > + vcpu->arch.tid = mfspr(SPRN_TIDR); > + vcpu->arch.iamr = mfspr(SPRN_IAMR); > vcpu->arch.amr = mfspr(SPRN_AMR); > vcpu->arch.uamor = mfspr(SPRN_UAMOR); > + vcpu->arch.fscr = mfspr(SPRN_FSCR); > vcpu->arch.dscr = mfspr(SPRN_DSCR); > + vcpu->arch.pspb = mfspr(SPRN_PSPB); > + > + vcpu->arch.ctrl = mfspr(SPRN_CTRLF); > } > > static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) > { > - host_os_sprs->dscr = mfspr(SPRN_DSCR); > - host_os_sprs->tidr = mfspr(SPRN_TIDR); > + if (!cpu_has_feature(CPU_FTR_ARCH_31)) > + host_os_sprs->tidr = mfspr(SPRN_TIDR); > host_os_sprs->iamr = mfspr(SPRN_IAMR); > host_os_sprs->amr = mfspr(SPRN_AMR); > host_os_sprs->fscr = mfspr(SPRN_FSCR); > + host_os_sprs->dscr = mfspr(SPRN_DSCR); > } > > /* vcpu guest regs must already be saved */ > @@ -4066,18 +4078,20 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu > *vcpu, > { > mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); > > - mtspr(SPRN_PSPB, 0); > - mtspr(SPRN_UAMOR, 0); > - > - mtspr(SPRN_DSCR, host_os_sprs->dscr); > - mtspr(SPRN_TIDR, host_os_sprs->tidr); > - mtspr(SPRN_IAMR, host_os_sprs->iamr); > - > + if (!cpu_has_feature(CPU_FTR_ARCH_31)) > + mtspr(SPRN_TIDR, host_os_sprs->tidr); > + if (host_os_sprs->iamr != vcpu->arch.iamr) > + mtspr(SPRN_IAMR, host_os_sprs->iamr); > + if (vcpu->arch.uamor != 0) > + mtspr(SPRN_UAMOR, 0); > if (host_os_sprs->amr != vcpu->arch.amr) > mtspr(SPRN_AMR, host_os_sprs->amr); > - > if (host_os_sprs->fscr != vcpu->arch.fscr) > mtspr(SPRN_FSCR, host_os_sprs->fscr); > + if (host_os_sprs->dscr != vcpu->arch.dscr) > + 
mtspr(SPRN_DSCR, host_os_sprs->dscr); > + if (vcpu->arch.pspb != 0) > + mtspr(SPRN_PSPB, 0); > > /* Save guest CTRL register, set runlatch to 1 */ > if (!(vcpu->arch.ctrl & 1)) > @@ -4169,7 +4183,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > #endif > mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); > > - load_spr_state(vcpu); > + load_spr_state(vcpu, &host_os_sprs); > > if (kvmhv_on_pseries()) { >
Re: [PATCH v2] scripts/Makefile.clang: default to LLVM_IAS=1
On Fri, Aug 06, 2021 at 10:27:01AM -0700, Nick Desaulniers wrote: > LLVM_IAS=1 controls enabling clang's integrated assembler via > -integrated-as. This was an explicit opt in until we could enable > assembler support in Clang for more architectures. Now we have support > and CI coverage of LLVM_IAS=1 for all architectures except a few more > bugs affecting s390 and powerpc. The powerpc and s390 folks have been testing with clang, I think they should have been on CC for this change (done now). > This commit flips the default from opt in via LLVM_IAS=1 to opt out via > LLVM_IAS=0. CI systems or developers that were previously doing builds > with CC=clang or LLVM=1 without explicitly setting LLVM_IAS must now > explicitly opt out via LLVM_IAS=0, otherwise they will be implicitly > opted-in. > > This finally shortens the command line invocation when cross compiling > with LLVM to simply: > > $ make ARCH=arm64 LLVM=1 > > Signed-off-by: Nick Desaulniers I am still not really sure how I feel about this. I would prefer not to break people's builds but I suppose this is inevitable eventually. A little support matrix that I drafted up based on ARCH and clang version for LLVM_IAS=1 support: | 10.x | 11.x | 12.x | 13.x | 14.x | ARCH=arm | NO | NO | NO | YES | YES | ARCH=arm64 | NO | YES | YES | YES | YES | ARCH=i386| YES | YES | YES | YES | YES | ARCH=mips* | YES | YES | YES | YES | YES | ARCH=powerpc | NO | NO | NO | NO | NO | ARCH=s390| NO | NO | NO | NO | NO | ARCH=x86_64 | NO | YES | YES | YES | YES | The main issue that I have with this change is that all of these architectures work fine with CC=clang and their build commands that used to work fine will not with this change, as they will have to specify LLVM_IAS=0. I think that making this change for LLVM=1 makes sense but changing the default for just CC=clang feels like a bit much at this point in time. I would love to hear from others on this though, I am not going to object much further than this. 
Regardless of that concern, this patch does what it says so: Reviewed-by: Nathan Chancellor > --- > Changes v1 -> v2: > * Drop "Currently" from Documentation/, as per Matthew. > * Drop Makefile and riscv Makefile, rebase on > https://lore.kernel.org/lkml/20210805150102.131008-1-masahi...@kernel.org/ > as per Masahiro. > * Base is kbuild/for-next, plus > > https://lore.kernel.org/lkml/20210802183910.1802120-1-ndesaulni...@google.com/ > https://lore.kernel.org/lkml/20210805150102.131008-1-masahi...@kernel.org/. > > Documentation/kbuild/llvm.rst | 14 -- > scripts/Makefile.clang| 6 +++--- > 2 files changed, 11 insertions(+), 9 deletions(-) > > diff --git a/Documentation/kbuild/llvm.rst b/Documentation/kbuild/llvm.rst > index f8a360958f4c..e87ed5479963 100644 > --- a/Documentation/kbuild/llvm.rst > +++ b/Documentation/kbuild/llvm.rst > @@ -60,17 +60,14 @@ They can be enabled individually. The full list of the > parameters: :: > OBJCOPY=llvm-objcopy OBJDUMP=llvm-objdump READELF=llvm-readelf \ > HOSTCC=clang HOSTCXX=clang++ HOSTAR=llvm-ar HOSTLD=ld.lld > > -Currently, the integrated assembler is disabled by default. You can pass > -``LLVM_IAS=1`` to enable it. > +The integrated assembler is enabled by default. You can pass ``LLVM_IAS=0`` > to > +disable it. > > Omitting CROSS_COMPILE > -- > > As explained above, ``CROSS_COMPILE`` is used to set ``--target=``. > > -Unless ``LLVM_IAS=1`` is specified, ``CROSS_COMPILE`` is also used to derive > -``--prefix=`` to search for the GNU assembler and linker. > - > If ``CROSS_COMPILE`` is not specified, the ``--target=`` is inferred > from ``ARCH``. > > @@ -78,7 +75,12 @@ That means if you use only LLVM tools, ``CROSS_COMPILE`` > becomes unnecessary. > > For example, to cross-compile the arm64 kernel:: > > - make ARCH=arm64 LLVM=1 LLVM_IAS=1 > + make ARCH=arm64 LLVM=1 > + > +If ``LLVM_IAS=0`` is specified, ``CROSS_COMPILE`` is also used to derive > +``--prefix=`` to search for the GNU assembler and linker. 
:: > + > + make ARCH=arm64 LLVM=1 LLVM_IAS=0 CROSS_COMPILE=aarch64-linux-gnu- > > Supported Architectures > --- > diff --git a/scripts/Makefile.clang b/scripts/Makefile.clang > index 1f4e3eb70f88..3ae63bd35582 100644 > --- a/scripts/Makefile.clang > +++ b/scripts/Makefile.clang > @@ -22,12 +22,12 @@ else > CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) > endif # CROSS_COMPILE > > -ifeq ($(LLVM_IAS),1) > -CLANG_FLAGS += -integrated-as > -else > +ifeq ($(LLVM_IAS),0) > CLANG_FLAGS += -no-integrated-as > GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit)) > CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE)) > +else > +CLANG_FLAGS += -integrated-as > endif > CLANG_FLAGS += -Werror=unknown-warning-option > KBUILD_CFLAGS+= $(CLANG_FLAGS) > >
Re: [PATCH v6 6/6] powerpc/pseries: Consolidate form1 distance initialization into a helper
On 8/6/21 12:17 PM, David Gibson wrote: On Tue, Jul 27, 2021 at 03:33:11PM +0530, Aneesh Kumar K.V wrote: Currently, we duplicate parsing code for ibm,associativity and ibm,associativity-lookup-arrays in the kernel. The associativity array provided by these device tree properties are very similar and hence can use a helper to parse the node id and numa distance details. Oh... sorry.. comments on the earlier patch were from before I read and saw you adjusted things here. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/numa.c | 83 ++ 1 file changed, 51 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index fffb3c40f595..7506251e17f2 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -171,19 +171,19 @@ static void unmap_cpu_from_node(unsigned long cpu) } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ -/* - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA - * info is found. - */ -static int associativity_to_nid(const __be32 *associativity) +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) { int nid = NUMA_NO_NODE; + /* +* primary_domain_index is 1 based array index. +*/ + int index = primary_domain_index - 1; - if (!numa_enabled) + if (!numa_enabled || index >= max_array_sz) goto out; You don't need a goto, you can just return NUMA_NO_NODE. updated - if (of_read_number(associativity, 1) >= primary_domain_index) - nid = of_read_number(&associativity[primary_domain_index], 1); + nid = of_read_number(&associativity[index], 1); /* POWER4 LPAR uses 0x as invalid node */ if (nid == 0x || nid >= nr_node_ids) @@ -191,6 +191,17 @@ static int associativity_to_nid(const __be32 *associativity) out: return nid; } +/* + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA + * info is found. 
+ */ +static int associativity_to_nid(const __be32 *associativity) +{ + int array_sz = of_read_number(associativity, 1); + + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); +} static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { @@ -295,24 +306,41 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL(of_node_to_nid); -static void __initialize_form1_numa_distance(const __be32 *associativity) +static void ___initialize_form1_numa_distance(const __be32 *associativity, +int max_array_sz) { int i, nid; if (affinity_form != FORM1_AFFINITY) return; - nid = associativity_to_nid(associativity); + nid = __associativity_to_nid(associativity, max_array_sz); if (nid != NUMA_NO_NODE) { for (i = 0; i < distance_ref_points_depth; i++) { const __be32 *entry; + int index = be32_to_cpu(distance_ref_points[i]) - 1; + + /* +* broken hierarchy, return with broken distance table WARN_ON, maybe? updated +*/ + if (index >= max_array_sz) + return; - entry = &associativity[be32_to_cpu(distance_ref_points[i])]; + entry = &associativity[index]; distance_lookup_table[nid][i] = of_read_number(entry, 1); } } } +static void __initialize_form1_numa_distance(const __be32 *associativity) Do you actually use this in-between wrapper? 
yes used in static void initialize_form1_numa_distance(struct device_node *node) { const __be32 *associativity; associativity = of_get_associativity(node); if (!associativity) return; __initialize_form1_numa_distance(associativity); } +{ + int array_sz; + + array_sz = of_read_number(associativity, 1); + /* Skip the first element in the associativity array */ + ___initialize_form1_numa_distance(associativity + 1, array_sz); +} + static void initialize_form1_numa_distance(struct device_node *node) { const __be32 *associativity; @@ -586,27 +614,18 @@ static int get_nid_and_numa_distance(struct drmem_lmb *lmb) if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + primary_domain_index - 1; - nid = of_read_number(&aa.arrays[index], 1); + const __be32 *associativity; - if (nid == 0x || nid >= nr_node_ids) - nid = default_nid; + index = lmb->aa_index * aa.array_sz; + assoc
Re: [PATCH kernel v2] KVM: PPC: Use arch_get_random_seed_long instead of powernv variant
Alexey Kardashevskiy writes: > The powernv_get_random_long() does not work in nested KVM (which is > pseries) and produces a crash when accessing in_be64(rng->regs) in > powernv_get_random_long(). > > This replaces powernv_get_random_long with the ppc_md machine hook > wrapper. > > Signed-off-by: Alexey Kardashevskiy Reviewed-by: Fabiano Rosas > --- > > Changes: > v2: > * replaces [PATCH kernel] powerpc/powernv: Check if powernv_rng is initialized > > --- > arch/powerpc/kvm/book3s_hv.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index be0cde26f156..ecfd133e0ca8 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -1165,7 +1165,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) > break; > #endif > case H_RANDOM: > - if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) > + if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4])) > ret = H_HARDWARE; > break; > case H_RPT_INVALIDATE:
Re: [PATCH v4 1/2] tty: hvc: pass DMA capable memory to put_chars()
On Fri, Aug 6, 2021 at 5:01 AM Xianting Tian wrote: > @@ -163,6 +155,13 @@ static void hvc_console_print(struct console *co, const > char *b, > if (vtermnos[index] == -1) > return; > > + list_for_each_entry(hp, &hvc_structs, next) > + if (hp->vtermno == vtermnos[index]) > + break; > + > + c = hp->c; > + > + spin_lock_irqsave(&hp->c_lock, flags); The loop looks like it might race against changes to the list. It seems strange that the print function has to actually search for the structure here. It may be better to have yet another array for the buffer pointers next to the cons_ops[] and vtermnos[] arrays. > +/* > + * These sizes are most efficient for vio, because they are the > + * native transfer size. We could make them selectable in the > + * future to better deal with backends that want other buffer sizes. > + */ > +#define N_OUTBUF 16 > +#define N_INBUF16 > + > +#define __ALIGNED__ __attribute__((__aligned__(sizeof(long I think you need a higher alignment for DMA buffers, instead of sizeof(long), I would suggest ARCH_DMA_MINALIGN. Arnd
Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
On Fri, Aug 06, 2021 at 04:53:14PM +1000, Michael Ellerman wrote: > But I'm not sure about the use of anonymous unions in UAPI headers. Old > compilers don't support them, so there's a risk of breakage. More precisely, it exists only since C11, so even with all not-so-ancient compilers it will not work if the user uses (say) -std=c99, which still is popular. > I'd rather we didn't touch the uapi version. Yeah. > > - err = ___do_page_fault(regs, regs->dar, regs->dsisr); > > + if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > > + err = ___do_page_fault(regs, regs->dar, regs->esr); > > + else > > + err = ___do_page_fault(regs, regs->dar, regs->dsisr); > > As Christophe said, I don't thinks this is an improvement. > > It makes the code less readable. If anyone is confused about what is > passed to ___do_page_fault() they can either read the comment above it, > or look at the definition of pt_regs to see that esr and dsisr share > storage. Esp. since the affected platforms are legacy, yup. Segher
[PATCH v6 1/2] KVM: PPC: Book3S HV: Sanitise vcpu registers in nested path
As one of the arguments of the H_ENTER_NESTED hypercall, the nested hypervisor (L1) prepares a structure containing the values of various hypervisor-privileged registers with which it wants the nested guest (L2) to run. Since the nested HV runs in supervisor mode it needs the host to write to these registers. To stop a nested HV manipulating this mechanism and using a nested guest as a proxy to access a facility that has been made unavailable to it, we have a routine that sanitises the values of the HV registers before copying them into the nested guest's vcpu struct. However, when coming out of the guest the values are copied as they were back into L1 memory, which means that any sanitisation we did during guest entry will be exposed to L1 after H_ENTER_NESTED returns. This patch alters this sanitisation to have effect on the vcpu->arch registers directly before entering and after exiting the guest, leaving the structure that is copied back into L1 unchanged (except when we really want L1 to access the value, e.g the Cause bits of HFSCR). 
Signed-off-by: Fabiano Rosas Reviewed-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv_nested.c | 94 ++--- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 898f942eb198..1823674d46ef 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -105,7 +105,6 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, struct kvmppc_vcore *vc = vcpu->arch.vcore; hr->dpdes = vc->dpdes; - hr->hfscr = vcpu->arch.hfscr; hr->purr = vcpu->arch.purr; hr->spurr = vcpu->arch.spurr; hr->ic = vcpu->arch.ic; @@ -128,55 +127,17 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, case BOOK3S_INTERRUPT_H_INST_STORAGE: hr->asdr = vcpu->arch.fault_gpa; break; + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) | +(HFSCR_INTR_CAUSE & vcpu->arch.hfscr)); + break; case BOOK3S_INTERRUPT_H_EMUL_ASSIST: hr->heir = vcpu->arch.emul_inst; break; } } -/* - * This can result in some L0 HV register state being leaked to an L1 - * hypervisor when the hv_guest_state is copied back to the guest after - * being modified here. - * - * There is no known problem with such a leak, and in many cases these - * register settings could be derived by the guest by observing behaviour - * and timing, interrupts, etc., but it is an issue to consider. - */ -static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) -{ - struct kvmppc_vcore *vc = vcpu->arch.vcore; - u64 mask; - - /* -* Don't let L1 change LPCR bits for the L2 except these: -*/ - mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | - LPCR_LPES | LPCR_MER; - - /* -* Additional filtering is required depending on hardware -* and configuration. 
-*/ - hr->lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, - (vc->lpcr & ~mask) | (hr->lpcr & mask)); - - /* -* Don't let L1 enable features for L2 which we've disabled for L1, -* but preserve the interrupt cause field. -*/ - hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); - - /* Don't let data address watchpoint match in hypervisor state */ - hr->dawrx0 &= ~DAWRX_HYP; - hr->dawrx1 &= ~DAWRX_HYP; - - /* Don't let completed instruction address breakpt match in HV state */ - if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) - hr->ciabr &= ~CIABR_PRIV; -} - -static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -288,6 +249,43 @@ static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu, sizeof(struct pt_regs)); } +static void load_l2_hv_regs(struct kvm_vcpu *vcpu, + const struct hv_guest_state *l2_hv, + const struct hv_guest_state *l1_hv, u64 *lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + restore_hv_regs(vcpu, l2_hv); + + /* +* Don't let L1 change LPCR bits for the L2 except these: +*/ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | + LPCR_LPES | LPCR_MER; + + /* +* Additional filtering is required depending on hardware +* and configuration. +*/ + *lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, + (vc->lpcr & ~mask) | (*lpcr & mask)); + + /* +* Don't l
[PATCH v6 2/2] KVM: PPC: Book3S HV: Stop forwarding all HFUs to L1
If the nested hypervisor has no access to a facility because it has been disabled by the host, it should also not be able to see the Hypervisor Facility Unavailable that arises from one of its guests trying to access the facility. This patch turns a HFU that happened in L2 into a Hypervisor Emulation Assistance interrupt and forwards it to L1 for handling. The ones that happened because L1 explicitly disabled the facility for L2 are still let through, along with the corresponding Cause bits in the HFSCR. Signed-off-by: Fabiano Rosas --- arch/powerpc/kvm/book3s_hv.c| 13 + arch/powerpc/kvm/book3s_hv_nested.c | 29 +++-- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 085fb8ecbf68..9123b493c79e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1837,6 +1837,19 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) r = RESUME_HOST; break; } + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + /* +* We might decide later to turn this interrupt into a +* HEAI. Load the last instruction now that we can go +* back into the guest to retry if needed. 
+*/ + r = kvmppc_get_last_inst(vcpu, INST_GENERIC, +&vcpu->arch.emul_inst); + if (r != EMULATE_DONE) + r = RESUME_GUEST; + else + r = RESUME_HOST; + break; default: r = RESUME_HOST; break; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 1823674d46ef..1904697a3132 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -99,7 +99,7 @@ static void byteswap_hv_regs(struct hv_guest_state *hr) hr->dawrx1 = swab64(hr->dawrx1); } -static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, +static void save_hv_return_state(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -118,7 +118,7 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, hr->pidr = vcpu->arch.pid; hr->cfar = vcpu->arch.cfar; hr->ppr = vcpu->arch.ppr; - switch (trap) { + switch (vcpu->arch.trap) { case BOOK3S_INTERRUPT_H_DATA_STORAGE: hr->hdar = vcpu->arch.fault_dar; hr->hdsisr = vcpu->arch.fault_dsisr; @@ -128,9 +128,26 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, hr->asdr = vcpu->arch.fault_gpa; break; case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: - hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) | -(HFSCR_INTR_CAUSE & vcpu->arch.hfscr)); - break; + { + u64 cause = vcpu->arch.hfscr >> 56; + + WARN_ON_ONCE(cause >= BITS_PER_LONG); + + if (!(hr->hfscr & (1UL << cause))) { + hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) | +(HFSCR_INTR_CAUSE & vcpu->arch.hfscr)); + break; + } + + /* +* We have disabled this facility, so it does not +* exist from L1's perspective. Turn it into a +* HEAI. The instruction was already loaded at +* kvmppc_handle_nested_exit(). 
+*/ + vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST; + fallthrough; + } case BOOK3S_INTERRUPT_H_EMUL_ASSIST: hr->heir = vcpu->arch.emul_inst; break; @@ -388,7 +405,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) delta_spurr = vcpu->arch.spurr - l2_hv.spurr; delta_ic = vcpu->arch.ic - l2_hv.ic; delta_vtb = vc->vtb - l2_hv.vtb; - save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv); + save_hv_return_state(vcpu, &l2_hv); /* restore L1 state */ vcpu->arch.nested = NULL; -- 2.29.2
[PATCH v6 0/2] KVM: PPC: Book3S HV: Nested guest state sanitising changes
This series aims to stop contaminating the l2_hv structure with bits that might have come from L1 state. Patch 1 makes l2_hv read-only (mostly). It is now only changed when we explicitly want to pass information to L1. Patch 2 makes sure that L1 is not forwarded HFU interrupts when the host has decided to disable any facilities (theoretical for now, since HFSCR bits are always the same between L1/Ln). Changes since v5: - patch 2 now reads the instruction earlier at the nested exit handler to allow the guest to retry if the load fails. v5: - moved setting of the Cause bits under BOOK3S_INTERRUPT_H_FAC_UNAVAIL. https://lkml.kernel.org/r/20210726201710.2432874-1-faro...@linux.ibm.com v4: - now passing lpcr separately into load_l2_hv_regs to solve the conflict with commit a19b70abc69a ("KVM: PPC: Book3S HV: Nested move LPCR sanitising to sanitise_hv_regs"); - patch 2 now forwards a HEAI instead of injecting a Program. https://lkml.kernel.org/r/2021071240.2384655-1-faro...@linux.ibm.com v3: - removed the sanitise functions; - moved the entry code into a new load_l2_hv_regs and the exit code into the existing save_hv_return_state; - new patch: removes the cause bits when L0 has disabled the corresponding facility. https://lkml.kernel.org/r/20210415230948.3563415-1-faro...@linux.ibm.com v2: - made the change more generic, not only applies to hfscr anymore; - sanitisation is now done directly on the vcpu struct, l2_hv is left unchanged. https://lkml.kernel.org/r/20210406214645.3315819-1-faro...@linux.ibm.com v1: https://lkml.kernel.org/r/20210305231055.2913892-1-faro...@linux.ibm.com Fabiano Rosas (2): KVM: PPC: Book3S HV: Sanitise vcpu registers in nested path KVM: PPC: Book3S HV: Stop forwarding all HFUs to L1 arch/powerpc/kvm/book3s_hv.c| 13 arch/powerpc/kvm/book3s_hv_nested.c | 117 2 files changed, 79 insertions(+), 51 deletions(-) -- 2.29.2
[PATCH] powerpc/mce: check if event info is valid
Check if the event info is valid before printing the event information. When a fwnmi enabled nested kvm guest hits a machine check exception L0 and L2 would generate machine check event info, But L1 would not generate any machine check event info as it won't go through 0x200 vector and prints some unwanted message. To fix this, 'in_use' variable in machine check event info is no more in use, rename it to 'valid' and check if the event information is valid before logging the event information. without this patch L1 would print following message for exceptions encountered in L2, as event structure will be empty in L1. "Machine Check Exception, Unknown event version 0". Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/mce.h | 2 +- arch/powerpc/kernel/mce.c | 7 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..3646f53f228f 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -113,7 +113,7 @@ enum MCE_LinkErrorType { struct machine_check_event { enum MCE_Versionversion:8; - u8 in_use; + u8 valid; enum MCE_Severity severity:8; enum MCE_Initiator initiator:8; enum MCE_ErrorType error_type:8; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..b778394a06b5 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -114,7 +114,7 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->srr0 = nip; mce->srr1 = regs->msr; mce->gpr3 = regs->gpr[3]; - mce->in_use = 1; + mce->valid = 1; mce->cpu = get_paca()->paca_index; /* Mark it recovered if we have handled it and MSR(RI=1). */ @@ -202,7 +202,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) if (mce) *mce = *mc_evt; if (release) - mc_evt->in_use = 0; + mc_evt->valid = 0; ret = 1; } /* Decrement the count to free the slot. 
*/ @@ -413,6 +413,9 @@ void machine_check_print_event_info(struct machine_check_event *evt, "Probable Software error (some chance of hardware cause)", }; + if (!evt->valid) + return; + /* Print things out */ if (evt->version != MCE_V1) { pr_err("Machine Check Exception, Unknown event version %d !\n", -- 2.31.1
Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
On Fri, Aug 6, 2021 at 3:32 PM Christophe Leroy wrote: > > > > Le 06/08/2021 à 05:16, Xiongwei Song a écrit : > > On Thu, Aug 5, 2021 at 6:06 PM Christophe Leroy > > wrote: > >> > >> > >> > >> Le 26/07/2021 à 16:30, sxwj...@me.com a écrit : > >>> From: Xiongwei Song > >>> > >>> Create an anonymous union for dsisr and esr registers, we can reference > >>> esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. > >>> Otherwise, reference dsisr. This makes code more clear. > >> > >> I'm not sure it is worth doing that. > > Why don't we use "esr" as reference manuals mentioned? > > > >> > >> What is the point in doing the following when you know that regs->esr and > >> regs->dsisr are exactly > >> the same: > >> > >> > -err = ___do_page_fault(regs, regs->dar, regs->dsisr); > >> > +if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > >> > +err = ___do_page_fault(regs, regs->dar, regs->esr); > >> > +else > >> > +err = ___do_page_fault(regs, regs->dar, regs->dsisr); > >> > + > > Yes, we can drop this. But it's a bit vague. > > > >> Or even > >> > >> > -int is_write = page_fault_is_write(regs->dsisr); > >> > +unsigned long err_reg; > >> > +int is_write; > >> > + > >> > +if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > >> > +err_reg = regs->esr; > >> > +else > >> > +err_reg = regs->dsisr; > >> > + > >> > +is_write = page_fault_is_write(err_reg); > >> > >> > >> Artificially growing the code for that makes no sense to me. > > > > We can drop this too. > >> > >> > >> To avoid ambiguity, maybe the best would be to rename regs->dsisr to > >> something like regs->sr , so > >> that we know it represents the status register, which is DSISR or ESR > >> depending on the platform. > > > > If so, this would make other people more confused. My consideration is > > to follow what the reference > > manuals represent. > > Maybe then we could rename the fields as regs->dsisr_esr and regs->dar_dear I still prefer my method. 
> > That would be more explicit for everyone. > > The UAPI header however should remain as is because anonymous unions are not > supported by old > compilers as mentioned by Michael. Sure. Will update in v2. > > But nevertheless, there are also situations where what is stored in > regs->dsisr is not what we have > in DSISR register. For instance on an ISI exception, we store a subset of the > content of SRR1 > register into regs->dsisr. Can I think my method has better expansibility here;-)? Let me finish esr and dear first. Thank you for the reminder. Regards, Xiongwei > > Christophe
Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
On Fri, Aug 6, 2021 at 2:53 PM Michael Ellerman wrote: > > sxwj...@me.com writes: > > From: Xiongwei Song > > > > Create an anonymous union for dsisr and esr regsiters, we can reference > > esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. > > Otherwise, reference dsisr. This makes code more clear. > > > > Signed-off-by: Xiongwei Song > > --- > > arch/powerpc/include/asm/ptrace.h | 5 - > > arch/powerpc/include/uapi/asm/ptrace.h | 5 - > > arch/powerpc/kernel/process.c | 2 +- > > arch/powerpc/kernel/ptrace/ptrace.c| 2 ++ > > arch/powerpc/kernel/traps.c| 2 +- > > arch/powerpc/mm/fault.c| 16 ++-- > > arch/powerpc/platforms/44x/machine_check.c | 4 ++-- > > arch/powerpc/platforms/4xx/machine_check.c | 2 +- > > 8 files changed, 29 insertions(+), 9 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/ptrace.h > > b/arch/powerpc/include/asm/ptrace.h > > index 3e5d470a6155..c252d04b1206 100644 > > --- a/arch/powerpc/include/asm/ptrace.h > > +++ b/arch/powerpc/include/asm/ptrace.h > > @@ -44,7 +44,10 @@ struct pt_regs > > #endif > > unsigned long trap; > > unsigned long dar; > > - unsigned long dsisr; > > + union { > > + unsigned long dsisr; > > + unsigned long esr; > > + }; > > I don't mind doing that. > > > unsigned long result; > > }; > > }; > > diff --git a/arch/powerpc/include/uapi/asm/ptrace.h > > b/arch/powerpc/include/uapi/asm/ptrace.h > > index 7004cfea3f5f..e357288b5f34 100644 > > --- a/arch/powerpc/include/uapi/asm/ptrace.h > > +++ b/arch/powerpc/include/uapi/asm/ptrace.h > > @@ -53,7 +53,10 @@ struct pt_regs > > /* N.B. for critical exceptions on 4xx, the dar and dsisr > > fields are overloaded to hold srr0 and srr1. 
*/ > > unsigned long dar; /* Fault registers */ > > - unsigned long dsisr;/* on 4xx/Book-E used for ESR */ > > + union { > > + unsigned long dsisr;/* on Book-S used for DSISR */ > > + unsigned long esr; /* on 4xx/Book-E used for ESR > > */ > > + }; > > unsigned long result; /* Result of a system call */ > > }; > > But I'm not sure about the use of anonymous unions in UAPI headers. Old > compilers don't support them, so there's a risk of breakage. > > I'd rather we didn't touch the uapi version. Ok. Will discard the change. > > > > diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c > > index 185beb290580..f74af8f9133c 100644 > > --- a/arch/powerpc/kernel/process.c > > +++ b/arch/powerpc/kernel/process.c > > @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) > > trap == INTERRUPT_DATA_STORAGE || > > trap == INTERRUPT_ALIGNMENT) { > > if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > > - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, > > regs->dsisr); > > + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, > > regs->esr); > > else > > pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, > > regs->dsisr); > > } > > diff --git a/arch/powerpc/kernel/ptrace/ptrace.c > > b/arch/powerpc/kernel/ptrace/ptrace.c > > index 0a0a33eb0d28..00789ad2c4a3 100644 > > --- a/arch/powerpc/kernel/ptrace/ptrace.c > > +++ b/arch/powerpc/kernel/ptrace/ptrace.c > > @@ -375,6 +375,8 @@ void __init pt_regs_check(void) > >offsetof(struct user_pt_regs, dar)); > > BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != > >offsetof(struct user_pt_regs, dsisr)); > > + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != > > + offsetof(struct user_pt_regs, esr)); > > BUILD_BUG_ON(offsetof(struct pt_regs, result) != > >offsetof(struct user_pt_regs, result)); > > > > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c > > index dfbce527c98e..2164f5705a0b 100644 > > --- a/arch/powerpc/kernel/traps.c > > +++ b/arch/powerpc/kernel/traps.c > > @@ -562,7 +562,7 @@ 
static inline int check_io_access(struct pt_regs *regs) > > #ifdef CONFIG_PPC_ADV_DEBUG_REGS > > /* On 4xx, the reason for the machine check or program exception > > is in the ESR. */ > > -#define get_reason(regs) ((regs)->dsisr) > > +#define get_reason(regs) ((regs)->esr) > > #define REASON_FPESR_FP > > #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) > > #define REASON_PRIVILEGEDESR_PPR > > diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c > > index a8d0ce85d39a..62953d4e7c93 100644 > > --- a/arch/powerpc/mm/fault.c > > +++ b/arch/powerpc/mm/fault.c > > @@ -541,7 +541,11 @@ static __always_inline void __do_page_f
Re: [PATCH] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs
On 6/29/21 3:15 PM, Cédric Le Goater wrote: > On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at > runtime. Today, the IPI is not created for such nodes, and hot-plugged > CPUs use a bogus IPI, which leads to soft lockups. > > We could create the node IPI on demand but it is a bit complex because > this code would be called under bringup_up() and some IRQ locking is > being done. The simplest solution is to create the IPIs for all nodes > at startup. > > Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node") > Cc: sta...@vger.kernel.org # v5.13 > Reported-by: Geetika Moolchandani > Cc: Srikar Dronamraju > Signed-off-by: Cédric Le Goater > --- > > This patch breaks old versions of irqbalance (<= v1.4). Possible nodes > are collected from /sys/devices/system/node/ but CPU-less nodes are > not listed there. When interrupts are scanned, the link representing > the node structure is NULL and segfault occurs. This is an irqbalance regression due to : https://github.com/Irqbalance/irqbalance/pull/172 I will report through an issue. Anyhow, there is a better approach which is to allocate IPIs for all nodes at boot time and do the mapping on demand. Removing the mapping on last use seems more complex though. I will send a v2 after some tests. Thanks, C. > Version 1.7 seems immune. > > --- > arch/powerpc/sysdev/xive/common.c | 4 > 1 file changed, 4 deletions(-) > > diff --git a/arch/powerpc/sysdev/xive/common.c > b/arch/powerpc/sysdev/xive/common.c > index f3b16ed48b05..5d2c58dba57e 100644 > --- a/arch/powerpc/sysdev/xive/common.c > +++ b/arch/powerpc/sysdev/xive/common.c > @@ -1143,10 +1143,6 @@ static int __init xive_request_ipi(void) > struct xive_ipi_desc *xid = &xive_ipis[node]; > struct xive_ipi_alloc_info info = { node }; > > - /* Skip nodes without CPUs */ > - if (cpumask_empty(cpumask_of_node(node))) > - continue; > - > /* >* Map one IPI interrupt per node for all cpus of that node. 
>* Since the HW interrupt number doesn't have any meaning, >
Re: [PATCH printk v1 03/10] kgdb: delay roundup if holding printk cpulock
On Thu, Aug 05, 2021 at 05:52:43AM +0206, John Ogness wrote: > On 2021-08-04, Daniel Thompson wrote: > > On Wed, Aug 04, 2021 at 02:12:22PM +0200, Petr Mladek wrote: > >> On Wed 2021-08-04 12:31:59, Daniel Thompson wrote: > >> > On Tue, Aug 03, 2021 at 05:36:32PM +0206, John Ogness wrote: > >> > > On 2021-08-03, Daniel Thompson wrote: > >> > > > On Tue, Aug 03, 2021 at 03:18:54PM +0206, John Ogness wrote: > >> > > >> kgdb makes use of its own cpulock (@dbg_master_lock, @kgdb_active) > >> > > >> during cpu roundup. This will conflict with the printk cpulock. > >> > > > > >> > > > When the full vision is realized what will be the purpose of the > >> > > > printk > >> > > > cpulock? > >> > > > > >> > > > I'm asking largely because it's current role is actively unhelpful > >> > > > w.r.t. kdb. It is possible that cautious use of in_dbg_master() might > >> > > > be a better (and safer) solution. However it sounds like there is a > >> > > > larger role planned for the printk cpulock... > >> > > > >> > > The printk cpulock is used as a synchronization mechanism for > >> > > implementing atomic consoles, which need to be able to safely interrupt > >> > > the console write() activity at any time and immediately continue with > >> > > their own printing. The ultimate goal is to move all console printing > >> > > into per-console dedicated kthreads, so the primary function of the > >> > > printk cpulock is really to immediately _stop_ the CPU/kthread > >> > > performing write() in order to allow write_atomic() (from any context > >> > > on > >> > > any CPU) to safely and reliably take over. > >> > > >> > I see. > >> > > >> > Is there any mileage in allowing in_dbg_master() to suppress taking > >> > the console lock? > >> > > >> > There's a couple of reasons to worry about the current approach. 
> >> > > >> > The first is that we don't want this code to trigger in the case when > >> > kgdb is enabled and kdb is not since it is only kdb (a self-hosted > >> > debugger) than uses the consoles. This case is relatively trivial to > >> > address since we can rename it kdb_roundup_delay() and alter the way it > >> > is conditionally compiled. > > Well, _I_ want this code to trigger even without kdb. The printk cpulock > is meant to be the innermost locking for the entire kernel. No code is > allowed to block/spin on any kind of lock if holding the printk > cpulock. This is the only way to guarantee the functionality of the > atomic consoles. > > For example, if the kernel were to crash while inside kgdb code, we want > to see the backtrace. That would certainly help me debug any such problems in kgdb ;-) . > Since kgdb _does_ take locks (spinning on @dbg_slave_lock during roundup > and the master's own cpu lock as a retry loop on @dbg_master_lock), > clearly it is not allowed to hold the printk cpulock. The simplest > solution I could find was just to make sure kgdb_cpu_enter() isn't > called while holding the printk cpulock. We might have to come back to this. I'm pretty certain your patch does not currently achieve this goal. > >> > The second is more of a problem however. kdb will only call into the > >> > console code from the debug master. By default this is the CPU that > >> > takes the debug trap so initial prints will work fine. However it is > >> > possible to switch to a different master (so we can read per-CPU > >> > registers and things like that). This will result in one of the CPUs > >> > that did the IPI round up calling into console code and this is unsafe > >> > in that instance. > > It is only unsafe if a CPU enters "kgdb/kdb context" while holding the > printk cpulock. That is what I want to prevent. Currently you can preventing this only for CPUs that enter the debugger via an IPI. 
CPUs that enter due to a breakpoint (and there can be more than one at a time) cannot just continue until the lock is dropped since they would end up re-executing the breakpoint instruction. > >> > There are a couple of tricks we could adopt to work around this but > >> > given the slightly odd calling context for kdb (all CPUs quiesced, no > >> > log interleaving possible) it sounds like it would remain safe to > >> > bypass the lock if in_dbg_master() is true. > >> > > >> > Bypassing an inconvenient lock might sound icky but: > >> > > >> > 1. If the lock is not owned by any CPU then what kdb will do is safe. > > No. The printk cpulock exists for low-level synchronization. The atomic > consoles need this synchronization. (For example, the 8250 needs this > for correct tracking of its interrupt register, even for > serial8250_put_poll_char().) What I mean is that because kdb is mono-threaded (even on SMP systems due to the quiescing of other CPUs) then if the lock is not taken when we enter kdb then it is safe for kdb to contend for the lock in the normal way since it cannot deadlock. BTW the synchronization in question is the need for strict nesting, is that right? (e.g. that each context that recursiv
Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
Excerpts from Athira Rajeev's message of August 6, 2021 7:28 pm: > > >> On 26-Jul-2021, at 9:19 AM, Nicholas Piggin wrote: >> >> It can be useful in simulators (with very constrained environments) >> to allow some PMCs to run from boot so they can be sampled directly >> by a test harness, rather than having to run perf. >> >> A previous change freezes counters at boot by default, so provide >> a boot time option to un-freeze (plus a bit more flexibility). >> >> Signed-off-by: Nicholas Piggin >> --- >> .../admin-guide/kernel-parameters.txt | 7 >> arch/powerpc/perf/core-book3s.c | 35 +++ >> 2 files changed, 42 insertions(+) >> >> diff --git a/Documentation/admin-guide/kernel-parameters.txt >> b/Documentation/admin-guide/kernel-parameters.txt >> index bdb22006f713..96b7d0ebaa40 100644 >> --- a/Documentation/admin-guide/kernel-parameters.txt >> +++ b/Documentation/admin-guide/kernel-parameters.txt >> @@ -4089,6 +4089,13 @@ >> Override pmtimer IOPort with a hex value. >> e.g. pmtmr=0x508 >> >> +pmu=[PPC] Manually enable the PMU. >> +Enable the PMU by setting MMCR0 to 0 (clear FC bit). >> +This option is implemented for Book3S processors. >> +If a number is given, then MMCR1 is set to that number, >> +otherwise (e.g., 'pmu=on'), it is left 0. The perf >> +subsystem is disabled if this option is used. >> + >> pm_debug_messages [SUSPEND,KNL] >> Enable suspend/resume debug messages during boot up. 
>> >> diff --git a/arch/powerpc/perf/core-book3s.c >> b/arch/powerpc/perf/core-book3s.c >> index 65795cadb475..e7cef4fe17d7 100644 >> --- a/arch/powerpc/perf/core-book3s.c >> +++ b/arch/powerpc/perf/core-book3s.c >> @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu) >> } >> >> #ifdef CONFIG_PPC64 >> +static bool pmu_override = false; >> +static unsigned long pmu_override_val; >> +static void do_pmu_override(void *data) >> +{ >> +ppc_set_pmu_inuse(1); >> +if (pmu_override_val) >> +mtspr(SPRN_MMCR1, pmu_override_val); >> +mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC); > > Hi Nick > > Here, we are not doing any validity check for the value used to set MMCR1. > For advanced users, the option to pass value for MMCR1 is fine. But other > cases, it could result in > invalid event getting used. Do we need to restrict this boot time option for > only PMC5/6 ? Depends what would be useful. We don't have to prevent the admin shooting themselves in the foot with options like this, but if we can make it safer without making it less useful then that's always a good option. Thanks, Nick
Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
Excerpts from Madhavan Srinivasan's message of August 6, 2021 5:33 pm: > > On 7/26/21 9:19 AM, Nicholas Piggin wrote: >> It can be useful in simulators (with very constrained environments) >> to allow some PMCs to run from boot so they can be sampled directly >> by a test harness, rather than having to run perf. >> >> A previous change freezes counters at boot by default, so provide >> a boot time option to un-freeze (plus a bit more flexibility). >> >> Signed-off-by: Nicholas Piggin >> --- >> .../admin-guide/kernel-parameters.txt | 7 >> arch/powerpc/perf/core-book3s.c | 35 +++ >> 2 files changed, 42 insertions(+) >> >> diff --git a/Documentation/admin-guide/kernel-parameters.txt >> b/Documentation/admin-guide/kernel-parameters.txt >> index bdb22006f713..96b7d0ebaa40 100644 >> --- a/Documentation/admin-guide/kernel-parameters.txt >> +++ b/Documentation/admin-guide/kernel-parameters.txt >> @@ -4089,6 +4089,13 @@ >> Override pmtimer IOPort with a hex value. >> e.g. pmtmr=0x508 >> >> +pmu=[PPC] Manually enable the PMU. > > > This is bit confusing, IIUC, we are manually disabling the perf > registration > with this option and not pmu. > If this option is used, we will unfreeze the > MMCR0_FC (only in the HV_mode) and not register perf subsystem. With the previous patch, this option un-freezes the PMU (and disables perf). > Since this option is valid only for HV_mode, canwe call it > kvm_disable_perf or kvm_dis_perf. It's only disabled for guests because it would require a bit of logic to set pmcregs_in_use when we register our lppaca. We could add that if needed, but the intention is for use on BML, not exactly KVM specific. I can add HV restriction to the help text. And we could rename the option. free_run_pmu= or something? Thanks, Nick > > >> +Enable the PMU by setting MMCR0 to 0 (clear FC bit). >> +This option is implemented for Book3S processors. >> +If a number is given, then MMCR1 is set to that number, >> +otherwise (e.g., 'pmu=on'), it is left 0. 
The perf >> +subsystem is disabled if this option is used. >> + >> pm_debug_messages [SUSPEND,KNL] >> Enable suspend/resume debug messages during boot up. >> >> diff --git a/arch/powerpc/perf/core-book3s.c >> b/arch/powerpc/perf/core-book3s.c >> index 65795cadb475..e7cef4fe17d7 100644 >> --- a/arch/powerpc/perf/core-book3s.c >> +++ b/arch/powerpc/perf/core-book3s.c >> @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu) >> } >> >> #ifdef CONFIG_PPC64 >> +static bool pmu_override = false; >> +static unsigned long pmu_override_val; >> +static void do_pmu_override(void *data) >> +{ >> +ppc_set_pmu_inuse(1); >> +if (pmu_override_val) >> +mtspr(SPRN_MMCR1, pmu_override_val); >> +mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC); >> +} >> + >> static int __init init_ppc64_pmu(void) >> { >> +if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) { >> +printk(KERN_WARNING "perf: disabling perf due to pmu= command >> line option.\n"); >> +on_each_cpu(do_pmu_override, NULL, 1); >> +return 0; >> +} >> + >> /* run through all the pmu drivers one at a time */ >> if (!init_power5_pmu()) >> return 0; >> @@ -2451,4 +2467,23 @@ static int __init init_ppc64_pmu(void) >> return init_generic_compat_pmu(); >> } >> early_initcall(init_ppc64_pmu); >> + >> +static int __init pmu_setup(char *str) >> +{ >> +unsigned long val; >> + >> +if (!early_cpu_has_feature(CPU_FTR_HVMODE)) >> +return 0; >> + >> +pmu_override = true; >> + >> +if (kstrtoul(str, 0, &val)) >> +val = 0; >> + >> +pmu_override_val = val; >> + >> +return 1; >> +} >> +__setup("pmu=", pmu_setup); >> + >> #endif >
Re: [PATCH v1 14/55] KVM: PPC: Book3S HV: Don't always save PMU for guest capable of nesting
Excerpts from Michael Ellerman's message of August 6, 2021 5:34 pm: > Nicholas Piggin writes: >> Revert the workaround added by commit 63279eeb7f93a ("KVM: PPC: Book3S >> HV: Always save guest pmu for guest capable of nesting"). >> >> Nested capable guests running with the earlier commit ("KVM: PPC: Book3S >> HV Nested: Indicate guest PMU in-use in VPA") will now indicate the PMU >> in-use status of their guests, which means the parent does not need to >> unconditionally save the PMU for nested capable guests. >> >> This will cause the PMU to break for nested guests when running older >> nested hypervisor guests under a kernel with this change. It's unclear >> there's an easy way to avoid that, so this could wait for a release or >> so for the fix to filter into stable kernels. > > I'm not sure PMU inside nested guests is getting much use, but I don't > think we can break this quite so casually :) > > Especially as the failure mode will be PMU counts that don't match > reality, which is hard to diagnose. It took nearly a year for us to find > the original bug. > > I think we need to hold this back for a while. > > We could put it under a CONFIG option, and then flip that option to off > at some point in the future. Okay that might be a good compromise, I'll do that. Thanks, Nick
Re: [PATCH v1 11/55] powerpc/time: add API for KVM to re-arm the host timer/decrementer
Excerpts from Christophe Leroy's message of August 5, 2021 5:22 pm: > > > Le 26/07/2021 à 05:49, Nicholas Piggin a écrit : >> Rather than have KVM look up the host timer and fiddle with the >> irq-work internal details, have the powerpc/time.c code provide a >> function for KVM to re-arm the Linux timer code when exiting a >> guest. >> >> This is implementation has an improvement over existing code of >> marking a decrementer interrupt as soft-pending if a timer has >> expired, rather than setting DEC to a -ve value, which tended to >> cause host timers to take two interrupts (first hdec to exit the >> guest, then the immediate dec). >> >> Signed-off-by: Nicholas Piggin >> --- >> arch/powerpc/include/asm/time.h | 16 +++--- >> arch/powerpc/kernel/time.c | 52 +++-- >> arch/powerpc/kvm/book3s_hv.c| 7 ++--- >> 3 files changed, 49 insertions(+), 26 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/time.h >> b/arch/powerpc/include/asm/time.h >> index 69b6be617772..924b2157882f 100644 >> --- a/arch/powerpc/include/asm/time.h >> +++ b/arch/powerpc/include/asm/time.h >> @@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 >> dividend_low, >> extern void secondary_cpu_time_init(void); >> extern void __init time_init(void); >> >> -#ifdef CONFIG_PPC64 >> -static inline unsigned long test_irq_work_pending(void) >> -{ >> -unsigned long x; >> - >> -asm volatile("lbz %0,%1(13)" >> -: "=r" (x) >> -: "i" (offsetof(struct paca_struct, irq_work_pending))); >> -return x; >> -} >> -#endif >> - >> DECLARE_PER_CPU(u64, decrementers_next_tb); >> >> static inline u64 timer_get_next_tb(void) >> @@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void) >> return __this_cpu_read(decrementers_next_tb); >> } >> >> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE >> +void timer_rearm_host_dec(u64 now); >> +#endif >> + >> /* Convert timebase ticks to nanoseconds */ >> unsigned long long tb_to_ns(unsigned long long tb_ticks); >> >> diff --git a/arch/powerpc/kernel/time.c 
b/arch/powerpc/kernel/time.c >> index 72d872b49167..016828b7401b 100644 >> --- a/arch/powerpc/kernel/time.c >> +++ b/arch/powerpc/kernel/time.c >> @@ -499,6 +499,16 @@ EXPORT_SYMBOL(profile_pc); >>* 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... >>*/ >> #ifdef CONFIG_PPC64 >> +static inline unsigned long test_irq_work_pending(void) >> +{ >> +unsigned long x; >> + >> +asm volatile("lbz %0,%1(13)" >> +: "=r" (x) >> +: "i" (offsetof(struct paca_struct, irq_work_pending))); > > Can we just use READ_ONCE() instead of hard coding the read ? Good question, probably yes. Probably calls for its own patch series, e.g., hw_irq.h has all similar paca accesses. Thanks, Nick
Re: [PATCH v1 02/55] KVM: PPC: Book3S HV P9: Fixes for TM softpatch interrupt
Excerpts from Michael Ellerman's message of August 6, 2021 11:16 am: > Nicholas Piggin writes: >> The softpatch interrupt sets HSRR0 to the faulting instruction +4, so >> it should subtract 4 for the faulting instruction address. Also have it >> emulate and deliver HFAC interrupts correctly, which is important for >> nested HV and facility demand-faulting in future. > > The nip being off by 4 sounds bad. But I guess it's not that big a deal > because it's only used for reporting the instruction address? Yeah currently I think so. It's not that bad of a bug. > > Would also be good to have some more explanation of why it's OK to > change from illegal to HFAC, which is a guest visible change. Good point. Again for now it doesn't really matter because the HFAC handler turns everything (except msgsndp) into a sigill anyway, so becomes important when we start using HFACs. Put that way I'll probably split it out. > >> diff --git a/arch/powerpc/kvm/book3s_hv_tm.c >> b/arch/powerpc/kvm/book3s_hv_tm.c >> index cc90b8b82329..e4fd4a9dee08 100644 >> --- a/arch/powerpc/kvm/book3s_hv_tm.c >> +++ b/arch/powerpc/kvm/book3s_hv_tm.c >> @@ -74,19 +74,23 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) >> case PPC_INST_RFEBB: >> if ((msr & MSR_PR) && (vcpu->arch.vcore->pcr & PCR_ARCH_206)) { >> /* generate an illegal instruction interrupt */ >> +vcpu->arch.regs.nip -= 4; >> kvmppc_core_queue_program(vcpu, SRR1_PROGILL); >> return RESUME_GUEST; >> } >> /* check EBB facility is available */ >> if (!(vcpu->arch.hfscr & HFSCR_EBB)) { >> -/* generate an illegal instruction interrupt */ >> -kvmppc_core_queue_program(vcpu, SRR1_PROGILL); >> -return RESUME_GUEST; >> +vcpu->arch.regs.nip -= 4; >> +vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; >> +vcpu->arch.hfscr |= (u64)FSCR_EBB_LG << 56; >> +vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; >> +return -1; /* rerun host interrupt handler */ > > This is EBB not TM. Probably OK to leave it in this patch as long as > it's mentioned in the change log? 
It is, but you can get a softpatch interrupt on rfebb changing TM state. Although I haven't actually tested to see if you get a softpatch when HFSCR disables EBB or the hardware just gives you the HFAC. For that matter, same for all the other facility tests. Thanks, Nick > >> } >> if ((msr & MSR_PR) && !(vcpu->arch.fscr & FSCR_EBB)) { >> /* generate a facility unavailable interrupt */ >> -vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | >> -((u64)FSCR_EBB_LG << 56); >> +vcpu->arch.regs.nip -= 4; >> +vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; >> +vcpu->arch.fscr |= (u64)FSCR_EBB_LG << 56; > > Same. > > > cheers >
Re: [PATCH v2 2/3] KVM: PPC: Book3S HV: Add sanity check to copy_tofrom_guest
Excerpts from Fabiano Rosas's message of August 6, 2021 7:26 am: > Both paths into __kvmhv_copy_tofrom_guest_radix ensure that we arrive > with an effective address that is smaller than our total addressable > space and addresses quadrant 0. > > - The H_COPY_TOFROM_GUEST hypercall path rejects the call with > H_PARAMETER if the effective address has any of the twelve most > significant bits set. > > - The kvmhv_copy_tofrom_guest_radix path clears the top twelve bits > before calling the internal function. > > Although the callers make sure that the effective address is sane, any > future use of the function is exposed to a programming error, so add a > sanity check. We possibly should put these into #defines in radix pgtable headers somewhere but KVM already open codes them so this is good for now. Reviewed-by: Nicholas Piggin > > Suggested-by: Nicholas Piggin > Signed-off-by: Fabiano Rosas > --- > arch/powerpc/kvm/book3s_64_mmu_radix.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c > b/arch/powerpc/kvm/book3s_64_mmu_radix.c > index 44eb7b1ef289..1b1c9e9e539b 100644 > --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c > +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c > @@ -44,6 +44,9 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int > pid, > (to != NULL) ? __pa(to): 0, > (from != NULL) ? __pa(from): 0, n); > > + if (eaddr & (0xFFFUL << 52)) > + return ret; > + > quadrant = 1; > if (!pid) > quadrant = 2; > -- > 2.29.2 > >
Re: [PATCH v2 1/3] KVM: PPC: Book3S HV: Fix copy_tofrom_guest routines
Excerpts from Fabiano Rosas's message of August 6, 2021 7:26 am: > The __kvmhv_copy_tofrom_guest_radix function was introduced along with > nested HV guest support. It uses the platform's Radix MMU quadrants to > provide a nested hypervisor with fast access to its nested guests > memory (H_COPY_TOFROM_GUEST hypercall). It has also since been added > as a fast path for the kvmppc_ld/st routines which are used during > instruction emulation. > > The commit def0bfdbd603 ("powerpc: use probe_user_read() and > probe_user_write()") changed the low level copy function from > raw_copy_from_user to probe_user_read, which adds a check to > access_ok. In powerpc that is: > > static inline bool __access_ok(unsigned long addr, unsigned long size) > { > return addr < TASK_SIZE_MAX && size <= TASK_SIZE_MAX - addr; > } > > and TASK_SIZE_MAX is 0x0010UL for 64-bit, which means that > setting the two MSBs of the effective address (which correspond to the > quadrant) now cause access_ok to reject the access. > > This was not caught earlier because the most common code path via > kvmppc_ld/st contains a fallback (kvm_read_guest) that is likely to > succeed for L1 guests. For nested guests there is no fallback. > > Another issue is that probe_user_read (now __copy_from_user_nofault) > does not return the number of bytes not copied in case of failure, so > the destination memory is not being cleared anymore in > kvmhv_copy_from_guest_radix: > > ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n); > if (ret > 0)<-- always false! > memset(to + (n - ret), 0, ret); > > This patch fixes both issues by skipping access_ok and open-coding the > low level __copy_to/from_user_inatomic. 
> > Fixes: def0bfdbd603 ("powerpc: use probe_user_read() and probe_user_write()") Reviewed-by: Nicholas Piggin > Signed-off-by: Fabiano Rosas > --- > arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 -- > 1 file changed, 4 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c > b/arch/powerpc/kvm/book3s_64_mmu_radix.c > index b5905ae4377c..44eb7b1ef289 100644 > --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c > +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c > @@ -65,10 +65,12 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, > int pid, > } > isync(); > > + pagefault_disable(); > if (is_load) > - ret = copy_from_user_nofault(to, (const void __user *)from, n); > + ret = __copy_from_user_inatomic(to, (const void __user *)from, > n); > else > - ret = copy_to_user_nofault((void __user *)to, from, n); > + ret = __copy_to_user_inatomic((void __user *)to, from, n); > + pagefault_enable(); > > /* switch the pid first to avoid running host with unallocated pid */ > if (quadrant == 1 && pid != old_pid) > -- > 2.29.2 > >
Re: Debian SID kernel doesn't boot on PowerBook 3400c
Le 06/08/2021 à 11:43, Finn Thain a écrit : On Fri, 6 Aug 2021, Christophe Leroy wrote: Can you check if they DO NOT happen at preceding commit c16728835~ $ git checkout c16728835~ Previous HEAD position was c16728835eec powerpc/32: Manage KUAP in C HEAD is now at 0b45359aa2df powerpc/8xx: Create C version of kuap save/restore/check helpers $ git am ../message.mbox warning: Patch sent with format=flowed; space at the end of lines might be lost. Applying: powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE $ cp ../dot-config-powermac-5.13 .config $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux Linux version 5.12.0-rc3-pmac-00077-gc9f6e8dd045 3) PB 3400c Hangs at boot (Mac OS screen) 4) Wallstreet X fails, errors in console log (different than test 2), see Wallstreet_console-2.txt. This log shows that the errors "xfce4-session[1775]: bus error (7)" and "kernel BUG at arch/powerpc/kernel/interrupt.c:49!" happen prior to commit c16728835eec ("powerpc/32: Manage KUAP in C"). As mentionned by Nic, this is due to r11 being cloberred. For the time being the only r11 clobber identified is the one I have provided a fix for. I'm wondering whether it was applied for all further tests or not. Your fix was applied to this build with "git am ../message.mbox". Ok good. ... Could you test with CONFIG_PPC_KUAP and CONFIG_PPC_KUAP_DEBUG ... $scripts/config -e CONFIG_PPC_KUAP $ scripts/config -e CONFIG_PPC_KUAP_DEBUG $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux $ grep CONFIG_PPC_KUAP .config CONFIG_PPC_KUAP=y CONFIG_PPC_KUAP_DEBUG=y Linux version 5.12.0-rc3-pmac-00078-g5cac2bc3752 9) PB 3400c Hangs at boot (Mac OS screen) 10) Wallstreet X failed at first login, worked at second login, one error in console log ("BUG: Unable to handle kernel instruction fetch"), see Wallstreet_console-5.txt. One might expect to see "Kernel attempted to write user page (b3399774) - exploit attempt?" 
again here (see c16728835eec build above) but instead this log says "Oops: Kernel access of bad area, sig: 11". Maybe the test should be done a second time. As r11 is garbage it may or may not be a user address. If it is a user address the we get "Kernel attempted to write user page". If it is a random kernel address, we likely get "Kernel access of bad area" instead. Your fix was applied here also. Anyway, it would be worth trying to boot a few times more with the same kernel, because as I said the value is random, so it may or may not hit userspace, hence the possible difference of message, either "Kernel attempted to write user page" or "Kernel access of bad area" depending on whether the address is a user address or not. I have cooked a tentative fix for that KUAP stuff. Could you try the branch 'bugtest' at https://github.com/chleroy/linux.git Thanks Christophe
Re: Debian SID kernel doesn't boot on PowerBook 3400c
On Fri, 6 Aug 2021, Christophe Leroy wrote: > > > > > > > > > > Can you check if they DO NOT happen at preceding commit c16728835~ > > > > > > > > > > > $ git checkout c16728835~ > > > Previous HEAD position was c16728835eec powerpc/32: Manage KUAP in C > > > HEAD is now at 0b45359aa2df powerpc/8xx: Create C version of kuap > > > save/restore/check helpers > > > $ git am ../message.mbox > > > warning: Patch sent with format=flowed; space at the end of lines might be > > > lost. > > > Applying: powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE > > > $ cp ../dot-config-powermac-5.13 .config > > > $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean > > > olddefconfig vmlinux > > > > > > Linux version 5.12.0-rc3-pmac-00077-gc9f6e8dd045 > > > > > > 3) PB 3400c > > > Hangs at boot (Mac OS screen) > > > > > > 4) Wallstreet > > > X fails, errors in console log (different than test 2), see > > > Wallstreet_console-2.txt. > > > > > > > This log shows that the errors "xfce4-session[1775]: bus error (7)" and > > "kernel BUG at arch/powerpc/kernel/interrupt.c:49!" happen prior to commit > > c16728835eec ("powerpc/32: Manage KUAP in C"). > > As mentionned by Nic, this is due to r11 being cloberred. For the time being > the only r11 clobber identified is the one I have provided a fix for. I'm > wondering whether it was applied for all further tests or not. > Your fix was applied to this build with "git am ../message.mbox". > ... > > > > > > > > > > > > Could you test with CONFIG_PPC_KUAP and CONFIG_PPC_KUAP_DEBUG > > > ... 
> > > > > > $scripts/config -e CONFIG_PPC_KUAP > > > $ scripts/config -e CONFIG_PPC_KUAP_DEBUG > > > $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean > > > olddefconfig vmlinux > > > $ grep CONFIG_PPC_KUAP .config > > > CONFIG_PPC_KUAP=y > > > CONFIG_PPC_KUAP_DEBUG=y > > > > > > Linux version 5.12.0-rc3-pmac-00078-g5cac2bc3752 > > > > > > 9) PB 3400c > > > Hangs at boot (Mac OS screen) > > > > > > 10) Wallstreet > > > X failed at first login, worked at second login, one error in console > > > log ("BUG: Unable to handle kernel instruction fetch"), see > > > Wallstreet_console-5.txt. > > > > > > > One might expect to see "Kernel attempted to write user page (b3399774) - > > exploit attempt?" again here (see c16728835eec build above) but instead > > this log says "Oops: Kernel access of bad area, sig: 11". > > Maybe the test should be done a second time. As r11 is garbage it may or > may not be a user address. If it is a user address the we get "Kernel > attempted to write user page". If it is a random kernel address, we > likely get "Kernel access of bad area" instead. > Your fix was applied here also.
Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
> On 26-Jul-2021, at 9:19 AM, Nicholas Piggin wrote: > > It can be useful in simulators (with very constrained environments) > to allow some PMCs to run from boot so they can be sampled directly > by a test harness, rather than having to run perf. > > A previous change freezes counters at boot by default, so provide > a boot time option to un-freeze (plus a bit more flexibility). > > Signed-off-by: Nicholas Piggin > --- > .../admin-guide/kernel-parameters.txt | 7 > arch/powerpc/perf/core-book3s.c | 35 +++ > 2 files changed, 42 insertions(+) > > diff --git a/Documentation/admin-guide/kernel-parameters.txt > b/Documentation/admin-guide/kernel-parameters.txt > index bdb22006f713..96b7d0ebaa40 100644 > --- a/Documentation/admin-guide/kernel-parameters.txt > +++ b/Documentation/admin-guide/kernel-parameters.txt > @@ -4089,6 +4089,13 @@ > Override pmtimer IOPort with a hex value. > e.g. pmtmr=0x508 > > + pmu=[PPC] Manually enable the PMU. > + Enable the PMU by setting MMCR0 to 0 (clear FC bit). > + This option is implemented for Book3S processors. > + If a number is given, then MMCR1 is set to that number, > + otherwise (e.g., 'pmu=on'), it is left 0. The perf > + subsystem is disabled if this option is used. > + > pm_debug_messages [SUSPEND,KNL] > Enable suspend/resume debug messages during boot up. > > diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c > index 65795cadb475..e7cef4fe17d7 100644 > --- a/arch/powerpc/perf/core-book3s.c > +++ b/arch/powerpc/perf/core-book3s.c > @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu) > } > > #ifdef CONFIG_PPC64 > +static bool pmu_override = false; > +static unsigned long pmu_override_val; > +static void do_pmu_override(void *data) > +{ > + ppc_set_pmu_inuse(1); > + if (pmu_override_val) > + mtspr(SPRN_MMCR1, pmu_override_val); > + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC); Hi Nick Here, we are not doing any validity check for the value used to set MMCR1. 
For advanced users, the option to pass value for MMCR1 is fine. But other cases, it could result in invalid event getting used. Do we need to restrict this boot time option for only PMC5/6 ? Thanks Athira > +} > + > static int __init init_ppc64_pmu(void) > { > + if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) { > + printk(KERN_WARNING "perf: disabling perf due to pmu= command > line option.\n"); > + on_each_cpu(do_pmu_override, NULL, 1); > + return 0; > + } > + > /* run through all the pmu drivers one at a time */ > if (!init_power5_pmu()) > return 0; > @@ -2451,4 +2467,23 @@ static int __init init_ppc64_pmu(void) > return init_generic_compat_pmu(); > } > early_initcall(init_ppc64_pmu); > + > +static int __init pmu_setup(char *str) > +{ > + unsigned long val; > + > + if (!early_cpu_has_feature(CPU_FTR_HVMODE)) > + return 0; > + > + pmu_override = true; > + > + if (kstrtoul(str, 0, &val)) > + val = 0; > + > + pmu_override_val = val; > + > + return 1; > +} > +__setup("pmu=", pmu_setup); > + > #endif > -- > 2.23.0 >
Re: [PATCH] powerpc/kprobes: Fix kprobe Oops happens in booke
On 2021/8/5 17:51, Christophe Leroy wrote: Le 04/08/2021 à 16:37, Pu Lehui a écrit : When using kprobe on powerpc booke series processor, Oops happens as show bellow: [ 35.861352] Oops: Exception in kernel mode, sig: 5 [#1] [ 35.861676] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500 [ 35.861905] Modules linked in: [ 35.862144] CPU: 0 PID: 76 Comm: sh Not tainted 5.14.0-rc3-00060-g7e96bf476270 #18 [ 35.862610] NIP: c0b96470 LR: c00107b4 CTR: c0161c80 [ 35.862805] REGS: c387fe70 TRAP: 0700 Not tainted (5.14.0-rc3-00060-g7e96bf476270) [ 35.863198] MSR: 00029002 CR: 24022824 XER: 2000 [ 35.863577] [ 35.863577] GPR00: c0015218 c387ff20 c313e300 c387ff50 0004 4002 4000 0a1a2cce [ 35.863577] GPR08: 0004 59764000 24022422 102490c2 [ 35.863577] GPR16: 0040 1024 1024 1024 1024 1022 [ 35.863577] GPR24: 1024 bfc655e8 0800 c387ff50 [ 35.865367] NIP [c0b96470] schedule+0x0/0x130 [ 35.865606] LR [c00107b4] interrupt_exit_user_prepare_main+0xf4/0x100 [ 35.865974] Call Trace: [ 35.866142] [c387ff20] [c0053224] irq_exit+0x114/0x120 (unreliable) [ 35.866472] [c387ff40] [c0015218] interrupt_return+0x14/0x13c [ 35.866728] --- interrupt: 900 at 0x100af3dc [ 35.866963] NIP: 100af3dc LR: 100de020 CTR: [ 35.867177] REGS: c387ff50 TRAP: 0900 Not tainted (5.14.0-rc3-00060-g7e96bf476270) [ 35.867488] MSR: 0002f902 CR: 20022422 XER: 2000 [ 35.867808] [ 35.867808] GPR00: c001509c bfc65570 1024b4d0 100de020 20022422 bfc655a8 100af3dc [ 35.867808] GPR08: 0002f902 72656773 102490c2 [ 35.867808] GPR16: 0040 1024 1024 1024 1024 1022 [ 35.867808] GPR24: 1024 bfc655e8 10245910 0001 [ 35.869406] NIP [100af3dc] 0x100af3dc [ 35.869578] LR [100de020] 0x100de020 [ 35.869751] --- interrupt: 900 [ 35.870001] Instruction dump: [ 35.870283] 40c20010 815e0518 714a0100 41e2fd04 3920 913e00c0 3b1e0450 4bfffd80 [ 35.870666] 0fe0 92a10024 4bfff1a9 6000 <7fe8> 7c0802a6 93e1001c 7c5f1378 [ 35.871339] ---[ end trace 23ff848139efa9b9 ]--- There is no real mode for booke arch and the MMU translation is always on. 
The corresponding MSR_IS/MSR_DS bit in booke is used to switch the address space, but not for real mode judgment. Can you explain more the link between that explanation and the Oops itself ? In fact, the same Oops appears when any probed function is hit, like do_nanosleep / # echo "p:myprobe do_nanosleep" > /sys/kernel/debug/tracing/kprobe_events / # echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable / # sleep 1 [ 50.076730] Oops: Exception in kernel mode, sig: 5 [#1] [ 50.077017] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500 [ 50.077221] Modules linked in: [ 50.077462] CPU: 0 PID: 77 Comm: sleep Not tainted 5.14.0-rc4-00022-g251a1524293d #21 [ 50.077887] NIP: c0b9c4e0 LR: c00ebecc CTR: [ 50.078067] REGS: c3883de0 TRAP: 0700 Not tainted (5.14.0-rc4-00022-g251a1524293d) [ 50.078349] MSR: 00029000 CR: 24000228 XER: 2000 [ 50.078675] [ 50.078675] GPR00: c00ebdf0 c3883e90 c313e300 c3883ea0 0001 c3883ecc 0001 [ 50.078675] GPR08: c100598c c00ea250 0004 24000222 102490c2 bff4180c 101e60d4 [ 50.078675] GPR16: 102454ac 0040 1024 10241100 102410f8 1024 0050 [ 50.078675] GPR24: 0002 c3883ea0 0001 c350 3b9b8d50 [ 50.080151] NIP [c0b9c4e0] do_nanosleep+0x0/0x190 [ 50.080352] LR [c00ebecc] hrtimer_nanosleep+0x14c/0x1e0 [ 50.080638] Call Trace: [ 50.080801] [c3883e90] [c00ebdf0] hrtimer_nanosleep+0x70/0x1e0 (unreliable) [ 50.081110] [c3883f00] [c00ec004] sys_nanosleep_time32+0xa4/0x110 [ 50.081336] [c3883f40] [c001509c] ret_from_syscall+0x0/0x28 [ 50.081541] --- interrupt: c00 at 0x100a4d08 [ 50.081749] NIP: 100a4d08 LR: 101b5234 CTR: 0003 [ 50.081931] REGS: c3883f50 TRAP: 0c00 Not tainted (5.14.0-rc4-00022-g251a1524293d) [ 50.082183] MSR: 0002f902 CR: 24000222 XER: [ 50.082457] [ 50.082457] GPR00: 00a2 bf980040 1024b4d0 bf980084 bf980084 6400 00555345 fefefeff [ 50.082457] GPR08: 7f7f7f7f 101e 0069 0003 28000422 102490c2 bff4180c 101e60d4 [ 50.082457] GPR16: 102454ac 0040 1024 10241100 102410f8 1024 0050 [ 50.082457] GPR24: 0002 bf9803f4 1024 100039e0 102444e8 [ 
50.083789] NIP [100a4d08] 0x100a4d08 [ 50.083917] LR [101b5234] 0x101b5234 [ 50.084042] --- interrupt: c00 [ 50.084238] Instruction dump: [ 50.084483] 4bfffc40 6000 6000 6000 9421fff0 39400402 914200c0 38210010 [ 50.084841] 4bfffc20 <7fe8> 7c0802a6
Re: [PATCH v4] soc: fsl: qe: convert QE interrupt controller to platform_device
On Thu, Aug 5, 2021 at 9:35 PM Maxim Kochetkov wrote: > > 03.08.2021 20:51, Saravana Kannan wrote: > >> So lets convert this driver to simple platform_device with probe(). > >> Also use platform_get_ and devm_ family function to get/allocate > >> resources and drop unused .compatible = "qeic". > > Yes, please! > > Should I totally drop { .type = "qeic"}, or keep? Sorry for the confusion. My "Yes, please"!" was a show of support for switching this to a proper platform driver. Not a response to that specific question. I didn't look at the code/DT close enough to know/care about the "type" part. -Saravana
Re: [PATCH v4] soc: fsl: qe: convert QE interrupt controller to platform_device
03.08.2021 20:51, Saravana Kannan wrote: So lets convert this driver to simple platform_device with probe(). Also use platform_get_ and devm_ family function to get/allocate resources and drop unused .compatible = "qeic". Yes, please! Should I totally drop { .type = "qeic"}, or keep?
Re: [RFC PATCH] powerpc/book3s64/radix: Upgrade va tlbie to PID tlbie if we cross PMD_SIZE
> With shared mapping, even though we are unmapping a large range, the kernel > will force a TLB flush with ptl lock held to avoid the race mentioned in > commit 1cf35d47712d ("mm: split 'tlb_flush_mmu()' into tlb flushing and memory freeing parts") > This results in the kernel issuing a high number of TLB flushes even for a large > range. This can be improved by making sure the kernel switch to pid based flush if the > kernel is unmapping a 2M range. > > Signed-off-by: Aneesh Kumar K.V > --- > arch/powerpc/mm/book3s64/radix_tlb.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c > index aefc100d79a7..21d0f098e43b 100644 > --- a/arch/powerpc/mm/book3s64/radix_tlb.c > +++ b/arch/powerpc/mm/book3s64/radix_tlb.c > @@ -1106,7 +1106,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); > * invalidating a full PID, so it has a far lower threshold to change from > * individual page flushes to full-pid flushes. > */ > -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; > +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 32; > static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; > > static inline void __radix__flush_tlb_range(struct mm_struct *mm, > @@ -1133,7 +1133,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, > if (fullmm) > flush_pid = true; > else if (type == FLUSH_TYPE_GLOBAL) > - flush_pid = nr_pages > tlb_single_page_flush_ceiling; > + flush_pid = nr_pages >= tlb_single_page_flush_ceiling; > else > flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; Additional details on the test environment. This was tested on a 2 Node/8 socket Power10 system. The LPAR had 105 cores and the LPAR spanned across all the sockets. 
# perf stat -I 1000 -a -e cycles,instructions -e "{cpu/config=0x030008,name=PM_EXEC_STALL/}" -e "{cpu/config=0x02E01C,name=PM_EXEC_STALL_TLBIE/}" ./tlbie -i 10 -c 1 -t 1 Rate of work: = 176 # time counts unit events 1.029206442 4198594519 cycles 1.029206442 2458254252 instructions # 0.59 insn per cycle 1.029206442 3004031488 PM_EXEC_STALL 1.029206442 1798186036 PM_EXEC_STALL_TLBIE Rate of work: = 181 2.054288539 4183883450 cycles 2.054288539 2472178171 instructions # 0.59 insn per cycle 2.054288539 3014609313 PM_EXEC_STALL 2.054288539 1797851642 PM_EXEC_STALL_TLBIE Rate of work: = 180 3.078306883 4171250717 cycles 3.078306883 2468341094 instructions # 0.59 insn per cycle 3.078306883 2993036205 PM_EXEC_STALL 3.078306883 1798181890 PM_EXEC_STALL_TLBIE . . # cat /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling 34 # echo 32 > /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling # perf stat -I 1000 -a -e cycles,instructions -e "{cpu/config=0x030008,name=PM_EXEC_STALL/}" -e "{cpu/config=0x02E01C,name=PM_EXEC_STALL_TLBIE/}" ./tlbie -i 10 -c 1 -t 1 Rate of work: = 313 # time counts unit events 1.030310506 4206071143 cycles 1.030310506 4314716958 instructions # 1.03 insn per cycle 1.030310506 2157762167 PM_EXEC_STALL 1.030310506 110825573 PM_EXEC_STALL_TLBIE Rate of work: = 322 2.056034068 4331745630 cycles 2.056034068 4531658304 instructions # 1.05 insn per cycle 2.056034068 2288971361 PM_EXEC_STALL 2.056034068 111267927 PM_EXEC_STALL_TLBIE Rate of work: = 321 3.081216434 4327050349 cycles 3.081216434 4379679508 instructions # 1.01 insn per cycle 3.081216434 2252602550 PM_EXEC_STALL 3.081216434 110974887 PM_EXEC_STALL_TLBIE . . Regards, Puvichakravarthy Ramachandran
Re: [PATCH v2 0/6] PCI: Drop duplicated tracking of a pci_dev's bound driver
Hello Bjorn, On Thu, Aug 05, 2021 at 06:42:34PM -0500, Bjorn Helgaas wrote: > On Tue, Aug 03, 2021 at 12:01:44PM +0200, Uwe Kleine-König wrote: > > Hello, > > > > changes since v1 > > (https://lore.kernel.org/linux-pci/20210729203740.1377045-1-u.kleine-koe...@pengutronix.de): > > > > - New patch to simplify drivers/pci/xen-pcifront.c, spotted and > > suggested by Boris Ostrovsky > > - Fix a possible NULL pointer dereference I introduced in xen-pcifront.c > > - A few whitespace improvements > > - Add a commit log to patch #6 (formerly #5) > > > > I also expanded the audience for patches #4 and #6 to allow affected > > people to actually see the changes to their drivers. > > > > Interdiff can be found below. > > > > The idea is still the same: After a few cleanups (#1 - #3) a new macro > > is introduced abstracting access to struct pci_dev->driver. All users > > are then converted to use this and in the last patch the macro is > > changed to make use of struct pci_dev::dev->driver to get rid of the > > duplicated tracking. > > I love the idea of this series! \o/ > I looked at all the bus_type.probe() methods, it looks like pci_dev is > not the only offender here. At least the following also have a driver > pointer in the device struct: > > parisc_device.driver > acpi_device.driver > dio_dev.driver > hid_device.driver > pci_dev.driver > pnp_dev.driver > rio_dev.driver > zorro_dev.driver Right, when I converted zorro_dev it was pointed out that the code was copied from pci and the latter has the same construct. :-) See https://lore.kernel.org/r/20210730191035.1455248-5-u.kleine-koe...@pengutronix.de for the patch, I don't find where pci was pointed out, maybe it was on irc only. > Do you plan to do the same for all of them, or is there some reason > why they need the pointer and PCI doesn't? There is a list of cleanup stuff I intend to work on. 
Considering how working on that list only made it longer in the recent past, maybe it makes more sense to not work on it :-) > In almost all cases, other buses define a "to__driver()" > interface. In fact, PCI already has a to_pci_driver(). > > This series adds pci_driver_of_dev(), which basically just means we > can do this: > > pdrv = pci_driver_of_dev(pdev); > > instead of this: > > pdrv = to_pci_driver(pdev->dev.driver); > > I don't see any other "_driver_of_dev()" interfaces, so I assume > other buses just live with the latter style? I'd rather not be > different and have two ways to get the "struct pci_driver *" unless > there's a good reason. Among the few busses I already fixed in this regard pci was the first that has a considerable amount of usage. So I considered it worth giving it a name. > Looking through the places that care about pci_dev.driver (the ones > updated by patch 5/6), many of them are ... a little dubious to begin > with. A few need the "struct pci_error_handlers *err_handler" > pointer, so that's probably legitimate. But many just need a name, > and should probably be using dev_driver_string() instead. Yeah, I considered adding a function to get the driver name from a pci_dev and a function to get the error handlers. Maybe it's an idea to introduce these two and then use to_pci_driver(pdev->dev.driver) for the few remaining users? Maybe doing that on top of my current series makes sense to have a clean switch from pdev->driver to pdev->dev.driver?! Best regards Uwe -- Pengutronix e.K. | Uwe Kleine-König| Industrial Linux Solutions | https://www.pengutronix.de/ | signature.asc Description: PGP signature
Re: [RFC PATCH] powerpc/book3s64/radix: Upgrade va tlbie to PID tlbie if we cross PMD_SIZE
> With shared mapping, even though we are unmapping a large range, the kernel > will force a TLB flush with ptl lock held to avoid the race mentioned in > commit 1cf35d47712d ("mm: split 'tlb_flush_mmu()' into tlb flushing and memory freeing parts") > This results in the kernel issuing a high number of TLB flushes even for a large > range. This can be improved by making sure the kernel switch to pid based flush if the > kernel is unmapping a 2M range. > > Signed-off-by: Aneesh Kumar K.V > --- > arch/powerpc/mm/book3s64/radix_tlb.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c > index aefc100d79a7..21d0f098e43b 100644 > --- a/arch/powerpc/mm/book3s64/radix_tlb.c > +++ b/arch/powerpc/mm/book3s64/radix_tlb.c > @@ -1106,7 +1106,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); > * invalidating a full PID, so it has a far lower threshold to change from > * individual page flushes to full-pid flushes. > */ > -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; > +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 32; > static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; > > static inline void __radix__flush_tlb_range(struct mm_struct *mm, > @@ -1133,7 +1133,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, > if (fullmm) > flush_pid = true; > else if (type == FLUSH_TYPE_GLOBAL) > - flush_pid = nr_pages > tlb_single_page_flush_ceiling; > + flush_pid = nr_pages >= tlb_single_page_flush_ceiling; > else > flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; I evaluated the patches from Aneesh with a micro benchmark which does shmat, shmdt of 256 MB segment. Higher the rate of work, better the performance. With a value of 32, we match the performance of GTSE=off. This was evaluated on SLES15 SP3 kernel. 
# cat /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling 32 # perf stat -I 1000 -a -e powerpc:tlbie,r30058 ./tlbie -i 5 -c 1 t 1 Rate of work: = 311 # time counts unit events 1.013131404 50939 powerpc:tlbie 1.013131404 50658 r30058 Rate of work: = 318 2.026957019 51520 powerpc:tlbie 2.026957019 51481 r30058 Rate of work: = 318 3.038884431 51485 powerpc:tlbie 3.038884431 51461 r30058 Rate of work: = 318 4.051483926 51485 powerpc:tlbie 4.051483926 51520 r30058 Rate of work: = 318 5.063635713 48577 powerpc:tlbie 5.063635713 48347 r30058 # echo 34 > /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling # perf stat -I 1000 -a -e powerpc:tlbie,r30058 ./tlbie -i 5 -c 1 t 1 Rate of work: = 174 # time counts unit events 1.012672696 721471 powerpc:tlbie 1.012672696 726491 r30058 Rate of work: = 177 2.026348661 737460 powerpc:tlbie 2.026348661 736138 r30058 Rate of work: = 178 3.037932122 737460 powerpc:tlbie 3.037932122 737460 r30058 Rate of work: = 178 4.050198819 737044 powerpc:tlbie 4.050198819 737460 r30058 Rate of work: = 177 5.062400776 692832 powerpc:tlbie 5.062400776 688319 r30058 Regards, Puvichakravarthy Ramachandran
Re: [PATCH v1 14/55] KVM: PPC: Book3S HV: Don't always save PMU for guest capable of nesting
Nicholas Piggin writes: > Revert the workaround added by commit 63279eeb7f93a ("KVM: PPC: Book3S > HV: Always save guest pmu for guest capable of nesting"). > > Nested capable guests running with the earlier commit ("KVM: PPC: Book3S > HV Nested: Indicate guest PMU in-use in VPA") will now indicate the PMU > in-use status of their guests, which means the parent does not need to > unconditionally save the PMU for nested capable guests. > > This will cause the PMU to break for nested guests when running older > nested hypervisor guests under a kernel with this change. It's unclear > there's an easy way to avoid that, so this could wait for a release or > so for the fix to filter into stable kernels. I'm not sure PMU inside nested guests is getting much use, but I don't think we can break this quite so casually :) Especially as the failure mode will be PMU counts that don't match reality, which is hard to diagnose. It took nearly a year for us to find the original bug. I think we need to hold this back for a while. We could put it under a CONFIG option, and then flip that option to off at some point in the future. cheers > index e7f8cc04944b..ab89db561c85 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -4003,8 +4003,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > vcpu->arch.vpa.dirty = 1; > save_pmu = lp->pmcregs_in_use; > } > - /* Must save pmu if this guest is capable of running nested guests */ > - save_pmu |= nesting_enabled(vcpu->kvm); > > kvmhv_save_guest_pmu(vcpu, save_pmu); > #ifdef CONFIG_PPC_PSERIES > -- > 2.23.0
Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
On 7/26/21 9:19 AM, Nicholas Piggin wrote: It can be useful in simulators (with very constrained environments) to allow some PMCs to run from boot so they can be sampled directly by a test harness, rather than having to run perf. A previous change freezes counters at boot by default, so provide a boot time option to un-freeze (plus a bit more flexibility). Signed-off-by: Nicholas Piggin --- .../admin-guide/kernel-parameters.txt | 7 arch/powerpc/perf/core-book3s.c | 35 +++ 2 files changed, 42 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdb22006f713..96b7d0ebaa40 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4089,6 +4089,13 @@ Override pmtimer IOPort with a hex value. e.g. pmtmr=0x508 + pmu=[PPC] Manually enable the PMU. This is a bit confusing, IIUC, we are manually disabling the perf registration with this option and not pmu. If this option is used, we will unfreeze the MMCR0_FC (only in the HV_mode) and not register perf subsystem. Since this option is valid only for HV_mode, can we call it kvm_disable_perf or kvm_dis_perf. + Enable the PMU by setting MMCR0 to 0 (clear FC bit). + This option is implemented for Book3S processors. + If a number is given, then MMCR1 is set to that number, + otherwise (e.g., 'pmu=on'), it is left 0. The perf + subsystem is disabled if this option is used. + pm_debug_messages [SUSPEND,KNL] Enable suspend/resume debug messages during boot up. 
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 65795cadb475..e7cef4fe17d7 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu) } #ifdef CONFIG_PPC64 +static bool pmu_override = false; +static unsigned long pmu_override_val; +static void do_pmu_override(void *data) +{ + ppc_set_pmu_inuse(1); + if (pmu_override_val) + mtspr(SPRN_MMCR1, pmu_override_val); + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC); +} + static int __init init_ppc64_pmu(void) { + if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) { + printk(KERN_WARNING "perf: disabling perf due to pmu= command line option.\n"); + on_each_cpu(do_pmu_override, NULL, 1); + return 0; + } + /* run through all the pmu drivers one at a time */ if (!init_power5_pmu()) return 0; @@ -2451,4 +2467,23 @@ static int __init init_ppc64_pmu(void) return init_generic_compat_pmu(); } early_initcall(init_ppc64_pmu); + +static int __init pmu_setup(char *str) +{ + unsigned long val; + + if (!early_cpu_has_feature(CPU_FTR_HVMODE)) + return 0; + + pmu_override = true; + + if (kstrtoul(str, 0, &val)) + val = 0; + + pmu_override_val = val; + + return 1; +} +__setup("pmu=", pmu_setup); + #endif
Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
Le 06/08/2021 à 05:16, Xiongwei Song a écrit : On Thu, Aug 5, 2021 at 6:06 PM Christophe Leroy wrote: Le 26/07/2021 à 16:30, sxwj...@me.com a écrit : From: Xiongwei Song Create an anonymous union for dsisr and esr registers, we can reference esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dsisr. This makes code more clear. I'm not sure it is worth doing that. Why don't we use "esr" as reference manuals mentioned? What is the point in doing the following when you know that regs->esr and regs->dsisr are exactly the same: > -err = ___do_page_fault(regs, regs->dar, regs->dsisr); > +if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > +err = ___do_page_fault(regs, regs->dar, regs->esr); > +else > +err = ___do_page_fault(regs, regs->dar, regs->dsisr); > + Yes, we can drop this. But it's a bit vague. Or even > -int is_write = page_fault_is_write(regs->dsisr); > +unsigned long err_reg; > +int is_write; > + > +if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > +err_reg = regs->esr; > +else > +err_reg = regs->dsisr; > + > +is_write = page_fault_is_write(err_reg); Artificially growing the code for that makes no sense to me. We can drop this too. To avoid ambiguity, maybe the best would be to rename regs->dsisr to something like regs->sr , so that we know it represents the status register, which is DSISR or ESR depending on the platform. If so, this would make other people more confused. My consideration is to follow what the reference manuals represent. Maybe then we could rename the fields as regs->dsisr_esr and regs->dar_dear That would be more explicit for everyone. The UAPI header however should remain as is because anonymous unions are not supported by old compilers as mentioned by Michael. But nevertheless, there are also situations where what is stored in regs->dsisr is not what we have in DSISR register. For instance on an ISI exception, we store a subset of the content of SRR1 register into regs->dsisr. Christophe