Re: [PATCH v2 3/4] powerpc: Optimize register usage for dear register
Le 07/08/2021 à 03:02, sxwj...@me.com a écrit : From: Xiongwei Song Create an anonymous union for dar and dear regsiters, we can reference dear to get the effective address when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dar. This makes code more clear. Signed-off-by: Xiongwei Song --- arch/powerpc/include/asm/ptrace.h | 5 - arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index c252d04b1206..fa725e3238c2 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -43,7 +43,10 @@ struct pt_regs unsigned long mq; #endif unsigned long trap; - unsigned long dar; + union { + unsigned long dar; + unsigned long dear; + }; union { unsigned long dsisr; unsigned long esr; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index f74af8f9133c..50436b52c213 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index a222fd4d6334..7c7093c17c45 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -373,6 +373,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, trap)); BUILD_BUG_ON(offsetof(struct pt_regs, dar) != offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dear) != +offsetof(struct user_pt_regs, dar)); dar and dear are the same, so checking the same thing a second time is pointless. 
BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); BUILD_BUG_ON(offsetof(struct pt_regs, esr) !=
Re: [PATCH v2 1/4] powerpc: Optimize register usage for esr register
Le 07/08/2021 à 03:02, sxwj...@me.com a écrit : From: Xiongwei Song Create an anonymous union for dsisr and esr regsiters, we can reference esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dsisr. This makes code more clear. Signed-off-by: Xiongwei Song --- arch/powerpc/include/asm/ptrace.h | 5 - arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c| 2 ++ arch/powerpc/kernel/traps.c| 2 +- arch/powerpc/platforms/44x/machine_check.c | 4 ++-- arch/powerpc/platforms/4xx/machine_check.c | 2 +- 6 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 3e5d470a6155..c252d04b1206 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -44,7 +44,10 @@ struct pt_regs #endif unsigned long trap; unsigned long dar; - unsigned long dsisr; + union { + unsigned long dsisr; + unsigned long esr; + }; unsigned long result; }; }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 185beb290580..f74af8f9133c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 0a0a33eb0d28..a222fd4d6334 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -375,6 +375,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != 
+offsetof(struct user_pt_regs, dsisr)); esr and dsisr are the same, so checking the same thing a second time is pointless. BUILD_BUG_ON(offsetof(struct pt_regs, result) != offsetof(struct user_pt_regs, result)); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dfbce527c98e..2164f5705a0b 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -562,7 +562,7 @@ static inline int check_io_access(struct pt_regs *regs) #ifdef CONFIG_PPC_ADV_DEBUG_REGS /* On 4xx, the reason for the machine check or program exception is in the ESR. */ -#define get_reason(regs) ((regs)->dsisr) +#define get_reason(regs) ((regs)->esr) #define REASON_FP ESR_FP #define REASON_ILLEGAL(ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR diff --git a/arch/powerpc/platforms/44x/machine_check.c b/arch/powerpc/platforms/44x/machine_check.c index a5c898bb9bab..5d19daacd78a 100644 --- a/arch/powerpc/platforms/44x/machine_check.c +++ b/arch/powerpc/platforms/44x/machine_check.c @@ -11,7 +11,7 @@ int machine_check_440A(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; printk("Machine check in kernel mode.\n"); if (reason & ESR_IMCP){ @@ -48,7 +48,7 @@ int machine_check_440A(struct pt_regs *regs) #ifdef CONFIG_PPC_47x int machine_check_47x(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; u32 mcsr; printk(KERN_ERR "Machine check in kernel mode.\n"); diff --git a/arch/powerpc/platforms/4xx/machine_check.c b/arch/powerpc/platforms/4xx/machine_check.c index a71c29892a91..a905da1d6f41 100644 --- a/arch/powerpc/platforms/4xx/machine_check.c +++ b/arch/powerpc/platforms/4xx/machine_check.c @@ -10,7 +10,7 @@ int machine_check_4xx(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; if (reason & ESR_IMCP) { printk("Instruction");
Re: Debian SID kernel doesn't boot on PowerBook 3400c
On Fri, 6 Aug 2021, Stan Johnson wrote: > $ egrep '(CONFIG_PPC_KUAP|CONFIG_VMAP_STACK)' .config > CONFIG_PPC_KUAP=y > CONFIG_PPC_KUAP_DEBUG=y > CONFIG_VMAP_STACK=y > $ strings vmlinux | fgrep "Linux version" > Linux version 5.13.0-pmac-4-g63e3756d1bd ... > $ cp vmlinux ../vmlinux-5.13.0-pmac-4-g63e3756d1bd-1 > > 1) PB 3400c > vmlinux-5.13.0-pmac-4-g63e3756d1bd-1 > Boots, no errors logging in at (text) fb console. Logging in via ssh and > running "ls -Rail /usr/include" generated errors (and a hung ssh > session). Once errors started, they repeated for almost every command. > See pb3400c-63e3756d1bdf-1.txt. > > 2) Wallstreet > vmlinux-5.13.0-pmac-4-g63e3756d1bd-1 > X login failed, there were errors ("Oops: Kernel access of bad area", > "Oops: Exception in kernel mode"). Logging in via SSH, there were no > additional errors after running "ls -Rail /usr/include" -- the errors > did not escalate as they did on the PB 3400. > See Wallstreet-63e3756d1bdf-1.txt. > ... > $ egrep '(CONFIG_PPC_KUAP|CONFIG_VMAP_STACK)' .config > CONFIG_PPC_KUAP=y > CONFIG_PPC_KUAP_DEBUG=y > # CONFIG_VMAP_STACK is not set > $ strings vmlinux | fgrep "Linux version" > Linux version 5.13.0-pmac-4-g63e3756d1bd ... > $ cp vmlinux ../vmlinux-5.13.0-pmac-4-g63e3756d1bd-2 > > 3) PB 3400c > vmlinux-5.13.0-pmac-4-g63e3756d1bd-2 > Filesystem was corrupt from the previous test (probably from all the > errors during shutdown). After fixing the filesystem: > Boots, no errors logging in at (text) fb console. Logging in via ssh and > running "ls -Rail /usr/include" generated a few errors. There didn't > seem to be as many errors as in the previous test, there were a few > errors during shutdown but the shutdown was otherwise normal. > See pb3400c-63e3756d1bdf-2.txt. > > 4) Wallstreet > vmlinux-5.13.0-pmac-4-g63e3756d1bd-2 > X login worked, and there were no errors. There were no errors during > ssh access. > See Wallstreet-63e3756d1bdf-2.txt. > Thanks for collecting these results, Stan. 
Do you think that the successful result from test 4) could have been just chance? It appears that the bug affecting the Powerbook 3400 is unaffected by CONFIG_VMAP_STACK. Whereas the bug affecting the Powerbook G3 disappears when CONFIG_VMAP_STACK is disabled (assuming the result from 4 is reliable). Either way, these results reiterate that "Oops: Kernel access of bad area, sig: 11" was not entirely resolved by "powerpc/32s: Fix napping restore in data storage interrupt (DSI)".
Re: Debian SID kernel doesn't boot on PowerBook 3400c
On Fri, 6 Aug 2021, Christophe Leroy wrote: > > I have cooked a tentative fix for that KUAP stuff. > Could you try the branch 'bugtest' at https://github.com/chleroy/linux.git > Thanks, Christophe. Stan, please test the following build. $ git remote add chleroy-linux https://github.com/chleroy/linux.git -f -t bugtest ... $ git checkout chleroy-linux/bugtest HEAD is now at 63e3756d1bdf powerpc/interrupts: Also perform KUAP/KUEP lock and usertime accounting on NMI $ cp ../dot-config-powermac-5.13 .config $ scripts/config -e CONFIG_PPC_KUAP -e CONFIG_PPC_KUAP_DEBUG -e CONFIG_VMAP_STACK $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux $ egrep "CONFIG_PPC_KUAP|CONFIG_VMAP_STACK" .config $ strings vmlinux |grep "Linux version" If that kernel produces errors, I'd try a second build as well: $ scripts/config -e CONFIG_PPC_KUAP -e CONFIG_PPC_KUAP_DEBUG -d CONFIG_VMAP_STACK $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux $ egrep "CONFIG_PPC_KUAP|CONFIG_VMAP_STACK" .config $ strings vmlinux |grep "Linux version" Please boot using the same kernel parameters as last time and capture the serial console logs. In case we're still dealing with intermittent bugs it might be necessary to repeat these tests so I suggest you retain the vmlinux files.
[PATCH v2 4/4] powerpc/64e: Get dear offset with _DEAR macro
From: Xiongwei Song Use _DEAR to get the offset of dear register in pr_regs for 64e cpus. Signed-off-by: Xiongwei Song --- arch/powerpc/kernel/asm-offsets.c| 13 +++-- arch/powerpc/kernel/exceptions-64e.S | 8 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index f4ebc435fd78..8357d5fcd09e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -286,23 +286,16 @@ int main(void) STACK_PT_REGS_OFFSET(_CCR, ccr); STACK_PT_REGS_OFFSET(_XER, xer); STACK_PT_REGS_OFFSET(_DAR, dar); + STACK_PT_REGS_OFFSET(_DEAR, dear); STACK_PT_REGS_OFFSET(_DSISR, dsisr); STACK_PT_REGS_OFFSET(_ESR, esr); STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3); STACK_PT_REGS_OFFSET(RESULT, result); STACK_PT_REGS_OFFSET(_TRAP, trap); -#ifndef CONFIG_PPC64 - /* -* The PowerPC 400-class & Book-E processors have neither the DAR -* nor the DSISR SPRs. Hence, we overload them to hold the similar -* DEAR and ESR SPRs for such processors. For critical interrupts -* we use them to hold SRR0 and SRR1. 
-*/ - STACK_PT_REGS_OFFSET(_DEAR, dar); -#else /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC64 STACK_PT_REGS_OFFSET(SOFTE, softe); STACK_PT_REGS_OFFSET(_PPR, ppr); -#endif /* CONFIG_PPC64 */ +#endif #ifdef CONFIG_PPC_PKEY STACK_PT_REGS_OFFSET(STACK_REGS_AMR, amr); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index bf8c4c2f98ea..221e085e8c8c 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -545,7 +545,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -558,7 +558,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) li r15,0 mr r14,r10 - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -575,7 +575,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -1057,7 +1057,7 @@ bad_stack_book3e: std r11,_CCR(r1) mfspr r10,SPRN_DEAR mfspr r11,SPRN_ESR - std r10,_DAR(r1) + std r10,_DEAR(r1) std r11,_ESR(r1) std r0,GPR0(r1);/* save r0 in stackframe */ \ std r2,GPR2(r1);/* save r2 in stackframe */ \ -- 2.30.2
[PATCH v2 3/4] powerpc: Optimize register usage for dear register
From: Xiongwei Song Create an anonymous union for dar and dear regsiters, we can reference dear to get the effective address when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dar. This makes code more clear. Signed-off-by: Xiongwei Song --- arch/powerpc/include/asm/ptrace.h | 5 - arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index c252d04b1206..fa725e3238c2 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -43,7 +43,10 @@ struct pt_regs unsigned long mq; #endif unsigned long trap; - unsigned long dar; + union { + unsigned long dar; + unsigned long dear; + }; union { unsigned long dsisr; unsigned long esr; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index f74af8f9133c..50436b52c213 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index a222fd4d6334..7c7093c17c45 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -373,6 +373,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, trap)); BUILD_BUG_ON(offsetof(struct pt_regs, dar) != offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dear) != +offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); BUILD_BUG_ON(offsetof(struct pt_regs, esr) != -- 
2.30.2
[PATCH v2 2/4] powerpc/64e: Get esr offset with _ESR macro
From: Xiongwei Song Use _ESR to get the offset of esr register in pr_regs for 64e cpus. Signed-off-by: Xiongwei Song --- arch/powerpc/kernel/asm-offsets.c| 2 +- arch/powerpc/kernel/exceptions-64e.S | 10 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index a47eefa09bcb..f4ebc435fd78 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -287,6 +287,7 @@ int main(void) STACK_PT_REGS_OFFSET(_XER, xer); STACK_PT_REGS_OFFSET(_DAR, dar); STACK_PT_REGS_OFFSET(_DSISR, dsisr); + STACK_PT_REGS_OFFSET(_ESR, esr); STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3); STACK_PT_REGS_OFFSET(RESULT, result); STACK_PT_REGS_OFFSET(_TRAP, trap); @@ -298,7 +299,6 @@ int main(void) * we use them to hold SRR0 and SRR1. */ STACK_PT_REGS_OFFSET(_DEAR, dar); - STACK_PT_REGS_OFFSET(_ESR, dsisr); #else /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(SOFTE, softe); STACK_PT_REGS_OFFSET(_PPR, ppr); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 1401787b0b93..bf8c4c2f98ea 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -546,7 +546,7 @@ __end_interrupts: mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x300) @@ -559,7 +559,7 @@ __end_interrupts: li r15,0 mr r14,r10 std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x400) @@ -576,7 +576,7 @@ __end_interrupts: mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x600) @@ -587,7 +587,7 @@ __end_interrupts: NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM, PROLOG_ADDITION_1REG) mfspr r14,SPRN_ESR - std r14,_DSISR(r1) + std 
r14,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) EXCEPTION_COMMON(0x700) addir3,r1,STACK_FRAME_OVERHEAD @@ -1058,7 +1058,7 @@ bad_stack_book3e: mfspr r10,SPRN_DEAR mfspr r11,SPRN_ESR std r10,_DAR(r1) - std r11,_DSISR(r1) + std r11,_ESR(r1) std r0,GPR0(r1);/* save r0 in stackframe */ \ std r2,GPR2(r1);/* save r2 in stackframe */ \ SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */\ -- 2.30.2
[PATCH v2 1/4] powerpc: Optimize register usage for esr register
From: Xiongwei Song Create an anonymous union for dsisr and esr regsiters, we can reference esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dsisr. This makes code more clear. Signed-off-by: Xiongwei Song --- arch/powerpc/include/asm/ptrace.h | 5 - arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c| 2 ++ arch/powerpc/kernel/traps.c| 2 +- arch/powerpc/platforms/44x/machine_check.c | 4 ++-- arch/powerpc/platforms/4xx/machine_check.c | 2 +- 6 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 3e5d470a6155..c252d04b1206 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -44,7 +44,10 @@ struct pt_regs #endif unsigned long trap; unsigned long dar; - unsigned long dsisr; + union { + unsigned long dsisr; + unsigned long esr; + }; unsigned long result; }; }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 185beb290580..f74af8f9133c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 0a0a33eb0d28..a222fd4d6334 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -375,6 +375,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != +offsetof(struct user_pt_regs, dsisr)); 
BUILD_BUG_ON(offsetof(struct pt_regs, result) != offsetof(struct user_pt_regs, result)); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dfbce527c98e..2164f5705a0b 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -562,7 +562,7 @@ static inline int check_io_access(struct pt_regs *regs) #ifdef CONFIG_PPC_ADV_DEBUG_REGS /* On 4xx, the reason for the machine check or program exception is in the ESR. */ -#define get_reason(regs) ((regs)->dsisr) +#define get_reason(regs) ((regs)->esr) #define REASON_FP ESR_FP #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR diff --git a/arch/powerpc/platforms/44x/machine_check.c b/arch/powerpc/platforms/44x/machine_check.c index a5c898bb9bab..5d19daacd78a 100644 --- a/arch/powerpc/platforms/44x/machine_check.c +++ b/arch/powerpc/platforms/44x/machine_check.c @@ -11,7 +11,7 @@ int machine_check_440A(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; printk("Machine check in kernel mode.\n"); if (reason & ESR_IMCP){ @@ -48,7 +48,7 @@ int machine_check_440A(struct pt_regs *regs) #ifdef CONFIG_PPC_47x int machine_check_47x(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; u32 mcsr; printk(KERN_ERR "Machine check in kernel mode.\n"); diff --git a/arch/powerpc/platforms/4xx/machine_check.c b/arch/powerpc/platforms/4xx/machine_check.c index a71c29892a91..a905da1d6f41 100644 --- a/arch/powerpc/platforms/4xx/machine_check.c +++ b/arch/powerpc/platforms/4xx/machine_check.c @@ -10,7 +10,7 @@ int machine_check_4xx(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; if (reason & ESR_IMCP) { printk("Instruction"); -- 2.30.2
[PATCH v2 0/4] Some improvements on regs usage
From: Xiongwei Song When CONFIG_4xx=y or CONFIG_BOOKE=y, currently in code we reference dsisr to get interrupt reasons and reference dar to get exception address. However, in reference manuals, esr is used for interrupt reasons and dear is used for exception address, so the patchset changes dsisr -> esr, dar -> dear for CONFIG_4xx=y or CONFIG_BOOKE=y. Meanwhile, we use _ESR and _DEAR to get offsets of esr and dear on stack. v2: - Discard changes in arch/powerpc/mm/fault.c as Christophe and Michael suggested. - Discard changes in UAPI headers to avoid possible compile issue. v1: - https://lkml.org/lkml/2021/8/6/57 - https://lkml.org/lkml/2021/7/26/533 - https://lkml.org/lkml/2021/7/26/534 - https://lkml.org/lkml/2021/7/26/535 Xiongwei Song (4): powerpc: Optimize register usage for esr register powerpc/64e: Get esr offset with _ESR macro powerpc: Optimize register usage for dear register powerpc/64e: Get dear offset with _DEAR macro arch/powerpc/include/asm/ptrace.h | 10 -- arch/powerpc/kernel/asm-offsets.c | 15 --- arch/powerpc/kernel/exceptions-64e.S | 18 +- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c| 4 arch/powerpc/kernel/traps.c| 2 +- arch/powerpc/platforms/44x/machine_check.c | 4 ++-- arch/powerpc/platforms/4xx/machine_check.c | 2 +- 8 files changed, 30 insertions(+), 27 deletions(-) -- 2.30.2
Re: [PATCH v2 0/6] PCI: Drop duplicated tracking of a pci_dev's bound driver
On Fri, Aug 06, 2021 at 08:46:23AM +0200, Uwe Kleine-König wrote: > On Thu, Aug 05, 2021 at 06:42:34PM -0500, Bjorn Helgaas wrote: > > I looked at all the bus_type.probe() methods, it looks like pci_dev is > > not the only offender here. At least the following also have a driver > > pointer in the device struct: > > > > parisc_device.driver > > acpi_device.driver > > dio_dev.driver > > hid_device.driver > > pci_dev.driver > > pnp_dev.driver > > rio_dev.driver > > zorro_dev.driver > > Right, when I converted zorro_dev it was pointed out that the code was > copied from pci and the latter has the same construct. :-) > See > https://lore.kernel.org/r/20210730191035.1455248-5-u.kleine-koe...@pengutronix.de > for the patch, I don't find where pci was pointed out, maybe it was on > irc only. Oh, thanks! I looked to see if you'd done something similar elsewhere, but I missed this one. > > Looking through the places that care about pci_dev.driver (the ones > > updated by patch 5/6), many of them are ... a little dubious to begin > > with. A few need the "struct pci_error_handlers *err_handler" > > pointer, so that's probably legitimate. But many just need a name, > > and should probably be using dev_driver_string() instead. > > Yeah, I considered adding a function to get the driver name from a > pci_dev and a function to get the error handlers. Maybe it's an idea to > introduce these two and then use to_pci_driver(pdev->dev.driver) for the > few remaining users? Maybe doing that on top of my current series makes > sense to have a clean switch from pdev->driver to pdev->dev.driver?! 
I'd propose using dev_driver_string() for these places: eeh_driver_name() (could change callers to use dev_driver_string()) bcma_host_pci_probe() qm_alloc_uacce() hns3_get_drvinfo() prestera_pci_probe() mlxsw_pci_probe() nfp_get_drvinfo() ssb_pcihost_probe() The use in mpt_device_driver_register() looks unnecessary: it's only to get a struct pci_device_id *, which is passed to ->probe() functions that don't need it. The use in adf_enable_aer() looks wrong: it sets the err_handler pointer in one of the adf_driver structs. I think those structs should be basically immutable, and the drivers that call adf_enable_aer() from their .probe() methods should set ".err_handler = &adf_err_handler" in their static adf_driver definitions instead. I think that basically leaves these: uncore_pci_probe() # .id_table, custom driver "registration" match_id() # .id_table, arch/x86/kernel/probe_roms.c xhci_pci_quirks() # .id_table pci_error_handlers() # roll-your-own AER handling, drivers/misc/cxl/guest.c I think it would be fine to use to_pci_driver(pdev->dev.driver) for these few. Bjorn
RE: [PATCH v4] soc: fsl: qe: convert QE interrupt controller to platform_device
> -Original Message- > From: Maxim Kochetkov > Sent: Tuesday, August 3, 2021 6:36 AM > To: linuxppc-dev@lists.ozlabs.org > Cc: Qiang Zhao ; Leo Li ; > gre...@linuxfoundation.org; sarava...@google.com; linux-arm- > ker...@lists.infradead.org; linux-ker...@vger.kernel.org; Maxim Kochetkov > ; kernel test robot ; Dan Carpenter > > Subject: [PATCH v4] soc: fsl: qe: convert QE interrupt controller to > platform_device > > Since 5.13 QE's ucc nodes can't get interrupts from devicetree: > > ucc@2000 { > cell-index = <1>; > reg = <0x2000 0x200>; > interrupts = <32>; > interrupt-parent = <&qeic>; > }; > > Now fw_devlink expects driver to create and probe a struct device for > interrupt controller. > > So lets convert this driver to simple platform_device with probe(). > Also use platform_get_ and devm_ family function to get/allocate resources > and drop unused .compatible = "qeic". > > [1] - > https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.k > ernel.org%2Flkml%2FCAGETcx9PiX%3D%3DmLxB9PO8Myyk6u2vhPVwTMsA > 5NkD- > ywH5xhusw%40mail.gmail.com&data=04%7C01%7Cleoyang.li%40nxp.co > m%7C1833b32e26de4ed7ef7908d956728eae%7C686ea1d3bc2b4c6fa92cd99c5 > c301635%7C0%7C0%7C637635872281355718%7CUnknown%7CTWFpbGZsb3d > 8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0% > 3D%7C1000&sdata=HrivK73GYFAwygPz24JtO%2BTdkicCVYXOl3uywjOqS > %2BA%3D&reserved=0 > Fixes: e590474768f1 ("driver core: Set fw_devlink=on by default") > Fixes: ea718c699055 ("Revert "Revert "driver core: Set fw_devlink=on by > default""") > Signed-off-by: Maxim Kochetkov > Reported-by: kernel test robot > Reported-by: Dan Carpenter Applied to fix. Thanks. 
> --- > drivers/soc/fsl/qe/qe_ic.c | 75 ++ > 1 file changed, 44 insertions(+), 31 deletions(-) > > diff --git a/drivers/soc/fsl/qe/qe_ic.c b/drivers/soc/fsl/qe/qe_ic.c index > 3f711c1a0996..e710d554425d 100644 > --- a/drivers/soc/fsl/qe/qe_ic.c > +++ b/drivers/soc/fsl/qe/qe_ic.c > @@ -23,6 +23,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -404,41 +405,40 @@ static void qe_ic_cascade_muxed_mpic(struct > irq_desc *desc) > chip->irq_eoi(&desc->irq_data); > } > > -static void __init qe_ic_init(struct device_node *node) > +static int qe_ic_init(struct platform_device *pdev) > { > + struct device *dev = &pdev->dev; > void (*low_handler)(struct irq_desc *desc); > void (*high_handler)(struct irq_desc *desc); > struct qe_ic *qe_ic; > - struct resource res; > - u32 ret; > + struct resource *res; > + struct device_node *node = pdev->dev.of_node; > > - ret = of_address_to_resource(node, 0, &res); > - if (ret) > - return; > + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); > + if (res == NULL) { > + dev_err(dev, "no memory resource defined\n"); > + return -ENODEV; > + } > > - qe_ic = kzalloc(sizeof(*qe_ic), GFP_KERNEL); > + qe_ic = devm_kzalloc(dev, sizeof(*qe_ic), GFP_KERNEL); > if (qe_ic == NULL) > - return; > + return -ENOMEM; > > - qe_ic->irqhost = irq_domain_add_linear(node, NR_QE_IC_INTS, > -&qe_ic_host_ops, qe_ic); > - if (qe_ic->irqhost == NULL) { > - kfree(qe_ic); > - return; > + qe_ic->regs = devm_ioremap(dev, res->start, resource_size(res)); > + if (qe_ic->regs == NULL) { > + dev_err(dev, "failed to ioremap() registers\n"); > + return -ENODEV; > } > > - qe_ic->regs = ioremap(res.start, resource_size(&res)); > - > qe_ic->hc_irq = qe_ic_irq_chip; > > - qe_ic->virq_high = irq_of_parse_and_map(node, 0); > - qe_ic->virq_low = irq_of_parse_and_map(node, 1); > + qe_ic->virq_high = platform_get_irq(pdev, 0); > + qe_ic->virq_low = platform_get_irq(pdev, 1); > > - if (!qe_ic->virq_low) { > - printk(KERN_ERR "Failed to map QE_IC 
low IRQ\n"); > - kfree(qe_ic); > - return; > + if (qe_ic->virq_low < 0) { > + return -ENODEV; > } > + > if (qe_ic->virq_high != qe_ic->virq_low) { > low_handler = qe_ic_cascade_low; > high_handler = qe_ic_cascade_high; > @@ -447,6 +447,13 @@ static void __init qe_ic_init(struct device_node > *node) > high_handler = NULL; > } > > + qe_ic->irqhost = irq_domain_add_linear(node, NR_QE_IC_INTS, > +&qe_ic_host_ops, qe_ic); > + if (qe_ic->irqhost == NULL) { > + dev_err(dev, "failed to add irq domain\n"); > + return -ENODEV; > + } > + > qe_ic_write(qe_ic->regs, QEIC_CICR, 0); > > irq_set_handler_data(qe_ic->virq_low, qe_ic); @@ -456,20 +463,26 > @@ static void __init qe_ic_init(struct device_node
Re: [PATCH v4 2/5] dt-bindings: nintendo-otp: Document the Wii and Wii U OTP support
On Sun, 01 Aug 2021 09:38:19 +0200, Emmanuel Gil Peyrot wrote: > Both of these consoles use the exact same two registers, even at the > same address, but the Wii U has eight banks of 128 bytes memory while > the Wii only has one, hence the two compatible strings. > > Signed-off-by: Emmanuel Gil Peyrot > --- > .../bindings/nvmem/nintendo-otp.yaml | 44 +++ > 1 file changed, 44 insertions(+) > create mode 100644 Documentation/devicetree/bindings/nvmem/nintendo-otp.yaml > Reviewed-by: Rob Herring
Re: [PATCH v1 32/55] KVM: PPC: Book3S HV P9: Move vcpu register save/restore into functions
Nicholas Piggin writes: > This should be no functional difference but makes the caller easier > to read. > > Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas > --- > arch/powerpc/kvm/book3s_hv.c | 65 +++- > 1 file changed, 41 insertions(+), 24 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index c2c72875fca9..45211458ac05 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -4062,6 +4062,44 @@ static void store_spr_state(struct kvm_vcpu *vcpu) > vcpu->arch.ctrl = mfspr(SPRN_CTRLF); > } > > +/* Returns true if current MSR and/or guest MSR may have changed */ > +static bool load_vcpu_state(struct kvm_vcpu *vcpu, > +struct p9_host_os_sprs *host_os_sprs) > +{ > + bool ret = false; > + > + if (cpu_has_feature(CPU_FTR_TM) || > + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { > + kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); > + ret = true; > + } > + > + load_spr_state(vcpu, host_os_sprs); > + > + load_fp_state(&vcpu->arch.fp); > +#ifdef CONFIG_ALTIVEC > + load_vr_state(&vcpu->arch.vr); > +#endif > + mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); > + > + return ret; > +} > + > +static void store_vcpu_state(struct kvm_vcpu *vcpu) > +{ > + store_spr_state(vcpu); > + > + store_fp_state(&vcpu->arch.fp); > +#ifdef CONFIG_ALTIVEC > + store_vr_state(&vcpu->arch.vr); > +#endif > + vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); > + > + if (cpu_has_feature(CPU_FTR_TM) || > + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) > + kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); > +} > + > static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) > { > if (!cpu_has_feature(CPU_FTR_ARCH_31)) > @@ -4169,19 +4207,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > > vcpu_vpa_increment_dispatch(vcpu); > > - if (cpu_has_feature(CPU_FTR_TM) || > - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { > - kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); > - msr = mfmsr(); /* TM 
restore can update msr */ > - } > - > - load_spr_state(vcpu, &host_os_sprs); > - > - load_fp_state(&vcpu->arch.fp); > -#ifdef CONFIG_ALTIVEC > - load_vr_state(&vcpu->arch.vr); > -#endif > - mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); > + if (unlikely(load_vcpu_state(vcpu, &host_os_sprs))) > + msr = mfmsr(); /* MSR may have been updated */ > > switch_pmu_to_guest(vcpu, &host_os_sprs); > > @@ -4285,17 +4312,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > > switch_pmu_to_host(vcpu, &host_os_sprs); > > - store_spr_state(vcpu); > - > - store_fp_state(&vcpu->arch.fp); > -#ifdef CONFIG_ALTIVEC > - store_vr_state(&vcpu->arch.vr); > -#endif > - vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); > - > - if (cpu_has_feature(CPU_FTR_TM) || > - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) > - kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); > + store_vcpu_state(vcpu); > > vcpu_vpa_increment_dispatch(vcpu);
Re: [PATCH v1 31/55] KVM: PPC: Book3S HV P9: Juggle SPR switching around
Nicholas Piggin writes: > This juggles SPR switching on the entry and exit sides to be more > symmetric, which makes the next refactoring patch possible with no > functional change. > > Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas > --- > arch/powerpc/kvm/book3s_hv.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 56429b53f4dc..c2c72875fca9 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -4175,7 +4175,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > msr = mfmsr(); /* TM restore can update msr */ > } > > - switch_pmu_to_guest(vcpu, &host_os_sprs); > + load_spr_state(vcpu, &host_os_sprs); > > load_fp_state(&vcpu->arch.fp); > #ifdef CONFIG_ALTIVEC > @@ -4183,7 +4183,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > #endif > mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); > > - load_spr_state(vcpu, &host_os_sprs); > + switch_pmu_to_guest(vcpu, &host_os_sprs); > > if (kvmhv_on_pseries()) { > /* > @@ -4283,6 +4283,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > vcpu->arch.slb_max = 0; > } > > + switch_pmu_to_host(vcpu, &host_os_sprs); > + > store_spr_state(vcpu); > > store_fp_state(&vcpu->arch.fp); > @@ -4297,8 +4299,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > > vcpu_vpa_increment_dispatch(vcpu); > > - switch_pmu_to_host(vcpu, &host_os_sprs); > - > timer_rearm_host_dec(*tb); > > restore_p9_host_os_sprs(vcpu, &host_os_sprs);
Re: [PATCH v1 30/55] KVM: PPC: Book3S HV P9: Only execute mtSPR if the value changed
Nicholas Piggin writes: > Keep better track of the current SPR value in places where > they are to be loaded with a new context, to reduce expensive > mtSPR operations. > > -73 cycles (7354) POWER9 virt-mode NULL hcall > > Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas > --- > arch/powerpc/kvm/book3s_hv.c | 64 ++-- > 1 file changed, 39 insertions(+), 25 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 0d97138e6fa4..56429b53f4dc 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -4009,19 +4009,28 @@ static void switch_pmu_to_host(struct kvm_vcpu *vcpu, > } > } > > -static void load_spr_state(struct kvm_vcpu *vcpu) > +static void load_spr_state(struct kvm_vcpu *vcpu, > + struct p9_host_os_sprs *host_os_sprs) > { > - mtspr(SPRN_DSCR, vcpu->arch.dscr); > - mtspr(SPRN_IAMR, vcpu->arch.iamr); > - mtspr(SPRN_PSPB, vcpu->arch.pspb); > - mtspr(SPRN_FSCR, vcpu->arch.fscr); > mtspr(SPRN_TAR, vcpu->arch.tar); > mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); > mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); > mtspr(SPRN_BESCR, vcpu->arch.bescr); > - mtspr(SPRN_TIDR, vcpu->arch.tid); > - mtspr(SPRN_AMR, vcpu->arch.amr); > - mtspr(SPRN_UAMOR, vcpu->arch.uamor); > + > + if (!cpu_has_feature(CPU_FTR_ARCH_31)) > + mtspr(SPRN_TIDR, vcpu->arch.tid); > + if (host_os_sprs->iamr != vcpu->arch.iamr) > + mtspr(SPRN_IAMR, vcpu->arch.iamr); > + if (host_os_sprs->amr != vcpu->arch.amr) > + mtspr(SPRN_AMR, vcpu->arch.amr); > + if (vcpu->arch.uamor != 0) > + mtspr(SPRN_UAMOR, vcpu->arch.uamor); > + if (host_os_sprs->fscr != vcpu->arch.fscr) > + mtspr(SPRN_FSCR, vcpu->arch.fscr); > + if (host_os_sprs->dscr != vcpu->arch.dscr) > + mtspr(SPRN_DSCR, vcpu->arch.dscr); > + if (vcpu->arch.pspb != 0) > + mtspr(SPRN_PSPB, vcpu->arch.pspb); > > /* >* DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI] > @@ -4036,28 +4045,31 @@ static void load_spr_state(struct kvm_vcpu *vcpu) > > static void store_spr_state(struct 
kvm_vcpu *vcpu) > { > - vcpu->arch.ctrl = mfspr(SPRN_CTRLF); > - > - vcpu->arch.iamr = mfspr(SPRN_IAMR); > - vcpu->arch.pspb = mfspr(SPRN_PSPB); > - vcpu->arch.fscr = mfspr(SPRN_FSCR); > vcpu->arch.tar = mfspr(SPRN_TAR); > vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); > vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); > vcpu->arch.bescr = mfspr(SPRN_BESCR); > - vcpu->arch.tid = mfspr(SPRN_TIDR); > + > + if (!cpu_has_feature(CPU_FTR_ARCH_31)) > + vcpu->arch.tid = mfspr(SPRN_TIDR); > + vcpu->arch.iamr = mfspr(SPRN_IAMR); > vcpu->arch.amr = mfspr(SPRN_AMR); > vcpu->arch.uamor = mfspr(SPRN_UAMOR); > + vcpu->arch.fscr = mfspr(SPRN_FSCR); > vcpu->arch.dscr = mfspr(SPRN_DSCR); > + vcpu->arch.pspb = mfspr(SPRN_PSPB); > + > + vcpu->arch.ctrl = mfspr(SPRN_CTRLF); > } > > static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) > { > - host_os_sprs->dscr = mfspr(SPRN_DSCR); > - host_os_sprs->tidr = mfspr(SPRN_TIDR); > + if (!cpu_has_feature(CPU_FTR_ARCH_31)) > + host_os_sprs->tidr = mfspr(SPRN_TIDR); > host_os_sprs->iamr = mfspr(SPRN_IAMR); > host_os_sprs->amr = mfspr(SPRN_AMR); > host_os_sprs->fscr = mfspr(SPRN_FSCR); > + host_os_sprs->dscr = mfspr(SPRN_DSCR); > } > > /* vcpu guest regs must already be saved */ > @@ -4066,18 +4078,20 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu > *vcpu, > { > mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); > > - mtspr(SPRN_PSPB, 0); > - mtspr(SPRN_UAMOR, 0); > - > - mtspr(SPRN_DSCR, host_os_sprs->dscr); > - mtspr(SPRN_TIDR, host_os_sprs->tidr); > - mtspr(SPRN_IAMR, host_os_sprs->iamr); > - > + if (!cpu_has_feature(CPU_FTR_ARCH_31)) > + mtspr(SPRN_TIDR, host_os_sprs->tidr); > + if (host_os_sprs->iamr != vcpu->arch.iamr) > + mtspr(SPRN_IAMR, host_os_sprs->iamr); > + if (vcpu->arch.uamor != 0) > + mtspr(SPRN_UAMOR, 0); > if (host_os_sprs->amr != vcpu->arch.amr) > mtspr(SPRN_AMR, host_os_sprs->amr); > - > if (host_os_sprs->fscr != vcpu->arch.fscr) > mtspr(SPRN_FSCR, host_os_sprs->fscr); > + if (host_os_sprs->dscr != vcpu->arch.dscr) > + 
mtspr(SPRN_DSCR, host_os_sprs->dscr); > + if (vcpu->arch.pspb != 0) > + mtspr(SPRN_PSPB, 0); > > /* Save guest CTRL register, set runlatch to 1 */ > if (!(vcpu->arch.ctrl & 1)) > @@ -4169,7 +4183,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > #endif > mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); > > - load_spr_state(vcpu); > + load_spr_state(vcpu, &host_os_sprs); > > if (kvmhv_on_pseries()) { >
Re: [PATCH v2] scripts/Makefile.clang: default to LLVM_IAS=1
On Fri, Aug 06, 2021 at 10:27:01AM -0700, Nick Desaulniers wrote: > LLVM_IAS=1 controls enabling clang's integrated assembler via > -integrated-as. This was an explicit opt in until we could enable > assembler support in Clang for more architectures. Now we have support > and CI coverage of LLVM_IAS=1 for all architectures except a few more > bugs affecting s390 and powerpc. The powerpc and s390 folks have been testing with clang, I think they should have been on CC for this change (done now). > This commit flips the default from opt in via LLVM_IAS=1 to opt out via > LLVM_IAS=0. CI systems or developers that were previously doing builds > with CC=clang or LLVM=1 without explicitly setting LLVM_IAS must now > explicitly opt out via LLVM_IAS=0, otherwise they will be implicitly > opted-in. > > This finally shortens the command line invocation when cross compiling > with LLVM to simply: > > $ make ARCH=arm64 LLVM=1 > > Signed-off-by: Nick Desaulniers I am still not really sure how I feel about this. I would prefer not to break people's builds but I suppose this is inevitable eventually. A little support matrix that I drafted up based on ARCH and clang version for LLVM_IAS=1 support: | 10.x | 11.x | 12.x | 13.x | 14.x | ARCH=arm | NO | NO | NO | YES | YES | ARCH=arm64 | NO | YES | YES | YES | YES | ARCH=i386| YES | YES | YES | YES | YES | ARCH=mips* | YES | YES | YES | YES | YES | ARCH=powerpc | NO | NO | NO | NO | NO | ARCH=s390| NO | NO | NO | NO | NO | ARCH=x86_64 | NO | YES | YES | YES | YES | The main issue that I have with this change is that all of these architectures work fine with CC=clang and their build commands that used to work fine will not with this change, as they will have to specify LLVM_IAS=0. I think that making this change for LLVM=1 makes sense but changing the default for just CC=clang feels like a bit much at this point in time. I would love to hear from others on this though, I am not going to object much further than this. 
Regardless of that concern, this patch does what it says so: Reviewed-by: Nathan Chancellor > --- > Changes v1 -> v2: > * Drop "Currently" from Documentation/, as per Matthew. > * Drop Makefile and riscv Makefile, rebase on > https://lore.kernel.org/lkml/20210805150102.131008-1-masahi...@kernel.org/ > as per Masahiro. > * Base is kbuild/for-next, plus > > https://lore.kernel.org/lkml/20210802183910.1802120-1-ndesaulni...@google.com/ > https://lore.kernel.org/lkml/20210805150102.131008-1-masahi...@kernel.org/. > > Documentation/kbuild/llvm.rst | 14 -- > scripts/Makefile.clang| 6 +++--- > 2 files changed, 11 insertions(+), 9 deletions(-) > > diff --git a/Documentation/kbuild/llvm.rst b/Documentation/kbuild/llvm.rst > index f8a360958f4c..e87ed5479963 100644 > --- a/Documentation/kbuild/llvm.rst > +++ b/Documentation/kbuild/llvm.rst > @@ -60,17 +60,14 @@ They can be enabled individually. The full list of the > parameters: :: > OBJCOPY=llvm-objcopy OBJDUMP=llvm-objdump READELF=llvm-readelf \ > HOSTCC=clang HOSTCXX=clang++ HOSTAR=llvm-ar HOSTLD=ld.lld > > -Currently, the integrated assembler is disabled by default. You can pass > -``LLVM_IAS=1`` to enable it. > +The integrated assembler is enabled by default. You can pass ``LLVM_IAS=0`` > to > +disable it. > > Omitting CROSS_COMPILE > -- > > As explained above, ``CROSS_COMPILE`` is used to set ``--target=``. > > -Unless ``LLVM_IAS=1`` is specified, ``CROSS_COMPILE`` is also used to derive > -``--prefix=`` to search for the GNU assembler and linker. > - > If ``CROSS_COMPILE`` is not specified, the ``--target=`` is inferred > from ``ARCH``. > > @@ -78,7 +75,12 @@ That means if you use only LLVM tools, ``CROSS_COMPILE`` > becomes unnecessary. > > For example, to cross-compile the arm64 kernel:: > > - make ARCH=arm64 LLVM=1 LLVM_IAS=1 > + make ARCH=arm64 LLVM=1 > + > +If ``LLVM_IAS=0`` is specified, ``CROSS_COMPILE`` is also used to derive > +``--prefix=`` to search for the GNU assembler and linker. 
:: > + > + make ARCH=arm64 LLVM=1 LLVM_IAS=0 CROSS_COMPILE=aarch64-linux-gnu- > > Supported Architectures > --- > diff --git a/scripts/Makefile.clang b/scripts/Makefile.clang > index 1f4e3eb70f88..3ae63bd35582 100644 > --- a/scripts/Makefile.clang > +++ b/scripts/Makefile.clang > @@ -22,12 +22,12 @@ else > CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) > endif # CROSS_COMPILE > > -ifeq ($(LLVM_IAS),1) > -CLANG_FLAGS += -integrated-as > -else > +ifeq ($(LLVM_IAS),0) > CLANG_FLAGS += -no-integrated-as > GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit)) > CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE)) > +else > +CLANG_FLAGS += -integrated-as > endif > CLANG_FLAGS += -Werror=unknown-warning-option > KBUILD_CFLAGS+= $(CLANG_FLAGS) > >
Re: [PATCH v6 6/6] powerpc/pseries: Consolidate form1 distance initialization into a helper
On 8/6/21 12:17 PM, David Gibson wrote: On Tue, Jul 27, 2021 at 03:33:11PM +0530, Aneesh Kumar K.V wrote: Currently, we duplicate parsing code for ibm,associativity and ibm,associativity-lookup-arrays in the kernel. The associativity array provided by these device tree properties are very similar and hence can use a helper to parse the node id and numa distance details. Oh... sorry.. comments on the earlier patch were from before I read and saw you adjusted things here. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/numa.c | 83 ++ 1 file changed, 51 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index fffb3c40f595..7506251e17f2 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -171,19 +171,19 @@ static void unmap_cpu_from_node(unsigned long cpu) } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ -/* - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA - * info is found. - */ -static int associativity_to_nid(const __be32 *associativity) +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) { int nid = NUMA_NO_NODE; + /* +* primary_domain_index is 1 based array index. +*/ + int index = primary_domain_index - 1; - if (!numa_enabled) + if (!numa_enabled || index >= max_array_sz) goto out; You don't need a goto, you can just return NUMA_NO_NODE. updated - if (of_read_number(associativity, 1) >= primary_domain_index) - nid = of_read_number(&associativity[primary_domain_index], 1); + nid = of_read_number(&associativity[index], 1); /* POWER4 LPAR uses 0x as invalid node */ if (nid == 0x || nid >= nr_node_ids) @@ -191,6 +191,17 @@ static int associativity_to_nid(const __be32 *associativity) out: return nid; } +/* + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA + * info is found. 
+ */ +static int associativity_to_nid(const __be32 *associativity) +{ + int array_sz = of_read_number(associativity, 1); + + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); +} static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { @@ -295,24 +306,41 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL(of_node_to_nid); -static void __initialize_form1_numa_distance(const __be32 *associativity) +static void ___initialize_form1_numa_distance(const __be32 *associativity, +int max_array_sz) { int i, nid; if (affinity_form != FORM1_AFFINITY) return; - nid = associativity_to_nid(associativity); + nid = __associativity_to_nid(associativity, max_array_sz); if (nid != NUMA_NO_NODE) { for (i = 0; i < distance_ref_points_depth; i++) { const __be32 *entry; + int index = be32_to_cpu(distance_ref_points[i]) - 1; + + /* +* broken hierarchy, return with broken distance table WARN_ON, maybe? updated +*/ + if (index >= max_array_sz) + return; - entry = &associativity[be32_to_cpu(distance_ref_points[i])]; + entry = &associativity[index]; distance_lookup_table[nid][i] = of_read_number(entry, 1); } } } +static void __initialize_form1_numa_distance(const __be32 *associativity) Do you actually use this in-between wrapper? 
yes used in static void initialize_form1_numa_distance(struct device_node *node) { const __be32 *associativity; associativity = of_get_associativity(node); if (!associativity) return; __initialize_form1_numa_distance(associativity); } +{ + int array_sz; + + array_sz = of_read_number(associativity, 1); + /* Skip the first element in the associativity array */ + ___initialize_form1_numa_distance(associativity + 1, array_sz); +} + static void initialize_form1_numa_distance(struct device_node *node) { const __be32 *associativity; @@ -586,27 +614,18 @@ static int get_nid_and_numa_distance(struct drmem_lmb *lmb) if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + primary_domain_index - 1; - nid = of_read_number(&aa.arrays[index], 1); + const __be32 *associativity; - if (nid == 0x || nid >= nr_node_ids) - nid = default_nid; + index = lmb->aa_index * aa.array_sz; + assoc
Re: [PATCH kernel v2] KVM: PPC: Use arch_get_random_seed_long instead of powernv variant
Alexey Kardashevskiy writes: > The powernv_get_random_long() does not work in nested KVM (which is > pseries) and produces a crash when accessing in_be64(rng->regs) in > powernv_get_random_long(). > > This replaces powernv_get_random_long with the ppc_md machine hook > wrapper. > > Signed-off-by: Alexey Kardashevskiy Reviewed-by: Fabiano Rosas > --- > > Changes: > v2: > * replaces [PATCH kernel] powerpc/powernv: Check if powernv_rng is initialized > > --- > arch/powerpc/kvm/book3s_hv.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index be0cde26f156..ecfd133e0ca8 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -1165,7 +1165,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) > break; > #endif > case H_RANDOM: > - if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) > + if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4])) > ret = H_HARDWARE; > break; > case H_RPT_INVALIDATE:
Re: [PATCH v4 1/2] tty: hvc: pass DMA capable memory to put_chars()
On Fri, Aug 6, 2021 at 5:01 AM Xianting Tian wrote: > @@ -163,6 +155,13 @@ static void hvc_console_print(struct console *co, const > char *b, > if (vtermnos[index] == -1) > return; > > + list_for_each_entry(hp, &hvc_structs, next) > + if (hp->vtermno == vtermnos[index]) > + break; > + > + c = hp->c; > + > + spin_lock_irqsave(&hp->c_lock, flags); The loop looks like it might race against changes to the list. It seems strange that the print function has to actually search for the structure here. It may be better to have yet another array for the buffer pointers next to the cons_ops[] and vtermnos[] arrays. > +/* > + * These sizes are most efficient for vio, because they are the > + * native transfer size. We could make them selectable in the > + * future to better deal with backends that want other buffer sizes. > + */ > +#define N_OUTBUF 16 > +#define N_INBUF16 > + > +#define __ALIGNED__ __attribute__((__aligned__(sizeof(long I think you need a higher alignment for DMA buffers, instead of sizeof(long), I would suggest ARCH_DMA_MINALIGN. Arnd
Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
On Fri, Aug 06, 2021 at 04:53:14PM +1000, Michael Ellerman wrote: > But I'm not sure about the use of anonymous unions in UAPI headers. Old > compilers don't support them, so there's a risk of breakage. More precisely, it exists only since C11, so even with all not-so-ancient compilers it will not work if the user uses (say) -std=c99, which still is popular. > I'd rather we didn't touch the uapi version. Yeah. > > - err = ___do_page_fault(regs, regs->dar, regs->dsisr); > > + if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > > + err = ___do_page_fault(regs, regs->dar, regs->esr); > > + else > > + err = ___do_page_fault(regs, regs->dar, regs->dsisr); > > As Christophe said, I don't thinks this is an improvement. > > It makes the code less readable. If anyone is confused about what is > passed to ___do_page_fault() they can either read the comment above it, > or look at the definition of pt_regs to see that esr and dsisr share > storage. Esp. since the affected platforms are legacy, yup. Segher
[PATCH v6 1/2] KVM: PPC: Book3S HV: Sanitise vcpu registers in nested path
As one of the arguments of the H_ENTER_NESTED hypercall, the nested hypervisor (L1) prepares a structure containing the values of various hypervisor-privileged registers with which it wants the nested guest (L2) to run. Since the nested HV runs in supervisor mode it needs the host to write to these registers. To stop a nested HV manipulating this mechanism and using a nested guest as a proxy to access a facility that has been made unavailable to it, we have a routine that sanitises the values of the HV registers before copying them into the nested guest's vcpu struct. However, when coming out of the guest the values are copied as they were back into L1 memory, which means that any sanitisation we did during guest entry will be exposed to L1 after H_ENTER_NESTED returns. This patch alters this sanitisation to have effect on the vcpu->arch registers directly before entering and after exiting the guest, leaving the structure that is copied back into L1 unchanged (except when we really want L1 to access the value, e.g the Cause bits of HFSCR). 
Signed-off-by: Fabiano Rosas Reviewed-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv_nested.c | 94 ++--- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 898f942eb198..1823674d46ef 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -105,7 +105,6 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, struct kvmppc_vcore *vc = vcpu->arch.vcore; hr->dpdes = vc->dpdes; - hr->hfscr = vcpu->arch.hfscr; hr->purr = vcpu->arch.purr; hr->spurr = vcpu->arch.spurr; hr->ic = vcpu->arch.ic; @@ -128,55 +127,17 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, case BOOK3S_INTERRUPT_H_INST_STORAGE: hr->asdr = vcpu->arch.fault_gpa; break; + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) | +(HFSCR_INTR_CAUSE & vcpu->arch.hfscr)); + break; case BOOK3S_INTERRUPT_H_EMUL_ASSIST: hr->heir = vcpu->arch.emul_inst; break; } } -/* - * This can result in some L0 HV register state being leaked to an L1 - * hypervisor when the hv_guest_state is copied back to the guest after - * being modified here. - * - * There is no known problem with such a leak, and in many cases these - * register settings could be derived by the guest by observing behaviour - * and timing, interrupts, etc., but it is an issue to consider. - */ -static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) -{ - struct kvmppc_vcore *vc = vcpu->arch.vcore; - u64 mask; - - /* -* Don't let L1 change LPCR bits for the L2 except these: -*/ - mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | - LPCR_LPES | LPCR_MER; - - /* -* Additional filtering is required depending on hardware -* and configuration. 
-*/ - hr->lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, - (vc->lpcr & ~mask) | (hr->lpcr & mask)); - - /* -* Don't let L1 enable features for L2 which we've disabled for L1, -* but preserve the interrupt cause field. -*/ - hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); - - /* Don't let data address watchpoint match in hypervisor state */ - hr->dawrx0 &= ~DAWRX_HYP; - hr->dawrx1 &= ~DAWRX_HYP; - - /* Don't let completed instruction address breakpt match in HV state */ - if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) - hr->ciabr &= ~CIABR_PRIV; -} - -static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -288,6 +249,43 @@ static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu, sizeof(struct pt_regs)); } +static void load_l2_hv_regs(struct kvm_vcpu *vcpu, + const struct hv_guest_state *l2_hv, + const struct hv_guest_state *l1_hv, u64 *lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + restore_hv_regs(vcpu, l2_hv); + + /* +* Don't let L1 change LPCR bits for the L2 except these: +*/ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | + LPCR_LPES | LPCR_MER; + + /* +* Additional filtering is required depending on hardware +* and configuration. +*/ + *lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, + (vc->lpcr & ~mask) | (*lpcr & mask)); + + /* +* Don't l
[PATCH v6 2/2] KVM: PPC: Book3S HV: Stop forwarding all HFUs to L1
If the nested hypervisor has no access to a facility because it has been disabled by the host, it should also not be able to see the Hypervisor Facility Unavailable that arises from one of its guests trying to access the facility. This patch turns a HFU that happened in L2 into a Hypervisor Emulation Assistance interrupt and forwards it to L1 for handling. The ones that happened because L1 explicitly disabled the facility for L2 are still let through, along with the corresponding Cause bits in the HFSCR. Signed-off-by: Fabiano Rosas --- arch/powerpc/kvm/book3s_hv.c| 13 + arch/powerpc/kvm/book3s_hv_nested.c | 29 +++-- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 085fb8ecbf68..9123b493c79e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1837,6 +1837,19 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) r = RESUME_HOST; break; } + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + /* +* We might decide later to turn this interrupt into a +* HEAI. Load the last instruction now that we can go +* back into the guest to retry if needed. 
+*/ + r = kvmppc_get_last_inst(vcpu, INST_GENERIC, +&vcpu->arch.emul_inst); + if (r != EMULATE_DONE) + r = RESUME_GUEST; + else + r = RESUME_HOST; + break; default: r = RESUME_HOST; break; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 1823674d46ef..1904697a3132 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -99,7 +99,7 @@ static void byteswap_hv_regs(struct hv_guest_state *hr) hr->dawrx1 = swab64(hr->dawrx1); } -static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, +static void save_hv_return_state(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -118,7 +118,7 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, hr->pidr = vcpu->arch.pid; hr->cfar = vcpu->arch.cfar; hr->ppr = vcpu->arch.ppr; - switch (trap) { + switch (vcpu->arch.trap) { case BOOK3S_INTERRUPT_H_DATA_STORAGE: hr->hdar = vcpu->arch.fault_dar; hr->hdsisr = vcpu->arch.fault_dsisr; @@ -128,9 +128,26 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, hr->asdr = vcpu->arch.fault_gpa; break; case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: - hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) | -(HFSCR_INTR_CAUSE & vcpu->arch.hfscr)); - break; + { + u64 cause = vcpu->arch.hfscr >> 56; + + WARN_ON_ONCE(cause >= BITS_PER_LONG); + + if (!(hr->hfscr & (1UL << cause))) { + hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) | +(HFSCR_INTR_CAUSE & vcpu->arch.hfscr)); + break; + } + + /* +* We have disabled this facility, so it does not +* exist from L1's perspective. Turn it into a +* HEAI. The instruction was already loaded at +* kvmppc_handle_nested_exit(). 
+*/ + vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST; + fallthrough; + } case BOOK3S_INTERRUPT_H_EMUL_ASSIST: hr->heir = vcpu->arch.emul_inst; break; @@ -388,7 +405,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) delta_spurr = vcpu->arch.spurr - l2_hv.spurr; delta_ic = vcpu->arch.ic - l2_hv.ic; delta_vtb = vc->vtb - l2_hv.vtb; - save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv); + save_hv_return_state(vcpu, &l2_hv); /* restore L1 state */ vcpu->arch.nested = NULL; -- 2.29.2
[PATCH v6 0/2] KVM: PPC: Book3S HV: Nested guest state sanitising changes
This series aims to stop contaminating the l2_hv structure with bits that might have come from L1 state. Patch 1 makes l2_hv read-only (mostly). It is now only changed when we explicitly want to pass information to L1. Patch 2 makes sure that L1 is not forwarded HFU interrupts when the host has decided to disable any facilities (theoretical for now, since HFSCR bits are always the same between L1/Ln). Changes since v5: - patch 2 now reads the instruction earlier at the nested exit handler to allow the guest to retry if the load fails. v5: - moved setting of the Cause bits under BOOK3S_INTERRUPT_H_FAC_UNAVAIL. https://lkml.kernel.org/r/20210726201710.2432874-1-faro...@linux.ibm.com v4: - now passing lpcr separately into load_l2_hv_regs to solve the conflict with commit a19b70abc69a ("KVM: PPC: Book3S HV: Nested move LPCR sanitising to sanitise_hv_regs"); - patch 2 now forwards a HEAI instead of injecting a Program. https://lkml.kernel.org/r/2021071240.2384655-1-faro...@linux.ibm.com v3: - removed the sanitise functions; - moved the entry code into a new load_l2_hv_regs and the exit code into the existing save_hv_return_state; - new patch: removes the cause bits when L0 has disabled the corresponding facility. https://lkml.kernel.org/r/20210415230948.3563415-1-faro...@linux.ibm.com v2: - made the change more generic, not only applies to hfscr anymore; - sanitisation is now done directly on the vcpu struct, l2_hv is left unchanged. https://lkml.kernel.org/r/20210406214645.3315819-1-faro...@linux.ibm.com v1: https://lkml.kernel.org/r/20210305231055.2913892-1-faro...@linux.ibm.com Fabiano Rosas (2): KVM: PPC: Book3S HV: Sanitise vcpu registers in nested path KVM: PPC: Book3S HV: Stop forwarding all HFUs to L1 arch/powerpc/kvm/book3s_hv.c| 13 arch/powerpc/kvm/book3s_hv_nested.c | 117 2 files changed, 79 insertions(+), 51 deletions(-) -- 2.29.2
[PATCH] powerpc/mce: check if event info is valid
Check if the event info is valid before printing the event information. When a fwnmi enabled nested kvm guest hits a machine check exception L0 and L2 would generate machine check event info, But L1 would not generate any machine check event info as it won't go through 0x200 vector and prints some unwanted message. To fix this, 'in_use' variable in machine check event info is no more in use, rename it to 'valid' and check if the event information is valid before logging the event information. without this patch L1 would print following message for exceptions encountered in L2, as event structure will be empty in L1. "Machine Check Exception, Unknown event version 0". Signed-off-by: Ganesh Goudar --- arch/powerpc/include/asm/mce.h | 2 +- arch/powerpc/kernel/mce.c | 7 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 331d944280b8..3646f53f228f 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -113,7 +113,7 @@ enum MCE_LinkErrorType { struct machine_check_event { enum MCE_Versionversion:8; - u8 in_use; + u8 valid; enum MCE_Severity severity:8; enum MCE_Initiator initiator:8; enum MCE_ErrorType error_type:8; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 47a683cd00d2..b778394a06b5 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -114,7 +114,7 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->srr0 = nip; mce->srr1 = regs->msr; mce->gpr3 = regs->gpr[3]; - mce->in_use = 1; + mce->valid = 1; mce->cpu = get_paca()->paca_index; /* Mark it recovered if we have handled it and MSR(RI=1). */ @@ -202,7 +202,7 @@ int get_mce_event(struct machine_check_event *mce, bool release) if (mce) *mce = *mc_evt; if (release) - mc_evt->in_use = 0; + mc_evt->valid = 0; ret = 1; } /* Decrement the count to free the slot. 
*/ @@ -413,6 +413,9 @@ void machine_check_print_event_info(struct machine_check_event *evt, "Probable Software error (some chance of hardware cause)", }; + if (!evt->valid) + return; + /* Print things out */ if (evt->version != MCE_V1) { pr_err("Machine Check Exception, Unknown event version %d !\n", -- 2.31.1
Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
On Fri, Aug 6, 2021 at 3:32 PM Christophe Leroy wrote: > > > > Le 06/08/2021 à 05:16, Xiongwei Song a écrit : > > On Thu, Aug 5, 2021 at 6:06 PM Christophe Leroy > > wrote: > >> > >> > >> > >> Le 26/07/2021 à 16:30, sxwj...@me.com a écrit : > >>> From: Xiongwei Song > >>> > >>> Create an anonymous union for dsisr and esr registers, we can reference > >>> esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. > >>> Otherwise, reference dsisr. This makes code more clear. > >> > >> I'm not sure it is worth doing that. > > Why don't we use "esr" as reference manuals mentioned? > > > >> > >> What is the point in doing the following when you know that regs->esr and > >> regs->dsisr are exactly > >> the same: > >> > >> > -err = ___do_page_fault(regs, regs->dar, regs->dsisr); > >> > +if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > >> > +err = ___do_page_fault(regs, regs->dar, regs->esr); > >> > +else > >> > +err = ___do_page_fault(regs, regs->dar, regs->dsisr); > >> > + > > Yes, we can drop this. But it's a bit vague. > > > >> Or even > >> > >> > -int is_write = page_fault_is_write(regs->dsisr); > >> > +unsigned long err_reg; > >> > +int is_write; > >> > + > >> > +if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > >> > +err_reg = regs->esr; > >> > +else > >> > +err_reg = regs->dsisr; > >> > + > >> > +is_write = page_fault_is_write(err_reg); > >> > >> > >> Artificially growing the code for that makes no sense to me. > > > > We can drop this too. > >> > >> > >> To avoid ambiguity, maybe the best would be to rename regs->dsisr to > >> something like regs->sr , so > >> that we know it represents the status register, which is DSISR or ESR > >> depending on the platform. > > > > If so, this would make other people more confused. My consideration is > > to follow what the reference > > manuals represent. > > Maybe then we could rename the fields as regs->dsisr_esr and regs->dar_dear I still prefer my method. 
> > That would be more explicit for everyone. > > The UAPI header however should remain as is because anonymous unions are not > supported by old > compilers as mentioned by Michael. Sure. Will update in v2. > > But nevertheless, there are also situations where what is stored in > regs->dsisr is not what we have > in DSISR register. For instance on an ISI exception, we store a subset of the > content of SRR1 > register into regs->dsisr. Can I think my method has better expansibility here;-)? Let me finish esr and dear first. Thank you for the reminder. Regards, Xiongwei > > Christophe
Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
On Fri, Aug 6, 2021 at 2:53 PM Michael Ellerman wrote: > > sxwj...@me.com writes: > > From: Xiongwei Song > > > > Create an anonymous union for dsisr and esr regsiters, we can reference > > esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. > > Otherwise, reference dsisr. This makes code more clear. > > > > Signed-off-by: Xiongwei Song > > --- > > arch/powerpc/include/asm/ptrace.h | 5 - > > arch/powerpc/include/uapi/asm/ptrace.h | 5 - > > arch/powerpc/kernel/process.c | 2 +- > > arch/powerpc/kernel/ptrace/ptrace.c| 2 ++ > > arch/powerpc/kernel/traps.c| 2 +- > > arch/powerpc/mm/fault.c| 16 ++-- > > arch/powerpc/platforms/44x/machine_check.c | 4 ++-- > > arch/powerpc/platforms/4xx/machine_check.c | 2 +- > > 8 files changed, 29 insertions(+), 9 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/ptrace.h > > b/arch/powerpc/include/asm/ptrace.h > > index 3e5d470a6155..c252d04b1206 100644 > > --- a/arch/powerpc/include/asm/ptrace.h > > +++ b/arch/powerpc/include/asm/ptrace.h > > @@ -44,7 +44,10 @@ struct pt_regs > > #endif > > unsigned long trap; > > unsigned long dar; > > - unsigned long dsisr; > > + union { > > + unsigned long dsisr; > > + unsigned long esr; > > + }; > > I don't mind doing that. > > > unsigned long result; > > }; > > }; > > diff --git a/arch/powerpc/include/uapi/asm/ptrace.h > > b/arch/powerpc/include/uapi/asm/ptrace.h > > index 7004cfea3f5f..e357288b5f34 100644 > > --- a/arch/powerpc/include/uapi/asm/ptrace.h > > +++ b/arch/powerpc/include/uapi/asm/ptrace.h > > @@ -53,7 +53,10 @@ struct pt_regs > > /* N.B. for critical exceptions on 4xx, the dar and dsisr > > fields are overloaded to hold srr0 and srr1. 
*/ > > unsigned long dar; /* Fault registers */ > > - unsigned long dsisr;/* on 4xx/Book-E used for ESR */ > > + union { > > + unsigned long dsisr;/* on Book-S used for DSISR */ > > + unsigned long esr; /* on 4xx/Book-E used for ESR > > */ > > + }; > > unsigned long result; /* Result of a system call */ > > }; > > But I'm not sure about the use of anonymous unions in UAPI headers. Old > compilers don't support them, so there's a risk of breakage. > > I'd rather we didn't touch the uapi version. Ok. Will discard the change. > > > > diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c > > index 185beb290580..f74af8f9133c 100644 > > --- a/arch/powerpc/kernel/process.c > > +++ b/arch/powerpc/kernel/process.c > > @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) > > trap == INTERRUPT_DATA_STORAGE || > > trap == INTERRUPT_ALIGNMENT) { > > if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > > - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, > > regs->dsisr); > > + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, > > regs->esr); > > else > > pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, > > regs->dsisr); > > } > > diff --git a/arch/powerpc/kernel/ptrace/ptrace.c > > b/arch/powerpc/kernel/ptrace/ptrace.c > > index 0a0a33eb0d28..00789ad2c4a3 100644 > > --- a/arch/powerpc/kernel/ptrace/ptrace.c > > +++ b/arch/powerpc/kernel/ptrace/ptrace.c > > @@ -375,6 +375,8 @@ void __init pt_regs_check(void) > >offsetof(struct user_pt_regs, dar)); > > BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != > >offsetof(struct user_pt_regs, dsisr)); > > + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != > > + offsetof(struct user_pt_regs, esr)); > > BUILD_BUG_ON(offsetof(struct pt_regs, result) != > >offsetof(struct user_pt_regs, result)); > > > > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c > > index dfbce527c98e..2164f5705a0b 100644 > > --- a/arch/powerpc/kernel/traps.c > > +++ b/arch/powerpc/kernel/traps.c > > @@ -562,7 +562,7 @@ 
static inline int check_io_access(struct pt_regs *regs) > > #ifdef CONFIG_PPC_ADV_DEBUG_REGS > > /* On 4xx, the reason for the machine check or program exception > > is in the ESR. */ > > -#define get_reason(regs) ((regs)->dsisr) > > +#define get_reason(regs) ((regs)->esr) > > #define REASON_FPESR_FP > > #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) > > #define REASON_PRIVILEGEDESR_PPR > > diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c > > index a8d0ce85d39a..62953d4e7c93 100644 > > --- a/arch/powerpc/mm/fault.c > > +++ b/arch/powerpc/mm/fault.c > > @@ -541,7 +541,11 @@ static __always_inline void __do_page_f
Re: [PATCH] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs
On 6/29/21 3:15 PM, Cédric Le Goater wrote: > On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at > runtime. Today, the IPI is not created for such nodes, and hot-plugged > CPUs use a bogus IPI, which leads to soft lockups. > > We could create the node IPI on demand but it is a bit complex because > this code would be called under bringup_up() and some IRQ locking is > being done. The simplest solution is to create the IPIs for all nodes > at startup. > > Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node") > Cc: sta...@vger.kernel.org # v5.13 > Reported-by: Geetika Moolchandani > Cc: Srikar Dronamraju > Signed-off-by: Cédric Le Goater > --- > > This patch breaks old versions of irqbalance (<= v1.4). Possible nodes > are collected from /sys/devices/system/node/ but CPU-less nodes are > not listed there. When interrupts are scanned, the link representing > the node structure is NULL and segfault occurs. This is an irqbalance regression due to : https://github.com/Irqbalance/irqbalance/pull/172 I will report through an issue. Anyhow, there is a better approach which is to allocate IPIs for all nodes at boot time and do the mapping on demand. Removing the mapping on last use seems more complex though. I will send a v2 after some tests. Thanks, C. > Version 1.7 seems immune. > > --- > arch/powerpc/sysdev/xive/common.c | 4 > 1 file changed, 4 deletions(-) > > diff --git a/arch/powerpc/sysdev/xive/common.c > b/arch/powerpc/sysdev/xive/common.c > index f3b16ed48b05..5d2c58dba57e 100644 > --- a/arch/powerpc/sysdev/xive/common.c > +++ b/arch/powerpc/sysdev/xive/common.c > @@ -1143,10 +1143,6 @@ static int __init xive_request_ipi(void) > struct xive_ipi_desc *xid = &xive_ipis[node]; > struct xive_ipi_alloc_info info = { node }; > > - /* Skip nodes without CPUs */ > - if (cpumask_empty(cpumask_of_node(node))) > - continue; > - > /* >* Map one IPI interrupt per node for all cpus of that node. 
>* Since the HW interrupt number doesn't have any meaning, >
Re: [PATCH printk v1 03/10] kgdb: delay roundup if holding printk cpulock
On Thu, Aug 05, 2021 at 05:52:43AM +0206, John Ogness wrote: > On 2021-08-04, Daniel Thompson wrote: > > On Wed, Aug 04, 2021 at 02:12:22PM +0200, Petr Mladek wrote: > >> On Wed 2021-08-04 12:31:59, Daniel Thompson wrote: > >> > On Tue, Aug 03, 2021 at 05:36:32PM +0206, John Ogness wrote: > >> > > On 2021-08-03, Daniel Thompson wrote: > >> > > > On Tue, Aug 03, 2021 at 03:18:54PM +0206, John Ogness wrote: > >> > > >> kgdb makes use of its own cpulock (@dbg_master_lock, @kgdb_active) > >> > > >> during cpu roundup. This will conflict with the printk cpulock. > >> > > > > >> > > > When the full vision is realized what will be the purpose of the > >> > > > printk > >> > > > cpulock? > >> > > > > >> > > > I'm asking largely because it's current role is actively unhelpful > >> > > > w.r.t. kdb. It is possible that cautious use of in_dbg_master() might > >> > > > be a better (and safer) solution. However it sounds like there is a > >> > > > larger role planned for the printk cpulock... > >> > > > >> > > The printk cpulock is used as a synchronization mechanism for > >> > > implementing atomic consoles, which need to be able to safely interrupt > >> > > the console write() activity at any time and immediately continue with > >> > > their own printing. The ultimate goal is to move all console printing > >> > > into per-console dedicated kthreads, so the primary function of the > >> > > printk cpulock is really to immediately _stop_ the CPU/kthread > >> > > performing write() in order to allow write_atomic() (from any context > >> > > on > >> > > any CPU) to safely and reliably take over. > >> > > >> > I see. > >> > > >> > Is there any mileage in allowing in_dbg_master() to suppress taking > >> > the console lock? > >> > > >> > There's a couple of reasons to worry about the current approach. 
> >> > > >> > The first is that we don't want this code to trigger in the case when > >> > kgdb is enabled and kdb is not since it is only kdb (a self-hosted > >> > debugger) than uses the consoles. This case is relatively trivial to > >> > address since we can rename it kdb_roundup_delay() and alter the way it > >> > is conditionally compiled. > > Well, _I_ want this code to trigger even without kdb. The printk cpulock > is meant to be the innermost locking for the entire kernel. No code is > allowed to block/spin on any kind of lock if holding the printk > cpulock. This is the only way to guarantee the functionality of the > atomic consoles. > > For example, if the kernel were to crash while inside kgdb code, we want > to see the backtrace. That would certainly help me debug any such problems in kgdb ;-) . > Since kgdb _does_ take locks (spinning on @dbg_slave_lock during roundup > and the master's own cpu lock as a retry loop on @dbg_master_lock), > clearly it is not allowed to hold the printk cpulock. The simplest > solution I could find was just to make sure kgdb_cpu_enter() isn't > called while holding the printk cpulock. We might have to come back to this. I'm pretty certain your patch does not currently achieve this goal. > >> > The second is more of a problem however. kdb will only call into the > >> > console code from the debug master. By default this is the CPU that > >> > takes the debug trap so initial prints will work fine. However it is > >> > possible to switch to a different master (so we can read per-CPU > >> > registers and things like that). This will result in one of the CPUs > >> > that did the IPI round up calling into console code and this is unsafe > >> > in that instance. > > It is only unsafe if a CPU enters "kgdb/kdb context" while holding the > printk cpulock. That is what I want to prevent. Currently you can preventing this only for CPUs that enter the debugger via an IPI. 
CPUs that enter due to a breakpoint (and there can be more than one at a time) cannot just continue until the lock is dropped since they would end up re-executing the breakpoint instruction. > >> > There are a couple of tricks we could adopt to work around this but > >> > given the slightly odd calling context for kdb (all CPUs quiesced, no > >> > log interleaving possible) it sounds like it would remain safe to > >> > bypass the lock if in_dbg_master() is true. > >> > > >> > Bypassing an inconvenient lock might sound icky but: > >> > > >> > 1. If the lock is not owned by any CPU then what kdb will do is safe. > > No. The printk cpulock exists for low-level synchronization. The atomic > consoles need this synchronization. (For example, the 8250 needs this > for correct tracking of its interrupt register, even for > serial8250_put_poll_char().) What I mean is that because kdb is mono-threaded (even on SMP systems due to the quiescing of other CPUs) then if the lock is not taken when we enter kdb then it is safe for kdb to contend for the lock in the normal way since it cannot deadlock. BTW the synchronization in question is the need for strict nesting, is that right? (e.g. that each context that recursiv
Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
Excerpts from Athira Rajeev's message of August 6, 2021 7:28 pm: > > >> On 26-Jul-2021, at 9:19 AM, Nicholas Piggin wrote: >> >> It can be useful in simulators (with very constrained environments) >> to allow some PMCs to run from boot so they can be sampled directly >> by a test harness, rather than having to run perf. >> >> A previous change freezes counters at boot by default, so provide >> a boot time option to un-freeze (plus a bit more flexibility). >> >> Signed-off-by: Nicholas Piggin >> --- >> .../admin-guide/kernel-parameters.txt | 7 >> arch/powerpc/perf/core-book3s.c | 35 +++ >> 2 files changed, 42 insertions(+) >> >> diff --git a/Documentation/admin-guide/kernel-parameters.txt >> b/Documentation/admin-guide/kernel-parameters.txt >> index bdb22006f713..96b7d0ebaa40 100644 >> --- a/Documentation/admin-guide/kernel-parameters.txt >> +++ b/Documentation/admin-guide/kernel-parameters.txt >> @@ -4089,6 +4089,13 @@ >> Override pmtimer IOPort with a hex value. >> e.g. pmtmr=0x508 >> >> +pmu=[PPC] Manually enable the PMU. >> +Enable the PMU by setting MMCR0 to 0 (clear FC bit). >> +This option is implemented for Book3S processors. >> +If a number is given, then MMCR1 is set to that number, >> +otherwise (e.g., 'pmu=on'), it is left 0. The perf >> +subsystem is disabled if this option is used. >> + >> pm_debug_messages [SUSPEND,KNL] >> Enable suspend/resume debug messages during boot up. 
>> >> diff --git a/arch/powerpc/perf/core-book3s.c >> b/arch/powerpc/perf/core-book3s.c >> index 65795cadb475..e7cef4fe17d7 100644 >> --- a/arch/powerpc/perf/core-book3s.c >> +++ b/arch/powerpc/perf/core-book3s.c >> @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu) >> } >> >> #ifdef CONFIG_PPC64 >> +static bool pmu_override = false; >> +static unsigned long pmu_override_val; >> +static void do_pmu_override(void *data) >> +{ >> +ppc_set_pmu_inuse(1); >> +if (pmu_override_val) >> +mtspr(SPRN_MMCR1, pmu_override_val); >> +mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC); > > Hi Nick > > Here, we are not doing any validity check for the value used to set MMCR1. > For advanced users, the option to pass value for MMCR1 is fine. But other > cases, it could result in > invalid event getting used. Do we need to restrict this boot time option for > only PMC5/6 ? Depends what would be useful. We don't have to prevent the admin shooting themselves in the foot with options like this, but if we can make it safer without making it less useful then that's always a good option. Thanks, Nick
Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
Excerpts from Madhavan Srinivasan's message of August 6, 2021 5:33 pm: > > On 7/26/21 9:19 AM, Nicholas Piggin wrote: >> It can be useful in simulators (with very constrained environments) >> to allow some PMCs to run from boot so they can be sampled directly >> by a test harness, rather than having to run perf. >> >> A previous change freezes counters at boot by default, so provide >> a boot time option to un-freeze (plus a bit more flexibility). >> >> Signed-off-by: Nicholas Piggin >> --- >> .../admin-guide/kernel-parameters.txt | 7 >> arch/powerpc/perf/core-book3s.c | 35 +++ >> 2 files changed, 42 insertions(+) >> >> diff --git a/Documentation/admin-guide/kernel-parameters.txt >> b/Documentation/admin-guide/kernel-parameters.txt >> index bdb22006f713..96b7d0ebaa40 100644 >> --- a/Documentation/admin-guide/kernel-parameters.txt >> +++ b/Documentation/admin-guide/kernel-parameters.txt >> @@ -4089,6 +4089,13 @@ >> Override pmtimer IOPort with a hex value. >> e.g. pmtmr=0x508 >> >> +pmu=[PPC] Manually enable the PMU. > > > This is bit confusing, IIUC, we are manually disabling the perf > registration > with this option and not pmu. > If this option is used, we will unfreeze the > MMCR0_FC (only in the HV_mode) and not register perf subsystem. With the previous patch, this option un-freezes the PMU (and disables perf). > Since this option is valid only for HV_mode, canwe call it > kvm_disable_perf or kvm_dis_perf. It's only disabled for guests because it would require a bit of logic to set pmcregs_in_use when we register our lppaca. We could add that if needed, but the intention is for use on BML, not exactly KVM specific. I can add HV restriction to the help text. And we could rename the option. free_run_pmu= or something? Thanks, Nick > > >> +Enable the PMU by setting MMCR0 to 0 (clear FC bit). >> +This option is implemented for Book3S processors. >> +If a number is given, then MMCR1 is set to that number, >> +otherwise (e.g., 'pmu=on'), it is left 0. 
The perf >> +subsystem is disabled if this option is used. >> + >> pm_debug_messages [SUSPEND,KNL] >> Enable suspend/resume debug messages during boot up. >> >> diff --git a/arch/powerpc/perf/core-book3s.c >> b/arch/powerpc/perf/core-book3s.c >> index 65795cadb475..e7cef4fe17d7 100644 >> --- a/arch/powerpc/perf/core-book3s.c >> +++ b/arch/powerpc/perf/core-book3s.c >> @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu) >> } >> >> #ifdef CONFIG_PPC64 >> +static bool pmu_override = false; >> +static unsigned long pmu_override_val; >> +static void do_pmu_override(void *data) >> +{ >> +ppc_set_pmu_inuse(1); >> +if (pmu_override_val) >> +mtspr(SPRN_MMCR1, pmu_override_val); >> +mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC); >> +} >> + >> static int __init init_ppc64_pmu(void) >> { >> +if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) { >> +printk(KERN_WARNING "perf: disabling perf due to pmu= command >> line option.\n"); >> +on_each_cpu(do_pmu_override, NULL, 1); >> +return 0; >> +} >> + >> /* run through all the pmu drivers one at a time */ >> if (!init_power5_pmu()) >> return 0; >> @@ -2451,4 +2467,23 @@ static int __init init_ppc64_pmu(void) >> return init_generic_compat_pmu(); >> } >> early_initcall(init_ppc64_pmu); >> + >> +static int __init pmu_setup(char *str) >> +{ >> +unsigned long val; >> + >> +if (!early_cpu_has_feature(CPU_FTR_HVMODE)) >> +return 0; >> + >> +pmu_override = true; >> + >> +if (kstrtoul(str, 0, &val)) >> +val = 0; >> + >> +pmu_override_val = val; >> + >> +return 1; >> +} >> +__setup("pmu=", pmu_setup); >> + >> #endif >
Re: [PATCH v1 14/55] KVM: PPC: Book3S HV: Don't always save PMU for guest capable of nesting
Excerpts from Michael Ellerman's message of August 6, 2021 5:34 pm: > Nicholas Piggin writes: >> Revert the workaround added by commit 63279eeb7f93a ("KVM: PPC: Book3S >> HV: Always save guest pmu for guest capable of nesting"). >> >> Nested capable guests running with the earlier commit ("KVM: PPC: Book3S >> HV Nested: Indicate guest PMU in-use in VPA") will now indicate the PMU >> in-use status of their guests, which means the parent does not need to >> unconditionally save the PMU for nested capable guests. >> >> This will cause the PMU to break for nested guests when running older >> nested hypervisor guests under a kernel with this change. It's unclear >> there's an easy way to avoid that, so this could wait for a release or >> so for the fix to filter into stable kernels. > > I'm not sure PMU inside nested guests is getting much use, but I don't > think we can break this quite so casually :) > > Especially as the failure mode will be PMU counts that don't match > reality, which is hard to diagnose. It took nearly a year for us to find > the original bug. > > I think we need to hold this back for a while. > > We could put it under a CONFIG option, and then flip that option to off > at some point in the future. Okay that might be a good compromise, I'll do that. Thanks, Nick
Re: [PATCH v1 11/55] powerpc/time: add API for KVM to re-arm the host timer/decrementer
Excerpts from Christophe Leroy's message of August 5, 2021 5:22 pm: > > > Le 26/07/2021 à 05:49, Nicholas Piggin a écrit : >> Rather than have KVM look up the host timer and fiddle with the >> irq-work internal details, have the powerpc/time.c code provide a >> function for KVM to re-arm the Linux timer code when exiting a >> guest. >> >> This is implementation has an improvement over existing code of >> marking a decrementer interrupt as soft-pending if a timer has >> expired, rather than setting DEC to a -ve value, which tended to >> cause host timers to take two interrupts (first hdec to exit the >> guest, then the immediate dec). >> >> Signed-off-by: Nicholas Piggin >> --- >> arch/powerpc/include/asm/time.h | 16 +++--- >> arch/powerpc/kernel/time.c | 52 +++-- >> arch/powerpc/kvm/book3s_hv.c| 7 ++--- >> 3 files changed, 49 insertions(+), 26 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/time.h >> b/arch/powerpc/include/asm/time.h >> index 69b6be617772..924b2157882f 100644 >> --- a/arch/powerpc/include/asm/time.h >> +++ b/arch/powerpc/include/asm/time.h >> @@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 >> dividend_low, >> extern void secondary_cpu_time_init(void); >> extern void __init time_init(void); >> >> -#ifdef CONFIG_PPC64 >> -static inline unsigned long test_irq_work_pending(void) >> -{ >> -unsigned long x; >> - >> -asm volatile("lbz %0,%1(13)" >> -: "=r" (x) >> -: "i" (offsetof(struct paca_struct, irq_work_pending))); >> -return x; >> -} >> -#endif >> - >> DECLARE_PER_CPU(u64, decrementers_next_tb); >> >> static inline u64 timer_get_next_tb(void) >> @@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void) >> return __this_cpu_read(decrementers_next_tb); >> } >> >> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE >> +void timer_rearm_host_dec(u64 now); >> +#endif >> + >> /* Convert timebase ticks to nanoseconds */ >> unsigned long long tb_to_ns(unsigned long long tb_ticks); >> >> diff --git a/arch/powerpc/kernel/time.c 
b/arch/powerpc/kernel/time.c >> index 72d872b49167..016828b7401b 100644 >> --- a/arch/powerpc/kernel/time.c >> +++ b/arch/powerpc/kernel/time.c >> @@ -499,6 +499,16 @@ EXPORT_SYMBOL(profile_pc); >>* 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... >>*/ >> #ifdef CONFIG_PPC64 >> +static inline unsigned long test_irq_work_pending(void) >> +{ >> +unsigned long x; >> + >> +asm volatile("lbz %0,%1(13)" >> +: "=r" (x) >> +: "i" (offsetof(struct paca_struct, irq_work_pending))); > > Can we just use READ_ONCE() instead of hard coding the read ? Good question, probably yes. Probably calls for its own patch series, e.g., hw_irq.h has all similar paca accesses. Thanks, Nick
Re: [PATCH v1 02/55] KVM: PPC: Book3S HV P9: Fixes for TM softpatch interrupt
Excerpts from Michael Ellerman's message of August 6, 2021 11:16 am: > Nicholas Piggin writes: >> The softpatch interrupt sets HSRR0 to the faulting instruction +4, so >> it should subtract 4 for the faulting instruction address. Also have it >> emulate and deliver HFAC interrupts correctly, which is important for >> nested HV and facility demand-faulting in future. > > The nip being off by 4 sounds bad. But I guess it's not that big a deal > because it's only used for reporting the instruction address? Yeah currently I think so. It's not that bad of a bug. > > Would also be good to have some more explanation of why it's OK to > change from illegal to HFAC, which is a guest visible change. Good point. Again for now it doesn't really matter because the HFAC handler turns everything (except msgsndp) into a sigill anyway, so becomes important when we start using HFACs. Put that way I'll probably split it out. > >> diff --git a/arch/powerpc/kvm/book3s_hv_tm.c >> b/arch/powerpc/kvm/book3s_hv_tm.c >> index cc90b8b82329..e4fd4a9dee08 100644 >> --- a/arch/powerpc/kvm/book3s_hv_tm.c >> +++ b/arch/powerpc/kvm/book3s_hv_tm.c >> @@ -74,19 +74,23 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) >> case PPC_INST_RFEBB: >> if ((msr & MSR_PR) && (vcpu->arch.vcore->pcr & PCR_ARCH_206)) { >> /* generate an illegal instruction interrupt */ >> +vcpu->arch.regs.nip -= 4; >> kvmppc_core_queue_program(vcpu, SRR1_PROGILL); >> return RESUME_GUEST; >> } >> /* check EBB facility is available */ >> if (!(vcpu->arch.hfscr & HFSCR_EBB)) { >> -/* generate an illegal instruction interrupt */ >> -kvmppc_core_queue_program(vcpu, SRR1_PROGILL); >> -return RESUME_GUEST; >> +vcpu->arch.regs.nip -= 4; >> +vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; >> +vcpu->arch.hfscr |= (u64)FSCR_EBB_LG << 56; >> +vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; >> +return -1; /* rerun host interrupt handler */ > > This is EBB not TM. Probably OK to leave it in this patch as long as > it's mentioned in the change log? 
It is, but you can get a softpatch interrupt on rfebb changing TM state. Although I haven't actually tested to see if you get a softpatch when HFSCR disables EBB or the hardware just gives you the HFAC. For that matter, same for all the other facility tests. Thanks, Nick > >> } >> if ((msr & MSR_PR) && !(vcpu->arch.fscr & FSCR_EBB)) { >> /* generate a facility unavailable interrupt */ >> -vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | >> -((u64)FSCR_EBB_LG << 56); >> +vcpu->arch.regs.nip -= 4; >> +vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; >> +vcpu->arch.fscr |= (u64)FSCR_EBB_LG << 56; > > Same. > > > cheers >
Re: [PATCH v2 2/3] KVM: PPC: Book3S HV: Add sanity check to copy_tofrom_guest
Excerpts from Fabiano Rosas's message of August 6, 2021 7:26 am: > Both paths into __kvmhv_copy_tofrom_guest_radix ensure that we arrive > with an effective address that is smaller than our total addressable > space and addresses quadrant 0. > > - The H_COPY_TOFROM_GUEST hypercall path rejects the call with > H_PARAMETER if the effective address has any of the twelve most > significant bits set. > > - The kvmhv_copy_tofrom_guest_radix path clears the top twelve bits > before calling the internal function. > > Although the callers make sure that the effective address is sane, any > future use of the function is exposed to a programming error, so add a > sanity check. We possibly should put these into #defines in radix pgtable headers somewhere but KVM already open codes them so this is good for now. Reviewed-by: Nicholas Piggin > > Suggested-by: Nicholas Piggin > Signed-off-by: Fabiano Rosas > --- > arch/powerpc/kvm/book3s_64_mmu_radix.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c > b/arch/powerpc/kvm/book3s_64_mmu_radix.c > index 44eb7b1ef289..1b1c9e9e539b 100644 > --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c > +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c > @@ -44,6 +44,9 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int > pid, > (to != NULL) ? __pa(to): 0, > (from != NULL) ? __pa(from): 0, n); > > + if (eaddr & (0xFFFUL << 52)) > + return ret; > + > quadrant = 1; > if (!pid) > quadrant = 2; > -- > 2.29.2 > >
Re: [PATCH v2 1/3] KVM: PPC: Book3S HV: Fix copy_tofrom_guest routines
Excerpts from Fabiano Rosas's message of August 6, 2021 7:26 am: > The __kvmhv_copy_tofrom_guest_radix function was introduced along with > nested HV guest support. It uses the platform's Radix MMU quadrants to > provide a nested hypervisor with fast access to its nested guests > memory (H_COPY_TOFROM_GUEST hypercall). It has also since been added > as a fast path for the kvmppc_ld/st routines which are used during > instruction emulation. > > The commit def0bfdbd603 ("powerpc: use probe_user_read() and > probe_user_write()") changed the low level copy function from > raw_copy_from_user to probe_user_read, which adds a check to > access_ok. In powerpc that is: > > static inline bool __access_ok(unsigned long addr, unsigned long size) > { > return addr < TASK_SIZE_MAX && size <= TASK_SIZE_MAX - addr; > } > > and TASK_SIZE_MAX is 0x0010UL for 64-bit, which means that > setting the two MSBs of the effective address (which correspond to the > quadrant) now cause access_ok to reject the access. > > This was not caught earlier because the most common code path via > kvmppc_ld/st contains a fallback (kvm_read_guest) that is likely to > succeed for L1 guests. For nested guests there is no fallback. > > Another issue is that probe_user_read (now __copy_from_user_nofault) > does not return the number of bytes not copied in case of failure, so > the destination memory is not being cleared anymore in > kvmhv_copy_from_guest_radix: > > ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n); > if (ret > 0)<-- always false! > memset(to + (n - ret), 0, ret); > > This patch fixes both issues by skipping access_ok and open-coding the > low level __copy_to/from_user_inatomic. 
> > Fixes: def0bfdbd603 ("powerpc: use probe_user_read() and probe_user_write()") Reviewed-by: Nicholas Piggin > Signed-off-by: Fabiano Rosas > --- > arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 -- > 1 file changed, 4 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c > b/arch/powerpc/kvm/book3s_64_mmu_radix.c > index b5905ae4377c..44eb7b1ef289 100644 > --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c > +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c > @@ -65,10 +65,12 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, > int pid, > } > isync(); > > + pagefault_disable(); > if (is_load) > - ret = copy_from_user_nofault(to, (const void __user *)from, n); > + ret = __copy_from_user_inatomic(to, (const void __user *)from, > n); > else > - ret = copy_to_user_nofault((void __user *)to, from, n); > + ret = __copy_to_user_inatomic((void __user *)to, from, n); > + pagefault_enable(); > > /* switch the pid first to avoid running host with unallocated pid */ > if (quadrant == 1 && pid != old_pid) > -- > 2.29.2 > >
Re: Debian SID kernel doesn't boot on PowerBook 3400c
Le 06/08/2021 à 11:43, Finn Thain a écrit : On Fri, 6 Aug 2021, Christophe Leroy wrote: Can you check if they DO NOT happen at preceding commit c16728835~ $ git checkout c16728835~ Previous HEAD position was c16728835eec powerpc/32: Manage KUAP in C HEAD is now at 0b45359aa2df powerpc/8xx: Create C version of kuap save/restore/check helpers $ git am ../message.mbox warning: Patch sent with format=flowed; space at the end of lines might be lost. Applying: powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE $ cp ../dot-config-powermac-5.13 .config $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux Linux version 5.12.0-rc3-pmac-00077-gc9f6e8dd045 3) PB 3400c Hangs at boot (Mac OS screen) 4) Wallstreet X fails, errors in console log (different than test 2), see Wallstreet_console-2.txt. This log shows that the errors "xfce4-session[1775]: bus error (7)" and "kernel BUG at arch/powerpc/kernel/interrupt.c:49!" happen prior to commit c16728835eec ("powerpc/32: Manage KUAP in C"). As mentionned by Nic, this is due to r11 being cloberred. For the time being the only r11 clobber identified is the one I have provided a fix for. I'm wondering whether it was applied for all further tests or not. Your fix was applied to this build with "git am ../message.mbox". Ok good. ... Could you test with CONFIG_PPC_KUAP and CONFIG_PPC_KUAP_DEBUG ... $scripts/config -e CONFIG_PPC_KUAP $ scripts/config -e CONFIG_PPC_KUAP_DEBUG $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux $ grep CONFIG_PPC_KUAP .config CONFIG_PPC_KUAP=y CONFIG_PPC_KUAP_DEBUG=y Linux version 5.12.0-rc3-pmac-00078-g5cac2bc3752 9) PB 3400c Hangs at boot (Mac OS screen) 10) Wallstreet X failed at first login, worked at second login, one error in console log ("BUG: Unable to handle kernel instruction fetch"), see Wallstreet_console-5.txt. One might expect to see "Kernel attempted to write user page (b3399774) - exploit attempt?" 
again here (see c16728835eec build above) but instead this log says "Oops: Kernel access of bad area, sig: 11". Maybe the test should be done a second time. As r11 is garbage it may or may not be a user address. If it is a user address the we get "Kernel attempted to write user page". If it is a random kernel address, we likely get "Kernel access of bad area" instead. Your fix was applied here also. Anyway, it would be worth trying to boot a few times more with the same kernel, because as I said the value is random, so it may or may not hit userspace, hence the possible difference of message, either "Kernel attempted to write user page" or "Kernel access of bad area" depending on whether the address is a user address or not. I have cooked a tentative fix for that KUAP stuff. Could you try the branch 'bugtest' at https://github.com/chleroy/linux.git Thanks Christophe
Re: Debian SID kernel doesn't boot on PowerBook 3400c
On Fri, 6 Aug 2021, Christophe Leroy wrote: > > > > > > > > > > Can you check if they DO NOT happen at preceding commit c16728835~ > > > > > > > > > > > $ git checkout c16728835~ > > > Previous HEAD position was c16728835eec powerpc/32: Manage KUAP in C > > > HEAD is now at 0b45359aa2df powerpc/8xx: Create C version of kuap > > > save/restore/check helpers > > > $ git am ../message.mbox > > > warning: Patch sent with format=flowed; space at the end of lines might be > > > lost. > > > Applying: powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE > > > $ cp ../dot-config-powermac-5.13 .config > > > $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean > > > olddefconfig vmlinux > > > > > > Linux version 5.12.0-rc3-pmac-00077-gc9f6e8dd045 > > > > > > 3) PB 3400c > > > Hangs at boot (Mac OS screen) > > > > > > 4) Wallstreet > > > X fails, errors in console log (different than test 2), see > > > Wallstreet_console-2.txt. > > > > > > > This log shows that the errors "xfce4-session[1775]: bus error (7)" and > > "kernel BUG at arch/powerpc/kernel/interrupt.c:49!" happen prior to commit > > c16728835eec ("powerpc/32: Manage KUAP in C"). > > As mentionned by Nic, this is due to r11 being cloberred. For the time being > the only r11 clobber identified is the one I have provided a fix for. I'm > wondering whether it was applied for all further tests or not. > Your fix was applied to this build with "git am ../message.mbox". > ... > > > > > > > > > > > > Could you test with CONFIG_PPC_KUAP and CONFIG_PPC_KUAP_DEBUG > > > ... 
> > > > > > $scripts/config -e CONFIG_PPC_KUAP > > > $ scripts/config -e CONFIG_PPC_KUAP_DEBUG > > > $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean > > > olddefconfig vmlinux > > > $ grep CONFIG_PPC_KUAP .config > > > CONFIG_PPC_KUAP=y > > > CONFIG_PPC_KUAP_DEBUG=y > > > > > > Linux version 5.12.0-rc3-pmac-00078-g5cac2bc3752 > > > > > > 9) PB 3400c > > > Hangs at boot (Mac OS screen) > > > > > > 10) Wallstreet > > > X failed at first login, worked at second login, one error in console > > > log ("BUG: Unable to handle kernel instruction fetch"), see > > > Wallstreet_console-5.txt. > > > > > > > One might expect to see "Kernel attempted to write user page (b3399774) - > > exploit attempt?" again here (see c16728835eec build above) but instead > > this log says "Oops: Kernel access of bad area, sig: 11". > > Maybe the test should be done a second time. As r11 is garbage it may or > may not be a user address. If it is a user address the we get "Kernel > attempted to write user page". If it is a random kernel address, we > likely get "Kernel access of bad area" instead. > Your fix was applied here also.
Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
> On 26-Jul-2021, at 9:19 AM, Nicholas Piggin wrote: > > It can be useful in simulators (with very constrained environments) > to allow some PMCs to run from boot so they can be sampled directly > by a test harness, rather than having to run perf. > > A previous change freezes counters at boot by default, so provide > a boot time option to un-freeze (plus a bit more flexibility). > > Signed-off-by: Nicholas Piggin > --- > .../admin-guide/kernel-parameters.txt | 7 > arch/powerpc/perf/core-book3s.c | 35 +++ > 2 files changed, 42 insertions(+) > > diff --git a/Documentation/admin-guide/kernel-parameters.txt > b/Documentation/admin-guide/kernel-parameters.txt > index bdb22006f713..96b7d0ebaa40 100644 > --- a/Documentation/admin-guide/kernel-parameters.txt > +++ b/Documentation/admin-guide/kernel-parameters.txt > @@ -4089,6 +4089,13 @@ > Override pmtimer IOPort with a hex value. > e.g. pmtmr=0x508 > > + pmu=[PPC] Manually enable the PMU. > + Enable the PMU by setting MMCR0 to 0 (clear FC bit). > + This option is implemented for Book3S processors. > + If a number is given, then MMCR1 is set to that number, > + otherwise (e.g., 'pmu=on'), it is left 0. The perf > + subsystem is disabled if this option is used. > + > pm_debug_messages [SUSPEND,KNL] > Enable suspend/resume debug messages during boot up. > > diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c > index 65795cadb475..e7cef4fe17d7 100644 > --- a/arch/powerpc/perf/core-book3s.c > +++ b/arch/powerpc/perf/core-book3s.c > @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu) > } > > #ifdef CONFIG_PPC64 > +static bool pmu_override = false; > +static unsigned long pmu_override_val; > +static void do_pmu_override(void *data) > +{ > + ppc_set_pmu_inuse(1); > + if (pmu_override_val) > + mtspr(SPRN_MMCR1, pmu_override_val); > + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC); Hi Nick Here, we are not doing any validity check for the value used to set MMCR1. 
For advanced users, the option to pass value for MMCR1 is fine. But other cases, it could result in invalid event getting used. Do we need to restrict this boot time option for only PMC5/6 ? Thanks Athira > +} > + > static int __init init_ppc64_pmu(void) > { > + if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) { > + printk(KERN_WARNING "perf: disabling perf due to pmu= command > line option.\n"); > + on_each_cpu(do_pmu_override, NULL, 1); > + return 0; > + } > + > /* run through all the pmu drivers one at a time */ > if (!init_power5_pmu()) > return 0; > @@ -2451,4 +2467,23 @@ static int __init init_ppc64_pmu(void) > return init_generic_compat_pmu(); > } > early_initcall(init_ppc64_pmu); > + > +static int __init pmu_setup(char *str) > +{ > + unsigned long val; > + > + if (!early_cpu_has_feature(CPU_FTR_HVMODE)) > + return 0; > + > + pmu_override = true; > + > + if (kstrtoul(str, 0, &val)) > + val = 0; > + > + pmu_override_val = val; > + > + return 1; > +} > +__setup("pmu=", pmu_setup); > + > #endif > -- > 2.23.0 >
Re: [PATCH] powerpc/kprobes: Fix kprobe Oops happens in booke
On 2021/8/5 17:51, Christophe Leroy wrote: Le 04/08/2021 à 16:37, Pu Lehui a écrit : When using kprobe on powerpc booke series processor, Oops happens as show bellow: [ 35.861352] Oops: Exception in kernel mode, sig: 5 [#1] [ 35.861676] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500 [ 35.861905] Modules linked in: [ 35.862144] CPU: 0 PID: 76 Comm: sh Not tainted 5.14.0-rc3-00060-g7e96bf476270 #18 [ 35.862610] NIP: c0b96470 LR: c00107b4 CTR: c0161c80 [ 35.862805] REGS: c387fe70 TRAP: 0700 Not tainted (5.14.0-rc3-00060-g7e96bf476270) [ 35.863198] MSR: 00029002 CR: 24022824 XER: 2000 [ 35.863577] [ 35.863577] GPR00: c0015218 c387ff20 c313e300 c387ff50 0004 4002 4000 0a1a2cce [ 35.863577] GPR08: 0004 59764000 24022422 102490c2 [ 35.863577] GPR16: 0040 1024 1024 1024 1024 1022 [ 35.863577] GPR24: 1024 bfc655e8 0800 c387ff50 [ 35.865367] NIP [c0b96470] schedule+0x0/0x130 [ 35.865606] LR [c00107b4] interrupt_exit_user_prepare_main+0xf4/0x100 [ 35.865974] Call Trace: [ 35.866142] [c387ff20] [c0053224] irq_exit+0x114/0x120 (unreliable) [ 35.866472] [c387ff40] [c0015218] interrupt_return+0x14/0x13c [ 35.866728] --- interrupt: 900 at 0x100af3dc [ 35.866963] NIP: 100af3dc LR: 100de020 CTR: [ 35.867177] REGS: c387ff50 TRAP: 0900 Not tainted (5.14.0-rc3-00060-g7e96bf476270) [ 35.867488] MSR: 0002f902 CR: 20022422 XER: 2000 [ 35.867808] [ 35.867808] GPR00: c001509c bfc65570 1024b4d0 100de020 20022422 bfc655a8 100af3dc [ 35.867808] GPR08: 0002f902 72656773 102490c2 [ 35.867808] GPR16: 0040 1024 1024 1024 1024 1022 [ 35.867808] GPR24: 1024 bfc655e8 10245910 0001 [ 35.869406] NIP [100af3dc] 0x100af3dc [ 35.869578] LR [100de020] 0x100de020 [ 35.869751] --- interrupt: 900 [ 35.870001] Instruction dump: [ 35.870283] 40c20010 815e0518 714a0100 41e2fd04 3920 913e00c0 3b1e0450 4bfffd80 [ 35.870666] 0fe0 92a10024 4bfff1a9 6000 <7fe8> 7c0802a6 93e1001c 7c5f1378 [ 35.871339] ---[ end trace 23ff848139efa9b9 ]--- There is no real mode for booke arch and the MMU translation is always on. 
The corresponding MSR_IS/MSR_DS bit in booke is used to switch the address space, but not for real mode judgment. Can you explain more the link between that explanation and the Oops itself ? In fact, the same Oops appears when any probed function is hit, like do_nanosleep / # echo "p:myprobe do_nanosleep" > /sys/kernel/debug/tracing/kprobe_events / # echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable / # sleep 1 [ 50.076730] Oops: Exception in kernel mode, sig: 5 [#1] [ 50.077017] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500 [ 50.077221] Modules linked in: [ 50.077462] CPU: 0 PID: 77 Comm: sleep Not tainted 5.14.0-rc4-00022-g251a1524293d #21 [ 50.077887] NIP: c0b9c4e0 LR: c00ebecc CTR: [ 50.078067] REGS: c3883de0 TRAP: 0700 Not tainted (5.14.0-rc4-00022-g251a1524293d) [ 50.078349] MSR: 00029000 CR: 24000228 XER: 2000 [ 50.078675] [ 50.078675] GPR00: c00ebdf0 c3883e90 c313e300 c3883ea0 0001 c3883ecc 0001 [ 50.078675] GPR08: c100598c c00ea250 0004 24000222 102490c2 bff4180c 101e60d4 [ 50.078675] GPR16: 102454ac 0040 1024 10241100 102410f8 1024 0050 [ 50.078675] GPR24: 0002 c3883ea0 0001 c350 3b9b8d50 [ 50.080151] NIP [c0b9c4e0] do_nanosleep+0x0/0x190 [ 50.080352] LR [c00ebecc] hrtimer_nanosleep+0x14c/0x1e0 [ 50.080638] Call Trace: [ 50.080801] [c3883e90] [c00ebdf0] hrtimer_nanosleep+0x70/0x1e0 (unreliable) [ 50.081110] [c3883f00] [c00ec004] sys_nanosleep_time32+0xa4/0x110 [ 50.081336] [c3883f40] [c001509c] ret_from_syscall+0x0/0x28 [ 50.081541] --- interrupt: c00 at 0x100a4d08 [ 50.081749] NIP: 100a4d08 LR: 101b5234 CTR: 0003 [ 50.081931] REGS: c3883f50 TRAP: 0c00 Not tainted (5.14.0-rc4-00022-g251a1524293d) [ 50.082183] MSR: 0002f902 CR: 24000222 XER: [ 50.082457] [ 50.082457] GPR00: 00a2 bf980040 1024b4d0 bf980084 bf980084 6400 00555345 fefefeff [ 50.082457] GPR08: 7f7f7f7f 101e 0069 0003 28000422 102490c2 bff4180c 101e60d4 [ 50.082457] GPR16: 102454ac 0040 1024 10241100 102410f8 1024 0050 [ 50.082457] GPR24: 0002 bf9803f4 1024 100039e0 102444e8 [ 
50.083789] NIP [100a4d08] 0x100a4d08 [ 50.083917] LR [101b5234] 0x101b5234 [ 50.084042] --- interrupt: c00 [ 50.084238] Instruction dump: [ 50.084483] 4bfffc40 6000 6000 6000 9421fff0 39400402 914200c0 38210010 [ 50.084841] 4bfffc20 <7fe8> 7c0802a6
Re: [PATCH v4] soc: fsl: qe: convert QE interrupt controller to platform_device
On Thu, Aug 5, 2021 at 9:35 PM Maxim Kochetkov wrote: > > 03.08.2021 20:51, Saravana Kannan wrote: > >> So lets convert this driver to simple platform_device with probe(). > >> Also use platform_get_ and devm_ family function to get/allocate > >> resources and drop unused .compatible = "qeic". > > Yes, please! > > Should I totally drop { .type = "qeic"}, or keep? Sorry for the confusion. My "Yes, please"!" was a show of support for switching this to a proper platform driver. Not a response to that specific question. I didn't look at the code/DT close enough to know/care about the "type" part. -Saravana
Re: [PATCH v4] soc: fsl: qe: convert QE interrupt controller to platform_device
03.08.2021 20:51, Saravana Kannan wrote: So lets convert this driver to simple platform_device with probe(). Also use platform_get_ and devm_ family function to get/allocate resources and drop unused .compatible = "qeic". Yes, please! Should I totally drop { .type = "qeic"}, or keep?
Re: [RFC PATCH] powerpc/book3s64/radix: Upgrade va tlbie to PID tlbie if we cross PMD_SIZE
> With shared mapping, even though we are unmapping a large range, the kernel > will force a TLB flush with ptl lock held to avoid the race mentioned in > commit 1cf35d47712d ("mm: split 'tlb_flush_mmu()' into tlb flushing and memory freeing parts") > This results in the kernel issuing a high number of TLB flushes even for a large > range. This can be improved by making sure the kernel switch to pid based flush if the > kernel is unmapping a 2M range. > > Signed-off-by: Aneesh Kumar K.V > --- > arch/powerpc/mm/book3s64/radix_tlb.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c > index aefc100d79a7..21d0f098e43b 100644 > --- a/arch/powerpc/mm/book3s64/radix_tlb.c > +++ b/arch/powerpc/mm/book3s64/radix_tlb.c > @@ -1106,7 +1106,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); > * invalidating a full PID, so it has a far lower threshold to change from > * individual page flushes to full-pid flushes. > */ > -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; > +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 32; > static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; > > static inline void __radix__flush_tlb_range(struct mm_struct *mm, > @@ -1133,7 +1133,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, > if (fullmm) > flush_pid = true; > else if (type == FLUSH_TYPE_GLOBAL) > - flush_pid = nr_pages > tlb_single_page_flush_ceiling; > + flush_pid = nr_pages >= tlb_single_page_flush_ceiling; > else > flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; Additional details on the test environment. This was tested on a 2 Node/8 socket Power10 system. The LPAR had 105 cores and the LPAR spanned across all the sockets. 
# perf stat -I 1000 -a -e cycles,instructions -e "{cpu/config=0x030008,name=PM_EXEC_STALL/}" -e "{cpu/config=0x02E01C,name=PM_EXEC_STALL_TLBIE/}" ./tlbie -i 10 -c 1 -t 1 Rate of work: = 176 # time counts unit events 1.029206442 4198594519 cycles 1.029206442 2458254252 instructions # 0.59 insn per cycle 1.029206442 3004031488 PM_EXEC_STALL 1.029206442 1798186036 PM_EXEC_STALL_TLBIE Rate of work: = 181 2.054288539 4183883450 cycles 2.054288539 2472178171 instructions # 0.59 insn per cycle 2.054288539 3014609313 PM_EXEC_STALL 2.054288539 1797851642 PM_EXEC_STALL_TLBIE Rate of work: = 180 3.078306883 4171250717 cycles 3.078306883 2468341094 instructions # 0.59 insn per cycle 3.078306883 2993036205 PM_EXEC_STALL 3.078306883 1798181890 PM_EXEC_STALL_TLBIE . . # cat /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling 34 # echo 32 > /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling # perf stat -I 1000 -a -e cycles,instructions -e "{cpu/config=0x030008,name=PM_EXEC_STALL/}" -e "{cpu/config=0x02E01C,name=PM_EXEC_STALL_TLBIE/}" ./tlbie -i 10 -c 1 -t 1 Rate of work: = 313 # time counts unit events 1.030310506 4206071143 cycles 1.030310506 4314716958 instructions # 1.03 insn per cycle 1.030310506 2157762167 PM_EXEC_STALL 1.030310506 110825573 PM_EXEC_STALL_TLBIE Rate of work: = 322 2.056034068 4331745630 cycles 2.056034068 4531658304 instructions # 1.05 insn per cycle 2.056034068 2288971361 PM_EXEC_STALL 2.056034068 111267927 PM_EXEC_STALL_TLBIE Rate of work: = 321 3.081216434 4327050349 cycles 3.081216434 4379679508 instructions # 1.01 insn per cycle 3.081216434 2252602550 PM_EXEC_STALL 3.081216434 110974887 PM_EXEC_STALL_TLBIE . . Regards, Puvichakravarthy Ramachandran
Re: [PATCH v2 0/6] PCI: Drop duplicated tracking of a pci_dev's bound driver
Hello Bjorn, On Thu, Aug 05, 2021 at 06:42:34PM -0500, Bjorn Helgaas wrote: > On Tue, Aug 03, 2021 at 12:01:44PM +0200, Uwe Kleine-König wrote: > > Hello, > > > > changes since v1 > > (https://lore.kernel.org/linux-pci/20210729203740.1377045-1-u.kleine-koe...@pengutronix.de): > > > > - New patch to simplify drivers/pci/xen-pcifront.c, spotted and > > suggested by Boris Ostrovsky > > - Fix a possible NULL pointer dereference I introduced in xen-pcifront.c > > - A few whitespace improvements > > - Add a commit log to patch #6 (formerly #5) > > > > I also expanded the audience for patches #4 and #6 to allow affected > > people to actually see the changes to their drivers. > > > > Interdiff can be found below. > > > > The idea is still the same: After a few cleanups (#1 - #3) a new macro > > is introduced abstracting access to struct pci_dev->driver. All users > > are then converted to use this and in the last patch the macro is > > changed to make use of struct pci_dev::dev->driver to get rid of the > > duplicated tracking. > > I love the idea of this series! \o/ > I looked at all the bus_type.probe() methods, it looks like pci_dev is > not the only offender here. At least the following also have a driver > pointer in the device struct: > > parisc_device.driver > acpi_device.driver > dio_dev.driver > hid_device.driver > pci_dev.driver > pnp_dev.driver > rio_dev.driver > zorro_dev.driver Right, when I converted zorro_dev it was pointed out that the code was copied from pci and the latter has the same construct. :-) See https://lore.kernel.org/r/20210730191035.1455248-5-u.kleine-koe...@pengutronix.de for the patch, I don't find where pci was pointed out, maybe it was on irc only. > Do you plan to do the same for all of them, or is there some reason > why they need the pointer and PCI doesn't? There is a list of cleanup stuff I intend to work on. 
Considering how working on that list only made it longer in the recent past, maybe it makes more sense to not work on it :-) > In almost all cases, other buses define a "to__driver()" > interface. In fact, PCI already has a to_pci_driver(). > > This series adds pci_driver_of_dev(), which basically just means we > can do this: > > pdrv = pci_driver_of_dev(pdev); > > instead of this: > > pdrv = to_pci_driver(pdev->dev.driver); > > I don't see any other "_driver_of_dev()" interfaces, so I assume > other buses just live with the latter style? I'd rather not be > different and have two ways to get the "struct pci_driver *" unless > there's a good reason. Among the few busses I already fixed in this regard pci was the first that has a considerable amount of usage. So I considered it worth giving it a name. > Looking through the places that care about pci_dev.driver (the ones > updated by patch 5/6), many of them are ... a little dubious to begin > with. A few need the "struct pci_error_handlers *err_handler" > pointer, so that's probably legitimate. But many just need a name, > and should probably be using dev_driver_string() instead. Yeah, I considered adding a function to get the driver name from a pci_dev and a function to get the error handlers. Maybe it's an idea to introduce these two and then use to_pci_driver(pdev->dev.driver) for the few remaining users? Maybe doing that on top of my current series makes sense to have a clean switch from pdev->driver to pdev->dev.driver?! Best regards Uwe -- Pengutronix e.K. | Uwe Kleine-König| Industrial Linux Solutions | https://www.pengutronix.de/ | signature.asc Description: PGP signature
Re: [RFC PATCH] powerpc/book3s64/radix: Upgrade va tlbie to PID tlbie if we cross PMD_SIZE
> With shared mapping, even though we are unmapping a large range, the kernel > will force a TLB flush with ptl lock held to avoid the race mentioned in > commit 1cf35d47712d ("mm: split 'tlb_flush_mmu()' into tlb flushing and memory freeing parts") > This results in the kernel issuing a high number of TLB flushes even for a large > range. This can be improved by making sure the kernel switch to pid based flush if the > kernel is unmapping a 2M range. > > Signed-off-by: Aneesh Kumar K.V > --- > arch/powerpc/mm/book3s64/radix_tlb.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c > index aefc100d79a7..21d0f098e43b 100644 > --- a/arch/powerpc/mm/book3s64/radix_tlb.c > +++ b/arch/powerpc/mm/book3s64/radix_tlb.c > @@ -1106,7 +1106,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); > * invalidating a full PID, so it has a far lower threshold to change from > * individual page flushes to full-pid flushes. > */ > -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; > +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 32; > static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; > > static inline void __radix__flush_tlb_range(struct mm_struct *mm, > @@ -1133,7 +1133,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, > if (fullmm) > flush_pid = true; > else if (type == FLUSH_TYPE_GLOBAL) > - flush_pid = nr_pages > tlb_single_page_flush_ceiling; > + flush_pid = nr_pages >= tlb_single_page_flush_ceiling; > else > flush_pid = nr_pages > tlb_local_single_page_flush_ceiling; I evaluated the patches from Aneesh with a micro benchmark which does shmat, shmdt of 256 MB segment. Higher the rate of work, better the performance. With a value of 32, we match the performance of GTSE=off. This was evaluated on SLES15 SP3 kernel. 
# cat /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling 32 # perf stat -I 1000 -a -e powerpc:tlbie,r30058 ./tlbie -i 5 -c 1 t 1 Rate of work: = 311 # time counts unit events 1.013131404 50939 powerpc:tlbie 1.013131404 50658 r30058 Rate of work: = 318 2.026957019 51520 powerpc:tlbie 2.026957019 51481 r30058 Rate of work: = 318 3.038884431 51485 powerpc:tlbie 3.038884431 51461 r30058 Rate of work: = 318 4.051483926 51485 powerpc:tlbie 4.051483926 51520 r30058 Rate of work: = 318 5.063635713 48577 powerpc:tlbie 5.063635713 48347 r30058 # echo 34 > /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling # perf stat -I 1000 -a -e powerpc:tlbie,r30058 ./tlbie -i 5 -c 1 t 1 Rate of work: = 174 # time counts unit events 1.012672696 721471 powerpc:tlbie 1.012672696 726491 r30058 Rate of work: = 177 2.026348661 737460 powerpc:tlbie 2.026348661 736138 r30058 Rate of work: = 178 3.037932122 737460 powerpc:tlbie 3.037932122 737460 r30058 Rate of work: = 178 4.050198819 737044 powerpc:tlbie 4.050198819 737460 r30058 Rate of work: = 177 5.062400776 692832 powerpc:tlbie 5.062400776 688319 r30058 Regards, Puvichakravarthy Ramachandran
Re: [PATCH v1 14/55] KVM: PPC: Book3S HV: Don't always save PMU for guest capable of nesting
Nicholas Piggin writes: > Revert the workaround added by commit 63279eeb7f93a ("KVM: PPC: Book3S > HV: Always save guest pmu for guest capable of nesting"). > > Nested capable guests running with the earlier commit ("KVM: PPC: Book3S > HV Nested: Indicate guest PMU in-use in VPA") will now indicate the PMU > in-use status of their guests, which means the parent does not need to > unconditionally save the PMU for nested capable guests. > > This will cause the PMU to break for nested guests when running older > nested hypervisor guests under a kernel with this change. It's unclear > there's an easy way to avoid that, so this could wait for a release or > so for the fix to filter into stable kernels. I'm not sure PMU inside nested guests is getting much use, but I don't think we can break this quite so casually :) Especially as the failure mode will be PMU counts that don't match reality, which is hard to diagnose. It took nearly a year for us to find the original bug. I think we need to hold this back for a while. We could put it under a CONFIG option, and then flip that option to off at some point in the future. cheers > index e7f8cc04944b..ab89db561c85 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -4003,8 +4003,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, > u64 time_limit, > vcpu->arch.vpa.dirty = 1; > save_pmu = lp->pmcregs_in_use; > } > - /* Must save pmu if this guest is capable of running nested guests */ > - save_pmu |= nesting_enabled(vcpu->kvm); > > kvmhv_save_guest_pmu(vcpu, save_pmu); > #ifdef CONFIG_PPC_PSERIES > -- > 2.23.0
Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
On 7/26/21 9:19 AM, Nicholas Piggin wrote: It can be useful in simulators (with very constrained environments) to allow some PMCs to run from boot so they can be sampled directly by a test harness, rather than having to run perf. A previous change freezes counters at boot by default, so provide a boot time option to un-freeze (plus a bit more flexibility). Signed-off-by: Nicholas Piggin --- .../admin-guide/kernel-parameters.txt | 7 arch/powerpc/perf/core-book3s.c | 35 +++ 2 files changed, 42 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdb22006f713..96b7d0ebaa40 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4089,6 +4089,13 @@ Override pmtimer IOPort with a hex value. e.g. pmtmr=0x508 + pmu=[PPC] Manually enable the PMU. This is a bit confusing, IIUC, we are manually disabling the perf registration with this option and not pmu. If this option is used, we will unfreeze the MMCR0_FC (only in the HV_mode) and not register perf subsystem. Since this option is valid only for HV_mode, can we call it kvm_disable_perf or kvm_dis_perf. + Enable the PMU by setting MMCR0 to 0 (clear FC bit). + This option is implemented for Book3S processors. + If a number is given, then MMCR1 is set to that number, + otherwise (e.g., 'pmu=on'), it is left 0. The perf + subsystem is disabled if this option is used. + pm_debug_messages [SUSPEND,KNL] Enable suspend/resume debug messages during boot up. 
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 65795cadb475..e7cef4fe17d7 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu) } #ifdef CONFIG_PPC64 +static bool pmu_override = false; +static unsigned long pmu_override_val; +static void do_pmu_override(void *data) +{ + ppc_set_pmu_inuse(1); + if (pmu_override_val) + mtspr(SPRN_MMCR1, pmu_override_val); + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC); +} + static int __init init_ppc64_pmu(void) { + if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) { + printk(KERN_WARNING "perf: disabling perf due to pmu= command line option.\n"); + on_each_cpu(do_pmu_override, NULL, 1); + return 0; + } + /* run through all the pmu drivers one at a time */ if (!init_power5_pmu()) return 0; @@ -2451,4 +2467,23 @@ static int __init init_ppc64_pmu(void) return init_generic_compat_pmu(); } early_initcall(init_ppc64_pmu); + +static int __init pmu_setup(char *str) +{ + unsigned long val; + + if (!early_cpu_has_feature(CPU_FTR_HVMODE)) + return 0; + + pmu_override = true; + + if (kstrtoul(str, 0, &val)) + val = 0; + + pmu_override_val = val; + + return 1; +} +__setup("pmu=", pmu_setup); + #endif
Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
Le 06/08/2021 à 05:16, Xiongwei Song a écrit : On Thu, Aug 5, 2021 at 6:06 PM Christophe Leroy wrote: Le 26/07/2021 à 16:30, sxwj...@me.com a écrit : From: Xiongwei Song Create an anonymous union for dsisr and esr registers, we can reference esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dsisr. This makes code more clear. I'm not sure it is worth doing that. Why don't we use "esr" as reference manuals mentioned? What is the point in doing the following when you know that regs->esr and regs->dsisr are exactly the same: > -err = ___do_page_fault(regs, regs->dar, regs->dsisr); > +if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > +err = ___do_page_fault(regs, regs->dar, regs->esr); > +else > +err = ___do_page_fault(regs, regs->dar, regs->dsisr); > + Yes, we can drop this. But it's a bit vague. Or even > -int is_write = page_fault_is_write(regs->dsisr); > +unsigned long err_reg; > +int is_write; > + > +if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) > +err_reg = regs->esr; > +else > +err_reg = regs->dsisr; > + > +is_write = page_fault_is_write(err_reg); Artificially growing the code for that makes no sense to me. We can drop this too. To avoid ambiguity, maybe the best would be to rename regs->dsisr to something like regs->sr , so that we know it represents the status register, which is DSISR or ESR depending on the platform. If so, this would make other people more confused. My consideration is to follow what the reference manuals represent. Maybe then we could rename the fields as regs->dsisr_esr and regs->dar_dear That would be more explicit for everyone. The UAPI header however should remain as is because anonymous unions are not supported by old compilers as mentioned by Michael. But nevertheless, there are also situations where what is stored in regs->dsisr is not what we have in DSISR register. For instance on an ISI exception, we store a subset of the content of SRR1 register into regs->dsisr. Christophe