[PATCH v2] powerpc64/idle: Fix SP offsets when saving GPRs

2021-02-05 Thread Christopher M. Riedl
The idle entry/exit code saves/restores GPRs in the stack "red zone"
(Protected Zone according to PowerPC64 ELF ABI v2). However, the offset
used for the first GPR is incorrect and overwrites the back chain - the
Protected Zone actually starts below the current SP. In practice this is
probably not an issue, but it's still incorrect so fix it.

Also expand the comments to explain why using the stack "red zone"
instead of creating a new stackframe is appropriate here.

Signed-off-by: Christopher M. Riedl 
---
 arch/powerpc/kernel/idle_book3s.S | 138 --
 1 file changed, 73 insertions(+), 65 deletions(-)

diff --git a/arch/powerpc/kernel/idle_book3s.S 
b/arch/powerpc/kernel/idle_book3s.S
index 22f249b6f58d..f9e6d83e6720 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -52,28 +52,32 @@ _GLOBAL(isa300_idle_stop_mayloss)
std r1,PACAR1(r13)
mflrr4
mfcrr5
-   /* use stack red zone rather than a new frame for saving regs */
-   std r2,-8*0(r1)
-   std r14,-8*1(r1)
-   std r15,-8*2(r1)
-   std r16,-8*3(r1)
-   std r17,-8*4(r1)
-   std r18,-8*5(r1)
-   std r19,-8*6(r1)
-   std r20,-8*7(r1)
-   std r21,-8*8(r1)
-   std r22,-8*9(r1)
-   std r23,-8*10(r1)
-   std r24,-8*11(r1)
-   std r25,-8*12(r1)
-   std r26,-8*13(r1)
-   std r27,-8*14(r1)
-   std r28,-8*15(r1)
-   std r29,-8*16(r1)
-   std r30,-8*17(r1)
-   std r31,-8*18(r1)
-   std r4,-8*19(r1)
-   std r5,-8*20(r1)
+   /*
+* Use the stack red zone rather than a new frame for saving regs since
+* in the case of no GPR loss the wakeup code branches directly back to
+* the caller without deallocating the stack frame first.
+*/
+   std r2,-8*1(r1)
+   std r14,-8*2(r1)
+   std r15,-8*3(r1)
+   std r16,-8*4(r1)
+   std r17,-8*5(r1)
+   std r18,-8*6(r1)
+   std r19,-8*7(r1)
+   std r20,-8*8(r1)
+   std r21,-8*9(r1)
+   std r22,-8*10(r1)
+   std r23,-8*11(r1)
+   std r24,-8*12(r1)
+   std r25,-8*13(r1)
+   std r26,-8*14(r1)
+   std r27,-8*15(r1)
+   std r28,-8*16(r1)
+   std r29,-8*17(r1)
+   std r30,-8*18(r1)
+   std r31,-8*19(r1)
+   std r4,-8*20(r1)
+   std r5,-8*21(r1)
/* 168 bytes */
PPC_STOP
b   .   /* catch bugs */
@@ -89,8 +93,8 @@ _GLOBAL(isa300_idle_stop_mayloss)
  */
 _GLOBAL(idle_return_gpr_loss)
ld  r1,PACAR1(r13)
-   ld  r4,-8*19(r1)
-   ld  r5,-8*20(r1)
+   ld  r4,-8*20(r1)
+   ld  r5,-8*21(r1)
mtlrr4
mtcrr5
/*
@@ -98,25 +102,25 @@ _GLOBAL(idle_return_gpr_loss)
 * from PACATOC. This could be avoided for that less common case
 * if KVM saved its r2.
 */
-   ld  r2,-8*0(r1)
-   ld  r14,-8*1(r1)
-   ld  r15,-8*2(r1)
-   ld  r16,-8*3(r1)
-   ld  r17,-8*4(r1)
-   ld  r18,-8*5(r1)
-   ld  r19,-8*6(r1)
-   ld  r20,-8*7(r1)
-   ld  r21,-8*8(r1)
-   ld  r22,-8*9(r1)
-   ld  r23,-8*10(r1)
-   ld  r24,-8*11(r1)
-   ld  r25,-8*12(r1)
-   ld  r26,-8*13(r1)
-   ld  r27,-8*14(r1)
-   ld  r28,-8*15(r1)
-   ld  r29,-8*16(r1)
-   ld  r30,-8*17(r1)
-   ld  r31,-8*18(r1)
+   ld  r2,-8*1(r1)
+   ld  r14,-8*2(r1)
+   ld  r15,-8*3(r1)
+   ld  r16,-8*4(r1)
+   ld  r17,-8*5(r1)
+   ld  r18,-8*6(r1)
+   ld  r19,-8*7(r1)
+   ld  r20,-8*8(r1)
+   ld  r21,-8*9(r1)
+   ld  r22,-8*10(r1)
+   ld  r23,-8*11(r1)
+   ld  r24,-8*12(r1)
+   ld  r25,-8*13(r1)
+   ld  r26,-8*14(r1)
+   ld  r27,-8*15(r1)
+   ld  r28,-8*16(r1)
+   ld  r29,-8*17(r1)
+   ld  r30,-8*18(r1)
+   ld  r31,-8*19(r1)
blr
 
 /*
@@ -154,28 +158,32 @@ _GLOBAL(isa206_idle_insn_mayloss)
std r1,PACAR1(r13)
mflrr4
mfcrr5
-   /* use stack red zone rather than a new frame for saving regs */
-   std r2,-8*0(r1)
-   std r14,-8*1(r1)
-   std r15,-8*2(r1)
-   std r16,-8*3(r1)
-   std r17,-8*4(r1)
-   std r18,-8*5(r1)
-   std r19,-8*6(r1)
-   std r20,-8*7(r1)
-   std r21,-8*8(r1)
-   std r22,-8*9(r1)
-   std r23,-8*10(r1)
-   std r24,-8*11(r1)
-   std r25,-8*12(r1)
-   std r26,-8*13(r1)
-   std r27,-8*14(r1)
-   std r28,-8*15(r1)
-   std r29,-8*16(r1)
-   std r30,-8*17(r1)
-   std r31,-8*18(r1)
-   std r4,-8*19(r1)
-   std 

Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-05 Thread Nicholas Piggin
Excerpts from Leonardo Bras's message of February 5, 2021 5:01 pm:
> Hey Nick, thanks for reviewing :)
> 
> On Fri, 2021-02-05 at 16:28 +1000, Nicholas Piggin wrote:
>> Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
>> > Before guest entry, TBU40 register is changed to reflect guest timebase.
> After exiting the guest, the register is reverted to its original value.
>> > 
>> > If one tries to get the timestamp from host between those changes, it
>> > will present an incorrect value.
>> > 
>> > An example would be trying to add a tracepoint in
>> > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
>> > acquired could actually cause the host to crash.
>> > 
>> > Save the Timebase Offset to PACA and use it on sched_clock() to always
>> > get the correct timestamp.
>> 
>> Ouch. Not sure how reasonable it is to half switch into guest registers 
>> and expect to call into the wider kernel, fixing things up as we go. 
>> What if mftb is used in other places?
> 
> IIUC, the CPU is not supposed to call anything as host between guest
> entry and guest exit, except guest-related cases, like

When I say "call", I'm including tracing in that. If a function is not 
marked as no trace, then it will call into the tracing subsystem.

> kvmppc_guest_entry_inject_int(), but anyway, if something calls mftb it
> will still get the same value as before.

Right, so it'll be out of whack again.

> This is only supposed to change stuff that depends on sched_clock, like
> Tracepoints, that can happen in those exceptions.

If they depend on sched_clock that's one thing. Do they definitely have 
no dependencies on mftb from other calls?

>> Especially as it doesn't seem like there is a reason that function _has_
>> to be called after the timebase is switched to guest, that's just how 
>> the code is structured.
> 
> Correct, but if called, like in rb routines, used by tracepoints, the
> difference between last tb and current (lower) tb may cause the CPU to
> trap PROGRAM exception, crashing host. 

Yes, so I agree with Michael any function that is involved when we begin 
to switch into guest context (or have not completed switching back to 
host going the other way) should be marked as no trace (noinstr even, 
perhaps).

>> As a local hack to work out a bug okay. If you really need it upstream 
>> could you put it under a debug config option?
> 
> You mean something that is automatically selected whenever those
> configs are enabled? 
> 
> CONFIG_TRACEPOINT && CONFIG_KVM_BOOK3S_HANDLER && CONFIG_PPC_BOOK3S_64
> 
> Or something the user need to select himself in menuconfig?

Yeah I meant a default n thing under powerpc kernel debugging somewhere.

Thanks,
Nick


[PATCH v3] powerpc/kuap: Allow kernel thread to access userspace after kthread_use_mm

2021-02-05 Thread Aneesh Kumar K.V
This fixes the bad fault reported by KUAP when io_wqe_worker accesses userspace.

 Bug: Read fault blocked by KUAP!
 WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229 
__do_page_fault+0x6b4/0xcd0
 NIP [c009e7e4] __do_page_fault+0x6b4/0xcd0
 LR [c009e7e0] __do_page_fault+0x6b0/0xcd0
..
 Call Trace:
 [c00016367330] [c009e7e0] __do_page_fault+0x6b0/0xcd0 (unreliable)
 [c000163673e0] [c009ee3c] do_page_fault+0x3c/0x120
 [c00016367430] [c000c848] handle_page_fault+0x10/0x2c
 --- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
..
 NIP [c08e8228] iov_iter_fault_in_readable+0x148/0x6f0
 LR [c08e834c] iov_iter_fault_in_readable+0x26c/0x6f0
 interrupt: 300
 [c000163677e0] [c07154a0] iomap_write_actor+0xc0/0x280
 [c00016367880] [c070fc94] iomap_apply+0x1c4/0x780
 [c00016367990] [c0710330] iomap_file_buffered_write+0xa0/0x120
 [c000163679e0] [c0080040791c] xfs_file_buffered_aio_write+0x314/0x5e0 
[xfs]
 [c00016367a90] [c06d74bc] io_write+0x10c/0x460
 [c00016367bb0] [c06d80e4] io_issue_sqe+0x8d4/0x1200
 [c00016367c70] [c06d8ad0] io_wq_submit_work+0xc0/0x250
 [c00016367cb0] [c06e2578] io_worker_handle_work+0x498/0x800
 [c00016367d40] [c06e2cdc] io_wqe_worker+0x3fc/0x4f0
 [c00016367da0] [c01cb0a4] kthread+0x1c4/0x1d0
 [c00016367e10] [c000dbf0] ret_from_kernel_thread+0x5c/0x6c

The kernel considers the thread AMR value for a kernel thread to be
AMR_KUAP_BLOCKED. Hence access to userspace is denied. This is
of course not correct and we should allow userspace access after
kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
AMR value of the operating address space. But the AMR value is
thread-specific and we inherit the address space, not the thread's
access restrictions. Because of this, ignore the AMR value when
accessing userspace via a kernel thread.

current_thread_amr/iamr() are updated, because we use them in the
below stack.

[  530.710838] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Tainted: G  D
   5.11.0-rc6+ #3


 NIP [c00aa0c8] pkey_access_permitted+0x28/0x90
 LR [c04b9278] gup_pte_range+0x188/0x420
 --- interrupt: 700
 [c0001c4ef3f0] [] 0x0 (unreliable)
 [c0001c4ef490] [c04bd39c] gup_pgd_range+0x3ac/0xa20
 [c0001c4ef5a0] [c04bdd44] internal_get_user_pages_fast+0x334/0x410
 [c0001c4ef620] [c0852028] iov_iter_get_pages+0xf8/0x5c0
 [c0001c4ef6a0] [c07da44c] bio_iov_iter_get_pages+0xec/0x700
 [c0001c4ef770] [c06a325c] iomap_dio_bio_actor+0x2ac/0x4f0
 [c0001c4ef810] [c069cd94] iomap_apply+0x2b4/0x740
 [c0001c4ef920] [c06a38b8] __iomap_dio_rw+0x238/0x5c0
 [c0001c4ef9d0] [c06a3c60] iomap_dio_rw+0x20/0x80
 [c0001c4ef9f0] [c00801927a30] xfs_file_dio_aio_write+0x1f8/0x650 [xfs]
 [c0001c4efa60] [c008019284dc] xfs_file_write_iter+0xc4/0x130 [xfs]
 [c0001c4efa90] [c0669984] io_write+0x104/0x4b0
 [c0001c4efbb0] [c066cea4] io_issue_sqe+0x3d4/0xf50
 [c0001c4efc60] [c0670200] io_wq_submit_work+0xb0/0x2f0
 [c0001c4efcb0] [c0674268] io_worker_handle_work+0x248/0x4a0
 [c0001c4efd30] [c06746e8] io_wqe_worker+0x228/0x2a0
 [c0001c4efda0] [c019d994] kthread+0x1b4/0x1c0

Cc: Zorro Lang 
Cc: Jens Axboe 
Cc: Christophe Leroy 
Cc: Nicholas Piggin 
Signed-off-by: Aneesh Kumar K.V 
---

Changes from v2:
* Fix build error
Changes from v1:
* use default_amr/default_iamr for kthread.

 arch/powerpc/include/asm/book3s/64/kup.h   | 16 +++-
 arch/powerpc/include/asm/book3s/64/pkeys.h |  4 
 arch/powerpc/mm/book3s64/pkeys.c   |  1 +
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/kup.h 
b/arch/powerpc/include/asm/book3s/64/kup.h
index f50f72e535aa..7d1ef7b9754e 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -199,25 +199,31 @@ DECLARE_STATIC_KEY_FALSE(uaccess_flush_key);
 
 #ifdef CONFIG_PPC_PKEY
 
+extern u64 __ro_after_init default_uamor;
+extern u64 __ro_after_init default_amr;
+extern u64 __ro_after_init default_iamr;
+
 #include 
 #include 
 
-/*
- * For kernel thread that doesn't have thread.regs return
- * default AMR/IAMR values.
+/* usage of kthread_use_mm() should inherit the
+ * AMR value of the operating address space. But, the AMR value is
+ * thread-specific and we inherit the address space and not thread
+ * access restrictions. Because of this ignore AMR value when accessing
+ * userspace via kernel thread.
  */
 static inline u64 current_thread_amr(void)
 {
if (current->thread.regs)
return current->thread.regs->amr;
-   return AMR_KUAP_BLOCKED;
+   return default_amr;
 }
 
 static inline u64 current_thread_iamr(void)
 

Re: [PATCH v7 39/42] powerpc: move NMI entry/exit code into wrapper

2021-02-05 Thread Nicholas Piggin
Excerpts from Michael Ellerman's message of February 6, 2021 9:38 am:
> Nicholas Piggin  writes:
>> Excerpts from Michael Ellerman's message of February 4, 2021 8:15 pm:
>>> Nicholas Piggin  writes:
 This moves the common NMI entry and exit code into the interrupt handler
 wrappers.

 This changes the behaviour of soft-NMI (watchdog) and HMI interrupts, and
 also MCE interrupts on 64e, by adding missing parts of the NMI entry to
 them.

 Signed-off-by: Nicholas Piggin 
 ---
  arch/powerpc/include/asm/interrupt.h | 28 ++
  arch/powerpc/kernel/mce.c| 11 -
  arch/powerpc/kernel/traps.c  | 35 +---
  arch/powerpc/kernel/watchdog.c   | 10 
  4 files changed, 38 insertions(+), 46 deletions(-)
>>> 
>>> This is unhappy when injecting SLB multi-hits:
>>> 
>>>   root@p86-2:~# echo PPC_SLB_MULTIHIT > 
>>> /sys/kernel/debug/provoke-crash/DIRECT
>>>   [  312.496026][ T1344] kernel BUG at 
>>> arch/powerpc/include/asm/interrupt.h:152!
>>>   [  312.496037][ T1344] Oops: Exception in kernel mode, sig: 5 [#1]
>>>   [  312.496045][ T1344] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA 
>>> pSeries
>>
>> pseries hash. Blast!
> 
> The worst kind.
> 
>>> 147 static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, 
>>> struct interrupt_nmi_state *state)
>>> 148 {
>>> 149 if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) ||
>>> 150 !firmware_has_feature(FW_FEATURE_LPAR) ||
>>> 151 radix_enabled() || (mfmsr() & MSR_DR))
>>> 152 nmi_exit();
>>> 
>>> 
>>> So presumably it's:
>>> 
>>> #define __nmi_exit()\
>>> do {\
>>> BUG_ON(!in_nmi());  \
>>
>> Yes that would be it, pseries machine check enables MMU half way through 
>> so only one side of this triggers.
>>
>> The MSR_DR check is supposed to catch the other NMIs that run with MMU 
>> on (perf, watchdog, etc). Suppose it could test TRAP(regs) explicitly
>> although I wonder if we should also do this to keep things balanced
> 
> Yeah I think I like that. I'll give it a test.

The msr restore? Looking closer, pseries_machine_check_realmode may have
expected mce_handle_error to enable the MMU, because irq_work_queue uses
some per-cpu variables I think.

So the code might have to be rearranged a bit more than the patch below.

Thanks,
Nick

> 
> cheers
> 
> 
>> diff --git a/arch/powerpc/platforms/pseries/ras.c 
>> b/arch/powerpc/platforms/pseries/ras.c
>> index 149cec2212e6..f57ca0c570be 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -719,6 +719,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>>  
>>  static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log 
>> *errp)
>>  {
>> +   unsigned long msr;
>> struct pseries_errorlog *pseries_log;
>> struct pseries_mc_errorlog *mce_log = NULL;
>> int disposition = rtas_error_disposition(errp);
>> @@ -747,9 +748,12 @@ static int mce_handle_error(struct pt_regs *regs, 
>> struct rtas_error_log *errp)
>>  *   SLB multihit is done by now.
>>  */
>>  out:
>> -   mtmsr(mfmsr() | MSR_IR | MSR_DR);
>> +   msr = mfmsr();
>> +   mtmsr(msr | MSR_IR | MSR_DR);
>> disposition = mce_handle_err_virtmode(regs, errp, mce_log,
>>   disposition);
>> +   mtmsr(msr);
>> +
>> return disposition;
>>  }
>>  
> 


Re: [PATCH v7 28/42] powerpc: convert interrupt handlers to use wrappers

2021-02-05 Thread Nicholas Piggin
Excerpts from Christophe Leroy's message of February 5, 2021 6:09 pm:
> 
> 
> Le 30/01/2021 à 14:08, Nicholas Piggin a écrit :
>> Signed-off-by: Nicholas Piggin 
>> ---
> 
>> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
>> index f70d3f6174c8..7ff915aae8ec 100644
>> --- a/arch/powerpc/kernel/traps.c
>> +++ b/arch/powerpc/kernel/traps.c
> 
>> @@ -1462,7 +1474,7 @@ static int emulate_math(struct pt_regs *regs)
>>   static inline int emulate_math(struct pt_regs *regs) { return -1; }
>>   #endif
>>   
>> -void program_check_exception(struct pt_regs *regs)
>> +DEFINE_INTERRUPT_HANDLER(program_check_exception)
>>   {
>>  enum ctx_state prev_state = exception_enter();
>>  unsigned int reason = get_reason(regs);
>> @@ -1587,14 +1599,14 @@ NOKPROBE_SYMBOL(program_check_exception);
>>* This occurs when running in hypervisor mode on POWER6 or later
>>* and an illegal instruction is encountered.
>>*/
>> -void emulation_assist_interrupt(struct pt_regs *regs)
>> +DEFINE_INTERRUPT_HANDLER(emulation_assist_interrupt)
>>   {
>>  regs->msr |= REASON_ILLEGAL;
>>  program_check_exception(regs);
> 
> Is it correct that an INTERRUPT_HANDLER calls another INTERRUPT_HANDLER ?

No you're right, I'll have to send a patch.

Thanks,
Nick


Re: [PATCH] powerpc/8xx: Fix software emulation interrupt

2021-02-05 Thread Nicholas Piggin
Excerpts from Christophe Leroy's message of February 5, 2021 6:56 pm:
> For unimplemented instructions or unimplemented SPRs, the 8xx triggers
> a "Software Emulation Exception" (0x1000). That interrupt doesn't set
> reason bits in SRR1 as the "Program Check Exception" does.
> 
> Go through emulation_assist_interrupt() to set REASON_ILLEGAL.
> 
> Fixes: fbbcc3bb139e ("powerpc/8xx: Remove SoftwareEmulation()")
> Signed-off-by: Christophe Leroy 
> ---
> I'm wondering whether it wouldn't be better to set REASON_ILLEGAL
> in the exception prolog and still call program_check_exception.
> And do the same in book3s/64 to avoid the nightmare of an
> INTERRUPT_HANDLER calling another INTERRUPT_HANDLER.

Hmm, I missed this. We just change program_check_exception to
a common function which is called by both.

Thanks,
Nick



Re: [PATCH v3 28/32] powerpc/64s: interrupt implement exit logic in C

2021-02-05 Thread Nicholas Piggin
Excerpts from Christophe Leroy's message of February 5, 2021 4:04 pm:
> 
> 
> Le 05/02/2021 à 03:16, Nicholas Piggin a écrit :
>> Excerpts from Michael Ellerman's message of February 5, 2021 10:22 am:
>>> Nicholas Piggin  writes:
 Excerpts from Christophe Leroy's message of February 4, 2021 6:03 pm:
> Le 04/02/2021 à 04:27, Nicholas Piggin a écrit :
>> Excerpts from Christophe Leroy's message of February 4, 2021 2:25 am:
>>> Le 25/02/2020 à 18:35, Nicholas Piggin a écrit :
>>> ...
 +
 +  /*
 +   * We don't need to restore AMR on the way back to userspace 
 for KUAP.
 +   * The value of AMR only matters while we're in the kernel.
 +   */
 +  kuap_restore_amr(regs);
>>>
>>> Is that correct to restore KUAP state here ? Shouldn't we have it at 
>>> lower level in assembly ?
>>>
>>> Isn't there a risk that someone manages to call 
>>> interrupt_exit_kernel_prepare() or the end of it in
>>> a way or another, and get the previous KUAP state restored by this way ?
>>
>> I'm not sure if there much more risk if it's here rather than the
>> instruction being in another place in the code.
>>
>> There's a lot of user access around the kernel too if you want to find a
>> gadget to unlock KUAP then I suppose there is a pretty large attack
>> surface.
>
> My understanding is that user access scope is strictly limited, for 
> instance we enforce the
> begin/end of user access to be in the same function, and we refrain from 
> calling any other function
> inside the user access window. x86 even have 'objtool' to enforce it at 
> build time. So in theory
> there is no way to get out of the function while user access is open.
>
> Here with the interrupt exit function it is free beer. You have a place 
> where you re-open user
> access and return with a simple blr. So that's open bar. If someone 
> manages to just call the
> interrupt exit function, then user access remains open

 Hmm okay maybe that's a good point.
>>>
>>> I don't think it's a very attractive gadget, it's not just a plain blr,
>>> it does a full stack frame tear down before the return. And there's no
>>> LR reloads anywhere very close.
>>>
>>> Obviously it depends on what the compiler decides to do, it's possible
>>> it could be a usable gadget. But there are other places that are more
>>> attractive I think, eg:
>>>
>>> c061d768:   a6 03 3d 7d mtspr   29,r9
>>> c061d76c:   2c 01 00 4c isync
>>> c061d770:   00 00 00 60 nop
>>> c061d774:   30 00 21 38 addir1,r1,48
>>> c061d778:   20 00 80 4e blr
>>>
>>>
>>> So I don't think we should redesign the code *purely* because we're
>>> worried about interrupt_exit_kernel_prepare() being a useful gadget. If
>>> we can come up with a way to restructure it that reads well and is
>>> maintainable, and also reduces the chance of it being a good gadget then
>>> sure.
>> 
>> Okay. That would be good if we can keep it in C, the pkeys + kuap combo
>> is fairly complicated and we might want to something cleverer with it,
>> so that would make it even more difficult in asm.
>> 
> 
> Ok.
> 
> For ppc32, I prefer to keep it in assembly for the time being and move 
> everything from ASM to C at 
> once after porting syscall and interrupts to C and wrappers.
> 
> Hope this is OK for you.

I don't see a problem with that.

Thanks,
Nick


Re: [PATCH v7 39/42] powerpc: move NMI entry/exit code into wrapper

2021-02-05 Thread Michael Ellerman
Nicholas Piggin  writes:
> Excerpts from Michael Ellerman's message of February 4, 2021 8:15 pm:
>> Nicholas Piggin  writes:
>>> This moves the common NMI entry and exit code into the interrupt handler
>>> wrappers.
>>>
>>> This changes the behaviour of soft-NMI (watchdog) and HMI interrupts, and
>>> also MCE interrupts on 64e, by adding missing parts of the NMI entry to
>>> them.
>>>
>>> Signed-off-by: Nicholas Piggin 
>>> ---
>>>  arch/powerpc/include/asm/interrupt.h | 28 ++
>>>  arch/powerpc/kernel/mce.c| 11 -
>>>  arch/powerpc/kernel/traps.c  | 35 +---
>>>  arch/powerpc/kernel/watchdog.c   | 10 
>>>  4 files changed, 38 insertions(+), 46 deletions(-)
>> 
>> This is unhappy when injecting SLB multi-hits:
>> 
>>   root@p86-2:~# echo PPC_SLB_MULTIHIT > 
>> /sys/kernel/debug/provoke-crash/DIRECT
>>   [  312.496026][ T1344] kernel BUG at 
>> arch/powerpc/include/asm/interrupt.h:152!
>>   [  312.496037][ T1344] Oops: Exception in kernel mode, sig: 5 [#1]
>>   [  312.496045][ T1344] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA 
>> pSeries
>
> pseries hash. Blast!

The worst kind.

>> 147 static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, 
>> struct interrupt_nmi_state *state)
>> 148 {
>> 149  if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) ||
>> 150  !firmware_has_feature(FW_FEATURE_LPAR) ||
>> 151  radix_enabled() || (mfmsr() & MSR_DR))
>> 152  nmi_exit();
>> 
>> 
>> So presumably it's:
>> 
>> #define __nmi_exit() \
>>  do {\
>>  BUG_ON(!in_nmi());  \
>
> Yes that would be it, pseries machine check enables MMU half way through 
> so only one side of this triggers.
>
> The MSR_DR check is supposed to catch the other NMIs that run with MMU 
> on (perf, watchdog, etc). Suppose it could test TRAP(regs) explicitly
> although I wonder if we should also do this to keep things balanced

Yeah I think I like that. I'll give it a test.

cheers


> diff --git a/arch/powerpc/platforms/pseries/ras.c 
> b/arch/powerpc/platforms/pseries/ras.c
> index 149cec2212e6..f57ca0c570be 100644
> --- a/arch/powerpc/platforms/pseries/ras.c
> +++ b/arch/powerpc/platforms/pseries/ras.c
> @@ -719,6 +719,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>  
>  static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log 
> *errp)
>  {
> +   unsigned long msr;
> struct pseries_errorlog *pseries_log;
> struct pseries_mc_errorlog *mce_log = NULL;
> int disposition = rtas_error_disposition(errp);
> @@ -747,9 +748,12 @@ static int mce_handle_error(struct pt_regs *regs, struct 
> rtas_error_log *errp)
>  *   SLB multihit is done by now.
>  */
>  out:
> -   mtmsr(mfmsr() | MSR_IR | MSR_DR);
> +   msr = mfmsr();
> +   mtmsr(msr | MSR_IR | MSR_DR);
> disposition = mce_handle_err_virtmode(regs, errp, mce_log,
>   disposition);
> +   mtmsr(msr);
> +
> return disposition;
>  }
>  


Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-05 Thread Michael Ellerman
Nicholas Piggin  writes:
> Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
>> Before guest entry, TBU40 register is changed to reflect guest timebase.
>> After exitting guest, the register is reverted to it's original value.
>> 
>> If one tries to get the timestamp from host between those changes, it
>> will present an incorrect value.
>> 
>> An example would be trying to add a tracepoint in
>> kvmppc_guest_entry_inject_int(), which depending on last tracepoint
>> acquired could actually cause the host to crash.
>> 
>> Save the Timebase Offset to PACA and use it on sched_clock() to always
>> get the correct timestamp.
>
> Ouch. Not sure how reasonable it is to half switch into guest registers 
> and expect to call into the wider kernel, fixing things up as we go. 

Yeah it's not.

We need to disable tracing on those routines that are called in that
half-exited state.

cheers


Re: [RFC PATCH 1/9] KVM: PPC: Book3S 64: move KVM interrupt entry to a common entry point

2021-02-05 Thread Fabiano Rosas
Nicholas Piggin  writes:

> Rather than bifurcate the call depending on whether or not HV is
> possible, and have the HV entry test for PR, just make a single
> common point which does the demultiplexing. This makes it simpler
> to add another type of exit handler.
>
> Signed-off-by: Nicholas Piggin 

Reviewed-by: Fabiano Rosas 

> ---
>  arch/powerpc/kernel/exceptions-64s.S|  8 +-
>  arch/powerpc/kvm/Makefile   |  3 +++
>  arch/powerpc/kvm/book3s_64_entry.S  | 34 +
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 ++--
>  4 files changed, 40 insertions(+), 16 deletions(-)
>  create mode 100644 arch/powerpc/kvm/book3s_64_entry.S
>
> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> b/arch/powerpc/kernel/exceptions-64s.S
> index e02ad6fefa46..65659ea3cec4 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -212,7 +212,6 @@ do_define_int n
>  .endm
>
>  #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  /*
>   * All interrupts which set HSRR registers, as well as SRESET and MCE and
>   * syscall when invoked with "sc 1" switch to MSR[HV]=1 (HVMODE) to be taken,
> @@ -242,13 +241,8 @@ do_define_int n
>
>  /*
>   * If an interrupt is taken while a guest is running, it is immediately 
> routed
> - * to KVM to handle. If both HV and PR KVM arepossible, KVM interrupts go 
> first
> - * to kvmppc_interrupt_hv, which handles the PR guest case.
> + * to KVM to handle.
>   */
> -#define kvmppc_interrupt kvmppc_interrupt_hv
> -#else
> -#define kvmppc_interrupt kvmppc_interrupt_pr
> -#endif
>
>  .macro KVMTEST name
>   lbz r10,HSTATE_IN_GUEST(r13)
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index 2bfeaa13befb..cdd119028f64 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -59,6 +59,9 @@ kvm-pr-y := \
>  kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
>   tm.o
>
> +kvm-book3s_64-builtin-objs-y += \
> + book3s_64_entry.o
> +
>  ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
>  kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
>   book3s_rmhandlers.o
> diff --git a/arch/powerpc/kvm/book3s_64_entry.S 
> b/arch/powerpc/kvm/book3s_64_entry.S
> new file mode 100644
> index ..22e34b95f478
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_64_entry.S
> @@ -0,0 +1,34 @@
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +/*
> + * We come here from the first-level interrupt handlers.
> + */
> +.global  kvmppc_interrupt
> +.balign IFETCH_ALIGN_BYTES
> +kvmppc_interrupt:
> + /*
> +  * Register contents:
> +  * R12  = (guest CR << 32) | interrupt vector
> +  * R13  = PACA
> +  * guest R12 saved in shadow VCPU SCRATCH0
> +  * guest R13 saved in SPRN_SCRATCH0
> +  */
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> + std r9, HSTATE_SCRATCH2(r13)
> + lbz r9, HSTATE_IN_GUEST(r13)
> + cmpwi   r9, KVM_GUEST_MODE_HOST_HV
> + beq kvmppc_bad_host_intr
> +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
> + cmpwi   r9, KVM_GUEST_MODE_GUEST
> + ld  r9, HSTATE_SCRATCH2(r13)
> + beq kvmppc_interrupt_pr
> +#endif
> + b   kvmppc_interrupt_hv
> +#else
> + b   kvmppc_interrupt_pr
> +#endif
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 8cf1f69f442e..b9c4acd747f7 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1255,16 +1255,8 @@ kvmppc_interrupt_hv:
>* R13  = PACA
>* guest R12 saved in shadow VCPU SCRATCH0
>* guest R13 saved in SPRN_SCRATCH0
> +  * guest R9 saved in HSTATE_SCRATCH2
>*/
> - std r9, HSTATE_SCRATCH2(r13)
> - lbz r9, HSTATE_IN_GUEST(r13)
> - cmpwi   r9, KVM_GUEST_MODE_HOST_HV
> - beq kvmppc_bad_host_intr
> -#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
> - cmpwi   r9, KVM_GUEST_MODE_GUEST
> - ld  r9, HSTATE_SCRATCH2(r13)
> - beq kvmppc_interrupt_pr
> -#endif
>   /* We're now back in the host but in guest MMU context */
>   li  r9, KVM_GUEST_MODE_HOST_HV
>   stb r9, HSTATE_IN_GUEST(r13)
> @@ -3253,6 +3245,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_P9_TM_HV_ASSIST)
>   * cfar is saved in HSTATE_CFAR(r13)
>   * ppr is saved in HSTATE_PPR(r13)
>   */
> +.global kvmppc_bad_host_intr
>  kvmppc_bad_host_intr:
>   /*
>* Switch to the emergency stack, but start half-way down in


Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-05 Thread Leonardo Bras
Hello Fabiano, 
Thanks for reviewing! 
(answers inline)

On Fri, 2021-02-05 at 10:09 -0300, Fabiano Rosas wrote:
> Leonardo Bras  writes:
> 
> > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > After exiting the guest, the register is reverted to its original value.
> > 
> > If one tries to get the timestamp from host between those changes, it
> > will present an incorrect value.
> > 
> > An example would be trying to add a tracepoint in
> > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > acquired could actually cause the host to crash.
> > 
> > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > get the correct timestamp.
> > 
> > Signed-off-by: Leonardo Bras 
> > Suggested-by: Paul Mackerras 
> > ---
> > Changes since v1:
> > - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
> >   CONFIG_PPC_BOOK3S_64 are defined.
> > ---
> >  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
> >  arch/powerpc/kernel/asm-offsets.c | 1 +
> >  arch/powerpc/kernel/time.c| 8 +++-
> >  arch/powerpc/kvm/book3s_hv.c  | 2 ++
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
> >  5 files changed, 13 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
> > b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > index 078f4648ea27..e2c12a10eed2 100644
> > --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> > +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > @@ -131,6 +131,7 @@ struct kvmppc_host_state {
> >     u64 cfar;
> >     u64 ppr;
> >     u64 host_fscr;
> > +   u64 tb_offset;  /* Timebase offset: keeps correct
> > timebase while on guest */
> 
> Couldn't you use the vc->tb_offset_applied for this? We have a reference
> for the vcore in the hstate already.

But it's a pointer, which means we would have to keep checking for NULL
every time we need sched_clock(). 
Potentially it would cost a cache miss for PACA memory region that
contain vc, another for getting the part of *vc that contains the
tb_offset_applied, instead of only one for PACA struct region that
contains tb_offset.

On the other hand, it got me thinking: If the offset is applied per
cpu, why don't we get this info only in PACA, instead of in vc?
It could be a general way to get an offset applied for any purpose and
still get the sched_clock() right. 
(Not that I have any idea of any other purpose we could use it) 

Best regards!
Leonardo Bras

> 
> >  #endif
> >  };
> > 
> > diff --git a/arch/powerpc/kernel/asm-offsets.c 
> > b/arch/powerpc/kernel/asm-offsets.c
> > index b12d7c049bfe..0beb8fdc6352 100644
> > --- a/arch/powerpc/kernel/asm-offsets.c
> > +++ b/arch/powerpc/kernel/asm-offsets.c
> > @@ -706,6 +706,7 @@ int main(void)
> >     HSTATE_FIELD(HSTATE_CFAR, cfar);
> >     HSTATE_FIELD(HSTATE_PPR, ppr);
> >     HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> > +   HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
> >  #endif /* CONFIG_PPC_BOOK3S_64 */
> > 
> >  #else /* CONFIG_PPC_BOOK3S */
> > diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> > index 67feb3524460..f27f0163792b 100644
> > --- a/arch/powerpc/kernel/time.c
> > +++ b/arch/powerpc/kernel/time.c
> > @@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
> >   */
> >  notrace unsigned long long sched_clock(void)
> >  {
> > -   return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
> > +   u64 tb = get_tb() - boot_tb;
> > +
> > +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
> > +   tb -= local_paca->kvm_hstate.tb_offset;
> > +#endif
> > +
> > +   return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
> >  }
> > 
> > 
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index b3731572295e..c08593c63353 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> > *vcpu, u64 time_limit,
> >     if ((tb & 0xff) < (new_tb & 0xff))
> >     mtspr(SPRN_TBU40, new_tb + 0x100);
> >     vc->tb_offset_applied = vc->tb_offset;
> > +   local_paca->kvm_hstate.tb_offset = vc->tb_offset;
> >     }
> > 
> >     if (vc->pcr)
> > @@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> > *vcpu, u64 time_limit,
> >     if ((tb & 0xff) < (new_tb & 0xff))
> >     mtspr(SPRN_TBU40, new_tb + 0x100);
> >     vc->tb_offset_applied = 0;
> > +   local_paca->kvm_hstate.tb_offset = 0;
> >     }
> > 
> >     mtspr(SPRN_HDEC, 0x7fff);
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index b73140607875..8f7a9f7f4ee6 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
> >     cmpdi   r8,0
> >  

Re: [PATCH] powerpc/pseries/dlpar: handle ibm,configure-connector delay status

2021-02-05 Thread Tyrel Datwyler
On 1/6/21 6:59 PM, Nathan Lynch wrote:
> dlpar_configure_connector() has two problems in its handling of
> ibm,configure-connector's return status:
> 
> 1. When the status is -2 (busy, call again), we call
>ibm,configure-connector again immediately without checking whether
>to schedule, which can result in monopolizing the CPU.
> 2. Extended delay status (9900..9905) goes completely unhandled,
>causing the configuration to unnecessarily terminate.
> 
> Fix both of these issues by using rtas_busy_delay().
> 
> Fixes: ab519a011caa ("powerpc/pseries: Kernel DLPAR Infrastructure")
> Signed-off-by: Nathan Lynch 

Reviewed-by: Tyrel Datwyler 



[PATCH v2] powerpc/kuap: Allow kernel thread to access userspace after kthread_use_mm

2021-02-05 Thread Aneesh Kumar K.V
This fixes the bad fault reported by KUAP when io_wqe_worker accesses userspace.

 Bug: Read fault blocked by KUAP!
 WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229 
__do_page_fault+0x6b4/0xcd0
 NIP [c009e7e4] __do_page_fault+0x6b4/0xcd0
 LR [c009e7e0] __do_page_fault+0x6b0/0xcd0
..
 Call Trace:
 [c00016367330] [c009e7e0] __do_page_fault+0x6b0/0xcd0 (unreliable)
 [c000163673e0] [c009ee3c] do_page_fault+0x3c/0x120
 [c00016367430] [c000c848] handle_page_fault+0x10/0x2c
 --- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
..
 NIP [c08e8228] iov_iter_fault_in_readable+0x148/0x6f0
 LR [c08e834c] iov_iter_fault_in_readable+0x26c/0x6f0
 interrupt: 300
 [c000163677e0] [c07154a0] iomap_write_actor+0xc0/0x280
 [c00016367880] [c070fc94] iomap_apply+0x1c4/0x780
 [c00016367990] [c0710330] iomap_file_buffered_write+0xa0/0x120
 [c000163679e0] [c0080040791c] xfs_file_buffered_aio_write+0x314/0x5e0 
[xfs]
 [c00016367a90] [c06d74bc] io_write+0x10c/0x460
 [c00016367bb0] [c06d80e4] io_issue_sqe+0x8d4/0x1200
 [c00016367c70] [c06d8ad0] io_wq_submit_work+0xc0/0x250
 [c00016367cb0] [c06e2578] io_worker_handle_work+0x498/0x800
 [c00016367d40] [c06e2cdc] io_wqe_worker+0x3fc/0x4f0
 [c00016367da0] [c01cb0a4] kthread+0x1c4/0x1d0
 [c00016367e10] [c000dbf0] ret_from_kernel_thread+0x5c/0x6c

The kernel considers the thread AMR value for a kernel thread to be
AMR_KUAP_BLOCKED. Hence access to userspace is denied. This is
of course not correct and we should allow userspace access after
kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
AMR value of the operating address space. But the AMR value is
thread-specific and we inherit the address space and not thread
access restrictions. Because of this, ignore the AMR value when accessing
userspace via a kernel thread.

current_thread_amr/iamr() are also updated, because we use them in the
below stack.

[  530.710838] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Tainted: G  D
   5.11.0-rc6+ #3


 NIP [c00aa0c8] pkey_access_permitted+0x28/0x90
 LR [c04b9278] gup_pte_range+0x188/0x420
 --- interrupt: 700
 [c0001c4ef3f0] [] 0x0 (unreliable)
 [c0001c4ef490] [c04bd39c] gup_pgd_range+0x3ac/0xa20
 [c0001c4ef5a0] [c04bdd44] internal_get_user_pages_fast+0x334/0x410
 [c0001c4ef620] [c0852028] iov_iter_get_pages+0xf8/0x5c0
 [c0001c4ef6a0] [c07da44c] bio_iov_iter_get_pages+0xec/0x700
 [c0001c4ef770] [c06a325c] iomap_dio_bio_actor+0x2ac/0x4f0
 [c0001c4ef810] [c069cd94] iomap_apply+0x2b4/0x740
 [c0001c4ef920] [c06a38b8] __iomap_dio_rw+0x238/0x5c0
 [c0001c4ef9d0] [c06a3c60] iomap_dio_rw+0x20/0x80
 [c0001c4ef9f0] [c00801927a30] xfs_file_dio_aio_write+0x1f8/0x650 [xfs]
 [c0001c4efa60] [c008019284dc] xfs_file_write_iter+0xc4/0x130 [xfs]
 [c0001c4efa90] [c0669984] io_write+0x104/0x4b0
 [c0001c4efbb0] [c066cea4] io_issue_sqe+0x3d4/0xf50
 [c0001c4efc60] [c0670200] io_wq_submit_work+0xb0/0x2f0
 [c0001c4efcb0] [c0674268] io_worker_handle_work+0x248/0x4a0
 [c0001c4efd30] [c06746e8] io_wqe_worker+0x228/0x2a0
 [c0001c4efda0] [c019d994] kthread+0x1b4/0x1c0

Cc: Zorro Lang 
Cc: Jens Axboe 
Cc: Christophe Leroy 
Cc: Nicholas Piggin 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/kup.h   | 16 +++-
 arch/powerpc/include/asm/book3s/64/pkeys.h |  4 
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/kup.h 
b/arch/powerpc/include/asm/book3s/64/kup.h
index f50f72e535aa..7d1ef7b9754e 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -199,25 +199,31 @@ DECLARE_STATIC_KEY_FALSE(uaccess_flush_key);
 
 #ifdef CONFIG_PPC_PKEY
 
+extern u64 __ro_after_init default_uamor;
+extern u64 __ro_after_init default_amr;
+extern u64 __ro_after_init default_iamr;
+
 #include 
 #include 
 
-/*
- * For kernel thread that doesn't have thread.regs return
- * default AMR/IAMR values.
+/* usage of kthread_use_mm() should inherit the
+ * AMR value of the operating address space. But, the AMR value is
+ * thread-specific and we inherit the address space and not thread
+ * access restrictions. Because of this ignore AMR value when accessing
+ * userspace via kernel thread.
  */
 static inline u64 current_thread_amr(void)
 {
if (current->thread.regs)
return current->thread.regs->amr;
-   return AMR_KUAP_BLOCKED;
+   return default_amr;
 }
 
 static inline u64 current_thread_iamr(void)
 {
if (current->thread.regs)
return current->thread.regs->iamr;
-   return AMR_KUEP_BLOCKED;
+   return 

Re: [PATCH v2 1/2] ima: Free IMA measurement buffer on error

2021-02-05 Thread Lakshmi Ramasubramanian

On 2/5/21 9:49 AM, Mimi Zohar wrote:

Hi Mimi,


On Fri, 2021-02-05 at 09:39 -0800, Lakshmi Ramasubramanian wrote:

On 2/5/21 2:05 AM, Greg KH wrote:

On Thu, Feb 04, 2021 at 09:49:50AM -0800, Lakshmi Ramasubramanian wrote:

IMA allocates kernel virtual memory to carry forward the measurement
list, from the current kernel to the next kernel on kexec system call,
in ima_add_kexec_buffer() function.  In error code paths this memory
is not freed resulting in memory leak.

Free the memory allocated for the IMA measurement list in
the error code paths in ima_add_kexec_buffer() function.

Signed-off-by: Lakshmi Ramasubramanian 
Suggested-by: Tyler Hicks 
Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
---
   security/integrity/ima/ima_kexec.c | 1 +
   1 file changed, 1 insertion(+)




This is not the correct way to submit patches for inclusion in the
stable kernel tree.  Please read:
  https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
for how to do this properly.





Thanks for the info Greg.

I will re-submit the two patches in the proper format.


No need.  I'm testing these patches now.  I'm not exactly sure what the
problem is.  Stable wasn't Cc'ed.  Is it that you sent the patch
directly to Greg or added "Fixes"?


I had not Cced stable, but had "Fixes" tag in the patch.

Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")

The problem is that the buffer allocated for forwarding the IMA 
measurement list is not freed - at the end of the kexec call and also in 
an error path. Please see the patch description for


[PATCH v2 2/2] ima: Free IMA measurement buffer after kexec syscall

IMA allocates kernel virtual memory to carry forward the measurement
list, from the current kernel to the next kernel on kexec system call,
in ima_add_kexec_buffer() function.  This buffer is not freed before
completing the kexec system call resulting in memory leak.

thanks,
 -lakshmi


Re: [PATCH v2 1/2] ima: Free IMA measurement buffer on error

2021-02-05 Thread Mimi Zohar
On Fri, 2021-02-05 at 09:39 -0800, Lakshmi Ramasubramanian wrote:
> On 2/5/21 2:05 AM, Greg KH wrote:
> > On Thu, Feb 04, 2021 at 09:49:50AM -0800, Lakshmi Ramasubramanian wrote:
> >> IMA allocates kernel virtual memory to carry forward the measurement
> >> list, from the current kernel to the next kernel on kexec system call,
> >> in ima_add_kexec_buffer() function.  In error code paths this memory
> >> is not freed resulting in memory leak.
> >>
> >> Free the memory allocated for the IMA measurement list in
> >> the error code paths in ima_add_kexec_buffer() function.
> >>
> >> Signed-off-by: Lakshmi Ramasubramanian 
> >> Suggested-by: Tyler Hicks 
> >> Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
> >> ---
> >>   security/integrity/ima/ima_kexec.c | 1 +
> >>   1 file changed, 1 insertion(+)
> > 
> > 
> > 
> > This is not the correct way to submit patches for inclusion in the
> > stable kernel tree.  Please read:
> >  https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
> > for how to do this properly.
> > 
> > 
> > 
> 
> Thanks for the info Greg.
> 
> I will re-submit the two patches in the proper format.

No need.  I'm testing these patches now.  I'm not exactly sure what the
problem is.  Stable wasn't Cc'ed.  Is it that you sent the patch
directly to Greg or added "Fixes"?

thanks,

Mimi



Re: [PATCH] mm/pmem: Avoid inserting hugepage PTE entry with fsdax if hugepage support is disabled

2021-02-05 Thread Dan Williams
[ add Andrew ]

On Thu, Feb 4, 2021 at 6:40 PM Aneesh Kumar K.V
 wrote:
>
> Differentiate between hardware not supporting hugepages and user disabling THP
> via 'echo never > /sys/kernel/mm/transparent_hugepage/enabled'
>
> For the devdax namespace, the kernel handles the above via the
> supported_alignment attribute and failing to initialize the namespace
> if the namespace align value is not supported on the platform.
>
> For the fsdax namespace, the kernel will continue to initialize
> the namespace. This can result in the kernel creating a huge pte
> entry even though the hardware don't support the same.
>
> We do want hugepage support with pmem even if the end-user disabled THP
> via sysfs file (/sys/kernel/mm/transparent_hugepage/enabled). Hence
> differentiate between hardware/firmware lacking support vs user-controlled
> disable of THP and prevent a huge fault if the hardware lacks hugepage
> support.

Looks good to me.

Reviewed-by: Dan Williams 

I assume this will go through Andrew.


Re: [PATCH v2 1/2] ima: Free IMA measurement buffer on error

2021-02-05 Thread Lakshmi Ramasubramanian

On 2/5/21 2:05 AM, Greg KH wrote:

On Thu, Feb 04, 2021 at 09:49:50AM -0800, Lakshmi Ramasubramanian wrote:

IMA allocates kernel virtual memory to carry forward the measurement
list, from the current kernel to the next kernel on kexec system call,
in ima_add_kexec_buffer() function.  In error code paths this memory
is not freed resulting in memory leak.

Free the memory allocated for the IMA measurement list in
the error code paths in ima_add_kexec_buffer() function.

Signed-off-by: Lakshmi Ramasubramanian 
Suggested-by: Tyler Hicks 
Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
---
  security/integrity/ima/ima_kexec.c | 1 +
  1 file changed, 1 insertion(+)




This is not the correct way to submit patches for inclusion in the
stable kernel tree.  Please read:
 https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
for how to do this properly.





Thanks for the info Greg.

I will re-submit the two patches in the proper format.

 -lakshmi



[PATCH] KVM: PPC: Don't always report hash MMU capability for P9 < DD2.2

2021-02-05 Thread Fabiano Rosas
These machines don't support running both MMU types at the same time,
so remove the KVM_CAP_PPC_MMU_HASH_V3 capability when the host is
using Radix MMU.

Signed-off-by: Fabiano Rosas 
---
 arch/powerpc/include/asm/kvm_ppc.h |  1 +
 arch/powerpc/kvm/book3s_hv.c   | 10 ++
 arch/powerpc/kvm/powerpc.c |  3 +--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 0a056c64c317..b36abc89baf3 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -314,6 +314,7 @@ struct kvmppc_ops {
  int size);
int (*enable_svm)(struct kvm *kvm);
int (*svm_off)(struct kvm *kvm);
+   bool (*hash_v3_possible)(void);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6f612d240392..d20c0682cae5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5599,6 +5599,15 @@ static int kvmhv_svm_off(struct kvm *kvm)
return ret;
 }
 
+static bool kvmppc_hash_v3_possible(void)
+{
+   if (radix_enabled() && no_mixing_hpt_and_radix)
+   return false;
+
+   return cpu_has_feature(CPU_FTR_ARCH_300) &&
+   cpu_has_feature(CPU_FTR_HVMODE);
+}
+
 static struct kvmppc_ops kvm_ops_hv = {
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -5642,6 +5651,7 @@ static struct kvmppc_ops kvm_ops_hv = {
.store_to_eaddr = kvmhv_store_to_eaddr,
.enable_svm = kvmhv_enable_svm,
.svm_off = kvmhv_svm_off,
+   .hash_v3_possible = kvmppc_hash_v3_possible,
 };
 
 static int kvm_init_subcore_bitmap(void)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cf52d26f49cd..b9fb2f20f879 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -611,8 +611,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!(hv_enabled && radix_enabled());
break;
case KVM_CAP_PPC_MMU_HASH_V3:
-   r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
-  cpu_has_feature(CPU_FTR_HVMODE));
+   r = !!(hv_enabled && kvmppc_hv_ops->hash_v3_possible());
break;
case KVM_CAP_PPC_NESTED_HV:
r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
-- 
2.29.2



Re: [PATCH] powerpc/kuap: Allow kernel thread to access userspace after kthread_use_mm

2021-02-05 Thread Zorro Lang
On Fri, Feb 05, 2021 at 07:19:36PM +0530, Aneesh Kumar K.V wrote:
> Zorro Lang  writes:
> 
> 
> 
> > ...
> > [  530.180466] run fstests generic/617 at 2021-02-05 03:41:10
> > [  530.707969] [ cut here ]
> > [  530.708006] kernel BUG at arch/powerpc/include/asm/book3s/64/kup.h:207!
> > [  530.708013] Oops: Exception in kernel mode, sig: 5 [#1]
> > [  530.708018] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> > [  530.708022] Modules linked in: bonding rfkill sunrpc uio_pdrv_genirq 
> > pseries_rng uio drm fuse drm_panel_orientation_quirks ip_tables xfs 
> > libcrc32c sd_mod t10_pi ibmvscsi ibmveth scsi_trans
> > port_srp xts vmx_crypto
> > [  530.708049] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Not tainted 5.11.0-r
> 
> ok so we call current_thread_amr() with kthread.
> 
> commit ae33fb7b069ebb41e32f55ae397c887031e47472
> Author: Aneesh Kumar K.V 
> Date:   Fri Feb 5 19:11:49 2021 +0530
> 
> 
> The other stack that matters is
> ...
> [  530.710838] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Tainted: G  D  
>  5.11.0-rc6+ #3
> 
> 
>  NIP [c00aa0c8] pkey_access_permitted+0x28/0x90
>  LR [c04b9278] gup_pte_range+0x188/0x420
>  --- interrupt: 700
>  [c0001c4ef3f0] [] 0x0 (unreliable)
>  [c0001c4ef490] [c04bd39c] gup_pgd_range+0x3ac/0xa20
>  [c0001c4ef5a0] [c04bdd44] 
> internal_get_user_pages_fast+0x334/0x410
>  [c0001c4ef620] [c0852028] iov_iter_get_pages+0xf8/0x5c0
>  [c0001c4ef6a0] [c07da44c] bio_iov_iter_get_pages+0xec/0x700
>  [c0001c4ef770] [c06a325c] iomap_dio_bio_actor+0x2ac/0x4f0
>  [c0001c4ef810] [c069cd94] iomap_apply+0x2b4/0x740
>  [c0001c4ef920] [c06a38b8] __iomap_dio_rw+0x238/0x5c0
>  [c0001c4ef9d0] [c06a3c60] iomap_dio_rw+0x20/0x80
>  [c0001c4ef9f0] [c00801927a30] xfs_file_dio_aio_write+0x1f8/0x650 
> [xfs]
>  [c0001c4efa60] [c008019284dc] xfs_file_write_iter+0xc4/0x130 
> [xfs]
>  [c0001c4efa90] [c0669984] io_write+0x104/0x4b0
>  [c0001c4efbb0] [c066cea4] io_issue_sqe+0x3d4/0xf50
>  [c0001c4efc60] [c0670200] io_wq_submit_work+0xb0/0x2f0
>  [c0001c4efcb0] [c0674268] io_worker_handle_work+0x248/0x4a0
>  [c0001c4efd30] [c06746e8] io_wqe_worker+0x228/0x2a0
>  [c0001c4efda0] [c019d994] kthread+0x1b4/0x1c0
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/kup.h 
> b/arch/powerpc/include/asm/book3s/64/kup.h
> index 2064621ae7b6..21e59c1f0d67 100644
> --- a/arch/powerpc/include/asm/book3s/64/kup.h
> +++ b/arch/powerpc/include/asm/book3s/64/kup.h
> @@ -204,14 +204,16 @@ DECLARE_STATIC_KEY_FALSE(uaccess_flush_key);
>  
>  static inline u64 current_thread_amr(void)
>  {
> - VM_BUG_ON(!current->thread.regs);
> - return current->thread.regs->amr;
> + if (current->thread.regs)
> + return current->thread.regs->amr;
> + return 0;
>  }
>  
>  static inline u64 current_thread_iamr(void)
>  {
> - VM_BUG_ON(!current->thread.regs);
> - return current->thread.regs->iamr;
> + if (current->thread.regs)
> + return current->thread.regs->iamr;
> + return 0;
>  }
>  #endif /* CONFIG_PPC_PKEY */

This change can help to avoid above regression issue:

# ./check generic/013 generic/616 generic/617
FSTYP -- xfs (debug)
PLATFORM  -- Linux/ppc64le ibm-p9z-xx-xxx 5.11.0-rc6+ #4 SMP Fri Feb 5 
10:22:14 EST 2021
MKFS_OPTIONS  -- -f -m crc=1,finobt=1,rmapbt=1,reflink=1,inobtcount=1,bigtime=1 
/dev/sda3
MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 /dev/sda3 
/mnt/xfstests/scratch

generic/013 37s ...  42s
generic/616  166s
generic/617 16s ...  20s
Ran: generic/013 generic/616 generic/617
Passed all 3 tests

>  
> 



Re: [PATCH 1/2] PCI/AER: Disable AER interrupt during suspend

2021-02-05 Thread Kai-Heng Feng
On Fri, Feb 5, 2021 at 7:28 AM Bjorn Helgaas  wrote:
>
> [+cc Alex]
>
> On Thu, Jan 28, 2021 at 12:09:37PM +0800, Kai-Heng Feng wrote:
> > On Thu, Jan 28, 2021 at 4:51 AM Bjorn Helgaas  wrote:
> > > On Thu, Jan 28, 2021 at 01:31:00AM +0800, Kai-Heng Feng wrote:
> > > > Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
> > > > hint") enables ACS, and some platforms lose its NVMe after resume from
> > > > firmware:
> > > > [   50.947816] pcieport :00:1b.0: DPC: containment event, 
> > > > status:0x1f01 source:0x
> > > > [   50.947817] pcieport :00:1b.0: DPC: unmasked uncorrectable error 
> > > > detected
> > > > [   50.947829] pcieport :00:1b.0: PCIe Bus Error: 
> > > > severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
> > > > [   50.947830] pcieport :00:1b.0:   device [8086:06ac] error 
> > > > status/mask=0020/0001
> > > > [   50.947831] pcieport :00:1b.0:[21] ACSViol
> > > > (First)
> > > > [   50.947841] pcieport :00:1b.0: AER: broadcast error_detected 
> > > > message
> > > > [   50.947843] nvme nvme0: frozen state error detected, reset controller
> > > >
> > > > It happens right after ACS gets enabled during resume.
> > > >
> > > > To prevent that from happening, disable AER interrupt and enable it on
> > > > system suspend and resume, respectively.
> > >
> > > Lots of questions here.  Maybe this is what we'll end up doing, but I
> > > am curious about why the error is reported in the first place.
> > >
> > > Is this a consequence of the link going down and back up?
> >
> > Could be. From the observations, it only happens when firmware suspend
> > (S3) is used.
> > Maybe it happens when it's gets powered up, but I don't have equipment
> > to debug at hardware level.
> >
> > If we use non-firmware suspend method, enabling ACS after resume won't
> > trip AER and DPC.
> >
> > > Is it consequence of the device doing a DMA when it shouldn't?
> >
> > If it's doing DMA while suspending, the same error should also happen
> > after NVMe is suspended and before PCIe port suspending.
> > Furthermore, if non-firmware suspend method is used, there's so such
> > issue, so less likely to be any DMA operation.
> >
> > > Are we doing something in the wrong order during suspend?  Or maybe
> > > resume, since I assume the error is reported during resume?
> >
> > Yes the error is reported during resume. The suspend/resume order
> > seems fine as non-firmware suspend doesn't have this issue.
>
> I really feel like we need a better understanding of what's going on
> here.  Disabling the AER interrupt is like closing our eyes and
> pretending that because we don't see it, it didn't happen.
>
> An ACS error is triggered by a DMA, right?  I'm assuming an MMIO
> access from the CPU wouldn't trigger this error.  And it sounds like
> the error is triggered before we even start running the driver after
> resume.
>
> If we're powering up an NVMe device from D3cold and it DMAs before the
> driver touches it, something would be seriously broken.  I doubt
> that's what's happening.  Maybe a device could resume some previously
> programmed DMA after powering up from D3hot.

I am not that familiar with PCIe ACS/AER/DPC, so I can't really answer
questions you raised.
The PCIe spec doesn't specify the suspend/resume order, which is also not helping here.

However, I really think it's a system firmware issue.
I've seen some suspend-to-idle platforms where the NVMe device can reach
D3cold; those are unaffected.

>
> Or maybe the error occurred on suspend, like if the device wasn't
> quiesced or something, but we didn't notice it until resume?  The
> AER error status bits are RW1CS, which means they can be preserved
> across hot/warm/cold resets.
>
> Can you instrument the code to see whether the AER error status bit is
> set before enabling ACS?  I'm not sure that merely enabling ACS (I
> assume you mean pci_std_enable_acs(), where we write PCI_ACS_CTRL)
> should cause an interrupt for a previously-logged error.  I suspect
> that could happen when enabling *AER*, but I wouldn't think it would
> happen when enabling *ACS*.

Diff to print AER status:
https://bugzilla.kernel.org/show_bug.cgi?id=209149#c11

And dmesg:
https://bugzilla.kernel.org/show_bug.cgi?id=209149#c12

Looks like the read before suspend and after resume are both fine.

>
> Does this error happen on multiple machines from different vendors?
> Wondering if it could be a BIOS issue, e.g., BIOS not cleaning up
> after it did something to cause an error.

AFAIK, systems from both HP and Dell are affected.
I was told that the reference platform from Intel is using
suspend-to-idle, but vendors changed the sleep method to S3 to have
lower power consumption to pass regulation.

Kai-Heng

>
> > > If we *do* take the error, why doesn't DPC recovery work?
> >
> > It works for the root port, but not for the NVMe drive:
> > [   50.947816] pcieport :00:1b.0: DPC: containment event,
> > status:0x1f01 source:0x
> 

Re: [PATCH] powerpc/kuap: Allow kernel thread to access userspace after kthread_use_mm

2021-02-05 Thread Aneesh Kumar K.V
Zorro Lang  writes:



> ...
> [  530.180466] run fstests generic/617 at 2021-02-05 03:41:10
> [  530.707969] [ cut here ]
> [  530.708006] kernel BUG at arch/powerpc/include/asm/book3s/64/kup.h:207!
> [  530.708013] Oops: Exception in kernel mode, sig: 5 [#1]
> [  530.708018] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> [  530.708022] Modules linked in: bonding rfkill sunrpc uio_pdrv_genirq 
> pseries_rng uio drm fuse drm_panel_orientation_quirks ip_tables xfs libcrc32c 
> sd_mod t10_pi ibmvscsi ibmveth scsi_trans
> port_srp xts vmx_crypto
> [  530.708049] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Not tainted 5.11.0-r

ok so we call current_thread_amr() with kthread.

commit ae33fb7b069ebb41e32f55ae397c887031e47472
Author: Aneesh Kumar K.V 
Date:   Fri Feb 5 19:11:49 2021 +0530


The other stack that matters is
...
[  530.710838] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Tainted: G  D
   5.11.0-rc6+ #3


 NIP [c00aa0c8] pkey_access_permitted+0x28/0x90
 LR [c04b9278] gup_pte_range+0x188/0x420
 --- interrupt: 700
 [c0001c4ef3f0] [] 0x0 (unreliable)
 [c0001c4ef490] [c04bd39c] gup_pgd_range+0x3ac/0xa20
 [c0001c4ef5a0] [c04bdd44] 
internal_get_user_pages_fast+0x334/0x410
 [c0001c4ef620] [c0852028] iov_iter_get_pages+0xf8/0x5c0
 [c0001c4ef6a0] [c07da44c] bio_iov_iter_get_pages+0xec/0x700
 [c0001c4ef770] [c06a325c] iomap_dio_bio_actor+0x2ac/0x4f0
 [c0001c4ef810] [c069cd94] iomap_apply+0x2b4/0x740
 [c0001c4ef920] [c06a38b8] __iomap_dio_rw+0x238/0x5c0
 [c0001c4ef9d0] [c06a3c60] iomap_dio_rw+0x20/0x80
 [c0001c4ef9f0] [c00801927a30] xfs_file_dio_aio_write+0x1f8/0x650 
[xfs]
 [c0001c4efa60] [c008019284dc] xfs_file_write_iter+0xc4/0x130 [xfs]
 [c0001c4efa90] [c0669984] io_write+0x104/0x4b0
 [c0001c4efbb0] [c066cea4] io_issue_sqe+0x3d4/0xf50
 [c0001c4efc60] [c0670200] io_wq_submit_work+0xb0/0x2f0
 [c0001c4efcb0] [c0674268] io_worker_handle_work+0x248/0x4a0
 [c0001c4efd30] [c06746e8] io_wqe_worker+0x228/0x2a0
 [c0001c4efda0] [c019d994] kthread+0x1b4/0x1c0

diff --git a/arch/powerpc/include/asm/book3s/64/kup.h 
b/arch/powerpc/include/asm/book3s/64/kup.h
index 2064621ae7b6..21e59c1f0d67 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -204,14 +204,16 @@ DECLARE_STATIC_KEY_FALSE(uaccess_flush_key);
 
 static inline u64 current_thread_amr(void)
 {
-   VM_BUG_ON(!current->thread.regs);
-   return current->thread.regs->amr;
+   if (current->thread.regs)
+   return current->thread.regs->amr;
+   return 0;
 }
 
 static inline u64 current_thread_iamr(void)
 {
-   VM_BUG_ON(!current->thread.regs);
-   return current->thread.regs->iamr;
+   if (current->thread.regs)
+   return current->thread.regs->iamr;
+   return 0;
 }
 #endif /* CONFIG_PPC_PKEY */
 


Re: [PATCH] powerpc/pseries/dlpar: handle ibm, configure-connector delay status

2021-02-05 Thread Nathan Lynch
Nathan Lynch  writes:
> dlpar_configure_connector() has two problems in its handling of
> ibm,configure-connector's return status:
>
> 1. When the status is -2 (busy, call again), we call
>ibm,configure-connector again immediately without checking whether
>to schedule, which can result in monopolizing the CPU.
> 2. Extended delay status (9900..9905) goes completely unhandled,
>causing the configuration to unnecessarily terminate.
>
> Fix both of these issues by using rtas_busy_delay().
>
> Fixes: ab519a011caa ("powerpc/pseries: Kernel DLPAR Infrastructure")
> Signed-off-by: Nathan Lynch 

Just following up and adding some people to cc in hopes of getting some
review for this.


> ---
>  arch/powerpc/platforms/pseries/dlpar.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
> b/arch/powerpc/platforms/pseries/dlpar.c
> index 16e86ba8aa20..f6b7749d6ada 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -127,7 +127,6 @@ void dlpar_free_cc_nodes(struct device_node *dn)
>  #define NEXT_PROPERTY   3
>  #define PREV_PARENT 4
>  #define MORE_MEMORY 5
> -#define CALL_AGAIN   -2
>  #define ERR_CFG_USE -9003
>  
>  struct device_node *dlpar_configure_connector(__be32 drc_index,
> @@ -168,6 +167,9 @@ struct device_node *dlpar_configure_connector(__be32 
> drc_index,
>  
>   spin_unlock(_data_buf_lock);
>  
> + if (rtas_busy_delay(rc))
> + continue;
> +
>   switch (rc) {
>   case COMPLETE:
>   break;
> @@ -216,9 +218,6 @@ struct device_node *dlpar_configure_connector(__be32 
> drc_index,
>   last_dn = last_dn->parent;
>   break;
>  
> - case CALL_AGAIN:
> - break;
> -
>   case MORE_MEMORY:
>   case ERR_CFG_USE:
>   default:
> -- 
> 2.29.2


Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-05 Thread Fabiano Rosas
Leonardo Bras  writes:

> Before guest entry, TBU40 register is changed to reflect guest timebase.
> After exiting the guest, the register is reverted to its original value.
>
> If one tries to get the timestamp from host between those changes, it
> will present an incorrect value.
>
> An example would be trying to add a tracepoint in
> kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> acquired could actually cause the host to crash.
>
> Save the Timebase Offset to PACA and use it on sched_clock() to always
> get the correct timestamp.
>
> Signed-off-by: Leonardo Bras 
> Suggested-by: Paul Mackerras 
> ---
> Changes since v1:
> - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
>   CONFIG_PPC_BOOK3S_64 are defined.
> ---
>  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
>  arch/powerpc/kernel/asm-offsets.c | 1 +
>  arch/powerpc/kernel/time.c| 8 +++-
>  arch/powerpc/kvm/book3s_hv.c  | 2 ++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
>  5 files changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
> b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index 078f4648ea27..e2c12a10eed2 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -131,6 +131,7 @@ struct kvmppc_host_state {
>   u64 cfar;
>   u64 ppr;
>   u64 host_fscr;
> + u64 tb_offset;  /* Timebase offset: keeps correct
> timebase while on guest */

Couldn't you use the vc->tb_offset_applied for this? We have a reference
for the vcore in the hstate already.

>  #endif
>  };
>
> diff --git a/arch/powerpc/kernel/asm-offsets.c 
> b/arch/powerpc/kernel/asm-offsets.c
> index b12d7c049bfe..0beb8fdc6352 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -706,6 +706,7 @@ int main(void)
>   HSTATE_FIELD(HSTATE_CFAR, cfar);
>   HSTATE_FIELD(HSTATE_PPR, ppr);
>   HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> + HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
>  #endif /* CONFIG_PPC_BOOK3S_64 */
>
>  #else /* CONFIG_PPC_BOOK3S */
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 67feb3524460..f27f0163792b 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
>   */
>  notrace unsigned long long sched_clock(void)
>  {
> - return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
> + u64 tb = get_tb() - boot_tb;
> +
> +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
> + tb -= local_paca->kvm_hstate.tb_offset;
> +#endif
> +
> + return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
>  }
>
>
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index b3731572295e..c08593c63353 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> *vcpu, u64 time_limit,
>   if ((tb & 0xff) < (new_tb & 0xff))
>   mtspr(SPRN_TBU40, new_tb + 0x100);
>   vc->tb_offset_applied = vc->tb_offset;
> + local_paca->kvm_hstate.tb_offset = vc->tb_offset;
>   }
>
>   if (vc->pcr)
> @@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> *vcpu, u64 time_limit,
>   if ((tb & 0xff) < (new_tb & 0xff))
>   mtspr(SPRN_TBU40, new_tb + 0x100);
>   vc->tb_offset_applied = 0;
> + local_paca->kvm_hstate.tb_offset = 0;
>   }
>
>   mtspr(SPRN_HDEC, 0x7fff);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index b73140607875..8f7a9f7f4ee6 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
>   cmpdi   r8,0
>   beq 37f
>   std r8, VCORE_TB_OFFSET_APPL(r5)
> + std r8, HSTATE_TB_OFFSET(r13)
>   mftbr6  /* current host timebase */
>   add r8,r8,r6
>   mtspr   SPRN_TBU40,r8   /* update upper 40 bits */
> @@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
>   beq 17f
>   li  r0, 0
>   std r0, VCORE_TB_OFFSET_APPL(r5)
> + std r0, HSTATE_TB_OFFSET(r13)
>   mftbr6  /* current guest timebase */
>   subfr8,r8,r6
>   mtspr   SPRN_TBU40,r8   /* update upper 40 bits */


Re: [PATCH v2 2/2] ima: Free IMA measurement buffer after kexec syscall

2021-02-05 Thread Greg KH
On Thu, Feb 04, 2021 at 09:49:51AM -0800, Lakshmi Ramasubramanian wrote:
> IMA allocates kernel virtual memory to carry forward the measurement
> list, from the current kernel to the next kernel on kexec system call,
> in ima_add_kexec_buffer() function.  This buffer is not freed before
> completing the kexec system call resulting in a memory leak.
> 
> Add ima_buffer field in "struct kimage" to store the virtual address
> of the buffer allocated for the IMA measurement list.
> Free the memory allocated for the IMA measurement list in
> kimage_file_post_load_cleanup() function.
> 
> Signed-off-by: Lakshmi Ramasubramanian 
> Suggested-by: Tyler Hicks 
> Reviewed-by: Thiago Jung Bauermann 
> Reviewed-by: Tyler Hicks 
> Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
> ---
>  include/linux/kexec.h  | 5 +
>  kernel/kexec_file.c| 5 +
>  security/integrity/ima/ima_kexec.c | 2 ++
>  3 files changed, 12 insertions(+)



This is not the correct way to submit patches for inclusion in the
stable kernel tree.  Please read:
https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
for how to do this properly.




Re: [PATCH v2 1/2] ima: Free IMA measurement buffer on error

2021-02-05 Thread Greg KH
On Thu, Feb 04, 2021 at 09:49:50AM -0800, Lakshmi Ramasubramanian wrote:
> IMA allocates kernel virtual memory to carry forward the measurement
> list, from the current kernel to the next kernel on kexec system call,
> in ima_add_kexec_buffer() function.  In error code paths this memory
> is not freed resulting in a memory leak.
> 
> Free the memory allocated for the IMA measurement list in
> the error code paths in ima_add_kexec_buffer() function.
> 
> Signed-off-by: Lakshmi Ramasubramanian 
> Suggested-by: Tyler Hicks 
> Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
> ---
>  security/integrity/ima/ima_kexec.c | 1 +
>  1 file changed, 1 insertion(+)



This is not the correct way to submit patches for inclusion in the
stable kernel tree.  Please read:
https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
for how to do this properly.




Re: [PATCH] powerpc/kuap: Allow kernel thread to access userspace after kthread_use_mm

2021-02-05 Thread Zorro Lang
On Fri, Feb 05, 2021 at 08:34:26AM +0530, Aneesh Kumar K.V wrote:
> This fixes the bad fault reported by KUAP when io_wqe_worker accesses userspace.
> 
>  Bug: Read fault blocked by KUAP!
>  WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229 
> __do_page_fault+0x6b4/0xcd0
>  NIP [c009e7e4] __do_page_fault+0x6b4/0xcd0
>  LR [c009e7e0] __do_page_fault+0x6b0/0xcd0
> ..
>  Call Trace:
>  [c00016367330] [c009e7e0] __do_page_fault+0x6b0/0xcd0 
> (unreliable)
>  [c000163673e0] [c009ee3c] do_page_fault+0x3c/0x120
>  [c00016367430] [c000c848] handle_page_fault+0x10/0x2c
>  --- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
> ..
>  NIP [c08e8228] iov_iter_fault_in_readable+0x148/0x6f0
>  LR [c08e834c] iov_iter_fault_in_readable+0x26c/0x6f0
>  interrupt: 300
>  [c000163677e0] [c07154a0] iomap_write_actor+0xc0/0x280
>  [c00016367880] [c070fc94] iomap_apply+0x1c4/0x780
>  [c00016367990] [c0710330] iomap_file_buffered_write+0xa0/0x120
>  [c000163679e0] [c0080040791c] 
> xfs_file_buffered_aio_write+0x314/0x5e0 [xfs]
>  [c00016367a90] [c06d74bc] io_write+0x10c/0x460
>  [c00016367bb0] [c06d80e4] io_issue_sqe+0x8d4/0x1200
>  [c00016367c70] [c06d8ad0] io_wq_submit_work+0xc0/0x250
>  [c00016367cb0] [c06e2578] io_worker_handle_work+0x498/0x800
>  [c00016367d40] [c06e2cdc] io_wqe_worker+0x3fc/0x4f0
>  [c00016367da0] [c01cb0a4] kthread+0x1c4/0x1d0
>  [c00016367e10] [c000dbf0] ret_from_kernel_thread+0x5c/0x6c
> 
> The kernel consider thread AMR value for kernel thread to be
> AMR_KUAP_BLOCKED. Hence access to userspace is denied. This
> of course not correct and we should allow userspace access after
> kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
> AMR value of the operating address space. But, the AMR value is
> thread-specific and we inherit the address space and not thread
> access restrictions. Because of this ignore AMR value when accessing
> userspace via kernel thread.
> 
> Cc: Zorro Lang 
> Cc: Jens Axboe 
> Cc: Christophe Leroy 
> Cc: Nicholas Piggin 
> Signed-off-by: Aneesh Kumar K.V 
> ---

Hi,

Simply test on ppc64le with latest 5.11.0-rc6+.

1) Reproduced this bug at first:
# ./check generic/013
FSTYP -- xfs (debug)
PLATFORM  -- Linux/ppc64le ibm-p9z-xxx-xxx 5.11.0-rc6+ #2 SMP Fri Feb 5 
01:40:25 EST 2021
MKFS_OPTIONS  -- -f -m crc=1,finobt=1,rmapbt=1,reflink=1,inobtcount=1,bigtime=1 
/dev/sda3
MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 /dev/sda3 
/mnt/xfstests/scratch

generic/013 49s ... _check_dmesg: something found in dmesg (see 
/var/lib/xfstests/results//generic/013.dmesg)

Ran: generic/013
Failures: generic/013
Failed 1 of 1 tests

# cat results//generic/013.dmesg
...
[ 4261.095623] Kernel attempted to read user page (1003a0648b0) - exploit 
attempt? (uid: 0) 
[ 4261.095640] [ cut here ] 
[ 4261.095643] Bug: Read fault blocked by KUAP! 
[ 4261.095647] WARNING: CPU: 7 PID: 287137 at arch/powerpc/mm/fault.c:229 
bad_kernel_fault+0x180/0x310 
...
...

2) Test passed on the kernel with this patch:
# ./check generic/013 generic/051
FSTYP -- xfs (debug)
PLATFORM  -- Linux/ppc64le ibm-p9z-xx-xxx 5.11.0-rc6+ #3 SMP Fri Feb 5 
02:44:31 EST 2021
MKFS_OPTIONS  -- -f -m crc=1,finobt=1,rmapbt=1,reflink=1,inobtcount=1,bigtime=1 
/dev/sda3
MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 /dev/sda3 
/mnt/xfstests/scratch

generic/013 49s ...  42s
generic/051  87s
Ran: generic/013 generic/051
Passed all 2 tests

3) But when I just gave it a little more test, a test case hang and trigger a 
kernel BUG as below.
I thought it's a regression issue from this patch.
https://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git/tree/tests/generic/617

# ./check generic/616 generic/617
FSTYP -- xfs (debug)
PLATFORM  -- Linux/ppc64le ibm-p9z-xx-xxx 5.11.0-rc6+ #3 SMP Fri Feb 5 
02:44:31 EST 2021
MKFS_OPTIONS  -- -f -m crc=1,finobt=1,rmapbt=1,reflink=1,inobtcount=1,bigtime=1 
/dev/sda3
MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 /dev/sda3 
/mnt/xfstests/scratch

generic/616  170s
generic/617 ^C^C^C^C

# dmesg
...
[  530.180466] run fstests generic/617 at 2021-02-05 03:41:10
[  530.707969] [ cut here ]
[  530.708006] kernel BUG at arch/powerpc/include/asm/book3s/64/kup.h:207!
[  530.708013] Oops: Exception in kernel mode, sig: 5 [#1]
[  530.708018] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
[  530.708022] Modules linked in: bonding rfkill sunrpc uio_pdrv_genirq 
pseries_rng uio drm fuse drm_panel_orientation_quirks ip_tables xfs libcrc32c 
sd_mod t10_pi ibmvscsi ibmveth scsi_trans
port_srp xts vmx_crypto
[  530.708049] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Not tainted 5.11.0-rc6+ 
#3
[  530.708055] NIP:  c00aa0c8 LR: c04b9278 CTR: 

Re: [PATCH] mm/memtest: Add ARCH_USE_MEMTEST

2021-02-05 Thread Vladimir Murzin
Hi Anshuman,

On 2/5/21 4:10 AM, Anshuman Khandual wrote:
> early_memtest() does not get called from all architectures. Hence enabling
> CONFIG_MEMTEST and providing a valid memtest=[1..N] kernel command line
> option might not trigger the memory pattern tests as would be expected in
> normal circumstances. This situation is misleading.

Documentation already mentions which architectures support that:

memtest=[KNL,X86,ARM,PPC] Enable memtest

yet I admit that not all reflected there

> 
> The change here prevents the above mentioned problem after introducing a
> new config option ARCH_USE_MEMTEST that should be subscribed on platforms
> that call early_memtest(), in order to enable the config CONFIG_MEMTEST.
> Conversely CONFIG_MEMTEST cannot be enabled on platforms where it would
> not be tested anyway.
> 

Is that generic pattern? What about other cross arch parameters? Do they already
use similar subscription or they rely on documentation?

I'm not against the patch just want to check if things are consistent...

Cheers
Vladimir


[PATCH V2] powerpc/perf: Record counter overflow always if SAMPLE_IP is unset

2021-02-05 Thread Athira Rajeev
While sampling for marked events, currently we record the sample only
if the SIAR valid bit of Sampled Instruction Event Register (SIER) is
set. SIAR_VALID bit is used for fetching the instruction address from
Sampled Instruction Address Register(SIAR). But there are some usecases,
where the user is interested only in the PMU stats at each counter
overflow and the exact IP of the overflow event is not required.
Dropping SIAR invalid samples will fail to record some of the counter
overflows in such cases.

Example of such usecase is dumping the PMU stats (event counts)
after some regular amount of instructions/events from the userspace
(ex: via ptrace). Here counter overflow is indicated to userspace via
signal handler, and captured by monitoring and enabling I/O
signaling on the event file descriptor. In these cases, we expect to
get sample/overflow indication after each specified sample_period.

Perf event attribute will not have PERF_SAMPLE_IP set in the
sample_type if exact IP of the overflow event is not requested. So
while profiling if SAMPLE_IP is not set, just record the counter overflow
irrespective of SIAR_VALID check.

Suggested-by: Michael Ellerman 
Signed-off-by: Athira Rajeev 
---
Changes in v2:
-- Changed the approach to include PERF_SAMPLE_IP
   condition while checking siar_valid as Suggested by
   Michael Ellerman.

 arch/powerpc/perf/core-book3s.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 28206b1fe172..0ddbe33798ce 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2149,7 +2149,17 @@ static void record_and_restart(struct perf_event *event, 
unsigned long val,
left += period;
if (left <= 0)
left = period;
-   record = siar_valid(regs);
+
+   /*
+* If address is not requested in the sample
+* via PERF_SAMPLE_IP, just record that sample
+* irrespective of SIAR valid check.
+*/
+   if (event->attr.sample_type & PERF_SAMPLE_IP)
+   record = siar_valid(regs);
+   else
+   record = 1;
+
event->hw.last_period = event->hw.sample_period;
}
if (left < 0x8000LL)
@@ -2167,9 +2177,10 @@ static void record_and_restart(struct perf_event *event, 
unsigned long val,
 * MMCR2. Check attr.exclude_kernel and address to drop the sample in
 * these cases.
 */
-   if (event->attr.exclude_kernel && record)
-   if (is_kernel_addr(mfspr(SPRN_SIAR)))
-   record = 0;
+   if (event->attr.exclude_kernel &&
+   (event->attr.sample_type & PERF_SAMPLE_IP) &&
+   is_kernel_addr(mfspr(SPRN_SIAR)))
+   record = 0;
 
/*
 * Finally record data if requested.
-- 
1.8.3.1



[PATCH] powerpc/8xx: Fix software emulation interrupt

2021-02-05 Thread Christophe Leroy
For unimplemented instructions or unimplemented SPRs, the 8xx triggers
a "Software Emulation Exception" (0x1000). That interrupt doesn't set
reason bits in SRR1 as the "Program Check Exception" does.

Go through emulation_assist_interrupt() to set REASON_ILLEGAL.

Fixes: fbbcc3bb139e ("powerpc/8xx: Remove SoftwareEmulation()")
Signed-off-by: Christophe Leroy 
---
I'm wondering whether it wouldn't be better to set REASON_ILLEGAL
in the exception prolog and still call program_check_exception.
And do the same in book3s/64 to avoid the nightmare of an
INTERRUPT_HANDLER calling another INTERRUPT_HANDLER.
---
 arch/powerpc/kernel/head_8xx.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 52702f3db6df..9eb63cf6ac38 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -165,7 +165,7 @@ SystemCall:
 /* On the MPC8xx, this is a software emulation interrupt.  It occurs
  * for all unimplemented and illegal instructions.
  */
-   EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD)
+   EXCEPTION(0x1000, SoftEmu, emulation_assist_interrupt, EXC_XFER_STD)
 
. = 0x1100
 /*
-- 
2.25.0



Re: [PATCH] mm/pmem: Avoid inserting hugepage PTE entry with fsdax if hugepage support is disabled

2021-02-05 Thread David Hildenbrand

On 05.02.21 03:39, Aneesh Kumar K.V wrote:

Differentiate between hardware not supporting hugepages and user disabling THP
via 'echo never > /sys/kernel/mm/transparent_hugepage/enabled'

For the devdax namespace, the kernel handles the above via the
supported_alignment attribute and failing to initialize the namespace
if the namespace align value is not supported on the platform.

For the fsdax namespace, the kernel will continue to initialize
the namespace. This can result in the kernel creating a huge pte
entry even though the hardware doesn't support it.

We do want hugepage support with pmem even if the end-user disabled THP
via sysfs file (/sys/kernel/mm/transparent_hugepage/enabled). Hence
differentiate between hardware/firmware lacking support vs user-controlled
disable of THP and prevent a huge fault if the hardware lacks hugepage
support.

Signed-off-by: Aneesh Kumar K.V 
---
  include/linux/huge_mm.h | 15 +--
  mm/huge_memory.c|  6 +-
  2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6a19f35f836b..ba973efcd369 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -78,6 +78,7 @@ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault 
*vmf, pfn_t pfn,
  }
  
  enum transparent_hugepage_flag {

+   TRANSPARENT_HUGEPAGE_NEVER_DAX,
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
@@ -123,6 +124,13 @@ extern unsigned long transparent_hugepage_flags;
   */
  static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
  {
+
+   /*
+* If the hardware/firmware marked hugepage support disabled.
+*/
+   if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+   return false;
+
if (vma->vm_flags & VM_NOHUGEPAGE)
return false;
  
@@ -134,12 +142,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
  
  	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))

return true;
-   /*
-* For dax vmas, try to always use hugepage mappings. If the kernel does
-* not support hugepages, fsdax mappings will fallback to PAGE_SIZE
-* mappings, and device-dax namespaces, that try to guarantee a given
-* mapping size, will fail to enable
-*/
+
if (vma_is_dax(vma))
return true;
  
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 9237976abe72..d698b7e27447 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -386,7 +386,11 @@ static int __init hugepage_init(void)
struct kobject *hugepage_kobj;
  
  	if (!has_transparent_hugepage()) {

-   transparent_hugepage_flags = 0;
+   /*
+* Hardware doesn't support hugepages, hence disable
+* DAX PMD support.
+*/
+   transparent_hugepage_flags = 1 << 
TRANSPARENT_HUGEPAGE_NEVER_DAX;
return -EINVAL;
}
  



Looks sane to me from my limited understanding of that code :)

--
Thanks,

David / dhildenb



Re: [PATCH] arch:powerpc simple_write_to_buffer return check

2021-02-05 Thread Mayank Suman
On 05/02/21 12:51 pm, Christophe Leroy wrote:
> Please provide some description of the change.
> 
> And please clarify the patch subject, because as far as I can see, the return 
> is already checked allthough the check seams wrong.

This was my first patch. I will try to provide better description of changes 
and subject in later patches.

> Le 04/02/2021 à 19:16, Mayank Suman a écrit :
>> Signed-off-by: Mayank Suman 
>> ---
>>   arch/powerpc/kernel/eeh.c    | 8 
>>   arch/powerpc/platforms/powernv/eeh-powernv.c | 4 ++--
>>   2 files changed, 6 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
>> index 813713c9120c..2dbe1558a71f 100644
>> --- a/arch/powerpc/kernel/eeh.c
>> +++ b/arch/powerpc/kernel/eeh.c
>> @@ -1628,8 +1628,8 @@ static ssize_t eeh_force_recover_write(struct file 
>> *filp,
>>   char buf[20];
>>   int ret;
>>   -    ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
>> -    if (!ret)
>> +    ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
>> +    if (ret <= 0)
>> +        return -EFAULT;
> 
> Why return -EFAULT when the function has returned -EINVAL ?

If -EINVAL is returned by simple_write_to_buffer, we should return -EINVAL.

> And why is it -EFAULT when ret is 0 ? EFAULT means error accessing memory.
> 

The earlier check returned EFAULT when ret is 0. Most probably, there was an 
assumption
that writing 0 bytes (by simple_write_to_buffer) means a fault with memory (or 
error accessing memory).


Re: [PATCH v7 28/42] powerpc: convert interrupt handlers to use wrappers

2021-02-05 Thread Christophe Leroy




Le 30/01/2021 à 14:08, Nicholas Piggin a écrit :

Signed-off-by: Nicholas Piggin 
---



diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index f70d3f6174c8..7ff915aae8ec 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c



@@ -1462,7 +1474,7 @@ static int emulate_math(struct pt_regs *regs)
  static inline int emulate_math(struct pt_regs *regs) { return -1; }
  #endif
  
-void program_check_exception(struct pt_regs *regs)

+DEFINE_INTERRUPT_HANDLER(program_check_exception)
  {
enum ctx_state prev_state = exception_enter();
unsigned int reason = get_reason(regs);
@@ -1587,14 +1599,14 @@ NOKPROBE_SYMBOL(program_check_exception);
   * This occurs when running in hypervisor mode on POWER6 or later
   * and an illegal instruction is encountered.
   */
-void emulation_assist_interrupt(struct pt_regs *regs)
+DEFINE_INTERRUPT_HANDLER(emulation_assist_interrupt)
  {
regs->msr |= REASON_ILLEGAL;
program_check_exception(regs);


Is it correct that an INTERRUPT_HANDLER calls another INTERRUPT_HANDLER ?


  }
  NOKPROBE_SYMBOL(emulation_assist_interrupt);