[PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with %cr3

2017-12-16 Thread Sai Praneeth Prakhya
From: Sai Praneeth 

Use helper function (efi_switch_mm()) to switch to/from efi_mm. We
switch to efi_mm before calling
1. efi_set_virtual_address_map() and
2. Invoking any efi_runtime_service()

Likewise, we need to switch back to previous mm (mm context stolen by
efi_mm) after the above calls return successfully. We can use
efi_switch_mm() helper function only with x86_64 kernel and
"efi=old_map" disabled because, x86_32 and efi=old_map doesn't use
efi_pgd, rather they use swapper_pg_dir.

Signed-off-by: Sai Praneeth Prakhya 
Cc: Lee, Chun-Yi 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Andy Lutomirski 
Cc: Michael S. Tsirkin 
Cc: Bhupesh Sharma 
Cc: Ricardo Neri 
Cc: Matt Fleming 
Cc: Ard Biesheuvel 
Cc: Ravi Shankar 
Tested-by: Bhupesh Sharma 
---
 arch/x86/include/asm/efi.h   | 25 +-
 arch/x86/platform/efi/efi_64.c   | 41 ++--
 arch/x86/platform/efi/efi_thunk_64.S |  2 +-
 3 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 00f977ddd718..cda9940bed7a 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -62,14 +62,13 @@ extern asmlinkage u64 efi_call(void *fp, ...);
 #define efi_call_phys(f, args...)  efi_call((f), args)
 
 /*
- * Scratch space used for switching the pagetable in the EFI stub
+ * struct efi_scratch - Scratch space used while switching to/from efi_mm
+ * @phys_stack: stack used during EFI Mixed Mode
+ * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm
  */
 struct efi_scratch {
-   u64 r15;
-   u64 prev_cr3;
-   pgd_t   *efi_pgt;
-   booluse_pgd;
-   u64 phys_stack;
+   u64 phys_stack;
+   struct mm_struct*prev_mm;
 } __packed;
 
 #define arch_efi_call_virt_setup() \
@@ -78,11 +77,8 @@ struct efi_scratch {
preempt_disable();  \
__kernel_fpu_begin();   \
\
-   if (efi_scratch.use_pgd) {  \
-   efi_scratch.prev_cr3 = __read_cr3();\
-   write_cr3((unsigned long)efi_scratch.efi_pgt);  \
-   __flush_tlb_all();  \
-   }   \
+   if (!efi_enabled(EFI_OLD_MEMMAP))   \
+   efi_switch_mm(&efi_mm); \
 })
 
 #define arch_efi_call_virt(p, f, args...)  \
@@ -90,10 +86,8 @@ struct efi_scratch {
 
 #define arch_efi_call_virt_teardown()  \
 ({ \
-   if (efi_scratch.use_pgd) {  \
-   write_cr3(efi_scratch.prev_cr3);\
-   __flush_tlb_all();  \
-   }   \
+   if (!efi_enabled(EFI_OLD_MEMMAP))   \
+   efi_switch_mm(efi_scratch.prev_mm); \
\
__kernel_fpu_end(); \
preempt_enable();   \
@@ -135,6 +129,7 @@ extern void __init efi_dump_pagetable(void);
 extern void __init efi_apply_memmap_quirks(void);
 extern int __init efi_reuse_config(u64 tables, int nr_tables);
 extern void efi_delete_dummy_variable(void);
+extern void efi_switch_mm(struct mm_struct *mm);
 
 struct efi_setup_data {
u64 fw_vendor;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 6b541bdbda5f..c325b1cc4d1a 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -82,9 +82,8 @@ pgd_t * __init efi_call_phys_prolog(void)
int n_pgds, i, j;
 
if (!efi_enabled(EFI_OLD_MEMMAP)) {
-   save_pgd = (pgd_t *)__read_cr3();
-   write_cr3((unsigned long)efi_scratch.efi_pgt);
-   goto out;
+   efi_switch_mm(&efi_mm);
+   return NULL;
}
 
early_code_mapping_set_exec(1);
@@ -154,8 +153,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
pud_t *pud;
 
if (!efi_enabled(EFI_OLD_MEMMAP)) {
-   

[PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with %cr3

2017-12-16 Thread Sai Praneeth Prakhya
From: Sai Praneeth 

Use helper function (efi_switch_mm()) to switch to/from efi_mm. We
switch to efi_mm before calling
1. efi_set_virtual_address_map() and
2. Invoking any efi_runtime_service()

Likewise, we need to switch back to previous mm (mm context stolen by
efi_mm) after the above calls return successfully. We can use
efi_switch_mm() helper function only with x86_64 kernel and
"efi=old_map" disabled because, x86_32 and efi=old_map doesn't use
efi_pgd, rather they use swapper_pg_dir.

Signed-off-by: Sai Praneeth Prakhya 
Cc: Lee, Chun-Yi 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Andy Lutomirski 
Cc: Michael S. Tsirkin 
Cc: Bhupesh Sharma 
Cc: Ricardo Neri 
Cc: Matt Fleming 
Cc: Ard Biesheuvel 
Cc: Ravi Shankar 
Tested-by: Bhupesh Sharma 
---
 arch/x86/include/asm/efi.h   | 25 +-
 arch/x86/platform/efi/efi_64.c   | 41 ++--
 arch/x86/platform/efi/efi_thunk_64.S |  2 +-
 3 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 00f977ddd718..cda9940bed7a 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -62,14 +62,13 @@ extern asmlinkage u64 efi_call(void *fp, ...);
 #define efi_call_phys(f, args...)  efi_call((f), args)
 
 /*
- * Scratch space used for switching the pagetable in the EFI stub
+ * struct efi_scratch - Scratch space used while switching to/from efi_mm
+ * @phys_stack: stack used during EFI Mixed Mode
+ * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm
  */
 struct efi_scratch {
-   u64 r15;
-   u64 prev_cr3;
-   pgd_t   *efi_pgt;
-   booluse_pgd;
-   u64 phys_stack;
+   u64 phys_stack;
+   struct mm_struct*prev_mm;
 } __packed;
 
 #define arch_efi_call_virt_setup() \
@@ -78,11 +77,8 @@ struct efi_scratch {
preempt_disable();  \
__kernel_fpu_begin();   \
\
-   if (efi_scratch.use_pgd) {  \
-   efi_scratch.prev_cr3 = __read_cr3();\
-   write_cr3((unsigned long)efi_scratch.efi_pgt);  \
-   __flush_tlb_all();  \
-   }   \
+   if (!efi_enabled(EFI_OLD_MEMMAP))   \
+   efi_switch_mm(&efi_mm); \
 })
 
 #define arch_efi_call_virt(p, f, args...)  \
@@ -90,10 +86,8 @@ struct efi_scratch {
 
 #define arch_efi_call_virt_teardown()  \
 ({ \
-   if (efi_scratch.use_pgd) {  \
-   write_cr3(efi_scratch.prev_cr3);\
-   __flush_tlb_all();  \
-   }   \
+   if (!efi_enabled(EFI_OLD_MEMMAP))   \
+   efi_switch_mm(efi_scratch.prev_mm); \
\
__kernel_fpu_end(); \
preempt_enable();   \
@@ -135,6 +129,7 @@ extern void __init efi_dump_pagetable(void);
 extern void __init efi_apply_memmap_quirks(void);
 extern int __init efi_reuse_config(u64 tables, int nr_tables);
 extern void efi_delete_dummy_variable(void);
+extern void efi_switch_mm(struct mm_struct *mm);
 
 struct efi_setup_data {
u64 fw_vendor;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 6b541bdbda5f..c325b1cc4d1a 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -82,9 +82,8 @@ pgd_t * __init efi_call_phys_prolog(void)
int n_pgds, i, j;
 
if (!efi_enabled(EFI_OLD_MEMMAP)) {
-   save_pgd = (pgd_t *)__read_cr3();
-   write_cr3((unsigned long)efi_scratch.efi_pgt);
-   goto out;
+   efi_switch_mm(&efi_mm);
+   return NULL;
}
 
early_code_mapping_set_exec(1);
@@ -154,8 +153,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
pud_t *pud;
 
if (!efi_enabled(EFI_OLD_MEMMAP)) {
-   write_cr3((unsigned long)save_pgd);
-   __flush_tlb_all();
+   efi_switch_mm(efi_scratch.prev_mm);
return;
}
 
@@ -341,13 +339,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, 
unsigned num_pages)
return 0;
 
/*
-* 

Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-25 Thread Andy Lutomirski
On Thu, Aug 24, 2017 at 7:36 PM, Sai Praneeth Prakhya
 wrote:
> On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote:
>> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
>>  wrote:
>> > +/*
>> > + * Makes the calling kernel thread switch to/from efi_mm context
>> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls
>> > + * (Note: This routine is heavily inspired from use_mm)
>> > + */
>> > +void efi_switch_mm(struct mm_struct *mm)
>> > +{
>> > +   struct task_struct *tsk = current;
>> > +
>> > +   task_lock(tsk);
>> > +   efi_scratch.prev_mm = tsk->active_mm;
>> > +   if (efi_scratch.prev_mm != mm) {
>> > +   mmgrab(mm);
>> > +   tsk->active_mm = mm;
>> > +   }
>> > +   switch_mm(efi_scratch.prev_mm, mm, NULL);
>> > +   task_unlock(tsk);
>> > +
>> > +   if (efi_scratch.prev_mm != mm)
>> > +   mmdrop(efi_scratch.prev_mm);
>>
>> I'm confused.  You're mmdropping an mm that you are still keeping a
>> pointer to.  This is also a bit confusing in the case where you do
>> efi_switch_mm(efi_scratch.prev_mm).
>>
>> This whole manipulation seems fairly dangerous to me for another
>> reason -- you're taking a user thread (I think) and swapping out its
>> mm to something that the user in question should *not* have access to.
>> What if a perf interrupt happens while you're in the alternate mm?
>> What if you segfault and dump core?  Should we maybe just have a flag
>> that says "this cpu is using a funny mm", assert that the flag is
>> clear when scheduling, and teach perf, coredumps, etc not to touch
>> user memory when the flag is set?
>>
>> Admittedly, the latter problem may well have existed even before these 
>> patches.
>
> Hi All,
>
> Could we please decouple the above issue from this patch set, so that we
> could have common efi_mm between x86 and ARM and also improve
> readability and maintainability for x86/efi.

I don't see why not.

>
> As it seems that "Everything EFI as kthread" might solve the above issue
> for real (which might take quite some time to implement, taking into
> consideration the complexity involved and some special case with
> pstore), do you think this patch set seems OK?
>
> If so, I will send out a V2 addressing the mmdropping issue.
>
> Regards,
> Sai
>


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-25 Thread Andy Lutomirski
On Thu, Aug 24, 2017 at 7:36 PM, Sai Praneeth Prakhya
 wrote:
> On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote:
>> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
>>  wrote:
>> > +/*
>> > + * Makes the calling kernel thread switch to/from efi_mm context
>> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls
>> > + * (Note: This routine is heavily inspired from use_mm)
>> > + */
>> > +void efi_switch_mm(struct mm_struct *mm)
>> > +{
>> > +   struct task_struct *tsk = current;
>> > +
>> > +   task_lock(tsk);
>> > +   efi_scratch.prev_mm = tsk->active_mm;
>> > +   if (efi_scratch.prev_mm != mm) {
>> > +   mmgrab(mm);
>> > +   tsk->active_mm = mm;
>> > +   }
>> > +   switch_mm(efi_scratch.prev_mm, mm, NULL);
>> > +   task_unlock(tsk);
>> > +
>> > +   if (efi_scratch.prev_mm != mm)
>> > +   mmdrop(efi_scratch.prev_mm);
>>
>> I'm confused.  You're mmdropping an mm that you are still keeping a
>> pointer to.  This is also a bit confusing in the case where you do
>> efi_switch_mm(efi_scratch.prev_mm).
>>
>> This whole manipulation seems fairly dangerous to me for another
>> reason -- you're taking a user thread (I think) and swapping out its
>> mm to something that the user in question should *not* have access to.
>> What if a perf interrupt happens while you're in the alternate mm?
>> What if you segfault and dump core?  Should we maybe just have a flag
>> that says "this cpu is using a funny mm", assert that the flag is
>> clear when scheduling, and teach perf, coredumps, etc not to touch
>> user memory when the flag is set?
>>
>> Admittedly, the latter problem may well have existed even before these 
>> patches.
>
> Hi All,
>
> Could we please decouple the above issue from this patch set, so that we
> could have common efi_mm between x86 and ARM and also improve
> readability and maintainability for x86/efi.

I don't see why not.

>
> As it seems that "Everything EFI as kthread" might solve the above issue
> for real (which might take quite some time to implement, taking into
> consideration the complexity involved and some special case with
> pstore), do you think this patch set seems OK?
>
> If so, I will send out a V2 addressing the mmdropping issue.
>
> Regards,
> Sai
>


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-25 Thread Andy Lutomirski
On Wed, Aug 23, 2017 at 3:52 PM, Sai Praneeth Prakhya
 wrote:
> On Mon, 2017-08-21 at 08:23 -0700, Andy Lutomirski wrote:
>>
>> > On Aug 21, 2017, at 7:08 AM, Peter Zijlstra  wrote:
>> >
>> >> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote:
>> >>
>> >>
>> >>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra  wrote:
>> >
>> 
>>  Using a kernel thread solves the problem for real.  Anything that
>>  blindly accesses user memory in kernel thread context is terminally
>>  broken no matter what.
>> >>>
>> >>> So perf-callchain doesn't do it 'blindly', it wants either:
>> >>>
>> >>> - user_mode(regs) true, or
>> >>> - task_pt_regs() set.
>> >>>
>> >>> However I'm thinking that if the kernel thread has ->mm == _mm, the
>> >>> EFI code running could very well have user_mode(regs) being true.
>> >>>
>> >>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
>> >>> accessible. It bails on error though. So while its careful, it does
>> >>> attempt to access the 'user' mapping directly. Which should also trigger
>> >>> with the EFI code.
>> >>>
>> >>> And I'm not seeing anything particularly broken with either. The PEBS
>> >>> fixup relies on the CPU having just executed the code, and if it could
>> >>> fetch and execute the code, why shouldn't it be able to fetch and read?
>> >>
>> >> There are two ways this could be a problem.  One is that unprivileged
>> >> user apps shouldn't be able to read from EFI memory.
>> >
>> > Ah, but only root can create per-cpu events or attach events to kernel
>> > threads (with sensible paranoia levels).
>>
>> But this may not need to be percpu.  If a non root user can trigger, say, an 
>> EFI variable read in their own thread context, boom.
>>
> + Tony
>
> Hi Andi,
>
> I am trying to reproduce the issue that we are discussing and hence
> tried an experiment like this:
> A user process continuously reads efi variable by
> "cat /sys/firmware/efi/efivars/Boot-8be4df61-93ca-11d2-aa0d-00e098032b8c" 
> for specified time (Eg: 100 seconds) and simultaneously I ran "perf top" as 
> root (which I suppose should trigger NMI's). I see that everything is fine, 
> no lockups, no kernel crash, no warnings/errors in dmesg.
>
> I see that perf top reports 50% of time is spent in efi function
> (probably efi_get_variable()).
> OverheadShared Object   Symbol
> 50% [unknown]   [k] 0xfffeea967416
>
> 50% is max, on avg it's 35%.
>
> I have tested this on two kernels v4.12 and v3.19. My machine has 8
> cores and to stress test, I further offlined all cpus except cpu0.
>
> Could you please let me know a way to reproduce the issue that we are
> discussing here.
> I think the issue we are concerned here is, when kernel is in efi
> context and an NMI happens and if the NMI handler tries to access user
> space, boom! we don't have user space in efi context. Am I right in
> understanding the issue or is it something else?

The boom isn't a crash, though -- it'll be (potentially) sensitive
information that shows up in the perf record.

As long as EFI isn't using low addresses, there may not be an issue.
But EFI should (maybe) use low addresses, and this'll be more
important.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-25 Thread Andy Lutomirski
On Wed, Aug 23, 2017 at 3:52 PM, Sai Praneeth Prakhya
 wrote:
> On Mon, 2017-08-21 at 08:23 -0700, Andy Lutomirski wrote:
>>
>> > On Aug 21, 2017, at 7:08 AM, Peter Zijlstra  wrote:
>> >
>> >> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote:
>> >>
>> >>
>> >>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra  wrote:
>> >
>> 
>>  Using a kernel thread solves the problem for real.  Anything that
>>  blindly accesses user memory in kernel thread context is terminally
>>  broken no matter what.
>> >>>
>> >>> So perf-callchain doesn't do it 'blindly', it wants either:
>> >>>
>> >>> - user_mode(regs) true, or
>> >>> - task_pt_regs() set.
>> >>>
>> >>> However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
>> >>> EFI code running could very well have user_mode(regs) being true.
>> >>>
>> >>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
>> >>> accessible. It bails on error though. So while its careful, it does
>> >>> attempt to access the 'user' mapping directly. Which should also trigger
>> >>> with the EFI code.
>> >>>
>> >>> And I'm not seeing anything particularly broken with either. The PEBS
>> >>> fixup relies on the CPU having just executed the code, and if it could
>> >>> fetch and execute the code, why shouldn't it be able to fetch and read?
>> >>
>> >> There are two ways this could be a problem.  One is that unprivileged
>> >> user apps shouldn't be able to read from EFI memory.
>> >
>> > Ah, but only root can create per-cpu events or attach events to kernel
>> > threads (with sensible paranoia levels).
>>
>> But this may not need to be percpu.  If a non root user can trigger, say, an 
>> EFI variable read in their own thread context, boom.
>>
> + Tony
>
> Hi Andi,
>
> I am trying to reproduce the issue that we are discussing and hence
> tried an experiment like this:
> A user process continuously reads efi variable by
> "cat /sys/firmware/efi/efivars/Boot-8be4df61-93ca-11d2-aa0d-00e098032b8c" 
> for specified time (Eg: 100 seconds) and simultaneously I ran "perf top" as 
> root (which I suppose should trigger NMI's). I see that everything is fine, 
> no lockups, no kernel crash, no warnings/errors in dmesg.
>
> I see that perf top reports 50% of time is spent in efi function
> (probably efi_get_variable()).
> OverheadShared Object   Symbol
> 50% [unknown]   [k] 0xfffeea967416
>
> 50% is max, on avg it's 35%.
>
> I have tested this on two kernels v4.12 and v3.19. My machine has 8
> cores and to stress test, I further offlined all cpus except cpu0.
>
> Could you please let me know a way to reproduce the issue that we are
> discussing here.
> I think the issue we are concerned here is, when kernel is in efi
> context and an NMI happens and if the NMI handler tries to access user
> space, boom! we don't have user space in efi context. Am I right in
> understanding the issue or is it something else?

The boom isn't a crash, though -- it'll be (potentially) sensitive
information that shows up in the perf record.

As long as EFI isn't using low addresses, there may not be an issue.
But EFI should (maybe) use low addresses, and this'll be more
important.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-24 Thread Sai Praneeth Prakhya
On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote:
> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
>  wrote:
> > +/*
> > + * Makes the calling kernel thread switch to/from efi_mm context
> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> > + * (Note: This routine is heavily inspired from use_mm)
> > + */
> > +void efi_switch_mm(struct mm_struct *mm)
> > +{
> > +   struct task_struct *tsk = current;
> > +
> > +   task_lock(tsk);
> > +   efi_scratch.prev_mm = tsk->active_mm;
> > +   if (efi_scratch.prev_mm != mm) {
> > +   mmgrab(mm);
> > +   tsk->active_mm = mm;
> > +   }
> > +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> > +   task_unlock(tsk);
> > +
> > +   if (efi_scratch.prev_mm != mm)
> > +   mmdrop(efi_scratch.prev_mm);
> 
> I'm confused.  You're mmdropping an mm that you are still keeping a
> pointer to.  This is also a bit confusing in the case where you do
> efi_switch_mm(efi_scratch.prev_mm).
> 
> This whole manipulation seems fairly dangerous to me for another
> reason -- you're taking a user thread (I think) and swapping out its
> mm to something that the user in question should *not* have access to.
> What if a perf interrupt happens while you're in the alternate mm?
> What if you segfault and dump core?  Should we maybe just have a flag
> that says "this cpu is using a funny mm", assert that the flag is
> clear when scheduling, and teach perf, coredumps, etc not to touch
> user memory when the flag is set?
> 
> Admittedly, the latter problem may well have existed even before these 
> patches.

Hi All,

Could we please decouple the above issue from this patch set, so that we
could have common efi_mm between x86 and ARM and also improve
readability and maintainability for x86/efi.

As it seems that "Everything EFI as kthread" might solve the above issue
for real (which might take quite some time to implement, taking into
consideration the complexity involved and some special case with
pstore), do you think this patch set seems OK?

If so, I will send out a V2 addressing the mmdropping issue.

Regards,
Sai



Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-24 Thread Sai Praneeth Prakhya
On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote:
> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
>  wrote:
> > +/*
> > + * Makes the calling kernel thread switch to/from efi_mm context
> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> > + * (Note: This routine is heavily inspired from use_mm)
> > + */
> > +void efi_switch_mm(struct mm_struct *mm)
> > +{
> > +   struct task_struct *tsk = current;
> > +
> > +   task_lock(tsk);
> > +   efi_scratch.prev_mm = tsk->active_mm;
> > +   if (efi_scratch.prev_mm != mm) {
> > +   mmgrab(mm);
> > +   tsk->active_mm = mm;
> > +   }
> > +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> > +   task_unlock(tsk);
> > +
> > +   if (efi_scratch.prev_mm != mm)
> > +   mmdrop(efi_scratch.prev_mm);
> 
> I'm confused.  You're mmdropping an mm that you are still keeping a
> pointer to.  This is also a bit confusing in the case where you do
> efi_switch_mm(efi_scratch.prev_mm).
> 
> This whole manipulation seems fairly dangerous to me for another
> reason -- you're taking a user thread (I think) and swapping out its
> mm to something that the user in question should *not* have access to.
> What if a perf interrupt happens while you're in the alternate mm?
> What if you segfault and dump core?  Should we maybe just have a flag
> that says "this cpu is using a funny mm", assert that the flag is
> clear when scheduling, and teach perf, coredumps, etc not to touch
> user memory when the flag is set?
> 
> Admittedly, the latter problem may well have existed even before these 
> patches.

Hi All,

Could we please decouple the above issue from this patch set, so that we
could have common efi_mm between x86 and ARM and also improve
readability and maintainability for x86/efi.

As it seems that "Everything EFI as kthread" might solve the above issue
for real (which might take quite some time to implement, taking into
consideration the complexity involved and some special case with
pstore), do you think this patch set seems OK?

If so, I will send out a V2 addressing the mmdropping issue.

Regards,
Sai



Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-23 Thread Sai Praneeth Prakhya
On Mon, 2017-08-21 at 08:23 -0700, Andy Lutomirski wrote:
> 
> > On Aug 21, 2017, at 7:08 AM, Peter Zijlstra  wrote:
> > 
> >> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote:
> >> 
> >> 
> >>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra  wrote:
> > 
>  
>  Using a kernel thread solves the problem for real.  Anything that
>  blindly accesses user memory in kernel thread context is terminally
>  broken no matter what.
> >>> 
> >>> So perf-callchain doesn't do it 'blindly', it wants either:
> >>> 
> >>> - user_mode(regs) true, or
> >>> - task_pt_regs() set.
> >>> 
> >>> However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
> >>> EFI code running could very well have user_mode(regs) being true.
> >>> 
> >>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
> >>> accessible. It bails on error though. So while its careful, it does
> >>> attempt to access the 'user' mapping directly. Which should also trigger
> >>> with the EFI code.
> >>> 
> >>> And I'm not seeing anything particularly broken with either. The PEBS
> >>> fixup relies on the CPU having just executed the code, and if it could
> >>> fetch and execute the code, why shouldn't it be able to fetch and read?
> >> 
> >> There are two ways this could be a problem.  One is that unprivileged
> >> user apps shouldn't be able to read from EFI memory.
> > 
> > Ah, but only root can create per-cpu events or attach events to kernel
> > threads (with sensible paranoia levels).
> 
> But this may not need to be percpu.  If a non root user can trigger, say, an 
> EFI variable read in their own thread context, boom.
> 
+ Tony

Hi Andi,

I am trying to reproduce the issue that we are discussing and hence
tried an experiment like this:
A user process continuously reads efi variable by
"cat /sys/firmware/efi/efivars/Boot-8be4df61-93ca-11d2-aa0d-00e098032b8c" 
for specified time (Eg: 100 seconds) and simultaneously I ran "perf top" as 
root (which I suppose should trigger NMI's). I see that everything is fine, no 
lockups, no kernel crash, no warnings/errors in dmesg.

I see that perf top reports 50% of time is spent in efi function
(probably efi_get_variable()).
OverheadShared Object   Symbol
50% [unknown]   [k] 0xfffeea967416

50% is max, on avg it's 35%.

I have tested this on two kernels v4.12 and v3.19. My machine has 8
cores and to stress test, I further offlined all cpus except cpu0.

Could you please let me know a way to reproduce the issue that we are
discussing here.
I think the issue we are concerned here is, when kernel is in efi
context and an NMI happens and if the NMI handler tries to access user
space, boom! we don't have user space in efi context. Am I right in
understanding the issue or is it something else?

Regards,
Sai



Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-23 Thread Sai Praneeth Prakhya
On Mon, 2017-08-21 at 08:23 -0700, Andy Lutomirski wrote:
> 
> > On Aug 21, 2017, at 7:08 AM, Peter Zijlstra  wrote:
> > 
> >> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote:
> >> 
> >> 
> >>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra  wrote:
> > 
>  
>  Using a kernel thread solves the problem for real.  Anything that
>  blindly accesses user memory in kernel thread context is terminally
>  broken no matter what.
> >>> 
> >>> So perf-callchain doesn't do it 'blindly', it wants either:
> >>> 
> >>> - user_mode(regs) true, or
> >>> - task_pt_regs() set.
> >>> 
> >>> However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
> >>> EFI code running could very well have user_mode(regs) being true.
> >>> 
> >>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
> >>> accessible. It bails on error though. So while its careful, it does
> >>> attempt to access the 'user' mapping directly. Which should also trigger
> >>> with the EFI code.
> >>> 
> >>> And I'm not seeing anything particularly broken with either. The PEBS
> >>> fixup relies on the CPU having just executed the code, and if it could
> >>> fetch and execute the code, why shouldn't it be able to fetch and read?
> >> 
> >> There are two ways this could be a problem.  One is that unprivileged
> >> user apps shouldn't be able to read from EFI memory.
> > 
> > Ah, but only root can create per-cpu events or attach events to kernel
> > threads (with sensible paranoia levels).
> 
> But this may not need to be percpu.  If a non root user can trigger, say, an 
> EFI variable read in their own thread context, boom.
> 
+ Tony

Hi Andi,

I am trying to reproduce the issue that we are discussing and hence
tried an experiment like this:
A user process continuously reads efi variable by
"cat /sys/firmware/efi/efivars/Boot-8be4df61-93ca-11d2-aa0d-00e098032b8c" 
for specified time (Eg: 100 seconds) and simultaneously I ran "perf top" as 
root (which I suppose should trigger NMI's). I see that everything is fine, no 
lockups, no kernel crash, no warnings/errors in dmesg.

I see that perf top reports 50% of time is spent in efi function
(probably efi_get_variable()).
OverheadShared Object   Symbol
50% [unknown]   [k] 0xfffeea967416

50% is max, on avg it's 35%.

I have tested this on two kernels v4.12 and v3.19. My machine has 8
cores and to stress test, I further offlined all cpus except cpu0.

Could you please let me know a way to reproduce the issue that we are
discussing here.
I think the issue we are concerned here is, when kernel is in efi
context and an NMI happens and if the NMI handler tries to access user
space, boom! we don't have user space in efi context. Am I right in
understanding the issue or is it something else?

Regards,
Sai



Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Peter Zijlstra
On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote:
> There are two ways this could be a problem.  One is that unprivileged
> user apps shouldn't be able to read from EFI memory.  The other is
> that, if EFI were to have IO memory mapped at a "user" address, perf
> could end up reading it.

So assuming the efi_switch_mm() case from the calling thread context, I
don't see how we can avoid it at all.

Suppose we have a free running PEBS counter (PEBS puts samples in DS
buffer and only raises PMI when 'full'). This can easily cover the
entire efi_switch_mm() and back swizzle, and then we have 'userspace'
samples that don't correspond to actual userspace.

EFI (pretending to be userspace) is a giant trainwreck.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Peter Zijlstra
On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote:
> There are two ways this could be a problem.  One is that unprivileged
> user apps shouldn't be able to read from EFI memory.  The other is
> that, if EFI were to have IO memory mapped at a "user" address, perf
> could end up reading it.

So assuming the efi_switch_mm() case from the calling thread context, I
don't see how we can avoid it at all.

Suppose we have a free running PEBS counter (PEBS puts samples in DS
buffer and only raises PMI when 'full'). This can easily cover the
entire efi_switch_mm() and back swizzle, and then we have 'userspace'
samples that don't correspond to actual userspace.

EFI (pretending to be userspace) is a giant trainwreck.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Ard Biesheuvel
On 21 August 2017 at 16:59, Peter Zijlstra  wrote:
> On Mon, Aug 21, 2017 at 08:23:10AM -0700, Andy Lutomirski wrote:
>> > Ah, but only root can create per-cpu events or attach events to kernel
>> > threads (with sensible paranoia levels).
>>
>> But this may not need to be percpu.  If a non root user can trigger, say, an 
>> EFI variable read in their own thread context, boom.
>
> I was going by the proposed: "everything EFI in a kthread" model. But
> yes, if that's not done, then you're quite right.
>

How does this work in cases where we need to call into UEFI from
non-process context? Or at least from a context where current != EFI's
kthread. We have EFI pstore code, for instance, that records panic
data. Should we make an exception for those?

I'm happy to have a stab at implementing the EFI kthread, but I'd like
to get some of these details clarified first.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Ard Biesheuvel
On 21 August 2017 at 16:59, Peter Zijlstra  wrote:
> On Mon, Aug 21, 2017 at 08:23:10AM -0700, Andy Lutomirski wrote:
>> > Ah, but only root can create per-cpu events or attach events to kernel
>> > threads (with sensible paranoia levels).
>>
>> But this may not need to be percpu.  If a non root user can trigger, say, an 
>> EFI variable read in their own thread context, boom.
>
> I was going by the proposed: "everything EFI in a kthread" model. But
> yes, if that's not done, then you're quite right.
>

How does this work in cases where we need to call into UEFI from
non-process context? Or at least from a context where current != EFI's
kthread. We have EFI pstore code, for instance, that records panic
data. Should we make an exception for those?

I'm happy to have a stab at implementing the EFI kthread, but I'd like
to get some of these details clarified first.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Peter Zijlstra
On Mon, Aug 21, 2017 at 08:23:10AM -0700, Andy Lutomirski wrote:
> > Ah, but only root can create per-cpu events or attach events to kernel
> > threads (with sensible paranoia levels).
> 
> But this may not need to be percpu.  If a non root user can trigger, say, an 
> EFI variable read in their own thread context, boom.

I was going by the proposed: "everything EFI in a kthread" model. But
yes, if that's not done, then you're quite right.

> > 
> >> The other is that, if EFI were to have IO memory mapped at a "user"
> >> address, perf could end up reading it.
> > 
> > Ah, but in neither mode does perf assume much, the LBR follows branches
> > the CPU took and thus we _know_ there was code there, not MMIO. And the
> > stack unwind simply follows the stack up, although I suppose it could be
> > 'tricked' into probing MMIO. We can certainly add an "->mm !=
> > ->active_mm" escape clause to the unwind code.
> > 
> > Although I don't see how we're currently avoiding the same problem with
> > existing userspace unwinds, userspace can equally have MMIO mapped.
> 
> But user space at least only has IO mapped to which the user program in 
> question has rights.

Still, we should not mess it up just because we're trying to unwind
stacks.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Peter Zijlstra
On Mon, Aug 21, 2017 at 08:23:10AM -0700, Andy Lutomirski wrote:
> > Ah, but only root can create per-cpu events or attach events to kernel
> > threads (with sensible paranoia levels).
> 
> But this may not need to be percpu.  If a non root user can trigger, say, an 
> EFI variable read in their own thread context, boom.

I was going by the proposed: "everything EFI in a kthread" model. But
yes, if that's not done, then you're quite right.

> > 
> >> The other is that, if EFI were to have IO memory mapped at a "user"
> >> address, perf could end up reading it.
> > 
> > Ah, but in neither mode does perf assume much, the LBR follows branches
> > the CPU took and thus we _know_ there was code there, not MMIO. And the
> > stack unwind simply follows the stack up, although I suppose it could be
> > 'tricked' into probing MMIO. We can certainly add an "->mm !=
> > ->active_mm" escape clause to the unwind code.
> > 
> > Although I don't see how we're currently avoiding the same problem with
> > existing userspace unwinds, userspace can equally have MMIO mapped.
> 
> But user space at least only has IO mapped to which the user program in 
> question has rights.

Still, we should not mess it up just because we're trying to unwind
stacks.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Andy Lutomirski


> On Aug 21, 2017, at 7:08 AM, Peter Zijlstra  wrote:
> 
>> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote:
>> 
>> 
>>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra  wrote:
> 
 
 Using a kernel thread solves the problem for real.  Anything that
 blindly accesses user memory in kernel thread context is terminally
 broken no matter what.
>>> 
>>> So perf-callchain doesn't do it 'blindly', it wants either:
>>> 
>>> - user_mode(regs) true, or
>>> - task_pt_regs() set.
>>> 
>>> However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
>>> EFI code running could very well have user_mode(regs) being true.
>>> 
>>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
>>> accessible. It bails on error though. So while it's careful, it does
>>> attempt to access the 'user' mapping directly. Which should also trigger
>>> with the EFI code.
>>> 
>>> And I'm not seeing anything particularly broken with either. The PEBS
>>> fixup relies on the CPU having just executed the code, and if it could
>>> fetch and execute the code, why shouldn't it be able to fetch and read?
>> 
>> There are two ways this could be a problem.  One is that unprivileged
>> user apps shouldn't be able to read from EFI memory.
> 
> Ah, but only root can create per-cpu events or attach events to kernel
> threads (with sensible paranoia levels).

But this may not need to be percpu.  If a non root user can trigger, say, an 
EFI variable read in their own thread context, boom.

> 
>> The other is that, if EFI were to have IO memory mapped at a "user"
>> address, perf could end up reading it.
> 
> Ah, but in neither mode does perf assume much, the LBR follows branches
> the CPU took and thus we _know_ there was code there, not MMIO. And the
> stack unwind simply follows the stack up, although I suppose it could be
> 'tricked' into probing MMIO. We can certainly add an "->mm !=
> ->active_mm" escape clause to the unwind code.
> 
> Although I don't see how we're currently avoiding the same problem with
> existing userspace unwinds, userspace can equally have MMIO mapped.

But user space at least only has IO mapped to which the user program in 
question has rights.

> 
> But neither will use pre-existing user addresses in the efi_mm I think.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Andy Lutomirski


> On Aug 21, 2017, at 7:08 AM, Peter Zijlstra  wrote:
> 
>> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote:
>> 
>> 
>>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra  wrote:
> 
 
 Using a kernel thread solves the problem for real.  Anything that
 blindly accesses user memory in kernel thread context is terminally
 broken no matter what.
>>> 
>>> So perf-callchain doesn't do it 'blindly', it wants either:
>>> 
>>> - user_mode(regs) true, or
>>> - task_pt_regs() set.
>>> 
>>> However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
>>> EFI code running could very well have user_mode(regs) being true.
>>> 
>>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
>>> accessible. It bails on error though. So while it's careful, it does
>>> attempt to access the 'user' mapping directly. Which should also trigger
>>> with the EFI code.
>>> 
>>> And I'm not seeing anything particularly broken with either. The PEBS
>>> fixup relies on the CPU having just executed the code, and if it could
>>> fetch and execute the code, why shouldn't it be able to fetch and read?
>> 
>> There are two ways this could be a problem.  One is that unprivileged
>> user apps shouldn't be able to read from EFI memory.
> 
> Ah, but only root can create per-cpu events or attach events to kernel
> threads (with sensible paranoia levels).

But this may not need to be percpu.  If a non root user can trigger, say, an 
EFI variable read in their own thread context, boom.

> 
>> The other is that, if EFI were to have IO memory mapped at a "user"
>> address, perf could end up reading it.
> 
> Ah, but in neither mode does perf assume much, the LBR follows branches
> the CPU took and thus we _know_ there was code there, not MMIO. And the
> stack unwind simply follows the stack up, although I suppose it could be
> 'tricked' into probing MMIO. We can certainly add an "->mm !=
> ->active_mm" escape clause to the unwind code.
> 
> Although I don't see how we're currently avoiding the same problem with
> existing userspace unwinds, userspace can equally have MMIO mapped.

But user space at least only has IO mapped to which the user program in 
question has rights.

> 
> But neither will use pre-existing user addresses in the efi_mm I think.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Peter Zijlstra
On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote:
> 
> 
> > On Aug 21, 2017, at 3:33 AM, Peter Zijlstra  wrote:

> >> 
> >> Using a kernel thread solves the problem for real.  Anything that
> >> blindly accesses user memory in kernel thread context is terminally
> >> broken no matter what.
> > 
> > So perf-callchain doesn't do it 'blindly', it wants either:
> > 
> > - user_mode(regs) true, or
> > - task_pt_regs() set.
> > 
> > However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
> > EFI code running could very well have user_mode(regs) being true.
> > 
> > intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
> > accessible. It bails on error though. So while it's careful, it does
> > attempt to access the 'user' mapping directly. Which should also trigger
> > with the EFI code.
> > 
> > And I'm not seeing anything particularly broken with either. The PEBS
> > fixup relies on the CPU having just executed the code, and if it could
> > fetch and execute the code, why shouldn't it be able to fetch and read?
> 
> There are two ways this could be a problem.  One is that unprivileged
> user apps shouldn't be able to read from EFI memory.

Ah, but only root can create per-cpu events or attach events to kernel
threads (with sensible paranoia levels).

> The other is that, if EFI were to have IO memory mapped at a "user"
> address, perf could end up reading it.

Ah, but in neither mode does perf assume much, the LBR follows branches
the CPU took and thus we _know_ there was code there, not MMIO. And the
stack unwind simply follows the stack up, although I suppose it could be
'tricked' into probing MMIO. We can certainly add an "->mm !=
->active_mm" escape clause to the unwind code.

Although I don't see how we're currently avoiding the same problem with
existing userspace unwinds, userspace can equally have MMIO mapped.

But neither will use pre-existing user addresses in the efi_mm I think.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Peter Zijlstra
On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote:
> 
> 
> > On Aug 21, 2017, at 3:33 AM, Peter Zijlstra  wrote:

> >> 
> >> Using a kernel thread solves the problem for real.  Anything that
> >> blindly accesses user memory in kernel thread context is terminally
> >> broken no matter what.
> > 
> > So perf-callchain doesn't do it 'blindly', it wants either:
> > 
> > - user_mode(regs) true, or
> > - task_pt_regs() set.
> > 
> > However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
> > EFI code running could very well have user_mode(regs) being true.
> > 
> > intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
> > accessible. It bails on error though. So while it's careful, it does
> > attempt to access the 'user' mapping directly. Which should also trigger
> > with the EFI code.
> > 
> > And I'm not seeing anything particularly broken with either. The PEBS
> > fixup relies on the CPU having just executed the code, and if it could
> > fetch and execute the code, why shouldn't it be able to fetch and read?
> 
> There are two ways this could be a problem.  One is that unprivileged
> user apps shouldn't be able to read from EFI memory.

Ah, but only root can create per-cpu events or attach events to kernel
threads (with sensible paranoia levels).

> The other is that, if EFI were to have IO memory mapped at a "user"
> address, perf could end up reading it.

Ah, but in neither mode does perf assume much, the LBR follows branches
the CPU took and thus we _know_ there was code there, not MMIO. And the
stack unwind simply follows the stack up, although I suppose it could be
'tricked' into probing MMIO. We can certainly add an "->mm !=
->active_mm" escape clause to the unwind code.

Although I don't see how we're currently avoiding the same problem with
existing userspace unwinds, userspace can equally have MMIO mapped.

But neither will use pre-existing user addresses in the efi_mm I think.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Andy Lutomirski


> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra  wrote:
> 
>> On Thu, Aug 17, 2017 at 08:52:38AM -0700, Andy Lutomirski wrote:
>> On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon  wrote:
> 
>>> I'm still concerned that we're treating perf specially here -- are we
>>> absolutely sure that nobody else is going to attempt user accesses off the
>>> back of an interrupt?
>> 
>> Reasonably sure?  If nothing else, an interrupt taken while mmap_sem()
>> is held for write that tries to access user memory is asking for
>> serious trouble.  There are still a few callers of pagefault_disable()
>> and copy...inatomic(), though.
> 
> I'm not immediately seeing how holding mmap_sem for writing is a
> problem.
> 
>>> If not, then I'd much prefer a solution that catches
>>> anybody doing that with the EFI page table installed, rather than trying
>>> to play whack-a-mole like this.
>> 
>> Using a kernel thread solves the problem for real.  Anything that
>> blindly accesses user memory in kernel thread context is terminally
>> broken no matter what.
> 
> So perf-callchain doesn't do it 'blindly', it wants either:
> 
> - user_mode(regs) true, or
> - task_pt_regs() set.
> 
> However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
> EFI code running could very well have user_mode(regs) being true.
> 
> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
> accessible. It bails on error though. So while it's careful, it does
> attempt to access the 'user' mapping directly. Which should also trigger
> with the EFI code.
> 
> And I'm not seeing anything particularly broken with either. The PEBS
> fixup relies on the CPU having just executed the code, and if it could
> fetch and execute the code, why shouldn't it be able to fetch and read?

There are two ways this could be a problem.  One is that unprivileged user apps 
shouldn't be able to read from EFI memory.  The other is that, if EFI were to 
have IO memory mapped at a "user" address, perf could end up reading it.

> (eXecute implies Read assumed). And like said, if it triggers a fault,
> it bails, no worries.
> 
> It really doesn't care if the task is a kernel thread or not. Same for
> the unwinder, if we get an interrupt register set that points into
> 'userspace' we try and unwind it.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Andy Lutomirski


> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra  wrote:
> 
>> On Thu, Aug 17, 2017 at 08:52:38AM -0700, Andy Lutomirski wrote:
>> On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon  wrote:
> 
>>> I'm still concerned that we're treating perf specially here -- are we
>>> absolutely sure that nobody else is going to attempt user accesses off the
>>> back of an interrupt?
>> 
>> Reasonably sure?  If nothing else, an interrupt taken while mmap_sem()
>> is held for write that tries to access user memory is asking for
>> serious trouble.  There are still a few callers of pagefault_disable()
>> and copy...inatomic(), though.
> 
> I'm not immediately seeing how holding mmap_sem for writing is a
> problem.
> 
>>> If not, then I'd much prefer a solution that catches
>>> anybody doing that with the EFI page table installed, rather than trying
>>> to play whack-a-mole like this.
>> 
>> Using a kernel thread solves the problem for real.  Anything that
>> blindly accesses user memory in kernel thread context is terminally
>> broken no matter what.
> 
> So perf-callchain doesn't do it 'blindly', it wants either:
> 
> - user_mode(regs) true, or
> - task_pt_regs() set.
> 
> However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
> EFI code running could very well have user_mode(regs) being true.
> 
> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
> accessible. It bails on error though. So while it's careful, it does
> attempt to access the 'user' mapping directly. Which should also trigger
> with the EFI code.
> 
> And I'm not seeing anything particularly broken with either. The PEBS
> fixup relies on the CPU having just executed the code, and if it could
> fetch and execute the code, why shouldn't it be able to fetch and read?

There are two ways this could be a problem.  One is that unprivileged user apps 
shouldn't be able to read from EFI memory.  The other is that, if EFI were to 
have IO memory mapped at a "user" address, perf could end up reading it.

> (eXecute implies Read assumed). And like said, if it triggers a fault,
> it bails, no worries.
> 
> It really doesn't care if the task is a kernel thread or not. Same for
> the unwinder, if we get an interrupt register set that points into
> 'userspace' we try and unwind it.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Peter Zijlstra
On Thu, Aug 17, 2017 at 08:52:38AM -0700, Andy Lutomirski wrote:
> On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon  wrote:

> > I'm still concerned that we're treating perf specially here -- are we
> > absolutely sure that nobody else is going to attempt user accesses off the
> > back of an interrupt?
> 
> Reasonably sure?  If nothing else, an interrupt taken while mmap_sem()
> is held for write that tries to access user memory is asking for
> serious trouble.  There are still a few callers of pagefault_disable()
> and copy...inatomic(), though.

I'm not immediately seeing how holding mmap_sem for writing is a
problem.

> > If not, then I'd much prefer a solution that catches
> > anybody doing that with the EFI page table installed, rather than trying
> > to play whack-a-mole like this.
> 
> Using a kernel thread solves the problem for real.  Anything that
> blindly accesses user memory in kernel thread context is terminally
> broken no matter what.

So perf-callchain doesn't do it 'blindly', it wants either:

 - user_mode(regs) true, or
 - task_pt_regs() set.

However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
EFI code running could very well have user_mode(regs) being true.

intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
accessible. It bails on error though. So while it's careful, it does
attempt to access the 'user' mapping directly. Which should also trigger
with the EFI code.

And I'm not seeing anything particularly broken with either. The PEBS
fixup relies on the CPU having just executed the code, and if it could
fetch and execute the code, why shouldn't it be able to fetch and read?
(eXecute implies Read assumed). And like said, if it triggers a fault,
it bails, no worries.

It really doesn't care if the task is a kernel thread or not. Same for
the unwinder, if we get an interrupt register set that points into
'userspace' we try and unwind it.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-21 Thread Peter Zijlstra
On Thu, Aug 17, 2017 at 08:52:38AM -0700, Andy Lutomirski wrote:
> On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon  wrote:

> > I'm still concerned that we're treating perf specially here -- are we
> > absolutely sure that nobody else is going to attempt user accesses off the
> > back of an interrupt?
> 
> Reasonably sure?  If nothing else, an interrupt taken while mmap_sem()
> is held for write that tries to access user memory is asking for
> serious trouble.  There are still a few callers of pagefault_disable()
> and copy...inatomic(), though.

I'm not immediately seeing how holding mmap_sem for writing is a
problem.

> > If not, then I'd much prefer a solution that catches
> > anybody doing that with the EFI page table installed, rather than trying
> > to play whack-a-mole like this.
> 
> Using a kernel thread solves the problem for real.  Anything that
> blindly accesses user memory in kernel thread context is terminally
> broken no matter what.

So perf-callchain doesn't do it 'blindly', it wants either:

 - user_mode(regs) true, or
 - task_pt_regs() set.

However I'm thinking that if the kernel thread has ->mm == &efi_mm, the
EFI code running could very well have user_mode(regs) being true.

intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are
accessible. It bails on error though. So while it's careful, it does
attempt to access the 'user' mapping directly. Which should also trigger
with the EFI code.

And I'm not seeing anything particularly broken with either. The PEBS
fixup relies on the CPU having just executed the code, and if it could
fetch and execute the code, why shouldn't it be able to fetch and read?
(eXecute implies Read assumed). And like said, if it triggers a fault,
it bails, no worries.

It really doesn't care if the task is a kernel thread or not. Same for
the unwinder, if we get an interrupt register set that points into
'userspace' we try and unwind it.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-17 Thread Andy Lutomirski
On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon  wrote:
> On Tue, Aug 15, 2017 at 11:35:41PM +0100, Mark Rutland wrote:
>> On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote:
>> > On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming  
>> > wrote:
>> > > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote:
>> > >>
>> > >> I'd expect we'd abort at a higher level, not taking any sample. i.e.
>> > >> we'd have the core overflow handler check in_funny_mm(), and if so, skip
>> > >> the sample, as with the skid case.
>> > >
>> > > FYI, this is my preferred solution for x86 too.
>> >
>> > One option for the "funny mm" flag would be literally the condition
>> > current->mm != current->active_mm.  I *think* this gets all the cases
>> > right as long as efi_switch_mm is careful with its ordering and that
>> > the arch switch_mm() code can handle the resulting ordering.  (x86's
>> > can now, I think, or at least will be able to in 4.14 -- not sure
>> > about other arches).
>>
>> For arm64 we'd have to rework things a bit to get the ordering right
>> (especially when we flip to/from the idmap), but otherwise this sounds sane 
>> to
>> me.
>>
>> > That being said, there's a totally different solution: run EFI
>> > callbacks in a kernel thread.  This has other benefits: we could run
>> > those callbacks in user mode some day, and doing *that* in a user
>> > thread seems like a mistake.
>>
>> I think that wouldn't work for CPU-bound perf events (which are not
>> ctx-switched with the task).
>>
>> It might be desirable to do that anyway, though.
>
> I'm still concerned that we're treating perf specially here -- are we
> absolutely sure that nobody else is going to attempt user accesses off the
> back of an interrupt?

Reasonably sure?  If nothing else, an interrupt taken while mmap_sem()
is held for write that tries to access user memory is asking for
serious trouble.  There are still a few callers of pagefault_disable()
and copy...inatomic(), though.

> If not, then I'd much prefer a solution that catches
> anybody doing that with the EFI page table installed, rather than trying
> to play whack-a-mole like this.

Using a kernel thread solves the problem for real.  Anything that
blindly accesses user memory in kernel thread context is terminally
broken no matter what.

>
> Will


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-17 Thread Andy Lutomirski
On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon  wrote:
> On Tue, Aug 15, 2017 at 11:35:41PM +0100, Mark Rutland wrote:
>> On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote:
>> > On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming  
>> > wrote:
>> > > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote:
>> > >>
>> > >> I'd expect we'd abort at a higher level, not taking any sample. i.e.
>> > >> we'd have the core overflow handler check in_funny_mm(), and if so, skip
>> > >> the sample, as with the skid case.
>> > >
>> > > FYI, this is my preferred solution for x86 too.
>> >
>> > One option for the "funny mm" flag would be literally the condition
>> > current->mm != current->active_mm.  I *think* this gets all the cases
>> > right as long as efi_switch_mm is careful with its ordering and that
>> > the arch switch_mm() code can handle the resulting ordering.  (x86's
>> > can now, I think, or at least will be able to in 4.14 -- not sure
>> > about other arches).
>>
>> For arm64 we'd have to rework things a bit to get the ordering right
>> (especially when we flip to/from the idmap), but otherwise this sounds sane 
>> to
>> me.
>>
>> > That being said, there's a totally different solution: run EFI
>> > callbacks in a kernel thread.  This has other benefits: we could run
>> > those callbacks in user mode some day, and doing *that* in a user
>> > thread seems like a mistake.
>>
>> I think that wouldn't work for CPU-bound perf events (which are not
>> ctx-switched with the task).
>>
>> It might be desirable to do that anyway, though.
>
> I'm still concerned that we're treating perf specially here -- are we
> absolutely sure that nobody else is going to attempt user accesses off the
> back of an interrupt?

Reasonably sure?  If nothing else, an interrupt taken while mmap_sem()
is held for write that tries to access user memory is asking for
serious trouble.  There are still a few callers of pagefault_disable()
and copy...inatomic(), though.

> If not, then I'd much prefer a solution that catches
> anybody doing that with the EFI page table installed, rather than trying
> to play whack-a-mole like this.

Using a kernel thread solves the problem for real.  Anything that
blindly accesses user memory in kernel thread context is terminally
broken no matter what.

>
> Will


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-17 Thread Will Deacon
On Tue, Aug 15, 2017 at 11:35:41PM +0100, Mark Rutland wrote:
> On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote:
> > On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming  
> > wrote:
> > > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote:
> > >>
> > >> I'd expect we'd abort at a higher level, not taking any sample. i.e.
> > >> we'd have the core overflow handler check in_funny_mm(), and if so, skip
> > >> the sample, as with the skid case.
> > >
> > > FYI, this is my preferred solution for x86 too.
> > 
> > One option for the "funny mm" flag would be literally the condition
> > current->mm != current->active_mm.  I *think* this gets all the cases
> > right as long as efi_switch_mm is careful with its ordering and that
> > the arch switch_mm() code can handle the resulting ordering.  (x86's
> > can now, I think, or at least will be able to in 4.14 -- not sure
> > about other arches).
> 
> For arm64 we'd have to rework things a bit to get the ordering right
> (especially when we flip to/from the idmap), but otherwise this sounds sane to
> me.
> 
> > That being said, there's a totally different solution: run EFI
> > callbacks in a kernel thread.  This has other benefits: we could run
> > those callbacks in user mode some day, and doing *that* in a user
> > thread seems like a mistake.
> 
> I think that wouldn't work for CPU-bound perf events (which are not
> ctx-switched with the task).
> 
> It might be desirable to do that anyway, though.

I'm still concerned that we're treating perf specially here -- are we
absolutely sure that nobody else is going to attempt user accesses off the
back of an interrupt? If not, then I'd much prefer a solution that catches
anybody doing that with the EFI page table installed, rather than trying
to play whack-a-mole like this.

Will


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-17 Thread Will Deacon
On Tue, Aug 15, 2017 at 11:35:41PM +0100, Mark Rutland wrote:
> On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote:
> > On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming  
> > wrote:
> > > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote:
> > >>
> > >> I'd expect we'd abort at a higher level, not taking any sample. i.e.
> > >> we'd have the core overflow handler check in_funny_mm(), and if so, skip
> > >> the sample, as with the skid case.
> > >
> > > FYI, this is my preferred solution for x86 too.
> > 
> > One option for the "funny mm" flag would be literally the condition
> > current->mm != current->active_mm.  I *think* this gets all the cases
> > right as long as efi_switch_mm is careful with its ordering and that
> > the arch switch_mm() code can handle the resulting ordering.  (x86's
> > can now, I think, or at least will be able to in 4.14 -- not sure
> > about other arches).
> 
> For arm64 we'd have to rework things a bit to get the ordering right
> (especially when we flip to/from the idmap), but otherwise this sounds sane to
> me.
> 
> > That being said, there's a totally different solution: run EFI
> > callbacks in a kernel thread.  This has other benefits: we could run
> > those callbacks in user mode some day, and doing *that* in a user
> > thread seems like a mistake.
> 
> I think that wouldn't work for CPU-bound perf events (which are not
> ctx-switched with the task).
> 
> It might be desirable to do that anyway, though.

I'm still concerned that we're treating perf specially here -- are we
absolutely sure that nobody else is going to attempt user accesses off the
back of an interrupt? If not, then I'd much prefer a solution that catches
anybody doing that with the EFI page table installed, rather than trying
to play whack-a-mole like this.

Will


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Mark Rutland
On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote:
> On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming  
> wrote:
> > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote:
> >>
> >> I'd expect we'd abort at a higher level, not taking any sample. i.e.
> >> we'd have the core overflow handler check in_funny_mm(), and if so, skip
> >> the sample, as with the skid case.
> >
> > FYI, this is my preferred solution for x86 too.
> 
> One option for the "funny mm" flag would be literally the condition
> current->mm != current->active_mm.  I *think* this gets all the cases
> right as long as efi_switch_mm is careful with its ordering and that
> the arch switch_mm() code can handle the resulting ordering.  (x86's
> can now, I think, or at least will be able to in 4.14 -- not sure
> about other arches).

For arm64 we'd have to rework things a bit to get the ordering right
(especially when we flip to/from the idmap), but otherwise this sounds sane to
me.

> That being said, there's a totally different solution: run EFI
> callbacks in a kernel thread.  This has other benefits: we could run
> those callbacks in user mode some day, and doing *that* in a user
> thread seems like a mistake.

I think that wouldn't work for CPU-bound perf events (which are not
ctx-switched with the task).

It might be desirable to do that anyway, though.

Thanks,
Mark.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Mark Rutland
On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote:
> On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming  
> wrote:
> > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote:
> >>
> >> I'd expect we'd abort at a higher level, not taking any sample. i.e.
> >> we'd have the core overflow handler check in_funny_mm(), and if so, skip
> >> the sample, as with the skid case.
> >
> > FYI, this is my preferred solution for x86 too.
> 
> One option for the "funny mm" flag would be literally the condition
> current->mm != current->active_mm.  I *think* this gets all the cases
> right as long as efi_switch_mm is careful with its ordering and that
> the arch switch_mm() code can handle the resulting ordering.  (x86's
> can now, I think, or at least will be able to in 4.14 -- not sure
> about other arches).

For arm64 we'd have to rework things a bit to get the ordering right
(especially when we flip to/from the idmap), but otherwise this sounds sane to
me.

> That being said, there's a totally different solution: run EFI
> callbacks in a kernel thread.  This has other benefits: we could run
> those callbacks in user mode some day, and doing *that* in a user
> thread seems like a mistake.

I think that wouldn't work for CPU-bound perf events (which are not
ctx-switched with the task).

It might be desirable to do that anyway, though.

Thanks,
Mark.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Andy Lutomirski
On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming  wrote:
> On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote:
>>
>> I'd expect we'd abort at a higher level, not taking any sample. i.e.
>> we'd have the core overflow handler check in_funny_mm(), and if so, skip
>> the sample, as with the skid case.
>
> FYI, this is my preferred solution for x86 too.

One option for the "funny mm" flag would be literally the condition
current->mm != current->active_mm.  I *think* this gets all the cases
right as long as efi_switch_mm is careful with its ordering and that
the arch switch_mm() code can handle the resulting ordering.  (x86's
can now, I think, or at least will be able to in 4.14 -- not sure
about other arches).

That being said, there's a totally different solution: run EFI
callbacks in a kernel thread.  This has other benefits: we could run
those callbacks in user mode some day, and doing *that* in a user
thread seems like a mistake.

--Andy


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Andy Lutomirski
On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming  wrote:
> On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote:
>>
>> I'd expect we'd abort at a higher level, not taking any sample. i.e.
>> we'd have the core overflow handler check in_funny_mm(), and if so, skip
>> the sample, as with the skid case.
>
> FYI, this is my preferred solution for x86 too.

One option for the "funny mm" flag would be literally the condition
current->mm != current->active_mm.  I *think* this gets all the cases
right as long as efi_switch_mm is careful with its ordering and that
the arch switch_mm() code can handle the resulting ordering.  (x86's
can now, I think, or at least will be able to in 4.14 -- not sure
about other arches).

That being said, there's a totally different solution: run EFI
callbacks in a kernel thread.  This has other benefits: we could run
those callbacks in user mode some day, and doing *that* in a user
thread seems like a mistake.

--Andy


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Matt Fleming
On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote:
> 
> I'd expect we'd abort at a higher level, not taking any sample. i.e.
> we'd have the core overflow handler check in_funny_mm(), and if so, skip
> the sample, as with the skid case.

FYI, this is my preferred solution for x86 too.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Matt Fleming
On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote:
> 
> I'd expect we'd abort at a higher level, not taking any sample. i.e.
> we'd have the core overflow handler check in_funny_mm(), and if so, skip
> the sample, as with the skid case.

FYI, this is my preferred solution for x86 too.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Mark Rutland
On Wed, Aug 16, 2017 at 11:07:10AM +0100, Will Deacon wrote:
> On Wed, Aug 16, 2017 at 10:53:38AM +0100, Mark Rutland wrote:
> > On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote:
> > > (+ Mark, Will)
> > > 
> > > On 15 August 2017 at 22:46, Andy Lutomirski  wrote:
> > > > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
> > > >  wrote:
> > > >> +/*
> > > >> + * Makes the calling kernel thread switch to/from efi_mm context
> > > >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> > > >> + * (Note: This routine is heavily inspired from use_mm)
> > > >> + */
> > > >> +void efi_switch_mm(struct mm_struct *mm)
> > > >> +{
> > > >> +   struct task_struct *tsk = current;
> > > >> +
> > > >> +   task_lock(tsk);
> > > >> +   efi_scratch.prev_mm = tsk->active_mm;
> > > >> +   if (efi_scratch.prev_mm != mm) {
> > > >> +   mmgrab(mm);
> > > >> +   tsk->active_mm = mm;
> > > >> +   }
> > > >> +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> > > >> +   task_unlock(tsk);
> > > >> +
> > > >> +   if (efi_scratch.prev_mm != mm)
> > > >> +   mmdrop(efi_scratch.prev_mm);
> > > >
> > > > I'm confused.  You're mmdropping an mm that you are still keeping a
> > > > pointer to.  This is also a bit confusing in the case where you do
> > > > efi_switch_mm(efi_scratch.prev_mm).
> > > >
> > > > This whole manipulation seems fairly dangerous to me for another
> > > > reason -- you're taking a user thread (I think) and swapping out its
> > > > mm to something that the user in question should *not* have access to.
> > > > What if a perf interrupt happens while you're in the alternate mm?
> > > > What if you segfault and dump core?  Should we maybe just have a flag
> > > > that says "this cpu is using a funny mm", assert that the flag is
> > > > clear when scheduling, and teach perf, coredumps, etc not to touch
> > > > user memory when the flag is set?
> > > 
> > > It appears we may have introduced this exact issue on arm64 and ARM by
> > > starting to run the UEFI runtime services with interrupts enabled.
> > > (perf does not use NMI on ARM, so the issue did not exist beforehand)
> > > 
> > > Mark, Will, any thoughts?
> > 
> > Yup, I can cause perf to take samples from the EFI FW code, so that's
> > less than ideal.
> 
> But that should only happen if you're profiling EL1, right, which needs
> root privileges? (assuming the skid issue is solved -- not sure what
> happened to those patches after they broke criu).

I *think* that only needs perf_event_paranoid < 1, rather than root.

It's certainly not accessible by default to most users (e.g. my Ubuntu
fs sets this to 2, and IIRC Debian go to a much more stringent
non-upstream paranoid level).
 
> > The "funny mm" flag sounds like a good idea to me, though given recent
> > pain with sampling in the case of skid, I don't know exactly what we
> > should do if/when we take an overflow interrupt while in EFI.
> 
> I don't think special-casing perf interrupts is the right thing to do here.
> If we're concerned about user-accesses being made off the back of interrupts
> taken whilst in EFI, then we should probably either swizzle back in the
> user page table on the IRQ path or postpone handling it until we're done
> with the firmware.

Doing that for every IRQ feels odd, especially as the result would be
sampling something that wasn't executed, potentially repeatedly, giving
bogus info.

> Having a flag feels a bit weird: would the uaccess routines return
> -EFAULT if it's set?

I'd expect we'd abort at a higher level, not taking any sample. i.e.
we'd have the core overflow handler check in_funny_mm(), and if so, skip
the sample, as with the skid case.

Thanks,
Mark.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Mark Rutland
On Wed, Aug 16, 2017 at 11:07:10AM +0100, Will Deacon wrote:
> On Wed, Aug 16, 2017 at 10:53:38AM +0100, Mark Rutland wrote:
> > On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote:
> > > (+ Mark, Will)
> > > 
> > > On 15 August 2017 at 22:46, Andy Lutomirski  wrote:
> > > > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
> > > >  wrote:
> > > >> +/*
> > > >> + * Makes the calling kernel thread switch to/from efi_mm context
> > > >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> > > >> + * (Note: This routine is heavily inspired from use_mm)
> > > >> + */
> > > >> +void efi_switch_mm(struct mm_struct *mm)
> > > >> +{
> > > >> +   struct task_struct *tsk = current;
> > > >> +
> > > >> +   task_lock(tsk);
> > > >> +   efi_scratch.prev_mm = tsk->active_mm;
> > > >> +   if (efi_scratch.prev_mm != mm) {
> > > >> +   mmgrab(mm);
> > > >> +   tsk->active_mm = mm;
> > > >> +   }
> > > >> +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> > > >> +   task_unlock(tsk);
> > > >> +
> > > >> +   if (efi_scratch.prev_mm != mm)
> > > >> +   mmdrop(efi_scratch.prev_mm);
> > > >
> > > > I'm confused.  You're mmdropping an mm that you are still keeping a
> > > > pointer to.  This is also a bit confusing in the case where you do
> > > > efi_switch_mm(efi_scratch.prev_mm).
> > > >
> > > > This whole manipulation seems fairly dangerous to me for another
> > > > reason -- you're taking a user thread (I think) and swapping out its
> > > > mm to something that the user in question should *not* have access to.
> > > > What if a perf interrupt happens while you're in the alternate mm?
> > > > What if you segfault and dump core?  Should we maybe just have a flag
> > > > that says "this cpu is using a funny mm", assert that the flag is
> > > > clear when scheduling, and teach perf, coredumps, etc not to touch
> > > > user memory when the flag is set?
> > > 
> > > It appears we may have introduced this exact issue on arm64 and ARM by
> > > starting to run the UEFI runtime services with interrupts enabled.
> > > (perf does not use NMI on ARM, so the issue did not exist beforehand)
> > > 
> > > Mark, Will, any thoughts?
> > 
> > Yup, I can cause perf to take samples from the EFI FW code, so that's
> > less than ideal.
> 
> But that should only happen if you're profiling EL1, right, which needs
> root privileges? (assuming the skid issue is solved -- not sure what
> happened to those patches after they broke criu).

I *think* that only needs perf_event_paranoid < 1, rather than root.

It's certainly not accessible by default to most users (e.g. my Ubuntu
fs sets this to 2, and IIRC Debian go to a much more stringent
non-upstream paranoid level).
 
> > The "funny mm" flag sounds like a good idea to me, though given recent
> > pain with sampling in the case of skid, I don't know exactly what we
> > should do if/when we take an overflow interrupt while in EFI.
> 
> I don't think special-casing perf interrupts is the right thing to do here.
> If we're concerned about user-accesses being made off the back of interrupts
> taken whilst in EFI, then we should probably either swizzle back in the
> user page table on the IRQ path or postpone handling it until we're done
> with the firmware.

Doing that for every IRQ feels odd, especially as the result would be
sampling something that wasn't executed, potentially repeatedly, giving
bogus info.

> Having a flag feels a bit weird: would the uaccess routines return
> -EFAULT if it's set?

I'd expect we'd abort at a higher level, not taking any sample. i.e.
we'd have the core overflow handler check in_funny_mm(), and if so, skip
the sample, as with the skid case.

Thanks,
Mark.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Will Deacon
On Wed, Aug 16, 2017 at 10:53:38AM +0100, Mark Rutland wrote:
> On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote:
> > (+ Mark, Will)
> > 
> > On 15 August 2017 at 22:46, Andy Lutomirski  wrote:
> > > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
> > >  wrote:
> > >> +/*
> > >> + * Makes the calling kernel thread switch to/from efi_mm context
> > >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> > >> + * (Note: This routine is heavily inspired from use_mm)
> > >> + */
> > >> +void efi_switch_mm(struct mm_struct *mm)
> > >> +{
> > >> +   struct task_struct *tsk = current;
> > >> +
> > >> +   task_lock(tsk);
> > >> +   efi_scratch.prev_mm = tsk->active_mm;
> > >> +   if (efi_scratch.prev_mm != mm) {
> > >> +   mmgrab(mm);
> > >> +   tsk->active_mm = mm;
> > >> +   }
> > >> +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> > >> +   task_unlock(tsk);
> > >> +
> > >> +   if (efi_scratch.prev_mm != mm)
> > >> +   mmdrop(efi_scratch.prev_mm);
> > >
> > > I'm confused.  You're mmdropping an mm that you are still keeping a
> > > pointer to.  This is also a bit confusing in the case where you do
> > > efi_switch_mm(efi_scratch.prev_mm).
> > >
> > > This whole manipulation seems fairly dangerous to me for another
> > > reason -- you're taking a user thread (I think) and swapping out its
> > > mm to something that the user in question should *not* have access to.
> > > What if a perf interrupt happens while you're in the alternate mm?
> > > What if you segfault and dump core?  Should we maybe just have a flag
> > > that says "this cpu is using a funny mm", assert that the flag is
> > > clear when scheduling, and teach perf, coredumps, etc not to touch
> > > user memory when the flag is set?
> > 
> > It appears we may have introduced this exact issue on arm64 and ARM by
> > starting to run the UEFI runtime services with interrupts enabled.
> > (perf does not use NMI on ARM, so the issue did not exist beforehand)
> > 
> > Mark, Will, any thoughts?
> 
> Yup, I can cause perf to take samples from the EFI FW code, so that's
> less than ideal.

But that should only happen if you're profiling EL1, right, which needs
root privileges? (assuming the skid issue is solved -- not sure what
happened to those patches after they broke criu).

> The "funny mm" flag sounds like a good idea to me, though given recent
> pain with sampling in the case of skid, I don't know exactly what we
> should do if/when we take an overflow interrupt while in EFI.

I don't think special-casing perf interrupts is the right thing to do here.
If we're concerned about user-accesses being made off the back of interrupts
taken whilst in EFI, then we should probably either swizzle back in the
user page table on the IRQ path or postpone handling it until we're done
with the firmware. Having a flag feels a bit weird: would the uaccess
routines return -EFAULT if it's set?

Will


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Will Deacon
On Wed, Aug 16, 2017 at 10:53:38AM +0100, Mark Rutland wrote:
> On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote:
> > (+ Mark, Will)
> > 
> > On 15 August 2017 at 22:46, Andy Lutomirski  wrote:
> > > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
> > >  wrote:
> > >> +/*
> > >> + * Makes the calling kernel thread switch to/from efi_mm context
> > >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> > >> + * (Note: This routine is heavily inspired from use_mm)
> > >> + */
> > >> +void efi_switch_mm(struct mm_struct *mm)
> > >> +{
> > >> +   struct task_struct *tsk = current;
> > >> +
> > >> +   task_lock(tsk);
> > >> +   efi_scratch.prev_mm = tsk->active_mm;
> > >> +   if (efi_scratch.prev_mm != mm) {
> > >> +   mmgrab(mm);
> > >> +   tsk->active_mm = mm;
> > >> +   }
> > >> +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> > >> +   task_unlock(tsk);
> > >> +
> > >> +   if (efi_scratch.prev_mm != mm)
> > >> +   mmdrop(efi_scratch.prev_mm);
> > >
> > > I'm confused.  You're mmdropping an mm that you are still keeping a
> > > pointer to.  This is also a bit confusing in the case where you do
> > > efi_switch_mm(efi_scratch.prev_mm).
> > >
> > > This whole manipulation seems fairly dangerous to me for another
> > > reason -- you're taking a user thread (I think) and swapping out its
> > > mm to something that the user in question should *not* have access to.
> > > What if a perf interrupt happens while you're in the alternate mm?
> > > What if you segfault and dump core?  Should we maybe just have a flag
> > > that says "this cpu is using a funny mm", assert that the flag is
> > > clear when scheduling, and teach perf, coredumps, etc not to touch
> > > user memory when the flag is set?
> > 
> > It appears we may have introduced this exact issue on arm64 and ARM by
> > starting to run the UEFI runtime services with interrupts enabled.
> > (perf does not use NMI on ARM, so the issue did not exist beforehand)
> > 
> > Mark, Will, any thoughts?
> 
> Yup, I can cause perf to take samples from the EFI FW code, so that's
> less than ideal.

But that should only happen if you're profiling EL1, right, which needs
root privileges? (assuming the skid issue is solved -- not sure what
happened to those patches after they broke criu).

> The "funny mm" flag sounds like a good idea to me, though given recent
> pain with sampling in the case of skid, I don't know exactly what we
> should do if/when we take an overflow interrupt while in EFI.

I don't think special-casing perf interrupts is the right thing to do here.
If we're concerned about user-accesses being made off the back of interrupts
taken whilst in EFI, then we should probably either swizzle back in the
user page table on the IRQ path or postpone handling it until we're done
with the firmware. Having a flag feels a bit weird: would the uaccess
routines return -EFAULT if it's set?

Will


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Mark Rutland
On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote:
> (+ Mark, Will)
> 
> On 15 August 2017 at 22:46, Andy Lutomirski  wrote:
> > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
> >  wrote:
> >> +/*
> >> + * Makes the calling kernel thread switch to/from efi_mm context
> >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> >> + * (Note: This routine is heavily inspired from use_mm)
> >> + */
> >> +void efi_switch_mm(struct mm_struct *mm)
> >> +{
> >> +   struct task_struct *tsk = current;
> >> +
> >> +   task_lock(tsk);
> >> +   efi_scratch.prev_mm = tsk->active_mm;
> >> +   if (efi_scratch.prev_mm != mm) {
> >> +   mmgrab(mm);
> >> +   tsk->active_mm = mm;
> >> +   }
> >> +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> >> +   task_unlock(tsk);
> >> +
> >> +   if (efi_scratch.prev_mm != mm)
> >> +   mmdrop(efi_scratch.prev_mm);
> >
> > I'm confused.  You're mmdropping an mm that you are still keeping a
> > pointer to.  This is also a bit confusing in the case where you do
> > efi_switch_mm(efi_scratch.prev_mm).
> >
> > This whole manipulation seems fairly dangerous to me for another
> > reason -- you're taking a user thread (I think) and swapping out its
> > mm to something that the user in question should *not* have access to.
> > What if a perf interrupt happens while you're in the alternate mm?
> > What if you segfault and dump core?  Should we maybe just have a flag
> > that says "this cpu is using a funny mm", assert that the flag is
> > clear when scheduling, and teach perf, coredumps, etc not to touch
> > user memory when the flag is set?
> 
> It appears we may have introduced this exact issue on arm64 and ARM by
> starting to run the UEFI runtime services with interrupts enabled.
> (perf does not use NMI on ARM, so the issue did not exist beforehand)
> 
> Mark, Will, any thoughts?

Yup, I can cause perf to take samples from the EFI FW code, so that's
less than ideal.

The "funny mm" flag sounds like a good idea to me, though given recent
pain with sampling in the case of skid, I don't know exactly what we
should do if/when we take an overflow interrupt while in EFI.

Mark.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Mark Rutland
On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote:
> (+ Mark, Will)
> 
> On 15 August 2017 at 22:46, Andy Lutomirski  wrote:
> > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
> >  wrote:
> >> +/*
> >> + * Makes the calling kernel thread switch to/from efi_mm context
> >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> >> + * (Note: This routine is heavily inspired from use_mm)
> >> + */
> >> +void efi_switch_mm(struct mm_struct *mm)
> >> +{
> >> +   struct task_struct *tsk = current;
> >> +
> >> +   task_lock(tsk);
> >> +   efi_scratch.prev_mm = tsk->active_mm;
> >> +   if (efi_scratch.prev_mm != mm) {
> >> +   mmgrab(mm);
> >> +   tsk->active_mm = mm;
> >> +   }
> >> +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> >> +   task_unlock(tsk);
> >> +
> >> +   if (efi_scratch.prev_mm != mm)
> >> +   mmdrop(efi_scratch.prev_mm);
> >
> > I'm confused.  You're mmdropping an mm that you are still keeping a
> > pointer to.  This is also a bit confusing in the case where you do
> > efi_switch_mm(efi_scratch.prev_mm).
> >
> > This whole manipulation seems fairly dangerous to me for another
> > reason -- you're taking a user thread (I think) and swapping out its
> > mm to something that the user in question should *not* have access to.
> > What if a perf interrupt happens while you're in the alternate mm?
> > What if you segfault and dump core?  Should we maybe just have a flag
> > that says "this cpu is using a funny mm", assert that the flag is
> > clear when scheduling, and teach perf, coredumps, etc not to touch
> > user memory when the flag is set?
> 
> It appears we may have introduced this exact issue on arm64 and ARM by
> starting to run the UEFI runtime services with interrupts enabled.
> (perf does not use NMI on ARM, so the issue did not exist beforehand)
> 
> Mark, Will, any thoughts?

Yup, I can cause perf to take samples from the EFI FW code, so that's
less than ideal.

The "funny mm" flag sounds like a good idea to me, though given recent
pain with sampling in the case of skid, I don't know exactly what we
should do if/when we take an overflow interrupt while in EFI.

Mark.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Ard Biesheuvel
(+ Mark, Will)

On 15 August 2017 at 22:46, Andy Lutomirski  wrote:
> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
>  wrote:
>> +/*
>> + * Makes the calling kernel thread switch to/from efi_mm context
>> + * Can be used from SetVirtualAddressMap() or during efi runtime calls
>> + * (Note: This routine is heavily inspired from use_mm)
>> + */
>> +void efi_switch_mm(struct mm_struct *mm)
>> +{
>> +   struct task_struct *tsk = current;
>> +
>> +   task_lock(tsk);
>> +   efi_scratch.prev_mm = tsk->active_mm;
>> +   if (efi_scratch.prev_mm != mm) {
>> +   mmgrab(mm);
>> +   tsk->active_mm = mm;
>> +   }
>> +   switch_mm(efi_scratch.prev_mm, mm, NULL);
>> +   task_unlock(tsk);
>> +
>> +   if (efi_scratch.prev_mm != mm)
>> +   mmdrop(efi_scratch.prev_mm);
>
> I'm confused.  You're mmdropping an mm that you are still keeping a
> pointer to.  This is also a bit confusing in the case where you do
> efi_switch_mm(efi_scratch.prev_mm).
>
> This whole manipulation seems fairly dangerous to me for another
> reason -- you're taking a user thread (I think) and swapping out its
> mm to something that the user in question should *not* have access to.
> What if a perf interrupt happens while you're in the alternate mm?
> What if you segfault and dump core?  Should we maybe just have a flag
> that says "this cpu is using a funny mm", assert that the flag is
> clear when scheduling, and teach perf, coredumps, etc not to touch
> user memory when the flag is set?
>

It appears we may have introduced this exact issue on arm64 and ARM by
starting to run the UEFI runtime services with interrupts enabled.
(perf does not use NMI on ARM, so the issue did not exist beforehand)

Mark, Will, any thoughts?


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-16 Thread Ard Biesheuvel
(+ Mark, Will)

On 15 August 2017 at 22:46, Andy Lutomirski  wrote:
> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
>  wrote:
>> +/*
>> + * Makes the calling kernel thread switch to/from efi_mm context
>> + * Can be used from SetVirtualAddressMap() or during efi runtime calls
>> + * (Note: This routine is heavily inspired from use_mm)
>> + */
>> +void efi_switch_mm(struct mm_struct *mm)
>> +{
>> +   struct task_struct *tsk = current;
>> +
>> +   task_lock(tsk);
>> +   efi_scratch.prev_mm = tsk->active_mm;
>> +   if (efi_scratch.prev_mm != mm) {
>> +   mmgrab(mm);
>> +   tsk->active_mm = mm;
>> +   }
>> +   switch_mm(efi_scratch.prev_mm, mm, NULL);
>> +   task_unlock(tsk);
>> +
>> +   if (efi_scratch.prev_mm != mm)
>> +   mmdrop(efi_scratch.prev_mm);
>
> I'm confused.  You're mmdropping an mm that you are still keeping a
> pointer to.  This is also a bit confusing in the case where you do
> efi_switch_mm(efi_scratch.prev_mm).
>
> This whole manipulation seems fairly dangerous to me for another
> reason -- you're taking a user thread (I think) and swapping out its
> mm to something that the user in question should *not* have access to.
> What if a perf interrupt happens while you're in the alternate mm?
> What if you segfault and dump core?  Should we maybe just have a flag
> that says "this cpu is using a funny mm", assert that the flag is
> clear when scheduling, and teach perf, coredumps, etc not to touch
> user memory when the flag is set?
>

It appears we may have introduced this exact issue on arm64 and ARM by
starting to run the UEFI runtime services with interrupts enabled.
(perf does not use NMI on ARM, so the issue did not exist beforehand)

Mark, Will, any thoughts?


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-15 Thread Andy Lutomirski
On Tue, Aug 15, 2017 at 5:23 PM, Sai Praneeth Prakhya
 wrote:
> On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote:
>> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
>>  wrote:
>> > +/*
>> > + * Makes the calling kernel thread switch to/from efi_mm context
>> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls
>> > + * (Note: This routine is heavily inspired from use_mm)
>> > + */
>> > +void efi_switch_mm(struct mm_struct *mm)
>> > +{
>> > +   struct task_struct *tsk = current;
>> > +
>> > +   task_lock(tsk);
>> > +   efi_scratch.prev_mm = tsk->active_mm;
>> > +   if (efi_scratch.prev_mm != mm) {
>> > +   mmgrab(mm);
>> > +   tsk->active_mm = mm;
>> > +   }
>> > +   switch_mm(efi_scratch.prev_mm, mm, NULL);
>> > +   task_unlock(tsk);
>> > +
>> > +   if (efi_scratch.prev_mm != mm)
>> > +   mmdrop(efi_scratch.prev_mm);
>>
>
> Thanks for the quick review Andy,
>
>> I'm confused.  You're mmdropping an mm that you are still keeping a
>> pointer to.  This is also a bit confusing in the case where you do
>> efi_switch_mm(efi_scratch.prev_mm).
>>
>
> This makes sense, I will look into it.
>
>> This whole manipulation seems fairly dangerous to me for another
>> reason -- you're taking a user thread (I think) and swapping out its
>> mm to something that the user in question should *not* have access to.
>
> We are switching to efi_mm from user mm_struct because
> EFI_RUNTIME_SERVICES like efi_set_variable()/efi_get_variable() are
> accessible only through efi_pgd. The user thread calls ioctl() which in
> turn calls efi_call() and thus efi_switch_mm(). So, I think, the user
> still does not have direct access to EFI_RUNTIME_SERVICES memory regions
> but accesses them through sys call.
>
>> What if a perf interrupt happens while you're in the alternate mm?
>
> Since we are disabling/enabling interrupts around switching, I think we
> are safe. We do these in following functions
> phys_efi_set_virtual_address_map()
> efi_thunk_set_virtual_address_map()
> efi_call_virt_pointer()

perf uses NMI, so this doesn't help.

Perhaps the sequence could look like this:

local_irq_disable();
current->active_mm = efi_mm;
switch_to();

...

switch_to(back to old mm);
current->active_mm = old mm;

and make perf know that current->active_mm != current->mm means that
user memory is off limits.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-15 Thread Andy Lutomirski
On Tue, Aug 15, 2017 at 5:23 PM, Sai Praneeth Prakhya
 wrote:
> On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote:
>> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
>>  wrote:
>> > +/*
>> > + * Makes the calling kernel thread switch to/from efi_mm context
>> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls
>> > + * (Note: This routine is heavily inspired from use_mm)
>> > + */
>> > +void efi_switch_mm(struct mm_struct *mm)
>> > +{
>> > +   struct task_struct *tsk = current;
>> > +
>> > +   task_lock(tsk);
>> > +   efi_scratch.prev_mm = tsk->active_mm;
>> > +   if (efi_scratch.prev_mm != mm) {
>> > +   mmgrab(mm);
>> > +   tsk->active_mm = mm;
>> > +   }
>> > +   switch_mm(efi_scratch.prev_mm, mm, NULL);
>> > +   task_unlock(tsk);
>> > +
>> > +   if (efi_scratch.prev_mm != mm)
>> > +   mmdrop(efi_scratch.prev_mm);
>>
>
> Thanks for the quick review Andy,
>
>> I'm confused.  You're mmdropping an mm that you are still keeping a
>> pointer to.  This is also a bit confusing in the case where you do
>> efi_switch_mm(efi_scratch.prev_mm).
>>
>
> This makes sense, I will look into it.
>
>> This whole manipulation seems fairly dangerous to me for another
>> reason -- you're taking a user thread (I think) and swapping out its
>> mm to something that the user in question should *not* have access to.
>
> We are switching to efi_mm from user mm_struct because
> EFI_RUNTIME_SERVICES like efi_set_variable()/efi_get_variable() are
> accessible only through efi_pgd. The user thread calls ioctl() which in
> turn calls efi_call() and thus efi_switch_mm(). So, I think, the user
> still does not have direct access to EFI_RUNTIME_SERVICES memory regions
> but accesses them through sys call.
>
>> What if a perf interrupt happens while you're in the alternate mm?
>
> Since we are disabling/enabling interrupts around switching, I think we
> are safe. We do these in following functions
> phys_efi_set_virtual_address_map()
> efi_thunk_set_virtual_address_map()
> efi_call_virt_pointer()

perf uses NMI, so this doesn't help.

Perhaps the sequence could look like this:

local_irq_disable();
current->active_mm = efi_mm;
switch_to();

...

switch_to(back to old mm);
current->active_mm = old mm;

and make perf know that current->active_mm != current->mm means that
user memory is off limits.


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-15 Thread Sai Praneeth Prakhya
On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote:
> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
>  wrote:
> > +/*
> > + * Makes the calling kernel thread switch to/from efi_mm context
> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> > + * (Note: This routine is heavily inspired from use_mm)
> > + */
> > +void efi_switch_mm(struct mm_struct *mm)
> > +{
> > +   struct task_struct *tsk = current;
> > +
> > +   task_lock(tsk);
> > +   efi_scratch.prev_mm = tsk->active_mm;
> > +   if (efi_scratch.prev_mm != mm) {
> > +   mmgrab(mm);
> > +   tsk->active_mm = mm;
> > +   }
> > +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> > +   task_unlock(tsk);
> > +
> > +   if (efi_scratch.prev_mm != mm)
> > +   mmdrop(efi_scratch.prev_mm);
> 

Thanks for the quick review Andy,

> I'm confused.  You're mmdropping an mm that you are still keeping a
> pointer to.  This is also a bit confusing in the case where you do
> efi_switch_mm(efi_scratch.prev_mm).
> 

This makes sense, I will look into it.

> This whole manipulation seems fairly dangerous to me for another
> reason -- you're taking a user thread (I think) and swapping out its
> mm to something that the user in question should *not* have access to.

We are switching to efi_mm from user mm_struct because
EFI_RUNTIME_SERVICES like efi_set_variable()/efi_get_variable() are
accessible only through efi_pgd. The user thread calls ioctl() which in
turn calls efi_call() and thus efi_switch_mm(). So, I think, the user
still does not have direct access to EFI_RUNTIME_SERVICES memory regions
but accesses them through sys call.

> What if a perf interrupt happens while you're in the alternate mm?

Since we are disabling/enabling interrupts around switching, I think we
are safe. We do these in following functions
phys_efi_set_virtual_address_map()
efi_thunk_set_virtual_address_map()
efi_call_virt_pointer()

> What if you segfault and dump core?

We could seg fault only if firmware touches regions which it shouldn't.
i.e. Firmware touching regions outside EFI_RUNTIME_SERVICES (this is a
UEFI Spec violation). So, in this case of buggy firmware, we panic (this
is an existing problem). We also map EFI_BOOT_TIME_SERVICES into efi_pgd
because we know some buggy firmware touches these regions.

>  Should we maybe just have a flag
> that says "this cpu is using a funny mm", assert that the flag is
> clear when scheduling, and teach perf, coredumps, etc not to touch
> user memory when the flag is set?
> 
> Admittedly, the latter problem may well have existed even before these 
> patches.

Please let me know if you think otherwise.

Matt,
Please feel free to correct my understanding.

Regards,
Sai



Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-15 Thread Sai Praneeth Prakhya
On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote:
> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
>  wrote:
> > +/*
> > + * Makes the calling kernel thread switch to/from efi_mm context
> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> > + * (Note: This routine is heavily inspired from use_mm)
> > + */
> > +void efi_switch_mm(struct mm_struct *mm)
> > +{
> > +   struct task_struct *tsk = current;
> > +
> > +   task_lock(tsk);
> > +   efi_scratch.prev_mm = tsk->active_mm;
> > +   if (efi_scratch.prev_mm != mm) {
> > +   mmgrab(mm);
> > +   tsk->active_mm = mm;
> > +   }
> > +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> > +   task_unlock(tsk);
> > +
> > +   if (efi_scratch.prev_mm != mm)
> > +   mmdrop(efi_scratch.prev_mm);
> 

Thanks for the quick review Andy,

> I'm confused.  You're mmdropping an mm that you are still keeping a
> pointer to.  This is also a bit confusing in the case where you do
> efi_switch_mm(efi_scratch.prev_mm).
> 

This makes sense, I will look into it.

> This whole manipulation seems fairly dangerous to me for another
> reason -- you're taking a user thread (I think) and swapping out its
> mm to something that the user in question should *not* have access to.

We are switching to efi_mm from user mm_struct because
EFI_RUNTIME_SERVICES like efi_set_variable()/efi_get_variable() are
accessible only through efi_pgd. The user thread calls ioctl() which in
turn calls efi_call() and thus efi_switch_mm(). So, I think, the user
still does not have direct access to EFI_RUNTIME_SERVICES memory regions
but accesses them through sys call.

> What if a perf interrupt happens while you're in the alternate mm?

Since we are disabling/enabling interrupts around switching, I think we
are safe. We do these in following functions
phys_efi_set_virtual_address_map()
efi_thunk_set_virtual_address_map()
efi_call_virt_pointer()

> What if you segfault and dump core?

We could seg fault only if firmware touches regions which it shouldn't.
i.e. Firmware touching regions outside EFI_RUNTIME_SERVICES (this is a
UEFI Spec violation). So, in this case of buggy firmware, we panic (this
is an existing problem). We also map EFI_BOOT_TIME_SERVICES into efi_pgd
because we know some buggy firmware touches these regions.

>  Should we maybe just have a flag
> that says "this cpu is using a funny mm", assert that the flag is
> clear when scheduling, and teach perf, coredumps, etc not to touch
> user memory when the flag is set?
> 
> Admittedly, the latter problem may well have existed even before these 
> patches.

Please let me know if you think otherwise.

Matt,
Please feel free to correct my understanding.

Regards,
Sai



Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-15 Thread Andy Lutomirski
On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
 wrote:
> +/*
> + * Makes the calling kernel thread switch to/from efi_mm context
> + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> + * (Note: This routine is heavily inspired from use_mm)
> + */
> +void efi_switch_mm(struct mm_struct *mm)
> +{
> +   struct task_struct *tsk = current;
> +
> +   task_lock(tsk);
> +   efi_scratch.prev_mm = tsk->active_mm;
> +   if (efi_scratch.prev_mm != mm) {
> +   mmgrab(mm);
> +   tsk->active_mm = mm;
> +   }
> +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> +   task_unlock(tsk);
> +
> +   if (efi_scratch.prev_mm != mm)
> +   mmdrop(efi_scratch.prev_mm);

I'm confused.  You're mmdropping an mm that you are still keeping a
pointer to.  This is also a bit confusing in the case where you do
efi_switch_mm(efi_scratch.prev_mm).

This whole manipulation seems fairly dangerous to me for another
reason -- you're taking a user thread (I think) and swapping out its
mm to something that the user in question should *not* have access to.
What if a perf interrupt happens while you're in the alternate mm?
What if you segfault and dump core?  Should we maybe just have a flag
that says "this cpu is using a funny mm", assert that the flag is
clear when scheduling, and teach perf, coredumps, etc not to touch
user memory when the flag is set?

Admittedly, the latter problem may well have existed even before these patches.

> +}
> +



>  #ifdef CONFIG_EFI_MIXED
>  extern efi_status_t efi64_thunk(u32, ...);
>
> @@ -649,16 +665,13 @@ efi_status_t efi_thunk_set_virtual_address_map(
> efi_sync_low_kernel_mappings();
> local_irq_save(flags);
>
> -   efi_scratch.prev_cr3 = read_cr3();
> -   write_cr3((unsigned long)efi_scratch.efi_pgt);
> -   __flush_tlb_all();
> +   efi_switch_mm(&efi_mm);
>
> func = (u32)(unsigned long)phys_set_virtual_address_map;
> status = efi64_thunk(func, memory_map_size, descriptor_size,
>  descriptor_version, virtual_map);
>
> -   write_cr3(efi_scratch.prev_cr3);
> -   __flush_tlb_all();
> +   efi_switch_mm(efi_scratch.prev_mm);
> local_irq_restore(flags);
>
> return status;


Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-15 Thread Andy Lutomirski
On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya
 wrote:
> +/*
> + * Makes the calling kernel thread switch to/from efi_mm context
> + * Can be used from SetVirtualAddressMap() or during efi runtime calls
> + * (Note: This routine is heavily inspired from use_mm)
> + */
> +void efi_switch_mm(struct mm_struct *mm)
> +{
> +   struct task_struct *tsk = current;
> +
> +   task_lock(tsk);
> +   efi_scratch.prev_mm = tsk->active_mm;
> +   if (efi_scratch.prev_mm != mm) {
> +   mmgrab(mm);
> +   tsk->active_mm = mm;
> +   }
> +   switch_mm(efi_scratch.prev_mm, mm, NULL);
> +   task_unlock(tsk);
> +
> +   if (efi_scratch.prev_mm != mm)
> +   mmdrop(efi_scratch.prev_mm);

I'm confused.  You're mmdropping an mm that you are still keeping a
pointer to.  This is also a bit confusing in the case where you do
efi_switch_mm(efi_scratch.prev_mm).

This whole manipulation seems fairly dangerous to me for another
reason -- you're taking a user thread (I think) and swapping out its
mm to something that the user in question should *not* have access to.
What if a perf interrupt happens while you're in the alternate mm?
What if you segfault and dump core?  Should we maybe just have a flag
that says "this cpu is using a funny mm", assert that the flag is
clear when scheduling, and teach perf, coredumps, etc not to touch
user memory when the flag is set?

Admittedly, the latter problem may well have existed even before these patches.

> +}
> +



>  #ifdef CONFIG_EFI_MIXED
>  extern efi_status_t efi64_thunk(u32, ...);
>
> @@ -649,16 +665,13 @@ efi_status_t efi_thunk_set_virtual_address_map(
> efi_sync_low_kernel_mappings();
> local_irq_save(flags);
>
> -   efi_scratch.prev_cr3 = read_cr3();
> -   write_cr3((unsigned long)efi_scratch.efi_pgt);
> -   __flush_tlb_all();
> +   efi_switch_mm(&efi_mm);
>
> func = (u32)(unsigned long)phys_set_virtual_address_map;
> status = efi64_thunk(func, memory_map_size, descriptor_size,
>  descriptor_version, virtual_map);
>
> -   write_cr3(efi_scratch.prev_cr3);
> -   __flush_tlb_all();
> +   efi_switch_mm(efi_scratch.prev_mm);
> local_irq_restore(flags);
>
> return status;


[PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-15 Thread Sai Praneeth Prakhya
From: Sai Praneeth 

Use helper function (efi_switch_mm()) to switch to/from efi_mm. We
switch to efi_mm before calling
1. efi_set_virtual_address_map() and
2. Invoking any efi_runtime_service()

Likewise, we need to switch back to previous mm (mm context stolen by
efi_mm) after the above calls return successfully. We can use
efi_switch_mm() only with x86_64 kernel and "efi=old_map" disabled
because, x86_32 and efi=old_map doesn't use efi_pgd, rather they use
swapper_pg_dir.

Signed-off-by: Sai Praneeth Prakhya 
Cc: Lee, Chun-Yi 
Cc: Borislav Petkov 
Cc: Andy Lutomirski 
Cc: Michael S. Tsirkin 
Cc: Ricardo Neri 
Cc: Matt Fleming 
Cc: Ard Biesheuvel 
Cc: Ravi Shankar 
---
 arch/x86/include/asm/efi.h   | 30 +-
 arch/x86/platform/efi/efi_64.c   | 41 
 arch/x86/platform/efi/efi_thunk_64.S |  2 +-
 3 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..aa38b546e842 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -1,10 +1,14 @@
 #ifndef _ASM_X86_EFI_H
 #define _ASM_X86_EFI_H
 
+#include 
+#include 
+
 #include 
 #include 
 #include 
 #include 
+#include 
 
 /*
  * We map the EFI regions needed for runtime services non-contiguously,
@@ -57,14 +61,14 @@ extern u64 asmlinkage efi_call(void *fp, ...);
 #define efi_call_phys(f, args...)  efi_call((f), args)
 
 /*
- * Scratch space used for switching the pagetable in the EFI stub
+ * struct efi_scratch - Scratch space used while switching to/from efi_mm
+ * @phys_stack:stack used during EFI Mixed Mode
+ * @prev_mm:   store/restore stolen mm_struct while switching
+ * to/from efi_mm
  */
 struct efi_scratch {
-   u64 r15;
-   u64 prev_cr3;
-   pgd_t   *efi_pgt;
-   booluse_pgd;
-   u64 phys_stack;
+   u64 phys_stack;
+   struct mm_struct*prev_mm;
 } __packed;
 
 #define arch_efi_call_virt_setup() \
@@ -73,11 +77,8 @@ struct efi_scratch {
preempt_disable();  \
__kernel_fpu_begin();   \
\
-   if (efi_scratch.use_pgd) {  \
-   efi_scratch.prev_cr3 = read_cr3();  \
-   write_cr3((unsigned long)efi_scratch.efi_pgt);  \
-   __flush_tlb_all();  \
-   }   \
+   if (!efi_enabled(EFI_OLD_MEMMAP))   \
+   efi_switch_mm(&efi_mm); \
 })
 
 #define arch_efi_call_virt(p, f, args...)  \
@@ -85,10 +86,8 @@ struct efi_scratch {
 
 #define arch_efi_call_virt_teardown()  \
 ({ \
-   if (efi_scratch.use_pgd) {  \
-   write_cr3(efi_scratch.prev_cr3);\
-   __flush_tlb_all();  \
-   }   \
+   if (!efi_enabled(EFI_OLD_MEMMAP))   \
+   efi_switch_mm(efi_scratch.prev_mm); \
\
__kernel_fpu_end(); \
preempt_enable();   \
@@ -130,6 +129,7 @@ extern void __init efi_dump_pagetable(void);
 extern void __init efi_apply_memmap_quirks(void);
 extern int __init efi_reuse_config(u64 tables, int nr_tables);
 extern void efi_delete_dummy_variable(void);
+extern void efi_switch_mm(struct mm_struct *mm);
 
 struct efi_setup_data {
u64 fw_vendor;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 0bb98c35e178..3be94480c1ce 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,9 +80,8 @@ pgd_t * __init efi_call_phys_prolog(void)
int n_pgds, i, j;
 
if (!efi_enabled(EFI_OLD_MEMMAP)) {
-   save_pgd = (pgd_t *)read_cr3();
-   write_cr3((unsigned long)efi_scratch.efi_pgt);
-   goto out;
+   efi_switch_mm(&efi_mm);
+   return NULL;
}
 
early_code_mapping_set_exec(1);
@@ -152,8 +151,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)

[PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3

2017-08-15 Thread Sai Praneeth Prakhya
From: Sai Praneeth 

Use helper function (efi_switch_mm()) to switch to/from efi_mm. We
switch to efi_mm before calling
1. efi_set_virtual_address_map() and
2. Invoking any efi_runtime_service()

Likewise, we need to switch back to previous mm (mm context stolen by
efi_mm) after the above calls return successfully. We can use
efi_switch_mm() only with x86_64 kernel and "efi=old_map" disabled
because, x86_32 and efi=old_map doesn't use efi_pgd, rather they use
swapper_pg_dir.

Signed-off-by: Sai Praneeth Prakhya 
Cc: Lee, Chun-Yi 
Cc: Borislav Petkov 
Cc: Andy Lutomirski 
Cc: Michael S. Tsirkin 
Cc: Ricardo Neri 
Cc: Matt Fleming 
Cc: Ard Biesheuvel 
Cc: Ravi Shankar 
---
 arch/x86/include/asm/efi.h   | 30 +-
 arch/x86/platform/efi/efi_64.c   | 41 
 arch/x86/platform/efi/efi_thunk_64.S |  2 +-
 3 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..aa38b546e842 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -1,10 +1,14 @@
 #ifndef _ASM_X86_EFI_H
 #define _ASM_X86_EFI_H
 
+#include 
+#include 
+
 #include 
 #include 
 #include 
 #include 
+#include 
 
 /*
  * We map the EFI regions needed for runtime services non-contiguously,
@@ -57,14 +61,14 @@ extern u64 asmlinkage efi_call(void *fp, ...);
 #define efi_call_phys(f, args...)  efi_call((f), args)
 
 /*
- * Scratch space used for switching the pagetable in the EFI stub
+ * struct efi_scratch - Scratch space used while switching to/from efi_mm
+ * @phys_stack:stack used during EFI Mixed Mode
+ * @prev_mm:   store/restore stolen mm_struct while switching
+ * to/from efi_mm
  */
 struct efi_scratch {
-   u64 r15;
-   u64 prev_cr3;
-   pgd_t   *efi_pgt;
-   booluse_pgd;
-   u64 phys_stack;
+   u64 phys_stack;
+   struct mm_struct*prev_mm;
 } __packed;
 
 #define arch_efi_call_virt_setup() \
@@ -73,11 +77,8 @@ struct efi_scratch {
preempt_disable();  \
__kernel_fpu_begin();   \
\
-   if (efi_scratch.use_pgd) {  \
-   efi_scratch.prev_cr3 = read_cr3();  \
-   write_cr3((unsigned long)efi_scratch.efi_pgt);  \
-   __flush_tlb_all();  \
-   }   \
+   if (!efi_enabled(EFI_OLD_MEMMAP))   \
+   efi_switch_mm(&efi_mm); \
 })
 
 #define arch_efi_call_virt(p, f, args...)  \
@@ -85,10 +86,8 @@ struct efi_scratch {
 
 #define arch_efi_call_virt_teardown()  \
 ({ \
-   if (efi_scratch.use_pgd) {  \
-   write_cr3(efi_scratch.prev_cr3);\
-   __flush_tlb_all();  \
-   }   \
+   if (!efi_enabled(EFI_OLD_MEMMAP))   \
+   efi_switch_mm(efi_scratch.prev_mm); \
\
__kernel_fpu_end(); \
preempt_enable();   \
@@ -130,6 +129,7 @@ extern void __init efi_dump_pagetable(void);
 extern void __init efi_apply_memmap_quirks(void);
 extern int __init efi_reuse_config(u64 tables, int nr_tables);
 extern void efi_delete_dummy_variable(void);
+extern void efi_switch_mm(struct mm_struct *mm);
 
 struct efi_setup_data {
u64 fw_vendor;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 0bb98c35e178..3be94480c1ce 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,9 +80,8 @@ pgd_t * __init efi_call_phys_prolog(void)
int n_pgds, i, j;
 
if (!efi_enabled(EFI_OLD_MEMMAP)) {
-   save_pgd = (pgd_t *)read_cr3();
-   write_cr3((unsigned long)efi_scratch.efi_pgt);
-   goto out;
+   efi_switch_mm(&efi_mm);
+   return NULL;
}
 
early_code_mapping_set_exec(1);
@@ -152,8 +151,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
pud_t *pud;
 
if (!efi_enabled(EFI_OLD_MEMMAP)) {
-   write_cr3((unsigned long)save_pgd);
-   __flush_tlb_all();
+   efi_switch_mm(efi_scratch.prev_mm);