[PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with %cr3
From: Sai PraneethUse helper function (efi_switch_mm()) to switch to/from efi_mm. We switch to efi_mm before calling 1. efi_set_virtual_address_map() and 2. Invoking any efi_runtime_service() Likewise, we need to switch back to previous mm (mm context stolen by efi_mm) after the above calls return successfully. We can use efi_switch_mm() helper function only with x86_64 kernel and "efi=old_map" disabled because, x86_32 and efi=old_map doesn't use efi_pgd, rather they use swapper_pg_dir. Signed-off-by: Sai Praneeth Prakhya Cc: Lee, Chun-Yi Cc: Borislav Petkov Cc: Tony Luck Cc: Andy Lutomirski Cc: Michael S. Tsirkin Cc: Bhupesh Sharma Cc: Ricardo Neri Cc: Matt Fleming Cc: Ard Biesheuvel Cc: Ravi Shankar Tested-by: Bhupesh Sharma --- arch/x86/include/asm/efi.h | 25 +- arch/x86/platform/efi/efi_64.c | 41 ++-- arch/x86/platform/efi/efi_thunk_64.S | 2 +- 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 00f977ddd718..cda9940bed7a 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -62,14 +62,13 @@ extern asmlinkage u64 efi_call(void *fp, ...); #define efi_call_phys(f, args...) 
efi_call((f), args) /* - * Scratch space used for switching the pagetable in the EFI stub + * struct efi_scratch - Scratch space used while switching to/from efi_mm + * @phys_stack: stack used during EFI Mixed Mode + * @prev_mm:store/restore stolen mm_struct while switching to/from efi_mm */ struct efi_scratch { - u64 r15; - u64 prev_cr3; - pgd_t *efi_pgt; - booluse_pgd; - u64 phys_stack; + u64 phys_stack; + struct mm_struct*prev_mm; } __packed; #define arch_efi_call_virt_setup() \ @@ -78,11 +77,8 @@ struct efi_scratch { preempt_disable(); \ __kernel_fpu_begin(); \ \ - if (efi_scratch.use_pgd) { \ - efi_scratch.prev_cr3 = __read_cr3();\ - write_cr3((unsigned long)efi_scratch.efi_pgt); \ - __flush_tlb_all(); \ - } \ + if (!efi_enabled(EFI_OLD_MEMMAP)) \ + efi_switch_mm(_mm); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -90,10 +86,8 @@ struct efi_scratch { #define arch_efi_call_virt_teardown() \ ({ \ - if (efi_scratch.use_pgd) { \ - write_cr3(efi_scratch.prev_cr3);\ - __flush_tlb_all(); \ - } \ + if (!efi_enabled(EFI_OLD_MEMMAP)) \ + efi_switch_mm(efi_scratch.prev_mm); \ \ __kernel_fpu_end(); \ preempt_enable(); \ @@ -135,6 +129,7 @@ extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); +extern void efi_switch_mm(struct mm_struct *mm); struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6b541bdbda5f..c325b1cc4d1a 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -82,9 +82,8 @@ pgd_t * __init efi_call_phys_prolog(void) int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { - save_pgd = (pgd_t *)__read_cr3(); - write_cr3((unsigned long)efi_scratch.efi_pgt); - goto out; + efi_switch_mm(_mm); + return NULL; } early_code_mapping_set_exec(1); @@ -154,8 +153,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) pud_t 
*pud; if (!efi_enabled(EFI_OLD_MEMMAP)) { -
[PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with %cr3
From: Sai Praneeth Use helper function (efi_switch_mm()) to switch to/from efi_mm. We switch to efi_mm before calling 1. efi_set_virtual_address_map() and 2. Invoking any efi_runtime_service() Likewise, we need to switch back to previous mm (mm context stolen by efi_mm) after the above calls return successfully. We can use efi_switch_mm() helper function only with x86_64 kernel and "efi=old_map" disabled because, x86_32 and efi=old_map doesn't use efi_pgd, rather they use swapper_pg_dir. Signed-off-by: Sai Praneeth Prakhya Cc: Lee, Chun-Yi Cc: Borislav Petkov Cc: Tony Luck Cc: Andy Lutomirski Cc: Michael S. Tsirkin Cc: Bhupesh Sharma Cc: Ricardo Neri Cc: Matt Fleming Cc: Ard Biesheuvel Cc: Ravi Shankar Tested-by: Bhupesh Sharma --- arch/x86/include/asm/efi.h | 25 +- arch/x86/platform/efi/efi_64.c | 41 ++-- arch/x86/platform/efi/efi_thunk_64.S | 2 +- 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 00f977ddd718..cda9940bed7a 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -62,14 +62,13 @@ extern asmlinkage u64 efi_call(void *fp, ...); #define efi_call_phys(f, args...) 
efi_call((f), args) /* - * Scratch space used for switching the pagetable in the EFI stub + * struct efi_scratch - Scratch space used while switching to/from efi_mm + * @phys_stack: stack used during EFI Mixed Mode + * @prev_mm:store/restore stolen mm_struct while switching to/from efi_mm */ struct efi_scratch { - u64 r15; - u64 prev_cr3; - pgd_t *efi_pgt; - booluse_pgd; - u64 phys_stack; + u64 phys_stack; + struct mm_struct*prev_mm; } __packed; #define arch_efi_call_virt_setup() \ @@ -78,11 +77,8 @@ struct efi_scratch { preempt_disable(); \ __kernel_fpu_begin(); \ \ - if (efi_scratch.use_pgd) { \ - efi_scratch.prev_cr3 = __read_cr3();\ - write_cr3((unsigned long)efi_scratch.efi_pgt); \ - __flush_tlb_all(); \ - } \ + if (!efi_enabled(EFI_OLD_MEMMAP)) \ + efi_switch_mm(_mm); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -90,10 +86,8 @@ struct efi_scratch { #define arch_efi_call_virt_teardown() \ ({ \ - if (efi_scratch.use_pgd) { \ - write_cr3(efi_scratch.prev_cr3);\ - __flush_tlb_all(); \ - } \ + if (!efi_enabled(EFI_OLD_MEMMAP)) \ + efi_switch_mm(efi_scratch.prev_mm); \ \ __kernel_fpu_end(); \ preempt_enable(); \ @@ -135,6 +129,7 @@ extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); +extern void efi_switch_mm(struct mm_struct *mm); struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6b541bdbda5f..c325b1cc4d1a 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -82,9 +82,8 @@ pgd_t * __init efi_call_phys_prolog(void) int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { - save_pgd = (pgd_t *)__read_cr3(); - write_cr3((unsigned long)efi_scratch.efi_pgt); - goto out; + efi_switch_mm(_mm); + return NULL; } early_code_mapping_set_exec(1); @@ -154,8 +153,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) pud_t 
*pud; if (!efi_enabled(EFI_OLD_MEMMAP)) { - write_cr3((unsigned long)save_pgd); - __flush_tlb_all(); + efi_switch_mm(efi_scratch.prev_mm); return; } @@ -341,13 +339,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 0; /* -*
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Thu, Aug 24, 2017 at 7:36 PM, Sai Praneeth Prakhyawrote: > On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote: >> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya >> wrote: >> > +/* >> > + * Makes the calling kernel thread switch to/from efi_mm context >> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls >> > + * (Note: This routine is heavily inspired from use_mm) >> > + */ >> > +void efi_switch_mm(struct mm_struct *mm) >> > +{ >> > + struct task_struct *tsk = current; >> > + >> > + task_lock(tsk); >> > + efi_scratch.prev_mm = tsk->active_mm; >> > + if (efi_scratch.prev_mm != mm) { >> > + mmgrab(mm); >> > + tsk->active_mm = mm; >> > + } >> > + switch_mm(efi_scratch.prev_mm, mm, NULL); >> > + task_unlock(tsk); >> > + >> > + if (efi_scratch.prev_mm != mm) >> > + mmdrop(efi_scratch.prev_mm); >> >> I'm confused. You're mmdropping an mm that you are still keeping a >> pointer to. This is also a bit confusing in the case where you do >> efi_switch_mm(efi_scratch.prev_mm). >> >> This whole manipulation seems fairly dangerous to me for another >> reason -- you're taking a user thread (I think) and swapping out its >> mm to something that the user in question should *not* have access to. >> What if a perf interrupt happens while you're in the alternate mm? >> What if you segfault and dump core? Should we maybe just have a flag >> that says "this cpu is using a funny mm", assert that the flag is >> clear when scheduling, and teach perf, coredumps, etc not to touch >> user memory when the flag is set? >> >> Admittedly, the latter problem may well have existed even before these >> patches. > > Hi All, > > Could we please decouple the above issue from this patch set, so that we > could have common efi_mm between x86 and ARM and also improve > readability and maintainability for x86/efi. I don't see why not. 
> > As it seems that "Everything EFI as kthread" might solve the above issue > for real (which might take quite some time to implement, taking into > consideration the complexity involved and some special case with > pstore), do you think this patch set seems OK? > > If so, I will send out a V2 addressing the mmdropping issue. > > Regards, > Sai >
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Thu, Aug 24, 2017 at 7:36 PM, Sai Praneeth Prakhya wrote: > On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote: >> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya >> wrote: >> > +/* >> > + * Makes the calling kernel thread switch to/from efi_mm context >> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls >> > + * (Note: This routine is heavily inspired from use_mm) >> > + */ >> > +void efi_switch_mm(struct mm_struct *mm) >> > +{ >> > + struct task_struct *tsk = current; >> > + >> > + task_lock(tsk); >> > + efi_scratch.prev_mm = tsk->active_mm; >> > + if (efi_scratch.prev_mm != mm) { >> > + mmgrab(mm); >> > + tsk->active_mm = mm; >> > + } >> > + switch_mm(efi_scratch.prev_mm, mm, NULL); >> > + task_unlock(tsk); >> > + >> > + if (efi_scratch.prev_mm != mm) >> > + mmdrop(efi_scratch.prev_mm); >> >> I'm confused. You're mmdropping an mm that you are still keeping a >> pointer to. This is also a bit confusing in the case where you do >> efi_switch_mm(efi_scratch.prev_mm). >> >> This whole manipulation seems fairly dangerous to me for another >> reason -- you're taking a user thread (I think) and swapping out its >> mm to something that the user in question should *not* have access to. >> What if a perf interrupt happens while you're in the alternate mm? >> What if you segfault and dump core? Should we maybe just have a flag >> that says "this cpu is using a funny mm", assert that the flag is >> clear when scheduling, and teach perf, coredumps, etc not to touch >> user memory when the flag is set? >> >> Admittedly, the latter problem may well have existed even before these >> patches. > > Hi All, > > Could we please decouple the above issue from this patch set, so that we > could have common efi_mm between x86 and ARM and also improve > readability and maintainability for x86/efi. I don't see why not. 
> > As it seems that "Everything EFI as kthread" might solve the above issue > for real (which might take quite some time to implement, taking into > consideration the complexity involved and some special case with > pstore), do you think this patch set seems OK? > > If so, I will send out a V2 addressing the mmdropping issue. > > Regards, > Sai >
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 23, 2017 at 3:52 PM, Sai Praneeth Prakhyawrote: > On Mon, 2017-08-21 at 08:23 -0700, Andy Lutomirski wrote: >> >> > On Aug 21, 2017, at 7:08 AM, Peter Zijlstra wrote: >> > >> >> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote: >> >> >> >> >> >>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra wrote: >> > >> >> Using a kernel thread solves the problem for real. Anything that >> blindly accesses user memory in kernel thread context is terminally >> broken no matter what. >> >>> >> >>> So perf-callchain doesn't do it 'blindly', it wants either: >> >>> >> >>> - user_mode(regs) true, or >> >>> - task_pt_regs() set. >> >>> >> >>> However I'm thinking that if the kernel thread has ->mm == _mm, the >> >>> EFI code running could very well have user_mode(regs) being true. >> >>> >> >>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are >> >>> accessible. It bails on error though. So while its careful, it does >> >>> attempt to access the 'user' mapping directly. Which should also trigger >> >>> with the EFI code. >> >>> >> >>> And I'm not seeing anything particularly broken with either. The PEBS >> >>> fixup relies on the CPU having just executed the code, and if it could >> >>> fetch and execute the code, why shouldn't it be able to fetch and read? >> >> >> >> There are two ways this could be a problem. One is that u privileged >> >> user apps shouldn't be able to read from EFI memory. >> > >> > Ah, but only root can create per-cpu events or attach events to kernel >> > threads (with sensible paranoia levels). >> >> But this may not need to be percpu. If a non root user can trigger, say, an >> EFI variable read in their own thread context, boom. 
>> > + Tony > > Hi Andi, > > I am trying to reproduce the issue that we are discussing and hence > tried an experiment like this: > A user process continuously reads efi variable by > "cat /sys/firmware/efi/efivars/Boot-8be4df61-93ca-11d2-aa0d-00e098032b8c" > for specified time (Eg: 100 seconds) and simultaneously I ran "perf top" as > root (which I suppose should trigger NMI's). I see that everything is fine, > no lockups, no kernel crash, no warnings/errors in dmesg. > > I see that perf top reports 50% of time is spent in efi function > (probably efi_get_variable()). > OverheadShared Object Symbol > 50% [unknown] [k] 0xfffeea967416 > > 50% is max, on avg it's 35%. > > I have tested this on two kernels v4.12 and v3.19. My machine has 8 > cores and to stress test, I further offlined all cpus except cpu0. > > Could you please let me know a way to reproduce the issue that we are > discussing here. > I think the issue we are concerned here is, when kernel is in efi > context and an NMI happens and if the NMI handler tries to access user > space, boom! we don't have user space in efi context. Am I right in > understanding the issue or is it something else? The boom isn't a crash, though -- it'll be (potentially) sensitive information that shows up in the perf record. As long as EFI isn't using low addresses, there may not be an issue. But EFI should (maybe) use low addresses, and this'll be more important.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 23, 2017 at 3:52 PM, Sai Praneeth Prakhya wrote: > On Mon, 2017-08-21 at 08:23 -0700, Andy Lutomirski wrote: >> >> > On Aug 21, 2017, at 7:08 AM, Peter Zijlstra wrote: >> > >> >> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote: >> >> >> >> >> >>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra wrote: >> > >> >> Using a kernel thread solves the problem for real. Anything that >> blindly accesses user memory in kernel thread context is terminally >> broken no matter what. >> >>> >> >>> So perf-callchain doesn't do it 'blindly', it wants either: >> >>> >> >>> - user_mode(regs) true, or >> >>> - task_pt_regs() set. >> >>> >> >>> However I'm thinking that if the kernel thread has ->mm == _mm, the >> >>> EFI code running could very well have user_mode(regs) being true. >> >>> >> >>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are >> >>> accessible. It bails on error though. So while its careful, it does >> >>> attempt to access the 'user' mapping directly. Which should also trigger >> >>> with the EFI code. >> >>> >> >>> And I'm not seeing anything particularly broken with either. The PEBS >> >>> fixup relies on the CPU having just executed the code, and if it could >> >>> fetch and execute the code, why shouldn't it be able to fetch and read? >> >> >> >> There are two ways this could be a problem. One is that u privileged >> >> user apps shouldn't be able to read from EFI memory. >> > >> > Ah, but only root can create per-cpu events or attach events to kernel >> > threads (with sensible paranoia levels). >> >> But this may not need to be percpu. If a non root user can trigger, say, an >> EFI variable read in their own thread context, boom. 
>> > + Tony > > Hi Andi, > > I am trying to reproduce the issue that we are discussing and hence > tried an experiment like this: > A user process continuously reads efi variable by > "cat /sys/firmware/efi/efivars/Boot-8be4df61-93ca-11d2-aa0d-00e098032b8c" > for specified time (Eg: 100 seconds) and simultaneously I ran "perf top" as > root (which I suppose should trigger NMI's). I see that everything is fine, > no lockups, no kernel crash, no warnings/errors in dmesg. > > I see that perf top reports 50% of time is spent in efi function > (probably efi_get_variable()). > OverheadShared Object Symbol > 50% [unknown] [k] 0xfffeea967416 > > 50% is max, on avg it's 35%. > > I have tested this on two kernels v4.12 and v3.19. My machine has 8 > cores and to stress test, I further offlined all cpus except cpu0. > > Could you please let me know a way to reproduce the issue that we are > discussing here. > I think the issue we are concerned here is, when kernel is in efi > context and an NMI happens and if the NMI handler tries to access user > space, boom! we don't have user space in efi context. Am I right in > understanding the issue or is it something else? The boom isn't a crash, though -- it'll be (potentially) sensitive information that shows up in the perf record. As long as EFI isn't using low addresses, there may not be an issue. But EFI should (maybe) use low addresses, and this'll be more important.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote: > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya >wrote: > > +/* > > + * Makes the calling kernel thread switch to/from efi_mm context > > + * Can be used from SetVirtualAddressMap() or during efi runtime calls > > + * (Note: This routine is heavily inspired from use_mm) > > + */ > > +void efi_switch_mm(struct mm_struct *mm) > > +{ > > + struct task_struct *tsk = current; > > + > > + task_lock(tsk); > > + efi_scratch.prev_mm = tsk->active_mm; > > + if (efi_scratch.prev_mm != mm) { > > + mmgrab(mm); > > + tsk->active_mm = mm; > > + } > > + switch_mm(efi_scratch.prev_mm, mm, NULL); > > + task_unlock(tsk); > > + > > + if (efi_scratch.prev_mm != mm) > > + mmdrop(efi_scratch.prev_mm); > > I'm confused. You're mmdropping an mm that you are still keeping a > pointer to. This is also a bit confusing in the case where you do > efi_switch_mm(efi_scratch.prev_mm). > > This whole manipulation seems fairly dangerous to me for another > reason -- you're taking a user thread (I think) and swapping out its > mm to something that the user in question should *not* have access to. > What if a perf interrupt happens while you're in the alternate mm? > What if you segfault and dump core? Should we maybe just have a flag > that says "this cpu is using a funny mm", assert that the flag is > clear when scheduling, and teach perf, coredumps, etc not to touch > user memory when the flag is set? > > Admittedly, the latter problem may well have existed even before these > patches. Hi All, Could we please decouple the above issue from this patch set, so that we could have common efi_mm between x86 and ARM and also improve readability and maintainability for x86/efi. As it seems that "Everything EFI as kthread" might solve the above issue for real (which might take quite some time to implement, taking into consideration the complexity involved and some special case with pstore), do you think this patch set seems OK? 
If so, I will send out a V2 addressing the mmdropping issue. Regards, Sai
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote: > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya > wrote: > > +/* > > + * Makes the calling kernel thread switch to/from efi_mm context > > + * Can be used from SetVirtualAddressMap() or during efi runtime calls > > + * (Note: This routine is heavily inspired from use_mm) > > + */ > > +void efi_switch_mm(struct mm_struct *mm) > > +{ > > + struct task_struct *tsk = current; > > + > > + task_lock(tsk); > > + efi_scratch.prev_mm = tsk->active_mm; > > + if (efi_scratch.prev_mm != mm) { > > + mmgrab(mm); > > + tsk->active_mm = mm; > > + } > > + switch_mm(efi_scratch.prev_mm, mm, NULL); > > + task_unlock(tsk); > > + > > + if (efi_scratch.prev_mm != mm) > > + mmdrop(efi_scratch.prev_mm); > > I'm confused. You're mmdropping an mm that you are still keeping a > pointer to. This is also a bit confusing in the case where you do > efi_switch_mm(efi_scratch.prev_mm). > > This whole manipulation seems fairly dangerous to me for another > reason -- you're taking a user thread (I think) and swapping out its > mm to something that the user in question should *not* have access to. > What if a perf interrupt happens while you're in the alternate mm? > What if you segfault and dump core? Should we maybe just have a flag > that says "this cpu is using a funny mm", assert that the flag is > clear when scheduling, and teach perf, coredumps, etc not to touch > user memory when the flag is set? > > Admittedly, the latter problem may well have existed even before these > patches. Hi All, Could we please decouple the above issue from this patch set, so that we could have common efi_mm between x86 and ARM and also improve readability and maintainability for x86/efi. As it seems that "Everything EFI as kthread" might solve the above issue for real (which might take quite some time to implement, taking into consideration the complexity involved and some special case with pstore), do you think this patch set seems OK? 
If so, I will send out a V2 addressing the mmdropping issue. Regards, Sai
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Mon, 2017-08-21 at 08:23 -0700, Andy Lutomirski wrote: > > > On Aug 21, 2017, at 7:08 AM, Peter Zijlstrawrote: > > > >> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote: > >> > >> > >>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra wrote: > > > > Using a kernel thread solves the problem for real. Anything that > blindly accesses user memory in kernel thread context is terminally > broken no matter what. > >>> > >>> So perf-callchain doesn't do it 'blindly', it wants either: > >>> > >>> - user_mode(regs) true, or > >>> - task_pt_regs() set. > >>> > >>> However I'm thinking that if the kernel thread has ->mm == _mm, the > >>> EFI code running could very well have user_mode(regs) being true. > >>> > >>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are > >>> accessible. It bails on error though. So while its careful, it does > >>> attempt to access the 'user' mapping directly. Which should also trigger > >>> with the EFI code. > >>> > >>> And I'm not seeing anything particularly broken with either. The PEBS > >>> fixup relies on the CPU having just executed the code, and if it could > >>> fetch and execute the code, why shouldn't it be able to fetch and read? > >> > >> There are two ways this could be a problem. One is that u privileged > >> user apps shouldn't be able to read from EFI memory. > > > > Ah, but only root can create per-cpu events or attach events to kernel > > threads (with sensible paranoia levels). > > But this may not need to be percpu. If a non root user can trigger, say, an > EFI variable read in their own thread context, boom. > + Tony Hi Andi, I am trying to reproduce the issue that we are discussing and hence tried an experiment like this: A user process continuously reads efi variable by "cat /sys/firmware/efi/efivars/Boot-8be4df61-93ca-11d2-aa0d-00e098032b8c" for specified time (Eg: 100 seconds) and simultaneously I ran "perf top" as root (which I suppose should trigger NMI's). 
I see that everything is fine, no lockups, no kernel crash, no warnings/errors in dmesg. I see that perf top reports 50% of time is spent in efi function (probably efi_get_variable()). Overhead  Shared Object  Symbol 50% [unknown] [k] 0xfffeea967416 50% is max, on avg it's 35%. I have tested this on two kernels v4.12 and v3.19. My machine has 8 cores and to stress test, I further offlined all cpus except cpu0. Could you please let me know a way to reproduce the issue that we are discussing here. I think the issue we are concerned here is, when kernel is in efi context and an NMI happens and if the NMI handler tries to access user space, boom! we don't have user space in efi context. Am I right in understanding the issue or is it something else? Regards, Sai
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Mon, 2017-08-21 at 08:23 -0700, Andy Lutomirski wrote: > > > On Aug 21, 2017, at 7:08 AM, Peter Zijlstra wrote: > > > >> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote: > >> > >> > >>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra wrote: > > > > Using a kernel thread solves the problem for real. Anything that > blindly accesses user memory in kernel thread context is terminally > broken no matter what. > >>> > >>> So perf-callchain doesn't do it 'blindly', it wants either: > >>> > >>> - user_mode(regs) true, or > >>> - task_pt_regs() set. > >>> > >>> However I'm thinking that if the kernel thread has ->mm == _mm, the > >>> EFI code running could very well have user_mode(regs) being true. > >>> > >>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are > >>> accessible. It bails on error though. So while its careful, it does > >>> attempt to access the 'user' mapping directly. Which should also trigger > >>> with the EFI code. > >>> > >>> And I'm not seeing anything particularly broken with either. The PEBS > >>> fixup relies on the CPU having just executed the code, and if it could > >>> fetch and execute the code, why shouldn't it be able to fetch and read? > >> > >> There are two ways this could be a problem. One is that u privileged > >> user apps shouldn't be able to read from EFI memory. > > > > Ah, but only root can create per-cpu events or attach events to kernel > > threads (with sensible paranoia levels). > > But this may not need to be percpu. If a non root user can trigger, say, an > EFI variable read in their own thread context, boom. > + Tony Hi Andi, I am trying to reproduce the issue that we are discussing and hence tried an experiment like this: A user process continuously reads efi variable by "cat /sys/firmware/efi/efivars/Boot-8be4df61-93ca-11d2-aa0d-00e098032b8c" for specified time (Eg: 100 seconds) and simultaneously I ran "perf top" as root (which I suppose should trigger NMI's). 
I see that everything is fine, no lockups, no kernel crash, no warnings/errors in dmesg. I see that perf top reports 50% of time is spent in efi function (probably efi_get_variable()). Overhead  Shared Object  Symbol 50% [unknown] [k] 0xfffeea967416 50% is max, on avg it's 35%. I have tested this on two kernels v4.12 and v3.19. My machine has 8 cores and to stress test, I further offlined all cpus except cpu0. Could you please let me know a way to reproduce the issue that we are discussing here. I think the issue we are concerned here is, when kernel is in efi context and an NMI happens and if the NMI handler tries to access user space, boom! we don't have user space in efi context. Am I right in understanding the issue or is it something else? Regards, Sai
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote: > There are two ways this could be a problem. One is that unprivileged > user apps shouldn't be able to read from EFI memory. The other is > that, if EFI were to have IO memory mapped at a "user" address, perf > could end up reading it. So assuming the efi_switch_mm() case from the calling thread context, I don't see how we can avoid it at all. Suppose we have a free running PEBS counter (PEBS puts samples in DS buffer and only raises PMI when 'full'). This can easily cover the entire efi_switch_mm() and back swizzle, and then we have 'userspace' samples that don't correspond to actual userspace. EFI (pretending to be userspace) is a giant trainwreck.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote: > There are two ways this could be a problem. One is that unprivileged > user apps shouldn't be able to read from EFI memory. The other is > that, if EFI were to have IO memory mapped at a "user" address, perf > could end up reading it. So assuming the efi_switch_mm() case from the calling thread context, I don't see how we can avoid it at all. Suppose we have a free running PEBS counter (PEBS puts samples in DS buffer and only raises PMI when 'full'). This can easily cover the entire efi_switch_mm() and back swizzle, and then we have 'userspace' samples that don't correspond to actual userspace. EFI (pretending to be userspace) is a giant trainwreck.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On 21 August 2017 at 16:59, Peter Zijlstra wrote: > On Mon, Aug 21, 2017 at 08:23:10AM -0700, Andy Lutomirski wrote: >> > Ah, but only root can create per-cpu events or attach events to kernel >> > threads (with sensible paranoia levels). >> >> But this may not need to be percpu. If a non root user can trigger, say, an >> EFI variable read in their own thread context, boom. > > I was going by the proposed: "everything EFI in a kthread" model. But > yes, if that's not done, then you're quite right. > How does this work in cases where we need to call into UEFI from non-process context? Or at least from a context where current != EFI's kthread. We have EFI pstore code, for instance, that records panic data. Should we make an exception for those? I'm happy to have a stab at implementing the EFI kthread, but I'd like to get some of these details clarified first.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On 21 August 2017 at 16:59, Peter Zijlstra wrote: > On Mon, Aug 21, 2017 at 08:23:10AM -0700, Andy Lutomirski wrote: >> > Ah, but only root can create per-cpu events or attach events to kernel >> > threads (with sensible paranoia levels). >> >> But this may not need to be percpu. If a non root user can trigger, say, an >> EFI variable read in their own thread context, boom. > > I was going by the proposed: "everything EFI in a kthread" model. But > yes, if that's not done, then you're quite right. > How does this work in cases where we need to call into UEFI from non-process context? Or at least from a context where current != EFI's kthread. We have EFI pstore code, for instance, that records panic data. Should we make an exception for those? I'm happy to have a stab at implementing the EFI kthread, but I'd like to get some of these details clarified first.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Mon, Aug 21, 2017 at 08:23:10AM -0700, Andy Lutomirski wrote: > > Ah, but only root can create per-cpu events or attach events to kernel > > threads (with sensible paranoia levels). > > But this may not need to be percpu. If a non root user can trigger, say, an > EFI variable read in their own thread context, boom. I was going by the proposed: "everything EFI in a kthread" model. But yes, if that's not done, then you're quite right. > > > >> The other is that, if EFI were to have IO memory mapped at a "user" > >> address, perf could end up reading it. > > > > Ah, but in neither mode does perf assume much, the LBR follows branches > > the CPU took and thus we _know_ there was code there, not MMIO. And the > > stack unwind simply follows the stack up, although I suppose it could be > > 'tricked' into probing MMIO. We can certainly add an "->mm != > > ->active_mm" escape clause to the unwind code. > > > > Although I don't see how we're currently avoiding the same problem with > > existing userspace unwinds, userspace can equally have MMIO mapped. > > But user space at least only has IO mapped to which the user program in > question has rights. Still, we should not mess it up just because we're trying to unwind stacks.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Mon, Aug 21, 2017 at 08:23:10AM -0700, Andy Lutomirski wrote: > > Ah, but only root can create per-cpu events or attach events to kernel > > threads (with sensible paranoia levels). > > But this may not need to be percpu. If a non root user can trigger, say, an > EFI variable read in their own thread context, boom. I was going by the proposed: "everything EFI in a kthread" model. But yes, if that's not done, then you're quite right. > > > >> The other is that, if EFI were to have IO memory mapped at a "user" > >> address, perf could end up reading it. > > > > Ah, but in neither mode does perf assume much, the LBR follows branches > > the CPU took and thus we _know_ there was code there, not MMIO. And the > > stack unwind simply follows the stack up, although I suppose it could be > > 'tricked' into probing MMIO. We can certainly add an "->mm != > > ->active_mm" escape clause to the unwind code. > > > > Although I don't see how we're currently avoiding the same problem with > > existing userspace unwinds, userspace can equally have MMIO mapped. > > But user space at least only has IO mapped to which the user program in > question has rights. Still, we should not mess it up just because we're trying to unwind stacks.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
> On Aug 21, 2017, at 7:08 AM, Peter Zijlstra wrote: > >> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote: >> >> >>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra wrote: > Using a kernel thread solves the problem for real. Anything that blindly accesses user memory in kernel thread context is terminally broken no matter what. >>> >>> So perf-callchain doesn't do it 'blindly', it wants either: >>> >>> - user_mode(regs) true, or >>> - task_pt_regs() set. >>> >>> However I'm thinking that if the kernel thread has ->mm == &efi_mm, the >>> EFI code running could very well have user_mode(regs) being true. >>> >>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are >>> accessible. It bails on error though. So while its careful, it does >>> attempt to access the 'user' mapping directly. Which should also trigger >>> with the EFI code. >>> >>> And I'm not seeing anything particularly broken with either. The PEBS >>> fixup relies on the CPU having just executed the code, and if it could >>> fetch and execute the code, why shouldn't it be able to fetch and read? >> >> There are two ways this could be a problem. One is that unprivileged >> user apps shouldn't be able to read from EFI memory. > > Ah, but only root can create per-cpu events or attach events to kernel > threads (with sensible paranoia levels). But this may not need to be percpu. If a non root user can trigger, say, an EFI variable read in their own thread context, boom. > >> The other is that, if EFI were to have IO memory mapped at a "user" >> address, perf could end up reading it. > > Ah, but in neither mode does perf assume much, the LBR follows branches > the CPU took and thus we _know_ there was code there, not MMIO. And the > stack unwind simply follows the stack up, although I suppose it could be > 'tricked' into probing MMIO. We can certainly add an "->mm != > ->active_mm" escape clause to the unwind code.
> > Although I don't see how we're currently avoiding the same problem with > existing userspace unwinds, userspace can equally have MMIO mapped. But user space at least only has IO mapped to which the user program in question has rights. > > But neither will use pre-existing user addresses in the efi_mm I think.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
> On Aug 21, 2017, at 7:08 AM, Peter Zijlstra wrote: > >> On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote: >> >> >>> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra wrote: > Using a kernel thread solves the problem for real. Anything that blindly accesses user memory in kernel thread context is terminally broken no matter what. >>> >>> So perf-callchain doesn't do it 'blindly', it wants either: >>> >>> - user_mode(regs) true, or >>> - task_pt_regs() set. >>> >>> However I'm thinking that if the kernel thread has ->mm == _mm, the >>> EFI code running could very well have user_mode(regs) being true. >>> >>> intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are >>> accessible. It bails on error though. So while its careful, it does >>> attempt to access the 'user' mapping directly. Which should also trigger >>> with the EFI code. >>> >>> And I'm not seeing anything particularly broken with either. The PEBS >>> fixup relies on the CPU having just executed the code, and if it could >>> fetch and execute the code, why shouldn't it be able to fetch and read? >> >> There are two ways this could be a problem. One is that u privileged >> user apps shouldn't be able to read from EFI memory. > > Ah, but only root can create per-cpu events or attach events to kernel > threads (with sensible paranoia levels). But this may not need to be percpu. If a non root user can trigger, say, an EFI variable read in their own thread context, boom. > >> The other is that, if EFI were to have IO memory mapped at a "user" >> address, perf could end up reading it. > > Ah, but in neither mode does perf assume much, the LBR follows branches > the CPU took and thus we _know_ there was code there, not MMIO. And the > stack unwind simply follows the stack up, although I suppose it could be > 'tricked' into probing MMIO. We can certainly add an "->mm != > ->active_mm" escape clause to the unwind code. 
> > Although I don't see how we're currently avoiding the same problem with > existing userspace unwinds, userspace can equally have MMIO mapped. But user space at least only has IO mapped to which the user program in question has rights. > > But neither will use pre-existing user addresses in the efi_mm I think.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote: > > > > On Aug 21, 2017, at 3:33 AM, Peter Zijlstra wrote: > >> > >> Using a kernel thread solves the problem for real. Anything that > >> blindly accesses user memory in kernel thread context is terminally > >> broken no matter what. > > > > So perf-callchain doesn't do it 'blindly', it wants either: > > > > - user_mode(regs) true, or > > - task_pt_regs() set. > > > > However I'm thinking that if the kernel thread has ->mm == &efi_mm, the > > EFI code running could very well have user_mode(regs) being true. > > > > intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are > > accessible. It bails on error though. So while its careful, it does > > attempt to access the 'user' mapping directly. Which should also trigger > > with the EFI code. > > > > And I'm not seeing anything particularly broken with either. The PEBS > > fixup relies on the CPU having just executed the code, and if it could > > fetch and execute the code, why shouldn't it be able to fetch and read? > > There are two ways this could be a problem. One is that unprivileged > user apps shouldn't be able to read from EFI memory. Ah, but only root can create per-cpu events or attach events to kernel threads (with sensible paranoia levels). > The other is that, if EFI were to have IO memory mapped at a "user" > address, perf could end up reading it. Ah, but in neither mode does perf assume much, the LBR follows branches the CPU took and thus we _know_ there was code there, not MMIO. And the stack unwind simply follows the stack up, although I suppose it could be 'tricked' into probing MMIO. We can certainly add an "->mm != ->active_mm" escape clause to the unwind code. Although I don't see how we're currently avoiding the same problem with existing userspace unwinds, userspace can equally have MMIO mapped. But neither will use pre-existing user addresses in the efi_mm I think.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Mon, Aug 21, 2017 at 06:56:01AM -0700, Andy Lutomirski wrote: > > > > On Aug 21, 2017, at 3:33 AM, Peter Zijlstra wrote: > >> > >> Using a kernel thread solves the problem for real. Anything that > >> blindly accesses user memory in kernel thread context is terminally > >> broken no matter what. > > > > So perf-callchain doesn't do it 'blindly', it wants either: > > > > - user_mode(regs) true, or > > - task_pt_regs() set. > > > > However I'm thinking that if the kernel thread has ->mm == _mm, the > > EFI code running could very well have user_mode(regs) being true. > > > > intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are > > accessible. It bails on error though. So while its careful, it does > > attempt to access the 'user' mapping directly. Which should also trigger > > with the EFI code. > > > > And I'm not seeing anything particularly broken with either. The PEBS > > fixup relies on the CPU having just executed the code, and if it could > > fetch and execute the code, why shouldn't it be able to fetch and read? > > There are two ways this could be a problem. One is that u privileged > user apps shouldn't be able to read from EFI memory. Ah, but only root can create per-cpu events or attach events to kernel threads (with sensible paranoia levels). > The other is that, if EFI were to have IO memory mapped at a "user" > address, perf could end up reading it. Ah, but in neither mode does perf assume much, the LBR follows branches the CPU took and thus we _know_ there was code there, not MMIO. And the stack unwind simply follows the stack up, although I suppose it could be 'tricked' into probing MMIO. We can certainly add an "->mm != ->active_mm" escape clause to the unwind code. Although I don't see how we're currently avoiding the same problem with existing userspace unwinds, userspace can equally have MMIO mapped. But neither will use pre-existing user addresses in the efi_mm I think.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra wrote: > >> On Thu, Aug 17, 2017 at 08:52:38AM -0700, Andy Lutomirski wrote: >> On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon wrote: > >>> I'm still concerned that we're treating perf specially here -- are we >>> absolutely sure that nobody else is going to attempt user accesses off the >>> back of an interrupt? >> >> Reasonably sure? If nothing else, an interrupt taken while mmap_sem() >> is held for write that tries to access user memory is asking for >> serious trouble. There are still a few callers of pagefault_disable() >> and copy...inatomic(), though. > > I'm not immediately seeing how holding mmap_sem for writing is a > problem. > >>> If not, then I'd much prefer a solution that catches >>> anybody doing that with the EFI page table installed, rather than trying >>> to play whack-a-mole like this. >> >> Using a kernel thread solves the problem for real. Anything that >> blindly accesses user memory in kernel thread context is terminally >> broken no matter what. > > So perf-callchain doesn't do it 'blindly', it wants either: > > - user_mode(regs) true, or > - task_pt_regs() set. > > However I'm thinking that if the kernel thread has ->mm == &efi_mm, the > EFI code running could very well have user_mode(regs) being true. > > intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are > accessible. It bails on error though. So while its careful, it does > attempt to access the 'user' mapping directly. Which should also trigger > with the EFI code. > > And I'm not seeing anything particularly broken with either. The PEBS > fixup relies on the CPU having just executed the code, and if it could > fetch and execute the code, why shouldn't it be able to fetch and read? There are two ways this could be a problem. One is that unprivileged user apps shouldn't be able to read from EFI memory. The other is that, if EFI were to have IO memory mapped at a "user" address, perf could end up reading it.
> (eXecute implies Read assumed). And like said, it if triggers a fault, > it bails, no worries. > > It really doesn't care if the task is a kernel thread or not. Same for > the unwinder, if we get an interrupt register set that points into > 'userspace' we try and unwind it.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
> On Aug 21, 2017, at 3:33 AM, Peter Zijlstra wrote: > >> On Thu, Aug 17, 2017 at 08:52:38AM -0700, Andy Lutomirski wrote: >> On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon wrote: > >>> I'm still concerned that we're treating perf specially here -- are we >>> absolutely sure that nobody else is going to attempt user accesses off the >>> back of an interrupt? >> >> Reasonably sure? If nothing else, an interrupt taken while mmap_sem() >> is held for write that tries to access user memory is asking for >> serious trouble. There are still a few callers of pagefault_disable() >> and copy...inatomic(), though. > > I'm not immediately seeing how holding mmap_sem for writing is a > problem. > >>> If not, then I'd much prefer a solution that catches >>> anybody doing that with the EFI page table installed, rather than trying >>> to play whack-a-mole like this. >> >> Using a kernel thread solves the problem for real. Anything that >> blindly accesses user memory in kernel thread context is terminally >> broken no matter what. > > So perf-callchain doesn't do it 'blindly', it wants either: > > - user_mode(regs) true, or > - task_pt_regs() set. > > However I'm thinking that if the kernel thread has ->mm == _mm, the > EFI code running could very well have user_mode(regs) being true. > > intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are > accessible. It bails on error though. So while its careful, it does > attempt to access the 'user' mapping directly. Which should also trigger > with the EFI code. > > And I'm not seeing anything particularly broken with either. The PEBS > fixup relies on the CPU having just executed the code, and if it could > fetch and execute the code, why shouldn't it be able to fetch and read? There are two ways this could be a problem. One is that u privileged user apps shouldn't be able to read from EFI memory. The other is that, if EFI were to have IO memory mapped at a "user" address, perf could end up reading it. 
> (eXecute implies Read assumed). And like said, it if triggers a fault, > it bails, no worries. > > It really doesn't care if the task is a kernel thread or not. Same for > the unwinder, if we get an interrupt register set that points into > 'userspace' we try and unwind it.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Thu, Aug 17, 2017 at 08:52:38AM -0700, Andy Lutomirski wrote: > On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon wrote: > > I'm still concerned that we're treating perf specially here -- are we > > absolutely sure that nobody else is going to attempt user accesses off the > > back of an interrupt? > > Reasonably sure? If nothing else, an interrupt taken while mmap_sem() > is held for write that tries to access user memory is asking for > serious trouble. There are still a few callers of pagefault_disable() > and copy...inatomic(), though. I'm not immediately seeing how holding mmap_sem for writing is a problem. > > If not, then I'd much prefer a solution that catches > > anybody doing that with the EFI page table installed, rather than trying > > to play whack-a-mole like this. > > Using a kernel thread solves the problem for real. Anything that > blindly accesses user memory in kernel thread context is terminally > broken no matter what. So perf-callchain doesn't do it 'blindly', it wants either: - user_mode(regs) true, or - task_pt_regs() set. However I'm thinking that if the kernel thread has ->mm == &efi_mm, the EFI code running could very well have user_mode(regs) being true. intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are accessible. It bails on error though. So while its careful, it does attempt to access the 'user' mapping directly. Which should also trigger with the EFI code. And I'm not seeing anything particularly broken with either. The PEBS fixup relies on the CPU having just executed the code, and if it could fetch and execute the code, why shouldn't it be able to fetch and read? (eXecute implies Read assumed). And like said, if it triggers a fault, it bails, no worries. It really doesn't care if the task is a kernel thread or not. Same for the unwinder, if we get an interrupt register set that points into 'userspace' we try and unwind it.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Thu, Aug 17, 2017 at 08:52:38AM -0700, Andy Lutomirski wrote: > On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon wrote: > > I'm still concerned that we're treating perf specially here -- are we > > absolutely sure that nobody else is going to attempt user accesses off the > > back of an interrupt? > > Reasonably sure? If nothing else, an interrupt taken while mmap_sem() > is held for write that tries to access user memory is asking for > serious trouble. There are still a few callers of pagefault_disable() > and copy...inatomic(), though. I'm not immediately seeing how holding mmap_sem for writing is a problem. > > If not, then I'd much prefer a solution that catches > > anybody doing that with the EFI page table installed, rather than trying > > to play whack-a-mole like this. > > Using a kernel thread solves the problem for real. Anything that > blindly accesses user memory in kernel thread context is terminally > broken no matter what. So perf-callchain doesn't do it 'blindly', it wants either: - user_mode(regs) true, or - task_pt_regs() set. However I'm thinking that if the kernel thread has ->mm == _mm, the EFI code running could very well have user_mode(regs) being true. intel_pmu_pebs_fixup() OTOH 'blindly' assumes that the LBR addresses are accessible. It bails on error though. So while its careful, it does attempt to access the 'user' mapping directly. Which should also trigger with the EFI code. And I'm not seeing anything particularly broken with either. The PEBS fixup relies on the CPU having just executed the code, and if it could fetch and execute the code, why shouldn't it be able to fetch and read? (eXecute implies Read assumed). And like said, it if triggers a fault, it bails, no worries. It really doesn't care if the task is a kernel thread or not. Same for the unwinder, if we get an interrupt register set that points into 'userspace' we try and unwind it.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon wrote: > On Tue, Aug 15, 2017 at 11:35:41PM +0100, Mark Rutland wrote: >> On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote: >> > On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming >> > wrote: >> > > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote: >> > >> >> > >> I'd expect we'd abort at a higher level, not taking any sample. i.e. >> > >> we'd have the core overflow handler check in_funny_mm(), and if so, skip >> > >> the sample, as with the skid case. >> > > >> > > FYI, this is my preferred solution for x86 too. >> > >> > One option for the "funny mm" flag would be literally the condition >> > current->mm != current->active_mm. I *think* this gets all the cases >> > right as long as efi_switch_mm is careful with its ordering and that >> > the arch switch_mm() code can handle the resulting ordering. (x86's >> > can now, I think, or at least will be able to in 4.14 -- not sure >> > about other arches). >> >> For arm64 we'd have to rework things a bit to get the ordering right >> (especially when we flip to/from the idmap), but otherwise this sounds sane >> to >> me. >> >> > That being said, there's a totally different solution: run EFI >> > callbacks in a kernel thread. This has other benefits: we could run >> > those callbacks in user mode some day, and doing *that* in a user >> > thread seems like a mistake. >> >> I think that wouldn't work for CPU-bound perf events (which are not >> ctx-switched with the task). >> >> It might be desirable to do that anyway, though. > > I'm still concerned that we're treating perf specially here -- are we > absolutely sure that nobody else is going to attempt user accesses off the > back of an interrupt? Reasonably sure? If nothing else, an interrupt taken while mmap_sem() is held for write that tries to access user memory is asking for serious trouble. There are still a few callers of pagefault_disable() and copy...inatomic(), though.
> If not, then I'd much prefer a solution that catches > anybody doing that with the EFI page table installed, rather than trying > to play whack-a-mole like this. Using a kernel thread solves the problem for real. Anything that blindly accesses user memory in kernel thread context is terminally broken no matter what. > > Will
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Thu, Aug 17, 2017 at 3:35 AM, Will Deacon wrote: > On Tue, Aug 15, 2017 at 11:35:41PM +0100, Mark Rutland wrote: >> On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote: >> > On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming >> > wrote: >> > > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote: >> > >> >> > >> I'd expect we'd abort at a higher level, not taking any sample. i.e. >> > >> we'd have the core overflow handler check in_funny_mm(), and if so, skip >> > >> the sample, as with the skid case. >> > > >> > > FYI, this is my preferred solution for x86 too. >> > >> > One option for the "funny mm" flag would be literally the condition >> > current->mm != current->active_mm. I *think* this gets all the cases >> > right as long as efi_switch_mm is careful with its ordering and that >> > the arch switch_mm() code can handle the resulting ordering. (x86's >> > can now, I think, or at least will be able to in 4.14 -- not sure >> > about other arches). >> >> For arm64 we'd have to rework things a bit to get the ordering right >> (especially when we flip to/from the idmap), but otherwise this sounds sane >> to >> me. >> >> > That being said, there's a totally different solution: run EFI >> > callbacks in a kernel thread. This has other benefits: we could run >> > those callbacks in user mode some day, and doing *that* in a user >> > thread seems like a mistake. >> >> I think that wouldn't work for CPU-bound perf events (which are not >> ctx-switched with the task). >> >> It might be desireable to do that anyway, though. > > I'm still concerned that we're treating perf specially here -- are we > absolutely sure that nobody else is going to attempt user accesses off the > back of an interrupt? Reasonably sure? If nothing else, an interrupt taken while mmap_sem() is held for write that tries to access user memory is asking for serious trouble. There are still a few callers of pagefault_disable() and copy...inatomic(), though. 
> If not, then I'd much prefer a solution that catches > anybody doing that with the EFI page table installed, rather than trying > to play whack-a-mole like this. Using a kernel thread solves the problem for real. Anything that blindly accesses user memory in kernel thread context is terminally broken no matter what. > > Will
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Tue, Aug 15, 2017 at 11:35:41PM +0100, Mark Rutland wrote: > On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote: > > On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming> > wrote: > > > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote: > > >> > > >> I'd expect we'd abort at a higher level, not taking any sample. i.e. > > >> we'd have the core overflow handler check in_funny_mm(), and if so, skip > > >> the sample, as with the skid case. > > > > > > FYI, this is my preferred solution for x86 too. > > > > One option for the "funny mm" flag would be literally the condition > > current->mm != current->active_mm. I *think* this gets all the cases > > right as long as efi_switch_mm is careful with its ordering and that > > the arch switch_mm() code can handle the resulting ordering. (x86's > > can now, I think, or at least will be able to in 4.14 -- not sure > > about other arches). > > For arm64 we'd have to rework things a bit to get the ordering right > (especially when we flip to/from the idmap), but otherwise this sounds sane to > me. > > > That being said, there's a totally different solution: run EFI > > callbacks in a kernel thread. This has other benefits: we could run > > those callbacks in user mode some day, and doing *that* in a user > > thread seems like a mistake. > > I think that wouldn't work for CPU-bound perf events (which are not > ctx-switched with the task). > > It might be desireable to do that anyway, though. I'm still concerned that we're treating perf specially here -- are we absolutely sure that nobody else is going to attempt user accesses off the back of an interrupt? If not, then I'd much prefer a solution that catches anybody doing that with the EFI page table installed, rather than trying to play whack-a-mole like this. Will
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Tue, Aug 15, 2017 at 11:35:41PM +0100, Mark Rutland wrote: > On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote: > > On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming > > wrote: > > > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote: > > >> > > >> I'd expect we'd abort at a higher level, not taking any sample. i.e. > > >> we'd have the core overflow handler check in_funny_mm(), and if so, skip > > >> the sample, as with the skid case. > > > > > > FYI, this is my preferred solution for x86 too. > > > > One option for the "funny mm" flag would be literally the condition > > current->mm != current->active_mm. I *think* this gets all the cases > > right as long as efi_switch_mm is careful with its ordering and that > > the arch switch_mm() code can handle the resulting ordering. (x86's > > can now, I think, or at least will be able to in 4.14 -- not sure > > about other arches). > > For arm64 we'd have to rework things a bit to get the ordering right > (especially when we flip to/from the idmap), but otherwise this sounds sane to > me. > > > That being said, there's a totally different solution: run EFI > > callbacks in a kernel thread. This has other benefits: we could run > > those callbacks in user mode some day, and doing *that* in a user > > thread seems like a mistake. > > I think that wouldn't work for CPU-bound perf events (which are not > ctx-switched with the task). > > It might be desireable to do that anyway, though. I'm still concerned that we're treating perf specially here -- are we absolutely sure that nobody else is going to attempt user accesses off the back of an interrupt? If not, then I'd much prefer a solution that catches anybody doing that with the EFI page table installed, rather than trying to play whack-a-mole like this. Will
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote: > On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming> wrote: > > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote: > >> > >> I'd expect we'd abort at a higher level, not taking any sample. i.e. > >> we'd have the core overflow handler check in_funny_mm(), and if so, skip > >> the sample, as with the skid case. > > > > FYI, this is my preferred solution for x86 too. > > One option for the "funny mm" flag would be literally the condition > current->mm != current->active_mm. I *think* this gets all the cases > right as long as efi_switch_mm is careful with its ordering and that > the arch switch_mm() code can handle the resulting ordering. (x86's > can now, I think, or at least will be able to in 4.14 -- not sure > about other arches). For arm64 we'd have to rework things a bit to get the ordering right (especially when we flip to/from the idmap), but otherwise this sounds sane to me. > That being said, there's a totally different solution: run EFI > callbacks in a kernel thread. This has other benefits: we could run > those callbacks in user mode some day, and doing *that* in a user > thread seems like a mistake. I think that wouldn't work for CPU-bound perf events (which are not ctx-switched with the task). It might be desireable to do that anyway, though. Thanks, Mark.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 16, 2017 at 09:14:41AM -0700, Andy Lutomirski wrote: > On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming > wrote: > > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote: > >> > >> I'd expect we'd abort at a higher level, not taking any sample. i.e. > >> we'd have the core overflow handler check in_funny_mm(), and if so, skip > >> the sample, as with the skid case. > > > > FYI, this is my preferred solution for x86 too. > > One option for the "funny mm" flag would be literally the condition > current->mm != current->active_mm. I *think* this gets all the cases > right as long as efi_switch_mm is careful with its ordering and that > the arch switch_mm() code can handle the resulting ordering. (x86's > can now, I think, or at least will be able to in 4.14 -- not sure > about other arches). For arm64 we'd have to rework things a bit to get the ordering right (especially when we flip to/from the idmap), but otherwise this sounds sane to me. > That being said, there's a totally different solution: run EFI > callbacks in a kernel thread. This has other benefits: we could run > those callbacks in user mode some day, and doing *that* in a user > thread seems like a mistake. I think that wouldn't work for CPU-bound perf events (which are not ctx-switched with the task). It might be desireable to do that anyway, though. Thanks, Mark.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming wrote: > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote: >> >> I'd expect we'd abort at a higher level, not taking any sample. i.e. >> we'd have the core overflow handler check in_funny_mm(), and if so, skip >> the sample, as with the skid case. > > FYI, this is my preferred solution for x86 too. One option for the "funny mm" flag would be literally the condition current->mm != current->active_mm. I *think* this gets all the cases right as long as efi_switch_mm is careful with its ordering and that the arch switch_mm() code can handle the resulting ordering. (x86's can now, I think, or at least will be able to in 4.14 -- not sure about other arches). That being said, there's a totally different solution: run EFI callbacks in a kernel thread. This has other benefits: we could run those callbacks in user mode some day, and doing *that* in a user thread seems like a mistake. --Andy
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 16, 2017 at 5:57 AM, Matt Fleming wrote: > On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote: >> >> I'd expect we'd abort at a higher level, not taking any sample. i.e. >> we'd have the core overflow handler check in_funny_mm(), and if so, skip >> the sample, as with the skid case. > > FYI, this is my preferred solution for x86 too. One option for the "funny mm" flag would be literally the condition current->mm != current->active_mm. I *think* this gets all the cases right as long as efi_switch_mm is careful with its ordering and that the arch switch_mm() code can handle the resulting ordering. (x86's can now, I think, or at least will be able to in 4.14 -- not sure about other arches). That being said, there's a totally different solution: run EFI callbacks in a kernel thread. This has other benefits: we could run those callbacks in user mode some day, and doing *that* in a user thread seems like a mistake. --Andy
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote: > > I'd expect we'd abort at a higher level, not taking any sample. i.e. > we'd have the core overflow handler check in_funny_mm(), and if so, skip > the sample, as with the skid case. FYI, this is my preferred solution for x86 too.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, 16 Aug, at 12:03:22PM, Mark Rutland wrote: > > I'd expect we'd abort at a higher level, not taking any sample. i.e. > we'd have the core overflow handler check in_funny_mm(), and if so, skip > the sample, as with the skid case. FYI, this is my preferred solution for x86 too.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 16, 2017 at 11:07:10AM +0100, Will Deacon wrote: > On Wed, Aug 16, 2017 at 10:53:38AM +0100, Mark Rutland wrote: > > On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote: > > > (+ Mark, Will) > > > > > > On 15 August 2017 at 22:46, Andy Lutomirskiwrote: > > > > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya > > > > wrote: > > > >> +/* > > > >> + * Makes the calling kernel thread switch to/from efi_mm context > > > >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls > > > >> + * (Note: This routine is heavily inspired from use_mm) > > > >> + */ > > > >> +void efi_switch_mm(struct mm_struct *mm) > > > >> +{ > > > >> + struct task_struct *tsk = current; > > > >> + > > > >> + task_lock(tsk); > > > >> + efi_scratch.prev_mm = tsk->active_mm; > > > >> + if (efi_scratch.prev_mm != mm) { > > > >> + mmgrab(mm); > > > >> + tsk->active_mm = mm; > > > >> + } > > > >> + switch_mm(efi_scratch.prev_mm, mm, NULL); > > > >> + task_unlock(tsk); > > > >> + > > > >> + if (efi_scratch.prev_mm != mm) > > > >> + mmdrop(efi_scratch.prev_mm); > > > > > > > > I'm confused. You're mmdropping an mm that you are still keeping a > > > > pointer to. This is also a bit confusing in the case where you do > > > > efi_switch_mm(efi_scratch.prev_mm). > > > > > > > > This whole manipulation seems fairly dangerous to me for another > > > > reason -- you're taking a user thread (I think) and swapping out its > > > > mm to something that the user in question should *not* have access to. > > > > What if a perf interrupt happens while you're in the alternate mm? > > > > What if you segfault and dump core? Should we maybe just have a flag > > > > that says "this cpu is using a funny mm", assert that the flag is > > > > clear when scheduling, and teach perf, coredumps, etc not to touch > > > > user memory when the flag is set? 
> > > > > > It appears we may have introduced this exact issue on arm64 and ARM by > > > starting to run the UEFI runtime services with interrupts enabled. > > > (perf does not use NMI on ARM, so the issue did not exist beforehand) > > > > > > Mark, Will, any thoughts? > > > > Yup, I can cause perf to take samples from the EFI FW code, so that's > > less than ideal. > > But that should only happen if you're profiling EL1, right, which needs > root privileges? (assuming the skid issue is solved -- not sure what > happened to those patches after they broke criu). I *think* that only needs perf_event_paranoid < 1, rather than root. It's certainly not accessible by default to most users (e.g. my Ubuntu fs sets this to 2, and IIRC Debian go to a much more stringent non-upstream paranoid level). > > The "funny mm" flag sounds like a good idea to me, though given recent > > pain with sampling in the case of skid, I don't know exactly what we > > should do if/when we take an overflow interrupt while in EFI. > > I don't think special-casing perf interrupts is the right thing to do here. > If we're concerned about user-accesses being made off the back of interrupts > taken whilst in EFI, then we should probably either swizzle back in the > user page table on the IRQ path or postpone handling it until we're done > with the firmware. Doing that for every IRQ feels odd, especially as the result would be sampling something that wasn't executed, potentially repeatedly, giving bogus info. > Having a flag feels a bit weird: would the uaccess routines return > -EFAULT if it's set? I'd expect we'd abort at a higher level, not taking any sample. i.e. we'd have the core overflow handler check in_funny_mm(), and if so, skip the sample, as with the skid case. Thanks, Mark.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 16, 2017 at 11:07:10AM +0100, Will Deacon wrote: > On Wed, Aug 16, 2017 at 10:53:38AM +0100, Mark Rutland wrote: > > On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote: > > > (+ Mark, Will) > > > > > > On 15 August 2017 at 22:46, Andy Lutomirski wrote: > > > > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya > > > > wrote: > > > >> +/* > > > >> + * Makes the calling kernel thread switch to/from efi_mm context > > > >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls > > > >> + * (Note: This routine is heavily inspired from use_mm) > > > >> + */ > > > >> +void efi_switch_mm(struct mm_struct *mm) > > > >> +{ > > > >> + struct task_struct *tsk = current; > > > >> + > > > >> + task_lock(tsk); > > > >> + efi_scratch.prev_mm = tsk->active_mm; > > > >> + if (efi_scratch.prev_mm != mm) { > > > >> + mmgrab(mm); > > > >> + tsk->active_mm = mm; > > > >> + } > > > >> + switch_mm(efi_scratch.prev_mm, mm, NULL); > > > >> + task_unlock(tsk); > > > >> + > > > >> + if (efi_scratch.prev_mm != mm) > > > >> + mmdrop(efi_scratch.prev_mm); > > > > > > > > I'm confused. You're mmdropping an mm that you are still keeping a > > > > pointer to. This is also a bit confusing in the case where you do > > > > efi_switch_mm(efi_scratch.prev_mm). > > > > > > > > This whole manipulation seems fairly dangerous to me for another > > > > reason -- you're taking a user thread (I think) and swapping out its > > > > mm to something that the user in question should *not* have access to. > > > > What if a perf interrupt happens while you're in the alternate mm? > > > > What if you segfault and dump core? Should we maybe just have a flag > > > > that says "this cpu is using a funny mm", assert that the flag is > > > > clear when scheduling, and teach perf, coredumps, etc not to touch > > > > user memory when the flag is set? 
> > > > > > It appears we may have introduced this exact issue on arm64 and ARM by > > > starting to run the UEFI runtime services with interrupts enabled. > > > (perf does not use NMI on ARM, so the issue did not exist beforehand) > > > > > > Mark, Will, any thoughts? > > > > Yup, I can cause perf to take samples from the EFI FW code, so that's > > less than ideal. > > But that should only happen if you're profiling EL1, right, which needs > root privileges? (assuming the skid issue is solved -- not sure what > happened to those patches after they broke criu). I *think* that only needs perf_event_paranoid < 1, rather than root. It's certainly not accessible by default to most users (e.g. my Ubuntu fs sets this to 2, and IIRC Debian go to a much more stringent non-upstream paranoid level). > > The "funny mm" flag sounds like a good idea to me, though given recent > > pain with sampling in the case of skid, I don't know exactly what we > > should do if/when we take an overflow interrupt while in EFI. > > I don't think special-casing perf interrupts is the right thing to do here. > If we're concerned about user-accesses being made off the back of interrupts > taken whilst in EFI, then we should probably either swizzle back in the > user page table on the IRQ path or postpone handling it until we're done > with the firmware. Doing that for every IRQ feels odd, especially as the result would be sampling something that wasn't executed, potentially repeatedly, giving bogus info. > Having a flag feels a bit weird: would the uaccess routines return > -EFAULT if it's set? I'd expect we'd abort at a higher level, not taking any sample. i.e. we'd have the core overflow handler check in_funny_mm(), and if so, skip the sample, as with the skid case. Thanks, Mark.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 16, 2017 at 10:53:38AM +0100, Mark Rutland wrote: > On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote: > > (+ Mark, Will) > > > > On 15 August 2017 at 22:46, Andy Lutomirskiwrote: > > > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya > > > wrote: > > >> +/* > > >> + * Makes the calling kernel thread switch to/from efi_mm context > > >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls > > >> + * (Note: This routine is heavily inspired from use_mm) > > >> + */ > > >> +void efi_switch_mm(struct mm_struct *mm) > > >> +{ > > >> + struct task_struct *tsk = current; > > >> + > > >> + task_lock(tsk); > > >> + efi_scratch.prev_mm = tsk->active_mm; > > >> + if (efi_scratch.prev_mm != mm) { > > >> + mmgrab(mm); > > >> + tsk->active_mm = mm; > > >> + } > > >> + switch_mm(efi_scratch.prev_mm, mm, NULL); > > >> + task_unlock(tsk); > > >> + > > >> + if (efi_scratch.prev_mm != mm) > > >> + mmdrop(efi_scratch.prev_mm); > > > > > > I'm confused. You're mmdropping an mm that you are still keeping a > > > pointer to. This is also a bit confusing in the case where you do > > > efi_switch_mm(efi_scratch.prev_mm). > > > > > > This whole manipulation seems fairly dangerous to me for another > > > reason -- you're taking a user thread (I think) and swapping out its > > > mm to something that the user in question should *not* have access to. > > > What if a perf interrupt happens while you're in the alternate mm? > > > What if you segfault and dump core? Should we maybe just have a flag > > > that says "this cpu is using a funny mm", assert that the flag is > > > clear when scheduling, and teach perf, coredumps, etc not to touch > > > user memory when the flag is set? > > > > It appears we may have introduced this exact issue on arm64 and ARM by > > starting to run the UEFI runtime services with interrupts enabled. > > (perf does not use NMI on ARM, so the issue did not exist beforehand) > > > > Mark, Will, any thoughts? 
> > Yup, I can cause perf to take samples from the EFI FW code, so that's > less than ideal. But that should only happen if you're profiling EL1, right, which needs root privileges? (assuming the skid issue is solved -- not sure what happened to those patches after they broke criu). > The "funny mm" flag sounds like a good idea to me, though given recent > pain with sampling in the case of skid, I don't know exactly what we > should do if/when we take an overflow interrupt while in EFI. I don't think special-casing perf interrupts is the right thing to do here. If we're concerned about user-accesses being made off the back of interrupts taken whilst in EFI, then we should probably either swizzle back in the user page table on the IRQ path or postpone handling it until we're done with the firmware. Having a flag feels a bit weird: would the uaccess routines return -EFAULT if it's set? Will
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 16, 2017 at 10:53:38AM +0100, Mark Rutland wrote: > On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote: > > (+ Mark, Will) > > > > On 15 August 2017 at 22:46, Andy Lutomirski wrote: > > > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya > > > wrote: > > >> +/* > > >> + * Makes the calling kernel thread switch to/from efi_mm context > > >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls > > >> + * (Note: This routine is heavily inspired from use_mm) > > >> + */ > > >> +void efi_switch_mm(struct mm_struct *mm) > > >> +{ > > >> + struct task_struct *tsk = current; > > >> + > > >> + task_lock(tsk); > > >> + efi_scratch.prev_mm = tsk->active_mm; > > >> + if (efi_scratch.prev_mm != mm) { > > >> + mmgrab(mm); > > >> + tsk->active_mm = mm; > > >> + } > > >> + switch_mm(efi_scratch.prev_mm, mm, NULL); > > >> + task_unlock(tsk); > > >> + > > >> + if (efi_scratch.prev_mm != mm) > > >> + mmdrop(efi_scratch.prev_mm); > > > > > > I'm confused. You're mmdropping an mm that you are still keeping a > > > pointer to. This is also a bit confusing in the case where you do > > > efi_switch_mm(efi_scratch.prev_mm). > > > > > > This whole manipulation seems fairly dangerous to me for another > > > reason -- you're taking a user thread (I think) and swapping out its > > > mm to something that the user in question should *not* have access to. > > > What if a perf interrupt happens while you're in the alternate mm? > > > What if you segfault and dump core? Should we maybe just have a flag > > > that says "this cpu is using a funny mm", assert that the flag is > > > clear when scheduling, and teach perf, coredumps, etc not to touch > > > user memory when the flag is set? > > > > It appears we may have introduced this exact issue on arm64 and ARM by > > starting to run the UEFI runtime services with interrupts enabled. > > (perf does not use NMI on ARM, so the issue did not exist beforehand) > > > > Mark, Will, any thoughts? 
> > Yup, I can cause perf to take samples from the EFI FW code, so that's > less than ideal. But that should only happen if you're profiling EL1, right, which needs root privileges? (assuming the skid issue is solved -- not sure what happened to those patches after they broke criu). > The "funny mm" flag sounds like a good idea to me, though given recent > pain with sampling in the case of skid, I don't know exactly what we > should do if/when we take an overflow interrupt while in EFI. I don't think special-casing perf interrupts is the right thing to do here. If we're concerned about user-accesses being made off the back of interrupts taken whilst in EFI, then we should probably either swizzle back in the user page table on the IRQ path or postpone handling it until we're done with the firmware. Having a flag feels a bit weird: would the uaccess routines return -EFAULT if it's set? Will
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote: > (+ Mark, Will) > > On 15 August 2017 at 22:46, Andy Lutomirskiwrote: > > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya > > wrote: > >> +/* > >> + * Makes the calling kernel thread switch to/from efi_mm context > >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls > >> + * (Note: This routine is heavily inspired from use_mm) > >> + */ > >> +void efi_switch_mm(struct mm_struct *mm) > >> +{ > >> + struct task_struct *tsk = current; > >> + > >> + task_lock(tsk); > >> + efi_scratch.prev_mm = tsk->active_mm; > >> + if (efi_scratch.prev_mm != mm) { > >> + mmgrab(mm); > >> + tsk->active_mm = mm; > >> + } > >> + switch_mm(efi_scratch.prev_mm, mm, NULL); > >> + task_unlock(tsk); > >> + > >> + if (efi_scratch.prev_mm != mm) > >> + mmdrop(efi_scratch.prev_mm); > > > > I'm confused. You're mmdropping an mm that you are still keeping a > > pointer to. This is also a bit confusing in the case where you do > > efi_switch_mm(efi_scratch.prev_mm). > > > > This whole manipulation seems fairly dangerous to me for another > > reason -- you're taking a user thread (I think) and swapping out its > > mm to something that the user in question should *not* have access to. > > What if a perf interrupt happens while you're in the alternate mm? > > What if you segfault and dump core? Should we maybe just have a flag > > that says "this cpu is using a funny mm", assert that the flag is > > clear when scheduling, and teach perf, coredumps, etc not to touch > > user memory when the flag is set? > > It appears we may have introduced this exact issue on arm64 and ARM by > starting to run the UEFI runtime services with interrupts enabled. > (perf does not use NMI on ARM, so the issue did not exist beforehand) > > Mark, Will, any thoughts? Yup, I can cause perf to take samples from the EFI FW code, so that's less than ideal. 
The "funny mm" flag sounds like a good idea to me, though given recent pain with sampling in the case of skid, I don't know exactly what we should do if/when we take an overflow interrupt while in EFI. Mark.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Wed, Aug 16, 2017 at 10:31:12AM +0100, Ard Biesheuvel wrote: > (+ Mark, Will) > > On 15 August 2017 at 22:46, Andy Lutomirski wrote: > > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya > > wrote: > >> +/* > >> + * Makes the calling kernel thread switch to/from efi_mm context > >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls > >> + * (Note: This routine is heavily inspired from use_mm) > >> + */ > >> +void efi_switch_mm(struct mm_struct *mm) > >> +{ > >> + struct task_struct *tsk = current; > >> + > >> + task_lock(tsk); > >> + efi_scratch.prev_mm = tsk->active_mm; > >> + if (efi_scratch.prev_mm != mm) { > >> + mmgrab(mm); > >> + tsk->active_mm = mm; > >> + } > >> + switch_mm(efi_scratch.prev_mm, mm, NULL); > >> + task_unlock(tsk); > >> + > >> + if (efi_scratch.prev_mm != mm) > >> + mmdrop(efi_scratch.prev_mm); > > > > I'm confused. You're mmdropping an mm that you are still keeping a > > pointer to. This is also a bit confusing in the case where you do > > efi_switch_mm(efi_scratch.prev_mm). > > > > This whole manipulation seems fairly dangerous to me for another > > reason -- you're taking a user thread (I think) and swapping out its > > mm to something that the user in question should *not* have access to. > > What if a perf interrupt happens while you're in the alternate mm? > > What if you segfault and dump core? Should we maybe just have a flag > > that says "this cpu is using a funny mm", assert that the flag is > > clear when scheduling, and teach perf, coredumps, etc not to touch > > user memory when the flag is set? > > It appears we may have introduced this exact issue on arm64 and ARM by > starting to run the UEFI runtime services with interrupts enabled. > (perf does not use NMI on ARM, so the issue did not exist beforehand) > > Mark, Will, any thoughts? Yup, I can cause perf to take samples from the EFI FW code, so that's less than ideal. 
The "funny mm" flag sounds like a good idea to me, though given recent pain with sampling in the case of skid, I don't know exactly what we should do if/when we take an overflow interrupt while in EFI. Mark.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
(+ Mark, Will) On 15 August 2017 at 22:46, Andy Lutomirski wrote: > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya > > wrote: >> +/* >> + * Makes the calling kernel thread switch to/from efi_mm context >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls >> + * (Note: This routine is heavily inspired from use_mm) >> + */ >> +void efi_switch_mm(struct mm_struct *mm) >> +{ >> + struct task_struct *tsk = current; >> + >> + task_lock(tsk); >> + efi_scratch.prev_mm = tsk->active_mm; >> + if (efi_scratch.prev_mm != mm) { >> + mmgrab(mm); >> + tsk->active_mm = mm; >> + } >> + switch_mm(efi_scratch.prev_mm, mm, NULL); >> + task_unlock(tsk); >> + >> + if (efi_scratch.prev_mm != mm) >> + mmdrop(efi_scratch.prev_mm); > > I'm confused. You're mmdropping an mm that you are still keeping a > pointer to. This is also a bit confusing in the case where you do > efi_switch_mm(efi_scratch.prev_mm). > > This whole manipulation seems fairly dangerous to me for another > reason -- you're taking a user thread (I think) and swapping out its > mm to something that the user in question should *not* have access to. > What if a perf interrupt happens while you're in the alternate mm? > What if you segfault and dump core? Should we maybe just have a flag > that says "this cpu is using a funny mm", assert that the flag is > clear when scheduling, and teach perf, coredumps, etc not to touch > user memory when the flag is set? > It appears we may have introduced this exact issue on arm64 and ARM by starting to run the UEFI runtime services with interrupts enabled. (perf does not use NMI on ARM, so the issue did not exist beforehand) Mark, Will, any thoughts?
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
(+ Mark, Will) On 15 August 2017 at 22:46, Andy Lutomirski wrote: > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya > wrote: >> +/* >> + * Makes the calling kernel thread switch to/from efi_mm context >> + * Can be used from SetVirtualAddressMap() or during efi runtime calls >> + * (Note: This routine is heavily inspired from use_mm) >> + */ >> +void efi_switch_mm(struct mm_struct *mm) >> +{ >> + struct task_struct *tsk = current; >> + >> + task_lock(tsk); >> + efi_scratch.prev_mm = tsk->active_mm; >> + if (efi_scratch.prev_mm != mm) { >> + mmgrab(mm); >> + tsk->active_mm = mm; >> + } >> + switch_mm(efi_scratch.prev_mm, mm, NULL); >> + task_unlock(tsk); >> + >> + if (efi_scratch.prev_mm != mm) >> + mmdrop(efi_scratch.prev_mm); > > I'm confused. You're mmdropping an mm that you are still keeping a > pointer to. This is also a bit confusing in the case where you do > efi_switch_mm(efi_scratch.prev_mm). > > This whole manipulation seems fairly dangerous to me for another > reason -- you're taking a user thread (I think) and swapping out its > mm to something that the user in question should *not* have access to. > What if a perf interrupt happens while you're in the alternate mm? > What if you segfault and dump core? Should we maybe just have a flag > that says "this cpu is using a funny mm", assert that the flag is > clear when scheduling, and teach perf, coredumps, etc not to touch > user memory when the flag is set? > It appears we may have introduced this exact issue on arm64 and ARM by starting to run the UEFI runtime services with interrupts enabled. (perf does not use NMI on ARM, so the issue did not exist beforehand) Mark, Will, any thoughts?
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Tue, Aug 15, 2017 at 5:23 PM, Sai Praneeth Prakhyawrote: > On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote: >> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya >> wrote: >> > +/* >> > + * Makes the calling kernel thread switch to/from efi_mm context >> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls >> > + * (Note: This routine is heavily inspired from use_mm) >> > + */ >> > +void efi_switch_mm(struct mm_struct *mm) >> > +{ >> > + struct task_struct *tsk = current; >> > + >> > + task_lock(tsk); >> > + efi_scratch.prev_mm = tsk->active_mm; >> > + if (efi_scratch.prev_mm != mm) { >> > + mmgrab(mm); >> > + tsk->active_mm = mm; >> > + } >> > + switch_mm(efi_scratch.prev_mm, mm, NULL); >> > + task_unlock(tsk); >> > + >> > + if (efi_scratch.prev_mm != mm) >> > + mmdrop(efi_scratch.prev_mm); >> > > Thanks for the quick review Andy, > >> I'm confused. You're mmdropping an mm that you are still keeping a >> pointer to. This is also a bit confusing in the case where you do >> efi_switch_mm(efi_scratch.prev_mm). >> > > This makes sense, I will look into it. > >> This whole manipulation seems fairly dangerous to me for another >> reason -- you're taking a user thread (I think) and swapping out its >> mm to something that the user in question should *not* have access to. > > We are switching to efi_mm from user mm_struct because > EFI_RUNTIME_SERVICES like efi_set_variable()/efi_get_variable() are > accessible only through efi_pgd. The user thread calls ioctl() which in > turn calls efi_call() and thus efi_switch_mm(). So, I think, the user > still does not have direct access to EFI_RUNTIME_SERVICES memory regions > but accesses them through sys call. > >> What if a perf interrupt happens while you're in the alternate mm? > > Since we are disabling/enabling interrupts around switching, I think we > are safe. 
We do these in following functions > phys_efi_set_virtual_address_map() > efi_thunk_set_virtual_address_map() > efi_call_virt_pointer() perf uses NMI, so this doesn't help. Perhaps the sequence could look like this: local_irq_disable(); current->active_mm = efi_mm; switch_to(); ... switch_to(back to old mm); current->active_mm = old mm; and make perf know that current->active_mm != current->mm means that user memory is off limits.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Tue, Aug 15, 2017 at 5:23 PM, Sai Praneeth Prakhya wrote: > On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote: >> On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya >> wrote: >> > +/* >> > + * Makes the calling kernel thread switch to/from efi_mm context >> > + * Can be used from SetVirtualAddressMap() or during efi runtime calls >> > + * (Note: This routine is heavily inspired from use_mm) >> > + */ >> > +void efi_switch_mm(struct mm_struct *mm) >> > +{ >> > + struct task_struct *tsk = current; >> > + >> > + task_lock(tsk); >> > + efi_scratch.prev_mm = tsk->active_mm; >> > + if (efi_scratch.prev_mm != mm) { >> > + mmgrab(mm); >> > + tsk->active_mm = mm; >> > + } >> > + switch_mm(efi_scratch.prev_mm, mm, NULL); >> > + task_unlock(tsk); >> > + >> > + if (efi_scratch.prev_mm != mm) >> > + mmdrop(efi_scratch.prev_mm); >> > > Thanks for the quick review Andy, > >> I'm confused. You're mmdropping an mm that you are still keeping a >> pointer to. This is also a bit confusing in the case where you do >> efi_switch_mm(efi_scratch.prev_mm). >> > > This makes sense, I will look into it. > >> This whole manipulation seems fairly dangerous to me for another >> reason -- you're taking a user thread (I think) and swapping out its >> mm to something that the user in question should *not* have access to. > > We are switching to efi_mm from user mm_struct because > EFI_RUNTIME_SERVICES like efi_set_variable()/efi_get_variable() are > accessible only through efi_pgd. The user thread calls ioctl() which in > turn calls efi_call() and thus efi_switch_mm(). So, I think, the user > still does not have direct access to EFI_RUNTIME_SERVICES memory regions > but accesses them through sys call. > >> What if a perf interrupt happens while you're in the alternate mm? > > Since we are disabling/enabling interrupts around switching, I think we > are safe. 
We do these in following functions > phys_efi_set_virtual_address_map() > efi_thunk_set_virtual_address_map() > efi_call_virt_pointer() perf uses NMI, so this doesn't help. Perhaps the sequence could look like this: local_irq_disable(); current->active_mm = efi_mm; switch_to(); ... switch_to(back to old mm); current->active_mm = old mm; and make perf know that current->active_mm != current->mm means that user memory is off limits.
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote: > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya >wrote: > > +/* > > + * Makes the calling kernel thread switch to/from efi_mm context > > + * Can be used from SetVirtualAddressMap() or during efi runtime calls > > + * (Note: This routine is heavily inspired from use_mm) > > + */ > > +void efi_switch_mm(struct mm_struct *mm) > > +{ > > + struct task_struct *tsk = current; > > + > > + task_lock(tsk); > > + efi_scratch.prev_mm = tsk->active_mm; > > + if (efi_scratch.prev_mm != mm) { > > + mmgrab(mm); > > + tsk->active_mm = mm; > > + } > > + switch_mm(efi_scratch.prev_mm, mm, NULL); > > + task_unlock(tsk); > > + > > + if (efi_scratch.prev_mm != mm) > > + mmdrop(efi_scratch.prev_mm); > Thanks for the quick review Andy, > I'm confused. You're mmdropping an mm that you are still keeping a > pointer to. This is also a bit confusing in the case where you do > efi_switch_mm(efi_scratch.prev_mm). > This makes sense, I will look into it. > This whole manipulation seems fairly dangerous to me for another > reason -- you're taking a user thread (I think) and swapping out its > mm to something that the user in question should *not* have access to. We are switching to efi_mm from user mm_struct because EFI_RUNTIME_SERVICES like efi_set_variable()/efi_get_variable() are accessible only through efi_pgd. The user thread calls ioctl() which in turn calls efi_call() and thus efi_switch_mm(). So, I think, the user still does not have direct access to EFI_RUNTIME_SERVICES memory regions but accesses them through sys call. > What if a perf interrupt happens while you're in the alternate mm? Since we are disabling/enabling interrupts around switching, I think we are safe. We do these in following functions phys_efi_set_virtual_address_map() efi_thunk_set_virtual_address_map() efi_call_virt_pointer() > What if you segfault and dump core? We could seg fault only if firmware touches regions which it shouldn't. i.e. 
Firmware touching regions outside EFI_RUNTIME_SERVICES (this is a UEFI Spec violation). So, in this case of buggy firmware, we panic (this is an existing problem). We also map EFI_BOOT_TIME_SERVICES into efi_pgd because we know some buggy firmware touches these regions. > Should we maybe just have a flag > that says "this cpu is using a funny mm", assert that the flag is > clear when scheduling, and teach perf, coredumps, etc not to touch > user memory when the flag is set? > > Admittedly, the latter problem may well have existed even before these > patches. Please let me know if you think otherwise. Matt, Please feel free to correct my understanding. Regards, Sai
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with cr3
On Tue, 2017-08-15 at 14:46 -0700, Andy Lutomirski wrote: > On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya > wrote: > > +/* > > + * Makes the calling kernel thread switch to/from efi_mm context > > + * Can be used from SetVirtualAddressMap() or during efi runtime calls > > + * (Note: This routine is heavily inspired from use_mm) > > + */ > > +void efi_switch_mm(struct mm_struct *mm) > > +{ > > + struct task_struct *tsk = current; > > + > > + task_lock(tsk); > > + efi_scratch.prev_mm = tsk->active_mm; > > + if (efi_scratch.prev_mm != mm) { > > + mmgrab(mm); > > + tsk->active_mm = mm; > > + } > > + switch_mm(efi_scratch.prev_mm, mm, NULL); > > + task_unlock(tsk); > > + > > + if (efi_scratch.prev_mm != mm) > > + mmdrop(efi_scratch.prev_mm); > Thanks for the quick review Andy, > I'm confused. You're mmdropping an mm that you are still keeping a > pointer to. This is also a bit confusing in the case where you do > efi_switch_mm(efi_scratch.prev_mm). > This makes sense, I will look into it. > This whole manipulation seems fairly dangerous to me for another > reason -- you're taking a user thread (I think) and swapping out its > mm to something that the user in question should *not* have access to. We are switching to efi_mm from user mm_struct because EFI_RUNTIME_SERVICES like efi_set_variable()/efi_get_variable() are accessible only through efi_pgd. The user thread calls ioctl() which in turn calls efi_call() and thus efi_switch_mm(). So, I think, the user still does not have direct access to EFI_RUNTIME_SERVICES memory regions but accesses them through sys call. > What if a perf interrupt happens while you're in the alternate mm? Since we are disabling/enabling interrupts around switching, I think we are safe. We do these in following functions phys_efi_set_virtual_address_map() efi_thunk_set_virtual_address_map() efi_call_virt_pointer() > What if you segfault and dump core? We could seg fault only if firmware touches regions which it shouldn't. i.e. 
Firmware touching regions outside EFI_RUNTIME_SERVICES (this is a UEFI Spec violation). So, in this case of buggy firmware, we panic (this is an existing problem). We also map EFI_BOOT_TIME_SERVICES into efi_pgd because we know some buggy firmware touches these regions. > Should we maybe just have a flag > that says "this cpu is using a funny mm", assert that the flag is > clear when scheduling, and teach perf, coredumps, etc not to touch > user memory when the flag is set? > > Admittedly, the latter problem may well have existed even before these > patches. Please let me know if you think otherwise. Matt, Please feel free to correct my understanding. Regards, Sai
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with %cr3
On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya wrote: > +/* > + * Makes the calling kernel thread switch to/from efi_mm context > + * Can be used from SetVirtualAddressMap() or during efi runtime calls > + * (Note: This routine is heavily inspired from use_mm) > + */ > +void efi_switch_mm(struct mm_struct *mm) > +{ > + struct task_struct *tsk = current; > + > + task_lock(tsk); > + efi_scratch.prev_mm = tsk->active_mm; > + if (efi_scratch.prev_mm != mm) { > + mmgrab(mm); > + tsk->active_mm = mm; > + } > + switch_mm(efi_scratch.prev_mm, mm, NULL); > + task_unlock(tsk); > + > + if (efi_scratch.prev_mm != mm) > + mmdrop(efi_scratch.prev_mm); I'm confused. You're mmdropping an mm that you are still keeping a pointer to. This is also a bit confusing in the case where you do efi_switch_mm(efi_scratch.prev_mm). This whole manipulation seems fairly dangerous to me for another reason -- you're taking a user thread (I think) and swapping out its mm to something that the user in question should *not* have access to. What if a perf interrupt happens while you're in the alternate mm? What if you segfault and dump core? Should we maybe just have a flag that says "this cpu is using a funny mm", assert that the flag is clear when scheduling, and teach perf, coredumps, etc not to touch user memory when the flag is set? Admittedly, the latter problem may well have existed even before these patches. 
> +} > + > #ifdef CONFIG_EFI_MIXED > extern efi_status_t efi64_thunk(u32, ...); > > @@ -649,16 +665,13 @@ efi_status_t efi_thunk_set_virtual_address_map( > efi_sync_low_kernel_mappings(); > local_irq_save(flags); > > - efi_scratch.prev_cr3 = read_cr3(); > - write_cr3((unsigned long)efi_scratch.efi_pgt); > - __flush_tlb_all(); > + efi_switch_mm(&efi_mm); > > func = (u32)(unsigned long)phys_set_virtual_address_map; > status = efi64_thunk(func, memory_map_size, descriptor_size, > descriptor_version, virtual_map); > > - write_cr3(efi_scratch.prev_cr3); > - __flush_tlb_all(); > + efi_switch_mm(efi_scratch.prev_mm); > local_irq_restore(flags); > > return status;
Re: [PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with %cr3
On Tue, Aug 15, 2017 at 12:18 PM, Sai Praneeth Prakhya wrote: > +/* > + * Makes the calling kernel thread switch to/from efi_mm context > + * Can be used from SetVirtualAddressMap() or during efi runtime calls > + * (Note: This routine is heavily inspired from use_mm) > + */ > +void efi_switch_mm(struct mm_struct *mm) > +{ > + struct task_struct *tsk = current; > + > + task_lock(tsk); > + efi_scratch.prev_mm = tsk->active_mm; > + if (efi_scratch.prev_mm != mm) { > + mmgrab(mm); > + tsk->active_mm = mm; > + } > + switch_mm(efi_scratch.prev_mm, mm, NULL); > + task_unlock(tsk); > + > + if (efi_scratch.prev_mm != mm) > + mmdrop(efi_scratch.prev_mm); I'm confused. You're mmdropping an mm that you are still keeping a pointer to. This is also a bit confusing in the case where you do efi_switch_mm(efi_scratch.prev_mm). This whole manipulation seems fairly dangerous to me for another reason -- you're taking a user thread (I think) and swapping out its mm to something that the user in question should *not* have access to. What if a perf interrupt happens while you're in the alternate mm? What if you segfault and dump core? Should we maybe just have a flag that says "this cpu is using a funny mm", assert that the flag is clear when scheduling, and teach perf, coredumps, etc not to touch user memory when the flag is set? Admittedly, the latter problem may well have existed even before these patches. 
> +} > + > #ifdef CONFIG_EFI_MIXED > extern efi_status_t efi64_thunk(u32, ...); > > @@ -649,16 +665,13 @@ efi_status_t efi_thunk_set_virtual_address_map( > efi_sync_low_kernel_mappings(); > local_irq_save(flags); > > - efi_scratch.prev_cr3 = read_cr3(); > - write_cr3((unsigned long)efi_scratch.efi_pgt); > - __flush_tlb_all(); > + efi_switch_mm(&efi_mm); > > func = (u32)(unsigned long)phys_set_virtual_address_map; > status = efi64_thunk(func, memory_map_size, descriptor_size, > descriptor_version, virtual_map); > > - write_cr3(efi_scratch.prev_cr3); > - __flush_tlb_all(); > + efi_switch_mm(efi_scratch.prev_mm); > local_irq_restore(flags); > > return status;
[PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with %cr3
From: Sai PraneethUse helper function (efi_switch_mm()) to switch to/from efi_mm. We switch to efi_mm before calling 1. efi_set_virtual_address_map() and 2. Invoking any efi_runtime_service() Likewise, we need to switch back to previous mm (mm context stolen by efi_mm) after the above calls return successfully. We can use efi_switch_mm() only with x86_64 kernel and "efi=old_map" disabled because, x86_32 and efi=old_map doesn't use efi_pgd, rather they use swapper_pg_dir. Signed-off-by: Sai Praneeth Prakhya Cc: Lee, Chun-Yi Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Michael S. Tsirkin Cc: Ricardo Neri Cc: Matt Fleming Cc: Ard Biesheuvel Cc: Ravi Shankar --- arch/x86/include/asm/efi.h | 30 +- arch/x86/platform/efi/efi_64.c | 41 arch/x86/platform/efi/efi_thunk_64.S | 2 +- 3 files changed, 43 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 2f77bcefe6b4..aa38b546e842 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,10 +1,14 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H +#include +#include + #include #include #include #include +#include /* * We map the EFI regions needed for runtime services non-contiguously, @@ -57,14 +61,14 @@ extern u64 asmlinkage efi_call(void *fp, ...); #define efi_call_phys(f, args...) 
efi_call((f), args) /* - * Scratch space used for switching the pagetable in the EFI stub + * struct efi_scratch - Scratch space used while switching to/from efi_mm + * @phys_stack:stack used during EFI Mixed Mode + * @prev_mm: store/restore stolen mm_struct while switching + * to/from efi_mm */ struct efi_scratch { - u64 r15; - u64 prev_cr3; - pgd_t *efi_pgt; - booluse_pgd; - u64 phys_stack; + u64 phys_stack; + struct mm_struct*prev_mm; } __packed; #define arch_efi_call_virt_setup() \ @@ -73,11 +77,8 @@ struct efi_scratch { preempt_disable(); \ __kernel_fpu_begin(); \ \ - if (efi_scratch.use_pgd) { \ - efi_scratch.prev_cr3 = read_cr3(); \ - write_cr3((unsigned long)efi_scratch.efi_pgt); \ - __flush_tlb_all(); \ - } \ + if (!efi_enabled(EFI_OLD_MEMMAP)) \ + efi_switch_mm(_mm); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -85,10 +86,8 @@ struct efi_scratch { #define arch_efi_call_virt_teardown() \ ({ \ - if (efi_scratch.use_pgd) { \ - write_cr3(efi_scratch.prev_cr3);\ - __flush_tlb_all(); \ - } \ + if (!efi_enabled(EFI_OLD_MEMMAP)) \ + efi_switch_mm(efi_scratch.prev_mm); \ \ __kernel_fpu_end(); \ preempt_enable(); \ @@ -130,6 +129,7 @@ extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); +extern void efi_switch_mm(struct mm_struct *mm); struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 0bb98c35e178..3be94480c1ce 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -80,9 +80,8 @@ pgd_t * __init efi_call_phys_prolog(void) int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { - save_pgd = (pgd_t *)read_cr3(); - write_cr3((unsigned long)efi_scratch.efi_pgt); - goto out; + efi_switch_mm(_mm); + return NULL; } early_code_mapping_set_exec(1); @@ -152,8 +151,7 @@ void __init
[PATCH 3/3] x86/efi: Use efi_switch_mm() rather than manually twiddling with %cr3
From: Sai Praneeth Use helper function (efi_switch_mm()) to switch to/from efi_mm. We switch to efi_mm before calling 1. efi_set_virtual_address_map() and 2. Invoking any efi_runtime_service() Likewise, we need to switch back to previous mm (mm context stolen by efi_mm) after the above calls return successfully. We can use efi_switch_mm() only with x86_64 kernel and "efi=old_map" disabled because, x86_32 and efi=old_map doesn't use efi_pgd, rather they use swapper_pg_dir. Signed-off-by: Sai Praneeth Prakhya Cc: Lee, Chun-Yi Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Michael S. Tsirkin Cc: Ricardo Neri Cc: Matt Fleming Cc: Ard Biesheuvel Cc: Ravi Shankar --- arch/x86/include/asm/efi.h | 30 +- arch/x86/platform/efi/efi_64.c | 41 arch/x86/platform/efi/efi_thunk_64.S | 2 +- 3 files changed, 43 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 2f77bcefe6b4..aa38b546e842 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,10 +1,14 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H +#include +#include + #include #include #include #include +#include /* * We map the EFI regions needed for runtime services non-contiguously, @@ -57,14 +61,14 @@ extern u64 asmlinkage efi_call(void *fp, ...); #define efi_call_phys(f, args...) 
efi_call((f), args) /* - * Scratch space used for switching the pagetable in the EFI stub + * struct efi_scratch - Scratch space used while switching to/from efi_mm + * @phys_stack:stack used during EFI Mixed Mode + * @prev_mm: store/restore stolen mm_struct while switching + * to/from efi_mm */ struct efi_scratch { - u64 r15; - u64 prev_cr3; - pgd_t *efi_pgt; - booluse_pgd; - u64 phys_stack; + u64 phys_stack; + struct mm_struct*prev_mm; } __packed; #define arch_efi_call_virt_setup() \ @@ -73,11 +77,8 @@ struct efi_scratch { preempt_disable(); \ __kernel_fpu_begin(); \ \ - if (efi_scratch.use_pgd) { \ - efi_scratch.prev_cr3 = read_cr3(); \ - write_cr3((unsigned long)efi_scratch.efi_pgt); \ - __flush_tlb_all(); \ - } \ + if (!efi_enabled(EFI_OLD_MEMMAP)) \ + efi_switch_mm(_mm); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -85,10 +86,8 @@ struct efi_scratch { #define arch_efi_call_virt_teardown() \ ({ \ - if (efi_scratch.use_pgd) { \ - write_cr3(efi_scratch.prev_cr3);\ - __flush_tlb_all(); \ - } \ + if (!efi_enabled(EFI_OLD_MEMMAP)) \ + efi_switch_mm(efi_scratch.prev_mm); \ \ __kernel_fpu_end(); \ preempt_enable(); \ @@ -130,6 +129,7 @@ extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); +extern void efi_switch_mm(struct mm_struct *mm); struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 0bb98c35e178..3be94480c1ce 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -80,9 +80,8 @@ pgd_t * __init efi_call_phys_prolog(void) int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { - save_pgd = (pgd_t *)read_cr3(); - write_cr3((unsigned long)efi_scratch.efi_pgt); - goto out; + efi_switch_mm(_mm); + return NULL; } early_code_mapping_set_exec(1); @@ -152,8 +151,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) pud_t 
*pud; if (!efi_enabled(EFI_OLD_MEMMAP)) { - write_cr3((unsigned long)save_pgd); - __flush_tlb_all(); + efi_switch_mm(efi_scratch.prev_mm);