I think Nadav is right here. IMO the right fix is to rename the functions cr4_set_bits_irqs_off() etc, add a warning (if lockdep is on) and fix the callers.
On Wed, Nov 15, 2017 at 4:34 PM, Nadav Amit <nadav.a...@gmail.com> wrote: > h...@zytor.com wrote: > >> On November 15, 2017 3:31:50 PM PST, Nadav Amit <nadav.a...@gmail.com> wrote: >>> Ping? >>> >>> Nadav Amit <nadav.a...@gmail.com> wrote: >>> >>>> CC’ing more people, and adding a patch to clarify... >>>> >>>> Nadav Amit <nadav.a...@gmail.com> wrote: >>>> >>>>> I am puzzled by the comment in tlb_state.cr4 , which says: >>>>> >>>>> /* >>>>> * Access to this CR4 shadow and to H/W CR4 is protected by >>>>> * disabling interrupts when modifying either one. >>>>> */ >>>>> >>>>> This does not seem to be true and adding a warning to CR4 writes >>> when >>>>> !irqs_disabled() immediately fires in identify_cpu() and >>>>> __mcheck_cpu_init_generic(). While these two are called during boot, >>> I think >>>>> there are other CR4 changes with enabled IRQs, for example, PR_SET_TSC. >>>>> >>>>> So my question(s): Is the comment correct? Is the current behavior >>> correct? >>>> So here is what I have in mind. I am not sure whether >>> CONFIG_DEBUG_PREEMPT is >>>> the right #ifdef. Let me know what you think. >>>> >>>> -- >8 -- >>>> >>>> Subject: [PATCH] x86: disable IRQs before changing CR4 >>>> >>>> CR4 changes need to be performed while IRQs are disabled in order to >>>> update the CR4 shadow and the actual register atomically. 
>>>> >>>> Signed-off-by: Nadav Amit <na...@vmware.com> >>>> --- >>>> arch/x86/include/asm/tlbflush.h | 18 ++++++++++++------ >>>> arch/x86/kernel/cpu/common.c | 13 ++++++++++++- >>>> arch/x86/kernel/cpu/mcheck/mce.c | 3 +++ >>>> arch/x86/kernel/cpu/mcheck/p5.c | 4 ++++ >>>> arch/x86/kernel/cpu/mcheck/winchip.c | 3 +++ >>>> arch/x86/kernel/process.c | 14 ++++++++++++-- >>>> 6 files changed, 46 insertions(+), 9 deletions(-) >>>> >>>> diff --git a/arch/x86/include/asm/tlbflush.h >>> b/arch/x86/include/asm/tlbflush.h >>>> index 50ea3482e1d1..bc70dd1cc7c6 100644 >>>> --- a/arch/x86/include/asm/tlbflush.h >>>> +++ b/arch/x86/include/asm/tlbflush.h >>>> @@ -89,6 +89,15 @@ static inline void cr4_init_shadow(void) >>>> this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); >>>> } >>>> >>>> +static inline void update_cr4(unsigned long cr4) >>>> +{ >>>> +#ifdef CONFIG_DEBUG_PREEMPT >>>> + WARN_ON_ONCE(!irqs_disabled()); >>>> +#endif >>>> + this_cpu_write(cpu_tlbstate.cr4, cr4); >>>> + __write_cr4(cr4); >>>> +} >>>> + >>>> /* Set in this cpu's CR4. */ >>>> static inline void cr4_set_bits(unsigned long mask) >>>> { >>>> @@ -97,8 +106,7 @@ static inline void cr4_set_bits(unsigned long >>> mask) >>>> cr4 = this_cpu_read(cpu_tlbstate.cr4); >>>> if ((cr4 | mask) != cr4) { >>>> cr4 |= mask; >>>> - this_cpu_write(cpu_tlbstate.cr4, cr4); >>>> - __write_cr4(cr4); >>>> + update_cr4(cr4); >>>> } >>>> } >>>> >>>> @@ -110,8 +118,7 @@ static inline void cr4_clear_bits(unsigned long >>> mask) >>>> cr4 = this_cpu_read(cpu_tlbstate.cr4); >>>> if ((cr4 & ~mask) != cr4) { >>>> cr4 &= ~mask; >>>> - this_cpu_write(cpu_tlbstate.cr4, cr4); >>>> - __write_cr4(cr4); >>>> + update_cr4(cr4); >>>> } >>>> } >>>> >>>> @@ -121,8 +128,7 @@ static inline void cr4_toggle_bits(unsigned long >>> mask) >>>> cr4 = this_cpu_read(cpu_tlbstate.cr4); >>>> cr4 ^= mask; >>>> - this_cpu_write(cpu_tlbstate.cr4, cr4); >>>> - __write_cr4(cr4); >>>> + update_cr4(cr4); >>>> } >>>> >>>> /* Read the CR4 shadow. 
*/ >>>> diff --git a/arch/x86/kernel/cpu/common.c >>> b/arch/x86/kernel/cpu/common.c >>>> index c8b39870f33e..82e6b41fd5e9 100644 >>>> --- a/arch/x86/kernel/cpu/common.c >>>> +++ b/arch/x86/kernel/cpu/common.c >>>> @@ -318,6 +318,8 @@ static bool pku_disabled; >>>> >>>> static __always_inline void setup_pku(struct cpuinfo_x86 *c) >>>> { >>>> + unsigned long flags; >>>> + >>>> /* check the boot processor, plus compile options for PKU: */ >>>> if (!cpu_feature_enabled(X86_FEATURE_PKU)) >>>> return; >>>> @@ -327,7 +329,10 @@ static __always_inline void setup_pku(struct >>> cpuinfo_x86 *c) >>>> if (pku_disabled) >>>> return; >>>> >>>> + local_irq_save(flags); >>>> cr4_set_bits(X86_CR4_PKE); >>>> + local_irq_restore(flags); >>>> + >>>> /* >>>> * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE >>>> * cpuid bit to be set. We need to ensure that we >>>> @@ -1069,6 +1074,7 @@ static void validate_apic_and_package_id(struct >>> cpuinfo_x86 *c) >>>> */ >>>> static void identify_cpu(struct cpuinfo_x86 *c) >>>> { >>>> + unsigned long flags; >>>> int i; >>>> >>>> c->loops_per_jiffy = loops_per_jiffy; >>>> @@ -1121,9 +1127,14 @@ static void identify_cpu(struct cpuinfo_x86 >>> *c) >>>> /* Disable the PN if appropriate */ >>>> squash_the_stupid_serial_number(c); >>>> >>>> - /* Set up SMEP/SMAP */ >>>> + /* >>>> + * Set up SMEP/SMAP. Disable interrupts to prevent triggering a >>> warning >>>> + * as CR4 changes must be done with disabled interrupts. >>>> + */ >>>> + local_irq_save(flags); >>>> setup_smep(c); >>>> setup_smap(c); >>>> + local_irq_restore(flags); >>>> >>>> /* >>>> * The vendor-specific functions might have changed features. 
>>>> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c >>> b/arch/x86/kernel/cpu/mcheck/mce.c >>>> index 3b413065c613..497c07e33c25 100644 >>>> --- a/arch/x86/kernel/cpu/mcheck/mce.c >>>> +++ b/arch/x86/kernel/cpu/mcheck/mce.c >>>> @@ -1508,6 +1508,7 @@ static void __mcheck_cpu_init_generic(void) >>>> { >>>> enum mcp_flags m_fl = 0; >>>> mce_banks_t all_banks; >>>> + unsigned long flags; >>>> u64 cap; >>>> >>>> if (!mca_cfg.bootlog) >>>> @@ -1519,7 +1520,9 @@ static void __mcheck_cpu_init_generic(void) >>>> bitmap_fill(all_banks, MAX_NR_BANKS); >>>> machine_check_poll(MCP_UC | m_fl, &all_banks); >>>> >>>> + local_irq_save(flags); >>>> cr4_set_bits(X86_CR4_MCE); >>>> + local_irq_restore(flags); >>>> >>>> rdmsrl(MSR_IA32_MCG_CAP, cap); >>>> if (cap & MCG_CTL_P) >>>> diff --git a/arch/x86/kernel/cpu/mcheck/p5.c >>> b/arch/x86/kernel/cpu/mcheck/p5.c >>>> index 2a0717bf8033..d5d4963415e9 100644 >>>> --- a/arch/x86/kernel/cpu/mcheck/p5.c >>>> +++ b/arch/x86/kernel/cpu/mcheck/p5.c >>>> @@ -42,6 +42,7 @@ static void pentium_machine_check(struct pt_regs >>> *regs, long error_code) >>>> /* Set up machine check reporting for processors with Intel style >>> MCE: */ >>>> void intel_p5_mcheck_init(struct cpuinfo_x86 *c) >>>> { >>>> + unsigned long flags; >>>> u32 l, h; >>>> >>>> /* Default P5 to off as its often misconnected: */ >>>> @@ -62,7 +63,10 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) >>>> pr_info("Intel old style machine check architecture supported.\n"); >>>> >>>> /* Enable MCE: */ >>>> + local_irq_save(flags); >>>> cr4_set_bits(X86_CR4_MCE); >>>> + local_irq_restore(flags); >>>> + >>>> pr_info("Intel old style machine check reporting enabled on >>> CPU#%d.\n", >>>> smp_processor_id()); >>>> } >>>> diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c >>> b/arch/x86/kernel/cpu/mcheck/winchip.c >>>> index c6a722e1d011..6dd985e3849d 100644 >>>> --- a/arch/x86/kernel/cpu/mcheck/winchip.c >>>> +++ b/arch/x86/kernel/cpu/mcheck/winchip.c >>>> @@ -26,6 +26,7 @@ static void 
winchip_machine_check(struct pt_regs >>> *regs, long error_code) >>>> /* Set up machine check reporting on the Winchip C6 series */ >>>> void winchip_mcheck_init(struct cpuinfo_x86 *c) >>>> { >>>> + unsigned long flags; >>>> u32 lo, hi; >>>> >>>> machine_check_vector = winchip_machine_check; >>>> @@ -37,7 +38,9 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) >>>> lo &= ~(1<<4); /* Enable MCE */ >>>> wrmsr(MSR_IDT_FCR1, lo, hi); >>>> >>>> + local_irq_save(flags); >>>> cr4_set_bits(X86_CR4_MCE); >>>> + local_irq_restore(flags); >>>> >>>> pr_info("Winchip machine check reporting enabled on CPU#0.\n"); >>>> } >>>> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c >>>> index 3ca198080ea9..09e44e784745 100644 >>>> --- a/arch/x86/kernel/process.c >>>> +++ b/arch/x86/kernel/process.c >>>> @@ -127,25 +127,35 @@ void flush_thread(void) >>>> >>>> void disable_TSC(void) >>>> { >>>> + unsigned long flags; >>>> + >>>> preempt_disable(); >>>> - if (!test_and_set_thread_flag(TIF_NOTSC)) >>>> + if (!test_and_set_thread_flag(TIF_NOTSC)) { >>>> /* >>>> * Must flip the CPU state synchronously with >>>> * TIF_NOTSC in the current running context. >>>> */ >>>> + local_irq_save(flags); >>>> cr4_set_bits(X86_CR4_TSD); >>>> + local_irq_restore(flags); >>>> + } >>>> preempt_enable(); >>>> } >>>> >>>> static void enable_TSC(void) >>>> { >>>> + unsigned long flags; >>>> + >>>> preempt_disable(); >>>> - if (test_and_clear_thread_flag(TIF_NOTSC)) >>>> + if (test_and_clear_thread_flag(TIF_NOTSC)) { >>>> /* >>>> * Must flip the CPU state synchronously with >>>> * TIF_NOTSC in the current running context. >>>> */ >>>> + local_irq_save(flags); >>>> cr4_clear_bits(X86_CR4_TSD); >>>> + local_irq_restore(flags); >>>> + } >>>> preempt_enable(); >>>> } >> >> This is wrong on at least two levels: first of all, this should not be >> wrapped around the abstracted operations but put inside them if relevant. 
>> Second, I suspect that this is not at all a requirement but rather that as >> long as the hardware register is written second, I think we should always be >> safe. > > Thanks for your reply. > > Can you please explain your suspicion? Here is a simple execution that may > fail (assume cr4 is initially zero): > > CPU0 > ==== > cr4_set_bits(1) > => cpu_tlbstate.cr4 = 1 > => interrupt > cr4_set_bits(2) > cpu_tlbstate.cr4 = 3 > __write_cr4(3) > => __write_cr4(1) [ and should have been 3 ] > > Am I missing anything? > > Now, it may be a theoretical issue right now, since I did not find any > change of CR4 bits that happens in an interrupt handler. > > Anyhow, if you want me to submit a fix, please let me know what other levels > of wrongness you had in mind. > > Regards, > Nadav >