[tip:x86/apic] x86/idt: Move IST stack based traps to table init
Commit-ID: 90f6225fba0c732f3f5f9f5e265bdefa021ff12d Gitweb: http://git.kernel.org/tip/90f6225fba0c732f3f5f9f5e265bdefa021ff12d Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:52 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:27 +0200 x86/idt: Move IST stack based traps to table init Initialize the IST based traps via a table. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064959.091328...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 2 ++ arch/x86/kernel/idt.c | 22 ++ arch/x86/kernel/traps.c | 9 + 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 930acd5..e624527 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -509,9 +509,11 @@ extern void idt_setup_early_traps(void); #ifdef CONFIG_X86_64 extern void idt_setup_early_pf(void); +extern void idt_setup_ist_traps(void); extern void idt_setup_debugidt_traps(void); #else static inline void idt_setup_early_pf(void) { } +static inline void idt_setup_ist_traps(void) { } static inline void idt_setup_debugidt_traps(void) { } #endif diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index f5281b8..a6326fd 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -92,6 +92,20 @@ struct desc_ptr idt_descr __ro_after_init = { gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; /* + * The exceptions which use Interrupt stacks. They are setup after + * cpu_init() when the TSS has been initialized. 
+ */ +static const __initdata struct idt_data ist_idts[] = { + ISTG(X86_TRAP_DB, debug, DEBUG_STACK), + ISTG(X86_TRAP_NMI, nmi,NMI_STACK), + ISTG(X86_TRAP_BP, int3, DEBUG_STACK), + ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), +#ifdef CONFIG_X86_MCE + ISTG(X86_TRAP_MC, _check, MCE_STACK), +#endif +}; + +/* * Override for the debug_idt. Same as the default, but with interrupt * stack set to DEFAULT_STACK (0). Required for NMI trap handling. */ @@ -158,6 +172,14 @@ void __init idt_setup_early_pf(void) } /** + * idt_setup_ist_traps - Initialize the idt table with traps using IST + */ +void __init idt_setup_ist_traps(void) +{ + idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts)); +} + +/** * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps */ void __init idt_setup_debugidt_traps(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1492bf5..293f5bd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -979,14 +979,7 @@ void __init trap_init(void) */ cpu_init(); - /* -* X86_TRAP_DB and X86_TRAP_BP have been set -* in early_trap_init(). However, ITS works only after -* cpu_init() loads TSS. See comments in early_trap_init(). -*/ - set_intr_gate_ist(X86_TRAP_DB, , DEBUG_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, , DEBUG_STACK); + idt_setup_ist_traps(); x86_init.irqs.trap_init();
[tip:x86/apic] x86/ipi: Make platform IPI depend on APIC
Commit-ID: 0428e01a2f13a6b7dae8289fb10030dbea336dee Gitweb: http://git.kernel.org/tip/0428e01a2f13a6b7dae8289fb10030dbea336dee Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:34 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:29 +0200 x86/ipi: Make platform IPI depend on APIC The platform IPI vector is only installed when the local APIC is enabled. All users of it depend on the local APIC anyway. Make the related code conditional on CONFIG_X86_LOCAL_APIC=y. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064957.615286...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/entry_arch.h | 3 +-- arch/x86/kernel/irq.c | 11 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index c911650..aa15d1f 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -17,8 +17,6 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR) BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR) #endif -BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) - #ifdef CONFIG_HAVE_KVM BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR) @@ -37,6 +35,7 @@ BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_IRQ_WORK BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index befdd4a..52089c0 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -29,9 +29,6 @@ EXPORT_PER_CPU_SYMBOL(irq_regs); atomic_t irq_err_count; 
-/* Function pointer for generic interrupt vector handling */ -void (*x86_platform_ipi_callback)(void) = NULL; - /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -87,13 +84,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); seq_puts(p, " APIC ICR read retries\n"); -#endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_puts(p, " Platform interrupts\n"); } +#endif #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) @@ -183,9 +180,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; sum += irq_stats(cpu)->icr_read_retry_count; -#endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; +#endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -259,6 +256,9 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_X86_LOCAL_APIC +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. */ @@ -275,6 +275,7 @@ __visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) exiting_irq(); set_irq_regs(old_regs); } +#endif #ifdef CONFIG_HAVE_KVM static void dummy_handler(void) {}
[tip:x86/apic] x86/tracing: Disentangle pagefault and resched IPI tracing key
Commit-ID: 809547472edae0bc68f2b5abc37b92c8a988bc8a Gitweb: http://git.kernel.org/tip/809547472edae0bc68f2b5abc37b92c8a988bc8a Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:33 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:29 +0200 x86/tracing: Disentangle pagefault and resched IPI tracing key The pagefault and the resched IPI handler are the only ones where it is worth to optimize the code further in case tracepoints are disabled. But it makes no sense to have a single static key for both. Separate the static keys so the facilities are handled separately. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064957.536699...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/common.h | 15 --- arch/x86/include/asm/trace/exceptions.h | 6 -- arch/x86/include/asm/trace/irq_vectors.h | 29 +++-- arch/x86/kernel/smp.c| 2 +- arch/x86/kernel/tracepoint.c | 27 ++- arch/x86/mm/fault.c | 2 +- 6 files changed, 59 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h index b1eb7b1..57c8da02 100644 --- a/arch/x86/include/asm/trace/common.h +++ b/arch/x86/include/asm/trace/common.h @@ -1,15 +1,16 @@ #ifndef _ASM_TRACE_COMMON_H #define _ASM_TRACE_COMMON_H -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); - #ifdef CONFIG_TRACING -DECLARE_STATIC_KEY_FALSE(trace_irqvectors_key); -#define trace_irqvectors_enabled() \ - static_branch_unlikely(_irqvectors_key) +DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); +#define trace_pagefault_enabled() \ + static_branch_unlikely(_pagefault_key) +DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key); +#define trace_resched_ipi_enabled()\ + static_branch_unlikely(_resched_ipi_key) #else -static inline bool trace_irqvectors_enabled(void) { return false; } +static inline bool
trace_pagefault_enabled(void) { return false; } +static inline bool trace_resched_ipi_enabled(void) { return false; } #endif #endif diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 960a5b5..5665bf2 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -7,6 +7,9 @@ #include #include +extern int trace_pagefault_reg(void); +extern void trace_pagefault_unreg(void); + DECLARE_EVENT_CLASS(x86_exceptions, TP_PROTO(unsigned long address, struct pt_regs *regs, @@ -35,8 +38,7 @@ DEFINE_EVENT_FN(x86_exceptions, name, \ TP_PROTO(unsigned long address, struct pt_regs *regs, \ unsigned long error_code), \ TP_ARGS(address, regs, error_code), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); + trace_pagefault_reg, trace_pagefault_unreg); DEFINE_PAGE_FAULT_EVENT(page_fault_user); DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 7825b44..a1bdc25 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -7,6 +7,9 @@ #include #include +extern int trace_resched_ipi_reg(void); +extern void trace_resched_ipi_unreg(void); + DECLARE_EVENT_CLASS(x86_irq_vector, TP_PROTO(int vector), @@ -26,15 +29,22 @@ DECLARE_EVENT_CLASS(x86_irq_vector, #define DEFINE_IRQ_VECTOR_EVENT(name) \ DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); \ +DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); + +#define DEFINE_RESCHED_IPI_EVENT(name) \ +DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ + TP_PROTO(int vector), \ TP_ARGS(vector),\ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc);\ + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); \ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ TP_PROTO(int vector), \ TP_ARGS(vector),\ - 
trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); - + trace_resched_ipi_reg,
[tip:x86/apic] x86/irq_work: Make it depend on APIC
Commit-ID: a45525b5b47c10c0446eda21227792b39af233dc Gitweb: http://git.kernel.org/tip/a45525b5b47c10c0446eda21227792b39af233dc Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:35 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:30 +0200 x86/irq_work: Make it depend on APIC The irq work interrupt vector is only installed when CONFIG_X86_LOCAL_APIC is enabled, but the interrupt handler is compiled in unconditionally. Compile the cruft out when the APIC is disabled. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064957.691909...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_work.h | 8 arch/x86/kernel/irq_work.c | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index f706041..ddbb8ea 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -3,9 +3,17 @@ #include +#ifdef CONFIG_X86_LOCAL_APIC static inline bool arch_irq_work_has_interrupt(void) { return boot_cpu_has(X86_FEATURE_APIC); } +extern void arch_irq_work_raise(void); +#else +static inline bool arch_irq_work_has_interrupt(void) +{ + return false; +} +#endif #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 8054cae..70dee05 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -11,6 +11,7 @@ #include #include +#ifdef CONFIG_X86_LOCAL_APIC __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); @@ -23,11 +24,10 @@ __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) void arch_irq_work_raise(void) { -#ifdef CONFIG_X86_LOCAL_APIC if (!arch_irq_work_has_interrupt()) return; apic->send_IPI_self(IRQ_WORK_VECTOR); apic_wait_icr_idle(); -#endif } +#endif
[tip:x86/apic] x86/tracing: Build tracepoints only when they are used
Commit-ID: 73285527804402befe5d5140aeede21c16544b4c Gitweb: http://git.kernel.org/tip/73285527804402befe5d5140aeede21c16544b4c Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:36 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:24 +0200 x86/tracing: Build tracepoints only when they are used The tracepoint macro magic emits code for all tracepoints in an event header file. That code stays around even if the tracepoint is not used at all. The linker does not discard it. Build the various irq_vector tracepoints dependent on the appropriate CONFIG switches. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064957.770651...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/irq_vectors.h | 36 +--- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index a1bdc25..1599d39 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -7,6 +7,8 @@ #include #include +#ifdef CONFIG_X86_LOCAL_APIC + extern int trace_resched_ipi_reg(void); extern void trace_resched_ipi_unreg(void); @@ -53,18 +55,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ DEFINE_IRQ_VECTOR_EVENT(local_timer); /* - * The ifdef is required because that tracepoint macro hell emits tracepoint - * code in files which include this header even if the tracepoint is not - * enabled. Brilliant stuff that.
- */ -#ifdef CONFIG_SMP -/* - * reschedule - called when entering/exiting a reschedule vector handler - */ -DEFINE_RESCHED_IPI_EVENT(reschedule); -#endif - -/* * spurious_apic - called when entering/exiting a spurious apic vector handler */ DEFINE_IRQ_VECTOR_EVENT(spurious_apic); @@ -80,6 +70,7 @@ DEFINE_IRQ_VECTOR_EVENT(error_apic); */ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); +#ifdef CONFIG_IRQ_WORK /* * irq_work - called when entering/exiting a irq work interrupt * vector handler @@ -96,6 +87,18 @@ DEFINE_IRQ_VECTOR_EVENT(irq_work); * 4) goto 1 */ TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0); +#endif + +/* + * The ifdef is required because that tracepoint macro hell emits tracepoint + * code in files which include this header even if the tracepoint is not + * enabled. Brilliant stuff that. + */ +#ifdef CONFIG_SMP +/* + * reschedule - called when entering/exiting a reschedule vector handler + */ +DEFINE_RESCHED_IPI_EVENT(reschedule); /* * call_function - called when entering/exiting a call function interrupt @@ -108,24 +111,33 @@ DEFINE_IRQ_VECTOR_EVENT(call_function); * single interrupt vector handler */ DEFINE_IRQ_VECTOR_EVENT(call_function_single); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD /* * threshold_apic - called when entering/exiting a threshold apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +#endif +#ifdef CONFIG_X86_MCE_AMD /* * deferred_error_apic - called when entering/exiting a deferred apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(thermal_apic); +#endif + +#endif /* CONFIG_X86_LOCAL_APIC */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH .
[tip:x86/apic] x86/idt: Move 32-bit idt_descr to C code
Commit-ID: 16bc18d895cee95f12bd722e5a3016676dfcf084 Gitweb: http://git.kernel.org/tip/16bc18d895cee95f12bd722e5a3016676dfcf084 Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:44 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:26 +0200 x86/idt: Move 32-bit idt_descr to C code 32-bit kernels have the idt_descr defined in the low level assembly entry code, but there is no good reason for that. Move it into the C file and use the 64-bit version of it. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.445862...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 6 -- arch/x86/kernel/idt.c | 10 +- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 29da959..ce8c6ed 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -622,7 +622,6 @@ int_msg: .data .globl boot_gdt_descr -.globl idt_descr ALIGN # early boot GDT descriptor (must use 1:1 address mapping) @@ -631,11 +630,6 @@ boot_gdt_descr: .word __BOOT_DS+7 .long boot_gdt - __PAGE_OFFSET - .word 0 # 32-bit align idt_desc.address -idt_descr: - .word IDT_ENTRIES*8-1 # idt contains 256 entries - .long idt_table - # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 3d19cad..86e5912 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -10,15 +10,15 @@ /* Must be page-aligned because the real IDT is used in a fixmap. */ gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; -#ifdef CONFIG_X86_64 -/* No need to be aligned, but done to keep all IDTs defined the same way. 
*/ -gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; - struct desc_ptr idt_descr __ro_after_init = { - .size = IDT_ENTRIES * 16 - 1, + .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1, .address= (unsigned long) idt_table, }; +#ifdef CONFIG_X86_64 +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; + const struct desc_ptr debug_idt_descr = { .size = IDT_ENTRIES * 16 - 1, .address= (unsigned long) debug_idt_table,
[tip:x86/apic] x86/idt: Create file for IDT related code
Commit-ID: d8ed9d48266a27ab02a4bbcb81e755d63aec108a Gitweb: http://git.kernel.org/tip/d8ed9d48266a27ab02a4bbcb81e755d63aec108a Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:43 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:25 +0200 x86/idt: Create file for IDT related code IDT related code lives scattered around in various places. Create a new source file in arch/x86/kernel/idt.c to hold it. Move the idt_tables and descriptors to it for a start. Follow up patches will gradually move more code over. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.367081...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/cpu/common.c | 9 - arch/x86/kernel/idt.c| 26 ++ arch/x86/kernel/traps.c | 6 -- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6ab5fbf..fd0a789 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -42,7 +42,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o obj-$(CONFIG_COMPAT) += signal_compat.o -obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c8b3987..71ab8a4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1289,15 +1289,6 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr __ro_after_init = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) 
idt_table, -}; -const struct desc_ptr debug_idt_descr = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) debug_idt_table, -}; - DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c new file mode 100644 index 000..3d19cad --- /dev/null +++ b/arch/x86/kernel/idt.c @@ -0,0 +1,26 @@ +/* + * Interrupt descriptor table related code + * + * This file is licensed under the GPL V2 + */ +#include + +#include + +/* Must be page-aligned because the real IDT is used in a fixmap. */ +gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; + +#ifdef CONFIG_X86_64 +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; + +struct desc_ptr idt_descr __ro_after_init = { + .size = IDT_ENTRIES * 16 - 1, + .address= (unsigned long) idt_table, +}; + +const struct desc_ptr debug_idt_descr = { + .size = IDT_ENTRIES * 16 - 1, + .address= (unsigned long) debug_idt_table, +}; +#endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 36c5836..41f4cd3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -65,18 +65,12 @@ #include #include #include - -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include #include #include #endif -/* Must be page-aligned because the real IDT is used in a fixmap. */ -gate_desc idt_table[NR_VECTORS] __page_aligned_bss; - DECLARE_BITMAP(used_vectors, NR_VECTORS); static inline void cond_local_irq_enable(struct pt_regs *regs)
[tip:x86/apic] x86/idt: Remove unused functions/inlines
Commit-ID: 485fa57bd73a0b79987d144e15bdc582f926701d Gitweb: http://git.kernel.org/tip/485fa57bd73a0b79987d144e15bdc582f926701d Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:56 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:28 +0200 x86/idt: Remove unused functions/inlines The IDT related inlines are no longer used. Remove them. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064959.422083...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 36 1 file changed, 36 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index cae0cb0..cbd36dd 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -390,16 +390,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -#ifdef CONFIG_X86_64 -static inline void set_nmi_gate(int gate, void *addr) -{ - gate_desc s; - - pack_gate(, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); - write_idt_entry(debug_idt_table, gate, ); -} -#endif - static inline void _set_gate(int gate, unsigned type, const void *addr, unsigned dpl, unsigned ist, unsigned seg) { @@ -437,32 +427,6 @@ static inline void alloc_system_vector(int vector) set_intr_gate(n, addr); \ } while (0) -/* - * This routine sets up an interrupt gate at directory privilege level 3.
- */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); -} - -static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); -} - -static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); -} #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr);
[tip:x86/apic] x86/idt: Deinline setup functions
Commit-ID: db18da78f9a8bbab1bdc5968ba47ace788b5061f Gitweb: http://git.kernel.org/tip/db18da78f9a8bbab1bdc5968ba47ace788b5061f Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:57 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:28 +0200 x86/idt: Deinline setup functions None of this is performance sensitive in any way - so debloat the kernel. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064959.502052...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 37 ++--- arch/x86/kernel/idt.c | 43 ++- 2 files changed, 36 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index cbd36dd..33f84f2 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -390,44 +390,11 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -static inline void _set_gate(int gate, unsigned type, const void *addr, -unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(, type, (unsigned long)addr, dpl, ist, seg); - /* -* does not need to be atomic because it is only done once at -* setup time -*/ - write_idt_entry(idt_table, gate, ); -} - -static inline void set_intr_gate(unsigned int n, const void *addr) -{ - BUG_ON(n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); -} +void set_intr_gate(unsigned int n, const void *addr); +void alloc_intr_gate(unsigned int n, const void *addr); extern unsigned long used_vectors[]; -static inline void alloc_system_vector(int vector) -{ - BUG_ON(vector < FIRST_SYSTEM_VECTOR); - if (!test_bit(vector, used_vectors)) { - set_bit(vector, used_vectors); - } else { - BUG(); - } -} - -#define alloc_intr_gate(n, addr) \ - do {\ - alloc_system_vector(n); \ - 
set_intr_gate(n, addr); \ - } while (0) - - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr); static inline bool is_debug_idt_enabled(void) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 99f93a6..8e9318d 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -212,15 +212,16 @@ static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) #endif } -static __init void -idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size) +static void +idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) { gate_desc desc; for (; size > 0; t++, size--) { idt_init_desc(, t); - set_bit(t->vector, used_vectors); write_idt_entry(idt, t->vector, ); + if (sys) + set_bit(t->vector, used_vectors); } } @@ -233,7 +234,8 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size) */ void __init idt_setup_early_traps(void) { - idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts)); + idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts), +true); load_idt(_descr); } @@ -242,7 +244,7 @@ void __init idt_setup_early_traps(void) */ void __init idt_setup_traps(void) { - idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts)); + idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); } #ifdef CONFIG_X86_64 @@ -259,7 +261,7 @@ void __init idt_setup_traps(void) void __init idt_setup_early_pf(void) { idt_setup_from_table(idt_table, early_pf_idts, -ARRAY_SIZE(early_pf_idts)); +ARRAY_SIZE(early_pf_idts), true); } /** @@ -267,7 +269,7 @@ void __init idt_setup_early_pf(void) */ void __init idt_setup_ist_traps(void) { - idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts)); + idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); } /** @@ -277,7 +279,7 @@ void __init idt_setup_debugidt_traps(void) { memcpy(_idt_table, _table, IDT_ENTRIES * 16); - idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts)); +
[tip:x86/apic] x86/idt: Move interrupt gate initialization to IDT code
Commit-ID: dc20b2d526539344d7175a2a83221337302596b8 Gitweb: http://git.kernel.org/tip/dc20b2d526539344d7175a2a83221337302596b8 Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:55 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:28 +0200 x86/idt: Move interrupt gate initialization to IDT code Move the gate initialization from interrupt init to the IDT code so all IDT related operations are at a single place. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064959.340209...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/idt.c | 18 ++ arch/x86/kernel/irqinit.c | 18 -- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 4327104..99f93a6 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -286,7 +286,25 @@ void __init idt_setup_debugidt_traps(void) */ void __init idt_setup_apic_and_irq_gates(void) { + int i = FIRST_EXTERNAL_VECTOR; + void *entry; + idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts)); + + for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); + } + + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { +#ifdef CONFIG_X86_LOCAL_APIC + set_bit(i, used_vectors); + set_intr_gate(i, spurious_interrupt); +#else + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); +#endif + } } /** diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 218cd06..1add9e0 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -89,29 +89,11 @@ void __init init_IRQ(void) void __init native_init_IRQ(void) { - int i; - /* Execute any quirks before the call gates are initialised: */
x86_init.irqs.pre_vector_init(); idt_setup_apic_and_irq_gates(); - /* -* Cover the whole vector space, no vector can escape -* us. (some of these will be overridden and become -* 'special' SMP interrupts) -*/ - i = FIRST_EXTERNAL_VECTOR; - for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { - /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } -#ifdef CONFIG_X86_LOCAL_APIC - for_each_clear_bit_from(i, used_vectors, NR_VECTORS) - set_intr_gate(i, spurious_interrupt); -#endif - if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, );
[tip:x86/apic] x86/tracing: Introduce a static key for exception tracing
Commit-ID: 2feb1b316d48004d905278c02a55902cab0be8be Gitweb: http://git.kernel.org/tip/2feb1b316d48004d905278c02a55902cab0be8be Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:21 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:23 +0200 x86/tracing: Introduce a static key for exception tracing Switching the IDT just for avoiding tracepoints creates a completely impenetrable macro/inline/ifdef mess. There is no point in avoiding tracepoints for most of the traps/exceptions. For the more expensive tracepoints, like pagefaults, this can be handled with an explicit static key. Preparatory patch to remove the tracing IDT. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064956.593094...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/common.h | 15 +++ arch/x86/include/asm/trace/exceptions.h | 4 +--- arch/x86/include/asm/trace/irq_vectors.h | 4 +--- arch/x86/kernel/tracepoint.c | 9 - 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h new file mode 100644 index 000..b1eb7b1 --- /dev/null +++ b/arch/x86/include/asm/trace/common.h @@ -0,0 +1,15 @@ +#ifndef _ASM_TRACE_COMMON_H +#define _ASM_TRACE_COMMON_H + +extern int trace_irq_vector_regfunc(void); +extern void trace_irq_vector_unregfunc(void); + +#ifdef CONFIG_TRACING +DECLARE_STATIC_KEY_FALSE(trace_irqvectors_key); +#define trace_irqvectors_enabled() \ + static_branch_unlikely(_irqvectors_key) +#else +static inline bool trace_irqvectors_enabled(void) { return false; } +#endif + +#endif diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2422b14..960a5b5 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -5,9 +5,7 @@ #define _TRACE_PAGE_FAULT_H #include - -extern int 
trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +#include DECLARE_EVENT_CLASS(x86_exceptions, diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 32dd6a9..7825b44 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -5,9 +5,7 @@ #define _TRACE_IRQ_VECTORS_H #include - -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +#include DECLARE_EVENT_CLASS(x86_irq_vector, diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 1551513..dd4aa04 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -4,9 +4,11 @@ * Copyright (C) 2013 Seiji Aguchi * */ +#include +#include + #include #include -#include atomic_t trace_idt_ctr = ATOMIC_INIT(0); struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, @@ -15,6 +17,7 @@ struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, /* No need to be aligned, but done to keep all IDTs defined the same way. */ gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; +DEFINE_STATIC_KEY_FALSE(trace_irqvectors_key); static int trace_irq_vector_refcount; static DEFINE_MUTEX(irq_vector_mutex); @@ -36,6 +39,8 @@ static void switch_idt(void *arg) int trace_irq_vector_regfunc(void) { + static_branch_inc(_irqvectors_key); + mutex_lock(_vector_mutex); if (!trace_irq_vector_refcount) { set_trace_idt_ctr(1); @@ -49,6 +54,8 @@ int trace_irq_vector_regfunc(void) void trace_irq_vector_unregfunc(void) { + static_branch_dec(_irqvectors_key); + mutex_lock(_vector_mutex); trace_irq_vector_refcount--; if (!trace_irq_vector_refcount) {
[tip:x86/apic] x86/traps: Simplify pagefault tracing logic
Commit-ID: 11a7ffb01703c3bbb1e9b968893f4487a1b0b5a8 Gitweb: http://git.kernel.org/tip/11a7ffb01703c3bbb1e9b968893f4487a1b0b5a8 Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:22 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:23 +0200 x86/traps: Simplify pagefault tracing logic Make use of the new irqvector tracing static key and remove the duplicated trace_do_pagefault() implementation. If irq vector tracing is disabled, then the overhead of this is a single NOP5, which is a reasonable tradeoff to avoid duplicated code and the unholy macro mess. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064956.672965...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S| 8 arch/x86/entry/entry_64.S| 13 +--- arch/x86/include/asm/traps.h | 10 + arch/x86/kernel/kvm.c| 2 +- arch/x86/mm/fault.c | 49 5 files changed, 16 insertions(+), 66 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 48ef7bb..0092da1 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -891,14 +891,6 @@ BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - ASM_CLAC - pushl $trace_do_page_fault - jmp common_exception -END(trace_page_fault) -#endif - ENTRY(page_fault) ASM_CLAC pushl $do_page_fault diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4dbb336..2731b94 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -918,17 +918,6 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -.macro trace_idtentry sym do_sym has_error_code:req -idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#else -.macro trace_idtentry sym do_sym has_error_code:req -idtentry \sym \do_sym 
has_error_code=\has_error_code -.endm -#endif - idtentry divide_error do_divide_error has_error_code=0 idtentry overflow do_overflow has_error_code=0 idtentry boundsdo_bounds has_error_code=0 @@ -1096,7 +1085,7 @@ idtentry xen_stack_segmentdo_stack_segment has_error_code=1 #endif idtentry general_protectiondo_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 +idtentry page_faultdo_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST idtentry async_page_fault do_async_page_fault has_error_code=1 diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 01fd0a7..b4f322d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -39,7 +39,6 @@ asmlinkage void machine_check(void); asmlinkage void simd_coprocessor_error(void); #ifdef CONFIG_TRACING -asmlinkage void trace_page_fault(void); #define trace_stack_segment stack_segment #define trace_divide_error divide_error #define trace_bounds bounds @@ -54,6 +53,7 @@ asmlinkage void trace_page_fault(void); #define trace_alignment_check alignment_check #define trace_simd_coprocessor_error simd_coprocessor_error #define trace_async_page_fault async_page_fault +#define trace_page_fault page_fault #endif dotraplinkage void do_divide_error(struct pt_regs *, long); @@ -74,14 +74,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); -#ifdef CONFIG_TRACING -dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); -#else -static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) -{ - do_page_fault(regs, error); -} -#endif dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); dotraplinkage void do_coprocessor_error(struct pt_regs *, long); dotraplinkage void do_alignment_check(struct pt_regs *, long); diff --git a/arch/x86/kernel/kvm.c 
b/arch/x86/kernel/kvm.c index d04e30e..6ed9242 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -263,7 +263,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) switch (kvm_read_and_reset_pf_reason()) { default: - trace_do_page_fault(regs, error_code); + do_page_fault(regs, error_code); break; case
[tip:x86/apic] x86/boot: Move EISA setup to a separate file
Commit-ID: f7eaf6e00fd581043bb540dfe865f1d81769b189 Gitweb: http://git.kernel.org/tip/f7eaf6e00fd581043bb540dfe865f1d81769b189 Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:20 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:22 +0200 x86/boot: Move EISA setup to a separate file EISA has absolutely nothing to do with traps, so move it out of traps.c into its own eisa.c file. Furthermore, the EISA bus detection does not need to run during very early boot, it's good enough to run it before the EISA bus and drivers are initialized. I.e. instead of calling it from the very early trap_init() code, make it a subsys_initcall(). Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064956.515322...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/eisa.c | 18 ++ arch/x86/kernel/traps.c | 13 - 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 287eac7..6ab5fbf 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o +obj-$(CONFIG_EISA) += eisa.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c new file mode 100644 index 000..881f923 --- /dev/null +++ b/arch/x86/kernel/eisa.c @@ -0,0 +1,18 @@ +/* + * EISA specific code + * + * This file is licensed under the GPL V2 + */ +#include +#include + +static __init int eisa_bus_probe(void) +{ + void __iomem *p = ioremap(0x0FFFD9, 4); + + if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + EISA_bus = 1; + iounmap(p); + return 0; +} +subsys_initcall(eisa_bus_probe); diff --git a/arch/x86/kernel/traps.c 
b/arch/x86/kernel/traps.c index 556f8f5..3095324 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -38,11 +38,6 @@ #include #include -#ifdef CONFIG_EISA -#include -#include -#endif - #if defined(CONFIG_EDAC) #include #endif @@ -969,14 +964,6 @@ void __init trap_init(void) { int i; -#ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) - EISA_bus = 1; - early_iounmap(p, 4); -#endif - set_intr_gate(X86_TRAP_DE, divide_error); set_intr_gate_ist(X86_TRAP_NMI, , NMI_STACK); /* int4 can be called from all */
[tip:x86/apic] x86/irq: Get rid of the 'first_system_vector' indirection bogosity
Commit-ID: 05161b9cbe553c41cf775ac41bb5120d94347e5c Gitweb: http://git.kernel.org/tip/05161b9cbe553c41cf775ac41bb5120d94347e5c Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:18 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:21 +0200 x86/irq: Get rid of the 'first_system_vector' indirection bogosity This variable is beyond pointless. Nothing allocates a vector via alloc_gate() below FIRST_SYSTEM_VECTOR. So nothing can change first_system_vector. If there is a need for a gate below FIRST_SYSTEM_VECTOR then it can be added to the vector defines and FIRST_SYSTEM_VECTOR can be adjusted accordingly. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064956.357109...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 5 ++--- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/irq.c | 2 +- arch/x86/kernel/irqinit.c | 5 + 5 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index d0a21b1..a7f36ab 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -482,16 +483,14 @@ static inline void _set_gate(int gate, unsigned type, void *addr, 0, 0, __KERNEL_CS); \ } while (0) -extern int first_system_vector; /* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ extern unsigned long used_vectors[]; static inline void alloc_system_vector(int vector) { + BUG_ON(vector < FIRST_SYSTEM_VECTOR); if (!test_bit(vector, used_vectors)) { set_bit(vector, used_vectors); - if (first_system_vector > vector) - first_system_vector = vector; } else { BUG(); } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 98b3dd8..8996ef1 100644 --- a/arch/x86/kernel/apic/apic.c +++ 
b/arch/x86/kernel/apic/apic.c @@ -177,8 +177,6 @@ static int disable_apic_timer __initdata; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -int first_system_vector = FIRST_SYSTEM_VECTOR; - /* * Debug level, exported for io_apic.c */ diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b3af457..88c214e 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -166,7 +166,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, offset = current_offset; next: vector += 16; - if (vector >= first_system_vector) { + if (vector >= FIRST_SYSTEM_VECTOR) { offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e6073a0..019d0ac 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -429,7 +429,7 @@ int check_irq_vectors_for_cpu_disable(void) * this w/o holding vector_lock. */ for (vector = FIRST_EXTERNAL_VECTOR; -vector < first_system_vector; vector++) { +vector < FIRST_SYSTEM_VECTOR; vector++) { if (!test_bit(vector, used_vectors) && IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { if (++count == this_count) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 6537cfe..4e5f8c0 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -169,10 +169,7 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ i = FIRST_EXTERNAL_VECTOR; -#ifndef CONFIG_X86_LOCAL_APIC -#define first_system_vector NR_VECTORS -#endif - for_each_clear_bit_from(i, used_vectors, first_system_vector) { + for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ set_intr_gate(i, irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR));
[tip:x86/apic] x86/irq: Unexport used_vectors[]
Commit-ID: fa4ab5774dfe58fd5e99462f625253659d41df09 Gitweb: http://git.kernel.org/tip/fa4ab5774dfe58fd5e99462f625253659d41df09 Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:17 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:20 +0200 x86/irq: Unexport used_vectors[] No modular users. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064956.278375...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bf54309..556f8f5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,7 +83,6 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; gate_desc idt_table[NR_VECTORS] __page_aligned_bss; DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); static inline void cond_local_irq_enable(struct pt_regs *regs) {
[tip:x86/apic] x86/fpu: Use bitfield accessors for desc_struct
Commit-ID: 718f5d0030da8669404dab873336b16c169b430b Gitweb: http://git.kernel.org/tip/718f5d0030da8669404dab873336b16c169b430b Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:39 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:25 +0200 x86/fpu: Use bitfield accessors for desc_struct desc_struct is a union of u32 fields and bitfields. The access to the u32 fields is done with magic macros. Convert it to use the bitfields and replace the macro magic with parseable inline functions. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.042406...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/math-emu/fpu_entry.c | 11 +- arch/x86/math-emu/fpu_system.h | 48 +++-- arch/x86/math-emu/get_address.c | 17 --- 3 files changed, 51 insertions(+), 25 deletions(-) diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 0203bae..d4a7df2 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -147,7 +147,7 @@ void math_emulate(struct math_emu_info *info) } code_descriptor = FPU_get_ldt_descriptor(FPU_CS); - if (SEG_D_SIZE(code_descriptor)) { + if (code_descriptor.d) { /* The above test may be wrong, the book is not clear */ /* Segmented 32 bit protected mode */ addr_modes.default_mode = SEG32; @@ -155,11 +155,10 @@ void math_emulate(struct math_emu_info *info) /* 16 bit protected mode */ addr_modes.default_mode = PM16; } - FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); - code_limit = code_base - + (SEG_LIMIT(code_descriptor) + - 1) * SEG_GRANULARITY(code_descriptor) - - 1; + FPU_EIP += code_base = seg_get_base(_descriptor); + code_limit = seg_get_limit(_descriptor) + 1; + code_limit *= seg_get_granularity(_descriptor); + code_limit += code_base - 1; if (code_limit < code_base) code_limit = 0x; } diff 
--git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index a179254..2319a25 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -34,17 +34,43 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) return ret; } -#define SEG_D_SIZE(x) ((x).b & (3 << 21)) -#define SEG_G_BIT(x) ((x).b & (1 << 23)) -#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1) -#define SEG_286_MODE(x)((x).b & ( 0xff00 | 0xf | (1 << 23))) -#define SEG_BASE_ADDR(s) (((s).b & 0xff00) \ -| (((s).b & 0xff) << 16) | ((s).a >> 16)) -#define SEG_LIMIT(s) (((s).b & 0xff) | ((s).a & 0x)) -#define SEG_EXECUTE_ONLY(s)(((s).b & ((1 << 11) | (1 << 9))) == (1 << 11)) -#define SEG_WRITE_PERM(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9)) -#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ -== (1 << 10)) +#define SEG_TYPE_WRITABLE (1U << 1) +#define SEG_TYPE_EXPANDS_DOWN (1U << 2) +#define SEG_TYPE_EXECUTE (1U << 3) +#define SEG_TYPE_EXPAND_MASK (SEG_TYPE_EXPANDS_DOWN | SEG_TYPE_EXECUTE) +#define SEG_TYPE_EXECUTE_MASK (SEG_TYPE_WRITABLE | SEG_TYPE_EXECUTE) + +static inline unsigned long seg_get_base(struct desc_struct *d) +{ + unsigned long base = (unsigned long)d->base2 << 24; + + return base | ((unsigned long)d->base1 << 16) | d->base0; +} + +static inline unsigned long seg_get_limit(struct desc_struct *d) +{ + return ((unsigned long)d->limit << 16) | d->limit0; +} + +static inline unsigned long seg_get_granularity(struct desc_struct *d) +{ + return d->g ? 4096 : 1; +} + +static inline bool seg_expands_down(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXPAND_MASK) == SEG_TYPE_EXPANDS_DOWN; +} + +static inline bool seg_execute_only(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_EXECUTE; +} + +static inline bool seg_writable(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; +} #define I387 (>thread.fpu.state) #define FPU_info
[tip:x86/apic] x86/idt: Unify gate_struct handling for 32/64-bit kernels
Commit-ID: 64b163fab684e3de47aa8db6cc08ae7d2e194373 Gitweb: http://git.kernel.org/tip/64b163fab684e3de47aa8db6cc08ae7d2e194373 Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:37 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:24 +0200 x86/idt: Unify gate_struct handling for 32/64-bit kernels The first 32 bits of gate struct are the same for 32 and 64 bit kernels. The 32-bit version uses desc_struct and no designated data structure, so we need different accessors for 32 and 64 bit kernels. Aside of that the macros which are necessary to build the 32-bit gate descriptor are horrible to read. Unify the gate structs and switch all code fiddling with it over. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064957.861974...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/eboot.c | 8 +++--- arch/x86/include/asm/desc.h | 45 ++- arch/x86/include/asm/desc_defs.h | 57 ++-- arch/x86/kvm/vmx.c | 2 +- arch/x86/xen/enlighten_pv.c | 12 - 5 files changed, 67 insertions(+), 57 deletions(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c3e869e..65f0b24 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1058,7 +1058,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; @@ -1078,7 +1078,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; if (IS_ENABLED(CONFIG_X86_64)) { desc->l = 1; @@ -1099,7 +1099,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 
0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; @@ -1116,7 +1116,7 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = 0; desc->dpl = 0; desc->p = 1; - desc->limit = 0x0; + desc->limit1 = 0x0; desc->avl = 0; desc->l = 0; desc->d = 0; diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index d18a604..0731064 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -84,33 +84,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); } -#ifdef CONFIG_X86_64 - static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, unsigned dpl, unsigned ist, unsigned seg) { - gate->offset_low= PTR_LOW(func); + gate->offset_low= (u16) func; + gate->bits.p= 1; + gate->bits.dpl = dpl; + gate->bits.zero = 0; + gate->bits.type = type; + gate->offset_middle = (u16) (func >> 16); +#ifdef CONFIG_X86_64 gate->segment = __KERNEL_CS; - gate->ist = ist; - gate->p = 1; - gate->dpl = dpl; - gate->zero0 = 0; - gate->zero1 = 0; - gate->type = type; - gate->offset_middle = PTR_MIDDLE(func); - gate->offset_high = PTR_HIGH(func); -} - + gate->bits.ist = ist; + gate->reserved = 0; + gate->offset_high = (u32) (func >> 32); #else -static inline void pack_gate(gate_desc *gate, unsigned char type, -unsigned long base, unsigned dpl, unsigned flags, -unsigned short seg) -{ - gate->a = (seg << 16) | (base & 0x); - gate->b = (base & 0x) | (((0x80 | type | (dpl << 5)) & 0xff) << 8); -} - + gate->segment = seg; + gate->bits.ist = 0; #endif +} static inline int desc_empty(const void *ptr) { @@ -186,7 +178,8 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long
[tip:x86/apic] x86/percpu: Use static initializer for GDT entry
Commit-ID: 1dd439fe97e1a32cbb980c180f1bcb54bb6a2a55 Gitweb: http://git.kernel.org/tip/1dd439fe97e1a32cbb980c180f1bcb54bb6a2a55 Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:38 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:24 +0200 x86/percpu: Use static initializer for GDT entry The IDT cleanup is about to remove pack_descriptor(). The GDT setup for the per-cpu storage can be achieved with the static initializer as well. Replace it. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064957.954214...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 9 +++-- 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 10edd1e..6e8fcb6f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -155,13 +155,10 @@ static void __init pcpup_populate_pte(unsigned long addr) static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 - struct desc_struct gdt; + struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), + 0xFFFFF); - pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, - 0x2 | DESCTYPE_S, 0x8); - gdt.s = 1; - write_gdt_entry(get_cpu_gdt_rw(cpu), - GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); #endif }
[tip:x86/apic] x86/idt: Switch early trap init to IDT tables
Commit-ID: 433f8924fa8e55a50ce57f3b8a33ed095c405644 Gitweb: http://git.kernel.org/tip/433f8924fa8e55a50ce57f3b8a33ed095c405644 Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:50 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:27 +0200 x86/idt: Switch early trap init to IDT tables Add the initialization table for the early trap setup and replace the early trap init code. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.929139...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/idt.c | 53 + arch/x86/kernel/setup.c | 4 ++-- arch/x86/kernel/traps.c | 27 - 3 files changed, 55 insertions(+), 29 deletions(-) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ae6fc12..64e2211 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -48,6 +48,28 @@ struct idt_data { #define TSKG(_vector, _gdt)\ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_idts[] = { + INTG(X86_TRAP_DB, debug), + SYSG(X86_TRAP_BP, int3), +#ifdef CONFIG_X86_32 + INTG(X86_TRAP_PF, page_fault), +#endif +}; + +#ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, page_fault), +}; +#endif + /* Must be page-aligned because the real IDT is used in a fixmap. 
*/ gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; @@ -93,6 +115,37 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size) } /** + * idt_setup_early_traps - Initialize the idt table with early traps + * + * On X8664 these traps do not use interrupt stacks as they can't work + * before cpu_init() is invoked and sets up TSS. The IST variants are + * installed after that. + */ +void __init idt_setup_early_traps(void) +{ + idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts)); + load_idt(&idt_descr); +} + +#ifdef CONFIG_X86_64 +/** + * idt_setup_early_pf - Initialize the idt table with early pagefault handler + * + * On X8664 this does not use interrupt stacks as they can't work before + * cpu_init() is invoked and sets up TSS. The IST variant is installed + * after that. + * + * FIXME: Why is 32bit and 64bit installing the PF handler at different + * places in the early setup code? + */ +void __init idt_setup_early_pf(void) +{ + idt_setup_from_table(idt_table, early_pf_idts, +ARRAY_SIZE(early_pf_idts)); +} +#endif + +/** * idt_setup_early_handler - Initializes the idt table with early handlers */ void __init idt_setup_early_handler(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ecab322..30dc84e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -891,7 +891,7 @@ void __init setup_arch(char **cmdline_p) */ olpc_ofw_detect(); - early_trap_init(); + idt_setup_early_traps(); early_cpu_init(); early_ioremap_init(); @@ -1162,7 +1162,7 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); - early_trap_pf_init(); + idt_setup_early_pf(); /* * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 41f4cd3..835c7e8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -923,33 +923,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) } #endif -/* Set of traps needed for 
early debugging. */ -void __init early_trap_init(void) -{ - /* -* Don't use IST to set DEBUG_STACK as it doesn't work until TSS -* is ready in cpu_init() <-- trap_init(). Before trap_init(), -* CPU runs at ring 0 so it is impossible to hit an invalid -* stack. Using the original stack works well enough at this -* early stage. DEBUG_STACK will be equipped after cpu_init() in -* trap_init(). -*/ - set_intr_gate(X86_TRAP_DB, debug); - /* int3 can be called from all */ - set_system_intr_gate(X86_TRAP_BP, &int3); -#ifdef CONFIG_X86_32 -
[tip:x86/apic] x86/idt: Move debug stack init to table based
Commit-ID: 0a30908b9149b2b332ccf817261125a634765566 Gitweb: http://git.kernel.org/tip/0a30908b9149b2b332ccf817261125a634765566 Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:51 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:27 +0200 x86/idt: Move debug stack init to table based Add the debug_idt init table and make use of it. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064959.006502...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 2 ++ arch/x86/kernel/idt.c | 23 +++ arch/x86/kernel/traps.c | 6 +- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 5a3cdeb..930acd5 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -509,8 +509,10 @@ extern void idt_setup_early_traps(void); #ifdef CONFIG_X86_64 extern void idt_setup_early_pf(void); +extern void idt_setup_debugidt_traps(void); #else static inline void idt_setup_early_pf(void) { } +static inline void idt_setup_debugidt_traps(void) { } #endif extern void idt_invalidate(void *addr); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 64e2211..f5281b8 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -68,6 +68,15 @@ static const __initdata struct idt_data early_idts[] = { static const __initdata struct idt_data early_pf_idts[] = { INTG(X86_TRAP_PF, page_fault), }; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +static const __initdata struct idt_data dbg_idts[] = { + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_BP, int3), +}; #endif /* Must be page-aligned because the real IDT is used in a fixmap. 
*/ @@ -82,6 +91,10 @@ struct desc_ptr idt_descr __ro_after_init = { /* No need to be aligned, but done to keep all IDTs defined the same way. */ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ const struct desc_ptr debug_idt_descr = { .size = IDT_ENTRIES * 16 - 1, .address= (unsigned long) debug_idt_table, @@ -143,6 +156,16 @@ void __init idt_setup_early_pf(void) idt_setup_from_table(idt_table, early_pf_idts, ARRAY_SIZE(early_pf_idts)); } + +/** + * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps + */ +void __init idt_setup_debugidt_traps(void) +{ + memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); + + idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts)); +} #endif /** diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 835c7e8..1492bf5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -990,9 +990,5 @@ void __init trap_init(void) x86_init.irqs.trap_init(); -#ifdef CONFIG_X86_64 - memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); - set_nmi_gate(X86_TRAP_DB, &debug); - set_nmi_gate(X86_TRAP_BP, &int3); -#endif + idt_setup_debugidt_traps(); }
[tip:x86/apic] x86/idt: Hide set_intr_gate()
Commit-ID: facaa3e3c813848e6b49ee37a42a3688832e63cd Gitweb: http://git.kernel.org/tip/facaa3e3c813848e6b49ee37a42a3688832e63cd Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:59 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:29 +0200 x86/idt: Hide set_intr_gate() set_intr_gate() is an internal function of the IDT code. The only user left is the KVM code which replaces the pagefault handler eventually. Provide an explicit update_intr_gate() function and make set_intr_gate() static. While at it replace the magic number 14 in the KVM code with the proper trap define. Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064959.663008...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 2 +- arch/x86/kernel/idt.c | 33 - arch/x86/kernel/kvm.c | 2 +- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 33f84f2..1a2ba36 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -390,7 +390,7 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -void set_intr_gate(unsigned int n, const void *addr); +void update_intr_gate(unsigned int n, const void *addr); void alloc_intr_gate(unsigned int n, const void *addr); extern unsigned long used_vectors[]; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index b609eac..61b490c 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -225,6 +225,22 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy } } +static void set_intr_gate(unsigned int n, const void *addr) +{ + struct idt_data data; + + BUG_ON(n > 0xFF); + + memset(&data, 0, sizeof(data)); + data.vector = 
n; + 
data.addr = addr; + data.segment = __KERNEL_CS; + data.bits.type = GATE_INTERRUPT; + data.bits.p = 1; + + idt_setup_from_table(idt_table, &data, 1, false); +} + /** * idt_setup_early_traps - Initialize the idt table with early traps * @@ -336,20 +352,11 @@ void idt_invalidate(void *addr) load_idt(&idt); } -void set_intr_gate(unsigned int n, const void *addr) +void __init update_intr_gate(unsigned int n, const void *addr) { - struct idt_data data; - - BUG_ON(n > 0xFF); - - memset(&data, 0, sizeof(data)); - data.vector = n; - data.addr = addr; - data.segment = __KERNEL_CS; - data.bits.type = GATE_INTERRUPT; - data.bits.p = 1; - - idt_setup_from_table(idt_table, &data, 1, false); + if (WARN_ON_ONCE(!test_bit(n, used_vectors))) + return; + set_intr_gate(n, addr); } void alloc_intr_gate(unsigned int n, const void *addr) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6ed9242..874827b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -455,7 +455,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static void __init kvm_apf_trap_init(void) { - set_intr_gate(14, async_page_fault); + update_intr_gate(X86_TRAP_PF, async_page_fault); } void __init kvm_guest_init(void)
[tip:x86/apic] x86/idt: Simplify alloc_intr_gate()
Commit-ID: 4447ac1195a845b18f2f427686f116ab77c5b268 Gitweb: http://git.kernel.org/tip/4447ac1195a845b18f2f427686f116ab77c5b268 Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:58 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:28 +0200 x86/idt: Simplify alloc_intr_gate() The only users of alloc_intr_gate() are hypervisors, which both check the used_vectors bitmap whether they have allocated the gate already. Move that check into alloc_intr_gate() and simplify the users. Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Reviewed-by: K. Y. Srinivasan Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064959.580830...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mshyperv.c | 9 ++--- arch/x86/kernel/idt.c| 6 +++--- drivers/xen/events/events_base.c | 6 ++ 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 70e717f..9fc3265 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -59,13 +59,8 @@ void hyperv_vector_handler(struct pt_regs *regs) void hv_setup_vmbus_irq(void (*handler)(void)) { vmbus_handler = handler; - /* -* Setup the IDT for hypervisor callback. Prevent reallocation -* at module reload. 
-*/ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - hyperv_callback_vector); + /* Setup the IDT for hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); } void hv_remove_vmbus_irq(void) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 8e9318d..b609eac 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -354,7 +354,7 @@ void set_intr_gate(unsigned int n, const void *addr) void alloc_intr_gate(unsigned int n, const void *addr) { - BUG_ON(test_bit(n, used_vectors) || n < FIRST_SYSTEM_VECTOR); - set_bit(n, used_vectors); - set_intr_gate(n, addr); + BUG_ON(n < FIRST_SYSTEM_VECTOR); + if (!test_and_set_bit(n, used_vectors)) + set_intr_gate(n, addr); } diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 2d43118..1ab4bd1 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1653,10 +1653,8 @@ void xen_callback_vector(void) return; } pr_info("Xen HVM callback vector for event delivery is enabled\n"); - /* in the restore case the vector has already been allocated */ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - xen_hvm_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + xen_hvm_callback_vector); } } #else
[tip:x86/apic] x86/irq: Remove vector_used_by_percpu_irq()
Commit-ID: 69de72ec6db950c436e36b94cf05eeb9e11ee144 Gitweb: http://git.kernel.org/tip/69de72ec6db950c436e36b94cf05eeb9e11ee144 Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:16 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:20 +0200 x86/irq: Remove vector_used_by_percpu_irq() Last user (lguest) is gone. Remove it. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064956.201432...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq.h | 1 - arch/x86/kernel/irq.c | 2 -- arch/x86/kernel/irqinit.c | 12 3 files changed, 15 deletions(-) diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 668cca5..ce99168 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -44,7 +44,6 @@ extern __visible unsigned int do_IRQ(struct pt_regs *regs); /* Interrupt vector management */ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); -extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4ed0aba..e6073a0 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -346,8 +346,6 @@ __visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) set_irq_regs(old_regs); } -EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); - #ifdef CONFIG_HOTPLUG_CPU /* These two declarations are only used in check_irq_vectors_for_cpu_disable() diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c7fd185..6537cfe 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -55,18 +55,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... 
NR_VECTORS - 1] = VECTOR_UNUSED, }; -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) - return 1; - } - - return 0; -} - void __init init_ISA_irqs(void) { struct irq_chip *chip = legacy_pic->chip;
[tip:x86/apic] x86/irq: Remove duplicated used_vectors definition
Commit-ID: 9aec458ff07323f6593fd718cc33b1bca2f64597 Gitweb: http://git.kernel.org/tip/9aec458ff07323f6593fd718cc33b1bca2f64597 Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:19 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:21 +0200 x86/irq: Remove duplicated used_vectors definition Also remove the unparseable comment in the other place while at it. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064956.436711...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 1 - arch/x86/include/asm/irq.h | 3 --- 2 files changed, 4 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index a7f36ab..71094f2 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -483,7 +483,6 @@ static inline void _set_gate(int gate, unsigned type, void *addr, 0, 0, __KERNEL_CS); \ } while (0) -/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ extern unsigned long used_vectors[]; static inline void alloc_system_vector(int vector) diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index ce99168..9958cee 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -42,9 +42,6 @@ extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); -/* Interrupt vector management */ -extern DECLARE_BITMAP(used_vectors, NR_VECTORS); - extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC
[tip:x86/apic] x86/idt: Remove unused set_trap_gate()
Commit-ID: 8f55868f9e42fea56021b17421914b9e4fda4960 Gitweb: http://git.kernel.org/tip/8f55868f9e42fea56021b17421914b9e4fda4960 Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:45 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:26 +0200 x86/idt: Remove unused set_trap_gate() This inline is not used at all. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.522053...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 12 1 file changed, 12 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 108a9e8..51b3d48 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -446,18 +446,6 @@ static inline void set_system_intr_gate(unsigned int n, void *addr) _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); } -static inline void set_system_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); -} - static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) { BUG_ON((unsigned)n > 0xFF);
[tip:x86/apic] x86/idt: Move early IDT setup out of 32-bit asm
Commit-ID: 87e81786b13b267c4355e0d23e33c7e4c08fa63f Gitweb: http://git.kernel.org/tip/87e81786b13b267c4355e0d23e33c7e4c08fa63f Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:48 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:26 +0200 x86/idt: Move early IDT setup out of 32-bit asm The early IDT setup can be done in C code like it's done on 64-bit kernels. Reuse the 64-bit version. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.757980...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/segment.h | 1 + arch/x86/kernel/head32.c | 4 arch/x86/kernel/head_32.S | 36 ++-- arch/x86/kernel/idt.c | 4 4 files changed, 11 insertions(+), 34 deletions(-) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5a602d6..066aaf8 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -238,6 +238,7 @@ #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; +extern void early_ignore_irq(void); /* * Load a segment. 
Fall back on loading the zero segment if something goes diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 538ec01..cf2ce06 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -30,6 +31,9 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { cr4_init_shadow(); + + idt_setup_early_handler(); + sanitize_boot_params(&boot_params); x86_early_init_platform_quirks(); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ce8c6ed..a615a5e 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -345,7 +345,6 @@ ENTRY(startup_32_smp) movl %eax,%cr0 lgdt early_gdt_descr - lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. @@ -378,37 +377,6 @@ ENDPROC(startup_32_smp) */ __INIT setup_once: - /* -* Set up a idt with 256 interrupt gates that push zero if there -* is no error code and then jump to early_idt_handler_common. -* It doesn't actually load the idt - that needs to be done on -* each CPU. Interrupts are enabled elsewhere, when we can be -* relatively sure everything is ok. -*/ - - movl $idt_table,%edi - movl $early_idt_handler_array,%eax - movl $NUM_EXCEPTION_VECTORS,%ecx -1: - movl %eax,(%edi) - movl %eax,4(%edi) - /* interrupt gate, dpl=0, present */ - movl $(0x8E00 + __KERNEL_CS),2(%edi) - addl $EARLY_IDT_HANDLER_SIZE,%eax - addl $8,%edi - loop 1b - - movl $256 - NUM_EXCEPTION_VECTORS,%ecx - movl $ignore_int,%edx - movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax /* selector = 0x0010 = cs */ - movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ -2: - movl %eax,(%edi) - movl %edx,4(%edi) - addl $8,%edi - loop 2b - #ifdef CONFIG_CC_STACKPROTECTOR /* * Configure the stack canary. 
The linker can't handle this by @@ -498,7 +466,7 @@ ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ ALIGN -ignore_int: +ENTRY(early_ignore_irq) cld #ifdef CONFIG_PRINTK pushl %eax @@ -533,7 +501,7 @@ ignore_int: hlt_loop: hlt jmp hlt_loop -ENDPROC(ignore_int) +ENDPROC(early_ignore_irq) __INITDATA .align 4 GLOBAL(early_recursion_flag) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index a147581..70ca248 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -34,6 +34,10 @@ void __init idt_setup_early_handler(void) for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) set_intr_gate(i, early_idt_handler_array[i]); +#ifdef CONFIG_X86_32 + for ( ; i < NR_VECTORS; i++) + set_intr_gate(i, early_ignore_irq); +#endif load_idt(&idt_descr); }
[tip:x86/apic] x86/idt: Prepare for table based init
Commit-ID: 3318e9744244a415ee9481ca7e54234caf5e12c5 Gitweb: http://git.kernel.org/tip/3318e9744244a415ee9481ca7e54234caf5e12c5 Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:49 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:27 +0200 x86/idt: Prepare for table based init The IDT setup code is handled in several places. All of them use variants of set_intr_gate() inlines. This can be done with a table based initialization, which allows to reduce the inline zoo and puts all IDT related code and information into a single place. Add the infrastructure. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.849877...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/idt.c | 67 +++ 1 file changed, 67 insertions(+) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 70ca248..ae6fc12 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -5,8 +5,49 @@ */ #include +#include +#include #include +struct idt_data { + unsigned int vector; + unsigned int segment; + struct idt_bits bits; + const void *addr; +}; + +#define DPL0 0x0 +#define DPL3 0x3 + +#define DEFAULT_STACK 0 + +#define G(_vector, _addr, _ist, _type, _dpl, _segment) \ + { \ + .vector = _vector, \ + .bits.ist = _ist, \ + .bits.type = _type, \ + .bits.dpl = _dpl, \ + .bits.p = 1, \ + .addr = _addr, \ + .segment = _segment, \ + } + +/* Interrupt gate */ +#define INTG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate */ +#define SYSG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* Interrupt gate with interrupt stack */ +#define ISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* Task gate */ +#define TSKG(_vector, _gdt) \ + 
G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) + /* Must be page-aligned because the real IDT is used in a fixmap. */ gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; @@ -25,6 +66,32 @@ const struct desc_ptr debug_idt_descr = { }; #endif +static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) +{ + unsigned long addr = (unsigned long) d->addr; + + gate->offset_low = (u16) addr; + gate->segment = (u16) d->segment; + gate->bits = d->bits; + gate->offset_middle = (u16) (addr >> 16); +#ifdef CONFIG_X86_64 + gate->offset_high = (u32) (addr >> 32); + gate->reserved = 0; +#endif +} + +static __init void +idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size) +{ + gate_desc desc; + + for (; size > 0; t++, size--) { + idt_init_desc(&desc, t); + set_bit(t->vector, used_vectors); + write_idt_entry(idt, t->vector, &desc); + } +} + /** * idt_setup_early_handler - Initializes the idt table with early handlers */
[tip:x86/apic] x86/idt: Consolidate IDT invalidation
Commit-ID: e802a51ede91350438c051da2f238f5e8c918ead Gitweb: http://git.kernel.org/tip/e802a51ede91350438c051da2f238f5e8c918ead Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:46 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:26 +0200 x86/idt: Consolidate IDT invalidation kexec and reboot have both code to invalidate IDT. Create a common function and use it. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.600953...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h| 3 +++ arch/x86/kernel/idt.c | 11 +++ arch/x86/kernel/machine_kexec_32.c | 14 +- arch/x86/kernel/reboot.c | 4 +--- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 51b3d48..33aff45 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -503,4 +503,7 @@ static inline void load_current_idt(void) else load_idt((const struct desc_ptr *)&idt_descr); } + +extern void idt_invalidate(void *addr); + #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 86e5912..cd4658c 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -24,3 +24,14 @@ const struct desc_ptr debug_idt_descr = { .address = (unsigned long) debug_idt_table, }; #endif + +/** + * idt_invalidate - Invalidate interrupt descriptor table + * @addr: The virtual address of the 'invalid' IDT + */ +void idt_invalidate(void *addr) +{ + struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; + + load_idt(&idt); +} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8c53c5d..00bc751 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -26,18 +26,6 @@ #include #include -static void 
set_idt(void *newidt, __u16 limit) -{ - struct desc_ptr curidt; - - /* ia32 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - load_idt(&curidt); -} - - static void set_gdt(void *newgdt, __u16 limit) { struct desc_ptr curgdt; @@ -245,7 +233,7 @@ void machine_kexec(struct kimage *image) * If you want to load them you must set up your own idt & gdt. */ set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); + idt_invalidate(phys_to_virt(0)); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a56bf60..54984b1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -38,8 +38,6 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -static const struct desc_ptr no_idt = {}; - /* * This is set if we need to go through the 'emergency' path. * When machine_emergency_restart() is called, we may be on @@ -638,7 +636,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_TRIPLE: - load_idt(&no_idt); + idt_invalidate(NULL); __asm__ __volatile__("int3"); /* We're probably dead after this, but... */
[tip:x86/apic] x86/idt: Move early IDT handler setup to IDT code
Commit-ID: 588787fde7aa346f345e1a7600f84d88039fc9df Gitweb: http://git.kernel.org/tip/588787fde7aa346f345e1a7600f84d88039fc9df Author: Thomas Gleixner AuthorDate: Mon, 28 Aug 2017 08:47:47 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:26 +0200 x86/idt: Move early IDT handler setup to IDT code The early IDT handler setup is done in C entry code on 64-bit kernels and in ASM entry code on 32-bit kernels. Move the 64-bit variant to the IDT code so it can be shared with 32-bit in the next step. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.679561...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 9 + arch/x86/kernel/head64.c| 6 +- arch/x86/kernel/idt.c | 12 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 33aff45..5a3cdeb 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -504,6 +504,15 @@ static inline void load_current_idt(void) load_idt((const struct desc_ptr *)&idt_descr); } +extern void idt_setup_early_handler(void); +extern void idt_setup_early_traps(void); + +#ifdef CONFIG_X86_64 +extern void idt_setup_early_pf(void); +#else +static inline void idt_setup_early_pf(void) { } +#endif + extern void idt_invalidate(void *addr); #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 9ba7954..d6ab034 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -255,8 +255,6 @@ static void __init copy_bootdata(char *real_mode_data) asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) { - int i; - /* * Build-time sanity checks on the kernel image and module * area mappings. 
(these are purely build-time and produce no code) @@ -282,9 +280,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) kasan_early_init(); - for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handler_array[i]); - load_idt((const struct desc_ptr *)&idt_descr); + idt_setup_early_handler(); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index cd4658c..a147581 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -26,6 +26,18 @@ const struct desc_ptr debug_idt_descr = { #endif /** + * idt_setup_early_handler - Initializes the idt table with early handlers + */ +void __init idt_setup_early_handler(void) +{ + int i; + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) + set_intr_gate(i, early_idt_handler_array[i]); + load_idt(&idt_descr); +} + +/** * idt_invalidate - Invalidate interrupt descriptor table * @addr: The virtual address of the 'invalid' IDT */
[tip:x86/apic] x86/apic: Remove the duplicated tracing versions of interrupts
Commit-ID: 61069de7a3252be0b1f567fe9e0b4723f1d2814f Gitweb: http://git.kernel.org/tip/61069de7a3252be0b1f567fe9e0b4723f1d2814f Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:26 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:25 +0200 x86/apic: Remove the duplicated tracing versions of interrupts The error and the spurious interrupt are really rare events and not at all performance sensitive: two NOP5s can be tolerated when tracing is disabled. Remove the complication. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170828064956.986009...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 4 ++-- arch/x86/kernel/apic/apic.c | 43 ++- 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a7e45d1..b094b87 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -48,15 +48,15 @@ extern asmlinkage void call_function_single_interrupt(void); #ifdef CONFIG_TRACING /* Interrupt handlers registered during init_IRQ */ -extern void trace_error_interrupt(void); extern void trace_irq_work_interrupt(void); -extern void trace_spurious_interrupt(void); extern void trace_thermal_interrupt(void); extern void trace_reschedule_interrupt(void); extern void trace_threshold_interrupt(void); extern void trace_deferred_error_interrupt(void); extern void trace_call_function_interrupt(void); extern void trace_call_function_single_interrupt(void); +#define trace_error_interrupt error_interrupt +#define trace_spurious_interrupt spurious_interrupt #define trace_x86_platform_ipi x86_platform_ipi #define trace_apic_timer_interrupt apic_timer_interrupt #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a33fa44..eebee4c 100644 
--- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1899,10 +1899,14 @@ void __init register_lapic_address(unsigned long address) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -static void __smp_spurious_interrupt(u8 vector) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { + u8 vector = ~regs->orig_ax; u32 v; + entering_irq(); + trace_spurious_apic_entry(vector); + /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1917,22 +1921,7 @@ static void __smp_spurious_interrupt(u8 vector) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt through vector %02x on CPU#%d, " "should never happen.\n", vector, smp_processor_id()); -} -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_spurious_interrupt(~regs->orig_ax); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) -{ - u8 vector = ~regs->orig_ax; - - entering_irq(); - trace_spurious_apic_entry(vector); - __smp_spurious_interrupt(vector); trace_spurious_apic_exit(vector); exiting_irq(); } @@ -1940,10 +1929,8 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -static void __smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { - u32 v; - u32 i = 0; static const char * const error_interrupt_reason[] = { "Send CS error",/* APIC Error Bit 0 */ "Receive CS error", /* APIC Error Bit 1 */ @@ -1954,6 +1941,10 @@ static void __smp_error_interrupt(struct pt_regs *regs) "Received illegal vector", /* APIC Error Bit 6 */ "Illegal register address", /* APIC Error Bit 7 */ }; + u32 v, i = 0; + + entering_irq(); + trace_error_apic_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. 
-- REW */ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ @@ -1975,20 +1966,6 @@ static void __smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); -} - -__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_error_interrupt(regs); - exiting_irq(); -} - -__visible void __irq_entry
[tip:x86/apic] x86/irq: Get rid of duplicated trace_x86_platform_ipi() code
Commit-ID: 8a17116b1fddc1f414cd4dd5e86fa239fcdb5208 Gitweb: http://git.kernel.org/tip/8a17116b1fddc1f414cd4dd5e86fa239fcdb5208 Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:25 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:25 +0200 x86/irq: Get rid of duplicated trace_x86_platform_ipi() code Two NOP5s are really a good tradeoff vs. the unholy IDT switching mess, which duplicates code all over the place. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170828064956.907209...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/kernel/irq.c | 25 + 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 44137bb..a7e45d1 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -48,7 +48,6 @@ extern asmlinkage void call_function_single_interrupt(void); #ifdef CONFIG_TRACING /* Interrupt handlers registered during init_IRQ */ -extern void trace_x86_platform_ipi(void); extern void trace_error_interrupt(void); extern void trace_irq_work_interrupt(void); extern void trace_spurious_interrupt(void); @@ -58,6 +57,7 @@ extern void trace_threshold_interrupt(void); extern void trace_deferred_error_interrupt(void); extern void trace_call_function_interrupt(void); extern void trace_call_function_single_interrupt(void); +#define trace_x86_platform_ipi x86_platform_ipi #define trace_apic_timer_interrupt apic_timer_interrupt #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt #define trace_reboot_interrupt reboot_interrupt diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 019d0ac..befdd4a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -262,20 +262,16 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) /* * Handler for 
X86_PLATFORM_IPI_VECTOR. */ -void __smp_x86_platform_ipi(void) -{ - inc_irq_stat(x86_platform_ipis); - - if (x86_platform_ipi_callback) - x86_platform_ipi_callback(); -} - __visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); entering_ack_irq(); - __smp_x86_platform_ipi(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + inc_irq_stat(x86_platform_ipis); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); exiting_irq(); set_irq_regs(old_regs); } @@ -334,17 +330,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) } #endif -__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); - trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); - __smp_x86_platform_ipi(); - trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); - exiting_irq(); - set_irq_regs(old_regs); -} #ifdef CONFIG_HOTPLUG_CPU
[tip:x86/apic] x86/apic: Remove the duplicated tracing version of local_timer_interrupt()
Commit-ID: 302a98f896bbd2feb1393d98e8b9febeb101db6e Gitweb: http://git.kernel.org/tip/302a98f896bbd2feb1393d98e8b9febeb101db6e Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:23 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:24 +0200 x86/apic: Remove the duplicated tracing version of local_timer_interrupt() The two NOP5s are noise in the rest of the work which is done by the timer interrupt and modern CPUs are pretty good in optimizing NOPs anyway. Get rid of the interrupt handler duplication and move the tracepoints into the regular handler. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170828064956.751247...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/kernel/apic/apic.c | 19 --- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index d6dbafb..44137bb 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -48,7 +48,6 @@ extern asmlinkage void call_function_single_interrupt(void); #ifdef CONFIG_TRACING /* Interrupt handlers registered during init_IRQ */ -extern void trace_apic_timer_interrupt(void); extern void trace_x86_platform_ipi(void); extern void trace_error_interrupt(void); extern void trace_irq_work_interrupt(void); @@ -59,6 +58,7 @@ extern void trace_threshold_interrupt(void); extern void trace_deferred_error_interrupt(void); extern void trace_call_function_interrupt(void); extern void trace_call_function_single_interrupt(void); +#define trace_apic_timer_interrupt apic_timer_interrupt #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt #define trace_reboot_interrupt reboot_interrupt #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8996ef1..7a57b54 100644 --- 
a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1038,25 +1038,6 @@ __visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) * interrupt lock, which is the WrongThing (tm) to do. */ entering_ack_irq(); - local_apic_timer_interrupt(); - exiting_irq(); - - set_irq_regs(old_regs); -} - -__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* -* NOTE! We'd better ACK the irq immediately, -* because timer handling can be slow. -* -* update_process_times() expects us to have done irq_enter(). -* Besides, if we don't timer interrupts ignore the global -* interrupt lock, which is the WrongThing (tm) to do. -*/ - entering_ack_irq(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR);
[tip:x86/apic] x86/apic: Use this_cpu_ptr() in local_timer_interrupt()
Commit-ID: 3bec6def39e32609e01a68b43476ee1f1c512eaa Gitweb: http://git.kernel.org/tip/3bec6def39e32609e01a68b43476ee1f1c512eaa Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:24 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 11:42:24 +0200 x86/apic: Use this_cpu_ptr() in local_timer_interrupt() Accessing the per cpu data via per_cpu(, smp_processor_id()) is pointless. Use this_cpu_ptr() instead. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064956.829552...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7a57b54..a33fa44 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -988,8 +988,7 @@ void setup_secondary_APIC_clock(void) */ static void local_apic_timer_interrupt(void) { - int cpu = smp_processor_id(); - struct clock_event_device *evt = _cpu(lapic_events, cpu); + struct clock_event_device *evt = this_cpu_ptr(_events); /* * Normally we should not be here till LAPIC has been initialized but @@ -1003,7 +1002,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); + pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return;
[tip:x86/apic] x86/asm: Replace access to desc_struct:a/b fields
Commit-ID: 9a98e7780022aa7cd201eb8a88a4f1d607b73cde Gitweb: http://git.kernel.org/tip/9a98e7780022aa7cd201eb8a88a4f1d607b73cde Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:40 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:25 +0200 x86/asm: Replace access to desc_struct:a/b fields The union inside of desc_struct which allows access to the raw u32 parts of the descriptors. This raw access part is about to go away. Replace the few code parts which access those fields. Signed-off-by: Thomas Gleixner Reviewed-by: Boris Ostrovsky Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.120214...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xen/hypercall.h | 6 -- arch/x86/kernel/tls.c| 2 +- arch/x86/xen/enlighten_pv.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 11071fc..9606688 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -552,6 +552,8 @@ static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { + u32 *p = (u32 *) + mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; @@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, } else { mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; - mcl->args[2] = desc.a; - mcl->args[3] = desc.b; + mcl->args[2] = *p++; + mcl->args[3] = *p; } trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 
2 : 4); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index dcd699b..a106b97 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx, while (n-- > 0) { if (LDT_empty(info) || LDT_zero(info)) { - desc->a = desc->b = 0; + memset(desc, 0, sizeof(*desc)); } else { fill_ldt(desc, info); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 4c5d72b..03fb07d 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -494,7 +494,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) static inline bool desc_equal(const struct desc_struct *d1, const struct desc_struct *d2) { - return d1->a == d2->a && d1->b == d2->b; + return !memcmp(d1, d2, sizeof(*d1)); } static void load_TLS_descriptor(struct thread_struct *t,
[tip:x86/apic] x86/gdt: Use bitfields for initialization
Commit-ID: 38e9e81f4c81c75799b002d5811de7241b307676 Gitweb: http://git.kernel.org/tip/38e9e81f4c81c75799b002d5811de7241b307676 Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:41 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:25 +0200 x86/gdt: Use bitfields for initialization The GDT entry related code uses two ways to access entries via union fields: - bitfields - macros which initialize the two 16-bit parts of the entry by magic shift and mask operations. Clean it up and only use the bitfields to initialize and access entries. ( The old access patterns were partly done due to GCC optimizing bitfield accesses in a horrible way - that's mostly fixed these days and clarity of code in such low level accessors is very important. ) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.197673...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c| 2 +- arch/x86/include/asm/desc.h | 26 +++- arch/x86/include/asm/desc_defs.h | 44 ++-- arch/x86/math-emu/fpu_system.h | 2 +- 4 files changed, 38 insertions(+), 36 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 726355c..1911310 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -351,7 +351,7 @@ static void vgetcpu_cpu_init(void *arg) * and 8 bits for the node) */ d.limit0 = cpu | ((node & 0xf) << 12); - d.limit = node >> 4; + d.limit1 = node >> 4; d.type = 5; /* RO data, expand down, accessed */ d.dpl = 3; /* Visible to user code */ d.s = 1;/* Not a system segment */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 0731064..2090cd2 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -23,7 +23,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in 
desc->s = 1; desc->dpl = 0x3; desc->p = info->seg_not_present ^ 1; - desc->limit = (info->limit & 0xf) >> 16; + desc->limit1= (info->limit & 0xf) >> 16; desc->avl = info->useable; desc->d = info->seg_32bit; desc->g = info->limit_in_pages; @@ -170,14 +170,20 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, unsigned long limit, unsigned char type, unsigned char flags) { - desc->a = ((base & 0x) << 16) | (limit & 0x); - desc->b = (base & 0xff00) | ((base & 0xff) >> 16) | - (limit & 0x000f) | ((type & 0xff) << 8) | - ((flags & 0xf) << 20); - desc->p = 1; + desc->limit0= (u16) limit; + desc->base0 = (u16) base; + desc->base1 = (base >> 16) & 0xFF; + desc->type = type & 0x0F; + desc->s = 0; + desc->dpl = 0; + desc->p = 1; + desc->limit1= (limit >> 16) & 0xF; + desc->avl = (flags >> 0) & 0x01; + desc->l = (flags >> 1) & 0x01; + desc->d = (flags >> 2) & 0x01; + desc->g = (flags >> 3) & 0x01; } - static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) { @@ -195,7 +201,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, desc->base2 = (addr >> 24) & 0xFF; desc->base3 = (u32) (addr >> 32); #else - pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); + pack_descriptor((struct desc_struct *)d, addr, size, type, 0); #endif } @@ -395,13 +401,13 @@ static inline void set_desc_base(struct desc_struct *desc, unsigned long base) static inline unsigned long get_desc_limit(const struct desc_struct *desc) { - return desc->limit0 | (desc->limit << 16); + return desc->limit0 | (desc->limit1 << 16); } static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) { desc->limit0 = limit & 0x; - desc->limit = (limit >> 16) & 0xf; + desc->limit1 = (limit >> 16) & 0xf;
[tip:x86/apic] x86/ldttss: Clean up 32-bit descriptors
Commit-ID: 87cc037674342cbf6213829b2cc59bb71be60777 Gitweb: http://git.kernel.org/tip/87cc037674342cbf6213829b2cc59bb71be60777 Author: Thomas GleixnerAuthorDate: Mon, 28 Aug 2017 08:47:42 +0200 Committer: Ingo Molnar CommitDate: Tue, 29 Aug 2017 12:07:25 +0200 x86/ldttss: Clean up 32-bit descriptors Like the IDT descriptors, the LDT/TSS descriptors are pointlessly different on 32 and 64 bit kernels. Unify them and get rid of the duplicated code. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170828064958.289634...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 26 +++--- arch/x86/include/asm/desc_defs.h | 27 --- 2 files changed, 15 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 2090cd2..108a9e8 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -166,42 +166,22 @@ native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int memcpy([entry], desc, size); } -static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, - unsigned long limit, unsigned char type, - unsigned char flags) -{ - desc->limit0= (u16) limit; - desc->base0 = (u16) base; - desc->base1 = (base >> 16) & 0xFF; - desc->type = type & 0x0F; - desc->s = 0; - desc->dpl = 0; - desc->p = 1; - desc->limit1= (limit >> 16) & 0xF; - desc->avl = (flags >> 0) & 0x01; - desc->l = (flags >> 1) & 0x01; - desc->d = (flags >> 2) & 0x01; - desc->g = (flags >> 3) & 0x01; -} - static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) { -#ifdef CONFIG_X86_64 - struct ldttss_desc64 *desc = d; + struct ldttss_desc *desc = d; memset(desc, 0, sizeof(*desc)); - desc->limit0= size & 0x; + desc->limit0= (u16) size; desc->base0 = (u16) addr; desc->base1 
= (addr >> 16) & 0xFF; desc->type = type; desc->p = 1; desc->limit1= (size >> 16) & 0xF; desc->base2 = (addr >> 24) & 0xFF; +#ifdef CONFIG_X86_64 desc->base3 = (u32) (addr >> 32); -#else - pack_descriptor((struct desc_struct *)d, addr, size, type, 0); #endif } diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 1b9494e..346d252 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -49,24 +49,21 @@ enum { DESCTYPE_S = 0x10, /* !system */ }; -/* LDT or TSS descriptor in the GDT. 16 bytes. */ -struct ldttss_desc64 { - u16 limit0; - u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; - u32 base3; - u32 zero1; -} __attribute__((packed)); - +/* LDT or TSS descriptor in the GDT. */ +struct ldttss_desc { + u16 limit0; + u16 base0; + u16 base1 : 8, type : 5, dpl : 2, p : 1; + u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8; #ifdef CONFIG_X86_64 -typedef struct ldttss_desc64 ldt_desc; -typedef struct ldttss_desc64 tss_desc; -#else -typedef struct desc_struct ldt_desc; -typedef struct desc_struct tss_desc; + u32 base3; + u32 zero1; #endif +} __attribute__((packed)); + +typedef struct ldttss_desc ldt_desc; +typedef struct ldttss_desc tss_desc; struct idt_bits { u16 ist : 3,
[tip:irq/urgent] genirq/cpuhotplug: Add sanity check for effective affinity mask
Commit-ID: 60b09c51bb4fb46e2331fdbb39f91520f31d35f7 Gitweb: https://git.kernel.org/tip/60b09c51bb4fb46e2331fdbb39f91520f31d35f7 Author: Thomas GleixnerAuthorDate: Mon, 9 Oct 2017 12:47:24 +0200 Committer: Thomas Gleixner CommitDate: Mon, 9 Oct 2017 13:26:48 +0200 genirq/cpuhotplug: Add sanity check for effective affinity mask The effective affinity mask handling has no safety net when the mask is not updated by the interrupt chip or the mask contains offline CPUs. If that happens the CPU unplug code fails to migrate interrupts. Add sanity checks and emit a warning when the mask contains only offline CPUs. Fixes: 415fcf1a2293 ("genirq/cpuhotplug: Use effective affinity mask") Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Christoph Hellwig Cc: sta...@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710042208400.2406@nanos --- kernel/irq/cpuhotplug.c | 28 +++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 638eb9c..9eb09ae 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -18,8 +18,34 @@ static inline bool irq_needs_fixup(struct irq_data *d) { const struct cpumask *m = irq_data_get_effective_affinity_mask(d); + unsigned int cpu = smp_processor_id(); - return cpumask_test_cpu(smp_processor_id(), m); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + /* +* The cpumask_empty() check is a workaround for interrupt chips, +* which do not implement effective affinity, but the architecture has +* enabled the config switch. Use the general affinity mask instead. +*/ + if (cpumask_empty(m)) + m = irq_data_get_affinity_mask(d); + + /* +* Sanity check. If the mask is not empty when excluding the outgoing +* CPU then it must contain at least one online CPU. The outgoing CPU +* has been removed from the online mask already. 
+*/ + if (cpumask_any_but(m, cpu) < nr_cpu_ids && + cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) { + /* +* If this happens then there was a missed IRQ fixup at some +* point. Warn about it and enforce fixup. +*/ + pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n", + cpumask_pr_args(m), d->irq, cpu); + return true; + } +#endif + return cpumask_test_cpu(cpu, m); } static bool migrate_one_irq(struct irq_desc *desc)
[tip:irq/urgent] genirq/cpuhotplug: Enforce affinity setting on startup of managed irqs
Commit-ID: e43b3b58548051f8809391eb7bec7a27ed3003ea Gitweb: https://git.kernel.org/tip/e43b3b58548051f8809391eb7bec7a27ed3003ea Author: Thomas Gleixner AuthorDate: Wed, 4 Oct 2017 21:07:38 +0200 Committer: Thomas Gleixner CommitDate: Mon, 9 Oct 2017 13:26:48 +0200 genirq/cpuhotplug: Enforce affinity setting on startup of managed irqs Managed interrupts can end up in a stale state on CPU hotplug. If the interrupt is not targeting a single CPU, i.e. the affinity mask spans multiple CPUs then the following can happen: After boot: dstate: 0x01601200 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 24 pending: 0 After offlining CPU 31 - 24 dstate: 0x01a31000 IRQD_IRQ_DISABLED IRQD_IRQ_MASKED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_AFFINITY_MANAGED IRQD_MANAGED_SHUTDOWN node: 0 affinity: 24-31 effectiv: 24 pending: 0 Now CPU 25 gets onlined again, so it should get the effective interrupt affinity for this interrupt, but due to the x86 interrupt affinity setter restrictions this ends up after restarting the interrupt with: dstate: 0x01601300 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_SETAFFINITY_PENDING IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 24 pending: 24-31 So the interrupt is still affine to CPU 24, which was the last CPU to go offline of that affinity set and the move to an online CPU within 24-31, in this case 25, is pending. This mechanism is x86/ia64 specific as those architectures cannot move interrupts from thread context and do this when an interrupt is actually handled. So the move is set to pending. 
What's worse is that offlining CPU 25 again results in: dstate: 0x01601300 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_SET IRQD_SETAFFINITY_PENDING IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 24 pending: 24-31 This means the interrupt has not been shut down, because the outgoing CPU is not in the effective affinity mask, but of course nothing notices that the effective affinity mask is pointing at an offline CPU. In the case of restarting a managed interrupt the move restriction does not apply, so the affinity setting can be made unconditional. This needs to be done _before_ the interrupt is started up as otherwise the condition for moving it from thread context would no longer be fulfilled. With that change applied onlining CPU 25 after offlining 31-24 results in: dstate: 0x01600200 IRQD_ACTIVATED IRQD_IRQ_STARTED IRQD_SINGLE_TARGET IRQD_AFFINITY_MANAGED node: 0 affinity: 24-31 effectiv: 25 pending: And after offlining CPU 25: dstate: 0x01a30000 IRQD_IRQ_DISABLED IRQD_IRQ_MASKED IRQD_SINGLE_TARGET IRQD_AFFINITY_MANAGED IRQD_MANAGED_SHUTDOWN node: 0 affinity: 24-31 effectiv: 25 pending: which is the correct and expected result. 
Fixes: 761ea388e8c4 ("genirq: Handle managed irqs gracefully in irq_startup()") Reported-by: YASUAKI ISHIMATSU Signed-off-by: Thomas Gleixner Cc: ax...@kernel.dk Cc: linux-s...@vger.kernel.org Cc: Sumit Saxena Cc: Marc Zyngier Cc: m...@ellerman.id.au Cc: Shivasharan Srikanteshwara Cc: Kashyap Desai Cc: keith.bu...@intel.com Cc: pet...@infradead.org Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710042208400.2406@nanos --- kernel/irq/chip.c | 2 +- kernel/irq/manage.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6fc89fd..5a2ef92c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: + irq_do_set_affinity(d, aff, false); ret = __irq_startup(desc); - irq_set_affinity_locked(d, aff, false); break; case IRQ_STARTUP_ABORT: return 0; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ef89f72..4bff6a1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -188,6 +188,9 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
[tip:x86/apic] genirq: Add config option for reservation mode
Commit-ID: 2b5175c4fa974b6aa05bbd2ee8d443a8036a1714 Gitweb: https://git.kernel.org/tip/2b5175c4fa974b6aa05bbd2ee8d443a8036a1714 Author: Thomas GleixnerAuthorDate: Tue, 17 Oct 2017 09:54:57 +0200 Committer: Thomas Gleixner CommitDate: Wed, 18 Oct 2017 15:38:30 +0200 genirq: Add config option for reservation mode The interrupt reservation mode requires reactivation of PCI/MSI interrupts. Create a config option, so the PCI code can set the corresponding flag when required. Signed-off-by: Thomas Gleixner Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-...@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: de...@linuxdriverproject.org Cc: KY Srinivasan Link: https://lkml.kernel.org/r/20171017075600.369375...@linutronix.de --- kernel/irq/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index ac1a3e2..89e3558 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -100,6 +100,9 @@ config IRQ_TIMINGS config GENERIC_IRQ_MATRIX_ALLOCATOR bool +config GENERIC_IRQ_RESERVATION_MODE + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS
[tip:x86/apic] x86/vector/msi: Select CONFIG_GENERIC_IRQ_RESERVATION_MODE
Commit-ID: c201c91799d687c0a6d8c3272950f51aad5ffebe Gitweb: https://git.kernel.org/tip/c201c91799d687c0a6d8c3272950f51aad5ffebe Author: Thomas Gleixner AuthorDate: Tue, 17 Oct 2017 09:54:59 +0200 Committer: Thomas Gleixner CommitDate: Wed, 18 Oct 2017 15:38:31 +0200 x86/vector/msi: Select CONFIG_GENERIC_IRQ_RESERVATION_MODE Select CONFIG_GENERIC_IRQ_RESERVATION_MODE so PCI/MSI domains get the MSI_FLAG_MUST_REACTIVATE flag set in pci_msi_create_irq_domain(). Remove the explicit setters of this flag in the apic/msi code as they are no longer required. Fixes: 4900be83602b ("x86/vector/msi: Switch to global reservation mode") Reported-and-tested-by: Dexuan Cui Signed-off-by: Thomas Gleixner Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-...@vger.kernel.org Cc: Haiyang Zhang Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: de...@linuxdriverproject.org Cc: KY Srinivasan Link: https://lkml.kernel.org/r/20171017075600.527569...@linutronix.de --- arch/x86/Kconfig | 1 + arch/x86/kernel/apic/msi.c | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 64e99d3..ea4beda 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -95,6 +95,7 @@ config X86 select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC select GENERIC_IRQ_MIGRATIONif SMP select GENERIC_IRQ_PROBE + select GENERIC_IRQ_RESERVATION_MODE select GENERIC_IRQ_SHOW select GENERIC_PENDING_IRQ if SMP select GENERIC_SMP_IDLE_THREAD diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 5b6dd1a..9b18be7 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -129,7 +129,7 @@ static struct msi_domain_ops pci_msi_domain_ops = { static struct msi_domain_info pci_msi_domain_info = { .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_PCI_MSIX | MSI_FLAG_MUST_REACTIVATE, + MSI_FLAG_PCI_MSIX, .ops= _msi_domain_ops, .chip = _msi_controller, .handler= 
handle_edge_irq, @@ -167,8 +167,7 @@ static struct irq_chip pci_msi_ir_controller = { static struct msi_domain_info pci_msi_ir_domain_info = { .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX | - MSI_FLAG_MUST_REACTIVATE, + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, .ops= _msi_domain_ops, .chip = _msi_ir_controller, .handler= handle_edge_irq,
[tip:x86/apic] PCI/MSI: Set MSI_FLAG_MUST_REACTIVATE in core code
Commit-ID: 25e960efc63852b84d1c3739aef586285b177395 Gitweb: https://git.kernel.org/tip/25e960efc63852b84d1c3739aef586285b177395 Author: Thomas GleixnerAuthorDate: Tue, 17 Oct 2017 09:54:58 +0200 Committer: Thomas Gleixner CommitDate: Wed, 18 Oct 2017 15:38:31 +0200 PCI/MSI: Set MSI_FLAG_MUST_REACTIVATE in core code If interrupt reservation mode is enabled then the PCI/MSI interrupts must be reactivated after early activation. Make sure that all callers of pci_msi_create_irq_domain() have the MSI_FLAG_MUST_REACTIVATE set when reservation mode is enabled. Signed-off-by: Thomas Gleixner Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-...@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: de...@linuxdriverproject.org Cc: KY Srinivasan Link: https://lkml.kernel.org/r/20171017075600.448649...@linutronix.de --- drivers/pci/msi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 496ed91..e066071 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1441,6 +1441,8 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, pci_msi_domain_update_chip_ops(info); info->flags |= MSI_FLAG_ACTIVATE_EARLY; + if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) + info->flags |= MSI_FLAG_MUST_REACTIVATE; domain = msi_create_irq_domain(fwnode, info, parent); if (!domain)
[tip:x86/fpu] x86/cpuid: Prevent out of bound access in do_clear_cpu_cap()
Commit-ID: 57b8b1a1856adaa849d02d547411a553a531022b Gitweb: https://git.kernel.org/tip/57b8b1a1856adaa849d02d547411a553a531022b Author: Thomas Gleixner AuthorDate: Wed, 18 Oct 2017 19:39:35 +0200 Committer: Thomas Gleixner CommitDate: Wed, 18 Oct 2017 20:03:34 +0200 x86/cpuid: Prevent out of bound access in do_clear_cpu_cap() do_clear_cpu_cap() allocates a bitmap to keep track of disabled feature dependencies. That bitmap is sized NCAPINTS * BITS_PER_INT. The possible 'features' which can be handed in are larger than this, because after the capabilities the bug 'feature' bits occupy another 32 bits. Not really obvious... So clearing any of the misfeature bits, as 32bit does for the F00F bug, accesses that bitmap out of bounds thereby corrupting the stack. Size the bitmap properly and add a sanity check to catch accidental out of bound access. Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20171018022023.GA12058@yexl-desktop --- arch/x86/kernel/cpu/cpuid-deps.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index e48eb73..c1d4984 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,11 +75,17 @@ static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) __clear_cpu_cap(c, feature); } +/* Take the capabilities and the BUG bits into account */ +#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) + static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) { - bool changed; - DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); + DECLARE_BITMAP(disable, MAX_FEATURE_BITS); const struct cpuid_dep *d; + bool changed; + + if (WARN_ON(feature >= MAX_FEATURE_BITS)) + return; clear_feature(c, feature);
[tip:sched/urgent] sched/debug: Fix task state recording/printout
Commit-ID: 3f5fe9fef5b2da06b6319fab8123056da5217c3f Gitweb: https://git.kernel.org/tip/3f5fe9fef5b2da06b6319fab8123056da5217c3f Author: Thomas GleixnerAuthorDate: Wed, 22 Nov 2017 13:05:48 +0100 Committer: Ingo Molnar CommitDate: Fri, 24 Nov 2017 08:39:12 +0100 sched/debug: Fix task state recording/printout The recent conversion of the task state recording to use task_state_index() broke the sched_switch tracepoint task state output. task_state_index() returns surprisingly an index (0-7) which is then printed with __print_flags() applying bitmasks. Not really working and resulting in weird states like 'prev_state=t' instead of 'prev_state=I'. Use TASK_REPORT_MAX instead of TASK_STATE_MAX to report preemption. Build a bitmask from the return value of task_state_index() and store it in entry->prev_state, which makes __print_flags() work as expected. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: sta...@vger.kernel.org Fixes: efb40f588b43 ("sched/tracing: Fix trace_sched_switch task-state printing") Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1711221304180.1751@nanos Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 306b31d..bc01e06 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -116,9 +116,9 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * * RUNNING (we will not have dequeued if state != RUNNING). */ if (preempt) - return TASK_STATE_MAX; + return TASK_REPORT_MAX; - return task_state_index(p); + return 1 << task_state_index(p); } #endif /* CREATE_TRACE_POINTS */ @@ -164,7 +164,7 @@ TRACE_EVENT(sched_switch, { 0x40, "P" }, { 0x80, "I" }) : "R", - __entry->prev_state & TASK_STATE_MAX ? "+" : "", + __entry->prev_state & TASK_REPORT_MAX ? 
"+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) );
[tip:irq/core] irq/work: Use llist_for_each_entry_safe
Commit-ID: d00a08cf9ee986ad6689ce8c6fd176aff679c106 Gitweb: https://git.kernel.org/tip/d00a08cf9ee986ad6689ce8c6fd176aff679c106 Author: Thomas GleixnerAuthorDate: Sun, 12 Nov 2017 13:02:51 +0100 Committer: Thomas Gleixner CommitDate: Sun, 12 Nov 2017 13:15:14 +0100 irq/work: Use llist_for_each_entry_safe The llist_for_each_entry() loop in irq_work_run_list() is unsafe because once the works PENDING bit is cleared it can be requeued on another CPU. Use llist_for_each_entry_safe() instead. Fixes: 16c0890dc66d ("irq/work: Don't reinvent the wheel but use existing llist API") Reported-by:Chris Wilson Signed-off-by: Thomas Gleixner Cc: Frederic Weisbecker Cc: Byungchul Park Cc: Peter Zijlstra Cc: Petri Latvala Link: http://lkml.kernel.org/r/151027307351.14762.461196020658...@mail.alporthouse.com --- kernel/irq_work.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e2ebe8c..6647b33f 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -128,9 +128,9 @@ bool irq_work_needs_cpu(void) static void irq_work_run_list(struct llist_head *list) { - unsigned long flags; - struct irq_work *work; + struct irq_work *work, *tmp; struct llist_node *llnode; + unsigned long flags; BUG_ON(!irqs_disabled()); @@ -138,7 +138,7 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry(work, llnode, llnode) { + llist_for_each_entry_safe(work, tmp, llnode, llnode) { /* * Clear the PENDING bit, after this point the @work * can be re-used.
[tip:core/urgent] watchdog/hardlockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy")
Commit-ID: 1c294733b7b9f712f78d15cfa75ffdea72b79abb Gitweb: https://git.kernel.org/tip/1c294733b7b9f712f78d15cfa75ffdea72b79abb Author: Thomas Gleixner AuthorDate: Tue, 31 Oct 2017 22:32:00 +0100 Committer: Thomas Gleixner CommitDate: Wed, 1 Nov 2017 20:41:27 +0100 watchdog/hardlockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy") Guenter reported a crash in the watchdog/perf code, which is caused by cleanup() and enable() running concurrently. The reason for this is: The watchdog functions are serialized via the watchdog_mutex and cpu hotplug locking, but the enable of the perf based watchdog happens in context of the unpark callback of the smpboot thread. But that unpark function is not synchronous inside the locking. The unparking of the thread just wakes it up and leaves so there is no guarantee when the thread is executing. If it starts running _before_ the cleanup happened then it will create an event and overwrite the dead event pointer. The new event is then cleaned up because the event is marked dead. lock(watchdog_mutex); lockup_detector_reconfigure(); cpus_read_lock(); stop(); park() update(); start(); unpark() cpus_read_unlock(); thread runs() overwrite dead event ptr cleanup(); free new event, which is active inside perf unlock(watchdog_mutex); The park side is safe as that actually waits for the thread to reach parked state. Commit a33d44843d45 removed the protection against this kind of scenario under the stupid assumption that the hotplug serialization and the watchdog_mutex cover everything. Bring it back. 
Reverts: a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy") Reported-and-tested-by: Guenter Roeck Signed-off-by: Thomas Feels-stupid Gleixner Cc: Peter Zijlstra Cc: Don Zickus Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710312145190.1942@nanos --- kernel/watchdog_hld.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 71a62ce..f8db56b 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,6 +21,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; @@ -203,6 +204,8 @@ void hardlockup_detector_perf_disable(void) if (event) { perf_event_disable(event); + this_cpu_write(watchdog_ev, NULL); + this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), _events_mask); watchdog_cpus--; } @@ -218,7 +221,7 @@ void hardlockup_detector_perf_cleanup(void) int cpu; for_each_cpu(cpu, _events_mask) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); + struct perf_event *event = per_cpu(dead_event, cpu); /* * Required because for_each_cpu() reports unconditionally @@ -226,7 +229,7 @@ void hardlockup_detector_perf_cleanup(void) */ if (event) perf_event_release_kernel(event); - per_cpu(watchdog_ev, cpu) = NULL; + per_cpu(dead_event_ev, cpu) = NULL; } cpumask_clear(_events_mask); }
[tip:core/urgent] watchdog/harclockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy")
Commit-ID: 9c388a5ed1960b2ebbebd3dbe7553092b0c15ec1 Gitweb: https://git.kernel.org/tip/9c388a5ed1960b2ebbebd3dbe7553092b0c15ec1 Author: Thomas Gleixner AuthorDate: Tue, 31 Oct 2017 22:32:00 +0100 Committer: Thomas Gleixner CommitDate: Wed, 1 Nov 2017 21:18:39 +0100 watchdog/harclockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy") Guenter reported a crash in the watchdog/perf code, which is caused by cleanup() and enable() running concurrently. The reason for this is: The watchdog functions are serialized via the watchdog_mutex and cpu hotplug locking, but the enable of the perf based watchdog happens in context of the unpark callback of the smpboot thread. But that unpark function is not synchronous inside the locking. The unparking of the thread just wakes it up and leaves so there is no guarantee when the thread is executing. If it starts running _before_ the cleanup happened then it will create an event and overwrite the dead event pointer. The new event is then cleaned up because the event is marked dead. lock(watchdog_mutex); lockup_detector_reconfigure(); cpus_read_lock(); stop(); park() update(); start(); unpark() cpus_read_unlock(); thread runs() overwrite dead event ptr cleanup(); free new event, which is active inside perf unlock(watchdog_mutex); The park side is safe as that actually waits for the thread to reach parked state. Commit a33d44843d45 removed the protection against this kind of scenario under the stupid assumption that the hotplug serialization and the watchdog_mutex cover everything. Bring it back. 
Reverts: a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy") Reported-and-tested-by: Guenter Roeck Signed-off-by: Thomas Feels-stupid Gleixner Cc: Peter Zijlstra Cc: Don Zickus Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710312145190.1942@nanos --- kernel/watchdog_hld.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 71a62ce..a7f137c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,6 +21,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; @@ -203,6 +204,8 @@ void hardlockup_detector_perf_disable(void) if (event) { perf_event_disable(event); + this_cpu_write(watchdog_ev, NULL); + this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); watchdog_cpus--; } @@ -218,7 +221,7 @@ void hardlockup_detector_perf_cleanup(void) int cpu; for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); + struct perf_event *event = per_cpu(dead_event, cpu); /* * Required because for_each_cpu() reports unconditionally @@ -226,7 +229,7 @@ void hardlockup_detector_perf_cleanup(void) */ if (event) perf_event_release_kernel(event); - per_cpu(watchdog_ev, cpu) = NULL; + per_cpu(dead_event, cpu) = NULL; } cpumask_clear(&dead_events_mask); }
[tip:smp/urgent] cpu/hotplug: Reset node state after operation
Commit-ID: 1f7c70d6b2bc5de301f30456621e1161fddf4242 Gitweb: https://git.kernel.org/tip/1f7c70d6b2bc5de301f30456621e1161fddf4242 Author: Thomas Gleixner AuthorDate: Sat, 21 Oct 2017 16:06:52 +0200 Committer: Thomas Gleixner CommitDate: Sat, 21 Oct 2017 16:11:30 +0200 cpu/hotplug: Reset node state after operation The recent rework of the cpu hotplug internals changed the usage of the per cpu state->node field, but missed to clean it up after usage. So subsequent hotplug operations use the stale pointer from a previous operation and hand it into the callback functions. The callbacks then dereference a pointer which either belongs to a different facility or points to freed and potentially reused memory. In either case data corruption and crashes are the obvious consequence. Reset the node and the last pointers in the per cpu state to NULL after the operation which set them has completed. Fixes: 96abb968549c ("smp/hotplug: Allow external multi-instance rollback") Reported-by: Tvrtko Ursulin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Boris Ostrovsky Cc: "Paul E. McKenney" Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710211606130.3213@nanos --- kernel/cpu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index d851df2..04892a8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -632,6 +632,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, __cpuhp_kick_ap(st); } + /* +* Clean up the leftovers so the next hotplug operation wont use stale +* data. +*/ + st->node = st->last = NULL; return ret; }
[tip:locking/core] stop_machine: Use raw spinlocks
Commit-ID: de5b55c1d4e30740009864eb35ce4ed856aac01d Gitweb: https://git.kernel.org/tip/de5b55c1d4e30740009864eb35ce4ed856aac01d Author: Thomas Gleixner AuthorDate: Mon, 23 Apr 2018 21:16:35 +0200 Committer: Thomas Gleixner CommitDate: Fri, 27 Apr 2018 14:34:51 +0200 stop_machine: Use raw spinlocks Use raw-locks in stop_machine() to allow locking in irq-off and preempt-disabled regions on -RT. This also documents the possible locking context in general. [bigeasy: update patch description.] Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20180423191635.6014-1-bige...@linutronix.de --- kernel/stop_machine.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b7591261652d..c25ba18274fb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -36,7 +36,7 @@ struct cpu_stop_done { struct cpu_stopper { struct task_struct *thread; - spinlock_t lock; + raw_spinlock_t lock; bool enabled; /* is this stopper enabled? 
*/ struct list_head works; /* list of pending works */ @@ -78,13 +78,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) unsigned long flags; bool enabled; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) __cpu_stop_queue_work(stopper, work); else if (work->done) cpu_stop_signal_done(work->done); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); return enabled; } @@ -231,8 +231,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); int err; retry: - spin_lock_irq(&stopper1->lock); - spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&stopper1->lock); + raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); err = -ENOENT; if (!stopper1->enabled || !stopper2->enabled) @@ -255,8 +255,8 @@ retry: __cpu_stop_queue_work(stopper1, work1); __cpu_stop_queue_work(stopper2, work2); unlock: - spin_unlock(&stopper2->lock); - spin_unlock_irq(&stopper1->lock); + raw_spin_unlock(&stopper2->lock); + raw_spin_unlock_irq(&stopper1->lock); if (unlikely(err == -EDEADLK)) { while (stop_cpus_in_progress) @@ -448,9 +448,9 @@ static int cpu_stop_should_run(unsigned int cpu) unsigned long flags; int run; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); run = !list_empty(&stopper->works); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); return run; } @@ -461,13 +461,13 @@ static void cpu_stopper_thread(unsigned int cpu) repeat: work = NULL; - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { work = list_first_entry(&stopper->works, struct cpu_stop_work, list); list_del_init(&work->list); } - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); if (work) { cpu_stop_fn_t fn = work->fn; @@ -541,7 +541,7 @@ static int __init cpu_stop_init(void) for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - spin_lock_init(&stopper->lock); + 
raw_spin_lock_init(&stopper->lock); INIT_LIST_HEAD(&stopper->works); }
[tip:x86/urgent] x86/apic/x2apic: Initialize cluster ID properly
Commit-ID: fed71f7d98795ed0fa1d431910787f0f4a68324f Gitweb: https://git.kernel.org/tip/fed71f7d98795ed0fa1d431910787f0f4a68324f Author: Thomas Gleixner AuthorDate: Thu, 17 May 2018 14:36:39 +0200 Committer: Thomas Gleixner CommitDate: Thu, 17 May 2018 21:00:12 +0200 x86/apic/x2apic: Initialize cluster ID properly Rick bisected a regression on large systems which use the x2apic cluster mode for interrupt delivery to the commit which reworked the cluster management. The problem is caused by a missing initialization of the clusterid field in the shared cluster data structures. So all structures end up with cluster ID 0 which only allows sharing between all CPUs which belong to cluster 0. All other CPUs with a cluster ID > 0 cannot share the data structure because they cannot find existing data with their cluster ID. This causes malfunction with IPIs because IPIs are sent to the wrong cluster and the caller waits for ever that the target CPU handles the IPI. Add the missing initialization when an upcoming CPU is the first in a cluster so that the later booting CPUs can find the data and share it for proper operation. Fixes: 023a611748fd ("x86/apic/x2apic: Simplify cluster management") Reported-by: Rick Warner Bisected-by: Rick Warner Signed-off-by: Thomas Gleixner Tested-by: Rick Warner Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.deb.2.21.1805171418210.1...@nanos.tec.linutronix.de --- arch/x86/kernel/apic/x2apic_cluster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 8b04234e010b..7685444a106b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -116,6 +116,7 @@ static void init_x2apic_ldr(void) goto update; } cmsk = cluster_hotplug_mask; + cmsk->clusterid = cluster; cluster_hotplug_mask = NULL; update: this_cpu_write(cluster_masks, cmsk);
[tip:x86/urgent] x86/apic/vector: Print APIC control bits in debugfs
Commit-ID: a07771ac6a78860777a9da5d9bc38830ec993fe7 Gitweb: https://git.kernel.org/tip/a07771ac6a78860777a9da5d9bc38830ec993fe7 Author: Thomas Gleixner AuthorDate: Mon, 4 Jun 2018 17:34:00 +0200 Committer: Thomas Gleixner CommitDate: Wed, 6 Jun 2018 15:18:22 +0200 x86/apic/vector: Print APIC control bits in debugfs Extend the debugability of the vector management by adding the state bits to the debugfs output. Signed-off-by: Thomas Gleixner Tested-by: Song Liu Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Song Liu Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: Mike Travis Cc: Borislav Petkov Cc: Tariq Toukan Link: https://lkml.kernel.org/r/20180604162224.908136...@linutronix.de --- arch/x86/kernel/apic/vector.c | 27 ++- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index b708f597eee3..35aaee4fc028 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -588,8 +588,7 @@ error: static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, struct irq_data *irqd, int ind) { - unsigned int cpu, vector, prev_cpu, prev_vector; - struct apic_chip_data *apicd; + struct apic_chip_data apicd; unsigned long flags; int irq; @@ -605,24 +604,26 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, return; } - apicd = irqd->chip_data; - if (!apicd) { + if (!irqd->chip_data) { seq_printf(m, "%*sVector: Not assigned\n", ind, ""); return; } raw_spin_lock_irqsave(&vector_lock, flags); - cpu = apicd->cpu; - vector = apicd->vector; - prev_cpu = apicd->prev_cpu; - prev_vector = apicd->prev_vector; + memcpy(&apicd, irqd->chip_data, sizeof(apicd)); raw_spin_unlock_irqrestore(&vector_lock, flags); - seq_printf(m, "%*sVector: %5u\n", ind, "", vector); - seq_printf(m, "%*sTarget: %5u\n", ind, "", cpu); - if (prev_vector) { - seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", prev_vector); - seq_printf(m, "%*sPrevious target: %5u\n", ind, "", prev_cpu); + + seq_printf(m, 
"%*sVector: %5u\n", ind, "", apicd.vector); + seq_printf(m, "%*sTarget: %5u\n", ind, "", apicd.cpu); + if (apicd.prev_vector) { + seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", apicd.prev_vector); + seq_printf(m, "%*sPrevious target: %5u\n", ind, "", apicd.prev_cpu); } + seq_printf(m, "%*smove_in_progress: %u\n", ind, "", apicd.move_in_progress ? 1 : 0); + seq_printf(m, "%*sis_managed: %u\n", ind, "", apicd.is_managed ? 1 : 0); + seq_printf(m, "%*scan_reserve: %u\n", ind, "", apicd.can_reserve ? 1 : 0); + seq_printf(m, "%*shas_reserved: %u\n", ind, "", apicd.has_reserved ? 1 : 0); + seq_printf(m, "%*scleanup_pending: %u\n", ind, "", !hlist_unhashed(&apicd.clist)); } #endif
[tip:x86/urgent] genirq/affinity: Defer affinity setting if irq chip is busy
Commit-ID: 12f47073a40f6aa75119d8f5df4077b7f334cced Gitweb: https://git.kernel.org/tip/12f47073a40f6aa75119d8f5df4077b7f334cced Author: Thomas Gleixner AuthorDate: Mon, 4 Jun 2018 17:33:59 +0200 Committer: Thomas Gleixner CommitDate: Wed, 6 Jun 2018 15:18:22 +0200 genirq/affinity: Defer affinity setting if irq chip is busy The case that interrupt affinity setting fails with -EBUSY can be handled in the kernel completely by using the already available generic pending infrastructure. If a irq_chip::set_affinity() fails with -EBUSY, handle it like the interrupts for which irq_chip::set_affinity() can only be invoked from interrupt context. Copy the new affinity mask to irq_desc::pending_mask and set the affinity pending bit. The next raised interrupt for the affected irq will check the pending bit and try to set the new affinity from the handler. This avoids that -EBUSY is returned when an affinity change is requested from user space and the previous change has not been cleaned up. The new affinity will take effect when the next interrupt is raised from the device. 
Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup") Signed-off-by: Thomas Gleixner Tested-by: Song Liu Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Song Liu Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: sta...@vger.kernel.org Cc: Mike Travis Cc: Borislav Petkov Cc: Tariq Toukan Link: https://lkml.kernel.org/r/20180604162224.819273...@linutronix.de --- kernel/irq/manage.c | 37 +++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e3336d904f64..facfecfc543c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -204,6 +204,39 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline int irq_set_affinity_pending(struct irq_data *data, + const struct cpumask *dest) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + irqd_set_move_pending(data); + irq_copy_pending(desc, dest); + return 0; +} +#else +static inline int irq_set_affinity_pending(struct irq_data *data, + const struct cpumask *dest) +{ + return -EBUSY; +} +#endif + +static int irq_try_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + int ret = irq_do_set_affinity(data, dest, force); + + /* +* In case that the underlying vector management is busy and the +* architecture supports the generic pending mechanism then utilize +* this to avoid returning an error to user space. 
+*/ + if (ret == -EBUSY && !force) + ret = irq_set_affinity_pending(data, dest); + return ret; +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -214,8 +247,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; - if (irq_can_move_pcntxt(data)) { - ret = irq_do_set_affinity(data, mask, force); + if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { + ret = irq_try_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask);
[tip:x86/urgent] genirq/generic_pending: Do not lose pending affinity update
Commit-ID: a33a5d2d16cb84bea8d5f5510f3a41aa48b5c467 Gitweb: https://git.kernel.org/tip/a33a5d2d16cb84bea8d5f5510f3a41aa48b5c467 Author: Thomas Gleixner AuthorDate: Mon, 4 Jun 2018 17:33:54 +0200 Committer: Thomas Gleixner CommitDate: Wed, 6 Jun 2018 15:18:19 +0200 genirq/generic_pending: Do not lose pending affinity update The generic pending interrupt mechanism moves interrupts from the interrupt handler on the original target CPU to the new destination CPU. This is required for x86 and ia64 due to the way the interrupt delivery and acknowledge works if the interrupts are not remapped. However that update can fail for various reasons. Some of them are valid reasons to discard the pending update, but the case, when the previous move has not been fully cleaned up is not a legit reason to fail. Check the return value of irq_do_set_affinity() for -EBUSY, which indicates a pending cleanup, and rearm the pending move in the irq descriptor so it's tried again when the next interrupt arrives. Fixes: 996c591227d9 ("x86/irq: Plug vector cleanup race") Signed-off-by: Thomas Gleixner Tested-by: Song Liu Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Song Liu Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: sta...@vger.kernel.org Cc: Mike Travis Cc: Borislav Petkov Cc: Tariq Toukan Link: https://lkml.kernel.org/r/20180604162224.386544...@linutronix.de --- kernel/irq/migration.c | 26 +++--- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 86ae0eb80b53..8b8cecd18cce 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -38,17 +38,18 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_data *data = &desc->irq_data; + struct irq_chip *chip = data->chip; - if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) + if 
(likely(!irqd_is_setaffinity_pending(data))) return; - irqd_clr_move_pending(&desc->irq_data); + irqd_clr_move_pending(data); /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (irqd_is_per_cpu(&desc->irq_data)) { + if (irqd_is_per_cpu(data)) { WARN_ON(1); return; } @@ -73,9 +74,20 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) - irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); - + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) { + int ret; + + ret = irq_do_set_affinity(data, desc->pending_mask, false); + /* +* If the there is a cleanup pending in the underlying +* vector management, reschedule the move for the next +* interrupt. Leave desc->pending_mask intact. +*/ + if (ret == -EBUSY) { + irqd_set_move_pending(data); + return; + } + } cpumask_clear(desc->pending_mask); }
[tip:x86/urgent] irq_remapping: Use apic_ack_irq()
Commit-ID: 8a2b7d142e7ac477d52f5f92251e59fc136d7ddd Gitweb: https://git.kernel.org/tip/8a2b7d142e7ac477d52f5f92251e59fc136d7ddd Author: Thomas Gleixner AuthorDate: Mon, 4 Jun 2018 17:33:56 +0200 Committer: Thomas Gleixner CommitDate: Wed, 6 Jun 2018 15:18:20 +0200 irq_remapping: Use apic_ack_irq() To address the EBUSY fail of interrupt affinity settings in case that the previous setting has not been cleaned up yet, use the new apic_ack_irq() function instead of the special ir_ack_apic_edge() implementation which is merely a wrapper around ack_APIC_irq(). Preparatory change for the real fix Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup") Signed-off-by: Thomas Gleixner Tested-by: Song Liu Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Song Liu Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: sta...@vger.kernel.org Cc: Mike Travis Cc: Borislav Petkov Cc: Tariq Toukan Link: https://lkml.kernel.org/r/20180604162224.555716...@linutronix.de --- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/intel_irq_remapping.c | 2 +- drivers/iommu/irq_remapping.c | 5 - drivers/iommu/irq_remapping.h | 2 -- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 8fb8c737fffe..b0b30a568db7 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4379,7 +4379,7 @@ static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) static struct irq_chip amd_ir_chip = { .name = "AMD-IR", - .irq_ack= ir_ack_apic_edge, + .irq_ack= apic_ack_irq, .irq_set_affinity = amd_ir_set_affinity, .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity, .irq_compose_msi_msg= ir_compose_msi_msg, diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 3062a154a9fb..967450bd421a 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -1223,7 +1223,7 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) static struct 
irq_chip intel_ir_chip = { .name = "INTEL-IR", - .irq_ack= ir_ack_apic_edge, + .irq_ack= apic_ack_irq, .irq_set_affinity = intel_ir_set_affinity, .irq_compose_msi_msg= intel_ir_compose_msi_msg, .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 496deee3ae3a..7d0f3074d41d 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -156,11 +156,6 @@ void panic_if_irq_remap(const char *msg) panic(msg); } -void ir_ack_apic_edge(struct irq_data *data) -{ - ack_APIC_irq(); -} - /** * irq_remapping_get_ir_irq_domain - Get the irqdomain associated with the IOMMU * device serving request @info diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index 039c7af7b190..0afef6e43be4 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -65,8 +65,6 @@ struct irq_remap_ops { extern struct irq_remap_ops intel_irq_remap_ops; extern struct irq_remap_ops amd_iommu_irq_ops; -extern void ir_ack_apic_edge(struct irq_data *data); - #else /* CONFIG_IRQ_REMAP */ #define irq_remapping_enabled 0
[tip:x86/urgent] x86/apic: Provide apic_ack_irq()
Commit-ID: c0255770ccdc77ef2184d2a0a2e0cde09d2b44a4 Gitweb: https://git.kernel.org/tip/c0255770ccdc77ef2184d2a0a2e0cde09d2b44a4 Author: Thomas Gleixner AuthorDate: Mon, 4 Jun 2018 17:33:55 +0200 Committer: Thomas Gleixner CommitDate: Wed, 6 Jun 2018 15:18:20 +0200 x86/apic: Provide apic_ack_irq() apic_ack_edge() is explicitly for handling interrupt affinity cleanup when interrupt remapping is not available or disabled. Remapped interrupts and also some of the platform specific special interrupts, e.g. UV, invoke ack_APIC_irq() directly. To address the issue of failing an affinity update with -EBUSY the delayed affinity mechanism can be reused, but ack_APIC_irq() does not handle that. Adding this to ack_APIC_irq() is not possible, because that function is also used for exceptions and directly handled interrupts like IPIs. Create a new function, which just contains the conditional invocation of irq_move_irq() and the final ack_APIC_irq(). Reuse the new function in apic_ack_edge(). Preparatory change for the real fix. 
Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup") Signed-off-by: Thomas Gleixner Tested-by: Song Liu Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Song Liu Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: sta...@vger.kernel.org Cc: Mike Travis Cc: Borislav Petkov Cc: Tariq Toukan Link: https://lkml.kernel.org/r/20180604162224.471925...@linutronix.de --- arch/x86/include/asm/apic.h | 2 ++ arch/x86/kernel/apic/vector.c | 9 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 08acd954f00e..74a9e06b6cfd 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -436,6 +436,8 @@ static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} #endif /* CONFIG_X86_LOCAL_APIC */ +extern void apic_ack_irq(struct irq_data *data); + static inline void ack_APIC_irq(void) { /* diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 72b575a0b662..b708f597eee3 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -809,13 +809,18 @@ static int apic_retrigger_irq(struct irq_data *irqd) return 1; } -void apic_ack_edge(struct irq_data *irqd) +void apic_ack_irq(struct irq_data *irqd) { - irq_complete_move(irqd_cfg(irqd)); irq_move_irq(irqd); ack_APIC_irq(); } +void apic_ack_edge(struct irq_data *irqd) +{ + irq_complete_move(irqd_cfg(irqd)); + apic_ack_irq(irqd); +} + static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack= apic_ack_edge,
[tip:x86/urgent] genirq/migration: Avoid out of line call if pending is not set
Commit-ID: d340ebd696f921d3ad01b8c0c29dd38f2ad2bf3e Gitweb: https://git.kernel.org/tip/d340ebd696f921d3ad01b8c0c29dd38f2ad2bf3e Author: Thomas Gleixner AuthorDate: Wed, 6 Jun 2018 14:46:59 +0200 Committer: Thomas Gleixner CommitDate: Wed, 6 Jun 2018 15:18:20 +0200 genirq/migration: Avoid out of line call if pending is not set The upcoming fix for the -EBUSY return from affinity settings requires to use the irq_move_irq() functionality even on irq remapped interrupts. To avoid the out of line call, move the check for the pending bit into an inline helper. Preparatory change for the real fix. No functional change. Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup") Signed-off-by: Thomas Gleixner Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Song Liu Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: sta...@vger.kernel.org Cc: Mike Travis Cc: Borislav Petkov Cc: Tariq Toukan Cc: Dou Liyang Link: https://lkml.kernel.org/r/20180604162224.471925...@linutronix.de --- include/linux/irq.h| 7 ++- kernel/irq/migration.c | 5 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 65916a305f3d..4e66378f290b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -551,7 +551,12 @@ extern int irq_affinity_online_cpu(unsigned int cpu); #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) -void irq_move_irq(struct irq_data *data); +void __irq_move_irq(struct irq_data *data); +static inline void irq_move_irq(struct irq_data *data) +{ + if (unlikely(irqd_is_setaffinity_pending(data))) + __irq_move_irq(data); +} void irq_move_masked_irq(struct irq_data *data); void irq_force_complete_move(struct irq_desc *desc); #else diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 8b8cecd18cce..def48589ea48 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -91,7 +91,7 @@ void irq_move_masked_irq(struct irq_data *idata) cpumask_clear(desc->pending_mask); } -void irq_move_irq(struct 
irq_data *idata) +void __irq_move_irq(struct irq_data *idata) { bool masked; @@ -102,9 +102,6 @@ void irq_move_irq(struct irq_data *idata) */ idata = irq_desc_get_irq_data(irq_data_to_desc(idata)); - if (likely(!irqd_is_setaffinity_pending(idata))) - return; - if (unlikely(irqd_irq_disabled(idata))) return;
[tip:x86/urgent] x86/ioapic: Use apic_ack_irq()
Commit-ID: 2b04e46d8d0b9b7ac08ded672e3eab823f01d77a Gitweb: https://git.kernel.org/tip/2b04e46d8d0b9b7ac08ded672e3eab823f01d77a Author: Thomas Gleixner AuthorDate: Mon, 4 Jun 2018 17:33:57 +0200 Committer: Thomas Gleixner CommitDate: Wed, 6 Jun 2018 15:18:21 +0200 x86/ioapic: Use apic_ack_irq() To address the EBUSY fail of interrupt affinity settings in case that the previous setting has not been cleaned up yet, use the new apic_ack_irq() function instead of directly invoking ack_APIC_irq(). Preparatory change for the real fix Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup") Signed-off-by: Thomas Gleixner Tested-by: Song Liu Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Song Liu Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: sta...@vger.kernel.org Cc: Mike Travis Cc: Borislav Petkov Cc: Tariq Toukan Link: https://lkml.kernel.org/r/20180604162224.639011...@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7553819c74c3..3982f79d2377 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1851,7 +1851,7 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) * intr-remapping table entry. Hence for the io-apic * EOI we use the pin number. */ - ack_APIC_irq(); + apic_ack_irq(irq_data); eoi_ioapic_pin(data->entry.vector, data); }
[tip:x86/urgent] x86/platform/uv: Use apic_ack_irq()
Commit-ID: 839b0f1c4ef674cd929a42304c078afca278581a Gitweb: https://git.kernel.org/tip/839b0f1c4ef674cd929a42304c078afca278581a Author: Thomas Gleixner AuthorDate: Mon, 4 Jun 2018 17:33:58 +0200 Committer: Thomas Gleixner CommitDate: Wed, 6 Jun 2018 15:18:21 +0200 x86/platform/uv: Use apic_ack_irq() To address the EBUSY fail of interrupt affinity settings in case that the previous setting has not been cleaned up yet, use the new apic_ack_irq() function instead of the special uv_ack_apic() implementation which is merely a wrapper around ack_APIC_irq(). Preparatory change for the real fix Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup") Reported-by: Song Liu Signed-off-by: Thomas Gleixner Tested-by: Song Liu Cc: Joerg Roedel Cc: Peter Zijlstra Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: sta...@vger.kernel.org Cc: Mike Travis Cc: Borislav Petkov Cc: Tariq Toukan Link: https://lkml.kernel.org/r/20180604162224.721691...@linutronix.de --- arch/x86/platform/uv/uv_irq.c | 7 +-- 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index e4cb9f4cde8a..fc13cbbb2dce 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -47,11 +47,6 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) static void uv_noop(struct irq_data *data) { } -static void uv_ack_apic(struct irq_data *data) -{ - ack_APIC_irq(); -} - static int uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, bool force) @@ -73,7 +68,7 @@ static struct irq_chip uv_irq_chip = { .name = "UV-CORE", .irq_mask = uv_noop, .irq_unmask = uv_noop, - .irq_eoi= uv_ack_apic, + .irq_eoi= apic_ack_irq, .irq_set_affinity = uv_set_irq_affinity, };
[tip:x86/urgent] x86/apic/vector: Prevent hlist corruption and leaks
Commit-ID: 80ae7b1a918e78b0bae88b0c0ad413d3fdced968 Gitweb: https://git.kernel.org/tip/80ae7b1a918e78b0bae88b0c0ad413d3fdced968 Author: Thomas Gleixner AuthorDate: Mon, 4 Jun 2018 17:33:53 +0200 Committer: Thomas Gleixner CommitDate: Wed, 6 Jun 2018 15:18:19 +0200 x86/apic/vector: Prevent hlist corruption and leaks Several people observed the WARN_ON() in irq_matrix_free() which triggers when the caller tries to free a vector which is not in the allocation range. Song provided the trace information which allowed to decode the root cause. The rework of the vector allocation mechanism failed to preserve a sanity check, which prevents setting a new target vector/CPU when the previous affinity change has not fully completed. As a result a half finished affinity change can be overwritten, which can cause the leak of an irq descriptor pointer on the previous target CPU and double enqueue of the hlist head into the cleanup lists of two or more CPUs. After one CPU cleaned up its vector the next CPU will invoke the cleanup handler with vector 0, which triggers the out of range warning in the matrix allocator. Prevent this by checking the apic_data of the interrupt whether the move_in_progress flag is false and the hlist node is not hashed. Return -EBUSY if not. This prevents the damage and restores the behaviour before the vector allocation rework, but due to other changes in that area it also widens the chance that user space can observe -EBUSY. In theory this should be fine, but actually not all user space tools handle -EBUSY correctly. Addressing that is not part of this fix, but will be addressed in follow up patches. 
Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment") Reported-by: Dmitry Safonov <0x7f454...@gmail.com> Reported-by: Tariq Toukan Reported-by: Song Liu Signed-off-by: Thomas Gleixner Tested-by: Song Liu Cc: Joerg Roedel Cc: Peter Zijlstra Cc: sta...@vger.kernel.org Cc: Mike Travis Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20180604162224.303870...@linutronix.de --- arch/x86/kernel/apic/vector.c | 9 + 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index bb6f7a2148d7..72b575a0b662 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -235,6 +235,15 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest)) return 0; + /* +* Careful here. @apicd might either have move_in_progress set or +* be enqueued for cleanup. Assigning a new vector would either +* leave a stale vector on some CPU around or in case of a pending +* cleanup corrupt the hlist. +*/ + if (apicd->move_in_progress || !hlist_unhashed(>clist)) + return -EBUSY; + vector = irq_matrix_alloc(vector_matrix, dest, resvd, ); if (vector > 0) apic_update_vector(irqd, vector, cpu);
[tip:timers/core] posix-timers: Sanitize overrun handling
Commit-ID: 78c9c4dfbf8c04883941445a195276bb4bb92c76 Gitweb: https://git.kernel.org/tip/78c9c4dfbf8c04883941445a195276bb4bb92c76 Author: Thomas Gleixner AuthorDate: Tue, 26 Jun 2018 15:21:32 +0200 Committer: Thomas Gleixner CommitDate: Mon, 2 Jul 2018 11:33:25 +0200 posix-timers: Sanitize overrun handling The posix timer overrun handling is broken because the forwarding functions can return a huge number of overruns which does not fit in an int. As a consequence timer_getoverrun(2) and siginfo::si_overrun can turn into random number generators. The k_clock::timer_forward() callbacks return a 64 bit value now. Make k_itimer::it_overrun[_last] 64bit as well, so the kernel internal accounting is correct. Remove the temporary (int) casts. Add a helper function which clamps the overrun value returned to user space via timer_getoverrun(2) or siginfo::si_overrun limited to a positive value between 0 and INT_MAX. INT_MAX is an indicator for user space that the overrun value has been clamped. Reported-by: Team OWL337 Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Peter Zijlstra Cc: Michael Kerrisk Link: https://lkml.kernel.org/r/20180626132705.018623...@linutronix.de --- include/linux/posix-timers.h | 4 ++-- kernel/time/posix-cpu-timers.c | 2 +- kernel/time/posix-timers.c | 31 --- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index c85704fcdbd2..ee7e987ea1b4 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -95,8 +95,8 @@ struct k_itimer { clockid_t it_clock; timer_t it_id; int it_active; - int it_overrun; - int it_overrun_last; + s64 it_overrun; + s64 it_overrun_last; int it_requeue_pending; int it_sigev_notify; ktime_t it_interval; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5a6251ac6f7a..562cc3891b57 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -85,7 +85,7 @@ static void
bump_cpu_timer(struct k_itimer *timer, u64 now) continue; timer->it.cpu.expires += incr; - timer->it_overrun += 1 << i; + timer->it_overrun += 1LL << i; delta -= incr; } } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index db1d65963a57..3ac7295306dc 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -283,6 +283,17 @@ static __init int init_posix_timers(void) } __initcall(init_posix_timers); +/* + * The siginfo si_overrun field and the return value of timer_getoverrun(2) + * are of type int. Clamp the overrun value to INT_MAX + */ +static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) +{ + s64 sum = timr->it_overrun_last + (s64)baseval; + + return sum > (s64)INT_MAX ? INT_MAX : (int)sum; +} + static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = >it.real.timer; @@ -290,9 +301,8 @@ static void common_hrtimer_rearm(struct k_itimer *timr) if (!timr->it_interval) return; - timr->it_overrun += (unsigned int) hrtimer_forward(timer, - timer->base->get_time(), - timr->it_interval); + timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), + timr->it_interval); hrtimer_restart(timer); } @@ -321,10 +331,10 @@ void posixtimer_rearm(struct siginfo *info) timr->it_active = 1; timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1; + timr->it_overrun = -1LL; ++timr->it_requeue_pending; - info->si_overrun += timr->it_overrun_last; + info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); } unlock_timer(timr, flags); @@ -418,9 +428,8 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) now = ktime_add(now, kj); } #endif - timr->it_overrun += (unsigned int) - hrtimer_forward(timer, now, - timr->it_interval); + timr->it_overrun += hrtimer_forward(timer, now, + timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; timr->it_active = 1; @@ -524,7 +533,7 @@ static int
[tip:timers/core] alarmtimer: Prevent overflow for relative nanosleep
Commit-ID: 5f936e19cc0ef97dbe3a56e9498922ad5ba1edef Gitweb: https://git.kernel.org/tip/5f936e19cc0ef97dbe3a56e9498922ad5ba1edef Author: Thomas Gleixner AuthorDate: Mon, 2 Jul 2018 09:34:29 +0200 Committer: Thomas Gleixner CommitDate: Mon, 2 Jul 2018 11:33:26 +0200 alarmtimer: Prevent overflow for relative nanosleep Air Icy reported: UBSAN: Undefined behaviour in kernel/time/alarmtimer.c:811:7 signed integer overflow: 1529859276030040771 + 9223372036854775807 cannot be represented in type 'long long int' Call Trace: alarm_timer_nsleep+0x44c/0x510 kernel/time/alarmtimer.c:811 __do_sys_clock_nanosleep kernel/time/posix-timers.c:1235 [inline] __se_sys_clock_nanosleep kernel/time/posix-timers.c:1213 [inline] __x64_sys_clock_nanosleep+0x326/0x4e0 kernel/time/posix-timers.c:1213 do_syscall_64+0xb8/0x3a0 arch/x86/entry/common.c:290 alarm_timer_nsleep() uses ktime_add() to add the current time and the relative expiry value. ktime_add() has no sanity checks so the addition can overflow when the relative timeout is large enough. Use ktime_add_safe() which has the necessary sanity checks in place and limits the result to the valid range. Fixes: 9a7adcf5c6de ("timers: Posix interface for alarm-timers") Reported-by: Team OWL337 Signed-off-by: Thomas Gleixner Cc: John Stultz Link: https://lkml.kernel.org/r/alpine.deb.2.21.1807020926360.1...@nanos.tec.linutronix.de --- kernel/time/alarmtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 78a3cc555823..fa5de5e8de61 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -808,7 +808,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now = alarm_bases[type].gettime(); - exp = ktime_add(now, exp); + + exp = ktime_add_safe(now, exp); } ret = alarmtimer_do_nsleep(, exp, type);
[tip:timers/core] posix-timers: Make forward callback return s64
Commit-ID: 6fec64e1c92d5c715c6d0f50786daa7708266bde Gitweb: https://git.kernel.org/tip/6fec64e1c92d5c715c6d0f50786daa7708266bde Author: Thomas Gleixner AuthorDate: Tue, 26 Jun 2018 15:21:31 +0200 Committer: Thomas Gleixner CommitDate: Mon, 2 Jul 2018 11:33:25 +0200 posix-timers: Make forward callback return s64 The posix timer ti_overrun handling is broken because the forwarding functions can return a huge number of overruns which does not fit in an int. As a consequence timer_getoverrun(2) and siginfo::si_overrun can turn into random number generators. As a first step to address that let the timer_forward() callbacks return the full 64 bit value. Cast it to (int) temporarily until k_itimer::ti_overrun is converted to 64bit and the conversion to user space visible values is sanitized. Reported-by: Team OWL337 Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Peter Zijlstra Cc: Michael Kerrisk Link: https://lkml.kernel.org/r/20180626132704.922098...@linutronix.de --- kernel/time/alarmtimer.c | 4 ++-- kernel/time/posix-timers.c | 6 +++--- kernel/time/posix-timers.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 639321bf2e39..78a3cc555823 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -581,11 +581,11 @@ static void alarm_timer_rearm(struct k_itimer *timr) * @timr: Pointer to the posixtimer data struct * @now: Current time to forward the timer against */ -static int alarm_timer_forward(struct k_itimer *timr, ktime_t now) +static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = >it.alarm.alarmtimer; - return (int) alarm_forward(alarm, timr->it_interval, now); + return alarm_forward(alarm, timr->it_interval, now); } /** diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 80d59333c76e..db1d65963a57 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -645,11 +645,11 @@ static 
ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) return __hrtimer_expires_remaining_adjusted(timer, now); } -static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now) +static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) { struct hrtimer *timer = >it.real.timer; - return (int)hrtimer_forward(timer, now, timr->it_interval); + return hrtimer_forward(timer, now, timr->it_interval); } /* @@ -702,7 +702,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) * expiry time forward by intervals, so expiry is > now. */ if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) - timr->it_overrun += kc->timer_forward(timr, now); + timr->it_overrun += (int)kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); /* Return 0 only, when the timer is expired and not pending */ diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 151e28f5bf30..ddb21145211a 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -19,7 +19,7 @@ struct k_clock { void(*timer_get)(struct k_itimer *timr, struct itimerspec64 *cur_setting); void(*timer_rearm)(struct k_itimer *timr); - int (*timer_forward)(struct k_itimer *timr, ktime_t now); + s64 (*timer_forward)(struct k_itimer *timr, ktime_t now); ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); int (*timer_try_to_cancel)(struct k_itimer *timr); void(*timer_arm)(struct k_itimer *timr, ktime_t expires,
[tip:x86/urgent] x86/cpu: Restore CPUID_8000_0008_EBX reload
Commit-ID: c65732e4f72124ca5a3a0dd3bee0d3cee39c7170 Gitweb: https://git.kernel.org/tip/c65732e4f72124ca5a3a0dd3bee0d3cee39c7170 Author: Thomas Gleixner AuthorDate: Mon, 30 Apr 2018 21:47:46 +0200 Committer: Thomas Gleixner CommitDate: Wed, 2 May 2018 16:44:38 +0200 x86/cpu: Restore CPUID_8000_0008_EBX reload The recent commit which addresses the x86_phys_bits corruption with encrypted memory on CPUID reload after a microcode update lost the reload of CPUID_8000_0008_EBX as well. As a consequence IBRS and IBRS_FW are no longer detected. Restore the behaviour by bringing the reload of CPUID_8000_0008_EBX back. This restore has a twist due to the convoluted way the cpuid analysis works: CPUID_8000_0008_EBX is used by AMD to enumerate IBRB, IBRS, STIBP. On Intel EBX is not used. But the speculation control code sets the AMD bits when running on Intel depending on the Intel specific speculation control bits. This was done to use the same bits for alternatives. The change which moved the 8000_0008 evaluation out of get_cpu_cap() broke this nasty scheme due to ordering. So that on Intel the store to CPUID_8000_0008_EBX clears the IBRB, IBRS, STIBP bits which had been set before by software. So the actual CPUID_8000_0008_EBX needs to go back to the place where it was and the phys/virt address space calculation cannot touch it. In hindsight this should have used completely synthetic bits for IBRB, IBRS, STIBP instead of reusing the AMD bits, but that's for 4.18. /me needs to find time to cleanup that steaming pile of ...
Fixes: d94a155c59c9 ("x86/cpu: Prevent cpuinfo_x86::x86_phys_bits adjustment corruption") Reported-by: Jörg Otte Reported-by: Tim Chen Signed-off-by: Thomas Gleixner Tested-by: Jörg Otte Cc: Linus Torvalds Cc: kirill.shute...@linux.intel.com Cc: Borislav Petkov x86_power = edx; } + if (c->extended_cpuid_level >= 0x8008) { + cpuid(0x8008, , , , ); + c->x86_capability[CPUID_8000_0008_EBX] = ebx; + } + if (c->extended_cpuid_level >= 0x800a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x800a); @@ -871,7 +876,6 @@ static void get_cpu_address_sizes(struct cpuinfo_x86 *c) c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; - c->x86_capability[CPUID_8000_0008_EBX] = ebx; } #ifdef CONFIG_X86_32 else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
[tip:x86/urgent] x86/apic: Switch all APICs to Fixed delivery mode
Commit-ID: a31e58e129f73ab5b04016330b13ed51fde7a961 Gitweb: https://git.kernel.org/tip/a31e58e129f73ab5b04016330b13ed51fde7a961 Author: Thomas Gleixner AuthorDate: Thu, 28 Dec 2017 11:33:33 +0100 Committer: Thomas Gleixner CommitDate: Fri, 29 Dec 2017 14:20:48 +0100 x86/apic: Switch all APICs to Fixed delivery mode Some of the APIC incarnations are operating in lowest priority delivery mode. This worked as long as the vector management code allocated the same vector on all possible CPUs for each interrupt. Lowest priority delivery mode does not necessarily respect the affinity setting and may redirect to some other online CPU. This was documented somewhere in the old code and the conversion to single target delivery missed to update the delivery mode of the affected APIC drivers which results in spurious interrupts on some of the affected CPU/Chipset combinations. Switch the APIC drivers over to Fixed delivery mode and remove all leftovers of lowest priority delivery mode. Switching to Fixed delivery mode is not a problem on these CPUs because the kernel already uses Fixed delivery mode for IPIs. The reason for this is that the SDM explicitly forbids lowest prio mode for IPIs. The reason is obvious: If the irq routing does not honor destination targets in lowest prio mode then an IPI targeted at CPU1 might end up on CPU0, which would be a fatal problem in many cases. As a consequence of this change, the apic::irq_delivery_mode field is now pointless, but this needs to be cleaned up in a separate patch.
Fixes: fdba46ffb4c2 ("x86/apic: Get rid of multi CPU affinity") Reported-by: vcap...@pengaru.com Signed-off-by: Thomas Gleixner Tested-by: vcap...@pengaru.com Cc: Pavel Machek Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712281140440.1688@nanos --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/apic/apic_noop.c | 2 +- arch/x86/kernel/apic/msi.c| 8 ++-- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- drivers/pci/host/pci-hyperv.c | 8 ++-- 6 files changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index aa85690..25a8702 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ .disable_esr= 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 7b659c4..5078b5c 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9b18be7..ce503c9 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ((apic->irq_dest_mode == 0) ? MSI_ADDR_DEST_MODE_PHYSICAL : MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? 
- MSI_ADDR_REDIRECTION_CPU : - MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_REDIRECTION_CPU | MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED : - MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index fa22017..02e8acb 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /*
[tip:irq/urgent] genirq/msi, x86/vector: Prevent reservation mode for non maskable MSI
Commit-ID: bc976233a872c0f20f018fb1e89264a541584e25 Gitweb: https://git.kernel.org/tip/bc976233a872c0f20f018fb1e89264a541584e25 Author: Thomas Gleixner AuthorDate: Fri, 29 Dec 2017 10:47:22 +0100 Committer: Thomas Gleixner CommitDate: Fri, 29 Dec 2017 21:13:05 +0100 genirq/msi, x86/vector: Prevent reservation mode for non maskable MSI The new reservation mode for interrupts assigns a dummy vector when the interrupt is allocated and assigns a real vector when the interrupt is requested. The reservation mode prevents vector pressure when devices with a large amount of queues/interrupts are initialized, but only a minimal subset of those queues/interrupts is actually used. This mode has an issue with MSI interrupts which cannot be masked. If the driver is not careful or the hardware emits an interrupt before the device irq is requested by the driver then the interrupt ends up on the dummy vector as a spurious interrupt which can cause malfunction of the device or in the worst case a lockup of the machine. Change the logic for the reservation mode so that the early activation of MSI interrupts checks whether: - the device is a PCI/MSI device - the reservation mode of the underlying irqdomain is activated - PCI/MSI masking is globally enabled - the PCI/MSI device uses either MSI-X, which supports masking, or MSI with the maskbit supported. If one of those conditions is false, then clear the reservation mode flag in the irq data of the interrupt and invoke irq_domain_activate_irq() with the reserve argument cleared. In the x86 vector code, clear the can_reserve flag in the vector allocation data so a subsequent free_irq() won't create the same situation again. The interrupt stays assigned to a real vector until pci_disable_msi() is invoked and all allocations are undone.
Fixes: 4900be83602b ("x86/vector/msi: Switch to global reservation mode") Reported-by: Alexandru Chirvasitu Reported-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Tested-by: Alexandru Chirvasitu Tested-by: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-...@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: de...@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-me...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291406420.1899@nanos Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291409460.1899@nanos --- arch/x86/kernel/apic/vector.c | 12 +++- kernel/irq/msi.c | 37 + 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 52c85c8..f8b03bb 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -369,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd) int ret; ret = assign_irq_vector_any_locked(irqd); - if (!ret) + if (!ret) { apicd->has_reserved = false; + /* +* Core might have disabled reservation mode after +* allocating the irq descriptor. Ideally this should +* happen before allocation time, but that would require +* completely convoluted ways of transporting that +* information. +*/ + if (!irqd_can_reserve(irqd)) + apicd->can_reserve = false; + } return ret; } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9ba9543..2f3c4f5 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -339,11 +339,38 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, return ret; } -static bool msi_check_reservation_mode(struct msi_domain_info *info) +/* + * Carefully check whether the device can use reservation mode. 
If + * reservation mode is enabled then the early activation will assign a + * dummy vector to the device. If the PCI/MSI device does not support + * masking of the entry then this can result in spurious interrupts when + * the device driver is not absolutely careful. But even then a malfunction + * of the hardware could result in a spurious interrupt on the dummy vector + * and
[tip:timers/urgent] timers: Reinitialize per cpu bases on hotplug
Commit-ID: 26456f87aca7157c057de65c9414b37f1ab881d1 Gitweb: https://git.kernel.org/tip/26456f87aca7157c057de65c9414b37f1ab881d1 Author: Thomas Gleixner AuthorDate: Wed, 27 Dec 2017 21:37:25 +0100 Committer: Thomas Gleixner CommitDate: Fri, 29 Dec 2017 23:13:09 +0100 timers: Reinitialize per cpu bases on hotplug The timer wheel bases are not (re)initialized on CPU hotplug. That leaves them with a potentially stale clk and next_expiry value, which can cause trouble when the CPU is plugged. Add a prepare callback which forwards the clock, sets next_expiry to far in the future and reset the control flags to a known state. Set base->must_forward_clk so the first timer which is queued will try to forward the clock to current jiffies. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: Anna-Maria Gleixner Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272152200.2431@nanos --- include/linux/cpuhotplug.h | 2 +- include/linux/timer.h | 4 +++- kernel/cpu.c | 4 ++-- kernel/time/timer.c| 15 +++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 201ab72..1a32e55 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -86,7 +86,7 @@ enum cpuhp_state { CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, - CPUHP_TIMERS_DEAD, + CPUHP_TIMERS_PREPARE, CPUHP_MIPS_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END= CPUHP_BP_PREPARE_DYN + 20, diff --git a/include/linux/timer.h b/include/linux/timer.h index 04af640..2448f9c 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -207,9 +207,11 @@ unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); #ifdef CONFIG_HOTPLUG_CPU +int timers_prepare_cpu(unsigned int cpu); int
timers_dead_cpu(unsigned int cpu); #else -#define timers_dead_cpu NULL +#define timers_prepare_cpu NULL +#define timers_dead_cpuNULL #endif #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 41376c3..9785847 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = { * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. */ - [CPUHP_TIMERS_DEAD] = { + [CPUHP_TIMERS_PREPARE] = { .name = "timers:dead", - .startup.single = NULL, + .startup.single = timers_prepare_cpu, .teardown.single= timers_dead_cpu, }, /* Kicks the plugged cpu into life */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 19a9c3d..6be576e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1853,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } +int timers_prepare_cpu(unsigned int cpu) +{ + struct timer_base *base; + int b; + + for (b = 0; b < NR_BASES; b++) { + base = per_cpu_ptr(_bases[b], cpu); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->is_idle = false; + base->must_forward_clk = true; + } + return 0; +} + int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base;
[tip:timers/urgent] nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick()
Commit-ID: 5d62c183f9e9df1deeea0906d099a94e8a43047a Gitweb: https://git.kernel.org/tip/5d62c183f9e9df1deeea0906d099a94e8a43047a Author: Thomas GleixnerAuthorDate: Fri, 22 Dec 2017 15:51:13 +0100 Committer: Thomas Gleixner CommitDate: Fri, 29 Dec 2017 23:13:10 +0100 nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick() The conditions in irq_exit() to invoke tick_nohz_irq_exit() which subsequently invokes tick_nohz_stop_sched_tick() are: if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) If need_resched() is not set, but a timer softirq is pending then this is an indication that the softirq code punted and delegated the execution to softirqd. need_resched() is not true because the current interrupted task takes precedence over softirqd. Invoking tick_nohz_irq_exit() in this case can cause an endless loop of timer interrupts because the timer wheel contains an expired timer, but softirqs are not yet executed. So it returns an immediate expiry request, which causes the timer to fire immediately again. Lather, rinse and repeat Prevent that by adding a check for a pending timer soft interrupt to the conditions in tick_nohz_stop_sched_tick() which avoid calling get_next_timer_interrupt(). That keeps the tick sched timer on the tick and prevents a repetitive programming of an already expired timer. 
Reported-by: Sebastian Siewior Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul McKenney Cc: Anna-Maria Gleixner Cc: Sebastian Siewior Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272156050.2431@nanos --- kernel/time/tick-sched.c | 19 +-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 77555fa..f7cc7ab 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) ts->next_tick = 0; } +static inline bool local_timer_softirq_pending(void) +{ + return local_softirq_pending() & TIMER_SOFTIRQ; +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { @@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(_lock, seq)); ts->last_jiffies = basejiff; - if (rcu_needs_cpu(basemono, _rcu) || - arch_needs_cpu() || irq_work_needs_cpu()) { + /* +* Keep the periodic tick, when RCU, architecture or irq_work +* requests it. +* Aside of that check whether the local timer softirq is +* pending. If so its a bad idea to call get_next_timer_interrupt() +* because there is an already expired timer, so it will request +* immeditate expiry, which rearms the hardware timer with a +* minimal delta which brings us back to this place +* immediately. Lather, rinse and repeat... +*/ + if (rcu_needs_cpu(basemono, _rcu) || arch_needs_cpu() || + irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /*
[tip:timers/urgent] timers: Invoke timer_start_debug() where it makes sense
Commit-ID: fd45bb77ad682be728d1002431d77b8c73342836 Gitweb: https://git.kernel.org/tip/fd45bb77ad682be728d1002431d77b8c73342836 Author: Thomas GleixnerAuthorDate: Fri, 22 Dec 2017 15:51:14 +0100 Committer: Thomas Gleixner CommitDate: Fri, 29 Dec 2017 23:13:10 +0100 timers: Invoke timer_start_debug() where it makes sense The timer start debug function is called before the proper timer base is set. As a consequence the trace data contains the stale CPU and flags values. Call the debug function after setting the new base and flags. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: sta...@vger.kernel.org Cc: r...@linutronix.de Cc: Paul McKenney Cc: Anna-Maria Gleixner Link: https://lkml.kernel.org/r/20171222145337.792907...@linutronix.de --- kernel/time/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 6be576e..89a9e1b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1007,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; - debug_activate(timer, expires); - new_base = get_target_base(base, timer->flags); if (base != new_base) { @@ -1032,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } + debug_activate(timer, expires); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance
[tip:timers/urgent] timerqueue: Document return values of timerqueue_add/del()
Commit-ID: 9f4533cd7334235cd4c9b9fb1b0b8791e2ba01a7 Gitweb: https://git.kernel.org/tip/9f4533cd7334235cd4c9b9fb1b0b8791e2ba01a7 Author: Thomas GleixnerAuthorDate: Fri, 22 Dec 2017 15:51:15 +0100 Committer: Thomas Gleixner CommitDate: Fri, 29 Dec 2017 23:13:10 +0100 timerqueue: Document return values of timerqueue_add/del() The return values of timerqueue_add/del() are not documented in the kernel doc comment. Add proper documentation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: r...@linutronix.de Cc: Paul McKenney Cc: Anna-Maria Gleixner Link: https://lkml.kernel.org/r/20171222145337.872681...@linutronix.de --- lib/timerqueue.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/timerqueue.c b/lib/timerqueue.c index 4a720ed..0d54bcb 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -33,8 +33,9 @@ * @head: head of timerqueue * @node: timer node to be added * - * Adds the timer node to the timerqueue, sorted by the - * node's expires value. + * Adds the timer node to the timerqueue, sorted by the node's expires + * value. Returns true if the newly added timer is the first expiring timer in + * the queue. */ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { @@ -70,7 +71,8 @@ EXPORT_SYMBOL_GPL(timerqueue_add); * @head: head of timerqueue * @node: timer node to be removed * - * Removes the timer node from the timerqueue. + * Removes the timer node from the timerqueue. Returns true if the queue is + * not empty after the remove. */ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) {
[tip:x86/pti] x86/kaslr: Fix the vaddr_end mess
Commit-ID: 125125112ba49706518ac9077a1026a18f37 Gitweb: https://git.kernel.org/tip/125125112ba49706518ac9077a1026a18f37 Author: Thomas GleixnerAuthorDate: Thu, 4 Jan 2018 12:32:03 +0100 Committer: Thomas Gleixner CommitDate: Fri, 5 Jan 2018 00:39:57 +0100 x86/kaslr: Fix the vaddr_end mess vaddr_end for KASLR is only documented in the KASLR code itself and is adjusted depending on config options. So it's not surprising that a change of the memory layout causes KASLR to have the wrong vaddr_end. This can map arbitrary stuff into other areas causing hard to understand problems. Remove the whole ifdef magic and define the start of the cpu_entry_area to be the end of the KASLR vaddr range. Add documentation to that effect. Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") Reported-by: Benjamin Gilbert Signed-off-by: Thomas Gleixner Tested-by: Benjamin Gilbert Cc: Andy Lutomirski Cc: Greg Kroah-Hartman Cc: stable Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Garnier , Cc: Alexander Kuleshov Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos --- Documentation/x86/x86_64/mm.txt | 6 ++ arch/x86/include/asm/pgtable_64_types.h | 8 +++- arch/x86/mm/kaslr.c | 32 +--- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index f7dabe1..ea91cb6 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,6 +12,7 @@ ea00 - eaff (=40 bits) virtual memory map (1TB) ... unused hole ... ec00 - fbff (=44 bits) kasan shadow memory (16TB) ... unused hole ... + vaddr_end for KASLR fe00 - fe7f (=39 bits) cpu_entry_area mapping fe80 - feff (=39 bits) LDT remap for PTI ff00 - ff7f (=39 bits) %esp fixup stacks @@ -37,6 +38,7 @@ ffd4 - ffd5 (=49 bits) virtual memory map (512TB) ... unused hole ... ffdf - fc00 (=53 bits) kasan shadow memory (8PB) ... unused hole ... + vaddr_end for KASLR fe00 - fe7f (=39 bits) cpu_entry_area mapping ... 
unused hole ... ff00 - ff7f (=39 bits) %esp fixup stacks @@ -71,3 +73,7 @@ during EFI runtime calls. Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all physical memory, vmalloc/ioremap space and virtual memory map are randomized. Their order is preserved but their base will be offset early at boot time. + +Be very careful vs. KASLR when changing anything here. The KASLR address +range must not overlap with anything except the KASAN shadow area, which is +correct as KASAN disables KASLR. diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 61b4b60..6b8f73d 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -75,7 +75,13 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) -/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ +/* + * See Documentation/x86/x86_64/mm.txt for a description of the memory map. + * + * Be very careful vs. KASLR when changing anything here. The KASLR address + * range must not overlap with anything except the KASAN shadow area, which + * is correct as KASAN disables KASLR. + */ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #ifdef CONFIG_X86_5LEVEL diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 879ef93..aedebd2 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -34,25 +34,14 @@ #define TB_SHIFT 40 /* - * Virtual address start and end range for randomization. The end changes base - * on configuration to have the highest amount of space for randomization. - * It increases the possible random position for each randomized region. + * Virtual address start and end range for randomization. * - * You need to add an if/def entry if you introduce a new memory region - * compatible with KASLR. Your entry must be in logical order with memory - * layout. 
For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to - * ensure that this order is correct and won't be changed. + * The end address could depend on more configuration options to make the + * highest amount of space for randomization
[tip:x86/pti] x86/kaslr: Fix the vaddr_end mess
Commit-ID: 1b3ef54207f068dae9c36d891ff69dd4d37c5c2f Gitweb: https://git.kernel.org/tip/1b3ef54207f068dae9c36d891ff69dd4d37c5c2f Author: Thomas GleixnerAuthorDate: Thu, 4 Jan 2018 12:32:03 +0100 Committer: Thomas Gleixner CommitDate: Thu, 4 Jan 2018 23:04:57 +0100 x86/kaslr: Fix the vaddr_end mess vaddr_end for KASLR is only documented in the KASLR code itself and is adjusted depending on config options. So it's not surprising that a change of the memory layout causes KASLR to have the wrong vaddr_end. This can map arbitrary stuff into other areas causing hard to understand problems. Remove the whole ifdef magic and define the start of the cpu_entry_area to be the end of the KASLR vaddr range. Add documentation to that effect. Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") Reported-by: Benjamin Gilbert Signed-off-by: Thomas Gleixner Tested-by: Benjamin Gilbert Cc: Andy Lutomirski Cc: Greg Kroah-Hartman Cc: stable Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Garnier , Cc: Alexander Kuleshov Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos --- Documentation/x86/x86_64/mm.txt | 6 ++ arch/x86/include/asm/pgtable_64_types.h | 8 +++- arch/x86/mm/kaslr.c | 34 ++--- 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index f7dabe1..ea91cb6 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,6 +12,7 @@ ea00 - eaff (=40 bits) virtual memory map (1TB) ... unused hole ... ec00 - fbff (=44 bits) kasan shadow memory (16TB) ... unused hole ... + vaddr_end for KASLR fe00 - fe7f (=39 bits) cpu_entry_area mapping fe80 - feff (=39 bits) LDT remap for PTI ff00 - ff7f (=39 bits) %esp fixup stacks @@ -37,6 +38,7 @@ ffd4 - ffd5 (=49 bits) virtual memory map (512TB) ... unused hole ... ffdf - fc00 (=53 bits) kasan shadow memory (8PB) ... unused hole ... + vaddr_end for KASLR fe00 - fe7f (=39 bits) cpu_entry_area mapping ... 
unused hole ... ff00 - ff7f (=39 bits) %esp fixup stacks @@ -71,3 +73,7 @@ during EFI runtime calls. Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all physical memory, vmalloc/ioremap space and virtual memory map are randomized. Their order is preserved but their base will be offset early at boot time. + +Be very careful vs. KASLR when changing anything here. The KASLR address +range must not overlap with anything except the KASAN shadow area, which is +correct as KASAN disables KASLR. diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 61b4b60..6b8f73d 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -75,7 +75,13 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) -/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ +/* + * See Documentation/x86/x86_64/mm.txt for a description of the memory map. + * + * Be very careful vs. KASLR when changing anything here. The KASLR address + * range must not overlap with anything except the KASAN shadow area, which + * is correct as KASAN disables KASLR. + */ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #ifdef CONFIG_X86_5LEVEL diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 879ef93..b805a61 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -34,25 +34,14 @@ #define TB_SHIFT 40 /* - * Virtual address start and end range for randomization. The end changes base - * on configuration to have the highest amount of space for randomization. - * It increases the possible random position for each randomized region. + * Virtual address start and end range for randomization. * - * You need to add an if/def entry if you introduce a new memory region - * compatible with KASLR. Your entry must be in logical order with memory - * layout. 
For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to - * ensure that this order is correct and won't be changed. + * The end address could depend on more configuration options to make the + * highest amount of space for randomization
[tip:x86/pti] x86/mm: Map cpu_entry_area at the same place on 4/5 level
Commit-ID: f2078904810373211fb15f91888fba14c01a4acc Gitweb: https://git.kernel.org/tip/f2078904810373211fb15f91888fba14c01a4acc Author: Thomas GleixnerAuthorDate: Thu, 4 Jan 2018 13:01:40 +0100 Committer: Thomas Gleixner CommitDate: Thu, 4 Jan 2018 23:04:57 +0100 x86/mm: Map cpu_entry_area at the same place on 4/5 level There is no reason for 4 and 5 level pagetables to have a different layout. It just makes determining vaddr_end for KASLR harder than necessary. Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Benjamin Gilbert Cc: Greg Kroah-Hartman Cc: stable Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Garnier , Cc: Alexander Kuleshov Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos --- Documentation/x86/x86_64/mm.txt | 7 --- arch/x86/include/asm/pgtable_64_types.h | 4 ++-- arch/x86/mm/dump_pagetables.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index ddd5ffd..f7dabe1 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,8 +12,8 @@ ea00 - eaff (=40 bits) virtual memory map (1TB) ... unused hole ... ec00 - fbff (=44 bits) kasan shadow memory (16TB) ... unused hole ... -fe00 - fe7f (=39 bits) LDT remap for PTI -fe80 - feff (=39 bits) cpu_entry_area mapping +fe00 - fe7f (=39 bits) cpu_entry_area mapping +fe80 - feff (=39 bits) LDT remap for PTI ff00 - ff7f (=39 bits) %esp fixup stacks ... unused hole ... ffef - fffe (=64 GB) EFI region mapping space @@ -37,7 +37,8 @@ ffd4 - ffd5 (=49 bits) virtual memory map (512TB) ... unused hole ... ffdf - fc00 (=53 bits) kasan shadow memory (8PB) ... unused hole ... -fe80 - feff (=39 bits) cpu_entry_area mapping +fe00 - fe7f (=39 bits) cpu_entry_area mapping +... unused hole ... ff00 - ff7f (=39 bits) %esp fixup stacks ... unused hole ... 
ffef - fffe (=64 GB) EFI region mapping space diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6233e55..61b4b60 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -88,7 +88,7 @@ typedef struct { pteval_t pte; } pte_t; # define VMALLOC_SIZE_TB _AC(32, UL) # define __VMALLOC_BASE_AC(0xc900, UL) # define __VMEMMAP_BASE_AC(0xea00, UL) -# define LDT_PGD_ENTRY _AC(-4, UL) +# define LDT_PGD_ENTRY _AC(-3, UL) # define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #endif @@ -110,7 +110,7 @@ typedef struct { pteval_t pte; } pte_t; #define ESPFIX_PGD_ENTRY _AC(-2, UL) #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) -#define CPU_ENTRY_AREA_PGD _AC(-3, UL) +#define CPU_ENTRY_AREA_PGD _AC(-4, UL) #define CPU_ENTRY_AREA_BASE(CPU_ENTRY_AREA_PGD << P4D_SHIFT) #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index f56902c..2a4849e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -61,10 +61,10 @@ enum address_markers_idx { KASAN_SHADOW_START_NR, KASAN_SHADOW_END_NR, #endif + CPU_ENTRY_AREA_NR, #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) LDT_NR, #endif - CPU_ENTRY_AREA_NR, #ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, #endif
[tip:x86/pti] x86/cpu: Implement CPU vulnerabilities sysfs functions
Commit-ID: 61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4 Gitweb: https://git.kernel.org/tip/61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4 Author: Thomas Gleixner AuthorDate: Sun, 7 Jan 2018 22:48:01 +0100 Committer: Thomas Gleixner CommitDate: Mon, 8 Jan 2018 11:10:40 +0100 x86/cpu: Implement CPU vulnerabilities sysfs functions Implement the CPU vulnerability show functions for meltdown, spectre_v1 and spectre_v2. Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Will Deacon Cc: Dave Hansen Cc: Linus Torvalds Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180107214913.177414...@linutronix.de --- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/bugs.c | 29 + 2 files changed, 30 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd5199d..e23d21a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -89,6 +89,7 @@ config X86 select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ba0b242..76ad6cb 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -60,3 +61,31 @@ void __init check_bugs(void) set_memory_4k((unsigned long)__va(0), 1); #endif } + +#ifdef CONFIG_SYSFS +ssize_t cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return sprintf(buf, "Not affected\n"); + if (boot_cpu_has(X86_FEATURE_PTI)) + return sprintf(buf, "Mitigation: PTI\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + return sprintf(buf, "Not affected\n"); + 
return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); + return sprintf(buf, "Vulnerable\n"); +} +#endif
[tip:x86/pti] sysfs/cpu: Add vulnerability folder
Commit-ID: 87590ce6e373d1a5401f6539f0c59ef92dd924a9 Gitweb: https://git.kernel.org/tip/87590ce6e373d1a5401f6539f0c59ef92dd924a9 Author: Thomas GleixnerAuthorDate: Sun, 7 Jan 2018 22:48:00 +0100 Committer: Thomas Gleixner CommitDate: Mon, 8 Jan 2018 11:10:33 +0100 sysfs/cpu: Add vulnerability folder As the meltdown/spectre problem affects several CPU architectures, it makes sense to have common way to express whether a system is affected by a particular vulnerability or not. If affected the way to express the mitigation should be common as well. Create /sys/devices/system/cpu/vulnerabilities folder and files for meltdown, spectre_v1 and spectre_v2. Allow architectures to override the show function. Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Will Deacon Cc: Dave Hansen Cc: Linus Torvalds Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180107214913.096657...@linutronix.de --- Documentation/ABI/testing/sysfs-devices-system-cpu | 16 drivers/base/Kconfig | 3 ++ drivers/base/cpu.c | 48 ++ include/linux/cpu.h| 7 4 files changed, 74 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index f3d5817..bd3a88e 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -373,3 +373,19 @@ Contact: Linux kernel mailing list Description: information about CPUs heterogeneity. cpu_capacity: capacity of cpu#. + +What: /sys/devices/system/cpu/vulnerabilities + /sys/devices/system/cpu/vulnerabilities/meltdown + /sys/devices/system/cpu/vulnerabilities/spectre_v1 + /sys/devices/system/cpu/vulnerabilities/spectre_v2 +Date: Januar 2018 +Contact: Linux kernel mailing list +Description: Information about CPU vulnerabilities + + The files are named after the code names of CPU + vulnerabilities. 
The output of those files reflects the + state of the CPUs in the system. Possible output values: + + "Not affected"CPU is not affected by the vulnerability + "Vulnerable" CPU is affected and no mitigation in effect + "Mitigation: $M" CPU is affetcted and mitigation $M is in effect diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 2f6614c..37a71fd 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -235,6 +235,9 @@ config GENERIC_CPU_DEVICES config GENERIC_CPU_AUTOPROBE bool +config GENERIC_CPU_VULNERABILITIES + bool + config SOC_BUS bool select GLOB diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 321cd7b..825964e 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -501,10 +501,58 @@ static void __init cpu_dev_register_generic(void) #endif } +#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES + +ssize_t __weak cpu_show_meltdown(struct device *dev, +struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + +ssize_t __weak cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + +ssize_t __weak cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + +static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); +static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); +static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); + +static struct attribute *cpu_root_vulnerabilities_attrs[] = { + _attr_meltdown.attr, + _attr_spectre_v1.attr, + _attr_spectre_v2.attr, + NULL +}; + +static const struct attribute_group cpu_root_vulnerabilities_group = { + .name = "vulnerabilities", + .attrs = cpu_root_vulnerabilities_attrs, +}; + +static void __init cpu_register_vulnerabilities(void) +{ + if (sysfs_create_group(_subsys.dev_root->kobj, + _root_vulnerabilities_group)) + pr_err("Unable to register CPU vulnerabilities\n"); +} + +#else +static inline 
void cpu_register_vulnerabilities(void) { } +#endif + void __init cpu_dev_init(void) { if (subsys_system_register(_subsys, cpu_root_attr_groups))
[tip:x86/pti] x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN
Commit-ID: de791821c295cc61419a06fe5562288417d1bc58 Gitweb: https://git.kernel.org/tip/de791821c295cc61419a06fe5562288417d1bc58 Author: Thomas GleixnerAuthorDate: Fri, 5 Jan 2018 15:27:34 +0100 Committer: Thomas Gleixner CommitDate: Fri, 5 Jan 2018 15:34:43 +0100 x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN Use the name associated with the particular attack which needs page table isolation for mitigation. Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Alan Cox Cc: Jiri Koshina Cc: Linus Torvalds Cc: Tim Chen Cc: Andi Lutomirski Cc: Andi Kleen Cc: Peter Zijlstra Cc: Paul Turner Cc: Tom Lendacky Cc: Greg KH Cc: Dave Hansen Cc: Kees Cook Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801051525300.1724@nanos --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/mm/pti.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 07cdd17..21ac898 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -341,6 +341,6 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITORX86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ -#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b1be494..2d3bd22 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -900,7 +900,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); if (c->x86_vendor != X86_VENDOR_AMD) - 
setup_force_cpu_bug(X86_BUG_CPU_INSECURE); + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); fpu__init_system(c); diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 2da28ba..43d4a4a 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -56,13 +56,13 @@ static void __init pti_print_if_insecure(const char *reason) { - if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } static void __init pti_print_if_secure(const char *reason) { - if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } @@ -96,7 +96,7 @@ void __init pti_check_boottime_disable(void) } autosel: - if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) return; enable: setup_force_cpu_cap(X86_FEATURE_PTI);
[tip:timers/core] hrtimer: Optimize the hrtimer code by using static keys for migration_enable/nohz_active
Commit-ID: ae67badaa1643253998cb21d5782e4ea7c231a29 Gitweb: https://git.kernel.org/tip/ae67badaa1643253998cb21d5782e4ea7c231a29 Author: Thomas GleixnerAuthorDate: Sun, 14 Jan 2018 23:30:51 +0100 Committer: Ingo Molnar CommitDate: Tue, 16 Jan 2018 02:35:44 +0100 hrtimer: Optimize the hrtimer code by using static keys for migration_enable/nohz_active The hrtimer_cpu_base::migration_enable and ::nohz_active fields were originally introduced to avoid accessing global variables for these decisions. Still that results in a (cache hot) load and conditional branch, which can be avoided by using static keys. Implement it with static keys and optimize for the most critical case of high performance networking which tends to disable the timer migration functionality. No change in functionality. Signed-off-by: Thomas Gleixner Cc: Anna-Maria Gleixner Cc: Christoph Hellwig Cc: Frederic Weisbecker Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: keesc...@chromium.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1801142327490.2371@nanos Link: https://lkml.kernel.org/r/20171221104205.7269-2-anna-ma...@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 4 --- kernel/time/hrtimer.c | 17 +++--- kernel/time/tick-internal.h | 19 +++ kernel/time/tick-sched.c| 2 +- kernel/time/timer.c | 83 +++-- 5 files changed, 60 insertions(+), 65 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 012c37f..79b2a8d 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -153,8 +153,6 @@ enum hrtimer_base_type { * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events - * @migration_enabled: The migration of hrtimers to other cpus is enabled - * @nohz_active: The nohz functionality is enabled * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @next_timer:Pointer to the 
first expiring timer @@ -178,8 +176,6 @@ struct hrtimer_cpu_base { unsigned intcpu; unsigned intactive_bases; unsigned intclock_was_set_seq; - boolmigration_enabled; - boolnohz_active; #ifdef CONFIG_HIGH_RES_TIMERS unsigned intin_hrtirq : 1, hres_active : 1, diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d325208..1d06d2b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -178,23 +178,16 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } -#ifdef CONFIG_NO_HZ_COMMON -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, -int pinned) -{ - if (pinned || !base->migration_enabled) - return base; - return _cpu(hrtimer_bases, get_nohz_timer_target()); -} -#else static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(_migration_enabled) && !pinned) + return _cpu(hrtimer_bases, get_nohz_timer_target()); +#endif return base; } -#endif /* * We switch the timer base to a power-optimized selected CPU target, @@ -969,7 +962,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * Kick to reschedule the next tick to handle the new timer * on dynticks target. 
*/ - if (new_base->cpu_base->nohz_active) + if (is_timers_nohz_active()) wake_up_nohz_cpu(new_base->cpu_base->cpu); } else { hrtimer_reprogram(timer, new_base); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f8e1845..f690628 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -150,14 +150,19 @@ static inline void tick_nohz_init(void) { } #ifdef CONFIG_NO_HZ_COMMON extern unsigned long tick_nohz_active; -#else +extern void timers_update_nohz(void); +extern struct static_key_false timers_nohz_active; +static inline bool is_timers_nohz_active(void) +{ + return static_branch_likely(_nohz_active); +} +# ifdef CONFIG_SMP +extern struct static_key_false timers_migration_enabled; +# endif
[tip:timers/core] hrtimer: Correct blatantly incorrect comment
Commit-ID: d05ca13b8d3f685667b3b1748fa89285466270c5 Gitweb: https://git.kernel.org/tip/d05ca13b8d3f685667b3b1748fa89285466270c5 Author: Thomas Gleixner AuthorDate: Thu, 21 Dec 2017 11:41:31 +0100 Committer: Ingo Molnar CommitDate: Tue, 16 Jan 2018 02:35:44 +0100 hrtimer: Correct blatantly incorrect comment The protection of a hrtimer which runs its callback against migration to a different CPU has nothing to do with hard interrupt context. The protection against migration of a hrtimer running the expiry callback is the pointer in the cpu_base which holds a pointer to the currently running timer. This pointer is evaluated in the code which potentially switches the timer base and makes sure it's kept on the CPU on which the callback is running. Reported-by: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Reviewed-by: Frederic Weisbecker Cc: Christoph Hellwig Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: keesc...@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-3-anna-ma...@linutronix.de Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1d06d2b..7687355 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1195,9 +1195,9 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, timer->is_rel = false; /* -* Because we run timers from hardirq context, there is no chance -* they get migrated to another cpu, therefore its safe to unlock -* the timer base. +* The timer is marked as running in the CPU base, so it is +* protected against migration to a different CPU even if the lock +* is dropped. */ raw_spin_unlock(&cpu_base->lock); trace_hrtimer_expire_entry(timer, now);
[tip:timers/core] ALSA/dummy: Replace tasklet with softirq hrtimer
Commit-ID: b03bbbe08ff04d80136b6aac152954ef308a4909 Gitweb: https://git.kernel.org/tip/b03bbbe08ff04d80136b6aac152954ef308a4909 Author: Thomas GleixnerAuthorDate: Thu, 21 Dec 2017 11:42:03 +0100 Committer: Ingo Molnar CommitDate: Tue, 16 Jan 2018 09:51:22 +0100 ALSA/dummy: Replace tasklet with softirq hrtimer The tasklet is used to defer the execution of snd_pcm_period_elapsed() to the softirq context. Using the HRTIMER_MODE_SOFT mode invokes the timer callback in softirq context as well which renders the tasklet useless. [o-takashi: avoid stall due to a call of hrtimer_cancel() on a callback of hrtimer] Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Reviewed-by: Takashi Iwai Cc: Christoph Hellwig Cc: Jaroslav Kysela Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Takashi Iwai Cc: Takashi Sakamoto Cc: alsa-de...@alsa-project.org Cc: keesc...@chromium.org Link: http://lkml.kernel.org/r/20171221104205.7269-35-anna-ma...@linutronix.de Signed-off-by: Ingo Molnar --- sound/drivers/dummy.c | 27 --- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c index 7b2b1f7..6ad2ff5 100644 --- a/sound/drivers/dummy.c +++ b/sound/drivers/dummy.c @@ -375,17 +375,9 @@ struct dummy_hrtimer_pcm { ktime_t period_time; atomic_t running; struct hrtimer timer; - struct tasklet_struct tasklet; struct snd_pcm_substream *substream; }; -static void dummy_hrtimer_pcm_elapsed(unsigned long priv) -{ - struct dummy_hrtimer_pcm *dpcm = (struct dummy_hrtimer_pcm *)priv; - if (atomic_read(>running)) - snd_pcm_period_elapsed(dpcm->substream); -} - static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer) { struct dummy_hrtimer_pcm *dpcm; @@ -393,7 +385,14 @@ static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer) dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer); if (!atomic_read(>running)) return HRTIMER_NORESTART; - tasklet_schedule(>tasklet); + /* +* In cases of XRUN and 
draining, this calls .trigger to stop PCM +* substream. +*/ + snd_pcm_period_elapsed(dpcm->substream); + if (!atomic_read(>running)) + return HRTIMER_NORESTART; + hrtimer_forward_now(timer, dpcm->period_time); return HRTIMER_RESTART; } @@ -403,7 +402,7 @@ static int dummy_hrtimer_start(struct snd_pcm_substream *substream) struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data; dpcm->base_time = hrtimer_cb_get_time(>timer); - hrtimer_start(>timer, dpcm->period_time, HRTIMER_MODE_REL); + hrtimer_start(>timer, dpcm->period_time, HRTIMER_MODE_REL_SOFT); atomic_set(>running, 1); return 0; } @@ -413,14 +412,14 @@ static int dummy_hrtimer_stop(struct snd_pcm_substream *substream) struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data; atomic_set(>running, 0); - hrtimer_cancel(>timer); + if (!hrtimer_callback_running(>timer)) + hrtimer_cancel(>timer); return 0; } static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm) { hrtimer_cancel(>timer); - tasklet_kill(>tasklet); } static snd_pcm_uframes_t @@ -465,12 +464,10 @@ static int dummy_hrtimer_create(struct snd_pcm_substream *substream) if (!dpcm) return -ENOMEM; substream->runtime->private_data = dpcm; - hrtimer_init(>timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(>timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); dpcm->timer.function = dummy_hrtimer_callback; dpcm->substream = substream; atomic_set(>running, 0); - tasklet_init(>tasklet, dummy_hrtimer_pcm_elapsed, -(unsigned long)dpcm); return 0; }
[tip:timers/core] usb/gadget/NCM: Replace tasklet with softirq hrtimer
Commit-ID: b1a31a5f5f27ff8aba42b545a1c721941f735107 Gitweb: https://git.kernel.org/tip/b1a31a5f5f27ff8aba42b545a1c721941f735107 Author: Thomas Gleixner AuthorDate: Thu, 21 Dec 2017 11:42:04 +0100 Committer: Ingo Molnar CommitDate: Tue, 16 Jan 2018 09:51:23 +0100 usb/gadget/NCM: Replace tasklet with softirq hrtimer The tx_tasklet tasklet is used to invoke the hrtimer (task_timer) in softirq context. This can also be achieved without the tasklet but with HRTIMER_MODE_SOFT as hrtimer mode. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Acked-by: Felipe Balbi Cc: Christoph Hellwig Cc: Felipe Balbi Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: keesc...@chromium.org Cc: linux-...@vger.kernel.org Link: http://lkml.kernel.org/r/20171221104205.7269-36-anna-ma...@linutronix.de Signed-off-by: Ingo Molnar --- drivers/usb/gadget/function/f_ncm.c | 30 +++--- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index c5bce8e..5780fba 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -73,9 +73,7 @@ struct f_ncm { struct sk_buff *skb_tx_ndp; u16 ndp_dgram_count; bool timer_force_tx; - struct tasklet_struct tx_tasklet; struct hrtimer task_timer; - bool timer_stopping; }; @@ -1104,7 +1102,7 @@ static struct sk_buff *ncm_wrap_ntb(struct gether *port, /* Delay the timer. */ hrtimer_start(&ncm->task_timer, TX_TIMEOUT_NSECS, - HRTIMER_MODE_REL); + HRTIMER_MODE_REL_SOFT); /* Add the datagram position entries */ ntb_ndp = skb_put_zero(ncm->skb_tx_ndp, dgram_idx_len); @@ -1148,17 +1146,15 @@ err: } /* - * This transmits the NTB if there are frames waiting. + * The transmit should only be run if no skb data has been sent + * for a certain duration.
*/ -static void ncm_tx_tasklet(unsigned long data) +static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data) { - struct f_ncm*ncm = (void *)data; - - if (ncm->timer_stopping) - return; + struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer); /* Only send if data is available. */ - if (ncm->skb_tx_data) { + if (!ncm->timer_stopping && ncm->skb_tx_data) { ncm->timer_force_tx = true; /* XXX This allowance of a NULL skb argument to ndo_start_xmit @@ -1171,16 +1167,6 @@ static void ncm_tx_tasklet(unsigned long data) ncm->timer_force_tx = false; } -} - -/* - * The transmit should only be run if no skb data has been sent - * for a certain duration. - */ -static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data) -{ - struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer); - tasklet_schedule(>tx_tasklet); return HRTIMER_NORESTART; } @@ -1513,8 +1499,7 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f) ncm->port.open = ncm_open; ncm->port.close = ncm_close; - tasklet_init(>tx_tasklet, ncm_tx_tasklet, (unsigned long) ncm); - hrtimer_init(>task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(>task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); ncm->task_timer.function = ncm_tx_timeout; DBG(cdev, "CDC Network: %s speed IN/%s OUT/%s NOTIFY/%s\n", @@ -1623,7 +1608,6 @@ static void ncm_unbind(struct usb_configuration *c, struct usb_function *f) DBG(c->cdev, "ncm unbind\n"); hrtimer_cancel(>task_timer); - tasklet_kill(>tx_tasklet); ncm_string_defs[0].id = 0; usb_free_all_descriptors(f);
[tip:x86/pti] x86/pti: Fix !PCID and sanitize defines
Commit-ID: f10ee3dcc9f0aba92a5c4c064628be5200765dc2 Gitweb: https://git.kernel.org/tip/f10ee3dcc9f0aba92a5c4c064628be5200765dc2 Author: Thomas Gleixner AuthorDate: Sun, 14 Jan 2018 00:23:57 +0100 Committer: Thomas Gleixner CommitDate: Sun, 14 Jan 2018 10:45:53 +0100 x86/pti: Fix !PCID and sanitize defines The switch to the user space page tables in the low level ASM code unconditionally sets bit 12 and bit 11 of CR3. Bit 12 is switching the base address of the page directory to the user part, bit 11 is switching the PCID to the PCID associated with the user page tables. This fails on a machine which lacks PCID support because bit 11 is set in CR3. Bit 11 is reserved when PCID is inactive. While the Intel SDM claims that the reserved bits are ignored when PCID is disabled, the AMD APM states that they should be cleared. This went unnoticed as the AMD APM was not checked when the code was developed and reviewed and test systems with Intel CPUs never failed to boot. The report is against a CentOS 6 host where the guest fails to boot, so it's not yet clear whether this is a virt issue or can happen on real hardware too, but that's irrelevant as the AMD APM clearly asks for clearing the reserved bits. Make sure that on non PCID machines bit 11 is not set by the page table switching code. Andy suggested renaming the related bits and masks so they are clearly describing what they should be used for, which is done as well for clarity. That split could have been done with alternatives but the macro hell is horrible and ugly. This can be done on top if someone cares to remove the extra orq. For now it's a straightforward fix. 
Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") Reported-by: Laura Abbott Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: stable Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Willy Tarreau Cc: David Woodhouse Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801140009150.2371@nanos --- arch/x86/entry/calling.h | 36 ++ arch/x86/include/asm/processor-flags.h | 2 +- arch/x86/include/asm/tlbflush.h| 6 +++--- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 45a63e0..3f48f69 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -198,8 +198,11 @@ For 32-bit we have the following conventions - kernel is built with * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two * halves: */ -#define PTI_SWITCH_PGTABLES_MASK (1<
[tip:x86/pti] x86/mce: Make machine check speculation protected
Commit-ID: 6f41c34d69eb005e7848716bbcafc979b35037d5 Gitweb: https://git.kernel.org/tip/6f41c34d69eb005e7848716bbcafc979b35037d5 Author: Thomas GleixnerAuthorDate: Thu, 18 Jan 2018 16:28:26 +0100 Committer: Thomas Gleixner CommitDate: Fri, 19 Jan 2018 16:31:28 +0100 x86/mce: Make machine check speculation protected The machine check idtentry uses an indirect branch directly from the low level code. This evades the speculation protection. Replace it by a direct call into C code and issue the indirect call there so the compiler can apply the proper speculation protection. Signed-off-by: Thomas Gleixner Reviewed-by:Borislav Petkov Reviewed-by: David Woodhouse Niced-by: Peter Zijlstra Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801181626290.1847@nanos --- arch/x86/entry/entry_64.S| 2 +- arch/x86/include/asm/traps.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 5 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d54a0ed..63f4320 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1258,7 +1258,7 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 #endif #ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 31051f3..3de6933 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -88,6 +88,7 @@ dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif +dotraplinkage void do_mce(struct pt_regs *, long); static inline int get_si_code(unsigned long condition) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3b413065..a9e898b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ 
b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1788,6 +1788,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +dotraplinkage void do_mce(struct pt_regs *regs, long error_code) +{ + machine_check_vector(regs, error_code); +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off:
[tip:x86/pti] x86/retpoline: Remove compile time warning
Commit-ID: b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6 Gitweb: https://git.kernel.org/tip/b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6 Author: Thomas Gleixner AuthorDate: Sun, 14 Jan 2018 22:13:29 +0100 Committer: Thomas Gleixner CommitDate: Sun, 14 Jan 2018 22:29:36 +0100 x86/retpoline: Remove compile time warning Remove the compile time warning when CONFIG_RETPOLINE=y and the compiler does not have retpoline support. Linus' rationale for this is: It's wrong because it will just make people turn off RETPOLINE, and the asm updates - and return stack clearing - that are independent of the compiler are likely the most important parts because they are likely the ones easiest to target. And it's annoying because most people won't be able to do anything about it. The number of people building their own compiler? Very small. So if their distro hasn't got a compiler yet (and pretty much nobody does), the warning is just annoying crap. It is already properly reported as part of the sysfs interface. The compile-time warning only encourages bad things. 
Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Requested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: Peter Zijlstra (Intel) Cc: gno...@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lenda...@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/CA+55aFzWgquv4i6Mab6bASqYXg3ErV3XDFEYf=geccdqg5u...@mail.gmail.com --- arch/x86/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 974c618..504b1a4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -240,8 +240,6 @@ ifdef CONFIG_RETPOLINE RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) ifneq ($(RETPOLINE_CFLAGS),) KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE -else -$(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) endif endif
[tip:timers/urgent] hrtimer: Reset hrtimer cpu base proper on CPU hotplug
Commit-ID: d5421ea43d30701e03cadc56a38854c36a8b4433 Gitweb: https://git.kernel.org/tip/d5421ea43d30701e03cadc56a38854c36a8b4433 Author: Thomas Gleixner AuthorDate: Fri, 26 Jan 2018 14:54:32 +0100 Committer: Thomas Gleixner CommitDate: Sat, 27 Jan 2018 15:12:22 +0100 hrtimer: Reset hrtimer cpu base proper on CPU hotplug The hrtimer interrupt code contains a hang detection and mitigation mechanism, which prevents that a long delayed hrtimer interrupt causes a continuous retriggering of interrupts which prevents the system from making progress. If a hang is detected then the timer hardware is programmed with a certain delay into the future and a flag is set in the hrtimer cpu base which prevents newly enqueued timers from reprogramming the timer hardware prior to the chosen delay. The subsequent hrtimer interrupt after the delay clears the flag and resumes normal operation. If such a hang happens in the last hrtimer interrupt before a CPU is unplugged then the hang_detected flag is set and stays that way when the CPU is plugged in again. At that point the timer hardware is not armed and it cannot be armed because the hang_detected flag is still active, so nothing clears that flag. As a consequence the CPU does not receive hrtimer interrupts and no timers expire on that CPU which results in RCU stalls and other malfunctions. Clear the flag along with some other less critical members of the hrtimer cpu base to ensure starting from a clean state when a CPU is plugged in. Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the root cause of that hard to reproduce heisenbug. Once understood it's trivial and certainly justifies a brown paper bag. Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic") Reported-by: Paul E. 
McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Sewior Cc: Anna-Maria Gleixner Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos --- kernel/time/hrtimer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d325208..aa9d2a2b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer, static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next = KTIME_MAX; + base->hang_detected = 0; base->hres_active = 0; + base->next_timer = NULL; } /* @@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) timerqueue_init_head(_base->clock_base[i].active); } + cpu_base->active_bases = 0; cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); return 0;
[tip:irq/urgent] genirq: Make legacy autoprobing work again
Commit-ID: 55595980acc3232b018ba30df8ee6e0ac40ad184 Gitweb: https://git.kernel.org/tip/55595980acc3232b018ba30df8ee6e0ac40ad184 Author: Thomas Gleixner AuthorDate: Tue, 30 Jan 2018 19:36:32 +0100 Committer: Thomas Gleixner CommitDate: Wed, 31 Jan 2018 10:52:06 +0100 genirq: Make legacy autoprobing work again Meelis reported the following warning on a quad P3 HP NetServer museum piece: WARNING: CPU: 3 PID: 258 at kernel/irq/chip.c:244 __irq_startup+0x80/0x100 EIP: __irq_startup+0x80/0x100 irq_startup+0x7e/0x170 probe_irq_on+0x128/0x2b0 parport_irq_probe.constprop.18+0x8d/0x1af [parport_pc] parport_pc_probe_port+0xf11/0x1260 [parport_pc] parport_pc_init+0x78a/0xf10 [parport_pc] parport_parse_param.constprop.16+0xf0/0xf0 [parport_pc] do_one_initcall+0x45/0x1e0 This is caused by the rewrite of the irq activation/startup sequence which failed to convert a call site in the irq legacy auto probing code. To fix this irq_activate_and_startup() needs to gain a return value so the pending logic can work properly. 
Fixes: c942cee46bba ("genirq: Separate activation and startup") Reported-by: Meelis Roos Signed-off-by: Thomas Gleixner Tested-by: Meelis Roos Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801301935410.1797@nanos --- kernel/irq/autoprobe.c | 2 +- kernel/irq/chip.c | 4 ++-- kernel/irq/internals.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 4e8089b..8c82ea2 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -71,7 +71,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(>lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) + if (irq_activate_and_startup(desc, IRQ_NORESEND)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(>lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 043bfc3..f681c0e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc) return 0; } -void irq_activate_and_startup(struct irq_desc *desc, bool resend) +int irq_activate_and_startup(struct irq_desc *desc, bool resend) { if (WARN_ON(irq_activate(desc))) return; - irq_startup(desc, resend, IRQ_START_FORCE); + return irq_startup(desc, resend, IRQ_START_FORCE); } static void __irq_disable(struct irq_desc *desc, bool mask); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ab19371..ca6afa2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc); #define IRQ_START_COND false extern int irq_activate(struct irq_desc *desc); -extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); +extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc);
[tip:irq/urgent] genirq: Make legacy autoprobing work again
Commit-ID: 9bc43be5151aaf1aa87f832128f1687341f07483 Gitweb: https://git.kernel.org/tip/9bc43be5151aaf1aa87f832128f1687341f07483 Author: Thomas Gleixner AuthorDate: Tue, 30 Jan 2018 19:36:32 +0100 Committer: Thomas Gleixner CommitDate: Thu, 1 Feb 2018 10:54:48 +0100 genirq: Make legacy autoprobing work again Meelis reported the following warning on a quad P3 HP NetServer museum piece: WARNING: CPU: 3 PID: 258 at kernel/irq/chip.c:244 __irq_startup+0x80/0x100 EIP: __irq_startup+0x80/0x100 irq_startup+0x7e/0x170 probe_irq_on+0x128/0x2b0 parport_irq_probe.constprop.18+0x8d/0x1af [parport_pc] parport_pc_probe_port+0xf11/0x1260 [parport_pc] parport_pc_init+0x78a/0xf10 [parport_pc] parport_parse_param.constprop.16+0xf0/0xf0 [parport_pc] do_one_initcall+0x45/0x1e0 This is caused by the rewrite of the irq activation/startup sequence which failed to convert a call site in the irq legacy auto probing code. To fix this irq_activate_and_startup() needs to gain a return value so the pending logic can work properly. Fixes: c942cee46bba ("genirq: Separate activation and startup") Reported-by: Meelis Roos Signed-off-by: Thomas Gleixner Tested-by: Meelis Roos Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801301935410.1797@nanos --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f681c0e..c69357a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -297,7 +297,7 @@ int irq_activate(struct irq_desc *desc) int irq_activate_and_startup(struct irq_desc *desc, bool resend) { if (WARN_ON(irq_activate(desc))) - return; + return 0; return irq_startup(desc, resend, IRQ_START_FORCE); }
[tip:irq/urgent] genirq: Make legacy autoprobing work again
Commit-ID: 1beaeacdc88b537703d04d5536235d0bbb36db93 Gitweb: https://git.kernel.org/tip/1beaeacdc88b537703d04d5536235d0bbb36db93 Author: Thomas Gleixner AuthorDate: Tue, 30 Jan 2018 19:36:32 +0100 Committer: Thomas Gleixner CommitDate: Thu, 1 Feb 2018 11:09:40 +0100 genirq: Make legacy autoprobing work again Meelis reported the following warning on a quad P3 HP NetServer museum piece: WARNING: CPU: 3 PID: 258 at kernel/irq/chip.c:244 __irq_startup+0x80/0x100 EIP: __irq_startup+0x80/0x100 irq_startup+0x7e/0x170 probe_irq_on+0x128/0x2b0 parport_irq_probe.constprop.18+0x8d/0x1af [parport_pc] parport_pc_probe_port+0xf11/0x1260 [parport_pc] parport_pc_init+0x78a/0xf10 [parport_pc] parport_parse_param.constprop.16+0xf0/0xf0 [parport_pc] do_one_initcall+0x45/0x1e0 This is caused by the rewrite of the irq activation/startup sequence which failed to convert a call site in the irq legacy auto probing code. To fix this irq_activate_and_startup() needs to gain a return value so the pending logic can work properly. 
Fixes: c942cee46bba ("genirq: Separate activation and startup") Reported-by: Meelis Roos Signed-off-by: Thomas Gleixner Tested-by: Meelis Roos Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801301935410.1797@nanos --- kernel/irq/autoprobe.c | 2 +- kernel/irq/chip.c | 6 +++--- kernel/irq/internals.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 4e8089b..8c82ea2 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -71,7 +71,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(>lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) + if (irq_activate_and_startup(desc, IRQ_NORESEND)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(>lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 043bfc3..c69357a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc) return 0; } -void irq_activate_and_startup(struct irq_desc *desc, bool resend) +int irq_activate_and_startup(struct irq_desc *desc, bool resend) { if (WARN_ON(irq_activate(desc))) - return; - irq_startup(desc, resend, IRQ_START_FORCE); + return 0; + return irq_startup(desc, resend, IRQ_START_FORCE); } static void __irq_disable(struct irq_desc *desc, bool mask); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ab19371..ca6afa2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc); #define IRQ_START_COND false extern int irq_activate(struct irq_desc *desc); -extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); +extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc);
[tip:x86/urgent] x86/apic/vector: Handle vector release on CPU unplug correctly
Commit-ID: c16721c5cece64bfe12cdc302a0228026d8089d7 Gitweb: https://git.kernel.org/tip/c16721c5cece64bfe12cdc302a0228026d8089d7 Author: Thomas GleixnerAuthorDate: Thu, 22 Feb 2018 12:08:06 +0100 Committer: Thomas Gleixner CommitDate: Thu, 22 Feb 2018 22:25:50 +0100 x86/apic/vector: Handle vector release on CPU unplug correctly When a irq vector is replaced, then the previous vector is normally released when the first interrupt happens on the new vector. If the target CPU of the previous vector is already offline when the new vector is installed, then the previous vector is silently discarded, which leads to accounting issues causing suspend failures and other problems. Adjust the logic so that the previous vector is freed in the underlying matrix allocator to ensure that the accounting stays correct. Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment") Reported-by: Yuriy Vostrikov Signed-off-by: Thomas Gleixner Tested-by: Yuriy Vostrikov Cc: Peter Zijlstra Cc: Randy Dunlap Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20180222112316.930791...@linutronix.de --- arch/x86/kernel/apic/vector.c | 25 ++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3cc471b..a82ea2e 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -134,21 +134,40 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, { struct apic_chip_data *apicd = apic_chip_data(irqd); struct irq_desc *desc = irq_data_to_desc(irqd); + bool managed = irqd_affinity_is_managed(irqd); lockdep_assert_held(_lock); trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector, apicd->cpu); - /* Setup the vector move, if required */ - if (apicd->vector && cpu_online(apicd->cpu)) { + /* +* If there is no vector associated or if the associated vector is +* the shutdown vector, which is associated to make PCI/MSI +* shutdown mode work, then there is 
nothing to release. Clear out +* prev_vector for this and the offlined target case. +*/ + apicd->prev_vector = 0; + if (!apic->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR) + goto setnew; + /* +* If the target CPU of the previous vector is online, then mark +* the vector as move in progress and store it for cleanup when the +* first interrupt on the new vector arrives. If the target CPU is +* offline then the regular release mechanism via the cleanup +* vector is not possible and the vector can be immediately freed +* in the underlying matrix allocator. +*/ + if (cpu_online(apicd->cpu)) { apicd->move_in_progress = true; apicd->prev_vector = apicd->vector; apicd->prev_cpu = apicd->cpu; } else { - apicd->prev_vector = 0; + irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, + managed); } +setnew: apicd->vector = newvec; apicd->cpu = newcpu; BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
[tip:x86/urgent] x86/apic/vector: Handle vector release on CPU unplug correctly
Commit-ID: f60606c4ce402963dc552c62910ffa7080b4a628 Gitweb: https://git.kernel.org/tip/f60606c4ce402963dc552c62910ffa7080b4a628 Author: Thomas GleixnerAuthorDate: Thu, 22 Feb 2018 12:08:06 +0100 Committer: Thomas Gleixner CommitDate: Thu, 22 Feb 2018 22:05:44 +0100 x86/apic/vector: Handle vector release on CPU unplug correctly When a irq vector is replaced, then the previous vector is normally released when the first interrupt happens on the new vector. If the target CPU of the previous vector is already offline when the new vector is installed, then the previous vector is silently discarded, which leads to accounting issues causing suspend failures and other problems. Adjust the logic so that the previous vector is freed in the underlying matrix allocator to ensure that the accounting stays correct. Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment") Reported-by: Yuriy Vostrikov Signed-off-by: Thomas Gleixner Tested-by: Yuriy Vostrikov Cc: Peter Zijlstra Cc: Randy Dunlap Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20180222112316.930791...@linutronix.de --- arch/x86/kernel/apic/vector.c | 25 ++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3cc471b..a82ea2e 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -134,21 +134,40 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, { struct apic_chip_data *apicd = apic_chip_data(irqd); struct irq_desc *desc = irq_data_to_desc(irqd); + bool managed = irqd_affinity_is_managed(irqd); lockdep_assert_held(_lock); trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector, apicd->cpu); - /* Setup the vector move, if required */ - if (apicd->vector && cpu_online(apicd->cpu)) { + /* +* If there is no vector associated or if the associated vector is +* the shutdown vector, which is associated to make PCI/MSI +* shutdown mode work, then there is 
nothing to release. Clear out +* prev_vector for this and the offlined target case. +*/ + apicd->prev_vector = 0; + if (!apic->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR) + goto setnew; + /* +* If the target CPU of the previous vector is online, then mark +* the vector as move in progress and store it for cleanup when the +* first interrupt on the new vector arrives. If the target CPU is +* offline then the regular release mechanism via the cleanup +* vector is not possible and the vector can be immediately freed +* in the underlying matrix allocator. +*/ + if (cpu_online(apicd->cpu)) { apicd->move_in_progress = true; apicd->prev_vector = apicd->vector; apicd->prev_cpu = apicd->cpu; } else { - apicd->prev_vector = 0; + irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, + managed); } +setnew: apicd->vector = newvec; apicd->cpu = newcpu; BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
[tip:x86/urgent] genirq/matrix: Handle CPU offlining proper
Commit-ID: 651ca2c00405a2ae3870cc0b4f15a182eb6fbe26 Gitweb: https://git.kernel.org/tip/651ca2c00405a2ae3870cc0b4f15a182eb6fbe26 Author: Thomas GleixnerAuthorDate: Thu, 22 Feb 2018 12:08:05 +0100 Committer: Thomas Gleixner CommitDate: Thu, 22 Feb 2018 22:05:43 +0100 genirq/matrix: Handle CPU offlining proper At CPU hotunplug the corresponding per cpu matrix allocator is shut down and the allocated interrupt bits are discarded under the assumption that all allocated bits have been either migrated away or shut down through the managed interrupts mechanism. This is not true because interrupts which are not started up might have a vector allocated on the outgoing CPU. When the interrupt is started up later or completely shutdown and freed then the allocated vector is handed back, triggering warnings or causing accounting issues which result in suspend failures and other issues. Change the CPU hotplug mechanism of the matrix allocator so that the remaining allocations at unplug time are preserved and global accounting at hotplug is correctly readjusted to take the dormant vectors into account. 
Fixes: 2f75d9e1c905 ("genirq: Implement bitmap matrix allocator") Reported-by: Yuriy Vostrikov Signed-off-by: Thomas Gleixner Tested-by: Yuriy Vostrikov Cc: Peter Zijlstra Cc: Randy Dunlap Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20180222112316.849980...@linutronix.de --- kernel/irq/matrix.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 5187dfe..4c57704 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -16,6 +16,7 @@ struct cpumap { unsigned intavailable; unsigned intallocated; unsigned intmanaged; + boolinitialized; boolonline; unsigned long alloc_map[IRQ_MATRIX_SIZE]; unsigned long managed_map[IRQ_MATRIX_SIZE]; @@ -81,9 +82,11 @@ void irq_matrix_online(struct irq_matrix *m) BUG_ON(cm->online); - bitmap_zero(cm->alloc_map, m->matrix_bits); - cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc); - cm->allocated = 0; + if (!cm->initialized) { + cm->available = m->alloc_size; + cm->available -= cm->managed + m->systembits_inalloc; + cm->initialized = true; + } m->global_available += cm->available; cm->online = true; m->online_maps++; @@ -370,14 +373,16 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) return; - if (cm->online) { - clear_bit(bit, cm->alloc_map); - cm->allocated--; + clear_bit(bit, cm->alloc_map); + cm->allocated--; + + if (cm->online) m->total_allocated--; - if (!managed) { - cm->available++; + + if (!managed) { + cm->available++; + if (cm->online) m->global_available++; - } } trace_irq_matrix_free(bit, cpu, m, cm); }
[tip:x86/urgent] x86/apic/vector: Handle vector release on CPU unplug correctly
Commit-ID: e84cf6aa501c58bf4bf451f1e425192ec090aed2 Gitweb: https://git.kernel.org/tip/e84cf6aa501c58bf4bf451f1e425192ec090aed2 Author: Thomas GleixnerAuthorDate: Thu, 22 Feb 2018 12:08:06 +0100 Committer: Ingo Molnar CommitDate: Fri, 23 Feb 2018 08:02:00 +0100 x86/apic/vector: Handle vector release on CPU unplug correctly When a irq vector is replaced, then the previous vector is normally released when the first interrupt happens on the new vector. If the target CPU of the previous vector is already offline when the new vector is installed, then the previous vector is silently discarded, which leads to accounting issues causing suspend failures and other problems. Adjust the logic so that the previous vector is freed in the underlying matrix allocator to ensure that the accounting stays correct. Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment") Reported-by: Yuriy Vostrikov Signed-off-by: Thomas Gleixner Tested-by: Yuriy Vostrikov Cc: Peter Zijlstra Cc: Randy Dunlap Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20180222112316.930791...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/vector.c | 25 ++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3cc471beb50b..bb6f7a2148d7 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -134,21 +134,40 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, { struct apic_chip_data *apicd = apic_chip_data(irqd); struct irq_desc *desc = irq_data_to_desc(irqd); + bool managed = irqd_affinity_is_managed(irqd); lockdep_assert_held(_lock); trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector, apicd->cpu); - /* Setup the vector move, if required */ - if (apicd->vector && cpu_online(apicd->cpu)) { + /* +* If there is no vector associated or if the associated vector is +* the shutdown vector, which is associated to make PCI/MSI +* 
shutdown mode work, then there is nothing to release. Clear out +* prev_vector for this and the offlined target case. +*/ + apicd->prev_vector = 0; + if (!apicd->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR) + goto setnew; + /* +* If the target CPU of the previous vector is online, then mark +* the vector as move in progress and store it for cleanup when the +* first interrupt on the new vector arrives. If the target CPU is +* offline then the regular release mechanism via the cleanup +* vector is not possible and the vector can be immediately freed +* in the underlying matrix allocator. +*/ + if (cpu_online(apicd->cpu)) { apicd->move_in_progress = true; apicd->prev_vector = apicd->vector; apicd->prev_cpu = apicd->cpu; } else { - apicd->prev_vector = 0; + irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, + managed); } +setnew: apicd->vector = newvec; apicd->cpu = newcpu; BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
[tip:x86/urgent] x86/apic: Switch all APICs to Fixed delivery mode
Commit-ID: 45fa8d89192e4e8e801e67dac3394d6597613e07 Gitweb: https://git.kernel.org/tip/45fa8d89192e4e8e801e67dac3394d6597613e07 Author: Thomas Gleixner AuthorDate: Thu, 28 Dec 2017 11:33:33 +0100 Committer: Thomas Gleixner CommitDate: Fri, 29 Dec 2017 00:21:04 +0100 x86/apic: Switch all APICs to Fixed delivery mode Some of the APIC incarnations are operating in lowest priority delivery mode. This worked as long as the vector management code allocated the same vector on all possible CPUs for each interrupt. Lowest priority delivery mode does not necessarily respect the affinity setting and may redirect to some other online CPU. This was documented somewhere in the old code and the conversion to single target delivery failed to update the delivery mode of the affected APIC drivers, which results in spurious interrupts on some of the affected CPU/Chipset combinations. Switch the APIC drivers over to Fixed delivery mode and remove all leftovers of lowest priority delivery mode. As a consequence of this change, the apic::irq_delivery_mode field is now pointless, but this needs to be cleaned up in a separate patch. 
Fixes: fdba46ffb4c2 ("x86/apic: Get rid of multi CPU affinity") Reported-by: vcap...@pengaru.com Signed-off-by: Thomas Gleixner Tested-by: vcap...@pengaru.com Cc: Pavel Machek Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712281140440.1688@nanos --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/apic/apic_noop.c | 2 +- arch/x86/kernel/apic/msi.c| 8 ++-- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- drivers/pci/host/pci-hyperv.c | 8 ++-- 6 files changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index aa85690..25a8702 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ .disable_esr= 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 7b659c4..5078b5c 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9b18be7..ce503c9 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ((apic->irq_dest_mode == 0) ? MSI_ADDR_DEST_MODE_PHYSICAL : MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? 
- MSI_ADDR_REDIRECTION_CPU : - MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_REDIRECTION_CPU | MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED : - MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index fa22017..02e8acb 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 622f13c..8b04234 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid
[tip:x86/pti] x86/pti: Make sure the user/kernel PTEs match
Commit-ID: 52994c256df36fda9a715697431cba9daecb6b11 Gitweb: https://git.kernel.org/tip/52994c256df36fda9a715697431cba9daecb6b11 Author: Thomas Gleixner AuthorDate: Wed, 3 Jan 2018 15:57:59 +0100 Committer: Thomas Gleixner CommitDate: Wed, 3 Jan 2018 15:57:59 +0100 x86/pti: Make sure the user/kernel PTEs match Meelis reported that his K8 Athlon64 emits MCE warnings when PTI is enabled: [Hardware Error]: Error Addr: 0x81e000e0 [Hardware Error]: MC1 Error: L1 TLB multimatch. [Hardware Error]: cache level: L1, tx: INSN The address is in the entry area, which is mapped into kernel _AND_ user space. That's special because we switch CR3 while we are executing there. User mapping: 0x81e0-0x8200 2M ro PSE GLB x pmd Kernel mapping: 0x8100-0x8200 16M ro PSE x pmd So the K8 is complaining that the TLB entries differ. They differ in the GLB bit. Drop the GLB bit when installing the user shared mapping. Fixes: 6dc72c3cbca0 ("x86/mm/pti: Share entry text PMD") Reported-by: Meelis Roos Signed-off-by: Thomas Gleixner Tested-by: Meelis Roos Cc: Borislav Petkov Cc: Tom Lendacky Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031407180.1957@nanos --- arch/x86/mm/pti.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index bce8aea..2da28ba 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -367,7 +367,8 @@ static void __init pti_setup_espfix64(void) static void __init pti_clone_entry_text(void) { pti_clone_pmds((unsigned long) __entry_text_start, - (unsigned long) __irqentry_text_end, _PAGE_RW); + (unsigned long) __irqentry_text_end, + _PAGE_RW | _PAGE_GLOBAL); } /*
[tip:x86/pti] x86/pti: Switch to kernel CR3 at early in entry_SYSCALL_compat()
Commit-ID: d7732ba55c4b6a2da339bb12589c515830cfac2c Gitweb: https://git.kernel.org/tip/d7732ba55c4b6a2da339bb12589c515830cfac2c Author: Thomas Gleixner AuthorDate: Wed, 3 Jan 2018 19:52:04 +0100 Committer: Thomas Gleixner CommitDate: Wed, 3 Jan 2018 23:19:32 +0100 x86/pti: Switch to kernel CR3 at early in entry_SYSCALL_compat() The preparation for PTI which added CR3 switching to the entry code misplaced the CR3 switch in entry_SYSCALL_compat(). With PTI enabled the entry code tries to access a per cpu variable after switching to kernel GS. This fails because that variable is not mapped to user space. This results in a double fault and in the worst case a kernel crash. Move the switch ahead of the access and clobber RSP which has been saved already. Fixes: 8a09317b895f ("x86/mm/pti: Prepare the x86/entry assembly code for entry/exit CR3 switching") Reported-by: Lars Wendler Reported-by: Laura Abbott Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Peter Zijlstra Cc: Greg KH Cc: Boris Ostrovsky Cc: Juergen Gross Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031949200.1957@nanos --- arch/x86/entry/entry_64_compat.S | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 40f1700..98d5358 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -190,8 +190,13 @@ ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ swapgs - /* Stash user ESP and switch to the kernel stack. */ + /* Stash user ESP */ movl%esp, %r8d + + /* Use %rsp as scratch reg.
User ESP is stashed in r8 */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + + /* Switch to the kernel stack */ movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ @@ -220,12 +225,6 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) pushq $0 /* pt_regs->r15 = 0 */ /* -* We just saved %rdi so it is safe to clobber. It is not -* preserved during the C calls inside TRACE_IRQS_OFF anyway. -*/ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi - - /* * User mode is traced as though IRQs are on, and SYSENTER * turned them off. */
[tip:x86/timers] x86/kvmclock: Remove page size requirement from wall_clock
Commit-ID: 7ef363a39514ed8a6f2333fbae1875ac0953715a Gitweb: https://git.kernel.org/tip/7ef363a39514ed8a6f2333fbae1875ac0953715a Author: Thomas Gleixner AuthorDate: Thu, 19 Jul 2018 16:55:21 -0400 Committer: Thomas Gleixner CommitDate: Fri, 20 Jul 2018 00:02:36 +0200 x86/kvmclock: Remove page size requirement from wall_clock There is no requirement for wall_clock data to be page aligned or page sized. Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sist...@oracle.com Cc: daniel.m.jor...@oracle.com Cc: li...@armlinux.org.uk Cc: schwidef...@de.ibm.com Cc: heiko.carst...@de.ibm.com Cc: john.stu...@linaro.org Cc: sb...@codeaurora.org Cc: h...@zytor.com Cc: douly.f...@cn.fujitsu.com Cc: pet...@infradead.org Cc: pra...@redhat.com Cc: feng.t...@intel.com Cc: pmla...@suse.com Cc: gno...@lxorguk.ukuu.org.uk Cc: linux-s...@vger.kernel.org Cc: boris.ostrov...@oracle.com Cc: jgr...@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-3-pasha.tatas...@oracle.com --- arch/x86/kernel/kvmclock.c | 12 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1f6ac5aaa904..a995d7d7164c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -46,14 +46,12 @@ early_param("no-kvmclock", parse_no_kvmclock); /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ #define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) -#define WALL_CLOCK_SIZE(sizeof(struct pvclock_wall_clock)) static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE); -static u8 wall_clock_mem[PAGE_ALIGN(WALL_CLOCK_SIZE)] __aligned(PAGE_SIZE); /* The hypervisor will put information about time periodically here */ static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock *wall_clock; +static struct pvclock_wall_clock wall_clock; /* * The wallclock is the time of day when we booted. 
Since then, some time may @@ -66,15 +64,15 @@ static void kvm_get_wallclock(struct timespec64 *now) int low, high; int cpu; - low = (int)slow_virt_to_phys(wall_clock); - high = ((u64)slow_virt_to_phys(wall_clock) >> 32); + low = (int)slow_virt_to_phys(_clock); + high = ((u64)slow_virt_to_phys(_clock) >> 32); native_write_msr(msr_kvm_wall_clock, low, high); cpu = get_cpu(); vcpu_time = _clock[cpu].pvti; - pvclock_read_wallclock(wall_clock, vcpu_time, now); + pvclock_read_wallclock(_clock, vcpu_time, now); put_cpu(); } @@ -267,12 +265,10 @@ void __init kvmclock_init(void) } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) return; - wall_clock = (struct pvclock_wall_clock *)wall_clock_mem; hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem; if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; - wall_clock = NULL; return; }
[tip:x86/timers] x86/kvmclock: Cleanup the code
Commit-ID: 146c394d0c3c8e88df433a179c2b0b85fd8cf247 Gitweb: https://git.kernel.org/tip/146c394d0c3c8e88df433a179c2b0b85fd8cf247 Author: Thomas Gleixner AuthorDate: Thu, 19 Jul 2018 16:55:23 -0400 Committer: Thomas Gleixner CommitDate: Fri, 20 Jul 2018 00:02:37 +0200 x86/kvmclock: Cleanup the code - Cleanup the MSR write for wall clock. The type casts to (int) are sloppy because the wrmsr parameters are u32 and aside from that wrmsrl() already provides the high/low split for free. - Remove the pointless get_cpu()/put_cpu() dance from various functions. Either they are called during early init where CPU is guaranteed to be 0 or they are already called from non-preemptible context where smp_processor_id() can be used safely. - Simplify the convoluted check for kvmclock in the init function. - Mark the parameter parsing function __init. No point in keeping it around. - Convert to pr_info() Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sist...@oracle.com Cc: daniel.m.jor...@oracle.com Cc: li...@armlinux.org.uk Cc: schwidef...@de.ibm.com Cc: heiko.carst...@de.ibm.com Cc: john.stu...@linaro.org Cc: sb...@codeaurora.org Cc: h...@zytor.com Cc: douly.f...@cn.fujitsu.com Cc: pet...@infradead.org Cc: pra...@redhat.com Cc: feng.t...@intel.com Cc: pmla...@suse.com Cc: gno...@lxorguk.ukuu.org.uk Cc: linux-s...@vger.kernel.org Cc: boris.ostrov...@oracle.com Cc: jgr...@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-5-pasha.tatas...@oracle.com --- arch/x86/kernel/kvmclock.c | 72 ++ 1 file changed, 22 insertions(+), 50 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f0a0aef5e9fa..4afb03e49a4f 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -37,7 +37,7 @@ static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static u64 kvm_sched_clock_offset; -static int parse_no_kvmclock(char *arg) +static int __init
parse_no_kvmclock(char *arg) { kvmclock = 0; return 0; @@ -61,13 +61,9 @@ static struct pvclock_wall_clock wall_clock; static void kvm_get_wallclock(struct timespec64 *now) { struct pvclock_vcpu_time_info *vcpu_time; - int low, high; int cpu; - low = (int)slow_virt_to_phys(_clock); - high = ((u64)slow_virt_to_phys(_clock) >> 32); - - native_write_msr(msr_kvm_wall_clock, low, high); + wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(_clock)); cpu = get_cpu(); @@ -117,11 +113,11 @@ static inline void kvm_sched_clock_init(bool stable) kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", - kvm_sched_clock_offset); + pr_info("kvm-clock: using sched offset of %llu cycles", + kvm_sched_clock_offset); BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > -sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); } /* @@ -135,16 +131,8 @@ static inline void kvm_sched_clock_init(bool stable) */ static unsigned long kvm_get_tsc_khz(void) { - struct pvclock_vcpu_time_info *src; - int cpu; - unsigned long tsc_khz; - - cpu = get_cpu(); - src = _clock[cpu].pvti; - tsc_khz = pvclock_tsc_khz(src); - put_cpu(); setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - return tsc_khz; + return pvclock_tsc_khz(_clock[0].pvti); } static void kvm_get_preset_lpj(void) @@ -161,29 +149,27 @@ static void kvm_get_preset_lpj(void) bool kvm_check_and_clear_guest_paused(void) { - bool ret = false; struct pvclock_vcpu_time_info *src; - int cpu = smp_processor_id(); + bool ret = false; if (!hv_clock) return ret; - src = _clock[cpu].pvti; + src = _clock[smp_processor_id()].pvti; if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { src->flags &= ~PVCLOCK_GUEST_STOPPED; pvclock_touch_watchdogs(); ret = true; } - return ret; } struct clocksource kvm_clock = { - .name = "kvm-clock", - .read = kvm_clock_get_cycles, - .rating = 400, - .mask = 
CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .name = "kvm-clock", + .read = kvm_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; EXPORT_SYMBOL_GPL(kvm_clock); @@ -199,7 +185,7 @@ static void kvm_register_clock(char *txt) src = _clock[cpu].pvti; pa = slow_virt_to_phys(src) | 0x01ULL; wrmsrl(msr_kvm_system_time, pa); - pr_info("kvm-clock: cpu
[tip:x86/timers] x86/kvmclock: Decrapify kvm_register_clock()
Commit-ID: 7a5ddc8fe0ea9518cd7fb6a929cac7d864c6f300 Gitweb: https://git.kernel.org/tip/7a5ddc8fe0ea9518cd7fb6a929cac7d864c6f300 Author: Thomas Gleixner AuthorDate: Thu, 19 Jul 2018 16:55:22 -0400 Committer: Thomas Gleixner CommitDate: Fri, 20 Jul 2018 00:02:36 +0200 x86/kvmclock: Decrapify kvm_register_clock() The return value is pointless because the wrmsr cannot fail if KVM_FEATURE_CLOCKSOURCE or KVM_FEATURE_CLOCKSOURCE2 are set. kvm_register_clock() is only called locally so wants to be static. Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sist...@oracle.com Cc: daniel.m.jor...@oracle.com Cc: li...@armlinux.org.uk Cc: schwidef...@de.ibm.com Cc: heiko.carst...@de.ibm.com Cc: john.stu...@linaro.org Cc: sb...@codeaurora.org Cc: h...@zytor.com Cc: douly.f...@cn.fujitsu.com Cc: pet...@infradead.org Cc: pra...@redhat.com Cc: feng.t...@intel.com Cc: pmla...@suse.com Cc: gno...@lxorguk.ukuu.org.uk Cc: linux-s...@vger.kernel.org Cc: boris.ostrov...@oracle.com Cc: jgr...@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-4-pasha.tatas...@oracle.com --- arch/x86/include/asm/kvm_para.h | 1 - arch/x86/kernel/kvmclock.c | 33 ++--- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 3aea2658323a..4c723632c036 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -7,7 +7,6 @@ #include extern void kvmclock_init(void); -extern int kvm_register_clock(char *txt); #ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index a995d7d7164c..f0a0aef5e9fa 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -187,23 +187,19 @@ struct clocksource kvm_clock = { }; EXPORT_SYMBOL_GPL(kvm_clock); -int kvm_register_clock(char *txt) +static void kvm_register_clock(char *txt) { - int cpu = smp_processor_id(); - int 
low, high, ret; struct pvclock_vcpu_time_info *src; + int cpu = smp_processor_id(); + u64 pa; if (!hv_clock) - return 0; + return; src = _clock[cpu].pvti; - low = (int)slow_virt_to_phys(src) | 1; - high = ((u64)slow_virt_to_phys(src) >> 32); - ret = native_write_msr_safe(msr_kvm_system_time, low, high); - printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", - cpu, high, low, txt); - - return ret; + pa = slow_virt_to_phys(src) | 0x01ULL; + wrmsrl(msr_kvm_system_time, pa); + pr_info("kvm-clock: cpu %d, msr %llx, %s\n", cpu, pa, txt); } static void kvm_save_sched_clock_state(void) @@ -218,11 +214,7 @@ static void kvm_restore_sched_clock_state(void) #ifdef CONFIG_X86_LOCAL_APIC static void kvm_setup_secondary_clock(void) { - /* -* Now that the first cpu already had this clocksource initialized, -* we shouldn't fail. -*/ - WARN_ON(kvm_register_clock("secondary cpu clock")); + kvm_register_clock("secondary cpu clock"); } #endif @@ -265,16 +257,11 @@ void __init kvmclock_init(void) } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) return; - hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem; - - if (kvm_register_clock("primary cpu clock")) { - hv_clock = NULL; - return; - } - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); + hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem; + kvm_register_clock("primary cpu clock"); pvclock_set_pvti_cpu0_va(hv_clock); if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
[tip:x86/timers] x86/kvmclock: Move kvmclock vsyscall param and init to kvmclock
Commit-ID: e499a9b6dc488aff7f284bee51936f510ab7ad15 Gitweb: https://git.kernel.org/tip/e499a9b6dc488aff7f284bee51936f510ab7ad15 Author: Thomas Gleixner AuthorDate: Thu, 19 Jul 2018 16:55:25 -0400 Committer: Thomas Gleixner CommitDate: Fri, 20 Jul 2018 00:02:37 +0200 x86/kvmclock: Move kvmclock vsyscall param and init to kvmclock There is no point to have this in the kvm code itself and call it from there. This can be called from an initcall and the parameter is cleared when the hypervisor is not KVM. Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sist...@oracle.com Cc: daniel.m.jor...@oracle.com Cc: li...@armlinux.org.uk Cc: schwidef...@de.ibm.com Cc: heiko.carst...@de.ibm.com Cc: john.stu...@linaro.org Cc: sb...@codeaurora.org Cc: h...@zytor.com Cc: douly.f...@cn.fujitsu.com Cc: pet...@infradead.org Cc: pra...@redhat.com Cc: feng.t...@intel.com Cc: pmla...@suse.com Cc: gno...@lxorguk.ukuu.org.uk Cc: linux-s...@vger.kernel.org Cc: boris.ostrov...@oracle.com Cc: jgr...@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-7-pasha.tatas...@oracle.com --- arch/x86/include/asm/kvm_guest.h | 7 --- arch/x86/kernel/kvm.c| 13 arch/x86/kernel/kvmclock.c | 44 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h deleted file mode 100644 index 46185263d9c2.. 
--- a/arch/x86/include/asm/kvm_guest.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_KVM_GUEST_H -#define _ASM_X86_KVM_GUEST_H - -int kvm_setup_vsyscall_timeinfo(void); - -#endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c65c232d3ddd..a560750cc76f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -45,7 +45,6 @@ #include #include #include -#include static int kvmapf = 1; @@ -66,15 +65,6 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -static int kvmclock_vsyscall = 1; -static int __init parse_no_kvmclock_vsyscall(char *arg) -{ -kvmclock_vsyscall = 0; -return 0; -} - -early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); - static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -560,9 +550,6 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvmclock_vsyscall) - kvm_setup_vsyscall_timeinfo(); - #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 78aec160f5e0..7d690d2238f8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -27,12 +27,14 @@ #include #include +#include #include #include #include #include static int kvmclock __initdata = 1; +static int kvmclock_vsyscall __initdata = 1; static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; static u64 kvm_sched_clock_offset __ro_after_init; @@ -44,6 +46,13 @@ static int __init parse_no_kvmclock(char *arg) } early_param("no-kvmclock", parse_no_kvmclock); +static int __init 
parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ #define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) @@ -228,6 +237,24 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static int __init kvm_setup_vsyscall_timeinfo(void) +{ +#ifdef CONFIG_X86_64 + u8 flags; + + if (!hv_clock || !kvmclock_vsyscall) + return 0; + + flags = pvclock_read_flags(_clock[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 1; + + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif + return 0; +} +early_initcall(kvm_setup_vsyscall_timeinfo); + void __init kvmclock_init(void) { u8 flags; @@ -272,20 +299,3 @@ void __init kvmclock_init(void) clocksource_register_hz(_clock, NSEC_PER_SEC); pv_info.name = "KVM"; } - -int __init kvm_setup_vsyscall_timeinfo(void) -{ -#ifdef CONFIG_X86_64 - u8 flags; - - if (!hv_clock) - return 0; - - flags = pvclock_read_flags(_clock[0].pvti); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 1; - - kvm_clock.archdata.vclock_mode =
[tip:x86/timers] x86/kvmclock: Mark variables __initdata and __ro_after_init
Commit-ID: 42f8df935efefba51d0c5321b1325436523e3377 Gitweb: https://git.kernel.org/tip/42f8df935efefba51d0c5321b1325436523e3377 Author: Thomas Gleixner AuthorDate: Thu, 19 Jul 2018 16:55:24 -0400 Committer: Thomas Gleixner CommitDate: Fri, 20 Jul 2018 00:02:37 +0200 x86/kvmclock: Mark variables __initdata and __ro_after_init The kvmclock parameter is init data and the other variables are not modified after init. Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sist...@oracle.com Cc: daniel.m.jor...@oracle.com Cc: li...@armlinux.org.uk Cc: schwidef...@de.ibm.com Cc: heiko.carst...@de.ibm.com Cc: john.stu...@linaro.org Cc: sb...@codeaurora.org Cc: h...@zytor.com Cc: douly.f...@cn.fujitsu.com Cc: pet...@infradead.org Cc: pra...@redhat.com Cc: feng.t...@intel.com Cc: pmla...@suse.com Cc: gno...@lxorguk.ukuu.org.uk Cc: linux-s...@vger.kernel.org Cc: boris.ostrov...@oracle.com Cc: jgr...@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-6-pasha.tatas...@oracle.com --- arch/x86/kernel/kvmclock.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 4afb03e49a4f..78aec160f5e0 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,10 +32,10 @@ #include #include -static int kvmclock __ro_after_init = 1; -static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; -static u64 kvm_sched_clock_offset; +static int kvmclock __initdata = 1; +static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; +static u64 kvm_sched_clock_offset __ro_after_init; static int __init parse_no_kvmclock(char *arg) { @@ -50,7 +50,7 @@ early_param("no-kvmclock", parse_no_kvmclock); static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE); /* The hypervisor will put information about time periodically 
here */ -static struct pvclock_vsyscall_time_info *hv_clock; +static struct pvclock_vsyscall_time_info *hv_clock __ro_after_init; static struct pvclock_wall_clock wall_clock; /*
[tip:x86/timers] x86/kvmclock: Switch kvmclock data to a PER_CPU variable
Commit-ID: 95a3d4454bb1cf5bfd666c27fdd2dc188e17c14d Gitweb: https://git.kernel.org/tip/95a3d4454bb1cf5bfd666c27fdd2dc188e17c14d Author: Thomas Gleixner AuthorDate: Thu, 19 Jul 2018 16:55:26 -0400 Committer: Thomas Gleixner CommitDate: Fri, 20 Jul 2018 00:02:38 +0200 x86/kvmclock: Switch kvmclock data to a PER_CPU variable The previous removal of the memblock dependency from kvmclock introduced a static data array sized 64 bytes * CONFIG_NR_CPUS. That's wasteful on large systems when kvmclock is not used. Replace it with: - A static page sized array of pvclock data. It's page sized because the pvclock data of the boot cpu is mapped into the VDSO so otherwise random other data would be exposed to the vDSO. - A PER_CPU variable of pvclock data pointers. This is used to access the pvclock data storage on each CPU. The setup is done in two stages: - Early boot stores the pointer to the static page for the boot CPU in the per cpu data. - In the preparatory stage of CPU hotplug assign either an element of the static array (when the CPU number is in that range) or allocate memory and initialize the per cpu pointer.
Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sist...@oracle.com Cc: daniel.m.jor...@oracle.com Cc: li...@armlinux.org.uk Cc: schwidef...@de.ibm.com Cc: heiko.carst...@de.ibm.com Cc: john.stu...@linaro.org Cc: sb...@codeaurora.org Cc: h...@zytor.com Cc: douly.f...@cn.fujitsu.com Cc: pet...@infradead.org Cc: pra...@redhat.com Cc: feng.t...@intel.com Cc: pmla...@suse.com Cc: gno...@lxorguk.ukuu.org.uk Cc: linux-s...@vger.kernel.org Cc: boris.ostrov...@oracle.com Cc: jgr...@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-8-pasha.tatas...@oracle.com --- arch/x86/kernel/kvmclock.c | 99 +- 1 file changed, 62 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 7d690d2238f8..91b94c0ae4e3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -55,12 +56,23 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ #define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) +#define HVC_BOOT_ARRAY_SIZE \ + (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) -static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE); - -/* The hypervisor will put information about time periodically here */ -static struct pvclock_vsyscall_time_info *hv_clock __ro_after_init; +static struct pvclock_vsyscall_time_info + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); static struct pvclock_wall_clock wall_clock; +static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); + +static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +{ + return _cpu_read(hv_clock_per_cpu)->pvti; +} + +static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) +{ + return this_cpu_read(hv_clock_per_cpu); +} /* * The wallclock is the 
time of day when we booted. Since then, some time may @@ -69,17 +81,10 @@ static struct pvclock_wall_clock wall_clock; */ static void kvm_get_wallclock(struct timespec64 *now) { - struct pvclock_vcpu_time_info *vcpu_time; - int cpu; - wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(_clock)); - - cpu = get_cpu(); - - vcpu_time = _clock[cpu].pvti; - pvclock_read_wallclock(_clock, vcpu_time, now); - - put_cpu(); + preempt_disable(); + pvclock_read_wallclock(_clock, this_cpu_pvti(), now); + preempt_enable(); } static int kvm_set_wallclock(const struct timespec64 *now) @@ -89,14 +94,10 @@ static int kvm_set_wallclock(const struct timespec64 *now) static u64 kvm_clock_read(void) { - struct pvclock_vcpu_time_info *src; u64 ret; - int cpu; preempt_disable_notrace(); - cpu = smp_processor_id(); - src = _clock[cpu].pvti; - ret = pvclock_clocksource_read(src); + ret = pvclock_clocksource_read(this_cpu_pvti()); preempt_enable_notrace(); return ret; } @@ -141,7 +142,7 @@ static inline void kvm_sched_clock_init(bool stable) static unsigned long kvm_get_tsc_khz(void) { setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - return pvclock_tsc_khz(_clock[0].pvti); + return pvclock_tsc_khz(this_cpu_pvti()); } static void kvm_get_preset_lpj(void) @@ -158,15 +159,14 @@ static void kvm_get_preset_lpj(void) bool kvm_check_and_clear_guest_paused(void) { - struct pvclock_vcpu_time_info *src; + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); bool ret = false; - if (!hv_clock) + if (!src)
[tip:perf/urgent] perf/x86/amd/ibs: Don't access non-started event
Commit-ID: d2753e6b4882a637a0e8fb3b9c2e15f33265300e Gitweb: https://git.kernel.org/tip/d2753e6b4882a637a0e8fb3b9c2e15f33265300e Author: Thomas Gleixner AuthorDate: Fri, 20 Jul 2018 10:39:07 +0200 Committer: Ingo Molnar CommitDate: Tue, 24 Jul 2018 09:51:10 +0200 perf/x86/amd/ibs: Don't access non-started event Paul Menzel reported the following bug: > Enabling the undefined behavior sanitizer and building GNU/Linux 4.18-rc5+ > (with some unrelated commits) with GCC 8.1.0 from Debian Sid/unstable, the > warning below is shown. > > > [2.111913] > > > > [2.111917] UBSAN: Undefined behaviour in > > arch/x86/events/amd/ibs.c:582:24 > > [2.111919] member access within null pointer of type 'struct perf_event' > > [2.111926] CPU: 0 PID: 144 Comm: udevadm Not tainted > > 4.18.0-rc5-00316-g4864b68cedf2 #104 > > [2.111928] Hardware name: ASROCK E350M1/E350M1, BIOS TIMELESS 01/01/1970 > > [2.111930] Call Trace: > > [2.111943] dump_stack+0x55/0x89 > > [2.111949] ubsan_epilogue+0xb/0x33 > > [2.111953] handle_null_ptr_deref+0x7f/0x90 > > [2.111958] __ubsan_handle_type_mismatch_v1+0x55/0x60 > > [2.111964] perf_ibs_handle_irq+0x596/0x620 The code dereferences event before checking the STARTED bit. Patch below should cure the issue. The warning should not trigger, if I analyzed the thing correctly. (And Paul's testing confirms this.) 
Reported-by: Paul Menzel Tested-by: Paul Menzel Signed-off-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Jiri Olsa Cc: Linus Torvalds Cc: Paul Menzel Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/alpine.deb.2.21.1807200958390.1...@nanos.tec.linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/amd/ibs.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 4b98101209a1..d50bb4dc0650 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -579,7 +579,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); struct perf_event *event = pcpu->event; - struct hw_perf_event *hwc = >hw; + struct hw_perf_event *hwc; struct perf_sample_data data; struct perf_raw_record raw; struct pt_regs regs; @@ -602,6 +602,10 @@ fail: return 0; } + if (WARN_ON_ONCE(!event)) + goto fail; + + hwc = >hw; msr = hwc->config_base; buf = ibs_data.regs; rdmsrl(msr, *buf);
[tip:smp/urgent] cpu/hotplug: Prevent state corruption on error rollback
Commit-ID: 69fa6eb7d6a64801ea261025cce9723d9442d773 Gitweb: https://git.kernel.org/tip/69fa6eb7d6a64801ea261025cce9723d9442d773 Author: Thomas Gleixner AuthorDate: Thu, 6 Sep 2018 15:21:38 +0200 Committer: Thomas Gleixner CommitDate: Thu, 6 Sep 2018 15:21:38 +0200 cpu/hotplug: Prevent state corruption on error rollback When a teardown callback fails, the CPU hotplug code brings the CPU back to the previous state. The previous state becomes the new target state. The rollback happens in undo_cpu_down() which increments the state unconditionally even if the state is already the same as the target. As a consequence the next CPU hotplug operation will start at the wrong state. This is easy to observe when __cpu_disable() fails. Prevent the unconditional undo by checking the state vs. target before incrementing state and fix up the consequently wrong conditional in the unplug code which handles the failure of the final CPU take down on the control CPU side. Fixes: 4dddfb5faa61 ("smp/hotplug: Rewrite AP state machine core") Reported-by: Neeraj Upadhyay Signed-off-by: Thomas Gleixner Tested-by: Geert Uytterhoeven Tested-by: Sudeep Holla Tested-by: Neeraj Upadhyay Cc: j...@joshtriplett.org Cc: pet...@infradead.org Cc: jiangshan...@gmail.com Cc: dzic...@redhat.com Cc: brendan.jack...@arm.com Cc: ma...@debian.org Cc: sram...@codeaurora.org Cc: linux-arm-...@vger.kernel.org Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.deb.2.21.1809051419580.1...@nanos.tec.linutronix.de --- kernel/cpu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index eb4041f78073..0097acec1c71 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -916,7 +916,8 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); if (ret) { st->target = prev_state; - undo_cpu_down(cpu, st); + if (st->state < prev_state) + undo_cpu_down(cpu, st); break; } } @@ -969,7
+970,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, target); - if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { + if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) { cpuhp_reset_state(st, prev_state); __cpuhp_kick_ap(st); }