[tip:x86/apic] x86/idt: Move IST stack based traps to table init

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  90f6225fba0c732f3f5f9f5e265bdefa021ff12d
Gitweb: http://git.kernel.org/tip/90f6225fba0c732f3f5f9f5e265bdefa021ff12d
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:52 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:27 +0200

x86/idt: Move IST stack based traps to table init

Initialize the IST based traps via a table.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064959.091328...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h |  2 ++
 arch/x86/kernel/idt.c   | 22 ++
 arch/x86/kernel/traps.c |  9 +
 3 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 930acd5..e624527 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -509,9 +509,11 @@ extern void idt_setup_early_traps(void);
 
 #ifdef CONFIG_X86_64
 extern void idt_setup_early_pf(void);
+extern void idt_setup_ist_traps(void);
 extern void idt_setup_debugidt_traps(void);
 #else
 static inline void idt_setup_early_pf(void) { }
+static inline void idt_setup_ist_traps(void) { }
 static inline void idt_setup_debugidt_traps(void) { }
 #endif
 
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index f5281b8..a6326fd 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -92,6 +92,20 @@ struct desc_ptr idt_descr __ro_after_init = {
 gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
 
 /*
+ * The exceptions which use Interrupt stacks. They are setup after
+ * cpu_init() when the TSS has been initialized.
+ */
+static const __initdata struct idt_data ist_idts[] = {
+   ISTG(X86_TRAP_DB,   debug,  DEBUG_STACK),
+   ISTG(X86_TRAP_NMI,  nmi,NMI_STACK),
+   ISTG(X86_TRAP_BP,   int3,   DEBUG_STACK),
+   ISTG(X86_TRAP_DF,   double_fault,   DOUBLEFAULT_STACK),
+#ifdef CONFIG_X86_MCE
+   ISTG(X86_TRAP_MC,   &machine_check, MCE_STACK),
+#endif
+};
+
+/*
  * Override for the debug_idt. Same as the default, but with interrupt
  * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
  */
@@ -158,6 +172,14 @@ void __init idt_setup_early_pf(void)
 }
 
 /**
+ * idt_setup_ist_traps - Initialize the idt table with traps using IST
+ */
+void __init idt_setup_ist_traps(void)
+{
+   idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts));
+}
+
+/**
  * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
  */
 void __init idt_setup_debugidt_traps(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1492bf5..293f5bd 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -979,14 +979,7 @@ void __init trap_init(void)
 */
cpu_init();
 
-   /*
-* X86_TRAP_DB and X86_TRAP_BP have been set
-* in early_trap_init(). However, ITS works only after
-* cpu_init() loads TSS. See comments in early_trap_init().
-*/
-   set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
-   /* int3 can be called from all */
-   set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+   idt_setup_ist_traps();
 
x86_init.irqs.trap_init();
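
The table-driven pattern used here is worth spelling out: each table entry
bundles a vector number, a handler and an IST index, and one loop writes the
gates. A minimal standalone sketch of the idea (illustrative names and
numbers, not the kernel's idt_data/ISTG definitions):

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct trap_entry {
	unsigned int vector;	/* IDT slot, e.g. X86_TRAP_DB */
	void (*handler)(void);	/* entry point, e.g. debug */
	unsigned int ist;	/* interrupt stack index */
};

static void debug_stub(void) { }
static void nmi_stub(void) { }
static void double_fault_stub(void) { }

static const struct trap_entry ist_traps[] = {
	{ 1, debug_stub,        1 },	/* #DB -> DEBUG_STACK */
	{ 2, nmi_stub,          2 },	/* NMI -> NMI_STACK */
	{ 8, double_fault_stub, 3 },	/* #DF -> DOUBLEFAULT_STACK */
};

int main(void)
{
	/* Stand-in for write_idt_entry(): report what would be written. */
	for (unsigned int i = 0; i < ARRAY_SIZE(ist_traps); i++)
		printf("vector %u -> IST %u\n",
		       ist_traps[i].vector, ist_traps[i].ist);
	return 0;
}

The win over open-coded set_intr_gate_ist() calls is that adding or moving a
trap becomes a one-line table edit.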
 


[tip:x86/apic] x86/ipi: Make platform IPI depend on APIC

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  0428e01a2f13a6b7dae8289fb10030dbea336dee
Gitweb: http://git.kernel.org/tip/0428e01a2f13a6b7dae8289fb10030dbea336dee
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:34 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:29 +0200

x86/ipi: Make platform IPI depend on APIC

The platform IPI vector is only installed when the local APIC is enabled. All
users of it depend on the local APIC anyway.

Make the related code conditional on CONFIG_X86_LOCAL_APIC=y.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064957.615286...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/entry_arch.h |  3 +--
 arch/x86/kernel/irq.c | 11 ++-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c911650..aa15d1f 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -17,8 +17,6 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR)
 BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR)
 #endif
 
-BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
-
 #ifdef CONFIG_HAVE_KVM
 BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
 BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR)
@@ -37,6 +35,7 @@ BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR)
 BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
 
 #ifdef CONFIG_IRQ_WORK
 BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index befdd4a..52089c0 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -29,9 +29,6 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
 
 atomic_t irq_err_count;
 
-/* Function pointer for generic interrupt vector handling */
-void (*x86_platform_ipi_callback)(void) = NULL;
-
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
  * each architecture has to answer this themselves.
@@ -87,13 +84,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
seq_puts(p, "  APIC ICR read retries\n");
-#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
seq_puts(p, "  Platform interrupts\n");
}
+#endif
 #ifdef CONFIG_SMP
seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
@@ -183,9 +180,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->apic_perf_irqs;
sum += irq_stats(cpu)->apic_irq_work_irqs;
sum += irq_stats(cpu)->icr_read_retry_count;
-#endif
if (x86_platform_ipi_callback)
sum += irq_stats(cpu)->x86_platform_ipis;
+#endif
 #ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
@@ -259,6 +256,9 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
return 1;
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC
+/* Function pointer for generic interrupt vector handling */
+void (*x86_platform_ipi_callback)(void) = NULL;
 /*
  * Handler for X86_PLATFORM_IPI_VECTOR.
  */
@@ -275,6 +275,7 @@ __visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
exiting_irq();
set_irq_regs(old_regs);
 }
+#endif
 
 #ifdef CONFIG_HAVE_KVM
 static void dummy_handler(void) {}
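
The structure of the change is the classic way to compile a hook out
completely: the callback pointer, its handler and its statistics all move
inside one guard, so a !CONFIG_X86_LOCAL_APIC build carries none of it. A toy
sketch of the pattern (HAVE_PLATFORM_IPI stands in for the config option):

#include <stdio.h>

#ifdef HAVE_PLATFORM_IPI
/* Pointer, handler and statistics exist only when the vector can fire. */
static void (*platform_ipi_callback)(void);
static unsigned long platform_ipi_count;

static void handle_platform_ipi(void)
{
	platform_ipi_count++;
	if (platform_ipi_callback)
		platform_ipi_callback();
}
#endif

static void show_stats(void)
{
#ifdef HAVE_PLATFORM_IPI
	/* Mirrors the arch_show_interrupts() hunk: inside the same guard. */
	if (platform_ipi_callback)
		printf("PLT: %lu platform interrupts\n", platform_ipi_count);
#endif
}

int main(void)
{
#ifdef HAVE_PLATFORM_IPI
	handle_platform_ipi();
#endif
	show_stats();
	return 0;
}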


[tip:x86/apic] x86/tracing: Disentangle pagefault and resched IPI tracing key

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  809547472edae0bc68f2b5abc37b92c8a988bc8a
Gitweb: http://git.kernel.org/tip/809547472edae0bc68f2b5abc37b92c8a988bc8a
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:33 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:29 +0200

x86/tracing: Disentangle pagefault and resched IPI tracing key

The pagefault and the resched IPI handlers are the only ones where it is
worth optimizing the code further for the case where tracepoints are
disabled. But it makes no sense to have a single static key for both.

Separate the static keys so the two facilities are handled separately.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064957.536699...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/trace/common.h  | 15 ---
 arch/x86/include/asm/trace/exceptions.h  |  6 --
 arch/x86/include/asm/trace/irq_vectors.h | 29 +++--
 arch/x86/kernel/smp.c|  2 +-
 arch/x86/kernel/tracepoint.c | 27 ++-
 arch/x86/mm/fault.c  |  2 +-
 6 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h
index b1eb7b1..57c8da02 100644
--- a/arch/x86/include/asm/trace/common.h
+++ b/arch/x86/include/asm/trace/common.h
@@ -1,15 +1,16 @@
 #ifndef _ASM_TRACE_COMMON_H
 #define _ASM_TRACE_COMMON_H
 
-extern int trace_irq_vector_regfunc(void);
-extern void trace_irq_vector_unregfunc(void);
-
 #ifdef CONFIG_TRACING
-DECLARE_STATIC_KEY_FALSE(trace_irqvectors_key);
-#define trace_irqvectors_enabled() \
-   static_branch_unlikely(&trace_irqvectors_key)
+DECLARE_STATIC_KEY_FALSE(trace_pagefault_key);
+#define trace_pagefault_enabled()  \
+   static_branch_unlikely(&trace_pagefault_key)
+DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key);
+#define trace_resched_ipi_enabled()\
+   static_branch_unlikely(&trace_resched_ipi_key)
 #else
-static inline bool trace_irqvectors_enabled(void) { return false; }
+static inline bool trace_pagefault_enabled(void) { return false; }
+static inline bool trace_resched_ipi_enabled(void) { return false; }
 #endif
 
 #endif
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
index 960a5b5..5665bf2 100644
--- a/arch/x86/include/asm/trace/exceptions.h
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -7,6 +7,9 @@
 #include <linux/tracepoint.h>
 #include <asm/trace/common.h>
 
+extern int trace_pagefault_reg(void);
+extern void trace_pagefault_unreg(void);
+
 DECLARE_EVENT_CLASS(x86_exceptions,
 
TP_PROTO(unsigned long address, struct pt_regs *regs,
@@ -35,8 +38,7 @@ DEFINE_EVENT_FN(x86_exceptions, name, \
TP_PROTO(unsigned long address, struct pt_regs *regs,   \
 unsigned long error_code), \
TP_ARGS(address, regs, error_code), \
-   trace_irq_vector_regfunc,   \
-   trace_irq_vector_unregfunc);
+   trace_pagefault_reg, trace_pagefault_unreg);
 
 DEFINE_PAGE_FAULT_EVENT(page_fault_user);
 DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 7825b44..a1bdc25 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -7,6 +7,9 @@
 #include <linux/tracepoint.h>
 #include <asm/trace/common.h>
 
+extern int trace_resched_ipi_reg(void);
+extern void trace_resched_ipi_unreg(void);
+
 DECLARE_EVENT_CLASS(x86_irq_vector,
 
TP_PROTO(int vector),
@@ -26,15 +29,22 @@ DECLARE_EVENT_CLASS(x86_irq_vector,
 #define DEFINE_IRQ_VECTOR_EVENT(name)  \
 DEFINE_EVENT_FN(x86_irq_vector, name##_entry,  \
TP_PROTO(int vector),   \
+   TP_ARGS(vector), NULL, NULL);   \
+DEFINE_EVENT_FN(x86_irq_vector, name##_exit,   \
+   TP_PROTO(int vector),   \
+   TP_ARGS(vector), NULL, NULL);
+
+#define DEFINE_RESCHED_IPI_EVENT(name) \
+DEFINE_EVENT_FN(x86_irq_vector, name##_entry,  \
+   TP_PROTO(int vector),   \
TP_ARGS(vector),\
-   trace_irq_vector_regfunc,   \
-   trace_irq_vector_unregfunc);\
+   trace_resched_ipi_reg,  \
+   trace_resched_ipi_unreg);   \
 DEFINE_EVENT_FN(x86_irq_vector, name##_exit,   \
TP_PROTO(int vector),   \
TP_ARGS(vector),\
-   trace_irq_vector_regfunc,   \
-   trace_irq_vector_unregfunc);
-
+   trace_resched_ipi_reg,   
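
The mechanism behind both keys, condensed: the tracepoint's register and
unregister callbacks bump a static key, and the hot path tests the key with
a patched branch that costs a NOP while the count is zero. In kernel terms,
a condensed sketch using the real jump-label API (not the full patch):

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(trace_pagefault_key);

/* Wired up as the tracepoint's reg/unreg hooks. */
int trace_pagefault_reg(void)
{
	static_branch_inc(&trace_pagefault_key);
	return 0;
}

void trace_pagefault_unreg(void)
{
	static_branch_dec(&trace_pagefault_key);
}

/* Hot path: compiles to a 5-byte NOP until the key is switched on. */
#define trace_pagefault_enabled() \
	static_branch_unlikely(&trace_pagefault_key)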

[tip:x86/apic] x86/irq_work: Make it depend on APIC

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  a45525b5b47c10c0446eda21227792b39af233dc
Gitweb: http://git.kernel.org/tip/a45525b5b47c10c0446eda21227792b39af233dc
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:35 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:30 +0200

x86/irq_work: Make it depend on APIC

The irq work interrupt vector is only installed when CONFIG_X86_LOCAL_APIC is
enabled, but the interrupt handler is compiled in unconditionally.

Compile the cruft out when the APIC is disabled.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064957.691909...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/irq_work.h | 8 
 arch/x86/kernel/irq_work.c  | 4 ++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
index f706041..ddbb8ea 100644
--- a/arch/x86/include/asm/irq_work.h
+++ b/arch/x86/include/asm/irq_work.h
@@ -3,9 +3,17 @@
 
 #include 
 
+#ifdef CONFIG_X86_LOCAL_APIC
 static inline bool arch_irq_work_has_interrupt(void)
 {
return boot_cpu_has(X86_FEATURE_APIC);
 }
+extern void arch_irq_work_raise(void);
+#else
+static inline bool arch_irq_work_has_interrupt(void)
+{
+   return false;
+}
+#endif
 
 #endif /* _ASM_IRQ_WORK_H */
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 8054cae..70dee05 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -11,6 +11,7 @@
 #include <asm/apic.h>
 #include <asm/trace/irq_vectors.h>
 
+#ifdef CONFIG_X86_LOCAL_APIC
 __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
 {
ipi_entering_ack_irq();
@@ -23,11 +24,10 @@ __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
 
 void arch_irq_work_raise(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
if (!arch_irq_work_has_interrupt())
return;
 
apic->send_IPI_self(IRQ_WORK_VECTOR);
apic_wait_icr_idle();
-#endif
 }
+#endif
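
The header side is the standard stub trick: when the facility cannot exist,
the inline returns false, callers compile unchanged, and the real raise and
handler code disappears with the #ifdef. Schematically (the feature macro is
illustrative):

#include <stdbool.h>

#ifdef HAVE_SELF_IPI
bool arch_irq_work_has_interrupt(void);	/* real query */
void arch_irq_work_raise(void);		/* real IPI sender */
#else
static inline bool arch_irq_work_has_interrupt(void)
{
	return false;	/* callers check this and take the fallback path */
}
/* No arch_irq_work_raise() prototype: nothing may reference it. */
#endif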


[tip:x86/apic] x86/tracing: Build tracepoints only when they are used

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  73285527804402befe5d5140aeede21c16544b4c
Gitweb: http://git.kernel.org/tip/73285527804402befe5d5140aeede21c16544b4c
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:36 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:24 +0200

x86/tracing: Build tracepoints only when they are used

The tracepoint macro magic emits code for all tracepoints in an event header
file. That code stays around even if the tracepoint is not used at all. The
linker does not discard it.

Build the various irq_vector tracepoints dependent on the appropriate CONFIG
switches.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064957.770651...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/trace/irq_vectors.h | 36 +---
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index a1bdc25..1599d39 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -7,6 +7,8 @@
 #include <linux/tracepoint.h>
 #include <asm/trace/common.h>
 
+#ifdef CONFIG_X86_LOCAL_APIC
+
 extern int trace_resched_ipi_reg(void);
 extern void trace_resched_ipi_unreg(void);
 
@@ -53,18 +55,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \
 DEFINE_IRQ_VECTOR_EVENT(local_timer);
 
 /*
- * The ifdef is required because that tracepoint macro hell emits tracepoint
- * code in files which include this header even if the tracepoint is not
- * enabled. Brilliant stuff that.
- */
-#ifdef CONFIG_SMP
-/*
- * reschedule - called when entering/exiting a reschedule vector handler
- */
-DEFINE_RESCHED_IPI_EVENT(reschedule);
-#endif
-
-/*
  * spurious_apic - called when entering/exiting a spurious apic vector handler
  */
 DEFINE_IRQ_VECTOR_EVENT(spurious_apic);
@@ -80,6 +70,7 @@ DEFINE_IRQ_VECTOR_EVENT(error_apic);
  */
 DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi);
 
+#ifdef CONFIG_IRQ_WORK
 /*
  * irq_work - called when entering/exiting a irq work interrupt
  * vector handler
@@ -96,6 +87,18 @@ DEFINE_IRQ_VECTOR_EVENT(irq_work);
  *  4) goto 1
  */
 TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0);
+#endif
+
+/*
+ * The ifdef is required because that tracepoint macro hell emits tracepoint
+ * code in files which include this header even if the tracepoint is not
+ * enabled. Brilliant stuff that.
+ */
+#ifdef CONFIG_SMP
+/*
+ * reschedule - called when entering/exiting a reschedule vector handler
+ */
+DEFINE_RESCHED_IPI_EVENT(reschedule);
 
 /*
  * call_function - called when entering/exiting a call function interrupt
@@ -108,24 +111,33 @@ DEFINE_IRQ_VECTOR_EVENT(call_function);
  * single interrupt vector handler
  */
 DEFINE_IRQ_VECTOR_EVENT(call_function_single);
+#endif
 
+#ifdef CONFIG_X86_MCE_THRESHOLD
 /*
  * threshold_apic - called when entering/exiting a threshold apic interrupt
  * vector handler
  */
 DEFINE_IRQ_VECTOR_EVENT(threshold_apic);
+#endif
 
+#ifdef CONFIG_X86_MCE_AMD
 /*
  * deferred_error_apic - called when entering/exiting a deferred apic interrupt
  * vector handler
  */
 DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic);
+#endif
 
+#ifdef CONFIG_X86_THERMAL_VECTOR
 /*
  * thermal_apic - called when entering/exiting a thermal apic interrupt
  * vector handler
  */
 DEFINE_IRQ_VECTOR_EVENT(thermal_apic);
+#endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
 
 #undef TRACE_INCLUDE_PATH
 #define TRACE_INCLUDE_PATH .
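
The underlying problem is easy to reproduce outside the tracing machinery: a
definition macro emits static data plus an emit function per event, and
unless the definition sits under the same guard as its only call sites,
every image pays for it. A toy model:

#include <stdio.h>

/* One "event" = one data record plus one emit function. */
#define DEFINE_TOY_EVENT(name)					\
	static const char name##_label[] = #name;		\
	static void trace_##name(int v)				\
	{							\
		printf("%s: %d\n", name##_label, v);		\
	}

#ifdef CONFIG_SMP	/* guard mirrors the only possible caller */
DEFINE_TOY_EVENT(reschedule)
#endif

int main(void)
{
#ifdef CONFIG_SMP
	trace_reschedule(42);
#endif
	return 0;
}

Real tracepoints are worse than this model: their data lives in dedicated
sections the linker keeps, so unused definitions are not even discarded as
dead code — exactly the "linker does not discard it" point above.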


[tip:x86/apic] x86/idt: Move 32-bit idt_descr to C code

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  16bc18d895cee95f12bd722e5a3016676dfcf084
Gitweb: http://git.kernel.org/tip/16bc18d895cee95f12bd722e5a3016676dfcf084
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:44 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:26 +0200

x86/idt: Move 32-bit idt_descr to C code

32-bit kernels have the idt_descr defined in the low level assembly entry code,
but there is no good reason for that.

Move it into the C file and use the 64-bit version of it.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.445862...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/head_32.S |  6 --
 arch/x86/kernel/idt.c | 10 +-
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 29da959..ce8c6ed 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -622,7 +622,6 @@ int_msg:
 
.data
 .globl boot_gdt_descr
-.globl idt_descr
 
ALIGN
 # early boot GDT descriptor (must use 1:1 address mapping)
@@ -631,11 +630,6 @@ boot_gdt_descr:
.word __BOOT_DS+7
.long boot_gdt - __PAGE_OFFSET
 
-   .word 0 # 32-bit align idt_desc.address
-idt_descr:
-   .word IDT_ENTRIES*8-1   # idt contains 256 entries
-   .long idt_table
-
 # boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
 ENTRY(early_gdt_descr)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 3d19cad..86e5912 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -10,15 +10,15 @@
 /* Must be page-aligned because the real IDT is used in a fixmap. */
 gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
 
-#ifdef CONFIG_X86_64
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
-
 struct desc_ptr idt_descr __ro_after_init = {
-   .size   = IDT_ENTRIES * 16 - 1,
+   .size   = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
.address= (unsigned long) idt_table,
 };
 
+#ifdef CONFIG_X86_64
+/* No need to be aligned, but done to keep all IDTs defined the same way. */
+gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
+
 const struct desc_ptr debug_idt_descr = {
.size   = IDT_ENTRIES * 16 - 1,
.address= (unsigned long) debug_idt_table,
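
The arithmetic that lets one initializer serve both widths: a gate
descriptor occupies two longs, 8 bytes on 32-bit and 16 bytes on 64-bit, so
the expression tracks sizeof(unsigned long):

#include <stdio.h>

#define IDT_ENTRIES 256

int main(void)
{
	unsigned long size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1;

	/* 32-bit long (4 bytes): 256 *  8 - 1 = 0x7ff
	 * 64-bit long (8 bytes): 256 * 16 - 1 = 0xfff */
	printf("idt_descr.size = %#lx\n", size);
	return 0;
}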


[tip:x86/apic] x86/idt: Create file for IDT related code

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  d8ed9d48266a27ab02a4bbcb81e755d63aec108a
Gitweb: http://git.kernel.org/tip/d8ed9d48266a27ab02a4bbcb81e755d63aec108a
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:43 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:25 +0200

x86/idt: Create file for IDT related code

IDT related code lives scattered around in various places. Create a new
source file in arch/x86/kernel/idt.c to hold it.

Move the idt_tables and descriptors to it for a start. Follow up patches
will gradually move more code over.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.367081...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/Makefile |  2 +-
 arch/x86/kernel/cpu/common.c |  9 -
 arch/x86/kernel/idt.c| 26 ++
 arch/x86/kernel/traps.c  |  6 --
 4 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 6ab5fbf..fd0a789 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -42,7 +42,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y  := process_$(BITS).o signal.o
 obj-$(CONFIG_COMPAT)   += signal_compat.o
-obj-y  += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y  += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y  += time.o ioport.o dumpstack.o nmi.o
 obj-$(CONFIG_MODIFY_LDT_SYSCALL)   += ldt.o
 obj-y  += setup.o x86_init.o i8259.o irqinit.o jump_label.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c8b3987..71ab8a4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1289,15 +1289,6 @@ static __init int setup_disablecpuid(char *arg)
 __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
-struct desc_ptr idt_descr __ro_after_init = {
-   .size = NR_VECTORS * 16 - 1,
-   .address = (unsigned long) idt_table,
-};
-const struct desc_ptr debug_idt_descr = {
-   .size = NR_VECTORS * 16 - 1,
-   .address = (unsigned long) debug_idt_table,
-};
-
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
 irq_stack_union) __aligned(PAGE_SIZE) __visible;
 
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
new file mode 100644
index 000..3d19cad
--- /dev/null
+++ b/arch/x86/kernel/idt.c
@@ -0,0 +1,26 @@
+/*
+ * Interrupt descriptor table related code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/interrupt.h>
+
+#include <asm/desc.h>
+
+/* Must be page-aligned because the real IDT is used in a fixmap. */
+gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* No need to be aligned, but done to keep all IDTs defined the same way. */
+gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+struct desc_ptr idt_descr __ro_after_init = {
+   .size   = IDT_ENTRIES * 16 - 1,
+   .address= (unsigned long) idt_table,
+};
+
+const struct desc_ptr debug_idt_descr = {
+   .size   = IDT_ENTRIES * 16 - 1,
+   .address= (unsigned long) debug_idt_table,
+};
+#endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 36c5836..41f4cd3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -65,18 +65,12 @@
 #include 
 #include 
 #include 
-
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
 #else
 #include 
 #include 
 #include 
 #endif
 
-/* Must be page-aligned because the real IDT is used in a fixmap. */
-gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
-
 DECLARE_BITMAP(used_vectors, NR_VECTORS);
 
 static inline void cond_local_irq_enable(struct pt_regs *regs)
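
For orientation, the two objects the new file owns are exactly what the LIDT
instruction consumes: idt_table is the gate array, and idt_descr is the
pointer record (16-bit limit plus base address) handed to the hardware. A
sketch of that relationship, mirroring the kernel's native_load_idt()
(privileged, so it only makes sense in ring 0):

struct desc_ptr {
	unsigned short size;	/* bytes in the table, minus one */
	unsigned long address;	/* linear address of idt_table */
} __attribute__((packed));

static inline void load_idt_sketch(const struct desc_ptr *dtr)
{
	asm volatile("lidt %0" : : "m" (*dtr));
}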


[tip:x86/apic] x86/idt: Remove unused functions/inlines

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  485fa57bd73a0b79987d144e15bdc582f926701d
Gitweb: http://git.kernel.org/tip/485fa57bd73a0b79987d144e15bdc582f926701d
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:56 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:28 +0200

x86/idt: Remove unused functions/inlines

The IDT related inlines are no longer used. Remove them.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064959.422083...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h | 36 
 1 file changed, 36 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index cae0cb0..cbd36dd 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -390,16 +390,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
desc->limit1 = (limit >> 16) & 0xf;
 }
 
-#ifdef CONFIG_X86_64
-static inline void set_nmi_gate(int gate, void *addr)
-{
-   gate_desc s;
-
-   pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
-   write_idt_entry(debug_idt_table, gate, &s);
-}
-#endif
-
 static inline void _set_gate(int gate, unsigned type, const void *addr,
 unsigned dpl, unsigned ist, unsigned seg)
 {
@@ -437,32 +427,6 @@ static inline void alloc_system_vector(int vector)
set_intr_gate(n, addr); \
} while (0)
 
-/*
- * This routine sets up an interrupt gate at directory privilege level 3.
- */
-static inline void set_system_intr_gate(unsigned int n, void *addr)
-{
-   BUG_ON((unsigned)n > 0xFF);
-   _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
-}
-
-static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
-{
-   BUG_ON((unsigned)n > 0xFF);
-   _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
-}
-
-static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
-{
-   BUG_ON((unsigned)n > 0xFF);
-   _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
-}
-
-static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
-{
-   BUG_ON((unsigned)n > 0xFF);
-   _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
-}
 
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(u32, debug_idt_ctr);


[tip:x86/apic] x86/idt: Deinline setup functions

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  db18da78f9a8bbab1bdc5968ba47ace788b5061f
Gitweb: http://git.kernel.org/tip/db18da78f9a8bbab1bdc5968ba47ace788b5061f
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:57 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:28 +0200

x86/idt: Deinline setup functions

None of this is performance sensitive in any way - so debloat the kernel.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064959.502052...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h | 37 ++---
 arch/x86/kernel/idt.c   | 43 ++-
 2 files changed, 36 insertions(+), 44 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index cbd36dd..33f84f2 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -390,44 +390,11 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
desc->limit1 = (limit >> 16) & 0xf;
 }
 
-static inline void _set_gate(int gate, unsigned type, const void *addr,
-unsigned dpl, unsigned ist, unsigned seg)
-{
-   gate_desc s;
-
-   pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
-   /*
-* does not need to be atomic because it is only done once at
-* setup time
-*/
-   write_idt_entry(idt_table, gate, &s);
-}
-
-static inline void set_intr_gate(unsigned int n, const void *addr)
-{
-   BUG_ON(n > 0xFF);
-   _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
-}
+void set_intr_gate(unsigned int n, const void *addr);
+void alloc_intr_gate(unsigned int n, const void *addr);
 
 extern unsigned long used_vectors[];
 
-static inline void alloc_system_vector(int vector)
-{
-   BUG_ON(vector < FIRST_SYSTEM_VECTOR);
-   if (!test_bit(vector, used_vectors)) {
-   set_bit(vector, used_vectors);
-   } else {
-   BUG();
-   }
-}
-
-#define alloc_intr_gate(n, addr)   \
-   do {\
-   alloc_system_vector(n); \
-   set_intr_gate(n, addr); \
-   } while (0)
-
-
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(u32, debug_idt_ctr);
 static inline bool is_debug_idt_enabled(void)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 99f93a6..8e9318d 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -212,15 +212,16 @@ static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
 #endif
 }
 
-static __init void
-idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size)
+static void
+idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
 {
gate_desc desc;
 
for (; size > 0; t++, size--) {
idt_init_desc(&desc, t);
-   set_bit(t->vector, used_vectors);
write_idt_entry(idt, t->vector, &desc);
+   if (sys)
+   set_bit(t->vector, used_vectors);
}
 }
 
@@ -233,7 +234,8 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size)
  */
 void __init idt_setup_early_traps(void)
 {
-   idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts));
+   idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts),
+true);
load_idt(&idt_descr);
 }
 
@@ -242,7 +244,7 @@ void __init idt_setup_early_traps(void)
  */
 void __init idt_setup_traps(void)
 {
-   idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts));
+   idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
 }
 
 #ifdef CONFIG_X86_64
@@ -259,7 +261,7 @@ void __init idt_setup_traps(void)
 void __init idt_setup_early_pf(void)
 {
idt_setup_from_table(idt_table, early_pf_idts,
-ARRAY_SIZE(early_pf_idts));
+ARRAY_SIZE(early_pf_idts), true);
 }
 
 /**
@@ -267,7 +269,7 @@ void __init idt_setup_early_pf(void)
  */
 void __init idt_setup_ist_traps(void)
 {
-   idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts));
+   idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
 }
 
 /**
@@ -277,7 +279,7 @@ void __init idt_setup_debugidt_traps(void)
 {
memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
 
-   idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts));
+   
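
For context, the deinlined helpers plausibly take the following shape in
idt.c — build a one-element idt_data on the stack and feed it to the same
table routine, with the new sys flag selecting whether the vector gets
reserved in used_vectors. This is a reconstruction from the hunks above, not
the verbatim commit:

void set_intr_gate(unsigned int n, const void *addr)
{
	struct idt_data data;

	BUG_ON(n > 0xFF);

	memset(&data, 0, sizeof(data));
	data.vector	= n;
	data.addr	= addr;
	data.segment	= __KERNEL_CS;
	data.bits.type	= GATE_INTERRUPT;
	data.bits.p	= 1;

	/* sys=false: a plain gate does not reserve a system vector */
	idt_setup_from_table(idt_table, &data, 1, false);
}

void alloc_intr_gate(unsigned int n, const void *addr)
{
	BUG_ON(test_bit(n, used_vectors) || n < FIRST_SYSTEM_VECTOR);
	set_bit(n, used_vectors);
	set_intr_gate(n, addr);
}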

[tip:x86/apic] x86/idt: Move interrupt gate initialization to IDT code

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  dc20b2d526539344d7175a2a83221337302596b8
Gitweb: http://git.kernel.org/tip/dc20b2d526539344d7175a2a83221337302596b8
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:55 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:28 +0200

x86/idt: Move interrupt gate initialization to IDT code

Move the gate initialization from interrupt init to the IDT code so all IDT
related operations are at a single place.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064959.340209...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/idt.c | 18 ++
 arch/x86/kernel/irqinit.c | 18 --
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 4327104..99f93a6 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -286,7 +286,25 @@ void __init idt_setup_debugidt_traps(void)
  */
 void __init idt_setup_apic_and_irq_gates(void)
 {
+   int i = FIRST_EXTERNAL_VECTOR;
+   void *entry;
+
idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts));
+
+   for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) {
+   entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
+   set_intr_gate(i, entry);
+   }
+
+   for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
+#ifdef CONFIG_X86_LOCAL_APIC
+   set_bit(i, used_vectors);
+   set_intr_gate(i, spurious_interrupt);
+#else
+   entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
+   set_intr_gate(i, entry);
+#endif
+   }
 }
 
 /**
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 218cd06..1add9e0 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -89,29 +89,11 @@ void __init init_IRQ(void)
 
 void __init native_init_IRQ(void)
 {
-   int i;
-
/* Execute any quirks before the call gates are initialised: */
x86_init.irqs.pre_vector_init();
 
idt_setup_apic_and_irq_gates();
 
-   /*
-* Cover the whole vector space, no vector can escape
-* us. (some of these will be overridden and become
-* 'special' SMP interrupts)
-*/
-   i = FIRST_EXTERNAL_VECTOR;
-   for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) {
-   /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
-   set_intr_gate(i, irq_entries_start +
-   8 * (i - FIRST_EXTERNAL_VECTOR));
-   }
-#ifdef CONFIG_X86_LOCAL_APIC
-   for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
-   set_intr_gate(i, spurious_interrupt);
-#endif
-
if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2);
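
The two loops simply walk the gaps of used_vectors. A userspace model of the
same iteration (for_each_clear_bit_from() is the real bitmap helper; this
re-implements its effect, and the FIRST_SYSTEM_VECTOR value depends on the
kernel configuration):

#include <stdbool.h>
#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR	0x20
#define FIRST_SYSTEM_VECTOR	0xec	/* config dependent */
#define NR_VECTORS		256

static bool used[NR_VECTORS];	/* stands in for the used_vectors bitmap */

int main(void)
{
	int i;

	for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++)
		if (!used[i])
			printf("vector %#x -> device irq entry stub\n", i);

	for (; i < NR_VECTORS; i++)
		if (!used[i])
			printf("vector %#x -> spurious_interrupt\n", i);
	return 0;
}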
 


[tip:x86/apic] x86/tracing: Introduce a static key for exception tracing

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  2feb1b316d48004d905278c02a55902cab0be8be
Gitweb: http://git.kernel.org/tip/2feb1b316d48004d905278c02a55902cab0be8be
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:21 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:23 +0200

x86/tracing: Introduce a static key for exception tracing

Switching the IDT just for avoiding tracepoints creates a completely
impenetrable macro/inline/ifdef mess.

There is no point in avoiding tracepoints for most of the traps/exceptions.
For the more expensive tracepoints, like pagefaults, this can be handled with
an explicit static key.

Preparatory patch to remove the tracing IDT.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064956.593094...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/trace/common.h  | 15 +++
 arch/x86/include/asm/trace/exceptions.h  |  4 +---
 arch/x86/include/asm/trace/irq_vectors.h |  4 +---
 arch/x86/kernel/tracepoint.c |  9 -
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h
new file mode 100644
index 000..b1eb7b1
--- /dev/null
+++ b/arch/x86/include/asm/trace/common.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_TRACE_COMMON_H
+#define _ASM_TRACE_COMMON_H
+
+extern int trace_irq_vector_regfunc(void);
+extern void trace_irq_vector_unregfunc(void);
+
+#ifdef CONFIG_TRACING
+DECLARE_STATIC_KEY_FALSE(trace_irqvectors_key);
+#define trace_irqvectors_enabled() \
+   static_branch_unlikely(&trace_irqvectors_key)
+#else
+static inline bool trace_irqvectors_enabled(void) { return false; }
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
index 2422b14..960a5b5 100644
--- a/arch/x86/include/asm/trace/exceptions.h
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -5,9 +5,7 @@
 #define _TRACE_PAGE_FAULT_H
 
 #include <linux/tracepoint.h>
-
-extern int trace_irq_vector_regfunc(void);
-extern void trace_irq_vector_unregfunc(void);
+#include <asm/trace/common.h>
 
 DECLARE_EVENT_CLASS(x86_exceptions,
 
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 32dd6a9..7825b44 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -5,9 +5,7 @@
 #define _TRACE_IRQ_VECTORS_H
 
 #include <linux/tracepoint.h>
-
-extern int trace_irq_vector_regfunc(void);
-extern void trace_irq_vector_unregfunc(void);
+#include <asm/trace/common.h>
 
 DECLARE_EVENT_CLASS(x86_irq_vector,
 
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 1551513..dd4aa04 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -4,9 +4,11 @@
  * Copyright (C) 2013 Seiji Aguchi 
  *
  */
+#include <linux/jump_label.h>
+#include <linux/atomic.h>
+
 #include <asm/hw_irq.h>
 #include <asm/desc.h>
-#include <linux/atomic.h>
 
 atomic_t trace_idt_ctr = ATOMIC_INIT(0);
 struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
@@ -15,6 +17,7 @@ struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
 /* No need to be aligned, but done to keep all IDTs defined the same way. */
 gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
 
+DEFINE_STATIC_KEY_FALSE(trace_irqvectors_key);
 static int trace_irq_vector_refcount;
 static DEFINE_MUTEX(irq_vector_mutex);
 
@@ -36,6 +39,8 @@ static void switch_idt(void *arg)
 
 int trace_irq_vector_regfunc(void)
 {
+   static_branch_inc(&trace_irqvectors_key);
+
mutex_lock(_vector_mutex);
if (!trace_irq_vector_refcount) {
set_trace_idt_ctr(1);
@@ -49,6 +54,8 @@ int trace_irq_vector_regfunc(void)
 
 void trace_irq_vector_unregfunc(void)
 {
+   static_branch_dec(&trace_irqvectors_key);
+
mutex_lock(_vector_mutex);
trace_irq_vector_refcount--;
if (!trace_irq_vector_refcount) {


[tip:x86/apic] x86/traps: Simplify pagefault tracing logic

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  11a7ffb01703c3bbb1e9b968893f4487a1b0b5a8
Gitweb: http://git.kernel.org/tip/11a7ffb01703c3bbb1e9b968893f4487a1b0b5a8
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:22 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:23 +0200

x86/traps: Simplify pagefault tracing logic

Make use of the new irqvector tracing static key and remove the duplicated
trace_do_pagefault() implementation.

If irq vector tracing is disabled, then the overhead of this is a single
NOP5, which is a reasonable tradeoff to avoid duplicated code and the
unholy macro mess.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064956.672965...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/entry/entry_32.S|  8 
 arch/x86/entry/entry_64.S| 13 +---
 arch/x86/include/asm/traps.h | 10 +
 arch/x86/kernel/kvm.c|  2 +-
 arch/x86/mm/fault.c  | 49 
 5 files changed, 16 insertions(+), 66 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 48ef7bb..0092da1 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -891,14 +891,6 @@ BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
 
 #endif /* CONFIG_HYPERV */
 
-#ifdef CONFIG_TRACING
-ENTRY(trace_page_fault)
-   ASM_CLAC
-   pushl   $trace_do_page_fault
-   jmp common_exception
-END(trace_page_fault)
-#endif
-
 ENTRY(page_fault)
ASM_CLAC
pushl   $do_page_fault
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4dbb336..2731b94 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -918,17 +918,6 @@ ENTRY(\sym)
 END(\sym)
 .endm
 
-#ifdef CONFIG_TRACING
-.macro trace_idtentry sym do_sym has_error_code:req
-idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
-idtentry \sym \do_sym has_error_code=\has_error_code
-.endm
-#else
-.macro trace_idtentry sym do_sym has_error_code:req
-idtentry \sym \do_sym has_error_code=\has_error_code
-.endm
-#endif
-
 idtentry divide_error          do_divide_error         has_error_code=0
 idtentry overflow              do_overflow             has_error_code=0
 idtentry bounds                do_bounds               has_error_code=0
@@ -1096,7 +1085,7 @@ idtentry xen_stack_segment do_stack_segment has_error_code=1
 #endif
 
 idtentry general_protectiondo_general_protection   has_error_code=1
-trace_idtentry page_fault  do_page_fault   has_error_code=1
+idtentry page_faultdo_page_fault   has_error_code=1
 
 #ifdef CONFIG_KVM_GUEST
 idtentry async_page_fault  do_async_page_fault has_error_code=1
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 01fd0a7..b4f322d 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -39,7 +39,6 @@ asmlinkage void machine_check(void);
 asmlinkage void simd_coprocessor_error(void);
 
 #ifdef CONFIG_TRACING
-asmlinkage void trace_page_fault(void);
 #define trace_stack_segment stack_segment
 #define trace_divide_error divide_error
 #define trace_bounds bounds
@@ -54,6 +53,7 @@ asmlinkage void trace_page_fault(void);
 #define trace_alignment_check alignment_check
 #define trace_simd_coprocessor_error simd_coprocessor_error
 #define trace_async_page_fault async_page_fault
+#define trace_page_fault page_fault
 #endif
 
 dotraplinkage void do_divide_error(struct pt_regs *, long);
@@ -74,14 +74,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
-#ifdef CONFIG_TRACING
-dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long);
-#else
-static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error)
-{
-   do_page_fault(regs, error);
-}
-#endif
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
 dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
 dotraplinkage void do_alignment_check(struct pt_regs *, long);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d04e30e..6ed9242 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -263,7 +263,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 
switch (kvm_read_and_reset_pf_reason()) {
default:
-   trace_do_page_fault(regs, error_code);
+   do_page_fault(regs, error_code);
break;
case 
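
With the trace_ entry point gone, the conditional moves into the C handler
itself; the resulting shape of the fault path is roughly the following
(a sketch of the post-patch structure, not verbatim fault.c):

dotraplinkage void notrace
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address = read_cr2();	/* faulting address */
	enum ctx_state prev_state;

	prev_state = exception_enter();
	if (trace_pagefault_enabled())		/* static key: NOP5 when off */
		trace_page_fault_entries(address, regs, error_code);

	__do_page_fault(regs, error_code, address);
	exception_exit(prev_state);
}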

[tip:x86/apic] x86/boot: Move EISA setup to a separate file

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  f7eaf6e00fd581043bb540dfe865f1d81769b189
Gitweb: http://git.kernel.org/tip/f7eaf6e00fd581043bb540dfe865f1d81769b189
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:20 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:22 +0200

x86/boot: Move EISA setup to a separate file

EISA has absolutely nothing to do with traps, so move it out of traps.c
into its own eisa.c file.

Furthermore, the EISA bus detection does not need to run during
very early boot, it's good enough to run it before the EISA bus
and drivers are initialized.

I.e. instead of calling it from the very early trap_init() code,
make it a subsys_initcall().

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064956.515322...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/eisa.c   | 18 ++
 arch/x86/kernel/traps.c  | 13 -
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 287eac7..6ab5fbf 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)   += pvclock.o
 obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
 
+obj-$(CONFIG_EISA) += eisa.o
 obj-$(CONFIG_PCSPKR_PLATFORM)  += pcspeaker.o
 
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
new file mode 100644
index 000..881f923
--- /dev/null
+++ b/arch/x86/kernel/eisa.c
@@ -0,0 +1,18 @@
+/*
+ * EISA specific code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/eisa.h>
+#include <linux/io.h>
+
+static __init int eisa_bus_probe(void)
+{
+   void __iomem *p = ioremap(0x0FFFD9, 4);
+
+   if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
+   EISA_bus = 1;
+   iounmap(p);
+   return 0;
+}
+subsys_initcall(eisa_bus_probe);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 556f8f5..3095324 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -38,11 +38,6 @@
 #include 
 #include 
 
-#ifdef CONFIG_EISA
-#include 
-#include 
-#endif
-
 #if defined(CONFIG_EDAC)
 #include <linux/edac.h>
 #endif
@@ -969,14 +964,6 @@ void __init trap_init(void)
 {
int i;
 
-#ifdef CONFIG_EISA
-   void __iomem *p = early_ioremap(0x0FFFD9, 4);
-
-   if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
-   EISA_bus = 1;
-   early_iounmap(p, 4);
-#endif
-
set_intr_gate(X86_TRAP_DE, divide_error);
set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
/* int4 can be called from all */
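
The magic constant deserves a gloss: readl() performs a little-endian 32-bit
load, so the four ROM bytes 'E','I','S','A' assemble with 'E' in the low
byte, and the expression builds the same value. Worked out:

#include <stdio.h>

int main(void)
{
	unsigned int sig = 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24);
	const unsigned char rom[4] = { 'E', 'I', 'S', 'A' };
	unsigned int mem = rom[0] | rom[1] << 8 | rom[2] << 16 |
			   (unsigned int)rom[3] << 24;

	/* Both print 0x41534945 ("EISA" read as a little-endian u32). */
	printf("sig=%#x mem=%#x match=%d\n", sig, mem, sig == mem);
	return 0;
}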


[tip:x86/apic] x86/irq: Get rid of the 'first_system_vector' indirection bogosity

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  05161b9cbe553c41cf775ac41bb5120d94347e5c
Gitweb: http://git.kernel.org/tip/05161b9cbe553c41cf775ac41bb5120d94347e5c
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:18 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:21 +0200

x86/irq: Get rid of the 'first_system_vector' indirection bogosity

This variable is beyond pointless. Nothing allocates a vector via
alloc_gate() below FIRST_SYSTEM_VECTOR. So nothing can change
first_system_vector.

If there is a need for a gate below FIRST_SYSTEM_VECTOR then it can be
added to the vector defines and FIRST_SYSTEM_VECTOR can be adjusted
accordingly.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064956.357109...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h   | 5 ++---
 arch/x86/kernel/apic/apic.c   | 2 --
 arch/x86/kernel/apic/vector.c | 2 +-
 arch/x86/kernel/irq.c | 2 +-
 arch/x86/kernel/irqinit.c | 5 +
 5 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index d0a21b1..a7f36ab 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -5,6 +5,7 @@
 #include <asm/desc_defs.h>
 #include <asm/ldt.h>
 #include <asm/mmu.h>
+#include <asm/irq_vectors.h>
 
 #include <linux/smp.h>
 #include <linux/percpu.h>
@@ -482,16 +483,14 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
0, 0, __KERNEL_CS); \
} while (0)
 
-extern int first_system_vector;
 /* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
 extern unsigned long used_vectors[];
 
 static inline void alloc_system_vector(int vector)
 {
+   BUG_ON(vector < FIRST_SYSTEM_VECTOR);
if (!test_bit(vector, used_vectors)) {
set_bit(vector, used_vectors);
-   if (first_system_vector > vector)
-   first_system_vector = vector;
} else {
BUG();
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 98b3dd8..8996ef1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -177,8 +177,6 @@ static int disable_apic_timer __initdata;
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 
-int first_system_vector = FIRST_SYSTEM_VECTOR;
-
 /*
  * Debug level, exported for io_apic.c
  */
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b3af457..88c214e 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -166,7 +166,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
offset = current_offset;
 next:
vector += 16;
-   if (vector >= first_system_vector) {
+   if (vector >= FIRST_SYSTEM_VECTOR) {
offset = (offset + 1) % 16;
vector = FIRST_EXTERNAL_VECTOR + offset;
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e6073a0..019d0ac 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -429,7 +429,7 @@ int check_irq_vectors_for_cpu_disable(void)
 * this w/o holding vector_lock.
 */
for (vector = FIRST_EXTERNAL_VECTOR;
-vector < first_system_vector; vector++) {
+vector < FIRST_SYSTEM_VECTOR; vector++) {
if (!test_bit(vector, used_vectors) &&
IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) {
if (++count == this_count)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 6537cfe..4e5f8c0 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -169,10 +169,7 @@ void __init native_init_IRQ(void)
 * 'special' SMP interrupts)
 */
i = FIRST_EXTERNAL_VECTOR;
-#ifndef CONFIG_X86_LOCAL_APIC
-#define first_system_vector NR_VECTORS
-#endif
-   for_each_clear_bit_from(i, used_vectors, first_system_vector) {
+   for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) {
/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
set_intr_gate(i, irq_entries_start +
8 * (i - FIRST_EXTERNAL_VECTOR));


[tip:x86/apic] x86/irq: Unexport used_vectors[]

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  fa4ab5774dfe58fd5e99462f625253659d41df09
Gitweb: http://git.kernel.org/tip/fa4ab5774dfe58fd5e99462f625253659d41df09
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:17 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:20 +0200

x86/irq: Unexport used_vectors[]

No modular users.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064956.278375...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/traps.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bf54309..556f8f5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,7 +83,6 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
 gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
 
 DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
 
 static inline void cond_local_irq_enable(struct pt_regs *regs)
 {


[tip:x86/apic] x86/fpu: Use bitfield accessors for desc_struct

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  718f5d0030da8669404dab873336b16c169b430b
Gitweb: http://git.kernel.org/tip/718f5d0030da8669404dab873336b16c169b430b
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:39 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:25 +0200

x86/fpu: Use bitfield accessors for desc_struct

desc_struct is a union of u32 fields and bitfields. The access to the u32
fields is done with magic macros.

Convert it to use the bitfields and replace the macro magic with parseable
inline functions.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.042406...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/math-emu/fpu_entry.c   | 11 +-
 arch/x86/math-emu/fpu_system.h  | 48 +++--
 arch/x86/math-emu/get_address.c | 17 ---
 3 files changed, 51 insertions(+), 25 deletions(-)

diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 0203bae..d4a7df2 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -147,7 +147,7 @@ void math_emulate(struct math_emu_info *info)
}
 
code_descriptor = FPU_get_ldt_descriptor(FPU_CS);
-   if (SEG_D_SIZE(code_descriptor)) {
+   if (code_descriptor.d) {
/* The above test may be wrong, the book is not clear */
/* Segmented 32 bit protected mode */
addr_modes.default_mode = SEG32;
@@ -155,11 +155,10 @@ void math_emulate(struct math_emu_info *info)
/* 16 bit protected mode */
addr_modes.default_mode = PM16;
}
-   FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor);
-   code_limit = code_base
-   + (SEG_LIMIT(code_descriptor) +
-  1) * SEG_GRANULARITY(code_descriptor)
-   - 1;
+   FPU_EIP += code_base = seg_get_base(&code_descriptor);
+   code_limit = seg_get_limit(&code_descriptor) + 1;
+   code_limit *= seg_get_granularity(&code_descriptor);
+   code_limit += code_base - 1;
	if (code_limit < code_base)
		code_limit = 0xffffffff;
}
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index a179254..2319a25 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -34,17 +34,43 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
return ret;
 }
 
-#define SEG_D_SIZE(x)  ((x).b & (3 << 21))
-#define SEG_G_BIT(x)   ((x).b & (1 << 23))
-#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1)
-#define SEG_286_MODE(x)    ((x).b & ( 0xff000000 | 0xf0000 | (1 << 23)))
-#define SEG_BASE_ADDR(s)   (((s).b & 0xff000000) \
-| (((s).b & 0xff) << 16) | ((s).a >> 16))
-#define SEG_LIMIT(s)   (((s).b & 0xff0000) | ((s).a & 0xffff))
-#define SEG_EXECUTE_ONLY(s)(((s).b & ((1 << 11) | (1 << 9))) == (1 << 11))
-#define SEG_WRITE_PERM(s)  (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9))
-#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \
-== (1 << 10))
+#define SEG_TYPE_WRITABLE  (1U << 1)
+#define SEG_TYPE_EXPANDS_DOWN  (1U << 2)
+#define SEG_TYPE_EXECUTE   (1U << 3)
+#define SEG_TYPE_EXPAND_MASK   (SEG_TYPE_EXPANDS_DOWN | SEG_TYPE_EXECUTE)
+#define SEG_TYPE_EXECUTE_MASK  (SEG_TYPE_WRITABLE | SEG_TYPE_EXECUTE)
+
+static inline unsigned long seg_get_base(struct desc_struct *d)
+{
+   unsigned long base = (unsigned long)d->base2 << 24;
+
+   return base | ((unsigned long)d->base1 << 16) | d->base0;
+}
+
+static inline unsigned long seg_get_limit(struct desc_struct *d)
+{
+   return ((unsigned long)d->limit << 16) | d->limit0;
+}
+
+static inline unsigned long seg_get_granularity(struct desc_struct *d)
+{
+   return d->g ? 4096 : 1;
+}
+
+static inline bool seg_expands_down(struct desc_struct *d)
+{
+   return (d->type & SEG_TYPE_EXPAND_MASK) == SEG_TYPE_EXPANDS_DOWN;
+}
+
+static inline bool seg_execute_only(struct desc_struct *d)
+{
+   return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_EXECUTE;
+}
+
+static inline bool seg_writable(struct desc_struct *d)
+{
+   return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE;
+}
 
 #define I387   (&current->thread.fpu.state)
 #define FPU_info   
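
A standalone model shows what the new accessors reassemble: the descriptor
scatters base and limit across the 8-byte entry, and the bitfield struct
lets the compiler do the masking the old macros did by hand (field layout as
in an x86 GDT/LDT entry):

#include <stdio.h>

struct desc {
	unsigned int limit0 : 16;	/* limit bits  0..15 */
	unsigned int base0  : 16;	/* base  bits  0..15 */
	unsigned int base1  : 8;	/* base  bits 16..23 */
	unsigned int type   : 4, s : 1, dpl : 2, p : 1;
	unsigned int limit  : 4;	/* limit bits 16..19 */
	unsigned int avl : 1, l : 1, d : 1, g : 1;
	unsigned int base2  : 8;	/* base  bits 24..31 */
};

static unsigned long seg_base(const struct desc *d)
{
	return ((unsigned long)d->base2 << 24) |
	       ((unsigned long)d->base1 << 16) | d->base0;
}

static unsigned long seg_limit(const struct desc *d)
{
	return ((unsigned long)d->limit << 16) | d->limit0;
}

int main(void)
{
	/* base 0x12345678, limit 0xABCDE, 4 KiB granularity */
	struct desc d = { .limit0 = 0xBCDE, .base0 = 0x5678, .base1 = 0x34,
			  .g = 1, .limit = 0xA, .base2 = 0x12 };

	/* Last byte, computed as math_emulate() does: (limit+1)*gran - 1 */
	printf("base=%#lx limit=%#lx last=%#lx\n", seg_base(&d), seg_limit(&d),
	       (seg_limit(&d) + 1) * (d.g ? 4096UL : 1UL) - 1);
	return 0;
}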

[tip:x86/apic] x86/idt: Unify gate_struct handling for 32/64-bit kernels

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  64b163fab684e3de47aa8db6cc08ae7d2e194373
Gitweb: http://git.kernel.org/tip/64b163fab684e3de47aa8db6cc08ae7d2e194373
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:37 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:24 +0200

x86/idt: Unify gate_struct handling for 32/64-bit kernels

The first 32 bits of gate struct are the same for 32 and 64 bit kernels.

The 32-bit version uses desc_struct and no designated data structure,
so we need different accessors for 32 and 64 bit kernels.

Aside of that the macros which are necessary to build the 32-bit
gate descriptor are horrible to read.

Unify the gate structs and switch all code fiddling with it over.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064957.861974...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/boot/compressed/eboot.c |  8 +++---
 arch/x86/include/asm/desc.h  | 45 ++-
 arch/x86/include/asm/desc_defs.h | 57 ++--
 arch/x86/kvm/vmx.c   |  2 +-
 arch/x86/xen/enlighten_pv.c  | 12 -
 5 files changed, 67 insertions(+), 57 deletions(-)

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index c3e869e..65f0b24 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1058,7 +1058,7 @@ struct boot_params *efi_main(struct efi_config *c,
desc->s = DESC_TYPE_CODE_DATA;
desc->dpl = 0;
desc->p = 1;
-   desc->limit = 0xf;
+   desc->limit1 = 0xf;
desc->avl = 0;
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
@@ -1078,7 +1078,7 @@ struct boot_params *efi_main(struct efi_config *c,
desc->s = DESC_TYPE_CODE_DATA;
desc->dpl = 0;
desc->p = 1;
-   desc->limit = 0xf;
+   desc->limit1 = 0xf;
desc->avl = 0;
if (IS_ENABLED(CONFIG_X86_64)) {
desc->l = 1;
@@ -1099,7 +1099,7 @@ struct boot_params *efi_main(struct efi_config *c,
desc->s = DESC_TYPE_CODE_DATA;
desc->dpl = 0;
desc->p = 1;
-   desc->limit = 0xf;
+   desc->limit1 = 0xf;
desc->avl = 0;
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
@@ -1116,7 +1116,7 @@ struct boot_params *efi_main(struct efi_config *c,
desc->s = 0;
desc->dpl = 0;
desc->p = 1;
-   desc->limit = 0x0;
+   desc->limit1 = 0x0;
desc->avl = 0;
desc->l = 0;
desc->d = 0;
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index d18a604..0731064 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -84,33 +84,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
 }
 
-#ifdef CONFIG_X86_64
-
 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
 unsigned dpl, unsigned ist, unsigned seg)
 {
-   gate->offset_low= PTR_LOW(func);
+   gate->offset_low= (u16) func;
+   gate->bits.p= 1;
+   gate->bits.dpl  = dpl;
+   gate->bits.zero = 0;
+   gate->bits.type = type;
+   gate->offset_middle = (u16) (func >> 16);
+#ifdef CONFIG_X86_64
gate->segment   = __KERNEL_CS;
-   gate->ist   = ist;
-   gate->p = 1;
-   gate->dpl   = dpl;
-   gate->zero0 = 0;
-   gate->zero1 = 0;
-   gate->type  = type;
-   gate->offset_middle = PTR_MIDDLE(func);
-   gate->offset_high   = PTR_HIGH(func);
-}
-
+   gate->bits.ist  = ist;
+   gate->reserved  = 0;
+   gate->offset_high   = (u32) (func >> 32);
 #else
-static inline void pack_gate(gate_desc *gate, unsigned char type,
-unsigned long base, unsigned dpl, unsigned flags,
-unsigned short seg)
-{
-   gate->a = (seg << 16) | (base & 0xffff);
-   gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
-}
-
+   gate->segment   = seg;
+   gate->bits.ist  = 0;
 #endif
+}
 
 static inline int desc_empty(const void *ptr)
 {
@@ -186,7 +178,8 @@ static inline void pack_descriptor(struct desc_struct 
*desc, unsigned long 
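
For readers following the change, here is a compile-testable stand-in for the
unified type (a sketch only: the field names mirror the patch, while the real
definitions live in asm/desc_defs.h):

#include <stdint.h>

/* The low 32 bits (offset_low, segment) and the packed bits word are
 * identical on 32- and 64-bit; only the 64-bit build appends the high
 * offset word and the reserved word. */
struct idt_bits {
	uint16_t ist	: 3,
		 zero	: 5,
		 type	: 5,
		 dpl	: 2,
		 p	: 1;
} __attribute__((packed));

struct gate_struct {
	uint16_t	offset_low;
	uint16_t	segment;
	struct idt_bits	bits;
	uint16_t	offset_middle;
#ifdef __x86_64__
	uint32_t	offset_high;
	uint32_t	reserved;
#endif
} __attribute__((packed));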

[tip:x86/apic] x86/percpu: Use static initializer for GDT entry

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  1dd439fe97e1a32cbb980c180f1bcb54bb6a2a55
Gitweb: http://git.kernel.org/tip/1dd439fe97e1a32cbb980c180f1bcb54bb6a2a55
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:38 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:24 +0200

x86/percpu: Use static initializer for GDT entry

The IDT cleanup is about to remove pack_descriptor(). The GDT setup for the
per-cpu storage can be achieved with the static initializer as well. Replace
it.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064957.954214...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/setup_percpu.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 10edd1e..6e8fcb6f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -155,13 +155,10 @@ static void __init pcpup_populate_pte(unsigned long addr)
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
-   struct desc_struct gdt;
+   struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
+ 0xFFFFF);
 
-   pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
-   0x2 | DESCTYPE_S, 0x8);
-   gdt.s = 1;
-   write_gdt_entry(get_cpu_gdt_rw(cpu),
-   GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
+   write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
 #endif
 }
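
If the packed 0x8092 constant looks opaque: it is the flags word that
GDT_ENTRY_INIT() spreads into the descriptor bitfields. A small stand-alone
decoder (the bit layout below is an assumption based on the x86 descriptor
format, not something taken from this patch):

#include <stdio.h>

/* Assumed flags layout: bits 0-3 type, 4 s, 5-6 dpl, 7 p,
 * 12 avl, 13 l, 14 d, 15 g. */
static void decode_gdt_flags(unsigned int f)
{
	printf("type=%#x s=%u dpl=%u p=%u avl=%u l=%u d=%u g=%u\n",
	       f & 0xf, (f >> 4) & 1, (f >> 5) & 3, (f >> 7) & 1,
	       (f >> 12) & 1, (f >> 13) & 1, (f >> 14) & 1, (f >> 15) & 1);
}

int main(void)
{
	/* 0x8092: present, DPL 0, writable data segment, 4k granularity */
	decode_gdt_flags(0x8092);
	return 0;
}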
 


[tip:x86/apic] x86/idt: Switch early trap init to IDT tables

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  433f8924fa8e55a50ce57f3b8a33ed095c405644
Gitweb: http://git.kernel.org/tip/433f8924fa8e55a50ce57f3b8a33ed095c405644
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:50 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:27 +0200

x86/idt: Switch early trap init to IDT tables

Add the initialization table for the early trap setup and replace the early
trap init code.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.929139...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/idt.c   | 53 +
 arch/x86/kernel/setup.c |  4 ++--
 arch/x86/kernel/traps.c | 27 -
 3 files changed, 55 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index ae6fc12..64e2211 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -48,6 +48,28 @@ struct idt_data {
 #define TSKG(_vector, _gdt)\
G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
 
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initdata struct idt_data early_idts[] = {
+   INTG(X86_TRAP_DB,   debug),
+   SYSG(X86_TRAP_BP,   int3),
+#ifdef CONFIG_X86_32
+   INTG(X86_TRAP_PF,   page_fault),
+#endif
+};
+
+#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initdata struct idt_data early_pf_idts[] = {
+   INTG(X86_TRAP_PF,   page_fault),
+};
+#endif
+
 /* Must be page-aligned because the real IDT is used in a fixmap. */
 gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
 
@@ -93,6 +115,37 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data 
*t, int size)
 }
 
 /**
+ * idt_setup_early_traps - Initialize the idt table with early traps
+ *
+ * On X86_64 these traps do not use interrupt stacks as they can't work
+ * before cpu_init() is invoked and sets up TSS. The IST variants are
+ * installed after that.
+ */
+void __init idt_setup_early_traps(void)
+{
+   idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts));
+   load_idt(&idt_descr);
+}
+
+#ifdef CONFIG_X86_64
+/**
+ * idt_setup_early_pf - Initialize the idt table with early pagefault handler
+ *
+ * On X86_64 this does not use interrupt stacks as they can't work before
+ * cpu_init() is invoked and sets up TSS. The IST variant is installed
+ * after that.
+ *
+ * FIXME: Why are 32-bit and 64-bit kernels installing the PF handler at
+ * different places in the early setup code?
+ */
+void __init idt_setup_early_pf(void)
+{
+   idt_setup_from_table(idt_table, early_pf_idts,
+ARRAY_SIZE(early_pf_idts));
+}
+#endif
+
+/**
  * idt_setup_early_handler - Initializes the idt table with early handlers
  */
 void __init idt_setup_early_handler(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ecab322..30dc84e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -891,7 +891,7 @@ void __init setup_arch(char **cmdline_p)
 */
olpc_ofw_detect();
 
-   early_trap_init();
+   idt_setup_early_traps();
early_cpu_init();
early_ioremap_init();
 
@@ -1162,7 +1162,7 @@ void __init setup_arch(char **cmdline_p)
 
init_mem_mapping();
 
-   early_trap_pf_init();
+   idt_setup_early_pf();
 
/*
 * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 41f4cd3..835c7e8 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -923,33 +923,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, 
long error_code)
 }
 #endif
 
-/* Set of traps needed for early debugging. */
-void __init early_trap_init(void)
-{
-   /*
-* Don't use IST to set DEBUG_STACK as it doesn't work until TSS
-* is ready in cpu_init() <-- trap_init(). Before trap_init(),
-* CPU runs at ring 0 so it is impossible to hit an invalid
-* stack.  Using the original stack works well enough at this
-* early stage. DEBUG_STACK will be equipped after cpu_init() in
-* trap_init().
-*/
-   set_intr_gate(X86_TRAP_DB, debug);
-   /* int3 can be called from all */
-   set_system_intr_gate(X86_TRAP_BP, &int3);
-#ifdef CONFIG_X86_32
-   

[tip:x86/apic] x86/idt: Move debug stack init to table based

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  0a30908b9149b2b332ccf817261125a634765566
Gitweb: http://git.kernel.org/tip/0a30908b9149b2b332ccf817261125a634765566
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:51 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:27 +0200

x86/idt: Move debug stack init to table based

Add the debug_idt init table and make use of it.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064959.006502...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h |  2 ++
 arch/x86/kernel/idt.c   | 23 +++
 arch/x86/kernel/traps.c |  6 +-
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 5a3cdeb..930acd5 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -509,8 +509,10 @@ extern void idt_setup_early_traps(void);
 
 #ifdef CONFIG_X86_64
 extern void idt_setup_early_pf(void);
+extern void idt_setup_debugidt_traps(void);
 #else
 static inline void idt_setup_early_pf(void) { }
+static inline void idt_setup_debugidt_traps(void) { }
 #endif
 
 extern void idt_invalidate(void *addr);
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 64e2211..f5281b8 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -68,6 +68,15 @@ static const __initdata struct idt_data early_idts[] = {
 static const __initdata struct idt_data early_pf_idts[] = {
INTG(X86_TRAP_PF,   page_fault),
 };
+
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
+static const __initdata struct idt_data dbg_idts[] = {
+   INTG(X86_TRAP_DB,   debug),
+   INTG(X86_TRAP_BP,   int3),
+};
 #endif
 
 /* Must be page-aligned because the real IDT is used in a fixmap. */
@@ -82,6 +91,10 @@ struct desc_ptr idt_descr __ro_after_init = {
 /* No need to be aligned, but done to keep all IDTs defined the same way. */
 gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
 
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
 const struct desc_ptr debug_idt_descr = {
.size   = IDT_ENTRIES * 16 - 1,
.address= (unsigned long) debug_idt_table,
@@ -143,6 +156,16 @@ void __init idt_setup_early_pf(void)
idt_setup_from_table(idt_table, early_pf_idts,
 ARRAY_SIZE(early_pf_idts));
 }
+
+/**
+ * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
+ */
+void __init idt_setup_debugidt_traps(void)
+{
+   memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
+
+   idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts));
+}
 #endif
 
 /**
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 835c7e8..1492bf5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -990,9 +990,5 @@ void __init trap_init(void)
 
x86_init.irqs.trap_init();
 
-#ifdef CONFIG_X86_64
-   memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
-   set_nmi_gate(X86_TRAP_DB, &debug);
-   set_nmi_gate(X86_TRAP_BP, &int3);
-#endif
+   idt_setup_debugidt_traps();
 }
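
A side note on the copy above: the magic 16 is sizeof(gate_desc) on 64-bit,
so an equivalent and arguably self-documenting form would be the sketch below
(both tables are gate_desc[IDT_ENTRIES], and arrays decay to pointers, so the
explicit & can go too):

	BUILD_BUG_ON(sizeof(debug_idt_table) != sizeof(idt_table));
	memcpy(debug_idt_table, idt_table, sizeof(idt_table));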


[tip:x86/apic] x86/idt: Hide set_intr_gate()

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  facaa3e3c813848e6b49ee37a42a3688832e63cd
Gitweb: http://git.kernel.org/tip/facaa3e3c813848e6b49ee37a42a3688832e63cd
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:59 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:29 +0200

x86/idt: Hide set_intr_gate()

set_intr_gate() is an internal function of the IDT code. The only user left
is the KVM code which replaces the pagefault handler eventually.

Provide an explicit update_intr_gate() function and make set_intr_gate()
static. While at it replace the magic number 14 in the KVM code with the
proper trap define.

Signed-off-by: Thomas Gleixner 
Acked-by: Paolo Bonzini 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064959.663008...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h |  2 +-
 arch/x86/kernel/idt.c   | 33 -
 arch/x86/kernel/kvm.c   |  2 +-
 3 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 33f84f2..1a2ba36 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -390,7 +390,7 @@ static inline void set_desc_limit(struct desc_struct *desc, 
unsigned long limit)
desc->limit1 = (limit >> 16) & 0xf;
 }
 
-void set_intr_gate(unsigned int n, const void *addr);
+void update_intr_gate(unsigned int n, const void *addr);
 void alloc_intr_gate(unsigned int n, const void *addr);
 
 extern unsigned long used_vectors[];
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index b609eac..61b490c 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -225,6 +225,22 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data 
*t, int size, bool sy
}
 }
 
+static void set_intr_gate(unsigned int n, const void *addr)
+{
+   struct idt_data data;
+
+   BUG_ON(n > 0xFF);
+
+   memset(&data, 0, sizeof(data));
+   data.vector = n;
+   data.addr   = addr;
+   data.segment= __KERNEL_CS;
+   data.bits.type  = GATE_INTERRUPT;
+   data.bits.p = 1;
+
+   idt_setup_from_table(idt_table, &data, 1, false);
+}
+
 /**
  * idt_setup_early_traps - Initialize the idt table with early traps
  *
@@ -336,20 +352,11 @@ void idt_invalidate(void *addr)
load_idt();
 }
 
-void set_intr_gate(unsigned int n, const void *addr)
+void __init update_intr_gate(unsigned int n, const void *addr)
 {
-   struct idt_data data;
-
-   BUG_ON(n > 0xFF);
-
-   memset(&data, 0, sizeof(data));
-   data.vector = n;
-   data.addr   = addr;
-   data.segment= __KERNEL_CS;
-   data.bits.type  = GATE_INTERRUPT;
-   data.bits.p = 1;
-
-   idt_setup_from_table(idt_table, &data, 1, false);
+   if (WARN_ON_ONCE(!test_bit(n, used_vectors)))
+   return;
+   set_intr_gate(n, addr);
 }
 
 void alloc_intr_gate(unsigned int n, const void *addr)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6ed9242..874827b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -455,7 +455,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
 
 static void __init kvm_apf_trap_init(void)
 {
-   set_intr_gate(14, async_page_fault);
+   update_intr_gate(X86_TRAP_PF, async_page_fault);
 }
 
 void __init kvm_guest_init(void)


[tip:x86/apic] x86/idt: Simplify alloc_intr_gate()

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  4447ac1195a845b18f2f427686f116ab77c5b268
Gitweb: http://git.kernel.org/tip/4447ac1195a845b18f2f427686f116ab77c5b268
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:58 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:28 +0200

x86/idt: Simplify alloc_intr_gate()

The only users of alloc_intr_gate() are hypervisors, which both check the
used_vectors bitmap to see whether they have allocated the gate already. Move
that check into alloc_intr_gate() and simplify the users.

Signed-off-by: Thomas Gleixner 
Reviewed-by: Juergen Gross 
Reviewed-by: K. Y. Srinivasan 
Cc: Andy Lutomirski 
Cc: Boris Ostrovsky 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Stephen Hemminger 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064959.580830...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/cpu/mshyperv.c   | 9 ++---
 arch/x86/kernel/idt.c| 6 +++---
 drivers/xen/events/events_base.c | 6 ++
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 70e717f..9fc3265 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -59,13 +59,8 @@ void hyperv_vector_handler(struct pt_regs *regs)
 void hv_setup_vmbus_irq(void (*handler)(void))
 {
vmbus_handler = handler;
-   /*
-* Setup the IDT for hypervisor callback. Prevent reallocation
-* at module reload.
-*/
-   if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
-   alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
-   hyperv_callback_vector);
+   /* Setup the IDT for hypervisor callback */
+   alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
 }
 
 void hv_remove_vmbus_irq(void)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 8e9318d..b609eac 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -354,7 +354,7 @@ void set_intr_gate(unsigned int n, const void *addr)
 
 void alloc_intr_gate(unsigned int n, const void *addr)
 {
-   BUG_ON(test_bit(n, used_vectors) || n < FIRST_SYSTEM_VECTOR);
-   set_bit(n, used_vectors);
-   set_intr_gate(n, addr);
+   BUG_ON(n < FIRST_SYSTEM_VECTOR);
+   if (!test_and_set_bit(n, used_vectors))
+   set_intr_gate(n, addr);
 }
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 2d43118..1ab4bd1 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1653,10 +1653,8 @@ void xen_callback_vector(void)
return;
}
pr_info("Xen HVM callback vector for event delivery is 
enabled\n");
-   /* in the restore case the vector has already been allocated */
-   if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
-   alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
-   xen_hvm_callback_vector);
+   alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
+   xen_hvm_callback_vector);
}
 }
 #else
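
The callers' open-coded test_bit() checks can go away because
test_and_set_bit() makes the allocation idempotent. A user-space sketch of
the same pattern (the bitmap size and the vector number are made up purely
for illustration):

#include <stdbool.h>
#include <stdio.h>

#define NR_VECTORS 256

static unsigned long used[NR_VECTORS / (8 * sizeof(unsigned long))];

/* Non-atomic sketch of test_and_set_bit(); the kernel's version is atomic. */
static bool test_and_set(unsigned long *map, unsigned int n)
{
	unsigned long mask = 1UL << (n % (8 * sizeof(unsigned long)));
	unsigned long *word = &map[n / (8 * sizeof(unsigned long))];
	bool was_set = (*word & mask) != 0;

	*word |= mask;
	return was_set;
}

int main(void)
{
	/* The second call is a harmless no-op, e.g. across a module
	 * reload or the Xen restore path. */
	for (int i = 0; i < 2; i++)
		if (!test_and_set(used, 0xf3))
			printf("vector 0xf3 installed\n");
	return 0;
}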


[tip:x86/apic] x86/irq: Remove vector_used_by_percpu_irq()

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  69de72ec6db950c436e36b94cf05eeb9e11ee144
Gitweb: http://git.kernel.org/tip/69de72ec6db950c436e36b94cf05eeb9e11ee144
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:16 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:20 +0200

x86/irq: Remove vector_used_by_percpu_irq()

Last user (lguest) is gone. Remove it.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064956.201432...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/irq.h |  1 -
 arch/x86/kernel/irq.c  |  2 --
 arch/x86/kernel/irqinit.c  | 12 
 3 files changed, 15 deletions(-)

diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 668cca5..ce99168 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -44,7 +44,6 @@ extern __visible unsigned int do_IRQ(struct pt_regs *regs);
 
 /* Interrupt vector management */
 extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
-extern int vector_used_by_percpu_irq(unsigned int vector);
 
 extern void init_ISA_irqs(void);
 
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 4ed0aba..e6073a0 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -346,8 +346,6 @@ __visible void __irq_entry 
smp_trace_x86_platform_ipi(struct pt_regs *regs)
set_irq_regs(old_regs);
 }
 
-EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
-
 #ifdef CONFIG_HOTPLUG_CPU
 
 /* These two declarations are only used in check_irq_vectors_for_cpu_disable()
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c7fd185..6537cfe 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -55,18 +55,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
 };
 
-int vector_used_by_percpu_irq(unsigned int vector)
-{
-   int cpu;
-
-   for_each_online_cpu(cpu) {
-   if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
-   return 1;
-   }
-
-   return 0;
-}
-
 void __init init_ISA_irqs(void)
 {
struct irq_chip *chip = legacy_pic->chip;


[tip:x86/apic] x86/irq: Remove duplicated used_vectors definition

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  9aec458ff07323f6593fd718cc33b1bca2f64597
Gitweb: http://git.kernel.org/tip/9aec458ff07323f6593fd718cc33b1bca2f64597
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:19 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:21 +0200

x86/irq: Remove duplicated used_vectors definition

Also remove the unparseable comment in the other place while at it.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064956.436711...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h | 1 -
 arch/x86/include/asm/irq.h  | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a7f36ab..71094f2 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -483,7 +483,6 @@ static inline void _set_gate(int gate, unsigned type, void 
*addr,
0, 0, __KERNEL_CS); \
} while (0)
 
-/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
 extern unsigned long used_vectors[];
 
 static inline void alloc_system_vector(int vector)
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ce99168..9958cee 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -42,9 +42,6 @@ extern bool handle_irq(struct irq_desc *desc, struct pt_regs 
*regs);
 
 extern __visible unsigned int do_IRQ(struct pt_regs *regs);
 
-/* Interrupt vector management */
-extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
-
 extern void init_ISA_irqs(void);
 
 #ifdef CONFIG_X86_LOCAL_APIC


[tip:x86/apic] x86/idt: Remove unused set_trap_gate()

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  8f55868f9e42fea56021b17421914b9e4fda4960
Gitweb: http://git.kernel.org/tip/8f55868f9e42fea56021b17421914b9e4fda4960
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:45 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:26 +0200

x86/idt: Remove unused set_trap_gate()

This inline is not used at all.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.522053...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h | 12 
 1 file changed, 12 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 108a9e8..51b3d48 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -446,18 +446,6 @@ static inline void set_system_intr_gate(unsigned int n, 
void *addr)
_set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
 }
 
-static inline void set_system_trap_gate(unsigned int n, void *addr)
-{
-   BUG_ON((unsigned)n > 0xFF);
-   _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
-}
-
-static inline void set_trap_gate(unsigned int n, void *addr)
-{
-   BUG_ON((unsigned)n > 0xFF);
-   _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
-}
-
 static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
 {
BUG_ON((unsigned)n > 0xFF);


[tip:x86/apic] x86/idt: Move early IDT setup out of 32-bit asm

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  87e81786b13b267c4355e0d23e33c7e4c08fa63f
Gitweb: http://git.kernel.org/tip/87e81786b13b267c4355e0d23e33c7e4c08fa63f
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:48 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:26 +0200

x86/idt: Move early IDT setup out of 32-bit asm

The early IDT setup can be done in C code like it's done on 64-bit kernels.
Reuse the 64-bit version.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.757980...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/segment.h |  1 +
 arch/x86/kernel/head32.c   |  4 
 arch/x86/kernel/head_32.S  | 36 ++--
 arch/x86/kernel/idt.c  |  4 
 4 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 5a602d6..066aaf8 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -238,6 +238,7 @@
 #ifndef __ASSEMBLY__
 
 extern const char 
early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
+extern void early_ignore_irq(void);
 
 /*
  * Load a segment. Fall back on loading the zero segment if something goes
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 538ec01..cf2ce06 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -30,6 +31,9 @@ static void __init i386_default_early_setup(void)
 asmlinkage __visible void __init i386_start_kernel(void)
 {
cr4_init_shadow();
+
+   idt_setup_early_handler();
+
	sanitize_boot_params(&boot_params);
 
x86_early_init_platform_quirks();
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index ce8c6ed..a615a5e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -345,7 +345,6 @@ ENTRY(startup_32_smp)
movl %eax,%cr0
 
lgdt early_gdt_descr
-   lidt idt_descr
ljmp $(__KERNEL_CS),$1f
 1: movl $(__KERNEL_DS),%eax# reload all the segment registers
movl %eax,%ss   # after changing gdt.
@@ -378,37 +377,6 @@ ENDPROC(startup_32_smp)
  */
 __INIT
 setup_once:
-   /*
-* Set up a idt with 256 interrupt gates that push zero if there
-* is no error code and then jump to early_idt_handler_common.
-* It doesn't actually load the idt - that needs to be done on
-* each CPU. Interrupts are enabled elsewhere, when we can be
-* relatively sure everything is ok.
-*/
-
-   movl $idt_table,%edi
-   movl $early_idt_handler_array,%eax
-   movl $NUM_EXCEPTION_VECTORS,%ecx
-1:
-   movl %eax,(%edi)
-   movl %eax,4(%edi)
-   /* interrupt gate, dpl=0, present */
-   movl $(0x8E00 + __KERNEL_CS),2(%edi)
-   addl $EARLY_IDT_HANDLER_SIZE,%eax
-   addl $8,%edi
-   loop 1b
-
-   movl $256 - NUM_EXCEPTION_VECTORS,%ecx
-   movl $ignore_int,%edx
-   movl $(__KERNEL_CS << 16),%eax
-   movw %dx,%ax/* selector = 0x0010 = cs */
-   movw $0x8E00,%dx/* interrupt gate - dpl=0, present */
-2:
-   movl %eax,(%edi)
-   movl %edx,4(%edi)
-   addl $8,%edi
-   loop 2b
-
 #ifdef CONFIG_CC_STACKPROTECTOR
/*
 * Configure the stack canary. The linker can't handle this by
@@ -498,7 +466,7 @@ ENDPROC(early_idt_handler_common)
 
 /* This is the default interrupt "handler" :-) */
ALIGN
-ignore_int:
+ENTRY(early_ignore_irq)
cld
 #ifdef CONFIG_PRINTK
pushl %eax
@@ -533,7 +501,7 @@ ignore_int:
 hlt_loop:
hlt
jmp hlt_loop
-ENDPROC(ignore_int)
+ENDPROC(early_ignore_irq)
 __INITDATA
.align 4
 GLOBAL(early_recursion_flag)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index a147581..70ca248 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -34,6 +34,10 @@ void __init idt_setup_early_handler(void)
 
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
set_intr_gate(i, early_idt_handler_array[i]);
+#ifdef CONFIG_X86_32
+   for ( ; i < NR_VECTORS; i++)
+   set_intr_gate(i, early_ignore_irq);
+#endif
+   load_idt(&idt_descr);
 }
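
For reference, the 0x8E00 constant the removed assembly stored into bytes 2-3
of each IDT entry is the gate's attribute word: present, DPL 0, type 0xE
(32-bit interrupt gate). A quick stand-alone decoder, using the same bit
layout as struct idt_bits (ist in bits 0-2, zero 3-7, type 8-12, dpl 13-14,
p 15):

#include <stdio.h>

static void decode_gate_bits(unsigned int v)
{
	printf("ist=%u type=%#x dpl=%u p=%u\n",
	       v & 0x7, (v >> 8) & 0x1f, (v >> 13) & 0x3, (v >> 15) & 1);
}

int main(void)
{
	decode_gate_bits(0x8E00);	/* prints: ist=0 type=0xe dpl=0 p=1 */
	return 0;
}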
 


[tip:x86/apic] x86/idt: Prepare for table based init

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  3318e9744244a415ee9481ca7e54234caf5e12c5
Gitweb: http://git.kernel.org/tip/3318e9744244a415ee9481ca7e54234caf5e12c5
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:49 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:27 +0200

x86/idt: Prepare for table based init

The IDT setup code is handled in several places. All of them use variants
of the set_intr_gate() inlines. This can be done with a table-based
initialization, which makes it possible to reduce the inline zoo and puts
all IDT-related code and information into a single place.

Add the infrastructure.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.849877...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/idt.c | 67 +++
 1 file changed, 67 insertions(+)

diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 70ca248..ae6fc12 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -5,8 +5,49 @@
  */
 #include 
 
+#include 
+#include 
 #include 
 
+struct idt_data {
+   unsigned intvector;
+   unsigned intsegment;
+   struct idt_bits bits;
+   const void  *addr;
+};
+
+#define DPL0   0x0
+#define DPL3   0x3
+
+#define DEFAULT_STACK  0
+
+#define G(_vector, _addr, _ist, _type, _dpl, _segment) \
+   {   \
+   .vector = _vector,  \
+   .bits.ist   = _ist, \
+   .bits.type  = _type,\
+   .bits.dpl   = _dpl, \
+   .bits.p = 1,\
+   .addr   = _addr,\
+   .segment= _segment, \
+   }
+
+/* Interrupt gate */
+#define INTG(_vector, _addr)   \
+   G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate */
+#define SYSG(_vector, _addr)   \
+   G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Interrupt gate with interrupt stack */
+#define ISTG(_vector, _addr, _ist) \
+   G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* Task gate */
+#define TSKG(_vector, _gdt)\
+   G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+
 /* Must be page-aligned because the real IDT is used in a fixmap. */
 gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
 
@@ -25,6 +66,32 @@ const struct desc_ptr debug_idt_descr = {
 };
 #endif
 
+static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
+{
+   unsigned long addr = (unsigned long) d->addr;
+
+   gate->offset_low= (u16) addr;
+   gate->segment   = (u16) d->segment;
+   gate->bits  = d->bits;
+   gate->offset_middle = (u16) (addr >> 16);
+#ifdef CONFIG_X86_64
+   gate->offset_high   = (u32) (addr >> 32);
+   gate->reserved  = 0;
+#endif
+}
+
+static __init void
+idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size)
+{
+   gate_desc desc;
+
+   for (; size > 0; t++, size--) {
+   idt_init_desc(&desc, t);
+   set_bit(t->vector, used_vectors);
+   write_idt_entry(idt, t->vector, &desc);
+   }
+}
+
 /**
  * idt_setup_early_handler - Initializes the idt table with early handlers
  */
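
Putting the macros together, a hypothetical trap table and its installation
would look like the sketch below (the DE/OF vectors and handler names are
used purely as an illustration and are not part of this patch; INTG(), SYSG()
and idt_setup_from_table() are the helpers added above):

static const __initdata struct idt_data example_idts[] = {
	INTG(X86_TRAP_DE,	divide_error),	/* DPL0 interrupt gate */
	SYSG(X86_TRAP_OF,	overflow),	/* DPL3, reachable from user space */
};

void __init idt_setup_example(void)
{
	idt_setup_from_table(idt_table, example_idts, ARRAY_SIZE(example_idts));
}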


[tip:x86/apic] x86/idt: Consolidate IDT invalidation

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  e802a51ede91350438c051da2f238f5e8c918ead
Gitweb: http://git.kernel.org/tip/e802a51ede91350438c051da2f238f5e8c918ead
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:46 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:26 +0200

x86/idt: Consolidate IDT invalidation

kexec and reboot both have code to invalidate the IDT. Create a common
function and use it.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.600953...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h|  3 +++
 arch/x86/kernel/idt.c  | 11 +++
 arch/x86/kernel/machine_kexec_32.c | 14 +-
 arch/x86/kernel/reboot.c   |  4 +---
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 51b3d48..33aff45 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -503,4 +503,7 @@ static inline void load_current_idt(void)
else
		load_idt((const struct desc_ptr *)&idt_descr);
 }
+
+extern void idt_invalidate(void *addr);
+
 #endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 86e5912..cd4658c 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -24,3 +24,14 @@ const struct desc_ptr debug_idt_descr = {
.address= (unsigned long) debug_idt_table,
 };
 #endif
+
+/**
+ * idt_invalidate - Invalidate interrupt descriptor table
+ * @addr:  The virtual address of the 'invalid' IDT
+ */
+void idt_invalidate(void *addr)
+{
+   struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 };
+
+   load_idt(&idt);
+}
diff --git a/arch/x86/kernel/machine_kexec_32.c 
b/arch/x86/kernel/machine_kexec_32.c
index 8c53c5d..00bc751 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -26,18 +26,6 @@
 #include 
 #include 
 
-static void set_idt(void *newidt, __u16 limit)
-{
-   struct desc_ptr curidt;
-
-   /* ia32 supports unaligned loads & stores */
-   curidt.size= limit;
-   curidt.address = (unsigned long)newidt;
-
-   load_idt(&curidt);
-}
-
-
 static void set_gdt(void *newgdt, __u16 limit)
 {
struct desc_ptr curgdt;
@@ -245,7 +233,7 @@ void machine_kexec(struct kimage *image)
 * If you want to load them you must set up your own idt & gdt.
 */
set_gdt(phys_to_virt(0), 0);
-   set_idt(phys_to_virt(0), 0);
+   idt_invalidate(phys_to_virt(0));
 
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a56bf60..54984b1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -38,8 +38,6 @@
 void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
-static const struct desc_ptr no_idt = {};
-
 /*
  * This is set if we need to go through the 'emergency' path.
  * When machine_emergency_restart() is called, we may be on
@@ -638,7 +636,7 @@ static void native_machine_emergency_restart(void)
break;
 
case BOOT_TRIPLE:
-   load_idt(&no_idt);
+   idt_invalidate(NULL);
__asm__ __volatile__("int3");
 
/* We're probably dead after this, but... */
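
Why this reboots the machine: with a zero-limit IDT no vector can be
delivered, so the int3 raises a fault whose gate cannot be read either,
which escalates (roughly #BP -> #GP -> #DF) to a triple fault and a CPU
reset. The whole BOOT_TRIPLE path is just the two lines above, i.e. as a
sketch:

	idt_invalidate(NULL);	/* limit 0: every vectoring attempt faults */
	asm volatile("int3");	/* undeliverable -> triple fault -> reset */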


[tip:x86/apic] x86/idt: Move early IDT handler setup to IDT code

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  588787fde7aa346f345e1a7600f84d88039fc9df
Gitweb: http://git.kernel.org/tip/588787fde7aa346f345e1a7600f84d88039fc9df
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:47 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:26 +0200

x86/idt: Move early IDT handler setup to IDT code

The early IDT handler setup is done in C entry code on 64-bit kernels and in
ASM entry code on 32-bit kernels.

Move the 64-bit variant to the IDT code so it can be shared with 32-bit
in the next step.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.679561...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h |  9 +
 arch/x86/kernel/head64.c|  6 +-
 arch/x86/kernel/idt.c   | 12 
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 33aff45..5a3cdeb 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -504,6 +504,15 @@ static inline void load_current_idt(void)
	load_idt((const struct desc_ptr *)&idt_descr);
 }
 
+extern void idt_setup_early_handler(void);
+extern void idt_setup_early_traps(void);
+
+#ifdef CONFIG_X86_64
+extern void idt_setup_early_pf(void);
+#else
+static inline void idt_setup_early_pf(void) { }
+#endif
+
 extern void idt_invalidate(void *addr);
 
 #endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9ba7954..d6ab034 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -255,8 +255,6 @@ static void __init copy_bootdata(char *real_mode_data)
 
 asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 {
-   int i;
-
/*
 * Build-time sanity checks on the kernel image and module
 * area mappings. (these are purely build-time and produce no code)
@@ -282,9 +280,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * 
real_mode_data)
 
kasan_early_init();
 
-   for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
-   set_intr_gate(i, early_idt_handler_array[i]);
-   load_idt((const struct desc_ptr *)&idt_descr);
+   idt_setup_early_handler();
 
copy_bootdata(__va(real_mode_data));
 
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index cd4658c..a147581 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -26,6 +26,18 @@ const struct desc_ptr debug_idt_descr = {
 #endif
 
 /**
+ * idt_setup_early_handler - Initializes the idt table with early handlers
+ */
+void __init idt_setup_early_handler(void)
+{
+   int i;
+
+   for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
+   set_intr_gate(i, early_idt_handler_array[i]);
+   load_idt(&idt_descr);
+}
+
+/**
  * idt_invalidate - Invalidate interrupt descriptor table
  * @addr:  The virtual address of the 'invalid' IDT
  */


[tip:x86/apic] x86/apic: Remove the duplicated tracing versions of interrupts

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  61069de7a3252be0b1f567fe9e0b4723f1d2814f
Gitweb: http://git.kernel.org/tip/61069de7a3252be0b1f567fe9e0b4723f1d2814f
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:26 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:25 +0200

x86/apic: Remove the duplicated tracing versions of interrupts

The error and the spurious interrupt are really rare events and not at all
performance sensitive: two NOP5s can be tolerated when tracing is disabled.

Remove the complication.

Signed-off-by: Thomas Gleixner 
Reviewed-by: Steven Rostedt (VMware) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20170828064956.986009...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/hw_irq.h |  4 ++--
 arch/x86/kernel/apic/apic.c   | 43 ++-
 2 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index a7e45d1..b094b87 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -48,15 +48,15 @@ extern asmlinkage void call_function_single_interrupt(void);
 
 #ifdef CONFIG_TRACING
 /* Interrupt handlers registered during init_IRQ */
-extern void trace_error_interrupt(void);
 extern void trace_irq_work_interrupt(void);
-extern void trace_spurious_interrupt(void);
 extern void trace_thermal_interrupt(void);
 extern void trace_reschedule_interrupt(void);
 extern void trace_threshold_interrupt(void);
 extern void trace_deferred_error_interrupt(void);
 extern void trace_call_function_interrupt(void);
 extern void trace_call_function_single_interrupt(void);
+#define trace_error_interrupt error_interrupt
+#define trace_spurious_interrupt spurious_interrupt
 #define trace_x86_platform_ipi x86_platform_ipi
 #define trace_apic_timer_interrupt apic_timer_interrupt
 #define trace_irq_move_cleanup_interrupt  irq_move_cleanup_interrupt
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a33fa44..eebee4c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1899,10 +1899,14 @@ void __init register_lapic_address(unsigned long 
address)
 /*
  * This interrupt should _never_ happen with our APIC/SMP architecture
  */
-static void __smp_spurious_interrupt(u8 vector)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
 {
+   u8 vector = ~regs->orig_ax;
u32 v;
 
+   entering_irq();
+   trace_spurious_apic_entry(vector);
+
/*
 * Check if this really is a spurious interrupt and ACK it
 * if it is a vectored one.  Just in case...
@@ -1917,22 +1921,7 @@ static void __smp_spurious_interrupt(u8 vector)
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
pr_info("spurious APIC interrupt through vector %02x on CPU#%d, "
"should never happen.\n", vector, smp_processor_id());
-}
 
-__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
-{
-   entering_irq();
-   __smp_spurious_interrupt(~regs->orig_ax);
-   exiting_irq();
-}
-
-__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
-{
-   u8 vector = ~regs->orig_ax;
-
-   entering_irq();
-   trace_spurious_apic_entry(vector);
-   __smp_spurious_interrupt(vector);
trace_spurious_apic_exit(vector);
exiting_irq();
 }
@@ -1940,10 +1929,8 @@ __visible void __irq_entry 
smp_trace_spurious_interrupt(struct pt_regs *regs)
 /*
  * This interrupt should never happen with our APIC/SMP architecture
  */
-static void __smp_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
 {
-   u32 v;
-   u32 i = 0;
static const char * const error_interrupt_reason[] = {
"Send CS error",/* APIC Error Bit 0 */
"Receive CS error", /* APIC Error Bit 1 */
@@ -1954,6 +1941,10 @@ static void __smp_error_interrupt(struct pt_regs *regs)
"Received illegal vector",  /* APIC Error Bit 6 */
"Illegal register address", /* APIC Error Bit 7 */
};
+   u32 v, i = 0;
+
+   entering_irq();
+   trace_error_apic_entry(ERROR_APIC_VECTOR);
 
/* First tickle the hardware, only then report what went on. -- REW */
if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
@@ -1975,20 +1966,6 @@ static void __smp_error_interrupt(struct pt_regs *regs)
 
apic_printk(APIC_DEBUG, KERN_CONT "\n");
 
-}
-
-__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
-{
-   entering_irq();
-   __smp_error_interrupt(regs);
-   exiting_irq();
-}
-
-__visible void __irq_entry 

[tip:x86/apic] x86/irq: Get rid of duplicated trace_x86_platform_ipi() code

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  8a17116b1fddc1f414cd4dd5e86fa239fcdb5208
Gitweb: http://git.kernel.org/tip/8a17116b1fddc1f414cd4dd5e86fa239fcdb5208
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:25 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:25 +0200

x86/irq: Get rid of duplicated trace_x86_platform_ipi() code

Two NOP5s are really a good tradeoff vs. the unholy IDT switching mess,
which duplicates code all over the place.

Signed-off-by: Thomas Gleixner 
Reviewed-by: Steven Rostedt (VMware) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20170828064956.907209...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/hw_irq.h |  2 +-
 arch/x86/kernel/irq.c | 25 +
 2 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 44137bb..a7e45d1 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -48,7 +48,6 @@ extern asmlinkage void call_function_single_interrupt(void);
 
 #ifdef CONFIG_TRACING
 /* Interrupt handlers registered during init_IRQ */
-extern void trace_x86_platform_ipi(void);
 extern void trace_error_interrupt(void);
 extern void trace_irq_work_interrupt(void);
 extern void trace_spurious_interrupt(void);
@@ -58,6 +57,7 @@ extern void trace_threshold_interrupt(void);
 extern void trace_deferred_error_interrupt(void);
 extern void trace_call_function_interrupt(void);
 extern void trace_call_function_single_interrupt(void);
+#define trace_x86_platform_ipi x86_platform_ipi
 #define trace_apic_timer_interrupt apic_timer_interrupt
 #define trace_irq_move_cleanup_interrupt  irq_move_cleanup_interrupt
 #define trace_reboot_interrupt  reboot_interrupt
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 019d0ac..befdd4a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -262,20 +262,16 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs 
*regs)
 /*
  * Handler for X86_PLATFORM_IPI_VECTOR.
  */
-void __smp_x86_platform_ipi(void)
-{
-   inc_irq_stat(x86_platform_ipis);
-
-   if (x86_platform_ipi_callback)
-   x86_platform_ipi_callback();
-}
-
 __visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
 {
struct pt_regs *old_regs = set_irq_regs(regs);
 
entering_ack_irq();
-   __smp_x86_platform_ipi();
+   trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
+   inc_irq_stat(x86_platform_ipis);
+   if (x86_platform_ipi_callback)
+   x86_platform_ipi_callback();
+   trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
exiting_irq();
set_irq_regs(old_regs);
 }
@@ -334,17 +330,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct 
pt_regs *regs)
 }
 #endif
 
-__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs)
-{
-   struct pt_regs *old_regs = set_irq_regs(regs);
-
-   entering_ack_irq();
-   trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
-   __smp_x86_platform_ipi();
-   trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
-   exiting_irq();
-   set_irq_regs(old_regs);
-}
 
 #ifdef CONFIG_HOTPLUG_CPU
 


[tip:x86/apic] x86/apic: Remove the duplicated tracing version of local_timer_interrupt()

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  302a98f896bbd2feb1393d98e8b9febeb101db6e
Gitweb: http://git.kernel.org/tip/302a98f896bbd2feb1393d98e8b9febeb101db6e
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:23 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:24 +0200

x86/apic: Remove the duplicated tracing version of local_timer_interrupt()

The two NOP5s are noise in the rest of the work done by the timer
interrupt, and modern CPUs are pretty good at optimizing NOPs anyway.

Get rid of the interrupt handler duplication and move the tracepoints into
the regular handler.

Signed-off-by: Thomas Gleixner 
Reviewed-by: Steven Rostedt (VMware) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20170828064956.751247...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/hw_irq.h |  2 +-
 arch/x86/kernel/apic/apic.c   | 19 ---
 2 files changed, 1 insertion(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index d6dbafb..44137bb 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -48,7 +48,6 @@ extern asmlinkage void call_function_single_interrupt(void);
 
 #ifdef CONFIG_TRACING
 /* Interrupt handlers registered during init_IRQ */
-extern void trace_apic_timer_interrupt(void);
 extern void trace_x86_platform_ipi(void);
 extern void trace_error_interrupt(void);
 extern void trace_irq_work_interrupt(void);
@@ -59,6 +58,7 @@ extern void trace_threshold_interrupt(void);
 extern void trace_deferred_error_interrupt(void);
 extern void trace_call_function_interrupt(void);
 extern void trace_call_function_single_interrupt(void);
+#define trace_apic_timer_interrupt apic_timer_interrupt
 #define trace_irq_move_cleanup_interrupt  irq_move_cleanup_interrupt
 #define trace_reboot_interrupt  reboot_interrupt
 #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8996ef1..7a57b54 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1038,25 +1038,6 @@ __visible void __irq_entry 
smp_apic_timer_interrupt(struct pt_regs *regs)
 * interrupt lock, which is the WrongThing (tm) to do.
 */
entering_ack_irq();
-   local_apic_timer_interrupt();
-   exiting_irq();
-
-   set_irq_regs(old_regs);
-}
-
-__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs)
-{
-   struct pt_regs *old_regs = set_irq_regs(regs);
-
-   /*
-* NOTE! We'd better ACK the irq immediately,
-* because timer handling can be slow.
-*
-* update_process_times() expects us to have done irq_enter().
-* Besides, if we don't timer interrupts ignore the global
-* interrupt lock, which is the WrongThing (tm) to do.
-*/
-   entering_ack_irq();
trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
trace_local_timer_exit(LOCAL_TIMER_VECTOR);


[tip:x86/apic] x86/apic: Use this_cpu_ptr() in local_timer_interrupt()

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  3bec6def39e32609e01a68b43476ee1f1c512eaa
Gitweb: http://git.kernel.org/tip/3bec6def39e32609e01a68b43476ee1f1c512eaa
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:24 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 11:42:24 +0200

x86/apic: Use this_cpu_ptr() in local_timer_interrupt()

Accessing the per-CPU data via per_cpu(lapic_events, smp_processor_id()) is
pointless. Use this_cpu_ptr() instead.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064956.829552...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/apic/apic.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7a57b54..a33fa44 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -988,8 +988,7 @@ void setup_secondary_APIC_clock(void)
  */
 static void local_apic_timer_interrupt(void)
 {
-   int cpu = smp_processor_id();
-   struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+   struct clock_event_device *evt = this_cpu_ptr(&lapic_events);
 
/*
 * Normally we should not be here till LAPIC has been initialized but
@@ -1003,7 +1002,8 @@ static void local_apic_timer_interrupt(void)
 * spurious.
 */
if (!evt->event_handler) {
-   pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+   pr_warning("Spurious LAPIC timer interrupt on cpu %d\n",
+  smp_processor_id());
/* Switch it off */
lapic_timer_shutdown(evt);
return;
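
The two forms address the same object; this_cpu_ptr() just resolves the
local CPU's copy via the per-CPU segment offset instead of an explicit
CPU-id lookup. Side by side (a sketch, variable names as in the patch):

	/* before: compute this CPU's id, then index the per-CPU area */
	struct clock_event_device *evt = &per_cpu(lapic_events, smp_processor_id());

	/* after: let the accessor resolve the local copy directly */
	struct clock_event_device *evt = this_cpu_ptr(&lapic_events);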


[tip:x86/apic] x86/asm: Replace access to desc_struct:a/b fields

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  9a98e7780022aa7cd201eb8a88a4f1d607b73cde
Gitweb: http://git.kernel.org/tip/9a98e7780022aa7cd201eb8a88a4f1d607b73cde
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:40 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:25 +0200

x86/asm: Replace access to desc_struct:a/b fields

The union inside of desc_struct allows access to the raw u32 parts of
the descriptors. This raw access is about to go away.

Replace the few code parts which access those fields.

Signed-off-by: Thomas Gleixner 
Reviewed-by: Boris Ostrovsky 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Juergen Gross 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.120214...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/xen/hypercall.h | 6 --
 arch/x86/kernel/tls.c| 2 +-
 arch/x86/xen/enlighten_pv.c  | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/xen/hypercall.h 
b/arch/x86/include/asm/xen/hypercall.h
index 11071fc..9606688 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -552,6 +552,8 @@ static inline void
 MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
struct desc_struct desc)
 {
+   u32 *p = (u32 *) &desc;
+
mcl->op = __HYPERVISOR_update_descriptor;
if (sizeof(maddr) == sizeof(long)) {
mcl->args[0] = maddr;
@@ -559,8 +561,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 
maddr,
} else {
mcl->args[0] = maddr;
mcl->args[1] = maddr >> 32;
-   mcl->args[2] = desc.a;
-   mcl->args[3] = desc.b;
+   mcl->args[2] = *p++;
+   mcl->args[3] = *p;
}
 
trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index dcd699b..a106b97 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -93,7 +93,7 @@ static void set_tls_desc(struct task_struct *p, int idx,
 
while (n-- > 0) {
if (LDT_empty(info) || LDT_zero(info)) {
-   desc->a = desc->b = 0;
+   memset(desc, 0, sizeof(*desc));
} else {
fill_ldt(desc, info);
 
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 4c5d72b..03fb07d 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -494,7 +494,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr 
*dtr)
 static inline bool desc_equal(const struct desc_struct *d1,
  const struct desc_struct *d2)
 {
-   return d1->a == d2->a && d1->b == d2->b;
+   return !memcmp(d1, d2, sizeof(*d1));
 }
 
 static void load_TLS_descriptor(struct thread_struct *t,
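
The replacement pattern above leans on desc_struct being exactly two 32-bit
words. A user-space sketch of extracting those halves without the old .a/.b
union members (the kernel can simply cast because it builds with
-fno-strict-aliasing; memcpy() is the portable equivalent used here):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for the 8-byte descriptor; the real type is all bitfields. */
struct desc_struct { unsigned char raw[8]; };

int main(void)
{
	struct desc_struct desc = { .raw = { 1, 2, 3, 4, 5, 6, 7, 8 } };
	uint32_t lo, hi;	/* what the removed .a/.b members exposed */

	memcpy(&lo, desc.raw, sizeof(lo));
	memcpy(&hi, desc.raw + 4, sizeof(hi));
	printf("a=%#x b=%#x\n", lo, hi);
	return 0;
}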


[tip:x86/apic] x86/gdt: Use bitfields for initialization

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  38e9e81f4c81c75799b002d5811de7241b307676
Gitweb: http://git.kernel.org/tip/38e9e81f4c81c75799b002d5811de7241b307676
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:41 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:25 +0200

x86/gdt: Use bitfields for initialization

The GDT entry related code uses two ways to access entries via
union fields:

 - bitfields

 - macros which initialize the two 16-bit parts of the entry
   by magic shift and mask operations.

Clean it up and only use the bitfields to initialize and access entries.

( The old access patterns were partly done due to GCC optimizing bitfield
  accesses in a horrible way - that's mostly fixed these days and clarity
  of code in such low level accessors is very important. )

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.197673...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/entry/vdso/vma.c|  2 +-
 arch/x86/include/asm/desc.h  | 26 +++-
 arch/x86/include/asm/desc_defs.h | 44 ++--
 arch/x86/math-emu/fpu_system.h   |  2 +-
 4 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 726355c..1911310 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -351,7 +351,7 @@ static void vgetcpu_cpu_init(void *arg)
 * and 8 bits for the node)
 */
d.limit0 = cpu | ((node & 0xf) << 12);
-   d.limit = node >> 4;
+   d.limit1 = node >> 4;
d.type = 5; /* RO data, expand down, accessed */
d.dpl = 3;  /* Visible to user code */
d.s = 1;/* Not a system segment */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 0731064..2090cd2 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -23,7 +23,7 @@ static inline void fill_ldt(struct desc_struct *desc, const 
struct user_desc *in
desc->s = 1;
desc->dpl   = 0x3;
desc->p = info->seg_not_present ^ 1;
-   desc->limit = (info->limit & 0xf0000) >> 16;
+   desc->limit1= (info->limit & 0xf0000) >> 16;
desc->avl   = info->useable;
desc->d = info->seg_32bit;
desc->g = info->limit_in_pages;
@@ -170,14 +170,20 @@ static inline void pack_descriptor(struct desc_struct 
*desc, unsigned long base,
   unsigned long limit, unsigned char type,
   unsigned char flags)
 {
-   desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
-   desc->b = (base & 0xff000000) | ((base & 0x00ff0000) >> 16) |
-   (limit & 0x000f0000) | ((type & 0xff) << 8) |
-   ((flags & 0xf) << 20);
-   desc->p = 1;
+   desc->limit0= (u16) limit;
+   desc->base0 = (u16) base;
+   desc->base1 = (base >> 16) & 0xFF;
+   desc->type  = type & 0x0F;
+   desc->s = 0;
+   desc->dpl   = 0;
+   desc->p = 1;
+   desc->limit1= (limit >> 16) & 0xF;
+   desc->avl   = (flags >> 0) & 0x01;
+   desc->l = (flags >> 1) & 0x01;
+   desc->d = (flags >> 2) & 0x01;
+   desc->g = (flags >> 3) & 0x01;
 }
 
-
 static inline void set_tssldt_descriptor(void *d, unsigned long addr,
 unsigned type, unsigned size)
 {
@@ -195,7 +201,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned 
long addr,
desc->base2 = (addr >> 24) & 0xFF;
desc->base3 = (u32) (addr >> 32);
 #else
-   pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+   pack_descriptor((struct desc_struct *)d, addr, size, type, 0);
 #endif
 }
 
@@ -395,13 +401,13 @@ static inline void set_desc_base(struct desc_struct 
*desc, unsigned long base)
 
 static inline unsigned long get_desc_limit(const struct desc_struct *desc)
 {
-   return desc->limit0 | (desc->limit << 16);
+   return desc->limit0 | (desc->limit1 << 16);
 }
 
 static inline void set_desc_limit(struct desc_struct *desc, unsigned long 
limit)
 {
	desc->limit0 = limit & 0xffff;
-   desc->limit = (limit >> 16) & 0xf;
+   desc->limit1 = (limit >> 16) & 0xf;
 

[tip:x86/apic] x86/ldttss: Clean up 32-bit descriptors

2017-08-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  87cc037674342cbf6213829b2cc59bb71be60777
Gitweb: http://git.kernel.org/tip/87cc037674342cbf6213829b2cc59bb71be60777
Author: Thomas Gleixner 
AuthorDate: Mon, 28 Aug 2017 08:47:42 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Aug 2017 12:07:25 +0200

x86/ldttss: Clean up 32-bit descriptors

Like the IDT descriptors, the LDT/TSS descriptors are pointlessly different
on 32 and 64 bit kernels.

Unify them and get rid of the duplicated code.

Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Link: http://lkml.kernel.org/r/20170828064958.289634...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/desc.h  | 26 +++---
 arch/x86/include/asm/desc_defs.h | 27 ---
 2 files changed, 15 insertions(+), 38 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 2090cd2..108a9e8 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -166,42 +166,22 @@ native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
 memcpy(&gdt[entry], desc, size);
 }
 
-static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
-  unsigned long limit, unsigned char type,
-  unsigned char flags)
-{
-   desc->limit0= (u16) limit;
-   desc->base0 = (u16) base;
-   desc->base1 = (base >> 16) & 0xFF;
-   desc->type  = type & 0x0F;
-   desc->s = 0;
-   desc->dpl   = 0;
-   desc->p = 1;
-   desc->limit1= (limit >> 16) & 0xF;
-   desc->avl   = (flags >> 0) & 0x01;
-   desc->l = (flags >> 1) & 0x01;
-   desc->d = (flags >> 2) & 0x01;
-   desc->g = (flags >> 3) & 0x01;
-}
-
 static inline void set_tssldt_descriptor(void *d, unsigned long addr,
 unsigned type, unsigned size)
 {
-#ifdef CONFIG_X86_64
-   struct ldttss_desc64 *desc = d;
+   struct ldttss_desc *desc = d;
 
memset(desc, 0, sizeof(*desc));
 
-   desc->limit0= size & 0xffff;
+   desc->limit0= (u16) size;
desc->base0 = (u16) addr;
desc->base1 = (addr >> 16) & 0xFF;
desc->type  = type;
desc->p = 1;
desc->limit1= (size >> 16) & 0xF;
desc->base2 = (addr >> 24) & 0xFF;
+#ifdef CONFIG_X86_64
desc->base3 = (u32) (addr >> 32);
-#else
-   pack_descriptor((struct desc_struct *)d, addr, size, type, 0);
 #endif
 }
 
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 1b9494e..346d252 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -49,24 +49,21 @@ enum {
DESCTYPE_S = 0x10,  /* !system */
 };
 
-/* LDT or TSS descriptor in the GDT. 16 bytes. */
-struct ldttss_desc64 {
-   u16 limit0;
-   u16 base0;
-   unsigned base1 : 8, type : 5, dpl : 2, p : 1;
-   unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
-   u32 base3;
-   u32 zero1;
-} __attribute__((packed));
-
+/* LDT or TSS descriptor in the GDT. */
+struct ldttss_desc {
+   u16 limit0;
+   u16 base0;
 
+   u16 base1 : 8, type : 5, dpl : 2, p : 1;
+   u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8;
 #ifdef CONFIG_X86_64
-typedef struct ldttss_desc64 ldt_desc;
-typedef struct ldttss_desc64 tss_desc;
-#else
-typedef struct desc_struct ldt_desc;
-typedef struct desc_struct tss_desc;
+   u32 base3;
+   u32 zero1;
 #endif
+} __attribute__((packed));
+
+typedef struct ldttss_desc ldt_desc;
+typedef struct ldttss_desc tss_desc;
 
 struct idt_bits {
u16 ist : 3,


[tip:irq/urgent] genirq/cpuhotplug: Add sanity check for effective affinity mask

2017-10-09 Thread tip-bot for Thomas Gleixner
Commit-ID:  60b09c51bb4fb46e2331fdbb39f91520f31d35f7
Gitweb: https://git.kernel.org/tip/60b09c51bb4fb46e2331fdbb39f91520f31d35f7
Author: Thomas Gleixner 
AuthorDate: Mon, 9 Oct 2017 12:47:24 +0200
Committer:  Thomas Gleixner 
CommitDate: Mon, 9 Oct 2017 13:26:48 +0200

genirq/cpuhotplug: Add sanity check for effective affinity mask

The effective affinity mask handling has no safety net when the mask is not
updated by the interrupt chip or the mask contains offline CPUs.

If that happens the CPU unplug code fails to migrate interrupts.

Add sanity checks and emit a warning when the mask contains only offline
CPUs.

Fixes: 415fcf1a2293 ("genirq/cpuhotplug: Use effective affinity mask")
Signed-off-by: Thomas Gleixner 
Cc: Marc Zyngier 
Cc: Christoph Hellwig 
Cc: sta...@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710042208400.2406@nanos
---
 kernel/irq/cpuhotplug.c | 28 +++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 638eb9c..9eb09ae 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,8 +18,34 @@
 static inline bool irq_needs_fixup(struct irq_data *d)
 {
const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+   unsigned int cpu = smp_processor_id();
 
-   return cpumask_test_cpu(smp_processor_id(), m);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+   /*
+* The cpumask_empty() check is a workaround for interrupt chips,
+* which do not implement effective affinity, but the architecture has
+* enabled the config switch. Use the general affinity mask instead.
+*/
+   if (cpumask_empty(m))
+   m = irq_data_get_affinity_mask(d);
+
+   /*
+* Sanity check. If the mask is not empty when excluding the outgoing
+* CPU then it must contain at least one online CPU. The outgoing CPU
+* has been removed from the online mask already.
+*/
+   if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
+   cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
+   /*
+* If this happens then there was a missed IRQ fixup at some
+* point. Warn about it and enforce fixup.
+*/
+   pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline 
CPUs after offlining CPU %u\n",
+   cpumask_pr_args(m), d->irq, cpu);
+   return true;
+   }
+#endif
+   return cpumask_test_cpu(cpu, m);
 }
 
 static bool migrate_one_irq(struct irq_desc *desc)


[tip:irq/urgent] genirq/cpuhotplug: Enforce affinity setting on startup of managed irqs

2017-10-09 Thread tip-bot for Thomas Gleixner
Commit-ID:  e43b3b58548051f8809391eb7bec7a27ed3003ea
Gitweb: https://git.kernel.org/tip/e43b3b58548051f8809391eb7bec7a27ed3003ea
Author: Thomas Gleixner 
AuthorDate: Wed, 4 Oct 2017 21:07:38 +0200
Committer:  Thomas Gleixner 
CommitDate: Mon, 9 Oct 2017 13:26:48 +0200

genirq/cpuhotplug: Enforce affinity setting on startup of managed irqs

Managed interrupts can end up in a stale state on CPU hotplug. If the
interrupt is not targeting a single CPU, i.e. the affinity mask spans
multiple CPUs, then the following can happen:

After boot:

dstate:   0x01601200
IRQD_ACTIVATED
IRQD_IRQ_STARTED
IRQD_SINGLE_TARGET
IRQD_AFFINITY_SET
IRQD_AFFINITY_MANAGED
node: 0
affinity: 24-31
effectiv: 24
pending:  0

After offlining CPU 31 - 24

dstate:   0x01a31000
IRQD_IRQ_DISABLED
IRQD_IRQ_MASKED
IRQD_SINGLE_TARGET
IRQD_AFFINITY_SET
IRQD_AFFINITY_MANAGED
IRQD_MANAGED_SHUTDOWN
node: 0
affinity: 24-31
effectiv: 24
pending:  0

Now CPU 25 gets onlined again, so it should get the effective interrupt
affinity for this interrupt, but due to the x86 interrupt affinity setter
restrictions this ends up after restarting the interrupt with:

dstate:   0x01601300
IRQD_ACTIVATED
IRQD_IRQ_STARTED
IRQD_SINGLE_TARGET
IRQD_AFFINITY_SET
IRQD_SETAFFINITY_PENDING
IRQD_AFFINITY_MANAGED
node: 0
affinity: 24-31
effectiv: 24
pending:  24-31

So the interrupt is still affine to CPU 24, which was the last CPU to go
offline of that affinity set and the move to an online CPU within 24-31,
in this case 25, is pending. This mechanism is x86/ia64 specific as those
architectures cannot move interrupts from thread context and do this when
an interrupt is actually handled. So the move is set to pending.

What's worse is that offlining CPU 25 again results in:

dstate:   0x01601300
IRQD_ACTIVATED
IRQD_IRQ_STARTED
IRQD_SINGLE_TARGET
IRQD_AFFINITY_SET
IRQD_SETAFFINITY_PENDING
IRQD_AFFINITY_MANAGED
node: 0
affinity: 24-31
effectiv: 24
pending:  24-31

This means the interrupt has not been shut down, because the outgoing CPU
is not in the effective affinity mask, but of course nothing notices that
the effective affinity mask is pointing at an offline CPU.

In the case of restarting a managed interrupt the move restriction does not
apply, so the affinity setting can be made unconditional. This needs to be
done _before_ the interrupt is started up as otherwise the condition for
moving it from thread context would no longer be fulfilled.

With that change applied onlining CPU 25 after offlining 31-24 results in:

dstate:   0x01600200
IRQD_ACTIVATED
IRQD_IRQ_STARTED
IRQD_SINGLE_TARGET
IRQD_AFFINITY_MANAGED
node: 0
affinity: 24-31
effectiv: 25
pending:  

And after offlining CPU 25:

dstate:   0x01a3
IRQD_IRQ_DISABLED
IRQD_IRQ_MASKED
IRQD_SINGLE_TARGET
IRQD_AFFINITY_MANAGED
IRQD_MANAGED_SHUTDOWN
node: 0
affinity: 24-31
effectiv: 25
pending:  

which is the correct and expected result.

Fixes: 761ea388e8c4 ("genirq: Handle managed irqs gracefully in irq_startup()")
Reported-by: YASUAKI ISHIMATSU 
Signed-off-by: Thomas Gleixner 
Cc: ax...@kernel.dk
Cc: linux-s...@vger.kernel.org
Cc: Sumit Saxena 
Cc: Marc Zyngier 
Cc: m...@ellerman.id.au
Cc: Shivasharan Srikanteshwara 
Cc: Kashyap Desai 
Cc: keith.bu...@intel.com
Cc: pet...@infradead.org
Cc: Hannes Reinecke 
Cc: Christoph Hellwig 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710042208400.2406@nanos

---
 kernel/irq/chip.c   | 2 +-
 kernel/irq/manage.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6fc89fd..5a2ef92c 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
+   irq_do_set_affinity(d, aff, false);
ret = __irq_startup(desc);
-   irq_set_affinity_locked(d, aff, false);
break;
case IRQ_STARTUP_ABORT:
return 0;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ef89f72..4bff6a1 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -188,6 +188,9 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,

[tip:x86/apic] genirq: Add config option for reservation mode

2017-10-18 Thread tip-bot for Thomas Gleixner
Commit-ID:  2b5175c4fa974b6aa05bbd2ee8d443a8036a1714
Gitweb: https://git.kernel.org/tip/2b5175c4fa974b6aa05bbd2ee8d443a8036a1714
Author: Thomas Gleixner 
AuthorDate: Tue, 17 Oct 2017 09:54:57 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 18 Oct 2017 15:38:30 +0200

genirq: Add config option for reservation mode

The interrupt reservation mode requires reactivation of PCI/MSI
interrupts. Create a config option, so the PCI code can set the
corresponding flag when required.

Signed-off-by: Thomas Gleixner 
Cc: Josh Poulson 
Cc: Mihai Costache 
Cc: Stephen Hemminger 
Cc: Marc Zyngier 
Cc: linux-...@vger.kernel.org
Cc: Haiyang Zhang 
Cc: Dexuan Cui 
Cc: Simon Xiao 
Cc: Saeed Mahameed 
Cc: Jork Loeser 
Cc: Bjorn Helgaas 
Cc: de...@linuxdriverproject.org
Cc: KY Srinivasan 
Link: https://lkml.kernel.org/r/20171017075600.369375...@linutronix.de

---
 kernel/irq/Kconfig | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index ac1a3e2..89e3558 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -100,6 +100,9 @@ config IRQ_TIMINGS
 config GENERIC_IRQ_MATRIX_ALLOCATOR
bool
 
+config GENERIC_IRQ_RESERVATION_MODE
+   bool
+
 config IRQ_DOMAIN_DEBUG
bool "Expose hardware/virtual IRQ mapping via debugfs"
depends on IRQ_DOMAIN && DEBUG_FS


[tip:x86/apic] x86/vector/msi: Select CONFIG_GENERIC_IRQ_RESERVATION_MODE

2017-10-18 Thread tip-bot for Thomas Gleixner
Commit-ID:  c201c91799d687c0a6d8c3272950f51aad5ffebe
Gitweb: https://git.kernel.org/tip/c201c91799d687c0a6d8c3272950f51aad5ffebe
Author: Thomas Gleixner 
AuthorDate: Tue, 17 Oct 2017 09:54:59 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 18 Oct 2017 15:38:31 +0200

x86/vector/msi: Select CONFIG_GENERIC_IRQ_RESERVATION_MODE

Select CONFIG_GENERIC_IRQ_RESERVATION_MODE so PCI/MSI domains get the
MSI_FLAG_MUST_REACTIVATE flag set in pci_msi_create_irq_domain().

Remove the explicit setters of this flag in the apic/msi code as they are
no longer required.

Fixes: 4900be83602b ("x86/vector/msi: Switch to global reservation mode")
Reported-and-tested-by: Dexuan Cui 
Signed-off-by: Thomas Gleixner 
Cc: Josh Poulson 
Cc: Mihai Costache 
Cc: Stephen Hemminger 
Cc: Marc Zyngier 
Cc: linux-...@vger.kernel.org
Cc: Haiyang Zhang 
Cc: Simon Xiao 
Cc: Saeed Mahameed 
Cc: Jork Loeser 
Cc: Bjorn Helgaas 
Cc: de...@linuxdriverproject.org
Cc: KY Srinivasan 
Link: https://lkml.kernel.org/r/20171017075600.527569...@linutronix.de

---
 arch/x86/Kconfig   | 1 +
 arch/x86/kernel/apic/msi.c | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 64e99d3..ea4beda 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -95,6 +95,7 @@ config X86
select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC
select GENERIC_IRQ_MIGRATIONif SMP
select GENERIC_IRQ_PROBE
+   select GENERIC_IRQ_RESERVATION_MODE
select GENERIC_IRQ_SHOW
select GENERIC_PENDING_IRQ  if SMP
select GENERIC_SMP_IDLE_THREAD
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 5b6dd1a..9b18be7 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -129,7 +129,7 @@ static struct msi_domain_ops pci_msi_domain_ops = {
 
 static struct msi_domain_info pci_msi_domain_info = {
.flags  = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_PCI_MSIX | MSI_FLAG_MUST_REACTIVATE,
+ MSI_FLAG_PCI_MSIX,
 .ops= &pci_msi_domain_ops,
 .chip   = &pci_msi_controller,
.handler= handle_edge_irq,
@@ -167,8 +167,7 @@ static struct irq_chip pci_msi_ir_controller = {
 
 static struct msi_domain_info pci_msi_ir_domain_info = {
.flags  = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX |
- MSI_FLAG_MUST_REACTIVATE,
+ MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
 .ops= &pci_msi_domain_ops,
 .chip   = &pci_msi_ir_controller,
.handler= handle_edge_irq,


[tip:x86/apic] PCI/MSI: Set MSI_FLAG_MUST_REACTIVATE in core code

2017-10-18 Thread tip-bot for Thomas Gleixner
Commit-ID:  25e960efc63852b84d1c3739aef586285b177395
Gitweb: https://git.kernel.org/tip/25e960efc63852b84d1c3739aef586285b177395
Author: Thomas Gleixner 
AuthorDate: Tue, 17 Oct 2017 09:54:58 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 18 Oct 2017 15:38:31 +0200

PCI/MSI: Set MSI_FLAG_MUST_REACTIVATE in core code

If interrupt reservation mode is enabled then the PCI/MSI interrupts must
be reactivated after early activation.

Make sure that all callers of pci_msi_create_irq_domain() have the
MSI_FLAG_MUST_REACTIVATE set when reservation mode is enabled.

Signed-off-by: Thomas Gleixner 
Cc: Josh Poulson 
Cc: Mihai Costache 
Cc: Stephen Hemminger 
Cc: Marc Zyngier 
Cc: linux-...@vger.kernel.org
Cc: Haiyang Zhang 
Cc: Dexuan Cui 
Cc: Simon Xiao 
Cc: Saeed Mahameed 
Cc: Jork Loeser 
Cc: Bjorn Helgaas 
Cc: de...@linuxdriverproject.org
Cc: KY Srinivasan 
Link: https://lkml.kernel.org/r/20171017075600.448649...@linutronix.de

---
 drivers/pci/msi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 496ed91..e066071 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1441,6 +1441,8 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
pci_msi_domain_update_chip_ops(info);
 
info->flags |= MSI_FLAG_ACTIVATE_EARLY;
+   if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE))
+   info->flags |= MSI_FLAG_MUST_REACTIVATE;
 
domain = msi_create_irq_domain(fwnode, info, parent);
if (!domain)


[tip:x86/fpu] x86/cpuid: Prevent out of bound access in do_clear_cpu_cap()

2017-10-18 Thread tip-bot for Thomas Gleixner
Commit-ID:  57b8b1a1856adaa849d02d547411a553a531022b
Gitweb: https://git.kernel.org/tip/57b8b1a1856adaa849d02d547411a553a531022b
Author: Thomas Gleixner 
AuthorDate: Wed, 18 Oct 2017 19:39:35 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 18 Oct 2017 20:03:34 +0200

x86/cpuid: Prevent out of bound access in do_clear_cpu_cap()

do_clear_cpu_cap() allocates a bitmap to keep track of disabled feature
dependencies. That bitmap is sized NCAPINTS * 32 bits. The possible
'features' which can be handed in are larger than this, because after the
capabilities the bug 'feature' bits occupy another 32 bits. Not really
obvious...

So clearing any of the misfeature bits, as 32bit does for the F00F bug,
accesses that bitmap out of bounds, thereby corrupting the stack.

Size the bitmap proper and add a sanity check to catch accidental out of
bound access.
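
For reference, the bug bits live past the capability words, so their bit
numbers always exceed NCAPINTS * 32. A sketch of the assumed layout (the
macro names follow the scheme in arch/x86/include/asm/cpufeatures.h):

  /*
   * Bug bits are appended after the capability words, so any bug bit
   * number is >= NCAPINTS * 32 and overflows a bitmap sized only for
   * the capability bits.
   */
  #define X86_BUG(x)		(NCAPINTS*32 + (x))
  #define X86_BUG_F00F		X86_BUG(0)	/* Intel F00F bug */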

Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies")
Reported-by: kernel test robot 
Signed-off-by: Thomas Gleixner 
Cc: Andi Kleen 
Cc: Borislav Petkov 
Link: https://lkml.kernel.org/r/20171018022023.GA12058@yexl-desktop
---
 arch/x86/kernel/cpu/cpuid-deps.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index e48eb73..c1d4984 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -75,11 +75,17 @@ static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
__clear_cpu_cap(c, feature);
 }
 
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
 static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
 {
-   bool changed;
-   DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8);
+   DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
const struct cpuid_dep *d;
+   bool changed;
+
+   if (WARN_ON(feature >= MAX_FEATURE_BITS))
+   return;
 
clear_feature(c, feature);
 


[tip:sched/urgent] sched/debug: Fix task state recording/printout

2017-11-24 Thread tip-bot for Thomas Gleixner
Commit-ID:  3f5fe9fef5b2da06b6319fab8123056da5217c3f
Gitweb: https://git.kernel.org/tip/3f5fe9fef5b2da06b6319fab8123056da5217c3f
Author: Thomas Gleixner 
AuthorDate: Wed, 22 Nov 2017 13:05:48 +0100
Committer:  Ingo Molnar 
CommitDate: Fri, 24 Nov 2017 08:39:12 +0100

sched/debug: Fix task state recording/printout

The recent conversion of the task state recording to use task_state_index()
broke the sched_switch tracepoint task state output.

task_state_index() surprisingly returns an index (0-7), which is then
printed with __print_flags() applying bitmasks. That does not work and
results in weird states like 'prev_state=t' instead of 'prev_state=I'.

Use TASK_REPORT_MAX instead of TASK_STATE_MAX to report preemption. Build a
bitmask from the return value of task_state_index() and store it in
entry->prev_state, which makes __print_flags() work as expected.

Signed-off-by: Thomas Gleixner 
Cc: Linus Torvalds 
Cc: Paul E. McKenney 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Cc: sta...@vger.kernel.org
Fixes: efb40f588b43 ("sched/tracing: Fix trace_sched_switch task-state printing")
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1711221304180.1751@nanos
Signed-off-by: Ingo Molnar 
---
 include/trace/events/sched.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 306b31d..bc01e06 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -116,9 +116,9 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
 * RUNNING (we will not have dequeued if state != RUNNING).
 */
if (preempt)
-   return TASK_STATE_MAX;
+   return TASK_REPORT_MAX;
 
-   return task_state_index(p);
+   return 1 << task_state_index(p);
 }
 #endif /* CREATE_TRACE_POINTS */
 
@@ -164,7 +164,7 @@ TRACE_EVENT(sched_switch,
{ 0x40, "P" }, { 0x80, "I" }) :
  "R",
 
-   __entry->prev_state & TASK_STATE_MAX ? "+" : "",
+   __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
 


[tip:irq/core] irq/work: Use llist_for_each_entry_safe

2017-11-12 Thread tip-bot for Thomas Gleixner
Commit-ID:  d00a08cf9ee986ad6689ce8c6fd176aff679c106
Gitweb: https://git.kernel.org/tip/d00a08cf9ee986ad6689ce8c6fd176aff679c106
Author: Thomas Gleixner 
AuthorDate: Sun, 12 Nov 2017 13:02:51 +0100
Committer:  Thomas Gleixner 
CommitDate: Sun, 12 Nov 2017 13:15:14 +0100

irq/work: Use llist_for_each_entry_safe

The llist_for_each_entry() loop in irq_work_run_list() is unsafe because
once the works PENDING bit is cleared it can be requeued on another CPU.

Use llist_for_each_entry_safe() instead.

Fixes: 16c0890dc66d ("irq/work: Don't reinvent the wheel but use existing llist API")
Reported-by: Chris Wilson 
Signed-off-by: Thomas Gleixner 
Cc: Frederic Weisbecker 
Cc: Byungchul Park 
Cc: Peter Zijlstra 
Cc: Petri Latvala 
Link: http://lkml.kernel.org/r/151027307351.14762.461196020658...@mail.alporthouse.com
---
 kernel/irq_work.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index e2ebe8c..6647b33f 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -128,9 +128,9 @@ bool irq_work_needs_cpu(void)
 
 static void irq_work_run_list(struct llist_head *list)
 {
-   unsigned long flags;
-   struct irq_work *work;
+   struct irq_work *work, *tmp;
struct llist_node *llnode;
+   unsigned long flags;
 
BUG_ON(!irqs_disabled());
 
@@ -138,7 +138,7 @@ static void irq_work_run_list(struct llist_head *list)
return;
 
llnode = llist_del_all(list);
-   llist_for_each_entry(work, llnode, llnode) {
+   llist_for_each_entry_safe(work, tmp, llnode, llnode) {
/*
 * Clear the PENDING bit, after this point the @work
 * can be re-used.


[tip:core/urgent] watchdog/harclockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy")

2017-11-01 Thread tip-bot for Thomas Gleixner
Commit-ID:  1c294733b7b9f712f78d15cfa75ffdea72b79abb
Gitweb: https://git.kernel.org/tip/1c294733b7b9f712f78d15cfa75ffdea72b79abb
Author: Thomas Gleixner 
AuthorDate: Tue, 31 Oct 2017 22:32:00 +0100
Committer:  Thomas Gleixner 
CommitDate: Wed, 1 Nov 2017 20:41:27 +0100

watchdog/harclockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy")

Guenter reported a crash in the watchdog/perf code, which is caused by
cleanup() and enable() running concurrently. The reason for this is:

The watchdog functions are serialized via the watchdog_mutex and cpu
hotplug locking, but the enable of the perf based watchdog happens in
context of the unpark callback of the smpboot thread. But that unpark
function is not synchronous inside the locking. The unparking of the thread
just wakes it up and leaves so there is no guarantee when the thread is
executing.

If it starts running _before_ the cleanup happened then it will create an
event and overwrite the dead event pointer. The new event is then cleaned
up because the event is marked dead.

lock(watchdog_mutex);
lockup_detector_reconfigure();
  cpus_read_lock();
  stop();
     park()
  update();
  start();
     unpark()
  cpus_read_unlock();             thread runs()
                                    overwrite dead event ptr
  cleanup();
                                    free new event, which is active inside perf
unlock(watchdog_mutex);

The park side is safe as that actually waits for the thread to reach
parked state.

Commit a33d44843d45 removed the protection against this kind of scenario
under the stupid assumption that the hotplug serialization and the
watchdog_mutex cover everything. 

Bring it back.

Reverts: a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy")
Reported-and-tested-by: Guenter Roeck 
Signed-off-by: Thomas Feels-stupid Gleixner 
Cc: Peter Zijlstra 
Cc: Don Zickus 
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710312145190.1942@nanos

---
 kernel/watchdog_hld.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 71a62ce..f8db56b 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -21,6 +21,7 @@
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(struct perf_event *, dead_event);
 static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
@@ -203,6 +204,8 @@ void hardlockup_detector_perf_disable(void)
 
if (event) {
perf_event_disable(event);
+   this_cpu_write(watchdog_ev, NULL);
+   this_cpu_write(dead_event, event);
 cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
watchdog_cpus--;
}
@@ -218,7 +221,7 @@ void hardlockup_detector_perf_cleanup(void)
int cpu;
 
 for_each_cpu(cpu, &dead_events_mask) {
-   struct perf_event *event = per_cpu(watchdog_ev, cpu);
+   struct perf_event *event = per_cpu(dead_event, cpu);
 
/*
 * Required because for_each_cpu() reports  unconditionally
@@ -226,7 +229,7 @@ void hardlockup_detector_perf_cleanup(void)
 */
if (event)
perf_event_release_kernel(event);
-   per_cpu(watchdog_ev, cpu) = NULL;
+   per_cpu(dead_event_ev, cpu) = NULL;
}
 cpumask_clear(&dead_events_mask);
 }


[tip:core/urgent] watchdog/harclockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy")

2017-11-01 Thread tip-bot for Thomas Gleixner
Commit-ID:  9c388a5ed1960b2ebbebd3dbe7553092b0c15ec1
Gitweb: https://git.kernel.org/tip/9c388a5ed1960b2ebbebd3dbe7553092b0c15ec1
Author: Thomas Gleixner 
AuthorDate: Tue, 31 Oct 2017 22:32:00 +0100
Committer:  Thomas Gleixner 
CommitDate: Wed, 1 Nov 2017 21:18:39 +0100

watchdog/harclockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy")

Guenter reported a crash in the watchdog/perf code, which is caused by
cleanup() and enable() running concurrently. The reason for this is:

The watchdog functions are serialized via the watchdog_mutex and cpu
hotplug locking, but the enable of the perf based watchdog happens in
context of the unpark callback of the smpboot thread. But that unpark
function is not synchronous inside the locking. The unparking of the thread
just wakes it up and leaves so there is no guarantee when the thread is
executing.

If it starts running _before_ the cleanup happened then it will create an
event and overwrite the dead event pointer. The new event is then cleaned
up because the event is marked dead.

lock(watchdog_mutex);
lockup_detector_reconfigure();
  cpus_read_lock();
  stop();
     park()
  update();
  start();
     unpark()
  cpus_read_unlock();             thread runs()
                                    overwrite dead event ptr
  cleanup();
                                    free new event, which is active inside perf
unlock(watchdog_mutex);

The park side is safe as that actually waits for the thread to reach
parked state.

Commit a33d44843d45 removed the protection against this kind of scenario
under the stupid assumption that the hotplug serialization and the
watchdog_mutex cover everything. 

Bring it back.

Reverts: a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy")
Reported-and-tested-by: Guenter Roeck 
Signed-off-by: Thomas Feels-stupid Gleixner 
Cc: Peter Zijlstra 
Cc: Don Zickus 
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710312145190.1942@nanos


---
 kernel/watchdog_hld.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 71a62ce..a7f137c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -21,6 +21,7 @@
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(struct perf_event *, dead_event);
 static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
@@ -203,6 +204,8 @@ void hardlockup_detector_perf_disable(void)
 
if (event) {
perf_event_disable(event);
+   this_cpu_write(watchdog_ev, NULL);
+   this_cpu_write(dead_event, event);
 cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
watchdog_cpus--;
}
@@ -218,7 +221,7 @@ void hardlockup_detector_perf_cleanup(void)
int cpu;
 
 for_each_cpu(cpu, &dead_events_mask) {
-   struct perf_event *event = per_cpu(watchdog_ev, cpu);
+   struct perf_event *event = per_cpu(dead_event, cpu);
 
/*
 * Required because for_each_cpu() reports  unconditionally
@@ -226,7 +229,7 @@ void hardlockup_detector_perf_cleanup(void)
 */
if (event)
perf_event_release_kernel(event);
-   per_cpu(watchdog_ev, cpu) = NULL;
+   per_cpu(dead_event, cpu) = NULL;
}
 cpumask_clear(&dead_events_mask);
 }


[tip:smp/urgent] cpu/hotplug: Reset node state after operation

2017-10-21 Thread tip-bot for Thomas Gleixner
Commit-ID:  1f7c70d6b2bc5de301f30456621e1161fddf4242
Gitweb: https://git.kernel.org/tip/1f7c70d6b2bc5de301f30456621e1161fddf4242
Author: Thomas Gleixner 
AuthorDate: Sat, 21 Oct 2017 16:06:52 +0200
Committer:  Thomas Gleixner 
CommitDate: Sat, 21 Oct 2017 16:11:30 +0200

cpu/hotplug: Reset node state after operation

The recent rework of the cpu hotplug internals changed the usage of the per
cpu state->node field, but failed to clean it up after use.

So subsequent hotplug operations use the stale pointer from a previous
operation and hand it into the callback functions. The callbacks then
dereference a pointer which either belongs to a different facility or
points to freed and potentially reused memory. In either case data
corruption and crashes are the obvious consequence.

Reset the node and the last pointers in the per cpu state to NULL after the
operation which set them has completed.

Fixes: 96abb968549c ("smp/hotplug: Allow external multi-instance rollback")
Reported-by: Tvrtko Ursulin 
Signed-off-by: Thomas Gleixner 
Cc: Peter Zijlstra 
Cc: Sebastian Andrzej Siewior 
Cc: Boris Ostrovsky 
Cc: "Paul E. McKenney" 
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710211606130.3213@nanos

---
 kernel/cpu.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index d851df2..04892a8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -632,6 +632,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
__cpuhp_kick_ap(st);
}
 
+   /*
+* Clean up the leftovers so the next hotplug operation won't use stale
+* data.
+*/
+   st->node = st->last = NULL;
return ret;
 }
 


[tip:locking/core] stop_machine: Use raw spinlocks

2018-04-27 Thread tip-bot for Thomas Gleixner
Commit-ID:  de5b55c1d4e30740009864eb35ce4ed856aac01d
Gitweb: https://git.kernel.org/tip/de5b55c1d4e30740009864eb35ce4ed856aac01d
Author: Thomas Gleixner 
AuthorDate: Mon, 23 Apr 2018 21:16:35 +0200
Committer:  Thomas Gleixner 
CommitDate: Fri, 27 Apr 2018 14:34:51 +0200

stop_machine: Use raw spinlocks

Use raw-locks in stop_machine() to allow locking in irq-off and
preempt-disabled regions on -RT. This also documents the possible locking
context in general.

[bigeasy: update patch description.]
Signed-off-by: Thomas Gleixner 
Signed-off-by: Sebastian Andrzej Siewior 
Link: https://lkml.kernel.org/r/20180423191635.6014-1-bige...@linutronix.de

---
 kernel/stop_machine.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b7591261652d..c25ba18274fb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -36,7 +36,7 @@ struct cpu_stop_done {
 struct cpu_stopper {
struct task_struct  *thread;
 
-   spinlock_t  lock;
+   raw_spinlock_t  lock;
boolenabled;/* is this stopper enabled? */
struct list_headworks;  /* list of pending works */
 
@@ -78,13 +78,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
unsigned long flags;
bool enabled;
 
-   spin_lock_irqsave(&stopper->lock, flags);
+   raw_spin_lock_irqsave(&stopper->lock, flags);
enabled = stopper->enabled;
if (enabled)
__cpu_stop_queue_work(stopper, work);
else if (work->done)
cpu_stop_signal_done(work->done);
-   spin_unlock_irqrestore(&stopper->lock, flags);
+   raw_spin_unlock_irqrestore(&stopper->lock, flags);
 
return enabled;
 }
@@ -231,8 +231,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
int err;
 retry:
-   spin_lock_irq(&stopper1->lock);
-   spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
+   raw_spin_lock_irq(&stopper1->lock);
+   raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
 
err = -ENOENT;
if (!stopper1->enabled || !stopper2->enabled)
@@ -255,8 +255,8 @@ retry:
__cpu_stop_queue_work(stopper1, work1);
__cpu_stop_queue_work(stopper2, work2);
 unlock:
-   spin_unlock(&stopper2->lock);
-   spin_unlock_irq(&stopper1->lock);
+   raw_spin_unlock(&stopper2->lock);
+   raw_spin_unlock_irq(&stopper1->lock);
 
if (unlikely(err == -EDEADLK)) {
while (stop_cpus_in_progress)
@@ -448,9 +448,9 @@ static int cpu_stop_should_run(unsigned int cpu)
unsigned long flags;
int run;
 
-   spin_lock_irqsave(&stopper->lock, flags);
+   raw_spin_lock_irqsave(&stopper->lock, flags);
 run = !list_empty(&stopper->works);
-   spin_unlock_irqrestore(&stopper->lock, flags);
+   raw_spin_unlock_irqrestore(&stopper->lock, flags);
return run;
 }
 
@@ -461,13 +461,13 @@ static void cpu_stopper_thread(unsigned int cpu)
 
 repeat:
work = NULL;
-   spin_lock_irq(&stopper->lock);
+   raw_spin_lock_irq(&stopper->lock);
 if (!list_empty(&stopper->works)) {
 work = list_first_entry(&stopper->works,
 struct cpu_stop_work, list);
 list_del_init(&work->list);
 }
-   spin_unlock_irq(&stopper->lock);
+   raw_spin_unlock_irq(&stopper->lock);
 
if (work) {
cpu_stop_fn_t fn = work->fn;
@@ -541,7 +541,7 @@ static int __init cpu_stop_init(void)
for_each_possible_cpu(cpu) {
 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
 
-   spin_lock_init(&stopper->lock);
+   raw_spin_lock_init(&stopper->lock);
 INIT_LIST_HEAD(&stopper->works);
}
 


[tip:x86/urgent] x86/apic/x2apic: Initialize cluster ID properly

2018-05-17 Thread tip-bot for Thomas Gleixner
Commit-ID:  fed71f7d98795ed0fa1d431910787f0f4a68324f
Gitweb: https://git.kernel.org/tip/fed71f7d98795ed0fa1d431910787f0f4a68324f
Author: Thomas Gleixner 
AuthorDate: Thu, 17 May 2018 14:36:39 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 17 May 2018 21:00:12 +0200

x86/apic/x2apic: Initialize cluster ID properly

Rick bisected a regression on large systems which use the x2apic cluster
mode for interrupt delivery to the commit which reworked the cluster
management.

The problem is caused by a missing initialization of the clusterid field
in the shared cluster data structures. So all structures end up with
cluster ID 0 which only allows sharing between all CPUs which belong to
cluster 0. All other CPUs with a cluster ID > 0 cannot share the data
structure because they cannot find existing data with their cluster
ID. This causes malfunction with IPIs because IPIs are sent to the wrong
cluster and the caller waits forever for the target CPU to handle the IPI.

Add the missing initialization when an upcoming CPU is the first in a
cluster so that the later booting CPUs can find the data and share it for
proper operation.
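
For context, the shared per-cluster structure whose ID was left at zero
looks roughly like this (a sketch; field names assumed from the cluster
management rework in arch/x86/kernel/apic/x2apic_cluster.c):

  struct cluster_mask {
	unsigned int	clusterid;	/* was never set for hotplug-allocated masks */
	int		node;
	struct cpumask	mask;
  };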

Fixes: 023a611748fd ("x86/apic/x2apic: Simplify cluster management")
Reported-by: Rick Warner 
Bisected-by: Rick Warner 
Signed-off-by: Thomas Gleixner 
Tested-by: Rick Warner 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.deb.2.21.1805171418210.1...@nanos.tec.linutronix.de
---
 arch/x86/kernel/apic/x2apic_cluster.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8b04234e010b..7685444a106b 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -116,6 +116,7 @@ static void init_x2apic_ldr(void)
goto update;
}
cmsk = cluster_hotplug_mask;
+   cmsk->clusterid = cluster;
cluster_hotplug_mask = NULL;
 update:
this_cpu_write(cluster_masks, cmsk);


[tip:x86/urgent] x86/apic/vector: Print APIC control bits in debugfs

2018-06-06 Thread tip-bot for Thomas Gleixner
Commit-ID:  a07771ac6a78860777a9da5d9bc38830ec993fe7
Gitweb: https://git.kernel.org/tip/a07771ac6a78860777a9da5d9bc38830ec993fe7
Author: Thomas Gleixner 
AuthorDate: Mon, 4 Jun 2018 17:34:00 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 6 Jun 2018 15:18:22 +0200

x86/apic/vector: Print APIC control bits in debugfs

Extend the debugability of the vector management by adding the state bits
to the debugfs output.

Signed-off-by: Thomas Gleixner 
Tested-by: Song Liu 
Cc: Joerg Roedel 
Cc: Peter Zijlstra 
Cc: Song Liu 
Cc: Dmitry Safonov <0x7f454...@gmail.com>
Cc: Mike Travis 
Cc: Borislav Petkov 
Cc: Tariq Toukan 
Link: https://lkml.kernel.org/r/20180604162224.908136...@linutronix.de

---
 arch/x86/kernel/apic/vector.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b708f597eee3..35aaee4fc028 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -588,8 +588,7 @@ error:
 static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
  struct irq_data *irqd, int ind)
 {
-   unsigned int cpu, vector, prev_cpu, prev_vector;
-   struct apic_chip_data *apicd;
+   struct apic_chip_data apicd;
unsigned long flags;
int irq;
 
@@ -605,24 +604,26 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
return;
}
 
-   apicd = irqd->chip_data;
-   if (!apicd) {
+   if (!irqd->chip_data) {
seq_printf(m, "%*sVector: Not assigned\n", ind, "");
return;
}
 
 raw_spin_lock_irqsave(&vector_lock, flags);
-   cpu = apicd->cpu;
-   vector = apicd->vector;
-   prev_cpu = apicd->prev_cpu;
-   prev_vector = apicd->prev_vector;
+   memcpy(&apicd, irqd->chip_data, sizeof(apicd));
 raw_spin_unlock_irqrestore(&vector_lock, flags);
-   seq_printf(m, "%*sVector: %5u\n", ind, "", vector);
-   seq_printf(m, "%*sTarget: %5u\n", ind, "", cpu);
-   if (prev_vector) {
-   seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", 
prev_vector);
-   seq_printf(m, "%*sPrevious target: %5u\n", ind, "", prev_cpu);
+
+   seq_printf(m, "%*sVector: %5u\n", ind, "", apicd.vector);
+   seq_printf(m, "%*sTarget: %5u\n", ind, "", apicd.cpu);
+   if (apicd.prev_vector) {
+   seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", apicd.prev_vector);
+   seq_printf(m, "%*sPrevious target: %5u\n", ind, "", apicd.prev_cpu);
 }
+   seq_printf(m, "%*smove_in_progress: %u\n", ind, "", apicd.move_in_progress ? 1 : 0);
+   seq_printf(m, "%*sis_managed:   %u\n", ind, "", apicd.is_managed ? 1 : 0);
+   seq_printf(m, "%*scan_reserve:  %u\n", ind, "", apicd.can_reserve ? 1 : 0);
+   seq_printf(m, "%*shas_reserved: %u\n", ind, "", apicd.has_reserved ? 1 : 0);
+   seq_printf(m, "%*scleanup_pending:  %u\n", ind, "", !hlist_unhashed(&apicd.clist));
 }
 #endif
 


[tip:x86/urgent] genirq/affinity: Defer affinity setting if irq chip is busy

2018-06-06 Thread tip-bot for Thomas Gleixner
Commit-ID:  12f47073a40f6aa75119d8f5df4077b7f334cced
Gitweb: https://git.kernel.org/tip/12f47073a40f6aa75119d8f5df4077b7f334cced
Author: Thomas Gleixner 
AuthorDate: Mon, 4 Jun 2018 17:33:59 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 6 Jun 2018 15:18:22 +0200

genirq/affinity: Defer affinity setting if irq chip is busy

The case that interrupt affinity setting fails with -EBUSY can be handled
in the kernel completely by using the already available generic pending
infrastructure.

If an irq_chip::set_affinity() fails with -EBUSY, handle it like the
interrupts for which irq_chip::set_affinity() can only be invoked from
interrupt context. Copy the new affinity mask to irq_desc::pending_mask and
set the affinity pending bit. The next raised interrupt for the affected
irq will check the pending bit and try to set the new affinity from the
handler. This avoids returning -EBUSY when an affinity change is
requested from user space and the previous change has not been cleaned
up yet. The new affinity will take effect when the next interrupt is raised
from the device.

Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup")
Signed-off-by: Thomas Gleixner 
Tested-by: Song Liu 
Cc: Joerg Roedel 
Cc: Peter Zijlstra 
Cc: Song Liu 
Cc: Dmitry Safonov <0x7f454...@gmail.com>
Cc: sta...@vger.kernel.org
Cc: Mike Travis 
Cc: Borislav Petkov 
Cc: Tariq Toukan 
Link: https://lkml.kernel.org/r/20180604162224.819273...@linutronix.de

---
 kernel/irq/manage.c | 37 +++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e3336d904f64..facfecfc543c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -204,6 +204,39 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
return ret;
 }
 
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline int irq_set_affinity_pending(struct irq_data *data,
+  const struct cpumask *dest)
+{
+   struct irq_desc *desc = irq_data_to_desc(data);
+
+   irqd_set_move_pending(data);
+   irq_copy_pending(desc, dest);
+   return 0;
+}
+#else
+static inline int irq_set_affinity_pending(struct irq_data *data,
+  const struct cpumask *dest)
+{
+   return -EBUSY;
+}
+#endif
+
+static int irq_try_set_affinity(struct irq_data *data,
+   const struct cpumask *dest, bool force)
+{
+   int ret = irq_do_set_affinity(data, dest, force);
+
+   /*
+* In case that the underlying vector management is busy and the
+* architecture supports the generic pending mechanism then utilize
+* this to avoid returning an error to user space.
+*/
+   if (ret == -EBUSY && !force)
+   ret = irq_set_affinity_pending(data, dest);
+   return ret;
+}
+
 int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
bool force)
 {
@@ -214,8 +247,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
 
-   if (irq_can_move_pcntxt(data)) {
-   ret = irq_do_set_affinity(data, mask, force);
+   if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
+   ret = irq_try_set_affinity(data, mask, force);
} else {
irqd_set_move_pending(data);
irq_copy_pending(desc, mask);


[tip:x86/urgent] genirq/generic_pending: Do not lose pending affinity update

2018-06-06 Thread tip-bot for Thomas Gleixner
Commit-ID:  a33a5d2d16cb84bea8d5f5510f3a41aa48b5c467
Gitweb: https://git.kernel.org/tip/a33a5d2d16cb84bea8d5f5510f3a41aa48b5c467
Author: Thomas Gleixner 
AuthorDate: Mon, 4 Jun 2018 17:33:54 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 6 Jun 2018 15:18:19 +0200

genirq/generic_pending: Do not lose pending affinity update

The generic pending interrupt mechanism moves interrupts from the interrupt
handler on the original target CPU to the new destination CPU. This is
required for x86 and ia64 due to the way the interrupt delivery and
acknowledge works if the interrupts are not remapped.

However that update can fail for various reasons. Some of them are valid
reasons to discard the pending update, but the case when the previous move
has not been fully cleaned up is not a legitimate reason to fail.

Check the return value of irq_do_set_affinity() for -EBUSY, which indicates
a pending cleanup, and rearm the pending move in the irq descriptor so it's
tried again when the next interrupt arrives.

Fixes: 996c591227d9 ("x86/irq: Plug vector cleanup race")
Signed-off-by: Thomas Gleixner 
Tested-by: Song Liu 
Cc: Joerg Roedel 
Cc: Peter Zijlstra 
Cc: Song Liu 
Cc: Dmitry Safonov <0x7f454...@gmail.com>
Cc: sta...@vger.kernel.org
Cc: Mike Travis 
Cc: Borislav Petkov 
Cc: Tariq Toukan 
Link: https://lkml.kernel.org/r/20180604162224.386544...@linutronix.de

---
 kernel/irq/migration.c | 26 +++---
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 86ae0eb80b53..8b8cecd18cce 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -38,17 +38,18 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
 void irq_move_masked_irq(struct irq_data *idata)
 {
struct irq_desc *desc = irq_data_to_desc(idata);
-   struct irq_chip *chip = desc->irq_data.chip;
+   struct irq_data *data = &desc->irq_data;
+   struct irq_chip *chip = data->chip;
 
-   if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
+   if (likely(!irqd_is_setaffinity_pending(data)))
return;
 
-   irqd_clr_move_pending(&desc->irq_data);
+   irqd_clr_move_pending(data);
 
/*
 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
 */
-   if (irqd_is_per_cpu(&desc->irq_data)) {
+   if (irqd_is_per_cpu(data)) {
WARN_ON(1);
return;
}
@@ -73,9 +74,20 @@ void irq_move_masked_irq(struct irq_data *idata)
 * For correct operation this depends on the caller
 * masking the irqs.
 */
-   if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
-   irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
-
+   if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) {
+   int ret;
+
+   ret = irq_do_set_affinity(data, desc->pending_mask, false);
+   /*
+* If there is a cleanup pending in the underlying
+* vector management, reschedule the move for the next
+* interrupt. Leave desc->pending_mask intact.
+*/
+   if (ret == -EBUSY) {
+   irqd_set_move_pending(data);
+   return;
+   }
+   }
cpumask_clear(desc->pending_mask);
 }
 


[tip:x86/urgent] irq_remapping: Use apic_ack_irq()

2018-06-06 Thread tip-bot for Thomas Gleixner
Commit-ID:  8a2b7d142e7ac477d52f5f92251e59fc136d7ddd
Gitweb: https://git.kernel.org/tip/8a2b7d142e7ac477d52f5f92251e59fc136d7ddd
Author: Thomas Gleixner 
AuthorDate: Mon, 4 Jun 2018 17:33:56 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 6 Jun 2018 15:18:20 +0200

irq_remapping: Use apic_ack_irq()

To address the -EBUSY failure of interrupt affinity settings in case the
previous setting has not been cleaned up yet, use the new apic_ack_irq()
function instead of the special ir_ack_apic_edge() implementation which is
merely a wrapper around ack_APIC_irq().

Preparatory change for the real fix.

Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup")
Signed-off-by: Thomas Gleixner 
Tested-by: Song Liu 
Cc: Joerg Roedel 
Cc: Peter Zijlstra 
Cc: Song Liu 
Cc: Dmitry Safonov <0x7f454...@gmail.com>
Cc: sta...@vger.kernel.org
Cc: Mike Travis 
Cc: Borislav Petkov 
Cc: Tariq Toukan 
Link: https://lkml.kernel.org/r/20180604162224.555716...@linutronix.de

---
 drivers/iommu/amd_iommu.c   | 2 +-
 drivers/iommu/intel_irq_remapping.c | 2 +-
 drivers/iommu/irq_remapping.c   | 5 -
 drivers/iommu/irq_remapping.h   | 2 --
 4 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 8fb8c737fffe..b0b30a568db7 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -4379,7 +4379,7 @@ static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
 
 static struct irq_chip amd_ir_chip = {
.name   = "AMD-IR",
-   .irq_ack= ir_ack_apic_edge,
+   .irq_ack= apic_ack_irq,
.irq_set_affinity   = amd_ir_set_affinity,
.irq_set_vcpu_affinity  = amd_ir_set_vcpu_affinity,
.irq_compose_msi_msg= ir_compose_msi_msg,
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index 3062a154a9fb..967450bd421a 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -1223,7 +1223,7 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
 
 static struct irq_chip intel_ir_chip = {
.name   = "INTEL-IR",
-   .irq_ack= ir_ack_apic_edge,
+   .irq_ack= apic_ack_irq,
.irq_set_affinity   = intel_ir_set_affinity,
.irq_compose_msi_msg= intel_ir_compose_msi_msg,
.irq_set_vcpu_affinity  = intel_ir_set_vcpu_affinity,
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 496deee3ae3a..7d0f3074d41d 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -156,11 +156,6 @@ void panic_if_irq_remap(const char *msg)
panic(msg);
 }
 
-void ir_ack_apic_edge(struct irq_data *data)
-{
-   ack_APIC_irq();
-}
-
 /**
 * irq_remapping_get_ir_irq_domain - Get the irqdomain associated with the IOMMU
  *  device serving request @info
diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
index 039c7af7b190..0afef6e43be4 100644
--- a/drivers/iommu/irq_remapping.h
+++ b/drivers/iommu/irq_remapping.h
@@ -65,8 +65,6 @@ struct irq_remap_ops {
 extern struct irq_remap_ops intel_irq_remap_ops;
 extern struct irq_remap_ops amd_iommu_irq_ops;
 
-extern void ir_ack_apic_edge(struct irq_data *data);
-
 #else  /* CONFIG_IRQ_REMAP */
 
 #define irq_remapping_enabled 0


[tip:x86/urgent] x86/apic: Provide apic_ack_irq()

2018-06-06 Thread tip-bot for Thomas Gleixner
Commit-ID:  c0255770ccdc77ef2184d2a0a2e0cde09d2b44a4
Gitweb: https://git.kernel.org/tip/c0255770ccdc77ef2184d2a0a2e0cde09d2b44a4
Author: Thomas Gleixner 
AuthorDate: Mon, 4 Jun 2018 17:33:55 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 6 Jun 2018 15:18:20 +0200

x86/apic: Provide apic_ack_irq()

apic_ack_edge() is explicitly for handling interrupt affinity cleanup when
interrupt remapping is not available or disabled.

Remapped interrupts and also some of the platform specific special
interrupts, e.g. UV, invoke ack_APIC_irq() directly.

To address the issue of failing an affinity update with -EBUSY the delayed
affinity mechanism can be reused, but ack_APIC_irq() does not handle
that. Adding this to ack_APIC_irq() is not possible, because that function
is also used for exceptions and directly handled interrupts like IPIs.

Create a new function, which just contains the conditional invocation of
irq_move_irq() and the final ack_APIC_irq().

Reuse the new function in apic_ack_edge().

Preparatory change for the real fix.

Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup")
Signed-off-by: Thomas Gleixner 
Tested-by: Song Liu 
Cc: Joerg Roedel 
Cc: Peter Zijlstra 
Cc: Song Liu 
Cc: Dmitry Safonov <0x7f454...@gmail.com>
Cc: sta...@vger.kernel.org
Cc: Mike Travis 
Cc: Borislav Petkov 
Cc: Tariq Toukan 
Link: https://lkml.kernel.org/r/20180604162224.471925...@linutronix.de

---
 arch/x86/include/asm/apic.h   | 2 ++
 arch/x86/kernel/apic/vector.c | 9 +++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 08acd954f00e..74a9e06b6cfd 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -436,6 +436,8 @@ static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+extern void apic_ack_irq(struct irq_data *data);
+
 static inline void ack_APIC_irq(void)
 {
/*
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 72b575a0b662..b708f597eee3 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -809,13 +809,18 @@ static int apic_retrigger_irq(struct irq_data *irqd)
return 1;
 }
 
-void apic_ack_edge(struct irq_data *irqd)
+void apic_ack_irq(struct irq_data *irqd)
 {
-   irq_complete_move(irqd_cfg(irqd));
irq_move_irq(irqd);
ack_APIC_irq();
 }
 
+void apic_ack_edge(struct irq_data *irqd)
+{
+   irq_complete_move(irqd_cfg(irqd));
+   apic_ack_irq(irqd);
+}
+
 static struct irq_chip lapic_controller = {
.name   = "APIC",
.irq_ack= apic_ack_edge,


[tip:x86/urgent] genirq/migration: Avoid out of line call if pending is not set

2018-06-06 Thread tip-bot for Thomas Gleixner
Commit-ID:  d340ebd696f921d3ad01b8c0c29dd38f2ad2bf3e
Gitweb: https://git.kernel.org/tip/d340ebd696f921d3ad01b8c0c29dd38f2ad2bf3e
Author: Thomas Gleixner 
AuthorDate: Wed, 6 Jun 2018 14:46:59 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 6 Jun 2018 15:18:20 +0200

genirq/migration: Avoid out of line call if pending is not set

The upcoming fix for the -EBUSY return from affinity settings requires
using the irq_move_irq() functionality even on irq remapped interrupts. To
avoid the out of line call, move the check for the pending bit into an
inline helper.

Preparatory change for the real fix. No functional change.

Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup")
Signed-off-by: Thomas Gleixner 
Cc: Joerg Roedel 
Cc: Peter Zijlstra 
Cc: Song Liu 
Cc: Dmitry Safonov <0x7f454...@gmail.com>
Cc: sta...@vger.kernel.org
Cc: Mike Travis 
Cc: Borislav Petkov 
Cc: Tariq Toukan 
Cc: Dou Liyang 
Link: https://lkml.kernel.org/r/20180604162224.471925...@linutronix.de

---
 include/linux/irq.h| 7 ++-
 kernel/irq/migration.c | 5 +
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 65916a305f3d..4e66378f290b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -551,7 +551,12 @@ extern int irq_affinity_online_cpu(unsigned int cpu);
 #endif
 
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
-void irq_move_irq(struct irq_data *data);
+void __irq_move_irq(struct irq_data *data);
+static inline void irq_move_irq(struct irq_data *data)
+{
+   if (unlikely(irqd_is_setaffinity_pending(data)))
+   __irq_move_irq(data);
+}
 void irq_move_masked_irq(struct irq_data *data);
 void irq_force_complete_move(struct irq_desc *desc);
 #else
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 8b8cecd18cce..def48589ea48 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -91,7 +91,7 @@ void irq_move_masked_irq(struct irq_data *idata)
cpumask_clear(desc->pending_mask);
 }
 
-void irq_move_irq(struct irq_data *idata)
+void __irq_move_irq(struct irq_data *idata)
 {
bool masked;
 
@@ -102,9 +102,6 @@ void irq_move_irq(struct irq_data *idata)
 */
idata = irq_desc_get_irq_data(irq_data_to_desc(idata));
 
-   if (likely(!irqd_is_setaffinity_pending(idata)))
-   return;
-
if (unlikely(irqd_irq_disabled(idata)))
return;
 


[tip:x86/urgent] x86/ioapic: Use apic_ack_irq()

2018-06-06 Thread tip-bot for Thomas Gleixner
Commit-ID:  2b04e46d8d0b9b7ac08ded672e3eab823f01d77a
Gitweb: https://git.kernel.org/tip/2b04e46d8d0b9b7ac08ded672e3eab823f01d77a
Author: Thomas Gleixner 
AuthorDate: Mon, 4 Jun 2018 17:33:57 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 6 Jun 2018 15:18:21 +0200

x86/ioapic: Use apic_ack_irq()

To address the -EBUSY failure of interrupt affinity settings in case the
previous setting has not been cleaned up yet, use the new apic_ack_irq()
function instead of directly invoking ack_APIC_irq().

Preparatory change for the real fix.

Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup")
Signed-off-by: Thomas Gleixner 
Tested-by: Song Liu 
Cc: Joerg Roedel 
Cc: Peter Zijlstra 
Cc: Song Liu 
Cc: Dmitry Safonov <0x7f454...@gmail.com>
Cc: sta...@vger.kernel.org
Cc: Mike Travis 
Cc: Borislav Petkov 
Cc: Tariq Toukan 
Link: https://lkml.kernel.org/r/20180604162224.639011...@linutronix.de

---
 arch/x86/kernel/apic/io_apic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7553819c74c3..3982f79d2377 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1851,7 +1851,7 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data)
 * intr-remapping table entry. Hence for the io-apic
 * EOI we use the pin number.
 */
-   ack_APIC_irq();
+   apic_ack_irq(irq_data);
eoi_ioapic_pin(data->entry.vector, data);
 }
 


[tip:x86/urgent] x86/platform/uv: Use apic_ack_irq()

2018-06-06 Thread tip-bot for Thomas Gleixner
Commit-ID:  839b0f1c4ef674cd929a42304c078afca278581a
Gitweb: https://git.kernel.org/tip/839b0f1c4ef674cd929a42304c078afca278581a
Author: Thomas Gleixner 
AuthorDate: Mon, 4 Jun 2018 17:33:58 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 6 Jun 2018 15:18:21 +0200

x86/platform/uv: Use apic_ack_irq()

To address the -EBUSY failure of interrupt affinity settings in case the
previous setting has not been cleaned up yet, use the new apic_ack_irq()
function instead of the special uv_ack_apic() implementation which is
merely a wrapper around ack_APIC_irq().

Preparatory change for the real fix.

Fixes: dccfe3147b42 ("x86/vector: Simplify vector move cleanup")
Reported-by: Song Liu 
Signed-off-by: Thomas Gleixner 
Tested-by: Song Liu 
Cc: Joerg Roedel 
Cc: Peter Zijlstra 
Cc: Dmitry Safonov <0x7f454...@gmail.com>
Cc: sta...@vger.kernel.org
Cc: Mike Travis 
Cc: Borislav Petkov 
Cc: Tariq Toukan 
Link: https://lkml.kernel.org/r/20180604162224.721691...@linutronix.de

---
 arch/x86/platform/uv/uv_irq.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index e4cb9f4cde8a..fc13cbbb2dce 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -47,11 +47,6 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info)
 
 static void uv_noop(struct irq_data *data) { }
 
-static void uv_ack_apic(struct irq_data *data)
-{
-   ack_APIC_irq();
-}
-
 static int
 uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
@@ -73,7 +68,7 @@ static struct irq_chip uv_irq_chip = {
.name   = "UV-CORE",
.irq_mask   = uv_noop,
.irq_unmask = uv_noop,
-   .irq_eoi= uv_ack_apic,
+   .irq_eoi= apic_ack_irq,
.irq_set_affinity   = uv_set_irq_affinity,
 };
 


[tip:x86/urgent] x86/apic/vector: Prevent hlist corruption and leaks

2018-06-06 Thread tip-bot for Thomas Gleixner
Commit-ID:  80ae7b1a918e78b0bae88b0c0ad413d3fdced968
Gitweb: https://git.kernel.org/tip/80ae7b1a918e78b0bae88b0c0ad413d3fdced968
Author: Thomas Gleixner 
AuthorDate: Mon, 4 Jun 2018 17:33:53 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 6 Jun 2018 15:18:19 +0200

x86/apic/vector: Prevent hlist corruption and leaks

Several people observed the WARN_ON() in irq_matrix_free() which triggers
when the caller tries to free a vector which is not in the allocation
range. Song provided the trace information which allowed to decode the root
cause.

The rework of the vector allocation mechanism failed to preserve a sanity
check, which prevents setting a new target vector/CPU when the previous
affinity change has not fully completed.

As a result a half-finished affinity change can be overwritten, which can
cause the leak of an irq descriptor pointer on the previous target CPU and
double enqueue of the hlist head into the cleanup lists of two or more
CPUs. After one CPU cleaned up its vector the next CPU will invoke the
cleanup handler with vector 0, which triggers the out of range warning in
the matrix allocator.

Prevent this by checking in the apic_data of the interrupt that the
move_in_progress flag is false and the hlist node is not hashed. Return
-EBUSY otherwise.

This prevents the damage and restores the behaviour before the vector
allocation rework, but due to other changes in that area it also widens the
chance that user space can observe -EBUSY. In theory this should be fine,
but in practice not all user space tools handle -EBUSY correctly. Addressing
that is not part of this fix; follow-up patches will take care of it.

Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment")
Reported-by: Dmitry Safonov <0x7f454...@gmail.com>
Reported-by: Tariq Toukan 
Reported-by: Song Liu 
Signed-off-by: Thomas Gleixner 
Tested-by: Song Liu 
Cc: Joerg Roedel 
Cc: Peter Zijlstra 
Cc: sta...@vger.kernel.org
Cc: Mike Travis 
Cc: Borislav Petkov 
Link: https://lkml.kernel.org/r/20180604162224.303870...@linutronix.de

---
 arch/x86/kernel/apic/vector.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index bb6f7a2148d7..72b575a0b662 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -235,6 +235,15 @@ static int allocate_vector(struct irq_data *irqd, const 
struct cpumask *dest)
if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest))
return 0;
 
+   /*
+* Careful here. @apicd might either have move_in_progress set or
+* be enqueued for cleanup. Assigning a new vector would either
+* leave a stale vector on some CPU around or in case of a pending
+* cleanup corrupt the hlist.
+*/
+   if (apicd->move_in_progress || !hlist_unhashed(>clist))
+   return -EBUSY;
+
vector = irq_matrix_alloc(vector_matrix, dest, resvd, );
if (vector > 0)
apic_update_vector(irqd, vector, cpu);


[tip:timers/core] posix-timers: Sanitize overrun handling

2018-07-02 Thread tip-bot for Thomas Gleixner
Commit-ID:  78c9c4dfbf8c04883941445a195276bb4bb92c76
Gitweb: https://git.kernel.org/tip/78c9c4dfbf8c04883941445a195276bb4bb92c76
Author: Thomas Gleixner 
AuthorDate: Tue, 26 Jun 2018 15:21:32 +0200
Committer:  Thomas Gleixner 
CommitDate: Mon, 2 Jul 2018 11:33:25 +0200

posix-timers: Sanitize overrun handling

The posix timer overrun handling is broken because the forwarding functions
can return a huge number of overruns which does not fit in an int. As a
consequence timer_getoverrun(2) and siginfo::si_overrun can turn into
random number generators.

The k_clock::timer_forward() callbacks return a 64 bit value now. Make
k_itimer::it_overrun[_last] 64 bit as well, so the kernel internal
accounting is correct. Remove the temporary (int) casts.

Add a helper function which clamps the overrun value returned to user space
via timer_getoverrun(2) or siginfo::si_overrun to a positive value between
0 and INT_MAX. INT_MAX is an indicator for user space that the overrun
value has been clamped.

Reported-by: Team OWL337 
Signed-off-by: Thomas Gleixner 
Acked-by: John Stultz 
Cc: Peter Zijlstra 
Cc: Michael Kerrisk 
Link: https://lkml.kernel.org/r/20180626132705.018623...@linutronix.de

---
 include/linux/posix-timers.h   |  4 ++--
 kernel/time/posix-cpu-timers.c |  2 +-
 kernel/time/posix-timers.c | 31 ---
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index c85704fcdbd2..ee7e987ea1b4 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -95,8 +95,8 @@ struct k_itimer {
clockid_t   it_clock;
timer_t it_id;
int it_active;
-   int it_overrun;
-   int it_overrun_last;
+   s64 it_overrun;
+   s64 it_overrun_last;
int it_requeue_pending;
int it_sigev_notify;
ktime_t it_interval;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 5a6251ac6f7a..562cc3891b57 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -85,7 +85,7 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now)
continue;
 
timer->it.cpu.expires += incr;
-   timer->it_overrun += 1 << i;
+   timer->it_overrun += 1LL << i;
delta -= incr;
}
 }
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index db1d65963a57..3ac7295306dc 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -283,6 +283,17 @@ static __init int init_posix_timers(void)
 }
 __initcall(init_posix_timers);
 
+/*
+ * The siginfo si_overrun field and the return value of timer_getoverrun(2)
+ * are of type int. Clamp the overrun value to INT_MAX
+ */
+static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval)
+{
+   s64 sum = timr->it_overrun_last + (s64)baseval;
+
+   return sum > (s64)INT_MAX ? INT_MAX : (int)sum;
+}
+
 static void common_hrtimer_rearm(struct k_itimer *timr)
 {
struct hrtimer *timer = >it.real.timer;
@@ -290,9 +301,8 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
if (!timr->it_interval)
return;
 
-   timr->it_overrun += (unsigned int) hrtimer_forward(timer,
-   timer->base->get_time(),
-   timr->it_interval);
+   timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
+   timr->it_interval);
hrtimer_restart(timer);
 }
 
@@ -321,10 +331,10 @@ void posixtimer_rearm(struct siginfo *info)
 
timr->it_active = 1;
timr->it_overrun_last = timr->it_overrun;
-   timr->it_overrun = -1;
+   timr->it_overrun = -1LL;
++timr->it_requeue_pending;
 
-   info->si_overrun += timr->it_overrun_last;
+   info->si_overrun = timer_overrun_to_int(timr, info->si_overrun);
}
 
unlock_timer(timr, flags);
@@ -418,9 +428,8 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer 
*timer)
now = ktime_add(now, kj);
}
 #endif
-   timr->it_overrun += (unsigned int)
-   hrtimer_forward(timer, now,
-   timr->it_interval);
+   timr->it_overrun += hrtimer_forward(timer, now,
+   timr->it_interval);
ret = HRTIMER_RESTART;
++timr->it_requeue_pending;
timr->it_active = 1;
@@ -524,7 +533,7 @@ static int 

[tip:timers/core] alarmtimer: Prevent overflow for relative nanosleep

2018-07-02 Thread tip-bot for Thomas Gleixner
Commit-ID:  5f936e19cc0ef97dbe3a56e9498922ad5ba1edef
Gitweb: https://git.kernel.org/tip/5f936e19cc0ef97dbe3a56e9498922ad5ba1edef
Author: Thomas Gleixner 
AuthorDate: Mon, 2 Jul 2018 09:34:29 +0200
Committer:  Thomas Gleixner 
CommitDate: Mon, 2 Jul 2018 11:33:26 +0200

alarmtimer: Prevent overflow for relative nanosleep

Air Icy reported:

  UBSAN: Undefined behaviour in kernel/time/alarmtimer.c:811:7
  signed integer overflow:
  1529859276030040771 + 9223372036854775807 cannot be represented in type 'long 
long int'
  Call Trace:
   alarm_timer_nsleep+0x44c/0x510 kernel/time/alarmtimer.c:811
   __do_sys_clock_nanosleep kernel/time/posix-timers.c:1235 [inline]
   __se_sys_clock_nanosleep kernel/time/posix-timers.c:1213 [inline]
   __x64_sys_clock_nanosleep+0x326/0x4e0 kernel/time/posix-timers.c:1213
   do_syscall_64+0xb8/0x3a0 arch/x86/entry/common.c:290

alarm_timer_nsleep() uses ktime_add() to add the current time and the
relative expiry value. ktime_add() has no sanity checks so the addition
can overflow when the relative timeout is large enough.

Use ktime_add_safe() which has the necessary sanity checks in place and
limits the result to the valid range.
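
For illustration, a sketch of the saturating helper, modelled on
ktime_add_safe() in kernel/time/hrtimer.c:

ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
	ktime_t res = ktime_add_unsafe(lhs, rhs);

	/*
	 * Saturate at KTIME_SEC_MAX seconds instead of letting the
	 * signed 64 bit addition wrap around.
	 */
	if (res < 0 || res < lhs || res < rhs)
		res = ktime_set(KTIME_SEC_MAX, 0);

	return res;
}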

Fixes: 9a7adcf5c6de ("timers: Posix interface for alarm-timers")
Reported-by: Team OWL337 
Signed-off-by: Thomas Gleixner 
Cc: John Stultz 
Link: 
https://lkml.kernel.org/r/alpine.deb.2.21.1807020926360.1...@nanos.tec.linutronix.de

---
 kernel/time/alarmtimer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 78a3cc555823..fa5de5e8de61 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -808,7 +808,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, 
int flags,
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now = alarm_bases[type].gettime();
-   exp = ktime_add(now, exp);
+
+   exp = ktime_add_safe(now, exp);
}
 
ret = alarmtimer_do_nsleep(, exp, type);


[tip:timers/core] posix-timers: Make forward callback return s64

2018-07-02 Thread tip-bot for Thomas Gleixner
Commit-ID:  6fec64e1c92d5c715c6d0f50786daa7708266bde
Gitweb: https://git.kernel.org/tip/6fec64e1c92d5c715c6d0f50786daa7708266bde
Author: Thomas Gleixner 
AuthorDate: Tue, 26 Jun 2018 15:21:31 +0200
Committer:  Thomas Gleixner 
CommitDate: Mon, 2 Jul 2018 11:33:25 +0200

posix-timers: Make forward callback return s64

The posix timer it_overrun handling is broken because the forwarding
functions can return a huge number of overruns which does not fit in an
int. As a consequence timer_getoverrun(2) and siginfo::si_overrun can turn
into random number generators.

As a first step to address that let the timer_forward() callbacks return
the full 64 bit value.

Cast it to (int) temporarily until k_itimer::it_overrun is converted to
64 bit and the conversion to user space visible values is sanitized.

Reported-by: Team OWL337 
Signed-off-by: Thomas Gleixner 
Acked-by: John Stultz 
Cc: Peter Zijlstra 
Cc: Michael Kerrisk 
Link: https://lkml.kernel.org/r/20180626132704.922098...@linutronix.de

---
 kernel/time/alarmtimer.c   | 4 ++--
 kernel/time/posix-timers.c | 6 +++---
 kernel/time/posix-timers.h | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 639321bf2e39..78a3cc555823 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -581,11 +581,11 @@ static void alarm_timer_rearm(struct k_itimer *timr)
  * @timr:  Pointer to the posixtimer data struct
  * @now:   Current time to forward the timer against
  */
-static int alarm_timer_forward(struct k_itimer *timr, ktime_t now)
+static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now)
 {
struct alarm *alarm = >it.alarm.alarmtimer;
 
-   return (int) alarm_forward(alarm, timr->it_interval, now);
+   return alarm_forward(alarm, timr->it_interval, now);
 }
 
 /**
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 80d59333c76e..db1d65963a57 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -645,11 +645,11 @@ static ktime_t common_hrtimer_remaining(struct k_itimer 
*timr, ktime_t now)
return __hrtimer_expires_remaining_adjusted(timer, now);
 }
 
-static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
+static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
 {
struct hrtimer *timer = >it.real.timer;
 
-   return (int)hrtimer_forward(timer, now, timr->it_interval);
+   return hrtimer_forward(timer, now, timr->it_interval);
 }
 
 /*
@@ -702,7 +702,7 @@ void common_timer_get(struct k_itimer *timr, struct 
itimerspec64 *cur_setting)
 * expiry time forward by intervals, so expiry is > now.
 */
if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
-   timr->it_overrun += kc->timer_forward(timr, now);
+   timr->it_overrun += (int)kc->timer_forward(timr, now);
 
remaining = kc->timer_remaining(timr, now);
/* Return 0 only, when the timer is expired and not pending */
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 151e28f5bf30..ddb21145211a 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -19,7 +19,7 @@ struct k_clock {
void(*timer_get)(struct k_itimer *timr,
 struct itimerspec64 *cur_setting);
void(*timer_rearm)(struct k_itimer *timr);
-   int (*timer_forward)(struct k_itimer *timr, ktime_t now);
+   s64 (*timer_forward)(struct k_itimer *timr, ktime_t now);
ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now);
int (*timer_try_to_cancel)(struct k_itimer *timr);
void(*timer_arm)(struct k_itimer *timr, ktime_t expires,


[tip:x86/urgent] x86/cpu: Restore CPUID_8000_0008_EBX reload

2018-05-02 Thread tip-bot for Thomas Gleixner
Commit-ID:  c65732e4f72124ca5a3a0dd3bee0d3cee39c7170
Gitweb: https://git.kernel.org/tip/c65732e4f72124ca5a3a0dd3bee0d3cee39c7170
Author: Thomas Gleixner 
AuthorDate: Mon, 30 Apr 2018 21:47:46 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 2 May 2018 16:44:38 +0200

x86/cpu: Restore CPUID_8000_0008_EBX reload

The recent commit which addresses the x86_phys_bits corruption with
encrypted memory on CPUID reload after a microcode update lost the reload
of CPUID_8000_0008_EBX as well.

As a consequence IBRS and IBRS_FW are no longer detected.

Restore the behaviour by bringing the reload of CPUID_8000_0008_EBX
back. This restore has a twist due to the convoluted way the cpuid analysis
works:

CPUID_8000_0008_EBX is used by AMD to enumerate IBPB, IBRS, STIBP. On Intel
EBX is not used. But the speculation control code sets the AMD bits when
running on Intel depending on the Intel specific speculation control
bits. This was done to use the same bits for alternatives.

The change which moved the 8000_0008 evaluation out of get_cpu_cap() broke
this nasty scheme due to ordering: on Intel the store to
CPUID_8000_0008_EBX clears the IBPB, IBRS, STIBP bits which had been set
by software before.

So the actual CPUID_8000_0008_EBX needs to go back to the place where it
was and the phys/virt address space calculation cannot touch it.

In hindsight this should have used completely synthetic bits for IBPB,
IBRS, STIBP instead of reusing the AMD bits, but that's for 4.18.

/me needs to find time to cleanup that steaming pile of ...

Fixes: d94a155c59c9 ("x86/cpu: Prevent cpuinfo_x86::x86_phys_bits adjustment 
corruption")
Reported-by: Jörg Otte 
Reported-by: Tim Chen 
Signed-off-by: Thomas Gleixner 
Tested-by: Jörg Otte 
Cc: Linus Torvalds 
Cc: kirill.shute...@linux.intel.com
Cc: Borislav Petkov 

	c->x86_power = edx;
}
 
+   if (c->extended_cpuid_level >= 0x8008) {
+   cpuid(0x8008, , , , );
+   c->x86_capability[CPUID_8000_0008_EBX] = ebx;
+   }
+
if (c->extended_cpuid_level >= 0x800a)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x800a);
 
@@ -871,7 +876,6 @@ static void get_cpu_address_sizes(struct cpuinfo_x86 *c)
 
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
-   c->x86_capability[CPUID_8000_0008_EBX] = ebx;
}
 #ifdef CONFIG_X86_32
else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))


[tip:x86/urgent] x86/apic: Switch all APICs to Fixed delivery mode

2017-12-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  a31e58e129f73ab5b04016330b13ed51fde7a961
Gitweb: https://git.kernel.org/tip/a31e58e129f73ab5b04016330b13ed51fde7a961
Author: Thomas Gleixner 
AuthorDate: Thu, 28 Dec 2017 11:33:33 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 29 Dec 2017 14:20:48 +0100

x86/apic: Switch all APICs to Fixed delivery mode

Some of the APIC incarnations are operating in lowest priority delivery
mode. This worked as long as the vector management code allocated the same
vector on all possible CPUs for each interrupt.

Lowest priority delivery mode does not necessarily respect the affinity
setting and may redirect to some other online CPU. This was documented
somewhere in the old code and the conversion to single target delivery
failed to update the delivery mode of the affected APIC drivers, which
results in spurious interrupts on some of the affected CPU/chipset
combinations.

Switch the APIC drivers over to Fixed delivery mode and remove all
leftovers of lowest priority delivery mode.

Switching to Fixed delivery mode is not a problem on these CPUs because the
kernel already uses Fixed delivery mode for IPIs. The reason for this is
that the SDM explicitly forbids lowest priority mode for IPIs. The reason
is obvious: if the irq routing does not honor destination targets in lowest
priority mode then an IPI targeted at CPU1 might end up on CPU0, which
would be a fatal problem in many cases.

As a consequence of this change, the apic::irq_delivery_mode field is now
pointless, but this needs to be cleaned up in a separate patch.

Fixes: fdba46ffb4c2 ("x86/apic: Get rid of multi CPU affinity")
Reported-by: vcap...@pengaru.com
Signed-off-by: Thomas Gleixner 
Tested-by: vcap...@pengaru.com
Cc: Pavel Machek 
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712281140440.1688@nanos
---
 arch/x86/kernel/apic/apic_flat_64.c   | 2 +-
 arch/x86/kernel/apic/apic_noop.c  | 2 +-
 arch/x86/kernel/apic/msi.c| 8 ++--
 arch/x86/kernel/apic/probe_32.c   | 2 +-
 arch/x86/kernel/apic/x2apic_cluster.c | 2 +-
 drivers/pci/host/pci-hyperv.c | 8 ++--
 6 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_flat_64.c 
b/arch/x86/kernel/apic/apic_flat_64.c
index aa85690..25a8702 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = {
.apic_id_valid  = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
 
-   .irq_delivery_mode  = dest_LowestPrio,
+   .irq_delivery_mode  = dest_Fixed,
.irq_dest_mode  = 1, /* logical */
 
.disable_esr= 0,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 7b659c4..5078b5c 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = {
.apic_id_valid  = default_apic_id_valid,
.apic_id_registered = noop_apic_id_registered,
 
-   .irq_delivery_mode  = dest_LowestPrio,
+   .irq_delivery_mode  = dest_Fixed,
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode  = 1,
 
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 9b18be7..ce503c9 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, 
struct msi_msg *msg)
((apic->irq_dest_mode == 0) ?
MSI_ADDR_DEST_MODE_PHYSICAL :
MSI_ADDR_DEST_MODE_LOGICAL) |
-   ((apic->irq_delivery_mode != dest_LowestPrio) ?
-   MSI_ADDR_REDIRECTION_CPU :
-   MSI_ADDR_REDIRECTION_LOWPRI) |
+   MSI_ADDR_REDIRECTION_CPU |
MSI_ADDR_DEST_ID(cfg->dest_apicid);
 
msg->data =
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
-   ((apic->irq_delivery_mode != dest_LowestPrio) ?
-   MSI_DATA_DELIVERY_FIXED :
-   MSI_DATA_DELIVERY_LOWPRI) |
+   MSI_DATA_DELIVERY_FIXED |
MSI_DATA_VECTOR(cfg->vector);
 }
 
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fa22017..02e8acb 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = {
.apic_id_valid  = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered,
 
-   .irq_delivery_mode  = dest_LowestPrio,
+   .irq_delivery_mode  = dest_Fixed,
/* 

[tip:irq/urgent] genirq/msi, x86/vector: Prevent reservation mode for non maskable MSI

2017-12-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  bc976233a872c0f20f018fb1e89264a541584e25
Gitweb: https://git.kernel.org/tip/bc976233a872c0f20f018fb1e89264a541584e25
Author: Thomas Gleixner 
AuthorDate: Fri, 29 Dec 2017 10:47:22 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 29 Dec 2017 21:13:05 +0100

genirq/msi, x86/vector: Prevent reservation mode for non maskable MSI

The new reservation mode for interrupts assigns a dummy vector when the
interrupt is allocated and assigns a real vector when the interrupt is
requested. The reservation mode prevents vector pressure when devices with
a large amount of queues/interrupts are initialized, but only a minimal
subset of those queues/interrupts is actually used.

This mode has an issue with MSI interrupts which cannot be masked. If the
driver is not careful or the hardware emits an interrupt before the device
irq is requested by the driver, then the interrupt ends up on the dummy
vector as a spurious interrupt which can cause malfunction of the device or
in the worst case a lockup of the machine.

Change the logic for the reservation mode so that the early activation of
MSI interrupts checks whether:

 - the device is a PCI/MSI device
 - the reservation mode of the underlying irqdomain is activated
 - PCI/MSI masking is globally enabled
 - the PCI/MSI device uses either MSI-X, which supports masking, or
   MSI with the maskbit supported.

If one of those conditions is false, then clear the reservation mode flag
in the irq data of the interrupt and invoke irq_domain_activate_irq() with
the reserve argument cleared. In the x86 vector code, clear the can_reserve
flag in the vector allocation data so a subsequent free_irq() won't create
the same situation again. The interrupt stays assigned to a real vector
until pci_disable_msi() is invoked and all allocations are undone.
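
A condensed sketch of the check described above (the full function in the
patch below is truncated in this archive):

static bool msi_check_reservation_mode(struct irq_domain *domain,
				       struct msi_domain_info *info,
				       struct device *dev)
{
	struct msi_desc *desc;

	/* Only PCI/MSI domains are subject to this treatment */
	if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
		return false;

	/* Reservation mode of the underlying irqdomain must be active */
	if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
		return false;

	/* PCI/MSI masking must be enabled globally */
	if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
		return false;

	/*
	 * Checking the first MSI descriptor is sufficient. MSI-X supports
	 * masking and MSI does when the maskbit is set.
	 */
	desc = first_msi_entry(dev);
	return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
}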

Fixes: 4900be83602b ("x86/vector/msi: Switch to global reservation mode")
Reported-by: Alexandru Chirvasitu 
Reported-by: Andy Shevchenko 
Signed-off-by: Thomas Gleixner 
Tested-by: Alexandru Chirvasitu 
Tested-by: Andy Shevchenko 
Cc: Dou Liyang 
Cc: Pavel Machek 
Cc: Maciej W. Rozycki 
Cc: Mikael Pettersson 
Cc: Josh Poulson 
Cc: Mihai Costache 
Cc: Stephen Hemminger 
Cc: Marc Zyngier 
Cc: linux-...@vger.kernel.org
Cc: Haiyang Zhang 
Cc: Dexuan Cui 
Cc: Simon Xiao 
Cc: Saeed Mahameed 
Cc: Jork Loeser 
Cc: Bjorn Helgaas 
Cc: de...@linuxdriverproject.org
Cc: KY Srinivasan 
Cc: Alan Cox 
Cc: Sakari Ailus ,
Cc: linux-me...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291406420.1899@nanos
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291409460.1899@nanos
---
 arch/x86/kernel/apic/vector.c | 12 +++-
 kernel/irq/msi.c  | 37 +
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 52c85c8..f8b03bb 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -369,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd)
int ret;
 
ret = assign_irq_vector_any_locked(irqd);
-   if (!ret)
+   if (!ret) {
apicd->has_reserved = false;
+   /*
+* Core might have disabled reservation mode after
+* allocating the irq descriptor. Ideally this should
+* happen before allocation time, but that would require
+* completely convoluted ways of transporting that
+* information.
+*/
+   if (!irqd_can_reserve(irqd))
+   apicd->can_reserve = false;
+   }
return ret;
 }
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 9ba9543..2f3c4f5 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,11 +339,38 @@ int msi_domain_populate_irqs(struct irq_domain *domain, 
struct device *dev,
return ret;
 }
 
-static bool msi_check_reservation_mode(struct msi_domain_info *info)
+/*
+ * Carefully check whether the device can use reservation mode. If
+ * reservation mode is enabled then the early activation will assign a
+ * dummy vector to the device. If the PCI/MSI device does not support
+ * masking of the entry then this can result in spurious interrupts when
+ * the device driver is not absolutely careful. But even then a malfunction
+ * of the hardware could result in a spurious interrupt on the dummy vector
+ * and 

[tip:timers/urgent] timers: Reinitialize per cpu bases on hotplug

2017-12-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  26456f87aca7157c057de65c9414b37f1ab881d1
Gitweb: https://git.kernel.org/tip/26456f87aca7157c057de65c9414b37f1ab881d1
Author: Thomas Gleixner 
AuthorDate: Wed, 27 Dec 2017 21:37:25 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 29 Dec 2017 23:13:09 +0100

timers: Reinitialize per cpu bases on hotplug

The timer wheel bases are not (re)initialized on CPU hotplug. That leaves
them with a potentially stale clk and next_expiry value, which can cause
trouble when the CPU is plugged.

Add a prepare callback which forwards the clock, sets next_expiry to far in
the future and resets the control flags to a known state.

Set base->must_forward_clk so the first timer which is queued will try to
forward the clock to current jiffies.

Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel")
Reported-by: Paul E. McKenney 
Signed-off-by: Thomas Gleixner 
Cc: Peter Zijlstra 
Cc: Frederic Weisbecker 
Cc: Sebastian Siewior 
Cc: Anna-Maria Gleixner 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272152200.2431@nanos

---
 include/linux/cpuhotplug.h |  2 +-
 include/linux/timer.h  |  4 +++-
 kernel/cpu.c   |  4 ++--
 kernel/time/timer.c| 15 +++
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 201ab72..1a32e55 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -86,7 +86,7 @@ enum cpuhp_state {
CPUHP_MM_ZSWP_POOL_PREPARE,
CPUHP_KVM_PPC_BOOK3S_PREPARE,
CPUHP_ZCOMP_PREPARE,
-   CPUHP_TIMERS_DEAD,
+   CPUHP_TIMERS_PREPARE,
CPUHP_MIPS_SOC_PREPARE,
CPUHP_BP_PREPARE_DYN,
CPUHP_BP_PREPARE_DYN_END= CPUHP_BP_PREPARE_DYN + 20,
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 04af640..2448f9c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -207,9 +207,11 @@ unsigned long round_jiffies_up(unsigned long j);
 unsigned long round_jiffies_up_relative(unsigned long j);
 
 #ifdef CONFIG_HOTPLUG_CPU
+int timers_prepare_cpu(unsigned int cpu);
 int timers_dead_cpu(unsigned int cpu);
 #else
-#define timers_dead_cpu NULL
+#define timers_prepare_cpu NULL
+#define timers_dead_cpuNULL
 #endif
 
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 41376c3..9785847 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 * before blk_mq_queue_reinit_notify() from notify_dead(),
 * otherwise a RCU stall occurs.
 */
-   [CPUHP_TIMERS_DEAD] = {
+   [CPUHP_TIMERS_PREPARE] = {
.name   = "timers:dead",
-   .startup.single = NULL,
+   .startup.single = timers_prepare_cpu,
.teardown.single= timers_dead_cpu,
},
/* Kicks the plugged cpu into life */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 19a9c3d..6be576e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1853,6 +1853,21 @@ static void migrate_timer_list(struct timer_base 
*new_base, struct hlist_head *h
}
 }
 
+int timers_prepare_cpu(unsigned int cpu)
+{
+   struct timer_base *base;
+   int b;
+
+   for (b = 0; b < NR_BASES; b++) {
+   base = per_cpu_ptr(_bases[b], cpu);
+   base->clk = jiffies;
+   base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+   base->is_idle = false;
+   base->must_forward_clk = true;
+   }
+   return 0;
+}
+
 int timers_dead_cpu(unsigned int cpu)
 {
struct timer_base *old_base;


[tip:timers/urgent] nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick()

2017-12-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  5d62c183f9e9df1deeea0906d099a94e8a43047a
Gitweb: https://git.kernel.org/tip/5d62c183f9e9df1deeea0906d099a94e8a43047a
Author: Thomas Gleixner 
AuthorDate: Fri, 22 Dec 2017 15:51:13 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 29 Dec 2017 23:13:10 +0100

nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick()

The conditions in irq_exit() to invoke tick_nohz_irq_exit() which
subsequently invokes tick_nohz_stop_sched_tick() are:

  if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu))

If need_resched() is not set, but a timer softirq is pending then this is
an indication that the softirq code punted and delegated the execution to
softirqd. need_resched() is not true because the current interrupted task
takes precedence over softirqd.

Invoking tick_nohz_irq_exit() in this case can cause an endless loop of
timer interrupts because the timer wheel contains an expired timer, but
softirqs are not yet executed. So it returns an immediate expiry request,
which causes the timer to fire immediately again. Lather, rinse and
repeat...

Prevent that by adding a check for a pending timer soft interrupt to the
conditions in tick_nohz_stop_sched_tick() which avoid calling
get_next_timer_interrupt(). That keeps the tick sched timer on the tick and
prevents a repetitive programming of an already expired timer.

Reported-by: Sebastian Siewior 
Signed-off-by: Thomas Gleixner 
Acked-by: Frederic Weisbecker 
Cc: Peter Zijlstra 
Cc: Paul McKenney 
Cc: Anna-Maria Gleixner 
Cc: Sebastian Siewior 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272156050.2431@nanos

---
 kernel/time/tick-sched.c | 19 +--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 77555fa..f7cc7ab 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, 
ktime_t now)
ts->next_tick = 0;
 }
 
+static inline bool local_timer_softirq_pending(void)
+{
+   return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
+}
+
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 ktime_t now, int cpu)
 {
@@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched 
*ts,
} while (read_seqretry(_lock, seq));
ts->last_jiffies = basejiff;
 
-   if (rcu_needs_cpu(basemono, _rcu) ||
-   arch_needs_cpu() || irq_work_needs_cpu()) {
+   /*
+* Keep the periodic tick, when RCU, architecture or irq_work
+* requests it.
+* Aside of that check whether the local timer softirq is
+* pending. If so it's a bad idea to call get_next_timer_interrupt()
+* because there is an already expired timer, so it will request
+* immediate expiry, which rearms the hardware timer with a
+* minimal delta which brings us back to this place
+* immediately. Lather, rinse and repeat...
+*/
+   if (rcu_needs_cpu(basemono, _rcu) || arch_needs_cpu() ||
+   irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
/*


[tip:timers/urgent] timers: Invoke timer_start_debug() where it makes sense

2017-12-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  fd45bb77ad682be728d1002431d77b8c73342836
Gitweb: https://git.kernel.org/tip/fd45bb77ad682be728d1002431d77b8c73342836
Author: Thomas Gleixner 
AuthorDate: Fri, 22 Dec 2017 15:51:14 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 29 Dec 2017 23:13:10 +0100

timers: Invoke timer_start_debug() where it makes sense

The timer start debug function is called before the proper timer base is
set. As a consequence the trace data contains the stale CPU and flags
values.

Call the debug function after setting the new base and flags.

Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel")
Signed-off-by: Thomas Gleixner 
Cc: Peter Zijlstra 
Cc: Frederic Weisbecker 
Cc: Sebastian Siewior 
Cc: sta...@vger.kernel.org
Cc: r...@linutronix.de
Cc: Paul McKenney 
Cc: Anna-Maria Gleixner 
Link: https://lkml.kernel.org/r/20171222145337.792907...@linutronix.de

---
 kernel/time/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 6be576e..89a9e1b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1007,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long 
expires, unsigned int option
if (!ret && (options & MOD_TIMER_PENDING_ONLY))
goto out_unlock;
 
-   debug_activate(timer, expires);
-
new_base = get_target_base(base, timer->flags);
 
if (base != new_base) {
@@ -1032,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long 
expires, unsigned int option
}
}
 
+   debug_activate(timer, expires);
+
timer->expires = expires;
/*
 * If 'idx' was calculated above and the base time did not advance


[tip:timers/urgent] timerqueue: Document return values of timerqueue_add/del()

2017-12-29 Thread tip-bot for Thomas Gleixner
Commit-ID:  9f4533cd7334235cd4c9b9fb1b0b8791e2ba01a7
Gitweb: https://git.kernel.org/tip/9f4533cd7334235cd4c9b9fb1b0b8791e2ba01a7
Author: Thomas Gleixner 
AuthorDate: Fri, 22 Dec 2017 15:51:15 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 29 Dec 2017 23:13:10 +0100

timerqueue: Document return values of timerqueue_add/del()

The return values of timerqueue_add/del() are not documented in the kernel doc
comment. Add proper documentation.
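
A hypothetical caller making use of the documented return values;
reprogram_hw() is a placeholder, not a kernel API:

/* Reprogram the event hardware only when the head of the queue changed */
if (timerqueue_add(&head, &timer->node))
	reprogram_hw(timerqueue_getnext(&head)->expires);

/* After a removal, reprogram only when timers are still queued */
if (timerqueue_del(&head, &timer->node))
	reprogram_hw(timerqueue_getnext(&head)->expires);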

Signed-off-by: Thomas Gleixner 
Cc: Peter Zijlstra 
Cc: Frederic Weisbecker 
Cc: Sebastian Siewior 
Cc: r...@linutronix.de
Cc: Paul McKenney 
Cc: Anna-Maria Gleixner 
Link: https://lkml.kernel.org/r/20171222145337.872681...@linutronix.de

---
 lib/timerqueue.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/timerqueue.c b/lib/timerqueue.c
index 4a720ed..0d54bcb 100644
--- a/lib/timerqueue.c
+++ b/lib/timerqueue.c
@@ -33,8 +33,9 @@
  * @head: head of timerqueue
  * @node: timer node to be added
  *
- * Adds the timer node to the timerqueue, sorted by the
- * node's expires value.
+ * Adds the timer node to the timerqueue, sorted by the node's expires
+ * value. Returns true if the newly added timer is the first expiring timer in
+ * the queue.
  */
 bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
 {
@@ -70,7 +71,8 @@ EXPORT_SYMBOL_GPL(timerqueue_add);
  * @head: head of timerqueue
  * @node: timer node to be removed
  *
- * Removes the timer node from the timerqueue.
+ * Removes the timer node from the timerqueue. Returns true if the queue is
+ * not empty after the remove.
  */
 bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
 {


[tip:x86/pti] x86/kaslr: Fix the vaddr_end mess

2018-01-04 Thread tip-bot for Thomas Gleixner
Commit-ID:  125125112ba49706518ac9077a1026a18f37
Gitweb: https://git.kernel.org/tip/125125112ba49706518ac9077a1026a18f37
Author: Thomas Gleixner 
AuthorDate: Thu, 4 Jan 2018 12:32:03 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 5 Jan 2018 00:39:57 +0100

x86/kaslr: Fix the vaddr_end mess

vaddr_end for KASLR is only documented in the KASLR code itself and is
adjusted depending on config options. So it's not surprising that a change
of the memory layout causes KASLR to have the wrong vaddr_end. This can map
arbitrary stuff into other areas, causing hard-to-understand problems.

Remove the whole ifdef magic and define the start of the cpu_entry_area to
be the end of the KASLR vaddr range.

Add documentation to that effect.
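
The kaslr.c hunk below is truncated in this archive; based on the
description above, the core of the change amounts to something like:

/*
 * Pin the end of the KASLR range to the start of the cpu_entry_area
 * instead of deriving it from config-dependent ifdeffery.
 */
static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;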

Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap")
Reported-by: Benjamin Gilbert 
Signed-off-by: Thomas Gleixner 
Tested-by: Benjamin Gilbert 
Cc: Andy Lutomirski 
Cc: Greg Kroah-Hartman 
Cc: stable 
Cc: Dave Hansen 
Cc: Peter Zijlstra 
Cc: Thomas Garnier ,
Cc: Alexander Kuleshov 
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos
---
 Documentation/x86/x86_64/mm.txt |  6 ++
 arch/x86/include/asm/pgtable_64_types.h |  8 +++-
 arch/x86/mm/kaslr.c | 32 +---
 3 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index f7dabe1..ea91cb6 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ea00 - eaff (=40 bits) virtual memory 
map (1TB)
 ... unused hole ...
 ec00 - fbff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+   vaddr_end for KASLR
 fe00 - fe7f (=39 bits) cpu_entry_area mapping
 fe80 - feff (=39 bits) LDT remap for PTI
 ff00 - ff7f (=39 bits) %esp fixup stacks
@@ -37,6 +38,7 @@ ffd4 - ffd5 (=49 bits) virtual memory 
map (512TB)
 ... unused hole ...
 ffdf - fc00 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+   vaddr_end for KASLR
 fe00 - fe7f (=39 bits) cpu_entry_area mapping
 ... unused hole ...
 ff00 - ff7f (=39 bits) %esp fixup stacks
@@ -71,3 +73,7 @@ during EFI runtime calls.
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
+
+Be very careful vs. KASLR when changing anything here. The KASLR address
+range must not overlap with anything except the KASAN shadow area, which is
+correct as KASAN disables KASLR.
diff --git a/arch/x86/include/asm/pgtable_64_types.h 
b/arch/x86/include/asm/pgtable_64_types.h
index 61b4b60..6b8f73d 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -75,7 +75,13 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
 #define PGDIR_MASK (~(PGDIR_SIZE - 1))
 
-/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+/*
+ * See Documentation/x86/x86_64/mm.txt for a description of the memory map.
+ *
+ * Be very careful vs. KASLR when changing anything here. The KASLR address
+ * range must not overlap with anything except the KASAN shadow area, which
+ * is correct as KASAN disables KASLR.
+ */
 #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 
 #ifdef CONFIG_X86_5LEVEL
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 879ef93..aedebd2 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -34,25 +34,14 @@
 #define TB_SHIFT 40
 
 /*
- * Virtual address start and end range for randomization. The end changes base
- * on configuration to have the highest amount of space for randomization.
- * It increases the possible random position for each randomized region.
+ * Virtual address start and end range for randomization.
  *
- * You need to add an if/def entry if you introduce a new memory region
- * compatible with KASLR. Your entry must be in logical order with memory
- * layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() 
to
- * ensure that this order is correct and won't be changed.
+ * The end address could depend on more configuration options to make the
+ * highest amount of space for randomization 

[tip:x86/pti] x86/kaslr: Fix the vaddr_end mess

2018-01-04 Thread tip-bot for Thomas Gleixner
Commit-ID:  1b3ef54207f068dae9c36d891ff69dd4d37c5c2f
Gitweb: https://git.kernel.org/tip/1b3ef54207f068dae9c36d891ff69dd4d37c5c2f
Author: Thomas Gleixner 
AuthorDate: Thu, 4 Jan 2018 12:32:03 +0100
Committer:  Thomas Gleixner 
CommitDate: Thu, 4 Jan 2018 23:04:57 +0100

x86/kaslr: Fix the vaddr_end mess

vaddr_end for KASLR is only documented in the KASLR code itself and is
adjusted depending on config options. So it's not surprising that a change
of the memory layout causes KASLR to have the wrong vaddr_end. This can map
arbitrary stuff into other areas, causing hard-to-understand problems.

Remove the whole ifdef magic and define the start of the cpu_entry_area to
be the end of the KASLR vaddr range.

Add documentation to that effect.

Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap")
Reported-by: Benjamin Gilbert 
Signed-off-by: Thomas Gleixner 
Tested-by: Benjamin Gilbert 
Cc: Andy Lutomirski 
Cc: Greg Kroah-Hartman 
Cc: stable 
Cc: Dave Hansen 
Cc: Peter Zijlstra 
Cc: Thomas Garnier ,
Cc: Alexander Kuleshov 
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos
---
 Documentation/x86/x86_64/mm.txt |  6 ++
 arch/x86/include/asm/pgtable_64_types.h |  8 +++-
 arch/x86/mm/kaslr.c | 34 ++---
 3 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index f7dabe1..ea91cb6 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ea00 - eaff (=40 bits) virtual memory 
map (1TB)
 ... unused hole ...
 ec00 - fbff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+   vaddr_end for KASLR
 fe00 - fe7f (=39 bits) cpu_entry_area mapping
 fe80 - feff (=39 bits) LDT remap for PTI
 ff00 - ff7f (=39 bits) %esp fixup stacks
@@ -37,6 +38,7 @@ ffd4 - ffd5 (=49 bits) virtual memory 
map (512TB)
 ... unused hole ...
 ffdf - fc00 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+   vaddr_end for KASLR
 fe00 - fe7f (=39 bits) cpu_entry_area mapping
 ... unused hole ...
 ff00 - ff7f (=39 bits) %esp fixup stacks
@@ -71,3 +73,7 @@ during EFI runtime calls.
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
+
+Be very careful vs. KASLR when changing anything here. The KASLR address
+range must not overlap with anything except the KASAN shadow area, which is
+correct as KASAN disables KASLR.
diff --git a/arch/x86/include/asm/pgtable_64_types.h 
b/arch/x86/include/asm/pgtable_64_types.h
index 61b4b60..6b8f73d 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -75,7 +75,13 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
 #define PGDIR_MASK (~(PGDIR_SIZE - 1))
 
-/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+/*
+ * See Documentation/x86/x86_64/mm.txt for a description of the memory map.
+ *
+ * Be very careful vs. KASLR when changing anything here. The KASLR address
+ * range must not overlap with anything except the KASAN shadow area, which
+ * is correct as KASAN disables KASLR.
+ */
 #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 
 #ifdef CONFIG_X86_5LEVEL
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 879ef93..b805a61 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -34,25 +34,14 @@
 #define TB_SHIFT 40
 
 /*
- * Virtual address start and end range for randomization. The end changes base
- * on configuration to have the highest amount of space for randomization.
- * It increases the possible random position for each randomized region.
+ * Virtual address start and end range for randomization.
  *
- * You need to add an if/def entry if you introduce a new memory region
- * compatible with KASLR. Your entry must be in logical order with memory
- * layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() 
to
- * ensure that this order is correct and won't be changed.
+ * The end address could depend on more configuration options to make the
+ * highest amount of space for randomization 

[tip:x86/pti] x86/mm: Map cpu_entry_area at the same place on 4/5 level

2018-01-04 Thread tip-bot for Thomas Gleixner
Commit-ID:  f2078904810373211fb15f91888fba14c01a4acc
Gitweb: https://git.kernel.org/tip/f2078904810373211fb15f91888fba14c01a4acc
Author: Thomas Gleixner 
AuthorDate: Thu, 4 Jan 2018 13:01:40 +0100
Committer:  Thomas Gleixner 
CommitDate: Thu, 4 Jan 2018 23:04:57 +0100

x86/mm: Map cpu_entry_area at the same place on 4/5 level

There is no reason for 4 and 5 level pagetables to have a different
layout. It just makes determining vaddr_end for KASLR harder than
necessary.

Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap")
Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Benjamin Gilbert 
Cc: Greg Kroah-Hartman 
Cc: stable 
Cc: Dave Hansen 
Cc: Peter Zijlstra 
Cc: Thomas Garnier ,
Cc: Alexander Kuleshov 
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos

---
 Documentation/x86/x86_64/mm.txt | 7 ---
 arch/x86/include/asm/pgtable_64_types.h | 4 ++--
 arch/x86/mm/dump_pagetables.c   | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index ddd5ffd..f7dabe1 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,8 +12,8 @@ ea00 - eaff (=40 bits) virtual memory 
map (1TB)
 ... unused hole ...
 ec00 - fbff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
-fe00 - fe7f (=39 bits) LDT remap for PTI
-fe80 - feff (=39 bits) cpu_entry_area mapping
+fe00 - fe7f (=39 bits) cpu_entry_area mapping
+fe80 - feff (=39 bits) LDT remap for PTI
 ff00 - ff7f (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffef - fffe (=64 GB) EFI region mapping space
@@ -37,7 +37,8 @@ ffd4 - ffd5 (=49 bits) virtual memory 
map (512TB)
 ... unused hole ...
 ffdf - fc00 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
-fe80 - feff (=39 bits) cpu_entry_area mapping
+fe00 - fe7f (=39 bits) cpu_entry_area mapping
+... unused hole ...
 ff00 - ff7f (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffef - fffe (=64 GB) EFI region mapping space
diff --git a/arch/x86/include/asm/pgtable_64_types.h 
b/arch/x86/include/asm/pgtable_64_types.h
index 6233e55..61b4b60 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -88,7 +88,7 @@ typedef struct { pteval_t pte; } pte_t;
 # define VMALLOC_SIZE_TB   _AC(32, UL)
 # define __VMALLOC_BASE_AC(0xc900, UL)
 # define __VMEMMAP_BASE_AC(0xea00, UL)
-# define LDT_PGD_ENTRY _AC(-4, UL)
+# define LDT_PGD_ENTRY _AC(-3, UL)
 # define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 
@@ -110,7 +110,7 @@ typedef struct { pteval_t pte; } pte_t;
 #define ESPFIX_PGD_ENTRY   _AC(-2, UL)
 #define ESPFIX_BASE_ADDR   (ESPFIX_PGD_ENTRY << P4D_SHIFT)
 
-#define CPU_ENTRY_AREA_PGD _AC(-3, UL)
+#define CPU_ENTRY_AREA_PGD _AC(-4, UL)
 #define CPU_ENTRY_AREA_BASE(CPU_ENTRY_AREA_PGD << P4D_SHIFT)
 
 #define EFI_VA_START   ( -4 * (_AC(1, UL) << 30))
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index f56902c..2a4849e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -61,10 +61,10 @@ enum address_markers_idx {
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
 #endif
+   CPU_ENTRY_AREA_NR,
 #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
LDT_NR,
 #endif
-   CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
 #endif


[tip:x86/pti] x86/cpu: Implement CPU vulnerabilities sysfs functions

2018-01-08 Thread tip-bot for Thomas Gleixner
Commit-ID:  61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4
Gitweb: https://git.kernel.org/tip/61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4
Author: Thomas Gleixner 
AuthorDate: Sun, 7 Jan 2018 22:48:01 +0100
Committer:  Thomas Gleixner 
CommitDate: Mon, 8 Jan 2018 11:10:40 +0100

x86/cpu: Implement CPU vulnerabilities sysfs functions

Implement the CPU vulnerability show functions for meltdown, spectre_v1 and
spectre_v2.

Signed-off-by: Thomas Gleixner 
Reviewed-by: Greg Kroah-Hartman 
Reviewed-by: Konrad Rzeszutek Wilk 
Cc: Peter Zijlstra 
Cc: Will Deacon 
Cc: Dave Hansen 
Cc: Linus Torvalds 
Cc: Borislav Petkov 
Cc: David Woodhouse 
Link: https://lkml.kernel.org/r/20180107214913.177414...@linutronix.de

---
 arch/x86/Kconfig   |  1 +
 arch/x86/kernel/cpu/bugs.c | 29 +
 2 files changed, 30 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cd5199d..e23d21a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -89,6 +89,7 @@ config X86
select GENERIC_CLOCKEVENTS_MIN_ADJUST
select GENERIC_CMOS_UPDATE
select GENERIC_CPU_AUTOPROBE
+   select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
select GENERIC_FIND_FIRST_BIT
select GENERIC_IOMAP
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ba0b242..76ad6cb 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -10,6 +10,7 @@
  */
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -60,3 +61,31 @@ void __init check_bugs(void)
set_memory_4k((unsigned long)__va(0), 1);
 #endif
 }
+
+#ifdef CONFIG_SYSFS
+ssize_t cpu_show_meltdown(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+   if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+   return sprintf(buf, "Not affected\n");
+   if (boot_cpu_has(X86_FEATURE_PTI))
+   return sprintf(buf, "Mitigation: PTI\n");
+   return sprintf(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_spectre_v1(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+   return sprintf(buf, "Not affected\n");
+   return sprintf(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+   return sprintf(buf, "Not affected\n");
+   return sprintf(buf, "Vulnerable\n");
+}
+#endif


[tip:x86/pti] sysfs/cpu: Add vulnerability folder

2018-01-08 Thread tip-bot for Thomas Gleixner
Commit-ID:  87590ce6e373d1a5401f6539f0c59ef92dd924a9
Gitweb: https://git.kernel.org/tip/87590ce6e373d1a5401f6539f0c59ef92dd924a9
Author: Thomas Gleixner 
AuthorDate: Sun, 7 Jan 2018 22:48:00 +0100
Committer:  Thomas Gleixner 
CommitDate: Mon, 8 Jan 2018 11:10:33 +0100

sysfs/cpu: Add vulnerability folder

As the meltdown/spectre problem affects several CPU architectures, it makes
sense to have a common way to express whether a system is affected by a
particular vulnerability or not. If affected, the way to express the
mitigation should be common as well.

Create /sys/devices/system/cpu/vulnerabilities folder and files for
meltdown, spectre_v1 and spectre_v2.

Allow architectures to override the show function.
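
The override works via weak symbols: the generic driver provides __weak
defaults (see the diff below) and an architecture can supply a strong
definition which takes precedence at link time, along the lines of this
hypothetical arch-side override:

ssize_t cpu_show_meltdown(struct device *dev,
			  struct device_attribute *attr, char *buf)
{
	/* Strong definition; replaces the generic weak one at link time */
	return sprintf(buf, "Mitigation: PTI\n");
}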

Signed-off-by: Thomas Gleixner 
Reviewed-by: Greg Kroah-Hartman 
Reviewed-by: Konrad Rzeszutek Wilk 
Cc: Peter Zijlstra 
Cc: Will Deacon 
Cc: Dave Hansen 
Cc: Linus Torvalds 
Cc: Borislav Petkov 
Cc: David Woodhouse 
Link: https://lkml.kernel.org/r/20180107214913.096657...@linutronix.de

---
 Documentation/ABI/testing/sysfs-devices-system-cpu | 16 
 drivers/base/Kconfig   |  3 ++
 drivers/base/cpu.c | 48 ++
 include/linux/cpu.h|  7 
 4 files changed, 74 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu 
b/Documentation/ABI/testing/sysfs-devices-system-cpu
index f3d5817..bd3a88e 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -373,3 +373,19 @@ Contact:   Linux kernel mailing list 

 Description:   information about CPUs heterogeneity.
 
cpu_capacity: capacity of cpu#.
+
+What:  /sys/devices/system/cpu/vulnerabilities
+   /sys/devices/system/cpu/vulnerabilities/meltdown
+   /sys/devices/system/cpu/vulnerabilities/spectre_v1
+   /sys/devices/system/cpu/vulnerabilities/spectre_v2
+Date:  January 2018
+Contact:   Linux kernel mailing list 
+Description:   Information about CPU vulnerabilities
+
+   The files are named after the code names of CPU
+   vulnerabilities. The output of those files reflects the
+   state of the CPUs in the system. Possible output values:
+
+   "Not affected"    CPU is not affected by the vulnerability
+   "Vulnerable"      CPU is affected and no mitigation in effect
+   "Mitigation: $M"  CPU is affected and mitigation $M is in effect
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 2f6614c..37a71fd 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -235,6 +235,9 @@ config GENERIC_CPU_DEVICES
 config GENERIC_CPU_AUTOPROBE
bool
 
+config GENERIC_CPU_VULNERABILITIES
+   bool
+
 config SOC_BUS
bool
select GLOB
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 321cd7b..825964e 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -501,10 +501,58 @@ static void __init cpu_dev_register_generic(void)
 #endif
 }
 
+#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES
+
+ssize_t __weak cpu_show_meltdown(struct device *dev,
+struct device_attribute *attr, char *buf)
+{
+   return sprintf(buf, "Not affected\n");
+}
+
+ssize_t __weak cpu_show_spectre_v1(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   return sprintf(buf, "Not affected\n");
+}
+
+ssize_t __weak cpu_show_spectre_v2(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   return sprintf(buf, "Not affected\n");
+}
+
+static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
+static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
+static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
+
+static struct attribute *cpu_root_vulnerabilities_attrs[] = {
+   _attr_meltdown.attr,
+   _attr_spectre_v1.attr,
+   _attr_spectre_v2.attr,
+   NULL
+};
+
+static const struct attribute_group cpu_root_vulnerabilities_group = {
+   .name  = "vulnerabilities",
+   .attrs = cpu_root_vulnerabilities_attrs,
+};
+
+static void __init cpu_register_vulnerabilities(void)
+{
+   if (sysfs_create_group(_subsys.dev_root->kobj,
+  _root_vulnerabilities_group))
+   pr_err("Unable to register CPU vulnerabilities\n");
+}
+
+#else
+static inline void cpu_register_vulnerabilities(void) { }
+#endif
+
 void __init cpu_dev_init(void)
 {
if (subsys_system_register(_subsys, cpu_root_attr_groups))

[tip:x86/pti] x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN

2018-01-05 Thread tip-bot for Thomas Gleixner
Commit-ID:  de791821c295cc61419a06fe5562288417d1bc58
Gitweb: https://git.kernel.org/tip/de791821c295cc61419a06fe5562288417d1bc58
Author: Thomas Gleixner 
AuthorDate: Fri, 5 Jan 2018 15:27:34 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 5 Jan 2018 15:34:43 +0100

x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN

Use the name associated with the particular attack which needs page table
isolation for mitigation.

Signed-off-by: Thomas Gleixner 
Acked-by: David Woodhouse 
Cc: Alan Cox 
Cc: Jiri Koshina 
Cc: Linus Torvalds 
Cc: Tim Chen 
Cc: Andi Lutomirski  
Cc: Andi Kleen 
Cc: Peter Zijlstra 
Cc: Paul Turner 
Cc: Tom Lendacky 
Cc: Greg KH 
Cc: Dave Hansen 
Cc: Kees Cook 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801051525300.1724@nanos

---
 arch/x86/include/asm/cpufeatures.h | 2 +-
 arch/x86/kernel/cpu/common.c   | 2 +-
 arch/x86/mm/pti.c  | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index 07cdd17..21ac898 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -341,6 +341,6 @@
 #define X86_BUG_SWAPGS_FENCE   X86_BUG(11) /* SWAPGS without input dep 
on GS */
 #define X86_BUG_MONITORX86_BUG(12) /* IPI required to 
wake up remote CPU */
 #define X86_BUG_AMD_E400   X86_BUG(13) /* CPU is among the 
affected by Erratum 400 */
-#define X86_BUG_CPU_INSECURE   X86_BUG(14) /* CPU is insecure and 
needs kernel page table isolation */
+#define X86_BUG_CPU_MELTDOWN   X86_BUG(14) /* CPU is affected by 
meltdown attack and needs kernel page table isolation */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b1be494..2d3bd22 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -900,7 +900,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
 
if (c->x86_vendor != X86_VENDOR_AMD)
-   setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+   setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
 
fpu__init_system(c);
 
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 2da28ba..43d4a4a 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -56,13 +56,13 @@
 
 static void __init pti_print_if_insecure(const char *reason)
 {
-   if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+   if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
pr_info("%s\n", reason);
 }
 
 static void __init pti_print_if_secure(const char *reason)
 {
-   if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+   if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
pr_info("%s\n", reason);
 }
 
@@ -96,7 +96,7 @@ void __init pti_check_boottime_disable(void)
}
 
 autosel:
-   if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+   if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
return;
 enable:
setup_force_cpu_cap(X86_FEATURE_PTI);


[tip:timers/core] hrtimer: Optimize the hrtimer code by using static keys for migration_enable/nohz_active

2018-01-15 Thread tip-bot for Thomas Gleixner
Commit-ID:  ae67badaa1643253998cb21d5782e4ea7c231a29
Gitweb: https://git.kernel.org/tip/ae67badaa1643253998cb21d5782e4ea7c231a29
Author: Thomas Gleixner 
AuthorDate: Sun, 14 Jan 2018 23:30:51 +0100
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Jan 2018 02:35:44 +0100

hrtimer: Optimize the hrtimer code by using static keys for 
migration_enable/nohz_active

The hrtimer_cpu_base::migration_enabled and ::nohz_active fields
were originally introduced to avoid accessing global variables
for these decisions.

Still that results in a (cache hot) load and conditional branch,
which can be avoided by using static keys.

Implement it with static keys and optimize for the most critical
case of high performance networking which tends to disable the
timer migration functionality.

No change in functionality.
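
A minimal sketch of the static key pattern used here; the toggle paths are
not fully visible in the diff below:

/* Defined once, defaulting to the disabled state */
DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);

/* Slow path: flipped when the corresponding sysctl changes */
static_branch_enable(&timers_migration_enabled);
static_branch_disable(&timers_migration_enabled);

/* Hot path: compiles to a patched jump instead of a load plus branch */
if (static_branch_likely(&timers_migration_enabled) && !pinned)
	base = &per_cpu(hrtimer_bases, get_nohz_timer_target());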

Signed-off-by: Thomas Gleixner 
Cc: Anna-Maria Gleixner 
Cc: Christoph Hellwig 
Cc: Frederic Weisbecker 
Cc: John Stultz 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Sebastian Andrzej Siewior 
Cc: keesc...@chromium.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1801142327490.2371@nanos
Link: https://lkml.kernel.org/r/20171221104205.7269-2-anna-ma...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 include/linux/hrtimer.h |  4 ---
 kernel/time/hrtimer.c   | 17 +++---
 kernel/time/tick-internal.h | 19 +++
 kernel/time/tick-sched.c|  2 +-
 kernel/time/timer.c | 83 +++--
 5 files changed, 60 insertions(+), 65 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 012c37f..79b2a8d 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -153,8 +153,6 @@ enum  hrtimer_base_type {
  * @cpu:   cpu number
  * @active_bases:  Bitfield to mark bases with active timers
  * @clock_was_set_seq: Sequence counter of clock was set events
- * @migration_enabled: The migration of hrtimers to other cpus is enabled
- * @nohz_active:   The nohz functionality is enabled
  * @expires_next:  absolute time of the next event which was scheduled
  * via clock_set_next_event()
  * @next_timer:Pointer to the first expiring timer
@@ -178,8 +176,6 @@ struct hrtimer_cpu_base {
unsigned intcpu;
unsigned intactive_bases;
unsigned intclock_was_set_seq;
-   boolmigration_enabled;
-   boolnohz_active;
 #ifdef CONFIG_HIGH_RES_TIMERS
unsigned intin_hrtirq   : 1,
hres_active : 1,
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d325208..1d06d2b 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -178,23 +178,16 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
 #endif
 }
 
-#ifdef CONFIG_NO_HZ_COMMON
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
-					 int pinned)
-{
-	if (pinned || !base->migration_enabled)
-		return base;
-	return &per_cpu(hrtimer_bases, get_nohz_timer_target());
-}
-#else
 static inline
 struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
					 int pinned)
 {
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+	if (static_branch_likely(&timers_migration_enabled) && !pinned)
+		return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+#endif
	return base;
 }
-#endif
 
 /*
  * We switch the timer base to a power-optimized selected CPU target,
@@ -969,7 +962,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 * Kick to reschedule the next tick to handle the new timer
 * on dynticks target.
 */
-   if (new_base->cpu_base->nohz_active)
+   if (is_timers_nohz_active())
wake_up_nohz_cpu(new_base->cpu_base->cpu);
} else {
hrtimer_reprogram(timer, new_base);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f8e1845..f690628 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -150,14 +150,19 @@ static inline void tick_nohz_init(void) { }
 
 #ifdef CONFIG_NO_HZ_COMMON
 extern unsigned long tick_nohz_active;
-#else
+extern void timers_update_nohz(void);
+extern struct static_key_false timers_nohz_active;
+static inline bool is_timers_nohz_active(void)
+{
+	return static_branch_likely(&timers_nohz_active);
+}
+# ifdef CONFIG_SMP
+extern struct static_key_false timers_migration_enabled;
+# endif

[tip:timers/core] hrtimer: Correct blatantly incorrect comment

2018-01-15 Thread tip-bot for Thomas Gleixner
Commit-ID:  d05ca13b8d3f685667b3b1748fa89285466270c5
Gitweb: https://git.kernel.org/tip/d05ca13b8d3f685667b3b1748fa89285466270c5
Author: Thomas Gleixner 
AuthorDate: Thu, 21 Dec 2017 11:41:31 +0100
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Jan 2018 02:35:44 +0100

hrtimer: Correct blatantly incorrect comment

The protection of a hrtimer which runs its callback against migration to a
different CPU has nothing to do with hard interrupt context.

The protection against migration of a hrtimer running the expiry callback
is the pointer in the cpu_base which holds a pointer to the currently
running timer. This pointer is evaluated in the code which potentially
switches the timer base and makes sure it's kept on the CPU on which the
callback is running.
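
A minimal sketch of the protection described above, assuming the
in-tree hrtimer_callback_running() helper:

	#include <linux/hrtimer.h>

	/*
	 * Sketch: the base-switching code keeps a timer on its CPU while
	 * its expiry callback runs. hrtimer_callback_running() compares
	 * the timer against the cpu_base's currently-running-timer
	 * pointer, which is set before the callback is invoked and
	 * cleared afterwards, both under the base lock.
	 */
	static bool may_switch_base(struct hrtimer *timer)
	{
		return !hrtimer_callback_running(timer);
	}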

Reported-by: Anna-Maria Gleixner 
Signed-off-by: Thomas Gleixner 
Signed-off-by: Anna-Maria Gleixner 
Reviewed-by: Frederic Weisbecker 
Cc: Christoph Hellwig 
Cc: John Stultz 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: keesc...@chromium.org
Link: http://lkml.kernel.org/r/20171221104205.7269-3-anna-ma...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 kernel/time/hrtimer.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1d06d2b..7687355 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1195,9 +1195,9 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
timer->is_rel = false;
 
/*
-* Because we run timers from hardirq context, there is no chance
-* they get migrated to another cpu, therefore its safe to unlock
-* the timer base.
+* The timer is marked as running in the CPU base, so it is
+* protected against migration to a different CPU even if the lock
+* is dropped.
 */
	raw_spin_unlock(&cpu_base->lock);
trace_hrtimer_expire_entry(timer, now);


[tip:timers/core] ALSA/dummy: Replace tasklet with softirq hrtimer

2018-01-16 Thread tip-bot for Thomas Gleixner
Commit-ID:  b03bbbe08ff04d80136b6aac152954ef308a4909
Gitweb: https://git.kernel.org/tip/b03bbbe08ff04d80136b6aac152954ef308a4909
Author: Thomas Gleixner 
AuthorDate: Thu, 21 Dec 2017 11:42:03 +0100
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Jan 2018 09:51:22 +0100

ALSA/dummy: Replace tasklet with softirq hrtimer

The tasklet is used to defer the execution of snd_pcm_period_elapsed() to
the softirq context. Using the HRTIMER_MODE_SOFT mode invokes the timer
callback in softirq context as well which renders the tasklet useless.

[o-takashi: avoid stall due to a call of hrtimer_cancel() on a callback of 
hrtimer]
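
For reference, a minimal sketch of the softirq hrtimer pattern applied
below; the demo_* names are illustrative and not part of the driver:

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	static struct hrtimer demo_timer;

	/*
	 * With HRTIMER_MODE_REL_SOFT the callback already runs in
	 * softirq context, so work formerly deferred to a tasklet can
	 * run here directly.
	 */
	static enum hrtimer_restart demo_cb(struct hrtimer *t)
	{
		/* ... periodic work, e.g. snd_pcm_period_elapsed() ... */
		hrtimer_forward_now(t, ms_to_ktime(10));
		return HRTIMER_RESTART;
	}

	static void demo_setup(void)
	{
		hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
		demo_timer.function = demo_cb;
		hrtimer_start(&demo_timer, ms_to_ktime(10), HRTIMER_MODE_REL_SOFT);
	}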

Signed-off-by: Thomas Gleixner 
Signed-off-by: Anna-Maria Gleixner 
Reviewed-by: Takashi Iwai 
Cc: Christoph Hellwig 
Cc: Jaroslav Kysela 
Cc: John Stultz 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Takashi Iwai 
Cc: Takashi Sakamoto 
Cc: alsa-de...@alsa-project.org
Cc: keesc...@chromium.org
Link: http://lkml.kernel.org/r/20171221104205.7269-35-anna-ma...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 sound/drivers/dummy.c | 27 ---
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c
index 7b2b1f7..6ad2ff5 100644
--- a/sound/drivers/dummy.c
+++ b/sound/drivers/dummy.c
@@ -375,17 +375,9 @@ struct dummy_hrtimer_pcm {
ktime_t period_time;
atomic_t running;
struct hrtimer timer;
-   struct tasklet_struct tasklet;
struct snd_pcm_substream *substream;
 };
 
-static void dummy_hrtimer_pcm_elapsed(unsigned long priv)
-{
-	struct dummy_hrtimer_pcm *dpcm = (struct dummy_hrtimer_pcm *)priv;
-	if (atomic_read(&dpcm->running))
-		snd_pcm_period_elapsed(dpcm->substream);
-}
-
 static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
 {
	struct dummy_hrtimer_pcm *dpcm;
@@ -393,7 +385,14 @@ static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer)
	dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer);
	if (!atomic_read(&dpcm->running))
		return HRTIMER_NORESTART;
-	tasklet_schedule(&dpcm->tasklet);
+	/*
+	 * In cases of XRUN and draining, this calls .trigger to stop PCM
+	 * substream.
+	 */
+	snd_pcm_period_elapsed(dpcm->substream);
+	if (!atomic_read(&dpcm->running))
+		return HRTIMER_NORESTART;
+
	hrtimer_forward_now(timer, dpcm->period_time);
	return HRTIMER_RESTART;
 }
@@ -403,7 +402,7 @@ static int dummy_hrtimer_start(struct snd_pcm_substream *substream)
	struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
 
	dpcm->base_time = hrtimer_cb_get_time(&dpcm->timer);
-	hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL);
+	hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL_SOFT);
	atomic_set(&dpcm->running, 1);
	return 0;
 }
@@ -413,14 +412,14 @@ static int dummy_hrtimer_stop(struct snd_pcm_substream *substream)
	struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data;
 
	atomic_set(&dpcm->running, 0);
-	hrtimer_cancel(&dpcm->timer);
+	if (!hrtimer_callback_running(&dpcm->timer))
+		hrtimer_cancel(&dpcm->timer);
	return 0;
 }
 
 static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm)
 {
	hrtimer_cancel(&dpcm->timer);
-	tasklet_kill(&dpcm->tasklet);
 }
 
 static snd_pcm_uframes_t
@@ -465,12 +464,10 @@ static int dummy_hrtimer_create(struct snd_pcm_substream *substream)
	if (!dpcm)
		return -ENOMEM;
	substream->runtime->private_data = dpcm;
-	hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
	dpcm->timer.function = dummy_hrtimer_callback;
	dpcm->substream = substream;
	atomic_set(&dpcm->running, 0);
-	tasklet_init(&dpcm->tasklet, dummy_hrtimer_pcm_elapsed,
-		     (unsigned long)dpcm);
	return 0;
 }
 


[tip:timers/core] usb/gadget/NCM: Replace tasklet with softirq hrtimer

2018-01-16 Thread tip-bot for Thomas Gleixner
Commit-ID:  b1a31a5f5f27ff8aba42b545a1c721941f735107
Gitweb: https://git.kernel.org/tip/b1a31a5f5f27ff8aba42b545a1c721941f735107
Author: Thomas Gleixner 
AuthorDate: Thu, 21 Dec 2017 11:42:04 +0100
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Jan 2018 09:51:23 +0100

usb/gadget/NCM: Replace tasklet with softirq hrtimer

The tx_tasklet tasklet is used to invoke the hrtimer (task_timer) in
softirq context. This can also be achieved without the tasklet by using
HRTIMER_MODE_SOFT as the hrtimer mode.

Signed-off-by: Thomas Gleixner 
Signed-off-by: Anna-Maria Gleixner 
Acked-by: Felipe Balbi 
Cc: Christoph Hellwig 
Cc: Felipe Balbi 
Cc: John Stultz 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: keesc...@chromium.org
Cc: linux-...@vger.kernel.org
Link: http://lkml.kernel.org/r/20171221104205.7269-36-anna-ma...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 drivers/usb/gadget/function/f_ncm.c | 30 +++---
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/drivers/usb/gadget/function/f_ncm.c 
b/drivers/usb/gadget/function/f_ncm.c
index c5bce8e..5780fba 100644
--- a/drivers/usb/gadget/function/f_ncm.c
+++ b/drivers/usb/gadget/function/f_ncm.c
@@ -73,9 +73,7 @@ struct f_ncm {
struct sk_buff  *skb_tx_ndp;
u16 ndp_dgram_count;
booltimer_force_tx;
-   struct tasklet_struct   tx_tasklet;
struct hrtimer  task_timer;
-
booltimer_stopping;
 };
 
@@ -1104,7 +1102,7 @@ static struct sk_buff *ncm_wrap_ntb(struct gether *port,
 
/* Delay the timer. */
	hrtimer_start(&ncm->task_timer, TX_TIMEOUT_NSECS,
-		      HRTIMER_MODE_REL);
+		      HRTIMER_MODE_REL_SOFT);
 
/* Add the datagram position entries */
ntb_ndp = skb_put_zero(ncm->skb_tx_ndp, dgram_idx_len);
@@ -1148,17 +1146,15 @@ err:
 }
 
 /*
- * This transmits the NTB if there are frames waiting.
+ * The transmit should only be run if no skb data has been sent
+ * for a certain duration.
  */
-static void ncm_tx_tasklet(unsigned long data)
+static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
 {
-   struct f_ncm*ncm = (void *)data;
-
-   if (ncm->timer_stopping)
-   return;
+   struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
 
/* Only send if data is available. */
-   if (ncm->skb_tx_data) {
+   if (!ncm->timer_stopping && ncm->skb_tx_data) {
ncm->timer_force_tx = true;
 
/* XXX This allowance of a NULL skb argument to ndo_start_xmit
@@ -1171,16 +1167,6 @@ static void ncm_tx_tasklet(unsigned long data)
 
ncm->timer_force_tx = false;
}
-}
-
-/*
- * The transmit should only be run if no skb data has been sent
- * for a certain duration.
- */
-static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data)
-{
-   struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer);
-	tasklet_schedule(&ncm->tx_tasklet);
return HRTIMER_NORESTART;
 }
 
@@ -1513,8 +1499,7 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f)
ncm->port.open = ncm_open;
ncm->port.close = ncm_close;
 
-	tasklet_init(&ncm->tx_tasklet, ncm_tx_tasklet, (unsigned long) ncm);
-	hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
ncm->task_timer.function = ncm_tx_timeout;
 
DBG(cdev, "CDC Network: %s speed IN/%s OUT/%s NOTIFY/%s\n",
@@ -1623,7 +1608,6 @@ static void ncm_unbind(struct usb_configuration *c, struct usb_function *f)
DBG(c->cdev, "ncm unbind\n");
 
	hrtimer_cancel(&ncm->task_timer);
-   tasklet_kill(>tx_tasklet);
 
ncm_string_defs[0].id = 0;
usb_free_all_descriptors(f);


[tip:x86/pti] x86/pti: Fix !PCID and sanitize defines

2018-01-14 Thread tip-bot for Thomas Gleixner
Commit-ID:  f10ee3dcc9f0aba92a5c4c064628be5200765dc2
Gitweb: https://git.kernel.org/tip/f10ee3dcc9f0aba92a5c4c064628be5200765dc2
Author: Thomas Gleixner 
AuthorDate: Sun, 14 Jan 2018 00:23:57 +0100
Committer:  Thomas Gleixner 
CommitDate: Sun, 14 Jan 2018 10:45:53 +0100

x86/pti: Fix !PCID and sanitize defines

The switch to the user space page tables in the low level ASM code
unconditionally sets bit 12 and bit 11 of CR3. Bit 12 switches the base
address of the page directory to the user part, bit 11 switches the
PCID to the PCID associated with the user page tables.

This fails on a machine which lacks PCID support because bit 11 is set in
CR3. Bit 11 is reserved when PCID is inactive.

While the Intel SDM claims that the reserved bits are ignored when PCID is
disabled, the AMD APM states that they should be cleared.

This went unnoticed as the AMD APM was not checked when the code was
developed and reviewed, and test systems with Intel CPUs never failed to
boot. The report is against a CentOS 6 host where the guest fails to boot,
so it's not yet clear whether this is a virt issue or one that can happen on
real hardware too, but that's irrelevant as the AMD APM clearly asks for
clearing the reserved bits.

Make sure that on non-PCID machines bit 11 is not set by the page table
switching code.

Andy suggested renaming the related bits and masks so they clearly
describe what they should be used for, which is done as well for clarity.

That split could have been done with alternatives but the macro hell is
horrible and ugly. This can be done on top if someone cares to remove the
extra orq. For now it's a straightforward fix.
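
A sketch of the bit handling described above, using the renamed bits;
the user_cr3() helper is illustrative only, not the kernel's actual
ASM macros:

	#define PTI_USER_PGTABLE_BIT	12	/* selects the user half of the PGD */
	#define PTI_USER_PCID_BIT	11	/* selects the user PCID */

	/*
	 * Illustrative only: derive the user CR3 value from the kernel
	 * one. Bit 11 is reserved when PCID is disabled, so it may only
	 * be set when the CPU actually supports PCID.
	 */
	static unsigned long user_cr3(unsigned long kernel_cr3, bool have_pcid)
	{
		unsigned long cr3 = kernel_cr3 | (1UL << PTI_USER_PGTABLE_BIT);

		if (have_pcid)
			cr3 |= 1UL << PTI_USER_PCID_BIT;
		return cr3;
	}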

Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches")
Reported-by: Laura Abbott 
Signed-off-by: Thomas Gleixner 
Cc: Peter Zijlstra 
Cc: stable 
Cc: Borislav Petkov 
Cc: Andy Lutomirski 
Cc: Willy Tarreau 
Cc: David Woodhouse 
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801140009150.2371@nanos

---
 arch/x86/entry/calling.h   | 36 ++
 arch/x86/include/asm/processor-flags.h |  2 +-
 arch/x86/include/asm/tlbflush.h|  6 +++---
 3 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 45a63e0..3f48f69 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -198,8 +198,11 @@ For 32-bit we have the following conventions - kernel is built with
  * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
  * halves:
  */
-#define PTI_SWITCH_PGTABLES_MASK   (1<

[tip:x86/pti] x86/mce: Make machine check speculation protected

2018-01-19 Thread tip-bot for Thomas Gleixner
Commit-ID:  6f41c34d69eb005e7848716bbcafc979b35037d5
Gitweb: https://git.kernel.org/tip/6f41c34d69eb005e7848716bbcafc979b35037d5
Author: Thomas Gleixner 
AuthorDate: Thu, 18 Jan 2018 16:28:26 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 19 Jan 2018 16:31:28 +0100

x86/mce: Make machine check speculation protected

The machine check idtentry uses an indirect branch directly from the low
level code. This evades the speculation protection.

Replace it by a direct call into C code and issue the indirect call there
so the compiler can apply the proper speculation protection.

Signed-off-by: Thomas Gleixner 
Reviewed-by: Borislav Petkov 
Reviewed-by: David Woodhouse 
Niced-by: Peter Zijlstra 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801181626290.1847@nanos

---
 arch/x86/entry/entry_64.S| 2 +-
 arch/x86/include/asm/traps.h | 1 +
 arch/x86/kernel/cpu/mcheck/mce.c | 5 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d54a0ed..63f4320 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1258,7 +1258,7 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
 #endif
 
 #ifdef CONFIG_X86_MCE
-idtentry machine_check		has_error_code=0	paranoid=1 do_sym=*machine_check_vector(%rip)
+idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 #endif
 
 /*
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 31051f3..3de6933 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -88,6 +88,7 @@ dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
 #ifdef CONFIG_X86_32
 dotraplinkage void do_iret_error(struct pt_regs *, long);
 #endif
+dotraplinkage void do_mce(struct pt_regs *, long);
 
 static inline int get_si_code(unsigned long condition)
 {
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3b413065..a9e898b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1788,6 +1788,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code)
 void (*machine_check_vector)(struct pt_regs *, long error_code) =
unexpected_machine_check;
 
+dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
+{
+   machine_check_vector(regs, error_code);
+}
+
 /*
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off:


[tip:x86/pti] x86/retpoline: Remove compile time warning

2018-01-14 Thread tip-bot for Thomas Gleixner
Commit-ID:  b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6
Gitweb: https://git.kernel.org/tip/b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6
Author: Thomas Gleixner 
AuthorDate: Sun, 14 Jan 2018 22:13:29 +0100
Committer:  Thomas Gleixner 
CommitDate: Sun, 14 Jan 2018 22:29:36 +0100

x86/retpoline: Remove compile time warning

Remove the compile time warning when CONFIG_RETPOLINE=y and the compiler
does not have retpoline support. Linus' rationale for this is:

  It's wrong because it will just make people turn off RETPOLINE, and the
  asm updates - and return stack clearing - that are independent of the
  compiler are likely the most important parts because they are likely the
  ones easiest to target.

  And it's annoying because most people won't be able to do anything about
  it. The number of people building their own compiler? Very small. So if
  their distro hasn't got a compiler yet (and pretty much nobody does), the
  warning is just annoying crap.

  It is already properly reported as part of the sysfs interface. The
  compile-time warning only encourages bad things.

Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support")
Requested-by: Linus Torvalds 
Signed-off-by: Thomas Gleixner 
Cc: David Woodhouse 
Cc: Peter Zijlstra (Intel) 
Cc: gno...@lxorguk.ukuu.org.uk
Cc: Rik van Riel 
Cc: Andi Kleen 
Cc: Josh Poimboeuf 
Cc: thomas.lenda...@amd.com
Cc: Linus Torvalds 
Cc: Jiri Kosina 
Cc: Andy Lutomirski 
Cc: Dave Hansen 
Cc: Kees Cook 
Cc: Tim Chen 
Cc: Greg Kroah-Hartman 
Link: 
https://lkml.kernel.org/r/CA+55aFzWgquv4i6Mab6bASqYXg3ErV3XDFEYf=geccdqg5u...@mail.gmail.com
---
 arch/x86/Makefile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 974c618..504b1a4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -240,8 +240,6 @@ ifdef CONFIG_RETPOLINE
 RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
 ifneq ($(RETPOLINE_CFLAGS),)
 KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
-else
-$(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.)
 endif
 endif
 


[tip:timers/urgent] hrtimer: Reset hrtimer cpu base proper on CPU hotplug

2018-01-27 Thread tip-bot for Thomas Gleixner
Commit-ID:  d5421ea43d30701e03cadc56a38854c36a8b4433
Gitweb: https://git.kernel.org/tip/d5421ea43d30701e03cadc56a38854c36a8b4433
Author: Thomas Gleixner 
AuthorDate: Fri, 26 Jan 2018 14:54:32 +0100
Committer:  Thomas Gleixner 
CommitDate: Sat, 27 Jan 2018 15:12:22 +0100

hrtimer: Reset hrtimer cpu base proper on CPU hotplug

The hrtimer interrupt code contains a hang detection and mitigation
mechanism, which prevents a long delayed hrtimer interrupt from causing
continuous retriggering of interrupts that would prevent the system from
making progress. If a hang is detected then the timer hardware is programmed with
a certain delay into the future and a flag is set in the hrtimer cpu base
which prevents newly enqueued timers from reprogramming the timer hardware
prior to the chosen delay. The subsequent hrtimer interrupt after the delay
clears the flag and resumes normal operation.

If such a hang happens in the last hrtimer interrupt before a CPU is
unplugged then the hang_detected flag is set and stays that way when the
CPU is plugged in again. At that point the timer hardware is not armed and
it cannot be armed because the hang_detected flag is still active, so
nothing clears that flag. As a consequence the CPU does not receive hrtimer
interrupts and no timers expire on that CPU which results in RCU stalls and
other malfunctions.

Clear the flag along with some other less critical members of the hrtimer
cpu base to ensure starting from a clean state when a CPU is plugged in.

Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the
root cause of that hard to reproduce heisenbug. Once understood it's
trivial and certainly justifies a brown paperbag.

Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic")
Reported-by: Paul E. McKenney 
Signed-off-by: Thomas Gleixner 
Cc: Peter Zijlstra 
Cc: Sebastian Sewior 
Cc: Anna-Maria Gleixner 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos

---
 kernel/time/hrtimer.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d325208..aa9d2a2b 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer,
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
 {
base->expires_next = KTIME_MAX;
+   base->hang_detected = 0;
base->hres_active = 0;
+   base->next_timer = NULL;
 }
 
 /*
@@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
		timerqueue_init_head(&cpu_base->clock_base[i].active);
}
 
+   cpu_base->active_bases = 0;
cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
return 0;


[tip:irq/urgent] genirq: Make legacy autoprobing work again

2018-01-31 Thread tip-bot for Thomas Gleixner
Commit-ID:  55595980acc3232b018ba30df8ee6e0ac40ad184
Gitweb: https://git.kernel.org/tip/55595980acc3232b018ba30df8ee6e0ac40ad184
Author: Thomas Gleixner 
AuthorDate: Tue, 30 Jan 2018 19:36:32 +0100
Committer:  Thomas Gleixner 
CommitDate: Wed, 31 Jan 2018 10:52:06 +0100

genirq: Make legacy autoprobing work again

Meelis reported the following warning on a quad P3 HP NetServer museum piece:
 
WARNING: CPU: 3 PID: 258 at kernel/irq/chip.c:244 __irq_startup+0x80/0x100
EIP: __irq_startup+0x80/0x100
irq_startup+0x7e/0x170
probe_irq_on+0x128/0x2b0
parport_irq_probe.constprop.18+0x8d/0x1af [parport_pc]
parport_pc_probe_port+0xf11/0x1260 [parport_pc]
parport_pc_init+0x78a/0xf10 [parport_pc]
parport_parse_param.constprop.16+0xf0/0xf0 [parport_pc]
do_one_initcall+0x45/0x1e0

This is caused by the rewrite of the irq activation/startup sequence, which
failed to convert a callsite in the irq legacy auto probing code.

To fix this, irq_activate_and_startup() needs to gain a return value so the
pending logic can work properly.

Fixes: c942cee46bba ("genirq: Separate activation and startup")
Reported-by: Meelis Roos 
Signed-off-by: Thomas Gleixner 
Tested-by: Meelis Roos 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801301935410.1797@nanos

---
 kernel/irq/autoprobe.c | 2 +-
 kernel/irq/chip.c  | 4 ++--
 kernel/irq/internals.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 4e8089b..8c82ea2 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -71,7 +71,7 @@ unsigned long probe_irq_on(void)
		raw_spin_lock_irq(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
-   if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE))
+   if (irq_activate_and_startup(desc, IRQ_NORESEND))
desc->istate |= IRQS_PENDING;
}
		raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 043bfc3..f681c0e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc)
return 0;
 }
 
-void irq_activate_and_startup(struct irq_desc *desc, bool resend)
+int irq_activate_and_startup(struct irq_desc *desc, bool resend)
 {
if (WARN_ON(irq_activate(desc)))
return;
-   irq_startup(desc, resend, IRQ_START_FORCE);
+   return irq_startup(desc, resend, IRQ_START_FORCE);
 }
 
 static void __irq_disable(struct irq_desc *desc, bool mask);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ab19371..ca6afa2 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc);
 #define IRQ_START_COND false
 
 extern int irq_activate(struct irq_desc *desc);
-extern void irq_activate_and_startup(struct irq_desc *desc, bool resend);
+extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
 extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
 
 extern void irq_shutdown(struct irq_desc *desc);


[tip:irq/urgent] genirq: Make legacy autoprobing work again

2018-02-01 Thread tip-bot for Thomas Gleixner
Commit-ID:  9bc43be5151aaf1aa87f832128f1687341f07483
Gitweb: https://git.kernel.org/tip/9bc43be5151aaf1aa87f832128f1687341f07483
Author: Thomas Gleixner 
AuthorDate: Tue, 30 Jan 2018 19:36:32 +0100
Committer:  Thomas Gleixner 
CommitDate: Thu, 1 Feb 2018 10:54:48 +0100

genirq: Make legacy autoprobing work again

Meelis reported the following warning on a quad P3 HP NetServer museum piece:

WARNING: CPU: 3 PID: 258 at kernel/irq/chip.c:244 __irq_startup+0x80/0x100
EIP: __irq_startup+0x80/0x100
irq_startup+0x7e/0x170
probe_irq_on+0x128/0x2b0
parport_irq_probe.constprop.18+0x8d/0x1af [parport_pc]
parport_pc_probe_port+0xf11/0x1260 [parport_pc]
parport_pc_init+0x78a/0xf10 [parport_pc]
parport_parse_param.constprop.16+0xf0/0xf0 [parport_pc]
do_one_initcall+0x45/0x1e0

This is caused by the rewrite of the irq activation/startup sequence, which
failed to convert a callsite in the irq legacy auto probing code.

To fix this, irq_activate_and_startup() needs to gain a return value so the
pending logic can work properly.

Fixes: c942cee46bba ("genirq: Separate activation and startup")
Reported-by: Meelis Roos 
Signed-off-by: Thomas Gleixner 
Tested-by: Meelis Roos 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801301935410.1797@nanos
---
 kernel/irq/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f681c0e..c69357a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -297,7 +297,7 @@ int irq_activate(struct irq_desc *desc)
 int irq_activate_and_startup(struct irq_desc *desc, bool resend)
 {
if (WARN_ON(irq_activate(desc)))
-   return;
+   return 0;
return irq_startup(desc, resend, IRQ_START_FORCE);
 }
 


[tip:irq/urgent] genirq: Make legacy autoprobing work again

2018-02-01 Thread tip-bot for Thomas Gleixner
Commit-ID:  1beaeacdc88b537703d04d5536235d0bbb36db93
Gitweb: https://git.kernel.org/tip/1beaeacdc88b537703d04d5536235d0bbb36db93
Author: Thomas Gleixner 
AuthorDate: Tue, 30 Jan 2018 19:36:32 +0100
Committer:  Thomas Gleixner 
CommitDate: Thu, 1 Feb 2018 11:09:40 +0100

genirq: Make legacy autoprobing work again

Meelis reported the following warning on a quad P3 HP NetServer museum piece:

WARNING: CPU: 3 PID: 258 at kernel/irq/chip.c:244 __irq_startup+0x80/0x100
EIP: __irq_startup+0x80/0x100
irq_startup+0x7e/0x170
probe_irq_on+0x128/0x2b0
parport_irq_probe.constprop.18+0x8d/0x1af [parport_pc]
parport_pc_probe_port+0xf11/0x1260 [parport_pc]
parport_pc_init+0x78a/0xf10 [parport_pc]
parport_parse_param.constprop.16+0xf0/0xf0 [parport_pc]
do_one_initcall+0x45/0x1e0

This is caused by the rewrite of the irq activation/startup sequence, which
failed to convert a callsite in the irq legacy auto probing code.

To fix this, irq_activate_and_startup() needs to gain a return value so the
pending logic can work properly.

Fixes: c942cee46bba ("genirq: Separate activation and startup")
Reported-by: Meelis Roos 
Signed-off-by: Thomas Gleixner 
Tested-by: Meelis Roos 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801301935410.1797@nanos
---
 kernel/irq/autoprobe.c | 2 +-
 kernel/irq/chip.c  | 6 +++---
 kernel/irq/internals.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 4e8089b..8c82ea2 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -71,7 +71,7 @@ unsigned long probe_irq_on(void)
		raw_spin_lock_irq(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
-   if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE))
+   if (irq_activate_and_startup(desc, IRQ_NORESEND))
desc->istate |= IRQS_PENDING;
}
		raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 043bfc3..c69357a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc)
return 0;
 }
 
-void irq_activate_and_startup(struct irq_desc *desc, bool resend)
+int irq_activate_and_startup(struct irq_desc *desc, bool resend)
 {
if (WARN_ON(irq_activate(desc)))
-   return;
-   irq_startup(desc, resend, IRQ_START_FORCE);
+   return 0;
+   return irq_startup(desc, resend, IRQ_START_FORCE);
 }
 
 static void __irq_disable(struct irq_desc *desc, bool mask);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ab19371..ca6afa2 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc);
 #define IRQ_START_COND false
 
 extern int irq_activate(struct irq_desc *desc);
-extern void irq_activate_and_startup(struct irq_desc *desc, bool resend);
+extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
 extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
 
 extern void irq_shutdown(struct irq_desc *desc);


[tip:x86/urgent] x86/apic/vector: Handle vector release on CPU unplug correctly

2018-02-22 Thread tip-bot for Thomas Gleixner
Commit-ID:  c16721c5cece64bfe12cdc302a0228026d8089d7
Gitweb: https://git.kernel.org/tip/c16721c5cece64bfe12cdc302a0228026d8089d7
Author: Thomas Gleixner 
AuthorDate: Thu, 22 Feb 2018 12:08:06 +0100
Committer:  Thomas Gleixner 
CommitDate: Thu, 22 Feb 2018 22:25:50 +0100

x86/apic/vector: Handle vector release on CPU unplug correctly

When an irq vector is replaced, the previous vector is normally
released when the first interrupt happens on the new vector. If the target
CPU of the previous vector is already offline when the new vector is
installed, then the previous vector is silently discarded, which leads to
accounting issues causing suspend failures and other problems.

Adjust the logic so that the previous vector is freed in the underlying
matrix allocator to ensure that the accounting stays correct.

Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment")
Reported-by: Yuriy Vostrikov 
Signed-off-by: Thomas Gleixner 
Tested-by: Yuriy Vostrikov 
Cc: Peter Zijlstra 
Cc: Randy Dunlap 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20180222112316.930791...@linutronix.de
---
 arch/x86/kernel/apic/vector.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3cc471b..a82ea2e 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -134,21 +134,40 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
 {
struct apic_chip_data *apicd = apic_chip_data(irqd);
struct irq_desc *desc = irq_data_to_desc(irqd);
+   bool managed = irqd_affinity_is_managed(irqd);
 
	lockdep_assert_held(&vector_lock);
 
trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector,
apicd->cpu);
 
-   /* Setup the vector move, if required  */
-   if (apicd->vector && cpu_online(apicd->cpu)) {
+   /*
+* If there is no vector associated or if the associated vector is
+* the shutdown vector, which is associated to make PCI/MSI
+* shutdown mode work, then there is nothing to release. Clear out
+* prev_vector for this and the offlined target case.
+*/
+   apicd->prev_vector = 0;
+   if (!apic->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR)
+   goto setnew;
+   /*
+* If the target CPU of the previous vector is online, then mark
+* the vector as move in progress and store it for cleanup when the
+* first interrupt on the new vector arrives. If the target CPU is
+* offline then the regular release mechanism via the cleanup
+* vector is not possible and the vector can be immediately freed
+* in the underlying matrix allocator.
+*/
+   if (cpu_online(apicd->cpu)) {
apicd->move_in_progress = true;
apicd->prev_vector = apicd->vector;
apicd->prev_cpu = apicd->cpu;
} else {
-   apicd->prev_vector = 0;
+   irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector,
+   managed);
}
 
+setnew:
apicd->vector = newvec;
apicd->cpu = newcpu;
BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));


[tip:x86/urgent] x86/apic/vector: Handle vector release on CPU unplug correctly

2018-02-22 Thread tip-bot for Thomas Gleixner
Commit-ID:  f60606c4ce402963dc552c62910ffa7080b4a628
Gitweb: https://git.kernel.org/tip/f60606c4ce402963dc552c62910ffa7080b4a628
Author: Thomas Gleixner 
AuthorDate: Thu, 22 Feb 2018 12:08:06 +0100
Committer:  Thomas Gleixner 
CommitDate: Thu, 22 Feb 2018 22:05:44 +0100

x86/apic/vector: Handle vector release on CPU unplug correctly

When an irq vector is replaced, the previous vector is normally
released when the first interrupt happens on the new vector. If the target
CPU of the previous vector is already offline when the new vector is
installed, then the previous vector is silently discarded, which leads to
accounting issues causing suspend failures and other problems.

Adjust the logic so that the previous vector is freed in the underlying
matrix allocator to ensure that the accounting stays correct.

Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment")
Reported-by: Yuriy Vostrikov 
Signed-off-by: Thomas Gleixner 
Tested-by: Yuriy Vostrikov 
Cc: Peter Zijlstra 
Cc: Randy Dunlap 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20180222112316.930791...@linutronix.de

---
 arch/x86/kernel/apic/vector.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3cc471b..a82ea2e 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -134,21 +134,40 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
 {
struct apic_chip_data *apicd = apic_chip_data(irqd);
struct irq_desc *desc = irq_data_to_desc(irqd);
+   bool managed = irqd_affinity_is_managed(irqd);
 
	lockdep_assert_held(&vector_lock);
 
trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector,
apicd->cpu);
 
-   /* Setup the vector move, if required  */
-   if (apicd->vector && cpu_online(apicd->cpu)) {
+   /*
+* If there is no vector associated or if the associated vector is
+* the shutdown vector, which is associated to make PCI/MSI
+* shutdown mode work, then there is nothing to release. Clear out
+* prev_vector for this and the offlined target case.
+*/
+   apicd->prev_vector = 0;
+   if (!apic->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR)
+   goto setnew;
+   /*
+* If the target CPU of the previous vector is online, then mark
+* the vector as move in progress and store it for cleanup when the
+* first interrupt on the new vector arrives. If the target CPU is
+* offline then the regular release mechanism via the cleanup
+* vector is not possible and the vector can be immediately freed
+* in the underlying matrix allocator.
+*/
+   if (cpu_online(apicd->cpu)) {
apicd->move_in_progress = true;
apicd->prev_vector = apicd->vector;
apicd->prev_cpu = apicd->cpu;
} else {
-   apicd->prev_vector = 0;
+   irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector,
+   managed);
}
 
+setnew:
apicd->vector = newvec;
apicd->cpu = newcpu;
BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));


[tip:x86/urgent] genirq/matrix: Handle CPU offlining proper

2018-02-22 Thread tip-bot for Thomas Gleixner
Commit-ID:  651ca2c00405a2ae3870cc0b4f15a182eb6fbe26
Gitweb: https://git.kernel.org/tip/651ca2c00405a2ae3870cc0b4f15a182eb6fbe26
Author: Thomas Gleixner 
AuthorDate: Thu, 22 Feb 2018 12:08:05 +0100
Committer:  Thomas Gleixner 
CommitDate: Thu, 22 Feb 2018 22:05:43 +0100

genirq/matrix: Handle CPU offlining proper

At CPU hotunplug the corresponding per cpu matrix allocator is shut down and
the allocated interrupt bits are discarded under the assumption that all
allocated bits have been either migrated away or shut down through the
managed interrupts mechanism.

This is not true because interrupts which are not started up might have a
vector allocated on the outgoing CPU. When the interrupt is started up
later or completely shut down and freed, then the allocated vector is handed
back, triggering warnings or causing accounting issues which result in
suspend failures and other issues.

Change the CPU hotplug mechanism of the matrix allocator so that the
remaining allocations at unplug time are preserved and global accounting at
hotplug is correctly readjusted to take the dormant vectors into account.

Fixes: 2f75d9e1c905 ("genirq: Implement bitmap matrix allocator")
Reported-by: Yuriy Vostrikov 
Signed-off-by: Thomas Gleixner 
Tested-by: Yuriy Vostrikov 
Cc: Peter Zijlstra 
Cc: Randy Dunlap 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20180222112316.849980...@linutronix.de

---
 kernel/irq/matrix.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 5187dfe..4c57704 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -16,6 +16,7 @@ struct cpumap {
unsigned intavailable;
unsigned intallocated;
unsigned intmanaged;
+   boolinitialized;
boolonline;
unsigned long   alloc_map[IRQ_MATRIX_SIZE];
unsigned long   managed_map[IRQ_MATRIX_SIZE];
@@ -81,9 +82,11 @@ void irq_matrix_online(struct irq_matrix *m)
 
BUG_ON(cm->online);
 
-   bitmap_zero(cm->alloc_map, m->matrix_bits);
-   cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc);
-   cm->allocated = 0;
+   if (!cm->initialized) {
+   cm->available = m->alloc_size;
+   cm->available -= cm->managed + m->systembits_inalloc;
+   cm->initialized = true;
+   }
m->global_available += cm->available;
cm->online = true;
m->online_maps++;
@@ -370,14 +373,16 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
return;
 
-   if (cm->online) {
-   clear_bit(bit, cm->alloc_map);
-   cm->allocated--;
+   clear_bit(bit, cm->alloc_map);
+   cm->allocated--;
+
+   if (cm->online)
m->total_allocated--;
-   if (!managed) {
-   cm->available++;
+
+   if (!managed) {
+   cm->available++;
+   if (cm->online)
m->global_available++;
-   }
}
trace_irq_matrix_free(bit, cpu, m, cm);
 }


[tip:x86/urgent] x86/apic/vector: Handle vector release on CPU unplug correctly

2018-02-22 Thread tip-bot for Thomas Gleixner
Commit-ID:  e84cf6aa501c58bf4bf451f1e425192ec090aed2
Gitweb: https://git.kernel.org/tip/e84cf6aa501c58bf4bf451f1e425192ec090aed2
Author: Thomas Gleixner 
AuthorDate: Thu, 22 Feb 2018 12:08:06 +0100
Committer:  Ingo Molnar 
CommitDate: Fri, 23 Feb 2018 08:02:00 +0100

x86/apic/vector: Handle vector release on CPU unplug correctly

When an irq vector is replaced, the previous vector is normally
released when the first interrupt happens on the new vector. If the target
CPU of the previous vector is already offline when the new vector is
installed, then the previous vector is silently discarded, which leads to
accounting issues causing suspend failures and other problems.

Adjust the logic so that the previous vector is freed in the underlying
matrix allocator to ensure that the accounting stays correct.

Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment")
Reported-by: Yuriy Vostrikov 
Signed-off-by: Thomas Gleixner 
Tested-by: Yuriy Vostrikov 
Cc: Peter Zijlstra 
Cc: Randy Dunlap 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20180222112316.930791...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/apic/vector.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3cc471beb50b..bb6f7a2148d7 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -134,21 +134,40 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
 {
struct apic_chip_data *apicd = apic_chip_data(irqd);
struct irq_desc *desc = irq_data_to_desc(irqd);
+   bool managed = irqd_affinity_is_managed(irqd);
 
	lockdep_assert_held(&vector_lock);
 
trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector,
apicd->cpu);
 
-   /* Setup the vector move, if required  */
-   if (apicd->vector && cpu_online(apicd->cpu)) {
+   /*
+* If there is no vector associated or if the associated vector is
+* the shutdown vector, which is associated to make PCI/MSI
+* shutdown mode work, then there is nothing to release. Clear out
+* prev_vector for this and the offlined target case.
+*/
+   apicd->prev_vector = 0;
+   if (!apicd->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR)
+   goto setnew;
+   /*
+* If the target CPU of the previous vector is online, then mark
+* the vector as move in progress and store it for cleanup when the
+* first interrupt on the new vector arrives. If the target CPU is
+* offline then the regular release mechanism via the cleanup
+* vector is not possible and the vector can be immediately freed
+* in the underlying matrix allocator.
+*/
+   if (cpu_online(apicd->cpu)) {
apicd->move_in_progress = true;
apicd->prev_vector = apicd->vector;
apicd->prev_cpu = apicd->cpu;
} else {
-   apicd->prev_vector = 0;
+   irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector,
+   managed);
}
 
+setnew:
apicd->vector = newvec;
apicd->cpu = newcpu;
BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));


[tip:x86/urgent] x86/apic: Switch all APICs to Fixed delivery mode

2017-12-28 Thread tip-bot for Thomas Gleixner
Commit-ID:  45fa8d89192e4e8e801e67dac3394d6597613e07
Gitweb: https://git.kernel.org/tip/45fa8d89192e4e8e801e67dac3394d6597613e07
Author: Thomas Gleixner 
AuthorDate: Thu, 28 Dec 2017 11:33:33 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 29 Dec 2017 00:21:04 +0100

x86/apic: Switch all APICs to Fixed delivery mode

Some of the APIC incarnations are operating in lowest priority delivery
mode. This worked as long as the vector management code allocated the same
vector on all possible CPUs for each interrupt.

Lowest priority delivery mode does not necessarily respect the affinity
setting and may redirect to some other online CPU. This was documented
somewhere in the old code, and the conversion to single target delivery
failed to update the delivery mode of the affected APIC drivers, which
results in spurious interrupts on some of the affected CPU/Chipset
combinations.

Switch the APIC drivers over to Fixed delivery mode and remove all
leftovers of lowest priority delivery mode.

As a consequence of this change, the apic::irq_delivery_mode field is now
pointless, but this needs to be cleaned up in a separate patch.

Fixes: fdba46ffb4c2 ("x86/apic: Get rid of multi CPU affinity")
Reported-by: vcap...@pengaru.com
Signed-off-by: Thomas Gleixner 
Tested-by: vcap...@pengaru.com
Cc: Pavel Machek 
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712281140440.1688@nanos
---
 arch/x86/kernel/apic/apic_flat_64.c   | 2 +-
 arch/x86/kernel/apic/apic_noop.c  | 2 +-
 arch/x86/kernel/apic/msi.c| 8 ++--
 arch/x86/kernel/apic/probe_32.c   | 2 +-
 arch/x86/kernel/apic/x2apic_cluster.c | 2 +-
 drivers/pci/host/pci-hyperv.c | 8 ++--
 6 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index aa85690..25a8702 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = {
.apic_id_valid  = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
 
-   .irq_delivery_mode  = dest_LowestPrio,
+   .irq_delivery_mode  = dest_Fixed,
.irq_dest_mode  = 1, /* logical */
 
.disable_esr= 0,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 7b659c4..5078b5c 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = {
.apic_id_valid  = default_apic_id_valid,
.apic_id_registered = noop_apic_id_registered,
 
-   .irq_delivery_mode  = dest_LowestPrio,
+   .irq_delivery_mode  = dest_Fixed,
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode  = 1,
 
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 9b18be7..ce503c9 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
((apic->irq_dest_mode == 0) ?
MSI_ADDR_DEST_MODE_PHYSICAL :
MSI_ADDR_DEST_MODE_LOGICAL) |
-   ((apic->irq_delivery_mode != dest_LowestPrio) ?
-   MSI_ADDR_REDIRECTION_CPU :
-   MSI_ADDR_REDIRECTION_LOWPRI) |
+   MSI_ADDR_REDIRECTION_CPU |
MSI_ADDR_DEST_ID(cfg->dest_apicid);
 
msg->data =
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
-   ((apic->irq_delivery_mode != dest_LowestPrio) ?
-   MSI_DATA_DELIVERY_FIXED :
-   MSI_DATA_DELIVERY_LOWPRI) |
+   MSI_DATA_DELIVERY_FIXED |
MSI_DATA_VECTOR(cfg->vector);
 }
 
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fa22017..02e8acb 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = {
.apic_id_valid  = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered,
 
-   .irq_delivery_mode  = dest_LowestPrio,
+   .irq_delivery_mode  = dest_Fixed,
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode  = 1,
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 622f13c..8b04234 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.apic_id_valid  

[tip:x86/pti] x86/pti: Make sure the user/kernel PTEs match

2018-01-03 Thread tip-bot for Thomas Gleixner
Commit-ID:  52994c256df36fda9a715697431cba9daecb6b11
Gitweb: https://git.kernel.org/tip/52994c256df36fda9a715697431cba9daecb6b11
Author: Thomas Gleixner 
AuthorDate: Wed, 3 Jan 2018 15:57:59 +0100
Committer:  Thomas Gleixner 
CommitDate: Wed, 3 Jan 2018 15:57:59 +0100

x86/pti: Make sure the user/kernel PTEs match

Meelis reported that his K8 Athlon64 emits MCE warnings when PTI is
enabled:

[Hardware Error]: Error Addr: 0x81e000e0
[Hardware Error]: MC1 Error: L1 TLB multimatch.
[Hardware Error]: cache level: L1, tx: INSN

The address is in the entry area, which is mapped into kernel _AND_ user
space. That's special because we switch CR3 while we are executing
there. 

User mapping:
0x81e0-0x8200   2M ro PSE GLB x 
 pmd

Kernel mapping:
0x8100-0x8200  16M ro PSE x 
 pmd

So the K8 is complaining that the TLB entries differ. They differ in the
GLB bit.

Drop the GLB bit when installing the user shared mapping.
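
For context, a sketch of the masking, assuming pti_clone_pmds()'s third
argument is a set of PMD flags to clear for the user copy (as the
one-line change below suggests); the user_pmd() helper is illustrative,
not kernel code:

	/*
	 * Clear the given flags from a PMD before installing it into
	 * the user page-table half. With _PAGE_RW | _PAGE_GLOBAL in the
	 * clear set, the user copy of the entry text is read-only and
	 * no longer marked global, so both TLB entries agree on GLB.
	 */
	static pmd_t user_pmd(pmd_t pmd, pmdval_t clear)
	{
		return __pmd(pmd_val(pmd) & ~clear);
	}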

Fixes: 6dc72c3cbca0 ("x86/mm/pti: Share entry text PMD")
Reported-by: Meelis Roos 
Signed-off-by: Thomas Gleixner 
Tested-by: Meelis Roos 
Cc: Borislav Petkov 
Cc: Tom Lendacky 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031407180.1957@nanos
---
 arch/x86/mm/pti.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index bce8aea..2da28ba 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -367,7 +367,8 @@ static void __init pti_setup_espfix64(void)
 static void __init pti_clone_entry_text(void)
 {
pti_clone_pmds((unsigned long) __entry_text_start,
-   (unsigned long) __irqentry_text_end, _PAGE_RW);
+   (unsigned long) __irqentry_text_end,
+  _PAGE_RW | _PAGE_GLOBAL);
 }
 
 /*


[tip:x86/pti] x86/pti: Switch to kernel CR3 at early in entry_SYSCALL_compat()

2018-01-03 Thread tip-bot for Thomas Gleixner
Commit-ID:  d7732ba55c4b6a2da339bb12589c515830cfac2c
Gitweb: https://git.kernel.org/tip/d7732ba55c4b6a2da339bb12589c515830cfac2c
Author: Thomas Gleixner 
AuthorDate: Wed, 3 Jan 2018 19:52:04 +0100
Committer:  Thomas Gleixner 
CommitDate: Wed, 3 Jan 2018 23:19:32 +0100

x86/pti: Switch to kernel CR3 at early in entry_SYSCALL_compat()

The preparation for PTI which added CR3 switching to the entry code
misplaced the CR3 switch in entry_SYSCALL_compat().

With PTI enabled the entry code tries to access a per cpu variable after
switching to kernel GS. This fails because that variable is not mapped to
user space. This results in a double fault and in the worst case a kernel
crash.

Move the switch ahead of the access and clobber RSP which has been saved
already.

Fixes: 8a09317b895f ("x86/mm/pti: Prepare the x86/entry assembly code for 
entry/exit CR3 switching")
Reported-by: Lars Wendler 
Reported-by: Laura Abbott 
Signed-off-by: Thomas Gleixner 
Cc: Borislav Petkov 
Cc: Andy Lutomirski , 
Cc: Dave Hansen , 
Cc: Peter Zijlstra , 
Cc: Greg KH , , 
Cc: Boris Ostrovsky , 
Cc: Juergen Gross 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031949200.1957@nanos

---
 arch/x86/entry/entry_64_compat.S | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 40f1700..98d5358 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -190,8 +190,13 @@ ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
swapgs
 
-   /* Stash user ESP and switch to the kernel stack. */
+   /* Stash user ESP */
movl%esp, %r8d
+
+   /* Use %rsp as scratch reg. User ESP is stashed in r8 */
+   SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+   /* Switch to the kernel stack */
movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
/* Construct struct pt_regs on stack */
@@ -220,12 +225,6 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
pushq   $0  /* pt_regs->r15 = 0 */
 
/*
-* We just saved %rdi so it is safe to clobber.  It is not
-* preserved during the C calls inside TRACE_IRQS_OFF anyway.
-*/
-   SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
-
-   /*
 * User mode is traced as though IRQs are on, and SYSENTER
 * turned them off.
 */


[tip:x86/timers] x86/kvmclock: Remove page size requirement from wall_clock

2018-07-19 Thread tip-bot for Thomas Gleixner
Commit-ID:  7ef363a39514ed8a6f2333fbae1875ac0953715a
Gitweb: https://git.kernel.org/tip/7ef363a39514ed8a6f2333fbae1875ac0953715a
Author: Thomas Gleixner 
AuthorDate: Thu, 19 Jul 2018 16:55:21 -0400
Committer:  Thomas Gleixner 
CommitDate: Fri, 20 Jul 2018 00:02:36 +0200

x86/kvmclock: Remove page size requirement from wall_clock

There is no requirement for wall_clock data to be page aligned or page
sized.

Signed-off-by: Thomas Gleixner 
Signed-off-by: Pavel Tatashin 
Acked-by: Paolo Bonzini 
Cc: steven.sist...@oracle.com
Cc: daniel.m.jor...@oracle.com
Cc: li...@armlinux.org.uk
Cc: schwidef...@de.ibm.com
Cc: heiko.carst...@de.ibm.com
Cc: john.stu...@linaro.org
Cc: sb...@codeaurora.org
Cc: h...@zytor.com
Cc: douly.f...@cn.fujitsu.com
Cc: pet...@infradead.org
Cc: pra...@redhat.com
Cc: feng.t...@intel.com
Cc: pmla...@suse.com
Cc: gno...@lxorguk.ukuu.org.uk
Cc: linux-s...@vger.kernel.org
Cc: boris.ostrov...@oracle.com
Cc: jgr...@suse.com
Link: https://lkml.kernel.org/r/20180719205545.16512-3-pasha.tatas...@oracle.com

---
 arch/x86/kernel/kvmclock.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1f6ac5aaa904..a995d7d7164c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -46,14 +46,12 @@ early_param("no-kvmclock", parse_no_kvmclock);
 
 /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
 #define HV_CLOCK_SIZE  (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
-#define WALL_CLOCK_SIZE(sizeof(struct pvclock_wall_clock))
 
 static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE);
-static u8 wall_clock_mem[PAGE_ALIGN(WALL_CLOCK_SIZE)] __aligned(PAGE_SIZE);
 
 /* The hypervisor will put information about time periodically here */
 static struct pvclock_vsyscall_time_info *hv_clock;
-static struct pvclock_wall_clock *wall_clock;
+static struct pvclock_wall_clock wall_clock;
 
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
@@ -66,15 +64,15 @@ static void kvm_get_wallclock(struct timespec64 *now)
int low, high;
int cpu;
 
-   low = (int)slow_virt_to_phys(wall_clock);
-   high = ((u64)slow_virt_to_phys(wall_clock) >> 32);
+	low = (int)slow_virt_to_phys(&wall_clock);
+	high = ((u64)slow_virt_to_phys(&wall_clock) >> 32);
 
native_write_msr(msr_kvm_wall_clock, low, high);
 
cpu = get_cpu();
 
	vcpu_time = &hv_clock[cpu].pvti;
-	pvclock_read_wallclock(wall_clock, vcpu_time, now);
+	pvclock_read_wallclock(&wall_clock, vcpu_time, now);
 
put_cpu();
 }
@@ -267,12 +265,10 @@ void __init kvmclock_init(void)
} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
return;
 
-   wall_clock = (struct pvclock_wall_clock *)wall_clock_mem;
hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem;
 
if (kvm_register_clock("primary cpu clock")) {
hv_clock = NULL;
-   wall_clock = NULL;
return;
}
 


[tip:x86/timers] x86/kvmclock: Cleanup the code

2018-07-19 Thread tip-bot for Thomas Gleixner
Commit-ID:  146c394d0c3c8e88df433a179c2b0b85fd8cf247
Gitweb: https://git.kernel.org/tip/146c394d0c3c8e88df433a179c2b0b85fd8cf247
Author: Thomas Gleixner 
AuthorDate: Thu, 19 Jul 2018 16:55:23 -0400
Committer:  Thomas Gleixner 
CommitDate: Fri, 20 Jul 2018 00:02:37 +0200

x86/kvmclock: Cleanup the code

- Cleanup the msr write for wall clock. The type casts to (int) are sloppy
  because the wrmsr parameters are u32 and aside from that wrmsrl() already
  provides the high/low split for free.

- Remove the pointless get_cpu()/put_cpu() dance from various
  functions. Either they are called during early init where the CPU is
  guaranteed to be 0, or they are already called from non-preemptible
  context where smp_processor_id() can be used safely.

- Simplify the convoluted check for kvmclock in the init function.

- Mark the parameter parsing function __init. No point in keeping it
  around.

- Convert to pr_info()

Signed-off-by: Thomas Gleixner 
Signed-off-by: Pavel Tatashin 
Acked-by: Paolo Bonzini 
Cc: steven.sist...@oracle.com
Cc: daniel.m.jor...@oracle.com
Cc: li...@armlinux.org.uk
Cc: schwidef...@de.ibm.com
Cc: heiko.carst...@de.ibm.com
Cc: john.stu...@linaro.org
Cc: sb...@codeaurora.org
Cc: h...@zytor.com
Cc: douly.f...@cn.fujitsu.com
Cc: pet...@infradead.org
Cc: pra...@redhat.com
Cc: feng.t...@intel.com
Cc: pmla...@suse.com
Cc: gno...@lxorguk.ukuu.org.uk
Cc: linux-s...@vger.kernel.org
Cc: boris.ostrov...@oracle.com
Cc: jgr...@suse.com
Link: https://lkml.kernel.org/r/20180719205545.16512-5-pasha.tatas...@oracle.com

---
 arch/x86/kernel/kvmclock.c | 72 ++
 1 file changed, 22 insertions(+), 50 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f0a0aef5e9fa..4afb03e49a4f 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -37,7 +37,7 @@ static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
 static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
 static u64 kvm_sched_clock_offset;
 
-static int parse_no_kvmclock(char *arg)
+static int __init parse_no_kvmclock(char *arg)
 {
kvmclock = 0;
return 0;
@@ -61,13 +61,9 @@ static struct pvclock_wall_clock wall_clock;
 static void kvm_get_wallclock(struct timespec64 *now)
 {
struct pvclock_vcpu_time_info *vcpu_time;
-   int low, high;
int cpu;
 
-	low = (int)slow_virt_to_phys(&wall_clock);
-	high = ((u64)slow_virt_to_phys(&wall_clock) >> 32);
-
-	native_write_msr(msr_kvm_wall_clock, low, high);
+	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
 
cpu = get_cpu();
 
@@ -117,11 +113,11 @@ static inline void kvm_sched_clock_init(bool stable)
kvm_sched_clock_offset = kvm_clock_read();
pv_time_ops.sched_clock = kvm_sched_clock_read;
 
-   printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
-   kvm_sched_clock_offset);
+   pr_info("kvm-clock: using sched offset of %llu cycles",
+   kvm_sched_clock_offset);
 
BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
-sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+   sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
 }
 
 /*
@@ -135,16 +131,8 @@ static inline void kvm_sched_clock_init(bool stable)
  */
 static unsigned long kvm_get_tsc_khz(void)
 {
-   struct pvclock_vcpu_time_info *src;
-   int cpu;
-   unsigned long tsc_khz;
-
-   cpu = get_cpu();
-   src = &hv_clock[cpu].pvti;
-   tsc_khz = pvclock_tsc_khz(src);
-   put_cpu();
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
-   return tsc_khz;
+   return pvclock_tsc_khz(&hv_clock[0].pvti);
 }
 
 static void kvm_get_preset_lpj(void)
@@ -161,29 +149,27 @@ static void kvm_get_preset_lpj(void)
 
 bool kvm_check_and_clear_guest_paused(void)
 {
-   bool ret = false;
struct pvclock_vcpu_time_info *src;
-   int cpu = smp_processor_id();
+   bool ret = false;
 
if (!hv_clock)
return ret;
 
-   src = &hv_clock[cpu].pvti;
+   src = &hv_clock[smp_processor_id()].pvti;
if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
src->flags &= ~PVCLOCK_GUEST_STOPPED;
pvclock_touch_watchdogs();
ret = true;
}
-
return ret;
 }
 
 struct clocksource kvm_clock = {
-   .name = "kvm-clock",
-   .read = kvm_clock_get_cycles,
-   .rating = 400,
-   .mask = CLOCKSOURCE_MASK(64),
-   .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+   .name   = "kvm-clock",
+   .read   = kvm_clock_get_cycles,
+   .rating = 400,
+   .mask   = CLOCKSOURCE_MASK(64),
+   .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 EXPORT_SYMBOL_GPL(kvm_clock);
 
@@ -199,7 +185,7 @@ static void kvm_register_clock(char *txt)
src = &hv_clock[cpu].pvti;
pa = slow_virt_to_phys(src) | 0x01ULL;
wrmsrl(msr_kvm_system_time, pa);
-   pr_info("kvm-clock: cpu 

[tip:x86/timers] x86/kvmclock: Decrapify kvm_register_clock()

2018-07-19 Thread tip-bot for Thomas Gleixner
Commit-ID:  7a5ddc8fe0ea9518cd7fb6a929cac7d864c6f300
Gitweb: https://git.kernel.org/tip/7a5ddc8fe0ea9518cd7fb6a929cac7d864c6f300
Author: Thomas Gleixner 
AuthorDate: Thu, 19 Jul 2018 16:55:22 -0400
Committer:  Thomas Gleixner 
CommitDate: Fri, 20 Jul 2018 00:02:36 +0200

x86/kvmclock: Decrapify kvm_register_clock()

The return value is pointless because the wrmsr cannot fail if
KVM_FEATURE_CLOCKSOURCE or KVM_FEATURE_CLOCKSOURCE2 are set.

kvm_register_clock() is only called locally so wants to be static.

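For context, a sketch of the pattern the new code relies on: per the KVM
paravirt clock ABI, bit 0 of the MSR value is the enable flag, which is
free to use because the pvclock structure's physical address is at least
4-byte aligned. The helper name below is hypothetical:

#include <linux/types.h>
#include <asm/msr.h>
#include <asm/pvclock.h>
#include <asm/pgtable_types.h>

/* Sketch of the enable-bit encoding used by kvm_register_clock(). */
static void example_enable_clock(struct pvclock_vsyscall_time_info *src,
				 unsigned int msr)
{
	/* Alignment guarantees the low address bits are zero. */
	u64 pa = slow_virt_to_phys(src) | 0x01ULL;	/* bit 0: enable */

	wrmsrl(msr, pa);	/* one 64-bit write, cannot fail */
}
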
Signed-off-by: Thomas Gleixner 
Signed-off-by: Pavel Tatashin 
Acked-by: Paolo Bonzini 
Cc: steven.sist...@oracle.com
Cc: daniel.m.jor...@oracle.com
Cc: li...@armlinux.org.uk
Cc: schwidef...@de.ibm.com
Cc: heiko.carst...@de.ibm.com
Cc: john.stu...@linaro.org
Cc: sb...@codeaurora.org
Cc: h...@zytor.com
Cc: douly.f...@cn.fujitsu.com
Cc: pet...@infradead.org
Cc: pra...@redhat.com
Cc: feng.t...@intel.com
Cc: pmla...@suse.com
Cc: gno...@lxorguk.ukuu.org.uk
Cc: linux-s...@vger.kernel.org
Cc: boris.ostrov...@oracle.com
Cc: jgr...@suse.com
Link: https://lkml.kernel.org/r/20180719205545.16512-4-pasha.tatas...@oracle.com

---
 arch/x86/include/asm/kvm_para.h |  1 -
 arch/x86/kernel/kvmclock.c  | 33 ++---
 2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 3aea2658323a..4c723632c036 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -7,7 +7,6 @@
 #include 
 
 extern void kvmclock_init(void);
-extern int kvm_register_clock(char *txt);
 
 #ifdef CONFIG_KVM_GUEST
 bool kvm_check_and_clear_guest_paused(void);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index a995d7d7164c..f0a0aef5e9fa 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -187,23 +187,19 @@ struct clocksource kvm_clock = {
 };
 EXPORT_SYMBOL_GPL(kvm_clock);
 
-int kvm_register_clock(char *txt)
+static void kvm_register_clock(char *txt)
 {
-   int cpu = smp_processor_id();
-   int low, high, ret;
struct pvclock_vcpu_time_info *src;
+   int cpu = smp_processor_id();
+   u64 pa;
 
if (!hv_clock)
-   return 0;
+   return;
 
src = &hv_clock[cpu].pvti;
-   low = (int)slow_virt_to_phys(src) | 1;
-   high = ((u64)slow_virt_to_phys(src) >> 32);
-   ret = native_write_msr_safe(msr_kvm_system_time, low, high);
-   printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
-  cpu, high, low, txt);
-
-   return ret;
+   pa = slow_virt_to_phys(src) | 0x01ULL;
+   wrmsrl(msr_kvm_system_time, pa);
+   pr_info("kvm-clock: cpu %d, msr %llx, %s\n", cpu, pa, txt);
 }
 
 static void kvm_save_sched_clock_state(void)
@@ -218,11 +214,7 @@ static void kvm_restore_sched_clock_state(void)
 #ifdef CONFIG_X86_LOCAL_APIC
 static void kvm_setup_secondary_clock(void)
 {
-   /*
-* Now that the first cpu already had this clocksource initialized,
-* we shouldn't fail.
-*/
-   WARN_ON(kvm_register_clock("secondary cpu clock"));
+   kvm_register_clock("secondary cpu clock");
 }
 #endif
 
@@ -265,16 +257,11 @@ void __init kvmclock_init(void)
} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
return;
 
-   hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem;
-
-   if (kvm_register_clock("primary cpu clock")) {
-   hv_clock = NULL;
-   return;
-   }
-
printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
msr_kvm_system_time, msr_kvm_wall_clock);
 
+   hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem;
+   kvm_register_clock("primary cpu clock");
pvclock_set_pvti_cpu0_va(hv_clock);
 
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))


[tip:x86/timers] x86/kvmclock: Move kvmclock vsyscall param and init to kvmclock

2018-07-19 Thread tip-bot for Thomas Gleixner
Commit-ID:  e499a9b6dc488aff7f284bee51936f510ab7ad15
Gitweb: https://git.kernel.org/tip/e499a9b6dc488aff7f284bee51936f510ab7ad15
Author: Thomas Gleixner 
AuthorDate: Thu, 19 Jul 2018 16:55:25 -0400
Committer:  Thomas Gleixner 
CommitDate: Fri, 20 Jul 2018 00:02:37 +0200

x86/kvmclock: Move kvmclock vsyscall param and init to kvmclock

There is no point in having this in the KVM guest code itself and calling it
from there. This can be called from an initcall, and the parameter is cleared
when the hypervisor is not KVM.

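A condensed sketch of the resulting shape (mirroring the patch below;
hv_clock is the file-local pointer from kvmclock.c, and the vsyscall
enabling details are elided):

#include <linux/init.h>

static int kvmclock_vsyscall __initdata = 1;

static int __init parse_no_kvmclock_vsyscall(char *arg)
{
	kvmclock_vsyscall = 0;
	return 0;
}
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);

/* Self-registering: nothing for kvm_guest_init() to call anymore. */
static int __init kvm_setup_vsyscall_timeinfo(void)
{
	/* hv_clock is NULL when the hypervisor is not KVM, so just bail. */
	if (!hv_clock || !kvmclock_vsyscall)
		return 0;
	/* ... check PVCLOCK_TSC_STABLE_BIT, then select VCLOCK_PVCLOCK ... */
	return 0;
}
early_initcall(kvm_setup_vsyscall_timeinfo);
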
Signed-off-by: Thomas Gleixner 
Signed-off-by: Pavel Tatashin 
Acked-by: Paolo Bonzini 
Cc: steven.sist...@oracle.com
Cc: daniel.m.jor...@oracle.com
Cc: li...@armlinux.org.uk
Cc: schwidef...@de.ibm.com
Cc: heiko.carst...@de.ibm.com
Cc: john.stu...@linaro.org
Cc: sb...@codeaurora.org
Cc: h...@zytor.com
Cc: douly.f...@cn.fujitsu.com
Cc: pet...@infradead.org
Cc: pra...@redhat.com
Cc: feng.t...@intel.com
Cc: pmla...@suse.com
Cc: gno...@lxorguk.ukuu.org.uk
Cc: linux-s...@vger.kernel.org
Cc: boris.ostrov...@oracle.com
Cc: jgr...@suse.com
Link: https://lkml.kernel.org/r/20180719205545.16512-7-pasha.tatas...@oracle.com

---
 arch/x86/include/asm/kvm_guest.h |  7 ---
 arch/x86/kernel/kvm.c| 13 
 arch/x86/kernel/kvmclock.c   | 44 
 3 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h
deleted file mode 100644
index 46185263d9c2..
--- a/arch/x86/include/asm/kvm_guest.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_KVM_GUEST_H
-#define _ASM_X86_KVM_GUEST_H
-
-int kvm_setup_vsyscall_timeinfo(void);
-
-#endif /* _ASM_X86_KVM_GUEST_H */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c65c232d3ddd..a560750cc76f 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -45,7 +45,6 @@
 #include 
 #include 
 #include 
-#include 
 
 static int kvmapf = 1;
 
@@ -66,15 +65,6 @@ static int __init parse_no_stealacc(char *arg)
 
 early_param("no-steal-acc", parse_no_stealacc);
 
-static int kvmclock_vsyscall = 1;
-static int __init parse_no_kvmclock_vsyscall(char *arg)
-{
-kvmclock_vsyscall = 0;
-return 0;
-}
-
-early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
-
 static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
 static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);
 static int has_steal_clock = 0;
@@ -560,9 +550,6 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
-   if (kvmclock_vsyscall)
-   kvm_setup_vsyscall_timeinfo();
-
 #ifdef CONFIG_SMP
smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 78aec160f5e0..7d690d2238f8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -27,12 +27,14 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
 #include 
 
 static int kvmclock __initdata = 1;
+static int kvmclock_vsyscall __initdata = 1;
 static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
 static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
 static u64 kvm_sched_clock_offset __ro_after_init;
@@ -44,6 +46,13 @@ static int __init parse_no_kvmclock(char *arg)
 }
 early_param("no-kvmclock", parse_no_kvmclock);
 
+static int __init parse_no_kvmclock_vsyscall(char *arg)
+{
+   kvmclock_vsyscall = 0;
+   return 0;
+}
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
 /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
 #define HV_CLOCK_SIZE  (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
 
@@ -228,6 +237,24 @@ static void kvm_shutdown(void)
native_machine_shutdown();
 }
 
+static int __init kvm_setup_vsyscall_timeinfo(void)
+{
+#ifdef CONFIG_X86_64
+   u8 flags;
+
+   if (!hv_clock || !kvmclock_vsyscall)
+   return 0;
+
+   flags = pvclock_read_flags(&hv_clock[0].pvti);
+   if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+   return 1;
+
+   kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+#endif
+   return 0;
+}
+early_initcall(kvm_setup_vsyscall_timeinfo);
+
 void __init kvmclock_init(void)
 {
u8 flags;
@@ -272,20 +299,3 @@ void __init kvmclock_init(void)
clocksource_register_hz(_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
 }
-
-int __init kvm_setup_vsyscall_timeinfo(void)
-{
-#ifdef CONFIG_X86_64
-   u8 flags;
-
-   if (!hv_clock)
-   return 0;
-
-   flags = pvclock_read_flags(&hv_clock[0].pvti);
-   if (!(flags & PVCLOCK_TSC_STABLE_BIT))
-   return 1;
-
-   kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;

[tip:x86/timers] x86/kvmclock: Mark variables __initdata and __ro_after_init

2018-07-19 Thread tip-bot for Thomas Gleixner
Commit-ID:  42f8df935efefba51d0c5321b1325436523e3377
Gitweb: https://git.kernel.org/tip/42f8df935efefba51d0c5321b1325436523e3377
Author: Thomas Gleixner 
AuthorDate: Thu, 19 Jul 2018 16:55:24 -0400
Committer:  Thomas Gleixner 
CommitDate: Fri, 20 Jul 2018 00:02:37 +0200

x86/kvmclock: Mark variables __initdata and __ro_after_init

The kvmclock parameter is init data and the other variables are not
modified after init.

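For reference, what the two annotations buy (a generic sketch, not part
of the patch):

#include <linux/init.h>
#include <linux/cache.h>

/* .init.data: usable only during boot; the memory is freed afterwards. */
static int boot_only __initdata = 1;

/* .data..ro_after_init: writable during boot, mapped read-only afterwards. */
static int set_once __ro_after_init = 42;
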
Signed-off-by: Thomas Gleixner 
Signed-off-by: Pavel Tatashin 
Acked-by: Paolo Bonzini 
Cc: steven.sist...@oracle.com
Cc: daniel.m.jor...@oracle.com
Cc: li...@armlinux.org.uk
Cc: schwidef...@de.ibm.com
Cc: heiko.carst...@de.ibm.com
Cc: john.stu...@linaro.org
Cc: sb...@codeaurora.org
Cc: h...@zytor.com
Cc: douly.f...@cn.fujitsu.com
Cc: pet...@infradead.org
Cc: pra...@redhat.com
Cc: feng.t...@intel.com
Cc: pmla...@suse.com
Cc: gno...@lxorguk.ukuu.org.uk
Cc: linux-s...@vger.kernel.org
Cc: boris.ostrov...@oracle.com
Cc: jgr...@suse.com
Link: https://lkml.kernel.org/r/20180719205545.16512-6-pasha.tatas...@oracle.com

---
 arch/x86/kernel/kvmclock.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 4afb03e49a4f..78aec160f5e0 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -32,10 +32,10 @@
 #include 
 #include 
 
-static int kvmclock __ro_after_init = 1;
-static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
-static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
-static u64 kvm_sched_clock_offset;
+static int kvmclock __initdata = 1;
+static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
+static u64 kvm_sched_clock_offset __ro_after_init;
 
 static int __init parse_no_kvmclock(char *arg)
 {
@@ -50,7 +50,7 @@ early_param("no-kvmclock", parse_no_kvmclock);
 static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE);
 
 /* The hypervisor will put information about time periodically here */
-static struct pvclock_vsyscall_time_info *hv_clock;
+static struct pvclock_vsyscall_time_info *hv_clock __ro_after_init;
 static struct pvclock_wall_clock wall_clock;
 
 /*


[tip:x86/timers] x86/kvmclock: Switch kvmclock data to a PER_CPU variable

2018-07-19 Thread tip-bot for Thomas Gleixner
Commit-ID:  95a3d4454bb1cf5bfd666c27fdd2dc188e17c14d
Gitweb: https://git.kernel.org/tip/95a3d4454bb1cf5bfd666c27fdd2dc188e17c14d
Author: Thomas Gleixner 
AuthorDate: Thu, 19 Jul 2018 16:55:26 -0400
Committer:  Thomas Gleixner 
CommitDate: Fri, 20 Jul 2018 00:02:38 +0200

x86/kvmclock: Switch kvmclock data to a PER_CPU variable

The previous removal of the memblock dependency from kvmclock introduced a
static data array sized 64 bytes * CONFIG_NR_CPUS. That's wasteful on large
systems when kvmclock is not used.

Replace it with:

 - A static, page-sized array of pvclock data. It's page sized because the
   pvclock data of the boot CPU is mapped into the VDSO; otherwise random
   other data would be exposed to the vDSO.

 - A PER_CPU variable of pvclock data pointers. This is used to access the
   pvclock data storage on each CPU.

The setup is done in two stages (see the sketch after this list):

 - Early boot stores the pointer to the static page for the boot CPU in
   the per cpu data.

 - In the preparatory stage of CPU hotplug assign either an element of
   the static array (when the CPU number is in that range) or allocate
   memory and initialize the per cpu pointer.

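A condensed sketch of the two-stage scheme (paraphrased from the patch;
kvmclock_boot_cpu_init() is a simplified stand-in for what kvmclock_init()
does, and the hotplug callback is trimmed relative to the real one):

#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <asm/pvclock.h>

#define HVC_BOOT_ARRAY_SIZE \
	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))

static struct pvclock_vsyscall_time_info
	hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);

/* Stage 1: early boot wires the boot CPU to the static (VDSO-mapped) page. */
static void __init kvmclock_boot_cpu_init(void)
{
	this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
}

/* Stage 2: hotplug preparation picks a static slot or allocates one. */
static int kvmclock_setup_percpu(unsigned int cpu)
{
	struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu);

	if (p)			/* already initialized (e.g. the boot CPU) */
		return 0;

	p = cpu < HVC_BOOT_ARRAY_SIZE ? &hv_clock_boot[cpu]
				      : kzalloc(sizeof(*p), GFP_KERNEL);
	per_cpu(hv_clock_per_cpu, cpu) = p;
	return p ? 0 : -ENOMEM;
}
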
Signed-off-by: Thomas Gleixner 
Signed-off-by: Pavel Tatashin 
Acked-by: Paolo Bonzini 
Cc: steven.sist...@oracle.com
Cc: daniel.m.jor...@oracle.com
Cc: li...@armlinux.org.uk
Cc: schwidef...@de.ibm.com
Cc: heiko.carst...@de.ibm.com
Cc: john.stu...@linaro.org
Cc: sb...@codeaurora.org
Cc: h...@zytor.com
Cc: douly.f...@cn.fujitsu.com
Cc: pet...@infradead.org
Cc: pra...@redhat.com
Cc: feng.t...@intel.com
Cc: pmla...@suse.com
Cc: gno...@lxorguk.ukuu.org.uk
Cc: linux-s...@vger.kernel.org
Cc: boris.ostrov...@oracle.com
Cc: jgr...@suse.com
Link: https://lkml.kernel.org/r/20180719205545.16512-8-pasha.tatas...@oracle.com

---
 arch/x86/kernel/kvmclock.c | 99 +-
 1 file changed, 62 insertions(+), 37 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 7d690d2238f8..91b94c0ae4e3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -55,12 +56,23 @@ early_param("no-kvmclock-vsyscall", 
parse_no_kvmclock_vsyscall);
 
 /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
 #define HV_CLOCK_SIZE  (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
+#define HVC_BOOT_ARRAY_SIZE \
+   (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
 
-static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE);
-
-/* The hypervisor will put information about time periodically here */
-static struct pvclock_vsyscall_time_info *hv_clock __ro_after_init;
+static struct pvclock_vsyscall_time_info
+   hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
 static struct pvclock_wall_clock wall_clock;
+static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+
+static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
+{
+   return &this_cpu_read(hv_clock_per_cpu)->pvti;
+}
+
+static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
+{
+   return this_cpu_read(hv_clock_per_cpu);
+}
 
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
@@ -69,17 +81,10 @@ static struct pvclock_wall_clock wall_clock;
  */
 static void kvm_get_wallclock(struct timespec64 *now)
 {
-   struct pvclock_vcpu_time_info *vcpu_time;
-   int cpu;
-
wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
-
-   cpu = get_cpu();
-
-   vcpu_time = &hv_clock[cpu].pvti;
-   pvclock_read_wallclock(&wall_clock, vcpu_time, now);
-
-   put_cpu();
+   preempt_disable();
+   pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
+   preempt_enable();
 }
 
 static int kvm_set_wallclock(const struct timespec64 *now)
@@ -89,14 +94,10 @@ static int kvm_set_wallclock(const struct timespec64 *now)
 
 static u64 kvm_clock_read(void)
 {
-   struct pvclock_vcpu_time_info *src;
u64 ret;
-   int cpu;
 
preempt_disable_notrace();
-   cpu = smp_processor_id();
-   src = &hv_clock[cpu].pvti;
-   ret = pvclock_clocksource_read(src);
+   ret = pvclock_clocksource_read(this_cpu_pvti());
preempt_enable_notrace();
return ret;
 }
@@ -141,7 +142,7 @@ static inline void kvm_sched_clock_init(bool stable)
 static unsigned long kvm_get_tsc_khz(void)
 {
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
-   return pvclock_tsc_khz(&hv_clock[0].pvti);
+   return pvclock_tsc_khz(this_cpu_pvti());
 }
 
 static void kvm_get_preset_lpj(void)
@@ -158,15 +159,14 @@ static void kvm_get_preset_lpj(void)
 
 bool kvm_check_and_clear_guest_paused(void)
 {
-   struct pvclock_vcpu_time_info *src;
+   struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
bool ret = false;
 
-   if (!hv_clock)
+   if (!src)
   

[tip:perf/urgent] perf/x86/amd/ibs: Don't access non-started event

2018-07-24 Thread tip-bot for Thomas Gleixner
Commit-ID:  d2753e6b4882a637a0e8fb3b9c2e15f33265300e
Gitweb: https://git.kernel.org/tip/d2753e6b4882a637a0e8fb3b9c2e15f33265300e
Author: Thomas Gleixner 
AuthorDate: Fri, 20 Jul 2018 10:39:07 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 24 Jul 2018 09:51:10 +0200

perf/x86/amd/ibs: Don't access non-started event

Paul Menzel reported the following bug:

> Enabling the undefined behavior sanitizer and building GNU/Linux 4.18-rc5+
> (with some unrelated commits) with GCC 8.1.0 from Debian Sid/unstable, the
> warning below is shown.
>
> > [2.111913]
> > 
> > [2.111917] UBSAN: Undefined behaviour in 
> > arch/x86/events/amd/ibs.c:582:24
> > [2.111919] member access within null pointer of type 'struct perf_event'
> > [2.111926] CPU: 0 PID: 144 Comm: udevadm Not tainted 
> > 4.18.0-rc5-00316-g4864b68cedf2 #104
> > [2.111928] Hardware name: ASROCK E350M1/E350M1, BIOS TIMELESS 01/01/1970
> > [2.111930] Call Trace:
> > [2.111943]  dump_stack+0x55/0x89
> > [2.111949]  ubsan_epilogue+0xb/0x33
> > [2.111953]  handle_null_ptr_deref+0x7f/0x90
> > [2.111958]  __ubsan_handle_type_mismatch_v1+0x55/0x60
> > [2.111964]  perf_ibs_handle_irq+0x596/0x620

The code dereferences event before checking the STARTED bit. Patch
below should cure the issue.

The warning should not trigger, if I analyzed the thing correctly.
(And Paul's testing confirms this.)

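The subtlety is that &event->hw never loads through the pointer; computing
a member address from a NULL base is already undefined behaviour, which is
what UBSAN flags. A generic sketch of the fixed pattern (names made up):

#include <linux/bug.h>
#include <linux/errno.h>

struct widget { int hw; };

static int example_handler(struct widget *w)
{
	int *hwp;

	if (WARN_ON_ONCE(!w))	/* validate the base pointer first */
		return -EINVAL;

	hwp = &w->hw;		/* take member addresses only after the check */
	return *hwp;
}
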
Reported-by: Paul Menzel 
Tested-by: Paul Menzel 
Signed-off-by: Thomas Gleixner 
Cc: Alexander Shishkin 
Cc: Arnaldo Carvalho de Melo 
Cc: Borislav Petkov 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Paul Menzel 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Cc: Vince Weaver 
Link: http://lkml.kernel.org/r/alpine.deb.2.21.1807200958390.1...@nanos.tec.linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/events/amd/ibs.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 4b98101209a1..d50bb4dc0650 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -579,7 +579,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
 {
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
struct perf_event *event = pcpu->event;
-   struct hw_perf_event *hwc = &event->hw;
+   struct hw_perf_event *hwc;
struct perf_sample_data data;
struct perf_raw_record raw;
struct pt_regs regs;
@@ -602,6 +602,10 @@ fail:
return 0;
}
 
+   if (WARN_ON_ONCE(!event))
+   goto fail;
+
+   hwc = &event->hw;
msr = hwc->config_base;
buf = ibs_data.regs;
rdmsrl(msr, *buf);


[tip:smp/urgent] cpu/hotplug: Prevent state corruption on error rollback

2018-09-06 Thread tip-bot for Thomas Gleixner
Commit-ID:  69fa6eb7d6a64801ea261025cce9723d9442d773
Gitweb: https://git.kernel.org/tip/69fa6eb7d6a64801ea261025cce9723d9442d773
Author: Thomas Gleixner 
AuthorDate: Thu, 6 Sep 2018 15:21:38 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 6 Sep 2018 15:21:38 +0200

cpu/hotplug: Prevent state corruption on error rollback

When a teardown callback fails, the CPU hotplug code brings the CPU back to
the previous state. The previous state becomes the new target state. The
rollback happens in undo_cpu_down() which increments the state
unconditionally even if the state is already the same as the target.

As a consequence the next CPU hotplug operation will start at the wrong
state. This is easy to observe when __cpu_disable() fails.

Prevent the unconditional undo by checking the state vs. target before
incrementing state and fix up the consequently wrong conditional in the
unplug code which handles the failure of the final CPU take down on the
control CPU side.

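A toy model of the failure mode (plain C, made-up state values): with the
unconditional increment, state ends up above target even though nothing
was torn down, so the next hotplug operation starts from the wrong state;
the guarded variant leaves the machine consistent:

#include <stdio.h>

struct hp { int state, target; };

/* Rollback as it was: unconditionally step the state back up. */
static void undo_cpu_down(struct hp *st)
{
	st->state++;
}

int main(void)
{
	/* Teardown failed before the state was ever decremented. */
	struct hp st = { .state = 5, .target = 5 };	/* target = prev_state */

	undo_cpu_down(&st);	/* old code: state becomes 6, above target */
	printf("buggy: state=%d target=%d\n", st.state, st.target);

	st.state = 5;			/* reset the toy */
	if (st.state < st.target)	/* fixed: undo only actual movement */
		undo_cpu_down(&st);
	printf("fixed: state=%d target=%d\n", st.state, st.target);
	return 0;
}
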
Fixes: 4dddfb5faa61 ("smp/hotplug: Rewrite AP state machine core")
Reported-by: Neeraj Upadhyay 
Signed-off-by: Thomas Gleixner 
Tested-by: Geert Uytterhoeven 
Tested-by: Sudeep Holla 
Tested-by: Neeraj Upadhyay 
Cc: j...@joshtriplett.org
Cc: pet...@infradead.org
Cc: jiangshan...@gmail.com
Cc: dzic...@redhat.com
Cc: brendan.jack...@arm.com
Cc: ma...@debian.org
Cc: sram...@codeaurora.org
Cc: linux-arm-...@vger.kernel.org
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.deb.2.21.1809051419580.1...@nanos.tec.linutronix.de


---
 kernel/cpu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index eb4041f78073..0097acec1c71 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -916,7 +916,8 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
if (ret) {
st->target = prev_state;
-   undo_cpu_down(cpu, st);
+   if (st->state < prev_state)
+   undo_cpu_down(cpu, st);
break;
}
}
@@ -969,7 +970,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 * to do the further cleanups.
 */
ret = cpuhp_down_callbacks(cpu, st, target);
-   if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
+   if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
cpuhp_reset_state(st, prev_state);
__cpuhp_kick_ap(st);
}

