Re: [PATCH 23/23] x86/fpu: Defer FPU state load until return to userspace

2018-11-08 Thread Sebastian Andrzej Siewior
On 2018-11-07 20:48:58 [+0100], To linux-kernel@vger.kernel.org wrote:
> index 8c03b42416656..d10b60d80d35a 100644
> --- a/arch/x86/include/asm/fpu/api.h
> +++ b/arch/x86/include/asm/fpu/api.h

The bot complained about a missing definition of local_bh_disable(), so I
swapped preempt.h for bottom_half.h at the beginning of this file.
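
For reference, the top of the file then looks roughly like this (a sketch
of the fixup, not a quoted hunk; linux/bottom_half.h pulls in
linux/preempt.h itself, so preempt_disable() stays available):

	/* bottom_half.h provides local_bh_disable()/local_bh_enable()
	 * and includes linux/preempt.h, so preempt.h can simply be
	 * replaced: */
	#include <linux/bottom_half.h>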

> @@ -27,17 +27,31 @@ extern void __kernel_fpu_end(void);
>  extern void kernel_fpu_begin(void);
>  extern void kernel_fpu_end(void);
>  extern bool irq_fpu_usable(void);
> +extern void fpregs_mark_activate(void);
>  
>  static inline void __fpregs_changes_begin(void)
>  {
>   preempt_disable();
> + local_bh_disable();
>  }
>  
>  static inline void __fpregs_changes_end(void)
>  {
> + local_bh_enable();
>   preempt_enable();
>  }

Sebastian


[PATCH 23/23] x86/fpu: Defer FPU state load until return to userspace

2018-11-07 Thread Sebastian Andrzej Siewior
From: Rik van Riel 

Defer loading of FPU state until return to userspace. This gives
the kernel the potential to skip loading FPU state for tasks that
stay in kernel mode, or for tasks that end up with repeated
invocations of kernel_fpu_begin() & kernel_fpu_end().
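
To illustrate the intended win (a sketch only, not part of the patch):
with deferred loading, back-to-back kernel-FPU sections in process
context no longer restore the task's registers in between; the state is
loaded once, on the way back to userspace.

	kernel_fpu_begin();	/* saves the task's state, if still loaded */
	/* ... SIMD work, e.g. crypto or RAID6 ... */
	kernel_fpu_end();	/* no user-state restore here anymore */

	kernel_fpu_begin();	/* nothing left to save */
	/* ... more SIMD work ... */
	kernel_fpu_end();
	/* the task's state is loaded once, in prepare_exit_to_usermode() */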

The __fpregs_changes_{begin|end}() section ensures that the registers
remain unchanged. Otherwise a context switch or a BH could save the
registers to its FPU context, and the processor's FPU registers would
end up with random content if modified at the same time.
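
A sketch of the usage pattern the section protects (illustration only,
the helper name is made up):

	static void example_modify_fpregs(void)
	{
		__fpregs_changes_begin();	/* preemption and BHs off */
		if (test_thread_flag(TIF_NEED_FPU_LOAD))
			switch_fpu_return();	/* load current's state */
		/* ... modify the FPU registers in place ... */
		__fpregs_changes_end();
	}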

KVM swaps the host/guest registers on the entry/exit path. I kept the
flow as is: first it ensures that the registers are loaded, then it
saves the current (host) state before it loads the guest's registers.
The swap is done at the very end with interrupts disabled, so the state
should not change anymore before the guest is entered. The read/save
version seems to be cheaper than a memcpy() in a micro benchmark.
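
Roughly, and purely as an illustration (the kvm/x86.c hunk is not quoted
above; the vcpu->arch field names are my assumption):

	static void kvm_load_guest_fpu_sketch(struct kvm_vcpu *vcpu)
	{
		__fpregs_changes_begin();
		/* make sure the host state is in the registers, save it */
		__fpregs_load_activate();
		copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
		/* then load the guest state */
		copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
		fpregs_mark_activate();
		__fpregs_changes_end();
	}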

Each thread gets TIF_NEED_FPU_LOAD set as part of fork() / fpu__copy().
For kernel threads, this flag is never cleared, which avoids saving /
restoring the FPU state for kernel threads and during in-kernel usage of
the FPU registers. This should fix the performance regression that was
introduced by the removal of the ->initialized field in struct fpu.
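
The fork() side, sketched (fpu__copy()'s new body lives in fpu/core.c
and is not quoted above; this is a paraphrase, not the hunk):

	int fpu__copy(struct task_struct *dst, struct task_struct *src)
	{
		struct fpu *dst_fpu = &dst->thread.fpu;

		/* ... copy src's saved FPU state into dst_fpu->state ... */

		/* the child starts with no live registers; its state is
		 * loaded lazily on the first return to userspace */
		set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
		return 0;
	}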

Signed-off-by: Rik van Riel 
Signed-off-by: Sebastian Andrzej Siewior 
---
 arch/x86/entry/common.c             |  8 +++
 arch/x86/include/asm/fpu/api.h      | 14 +
 arch/x86/include/asm/fpu/internal.h | 31 +
 arch/x86/include/asm/trace/fpu.h    |  5 +-
 arch/x86/kernel/fpu/core.c          | 97 +++--
 arch/x86/kernel/fpu/signal.c        | 49 ---
 arch/x86/kernel/process.c           |  2 +-
 arch/x86/kernel/process_32.c        |  5 +-
 arch/x86/kernel/process_64.c        |  5 +-
 arch/x86/kvm/x86.c                  | 19 --
 10 files changed, 172 insertions(+), 63 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 3b2490b819181..e46fe9aa2934a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include <asm/fpu/api.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
@@ -196,6 +197,13 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
exit_to_usermode_loop(regs, cached_flags);
 
+   /* Reload ti->flags; we may have rescheduled above. */
+   cached_flags = READ_ONCE(ti->flags);
+
+   fpregs_assert_state_consistent();
+   if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
+   switch_fpu_return();
+
 #ifdef CONFIG_COMPAT
/*
 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 8c03b42416656..d10b60d80d35a 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -27,17 +27,31 @@ extern void __kernel_fpu_end(void);
 extern void kernel_fpu_begin(void);
 extern void kernel_fpu_end(void);
 extern bool irq_fpu_usable(void);
+extern void fpregs_mark_activate(void);
 
 static inline void __fpregs_changes_begin(void)
 {
preempt_disable();
+   local_bh_disable();
 }
 
 static inline void __fpregs_changes_end(void)
 {
+   local_bh_enable();
preempt_enable();
 }
 
+#ifdef CONFIG_X86_DEBUG_FPU
+extern void fpregs_assert_state_consistent(void);
+#else
+static inline void fpregs_assert_state_consistent(void) { }
+#endif
+
+/*
+ * Load the task FPU state before returning to userspace.
+ */
+extern void switch_fpu_return(void);
+
 /*
 * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
  *
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 5e86ff60a3a5c..0406f0987697f 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -29,7 +29,7 @@ extern void fpu__prepare_write(struct fpu *fpu);
 extern void fpu__save(struct fpu *fpu);
 extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
 extern void fpu__drop(struct fpu *fpu);
-extern int  fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu);
+extern int  fpu__copy(struct task_struct *dst, struct task_struct *src);
 extern void fpu__clear(struct fpu *fpu);
 extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
 extern int  dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
@@ -436,15 +436,20 @@ static inline void fpregs_activate(struct fpu *fpu)
 /*
  * Load the FPU state for the current task. Call with preemption disabled.
  */
-static inline void __fpregs_load_activate(struct fpu *fpu, int cpu)
+static inline void __fpregs_load_activate(void)
 {
+   struct fpu *fpu = &current->thread.fpu;
+   int cpu = smp_processor_id();
+   int kthread = current->mm == NULL;
+
+   if (kthread)
+   return;
if (!fpregs_state_valid(fpu, cpu)) {
- 
