[PATCH v7 4/6] selftest/sigaltstack: Use the AT_MINSIGSTKSZ aux vector if available

2021-03-16 Thread Chang S. Bae
The SIGSTKSZ constant may not represent enough stack size in some
architectures as the hardware state size grows.

Use getauxval(AT_MINSIGSTKSZ) to increase the stack size.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: linux-kselft...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v5:
* Added as a new patch.
---
 tools/testing/selftests/sigaltstack/sas.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/sigaltstack/sas.c 
b/tools/testing/selftests/sigaltstack/sas.c
index 8934a3766d20..c53b070755b6 100644
--- a/tools/testing/selftests/sigaltstack/sas.c
+++ b/tools/testing/selftests/sigaltstack/sas.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "../kselftest.h"
 
@@ -24,6 +25,11 @@
 #define SS_AUTODISARM  (1U << 31)
 #endif
 
+#ifndef AT_MINSIGSTKSZ
+#define AT_MINSIGSTKSZ 51
+#endif
+
+static unsigned int stack_size;
 static void *sstack, *ustack;
 static ucontext_t uc, sc;
 static const char *msg = "[OK]\tStack preserved";
@@ -47,7 +53,7 @@ void my_usr1(int sig, siginfo_t *si, void *u)
 #endif
 
if (sp < (unsigned long)sstack ||
-   sp >= (unsigned long)sstack + SIGSTKSZ) {
+   sp >= (unsigned long)sstack + stack_size) {
ksft_exit_fail_msg("SP is not on sigaltstack\n");
}
/* put some data on stack. other sighandler will try to overwrite it */
@@ -108,6 +114,10 @@ int main(void)
stack_t stk;
int err;
 
+   /* Make sure more than the required minimum. */
+   stack_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
+   ksft_print_msg("[NOTE]\tthe stack size is %lu\n", stack_size);
+
ksft_print_header();
ksft_set_plan(3);
 
@@ -117,7 +127,7 @@ int main(void)
sigaction(SIGUSR1, , NULL);
act.sa_sigaction = my_usr2;
sigaction(SIGUSR2, , NULL);
-   sstack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+   sstack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
  MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
if (sstack == MAP_FAILED) {
ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
@@ -139,7 +149,7 @@ int main(void)
}
 
stk.ss_sp = sstack;
-   stk.ss_size = SIGSTKSZ;
+   stk.ss_size = stack_size;
stk.ss_flags = SS_ONSTACK | SS_AUTODISARM;
err = sigaltstack(, NULL);
if (err) {
@@ -161,7 +171,7 @@ int main(void)
}
}
 
-   ustack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+   ustack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
  MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
if (ustack == MAP_FAILED) {
ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
@@ -170,7 +180,7 @@ int main(void)
getcontext();
uc.uc_link = NULL;
uc.uc_stack.ss_sp = ustack;
-   uc.uc_stack.ss_size = SIGSTKSZ;
+   uc.uc_stack.ss_size = stack_size;
makecontext(, switch_fn, 0);
raise(SIGUSR1);
 
-- 
2.17.1



[PATCH v7 6/6] selftest/x86/signal: Include test cases for validating sigaltstack

2021-03-16 Thread Chang S. Bae
The test measures the kernel's signal delivery with different (enough vs.
insufficient) stack sizes.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kselft...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Revised test messages again (Borislav Petkov)

Changes from v2:
* Revised test messages (Borislav Petkov)
---
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/sigaltstack.c | 128 ++
 2 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/sigaltstack.c

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index 333980375bc7..65bba2ae86ee 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -13,7 +13,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) 
trivial_program.c -no-pie)
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt 
test_mremap_vdso \
check_initial_reg_state sigreturn iopl ioperm \
test_vsyscall mov_ss_trap \
-   syscall_arg_fault fsgsbase_restore
+   syscall_arg_fault fsgsbase_restore sigaltstack
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
diff --git a/tools/testing/selftests/x86/sigaltstack.c 
b/tools/testing/selftests/x86/sigaltstack.c
new file mode 100644
index ..f689af75e979
--- /dev/null
+++ b/tools/testing/selftests/x86/sigaltstack.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* sigaltstack()-enforced minimum stack */
+#define ENFORCED_MINSIGSTKSZ   2048
+
+#ifndef AT_MINSIGSTKSZ
+#  define AT_MINSIGSTKSZ   51
+#endif
+
+static int nerrs;
+
+static bool sigalrm_expected;
+
+static unsigned long at_minstack_size;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+  int flags)
+{
+   struct sigaction sa;
+
+   memset(, 0, sizeof(sa));
+   sa.sa_sigaction = handler;
+   sa.sa_flags = SA_SIGINFO | flags;
+   sigemptyset(_mask);
+   if (sigaction(sig, , 0))
+   err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+   struct sigaction sa;
+
+   memset(, 0, sizeof(sa));
+   sa.sa_handler = SIG_DFL;
+   sigemptyset(_mask);
+   if (sigaction(sig, , 0))
+   err(1, "sigaction");
+}
+
+static int setup_altstack(void *start, unsigned long size)
+{
+   stack_t ss;
+
+   memset(, 0, sizeof(ss));
+   ss.ss_size = size;
+   ss.ss_sp = start;
+
+   return sigaltstack(, NULL);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+   if (sigalrm_expected) {
+   printf("[FAIL]\tWrong signal delivered: SIGSEGV (expected 
SIGALRM).");
+   nerrs++;
+   } else {
+   printf("[OK]\tSIGSEGV signal delivered.\n");
+   }
+
+   siglongjmp(jmpbuf, 1);
+}
+
+static void sigalrm(int sig, siginfo_t *info, void *ctx_void)
+{
+   if (!sigalrm_expected) {
+   printf("[FAIL]\tWrong signal delivered: SIGALRM (expected 
SIGSEGV).");
+   nerrs++;
+   } else {
+   printf("[OK]\tSIGALRM signal delivered.\n");
+   }
+}
+
+static void test_sigaltstack(void *altstack, unsigned long size)
+{
+   if (setup_altstack(altstack, size))
+   err(1, "sigaltstack()");
+
+   sigalrm_expected = (size > at_minstack_size) ? true : false;
+
+   sethandler(SIGSEGV, sigsegv, 0);
+   sethandler(SIGALRM, sigalrm, SA_ONSTACK);
+
+   if (!sigsetjmp(jmpbuf, 1)) {
+   printf("[RUN]\tTest an alternate signal stack of %ssufficient 
size.\n",
+  sigalrm_expected ? "" : "in");
+   printf("\tRaise SIGALRM. %s is expected to be delivered.\n",
+  sigalrm_expected ? "It" : "SIGSEGV");
+   raise(SIGALRM);
+   }
+
+   clearhandler(SIGALRM);
+   clearhandler(SIGSEGV);
+}
+
+int main(void)
+{
+   void *altstack;
+
+   at_minstack_size = getauxval(AT_MINSIGSTKSZ);
+
+   altstack = mmap(NULL, at_minstack_size + SIGSTKSZ, PROT_READ | 
PROT_WRITE,
+   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+   if (altstack == MAP_FAILED)
+   err(1, "mmap()");
+
+   if ((ENFORCED_MINSIGSTKSZ + 1) < at_minstack_size)
+   test_sigaltstack(altstack, ENFORCED_MINSIGSTKSZ + 1);
+
+   test_sigaltstack(altstack, at_minstack_size + SIGSTKSZ);
+
+   return nerrs == 0 ? 0 : 1;
+}
-- 
2.17.1



[PATCH v7 3/6] x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ

2021-03-16 Thread Chang S. Bae
Historically, signal.h defines MINSIGSTKSZ (2KB) and SIGSTKSZ (8KB), for
use by all architectures with sigaltstack(2). Over time, the hardware state
size grew, but these constants did not evolve. Today, literal use of these
constants on several architectures may result in signal stack overflow, and
thus user data corruption.

A few years ago, the ARM team addressed this issue by establishing
getauxval(AT_MINSIGSTKSZ). This enables the kernel to supply at runtime a
value that is an appropriate replacement on current and future
hardware.

Add getauxval(AT_MINSIGSTKSZ) support to x86, analogous to the support
added for ARM in commit 94b07c1f8c39 ("arm64: signal: Report signal frame
size to userspace via auxv").

Also, include documentation to describe the x86-specific auxiliary vectors.

Reported-by: Florian Weimer 
Fixes: c2bc11f10a39 ("x86, AVX-512: Enable AVX-512 States Context Switch")
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: H.J. Lu 
Cc: Fenghua Yu 
Cc: Dave Martin 
Cc: Michael Ellerman 
Cc: x...@kernel.org
Cc: libc-al...@sourceware.org
Cc: linux-a...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=153531
---
Changes from v6:
* Revised the documentation and fixed the build issue. (Borislav Petkov)
* Fixed the vertical alignment of '\'. (Borislav Petkov)

Changes from v5:
* Added a documentation.
---
 Documentation/x86/elf_auxvec.rst   | 53 ++
 Documentation/x86/index.rst|  1 +
 arch/x86/include/asm/elf.h |  4 +++
 arch/x86/include/uapi/asm/auxvec.h |  4 +--
 arch/x86/kernel/signal.c   |  5 +++
 5 files changed, 65 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/x86/elf_auxvec.rst

diff --git a/Documentation/x86/elf_auxvec.rst b/Documentation/x86/elf_auxvec.rst
new file mode 100644
index ..6c75b26f5efb
--- /dev/null
+++ b/Documentation/x86/elf_auxvec.rst
@@ -0,0 +1,53 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==
+x86-specific ELF Auxiliary Vectors
+==
+
+This document describes the semantics of the x86 auxiliary vectors.
+
+Introduction
+
+
+ELF Auxiliary vectors enable the kernel to efficiently provide
+configuration specific parameters to userspace. In this example, a program
+allocates an alternate stack based on the kernel-provided size::
+
+   #include 
+   #include 
+   #include 
+   #include 
+   #include 
+   #include 
+
+   #ifndef AT_MINSIGSTKSZ
+   #define AT_MINSIGSTKSZ  51
+   #endif
+
+   
+   stack_t ss;
+
+   ss.ss_sp = malloc(ss.ss_size);
+   assert(ss.ss_sp);
+
+   ss.ss_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
+   ss.ss_flags = 0;
+
+   if (sigaltstack(, NULL))
+err(1, "sigaltstack");
+
+
+The exposed auxiliary vectors
+=
+
+AT_SYSINFO is used for locating the vsyscall entry point.  It is not
+exported on 64-bit mode.
+
+AT_SYSINFO_EHDR is the start address of the page containing the vDSO.
+
+AT_MINSIGSTKSZ denotes the minimum stack size required by the kernel to
+deliver a signal to user-space.  AT_MINSIGSTKSZ comprehends the space
+consumed by the kernel to accommodate the user context for the current
+hardware configuration.  It does not comprehend subsequent user-space stack
+consumption, which must be added by the user.  (e.g. Above, user-space adds
+SIGSTKSZ to AT_MINSIGSTKSZ.)
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index 4693e192b447..d58614d5cde6 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -35,3 +35,4 @@ x86-specific Documentation
sva
sgx
features
+   elf_auxvec
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9224d40cdefe..18d9b1117871 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -312,6 +312,7 @@ do {
\
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY);\
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE);\
}   \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   \
 } while (0)
 
 /*
@@ -328,6 +329,7 @@ extern unsigned long task_size_32bit(void);
 extern unsigned long task_size_64bit(int full_addr_space);
 extern unsigned long get_mmap_base(int is_legacy);
 extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
+extern unsigned long get_sigframe_size(void);
 
 #ifdef CONFIG_X86_32
 
@@ -349,6 +351,7 @@ do {
\
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR,\
(uns

[PATCH v7 5/6] x86/signal: Detect and prevent an alternate signal stack overflow

2021-03-16 Thread Chang S. Bae
The kernel pushes context on to the userspace stack to prepare for the
user's signal handler. When the user has supplied an alternate signal
stack, via sigaltstack(2), it is easy for the kernel to verify that the
stack size is sufficient for the current hardware context.

Check if writing the hardware context to the alternate stack will exceed
its size. If so, then instead of corrupting user data and proceeding with
the original signal handler, an immediate SIGSEGV signal is delivered.

Instead of calling on_sig_stack(), directly check whether the new stack
pointer is within the bounds of the alternate stack.

While the kernel allows new source code to discover and use a sufficient
alternate signal stack size, this check is still necessary to protect
binaries with insufficient alternate signal stack size from data
corruption.

Suggested-by: Jann Horn 
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Reviewed-by: Jann Horn 
Cc: Andy Lutomirski 
Cc: Jann Horn 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v5:
* Fixed the overflow check. (Andy Lutomirski)
* Updated the changelog.

Changes from v3:
* Updated the changelog (Borislav Petkov)

Changes from v2:
* Simplified the implementation (Jann Horn)
---
 arch/x86/kernel/signal.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 0d24f64d0145..9a62604fbf63 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -242,7 +242,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
-   int onsigstack = on_sig_stack(sp);
+   bool onsigstack = on_sig_stack(sp);
int ret;
 
/* redzone */
@@ -251,8 +251,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
 
/* This is the X/Open sanctioned signal stack switching.  */
if (ka->sa.sa_flags & SA_ONSTACK) {
-   if (sas_ss_flags(sp) == 0)
+   if (sas_ss_flags(sp) == 0) {
sp = current->sas_ss_sp + current->sas_ss_size;
+   /* On the alternate signal stack */
+   onsigstack = true;
+   }
} else if (IS_ENABLED(CONFIG_X86_32) &&
   !onsigstack &&
   regs->ss != __USER_DS &&
@@ -272,7 +275,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
 * If we are on the alternate signal stack and would overflow it, don't.
 * Return an always-bogus address instead so we will die with SIGSEGV.
 */
-   if (onsigstack && !likely(on_sig_stack(sp)))
+   if (onsigstack && unlikely(sp <= current->sas_ss_sp ||
+  sp - current->sas_ss_sp > 
current->sas_ss_size))
return (void __user *)-1L;
 
/* save i387 and extended state */
-- 
2.17.1



[PATCH v7 2/6] x86/signal: Introduce helpers to get the maximum signal frame size

2021-03-16 Thread Chang S. Bae
Signal frames do not have a fixed format and can vary in size when a number
of things change: support XSAVE features, 32 vs. 64-bit apps. Add the code
to support a runtime method for userspace to dynamically discover how large
a signal stack needs to be.

Introduce a new variable, max_frame_size, and helper functions for the
calculation to be used in a new user interface. Set max_frame_size to a
system-wide worst-case value, instead of storing multiple app-specific
values.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Acked-by: H.J. Lu 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Renamed the fpstate size helper with cleanup (Borislav Petkov)
* Moved the sigframe struct size defines to where used (Borislav Petkov)
* Removed unneeded sentence in the changelog (Borislav Petkov)

Change from v1:
* Took stack alignment into account for sigframe size (Dave Martin)
---
 arch/x86/include/asm/fpu/signal.h |  2 ++
 arch/x86/include/asm/sigframe.h   |  2 ++
 arch/x86/kernel/cpu/common.c  |  3 ++
 arch/x86/kernel/fpu/signal.c  | 19 +++
 arch/x86/kernel/signal.c  | 57 +--
 5 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/fpu/signal.h 
b/arch/x86/include/asm/fpu/signal.h
index 7fb516b6893a..8b6631dffefd 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -29,6 +29,8 @@ unsigned long
 fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
 unsigned long *buf_fx, unsigned long *size);
 
+unsigned long fpu__get_fpstate_size(void);
+
 extern void fpu__init_prepare_fx_sw_frame(void);
 
 #endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 84eab2724875..5b1ed650b124 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -85,4 +85,6 @@ struct rt_sigframe_x32 {
 
 #endif /* CONFIG_X86_64 */
 
+void __init init_sigframe_size(void);
+
 #endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ab640abe26b6..c49ef3ad34dc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "cpu.h"
 
@@ -1334,6 +1335,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 
*c)
 
fpu__init_system(c);
 
+   init_sigframe_size();
+
 #ifdef CONFIG_X86_32
/*
 * Regardless of whether PCID is enumerated, the SDM says
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..dbb304e48f16 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -507,6 +507,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
 
return sp;
 }
+
+unsigned long fpu__get_fpstate_size(void)
+{
+   unsigned long ret = xstate_sigframe_size();
+
+   /*
+* This space is needed on (most) 32-bit kernels, or when a 32-bit
+* app is running on a 64-bit kernel. To keep things simple, just
+* assume the worst case and always include space for 'freg_state',
+* even for 64-bit apps on 64-bit kernels. This wastes a bit of
+* space, but keeps the code simple.
+*/
+   if ((IS_ENABLED(CONFIG_IA32_EMULATION) ||
+IS_ENABLED(CONFIG_X86_32)) && use_fxsr())
+   ret += sizeof(struct fregs_state);
+
+   return ret;
+}
+
 /*
  * Prepare the SW reserved portion of the fxsave memory layout, indicating
  * the presence of the extended state information in the memory layout
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ea794a083c44..800243afd1ef 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -212,6 +212,11 @@ do {   
\
  * Set up a signal frame.
  */
 
+/* x86 ABI requires 16-byte alignment */
+#define FRAME_ALIGNMENT16UL
+
+#define MAX_FRAME_PADDING  (FRAME_ALIGNMENT - 1)
+
 /*
  * Determine which stack to use..
  */
@@ -222,9 +227,9 @@ static unsigned long align_sigframe(unsigned long sp)
 * Align the stack pointer according to the i386 ABI,
 * i.e. so that on function entry ((sp + 4) & 15) == 0.
 */
-   sp = ((sp + 4) & -16ul) - 4;
+   sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
 #else /* !CONFIG_X86_32 */
-   sp = round_down(sp, 16) - 8;
+   sp = round_down(sp, FRAME_ALIGNMENT) - 8;
 #endif
return sp;
 }
@@ -663,6 +668,54 @@ SYSCALL_DEFINE0(rt_sigreturn)
return 0;
 }
 
+/*
+ * There are four different struct types for signal frame: sigframe_ia32,
+ * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+ * -- the largest size. It means the size for 64-bit apps is a bit more
+ * than needed, but this keeps the code simple.
+ */
+#if defined(CONFIG_X86_32) ||

[PATCH v7 1/6] uapi: Define the aux vector AT_MINSIGSTKSZ

2021-03-16 Thread Chang S. Bae
Define the AT_MINSIGSTKSZ in generic Linux. It is already used as generic
ABI in glibc's generic elf.h, and this define will prevent future namespace
conflicts. In particular, x86 is also using this generic definition.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: Carlos O'Donell 
Cc: Dave Martin 
Cc: libc-al...@sourceware.org
Cc: linux-a...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
Change from v6:
* Revised the comment. (Borislav Petkov)

Change from v5:
* Reverted the arm64 change. (Dave Martin and Will Deacon)
* Massaged the changelog.

Change from v4:
* Added as a new patch (Carlos O'Donell)
---
 include/uapi/linux/auxvec.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h
index abe5f2b6581b..c7e502bf5a6f 100644
--- a/include/uapi/linux/auxvec.h
+++ b/include/uapi/linux/auxvec.h
@@ -33,5 +33,8 @@
 
 #define AT_EXECFN  31  /* filename of program */
 
+#ifndef AT_MINSIGSTKSZ
+#define AT_MINSIGSTKSZ 51  /* minimal stack size for signal delivery */
+#endif
 
 #endif /* _UAPI_LINUX_AUXVEC_H */
-- 
2.17.1



[PATCH v7 0/6] x86: Improve Minimum Alternate Stack Size

2021-03-16 Thread Chang S. Bae
During signal entry, the kernel pushes data onto the normal userspace
stack. On x86, the data pushed onto the user stack includes XSAVE state,
which has grown over time as new features and larger registers have been
added to the architecture.

MINSIGSTKSZ is a constant provided in the kernel signal.h headers and
typically distributed in lib-dev(el) packages, e.g. [1]. Its value is
compiled into programs and is part of the user/kernel ABI. The MINSIGSTKSZ
constant indicates to userspace how much data the kernel expects to push on
the user stack, [2][3].

However, this constant is much too small and does not reflect recent
additions to the architecture. For instance, when AVX-512 states are in
use, the signal frame size can be 3.5KB while MINSIGSTKSZ remains 2KB.

The bug report [4] explains this as an ABI issue. The small MINSIGSTKSZ can
cause user stack overflow when delivering a signal.

In this series, we suggest a couple of things:
1. Provide a variable minimum stack size to userspace, as a similar
   approach to [5].
2. Avoid using a too-small alternate stack.

Changes from v6 [11]:
* Updated and fixed the documentation. (Borislav Petkov)
* Revised the AT_MINSIGSTKSZ comment. (Borislav Petkov)

Changes from v5 [10]:
* Fixed the overflow detection. (Andy Lutomirski)
* Reverted the AT_MINSIGSTKSZ removal on arm64. (Dave Martin)
* Added a documentation about the x86 AT_MINSIGSTKSZ.
* Supported the existing sigaltstack test to use the new aux vector.

Changes from v4 [9]:
* Moved the aux vector define to the generic header. (Carlos O'Donell)

Changes from v3 [8]:
* Updated the changelog. (Borislav Petkov)
* Revised the test messages again. (Borislav Petkov)

Changes from v2 [7]:
* Simplified the sigaltstack overflow prevention. (Jann Horn)
* Renamed fpstate size helper with cleanup. (Borislav Petkov)
* Cleaned up the signframe struct size defines. (Borislav Petkov)
* Revised the selftest messages. (Borislav Petkov)
* Revised a changelog. (Borislav Petkov)

Changes from v1 [6]:
* Took stack alignment into account for sigframe size. (Dave Martin)

[1]: 
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/bits/sigstack.h;h=b9dca794da093dc4d41d39db9851d444e1b54d9b;hb=HEAD
[2]: https://www.gnu.org/software/libc/manual/html_node/Signal-Stack.html
[3]: https://man7.org/linux/man-pages/man2/sigaltstack.2.html
[4]: https://bugzilla.kernel.org/show_bug.cgi?id=153531
[5]: 
https://blog.linuxplumbersconf.org/2017/ocw/system/presentations/4671/original/plumbers-dm-2017.pdf
[6]: 
https://lore.kernel.org/lkml/20200929205746.6763-1-chang.seok@intel.com/
[7]: https://lore.kernel.org/lkml/20201119190237.626-1-chang.seok@intel.com/
[8]: 
https://lore.kernel.org/lkml/20201223015312.4882-1-chang.seok@intel.com/
[9]: 
https://lore.kernel.org/lkml/20210115211038.2072-1-chang.seok@intel.com/
[10]: 
https://lore.kernel.org/lkml/20210203172242.29644-1-chang.seok@intel.com/
[11]: 
https://lore.kernel.org/lkml/20210227165911.32757-1-chang.seok@intel.com/

Chang S. Bae (6):
  uapi: Define the aux vector AT_MINSIGSTKSZ
  x86/signal: Introduce helpers to get the maximum signal frame size
  x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ
  selftest/sigaltstack: Use the AT_MINSIGSTKSZ aux vector if available
  x86/signal: Detect and prevent an alternate signal stack overflow
  selftest/x86/signal: Include test cases for validating sigaltstack

 Documentation/x86/elf_auxvec.rst  |  53 +
 Documentation/x86/index.rst   |   1 +
 arch/x86/include/asm/elf.h|   4 +
 arch/x86/include/asm/fpu/signal.h |   2 +
 arch/x86/include/asm/sigframe.h   |   2 +
 arch/x86/include/uapi/asm/auxvec.h|   4 +-
 arch/x86/kernel/cpu/common.c  |   3 +
 arch/x86/kernel/fpu/signal.c  |  19 
 arch/x86/kernel/signal.c  |  72 +++-
 include/uapi/linux/auxvec.h   |   3 +
 tools/testing/selftests/sigaltstack/sas.c |  20 +++-
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/sigaltstack.c | 128 ++
 13 files changed, 300 insertions(+), 13 deletions(-)
 create mode 100644 Documentation/x86/elf_auxvec.rst
 create mode 100644 tools/testing/selftests/x86/sigaltstack.c


base-commit: 1e28eed17697bcf343c6743f0028cc3b5dd88bf0
-- 
2.17.1



[PATCH v6 6/6] selftest/x86/signal: Include test cases for validating sigaltstack

2021-02-27 Thread Chang S. Bae
The test measures the kernel's signal delivery with different (enough vs.
insufficient) stack sizes.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kselft...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Revised test messages again (Borislav Petkov)

Changes from v2:
* Revised test messages (Borislav Petkov)
---
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/sigaltstack.c | 128 ++
 2 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/sigaltstack.c

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index 333980375bc7..65bba2ae86ee 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -13,7 +13,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) 
trivial_program.c -no-pie)
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt 
test_mremap_vdso \
check_initial_reg_state sigreturn iopl ioperm \
test_vsyscall mov_ss_trap \
-   syscall_arg_fault fsgsbase_restore
+   syscall_arg_fault fsgsbase_restore sigaltstack
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
diff --git a/tools/testing/selftests/x86/sigaltstack.c 
b/tools/testing/selftests/x86/sigaltstack.c
new file mode 100644
index ..f689af75e979
--- /dev/null
+++ b/tools/testing/selftests/x86/sigaltstack.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* sigaltstack()-enforced minimum stack */
+#define ENFORCED_MINSIGSTKSZ   2048
+
+#ifndef AT_MINSIGSTKSZ
+#  define AT_MINSIGSTKSZ   51
+#endif
+
+static int nerrs;
+
+static bool sigalrm_expected;
+
+static unsigned long at_minstack_size;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+  int flags)
+{
+   struct sigaction sa;
+
+   memset(, 0, sizeof(sa));
+   sa.sa_sigaction = handler;
+   sa.sa_flags = SA_SIGINFO | flags;
+   sigemptyset(_mask);
+   if (sigaction(sig, , 0))
+   err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+   struct sigaction sa;
+
+   memset(, 0, sizeof(sa));
+   sa.sa_handler = SIG_DFL;
+   sigemptyset(_mask);
+   if (sigaction(sig, , 0))
+   err(1, "sigaction");
+}
+
+static int setup_altstack(void *start, unsigned long size)
+{
+   stack_t ss;
+
+   memset(, 0, sizeof(ss));
+   ss.ss_size = size;
+   ss.ss_sp = start;
+
+   return sigaltstack(, NULL);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+   if (sigalrm_expected) {
+   printf("[FAIL]\tWrong signal delivered: SIGSEGV (expected 
SIGALRM).");
+   nerrs++;
+   } else {
+   printf("[OK]\tSIGSEGV signal delivered.\n");
+   }
+
+   siglongjmp(jmpbuf, 1);
+}
+
+static void sigalrm(int sig, siginfo_t *info, void *ctx_void)
+{
+   if (!sigalrm_expected) {
+   printf("[FAIL]\tWrong signal delivered: SIGALRM (expected 
SIGSEGV).");
+   nerrs++;
+   } else {
+   printf("[OK]\tSIGALRM signal delivered.\n");
+   }
+}
+
+static void test_sigaltstack(void *altstack, unsigned long size)
+{
+   if (setup_altstack(altstack, size))
+   err(1, "sigaltstack()");
+
+   sigalrm_expected = (size > at_minstack_size) ? true : false;
+
+   sethandler(SIGSEGV, sigsegv, 0);
+   sethandler(SIGALRM, sigalrm, SA_ONSTACK);
+
+   if (!sigsetjmp(jmpbuf, 1)) {
+   printf("[RUN]\tTest an alternate signal stack of %ssufficient 
size.\n",
+  sigalrm_expected ? "" : "in");
+   printf("\tRaise SIGALRM. %s is expected to be delivered.\n",
+  sigalrm_expected ? "It" : "SIGSEGV");
+   raise(SIGALRM);
+   }
+
+   clearhandler(SIGALRM);
+   clearhandler(SIGSEGV);
+}
+
+int main(void)
+{
+   void *altstack;
+
+   at_minstack_size = getauxval(AT_MINSIGSTKSZ);
+
+   altstack = mmap(NULL, at_minstack_size + SIGSTKSZ, PROT_READ | 
PROT_WRITE,
+   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+   if (altstack == MAP_FAILED)
+   err(1, "mmap()");
+
+   if ((ENFORCED_MINSIGSTKSZ + 1) < at_minstack_size)
+   test_sigaltstack(altstack, ENFORCED_MINSIGSTKSZ + 1);
+
+   test_sigaltstack(altstack, at_minstack_size + SIGSTKSZ);
+
+   return nerrs == 0 ? 0 : 1;
+}
-- 
2.17.1



[PATCH v6 0/6] x86: Improve Minimum Alternate Stack Size

2021-02-27 Thread Chang S. Bae
During signal entry, the kernel pushes data onto the normal userspace
stack. On x86, the data pushed onto the user stack includes XSAVE state,
which has grown over time as new features and larger registers have been
added to the architecture.

MINSIGSTKSZ is a constant provided in the kernel signal.h headers and
typically distributed in lib-dev(el) packages, e.g. [1]. Its value is
compiled into programs and is part of the user/kernel ABI. The MINSIGSTKSZ
constant indicates to userspace how much data the kernel expects to push on
the user stack, [2][3].

However, this constant is much too small and does not reflect recent
additions to the architecture. For instance, when AVX-512 states are in
use, the signal frame size can be 3.5KB while MINSIGSTKSZ remains 2KB.

The bug report [4] explains this as an ABI issue. The small MINSIGSTKSZ can
cause user stack overflow when delivering a signal.

In this series, we suggest a couple of things:
1. Provide a variable minimum stack size to userspace, as a similar
   approach to [5].
2. Avoid using a too-small alternate stack.

Changes from v5 [10]:
* Fixed the overflow detection. (Andy Lutomirski)
* Reverted the AT_MINSIGSTKSZ removal on arm64. (Dave Martin)
* Added a documentation about the x86 AT_MINSIGSTKSZ.
* Supported the existing sigaltstack test to use the new aux vector.

Changes from v4 [9]:
* Moved the aux vector define to the generic header. (Carlos O'Donell)

Changes from v3 [8]:
* Updated the changelog. (Borislav Petkov)
* Revised the test messages again. (Borislav Petkov)

Changes from v2 [7]:
* Simplified the sigaltstack overflow prevention. (Jann Horn)
* Renamed fpstate size helper with cleanup. (Borislav Petkov)
* Cleaned up the signframe struct size defines. (Borislav Petkov)
* Revised the selftest messages. (Borislav Petkov)
* Revised a changelog. (Borislav Petkov)

Changes from v1 [6]:
* Took stack alignment into account for sigframe size. (Dave Martin)

[1]: 
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/bits/sigstack.h;h=b9dca794da093dc4d41d39db9851d444e1b54d9b;hb=HEAD
[2]: https://www.gnu.org/software/libc/manual/html_node/Signal-Stack.html
[3]: https://man7.org/linux/man-pages/man2/sigaltstack.2.html
[4]: https://bugzilla.kernel.org/show_bug.cgi?id=153531
[5]: 
https://blog.linuxplumbersconf.org/2017/ocw/system/presentations/4671/original/plumbers-dm-2017.pdf
[6]: 
https://lore.kernel.org/lkml/20200929205746.6763-1-chang.seok@intel.com/
[7]: https://lore.kernel.org/lkml/20201119190237.626-1-chang.seok@intel.com/
[8]: 
https://lore.kernel.org/lkml/20201223015312.4882-1-chang.seok@intel.com/
[9]: 
https://lore.kernel.org/lkml/20210115211038.2072-1-chang.seok@intel.com/
[10]: 
https://lore.kernel.org/lkml/20210203172242.29644-1-chang.seok@intel.com/

Chang S. Bae (6):
  uapi: Define the aux vector AT_MINSIGSTKSZ
  x86/signal: Introduce helpers to get the maximum signal frame size
  x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ
  selftest/sigaltstack: Use the AT_MINSIGSTKSZ aux vector if available
  x86/signal: Detect and prevent an alternate signal stack overflow
  selftest/x86/signal: Include test cases for validating sigaltstack

 Documentation/x86/elf_auxvec.rst  |  56 ++
 arch/x86/include/asm/elf.h|   4 +
 arch/x86/include/asm/fpu/signal.h |   2 +
 arch/x86/include/asm/sigframe.h   |   2 +
 arch/x86/include/uapi/asm/auxvec.h|   4 +-
 arch/x86/kernel/cpu/common.c  |   3 +
 arch/x86/kernel/fpu/signal.c  |  19 
 arch/x86/kernel/signal.c  |  72 +++-
 include/uapi/linux/auxvec.h   |   3 +
 tools/testing/selftests/sigaltstack/sas.c |  20 +++-
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/sigaltstack.c | 128 ++
 12 files changed, 302 insertions(+), 13 deletions(-)
 create mode 100644 Documentation/x86/elf_auxvec.rst
 create mode 100644 tools/testing/selftests/x86/sigaltstack.c


base-commit: f40ddce88593482919761f74910f42f4b84c004b
-- 
2.17.1



[PATCH v6 5/6] x86/signal: Detect and prevent an alternate signal stack overflow

2021-02-27 Thread Chang S. Bae
The kernel pushes context on to the userspace stack to prepare for the
user's signal handler. When the user has supplied an alternate signal
stack, via sigaltstack(2), it is easy for the kernel to verify that the
stack size is sufficient for the current hardware context.

Check if writing the hardware context to the alternate stack will exceed
its size. If so, then instead of corrupting user data and proceeding with
the original signal handler, an immediate SIGSEGV signal is delivered.

Instead of calling on_sig_stack(), directly check whether the new stack
pointer is within the bounds of the alternate stack.

While the kernel allows new source code to discover and use a sufficient
alternate signal stack size, this check is still necessary to protect
binaries with insufficient alternate signal stack size from data
corruption.

Suggested-by: Jann Horn 
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Reviewed-by: Jann Horn 
Cc: Andy Lutomirski 
Cc: Jann Horn 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v5:
* Fixed the overflow check. (Andy Lutomirski)
* Updated the changelog.

Changes from v3:
* Updated the changelog (Borislav Petkov)

Changes from v2:
* Simplified the implementation (Jann Horn)
---
 arch/x86/kernel/signal.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 0d24f64d0145..9a62604fbf63 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -242,7 +242,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
-   int onsigstack = on_sig_stack(sp);
+   bool onsigstack = on_sig_stack(sp);
int ret;
 
/* redzone */
@@ -251,8 +251,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
 
/* This is the X/Open sanctioned signal stack switching.  */
if (ka->sa.sa_flags & SA_ONSTACK) {
-   if (sas_ss_flags(sp) == 0)
+   if (sas_ss_flags(sp) == 0) {
sp = current->sas_ss_sp + current->sas_ss_size;
+   /* On the alternate signal stack */
+   onsigstack = true;
+   }
} else if (IS_ENABLED(CONFIG_X86_32) &&
   !onsigstack &&
   regs->ss != __USER_DS &&
@@ -272,7 +275,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
 * If we are on the alternate signal stack and would overflow it, don't.
 * Return an always-bogus address instead so we will die with SIGSEGV.
 */
-   if (onsigstack && !likely(on_sig_stack(sp)))
+   if (onsigstack && unlikely(sp <= current->sas_ss_sp ||
+  sp - current->sas_ss_sp > 
current->sas_ss_size))
return (void __user *)-1L;
 
/* save i387 and extended state */
-- 
2.17.1



[PATCH v6 4/6] selftest/sigaltstack: Use the AT_MINSIGSTKSZ aux vector if available

2021-02-27 Thread Chang S. Bae
The SIGSTKSZ constant may not represent enough stack size in some
architectures as the hardware state size grows.

Use getauxval(AT_MINSIGSTKSZ) to increase the stack size.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: linux-kselft...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v5:
* Added as a new patch.
---
 tools/testing/selftests/sigaltstack/sas.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/sigaltstack/sas.c 
b/tools/testing/selftests/sigaltstack/sas.c
index 8934a3766d20..c53b070755b6 100644
--- a/tools/testing/selftests/sigaltstack/sas.c
+++ b/tools/testing/selftests/sigaltstack/sas.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "../kselftest.h"
 
@@ -24,6 +25,11 @@
 #define SS_AUTODISARM  (1U << 31)
 #endif
 
+#ifndef AT_MINSIGSTKSZ
+#define AT_MINSIGSTKSZ 51
+#endif
+
+static unsigned int stack_size;
 static void *sstack, *ustack;
 static ucontext_t uc, sc;
 static const char *msg = "[OK]\tStack preserved";
@@ -47,7 +53,7 @@ void my_usr1(int sig, siginfo_t *si, void *u)
 #endif
 
if (sp < (unsigned long)sstack ||
-   sp >= (unsigned long)sstack + SIGSTKSZ) {
+   sp >= (unsigned long)sstack + stack_size) {
ksft_exit_fail_msg("SP is not on sigaltstack\n");
}
/* put some data on stack. other sighandler will try to overwrite it */
@@ -108,6 +114,10 @@ int main(void)
stack_t stk;
int err;
 
+   /* Make sure more than the required minimum. */
+   stack_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
+   ksft_print_msg("[NOTE]\tthe stack size is %lu\n", stack_size);
+
ksft_print_header();
ksft_set_plan(3);
 
@@ -117,7 +127,7 @@ int main(void)
sigaction(SIGUSR1, , NULL);
act.sa_sigaction = my_usr2;
sigaction(SIGUSR2, , NULL);
-   sstack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+   sstack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
  MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
if (sstack == MAP_FAILED) {
ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
@@ -139,7 +149,7 @@ int main(void)
}
 
stk.ss_sp = sstack;
-   stk.ss_size = SIGSTKSZ;
+   stk.ss_size = stack_size;
stk.ss_flags = SS_ONSTACK | SS_AUTODISARM;
err = sigaltstack(, NULL);
if (err) {
@@ -161,7 +171,7 @@ int main(void)
}
}
 
-   ustack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+   ustack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
  MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
if (ustack == MAP_FAILED) {
ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
@@ -170,7 +180,7 @@ int main(void)
getcontext();
uc.uc_link = NULL;
uc.uc_stack.ss_sp = ustack;
-   uc.uc_stack.ss_size = SIGSTKSZ;
+   uc.uc_stack.ss_size = stack_size;
makecontext(, switch_fn, 0);
raise(SIGUSR1);
 
-- 
2.17.1



[PATCH v6 3/6] x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ

2021-02-27 Thread Chang S. Bae
Historically, signal.h defines MINSIGSTKSZ (2KB) and SIGSTKSZ (8KB), for
use by all architectures with sigaltstack(2). Over time, the hardware state
size grew, but these constants did not evolve. Today, literal use of these
constants on several architectures may result in signal stack overflow, and
thus user data corruption.

A few years ago, the ARM team addressed this issue by establishing
getauxval(AT_MINSIGSTKSZ). This enables the kernel to supply at runtime
value that is an appropriate replacement on the current and future
hardware.

Add getauxval(AT_MINSIGSTKSZ) support to x86, analogous to the support
added for ARM in commit 94b07c1f8c39 ("arm64: signal: Report signal frame
size to userspace via auxv").

Also, include a documentation to describe x86-specific auxiliary vectors.

Reported-by: Florian Weimer 
Fixes: c2bc11f10a39 ("x86, AVX-512: Enable AVX-512 States Context Switch")
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: H.J. Lu 
Cc: Fenghua Yu 
Cc: Dave Martin 
Cc: Michael Ellerman 
Cc: x...@kernel.org
Cc: libc-al...@sourceware.org
Cc: linux-a...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=153531
---
Changes from v5:
* Added a documentation.
---
 Documentation/x86/elf_auxvec.rst   | 56 ++
 arch/x86/include/asm/elf.h |  4 +++
 arch/x86/include/uapi/asm/auxvec.h |  4 +--
 arch/x86/kernel/signal.c   |  5 +++
 4 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/x86/elf_auxvec.rst

diff --git a/Documentation/x86/elf_auxvec.rst b/Documentation/x86/elf_auxvec.rst
new file mode 100644
index ..751c552c4048
--- /dev/null
+++ b/Documentation/x86/elf_auxvec.rst
@@ -0,0 +1,56 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==
+x86-specific ELF Auxiliary Vectors
+==
+
+This document describes the semantics of the x86 auxiliary vectors.
+
+1. Introduction
+---
+
+ELF Auxiliary vectors enable the kernel to efficiently provide
+configuration-specific parameters to userspace. In this example, a program
+allocates an alternate stack based on the kernel-provided size.
+
+   #include 
+   #include 
+
+   #ifndef AT_MINSIGSTKSZ
+   #define AT_MINSIGSTKSZ  51
+   #endif
+
+   stack_t ss;
+   int err;
+
+   ss.ss_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
+   ss.ss_sp = malloc(ss.ss_size);
+   ...
+
+   err = sigaltstack(, NULL);
+   ...
+
+
+2. The exposed auxiliary vectors
+-
+
+AT_SYSINFO
+The entry point to the system call function in the virtual Dynamic Shared
+Object (vDSO); not exported on 64-bit.
+
+AT_SYSINFO_EHDR
+The start address of the page containing vDSO.
+
+AT_MINSIGSTKSZ
+The minimum stack size required to deliver a signal. It is a calculated
+sigframe size based on the largest possible user context. When programs
+use sigaltstack() to provide an alternate signal stack, that stack must be
+at least this size to function properly on this hardware. Note that this
+is the minimum the kernel needs to correctly deliver a signal to the handler.
+Additional space must be added to handle objects pushed onto the stack
+by the signal handlers, as well as for nested signal delivery.
+
+The purpose of this parameter is to accommodate the different stack
+sizes required by different hardware configurations. E.g., an x86
+system supporting the Advanced Vector Extensions needs at least 8KB more
+than one without it.
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 66bdfe838d61..cd10795c178e 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -312,6 +312,7 @@ do {
\
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY);\
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE);\
}   \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   
\
 } while (0)
 
 /*
@@ -328,6 +329,7 @@ extern unsigned long task_size_32bit(void);
 extern unsigned long task_size_64bit(int full_addr_space);
 extern unsigned long get_mmap_base(int is_legacy);
 extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
+extern unsigned long get_sigframe_size(void);
 
 #ifdef CONFIG_X86_32
 
@@ -349,6 +351,7 @@ do {
\
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR,\
(unsigned long __force)current->mm->context.vdso); \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   

[PATCH v6 2/6] x86/signal: Introduce helpers to get the maximum signal frame size

2021-02-27 Thread Chang S. Bae
Signal frames do not have a fixed format and can vary in size when a number
of things change: support XSAVE features, 32 vs. 64-bit apps. Add the code
to support a runtime method for userspace to dynamically discover how large
a signal stack needs to be.

Introduce a new variable, max_frame_size, and helper functions for the
calculation to be used in a new user interface. Set max_frame_size to a
system-wide worst-case value, instead of storing multiple app-specific
values.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Acked-by: H.J. Lu 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Renamed the fpstate size helper with cleanup (Borislav Petkov)
* Moved the sigframe struct size defines to where used (Borislav Petkov)
* Removed unneeded sentence in the changelog (Borislav Petkov)

Change from v1:
* Took stack alignment into account for sigframe size (Dave Martin)
---
 arch/x86/include/asm/fpu/signal.h |  2 ++
 arch/x86/include/asm/sigframe.h   |  2 ++
 arch/x86/kernel/cpu/common.c  |  3 ++
 arch/x86/kernel/fpu/signal.c  | 19 +++
 arch/x86/kernel/signal.c  | 57 +--
 5 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/fpu/signal.h 
b/arch/x86/include/asm/fpu/signal.h
index 7fb516b6893a..8b6631dffefd 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -29,6 +29,8 @@ unsigned long
 fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
 unsigned long *buf_fx, unsigned long *size);
 
+unsigned long fpu__get_fpstate_size(void);
+
 extern void fpu__init_prepare_fx_sw_frame(void);
 
 #endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 84eab2724875..5b1ed650b124 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -85,4 +85,6 @@ struct rt_sigframe_x32 {
 
 #endif /* CONFIG_X86_64 */
 
+void __init init_sigframe_size(void);
+
 #endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..6954932272d5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "cpu.h"
 
@@ -1331,6 +1332,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 
*c)
 
fpu__init_system(c);
 
+   init_sigframe_size();
+
 #ifdef CONFIG_X86_32
/*
 * Regardless of whether PCID is enumerated, the SDM says
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..dbb304e48f16 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -507,6 +507,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
 
return sp;
 }
+
+unsigned long fpu__get_fpstate_size(void)
+{
+   unsigned long ret = xstate_sigframe_size();
+
+   /*
+* This space is needed on (most) 32-bit kernels, or when a 32-bit
+* app is running on a 64-bit kernel. To keep things simple, just
+* assume the worst case and always include space for 'freg_state',
+* even for 64-bit apps on 64-bit kernels. This wastes a bit of
+* space, but keeps the code simple.
+*/
+   if ((IS_ENABLED(CONFIG_IA32_EMULATION) ||
+IS_ENABLED(CONFIG_X86_32)) && use_fxsr())
+   ret += sizeof(struct fregs_state);
+
+   return ret;
+}
+
 /*
  * Prepare the SW reserved portion of the fxsave memory layout, indicating
  * the presence of the extended state information in the memory layout
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ea794a083c44..800243afd1ef 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -212,6 +212,11 @@ do {   
\
  * Set up a signal frame.
  */
 
+/* x86 ABI requires 16-byte alignment */
+#define FRAME_ALIGNMENT16UL
+
+#define MAX_FRAME_PADDING  (FRAME_ALIGNMENT - 1)
+
 /*
  * Determine which stack to use..
  */
@@ -222,9 +227,9 @@ static unsigned long align_sigframe(unsigned long sp)
 * Align the stack pointer according to the i386 ABI,
 * i.e. so that on function entry ((sp + 4) & 15) == 0.
 */
-   sp = ((sp + 4) & -16ul) - 4;
+   sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
 #else /* !CONFIG_X86_32 */
-   sp = round_down(sp, 16) - 8;
+   sp = round_down(sp, FRAME_ALIGNMENT) - 8;
 #endif
return sp;
 }
@@ -663,6 +668,54 @@ SYSCALL_DEFINE0(rt_sigreturn)
return 0;
 }
 
+/*
+ * There are four different struct types for signal frame: sigframe_ia32,
+ * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+ * -- the largest size. It means the size for 64-bit apps is a bit more
+ * than needed, but this keeps the code simple.
+ */
+#if defined(CONFIG_X86_32) ||

[PATCH v6 1/6] uapi: Define the aux vector AT_MINSIGSTKSZ

2021-02-27 Thread Chang S. Bae
Define the AT_MINSIGSTKSZ in generic Linux. It is already used as generic
ABI in glibc's generic elf.h, and this define will prevent future namespace
conflicts. In particular, x86 is also using this generic definition.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: Carlos O'Donell 
Cc: Dave Martin 
Cc: libc-al...@sourceware.org
Cc: linux-a...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
Change from v5:
* Reverted the arm64 change. (Dave Martin)
* Massaged the changelog.

Change from v4:
* Added as a new patch (Carlos O'Donell)
---
 include/uapi/linux/auxvec.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h
index abe5f2b6581b..15be98c75174 100644
--- a/include/uapi/linux/auxvec.h
+++ b/include/uapi/linux/auxvec.h
@@ -33,5 +33,8 @@
 
 #define AT_EXECFN  31  /* filename of program */
 
+#ifndef AT_MINSIGSTKSZ
+#define AT_MINSIGSTKSZ 51  /* stack needed for signal delivery  */
+#endif
 
 #endif /* _UAPI_LINUX_AUXVEC_H */
-- 
2.17.1



[PATCH v4 17/22] x86/cpufeatures/amx: Enumerate Advanced Matrix Extension (AMX) feature bits

2021-02-21 Thread Chang S. Bae
Intel's Advanced Matrix Extension (AMX) is a new 64-bit extended feature
consisting of two-dimensional registers and an accelerator unit. The first
implementation of the latter is the tile matrix multiply unit (TMUL). TMUL
performs SIMD dot-products on four bytes (INT8) or two bfloat16
floating-point (BF16) elements.

Here we add AMX to the kernel/user ABI, by enumerating the capability.
E.g., /proc/cpuinfo: amx_tile, amx_bf16, amx_int8

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/cpufeatures.h | 3 +++
 arch/x86/kernel/cpu/cpuid-deps.c   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index 3170ab367cf2..f9990841238a 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -379,6 +379,9 @@
 #define X86_FEATURE_PCONFIG(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_ARCH_LBR   (18*32+19) /* Intel ARCH LBR */
 #define X86_FEATURE_AVX512_FP16(18*32+23) /* AVX512 FP16 */
+#define X86_FEATURE_AMX_BF16   (18*32+22) /* AMX BF16 Support */
+#define X86_FEATURE_AMX_TILE   (18*32+24) /* AMX tile Support */
+#define X86_FEATURE_AMX_INT8   (18*32+25) /* AMX INT8 Support */
 #define X86_FEATURE_SPEC_CTRL  (18*32+26) /* "" Speculation Control 
(IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP(18*32+27) /* "" Single Thread 
Indirect Branch Predictors */
 #define X86_FEATURE_FLUSH_L1D  (18*32+28) /* Flush L1D cache */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 4423046c2d74..154c18e493c5 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -73,6 +73,9 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_ENQCMD,   X86_FEATURE_XSAVES},
{ X86_FEATURE_PER_THREAD_MBA,   X86_FEATURE_MBA   },
{ X86_FEATURE_XFD,  X86_FEATURE_XSAVES},
+   { X86_FEATURE_AMX_TILE, X86_FEATURE_XSAVE },
+   { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE  },
+   { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE  },
{}
 };
 
-- 
2.17.1



[PATCH v4 22/22] x86/fpu/xstate: Introduce boot-parameters to control state component support

2021-02-21 Thread Chang S. Bae
"xstate.disable=0x6" will disable AMX on a system that has AMX compiled
into XFEATURE_MASK_USER_ENABLED.

"xstate.enable=0x6" will enable AMX on a system that does NOT have AMX
compiled into XFEATURE_MASK_USER_ENABLED (assuming the kernel is new enough
to support this feature).

Rename XFEATURE_MASK_USER_SUPPORTED to XFEATURE_MASK_USER_ENABLED to be
aligned with the new parameters.

While this cmdline is currently enabled only for AMX, it is intended to be
easily enabled to be useful for future XSAVE-enabled features.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Fixed a few typos. (Randy Dunlap)

Changes from v2:
* Changed the kernel tainted when any unknown state is enabled. (Andy
  Lutomirski)
* Simplified the cmdline handling.
* Edited the changelog.

Changes from v1:
* Renamed the user state mask define (Andy Lutomirski and Dave Hansen)
* Changed the error message (Dave Hansen)
* Fixed xfeatures_mask_user()
* Rebased the upstream kernel (5.10) -- revived the param parse function
---
 .../admin-guide/kernel-parameters.txt | 15 +
 arch/x86/include/asm/fpu/types.h  |  6 ++
 arch/x86/include/asm/fpu/xstate.h | 24 +++
 arch/x86/kernel/fpu/init.c| 65 +--
 4 files changed, 93 insertions(+), 17 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index a10b545c2070..ec79f63979a4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6014,6 +6014,21 @@
which allow the hypervisor to 'idle' the guest on lock
contention.
 
+   xstate.enable=  [X86-64]
+   xstate.disable= [X86-64]
+   The kernel is compiled with a default xstate bitmask --
+   enabling it to use the XSAVE hardware to efficiently
+   save and restore thread states on context switch.
+   xstate.enable allows adding to that default mask at
+   boot-time without recompiling the kernel just to support
+   the new thread state. (Note that the kernel will ignore
+   any bits in the mask that do not correspond to features
+   that are actually available in CPUID.)  xstate.disable
+   allows clearing bits in the default mask, forcing the
+   kernel to forget that it supports the specified thread
state. When a bit is set in both,
xstate.disable takes priority.
+
xirc2ps_cs= [NET,PCMCIA]
Format:

,[,[,[,]]]
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 2f297aa85d8f..967d38cc7eb1 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -149,6 +149,12 @@ enum xfeature {
 #define XFEATURE_MASK_XTILE(XFEATURE_MASK_XTILE_DATA \
 | XFEATURE_MASK_XTILE_CFG)
 
+#define XFEATURE_REGION_MASK(max_bit, min_bit) \
+   ((BIT_ULL((max_bit) - (min_bit) + 1) - 1) << (min_bit))
+
+#define XFEATURE_MASK_CONFIGURABLE \
+   XFEATURE_REGION_MASK(XFEATURE_XTILE_DATA, XFEATURE_XTILE_CFG)
+
 #define FIRST_EXTENDED_XFEATUREXFEATURE_YMM
 
 struct reg_128_bit {
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 9e5c28f3beaa..1e64afea9f68 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -25,17 +25,17 @@
 
 #define XSAVE_ALIGNMENT 64
 
-/* All currently supported user features */
-#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \
- XFEATURE_MASK_SSE | \
- XFEATURE_MASK_YMM | \
- XFEATURE_MASK_OPMASK | \
- XFEATURE_MASK_ZMM_Hi256 | \
- XFEATURE_MASK_Hi16_ZMM | \
- XFEATURE_MASK_PKRU | \
- XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR | \
- XFEATURE_MASK_XTILE)
+/* All currently enabled user features */
+#define XFEATURE_MASK_USER_ENABLED (XFEATURE_MASK_FP | \
+   XFEATURE_MASK_SSE | \
+   XFEATURE_MASK_YMM | \
+   XFEATURE_MASK_OPMASK | \
+   XFEATURE_MASK_ZMM_Hi256 | \
+   XFEATURE_MASK_Hi16_ZMM   | \
+   

[PATCH v4 21/22] x86/fpu/xstate: Support dynamic user state in the signal handling path

2021-02-21 Thread Chang S. Bae
Entering a signal handler, the kernel saves xstate in the signal frame. The
dynamic user state is best saved only when in use; fpu->state_mask
can help to exclude unused states.

Returning from a signal handler, XRSTOR re-initializes the excluded state
components.

Add a test case to verify in the signal handler that the signal frame
excludes AMX data when the signaled thread has initialized AMX state.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-kselft...@vger.kernel.org
---
Changes from v3:
* Removed 'no functional changes' in the changelog. (Borislav Petkov)

Changes from v1:
* Made it revertable (moved close to the end of the series).
* Included the test case.
---
 arch/x86/include/asm/fpu/internal.h |  2 +-
 tools/testing/selftests/x86/amx.c   | 66 +
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index c467312d38d8..090eb5bb277b 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -354,7 +354,7 @@ static inline void copy_kernel_to_xregs(struct xregs_state 
*xstate, u64 mask)
  */
 static inline int copy_xregs_to_user(struct xregs_state __user *buf)
 {
-   u64 mask = xfeatures_mask_user();
+   u64 mask = current->thread.fpu.state_mask;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
diff --git a/tools/testing/selftests/x86/amx.c 
b/tools/testing/selftests/x86/amx.c
index f4ecdfd27ae9..a7386b886532 100644
--- a/tools/testing/selftests/x86/amx.c
+++ b/tools/testing/selftests/x86/amx.c
@@ -650,6 +650,71 @@ static void test_ptrace(void)
test_tile_state_write(ptracee_loads_tiles);
 }
 
+/* Signal handling test */
+
+static int sigtrapped;
+struct tile_data sig_tiles, sighdl_tiles;
+
+static void handle_sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+   ucontext_t *uctxt = (ucontext_t *)ctx_void;
+   struct xsave_data xdata;
+   struct tile_config cfg;
+   struct tile_data tiles;
+   u64 header;
+
+   header = __get_xsave_xstate_bv((void *)uctxt->uc_mcontext.fpregs);
+
+   if (header & (1 << XFEATURE_XTILE_DATA))
+   printf("[FAIL]\ttile data was written in sigframe\n");
+   else
+   printf("[OK]\ttile data was skipped in sigframe\n");
+
+   set_tilecfg();
+   load_tilecfg();
+   init_xdata();
+
+   make_tiles();
+   copy_tiles_to_xdata(, );
+   restore_xdata();
+
+   save_xdata();
+   if (compare_xdata_tiles(, ))
+   err(1, "tile load file");
+
+   printf("\tsignal handler: load tile data\n");
+
+   sigtrapped = sig;
+}
+
+static void test_signal_handling(void)
+{
+   struct xsave_data xdata = { 0 };
+   struct tile_data tiles = { 0 };
+
+   sethandler(SIGTRAP, handle_sigtrap, 0);
+   sigtrapped = 0;
+
+   printf("[RUN]\tCheck tile state management in handling signal\n");
+
+   printf("\tbefore signal: initial tile data state\n");
+
+   raise(SIGTRAP);
+
+   if (sigtrapped == 0)
+   err(1, "sigtrap");
+
+   save_xdata();
+   if (compare_xdata_tiles(, )) {
+   printf("[FAIL]\ttile data was not loaded at sigreturn\n");
+   nerrs++;
+   } else {
+   printf("[OK]\ttile data was re-initialized at sigreturn\n");
+   }
+
+   clearhandler(SIGTRAP);
+}
+
 int main(void)
 {
/* Check hardware availability at first */
@@ -672,6 +737,7 @@ int main(void)
test_fork();
test_context_switch();
test_ptrace();
+   test_signal_handling();
 
return nerrs ? 1 : 0;
 }
-- 
2.17.1



[PATCH v4 16/22] x86/fpu/xstate: Extend the table to map state components with features

2021-02-21 Thread Chang S. Bae
At compile-time xfeatures_mask_all includes all possible XCR0 features. At
run-time fpu__init_system_xstate() clears features in xfeatures_mask_all
that are not enabled in CPUID. It does this by looping through all possible
XCR0 features.

Update the code to handle the possibility that there will be gaps in the
XCR0 feature bit numbers.

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v1:
* Rebased on the upstream kernel (5.10)
---
 arch/x86/kernel/fpu/xstate.c | 41 ++--
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index b69913ae30ed..4421ef424670 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -43,18 +43,23 @@ static const char *xfeature_names[] =
"unknown xstate feature",
 };
 
-static short xsave_cpuid_features[] __initdata = {
-   X86_FEATURE_FPU,
-   X86_FEATURE_XMM,
-   X86_FEATURE_AVX,
-   X86_FEATURE_MPX,
-   X86_FEATURE_MPX,
-   X86_FEATURE_AVX512F,
-   X86_FEATURE_AVX512F,
-   X86_FEATURE_AVX512F,
-   X86_FEATURE_INTEL_PT,
-   X86_FEATURE_PKU,
-   X86_FEATURE_ENQCMD,
+struct xfeature_capflag_info {
+   int xfeature_idx;
+   short cpu_cap;
+};
+
+static struct xfeature_capflag_info xfeature_capflags[] __initdata = {
+   { XFEATURE_FP,  X86_FEATURE_FPU },
+   { XFEATURE_SSE, X86_FEATURE_XMM },
+   { XFEATURE_YMM, X86_FEATURE_AVX },
+   { XFEATURE_BNDREGS, X86_FEATURE_MPX },
+   { XFEATURE_BNDCSR,  X86_FEATURE_MPX },
+   { XFEATURE_OPMASK,  X86_FEATURE_AVX512F },
+   { XFEATURE_ZMM_Hi256,   X86_FEATURE_AVX512F },
+   { XFEATURE_Hi16_ZMM,X86_FEATURE_AVX512F },
+   { XFEATURE_PT_UNIMPLEMENTED_SO_FAR, X86_FEATURE_INTEL_PT },
+   { XFEATURE_PKRU,X86_FEATURE_PKU },
+   { XFEATURE_PASID,   X86_FEATURE_ENQCMD },
 };
 
 /*
@@ -1010,11 +1015,15 @@ void __init fpu__init_system_xstate(void)
}
 
/*
-* Clear XSAVE features that are disabled in the normal CPUID.
+* Cross-check XSAVE feature with CPU capability flag. Clear the
+* mask bit for disabled features.
 */
-   for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
-   if (!boot_cpu_has(xsave_cpuid_features[i]))
-   xfeatures_mask_all &= ~BIT_ULL(i);
+   for (i = 0; i < ARRAY_SIZE(xfeature_capflags); i++) {
+   short cpu_cap = xfeature_capflags[i].cpu_cap;
+   int idx = xfeature_capflags[i].xfeature_idx;
+
+   if (!boot_cpu_has(cpu_cap))
+   xfeatures_mask_all &= ~BIT_ULL(idx);
}
 
xfeatures_mask_all &= fpu__get_supported_xfeatures_mask();
-- 
2.17.1



[PATCH v4 20/22] selftest/x86/amx: Include test cases for the AMX state management

2021-02-21 Thread Chang S. Bae
This selftest exercises the kernel's behavior not to inherit AMX state and
the ability to switch the context by verifying that they retain unique
data between multiple threads.

Also, ptrace() is used to insert AMX state into existing threads -- both
before and after the existing thread has initialized its AMX state.

Collect the test cases of validating those operations together, as they
share some common setup for the AMX state.

These test cases do not depend on AMX compiler support, as they employ
userspace-XSAVE directly to access AMX state.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: linux-kernel@vger.kernel.org
Cc: linux-kselft...@vger.kernel.org
---
Changes from v2:
* Updated the test messages and the changelog as tile data is not inherited
  to a child anymore.
* Removed bytecode for the instructions already supported by binutils.
* Changed to check the XSAVE availability in a reliable way.

Changes from v1:
* Removed signal testing code
---
 tools/testing/selftests/x86/Makefile |   2 +-
 tools/testing/selftests/x86/amx.c| 677 +++
 2 files changed, 678 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/amx.c

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index 333980375bc7..2f7feb03867b 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -17,7 +17,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs 
syscall_nt test_mremap
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
-TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering
+TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering amx
 # Some selftests require 32bit support enabled also on 64bit systems
 TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
 
diff --git a/tools/testing/selftests/x86/amx.c 
b/tools/testing/selftests/x86/amx.c
new file mode 100644
index ..f4ecdfd27ae9
--- /dev/null
+++ b/tools/testing/selftests/x86/amx.c
@@ -0,0 +1,677 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#ifndef __x86_64__
+# error This test is 64-bit only
+#endif
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#define PAGE_SIZE  (1 << 12)
+
+#define NUM_TILES  8
+#define TILE_SIZE  1024
+#define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE)
+
+struct xsave_data {
+   u8 area[XSAVE_SIZE];
+} __attribute__((aligned(64)));
+
+/* Tile configuration associated: */
+#define MAX_TILES  16
+#define RESERVED_BYTES 14
+
+struct tile_config {
+   u8  palette_id;
+   u8  start_row;
+   u8  reserved[RESERVED_BYTES];
+   u16 colsb[MAX_TILES];
+   u8  rows[MAX_TILES];
+};
+
+struct tile_data {
+   u8 data[NUM_TILES * TILE_SIZE];
+};
+
+static inline u64 __xgetbv(u32 index)
+{
+   u32 eax, edx;
+
+   asm volatile("xgetbv;"
+: "=a" (eax), "=d" (edx)
+: "c" (index));
+   return eax + ((u64)edx << 32);
+}
+
+static inline void __cpuid(u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+{
+   asm volatile("cpuid;"
+: "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
+: "0" (*eax), "2" (*ecx));
+}
+
+/* Load tile configuration */
+static inline void __ldtilecfg(void *cfg)
+{
+   asm volatile(".byte 0xc4,0xe2,0x78,0x49,0x00"
+: : "a"(cfg));
+}
+
+/* Load tile data to %tmm0 register only */
+static inline void __tileloadd(void *tile)
+{
+   asm volatile(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10"
+: : "a"(tile), "d"(0));
+}
+
+/* Save extended states */
+static inline void __xsave(void *buffer, u32 lo, u32 hi)
+{
+   asm volatile("xsave (%%rdi)"
+: : "D" (buffer), "a" (lo), "d" (hi)
+: "memory");
+}
+
+/* Restore extended states */
+static inline void __xrstor(void *buffer, u32 lo, u32 hi)
+{
+   asm volatile("xrstor (%%rdi)"
+: : "D" (buffer), "a" (lo), "d" (hi));
+}
+
+/* Release tile states to init values */
+static inline void __tilerelease(void)
+{
+   asm volatile(".byte 0xc4, 0xe2, 0x78,

[PATCH v4 08/22] x86/fpu/xstate: Convert the struct fpu 'state' field to a pointer

2021-02-21 Thread Chang S. Bae
The xstate per-task buffer is embedded into struct fpu. And the field
'state' represents the buffer. When the dynamic user states are in use, the
buffer may be dynamically allocated.

Convert the 'state' field to point to either the embedded buffer or the
dynamically-allocated buffer. Add a new field to represent the embedded
buffer.

Every child process will set the pointer on its creation. And the initial
task sets it before dealing with soft FPU.

No functional change.

Suggested-by: Borislav Petkov 
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v3:
* Added as a new patch to simplify the buffer access. (Borislav Petkov)
---
 arch/x86/include/asm/fpu/internal.h |  6 +++---
 arch/x86/include/asm/fpu/types.h| 27 +--
 arch/x86/include/asm/trace/fpu.h|  4 ++--
 arch/x86/kernel/fpu/core.c  | 26 ++
 arch/x86/kernel/fpu/init.c  |  6 --
 arch/x86/kernel/fpu/regset.c| 22 +++---
 arch/x86/kernel/fpu/signal.c| 22 +++---
 arch/x86/kernel/fpu/xstate.c| 18 +-
 arch/x86/kernel/process.c   |  2 +-
 arch/x86/kvm/x86.c  | 18 +-
 arch/x86/math-emu/fpu_aux.c |  2 +-
 arch/x86/math-emu/fpu_entry.c   |  4 ++--
 arch/x86/math-emu/fpu_system.h  |  2 +-
 13 files changed, 89 insertions(+), 70 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index b34d0d29e4b8..46cb51ef4d17 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -199,9 +199,9 @@ static inline int copy_user_to_fregs(struct fregs_state 
__user *fx)
 static inline void copy_fxregs_to_kernel(struct fpu *fpu)
 {
if (IS_ENABLED(CONFIG_X86_32))
-   asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
+   asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave));
else
-   asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
+   asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave));
 }
 
 /* These macros all use (%edi)/(%rdi) as the single memory argument. */
@@ -427,7 +427,7 @@ static inline void __copy_kernel_to_fpregs(union 
fpregs_state *fpstate, u64 mask
 
 static inline void copy_kernel_to_fpregs(struct fpu *fpu)
 {
-   union fpregs_state *fpstate = >state;
+   union fpregs_state *fpstate = fpu->state;
 
/*
 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index f5a38a5f3ae1..dcd28a545377 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -339,13 +339,28 @@ struct fpu {
/*
 * @state:
 *
-* In-memory copy of all FPU registers that we save/restore
-* over context switches. If the task is using the FPU then
-* the registers in the FPU are more recent than this state
-* copy. If the task context-switches away then they get
-* saved here and represent the FPU state.
+* A pointer to indicate the in-memory copy of all FPU registers that 
are
+* saved/restored over context switches.
+*
+* Initially @state points to @__default_state. When dynamic states get
+* used, a memory is allocated for the larger state copy and @state is
+* updated to point to it. Then, the state in ->state supersedes and
+* invalidates the state in @__default_state.
+*
+* In general, if the task is using the FPU then the registers in the 
FPU
+* are more recent than the state copy. If the task context-switches 
away
+* then they get saved in ->state and represent the FPU state.
+*/
+   union fpregs_state  *state;
+
+   /*
+* @__default_state:
+*
+* Initial in-memory copy of all FPU registers that saved/restored
+* over context switches. When the task is switched to dynamic states,
+* this copy is replaced with the new in-memory copy in ->state.
 */
-   union fpregs_state  state;
+   union fpregs_state  __default_state;
/*
 * WARNING: 'state' is dynamically-sized.  Do not put
 * anything after it here.
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 879b77792f94..ef82f4824ce7 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -22,8 +22,8 @@ DECLARE_EVENT_CLASS(x86_fpu,
__entry->fpu= fpu;
__entry->load_fpu   = test_thread_flag(TIF_NE

[PATCH v4 15/22] x86/fpu/xstate: Support ptracer-induced xstate buffer expansion

2021-02-21 Thread Chang S. Bae
ptrace() may update xstate data before the target task has taken an XFD
fault and expanded the xstate buffer. Detect this case and allocate a
sufficient buffer to support the request. Also, disable the (now
unnecessary) associated first-use fault.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Removed 'no functional changes' in the changelog. (Borislav Petkov)

Changes from v2:
* Updated the changelog with task->fpu removed. (Borislav Petkov)
* Updated the code comments.
---
 arch/x86/kernel/fpu/regset.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index ee27df4caed6..ec6cbb75010e 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -122,6 +122,35 @@ int xstateregs_set(struct task_struct *target, const 
struct user_regset *regset,
 
xsave = >state->xsave;
 
+   /*
+* When a ptracer attempts to write any state in the target buffer but 
not
+* sufficiently allocated, it dynamically expands the buffer.
+*/
+   if (count > get_xstate_size(fpu->state_mask)) {
+   unsigned int offset, size;
+   struct xstate_header hdr;
+   u64 mask;
+
+   offset = offsetof(struct xregs_state, header);
+   size = sizeof(hdr);
+
+   /* Retrieve XSTATE_BV */
+   if (kbuf) {
+   memcpy(, kbuf + offset, size);
+   } else {
+   ret = __copy_from_user(, ubuf + offset, size);
+   if (ret)
+   return ret;
+   }
+
+   mask = hdr.xfeatures & xfeatures_mask_user_dynamic;
+   if (!mask) {
+   ret = alloc_xstate_buffer(fpu, mask);
+   if (ret)
+   return ret;
+   }
+   }
+
fpu__prepare_write(fpu);
 
if (using_compacted_format()) {
-- 
2.17.1



[PATCH v4 13/22] x86/fpu/xstate: Update the xstate context copy function to support dynamic states

2021-02-21 Thread Chang S. Bae
ptrace() and signal return paths use xstate context copy functions. They
allow callers to read (or write) xstate values in the target's buffer. With
dynamic user states, a component's position in the buffer may vary and the
initial value is not always stored in init_fpstate.

Change the helpers to find a component's offset accordingly.

When copying an initial value, explicitly check the init_fpstate coverage.
If not found, reset the memory in the destination. Otherwise, copy values
from init_fpstate.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Cleaned up the code change with more comments.
* Removed 'no functional change' in the changelog. (Borislav Petkov)

Changes from v2:
* Updated the changelog with task->fpu removed. (Borislav Petkov)
---
 arch/x86/kernel/fpu/xstate.c | 69 
 1 file changed, 55 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 84b55f51bdb7..c57877df797d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -301,7 +301,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 * in a special way already:
 */
feature_bit = 0x2;
-   xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2;
+   xfeatures = (xfeatures_mask_user() & fpu->state_mask & ~xfeatures) >> 
feature_bit;
 
/*
 * Update all the remaining memory layouts according to their
@@ -310,12 +310,19 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 */
while (xfeatures) {
if (xfeatures & 0x1) {
-   int offset = xstate_comp_offsets[feature_bit];
+   int offset = get_xstate_comp_offset(fpu->state_mask, 
feature_bit);
int size = xstate_sizes[feature_bit];
 
-   memcpy((void *)fx + offset,
-  (void *)_fpstate.xsave + offset,
-  size);
+   /*
+* init_fpstate does not include the dynamic user states
+* as having initial values with zeros.
+*/
+   if (xfeatures_mask_user_dynamic & BIT_ULL(feature_bit))
+   memset((void *)fx + offset, 0, size);
+   else
+   memcpy((void *)fx + offset,
+  (void *)_fpstate.xsave + offset,
+  size);
}
 
xfeatures >>= 1;
@@ -1291,15 +1298,31 @@ static void fill_gap(struct membuf *to, unsigned *last, 
unsigned offset)
 {
if (*last >= offset)
return;
-   membuf_write(to, (void *)_fpstate.xsave + *last, offset - *last);
+
+   /*
+* Copy initial data.
+*
+* init_fpstate buffer has the minimum size as excluding the dynamic 
user
+* states. But their initial values are zeros.
+*/
+   if (offset <= get_xstate_config(XSTATE_MIN_SIZE))
+   membuf_write(to, (void *)_fpstate.xsave + *last, offset - 
*last);
+   else
+   membuf_zero(to, offset - *last);
*last = offset;
 }
 
+/*
+ * @from: If NULL, copy zeros.
+ */
 static void copy_part(struct membuf *to, unsigned *last, unsigned offset,
  unsigned size, void *from)
 {
fill_gap(to, last, offset);
-   membuf_write(to, from, size);
+   if (from)
+   membuf_write(to, from, size);
+   else
+   membuf_zero(to, size);
*last = offset + size;
 }
 
@@ -1351,15 +1374,27 @@ void copy_xstate_to_kernel(struct membuf to, struct fpu 
*fpu)
  sizeof(header), );
 
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+   u64 mask = BIT_ULL(i);
+   void *src;
+
+   if (!(xfeatures_mask_user() & mask))
+   continue;
+
/*
-* Copy only in-use xstates:
+* Copy states if used. Otherwise, copy the initial data.
 */
-   if ((header.xfeatures >> i) & 1) {
-   void *src = __raw_xsave_addr(fpu, i);
 
-   copy_part(, , xstate_offsets[i],
- xstate_sizes[i], src);
-   }
+   if (header.xfeatures & mask)
+   src = __raw_xsave_addr(fpu, i);
+   else
+   /*
+* init_fpstate buffer does not include the dynamic
+* user state data as having initial values with zeros.
+*/
+   src = (xfeatures_mask_user_dynamic & mask) ?
+ NULL : (

[PATCH v4 14/22] x86/fpu/xstate: Expand the xstate buffer on the first use of dynamic user state

2021-02-21 Thread Chang S. Bae
Intel's Extended Feature Disable (XFD) feature is an extension of the XSAVE
architecture. XFD allows the kernel to enable a feature state in XCR0 and
to receive a #NM trap when a task uses instructions accessing that state.
In this way, Linux can defer allocating the large XSAVE buffer until tasks
need it.

XFD introduces two MSRs: IA32_XFD to enable/disable the feature and
IA32_XFD_ERR to assist the #NM trap handler. Both use the same
state-component bitmap format, used by XCR0.

Use this hardware capability to find the right time to expand the xstate
buffer. Introduce two sets of helper functions for that:

1. The first set is primarily for interacting with the XFD hardware:
xdisable_setbits()
xdisable_getbits()
xdisable_switch()

2. The second set is for managing the first-use status and handling #NM
   trap:
xfirstuse_enabled()
xfirstuse_not_detected()

The #NM handler induces the xstate buffer expansion to save the first-used
states.

The XFD feature is enabled only for the compacted format. If the kernel
uses the standard format, the buffer always has to be large enough for all
the states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Removed 'no functional change' in the changelog. (Borislav Petkov)

Changes from v2:
* Changed to enable XFD only when the compacted format is used.
* Updated the changelog with task->fpu removed. (Borislav Petkov)

Changes from v1:
* Inlined the XFD-induced #NM handling code (Andy Lutomirski)
---
 arch/x86/include/asm/cpufeatures.h  |  1 +
 arch/x86/include/asm/fpu/internal.h | 51 -
 arch/x86/include/asm/msr-index.h|  2 ++
 arch/x86/kernel/cpu/cpuid-deps.c|  1 +
 arch/x86/kernel/fpu/xstate.c| 37 +++--
 arch/x86/kernel/process.c   |  5 +++
 arch/x86/kernel/process_32.c|  2 +-
 arch/x86/kernel/process_64.c|  2 +-
 arch/x86/kernel/traps.c | 40 ++
 9 files changed, 135 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index 84b887825f12..3170ab367cf2 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -277,6 +277,7 @@
 #define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
 #define X86_FEATURE_XGETBV1(10*32+ 2) /* XGETBV with ECX = 1 
instruction */
 #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS 
instructions */
+#define X86_FEATURE_XFD(10*32+ 4) /* eXtended Feature 
Disabling */
 
 /*
  * Extended auxiliary flags: Linux defined - for features scattered in various
diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index f964f3efc92e..c467312d38d8 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -557,11 +557,58 @@ static inline void switch_fpu_prepare(struct fpu 
*old_fpu, int cpu)
  * Misc helper functions:
  */
 
+/* The first-use detection helpers: */
+
+static inline void xdisable_setbits(u64 value)
+{
+   wrmsrl_safe(MSR_IA32_XFD, value);
+}
+
+static inline u64 xdisable_getbits(void)
+{
+   u64 value;
+
+   rdmsrl_safe(MSR_IA32_XFD, );
+   return value;
+}
+
+static inline u64 xfirstuse_enabled(void)
+{
+   /* All the dynamic user components are first-use enabled. */
+   return xfeatures_mask_user_dynamic;
+}
+
+/*
+ * Convert fpu->state_mask to the xdisable configuration to be written to
+ * MSR IA32_XFD.  So, xdisable_setbits() only uses this outcome.
+ */
+static inline u64 xfirstuse_not_detected(struct fpu *fpu)
+{
+   u64 firstuse_bv = (fpu->state_mask & xfirstuse_enabled());
+
+   /*
+* If first-use is not detected, set the bit. If the detection is
+* not enabled, the bit is always zero in firstuse_bv. So, make
+* following conversion:
+*/
+   return  (xfirstuse_enabled() ^ firstuse_bv);
+}
+
+/* Update MSR IA32_XFD with xfirstuse_not_detected() if needed. */
+static inline void xdisable_switch(struct fpu *prev, struct fpu *next)
+{
+   if (!static_cpu_has(X86_FEATURE_XFD) || !xfirstuse_enabled())
+   return;
+
+   if (unlikely(prev->state_mask != next->state_mask))
+   xdisable_setbits(xfirstuse_not_detected(next));
+}
+
 /*
  * Load PKRU from the FPU context if available. Delay loading of the
  * complete FPU state until the return to userland.
  */
-static inline void switch_fpu_finish(struct fpu *new_fpu)
+static inline void switch_fpu_finish(struct fpu *old_fpu, struct fpu *new_fpu)
 {
u32 pkru_val = init_pkru_value;
struct pkru_state *pk;
@@ -571,6 +618,8 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
 
set_thread_flag(TIF_NEED_FPU_LOAD);
 
+   xdisable_switch(old_fpu, new_fpu);
+
if (!

[PATCH v4 07/22] x86/fpu/xstate: Calculate and remember dynamic xstate buffer sizes

2021-02-21 Thread Chang S. Bae
The xstate buffer is currently embedded into struct fpu with static size.
To accommodate dynamic user xstates, record the maximum and minimum buffer
sizes.

Rename the size calculation function. It calculates the maximum xstate size
and sanity checks it with CPUID. It also calculates the static embedded
buffer size by excluding the dynamic user states from the maximum size.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Updated the changelog. (Borislav Petkov)
* Updated the code comment. (Borislav Petkov)
* Adjusted the calculation function naming.
* Moved out the new variable addition into a new patch.

Changes from v2:
* Updated the changelog with task->fpu removed. (Borislav Petkov)
* Renamed the in-line size variable.
* Updated some code comments.
---
 arch/x86/kernel/fpu/xstate.c | 52 +++-
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 16379c368714..b7686f107f3a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -655,23 +655,31 @@ static void check_xstate_against_struct(int nr)
 }
 
 /*
- * This essentially double-checks what the cpu told us about
- * how large the XSAVE buffer needs to be.  We are recalculating
- * it to be safe.
+ * Calculate the xstate per-task buffer sizes -- maximum and minimum.
+ *
+ * And record the minimum. Also double-check the maximum against what
+ * the cpu told.
+ *
+ * Dynamic user states are stored in this buffer. They account for the
+ * delta between the maximum and the minimum.
  *
  * Dynamic supervisor XSAVE features allocate their own buffers and are
- * not covered by these checks. Only the size of the buffer for task->fpu
- * is checked here.
+ * not covered by these checks.
  */
-static void do_extra_xstate_size_checks(void)
+static void calculate_xstate_sizes(void)
 {
-   int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+   int paranoid_min_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+   int paranoid_max_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
int i;
 
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+   bool user_dynamic;
+
if (!xfeature_enabled(i))
continue;
 
+   user_dynamic = (xfeatures_mask_user_dynamic & BIT_ULL(i)) ? 
true : false;
+
check_xstate_against_struct(i);
/*
 * Supervisor state components can be managed only by
@@ -681,23 +689,32 @@ static void do_extra_xstate_size_checks(void)
XSTATE_WARN_ON(xfeature_is_supervisor(i));
 
/* Align from the end of the previous feature */
-   if (xfeature_is_aligned(i))
-   paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
+   if (xfeature_is_aligned(i)) {
+   paranoid_max_size = ALIGN(paranoid_max_size, 64);
+   if (!user_dynamic)
+   paranoid_min_size = ALIGN(paranoid_min_size, 
64);
+   }
/*
 * The offset of a given state in the non-compacted
 * format is given to us in a CPUID leaf.  We check
 * them for being ordered (increasing offsets) in
 * setup_xstate_features().
 */
-   if (!using_compacted_format())
-   paranoid_xstate_size = xfeature_uncompacted_offset(i);
+   if (!using_compacted_format()) {
+   paranoid_max_size = xfeature_uncompacted_offset(i);
+   if (!user_dynamic)
+   paranoid_min_size = 
xfeature_uncompacted_offset(i);
+   }
/*
 * The compacted-format offset always depends on where
 * the previous state ended.
 */
-   paranoid_xstate_size += xfeature_size(i);
+   paranoid_max_size += xfeature_size(i);
+   if (!user_dynamic)
+   paranoid_min_size += xfeature_size(i);
}
-   XSTATE_WARN_ON(paranoid_xstate_size != 
get_xstate_config(XSTATE_MAX_SIZE));
+   XSTATE_WARN_ON(paranoid_max_size != get_xstate_config(XSTATE_MAX_SIZE));
+   set_xstate_config(XSTATE_MIN_SIZE, paranoid_min_size);
 }
 
 
@@ -798,14 +815,11 @@ static int __init init_xstate_size(void)
 */
set_xstate_config(XSTATE_MAX_SIZE, possible_xstate_size);
 
-   /* Perform an extra check for the maximum size. */
-   do_extra_xstate_size_checks();
-
/*
-* Set the minimum to be the same as the maximum. The dynamic
-* user states are not supported yet.
+* Calculate and double-check the maximum size. Calculate and record
+* the minimum size.
 */
-   set_

[PATCH v4 19/22] x86/fpu/amx: Enable the AMX feature in 64-bit mode

2021-02-21 Thread Chang S. Bae
In 64-bit mode, include the AMX state components in
XFEATURE_MASK_USER_SUPPORTED.

The XFD feature will be used to dynamically expand the xstate per-task
buffer on the first use.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/fpu/xstate.h | 3 ++-
 arch/x86/kernel/fpu/init.c| 8 ++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 4112dbf05f19..9e5c28f3beaa 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -34,7 +34,8 @@
  XFEATURE_MASK_Hi16_ZMM | \
  XFEATURE_MASK_PKRU | \
  XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR)
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_XTILE)
 
 /* All currently supported supervisor features */
 #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index f2fcdcc979e7..046889f31037 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -219,8 +219,12 @@ static void __init 
fpu__init_system_xstate_size_legacy(void)
  */
 u64 __init fpu__get_supported_xfeatures_mask(void)
 {
-   return XFEATURE_MASK_USER_SUPPORTED |
-  XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+   u64 mask = XFEATURE_MASK_USER_SUPPORTED | 
XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+
+   if (!IS_ENABLED(CONFIG_X86_64))
+   mask &= ~(XFEATURE_MASK_XTILE);
+
+   return mask;
 }
 
 /* Legacy code to initialize eager fpu mode. */
-- 
2.17.1



[PATCH v4 12/22] x86/fpu/xstate: Update the xstate buffer address finder to support dynamic states

2021-02-21 Thread Chang S. Bae
__raw_xsave_addr() returns a pointer to the requested component in an xstate
buffer, found by simply looking up the offset table. The offset used to be
fixed, but, with dynamic user states, it becomes variable.

get_xstate_size() has a routine to find an offset at runtime. Refactor to
use it for the address finder.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Added the function description in the kernel-doc style. (Borislav Petkov)
* Removed 'no functional change' in the changelog. (Borislav Petkov)
---
 arch/x86/kernel/fpu/xstate.c | 80 ++--
 1 file changed, 50 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index daf76108aa5f..84b55f51bdb7 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -175,6 +175,40 @@ static bool xfeature_is_supervisor(int xfeature_nr)
return ecx & 1;
 }
 
+/**
+ * get_xstate_comp_offset() - Find the feature's offset in the compacted format
+ * @mask:  This bitmap tells which components reserved in the format.
+ * @feature_nr:The feature number
+ *
+ * Returns:The offset value
+ */
+static unsigned int get_xstate_comp_offset(u64 mask, int feature_nr)
+{
+   u64 xmask = BIT_ULL(feature_nr + 1) - 1;
+   unsigned int next_offset, offset = 0;
+   int i;
+
+   if ((mask & xmask) == (xfeatures_mask_all & xmask))
+   return xstate_comp_offsets[feature_nr];
+
+   /*
+* With the given mask, no relevant size is found. Calculate it by 
summing
+* up each state size.
+*/
+
+   next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+   for (i = FIRST_EXTENDED_XFEATURE; i <= feature_nr; i++) {
+   if (!(mask & BIT_ULL(i)))
+   continue;
+
+   offset = xstate_aligns[i] ? ALIGN(next_offset, 64) : 
next_offset;
+   next_offset += xstate_sizes[i];
+   }
+
+   return offset;
+}
+
 /**
  * get_xstate_size() - calculate an xstate buffer size
  * @mask:  This bitmap tells which components reserved in the buffer.
@@ -186,9 +220,8 @@ static bool xfeature_is_supervisor(int xfeature_nr)
  */
 unsigned int get_xstate_size(u64 mask)
 {
-   unsigned int size;
-   u64 xmask;
-   int i, nr;
+   unsigned int offset;
+   int nr;
 
if (!mask)
return 0;
@@ -207,24 +240,8 @@ unsigned int get_xstate_size(u64 mask)
if (!using_compacted_format())
return xstate_offsets[nr] + xstate_sizes[nr];
 
-   xmask = BIT_ULL(nr + 1) - 1;
-
-   if (mask == (xmask & xfeatures_mask_all))
-   return xstate_comp_offsets[nr] + xstate_sizes[nr];
-
-   /*
-* With the given mask, no relevant size is found so far. So, calculate
-* it by summing up each state size.
-*/
-   for (size = FXSAVE_SIZE + XSAVE_HDR_SIZE, i = FIRST_EXTENDED_XFEATURE; 
i <= nr; i++) {
-   if (!(mask & BIT_ULL(i)))
-   continue;
-
-   if (xstate_aligns[i])
-   size = ALIGN(size, 64);
-   size += xstate_sizes[i];
-   }
-   return size;
+   offset = get_xstate_comp_offset(mask, nr);
+   return offset + xstate_sizes[nr];
 }
 
 /*
@@ -1042,17 +1059,20 @@ static void *__raw_xsave_addr(struct fpu *fpu, int 
xfeature_nr)
 {
void *xsave;
 
-   if (!xfeature_enabled(xfeature_nr)) {
-   WARN_ON_FPU(1);
-   return NULL;
-   }
-
-   if (fpu)
-   xsave = >state->xsave;
-   else
+   if (!xfeature_enabled(xfeature_nr))
+   goto not_found;
+   else if (!fpu)
xsave = _fpstate.xsave;
+   else if (!(fpu->state_mask & BIT_ULL(xfeature_nr)))
+   goto not_found;
+   else
+   xsave = >state->xsave;
+
+   return xsave + get_xstate_comp_offset(fpu->state_mask, xfeature_nr);
 
-   return xsave + xstate_comp_offsets[xfeature_nr];
+not_found:
+   WARN_ON_FPU(1);
+   return NULL;
 }
 /*
  * Given the xsave area and a state inside, this function returns the
-- 
2.17.1



[PATCH v4 06/22] x86/fpu/xstate: Add new variables to indicate dynamic xstate buffer size

2021-02-21 Thread Chang S. Bae
The xstate per-task buffer is being prepared to become dynamic for user
states. Introduce new size variables to indicate the minimum and maximum
size of the buffer. The values are determined at boot time.

Instead of exporting the new variables directly, introduce helper functions
to access them as well as the user buffer size.

No functional change. Those sizes have no difference, as the buffer is not
dynamic yet.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v3:
* Added as a new patch to add the variables along with new helpers.
  (Borislav Petkov)
---
 arch/x86/include/asm/fpu/xstate.h |  9 
 arch/x86/include/asm/processor.h  | 10 +---
 arch/x86/kernel/fpu/core.c| 24 +++---
 arch/x86/kernel/fpu/init.c| 26 ---
 arch/x86/kernel/fpu/regset.c  |  4 +-
 arch/x86/kernel/fpu/signal.c  | 27 ++-
 arch/x86/kernel/fpu/xstate.c  | 78 ---
 arch/x86/kernel/process.c |  7 +++
 arch/x86/kvm/x86.c|  5 +-
 9 files changed, 129 insertions(+), 61 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 6ce8350672c2..1fba2ca15874 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -102,6 +102,15 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 extern void __init update_regset_xstate_info(unsigned int size,
 u64 xstate_mask);
 
+enum xstate_config {
+   XSTATE_MIN_SIZE,
+   XSTATE_MAX_SIZE,
+   XSTATE_USER_SIZE
+};
+
+extern unsigned int get_xstate_config(enum xstate_config cfg);
+void set_xstate_config(enum xstate_config cfg, unsigned int value);
+
 void *get_xsave_addr(struct fpu *fpu, int xfeature_nr);
 const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b..f70228312790 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -477,9 +477,6 @@ DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
 #endif /* X86_64 */
 
-extern unsigned int fpu_kernel_xstate_size;
-extern unsigned int fpu_user_xstate_size;
-
 struct perf_event;
 
 struct thread_struct {
@@ -545,12 +542,7 @@ struct thread_struct {
 };
 
 /* Whitelist the FPU state from the task_struct for hardened usercopy. */
-static inline void arch_thread_struct_whitelist(unsigned long *offset,
-   unsigned long *size)
-{
-   *offset = offsetof(struct thread_struct, fpu.state);
-   *size = fpu_kernel_xstate_size;
-}
+extern void arch_thread_struct_whitelist(unsigned long *offset, unsigned long 
*size);
 
 /*
  * Thread-synchronous status.
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 5775e64b0172..043fdba8431c 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -198,21 +198,30 @@ static inline void fpstate_init_fstate(struct fregs_state 
*fp)
 void fpstate_init(struct fpu *fpu)
 {
union fpregs_state *state;
+   unsigned int size;
+   u64 mask;
 
-   if (fpu)
+   if (fpu) {
state = >state;
-   else
+   /* The dynamic user states are not prepared yet. */
+   mask = xfeatures_mask_all & ~xfeatures_mask_user_dynamic;
+   size = get_xstate_config(XSTATE_MIN_SIZE);
+   } else {
state = _fpstate;
+   mask = xfeatures_mask_all;
+   size = get_xstate_config(XSTATE_MAX_SIZE);
+   }
 
if (!static_cpu_has(X86_FEATURE_FPU)) {
fpstate_init_soft(>soft);
return;
}
 
-   memset(state, 0, fpu_kernel_xstate_size);
+   memset(state, 0, size);
 
if (static_cpu_has(X86_FEATURE_XSAVES))
-   fpstate_init_xstate(>xsave, xfeatures_mask_all);
+   fpstate_init_xstate(>xsave, mask);
+
if (static_cpu_has(X86_FEATURE_FXSR))
fpstate_init_fxstate(>fxsave);
else
@@ -235,8 +244,11 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
/*
 * Don't let 'init optimized' areas of the XSAVE area
 * leak into the child task:
+*
+* The child does not inherit the dynamic states. So,
+* the xstate buffer has the minimum size.
 */
-   memset(_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+   memset(_fpu->state.xsave, 0, get_xstate_config(XSTATE_MIN_SIZE));
 
/*
 * If the FPU registers are not current just memcpy() the state.
@@ -248,7 +260,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
 */
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
- 

[PATCH v4 05/22] x86/fpu/xstate: Add a new variable to indicate dynamic user states

2021-02-21 Thread Chang S. Bae
The xstate per-task buffer is being prepared to become dynamic for user
states. Introduce a new mask variable to indicate the 'dynamic' user
states. The value is determined at boot time.

The perf subsystem has a separate buffer to save some state only when
needed, not in every context switch. These states are named 'dynamic'
supervisor states. Some defines and helpers do not carry the 'supervisor'
qualifier in their names, so rename them.

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Updated the changelog. (Borislav Petkov)
* Updated the code comment. (Borislav Petkov)

Changes from v2:
* Updated the changelog for clarification.
---
 arch/x86/include/asm/fpu/xstate.h | 12 ++-
 arch/x86/kernel/fpu/xstate.c  | 33 ---
 2 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 24bf8d3f559a..6ce8350672c2 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -56,7 +56,7 @@
  * - Don't set the bit corresponding to the dynamic supervisor feature in
  *   IA32_XSS at run time, since it has been set at boot time.
  */
-#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR)
+#define XFEATURE_MASK_SUPERVISOR_DYNAMIC (XFEATURE_MASK_LBR)
 
 /*
  * Unsupported supervisor features. When a supervisor feature in this mask is
@@ -66,7 +66,7 @@
 
 /* All supervisor states including supported and unsupported states. */
 #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
- XFEATURE_MASK_DYNAMIC | \
+ XFEATURE_MASK_SUPERVISOR_DYNAMIC | \
  XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)
 
 #ifdef CONFIG_X86_64
@@ -87,14 +87,16 @@ static inline u64 xfeatures_mask_user(void)
return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED;
 }
 
-static inline u64 xfeatures_mask_dynamic(void)
+static inline u64 xfeatures_mask_supervisor_dynamic(void)
 {
if (!boot_cpu_has(X86_FEATURE_ARCH_LBR))
-   return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR;
+   return XFEATURE_MASK_SUPERVISOR_DYNAMIC & ~XFEATURE_MASK_LBR;
 
-   return XFEATURE_MASK_DYNAMIC;
+   return XFEATURE_MASK_SUPERVISOR_DYNAMIC;
 }
 
+extern u64 xfeatures_mask_user_dynamic;
+
 extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
 extern void __init update_regset_xstate_info(unsigned int size,
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 5401a71dd15e..43940828d1a3 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -61,6 +61,12 @@ static short xsave_cpuid_features[] __initdata = {
  */
 u64 xfeatures_mask_all __read_mostly;
 
+/*
+ * This represents user xstates, a subset of xfeatures_mask_all, saved in a
+ * dynamic kernel XSAVE buffer.
+ */
+u64 xfeatures_mask_user_dynamic __read_mostly;
+
 static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] 
= -1};
 static unsigned int xstate_sizes[XFEATURE_MAX]   = { [ 0 ... XFEATURE_MAX - 1] 
= -1};
 static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX 
- 1] = -1};
@@ -237,7 +243,7 @@ void fpu__init_cpu_xstate(void)
 */
if (boot_cpu_has(X86_FEATURE_XSAVES)) {
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
-xfeatures_mask_dynamic());
+xfeatures_mask_supervisor_dynamic());
}
 }
 
@@ -615,8 +621,8 @@ static void check_xstate_against_struct(int nr)
  * how large the XSAVE buffer needs to be.  We are recalculating
  * it to be safe.
  *
- * Dynamic XSAVE features allocate their own buffers and are not
- * covered by these checks. Only the size of the buffer for task->fpu
+ * Dynamic supervisor XSAVE features allocate their own buffers and are
+ * not covered by these checks. Only the size of the buffer for task->fpu
  * is checked here.
  */
 static void do_extra_xstate_size_checks(void)
@@ -686,7 +692,7 @@ static unsigned int __init get_xsaves_size(void)
  */
 static unsigned int __init get_xsaves_size_no_dynamic(void)
 {
-   u64 mask = xfeatures_mask_dynamic();
+   u64 mask = xfeatures_mask_supervisor_dynamic();
unsigned int size;
 
if (!mask)
@@ -773,6 +779,7 @@ static int __init init_xstate_size(void)
 static void fpu__init_disable_system_xstate(void)
 {
xfeatures_mask_all = 0;
+   xfeatures_mask_user_dynamic = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 }
@@ -839,6 +846,8 @@ void __init fpu__init_system_xstate(void)
}
 
xfeatures_mask_all &= fpu__get_supported_xfeatures_mask();
+   /* Do not support the dynamically allocated buffer yet. */
+   xfeatures

[PATCH v4 11/22] x86/fpu/xstate: Update the xstate save function to support dynamic states

2021-02-21 Thread Chang S. Bae
Extend copy_xregs_to_kernel() to receive a mask argument of which states to
save, in preparation for dynamic user state handling.

Update KVM to set a valid fpu->state_mask, so it can continue to share with
the core code.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v3:
* Updated the changelog. (Borislav Petkov)
* Made the code change more reviewable.

Changes from v2:
* Updated the changelog to clarify the KVM code changes.
---
 arch/x86/include/asm/fpu/internal.h | 3 +--
 arch/x86/kernel/fpu/core.c  | 2 +-
 arch/x86/kvm/x86.c  | 9 +++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index e4afc1831e29..f964f3efc92e 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -317,9 +317,8 @@ static inline void copy_kernel_to_xregs_booting(struct 
xregs_state *xstate)
 /*
  * Save processor xstate to xsave area.
  */
-static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
+static inline void copy_xregs_to_kernel(struct xregs_state *xstate, u64 mask)
 {
-   u64 mask = xfeatures_mask_all;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index dc20eabb072d..ad1ac80f98ef 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_fpu_usable);
 int copy_fpregs_to_fpstate(struct fpu *fpu)
 {
if (likely(use_xsave())) {
-   copy_xregs_to_kernel(>state->xsave);
+   copy_xregs_to_kernel(>state->xsave, fpu->state_mask);
 
/*
 * AVX512 state is tracked here because its use is
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c10122547ecd..ca2c0574acf2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9275,11 +9275,16 @@ static void kvm_save_current_fpu(struct fpu *fpu)
 * KVM does not support dynamic user states yet. Assume the buffer
 * always has the minimum size.
 */
-   if (test_thread_flag(TIF_NEED_FPU_LOAD))
+   if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
memcpy(fpu->state, current->thread.fpu.state,
   get_xstate_config(XSTATE_MIN_SIZE));
-   else
+   } else {
+   struct fpu *src_fpu = >thread.fpu;
+
+   if (fpu->state_mask != src_fpu->state_mask)
+   fpu->state_mask = src_fpu->state_mask;
copy_fpregs_to_fpstate(fpu);
+   }
 }
 
 /* Swap (qemu) user FPU context for the guest FPU context. */
-- 
2.17.1



[PATCH v4 01/22] x86/fpu/xstate: Modify the initialization helper to handle both static and dynamic buffers

2021-02-21 Thread Chang S. Bae
Have the function initializing the xstate buffer take a struct fpu *
pointer in preparation for dynamic state buffer support.

init_fpstate is a special case, which is indicated by a null pointer
parameter to fpstate_init().

Also, fpstate_init_xstate() now accepts the state component bitmap to
configure XCOMP_BV for the compacted format.

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v3:
* Updated the changelog. (Borislav Petkov)
* Updated the function comment to use kernel-doc style. (Borislav Petkov)

Changes from v2:
* Updated the changelog with task->fpu removed. (Borislav Petkov)
---
 arch/x86/include/asm/fpu/internal.h |  6 +++---
 arch/x86/kernel/fpu/core.c  | 16 +---
 arch/x86/kernel/fpu/init.c  |  2 +-
 arch/x86/kernel/fpu/regset.c|  2 +-
 arch/x86/kernel/fpu/xstate.c|  3 +--
 arch/x86/kvm/x86.c  |  2 +-
 6 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 8d33ad80704f..d81d8c407dc0 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -80,20 +80,20 @@ static __always_inline __pure bool use_fxsr(void)
 
 extern union fpregs_state init_fpstate;
 
-extern void fpstate_init(union fpregs_state *state);
+extern void fpstate_init(struct fpu *fpu);
 #ifdef CONFIG_MATH_EMULATION
 extern void fpstate_init_soft(struct swregs_state *soft);
 #else
 static inline void fpstate_init_soft(struct swregs_state *soft) {}
 #endif
 
-static inline void fpstate_init_xstate(struct xregs_state *xsave)
+static inline void fpstate_init_xstate(struct xregs_state *xsave, u64 
xcomp_mask)
 {
/*
 * XRSTORS requires these bits set in xcomp_bv, or it will
 * trigger #GP:
 */
-   xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
+   xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xcomp_mask;
 }
 
 static inline void fpstate_init_fxstate(struct fxregs_state *fx)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 571220ac8bea..d43661d309ab 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -192,8 +192,18 @@ static inline void fpstate_init_fstate(struct fregs_state 
*fp)
fp->fos = 0xu;
 }
 
-void fpstate_init(union fpregs_state *state)
+/*
+ * @fpu: If NULL, use init_fpstate
+ */
+void fpstate_init(struct fpu *fpu)
 {
+   union fpregs_state *state;
+
+   if (fpu)
+   state = >state;
+   else
+   state = _fpstate;
+
if (!static_cpu_has(X86_FEATURE_FPU)) {
fpstate_init_soft(>soft);
return;
@@ -202,7 +212,7 @@ void fpstate_init(union fpregs_state *state)
memset(state, 0, fpu_kernel_xstate_size);
 
if (static_cpu_has(X86_FEATURE_XSAVES))
-   fpstate_init_xstate(>xsave);
+   fpstate_init_xstate(>xsave, xfeatures_mask_all);
if (static_cpu_has(X86_FEATURE_FXSR))
fpstate_init_fxstate(>fxsave);
else
@@ -262,7 +272,7 @@ static void fpu__initialize(struct fpu *fpu)
WARN_ON_FPU(fpu != >thread.fpu);
 
set_thread_flag(TIF_NEED_FPU_LOAD);
-   fpstate_init(>state);
+   fpstate_init(fpu);
trace_x86_fpu_init_state(fpu);
 }
 
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 701f196d7c68..74e03e3bc20f 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -124,7 +124,7 @@ static void __init fpu__init_system_generic(void)
 * Set up the legacy init FPU context. (xstate init might overwrite this
 * with a more modern format, if the CPU supports it.)
 */
-   fpstate_init(_fpstate);
+   fpstate_init(NULL);
 
fpu__init_system_mxcsr();
 }
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c413756ba89f..4c4d9059ff36 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -144,7 +144,7 @@ int xstateregs_set(struct task_struct *target, const struct 
user_regset *regset,
 * In case of failure, mark all states as init:
 */
if (ret)
-   fpstate_init(>state);
+   fpstate_init(fpu);
 
return ret;
 }
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 5d8047441a0a..1a3e5effe0fa 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -457,8 +457,7 @@ static void __init setup_init_fpu_buf(void)
print_xstate_features();
 
if (boot_cpu_has(X86_FEATURE_XSAVES))
-   init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
-xfeatures_mask_all;
+   fpstate_init_xstat

[PATCH v4 18/22] x86/fpu/amx: Define AMX state components and have it used for boot-time checks

2021-02-21 Thread Chang S. Bae
Linux uses check_xstate_against_struct() to sanity check the size of
XSTATE-enabled features. AMX is an XSAVE-enabled feature, and its size is
not hard-coded but discoverable at run-time via CPUID.

The AMX state is composed of state components 17 and 18, which are all user
state components. The first component is the XTILECFG state of a 64-byte
tile-related control register. The state component 18, called XTILEDATA,
contains the actual tile data, and the state size varies on
implementations. The architectural maximum, as defined in the CPUID(0x1d,
1): EAX[15:0], is a byte less than 64KB. The first implementation supports
8KB.

Check the XTILEDATA state size dynamically. The feature introduces the new
tile register, TMM. Define one register struct only and read the number of
registers from CPUID. Cross-check the overall size with CPUID again.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Updated the code comments.

Changes from v1:
* Rebased on the upstream kernel (5.10)
---
 arch/x86/include/asm/fpu/types.h  | 27 ++
 arch/x86/include/asm/fpu/xstate.h |  2 +
 arch/x86/kernel/fpu/xstate.c  | 62 +++
 3 files changed, 91 insertions(+)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 6fc707c14350..2f297aa85d8f 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -120,6 +120,9 @@ enum xfeature {
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
+   XFEATURE_RSRVD_COMP_16,
+   XFEATURE_XTILE_CFG,
+   XFEATURE_XTILE_DATA,
 
XFEATURE_MAX,
 };
@@ -136,11 +139,15 @@ enum xfeature {
 #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
 #define XFEATURE_MASK_PASID(1 << XFEATURE_PASID)
 #define XFEATURE_MASK_LBR  (1 << XFEATURE_LBR)
+#define XFEATURE_MASK_XTILE_CFG(1 << XFEATURE_XTILE_CFG)
+#define XFEATURE_MASK_XTILE_DATA   (1 << XFEATURE_XTILE_DATA)
 
 #define XFEATURE_MASK_FPSSE(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
 #define XFEATURE_MASK_AVX512   (XFEATURE_MASK_OPMASK \
 | XFEATURE_MASK_ZMM_Hi256 \
 | XFEATURE_MASK_Hi16_ZMM)
+#define XFEATURE_MASK_XTILE(XFEATURE_MASK_XTILE_DATA \
+| XFEATURE_MASK_XTILE_CFG)
 
 #define FIRST_EXTENDED_XFEATUREXFEATURE_YMM
 
@@ -153,6 +160,9 @@ struct reg_256_bit {
 struct reg_512_bit {
u8  regbytes[512/8];
 };
+struct reg_1024_byte {
+   u8  regbytes[1024];
+};
 
 /*
  * State component 2:
@@ -255,6 +265,23 @@ struct arch_lbr_state {
u64 ler_to;
u64 ler_info;
struct lbr_entryentries[];
+};
+
+/*
+ * State component 17: 64-byte tile configuration register.
+ */
+struct xtile_cfg {
+   u64 tcfg[8];
+} __packed;
+
+/*
+ * State component 18: 1KB tile data register.
+ * Each register represents 16 64-byte rows of the matrix
+ * data. But the number of registers depends on the actual
+ * implementation.
+ */
+struct xtile_data {
+   struct reg_1024_bytetmm;
 } __packed;
 
 /*
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index cbb4795d2b45..4112dbf05f19 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -13,6 +13,8 @@
 
 #define XSTATE_CPUID   0x000d
 
+#define TILE_CPUID 0x001d
+
 #define FXSAVE_SIZE512
 
 #define XSAVE_HDR_SIZE 64
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 4421ef424670..7e708d6f43b5 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -41,6 +41,14 @@ static const char *xfeature_names[] =
"Protection Keys User registers",
"PASID state",
"unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "AMX Tile config"   ,
+   "AMX Tile data" ,
+   "unknown xstate feature",
 };
 
 struct xfeature_capflag_info {
@@ -60,6 +68,8 @@ static struct xfeature_capflag_info xfeature_capflags[] 
__initdata = {
{ XFEATURE_PT_UNIMPLEMENTED_SO_FAR, X86_FEATURE_INTEL_PT },
{ XFEATURE_PKRU,X86_FEATURE_PKU },
{ XFEATURE_PASID,   X86_FEATURE_ENQCMD },
+   { XFEATURE_XTILE_CFG,   X86_FEATURE_AMX_TILE },
+   { XFEATURE_XTILE_DATA,  X86_FEATURE

[PATCH v4 10/22] x86/fpu/xstate: Define the scope of the initial xstate data

2021-02-21 Thread Chang S. Bae
init_fpstate is used to record the initial xstate values and covers all the
states. But it is wasteful to include large states whose initial data is
trivial.

Limit init_fpstate by clarifying its size and coverage, which exclude the
dynamic user states. The dynamic states are assumed to be large but to have
all-zeros initial data.

Expand copy_xregs_to_kernel_booting() to receive a mask argument of which
states to save.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Removed the helper functions. (Borislav Petkov)
* Removed 'no functional change' in the changelog. (Borislav Petkov)
* Updated the code comment.
* Moved out the other initialization changes into the previous patch.

Changes from v2:
* Updated the changelog for clarification.
* Updated the code comments.
---
 arch/x86/include/asm/fpu/internal.h |  3 +--
 arch/x86/kernel/fpu/core.c  | 13 ++---
 arch/x86/kernel/fpu/xstate.c| 11 +--
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 46cb51ef4d17..e4afc1831e29 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -272,9 +272,8 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
  * This function is called only during boot time when x86 caps are not set
  * up and alternative can not be used yet.
  */
-static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
+static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate, 
u64 mask)
 {
-   u64 mask = xfeatures_mask_all;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 5debb1cd3c74..dc20eabb072d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -21,7 +21,10 @@
 
 /*
  * Represents the initial FPU state. It's mostly (but not completely) zeroes,
- * depending on the FPU hardware format:
+ * depending on the FPU hardware format.
+ *
+ * The dynamic user states are excluded as they are large but having initial
+ * values with zeros.
  */
 union fpregs_state init_fpstate __read_mostly;
 
@@ -206,9 +209,13 @@ void fpstate_init(struct fpu *fpu)
mask = fpu->state_mask;
size = get_xstate_size(fpu->state_mask);
} else {
+   /*
+* init_fpstate excludes the dynamic user states as they are
+* large but having initial values with zeros.
+*/
state = _fpstate;
-   mask = xfeatures_mask_all;
-   size = get_xstate_config(XSTATE_MAX_SIZE);
+   mask = (xfeatures_mask_all & ~xfeatures_mask_user_dynamic);
+   size = get_xstate_config(XSTATE_MIN_SIZE);
}
 
if (!static_cpu_has(X86_FEATURE_FPU)) {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 86251b947403..daf76108aa5f 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -552,6 +552,7 @@ static void __init print_xstate_offset_size(void)
 static void __init setup_init_fpu_buf(void)
 {
static int on_boot_cpu __initdata = 1;
+   u64 mask;
 
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -562,8 +563,14 @@ static void __init setup_init_fpu_buf(void)
setup_xstate_features();
print_xstate_features();
 
+   /*
+* Exclude the dynamic user states as they are large but having
+* initial values with zeros.
+*/
+   mask = xfeatures_mask_all & ~xfeatures_mask_user_dynamic;
+
if (boot_cpu_has(X86_FEATURE_XSAVES))
-   fpstate_init_xstate(_fpstate.xsave, xfeatures_mask_all);
+   fpstate_init_xstate(_fpstate.xsave, mask);
 
/*
 * Init all the features state with header.xfeatures being 0x0
@@ -574,7 +581,7 @@ static void __init setup_init_fpu_buf(void)
 * Dump the init state again. This is to identify the init state
 * of any feature which is not represented by all zero's.
 */
-   copy_xregs_to_kernel_booting(_fpstate.xsave);
+   copy_xregs_to_kernel_booting(_fpstate.xsave, mask);
 }
 
 static int xfeature_uncompacted_offset(int xfeature_nr)
-- 
2.17.1



[PATCH v4 02/22] x86/fpu/xstate: Modify state copy helpers to handle both static and dynamic buffers

2021-02-21 Thread Chang S. Bae
Have all the functions copying xstate take a struct fpu * pointer in
preparation for dynamic state buffer support.

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Updated the changelog. (Borislav Petkov)

Changes from v2:
* Updated the changelog with task->fpu removed. (Borislav Petkov)
---
 arch/x86/include/asm/fpu/xstate.h |  8 
 arch/x86/kernel/fpu/regset.c  |  6 +++---
 arch/x86/kernel/fpu/signal.c  | 16 +++-
 arch/x86/kernel/fpu/xstate.c  | 19 +++
 4 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 47a92232d595..e0f1b22f53ce 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -105,10 +105,10 @@ const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
 int xfeature_size(int xfeature_nr);
 struct membuf;
-void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave);
-int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
-int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
-void copy_supervisor_to_kernel(struct xregs_state *xsave);
+void copy_xstate_to_kernel(struct membuf to, struct fpu *fpu);
+int copy_kernel_to_xstate(struct fpu *fpu, const void *kbuf);
+int copy_user_to_xstate(struct fpu *fpu, const void __user *ubuf);
+void copy_supervisor_to_kernel(struct fpu *fpu);
 void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask);
 void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask);
 
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 4c4d9059ff36..5e13e58d11d4 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -85,7 +85,7 @@ int xstateregs_get(struct task_struct *target, const struct 
user_regset *regset,
fpu__prepare_read(fpu);
 
if (using_compacted_format()) {
-   copy_xstate_to_kernel(to, xsave);
+   copy_xstate_to_kernel(to, fpu);
return 0;
} else {
fpstate_sanitize_xstate(fpu);
@@ -126,9 +126,9 @@ int xstateregs_set(struct task_struct *target, const struct 
user_regset *regset,
 
if (using_compacted_format()) {
if (kbuf)
-   ret = copy_kernel_to_xstate(xsave, kbuf);
+   ret = copy_kernel_to_xstate(fpu, kbuf);
else
-   ret = copy_user_to_xstate(xsave, ubuf);
+   ret = copy_user_to_xstate(fpu, ubuf);
} else {
ret = user_regset_copyin(, , , , xsave, 0, 
-1);
if (!ret)
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..0d6deb75c507 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -212,11 +212,11 @@ int copy_fpstate_to_sigframe(void __user *buf, void 
__user *buf_fx, int size)
 }
 
 static inline void
-sanitize_restored_user_xstate(union fpregs_state *state,
+sanitize_restored_user_xstate(struct fpu *fpu,
  struct user_i387_ia32_struct *ia32_env,
  u64 user_xfeatures, int fx_only)
 {
-   struct xregs_state *xsave = >xsave;
+   struct xregs_state *xsave = >state.xsave;
struct xstate_header *header = >header;
 
if (use_xsave()) {
@@ -253,7 +253,7 @@ sanitize_restored_user_xstate(union fpregs_state *state,
xsave->i387.mxcsr &= mxcsr_feature_mask;
 
if (ia32_env)
-   convert_to_fxsr(>fxsave, ia32_env);
+   convert_to_fxsr(>state.fxsave, ia32_env);
}
 }
 
@@ -396,7 +396,7 @@ static int __fpu__restore_sig(void __user *buf, void __user 
*buf_fx, int size)
 * current supervisor states first and invalidate the FPU regs.
 */
if (xfeatures_mask_supervisor())
-   copy_supervisor_to_kernel(>state.xsave);
+   copy_supervisor_to_kernel(fpu);
set_thread_flag(TIF_NEED_FPU_LOAD);
}
__fpu_invalidate_fpregs_state(fpu);
@@ -406,7 +406,7 @@ static int __fpu__restore_sig(void __user *buf, void __user 
*buf_fx, int size)
u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
 
if (using_compacted_format()) {
-   ret = copy_user_to_xstate(>state.xsave, buf_fx);
+   ret = copy_user_to_xstate(fpu, buf_fx);
} else {
ret = __copy_from_user(>state.xsave, buf_fx, 
state_size);
 
@@ -416,8 +416,7 @@ static int __fpu__restore_sig(void __user *buf, void __user 
*buf_fx, int size)
if (ret

[PATCH v4 03/22] x86/fpu/xstate: Modify address finders to handle both static and dynamic buffers

2021-02-21 Thread Chang S. Bae
Have all the functions finding xstate address take a struct fpu * pointer
in preparation for dynamic state buffer support.

init_fpstate is a special case, which is indicated by a null pointer
parameter to get_xsave_addr() and __raw_xsave_addr().

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v3:
* Updated the changelog. (Borislav Petkov)
* Updated the function comment to use kernel-doc style. (Borislav Petkov)

Changes from v2:
* Updated the changelog with task->fpu removed. (Borislav Petkov)

Changes from v1:
* Rebased on the upstream kernel (5.10)
---
 arch/x86/include/asm/fpu/internal.h |  2 +-
 arch/x86/include/asm/fpu/xstate.h   |  2 +-
 arch/x86/include/asm/pgtable.h  |  2 +-
 arch/x86/kernel/cpu/common.c|  2 +-
 arch/x86/kernel/fpu/xstate.c| 38 +
 arch/x86/kvm/x86.c  | 10 +++-
 arch/x86/mm/pkeys.c |  2 +-
 7 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index d81d8c407dc0..0153c4d4ca77 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -579,7 +579,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
 * return to userland e.g. for a copy_to_user() operation.
 */
if (current->mm) {
-   pk = get_xsave_addr(_fpu->state.xsave, XFEATURE_PKRU);
+   pk = get_xsave_addr(new_fpu, XFEATURE_PKRU);
if (pk)
pkru_val = pk->pkru;
}
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index e0f1b22f53ce..24bf8d3f559a 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -100,7 +100,7 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 extern void __init update_regset_xstate_info(unsigned int size,
 u64 xstate_mask);
 
-void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+void *get_xsave_addr(struct fpu *fpu, int xfeature_nr);
 const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
 int xfeature_size(int xfeature_nr);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a02c67291cfc..83268b41444f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -141,7 +141,7 @@ static inline void write_pkru(u32 pkru)
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return;
 
-   pk = get_xsave_addr(>thread.fpu.state.xsave, XFEATURE_PKRU);
+   pk = get_xsave_addr(>thread.fpu, XFEATURE_PKRU);
 
/*
 * The PKRU value in xstate needs to be in sync with the value that is
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..860b19db208b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -478,7 +478,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c)
return;
 
cr4_set_bits(X86_CR4_PKE);
-   pk = get_xsave_addr(_fpstate.xsave, XFEATURE_PKRU);
+   pk = get_xsave_addr(NULL, XFEATURE_PKRU);
if (pk)
pk->pkru = init_pkru_value;
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 6156dad0feb6..5401a71dd15e 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -894,15 +894,24 @@ void fpu__resume_cpu(void)
  * Given an xstate feature nr, calculate where in the xsave
  * buffer the state is.  Callers should ensure that the buffer
  * is valid.
+ *
+ * @fpu: If NULL, use init_fpstate
  */
-static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
+static void *__raw_xsave_addr(struct fpu *fpu, int xfeature_nr)
 {
+   void *xsave;
+
if (!xfeature_enabled(xfeature_nr)) {
WARN_ON_FPU(1);
return NULL;
}
 
-   return (void *)xsave + xstate_comp_offsets[xfeature_nr];
+   if (fpu)
+   xsave = >state.xsave;
+   else
+   xsave = _fpstate.xsave;
+
+   return xsave + xstate_comp_offsets[xfeature_nr];
 }
 /*
  * Given the xsave area and a state inside, this function returns the
@@ -915,15 +924,18 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, 
int xfeature_nr)
  * this will return NULL.
  *
  * Inputs:
- * xstate: the thread's storage area for all FPU data
+ * fpu: the thread's FPU data to reference xstate buffer(s).
+ *  (A null pointer parameter indicates init_fpstate.)
  * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
  * XFEATURE_SSE, etc...)
  * Output:
  * address of the state in the xsave area, or NULL if the
  * field is not present in the xsave buffer.
  

[PATCH v4 09/22] x86/fpu/xstate: Introduce helpers to manage the xstate buffer dynamically

2021-02-21 Thread Chang S. Bae
The static per-task xstate buffer contains the extended register states --
but it is not expandable at runtime. Introduce runtime methods and a new
fpu struct field to support the expansion.

fpu->state_mask indicates which state components are reserved to be
saved in the xstate buffer.

alloc_xstate_buffer() uses vmalloc(). If use of this mechanism grows to
allocate buffers larger than 64KB, a more sophisticated allocation scheme
that includes purpose-built reclaim capability might be justified.

Introduce a new helper -- get_xstate_size() to calculate the buffer size.

Also, use the new field and helper to initialize the buffer.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Updated code comments. (Borislav Petkov)
* Used vzalloc() instead of vmalloc() with memset(). (Borislav Petkov)
* Removed the max size check for >64KB. (Borislav Petkov)
* Removed the allocation size check in the helper. (Borislav Petkov)
* Switched the function description in the kernel-doc style.
* Used them for buffer initialization -- moved from the next patch.

Changes from v2:
* Updated the changelog with task->fpu removed. (Borislav Petkov)
* Replaced 'area' with 'buffer' in the comments and the changelog.
* Updated the code comments.

Changes from v1:
* Removed unneeded interrupt masking (Andy Lutomirski)
* Added vmalloc() error tracing (Dave Hansen, PeterZ, and Andy Lutomirski)
---
 arch/x86/include/asm/fpu/types.h  |   7 ++
 arch/x86/include/asm/fpu/xstate.h |   4 +
 arch/x86/include/asm/trace/fpu.h  |   5 ++
 arch/x86/kernel/fpu/core.c|  14 ++--
 arch/x86/kernel/fpu/xstate.c  | 125 ++
 5 files changed, 148 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index dcd28a545377..6fc707c14350 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -336,6 +336,13 @@ struct fpu {
 */
unsigned long   avx512_timestamp;
 
+   /*
+* @state_mask:
+*
+* The bitmap represents state components reserved to be saved in 
->state.
+*/
+   u64 state_mask;
+
/*
 * @state:
 *
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 1fba2ca15874..cbb4795d2b45 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -112,6 +112,10 @@ extern unsigned int get_xstate_config(enum xstate_config 
cfg);
 void set_xstate_config(enum xstate_config cfg, unsigned int value);
 
 void *get_xsave_addr(struct fpu *fpu, int xfeature_nr);
+unsigned int get_xstate_size(u64 mask);
+int alloc_xstate_buffer(struct fpu *fpu, u64 mask);
+void free_xstate_buffer(struct fpu *fpu);
+
 const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
 int xfeature_size(int xfeature_nr);
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index ef82f4824ce7..b691c2db47c7 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -89,6 +89,11 @@ DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed,
TP_ARGS(fpu)
 );
 
+DEFINE_EVENT(x86_fpu, x86_fpu_xstate_alloc_failed,
+   TP_PROTO(struct fpu *fpu),
+   TP_ARGS(fpu)
+);
+
 #undef TRACE_INCLUDE_PATH
 #define TRACE_INCLUDE_PATH asm/trace/
 #undef TRACE_INCLUDE_FILE
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 60a581aa0be8..5debb1cd3c74 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -203,9 +203,8 @@ void fpstate_init(struct fpu *fpu)
 
if (fpu) {
state = fpu->state;
-   /* The dynamic user states are not prepared yet. */
-   mask = xfeatures_mask_all & ~xfeatures_mask_user_dynamic;
-   size = get_xstate_config(XSTATE_MIN_SIZE);
+   mask = fpu->state_mask;
+   size = get_xstate_size(fpu->state_mask);
} else {
state = _fpstate;
mask = xfeatures_mask_all;
@@ -241,14 +240,15 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
 
WARN_ON_FPU(src_fpu != >thread.fpu);
 
+   /*
+* The child does not inherit the dynamic states. Thus, use the buffer
+* embedded in struct task_struct, which has the minimum size.
+*/
+   dst_fpu->state_mask = (xfeatures_mask_all & 
~xfeatures_mask_user_dynamic);
dst_fpu->state = _fpu->__default_state;
-
/*
 * Don't let 'init optimized' areas of the XSAVE area
 * leak into the child task:
-*
-* The child does not inherit the dynamic states. So,
-* the xstate buffer has the minimum size.
 */
memset(_fpu->state->xsave, 0, get_xstate_config(XSTATE_MIN_SIZE));
 
diff 

[PATCH v4 04/22] x86/fpu/xstate: Modify the context restore helper to handle both static and dynamic buffers

2021-02-21 Thread Chang S. Bae
Have the function restoring xstate take a struct fpu * pointer in
preparation for dynamic state buffer support.

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v3:
* Updated the changelog. (Borislav Petkov)
* Reverted the change on the copy_kernel_to_xregs_err() function as not
  needed.

Changes from v2:
* Updated the changelog with task->fpu removed. (Borislav Petkov)
---
 arch/x86/include/asm/fpu/internal.h | 6 --
 arch/x86/kernel/fpu/core.c  | 4 ++--
 arch/x86/kvm/x86.c  | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 0153c4d4ca77..b34d0d29e4b8 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -425,8 +425,10 @@ static inline void __copy_kernel_to_fpregs(union 
fpregs_state *fpstate, u64 mask
}
 }
 
-static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
+static inline void copy_kernel_to_fpregs(struct fpu *fpu)
 {
+   union fpregs_state *fpstate = >state;
+
/*
 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
 * pending. Clear the x87 state here by setting it to fixed values.
@@ -511,7 +513,7 @@ static inline void __fpregs_load_activate(void)
return;
 
if (!fpregs_state_valid(fpu, cpu)) {
-   copy_kernel_to_fpregs(>state);
+   copy_kernel_to_fpregs(fpu);
fpregs_activate(fpu);
fpu->last_cpu = cpu;
}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index d43661d309ab..5775e64b0172 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -173,7 +173,7 @@ void fpu__save(struct fpu *fpu)
 
if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
if (!copy_fpregs_to_fpstate(fpu)) {
-   copy_kernel_to_fpregs(>state);
+   copy_kernel_to_fpregs(fpu);
}
}
 
@@ -251,7 +251,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
memcpy(_fpu->state, _fpu->state, 
fpu_kernel_xstate_size);
 
else if (!copy_fpregs_to_fpstate(dst_fpu))
-   copy_kernel_to_fpregs(_fpu->state);
+   copy_kernel_to_fpregs(dst_fpu);
 
fpregs_unlock();
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cc3b604ddcd2..dd9565d12d81 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9313,7 +9313,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_fpu)
kvm_save_current_fpu(vcpu->arch.guest_fpu);
 
-   copy_kernel_to_fpregs(>arch.user_fpu->state);
+   copy_kernel_to_fpregs(vcpu->arch.user_fpu);
 
fpregs_mark_activate();
fpregs_unlock();
-- 
2.17.1



[PATCH v4 00/22] x86: Support Intel Advanced Matrix Extensions

2021-02-21 Thread Chang S. Bae
Intel Advanced Matrix Extensions (AMX)[1][2] will be shipping on servers
soon.  AMX consists of configurable TMM "TILE" registers plus new
accelerator instructions that operate on them.  TMUL (Tile matrix MULtiply)
is the first accelerator instruction set to use the new registers, and we
anticipate additional instructions in the future.

Neither AMX state nor TMUL instructions depend on AVX.  However, AMX and
AVX do share common challenges.  The TMM registers are 8KB today, and
architecturally as large as 64KB, which merits updates to hardware and
software state management.

Further, both technologies run faster when they are not simultaneously
running on SMT siblings, and both technologies' use of power and bandwidth
impacts the power and performance available to neighboring cores.  (This
impact has measurably improved in recent hardware.)

If the existing kernel approach for managing XSAVE state was employed to
handle AMX, 8KB space would be added to every task, but possibly rarely
used.  So Linux support is optimized by using a new XSAVE feature: eXtended
Feature Disabling (XFD).  The kernel arms XFD to provide a #NM exception
upon a task's first access to TILE state. The kernel exception handler
installs the appropriate XSAVE context switch buffer, and the task behaves
as if the kernel had done that for all tasks.  Using XFD, AMX space is
allocated only when needed, eliminating the memory waste for unused state
components.

This series requires the new minimum sigaltstack support [3] and is based
on the mainline. The series is composed of three parts:
* Patch 01-15: Foundation to support dynamic user state management
* Patch 16-20: AMX enablement, including unit tests
* Patch 21-22: Signal handling optimization and new boot-parameters

Thanks to Len Brown and Dave Hansen for help with the cover letter.

Changes from v3 [6]:
* Updated some commit messages and code comments. (Borislav Petkov)
* Added and removed some helpers. (Borislav Petkov)
* Revised the buffer allocation function. (Borislav Petkov)
* Simplified in accessing buffers. (Borislav Petkov)
* Re-organized some code changes to be more reviewable. (PATCH9/10)
* Reverted unnecessary changes. (PATCH4)
* Fixed typo in the documentation. (Randy Dunlap)

Changes from v2 [5]:
* Removed the patch for the tile data inheritance. Also, updated the
  selftest patch. (Andy Lutomirski)
* Changed the kernel to be tainted when any unknown state is enabled. (Andy
  Lutomirski)
* Changed to use the XFD feature only when the compacted format is in use.
* Improved the test code.
* Simplified the cmdline handling.
* Removed 'task->fpu' in changelogs. (Borislav Petkov)
* Updated the variable name / comments / changelogs for clarification.

Changes from v1 [4]:
* Added vmalloc() error tracing (Dave Hansen, PeterZ, and Andy Lutomirski)
* Inlined the #NM handling code (Andy Lutomirski)
* Made signal handling optimization revertible
* Revised the new parameter handling code (Andy Lutomirski and Dave Hansen)
* Rebased on the upstream kernel

[1]: Intel Architecture Instruction Set Extension Programming Reference
February 2021, 
https://software.intel.com/content/dam/develop/external/us/en/documents-tps/architecture-instruction-set-extensions-programming-reference.pdf
[2]: 
https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-matrix-extensions-intel-amx-instructions.html
[3]: 
https://lore.kernel.org/lkml/20210203172242.29644-1-chang.seok@intel.com/
[4]: 
https://lore.kernel.org/lkml/20201001203913.9125-1-chang.seok@intel.com/
[5]: 
https://lore.kernel.org/lkml/20201119233257.2939-1-chang.seok@intel.com/
[6]: 
https://lore.kernel.org/lkml/20201223155717.19556-1-chang.seok@intel.com/

Chang S. Bae (22):
  x86/fpu/xstate: Modify the initialization helper to handle both static
and dynamic buffers
  x86/fpu/xstate: Modify state copy helpers to handle both static and
dynamic buffers
  x86/fpu/xstate: Modify address finders to handle both static and
dynamic buffers
  x86/fpu/xstate: Modify the context restore helper to handle both
static and dynamic buffers
  x86/fpu/xstate: Add a new variable to indicate dynamic user states
  x86/fpu/xstate: Add new variables to indicate dynamic xstate buffer
size
  x86/fpu/xstate: Calculate and remember dynamic xstate buffer sizes
  x86/fpu/xstate: Convert the struct fpu 'state' field to a pointer
  x86/fpu/xstate: Introduce helpers to manage the xstate buffer
dynamically
  x86/fpu/xstate: Define the scope of the initial xstate data
  x86/fpu/xstate: Update the xstate save function to support dynamic
states
  x86/fpu/xstate: Update the xstate buffer address finder to support
dynamic states
  x86/fpu/xstate: Update the xstate context copy function to support
dynamic states
  x86/fpu/xstate: Expand the xstate buffer on the first use of dynamic
user state
  x86/fpu/xs

[PATCH v2] x86/fpu: Use consistent test for X86_FEATURE_XSAVES

2021-02-05 Thread Chang S. Bae
When XSAVES is present, the kernel always takes advantage of it, and XSAVES
always uses the compacted format.

The helper function using_compacted_format() implies that using the
compacted format may be possible without XSAVES (say by using XSAVEC), but
that is not possible here, so delete that confusing helper and simply check
the feature in the first place -- if XSAVES is available or not.

Cleanup only. No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: Dave Hansen 
Cc: Borislav Petkov 
Cc: linux-kernel@vger.kernel.org
Cc: x...@kernel.org
---
Changes from v1:
* Added comments (Dave Hansen)
* Updated the changelog (Borislav Petkov)
---
 arch/x86/include/asm/fpu/xstate.h |  1 -
 arch/x86/kernel/fpu/regset.c  |  6 --
 arch/x86/kernel/fpu/signal.c  |  3 ++-
 arch/x86/kernel/fpu/xstate.c  | 18 ++
 4 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 47a92232d595..96c43380b8c2 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -102,7 +102,6 @@ extern void __init update_regset_xstate_info(unsigned int 
size,
 
 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
 const void *get_xsave_field_ptr(int xfeature_nr);
-int using_compacted_format(void);
 int xfeature_size(int xfeature_nr);
 struct membuf;
 void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave);
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c413756ba89f..c999b9e5b3a1 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -84,7 +84,8 @@ int xstateregs_get(struct task_struct *target, const struct 
user_regset *regset,
 
fpu__prepare_read(fpu);
 
-   if (using_compacted_format()) {
+   /* The XSAVES compacted format must be copied one state at a time. */
+   if (boot_cpu_has(X86_FEATURE_XSAVES)) {
copy_xstate_to_kernel(to, xsave);
return 0;
} else {
@@ -124,7 +125,8 @@ int xstateregs_set(struct task_struct *target, const struct 
user_regset *regset,
 
fpu__prepare_write(fpu);
 
-   if (using_compacted_format()) {
+   /* The XSAVES compacted format must be copied one state at a time. */
+   if (boot_cpu_has(X86_FEATURE_XSAVES)) {
if (kbuf)
ret = copy_kernel_to_xstate(xsave, kbuf);
else
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..761324a31e5c 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -405,7 +405,8 @@ static int __fpu__restore_sig(void __user *buf, void __user 
*buf_fx, int size)
if (use_xsave() && !fx_only) {
u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
 
-   if (using_compacted_format()) {
+   /* The XSAVES compacted format must be copied one state at a 
time. */
+   if (boot_cpu_has(X86_FEATURE_XSAVES)) {
ret = copy_user_to_xstate(>state.xsave, buf_fx);
} else {
ret = __copy_from_user(>state.xsave, buf_fx, 
state_size);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 683749b80ae2..0e5fa511f0a1 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -500,20 +500,6 @@ int xfeature_size(int xfeature_nr)
return eax;
 }
 
-/*
- * 'XSAVES' implies two different things:
- * 1. saving of supervisor/system state
- * 2. using the compacted format
- *
- * Use this function when dealing with the compacted format so
- * that it is obvious which aspect of 'XSAVES' is being handled
- * by the calling code.
- */
-int using_compacted_format(void)
-{
-   return boot_cpu_has(X86_FEATURE_XSAVES);
-}
-
 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
 int validate_user_xstate_header(const struct xstate_header *hdr)
 {
@@ -634,7 +620,7 @@ static void do_extra_xstate_size_checks(void)
 * Supervisor state components can be managed only by
 * XSAVES, which is compacted-format only.
 */
-   if (!using_compacted_format())
+   if (!boot_cpu_has(X86_FEATURE_XSAVES))
XSTATE_WARN_ON(xfeature_is_supervisor(i));
 
/* Align from the end of the previous feature */
@@ -646,7 +632,7 @@ static void do_extra_xstate_size_checks(void)
 * them for being ordered (increasing offsets) in
 * setup_xstate_features().
 */
-   if (!using_compacted_format())
+   if (!boot_cpu_has(X86_FEATURE_XSAVES))
paranoid_xstate_size = xfeature_uncompacted_offset(i);
/*
 * The compacted-format offset always depends on where
-- 
2.17.1



[PATCH v5 5/5] selftest/x86/signal: Include test cases for validating sigaltstack

2021-02-03 Thread Chang S. Bae
The test measures the kernel's signal delivery with different (enough vs.
insufficient) stack sizes.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: Borislav Petkov 
Cc: x...@kernel.org
Cc: linux-kselft...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Revised test messages again (Borislav Petkov)

Changes from v2:
* Revised test messages (Borislav Petkov)
---
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/sigaltstack.c | 128 ++
 2 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/sigaltstack.c

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index 333980375bc7..65bba2ae86ee 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -13,7 +13,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) 
trivial_program.c -no-pie)
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt 
test_mremap_vdso \
check_initial_reg_state sigreturn iopl ioperm \
test_vsyscall mov_ss_trap \
-   syscall_arg_fault fsgsbase_restore
+   syscall_arg_fault fsgsbase_restore sigaltstack
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
diff --git a/tools/testing/selftests/x86/sigaltstack.c 
b/tools/testing/selftests/x86/sigaltstack.c
new file mode 100644
index ..f689af75e979
--- /dev/null
+++ b/tools/testing/selftests/x86/sigaltstack.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* sigaltstack()-enforced minimum stack */
+#define ENFORCED_MINSIGSTKSZ   2048
+
+#ifndef AT_MINSIGSTKSZ
+#  define AT_MINSIGSTKSZ   51
+#endif
+
+static int nerrs;
+
+static bool sigalrm_expected;
+
+static unsigned long at_minstack_size;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+  int flags)
+{
+   struct sigaction sa;
+
+   memset(, 0, sizeof(sa));
+   sa.sa_sigaction = handler;
+   sa.sa_flags = SA_SIGINFO | flags;
+   sigemptyset(_mask);
+   if (sigaction(sig, , 0))
+   err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+   struct sigaction sa;
+
+   memset(, 0, sizeof(sa));
+   sa.sa_handler = SIG_DFL;
+   sigemptyset(_mask);
+   if (sigaction(sig, , 0))
+   err(1, "sigaction");
+}
+
+static int setup_altstack(void *start, unsigned long size)
+{
+   stack_t ss;
+
+   memset(, 0, sizeof(ss));
+   ss.ss_size = size;
+   ss.ss_sp = start;
+
+   return sigaltstack(, NULL);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+   if (sigalrm_expected) {
+   printf("[FAIL]\tWrong signal delivered: SIGSEGV (expected 
SIGALRM).");
+   nerrs++;
+   } else {
+   printf("[OK]\tSIGSEGV signal delivered.\n");
+   }
+
+   siglongjmp(jmpbuf, 1);
+}
+
+static void sigalrm(int sig, siginfo_t *info, void *ctx_void)
+{
+   if (!sigalrm_expected) {
+   printf("[FAIL]\tWrong signal delivered: SIGALRM (expected 
SIGSEGV).");
+   nerrs++;
+   } else {
+   printf("[OK]\tSIGALRM signal delivered.\n");
+   }
+}
+
+static void test_sigaltstack(void *altstack, unsigned long size)
+{
+   if (setup_altstack(altstack, size))
+   err(1, "sigaltstack()");
+
+   sigalrm_expected = (size > at_minstack_size) ? true : false;
+
+   sethandler(SIGSEGV, sigsegv, 0);
+   sethandler(SIGALRM, sigalrm, SA_ONSTACK);
+
+   if (!sigsetjmp(jmpbuf, 1)) {
+   printf("[RUN]\tTest an alternate signal stack of %ssufficient 
size.\n",
+  sigalrm_expected ? "" : "in");
+   printf("\tRaise SIGALRM. %s is expected to be delivered.\n",
+  sigalrm_expected ? "It" : "SIGSEGV");
+   raise(SIGALRM);
+   }
+
+   clearhandler(SIGALRM);
+   clearhandler(SIGSEGV);
+}
+
+int main(void)
+{
+   void *altstack;
+
+   at_minstack_size = getauxval(AT_MINSIGSTKSZ);
+
+   altstack = mmap(NULL, at_minstack_size + SIGSTKSZ, PROT_READ | 
PROT_WRITE,
+   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+   if (altstack == MAP_FAILED)
+   err(1, "mmap()");
+
+   if ((ENFORCED_MINSIGSTKSZ + 1) < at_minstack_size)
+   test_sigaltstack(altstack, ENFORCED_MINSIGSTKSZ + 1);
+
+   test_sigaltstack(altstack, at_minstack_size + SIGSTKSZ);
+
+   return nerrs == 0 ? 0 : 1;
+}
-- 
2.17.1



[PATCH v5 2/5] x86/signal: Introduce helpers to get the maximum signal frame size

2021-02-03 Thread Chang S. Bae
Signal frames do not have a fixed format and can vary in size when a number
of things change: support XSAVE features, 32 vs. 64-bit apps. Add the code
to support a runtime method for userspace to dynamically discover how large
a signal stack needs to be.

Introduce a new variable, max_frame_size, and helper functions for the
calculation to be used in a new user interface. Set max_frame_size to a
system-wide worst-case value, instead of storing multiple app-specific
values.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Acked-by: H.J. Lu 
Cc: Borislav Petkov 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Renamed the fpstate size helper with cleanup (Borislav Petkov)
* Moved the sigframe struct size defines to where used (Borislav Petkov)
* Removed unneeded sentence in the changelog (Borislav Petkov)

Change from v1:
* Took stack alignment into account for sigframe size (Dave Martin)
---
 arch/x86/include/asm/fpu/signal.h |  2 ++
 arch/x86/include/asm/sigframe.h   |  2 ++
 arch/x86/kernel/cpu/common.c  |  3 ++
 arch/x86/kernel/fpu/signal.c  | 19 +++
 arch/x86/kernel/signal.c  | 57 +--
 5 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/fpu/signal.h 
b/arch/x86/include/asm/fpu/signal.h
index 7fb516b6893a..8b6631dffefd 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -29,6 +29,8 @@ unsigned long
 fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
 unsigned long *buf_fx, unsigned long *size);
 
+unsigned long fpu__get_fpstate_size(void);
+
 extern void fpu__init_prepare_fx_sw_frame(void);
 
 #endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 84eab2724875..5b1ed650b124 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -85,4 +85,6 @@ struct rt_sigframe_x32 {
 
 #endif /* CONFIG_X86_64 */
 
+void __init init_sigframe_size(void);
+
 #endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..6954932272d5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "cpu.h"
 
@@ -1331,6 +1332,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 
*c)
 
fpu__init_system(c);
 
+   init_sigframe_size();
+
 #ifdef CONFIG_X86_32
/*
 * Regardless of whether PCID is enumerated, the SDM says
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..dbb304e48f16 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -507,6 +507,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
 
return sp;
 }
+
+unsigned long fpu__get_fpstate_size(void)
+{
+   unsigned long ret = xstate_sigframe_size();
+
+   /*
+* This space is needed on (most) 32-bit kernels, or when a 32-bit
+* app is running on a 64-bit kernel. To keep things simple, just
+* assume the worst case and always include space for 'freg_state',
+* even for 64-bit apps on 64-bit kernels. This wastes a bit of
+* space, but keeps the code simple.
+*/
+   if ((IS_ENABLED(CONFIG_IA32_EMULATION) ||
+IS_ENABLED(CONFIG_X86_32)) && use_fxsr())
+   ret += sizeof(struct fregs_state);
+
+   return ret;
+}
+
 /*
  * Prepare the SW reserved portion of the fxsave memory layout, indicating
  * the presence of the extended state information in the memory layout
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ea794a083c44..800243afd1ef 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -212,6 +212,11 @@ do {   
\
  * Set up a signal frame.
  */
 
+/* x86 ABI requires 16-byte alignment */
+#define FRAME_ALIGNMENT16UL
+
+#define MAX_FRAME_PADDING  (FRAME_ALIGNMENT - 1)
+
 /*
  * Determine which stack to use..
  */
@@ -222,9 +227,9 @@ static unsigned long align_sigframe(unsigned long sp)
 * Align the stack pointer according to the i386 ABI,
 * i.e. so that on function entry ((sp + 4) & 15) == 0.
 */
-   sp = ((sp + 4) & -16ul) - 4;
+   sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
 #else /* !CONFIG_X86_32 */
-   sp = round_down(sp, 16) - 8;
+   sp = round_down(sp, FRAME_ALIGNMENT) - 8;
 #endif
return sp;
 }
@@ -663,6 +668,54 @@ SYSCALL_DEFINE0(rt_sigreturn)
return 0;
 }
 
+/*
+ * There are four different struct types for signal frame: sigframe_ia32,
+ * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+ * -- the largest size. It means the size for 64-bit apps is a bit more
+ * than needed, but this keeps the code simple.
+ */
+#if def

[PATCH v5 4/5] x86/signal: Detect and prevent an alternate signal stack overflow

2021-02-03 Thread Chang S. Bae
The kernel pushes context on to the userspace stack to prepare for the
user's signal handler. When the user has supplied an alternate signal
stack, via sigaltstack(2), it is easy for the kernel to verify that the
stack size is sufficient for the current hardware context.

Check if writing the hardware context to the alternate stack will exceed
its size. If yes, then instead of corrupting user data and proceeding with
the original signal handler, an immediate SIGSEGV signal is delivered.

While previous patches in this series allow new source code to discover and
use a sufficient alternate signal stack size, this check is still necessary
to protect binaries with insufficient alternate signal stack size from data
corruption.

Suggested-by: Jann Horn 
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Reviewed-by: Jann Horn 
Cc: Borislav Petkov 
Cc: Jann Horn 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Updated the changelog (Borislav Petkov)

Changes from v2:
* Simplified the implementation (Jann Horn)
---
 arch/x86/kernel/signal.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 0d24f64d0145..8e2df070dbfd 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -242,7 +242,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
-   int onsigstack = on_sig_stack(sp);
+   bool onsigstack = on_sig_stack(sp);
int ret;
 
/* redzone */
@@ -251,8 +251,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
 
/* This is the X/Open sanctioned signal stack switching.  */
if (ka->sa.sa_flags & SA_ONSTACK) {
-   if (sas_ss_flags(sp) == 0)
+   if (sas_ss_flags(sp) == 0) {
sp = current->sas_ss_sp + current->sas_ss_size;
+   /* On the alternate signal stack */
+   onsigstack = true;
+   }
} else if (IS_ENABLED(CONFIG_X86_32) &&
   !onsigstack &&
   regs->ss != __USER_DS &&
-- 
2.17.1



[PATCH v5 3/5] x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ

2021-02-03 Thread Chang S. Bae
Historically, signal.h defines MINSIGSTKSZ (2KB) and SIGSTKSZ (8KB), for
use by all architectures with sigaltstack(2). Over time, the hardware state
size grew, but these constants did not evolve. Today, literal use of these
constants on several architectures may result in signal stack overflow, and
thus user data corruption.

A few years ago, the ARM team addressed this issue by establishing
getauxval(AT_MINSIGSTKSZ), such that the kernel can supply at runtime a value
that is an appropriate replacement on current and future hardware.

Add getauxval(AT_MINSIGSTKSZ) support to x86, analogous to the support
added for ARM in commit 94b07c1f8c39 ("arm64: signal: Report signal frame
size to userspace via auxv").

Reported-by: Florian Weimer 
Fixes: c2bc11f10a39 ("x86, AVX-512: Enable AVX-512 States Context Switch")
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: H.J. Lu 
Cc: Fenghua Yu 
Cc: Dave Martin 
Cc: Michael Ellerman 
Cc: x...@kernel.org
Cc: libc-al...@sourceware.org
Cc: linux-a...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=153531
---
 arch/x86/include/asm/elf.h | 4 
 arch/x86/include/uapi/asm/auxvec.h | 4 ++--
 arch/x86/kernel/signal.c   | 5 +
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 66bdfe838d61..cd10795c178e 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -312,6 +312,7 @@ do {
\
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY);\
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE);\
}   \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   
\
 } while (0)
 
 /*
@@ -328,6 +329,7 @@ extern unsigned long task_size_32bit(void);
 extern unsigned long task_size_64bit(int full_addr_space);
 extern unsigned long get_mmap_base(int is_legacy);
 extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
+extern unsigned long get_sigframe_size(void);
 
 #ifdef CONFIG_X86_32
 
@@ -349,6 +351,7 @@ do {
\
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR,\
(unsigned long __force)current->mm->context.vdso); \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   
\
 } while (0)
 
 /* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */
@@ -357,6 +360,7 @@ do {
\
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR,\
(unsigned long __force)current->mm->context.vdso); \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   
\
 } while (0)
 
 #define AT_SYSINFO 32
diff --git a/arch/x86/include/uapi/asm/auxvec.h 
b/arch/x86/include/uapi/asm/auxvec.h
index 580e3c567046..6beb55bbefa4 100644
--- a/arch/x86/include/uapi/asm/auxvec.h
+++ b/arch/x86/include/uapi/asm/auxvec.h
@@ -12,9 +12,9 @@
 
 /* entries in ARCH_DLINFO: */
 #if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
-# define AT_VECTOR_SIZE_ARCH 2
+# define AT_VECTOR_SIZE_ARCH 3
 #else /* else it's non-compat x86-64 */
-# define AT_VECTOR_SIZE_ARCH 1
+# define AT_VECTOR_SIZE_ARCH 2
 #endif
 
 #endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 800243afd1ef..0d24f64d0145 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -716,6 +716,11 @@ void __init init_sigframe_size(void)
max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
 }
 
+unsigned long get_sigframe_size(void)
+{
+   return max_frame_size;
+}
+
 static inline int is_ia32_compat_frame(struct ksignal *ksig)
 {
return IS_ENABLED(CONFIG_IA32_EMULATION) &&
-- 
2.17.1



[PATCH v5 0/5] x86: Improve Minimum Alternate Stack Size

2021-02-03 Thread Chang S. Bae
During signal entry, the kernel pushes data onto the normal userspace
stack. On x86, the data pushed onto the user stack includes XSAVE state,
which has grown over time as new features and larger registers have been
added to the architecture.

MINSIGSTKSZ is a constant provided in the kernel signal.h headers and
typically distributed in lib-dev(el) packages, e.g. [1]. Its value is
compiled into programs and is part of the user/kernel ABI. The MINSIGSTKSZ
constant indicates to userspace how much data the kernel expects to push on
the user stack, [2][3].

However, this constant is much too small and does not reflect recent
additions to the architecture. For instance, when AVX-512 states are in
use, the signal frame size can be 3.5KB while MINSIGSTKSZ remains 2KB.

The bug report [4] explains this as an ABI issue. The small MINSIGSTKSZ can
cause user stack overflow when delivering a signal.

In this series, we suggest a couple of things:
1. Provide a variable minimum stack size to userspace, as a similar
   approach to [5]
2. Avoid using a too-small alternate stack

Changes from v4 [9]:
* Moved the aux vector define to the generic header (Carlos O'Donell)

Changes from v3 [8]:
* Updated the changelog (Borislav Petkov)
* Revised the test messages again (Borislav Petkov)

Changes from v2 [7]:
* Simplified the sigaltstack overflow prevention (Jann Horn)
* Renamed fpstate size helper with cleanup (Borislav Petkov)
* Cleaned up the signframe struct size defines (Borislav Petkov)
* Revised the selftest messages (Borislav Petkov)
* Revised a changelog (Borislav Petkov)

Changes from v1 [6]:
* Took stack alignment into account for sigframe size (Dave Martin)

[1]: 
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/bits/sigstack.h;h=b9dca794da09
3dc4d41d39db9851d444e1b54d9b;hb=HEAD
[2]: https://www.gnu.org/software/libc/manual/html_node/Signal-Stack.html
[3]: https://man7.org/linux/man-pages/man2/sigaltstack.2.html
[4]: https://bugzilla.kernel.org/show_bug.cgi?id=153531
[5]: 
https://blog.linuxplumbersconf.org/2017/ocw/system/presentations/4671/original/plumbers-dm-2017.pdf
[6]: 
https://lore.kernel.org/lkml/20200929205746.6763-1-chang.seok@intel.com/
[7]: https://lore.kernel.org/lkml/20201119190237.626-1-chang.seok@intel.com/
[8]: 
https://lore.kernel.org/lkml/20201223015312.4882-1-chang.seok@intel.com/
[9]: 
https://lore.kernel.org/lkml/20210115211038.2072-1-chang.seok@intel.com/

Chang S. Bae (5):
  uapi: Move the aux vector AT_MINSIGSTKSZ define to uapi
  x86/signal: Introduce helpers to get the maximum signal frame size
  x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ
  x86/signal: Detect and prevent an alternate signal stack overflow
  selftest/x86/signal: Include test cases for validating sigaltstack

 arch/arm64/include/uapi/asm/auxvec.h  |   1 -
 arch/x86/include/asm/elf.h|   4 +
 arch/x86/include/asm/fpu/signal.h |   2 +
 arch/x86/include/asm/sigframe.h   |   2 +
 arch/x86/include/uapi/asm/auxvec.h|   4 +-
 arch/x86/kernel/cpu/common.c  |   3 +
 arch/x86/kernel/fpu/signal.c  |  19 
 arch/x86/kernel/signal.c  |  69 +++-
 include/uapi/linux/auxvec.h   |   1 +
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/sigaltstack.c | 128 ++
 11 files changed, 227 insertions(+), 8 deletions(-)
 create mode 100644 tools/testing/selftests/x86/sigaltstack.c

-- 
2.17.1



[PATCH v5 1/5] uapi: Move the aux vector AT_MINSIGSTKSZ define to uapi

2021-02-03 Thread Chang S. Bae
Move the AT_MINSIGSTKSZ definition to generic Linux from arm64. It is
already used as generic ABI in glibc's generic elf.h, and this move will
prevent future namespace conflicts. In particular, x86 will re-use this
generic definition.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: Carlos O'Donell 
Cc: Dave Martin 
Cc: libc-al...@sourceware.org
Cc: linux-a...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
Change from v4:
* Added as a new patch (Carlos O'Donell)
---
 arch/arm64/include/uapi/asm/auxvec.h | 1 -
 include/uapi/linux/auxvec.h  | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/uapi/asm/auxvec.h 
b/arch/arm64/include/uapi/asm/auxvec.h
index 743c0b84fd30..767d710c92aa 100644
--- a/arch/arm64/include/uapi/asm/auxvec.h
+++ b/arch/arm64/include/uapi/asm/auxvec.h
@@ -19,7 +19,6 @@
 
 /* vDSO location */
 #define AT_SYSINFO_EHDR33
-#define AT_MINSIGSTKSZ 51  /* stack needed for signal delivery */
 
 #define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
 
diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h
index abe5f2b6581b..cc4fa77bd2a7 100644
--- a/include/uapi/linux/auxvec.h
+++ b/include/uapi/linux/auxvec.h
@@ -33,5 +33,6 @@
 
 #define AT_EXECFN  31  /* filename of program */
 
+#define AT_MINSIGSTKSZ 51  /* stack needed for signal delivery  */
 
 #endif /* _UAPI_LINUX_AUXVEC_H */
-- 
2.17.1



[PATCH] x86/fpu: Use consistent test for X86_FEATURE_XSAVES

2021-02-02 Thread Chang S. Bae
When XSAVES is present, the kernel always takes advantage of it, and XSAVES
always uses compacted format.

The macro using_compacted_format() implies that using compacted format may
be possible without XSAVES (say by using XSAVEC), but that is not possible
here, so delete that confusing macro and simply test for what we want to
know in the first place -- if we have XSAVES or not.

Cleanup only. No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: Dave Hansen 
Cc: Borislav Petkov 
Cc: linux-kernel@vger.kernel.org
Cc: x...@kernel.org
---
 arch/x86/include/asm/fpu/xstate.h |  1 -
 arch/x86/kernel/fpu/regset.c  |  4 ++--
 arch/x86/kernel/fpu/signal.c  |  2 +-
 arch/x86/kernel/fpu/xstate.c  | 18 ++
 4 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 47a92232d595..96c43380b8c2 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -102,7 +102,6 @@ extern void __init update_regset_xstate_info(unsigned int 
size,
 
 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
 const void *get_xsave_field_ptr(int xfeature_nr);
-int using_compacted_format(void);
 int xfeature_size(int xfeature_nr);
 struct membuf;
 void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave);
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c413756ba89f..3e52e15a4891 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -84,7 +84,7 @@ int xstateregs_get(struct task_struct *target, const struct 
user_regset *regset,
 
fpu__prepare_read(fpu);
 
-   if (using_compacted_format()) {
+   if (boot_cpu_has(X86_FEATURE_XSAVES)) {
copy_xstate_to_kernel(to, xsave);
return 0;
} else {
@@ -124,7 +124,7 @@ int xstateregs_set(struct task_struct *target, const struct 
user_regset *regset,
 
fpu__prepare_write(fpu);
 
-   if (using_compacted_format()) {
+   if (boot_cpu_has(X86_FEATURE_XSAVES)) {
if (kbuf)
ret = copy_kernel_to_xstate(xsave, kbuf);
else
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..2d0efb9a27c1 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -405,7 +405,7 @@ static int __fpu__restore_sig(void __user *buf, void __user 
*buf_fx, int size)
if (use_xsave() && !fx_only) {
u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
 
-   if (using_compacted_format()) {
+   if (boot_cpu_has(X86_FEATURE_XSAVES)) {
ret = copy_user_to_xstate(>state.xsave, buf_fx);
} else {
ret = __copy_from_user(>state.xsave, buf_fx, 
state_size);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 683749b80ae2..0e5fa511f0a1 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -500,20 +500,6 @@ int xfeature_size(int xfeature_nr)
return eax;
 }
 
-/*
- * 'XSAVES' implies two different things:
- * 1. saving of supervisor/system state
- * 2. using the compacted format
- *
- * Use this function when dealing with the compacted format so
- * that it is obvious which aspect of 'XSAVES' is being handled
- * by the calling code.
- */
-int using_compacted_format(void)
-{
-   return boot_cpu_has(X86_FEATURE_XSAVES);
-}
-
 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
 int validate_user_xstate_header(const struct xstate_header *hdr)
 {
@@ -634,7 +620,7 @@ static void do_extra_xstate_size_checks(void)
 * Supervisor state components can be managed only by
 * XSAVES, which is compacted-format only.
 */
-   if (!using_compacted_format())
+   if (!boot_cpu_has(X86_FEATURE_XSAVES))
XSTATE_WARN_ON(xfeature_is_supervisor(i));
 
/* Align from the end of the previous feature */
@@ -646,7 +632,7 @@ static void do_extra_xstate_size_checks(void)
 * them for being ordered (increasing offsets) in
 * setup_xstate_features().
 */
-   if (!using_compacted_format())
+   if (!boot_cpu_has(X86_FEATURE_XSAVES))
paranoid_xstate_size = xfeature_uncompacted_offset(i);
/*
 * The compacted-format offset always depends on where
-- 
2.17.1



[PATCH v4 4/4] selftest/x86/signal: Include test cases for validating sigaltstack

2021-01-15 Thread Chang S. Bae
The test measures the kernel's signal delivery with different (enough vs.
insufficient) stack sizes.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: Borislav Petkov 
Cc: x...@kernel.org
Cc: linux-kselft...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Revised test messages again (Borislav Petkov)

Changes from v2:
* Revised test messages (Borislav Petkov)
---
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/sigaltstack.c | 128 ++
 2 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/sigaltstack.c

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index 6703c7906b71..e0c52e5ab49e 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -13,7 +13,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) 
trivial_program.c -no-pie)
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt 
test_mremap_vdso \
check_initial_reg_state sigreturn iopl ioperm \
test_vdso test_vsyscall mov_ss_trap \
-   syscall_arg_fault fsgsbase_restore
+   syscall_arg_fault fsgsbase_restore sigaltstack
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
diff --git a/tools/testing/selftests/x86/sigaltstack.c 
b/tools/testing/selftests/x86/sigaltstack.c
new file mode 100644
index ..f689af75e979
--- /dev/null
+++ b/tools/testing/selftests/x86/sigaltstack.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* sigaltstack()-enforced minimum stack */
+#define ENFORCED_MINSIGSTKSZ   2048
+
+#ifndef AT_MINSIGSTKSZ
+#  define AT_MINSIGSTKSZ   51
+#endif
+
+static int nerrs;
+
+static bool sigalrm_expected;
+
+static unsigned long at_minstack_size;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+  int flags)
+{
+   struct sigaction sa;
+
+   memset(, 0, sizeof(sa));
+   sa.sa_sigaction = handler;
+   sa.sa_flags = SA_SIGINFO | flags;
+   sigemptyset(_mask);
+   if (sigaction(sig, , 0))
+   err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+   struct sigaction sa;
+
+   memset(, 0, sizeof(sa));
+   sa.sa_handler = SIG_DFL;
+   sigemptyset(_mask);
+   if (sigaction(sig, , 0))
+   err(1, "sigaction");
+}
+
+static int setup_altstack(void *start, unsigned long size)
+{
+   stack_t ss;
+
+   memset(, 0, sizeof(ss));
+   ss.ss_size = size;
+   ss.ss_sp = start;
+
+   return sigaltstack(, NULL);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+   if (sigalrm_expected) {
+   printf("[FAIL]\tWrong signal delivered: SIGSEGV (expected 
SIGALRM).");
+   nerrs++;
+   } else {
+   printf("[OK]\tSIGSEGV signal delivered.\n");
+   }
+
+   siglongjmp(jmpbuf, 1);
+}
+
+static void sigalrm(int sig, siginfo_t *info, void *ctx_void)
+{
+   if (!sigalrm_expected) {
+   printf("[FAIL]\tWrong signal delivered: SIGALRM (expected 
SIGSEGV).");
+   nerrs++;
+   } else {
+   printf("[OK]\tSIGALRM signal delivered.\n");
+   }
+}
+
+static void test_sigaltstack(void *altstack, unsigned long size)
+{
+   if (setup_altstack(altstack, size))
+   err(1, "sigaltstack()");
+
+   sigalrm_expected = (size > at_minstack_size) ? true : false;
+
+   sethandler(SIGSEGV, sigsegv, 0);
+   sethandler(SIGALRM, sigalrm, SA_ONSTACK);
+
+   if (!sigsetjmp(jmpbuf, 1)) {
+   printf("[RUN]\tTest an alternate signal stack of %ssufficient 
size.\n",
+  sigalrm_expected ? "" : "in");
+   printf("\tRaise SIGALRM. %s is expected to be delivered.\n",
+  sigalrm_expected ? "It" : "SIGSEGV");
+   raise(SIGALRM);
+   }
+
+   clearhandler(SIGALRM);
+   clearhandler(SIGSEGV);
+}
+
+int main(void)
+{
+   void *altstack;
+
+   at_minstack_size = getauxval(AT_MINSIGSTKSZ);
+
+   altstack = mmap(NULL, at_minstack_size + SIGSTKSZ, PROT_READ | 
PROT_WRITE,
+   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+   if (altstack == MAP_FAILED)
+   err(1, "mmap()");
+
+   if ((ENFORCED_MINSIGSTKSZ + 1) < at_minstack_size)
+   test_sigaltstack(altstack, ENFORCED_MINSIGSTKSZ + 1);
+
+   test_sigaltstack(altstack, at_minstack_size + SIGSTKSZ);
+
+   return nerrs == 0 ? 0 : 1;
+}
-- 
2.17.1



[PATCH v4 3/4] x86/signal: Detect and prevent an alternate signal stack overflow

2021-01-15 Thread Chang S. Bae
The kernel pushes context on to the userspace stack to prepare for the
user's signal handler. When the user has supplied an alternate signal
stack, via sigaltstack(2), it is easy for the kernel to verify that the
stack size is sufficient for the current hardware context.

Check if writing the hardware context to the alternate stack will exceed
its size. If yes, then instead of corrupting user data and proceeding with
the original signal handler, an immediate SIGSEGV signal is delivered.

While previous patches in this series allow new source code to discover and
use a sufficient alternate signal stack size, this check is still necessary
to protect binaries with insufficient alternate signal stack size from data
corruption.

Suggested-by: Jann Horn 
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Reviewed-by: Jann Horn 
Cc: Borislav Petkov 
Cc: Jann Horn 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v3:
* Updated the changelog (Borislav Petkov)

Changes from v2:
* Simplified the implementation (Jann Horn)
---
 arch/x86/kernel/signal.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 761d856f8ef7..91056a940271 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -242,7 +242,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
-   int onsigstack = on_sig_stack(sp);
+   bool onsigstack = on_sig_stack(sp);
int ret;
 
/* redzone */
@@ -251,8 +251,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
 
/* This is the X/Open sanctioned signal stack switching.  */
if (ka->sa.sa_flags & SA_ONSTACK) {
-   if (sas_ss_flags(sp) == 0)
+   if (sas_ss_flags(sp) == 0) {
sp = current->sas_ss_sp + current->sas_ss_size;
+   /* On the alternate signal stack */
+   onsigstack = true;
+   }
} else if (IS_ENABLED(CONFIG_X86_32) &&
   !onsigstack &&
   regs->ss != __USER_DS &&
-- 
2.17.1



[PATCH v4 1/4] x86/signal: Introduce helpers to get the maximum signal frame size

2021-01-15 Thread Chang S. Bae
Signal frames do not have a fixed format and can vary in size when a number
of things change: support XSAVE features, 32 vs. 64-bit apps. Add the code
to support a runtime method for userspace to dynamically discover how large
a signal stack needs to be.

Introduce a new variable, max_frame_size, and helper functions for the
calculation to be used in a new user interface. Set max_frame_size to a
system-wide worst-case value, instead of storing multiple app-specific
values.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Acked-by: H.J. Lu 
Cc: Borislav Petkov 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Renamed the fpstate size helper with cleanup (Borislav Petkov)
* Moved the sigframe struct size defines to where used (Borislav Petkov)
* Removed unneeded sentence in the changelog (Borislav Petkov)

Change from v1:
* Took stack alignment into account for sigframe size (Dave Martin)
---
 arch/x86/include/asm/fpu/signal.h |  2 ++
 arch/x86/include/asm/sigframe.h   |  2 ++
 arch/x86/kernel/cpu/common.c  |  3 ++
 arch/x86/kernel/fpu/signal.c  | 19 +++
 arch/x86/kernel/signal.c  | 57 +--
 5 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/fpu/signal.h 
b/arch/x86/include/asm/fpu/signal.h
index 7fb516b6893a..8b6631dffefd 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -29,6 +29,8 @@ unsigned long
 fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
 unsigned long *buf_fx, unsigned long *size);
 
+unsigned long fpu__get_fpstate_size(void);
+
 extern void fpu__init_prepare_fx_sw_frame(void);
 
 #endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 84eab2724875..5b1ed650b124 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -85,4 +85,6 @@ struct rt_sigframe_x32 {
 
 #endif /* CONFIG_X86_64 */
 
+void __init init_sigframe_size(void);
+
 #endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..6954932272d5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "cpu.h"
 
@@ -1331,6 +1332,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 
*c)
 
fpu__init_system(c);
 
+   init_sigframe_size();
+
 #ifdef CONFIG_X86_32
/*
 * Regardless of whether PCID is enumerated, the SDM says
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..dbb304e48f16 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -507,6 +507,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
 
return sp;
 }
+
+unsigned long fpu__get_fpstate_size(void)
+{
+   unsigned long ret = xstate_sigframe_size();
+
+   /*
+* This space is needed on (most) 32-bit kernels, or when a 32-bit
+* app is running on a 64-bit kernel. To keep things simple, just
+* assume the worst case and always include space for 'freg_state',
+* even for 64-bit apps on 64-bit kernels. This wastes a bit of
+* space, but keeps the code simple.
+*/
+   if ((IS_ENABLED(CONFIG_IA32_EMULATION) ||
+IS_ENABLED(CONFIG_X86_32)) && use_fxsr())
+   ret += sizeof(struct fregs_state);
+
+   return ret;
+}
+
 /*
  * Prepare the SW reserved portion of the fxsave memory layout, indicating
  * the presence of the extended state information in the memory layout
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index be0d7d4152ec..138a9f5b78d8 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -212,6 +212,11 @@ do {   
\
  * Set up a signal frame.
  */
 
+/* x86 ABI requires 16-byte alignment */
+#define FRAME_ALIGNMENT16UL
+
+#define MAX_FRAME_PADDING  (FRAME_ALIGNMENT - 1)
+
 /*
  * Determine which stack to use..
  */
@@ -222,9 +227,9 @@ static unsigned long align_sigframe(unsigned long sp)
 * Align the stack pointer according to the i386 ABI,
 * i.e. so that on function entry ((sp + 4) & 15) == 0.
 */
-   sp = ((sp + 4) & -16ul) - 4;
+   sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
 #else /* !CONFIG_X86_32 */
-   sp = round_down(sp, 16) - 8;
+   sp = round_down(sp, FRAME_ALIGNMENT) - 8;
 #endif
return sp;
 }
@@ -663,6 +668,54 @@ SYSCALL_DEFINE0(rt_sigreturn)
return 0;
 }
 
+/*
+ * There are four different struct types for signal frame: sigframe_ia32,
+ * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+ * -- the largest size. It means the size for 64-bit apps is a bit more
+ * than needed, but this keeps the code simple.
+ */
+#if def

[PATCH v4 0/4] x86: Improve Minimum Alternate Stack Size

2021-01-15 Thread Chang S. Bae
During signal entry, the kernel pushes data onto the normal userspace
stack. On x86, the data pushed onto the user stack includes XSAVE state,
which has grown over time as new features and larger registers have been
added to the architecture.

MINSIGSTKSZ is a constant provided in the kernel signal.h headers and
typically distributed in lib-dev(el) packages, e.g. [1]. Its value is
compiled into programs and is part of the user/kernel ABI. The MINSIGSTKSZ
constant indicates to userspace how much data the kernel expects to push on
the user stack, [2][3].

However, this constant is much too small and does not reflect recent
additions to the architecture. For instance, when AVX-512 states are in
use, the signal frame size can be 3.5KB while MINSIGSTKSZ remains 2KB.

The bug report [4] explains this as an ABI issue. The small MINSIGSTKSZ can
cause user stack overflow when delivering a signal.

In this series, we suggest a couple of things:
1. Provide a variable minimum stack size to userspace, as a similar
   approach to [5]
2. Avoid using a too-small alternate stack

Changes from v3 [8]:
* Updated the changelog (Borislav Petkov)
* Revised the test messages again (Borislav Petkov)

Changes from v2 [7]:
* Simplified the sigaltstack overflow prevention (Jann Horn)
* Renamed fpstate size helper with cleanup (Borislav Petkov)
* Cleaned up the signframe struct size defines (Borislav Petkov)
* Revised the selftest messages (Borislav Petkov)
* Revised a changelog (Borislav Petkov)

Changes from v1 [6]:
* Took stack alignment into account for sigframe size (Dave Martin)

[1]: 
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/bits/sigstack.h;h=b9dca794da093dc4d41d39db9851d444e1b54d9b;hb=HEAD
[2]: https://www.gnu.org/software/libc/manual/html_node/Signal-Stack.html
[3]: https://man7.org/linux/man-pages/man2/sigaltstack.2.html
[4]: https://bugzilla.kernel.org/show_bug.cgi?id=153531
[5]: 
https://blog.linuxplumbersconf.org/2017/ocw/system/presentations/4671/original/plumbers-dm-2017.pdf
[6]: 
https://lore.kernel.org/lkml/20200929205746.6763-1-chang.seok@intel.com/
[7]: https://lore.kernel.org/lkml/20201119190237.626-1-chang.seok@intel.com/
[8]: 
https://lore.kernel.org/lkml/20201223015312.4882-1-chang.seok@intel.com/

Chang S. Bae (4):
  x86/signal: Introduce helpers to get the maximum signal frame size
  x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ
  x86/signal: Detect and prevent an alternate signal stack overflow
  selftest/x86/signal: Include test cases for validating sigaltstack

 arch/x86/include/asm/elf.h|   4 +
 arch/x86/include/asm/fpu/signal.h |   2 +
 arch/x86/include/asm/sigframe.h   |   2 +
 arch/x86/include/uapi/asm/auxvec.h|   6 +-
 arch/x86/kernel/cpu/common.c  |   3 +
 arch/x86/kernel/fpu/signal.c  |  19 
 arch/x86/kernel/signal.c  |  69 +++-
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/sigaltstack.c | 128 ++
 9 files changed, 228 insertions(+), 7 deletions(-)
 create mode 100644 tools/testing/selftests/x86/sigaltstack.c

-- 
2.17.1



[PATCH v4 2/4] x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ

2021-01-15 Thread Chang S. Bae
Historically, signal.h defines MINSIGSTKSZ (2KB) and SIGSTKSZ (8KB), for
use by all architectures with sigaltstack(2). Over time, the hardware state
size grew, but these constants did not evolve. Today, literal use of these
constants on several architectures may result in signal stack overflow, and
thus user data corruption.

A few years ago, the ARM team addressed this issue by establishing
getauxval(AT_MINSIGSTKSZ), such that the kernel can supply at runtime a
value that is an appropriate replacement on current and future hardware.

Add getauxval(AT_MINSIGSTKSZ) support to x86, analogous to the support
added for ARM in commit 94b07c1f8c39 ("arm64: signal: Report signal frame
size to userspace via auxv").

Reported-by: Florian Weimer 
Fixes: c2bc11f10a39 ("x86, AVX-512: Enable AVX-512 States Context Switch")
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: H.J. Lu 
Cc: Fenghua Yu 
Cc: Dave Martin 
Cc: Michael Ellerman 
Cc: x...@kernel.org
Cc: libc-al...@sourceware.org
Cc: linux-a...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=153531
---
 arch/x86/include/asm/elf.h | 4 
 arch/x86/include/uapi/asm/auxvec.h | 6 --
 arch/x86/kernel/signal.c   | 5 +
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index b9a5d488f1a5..044b024abea1 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -311,6 +311,7 @@ do {
\
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY);\
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE);\
}   \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   
\
 } while (0)
 
 /*
@@ -327,6 +328,7 @@ extern unsigned long task_size_32bit(void);
 extern unsigned long task_size_64bit(int full_addr_space);
 extern unsigned long get_mmap_base(int is_legacy);
 extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
+extern unsigned long get_sigframe_size(void);
 
 #ifdef CONFIG_X86_32
 
@@ -348,6 +350,7 @@ do {
\
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR,\
(unsigned long __force)current->mm->context.vdso); \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   
\
 } while (0)
 
 /* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */
@@ -356,6 +359,7 @@ do {
\
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR,\
(unsigned long __force)current->mm->context.vdso); \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   
\
 } while (0)
 
 #define AT_SYSINFO 32
diff --git a/arch/x86/include/uapi/asm/auxvec.h 
b/arch/x86/include/uapi/asm/auxvec.h
index 580e3c567046..edd7808060e6 100644
--- a/arch/x86/include/uapi/asm/auxvec.h
+++ b/arch/x86/include/uapi/asm/auxvec.h
@@ -10,11 +10,13 @@
 #endif
 #define AT_SYSINFO_EHDR33
 
+#define AT_MINSIGSTKSZ 51
+
 /* entries in ARCH_DLINFO: */
 #if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
-# define AT_VECTOR_SIZE_ARCH 2
+# define AT_VECTOR_SIZE_ARCH 3
 #else /* else it's non-compat x86-64 */
-# define AT_VECTOR_SIZE_ARCH 1
+# define AT_VECTOR_SIZE_ARCH 2
 #endif
 
 #endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 138a9f5b78d8..761d856f8ef7 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -716,6 +716,11 @@ void __init init_sigframe_size(void)
max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
 }
 
+unsigned long get_sigframe_size(void)
+{
+   return max_frame_size;
+}
+
 static inline int is_ia32_compat_frame(struct ksignal *ksig)
 {
return IS_ENABLED(CONFIG_IA32_EMULATION) &&
-- 
2.17.1



[PATCH v3 21/21] x86/fpu/xstate: Introduce boot-parameters to control some state component support

2020-12-23 Thread Chang S. Bae
"xstate.disable=0x6" will disable AMX on a system that has AMX compiled
into XFEATURE_MASK_USER_ENABLED.

"xstate.enable=0x6" will enable AMX on a system that does NOT have AMX
compiled into XFEATURE_MASK_USER_ENABLED (assuming the kernel is new enough
to support this feature).

Rename XFEATURE_MASK_USER_SUPPORTED to XFEATURE_MASK_USER_ENABLED to be
aligned with the new parameters.

While this cmdline is currently enabled only for AMX, it is intended to be
easily enabled to be useful for future XSAVE-enabled features.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Changed the kernel tainted when any unknown state is enabled. (Andy
  Lutomirski)
* Simplified the cmdline handling.
* Edited the changelog.

Changes from v1:
* Renamed the user state mask define (Andy Lutomirski and Dave Hansen)
* Changed the error message (Dave Hansen)
* Fixed xfeatures_mask_user()
* Rebased the upstream kernel (5.10) -- revived the param parse function
---
 .../admin-guide/kernel-parameters.txt | 15 +
 arch/x86/include/asm/fpu/types.h  |  6 ++
 arch/x86/include/asm/fpu/xstate.h | 24 +++
 arch/x86/kernel/fpu/init.c| 65 +--
 4 files changed, 93 insertions(+), 17 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 44fde25bb221..a67ae04d43c5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6002,6 +6002,21 @@
which allow the hypervisor to 'idle' the guest on lock
contention.
 
+   xstate.enable=  [X86-64]
+   xstate.disable= [X86-64]
+   The kernel is compiled with a default xstate bitmask --
+   enabling it to use the XSAVE hardware to efficiently
+   save and restore thread states on context switch.
+   xstate.enable allows adding to that default mask at
+   boot-time without recompiling the kernel just to support
+   the new thread state. (Note that the kernel will ignore
+   any bits in the mask that do not correspond to features
+   that are actually available in CPUID)  xstate.disable
+   allows clearing bits in the default mask, forcing the
+   kernel to forget that it supports the specified thread
+   state. When a bit set for both, the kernel takes
+   xstate.disable in a priority.
+
xirc2ps_cs= [NET,PCMCIA]
Format:

,[,[,[,]]]
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index bf9511efd546..8835d3f6acb7 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -149,6 +149,12 @@ enum xfeature {
 #define XFEATURE_MASK_XTILE(XFEATURE_MASK_XTILE_DATA \
 | XFEATURE_MASK_XTILE_CFG)
 
+#define XFEATURE_REGION_MASK(max_bit, min_bit) \
+   ((BIT_ULL((max_bit) - (min_bit) + 1) - 1) << (min_bit))
+
+#define XFEATURE_MASK_CONFIGURABLE \
+   XFEATURE_REGION_MASK(XFEATURE_XTILE_DATA, XFEATURE_XTILE_CFG)
+
 #define FIRST_EXTENDED_XFEATUREXFEATURE_YMM
 
 struct reg_128_bit {
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 8f5218d420ad..c27feca8e66c 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -25,17 +25,17 @@
 
 #define XSAVE_ALIGNMENT 64
 
-/* All currently supported user features */
-#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \
- XFEATURE_MASK_SSE | \
- XFEATURE_MASK_YMM | \
- XFEATURE_MASK_OPMASK | \
- XFEATURE_MASK_ZMM_Hi256 | \
- XFEATURE_MASK_Hi16_ZMM | \
- XFEATURE_MASK_PKRU | \
- XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR | \
- XFEATURE_MASK_XTILE)
+/* All currently enabled user features */
+#define XFEATURE_MASK_USER_ENABLED (XFEATURE_MASK_FP | \
+   XFEATURE_MASK_SSE | \
+   XFEATURE_MASK_YMM | \
+   XFEATURE_MASK_OPMASK | \
+   XFEATURE_MASK_ZMM_Hi256 | \
+   XFEATURE_MASK_Hi16_ZMM   | \
+   XFEATURE_MASK_PKRU | \
+  

[PATCH v3 18/21] x86/fpu/amx: Enable the AMX feature in 64-bit mode

2020-12-23 Thread Chang S. Bae
In 64-bit mode, include the AMX state components in
XFEATURE_MASK_USER_SUPPORTED.

The XFD feature will be used to dynamically allocate per-task XSAVE
buffer on first use.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/fpu/xstate.h | 3 ++-
 arch/x86/kernel/fpu/init.c| 8 ++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 08d3dd18d7d8..8f5218d420ad 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -34,7 +34,8 @@
  XFEATURE_MASK_Hi16_ZMM | \
  XFEATURE_MASK_PKRU | \
  XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR)
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_XTILE)
 
 /* All currently supported supervisor features */
 #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 5dac97158030..c77c1c5580f9 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -233,8 +233,12 @@ static void __init 
fpu__init_system_xstate_size_legacy(void)
  */
 u64 __init fpu__get_supported_xfeatures_mask(void)
 {
-   return XFEATURE_MASK_USER_SUPPORTED |
-  XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+   u64 mask = XFEATURE_MASK_USER_SUPPORTED | 
XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+
+   if (!IS_ENABLED(CONFIG_X86_64))
+   mask &= ~(XFEATURE_MASK_XTILE);
+
+   return mask;
 }
 
 /* Legacy code to initialize eager fpu mode. */
-- 
2.17.1



[PATCH v3 09/21] x86/fpu/xstate: Introduce wrapper functions to organize xstate buffer access

2020-12-23 Thread Chang S. Bae
The struct fpu includes two (possible) xstate buffers -- fpu->state and
fpu->state_ptr. Instead of open-coding access to one of them, provide a
wrapper that covers both cases.

KVM does not yet use fpu->state_ptr, and so it is left unchanged.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Updated the changelog with task->fpu removed. (Boris Petkov)
---
 arch/x86/include/asm/fpu/internal.h | 10 ++
 arch/x86/include/asm/fpu/xstate.h   | 10 ++
 arch/x86/include/asm/trace/fpu.h|  6 --
 arch/x86/kernel/fpu/core.c  | 27 ---
 arch/x86/kernel/fpu/regset.c| 28 +---
 arch/x86/kernel/fpu/signal.c| 23 +--
 arch/x86/kernel/fpu/xstate.c| 20 +++-
 7 files changed, 77 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index bbdd304719c6..67ffd1d7c95e 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -210,10 +210,12 @@ static inline int copy_user_to_fregs(struct fregs_state 
__user *fx)
 
 static inline void copy_fxregs_to_kernel(struct fpu *fpu)
 {
+   union fpregs_state *xstate = __xstate(fpu);
+
if (IS_ENABLED(CONFIG_X86_32))
-   asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
+   asm volatile("fxsave %[fx]" : [fx] "=m" (xstate->fxsave));
else
-   asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
+   asm volatile("fxsaveq %[fx]" : [fx] "=m" (xstate->fxsave));
 }
 
 /* These macros all use (%edi)/(%rdi) as the single memory argument. */
@@ -411,7 +413,7 @@ static inline int copy_user_to_xregs(struct xregs_state 
__user *buf, u64 mask)
  */
 static inline int copy_kernel_to_xregs_err(struct fpu *fpu, u64 mask)
 {
-   struct xregs_state *xstate = >state.xsave;
+   struct xregs_state *xstate = __xsave(fpu);
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
@@ -440,7 +442,7 @@ static inline void __copy_kernel_to_fpregs(union 
fpregs_state *fpstate, u64 mask
 
 static inline void copy_kernel_to_fpregs(struct fpu *fpu)
 {
-   union fpregs_state *fpstate = >state;
+   union fpregs_state *fpstate = __xstate(fpu);
 
/*
 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 62f6583f34fa..5927033e017f 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -102,6 +102,16 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 extern void __init update_regset_xstate_info(unsigned int size,
 u64 xstate_mask);
 
+static inline union fpregs_state *__xstate(struct fpu *fpu)
+{
+   return (fpu->state_ptr) ? fpu->state_ptr : >state;
+}
+
+static inline struct xregs_state *__xsave(struct fpu *fpu)
+{
+   return &__xstate(fpu)->xsave;
+}
+
 void *get_xsave_addr(struct fpu *fpu, int xfeature_nr);
 unsigned int get_xstate_size(u64 mask);
 int alloc_xstate_buffer(struct fpu *fpu, u64 mask);
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index bf88b873..4b21c34436f9 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -22,8 +22,10 @@ DECLARE_EVENT_CLASS(x86_fpu,
__entry->fpu= fpu;
__entry->load_fpu   = test_thread_flag(TIF_NEED_FPU_LOAD);
if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
-   __entry->xfeatures = fpu->state.xsave.header.xfeatures;
-   __entry->xcomp_bv  = fpu->state.xsave.header.xcomp_bv;
+   struct xregs_state *xsave = __xsave(fpu);
+
+   __entry->xfeatures = xsave->header.xfeatures;
+   __entry->xcomp_bv  = xsave->header.xcomp_bv;
}
),
TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index aad1a7102096..8b9d3ec9ac46 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -94,14 +94,18 @@ EXPORT_SYMBOL(irq_fpu_usable);
  */
 int copy_fpregs_to_fpstate(struct fpu *fpu)
 {
+   union fpregs_state *xstate = __xstate(fpu);
+
if (likely(use_xsave())) {
-   copy_xregs_to_kernel(>state.xsave);
+   struct xregs_state *xsave = >xsave;
+
+   copy_xregs_to_kernel(xsave);
 
/

[PATCH v3 19/21] selftest/x86/amx: Include test cases for the AMX state management

2020-12-23 Thread Chang S. Bae
This selftest exercises the kernel's behavior of not inheriting AMX state
and its ability to context-switch the state, by verifying that multiple
threads retain unique data.

Also, ptrace() is used to insert AMX state into existing threads -- both
before and after the existing thread has initialized its AMX state.

Collect the test cases of validating those operations together, as they
share some common setup for the AMX state.

These test cases do not depend on AMX compiler support, as they employ
user-space-XSAVE directly to access AMX state.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-kselft...@vger.kernel.org
---
Changes from v2:
* Updated the test messages and the changelog as tile data is not inherited
  to a child anymore.
* Removed bytecode for the instructions already supported by binutils.
* Changed to check the XSAVE availability in a reliable way.

Changes from v1:
* Removed signal testing code
---
 tools/testing/selftests/x86/Makefile |   2 +-
 tools/testing/selftests/x86/amx.c| 677 +++
 2 files changed, 678 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/amx.c

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index 6703c7906b71..8408bbde788f 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -17,7 +17,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs 
syscall_nt test_mremap
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
-TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering
+TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering amx
 # Some selftests require 32bit support enabled also on 64bit systems
 TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
 
diff --git a/tools/testing/selftests/x86/amx.c 
b/tools/testing/selftests/x86/amx.c
new file mode 100644
index ..f4ecdfd27ae9
--- /dev/null
+++ b/tools/testing/selftests/x86/amx.c
@@ -0,0 +1,677 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#ifndef __x86_64__
+# error This test is 64-bit only
+#endif
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#define PAGE_SIZE  (1 << 12)
+
+#define NUM_TILES  8
+#define TILE_SIZE  1024
+#define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE)
+
+struct xsave_data {
+   u8 area[XSAVE_SIZE];
+} __attribute__((aligned(64)));
+
+/* Tile configuration associated: */
+#define MAX_TILES  16
+#define RESERVED_BYTES 14
+
+struct tile_config {
+   u8  palette_id;
+   u8  start_row;
+   u8  reserved[RESERVED_BYTES];
+   u16 colsb[MAX_TILES];
+   u8  rows[MAX_TILES];
+};
+
+struct tile_data {
+   u8 data[NUM_TILES * TILE_SIZE];
+};
+
+static inline u64 __xgetbv(u32 index)
+{
+   u32 eax, edx;
+
+   asm volatile("xgetbv;"
+: "=a" (eax), "=d" (edx)
+: "c" (index));
+   return eax + ((u64)edx << 32);
+}
+
+static inline void __cpuid(u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+{
+   asm volatile("cpuid;"
+: "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
+: "0" (*eax), "2" (*ecx));
+}
+
+/* Load tile configuration */
+static inline void __ldtilecfg(void *cfg)
+{
+   asm volatile(".byte 0xc4,0xe2,0x78,0x49,0x00"
+: : "a"(cfg));
+}
+
+/* Load tile data to %tmm0 register only */
+static inline void __tileloadd(void *tile)
+{
+   asm volatile(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10"
+: : "a"(tile), "d"(0));
+}
+
+/* Save extended states */
+static inline void __xsave(void *buffer, u32 lo, u32 hi)
+{
+   asm volatile("xsave (%%rdi)"
+: : "D" (buffer), "a" (lo), "d" (hi)
+: "memory");
+}
+
+/* Restore extended states */
+static inline void __xrstor(void *buffer, u32 lo, u32 hi)
+{
+   asm volatile("xrstor (%%rdi)"
+: : "D" (buffer), "a" (lo), "d" (hi));
+}
+
+/* Release tile states to init values */
+static inline void __tilerelease(void)
+{
+   asm vola

[PATCH v3 17/21] x86/fpu/amx: Define AMX state components and have it used for boot-time checks

2020-12-23 Thread Chang S. Bae
Linux uses check_xstate_against_struct() to sanity check the size of
XSTATE-enabled features. AMX is the XSAVE-enabled feature, and its size is
not hard-coded but discoverable at run-time via CPUID.

The AMX state is composed of state components 17 and 18, which are all user
state components. The first component is the XTILECFG state of a 64-byte
tile-related control register. The state component 18, called XTILEDATA,
contains the actual tile data, and the state size varies on
implementations. The architectural maximum, as defined in the CPUID(0x1d,
1): EAX[15:0], is a byte less than 64KB. The first implementation supports
8KB.

Check the XTILEDATA state size dynamically. The feature introduces the new
tile register, TMM. Define one register struct only and read the number of
registers from CPUID. Cross-check the overall size with CPUID again.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Updated the code comments.

Changes from v1:
* Rebased on the upstream kernel (5.10)
---
 arch/x86/include/asm/fpu/types.h  | 27 ++
 arch/x86/include/asm/fpu/xstate.h |  2 +
 arch/x86/kernel/fpu/xstate.c  | 62 +++
 3 files changed, 91 insertions(+)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 3fc6dbbe3ede..bf9511efd546 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -120,6 +120,9 @@ enum xfeature {
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
+   XFEATURE_RSRVD_COMP_16,
+   XFEATURE_XTILE_CFG,
+   XFEATURE_XTILE_DATA,
 
XFEATURE_MAX,
 };
@@ -136,11 +139,15 @@ enum xfeature {
 #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
 #define XFEATURE_MASK_PASID(1 << XFEATURE_PASID)
 #define XFEATURE_MASK_LBR  (1 << XFEATURE_LBR)
+#define XFEATURE_MASK_XTILE_CFG(1 << XFEATURE_XTILE_CFG)
+#define XFEATURE_MASK_XTILE_DATA   (1 << XFEATURE_XTILE_DATA)
 
 #define XFEATURE_MASK_FPSSE(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
 #define XFEATURE_MASK_AVX512   (XFEATURE_MASK_OPMASK \
 | XFEATURE_MASK_ZMM_Hi256 \
 | XFEATURE_MASK_Hi16_ZMM)
+#define XFEATURE_MASK_XTILE(XFEATURE_MASK_XTILE_DATA \
+| XFEATURE_MASK_XTILE_CFG)
 
 #define FIRST_EXTENDED_XFEATUREXFEATURE_YMM
 
@@ -153,6 +160,9 @@ struct reg_256_bit {
 struct reg_512_bit {
u8  regbytes[512/8];
 };
+struct reg_1024_byte {
+   u8  regbytes[1024];
+};
 
 /*
  * State component 2:
@@ -255,6 +265,23 @@ struct arch_lbr_state {
u64 ler_to;
u64 ler_info;
struct lbr_entryentries[];
+};
+
+/*
+ * State component 17: 64-byte tile configuration register.
+ */
+struct xtile_cfg {
+   u64 tcfg[8];
+} __packed;
+
+/*
+ * State component 18: 1KB tile data register.
+ * Each register represents 16 64-byte rows of the matrix
+ * data. But the number of registers depends on the actual
+ * implementation.
+ */
+struct xtile_data {
+   struct reg_1024_bytetmm;
 } __packed;
 
 /*
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 5927033e017f..08d3dd18d7d8 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -13,6 +13,8 @@
 
 #define XSTATE_CPUID   0x000d
 
+#define TILE_CPUID 0x001d
+
 #define FXSAVE_SIZE512
 
 #define XSAVE_HDR_SIZE 64
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c2acfee581ba..f54ff1d4a44b 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -41,6 +41,14 @@ static const char *xfeature_names[] =
"Protection Keys User registers",
"PASID state",
"unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "AMX Tile config"   ,
+   "AMX Tile data" ,
+   "unknown xstate feature",
 };
 
 struct xfeature_capflag_info {
@@ -60,6 +68,8 @@ static struct xfeature_capflag_info xfeature_capflags[] 
__initdata = {
{ XFEATURE_PT_UNIMPLEMENTED_SO_FAR, X86_FEATURE_INTEL_PT },
{ XFEATURE_PKRU,X86_FEATURE_PKU },
{ XFEATURE_PASID,   X86_FEATURE_ENQCMD },
+   { XFEATURE_XTILE_CFG,   X86_FEATURE_AMX_TILE },
+   { XFEATURE_XTILE_DATA,  X86_FEATURE

[PATCH v3 15/21] x86/fpu/xstate: Extend the table to map xstate components with features

2020-12-23 Thread Chang S. Bae
At compile-time xfeatures_mask_all includes all possible XCR0 features. At
run-time fpu__init_system_xstate() clears features in xfeatures_mask_all
that are not enabled in CPUID. It does this by looping through all possible
XCR0 features.

Update the code to handle the possibility that there will be gaps in the
XCR0 feature bit numbers.

No functional change, until hardware with bit number gaps in XCR0.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v1:
* Rebased on the upstream kernel (5.10)
---
 arch/x86/kernel/fpu/xstate.c | 41 ++--
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 592e67ff6fa7..c2acfee581ba 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -43,18 +43,23 @@ static const char *xfeature_names[] =
"unknown xstate feature",
 };
 
-static short xsave_cpuid_features[] __initdata = {
-   X86_FEATURE_FPU,
-   X86_FEATURE_XMM,
-   X86_FEATURE_AVX,
-   X86_FEATURE_MPX,
-   X86_FEATURE_MPX,
-   X86_FEATURE_AVX512F,
-   X86_FEATURE_AVX512F,
-   X86_FEATURE_AVX512F,
-   X86_FEATURE_INTEL_PT,
-   X86_FEATURE_PKU,
-   X86_FEATURE_ENQCMD,
+struct xfeature_capflag_info {
+   int xfeature_idx;
+   short cpu_cap;
+};
+
+static struct xfeature_capflag_info xfeature_capflags[] __initdata = {
+   { XFEATURE_FP,  X86_FEATURE_FPU },
+   { XFEATURE_SSE, X86_FEATURE_XMM },
+   { XFEATURE_YMM, X86_FEATURE_AVX },
+   { XFEATURE_BNDREGS, X86_FEATURE_MPX },
+   { XFEATURE_BNDCSR,  X86_FEATURE_MPX },
+   { XFEATURE_OPMASK,  X86_FEATURE_AVX512F },
+   { XFEATURE_ZMM_Hi256,   X86_FEATURE_AVX512F },
+   { XFEATURE_Hi16_ZMM,X86_FEATURE_AVX512F },
+   { XFEATURE_PT_UNIMPLEMENTED_SO_FAR, X86_FEATURE_INTEL_PT },
+   { XFEATURE_PKRU,X86_FEATURE_PKU },
+   { XFEATURE_PASID,   X86_FEATURE_ENQCMD },
 };
 
 /*
@@ -956,11 +961,15 @@ void __init fpu__init_system_xstate(void)
}
 
/*
-* Clear XSAVE features that are disabled in the normal CPUID.
+* Cross-check XSAVE feature with CPU capability flag. Clear the
+* mask bit for disabled features.
 */
-   for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
-   if (!boot_cpu_has(xsave_cpuid_features[i]))
-   xfeatures_mask_all &= ~BIT_ULL(i);
+   for (i = 0; i < ARRAY_SIZE(xfeature_capflags); i++) {
+   short cpu_cap = xfeature_capflags[i].cpu_cap;
+   int idx = xfeature_capflags[i].xfeature_idx;
+
+   if (!boot_cpu_has(cpu_cap))
+   xfeatures_mask_all &= ~BIT_ULL(idx);
}
 
xfeatures_mask_all &= fpu__get_supported_xfeatures_mask();
-- 
2.17.1



[PATCH v3 16/21] x86/cpufeatures/amx: Enumerate Advanced Matrix Extension (AMX) feature bits

2020-12-23 Thread Chang S. Bae
Intel's Advanced Matrix Extension (AMX) is a new 64-bit extended feature
consisting of two-dimensional registers and an accelerator unit. The first
implementation of the latter is the tile matrix multiply unit (TMUL). TMUL
performs SIMD dot-products on four bytes (INT8) or two bfloat16
floating-point (BF16) elements.

Here we add AMX to the kernel/user ABI, by enumerating the capability.
E.g., /proc/cpuinfo: amx_tile, amx_bf16, amx_int8

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/cpufeatures.h | 3 +++
 arch/x86/kernel/cpu/cpuid-deps.c   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index 5b6496ee3703..a1839b6a1929 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -375,6 +375,9 @@
 #define X86_FEATURE_TSXLDTRK   (18*32+16) /* TSX Suspend Load Address 
Tracking */
 #define X86_FEATURE_PCONFIG(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_ARCH_LBR   (18*32+19) /* Intel ARCH LBR */
+#define X86_FEATURE_AMX_BF16   (18*32+22) /* AMX BF16 Support */
+#define X86_FEATURE_AMX_TILE   (18*32+24) /* AMX tile Support */
+#define X86_FEATURE_AMX_INT8   (18*32+25) /* AMX INT8 Support */
 #define X86_FEATURE_SPEC_CTRL  (18*32+26) /* "" Speculation Control 
(IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP(18*32+27) /* "" Single Thread 
Indirect Branch Predictors */
 #define X86_FEATURE_FLUSH_L1D  (18*32+28) /* Flush L1D cache */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index a9e8e160ae30..1cef9264067e 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_ENQCMD,   X86_FEATURE_XSAVES},
{ X86_FEATURE_PER_THREAD_MBA,   X86_FEATURE_MBA   },
{ X86_FEATURE_XFD,  X86_FEATURE_XSAVES},
+   { X86_FEATURE_AMX_TILE, X86_FEATURE_XSAVE },
+   { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE  },
+   { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE  },
{}
 };
 
-- 
2.17.1



[PATCH v3 14/21] x86/fpu/xstate: Support ptracer-induced xstate buffer expansion

2020-12-23 Thread Chang S. Bae
ptrace() may update xstate data before the target task has taken an XFD
fault and expanded the context switch buffer. Detect this case and allocate
a sufficient buffer to support the request. Also, disable the (now
unnecessary) associated first-use fault.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Updated the changelog with task->fpu removed. (Boris Petkov)
* Updated the code comments.
---
 arch/x86/kernel/fpu/regset.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 8d863240b9c6..16ff8ac765c1 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -125,6 +125,35 @@ int xstateregs_set(struct task_struct *target, const 
struct user_regset *regset,
 
xsave = __xsave(fpu);
 
+   /*
+* When a ptracer attempts to write any state in the target buffer but 
not
+* sufficiently allocated, it dynamically expands the buffer.
+*/
+   if (count > get_xstate_size(fpu->state_mask)) {
+   unsigned int offset, size;
+   struct xstate_header hdr;
+   u64 mask;
+
+   offset = offsetof(struct xregs_state, header);
+   size = sizeof(hdr);
+
+   /* Retrieve XSTATE_BV */
+   if (kbuf) {
memcpy(&hdr, kbuf + offset, size);
+   } else {
ret = __copy_from_user(&hdr, ubuf + offset, size);
+   if (ret)
+   return ret;
+   }
+
+   mask = hdr.xfeatures & xfeatures_mask_user_dynamic;
+   if (!mask) {
+   ret = alloc_xstate_buffer(fpu, mask);
+   if (ret)
+   return ret;
+   }
+   }
+
fpu__prepare_write(fpu);
 
if (using_compacted_format()) {
-- 
2.17.1



[PATCH v3 08/21] x86/fpu/xstate: Define the scope of the initial xstate data

2020-12-23 Thread Chang S. Bae
init_fpstate is used to record the initial xstate value for convenience
and covers all the states. But it is wasteful to cover large states all
with trivial initial data.

Limit init_fpstate by clarifying its size and coverage, which are all but
dynamic user states. The dynamic states are assumed to be large but having
initial data with zeros.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Updated the changelog for clarification.
* Updated the code comments.
---
 arch/x86/include/asm/fpu/internal.h | 18 +++---
 arch/x86/include/asm/fpu/xstate.h   |  1 +
 arch/x86/kernel/fpu/core.c  |  4 ++--
 arch/x86/kernel/fpu/xstate.c|  4 ++--
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 37ea5e37f21c..bbdd304719c6 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -80,6 +80,18 @@ static __always_inline __pure bool use_fxsr(void)
 
 extern union fpregs_state init_fpstate;
 
+static inline u64 get_init_fpstate_mask(void)
+{
+   /* init_fpstate covers states in fpu->state. */
+   return (xfeatures_mask_all & ~xfeatures_mask_user_dynamic);
+}
+
+static inline unsigned int get_init_fpstate_size(void)
+{
+   /* fpu->state size is aligned with the init_fpstate size. */
+   return fpu_kernel_xstate_min_size;
+}
+
 extern void fpstate_init(struct fpu *fpu);
 #ifdef CONFIG_MATH_EMULATION
 extern void fpstate_init_soft(struct swregs_state *soft);
@@ -269,12 +281,12 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
 : "memory")
 
 /*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
+ * Use this function to dump the initial state, only during boot time when x86
+ * caps not set up and alternative not available yet.
  */
 static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
 {
-   u64 mask = xfeatures_mask_all;
+   u64 mask = get_init_fpstate_mask();
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 379e8f8b8440..62f6583f34fa 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -103,6 +103,7 @@ extern void __init update_regset_xstate_info(unsigned int 
size,
 u64 xstate_mask);
 
 void *get_xsave_addr(struct fpu *fpu, int xfeature_nr);
+unsigned int get_xstate_size(u64 mask);
 int alloc_xstate_buffer(struct fpu *fpu, u64 mask);
 void free_xstate_buffer(struct fpu *fpu);
 
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 6dafed34be4f..aad1a7102096 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -206,10 +206,10 @@ void fpstate_init(struct fpu *fpu)
return;
}
 
-   memset(state, 0, fpu_kernel_xstate_min_size);
+   memset(state, 0, fpu ? get_xstate_size(fpu->state_mask) : 
get_init_fpstate_size());
 
if (static_cpu_has(X86_FEATURE_XSAVES))
-   fpstate_init_xstate(&state->xsave, xfeatures_mask_all);
+   fpstate_init_xstate(&state->xsave, fpu ? fpu->state_mask : 
get_init_fpstate_mask());
if (static_cpu_has(X86_FEATURE_FXSR))
fpstate_init_fxstate(&state->fxsave);
else
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index af4d7d9aa977..43877005b4e2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -137,7 +137,7 @@ static bool xfeature_is_supervisor(int xfeature_nr)
  * Available once those arrays for the offset, size, and alignment info are 
set up,
  * by setup_xstate_features().
  */
-static unsigned int get_xstate_size(u64 mask)
+unsigned int get_xstate_size(u64 mask)
 {
unsigned int size;
u64 xmask;
@@ -511,7 +511,7 @@ static void __init setup_init_fpu_buf(void)
print_xstate_features();
 
if (boot_cpu_has(X86_FEATURE_XSAVES))
-   fpstate_init_xstate(&init_fpstate.xsave, xfeatures_mask_all);
+   fpstate_init_xstate(&init_fpstate.xsave, 
get_init_fpstate_mask());
 
/*
 * Init all the features state with header.xfeatures being 0x0
-- 
2.17.1



[PATCH v3 07/21] x86/fpu/xstate: Introduce helpers to manage dynamic xstate buffers

2020-12-23 Thread Chang S. Bae
The static per-task xstate buffer contains the extended register states --
but it is not expandable at runtime. Introduce runtime methods and new fpu
struct fields to support the expansion.

fpu->state_mask indicates the saved states per task and fpu->state_ptr
points to the dynamically allocated buffer.

alloc_xstate_buffer() uses vmalloc(). If use of this mechanism grows to
allocate buffers larger than 64KB, a more sophisticated allocation scheme
that includes purpose-built reclaim capability might be justified.

Introduce a new helper -- get_xstate_size() to calculate the buffer size.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Updated the changelog with task->fpu removed. (Boris Petkov)
* Replaced 'area' with 'buffer' in the comments and the changelog.
* Updated the code comments.

Changes from v1:
* Removed unneeded interrupt masking (Andy Lutomirski)
* Added vmalloc() error tracing (Dave Hansen, PeterZ, and Andy Lutomirski)
---
 arch/x86/include/asm/fpu/types.h  |  29 ++--
 arch/x86/include/asm/fpu/xstate.h |   3 +
 arch/x86/include/asm/trace/fpu.h  |   5 ++
 arch/x86/kernel/fpu/core.c|   3 +
 arch/x86/kernel/fpu/xstate.c  | 115 ++
 5 files changed, 150 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index f5a38a5f3ae1..3fc6dbbe3ede 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -336,14 +336,33 @@ struct fpu {
 */
unsigned long   avx512_timestamp;
 
+   /*
+* @state_mask:
+*
+* The state component bitmap. It indicates the saved xstate in
+* either @state or @state_ptr. The map value starts to be aligned
+* with @state and then with @state_ptr once it is in use.
+*/
+   u64 state_mask;
+
+   /*
+* @state_ptr:
+*
+* Copy of all extended register states, in a dynamically allocated
+* buffer. When a task is using extended features, the register state
+* is always the most current. This state copy is more recent than
+* @state. If the task context-switches away, they get saved here,
+* representing the xstate.
+*/
+   union fpregs_state  *state_ptr;
+
/*
 * @state:
 *
-* In-memory copy of all FPU registers that we save/restore
-* over context switches. If the task is using the FPU then
-* the registers in the FPU are more recent than this state
-* copy. If the task context-switches away then they get
-* saved here and represent the FPU state.
+* Copy of some extended register state. If a task uses a dynamically
+* allocated buffer, @state_ptr, then it has a more recent state copy
+* than this. This copy follows the same attributes as described for
+* @state_ptr.
 */
union fpregs_state  state;
/*
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 6ce8350672c2..379e8f8b8440 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -103,6 +103,9 @@ extern void __init update_regset_xstate_info(unsigned int 
size,
 u64 xstate_mask);
 
 void *get_xsave_addr(struct fpu *fpu, int xfeature_nr);
+int alloc_xstate_buffer(struct fpu *fpu, u64 mask);
+void free_xstate_buffer(struct fpu *fpu);
+
 const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
 int xfeature_size(int xfeature_nr);
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 879b77792f94..bf88b873 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -89,6 +89,11 @@ DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed,
TP_ARGS(fpu)
 );
 
+DEFINE_EVENT(x86_fpu, x86_fpu_xstate_alloc_failed,
+   TP_PROTO(struct fpu *fpu),
+   TP_ARGS(fpu)
+);
+
 #undef TRACE_INCLUDE_PATH
 #define TRACE_INCLUDE_PATH asm/trace/
 #undef TRACE_INCLUDE_FILE
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 1a428803e6b2..6dafed34be4f 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -235,6 +235,9 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
 */
memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_min_size);
 
+   dst_fpu->state_mask = xfeatures_mask_all & ~xfeatures_mask_user_dynamic;
+   dst_fpu->state_ptr = NULL;
+
/*
 * If the FPU registers are not current just memcpy() the state.
 * Otherwise save current FPU registers directly into the child's FPU
diff --git a/arch/x86/kernel/fpu/xstate.c

[PATCH v3 13/21] x86/fpu/xstate: Expand dynamic context switch buffer on first use

2020-12-23 Thread Chang S. Bae
Intel's Extended Feature Disable (XFD) feature is an extension of the XSAVE
architecture. XFD allows the kernel to enable a feature state in XCR0 and
to receive a #NM trap when a task uses instructions accessing that state.
In this way, Linux can defer allocating the large XSAVE buffer until tasks
need it.

XFD introduces two MSRs: IA32_XFD to enable/disable the feature and
IA32_XFD_ERR to assist the #NM trap handler. Both use the same
state-component bitmap format, used by XCR0.

Use this hardware capability to find the right time to expand the xstate
buffer. Introduce two sets of helper functions for that:

1. The first set is primarily for interacting with the XFD hardware:
xdisable_setbits()
xdisable_getbits()
xdisable_switch()

2. The second set is for managing the first-use status and handling #NM
   trap:
xfirstuse_enabled()
xfirstuse_not_detected()

The #NM handler induces the xstate buffer expansion to save the first-used
states.

If the standard (uncompacted) format is used in the kernel, the XSAVE
buffer has the maximum size already, and so XFD is not needed. The XFD
feature is enabled only when the compacted format is in use.

No functional change until the kernel enables dynamic user states and XFD.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Changed to enable XFD only when the compacted format is used.
* Updated the changelog with task->fpu removed. (Boris Petkov)

Changes from v1:
* Inlined the XFD-induced #NM handling code (Andy Lutomirski)
---
 arch/x86/include/asm/cpufeatures.h  |  1 +
 arch/x86/include/asm/fpu/internal.h | 51 -
 arch/x86/include/asm/msr-index.h|  2 ++
 arch/x86/kernel/cpu/cpuid-deps.c|  1 +
 arch/x86/kernel/fpu/xstate.c| 37 +++--
 arch/x86/kernel/process.c   |  5 +++
 arch/x86/kernel/process_32.c|  2 +-
 arch/x86/kernel/process_64.c|  2 +-
 arch/x86/kernel/traps.c | 40 ++
 9 files changed, 135 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index dad350d42ecf..5b6496ee3703 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -275,6 +275,7 @@
 #define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
 #define X86_FEATURE_XGETBV1(10*32+ 2) /* XGETBV with ECX = 1 
instruction */
 #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS 
instructions */
+#define X86_FEATURE_XFD(10*32+ 4) /* eXtended Feature 
Disabling */
 
 /*
  * Extended auxiliary flags: Linux defined - for features scattered in various
diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index d409a6ae0c38..5eba9a466249 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -573,11 +573,58 @@ static inline void switch_fpu_prepare(struct fpu 
*old_fpu, int cpu)
  * Misc helper functions:
  */
 
+/* The first-use detection helpers: */
+
+static inline void xdisable_setbits(u64 value)
+{
+   wrmsrl_safe(MSR_IA32_XFD, value);
+}
+
+static inline u64 xdisable_getbits(void)
+{
+   u64 value;
+
rdmsrl_safe(MSR_IA32_XFD, &value);
+   return value;
+}
+
+static inline u64 xfirstuse_enabled(void)
+{
+   /* All the dynamic user components are first-use enabled. */
+   return xfeatures_mask_user_dynamic;
+}
+
+/*
+ * Convert fpu->firstuse_bv to xdisable configuration in MSR IA32_XFD.
+ * xdisable_setbits() only uses this.
+ */
+static inline u64 xfirstuse_not_detected(struct fpu *fpu)
+{
+   u64 firstuse_bv = (fpu->state_mask & xfirstuse_enabled());
+
+   /*
+* If first-use is not detected, set the bit. If the detection is
+* not enabled, the bit is always zero in firstuse_bv. So, make
+* following conversion:
+*/
+   return  (xfirstuse_enabled() ^ firstuse_bv);
+}
+
+/* Update MSR IA32_XFD based on fpu->firstuse_bv */
+static inline void xdisable_switch(struct fpu *prev, struct fpu *next)
+{
+   if (!static_cpu_has(X86_FEATURE_XFD) || !xfirstuse_enabled())
+   return;
+
+   if (unlikely(prev->state_mask != next->state_mask))
+   xdisable_setbits(xfirstuse_not_detected(next));
+}
+
 /*
  * Load PKRU from the FPU context if available. Delay loading of the
  * complete FPU state until the return to userland.
  */
-static inline void switch_fpu_finish(struct fpu *new_fpu)
+static inline void switch_fpu_finish(struct fpu *old_fpu, struct fpu *new_fpu)
 {
u32 pkru_val = init_pkru_value;
struct pkru_state *pk;
@@ -587,6 +634,8 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
 
set_thread_flag(TIF_NEED_FPU_LOAD);
 
+   xdisable_switch(old_fpu, new_fpu);
+
if (!cpu_feature_ena

[PATCH v3 01/21] x86/fpu/xstate: Modify initialization helper to handle both static and dynamic buffers

2020-12-23 Thread Chang S. Bae
In preparation for dynamic xstate buffer expansion, update the buffer
initialization function parameters to equally handle static in-line xstate
buffer, as well as dynamically allocated xstate buffer.

init_fpstate is a special case, which is indicated by a null pointer
parameter to fpstate_init().

Also, fpstate_init_xstate() now accepts the state component bitmap to
configure XCOMP_BV for the compacted format.

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v2:
* Updated the changelog with task->fpu removed. (Boris Petkov)
---
 arch/x86/include/asm/fpu/internal.h |  6 +++---
 arch/x86/kernel/fpu/core.c  | 14 +++---
 arch/x86/kernel/fpu/init.c  |  2 +-
 arch/x86/kernel/fpu/regset.c|  2 +-
 arch/x86/kernel/fpu/xstate.c|  3 +--
 arch/x86/kvm/x86.c  |  2 +-
 6 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 8d33ad80704f..d81d8c407dc0 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -80,20 +80,20 @@ static __always_inline __pure bool use_fxsr(void)
 
 extern union fpregs_state init_fpstate;
 
-extern void fpstate_init(union fpregs_state *state);
+extern void fpstate_init(struct fpu *fpu);
 #ifdef CONFIG_MATH_EMULATION
 extern void fpstate_init_soft(struct swregs_state *soft);
 #else
 static inline void fpstate_init_soft(struct swregs_state *soft) {}
 #endif
 
-static inline void fpstate_init_xstate(struct xregs_state *xsave)
+static inline void fpstate_init_xstate(struct xregs_state *xsave, u64 
xcomp_mask)
 {
/*
 * XRSTORS requires these bits set in xcomp_bv, or it will
 * trigger #GP:
 */
-   xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
+   xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xcomp_mask;
 }
 
 static inline void fpstate_init_fxstate(struct fxregs_state *fx)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index eb86a2b831b1..f23e5ffbb307 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -191,8 +191,16 @@ static inline void fpstate_init_fstate(struct fregs_state 
*fp)
fp->fos = 0xu;
 }
 
-void fpstate_init(union fpregs_state *state)
+/* A null pointer parameter indicates init_fpstate. */
+void fpstate_init(struct fpu *fpu)
 {
+   union fpregs_state *state;
+
+   if (fpu)
+   state = &fpu->state;
+   else
+   state = &init_fpstate;
+
if (!static_cpu_has(X86_FEATURE_FPU)) {
fpstate_init_soft(&state->soft);
return;
@@ -201,7 +209,7 @@ void fpstate_init(union fpregs_state *state)
memset(state, 0, fpu_kernel_xstate_size);
 
if (static_cpu_has(X86_FEATURE_XSAVES))
-   fpstate_init_xstate(&state->xsave);
+   fpstate_init_xstate(&state->xsave, xfeatures_mask_all);
if (static_cpu_has(X86_FEATURE_FXSR))
fpstate_init_fxstate(&state->fxsave);
else
@@ -261,7 +269,7 @@ static void fpu__initialize(struct fpu *fpu)
WARN_ON_FPU(fpu != &current->thread.fpu);
 
set_thread_flag(TIF_NEED_FPU_LOAD);
-   fpstate_init(&fpu->state);
+   fpstate_init(fpu);
trace_x86_fpu_init_state(fpu);
 }
 
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 701f196d7c68..74e03e3bc20f 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -124,7 +124,7 @@ static void __init fpu__init_system_generic(void)
 * Set up the legacy init FPU context. (xstate init might overwrite this
 * with a more modern format, if the CPU supports it.)
 */
-   fpstate_init(_fpstate);
+   fpstate_init(NULL);
 
fpu__init_system_mxcsr();
 }
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c413756ba89f..4c4d9059ff36 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -144,7 +144,7 @@ int xstateregs_set(struct task_struct *target, const struct 
user_regset *regset,
 * In case of failure, mark all states as init:
 */
if (ret)
-   fpstate_init(&fpu->state);
+   fpstate_init(fpu);
 
return ret;
 }
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 5d8047441a0a..1a3e5effe0fa 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -457,8 +457,7 @@ static void __init setup_init_fpu_buf(void)
print_xstate_features();
 
if (boot_cpu_has(X86_FEATURE_XSAVES))
-   init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
-xfeatures_mask_all;
+   fpstate_init_xstate(_fpstate.xsave, xfeatures_mask_all);
 
/*
  

[PATCH v3 06/21] x86/fpu/xstate: Calculate and remember dynamic xstate buffer sizes

2020-12-23 Thread Chang S. Bae
The xstate buffer is currently in-line with a static size. To accommodate
dynamic user xstates, introduce variables to represent the maximum and
minimum buffer sizes.

do_extra_xstate_size_checks() calculates the maximum xstate size and sanity
checks it with CPUID. It calculates the static in-line buffer size by
excluding the dynamic user states from the maximum xstate size.

No functional change, until the kernel enables dynamic buffer support.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v2:
* Updated the changelog with task->fpu removed. (Boris Petkov)
* Renamed the in-line size variable.
* Updated some code comments.
---
 arch/x86/include/asm/processor.h | 10 +++
 arch/x86/kernel/fpu/core.c   |  6 ++---
 arch/x86/kernel/fpu/init.c   | 36 -
 arch/x86/kernel/fpu/signal.c |  2 +-
 arch/x86/kernel/fpu/xstate.c | 46 +---
 arch/x86/kernel/process.c|  6 +
 arch/x86/kvm/x86.c   |  2 +-
 7 files changed, 67 insertions(+), 41 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 82a08b585818..c9c608f8af91 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -477,7 +477,8 @@ DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
 #endif /* X86_64 */
 
-extern unsigned int fpu_kernel_xstate_size;
+extern unsigned int fpu_kernel_xstate_min_size;
+extern unsigned int fpu_kernel_xstate_max_size;
 extern unsigned int fpu_user_xstate_size;
 
 struct perf_event;
@@ -545,12 +546,7 @@ struct thread_struct {
 };
 
 /* Whitelist the FPU state from the task_struct for hardened usercopy. */
-static inline void arch_thread_struct_whitelist(unsigned long *offset,
-   unsigned long *size)
-{
-   *offset = offsetof(struct thread_struct, fpu.state);
-   *size = fpu_kernel_xstate_size;
-}
+extern void arch_thread_struct_whitelist(unsigned long *offset, unsigned long 
*size);
 
 /*
  * Thread-synchronous status.
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 20925cae2a84..1a428803e6b2 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -206,7 +206,7 @@ void fpstate_init(struct fpu *fpu)
return;
}
 
-   memset(state, 0, fpu_kernel_xstate_size);
+   memset(state, 0, fpu_kernel_xstate_min_size);
 
if (static_cpu_has(X86_FEATURE_XSAVES))
fpstate_init_xstate(>xsave, xfeatures_mask_all);
@@ -233,7 +233,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
 * Don't let 'init optimized' areas of the XSAVE area
 * leak into the child task:
 */
-   memset(_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+   memset(_fpu->state.xsave, 0, fpu_kernel_xstate_min_size);
 
/*
 * If the FPU registers are not current just memcpy() the state.
@@ -245,7 +245,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
 */
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
-   memcpy(_fpu->state, _fpu->state, 
fpu_kernel_xstate_size);
+   memcpy(_fpu->state, _fpu->state, 
fpu_kernel_xstate_min_size);
 
else if (!copy_fpregs_to_fpstate(dst_fpu))
copy_kernel_to_fpregs(dst_fpu);
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 74e03e3bc20f..5dac97158030 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -130,13 +130,20 @@ static void __init fpu__init_system_generic(void)
 }
 
 /*
- * Size of the FPU context state. All tasks in the system use the
- * same context size, regardless of what portion they use.
- * This is inherent to the XSAVE architecture which puts all state
- * components into a single, continuous memory block:
+ * Size of the minimally allocated FPU context state. All threads have this 
amount
+ * of xstate buffer at minimum.
+ *
+ * This buffer is inherent to the XSAVE architecture which puts all state 
components
+ * into a single, continuous memory block:
+ */
+unsigned int fpu_kernel_xstate_min_size;
+EXPORT_SYMBOL_GPL(fpu_kernel_xstate_min_size);
+
+/*
+ * Size of the maximum FPU context state. When using the compacted format, the 
buffer
+ * can be dynamically expanded to include some states up to this size.
  */
-unsigned int fpu_kernel_xstate_size;
-EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
+unsigned int fpu_kernel_xstate_max_size;
 
 /* Get alignment of the TYPE. */
 #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
@@ -167,8 +174,10 @@ static void __init fpu__init_task_struct_size(void)
/*
 * Add back the dynamically-calculated register state
 * size.
+*
+  

[PATCH v3 12/21] x86/fpu/xstate: Update xstate context copy function to support dynamic buffer

2020-12-23 Thread Chang S. Bae
ptrace() and signal return paths use xstate context copy functions. They
allow callers to read (or write) xstate values in the target's buffer. With
dynamic user states, a component's position in the buffer may vary and the
initial value is not always stored in init_fpstate.

Change the helpers to find a component's offset accordingly.

When copying an initial value, explicitly check the init_fpstate coverage.
If not found, reset the memory in the destination. Otherwise, copy values
from init_fpstate.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Updated the changelog with task->fpu removed. (Boris Petkov)
---
 arch/x86/kernel/fpu/xstate.c | 55 +++-
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 6b863b2ca405..1d7d0cce6cc5 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -248,12 +248,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
if (!(xfeatures & XFEATURE_MASK_SSE))
memset(>xmm_space[0], 0, 256);
 
+   /* Make sure 'xfeatures' to be a subset of fpu->state_mask */
+   xfeatures = ((xfeatures_mask_user() & fpu->state_mask) & ~xfeatures);
/*
 * First two features are FPU and SSE, which above we handled
 * in a special way already:
 */
feature_bit = 0x2;
-   xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2;
+   xfeatures >>= 0x2;
 
/*
 * Update all the remaining memory layouts according to their
@@ -262,12 +264,15 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 */
while (xfeatures) {
if (xfeatures & 0x1) {
-   int offset = xstate_comp_offsets[feature_bit];
-   int size = xstate_sizes[feature_bit];
-
-   memcpy((void *)xsave + offset,
-  (void *)_fpstate.xsave + offset,
-  size);
+   unsigned int offset = get_xstate_comp_offset(fpu, 
feature_bit);
+   unsigned int size = xstate_sizes[feature_bit];
+
+   if (get_init_fpstate_mask() & BIT_ULL(feature_bit))
+   memcpy((void *)xsave + offset,
+  (void *)_fpstate.xsave + offset,
+  size);
+   else
+   memset((void *)xsave + offset, 0, size);
}
 
xfeatures >>= 1;
@@ -1232,7 +1237,10 @@ static void fill_gap(struct membuf *to, unsigned *last, 
unsigned offset)
 {
if (*last >= offset)
return;
-   membuf_write(to, (void *)_fpstate.xsave + *last, offset - *last);
+   if (offset <= get_init_fpstate_size())
+   membuf_write(to, (void *)_fpstate.xsave + *last, offset - 
*last);
+   else
+   membuf_zero(to, offset - *last);
*last = offset;
 }
 
@@ -1240,7 +1248,10 @@ static void copy_part(struct membuf *to, unsigned *last, 
unsigned offset,
  unsigned size, void *from)
 {
fill_gap(to, last, offset);
-   membuf_write(to, from, size);
+   if (from)
+   membuf_write(to, from, size);
+   else
+   membuf_zero(to, size);
*last = offset + size;
 }
 
@@ -1292,12 +1303,22 @@ void copy_xstate_to_kernel(struct membuf to, struct fpu 
*fpu)
  sizeof(header), );
 
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+   u64 mask = BIT_ULL(i);
+   void *src;
/*
-* Copy only in-use xstates:
+* Copy only in-use xstate at first. If the feature is enabled,
+* find the init value, whether stored in init_fpstate or simply
+* zeros, and then copy them.
 */
-   if ((header.xfeatures >> i) & 1) {
-   void *src = __raw_xsave_addr(fpu, i);
-
+   if (header.xfeatures & mask) {
+   src = __raw_xsave_addr(fpu, i);
+   copy_part(, , xstate_offsets[i],
+ xstate_sizes[i], src);
+   } else if (xfeatures_mask_user() & mask) {
+   if (get_init_fpstate_mask() & mask)
+   src = (void *)_fpstate.xsave + last;
+   else
+   src = NULL;
copy_part(, , xstate_offsets[i],
  xstate_sizes[i], src);
}
@@ -1331,6 +1352,9 @@ int copy_kernel_to_xstate(struct fpu *fpu, const void 

[PATCH v3 04/21] x86/fpu/xstate: Modify context switch helpers to handle both static and dynamic buffers

2020-12-23 Thread Chang S. Bae
In preparation for dynamic xstate buffer expansion, update the xstate
restore function parameters to equally handle static in-line xstate buffer,
as well as dynamically allocated xstate buffer.

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v2:
* Updated the changelog with task->fpu removed. (Boris Petkov)
---
 arch/x86/include/asm/fpu/internal.h | 9 ++---
 arch/x86/kernel/fpu/core.c  | 4 ++--
 arch/x86/kernel/fpu/signal.c| 3 +--
 arch/x86/kvm/x86.c  | 2 +-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 0153c4d4ca77..37ea5e37f21c 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -397,8 +397,9 @@ static inline int copy_user_to_xregs(struct xregs_state 
__user *buf, u64 mask)
  * Restore xstate from kernel space xsave area, return an error code instead of
  * an exception.
  */
-static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 
mask)
+static inline int copy_kernel_to_xregs_err(struct fpu *fpu, u64 mask)
 {
+   struct xregs_state *xstate = >state.xsave;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
@@ -425,8 +426,10 @@ static inline void __copy_kernel_to_fpregs(union 
fpregs_state *fpstate, u64 mask
}
 }
 
-static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
+static inline void copy_kernel_to_fpregs(struct fpu *fpu)
 {
+   union fpregs_state *fpstate = >state;
+
/*
 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
 * pending. Clear the x87 state here by setting it to fixed values.
@@ -511,7 +514,7 @@ static inline void __fpregs_load_activate(void)
return;
 
if (!fpregs_state_valid(fpu, cpu)) {
-   copy_kernel_to_fpregs(>state);
+   copy_kernel_to_fpregs(fpu);
fpregs_activate(fpu);
fpu->last_cpu = cpu;
}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index f23e5ffbb307..20925cae2a84 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -172,7 +172,7 @@ void fpu__save(struct fpu *fpu)
 
if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
if (!copy_fpregs_to_fpstate(fpu)) {
-   copy_kernel_to_fpregs(>state);
+   copy_kernel_to_fpregs(fpu);
}
}
 
@@ -248,7 +248,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
memcpy(_fpu->state, _fpu->state, 
fpu_kernel_xstate_size);
 
else if (!copy_fpregs_to_fpstate(dst_fpu))
-   copy_kernel_to_fpregs(_fpu->state);
+   copy_kernel_to_fpregs(dst_fpu);
 
fpregs_unlock();
 
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 0d6deb75c507..414a13427934 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -426,8 +426,7 @@ static int __fpu__restore_sig(void __user *buf, void __user 
*buf_fx, int size)
 * Restore previously saved supervisor xstates along with
 * copied-in user xstates.
 */
-   ret = copy_kernel_to_xregs_err(>state.xsave,
-  user_xfeatures | 
xfeatures_mask_supervisor());
+   ret = copy_kernel_to_xregs_err(fpu, user_xfeatures | 
xfeatures_mask_supervisor());
 
} else if (use_fxsr()) {
ret = __copy_from_user(>state.fxsave, buf_fx, state_size);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 09368201d9cc..a087bbf252b6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9249,7 +9249,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 
kvm_save_current_fpu(vcpu->arch.guest_fpu);
 
-   copy_kernel_to_fpregs(>arch.user_fpu->state);
+   copy_kernel_to_fpregs(vcpu->arch.user_fpu);
 
fpregs_mark_activate();
fpregs_unlock();
-- 
2.17.1



[PATCH v3 05/21] x86/fpu/xstate: Add a new variable to indicate dynamic user states

2020-12-23 Thread Chang S. Bae
The perf subsystem has a buffer that is allocated on demand. The states saved
in that buffer were named 'dynamic' (supervisor) states, but the buffer is not
updated on every context switch.

The context switch buffer is being prepared to become dynamic for user states.
Change the wording to differentiate between these 'dynamic' states.

Add a new variable -- xfeatures_mask_user_dynamic to indicate the dynamic
user states, and rename some define and helper as related to the dynamic
supervisor states:
xfeatures_mask_supervisor_dynamic()
XFEATURE_MASK_SUPERVISOR_DYNAMIC

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Updated the changelog for clarification.
---
 arch/x86/include/asm/fpu/xstate.h | 12 +++-
 arch/x86/kernel/fpu/xstate.c  | 29 +++--
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 24bf8d3f559a..6ce8350672c2 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -56,7 +56,7 @@
  * - Don't set the bit corresponding to the dynamic supervisor feature in
  *   IA32_XSS at run time, since it has been set at boot time.
  */
-#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR)
+#define XFEATURE_MASK_SUPERVISOR_DYNAMIC (XFEATURE_MASK_LBR)
 
 /*
  * Unsupported supervisor features. When a supervisor feature in this mask is
@@ -66,7 +66,7 @@
 
 /* All supervisor states including supported and unsupported states. */
 #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
- XFEATURE_MASK_DYNAMIC | \
+ XFEATURE_MASK_SUPERVISOR_DYNAMIC | \
  XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)
 
 #ifdef CONFIG_X86_64
@@ -87,14 +87,16 @@ static inline u64 xfeatures_mask_user(void)
return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED;
 }
 
-static inline u64 xfeatures_mask_dynamic(void)
+static inline u64 xfeatures_mask_supervisor_dynamic(void)
 {
if (!boot_cpu_has(X86_FEATURE_ARCH_LBR))
-   return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR;
+   return XFEATURE_MASK_SUPERVISOR_DYNAMIC & ~XFEATURE_MASK_LBR;
 
-   return XFEATURE_MASK_DYNAMIC;
+   return XFEATURE_MASK_SUPERVISOR_DYNAMIC;
 }
 
+extern u64 xfeatures_mask_user_dynamic;
+
 extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
 extern void __init update_regset_xstate_info(unsigned int size,
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 2010c31d25e1..6620d0a3caff 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -61,6 +61,12 @@ static short xsave_cpuid_features[] __initdata = {
  */
 u64 xfeatures_mask_all __read_mostly;
 
+/*
+ * This represents user xstates, a subset of xfeatures_mask_all, saved in a
+ * dynamic kernel XSAVE buffer.
+ */
+u64 xfeatures_mask_user_dynamic __read_mostly;
+
 static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] 
= -1};
 static unsigned int xstate_sizes[XFEATURE_MAX]   = { [ 0 ... XFEATURE_MAX - 1] 
= -1};
 static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX 
- 1] = -1};
@@ -237,7 +243,7 @@ void fpu__init_cpu_xstate(void)
 */
if (boot_cpu_has(X86_FEATURE_XSAVES)) {
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
-xfeatures_mask_dynamic());
+xfeatures_mask_supervisor_dynamic());
}
 }
 
@@ -686,7 +692,7 @@ static unsigned int __init get_xsaves_size(void)
  */
 static unsigned int __init get_xsaves_size_no_dynamic(void)
 {
-   u64 mask = xfeatures_mask_dynamic();
+   u64 mask = xfeatures_mask_supervisor_dynamic();
unsigned int size;
 
if (!mask)
@@ -773,6 +779,7 @@ static int __init init_xstate_size(void)
 static void fpu__init_disable_system_xstate(void)
 {
xfeatures_mask_all = 0;
+   xfeatures_mask_user_dynamic = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 }
@@ -839,6 +846,8 @@ void __init fpu__init_system_xstate(void)
}
 
xfeatures_mask_all &= fpu__get_supported_xfeatures_mask();
+   /* Do not support the dynamically allocated buffer yet. */
+   xfeatures_mask_user_dynamic = 0;
 
/* Enable xstate instructions to be able to continue with 
initialization: */
fpu__init_cpu_xstate();
@@ -886,7 +895,7 @@ void fpu__resume_cpu(void)
 */
if (boot_cpu_has(X86_FEATURE_XSAVES)) {
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
-xfeatures_mask_dynamic());
+xfeatures_mask_supervisor_dynamic());
}
 }
 
@@ -1321,8

[PATCH v3 11/21] x86/fpu/xstate: Update xstate buffer address finder to support dynamic xstate

2020-12-23 Thread Chang S. Bae
__raw_xsave_addr() returns the requested component's pointer in an xstate
buffer, by simply looking up the offset table. The offset used to be fixed,
but, with dynamic user states, it becomes variable.

get_xstate_size() has a routine to find an offset at runtime. Refactor to
use it for the address finder.

No functional change until the kernel enables dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/kernel/fpu/xstate.c | 82 +++-
 1 file changed, 52 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 8dfbc7d1702a..6b863b2ca405 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -133,15 +133,50 @@ static bool xfeature_is_supervisor(int xfeature_nr)
return ecx & 1;
 }
 
+/*
+ * Available once those arrays for the offset, size, and alignment info are 
set up,
+ * by setup_xstate_features().
+ */
+static unsigned int __get_xstate_comp_offset(u64 mask, int feature_nr)
+{
+   u64 xmask = BIT_ULL(feature_nr + 1) - 1;
+   unsigned int next_offset, offset = 0;
+   int i;
+
+   if ((mask & xmask) == (xfeatures_mask_all & xmask))
+   return xstate_comp_offsets[feature_nr];
+
+   /*
+* Calculate the size by summing up each state together, since no known
+* offset found with the xstate buffer format out of the given mask.
+*/
+
+   next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+   for (i = FIRST_EXTENDED_XFEATURE; i <= feature_nr; i++) {
+   if (!(mask & BIT_ULL(i)))
+   continue;
+
+   offset = xstate_aligns[i] ? ALIGN(next_offset, 64) : 
next_offset;
+   next_offset += xstate_sizes[i];
+   }
+
+   return offset;
+}
+
+static unsigned int get_xstate_comp_offset(struct fpu *fpu, int feature_nr)
+{
+   return __get_xstate_comp_offset(fpu->state_mask, feature_nr);
+}
+
 /*
  * Available once those arrays for the offset, size, and alignment info are 
set up,
  * by setup_xstate_features().
  */
 unsigned int get_xstate_size(u64 mask)
 {
-   unsigned int size;
-   u64 xmask;
-   int i, nr;
+   unsigned int offset;
+   int nr;
 
if (!mask)
return 0;
@@ -155,24 +190,8 @@ unsigned int get_xstate_size(u64 mask)
if (!using_compacted_format())
return xstate_offsets[nr] + xstate_sizes[nr];
 
-   xmask = BIT_ULL(nr + 1) - 1;
-
-   if (mask == (xmask & xfeatures_mask_all))
-   return xstate_comp_offsets[nr] + xstate_sizes[nr];
-
-   /*
-* Calculate the size by summing up each state together, since no known
-* size found with the xstate buffer format out of the given mask.
-*/
-   for (size = FXSAVE_SIZE + XSAVE_HDR_SIZE, i = FIRST_EXTENDED_XFEATURE; 
i <= nr; i++) {
-   if (!(mask & BIT_ULL(i)))
-   continue;
-
-   if (xstate_aligns[i])
-   size = ALIGN(size, 64);
-   size += xstate_sizes[i];
-   }
-   return size;
+   offset = __get_xstate_comp_offset(mask, nr);
+   return offset + xstate_sizes[nr];
 }
 
 /*
@@ -988,17 +1007,20 @@ static void *__raw_xsave_addr(struct fpu *fpu, int 
xfeature_nr)
 {
void *xsave;
 
-   if (!xfeature_enabled(xfeature_nr)) {
-   WARN_ON_FPU(1);
-   return NULL;
-   }
-
-   if (fpu)
-   xsave = __xsave(fpu);
-   else
+   if (!xfeature_enabled(xfeature_nr))
+   goto not_found;
+   else if (!fpu)
xsave = _fpstate.xsave;
+   else if (!(fpu->state_mask & BIT_ULL(xfeature_nr)))
+   goto not_found;
+   else
+   xsave = __xsave(fpu);
+
+   return xsave + get_xstate_comp_offset(fpu, xfeature_nr);
 
-   return xsave + xstate_comp_offsets[xfeature_nr];
+not_found:
+   WARN_ON_FPU(1);
+   return NULL;
 }
 /*
  * Given the xsave area and a state inside, this function returns the
-- 
2.17.1



[PATCH v3 00/21] x86: Support Intel Advanced Matrix Extensions

2020-12-23 Thread Chang S. Bae
Intel Advanced Matrix Extensions (AMX)[1][2] will be shipping on servers
soon.  AMX consists of configurable TMM "TILE" registers plus new
accelerator instructions that operate on them.  TMUL (Tile matrix MULtiply)
is the first accelerator instruction set to use the new registers, and we
anticipate additional instructions in the future.

Neither AMX state nor TMUL instructions depend on AVX.  However, AMX and
AVX do share common challenges.  The TMM registers are 8KB today, and
architecturally as large as 64KB, which merits updates to hardware and
software state management.

Further, both technologies run faster when they are not simultaneously
running on SMT siblings, and both technologies' use of power and bandwidth
impacts the power and performance available to neighboring cores.  (This
impact has measurably improved in recent hardware.)

If the existing kernel approach for managing XSAVE state was employed to
handle AMX, 8KB space would be added to every task, but possibly rarely
used.  So Linux support is optimized by using a new XSAVE feature: eXtended
Feature Disabling (XFD).  The kernel arms XFD to provide a #NM exception
upon a task's first access to TILE state. The kernel exception handler
installs the appropriate XSAVE context switch buffer, and the task behaves
as if the kernel had done that for all tasks.  Using XFD, AMX space is
allocated only when needed, eliminating the memory waste for unused state
components.

This series requires the new minimum sigaltstack support [3] and is based
on the mainline. The series is composed of three parts:
* Patch 01-14: Foundation to support dynamic user state management
* Patch 15-19: AMX enablement, including unit tests
* Patch 20-21: Signal handling optimization and new boot-parameters

Thanks to Len Brown and Dave Hansen for help with the cover letter.

Changes from v2 [5]:
* Removed the patch for the tile data inheritance. Also, updated the
  selftest patch. (Andy Lutomirski)
* Changed to taint the kernel when any unknown state is enabled. (Andy
  Lutomirski)
* Changed to use the XFD feature only when the compacted format in use.
* Improved the test code.
* Simplified the cmdline handling.
* Removed 'task->fpu' in changelogs. (Boris Petkov)
* Updated the variable name / comments / changelogs for clarification.

Changes from v1 [4]:
* Added vmalloc() error tracing (Dave Hansen, PeterZ, and Andy Lutomirski)
* Inlined the #NM handling code (Andy Lutomirski)
* Made signal handling optimization revertible
* Revised the new parameter handling code (Andy Lutomirski and Dave Hansen)
* Rebased on the upstream kernel

[1]: Intel Architecture Instruction Set Extension Programming Reference
October 2020, 
https://software.intel.com/content/dam/develop/external/us/en/documents/architecture-instruction-set-extensions-programming-reference.pdf
[2]: 
https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-matrix-extensions-intel-amx-instructions.html
[3]: 
https://lore.kernel.org/lkml/20201223015312.4882-1-chang.seok@intel.com/
[4]: 
https://lore.kernel.org/lkml/20201001203913.9125-1-chang.seok@intel.com/
[5]: 
https://lore.kernel.org/lkml/20201119233257.2939-1-chang.seok@intel.com/

Chang S. Bae (21):
  x86/fpu/xstate: Modify initialization helper to handle both static and
dynamic buffers
  x86/fpu/xstate: Modify state copy helpers to handle both static and
dynamic buffers
  x86/fpu/xstate: Modify address finders to handle both static and
dynamic buffers
  x86/fpu/xstate: Modify context switch helpers to handle both static
and dynamic buffers
  x86/fpu/xstate: Add a new variable to indicate dynamic user states
  x86/fpu/xstate: Calculate and remember dynamic xstate buffer sizes
  x86/fpu/xstate: Introduce helpers to manage dynamic xstate buffers
  x86/fpu/xstate: Define the scope of the initial xstate data
  x86/fpu/xstate: Introduce wrapper functions to organize xstate buffer
access
  x86/fpu/xstate: Update xstate save function to support dynamic xstate
  x86/fpu/xstate: Update xstate buffer address finder to support dynamic
xstate
  x86/fpu/xstate: Update xstate context copy function to support dynamic
buffer
  x86/fpu/xstate: Expand dynamic context switch buffer on first use
  x86/fpu/xstate: Support ptracer-induced xstate buffer expansion
  x86/fpu/xstate: Extend the table to map xstate components with
features
  x86/cpufeatures/amx: Enumerate Advanced Matrix Extension (AMX) feature
bits
  x86/fpu/amx: Define AMX state components and have it used for
boot-time checks
  x86/fpu/amx: Enable the AMX feature in 64-bit mode
  selftest/x86/amx: Include test cases for the AMX state management
  x86/fpu/xstate: Support dynamic user state in the signal handling path
  x86/fpu/xstate: Introduce boot-parameters to control some state
component support

 .../admin-guide/kernel-p

[PATCH v3 02/21] x86/fpu/xstate: Modify state copy helpers to handle both static and dynamic buffers

2020-12-23 Thread Chang S. Bae
In preparation for dynamic xstate buffer expansion, update the xstate
copy function parameters to equally handle static in-line buffer, as well
as dynamically allocated xstate buffer.

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Updated the changelog with task->fpu removed. (Boris Petkov)
---
 arch/x86/include/asm/fpu/xstate.h |  8 
 arch/x86/kernel/fpu/regset.c  |  6 +++---
 arch/x86/kernel/fpu/signal.c  | 16 +++-
 arch/x86/kernel/fpu/xstate.c  | 19 +++
 4 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 47a92232d595..e0f1b22f53ce 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -105,10 +105,10 @@ const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
 int xfeature_size(int xfeature_nr);
 struct membuf;
-void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave);
-int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
-int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
-void copy_supervisor_to_kernel(struct xregs_state *xsave);
+void copy_xstate_to_kernel(struct membuf to, struct fpu *fpu);
+int copy_kernel_to_xstate(struct fpu *fpu, const void *kbuf);
+int copy_user_to_xstate(struct fpu *fpu, const void __user *ubuf);
+void copy_supervisor_to_kernel(struct fpu *fpu);
 void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask);
 void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask);
 
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 4c4d9059ff36..5e13e58d11d4 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -85,7 +85,7 @@ int xstateregs_get(struct task_struct *target, const struct 
user_regset *regset,
fpu__prepare_read(fpu);
 
if (using_compacted_format()) {
-   copy_xstate_to_kernel(to, xsave);
+   copy_xstate_to_kernel(to, fpu);
return 0;
} else {
fpstate_sanitize_xstate(fpu);
@@ -126,9 +126,9 @@ int xstateregs_set(struct task_struct *target, const struct 
user_regset *regset,
 
if (using_compacted_format()) {
if (kbuf)
-   ret = copy_kernel_to_xstate(xsave, kbuf);
+   ret = copy_kernel_to_xstate(fpu, kbuf);
else
-   ret = copy_user_to_xstate(xsave, ubuf);
+   ret = copy_user_to_xstate(fpu, ubuf);
} else {
ret = user_regset_copyin(, , , , xsave, 0, 
-1);
if (!ret)
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..0d6deb75c507 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -212,11 +212,11 @@ int copy_fpstate_to_sigframe(void __user *buf, void 
__user *buf_fx, int size)
 }
 
 static inline void
-sanitize_restored_user_xstate(union fpregs_state *state,
+sanitize_restored_user_xstate(struct fpu *fpu,
  struct user_i387_ia32_struct *ia32_env,
  u64 user_xfeatures, int fx_only)
 {
-   struct xregs_state *xsave = >xsave;
+   struct xregs_state *xsave = >state.xsave;
struct xstate_header *header = >header;
 
if (use_xsave()) {
@@ -253,7 +253,7 @@ sanitize_restored_user_xstate(union fpregs_state *state,
xsave->i387.mxcsr &= mxcsr_feature_mask;
 
if (ia32_env)
-   convert_to_fxsr(>fxsave, ia32_env);
+   convert_to_fxsr(>state.fxsave, ia32_env);
}
 }
 
@@ -396,7 +396,7 @@ static int __fpu__restore_sig(void __user *buf, void __user 
*buf_fx, int size)
 * current supervisor states first and invalidate the FPU regs.
 */
if (xfeatures_mask_supervisor())
-   copy_supervisor_to_kernel(>state.xsave);
+   copy_supervisor_to_kernel(fpu);
set_thread_flag(TIF_NEED_FPU_LOAD);
}
__fpu_invalidate_fpregs_state(fpu);
@@ -406,7 +406,7 @@ static int __fpu__restore_sig(void __user *buf, void __user 
*buf_fx, int size)
u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
 
if (using_compacted_format()) {
-   ret = copy_user_to_xstate(>state.xsave, buf_fx);
+   ret = copy_user_to_xstate(fpu, buf_fx);
} else {
ret = __copy_from_user(>state.xsave, buf_fx, 
state_size);
 
@@ -416,8 +416,7 @@ static int __fpu__restore_sig(void __user *buf, void __user 
*buf_fx, int size)
if (ret

[PATCH v3 03/21] x86/fpu/xstate: Modify address finders to handle both static and dynamic buffers

2020-12-23 Thread Chang S. Bae
In preparation for dynamic xstate buffer expansion, update the buffer
address finder function parameters to equally handle static in-line xstate
buffer, as well as dynamically allocated xstate buffer.

init_fpstate is a special case, which is indicated by a null pointer
parameter to get_xsave_addr() and __raw_xsave_addr().

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v2:
* Updated the changelog with task->fpu removed. (Boris Petkov)

Changes from v1:
* Rebased on the upstream kernel (5.10)
---
 arch/x86/include/asm/fpu/internal.h |  2 +-
 arch/x86/include/asm/fpu/xstate.h   |  2 +-
 arch/x86/include/asm/pgtable.h  |  2 +-
 arch/x86/kernel/cpu/common.c|  2 +-
 arch/x86/kernel/fpu/xstate.c| 50 +++--
 arch/x86/kvm/x86.c  | 26 +--
 arch/x86/mm/pkeys.c |  2 +-
 7 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index d81d8c407dc0..0153c4d4ca77 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -579,7 +579,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
 * return to userland e.g. for a copy_to_user() operation.
 */
if (current->mm) {
-   pk = get_xsave_addr(_fpu->state.xsave, XFEATURE_PKRU);
+   pk = get_xsave_addr(new_fpu, XFEATURE_PKRU);
if (pk)
pkru_val = pk->pkru;
}
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index e0f1b22f53ce..24bf8d3f559a 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -100,7 +100,7 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 extern void __init update_regset_xstate_info(unsigned int size,
 u64 xstate_mask);
 
-void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+void *get_xsave_addr(struct fpu *fpu, int xfeature_nr);
 const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
 int xfeature_size(int xfeature_nr);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a02c67291cfc..83268b41444f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -141,7 +141,7 @@ static inline void write_pkru(u32 pkru)
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return;
 
-   pk = get_xsave_addr(>thread.fpu.state.xsave, XFEATURE_PKRU);
+   pk = get_xsave_addr(>thread.fpu, XFEATURE_PKRU);
 
/*
 * The PKRU value in xstate needs to be in sync with the value that is
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..860b19db208b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -478,7 +478,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c)
return;
 
cr4_set_bits(X86_CR4_PKE);
-   pk = get_xsave_addr(_fpstate.xsave, XFEATURE_PKRU);
+   pk = get_xsave_addr(NULL, XFEATURE_PKRU);
if (pk)
pk->pkru = init_pkru_value;
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 6156dad0feb6..2010c31d25e1 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -894,15 +894,24 @@ void fpu__resume_cpu(void)
  * Given an xstate feature nr, calculate where in the xsave
  * buffer the state is.  Callers should ensure that the buffer
  * is valid.
+ *
+ * A null pointer parameter indicates to use init_fpstate.
  */
-static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
+static void *__raw_xsave_addr(struct fpu *fpu, int xfeature_nr)
 {
+   void *xsave;
+
if (!xfeature_enabled(xfeature_nr)) {
WARN_ON_FPU(1);
return NULL;
}
 
-   return (void *)xsave + xstate_comp_offsets[xfeature_nr];
+   if (fpu)
+   xsave = >state.xsave;
+   else
+   xsave = _fpstate.xsave;
+
+   return xsave + xstate_comp_offsets[xfeature_nr];
 }
 /*
  * Given the xsave area and a state inside, this function returns the
@@ -915,15 +924,18 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, 
int xfeature_nr)
  * this will return NULL.
  *
  * Inputs:
- * xstate: the thread's storage area for all FPU data
+ * fpu: the thread's FPU data to reference xstate buffer(s).
+ *  (A null pointer parameter indicates init_fpstate.)
  * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
  * XFEATURE_SSE, etc...)
  * Output:
  * address of the state in the xsave area, or NULL if the
  * field is not present in the xsave buffer.
  */
-void *get_xsave_addr(struct

[PATCH v3 10/21] x86/fpu/xstate: Update xstate save function to support dynamic xstate

2020-12-23 Thread Chang S. Bae
copy_xregs_to_kernel() used to save all user states in a kernel buffer.
When the dynamic user state is enabled, it becomes conditional which state
to be saved.

fpu->state_mask can indicate which state components are reserved to be
saved in XSAVE buffer. Use it as XSAVE's instruction mask to select states.

KVM used to save all xstate via copy_xregs_to_kernel(). Update KVM to set a
valid fpu->state_mask, which will be necessary to correctly handle dynamic
state buffers.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
Changes from v2:
* Updated the changelog to clarify the KVM code changes.
---
 arch/x86/include/asm/fpu/internal.h |  3 +--
 arch/x86/kernel/fpu/core.c  |  2 +-
 arch/x86/kvm/x86.c  | 11 ---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 67ffd1d7c95e..d409a6ae0c38 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -332,9 +332,8 @@ static inline void copy_kernel_to_xregs_booting(struct 
xregs_state *xstate)
 /*
  * Save processor xstate to xsave area.
  */
-static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
+static inline void copy_xregs_to_kernel(struct xregs_state *xstate, u64 mask)
 {
-   u64 mask = xfeatures_mask_all;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 8b9d3ec9ac46..5a12e4b22db2 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -99,7 +99,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu)
if (likely(use_xsave())) {
struct xregs_state *xsave = >xsave;
 
-   copy_xregs_to_kernel(xsave);
+   copy_xregs_to_kernel(xsave, fpu->state_mask);
 
/*
 * AVX512 state is tracked here because its use is
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4aecfba04bd3..93b5bacad67a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9214,15 +9214,20 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 
 static void kvm_save_current_fpu(struct fpu *fpu)
 {
+   struct fpu *src_fpu = >thread.fpu;
+
/*
 * If the target FPU state is not resident in the CPU registers, just
 * memcpy() from current, else save CPU state directly to the target.
 */
-   if (test_thread_flag(TIF_NEED_FPU_LOAD))
-   memcpy(>state, >thread.fpu.state,
+   if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
+   memcpy(>state, _fpu->state,
   fpu_kernel_xstate_min_size);
-   else
+   } else {
+   if (fpu->state_mask != src_fpu->state_mask)
+   fpu->state_mask = src_fpu->state_mask;
copy_fpregs_to_fpstate(fpu);
+   }
 }
 
 /* Swap (qemu) user FPU context for the guest FPU context. */
-- 
2.17.1



[PATCH v3 20/21] x86/fpu/xstate: Support dynamic user state in the signal handling path

2020-12-23 Thread Chang S. Bae
On entering a signal handler, the kernel saves the XSAVE buffer. The dynamic
user state should be saved only when it is in use; fpu->state_mask can help
to exclude unused states.

Returning from a signal handler, XRSTOR re-initializes the excluded state
components.

Add a test case to verify in the signal handler that the signal frame
excludes AMX data when the signaled thread has initialized AMX state.

No functional change until the kernel supports the dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-kselft...@vger.kernel.org
---
Changes from v1:
* Made it revertible (moved close to the end of the series).
* Included the test case.
---
 arch/x86/include/asm/fpu/internal.h |  2 +-
 tools/testing/selftests/x86/amx.c   | 66 +
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 5eba9a466249..202874bb79da 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -369,7 +369,7 @@ static inline void copy_kernel_to_xregs(struct xregs_state 
*xstate, u64 mask)
  */
 static inline int copy_xregs_to_user(struct xregs_state __user *buf)
 {
-   u64 mask = xfeatures_mask_user();
+   u64 mask = current->thread.fpu.state_mask;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
diff --git a/tools/testing/selftests/x86/amx.c 
b/tools/testing/selftests/x86/amx.c
index f4ecdfd27ae9..a7386b886532 100644
--- a/tools/testing/selftests/x86/amx.c
+++ b/tools/testing/selftests/x86/amx.c
@@ -650,6 +650,71 @@ static void test_ptrace(void)
test_tile_state_write(ptracee_loads_tiles);
 }
 
+/* Signal handling test */
+
+static int sigtrapped;
+struct tile_data sig_tiles, sighdl_tiles;
+
+static void handle_sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+   ucontext_t *uctxt = (ucontext_t *)ctx_void;
+   struct xsave_data xdata;
+   struct tile_config cfg;
+   struct tile_data tiles;
+   u64 header;
+
+   header = __get_xsave_xstate_bv((void *)uctxt->uc_mcontext.fpregs);
+
+   if (header & (1 << XFEATURE_XTILE_DATA))
+   printf("[FAIL]\ttile data was written in sigframe\n");
+   else
+   printf("[OK]\ttile data was skipped in sigframe\n");
+
+   set_tilecfg();
+   load_tilecfg();
+   init_xdata();
+
+   make_tiles();
+   copy_tiles_to_xdata(, );
+   restore_xdata();
+
+   save_xdata();
+   if (compare_xdata_tiles(, ))
+   err(1, "tile load file");
+
+   printf("\tsignal handler: load tile data\n");
+
+   sigtrapped = sig;
+}
+
+static void test_signal_handling(void)
+{
+   struct xsave_data xdata = { 0 };
+   struct tile_data tiles = { 0 };
+
+   sethandler(SIGTRAP, handle_sigtrap, 0);
+   sigtrapped = 0;
+
+   printf("[RUN]\tCheck tile state management in handling signal\n");
+
+   printf("\tbefore signal: initial tile data state\n");
+
+   raise(SIGTRAP);
+
+   if (sigtrapped == 0)
+   err(1, "sigtrap");
+
+   save_xdata();
+   if (compare_xdata_tiles(, )) {
+   printf("[FAIL]\ttile data was not loaded at sigreturn\n");
+   nerrs++;
+   } else {
+   printf("[OK]\ttile data was re-initialized at sigreturn\n");
+   }
+
+   clearhandler(SIGTRAP);
+}
+
 int main(void)
 {
/* Check hardware availability at first */
@@ -672,6 +737,7 @@ int main(void)
test_fork();
test_context_switch();
test_ptrace();
+   test_signal_handling();
 
return nerrs ? 1 : 0;
 }
-- 
2.17.1



[PATCH v3 4/4] selftest/x86/signal: Include test cases for validating sigaltstack

2020-12-22 Thread Chang S. Bae
The test measures the kernel's signal delivery with different (enough vs.
insufficient) stack sizes.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: Borislav Petkov 
Cc: x...@kernel.org
Cc: linux-kselft...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Revised test messages (Borislav Petkov)
---
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/sigaltstack.c | 128 ++
 2 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/sigaltstack.c

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index 6703c7906b71..e0c52e5ab49e 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -13,7 +13,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) 
trivial_program.c -no-pie)
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt 
test_mremap_vdso \
check_initial_reg_state sigreturn iopl ioperm \
test_vdso test_vsyscall mov_ss_trap \
-   syscall_arg_fault fsgsbase_restore
+   syscall_arg_fault fsgsbase_restore sigaltstack
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
diff --git a/tools/testing/selftests/x86/sigaltstack.c 
b/tools/testing/selftests/x86/sigaltstack.c
new file mode 100644
index ..e2cbf09723c8
--- /dev/null
+++ b/tools/testing/selftests/x86/sigaltstack.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* sigaltstack()-enforced minimum stack */
+#define ENFORCED_MINSIGSTKSZ   2048
+
+#ifndef AT_MINSIGSTKSZ
+#  define AT_MINSIGSTKSZ   51
+#endif
+
+static int nerrs;
+
+static bool sigalrm_expected;
+
+static unsigned long at_minstack_size;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+  int flags)
+{
+   struct sigaction sa;
+
+   memset(, 0, sizeof(sa));
+   sa.sa_sigaction = handler;
+   sa.sa_flags = SA_SIGINFO | flags;
+   sigemptyset(_mask);
+   if (sigaction(sig, , 0))
+   err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+   struct sigaction sa;
+
+   memset(, 0, sizeof(sa));
+   sa.sa_handler = SIG_DFL;
+   sigemptyset(_mask);
+   if (sigaction(sig, , 0))
+   err(1, "sigaction");
+}
+
+static int setup_altstack(void *start, unsigned long size)
+{
+   stack_t ss;
+
+   memset(, 0, sizeof(ss));
+   ss.ss_size = size;
+   ss.ss_sp = start;
+
+   return sigaltstack(, NULL);
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+   if (sigalrm_expected) {
+   printf("[FAIL]\tSIGSEGV signal delivery is wrong.\n");
+   nerrs++;
+   } else {
+   printf("[OK]\tSIGSEGV signal is delivered.\n");
+   }
+
+   siglongjmp(jmpbuf, 1);
+}
+
+static void sigalrm(int sig, siginfo_t *info, void *ctx_void)
+{
+   if (!sigalrm_expected) {
+   printf("[FAIL]\tSIGALRM sigal delivery is wrong.\n");
+   nerrs++;
+   } else {
+   printf("[OK]\tSIGALRM signal is delivered.\n");
+   }
+}
+
+static void test_sigaltstack(void *altstack, unsigned long size)
+{
+   if (setup_altstack(altstack, size))
+   err(1, "sigaltstack()");
+
+   sigalrm_expected = (size > at_minstack_size) ? true : false;
+
+   sethandler(SIGSEGV, sigsegv, 0);
+   sethandler(SIGALRM, sigalrm, SA_ONSTACK);
+
+   if (sigsetjmp(jmpbuf, 1) == 0) {
+   printf("[RUN]\tTest an (%s) alternate signal stack\n",
+  sigalrm_expected ? "enough" : "too-small");
+   printf("\tRaise SIGALRM. %s is expected to be delivered.\n",
+  sigalrm_expected ? "It" : "But SIGSEGV");
+   raise(SIGALRM);
+   }
+
+   clearhandler(SIGALRM);
+   clearhandler(SIGSEGV);
+}
+
+int main(void)
+{
+   void *altstack;
+
+   at_minstack_size = getauxval(AT_MINSIGSTKSZ);
+
+   altstack = mmap(NULL, at_minstack_size + SIGSTKSZ, PROT_READ | 
PROT_WRITE,
+   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+   if (altstack == MAP_FAILED)
+   err(1, "mmap()");
+
+   if ((ENFORCED_MINSIGSTKSZ + 1) < at_minstack_size)
+   test_sigaltstack(altstack, ENFORCED_MINSIGSTKSZ + 1);
+
+   test_sigaltstack(altstack, at_minstack_size + SIGSTKSZ);
+
+   return nerrs == 0 ? 0 : 1;
+}
-- 
2.17.1



[PATCH v3 3/4] x86/signal: Prevent an alternate stack overflow before a signal delivery

2020-12-22 Thread Chang S. Bae
The kernel pushes data on the userspace stack when entering a signal. If
using a sigaltstack(), the kernel precisely knows the user stack size.

When the kernel knows that the user stack is too small, avoid the overflow
and do an immediate SIGSEGV instead.

This overflow is known to occur on systems with large XSAVE state. The
effort to increase the size typically used for altstacks reduces the
frequency of these overflows, but this approach is still useful for legacy
binaries.

Suggested-by: Jann Horn 
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: Jann Horn 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Simplified the implementation (Jann Horn)
---
 arch/x86/kernel/signal.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 761d856f8ef7..91056a940271 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -242,7 +242,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
-   int onsigstack = on_sig_stack(sp);
+   bool onsigstack = on_sig_stack(sp);
int ret;
 
/* redzone */
@@ -251,8 +251,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 
size_t frame_size,
 
/* This is the X/Open sanctioned signal stack switching.  */
if (ka->sa.sa_flags & SA_ONSTACK) {
-   if (sas_ss_flags(sp) == 0)
+   if (sas_ss_flags(sp) == 0) {
sp = current->sas_ss_sp + current->sas_ss_size;
+   /* On the alternate signal stack */
+   onsigstack = true;
+   }
} else if (IS_ENABLED(CONFIG_X86_32) &&
   !onsigstack &&
   regs->ss != __USER_DS &&
-- 
2.17.1



[PATCH v3 2/4] x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ

2020-12-22 Thread Chang S. Bae
Historically, signal.h defines MINSIGSTKSZ (2KB) and SIGSTKSZ (8KB), for
use by all architectures with sigaltstack(2). Over time, the hardware state
size grew, but these constants did not evolve. Today, literal use of these
constants on several architectures may result in signal stack overflow, and
thus user data corruption.

A few years ago, the ARM team addressed this issue by establishing
getauxval(AT_MINSIGSTKSZ), such that the kernel can supply at runtime a
value that is an appropriate replacement on current and future hardware.

Add getauxval(AT_MINSIGSTKSZ) support to x86, analogous to the support
added for ARM in commit 94b07c1f8c39 ("arm64: signal: Report signal frame
size to userspace via auxv").

Reported-by: Florian Weimer 
Fixes: c2bc11f10a39 ("x86, AVX-512: Enable AVX-512 States Context Switch")
Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: H.J. Lu 
Cc: Fenghua Yu 
Cc: Dave Martin 
Cc: Michael Ellerman 
Cc: x...@kernel.org
Cc: libc-al...@sourceware.org
Cc: linux-a...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=153531
---
 arch/x86/include/asm/elf.h | 4 
 arch/x86/include/uapi/asm/auxvec.h | 6 --
 arch/x86/kernel/signal.c   | 5 +
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index b9a5d488f1a5..044b024abea1 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -311,6 +311,7 @@ do {
\
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY);\
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE);\
}   \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   
\
 } while (0)
 
 /*
@@ -327,6 +328,7 @@ extern unsigned long task_size_32bit(void);
 extern unsigned long task_size_64bit(int full_addr_space);
 extern unsigned long get_mmap_base(int is_legacy);
 extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
+extern unsigned long get_sigframe_size(void);
 
 #ifdef CONFIG_X86_32
 
@@ -348,6 +350,7 @@ do {
\
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR,\
(unsigned long __force)current->mm->context.vdso); \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   
\
 } while (0)
 
 /* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */
@@ -356,6 +359,7 @@ do {
\
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR,\
(unsigned long __force)current->mm->context.vdso); \
+   NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());   
\
 } while (0)
 
 #define AT_SYSINFO 32
diff --git a/arch/x86/include/uapi/asm/auxvec.h 
b/arch/x86/include/uapi/asm/auxvec.h
index 580e3c567046..edd7808060e6 100644
--- a/arch/x86/include/uapi/asm/auxvec.h
+++ b/arch/x86/include/uapi/asm/auxvec.h
@@ -10,11 +10,13 @@
 #endif
 #define AT_SYSINFO_EHDR33
 
+#define AT_MINSIGSTKSZ 51
+
 /* entries in ARCH_DLINFO: */
 #if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
-# define AT_VECTOR_SIZE_ARCH 2
+# define AT_VECTOR_SIZE_ARCH 3
 #else /* else it's non-compat x86-64 */
-# define AT_VECTOR_SIZE_ARCH 1
+# define AT_VECTOR_SIZE_ARCH 2
 #endif
 
 #endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 138a9f5b78d8..761d856f8ef7 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -716,6 +716,11 @@ void __init init_sigframe_size(void)
max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
 }
 
+unsigned long get_sigframe_size(void)
+{
+   return max_frame_size;
+}
+
 static inline int is_ia32_compat_frame(struct ksignal *ksig)
 {
return IS_ENABLED(CONFIG_IA32_EMULATION) &&
-- 
2.17.1



[PATCH v3 1/4] x86/signal: Introduce helpers to get the maximum signal frame size

2020-12-22 Thread Chang S. Bae
Signal frames do not have a fixed format and can vary in size when a number
of things change: supported XSAVE features, 32- vs. 64-bit apps. Add the code
to support a runtime method for userspace to dynamically discover how large
a signal stack needs to be.

Introduce a new variable, max_frame_size, and helper functions for the
calculation to be used in a new user interface. Set max_frame_size to a
system-wide worst-case value, instead of storing multiple app-specific
values.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Acked-by: H.J. Lu 
Cc: Borislav Petkov 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v2:
* Renamed the fpstate size helper with cleanup (Borislav Petkov)
* Moved the sigframe struct size defines to where used (Borislav Petkov)
* Removed unneeded sentence in the changelog (Borislav Petkov)

Change from v1:
* Took stack alignment into account for sigframe size (Dave Martin)
---
 arch/x86/include/asm/fpu/signal.h |  2 ++
 arch/x86/include/asm/sigframe.h   |  2 ++
 arch/x86/kernel/cpu/common.c  |  3 ++
 arch/x86/kernel/fpu/signal.c  | 19 +++
 arch/x86/kernel/signal.c  | 57 +--
 5 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/fpu/signal.h 
b/arch/x86/include/asm/fpu/signal.h
index 7fb516b6893a..8b6631dffefd 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -29,6 +29,8 @@ unsigned long
 fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
 unsigned long *buf_fx, unsigned long *size);
 
+unsigned long fpu__get_fpstate_size(void);
+
 extern void fpu__init_prepare_fx_sw_frame(void);
 
 #endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 84eab2724875..5b1ed650b124 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -85,4 +85,6 @@ struct rt_sigframe_x32 {
 
 #endif /* CONFIG_X86_64 */
 
+void __init init_sigframe_size(void);
+
 #endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..6954932272d5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "cpu.h"
 
@@ -1331,6 +1332,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 
*c)
 
fpu__init_system(c);
 
+   init_sigframe_size();
+
 #ifdef CONFIG_X86_32
/*
 * Regardless of whether PCID is enumerated, the SDM says
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..dbb304e48f16 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -507,6 +507,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
 
return sp;
 }
+
+unsigned long fpu__get_fpstate_size(void)
+{
+   unsigned long ret = xstate_sigframe_size();
+
+   /*
+* This space is needed on (most) 32-bit kernels, or when a 32-bit
+* app is running on a 64-bit kernel. To keep things simple, just
+* assume the worst case and always include space for 'freg_state',
+* even for 64-bit apps on 64-bit kernels. This wastes a bit of
+* space, but keeps the code simple.
+*/
+   if ((IS_ENABLED(CONFIG_IA32_EMULATION) ||
+IS_ENABLED(CONFIG_X86_32)) && use_fxsr())
+   ret += sizeof(struct fregs_state);
+
+   return ret;
+}
+
 /*
  * Prepare the SW reserved portion of the fxsave memory layout, indicating
  * the presence of the extended state information in the memory layout
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index be0d7d4152ec..138a9f5b78d8 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -212,6 +212,11 @@ do {   
\
  * Set up a signal frame.
  */
 
+/* x86 ABI requires 16-byte alignment */
+#define FRAME_ALIGNMENT16UL
+
+#define MAX_FRAME_PADDING  (FRAME_ALIGNMENT - 1)
+
 /*
  * Determine which stack to use..
  */
@@ -222,9 +227,9 @@ static unsigned long align_sigframe(unsigned long sp)
 * Align the stack pointer according to the i386 ABI,
 * i.e. so that on function entry ((sp + 4) & 15) == 0.
 */
-   sp = ((sp + 4) & -16ul) - 4;
+   sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
 #else /* !CONFIG_X86_32 */
-   sp = round_down(sp, 16) - 8;
+   sp = round_down(sp, FRAME_ALIGNMENT) - 8;
 #endif
return sp;
 }
@@ -663,6 +668,54 @@ SYSCALL_DEFINE0(rt_sigreturn)
return 0;
 }
 
+/*
+ * There are four different struct types for signal frame: sigframe_ia32,
+ * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+ * -- the largest size. It means the size for 64-bit apps is a bit more
+ * than needed, but this keeps the code simple.
+ */
+#if def

[PATCH v3 0/4] x86: Improve Minimum Alternate Stack Size

2020-12-22 Thread Chang S. Bae
During signal entry, the kernel pushes data onto the normal userspace
stack. On x86, the data pushed onto the user stack includes XSAVE state,
which has grown over time as new features and larger registers have been
added to the architecture.

MINSIGSTKSZ is a constant provided in the kernel signal.h headers and
typically distributed in lib-dev(el) packages, e.g. [1]. Its value is
compiled into programs and is part of the user/kernel ABI. The MINSIGSTKSZ
constant indicates to userspace how much data the kernel expects to push on
the user stack, [2][3].

However, this constant is much too small and does not reflect recent
additions to the architecture. For instance, when AVX-512 states are in
use, the signal frame size can be 3.5KB while MINSIGSTKSZ remains 2KB.

The bug report [4] explains this as an ABI issue. The small MINSIGSTKSZ can
cause user stack overflow when delivering a signal.

In this series, we suggest a couple of things:
1. Provide a variable minimum stack size to userspace, as a similar
   approach to [5]
2. Avoid using a too-small alternate stack

Changes from v2 [7]:
* Simplified the sigaltstack overflow prevention (Jann Horn)
* Renamed fpstate size helper with cleanup (Borislav Petkov)
* Cleaned up the signframe struct size defines (Borislav Petkov)
* Revised the selftest messages (Borislav Petkov)
* Revised a changelog (Borislav Petkov)

Changes from v1 [6]:
* Took stack alignment into account for sigframe size (Dave Martin)

[1]: 
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/bits/sigstack.h;h=b9dca794da093dc4d41d39db9851d444e1b54d9b;hb=HEAD
[2]: https://www.gnu.org/software/libc/manual/html_node/Signal-Stack.html
[3]: https://man7.org/linux/man-pages/man2/sigaltstack.2.html
[4]: https://bugzilla.kernel.org/show_bug.cgi?id=153531
[5]: 
https://blog.linuxplumbersconf.org/2017/ocw/system/presentations/4671/original/plumbers-dm-2017.pdf
[6]: 
https://lore.kernel.org/lkml/20200929205746.6763-1-chang.seok@intel.com/
[7]: https://lore.kernel.org/lkml/20201119190237.626-1-chang.seok@intel.com/

Chang S. Bae (4):
  x86/signal: Introduce helpers to get the maximum signal frame size
  x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ
  x86/signal: Prevent an alternate stack overflow before a signal
delivery
  selftest/x86/signal: Include test cases for validating sigaltstack

 arch/x86/include/asm/elf.h|   4 +
 arch/x86/include/asm/fpu/signal.h |   2 +
 arch/x86/include/asm/sigframe.h   |   2 +
 arch/x86/include/uapi/asm/auxvec.h|   6 +-
 arch/x86/kernel/cpu/common.c  |   3 +
 arch/x86/kernel/fpu/signal.c  |  19 
 arch/x86/kernel/signal.c  |  69 +++-
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/sigaltstack.c | 128 ++
 9 files changed, 228 insertions(+), 7 deletions(-)
 create mode 100644 tools/testing/selftests/x86/sigaltstack.c

-- 
2.17.1



[RFC PATCH 7/8] crypto: x86/aes-kl - Support AES algorithm using Key Locker instructions

2020-12-16 Thread Chang S. Bae
Key Locker (KL) is Intel's new security feature that protects the AES key
at the time of data transformation. New AES SIMD instructions -- as a
successor of Intel's AES-NI -- are provided to encode an AES key and
reference it for the AES algorithm.

The new instructions support 128-bit and 256-bit keys. While receiving a
192-bit key is not desirable, the AES-NI instructions are used to serve that
size.

New instructions are operational in both 32-/64-bit modes.

Add a set of new macros for the new instructions so that no new binutils
version is required.

Implemented methods are for a single block as well as ECB, CBC, CTR, and
XTS modes. The methods are not compatible with other AES implementations, as
they access an encrypted key instead of the normal AES key.

The setkey() call encodes an AES key. The user may discard the original AES
key once it is encoded, as the encrypt()/decrypt() methods do not need it.

Most of the C code follows the AES-NI implementation. It has a higher
priority than AES-NI, as it provides key protection.

Signed-off-by: Chang S. Bae 
Cc: Herbert Xu 
Cc: x...@kernel.org
Cc: linux-cry...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/crypto/Makefile   |   3 +
 arch/x86/crypto/aeskl-intel_asm.S  | 881 +
 arch/x86/crypto/aeskl-intel_glue.c | 697 +++
 arch/x86/include/asm/inst.h| 201 +++
 crypto/Kconfig |  28 +
 5 files changed, 1810 insertions(+)
 create mode 100644 arch/x86/crypto/aeskl-intel_asm.S
 create mode 100644 arch/x86/crypto/aeskl-intel_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a31de0c6ccde..8e2e34e73a21 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -54,6 +54,9 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 
+obj-$(CONFIG_CRYPTO_AES_KL) += aeskl-intel.o
+aeskl-intel-y := aeskl-intel_asm.o aesni-intel_asm.o aeskl-intel_glue.o
+
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
 sha1-ssse3-$(CONFIG_AS_SHA1_NI) += sha1_ni_asm.o
diff --git a/arch/x86/crypto/aeskl-intel_asm.S 
b/arch/x86/crypto/aeskl-intel_asm.S
new file mode 100644
index ..80ddeda11bdf
--- /dev/null
+++ b/arch/x86/crypto/aeskl-intel_asm.S
@@ -0,0 +1,881 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Implement AES algorithm using Intel AES Key Locker instructions.
+ *
+ * Most codes are based from AES-NI implementation, aesni-intel_asm.S
+ *
+ */
+
+#include 
+#include 
+#include 
+
+#define STATE1 %xmm0
+#define STATE2 %xmm1
+#define STATE3 %xmm2
+#define STATE4 %xmm3
+#define STATE5 %xmm4
+#define STATE6 %xmm5
+#define STATE7 %xmm6
+#define STATE8 %xmm7
+#define STATE  STATE1
+
+#ifdef __x86_64__
+#define IN1%xmm8
+#define IN2%xmm9
+#define IN3%xmm10
+#define IN4%xmm11
+#define IN5%xmm12
+#define IN6%xmm13
+#define IN7%xmm14
+#define IN8%xmm15
+#define IN IN1
+#else
+#define IN %xmm1
+#endif
+
+#ifdef __x86_64__
+#define AREG   %rax
+#define HANDLEP%rdi
+#define OUTP   %rsi
+#define KLEN   %r9d
+#define INP%rdx
+#define T1 %r10
+#define LEN%rcx
+#define IVP%r8
+#else
+#define AREG   %eax
+#define HANDLEP%edi
+#define OUTP   AREG
+#define KLEN   %ebx
+#define INP%edx
+#define T1%ecx
+#define LEN %esi
+#define IVP %ebp
+#endif
+
+#define UKEYP OUTP
+
+/*
+ * int __aeskl_setkey(struct crypto_aes_ctx *ctx,
+ *   const u8 *in_key,
+ *   unsigned int key_len)
+ */
+SYM_FUNC_START(__aeskl_setkey)
+   FRAME_BEGIN
+#ifndef __x86_64__
+   push HANDLEP
+   movl (FRAME_OFFSET+8)(%esp), HANDLEP# ctx
+   movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
+   movl (FRAME_OFFSET+16)(%esp), %edx  # key_len
+#endif
+   movl %edx, 480(HANDLEP)
+   movdqu (UKEYP), STATE1
+   mov $1, %eax
+   cmp $16, %dl
+   je .Lsetkey_128
+
+   movdqu 0x10(UKEYP), STATE2
+   ENCODEKEY256 %eax, %eax
+   movdqu STATE4, 0x30(HANDLEP)
+   jmp .Lsetkey_end
+.Lsetkey_128:
+   ENCODEKEY128 %eax, %eax
+
+.Lsetkey_end:
+   movdqu STATE1, (HANDLEP)
+   movdqu STATE2, 0x10(HANDLEP)
+   movdqu STATE3, 0x20(HANDLEP)
+
+   xor AREG, AREG
+#ifndef __x86_64__
+   popl HANDLEP
+#endif
+   FRAME_END
+   ret
+SYM_FUNC_END(__aeskl_setkey)
+
+/*
+ * int __aeskl_enc1(const void *ctx,
+ * u8 *dst,
+ * const u8 *src)
+ */
+SYM_FUNC_START(__aeskl_enc1)
+   FRAME_BEGIN
+#ifndef __x86_64__
+   pushl HANDLEP
+   pushl KLEN
+   movl (FRAME_OFFSET+12)(%esp), HANDLEP   # ctx
+   movl (FRAME_OFFSET+16)(%esp), OUTP  # dst
+   movl (FRAME_OFFSET+20)(%esp), INP   # src
+#endif
+   movdqu (INP), STATE
+   movl 480(HANDLEP), KLEN
+
+   cmp

[RFC PATCH 6/8] selftests/x86: Test Key Locker internal key maintenance

2020-12-16 Thread Chang S. Bae
The test validates the internal key to be the same in all CPUs.

It performs the validation again with the Suspend-To-RAM (ACPI S3) state.

Signed-off-by: Chang S. Bae 
Cc: linux-kernel@vger.kernel.org
Cc: linux-kselft...@vger.kernel.org
---
 tools/testing/selftests/x86/Makefile|   2 +-
 tools/testing/selftests/x86/keylocker.c | 177 
 2 files changed, 178 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/keylocker.c

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index 6703c7906b71..c53e496d77b2 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -13,7 +13,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) 
trivial_program.c -no-pie)
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt 
test_mremap_vdso \
check_initial_reg_state sigreturn iopl ioperm \
test_vdso test_vsyscall mov_ss_trap \
-   syscall_arg_fault fsgsbase_restore
+   syscall_arg_fault fsgsbase_restore keylocker
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
diff --git a/tools/testing/selftests/x86/keylocker.c 
b/tools/testing/selftests/x86/keylocker.c
new file mode 100644
index ..3d69c1615bca
--- /dev/null
+++ b/tools/testing/selftests/x86/keylocker.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * keylocker.c, validating the internal key management
+ */
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define HANDLE_SIZE48
+
+static bool keylocker_disabled;
+
+/* Encode a 128-bit key to a 384-bit handle */
+static inline void __encode_key(char *handle)
+{
+   static const unsigned char aeskey[] = { 0x31, 0x32, 0x33, 0x34, 0x35, 
0x36, 0x37, 0x38,
+   0x71, 0x77, 0x74, 0x69, 0x6f, 
0x6b, 0x6c, 0x78 };
+
+   asm volatile ("movdqu %0, %%xmm0" : : "m" (*aeskey) :);
+
+   /* Set no restriction to the handle */
+   asm volatile ("mov $0, %%eax" :);
+
+   /* ENCODEKEY128 %EAX */
+   asm volatile (".byte 0xf3, 0xf, 0x38, 0xfa, 0xc0");
+
+   asm volatile ("movdqu %%xmm0, %0; movdqu %%xmm1, %1; movdqu %%xmm2, %2;"
+ : "=m" (handle[0]), "=m" (handle[0x10]), "=m" 
(handle[0x20]));
+}
+
+static jmp_buf jmpbuf;
+
+static void handle_sigill(int sig, siginfo_t *si, void *ctx_void)
+{
+   keylocker_disabled = true;
+   siglongjmp(jmpbuf, 1);
+}
+
+static bool encode_key(char *handle)
+{
+   bool success = true;
+   struct sigaction sa;
+   int ret;
+
+   memset(, 0, sizeof(sa));
+
+   /* Set signal handler */
+   sa.sa_flags = SA_SIGINFO;
+   sa.sa_sigaction = handle_sigill;
+   sigemptyset(_mask);
+   ret = sigaction(SIGILL, , 0);
+   if (ret)
+   err(1, "sigaction");
+
+   if (sigsetjmp(jmpbuf, 1))
+   success = false;
+   else
+   __encode_key(handle);
+
+   /* Clear signal handler */
+   sa.sa_flags = 0;
+   sa.sa_sigaction = NULL;
+   sa.sa_handler = SIG_DFL;
+   sigemptyset(_mask);
+   ret = sigaction(SIGILL, , 0);
+   if (ret)
+   err(1, "sigaction");
+
+   return success;
+}
+
+/*
+ * Test if the internal key is the same in all the CPUs:
+ *
+ * Since the value is not readable, compare the encoded output of a AES key
+ * between CPUs.
+ */
+
+static int nerrs;
+
+static unsigned char cpu0_handle[HANDLE_SIZE] = { 0 };
+
+static void test_internal_key(bool slept, long cpus)
+{
+   int cpu, errs;
+
+   printf("Test the internal key consistency between CPUs\n");
+
+   for (cpu = 0, errs = 0; cpu < cpus; cpu++) {
+   char handle[HANDLE_SIZE] = { 0 };
+   cpu_set_t mask;
+   bool success;
+
+   CPU_ZERO();
+   CPU_SET(cpu, );
+   sched_setaffinity(0, sizeof(cpu_set_t), );
+
+   success = encode_key(handle);
+   if (!success) {
+   /* The encode should success after the S3 sleep */
+   if (slept)
+   errs++;
+   printf("[%s]\tKey Locker disabled at CPU%d\n",
+  slept ? "FAIL" : "NOTE", cpu);
+   continue;
+   }
+
+   if (cpu == 0 && !slept) {
+   /* Record the first handle value as reference */
+   memcpy(cpu0_handle, handle, HANDLE_SIZE);
+   } else if (memc

[RFC PATCH 5/8] x86/cpu: Add a config option and a chicken bit for Key Locker

2020-12-16 Thread Chang S. Bae
Add a kernel config option to enable the feature (disabled by default) at
compile-time.

Also, add a new command-line parameter -- 'nokeylocker' to disable the
feature at boot-time.

Signed-off-by: Chang S. Bae 
Cc: x...@kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 Documentation/admin-guide/kernel-parameters.txt |  2 ++
 arch/x86/Kconfig| 14 ++
 arch/x86/kernel/cpu/common.c| 16 
 3 files changed, 32 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 44fde25bb221..c389ad8fb9de 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3220,6 +3220,8 @@
 
nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
 
+   nokeylocker [X86] Disables Key Locker hardware feature.
+
nosmt   [KNL,S390] Disable symmetric multithreading (SMT).
Equivalent to smt=1.
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fbf26e0f7a6a..7623af32f919 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1886,6 +1886,20 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
 
  If unsure, say y.
 
+config X86_KEYLOCKER
+   prompt "Key Locker"
+   def_bool n
+   depends on CPU_SUP_INTEL
+   help
+ Key Locker is a new security feature to protect a data encryption
+ key for the Advanced Encryption Standard (AES) algorithm.
+
+ When enabled, every CPU has a unique internal key to wrap the AES
+ key into an encoded format.  The internal key is not accessible
+ to software once loaded.
+
+ If unsure, say y.
+
 choice
prompt "TSX enable mode"
depends on CPU_SUP_INTEL
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a446d5aff08f..ba5bd79fbac2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -354,6 +354,22 @@ static __always_inline void setup_umip(struct cpuinfo_x86 
*c)
 /* These bits should not change their value after CPU init is finished. */
 static const unsigned long cr4_pinned_mask =
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE;
+
+static __init int x86_nokeylocker_setup(char *arg)
+{
+   /* Expect an exact match without trailing characters */
+   if (strlen(arg))
+   return 0;
+
+   if (!cpu_feature_enabled(X86_FEATURE_KEYLOCKER))
+   return 1;
+
+   setup_clear_cpu_cap(X86_FEATURE_KEYLOCKER);
+   pr_info("x86/keylocker: Disabled by kernel command line\n");
+   return 1;
+}
+__setup("nokeylocker", x86_nokeylocker_setup);
+
 static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
 static unsigned long cr4_pinned_bits __ro_after_init;
 
-- 
2.17.1



[RFC PATCH 4/8] x86/power: Restore Key Locker internal key from the ACPI S3/4 sleep states

2020-12-16 Thread Chang S. Bae
When the system state switches to these sleep states, the internal key gets
reset. Since this system transition is transparent to userspace, the
internal key needs to be restored properly.

Key Locker provides a mechanism to back up the internal key in non-volatile
memory. The kernel requests a backup right after the key is loaded at
boot-time and copies it back when the system wakes up.

The backup during the S5 sleep state is not trusted. It is overwritten by a
new key at the next boot.

On a system with the S3/4 states, enable the feature only when the backup
mechanism is supported.

Disable the feature when the copy fails (or the backup is corrupted). A
shutdown is considered too noisy. Generating a new key is feasible only when
all threads can be synchronously suspended.

Signed-off-by: Chang S. Bae 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux...@vger.kernel.org
---
 arch/x86/include/asm/keylocker.h | 12 
 arch/x86/kernel/cpu/common.c | 25 +++-
 arch/x86/kernel/keylocker.c  | 51 
 arch/x86/power/cpu.c | 34 +
 4 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/keylocker.h b/arch/x86/include/asm/keylocker.h
index daf0734a4095..722574c305c2 100644
--- a/arch/x86/include/asm/keylocker.h
+++ b/arch/x86/include/asm/keylocker.h
@@ -6,6 +6,7 @@
 #ifndef __ASSEMBLY__
 
 #include 
+#include 
 
 #define KEYLOCKER_CPUID0x019
 #define KEYLOCKER_CPUID_EAX_SUPERVISOR BIT(0)
@@ -25,5 +26,16 @@ void invalidate_keylocker_data(void);
 #define invalidate_keylocker_data() do { } while (0)
 #endif
 
+static inline u64 read_keylocker_backup_status(void)
+{
+   u64 status;
+
+   rdmsrl(MSR_IA32_IWKEYBACKUP_STATUS, status);
+   return status;
+}
+
+void backup_keylocker(void);
+bool copy_keylocker(void);
+
 #endif /*__ASSEMBLY__ */
 #endif /* _ASM_KEYLOCKER_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d675075848bb..a446d5aff08f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -463,24 +463,35 @@ __setup("nofsgsbase", x86_nofsgsbase_setup);
 
 static __always_inline void setup_keylocker(struct cpuinfo_x86 *c)
 {
-   bool keyloaded;
-
if (!cpu_feature_enabled(X86_FEATURE_KEYLOCKER))
goto out;
 
cr4_set_bits(X86_CR4_KEYLOCKER);
 
if (c == _cpu_data) {
+   bool keyloaded;
+
if (!check_keylocker_readiness())
goto disable_keylocker;
 
make_keylocker_data();
-   }
 
-   keyloaded = load_keylocker();
-   if (!keyloaded) {
-   pr_err_once("x86/keylocker: Failed to load internal key\n");
-   goto disable_keylocker;
+   keyloaded = load_keylocker();
+   if (!keyloaded) {
+   pr_err("x86/keylocker: Fail to load internal key\n");
+   goto disable_keylocker;
+   }
+
+   backup_keylocker();
+   } else {
+   bool keycopied;
+
+   /* NB: When system wakes up, this path recovers the internal 
key. */
+   keycopied = copy_keylocker();
+   if (!keycopied) {
+   pr_err_once("x86/keylocker: Fail to copy internal 
key\n");
+   goto disable_keylocker;
+   }
}
 
pr_info_once("x86/keylocker: Activated\n");
diff --git a/arch/x86/kernel/keylocker.c b/arch/x86/kernel/keylocker.c
index e455d806b80c..229875ac80d5 100644
--- a/arch/x86/kernel/keylocker.c
+++ b/arch/x86/kernel/keylocker.c
@@ -5,11 +5,15 @@
  */
 
 #include 
+#include 
+#include 
 
 #include 
 #include 
 #include 
 
+static bool keybackup_available;
+
 bool check_keylocker_readiness(void)
 {
u32 eax, ebx, ecx, edx;
@@ -21,6 +25,14 @@ bool check_keylocker_readiness(void)
return false;
}
 
+   keybackup_available = (ebx & KEYLOCKER_CPUID_EBX_BACKUP);
+   /* Internal Key backup is essential with S3/4 states */
+   if (!keybackup_available &&
+   (acpi_sleep_state_supported(ACPI_STATE_S3) ||
+acpi_sleep_state_supported(ACPI_STATE_S4))) {
+   pr_debug("x86/keylocker: no key backup support with possible 
S3/4\n");
+   return false;
+   }
return true;
 }
 
@@ -29,6 +41,7 @@ bool check_keylocker_readiness(void)
 #define LOADIWKEY_NUM_OPERANDS 3
 
 static struct key {
+   bool valid;
struct reg_128_bit value[LOADIWKEY_NUM_OPERANDS];
 } keydata;
 
@@ -38,11 +51,15 @@ void make_keylocker_data(void)
 
for (i = 0; i < LOADIWKEY_NUM_OPERANDS; i++)
get_random_bytes([i], sizeof(struct reg_128_bit));
+
+   keydata.valid = true;
 }
 
 void invalidate_keylocker_data(void)
 {
memset(, 0, sizeof(struct reg_128_bit)

[RFC PATCH 3/8] x86/msr-index: Add MSRs for Key Locker internal key

2020-12-16 Thread Chang S. Bae
Key Locker internal key in a CPU state can be backed up in a platform
register. The backup can be also copied back to a CPU state. This mechanism
is useful to restore the key (after system sleep).

Add MSRs for the internal key backup, copy, and status check.

Signed-off-by: Chang S. Bae 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/msr-index.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 972a34d93505..c0b9157806f7 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -922,4 +922,10 @@
 #define MSR_VM_IGNNE0xc0010115
 #define MSR_VM_HSAVE_PA 0xc0010117
 
+/* MSRs for Key Locker Internal (Wrapping) Key management */
+#define MSR_IA32_COPY_LOCAL_TO_PLATFORM0x0d91
+#define MSR_IA32_COPY_PLATFORM_TO_LOCAL0x0d92
+#define MSR_IA32_COPY_STATUS   0x0990
+#define MSR_IA32_IWKEYBACKUP_STATUS0x0991
+
 #endif /* _ASM_X86_MSR_INDEX_H */
-- 
2.17.1



[RFC PATCH 8/8] x86/cpu: Support the hardware randomization option for Key Locker internal key

2020-12-16 Thread Chang S. Bae
Hardware can load the internal key with randomization. random.trust_cpu
determines the use of the CPU's random number generator. Take that parameter
into account when deciding whether to use the CPU's internal key
randomization.

The backup mechanism is required to distribute the key. It is the only
way to copy the (unknown) key value to other CPUs.

This randomization option is disabled when hardware does not support the
key backup.

Signed-off-by: Chang S. Bae 
Cc: Mark Brown 
Cc: x...@kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/keylocker.h |  2 +-
 arch/x86/kernel/cpu/common.c |  3 ++-
 arch/x86/kernel/keylocker.c  | 31 ---
 drivers/char/random.c|  6 ++
 include/linux/random.h   |  2 ++
 5 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/keylocker.h b/arch/x86/include/asm/keylocker.h
index 722574c305c2..a6774ced916a 100644
--- a/arch/x86/include/asm/keylocker.h
+++ b/arch/x86/include/asm/keylocker.h
@@ -19,7 +19,7 @@ bool check_keylocker_readiness(void);
 
 bool load_keylocker(void);
 
-void make_keylocker_data(void);
+void make_keylocker_data(bool use_hwrand);
 #ifdef CONFIG_X86_KEYLOCKER
 void invalidate_keylocker_data(void);
 #else
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ba5bd79fbac2..48881d8ea559 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -485,12 +485,13 @@ static __always_inline void setup_keylocker(struct 
cpuinfo_x86 *c)
cr4_set_bits(X86_CR4_KEYLOCKER);
 
if (c == _cpu_data) {
+   bool use_hwrand = check_random_trust_cpu();
bool keyloaded;
 
if (!check_keylocker_readiness())
goto disable_keylocker;
 
-   make_keylocker_data();
+   make_keylocker_data(use_hwrand);
 
keyloaded = load_keylocker();
if (!keyloaded) {
diff --git a/arch/x86/kernel/keylocker.c b/arch/x86/kernel/keylocker.c
index 229875ac80d5..e77e4c3d785e 100644
--- a/arch/x86/kernel/keylocker.c
+++ b/arch/x86/kernel/keylocker.c
@@ -13,6 +13,7 @@
 #include 
 
 static bool keybackup_available;
+static bool keyhwrand_available;
 
 bool check_keylocker_readiness(void)
 {
@@ -33,25 +34,33 @@ bool check_keylocker_readiness(void)
pr_debug("x86/keylocker: no key backup support with possible 
S3/4\n");
return false;
}
+
+   keyhwrand_available = (ecx & KEYLOCKER_CPUID_ECX_RAND);
return true;
 }
 
 /* Load Internal (Wrapping) Key */
 #define LOADIWKEY  ".byte 0xf3,0x0f,0x38,0xdc,0xd1"
 #define LOADIWKEY_NUM_OPERANDS 3
+#define LOADIWKEY_HWRAND_RETRY 10
 
 static struct key {
bool valid;
+   bool hwrand;
struct reg_128_bit value[LOADIWKEY_NUM_OPERANDS];
 } keydata;
 
-void make_keylocker_data(void)
+void make_keylocker_data(bool use_hwrand)
 {
int i;
 
for (i = 0; i < LOADIWKEY_NUM_OPERANDS; i++)
get_random_bytes([i], sizeof(struct reg_128_bit));
 
+   keydata.hwrand = (use_hwrand && keyhwrand_available && 
keybackup_available);
+   if (use_hwrand && !keydata.hwrand)
+   pr_warn("x86/keylocker: hardware random key not fully 
supported\n");
+
keydata.valid = true;
 }
 
@@ -63,12 +72,22 @@ void invalidate_keylocker_data(void)
 }
 
 #define USE_SWKEY  0
+#define USE_HWRANDKEY  BIT(1)
 
 bool load_keylocker(void)
 {
struct reg_128_bit zeros = { 0 };
-   u32 keysrc = USE_SWKEY;
bool err = true;
+   u32 keysrc;
+   int retry;
+
+   if (keydata.hwrand) {
+   keysrc = USE_HWRANDKEY;
+   retry = LOADIWKEY_HWRAND_RETRY;
+   } else {
+   keysrc = USE_SWKEY;
+   retry = 0;
+   }
 
kernel_fpu_begin();
 
@@ -77,13 +96,19 @@ bool load_keylocker(void)
 "m"(keydata.value[1]),
 "m"(keydata.value[2]));
 
-   asm volatile (LOADIWKEY CC_SET(z) : CC_OUT(z) (err) : "a"(keysrc));
+   do {
+   asm volatile (LOADIWKEY CC_SET(z) : CC_OUT(z) (err) : 
"a"(keysrc));
+   retry--;
+   } while (err && retry >= 0);
 
asm volatile ("movdqu %0, %%xmm0; movdqu %0, %%xmm1; movdqu %0, %%xmm2;"
  :: "m"(zeros));
 
kernel_fpu_end();
 
+   if (keydata.hwrand)
+   invalidate_keylocker_data();
+
return err ? false : true;
 }
 
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 2a41b21623ae..3ee0d659ab2a 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -781,6 +781,12 @@ static int __init parse_trust_cpu(char *arg)
 }
 early_param("random.trust_cpu", parse_trust_cpu);
 
+bool check_random_trust_cpu(void)
+{

[RFC PATCH 2/8] x86/cpu: Load Key Locker internal key at boot-time

2020-12-16 Thread Chang S. Bae
Internal (Wrapping) Key is a new entity of Intel Key Locker feature. This
internal key is loaded in a software-inaccessible CPU state and used to
encode a data encryption key.

The kernel generates random data and loads it as the internal key in each CPU.
The data needs to be invalidated as soon as the load is done.

The BIOS may disable the feature. Check the dynamic CPUID bit
(KEYLOCKER_CPUID_EBX_AESKLE) at first.

Add byte code for LOADIWKEY -- an instruction to load the internal key, in
the 'x86-opcode-map.txt' file to avoid objtool's misinterpretation.

Signed-off-by: Chang S. Bae 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/keylocker.h  | 11 +
 arch/x86/kernel/Makefile  |  1 +
 arch/x86/kernel/cpu/common.c  | 38 +-
 arch/x86/kernel/keylocker.c   | 71 +++
 arch/x86/kernel/smpboot.c |  2 +
 arch/x86/lib/x86-opcode-map.txt   |  2 +-
 tools/arch/x86/lib/x86-opcode-map.txt |  2 +-
 7 files changed, 124 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kernel/keylocker.c

diff --git a/arch/x86/include/asm/keylocker.h b/arch/x86/include/asm/keylocker.h
index 2fe13c21c63f..daf0734a4095 100644
--- a/arch/x86/include/asm/keylocker.h
+++ b/arch/x86/include/asm/keylocker.h
@@ -14,5 +14,16 @@
 #define KEYLOCKER_CPUID_EBX_BACKUP BIT(4)
 #define KEYLOCKER_CPUID_ECX_RAND   BIT(1)
 
+bool check_keylocker_readiness(void);
+
+bool load_keylocker(void);
+
+void make_keylocker_data(void);
+#ifdef CONFIG_X86_KEYLOCKER
+void invalidate_keylocker_data(void);
+#else
+#define invalidate_keylocker_data() do { } while (0)
+#endif
+
 #endif /*__ASSEMBLY__ */
 #endif /* _ASM_KEYLOCKER_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 68608bd892c0..085dbf49b3b9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -145,6 +145,7 @@ obj-$(CONFIG_PERF_EVENTS)   += perf_regs.o
 obj-$(CONFIG_TRACING)  += tracepoint.o
 obj-$(CONFIG_SCHED_MC_PRIO)+= itmt.o
 obj-$(CONFIG_X86_UMIP) += umip.o
+obj-$(CONFIG_X86_KEYLOCKER)+= keylocker.o
 
 obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
 obj-$(CONFIG_UNWINDER_FRAME_POINTER)   += unwind_frame.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..d675075848bb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -57,6 +57,8 @@
 #include 
 #include 
 #include 
+#include 
+
 #include 
 
 #include "cpu.h"
@@ -459,6 +461,39 @@ static __init int x86_nofsgsbase_setup(char *arg)
 }
 __setup("nofsgsbase", x86_nofsgsbase_setup);
 
+static __always_inline void setup_keylocker(struct cpuinfo_x86 *c)
+{
+   bool keyloaded;
+
+   if (!cpu_feature_enabled(X86_FEATURE_KEYLOCKER))
+   goto out;
+
+   cr4_set_bits(X86_CR4_KEYLOCKER);
+
+   if (c == _cpu_data) {
+   if (!check_keylocker_readiness())
+   goto disable_keylocker;
+
+   make_keylocker_data();
+   }
+
+   keyloaded = load_keylocker();
+   if (!keyloaded) {
+   pr_err_once("x86/keylocker: Failed to load internal key\n");
+   goto disable_keylocker;
+   }
+
+   pr_info_once("x86/keylocker: Activated\n");
+   return;
+
+disable_keylocker:
+   clear_cpu_cap(c, X86_FEATURE_KEYLOCKER);
+   pr_info_once("x86/keylocker: Disabled\n");
+out:
+   /* Make sure the feature disabled for kexec-reboot. */
+   cr4_clear_bits(X86_CR4_KEYLOCKER);
+}
+
 /*
  * Protection Keys are not available in 32-bit mode.
  */
@@ -1554,10 +1589,11 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
 
-   /* Set up SMEP/SMAP/UMIP */
+   /* Setup various Intel-specific CPU security features */
setup_smep(c);
setup_smap(c);
setup_umip(c);
+   setup_keylocker(c);
 
/* Enable FSGSBASE instructions if available. */
if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
diff --git a/arch/x86/kernel/keylocker.c b/arch/x86/kernel/keylocker.c
new file mode 100644
index ..e455d806b80c
--- /dev/null
+++ b/arch/x86/kernel/keylocker.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Key Locker feature check and support the internal key
+ */
+
+#include 
+
+#include 
+#include 
+#include 
+
+bool check_keylocker_readiness(void)
+{
+   u32 eax, ebx, ecx, edx;
+
+   cpuid_count(KEYLOCKER_CPUID, 0, , , , );
+   /* BIOS may not enable it on some systems. */
+   if (!(ebx & KEYLOCKER_CPUID_EBX_AESKLE)) {
+   pr_debug("x86/keylocker: not fully enabled\n");
+   return false;
+   }
+
+   return true;
+}
+
+/* Load Internal (Wrapping) Key */
+#define LOADIWKEY  ".byte 0xf3,0x0f,0

[RFC PATCH 1/8] x86/cpufeature: Enumerate Key Locker feature

2020-12-16 Thread Chang S. Bae
Intel's Key Locker is a new security feature providing a mechanism to
protect a data encryption key when processing the Advanced Encryption
Standard algorithm.

Here we add it to the kernel/user ABI by enumerating the hardware
capability. E.g., /proc/cpuinfo: keylocker.

Also, define the feature-specific CPUID leaf and bits for the feature
enablement.

Key Locker is on the disabled list, which is useful for compile-time
configuration later.

Signed-off-by: Chang S. Bae 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/cpufeatures.h  |  1 +
 arch/x86/include/asm/disabled-features.h|  8 +++-
 arch/x86/include/asm/keylocker.h| 18 ++
 arch/x86/include/uapi/asm/processor-flags.h |  2 ++
 arch/x86/kernel/cpu/cpuid-deps.c|  1 +
 5 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/keylocker.h

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index dad350d42ecf..8f2f050023b7 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -352,6 +352,7 @@
 #define X86_FEATURE_AVX512_VPOPCNTDQ   (16*32+14) /* POPCNT for vectors of 
DW/QW */
 #define X86_FEATURE_LA57   (16*32+16) /* 5-level page tables */
 #define X86_FEATURE_RDPID  (16*32+22) /* RDPID instruction */
+#define X86_FEATURE_KEYLOCKER  (16*32+23) /* Key Locker */
 #define X86_FEATURE_CLDEMOTE   (16*32+25) /* CLDEMOTE instruction */
 #define X86_FEATURE_MOVDIRI(16*32+27) /* MOVDIRI instruction */
 #define X86_FEATURE_MOVDIR64B  (16*32+28) /* MOVDIR64B instruction */
diff --git a/arch/x86/include/asm/disabled-features.h 
b/arch/x86/include/asm/disabled-features.h
index 5861d34f9771..0ac9414da242 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -44,6 +44,12 @@
 # define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
 #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
 
+#ifdef CONFIG_X86_KEYLOCKER
+# define DISABLE_KEYLOCKER 0
+#else
+# define DISABLE_KEYLOCKER (1<<(X86_FEATURE_KEYLOCKER & 31))
+#endif /* CONFIG_X86_KEYLOCKER */
+
 #ifdef CONFIG_X86_5LEVEL
 # define DISABLE_LA57  0
 #else
@@ -82,7 +88,7 @@
 #define DISABLED_MASK140
 #define DISABLED_MASK150
 #define DISABLED_MASK16
(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
-DISABLE_ENQCMD)
+DISABLE_ENQCMD|DISABLE_KEYLOCKER)
 #define DISABLED_MASK170
 #define DISABLED_MASK180
 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
diff --git a/arch/x86/include/asm/keylocker.h b/arch/x86/include/asm/keylocker.h
new file mode 100644
index ..2fe13c21c63f
--- /dev/null
+++ b/arch/x86/include/asm/keylocker.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _ASM_KEYLOCKER_H
+#define _ASM_KEYLOCKER_H
+
+#ifndef __ASSEMBLY__
+
+#include 
+
+#define KEYLOCKER_CPUID0x019
+#define KEYLOCKER_CPUID_EAX_SUPERVISOR BIT(0)
+#define KEYLOCKER_CPUID_EBX_AESKLE BIT(0)
+#define KEYLOCKER_CPUID_EBX_WIDE   BIT(2)
+#define KEYLOCKER_CPUID_EBX_BACKUP BIT(4)
+#define KEYLOCKER_CPUID_ECX_RAND   BIT(1)
+
+#endif /*__ASSEMBLY__ */
+#endif /* _ASM_KEYLOCKER_H */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h 
b/arch/x86/include/uapi/asm/processor-flags.h
index bcba3c643e63..b958a95a0908 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -124,6 +124,8 @@
 #define X86_CR4_PCIDE  _BITUL(X86_CR4_PCIDE_BIT)
 #define X86_CR4_OSXSAVE_BIT18 /* enable xsave and xrestore */
 #define X86_CR4_OSXSAVE_BITUL(X86_CR4_OSXSAVE_BIT)
+#define X86_CR4_KEYLOCKER_BIT  19 /* enable Key Locker */
+#define X86_CR4_KEYLOCKER  _BITUL(X86_CR4_KEYLOCKER_BIT)
 #define X86_CR4_SMEP_BIT   20 /* enable SMEP support */
 #define X86_CR4_SMEP   _BITUL(X86_CR4_SMEP_BIT)
 #define X86_CR4_SMAP_BIT   21 /* enable SMAP support */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index d502241995a3..b8edcb91fe4f 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -71,6 +71,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512_BF16,  X86_FEATURE_AVX512VL  },
{ X86_FEATURE_ENQCMD,   X86_FEATURE_XSAVES},
{ X86_FEATURE_PER_THREAD_MBA,   X86_FEATURE_MBA   },
+   { X86_FEATURE_KEYLOCKER,X86_FEATURE_XMM2  },
{}
 };
 
-- 
2.17.1



[RFC PATCH 0/8] x86: Support Intel Key Locker

2020-12-16 Thread Chang S. Bae
Key Locker [1][2] is a new security feature available in new Intel CPUs to
protect data encryption keys for the Advanced Encryption Standard
algorithm. The protection limits the amount of time an AES key is exposed
in memory by sealing a key and referencing it with new AES instructions.

The new AES instruction set is a successor of Intel's AES-NI (AES New
Instruction). Users may switch to the Key Locker version from crypto
libraries.  This series includes a new AES implementation for the Crypto
API, which was validated through the crypto unit tests. The performance in
the test cases was measured and found comparable to the AES-NI version.

Key Locker introduces a (CPU-)internal key to encode AES keys. The kernel
needs to load it and ensure it stays unchanged as long as CPUs are operational.

The series has three parts:
* PATCH1-6: Implement the internal key management
* PATCH7:   Add AES implementation in Crypto library
* PATCH8:   Provide the hardware randomization option for the internal key

This RFC series has been reviewed by Dan Williams, with an open question of
whether to use hardware backup/restore, or to synchronously reinitialize the
internal key across suspend/resume to avoid the implications of key restore
failures.

[1] Intel Architecture Instruction Set Extensions Programming Reference:

https://software.intel.com/content/dam/develop/external/us/en/documents/architecture-instruction-set-$
[2] Intel Key Locker Specification:

https://software.intel.com/content/dam/develop/external/us/en/documents/343965-intel-key-locker-speci$

Chang S. Bae (8):
  x86/cpufeature: Enumerate Key Locker feature
  x86/cpu: Load Key Locker internal key at boot-time
  x86/msr-index: Add MSRs for Key Locker internal key
  x86/power: Restore Key Locker internal key from the ACPI S3/4 sleep
states
  x86/cpu: Add a config option and a chicken bit for Key Locker
  selftests/x86: Test Key Locker internal key maintenance
  crypto: x86/aes-kl - Support AES algorithm using Key Locker
instructions
  x86/cpu: Support the hardware randomization option for Key Locker
internal key

 .../admin-guide/kernel-parameters.txt |   2 +
 arch/x86/Kconfig  |  14 +
 arch/x86/crypto/Makefile  |   3 +
 arch/x86/crypto/aeskl-intel_asm.S | 881 ++
 arch/x86/crypto/aeskl-intel_glue.c| 697 ++
 arch/x86/include/asm/cpufeatures.h|   1 +
 arch/x86/include/asm/disabled-features.h  |   8 +-
 arch/x86/include/asm/inst.h   | 201 
 arch/x86/include/asm/keylocker.h  |  41 +
 arch/x86/include/asm/msr-index.h  |   6 +
 arch/x86/include/uapi/asm/processor-flags.h   |   2 +
 arch/x86/kernel/Makefile  |   1 +
 arch/x86/kernel/cpu/common.c  |  66 +-
 arch/x86/kernel/cpu/cpuid-deps.c  |   1 +
 arch/x86/kernel/keylocker.c   | 147 +++
 arch/x86/kernel/smpboot.c |   2 +
 arch/x86/lib/x86-opcode-map.txt   |   2 +-
 arch/x86/power/cpu.c  |  34 +
 crypto/Kconfig|  28 +
 drivers/char/random.c |   6 +
 include/linux/random.h|   2 +
 tools/arch/x86/lib/x86-opcode-map.txt |   2 +-
 tools/testing/selftests/x86/Makefile  |   2 +-
 tools/testing/selftests/x86/keylocker.c   | 177 
 24 files changed, 2321 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/crypto/aeskl-intel_asm.S
 create mode 100644 arch/x86/crypto/aeskl-intel_glue.c
 create mode 100644 arch/x86/include/asm/keylocker.h
 create mode 100644 arch/x86/kernel/keylocker.c
 create mode 100644 tools/testing/selftests/x86/keylocker.c

-- 
2.17.1



[PATCH v2 02/22] x86/fpu/xstate: Modify xstate copy helper prototypes to access all the possible areas

2020-11-19 Thread Chang S. Bae
The xstate infrastructure is not flexible to support dynamic areas in
task->fpu. Make the xstate copy functions to access task->fpu directly.

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/fpu/xstate.h |  8 
 arch/x86/kernel/fpu/regset.c  |  6 +++---
 arch/x86/kernel/fpu/signal.c  | 17 -
 arch/x86/kernel/fpu/xstate.c  | 19 +++
 4 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 47a92232d595..e0f1b22f53ce 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -105,10 +105,10 @@ const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
 int xfeature_size(int xfeature_nr);
 struct membuf;
-void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave);
-int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
-int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
-void copy_supervisor_to_kernel(struct xregs_state *xsave);
+void copy_xstate_to_kernel(struct membuf to, struct fpu *fpu);
+int copy_kernel_to_xstate(struct fpu *fpu, const void *kbuf);
+int copy_user_to_xstate(struct fpu *fpu, const void __user *ubuf);
+void copy_supervisor_to_kernel(struct fpu *fpu);
 void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask);
 void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask);
 
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 4c4d9059ff36..5e13e58d11d4 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -85,7 +85,7 @@ int xstateregs_get(struct task_struct *target, const struct 
user_regset *regset,
fpu__prepare_read(fpu);
 
if (using_compacted_format()) {
-   copy_xstate_to_kernel(to, xsave);
+   copy_xstate_to_kernel(to, fpu);
return 0;
} else {
fpstate_sanitize_xstate(fpu);
@@ -126,9 +126,9 @@ int xstateregs_set(struct task_struct *target, const struct 
user_regset *regset,
 
if (using_compacted_format()) {
if (kbuf)
-   ret = copy_kernel_to_xstate(xsave, kbuf);
+   ret = copy_kernel_to_xstate(fpu, kbuf);
else
-   ret = copy_user_to_xstate(xsave, ubuf);
+   ret = copy_user_to_xstate(fpu, ubuf);
} else {
ret = user_regset_copyin(, , , , xsave, 0, 
-1);
if (!ret)
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..60676eef41a8 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -212,11 +212,11 @@ int copy_fpstate_to_sigframe(void __user *buf, void 
__user *buf_fx, int size)
 }
 
 static inline void
-sanitize_restored_user_xstate(union fpregs_state *state,
+sanitize_restored_user_xstate(struct fpu *fpu,
  struct user_i387_ia32_struct *ia32_env,
  u64 user_xfeatures, int fx_only)
 {
-   struct xregs_state *xsave = >xsave;
+   struct xregs_state *xsave = >state.xsave;
struct xstate_header *header = >header;
 
if (use_xsave()) {
@@ -253,7 +253,7 @@ sanitize_restored_user_xstate(union fpregs_state *state,
xsave->i387.mxcsr &= mxcsr_feature_mask;
 
if (ia32_env)
-   convert_to_fxsr(>fxsave, ia32_env);
+   convert_to_fxsr(>state.fxsave, ia32_env);
}
 }
 
@@ -396,7 +396,7 @@ static int __fpu__restore_sig(void __user *buf, void __user 
*buf_fx, int size)
 * current supervisor states first and invalidate the FPU regs.
 */
if (xfeatures_mask_supervisor())
-   copy_supervisor_to_kernel(>state.xsave);
+   copy_supervisor_to_kernel(fpu);
set_thread_flag(TIF_NEED_FPU_LOAD);
}
__fpu_invalidate_fpregs_state(fpu);
@@ -406,18 +406,18 @@ static int __fpu__restore_sig(void __user *buf, void 
__user *buf_fx, int size)
u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
 
if (using_compacted_format()) {
-   ret = copy_user_to_xstate(>state.xsave, buf_fx);
+   ret = copy_user_to_xstate(fpu, buf_fx);
} else {
ret = __copy_from_user(>state.xsave, buf_fx, 
state_size);
 
if (!ret && state_size > offsetof(struct xregs_state, 
header))
ret = 
validate_user_xstate_header(>state.xsave.header);
+
}
if (ret)
goto err_out;
 
-  

[PATCH v2 13/22] x86/fpu/xstate: Expand dynamic user state area on first use

2020-11-19 Thread Chang S. Bae
Intel's Extended Feature Disable (XFD) feature is an extension of the XSAVE
architecture. XFD allows the kernel to enable a feature state in XCR0 and
to receive a #NM trap when a task uses instructions accessing that state.
In this way, Linux can allocate the large task->fpu buffer only for tasks
that use it.

XFD introduces two MSRs: IA32_XFD to enable/disable the feature and
IA32_XFD_ERR to assist the #NM trap handler. Both use the same
state-component bitmap format, used by XCR0.

Use this hardware capability to find the right time to expand the xstate
area. Introduce two sets of helper functions for that:

1. The first set is primarily for interacting with the XFD hardware
   feature. Helpers for configuring disablement, e.g. in context switching,
   are:
xdisable_setbits()
xdisable_getbits()
xdisable_switch()

2. The second set is for managing the first-use status and handling #NM
   trap:
xfirstuse_enabled()
xfirstuse_not_detected()

The #NM handler induces the xstate area expansion to save the first-used
states.

No functional change until the kernel enables dynamic user states and XFD.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Change from v1:
* Inlined the XFD-induced #NM handling code (Andy Lutomirski)
---
 arch/x86/include/asm/cpufeatures.h  |  1 +
 arch/x86/include/asm/fpu/internal.h | 51 -
 arch/x86/include/asm/msr-index.h|  2 ++
 arch/x86/kernel/fpu/xstate.c| 34 +--
 arch/x86/kernel/process.c   |  5 +++
 arch/x86/kernel/process_32.c|  2 +-
 arch/x86/kernel/process_64.c|  2 +-
 arch/x86/kernel/traps.c | 40 ++
 8 files changed, 131 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index dad350d42ecf..5b6496ee3703 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -275,6 +275,7 @@
 #define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
 #define X86_FEATURE_XGETBV1(10*32+ 2) /* XGETBV with ECX = 1 
instruction */
 #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS 
instructions */
+#define X86_FEATURE_XFD(10*32+ 4) /* eXtended Feature 
Disabling */
 
 /*
  * Extended auxiliary flags: Linux defined - for features scattered in various
diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 75196d10aa71..ede3f88bdc52 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -573,11 +573,58 @@ static inline void switch_fpu_prepare(struct fpu 
*old_fpu, int cpu)
  * Misc helper functions:
  */
 
+/* The first-use detection helpers: */
+
+static inline void xdisable_setbits(u64 value)
+{
+   wrmsrl_safe(MSR_IA32_XFD, value);
+}
+
+static inline u64 xdisable_getbits(void)
+{
+   u64 value;
+
+   rdmsrl_safe(MSR_IA32_XFD, );
+   return value;
+}
+
+static inline u64 xfirstuse_enabled(void)
+{
+   /* All the dynamic user components are first-use enabled. */
+   return xfeatures_mask_user_dynamic;
+}
+
+/*
+ * Convert fpu->firstuse_bv to xdisable configuration in MSR IA32_XFD.
+ * Only xdisable_setbits() uses this.
+ */
+static inline u64 xfirstuse_not_detected(struct fpu *fpu)
+{
+   u64 firstuse_bv = (fpu->state_mask & xfirstuse_enabled());
+
+   /*
+* If first-use is not detected, set the bit. If the detection is
+* not enabled, the bit is always zero in firstuse_bv. So, make
+* following conversion:
+*/
+   return  (xfirstuse_enabled() ^ firstuse_bv);
+}
+
+/* Update MSR IA32_XFD based on fpu->firstuse_bv */
+static inline void xdisable_switch(struct fpu *prev, struct fpu *next)
+{
+   if (!static_cpu_has(X86_FEATURE_XFD) || !xfirstuse_enabled())
+   return;
+
+   if (unlikely(prev->state_mask != next->state_mask))
+   xdisable_setbits(xfirstuse_not_detected(next));
+}
+
 /*
  * Load PKRU from the FPU context if available. Delay loading of the
  * complete FPU state until the return to userland.
  */
-static inline void switch_fpu_finish(struct fpu *new_fpu)
+static inline void switch_fpu_finish(struct fpu *old_fpu, struct fpu *new_fpu)
 {
u32 pkru_val = init_pkru_value;
struct pkru_state *pk;
@@ -587,6 +634,8 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
 
set_thread_flag(TIF_NEED_FPU_LOAD);
 
+   xdisable_switch(old_fpu, new_fpu);
+
if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return;
 
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 972a34d93505..f8b5f9b3c845 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -617,6 +617,8 @@
 #define MSR_IA32_BNDCFGS_RSVD 

[PATCH v2 22/22] x86/fpu/xstate: Introduce boot-parameters for control some state component support

2020-11-19 Thread Chang S. Bae
Rename XFEATURE_MASK_USER_SUPPORTED to XFEATURE_MASK_USER_ENABLED to
literally align with new boot-parameters.

"xstate.disable=0x6" will disable AMX on a system that has AMX compiled
into XFEATURE_MASK_USER_ENABLED.

"xstate.enable=0x6" will enable AMX on a system that does NOT have AMX
compiled into XFEATURE_MASK_USER_ENABLED (assuming the kernel is new enough
to support this feature).

While this cmdline is currently enabled only for AMX, it is intended to be
easily enabled to be useful for future XSAVE-enabled features.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v1:
* Renamed the user state mask define (Andy Lutomirski and Dave Hansen)
* Changed the error message (Dave Hansen)
* Fixed xfeatures_mask_user()
* Rebased the upstream kernel (5.10) -- revived the param parse function
---
 .../admin-guide/kernel-parameters.txt | 15 
 arch/x86/include/asm/fpu/types.h  |  6 ++
 arch/x86/include/asm/fpu/xstate.h | 24 +++---
 arch/x86/kernel/fpu/init.c| 73 +--
 4 files changed, 101 insertions(+), 17 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 526d65d8573a..c41528cfe39f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5995,6 +5995,21 @@
which allow the hypervisor to 'idle' the guest on lock
contention.
 
+   xstate.enable=  [X86-64]
+   xstate.disable= [X86-64]
+   The kernel is compiled with a default xstate bitmask --
+   enabling it to use the XSAVE hardware to efficiently
+   save and restore thread states on context switch.
+   xstate.enable allows adding to that default mask at
+   boot-time without recompiling the kernel just to support
+   the new thread state. (Note that the kernel will ignore
+   any bits in the mask that do not correspond to features
+   that are actually available in CPUID)  xstate.disable
+   allows clearing bits in the default mask, forcing the
+   kernel to forget that it supports the specified thread
+   state. When a bit is set in both, the kernel gives
+   xstate.disable priority.
+
xirc2ps_cs= [NET,PCMCIA]
Format:

,[,[,[,]]]
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 18eb50fc95e8..ababb748cc8e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -149,6 +149,12 @@ enum xfeature {
 #define XFEATURE_MASK_XTILE(XFEATURE_MASK_XTILE_DATA \
 | XFEATURE_MASK_XTILE_CFG)
 
+#define XFEATURE_REGION_MASK(max_bit, min_bit) \
+   ((BIT_ULL((max_bit) - (min_bit) + 1) - 1) << (min_bit))
+
+#define XFEATURE_MASK_CONFIGURABLE \
+   XFEATURE_REGION_MASK(XFEATURE_XTILE_DATA, XFEATURE_XTILE_CFG)
+
 #define FIRST_EXTENDED_XFEATUREXFEATURE_YMM
 
 struct reg_128_bit {
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 1544a874b748..683a8503c1c6 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -25,17 +25,17 @@
 
 #define XSAVE_ALIGNMENT 64
 
-/* All currently supported user features */
-#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \
- XFEATURE_MASK_SSE | \
- XFEATURE_MASK_YMM | \
- XFEATURE_MASK_OPMASK | \
- XFEATURE_MASK_ZMM_Hi256 | \
- XFEATURE_MASK_Hi16_ZMM | \
- XFEATURE_MASK_PKRU | \
- XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR | \
- XFEATURE_MASK_XTILE)
+/* All currently enabled user features */
+#define XFEATURE_MASK_USER_ENABLED (XFEATURE_MASK_FP | \
+   XFEATURE_MASK_SSE | \
+   XFEATURE_MASK_YMM | \
+   XFEATURE_MASK_OPMASK | \
+   XFEATURE_MASK_ZMM_Hi256 | \
+   XFEATURE_MASK_Hi16_ZMM   | \
+   XFEATURE_MASK_PKRU | \
+   XFEATURE_MASK_BNDREGS | \
+   XFEATURE_MASK_BNDCSR | \
+   XFEATURE_MASK_XTILE)
 
 /* All curr

[PATCH v2 01/22] x86/fpu/xstate: Modify area init helper prototypes to access all the possible areas

2020-11-19 Thread Chang S. Bae
The xstate infrastructure is not flexible enough to support dynamic areas in
task->fpu. Change the fpstate_init() prototype to access task->fpu
directly. It treats a null pointer as indicating init_fpstate, as this
initial data does not belong to any task. For the compacted format,
fpstate_init_xstate() now accepts the state component bitmap to configure
XCOMP_BV.

No functional change.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
 arch/x86/include/asm/fpu/internal.h |  6 +++---
 arch/x86/kernel/fpu/core.c  | 14 +++---
 arch/x86/kernel/fpu/init.c  |  2 +-
 arch/x86/kernel/fpu/regset.c|  2 +-
 arch/x86/kernel/fpu/xstate.c|  3 +--
 arch/x86/kvm/x86.c  |  2 +-
 6 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 8d33ad80704f..d81d8c407dc0 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -80,20 +80,20 @@ static __always_inline __pure bool use_fxsr(void)
 
 extern union fpregs_state init_fpstate;
 
-extern void fpstate_init(union fpregs_state *state);
+extern void fpstate_init(struct fpu *fpu);
 #ifdef CONFIG_MATH_EMULATION
 extern void fpstate_init_soft(struct swregs_state *soft);
 #else
 static inline void fpstate_init_soft(struct swregs_state *soft) {}
 #endif
 
-static inline void fpstate_init_xstate(struct xregs_state *xsave)
+static inline void fpstate_init_xstate(struct xregs_state *xsave, u64 
xcomp_mask)
 {
/*
 * XRSTORS requires these bits set in xcomp_bv, or it will
 * trigger #GP:
 */
-   xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
+   xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xcomp_mask;
 }
 
 static inline void fpstate_init_fxstate(struct fxregs_state *fx)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index eb86a2b831b1..41d926c76615 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -191,8 +191,16 @@ static inline void fpstate_init_fstate(struct fregs_state 
*fp)
fp->fos = 0xu;
 }
 
-void fpstate_init(union fpregs_state *state)
+/* If a null pointer is given, assume to take the initial FPU state, 
init_fpstate. */
+void fpstate_init(struct fpu *fpu)
 {
+   union fpregs_state *state;
+
+   if (fpu)
+   state = >state;
+   else
+   state = _fpstate;
+
if (!static_cpu_has(X86_FEATURE_FPU)) {
fpstate_init_soft(>soft);
return;
@@ -201,7 +209,7 @@ void fpstate_init(union fpregs_state *state)
memset(state, 0, fpu_kernel_xstate_size);
 
if (static_cpu_has(X86_FEATURE_XSAVES))
-   fpstate_init_xstate(>xsave);
+   fpstate_init_xstate(>xsave, xfeatures_mask_all);
if (static_cpu_has(X86_FEATURE_FXSR))
fpstate_init_fxstate(>fxsave);
else
@@ -261,7 +269,7 @@ static void fpu__initialize(struct fpu *fpu)
WARN_ON_FPU(fpu != >thread.fpu);
 
set_thread_flag(TIF_NEED_FPU_LOAD);
-   fpstate_init(>state);
+   fpstate_init(fpu);
trace_x86_fpu_init_state(fpu);
 }
 
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 701f196d7c68..74e03e3bc20f 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -124,7 +124,7 @@ static void __init fpu__init_system_generic(void)
 * Set up the legacy init FPU context. (xstate init might overwrite this
 * with a more modern format, if the CPU supports it.)
 */
-   fpstate_init(_fpstate);
+   fpstate_init(NULL);
 
fpu__init_system_mxcsr();
 }
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c413756ba89f..4c4d9059ff36 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -144,7 +144,7 @@ int xstateregs_set(struct task_struct *target, const struct 
user_regset *regset,
 * In case of failure, mark all states as init:
 */
if (ret)
-   fpstate_init(>state);
+   fpstate_init(fpu);
 
return ret;
 }
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 5d8047441a0a..1a3e5effe0fa 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -457,8 +457,7 @@ static void __init setup_init_fpu_buf(void)
print_xstate_features();
 
if (boot_cpu_has(X86_FEATURE_XSAVES))
-   init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
-xfeatures_mask_all;
+   fpstate_init_xstate(_fpstate.xsave, xfeatures_mask_all);
 
/*
 * Init all the features state with header.xfeatures being 0x0
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm

[PATCH v2 15/22] x86/fpu/xstate: Support ptracer-induced xstate area expansion

2020-11-19 Thread Chang S. Bae
ptrace() may request an update to task->fpu that has not yet been
allocated. Detect this case and allocate task->fpu to support the request.
Also, disable the (now unnecessary) associated first-use fault.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/kernel/fpu/regset.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 8d863240b9c6..6b9d0c0a266d 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -125,6 +125,35 @@ int xstateregs_set(struct task_struct *target, const 
struct user_regset *regset,
 
xsave = __xsave(fpu);
 
+   /*
+* When a ptracer attempts to write any state in task->fpu but not 
allocated,
+* it dynamically expands the xstate area of fpu->state_ptr.
+*/
+   if (count > get_xstate_size(fpu->state_mask)) {
+   unsigned int offset, size;
+   struct xstate_header hdr;
+   u64 mask;
+
+   offset = offsetof(struct xregs_state, header);
+   size = sizeof(hdr);
+
+   /* Retrieve XSTATE_BV */
+   if (kbuf) {
+   memcpy(, kbuf + offset, size);
+   } else {
+   ret = __copy_from_user(, ubuf + offset, size);
+   if (ret)
+   return ret;
+   }
+
+   mask = hdr.xfeatures & xfeatures_mask_user_dynamic;
+   if (!mask) {
+   ret = alloc_xstate_area(fpu, mask, NULL);
+   if (ret)
+   return ret;
+   }
+   }
+
fpu__prepare_write(fpu);
 
if (using_compacted_format()) {
-- 
2.17.1



[PATCH v2 18/22] x86/fpu/amx: Define AMX state components and have it used for boot-time checks

2020-11-19 Thread Chang S. Bae
Linux uses check_xstate_against_struct() to sanity check the size of
XSTATE-enabled features. AMX is an XSAVE-enabled feature, and its size is
not hard-coded but discoverable at run-time via CPUID.

The AMX state is composed of state components 17 and 18, which are all user
state components. The first component is the XTILECFG state of a 64-byte
tile-related control register. The state component 18, called XTILEDATA,
contains the actual tile data, and the state size varies on
implementations. The architectural maximum, as defined in the CPUID(0x1d,
1): EAX[15:0], is a byte less than 64KB. The first implementation supports
8KB.

Check the XTILEDATA state size dynamically. The feature introduces the new
tile register, TMM. Define one register struct only and read the number of
registers from CPUID. Cross-check the overall size with CPUID again.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v1:
* Rebased on the upstream kernel (5.10)
---
 arch/x86/include/asm/fpu/types.h  | 27 +
 arch/x86/include/asm/fpu/xstate.h |  2 +
 arch/x86/kernel/fpu/xstate.c  | 64 +++
 3 files changed, 93 insertions(+)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index a91b4f4df8c8..18eb50fc95e8 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -120,6 +120,9 @@ enum xfeature {
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
+   XFEATURE_RSRVD_COMP_16,
+   XFEATURE_XTILE_CFG,
+   XFEATURE_XTILE_DATA,
 
XFEATURE_MAX,
 };
@@ -136,11 +139,15 @@ enum xfeature {
 #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
 #define XFEATURE_MASK_PASID(1 << XFEATURE_PASID)
 #define XFEATURE_MASK_LBR  (1 << XFEATURE_LBR)
+#define XFEATURE_MASK_XTILE_CFG(1 << XFEATURE_XTILE_CFG)
+#define XFEATURE_MASK_XTILE_DATA   (1 << XFEATURE_XTILE_DATA)
 
 #define XFEATURE_MASK_FPSSE(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
 #define XFEATURE_MASK_AVX512   (XFEATURE_MASK_OPMASK \
 | XFEATURE_MASK_ZMM_Hi256 \
 | XFEATURE_MASK_Hi16_ZMM)
+#define XFEATURE_MASK_XTILE(XFEATURE_MASK_XTILE_DATA \
+| XFEATURE_MASK_XTILE_CFG)
 
 #define FIRST_EXTENDED_XFEATUREXFEATURE_YMM
 
@@ -153,6 +160,9 @@ struct reg_256_bit {
 struct reg_512_bit {
u8  regbytes[512/8];
 };
+struct reg_1024_byte {
+   u8  regbytes[1024];
+};
 
 /*
  * State component 2:
@@ -255,6 +265,23 @@ struct arch_lbr_state {
u64 ler_to;
u64 ler_info;
struct lbr_entryentries[];
+};
+
+/*
+ * State component 17: 64-byte tile configuration register.
+ */
+struct xtile_cfg {
+   u64 tcfg[8];
+} __packed;
+
+/*
+ * State component 18: 1KB tile data register.
+ * Each register represents 16 64-byte rows of the matrix
+ * data. But the number of registers depends on the actual
+ * implementation.
+ */
+struct xtile_data {
+   struct reg_1024_bytetmm;
 } __packed;
 
 /*
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index cc159bc9386d..d2ad69230d0e 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -13,6 +13,8 @@
 
 #define XSTATE_CPUID   0x000d
 
+#define TILE_CPUID 0x001d
+
 #define FXSAVE_SIZE512
 
 #define XSAVE_HDR_SIZE 64
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index ebc89009d6bc..61f1dd3e6721 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -41,6 +41,14 @@ static const char *xfeature_names[] =
"Protection Keys User registers",
"PASID state",
"unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "unknown xstate feature",
+   "AMX Tile config"   ,
+   "AMX Tile data" ,
+   "unknown xstate feature",
 };
 
 struct xfeature_capflag_info {
@@ -60,6 +68,8 @@ static struct xfeature_capflag_info xfeature_capflags[] 
__initdata = {
{ XFEATURE_PT_UNIMPLEMENTED_SO_FAR, X86_FEATURE_INTEL_PT },
{ XFEATURE_PKRU,X86_FEATURE_PKU },
{ XFEATURE_PASID,   X86_FEATURE_ENQCMD },
+   { XFEATURE_XTILE_CFG,   X86_FEATURE_AMX_TILE },
+   { XFEATURE_XTILE_DATA,  X86_FEATURE_AMX_TILE }
 };
 
 /*
@@ -421,6 +431,8 @@ static v

[PATCH v2 07/22] x86/fpu/xstate: Introduce helpers to manage an xstate area dynamically

2020-11-19 Thread Chang S. Bae
task->fpu has a buffer to keep the extended register states, but it is not
expandable at runtime. Introduce runtime methods and new fpu struct fields
to support the expansion.

fpu->state_mask indicates the saved states per task and fpu->state_ptr
points to the dynamically allocated area.

alloc_xstate_area() uses vmalloc() for its scalability. However, set a
threshold (64KB) to watch out for a potential need for an alternative
mechanism.

Also, introduce a new helper -- get_xstate_size() to calculate the area
size.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
Changes from v1:
* Removed unneeded interrupt masking (Andy Lutomirski)
* Added vmalloc() error tracing (Dave Hansen, PeterZ, and Andy Lutomirski)
---
 arch/x86/include/asm/fpu/types.h  |  29 +--
 arch/x86/include/asm/fpu/xstate.h |   3 +
 arch/x86/include/asm/trace/fpu.h  |   5 ++
 arch/x86/kernel/fpu/core.c|   3 +
 arch/x86/kernel/fpu/xstate.c  | 121 ++
 5 files changed, 156 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index f5a38a5f3ae1..a91b4f4df8c8 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -336,14 +336,33 @@ struct fpu {
 */
unsigned long   avx512_timestamp;
 
+   /*
+* @state_mask:
+*
+* The state component bitmap. It indicates the saved xstate in
+* either @state or @state_ptr. The map value starts to be aligned
+* with @state and then with @state_ptr once it is in use.
+*/
+   u64 state_mask;
+
+   /*
+* @state_ptr:
+*
+* Copy of all extended register states, in a dynamically-allocated
+* area, we save and restore over context switches. When a task is
+* using extended features, the register state is always the most
+* current. This state copy is more recent than @state. If the task
+* context-switches away, they get saved here, representing the xstate.
+*/
+   union fpregs_state  *state_ptr;
+
/*
 * @state:
 *
-* In-memory copy of all FPU registers that we save/restore
-* over context switches. If the task is using the FPU then
-* the registers in the FPU are more recent than this state
-* copy. If the task context-switches away then they get
-* saved here and represent the FPU state.
+* Copy of some extended register state that we save and restore
+* over context switches. If a task uses a dynamically-allocated
+* area, @state_ptr, then it has a more recent state copy than this.
+* This copy follows the same attributes as described for @state_ptr.
 */
union fpregs_state  state;
/*
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 6ce8350672c2..49020c745eb6 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -103,6 +103,9 @@ extern void __init update_regset_xstate_info(unsigned int 
size,
 u64 xstate_mask);
 
 void *get_xsave_addr(struct fpu *fpu, int xfeature_nr);
+int alloc_xstate_area(struct fpu *fpu, u64 mask, unsigned int *alloc_size);
+void free_xstate_area(struct fpu *fpu);
+
 const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
 int xfeature_size(int xfeature_nr);
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 879b77792f94..bf88b873 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -89,6 +89,11 @@ DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed,
TP_ARGS(fpu)
 );
 
+DEFINE_EVENT(x86_fpu, x86_fpu_xstate_alloc_failed,
+   TP_PROTO(struct fpu *fpu),
+   TP_ARGS(fpu)
+);
+
 #undef TRACE_INCLUDE_PATH
 #define TRACE_INCLUDE_PATH asm/trace/
 #undef TRACE_INCLUDE_FILE
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 875620fdfe61..e25f7866800e 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -235,6 +235,9 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
 */
memset(_fpu->state.xsave, 0, fpu_kernel_xstate_default_size);
 
+   dst_fpu->state_mask = xfeatures_mask_all & ~xfeatures_mask_user_dynamic;
+   dst_fpu->state_ptr = NULL;
+
/*
 * If the FPU registers are not current just memcpy() the state.
 * Otherwise save current FPU registers directly into the child's FPU
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 297eaefce589..7ec856668717 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/x

[PATCH v2 10/22] x86/fpu/xstate: Update xstate save function for supporting dynamic user xstate

2020-11-19 Thread Chang S. Bae
copy_xregs_to_kernel() used to save all user states in an invariably
sufficient buffer. When dynamic user states are enabled, which states are
saved becomes conditional.

fpu->state_mask can indicate which state components are reserved to be
saved in XSAVE buffer. Use it as XSAVE's instruction mask to select states.

KVM saves xstate in guest_fpu and user_fpu. With the change, the KVM code
needs to ensure a valid fpu->state_mask before XSAVE.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
 arch/x86/include/asm/fpu/internal.h |  3 +--
 arch/x86/kernel/fpu/core.c  |  2 +-
 arch/x86/kvm/x86.c  | 11 ---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 3201468ff4aa..75196d10aa71 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -332,9 +332,8 @@ static inline void copy_kernel_to_xregs_booting(struct 
xregs_state *xstate)
 /*
  * Save processor xstate to xsave area.
  */
-static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
+static inline void copy_xregs_to_kernel(struct xregs_state *xstate, u64 mask)
 {
-   u64 mask = xfeatures_mask_all;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index dca4961fcc36..ece6428ba85b 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -99,7 +99,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu)
if (likely(use_xsave())) {
struct xregs_state *xsave = >xsave;
 
-   copy_xregs_to_kernel(xsave);
+   copy_xregs_to_kernel(xsave, fpu->state_mask);
 
/*
 * AVX512 state is tracked here because its use is
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index abd5ff338155..023db770b55f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9212,15 +9212,20 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 
 static void kvm_save_current_fpu(struct fpu *fpu)
 {
+   struct fpu *src_fpu = >thread.fpu;
+
/*
 * If the target FPU state is not resident in the CPU registers, just
 * memcpy() from current, else save CPU state directly to the target.
 */
-   if (test_thread_flag(TIF_NEED_FPU_LOAD))
-   memcpy(>state, >thread.fpu.state,
+   if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
+   memcpy(>state, _fpu->state,
   fpu_kernel_xstate_default_size);
-   else
+   } else {
+   if (fpu->state_mask != src_fpu->state_mask)
+   fpu->state_mask = src_fpu->state_mask;
copy_fpregs_to_fpstate(fpu);
+   }
 }
 
 /* Swap (qemu) user FPU context for the guest FPU context. */
-- 
2.17.1



[PATCH v2 20/22] selftest/x86/amx: Include test cases for the AMX state management

2020-11-19 Thread Chang S. Bae
This selftest exercises the kernel's ability to inherit and context switch
AMX state, by verifying that tasks retain unique data when creating a child
process and between multiple threads.

Also, ptrace() is used to insert AMX state into existing threads -- both
before and after the existing thread has initialized its AMX state.

Collect the test cases of validating those operations together, as they
share some common setup for the AMX state.

These test cases do not depend on AMX compiler support, as they employ
user-space-XSAVE directly to access AMX state.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-kselft...@vger.kernel.org
---
Changes from v1:
* Removed signal testing code
---
 tools/testing/selftests/x86/Makefile |   2 +-
 tools/testing/selftests/x86/amx.c| 647 +++
 2 files changed, 648 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/amx.c

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index 6703c7906b71..8408bbde788f 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -17,7 +17,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs 
syscall_nt test_mremap
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
-TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering
+TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering amx
 # Some selftests require 32bit support enabled also on 64bit systems
 TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
 
diff --git a/tools/testing/selftests/x86/amx.c 
b/tools/testing/selftests/x86/amx.c
new file mode 100644
index ..dce3b298f043
--- /dev/null
+++ b/tools/testing/selftests/x86/amx.c
@@ -0,0 +1,647 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#ifndef __x86_64__
+# error This test is 64-bit only
+#endif
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#define PAGE_SIZE  (1 << 12)
+
+#define NUM_TILES  8
+#define TILE_SIZE  1024
+#define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE)
+
+struct xsave_data {
+   u8 area[XSAVE_SIZE];
+} __attribute__((aligned(64)));
+
+/* Tile configuration associated: */
+#define MAX_TILES  16
+#define RESERVED_BYTES 14
+
+struct tile_config {
+   u8  palette_id;
+   u8  start_row;
+   u8  reserved[RESERVED_BYTES];
+   u16 colsb[MAX_TILES];
+   u8  rows[MAX_TILES];
+};
+
+struct tile_data {
+   u8 data[NUM_TILES * TILE_SIZE];
+};
+
+static inline u64 __xgetbv(u32 index)
+{
+   u32 eax, edx;
+
+   asm volatile(".byte 0x0f,0x01,0xd0"
+: "=a" (eax), "=d" (edx)
+: "c" (index));
+   return eax + ((u64)edx << 32);
+}
+
+static inline void __cpuid(u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+{
+   asm volatile("cpuid;"
+: "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
+: "0" (*eax), "2" (*ecx));
+}
+
+/* Load tile configuration */
+static inline void __ldtilecfg(void *cfg)
+{
+   asm volatile(".byte 0xc4,0xe2,0x78,0x49,0x00"
+: : "a"(cfg));
+}
+
+/* Load tile data to %tmm0 register only */
+static inline void __tileloadd(void *tile)
+{
+   asm volatile(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10"
+: : "a"(tile), "d"(0));
+}
+
+/* Save extended states */
+static inline void __xsave(void *area, u32 lo, u32 hi)
+{
+   asm volatile(".byte 0x48,0x0f,0xae,0x27"
+: : "D" (area), "a" (lo), "d" (hi)
+: "memory");
+}
+
+/* Restore extended states */
+static inline void __xrstor(void *area, u32 lo, u32 hi)
+{
+   asm volatile(".byte 0x48,0x0f,0xae,0x2f"
+: : "D" (area), "a" (lo), "d" (hi));
+}
+
+/* Release tile states to init values */
+static inline void __tilerelease(void)
+{
+   asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::);
+}
+
+/* Hardware info check: */
+
+static inline bool check_xsave_supports_xtile(void)
+{
+   u32 eax, ebx, ecx, edx;
+   bool available = false;
+
+#define XSAVE_CPUI

[PATCH v2 12/22] x86/fpu/xstate: Update xstate context copy function for supporting dynamic area

2020-11-19 Thread Chang S. Bae
There are xstate context copy functions that are used in ptrace() and signal
return paths. They serve callers to read (or write) xstate values in the
task->fpu's buffer or to get initial values. With dynamic user states, a
component's position in the buffer may vary and the initial value is not
always stored in init_fpstate.

Change the helpers to find a component's offset accordingly (either lookup
table or calculation).

When copying an initial value, explicitly check the init_fpstate coverage.
If not found, reset the memory in the destination. Otherwise, copy values
from init_fpstate.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/kernel/fpu/xstate.c | 55 +++-
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 0c0be0952194..bd7c4135c1d0 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -248,12 +248,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
if (!(xfeatures & XFEATURE_MASK_SSE))
memset(>xmm_space[0], 0, 256);
 
+   /* Make sure 'xfeatures' to be a subset of fpu->state_mask */
+   xfeatures = ((xfeatures_mask_user() & fpu->state_mask) & ~xfeatures);
/*
 * First two features are FPU and SSE, which above we handled
 * in a special way already:
 */
feature_bit = 0x2;
-   xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2;
+   xfeatures >>= 0x2;
 
/*
 * Update all the remaining memory layouts according to their
@@ -262,12 +264,15 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
 */
while (xfeatures) {
if (xfeatures & 0x1) {
-   int offset = xstate_comp_offsets[feature_bit];
-   int size = xstate_sizes[feature_bit];
-
-   memcpy((void *)xsave + offset,
-  (void *)_fpstate.xsave + offset,
-  size);
+   unsigned int offset = get_xstate_comp_offset(fpu, 
feature_bit);
+   unsigned int size = xstate_sizes[feature_bit];
+
+   if (get_init_fpstate_mask() & BIT_ULL(feature_bit))
+   memcpy((void *)xsave + offset,
+  (void *)_fpstate.xsave + offset,
+  size);
+   else
+   memset((void *)xsave + offset, 0, size);
}
 
xfeatures >>= 1;
@@ -1239,7 +1244,10 @@ static void fill_gap(struct membuf *to, unsigned *last, 
unsigned offset)
 {
if (*last >= offset)
return;
-   membuf_write(to, (void *)_fpstate.xsave + *last, offset - *last);
+   if (offset <= get_init_fpstate_size())
+   membuf_write(to, (void *)_fpstate.xsave + *last, offset - 
*last);
+   else
+   membuf_zero(to, offset - *last);
*last = offset;
 }
 
@@ -1247,7 +1255,10 @@ static void copy_part(struct membuf *to, unsigned *last, 
unsigned offset,
  unsigned size, void *from)
 {
fill_gap(to, last, offset);
-   membuf_write(to, from, size);
+   if (from)
+   membuf_write(to, from, size);
+   else
+   membuf_zero(to, size);
*last = offset + size;
 }
 
@@ -1299,12 +1310,22 @@ void copy_xstate_to_kernel(struct membuf to, struct fpu 
*fpu)
  sizeof(header), );
 
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+   u64 mask = BIT_ULL(i);
+   void *src;
/*
-* Copy only in-use xstates:
+* Copy only in-use xstate at first. If the feature is enabled,
+* find the init value, whether stored in init_fpstate or simply
+* zeros, and then copy them.
 */
-   if ((header.xfeatures >> i) & 1) {
-   void *src = __raw_xsave_addr(fpu, i);
-
+   if (header.xfeatures & mask) {
+   src = __raw_xsave_addr(fpu, i);
+   copy_part(, , xstate_offsets[i],
+ xstate_sizes[i], src);
+   } else if (xfeatures_mask_user() & mask) {
+   if (get_init_fpstate_mask() & mask)
+   src = (void *)_fpstate.xsave + last;
+   else
+   src = NULL;
copy_part(, , xstate_offsets[i],
  xstate_sizes[i], src);
}
@@ -1338,6 +1359,9 @@ int copy_kernel_to_xstate(struct fpu *fpu, const void 

[PATCH v2 06/22] x86/fpu/xstate: Outline dynamic xstate area size in the task context

2020-11-19 Thread Chang S. Bae
The xstate area size in task->fpu used to be fixed at runtime. To
accommodate dynamic user states, introduce variables for representing the
maximum and default (as minimum) area sizes.

do_extra_xstate_size_checks() is ready to calculate both sizes, which can
be compared with CPUID. CPUID can immediately provide the maximum size. The
code needs to rewrite XCR0 registers to get the default size that excludes
the dynamic parts. It is not always straightforward especially when
inter-dependency exists between state component bits. To make it simple,
the code double-checks the maximum size only.

No functional change as long as the kernel does not support the dynamic
area.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: k...@vger.kernel.org
---
 arch/x86/include/asm/processor.h | 10 ++-
 arch/x86/kernel/fpu/core.c   |  6 ++--
 arch/x86/kernel/fpu/init.c   | 33 --
 arch/x86/kernel/fpu/signal.c |  2 +-
 arch/x86/kernel/fpu/xstate.c | 48 +---
 arch/x86/kernel/process.c|  6 
 arch/x86/kvm/x86.c   |  2 +-
 7 files changed, 65 insertions(+), 42 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 82a08b585818..d78069ca4b8d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -477,7 +477,8 @@ DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
 #endif /* X86_64 */
 
-extern unsigned int fpu_kernel_xstate_size;
+extern unsigned int fpu_kernel_xstate_default_size;
+extern unsigned int fpu_kernel_xstate_max_size;
 extern unsigned int fpu_user_xstate_size;
 
 struct perf_event;
@@ -545,12 +546,7 @@ struct thread_struct {
 };
 
 /* Whitelist the FPU state from the task_struct for hardened usercopy. */
-static inline void arch_thread_struct_whitelist(unsigned long *offset,
-   unsigned long *size)
-{
-   *offset = offsetof(struct thread_struct, fpu.state);
-   *size = fpu_kernel_xstate_size;
-}
+extern void arch_thread_struct_whitelist(unsigned long *offset, unsigned long 
*size);
 
 /*
  * Thread-synchronous status.
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 39ddb22c143b..875620fdfe61 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -206,7 +206,7 @@ void fpstate_init(struct fpu *fpu)
return;
}
 
-   memset(state, 0, fpu_kernel_xstate_size);
+   memset(state, 0, fpu_kernel_xstate_default_size);
 
if (static_cpu_has(X86_FEATURE_XSAVES))
fpstate_init_xstate(>xsave, xfeatures_mask_all);
@@ -233,7 +233,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
 * Don't let 'init optimized' areas of the XSAVE area
 * leak into the child task:
 */
-   memset(_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+   memset(_fpu->state.xsave, 0, fpu_kernel_xstate_default_size);
 
/*
 * If the FPU registers are not current just memcpy() the state.
@@ -245,7 +245,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct 
*src)
 */
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
-   memcpy(_fpu->state, _fpu->state, 
fpu_kernel_xstate_size);
+   memcpy(_fpu->state, _fpu->state, 
fpu_kernel_xstate_default_size);
 
else if (!copy_fpregs_to_fpstate(dst_fpu))
copy_kernel_to_fpregs(dst_fpu);
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 74e03e3bc20f..5e217bd6e85a 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -130,13 +130,17 @@ static void __init fpu__init_system_generic(void)
 }
 
 /*
- * Size of the FPU context state. All tasks in the system use the
- * same context size, regardless of what portion they use.
- * This is inherent to the XSAVE architecture which puts all state
- * components into a single, continuous memory block:
+ * Size of the maximum FPU context state. It is inherent to the XSAVE 
architecture
+ * which puts all state components into a single, continuous memory block:
  */
-unsigned int fpu_kernel_xstate_size;
-EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
+unsigned int fpu_kernel_xstate_max_size;
+
+/*
+ * Size of the initial FPU context state. All tasks in the system use this 
context
+ * size by default.
+ */
+unsigned int fpu_kernel_xstate_default_size;
+EXPORT_SYMBOL_GPL(fpu_kernel_xstate_default_size);
 
 /* Get alignment of the TYPE. */
 #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
@@ -166,9 +170,9 @@ static void __init fpu__init_task_struct_size(void)
 
/*
 * Add back the dynamically-calculated register state
-* size.
+* size by default.
 */
-   task_size +

[PATCH v2 00/22] x86: Support Intel Advanced Matrix Extensions

2020-11-19 Thread Chang S. Bae
[ We know there are a lot of Intel patches out there this week. We're
  posting this as early as we can in case anyone has bandwidth to take a
  look.  We don't think these are quite ready to be merged, but any review
  is appreciated. ]

Intel Advanced Matrix Extensions (AMX)[1][2] will be shipping on servers
soon.  AMX consists of configurable TMM "TILE" registers plus new
accelerator instructions that operate on them.  TMUL (Tile matrix MULtiply)
is the first accelerator instruction set to use the new registers, and we
anticipate additional instructions in the future.

Neither AMX state nor TMUL instructions depend on AVX.  However, AMX and
AVX do share common challenges.  The TMM registers are 8KB today, and
architecturally as large as 64KB, which merits updates to hardware and
software state management.

Further, both technologies run faster when they are not simultaneously
running on SMT siblings, and both technologies' use of power and bandwidth
impacts the power and performance available to neighboring cores.  (This
impact has measurably improved in recent hardware.)

If the existing kernel approach for managing XSAVE state was employed to
handle AMX, 8KB space would be added to every task, but possibly rarely
used.  So Linux support is optimized by using a new XSAVE feature: eXtended
Feature Disabling (XFD).  The kernel arms XFD to provide a #NM exception
upon a tasks' first access to TILE state. The kernel exception handler
installs the appropriate XSAVE context switch buffer, and the task behaves
as if the kernel had done that for all tasks.  Using XFD, AMX space is
allocated only when needed, eliminating the memory waste for unused state
components.

This series requires the new minimum sigaltstack support [3] and is based
on the mainline. The series is composed of three parts:
* Patch 01-16: Foundation to support dynamic user state management
* Patch 16-20: AMX enablement, including unit tests
* Patch 21-22: Signal handling optimization and new boot-parameters

Thanks to Len Brown and Dave Hansen for help with the cover letter.

Changes from v1 [4]:
* Added vmalloc() error tracing (Dave Hansen, PeterZ, and Andy Lutomirski)
* Inlined the #NM handling code (Andy Lutomirski)
* Made signal handling optimization revertible
* Revised the new parameter handling code (Andy Lutomirski and Dave Hansen)
* Rebased on the upstream kernel

[1]: Intel Architecture Instruction Set Extension Programming Reference
June 2020, 
https://software.intel.com/content/dam/develop/public/us/en/documents/architecture-instruction-set-extensions-programming-reference.pdf
[2]: 
https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-matrix-extensions-intel-amx-instructions.html
[3]: https://lore.kernel.org/lkml/20201119190237.626-1-chang.seok@intel.com/
[4]: 
https://lore.kernel.org/lkml/20201001203913.9125-1-chang.seok@intel.com/

Chang S. Bae (22):
  x86/fpu/xstate: Modify area init helper prototypes to access all the
possible areas
  x86/fpu/xstate: Modify xstate copy helper prototypes to access all the
possible areas
  x86/fpu/xstate: Modify address finder prototypes to access all the
possible areas
  x86/fpu/xstate: Modify save and restore helper prototypes to access
all the possible areas
  x86/fpu/xstate: Introduce a new variable for dynamic user states
  x86/fpu/xstate: Outline dynamic xstate area size in the task context
  x86/fpu/xstate: Introduce helpers to manage an xstate area dynamically
  x86/fpu/xstate: Define the scope of the initial xstate data
  x86/fpu/xstate: Introduce wrapper functions for organizing xstate area
access
  x86/fpu/xstate: Update xstate save function for supporting dynamic
user xstate
  x86/fpu/xstate: Update xstate area address finder for supporting
dynamic user xstate
  x86/fpu/xstate: Update xstate context copy function for supporting
dynamic area
  x86/fpu/xstate: Expand dynamic user state area on first use
  x86/fpu/xstate: Inherit dynamic user state when used in the parent
  x86/fpu/xstate: Support ptracer-induced xstate area expansion
  x86/fpu/xstate: Extend the table for mapping xstate components with
features
  x86/cpufeatures/amx: Enumerate Advanced Matrix Extension (AMX) feature
bits
  x86/fpu/amx: Define AMX state components and have it used for
boot-time checks
  x86/fpu/amx: Enable the AMX feature in 64-bit mode
  selftest/x86/amx: Include test cases for the AMX state management
  x86/fpu/xstate: Support dynamic user state in the signal handling path
  x86/fpu/xstate: Introduce boot-parameters for control some state
component support

 .../admin-guide/kernel-parameters.txt |  15 +
 arch/x86/include/asm/cpufeatures.h|   4 +
 arch/x86/include/asm/fpu/internal.h   |  97 ++-
 arch/x86/include/asm/fpu/types.h  |  62 +-
 arch/x86/include/asm/fp

[PATCH v2 11/22] x86/fpu/xstate: Update xstate area address finder for supporting dynamic user xstate

2020-11-19 Thread Chang S. Bae
__raw_xsave_addr() returns the requested component's pointer in an XSAVE
buffer, by simply looking up the offset table. The offset used to be fixed,
but, with dynamic user states, it becomes variable.

get_xstate_size() has a routine to find an offset at run-time. Refactor to
use it for the address finder.

No functional change until the kernel enables dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/kernel/fpu/xstate.c | 82 +++-
 1 file changed, 52 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index f8884dcdcc7c..0c0be0952194 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -133,15 +133,50 @@ static bool xfeature_is_supervisor(int xfeature_nr)
return ecx & 1;
 }
 
+/*
+ * Available once those arrays for the offset, size, and alignment info are 
set up,
+ * by setup_xstate_features().
+ */
+static unsigned int __get_xstate_comp_offset(u64 mask, int feature_nr)
+{
+   u64 xmask = BIT_ULL(feature_nr + 1) - 1;
+   unsigned int next_offset, offset = 0;
+   int i;
+
+   if ((mask & xmask) == (xfeatures_mask_all & xmask))
+   return xstate_comp_offsets[feature_nr];
+
+   /*
+* Calculate the size by summing up each state together, since no known
+* offset found with the xstate area format out of the given mask.
+*/
+
+   next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+   for (i = FIRST_EXTENDED_XFEATURE; i <= feature_nr; i++) {
+   if (!(mask & BIT_ULL(i)))
+   continue;
+
+   offset = xstate_aligns[i] ? ALIGN(next_offset, 64) : 
next_offset;
+   next_offset += xstate_sizes[i];
+   }
+
+   return offset;
+}
+
+static unsigned int get_xstate_comp_offset(struct fpu *fpu, int feature_nr)
+{
+   return __get_xstate_comp_offset(fpu->state_mask, feature_nr);
+}
+
 /*
  * Available once those arrays for the offset, size, and alignment info are 
set up,
  * by setup_xstate_features().
  */
 unsigned int get_xstate_size(u64 mask)
 {
-   unsigned int size;
-   u64 xmask;
-   int i, nr;
+   unsigned int offset;
+   int nr;
 
if (!mask)
return 0;
@@ -155,24 +190,8 @@ unsigned int get_xstate_size(u64 mask)
if (!using_compacted_format())
return xstate_offsets[nr] + xstate_sizes[nr];
 
-   xmask = BIT_ULL(nr + 1) - 1;
-
-   if (mask == (xmask & xfeatures_mask_all))
-   return xstate_comp_offsets[nr] + xstate_sizes[nr];
-
-   /*
-* Calculate the size by summing up each state together, since no known
-* size found with the xstate area format out of the given mask.
-*/
-   for (size = FXSAVE_SIZE + XSAVE_HDR_SIZE, i = FIRST_EXTENDED_XFEATURE; 
i <= nr; i++) {
-   if (!(mask & BIT_ULL(i)))
-   continue;
-
-   if (xstate_aligns[i])
-   size = ALIGN(size, 64);
-   size += xstate_sizes[i];
-   }
-   return size;
+   offset = __get_xstate_comp_offset(mask, nr);
+   return offset + xstate_sizes[nr];
 }
 
 /*
@@ -991,17 +1010,20 @@ static void *__raw_xsave_addr(struct fpu *fpu, int 
xfeature_nr)
 {
void *xsave;
 
-   if (!xfeature_enabled(xfeature_nr)) {
-   WARN_ON_FPU(1);
-   return NULL;
-   }
-
-   if (fpu)
-   xsave = __xsave(fpu);
-   else
+   if (!xfeature_enabled(xfeature_nr))
+   goto not_found;
+   else if (!fpu)
xsave = _fpstate.xsave;
+   else if (!(fpu->state_mask & BIT_ULL(xfeature_nr)))
+   goto not_found;
+   else
+   xsave = __xsave(fpu);
+
+   return (xsave + get_xstate_comp_offset(fpu, xfeature_nr));
 
-   return xsave + xstate_comp_offsets[xfeature_nr];
+not_found:
+   WARN_ON_FPU(1);
+   return NULL;
 }
 
 /*
-- 
2.17.1



[PATCH v2 09/22] x86/fpu/xstate: Introduce wrapper functions for organizing xstate area access

2020-11-19 Thread Chang S. Bae
task->fpu now has two possible xstate areas, fpu->state or fpu->state_ptr.
Instead of open-coding access to one of the two areas, rearrange the code to
use a new wrapper.

Some open code (e.g., in KVM) is left unchanged as not going to use
fpu->state_ptr at the moment.

No functional change until the kernel supports dynamic user states.

Signed-off-by: Chang S. Bae 
Reviewed-by: Len Brown 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/fpu/internal.h | 10 ++
 arch/x86/include/asm/fpu/xstate.h   | 10 ++
 arch/x86/include/asm/trace/fpu.h|  6 --
 arch/x86/kernel/fpu/core.c  | 27 ---
 arch/x86/kernel/fpu/regset.c| 28 +---
 arch/x86/kernel/fpu/signal.c| 23 +--
 arch/x86/kernel/fpu/xstate.c| 20 +++-
 7 files changed, 77 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 66ed1f88191c..3201468ff4aa 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -210,10 +210,12 @@ static inline int copy_user_to_fregs(struct fregs_state 
__user *fx)
 
 static inline void copy_fxregs_to_kernel(struct fpu *fpu)
 {
+   union fpregs_state *xstate = __xstate(fpu);
+
if (IS_ENABLED(CONFIG_X86_32))
-   asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
+   asm volatile("fxsave %[fx]" : [fx] "=m" (xstate->fxsave));
else
-   asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
+   asm volatile("fxsaveq %[fx]" : [fx] "=m" (xstate->fxsave));
 }
 
 /* These macros all use (%edi)/(%rdi) as the single memory argument. */
@@ -411,7 +413,7 @@ static inline int copy_user_to_xregs(struct xregs_state 
__user *buf, u64 mask)
  */
 static inline int copy_kernel_to_xregs_err(struct fpu *fpu, u64 mask)
 {
-   struct xregs_state *xstate = >state.xsave;
+   struct xregs_state *xstate = __xsave(fpu);
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
@@ -440,7 +442,7 @@ static inline void __copy_kernel_to_fpregs(union 
fpregs_state *fpstate, u64 mask
 
 static inline void copy_kernel_to_fpregs(struct fpu *fpu)
 {
-   union fpregs_state *fpstate = >state;
+   union fpregs_state *fpstate = __xstate(fpu);
 
/*
 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 9183e2cdffe3..cc159bc9386d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -102,6 +102,16 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 extern void __init update_regset_xstate_info(unsigned int size,
 u64 xstate_mask);
 
+static inline union fpregs_state *__xstate(struct fpu *fpu)
+{
+   return (fpu->state_ptr) ? fpu->state_ptr : >state;
+}
+
+static inline struct xregs_state *__xsave(struct fpu *fpu)
+{
+   return &__xstate(fpu)->xsave;
+}
+
 void *get_xsave_addr(struct fpu *fpu, int xfeature_nr);
 unsigned int get_xstate_size(u64 mask);
 int alloc_xstate_area(struct fpu *fpu, u64 mask, unsigned int *alloc_size);
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index bf88b873..4b21c34436f9 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -22,8 +22,10 @@ DECLARE_EVENT_CLASS(x86_fpu,
__entry->fpu= fpu;
__entry->load_fpu   = test_thread_flag(TIF_NEED_FPU_LOAD);
if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
-   __entry->xfeatures = fpu->state.xsave.header.xfeatures;
-   __entry->xcomp_bv  = fpu->state.xsave.header.xcomp_bv;
+   struct xregs_state *xsave = __xsave(fpu);
+
+   __entry->xfeatures = xsave->header.xfeatures;
+   __entry->xcomp_bv  = xsave->header.xcomp_bv;
}
),
TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 33956ae3de2b..dca4961fcc36 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -94,14 +94,18 @@ EXPORT_SYMBOL(irq_fpu_usable);
  */
 int copy_fpregs_to_fpstate(struct fpu *fpu)
 {
+   union fpregs_state *xstate = __xstate(fpu);
+
if (likely(use_xsave())) {
-   copy_xregs_to_kernel(>state.xsave);
+   struct xregs_state *xsave = >xsave;
+
+   copy_xregs_to_kernel(xsave);
 
/*
 * AVX512 state is tracked here bec

  1   2   3   4   5   6   >