Re: [PATCH] x86/sgx: fix the return type of sgx_init

2021-01-14 Thread Darren Kenny
On Wednesday, 2021-01-13 at 15:23:11 -08, Sami Tolvanen wrote:
> device_initcall() expects a function of type initcall_t, which returns
> an integer. Change the signature of sgx_init() to match.
>
> Fixes: e7e0545299d8c ("x86/sgx: Initialize metadata for Enclave Page Cache (EPC) sections")
> Signed-off-by: Sami Tolvanen 

Makes sense.

Reviewed-by: Darren Kenny 

> ---
>  arch/x86/kernel/cpu/sgx/main.c | 14 +-
>  1 file changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index c519fc5f6948..8df81a3ed945 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -700,25 +700,27 @@ static bool __init sgx_page_cache_init(void)
>   return true;
>  }
>  
> -static void __init sgx_init(void)
> +static int __init sgx_init(void)
>  {
>   int ret;
>   int i;
>  
>   if (!cpu_feature_enabled(X86_FEATURE_SGX))
> - return;
> + return -ENODEV;
>  
>   if (!sgx_page_cache_init())
> - return;
> + return -ENOMEM;
>  
> - if (!sgx_page_reclaimer_init())
> + if (!sgx_page_reclaimer_init()) {
> + ret = -ENOMEM;
>   goto err_page_cache;
> + }
>  
>   ret = sgx_drv_init();
>   if (ret)
>   goto err_kthread;
>  
> - return;
> + return 0;
>  
>  err_kthread:
>   kthread_stop(ksgxd_tsk);
> @@ -728,6 +730,8 @@ static void __init sgx_init(void)
>   vfree(sgx_epc_sections[i].pages);
>   memunmap(sgx_epc_sections[i].virt_addr);
>   }
> +
> + return ret;
>  }
>  
>  device_initcall(sgx_init);
>
> base-commit: 65f0d2414b7079556fbbcc070b3d1c9f9587606d
> -- 
> 2.30.0.284.gd98b1dd5eaa7-goog
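
For reference, the type mismatch being fixed here comes from the initcall
machinery: device_initcall() stores its argument as an initcall_t, a pointer
to a function returning int, so the init core can record an errno-style
status. A minimal user-space model of that convention (simplified; the real
macros live in include/linux/init.h):

#include <errno.h>
#include <stdio.h>

/* Simplified model of the kernel's initcall contract: an initcall
 * returns 0 on success or a negative errno that the core can log. */
typedef int (*initcall_t)(void);

static int fake_sgx_init(void)
{
	return -ENODEV;	/* e.g. the CPU lacks X86_FEATURE_SGX */
}

static void run_initcall(initcall_t fn)
{
	int ret = fn();

	if (ret)
		printf("initcall failed: %d\n", ret);
}

int main(void)
{
	run_initcall(fake_sgx_init);
	return 0;
}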


Re: [PATCH v36 05/24] x86/sgx: Add wrappers for ENCLS leaf functions

2020-08-07 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:44 +03, Jarkko Sakkinen wrote:
> ENCLS is a ring 0 instruction, which contains a set of leaf functions for
> managing an enclave. Enclaves are measured and signed software entities,
> which are protected by asserting the outside memory accesses and memory
> encryption.
>
> Add a two-layer macro system along with an encoding scheme to allow
> wrappers to return trap numbers along ENCLS-specific error codes. The
> bottom layer of the macro system splits between the leafs that return an
> error code and those that do not. The second layer generates the correct
> input/output annotations based on the number of operands for each leaf
> function.
>
> ENCLS leaf functions are documented in
>
>   Intel SDM: 36.6 ENCLAVE INSTRUCTIONS AND INTEL® SGX

Tested-by: Darren Kenny 

>
> Acked-by: Jethro Beekman 
> Co-developed-by: Sean Christopherson 
> Signed-off-by: Sean Christopherson 
> Signed-off-by: Jarkko Sakkinen 
> ---
>  arch/x86/kernel/cpu/sgx/encls.h | 238 
>  1 file changed, 238 insertions(+)
>  create mode 100644 arch/x86/kernel/cpu/sgx/encls.h
>
> diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
> new file mode 100644
> index 000000000000..f716b4328614
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/encls.h
> @@ -0,0 +1,238 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +#ifndef _X86_ENCLS_H
> +#define _X86_ENCLS_H
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "sgx.h"
> +
> +enum sgx_encls_leaf {
> + ECREATE = 0x00,
> + EADD= 0x01,
> + EINIT   = 0x02,
> + EREMOVE = 0x03,
> + EDGBRD  = 0x04,
> + EDGBWR  = 0x05,
> + EEXTEND = 0x06,
> + ELDU= 0x08,
> + EBLOCK  = 0x09,
> + EPA = 0x0A,
> + EWB = 0x0B,
> + ETRACK  = 0x0C,
> +};
> +
> +/**
> + * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
> + *
> + * ENCLS has its own (positive value) error codes and also generates
> + * ENCLS specific #GP and #PF faults.  And the ENCLS values get munged
> + * with system error codes as everything percolates back up the stack.
> + * Unfortunately (for us), we need to precisely identify each unique
> + * error code, e.g. the action taken if EWB fails varies based on the
> + * type of fault and on the exact SGX error code, i.e. we can't simply
> + * convert all faults to -EFAULT.
> + *
> + * To make all three error types coexist, we set bit 30 to identify an
> + * ENCLS fault.  Bit 31 (technically bits N:31) is used to differentiate
> + * between positive (faults and SGX error codes) and negative (system
> + * error codes) values.
> + */
> +#define ENCLS_FAULT_FLAG 0x40000000
> +
> +/* Retrieve the encoded trapnr from the specified return code. */
> +#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG)
> +
> +/* Issue a WARN() about an ENCLS leaf. */
> +#define ENCLS_WARN(r, name) {\
> + do {  \
> + int _r = (r); \
> + WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \
> + } while (0);  \
> +}
> +
> +/**
> + * encls_failed() - Check if an ENCLS leaf function failed
> + * @ret: the return value of an ENCLS leaf function call
> + *
> + * Check if an ENCLS leaf function failed. This happens when the leaf function
> + * causes a fault that is not caused by an EPCM conflict or when the leaf
> + * function returns a non-zero value.
> + */
> +static inline bool encls_failed(int ret)
> +{
> + int epcm_trapnr;
> +
> + if (boot_cpu_has(X86_FEATURE_SGX2))
> + epcm_trapnr = X86_TRAP_PF;
> + else
> + epcm_trapnr = X86_TRAP_GP;
> +
> + if (ret & ENCLS_FAULT_FLAG)
> + return ENCLS_TRAPNR(ret) != epcm_trapnr;
> +
> + return !!ret;
> +}
> +
> +/**
> + * __encls_ret_N - encode an ENCLS leaf that returns an error code in EAX
> + * @rax: leaf number
> + * @inputs:  asm inputs for the leaf
> + *
> + * Emit assembly for an ENCLS leaf that returns an error code, e.g. EREMOVE.
> + * And because SGX isn't complex enough as it is, leafs that return an error
> + * code also modify flags.
> + *
> + * Return:
> + *   0 on success,
> + *   SGX error code on failure
> + */
> +#define __encls_ret_N(rax, inputs...)\
> + ({  
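
The encoding described in the quoted comment is easy to exercise outside the
kernel; a small sketch (values mirrored from the quoted macros, trap number
13 assumed for #GP) shows how faults, SGX error codes, and system error codes
stay distinguishable in a single int:

#include <stdio.h>

/* Mirrors the quoted scheme: bit 30 marks "this is a trap number",
 * plain positive values are SGX error codes, negatives are -errno. */
#define ENCLS_FAULT_FLAG 0x40000000
#define ENCLS_TRAPNR(r)  ((r) & ~ENCLS_FAULT_FLAG)
#define X86_TRAP_GP 13	/* #GP vector, for illustration */

static void classify(int ret)
{
	if (ret & ENCLS_FAULT_FLAG)
		printf("fault, trapnr=%d\n", ENCLS_TRAPNR(ret));
	else if (ret > 0)
		printf("SGX error code %d\n", ret);
	else if (ret < 0)
		printf("system error %d\n", ret);
	else
		printf("success\n");
}

int main(void)
{
	classify(0);
	classify(ENCLS_FAULT_FLAG | X86_TRAP_GP);	/* encoded #GP */
	classify(128);					/* SGX_UNMASKED_EVENT */
	classify(-14);					/* -EFAULT */
	return 0;
}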

Re: [PATCH v36 12/24] x86/sgx: Add SGX_IOC_ENCLAVE_CREATE

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:51 +03, Jarkko Sakkinen wrote:
> Add an ioctl that performs ENCLS[ECREATE], which creates SGX Enclave
> Control Structure for the enclave. SECS contains attributes about the
> enclave that are used by the hardware and cannot be directly accessed by
> software, as SECS resides in the EPC.
>
> One essential field in SECS is a field that stores the SHA256 of the
> measured enclave pages. This field, MRENCLAVE, is initialized by the
> ECREATE instruction and updated by every EADD and EEXTEND operation.
> Finally, EINIT locks down the value.
>
> Acked-by: Jethro Beekman 
> Tested-by: Jethro Beekman 
> Tested-by: Haitao Huang 
> Tested-by: Chunyang Hui 
> Tested-by: Jordan Hand 
> Tested-by: Nathaniel McCallum 
> Tested-by: Seth Moore 

Tested-by: Darren Kenny 
Reviewed-by: Darren Kenny 

> Co-developed-by: Sean Christopherson 
> Signed-off-by: Sean Christopherson 
> Co-developed-by: Suresh Siddha 
> Signed-off-by: Suresh Siddha 
> Signed-off-by: Jarkko Sakkinen 
> ---
>  .../userspace-api/ioctl/ioctl-number.rst  |   1 +
>  arch/x86/include/uapi/asm/sgx.h   |  25 ++
>  arch/x86/kernel/cpu/sgx/Makefile  |   1 +
>  arch/x86/kernel/cpu/sgx/driver.c  |  12 +
>  arch/x86/kernel/cpu/sgx/driver.h  |   1 +
>  arch/x86/kernel/cpu/sgx/ioctl.c   | 226 ++
>  6 files changed, 266 insertions(+)
>  create mode 100644 arch/x86/include/uapi/asm/sgx.h
>  create mode 100644 arch/x86/kernel/cpu/sgx/ioctl.c
>
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index 59472cd6a11d..35f713e3a267 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -323,6 +323,7 @@ Code  Seq#  Include File  Comments
>  <mailto:tle...@mindspring.com>
>  0xA3  90-9F  linux/dtlk.h
>  0xA4  00-1F  uapi/linux/tee.h  Generic TEE subsystem
> +0xA4  00-1F  uapi/asm/sgx.h  Intel SGX subsystem (a legit conflict as TEE and SGX do not co-exist)
>  0xAA  00-3F  linux/uapi/linux/userfaultfd.h
>  0xAB  00-1F  linux/nbd.h
>  0xAC  00-1F  linux/raw.h
> diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
> new file mode 100644
> index 000000000000..3787d278e84b
> --- /dev/null
> +++ b/arch/x86/include/uapi/asm/sgx.h
> @@ -0,0 +1,25 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) WITH 
> Linux-syscall-note */
> +/*
> + * Copyright(c) 2016-19 Intel Corporation.
> + */
> +#ifndef _UAPI_ASM_X86_SGX_H
> +#define _UAPI_ASM_X86_SGX_H
> +
> +#include 
> +#include 
> +
> +#define SGX_MAGIC 0xA4
> +
> +#define SGX_IOC_ENCLAVE_CREATE \
> + _IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
> +
> +/**
> + * struct sgx_enclave_create - parameter structure for the
> + * %SGX_IOC_ENCLAVE_CREATE ioctl
> + * @src: address for the SECS page data
> + */
> +struct sgx_enclave_create  {
> + __u64   src;
> +};
> +
> +#endif /* _UAPI_ASM_X86_SGX_H */
> diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> index 3fc451120735..91d3dc784a29 100644
> --- a/arch/x86/kernel/cpu/sgx/Makefile
> +++ b/arch/x86/kernel/cpu/sgx/Makefile
> @@ -1,4 +1,5 @@
>  obj-y += \
>   driver.o \
>   encl.o \
> + ioctl.o \
>   main.o
> diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> index b52520407f5b..5559bc18de41 100644
> --- a/arch/x86/kernel/cpu/sgx/driver.c
> +++ b/arch/x86/kernel/cpu/sgx/driver.c
> @@ -118,10 +118,22 @@ static unsigned long sgx_get_unmapped_area(struct file *file,
>   return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
>  }
>  
> +#ifdef CONFIG_COMPAT
> +static long sgx_compat_ioctl(struct file *filep, unsigned int cmd,
> +   unsigned long arg)
> +{
> + return sgx_ioctl(filep, cmd, arg);
> +}
> +#endif
> +
>  static const struct file_operations sgx_encl_fops = {
>   .owner  = THIS_MODULE,
>   .open   = sgx_open,
>   .release= sgx_release,
> + .unlocked_ioctl = sgx_ioctl,
> +#ifdef CONFIG_COMPAT
> + .compat_ioctl   = sgx_compat_ioctl,
> +#endif
>   .mmap   = sgx_mmap,
>   .get_unmapped_area  = sgx_get_unmapped_area,
>  };
> diff --git a/arch/x8
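
For readers wanting to poke at the new ioctl, a hedged user-space sketch
(UAPI definitions copied locally from the quoted patch; /dev/sgx/enclave is
the node this series creates, and a real caller must fill in a valid SECS
rather than the zeroed page used here):

#include <fcntl.h>
#include <linux/ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define SGX_MAGIC 0xA4
struct sgx_enclave_create {
	uint64_t src;	/* address of the SECS page data */
};
#define SGX_IOC_ENCLAVE_CREATE \
	_IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)

int main(void)
{
	/* SECS is one 4096-byte, naturally aligned page; zeroed only to
	 * keep the sketch short - real callers set size, base, etc. */
	static uint8_t secs[4096] __attribute__((aligned(4096)));
	struct sgx_enclave_create parms = {
		.src = (uint64_t)(uintptr_t)secs,
	};
	int fd = open("/dev/sgx/enclave", O_RDWR);

	if (fd < 0 || ioctl(fd, SGX_IOC_ENCLAVE_CREATE, &parms))
		perror("SGX_IOC_ENCLAVE_CREATE");
	close(fd);
	return 0;
}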

Re: [PATCH v36 01/24] x86/cpufeatures: x86/msr: Add Intel SGX hardware bits

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:40 +03, Jarkko Sakkinen wrote:
> From: Sean Christopherson 
>
> Add X86_FEATURE_SGX from CPUID.(EAX=7, ECX=1), which informs whether the
> CPU has SGX.
>
> Add X86_FEATURE_SGX1 and X86_FEATURE_SGX2 from CPUID.(EAX=12H, ECX=0),
> which describe the level of SGX support available [1].
>
> Add IA32_FEATURE_CONTROL.SGX_ENABLE. BIOS can use this bit to opt-in SGX
> before locking the feature control MSR [2].
>
> [1] Intel SDM: 36.7.2 Intel® SGX Resource Enumeration Leaves
> [2] Intel SDM: 36.7.1 Intel® SGX Opt-In Configuration
>
> Reviewed-by: Borislav Petkov 
> Acked-by: Jethro Beekman 
> Signed-off-by: Sean Christopherson 
> Co-developed-by: Jarkko Sakkinen 

Reviewed-by: Darren Kenny 

> Signed-off-by: Jarkko Sakkinen 
> ---
>  arch/x86/include/asm/cpufeature.h|  5 +++--
>  arch/x86/include/asm/cpufeatures.h   |  7 ++-
>  arch/x86/include/asm/disabled-features.h | 18 +++---
>  arch/x86/include/asm/msr-index.h |  1 +
>  arch/x86/include/asm/required-features.h |  2 +-
>  arch/x86/kernel/cpu/common.c |  4 
>  6 files changed, 30 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
> index 59bf91c57aa8..efbdba5170a3 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -30,6 +30,7 @@ enum cpuid_leafs
>   CPUID_7_ECX,
>   CPUID_8000_0007_EBX,
>   CPUID_7_EDX,
> + CPUID_12_EAX,
>  };
>  
>  #ifdef CONFIG_X86_FEATURE_NAMES
> @@ -89,7 +90,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
>  CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) ||\
>  CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) ||\
>  REQUIRED_MASK_CHECK||\
> -BUILD_BUG_ON_ZERO(NCAPINTS != 19))
> +BUILD_BUG_ON_ZERO(NCAPINTS != 20))
>  
>  #define DISABLED_MASK_BIT_SET(feature_bit)   \
>( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  0, feature_bit) ||\
> @@ -112,7 +113,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
>  CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) ||\
>  CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) ||\
>  DISABLED_MASK_CHECK||\
> -BUILD_BUG_ON_ZERO(NCAPINTS != 19))
> +BUILD_BUG_ON_ZERO(NCAPINTS != 20))
>  
>  #define cpu_has(c, bit) \
>   (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index 02dabc9e77b0..545ac3e0e269 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -13,7 +13,7 @@
>  /*
>   * Defines x86 CPU feature bits
>   */
> -#define NCAPINTS 19 /* N 32-bit words worth of info */
> +#define NCAPINTS 20 /* N 32-bit words worth of info */
>  #define NBUGINTS 1  /* N 32-bit bug flags */
>  
>  /*
> @@ -238,6 +238,7 @@
>  /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
>  #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
>  #define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */
> +#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */
>  #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
>  #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
>  #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
> @@ -373,6 +374,10 @@
>  #define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */
>  #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
>  
> +/* Intel-defined SGX features, CPUID level 0x00000012:0 (EAX), word 19 */
> +#define X86_FEATURE_SGX1 (19*32+ 0) /* SGX1 leaf functions */
> +#define X86_FEATURE_SGX2 (19*32+ 1) /* SGX2 leaf functions */
> +
>  /*
>   * BUG word(s)
>   */
> diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
> index 4ea8584682f9..dbe534d5153f 100644
> --- a/arch/x86/include/asm/disabled-features.h
> +++ b/arch/x86/include/asm/disabled-features.h
> @@ -28,13 +28,18 @@
>  # define DISABLE_CYRIX_A
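
The (word*32 + bit) encoding used throughout cpufeatures.h is mechanical; a
short sketch shows how a feature number splits back into the word and bit
that the REQUIRED/DISABLED mask checks above operate on:

#include <stdio.h>

/* Feature numbers from the quoted patch: word*32 + bit. */
#define X86_FEATURE_SGX  ( 9*32 + 2)
#define X86_FEATURE_SGX1 (19*32 + 0)
#define X86_FEATURE_SGX2 (19*32 + 1)

static void show(const char *name, int feature)
{
	printf("%-17s word %2d, bit %2d\n", name, feature / 32, feature % 32);
}

int main(void)
{
	show("X86_FEATURE_SGX", X86_FEATURE_SGX);
	show("X86_FEATURE_SGX1", X86_FEATURE_SGX1);
	show("X86_FEATURE_SGX2", X86_FEATURE_SGX2);
	return 0;
}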

Re: [PATCH v36 04/24] x86/sgx: Add SGX microarchitectural data structures

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:43 +03, Jarkko Sakkinen wrote:
> Define the SGX microarchitectural data structures used by various SGX
> opcodes. This is not an exhaustive representation of all SGX data
> structures but only those needed by the kernel.
>
> The data structures are described in:
>
>   Intel SDM: 37.6 INTEL® SGX DATA STRUCTURES OVERVIEW
>
> Acked-by: Jethro Beekman 
> Signed-off-by: Jarkko Sakkinen 

Reviewed-by: Darren Kenny 

> ---
>  arch/x86/kernel/cpu/sgx/arch.h | 343 +
>  1 file changed, 343 insertions(+)
>  create mode 100644 arch/x86/kernel/cpu/sgx/arch.h
>
> diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/kernel/cpu/sgx/arch.h
> new file mode 100644
> index 000000000000..ddae55e9d4d8
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/arch.h
> @@ -0,0 +1,343 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +/**
> + * Copyright(c) 2016-18 Intel Corporation.
> + *
> + * Contains data structures defined by the SGX architecture.  Data structures
> + * defined by the Linux software stack should not be placed here.
> + */
> +#ifndef _ASM_X86_SGX_ARCH_H
> +#define _ASM_X86_SGX_ARCH_H
> +
> +#include 
> +#include 
> +
> +#define SGX_CPUID 0x12
> +#define SGX_CPUID_FIRST_VARIABLE_SUB_LEAF 2
> +
> +/**
> + * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
> + * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
> + *   been completed yet.
> + * %SGX_INVALID_EINITTOKEN:  EINITTOKEN is invalid and enclave signer's
> + *   public key does not match IA32_SGXLEPUBKEYHASH.
> + * %SGX_UNMASKED_EVENT:  An unmasked event, e.g. INTR, was received
> + */
> +enum sgx_return_code {
> + SGX_NOT_TRACKED = 11,
> + SGX_INVALID_EINITTOKEN  = 16,
> + SGX_UNMASKED_EVENT  = 128,
> +};
> +
> +/**
> + * enum sgx_sub_leaf_types - SGX CPUID variable sub-leaf types
> + * %SGX_CPUID_SUB_LEAF_INVALID:  Indicates this sub-leaf is invalid.
> + * %SGX_CPUID_SUB_LEAF_EPC_SECTION:  Sub-leaf enumerates an EPC section.
> + */
> +enum sgx_sub_leaf_types {
> + SGX_CPUID_SUB_LEAF_INVALID  = 0x0,
> + SGX_CPUID_SUB_LEAF_EPC_SECTION  = 0x1,
> +};
> +
> +#define SGX_CPUID_SUB_LEAF_TYPE_MASK GENMASK(3, 0)
> +
> +#define SGX_MODULUS_SIZE 384
> +
> +/**
> + * enum sgx_miscselect - additional information to an SSA frame
> + * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame.
> + *
> + * Save State Area (SSA) is a stack inside the enclave used to store processor
> + * state when an exception or interrupt occurs. This enum defines additional
> + * information stored to an SSA frame.
> + */
> +enum sgx_miscselect {
> + SGX_MISC_EXINFO = BIT(0),
> +};
> +
> +#define SGX_MISC_RESERVED_MASK   GENMASK_ULL(63, 1)
> +
> +#define SGX_SSA_GPRS_SIZE 184
> +#define SGX_SSA_MISC_EXINFO_SIZE 16
> +
> +/**
> + * enum sgx_attributes - the attributes field in &struct sgx_secs
> + * %SGX_ATTR_INIT:   Enclave can be entered (is initialized).
> + * %SGX_ATTR_DEBUG:  Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
> + * %SGX_ATTR_MODE64BIT:  Tell that this a 64-bit enclave.
> + * %SGX_ATTR_PROVISIONKEY:  Allow to use provisioning keys for remote
> + *   attestation.
> + * %SGX_ATTR_KSS:Allow to use key separation and sharing (KSS).
> + * %SGX_ATTR_EINITTOKENKEY:  Allow to use token signing key that is used to
> + *   sign cryptographic tokens that can be passed to
> + *   EINIT as an authorization to run an enclave.
> + */
> +enum sgx_attribute {
> + SGX_ATTR_INIT   = BIT(0),
> + SGX_ATTR_DEBUG  = BIT(1),
> + SGX_ATTR_MODE64BIT  = BIT(2),
> + SGX_ATTR_PROVISIONKEY   = BIT(4),
> + SGX_ATTR_EINITTOKENKEY  = BIT(5),
> + SGX_ATTR_KSS= BIT(7),
> +};
> +
> +#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8))
> +#define SGX_ATTR_ALLOWED_MASK(SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | \
> +  SGX_ATTR_KSS)
> +
> +/**
> + * struct sgx_secs - SGX Enclave Control Structure (SECS)
> + * @size: size of the address space
> + * @base: base address of the address space
> + * @ssa_frame_size: size of an SSA frame
> + * @miscselect: additional information stored to an SSA frame
> + * @attributes: attributes for enclave
> + * @xfrm:XSave-Fea
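
The reserved-bit masks above compose with plain bit arithmetic; a small
sketch (constants copied from the quoted arch.h, GENMASK_ULL(63, 8) expanded
by hand) shows the validity check a SECS consumer would perform:

#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n) (1ULL << (n))
#define SGX_ATTR_INIT          BIT_ULL(0)
#define SGX_ATTR_DEBUG         BIT_ULL(1)
#define SGX_ATTR_MODE64BIT     BIT_ULL(2)
#define SGX_ATTR_PROVISIONKEY  BIT_ULL(4)
#define SGX_ATTR_EINITTOKENKEY BIT_ULL(5)
#define SGX_ATTR_KSS           BIT_ULL(7)
/* GENMASK_ULL(63, 8) is (~0ULL << 8): bits 63..8. */
#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | (~0ULL << 8))

int main(void)
{
	uint64_t attrs = SGX_ATTR_MODE64BIT | SGX_ATTR_DEBUG;

	if (attrs & SGX_ATTR_RESERVED_MASK)
		puts("invalid: reserved attribute bits set");
	else
		puts("attributes pass the reserved-bit check");
	return 0;
}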

Re: [PATCH v36 06/24] x86/cpu/intel: Detect SGX support

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:45 +03, Jarkko Sakkinen wrote:
> From: Sean Christopherson 
>
> Configure SGX as part of feature control MSR initialization and update
> the associated X86_FEATURE flags accordingly.  Because the kernel will
> require the LE hash MSRs to be writable when running native enclaves,
> disable X86_FEATURE_SGX (and all derivatives) if SGX Launch Control is
> not (or cannot) be fully enabled via feature control MSR.
>
> The check is done for every CPU, not just BSP, in order to verify that
> MSR_IA32_FEATURE_CONTROL is correctly configured on all CPUs. The other
> parts of the kernel, like the enclave driver, expect the same
> configuration from all CPUs.
>
> Note, unlike VMX, clear the X86_FEATURE_SGX* flags for all CPUs if any
> CPU lacks SGX support as the kernel expects SGX to be available on all
> CPUs.  X86_FEATURE_VMX is intentionally cleared only for the current CPU
> so that KVM can provide additional information if KVM fails to load,
> e.g. print which CPU doesn't support VMX.  KVM/VMX requires additional
> per-CPU enabling, e.g. to set CR4.VMXE and do VMXON, and so already has
> the necessary infrastructure to do per-CPU checks.  SGX on the other
> hand doesn't require additional enabling, so clearing the feature flags
> on all CPUs means the SGX subsystem doesn't need to manually do support
> checks on a per-CPU basis.
>
> Acked-by: Jethro Beekman 
> Signed-off-by: Sean Christopherson 
> Co-developed-by: Jarkko Sakkinen 
> Signed-off-by: Jarkko Sakkinen 

Reviewed-by: Darren Kenny 

> ---
>  arch/x86/kernel/cpu/feat_ctl.c | 32 +++-
>  1 file changed, 31 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
> index 29a3bedabd06..c3afcd2e4342 100644
> --- a/arch/x86/kernel/cpu/feat_ctl.c
> +++ b/arch/x86/kernel/cpu/feat_ctl.c
> @@ -93,16 +93,35 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
>  }
>  #endif /* CONFIG_X86_VMX_FEATURE_NAMES */
>  
> +static void clear_sgx_caps(void)
> +{
> + setup_clear_cpu_cap(X86_FEATURE_SGX);
> + setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
> + setup_clear_cpu_cap(X86_FEATURE_SGX1);
> + setup_clear_cpu_cap(X86_FEATURE_SGX2);
> +}
> +
>  void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
>  {
>   bool tboot = tboot_enabled();
> + bool enable_sgx;
>   u64 msr;
>  
>   if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
>   clear_cpu_cap(c, X86_FEATURE_VMX);
> + clear_sgx_caps();
>   return;
>   }
>  
> + /*
> +  * Enable SGX if and only if the kernel supports SGX and Launch Control
> +  * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
> +  */
> + enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
> +  cpu_has(c, X86_FEATURE_SGX1) &&
> +  cpu_has(c, X86_FEATURE_SGX_LC) &&
> +  IS_ENABLED(CONFIG_INTEL_SGX);
> +
>   if (msr & FEAT_CTL_LOCKED)
>   goto update_caps;
>  
> @@ -124,13 +143,16 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
>   msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
>   }
>  
> + if (enable_sgx)
> + msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
> +
>   wrmsrl(MSR_IA32_FEAT_CTL, msr);
>  
>  update_caps:
>   set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
>  
>   if (!cpu_has(c, X86_FEATURE_VMX))
> - return;
> + goto update_sgx;
>  
>   if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) ||
>   (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) {
> @@ -143,4 +165,12 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
>   init_vmx_capabilities(c);
>  #endif
>   }
> +
> +update_sgx:
> + if (!(msr & FEAT_CTL_SGX_ENABLED) ||
> + !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
> + if (enable_sgx)
> + pr_err_once("SGX disabled by BIOS\n");
> + clear_sgx_caps();
> + }
>  }
> -- 
> 2.25.1
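
The quoted flow reduces to: if the feature-control MSR is already locked the
kernel must live with whatever the BIOS wrote, otherwise it writes the SGX
enable bits itself and locks. A simplified stand-alone model of that decision
(bit positions from the quoted msr-index.h; the real function also handles
VMX and tboot):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FEAT_CTL_LOCKED         (1u << 0)
#define FEAT_CTL_SGX_LC_ENABLED (1u << 17)
#define FEAT_CTL_SGX_ENABLED    (1u << 18)

static bool sgx_usable(uint32_t msr, bool want_sgx)
{
	if (!(msr & FEAT_CTL_LOCKED) && want_sgx)	/* kernel may configure */
		msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED |
		       FEAT_CTL_LOCKED;
	return want_sgx && (msr & FEAT_CTL_SGX_ENABLED) &&
	       (msr & FEAT_CTL_SGX_LC_ENABLED);
}

int main(void)
{
	printf("unlocked MSR, SGX wanted:  %d\n", sgx_usable(0, true));
	printf("locked with SGX disabled:  %d\n",
	       sgx_usable(FEAT_CTL_LOCKED, true));	/* "disabled by BIOS" */
	return 0;
}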


Re: [PATCH v36 13/24] x86/sgx: Add SGX_IOC_ENCLAVE_ADD_PAGES

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:52 +03, Jarkko Sakkinen wrote:
> Add an ioctl, which performs ENCLS[EADD] that adds new visible page to an
> enclave, and optionally ENCLS[EEXTEND] operations that hash the page to the
> enclave measurement. By visible we mean a page that can be mapped to the
> address range of an enclave.
>
> Acked-by: Jethro Beekman 
> Tested-by: Jethro Beekman 
> Tested-by: Haitao Huang 
> Tested-by: Chunyang Hui 
> Tested-by: Jordan Hand 
> Tested-by: Nathaniel McCallum 
> Tested-by: Seth Moore 

Tested-by: Darren Kenny 
Reviewed-by: Darren Kenny 

> Co-developed-by: Sean Christopherson 
> Signed-off-by: Sean Christopherson 
> Co-developed-by: Suresh Siddha 
> Signed-off-by: Suresh Siddha 
> Signed-off-by: Jarkko Sakkinen 
> ---
>  arch/x86/include/uapi/asm/sgx.h |  30 
>  arch/x86/kernel/cpu/sgx/ioctl.c | 291 
>  2 files changed, 321 insertions(+)
>
> diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
> index 3787d278e84b..c8f199b3fb6f 100644
> --- a/arch/x86/include/uapi/asm/sgx.h
> +++ b/arch/x86/include/uapi/asm/sgx.h
> @@ -8,10 +8,21 @@
>  #include 
>  #include 
>  
> +/**
> + * enum sgx_epage_flags - page control flags
> + * %SGX_PAGE_MEASURE: Measure the page contents with a sequence of
> + *   ENCLS[EEXTEND] operations.
> + */
> +enum sgx_page_flags {
> + SGX_PAGE_MEASURE= 0x01,
> +};
> +
>  #define SGX_MAGIC 0xA4
>  
>  #define SGX_IOC_ENCLAVE_CREATE \
>   _IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
> +#define SGX_IOC_ENCLAVE_ADD_PAGES \
> + _IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)
>  
>  /**
>   * struct sgx_enclave_create - parameter structure for the
> @@ -22,4 +33,23 @@ struct sgx_enclave_create  {
>   __u64   src;
>  };
>  
> +/**
> + * struct sgx_enclave_add_pages - parameter structure for the
> + *%SGX_IOC_ENCLAVE_ADD_PAGE ioctl
> + * @src: start address for the page data
> + * @offset:  starting page offset
> + * @length:  length of the data (multiple of the page size)
> + * @secinfo: address for the SECINFO data
> + * @flags:   page control flags
> + * @count:   number of bytes added (multiple of the page size)
> + */
> +struct sgx_enclave_add_pages {
> + __u64   src;
> + __u64   offset;
> + __u64   length;
> + __u64   secinfo;
> + __u64   flags;
> + __u64   count;
> +};
> +
>  #endif /* _UAPI_ASM_X86_SGX_H */
> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
> index 7981c411b05a..c63a51362d14 100644
> --- a/arch/x86/kernel/cpu/sgx/ioctl.c
> +++ b/arch/x86/kernel/cpu/sgx/ioctl.c
> @@ -197,6 +197,294 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
>   return ret;
>  }
>  
> +static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
> +  unsigned long offset,
> +  u64 secinfo_flags)
> +{
> + struct sgx_encl_page *encl_page;
> + unsigned long prot;
> +
> + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
> + if (!encl_page)
> + return ERR_PTR(-ENOMEM);
> +
> + encl_page->desc = encl->base + offset;
> + encl_page->encl = encl;
> +
> + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ)  |
> +_calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
> +_calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
> +
> + /*
> +  * TCS pages must always RW set for CPU access while the SECINFO
> +  * permissions are *always* zero - the CPU ignores the user provided
> +  * values and silently overwrites them with zero permissions.
> +  */
> + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
> + prot |= PROT_READ | PROT_WRITE;
> +
> + /* Calculate maximum of the VM flags for the page. */
> + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
> +
> + return encl_page;
> +}
> +
> +static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
> +{
> + u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
> + u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
> +
> + if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS)
> + return -EINVAL;
> +
> + if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R))
> + return -EINVAL;
> +
> + /*
> +  * CPU will silently overwrite the permissions as zero, whic
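
A hedged user-space sketch of driving this ioctl (UAPI struct copied from the
quoted patch; the zeroed SECINFO is a placeholder - a real caller sets
SGX_SECINFO_REG plus page permissions there, or the validation above rejects
it):

#include <fcntl.h>
#include <linux/ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define SGX_MAGIC        0xA4
#define SGX_PAGE_MEASURE 0x01
struct sgx_enclave_add_pages {
	uint64_t src, offset, length, secinfo, flags, count;
};
#define SGX_IOC_ENCLAVE_ADD_PAGES \
	_IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)

int main(void)
{
	static uint8_t page[4096] __attribute__((aligned(4096)));
	static uint8_t secinfo[64] __attribute__((aligned(64)));
	struct sgx_enclave_add_pages parms = {
		.src     = (uint64_t)(uintptr_t)page,
		.offset  = 0,			/* page offset inside the enclave */
		.length  = sizeof(page),	/* multiple of the page size */
		.secinfo = (uint64_t)(uintptr_t)secinfo,
		.flags   = SGX_PAGE_MEASURE,	/* EEXTEND the contents */
	};
	int fd = open("/dev/sgx/enclave", O_RDWR);

	if (fd < 0 || ioctl(fd, SGX_IOC_ENCLAVE_ADD_PAGES, &parms))
		perror("SGX_IOC_ENCLAVE_ADD_PAGES");
	else
		printf("added %llu bytes\n", (unsigned long long)parms.count);
	close(fd);
	return 0;
}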

Re: [PATCH v36 03/24] x86/mm: x86/sgx: Signal SIGSEGV with PF_SGX

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:42 +03, Jarkko Sakkinen wrote:
> From: Sean Christopherson 
>
> Include SGX bit to the PF error codes and throw SIGSEGV with PF_SGX when
> a #PF with SGX set happens.
>
> CPU throws a #PF with the SGX set in the event of Enclave Page Cache Map
> (EPCM) conflict. The EPCM is a CPU-internal table, which describes the
> properties for a enclave page. Enclaves are measured and signed software
> entities, which SGX hosts. [1]
>
> Although the primary purpose of the EPCM conflict checks is to prevent
> malicious accesses to an enclave, an illegit access can happen also for
> legit reasons.
>
> All SGX reserved memory, including EPCM is encrypted with a transient key
> that does not survive from the power transition. Throwing a SIGSEGV allows
> user space software to react when this happens (e.g. recreate the enclave,
> which was invalidated).
>
> [1] Intel SDM: 36.5.1 Enclave Page Cache Map (EPCM)
>
> Acked-by: Jethro Beekman 
> Signed-off-by: Sean Christopherson 
> Signed-off-by: Jarkko Sakkinen 

Reviewed-by: Darren Kenny 

> ---
>  arch/x86/include/asm/traps.h | 14 --
>  arch/x86/mm/fault.c  | 13 +
>  2 files changed, 21 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 714b1a30e7b0..4446f95ad997 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -44,12 +44,13 @@ void __noreturn handle_stack_overflow(const char *message,
>  /*
>   * Page fault error code bits:
>   *
> - *   bit 0 == 0: no page found   1: protection fault
> - *   bit 1 == 0: read access 1: write access
> - *   bit 2 == 0: kernel-mode access  1: user-mode access
> - *   bit 3 ==1: use of reserved bit detected
> - *   bit 4 ==1: fault was an instruction fetch
> - *   bit 5 ==1: protection keys block access
> + *   bit 0  ==0: no page found   1: protection fault
> + *   bit 1  ==0: read access 1: write access
> + *   bit 2  ==0: kernel-mode access  1: user-mode access
> + *   bit 3  ==   1: use of reserved bit detected
> + *   bit 4  ==   1: fault was an instruction fetch
> + *   bit 5  ==   1: protection keys block access
> + *   bit 15 ==   1: inside SGX enclave
>   */
>  enum x86_pf_error_code {
>   X86_PF_PROT =   1 << 0,
> @@ -58,5 +59,6 @@ enum x86_pf_error_code {
>   X86_PF_RSVD =   1 << 3,
>   X86_PF_INSTR=   1 << 4,
>   X86_PF_PK   =   1 << 5,
> + X86_PF_SGX  =   1 << 15,
>  };
>  #endif /* _ASM_X86_TRAPS_H */
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 1ead568c0101..1db6fbd7af8e 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -1055,6 +1055,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
>   if (error_code & X86_PF_PK)
>   return 1;
>  
> + /*
> +  * Access is blocked by the Enclave Page Cache Map (EPCM), i.e. the
> +  * access is allowed by the PTE but not the EPCM. This usually happens
> +  * when the EPCM is yanked out from under us, e.g. by hardware after a
> +  * suspend/resume cycle. In any case, software, i.e. the kernel, can't
> +  * fix the source of the fault as the EPCM can't be directly modified by
> +  * software. Handle the fault as an access error in order to signal
> +  * userspace so that userspace can rebuild their enclave(s), even though
> +  * userspace may not have actually violated access permissions.
> +  */
> + if (unlikely(error_code & X86_PF_SGX))
> + return 1;
> +
>   /*
>* Make sure to check the VMA so that we do not perform
>* faults just to hit a X86_PF_PK as soon as we fill in a
> -- 
> 2.25.1
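
Decoding the error code is a matter of testing bits; a short sketch (bit
positions from the quoted traps.h) shows where bit 15 fits next to the
pre-existing flags:

#include <stdio.h>

#define X86_PF_PROT  (1 << 0)
#define X86_PF_WRITE (1 << 1)
#define X86_PF_USER  (1 << 2)
#define X86_PF_SGX   (1 << 15)

static void decode(unsigned long error_code)
{
	printf("%s %s fault%s\n",
	       error_code & X86_PF_USER ? "user" : "kernel",
	       error_code & X86_PF_WRITE ? "write" : "read",
	       error_code & X86_PF_SGX ? ", inside SGX enclave (EPCM)" : "");
}

int main(void)
{
	decode(X86_PF_USER | X86_PF_WRITE);
	decode(X86_PF_USER | X86_PF_PROT | X86_PF_SGX);	/* SIGSEGV case */
	return 0;
}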


Re: [PATCH v36 08/24] x86/sgx: Initialize metadata for Enclave Page Cache (EPC) sections

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:47 +03, Jarkko Sakkinen wrote:
> From: Sean Christopherson 
>
> Enumerate Enclave Page Cache (EPC) sections via CPUID and add the data
> structures necessary to track EPC pages so that they can be easily borrowed
> for different uses.
>
> Embed section index to the first eight bits of the EPC page descriptor.
> Existing client hardware supports only a single section, while upcoming
> server hardware will support at most eight sections. Thus, eight bits
> should be enough for long term needs.
>
> Acked-by: Jethro Beekman 
> Signed-off-by: Sean Christopherson 
> Co-developed-by: Serge Ayoun 
> Signed-off-by: Serge Ayoun 
> Co-developed-by: Jarkko Sakkinen 
> Signed-off-by: Jarkko Sakkinen 

Reviewed-by: Darren Kenny 

> ---
>  arch/x86/Kconfig |  17 +++
>  arch/x86/kernel/cpu/Makefile |   1 +
>  arch/x86/kernel/cpu/sgx/Makefile |   2 +
>  arch/x86/kernel/cpu/sgx/main.c   | 216 +++
>  arch/x86/kernel/cpu/sgx/sgx.h|  52 
>  5 files changed, 288 insertions(+)
>  create mode 100644 arch/x86/kernel/cpu/sgx/Makefile
>  create mode 100644 arch/x86/kernel/cpu/sgx/main.c
>  create mode 100644 arch/x86/kernel/cpu/sgx/sgx.h
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 883da0abf779..0dea7fdd7a00 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1926,6 +1926,23 @@ config X86_INTEL_TSX_MODE_AUTO
> side channel attacks- equals the tsx=auto command line parameter.
>  endchoice
>  
> +config INTEL_SGX
> + bool "Intel SGX"
> + depends on X86_64 && CPU_SUP_INTEL
> + depends on CRYPTO=y
> + depends on CRYPTO_SHA256=y
> + select SRCU
> + select MMU_NOTIFIER
> + help
> +   Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
> +   that can be used by applications to set aside private regions of code
> +   and data, referred to as enclaves. An enclave's private memory can
> +   only be accessed by code running within the enclave. Accesses from
> +   outside the enclave, including other enclaves, are disallowed by
> +   hardware.
> +
> +   If unsure, say N.
> +
>  config EFI
>   bool "EFI runtime service support"
>   depends on ACPI
> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
> index dba6a83bc349..b00f801601f3 100644
> --- a/arch/x86/kernel/cpu/Makefile
> +++ b/arch/x86/kernel/cpu/Makefile
> @@ -49,6 +49,7 @@ obj-$(CONFIG_X86_MCE)   += mce/
>  obj-$(CONFIG_MTRR)   += mtrr/
>  obj-$(CONFIG_MICROCODE)  += microcode/
>  obj-$(CONFIG_X86_CPU_RESCTRL)+= resctrl/
> +obj-$(CONFIG_INTEL_SGX)  += sgx/
>  
>  obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
>  
> diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> new file mode 100644
> index 000000000000..79510ce01b3b
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/Makefile
> @@ -0,0 +1,2 @@
> +obj-y += \
> + main.o
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> new file mode 100644
> index 000000000000..c5831e3db14a
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -0,0 +1,216 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +// Copyright(c) 2016-17 Intel Corporation.
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "encls.h"
> +
> +struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
> +static int sgx_nr_epc_sections;
> +static struct task_struct *ksgxswapd_tsk;
> +
> +static void sgx_sanitize_section(struct sgx_epc_section *section)
> +{
> + struct sgx_epc_page *page;
> + LIST_HEAD(secs_list);
> + int ret;
> +
> + while (!list_empty(&section->unsanitized_page_list)) {
> + if (kthread_should_stop())
> + return;
> +
> + spin_lock(&section->lock);
> +
> + page = list_first_entry(&section->unsanitized_page_list,
> + struct sgx_epc_page, list);
> +
> + ret = __eremove(sgx_get_epc_addr(page));
> + if (!ret)
> + list_move(&page->list, &section->page_list);
> + else
> + list_move_tail(&page->list, &secs_list);
> +
> + spin_unlock(&section->lock);
> +
> + cond_resched();
> + }
> +}
> +
> +static int ksgxswapd(void *p)
> +{
> + int i;
> +
> + set_freezable();
> +
> + /*
>
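
The "section index in the first eight bits of the page descriptor" scheme
from the commit message is simple to model (a sketch, not the kernel code;
the kernel derives the virtual address from section metadata rather than
storing it whole):

#include <stdint.h>
#include <stdio.h>

/* Low 8 bits: EPC section index; page-aligned upper bits: page address. */
static uint64_t make_desc(uint64_t pa, unsigned int section)
{
	return (pa & ~0xfffULL) | section;
}

int main(void)
{
	uint64_t desc = make_desc(0x180000000ULL, 3);

	printf("section %u, page 0x%llx\n",
	       (unsigned int)(desc & 0xff),
	       (unsigned long long)(desc & ~0xfffULL));
	return 0;
}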

Re: [PATCH v36 15/24] x86/sgx: Allow a limited use of ATTRIBUTE.PROVISIONKEY for attestation

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:54 +03, Jarkko Sakkinen wrote:
> Provisioning Certification Enclave (PCE), the root of trust for other
> enclaves, generates a signing key from a fused key called Provisioning
> Certification Key. PCE can then use this key to certify an attestation key
> of a Quoting Enclave (QE), e.g. we get the chain of trust down to the
> hardware if the Intel signed PCE is used.
>
> To use the needed keys, ATTRIBUTE.PROVISIONKEY is required but should be
> only allowed for those who actually need it so that only the trusted
> parties can certify QE's.
>
> Obviously the attestation service should know the public key of the used
> PCE and that way detect illegit attestation, but whitelisting the legit
> users still adds an additional layer of defence.
>
> Add new device file called /dev/sgx/provision. The sole purpose of this
> file is to provide file descriptors that act as privilege tokens to allow
> to build enclaves with ATTRIBUTE.PROVISIONKEY set. A new ioctl called
> SGX_IOC_ENCLAVE_SET_ATTRIBUTE is used to assign this token to an enclave.
>
> Cc: linux-security-mod...@vger.kernel.org
> Acked-by: Jethro Beekman 
> Suggested-by: Andy Lutomirski 
> Signed-off-by: Jarkko Sakkinen 

Reviewed-by: Darren Kenny 

> ---
>  arch/x86/include/uapi/asm/sgx.h  | 11 
>  arch/x86/kernel/cpu/sgx/driver.c | 18 
>  arch/x86/kernel/cpu/sgx/driver.h |  2 ++
>  arch/x86/kernel/cpu/sgx/ioctl.c  | 47 
>  4 files changed, 78 insertions(+)
>
> diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
> index 5edb08ab8fd0..57d0d30c79b3 100644
> --- a/arch/x86/include/uapi/asm/sgx.h
> +++ b/arch/x86/include/uapi/asm/sgx.h
> @@ -25,6 +25,8 @@ enum sgx_page_flags {
>   _IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)
>  #define SGX_IOC_ENCLAVE_INIT \
>   _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
> +#define SGX_IOC_ENCLAVE_SET_ATTRIBUTE \
> + _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_set_attribute)
>  
>  /**
>   * struct sgx_enclave_create - parameter structure for the
> @@ -63,4 +65,13 @@ struct sgx_enclave_init {
>   __u64 sigstruct;
>  };
>  
> +/**
> + * struct sgx_enclave_set_attribute - parameter structure for the
> + * %SGX_IOC_ENCLAVE_SET_ATTRIBUTE ioctl
> + * @attribute_fd: file handle of the attribute file in the securityfs
> + */
> +struct sgx_enclave_set_attribute {
> + __u64 attribute_fd;
> +};
> +
>  #endif /* _UAPI_ASM_X86_SGX_H */
> diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> index 5559bc18de41..b9af330a16fa 100644
> --- a/arch/x86/kernel/cpu/sgx/driver.c
> +++ b/arch/x86/kernel/cpu/sgx/driver.c
> @@ -138,6 +138,10 @@ static const struct file_operations sgx_encl_fops = {
>   .get_unmapped_area  = sgx_get_unmapped_area,
>  };
>  
> +const struct file_operations sgx_provision_fops = {
> + .owner  = THIS_MODULE,
> +};
> +
>  static struct miscdevice sgx_dev_enclave = {
>   .minor = MISC_DYNAMIC_MINOR,
>   .name = "enclave",
> @@ -145,6 +149,13 @@ static struct miscdevice sgx_dev_enclave = {
>   .fops = &sgx_encl_fops,
>  };
>  
> +static struct miscdevice sgx_dev_provision = {
> + .minor = MISC_DYNAMIC_MINOR,
> + .name = "provision",
> + .nodename = "sgx/provision",
> + .fops = &sgx_provision_fops,
> +};
> +
>  int __init sgx_drv_init(void)
>  {
>   unsigned int eax, ebx, ecx, edx;
> @@ -185,5 +196,12 @@ int __init sgx_drv_init(void)
>   return ret;
>   }
>  
> + ret = misc_register(&sgx_dev_provision);
> + if (ret) {
> + pr_err("Creating /dev/sgx/provision failed with %d.\n", ret);
> + misc_deregister(&sgx_dev_enclave);
> + return ret;
> + }
> +
>   return 0;
>  }
> diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
> index e4063923115b..72747d01c046 100644
> --- a/arch/x86/kernel/cpu/sgx/driver.h
> +++ b/arch/x86/kernel/cpu/sgx/driver.h
> @@ -23,6 +23,8 @@ extern u64 sgx_attributes_reserved_mask;
>  extern u64 sgx_xfrm_reserved_mask;
>  extern u32 sgx_xsave_size_tbl[64];
>  
> +extern const struct file_operations sgx_provision_fops;
> +
>  long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
>  
>  int sgx_drv_init(void);
> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
> index 3444de955191..95b0a1e62ea7 100644
> --- a/arch/x86/kernel/cpu/sgx/ioctl.c
> +++ b/arch/x86/kernel/cpu/sgx/ioctl.c
> @@ -669,6 +669,50 @@ static long sgx_ioc
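
End to end, the token-passing model looks roughly like this from user space
(a sketch; struct layout from the quoted sgx.h, device paths as created by
this series, passing the fd to another process over a unix socket elided):

#include <fcntl.h>
#include <linux/ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define SGX_MAGIC 0xA4
struct sgx_enclave_set_attribute {
	uint64_t attribute_fd;
};
#define SGX_IOC_ENCLAVE_SET_ATTRIBUTE \
	_IOW(SGX_MAGIC, 0x03, struct sgx_enclave_set_attribute)

int main(void)
{
	int enclave_fd = open("/dev/sgx/enclave", O_RDWR);
	int prov_fd = open("/dev/sgx/provision", O_RDONLY);

	if (enclave_fd < 0 || prov_fd < 0) {
		perror("open");
		return 1;
	}

	/* The open file itself is the privilege token. */
	struct sgx_enclave_set_attribute parms = {
		.attribute_fd = (uint64_t)prov_fd,
	};

	if (ioctl(enclave_fd, SGX_IOC_ENCLAVE_SET_ATTRIBUTE, &parms))
		perror("SGX_IOC_ENCLAVE_SET_ATTRIBUTE");
	close(prov_fd);
	close(enclave_fd);
	return 0;
}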

Re: [PATCH v36 11/24] x86/sgx: Add SGX enclave driver

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:50 +03, Jarkko Sakkinen wrote:
> Intel Software Guard eXtensions (SGX) is a set of CPU instructions that can
> be used by applications to set aside private regions of code and data. The
> code outside the SGX hosted software entity is prevented from accessing the
> memory inside the enclave by the CPU. We call these entities enclaves.
>
> Add a driver that provides an ioctl API to construct and run enclaves.
> Enclaves are constructed from pages residing in reserved physical memory
> areas. The contents of these pages can only be accessed when they are
> mapped as part of an enclave, by a hardware thread running inside the
> enclave.
>
> The starting state of an enclave consists of a fixed measured set of
> pages that are copied to the EPC during the construction process by
> using ENCLS leaf functions and Software Enclave Control Structure (SECS)
> that defines the enclave properties.
>
> Enclaves are constructed by using ENCLS leaf functions ECREATE, EADD and
> EINIT. ECREATE initializes SECS, EADD copies pages from system memory to
> the EPC and EINIT checks a given signed measurement and moves the enclave
> into a state ready for execution.
>
> An initialized enclave can only be accessed through special Thread Control
> Structure (TCS) pages by using ENCLU (ring-3 only) leaf EENTER.  This leaf
> function converts a thread into enclave mode and continues the execution in
> the offset defined by the TCS provided to EENTER. An enclave is exited
> through syscall, exception, interrupts or by explicitly calling another
> ENCLU leaf EEXIT.
>
> The mmap() permissions are capped by the contained enclave page
> permissions. The mapped areas must also be opaque, i.e. each page address
> must contain a page. This logic is implemented in sgx_encl_may_map().
>
> Cc: linux-security-mod...@vger.kernel.org
> Cc: linux...@kvack.org
> Cc: Andrew Morton 
> Cc: Matthew Wilcox 
> Acked-by: Jethro Beekman 
> Tested-by: Jethro Beekman 
> Tested-by: Haitao Huang 
> Tested-by: Chunyang Hui 
> Tested-by: Jordan Hand 
> Tested-by: Nathaniel McCallum 
> Tested-by: Seth Moore 

Tested-by: Darren Kenny 
Reviewed-by: Darren Kenny 

> Co-developed-by: Sean Christopherson 
> Signed-off-by: Sean Christopherson 
> Co-developed-by: Suresh Siddha 
> Signed-off-by: Suresh Siddha 
> Signed-off-by: Jarkko Sakkinen 
> ---
>  arch/x86/kernel/cpu/sgx/Makefile |   2 +
>  arch/x86/kernel/cpu/sgx/driver.c | 177 
>  arch/x86/kernel/cpu/sgx/driver.h |  29 +++
>  arch/x86/kernel/cpu/sgx/encl.c   | 333 +++
>  arch/x86/kernel/cpu/sgx/encl.h   |  87 
>  arch/x86/kernel/cpu/sgx/main.c   |  11 +
>  6 files changed, 639 insertions(+)
>  create mode 100644 arch/x86/kernel/cpu/sgx/driver.c
>  create mode 100644 arch/x86/kernel/cpu/sgx/driver.h
>  create mode 100644 arch/x86/kernel/cpu/sgx/encl.c
>  create mode 100644 arch/x86/kernel/cpu/sgx/encl.h
>
> diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> index 79510ce01b3b..3fc451120735 100644
> --- a/arch/x86/kernel/cpu/sgx/Makefile
> +++ b/arch/x86/kernel/cpu/sgx/Makefile
> @@ -1,2 +1,4 @@
>  obj-y += \
> + driver.o \
> + encl.o \
>   main.o
> diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> new file mode 100644
> index ..b52520407f5b
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/driver.c
> @@ -0,0 +1,177 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +// Copyright(c) 2016-18 Intel Corporation.
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "driver.h"
> +#include "encl.h"
> +
> +MODULE_DESCRIPTION("Intel SGX Enclave Driver");
> +MODULE_AUTHOR("Jarkko Sakkinen ");
> +MODULE_LICENSE("Dual BSD/GPL");
> +
> +u64 sgx_encl_size_max_32;
> +u64 sgx_encl_size_max_64;
> +u32 sgx_misc_reserved_mask;
> +u64 sgx_attributes_reserved_mask;
> +u64 sgx_xfrm_reserved_mask = ~0x3;
> +u32 sgx_xsave_size_tbl[64];
> +
> +static int sgx_open(struct inode *inode, struct file *file)
> +{
> + struct sgx_encl *encl;
> + int ret;
> +
> + encl = kzalloc(sizeof(*encl), GFP_KERNEL);
> + if (!encl)
> + return -ENOMEM;
> +
> + atomic_set(&encl->flags, 0);
> + kref_init(&encl->refcount);
> + xa_init(&encl->page_array);
> + mutex_init(&encl->lock);
> + INIT_LIST_HEAD(&encl->mm_list);
> + spin_lock_init(&encl->mm_lock);
> +
> + ret = init_srcu_struct(&encl->srcu);
> + if (ret) {
> + kfree(encl);
> + return ret;
> + }
&

Re: [PATCH v36 02/24] x86/cpufeatures: x86/msr: Add Intel SGX Launch Control hardware bits

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:41 +03, Jarkko Sakkinen wrote:
> From: Sean Christopherson 
>
> Add X86_FEATURE_SGX_LC, which informs whether or not the CPU supports SGX
> Launch Control.
>
> Add MSR_IA32_SGXLEPUBKEYHASH{0, 1, 2, 3}, which when combined contain a
> SHA256 hash of a 3072-bit RSA public key. SGX backed software packages, so
> called enclaves, are always signed. All enclaves signed with the public key
> are unconditionally allowed to initialize. [1]
>
> Add FEAT_CTL_SGX_LC_ENABLED, which informs whether the aforementioned MSRs
> are writable or not. If the bit is off, the public key MSRs are read-only
> for the OS.
>
> If the MSRs are read-only, the platform must provide a launch enclave (LE).
> LE can create cryptographic tokens for other enclaves that they can pass
> together with their signature to the ENCLS(EINIT) opcode, which is used
> to initialize enclaves.
>
> Linux is unlikely to support the locked configuration because it takes away
> the control of the launch decisions from the kernel.
>
> [1] Intel SDM: 38.1.4 Intel SGX Launch Control Configuration
>
> Reviewed-by: Borislav Petkov 
> Acked-by: Jethro Beekman 
> Signed-off-by: Sean Christopherson 
> Co-developed-by: Jarkko Sakkinen 
> Signed-off-by: Jarkko Sakkinen 

Reviewed-by: Darren Kenny 

> ---
>  arch/x86/include/asm/cpufeatures.h | 1 +
>  arch/x86/include/asm/msr-index.h   | 7 +++
>  2 files changed, 8 insertions(+)
>
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index 545ac3e0e269..0a4541e4f076 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -352,6 +352,7 @@
>  #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */
>  #define X86_FEATURE_MOVDIRI  (16*32+27) /* MOVDIRI instruction */
>  #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */
> +#define X86_FEATURE_SGX_LC   (16*32+30) /* Software Guard Extensions Launch Control */
>  
>  /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
>  #define X86_FEATURE_OVERFLOW_RECOV   (17*32+ 0) /* MCA overflow recovery support */
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 18e08da19f16..3d7c89a8533f 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -582,6 +582,7 @@
>  #define FEAT_CTL_LOCKED  BIT(0)
>  #define FEAT_CTL_VMX_ENABLED_INSIDE_SMX  BIT(1)
>  #define FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX BIT(2)
> +#define FEAT_CTL_SGX_LC_ENABLED  BIT(17)
>  #define FEAT_CTL_SGX_ENABLED BIT(18)
>  #define FEAT_CTL_LMCE_ENABLEDBIT(20)
>  
> @@ -602,6 +603,12 @@
>  #define MSR_IA32_UCODE_WRITE 0x00000079
>  #define MSR_IA32_UCODE_REV   0x0000008b
>  
> +/* Intel SGX Launch Enclave Public Key Hash MSRs */
> +#define MSR_IA32_SGXLEPUBKEYHASH0 0x0000008C
> +#define MSR_IA32_SGXLEPUBKEYHASH1 0x0000008D
> +#define MSR_IA32_SGXLEPUBKEYHASH2 0x0000008E
> +#define MSR_IA32_SGXLEPUBKEYHASH3 0x0000008F
> +
>  #define MSR_IA32_SMM_MONITOR_CTL 0x0000009b
>  #define MSR_IA32_SMBASE  0x0000009e
>  
> -- 
> 2.25.1
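
The four MSRs together hold a single 32-byte SHA-256 digest of the launch
enclave signer's 3072-bit RSA modulus, 64 bits per register; a sketch of the
split (digest value made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint8_t sha256[32] = { 0xde, 0xad, 0xbe, 0xef };	/* placeholder digest */
	uint64_t msr[4];	/* MSR_IA32_SGXLEPUBKEYHASH0..3 */

	memcpy(msr, sha256, sizeof(msr));	/* 4 x 64 bits == 32 bytes */
	for (int i = 0; i < 4; i++)
		printf("MSR_IA32_SGXLEPUBKEYHASH%d = 0x%016llx\n",
		       i, (unsigned long long)msr[i]);
	return 0;
}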


Re: [PATCH v36 07/24] x86/cpu/intel: Add nosgx kernel parameter

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:46 +03, Jarkko Sakkinen wrote:
> Add kernel parameter to disable Intel SGX kernel support.
>
> Tested-by: Sean Christopherson 
> Reviewed-by: Sean Christopherson 
> Signed-off-by: Jarkko Sakkinen 

Reviewed-by: Darren Kenny 

> ---
>  Documentation/admin-guide/kernel-parameters.txt | 2 ++
>  arch/x86/kernel/cpu/feat_ctl.c  | 9 +
>  2 files changed, 11 insertions(+)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index fb95fad81c79..e747bd9ca911 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -3314,6 +3314,8 @@
>  
>   nosep   [BUGS=X86-32] Disables x86 SYSENTER/SYSEXIT support.
>  
> + nosgx   [X86-64,SGX] Disables Intel SGX kernel support.
> +
>   nosmp   [SMP] Tells an SMP kernel to act as a UP kernel,
>   and disable the IO APIC.  legacy for "maxcpus=0".
>  
> diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
> index c3afcd2e4342..1837df39527f 100644
> --- a/arch/x86/kernel/cpu/feat_ctl.c
> +++ b/arch/x86/kernel/cpu/feat_ctl.c
> @@ -101,6 +101,15 @@ static void clear_sgx_caps(void)
>   setup_clear_cpu_cap(X86_FEATURE_SGX2);
>  }
>  
> +static int __init nosgx(char *str)
> +{
> + clear_sgx_caps();
> +
> + return 0;
> +}
> +
> +early_param("nosgx", nosgx);
> +
>  void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
>  {
>   bool tboot = tboot_enabled();
> -- 
> 2.25.1


Re: [PATCH v36 09/24] x86/sgx: Add __sgx_alloc_epc_page() and sgx_free_epc_page()

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:48 +03, Jarkko Sakkinen wrote:
> Add __sgx_alloc_epc_page(), which iterates through EPC sections and borrows
> a page structure that is not used by anyone else. When a page is no longer
> needed it must be released with sgx_free_epc_page(). This function
> implicitly calls ENCLS[EREMOVE], which will return the page to the
> uninitialized state (i.e. not required from caller part).
>
> Acked-by: Jethro Beekman 
> Co-developed-by: Sean Christopherson 
> Signed-off-by: Sean Christopherson 
> Signed-off-by: Jarkko Sakkinen 

Reviewed-by: Darren Kenny 

> ---
>  arch/x86/kernel/cpu/sgx/main.c | 62 ++
>  arch/x86/kernel/cpu/sgx/sgx.h  |  3 ++
>  2 files changed, 65 insertions(+)
>
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index c5831e3db14a..97c6895fb6c9 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -83,6 +83,68 @@ static bool __init sgx_page_reclaimer_init(void)
>   return true;
>  }
>  
> +static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
> +{
> + struct sgx_epc_page *page;
> +
> + if (list_empty(&section->page_list))
> + return NULL;
> +
> + page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
> + list_del_init(&page->list);
> +
> + return page;
> +}
> +
> +/**
> + * __sgx_alloc_epc_page() - Allocate an EPC page
> + *
> + * Iterate through EPC sections and borrow a free EPC page to the caller. When a
> + * page is no longer needed it must be released with sgx_free_epc_page().
> + *
> + * Return:
> + *   an EPC page,
> + *   -errno on error
> + */
> +struct sgx_epc_page *__sgx_alloc_epc_page(void)
> +{
> + struct sgx_epc_section *section;
> + struct sgx_epc_page *page;
> + int i;
> +
> + for (i = 0; i < sgx_nr_epc_sections; i++) {
> + section = &sgx_epc_sections[i];
> + spin_lock(&section->lock);
> + page = __sgx_alloc_epc_page_from_section(section);
> + spin_unlock(&section->lock);
> +
> + if (page)
> + return page;
> + }
> +
> + return ERR_PTR(-ENOMEM);
> +}
> +
> +/**
> + * sgx_free_epc_page() - Free an EPC page
> + * @page:an EPC page
> + *
> + * Call EREMOVE for an EPC page and insert it back to the list of free pages.
> + */
> +void sgx_free_epc_page(struct sgx_epc_page *page)
> +{
> + struct sgx_epc_section *section = sgx_get_epc_section(page);
> + int ret;
> +
> + ret = __eremove(sgx_get_epc_addr(page));
> + if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
> + return;
> +
> + spin_lock(&section->lock);
> + list_add_tail(&page->list, &section->page_list);
> + spin_unlock(&section->lock);
> +}
> +
>  static void __init sgx_free_epc_section(struct sgx_epc_section *section)
>  {
>   struct sgx_epc_page *page;
> diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
> index dff4f5f16d09..fce756c3434b 100644
> --- a/arch/x86/kernel/cpu/sgx/sgx.h
> +++ b/arch/x86/kernel/cpu/sgx/sgx.h
> @@ -49,4 +49,7 @@ static inline void *sgx_get_epc_addr(struct sgx_epc_page *page)
>   return section->va + (page->desc & PAGE_MASK) - section->pa;
>  }
>  
> +struct sgx_epc_page *__sgx_alloc_epc_page(void);
> +void sgx_free_epc_page(struct sgx_epc_page *page);
> +
>  #endif /* _X86_SGX_H */
> -- 
> 2.25.1
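
A toy model of the borrow/return contract above (one free list, no locking,
no EREMOVE) makes the ownership rule concrete:

#include <stdio.h>

/* Pages are borrowed from a free list and must be returned to it. */
struct epc_page { struct epc_page *next; };

static struct epc_page *free_list;

static struct epc_page *alloc_epc_page(void)
{
	struct epc_page *page = free_list;

	if (page)
		free_list = page->next;
	return page;	/* NULL when the section is exhausted */
}

static void free_epc_page(struct epc_page *page)
{
	page->next = free_list;
	free_list = page;
}

int main(void)
{
	struct epc_page pages[2] = { { &pages[1] }, { NULL } };
	struct epc_page *p;

	free_list = &pages[0];
	p = alloc_epc_page();
	printf("borrowed %p\n", (void *)p);
	free_epc_page(p);
	return 0;
}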


Re: [PATCH v36 14/24] x86/sgx: Add SGX_IOC_ENCLAVE_INIT

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:53 +03, Jarkko Sakkinen wrote:
> Add an ioctl that performs ENCLS[EINIT], which locks down the measurement
> and initializes the enclave for entrance. After this, new pages can no
> longer be added.
>
> Acked-by: Jethro Beekman 
> Tested-by: Jethro Beekman 
> Tested-by: Haitao Huang 
> Tested-by: Chunyang Hui 
> Tested-by: Jordan Hand 
> Tested-by: Nathaniel McCallum 
> Tested-by: Seth Moore 

Tested-by: Darren Kenny 
Reviewed-by: Darren Kenny 

> Co-developed-by: Sean Christopherson 
> Signed-off-by: Sean Christopherson 
> Co-developed-by: Suresh Siddha 
> Signed-off-by: Suresh Siddha 
> Signed-off-by: Jarkko Sakkinen 
> ---
>  arch/x86/include/uapi/asm/sgx.h |  11 ++
>  arch/x86/kernel/cpu/sgx/ioctl.c | 188 
>  2 files changed, 199 insertions(+)
>
> diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
> index c8f199b3fb6f..5edb08ab8fd0 100644
> --- a/arch/x86/include/uapi/asm/sgx.h
> +++ b/arch/x86/include/uapi/asm/sgx.h
> @@ -23,6 +23,8 @@ enum sgx_page_flags {
>   _IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
>  #define SGX_IOC_ENCLAVE_ADD_PAGES \
>   _IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)
> +#define SGX_IOC_ENCLAVE_INIT \
> + _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
>  
>  /**
>   * struct sgx_enclave_create - parameter structure for the
> @@ -52,4 +54,13 @@ struct sgx_enclave_add_pages {
>   __u64   count;
>  };
>  
> +/**
> + * struct sgx_enclave_init - parameter structure for the
> + *   %SGX_IOC_ENCLAVE_INIT ioctl
> + * @sigstruct:   address for the SIGSTRUCT data
> + */
> +struct sgx_enclave_init {
> + __u64 sigstruct;
> +};
> +
>  #endif /* _UAPI_ASM_X86_SGX_H */
> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
> index c63a51362d14..3444de955191 100644
> --- a/arch/x86/kernel/cpu/sgx/ioctl.c
> +++ b/arch/x86/kernel/cpu/sgx/ioctl.c
> @@ -16,6 +16,9 @@
>  #include "encl.h"
>  #include "encls.h"
>  
> +/* A per-cpu cache for the last known values of IA32_SGXLEPUBKEYHASHx MSRs. */
> +static DEFINE_PER_CPU(u64 [4], sgx_lepubkeyhash_cache);
> +
>  static u32 sgx_calc_ssa_frame_size(u32 miscselect, u64 xfrm)
>  {
>   u32 size_max = PAGE_SIZE;
> @@ -485,6 +488,188 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
>   return ret;
>  }
>  
> +static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
> +   void *hash)
> +{
> + SHASH_DESC_ON_STACK(shash, tfm);
> +
> + shash->tfm = tfm;
> +
> + return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
> +}
> +
> +static int sgx_get_key_hash(const void *modulus, void *hash)
> +{
> + struct crypto_shash *tfm;
> + int ret;
> +
> + tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
> + if (IS_ERR(tfm))
> + return PTR_ERR(tfm);
> +
> + ret = __sgx_get_key_hash(tfm, modulus, hash);
> +
> + crypto_free_shash(tfm);
> + return ret;
> +}
> +
> +static void sgx_update_lepubkeyhash_msrs(u64 *lepubkeyhash, bool enforce)
> +{
> + u64 *cache;
> + int i;
> +
> + cache = per_cpu(sgx_lepubkeyhash_cache, smp_processor_id());
> + for (i = 0; i < 4; i++) {
> + if (enforce || (lepubkeyhash[i] != cache[i])) {
> + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
> + cache[i] = lepubkeyhash[i];
> + }
> + }
> +}
> +
> +static int sgx_einit(struct sgx_sigstruct *sigstruct, void *token,
> +  struct sgx_epc_page *secs, u64 *lepubkeyhash)
> +{
> + int ret;
> +
> + preempt_disable();
> + sgx_update_lepubkeyhash_msrs(lepubkeyhash, false);
> + ret = __einit(sigstruct, token, sgx_get_epc_addr(secs));
> + if (ret == SGX_INVALID_EINITTOKEN) {
> + sgx_update_lepubkeyhash_msrs(lepubkeyhash, true);
> + ret = __einit(sigstruct, token, sgx_get_epc_addr(secs));
> + }
> + preempt_enable();
> + return ret;
> +}
> +
> +static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct 
> *sigstruct,
> +  void *token)
> +{
> + u64 mrsigner[4];
> + int ret;
> + int i;
> + int j;
> +
> + /* Check that the required attributes have been authorized. */
> + if (encl->secs_attributes & ~encl->allowed_attributes)
> + return -EACCES;
> +
> + ret = sgx_get_key_hash(sigstruct->mod
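
The per-CPU MSR cache in sgx_update_lepubkeyhash_msrs() is the interesting
trick: MSR writes are slow, so a hash word is only rewritten when it differs
from the last value written on that CPU (or when a failed EINIT forces a
resync). A stand-alone model of that logic:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t cache[4];	/* last values "written" on this CPU */

static void update_lepubkeyhash(const uint64_t *hash, bool enforce)
{
	for (int i = 0; i < 4; i++) {
		if (enforce || hash[i] != cache[i]) {
			printf("wrmsr hash%d = 0x%llx\n", i,
			       (unsigned long long)hash[i]);	/* stands in for wrmsrl() */
			cache[i] = hash[i];
		}
	}
}

int main(void)
{
	uint64_t h[4] = { 1, 2, 3, 4 };

	update_lepubkeyhash(h, false);	/* four writes */
	update_lepubkeyhash(h, false);	/* cache hit - no writes */
	update_lepubkeyhash(h, true);	/* enforce - four writes again */
	return 0;
}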

Re: [PATCH v36 10/24] mm: Add vm_ops->mprotect()

2020-08-06 Thread Darren Kenny
On Thursday, 2020-07-16 at 16:52:49 +03, Jarkko Sakkinen wrote:
> From: Sean Christopherson 
>
> Add vm_ops()->mprotect() for additional constraints for a VMA.
>
> Intel Software Guard eXtensions (SGX) will use this callback to add two
> constraints:
>
> 1. Verify that the address range does not have holes: each page address
>must be filled with an enclave page.
> 2. Verify that VMA permissions won't surpass the permissions of any enclave
>page within the address range. Enclave cryptographically sealed
>permissions for each page address that set the upper limit for possible
>VMA permissions. Not respecting this can cause #GP's to be emitted.
>
> Cc: linux...@kvack.org
> Cc: Andrew Morton 
> Cc: Matthew Wilcox 
> Acked-by: Jethro Beekman 
> Signed-off-by: Sean Christopherson 
> Co-developed-by: Jarkko Sakkinen 
> Signed-off-by: Jarkko Sakkinen 

Reviewed-by: Darren Kenny 

> ---
>  include/linux/mm.h | 3 +++
>  mm/mprotect.c  | 5 -
>  2 files changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index dc7b87310c10..458e8cb99aaf 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -542,6 +542,9 @@ struct vm_operations_struct {
>   void (*close)(struct vm_area_struct * area);
>   int (*split)(struct vm_area_struct * area, unsigned long addr);
>   int (*mremap)(struct vm_area_struct * area);
> + int (*mprotect)(struct vm_area_struct *vma,
> + struct vm_area_struct **pprev, unsigned long start,
> + unsigned long end, unsigned long newflags);
>   vm_fault_t (*fault)(struct vm_fault *vmf);
>   vm_fault_t (*huge_fault)(struct vm_fault *vmf,
>   enum page_entry_size pe_size);
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index ce8b8a5eacbb..f170f3da8a4f 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -610,7 +610,10 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
>   tmp = vma->vm_end;
>   if (tmp > end)
>   tmp = end;
> - error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
> + if (vma->vm_ops && vma->vm_ops->mprotect)
> + error = vma->vm_ops->mprotect(vma, &prev, nstart, tmp, newflags);
> + else
> + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
>   if (error)
>   goto out;
>   nstart = tmp;
> -- 
> 2.25.1
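
With this version of the hook, a driver that installs ->mprotect takes over
the whole step: once its own checks pass, it must call mprotect_fixup()
itself or the VMA flags are never updated. A hedged sketch of a driver-side
implementation (the my_drv_* names are hypothetical; mprotect_fixup() is the
real helper being wrapped):

	static int my_drv_mprotect(struct vm_area_struct *vma,
				   struct vm_area_struct **pprev,
				   unsigned long start, unsigned long end,
				   unsigned long newflags)
	{
		/* Hypothetical check: refuse flags the backing object
		 * forbids, cf. the two SGX constraints described above. */
		if (newflags & ~my_drv_allowed_vm_flags(vma, start, end))
			return -EACCES;

		/* Defer to the generic code for the actual update. */
		return mprotect_fixup(vma, pprev, start, end, newflags);
	}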


Re: [PATCH v32 12/21] x86/sgx: Add provisioning

2020-06-05 Thread Darren Kenny


Hi Jarkko,

Just a couple of nits below...

On Monday, 2020-06-01 at 10:52:09 +03, Jarkko Sakkinen wrote:
> In order to provide a mechanism for devilering provisoning rights:

TYPO: s/devilering/delivering/?

>
> 1. Add a new device file /dev/sgx/provision that works as a token for
>allowing an enclave to have the provisioning privileges.
> 2. Add a new ioctl called SGX_IOC_ENCLAVE_SET_ATTRIBUTE that accepts the
>following data structure:
>
>struct sgx_enclave_set_attribute {
>__u64 addr;
>__u64 attribute_fd;
>};
>
> A daemon could sit on top of /dev/sgx/provision and send a file
> descriptor of this file to a process that needs to be able to provision
> enclaves.
>
> The way this API is used is straight-forward. Lets assume that dev_fd is
> a handle to /dev/sgx/enclave and prov_fd is a handle to
> /dev/sgx/provision.  You would allow SGX_IOC_ENCLAVE_CREATE to
> initialize an enclave with the PROVISIONKEY attribute by
>
> params.addr = ;
> params.token_fd = prov_fd;
>
> ioctl(dev_fd, SGX_IOC_ENCLAVE_SET_ATTRIBUTE, );
>
> Cc: linux-security-mod...@vger.kernel.org
> Acked-by: Jethro Beekman 
> Suggested-by: Andy Lutomirski 
> Signed-off-by: Jarkko Sakkinen 
> ---
>  arch/x86/include/uapi/asm/sgx.h  | 11 
>  arch/x86/kernel/cpu/sgx/driver.c | 14 ++
>  arch/x86/kernel/cpu/sgx/driver.h |  2 ++
>  arch/x86/kernel/cpu/sgx/ioctl.c  | 47 
>  4 files changed, 74 insertions(+)
>
> diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
> index 5edb08ab8fd0..57d0d30c79b3 100644
> --- a/arch/x86/include/uapi/asm/sgx.h
> +++ b/arch/x86/include/uapi/asm/sgx.h
> @@ -25,6 +25,8 @@ enum sgx_page_flags {
>   _IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)
>  #define SGX_IOC_ENCLAVE_INIT \
>   _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
> +#define SGX_IOC_ENCLAVE_SET_ATTRIBUTE \
> + _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_set_attribute)
>  
>  /**
>   * struct sgx_enclave_create - parameter structure for the
> @@ -63,4 +65,13 @@ struct sgx_enclave_init {
>   __u64 sigstruct;
>  };
>  
> +/**
> + * struct sgx_enclave_set_attribute - parameter structure for the
> + * %SGX_IOC_ENCLAVE_SET_ATTRIBUTE ioctl
> + * @attribute_fd: file handle of the attribute file in the securityfs
> + */
> +struct sgx_enclave_set_attribute {
> + __u64 attribute_fd;
> +};
> +
>  #endif /* _UAPI_ASM_X86_SGX_H */
> diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> index b4aa7b9f8376..d90114cec1c3 100644
> --- a/arch/x86/kernel/cpu/sgx/driver.c
> +++ b/arch/x86/kernel/cpu/sgx/driver.c
> @@ -150,6 +150,13 @@ static struct miscdevice sgx_dev_enclave = {
>   .fops = &sgx_encl_fops,
>  };
>  
> +static struct miscdevice sgx_dev_provision = {
> + .minor = MISC_DYNAMIC_MINOR,
> + .name = "provision",
> + .nodename = "sgx/provision",
> + .fops = &sgx_provision_fops,
> +};
> +
>  int __init sgx_drv_init(void)
>  {
>   unsigned int eax, ebx, ecx, edx;
> @@ -190,5 +197,12 @@ int __init sgx_drv_init(void)
>   return ret;
>   }
>  
> + ret = misc_register(&sgx_dev_provision);
> + if (ret) {
> + pr_err("Creating /dev/sgx/provision failed with %d.\n", ret);
> + misc_deregister(&sgx_dev_enclave);
> + return ret;
> + }
> +
>   return 0;
>  }
> diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
> index e4063923115b..72747d01c046 100644
> --- a/arch/x86/kernel/cpu/sgx/driver.h
> +++ b/arch/x86/kernel/cpu/sgx/driver.h
> @@ -23,6 +23,8 @@ extern u64 sgx_attributes_reserved_mask;
>  extern u64 sgx_xfrm_reserved_mask;
>  extern u32 sgx_xsave_size_tbl[64];
>  
> +extern const struct file_operations sgx_provision_fops;
> +
>  long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
>  
>  int sgx_drv_init(void);
> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
> index 757cb9a4ae70..713bce437659 100644
> --- a/arch/x86/kernel/cpu/sgx/ioctl.c
> +++ b/arch/x86/kernel/cpu/sgx/ioctl.c
> @@ -669,6 +669,50 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
>   return ret;
>  }
>  
> +/**
> + * sgx_ioc_enclave_set_attribute - handler for %SGX_IOC_ENCLAVE_SET_ATTRIBUTE
> + * @filep:   open file to /dev/sgx
> + * @arg: userspace pointer to a struct sgx_enclave_set_attribute instance
> + *
> + * Mark the enclave as being allowed to access a restricted attribute bit.
> + * The requested attribute is specified via the attribute_fd field in the
> + * provided struct sgx_enclave_set_attribute.  The attribute_fd must be a
> + * handle to an SGX attribute file, e.g. €/dev/sgx/provision".

Maybe this should be simply a double-quote rather than the Unicode left
quote?

Thanks,

Darren.

> + *
> + * Failure to explicitly request access to a restricted attribute will cause
> + * 
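
A minimal userspace sketch of the flow the commit message describes, assuming
the uapi header above (error handling omitted; note the final struct carries
only attribute_fd, so there is no addr field to fill despite the older
example in the commit text):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <asm/sgx.h>

	/* dev_fd is an open handle to /dev/sgx/enclave. */
	int allow_provisioning(int dev_fd)
	{
		int prov_fd = open("/dev/sgx/provision", O_RDONLY);
		struct sgx_enclave_set_attribute params = {
			.attribute_fd = prov_fd,
		};

		/* Permits creating this enclave with PROVISIONKEY set. */
		return ioctl(dev_fd, SGX_IOC_ENCLAVE_SET_ATTRIBUTE, &params);
	}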

Re: [PATCH] KVM: X86: Limit timer frequency with more smaller interval

2018-04-30 Thread Darren Kenny

Hi Wanpeng Li,

On Sun, Apr 29, 2018 at 07:38:20PM -0700, Wanpeng Li wrote:

From: Wanpeng Li <wanpen...@tencent.com>

Anthoine reported:
The period used by Windows change over time but it can be 1 milliseconds
or less. I saw the limit_periodic_timer_frequency print so 500
microseconds is sometimes reached.

This patchs limits timer frequency with more smaller interval 200ms(5000Hz)
to leave some headroom as Paolo suggested since Windows 10 changed the
scheduler tick limit from 1024 Hz to 2048 Hz.


I would suggest re-writing this slightly, removing the 'this patch'
as some people suggested in other threads, and maybe some other
small re-wording, e.g.:

 As suggested by Paolo, lower the timer frequency limit to a
 smaller interval of 200us (5000Hz) to leave some headroom. This
 is required due to Windows 10 changing the scheduler tick limit
 from 1024 Hz to 2048 Hz.

Also, in the subject line, maybe write it as 'Lower timer frequency
limit to 200us'?



Reported-by: Anthoine Bourgeois <anthoine.bourge...@blade-group.com>
Suggested-by: Paolo Bonzini <pbonz...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Radim Krčmář <rkrc...@redhat.com>
Cc: Anthoine Bourgeois <anthoine.bourge...@blade-group.com>
Signed-off-by: Wanpeng Li <wanpen...@tencent.com>


With those changes (or similar):

Reviewed-by: Darren Kenny <darren.ke...@oracle.com>


---
arch/x86/kvm/x86.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 51ecd38..dc47073 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -114,7 +114,7 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
static bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);

-unsigned int min_timer_period_us = 500;
+unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);

static bool __read_mostly kvmclock_periodic_sync = true;
--
2.7.4
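
One unit nit for the record: the knob being lowered is min_timer_period_us,
so the floor is in microseconds, i.e. 200us, not the "200ms" in the quoted
description. The arithmetic behind the numbers:

	1s / 200us = 1,000,000 / 200 = 5,000 ticks/s (5000Hz, the new floor)
	1s / 500us = 1,000,000 / 500 = 2,000 ticks/s (the old floor)

	Windows 10's 2048Hz scheduler tick has a period of about
	1,000,000 / 2048 ~= 488us, which trips the old 500us floor.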



Re: [PATCH net V2] vhost: correctly remove wait queue during poll failure

2018-03-27 Thread Darren Kenny

On Tue, Mar 27, 2018 at 08:50:52PM +0800, Jason Wang wrote:

We tried to remove vq poll from wait queue, but do not check whether
or not it was in a list before. This will lead double free. Fixing
this by switching to use vhost_poll_stop() which zeros poll->wqh after
removing poll from waitqueue to make sure it won't be freed twice.

Cc: Darren Kenny <darren.ke...@oracle.com>
Reported-by: syzbot+c0272972b01b872e6...@syzkaller.appspotmail.com
Fixes: 2b8b328b61c79 ("vhost_net: handle polling errors when setting backend")
Signed-off-by: Jason Wang <jasow...@redhat.com>


Reviewed-by: Darren Kenny <darren.ke...@oracle.com>


---
Changes from V1:
- tweak the commit log to match the code
---
drivers/vhost/vhost.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1b3e8d2d..5d5a9d9 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -212,8 +212,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)
if (mask)
vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
if (mask & EPOLLERR) {
-   if (poll->wqh)
-   remove_wait_queue(poll->wqh, &poll->wait);
+   vhost_poll_stop(poll);
ret = -EINVAL;
}

--
2.7.4
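
The reason vhost_poll_stop() is safe to call here even when the poll was
never armed: it both checks and clears poll->wqh, so a later call (e.g. at
teardown) turns into a no-op instead of a second remove_wait_queue(). At the
time it looked roughly like this (paraphrased from drivers/vhost/vhost.c):

	void vhost_poll_stop(struct vhost_poll *poll)
	{
		if (poll->wqh) {
			remove_wait_queue(poll->wqh, &poll->wait);
			poll->wqh = NULL;
		}
	}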



Re: [PATCH net] vhost: correctly remove wait queue during poll failure

2018-03-27 Thread Darren Kenny

Hi Jason,

On Tue, Mar 27, 2018 at 11:47:22AM +0800, Jason Wang wrote:

We tried to remove vq poll from wait queue, but do not check whether
or not it was in a list before. This will lead double free. Fixing
this by checking poll->wqh to make sure it was in a list.


This text seems at odds with the code below; instead of checking
poll->wqh, you are removing that check...

Maybe the text needs rewording?

Thanks,

Darren.



Reported-by: syzbot+c0272972b01b872e6...@syzkaller.appspotmail.com
Fixes: 2b8b328b61c79 ("vhost_net: handle polling errors when setting backend")
Signed-off-by: Jason Wang 
---
drivers/vhost/vhost.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1b3e8d2d..5d5a9d9 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -212,8 +212,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)
if (mask)
vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
if (mask & EPOLLERR) {
-   if (poll->wqh)
-   remove_wait_queue(poll->wqh, &poll->wait);
+   vhost_poll_stop(poll);
ret = -EINVAL;
}

--
2.7.4



Re: [PATCH] vhost: fix vhost ioctl signature to build with clang

2018-03-15 Thread Darren Kenny

On Wed, Mar 14, 2018 at 10:05:06AM -0700, Sonny Rao wrote:

Clang is particularly anal about signed vs unsigned comparisons and
doesn't like the fact that some ioctl numbers set the MSB, so we get
this error when trying to build vhost on aarch64:

drivers/vhost/vhost.c:1400:7: error: overflow converting case value to
switch condition type (3221794578 to 18446744072636378898)
[-Werror, -Wswitch]
   case VHOST_GET_VRING_BASE:

3221794578 is 0xC008AF12 in hex
18446744072636378898 is 0xFFFFFFFFC008AF12 in hex

Fix this by using unsigned ints in the function signature for
vhost_vring_ioctl().

Signed-off-by: Sonny Rao <sonny...@chromium.org>


Reviewed-by: Darren Kenny <darren.ke...@oracle.com>

All the other callers of this function already appear to assume that
it is an unsigned int.

Thanks,

Darren.


---
drivers/vhost/vhost.c | 2 +-
drivers/vhost/vhost.h | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1b3e8d2d5c8b4..5316319d84081 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1337,7 +1337,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
return -EFAULT;
}

-long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
+long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
struct file *eventfp, *filep = NULL;
bool pollstart = false, pollstop = false;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index ac4b6056f19ae..d8ee85ae8fdcc 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -45,7 +45,7 @@ void vhost_poll_stop(struct vhost_poll *poll);
void vhost_poll_flush(struct vhost_poll *poll);
void vhost_poll_queue(struct vhost_poll *poll);
void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work);
-long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp);
+long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);

struct vhost_log {
u64 addr;
@@ -177,7 +177,7 @@ void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *);
void vhost_dev_cleanup(struct vhost_dev *);
void vhost_dev_stop(struct vhost_dev *);
long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
-long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp);
+long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
int vhost_vq_access_ok(struct vhost_virtqueue *vq);
int vhost_log_access_ok(struct vhost_dev *);

--
2.13.5
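
The numbers in the quoted error are plain C integer conversion at work:
0xC008AF12 does not fit in a signed 32-bit int, and once it is treated as a
negative int and widened it sign-extends. A small standalone illustration
(ordinary userspace C, not kernel code; the wrap-around on the cast is
implementation-defined but universal on the relevant targets):

	#include <stdio.h>

	int main(void)
	{
		unsigned int cmd = 0xC008AF12u;	/* VHOST_GET_VRING_BASE */
		int as_int = (int)cmd;		/* 3221794578 wraps to -1073172718 */
		unsigned long long widened = (unsigned long long)(long long)as_int;

		/* Prints: c008af12 -1073172718 ffffffffc008af12 */
		printf("%x %d %llx\n", cmd, as_int, widened);
		return 0;
	}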



Re: [PATCH] x86/MSR: Move native_* variants to msr.h

2018-03-02 Thread Darren Kenny

On Thu, Mar 01, 2018 at 04:13:36PM +0100, Borislav Petkov wrote:

From: Borislav Petkov <b...@suse.de>

... where they belong.

No functionality change.

Signed-off-by: Borislav Petkov <b...@suse.de>
Cc: k...@vger.kernel.org


Seems like the right place to put them.

Reviewed-by: Darren Kenny <darren.ke...@oracle.com>


---
arch/x86/include/asm/microcode.h | 14 --
arch/x86/include/asm/msr.h   | 14 ++
arch/x86/kvm/svm.c   |  1 -
arch/x86/kvm/vmx.c   |  1 -
4 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 7fb1047d61c7..871714e2e4c6 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -6,20 +6,6 @@
#include 
#include 

-#define native_rdmsr(msr, val1, val2)  \
-do {   \
-   u64 __val = __rdmsr((msr)); \
-   (void)((val1) = (u32)__val);\
-   (void)((val2) = (u32)(__val >> 32));  \
-} while (0)
-
-#define native_wrmsr(msr, low, high)   \
-   __wrmsr(msr, low, high)
-
-#define native_wrmsrl(msr, val)\
-   __wrmsr((msr), (u32)((u64)(val)),   \
-  (u32)((u64)(val) >> 32))
-
struct ucode_patch {
struct list_head plist;
void *data; /* Intel uses only this one */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 30df295f6d94..77254c9c8f61 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -108,6 +108,20 @@ static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high)
 : : "c" (msr), "a"(low), "d" (high) : "memory");
}

+#define native_rdmsr(msr, val1, val2)  \
+do {   \
+   u64 __val = __rdmsr((msr)); \
+   (void)((val1) = (u32)__val);\
+   (void)((val2) = (u32)(__val >> 32));  \
+} while (0)
+
+#define native_wrmsr(msr, low, high)   \
+   __wrmsr(msr, low, high)
+
+#define native_wrmsrl(msr, val)\
+   __wrmsr((msr), (u32)((u64)(val)),   \
+  (u32)((u64)(val) >> 32))
+
static inline unsigned long long native_read_msr(unsigned int msr)
{
unsigned long long val;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cbd7ab74952e..c6dde6f1d848 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -49,7 +49,6 @@
#include 
#include 
#include 
-#include 
#include 

#include 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cab6ea1f8be5..5f708478b66e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,7 +51,6 @@
#include 
#include 
#include 
-#include 
#include 

#include "trace.h"
--
2.13.0
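
For reference, these macros only split and join the EDX:EAX halves that the
RDMSR/WRMSR instructions operate on. A usage sketch (the choice of MSR is
arbitrary and purely illustrative):

	u32 lo, hi;
	u64 val;

	native_rdmsr(MSR_IA32_SPEC_CTRL, lo, hi);  /* lo = bits 31:0, hi = bits 63:32 */
	val = ((u64)hi << 32) | lo;

	native_wrmsrl(MSR_IA32_SPEC_CTRL, val);    /* writes the full 64-bit value */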



Re: [PATCH v2 1/6] x86/speculation: Update Speculation Control microcode blacklist

2018-02-12 Thread Darren Kenny

On Sat, Feb 10, 2018 at 11:39:22PM +, David Woodhouse wrote:

Intel have retroactively blessed the 0xc2 microcode on Skylake mobile
and desktop parts, and the Gemini Lake 0x22 microcode is apparently fine
too. We blacklisted the latter purely because it was present with all
the other problematic ones in the 2018-01-08 release, but now it's
explicitly listed as OK.

We still list 0x84 for the various Kaby Lake / Coffee Lake parts, as
that appeared in one version of the blacklist and then reverted to
0x80 again. We can change it if 0x84 is actually announced to be safe.

Signed-off-by: David Woodhouse <d...@amazon.co.uk>


Reviewed-by: Darren Kenny <darren.ke...@oracle.com>


---
arch/x86/kernel/cpu/intel.c | 4 
1 file changed, 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 319bf98..f73b814 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -123,8 +123,6 @@ static const struct sku_microcode spectre_bad_microcodes[] = {
{ INTEL_FAM6_KABYLAKE_MOBILE,   0x09,   0x84 },
{ INTEL_FAM6_SKYLAKE_X, 0x03,   0x0100013e },
{ INTEL_FAM6_SKYLAKE_X, 0x04,   0x023c },
-   { INTEL_FAM6_SKYLAKE_MOBILE,0x03,   0xc2 },
-   { INTEL_FAM6_SKYLAKE_DESKTOP,   0x03,   0xc2 },
{ INTEL_FAM6_BROADWELL_CORE,0x04,   0x28 },
{ INTEL_FAM6_BROADWELL_GT3E,0x01,   0x1b },
{ INTEL_FAM6_BROADWELL_XEON_D,  0x02,   0x14 },
@@ -136,8 +134,6 @@ static const struct sku_microcode spectre_bad_microcodes[] = {
{ INTEL_FAM6_HASWELL_X, 0x02,   0x3b },
{ INTEL_FAM6_HASWELL_X, 0x04,   0x10 },
{ INTEL_FAM6_IVYBRIDGE_X,   0x04,   0x42a },
-   /* Updated in the 20180108 release; blacklist until we know otherwise */
-   { INTEL_FAM6_ATOM_GEMINI_LAKE,  0x01,   0x22 },
/* Observed in the wild */
{ INTEL_FAM6_SANDYBRIDGE_X, 0x06,   0x61b },
{ INTEL_FAM6_SANDYBRIDGE_X, 0x07,   0x712 },
--
2.7.4



Re: [PATCH] x86/spectre_v2: Remove 0xc2 from spectre_bad_microcodes

2018-02-09 Thread Darren Kenny

On Fri, Feb 09, 2018 at 02:29:27PM +, David Woodhouse wrote:

On Fri, 2018-02-09 at 14:10 +, Darren Kenny wrote:

According to the latest microcode update from Intel (on Feb 8, 2018) on
Skylake we should be using the microcode revisions 0xC2***, so we need
to remove that from the blacklist now.


The doc also suggests that Gemini Lake 0x22 is also now known to be OK.
I'll do a full pass over it and bring the list completely up to date.

Thanks.


OK, let me know if you need me to do anything.

Thanks,

Darren.


[PATCH] x86/spectre_v2: Remove 0xc2 from spectre_bad_microcodes

2018-02-09 Thread Darren Kenny

According to the latest microcode update from Intel (on Feb 8, 2018) on
Skylake we should be using the microcode revisions 0xC2***, so we need
to remove that from the blacklist now.

Signed-off-by: Darren Kenny <darren.ke...@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Tested-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
---
arch/x86/kernel/cpu/intel.c | 2 --
1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 319bf98..276bfe2 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -123,8 +123,6 @@ static const struct sku_microcode spectre_bad_microcodes[] = {
{ INTEL_FAM6_KABYLAKE_MOBILE,   0x09,   0x84 },
{ INTEL_FAM6_SKYLAKE_X, 0x03,   0x0100013e },
{ INTEL_FAM6_SKYLAKE_X, 0x04,   0x023c },
-   { INTEL_FAM6_SKYLAKE_MOBILE,0x03,   0xc2 },
-   { INTEL_FAM6_SKYLAKE_DESKTOP,   0x03,   0xc2 },
{ INTEL_FAM6_BROADWELL_CORE,0x04,   0x28 },
{ INTEL_FAM6_BROADWELL_GT3E,0x01,   0x1b },
{ INTEL_FAM6_BROADWELL_XEON_D,  0x02,   0x14 },
--
2.9.5



[PATCH v2] Correct use of IBRS_ATT name, should be IBRS_ALL

2018-02-05 Thread Darren Kenny

Fixes a comment in commit 117cc7a908c83697b0b737d15ae1eb5943afe35b
("x86/retpoline: Fill return stack buffer on vmexit")

While the original name was "IBRS All The Time" (IBRS_ATT),
the publicly documented name is IBRS_ALL, we should be using that.

Signed-off-by: Darren Kenny <darren.ke...@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Acked-by: David Woodhouse <d...@amazon.co.uk>
---
arch/x86/include/asm/nospec-branch.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 4ad4108..5b778d2 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -201,7 +201,7 @@ extern char __indirect_thunk_end[];
 * On VMEXIT we must ensure that no RSB predictions learned in the guest
 * can be followed in the host, by overwriting the RSB completely. Both
 * retpoline and IBRS mitigations for Spectre v2 need this; only on future
- * CPUs with IBRS_ATT *might* it be avoided.
+ * CPUs with IBRS_ALL *might* it be avoided.
 */
static inline void vmexit_fill_RSB(void)
{
--
2.9.5



Re: [PATCH v2] Correct use of IBRS_ATT name, should be IBRS_ALL

2018-02-05 Thread Darren Kenny

Please ignore this, didn't realise it had been already applied - guess I
looked in the wrong place.

Thanks,

Darren.

On Mon, Feb 05, 2018 at 11:02:20AM +, Darren Kenny wrote:

Fixes a comment in commit 117cc7a908c83697b0b737d15ae1eb5943afe35b
("x86/retpoline: Fill return stack buffer on vmexit")

While the original name was "IBRS All The Time" (IBRS_ATT),
the publicly documented name is IBRS_ALL, we should be using that.

Signed-off-by: Darren Kenny <darren.ke...@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Acked-by: David Woodhouse <d...@amazon.co.uk>
---
arch/x86/include/asm/nospec-branch.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 4ad4108..5b778d2 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -201,7 +201,7 @@ extern char __indirect_thunk_end[];
* On VMEXIT we must ensure that no RSB predictions learned in the guest
* can be followed in the host, by overwriting the RSB completely. Both
* retpoline and IBRS mitigations for Spectre v2 need this; only on future
- * CPUs with IBRS_ATT *might* it be avoided.
+ * CPUs with IBRS_ALL *might* it be avoided.
*/
static inline void vmexit_fill_RSB(void)
{
--
2.9.5



Re: [PATCH] Fix typo IBRS_ATT, which should be IBRS_ALL

2018-02-05 Thread Darren Kenny

On Fri, Feb 02, 2018 at 11:42:12PM +, David Woodhouse wrote:

On Fri, 2018-02-02 at 19:12 +, Darren Kenny wrote:

Fixes a typo in commit 117cc7a908c83697b0b737d15ae1eb5943afe35b
("x86/retpoline: Fill return stack buffer on vmexit")

Signed-off-by: Darren Kenny <darren.ke...@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>


Not strictly a typo; that was the original name for it. "IBRS all the
time". But yes, it should be IBRS_ALL now.

Acked-by: David Woodhouse <d...@amazon.co.uk>


Thanks David.

I'll send out another patch with this snippet of information in it
too, and including your Acked-by.

(The subject line will be slightly different)

Thanks,

Darren.


[tip:x86/pti] x86/speculation: Fix typo IBRS_ATT, which should be IBRS_ALL

2018-02-02 Thread tip-bot for Darren Kenny
Commit-ID:  af189c95a371b59f493dbe0f50c0a09724868881
Gitweb: https://git.kernel.org/tip/af189c95a371b59f493dbe0f50c0a09724868881
Author: Darren Kenny <darren.ke...@oracle.com>
AuthorDate: Fri, 2 Feb 2018 19:12:20 +
Committer:  Thomas Gleixner <t...@linutronix.de>
CommitDate: Fri, 2 Feb 2018 23:13:57 +0100

x86/speculation: Fix typo IBRS_ATT, which should be IBRS_ALL

Fixes: 117cc7a908c83 ("x86/retpoline: Fill return stack buffer on vmexit")
Signed-off-by: Darren Kenny <darren.ke...@oracle.com>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Cc: Tom Lendacky <thomas.lenda...@amd.com>
Cc: Andi Kleen <a...@linux.intel.com>
Cc: Borislav Petkov <b...@alien8.de>
Cc: Masami Hiramatsu <mhira...@kernel.org>
Cc: Arjan van de Ven <ar...@linux.intel.com>
Cc: David Woodhouse <d...@amazon.co.uk>
Link: https://lkml.kernel.org/r/20180202191220.blvgkgutojecx...@starbug-vm.ie.oracle.com

---
 arch/x86/include/asm/nospec-branch.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index d15d471..4d57894 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -150,7 +150,7 @@ extern char __indirect_thunk_end[];
  * On VMEXIT we must ensure that no RSB predictions learned in the guest
  * can be followed in the host, by overwriting the RSB completely. Both
  * retpoline and IBRS mitigations for Spectre v2 need this; only on future
- * CPUs with IBRS_ATT *might* it be avoided.
+ * CPUs with IBRS_ALL *might* it be avoided.
  */
 static inline void vmexit_fill_RSB(void)
 {


[PATCH] Fix typo IBRS_ATT, which should be IBRS_ALL

2018-02-02 Thread Darren Kenny

Fixes a typo in commit 117cc7a908c83697b0b737d15ae1eb5943afe35b
("x86/retpoline: Fill return stack buffer on vmexit")

Signed-off-by: Darren Kenny <darren.ke...@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
---
arch/x86/include/asm/nospec-branch.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 4ad4108..5b778d2 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -201,7 +201,7 @@ extern char __indirect_thunk_end[];
 * On VMEXIT we must ensure that no RSB predictions learned in the guest
 * can be followed in the host, by overwriting the RSB completely. Both
 * retpoline and IBRS mitigations for Spectre v2 need this; only on future
- * CPUs with IBRS_ATT *might* it be avoided.
+ * CPUs with IBRS_ALL *might* it be avoided.
 */
static inline void vmexit_fill_RSB(void)
{
--
2.9.5



Re: [PATCH v6 5/5] KVM: SVM: Allow direct access to MSR_IA32_SPEC_CTRL

2018-02-02 Thread Darren Kenny

On Thu, Feb 01, 2018 at 10:59:46PM +0100, KarimAllah Ahmed wrote:

[ Based on a patch from Paolo Bonzini  ]

... basically doing exactly what we do for VMX:

- Passthrough SPEC_CTRL to guests (if enabled in guest CPUID)
- Save and restore SPEC_CTRL around VMExit and VMEntry only if the guest
 actually used it.

Cc: Asit Mallick 
Cc: Arjan Van De Ven 
Cc: Dave Hansen 
Cc: Andi Kleen 
Cc: Andrea Arcangeli 
Cc: Linus Torvalds 
Cc: Tim Chen 
Cc: Thomas Gleixner 
Cc: Dan Williams 
Cc: Jun Nakajima 
Cc: Paolo Bonzini 
Cc: David Woodhouse 
Cc: Greg KH 
Cc: Andy Lutomirski 
Cc: Ashok Raj 
Signed-off-by: KarimAllah Ahmed 
Signed-off-by: David Woodhouse 


Reviewed-by: Darren Kenny 


---
v5:
- Add SPEC_CTRL to direct_access_msrs.
---
arch/x86/kvm/svm.c | 59 ++
1 file changed, 59 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 254eefb..c6ab343 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -184,6 +184,9 @@ struct vcpu_svm {
u64 gs_base;
} host;

+   u64 spec_ctrl;
+   bool save_spec_ctrl_on_exit;
+
u32 *msrpm;

ulong nmi_iret_rip;
@@ -249,6 +252,7 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_CSTAR,   .always = true  },
{ .index = MSR_SYSCALL_MASK,.always = true  },
#endif
+   { .index = MSR_IA32_SPEC_CTRL,  .always = false },
{ .index = MSR_IA32_PRED_CMD,   .always = false },
{ .index = MSR_IA32_LASTBRANCHFROMIP,   .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
@@ -1584,6 +1588,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
u32 dummy;
u32 eax = 1;

+   svm->spec_ctrl = 0;
+
if (!init_event) {
svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
   MSR_IA32_APICBASE_ENABLE;
@@ -3605,6 +3611,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_VM_CR:
msr_info->data = svm->nested.vm_cr_msr;
break;
+   case MSR_IA32_SPEC_CTRL:
+   if (!msr_info->host_initiated &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+   return 1;
+
+   msr_info->data = svm->spec_ctrl;
+   break;
case MSR_IA32_UCODE_REV:
msr_info->data = 0x0165;
break;
@@ -3696,6 +3709,30 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
+   case MSR_IA32_SPEC_CTRL:
+   if (!msr->host_initiated &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+   return 1;
+
+   /* The STIBP bit doesn't fault even if it's not advertised */
+   if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+   return 1;
+
+   svm->spec_ctrl = data;
+
+   /*
+* When it's written (to non-zero) for the first time, pass
+* it through. This means we don't have to take the perf
+* hit of saving it on vmexit for the common case of guests
+* that don't use it.
+*/
+   if (data && !svm->save_spec_ctrl_on_exit) {
+   svm->save_spec_ctrl_on_exit = true;
+   if (is_guest_mode(vcpu))
+   break;
+   set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+   }
+   break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
@@ -4964,6 +5001,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)

local_irq_enable();

+   /*
+* If this vCPU has touched SPEC_CTRL, restore the guest's value if
+* it's non-zero. Since vmentry is serialising on affected CPUs, there
+* is no need to worry about the conditional branch over the wrmsr
+* being speculatively taken.
+*/
+   if (svm->spec_ctrl)
+   wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+
asm volatile (
"push %%" _ASM_BP "; \n\t"
"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
@@ -5056,6 +5102,19 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);

+   /*
+* We do not use IBRS in the kernel. If this vCPU has used the
+* SPEC_CTRL MSR it may have left it on; save the value and
+* turn it off. This is much more efficient than blindly adding
+* it 
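
Distilling the lazy save/restore pattern this patch wraps around the vcpu run
loop (an illustrative sketch of the logic, not the exact patch code; the real
version sits in svm_vcpu_run() around the inline asm):

	/* Before entry: restore the guest value only if it ever wrote one. */
	if (svm->spec_ctrl)
		wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);

	/* ... guest runs ... */

	/* After exit: save and clear, but only for guests known to use it. */
	if (svm->save_spec_ctrl_on_exit) {
		rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
		if (svm->spec_ctrl)
			wrmsrl(MSR_IA32_SPEC_CTRL, 0);
	}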

Re: [PATCH v6 4/5] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL

2018-02-02 Thread Darren Kenny

On Thu, Feb 01, 2018 at 10:59:45PM +0100, KarimAllah Ahmed wrote:

[ Based on a patch from Ashok Raj <ashok@intel.com> ]

Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for
guests that will only mitigate Spectre V2 through IBRS+IBPB and will not
be using a retpoline+IBPB based approach.

To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for
guests that do not actually use the MSR, only start saving and restoring
when a non-zero is written to it.

No attempt is made to handle STIBP here, intentionally. Filtering STIBP
may be added in a future patch, which may require trapping all writes
if we don't want to pass it through directly to the guest.

[dwmw2: Clean up CPUID bits, save/restore manually, handle reset]

Cc: Asit Mallick <asit.k.mall...@intel.com>
Cc: Arjan Van De Ven <arjan.van.de@intel.com>
Cc: Dave Hansen <dave.han...@intel.com>
Cc: Andi Kleen <a...@linux.intel.com>
Cc: Andrea Arcangeli <aarca...@redhat.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Tim Chen <tim.c.c...@linux.intel.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Dan Williams <dan.j.willi...@intel.com>
Cc: Jun Nakajima <jun.nakaj...@intel.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: David Woodhouse <d...@amazon.co.uk>
Cc: Greg KH <gre...@linuxfoundation.org>
Cc: Andy Lutomirski <l...@kernel.org>
Cc: Ashok Raj <ashok@intel.com>
Signed-off-by: KarimAllah Ahmed <karah...@amazon.de>
Signed-off-by: David Woodhouse <d...@amazon.co.uk>


Reviewed-by: Darren Kenny <darren.ke...@oracle.com>


---
v6:
- got rid of save_spec_ctrl_on_exit
- introduce msr_write_intercepted
v5:
- Also check for X86_FEATURE_SPEC_CTRL for the msr reads/writes
v4:
- Add IBRS to kvm_cpuid_8000_0008_ebx_x86_features
- Handling nested guests
v3:
- Save/restore manually
- Fix CPUID handling
- Fix a copy & paste error in the name of SPEC_CTRL MSR in
 disable_intercept.
- support !cpu_has_vmx_msr_bitmap()
v2:
- remove 'host_spec_ctrl' in favor of only a comment (dwmw@).
- special case writing '0' in SPEC_CTRL to avoid confusing live-migration
 when the instance never used the MSR (dwmw@).
- depend on X86_FEATURE_IBRS instead of X86_FEATURE_SPEC_CTRL (dwmw@).
- add MSR_IA32_SPEC_CTRL to the list of MSRs to save (dropped it by accident).
---
arch/x86/kvm/cpuid.c |   9 +++--
arch/x86/kvm/vmx.c   | 105 ++-
arch/x86/kvm/x86.c   |   2 +-
3 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 1909635..13f5d42 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -367,7 +367,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,

/* cpuid 0x8008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
-   F(IBPB);
+   F(IBPB) | F(IBRS);

/* cpuid 0xC001.edx */
const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,

/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
-   F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES);
+   F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+   F(ARCH_CAPABILITIES);

/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -630,9 +631,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
entry->edx = 0;
-   /* IBPB isn't necessarily present in hardware cpuid */
+   /* IBRS and IBPB aren't necessarily present in hardware cpuid */
if (boot_cpu_has(X86_FEATURE_IBPB))
entry->ebx |= F(IBPB);
+   if (boot_cpu_has(X86_FEATURE_IBRS))
+   entry->ebx |= F(IBRS);
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
break;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b13314a..5d8a6a91 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -594,6 +594,7 @@ struct vcpu_vmx {
#endif

u64   arch_capabilities;
+   u64   spec_ctrl;

u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
@@ -1913,6 +1914,29 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
}

/*
+ * Check if MSR is intercepted for currently loaded MSR bitmap.
+ */
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+   unsigned long *msr_bitmap;
+   int f = sizeof(unsigned long);
+
+   if (!cpu_has_vmx_msr_bitmap())
+
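
The quoted diff is cut off inside msr_write_intercepted(); for orientation, a
write-intercept check against the VMX MSR bitmap follows this shape (a sketch
based on the bitmap's documented layout, a 4K page whose write bits live at
offset 0x800 for low MSRs and 0xc00 for the 0xc0000000 range; not necessarily
the exact patch text):

	static bool example_msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
	{
		unsigned long *msr_bitmap;
		int f = sizeof(unsigned long);

		if (!cpu_has_vmx_msr_bitmap())
			return true;	/* no bitmap: every access is intercepted */

		msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;

		if (msr <= 0x1fff)
			return !!test_bit(msr, msr_bitmap + 0x800 / f);

		if (msr >= 0xc0000000 && msr <= 0xc0001fff)
			return !!test_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);

		return true;
	}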


Re: [PATCH v6 3/5] KVM: VMX: Emulate MSR_IA32_ARCH_CAPABILITIES

2018-02-02 Thread Darren Kenny

On Thu, Feb 01, 2018 at 10:59:44PM +0100, KarimAllah Ahmed wrote:

Intel processors use the MSR_IA32_ARCH_CAPABILITIES MSR to indicate RDCL_NO
(bit 0) and IBRS_ALL (bit 1). It is a read-only MSR; by default its
contents come directly from the hardware, but user-space can still
override them.

[dwmw2: The bit in kvm_cpuid_7_0_edx_x86_features can be unconditional]

Cc: Asit Mallick <asit.k.mall...@intel.com>
Cc: Dave Hansen <dave.han...@intel.com>
Cc: Arjan Van De Ven <arjan.van.de@intel.com>
Cc: Tim Chen <tim.c.c...@linux.intel.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
Cc: Andi Kleen <a...@linux.intel.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Dan Williams <dan.j.willi...@intel.com>
Cc: Jun Nakajima <jun.nakaj...@intel.com>
Cc: Andy Lutomirski <l...@kernel.org>
Cc: Greg KH <gre...@linuxfoundation.org>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Ashok Raj <ashok@intel.com>
Reviewed-by: Paolo Bonzini <pbonz...@redhat.com>
Signed-off-by: KarimAllah Ahmed <karah...@amazon.de>
Signed-off-by: David Woodhouse <d...@amazon.co.uk>


Reviewed-by: Darren Kenny <darren.ke...@oracle.com>


---
arch/x86/kvm/cpuid.c |  2 +-
arch/x86/kvm/vmx.c   | 15 +++++++++++++++
arch/x86/kvm/x86.c   |  1 +
3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 033004d..1909635 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -394,7 +394,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,

/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
-   F(AVX512_4VNNIW) | F(AVX512_4FMAPS);
+   F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES);

/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 263eb1f..b13314a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -593,6 +593,8 @@ struct vcpu_vmx {
u64   msr_guest_kernel_gs_base;
#endif

+   u64   arch_capabilities;
+
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
u32 secondary_exec_control;
@@ -3262,6 +3264,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
+   case MSR_IA32_ARCH_CAPABILITIES:
+   if (!msr_info->host_initiated &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+   return 1;
+   msr_info->data = to_vmx(vcpu)->arch_capabilities;
+   break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -3397,6 +3405,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
  MSR_TYPE_W);
break;
+   case MSR_IA32_ARCH_CAPABILITIES:
+   if (!msr_info->host_initiated)
+   return 1;
+   vmx->arch_capabilities = data;
+   break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -5659,6 +5672,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}

+   if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+   rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);

vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c53298d..4ec142e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1009,6 +1009,7 @@ static u32 msrs_to_save[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+   MSR_IA32_ARCH_CAPABILITIES
};

static unsigned num_msrs_to_save;
--
2.7.4
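
The "user-space can still override it" path above is the host_initiated
branch of vmx_set_msr(). A hedged sketch of driving it through the
KVM_SET_MSRS ioctl, assuming vcpu_fd is an already-created KVM vCPU
descriptor and that MSR_IA32_ARCH_CAPABILITIES is index 0x10a:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define MSR_ARCH_CAPS 0x10a	/* MSR_IA32_ARCH_CAPABILITIES, assumed index */

static int set_arch_capabilities(int vcpu_fd, __u64 value)
{
	struct {
		struct kvm_msrs hdr;
		struct kvm_msr_entry entry;
	} msrs;

	memset(&msrs, 0, sizeof(msrs));
	msrs.hdr.nmsrs = 1;
	msrs.entry.index = MSR_ARCH_CAPS;
	msrs.entry.data = value;	/* e.g. RDCL_NO (bit 0) | IBRS_ALL (bit 1) */

	/* returns the number of MSRs processed; 1 on success */
	return ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);
}

A guest WRMSR to the same register would instead take the !host_initiated
return-1 path in the hunk above and receive a #GP.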
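
And the default that vmx_vcpu_setup() seeds with rdmsrl() can be inspected
from userspace through the msr driver, which maps the file offset to the
MSR index; a sketch assuming the msr module is loaded (modprobe msr) and
root privileges:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t caps = 0;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0)
		return 1;
	/* the pread() offset selects the MSR index; 0x10a assumed as above */
	if (pread(fd, &caps, sizeof(caps), 0x10a) == sizeof(caps))
		printf("RDCL_NO=%llu IBRS_ALL=%llu\n",
		       (unsigned long long)(caps & 1),
		       (unsigned long long)((caps >> 1) & 1));
	close(fd);
	return 0;
}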



Re: [PATCH v4] KVM: Fix stack-out-of-bounds read in write_mmio

2017-12-15 Thread Darren Kenny

Code-wise, that looks good to me now. I also don't have ARM hardware handy
to validate it, but it looks correct.

Reviewed-by: Darren Kenny <darren.ke...@oracle.com>

Thanks,

Darren.

On Thu, Dec 14, 2017 at 05:40:50PM -0800, Wanpeng Li wrote:

From: Wanpeng Li <wanpeng...@hotmail.com>

Reported by syzkaller:

 BUG: KASAN: stack-out-of-bounds in write_mmio+0x11e/0x270 [kvm]
 Read of size 8 at addr 8803259df7f8 by task syz-executor/32298

 CPU: 6 PID: 32298 Comm: syz-executor Tainted: G   OE 4.15.0-rc2+ #18
 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016
 Call Trace:
  dump_stack+0xab/0xe1
  print_address_description+0x6b/0x290
  kasan_report+0x28a/0x370
  write_mmio+0x11e/0x270 [kvm]
  emulator_read_write_onepage+0x311/0x600 [kvm]
  emulator_read_write+0xef/0x240 [kvm]
  emulator_fix_hypercall+0x105/0x150 [kvm]
  em_hypercall+0x2b/0x80 [kvm]
  x86_emulate_insn+0x2b1/0x1640 [kvm]
  x86_emulate_instruction+0x39a/0xb90 [kvm]
  handle_exception+0x1b4/0x4d0 [kvm_intel]
  vcpu_enter_guest+0x15a0/0x2640 [kvm]
  kvm_arch_vcpu_ioctl_run+0x549/0x7d0 [kvm]
  kvm_vcpu_ioctl+0x479/0x880 [kvm]
  do_vfs_ioctl+0x142/0x9a0
  SyS_ioctl+0x74/0x80
  entry_SYSCALL_64_fastpath+0x23/0x9a

The patched-vmmcall path writes the 3-byte opcode 0F 01 C1 (vmcall) to
guest memory; however, the write_mmio tracepoint always prints 8 bytes
through *(u64 *)val, since KVM splits the MMIO access into 8-byte chunks.
This can result in a stack-out-of-bounds read when the extra 5 bytes are
accessed. This patch fixes it by accessing only the bytes we operate on.

Before patch:

syz-executor-5567  [007]  51370.561696: kvm_mmio: mmio write len 3 gpa 0x10 val 0x110077c1010f

After patch:

syz-executor-13416 [002]  51302.299573: kvm_mmio: mmio write len 3 gpa 0x10 val 0xc1010f

Reported-by: Dmitry Vyukov <dvyu...@google.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Radim Krčmář <rkrc...@redhat.com>
Cc: Marc Zyngier <marc.zyng...@arm.com>
Cc: Christoffer Dall <christoffer.d...@linaro.org>
Signed-off-by: Wanpeng Li <wanpeng...@hotmail.com>
---
v3 -> v4:
* fix the arm tracepoint
v2 -> v3:
* fix sparse warning
v1 -> v2:
* do the memcpy in kvm_mmio tracepoint

arch/x86/kvm/x86.c | 8 ++++----
include/trace/events/kvm.h | 6 ++++--
virt/kvm/arm/mmio.c | 6 +++---
3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0f82e2c..c7071e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4456,7 +4456,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 addr, n, v))
&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
-   trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
+   trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
handled += n;
addr += n;
len -= n;
@@ -4715,7 +4715,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
{
if (vcpu->mmio_read_completed) {
trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
-  vcpu->mmio_fragments[0].gpa, *(u64 *)val);
+  vcpu->mmio_fragments[0].gpa, val);
vcpu->mmio_read_completed = 0;
return 1;
}
@@ -4737,14 +4737,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,

static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
{
-   trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+   trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
return vcpu_mmio_write(vcpu, gpa, bytes, val);
}

static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
  void *val, int bytes)
{
-   trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
+   trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
return X86EMUL_IO_NEEDED;
}

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index e4b0b8e..dfd2170 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -211,7 +211,7 @@ TRACE_EVENT(kvm_ack_irq,
{ KVM_TRACE_MMIO_WRITE, "write" }

TRACE_EVENT(kvm_mmio,
-   TP_PROTO(int type, int len, u64 gpa, u64 val),
+   TP_PROTO(int type, int len, u64 gpa, void *val),
TP_ARGS(type, len, gpa, val),

TP_STRUCT__entry(
@@ -225,7 +225,9 @@ TRACE_EVENT(kvm_mmio,
__entry->type= type;
__entry->len = len;
__entry->gpa = gpa;
-   __entry->val = val;
+   __entry->val = 0;
+   if (val)
+   memcpy(&__entry->val, val, min(8, len));
),

TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx",
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
index b6e715f..dac7ceb 100644
--- a/virt/kvm/arm/mmio.c
+++ b/virt/kvm/arm/mmio.c
@@ -112,7 +112,7 @@
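
A runnable toy (not kernel code) of what the bounded memcpy() in the
TP_fast_assign above buys: with len == 3, only the vmcall opcode bytes land
in the traced u64, matching the "After patch" value in the commit message,
whereas the old *(u64 *)val read also picked up 5 bytes beyond the buffer.
Little-endian byte order is assumed.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint8_t opcode[3] = { 0x0f, 0x01, 0xc1 };	/* vmcall */
	uint64_t val = 0;
	int len = sizeof(opcode);

	memcpy(&val, opcode, len < 8 ? len : 8);	/* min(8, len) */
	printf("val 0x%llx\n", (unsigned long long)val);	/* prints val 0xc1010f */
	return 0;
}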


Re: [PATCH v3] KVM: X86: Fix stack-out-of-bounds read in write_mmio

2017-12-14 Thread Darren Kenny

Hi,

I'm wondering about the change to trace_kvm_mmio() here, since it
doesn't appear to update the corresponding use of it in
virt/kvm/arm/mmio.c, e.g.:

  https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/virt/kvm/arm/mmio.c#n114

which still seems to be calling it with a data value rather than the
pointer you've changed it to in the header.

Should that also be modified to match? As it is, it would appear to
cause a compilation error on ARM (though I've not confirmed).

Thanks,

Darren.

On Thu, Dec 14, 2017 at 05:56:58AM -0800, Wanpeng Li wrote:

From: Wanpeng Li 

Reported by syzkaller:

 BUG: KASAN: stack-out-of-bounds in write_mmio+0x11e/0x270 [kvm]
 Read of size 8 at addr 8803259df7f8 by task syz-executor/32298

 CPU: 6 PID: 32298 Comm: syz-executor Tainted: G   OE 4.15.0-rc2+ #18
 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016
 Call Trace:
  dump_stack+0xab/0xe1
  print_address_description+0x6b/0x290
  kasan_report+0x28a/0x370
  write_mmio+0x11e/0x270 [kvm]
  emulator_read_write_onepage+0x311/0x600 [kvm]
  emulator_read_write+0xef/0x240 [kvm]
  emulator_fix_hypercall+0x105/0x150 [kvm]
  em_hypercall+0x2b/0x80 [kvm]
  x86_emulate_insn+0x2b1/0x1640 [kvm]
  x86_emulate_instruction+0x39a/0xb90 [kvm]
  handle_exception+0x1b4/0x4d0 [kvm_intel]
  vcpu_enter_guest+0x15a0/0x2640 [kvm]
  kvm_arch_vcpu_ioctl_run+0x549/0x7d0 [kvm]
  kvm_vcpu_ioctl+0x479/0x880 [kvm]
  do_vfs_ioctl+0x142/0x9a0
  SyS_ioctl+0x74/0x80
  entry_SYSCALL_64_fastpath+0x23/0x9a

The patched-vmmcall path writes the 3-byte opcode 0F 01 C1 (vmcall) to
guest memory; however, the write_mmio tracepoint always prints 8 bytes
through *(u64 *)val, since KVM splits the MMIO access into 8-byte chunks.
This can result in a stack-out-of-bounds read when the extra 5 bytes are
accessed. This patch fixes it by accessing only the bytes we operate on.

Before patch:

syz-executor-5567  [007]  51370.561696: kvm_mmio: mmio write len 3 gpa 0x10 val 0x110077c1010f

After patch:

syz-executor-13416 [002]  51302.299573: kvm_mmio: mmio write len 3 gpa 0x10 val 0xc1010f

Reported-by: Dmitry Vyukov 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Signed-off-by: Wanpeng Li 
---
v2 -> v3:
* fix sparse warning
v1 -> v2:
* do the memcpy in kvm_mmio tracepoint

arch/x86/kvm/x86.c | 8 ++++----
include/trace/events/kvm.h | 6 ++++--
2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index daa1918..315ff2c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4457,7 +4457,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 addr, n, v))
&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
-   trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
+   trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
handled += n;
addr += n;
len -= n;
@@ -4716,7 +4716,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
{
if (vcpu->mmio_read_completed) {
trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
-  vcpu->mmio_fragments[0].gpa, *(u64 *)val);
+  vcpu->mmio_fragments[0].gpa, val);
vcpu->mmio_read_completed = 0;
return 1;
}
@@ -4738,14 +4738,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,

static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
{
-   trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+   trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
return vcpu_mmio_write(vcpu, gpa, bytes, val);
}

static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
  void *val, int bytes)
{
-   trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
+   trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
return X86EMUL_IO_NEEDED;
}

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index e4b0b8e..dfd2170 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -211,7 +211,7 @@ TRACE_EVENT(kvm_ack_irq,
{ KVM_TRACE_MMIO_WRITE, "write" }

TRACE_EVENT(kvm_mmio,
-   TP_PROTO(int type, int len, u64 gpa, u64 val),
+   TP_PROTO(int type, int len, u64 gpa, void *val),
TP_ARGS(type, len, gpa, val),

TP_STRUCT__entry(
@@ -225,7 +225,9 @@ TRACE_EVENT(kvm_mmio,
__entry->type= type;
__entry->len = len;
__entry->gpa = gpa;
-   __entry->val = val;
+   __entry->val = 0;
+   if (val)
+   memcpy(&__entry->val, val, min(8, len));
),

TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx",
--
2.7.4
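
A compilable toy of the mismatch raised above: once the tracepoint
prototype takes void *val, a caller that still passes a u64 value, as
virt/kvm/arm/mmio.c did at this point in the series, no longer
type-checks; v4 switches the callers to pass a pointer (or NULL). The
names below are stand-ins rather than the actual ARM code.

#include <stdint.h>

/* stand-in for the changed tracepoint prototype */
static void trace_kvm_mmio(int type, int len, uint64_t gpa, void *val)
{
	(void)type; (void)len; (void)gpa; (void)val;
}

int main(void)
{
	uint64_t fault_ipa = 0x10;
	uint64_t data = 0xc1010f;

	/* old-style call site, value where a pointer is now expected;
	 * uncommenting it fails to compile:
	 * trace_kvm_mmio(1, 3, fault_ipa, data);
	 */
	trace_kvm_mmio(1, 3, fault_ipa, &data);	/* what v4 does instead */
	return 0;
}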
