[PATCH 1/9] KVM: nVMX: vmx_complete_nested_posted_interrupt() can't fail

2018-02-06 Thread David Woodhouse
From: David Hildenbrand 

vmx_complete_nested_posted_interrupt() can't fail, so turn it into
a void function.

Signed-off-by: David Hildenbrand 
Signed-off-by: Paolo Bonzini 

(cherry picked from commit 6342c50ad12e8ce0736e722184a7dbdea4a3477f)
Signed-off-by: David Woodhouse 
---
 arch/x86/kvm/vmx.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index feadff3..fd890af 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4736,7 +4736,7 @@ static bool vmx_get_enable_apicv(void)
return enable_apicv;
 }
 
-static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
@@ -4747,13 +4747,13 @@ static int vmx_complete_nested_posted_interrupt(struct 
kvm_vcpu *vcpu)
vmx->nested.pi_pending) {
vmx->nested.pi_pending = false;
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
-   return 0;
+   return;
 
max_irr = find_last_bit(
(unsigned long *)vmx->nested.pi_desc->pir, 256);
 
if (max_irr == 256)
-   return 0;
+   return;
 
vapic_page = kmap(vmx->nested.virtual_apic_page);
if (!vapic_page) {
@@ -4770,7 +4770,6 @@ static int vmx_complete_nested_posted_interrupt(struct 
kvm_vcpu *vcpu)
vmcs_write16(GUEST_INTR_STATUS, status);
}
}
-   return 0;
 }
 
 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -10491,7 +10490,8 @@ static int vmx_check_nested_events(struct kvm_vcpu 
*vcpu, bool external_intr)
return 0;
}
 
-   return vmx_complete_nested_posted_interrupt(vcpu);
+   vmx_complete_nested_posted_interrupt(vcpu);
+   return 0;
 }
 
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
-- 
2.7.4



[PATCH 7/9] KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES

2018-02-06 Thread David Woodhouse
From: KarimAllah Ahmed 

Intel processors use the MSR_IA32_ARCH_CAPABILITIES MSR to indicate RDCL_NO
(bit 0) and IBRS_ALL (bit 1). This is a read-only MSR. By default its
contents come directly from the hardware, but user-space can still
override it.
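
As an illustration of what this gives the guest (not part of this patch;
the MSR index 0x10a and CPUID.7.0:EDX bit 29 are architectural, the helper
names below are made up), a guest would probe the emulated MSR roughly
like this:

    /* Hypothetical guest-side sketch; ring 0 only. */
    #include <stdint.h>

    #define MSR_IA32_ARCH_CAPABILITIES  0x0000010a
    #define ARCH_CAP_RDCL_NO            (1ULL << 0)  /* not vulnerable to Meltdown */
    #define ARCH_CAP_IBRS_ALL           (1ULL << 1)  /* enhanced IBRS supported */

    static uint64_t rdmsr64(uint32_t msr)
    {
            uint32_t lo, hi;

            asm volatile("rdmsr" : "=a"(lo), "=d"(hi) : "c"(msr));
            return ((uint64_t)hi << 32) | lo;
    }

    static int rdcl_no(void)
    {
            uint32_t eax = 7, ebx, ecx = 0, edx;

            asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "+c"(ecx), "=d"(edx));
            if (!(edx & (1u << 29)))        /* ARCH_CAPABILITIES enumerated? */
                    return 0;               /* MSR absent: assume vulnerable */
            return !!(rdmsr64(MSR_IA32_ARCH_CAPABILITIES) & ARCH_CAP_RDCL_NO);
    }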

[dwmw2: The bit in kvm_cpuid_7_0_edx_x86_features can be unconditional]

Signed-off-by: KarimAllah Ahmed 
Signed-off-by: David Woodhouse 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Paolo Bonzini 
Reviewed-by: Darren Kenny 
Reviewed-by: Jim Mattson 
Reviewed-by: Konrad Rzeszutek Wilk 
Cc: Andrea Arcangeli 
Cc: Andi Kleen 
Cc: Jun Nakajima 
Cc: k...@vger.kernel.org
Cc: Dave Hansen 
Cc: Linus Torvalds 
Cc: Andy Lutomirski 
Cc: Asit Mallick 
Cc: Arjan Van De Ven 
Cc: Greg KH 
Cc: Dan Williams 
Cc: Tim Chen 
Cc: Ashok Raj 
Link: 
https://lkml.kernel.org/r/1517522386-18410-4-git-send-email-karah...@amazon.de

(cherry picked from commit 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd)
Signed-off-by: David Woodhouse 
---
 arch/x86/kvm/cpuid.c |  8 +++-
 arch/x86/kvm/cpuid.h |  8 
 arch/x86/kvm/vmx.c   | 15 +++
 arch/x86/kvm/x86.c   |  1 +
 4 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6f24483..9c6493f 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -380,6 +380,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
 
+   /* cpuid 7.0.edx*/
+   const u32 kvm_cpuid_7_0_edx_x86_features =
+   F(ARCH_CAPABILITIES);
+
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
 
@@ -462,12 +466,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
+   entry->edx &= kvm_cpuid_7_0_edx_x86_features;
+   cpuid_mask(&entry->edx, CPUID_7_EDX);
} else {
entry->ebx = 0;
entry->ecx = 0;
+   entry->edx = 0;
}
entry->eax = 0;
-   entry->edx = 0;
break;
}
case 9:
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index ec4f9dc..8719997 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -171,6 +171,14 @@ static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu 
*vcpu)
return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
 }
 
+static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu)
+{
+   struct kvm_cpuid_entry2 *best;
+
+   best = kvm_find_cpuid_entry(vcpu, 7, 0);
+   return best && (best->edx & bit(X86_FEATURE_ARCH_CAPABILITIES));
+}
+
 
 /*
  * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index dd6c831..92bf61f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -551,6 +551,8 @@ struct vcpu_vmx {
u64   msr_guest_kernel_gs_base;
 #endif
 
+   u64   arch_capabilities;
+
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
/*
@@ -2979,6 +2981,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
+   case MSR_IA32_ARCH_CAPABILITIES:
+   if (!msr_info->host_initiated &&
+   !guest_cpuid_has_arch_capabilities(vcpu))
+   return 1;
+   msr_info->data = to_vmx(vcpu)->arch_capabilities;
+   break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -3110,6 +3118,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, 
MSR_IA32_PRED_CMD,
  MSR_TYPE_W);
break;
+   case MSR_IA32_ARCH_CAPABILITIES:
+   if (!msr_info->host_initiated)
+   return 1;
+   vmx->arch_capabilities = data;
+   break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -5200,6 +5213,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
 
+   if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+   rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);

[PATCH 5/9] KVM: VMX: make MSR bitmaps per-VCPU

2018-02-06 Thread David Woodhouse
From: Paolo Bonzini 

Place the MSR bitmap in struct loaded_vmcs, and update it in place
every time the x2apic or APICv state can change.  This is rare and
the loop can handle 64 MSRs per iteration, in a similar fashion to
nested_vmx_prepare_msr_bitmap.

This prepares for choosing, on a per-VM basis, whether to intercept
the SPEC_CTRL and PRED_CMD MSRs.
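
For reference, the 4 KiB MSR bitmap layout is architectural: the read
bitmaps for MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff sit at
offsets 0x000 and 0x400, the corresponding write bitmaps at 0x800 and
0xc00, one bit per MSR.  A minimal sketch of clearing the write intercept
for one MSR in such a per-vCPU bitmap (illustrative only, not the helper
added by this patch):

    #include <stdint.h>

    /* Clear the write-intercept bit for @msr in a 4 KiB VMX MSR bitmap. */
    static void msr_bitmap_allow_write(uint8_t *bitmap, uint32_t msr)
    {
            uint32_t byte;

            if (msr <= 0x1fff)
                    byte = 0x800 + msr / 8;             /* low MSR range  */
            else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
                    byte = 0xc00 + (msr & 0x1fff) / 8;  /* high MSR range */
            else
                    return;  /* outside both ranges: always intercepted */

            bitmap[byte] &= ~(1u << (msr & 7));
    }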

Cc: sta...@vger.kernel.org   # prereq for Spectre mitigation
Suggested-by: Jim Mattson 
Signed-off-by: Paolo Bonzini 
(cherry picked from commit 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6)
Signed-off-by: David Woodhouse 
---
 arch/x86/kvm/vmx.c | 314 +++--
 1 file changed, 114 insertions(+), 200 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8c562da..8570a17 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -110,6 +110,14 @@ static u64 __read_mostly host_xss;
 static bool __read_mostly enable_pml = 1;
 module_param_named(pml, enable_pml, bool, S_IRUGO);
 
+#define MSR_TYPE_R 1
+#define MSR_TYPE_W 2
+#define MSR_TYPE_RW 3
+
+#define MSR_BITMAP_MODE_X2APIC 1
+#define MSR_BITMAP_MODE_X2APIC_APICV   2
+#define MSR_BITMAP_MODE_LM 4
+
 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
 
 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
@@ -191,6 +199,7 @@ struct loaded_vmcs {
struct vmcs *shadow_vmcs;
int cpu;
int launched;
+   unsigned long *msr_bitmap;
struct list_head loaded_vmcss_on_cpu_link;
 };
 
@@ -429,8 +438,6 @@ struct nested_vmx {
bool pi_pending;
u16 posted_intr_nv;
 
-   unsigned long *msr_bitmap;
-
struct hrtimer preemption_timer;
bool preemption_timer_expired;
 
@@ -531,6 +538,7 @@ struct vcpu_vmx {
unsigned long host_rsp;
u8fail;
bool  nmi_known_unmasked;
+   u8msr_bitmap_mode;
u32   exit_intr_info;
u32   idt_vectoring_info;
ulong rflags;
@@ -902,6 +910,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment 
*var);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 static int alloc_identity_pagetable(struct kvm *kvm);
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -921,12 +930,6 @@ static DEFINE_PER_CPU(spinlock_t, 
blocked_vcpu_on_cpu_lock);
 
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
-static unsigned long *vmx_msr_bitmap_legacy;
-static unsigned long *vmx_msr_bitmap_longmode;
-static unsigned long *vmx_msr_bitmap_legacy_x2apic;
-static unsigned long *vmx_msr_bitmap_longmode_x2apic;
-static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
-static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;
 
@@ -2520,36 +2523,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, 
int to)
vmx->guest_msrs[from] = tmp;
 }
 
-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
-{
-   unsigned long *msr_bitmap;
-
-   if (is_guest_mode(vcpu))
-   msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
-   else if (cpu_has_secondary_exec_ctrls() &&
-(vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-   if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
-   if (is_long_mode(vcpu))
-   msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
-   else
-   msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
-   } else {
-   if (is_long_mode(vcpu))
-   msr_bitmap = 
vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
-   else
-   msr_bitmap = 
vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
-   }
-   } else {
-   if (is_long_mode(vcpu))
-   msr_bitmap = vmx_msr_bitmap_longmode;
-   else
-   msr_bitmap = vmx_msr_bitmap_legacy;
-   }
-
-   vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
-}
-
 /*
  * Set up the vmcs to automatically save and restore system
  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
@@ -2590,7 +2563,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx->save_nmsrs = save_nmsrs;
 
if (cpu_has_vmx_msr_bitmap())
-   vmx_set_msr_bitmap(&vmx->vcpu);
+   vmx_update_msr_bitmap(&vmx->vcpu);
 }
 
 /*
@@ -3537,6 +3510,8 @@ static void free_loaded_vmcs(struct loaded_vmcs 
*loaded_vmcs)
loaded_vmcs

[PATCH 9/9] KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL

2018-02-06 Thread David Woodhouse
From: KarimAllah Ahmed 

[ Based on a patch from Paolo Bonzini  ]

... basically doing exactly what we do for VMX:

- Pass SPEC_CTRL through to guests (if enabled in guest CPUID)
- Save and restore SPEC_CTRL around VMExit and VMEntry only if the guest
  actually used it.
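
A condensed sketch of the resulting flow around the world switch (not the
literal hunks; svm->spec_ctrl and msr_write_intercepted() are the ones
added by this patch):

    if (svm->spec_ctrl)
            native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);  /* guest value */

    /* ... VMRUN / #VMEXIT ... */

    if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
            rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);   /* pick up guest writes */
    if (svm->spec_ctrl)
            native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);         /* back to the host value */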

Signed-off-by: KarimAllah Ahmed 
Signed-off-by: David Woodhouse 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Darren Kenny 
Reviewed-by: Konrad Rzeszutek Wilk 
Cc: Andrea Arcangeli 
Cc: Andi Kleen 
Cc: Jun Nakajima 
Cc: k...@vger.kernel.org
Cc: Dave Hansen 
Cc: Tim Chen 
Cc: Andy Lutomirski 
Cc: Asit Mallick 
Cc: Arjan Van De Ven 
Cc: Greg KH 
Cc: Paolo Bonzini 
Cc: Dan Williams 
Cc: Linus Torvalds 
Cc: Ashok Raj 
Link: 
https://lkml.kernel.org/r/1517669783-20732-1-git-send-email-karah...@amazon.de

(cherry picked from commit b2ac58f90540e39324e7a29a7ad471407ae0bf48)
Signed-off-by: David Woodhouse 
---
 arch/x86/kvm/svm.c | 88 ++
 1 file changed, 88 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6de8d65..be644af 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -183,6 +183,8 @@ struct vcpu_svm {
u64 gs_base;
} host;
 
+   u64 spec_ctrl;
+
u32 *msrpm;
 
ulong nmi_iret_rip;
@@ -248,6 +250,7 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_CSTAR,   .always = true  },
{ .index = MSR_SYSCALL_MASK,.always = true  },
 #endif
+   { .index = MSR_IA32_SPEC_CTRL,  .always = false },
{ .index = MSR_IA32_PRED_CMD,   .always = false },
{ .index = MSR_IA32_LASTBRANCHFROMIP,   .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
@@ -863,6 +866,25 @@ static bool valid_msr_intercept(u32 index)
return false;
 }
 
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
+{
+   u8 bit_write;
+   unsigned long tmp;
+   u32 offset;
+   u32 *msrpm;
+
+   msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
+ to_svm(vcpu)->msrpm;
+
+   offset= svm_msrpm_offset(msr);
+   bit_write = 2 * (msr & 0x0f) + 1;
+   tmp   = msrpm[offset];
+
+   BUG_ON(offset == MSR_INVALID);
+
+   return !!test_bit(bit_write, &tmp);
+}
+
 static void set_msr_interception(u32 *msrpm, unsigned msr,
 int read, int write)
 {
@@ -1537,6 +1559,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool 
init_event)
u32 dummy;
u32 eax = 1;
 
+   svm->spec_ctrl = 0;
+
if (!init_event) {
svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
   MSR_IA32_APICBASE_ENABLE;
@@ -3520,6 +3544,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
case MSR_VM_CR:
msr_info->data = svm->nested.vm_cr_msr;
break;
+   case MSR_IA32_SPEC_CTRL:
+   if (!msr_info->host_initiated &&
+   !guest_cpuid_has_ibrs(vcpu))
+   return 1;
+
+   msr_info->data = svm->spec_ctrl;
+   break;
case MSR_IA32_UCODE_REV:
msr_info->data = 0x01000065;
break;
@@ -3611,6 +3642,33 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr)
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
+   case MSR_IA32_SPEC_CTRL:
+   if (!msr->host_initiated &&
+   !guest_cpuid_has_ibrs(vcpu))
+   return 1;
+
+   /* The STIBP bit doesn't fault even if it's not advertised */
+   if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+   return 1;
+
+   svm->spec_ctrl = data;
+
+   if (!data)
+   break;
+
+   /*
+* For non-nested:
+* When it's written (to non-zero) for the first time, pass
+* it through.
+*
+* For nested:
+* The handling of the MSR bitmap for L2 guests is done in
+* nested_svm_vmrun_msrpm.
+* We update the L1 MSR bit as well since it will end up
+* touching the MSR anyway now.
+*/
+   set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+   break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
!guest_cpuid_has_ibpb(vcpu))
@@ -4854,6 +4912,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
local_irq_enable();
 
+   /*
+* If this vCPU has touched SPEC_CTRL, restore the guest's value if
+* it's non-zero. 

Re: [PATCH] mm: Always print RLIMIT_DATA warning

2018-02-06 Thread David Woodhouse


On Tue, 2018-02-06 at 20:27 +0300, Cyrill Gorcunov wrote:
> On Tue, Feb 06, 2018 at 04:45:05PM +0000, David Woodhouse wrote:
> > 
> > The documentation for ignore_rlimit_data says that it will print a warning
> > at first misuse. Yet it doesn't seem to do that. Fix the code to print
> > the warning even when we allow the process to continue.
> > 
> > Signed-off-by: David Woodhouse <d...@amazon.co.uk>
> > ---
> > We should probably also do what Linus suggested in 
> > https://lkml.org/lkml/2016/9/16/585
> > 
> Might be typo in docs I guess, Kostya?

I prefer the documented behaviour to the actual behaviour :)

smime.p7s
Description: S/MIME cryptographic signature


[PATCH 8/9] KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL

2018-02-06 Thread David Woodhouse
From: KarimAllah Ahmed 

[ Based on a patch from Ashok Raj  ]

Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for
guests that will only mitigate Spectre V2 through IBRS+IBPB and will not
be using a retpoline+IBPB based approach.

To avoid the overhead of saving and restoring MSR_IA32_SPEC_CTRL for
guests that do not actually use the MSR, only start saving and restoring
it when a non-zero value is written to it.
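
A condensed sketch of that "first non-zero write" logic as seen from the
MSR write handler (illustrative, not the literal diff; the helper and
field names are the ones used elsewhere in this series):

    static int sketch_set_spec_ctrl(struct kvm_vcpu *vcpu, u64 data,
                                    bool host_initiated)
    {
            struct vcpu_vmx *vmx = to_vmx(vcpu);

            if (!host_initiated && !guest_cpuid_has_ibrs(vcpu))
                    return 1;               /* not advertised: inject #GP */

            if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
                    return 1;               /* reserved bits set */

            vmx->spec_ctrl = data;
            if (!data)
                    return 0;               /* still zero: keep intercepting */

            /*
             * First non-zero write: give the guest direct access and start
             * saving/restoring the MSR around every VM entry/exit.
             */
            vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
                                          MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
            return 0;
    }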

No attempt is made to handle STIBP here, intentionally. Filtering STIBP
may be added in a future patch, which may require trapping all writes
if we don't want to pass it through directly to the guest.

[dwmw2: Clean up CPUID bits, save/restore manually, handle reset]

Signed-off-by: KarimAllah Ahmed 
Signed-off-by: David Woodhouse 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Darren Kenny 
Reviewed-by: Konrad Rzeszutek Wilk 
Reviewed-by: Jim Mattson 
Cc: Andrea Arcangeli 
Cc: Andi Kleen 
Cc: Jun Nakajima 
Cc: k...@vger.kernel.org
Cc: Dave Hansen 
Cc: Tim Chen 
Cc: Andy Lutomirski 
Cc: Asit Mallick 
Cc: Arjan Van De Ven 
Cc: Greg KH 
Cc: Paolo Bonzini 
Cc: Dan Williams 
Cc: Linus Torvalds 
Cc: Ashok Raj 
Link: 
https://lkml.kernel.org/r/1517522386-18410-5-git-send-email-karah...@amazon.de

(cherry picked from commit d28b387fb74da95d69d2615732f50cceb38e9a4d)
Signed-off-by: David Woodhouse 
---
 arch/x86/kvm/cpuid.c |   8 ++--
 arch/x86/kvm/cpuid.h |  11 ++
 arch/x86/kvm/vmx.c   | 103 ++-
 arch/x86/kvm/x86.c   |   2 +-
 4 files changed, 118 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9c6493f..93f924d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -357,7 +357,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
 
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
-   F(IBPB);
+   F(IBPB) | F(IBRS);
 
/* cpuid 0xC0000001.edx */
const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -382,7 +382,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
 
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
-   F(ARCH_CAPABILITIES);
+   F(SPEC_CTRL) | F(ARCH_CAPABILITIES);
 
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -618,9 +618,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
entry->edx = 0;
-   /* IBPB isn't necessarily present in hardware cpuid */
+   /* IBRS and IBPB aren't necessarily present in hardware cpuid */
if (boot_cpu_has(X86_FEATURE_IBPB))
entry->ebx |= F(IBPB);
+   if (boot_cpu_has(X86_FEATURE_IBRS))
+   entry->ebx |= F(IBRS);
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
break;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 8719997..d1beb71 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -171,6 +171,17 @@ static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu 
*vcpu)
return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
 }
 
+static inline bool guest_cpuid_has_ibrs(struct kvm_vcpu *vcpu)
+{
+   struct kvm_cpuid_entry2 *best;
+
+   best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+   if (best && (best->ebx & bit(X86_FEATURE_IBRS)))
+   return true;
+   best = kvm_find_cpuid_entry(vcpu, 7, 0);
+   return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
+}
+
 static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu)
 {
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 92bf61f..764cb7c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -552,6 +552,7 @@ struct vcpu_vmx {
 #endif
 
u64   arch_capabilities;
+   u64   spec_ctrl;
 
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
@@ -1852,6 +1853,29 @@ static void update_exception_bitmap(struct kvm_vcpu 
*vcpu)
 }
 
 /*
+ * Check if MSR is intercepted for currently loaded MSR bitmap.
+ */
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+   unsigned long *msr_bitmap;
+   int f = sizeof(unsigned long);
+
+   if (!cpu_has_vmx_msr_bitmap())
+   return true;
+
+   msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+
+   if (msr <= 0x1fff) {
+   return !!test_bit(msr, msr_bitmap + 0x800 / f);
+   } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)

[PATCH 3/9] KVM: nVMX: Eliminate vmcs02 pool

2018-02-06 Thread David Woodhouse
From: Jim Mattson 

The potential performance advantages of a vmcs02 pool have never been
realized. To simplify the code, eliminate the pool. Instead, a single
vmcs02 is allocated per VCPU when the VCPU enters VMX operation.
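
With the pool gone the lifetime is trivial: allocate the single vmcs02
when the vCPU executes VMXON, free it again when it leaves VMX operation
or is destroyed.  Roughly (sketch, not the literal replacement code):

    static int nested_alloc_vmcs02(struct vcpu_vmx *vmx)
    {
            vmx->nested.vmcs02.vmcs = alloc_vmcs();
            vmx->nested.vmcs02.shadow_vmcs = NULL;
            if (!vmx->nested.vmcs02.vmcs)
                    return -ENOMEM;
            loaded_vmcs_init(&vmx->nested.vmcs02);
            return 0;
    }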

Cc: sta...@vger.kernel.org   # prereq for Spectre mitigation
Signed-off-by: Jim Mattson 
Signed-off-by: Mark Kanda 
Reviewed-by: Ameya More 
Reviewed-by: David Hildenbrand 
Reviewed-by: Paolo Bonzini 
Signed-off-by: Radim Krčmář 

(cherry picked from commit de3a0021a60635de96aa92713c1a31a96747d72c)
Signed-off-by: David Woodhouse 
---
 arch/x86/kvm/vmx.c | 146 +
 1 file changed, 23 insertions(+), 123 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9408ae8..55b1474 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -174,7 +174,6 @@ module_param(ple_window_max, int, S_IRUGO);
 extern const ulong vmx_return;
 
 #define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
 
 struct vmcs {
u32 revision_id;
@@ -208,7 +207,7 @@ struct shared_msr_entry {
  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
  * More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
  * underlying hardware which will be used to run L2.
  * This structure is packed to ensure that its layout is identical across
  * machines (necessary for live migration).
@@ -387,13 +386,6 @@ struct __packed vmcs12 {
  */
 #define VMCS12_SIZE 0x1000
 
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
-   struct list_head list;
-   gpa_t vmptr;
-   struct loaded_vmcs vmcs02;
-};
-
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -420,15 +412,15 @@ struct nested_vmx {
 */
bool sync_shadow_vmcs;
 
-   /* vmcs02_list cache of VMCSs recently used to run L2 guests */
-   struct list_head vmcs02_pool;
-   int vmcs02_num;
bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
+
+   struct loaded_vmcs vmcs02;
+
/*
-* Guest pages referred to in vmcs02 with host-physical pointers, so
-* we must keep them pinned while L2 runs.
+* Guest pages referred to in the vmcs02 with host-physical
+* pointers, so we must keep them pinned while L2 runs.
 */
struct page *apic_access_page;
struct page *virtual_apic_page;
@@ -6682,94 +6674,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
 }
 
 /*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
-   struct vmcs02_list *item;
-   list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-   if (item->vmptr == vmx->nested.current_vmptr) {
-   list_move(&item->list, &vmx->nested.vmcs02_pool);
-   return &item->vmcs02;
-   }
-
-   if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
-   /* Recycle the least recently used VMCS. */
-   item = list_last_entry(&vmx->nested.vmcs02_pool,
-  struct vmcs02_list, list);
-   item->vmptr = vmx->nested.current_vmptr;
-   list_move(&item->list, &vmx->nested.vmcs02_pool);
-   return &item->vmcs02;
-   }
-
-   /* Create a new VMCS */
-   item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
-   if (!item)
-   return NULL;
-   item->vmcs02.vmcs = alloc_vmcs();
-   item->vmcs02.shadow_vmcs = NULL;
-   if (!item->vmcs02.vmcs) {
-   kfree(item);
-   return NULL;
-   }
-   loaded_vmcs_init(&item->vmcs02);
-   item->vmptr = vmx->nested.current_vmptr;
-   list_add(&(item->list), &(vmx->nested.vmcs02_pool));
-   vmx->nested.vmcs02_num++;
-  

[STABLE 4.9.y PATCH 0/9] Backport of KVM Speculation Control support

2018-02-06 Thread David Woodhouse
I've put together a linux-4.9.y branch at 
http://git.infradead.org/retpoline-stable.git/shortlog/refs/heads/linux-4.9.y

Most of it is fairly straightforward, apart from the IBPB on context 
switch for which Tim has already posted a candidate. I wanted some more
review on my backports of the KVM bits though, including some extra
historical patches I pulled in.

Ashok Raj (1):
  KVM/x86: Add IBPB support

David Hildenbrand (1):
  KVM: nVMX: vmx_complete_nested_posted_interrupt() can't fail

David Matlack (1):
  KVM: nVMX: mark vmcs12 pages dirty on L2 exit

Jim Mattson (1):
  KVM: nVMX: Eliminate vmcs02 pool

KarimAllah Ahmed (3):
  KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES
  KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL
  KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL

Paolo Bonzini (2):
  KVM: VMX: introduce alloc_loaded_vmcs
  KVM: VMX: make MSR bitmaps per-VCPU

 arch/x86/kvm/cpuid.c |  21 +-
 arch/x86/kvm/cpuid.h |  31 +++
 arch/x86/kvm/svm.c   | 116 
 arch/x86/kvm/vmx.c   | 730 +++
 arch/x86/kvm/x86.c   |   1 +
 5 files changed, 554 insertions(+), 345 deletions(-)

-- 
2.7.4



[PATCH] mm: Always print RLIMIT_DATA warning

2018-02-06 Thread David Woodhouse
The documentation for ignore_rlimit_data says that it will print a warning
at first misuse. Yet it doesn't seem to do that. Fix the code to print
the warning even when we allow the process to continue.

Signed-off-by: David Woodhouse <d...@amazon.co.uk>
---
We should probably also do what Linus suggested in 
https://lkml.org/lkml/2016/9/16/585
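
For reference, a quick way to exercise this path from userspace (sketch;
any allocation that grows data_vm past the limit will do, and whether it
actually fails still depends on ignore_rlimit_data):

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/resource.h>

    int main(void)
    {
            struct rlimit rl = { .rlim_cur = 4096, .rlim_max = RLIM_INFINITY };

            if (setrlimit(RLIMIT_DATA, &rl))
                    perror("setrlimit");

            /* Growing data_vm past one page should now hit the warning. */
            if (!malloc(1 << 20))
                    perror("malloc");

            return 0;
    }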

 mm/mmap.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 9efdc021..dd76ea3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3184,13 +3184,15 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t 
flags, unsigned long npages)
if (rlimit(RLIMIT_DATA) == 0 &&
mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> 
PAGE_SHIFT)
return true;
-   if (!ignore_rlimit_data) {
-   pr_warn_once("%s (%d): VmData %lu exceed data ulimit 
%lu. Update limits or use boot option ignore_rlimit_data.\n",
-current->comm, current->pid,
-(mm->data_vm + npages) << PAGE_SHIFT,
-rlimit(RLIMIT_DATA));
+
+   pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. 
Update limits%s.\n",
+current->comm, current->pid,
+(mm->data_vm + npages) << PAGE_SHIFT,
+rlimit(RLIMIT_DATA),
+ignore_rlimit_data ? "" : " or use boot option 
ignore_rlimit_data");
+
+   if (!ignore_rlimit_data)
return false;
-   }
}
 
return true;
-- 
2.7.4



Re: [PATCH 05/10] tools headers: Synchoronize x86 features UAPI headers

2018-02-06 Thread David Woodhouse


On Mon, 2018-02-05 at 16:56 -0300, Arnaldo Carvalho de Melo wrote:
> 
> None will entail changes in the tools/perf/, synchronizing to elliminate
> these perf build warnings:
> 
>   Warning: Kernel ABI header at 
> 'tools/arch/x86/include/asm/disabled-features.h' differs from latest version 
> at 'arch/x86/include/asm/disabled-features.h'
>   Warning: Kernel ABI header at 
> 'tools/arch/x86/include/asm/required-features.h' differs from latest version 
> at 'arch/x86/include/asm/required-features.h'
>   Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' 
> differs from latest version at 'arch/x86/include/asm/cpufeatures.h'

Ick. Have we considered just using a symlink? Why have copies of the
same header file in different places in the tree, and tooling to
complain i̶f̶when they get out of sync?

smime.p7s
Description: S/MIME cryptographic signature


Re: [RFC 09/10] x86/enter: Create macros to restrict/unrestrict Indirect Branch Speculation

2018-02-06 Thread David Woodhouse


On Sun, 2018-02-04 at 19:43 +0100, Thomas Gleixner wrote:
> Yet another possibility is to avoid the function entry and accouting magic
> and use the generic gcc return thunk:
> 
> __x86_return_thunk:
> call L2
> L1:
> pause
> lfence
> jmp L1
> L2:
> lea 8(%rsp), %rsp|lea 4(%esp), %esp
> ret
> 
> which basically refills the RSB on every return. That can be inline or
> extern, but in both cases we should be able to patch it out.
> 
> I have no idea how that affects performance, but it might be worthwhile to
> experiment with that.

That was what I had in mind when I asked HJ to add -mfunction-return.

I suspect the performance hit would be significant because it would
cause a prediction miss on *every* return.

But as I said, let's implement what we can without IBRS for Skylake,
then we can compare the two options for performance, security coverage
and general fugliness.

smime.p7s
Description: S/MIME cryptographic signature


Re: [tip:x86/pti] x86/speculation: Use Indirect Branch Prediction Barrier in context switch

2018-02-05 Thread David Woodhouse
On Tue, 2018-01-30 at 14:39 -0800, tip-bot for Tim Chen wrote:
> Thanks to the reviewers and Andy Lutomirski for the suggestion of
> using ctx_id which got rid of the problem of mm pointer recycling.

That one doesn't backport well to 4.9. Suggestions welcome.

smime.p7s
Description: S/MIME cryptographic signature


Re: [9/8] KVM: x86: limit MSR_IA32_SPEC_CTRL access based on CPUID availability

2018-02-05 Thread David Woodhouse
On Mon, 2018-02-05 at 12:10 +0100, Ingo Molnar wrote:
> 
> Can this branch still be rebased, to fix the SoB chain of:
> 
>   de3a0021a606 ("KVM: nVMX: Eliminate vmcs02 pool")
> 
> ?

It can't. Linus pulled it already.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH] Fix typo IBRS_ATT, which should be IBRS_ALL

2018-02-05 Thread David Woodhouse
On Mon, 2018-02-05 at 11:00 +, Darren Kenny wrote:
> On Fri, Feb 02, 2018 at 11:42:12PM +0000, David Woodhouse wrote:
> > 
> > On Fri, 2018-02-02 at 19:12 +, Darren Kenny wrote:
> > > 
> > > Fixes a typo in commit 117cc7a908c83697b0b737d15ae1eb5943afe35b
> > > ("x86/retpoline: Fill return stack buffer on vmexit")
> > > 
> > > Signed-off-by: Darren Kenny <darren.ke...@oracle.com>
> > > Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
> > Not strictly a typo; that was the original name for it. "IBRS all
> > the
> > time". But yes, it should be IBRS_ALL now.
> > 
> > Acked-by: David Woodhouse <d...@amazon.co.uk>
> Thanks David.
> 
> I'll send out another patch with this snippet of information in it
> too, and including your Acked-by.

It's in Linus's tree already.

smime.p7s
Description: S/MIME cryptographic signature


Re: [RFC 09/10] x86/enter: Create macros to restrict/unrestrict Indirect Branch Speculation

2018-02-04 Thread David Woodhouse
On Sun, 2018-02-04 at 19:43 +0100, Thomas Gleixner wrote:
> 
> __x86_return_thunk would look like this:
> 
> __x86_return_thunk:
> testl   $0xf, PER_CPU_VAR(call_depth)
> jnz 1f  
> stuff_rsb
>    1:
> decl    PER_CPU_VAR(call_depth)
> ret
> 
> The call_depth variable would be reset on context switch.

Note that the 'jnz' can be predicted taken there, allowing the CPU to
speculate all the way to the 'ret'... and beyond.



smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()

2018-02-03 Thread David Woodhouse
On Fri, 2018-02-02 at 21:23 +, Alan Cox wrote:
> In addition the problem with switch() is that gcc might decide in some
> cases that the best way to implement your switch is an indirect call
> from a lookup table.

That's also true of the

  if (ptr == usualfunction)
     usualfunction();
  else
     *ptr();

construct. Especially if GCC doesn't take into account the increased
cost of indirect branches with retpoline.

> For the simple case how about wrapping the if into
> 
> call_likely(foo->bar, usualfunction, args)
> 
> as a companion to 
> 
>  foo->bar(args)
> 
> that can resolve to nothing special on architectures that don't need it,
> an if/else case on platforms with spectre, and potentially clever
> stuff on any platform where you can beat the compiler by knowing
> probabilities it can't infer ?

Yeah. I'm keen on being able to use something like alternatives to
*change* 'usualfunction' at runtime though. I suspect it'll be a win
for stuff like dma_ops.

But I'm also keen to actually base such things on real data, not just
go randomly "optimising" stuff just because we can. Let's try to make
sure we fix up the real bottlenecks, and not just go crazy.
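
A rough user-space sketch of the "likely target" pattern under discussion follows; call_likely, usualfunc and rarefunc are illustrative names only, not code from any posted patch. The point is that the common case compiles to a direct call, which a retpoline build leaves alone, and only the fallback goes through the (retpolined) indirect branch.

#include <stdio.h>

typedef void (*handler_t)(int);

static void usualfunc(int x) { printf("fast path %d\n", x); }
static void rarefunc(int x)  { printf("slow path %d\n", x); }

/* Compare against the expected target; only the fallback is indirect. */
#define call_likely(ptr, likely_fn, arg)	\
	do {					\
		if ((ptr) == (likely_fn))	\
			likely_fn(arg);		\
		else				\
			(*(ptr))(arg);		\
	} while (0)

int main(void)
{
	handler_t h = usualfunc;

	call_likely(h, usualfunc, 1);	/* direct call in the common case */
	h = rarefunc;
	call_likely(h, usualfunc, 2);	/* falls back to the indirect call */
	return 0;
}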

smime.p7s
Description: S/MIME cryptographic signature


Re: [BUG] x86 : i486 reporting to be vulnerable to Meltdown/Spectre_V1/Spectre_V2

2018-02-02 Thread David Woodhouse
On Fri, 2018-02-02 at 23:52 -0500, tedheadster wrote:
> I just tested the 4.15 kernel and it is reporting that my old i486
> (non-cpuid capable) cpu is vulnerable to all three issues: Meltdown,
> Spectre V1, and Spectre V2.
> 
> I find this to be _unlikely_.

This should be fixed in Linus' tree already by commit fec9434a1
("x86/pti: Do not enable PTI on CPUs which are not vulnerable to
Meltdown").

We'll make sure it ends up in the stable tree too, if it hasn't
already.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH] Fix typo IBRS_ATT, which should be IBRS_ALL

2018-02-02 Thread David Woodhouse
On Fri, 2018-02-02 at 19:12 +, Darren Kenny wrote:
> Fixes a typo in commit 117cc7a908c83697b0b737d15ae1eb5943afe35b
> ("x86/retpoline: Fill return stack buffer on vmexit")
> 
> Signed-off-by: Darren Kenny <darren.ke...@oracle.com>
> Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>

Not strictly a typo; that was the original name for it. "IBRS all the
time". But yes, it should be IBRS_ALL now.

Acked-by: David Woodhouse <d...@amazon.co.uk>


smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()

2018-02-02 Thread David Woodhouse
On Fri, 2018-02-02 at 16:10 -0500, Paolo Bonzini wrote:
> > > On 2. Feb 2018, at 15:59, David Woodhouse <d...@amazon.co.uk> wrote:
> > > With retpoline, tight loops of "call this function for every XXX" are
> > > very much pessimised by taking a prediction miss *every* time.
> > > 
> > > This one showed up very high in our early testing, and it only has five
> > > things it'll ever call so make it take an 'op' enum instead of a
> > > function pointer and let's see how that works out...
> > > 
> > > Signed-off-by: David Woodhouse <d...@amazon.co.uk>
> 
> What about __always_inline instead?

Yeah, Linus suggested that and it looks like it generates sane (in fact
better) code. Will do some more testing and send it out for real
probably on Monday. Thanks.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v6 2/5] KVM: x86: Add IBPB support

2018-02-02 Thread David Woodhouse
On Fri, 2018-02-02 at 15:28 -0500, Konrad Rzeszutek Wilk wrote:
> 
> > 
> > No. The AMD feature bits give us more fine-grained support for exposing
> > IBPB or IBRS alone, so we expose those bits on Intel too.
> 
> But but.. that runs smack against the idea of exposing a platform that
> is as close to emulating the real hardware as possible.
> 
> As in I would never expect an Intel CPU to expose the IBPB on the 0x8000_0008
> leaf. Hence KVM (nor any hypervisor) should not do it either.
> 
> Unless Intel is doing it? Did I miss a new spec update?

Are you telling me there's no way you can infer from CPUID that you're
running in a hypervisor?
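
For reference, a guest can already tell it is virtualised from the standard "hypervisor present" bit (CPUID leaf 1, ECX bit 31); a minimal user-space check, purely illustrative:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	/* CPUID.1:ECX[31] is set by hypervisors, clear on bare metal. */
	printf("running under a hypervisor: %s\n",
	       (ecx & (1u << 31)) ? "yes" : "no");
	return 0;
}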

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v6 2/5] KVM: x86: Add IBPB support

2018-02-02 Thread David Woodhouse


On Fri, 2018-02-02 at 14:56 -0500, Konrad Rzeszutek Wilk wrote:
> On Fri, Feb 02, 2018 at 06:02:24PM +0000, David Woodhouse wrote:
> > 
> > On Fri, 2018-02-02 at 12:49 -0500, Konrad Rzeszutek Wilk wrote:
> > > 
> > > > 
> > > > @@ -625,7 +629,12 @@ static inline int __do_cpuid_ent(struct
> > > > kvm_cpuid_entry2 *entry, u32 function,
> > > >  if (!g_phys_as)
> > > >  g_phys_as = phys_as;
> > > >  entry->eax = g_phys_as | (virt_as << 8);
> > > > -   entry->ebx = entry->edx = 0;
> > > > +   entry->edx = 0;
> > > > +   /* IBPB isn't necessarily present in hardware cpuid  */
> > > > +   if (boot_cpu_has(X86_FEATURE_IBPB))
> > > > +   entry->ebx |= F(IBPB);
> > > > +   entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
> > > > +   cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
> > > It is with x86/pti nowadays. I think you can remove that comment.
> > In this code we use the actual CPUID instruction, then filter stuff out
> > of it (with &= kvm_cpuid_XXX_x86_features and then cpuid_mask() to turn
> > off any bits which were otherwise present in the hardware and *would*
> > have been supported by KVM, but which the kernel has decided to pretend
> > are not present.
> > 
> > Nothing would *set* the IBPB bit though, since that's a "virtual" bit
> > on Intel hardware. The comment explains why we have that |= F(IBPB),
> > and if the comment wasn't true, we wouldn't need that code either.
>
> But this seems wrong. That is on Intel CPUs we will advertise on
> AMD leaf that the IBPB feature is available.
> 
> Shouldn't we just check to see if the machine is AMD before advertising
> this bit?

No. The AMD feature bits give us more fine-grained support for exposing
IBPB or IBRS alone, so we expose those bits on Intel too.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()

2018-02-02 Thread David Woodhouse
On Fri, 2018-02-02 at 11:10 -0800, Linus Torvalds wrote:
> On Fri, Feb 2, 2018 at 10:50 AM, Linus Torvalds
>  wrote:
> > 
> > 
> > Will it make for bigger code? Yes. But probably not really all *that*
> > much bigger, because of how it also will allow the compiler to
> > simplify some things.
>
> Actually, testing this with my fairly minimal config, it actually
> makes for *smaller* code to inline those things.
> 
> That may be a quirk of my configuration, or maybe I screwed something
> else up, but:
> 
>   [torvalds@i7 linux]$ size ~/mmu.o arch/x86/kvm/mmu.o
>  textdata bss dec hex filename
> 855879310 120   95017   17329 /home/torvalds/mmu.o
> 855319310 120   94961   172f1 arch/x86/kvm/mmu.o
> 
> so the attached patch actually shrank things down by about 50 bytes
> because of the code simplification.
> 
> Of course, I have been known to screw up retpoline testing in the
> past, so my numbers are suspect ;). Somebody should double-check me.

I got this:

   text    data  bss    dec    hex  filename
  87167    9310  120  96597  17955  arch/x86/kvm/mmu.o
  88299    9310  120  97729  17dc1  arch/x86/kvm/mmu-inline.o

But then, I'd also done kvm_handle_hva() and kvm_handle_hva_range().

Either way, that does look like a reasonable answer. I had looked at
the various one-line wrappers around slot_handle_level_range() and
thought "hm, those should be inline", but I hadn't made the next step
and pondered putting the whole thing inline. We'll give it a spin and
work out where the next performance bottleneck is. Thanks.
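
A toy illustration (invented names, plain GCC, not the kernel code) of why inlining the walker removes the retpoline cost: once walk_slots() is forced inline into each thin wrapper, the function-pointer argument is a compile-time constant at every call site and the compiler can emit a direct call instead of an indirect one.

#include <stdbool.h>
#include <stdio.h>

#define __always_inline inline __attribute__((always_inline))

typedef bool (*slot_handler_t)(int slot);

static bool clear_dirty(int slot)
{
	printf("clear dirty on slot %d\n", slot);
	return true;
}

/* Generic walker: 'fn' is an indirect call here... */
static __always_inline bool walk_slots(int nr, slot_handler_t fn)
{
	bool flush = false;
	int i;

	for (i = 0; i < nr; i++)
		flush |= fn(i);	/* ...until inlining makes 'fn' a constant */
	return flush;
}

/* Thin wrapper: after inlining, the call above devirtualises. */
static bool walk_clear_dirty(int nr)
{
	return walk_slots(nr, clear_dirty);
}

int main(void)
{
	return walk_clear_dirty(3) ? 0 : 1;
}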

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v6 4/5] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL

2018-02-02 Thread David Woodhouse
On Fri, 2018-02-02 at 12:53 -0500, Konrad Rzeszutek Wilk wrote:
> .snip..
> > 
> > @@ -1913,6 +1914,29 @@ static void update_exception_bitmap(struct
> > kvm_vcpu *vcpu)
> >  }
> >  
> >  /*
> > + * Check if MSR is intercepted for currently loaded MSR bitmap.
> > + */
> > +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
> > +{
> > +   unsigned long *msr_bitmap;
> > +   int f = sizeof(unsigned long);
>
> unsigned int

$ grep -n 'f = sizeof' vmx.c
1934:   int f = sizeof(unsigned long);
5013:   int f = sizeof(unsigned long);
5048:   int f = sizeof(unsigned long);
5097:   int f = sizeof(unsigned long);

It sucks enough that we're doing this stuff repeatedly, and it's a
prime candidate for cleaning up, but I wasn't going to send Karim off
to bikeshed that today. Let's at least keep it consistent.
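
For readers following along, a stand-alone sketch of the lookup the quoted helper performs (write-intercept bits for low MSRs start at byte offset 0x800 of the 4K VMX bitmap, high MSRs at 0xc00); the 'f' being discussed is just sizeof(unsigned long), used to turn those byte offsets into longword indices. Everything below is illustrative user-space code, not the kernel implementation.

#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG	(CHAR_BIT * sizeof(unsigned long))

static bool test_bit(unsigned int nr, const unsigned long *addr)
{
	return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

/* Is a *write* to 'msr' intercepted according to this 4K bitmap? */
static bool msr_write_intercepted(const unsigned long *msr_bitmap, uint32_t msr)
{
	const unsigned int f = sizeof(unsigned long);

	if (msr <= 0x1fff)
		return test_bit(msr, msr_bitmap + 0x800 / f);
	if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		return test_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
	return true;	/* MSRs outside the bitmap ranges always exit */
}

int main(void)
{
	unsigned long bitmap[4096 / sizeof(unsigned long)];

	memset(bitmap, 0xff, sizeof(bitmap));		/* intercept everything */
	bitmap[0x800 / sizeof(unsigned long)] &= ~1UL;	/* ...except writes to MSR 0 */

	printf("write to MSR 0x48 intercepted: %d\n",
	       msr_write_intercepted(bitmap, 0x48));
	printf("write to MSR 0x00 intercepted: %d\n",
	       msr_write_intercepted(bitmap, 0x00));
	return 0;
}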


smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v6 2/5] KVM: x86: Add IBPB support

2018-02-02 Thread David Woodhouse
On Fri, 2018-02-02 at 12:49 -0500, Konrad Rzeszutek Wilk wrote:
> > @@ -625,7 +629,12 @@ static inline int __do_cpuid_ent(struct
> > kvm_cpuid_entry2 *entry, u32 function,
> > if (!g_phys_as)
> > g_phys_as = phys_as;
> > entry->eax = g_phys_as | (virt_as << 8);
> > -   entry->ebx = entry->edx = 0;
> > +   entry->edx = 0;
> > +   /* IBPB isn't necessarily present in hardware cpuid  */
> > +   if (boot_cpu_has(X86_FEATURE_IBPB))
> > +   entry->ebx |= F(IBPB);
> > +   entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
> > +   cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
> 
> It is with x86/pti nowadays. I think you can remove that comment.

In this code we use the actual CPUID instruction, then filter stuff out
of it (with &= kvm_cpuid_XXX_x86_features and then cpuid_mask()) to turn
off any bits which were otherwise present in the hardware and *would*
have been supported by KVM, but which the kernel has decided to pretend
are not present.

Nothing would *set* the IBPB bit though, since that's a "virtual" bit
on Intel hardware. The comment explains why we have that |= F(IBPB),
and if the comment wasn't true, we wouldn't need that code either.

smime.p7s
Description: S/MIME cryptographic signature


[PATCH] KVM: x86: Reduce retpoline performance impact in slot_handle_level_range()

2018-02-02 Thread David Woodhouse
With retpoline, tight loops of "call this function for every XXX" are
very much pessimised by taking a prediction miss *every* time.

This one showed up very high in our early testing, and it only has five
things it'll ever call so make it take an 'op' enum instead of a
function pointer and let's see how that works out...

Signed-off-by: David Woodhouse <d...@amazon.co.uk>
---
Not sure I like this. Better suggestions welcomed...

In the general case, we have a few things we can do with the calls that
retpoline turns into bottlenecks. This is one of them.

Another option, if there are just one or two "likely" functions, is
something along the lines of

 if (func == likelyfunc)
likelyfunc()
 else
(*func)(); // GCC does retpoline for this

For things like kvm_x86_ops we really could just turn *all* of those
into direct calls at runtime, like pvops does.

There are some which land somewhere in the middle, like the default
dma_ops. We probably want something like the 'likelyfunc' version
above, except that we *also* want to flip the likelyfunc between the
Intel and AMD IOMMU ops functions, at early boot. I'll see what I can
come up with...

 arch/x86/kvm/mmu.c | 72 --
 1 file changed, 48 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2b8eb4d..44f9de7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5055,12 +5055,21 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
 }
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head 
*rmap_head);
+enum slot_handler_op {
+   SLOT_RMAP_CLEAR_DIRTY,
+   SLOT_RMAP_SET_DIRTY,
+   SLOT_RMAP_WRITE_PROTECT,
+   SLOT_ZAP_RMAPP,
+   SLOT_ZAP_COLLAPSIBLE_SPTE,
+};
+
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+struct kvm_rmap_head *rmap_head);
 
 /* The caller should hold mmu-lock before calling this function. */
 static bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
-   slot_level_handler fn, int start_level, int end_level,
+   enum slot_handler_op op, int start_level, int end_level,
gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
 {
struct slot_rmap_walk_iterator iterator;
@@ -5068,8 +5077,29 @@ slot_handle_level_range(struct kvm *kvm, struct 
kvm_memory_slot *memslot,
 
for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
-   if (iterator.rmap)
-   flush |= fn(kvm, iterator.rmap);
+   if (iterator.rmap) {
+   switch (op) {
+   case SLOT_RMAP_CLEAR_DIRTY:
+   flush |= __rmap_clear_dirty(kvm, iterator.rmap);
+   break;
+
+   case SLOT_RMAP_SET_DIRTY:
+   flush |= __rmap_set_dirty(kvm, iterator.rmap);
+   break;
+
+   case SLOT_RMAP_WRITE_PROTECT:
+   flush |= __rmap_write_protect(kvm, 
iterator.rmap, false);
+   break;
+
+   case SLOT_ZAP_RMAPP:
+   flush |= kvm_zap_rmapp(kvm, iterator.rmap);
+   break;
+
+   case SLOT_ZAP_COLLAPSIBLE_SPTE:
+   flush |= kvm_mmu_zap_collapsible_spte(kvm, 
iterator.rmap);
+   break;
+   }
+   }
 
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
if (flush && lock_flush_tlb) {
@@ -5090,10 +5120,10 @@ slot_handle_level_range(struct kvm *kvm, struct 
kvm_memory_slot *memslot,
 
 static bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
+ enum slot_handler_op op, int start_level, int end_level,
  bool lock_flush_tlb)
 {
-   return slot_handle_level_range(kvm, memslot, fn, start_level,
+   return slot_handle_level_range(kvm, memslot, op, start_level,
end_level, memslot->base_gfn,
memslot->base_gfn + memslot->npages - 1,
lock_flush_tlb);
@@ -5101,25 +5131,25 @@ slot_handle_level(struct kvm *kvm, struct 
kvm_memory_slot *memslot,
 
 static bool
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+ enum slot_handler_op op, bool lock_flush_tlb)
 {
-   return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+   return slot_hand

[tip:x86/pti] x86/retpoline: Avoid retpolines for built-in __init functions

2018-02-02 Thread tip-bot for David Woodhouse
Commit-ID:  66f793099a636862a71c59d4a6ba91387b155e0c
Gitweb: https://git.kernel.org/tip/66f793099a636862a71c59d4a6ba91387b155e0c
Author: David Woodhouse <d...@amazon.co.uk>
AuthorDate: Thu, 1 Feb 2018 11:27:20 +
Committer:  Thomas Gleixner <t...@linutronix.de>
CommitDate: Fri, 2 Feb 2018 12:28:27 +0100

x86/retpoline: Avoid retpolines for built-in __init functions

There's no point in building init code with retpolines, since it runs before
any potentially hostile userspace does. And before the retpoline is actually
ALTERNATIVEd into place, for much of it.

Signed-off-by: David Woodhouse <d...@amazon.co.uk>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
Cc: karah...@amazon.de
Cc: pet...@infradead.org
Cc: b...@alien8.de
Link: 
https://lkml.kernel.org/r/1517484441-1420-2-git-send-email-d...@amazon.co.uk

---
 include/linux/init.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/init.h b/include/linux/init.h
index ea1b311..506a981 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -5,6 +5,13 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 
+/* Built-in __init functions needn't be compiled with retpoline */
+#if defined(RETPOLINE) && !defined(MODULE)
+#define __noretpoline __attribute__((indirect_branch("keep")))
+#else
+#define __noretpoline
+#endif
+
 /* These macros are used to mark some functions or 
  * initialized data (doesn't apply to uninitialized data)
  * as `initialization' functions. The kernel can take this
@@ -40,7 +47,7 @@
 
 /* These are for everybody (although not all archs will actually
discard it in modules) */
-#define __init __section(.init.text) __cold  __latent_entropy
+#define __init __section(.init.text) __cold  __latent_entropy __noretpoline
 #define __initdata __section(.init.data)
 #define __initconst __section(.init.rodata)
 #define __exitdata __section(.exit.data)


Re: [PATCH v6 4/5] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL

2018-02-02 Thread David Woodhouse
On Thu, 2018-02-01 at 22:59 +0100, KarimAllah Ahmed wrote:

> [ Based on a patch from Ashok Raj <ashok@intel.com> ]
> 
> Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for
> guests that will only mitigate Spectre V2 through IBRS+IBPB and will not
> be using a retpoline+IBPB based approach.
> 
> To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for
> guests that do not actually use the MSR, only start saving and restoring
> when a non-zero is written to it.
> 
> No attempt is made to handle STIBP here, intentionally. Filtering STIBP
> may be added in a future patch, which may require trapping all writes
> if we don't want to pass it through directly to the guest.
> 
> [dwmw2: Clean up CPUID bits, save/restore manually, handle reset]
> 
> Cc: Asit Mallick <asit.k.mall...@intel.com>
> Cc: Arjan Van De Ven <arjan.van.de@intel.com>
> Cc: Dave Hansen <dave.han...@intel.com>
> Cc: Andi Kleen <a...@linux.intel.com>
> Cc: Andrea Arcangeli <aarca...@redhat.com>
> Cc: Linus Torvalds <torva...@linux-foundation.org>
> Cc: Tim Chen <tim.c.c...@linux.intel.com>
> Cc: Thomas Gleixner <t...@linutronix.de>
> Cc: Dan Williams <dan.j.willi...@intel.com>
> Cc: Jun Nakajima <jun.nakaj...@intel.com>
> Cc: Paolo Bonzini <pbonz...@redhat.com>
> Cc: David Woodhouse <d...@amazon.co.uk>
> Cc: Greg KH <gre...@linuxfoundation.org>
> Cc: Andy Lutomirski <l...@kernel.org>
> Cc: Ashok Raj <ashok@intel.com>
> Signed-off-by: KarimAllah Ahmed <karah...@amazon.de>
> Signed-off-by: David Woodhouse <d...@amazon.co.uk>
> ---
> v6:
> - got rid of save_spec_ctrl_on_exit
> - introduce msr_write_intercepted
> v5:
> - Also check for X86_FEATURE_SPEC_CTRL for the msr reads/writes
> v4:
> - Add IBRS to kvm_cpuid_8000_0008_ebx_x86_features
> - Handling nested guests
> v3:
> - Save/restore manually
> - Fix CPUID handling
> - Fix a copy & paste error in the name of SPEC_CTRL MSR in
>   disable_intercept.
> - support !cpu_has_vmx_msr_bitmap()
> v2:
> - remove 'host_spec_ctrl' in favor of only a comment (dwmw@).
> - special case writing '0' in SPEC_CTRL to avoid confusing live-migration
>   when the instance never used the MSR (dwmw@).
> - depend on X86_FEATURE_IBRS instead of X86_FEATURE_SPEC_CTRL (dwmw@).
> - add MSR_IA32_SPEC_CTRL to the list of MSRs to save (dropped it by accident).

This looks very good to me now, and the comments are helpful. Thank you
for your persistence with getting the details right. If we make the SVM
one look like this, as you mentioned, I think we ought finally be ready
to merge it.

Good work ;)


smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH 0/7] objtool: retpoline validation

2018-02-01 Thread David Woodhouse
On Thu, 2018-02-01 at 16:40 +0100, Peter Zijlstra wrote:
> On Thu, Feb 01, 2018 at 03:32:11PM +0000, David Woodhouse wrote:
> > 
> > On Thu, 2018-02-01 at 09:28 -0600, Josh Poimboeuf wrote:
> > > 
> > > On Thu, Feb 01, 2018 at 03:34:21PM +0100, Peter Zijlstra wrote:
> > > > 
> > > > 
> > > > There are the retpoline validation patches; they work with the
> > > > __noretpoline
> > > > thing from David.
> > > Have you run this through 0-day bot yet?  A manual awk/sed found
> > > another
> > > one, which objtool confirms:
> > > 
> > >   drivers/watchdog/.tmp_hpwdt.o: warning: objtool: .text+0x24:
> > > indirect call found in RETPOLINE build
> > > 
> > > And my search wasn't exhaustive so it would be good to sic 0-day bot on
> > > it.
> > We discussed that one. It's correct; we're calling into firmware so
> > there's *no* point in retpolining that one. We need to set IBRS before
> > any runtime calls into firmware, if we want to be safe.
>
> Ideally we'd have a way to mark the module 'unsafe' or something.

No, we just need to set IBRS before doing it. The same applies to any
EFI runtime calls, APM and all kinds of other random crap that calls
into firmware. I'm not sure why those aren't showing up.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH 0/7] objtool: retpoline validation

2018-02-01 Thread David Woodhouse
On Thu, 2018-02-01 at 09:28 -0600, Josh Poimboeuf wrote:
> On Thu, Feb 01, 2018 at 03:34:21PM +0100, Peter Zijlstra wrote:
> > 
> > There are the retpoline validation patches; they work with the
> > __noretpoline
> > thing from David.
> Have you run this through 0-day bot yet?  A manual awk/sed found
> another
> one, which objtool confirms:
> 
>   drivers/watchdog/.tmp_hpwdt.o: warning: objtool: .text+0x24:
> indirect call found in RETPOLINE build
> 
> And my search wasn't exhaustive so it would be good to sic 0-day bot on
> it.

We discussed that one. It's correct; we're calling into firmware so
there's *no* point in retpolining that one. We need to set IBRS before
any runtime calls into firmware, if we want to be safe.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH 4/7] x86,nospec: Annotate indirect calls/jumps

2018-02-01 Thread David Woodhouse


On Thu, 2018-02-01 at 15:34 +0100, Peter Zijlstra wrote:
> 
>   * These are the bare retpoline primitives for indirect jmp and call.
>   * Do not use these directly; they only exist to make the ALTERNATIVE
>   * invocation below less ugly.
> @@ -102,9 +114,9 @@
>  .macro JMP_NOSPEC reg:req
>  #ifdef CONFIG_RETPOLINE
> ANNOTATE_NOSPEC_ALTERNATIVE
> -   ALTERNATIVE_2 __stringify(jmp *\reg),   \
> +   ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg),  \
> __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \
> -   __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
> +   __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), 
> X86_FEATURE_RETPOLINE_AMD
>  #else
> jmp *\reg
>  #endif

The first one, yes. But the second one for the AMD retpoline is
redundant, isn't it? Objtool isn't going to look there.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v5 4/5] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL

2018-02-01 Thread David Woodhouse


On Wed, 2018-01-31 at 23:26 -0500, Konrad Rzeszutek Wilk wrote:
> 
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index 6a9f4ec..bfc80ff 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -594,6 +594,14 @@ struct vcpu_vmx {
> >  #endif
> >  
> >   u64   arch_capabilities;
> > + u64   spec_ctrl;
> > +
> > + /*
> > +  * This indicates that:
> > +  * 1) guest_cpuid_has(X86_FEATURE_IBRS) == true &&
> > +  * 2) The guest has actually initiated a write against the MSR.
> > +  */
> > + bool spec_ctrl_used;
> >  
> >   /*
> >    * This indicates that:

Thanks for persisting with the details here, Karim. In addition to
Konrad's heckling at the comments, I'll add my own request to his...

I'd like the comment for spec_ctrl_used to explain why it isn't
entirely redundant with the spec_ctrl_intercepted() function.

Without nesting, I believe it *would* be redundant, but the difference
comes when an L2 is running for which L1 has not permitted the MSR to
be passed through. That's when we have spec_ctrl_used = true but the
MSR *isn't* actually passed through in the active msr_bitmap.

Question: if spec_ctrl_used is always equivalent to the intercept bit
in the vmcs01.msr_bitmap, just not the guest bitmap... should we ditch
it and always use the bit from the vmcs01.msr_bitmap?

Sorry :)

smime.p7s
Description: S/MIME cryptographic signature


[PATCH 2/2] x86: Simplify spectre_v2 command line parsing

2018-02-01 Thread David Woodhouse
From: KarimAllah Ahmed <karah...@amazon.de>

[dwmw2: Use ARRAY_SIZE]
Signed-off-by: KarimAllah Ahmed <karah...@amazon.de>
Signed-off-by: David Woodhouse <d...@amazon.co.uk>
---
 arch/x86/kernel/cpu/bugs.c | 86 ++
 1 file changed, 56 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index dd3a3cc..71949bf 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -119,13 +119,13 @@ static inline const char *spectre_v2_module_string(void) 
{ return ""; }
 static void __init spec2_print_if_insecure(const char *reason)
 {
if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
-   pr_info("%s\n", reason);
+   pr_info("%s selected on command line.\n", reason);
 }
 
 static void __init spec2_print_if_secure(const char *reason)
 {
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
-   pr_info("%s\n", reason);
+   pr_info("%s selected on command line.\n", reason);
 }
 
 static inline bool retp_compiler(void)
@@ -140,42 +140,68 @@ static inline bool match_option(const char *arg, int 
arglen, const char *opt)
return len == arglen && !strncmp(arg, opt, len);
 }
 
+static const struct {
+   const char *option;
+   enum spectre_v2_mitigation_cmd cmd;
+   bool secure;
+} mitigation_options[] = {
+   { "off",   SPECTRE_V2_CMD_NONE,  false },
+   { "on",SPECTRE_V2_CMD_FORCE, true },
+   { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
+   { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
+   { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+   { "auto",  SPECTRE_V2_CMD_AUTO,  false },
+};
+
 static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
 {
char arg[20];
-   int ret;
-
-   ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
- sizeof(arg));
-   if (ret > 0)  {
-   if (match_option(arg, ret, "off")) {
-   goto disable;
-   } else if (match_option(arg, ret, "on")) {
-   spec2_print_if_secure("force enabled on command line.");
-   return SPECTRE_V2_CMD_FORCE;
-   } else if (match_option(arg, ret, "retpoline")) {
-   spec2_print_if_insecure("retpoline selected on command 
line.");
-   return SPECTRE_V2_CMD_RETPOLINE;
-   } else if (match_option(arg, ret, "retpoline,amd")) {
-   if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
-   pr_err("retpoline,amd selected but CPU is not 
AMD. Switching to AUTO select\n");
-   return SPECTRE_V2_CMD_AUTO;
-   }
-   spec2_print_if_insecure("AMD retpoline selected on 
command line.");
-   return SPECTRE_V2_CMD_RETPOLINE_AMD;
-   } else if (match_option(arg, ret, "retpoline,generic")) {
-   spec2_print_if_insecure("generic retpoline selected on 
command line.");
-   return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
-   } else if (match_option(arg, ret, "auto")) {
+   int ret, i;
+   enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
+
+   if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+   return SPECTRE_V2_CMD_NONE;
+   else {
+   ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
+ sizeof(arg));
+   if (ret < 0)
+   return SPECTRE_V2_CMD_AUTO;
+
+   for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
+   if (!match_option(arg, ret, 
mitigation_options[i].option))
+   continue;
+   cmd = mitigation_options[i].cmd;
+   break;
+   }
+
+   if (i >= ARRAY_SIZE(mitigation_options)) {
+   pr_err("unknown option (%s). Switching to AUTO 
select\n",
+  mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
}
 
-   if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+   if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
+cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
+cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
+   !IS_ENAB

[PATCH 2/2] x86: Simplify spectre_v2 command line parsing

2018-02-01 Thread David Woodhouse
From: KarimAllah Ahmed 

[dwmw2: Use ARRAY_SIZE]
Signed-off-by: KarimAllah Ahmed 
Signed-off-by: David Woodhouse 
---
 arch/x86/kernel/cpu/bugs.c | 86 ++
 1 file changed, 56 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index dd3a3cc..71949bf 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -119,13 +119,13 @@ static inline const char *spectre_v2_module_string(void) 
{ return ""; }
 static void __init spec2_print_if_insecure(const char *reason)
 {
if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
-   pr_info("%s\n", reason);
+   pr_info("%s selected on command line.\n", reason);
 }
 
 static void __init spec2_print_if_secure(const char *reason)
 {
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
-   pr_info("%s\n", reason);
+   pr_info("%s selected on command line.\n", reason);
 }
 
 static inline bool retp_compiler(void)
@@ -140,42 +140,68 @@ static inline bool match_option(const char *arg, int 
arglen, const char *opt)
return len == arglen && !strncmp(arg, opt, len);
 }
 
+static const struct {
+   const char *option;
+   enum spectre_v2_mitigation_cmd cmd;
+   bool secure;
+} mitigation_options[] = {
+   { "off",   SPECTRE_V2_CMD_NONE,  false },
+   { "on",SPECTRE_V2_CMD_FORCE, true },
+   { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
+   { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
+   { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+   { "auto",  SPECTRE_V2_CMD_AUTO,  false },
+};
+
 static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
 {
char arg[20];
-   int ret;
-
-   ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
- sizeof(arg));
-   if (ret > 0)  {
-   if (match_option(arg, ret, "off")) {
-   goto disable;
-   } else if (match_option(arg, ret, "on")) {
-   spec2_print_if_secure("force enabled on command line.");
-   return SPECTRE_V2_CMD_FORCE;
-   } else if (match_option(arg, ret, "retpoline")) {
-   spec2_print_if_insecure("retpoline selected on command 
line.");
-   return SPECTRE_V2_CMD_RETPOLINE;
-   } else if (match_option(arg, ret, "retpoline,amd")) {
-   if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
-   pr_err("retpoline,amd selected but CPU is not 
AMD. Switching to AUTO select\n");
-   return SPECTRE_V2_CMD_AUTO;
-   }
-   spec2_print_if_insecure("AMD retpoline selected on 
command line.");
-   return SPECTRE_V2_CMD_RETPOLINE_AMD;
-   } else if (match_option(arg, ret, "retpoline,generic")) {
-   spec2_print_if_insecure("generic retpoline selected on 
command line.");
-   return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
-   } else if (match_option(arg, ret, "auto")) {
+   int ret, i;
+   enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
+
+   if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+   return SPECTRE_V2_CMD_NONE;
+   else {
+   ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
+ sizeof(arg));
+   if (ret < 0)
+   return SPECTRE_V2_CMD_AUTO;
+
+   for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
+   if (!match_option(arg, ret, mitigation_options[i].option))
+   continue;
+   cmd = mitigation_options[i].cmd;
+   break;
+   }
+
+   if (i >= ARRAY_SIZE(mitigation_options)) {
+   pr_err("unknown option (%s). Switching to AUTO select\n", arg);
return SPECTRE_V2_CMD_AUTO;
}
}
 
-   if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+   if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
+cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
+cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
+   !IS_ENABLED(CONFIG_RETPOLINE)) {
+   pr_err("%s selected but not compi

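For reference, a small standalone sketch of the table-driven match the patch introduces (illustrative names and option list, written as ordinary userspace C rather than kernel code):

#include <stdio.h>
#include <string.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

enum cmd { CMD_NONE, CMD_FORCE, CMD_RETPOLINE, CMD_AUTO };

static const struct {
	const char *option;
	enum cmd cmd;
} options[] = {
	{ "off",       CMD_NONE      },
	{ "on",        CMD_FORCE     },
	{ "retpoline", CMD_RETPOLINE },
	{ "auto",      CMD_AUTO      },
};

/* The argument from the command line is length-delimited, not
 * NUL-terminated, so compare the length first and then the bytes. */
static int match_option(const char *arg, int arglen, const char *opt)
{
	int len = strlen(opt);

	return len == arglen && !strncmp(arg, opt, len);
}

static enum cmd parse(const char *arg, int arglen)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(options); i++)
		if (match_option(arg, arglen, options[i].option))
			return options[i].cmd;

	/* Unknown string: report the raw argument, fall back to AUTO. */
	fprintf(stderr, "unknown option (%.*s)\n", arglen, arg);
	return CMD_AUTO;
}

int main(void)
{
	printf("%d\n", parse("retpoline", 9));
	printf("%d\n", parse("bogus", 5));
	return 0;
}
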
[PATCH 0/2] Spectre v2 tweaks

2018-02-01 Thread David Woodhouse
A couple of things that have been lurking in the "shall we do this?" area
of my git tree for a while, and I think we probably should...

David Woodhouse (1):
  x86/retpoline: No retpolines for built-in __init functions

KarimAllah Ahmed (1):
  x86: Simplify spectre_v2 command line parsing

 arch/x86/kernel/cpu/bugs.c | 86 ++
 include/linux/init.h   |  9 -
 2 files changed, 64 insertions(+), 31 deletions(-)

-- 
2.7.4



[PATCH 1/2] x86/retpoline: No retpolines for built-in __init functions

2018-02-01 Thread David Woodhouse
There's no point in building init code with retpolines, since it runs before
any potentially hostile userspace does. And before the retpoline is actually
ALTERNATIVEd into place, for much of it.

Signed-off-by: David Woodhouse <d...@amazon.co.uk>
---
 include/linux/init.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/init.h b/include/linux/init.h
index ea1b311..506a981 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -5,6 +5,13 @@
 #include 
 #include 
 
+/* Built-in __init functions needn't be compiled with retpoline */
+#if defined(RETPOLINE) && !defined(MODULE)
+#define __noretpoline __attribute__((indirect_branch("keep")))
+#else
+#define __noretpoline
+#endif
+
 /* These macros are used to mark some functions or 
  * initialized data (doesn't apply to uninitialized data)
  * as `initialization' functions. The kernel can take this
@@ -40,7 +47,7 @@
 
 /* These are for everybody (although not all archs will actually
discard it in modules) */
-#define __init __section(.init.text) __cold  __latent_entropy
+#define __init __section(.init.text) __cold  __latent_entropy __noretpoline
 #define __initdata __section(.init.data)
 #define __initconst __section(.init.rodata)
 #define __exitdata __section(.exit.data)
-- 
2.7.4
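
A self-contained sketch of the conditional-attribute pattern above; RETPOLINE here stands for the macro the kernel build defines when CONFIG_RETPOLINE=y, and the function and file layout are illustrative only:

#include <stdio.h>

#if defined(RETPOLINE) && !defined(MODULE)
#define __noretpoline __attribute__((indirect_branch("keep")))
#else
#define __noretpoline
#endif

static void hello(void)
{
	puts("hello");
}

/* With -mindirect-branch=thunk-extern in effect, a function tagged
 * __noretpoline still emits a plain indirect call here instead of a
 * retpoline thunk. */
static void __noretpoline run_early(void (*hook)(void))
{
	hook();
}

int main(void)
{
	run_early(hello);
	return 0;
}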



Re: [tip:x86/pti] x86/speculation: Use Indirect Branch Prediction Barrier in context switch

2018-02-01 Thread David Woodhouse
On Wed, 2018-01-31 at 08:03 +0100, Dominik Brodowski wrote:
> Whether a process needs protection by IBPB on context switches is a
> different question to whether a process should be allowed to be dumped,
> though the former may be a superset of the latter. Enable IBPB on all
> context switches to a different userspace process, until we have a clear
> mitigation strategy for userspace against Spectre-v2 designed and
> implemented.
>
> ...
> if (tsk && tsk->mm &&
> -   tsk->mm->context.ctx_id != last_ctx_id &&
> -   get_dumpable(tsk->mm) != SUID_DUMP_USER)
> +   tsk->mm->context.ctx_id != last_ctx_id)
> indirect_branch_prediction_barrier();


I understand your argument and I sympathise.

But that's going to hurt a *lot*, and we don't even have a viable
proof-of-concept for a user←→user Spectre v2 attack, do we? It's only
theoretical?

Show a working PoC and it makes the argument somewhat more
convincing...

smime.p7s
Description: S/MIME cryptographic signature
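
A simplified sketch of the two policies being weighed here, with illustrative names rather than the real switch_mm() code:

#include <stdbool.h>

struct next_mm {
	unsigned long ctx_id;
	bool dumpable;	/* stand-in for get_dumpable() == SUID_DUMP_USER */
};

static unsigned long last_ctx_id;

static void indirect_branch_prediction_barrier(void)
{
	/* wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB) on real hardware */
}

/* Merged variant: IBPB only when the incoming task is not dumpable. */
void switch_mm_dumpable_check(const struct next_mm *next)
{
	if (next->ctx_id != last_ctx_id && !next->dumpable)
		indirect_branch_prediction_barrier();
	last_ctx_id = next->ctx_id;
}

/* Variant proposed above: IBPB on every switch to a different mm. */
void switch_mm_unconditional(const struct next_mm *next)
{
	if (next->ctx_id != last_ctx_id)
		indirect_branch_prediction_barrier();
	last_ctx_id = next->ctx_id;
}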


Re: [PATCH v5 4/5] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL

2018-01-31 Thread David Woodhouse


On Wed, 2018-01-31 at 14:06 -0800, Jim Mattson wrote:
> On Wed, Jan 31, 2018 at 1:59 PM, David Woodhouse <dw...@infradead.org> wrote:
> > I'm actually working on IBRS_ALL at the moment.
> >
> > I was tempted to *not* let the guests turn it off. Expose SPEC_CTRL but
> > just make it a no-op.
> 
> Maybe we could convince Intel to add a LOCK bit to IA32_SPEC_CTRL like
> the one in IA32_FEATURE_CONTROL.

Given that IBRS_ALL is supposed to be a sanely-performing option, I'd
rather convince Intel to just make it unconditional. If they've added
the appropriate tagging to the BTB, why even *have* this deliberately
insecure mode when IBRS==0?

I understand that until/unless they get a *proper* fix, software is
still going to have to use IBPB as appropriate. But there's no need for
the IBRS bit to do *anything*.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v5 4/5] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL

2018-01-31 Thread David Woodhouse


On Wed, 2018-01-31 at 13:18 -0800, Jim Mattson wrote:
> On Wed, Jan 31, 2018 at 12:21 PM, David Woodhouse  wrote:
> 
> > 
> > Reading and writing this MSR is expensive. And if it's yielded to the
> > guest in the MSR bitmap, that means we have to save its value on vmexit
> > and set it back to zero.
>
> Agreed. But my point is that if it's not yielded to the guest in the
> MSR bitmap, then we don't have to save its value on VM-exit and set it
> back to zero. The vmcs02 MSR bitmap is reconstructed on every L1->L2
> transition. Sometimes, it will yield the MSR and sometimes it won't.

Strictly: if SPEC_CTRL is not already set to 1 *and* hasn't been
yielded to the guest in the MSR bitmap, then we don't have to set it
back to zero.

If L1 decides it's *always* going to trap and never pass through, but
the value is already set to non-zero, we need to get that case right.


smime.p7s
Description: S/MIME cryptographic signature
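
A rough sketch of the bookkeeping in question, with hypothetical field and helper names; the point is that the RDMSR/WRMSR pair on exit is only worth paying for when the guest may have left a non-zero value behind:

#include <stdbool.h>
#include <stdint.h>

struct vcpu_state {
	uint64_t spec_ctrl;		/* last value the guest wrote */
	bool save_spec_ctrl_on_exit;	/* set once the MSR is passed through */
};

static uint64_t rdmsr_spec_ctrl(void) { return 0; }	/* placeholder */
static void wrmsr_spec_ctrl(uint64_t v) { (void)v; }	/* placeholder */

void on_vmexit(struct vcpu_state *v)
{
	/* Nothing to do if the guest never got direct access and the
	 * host value was already zero on entry. */
	if (!v->save_spec_ctrl_on_exit)
		return;

	v->spec_ctrl = rdmsr_spec_ctrl();
	if (v->spec_ctrl)
		wrmsr_spec_ctrl(0);
}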


Re: [PATCH v5 4/5] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL

2018-01-31 Thread David Woodhouse


On Wed, 2018-01-31 at 13:53 -0800, Jim Mattson wrote:
> On Wed, Jan 31, 2018 at 1:42 PM, Paolo Bonzini  wrote:
> 
> > Can we just say it sucks to be L2 too? :)  Because in the end as long as
> > no one ever writes to spec_ctrl, everybody is happy.
> 
> Unfortunately, quite a few OS vendors shipped IBRS-based mitigations
> earlier this month. (Has Redhat stopped writing to IA32_SPEC_CTRL yet?
> :-)
> 
> And in the long run, everyone is going to set IA32_SPEC_CTRL.IBRS=1 on
> CPUs with IA32_ARCH_CAPABILITIES.IBRS_ALL.


I'm actually working on IBRS_ALL at the moment.

I was tempted to *not* let the guests turn it off. Expose SPEC_CTRL but
just make it a no-op.

Or if that really doesn't fly, perhaps with IBRS_ALL we should invert
the logic. Set IBRS to 1 on vCPU reset, and only if it's set to *zero*
do we pass through the MSR and set the save_spec_ctrl_on_exit flag.

But let's get the code for *current* hardware done first...

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v5 4/5] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL

2018-01-31 Thread David Woodhouse
On Wed, 2018-01-31 at 12:18 -0800, Jim Mattson wrote:
> On Wed, Jan 31, 2018 at 12:01 PM, KarimAllah Ahmed  wrote:
> 
> > 
> > but save_spec_ctrl_on_exit is also set for L2 write. So once L2 writes
> > to it, this condition will be true and then the bitmap will be updated.
> So if L1 or any L2 writes to the MSR, then save_spec_ctrl_on_exit is
> set to true, even if the MSR permission bitmap for a particular VMCS
> *doesn't* allow the MSR to be written without an intercept. That's
> functionally correct, but inefficient. It seems to me that
> save_spec_ctrl_on_exit should indicate whether or not the *current*
> MSR permission bitmap allows unintercepted writes to IA32_SPEC_CTRL.
> To that end, perhaps save_spec_ctrl_on_exit rightfully belongs in the
> loaded_vmcs structure, alongside the msr_bitmap pointer that it is
> associated with. For vmcs02, nested_vmx_merge_msr_bitmap() should set
> the vmcs02 save_spec_ctrl_on_exit based on (a) whether L0 is willing
> to yield the MSR to L1, and (b) whether L1 is willing to yield the MSR
> to L2.

Reading and writing this MSR is expensive. And if it's yielded to the
guest in the MSR bitmap, that means we have to save its value on vmexit
and set it back to zero.

Some of the gymnastics here are explicitly done to avoid having to do
that save-and-zero step unless the guest has *actually* touched the
MSR. Not just if we are *willing* to let it do so.

That's the whole point in the yield-after-first-write dance.

smime.p7s
Description: S/MIME cryptographic signature
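
A sketch of that yield-after-first-write dance with hypothetical names: the first trapped write flips the flag and opens the bitmap, so the save-and-zero cost is only paid by guests that actually touch the MSR:

#include <stdbool.h>
#include <stdint.h>

struct vcpu {
	uint64_t spec_ctrl;
	bool save_spec_ctrl_on_exit;
};

static void pass_through_spec_ctrl(struct vcpu *v) { (void)v; }	/* stub */

void set_msr_spec_ctrl(struct vcpu *v, uint64_t data)
{
	v->spec_ctrl = data;

	/* The first write is intercepted; after that the MSR is yielded
	 * to the guest and must be saved and zeroed on every vmexit. */
	if (!v->save_spec_ctrl_on_exit) {
		v->save_spec_ctrl_on_exit = true;
		pass_through_spec_ctrl(v);
	}
}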


Re: [PATCH v5 4/5] KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL

2018-01-31 Thread David Woodhouse
On Wed, 2018-01-31 at 11:53 -0800, Jim Mattson wrote:
> On Wed, Jan 31, 2018 at 11:37 AM, KarimAllah Ahmed  wrote:
> 
> > +
> > +   if (to_vmx(vcpu)->save_spec_ctrl_on_exit) {
> > +   nested_vmx_disable_intercept_for_msr(
> > +   msr_bitmap_l1, msr_bitmap_l0,
> > +   MSR_IA32_SPEC_CTRL,
> > +   MSR_TYPE_R | MSR_TYPE_W);
> > +   }
> > +
> 
> As this is written, L2 will never get direct access to this MSR until
> after L1 writes it.  What if L1 never writes it? The condition should
> really be something that captures, "if L0 is willing to yield this MSR
> to the guest..."

I'm still kind of lost here, but don't forget the requirement that the
MSR must *not* be passed through for direct access by L1 or L2 guests,
unless that ->save_spec_ctrl_on_exit flag is set.

Because that's what makes us set it back to zero on vmexit.

So the above condition doesn't look *so* wrong to me. Perhaps the issue
is that we're missing a way for L2 to actually cause that flag to get
set?

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v5 2/5] KVM: x86: Add IBPB support

2018-01-31 Thread David Woodhouse


On Wed, 2018-01-31 at 11:45 -0800, Jim Mattson wrote:
> On Wed, Jan 31, 2018 at 11:37 AM, KarimAllah Ahmed  wrote:
> 
> > +   nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
> > +    MSR_IA32_PRED_CMD,
> > +    MSR_TYPE_W);
> > +
> 
> I still think this should be predicated on L1 having
> guest_cpuid_has(vcpu, X86_FEATURE_IBPB) or guest_cpuid_has(vcpu,
> X86_FEATURE_SPEC_CTRL), because of the potential impact to the
> hypertwin. If L0 denies the feature to L1 by clearing those CPUID
> bits, L1 shouldn't be able to bypass that restriction by launching L2.

Rather than doing the expensive guest_cpu_has() every time (which is
worse now as we realised we need two of them) perhaps we should
introduce a local flag for that too?

smime.p7s
Description: S/MIME cryptographic signature
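
One shape the suggested local flag could take, sketched with hypothetical names: do the CPUID walk once when guest CPUID is updated, then test a cached bool on the hot path:

#include <stdbool.h>

struct vcpu {
	bool guest_has_ibpb;	/* cached result of the CPUID scan */
};

/* Stand-in for guest_cpuid_has(vcpu, X86_FEATURE_IBPB); the real check
 * walks the guest CPUID entries and is comparatively expensive. */
static bool scan_guest_cpuid_for_ibpb(const struct vcpu *v)
{
	(void)v;
	return true;
}

/* Called when userspace sets guest CPUID, not on every nested entry. */
void update_cached_cpuid(struct vcpu *v)
{
	v->guest_has_ibpb = scan_guest_cpuid_for_ibpb(v);
}

bool may_pass_through_pred_cmd(const struct vcpu *v)
{
	return v->guest_has_ibpb;
}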


Re: [PATCH 20/24] objtool: Another static block fail

2018-01-31 Thread David Woodhouse


On Wed, 2018-01-31 at 11:01 +0100, Peter Zijlstra wrote:
> On Tue, Jan 30, 2018 at 09:12:21PM -0600, Josh Poimboeuf wrote:
> > 
> > Or, maybe we should just forget the whole thing and just stick with the
> > dynamic IBRS checks with lfence.  Yes, it's less ideal for the kernel,
> > but adding these acrobatics to objtool also has a cost.
> 
> For now, IBRS seems off the table entirely. But no, I really don't want
> to have to unconditionally eat the LFENCE cost in all those sites.

There's also alternatives. And without the IBRS-on-kernel-entry bits
there aren't that many call sites that really need this anyway and
don't have *other* conditionals that really are runtime-only (like
dumpable etc.).

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v3 0/4] KVM: Expose speculation control feature to guests

2018-01-30 Thread David Woodhouse


On Tue, 2018-01-30 at 19:16 -0500, Paolo Bonzini wrote:
> On 30/01/2018 18:48, Raj, Ashok wrote:
> > 
> > > 
> > > Certainly not every vmexit!  But doing it on every userspace vmexit and
> > > every sched_out would not be *that* bad.
> > Right.. agreed. We discussed the different scenarios that doing IBPB
> > on VMexit would help, and decided its really not required on every exit. 
> > 
> > One obvious case is when there is a VMexit and return back to Qemu
> > process (witout a real context switch) do we need that to be 
> > protected from any poisoned BTB from guest?
> If the host is using retpolines, then some kind of barrier is needed.  I
> don't know if the full PRED_CMD barrier is needed, or two IBRS=1/IBRS=0
> writes back-to-back are enough.
> 
> If the host is using IBRS, then writing IBRS=1 at vmexit has established
> a barrier from the less privileged VMX guest environment.

IBRS will protect qemu userspace only if you set it at some point
before exiting the kernel. You don't want it set *in* the kernel, if
we're using retpolines in the kernel, so you'd want to clear it again
on the way back into the kernel. It's almost the opposite of what the
IBRS patch set was doing to protect the kernel.

Just use IBPB :)

smime.p7s
Description: S/MIME cryptographic signature


Re: [9/8] KVM: x86: limit MSR_IA32_SPEC_CTRL access based on CPUID availability

2018-01-30 Thread David Woodhouse
On Tue, 2018-01-30 at 18:11 -0500, Paolo Bonzini wrote:

> Great, then the "per-VCPU MSR bitmaps" branch
> (git://git.kernel.org/pub/scm/virt/kvm/kvm.git refs/heads/msr-bitmaps)
> that I created last weekend can be pulled directly in tip/x86/pti.
> 
> It cherry-picks directly to both 4.14 so it will be fine for Greg too.

Right, I've switched to pulling that branch in to my tree at 
http://git.infradead.org/linux-retpoline.git/shortlog/refs/heads/ibpb

I seem to recall there was some hecking at it, and I expected it to
change? :)


smime.p7s
Description: S/MIME cryptographic signature


[tip:x86/pti] x86/cpuid: Fix up "virtual" IBRS/IBPB/STIBP feature bits on Intel

2018-01-30 Thread tip-bot for David Woodhouse
Commit-ID:  7fcae1118f5fd44a862aa5c3525248e35ee67c3b
Gitweb: https://git.kernel.org/tip/7fcae1118f5fd44a862aa5c3525248e35ee67c3b
Author: David Woodhouse <d...@amazon.co.uk>
AuthorDate: Tue, 30 Jan 2018 14:30:23 +
Committer:  Thomas Gleixner <t...@linutronix.de>
CommitDate: Tue, 30 Jan 2018 22:35:05 +0100

x86/cpuid: Fix up "virtual" IBRS/IBPB/STIBP feature bits on Intel

Despite the fact that all the other code there seems to be doing it, just
using set_cpu_cap() in early_intel_init() doesn't actually work.

For CPUs with PKU support, setup_pku() calls get_cpu_cap() after
c->c_init() has set those feature bits. That resets those bits back to what
was queried from the hardware.

Turning the bits off for bad microcode is easy to fix. That can just use
setup_clear_cpu_cap() to force them off for all CPUs.

I was less keen on forcing the feature bits *on* that way, just in case
of inconsistencies. I appreciate that the kernel is going to get this
utterly wrong if CPU features are not consistent, because it has already
applied alternatives by the time secondary CPUs are brought up.

But at least if setup_force_cpu_cap() isn't being used, we might have a
chance of *detecting* the lack of the corresponding bit and either
panicking or refusing to bring the offending CPU online.

So ensure that the appropriate feature bits are set within get_cpu_cap()
regardless of how many extra times it's called.

Fixes: 2961298e ("x86/cpufeatures: Clean up Spectre v2 related CPUID flags")
Signed-off-by: David Woodhouse <d...@amazon.co.uk>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
Cc: karah...@amazon.de
Cc: pet...@infradead.org
Cc: b...@alien8.de
Link: https://lkml.kernel.org/r/1517322623-15261-1-git-send-email-d...@amazon.co.uk

---
 arch/x86/kernel/cpu/common.c | 21 +
 arch/x86/kernel/cpu/intel.c  | 27 ---
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c7c996a..dd09270 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -750,6 +750,26 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
}
 }
 
+static void init_speculation_control(struct cpuinfo_x86 *c)
+{
+   /*
+* The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
+* and they also have a different bit for STIBP support. Also,
+* a hypervisor might have set the individual AMD bits even on
+* Intel CPUs, for finer-grained selection of what's available.
+*
+* We use the AMD bits in 0x8000_0008 EBX as the generic hardware
+* features, which are visible in /proc/cpuinfo and used by the
+* kernel. So set those accordingly from the Intel bits.
+*/
+   if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
+   set_cpu_cap(c, X86_FEATURE_IBRS);
+   set_cpu_cap(c, X86_FEATURE_IBPB);
+   }
+   if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
+   set_cpu_cap(c, X86_FEATURE_STIBP);
+}
+
 void get_cpu_cap(struct cpuinfo_x86 *c)
 {
u32 eax, ebx, ecx, edx;
@@ -844,6 +864,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
 
init_scattered_cpuid_features(c);
+   init_speculation_control(c);
 
/*
 * Clear/Set all flags overridden by options, after probe.
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 6936d14..319bf98 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -175,28 +175,17 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
c->microcode = intel_get_microcode_revision();
 
-   /*
-* The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
-* and they also have a different bit for STIBP support. Also,
-* a hypervisor might have set the individual AMD bits even on
-* Intel CPUs, for finer-grained selection of what's available.
-*/
-   if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
-   set_cpu_cap(c, X86_FEATURE_IBRS);
-   set_cpu_cap(c, X86_FEATURE_IBPB);
-   }
-   if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
-   set_cpu_cap(c, X86_FEATURE_STIBP);
-
/* Now if any of them are set, check the blacklist and clear the lot */
-   if ((cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
+   if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
+cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
+cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
 cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
-   clear_cpu_cap(c, X86_FEATURE_IBRS);
-   clear_cpu_cap(c, X86_FEATURE_IBPB);
-   clear_c

Re: [PATCH v3 2/4] KVM: x86: Add IBPB support

2018-01-30 Thread David Woodhouse
On Tue, 2018-01-30 at 09:19 -0800, Jim Mattson wrote:
> 
> Are you planning to allow L2 to write MSR_IA32_PRED_CMD without L0
> intercepting it, if the MSR write intercept is disabled in both the
> vmcs01 MSR permission bitmap and the vmcs12 MSR permission bitmap?

I don't see why we shouldn't.

smime.p7s
Description: S/MIME cryptographic signature


Re: [9/8] KVM: x86: limit MSR_IA32_SPEC_CTRL access based on CPUID availability

2018-01-30 Thread David Woodhouse
On Tue, 2018-01-30 at 08:57 -0800, Jim Mattson wrote:
> It's really hard to tell which patches are being proposed for which
> repositories, but assuming that everything else is correct, I don't
> think your condition is adequate. What if the physical CPU and the
> virtual CPU both have CPUID.(EAX=7H,ECX=0):EDX[26], but only the
> physical CPU has CPUID.(EAX=7H,ECX=0):EDX[27]? If the guest has write
> access to MSR_IA32_SPEC_CTRL, it can set MSR_IA32_SPEC_CTRL[1]
> (STIBP), even though setting that bit in the guest should raise #GP.

Everything we're talking about here is for tip/x86/pti. Which I note
has just updated to be 4.15-based, although I thought it was going to
stay on 4.14 for now. So I've updated my tree at
http://git.infradead.org/linux-retpoline.git/shortlog/refs/heads/ibpb
accordingly.

You can always write to the STIBP bit without a #GP even when it's not
advertised/available.

There's a possibility that we'll want to always trap and *prevent*
that, instead of passing through — because doing so will also have an
effect on the HT siblings. But as discussed, I wanted to get the basics
working before exploring the complex IBRS/STIBP interactions. This much
should be OK to start with.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v2 3/3] KVM: VMX: make MSR bitmaps per-VCPU

2018-01-30 Thread David Woodhouse


On Tue, 2018-01-30 at 17:23 +0100, Radim Krčmář wrote:
> 
> The physical address of the nested msr_bitmap is never loaded into vmcs.
> 
> The resolution you provided had extra hunk in prepare_vmcs02_full():
> 
> +   vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
> 
> I have queued that as:
> 
> +   if (cpu_has_vmx_msr_bitmap())
> +   vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
> 
> but it should be a part of the patch or a followup fix.
> 
> Is the branch already merged into PTI?

No, we've never seen a 4.14-based branch that could be merged. I made
one myself for the moment but assumed there would be one from Paolo
that was then pulled into both tip/x86/pti and the kvm.git tree.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH] x86/cpuid: Fix up "virtual" IBRS/IBPB/STIBP feature bits on Intel

2018-01-30 Thread David Woodhouse


On Tue, 2018-01-30 at 14:54 +, Alan Cox wrote:
> The only case I can think of where you'd get a non boot processor that
> didn't have the same microcode loaded as the rest on entry to the OS would
> be in a system where it was possibly to phyically hot plug processors
> post boot.
> 
> There are not many such systems and it may be that all of them do
> sufficient deeply unmentionable things in their firmware to cover this.

We've got a patch lurking somewhere to properly collect the return code
from microcode loading on all CPUs, because we've seen it *fail* on one
CPU. Leaving them inconsistent, and on a kexec after that we really
*did* see the boot CPU with IBRS support and a secondary without it.

But really, my point in this patch was not that I expect all this to
work nicely, but that I don't want to make things *worse* by using
setup_force_cpu_cap() when really I only want to set the bit for *this*
CPU.

I'm *all* for ditching the per-CPU bitmasks since they're largely
pointless and we've applied alternatives before the secondaries are
brought up anyway. It's just a rabbithole I didn't need to go down
today.

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v3 2/4] KVM: x86: Add IBPB support

2018-01-30 Thread David Woodhouse
On Tue, 2018-01-30 at 08:22 -0600, Tom Lendacky wrote:
> > @@ -918,6 +919,9 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
> >  
> >     set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
> >     }
> > +
> > +   if (boot_cpu_has(X86_FEATURE_IBPB))
> > +   set_msr_interception(msrpm, MSR_IA32_PRED_CMD, 1, 1);
>
> Not sure you really need the check here.  If the feature isn't available
> in the hardware, then it won't be advertised in the CPUID bits to the
> guest, so the guest shouldn't try to write to the msr.  If it does, it
> will #GP. So I would think it could be set all the time to not be
> intercepted, no?

The check for boot_cpu_has() is wrong and is fairly redundant as you
say. What we actually want is guest_cpu_has(). We *don't* want to pass
the MSR through for a recalcitrant guest to bash on, if we have elected
not to expose this feature to the guest.

On Intel right now it's *really* important that we don't allow it to be
touched, even if a write would succeed. So even boot_cpu_has() would
not be entirely meaningless there. :)


> > @@ -3330,6 +3331,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> >     case MSR_IA32_TSC:
> >     kvm_write_tsc(vcpu, msr_info);
> >     break;
> > +   case MSR_IA32_PRED_CMD:
> > +   if (!msr_info->host_initiated &&
> > +   !guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
> > +   return 1;
> > +
> > +   if (data & PRED_CMD_IBPB)
> > +   wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
> > +   break;
>
> Should this also be in svm.c or as common code in x86.c?

See my response to [0/4]. I suggested that, but noted that it wasn't
entirely clear where we'd put the storage for SPEC_CTRL. We probably
*could* manage it for IBPB though.

> > 
> >     case MSR_IA32_CR_PAT:
> >     if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
> >     if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
> > @@ -9548,6 +9557,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
> >     goto free_msrs;
> >  
> >     msr_bitmap = vmx->vmcs01.msr_bitmap;
> > +
> > +   if (boot_cpu_has(X86_FEATURE_IBPB))
> > +   vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_PRED_CMD, MSR_TYPE_W);
>
> Same comment here as in svm.c, is the feature check necessary?

Again, yes but it should be guest_cpu_has() and we couldn't see how :)


smime.p7s
Description: S/MIME cryptographic signature
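
A sketch of the distinction being made, with hypothetical helpers: pass-through is gated on what the guest has been given, not merely on what the host CPU supports:

#include <stdbool.h>

struct vcpu {
	bool cpuid_has_ibpb;	/* what userspace exposed to this guest */
};

static bool host_has_ibpb(void)			/* boot_cpu_has() stand-in */
{
	return true;
}

static bool guest_has_ibpb(const struct vcpu *v)	/* guest_cpuid_has() stand-in */
{
	return v->cpuid_has_ibpb;
}

/* Leave PRED_CMD intercepted unless both the host supports IBPB and the
 * guest has been told about it; otherwise a guest that ignores CPUID
 * could still bash on the MSR directly. */
bool pred_cmd_must_stay_intercepted(const struct vcpu *v)
{
	return !(host_has_ibpb() && guest_has_ibpb(v));
}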


[PATCH v2] x86/cpuid: Fix up "virtual" IBRS/IBPB/STIBP feature bits on Intel

2018-01-30 Thread David Woodhouse
Despite the fact that all the other code there seems to be doing it,
just using set_cpu_cap() in early_intel_init() doesn't actually work.

For CPUs with PKU support, setup_pku() calls get_cpu_cap() after
c->c_init() has set those feature bits. That resets those bits back
to what was queried from the hardware.

Turning the bits off for bad microcode is easy to fix. That can just use
setup_clear_cpu_cap() to force them off for all CPUs.

I was less keen on forcing the feature bits *on* that way, just in case
of inconsistencies. I appreciate that the kernel is going to get this
utterly wrong if CPU features are not consistent, because it has already
applied alternatives by the time secondary CPUs are brought up.

But at least if setup_force_cpu_cap() isn't being used, we might have a
chance of *detecting* the lack of the corresponding bit and either
panicking or refusing to bring the offending CPU online.

So ensure that the appropriate feature bits are set within get_cpu_cap()
regardless of how many extra times it's called.

Signed-off-by: David Woodhouse <d...@amazon.co.uk>
Fixes: 2961298e ("x86/cpufeatures: Clean up Spectre v2 related CPUID flags")
---

So...

We now understand why this didn't show up in my initial testing. It's not
*just* that I'm incompetent; it's setup_pku() which stomps on the bits
when it calls get_cpu_cap() again.

So Boris was right, and it would have been nicer if we hadn't been 
(ab)using the AMD bits as our generic visible feature bits for these.
Then they wouldn't have been stomped on again by get_cpu_cap() reading
the corresponding words from the hardware. Except...

Boris proposes that we introduce three new bits for IBRS/IBPB/STIBP and
make the AMD bits "hidden" too. So we'd need to set those new virtual
bits when *either* the AMD or the Intel feature bits are set.

That means we'd still have to add some code *somewhere* to set those
virtual bits. The AMD hardware bits, in particular, might be set even
by a hypervisor running on Intel CPUs since they allow combinations
like "IBPB only" to be advertised while the Intel-defined bits don't.

So we'd have to add code in a generic code path *anyway* to do that.

And since the main objection to my v1 patch was that it was adding the
code in a generic code path, it doesn't seem to be much of an improvement.

This v2 patch moves said code from scattered.c, which wasn't really
the right place, to a subfunction called from get_cpu_cap() directly.
You could do the shift to a separate three virtual feature bits on
top of this if you *really* wanted. I just don't see the point. 

We are, of course, still utterly hosed if the CPU capabilities are
inconsistent. But at least I didn't make things worse by making that
harder to detect by using setup_force_cpu_cap().

 arch/x86/kernel/cpu/common.c | 21 +
 arch/x86/kernel/cpu/intel.c  | 27 ---
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 970ee06..f9d8460 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -726,6 +726,26 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
}
 }
 
+static void init_speculation_control(struct cpuinfo_x86 *c)
+{
+   /*
+* The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
+* and they also have a different bit for STIBP support. Also,
+* a hypervisor might have set the individual AMD bits even on
+* Intel CPUs, for finer-grained selection of what's available.
+*
+* We use the AMD bits in 0x8000_0008 EBX as the generic hardware
+* features, which are visible in /proc/cpuinfo and used by the
+* kernel. So set those accordingly from the Intel bits.
+*/
+   if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
+   set_cpu_cap(c, X86_FEATURE_IBRS);
+   set_cpu_cap(c, X86_FEATURE_IBPB);
+   }
+   if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
+   set_cpu_cap(c, X86_FEATURE_STIBP);
+}
+
 void get_cpu_cap(struct cpuinfo_x86 *c)
 {
u32 eax, ebx, ecx, edx;
@@ -820,6 +840,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
 
init_scattered_cpuid_features(c);
+   init_speculation_control(c);
 
/*
 * Clear/Set all flags overridden by options, after probe.
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0c8b916..4cf4f8c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -175,28 +175,17 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
c->microcode = intel_get_microcode_revision();
 
-   /*
-* The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
-* and they also have a dif
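
A toy model of the ordering problem described above (plain C, not kernel code): a bit set from ->c_init() is lost when get_cpu_cap() re-reads the hardware words, so the fix derives it inside get_cpu_cap() itself:

#include <stdio.h>

#define FEATURE_SPEC_CTRL 0x1	/* as read from "hardware" */
#define FEATURE_IBRS      0x2	/* synthesized, visible feature bit */

static unsigned int caps;

static unsigned int read_hardware_caps(void)
{
	return FEATURE_SPEC_CTRL;
}

static void get_cpu_cap(void)
{
	caps = read_hardware_caps();	/* clobbers anything set elsewhere */

	/* The fix: derive synthetic bits here, so a repeat call (as from
	 * setup_pku()) re-derives them instead of losing them. */
	if (caps & FEATURE_SPEC_CTRL)
		caps |= FEATURE_IBRS;
}

int main(void)
{
	get_cpu_cap();		/* early identification */
	get_cpu_cap();		/* later re-read, as setup_pku() does */
	printf("IBRS bit survives: %s\n", (caps & FEATURE_IBRS) ? "yes" : "no");
	return 0;
}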