Re: [PATCH v3] KVM/x86: Move definition of __ex to x86.h
On 12/21/20 11:48 AM, Uros Bizjak wrote: Merge __kvm_handle_fault_on_reboot with its sole user and move the definition of __ex to a common include to be shared between VMX and SVM. v2: Rebase to the latest kvm/queue. v3: Incorporate changes from review comments. Cc: Paolo Bonzini Cc: Sean Christopherson Signed-off-by: Uros Bizjak Reviewed-by: Krish Sadhukhan --- arch/x86/include/asm/kvm_host.h | 25 - arch/x86/kvm/svm/sev.c | 2 -- arch/x86/kvm/svm/svm.c | 2 -- arch/x86/kvm/vmx/vmx.c | 4 +--- arch/x86/kvm/vmx/vmx_ops.h | 4 +--- arch/x86/kvm/x86.h | 24 6 files changed, 26 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 39707e72b062..a78e4b1a5d77 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1634,31 +1634,6 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) -asmlinkage void kvm_spurious_fault(void); - -/* - * Hardware virtualization extension instructions may fault if a - * reboot turns off virtualization while processes are running. - * Usually after catching the fault we just panic; during reboot - * instead the instruction is ignored. - */ -#define __kvm_handle_fault_on_reboot(insn) \ - "666: \n\t" \ - insn "\n\t" \ - "jmp 668f \n\t" \ - "667: \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_begin \n\t" \ - ".long 1b - . \n\t" \ - ".popsection \n\t"\ - "call kvm_spurious_fault \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_end \n\t"\ - ".long 1b - . \n\t" \ - ".popsection \n\t"\ - "668: \n\t" \ - _ASM_EXTABLE(666b, 667b) - #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e57847ff8bd2..ba492b6d37a0 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -25,8 +25,6 @@ #include "cpuid.h" #include "trace.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - static u8 sev_enc_bit; static int sev_flush_asids(void); static DECLARE_RWSEM(sev_deactivate_lock); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 941e5251e13f..733d9f98a121 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -42,8 +42,6 @@ #include "svm.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 75c9c6a0a3a4..b82f2689f2d7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2320,9 +2320,7 @@ static void vmclear_local_loaded_vmcss(void) } -/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() - * tricks. - */ +/* Just like cpu_vmxoff(), but with the fault handling. */ static void kvm_cpu_vmxoff(void) { asm volatile (__ex("vmxoff")); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 692b0c31c9c8..7e3cb53c413f 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -4,13 +4,11 @@ #include -#include #include #include "evmcs.h" #include "vmcs.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) +#include "x86.h" asmlinkage void vmread_error(unsigned long field, bool fault); __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c5ee0f5ce0f1..5b16d2b5c3bc 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,30 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +asmlinkage void kvm_spurious_fault(void); + +/* + * Handle a fault on a hardware virtualization (VMX or SVM) instruction. + *
Re: [PATCH] KVM/x86: Move definition of __ex to x86.h
On 12/18/20 4:11 AM, Uros Bizjak wrote: Merge __kvm_handle_fault_on_reboot with its sole user and move the definition of __ex to a common include to be shared between VMX and SVM. Cc: Paolo Bonzini Cc: Sean Christopherson Signed-off-by: Uros Bizjak --- arch/x86/include/asm/kvm_host.h | 25 - arch/x86/kvm/svm/svm.c | 2 -- arch/x86/kvm/vmx/vmx_ops.h | 4 +--- arch/x86/kvm/x86.h | 23 +++ 4 files changed, 24 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e5f33a0d0e2..ff152ee1d63f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1623,31 +1623,6 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) -asmlinkage void kvm_spurious_fault(void); - -/* - * Hardware virtualization extension instructions may fault if a - * reboot turns off virtualization while processes are running. - * Usually after catching the fault we just panic; during reboot - * instead the instruction is ignored. - */ -#define __kvm_handle_fault_on_reboot(insn) \ - "666: \n\t" \ - insn "\n\t" \ - "jmp 668f \n\t" \ - "667: \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_begin \n\t" \ - ".long 1b - . \n\t" \ - ".popsection \n\t"\ - "call kvm_spurious_fault \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_end \n\t"\ - ".long 1b - . \n\t" \ - ".popsection \n\t"\ - "668: \n\t" \ - _ASM_EXTABLE(666b, 667b) - #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index da7eb4aaf44f..0a72ab9fd568 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -42,8 +42,6 @@ #include "svm.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 692b0c31c9c8..7e3cb53c413f 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -4,13 +4,11 @@ #include -#include #include #include "evmcs.h" #include "vmcs.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) +#include "x86.h" asmlinkage void vmread_error(unsigned long field, bool fault); __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e7ca622a468f..608548d05e84 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -7,6 +7,29 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +asmlinkage void kvm_spurious_fault(void); + +/* + * Hardware virtualization extension instructions may fault if a + * reboot turns off virtualization while processes are running. + * Usually after catching the fault we just panic; during reboot + * instead the instruction is ignored. + */ +#define __ex(insn) \ + "666: " insn "\n" \ + " jmp 669f\n"\ + "667:\n" \ + ".pushsection .discard.instr_begin\n" \ + ".long 667b - .\n"\ + ".popsection\n" \ + " call kvm_spurious_fault\n" \ + "668:\n" \ + ".pushsection .discard.instr_end\n" \ + ".long 668b - .\n" \ + ".popsection\n" \ + "669:\n" \ + _ASM_EXTABLE(666b, 667b) + #define KVM_DEFAULT_PLE_GAP 128 #define KVM_VMX_DEFAULT_PLE_WINDOW4096 #define KVM_DEFAULT_PLE_WINDOW_GROW 2 Reviewed-by: Krish Sadhukhan
[tip: x86/cpu] KVM: SVM: Don't flush cache if hardware enforces cache coherency across encryption domains
The following commit has been merged into the x86/cpu branch of tip: Commit-ID: e1ebb2b49048c4767cfa0d8466f9c701e549fa5e Gitweb: https://git.kernel.org/tip/e1ebb2b49048c4767cfa0d8466f9c701e549fa5e Author:Krish Sadhukhan AuthorDate:Thu, 17 Sep 2020 21:20:38 Committer: Borislav Petkov CommitterDate: Sat, 19 Sep 2020 20:46:59 +02:00 KVM: SVM: Don't flush cache if hardware enforces cache coherency across encryption domains In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page in a VM is enforced. In such a system, it is not required for software to flush the VM's page from all CPU caches in the system prior to changing the value of the C-bit for the page. So check that bit before flushing the cache. Signed-off-by: Krish Sadhukhan Signed-off-by: Borislav Petkov Acked-by: Paolo Bonzini Link: https://lkml.kernel.org/r/20200917212038.5090-4-krish.sadhuk...@oracle.com --- arch/x86/kvm/svm/sev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 402dc42..567792f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -384,7 +384,8 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) uint8_t *page_virtual; unsigned long i; - if (npages == 0 || pages == NULL) + if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 || + pages == NULL) return; for (i = 0; i < npages; i++) {
[tip: x86/cpu] x86/cpu: Add hardware-enforced cache coherency as a CPUID feature
The following commit has been merged into the x86/cpu branch of tip: Commit-ID: 5866e9205b47a983a77ebc8654949f696342f2ab Gitweb: https://git.kernel.org/tip/5866e9205b47a983a77ebc8654949f696342f2ab Author:Krish Sadhukhan AuthorDate:Thu, 17 Sep 2020 21:20:36 Committer: Borislav Petkov CommitterDate: Fri, 18 Sep 2020 10:46:41 +02:00 x86/cpu: Add hardware-enforced cache coherency as a CPUID feature In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page is enforced. In such a system, it is not required for software to flush the page from all CPU caches in the system prior to changing the value of the C-bit for a page. This hardware- enforced cache coherency is indicated by EAX[10] in CPUID leaf 0x801f. [ bp: Use one of the free slots in word 3. ] Suggested-by: Tom Lendacky Signed-off-by: Krish Sadhukhan Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200917212038.5090-2-krish.sadhuk...@oracle.com --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c| 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 83fc9d3..50b2a8d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -/* free( 3*32+17) */ +#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 62b137c..3221b71 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -41,6 +41,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_MBA, CPUID_EBX, 6, 0x8008, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x801f, 0 }, { X86_FEATURE_SEV, CPUID_EAX, 1, 0x801f, 0 }, + { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x801f, 0 }, { 0, 0, 0, 0, 0 } };
[tip: x86/cpu] x86/mm/pat: Don't flush cache if hardware enforces cache coherency across encryption domnains
The following commit has been merged into the x86/cpu branch of tip: Commit-ID: 75d1cc0e05af579301ce4e49cf6399be4b4e6e76 Gitweb: https://git.kernel.org/tip/75d1cc0e05af579301ce4e49cf6399be4b4e6e76 Author:Krish Sadhukhan AuthorDate:Thu, 17 Sep 2020 21:20:37 Committer: Borislav Petkov CommitterDate: Fri, 18 Sep 2020 10:47:00 +02:00 x86/mm/pat: Don't flush cache if hardware enforces cache coherency across encryption domnains In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page is enforced. In such a system, it is not required for software to flush the page from all CPU caches in the system prior to changing the value of the C-bit for the page. So check that bit before flushing the cache. [ bp: Massage commit message. ] Suggested-by: Tom Lendacky Signed-off-by: Krish Sadhukhan Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200917212038.5090-3-krish.sadhuk...@oracle.com --- arch/x86/mm/pat/set_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index d1b2a88..40baa90 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1999,7 +1999,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - cpa_flush(, 1); + cpa_flush(, !this_cpu_has(X86_FEATURE_SME_COHERENT)); ret = __change_page_attr_set_clr(, 1);
[tip: x86/cpu] x86/cpu: Add hardware-enforced cache coherency as a CPUID feature
The following commit has been merged into the x86/cpu branch of tip: Commit-ID: f1f325183519ba25370765607e2732d6edf53de1 Gitweb: https://git.kernel.org/tip/f1f325183519ba25370765607e2732d6edf53de1 Author:Krish Sadhukhan AuthorDate:Thu, 17 Sep 2020 21:20:36 Committer: Borislav Petkov CommitterDate: Fri, 18 Sep 2020 09:46:06 +02:00 x86/cpu: Add hardware-enforced cache coherency as a CPUID feature In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page is enforced. In such a system, it is not required for software to flush the page from all CPU caches in the system prior to changing the value of the C-bit for a page. This hardware- enforced cache coherency is indicated by EAX[10] in CPUID leaf 0x801f. Suggested-by: Tom Lendacky Signed-off-by: Krish Sadhukhan Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200917212038.5090-2-krish.sadhuk...@oracle.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/scattered.c| 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 83fc9d3..ba6e8f4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -288,6 +288,7 @@ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL(11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ +#define X86_FEATURE_SME_COHERENT (11*32+ 7) /* "" AMD hardware-enforced cache coherency */ /* Intel-defined CPU features, CPUID level 0x0007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16(12*32+ 5) /* AVX512 BFLOAT16 instructions */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 62b137c..3221b71 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -41,6 +41,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_MBA, CPUID_EBX, 6, 0x8008, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x801f, 0 }, { X86_FEATURE_SEV, CPUID_EAX, 1, 0x801f, 0 }, + { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x801f, 0 }, { 0, 0, 0, 0, 0 } };
[tip: x86/cpu] x86/mm/pat: Don't flush cache if hardware enforces cache coherency across encryption domnains
The following commit has been merged into the x86/cpu branch of tip: Commit-ID: 789521fca70ec8cb98f7257b880405e46f8a47a1 Gitweb: https://git.kernel.org/tip/789521fca70ec8cb98f7257b880405e46f8a47a1 Author:Krish Sadhukhan AuthorDate:Thu, 17 Sep 2020 21:20:37 Committer: Borislav Petkov CommitterDate: Fri, 18 Sep 2020 09:48:22 +02:00 x86/mm/pat: Don't flush cache if hardware enforces cache coherency across encryption domnains In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page is enforced. In such a system, it is not required for software to flush the page from all CPU caches in the system prior to changing the value of the C-bit for the page. So check that bit before flushing the cache. [ bp: Massage commit message. ] Suggested-by: Tom Lendacky Signed-off-by: Krish Sadhukhan Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200917212038.5090-3-krish.sadhuk...@oracle.com --- arch/x86/mm/pat/set_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index d1b2a88..40baa90 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1999,7 +1999,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - cpa_flush(, 1); + cpa_flush(, !this_cpu_has(X86_FEATURE_SME_COHERENT)); ret = __change_page_attr_set_clr(, 1);
[PATCH 3/3 v4] KVM: SVM: Don't flush cache if hardware enforces cache coherency across encryption domains
In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page in a VM is enforced. In such a system, it is not required for software to flush the VM's page from all CPU caches in the system prior to changing the value of the C-bit for the page. Signed-off-by: Krish Sadhukhan --- arch/x86/kvm/svm/sev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7bf7bf734979..3c9a45efdd4d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -384,7 +384,8 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) uint8_t *page_virtual; unsigned long i; - if (npages == 0 || pages == NULL) + if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 || + pages == NULL) return; for (i = 0; i < npages; i++) { -- 2.18.4
[PATCH 2/3 v4] x86: AMD: Don't flush cache if hardware enforces cache coherency across encryption domnains
In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page is enforced. In such a system, it is not required for software to flush the page from all CPU caches in the system prior to changing the value of the C-bit for the page. Suggested-by: Tom Lendacky Signed-off-by: Krish Sadhukhan --- arch/x86/mm/pat/set_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index d1b2a889f035..40baa90e74f4 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1999,7 +1999,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - cpa_flush(, 1); + cpa_flush(, !this_cpu_has(X86_FEATURE_SME_COHERENT)); ret = __change_page_attr_set_clr(, 1); -- 2.18.4
[PATCH 1/3 v4] x86: AMD: Add hardware-enforced cache coherency as a CPUID feature
In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page is enforced. In such a system, it is not required for software to flush the page from all CPU caches in the system prior to changing the value of the C-bit for a page. This hardware- enforced cache coherency is indicated by EAX[10] in CPUID leaf 0x801f. Suggested-by: Tom Lendacky Signed-off-by: Krish Sadhukhan --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/scattered.c| 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2901d5df4366..c3fada5f5f71 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -288,6 +288,7 @@ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL(11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ +#define X86_FEATURE_SME_COHERENT (11*32+ 7) /* "" AMD hardware-enforced cache coherency */ /* Intel-defined CPU features, CPUID level 0x0007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16(12*32+ 5) /* AVX512 BFLOAT16 instructions */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 62b137c3c97a..0bc2668f22e6 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -41,6 +41,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_MBA, CPUID_EBX, 6, 0x8008, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x801f, 0 }, { X86_FEATURE_SEV, CPUID_EAX, 1, 0x801f, 0 }, + { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x801f, 0 }, { 0, 0, 0, 0, 0 } }; -- 2.18.4
[PATCH 0/3 v4] x86: AMD: Don't flush cache if hardware enforces cache coherency across encryption domains
v3 -> v4: 1. Patch# 1 from v3 has been dropped. 2. The CPUID feature for hardware-enforced cache coherency has been renamed. [PATCH 1/3 v4] x86: AMD: Add hardware-enforced cache coherency as a [PATCH 2/3 v4] x86: AMD: Don't flush cache if hardware enforces cache [PATCH 3/3 v4] KVM: SVM: Don't flush cache if hardware enforces cache arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/scattered.c| 1 + arch/x86/kvm/svm/sev.c | 3 ++- arch/x86/mm/pat/set_memory.c | 2 +- 4 files changed, 5 insertions(+), 2 deletions(-) Krish Sadhukhan (3): x86: AMD: Add hardware-enforced cache coherency as a CPUID feature x86: AMD: Don't flush cache if hardware enforces cache coherency across encryption domnains KVM: SVM: Don't flush cache if hardware enforces cache coherency across encryption domains
Re: [PATCH] KVM: SVM: Get rid of the variable 'exit_fastpath'
On 9/15/20 4:30 AM, lihaiwei.ker...@gmail.com wrote: From: Haiwei Li 'exit_fastpath' isn't used anywhere else, so remove it. Suggested-by: Krish Sadhukhan Signed-off-by: Haiwei Li --- arch/x86/kvm/svm/svm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c44f3e9..6e88658 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3413,7 +3413,6 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { - fastpath_t exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@ -3536,8 +3535,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); - exit_fastpath = svm_exit_handlers_fastpath(vcpu); - return exit_fastpath; + return svm_exit_handlers_fastpath(vcpu); } static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root, Reviewed-by: Krish Sadhukhan
Re: [PATCH] KVM: SVM: Analyze is_guest_mode() in svm_vcpu_run()
On 9/13/20 11:55 PM, Wanpeng Li wrote: From: Wanpeng Li Analyze is_guest_mode() in svm_vcpu_run() instead of svm_exit_handlers_fastpath() in conformity with VMX version. Suggested-by: Vitaly Kuznetsov Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3da5b2f..009035a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3393,8 +3393,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); @@ -3580,6 +3579,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + exit_fastpath = svm_exit_handlers_fastpath(vcpu); return exit_fastpath; Not related to your changes, but should we get rid of the variable 'exit_fastpath' and just do, return svm_exit_handler_fastpath(vcpu); It seems the variable isn't used anywhere else and svm_vcpu_run() doesn't return from anywhere else either. Also, svm_exit_handlers_fastpath() doesn't have any other caller. Should we get rid of it as well ? For your changes, Reviewed-by: Krish Sadhukhan }
Re: [PATCH 2/4 v3] x86: AMD: Add hardware-enforced cache coherency as a CPUID feature
On 9/11/20 12:36 PM, Dave Hansen wrote: On 9/11/20 12:25 PM, Krish Sadhukhan wrote: diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 81335e6fe47d..0e5b27ee5931 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -293,6 +293,7 @@ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ +#define X86_FEATURE_HW_CACHE_COHERENCY (11*32+ 7) /* AMD hardware-enforced cache coherency */ That's an awfully generic name. We generally have "hardware-enforced cache coherency" already everywhere. :) This probably needs to say something about encryption, or even SEV specifically. How about X86_FEATURE_ENC_CACHE_COHERENCY ? I also don't see this bit in the "AMD64 Architecture Programmer’s Manual". Did I look in the wrong spot somehow? Section 7.10.6 in APM mentions this.
[PATCH 2/4 v3] x86: AMD: Add hardware-enforced cache coherency as a CPUID feature
In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page is enforced. In such a system, it is not required for software to flush the page from all CPU caches in the system prior to changing the value of the C-bit for a page. This hardware- enforced cache coherency is indicated by EAX[10] in CPUID leaf 0x801f. Suggested-by: Tom Lendacky Signed-off-by: Krish Sadhukhan --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/scattered.c| 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 81335e6fe47d..0e5b27ee5931 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -293,6 +293,7 @@ #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL(11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ +#define X86_FEATURE_HW_CACHE_COHERENCY (11*32+ 7) /* AMD hardware-enforced cache coherency */ /* Intel-defined CPU features, CPUID level 0x0007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16(12*32+ 5) /* AVX512 BFLOAT16 instructions */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 033c112e03fc..57394fee1d35 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -41,6 +41,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_MBA, CPUID_EBX, 6, 0x8008, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, CPUID_AMD_SME, 0 }, { X86_FEATURE_SEV, CPUID_EAX, 1, CPUID_AMD_SME, 0 }, + { X86_FEATURE_HW_CACHE_COHERENCY, CPUID_EAX, 10, CPUID_AMD_SME, 0 }, { 0, 0, 0, 0, 0 } }; -- 2.18.4
[PATCH 0/4 v3] x86: AMD: Don't flush cache if hardware enforces cache coherency across encryption domains
In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page is enforced. In such a system, it is not required for software to flush the page from all CPU caches in the system prior to changing the value of the C-bit for a page. This hardware- enforced cache coherency is indicated by EAX[10] in CPUID leaf 0x801f. Add this as a CPUID feature and skip flushing caches if the feature is present. v2 -> v3: Patch# 2: Moves the addition of the CPUID feature from early_detect_mem_encrypt() to scattered.c. Patch# 3,4: These two are the split of patch# 3 from v2. Patch# 3 is for non[PATCH 0/4 v3] x86: AMD: Don't flush encrypted pages if hardware enforces cache coherency-SEV encryptions while patch#4 is for SEV encryptions. [PATCH 1/4 v3] x86: AMD: Replace numeric value for SME CPUID leaf with a [PATCH 2/4 v3] x86: AMD: Add hardware-enforced cache coherency as a [PATCH 3/4 v3] x86: AMD: Don't flush cache if hardware enforces cache [PATCH 4/4 v3] KVM: SVM: Don't flush cache if hardware enforces cache arch/x86/boot/compressed/mem_encrypt.S | 5 +++-- arch/x86/include/asm/cpufeatures.h | 6 ++ arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/scattered.c| 5 +++-- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/svm/sev.c | 3 ++- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/mm/mem_encrypt_identity.c | 4 ++-- arch/x86/mm/pat/set_memory.c | 2 +- 9 files changed, 21 insertions(+), 12 deletions(-) Krish Sadhukhan (4): x86: AMD: Replace numeric value for SME CPUID leaf with a #define x86: AMD: Add hardware-enforced cache coherency as a CPUID feature x86: AMD: Don't flush cache if hardware enforces cache coherency across en cryption domnains KVM: SVM: Don't flush cache if hardware enforces cache coherency across en cryption domains
[PATCH 3/4 v3] x86: AMD: Don't flush cache if hardware enforces cache coherency across encryption domnains
In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page is enforced. In such a system, it is not required for software to flush the page from all CPU caches in the system prior to changing the value of the C-bit for the page. Suggested-by: Tom Lendacky Signed-off-by: Krish Sadhukhan --- arch/x86/mm/pat/set_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index d1b2a889f035..78d5511c5edd 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1999,7 +1999,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - cpa_flush(, 1); + cpa_flush(, !this_cpu_has(X86_FEATURE_HW_CACHE_COHERENCY)); ret = __change_page_attr_set_clr(, 1); -- 2.18.4
[PATCH 1/4 v3] x86: AMD: Replace numeric value for SME CPUID leaf with a #define
Signed-off-by: Krish Sadhukhan --- arch/x86/boot/compressed/mem_encrypt.S | 5 +++-- arch/x86/include/asm/cpufeatures.h | 5 + arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/scattered.c| 4 ++-- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/mm/mem_encrypt_identity.c | 4 ++-- 7 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index dd07e7b41b11..22e30b0c0d19 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -12,6 +12,7 @@ #include #include #include +#include .text .code32 @@ -31,7 +32,7 @@ SYM_FUNC_START(get_sev_encryption_bit) movl$0x8000, %eax /* CPUID to check the highest leaf */ cpuid - cmpl$0x801f, %eax /* See if 0x801f is available */ + cmpl$CPUID_AMD_SME, %eax/* See if 0x801f is available */ jb .Lno_sev /* @@ -40,7 +41,7 @@ SYM_FUNC_START(get_sev_encryption_bit) * CPUID Fn8000_001F[EBX] - Bits 5:0 * Pagetable bit position used to indicate encryption */ - movl$0x801f, %eax + movl$CPUID_AMD_SME, %eax cpuid bt $1, %eax/* Check if SEV is available */ jnc .Lno_sev diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2901d5df4366..81335e6fe47d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -10,6 +10,11 @@ #include #endif +/* + * AMD CPUID functions + */ +#define CPUID_AMD_SME 0x801f /* Secure Memory Encryption */ + /* * Defines x86 CPU feature bits */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dcc3d943c68f..4507ededb978 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -630,7 +630,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * will be a value above 32-bits this is still done for * CONFIG_X86_32 so that accurate values are reported. */ - c->x86_phys_bits -= (cpuid_ebx(0x801f) >> 6) & 0x3f; + c->x86_phys_bits -= (cpuid_ebx(CPUID_AMD_SME) >> 6) & 0x3f; if (IS_ENABLED(CONFIG_X86_32)) goto clear_all; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 62b137c3c97a..033c112e03fc 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -39,8 +39,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x8007, 0 }, { X86_FEATURE_PROC_FEEDBACK,CPUID_EDX, 11, 0x8007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x8008, 0 }, - { X86_FEATURE_SME, CPUID_EAX, 0, 0x801f, 0 }, - { X86_FEATURE_SEV, CPUID_EAX, 1, 0x801f, 0 }, + { X86_FEATURE_SME, CPUID_EAX, 0, CPUID_AMD_SME, 0 }, + { X86_FEATURE_SEV, CPUID_EAX, 1, CPUID_AMD_SME, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3fd6eec202d7..95863e767d3d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -756,7 +756,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; break; case 0x8000: - entry->eax = min(entry->eax, 0x801f); + entry->eax = min(entry->eax, CPUID_AMD_SME); break; case 0x8001: cpuid_entry_override(entry, CPUID_8000_0001_EDX); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0194336b64a4..a4e92ae399b4 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -749,7 +749,7 @@ static __init void svm_adjust_mmio_mask(void) u64 msr, mask; /* If there is no memory encryption support, use existing mask */ - if (cpuid_eax(0x8000) < 0x801f) + if (cpuid_eax(0x8000) < CPUID_AMD_SME) return; /* If memory encryption is not enabled, use existing mask */ @@ -757,7 +757,7 @@ static __init void svm_adjust_mmio_mask(void) if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) return; - enc_bit = cpuid_ebx(0x801f) & 0x3f; + enc_bit = cpuid_ebx(CPUID_AMD_SME) & 0x3f; mask_bit = boot_cpu_data.x86_phys_bits; /* Increment the mask bit if it is the same as the encryption bit */ diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index e2b0e2ac07bb..cbe600dd357b 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x8
[PATCH 4/4 v3] KVM: SVM: Don't flush cache if hardware enforces cache coherency across encryption domains
In some hardware implementations, coherency between the encrypted and unencrypted mappings of the same physical page in a VM is enforced. In such a system, it is not required for software to flush the VM's page from all CPU caches in the system prior to changing the value of the C-bit for the page. Signed-off-by: Krish Sadhukhan --- arch/x86/kvm/svm/sev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 402dc4234e39..8aa2209f2637 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -384,7 +384,8 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) uint8_t *page_virtual; unsigned long i; - if (npages == 0 || pages == NULL) + if (this_cpu_has(X86_FEATURE_HW_CACHE_COHERENCY) || npages == 0 || + pages == NULL) return; for (i = 0; i < npages; i++) { -- 2.18.4
Re: [PATCH] KVM: SVM: Mark SEV launch secret pages as dirty.
On 8/6/20 6:23 PM, Cfir Cohen wrote: The LAUNCH_SECRET command performs encryption of the launch secret memory contents. Mark pinned pages as dirty, before unpinning them. This matches the logic in sev_launch_update(). sev_launch_update_data() instead of sev_launch_update() ? Signed-off-by: Cfir Cohen --- arch/x86/kvm/svm/sev.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5573a97f1520..37c47d26b9f7 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -850,7 +850,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) struct kvm_sev_launch_secret params; struct page **pages; void *blob, *hdr; - unsigned long n; + unsigned long n, i; int ret, offset; if (!sev_guest(kvm)) @@ -863,6 +863,14 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!pages) return -ENOMEM; + /* +* The LAUNCH_SECRET command will perform in-place encryption of the +* memory content (i.e it will write the same memory region with C=1). +* It's possible that the cache may contain the data with C=0, i.e., +* unencrypted so invalidate it first. +*/ + sev_clflush_pages(pages, n); + /* * The secret must be copied into contiguous memory region, lets verify * that userspace memory pages are contiguous before we issue command. @@ -908,6 +916,11 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) e_free: kfree(data); e_unpin_memory: + /* content of memory is updated, mark pages dirty */ + for (i = 0; i < n; i++) { + set_page_dirty_lock(pages[i]); + mark_page_accessed(pages[i]); + } sev_unpin_memory(kvm, pages, n); return ret; } Reviewed-by: Krish Sadhukhan
Re: [PATCH 4/4] KVM: SVM: Rename svm_nested_virtualize_tpr() to nested_svm_virtualize_tpr()
On 6/25/20 1:03 AM, Joerg Roedel wrote: From: Joerg Roedel Match the naming with other nested svm functions. No functional changes. Signed-off-by: Joerg Roedel --- arch/x86/kvm/svm/svm.c | 6 +++--- arch/x86/kvm/svm/svm.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e0b0321d95d4..b0d551bcf2a0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3040,7 +3040,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm_nested_virtualize_tpr(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); @@ -3234,7 +3234,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm_nested_virtualize_tpr(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { @@ -3248,7 +3248,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (svm_nested_virtualize_tpr(vcpu) || + if (nested_svm_virtualize_tpr(vcpu) || kvm_vcpu_apicv_active(vcpu)) return; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 003e89008331..8907efda0b0a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -365,7 +365,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ #define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ -static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) +static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); Should the name be changed to svm_nested_virtualized_tpr ? Because svm_nested_virtualize_tpr implies that the action of the function is to virtualize the TPR.
Re: [PATCH 0/4] KVM: SVM: Code move follow-up
On 6/25/20 1:03 AM, Joerg Roedel wrote: From: Joerg Roedel Hi, here is small series to follow-up on the review comments for moving the kvm-amd module code to its own sub-directory. The comments were only about renaming structs and symbols, so there are no functional changes in these patches. The comments addressed here are all from [1]. Regards, Joerg [1] https://lore.kernel.org/lkml/87d0917ezq@vitty.brq.redhat.com/ Joerg Roedel (4): KVM: SVM: Rename struct nested_state to svm_nested_state KVM: SVM: Add vmcb_ prefix to mark_*() functions KVM: SVM: Add svm_ prefix to set/clr/is_intercept() KVM: SVM: Rename svm_nested_virtualize_tpr() to nested_svm_virtualize_tpr() arch/x86/kvm/svm/avic.c | 2 +- arch/x86/kvm/svm/nested.c | 8 +-- arch/x86/kvm/svm/sev.c| 2 +- arch/x86/kvm/svm/svm.c| 138 +++--- arch/x86/kvm/svm/svm.h| 20 +++--- 5 files changed, 85 insertions(+), 85 deletions(-) Reviewed-by: Krish Sadhukhan
Re: [PATCH 30/30] KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE
On 5/29/20 8:39 AM, Paolo Bonzini wrote: Similar to VMX, the state that is captured through the currently available IOCTLs is a mix of L1 and L2 state, dependent on whether the L2 guest was running at the moment when the process was interrupted to save its state. In particular, the SVM-specific state for nested virtualization includes the L1 saved state (including the interrupt flag), the cached L2 controls, and the GIF. Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 17 +++- arch/x86/kvm/cpuid.h| 5 ++ arch/x86/kvm/svm/nested.c | 147 arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/vmx/nested.c | 5 -- arch/x86/kvm/x86.c | 3 +- 6 files changed, 171 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 3f3f780c8c65..12075a9de1c1 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -385,18 +385,22 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_STATE_NESTED_FORMAT_VMX 0 -#define KVM_STATE_NESTED_FORMAT_SVM1 /* unused */ +#define KVM_STATE_NESTED_FORMAT_SVM1 #define KVM_STATE_NESTED_GUEST_MODE 0x0001 #define KVM_STATE_NESTED_RUN_PENDING 0x0002 #define KVM_STATE_NESTED_EVMCS0x0004 #define KVM_STATE_NESTED_MTF_PENDING 0x0008 +#define KVM_STATE_NESTED_GIF_SET 0x0100 #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x0001 #define KVM_STATE_NESTED_SMM_VMXON0x0002 #define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 +#define KVM_STATE_NESTED_SVM_VMCB_SIZE 0x1000 + + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -411,6 +415,15 @@ struct kvm_vmx_nested_state_hdr { } smm; }; +struct kvm_svm_nested_state_data { + /* Save area only used if KVM_STATE_NESTED_RUN_PENDING. */ + __u8 vmcb12[KVM_STATE_NESTED_SVM_VMCB_SIZE]; +}; + +struct kvm_svm_nested_state_hdr { + __u64 vmcb_pa; +}; + /* for KVM_CAP_NESTED_STATE */ struct kvm_nested_state { __u16 flags; @@ -419,6 +432,7 @@ struct kvm_nested_state { union { struct kvm_vmx_nested_state_hdr vmx; + struct kvm_svm_nested_state_hdr svm; /* Pad the header to 128 bytes. */ __u8 pad[120]; @@ -431,6 +445,7 @@ struct kvm_nested_state { */ union { struct kvm_vmx_nested_state_data vmx[0]; + struct kvm_svm_nested_state_data svm[0]; } data; }; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 63a70f6a3df3..05434cd9342f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -303,4 +303,9 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) kvm_cpu_cap_set(x86_feature); } +static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + #endif diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c712fe577029..6b1049148c1b 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -25,6 +25,7 @@ #include "trace.h" #include "mmu.h" #include "x86.h" +#include "cpuid.h" #include "lapic.h" #include "svm.h" @@ -238,6 +239,8 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm, { copy_vmcb_control_area(>nested.ctl, control); + /* Copy it here because nested_svm_check_controls will check it. */ + svm->nested.ctl.asid = control->asid; svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL; svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; } @@ -930,6 +933,150 @@ int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_CONTINUE; } +static int svm_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_svm *svm; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = KVM_STATE_NESTED_FORMAT_SVM, + .size = sizeof(kvm_state), + }; + struct vmcb __user *user_vmcb = (struct vmcb __user *) + _kvm_nested_state->data.svm[0]; + + if (!vcpu) + return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE; + + svm = to_svm(vcpu); + + if (user_data_size < kvm_state.size) + goto out; + + /* First fill in the header and copy it out. */ + if (is_guest_mode(vcpu)) { + kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb; + kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
Re: [PATCH 25/30] KVM: nSVM: leave guest mode when clearing EFER.SVME
On 5/29/20 8:39 AM, Paolo Bonzini wrote: According to the AMD manual, the effect of turning off EFER.SVME while a guest is running is undefined. We make it leave guest mode immediately, similar to the effect of clearing the VMX bit in MSR_IA32_FEAT_CTL. I see that svm_set_efer() is called in enter_svm_guest_mode() and nested_svm_vmexit(). In the VMRUN path, we have already checked EFER.SVME in nested_vmcb_checks(). So if it was not set, we wouldn't come to enter_svm_guest_mode(). Your fix is only for the #VMEXIT path then ? Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 16 arch/x86/kvm/svm/svm.c| 10 -- arch/x86/kvm/svm/svm.h| 1 + 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index bd3a89cd4070..369eca73fe3e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -618,6 +618,22 @@ int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } +/* + * Forcibly leave nested mode in order to be able to reset the VCPU later on. + */ +void svm_leave_nested(struct vcpu_svm *svm) +{ + if (is_guest_mode(>vcpu)) { + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + + svm->nested.nested_run_pending = 0; + leave_guest_mode(>vcpu); + copy_vmcb_control_area(>control, >control); + nested_svm_uninit_mmu_context(>vcpu); + } +} + static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) { u32 offset, msr, value; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bc08221f6743..b4db9a980469 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -265,6 +265,7 @@ static int get_npt_level(struct kvm_vcpu *vcpu) void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { + struct vcpu_svm *svm = to_svm(vcpu); vcpu->arch.efer = efer; if (!npt_enabled) { @@ -275,8 +276,13 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; } - to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); + if (!(efer & EFER_SVME)) { + svm_leave_nested(svm); + svm_set_gif(svm, true); + } + + svm->vmcb->save.efer = efer | EFER_SVME; + mark_dirty(svm->vmcb, VMCB_CR); } static int is_external_interrupt(u32 info) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index be8e830f83fa..6ac4c00a5d82 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -389,6 +389,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb); +void svm_leave_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm);
Re: [PATCH 20/30] KVM: SVM: preserve VGIF across VMCB switch
On 5/29/20 8:39 AM, Paolo Bonzini wrote: There is only one GIF flag for the whole processor, so make sure it is not clobbered when switching to L2 (in which case we also have to include the V_GIF_ENABLE_MASK, lest we confuse enable_gif/disable_gif/gif_set). When going back, L1 could in theory have entered L2 without issuing a CLGI so make sure the svm_set_gif is done last, after svm->vmcb->control.int_ctl has been copied back from hsave. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 7e4a506828c9..6c7f0bffdf01 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -293,6 +293,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v static void nested_prepare_vmcb_control(struct vcpu_svm *svm) { + const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; if (svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) nested_svm_init_mmu_context(>vcpu); @@ -308,7 +309,10 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; - svm->vmcb->control.int_ctl = svm->nested.ctl.int_ctl | V_INTR_MASKING_MASK; + svm->vmcb->control.int_ctl = + (svm->nested.ctl.int_ctl & ~mask) | + (svm->nested.hsave->control.int_ctl & mask); If this is the very first VMRUN, do we have any int_ctl saved in hsave ? + svm->vmcb->control.virt_ext= svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; svm->vmcb->control.int_state = svm->nested.ctl.int_state;
Re: [PATCH 17/30] KVM: nSVM: synchronize VMCB controls updated by the processor on every vmexit
On 5/29/20 8:39 AM, Paolo Bonzini wrote: The control state changes on every L2->L0 vmexit, and we will have to serialize it in the nested state. So keep it up to date in svm->nested.ctl and just copy them back to the nested VMCB in nested_svm_vmexit. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 57 ++- arch/x86/kvm/svm/svm.c| 5 +++- arch/x86/kvm/svm/svm.h| 1 + 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 1e5f460b5540..921466eba556 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -234,6 +234,34 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm, svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; } +/* + * Synchronize fields that are written by the processor, so that + * they can be copied back into the nested_vmcb. + */ +void sync_nested_vmcb_control(struct vcpu_svm *svm) +{ + u32 mask; + svm->nested.ctl.event_inj = svm->vmcb->control.event_inj; + svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err; + + /* Only a few fields of int_ctl are written by the processor. */ + mask = V_IRQ_MASK | V_TPR_MASK; + if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) && + is_intercept(svm, SVM_EXIT_VINTR)) { + /* +* In order to request an interrupt window, L0 is usurping +* svm->vmcb->control.int_ctl and possibly setting V_IRQ +* even if it was clear in L1's VMCB. Restoring it would be +* wrong. However, in this case V_IRQ will remain true until +* interrupt_window_interception calls svm_clear_vintr and +* restores int_ctl. We can just leave it aside. +*/ + mask &= ~V_IRQ_MASK; + } + svm->nested.ctl.int_ctl&= ~mask; + svm->nested.ctl.int_ctl|= svm->vmcb->control.int_ctl & mask; +} + static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) { /* Load the nested guest state */ @@ -471,6 +499,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Exit Guest-Mode */ leave_guest_mode(>vcpu); svm->nested.vmcb = 0; + WARN_ON_ONCE(svm->nested.nested_run_pending); /* in case we halted in L2 */ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -497,8 +526,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.dr6= svm->vcpu.arch.dr6; nested_vmcb->save.cpl= vmcb->save.cpl; - nested_vmcb->control.int_ctl = vmcb->control.int_ctl; - nested_vmcb->control.int_vector= vmcb->control.int_vector; While it's not related to this patch, I am wondering if we need the following line in enter_svm_guest_mode(): svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; If int_vector is used only to trigger a #VMEXIT after we have decided to inject a virtual interrupt, we probably don't the vector value from the nested vmcb. nested_vmcb->control.int_state = vmcb->control.int_state; nested_vmcb->control.exit_code = vmcb->control.exit_code; nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; @@ -510,34 +537,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (svm->nrips_enabled) nested_vmcb->control.next_rip = vmcb->control.next_rip; - /* -* If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have -* to make sure that we do not lose injected events. So check event_inj -* here and copy it to exit_int_info if it is valid. -* Exit_int_info and event_inj can't be both valid because the case -* below only happens on a VMRUN instruction intercept which has -* no valid exit_int_info set. -*/ - if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { - struct vmcb_control_area *nc = _vmcb->control; - - nc->exit_int_info = vmcb->control.event_inj; - nc->exit_int_info_err = vmcb->control.event_inj_err; - } - - nested_vmcb->control.tlb_ctl = 0; - nested_vmcb->control.event_inj = 0; - nested_vmcb->control.event_inj_err = 0; + nested_vmcb->control.int_ctl = svm->nested.ctl.int_ctl; + nested_vmcb->control.tlb_ctl = svm->nested.ctl.tlb_ctl; + nested_vmcb->control.event_inj = svm->nested.ctl.event_inj; + nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; nested_vmcb->control.pause_filter_count = svm->vmcb->control.pause_filter_count; nested_vmcb->control.pause_filter_thresh = svm->vmcb->control.pause_filter_thresh; - /* We always set V_INTR_MASKING and remember the old value in hflags */ - if
Re: [PATCH 08/30] KVM: nSVM: move map argument out of enter_svm_guest_mode
On 5/29/20 12:04 PM, Paolo Bonzini wrote: On 29/05/20 20:10, Krish Sadhukhan wrote: Unmapping the nested VMCB in enter_svm_guest_mode is a bit of a wart, since the map is not used elsewhere in the function. There are just two calls, so move it there. The last sentence sounds bit incomplete. Good point---more precisely, "calls" should be "callers". "It" refers to "unmapping". Also, does it make sense to mention the reason why unmapping is not required before we enter guest mode ? Unmapping is a host operation and is not visible by the guest; is this what you mean? Yes, I was thinking if we could mention it in the commit message... -Krish Paolo
Re: [PATCH 10/30] KVM: nSVM: extract preparation of VMCB for nested run
On 5/29/20 8:39 AM, Paolo Bonzini wrote: Split out filling svm->vmcb.save and svm->vmcb.control before VMRUN. Only the latter will be useful when restoring nested SVM state. This patch introduces no semantic change, so the MMU setup is still done in nested_prepare_vmcb_save. The next patch will clean up things. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 40 +++ 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index fc0c6d1678eb..73be7af79453 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -245,21 +245,8 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm, svm->vcpu.arch.tsc_offset += control->tsc_offset; } -void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb) +static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) Not a big deal, but I feel that it helps a lot in readability if we keep the names symmetric. This one could be named prepare_nested_vmcb_save to match load_nested_vmcb_control that you created in the previous patch. Or load_nested_vmcb_control could be renamed to nested_load_vmcb_control to match the name here. { - bool evaluate_pending_interrupts = - is_intercept(svm, INTERCEPT_VINTR) || - is_intercept(svm, INTERCEPT_IRET); - - svm->nested.vmcb = vmcb_gpa; - if (kvm_get_rflags(>vcpu) & X86_EFLAGS_IF) - svm->vcpu.arch.hflags |= HF_HIF_MASK; - else - svm->vcpu.arch.hflags &= ~HF_HIF_MASK; - - load_nested_vmcb_control(svm, _vmcb->control); - if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) nested_svm_init_mmu_context(>vcpu); @@ -291,7 +278,10 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vmcb->save.dr7 = nested_vmcb->save.dr7; svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; svm->vmcb->save.cpl = nested_vmcb->save.cpl; +} +static void nested_prepare_vmcb_control(struct vcpu_svm *svm, struct vmcb *nested_vmcb) +{ svm_flush_tlb(>vcpu); if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; @@ -321,6 +311,26 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, */ recalc_intercepts(svm); + mark_all_dirty(svm->vmcb); +} + +void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb) +{ + bool evaluate_pending_interrupts = + is_intercept(svm, INTERCEPT_VINTR) || + is_intercept(svm, INTERCEPT_IRET); + + svm->nested.vmcb = vmcb_gpa; + if (kvm_get_rflags(>vcpu) & X86_EFLAGS_IF) + svm->vcpu.arch.hflags |= HF_HIF_MASK; + else + svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + + load_nested_vmcb_control(svm, _vmcb->control); + nested_prepare_vmcb_save(svm, nested_vmcb); + nested_prepare_vmcb_control(svm, nested_vmcb); + /* * If L1 had a pending IRQ/NMI before executing VMRUN, * which wasn't delivered because it was disallowed (e.g. @@ -336,8 +346,6 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, enable_gif(svm); if (unlikely(evaluate_pending_interrupts)) kvm_make_request(KVM_REQ_EVENT, >vcpu); - - mark_all_dirty(svm->vmcb); } int nested_svm_vmrun(struct vcpu_svm *svm)
Re: [PATCH 08/30] KVM: nSVM: move map argument out of enter_svm_guest_mode
On 5/29/20 8:39 AM, Paolo Bonzini wrote: Unmapping the nested VMCB in enter_svm_guest_mode is a bit of a wart, since the map is not used elsewhere in the function. There are just two calls, so move it there. The last sentence sounds bit incomplete. Also, does it make sense to mention the reason why unmapping is not required before we enter guest mode ? Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 14 ++ arch/x86/kvm/svm/svm.c| 3 ++- arch/x86/kvm/svm/svm.h| 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 8756c9f463fd..8e98d5e420a5 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -229,7 +229,7 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) } void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb, struct kvm_host_map *map) + struct vmcb *nested_vmcb) { bool evaluate_pending_interrupts = is_intercept(svm, INTERCEPT_VINTR) || @@ -304,8 +304,6 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vmcb->control.pause_filter_thresh = nested_vmcb->control.pause_filter_thresh; - kvm_vcpu_unmap(>vcpu, map, true); - /* Enter Guest-Mode */ enter_guest_mode(>vcpu); @@ -368,10 +366,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) nested_vmcb->control.exit_code_hi = 0; nested_vmcb->control.exit_info_1 = 0; nested_vmcb->control.exit_info_2 = 0; - - kvm_vcpu_unmap(>vcpu, , true); - - return ret; + goto out; } trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, @@ -414,7 +409,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) copy_vmcb_control_area(hsave, vmcb); svm->nested.nested_run_pending = 1; - enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, ); + enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb); if (!nested_svm_vmrun_msrpm(svm)) { svm->vmcb->control.exit_code= SVM_EXIT_ERR; @@ -425,6 +420,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm) nested_svm_vmexit(svm); } +out: + kvm_vcpu_unmap(>vcpu, , true); + return ret; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index feb96a410f2d..76b3f553815e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3814,7 +3814,8 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (kvm_vcpu_map(>vcpu, gpa_to_gfn(vmcb), ) == -EINVAL) return 1; nested_vmcb = map.hva; - enter_svm_guest_mode(svm, vmcb, nested_vmcb, ); + enter_svm_guest_mode(svm, vmcb, nested_vmcb); + kvm_vcpu_unmap(>vcpu, , true); } return 0; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 89fab75dd4f5..33e3f09d7a8e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -395,7 +395,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) } void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb, struct kvm_host_map *map); + struct vmcb *nested_vmcb); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm);
Re: [PATCH 06/30] KVM: SVM: always update CR3 in VMCB
On 5/29/20 8:39 AM, Paolo Bonzini wrote: svm_load_mmu_pgd is delaying the write of GUEST_CR3 to prepare_vmcs02 Did you mean to say enter_svm_guest_mode here ? as an optimization, but this is only correct before the nested vmentry. If userspace is modifying CR3 with KVM_SET_SREGS after the VM has already been put in guest mode, the value of CR3 will not be updated. Remove the optimization, which almost never triggers anyway. This was was added in commit 689f3bf21628 ("KVM: x86: unify callbacks to load paging root", 2020-03-16) just to keep the two vendor-specific modules closer, but we'll fix VMX too. Fixes: 689f3bf21628 ("KVM: x86: unify callbacks to load paging root") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 6 +- arch/x86/kvm/svm/svm.c| 16 +--- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index dcac4c3510ab..8756c9f463fd 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -256,11 +256,7 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm_set_efer(>vcpu, nested_vmcb->save.efer); svm_set_cr0(>vcpu, nested_vmcb->save.cr0); svm_set_cr4(>vcpu, nested_vmcb->save.cr4); - if (npt_enabled) { - svm->vmcb->save.cr3 = nested_vmcb->save.cr3; - svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; - } else - (void)kvm_set_cr3(>vcpu, nested_vmcb->save.cr3); + (void)kvm_set_cr3(>vcpu, nested_vmcb->save.cr3); /* Guest paging mode is active - reset mmu */ kvm_mmu_reset_context(>vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 545f63ebc720..feb96a410f2d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3447,7 +3447,6 @@ static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - bool update_guest_cr3 = true; unsigned long cr3; cr3 = __sme_set(root); @@ -3456,18 +3455,13 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) mark_dirty(svm->vmcb, VMCB_NPT); /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ - if (is_guest_mode(vcpu)) - update_guest_cr3 = false; - else if (test_bit(VCPU_EXREG_CR3, (ulong *)>arch.regs_avail)) - cr3 = vcpu->arch.cr3; - else /* CR3 is already up-to-date. */ - update_guest_cr3 = false; + if (!test_bit(VCPU_EXREG_CR3, (ulong *)>arch.regs_avail)) + return; + cr3 = vcpu->arch.cr3; } - if (update_guest_cr3) { - svm->vmcb->save.cr3 = cr3; - mark_dirty(svm->vmcb, VMCB_CR); - } + svm->vmcb->save.cr3 = cr3; + mark_dirty(svm->vmcb, VMCB_CR); } static int is_disabled(void)
Re: [PATCH 05/28] KVM: nSVM: correctly inject INIT vmexits
On 5/26/20 10:22 AM, Paolo Bonzini wrote: The usual drill at this point, except there is no code to remove because this case was not handled at all. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 27 +++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index bbf991cfe24b..166b88fc9509 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -25,6 +25,7 @@ #include "trace.h" #include "mmu.h" #include "x86.h" +#include "lapic.h" #include "svm.h" static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -788,11 +789,37 @@ static void nested_svm_intr(struct vcpu_svm *svm) nested_svm_vmexit(svm); } +static inline bool nested_exit_on_init(struct vcpu_svm *svm) +{ + return (svm->nested.intercept & (1ULL << INTERCEPT_INIT)); +} + +static void nested_svm_init(struct vcpu_svm *svm) Should this be named nested_svm_inject_init_vmexit in accordance with nested_svm_inject_exception_vmexit that you did in patch# 3 ? +{ + svm->vmcb->control.exit_code = SVM_EXIT_INIT; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); +} + + static int svm_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool block_nested_events = kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending; + struct kvm_lapic *apic = vcpu->arch.apic; + + if (lapic_in_kernel(vcpu) && + test_bit(KVM_APIC_INIT, >pending_events)) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_init(svm)) + return 0; + nested_svm_init(svm); + return 0; + } if (vcpu->arch.exception.pending) { if (block_nested_events)
Re: [PATCH 02/28] KVM: x86: enable event window in inject_pending_event
On 5/26/20 10:22 AM, Paolo Bonzini wrote: In case an interrupt arrives after nested.check_events but before the call to kvm_cpu_has_injectable_intr, we could end up enabling the interrupt window even if the interrupt is actually going to be a vmexit. This is useless rather than harmful, but it really complicates reasoning about SVM's handling of the VINTR intercept. We'd like to never bother with the VINTR intercept if V_INTR_MASKING=1 && INTERCEPT_INTR=1, because in that case there is no interrupt window and we can just exit the nested guest whenever we want. As a first step, this patch moves the opening of the interrupt window inside inject_pending_event. This consolidates the check for pending interrupt/NMI/SMI in one place, removing the repeated call to kvm_cpu_has_injectable_intr. The main functional change here is that re-injection of still-pending events will also use req_immediate_exit instead of using interrupt-window intercepts. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 +-- arch/x86/kvm/svm/svm.c | 24 +++ arch/x86/kvm/vmx/vmx.c | 20 +++--- arch/x86/kvm/x86.c | 112 +--- 4 files changed, 87 insertions(+), 77 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index db261da578f3..7707bd4b0593 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1136,8 +1136,8 @@ struct kvm_x86_ops { void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); - bool (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); - bool (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); + int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); + int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); @@ -1234,10 +1234,10 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); - bool (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); + int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); - int (*enable_smi_window)(struct kvm_vcpu *vcpu); + void (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9987f6fe9d88..9ac9963405b5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3053,15 +3053,15 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) return ret; } -static bool svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); if (svm->nested.nested_run_pending) - return false; + return -EBUSY; /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) - return false; + return -EBUSY; return !svm_nmi_blocked(vcpu); } @@ -3112,18 +3112,18 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); } -static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); if (svm->nested.nested_run_pending) - return false; + return -EBUSY; /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) - return false; + return -EBUSY; return !svm_interrupt_blocked(vcpu); } @@ -3793,15 +3793,15 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu) return is_smm(vcpu); } -static bool svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); if (svm->nested.nested_run_pending) - return false; + return -EBUSY; /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) - return false; +
Re: [PATCH 0/2] Fix issue with not starting nesting guests on my system
On 5/23/20 9:14 AM, Maxim Levitsky wrote: On my AMD machine I noticed that I can't start any nested guests, because nested KVM (everything from master git branches) complains that it can't find msr MSR_IA32_UMWAIT_CONTROL which my system doesn't support at all anyway. I traced it to the recently added UMWAIT support to qemu and kvm. The kvm portion exposed the new MSR in KVM_GET_MSR_INDEX_LIST without checking that it the underlying feature is supported in CPUID. It happened to work when non nested because as a precation kvm, tries to read each MSR on host before adding it to that list, and when read gets a #GP it ignores it. When running nested, the L1 hypervisor can be set to ignore unknown msr read/writes (I need this for some other guests), thus this safety check doesn't work anymore. V2: * added a patch to setup correctly the X86_FEATURE_WAITPKG kvm capability * dropped the cosmetic fix patch as it is now fixed in kvm/queue Best regards, Maxim Levitsky Maxim Levitsky (2): kvm/x86/vmx: enable X86_FEATURE_WAITPKG in KVM capabilities kvm/x86: don't expose MSR_IA32_UMWAIT_CONTROL unconditionally arch/x86/kvm/vmx/vmx.c | 3 +++ arch/x86/kvm/x86.c | 4 2 files changed, 7 insertions(+) Nit: The added 'break' statement in patch# 2 is not required. Reviewed-by: Krish Sadhukhan
Re: [PATCH 0/7] KVM: SVM: baby steps towards nested state migration
On 5/15/20 10:41 AM, Paolo Bonzini wrote: Here are some refactorings to prepare for an SVM implementation of KVM_SET_NESTED_STATE. It's a prerequisite for that to eliminate exit_required, moving exceptions to svm_check_nested_events. However: - I might work on that soon, because it's needed to handle RSM when the L1 hypervisor wants to get it from #UD rather than the specific RSM intercept - this should be enough to get a quick prototype, that I need in order to debug a particularly crazy bug and figure out its reproducibility. So, I am getting these patches out of my todo list for now. Thanks, Paolo Paolo Bonzini (7): KVM: SVM: move map argument out of enter_svm_guest_mode KVM: SVM: extract load_nested_vmcb_control KVM: SVM: extract preparation of VMCB for nested run KVM: SVM: save all control fields in svm->nested KVM: nSVM: remove HF_VINTR_MASK KVM: nSVM: do not reload pause filter fields from VMCB KVM: SVM: introduce data structures for nested virt state arch/x86/include/asm/kvm_host.h | 1 - arch/x86/include/uapi/asm/kvm.h | 26 +++- arch/x86/kvm/svm/nested.c | 115 +--- arch/x86/kvm/svm/svm.c | 11 ++- arch/x86/kvm/svm/svm.h | 28 +--- 5 files changed, 116 insertions(+), 65 deletions(-) Reviewed-by: Krish Sadhukhan
Re: [PATCH 2/7] KVM: SVM: extract load_nested_vmcb_control
On 5/15/20 10:41 AM, Paolo Bonzini wrote: When restoring SVM nested state, the control state will be stored already in svm->nested by KVM_SET_NESTED_STATE. We will not need to fish it out of L1's VMCB. Pull everything into a separate function so that it is documented which fields are needed. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 45 ++- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 22f75f66084f..e79acc852000 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -225,6 +225,27 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) return true; } +static void load_nested_vmcb_control(struct vcpu_svm *svm, struct vmcb *nested_vmcb) This function only separates a subset of the controls. If the purpose of the function is to separate only the controls that are related to migration, should it be called something like load_nested_state_vmcb_control or something like that ? +{ + if (kvm_get_rflags(>vcpu) & X86_EFLAGS_IF) + svm->vcpu.arch.hflags |= HF_HIF_MASK; + else + svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + + svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; + + svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; + svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; + + /* cache intercepts */ + svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; + svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; + svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; + svm->nested.intercept= nested_vmcb->control.intercept; + + svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; +} + void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb) { @@ -232,15 +253,11 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, is_intercept(svm, INTERCEPT_VINTR) || is_intercept(svm, INTERCEPT_IRET); - if (kvm_get_rflags(>vcpu) & X86_EFLAGS_IF) - svm->vcpu.arch.hflags |= HF_HIF_MASK; - else - svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + svm->nested.vmcb = vmcb_gpa; + load_nested_vmcb_control(svm, nested_vmcb); - if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) { - svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; + if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) nested_svm_init_mmu_context(>vcpu); - } /* Load the nested guest state */ svm->vmcb->save.es = nested_vmcb->save.es; @@ -275,25 +292,15 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; svm->vmcb->save.cpl = nested_vmcb->save.cpl; - svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; - svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; - - /* cache intercepts */ - svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; - svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; - svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; - svm->nested.intercept= nested_vmcb->control.intercept; - svm_flush_tlb(>vcpu); - svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; else svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; - svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; + svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; @@ -314,8 +321,6 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, */ recalc_intercepts(svm); - svm->nested.vmcb = vmcb_gpa; - /* * If L1 had a pending IRQ/NMI before executing VMRUN, * which wasn't delivered because it was disallowed (e.g.
Re: [PATCH 1/4] KVM: VMX: rename {vmx,nested_vmx}_vcpu_setup functions
On 10/15/19 6:27 PM, Xiaoyao Li wrote: On 10/16/2019 6:05 AM, Krish Sadhukhan wrote: On 10/15/2019 09:40 AM, Xiaoyao Li wrote: Rename {vmx,nested_vmx}_vcpu_setup to {vmx,nested_vmx}_vmcs_setup, to match what they really do. No functional change. Signed-off-by: Xiaoyao Li --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/nested.h | 2 +- arch/x86/kvm/vmx/vmx.c | 9 +++-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5e231da00310..7935422d311f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5768,7 +5768,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return ret; } -void nested_vmx_vcpu_setup(void) +void nested_vmx_vmcs_setup(void) { if (enable_shadow_vmcs) { vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 187d39bf0bf1..2be1ba7482c9 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -11,7 +11,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, bool apicv); void nested_vmx_hardware_unsetup(void); __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); -void nested_vmx_vcpu_setup(void); +void nested_vmx_vmcs_setup(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e660e28e9ae0..58b77a882426 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4161,15 +4161,12 @@ static void ept_set_mmio_spte_mask(void) #define VMX_XSS_EXIT_BITMAP 0 -/* - * Sets up the vmcs for emulated real mode. - */ -static void vmx_vcpu_setup(struct vcpu_vmx *vmx) +static void vmx_vmcs_setup(struct vcpu_vmx *vmx) { int i; if (nested) - nested_vmx_vcpu_setup(); + nested_vmx_vmcs_setup(); if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); @@ -6777,7 +6774,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(>vcpu, cpu); vmx->vcpu.cpu = cpu; - vmx_vcpu_setup(vmx); + vmx_vmcs_setup(vmx); vmx_vcpu_put(>vcpu); put_cpu(); if (cpu_need_virtualize_apic_accesses(>vcpu)) { May be we should rename vmx_vcpu_reset() to vmx_vmcs_reset() as well ? Not really. vmx_vcpu_reset() not only resets vmcs but also the emulated field of vmx vcpu. It would be a better organization of the code if the resetting of the VMCS fields were placed in a separate function. Reviewed-by: Krish Sadhukhan
Re: [PATCH 2/4] KVM: VMX: Setup MSR bitmap only when has msr_bitmap capability
On 10/15/2019 09:40 AM, Xiaoyao Li wrote: Move the MSR bitmap setup codes to vmx_vmcs_setup() and only setup them when hardware has msr_bitmap capability. Signed-off-by: Xiaoyao Li --- arch/x86/kvm/vmx/vmx.c | 39 --- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 58b77a882426..7051511c27c2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4164,12 +4164,30 @@ static void ept_set_mmio_spte_mask(void) static void vmx_vmcs_setup(struct vcpu_vmx *vmx) { int i; + unsigned long *msr_bitmap; if (nested) nested_vmx_vmcs_setup(); - if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); + if (cpu_has_vmx_msr_bitmap()) { + msr_bitmap = vmx->vmcs01.msr_bitmap; + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); vmx_disable_intercept_for_msr() also calls cpu_has_vmx_msr_bitmap(), which means we are repeating the check. A cleaner approach is to remove the call to cpu_has_vmx_msr_bitmap() from vmx_disable_intercept_for_msr() and let its callers do the check just like you are doing here. + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + if (kvm_cstate_in_guest(vmx->vcpu.kvm)) { + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + } + + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); + } + vmx->msr_bitmap_mode = 0; vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ @@ -6697,7 +6715,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; struct vcpu_vmx *vmx; - unsigned long *msr_bitmap; int cpu; BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, @@ -6754,22 +6771,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err < 0) goto free_msrs; - msr_bitmap = vmx->vmcs01.msr_bitmap; - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); - if (kvm_cstate_in_guest(kvm)) { - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); - } - vmx->msr_bitmap_mode = 0; - vmx->loaded_vmcs = >vmcs01; cpu = get_cpu(); vmx_vcpu_load(>vcpu, cpu);
Re: [PATCH 1/4] KVM: VMX: rename {vmx,nested_vmx}_vcpu_setup functions
On 10/15/2019 09:40 AM, Xiaoyao Li wrote: Rename {vmx,nested_vmx}_vcpu_setup to {vmx,nested_vmx}_vmcs_setup, to match what they really do. No functional change. Signed-off-by: Xiaoyao Li --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/nested.h | 2 +- arch/x86/kvm/vmx/vmx.c| 9 +++-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5e231da00310..7935422d311f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5768,7 +5768,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return ret; } -void nested_vmx_vcpu_setup(void) +void nested_vmx_vmcs_setup(void) { if (enable_shadow_vmcs) { vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 187d39bf0bf1..2be1ba7482c9 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -11,7 +11,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, bool apicv); void nested_vmx_hardware_unsetup(void); __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); -void nested_vmx_vcpu_setup(void); +void nested_vmx_vmcs_setup(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e660e28e9ae0..58b77a882426 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4161,15 +4161,12 @@ static void ept_set_mmio_spte_mask(void) #define VMX_XSS_EXIT_BITMAP 0 -/* - * Sets up the vmcs for emulated real mode. - */ -static void vmx_vcpu_setup(struct vcpu_vmx *vmx) +static void vmx_vmcs_setup(struct vcpu_vmx *vmx) { int i; if (nested) - nested_vmx_vcpu_setup(); + nested_vmx_vmcs_setup(); if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); @@ -6777,7 +6774,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(>vcpu, cpu); vmx->vcpu.cpu = cpu; - vmx_vcpu_setup(vmx); + vmx_vmcs_setup(vmx); vmx_vcpu_put(>vcpu); put_cpu(); if (cpu_need_virtualize_apic_accesses(>vcpu)) { May be we should rename vmx_vcpu_reset() to vmx_vmcs_reset() as well ?
Re: [PATCH] KVM: nVMX: cleanup and fix host 64-bit mode checks
On 09/25/2019 09:47 AM, Jim Mattson wrote: On Wed, Sep 25, 2019 at 9:34 AM Paolo Bonzini wrote: KVM was incorrectly checking vmcs12->host_ia32_efer even if the "load IA32_EFER" exit control was reset. Also, some checks were not using the new CC macro for tracing. Cleanup everything so that the vCPU's 64-bit mode is determined directly from EFER_LMA and the VMCS checks are based on that, which matches section 26.2.4 of the SDM. Cc: Sean Christopherson Cc: Jim Mattson Cc: Krish Sadhukhan Fixes: 5845038c111db27902bc220a4f70070fe945871c Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 53 --- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 70d59d9304f2..e108847f6cf8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2664,8 +2664,26 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) return -EINVAL; - ia32e = (vmcs12->vm_exit_controls & -VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; +#ifdef CONFIG_X86_64 + ia32e = !!(vcpu->arch.efer & EFER_LMA); +#else + if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)) + return -EINVAL; This check is redundant, since it is checked in the else block below. Should we be re-using is_long_mode() instead of duplicating the code ? + + ia32e = false; +#endif + + if (ia32e) { + if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || + CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) + return -EINVAL; + } else { + if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || + CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || + CC(((vmcs12->host_rip) >> 32) & 0x)) The mask shouldn't be necessary. + return -EINVAL; + } if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || @@ -2684,35 +2702,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu))) - return -EINVAL; - - if (!(vmcs12->host_ia32_efer & EFER_LMA) && - ((vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || - (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE))) { - return -EINVAL; - } - - if ((vmcs12->host_ia32_efer & EFER_LMA) && - !(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) { - return -EINVAL; - } - - if (!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) && - ((vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || - (vmcs12->host_cr4 & X86_CR4_PCIDE) || - (((vmcs12->host_rip) >> 32) & 0x))) { - return -EINVAL; - } - - if ((vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) && - ((!(vmcs12->host_cr4 & X86_CR4_PAE)) || - (is_noncanonical_address(vmcs12->host_rip, vcpu { - return -EINVAL; - } -#else - if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE || - vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) return -EINVAL; #endif -- 1.8.3.1 Reviewed-by: Jim Mattson
Re: [PATCH v2] KVM: x86: Unconditionally call x86 ops that are always implemented
On 08/02/2019 03:06 PM, Sean Christopherson wrote: Remove a few stale checks for non-NULL ops now that the ops in question are implemented by both VMX and SVM. Note, this is **not** stable material, the Fixes tags are there purely to show when a particular op was first supported by both VMX and SVM. Fixes: 74f169090b6f ("kvm/svm: Setup MCG_CAP on AMD properly") Fixes: b31c114b82b2 ("KVM: X86: Provide a capability to disable PAUSE intercepts") Fixes: 411b44ba80ab ("svm: Implements update_pi_irte hook to setup posted interrupt") Cc: Krish Sadhukhan Signed-off-by: Sean Christopherson --- v2: Give update_pi_iret the same treatment [Krish]. arch/x86/kvm/x86.c | 13 +++-- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 01e18caac825..e7c993f0cbed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3506,8 +3506,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - if (kvm_x86_ops->setup_mce) - kvm_x86_ops->setup_mce(vcpu); + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -9313,10 +9312,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); - if (kvm_x86_ops->vm_init) - return kvm_x86_ops->vm_init(kvm); - - return 0; + return kvm_x86_ops->vm_init(kvm); } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) @@ -9992,7 +9988,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); bool kvm_arch_has_irq_bypass(void) Now that this is returning true always and that this is called only in kvm_irqfd_assign(), this can perhaps be removed altogether ? { - return kvm_x86_ops->update_pi_irte != NULL; + return true; } int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, @@ -10032,9 +10028,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - if (!kvm_x86_ops->update_pi_irte) - return -EINVAL; - return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); } Reviewed-by: Krish Sadhukhan
Re: [PATCH] KVM: x86: Unconditionally call x86 ops that are always implemented
On 08/01/2019 09:46 AM, Sean Christopherson wrote: Remove two stale checks for non-NULL ops now that they're implemented by both VMX and SVM. Fixes: 74f169090b6f ("kvm/svm: Setup MCG_CAP on AMD properly") Fixes: b31c114b82b2 ("KVM: X86: Provide a capability to disable PAUSE intercepts") Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 01e18caac825..2c25a19d436f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3506,8 +3506,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - if (kvm_x86_ops->setup_mce) - kvm_x86_ops->setup_mce(vcpu); + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -9313,10 +9312,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); - if (kvm_x86_ops->vm_init) - return kvm_x86_ops->vm_init(kvm); - - return 0; + return kvm_x86_ops->vm_init(kvm); } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) The following two ops are also implemented by both VMX and SVM: update_cr8_intercept update_pi_irte
Re: [PATCH] KVM: x86: init x2apic_enabled() once
On 07/23/2019 06:06 AM, lufe...@163.com wrote: From: luferry x2apic_eanbled() costs about 200 cycles when guest trigger halt pretty high, pi ops in hotpath Signed-off-by: luferry --- arch/x86/kvm/vmx/vmx.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d98eac371c0a..e17dbf011e47 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -186,6 +186,8 @@ static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); +static int __read_mostly host_x2apic_enabled; + /* Storage for pre module init parameter parsing */ static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; @@ -1204,7 +1206,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) dest = cpu_physical_id(cpu); - if (x2apic_enabled()) + if (host_x2apic_enabled) new.ndst = dest; else new.ndst = (dest << 8) & 0xFF00; @@ -7151,7 +7153,7 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) dest = cpu_physical_id(vcpu->cpu); - if (x2apic_enabled()) + if (host_x2apic_enabled) new.ndst = dest; else new.ndst = (dest << 8) & 0xFF00; @@ -7221,7 +7223,7 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) */ dest = cpu_physical_id(vcpu->pre_pcpu); - if (x2apic_enabled()) + if (host_x2apic_enabled) new.ndst = dest; else new.ndst = (dest << 8) & 0xFF00; @@ -7804,6 +7806,8 @@ static int __init vmx_init(void) } #endif + host_x2apic_enabled = x2apic_enabled(); + r = kvm_init(_x86_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) Reviewed-by: Krish Sadhukhan
Re: [PATCH] KVM: nVMX: Rename prepare_vmcs02_*_full to prepare_vmcs02_*_extra
On 06/06/2019 11:41 AM, Sean Christopherson wrote: On Thu, Jun 06, 2019 at 05:24:12PM +0200, Paolo Bonzini wrote: These function do not prepare the entire state of the vmcs02, only the rarely needed parts. Rename them to make this clearer. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 84438cf23d37..fd8150ef6cce 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1955,7 +1955,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmx_set_constant_host_state(vmx); } -static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, +static void prepare_vmcs02_early_extra(struct vcpu_vmx *vmx, Or maybe 'uncommon', 'rare' or 'ext'? I don't I particularly love any of the names, but they're all better than 'full'. The big chunk of the work in this function is done via prepare_vmcs02_constant_state(). It seems cleaner to get rid of prepare_vmcs02_early_full(), call prepare_vmcs02_constant_state() directly from prepare_vmcs02_early() and move the three vmcs_write16() calls to prepare_vmcs02_early(). Reviewed-by: Sean Christopherson struct vmcs12 *vmcs12) { prepare_vmcs02_constant_state(vmx); @@ -1976,7 +1976,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) - prepare_vmcs02_early_full(vmx, vmcs12); + prepare_vmcs02_early_extra(vmx, vmcs12); /* * PIN CONTROLS @@ -2130,7 +2130,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) } } -static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +static void prepare_vmcs02_extra(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; @@ -2254,7 +2254,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { - prepare_vmcs02_full(vmx, vmcs12); + prepare_vmcs02_extra(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; } -- 1.8.3.1
Re: [PATCH] KVM: nVMX: allow tests to use bad virtual-APIC page address
On 04/15/2019 06:35 AM, Paolo Bonzini wrote: As mentioned in the comment, there are some special cases where we can simply clear the TPR shadow bit from the CPU-based execution controls in the vmcs02. Handle them so that we can remove some XFAILs from vmx.flat. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 25 - arch/x86/kvm/vmx/vmx.c| 2 +- arch/x86/kvm/vmx/vmx.h| 2 ++ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7ec9bb1dd723..a22af5a85540 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2873,20 +2873,27 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) /* * If translation failed, VM entry will fail because * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. -* Failing the vm entry is _not_ what the processor -* does but it's basically the only possibility we -* have. We could still enter the guest if CR8 load -* exits are enabled, CR8 store exits are enabled, and -* virtualize APIC access is disabled; in this case -* the processor would never use the TPR shadow and we -* could simply clear the bit from the execution -* control. But such a configuration is useless, so -* let's keep the code simple. */ if (!is_error_page(page)) { vmx->nested.virtual_apic_page = page; hpa = page_to_phys(vmx->nested.virtual_apic_page); vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && + nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* +* The processor will never use the TPR shadow, simply +* clear the bit from the execution control. Such a +* configuration is useless, but it happens in tests. +* For any other configuration, failing the vm entry is +* _not_ what the processor does but it's basically the +* only possibility we have. +*/ + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_TPR_SHADOW); + } else { + printk("bad virtual-APIC page address\n"); + dump_vmcs(); } } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f8054dc1de65..14cacfd7ffd0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5603,7 +5603,7 @@ static void vmx_dump_dtsel(char *name, uint32_t limit) vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } -static void dump_vmcs(void) +void dump_vmcs(void) { u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a1e00d0a2482..f879529906b4 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -517,4 +517,6 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); } +void dump_vmcs(void); + #endif /* __KVM_X86_VMX_H */ Reviewed-by: Krish Sadhukhan
Re: [PATCH] KVM: vmx: print more APICv fields in dump_vmcs
On 04/15/2019 06:35 AM, Paolo Bonzini wrote: The SVI, RVI, virtual-APIC page address and APIC-access page address fields were left out of dump_vmcs. Add them. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ab432a930ae8..f8054dc1de65 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5723,8 +5723,17 @@ static void dump_vmcs(void) if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) pr_err("TSC Multiplier = 0x%016llx\n", vmcs_read64(TSC_MULTIPLIER)); - if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) - pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + u16 status = vmcs_read16(GUEST_INTR_STATUS); + pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); + } + pr_err(KERN_CONT "TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (secondary_exec_control & (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) + pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); + pr_err(KERN_CONT "virt-APIC addr=0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); + } if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) Reviewed-by: Krish Sadhukhan
Re: [PATCH] KVM: x86: optimize check for valid PAT value
On 04/10/2019 02:55 AM, Paolo Bonzini wrote: This check will soon be done on every nested vmentry and vmexit, "parallelize" it using bitwise operations. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c| 10 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.h | 8 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index e9ea2d45ae66..9f72cc427158 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -48,11 +48,6 @@ static bool msr_mtrr_valid(unsigned msr) return false; } -static bool valid_pat_type(unsigned t) -{ - return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ -} - static bool valid_mtrr_type(unsigned t) { return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ @@ -67,10 +62,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) return false; if (msr == MSR_IA32_CR_PAT) { - for (i = 0; i < 8; i++) - if (!valid_pat_type((data >> (i * 8)) & 0xff)) - return false; - return true; + return kvm_pat_valid(data); } else if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b6c533afbf27..b74679732cfc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1891,7 +1891,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + if (!kvm_pat_valid(data)) return 1; vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 28406aa1136d..7bc7ac9d2a44 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -347,4 +347,12 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) __this_cpu_write(current_vcpu, NULL); } +static inline bool kvm_pat_valid(u64 data) +{ + if (data & 0xF8F8F8F8F8F8F8F8) + return false; + /* 0, 1, 4, 5, 6, 7 are valid values. */ + return (data | ((data & 0x0202020202020202) << 1)) == data; +} + #endif Reviewed-by: Krish Sadhukhan
Re: [PATCH v2] KVM/nVMX: Do not validate that posted_intr_desc_addr is page aligned
On 10/20/2018 03:50 PM, KarimAllah Ahmed wrote: The spec only requires the posted interrupt descriptor address to be 64-bytes aligned (i.e. bits[0:5] == 0). Using page_address_valid also forces the address to be page aligned. Only validate that the address does not cross the maximum physical address without enforcing a page alignment. v1 -> v2: - Add a missing parenthesis (dropped while merging!) Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: H. Peter Anvin Cc: x...@kernel.org Cc: k...@vger.kernel.org Cc: linux-kernel@vger.kernel.org Fixes: 6de84e581c0 ("nVMX x86: check posted-interrupt descriptor addresss on vmentry of L2") Signed-off-by: KarimAllah Ahmed --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 38f1a16..bb0fcdb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11667,7 +11667,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, !nested_exit_intr_ack_set(vcpu) || (vmcs12->posted_intr_nv & 0xff00) || (vmcs12->posted_intr_desc_addr & 0x3f) || - (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr + (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu return -EINVAL; /* tpr shadow is needed by all apicv features. */ Reviewed-by: Krish Sadhuhan
Re: [PATCH v2] KVM/nVMX: Do not validate that posted_intr_desc_addr is page aligned
On 10/20/2018 03:50 PM, KarimAllah Ahmed wrote: The spec only requires the posted interrupt descriptor address to be 64-bytes aligned (i.e. bits[0:5] == 0). Using page_address_valid also forces the address to be page aligned. Only validate that the address does not cross the maximum physical address without enforcing a page alignment. v1 -> v2: - Add a missing parenthesis (dropped while merging!) Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: H. Peter Anvin Cc: x...@kernel.org Cc: k...@vger.kernel.org Cc: linux-kernel@vger.kernel.org Fixes: 6de84e581c0 ("nVMX x86: check posted-interrupt descriptor addresss on vmentry of L2") Signed-off-by: KarimAllah Ahmed --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 38f1a16..bb0fcdb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11667,7 +11667,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, !nested_exit_intr_ack_set(vcpu) || (vmcs12->posted_intr_nv & 0xff00) || (vmcs12->posted_intr_desc_addr & 0x3f) || - (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr + (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu return -EINVAL; /* tpr shadow is needed by all apicv features. */ Reviewed-by: Krish Sadhuhan
Re: [PATCH] x86/kvm/nVMX: avoid redundant double assignment of nested_run_pending
On 08/23/2018 09:24 AM, Vitaly Kuznetsov wrote: nested_run_pending is set 20 lines above and check_vmentry_prereqs()/ check_vmentry_postreqs() don't seem to be resetting it (the later, however, checks it). Signed-off-by: Vitaly Kuznetsov --- arch/x86/kvm/vmx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6c3514750d0c..8a63b8cf9458 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -14221,9 +14221,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, check_vmentry_postreqs(vcpu, vmcs12, _qual)) return -EINVAL; - if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) - vmx->nested.nested_run_pending = 1; - vmx->nested.dirty_vmcs12 = true; ret = enter_vmx_non_root_mode(vcpu, NULL); if (ret) Reviewed-by: Krish Sadhukhan
Re: [PATCH] x86/kvm/nVMX: avoid redundant double assignment of nested_run_pending
On 08/23/2018 09:24 AM, Vitaly Kuznetsov wrote: nested_run_pending is set 20 lines above and check_vmentry_prereqs()/ check_vmentry_postreqs() don't seem to be resetting it (the later, however, checks it). Signed-off-by: Vitaly Kuznetsov --- arch/x86/kvm/vmx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6c3514750d0c..8a63b8cf9458 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -14221,9 +14221,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, check_vmentry_postreqs(vcpu, vmcs12, _qual)) return -EINVAL; - if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) - vmx->nested.nested_run_pending = 1; - vmx->nested.dirty_vmcs12 = true; ret = enter_vmx_non_root_mode(vcpu, NULL); if (ret) Reviewed-by: Krish Sadhukhan
Re: [PATCH] KVM: vmx: unify adjacent #ifdefs
On 04/04/2018 09:59 AM, Paolo Bonzini wrote: vmx_save_host_state has multiple ifdefs for CONFIG_X86_64 that have no other code between them. Simplify by reducing them to a single conditional. Signed-off-by: Paolo Bonzini <pbonz...@redhat.com> --- arch/x86/kvm/vmx.c | 10 +++--- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 939c8724feb4..a1572461786e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2392,20 +2392,16 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 savesegment(ds, vmx->host_state.ds_sel); savesegment(es, vmx->host_state.es_sel); -#endif -#ifdef CONFIG_X86_64 vmcs_writel(HOST_FS_BASE, current->thread.fsbase); vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); -#else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); -#endif -#ifdef CONFIG_X86_64 vmx->msr_host_kernel_gs_base = current->thread.gsbase; if (is_long_mode(>vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#else + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); #endif if (boot_cpu_has(X86_FEATURE_MPX)) rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); Reviewed-by: Krish Sadhukhan <krish.sadhuk...@oracle.com>
Re: [PATCH] KVM: vmx: unify adjacent #ifdefs
On 04/04/2018 09:59 AM, Paolo Bonzini wrote: vmx_save_host_state has multiple ifdefs for CONFIG_X86_64 that have no other code between them. Simplify by reducing them to a single conditional. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 +++--- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 939c8724feb4..a1572461786e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2392,20 +2392,16 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 savesegment(ds, vmx->host_state.ds_sel); savesegment(es, vmx->host_state.es_sel); -#endif -#ifdef CONFIG_X86_64 vmcs_writel(HOST_FS_BASE, current->thread.fsbase); vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); -#else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); -#endif -#ifdef CONFIG_X86_64 vmx->msr_host_kernel_gs_base = current->thread.gsbase; if (is_long_mode(>vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#else + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); #endif if (boot_cpu_has(X86_FEATURE_MPX)) rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); Reviewed-by: Krish Sadhukhan
Re: [PATCH v2 1/4] x86/kvm/cpuid: Fix CPUID function for word 6 (80000001_ECX)
On 11/06/2017 09:44 AM, Janakarajan Natarajan wrote: The function for CPUID 8001 ECX is set to 0xc001. Set it to 0x8001. Signed-off-by: Janakarajan Natarajan <janakarajan.natara...@amd.com> --- arch/x86/kvm/cpuid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 0bc5c13..b21b1d2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -43,7 +43,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX}, [CPUID_1_ECX] = { 1, 0, CPUID_ECX}, [CPUID_C000_0001_EDX] = {0xc001, 0, CPUID_EDX}, - [CPUID_8000_0001_ECX] = {0xc001, 0, CPUID_ECX}, + [CPUID_8000_0001_ECX] = {0x8001, 0, CPUID_ECX}, [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX}, Reviewed-by: Krish Sadhukhan <krish.sadhuk...@oracle.com>
Re: [PATCH v2 1/4] x86/kvm/cpuid: Fix CPUID function for word 6 (80000001_ECX)
On 11/06/2017 09:44 AM, Janakarajan Natarajan wrote: The function for CPUID 8001 ECX is set to 0xc001. Set it to 0x8001. Signed-off-by: Janakarajan Natarajan --- arch/x86/kvm/cpuid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 0bc5c13..b21b1d2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -43,7 +43,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX}, [CPUID_1_ECX] = { 1, 0, CPUID_ECX}, [CPUID_C000_0001_EDX] = {0xc001, 0, CPUID_EDX}, - [CPUID_8000_0001_ECX] = {0xc001, 0, CPUID_ECX}, + [CPUID_8000_0001_ECX] = {0x8001, 0, CPUID_ECX}, [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX}, Reviewed-by: Krish Sadhukhan
Re: [PATCH v5 3/3] KVM: nVMX: Fix mmu context after VMLAUNCH/VMRESUME failure
1539,6 +11546,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * accordingly. */ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + load_vmcs12_mmu_host_state(vcpu, vmcs12); + /* * The emulated instruction was already skipped in * nested_vmx_run, but the updated RIP was never Reviewed-by: Krish Sadhukhan <krish.sadhuk...@oracle.com>
Re: [PATCH v5 3/3] KVM: nVMX: Fix mmu context after VMLAUNCH/VMRESUME failure
*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + load_vmcs12_mmu_host_state(vcpu, vmcs12); + /* * The emulated instruction was already skipped in * nested_vmx_run, but the updated RIP was never Reviewed-by: Krish Sadhukhan
Re: [PATCH v5 2/3] KVM: nVMX: Validate the IA32_BNDCFGS on nested VM-entry
On 11/02/2017 11:40 PM, Wanpeng Li wrote: 2017-11-03 14:31 GMT+08:00 Krish Sadhukhan <krish.sadhuk...@oracle.com>: On 11/02/2017 05:50 PM, Wanpeng Li wrote: From: Wanpeng Li <wanpeng...@hotmail.com> According to the SDM, if the "load IA32_BNDCFGS" VM-entry controls is 1, the following checks are performed on the field for the IA32_BNDCFGS MSR: - Bits reserved in the IA32_BNDCFGS MSR must be 0. - The linear address in bits 63:12 must be canonical. Reviewed-by: Konrad Rzeszutek Wilk <konrad.w...@oracle.com> Cc: Paolo Bonzini <pbonz...@redhat.com> Cc: Radim Krčmář <rkrc...@redhat.com> Cc: Jim Mattson <jmatt...@google.com> Signed-off-by: Wanpeng Li <wanpeng...@hotmail.com> --- v3 -> v4: * simply condition * use && instead of nested "if"s arch/x86/kvm/vmx.c | 5 + 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e6c8ffa..6cf3972 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10805,6 +10805,11 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 1; } + if (kvm_mpx_supported() && + (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || + (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + return 1; + return 0; } Hi Wanpeng, The SDM check is performed only when "load IA32_BNDCFGS" VM-entry control is 1. But vmx_mpx_supported() returns true when both "load IA32_BNDCFGS" and "store IA32_BNDCFGS" VM-entry controls are 1. Therefore your check is performed when both controls are 1. Did I miss something here ? https://lkml.org/lkml/2017/11/2/748 Paolo hopes the simplification. Regards, Wanpeng Li I got that simplification and your changes look good to me. However, I am still curious know the reason why vmx_mpx_supported() returns true only when both controls are true whereas the SDM states the following: "IA32_BNDCFGS (64 bits). This field is supported only on processors that support either the 1-setting of the “load IA32_BNDCFGS” VM-entry control or that of the “clear IA32_BNDCFGS” VM-exit control." Thanks, Krish
Re: [PATCH v5 2/3] KVM: nVMX: Validate the IA32_BNDCFGS on nested VM-entry
On 11/02/2017 11:40 PM, Wanpeng Li wrote: 2017-11-03 14:31 GMT+08:00 Krish Sadhukhan : On 11/02/2017 05:50 PM, Wanpeng Li wrote: From: Wanpeng Li According to the SDM, if the "load IA32_BNDCFGS" VM-entry controls is 1, the following checks are performed on the field for the IA32_BNDCFGS MSR: - Bits reserved in the IA32_BNDCFGS MSR must be 0. - The linear address in bits 63:12 must be canonical. Reviewed-by: Konrad Rzeszutek Wilk Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jim Mattson Signed-off-by: Wanpeng Li --- v3 -> v4: * simply condition * use && instead of nested "if"s arch/x86/kvm/vmx.c | 5 + 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e6c8ffa..6cf3972 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10805,6 +10805,11 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 1; } + if (kvm_mpx_supported() && + (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || + (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + return 1; + return 0; } Hi Wanpeng, The SDM check is performed only when "load IA32_BNDCFGS" VM-entry control is 1. But vmx_mpx_supported() returns true when both "load IA32_BNDCFGS" and "store IA32_BNDCFGS" VM-entry controls are 1. Therefore your check is performed when both controls are 1. Did I miss something here ? https://lkml.org/lkml/2017/11/2/748 Paolo hopes the simplification. Regards, Wanpeng Li I got that simplification and your changes look good to me. However, I am still curious know the reason why vmx_mpx_supported() returns true only when both controls are true whereas the SDM states the following: "IA32_BNDCFGS (64 bits). This field is supported only on processors that support either the 1-setting of the “load IA32_BNDCFGS” VM-entry control or that of the “clear IA32_BNDCFGS” VM-exit control." Thanks, Krish