Avi Kivity wrote:
> On 04/29/2010 08:22 AM, Dexuan Cui wrote:
>> When the host enables XSAVE/XRSTOR, the patch exposes the XSAVE/XRSTOR
>> related CPUID leaves to the guest by fixing up kvm_emulate_cpuid(), and
>> allows the guest to set CR4.OSXSAVE to enable XSAVE.
>> The patch adds per-vcpu host/guest xstate image/mask and enhances the
>> current FXSAVE/FXRSTOR with the new XSAVE/XRSTOR on the host xstate
>> (FPU/SSE/YMM) switch.
>>
>>
>> 5 files changed, 242 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 3f0007b..60be1a7 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -303,6 +303,11 @@ struct kvm_vcpu_arch {
>> struct i387_fxsave_struct host_fx_image;
>> struct i387_fxsave_struct guest_fx_image;
>>
>> + struct xsave_struct *host_xstate_image;
>> + struct xsave_struct *guest_xstate_image;
>> + uint64_t host_xstate_mask;
>>
>
> Does host_xstate_mask need to be per-vcpu, or can it be global?
(I'm sorry for the late reply; I was on a long leave...)
Yes, I think host_xstate_mask can be global, because in Linux
xsave_cntxt_init() and xsave_init() always enable all the XSTATE
features on all CPUs (currently XCNTXT_MASK has 3 bits set). On the
host, only the kernel can execute xsetbv() and applications can't
change XCR0. I don't expect this Linux behavior to change in the future.
>
>> + uint64_t guest_xstate_mask;
>>
>
> Can be called xcr0, like other shadow registers.
OK.
>> +
>> gva_t mmio_fault_cr2;
>> struct kvm_pio_request pio;
>> void *pio_data;
>>
>>
>> @@ -3258,6 +3262,25 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu)
>>         return 1;
>> }
>>
>> +static int handle_xsetbv(struct kvm_vcpu *vcpu)
>> +{
>> + u64 new_bv = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX)) |
>> +        kvm_register_read(vcpu, VCPU_REGS_RAX);
>>
>
> Missing shift?
Oh, my carelessness! Thanks for pointing it out.
> Probably worthwhile to create a helper for reading/writing edx:eax
> into
> a u64.
OK.
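Something like this, perhaps (the helper name is only a placeholder,
not an existing function):

static u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
{
        /* Combine edx:eax into one u64, with the shift the current
         * patch is missing. */
        return ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
                (u32)kvm_register_read(vcpu, VCPU_REGS_RAX);
}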
>
>> + u64 host_bv = vcpu->arch.host_xstate_mask;
>>
>
> What about ecx?
Sorry, I missed ecx.
Currently only ecx == 0 is defined. When a guest uses a non-zero ecx,
a #GP should be injected into the guest.
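Roughly like this in the handler (only XCR0, i.e. ecx == 0, is
architecturally defined today):

        u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);

        /* #GP for any XCR other than XCR0. */
        if (index != 0) {
                kvm_inject_gp(vcpu, 0);
                return 1;
        }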
>
>> +
>> + if (((new_bv ^ host_bv) & ~host_bv)
>
> Isn't (new_bv & ~host_bv) equivalent? (guest cannot exceed host...)
OK. Will use the simple one. :-)
>
>> +     || !(new_bv & 1))
>>
>
> Symbolic value or comment.
OK, the "1" here should be XSTATE_FP.
>
>> + goto err;
>> + if ((host_bv & XSTATE_YMM & new_bv) && !(new_bv & XSTATE_SSE))
>>
>
> host_bv unneeded, I think.
Agree.
>
>> + goto err;
>> + vcpu->arch.guest_xstate_mask = new_bv;
>> + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.guest_xstate_mask);
>>
>
> Can't we run with the host xcr0? isn't it guaranteed to be a superset
> of the guest xcr0?
I agree host xcr0 is a superset of guest xcr0.
In the case where guest xcr0 has fewer bits set than host xcr0, I
suppose running with guest xcr0 would be a bit faster. If you think
simplifying the code by removing the guest_xstate_mask field is more
important, we can do that.
>> + skip_emulated_instruction(vcpu);
>> + return 1;
>> +err:
>> + kvm_inject_gp(vcpu, 0);
>>
>
> Need to #UD in some circumstances.
I don't think so. When the CPU doesn't support XSAVE, or the guest
doesn't set CR4.OSXSAVE, the guest's attempt to execute xsetbv gets a
#UD immediately inside the guest and no VM exit occurs.
BTW: I missed injecting #GP in the cases ECX != 0 or CPL != 0. Will
add them.
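For reference, here is a sketch of the revised handler with the fixes
from this thread folded in (not final code; kvm_read_edx_eax() is the
hypothetical helper suggested above):

static int handle_xsetbv(struct kvm_vcpu *vcpu)
{
        u64 new_bv = kvm_read_edx_eax(vcpu);
        u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);

        /* Only XCR0 (index 0) is defined, and XSETBV is a ring-0
         * instruction. */
        if (index != 0 || kvm_x86_ops->get_cpl(vcpu) != 0)
                goto err;
        /* The guest may not enable states the host doesn't have. */
        if (new_bv & ~vcpu->arch.host_xstate_mask)
                goto err;
        /* XCR0.FP must stay set, and YMM requires SSE. */
        if (!(new_bv & XSTATE_FP))
                goto err;
        if ((new_bv & XSTATE_YMM) && !(new_bv & XSTATE_SSE))
                goto err;

        vcpu->arch.guest_xstate_mask = new_bv;
        xsetbv(XCR_XFEATURE_ENABLED_MASK, new_bv);
        skip_emulated_instruction(vcpu);
        return 1;
err:
        kvm_inject_gp(vcpu, 0);
        return 1;
}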
>
>> + return 1;
>> +}
>> +
>> static int handle_apic_access(struct kvm_vcpu *vcpu)
>> {
>>         unsigned long exit_qualification;
>> @@ -3556,6 +3579,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>>         [EXIT_REASON_TPR_BELOW_THRESHOLD]    = handle_tpr_below_threshold,
>>         [EXIT_REASON_APIC_ACCESS]            = handle_apic_access,
>>         [EXIT_REASON_WBINVD]                 = handle_wbinvd,
>> +       [EXIT_REASON_XSETBV]                 = handle_xsetbv,
>>         [EXIT_REASON_TASK_SWITCH]            = handle_task_switch,
>>         [EXIT_REASON_MCE_DURING_VMENTRY]     = handle_machine_check,
>>         [EXIT_REASON_EPT_VIOLATION]          = handle_ept_violation,
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 6b2ce1d..2af3fbe 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -52,6 +52,8 @@
>> #include <asm/desc.h>
>> #include <asm/mtrr.h>
>> #include <asm/mce.h>
>> +#include <asm/i387.h>
>> +#include <asm/xcr.h>
>>
>> #define MAX_IO_MSRS 256
>> #define CR0_RESERVED_BITS \
>> @@ -62,6 +64,7 @@
>> (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE \
>>         | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
>>         | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
>> +       | (cpu_has_xsave ? X86_CR4_OSXSAVE : 0) \
>>         | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
>>
>
> It also depends on the guest cpuid value. Please add it outside the
If cpu_has_xsave is true, we always present XSAVE to the guest via
CPUID, so I think the code is correct here.
> macro, it's confusing to read something that looks like a constant but
> isn't.
OK. Will move it outside the macro.
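E.g. something like this in kvm_set_cr4() (guest_cpuid_has_xsave() is
a hypothetical helper that checks the XSAVE bit in the guest's
CPUID.1:ECX):

        if ((cr4 & X86_CR4_OSXSAVE) && !guest_cpuid_has_xsave(vcpu)) {
                kvm_inject_gp(vcpu, 0);
                return;
        }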
>> int kvm_emulate_halt(struct kvm_vcpu *vcpu)
>> @@ -4307,6 +4346,65 @@ not_found:
>> return 36;
>> }
>>
>> +#define bitmaskof(idx) (1U << ((idx) & 31))
>> +static void kvm_emulate_cpuid_fixup(struct kvm_vcpu *vcpu, u32 func, u32 idx)
>> +{
>> +        u32 eax, ebx, ecx, edx;
>> +
>> +        if (func != 0 && func != 1 && func != 0xd)
>> +                return;
>> +
>> +        eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
>> +        ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
>> +        ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
>> +        edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
>> +
>> +        switch (func) {
>> +        case 0:
>> +                /* fixup the Maximum Input Value */
>> +                if (cpu_has_xsave && eax < 0xd)
>> +                        eax = 0xd;
>> +                break;
>> +        case 1:
>> +                ecx &= ~(bitmaskof(X86_FEATURE_XSAVE) |
>> +                         bitmaskof(X86_FEATURE_OSXSAVE));
>> +                if (!cpu_has_xsave)
>> +                        break;
>> +                ecx |= bitmaskof(X86_FEATURE_XSAVE);
>> +                if (kvm_read_cr4(vcpu) & X86_CR4_OSXSAVE)
>> +                        ecx |= bitmaskof(X86_FEATURE_OSXSAVE);
>> +                break;
>> +        case 0xd:
>> +                eax = ebx = ecx = edx = 0;
>> +                if (!cpu_has_xsave)
>> +                        break;
>> +                switch (idx) {
>> +                case 0:
>> +                        eax = vcpu->arch.host_xstate_mask & XCNTXT_MASK;
>> +                        /* FP/SSE + XSAVE.HEADER + YMM. */
>> +                        ecx = 512 + 64;
>> +                        if (eax & XSTATE_YMM)
>> +                                ecx += XSTATE_YMM_SIZE;
>> +                        ebx = ecx;
>> +                        break;
>> +                case 2:
>> +                        if (!(vcpu->arch.host_xstate_mask & XSTATE_YMM))
>> +                                break;
>> +                        eax = XSTATE_YMM_SIZE;
>> +                        ebx = XSTATE_YMM_OFFSET;
>> +                        break;
>> +                default:
>> +                        break;
>> +                }
>> +                break;
>> +        }
>> +
>> +        kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
>> +        kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
>> +        kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
>> +        kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
>> +}
>>
>
> This should be part of KVM_GET_SUPPORTED_CPUID.
Ok. Will add this.
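Roughly like this in do_cpuid_ent() (a sketch only; host_xcr0 assumes
the global mask you suggested above, and only subleaf 0 is shown):

        case 0xd:
                entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
                if (!cpu_has_xsave)
                        break;
                /* Subleaf 0: supported xstate mask and buffer sizes. */
                entry->eax = host_xcr0 & XCNTXT_MASK;
                entry->ecx = 512 + 64; /* FP/SSE + xsave header */
                if (entry->eax & XSTATE_YMM)
                        entry->ecx += XSTATE_YMM_SIZE;
                entry->ebx = entry->ecx;
                break;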
>> @@ -5091,6 +5192,60 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
>>         return 0;
>> }
>>
>> +#ifdef CONFIG_X86_64
>> +#define REX_PREFIX "0x48, "
>> +#else
>> +#define REX_PREFIX
>> +#endif
>> +
>> +static inline void kvm_fx_save_host(struct kvm_vcpu *vcpu)
>> +{
>> +        if (cpu_has_xsave) {
>> +                asm volatile (".byte " REX_PREFIX "0x0f,0xae,0x27"
>> +                        : : "a" (-1), "d" (-1), "D" (vcpu->arch.host_xstate_image)
>> +                        : "memory");
>> +                vcpu->arch.host_xstate_mask =
>> +                        xgetbv(XCR_XFEATURE_ENABLED_MASK);
>> +        } else {
>> +                asm("fxsave (%0)" : : "r" (&vcpu->arch.host_fx_image));
>> +        }
>> +}
>> +
>> +static inline void kvm_fx_save_guest(struct kvm_vcpu *vcpu)
>> +{
>> +        if (cpu_has_xsave) {
>> +                asm volatile (".byte " REX_PREFIX "0x0f,0xae,0x27"
>> +                        : : "a" (-1), "d" (-1), "D" (vcpu->arch.guest_xstate_image)
>> +                        : "memory");
>> +                vcpu->arch.guest_xstate_mask =
>> +                        xgetbv(XCR_XFEATURE_ENABLED_MASK);
>> +        } else {
>> +                asm("fxsave (%0)" : : "r" (&vcpu->arch.guest_fx_image));
>> +        }
>> +}
>> +
>> +static inline void kvm_fx_restore_host(struct kvm_vcpu *vcpu)
>> +{
>> +        if (cpu_has_xsave) {
>> +                xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.host_xstate_mask);
>> +                asm volatile (".byte " REX_PREFIX "0x0f,0xae,0x2f"
>> +                        : : "a" (-1), "d" (-1), "D" (vcpu->arch.host_xstate_image));
>> +        } else {
>> +                asm("fxrstor (%0)" : : "r" (&vcpu->arch.host_fx_image));
>> +        }
>> +}
>> +
>> +static inline void kvm_fx_restore_guest(struct kvm_vcpu *vcpu)
>> +{
>> +        if (cpu_has_xsave) {
>> +                xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.guest_xstate_mask);
>> +                asm volatile (".byte " REX_PREFIX "0x0f,0xae,0x2f"
>> +                        : : "a" (-1), "d" (-1), "D" (vcpu->arch.guest_xstate_image));
>> +        } else {
>> +                asm("fxrstor (%0)" : : "r" (&vcpu->arch.guest_fx_image));
>> +        }
>> +}
>> +
>>
>
>
> This mostly duplicates the standard x86 fpu code. I have a patch
> somewhere that abstracts it out, I'll dig it up and send it out.
I tried to avoid touching the host kernel itself. :-)
I saw the 2 patches you sent. They (like the new APIs
fpu_alloc/fpu_free) will simplify the implementation of kvm xsave
support. Thanks a lot!
BTW: Sheng Yang (on the Cc) will help me follow up on all the
remaining issues.
Thanks,
-- Dexuan