On Wed, Jan 16, 2013 at 06:21:11PM +0800, Yang Zhang wrote:
> From: Yang Zhang <[email protected]>
>
> Basically, to benefit from apicv, we need to enable virtualized x2apic
> mode. Currently, we only enable it when the guest is actually using
> x2apic.
>
> Also, clear the MSR bitmap for the corresponding x2apic MSRs when the
> guest enables x2apic:
> 0x800 - 0x8ff: no read intercept for apicv register virtualization,
>                except APIC ID and TMCCT, which need software's
>                assistance to get the right value.
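
As an aside for anyone following the register range above: the x2APIC MSRs
map linearly onto the xAPIC register page, so a hypothetical helper (not
part of this patch) makes the 0x800 - 0x8ff range concrete:

	/* x2APIC MSR index -> xAPIC MMIO register offset */
	static inline u32 x2apic_msr_to_apic_reg(u32 msr)
	{
		return (msr - 0x800) << 4;
	}

	/*
	 * e.g. 0x802 -> 0x020 (APIC_ID), 0x808 -> 0x080 (APIC_TASKPRI),
	 *      0x839 -> 0x390 (APIC_TMCCT)
	 */
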
>
> Signed-off-by: Kevin Tian <[email protected]>
> Signed-off-by: Yang Zhang <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/include/asm/vmx.h | 1 +
> arch/x86/kvm/lapic.c | 20 ++--
> arch/x86/kvm/lapic.h | 5 +
> arch/x86/kvm/svm.c | 6 +
>  arch/x86/kvm/vmx.c              |  204 +++++++++++++++++++++++++++++++++++----
> 6 files changed, 209 insertions(+), 28 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c431b33..35aa8e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -697,6 +697,7 @@ struct kvm_x86_ops {
> void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
> void (*enable_irq_window)(struct kvm_vcpu *vcpu);
> void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
> + void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
> int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> int (*get_tdp_level)(void);
> u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 44c3f7e..0a54df0 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -139,6 +139,7 @@
> #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
> #define SECONDARY_EXEC_ENABLE_EPT 0x00000002
> #define SECONDARY_EXEC_RDTSCP 0x00000008
> +#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
> #define SECONDARY_EXEC_ENABLE_VPID 0x00000020
> #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
> #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 0664c13..f39aee3 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -140,11 +140,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
> (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
> APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
>
> -static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> -{
> - return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> -}
> -
> static inline int kvm_apic_id(struct kvm_lapic *apic)
> {
> return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> @@ -1323,12 +1318,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
> if (!kvm_vcpu_is_bsp(apic->vcpu))
> value &= ~MSR_IA32_APICBASE_BSP;
>
> - vcpu->arch.apic_base = value;
> - if (apic_x2apic_mode(apic)) {
> - u32 id = kvm_apic_id(apic);
> - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> - kvm_apic_set_ldr(apic, ldr);
> + if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
> + if (value & X2APIC_ENABLE) {
> + u32 id = kvm_apic_id(apic);
> + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
> + kvm_apic_set_ldr(apic, ldr);
> + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
> + } else
> + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
> }
> +
> + vcpu->arch.apic_base = value;
> apic->base_address = apic->vcpu->arch.apic_base &
> MSR_IA32_APICBASE_BASE;
>
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 9a8ee22..22a5397 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -126,4 +126,9 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
> return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
> }
>
> +static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> +{
> + return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> +}
> +
> #endif
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d29d3cd..38407e9 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
> set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
> }
>
> +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
> +{
> + return;
> +}
> +
> static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
> {
> struct vcpu_svm *svm = to_svm(vcpu);
> @@ -4290,6 +4295,7 @@ static struct kvm_x86_ops svm_x86_ops = {
> .enable_nmi_window = enable_nmi_window,
> .enable_irq_window = enable_irq_window,
> .update_cr8_intercept = update_cr8_intercept,
> + .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
>
> .set_tss_addr = svm_set_tss_addr,
> .get_tdp_level = get_npt_level,
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 0403634..526955d 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -643,6 +643,8 @@ static unsigned long *vmx_io_bitmap_a;
> static unsigned long *vmx_io_bitmap_b;
> static unsigned long *vmx_msr_bitmap_legacy;
> static unsigned long *vmx_msr_bitmap_longmode;
> +static unsigned long *vmx_msr_bitmap_legacy_x2apic;
> +static unsigned long *vmx_msr_bitmap_longmode_x2apic;
>
> static bool cpu_has_load_ia32_efer;
> static bool cpu_has_load_perf_global_ctrl;
> @@ -767,6 +769,12 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
> SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> }
>
> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
> +{
> + return vmcs_config.cpu_based_2nd_exec_ctrl &
> + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> +}
> +
> static inline bool cpu_has_vmx_apic_register_virt(void)
> {
> return vmcs_config.cpu_based_2nd_exec_ctrl &
> @@ -1830,6 +1838,24 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
> vmx->guest_msrs[from] = tmp;
> }
>
> +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
> +{
> + unsigned long *msr_bitmap;
> +
> + if (apic_x2apic_mode(vcpu->arch.apic))
> + if (is_long_mode(vcpu))
> + msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
> + else
> + msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
> + else
> + if (is_long_mode(vcpu))
> + msr_bitmap = vmx_msr_bitmap_longmode;
> + else
> + msr_bitmap = vmx_msr_bitmap_legacy;
> +
> + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
> +}
> +
> /*
> * Set up the vmcs to automatically save and restore system
> * msrs. Don't touch the 64-bit msrs if the guest is in legacy
> @@ -1838,7 +1864,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
> static void setup_msrs(struct vcpu_vmx *vmx)
> {
> int save_nmsrs, index;
> - unsigned long *msr_bitmap;
>
> save_nmsrs = 0;
> #ifdef CONFIG_X86_64
> @@ -1870,14 +1895,8 @@ static void setup_msrs(struct vcpu_vmx *vmx)
>
> vmx->save_nmsrs = save_nmsrs;
>
> - if (cpu_has_vmx_msr_bitmap()) {
> - if (is_long_mode(&vmx->vcpu))
> - msr_bitmap = vmx_msr_bitmap_longmode;
> - else
> - msr_bitmap = vmx_msr_bitmap_legacy;
> -
> - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
> - }
> + if (cpu_has_vmx_msr_bitmap())
> + vmx_set_msr_bitmap(&vmx->vcpu);
> }
>
> /*
> @@ -2543,6 +2562,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
> if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
> min2 = 0;
> opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
> SECONDARY_EXEC_WBINVD_EXITING |
> SECONDARY_EXEC_ENABLE_VPID |
> SECONDARY_EXEC_ENABLE_EPT |
> @@ -3724,7 +3744,10 @@ static void free_vpid(struct vcpu_vmx *vmx)
> spin_unlock(&vmx_vpid_lock);
> }
>
> -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
> +#define MSR_TYPE_R 1
> +#define MSR_TYPE_W 2
> +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
> + u32 msr, int type)
> {
> int f = sizeof(unsigned long);
>
> @@ -3737,20 +3760,99 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
> * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
> */
> if (msr <= 0x1fff) {
> - __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
> - __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
> + if (type & MSR_TYPE_R)
> + /* read-low */
> + __clear_bit(msr, msr_bitmap + 0x000 / f);
> +
> + if (type & MSR_TYPE_W)
> + /* write-low */
> + __clear_bit(msr, msr_bitmap + 0x800 / f);
> +
> } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> msr &= 0x1fff;
> - __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
> - __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
> + if (type & MSR_TYPE_R)
> + /* read-high */
> + __clear_bit(msr, msr_bitmap + 0x400 / f);
> +
> + if (type & MSR_TYPE_W)
> + /* write-high */
> + __clear_bit(msr, msr_bitmap + 0xc00 / f);
> +
> + }
> +}
> +
> +static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
> + u32 msr, int type)
> +{
> + int f = sizeof(unsigned long);
> +
> + if (!cpu_has_vmx_msr_bitmap())
> + return;
> +
> + /*
> + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
> + * have the write-low and read-high bitmap offsets the wrong way round.
> + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
> + */
> + if (msr <= 0x1fff) {
> + if (type & MSR_TYPE_R)
> + /* read-low */
> + __set_bit(msr, msr_bitmap + 0x000 / f);
> +
> + if (type & MSR_TYPE_W)
> + /* write-low */
> + __set_bit(msr, msr_bitmap + 0x800 / f);
> +
> + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
> + msr &= 0x1fff;
> + if (type & MSR_TYPE_R)
> + /* read-high */
> + __set_bit(msr, msr_bitmap + 0x400 / f);
> +
> + if (type & MSR_TYPE_W)
> + /* write-high */
> + __set_bit(msr, msr_bitmap + 0xc00 / f);
> +
> }
> }
>
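
The four offsets used above follow the SDM's MSR-bitmap layout; restating
it here since the in-code comment only hints at it (illustration only,
matching the code above):

	/*
	 * 4K MSR bitmap page, one bit per MSR, intercept taken when set:
	 *   0x000-0x3ff: reads,  MSRs 0x00000000-0x00001fff ("read-low")
	 *   0x400-0x7ff: reads,  MSRs 0xc0000000-0xc0001fff ("read-high")
	 *   0x800-0xbff: writes, MSRs 0x00000000-0x00001fff ("write-low")
	 *   0xc00-0xfff: writes, MSRs 0xc0000000-0xc0001fff ("write-high")
	 */

	/* e.g. let x2APIC TPR (MSR 0x808) reads through without exiting: */
	__clear_bit(0x808, msr_bitmap + 0x000 / sizeof(unsigned long));
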
> static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
> {
> if (!longmode_only)
> - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
> - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
> + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
> + msr, MSR_TYPE_R | MSR_TYPE_W);
> + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
> + msr, MSR_TYPE_R | MSR_TYPE_W);
> +}
> +
> +static void vmx_intercept_for_msr_read_x2apic(u32 msr, bool set)
> +{
> + if (set) {
> + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> + msr, MSR_TYPE_R);
> + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> + msr, MSR_TYPE_R);
> + } else {
> + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> + msr, MSR_TYPE_R);
> + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> + msr, MSR_TYPE_R);
> + }
> +}
> +
> +static void vmx_intercept_for_msr_write_x2apic(u32 msr, bool set)
> +{
> + if (set) {
> + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> + msr, MSR_TYPE_W);
> + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> + msr, MSR_TYPE_W);
> + } else {
> + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
> + msr, MSR_TYPE_W);
> + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
> + msr, MSR_TYPE_W);
> + }
> }
>
> /*
> @@ -3848,6 +3950,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
> exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> if (!enable_apicv_reg)
> exec_control &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
> + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> return exec_control;
> }
>
> @@ -6103,6 +6206,53 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
> vmcs_write32(TPR_THRESHOLD, irr);
> }
>
> +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
> +{
> + u32 exec_control, sec_exec_control;
> + int msr;
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	/* There is no point in enabling virtualized x2apic mode
> +	 * without apicv. */
> + if (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg)
> + return;
> +
> + if (set) {
> + exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> + /* virtualize x2apic mode relies on tpr shadow */
> + if (!(exec_control & CPU_BASED_TPR_SHADOW))
> + return;
> + }
> +
> + sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +
> + if (set) {
> + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> + } else {
> + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
> + if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
> + sec_exec_control |=
> + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> + }
> + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
> +
> + if (set) {
> + for (msr = 0x800; msr <= 0x8ff; msr++)
> + vmx_intercept_for_msr_read_x2apic(msr, false);
> +
> +		/* According to the SDM, in x2apic mode the whole ID register
> +		 * is used, but KVM only uses the highest eight bits, so
> +		 * reads still need to be intercepted. */
> + vmx_intercept_for_msr_read_x2apic(0x802, true);
> + /* TMCCT */
> + vmx_intercept_for_msr_read_x2apic(0x839, true);
> + /* TPR */
> + vmx_intercept_for_msr_write_x2apic(0x808, false);
> + }
Do it during vmx init, not here. Here you only need to call
vmx_set_msr_bitmap().
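Something along these lines in vmx_init(), next to the existing memcpy()s
of the base bitmaps (rough sketch, untested, reusing the helpers this
patch adds):

	memcpy(vmx_msr_bitmap_legacy_x2apic,
	       vmx_msr_bitmap_legacy, PAGE_SIZE);
	memcpy(vmx_msr_bitmap_longmode_x2apic,
	       vmx_msr_bitmap_longmode, PAGE_SIZE);

	if (enable_apicv_reg) {
		int msr;

		for (msr = 0x800; msr <= 0x8ff; msr++)
			vmx_intercept_for_msr_read_x2apic(msr, false);
		/* APIC ID (0x802) and TMCCT (0x839) still need read
		 * intercepts; TPR (0x808) writes can go straight through. */
		vmx_intercept_for_msr_read_x2apic(0x802, true);
		vmx_intercept_for_msr_read_x2apic(0x839, true);
		vmx_intercept_for_msr_write_x2apic(0x808, false);
	}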
> + vmx_set_msr_bitmap(vcpu);
> +}
> +
> static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
> {
> u32 exit_intr_info;
> @@ -7366,6 +7516,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
> .enable_nmi_window = enable_nmi_window,
> .enable_irq_window = enable_irq_window,
> .update_cr8_intercept = update_cr8_intercept,
> + .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
>
> .set_tss_addr = vmx_set_tss_addr,
> .get_tdp_level = get_ept_level,
> @@ -7419,11 +7570,19 @@ static int __init vmx_init(void)
> if (!vmx_msr_bitmap_legacy)
> goto out1;
>
> + vmx_msr_bitmap_legacy_x2apic =
> + (unsigned long *)__get_free_page(GFP_KERNEL);
> + if (!vmx_msr_bitmap_legacy_x2apic)
> + goto out2;
>
> vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
> if (!vmx_msr_bitmap_longmode)
> - goto out2;
> + goto out3;
>
> + vmx_msr_bitmap_longmode_x2apic =
> + (unsigned long *)__get_free_page(GFP_KERNEL);
> + if (!vmx_msr_bitmap_longmode_x2apic)
> + goto out4;
>
> /*
> * Allow direct access to the PC debug port (it is often used for I/O
> @@ -7456,6 +7615,11 @@ static int __init vmx_init(void)
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
> vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
>
> + memcpy(vmx_msr_bitmap_legacy_x2apic,
> + vmx_msr_bitmap_legacy, PAGE_SIZE);
> + memcpy(vmx_msr_bitmap_longmode_x2apic,
> + vmx_msr_bitmap_longmode, PAGE_SIZE);
> +
> if (enable_ept) {
> kvm_mmu_set_mask_ptes(0ull,
> (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
> @@ -7468,8 +7632,10 @@ static int __init vmx_init(void)
>
> return 0;
>
> -out3:
> +out4:
> free_page((unsigned long)vmx_msr_bitmap_longmode);
> +out3:
> + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
> out2:
> free_page((unsigned long)vmx_msr_bitmap_legacy);
> out1:
> @@ -7481,6 +7647,8 @@ out:
>
> static void __exit vmx_exit(void)
> {
> + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
> + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
> free_page((unsigned long)vmx_msr_bitmap_legacy);
> free_page((unsigned long)vmx_msr_bitmap_longmode);
> free_page((unsigned long)vmx_io_bitmap_b);
> --
> 1.7.1
--
Gleb.