Attached patch is modified according your comments to use 4 msr bitmaps. Please 
help to review it.

Also, is there any comments for the other part of this series? If yes, welcome. 
:)

Gleb Natapov wrote on 2013-01-14:
> On Mon, Jan 14, 2013 at 11:25:45AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-14:
>>> On Mon, Jan 14, 2013 at 11:10:26AM +0000, Zhang, Yang Z wrote:
>>>> Gleb Natapov wrote on 2013-01-14:
>>>>> On Mon, Jan 14, 2013 at 11:01:02AM +0000, Zhang, Yang Z wrote:
>>>>>> Gleb Natapov wrote on 2013-01-14:
>>>>>>> On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote:
>>>>>>>> From: Yang Zhang <[email protected]>
>>>>>>>> 
>>>>>>>> basically to benefit from apicv, we need to enable virtualized
>>>>>>>> x2apic mode. Currently, we only enable it when guest is really
>>>>>>>> using x2apic.
>>>>>>>> 
>>>>>>>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest
>>> enabled
>>>>>>> x2apic:
>>>>>>>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>>>>>>>>                    except APIC ID and TMCCT.
>>>>>>>>     APIC ID and TMCCT: need software's assistance to get right value.
>>>>>>> Actually since msr bitmap is shared between all vcpus this will break
>>>>>>> guests that do not enable x2apic.
>>>>>> I don't think this case will exist. It will break the real OS too.
>>>>>> 
>>>>> Which case? One VM uses x2apic another one does not? Bitmap is shared
>>>>> between all vcpus of all VMs.
>>>> Sorry. I misread your comments.
>>>> 
>>> I miswrote it. Forgot to include "of all VMs" there.
>>> 
>>>> Yes, it is really a problem. Maybe we need to use per VM msr bitmap instead
>>> global bitmap.
>>> Are you sure cpus cannot be in different modes during boot and smp
>>> initialization? Where spec says that?
>> I don't think hardware has this limitation. I mean use per vcpu's msr
>> bitmap instead global bitmap.
>> 
> Even if HW has such limitation we cannot have per VM bitmap since
> malicious guest can abuse it.
> 
> Per cpu msr bitmap means that each vcpu will waste one more page of
> memory. We already have different global msr bitmap for long mode
> and legacy mode, may be make it 4: x2apic X long X legacy.
> 
>>>> 
>>>>>>>> Signed-off-by: Kevin Tian <[email protected]>
>>>>>>>> Signed-off-by: Yang Zhang <[email protected]>
>>>>>>>> ---
>>>>>>>>  arch/x86/include/asm/kvm_host.h |    1 +
>>> arch/x86/include/asm/vmx.h
>>>>>>>>    |    1 + arch/x86/kvm/lapic.c            |   15 +++-
>>>>>>>>  arch/x86/kvm/svm.c              |    6 ++ arch/x86/kvm/vmx.c
>>>>>>>>     |  162 +++++++++++++++++++++++++++++++++++++-- 5 files
>>>>> changed, 173
>>>>>>>>  insertions(+), 12 deletions(-)
>>>>>>>> diff --git a/arch/x86/include/asm/kvm_host.h
>>>>>>>> b/arch/x86/include/asm/kvm_host.h index c431b33..35aa8e6 100644
>>>>>>>> --- a/arch/x86/include/asm/kvm_host.h +++
>>>>>>>> b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,7 @@ struct
>>>>>>>> kvm_x86_ops {
>>>>>>>>        void (*enable_nmi_window)(struct kvm_vcpu *vcpu);       void
>>>>>>>>  (*enable_irq_window)(struct kvm_vcpu *vcpu);  void
>>>>>>>>  (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int
>>>>>>>>  irr); +       void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu,
>>>>>>>>  bool set);    int (*set_tss_addr)(struct kvm *kvm, unsigned int
>>>>>>>>  addr);        int (*get_tdp_level)(void);     u64 
>>>>>>>> (*get_mt_mask)(struct
>>>>>>>>  kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
>>>>>>>> diff --git a/arch/x86/include/asm/vmx.h
>>>>>>>> b/arch/x86/include/asm/vmx.h index 44c3f7e..0a54df0 100644 ---
>>>>>>>> a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@
>>>>>>>> -139,6 +139,7 @@
>>>>>>>>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
>>>>>>>>  #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
>>>>>>>>  #define SECONDARY_EXEC_RDTSCP                 0x00000008 +#define
>>>>>>>>  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010 #define
>>>>>>>>  SECONDARY_EXEC_ENABLE_VPID              0x00000020 #define
>>>>>>>>  SECONDARY_EXEC_WBINVD_EXITING         0x00000040 #define
>>>>>>>>  SECONDARY_EXEC_UNRESTRICTED_GUEST     0x00000080
>>>>>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>>>>>>> index 0664c13..2ef5e2b 100644
>>>>>>>> --- a/arch/x86/kvm/lapic.c
>>>>>>>> +++ b/arch/x86/kvm/lapic.c
>>>>>>>> @@ -1323,12 +1323,17 @@ void kvm_lapic_set_base(struct kvm_vcpu
>>>>> *vcpu,
>>>>>>> u64 value)
>>>>>>>>        if (!kvm_vcpu_is_bsp(apic->vcpu))
>>>>>>>>                value &= ~MSR_IA32_APICBASE_BSP;
>>>>>>>> -      vcpu->arch.apic_base = value;
>>>>>>>> -      if (apic_x2apic_mode(apic)) {
>>>>>>>> -              u32 id = kvm_apic_id(apic);
>>>>>>>> -              u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>>>>>>>> -              kvm_apic_set_ldr(apic, ldr);
>>>>>>>> +      if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
>>>>>>>> +              if (value & X2APIC_ENABLE) {
>>>>>>>> +                      u32 id = kvm_apic_id(apic);
>>>>>>>> +                      u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>>>>>>>> +                      kvm_apic_set_ldr(apic, ldr);
>>>>>>>> +                      kvm_x86_ops->set_virtual_x2apic_mode(vcpu, 
>>>>>>>> true);
>>>>>>>> +              } else
>>>>>>>> +                      kvm_x86_ops->set_virtual_x2apic_mode(vcpu, 
>>>>>>>> false);
>>>>>>>>        }
>>>>>>>> +
>>>>>>>> +      vcpu->arch.apic_base = value;
>>>>>>>>        apic->base_address = apic->vcpu->arch.apic_base &
>>>>>>>>                             MSR_IA32_APICBASE_BASE;
>>>>>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>>>>>>>> index d29d3cd..38407e9 100644
>>>>>>>> --- a/arch/x86/kvm/svm.c
>>>>>>>> +++ b/arch/x86/kvm/svm.c
>>>>>>>> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct
>>>>> kvm_vcpu
>>>>>>> *vcpu, int tpr, int irr)
>>>>>>>>                set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>>>>>>>>  }
>>>>>>>> +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu,
>>>>>>>> bool set) +{ + return; +} +
>>>>>>>>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) {   struct
>>>>>>>>  vcpu_svm *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static
>>>>>>>>  struct kvm_x86_ops svm_x86_ops = {    .enable_nmi_window =
>>>>>>>>  enable_nmi_window,    .enable_irq_window = enable_irq_window,
>>>>>>>>        .update_cr8_intercept = update_cr8_intercept,
>>>>>>>> +      .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
>>>>>>>> 
>>>>>>>>        .set_tss_addr = svm_set_tss_addr,
>>>>>>>>        .get_tdp_level = get_npt_level,
>>>>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>>>>>> index 0403634..847022e 100644
>>>>>>>> --- a/arch/x86/kvm/vmx.c
>>>>>>>> +++ b/arch/x86/kvm/vmx.c
>>>>>>>> @@ -767,6 +767,12 @@ static inline bool
>>>>>>> cpu_has_vmx_virtualize_apic_accesses(void)
>>>>>>>>                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>>>>>>  }
>>>>>>>> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
>>>>>>>> +{
>>>>>>>> +      return vmcs_config.cpu_based_2nd_exec_ctrl &
>>>>>>>> +              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>  static inline bool cpu_has_vmx_apic_register_virt(void)
>>>>>>>>  {
>>>>>>>>        return vmcs_config.cpu_based_2nd_exec_ctrl &
>>>>>>>> @@ -2543,6 +2549,7 @@ static __init int setup_vmcs_config(struct
>>>>>>> vmcs_config *vmcs_conf)
>>>>>>>>        if (_cpu_based_exec_control &
>>>>>>>>  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {              min2 = 0;       
>>>>>>>>         opt2 =
>>>>>>>>  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
>>>>>>>>  +                     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
>>>>>>>>                        SECONDARY_EXEC_WBINVD_EXITING |         
>>>>>>>> SECONDARY_EXEC_ENABLE_VPID |
>>>>>>>>                        SECONDARY_EXEC_ENABLE_EPT | @@ -3724,7 +3731,45 
>>>>>>>> @@ static
>>>>>>>>  void free_vpid(struct vcpu_vmx *vmx)
>       spin_unlock(&vmx_vpid_lock); }
>>>>>>>> -static void __vmx_disable_intercept_for_msr(unsigned long
>>>>>>>> *msr_bitmap, u32 msr) +#define MSR_TYPE_R      1 +#define MSR_TYPE_W   
>>>>>>>> 2
>>>>>>>> +static void __vmx_disable_intercept_for_msr(unsigned long
>>>>>>>> *msr_bitmap, +                                         u32 msr, int 
>>>>>>>> type) +{ + int f =
>>>>>>>> sizeof(unsigned long); + +     if (!cpu_has_vmx_msr_bitmap())
>>>>>>>> +              return; + +     /* +     * See Intel PRM Vol. 3, 20.6.9 
>>>>>>>> (MSR-Bitmap
>>>>>>>> Address). Early manuals +
>>>>>>>> 
>>>>>>>> * have the write-low and read-high bitmap offsets the wrong way
>>>>>>>> round. +        * We can control MSRs 0x00000000-0x00001fff and
>>>>>>>> 0xc0000000-0xc0001fff. +        */ +   if (msr <= 0x1fff) { +          
>>>>>>>> if (type &
>>>>>>>> MSR_TYPE_R) +                  /* read-low */ +        
>>>>>>>> __clear_bit(msr, msr_bitmap +
>>>>>>>> 0x000 / f); + +                if (type & MSR_TYPE_W) +                
>>>>>>>>         /* write-low */ +
>>>>>>>>        __clear_bit(msr, msr_bitmap + 0x800 / f); + +   } else if ((msr 
>>>>>>>> >=
>>>>>>>> 0xc0000000) && (msr <= 0xc0001fff)) { +                msr &= 0x1fff; 
>>>>>>>> +        if
>>>>>>>> (type & MSR_TYPE_R) +  /* read-high */ +                       
>>>>>>>> __clear_bit(msr,
>>>>>>>> msr_bitmap + 0x400 / f); + +           if (type & MSR_TYPE_W) +        
>>>>>>>>                 /*
>>>>>>>> write-high */ +        __clear_bit(msr, msr_bitmap + 0xc00 / f); + +   
>>>>>>>> }
>>>>>>>> +} + +static void __vmx_enable_intercept_for_msr(unsigned long
>>>>>>>> *msr_bitmap, +                                 u32 msr, int type)
>>>>>>>>  {
>>>>>>>>        int f = sizeof(unsigned long);
>>>>>>>> @@ -3737,20 +3782,75 @@ static void
>>>>>>> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32
> msr)
>>>>>>>>         * We can control MSRs 0x00000000-0x00001fff and
>>>>>>>>  0xc0000000-0xc0001fff.         */     if (msr <= 0x1fff) {
>>>>>>>> -              __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
>>>>>>>> -              __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low 
>>>>>>>> */
>>>>>>>> +              if (type & MSR_TYPE_R)
>>>>>>>> +                      /* read-low */
>>>>>>>> +                      __set_bit(msr, msr_bitmap + 0x000 / f);
>>>>>>>> +
>>>>>>>> +              if (type & MSR_TYPE_W)
>>>>>>>> +                      /* write-low */
>>>>>>>> +                      __set_bit(msr, msr_bitmap + 0x800 / f);
>>>>>>>> +
>>>>>>>>        } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
>>>>>>>>                msr &= 0x1fff;
>>>>>>>> -              __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high 
>>>>>>>> */
>>>>>>>> -              __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high 
>>>>>>>> */
>>>>>>>> +              if (type & MSR_TYPE_R)
>>>>>>>> +                      /* read-high */
>>>>>>>> +                      __set_bit(msr, msr_bitmap + 0x400 / f);
>>>>>>>> +
>>>>>>>> +              if (type & MSR_TYPE_W)
>>>>>>>> +                      /* write-high */
>>>>>>>> +                      __set_bit(msr, msr_bitmap + 0xc00 / f);
>>>>>>>> +
>>>>>>>>        } } + static void vmx_disable_intercept_for_msr(u32 msr, bool
>>>>>>>>  longmode_only) {      if (!longmode_only)
>>>>>>>> -      __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
>>>>>>>> -      __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
>>>>>>>> +      __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +
>>>>>>>>                                                msr, MSR_TYPE_R | 
>>>>>>>> MSR_TYPE_W);
>>>>>>>> +      __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>>>>>> +                                              msr, MSR_TYPE_R | 
>>>>>>>> MSR_TYPE_W); +} + +static void
>>>>>>>> vmx_intercept_for_msr_read(u32 msr, bool longmode_only, +              
>>>>>>>> bool
>>>>>>>> set) +{ +      if (!longmode_only) { +         if (set) +
>>>>>>>>        __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, +         
>>>>>>>>                         msr,
>>>>>>>> MSR_TYPE_R); +         else +
>>>>>>>>        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +
>>>>>>>>                                msr, MSR_TYPE_R); + +   } +     if 
>>>>>>>> (set) +
>>>>>>>>        __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>>>>>> +                              msr, MSR_TYPE_R); +     else +
>>>>>>>>        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>>>>>> +                              msr, MSR_TYPE_R); +} + +static void
>>>>>>>> vmx_intercept_for_msr_write(u32 msr, bool longmode_only, +
>>>>>>>>                                bool set) +{ +  if (!longmode_only) { + 
>>>>>>>>         if (set) +
>>>>>>>>        __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, +
>>>>>>>>                                        msr, MSR_TYPE_W); +             
>>>>>>>> else +
>>>>>>>>        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +
>>>>>>>>                                msr, MSR_TYPE_W); + +   } +     if 
>>>>>>>> (set) +
>>>>>>>>        __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>>>>>> +                              msr, MSR_TYPE_W); +     else +
>>>>>>>>        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>>>>>> +                              msr, MSR_TYPE_W);
>>>>>>>>  }
>>>>>>>>  
>>>>>>>>  /*
>>>>>>>> @@ -3848,6 +3948,7 @@ static u32
> vmx_secondary_exec_control(struct
>>>>>>> vcpu_vmx *vmx)
>>>>>>>>                exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;     
>>>>>>>> if
>>>>>>>>  (!enable_apicv_reg)           exec_control &=
>>>>>>>>  ~SECONDARY_EXEC_APIC_REGISTER_VIRT; + exec_control &=
>>>>>>>>  ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;       return
>>>>> exec_control; }
>>>>>>>> @@ -6103,6 +6204,52 @@ static void update_cr8_intercept(struct
>>>>> kvm_vcpu
>>>>>>> *vcpu, int tpr, int irr)
>>>>>>>>        vmcs_write32(TPR_THRESHOLD, irr);
>>>>>>>>  }
>>>>>>>> +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu,
>>>>>>>> bool set) +{ + u32 exec_control, sec_exec_control; +   int msr;
>>>>>>>> +      struct vcpu_vmx *vmx = to_vmx(vcpu); + +        /* There is not 
>>>>>>>> point
>>>>>>>> to enable virtualize x2apic without enable +    * apicv*/ +    if
>>>>>>>> (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg) +
>>>>>>>>        return; + +     if (set) { +            exec_control =
>>>>>>>> vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); +              /* virtualize 
>>>>>>>> x2apic
>>>>>>>> mode relies on tpr shadow */ +         if (!(exec_control &
>>>>>>>> CPU_BASED_TPR_SHADOW)) +                       return; +       } + +   
>>>>>>>> sec_exec_control =
>>>>>>>> vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + +    if (set) {
>>>>>>>> +              sec_exec_control &= 
>>>>>>>> ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>>>>>> +              sec_exec_control |= 
>>>>>>>> SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +    }
>>>>>>>> else { +               sec_exec_control &=
>>>>>>>> ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +              if
>>>>>>>> (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) +
>>>>>>>>        sec_exec_control |= +   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>>>>>> +      } +     vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 
>>>>>>>> sec_exec_control);
>>>>>>>> + +    for (msr = 0x800; msr <= 0x8ff; msr++) +
>>>>>>>>        vmx_intercept_for_msr_read(msr, false, !set); + +       if 
>>>>>>>> (set) { +
>>>>>>>>        /* According SDM, in x2apic mode, the whole id reg is used. +   
>>>>>>>>         
>>>>>>>> * But in KVM, it only use the highest eight bits. Need to +            
>>>>>>>>  *
>>>>>>>> intercept it*/ +       vmx_intercept_for_msr_read(0x802, false, true);
>>>>>>>> +      /* TMCCT */ +   vmx_intercept_for_msr_read(0x839, false, true);
>>>>>>>> +      } +     /* TPR */ +     vmx_intercept_for_msr_write(0x808, 
>>>>>>>> false,
>>>>>>>> !set); +} +
>>>>>>>>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) {
>>>>>>>>        u32 exit_intr_info; @@ -7366,6 +7513,7 @@ static struct
>>>>>>>>  kvm_x86_ops vmx_x86_ops = {   .enable_nmi_window =
>>>>>>>>  enable_nmi_window,    .enable_irq_window = enable_irq_window,
>>>>>>>>        .update_cr8_intercept = update_cr8_intercept,
>>>>>>>> +      .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
>>>>>>>> 
>>>>>>>>        .set_tss_addr = vmx_set_tss_addr,
>>>>>>>>        .get_tdp_level = get_ept_level,
>>>>>>>> --
>>>>>>>> 1.7.1
>>>>>>> 
>>>>>>> --
>>>>>>>                         Gleb.
>>>>>> 
>>>>>> 
>>>>>> Best regards,
>>>>>> Yang
>>>>> 
>>>>> --
>>>>>                   Gleb.
>>>> 
>>>> 
>>>> Best regards,
>>>> Yang
>>>> 
>>> 
>>> --
>>>                     Gleb.
>> 
>> 
>> Best regards,
>> Yang
>> 
> 
> --
>                       Gleb.


Best regards,
Yang


Attachment: x2apic_revise.patch
Description: x2apic_revise.patch

Reply via email to