Gleb Natapov wrote on 2013-01-14:
> On Mon, Jan 14, 2013 at 11:10:26AM +0000, Zhang, Yang Z wrote:
>> Gleb Natapov wrote on 2013-01-14:
>>> On Mon, Jan 14, 2013 at 11:01:02AM +0000, Zhang, Yang Z wrote:
>>>> Gleb Natapov wrote on 2013-01-14:
>>>>> On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote:
>>>>>> From: Yang Zhang <[email protected]>
>>>>>> 
>>>>>> basically to benefit from apicv, we need to enable virtualized
>>>>>> x2apic mode. Currently, we only enable it when guest is really
>>>>>> using x2apic.
>>>>>> 
>>>>>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest
> enabled
>>>>> x2apic:
>>>>>>     0x800 - 0x8ff: no read intercept for apicv register virtualization,
>>>>>>                    except APIC ID and TMCCT.
>>>>>>     APIC ID and TMCCT: need software's assistance to get right value.
>>>>> Actually since msr bitmap is shared between all vcpus this will break
>>>>> guests that do not enable x2apic.
>>>> I don't think this case will exist. It will break the real OS too.
>>>> 
>>> Which case? One VM uses x2apic another one does not? Bitmap is shared
>>> between all vcpus of all VMs.
>> Sorry. I misread your comments.
>> 
> I miswrote it. Forgot to include "of all VMs" there.
> 
>> Yes, it is really a problem. Maybe we need to use per VM msr bitmap instead
> global bitmap.
> Are you sure cpus cannot be in different modes during boot and smp
> initialization? Where spec says that?
I don't think hardware has this limitation. I mean use per vcpu's msr bitmap 
instead global bitmap.

>> 
>>>>>> Signed-off-by: Kevin Tian <[email protected]>
>>>>>> Signed-off-by: Yang Zhang <[email protected]>
>>>>>> ---
>>>>>>  arch/x86/include/asm/kvm_host.h |    1 +
> arch/x86/include/asm/vmx.h
>>>>>>    |    1 + arch/x86/kvm/lapic.c            |   15 +++-
>>>>>>  arch/x86/kvm/svm.c              |    6 ++ arch/x86/kvm/vmx.c
>>>>>>     |  162 +++++++++++++++++++++++++++++++++++++-- 5 files
>>> changed, 173
>>>>>>  insertions(+), 12 deletions(-)
>>>>>> diff --git a/arch/x86/include/asm/kvm_host.h
>>>>>> b/arch/x86/include/asm/kvm_host.h index c431b33..35aa8e6 100644 ---
>>>>>> a/arch/x86/include/asm/kvm_host.h +++
>>>>>> b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,7 @@ struct
>>>>>> kvm_x86_ops {
>>>>>>          void (*enable_nmi_window)(struct kvm_vcpu *vcpu);       void
>>>>>>  (*enable_irq_window)(struct kvm_vcpu *vcpu);    void
>>>>>>  (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
>>>>>>  +       void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool
>>>>>>  set);   int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>>>>>>          int (*get_tdp_level)(void);     u64 (*get_mt_mask)(struct 
>>>>>> kvm_vcpu
>>>>>>  *vcpu, gfn_t gfn, bool is_mmio);
>>>>>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>>>>>> index 44c3f7e..0a54df0 100644
>>>>>> --- a/arch/x86/include/asm/vmx.h
>>>>>> +++ b/arch/x86/include/asm/vmx.h
>>>>>> @@ -139,6 +139,7 @@
>>>>>>  #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define
>>>>>>  SECONDARY_EXEC_ENABLE_EPT               0x00000002 #define
>>>>>>  SECONDARY_EXEC_RDTSCP                   0x00000008 +#define
>>>>>>  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE   0x00000010 #define
>>>>>>  SECONDARY_EXEC_ENABLE_VPID              0x00000020 #define
>>>>>>  SECONDARY_EXEC_WBINVD_EXITING           0x00000040 #define
>>>>>>  SECONDARY_EXEC_UNRESTRICTED_GUEST       0x00000080
>>>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>>>>> index 0664c13..2ef5e2b 100644
>>>>>> --- a/arch/x86/kvm/lapic.c
>>>>>> +++ b/arch/x86/kvm/lapic.c
>>>>>> @@ -1323,12 +1323,17 @@ void kvm_lapic_set_base(struct kvm_vcpu
>>> *vcpu,
>>>>> u64 value)
>>>>>>          if (!kvm_vcpu_is_bsp(apic->vcpu))
>>>>>>                  value &= ~MSR_IA32_APICBASE_BSP;
>>>>>> -        vcpu->arch.apic_base = value;
>>>>>> -        if (apic_x2apic_mode(apic)) {
>>>>>> -                u32 id = kvm_apic_id(apic);
>>>>>> -                u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>>>>>> -                kvm_apic_set_ldr(apic, ldr);
>>>>>> +        if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) {
>>>>>> +                if (value & X2APIC_ENABLE) {
>>>>>> +                        u32 id = kvm_apic_id(apic);
>>>>>> +                        u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
>>>>>> +                        kvm_apic_set_ldr(apic, ldr);
>>>>>> +                        kvm_x86_ops->set_virtual_x2apic_mode(vcpu, 
>>>>>> true);
>>>>>> +                } else
>>>>>> +                        kvm_x86_ops->set_virtual_x2apic_mode(vcpu, 
>>>>>> false);
>>>>>>          }
>>>>>> +
>>>>>> +        vcpu->arch.apic_base = value;
>>>>>>          apic->base_address = apic->vcpu->arch.apic_base &
>>>>>>                               MSR_IA32_APICBASE_BASE;
>>>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>>>>>> index d29d3cd..38407e9 100644
>>>>>> --- a/arch/x86/kvm/svm.c
>>>>>> +++ b/arch/x86/kvm/svm.c
>>>>>> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct
>>> kvm_vcpu
>>>>> *vcpu, int tpr, int irr)
>>>>>>                  set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
>>>>>>  }
>>>>>> +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool
>>>>>> set) +{ +        return; +} +
>>>>>>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) {     struct
>>>>>>  vcpu_svm *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static struct
>>>>>>  kvm_x86_ops svm_x86_ops = {     .enable_nmi_window =
>>>>>>  enable_nmi_window,      .enable_irq_window = enable_irq_window,
>>>>>>          .update_cr8_intercept = update_cr8_intercept,
>>>>>> +        .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
>>>>>> 
>>>>>>          .set_tss_addr = svm_set_tss_addr,
>>>>>>          .get_tdp_level = get_npt_level,
>>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>>>>> index 0403634..847022e 100644
>>>>>> --- a/arch/x86/kvm/vmx.c
>>>>>> +++ b/arch/x86/kvm/vmx.c
>>>>>> @@ -767,6 +767,12 @@ static inline bool
>>>>> cpu_has_vmx_virtualize_apic_accesses(void)
>>>>>>                  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>>>>  }
>>>>>> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
>>>>>> +{
>>>>>> +        return vmcs_config.cpu_based_2nd_exec_ctrl &
>>>>>> +                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
>>>>>> +}
>>>>>> +
>>>>>>  static inline bool cpu_has_vmx_apic_register_virt(void)
>>>>>>  {
>>>>>>          return vmcs_config.cpu_based_2nd_exec_ctrl &
>>>>>> @@ -2543,6 +2549,7 @@ static __init int setup_vmcs_config(struct
>>>>> vmcs_config *vmcs_conf)
>>>>>>          if (_cpu_based_exec_control &
>>>>>>  CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {                min2 = 0;       
>>>>>>         opt2 =
>>>>>>  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
>>>>>>  +                       SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
>>>>>>                          SECONDARY_EXEC_WBINVD_EXITING |         
>>>>>> SECONDARY_EXEC_ENABLE_VPID |
>>>>>>                          SECONDARY_EXEC_ENABLE_EPT | @@ -3724,7 +3731,45 
>>>>>> @@ static void
>>>>>>  free_vpid(struct vcpu_vmx *vmx)         spin_unlock(&vmx_vpid_lock); }
>>>>>> -static void __vmx_disable_intercept_for_msr(unsigned long
>>>>>> *msr_bitmap, u32 msr) +#define MSR_TYPE_R        1 +#define MSR_TYPE_W   
>>>>>> 2
>>>>>> +static void __vmx_disable_intercept_for_msr(unsigned long
>>>>>> *msr_bitmap, +                                   u32 msr, int type) +{ + 
>>>>>> int f = sizeof(unsigned
>>>>>> long); + +       if (!cpu_has_vmx_msr_bitmap()) +                return; 
>>>>>> + +     /* +     *
>>>>>> See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals +       
>>>>>> * have the write-low and read-high bitmap offsets the wrong way
>>>>>> round. +  * We can control MSRs 0x00000000-0x00001fff and
>>>>>> 0xc0000000-0xc0001fff. +  */ +   if (msr <= 0x1fff) { +          if 
>>>>>> (type &
>>>>>> MSR_TYPE_R) +                    /* read-low */ +        
>>>>>> __clear_bit(msr, msr_bitmap +
>>>>>> 0x000 / f); + +          if (type & MSR_TYPE_W) +                        
>>>>>> /* write-low */ +
>>>>>>  __clear_bit(msr, msr_bitmap + 0x800 / f); + +   } else if ((msr >=
>>>>>> 0xc0000000) && (msr <= 0xc0001fff)) { +          msr &= 0x1fff; +        
>>>>>> if (type
>>>>>> & MSR_TYPE_R) +  /* read-high */ +                       
>>>>>> __clear_bit(msr, msr_bitmap +
>>>>>> 0x400 / f); + +          if (type & MSR_TYPE_W) +                        
>>>>>> /* write-high */ +
>>>>>>  __clear_bit(msr, msr_bitmap + 0xc00 / f); + +   } +} + +static void
>>>>>> __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +              
>>>>>>                 u32
>>>>>> msr, int type)
>>>>>>  {
>>>>>>          int f = sizeof(unsigned long);
>>>>>> @@ -3737,20 +3782,75 @@ static void
>>>>> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
>>>>>>           * We can control MSRs 0x00000000-0x00001fff and
>>>>>>  0xc0000000-0xc0001fff.   */     if (msr <= 0x1fff) {
>>>>>> -                __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
>>>>>> -                __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low 
>>>>>> */
>>>>>> +                if (type & MSR_TYPE_R)
>>>>>> +                        /* read-low */
>>>>>> +                        __set_bit(msr, msr_bitmap + 0x000 / f);
>>>>>> +
>>>>>> +                if (type & MSR_TYPE_W)
>>>>>> +                        /* write-low */
>>>>>> +                        __set_bit(msr, msr_bitmap + 0x800 / f);
>>>>>> +
>>>>>>          } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
>>>>>>                  msr &= 0x1fff;
>>>>>> -                __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high 
>>>>>> */
>>>>>> -                __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high 
>>>>>> */
>>>>>> +                if (type & MSR_TYPE_R)
>>>>>> +                        /* read-high */
>>>>>> +                        __set_bit(msr, msr_bitmap + 0x400 / f);
>>>>>> +
>>>>>> +                if (type & MSR_TYPE_W)
>>>>>> +                        /* write-high */
>>>>>> +                        __set_bit(msr, msr_bitmap + 0xc00 / f);
>>>>>> +
>>>>>>          } } + static void vmx_disable_intercept_for_msr(u32 msr, bool
>>>>>>  longmode_only) {        if (!longmode_only)
>>>>>> -                __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, 
>>>>>> msr);
>>>>>> -        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
>>>>>> +                __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +
>>>>>>                                          msr, MSR_TYPE_R | MSR_TYPE_W);
>>>>>> +        __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>>>> +                                                msr, MSR_TYPE_R | 
>>>>>> MSR_TYPE_W); +} + +static void
>>>>>> vmx_intercept_for_msr_read(u32 msr, bool longmode_only, +                
>>>>>>         bool
>>>>>> set) +{ +        if (!longmode_only) { +         if (set) +
>>>>>>  __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, +                 
>>>>>>                 msr,
>>>>>> MSR_TYPE_R); +           else +
>>>>>>  __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +                
>>>>>>                 msr,
>>>>>> MSR_TYPE_R); + + } +     if (set)
>>>>>> +                __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>>>> +                                msr, MSR_TYPE_R); +     else
>>>>>> +                __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>>>> +                                msr, MSR_TYPE_R); +} + +static void
>>>>>> vmx_intercept_for_msr_write(u32 msr, bool longmode_only, +               
>>>>>>                         bool
>>>>>> set) +{ +        if (!longmode_only) { +         if (set) +
>>>>>>  __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, +                 
>>>>>>                 msr,
>>>>>> MSR_TYPE_W); +           else +
>>>>>>  __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, +                
>>>>>>                 msr,
>>>>>> MSR_TYPE_W); + + } +     if (set)
>>>>>> +                __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>>>> +                                msr, MSR_TYPE_W); +     else
>>>>>> +                __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
>>>>>> +                                msr, MSR_TYPE_W);
>>>>>>  }
>>>>>>  
>>>>>>  /*
>>>>>> @@ -3848,6 +3948,7 @@ static u32 vmx_secondary_exec_control(struct
>>>>> vcpu_vmx *vmx)
>>>>>>                  exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;     
>>>>>> if
>>>>>>  (!enable_apicv_reg)             exec_control &=
>>>>>>  ~SECONDARY_EXEC_APIC_REGISTER_VIRT; +   exec_control &=
>>>>>>  ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;         return
>>> exec_control; }
>>>>>> @@ -6103,6 +6204,52 @@ static void update_cr8_intercept(struct
>>> kvm_vcpu
>>>>> *vcpu, int tpr, int irr)
>>>>>>          vmcs_write32(TPR_THRESHOLD, irr);
>>>>>>  }
>>>>>> +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu,
>>>>>> bool set) +{ +   u32 exec_control, sec_exec_control; +   int msr;
>>>>>> +        struct vcpu_vmx *vmx = to_vmx(vcpu); + +        /* There is not 
>>>>>> point to
>>>>>> enable virtualize x2apic without enable +         * apicv*/ +    if
>>>>>> (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg) +
>>>>>>  return; + +     if (set) { +            exec_control =
>>>>>> vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); +                /* virtualize 
>>>>>> x2apic
>>>>>> mode relies on tpr shadow */ +           if (!(exec_control &
>>>>>> CPU_BASED_TPR_SHADOW)) +                 return; +       } + +   
>>>>>> sec_exec_control =
>>>>>> vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + +      if (set) {
>>>>>> +                sec_exec_control &= 
>>>>>> ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>>>> +                sec_exec_control |= 
>>>>>> SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +    }
>>>>>> else { +         sec_exec_control &=
>>>>>> ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +                if
>>>>>> (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) +
>>>>>>  sec_exec_control |= +   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>>>>>> +        } +     vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 
>>>>>> sec_exec_control); +
>>>>>> +        for (msr = 0x800; msr <= 0x8ff; msr++) +
>>>>>>  vmx_intercept_for_msr_read(msr, false, !set); + +       if (set) { +    
>>>>>>         /*
>>>>>> According SDM, in x2apic mode, the whole id reg is used. +               
>>>>>>  * But
>>>>>> in KVM, it only use the highest eight bits. Need to +             * 
>>>>>> intercept
>>>>>> it*/ +   vmx_intercept_for_msr_read(0x802, false, true); +               
>>>>>> /* TMCCT
>>>>>> */ +     vmx_intercept_for_msr_read(0x839, false, true); +       } +     
>>>>>> /* TPR
>>>>>> */ +     vmx_intercept_for_msr_write(0x808, false, !set); +} +
>>>>>>  static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) {    u32
>>>>>>  exit_intr_info; @@ -7366,6 +7513,7 @@ static struct kvm_x86_ops
>>>>>>  vmx_x86_ops = {         .enable_nmi_window = enable_nmi_window,
>>>>>>          .enable_irq_window = enable_irq_window,         
>>>>>> .update_cr8_intercept =
>>>>>>  update_cr8_intercept,
>>>>>> +        .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
>>>>>> 
>>>>>>          .set_tss_addr = vmx_set_tss_addr,
>>>>>>          .get_tdp_level = get_ept_level,
>>>>>> --
>>>>>> 1.7.1
>>>>> 
>>>>> --
>>>>>                   Gleb.
>>>> 
>>>> 
>>>> Best regards,
>>>> Yang
>>> 
>>> --
>>>                     Gleb.
>> 
>> 
>> Best regards,
>> Yang
>> 
> 
> --
>                       Gleb.


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to