Gleb Natapov wrote on 2012-11-25:
> On Wed, Nov 21, 2012 at 04:09:39PM +0800, Yang Zhang wrote:
>> Posted Interrupt allows vAPICV interrupts to inject into guest directly
>> without any vmexit.
>> 
>> - When delivering a interrupt to guest, if target vcpu is running,
>>   update Posted-interrupt requests bitmap and send a notification event
>>   to the vcpu. Then the vcpu will handle this interrupt automatically,
>>   without any software involvement.
> Looks like you are allocating one irq vector per vcpu per pcpu and then
> migrating or reallocating it when a vcpu moves from one pcpu to another.
> This is not scalable, and irq migration slows things down.
> What's wrong with allocating one global vector for posted interrupt
> during vmx initialization and use it for all vcpus?

Consider the following situation:
If vcpu A is running when a notification event that belongs to vcpu B arrives,
then, since the vector matches vcpu A's notification vector, the event will
be consumed by vcpu A (even though it does nothing with it) and the interrupt
cannot be handled in time.

>> - If target vcpu is not running or there already a notification event
>>   pending in the vcpu, do nothing. The interrupt will be handled by old
>>   way.
>> Signed-off-by: Yang Zhang <[email protected]>
>> ---
>>  arch/x86/include/asm/kvm_host.h |    3 + arch/x86/include/asm/vmx.h   
>>    |    4 + arch/x86/kernel/apic/io_apic.c  |  138
>>  ++++++++++++++++++++++++++++ arch/x86/kvm/lapic.c            |   31
>>  ++++++- arch/x86/kvm/lapic.h            |    8 ++ arch/x86/kvm/vmx.c  
>>             |  192 +++++++++++++++++++++++++++++++++++++--
>>  arch/x86/kvm/x86.c              |    2 + include/linux/kvm_host.h     
>>    |    1 + virt/kvm/kvm_main.c             |    2 + 9 files changed,
>>  372 insertions(+), 9 deletions(-)
>> diff --git a/arch/x86/include/asm/kvm_host.h
>> b/arch/x86/include/asm/kvm_host.h index 8e07a86..1145894 100644 ---
>> a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -683,9 +683,12 @@ struct kvm_x86_ops {
>>      void (*enable_irq_window)(struct kvm_vcpu *vcpu);       void
>>  (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);   int
>>  (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu); + int
>>  (*has_posted_interrupt)(struct kvm_vcpu *vcpu);     void
>>  (*update_irq)(struct kvm_vcpu *vcpu);       void (*set_eoi_exitmap)(struct
>>  kvm_vcpu *vcpu, int vector,                         int need_eoi, int 
>> global);
>> +    int (*send_nv)(struct kvm_vcpu *vcpu, int vector);
>> +    void (*pi_migrate)(struct kvm_vcpu *vcpu);
>>      int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>>      int (*get_tdp_level)(void);
>>      u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index 1003341..7b9e1d0 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -152,6 +152,7 @@
>>  #define PIN_BASED_EXT_INTR_MASK                 0x00000001
>>  #define PIN_BASED_NMI_EXITING                   0x00000008
>>  #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
>> +#define PIN_BASED_POSTED_INTR                   0x00000080
>> 
>>  #define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002 #define
>>  VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200 @@ -174,6 +175,7 @@
>>  /* VMCS Encodings */ enum vmcs_field {      VIRTUAL_PROCESSOR_ID          
>>   = 0x00000000, +    POSTED_INTR_NV                  = 0x00000002,
>>      GUEST_ES_SELECTOR               = 0x00000800,   GUEST_CS_SELECTOR     
>>           = 0x00000802,      GUEST_SS_SELECTOR               = 0x00000804,
>>  @@ -208,6 +210,8 @@ enum vmcs_field {       VIRTUAL_APIC_PAGE_ADDR_HIGH    
>>  = 0x00002013,       APIC_ACCESS_ADDR                = 0x00002014,
>>      APIC_ACCESS_ADDR_HIGH           = 0x00002015,
>> +    POSTED_INTR_DESC_ADDR           = 0x00002016,
>> +    POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
>>      EPT_POINTER                     = 0x0000201a,
>>      EPT_POINTER_HIGH                = 0x0000201b,
>>      EOI_EXIT_BITMAP0                = 0x0000201c,
>> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
>> index 1817fa9..97cb8ee 100644
>> --- a/arch/x86/kernel/apic/io_apic.c
>> +++ b/arch/x86/kernel/apic/io_apic.c
>> @@ -3277,6 +3277,144 @@ int arch_setup_dmar_msi(unsigned int irq)
>>  }
>>  #endif
>> +static int
>> +pi_set_affinity(struct irq_data *data, const struct cpumask *mask,
>> +                  bool force)
>> +{
>> +    unsigned int dest;
>> +    struct irq_cfg *cfg = (struct irq_cfg *)data->chip_data;
>> +    if (cpumask_equal(cfg->domain, mask))
>> +            return IRQ_SET_MASK_OK;
>> +
>> +    if (__ioapic_set_affinity(data, mask, &dest))
>> +            return -1;
>> +
>> +    return IRQ_SET_MASK_OK;
>> +}
>> +
>> +static void pi_mask(struct irq_data *data)
>> +{
>> +    ;
>> +}
>> +
>> +static void pi_unmask(struct irq_data *data)
>> +{
>> +    ;
>> +}
>> +
>> +static struct irq_chip pi_chip = {
>> +    .name       = "POSTED-INTR",
>> +    .irq_ack    = ack_apic_edge,
>> +    .irq_unmask = pi_unmask,
>> +    .irq_mask   = pi_mask,
>> +    .irq_set_affinity   = pi_set_affinity,
>> +};
>> +
>> +int arch_pi_migrate(int irq, int cpu)
>> +{
>> +    struct irq_data *data = irq_get_irq_data(irq);
>> +    struct irq_cfg *cfg;
>> +    struct irq_desc *desc = irq_to_desc(irq);
>> +    unsigned long flags;
>> +
>> +    if (!desc)
>> +            return -EINVAL;
>> +
>> +    cfg = irq_cfg(irq);
>> +    if (cpumask_equal(cfg->domain, cpumask_of(cpu)))
>> +            return cfg->vector;
>> +
>> +    irq_set_affinity(irq, cpumask_of(cpu));
>> +    raw_spin_lock_irqsave(&desc->lock, flags);
>> +    irq_move_irq(data);
>> +    raw_spin_unlock_irqrestore(&desc->lock, flags);
>> +
>> +    if (cfg->move_in_progress)
>> +            send_cleanup_vector(cfg);
>> +    return cfg->vector;
>> +}
>> +EXPORT_SYMBOL_GPL(arch_pi_migrate);
>> +
>> +static int arch_pi_create_irq(const struct cpumask *mask)
>> +{
>> +    int node = cpu_to_node(0);
>> +    unsigned int irq_want;
>> +    struct irq_cfg *cfg;
>> +    unsigned long flags;
>> +    unsigned int ret = 0;
>> +    int irq;
>> +
>> +    irq_want = nr_irqs_gsi;
>> +
>> +    irq = alloc_irq_from(irq_want, node);
>> +    if (irq < 0)
>> +            return 0;
>> +    cfg = alloc_irq_cfg(irq_want, node);
> s/irq_want/irq.
> 
>> +    if (!cfg) {
>> +            free_irq_at(irq, NULL);
>> +            return 0;
>> +    }
>> +
>> +    raw_spin_lock_irqsave(&vector_lock, flags);
>> +    if (!__assign_irq_vector(irq, cfg, mask))
>> +            ret = irq;
>> +    raw_spin_unlock_irqrestore(&vector_lock, flags);
>> +
>> +    if (ret) {
>> +            irq_set_chip_data(irq, cfg);
>> +            irq_clear_status_flags(irq, IRQ_NOREQUEST);
>> +    } else {
>> +            free_irq_at(irq, cfg);
>> +    }
>> +    return ret;
>> +}
> 
> This function is mostly cut&paste of create_irq_nr().

Yes, this function allows a vector to be allocated from a specified cpu.
 
>> +
>> +int arch_pi_alloc_irq(void *vmx)
>> +{
>> +    int irq, cpu = smp_processor_id();
>> +    struct irq_cfg *cfg;
>> +
>> +    irq = arch_pi_create_irq(cpumask_of(cpu));
>> +    if (!irq) {
>> +            pr_err("Posted Interrupt: no free irq\n");
>> +            return -EINVAL;
>> +    }
>> +    irq_set_handler_data(irq, vmx);
>> +    irq_set_chip_and_handler_name(irq, &pi_chip, handle_edge_irq, "edge");
>> +    irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
>> +    irq_set_affinity(irq, cpumask_of(cpu));
>> +
>> +    cfg = irq_cfg(irq);
>> +    if (cfg->move_in_progress)
>> +            send_cleanup_vector(cfg);
>> +
>> +    return irq;
>> +}
>> +EXPORT_SYMBOL_GPL(arch_pi_alloc_irq);
>> +
>> +void arch_pi_free_irq(unsigned int irq, void *vmx)
>> +{
>> +    if (irq) {
>> +            irq_set_handler_data(irq, NULL);
>> +            /* This will mask the irq */
>> +            free_irq(irq, vmx);
>> +            destroy_irq(irq);
>> +    }
>> +}
>> +EXPORT_SYMBOL_GPL(arch_pi_free_irq);
>> +
>> +int arch_pi_get_vector(unsigned int irq)
>> +{
>> +    struct irq_cfg *cfg;
>> +
>> +    if (!irq)
>> +            return -EINVAL;
>> +
>> +    cfg = irq_cfg(irq);
>> +    return cfg->vector;
>> +}
>> +EXPORT_SYMBOL_GPL(arch_pi_get_vector);
>> +
>>  #ifdef CONFIG_HPET_TIMER
>>  
>>  static int hpet_msi_set_affinity(struct irq_data *data,
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index af48361..04220de 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -656,7 +656,7 @@ void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int
> vector,
>>  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>                           int vector, int level, int trig_mode)
>>  {
>> -    int result = 0;
>> +    int result = 0, send;
>>      struct kvm_vcpu *vcpu = apic->vcpu;
>>  
>>      switch (delivery_mode) {
>> @@ -674,6 +674,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int
> delivery_mode,
>>              } else {
>>                      apic_clear_vector(vector, apic->regs + APIC_TMR);
>>                      kvm_set_eoi_exitmap(vcpu, vector, 0, 0);
>> +                    if (kvm_apic_pi_enabled(vcpu)) {
> Provide send_nv() that returns 0 if pi is disabled.
> 
>> +                            send = kvm_x86_ops->send_nv(vcpu, vector);
>> +                            if (send) {
> No need "send" variable here.

ok.
 
>> +                                    result = 1;
>> +                                    break;
>> +                            }
>> +                    }
>>              }
>>  
>>              result = !apic_test_and_set_irr(vector, apic);
>> @@ -1541,6 +1548,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
>> 
>>      if (kvm_x86_ops->has_virtual_interrupt_delivery(vcpu))
>>              apic->vid_enabled = true;
>> +
>> +    if (kvm_x86_ops->has_posted_interrupt(vcpu))
>> +            apic->pi_enabled = true;
>> +
> This is global state, no need per apic variable.
> 
>>      return 0;
>>  nomem_free_apic:
>>      kfree(apic);
>> @@ -1575,6 +1586,24 @@ int kvm_apic_get_highest_irr(struct kvm_vcpu
> *vcpu)
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_apic_get_highest_irr);
>> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir)
>> +{
>> +    struct kvm_lapic *apic = vcpu->arch.apic;
>> +    unsigned int *reg;
>> +    unsigned int i;
>> +
>> +    if (!apic || !apic_enabled(apic))
> Use kvm_vcpu_has_lapic() instead of !apic.

ok.
 
>> +            return;
>> +
>> +    for (i = 0; i <= 7; i++) {
>> +            reg = apic->regs + APIC_IRR + i * 0x10;
>> +            *reg |= pir[i];
> Non atomic access to IRR. Other threads may set bit there concurrently.
Ok.
 
>> +            pir[i] = 0;
>> +    }
> Should set apic->irr_pending to true when setting irr bit.
Right. I will add it in the next version.

>> +    return;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
>> +
>>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
>>  {
>>      u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
>> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
>> index 2503a64..ad35868 100644
>> --- a/arch/x86/kvm/lapic.h
>> +++ b/arch/x86/kvm/lapic.h
>> @@ -21,6 +21,7 @@ struct kvm_lapic {
>>      struct kvm_vcpu *vcpu;  bool irr_pending;       bool vid_enabled; +     
>> bool
>>  pi_enabled;         /* Number of bits set in ISR. */        s16 isr_count;  
>> /* The
>>  highest vector set in ISR; if -1 - invalid, must scan ISR. */ @@ -43,6
>>  +44,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); int
>>  kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_cpu_get_extint(struct
>>  kvm_vcpu *v); int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
>>  +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir);
>>  void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64
>>  kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void
>>  kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
>> @@ -94,6 +96,12 @@ static inline bool kvm_apic_vid_enabled(struct kvm_vcpu
> *vcpu)
>>      return apic->vid_enabled;
>>  }
>> +static inline bool kvm_apic_pi_enabled(struct kvm_vcpu *vcpu)
>> +{
>> +    struct kvm_lapic *apic = vcpu->arch.apic;
>> +    return apic->pi_enabled;
>> +}
>> +
>>  int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
>>  void kvm_lapic_init(void);
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index f6ef090..6448b96 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -31,6 +31,7 @@
>>  #include <linux/ftrace_event.h> #include <linux/slab.h> #include
>>  <linux/tboot.h> +#include <linux/interrupt.h> #include
>>  "kvm_cache_regs.h" #include "x86.h"
>> @@ -89,6 +90,8 @@ module_param(enable_apicv_reg, bool, S_IRUGO);
>>  static bool __read_mostly enable_apicv_vid = 0;
>>  module_param(enable_apicv_vid, bool, S_IRUGO);
>> +static bool __read_mostly enable_apicv_pi = 0;
>> +module_param(enable_apicv_pi, bool, S_IRUGO);
>>  /*
>>   * If nested=1, nested virtualization is supported, i.e., guests may use
>>   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
>> @@ -372,6 +375,44 @@ struct nested_vmx {
>>      struct page *apic_access_page;
>>  };
>> +/* Posted-Interrupt Descriptor */
>> +struct pi_desc {
>> +    u32 pir[8];     /* Posted interrupt requested */
>> +    union {
>> +            struct {
>> +                    u8  on:1,
>> +                        rsvd:7;
>> +            } control;
>> +            u32 rsvd[8];
>> +    } u;
>> +} __aligned(64);
>> +
>> +#define POSTED_INTR_ON  0
>> +u8 pi_test_on(struct pi_desc *pi_desc)
>> +{
>> +    return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
>> +}
>> +void pi_set_on(struct pi_desc *pi_desc)
>> +{
>> +    set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
>> +}
>> +
>> +void pi_clear_on(struct pi_desc *pi_desc)
>> +{
>> +    clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
>> +}
>> +
>> +u8 pi_test_and_set_on(struct pi_desc *pi_desc)
>> +{
>> +    return test_and_set_bit(POSTED_INTR_ON,
>> +                    (unsigned long *)&pi_desc->u.control);
>> +}
>> +
>> +void pi_set_pir(int vector, struct pi_desc *pi_desc)
>> +{
>> +    set_bit(vector, (unsigned long *)pi_desc->pir);
>> +}
>> +
>>  struct vcpu_vmx {   struct kvm_vcpu       vcpu;     unsigned long        
>>  host_rsp; @@ -439,6 +480,11 @@ struct vcpu_vmx {    u64
>>  eoi_exit_bitmap[4];         u64 eoi_exit_bitmap_global[4];
>> +    /* Posted interrupt descriptor */
>> +    struct pi_desc *pi;
>> +    u32 irq;
>> +    u32 vector;
>> +
>>      /* Support for a guest hypervisor (nested VMX) */
>>      struct nested_vmx nested;
>>  };
>> @@ -698,6 +744,11 @@ static u64 host_efer;
>> 
>>  static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
>> +int arch_pi_get_vector(unsigned int irq);
>> +int arch_pi_alloc_irq(struct vcpu_vmx *vmx);
>> +void arch_pi_free_irq(unsigned int irq, struct vcpu_vmx *vmx);
>> +int arch_pi_migrate(int irq, int cpu);
>> +
>>  /*
>>   * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
>>   * away by decrementing the array size.
>> @@ -783,6 +834,11 @@ static inline bool
> cpu_has_vmx_virtual_intr_delivery(void)
>>              SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>>  }
>> +static inline bool cpu_has_vmx_posted_intr(void)
>> +{
>> +    return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
>> +}
>> +
>>  static inline bool cpu_has_vmx_flexpriority(void)
>>  {
>>      return cpu_has_vmx_tpr_shadow() &&
>> @@ -1555,6 +1611,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu,
> int cpu)
>>              struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
>>              unsigned long sysenter_esp;
>> +            if (enable_apicv_pi && to_vmx(vcpu)->pi)
>> +                    pi_set_on(to_vmx(vcpu)->pi);
>> +
> Why?

This point means the vcpu is starting to migrate, so we should suppress
notification events until the migration ends.

>> +            kvm_make_request(KVM_REQ_POSTED_INTR, vcpu);
>> +
>>              kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);              
>> local_irq_disable();
>>              list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, @@ -1582,6
>>  +1643,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)          
>> vcpu->cpu
>>  = -1;               kvm_cpu_vmxoff();       }
>> +    if (enable_apicv_pi && to_vmx(vcpu)->pi)
>> +            pi_set_on(to_vmx(vcpu)->pi);
> Why?

When the vcpu is scheduled out, there is no need to send a notification event
to it; setting the PIR and waking it up is enough.

>>  }
>>  
>>  static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
>> @@ -2451,12 +2514,6 @@ static __init int setup_vmcs_config(struct
> vmcs_config *vmcs_conf)
>>      u32 _vmexit_control = 0;
>>      u32 _vmentry_control = 0;
>> -    min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
>> -    opt = PIN_BASED_VIRTUAL_NMIS;
>> -    if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
>> -                            &_pin_based_exec_control) < 0)
>> -            return -EIO;
>> -
>>      min = CPU_BASED_HLT_EXITING |
>>  #ifdef CONFIG_X86_64
>>            CPU_BASED_CR8_LOAD_EXITING |
>> @@ -2531,6 +2588,17 @@ static __init int setup_vmcs_config(struct
> vmcs_config *vmcs_conf)
>>                              &_vmexit_control) < 0)
>>              return -EIO;
>> +    min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
>> +    opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
>> +    if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
>> +                            &_pin_based_exec_control) < 0)
>> +            return -EIO;
>> +
>> +    if (!(_cpu_based_2nd_exec_control &
>> +            SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
>> +            !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
>> +            _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
>> +
>>      min = 0;        opt = VM_ENTRY_LOAD_IA32_PAT;   if 
>> (adjust_vmx_controls(min,
>>  opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2715,6 +2783,9 @@ static __init int
>>  hardware_setup(void)        if (!cpu_has_vmx_virtual_intr_delivery())
>>              enable_apicv_vid = 0;
>> +    if (!cpu_has_vmx_posted_intr() || !x2apic_enabled())
> In nested guest x2apic may be enabled without irq remapping. Check for
> irq remapping here.

Posted interrupts are not available in the nested case, so we don't need to
check IR here.

> 
>> +            enable_apicv_pi = 0;
>> +
>>      if (nested)
>>              nested_vmx_setup_ctls_msrs();
>> @@ -3881,6 +3952,93 @@ static void ept_set_mmio_spte_mask(void)
>>      kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
>>  }
>> +irqreturn_t pi_handler(int irq, void *data)
>> +{
>> +    struct vcpu_vmx *vmx = data;
>> +
>> +    kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
>> +    kvm_vcpu_kick(&vmx->vcpu);
>> +
>> +    return IRQ_HANDLED;
>> +}
>> +
>> +static int vmx_has_posted_interrupt(struct kvm_vcpu *vcpu)
>> +{
>> +    return irqchip_in_kernel(vcpu->kvm) && enable_apicv_pi;
>> +}
>> +
>> +static void vmx_pi_migrate(struct kvm_vcpu *vcpu)
>> +{
>> +    int ret = 0;
>> +    struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +
>> +    if (!enable_apicv_pi)
>> +            return ;
>> +
>> +    preempt_disable();
>> +    local_irq_disable();
>> +    if (!vmx->irq) {
>> +            ret = arch_pi_alloc_irq(vmx);
>> +            if (ret < 0) {
>> +                    vmx->irq = -1;
>> +                    goto out;
>> +            }
>> +            vmx->irq = ret;
>> +
>> +            ret = request_irq(vmx->irq, pi_handler, IRQF_NO_THREAD,
>> +                                    "Posted Interrupt", vmx);
>> +            if (ret) {
>> +                    vmx->irq = -1;
>> +                    goto out;
>> +            }
>> +
>> +            ret = arch_pi_get_vector(vmx->irq);
>> +    } else
>> +            ret = arch_pi_migrate(vmx->irq, smp_processor_id());
>> +
>> +    if (ret < 0) {
>> +            vmx->irq = -1;
>> +            goto out;
>> +    } else {
>> +            vmx->vector = ret;
>> +            vmcs_write16(POSTED_INTR_NV, vmx->vector);
>> +            pi_clear_on(vmx->pi);
>> +    }
>> +out:
>> +    local_irq_enable();
>> +    preempt_enable();
>> +    return ;
>> +}
>> +
>> +static int vmx_send_nv(struct kvm_vcpu *vcpu,
>> +            int vector)
>> +{
>> +    struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +
>> +    if (unlikely(vmx->irq == -1))
>> +            return 0;
>> +
>> +    if (vcpu->cpu == smp_processor_id()) {
>> +            pi_set_on(vmx->pi);
> Why? You clear this bit anyway in vmx_update_irq() during guest entry.
This case means the target vcpu is already in vmx non-root mode. It will then
consume the interrupt on the next vm entry, so we don't need to send a
notification event from another cpu; just updating the PIR is enough.

>> +            return 0; +     } + +   pi_set_pir(vector, vmx->pi); +  if
>> (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) {
>> +            apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), vmx->vector); +    
>>         return
>> 1; + } +     return 0; +} + +static void free_pi(struct vcpu_vmx *vmx) +{
>> +    if (enable_apicv_pi) { +                kfree(vmx->pi);
>> +            arch_pi_free_irq(vmx->irq, vmx); +      } +} +
>>  /*
>>   * Sets up the vmcs for emulated real mode.
>>   */
>> @@ -3890,6 +4048,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>>      unsigned long a;
>>  #endif
>>      int i;
>> +    u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
>> 
>>      /* I/O */       vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); @@
>>  -3901,8 +4060,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>>      vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
>>  
>>      /* Control */
>> -    vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
>> -            vmcs_config.pin_based_exec_ctrl);
>> +    if (!enable_apicv_pi)
>> +            pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
>> +
>> +    vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_exec_ctrl);
>> 
>>      vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
> vmx_exec_control(vmx));
>> 
>> @@ -3920,6 +4081,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>>              vmcs_write16(GUEST_INTR_STATUS, 0);
>>      }
>> +    if (enable_apicv_pi) {
>> +            vmx->pi = kmalloc(sizeof(struct pi_desc),
>> +                            GFP_KERNEL | __GFP_ZERO);
>> +            vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi)));
>> +    }
>> +
>>      if (ple_gap) {          vmcs_write32(PLE_GAP, ple_gap);
>>              vmcs_write32(PLE_WINDOW, ple_window); @@ -6161,6 +6328,11 @@ 
>> static
>>  void vmx_update_irq(struct kvm_vcpu *vcpu)  if (!enable_apicv_vid)
>>              return ;
>> +    if (enable_apicv_pi) {
>> +            kvm_apic_update_irr(vcpu, (unsigned int *)vmx->pi->pir);
>> +            pi_clear_on(vmx->pi);
> Why do you do that? Isn't VMX process posted interrupts on vmentry if "on" bit
> is set?
> 
>> +    }
>> +
>>      vector = kvm_apic_get_highest_irr(vcpu);
>>      if (vector == -1)
>>              return;
>> @@ -6586,6 +6758,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
>> 
>>      free_vpid(vmx);         free_nested(vmx); +     free_pi(vmx);
>>      free_loaded_vmcs(vmx->loaded_vmcs);     kfree(vmx->guest_msrs);
>>      kvm_vcpu_uninit(vcpu); @@ -7483,8 +7656,11 @@ static struct
>>  kvm_x86_ops vmx_x86_ops = {         .enable_irq_window = enable_irq_window,
>>      .update_cr8_intercept = update_cr8_intercept,
>>      .has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
>>  +   .has_posted_interrupt = vmx_has_posted_interrupt,       .update_irq =
>>  vmx_update_irq,     .set_eoi_exitmap = vmx_set_eoi_exitmap,
>> +    .send_nv = vmx_send_nv,
>> +    .pi_migrate = vmx_pi_migrate,
>> 
>>      .set_tss_addr = vmx_set_tss_addr,
>>      .get_tdp_level = get_ept_level,
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 8b8de3b..f035267 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -5250,6 +5250,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>      bool req_immediate_exit = 0;
>>  
>>      if (vcpu->requests) {
>> +            if (kvm_check_request(KVM_REQ_POSTED_INTR, vcpu))
>> +                    kvm_x86_ops->pi_migrate(vcpu);
>>              if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
>>                      kvm_mmu_unload(vcpu);
>>              if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index ecc5543..f8d8d34 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -107,6 +107,7 @@ static inline bool is_error_page(struct page *page)
>>  #define KVM_REQ_IMMEDIATE_EXIT    15
>>  #define KVM_REQ_PMU               16
>>  #define KVM_REQ_PMI               17
>> +#define KVM_REQ_POSTED_INTR       18
>> 
>>  #define KVM_USERSPACE_IRQ_SOURCE_ID         0
>>  #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID    1
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index be70035..05baf1c 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -1625,6 +1625,8 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
>>                      smp_send_reschedule(cpu);
>>      put_cpu();
>>  }
>> +EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
>> +
>>  #endif /* !CONFIG_S390 */
>>  
>>  void kvm_resched(struct kvm_vcpu *vcpu)
>> --
>> 1.7.1
> 
> --
>                       Gleb.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to