On Thu, Dec 13, 2012 at 03:29:40PM +0800, Yang Zhang wrote:
> From: Yang Zhang <[email protected]>
>
> Posted Interrupt allows APIC interrupts to inject into guest directly
> without any vmexit.
>
> - When delivering a interrupt to guest, if target vcpu is running,
> update Posted-interrupt requests bitmap and send a notification event
> to the vcpu. Then the vcpu will handle this interrupt automatically,
> without any software involvemnt.
>
> - If target vcpu is not running or there already a notification event
> pending in the vcpu, do nothing. The interrupt will be handled by
> next vm entry.
>
> Signed-off-by: Yang Zhang <[email protected]>
> ---
> arch/x86/include/asm/entry_arch.h | 1 +
> arch/x86/include/asm/hw_irq.h | 1 +
> arch/x86/include/asm/irq.h | 1 +
> arch/x86/include/asm/irq_vectors.h | 4 +
> arch/x86/include/asm/kvm_host.h | 3 +
> arch/x86/include/asm/vmx.h | 4 +
> arch/x86/kernel/entry_64.S | 2 +
> arch/x86/kernel/irq.c | 25 +++++++
> arch/x86/kernel/irqinit.c | 2 +
> arch/x86/kvm/lapic.c | 16 +++-
> arch/x86/kvm/lapic.h | 1 +
> arch/x86/kvm/vmx.c | 133
> +++++++++++++++++++++++++++++++++---
> 12 files changed, 180 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/include/asm/entry_arch.h
> b/arch/x86/include/asm/entry_arch.h
> index 40afa00..7b0a29e 100644
> --- a/arch/x86/include/asm/entry_arch.h
> +++ b/arch/x86/include/asm/entry_arch.h
> @@ -18,6 +18,7 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
> #endif
>
> BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
> +BUILD_INTERRUPT(posted_intr_ipi, POSTED_INTR_VECTOR)
>
> /*
> * every pentium local APIC has two 'local interrupts', with a
> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
> index eb92a6e..ee61af3 100644
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -28,6 +28,7 @@
> /* Interrupt handlers registered during init_IRQ */
> extern void apic_timer_interrupt(void);
> extern void x86_platform_ipi(void);
> +extern void posted_intr_ipi(void);
> extern void error_interrupt(void);
> extern void irq_work_interrupt(void);
>
> diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
> index ba870bb..cff9933 100644
> --- a/arch/x86/include/asm/irq.h
> +++ b/arch/x86/include/asm/irq.h
> @@ -30,6 +30,7 @@ extern void irq_force_complete_move(int);
> #endif
>
> extern void (*x86_platform_ipi_callback)(void);
> +extern void (*posted_intr_callback)(void);
> extern void native_init_IRQ(void);
> extern bool handle_irq(unsigned irq, struct pt_regs *regs);
>
> diff --git a/arch/x86/include/asm/irq_vectors.h
> b/arch/x86/include/asm/irq_vectors.h
> index 1508e51..8f2e383 100644
> --- a/arch/x86/include/asm/irq_vectors.h
> +++ b/arch/x86/include/asm/irq_vectors.h
> @@ -102,6 +102,10 @@
> */
> #define X86_PLATFORM_IPI_VECTOR 0xf7
>
> +#ifdef CONFIG_HAVE_KVM
Users of POSTED_INTR_VECTOR are not under ifdef, which means compilation
will fail with kvm disabled. Test it please.
> +#define POSTED_INTR_VECTOR 0xf2
> +#endif
> +
> /*
> * IRQ work vector:
> */
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7e26d1a..82423a8 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -700,6 +700,9 @@ struct kvm_x86_ops {
> int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
> void (*update_irq)(struct kvm_vcpu *vcpu);
> void (*update_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector, bool set);
> + int (*has_posted_interrupt)(struct kvm_vcpu *vcpu);
> + int (*send_nv)(struct kvm_vcpu *vcpu, int vector);
> + void (*update_irr)(struct kvm_vcpu *vcpu);
> int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> int (*get_tdp_level)(void);
> u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 1003341..7b9e1d0 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -152,6 +152,7 @@
> #define PIN_BASED_EXT_INTR_MASK 0x00000001
> #define PIN_BASED_NMI_EXITING 0x00000008
> #define PIN_BASED_VIRTUAL_NMIS 0x00000020
> +#define PIN_BASED_POSTED_INTR 0x00000080
>
> #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
> #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
> @@ -174,6 +175,7 @@
> /* VMCS Encodings */
> enum vmcs_field {
> VIRTUAL_PROCESSOR_ID = 0x00000000,
> + POSTED_INTR_NV = 0x00000002,
> GUEST_ES_SELECTOR = 0x00000800,
> GUEST_CS_SELECTOR = 0x00000802,
> GUEST_SS_SELECTOR = 0x00000804,
> @@ -208,6 +210,8 @@ enum vmcs_field {
> VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
> APIC_ACCESS_ADDR = 0x00002014,
> APIC_ACCESS_ADDR_HIGH = 0x00002015,
> + POSTED_INTR_DESC_ADDR = 0x00002016,
> + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
> EPT_POINTER = 0x0000201a,
> EPT_POINTER_HIGH = 0x0000201b,
> EOI_EXIT_BITMAP0 = 0x0000201c,
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index b51b2c7..d06eea1 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -1160,6 +1160,8 @@ apicinterrupt LOCAL_TIMER_VECTOR \
> apic_timer_interrupt smp_apic_timer_interrupt
> apicinterrupt X86_PLATFORM_IPI_VECTOR \
> x86_platform_ipi smp_x86_platform_ipi
> +apicinterrupt POSTED_INTR_VECTOR \
> + posted_intr_ipi smp_posted_intr_ipi
>
> apicinterrupt THRESHOLD_APIC_VECTOR \
> threshold_interrupt smp_threshold_interrupt
> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> index e4595f1..781d324 100644
> --- a/arch/x86/kernel/irq.c
> +++ b/arch/x86/kernel/irq.c
> @@ -22,6 +22,9 @@ atomic_t irq_err_count;
>
> /* Function pointer for generic interrupt vector handling */
> void (*x86_platform_ipi_callback)(void) = NULL;
> +/* Function pointer for posted interrupt vector handling */
> +void (*posted_intr_callback)(void) = NULL;
> +EXPORT_SYMBOL_GPL(posted_intr_callback);
>
> /*
> * 'what should we do if we get a hw irq event on an illegal vector'.
> @@ -228,6 +231,28 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
> set_irq_regs(old_regs);
> }
>
> +/*
> + * Handler for POSTED_INTERRUPT_VECTOR.
> + */
> +void smp_posted_intr_ipi(struct pt_regs *regs)
> +{
> + struct pt_regs *old_regs = set_irq_regs(regs);
> +
> + ack_APIC_irq();
> +
> + irq_enter();
> +
> + exit_idle();
> +
> + if (posted_intr_callback)
> + posted_intr_callback();
> +
The callback does nothing. Drop it.
> + irq_exit();
> +
> + set_irq_regs(old_regs);
> +}
> +
> +
> EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
>
> #ifdef CONFIG_HOTPLUG_CPU
> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
> index 6e03b0d..d15ca4f 100644
> --- a/arch/x86/kernel/irqinit.c
> +++ b/arch/x86/kernel/irqinit.c
> @@ -205,6 +205,8 @@ static void __init apic_intr_init(void)
>
> /* IPI for X86 platform specific use */
> alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
> + /* IPI for posted interrupt use */
> + alloc_intr_gate(POSTED_INTR_VECTOR, posted_intr_ipi);
>
> /* IPI vectors for APIC spurious and error interrupts */
> alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 2109a6a..d660b9d 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -350,6 +350,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic
> *apic)
> if (!apic->irr_pending)
> return -1;
>
> + kvm_x86_ops->update_irr(apic->vcpu);
> result = apic_search_irr(apic);
> ASSERT(result == -1 || result >= 16);
>
> @@ -725,18 +726,25 @@ static int __apic_accept_irq(struct kvm_lapic *apic,
> int delivery_mode,
> if (trig_mode) {
> apic_debug("level trig mode for vector %d", vector);
> apic_set_vector(vector, apic->regs + APIC_TMR);
> - } else
> + } else {
> apic_clear_vector(vector, apic->regs + APIC_TMR);
> -
Why do PI only for edge-triggered interrupts?
> + if (kvm_x86_ops->has_posted_interrupt(vcpu)) {
Drop has_posted_interrupt(). Just call send_nv() directly. And give it
a more descriptive name.
> + result = 1;
> + apic->irr_pending = true;
This is always true with vid anyway.
> + kvm_x86_ops->send_nv(vcpu, vector);
> + goto out;
> + }
> + }
> result = !apic_test_and_set_irr(vector, apic);
> - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
> - trig_mode, vector, !result);
> if (!result) {
> if (trig_mode)
> apic_debug("level trig mode repeatedly for "
> "vector %d", vector);
> break;
> }
> +out:
> + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
> + trig_mode, vector, !result);
No trace if !result now.
>
> kvm_make_request(KVM_REQ_EVENT, vcpu);
> kvm_vcpu_kick(vcpu);
What is the point of sending a notification vector to the vcpu if you kick
it out of guest mode immediately after?
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 10e3f66..0f8361e 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -42,6 +42,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
> int kvm_cpu_has_extint(struct kvm_vcpu *v);
> int kvm_cpu_get_extint(struct kvm_vcpu *v);
> int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir);
> void kvm_lapic_reset(struct kvm_vcpu *vcpu);
> u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
> void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 6b6bd03..07dbde6 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -31,6 +31,7 @@
> #include <linux/ftrace_event.h>
> #include <linux/slab.h>
> #include <linux/tboot.h>
> +#include <linux/interrupt.h>
> #include "kvm_cache_regs.h"
> #include "x86.h"
>
> @@ -86,6 +87,8 @@ module_param(fasteoi, bool, S_IRUGO);
> static bool __read_mostly enable_apicv_reg_vid = 1;
> module_param(enable_apicv_reg_vid, bool, S_IRUGO);
>
> +static bool __read_mostly enable_apicv_pi = 1;
> +module_param(enable_apicv_pi, bool, S_IRUGO);
> /*
> * If nested=1, nested virtualization is supported, i.e., guests may use
> * VMX and be a hypervisor for its own guests. If nested=0, guests may not
> @@ -369,6 +372,35 @@ struct nested_vmx {
> struct page *apic_access_page;
> };
>
> +#define POSTED_INTR_ON 0
> +/* Posted-Interrupt Descriptor */
> +struct pi_desc {
> + u32 pir[8]; /* Posted interrupt requested */
> + union {
> + struct {
> + u8 on:1,
> + rsvd:7;
> + } control;
> + u32 rsvd[8];
> + } u;
> +} __aligned(64);
> +
> +static void pi_clear_on(struct pi_desc *pi_desc)
> +{
> + clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
> +}
> +
> +static u8 pi_test_and_set_on(struct pi_desc *pi_desc)
> +{
> + return test_and_set_bit(POSTED_INTR_ON,
> + (unsigned long *)&pi_desc->u.control);
> +}
> +
> +static void pi_set_pir(int vector, struct pi_desc *pi_desc)
> +{
> + set_bit(vector, (unsigned long *)pi_desc->pir);
> +}
> +
> struct vcpu_vmx {
> struct kvm_vcpu vcpu;
> unsigned long host_rsp;
> @@ -435,6 +467,9 @@ struct vcpu_vmx {
> u8 eoi_exitmap_changed;
> u32 eoi_exit_bitmap[8];
>
> + /* Posted interrupt descriptor */
> + struct pi_desc *pi;
> +
I am not convinced we should try to save 46 bytes here when !pi.
> /* Support for a guest hypervisor (nested VMX) */
> struct nested_vmx nested;
> };
> @@ -779,6 +814,11 @@ static inline bool
> cpu_has_vmx_virtual_intr_delivery(void)
> SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
> }
>
> +static inline bool cpu_has_vmx_posted_intr(void)
> +{
> + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
> +}
> +
> static inline bool cpu_has_vmx_flexpriority(void)
> {
> return cpu_has_vmx_tpr_shadow() &&
> @@ -2475,12 +2515,6 @@ static __init int setup_vmcs_config(struct vmcs_config
> *vmcs_conf)
> u32 _vmexit_control = 0;
> u32 _vmentry_control = 0;
>
> - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
> - opt = PIN_BASED_VIRTUAL_NMIS;
> - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
> - &_pin_based_exec_control) < 0)
> - return -EIO;
> -
> min = CPU_BASED_HLT_EXITING |
> #ifdef CONFIG_X86_64
> CPU_BASED_CR8_LOAD_EXITING |
> @@ -2554,6 +2588,17 @@ static __init int setup_vmcs_config(struct vmcs_config
> *vmcs_conf)
> &_vmexit_control) < 0)
> return -EIO;
>
> + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
> + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
> + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
> + &_pin_based_exec_control) < 0)
> + return -EIO;
> +
> + if (!(_cpu_based_2nd_exec_control &
> + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
> + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
> + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
> +
> min = 0;
> opt = VM_ENTRY_LOAD_IA32_PAT;
> if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
> @@ -2739,6 +2784,9 @@ static __init int hardware_setup(void)
> if (enable_apicv_reg_vid)
> kvm_x86_ops->update_cr8_intercept = NULL;
>
> + if (!cpu_has_vmx_posted_intr() || !enable_apicv_reg_vid)
> + enable_apicv_pi = 0;
> +
> if (nested)
> nested_vmx_setup_ctls_msrs();
>
> @@ -3904,6 +3952,57 @@ static void ept_set_mmio_spte_mask(void)
> kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
> }
>
> +static void pi_handler(void)
> +{
> + ;
> +}
> +
> +static int vmx_has_posted_interrupt(struct kvm_vcpu *vcpu)
> +{
> + return irqchip_in_kernel(vcpu->kvm) && enable_apicv_pi;
> +}
> +
> +static int vmx_send_nv(struct kvm_vcpu *vcpu,
> + int vector)
> +{
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> + pi_set_pir(vector, vmx->pi);
> + if (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) {
> + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
> POSTED_INTR_VECTOR);
> + return 1;
> + }
> + return 0;
> +}
Return value is not checked by the caller.
> +
> +static void vmx_update_irr(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct kvm_lapic *apic = vcpu->arch.apic;
> + unsigned int i, old, new, val, irr_off;
Spaces instead of tabs. Not only here. Run through checkpatch.
> +
> + if (!enable_apicv_pi)
> + return;
> +
Why no check for pi->on before syncing?
> + for (i = 0; i <= 7; i++) {
> + if (vmx->pi->pir[i]) {
> + irr_off = APIC_IRR + i * 0x10;
> + do {
> + old = kvm_apic_get_reg(apic, irr_off);
> + new = old | vmx->pi->pir[i];
> + val = cmpxchg((u32 *)(apic->regs + irr_off),
> old, new);
> + } while (unlikely (val != old));
> + vmx->pi->pir[i] = 0;
> + }
> + }
> +}
> +
> +static void free_pi(struct vcpu_vmx *vmx)
> +{
> + if (enable_apicv_pi)
> + kfree(vmx->pi);
> +}
> +
> /*
> * Sets up the vmcs for emulated real mode.
> */
> @@ -3913,6 +4012,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
> unsigned long a;
> #endif
> int i;
> + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
> u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
>
> /* I/O */
> @@ -3925,8 +4025,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
> vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
>
> /* Control */
> - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> - vmcs_config.pin_based_exec_ctrl);
> + if (!enable_apicv_pi)
> + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
> +
> + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_exec_ctrl);
>
> vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
>
> @@ -3944,6 +4046,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
> vmcs_write16(GUEST_INTR_STATUS, 0);
> }
>
> + if (enable_apicv_pi) {
> + vmx->pi = kmalloc(sizeof(struct pi_desc),
> + GFP_KERNEL | __GFP_ZERO);
kzalloc().
> + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
> + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi)));
> + }
> +
> if (ple_gap) {
> vmcs_write32(PLE_GAP, ple_gap);
> vmcs_write32(PLE_WINDOW, ple_window);
> @@ -6220,6 +6329,8 @@ static void vmx_update_irq(struct kvm_vcpu *vcpu)
> vmx->eoi_exit_bitmap[index]);
> vmx->eoi_exitmap_changed = 0;
> }
> + if (enable_apicv_pi)
> + pi_clear_on(vmx->pi);
> }
>
> static void vmx_update_eoi_exitmap(struct kvm_vcpu *vcpu,
> @@ -6626,6 +6737,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
>
> free_vpid(vmx);
> free_nested(vmx);
> + free_pi(vmx);
> free_loaded_vmcs(vmx->loaded_vmcs);
> kfree(vmx->guest_msrs);
> kvm_vcpu_uninit(vcpu);
> @@ -7520,8 +7632,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
> .enable_irq_window = enable_irq_window,
> .update_cr8_intercept = update_cr8_intercept,
> .has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
> + .has_posted_interrupt = vmx_has_posted_interrupt,
> .update_irq = vmx_update_irq,
> .update_eoi_exitmap = vmx_update_eoi_exitmap,
> + .send_nv = vmx_send_nv,
> + .update_irr = vmx_update_irr,
>
SVM?
> .set_tss_addr = vmx_set_tss_addr,
> .get_tdp_level = get_ept_level,
> @@ -7618,7 +7733,7 @@ static int __init vmx_init(void)
> /* SELF-IPI */
> vmx_disable_intercept_for_msr_write(0x83f, false);
> }
> -
> + posted_intr_callback = pi_handler;
> if (enable_ept) {
> kvm_mmu_set_mask_ptes(0ull,
> (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
> --
> 1.7.1
--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html