This patch finally enables MSI-X. What we need for MSI-X: 1. Intercept one page in the MMIO region of the device, so that we can read the guest's desired MSI-X table and set up the real one.
2. An IRQ FIFO. Now one device can have more than one interrupt, and they are all handled by one workqueue structure, so we need to identify them. irq_fifo provides a mechanism to handle more than one interrupt at a time. 3. A mapping from host IRQ to guest GSI, as well as from guest GSI to the real MSI/MSI-X message address/data. We use the same entry number for the host and the guest here, so that it's easy to find the correlated guest GSI. What we lack for now: 1. The PCI spec says that nothing may share the MMIO page containing the MSI-X table, except the pending bits. The patch ignores the pending bits as a first step (so they are always 0 - no pending). 2. The PCI spec allows the MSI-X table to be changed dynamically. That means the OS can enable MSI-X, then mask one MSI-X entry, modify it, and unmask it. The patch doesn't support this, and Linux also doesn't work in this way. 3. The patch doesn't implement MSI-X mask-all or masking of a single entry. I would implement the former in drivers/pci/msi.c later; and for a single entry, I would add a hook in the intercepted MMIO's read/write handler later.
Signed-off-by: Sheng Yang <[email protected]> --- include/linux/kvm.h | 4 + include/linux/kvm_host.h | 12 ++- virt/kvm/kvm_main.c | 246 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 255 insertions(+), 7 deletions(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index c45b08d..0531838 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -397,6 +397,7 @@ struct kvm_trace_rec { #define KVM_CAP_GSI_MSG 24 #if defined(CONFIG_X86) #define KVM_CAP_INTERCEPTED_MMIO 25 +#define KVM_CAP_DEVICE_MSIX 26 #endif /* @@ -552,6 +553,9 @@ struct kvm_assigned_irq { #define KVM_DEV_IRQ_ASSIGN_MSI_ACTION (1 << 0) #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 << 1) +#define KVM_DEV_IRQ_ASSIGN_MSIX_ACTION (1 << 2) +#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX (1 << 3) +#define KVM_DEV_IRQ_ASSIGN_MASK_MSIX (1 << 4) struct kvm_assigned_gsi_msg { __u32 gsi; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e4d6b99..c0d29aa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -319,16 +319,24 @@ struct kvm_assigned_dev_kernel { int assigned_dev_id; int host_busnr; int host_devfn; - int host_irq; bool host_irq_disabled; - int guest_irq; #define KVM_ASSIGNED_DEV_IRQ_FIFO_LEN 0x100 struct kfifo *irq_fifo; spinlock_t irq_fifo_lock; + int entries_nr; + union { + int host_irq; + struct msix_entry *host_msix_entries; + }; + union { + int guest_irq; + struct msix_entry *guest_msix_entries; + }; #define KVM_ASSIGNED_DEV_GUEST_INTX (1 << 0) #define KVM_ASSIGNED_DEV_GUEST_MSI (1 << 1) #define KVM_ASSIGNED_DEV_HOST_INTX (1 << 8) #define KVM_ASSIGNED_DEV_HOST_MSI (1 << 9) +#define KVM_ASSIGNED_DEV_MSIX ((1 << 2) | (1 << 10)) unsigned long irq_requested_type; int irq_source_id; struct kvm_intercepted_mmio msix_mmio; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a5a9763..b453279 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -100,6 +100,41 @@ static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct 
list_head *h return NULL; } +static u32 find_gsi_from_host_irq(struct kvm_assigned_dev_kernel *assigned_dev, + int irq) +{ + int i; + int entry; + u32 gsi; + struct msix_entry *host_msix_entries, *guest_msix_entries; + + host_msix_entries = assigned_dev->host_msix_entries; + guest_msix_entries = assigned_dev->guest_msix_entries; + + entry = -1; + gsi = 0; + for (i = 0; i < assigned_dev->entries_nr; i++) + if (irq == (host_msix_entries + i)->vector) { + entry = (host_msix_entries + i)->entry; + break; + } + if (entry < 0) { + printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); + return 0; + } + for (i = 0; i < assigned_dev->entries_nr; i++) + if (entry == (guest_msix_entries + i)->entry) { + gsi = (guest_msix_entries + i)->vector; + break; + } + if (gsi == 0) { + printk(KERN_WARNING "Fail to find correlated MSI-X gsi!\n"); + return 0; + } + + return gsi; +} + static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) { struct kvm_assigned_dev_kernel *assigned_dev; @@ -119,12 +154,16 @@ handle_irq: kfifo_get(assigned_dev->irq_fifo, (unsigned char *)&irq, sizeof(int)); - gsi = assigned_dev->guest_irq; + if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) + gsi = find_gsi_from_host_irq(assigned_dev, irq); + else + gsi = assigned_dev->guest_irq; kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, gsi, 1); - if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) { - enable_irq(assigned_dev->host_irq); + if ((assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) || + (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX)) { + enable_irq(irq); assigned_dev->host_irq_disabled = false; } @@ -196,11 +235,23 @@ static void kvm_free_assigned_irq(struct kvm *kvm, */ kvm_put_kvm(kvm); - free_irq(assigned_dev->host_irq, (void *)assigned_dev); + if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) { + int i; + for (i = 0; i < assigned_dev->entries_nr; i++) + free_irq((assigned_dev->host_msix_entries + 
i)->vector, + (void *)assigned_dev); + } else + free_irq(assigned_dev->host_irq, (void *)assigned_dev); if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) pci_disable_msi(assigned_dev->dev); + if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) { + kfree(assigned_dev->host_msix_entries); + kfree(assigned_dev->guest_msix_entries); + pci_disable_msix(assigned_dev->dev); + } + assigned_dev->irq_requested_type = 0; } @@ -325,6 +376,144 @@ static int assigned_device_update_msi(struct kvm *kvm, adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI; return 0; } + +static int assigned_device_update_msix_mmio(struct kvm *kvm, + struct kvm_assigned_dev_kernel *adev) +{ + struct kvm_intercepted_mmio *mmio = &adev->msix_mmio; + void * va; + u16 entries_nr = 0, entries_max_nr; + int pos, i, r = 0; + u32 msg_addr, msg_upper_addr, msg_data, msg_ctrl; + struct kvm_gsi_msg gsi_msg; + + pos = pci_find_capability(adev->dev, PCI_CAP_ID_MSIX); + if (!pos) + return -EINVAL; + + pci_read_config_word(adev->dev, pos + PCI_MSIX_FLAGS, &entries_max_nr); + entries_max_nr &= PCI_MSIX_FLAGS_QSIZE; + + va = kmap(mmio->page); + /* Get the usable entry number for allocating */ + for (i = 0; i < entries_max_nr; i++) { + memcpy(&msg_ctrl, va + i * 16 + 12, 4); + if (msg_ctrl & PCI_MSIX_FLAGS_BITMASK) + continue; + memcpy(&msg_data, va + i * 16 + 8, 4); + /* Ignore unused entry even it's unmasked */ + if (msg_data == 0) + continue; + entries_nr ++; + } + + adev->entries_nr = entries_nr; + adev->host_msix_entries = kmalloc(sizeof(struct msix_entry) * entries_nr, + GFP_KERNEL); + if (!adev->host_msix_entries) { + printk(KERN_ERR "no memory for host msix entries!\n"); + return -ENOMEM; + } + adev->guest_msix_entries = kmalloc(sizeof(struct msix_entry) * entries_nr, + GFP_KERNEL); + if (!adev->guest_msix_entries) { + printk(KERN_ERR "no memory for guest msix entries!\n"); + return -ENOMEM; + } + + entries_nr = 0; + for (i = 0; i < entries_max_nr; i++) { + if (entries_nr >= 
adev->entries_nr) + break; + memcpy(&msg_ctrl, va + i * 16 + 12, 4); + if (msg_ctrl & PCI_MSIX_FLAGS_BITMASK) + continue; + memcpy(&msg_addr, va + i * 16, 4); + memcpy(&msg_upper_addr, va + i * 16 + 4, 4); + memcpy(&msg_data, va + i * 16 + 8, 4); + if (msg_data == 0) + continue; + + gsi_msg.gsi = 0; + gsi_msg.msg.address_lo = msg_addr; + gsi_msg.msg.address_hi = msg_upper_addr; + gsi_msg.msg.data = msg_data; + r = kvm_update_gsi_msg(kvm, &gsi_msg); + if (r) { + printk(KERN_ERR "Fail to update gsi_msg for MSIX!"); + break; + } + (adev->guest_msix_entries + entries_nr)->entry = i; + (adev->guest_msix_entries + entries_nr)->vector = gsi_msg.gsi; + (adev->host_msix_entries + entries_nr)->entry = i; + entries_nr ++; + } + kunmap(mmio->page); + + return r; +} + +static int assigned_device_update_msix(struct kvm *kvm, + struct kvm_assigned_dev_kernel *adev, + struct kvm_assigned_irq *airq) +{ + /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ + int i, r; + + adev->ack_notifier.gsi = -1; + + if (irqchip_in_kernel(kvm)) { + if (airq->flags & KVM_DEV_IRQ_ASSIGN_MASK_MSIX) { + printk(KERN_WARNING + "kvm: unsupported mask MSI-X, flags 0x%x!\n", + airq->flags); + return 0; + } + + if (!(airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX)) { + /* Guest disable MSI-X */ + kvm_free_assigned_irq(kvm, adev); + if (msi2intx) { + pci_enable_msi(adev->dev); + if (adev->dev->msi_enabled) + return assigned_device_update_msi(kvm, + adev, airq); + } + return assigned_device_update_intx(kvm, adev, airq); + } + + kvm_free_assigned_irq(kvm, adev); + + /* + * We only scan device (emulated) MMIO when guest want to enable + * MSI-X, and don't support dynamically add MSI-X entry for now + */ + r = assigned_device_update_msix_mmio(kvm, adev); + if (r) + return r; + + r = pci_enable_msix(adev->dev, adev->host_msix_entries, + adev->entries_nr); + if (r) { + printk(KERN_ERR "Fail to enable MSI-X feature!\n"); + return r; + } + + for (i = 0; i < adev->entries_nr; i++) { + r = 
request_irq((adev->host_msix_entries + i)->vector, + kvm_assigned_dev_intr, 0, + "kvm_assigned_msix_device", + (void *)adev); + if (r) + return r; + } + } + + adev->irq_requested_type |= KVM_ASSIGNED_DEV_MSIX; + + return 0; +} + #endif static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, @@ -370,6 +559,16 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, } } +#ifdef CONFIG_X86 + if (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_MSIX_ACTION) { + r = assigned_device_update_msix(kvm, match, assigned_irq); + if (r) { + printk(KERN_WARNING "kvm: failed to execute " + "MSI-X action!\n"); + goto out_release; + } + } else +#endif if ((!msi2intx && (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION)) || (msi2intx && match->dev->msi_enabled)) { @@ -413,6 +612,33 @@ out_release: return r; } +static int assigned_dev_register_msix_mmio(struct kvm_assigned_dev_kernel *adev) +{ + int pos = pci_find_capability(adev->dev, PCI_CAP_ID_MSIX); + u32 msix_table_entry; + int bar_nr; + + adev->msix_mmio.dev = adev; + INIT_HLIST_NODE(&adev->msix_mmio.link); + + if (!pos) + return 0; + + if (pci_read_config_dword(adev->dev, pos + 4, + &msix_table_entry) != PCIBIOS_SUCCESSFUL) + return -EFAULT; + + bar_nr = msix_table_entry & PCI_MSIX_FLAGS_BIRMASK; + + /* Get table offset */ + msix_table_entry &= ~PCI_MSIX_FLAGS_BIRMASK; + adev->msix_mmio.pfn = (pci_resource_start(adev->dev, bar_nr) + + msix_table_entry) >> PAGE_SHIFT; + + kvm_register_intercept_mmio(adev->kvm, &adev->msix_mmio); + return 0; +} + static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) { @@ -475,15 +701,25 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, if (!match->irq_fifo) goto out_list_del; + /* + * Check for MSI-X capability, if device got, we need to intercept + * its MSI-X table accessing + */ + if (assigned_dev_register_msix_mmio(match)) + goto out_fifo_del; + if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { r = kvm_iommu_map_guest(kvm, match); if (r) - goto 
out_fifo_del; + goto out_unregister; } out: mutex_unlock(&kvm->lock); return r; +out_unregister: + if (pci_find_capability(dev, PCI_CAP_ID_MSIX)) + kvm_unregister_intercept_mmio(&match->msix_mmio); out_fifo_del: kfifo_free(match->irq_fifo); out_list_del: -- 1.5.4.5 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to [email protected] More majordomo info at http://vger.kernel.org/majordomo-info.html
