[PATCH 2/4] KVM: pci device assignment

Ben-Ami Yassour Tue, 22 Jul 2008 05:16:08 -0700

Based on a patch from: Amit Shah <[EMAIL PROTECTED]>

This patch adds support for handling PCI devices that are assigned to
the guest.


The device to be assigned to the guest is registered in the host kernel
and interrupt delivery is handled. If a device is already assigned, or
the device driver for it is still loaded on the host, the device
assignment fails and -EBUSY is returned to userspace.

Devices that share their interrupt line are not supported at the moment.

By itself, this patch will not make devices work within the guest.
The VT-d extension is required to enable the device to perform DMA.
Another alternative is PVDMA.

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
Signed-off-by: Ben-Ami Yassour <[EMAIL PROTECTED]>
Signed-off-by: Weidong Han <[EMAIL PROTECTED]>
---
 arch/x86/kvm/i8259.c       |    5 +-
 arch/x86/kvm/irq.c         |    2 +-
 arch/x86/kvm/irq.h         |    3 +-
 arch/x86/kvm/x86.c         |  215 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-x86/kvm.h      |    1 +
 include/asm-x86/kvm_host.h |   20 ++++
 include/asm-x86/kvm_para.h |    1 -
 include/linux/kvm.h        |   21 +++++
 virt/kvm/ioapic.c          |    4 +
 virt/kvm/ioapic.h          |    1 +
 10 files changed, 269 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 55e179a..d6793f0 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -159,9 +159,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
                s->irr &= ~(1 << irq);
 }
 
-int kvm_pic_read_irq(struct kvm_pic *s)
+int kvm_pic_read_irq(struct kvm *kvm)
 {
        int irq, irq2, intno;
+       struct kvm_pic *s = pic_irqchip(kvm);
 
        irq = pic_get_irq(&s->pics[0]);
        if (irq >= 0) {
@@ -186,6 +187,8 @@ int kvm_pic_read_irq(struct kvm_pic *s)
                irq = 7;
                intno = s->pics[0].irq_base + irq;
        }
+       kvm_notify_acked_irq(kvm, irq);
+
        pic_update_irq(s);
 
        return intno;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 9091195..3c508af 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
                if (kvm_apic_accept_pic_intr(v)) {
                        s = pic_irqchip(v->kvm);
                        s->output = 0;          /* PIC */
-                       vector = kvm_pic_read_irq(s);
+                       vector = kvm_pic_read_irq(v->kvm);
                }
        }
        return vector;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 95fe718..479a3d2 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,11 +63,12 @@ struct kvm_pic {
        void *irq_request_opaque;
        int output;             /* intr from master PIC */
        struct kvm_io_device dev;
+       void (*ack_notifier)(void *opaque, int irq);
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
 void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
+int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
 
 static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 376ef73..d9aa931 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4,10 +4,12 @@
  * derived from drivers/kvm/kvm_main.c
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
  *
  * Authors:
  *   Avi Kivity   <[EMAIL PROTECTED]>
  *   Yaniv Kamay  <[EMAIL PROTECTED]>
+ *   Amit Shah    <[EMAIL PROTECTED]>
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
@@ -23,8 +25,10 @@
 #include "x86.h"
 
 #include <linux/clocksource.h>
+#include <linux/interrupt.h>
 #include <linux/kvm.h>
 #include <linux/fs.h>
+#include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/mman.h>
@@ -98,6 +102,204 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { NULL }
 };
 
+struct kvm_assigned_dev_kernel
+*kvm_find_assigned_dev(struct list_head *head,
+                    struct kvm_assigned_dev_info *assigned_dev_info)
+{
+       struct list_head *ptr;
+       struct kvm_assigned_dev_kernel *match;
+
+       list_for_each(ptr, head) {
+               match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+               if ((match->host.busnr == assigned_dev_info->busnr) &&
+                   (match->host.devfn == assigned_dev_info->devfn))
+                       return match;
+       }
+       return NULL;
+}
+
+static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+{
+       struct kvm_assigned_dev_work *int_work;
+
+       int_work = container_of(work, struct kvm_assigned_dev_work, work);
+
+       /* This is taken to safely inject irq inside the guest. When
+        * the interrupt injection (or the ioapic code) uses a
+        * finer-grained lock, update this
+        */
+       mutex_lock(&int_work->assigned_dev->kvm->lock);
+       kvm_set_irq(int_work->assigned_dev->kvm,
+                   int_work->assigned_dev->guest.irq[0], 1);
+       mutex_unlock(&int_work->assigned_dev->kvm->lock);
+       kvm_put_kvm(int_work->assigned_dev->kvm);
+}
+
+/* FIXME: Implement the OR logic needed to make shared interrupts on
+ * this line behave properly
+ */
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+       struct kvm_assigned_dev_kernel *assigned_dev =
+               (struct kvm_assigned_dev_kernel *) dev_id;
+
+       kvm_get_kvm(assigned_dev->kvm);
+       schedule_work(&assigned_dev->int_work.work);
+       disable_irq_nosync(irq);
+       return IRQ_HANDLED;
+}
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+       struct kvm_assigned_dev_kernel *dev;
+
+       if (kian->gsi == -1)
+               return;
+
+       dev = container_of(kian, struct kvm_assigned_dev_kernel,
+                          ack_notifier);
+       kvm_set_irq(dev->kvm, dev->guest.irq[0], 0);
+       enable_irq(dev->host.irq[0]);
+}
+
+static int
+kvm_vm_ioctl_device_assignment(struct kvm *kvm,
+                              struct kvm_assigned_dev *assigned_dev)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *match;
+       struct pci_dev *dev;
+
+       if (assigned_dev->host.num_valid_irqs != 1) {
+               printk(KERN_INFO "%s: Unsupported number of irqs %d\n",
+                      __func__, assigned_dev->host.num_valid_irqs);
+               return -EINVAL;
+       }
+
+       mutex_lock(&kvm->lock);
+
+       /* Check if this is a request to update the irq of the device
+        * in the guest (BIOS/ kernels can dynamically reprogram irq
+        * numbers).  This also protects us from adding the same
+        * device twice.
+        */
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     &assigned_dev->host);
+       if (match) {
+               match->guest.irq[0] = assigned_dev->guest.irq[0];
+               match->ack_notifier.gsi = assigned_dev->guest.irq[0];
+               goto out;
+       }
+
+       match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+       if (match == NULL) {
+               printk(KERN_INFO "%s: Couldn't allocate memory\n",
+                      __func__);
+               r = -ENOMEM;
+               goto out;
+       }
+       dev = pci_get_bus_and_slot(assigned_dev->host.busnr,
+                                  assigned_dev->host.devfn);
+       if (!dev) {
+               printk(KERN_INFO "%s: host device not found\n", __func__);
+               r = -EINVAL;
+               goto out_free;
+       }
+       if (pci_enable_device(dev)) {
+               printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+               r = -EBUSY;
+               goto out_put;
+       }
+       r = pci_request_regions(dev, "kvm_assigned_device");
+       if (r) {
+               printk(KERN_INFO "%s: Could not get access to device regions\n",
+                      __func__);
+               goto out_disable;
+       }
+       match->guest.busnr = assigned_dev->guest.busnr;
+       match->guest.devfn = assigned_dev->guest.devfn;
+       match->host.busnr = assigned_dev->host.busnr;
+       match->host.devfn = assigned_dev->host.devfn;
+       match->dev = dev;
+
+       INIT_WORK(&match->int_work.work,
+                 kvm_assigned_dev_interrupt_work_handler);
+
+       match->kvm = kvm;
+       match->int_work.assigned_dev = match;
+
+       list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+       if (irqchip_in_kernel(kvm)) {
+               match->guest.irq[0] = assigned_dev->guest.irq[0];
+               match->host.irq[0] = dev->irq;
+               match->ack_notifier.gsi = assigned_dev->guest.irq[0];
+               match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+               kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
+
+               /* Even though this is PCI, we don't want to use shared
+                * interrupts. Sharing host devices with guest-assigned devices
+                * on the same interrupt line is not a happy situation: there
+                * are going to be long delays in accepting, acking, etc.
+                */
+               if (request_irq(dev->irq, kvm_assigned_dev_intr, 0,
+                               "kvm_assigned_device", (void *)match)) {
+                       printk(KERN_INFO "%s: couldn't allocate irq for pv "
+                              "device\n", __func__);
+                       r = -EIO;
+                       goto out_list_del;
+               }
+       }
+
+
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+out_list_del:
+       list_del(&match->list);
+       pci_release_regions(dev);
+out_disable:
+       pci_disable_device(dev);
+out_put:
+       pci_dev_put(dev);
+out_free:
+       kfree(match);
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+static void kvm_free_assigned_devices(struct kvm *kvm)
+{
+       struct list_head *ptr, *ptr2;
+       struct kvm_assigned_dev_kernel *assigned_dev;
+
+       list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+               assigned_dev = list_entry(ptr,
+                                         struct kvm_assigned_dev_kernel,
+                                         list);
+
+               if (irqchip_in_kernel(kvm) && assigned_dev->host.irq[0])
+                       free_irq(assigned_dev->host.irq[0],
+                                (void *)assigned_dev);
+
+               kvm_unregister_irq_ack_notifier(kvm,
+                                               &assigned_dev->ack_notifier);
+
+               if (cancel_work_sync(&assigned_dev->int_work.work))
+                       /* We had pending work. That means we will have to take
+                        * care of kvm_put_kvm.
+                        */
+                       kvm_put_kvm(kvm);
+
+               pci_release_regions(assigned_dev->dev);
+               pci_disable_device(assigned_dev->dev);
+               pci_dev_put(assigned_dev->dev);
+
+               list_del(&assigned_dev->list);
+               kfree(assigned_dev);
+       }
+}
 
 unsigned long segment_base(u16 selector)
 {
@@ -1746,6 +1948,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
                break;
        }
+       case KVM_UPDATE_ASSIGNED_DEVICE: {
+               struct kvm_assigned_dev assigned_dev;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+                       goto out;
+               r = kvm_vm_ioctl_device_assignment(kvm, &assigned_dev);
+               if (r)
+                       goto out;
+               break;
+       }
        case KVM_GET_PIT: {
                struct kvm_pit_state ps;
                r = -EFAULT;
@@ -3925,6 +4138,7 @@ struct  kvm *kvm_arch_create_vm(void)
                return ERR_PTR(-ENOMEM);
 
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+       INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
        return kvm;
 }
@@ -3957,6 +4171,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+       kvm_free_assigned_devices(kvm);
        kvm_free_pit(kvm);
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h
index 8f13749..12b4b25 100644
--- a/include/asm-x86/kvm.h
+++ b/include/asm-x86/kvm.h
@@ -208,4 +208,5 @@ struct kvm_pit_channel_state {
 struct kvm_pit_state {
        struct kvm_pit_channel_state channels[3];
 };
+
 #endif
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index e2864e6..34eb3e7 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -325,6 +325,25 @@ struct kvm_irq_ack_notifier {
        void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
 };
 
+/* For assigned devices, we schedule work in the system workqueue to
+ * inject interrupts into the guest when an interrupt occurs on the
+ * physical device and also when the guest acks the interrupt.
+ */
+struct kvm_assigned_dev_work {
+       struct work_struct work;
+       struct kvm_assigned_dev_kernel *assigned_dev;
+};
+
+struct kvm_assigned_dev_kernel {
+       struct kvm_irq_ack_notifier ack_notifier;
+       struct list_head list;
+       struct kvm_assigned_dev_info guest;
+       struct kvm_assigned_dev_info host;
+       struct kvm_assigned_dev_work int_work;
+       struct pci_dev *dev;
+       struct kvm *kvm;
+};
+
 struct kvm_arch{
        int naliases;
        struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
@@ -337,6 +356,7 @@ struct kvm_arch{
         * Hash table of struct kvm_mmu_page.
         */
        struct list_head active_mmu_pages;
+       struct list_head assigned_dev_head;
        struct kvm_pic *vpic;
        struct kvm_ioapic *vioapic;
        struct kvm_pit *vpit;
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index 76f3921..3aa1731 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -143,5 +143,4 @@ static inline unsigned int kvm_arch_para_features(void)
 }
 
 #endif
-
 #endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6edba45..c436c08 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -382,6 +382,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_PV_MMU 13
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
+#define KVM_CAP_DEVICE_ASSIGNMENT 16
 
 /*
  * ioctls for VM fds
@@ -411,6 +412,8 @@ struct kvm_trace_rec {
                        _IOW(KVMIO,  0x67, struct kvm_coalesced_mmio_zone)
 #define KVM_UNREGISTER_COALESCED_MMIO \
                        _IOW(KVMIO,  0x68, struct kvm_coalesced_mmio_zone)
+#define KVM_UPDATE_ASSIGNED_DEVICE _IOR(KVMIO, 0x69,           \
+                                       struct kvm_assigned_dev)
 
 /*
  * ioctls for vcpu fds
@@ -475,4 +478,22 @@ struct kvm_trace_rec {
 #define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
 #define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
 
+#define ASSIGNED_DEV_MAX_IRQ 16
+
+/* Stores information for identifying host PCI devices assigned to the
+ * guest: this is used in the host kernel and in the userspace.
+ */
+struct kvm_assigned_dev_info {
+       __u32 busnr;
+       __u32 devfn;
+       __u32 irq[ASSIGNED_DEV_MAX_IRQ];
+       __u32 num_valid_irqs; /* currently only 1 is supported */
+};
+
+/* Mapping between host and guest PCI device */
+struct kvm_assigned_dev {
+       struct kvm_assigned_dev_info guest;
+       struct kvm_assigned_dev_info host;
+};
+
 #endif
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index c0d2287..5d68d0b 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -39,6 +39,7 @@
 
 #include "ioapic.h"
 #include "lapic.h"
+#include "irq.h"
 
 #if 0
 #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
@@ -293,6 +294,9 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi)
        ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
 
        ent->fields.remote_irr = 0;
+
+       kvm_notify_acked_irq(ioapic->kvm, gsi);
+
        if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
                ioapic_service(ioapic, gsi);
 }
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 7f16675..a42743f 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -58,6 +58,7 @@ struct kvm_ioapic {
        } redirtbl[IOAPIC_NUM_PINS];
        struct kvm_io_device dev;
        struct kvm *kvm;
+       void (*ack_notifier)(void *opaque, int irq);
 };
 
 #ifdef DEBUG
-- 
1.5.6

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/4] KVM: pci device assignment

Reply via email to