* On Wednesday 17 Sep 2008 11:15:16 Zhang, Xiantao wrote:
> Seems it lacks device-assignment.[c,h] ?
> Xiantao

Hmm, here is the version with the files.

>From cd82862ef7493afd3431e538b85adb9771f94da6 Mon Sep 17 00:00:00 2001
From: Amit Shah <[EMAIL PROTECTED]>
Date: Tue, 16 Sep 2008 23:09:23 +0530
Subject: [PATCH] KVM/userspace: Support for assigning PCI devices to guests

[This still doesn't include some fixes to review comments.
I'm posting this just so that people can use this to test
or base their work off the latest patch.]

From: Or Sagi <[EMAIL PROTECTED]>
From: Nir Peleg <[EMAIL PROTECTED]>
From: Amit Shah <[EMAIL PROTECTED]>
From: Ben-Ami Yassour <[EMAIL PROTECTED]>
From: Glauber de Oliveira Costa <[EMAIL PROTECTED]>

With this patch, we can assign a device on the host machine to a
guest.

A new command-line option, -pcidevice is added.
For example, to invoke it for a device sitting at PCI bus:dev.fn
04:08.0 with host IRQ 18, use this:

        -pcidevice host=04:08.0

The host driver for the device, if any, is to be removed before
assigning the device.

This works only with the in-kernel irqchip method; to use the
userspace irqchip, a kernel module (irqhook) and some extra changes
are needed.

Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
---
 libkvm/libkvm-x86.c         |   14 +
 libkvm/libkvm.h             |   27 ++
 qemu/Makefile.target        |    1 +
 qemu/hw/device-assignment.c |  605 +++++++++++++++++++++++++++++++++++++++++++
 qemu/hw/device-assignment.h |   92 +++++++
 qemu/hw/isa.h               |    2 +
 qemu/hw/pc.c                |    9 +
 qemu/hw/pci.c               |   12 +
 qemu/hw/pci.h               |    1 +
 qemu/hw/piix_pci.c          |   19 ++
 qemu/qemu-kvm-x86.c         |    3 +
 qemu/vl.c                   |   18 ++
 12 files changed, 803 insertions(+), 0 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/libkvm/libkvm-x86.c b/libkvm/libkvm-x86.c
index a8cca15..6157f75 100644
--- a/libkvm/libkvm-x86.c
+++ b/libkvm/libkvm-x86.c
@@ -53,6 +53,20 @@ static int kvm_init_tss(kvm_context_t kvm)
        return 0;
 }
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+/*
+ * Pass the description of an assigned PCI device to the kernel by issuing
+ * the KVM_ASSIGN_PCI_DEVICE ioctl on the VM file descriptor.  The ioctl()
+ * result is handed back to the caller unchanged.
+ */
+int kvm_assign_pci_device(kvm_context_t kvm,
+                         struct kvm_assigned_pci_dev *assigned_dev)
+{
+       int r;
+
+       r = ioctl(kvm->vm_fd, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
+       return r;
+}
+
+/*
+ * Pass an IRQ assignment for an assigned PCI device to the kernel by
+ * issuing the KVM_ASSIGN_IRQ ioctl on the VM file descriptor.  The ioctl()
+ * result is handed back to the caller unchanged.
+ */
+int kvm_assign_irq(kvm_context_t kvm,
+                  struct kvm_assigned_irq *assigned_irq)
+{
+       int r;
+
+       r = ioctl(kvm->vm_fd, KVM_ASSIGN_IRQ, assigned_irq);
+       return r;
+}
+#endif
+
 int kvm_create_pit(kvm_context_t kvm)
 {
 #ifdef KVM_CAP_PIT
diff --git a/libkvm/libkvm.h b/libkvm/libkvm.h
index 79dd769..edf8e9e 100644
--- a/libkvm/libkvm.h
+++ b/libkvm/libkvm.h
@@ -658,4 +658,31 @@ int kvm_s390_interrupt(kvm_context_t kvm, int slot,
 int kvm_s390_set_initial_psw(kvm_context_t kvm, int slot, psw_t psw);
 int kvm_s390_store_status(kvm_context_t kvm, int slot, unsigned long addr);
 #endif
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+/*!
+ * \brief Notifies host kernel about a PCI device assigned to guest
+ *
+ * Used for PCI device assignment, this function notifies the host
+ * kernel about the assigning of the physical PCI device.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param assigned_dev Parameters, like bus, devfn number, etc
+ */
+int kvm_assign_pci_device(kvm_context_t kvm,
+                         struct kvm_assigned_pci_dev *assigned_dev);
+
+/*!
+ * \brief Notifies host kernel about changes to an irq assignment
+ *
+ * Used for PCI device assignment, this function notifies the host
+ * kernel about the assigning of the irq for an assigned physical
+ * PCI device.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param assigned_irq Parameters, like dev id, host irq, guest irq, etc
+ */
+int kvm_assign_irq(kvm_context_t kvm,
+                  struct kvm_assigned_irq *assigned_irq);
+#endif
 #endif
diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 89814fd..958c33b 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -611,6 +611,7 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+OBJS+= device-assignment.o
 ifeq ($(USE_KVM_PIT), 1)
 OBJS+= i8254-kvm.o
 endif
diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
new file mode 100644
index 0000000..d32bbb4
--- /dev/null
+++ b/qemu/hw/device-assignment.c
@@ -0,0 +1,605 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *
+ *  Assign a PCI device from the host to a guest VM.
+ *
+ *  Adapted for KVM by Qumranet.
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
+ *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
+ */
+#include <stdio.h>
+#include <sys/io.h>
+#include "qemu-kvm.h"
+#include <linux/kvm_para.h>
+#include "device-assignment.h"
+
+/* From linux/ioport.h */
+#define IORESOURCE_IO          0x00000100      /* Resource type */
+#define IORESOURCE_MEM         0x00000200
+#define IORESOURCE_IRQ         0x00000400
+#define IORESOURCE_DMA         0x00000800
+#define IORESOURCE_PREFETCH    0x00001000      /* No side effects */
+
+/* #define DEVICE_ASSIGNMENT_DEBUG */
+
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __func__ , ## args)
+#else
+#define DEBUG(fmt, args...)
+#endif
+
+/*
+ * Generates assigned_dev_ioport_write<suffix>(), a guest port-write handler
+ * that forwards the access to the host device.
+ *
+ * opaque is the AssignedDevRegion of the BAR being accessed.  The host port
+ * is computed as r_virtbase plus the offset of addr from e_physbase, the
+ * access is logged when DEVICE_ASSIGNMENT_DEBUG_PIO is set in the region's
+ * debug flags, and the value is written with out<suffix>().
+ *
+ * iopl(3) is called before every write and its result is not checked.
+ * NOTE(review): the read handlers below do not call iopl(); confirm whether
+ * the per-write call is actually required.
+ */
+#define assigned_dev_ioport_write(suffix)                              \
+ static void assigned_dev_ioport_write##suffix(void *opaque, uint32_t addr, \
+                                              uint32_t value)          \
+ {                                                                     \
+        AssignedDevRegion *r_access = (AssignedDevRegion *)opaque;     \
+        uint32_t r_pio = (unsigned long)r_access->r_virtbase           \
+                + (addr - r_access->e_physbase);                       \
+        if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {           \
+                fprintf(stderr, "assigned_dev_ioport_write" #suffix    \
+                        ": r_pio=%08x e_physbase=%08x"                 \
+                        " r_virtbase=%08lx value=%08x\n",              \
+                        r_pio, (int)r_access->e_physbase,              \
+                        (unsigned long)r_access->r_virtbase, value);   \
+        }                                                              \
+        iopl(3);                                                       \
+        out##suffix(value, r_pio);                                     \
+  }
+
+/* Instantiate the byte, word and long write handlers. */
+assigned_dev_ioport_write(b)
+assigned_dev_ioport_write(w)
+assigned_dev_ioport_write(l)
+
+/*
+ * Generates assigned_dev_ioport_read<suffix>(), a guest port-read handler
+ * that forwards the access to the host device.
+ *
+ * opaque is the AssignedDevRegion of the BAR being accessed.  The host port
+ * is computed as r_virtbase plus the offset of addr from e_physbase, the
+ * value is read with in<suffix>(), logged when DEVICE_ASSIGNMENT_DEBUG_PIO
+ * is set in the region's debug flags, and returned.
+ *
+ * Every line of the definition except the last must end in a backslash;
+ * the signature is split so that no line needs wrapping.
+ */
+#define assigned_dev_ioport_read(suffix)                               \
+ static uint32_t assigned_dev_ioport_read##suffix(void *opaque,        \
+                                                  uint32_t addr)       \
+ {                                                                     \
+        AssignedDevRegion *r_access = (AssignedDevRegion *)opaque;     \
+        uint32_t r_pio = (addr - r_access->e_physbase)                 \
+                + (unsigned long)r_access->r_virtbase;                 \
+        uint32_t value = in##suffix(r_pio);                            \
+        if (r_access->debug & DEVICE_ASSIGNMENT_DEBUG_PIO) {           \
+                fprintf(stderr, "assigned_dev_ioport_read" #suffix     \
+                        ": r_pio=%08x e_physbase=%08x r_virtbase=%08lx " \
+                        "value=%08x\n",                                \
+                        r_pio, (int)r_access->e_physbase,              \
+                        (unsigned long)r_access->r_virtbase, value);   \
+        }                                                              \
+        return value;                                                  \
+ }
+
+/* Instantiate the byte, word and long read handlers. */
+assigned_dev_ioport_read(b)
+assigned_dev_ioport_read(w)
+assigned_dev_ioport_read(l)
+
+/*
+ * BAR mapping callback for memory regions of an assigned device.
+ *
+ * Records the guest-physical base (e_phys) and size (e_size) chosen for BAR
+ * region_num and registers the mmap()ed host region (r_virtbase) with KVM at
+ * that guest address.  A region whose recorded e_size is 0 has not been
+ * mapped before; otherwise the previous mapping is torn down first.
+ *
+ * The previous base and size must be saved before the region fields are
+ * overwritten, so that the old mapping is the one destroyed rather than the
+ * range about to be created.
+ *
+ * Failures of kvm_register_phys_mem() are only reported on stderr.
+ */
+static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
+                        uint32_t e_phys, uint32_t e_size, int type)
+{
+       AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+       AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+       target_phys_addr_t old_ephys = region->e_physbase;
+       uint32_t old_esize = region->e_size;
+       int first_map = (old_esize == 0);
+       int ret = 0;
+
+       DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
+             e_phys, r_dev->v_addrs[region_num].r_virtbase, type, e_size,
+             region_num);
+
+       region->e_physbase = e_phys;
+       region->e_size = e_size;
+
+       /* Drop the stale mapping at its old location, not the new one. */
+       if (!first_map)
+               kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
+       if (e_size > 0)
+               ret = kvm_register_phys_mem(kvm_context,
+                                           e_phys,
+                                           region->r_virtbase,
+                                           e_size,
+                                           0);
+       if (ret != 0)
+               fprintf(stderr, "%s: Error: create new mapping failed\n",
+                       __func__);
+}
+
+/*
+ * BAR mapping callback for port I/O regions of an assigned device.
+ *
+ * Stores the guest port base (addr) in the region and registers the
+ * byte/word/long read and write handlers for the guest port range
+ * [addr, addr + size), passing the region as the handlers' opaque pointer.
+ */
+static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
+                                   uint32_t addr, uint32_t size, int type)
+{
+       AssignedDevice *adev = (AssignedDevice *) pci_dev;
+       AssignedDevRegion *region = &adev->v_addrs[region_num];
+       /* Handlers indexed by log2 of the access width in bytes. */
+       uint32_t ((*readers[])(void *, uint32_t)) = {
+               assigned_dev_ioport_readb,
+               assigned_dev_ioport_readw,
+               assigned_dev_ioport_readl
+       };
+       void ((*writers[])(void *, uint32_t, uint32_t)) = {
+               assigned_dev_ioport_writeb,
+               assigned_dev_ioport_writew,
+               assigned_dev_ioport_writel
+       };
+       int width_log2;
+
+       region->e_physbase = addr;
+       DEBUG("%s: address=0x%x type=0x%x len=%d region_num=%d \n",
+             __func__, addr, type, size, region_num);
+
+       for (width_log2 = 0; width_log2 < 3; width_log2++) {
+               int width = 1 << width_log2;
+
+               register_ioport_write(addr, size, width,
+                                     writers[width_log2], (void *) region);
+               register_ioport_read(addr, size, width,
+                                    readers[width_log2], (void *) region);
+       }
+}
+
+/*
+ * Config-space write handler for an assigned device.
+ *
+ * Writes to the BAR range (0x10-0x24), to 0x34, and to the interrupt
+ * line/pin bytes (0x3c, 0x3d) are handled only by the emulated config space
+ * via pci_default_write_config().  A write at offset 0x4 updates the
+ * emulated copy and is also forwarded to the host device.  Every other
+ * write goes straight to the host device's sysfs config file; the write()
+ * is retried on EINTR/EAGAIN and other failures are reported on stderr.
+ */
+static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
+                                         uint32_t val, int len)
+{
+       AssignedDevice *adev = (AssignedDevice *) d;
+       int is_emulated_reg;
+       int cfg_fd, nwritten;
+
+       DEBUG("%s: (%x.%x): address=%04x val=0x%08x len=%d\n",
+             __func__, ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+             (uint16_t) address, val, len);
+
+       if (address == 0x4)
+               pci_default_write_config(d, address, val, len);
+
+       is_emulated_reg = (address >= 0x10 && address <= 0x24) ||
+                         address == 0x34 ||
+                         address == 0x3c || address == 0x3d;
+       if (is_emulated_reg) {
+               /* used for update-mappings (BAR emulation) */
+               pci_default_write_config(d, address, val, len);
+               return;
+       }
+
+       DEBUG("%s: NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+             __func__, ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+             (uint16_t) address, val, len);
+
+       cfg_fd = adev->real_device.config_fd;
+       lseek(cfg_fd, address, SEEK_SET);
+       do {
+               nwritten = write(cfg_fd, &val, len);
+       } while (nwritten < 0 && (errno == EINTR || errno == EAGAIN));
+
+       if (nwritten < 0)
+               fprintf(stderr, "%s: write failed, errno = %d\n",
+                       __func__, errno);
+}
+
+/*
+ * Config-space read handler for an assigned device.
+ *
+ * Reads of the BAR range (0x10-0x24), of 0x34, and of the interrupt
+ * line/pin bytes (0x3c, 0x3d) are answered from the emulated config space.
+ * A read at offset 0xFC returns 0 without touching the host device.  Every
+ * other read comes from the host device's sysfs config file; the read() is
+ * retried on EINTR/EAGAIN and other failures are reported on stderr.
+ *
+ * Before returning, bit 20 is cleared for a 4-byte read at offset 4 and
+ * bit 4 is cleared for any read at offset 6.
+ */
+static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
+                                            int len)
+{
+       uint32_t val = 0;
+       int cfg_fd, nread;
+       int is_emulated_reg = (address >= 0x10 && address <= 0x24) ||
+                             address == 0x34 ||
+                             address == 0x3c || address == 0x3d;
+
+       if (is_emulated_reg) {
+               val = pci_default_read_config(d, address, len);
+               DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+                     (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val,
+                     len);
+               return val;
+       }
+
+       /* vga specific, remove later: offset 0xFC is never read from host */
+       if (address != 0xFC) {
+               cfg_fd = ((AssignedDevice *)d)->real_device.config_fd;
+               lseek(cfg_fd, address, SEEK_SET);
+               do {
+                       nread = read(cfg_fd, &val, len);
+               } while (nread < 0 && (errno == EINTR || errno == EAGAIN));
+
+               if (nread < 0)
+                       fprintf(stderr, "%s: read failed, errno = %d\n",
+                               __func__, errno);
+       }
+
+       DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+             (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+
+       /* kill the special capabilities */
+       if (address == 4 && len == 4)
+               val &= ~0x100000;
+       else if (address == 6)
+               val &= ~0x10;
+
+       return val;
+}
+
+/*
+ * Register the host device's BARs with the emulated PCI device.
+ *
+ * For every valid entry in io_regions[0..regions_num):
+ *  - memory regions are mmap()ed from the region's resource_fd (length
+ *    rounded up to 4 KiB), the in-page offset of the host base address is
+ *    added to the mapping address, and the BAR is registered with
+ *    assigned_dev_iomem_map as its mapping callback;
+ *  - all other regions are registered as port I/O BARs with
+ *    assigned_dev_ioport_map, and r_virtbase holds the host port base.
+ *
+ * Returns 0 on success, -1 if an mmap() fails.
+ */
+static int assigned_dev_register_regions(PCIRegion *io_regions,
+                                        unsigned long regions_num,
+                                        AssignedDevice *pci_dev)
+{
+       uint32_t idx;
+
+       for (idx = 0; idx < regions_num; idx++) {
+               PCIRegion *src = &io_regions[idx];
+               AssignedDevRegion *dst = &pci_dev->v_addrs[idx];
+               int mem_type;
+
+               if (!src->valid)
+                       continue;
+#ifdef DEVICE_ASSIGNMENT_DEBUG
+               dst->debug |= DEVICE_ASSIGNMENT_DEBUG_MMIO
+                             | DEVICE_ASSIGNMENT_DEBUG_PIO;
+#endif
+               dst->num = idx;
+
+               if (!(src->type & IORESOURCE_MEM)) {
+                       /* handle port io regions */
+                       pci_register_io_region((PCIDevice *) pci_dev, idx,
+                                              src->size,
+                                              PCI_ADDRESS_SPACE_IO,
+                                              assigned_dev_ioport_map);
+
+                       dst->e_physbase = src->base_addr;
+                       dst->r_virtbase = (void *)(long)src->base_addr;
+                       /* not relevant for port io */
+                       dst->memory_index = 0;
+                       continue;
+               }
+
+               /* handle memory io regions */
+               if (src->type & IORESOURCE_PREFETCH)
+                       mem_type = PCI_ADDRESS_SPACE_MEM_PREFETCH;
+               else
+                       mem_type = PCI_ADDRESS_SPACE_MEM;
+
+               /* map physical memory */
+               dst->e_physbase = src->base_addr;
+               dst->r_virtbase = mmap(NULL,
+                                      (src->size + 0xFFF) & 0xFFFFF000,
+                                      PROT_WRITE | PROT_READ, MAP_SHARED,
+                                      src->resource_fd, (off_t) 0);
+
+               if (dst->r_virtbase == (void *) -1) {
+                       fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!\n",
+                               __func__, (uint32_t) (src->base_addr));
+                       return -1;
+               }
+               dst->r_size = src->size;
+               /* e_size == 0 marks the region as not yet mapped in guest */
+               dst->e_size = 0;
+
+               /* add offset */
+               dst->r_virtbase += (src->base_addr & 0xFFF);
+
+               pci_register_io_region((PCIDevice *) pci_dev, idx,
+                                      src->size, mem_type,
+                                      assigned_dev_iomem_map);
+       }
+
+       /* success */
+       return 0;
+}
+
+/*
+ * Open the host device's sysfs nodes and collect its resources.
+ *
+ * Opens /sys/bus/pci/devices/0000:BB:DD.F/config (the descriptor is kept in
+ * real_device.config_fd), reads the host config space into
+ * pci_dev->dev.config, then parses the "resource" file line by line into
+ * real_device.regions[].  For memory resources the matching "resourceN"
+ * file is opened and its descriptor stored in resource_fd; entries whose
+ * file cannot be opened stay invalid.
+ *
+ * At most MAX_IO_REGIONS lines are parsed: regions[] has exactly that many
+ * slots, and the sysfs file can contain more lines than that.
+ *
+ * Returns 0 on success, 1 if the config or resource file cannot be opened.
+ * A failed read of the config file is only reported on stderr.
+ */
+static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
+                          uint8_t r_dev, uint8_t r_func)
+{
+       char dir[128], name[128];
+       int fd, r = 0;
+       FILE *f;
+       unsigned long long start, end, size, flags;
+       PCIRegion *rp;
+       PCIDevRegions *dev = &pci_dev->real_device;
+
+       dev->region_number = 0;
+
+       snprintf(dir, sizeof dir, "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+                r_bus, r_dev, r_func);
+       snprintf(name, sizeof name, "%sconfig", dir);
+       fd = open(name, O_RDWR);
+       if (fd == -1) {
+               fprintf(stderr, "%s: %s: %m\n", __func__, name);
+               return 1;
+       }
+       dev->config_fd = fd;
+again:
+       r = read(fd, pci_dev->dev.config, sizeof pci_dev->dev.config);
+       if (r < 0) {
+               if (errno == EINTR || errno == EAGAIN)
+                       goto again;
+               fprintf(stderr, "%s: read failed, errno = %d\n",
+                       __func__, errno);
+       }
+
+       snprintf(name, sizeof name, "%sresource", dir);
+       f = fopen(name, "r");
+       if (f == NULL) {
+               fprintf(stderr, "%s: %s: %m\n", __func__, name);
+               /* Do not leak the config descriptor on the error path. */
+               close(dev->config_fd);
+               dev->config_fd = -1;
+               return 1;
+       }
+
+       /* Never parse more lines than regions[] has room for. */
+       for (r = 0; r < MAX_IO_REGIONS; r++) {
+               if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
+                       break;
+
+               rp = dev->regions + r;
+               rp->valid = 0;
+               size = end - start + 1;
+               flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+               if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+                       continue;
+               if (flags & IORESOURCE_MEM) {
+                       flags &= ~IORESOURCE_IO;
+                       snprintf(name, sizeof name, "%sresource%d", dir, r);
+                       fd = open(name, O_RDWR);
+                       if (fd == -1)
+                               continue;               /* probably ROM */
+                       rp->resource_fd = fd;
+               } else
+                       flags &= ~IORESOURCE_PREFETCH;
+
+               rp->type = flags;
+               rp->valid = 1;
+               rp->base_addr = start;
+               rp->size = size;
+               DEBUG("%s: region %d size %d start 0x%llx type %d "
+                     "resource_fd %d\n", __func__, r, rp->size, start,
+                     rp->type, rp->resource_fd);
+       }
+       fclose(f);
+
+       dev->region_number = r;
+       return 0;
+}
+
+/* Capacity of the assigned_devices[] table below. */
+#define        MAX_ASSIGNED_DEVS 4
+/*
+ * One entry per -pcidevice option, filled in by add_assigned_device() and
+ * consumed by init_assigned_device().
+ */
+struct {
+       char name[15];          /* device name ("name=" value, else the host address string) */
+       int bus;                /* host PCI bus number */
+       int dev;                /* host PCI device (slot) number */
+       int func;               /* host PCI function number */
+       int dma;                /* passed to register_real_device() as its flags argument */
+       AssignedDevice *assigned_dev;   /* registered device, NULL if registration failed */
+} assigned_devices[MAX_ASSIGNED_DEVS];
+
+/* Number of entries of assigned_devices[] in use. */
+int nr_assigned_devices;
+/* Set when any -pcidevice option carries a dma value starting with "none". */
+static int disable_iommu;
+
+/*
+ * Build the device id used in the kernel assignment structures: the bus
+ * number in bits 8-15 and devfn in bits 0-7.
+ */
+static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
+{
+       uint32_t id = bus;
+
+       id <<= 8;
+       id |= devfn;
+       return id;
+}
+
+/*
+ * Create the emulated PCI device that fronts a host device and notify the
+ * kernel about the assignment.
+ *
+ * e_bus/e_dev_name/e_devfn describe the emulated device and are passed to
+ * pci_register_device(); r_bus/r_dev/r_func address the host device; flags
+ * is copied into the kvm_assigned_pci_dev structure given to the kernel.
+ *
+ * Steps: register the emulated device with the assigned-device config
+ * handlers, open and parse the host device (get_real_device), register its
+ * BARs, record interrupt pin and host bus/devfn, and finally, when KVM is
+ * enabled, issue kvm_assign_pci_device().
+ *
+ * Returns the new device, or NULL on failure, in which case the emulated
+ * device has been unregistered again.
+ */
+static AssignedDevice *register_real_device(PCIBus *e_bus,
+                                           const char *e_dev_name,
+                                           int e_devfn, uint8_t r_bus,
+                                           uint8_t r_dev, uint8_t r_func,
+                                           int flags)
+{
+       int r;
+       AssignedDevice *pci_dev;
+       uint8_t e_device, e_intx;
+
+       DEBUG("%s: Registering real physical device %s (devfn=0x%x)\n",
+             __func__, e_dev_name, e_devfn);
+
+       pci_dev = (AssignedDevice *)
+               pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice),
+                                   e_devfn, assigned_dev_pci_read_config,
+                                   assigned_dev_pci_write_config);
+       if (NULL == pci_dev) {
+               fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
+                       __func__, e_dev_name);
+               return NULL;
+       }
+       if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
+               fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
+                       __func__, e_dev_name);
+               goto out;
+       }
+
+       /* handle real device's MMIO/PIO BARs */
+       if (assigned_dev_register_regions(pci_dev->real_device.regions,
+                                         pci_dev->real_device.region_number,
+                                         pci_dev))
+               goto out;
+
+       /* handle interrupt routing */
+       e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
+       /* config[0x3d] was read from the host device; store it zero-based */
+       e_intx = pci_dev->dev.config[0x3d] - 1;
+       pci_dev->intpin = e_intx;
+       pci_dev->run = 0;
+       pci_dev->girq = 0;
+       pci_dev->h_busnr = r_bus;
+       pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+       if (kvm_enabled()) {
+               struct kvm_assigned_pci_dev assigned_dev_data;
+
+               memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
+               assigned_dev_data.assigned_dev_id  =
+                       calc_assigned_dev_id(pci_dev->h_busnr,
+                                            (uint32_t)pci_dev->h_devfn);
+               assigned_dev_data.busnr = pci_dev->h_busnr;
+               assigned_dev_data.devfn = pci_dev->h_devfn;
+               assigned_dev_data.flags = flags;
+#ifdef KVM_CAP_PV_DMA
+               assigned_dev_data.guest_dev_id =
+                       calc_assigned_dev_id(pci_bus_num(e_bus),
+                                            PCI_DEVFN(e_device, r_func));
+#endif
+
+#ifdef KVM_CAP_IOMMU
+               /* We always enable the IOMMU if present
+                * (or when not disabled on the command line).
+                *
+                * The flag has to be set in the structure handed to the
+                * kernel.  Setting it in the assigned_devices[] table
+                * would come too late (flags was already copied above) and
+                * would index the slot after the last parsed device.
+                */
+               r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
+               if (r && !disable_iommu)
+                       assigned_dev_data.flags |=
+                               KVM_DEV_ASSIGN_ENABLE_IOMMU;
+#endif
+               r = kvm_assign_pci_device(kvm_context,
+                                         &assigned_dev_data);
+               if (r < 0) {
+                       fprintf(stderr, "Could not notify kernel about "
+                               "assigned device \"%s\"\n", e_dev_name);
+                       perror("pt-ioctl");
+                       goto out;
+               }
+       }
+#endif
+       fprintf(stderr, "Registered host PCI device %02x:%02x.%1x "
+               "(\"%s\") as guest device %02x:%02x.%1x\n",
+               r_bus, r_dev, r_func, e_dev_name,
+               pci_bus_num(e_bus), e_device, r_func);
+
+       return pci_dev;
+out:
+       pci_unregister_device(&pci_dev->dev);
+       return NULL;
+}
+
+extern int get_param_value(char *buf, int buf_size,
+                          const char *tag, const char *str);
+extern int piix_get_irq(int);
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+/* The pci config space got updated. Check if irq numbers have changed
+ * for our devices
+ */
+/*
+ * Re-evaluate the guest IRQ of every assigned device and tell the kernel
+ * about any change.
+ *
+ * For each registered entry in assigned_devices[], the guest IRQ is derived
+ * from the device's interrupt pin via pci_map_irq() and piix_get_irq().  If
+ * it differs from the value last programmed (girq), kvm_assign_irq() is
+ * issued.  On failure the device is unregistered and its table slot is
+ * cleared so that later calls skip it instead of using the stale pointer.
+ *
+ * The parameter d is not used; all assigned devices are always scanned.
+ * NOTE(review): real_device.irq, used as host_irq, is not assigned anywhere
+ * in the visible part of this patch - confirm where it is initialised.
+ */
+void assigned_dev_update_irq(PCIDevice *d)
+{
+       int i, irq, r;
+       AssignedDevice *assigned_dev;
+
+       for (i = 0; i < nr_assigned_devices; i++) {
+               assigned_dev = assigned_devices[i].assigned_dev;
+               if (assigned_dev == NULL)
+                       continue;
+
+               irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin);
+               irq = piix_get_irq(irq);
+
+               if (irq != assigned_dev->girq) {
+                       struct kvm_assigned_irq assigned_irq_data;
+
+                       memset(&assigned_irq_data, 0, sizeof assigned_irq_data);
+                       assigned_irq_data.assigned_dev_id  =
+                               calc_assigned_dev_id(assigned_dev->h_busnr,
+                                                    (uint8_t)
+                                                    assigned_dev->h_devfn);
+                       assigned_irq_data.guest_irq = irq;
+                       assigned_irq_data.host_irq =
+                               assigned_dev->real_device.irq;
+                       r = kvm_assign_irq(kvm_context, &assigned_irq_data);
+                       if (r < 0) {
+                               perror("assigned_dev_update_irq");
+                               fprintf(stderr, "Are you assigning a device "
+                                       "that shares IRQ with some other "
+                                       "device?\n");
+                               pci_unregister_device(&assigned_dev->dev);
+                               /* the device is gone; forget the pointer */
+                               assigned_devices[i].assigned_dev = NULL;
+                               continue;
+                       }
+                       assigned_dev->girq = irq;
+               }
+       }
+}
+#endif
+
+/*
+ * One-time setup before the first assigned device is registered.
+ *
+ * Returns -1 when no device was requested on the command line, 0 otherwise.
+ * iopl(3) is requested because the port I/O handlers in this file use
+ * in/out instructions directly.  A failure is reported but is not treated
+ * as fatal here, since devices that expose only memory BARs do not go
+ * through those handlers.
+ */
+static int init_device_assignment(void)
+{
+       /* Do we have any devices to be assigned? */
+       if (nr_assigned_devices == 0)
+               return -1;
+       if (iopl(3) < 0)
+               perror("init_device_assignment: iopl");
+       return 0;
+}
+
+/*
+ * Register one assigned device per call, walking assigned_devices[] from
+ * the last entry down to the first.
+ *
+ * *index is the iteration cursor.  The caller starts with -1; the first
+ * call then performs the one-time initialisation and begins at the last
+ * table entry.  After each call *index is decremented, so it is -1 again
+ * once every entry has been processed.
+ *
+ * Returns 0 on success, -1 if there is nothing to assign or the device
+ * could not be registered.  A failed registration still advances the
+ * cursor, and the table entry's assigned_dev is left NULL.
+ */
+int init_assigned_device(PCIBus *bus, int *index)
+{
+       AssignedDevice *adev;
+       int slot;
+       int status = 0;
+
+       if (*index == -1) {
+               if (init_device_assignment() < 0)
+                       return -1;
+
+               *index = nr_assigned_devices - 1;
+       }
+
+       slot = *index;
+       adev = register_real_device(bus, assigned_devices[slot].name, -1,
+                                   assigned_devices[slot].bus,
+                                   assigned_devices[slot].dev,
+                                   assigned_devices[slot].func,
+                                   assigned_devices[slot].dma);
+       if (adev == NULL) {
+               fprintf(stderr, "Error: Couldn't register device \"%s\"\n",
+                       assigned_devices[slot].name);
+               status = -1;
+       }
+       assigned_devices[slot].assigned_dev = adev;
+
+       *index = slot - 1;
+       return status;
+}
+
+/*
+ * Parse one -pcidevice option and append it to assigned_devices[].
+ *
+ * Syntax to assign device:
+ *
+ * -pcidevice host=bus:dev.func[,name=name][,dma=dma]
+ *
+ * Example:
+ * -pcidevice host=00:13.0,dma=none
+ *
+ * host is mandatory and gives the host PCI address in hexadecimal.
+ * name is optional; the host address string is used when it is absent.
+ * dma can currently be 'none' to disable iommu support.
+ *
+ * On a parse error, or when the table is full, a message is printed and no
+ * entry is added.
+ */
+void add_assigned_device(const char *arg)
+{
+       char *cp, *cp1;
+       char device[8];
+       char dma[6];
+       int r;
+
+       if (nr_assigned_devices >= MAX_ASSIGNED_DEVS) {
+               fprintf(stderr, "Too many assigned devices (max %d)\n",
+                       MAX_ASSIGNED_DEVS);
+               return;
+       }
+       memset(&assigned_devices[nr_assigned_devices], 0,
+              sizeof assigned_devices[nr_assigned_devices]);
+
+       /* Without host= there is nothing to parse below. */
+       device[0] = '\0';
+       r = get_param_value(device, sizeof device, "host", arg);
+       if (!r)
+               goto bad;
+
+       r = get_param_value(assigned_devices[nr_assigned_devices].name,
+                           sizeof assigned_devices[nr_assigned_devices].name,
+                           "name", arg);
+       if (!r)
+               snprintf(assigned_devices[nr_assigned_devices].name,
+                        sizeof assigned_devices[nr_assigned_devices].name,
+                        "%s", device);
+
+#ifdef KVM_CAP_IOMMU
+       r = get_param_value(dma, sizeof dma, "dma", arg);
+       if (r && !strncmp(dma, "none", 4))
+               disable_iommu = 1;
+#endif
+       cp = device;
+       assigned_devices[nr_assigned_devices].bus = strtoul(cp, &cp1, 16);
+       if (*cp1 != ':')
+               goto bad;
+       cp = cp1 + 1;
+
+       assigned_devices[nr_assigned_devices].dev = strtoul(cp, &cp1, 16);
+       if (*cp1 != '.')
+               goto bad;
+       cp = cp1 + 1;
+
+       assigned_devices[nr_assigned_devices].func = strtoul(cp, &cp1, 16);
+       /* the function number must be present and end the string */
+       if (cp1 == cp || *cp1 != '\0')
+               goto bad;
+
+       nr_assigned_devices++;
+       return;
+bad:
+       /* leave no half-parsed values in the unused slot */
+       memset(&assigned_devices[nr_assigned_devices], 0,
+              sizeof assigned_devices[nr_assigned_devices]);
+       fprintf(stderr, "pcidevice argument parse error; "
+               "please check the help text for usage\n");
+}
diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h
new file mode 100644
index 0000000..621df82
--- /dev/null
+++ b/qemu/hw/device-assignment.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ *  Data structures for storing PCI state
+ *
+ *  Adapted to kvm by Qumranet
+ *
+ *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
+ *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
+ *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
+ */
+
+#ifndef __DEVICE_ASSIGNMENT_H__
+#define __DEVICE_ASSIGNMENT_H__
+
+#include <sys/mman.h>
+#include "qemu-common.h"
+#include "pci.h"
+#include <linux/types.h>
+
+#define DEVICE_ASSIGNMENT_DEBUG_PIO    (0x01)
+#define DEVICE_ASSIGNMENT_DEBUG_MMIO   (0x02)
+
+/* From include/linux/pci.h in the kernel sources */
+#define PCI_DEVFN(slot,func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+
+typedef uint32_t pciaddr_t;
+
+#define MAX_IO_REGIONS                 (6)
+
+/*
+ * One resource (BAR) of the host device, as parsed from the device's sysfs
+ * "resource" file.
+ */
+typedef struct pci_region_s {
+       int type;       /* Memory or port I/O (IORESOURCE_* flag bits) */
+       int valid;      /* non-zero when the entry describes a usable region */
+       pciaddr_t base_addr;    /* host start address; pciaddr_t is 32 bits wide */
+       pciaddr_t size;         /* size of the region */
+       int resource_fd;        /* fd of the sysfs resourceN file; set for memory regions only */
+} PCIRegion;
+
+/* Host-side view of an assigned device: its sysfs descriptors and regions. */
+typedef struct pci_dev_s {
+       uint8_t bus, dev, func; /* Bus inside domain, device and function */
+       int irq;                /* IRQ number */
+       uint16_t region_number; /* number of active regions */
+
+       /* Port I/O or MMIO Regions */
+       PCIRegion regions[MAX_IO_REGIONS];
+       int config_fd;          /* fd of the device's sysfs config file */
+} PCIDevRegions;
+
+/*
+ * Per-BAR state of an assigned device.  A pointer to this structure is the
+ * opaque argument of the port I/O handlers.
+ */
+typedef struct assigned_dev_region_s {
+       target_phys_addr_t e_physbase;  /* base address of the region as seen by the guest */
+       uint32_t memory_index;  /* set to 0 for port I/O regions */
+       void *r_virtbase;       /* mmapped access address; host port base for port I/O */
+       int num;                /* our index within v_addrs[] */
+       uint32_t e_size;        /* emulated size of region in bytes */
+       uint32_t r_size;        /* real size of region in bytes */
+       uint32_t debug;         /* DEVICE_ASSIGNMENT_DEBUG_* flag bits */
+} AssignedDevRegion;
+
+/*
+ * Emulated PCI device that fronts an assigned host device.
+ *
+ * dev must remain the first member: the code casts between PCIDevice * and
+ * AssignedDevice *.
+ */
+typedef struct assigned_dev_s {
+       PCIDevice dev;
+       int intpin;             /* host interrupt pin from config[0x3d], zero-based */
+       uint8_t debug_flags;
+       AssignedDevRegion v_addrs[PCI_NUM_REGIONS];     /* per-BAR state */
+       PCIDevRegions real_device;      /* host device descriptors and resources */
+       int run;
+       int girq;               /* guest IRQ last passed to kvm_assign_irq() */
+       unsigned char h_busnr;  /* host bus number */
+       unsigned int h_devfn;   /* host devfn, built with PCI_DEVFN() */
+       int bound;
+} AssignedDevice;
+
+/* Initialization functions */
+int init_assigned_device(PCIBus *bus, int *index);
+void add_assigned_device(const char *arg);
+void assigned_dev_set_vector(int irq, int vector);
+void assigned_dev_ack_mirq(int vector);
+
+#endif                         /* __DEVICE_ASSIGNMENT_H__ */
diff --git a/qemu/hw/isa.h b/qemu/hw/isa.h
index 89b3004..c720f5e 100644
--- a/qemu/hw/isa.h
+++ b/qemu/hw/isa.h
@@ -1,5 +1,7 @@
 /* ISA bus */
 
+#include "hw.h"
+
 extern target_phys_addr_t isa_mem_base;
 
 int register_ioport_read(int start, int length, int size,
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 8a50096..59c2098 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -32,6 +32,7 @@
 #include "smbus.h"
 #include "boards.h"
 #include "console.h"
+#include "device-assignment.h"
 
 #include "qemu-kvm.h"
 
@@ -1013,6 +1014,17 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
         }
     }
 
+    /* Initialize assigned devices: r starts at -1 and the call is repeated
+     * until init_assigned_device() leaves it negative.
+     * NOTE(review): the return value is ignored -- confirm that a failure
+     * cannot leave r >= 0 and spin here. */
+    if (pci_enabled) {
+        int r = -1;
+        do {
+            init_assigned_device(pci_bus, &r);
+        } while (r >= 0);
+    }
+
     rtc_state = rtc_init(0x70, i8259[8]);
 
     qemu_register_boot_set(pc_boot_set, rtc_state);
diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index 07d37a8..e4e8386 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -50,6 +50,7 @@ struct PCIBus {
 
 static void pci_update_mappings(PCIDevice *d);
 static void pci_set_irq(void *opaque, int irq_num, int level);
+void assigned_dev_update_irq(PCIDevice *d);
 
 target_phys_addr_t pci_mem_base;
 static int pci_irq_index;
@@ -453,6 +454,14 @@ void pci_default_write_config(PCIDevice *d,
         val >>= 8;
     }
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+    /* Test for overlap with config bytes 0x60..0x63 (same form as the
+     * PCI_COMMAND check below) so wider or offset writes are not missed. */
+    if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
+        address + len > 0x60 && address <= 0x63)
+        assigned_dev_update_irq(d);
+#endif
+
     end = address + len;
     if (end > PCI_COMMAND && address < (PCI_COMMAND + 2)) {
         /* if the command register is modified, we must modify the mappings */
@@ -560,6 +567,15 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
     bus->set_irq(bus->irq_opaque, irq_num, bus->irq_count[irq_num] != 0);
 }
 
+/*
+ * Run the map_irq callback of pci_dev's bus for the given pin and
+ * return whatever that callback returns.
+ */
+int pci_map_irq(PCIDevice *pci_dev, int pin)
+{
+    return pci_dev->bus->map_irq(pci_dev, pin);
+}
+
 /***********************************************************/
 /* monitor info on PCI */
 
diff --git a/qemu/hw/pci.h b/qemu/hw/pci.h
index 60e4094..e11fbbf 100644
--- a/qemu/hw/pci.h
+++ b/qemu/hw/pci.h
@@ -81,6 +81,7 @@ void pci_register_io_region(PCIDevice *pci_dev, int region_num,
                             uint32_t size, int type,
                             PCIMapIORegionFunc *map_func);
 
+int pci_map_irq(PCIDevice *pci_dev, int pin);
 uint32_t pci_default_read_config(PCIDevice *d,
                                  uint32_t address, int len);
 void pci_default_write_config(PCIDevice *d,
diff --git a/qemu/hw/piix_pci.c b/qemu/hw/piix_pci.c
index 6fbf47b..dc12c8a 100644
--- a/qemu/hw/piix_pci.c
+++ b/qemu/hw/piix_pci.c
@@ -243,6 +243,42 @@ static void piix3_set_irq(qemu_irq *pic, int irq_num, int level)
     }
 }
 
+/*
+ * Find which of the four config bytes 0x60..0x63 of the PIIX3 device
+ * holds pic_irq.
+ *
+ * Returns the matching index (0..3), or -1 when none matches or when
+ * no PIIX3 device is present.
+ */
+int piix3_get_pin(int pic_irq)
+{
+    int i;
+
+    /* piix_get_irq() allows for piix3_dev being unset; do the same here */
+    if (!piix3_dev)
+        return -1;
+
+    for (i = 0; i < 4; i++)
+        if (piix3_dev->config[0x60+i] == pic_irq)
+            return i;
+    return -1;
+}
+
+/*
+ * Return config byte 0x60+pin of the PIIX3 device, or of the PIIX4
+ * device when there is no PIIX3; 0 when neither is present.
+ * NOTE(review): pin is not range-checked -- callers presumably pass 0..3.
+ */
+int piix_get_irq(int pin)
+{
+    if (piix3_dev)
+        return piix3_dev->config[0x60+pin];
+    if (piix4_dev)
+        return piix4_dev->config[0x60+pin];
+
+    return 0;
+}
+
 static void piix3_reset(PCIDevice *d)
 {
     uint8_t *pci_conf = d->config;
diff --git a/qemu/qemu-kvm-x86.c b/qemu/qemu-kvm-x86.c
index 5daedd1..5123e52 100644
--- a/qemu/qemu-kvm-x86.c
+++ b/qemu/qemu-kvm-x86.c
@@ -530,6 +530,9 @@ struct kvm_para_features {
 #ifdef KVM_CAP_CR3_CACHE
        { KVM_CAP_CR3_CACHE, KVM_FEATURE_CR3_CACHE },
 #endif
+#ifdef KVM_CAP_PV_DMA
+       { KVM_CAP_PV_DMA, KVM_FEATURE_DMA_OP },
+#endif
        { -1, -1 }
 };
 
diff --git a/qemu/vl.c b/qemu/vl.c
index 022b3b8..bab720d 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -37,6 +37,7 @@
 #include "qemu-char.h"
 #include "block.h"
 #include "audio/audio.h"
+#include "hw/device-assignment.h"
 #include "migration.h"
 #include "balloon.h"
 #include "qemu-kvm.h"
@@ -8478,6 +8479,12 @@ static void help(int exitcode)
 #endif
           "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
           "-no-kvm-pit     disable KVM kernel mode PIT\n"
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+          "-pcidevice host=bus:dev.func[,dma=none][,name=\"string\"]\n"
+          "                expose a PCI device to the guest OS.\n"
+          "                dma=none: don't perform any dma translations (default is to use an iommu)\n"
+          "                'string' is used in log output.\n"
+#endif
 #endif
 #ifdef TARGET_I386
            "-std-vga        simulate a standard VGA card with VESA Bochs Extensions\n"
@@ -8601,6 +8608,9 @@ enum {
     QEMU_OPTION_no_kvm,
     QEMU_OPTION_no_kvm_irqchip,
     QEMU_OPTION_no_kvm_pit,
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+    QEMU_OPTION_pcidevice,
+#endif
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_no_shutdown,
     QEMU_OPTION_show_cursor,
@@ -8689,6 +8699,9 @@ const QEMUOption qemu_options[] = {
 #endif
     { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
     { "no-kvm-pit", 0, QEMU_OPTION_no_kvm_pit },
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+    { "pcidevice", HAS_ARG, QEMU_OPTION_pcidevice },
+#endif
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
     { "g", 1, QEMU_OPTION_g },
@@ -9595,6 +9608,11 @@ int main(int argc, char **argv)
                kvm_pit = 0;
                break;
            }
+#if defined(TARGET_I386) || defined(TARGET_X86_64)
+           case QEMU_OPTION_pcidevice:
+               add_assigned_device(optarg);
+               break;
+#endif
 #endif
             case QEMU_OPTION_usb:
                 usb_enabled = 1;
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to