[EMAIL PROTECTED] wrote:
Signed-off-by: Amit Shah <[EMAIL PROTECTED]>
Signed-off-by: Muli Ben-Yehuda <[EMAIL PROTECTED]>
---
 qemu/Makefile.target        |    3 +
 qemu/hw/device-assignment.c |  641 +++++++++++++++++++++++++++++++++++++++++++
 qemu/hw/device-assignment.h |  117 ++++++++
 qemu/hw/pc.c                |   16 +
 qemu/hw/pci.c               |    7 +
 qemu/qemu-kvm.c             |   14 +
 qemu/qemu-kvm.h             |    8 +
 qemu/vl.c                   |   28 ++
 8 files changed, 834 insertions(+), 0 deletions(-)
 create mode 100644 qemu/hw/device-assignment.c
 create mode 100644 qemu/hw/device-assignment.h

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index d9bdeca..5d44e08 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -621,6 +621,9 @@ OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
 OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 OBJS+= usb-uhci.o vmmouse.o vmport.o vmware_vga.o extboot.o
+ifeq ($(USE_KVM), 1)
+OBJS+= device-assignment.o
+endif

I don't think you want to build this on PPC, so I think you need a stronger check than USE_KVM alone.

+static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
+                       uint32_t value)
+{
+    AssignedDevRegion *r_access = opaque;
+    uint32_t r_pio = guest_to_host_ioport(r_access, addr);
+
+    DEBUG("%s: r_pio=%08x e_physbase=%08x r_virtbase=%08lx value=%08x\n",
+         r_pio, (int)r_access->e_physbase,
+          (unsigned long)r_access->r_virtbase, value);

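The format string doesn't match the argument count: it contains five conversion specifiers (starting with %s) but only four arguments are passed.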

+static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
+                                   uint32_t e_phys, uint32_t e_size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    uint32_t old_ephys = region->e_physbase;
+    uint32_t old_esize = region->e_size;
+    int first_map = (region->e_size == 0);
+    int ret = 0;
+
+    DEBUG("e_phys=%08x r_virt=%x type=%d len=%08x region_num=%d \n",
+          e_phys, (uint32_t)region->r_virtbase, type, e_size, region_num);
+
+    region->e_physbase = e_phys;
+    region->e_size = e_size;
+
+    if (!first_map)
+       kvm_destroy_phys_mem(kvm_context, old_ephys, old_esize);
+
+    if (e_size > 0)
+       ret = kvm_register_phys_mem(kvm_context, e_phys,
+                                        region->u.r_virtbase, e_size, 0);
+    if (ret != 0) {
+       fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
+       exit(1);
+    }
+}
+
+static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
+                                    uint32_t addr, uint32_t size, int type)
+{
+    AssignedDevice *r_dev = (AssignedDevice *) pci_dev;
+    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
+    uint32_t old_port = region->u.r_baseport;
+    uint32_t old_num = region->e_size;
+    int first_map = (old_num == 0);
+    struct ioperm_data data;
+    int i;
+
+    region->e_physbase = addr;
+    region->e_size = size;
+
+    DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n",
+          addr, region->u.r_baseport, type, size, region_num);
+
+    memset(&data, 0, sizeof(data));
+
+    if (!first_map) {
+       data.start_port = old_port;
+       data.num = old_num;
+       data.turn_on = 0;
+
+       for (i = 0; i < smp_cpus; ++i)
+           kvm_ioperm(qemu_kvm_cpu_env(i), &data);

How does this interact with VCPU hot-plug?

+    }
+
+    data.start_port = region->u.r_baseport;
+    data.num = size;
+    data.turn_on = 1;
+
+    for (i = 0; i < smp_cpus; ++i)
+       kvm_ioperm(qemu_kvm_cpu_env(i), &data);
+
+    register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
+                         (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
+                          (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
+                          (r_dev->v_addrs + region_num));
+    register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
+                          (r_dev->v_addrs + region_num));
+}
+
+static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
+                                          uint32_t val, int len)
+{
+    int fd;
+    ssize_t ret;
+
+    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+
+    if (address == 0x4) {
+        pci_default_write_config(d, address, val, len);
+        /* Continue to program the card */
+    }
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        /* used for update-mappings (BAR emulation) */
+        pci_default_write_config(d, address, val, len);
+        return;
+    }
+
+    DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
+          (uint16_t) address, val, len);
+
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+
+again:
+    ret = pwrite(fd, &val, len, address);
+    if (ret != len) {
+       if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+           goto again;
+
+       fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
+               __func__, ret, errno);
+
+       exit(1);
+    }
+}
+
+static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
+                                             int len)
+{
+    uint32_t val = 0;
+    int fd;
+    ssize_t ret;
+
+    if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+        address == 0x3c || address == 0x3d) {
+        val = pci_default_read_config(d, address, len);
+        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+        return val;
+    }
+
+    /* vga specific, remove later */
+    if (address == 0xFC)
+        goto do_log;
+
+    fd = ((AssignedDevice *)d)->real_device.config_fd;
+
+again:
+    ret = pread(fd, &val, len, address);
+    if (ret != len) {
+       if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
+           goto again;
+
+       fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
+               __func__, ret, errno);
+
+       exit(1);
+    }
+
+do_log:
+    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+          (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+
+    /* kill the special capabilities */
+    if (address == 4 && len == 4)
+        val &= ~0x100000;
+    else if (address == 6)
+        val &= ~0x10;
+
+    return val;
+}
+
+static int assigned_dev_register_regions(PCIRegion *io_regions,
+                                         unsigned long regions_num,
+                                         AssignedDevice *pci_dev)
+{
+    uint32_t i;
+    PCIRegion *cur_region = io_regions;
+
+    for (i = 0; i < regions_num; i++, cur_region++) {
+        if (!cur_region->valid)
+            continue;
+        pci_dev->v_addrs[i].num = i;
+
+        /* handle memory io regions */
+        if (cur_region->type & IORESOURCE_MEM) {
+            int t = cur_region->type & IORESOURCE_PREFETCH
+                ? PCI_ADDRESS_SPACE_MEM_PREFETCH
+                : PCI_ADDRESS_SPACE_MEM;
+
+            /* map physical memory */
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].u.r_virtbase =
+                mmap(NULL,
+                     (cur_region->size + 0xFFF) & 0xFFFFF000,
+                     PROT_WRITE | PROT_READ, MAP_SHARED,
+                     cur_region->resource_fd, (off_t) 0);
+
+            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
+                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
+                        "\n", __func__,
+                        (uint32_t) (cur_region->base_addr));
+                return -1;
+            }
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            /* add offset */
+            pci_dev->v_addrs[i].u.r_virtbase +=
+                (cur_region->base_addr & 0xFFF);
+
+            pci_register_io_region((PCIDevice *) pci_dev, i,
+                                   cur_region->size, t,
+                                   assigned_dev_iomem_map);
+            continue;
+        }
+        /* handle port io regions */
+        pci_register_io_region((PCIDevice *) pci_dev, i,
+                               cur_region->size, PCI_ADDRESS_SPACE_IO,
+                               assigned_dev_ioport_map);
+
+        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+        /* not relevant for port io */
+        pci_dev->v_addrs[i].memory_index = 0;
+    }
+
+    /* success */
+    return 0;
+}
+
+static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
+                           uint8_t r_dev, uint8_t r_func)
+{
+    char dir[128], name[128];
+    int fd, r = 0;
+    FILE *f;
+    unsigned long long start, end, size, flags;
+    PCIRegion *rp;
+    PCIDevRegions *dev = &pci_dev->real_device;
+
+    dev->region_number = 0;
+
+    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+            r_bus, r_dev, r_func);
+
+    snprintf(name, sizeof(name), "%sconfig", dir);
+
+    fd = open(name, O_RDWR);
+    if (fd == -1) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+    dev->config_fd = fd;
+again:
+    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
+    if (r < 0) {
+        if (errno == EINTR || errno == EAGAIN)
+            goto again;
+        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
+    }
+
+    snprintf(name, sizeof(name), "%sresource", dir);
+
+    f = fopen(name, "r");
+    if (f == NULL) {
+        fprintf(stderr, "%s: %s: %m\n", __func__, name);
+        return 1;
+    }
+    r = -1;
+    while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
+        r++;
+        rp = dev->regions + r;
+        rp->valid = 0;
+        size = end - start + 1;
+        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+            continue;
+        if (flags & IORESOURCE_MEM) {
+            flags &= ~IORESOURCE_IO;
+           snprintf(name, sizeof(name), "%sresource%d", dir, r);
+            fd = open(name, O_RDWR);
+            if (fd == -1)
+                continue;       /* probably ROM */
+            rp->resource_fd = fd;
+        } else
+            flags &= ~IORESOURCE_PREFETCH;
+
+        rp->type = flags;
+        rp->valid = 1;
+        rp->base_addr = start;
+        rp->size = size;
+        DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
+              r, rp->size, start, rp->type, rp->resource_fd);
+    }
+    fclose(f);
+
+    dev->region_number = r;
+    return 0;
+}
+
+static int disable_iommu;
+int nr_assigned_devices;
+static LIST_HEAD(, AssignedDevInfo) adev_head;
+
+static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
+{
+    return (uint32_t)bus << 8 | (uint32_t)devfn;
+}
+
+static AssignedDevice *register_real_device(PCIBus *e_bus,
+                                            const char *e_dev_name,
+                                            int e_devfn, uint8_t r_bus,
+                                            uint8_t r_dev, uint8_t r_func)
+{
+    int r;
+    AssignedDevice *pci_dev;
+    uint8_t e_device, e_intx;
+
+    DEBUG("Registering real physical device %s (devfn=0x%x)\n",
+          e_dev_name, e_devfn);
+
+    pci_dev = (AssignedDevice *)
+        pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice),
+                            e_devfn, assigned_dev_pci_read_config,
+                            assigned_dev_pci_write_config);
+    if (NULL == pci_dev) {
+        fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
+                __func__, e_dev_name);
+        return NULL;
+    }
+    if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
+        fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
+                __func__, e_dev_name);
+        goto out;
+    }
+
+    /* handle real device's MMIO/PIO BARs */
+    if (assigned_dev_register_regions(pci_dev->real_device.regions,
+                                      pci_dev->real_device.region_number,
+                                      pci_dev))
+        goto out;
+
+    /* handle interrupt routing */
+    e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
+    e_intx = pci_dev->dev.config[0x3d] - 1;
+    pci_dev->intpin = e_intx;
+    pci_dev->run = 0;
+    pci_dev->girq = 0;
+    pci_dev->h_busnr = r_bus;
+    pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
+
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+    if (kvm_enabled()) {
+        struct kvm_assigned_pci_dev assigned_dev_data;
+
+        memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
+        assigned_dev_data.assigned_dev_id  =
+            calc_assigned_dev_id(pci_dev->h_busnr,
+                                 (uint32_t)pci_dev->h_devfn);
+        assigned_dev_data.busnr = pci_dev->h_busnr;
+        assigned_dev_data.devfn = pci_dev->h_devfn;
+
+#ifdef KVM_CAP_IOMMU
+        /* We always enable the IOMMU if present
+         * (or when not disabled on the command line)
+         */
+        r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
+        if (r && !disable_iommu)
+            assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
+#endif
+        r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
+        if (r < 0) {
+            fprintf(stderr, "Could not notify kernel about "
+                "assigned device \"%s\"\n", e_dev_name);
+            perror("register_real_device");
+            goto out;
+        }
+    }

You still succeed if KVM_CAP_DEVICE_ASSIGNMENT isn't defined? That means a newer userspace compiled on an older kernel will silently fail if the user tries to do device assignment. There's probably no reason to build this file at all if KVM_CAP_DEVICE_ASSIGNMENT isn't defined (see how the in-kernel PIT gets conditionally built depending on whether that cap is available).

+#endif
+    term_printf("Registered host PCI device %02x:%02x.%1x "
+               "(\"%s\") as guest device %02x:%02x.%1x\n",
+               r_bus, r_dev, r_func, e_dev_name,
+               pci_bus_num(e_bus), e_device, r_func);


If I read the code correctly, this term_printf() happens regardless of whether this is being done for PCI hotplug or for command-line assignment? That's a problem, as it'll print garbage on the monitor when you start QEMU, which could break management applications.

diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index d559f0c..5fdb726 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -33,6 +33,7 @@
 #include "boards.h"
 #include "console.h"
 #include "fw_cfg.h"
+#include "device-assignment.h"
 #include "qemu-kvm.h"
@@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
     if (pci_enabled)
         virtio_balloon_init(pci_bus);
+
+    if (kvm_enabled() && device_assignment_enabled) {
+       int i;

Stray tab.

+        for (i = 0; i < assigned_devices_index; i++) {
+            if (add_assigned_device(assigned_devices[i]) < 0) {
+                fprintf(stderr, "Warning: could not add assigned device %s\n",
+                        assigned_devices[i]);
+            }
+        }
+
+       if (init_all_assigned_devices(pci_bus)) {
+           fprintf(stderr, "Failed to initialize assigned devices\n");
+           exit (1);
+       }
+    }
 }

+#if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(__linux__)
+            case QEMU_OPTION_pcidevice:
+               device_assignment_enabled = 1;
+               if (assigned_devices_index >= MAX_DEV_ASSIGN_CMDLINE) {
+                    fprintf(stderr, "Too many assigned devices\n");
+                    exit(1);
+               }
+               assigned_devices[assigned_devices_index] = optarg;
+               assigned_devices_index++;
+                break;

Tab damage.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to