On Tue, Oct 28, 2008 at 04:55:22PM +0000, Mark McLoughlin wrote:
> On Tue, 2008-10-28 at 12:06 +0200, [EMAIL PROTECTED] wrote:
> ...
> > +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus,
> > +                           uint8_t r_dev, uint8_t r_func)
> > +{
> > +    char dir[128], name[128];
> > +    int fd, r = 0;
> > +    FILE *f;
> > +    unsigned long long start, end, size, flags;
> > +    PCIRegion *rp;
> > +    PCIDevRegions *dev = &pci_dev->real_device;
> > +
> > +    dev->region_number = 0;
> > +
> > +    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
> > +        r_bus, r_dev, r_func);
> > +
> > +    snprintf(name, sizeof(name), "%sconfig", dir);
> > +
> > +    fd = open(name, O_RDWR);
> > +    if (fd == -1) {
> > +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> > +        return 1;
> > +    }
> > +    dev->config_fd = fd;
> > +again:
> > +    r = read(fd, pci_dev->dev.config, sizeof(pci_dev->dev.config));
> > +    if (r < 0) {
> > +        if (errno == EINTR || errno == EAGAIN)
> > +            goto again;
> > +        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
> > +    }
> > +
> > +    snprintf(name, sizeof(name), "%sresource", dir);
> > +
> > +    f = fopen(name, "r");
> > +    if (f == NULL) {
> > +        fprintf(stderr, "%s: %s: %m\n", __func__, name);
> > +        return 1;
> > +    }
> > +    r = -1;
> > +    while (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3) {
> > +        r++;
> > +        rp = dev->regions + r;
> 
> Could, in theory, overflow dev->regions here. Suggest:
> 
> +    for (r = 0; r < MAX_IO_REGIONS; r++) {
> +        if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
> +            break;

Fixed, thanks Mark. I think it also uncovered a buglet where we would
skip the last region with the original code, which should be ok now.

> > +        rp->valid = 0;
> > +        size = end - start + 1;
> > +        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
> > +        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
> > +            continue;
> > +        if (flags & IORESOURCE_MEM) {
> > +            flags &= ~IORESOURCE_IO;
> > +       snprintf(name, sizeof(name), "%sresource%d", dir, r);
> > +            fd = open(name, O_RDWR);
> > +            if (fd == -1)
> > +                continue;       /* probably ROM */
> > +            rp->resource_fd = fd;
> > +        } else
> > +            flags &= ~IORESOURCE_PREFETCH;
> > +
> > +        rp->type = flags;
> > +        rp->valid = 1;
> > +        rp->base_addr = start;
> > +        rp->size = size;
> > +        DEBUG("region %d size %d start 0x%x type %d resource_fd %d\n",
> > +              r, rp->size, start, rp->type, rp->resource_fd);
> > +    }
> > +    fclose(f);
> > +
> > +    dev->region_number = r;
> > +    return 0;
> > +}
> > +
> > +static int disable_iommu;
> 
> Why is this global?
> 
> The flag is set per-device on the command-line and only affects whether
> we pass KVM_DEV_ASSIGN_ENABLE_IOMMU to kvm_assign_pci_device()

Made per-device by moving it to struct AssignedDevInfo.

> > +int nr_assigned_devices;
> > +static LIST_HEAD(, AssignedDevInfo) adev_head;
> > +
> > +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn)
> > +{
> > +    return (uint32_t)bus << 8 | (uint32_t)devfn;
> > +}
> > +
> > +static AssignedDevice *register_real_device(PCIBus *e_bus,
> > +                                            const char *e_dev_name,
> > +                                            int e_devfn, uint8_t r_bus,
> > +                                            uint8_t r_dev, uint8_t r_func)
> > +{
> > +    int r;
> > +    AssignedDevice *pci_dev;
> > +    uint8_t e_device, e_intx;
> > +
> > +    DEBUG("Registering real physical device %s (devfn=0x%x)\n",
> > +          e_dev_name, e_devfn);
> > +
> > +    pci_dev = (AssignedDevice *)
> > +        pci_register_device(e_bus, e_dev_name, sizeof(AssignedDevice),
> > +                            e_devfn, assigned_dev_pci_read_config,
> > +                            assigned_dev_pci_write_config);
> > +    if (NULL == pci_dev) {
> > +        fprintf(stderr, "%s: Error: Couldn't register real device %s\n",
> > +                __func__, e_dev_name);
> > +        return NULL;
> > +    }
> > +    if (get_real_device(pci_dev, r_bus, r_dev, r_func)) {
> > +        fprintf(stderr, "%s: Error: Couldn't get real device (%s)!\n",
> > +                __func__, e_dev_name);
> > +        goto out;
> > +    }
> > +
> > +    /* handle real device's MMIO/PIO BARs */
> > +    if (assigned_dev_register_regions(pci_dev->real_device.regions,
> > +                                      pci_dev->real_device.region_number,
> > +                                      pci_dev))
> > +        goto out;
> > +
> > +    /* handle interrupt routing */
> > +    e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
> > +    e_intx = pci_dev->dev.config[0x3d] - 1;
> > +    pci_dev->intpin = e_intx;
> > +    pci_dev->run = 0;
> > +    pci_dev->girq = 0;
> > +    pci_dev->h_busnr = r_bus;
> > +    pci_dev->h_devfn = PCI_DEVFN(r_dev, r_func);
> > +
> > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> > +    if (kvm_enabled()) {
> > +        struct kvm_assigned_pci_dev assigned_dev_data;
> > +
> > +        memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
> > +        assigned_dev_data.assigned_dev_id  =
> > +            calc_assigned_dev_id(pci_dev->h_busnr,
> > +                                 (uint32_t)pci_dev->h_devfn);
> > +        assigned_dev_data.busnr = pci_dev->h_busnr;
> > +        assigned_dev_data.devfn = pci_dev->h_devfn;
> > +
> > +#ifdef KVM_CAP_IOMMU
> > +        /* We always enable the IOMMU if present
> > +         * (or when not disabled on the command line)
> > +         */
> > +        r = kvm_check_extension(kvm_context, KVM_CAP_IOMMU);
> > +        if (r && !disable_iommu)
> > +            assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
> > +#endif
> > +        r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
> > +        if (r < 0) {
> > +            fprintf(stderr, "Could not notify kernel about "
> > +                "assigned device \"%s\"\n", e_dev_name);
> > +            perror("register_real_device");
> > +            goto out;
> > +        }
> > +    }
> > +#endif
> > +    term_printf("Registered host PCI device %02x:%02x.%1x "
> > +           "(\"%s\") as guest device %02x:%02x.%1x\n",
> > +           r_bus, r_dev, r_func, e_dev_name,
> > +           pci_bus_num(e_bus), e_device, r_func);
> > +
> > +    return pci_dev;
> > +out:
> > +/*    pci_unregister_device(&pci_dev->dev); */
> > +    return NULL;
> > +}
> > +
> > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> > +/* The pci config space got updated. Check if irq numbers have changed
> > + * for our devices
> > + */
> > +void assigned_dev_update_irq(PCIDevice *d)
> > +{
> > +    int irq, r;
> > +    AssignedDevice *assigned_dev;
> > +    AssignedDevInfo *adev;
> > +
> > +    LIST_FOREACH(adev, &adev_head, next) {
> > +        assigned_dev = adev->assigned_dev;
> > +        irq = pci_map_irq(&assigned_dev->dev, assigned_dev->intpin);
> > +        irq = piix_get_irq(irq);
> > +
> > +        if (irq != assigned_dev->girq) {
> > +            struct kvm_assigned_irq assigned_irq_data;
> > +
> > +            memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
> > +            assigned_irq_data.assigned_dev_id  =
> > +                calc_assigned_dev_id(assigned_dev->h_busnr,
> > +                                     (uint8_t) assigned_dev->h_devfn);
> > +            assigned_irq_data.guest_irq = irq;
> > +            assigned_irq_data.host_irq = assigned_dev->real_device.irq;
> > +            r = kvm_assign_irq(kvm_context, &assigned_irq_data);
> > +            if (r < 0) {
> > +                perror("assigned_dev_update_irq");
> > +                fprintf(stderr, "Are you assigning a device "
> > +                        "that shares IRQ with some other device?\n");
> > +                pci_unregister_device(&assigned_dev->dev);
> > +                /* FIXME: Delete node from list */
> > +                continue;
> > +            }
> > +            assigned_dev->girq = irq;
> > +        }
> > +    }
> > +}
> > +#endif
> > +
> > +struct PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus)
> > +{
> > +    adev->assigned_dev = register_real_device(bus,
> > +                                              adev->name, -1,
> > +                                              adev->bus,
> > +                                              adev->dev,
> > +                                              adev->func);
> > +    return &adev->assigned_dev->dev;
> > +}
> 
> This looks unnecessary - register_real_device() isn't used anywhere
> else.
> 
> Why not just move register_real_device() into init_assigned_device()
> ?

Done.

> 
> > +int init_all_assigned_devices(PCIBus *bus)
> > +{
> > +    struct AssignedDevInfo *adev;
> > +
> > +    LIST_FOREACH(adev, &adev_head, next)
> > +        if (init_assigned_device(adev, bus) == NULL)
> > +            return -1;
> > +
> > +    return 0;
> > +}
> > +
> > +/*
> > + * Syntax to assign device:
> > + *
> > + * -pcidevice dev=bus:dev.func,dma=dma
>                  ^^^                  ^^^
> 
> Should be:
> 
>   -pcidevice host=bus:dev.func[,dma=none][,name=string]

Fixed (as well as the other -pcidevice bogosities you pointed).

> 
> > + *
> > + * Example:
> > + * -pcidevice host=00:13.0,dma=pvdma
>                                   ^^^^^
> 
> Should be:
> 
>   -pcidevice host=00:13.0,dma=none,name=Foo
>   
> > + *
> > + * dma can currently only be 'none' to disable iommu support.
> > + */
> > +AssignedDevInfo *add_assigned_device(const char *arg)
> > +{
> > +    char *cp, *cp1;
> > +    char device[8];
> > +    char dma[6];
> > +    int r;
> > +    AssignedDevInfo *adev;
> > +
> > +    adev = qemu_mallocz(sizeof(AssignedDevInfo));
> > +    if (adev == NULL) {
> > +        fprintf(stderr, "%s: Out of memory\n", __func__);
> > +        return NULL;
> > +    }
> > +    r = get_param_value(device, sizeof(device), "host", arg);
> > +    r = get_param_value(adev->name, sizeof(adev->name), "name", arg);
> > +    if (!r)
> > +   snprintf(adev->name, sizeof(adev->name), "%s", device);
> > +
> > +#ifdef KVM_CAP_IOMMU
> > +    r = get_param_value(dma, sizeof(dma), "dma", arg);
> > +    if (r && !strncmp(dma, "none", 4))
> > +        disable_iommu = 1;
> > +#endif
> > +    cp = device;
> > +    adev->bus = strtoul(cp, &cp1, 16);
> > +    if (*cp1 != ':')
> > +        goto bad;
> > +    cp = cp1 + 1;
> > +
> > +    adev->dev = strtoul(cp, &cp1, 16);
> > +    if (*cp1 != '.')
> > +        goto bad;
> > +    cp = cp1 + 1;
> > +
> > +    adev->func = strtoul(cp, &cp1, 16);
> > +
> > +    nr_assigned_devices++;
> 
> nr_assigned_devices isn't actually used anywhere.

Nuked.

> 
> > +    LIST_INSERT_HEAD(&adev_head, adev, next);
> > +    return adev;
> > +bad:
> > +    fprintf(stderr, "pcidevice argument parse error; "
> > +            "please check the help text for usage\n");
> > +    qemu_free(adev);
> > +    return NULL;
> > +}
> > diff --git a/qemu/hw/device-assignment.h b/qemu/hw/device-assignment.h
> > new file mode 100644
> > index 0000000..ebc0b50
> > --- /dev/null
> > +++ b/qemu/hw/device-assignment.h
> > @@ -0,0 +1,117 @@
> > +/*
> > + * Copyright (c) 2007, Neocleus Corporation.
> > + * Copyright (c) 2007, Intel Corporation.
> > + *
> > + * This program is free software; you can redistribute it and/or modify it
> > + * under the terms and conditions of the GNU General Public License,
> > + * version 2, as published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope it will be useful, but WITHOUT
> > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> > + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
> > for
> > + * more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along 
> > with
> > + * this program; if not, write to the Free Software Foundation, Inc., 59 
> > Temple
> > + * Place - Suite 330, Boston, MA 02111-1307 USA.
> > + *
> > + *  Data structures for storing PCI state
> > + *
> > + *  Adapted to kvm by Qumranet
> > + *
> > + *  Copyright (c) 2007, Neocleus, Alex Novik ([EMAIL PROTECTED])
> > + *  Copyright (c) 2007, Neocleus, Guy Zana ([EMAIL PROTECTED])
> > + *  Copyright (C) 2008, Qumranet, Amit Shah ([EMAIL PROTECTED])
> > + *  Copyright (C) 2008, Red Hat, Amit Shah ([EMAIL PROTECTED])
> > + */
> > +
> > +#ifndef __DEVICE_ASSIGNMENT_H__
> > +#define __DEVICE_ASSIGNMENT_H__
> > +
> > +#include <sys/mman.h>
> > +#include "qemu-common.h"
> > +#include "sys-queue.h"
> > +#include "pci.h"
> > +
> > +/* From include/linux/pci.h in the kernel sources */
> > +#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
> > +
> > +#define MAX_IO_REGIONS (6)
> 
> Perhaps a comment to say this is the number of BARs in the config space
> header?

Sure, comments are cheap.

> 
> > +typedef struct {
> > +    int type;           /* Memory or port I/O */
> > +    int valid;
> > +    uint32_t base_addr;
> > +    uint32_t size;    /* size of the region */
> > +    int resource_fd;
> > +} PCIRegion;
> > +
> > +typedef struct {
> > +    uint8_t bus, dev, func; /* Bus inside domain, device and function */
> > +    int irq;                /* IRQ number */
> > +    uint16_t region_number; /* number of active regions */
> > +
> > +    /* Port I/O or MMIO Regions */
> > +    PCIRegion regions[MAX_IO_REGIONS];
> > +    int config_fd;
> > +} PCIDevRegions;
> > +
> > +typedef struct {
> > +    target_phys_addr_t e_physbase;
> > +    uint32_t memory_index;
> > +    union {
> > +        void *r_virtbase;    /* mmapped access address for memory regions 
> > */
> > +        uint32_t r_baseport; /* the base guest port for I/O regions */
> > +    } u;
> > +    int num;            /* our index within v_addrs[] */
> > +    uint32_t e_size;    /* emulated size of region in bytes */
> > +    uint32_t r_size;    /* real size of region in bytes */
> > +} AssignedDevRegion;
> > +
> > +typedef struct {
> > +    PCIDevice dev;
> > +    int intpin;
> > +    uint8_t debug_flags;
> > +    AssignedDevRegion v_addrs[PCI_NUM_REGIONS];
> > +    PCIDevRegions real_device;
> > +    int run;
> > +    int girq;
> > +    unsigned char h_busnr;
> > +    unsigned int h_devfn;
> > +    int bound;
> > +} AssignedDevice;
> > +
> > +typedef struct AssignedDevInfo AssignedDevInfo;
> > +
> > +struct AssignedDevInfo {
> > +    char name[15];
> > +    int bus;
> > +    int dev;
> > +    int func;
> > +    AssignedDevice *assigned_dev;
> > +    LIST_ENTRY(AssignedDevInfo) next;
> > +};
> > +
> > +PCIDevice *init_assigned_device(AssignedDevInfo *adev, PCIBus *bus);
> > +AssignedDevInfo *add_assigned_device(const char *arg);
> > +void assigned_dev_set_vector(int irq, int vector);
> > +void assigned_dev_ack_mirq(int vector);
> > +
> > +
> > +#ifdef USE_KVM
> > +int init_all_assigned_devices(PCIBus *bus);
> > +#else /* not using kvm */
> > +static inline int init_all_assigned_devices(PCIBus *bus)
> > +{
> > +    return 0;
> > +}
> > +#endif /* !USE_KVM */
> > +
> > +
> > +#define MAX_DEV_ASSIGN_CMDLINE 8
> > +
> > +extern int device_assignment_enabled;
> 
> > +extern const char *assigned_devices[MAX_DEV_ASSIGN_CMDLINE];
> > +extern int assigned_devices_index;
> 
> Neither of these two are implemented anywhere.

Actually they are, we use them to pass the arguments from main to
pc.c.

> 
> > +#endif              /* __DEVICE_ASSIGNMENT_H__ */
> > diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
> > index d559f0c..5fdb726 100644
> > --- a/qemu/hw/pc.c
> > +++ b/qemu/hw/pc.c
> > @@ -33,6 +33,7 @@
> >  #include "boards.h"
> >  #include "console.h"
> >  #include "fw_cfg.h"
> > +#include "device-assignment.h"
> >  
> >  #include "qemu-kvm.h"
> >  
> > @@ -1157,6 +1158,21 @@ static void pc_init1(ram_addr_t ram_size, int 
> > vga_ram_size,
> >  
> >      if (pci_enabled)
> >          virtio_balloon_init(pci_bus);
> > +
> > +    if (kvm_enabled() && device_assignment_enabled) {
> 
> The device_assignment_enabled flag looks like it shouldn't be needed.
> 
> If assigned_devices_index remains zero, nothing should happen
> anyway.

Nuked.

> 
> > +   int i;
> > +        for (i = 0; i < assigned_devices_index; i++) {
> > +            if (add_assigned_device(assigned_devices[i]) < 0) {
> > +                fprintf(stderr, "Warning: could not add assigned device 
> > %s\n",
> > +                        assigned_devices[i]);
> > +            }
> > +        }
> > +
> > +   if (init_all_assigned_devices(pci_bus)) {
> > +       fprintf(stderr, "Failed to initialize assigned devices\n");
> > +       exit (1);
> > +   }
> > +    }
> >  }
> >  
> >  static void pc_init_pci(ram_addr_t ram_size, int vga_ram_size,
> > diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
> > index c82cd20..f86a8a7 100644
> > --- a/qemu/hw/pci.c
> > +++ b/qemu/hw/pci.c
> > @@ -50,6 +50,7 @@ struct PCIBus {
> >  
> >  static void pci_update_mappings(PCIDevice *d);
> >  static void pci_set_irq(void *opaque, int irq_num, int level);
> > +void assigned_dev_update_irq(PCIDevice *d);
> >  
> >  target_phys_addr_t pci_mem_base;
> >  static int pci_irq_index;
> > @@ -453,6 +454,12 @@ void pci_default_write_config(PCIDevice *d,
> >          val >>= 8;
> >      }
> >  
> > +#ifdef KVM_CAP_DEVICE_ASSIGNMENT
> > +    if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
> > +        address >= 0x60 && address <= 0x63)
> > +        assigned_dev_update_irq(d);
> > +#endif
> 
> Outside of the context of piix_pci.c, it's difficult to figure out what
> the 0x60-0x63 register range relates to - i.e. you need to know to go
> digging in the PIIX spec.
> 
> How about something like in qemu/hw/pc.h:
> 
>   +/* config space register for IRQ routing */
>   +#define PIIX_CONFIG_IRQ_ROUTE 0x60
> 
> then:
> 
>      if (kvm_enabled() && qemu_kvm_irqchip_in_kernel() &&
>          address >= PIIX_CONFIG_IRQ_ROUTE &&
>          address < PIIX_CONFIG_IRQ_ROUTE + 4)

Ok. #define's are cheap too.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
                       <->
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to