On Wed, Dec 09, 2009 at 06:38:54PM +0100, Alexander Graf wrote:
> While trying to get device passthrough working with an emulex hba, kvm
> refused to pass it through because it has a BAR of 256 bytes:
>
> Region 0: Memory at d2100000 (64-bit, non-prefetchable) [size=4K]
> Region 2: Memory at d2101000 (64-bit, non-prefetchable) [size=256]
> Region 4: I/O ports at b100 [size=256]
>
> Since the page boundary is an arbitrary optimization to allow 1:1 mapping of
> physical to virtual addresses, we can still take the old MMIO callback route.
>
> So let's add a second code path that allows for size & 0xFFF != 0 sized
> regions
> by looping it through userspace.
>
> I verified that it works by passing through an e1000 with this additional
> patch
> applied and the card acted the same way it did without this patch:
>
> map_func = assigned_dev_iomem_map;
> - if (cur_region->size & 0xFFF) {
> + if (i != PCI_ROM_SLOT){
> fprintf(stderr, "PCI region %d at address 0x%llx "
>
> Signed-off-by: Alexander Graf <[email protected]>
Good idea.
One thing I am concerned with, is that some users might
see performance degradation and not see the error message.
Maybe we can have a flag to optionally fail unless
passthrough can be fast? I'm not really sure it's worth it
to add such a flag though - what do others think?
Minor nits below.
> ---
> hw/device-assignment.c | 129 +++++++++++++++++++++++++++++++++++++++++++++--
> 1 files changed, 123 insertions(+), 6 deletions(-)
>
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 13a86bb..beb488c 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -148,6 +148,105 @@ static uint32_t assigned_dev_ioport_readl(void *opaque,
> uint32_t addr)
> return value;
> }
>
> +static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
> +{
> + AssignedDevRegion *d = opaque;
> + uint8_t *in = (uint8_t*)(d->u.r_virtbase + addr);
> + uint32_t r = -1;
> +
> + r = *in;
> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> + return r;
> +}
> +
> +static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
> +{
> + AssignedDevRegion *d = opaque;
> + uint16_t *in = (uint16_t*)(d->u.r_virtbase + addr);
> + uint32_t r = -1;
> +
> + r = *in;
> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> + return r;
> +}
> +
> +static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
> +{
> + AssignedDevRegion *d = opaque;
> + uint32_t *in = (uint32_t*)(d->u.r_virtbase + addr);
> + uint32_t r = -1;
> +
> + r = *in;
> + DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, r);
> +
> + return r;
> +}
> +
> +static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t
> val)
> +{
> + AssignedDevRegion *d = opaque;
> + uint8_t *out = (uint8_t*)(d->u.r_virtbase + addr);
> +
> + DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr,
> val);
> + *out = val;
> +}
> +
> +static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t
> val)
> +{
> + AssignedDevRegion *d = opaque;
> + uint16_t *out = (uint16_t*)(d->u.r_virtbase + addr);
> +
> + DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr,
> val);
> + *out = val;
> +}
> +
> +static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t
> val)
> +{
> + AssignedDevRegion *d = opaque;
> + uint32_t *out = (uint32_t*)(d->u.r_virtbase + addr);
> +
> + DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr,
> val);
> + *out = val;
> +}
> +
> +static CPUWriteMemoryFunc * const slow_bar_write[] = {
> + &slow_bar_writeb,
> + &slow_bar_writew,
> + &slow_bar_writel
> +};
> +
> +static CPUReadMemoryFunc * const slow_bar_read[] = {
> + &slow_bar_readb,
> + &slow_bar_readw,
> + &slow_bar_readl
> +};
> +
> +static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
> + pcibus_t e_phys, pcibus_t e_size,
> + int type)
> +{
> + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
> + AssignedDevRegion *region = &r_dev->v_addrs[region_num];
> + PCIRegion *real_region = &r_dev->real_device.regions[region_num];
> + int m;
> +
> + DEBUG("slow map\n");
> + m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region);
> + cpu_register_physical_memory(e_phys, e_size, m);
> +
> + /* MSI-X MMIO page */
> + if ((e_size > 0) &&
> + real_region->base_addr <= r_dev->msix_table_addr &&
> + real_region->base_addr + real_region->size >=
> r_dev->msix_table_addr) {
> + int offset = r_dev->msix_table_addr - real_region->base_addr;
> +
> + cpu_register_physical_memory(e_phys + offset,
> + TARGET_PAGE_SIZE, r_dev->mmio_index);
> + }
> +}
> +
> static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
> pcibus_t e_phys, pcibus_t e_size, int
> type)
> {
> @@ -381,15 +480,25 @@ static int assigned_dev_register_regions(PCIRegion
> *io_regions,
>
> /* handle memory io regions */
> if (cur_region->type & IORESOURCE_MEM) {
> + void (*map_func)(PCIDevice *, int, pcibus_t, pcibus_t, int);
> + int slow_map = 0;
This is a bit too clever for my taste. Let's just do below
pci_register_bar((PCIDevice *) pci_dev, i,
cur_region->size, t,
slow_map ? assigned_dev_iomem_map :
assigned_dev_iomem_map_slow);
(by the way, this cast to (PCIDevice *) is also unnecessary,
isn't, it?
> int t = cur_region->type & IORESOURCE_PREFETCH
> ? PCI_BASE_ADDRESS_MEM_PREFETCH
> : PCI_BASE_ADDRESS_SPACE_MEMORY;
> +
> + map_func = assigned_dev_iomem_map;
> if (cur_region->size & 0xFFF) {
> - fprintf(stderr, "Unable to assign device: PCI region %d "
> - "at address 0x%llx has size 0x%x, "
> - " which is not a multiple of 4K\n",
> + fprintf(stderr, "PCI region %d at address 0x%llx "
> + "has size 0x%x, which is not a multiple of 4K. "
> + "Using slow map\n",
Maybe replace "Using slow map" with "The device will be slow.".
> i, (unsigned long long)cur_region->base_addr,
> cur_region->size);
> + map_func = assigned_dev_iomem_map_slow;
> + slow_map = 1;
> + }
> +
> + if (slow_map && (i == PCI_ROM_SLOT)) {
> + fprintf(stderr, "ROM not aligned - can't continue\n");
> return -1;
> }
>
> @@ -402,6 +511,12 @@ static int assigned_dev_register_regions(PCIRegion
> *io_regions,
> PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE,
> 0, (off_t) 0);
>
> + } else if (slow_map) {
> + pci_dev->v_addrs[i].u.r_virtbase =
> + mmap(NULL,
> + cur_region->size,
> + PROT_WRITE | PROT_READ, MAP_SHARED,
> + cur_region->resource_fd, (off_t) 0);
Isn't this the same as the case below it?
I expect mmap to act identically for slow and fast ...
Just replace (cur_region->size + 0xFFF) & 0xFFFFF000, with
cur_region->size.
> } else {
> pci_dev->v_addrs[i].u.r_virtbase =
> mmap(NULL,
> @@ -429,12 +544,14 @@ static int assigned_dev_register_regions(PCIRegion
> *io_regions,
> pci_dev->v_addrs[i].e_size = 0;
>
> /* add offset */
> - pci_dev->v_addrs[i].u.r_virtbase +=
> - (cur_region->base_addr & 0xFFF);
> + if (!slow_map) {
> + pci_dev->v_addrs[i].u.r_virtbase +=
> + (cur_region->base_addr & 0xFFF);
> + }
>
> pci_register_bar((PCIDevice *) pci_dev, i,
> cur_region->size, t,
> - assigned_dev_iomem_map);
> + map_func);
> continue;
> }
> /* handle port io regions */
> --
> 1.6.0.2
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html