On Thu, 10 Mar 2016 01:54:45 +0100 Eric Auger <eric.au...@linaro.org> wrote:
> Hi Alex, > On 03/09/2016 08:53 PM, Alex Williamson wrote: > > Both platform and PCI vfio drivers create a "slow", I/O memory region > > with one or more mmap memory regions overlayed when supported by the > > device. Generalize this to a set of common helpers in the core that > > pulls the region info from vfio, fills the region data, configures > > slow mapping, and adds helpers for comleting the mmap, enable/disable, > > and teardown. This can be immediately used by the PCI MSI-X code, > > which needs to mmap around the MSI-X vector table. > > > > This also changes VFIORegion.mem to be dynamically allocated because > > otherwise we don't know how the caller has allocated VFIORegion and > > therefore don't know whether to unreference it to destroy the > > MemoryRegion or not. > > > > Signed-off-by: Alex Williamson <alex.william...@redhat.com> > > --- > > hw/arm/sysbus-fdt.c | 4 - > > hw/vfio/common.c | 172 > > ++++++++++++++++++++++++++++++++++------- > > hw/vfio/pci-quirks.c | 24 +++--- > > hw/vfio/pci.c | 168 > > +++++++++++++++++++++------------------- > > hw/vfio/platform.c | 72 +++-------------- > > include/hw/vfio/vfio-common.h | 23 ++++- > > trace-events | 10 ++ > > 7 files changed, 283 insertions(+), 190 deletions(-) > > > > diff --git a/hw/arm/sysbus-fdt.c b/hw/arm/sysbus-fdt.c > > index 04afeae..49bd212 100644 > > --- a/hw/arm/sysbus-fdt.c > > +++ b/hw/arm/sysbus-fdt.c > > @@ -240,7 +240,7 @@ static int > > add_calxeda_midway_xgmac_fdt_node(SysBusDevice *sbdev, void *opaque) > > mmio_base = platform_bus_get_mmio_addr(pbus, sbdev, i); > > reg_attr[2 * i] = cpu_to_be32(mmio_base); > > reg_attr[2 * i + 1] = cpu_to_be32( > > - > > memory_region_size(&vdev->regions[i]->mem)); > > + memory_region_size(vdev->regions[i]->mem)); > > } > > qemu_fdt_setprop(fdt, nodename, "reg", reg_attr, > > vbasedev->num_regions * 2 * sizeof(uint32_t)); > > @@ -374,7 +374,7 @@ static int add_amd_xgbe_fdt_node(SysBusDevice *sbdev, > > void *opaque) > > mmio_base = platform_bus_get_mmio_addr(pbus, sbdev, i); > > reg_attr[2 * i] = cpu_to_be32(mmio_base); > > reg_attr[2 * i + 1] = cpu_to_be32( > > - > > memory_region_size(&vdev->regions[i]->mem)); > > + memory_region_size(vdev->regions[i]->mem)); > > } > > qemu_fdt_setprop(guest_fdt, nodename, "reg", reg_attr, > > vbasedev->num_regions * 2 * sizeof(uint32_t)); > > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > > index e20fc4f..96ccb79 100644 > > --- a/hw/vfio/common.c > > +++ b/hw/vfio/common.c > > @@ -493,46 +493,162 @@ static void vfio_listener_release(VFIOContainer > > *container) > > memory_listener_unregister(&container->listener); > > } > > > > -int vfio_mmap_region(Object *obj, VFIORegion *region, > > - MemoryRegion *mem, MemoryRegion *submem, > > - void **map, size_t size, off_t offset, > > - const char *name) > > +int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion > > *region, > > + int index, const char *name) > > { > > - int ret = 0; > > - VFIODevice *vbasedev = region->vbasedev; > > + struct vfio_region_info *info; > > + int ret; > > + > > + ret = vfio_get_region_info(vbasedev, index, &info); > > + if (ret) { > > + return ret; > > + } > > + > > + region->vbasedev = vbasedev; > > + region->flags = info->flags; > > + region->size = info->size; > > + region->fd_offset = info->offset; > > + region->nr = index; > > > > - if (!vbasedev->no_mmap && size && region->flags & > > - VFIO_REGION_INFO_FLAG_MMAP) { > > - int prot = 0; > > + if (region->size) { > > + region->mem = g_new0(MemoryRegion, 1); > > + memory_region_init_io(region->mem, obj, &vfio_region_ops, > > + region, name, region->size); > > > > - if (region->flags & VFIO_REGION_INFO_FLAG_READ) { > > - prot |= PROT_READ; > > + if (!vbasedev->no_mmap && > > + region->flags & VFIO_REGION_INFO_FLAG_MMAP && > > + !(region->size & ~qemu_real_host_page_mask)) { > > + > > + region->nr_mmaps = 1; > > + region->mmaps = g_new0(VFIOMmap, region->nr_mmaps); > > + > > + region->mmaps[0].offset = 0; > > + region->mmaps[0].size = region->size; > > } > > + } > > + > > + g_free(info); > > + > > + trace_vfio_region_setup(vbasedev->name, index, name, > > + region->flags, region->fd_offset, > > region->size); > > + return 0; > > +} > > > > - if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) { > > - prot |= PROT_WRITE; > > +int vfio_region_mmap(VFIORegion *region) > > +{ > > + int i, prot = 0; > > + char *name; > > + > > + if (!region->mem) { > > + return 0; > > + } > > + > > + prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0; > > + prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0; > > + > > + for (i = 0; i < region->nr_mmaps; i++) { > > + region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot, > > + MAP_SHARED, region->vbasedev->fd, > > + region->fd_offset + > > + region->mmaps[i].offset); > > + if (region->mmaps[i].mmap == MAP_FAILED) { > > + int ret = -errno; > > + > > + trace_vfio_region_mmap_fault(memory_region_name(region->mem), > > i, > > + region->fd_offset + > > + region->mmaps[i].offset, > > + region->fd_offset + > > + region->mmaps[i].offset + > > + region->mmaps[i].size - 1, ret); > > + > > + region->mmaps[i].mmap = NULL; > > + > > + for (i--; i >= 0; i--) { > > + memory_region_del_subregion(region->mem, > > ®ion->mmaps[i].mem); > > + munmap(region->mmaps[i].mmap, region->mmaps[i].size); > > + object_unparent(OBJECT(®ion->mmaps[i].mem)); > > + region->mmaps[i].mmap = NULL; > > + } > > + > > + return ret; > > } > > > > - *map = mmap(NULL, size, prot, MAP_SHARED, > > - vbasedev->fd, > > - region->fd_offset + offset); > > - if (*map == MAP_FAILED) { > > - *map = NULL; > > - ret = -errno; > > - goto empty_region; > > + name = g_strdup_printf("%s mmaps[%d]", > > + memory_region_name(region->mem), i); > > + memory_region_init_ram_ptr(®ion->mmaps[i].mem, > > + memory_region_owner(region->mem), > > + name, region->mmaps[i].size, > > + region->mmaps[i].mmap); > > + g_free(name); > > + memory_region_set_skip_dump(®ion->mmaps[i].mem); > > + memory_region_add_subregion(region->mem, region->mmaps[i].offset, > > + ®ion->mmaps[i].mem); > > + > > + trace_vfio_region_mmap(memory_region_name(®ion->mmaps[i].mem), > > + region->mmaps[i].offset, > > + region->mmaps[i].offset + > > + region->mmaps[i].size - 1); > > + } > > + > > + return 0; > > +} > > + > > +void vfio_region_exit(VFIORegion *region) > > +{ > > + int i; > > + > > + if (!region->mem) { > > + return; > > + } > > + > > + for (i = 0; i < region->nr_mmaps; i++) { > > + if (region->mmaps[i].mmap) { > > + memory_region_del_subregion(region->mem, > > ®ion->mmaps[i].mem); > > } > > + } > > > > - memory_region_init_ram_ptr(submem, obj, name, size, *map); > > - memory_region_set_skip_dump(submem); > > - } else { > > -empty_region: > > - /* Create a zero sized sub-region to make cleanup easy. */ > > - memory_region_init(submem, obj, name, 0); > > + trace_vfio_region_exit(region->vbasedev->name, region->nr); > > +} > > + > > +void vfio_region_finalize(VFIORegion *region) > > +{ > > + int i; > > + > > + if (!region->mem) { > > + return; > > } > > > > - memory_region_add_subregion(mem, offset, submem); > > + for (i = 0; i < region->nr_mmaps; i++) { > > + if (region->mmaps[i].mmap) { > > + munmap(region->mmaps[i].mmap, region->mmaps[i].size); > > + object_unparent(OBJECT(®ion->mmaps[i].mem)); > > + } > > + } > > > > - return ret; > > + object_unparent(OBJECT(region->mem)); > > + > > + g_free(region->mem); > > + g_free(region->mmaps); > > + > > + trace_vfio_region_finalize(region->vbasedev->name, region->nr); > > +} > > + > > +void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled) > > +{ > > + int i; > > + > > + if (!region->mem) { > > + return; > > + } > > + > > + for (i = 0; i < region->nr_mmaps; i++) { > > + if (region->mmaps[i].mmap) { > > + memory_region_set_enabled(®ion->mmaps[i].mem, enabled); > > + } > > + } > > + > > + trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem), > > + enabled); > > } > > > > void vfio_reset_handler(void *opaque) > > diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c > > index 4815527..d626ec9 100644 > > --- a/hw/vfio/pci-quirks.c > > +++ b/hw/vfio/pci-quirks.c > > @@ -337,14 +337,14 @@ static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice > > *vdev, int nr) > > memory_region_init_io(window->addr_mem, OBJECT(vdev), > > &vfio_generic_window_address_quirk, window, > > "vfio-ati-bar4-window-address-quirk", 4); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > window->address_offset, > > window->addr_mem, 1); > > > > memory_region_init_io(window->data_mem, OBJECT(vdev), > > &vfio_generic_window_data_quirk, window, > > "vfio-ati-bar4-window-data-quirk", 4); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > window->data_offset, > > window->data_mem, 1); > > > > @@ -378,7 +378,7 @@ static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice > > *vdev, int nr) > > memory_region_init_io(mirror->mem, OBJECT(vdev), > > &vfio_generic_mirror_quirk, mirror, > > "vfio-ati-bar2-4000-quirk", > > PCI_CONFIG_SPACE_SIZE); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > mirror->offset, mirror->mem, 1); > > > > QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); > > @@ -683,7 +683,7 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice > > *vdev, int nr) > > memory_region_init_io(window->addr_mem, OBJECT(vdev), > > &vfio_generic_window_address_quirk, window, > > "vfio-nvidia-bar5-window-address-quirk", 4); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > window->address_offset, > > window->addr_mem, 1); > > memory_region_set_enabled(window->addr_mem, false); > > @@ -691,7 +691,7 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice > > *vdev, int nr) > > memory_region_init_io(window->data_mem, OBJECT(vdev), > > &vfio_generic_window_data_quirk, window, > > "vfio-nvidia-bar5-window-data-quirk", 4); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > window->data_offset, > > window->data_mem, 1); > > memory_region_set_enabled(window->data_mem, false); > > @@ -699,13 +699,13 @@ static void > > vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) > > memory_region_init_io(&quirk->mem[2], OBJECT(vdev), > > &vfio_nvidia_bar5_quirk_master, bar5, > > "vfio-nvidia-bar5-master-quirk", 4); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > 0, &quirk->mem[2], 1); > > > > memory_region_init_io(&quirk->mem[3], OBJECT(vdev), > > &vfio_nvidia_bar5_quirk_enable, bar5, > > "vfio-nvidia-bar5-enable-quirk", 4); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > 4, &quirk->mem[3], 1); > > > > QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); > > @@ -767,7 +767,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice > > *vdev, int nr) > > &vfio_nvidia_mirror_quirk, mirror, > > "vfio-nvidia-bar0-88000-mirror-quirk", > > vdev->config_size); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > mirror->offset, mirror->mem, 1); > > > > QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); > > @@ -786,7 +786,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice > > *vdev, int nr) > > &vfio_nvidia_mirror_quirk, mirror, > > "vfio-nvidia-bar0-1800-mirror-quirk", > > PCI_CONFIG_SPACE_SIZE); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > mirror->offset, mirror->mem, > > 1); > > > > QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); > > @@ -947,13 +947,13 @@ static void > > vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr) > > memory_region_init_io(&quirk->mem[0], OBJECT(vdev), > > &vfio_rtl_address_quirk, rtl, > > "vfio-rtl8168-window-address-quirk", 4); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > 0x74, &quirk->mem[0], 1); > > > > memory_region_init_io(&quirk->mem[1], OBJECT(vdev), > > &vfio_rtl_data_quirk, rtl, > > "vfio-rtl8168-window-data-quirk", 4); > > - memory_region_add_subregion_overlap(&vdev->bars[nr].region.mem, > > + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, > > 0x70, &quirk->mem[1], 1); > > > > QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); > > @@ -1020,7 +1020,7 @@ void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int > > nr) > > > > QLIST_FOREACH(quirk, &bar->quirks, next) { > > for (i = 0; i < quirk->nr_mem; i++) { > > - memory_region_del_subregion(&bar->region.mem, &quirk->mem[i]); > > + memory_region_del_subregion(bar->region.mem, &quirk->mem[i]); > > } > > } > > } > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > index db7a950..f18a678 100644 > > --- a/hw/vfio/pci.c > > +++ b/hw/vfio/pci.c > > @@ -1166,6 +1166,74 @@ static int vfio_msi_setup(VFIOPCIDevice *vdev, int > > pos) > > return 0; > > } > > > > +static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev) > > +{ > > + off_t start, end; > > + VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region; > > + > > + /* > > + * We expect to find a single mmap covering the whole BAR, anything > > else > > + * means it's either unsupported or already setup. > > + */ > > + if (region->nr_mmaps != 1 || region->mmaps[0].offset || > > + region->size != region->mmaps[0].size) { > > + return; > > + } > > + > > + /* MSI-X table start and end aligned to host page size */ > > + start = vdev->msix->table_offset & qemu_real_host_page_mask; > > + end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + > > + (vdev->msix->entries * > > PCI_MSIX_ENTRY_SIZE)); > > + > > + /* > > + * Does the MSI-X table cover the beginning of the BAR? The whole BAR? > > + * NB - Host page size is necessarily a power of two and so is the PCI > > + * BAR (not counting EA yet), therefore if we have host page aligned > > + * @start and @end, then any remainder of the BAR before or after those > > + * must be at least host page sized and therefore mmap'able. > > + */ > > + if (!start) { > > + if (end >= region->size) { > > + region->nr_mmaps = 0; > > + g_free(region->mmaps); > > + region->mmaps = NULL; > > + trace_vfio_msix_fixup(vdev->vbasedev.name, > > + vdev->msix->table_bar, 0, 0); > > + } else { > > + region->mmaps[0].offset = end; > > + region->mmaps[0].size = region->size - end; > > + trace_vfio_msix_fixup(vdev->vbasedev.name, > > + vdev->msix->table_bar, > > region->mmaps[0].offset, > > + region->mmaps[0].offset + > > region->mmaps[0].size); > Sorry this does not compile for me on arm 32b: > > ./trace/generated-tracers.h:16113:23: error: format ‘%lx’ expects > argument of type ‘long unsigned int’, but argument 8 has type ‘off_t’ > [-Werror=format=] , name, bar, offset, size); > > -> vfio_msix_fixup(const char *name, int bar, off_t start, off_t end) " > (%s) MSI-X region %d mmap fixup [0x%"PRIx64" - 0x%"PRIx64"]" ? Thanks Eric, that's exactly the right fix: diff --git a/trace-events b/trace-events index e36bc07..fd07512 100644 --- a/trace-events +++ b/trace-events @@ -1652,7 +1652,7 @@ vfio_msix_enable(const char *name) " (%s)" vfio_msix_pba_disable(const char *name) " (%s)" vfio_msix_pba_enable(const char *name) " (%s)" vfio_msix_disable(const char *name) " (%s)" -vfio_msix_fixup(const char *name, int bar, off_t offset, size_t size) " (%s) MSI-X region %d mmap fixup [0x%lx - 0x%lx]" +vfio_msix_fixup(const char *name, int bar, off_t start, off_t end) " (%s) MSI-X region %d mmap fixup [0x%"PRIx64" - 0x%"PRIx64"]" vfio_msi_enable(const char *name, int nr_vectors) " (%s) Enabled %d MSI vectors" vfio_msi_disable(const char *name) " (%s)" vfio_pci_load_rom(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s ROM:\n size: 0x%lx, offset: 0x%lx, flags: 0x%lx"