For vfio-user, device operations such as IRQ handling and region read/writes are implemented in userspace over the control socket, not ioctl() or read()/write() to the vfio kernel driver; add an ops vector to generalize this, and implement vfio_device_io_ops_ioctl for interacting with the kernel vfio driver.
Originally-by: John Johnson <john.g.john...@oracle.com> Signed-off-by: Elena Ufimtseva <elena.ufimts...@oracle.com> Signed-off-by: Jagannathan Raman <jag.ra...@oracle.com> Signed-off-by: John Levon <john.le...@nutanix.com> --- hw/vfio/ap.c | 2 +- hw/vfio/ccw.c | 2 +- hw/vfio/container-base.c | 6 +- hw/vfio/device.c | 102 ++++++++++++++++++++++++++++++---- hw/vfio/listener.c | 13 +++-- hw/vfio/pci.c | 40 +++++++------ hw/vfio/platform.c | 2 +- hw/vfio/region.c | 17 ++++-- include/hw/vfio/vfio-device.h | 24 +++++++- 9 files changed, 155 insertions(+), 53 deletions(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index f311bca5b6..b6233b2107 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -229,7 +229,7 @@ static void vfio_ap_instance_init(Object *obj) * handle ram_block_discard_disable(). */ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_AP, &vfio_ap_ops, - DEVICE(vapdev), true); + &vfio_device_io_ops_ioctl, DEVICE(vapdev), true); /* AP device is mdev type device */ vbasedev->mdev = true; diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 14dee7cd19..aee52b5a8d 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -676,7 +676,7 @@ static void vfio_ccw_instance_init(Object *obj) * ram_block_discard_disable(). */ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_CCW, &vfio_ccw_ops, - DEVICE(vcdev), true); + &vfio_device_io_ops_ioctl, DEVICE(vcdev), true); } #ifdef CONFIG_IOMMUFD diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 613fe1a00d..16fe5f79d2 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -198,11 +198,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { - return -errno; - } - - return 0; + return vbasedev->io_ops->device_feature(vbasedev, feature); } static int vfio_container_iommu_query_dirty_bitmap(const VFIOContainerBase *bcontainer, diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 102fa5a9b4..545d9f1faf 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -82,7 +82,7 @@ void vfio_device_irq_disable(VFIODevice *vbasedev, int index) .count = 0, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); } void vfio_device_irq_unmask(VFIODevice *vbasedev, int index) @@ -95,7 +95,7 @@ void vfio_device_irq_unmask(VFIODevice *vbasedev, int index) .count = 1, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); } void vfio_device_irq_mask(VFIODevice *vbasedev, int index) @@ -108,7 +108,7 @@ void vfio_device_irq_mask(VFIODevice *vbasedev, int index) .count = 1, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); } static inline const char *action_to_str(int action) @@ -155,6 +155,7 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex int argsz; const char *name; int32_t *pfd; + int ret; argsz = sizeof(*irq_set) + sizeof(*pfd); @@ -167,7 +168,9 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex pfd = (int32_t *)&irq_set->data; *pfd = fd; - if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) { + ret = vbasedev->io_ops->set_irqs(vbasedev, irq_set); + + if (!ret) { return true; } @@ -188,22 +191,19 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex int vfio_device_get_irq_info(VFIODevice *vbasedev, int index, struct vfio_irq_info *info) { - int ret; - memset(info, 0, sizeof(*info)); info->argsz = sizeof(*info); info->index = index; - ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info); - - return ret < 0 ? -errno : ret; + return vbasedev->io_ops->get_irq_info(vbasedev, info); } int vfio_device_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info) { size_t argsz = sizeof(struct vfio_region_info); + int ret; /* create region info cache */ if (vbasedev->reginfo == NULL) { @@ -222,10 +222,11 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index, retry: (*info)->argsz = argsz; - if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { + ret = vbasedev->io_ops->get_region_info(vbasedev, *info); + if (ret != 0) { g_free(*info); *info = NULL; - return -errno; + return ret; } if ((*info)->argsz > argsz) { @@ -332,10 +333,12 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) } void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, - DeviceState *dev, bool ram_discard) + VFIODeviceIOOps *io_ops, DeviceState *dev, + bool ram_discard) { vbasedev->type = type; vbasedev->ops = ops; + vbasedev->io_ops = io_ops; vbasedev->dev = dev; vbasedev->fd = -1; @@ -463,3 +466,78 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, vfio_device_get_all_region_info(vbasedev); } + +/* + * Traditional ioctl() based io + */ + +static int vfio_device_io_device_feature(VFIODevice *vbasedev, + struct vfio_device_feature *feature) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_get_region_info(VFIODevice *vbasedev, + struct vfio_region_info *info) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_get_irq_info(VFIODevice *vbasedev, + struct vfio_irq_info *info) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_set_irqs(VFIODevice *vbasedev, + struct vfio_irq_set *irqs) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index, + off_t off, uint32_t size, void *data) +{ + struct vfio_region_info *info = vbasedev->reginfo[index]; + int ret; + + ret = pread(vbasedev->fd, data, size, info->offset + off); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index, + off_t off, uint32_t size, void *data) +{ + struct vfio_region_info *info = vbasedev->reginfo[index]; + int ret; + + ret = pwrite(vbasedev->fd, data, size, info->offset + off); + + return ret < 0 ? -errno : ret; +} + +VFIODeviceIOOps vfio_device_io_ops_ioctl = { + .device_feature = vfio_device_io_device_feature, + .get_region_info = vfio_device_io_get_region_info, + .get_irq_info = vfio_device_io_get_irq_info, + .set_irqs = vfio_device_io_set_irqs, + .region_read = vfio_device_io_region_read, + .region_write = vfio_device_io_region_write, +}; diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index bcf2b98e79..7ea9e0dfb7 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -821,13 +821,17 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + int ret; + if (!vbasedev->dirty_tracking) { continue; } - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + ret = vbasedev->io_ops->device_feature(vbasedev, feature); + + if (ret != 0) { warn_report("%s: Failed to stop DMA logging, err %d (%s)", - vbasedev->name, -errno, strerror(errno)); + vbasedev->name, -ret, strerror(-ret)); } vbasedev->dirty_tracking = false; } @@ -928,10 +932,9 @@ static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, continue; } - ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + ret = vbasedev->io_ops->device_feature(vbasedev, feature); if (ret) { - ret = -errno; - error_setg_errno(errp, errno, "%s: Failed to start DMA logging", + error_setg_errno(errp, -ret, "%s: Failed to start DMA logging", vbasedev->name); goto out; } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b40d5abdfd..ff2b15ff02 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -381,7 +381,7 @@ static void vfio_msi_interrupt(void *opaque) static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) { g_autofree struct vfio_irq_set *irq_set = NULL; - int ret = 0, argsz; + int argsz; int32_t *fd; argsz = sizeof(*irq_set) + sizeof(*fd); @@ -396,9 +396,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) fd = (int32_t *)&irq_set->data; *fd = -1; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); - - return ret < 0 ? -errno : ret; + return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set); } static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) @@ -455,11 +453,11 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) fds[i] = fd; } - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set); g_free(irq_set); - return ret < 0 ? -errno : ret; + return ret; } static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, @@ -917,18 +915,22 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) memset(vdev->rom, 0xff, size); while (size) { - bytes = pread(vbasedev->fd, vdev->rom + off, - size, vdev->rom_offset + off); + bytes = vbasedev->io_ops->region_read(vbasedev, + VFIO_PCI_ROM_REGION_INDEX, + off, size, vdev->rom + off); + if (bytes == 0) { break; } else if (bytes > 0) { off += bytes; size -= bytes; } else { - if (errno == EINTR || errno == EAGAIN) { + if (bytes == -EINTR || bytes == -EAGAIN) { continue; } - error_report("vfio: Error reading device ROM: %m"); + error_report("vfio: Error reading device ROM: %s", + strerror(-bytes)); + break; } } @@ -968,22 +970,18 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset, uint32_t size, void *data) { - ssize_t ret; - - ret = pread(vdev->vbasedev.fd, data, size, vdev->config_offset + offset); - - return ret < 0 ? -errno : (int)ret; + return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, + offset, size, data); } /* "Raw" write of underlying config space. */ static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset, uint32_t size, void *data) { - ssize_t ret; - - ret = pwrite(vdev->vbasedev.fd, data, size, vdev->config_offset + offset); - - return ret < 0 ? -errno : (int)ret; + return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, + offset, size, data); } static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) @@ -3405,7 +3403,7 @@ static void vfio_instance_init(Object *obj) vdev->host.function = ~0U; vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops, - DEVICE(vdev), false); + &vfio_device_io_ops_ioctl, DEVICE(vdev), false); vdev->nv_gpudirect_clique = 0xFF; diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index fd176c18a4..28eedfa571 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -650,7 +650,7 @@ static void vfio_platform_instance_init(Object *obj) VFIODevice *vbasedev = &vdev->vbasedev; vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PLATFORM, &vfio_platform_ops, - DEVICE(vdev), false); + &vfio_device_io_ops_ioctl, DEVICE(vdev), false); } #ifdef CONFIG_IOMMUFD diff --git a/hw/vfio/region.c b/hw/vfio/region.c index ef2630cac3..35fb81c04a 100644 --- a/hw/vfio/region.c +++ b/hw/vfio/region.c @@ -45,6 +45,7 @@ void vfio_region_write(void *opaque, hwaddr addr, uint32_t dword; uint64_t qword; } buf; + int ret; switch (size) { case 1: @@ -64,11 +65,13 @@ void vfio_region_write(void *opaque, hwaddr addr, break; } - if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { + ret = vbasedev->io_ops->region_write(vbasedev, region->nr, + addr, size, &buf); + if (ret != size) { error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 - ",%d) failed: %m", + ",%d) failed: %s", __func__, vbasedev->name, region->nr, - addr, data, size); + addr, data, size, ret < 0 ? strerror(ret) : "short write"); } trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); @@ -96,11 +99,13 @@ uint64_t vfio_region_read(void *opaque, uint64_t qword; } buf; uint64_t data = 0; + int ret; - if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", + ret = vbasedev->io_ops->region_read(vbasedev, region->nr, addr, size, &buf); + if (ret != size) { + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s", __func__, vbasedev->name, region->nr, - addr, size); + addr, size, ret < 0 ? strerror(ret) : "short read"); return (uint64_t)-1; } switch (size) { diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 967b07cd89..cb2f581826 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -41,6 +41,7 @@ enum { }; typedef struct VFIODeviceOps VFIODeviceOps; +typedef struct VFIODeviceIOOps VFIODeviceIOOps; typedef struct VFIOMigration VFIOMigration; typedef struct IOMMUFDBackend IOMMUFDBackend; @@ -66,6 +67,7 @@ typedef struct VFIODevice { OnOffAuto migration_multifd_transfer; bool migration_events; VFIODeviceOps *ops; + VFIODeviceIOOps *io_ops; unsigned int num_irqs; unsigned int num_regions; unsigned int flags; @@ -140,6 +142,25 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIODeviceList vfio_device_list; #ifdef CONFIG_LINUX +/* + * How devices communicate with the server. The default option is through + * ioctl() to the kernel VFIO driver, but vfio-user can use a socket to a remote + * process. + */ +struct VFIODeviceIOOps { + int (*device_feature)(VFIODevice *vdev, struct vfio_device_feature *); + int (*get_region_info)(VFIODevice *vdev, + struct vfio_region_info *info); + int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq); + int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs); + int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, + void *data); + int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, + void *data); +}; + +extern VFIODeviceIOOps vfio_device_io_ops_ioctl; + int vfio_device_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info); int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type, @@ -154,6 +175,7 @@ int vfio_device_get_irq_info(VFIODevice *vbasedev, int index, bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp); void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp); void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, - DeviceState *dev, bool ram_discard); + VFIODeviceIOOps *io_ops, DeviceState *dev, + bool ram_discard); int vfio_device_get_aw_bits(VFIODevice *vdev); #endif /* HW_VFIO_VFIO_COMMON_H */ -- 2.34.1