Re: [PATCH v2 03/23] vfio-user: add container IO ops vector

2023-02-03 Thread Alex Williamson
On Fri, 3 Feb 2023 15:21:09 -0700
Alex Williamson  wrote:

> On Wed,  1 Feb 2023 21:55:39 -0800
> John Johnson  wrote:
> 
> > Used for communication with VFIO driver
> > (prep work for vfio-user, which will communicate over a socket)
> > 
> > Signed-off-by: John G Johnson 
> > ---
> >  include/hw/vfio/vfio-common.h |  24 
> >  hw/vfio/common.c  | 128 
> > --
> >  2 files changed, 110 insertions(+), 42 deletions(-)
> > 
> > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> > index e573f5a..953bc0f 100644
> > --- a/include/hw/vfio/vfio-common.h
> > +++ b/include/hw/vfio/vfio-common.h
> > @@ -75,6 +75,7 @@ typedef struct VFIOAddressSpace {
> >  } VFIOAddressSpace;
> >  
> >  struct VFIOGroup;
> > +typedef struct VFIOContainerIO VFIOContainerIO;
> >  
> >  typedef struct VFIOContainer {
> >  VFIOAddressSpace *space;
> > @@ -83,6 +84,7 @@ typedef struct VFIOContainer {
> >  MemoryListener prereg_listener;
> >  unsigned iommu_type;
> >  Error *error;
> > +VFIOContainerIO *io;
> >  bool initialized;
> >  bool dirty_pages_supported;
> >  uint64_t dirty_pgsizes;
> > @@ -154,6 +156,28 @@ struct VFIODeviceOps {
> >  int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
> >  };
> >  
> > +#ifdef CONFIG_LINUX
> > +
> > +/*
> > + * The next 2 ops vectors are how Devices and Containers
> > + * communicate with the server.  The default option is
> > + * through ioctl() to the kernel VFIO driver, but vfio-user
> > + * can use a socket to a remote process.
> > + */
> > +
> > +struct VFIOContainerIO {
> > +int (*dma_map)(VFIOContainer *container,
> > +   struct vfio_iommu_type1_dma_map *map);
> > +int (*dma_unmap)(VFIOContainer *container,
> > + struct vfio_iommu_type1_dma_unmap *unmap,
> > + struct vfio_bitmap *bitmap);
> > +int (*dirty_bitmap)(VFIOContainer *container,
> > +struct vfio_iommu_type1_dirty_bitmap *bitmap,
> > +struct vfio_iommu_type1_dirty_bitmap_get *range);
> > +};
> > +
> > +#endif /* CONFIG_LINUX */
> > +
> >  typedef struct VFIOGroup {
> >  int fd;
> >  int groupid;
> > diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> > index ace9562..9310a7f 100644
> > --- a/hw/vfio/common.c
> > +++ b/hw/vfio/common.c
> > @@ -58,6 +58,8 @@ static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces 
> > =
> >  static int vfio_kvm_device_fd = -1;
> >  #endif
> >  
> > +static VFIOContainerIO vfio_cont_io_ioctl;
> > +
> >  /*
> >   * Common VFIO interrupt disable
> >   */
> > @@ -432,12 +434,12 @@ static int vfio_dma_unmap_bitmap(VFIOContainer 
> > *container,
> >  goto unmap_exit;
> >  }
> >  
> > -ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
> > +ret = container->io->dma_unmap(container, unmap, bitmap);  
> 
> The bitmap arg doesn't make a lot of sense here, its use doesn't come
> around until vfio_user_dma_unmap() is added, but even then, it's
> readily available at >data with validity determined by
> unmap->flags.  The eventual sanity test in vfio_user_dma_unmap() really
> only seems to prove the arg is unnecessary.  Thanks,


The same comment really applies to .dirty_bitmap() with respect to the
range arg, but I think it's also important to note that both of these
are all but deprecated interfaces for the kernel.  The migration
series[1] is pre-enabling these kernel interfaces to be removed by
adding support to QEMU to dirty all pages when support isn't present,
then we replace it with what we expect to be a longer term solution
with device dirty tracking support[2].  All the more reason not to make
these part of the internal API shared by kernel and user versions.
Thanks,

Alex

[1]https://lore.kernel.org/all/20230116141135.12021-1-avih...@nvidia.com/
[2]https://lore.kernel.org/all/20230126184948.10478-1-avih...@nvidia.com/




Re: [PATCH v2 03/23] vfio-user: add container IO ops vector

2023-02-03 Thread Alex Williamson
On Wed,  1 Feb 2023 21:55:39 -0800
John Johnson  wrote:

> Used for communication with VFIO driver
> (prep work for vfio-user, which will communicate over a socket)
> 
> Signed-off-by: John G Johnson 
> ---
>  include/hw/vfio/vfio-common.h |  24 
>  hw/vfio/common.c  | 128 
> --
>  2 files changed, 110 insertions(+), 42 deletions(-)
> 
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index e573f5a..953bc0f 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -75,6 +75,7 @@ typedef struct VFIOAddressSpace {
>  } VFIOAddressSpace;
>  
>  struct VFIOGroup;
> +typedef struct VFIOContainerIO VFIOContainerIO;
>  
>  typedef struct VFIOContainer {
>  VFIOAddressSpace *space;
> @@ -83,6 +84,7 @@ typedef struct VFIOContainer {
>  MemoryListener prereg_listener;
>  unsigned iommu_type;
>  Error *error;
> +VFIOContainerIO *io;
>  bool initialized;
>  bool dirty_pages_supported;
>  uint64_t dirty_pgsizes;
> @@ -154,6 +156,28 @@ struct VFIODeviceOps {
>  int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
>  };
>  
> +#ifdef CONFIG_LINUX
> +
> +/*
> + * The next 2 ops vectors are how Devices and Containers
> + * communicate with the server.  The default option is
> + * through ioctl() to the kernel VFIO driver, but vfio-user
> + * can use a socket to a remote process.
> + */
> +
> +struct VFIOContainerIO {
> +int (*dma_map)(VFIOContainer *container,
> +   struct vfio_iommu_type1_dma_map *map);
> +int (*dma_unmap)(VFIOContainer *container,
> + struct vfio_iommu_type1_dma_unmap *unmap,
> + struct vfio_bitmap *bitmap);
> +int (*dirty_bitmap)(VFIOContainer *container,
> +struct vfio_iommu_type1_dirty_bitmap *bitmap,
> +struct vfio_iommu_type1_dirty_bitmap_get *range);
> +};
> +
> +#endif /* CONFIG_LINUX */
> +
>  typedef struct VFIOGroup {
>  int fd;
>  int groupid;
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index ace9562..9310a7f 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -58,6 +58,8 @@ static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
>  static int vfio_kvm_device_fd = -1;
>  #endif
>  
> +static VFIOContainerIO vfio_cont_io_ioctl;
> +
>  /*
>   * Common VFIO interrupt disable
>   */
> @@ -432,12 +434,12 @@ static int vfio_dma_unmap_bitmap(VFIOContainer 
> *container,
>  goto unmap_exit;
>  }
>  
> -ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
> +ret = container->io->dma_unmap(container, unmap, bitmap);

The bitmap arg doesn't make a lot of sense here, its use doesn't come
around until vfio_user_dma_unmap() is added, but even then, it's
readily available at >data with validity determined by
unmap->flags.  The eventual sanity test in vfio_user_dma_unmap() really
only seems to prove the arg is unnecessary.  Thanks,

Alex




[PATCH v2 03/23] vfio-user: add container IO ops vector

2023-02-01 Thread John Johnson
Used for communication with VFIO driver
(prep work for vfio-user, which will communicate over a socket)

Signed-off-by: John G Johnson 
---
 include/hw/vfio/vfio-common.h |  24 
 hw/vfio/common.c  | 128 --
 2 files changed, 110 insertions(+), 42 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index e573f5a..953bc0f 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -75,6 +75,7 @@ typedef struct VFIOAddressSpace {
 } VFIOAddressSpace;
 
 struct VFIOGroup;
+typedef struct VFIOContainerIO VFIOContainerIO;
 
 typedef struct VFIOContainer {
 VFIOAddressSpace *space;
@@ -83,6 +84,7 @@ typedef struct VFIOContainer {
 MemoryListener prereg_listener;
 unsigned iommu_type;
 Error *error;
+VFIOContainerIO *io;
 bool initialized;
 bool dirty_pages_supported;
 uint64_t dirty_pgsizes;
@@ -154,6 +156,28 @@ struct VFIODeviceOps {
 int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
 };
 
+#ifdef CONFIG_LINUX
+
+/*
+ * The next 2 ops vectors are how Devices and Containers
+ * communicate with the server.  The default option is
+ * through ioctl() to the kernel VFIO driver, but vfio-user
+ * can use a socket to a remote process.
+ */
+
+struct VFIOContainerIO {
+int (*dma_map)(VFIOContainer *container,
+   struct vfio_iommu_type1_dma_map *map);
+int (*dma_unmap)(VFIOContainer *container,
+ struct vfio_iommu_type1_dma_unmap *unmap,
+ struct vfio_bitmap *bitmap);
+int (*dirty_bitmap)(VFIOContainer *container,
+struct vfio_iommu_type1_dirty_bitmap *bitmap,
+struct vfio_iommu_type1_dirty_bitmap_get *range);
+};
+
+#endif /* CONFIG_LINUX */
+
 typedef struct VFIOGroup {
 int fd;
 int groupid;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index ace9562..9310a7f 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -58,6 +58,8 @@ static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
 static int vfio_kvm_device_fd = -1;
 #endif
 
+static VFIOContainerIO vfio_cont_io_ioctl;
+
 /*
  * Common VFIO interrupt disable
  */
@@ -432,12 +434,12 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
 goto unmap_exit;
 }
 
-ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
+ret = container->io->dma_unmap(container, unmap, bitmap);
 if (!ret) {
 cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
 iotlb->translated_addr, pages);
 } else {
-error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
+error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %s", strerror(-ret));
 }
 
 g_free(bitmap->data);
@@ -465,30 +467,7 @@ static int vfio_dma_unmap(VFIOContainer *container,
 return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
 }
 
-while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, )) {
-/*
- * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
- * v4.15) where an overflow in its wrap-around check prevents us from
- * unmapping the last page of the address space.  Test for the error
- * condition and re-try the unmap excluding the last page.  The
- * expectation is that we've never mapped the last page anyway and this
- * unmap request comes via vIOMMU support which also makes it unlikely
- * that this page is used.  This bug was introduced well after type1 v2
- * support was introduced, so we shouldn't need to test for v1.  A fix
- * is queued for kernel v5.0 so this workaround can be removed once
- * affected kernels are sufficiently deprecated.
- */
-if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
-container->iommu_type == VFIO_TYPE1v2_IOMMU) {
-trace_vfio_dma_unmap_overflow_workaround();
-unmap.size -= 1ULL << ctz64(container->pgsizes);
-continue;
-}
-error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
-return -errno;
-}
-
-return 0;
+return container->io->dma_unmap(container, , NULL);
 }
 
 static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
@@ -501,24 +480,18 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
iova,
 .iova = iova,
 .size = size,
 };
+int ret;
 
 if (!readonly) {
 map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
 }
 
-/*
- * Try the mapping, if it fails with EBUSY, unmap the region and try
- * again.  This shouldn't be necessary, but we sometimes see it in
- * the VGA ROM space.
- */
-if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, ) == 0 ||
-(errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
- ioctl(container->fd, VFIO_IOMMU_MAP_DMA, ) == 0)) {
-return 0;
-}