Hi Joao,

On 12/02/2024 15:56, Joao Martins wrote:
External email: Use caution opening links or attachments


There's generally two modes of operation for IOMMUFD:

* The simple user API which intends to perform relatively simple things
with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
and mainly performs IOAS_MAP and UNMAP.

* The native IOMMUFD API where you have fine grained control of the
IOMMU domain and model it accordingly. This is where most new feature
are being steered to.

For dirty tracking 2) is required, as it needs to ensure that
the stage-2/parent IOMMU domain will only attach devices
that support dirty tracking (so far it is all homogeneous in x86, likely
not the case for smmuv3). Such invariant on dirty tracking provides a
useful guarantee to VMMs that will refuse incompatible device
attachments for IOMMU domains.

For dirty tracking such property is enabled/enforced via HWPT_ALLOC,
which is responsible for creating an IOMMU domain. This is contrast to
the 'simple API' where the IOMMU domain is created by IOMMUFD
automatically when it attaches to VFIO (usually referred as autodomains)

To support dirty tracking with the advanced IOMMUFD API, it needs
similar logic, where IOMMU domains are created and devices attached to
compatible domains. Essentially mimmicing kernel
iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
to IOAS attach.

Signed-off-by: Joao Martins <joao.m.mart...@oracle.com>
---
Right now the only alternative to a userspace autodomains implementation
is to mimmicing all the flags being added to HWPT_ALLOC but into VFIO
IOAS attach. So opted for autodomains userspace approach to avoid the
duplication of hwpt-alloc flags vs attach-ioas flags. I lack mdev real
drivers atm, so testing with those is still TBD.

Opinions, comments, welcome!
---
  backends/iommufd.c            | 29 +++++++++++++
  backends/trace-events         |  1 +
  hw/vfio/iommufd.c             | 78 +++++++++++++++++++++++++++++++++++
  include/hw/vfio/vfio-common.h |  9 ++++
  include/sysemu/iommufd.h      |  4 ++
  5 files changed, 121 insertions(+)

diff --git a/backends/iommufd.c b/backends/iommufd.c
index 8486894f1b3f..2970135af4b9 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -211,6 +211,35 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t 
ioas_id,
      return ret;
  }

+int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
+                               uint32_t pt_id, uint32_t flags,
+                               uint32_t data_type, uint32_t data_len,
+                               void *data_ptr, uint32_t *out_hwpt)
+{
+    int ret;
+    struct iommu_hwpt_alloc alloc_hwpt = {
+        .size = sizeof(struct iommu_hwpt_alloc),
+        .flags = flags,
+        .dev_id = dev_id,
+        .pt_id = pt_id,
+        .data_type = data_type,
+        .data_len = data_len,
+        .data_uptr = (uint64_t)data_ptr,
+        .__reserved = 0,
+    };
+
+    ret = ioctl(iommufd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
+    trace_iommufd_backend_alloc_hwpt(iommufd, dev_id, pt_id, flags, data_type,
+                                     data_len, (uint64_t)data_ptr,
+                                     alloc_hwpt.out_hwpt_id, ret);
+    if (ret) {
+        error_report("IOMMU_HWPT_ALLOC failed: %m");
+    } else {
+        *out_hwpt = alloc_hwpt.out_hwpt_id;
+    }
+    return !ret ? 0 : -errno;
+}
+
  static const TypeInfo iommufd_backend_info = {
      .name = TYPE_IOMMUFD_BACKEND,
      .parent = TYPE_OBJECT,
diff --git a/backends/trace-events b/backends/trace-events
index d45c6e31a67e..f83a276a4253 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -13,5 +13,6 @@ iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
  iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, void *vaddr, bool 
readonly, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p 
readonly=%d (%d)"
  iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) 
" Unmap nonexistent mapping: iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" 
(%d)"
  iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t size, int ret) " 
iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, uint32_t flags, uint32_t 
hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t out_hwpt_id, int ret) " iommufd=%d 
dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u len=%u data_ptr=0x%"PRIx64" out_hwpt=%u 
(%d)"
  iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d 
ioas=%d (%d)"
  iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d 
(%d)"
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 7d39d7a5fa51..ca7ec45e725c 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -219,10 +219,82 @@ static int iommufd_cdev_detach_ioas_hwpt(VFIODevice 
*vbasedev, Error **errp)
      return ret;
  }

+static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
+                                        VFIOIOMMUFDContainer *container,
+                                        Error **errp)
+{
+    int iommufd = vbasedev->iommufd_dev.iommufd->fd;
+    VFIOIOASHwpt *hwpt;
+    Error *err = NULL;
+    int ret = -EINVAL;

The -EINVAL initialization is not necessary.

+    uint32_t hwpt_id;
+
+    /* Try to find a domain */
+    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
+        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
+        if (ret) {
+            /* -EINVAL means the domain is incompatible with the device. */
+            if (ret == -EINVAL) {

On error iommufd_cdev_attach_ioas_hwpt() returns -1 and not -errno, so I guess we need to change it.

+                continue;
+            }
+            return ret;
+        } else {
+            vbasedev->hwpt = hwpt;
+            QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
+            return 0;
+        }
+    }
+
+    ret = iommufd_backend_alloc_hwpt(iommufd,
+                                     vbasedev->iommufd_dev.devid,
+                                     container->ioas_id, 0, 0, 0,

Should we explicitly pass IOMMU_HWPT_DATA_NONE instead of 0?

+                                     NULL, &hwpt_id);
+    if (ret) {
+        error_append_hint(&err,
+                   "Failed to allocate HWPT for device %s. Fallback to IOAS 
attach\n",
+                   vbasedev->name);
+        warn_report_err(err);
+        return ret;
+    }
+
+    hwpt = g_malloc0(sizeof(*hwpt));
+    hwpt->hwpt_id = hwpt_id;
+    QLIST_INIT(&hwpt->device_list);
+
+    ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
+    if (ret) {
+        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
+        g_free(hwpt);
+        return ret;
+    }
+
+    vbasedev->hwpt = hwpt;
+    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
+    QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
+    return 0;

I think we need to improve error handling in this function.
There are various places where err is not freed/reported, not NULL-ed before re-used, or used in error_append_hint() although it can be NULL. If there are places where Error can be ignored, we can pass NULL instead of &err.
Plus, errp parameter passed to this function is never used.

+}
+
+static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev,
+                                         VFIOIOMMUFDContainer *container)
+{
+    VFIOIOASHwpt *hwpt = vbasedev->hwpt;
+
+    QLIST_REMOVE(vbasedev, hwpt_next);
+    QLIST_REMOVE(hwpt, next);

Shouldn't QLIST_REMOVE(hwpt, next) be moved inside the if?

+    if (QLIST_EMPTY(&hwpt->device_list)) {
+        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
+        g_free(hwpt);
+    }
+}
+
  static int iommufd_cdev_attach_container(VFIODevice *vbasedev,
                                           VFIOIOMMUFDContainer *container,
                                           Error **errp)
  {
+    if (!iommufd_cdev_autodomains_get(vbasedev, container, errp)) {
+        return 0;
+    }

If errp is used in iommufd_cdev_autodomains_get() eventually, we need to make sure it doens't contain an Error object before using it again below.

Thanks.

+
      return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
  }

@@ -231,6 +303,11 @@ static void iommufd_cdev_detach_container(VFIODevice 
*vbasedev,
  {
      Error *err = NULL;

+    if (vbasedev->hwpt) {
+        iommufd_cdev_autodomains_put(vbasedev, container);
+        return;
+    }
+
      if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
          error_report_err(err);
      }
@@ -370,6 +447,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice 
*vbasedev,
      container = g_malloc0(sizeof(*container));
      container->be = vbasedev->iommufd_dev.iommufd;
      container->ioas_id = ioas_id;
+    QLIST_INIT(&container->hwpt_list);

      bcontainer = &container->bcontainer;
      vfio_container_init(bcontainer, space, iommufd_vioc);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9c4b60c906d9..7f7d823221e2 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -93,10 +93,17 @@ typedef struct VFIOHostDMAWindow {

  typedef struct IOMMUFDBackend IOMMUFDBackend;

+typedef struct VFIOIOASHwpt {
+    uint32_t hwpt_id;
+    QLIST_HEAD(, VFIODevice) device_list;
+    QLIST_ENTRY(VFIOIOASHwpt) next;
+} VFIOIOASHwpt;
+
  typedef struct VFIOIOMMUFDContainer {
      VFIOContainerBase bcontainer;
      IOMMUFDBackend *be;
      uint32_t ioas_id;
+    QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
  } VFIOIOMMUFDContainer;

  /* Abstraction of host IOMMU legacy device */
@@ -136,6 +143,8 @@ typedef struct VFIODevice {
          IOMMULegacyDevice legacy_dev;
          IOMMUFDDevice iommufd_dev;
      };
+    VFIOIOASHwpt *hwpt;
+    QLIST_ENTRY(VFIODevice) hwpt_next;
  } VFIODevice;

  QEMU_BUILD_BUG_ON(offsetof(VFIODevice, legacy_dev.base) !=
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 4afe97307dbe..1966b75caae2 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -49,4 +49,8 @@ typedef struct IOMMUFDDevice {
  void iommufd_device_init(IOMMUFDDevice *idev);
  int iommufd_device_get_hw_capabilities(IOMMUFDDevice *idev, uint64_t *caps,
                                         Error **errp);
+int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
+                               uint32_t pt_id, uint32_t flags,
+                               uint32_t data_type, uint32_t data_len,
+                               void *data_ptr, uint32_t *out_hwpt);
  #endif
--
2.39.3


Reply via email to