On 2025/8/22 14:40, Zhenzhong Duan wrote:
This captures guest PASID table entry modifications and propagates
the changes to the host to attach a hwpt whose type is determined by
the guest IOMMU mode and the PGTT configuration.

When PGTT is Pass-through (100b), the hwpt on the host side is a
stage-2 page table (GPA->HPA). When PGTT is First-stage Translation
only (001b), the vIOMMU reuses the hwpt (GPA->HPA) provided by VFIO as
the nested parent to construct a nested page table.

Maybe more straightforward: if PGTT is xxx, attach the PASID to a xxx
hwpt, e.g. if PGTT==PT, attach to the default hwpt.

When the guest decides to use legacy mode, the vIOMMU switches the MRs
of the device's AS, hence the IOAS created by the VFIO container is
switched to using IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is switched
to an IOMMU MR. So it is able to support shadowing the guest IO page
table.

This is not quite related to this patch, as bind/unbind of a PASID
only happens when the guest is operating in scalable mode. I think you
may drop it and consider adding it in another, more related patch.

Co-Authored-by: Yi Liu <yi.l....@intel.com>
Signed-off-by: Yi Liu <yi.l....@intel.com>
Signed-off-by: Yi Sun <yi.y....@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com>
---
  hw/i386/intel_iommu_internal.h |  14 ++-
  include/hw/i386/intel_iommu.h  |   1 +
  hw/i386/intel_iommu.c          | 221 ++++++++++++++++++++++++++++++++-
  hw/i386/trace-events           |   3 +
  4 files changed, 233 insertions(+), 6 deletions(-)

diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index c510b09d1a..61e35dbdc0 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -564,6 +564,12 @@ typedef struct VTDRootEntry VTDRootEntry;
  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
+typedef enum VTDPASIDOp {
+    VTD_PASID_BIND,
+    VTD_PASID_UPDATE,
+    VTD_PASID_UNBIND,
+} VTDPASIDOp;
+
  typedef enum VTDPCInvType {
      /* VTD spec defined PASID cache invalidation type */
      VTD_PASID_CACHE_DOMSI = VTD_INV_DESC_PASIDC_G_DSI,
@@ -612,8 +618,12 @@ typedef struct VTDPASIDCacheInfo {
  #define VTD_SM_PASID_ENTRY_AW          7ULL /* Adjusted guest-address-width */
  #define VTD_SM_PASID_ENTRY_DID(x)      extract64((x)->val[1], 0, 16)
-#define VTD_SM_PASID_ENTRY_FLPM 3ULL
-#define VTD_SM_PASID_ENTRY_FLPTPTR       (~0xfffULL)
+#define VTD_SM_PASID_ENTRY_FSPTPTR       (~0xfffULL)
+#define VTD_SM_PASID_ENTRY_SRE_BIT(x)    extract64((x)->val[2], 0, 1)
+/* 00: 4-level paging, 01: 5-level paging, 10-11: Reserved */
+#define VTD_SM_PASID_ENTRY_FSPM(x)       extract64((x)->val[2], 2, 2)
+#define VTD_SM_PASID_ENTRY_WPE_BIT(x)    extract64((x)->val[2], 4, 1)
+#define VTD_SM_PASID_ENTRY_EAFE_BIT(x)   extract64((x)->val[2], 7, 1)
/* First Level Paging Structure */
  /* Masks for First Level Paging Entry */
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 0e3826f6f0..2affab36b2 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -104,6 +104,7 @@ struct VTDAddressSpace {
      PCIBus *bus;
      uint8_t devfn;
      uint32_t pasid;
+    uint32_t s1_hwpt;
      AddressSpace as;
      IOMMUMemoryRegion iommu;
      MemoryRegion root;          /* The root container of the device */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 15582977b8..a10ee8eb4f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -20,6 +20,7 @@
   */
#include "qemu/osdep.h"
+#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
  #include "qemu/error-report.h"
  #include "qemu/main-loop.h"
  #include "qapi/error.h"
@@ -41,6 +42,9 @@
  #include "migration/vmstate.h"
  #include "trace.h"
  #include "system/iommufd.h"
+#ifdef CONFIG_IOMMUFD
+#include <linux/iommufd.h>
+#endif
/* context entry operations */
  #define VTD_CE_GET_RID2PASID(ce) \
@@ -50,10 +54,9 @@
/* pe operations */
  #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
-#define VTD_PE_GET_FL_LEVEL(pe) \
-    (4 + (((pe)->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM))
  #define VTD_PE_GET_SL_LEVEL(pe) \
      (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
+#define VTD_PE_GET_FL_LEVEL(pe) (VTD_SM_PASID_ENTRY_FSPM(pe) + 4)
/*
   * PCI bus number (or SID) is not reliable since the device is usaully
@@ -834,6 +837,31 @@ static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
      return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce) + 7);
  }
+static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
+{
+    return pe->val[2] & VTD_SM_PASID_ENTRY_FSPTPTR;
+}
+
+/*
+ * Stage-1 IOVA address width: 48 bits for 4-level paging(FSPM=00)
+ *                             57 bits for 5-level paging(FSPM=01)
+ */
+static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
+{
+    return 48 + VTD_SM_PASID_ENTRY_FSPM(pe) * 9;
+}
+
+static inline bool vtd_pe_pgtt_is_pt(VTDPASIDEntry *pe)
+{
+    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_PT);
+}

I think the existing PT-related code can use this helper as well. A
separate patch to add this helper and replace the existing open-coded
checks would be helpful.
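
e.g. the replacement would look like what this patch already does
further below in vtd_dev_pt_enabled():

-        return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
+        return vtd_pe_pgtt_is_pt(&pe);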

+/* check if pgtt is first stage translation */
+static inline bool vtd_pe_pgtt_is_flt(VTDPASIDEntry *pe)
+{
+    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_FLT);
+}
+
  static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
  {
      return pdire->val & 1;
@@ -1131,7 +1159,7 @@ static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
      if (s->root_scalable) {
          vtd_ce_get_pasid_entry(s, ce, &pe, pasid);
          if (s->flts) {
-            return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+            return pe.val[2] & VTD_SM_PASID_ENTRY_FSPTPTR;

Such renaming is better done in a separate patch. If it's not too heavy to afford, we may also consider consolidating the "fs" and "fl" terms into "fs". :)

          } else {
              return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
          }
@@ -1766,7 +1794,7 @@ static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce,
               */
              return false;
          }
-        return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
+        return vtd_pe_pgtt_is_pt(&pe);
      }
return (vtd_ce_get_type(ce) == VTD_CONTEXT_TT_PASS_THROUGH);
@@ -2433,6 +2461,178 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s)
      vtd_iommu_replay_all(s);
  }
+#ifdef CONFIG_IOMMUFD
+static void vtd_init_s1_hwpt_data(struct iommu_hwpt_vtd_s1 *vtd,
+                                  VTDPASIDEntry *pe)
+{
+    memset(vtd, 0, sizeof(*vtd));
+
+    vtd->flags =  (VTD_SM_PASID_ENTRY_SRE_BIT(pe) ? IOMMU_VTD_S1_SRE : 0) |
+                  (VTD_SM_PASID_ENTRY_WPE_BIT(pe) ? IOMMU_VTD_S1_WPE : 0) |
+                  (VTD_SM_PASID_ENTRY_EAFE_BIT(pe) ? IOMMU_VTD_S1_EAFE : 0);
+    vtd->addr_width = vtd_pe_get_fl_aw(pe);
+    vtd->pgtbl_addr = (uint64_t)vtd_pe_get_flpt_base(pe);
+}
+
+static int vtd_create_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+                              VTDPASIDEntry *pe, uint32_t *s1_hwpt,
+                              Error **errp)
+{
+    struct iommu_hwpt_vtd_s1 vtd;
+
+    vtd_init_s1_hwpt_data(&vtd, pe);
+
+    return !iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
+                                       idev->hwpt_id, 0, IOMMU_HWPT_DATA_VTD_S1,
+                                       sizeof(vtd), &vtd, s1_hwpt, errp);
+}
+
+static void vtd_destroy_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
+                                VTDAddressSpace *vtd_as)
+{
+    if (!vtd_as->s1_hwpt) {
+        return;
+    }
+    iommufd_backend_free_id(idev->iommufd, vtd_as->s1_hwpt);
+    vtd_as->s1_hwpt = 0;
+}
+
+static int vtd_device_attach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
+                                     VTDAddressSpace *vtd_as, Error **errp)
+{
+    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);

Should it check whether idev is valid, since vtd_hiod->hiod may be of
a type other than TYPE_HOST_IOMMU_DEVICE_IOMMUFD?
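
Just a sketch of what I mean, using object_dynamic_cast() to bail out
early for a non-IOMMUFD backend:

    if (!object_dynamic_cast(OBJECT(vtd_hiod->hiod),
                             TYPE_HOST_IOMMU_DEVICE_IOMMUFD)) {
        error_setg(errp, "host IOMMU device is not an iommufd backend");
        return -EINVAL;
    }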

+    VTDPASIDEntry *pe = &vtd_as->pasid_cache_entry.pasid_entry;
+    uint32_t hwpt_id;
+    int ret;
+
+    /*
+     * We can get here only if flts=on, the supported PGTT is FLT and PT.
+     * Catch invalid PGTT when processing invalidation request to avoid
+     * attaching to wrong hwpt.

I think it is necessary to check x-flts=on in vtd_process_pasid_desc()
to guarantee the above comment. The existing vIOMMU reports scalable
mode to the guest even without x-flts. For that configuration, the
vIOMMU just skips the PASID cache flush, as that configuration depends
on shadowing the guest I/O page table to the host. Now we are dealing
with PASID cache invalidation because we need to bind the guest I/O
page table to the host if the guest uses FS translation.
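
e.g. a sketch of such an early return in vtd_process_pasid_desc(),
assuming it returns bool like the other vtd_process_*_desc() helpers:

    /* Without x-flts=on we rely on shadowing, no guest page table to bind */
    if (!s->flts) {
        return true;
    }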

+     */
+    if (!vtd_pe_pgtt_is_flt(pe) && !vtd_pe_pgtt_is_pt(pe)) {
+        error_setg(errp, "Invalid PGTT type");
+        return -EINVAL;
+    }
+
+    if (vtd_pe_pgtt_is_flt(pe)) {
+        /* Should fail if the FLPT base is 0 */
+        if (!vtd_pe_get_flpt_base(pe)) {

Aha, I cannot recall why the 0 check is special and added here. If we
want to keep this check, I think flpt_base should also be checked
against the address width (AW) of the s2_hwpt to make the check
complete. :)
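
Something like the below sketch; s2_aw is only a placeholder for
whatever address width the s2_hwpt reports:

        dma_addr_t flpt_base = vtd_pe_get_flpt_base(pe);

        /* s2_aw: placeholder for the address width of the s2_hwpt */
        if (!flpt_base || flpt_base >= (1ULL << s2_aw)) {
            error_setg(errp, "Invalid FLPT base 0x%" PRIx64,
                       (uint64_t)flpt_base);
            return -EINVAL;
        }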

+            error_setg(errp, "FLPT base is 0");
+            return -EINVAL;
+        }
+
+        if (vtd_create_s1_hwpt(idev, pe, &hwpt_id, errp)) {
+            return -EINVAL;
+        }
+    } else {
+        hwpt_id = idev->hwpt_id;
+    }
+
+    ret = !host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp);
+    trace_vtd_device_attach_hwpt(idev->devid, vtd_as->pasid, hwpt_id, ret);
+    if (!ret) {

The above three lines use '!' twice. Consider simplifying them.
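
e.g. keep the bool returned by host_iommu_device_iommufd_attach_hwpt()
and convert once at the end, roughly (setting aside the
vtd_destroy_s1_hwpt() question below):

    bool ok = host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp);

    trace_vtd_device_attach_hwpt(idev->devid, vtd_as->pasid, hwpt_id, !ok);
    if (ok) {
        if (vtd_pe_pgtt_is_flt(pe)) {
            vtd_as->s1_hwpt = hwpt_id;
        }
    } else if (vtd_pe_pgtt_is_flt(pe)) {
        iommufd_backend_free_id(idev->iommufd, hwpt_id);
    }

    return ok ? 0 : -EINVAL;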

+        vtd_destroy_s1_hwpt(idev, vtd_as);

I suppose this is the success branch. Why destroy the s1_hwpt here?

+        if (vtd_pe_pgtt_is_flt(pe)) {
+            vtd_as->s1_hwpt = hwpt_id;
+        }
+    } else if (vtd_pe_pgtt_is_flt(pe)) {
+        iommufd_backend_free_id(idev->iommufd, hwpt_id);
+    }
+
+    return ret;
+}
+
+static int vtd_device_detach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
+                                     VTDAddressSpace *vtd_as, Error **errp)
+{
+    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+    uint32_t pasid = vtd_as->pasid;
+    int ret;
+
+    if (vtd_hiod->iommu_state->dmar_enabled) {
+        ret = !host_iommu_device_iommufd_detach_hwpt(idev, errp);
+        trace_vtd_device_detach_hwpt(idev->devid, pasid, ret);
+    } else {
+        ret = !host_iommu_device_iommufd_attach_hwpt(idev, idev->hwpt_id, errp);
+        trace_vtd_device_reattach_def_hwpt(idev->devid, pasid, idev->hwpt_id,
+                                           ret);
+    }

You need a comment to explain why the behavior differs depending on
iommu_state->dmar_enabled.

+
+    if (!ret) {
+        vtd_destroy_s1_hwpt(idev, vtd_as);
+    }
+
+    return ret;
+}
+
+static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, VTDPASIDOp op,
+                                Error **errp)
+{
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    VTDHostIOMMUDevice *vtd_hiod = vtd_find_hiod_iommufd(s, vtd_as);
+    int ret;
+
+    if (!vtd_hiod) {
+        /* No need to go further, e.g. for emulated device */
+        return 0;
+    }
+
+    if (vtd_as->pasid != PCI_NO_PASID) {
+        error_setg(errp, "Non-rid_pasid %d not supported yet", vtd_as->pasid);

I see. This series is really only for rid_pasid. Then some prior
patches may need to be re-organized. Please refer to the related
comments on the prior patches.

+        return -EINVAL;
+    }
+
+    switch (op) {
+    case VTD_PASID_UPDATE:
+    case VTD_PASID_BIND:

I doubt we really want to have two types. BIND might be enough, since
UPDATE is just binding the device/pasid to a new page table, and the
kernel supports binding to a new page table without unbinding the old
one.
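
i.e. the enum could possibly shrink to:

typedef enum VTDPASIDOp {
    VTD_PASID_BIND,     /* also covers re-binding to a new page table */
    VTD_PASID_UNBIND,
} VTDPASIDOp;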

+    {
+        ret = vtd_device_attach_iommufd(vtd_hiod, vtd_as, errp);
+        break;
+    }
+    case VTD_PASID_UNBIND:
+    {
+        ret = vtd_device_detach_iommufd(vtd_hiod, vtd_as, errp);
+        break;
+    }
+    default:
+        error_setg(errp, "Unknown VTDPASIDOp!!!");
+        break;
+    }
+
+    return ret;
+}
+#else
+static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, VTDPASIDOp op,
+                                Error **errp)
+{
+    return 0;
+}
+#endif
+
+static int vtd_bind_guest_pasid_report_err(VTDAddressSpace *vtd_as,
+                                           VTDPASIDOp op)
+{
+    Error *local_err = NULL;
+    int ret;
+
+    /*
+     * vIOMMU calls into kernel to do BIND/UNBIND, the failure reason
+     * can be kernel, QEMU bug or invalid guest config. None of them
+     * should be reported to guest in PASID cache invalidation
+     * processing path. But at least, we can report it to QEMU console.
+     *

Could you elaborate on the reason for the above comment? I agree that
we lack a formal way to report the failure, but suppressing the
failure does not seem correct to me.

Regards,
Yi Liu

+     * TODO: for invalid guest config, DMA translation fault will be
+     * caught by host and passed to QEMU to inject to guest in future.
+     */
+    ret = vtd_bind_guest_pasid(vtd_as, op, &local_err);
+    if (ret) {
+        error_report_err(local_err);
+    }
+
+    return ret;
+}
+
  /* Do a context-cache device-selective invalidation.
   * @func_mask: FM field after shifting
   */
@@ -3248,10 +3448,20 @@ static gboolean vtd_flush_pasid_locked(gpointer key, gpointer value,
       */
      if (!vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
          pc_entry->pasid_entry = pe;
+        if (vtd_bind_guest_pasid_report_err(vtd_as, VTD_PASID_UPDATE)) {
+            /*
+             * In case update binding fails, tear down existing binding to
+             * catch invalid pasid entry config during DMA translation.
+             */
+            goto remove;
+        }
      }
      return false;
remove:
+    if (vtd_bind_guest_pasid_report_err(vtd_as, VTD_PASID_UNBIND)) {
+        return false;
+    }
      pc_entry->valid = false;
/*
@@ -3336,6 +3546,9 @@ static void vtd_sm_pasid_table_walk_one(IntelIOMMUState *s,
              if (!pc_entry->valid) {
                  pc_entry->pasid_entry = pe;
                  pc_entry->valid = true;
+                if (vtd_bind_guest_pasid_report_err(vtd_as, VTD_PASID_BIND)) {
+                    pc_entry->valid = false;
+                }
              }
          }
          pasid++;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index c8a936eb46..1c31b9a873 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -73,6 +73,9 @@ vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
  vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)"
  vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)"
  vtd_reset_exit(void) ""
+vtd_device_attach_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
+vtd_device_detach_hwpt(uint32_t dev_id, uint32_t pasid, int ret) "dev_id %d pasid %d ret: %d"
+vtd_device_reattach_def_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
# amd_iommu.c
  amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
