Hi Zhenzhong,

On 8/22/25 8:40 AM, Zhenzhong Duan wrote:
> This captures the guest PASID table entry modifications and
> propagates the changes to host to attach a hwpt with type determined
> per guest IOMMU mode and PGTT configuration.
>
> When PGTT is Pass-through(100b), the hwpt on host side is a stage-2
> page table(GPA->HPA). When PGTT is First-stage Translation only(001b),
> vIOMMU reuse hwpt(GPA->HPA) provided by VFIO as nested parent to
> construct nested page table.
>
> When guest decides to use legacy mode then vIOMMU switches the MRs of
> the device's AS, hence the IOAS created by VFIO container would be
> switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
> switched to IOMMU MR. So it is able to support shadowing the guest IO
> page table.
>
> Co-Authored-by: Yi Liu <yi.l....@intel.com>
> Signed-off-by: Yi Liu <yi.l....@intel.com>
> Signed-off-by: Yi Sun <yi.y....@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com>
> ---
>  hw/i386/intel_iommu_internal.h |  14 ++-
>  include/hw/i386/intel_iommu.h  |   1 +
>  hw/i386/intel_iommu.c          | 221 ++++++++++++++++++++++++++++++++-
>  hw/i386/trace-events           |   3 +
>  4 files changed, 233 insertions(+), 6 deletions(-)
>
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index c510b09d1a..61e35dbdc0 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -564,6 +564,12 @@ typedef struct VTDRootEntry VTDRootEntry;
>  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
>  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
>  
> +typedef enum VTDPASIDOp {
> +    VTD_PASID_BIND,
> +    VTD_PASID_UPDATE,
> +    VTD_PASID_UNBIND,
> +} VTDPASIDOp;
> +
>  typedef enum VTDPCInvType {
>      /* VTD spec defined PASID cache invalidation type */
>      VTD_PASID_CACHE_DOMSI = VTD_INV_DESC_PASIDC_G_DSI,
> @@ -612,8 +618,12 @@ typedef struct VTDPASIDCacheInfo {
>  #define VTD_SM_PASID_ENTRY_AW          7ULL /* Adjusted guest-address-width 
> */
>  #define VTD_SM_PASID_ENTRY_DID(x)      extract64((x)->val[1], 0, 16)
>  
> -#define VTD_SM_PASID_ENTRY_FLPM          3ULL
> -#define VTD_SM_PASID_ENTRY_FLPTPTR       (~0xfffULL)
> +#define VTD_SM_PASID_ENTRY_FSPTPTR       (~0xfffULL)
along with renaming, use extract64()
will be simpler than
intel_iommu.c:    return pe->val[2] & VTD_SM_PASID_ENTRY_FSPTPTR;
intel_iommu.c:            return pe.val[2] & VTD_SM_PASID_ENTRY_FSPTPTR;

also this will take care of the upper bits.

> +#define VTD_SM_PASID_ENTRY_SRE_BIT(x)    extract64((x)->val[2], 0, 1)
> +/* 00: 4-level paging, 01: 5-level paging, 10-11: Reserved */
> +#define VTD_SM_PASID_ENTRY_FSPM(x)       extract64((x)->val[2], 2, 2)
> +#define VTD_SM_PASID_ENTRY_WPE_BIT(x)    extract64((x)->val[2], 4, 1)
> +#define VTD_SM_PASID_ENTRY_EAFE_BIT(x)   extract64((x)->val[2], 7, 1)
>  
>  /* First Level Paging Structure */
>  /* Masks for First Level Paging Entry */
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 0e3826f6f0..2affab36b2 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -104,6 +104,7 @@ struct VTDAddressSpace {
>      PCIBus *bus;
>      uint8_t devfn;
>      uint32_t pasid;
> +    uint32_t s1_hwpt;
>      AddressSpace as;
>      IOMMUMemoryRegion iommu;
>      MemoryRegion root;          /* The root container of the device */
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 15582977b8..a10ee8eb4f 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -20,6 +20,7 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
>  #include "qemu/error-report.h"
>  #include "qemu/main-loop.h"
>  #include "qapi/error.h"
> @@ -41,6 +42,9 @@
>  #include "migration/vmstate.h"
>  #include "trace.h"
>  #include "system/iommufd.h"
> +#ifdef CONFIG_IOMMUFD
> +#include <linux/iommufd.h>
> +#endif
>  
>  /* context entry operations */
>  #define VTD_CE_GET_RID2PASID(ce) \
> @@ -50,10 +54,9 @@
>  
>  /* pe operations */
>  #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
> -#define VTD_PE_GET_FL_LEVEL(pe) \
> -    (4 + (((pe)->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM))
>  #define VTD_PE_GET_SL_LEVEL(pe) \
>      (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
> +#define VTD_PE_GET_FL_LEVEL(pe) (VTD_SM_PASID_ENTRY_FSPM(pe) + 4)
this change and above one are cleanups. They can easily be put in a
separate patch to ease the review.
>  
>  /*
>   * PCI bus number (or SID) is not reliable since the device is usaully
> @@ -834,6 +837,31 @@ static inline uint32_t 
> vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
>      return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce) + 7);
>  }
>  
> +static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
> +{
> +    return pe->val[2] & VTD_SM_PASID_ENTRY_FSPTPTR;
> +}
> +
> +/*
> + * Stage-1 IOVA address width: 48 bits for 4-level paging(FSPM=00)
> + *                             57 bits for 5-level paging(FSPM=01)
> + */
> +static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
fl = first level? You may prefer fs (ifrts stage) which is used in fspm
terminology.
> +{
> +    return 48 + VTD_SM_PASID_ENTRY_FSPM(pe) * 9;
> +}
> +
> +static inline bool vtd_pe_pgtt_is_pt(VTDPASIDEntry *pe)
> +{
> +    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_PT);
> +}
> +
> +/* check if pgtt is first stage translation */
> +static inline bool vtd_pe_pgtt_is_flt(VTDPASIDEntry *pe)
> +{
> +    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_FLT);
> +}
> +
>  static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
>  {
>      return pdire->val & 1;
> @@ -1131,7 +1159,7 @@ static dma_addr_t 
> vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
>      if (s->root_scalable) {
>          vtd_ce_get_pasid_entry(s, ce, &pe, pasid);
>          if (s->flts) {
> -            return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
> +            return pe.val[2] & VTD_SM_PASID_ENTRY_FSPTPTR;
>          } else {
>              return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
>          }
> @@ -1766,7 +1794,7 @@ static bool vtd_dev_pt_enabled(IntelIOMMUState *s, 
> VTDContextEntry *ce,
>               */
>              return false;
>          }
> -        return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
> +        return vtd_pe_pgtt_is_pt(&pe);
that change can be put in the cleanup patch too
>      }
>  
>      return (vtd_ce_get_type(ce) == VTD_CONTEXT_TT_PASS_THROUGH);
> @@ -2433,6 +2461,178 @@ static void 
> vtd_context_global_invalidate(IntelIOMMUState *s)
>      vtd_iommu_replay_all(s);
>  }
>  
> +#ifdef CONFIG_IOMMUFD
> +static void vtd_init_s1_hwpt_data(struct iommu_hwpt_vtd_s1 *vtd,
> +                                  VTDPASIDEntry *pe)
> +{
> +    memset(vtd, 0, sizeof(*vtd));
> +
> +    vtd->flags =  (VTD_SM_PASID_ENTRY_SRE_BIT(pe) ? IOMMU_VTD_S1_SRE : 0) |
> +                  (VTD_SM_PASID_ENTRY_WPE_BIT(pe) ? IOMMU_VTD_S1_WPE : 0) |
> +                  (VTD_SM_PASID_ENTRY_EAFE_BIT(pe) ? IOMMU_VTD_S1_EAFE : 0);
> +    vtd->addr_width = vtd_pe_get_fl_aw(pe);
> +    vtd->pgtbl_addr = (uint64_t)vtd_pe_get_flpt_base(pe);
> +}
> +
> +static int vtd_create_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
> +                              VTDPASIDEntry *pe, uint32_t *s1_hwpt,
> +                              Error **errp)
> +{
> +    struct iommu_hwpt_vtd_s1 vtd;
= {};
and get rid of above memset?
> +
> +    vtd_init_s1_hwpt_data(&vtd, pe);
not sure the leper is needed. Is it reused somewhere else?
> +
> +    return !iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> +                                       idev->hwpt_id, 0, 
> IOMMU_HWPT_DATA_VTD_S1,
> +                                       sizeof(vtd), &vtd, s1_hwpt, errp);
> +}
> +
> +static void vtd_destroy_s1_hwpt(HostIOMMUDeviceIOMMUFD *idev,
> +                                VTDAddressSpace *vtd_as)
> +{
> +    if (!vtd_as->s1_hwpt) {
> +        return;
> +    }
> +    iommufd_backend_free_id(idev->iommufd, vtd_as->s1_hwpt);
> +    vtd_as->s1_hwpt = 0;
> +}
> +
> +static int vtd_device_attach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
> +                                     VTDAddressSpace *vtd_as, Error **errp)
> +{
> +    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
> +    VTDPASIDEntry *pe = &vtd_as->pasid_cache_entry.pasid_entry;
> +    uint32_t hwpt_id;
> +    int ret;
> +
> +    /*
> +     * We can get here only if flts=on, the supported PGTT is FLT and PT.
> +     * Catch invalid PGTT when processing invalidation request to avoid
> +     * attaching to wrong hwpt.
> +     */
> +    if (!vtd_pe_pgtt_is_flt(pe) && !vtd_pe_pgtt_is_pt(pe)) {
> +        error_setg(errp, "Invalid PGTT type");
> +        return -EINVAL;
> +    }
> +
> +    if (vtd_pe_pgtt_is_flt(pe)) {
> +        /* Should fail if the FLPT base is 0 */
OK I see there is a mix of FL and FS terminology. Forget about my
previous comment.
> +        if (!vtd_pe_get_flpt_base(pe)) {
> +            error_setg(errp, "FLPT base is 0");
> +            return -EINVAL;
> +        }
> +
> +        if (vtd_create_s1_hwpt(idev, pe, &hwpt_id, errp)) {
> +            return -EINVAL;
> +        }
> +    } else {
> +        hwpt_id = idev->hwpt_id;
> +    }
> +
> +    ret = !host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp);
> +    trace_vtd_device_attach_hwpt(idev->devid, vtd_as->pasid, hwpt_id, ret);
> +    if (!ret) {
that double ! looks pretty bad ;-)
> +        vtd_destroy_s1_hwpt(idev, vtd_as);
> +        if (vtd_pe_pgtt_is_flt(pe)) {
> +            vtd_as->s1_hwpt = hwpt_id;
would deserve some comments
> +        }
> +    } else if (vtd_pe_pgtt_is_flt(pe)) {
> +        iommufd_backend_free_id(idev->iommufd, hwpt_id);
> +    }
> +
> +    return ret;
> +}
> +
> +static int vtd_device_detach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
> +                                     VTDAddressSpace *vtd_as, Error **errp)
> +{
> +    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
> +    uint32_t pasid = vtd_as->pasid;
> +    int ret;
> +
> +    if (vtd_hiod->iommu_state->dmar_enabled) {
> +        ret = !host_iommu_device_iommufd_detach_hwpt(idev, errp);
> +        trace_vtd_device_detach_hwpt(idev->devid, pasid, ret);
> +    } else {
> +        ret = !host_iommu_device_iommufd_attach_hwpt(idev, idev->hwpt_id, 
> errp);
> +        trace_vtd_device_reattach_def_hwpt(idev->devid, pasid, idev->hwpt_id,
> +                                           ret);
> +    }
> +
> +    if (!ret) {
gere as well, !! lost me sorry
> +        vtd_destroy_s1_hwpt(idev, vtd_as);
> +    }
> +
> +    return ret;
> +}
> +
> +static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, VTDPASIDOp op,
> +                                Error **errp)
> +{
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    VTDHostIOMMUDevice *vtd_hiod = vtd_find_hiod_iommufd(s, vtd_as);
> +    int ret;
> +
> +    if (!vtd_hiod) {
> +        /* No need to go further, e.g. for emulated device */
> +        return 0;
> +    }
> +
> +    if (vtd_as->pasid != PCI_NO_PASID) {
> +        error_setg(errp, "Non-rid_pasid %d not supported yet", 
> vtd_as->pasid);
> +        return -EINVAL;
> +    }
> +
> +    switch (op) {
> +    case VTD_PASID_UPDATE:
> +    case VTD_PASID_BIND:
> +    {
> +        ret = vtd_device_attach_iommufd(vtd_hiod, vtd_as, errp);
> +        break;
> +    }
> +    case VTD_PASID_UNBIND:
> +    {
> +        ret = vtd_device_detach_iommufd(vtd_hiod, vtd_as, errp);
> +        break;
> +    }
> +    default:
> +        error_setg(errp, "Unknown VTDPASIDOp!!!");
> +        break;
> +    }
> +
> +    return ret;
> +}
> +#else
> +static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as, VTDPASIDOp op,
> +                                Error **errp)
> +{
> +    return 0;
> +}
> +#endif
> +
> +static int vtd_bind_guest_pasid_report_err(VTDAddressSpace *vtd_as,
> +                                           VTDPASIDOp op)
> +{
> +    Error *local_err = NULL;
> +    int ret;
> +
> +    /*
> +     * vIOMMU calls into kernel to do BIND/UNBIND, the failure reason
> +     * can be kernel, QEMU bug or invalid guest config. None of them
> +     * should be reported to guest in PASID cache invalidation
> +     * processing path. But at least, we can report it to QEMU console.
> +     *
> +     * TODO: for invalid guest config, DMA translation fault will be
> +     * caught by host and passed to QEMU to inject to guest in future.
> +     */
> +    ret = vtd_bind_guest_pasid(vtd_as, op, &local_err);
> +    if (ret) {
> +        error_report_err(local_err);
> +    }
> +
> +    return ret;
> +}
> +
>  /* Do a context-cache device-selective invalidation.
>   * @func_mask: FM field after shifting
>   */
> @@ -3248,10 +3448,20 @@ static gboolean vtd_flush_pasid_locked(gpointer key, 
> gpointer value,
>       */
>      if (!vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) {
>          pc_entry->pasid_entry = pe;
> +        if (vtd_bind_guest_pasid_report_err(vtd_as, VTD_PASID_UPDATE)) {
I would remove that helper.
> +            /*
> +             * In case update binding fails, tear down existing binding to
> +             * catch invalid pasid entry config during DMA translation.
> +             */
> +            goto remove;
> +        }
>      }
>      return false;
>  
>  remove:
> +    if (vtd_bind_guest_pasid_report_err(vtd_as, VTD_PASID_UNBIND)) {
> +        return false;
> +    }
>      pc_entry->valid = false;
>  
>      /*
> @@ -3336,6 +3546,9 @@ static void vtd_sm_pasid_table_walk_one(IntelIOMMUState 
> *s,
>              if (!pc_entry->valid) {
>                  pc_entry->pasid_entry = pe;
>                  pc_entry->valid = true;
> +                if (vtd_bind_guest_pasid_report_err(vtd_as, VTD_PASID_BIND)) 
> {
> +                    pc_entry->valid = false;
> +                }
>              }
>          }
>          pasid++;
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index c8a936eb46..1c31b9a873 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -73,6 +73,9 @@ vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
>  vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 
> 0x%"PRIx16" index %d vec %d (should be: %d)"
>  vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 
> 0x%"PRIx16" index %d trigger %d (should be: %d)"
>  vtd_reset_exit(void) ""
> +vtd_device_attach_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, 
> int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
> +vtd_device_detach_hwpt(uint32_t dev_id, uint32_t pasid, int ret) "dev_id %d 
> pasid %d ret: %d"
> +vtd_device_reattach_def_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t 
> hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
>  
>  # amd_iommu.c
>  amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at 
> addr 0x%"PRIx64" +  offset 0x%"PRIx32
Thanks

Eric


Reply via email to