On 11/17/25 10:37 AM, Zhenzhong Duan wrote: > On a system affected by ERRATA_772415, IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 > is reported by IOMMU_DEVICE_GET_HW_INFO. Due to this erratum, even the readonly > range mapped on second stage page table could still be written. > > Reference from 4th Gen Intel Xeon Processor Scalable Family Specification > Update, Errata Details, SPR17. > https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/ > > Also copied the SPR17 details from the above link: > "Problem: When remapping hardware is configured by system software in > scalable mode as Nested (PGTT=011b) and with PWSNP field Set in the > PASID-table-entry, it may Set Accessed bit and Dirty bit (and Extended > Access bit if enabled) in first-stage page-table entries even when > second-stage mappings indicate that corresponding first-stage page-table > is Read-Only. > > Implication: Due to this erratum, pages mapped as Read-only in second-stage > page-tables may be modified by remapping hardware Access/Dirty bit updates. > > Workaround: None identified. System software enabling nested translations > for a VM should ensure that there are no read-only pages in the > corresponding second-stage mappings." > > Introduce a helper vfio_device_get_host_iommu_quirk_bypass_ro to check if > readonly mappings should be bypassed. > > Signed-off-by: Zhenzhong Duan <[email protected]> Since it will be moved to a different series, I will skip the review for now. 
Thanks Eric > --- > include/hw/vfio/vfio-container.h | 1 + > include/hw/vfio/vfio-device.h | 3 +++ > hw/vfio/device.c | 14 ++++++++++++++ > hw/vfio/iommufd.c | 9 ++++++++- > hw/vfio/listener.c | 6 ++++-- > 5 files changed, 30 insertions(+), 3 deletions(-) > > diff --git a/include/hw/vfio/vfio-container.h > b/include/hw/vfio/vfio-container.h > index 9f6e8cedfc..a7d5c5ed67 100644 > --- a/include/hw/vfio/vfio-container.h > +++ b/include/hw/vfio/vfio-container.h > @@ -52,6 +52,7 @@ struct VFIOContainer { > QLIST_HEAD(, VFIODevice) device_list; > GList *iova_ranges; > NotifierWithReturn cpr_reboot_notifier; > + bool bypass_ro; > }; > > #define TYPE_VFIO_IOMMU "vfio-iommu" > diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h > index 48d00c7bc4..f6f3d0e378 100644 > --- a/include/hw/vfio/vfio-device.h > +++ b/include/hw/vfio/vfio-device.h > @@ -268,6 +268,9 @@ void vfio_device_prepare(VFIODevice *vbasedev, > VFIOContainer *bcontainer, > void vfio_device_unprepare(VFIODevice *vbasedev); > > bool vfio_device_get_viommu_flags_want_nesting(VFIODevice *vbasedev); > +bool vfio_device_get_host_iommu_quirk_bypass_ro(VFIODevice *vbasedev, > + uint32_t type, void *caps, > + uint32_t size); > > int vfio_device_get_region_info(VFIODevice *vbasedev, int index, > struct vfio_region_info **info); > diff --git a/hw/vfio/device.c b/hw/vfio/device.c > index 71eb069eb6..290011e154 100644 > --- a/hw/vfio/device.c > +++ b/hw/vfio/device.c > @@ -533,6 +533,20 @@ bool > vfio_device_get_viommu_flags_want_nesting(VFIODevice *vbasedev) > return false; > } > > +bool vfio_device_get_host_iommu_quirk_bypass_ro(VFIODevice *vbasedev, > + uint32_t type, void *caps, > + uint32_t size) > +{ > + VFIOPCIDevice *vdev = vfio_pci_from_vfio_device(vbasedev); > + > + if (vdev) { > + return !!(pci_device_get_host_iommu_quirks(PCI_DEVICE(vdev), type, > + caps, size) & > + HOST_IOMMU_QUIRK_NESTING_PARENT_BYPASS_RO); > + } > + return false; > +} > + > /* > * Traditional ioctl() based io > */ > 
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c > index 63f8442865..2a7b0d0c07 100644 > --- a/hw/vfio/iommufd.c > +++ b/hw/vfio/iommufd.c > @@ -351,6 +351,7 @@ static bool iommufd_cdev_autodomains_get(VFIODevice > *vbasedev, > VFIOContainer *bcontainer = VFIO_IOMMU(container); > uint32_t type, flags = 0; > uint64_t hw_caps; > + VendorCaps caps; > VFIOIOASHwpt *hwpt; > uint32_t hwpt_id; > int ret; > @@ -396,7 +397,8 @@ static bool iommufd_cdev_autodomains_get(VFIODevice > *vbasedev, > * instead. > */ > if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev->devid, > - &type, NULL, 0, &hw_caps, errp)) { > + &type, &caps, sizeof(caps), > &hw_caps, > + errp)) { > return false; > } > > @@ -411,6 +413,11 @@ static bool iommufd_cdev_autodomains_get(VFIODevice > *vbasedev, > */ > if (vfio_device_get_viommu_flags_want_nesting(vbasedev)) { > flags |= IOMMU_HWPT_ALLOC_NEST_PARENT; > + > + if (vfio_device_get_host_iommu_quirk_bypass_ro(vbasedev, type, > + &caps, sizeof(caps))) > { > + bcontainer->bypass_ro = true; > + } > } > > if (cpr_is_incoming()) { > diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c > index ca2377d860..090f935d30 100644 > --- a/hw/vfio/listener.c > +++ b/hw/vfio/listener.c > @@ -502,7 +502,8 @@ void vfio_container_region_add(VFIOContainer *bcontainer, > int ret; > Error *err = NULL; > > - if (!vfio_listener_valid_section(section, false, "region_add")) { > + if (!vfio_listener_valid_section(section, bcontainer->bypass_ro, > + "region_add")) { > return; > } > > @@ -668,7 +669,8 @@ static void vfio_listener_region_del(MemoryListener > *listener, > int ret; > bool try_unmap = true; > > - if (!vfio_listener_valid_section(section, false, "region_del")) { > + if (!vfio_listener_valid_section(section, bcontainer->bypass_ro, > + "region_del")) { > return; > } >
