>-----Original Message----- >From: Eric Auger <eric.au...@redhat.com> >Subject: Re: [PATCH rfcv1 6/6] intel_iommu: add a framework to check and >sync host IOMMU cap/ecap > > > >On 1/18/24 10:30, Duan, Zhenzhong wrote: >> Hi Eric, >> >>> -----Original Message----- >>> From: Eric Auger <eric.au...@redhat.com> >>> Subject: Re: [PATCH rfcv1 6/6] intel_iommu: add a framework to check >and >>> sync host IOMMU cap/ecap >>> >>> Hi Zhenzhong, >>> >>> On 1/15/24 11:13, Zhenzhong Duan wrote: >>>> From: Yi Liu <yi.l....@intel.com> >>>> >>>> Add a framework to check and synchronize host IOMMU cap/ecap with >>>> vIOMMU cap/ecap. >>>> >>>> Currently only stage-2 translation is supported which is backed by >>>> shadow page table on host side. So we don't need exact matching of >>>> each bit of cap/ecap between vIOMMU and host. However, we can still >>>> utilize this framework to ensure compatibility of host and vIOMMU's >>>> address width at least, i.e., vIOMMU's aw_bits <= host aw_bits, >>>> which is missed before. >>>> >>>> When stage-1 translation is supported in future, a.k.a. scalable >>>> modern mode, we need to ensure compatibility of each bits. Some >>>> bits are user controllable, they should be checked with host side >>>> to ensure compatibility. Other bits are not, they should be synced >>>> into vIOMMU cap/ecap for compatibility. >>>> >>>> The sequence will be: >>>> >>>> vtd_cap_init() initializes iommu->cap/ecap. ---- vtd_cap_init() >>>> iommu->host_cap/ecap is initialized as iommu->cap/ecap. ---- vtd_init() >>>> iommu->host_cap/ecap is checked and updated some bits with host >>> cap/ecap. ---- vtd_sync_hw_info() >>>> iommu->cap/ecap is finalized as iommu->host_cap/ecap. ---- >>> vtd_machine_done_hook() >>>> iommu->host_cap/ecap is a temporary storage to hold intermediate >value >>>> when synthesize host cap/ecap and vIOMMU's initial configured >cap/ecap. >>>> >>>> Signed-off-by: Yi Liu <yi.l....@intel.com> >>>> Signed-off-by: Yi Sun <yi.y....@linux.intel.com> >>>> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com> >>>> --- >>>> include/hw/i386/intel_iommu.h | 4 ++ >>>> hw/i386/intel_iommu.c | 78 >>> +++++++++++++++++++++++++++++++---- >>>> 2 files changed, 75 insertions(+), 7 deletions(-) >>>> >>>> diff --git a/include/hw/i386/intel_iommu.h >>> b/include/hw/i386/intel_iommu.h >>>> index c65fdde56f..b8abbcce12 100644 >>>> --- a/include/hw/i386/intel_iommu.h >>>> +++ b/include/hw/i386/intel_iommu.h >>>> @@ -292,6 +292,9 @@ struct IntelIOMMUState { >>>> uint64_t cap; /* The value of capability reg */ >>>> uint64_t ecap; /* The value of extended capability >>>> reg */ >>>> >>>> + uint64_t host_cap; /* The value of host capability reg */ >>>> + uint64_t host_ecap; /* The value of host ext-capability >>>> reg */ >>>> + >>>> uint32_t context_cache_gen; /* Should be in [1,MAX] */ >>>> GHashTable *iotlb; /* IOTLB */ >>>> >>>> @@ -314,6 +317,7 @@ struct IntelIOMMUState { >>>> bool dma_translation; /* Whether DMA translation supported >*/ >>>> bool pasid; /* Whether to support PASID */ >>>> >>>> + bool cap_finalized; /* Whether VTD capability finalized */ >>>> /* >>>> * Protects IOMMU states in general. Currently it protects the >>>> * per-IOMMU IOTLB cache, and context entry cache in >>> VTDAddressSpace. >>>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c >>>> index 4c1d058ebd..be03fcbf52 100644 >>>> --- a/hw/i386/intel_iommu.c >>>> +++ b/hw/i386/intel_iommu.c >>>> @@ -3819,6 +3819,47 @@ VTDAddressSpace >>> *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, >>>> return vtd_dev_as; >>>> } >>>> >>>> +static bool vtd_sync_hw_info(IntelIOMMUState *s, struct >>> iommu_hw_info_vtd *vtd, >>>> + Error **errp) >>>> +{ >>>> + uint64_t addr_width; >>>> + >>>> + addr_width = (vtd->cap_reg >> 16) & 0x3fULL; >>>> + if (s->aw_bits > addr_width) { >>>> + error_setg(errp, "User aw-bits: %u > host address width: %lu", >>>> + s->aw_bits, addr_width); >>>> + return false; >>>> + } >>>> + >>>> + /* TODO: check and sync host cap/ecap into vIOMMU cap/ecap */ >>>> + >>>> + return true; >>>> +} >>>> + >>>> +/* >>>> + * virtual VT-d which wants nested needs to check the host IOMMU >>>> + * nesting cap info behind the assigned devices. Thus that vIOMMU >>>> + * could bind guest page table to host. >>>> + */ >>>> +static bool vtd_check_idev(IntelIOMMUState *s, IOMMUFDDevice >*idev, >>>> + Error **errp) >>>> +{ >>>> + struct iommu_hw_info_vtd vtd; >>>> + enum iommu_hw_info_type type = >>> IOMMU_HW_INFO_TYPE_INTEL_VTD; >>>> + >>>> + if (iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd)) { >>>> + error_setg(errp, "Failed to get IOMMU capability!!!"); >>>> + return false; >>>> + } >>>> + >>>> + if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) { >>>> + error_setg(errp, "IOMMU hardware is not compatible!!!"); >>>> + return false; >>>> + } >>>> + >>>> + return vtd_sync_hw_info(s, &vtd, errp); >>>> +} >>>> + >>>> static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, >int32_t >>> devfn, >>>> IOMMUFDDevice *idev, Error **errp) >>>> { >>>> @@ -3837,6 +3878,10 @@ static int >vtd_dev_set_iommu_device(PCIBus >>> *bus, void *opaque, int32_t devfn, >>>> return 0; >>>> } >>>> >>>> + if (!vtd_check_idev(s, idev, errp)) {In >>> In >>> [RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for >>> hotplugged devices >>> https://lore.kernel.org/all/20240117080414.316890-1- >>> eric.au...@redhat.com/ >>> >>> I also attempt to pass host iommu info to the virtio-iommu but with >>> legacy BE. >> I think your patch works with iommufd BE too😊 Because iommufd BE >> also fills bcontainer->iova_ranges in iommufd_cdev_get_info_iova_range(). >correct. I wanted to emphasize that we also have the need to pass host >iommu info in legacy mode for instance. In this series you introduce an >object that works with the iommufd backed but I think if we go this way >we would need another one for the legacy device. So maybe introducing a >base object derived into 2 ones may be the most appropriate? Maybe, >given the assumption that we will use iommufd for new use cases this >legacy object will implement much fewer interfaces but still.
How about this: enum IOMMU_LEGACY_DEVICE_TYPE { IOMMU_LEGACY_VFIO_DEVICE, IOMMU_LEGACY_VDPA_DEVICE, } typedef struct IOMMULegacyDevice { enum IOMMU_LEGACY_DEVICE_TYPE type; /* common field */ union { .... } } IOMMULegacyDevice; typedef struct IOMMUFDDevice { IOMMUFDBackend *iommufd; uint32_t dev_id; uint32_t ioas_id; } IOMMUFDDevice; enum IOMMUDEVICE_TYPE { IOMMUFD_DEVICE, IOMMU_LEGACY_DEVICE, } struct IOMMUDevice { enum IOMMU_DEVICE_TYPE type; /* common field */ GList *iova_ranges; union { IOMMULegacyDevice legacy_dev; IOMMUFDDevice idev; } } >> >>> In my case I want to pass the reserved memory regions which >>> also model the aw. >>> So this is a pretty similar use case. >> Yes. >> >>> Why don't we pass the pointer to an opaque iommu_hw_info instead, >>> through the PCIIOMMUOps? >> Passing iommu_hw_info is ok for this series, but we want more from >> IOMMUFDDevice in nesting series. I.e., allocate/free ioas/hwpt, >> attach/detach from hwpt, get dirty bitmap, etc. It's more flexible to >> let vIOMMU get what it want itself. >OK, would be interesting to define the class for this object. Worth to >be introduced either in the cover letter or in the 1st patch Not a QOM class because we don't want it showed out through query-qmp-schema. Thanks Zhenzhong > >Eric >> >>> >>> >>>> + return -1; >>>> + } >>>> + >>>> vtd_iommu_lock(s); >>>> >>>> vtd_idev = g_hash_table_lookup(s->vtd_iommufd_dev, &key); >>>> @@ -4071,10 +4116,11 @@ static void vtd_init(IntelIOMMUState *s) >>>> { >>>> X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); >>>> >>>> - memset(s->csr, 0, DMAR_REG_SIZE); >>>> - memset(s->wmask, 0, DMAR_REG_SIZE); >>>> - memset(s->w1cmask, 0, DMAR_REG_SIZE); >>>> - memset(s->womask, 0, DMAR_REG_SIZE); >>>> + /* CAP/ECAP are initialized in machine create done stage */ >>>> + memset(s->csr + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>> DMAR_GCMD_REG); >>>> + memset(s->wmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>> DMAR_GCMD_REG); >>>> + memset(s->w1cmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>> DMAR_GCMD_REG); >>>> + memset(s->womask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>> DMAR_GCMD_REG); >>> This change is not documented in the commit msg. >>> Sorry I don't get why this is needed? >> I'll doc it. Above we have one line to explain when cap/ecap are initialized. >> vtd_init() is called in qemu init and guest reset. In qemu init, >> Cap/ecap is finalized, after that we don't want cap/ecap to be changed. >> So we bypass change to cap/ecap here. >> >>>> s->root = 0; >>>> s->root_scalable = false; >>>> @@ -4110,13 +4156,16 @@ static void vtd_init(IntelIOMMUState *s) >>>> vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; >>>> } >>>> >>>> - vtd_cap_init(s); >>>> + if (!s->cap_finalized) { >>>> + vtd_cap_init(s); >>>> + s->host_cap = s->cap; >>>> + s->host_ecap = s->ecap; >>>> + } >>>> + >>>> vtd_reset_caches(s); >>>> >>>> /* Define registers with default values and bit semantics */ >>>> vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); >>>> - vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >>>> - vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >>>> vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); >>>> vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); >>>> vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); >>>> @@ -4241,6 +4290,12 @@ static bool >>> vtd_decide_config(IntelIOMMUState *s, Error **errp) >>>> return true; >>>> } >>>> >>>> +static void vtd_setup_capability_reg(IntelIOMMUState *s) >>>> +{ >>>> + vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >>>> + vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >>>> +} >>>> + >>>> static int vtd_machine_done_notify_one(Object *child, void *unused) >>>> { >>>> IntelIOMMUState *iommu = >>> INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >>>> @@ -4259,6 +4314,14 @@ static int >>> vtd_machine_done_notify_one(Object *child, void *unused) >>>> static void vtd_machine_done_hook(Notifier *notifier, void *unused) >>>> { >>>> + IntelIOMMUState *iommu = >>> INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >>>> + >>>> + iommu->cap = iommu->host_cap; >>>> + iommu->ecap = iommu->host_ecap; >>>> + iommu->cap_finalized = true; >>> I don't think you can change the defaults like this without taking care >>> of compats (migration). >> Will bump vtd_vmstate .version_id works? >> >> Thanks >> Zhenzhong >> >>> Thanks >>> >>> Eric >>>> + >>>> + vtd_setup_capability_reg(iommu); >>>> + >>>> object_child_foreach_recursive(object_get_root(), >>>> vtd_machine_done_notify_one, NULL); >>>> } >>>> @@ -4292,6 +4355,7 @@ static void vtd_realize(DeviceState *dev, >Error >>> **errp) >>>> QLIST_INIT(&s->vtd_as_with_notifiers); >>>> qemu_mutex_init(&s->iommu_lock); >>>> + s->cap_finalized = false; >>>> memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, >>>> "intel_iommu", DMAR_REG_SIZE); >>>> memory_region_add_subregion(get_system_memory(),