Hi Shameer,

On 11/20/25 2:21 PM, Shameer Kolothum wrote:
> For certain vIOMMU implementations, such as SMMUv3 in accelerated mode,
> the translation tables are programmed directly into the physical SMMUv3
> in a nested configuration. While QEMU knows where the guest tables live,
> safely walking them in software would require trapping and ordering all
> guest invalidations on every command queue. Without this, QEMU could race
> with guest updates and walk stale or freed page tables.
>
> This constraint is fundamental to the design of HW-accelerated vSMMU when
> used with downstream vfio-pci endpoint devices, where QEMU must never walk
> guest translation tables and must rely on the physical SMMU for
> translation. Future accelerated vSMMU features, such as virtual CMDQ, will
> also prevent trapping invalidations, reinforcing this restriction.
>
> For vfio-pci endpoints behind such a vSMMU, the only translation QEMU
> needs is for the MSI doorbell used when setting up KVM MSI route tables.
> Instead of attempting a software walk, introduce an optional vIOMMU
> callback that returns the MSI doorbell GPA directly.
>
> kvm_arch_fixup_msi_route() uses this callback when available and ignores
> the guest provided IOVA in that case.
>
> If the vIOMMU does not implement the callback, we fall back to the
> existing IOMMU based address space translation path.
>
> This ensures correct MSI routing for accelerated SMMUv3 + VFIO passthrough
> while avoiding unsafe software walks of guest translation tables.
>
> Cc: Michael S. Tsirkin <[email protected]>
> Signed-off-by: Shameer Kolothum <[email protected]>
> ---
>  hw/pci/pci.c         | 17 +++++++++++++++++
>  include/hw/pci/pci.h | 17 +++++++++++++++++
>  target/arm/kvm.c     | 18 +++++++++++++++++-
>  3 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 55647a6928..201583603f 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -2979,6 +2979,23 @@ bool pci_device_get_iommu_bus_devfn(PCIDevice *dev, PCIBus **piommu_bus,
>      return aliased;
>  }
>  
> +bool pci_device_iommu_msi_direct_gpa(PCIDevice *dev, hwaddr *out_doorbell)
> +{
> +    PCIBus *bus;
> +    PCIBus *iommu_bus;
> +    int devfn;
> +
> +    pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
> +    if (iommu_bus) {
> +        if (iommu_bus->iommu_ops->get_msi_direct_gpa) {
> +            *out_doorbell = iommu_bus->iommu_ops->get_msi_direct_gpa(bus,
> +                                iommu_bus->iommu_opaque, devfn);
> +            return true;
> +        }
> +    }
> +    return false;
> +}
> +
>  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
>  {
>      PCIBus *bus;
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index dd1c4483a2..0964049044 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -664,6 +664,22 @@ typedef struct PCIIOMMUOps {
>                              uint32_t pasid, bool priv_req, bool exec_req,
>                              hwaddr addr, bool lpig, uint16_t prgi, bool is_read,
>                              bool is_write);
> +    /**
> +     * @get_msi_direct_gpa: get the guest physical address of MSI doorbell
> +     * for the device on a PCI bus.
> +     *
> +     * Optional callback. If implemented, it must return a valid guest
> +     * physical address for the MSI doorbell
> +     *
> +     * @bus: the #PCIBus being accessed.
> +     *
> +     * @opaque: the data passed to pci_setup_iommu().
> +     *
> +     * @devfn: device and function number
> +     *
> +     * Returns: the guest physical address of the MSI doorbell.
> +     */
> +    uint64_t (*get_msi_direct_gpa)(PCIBus *bus, void *opaque, int devfn);
>  } PCIIOMMUOps;
>  
>  bool pci_device_get_iommu_bus_devfn(PCIDevice *dev, PCIBus **piommu_bus,
> @@ -672,6 +688,7 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
>  bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
>                                   Error **errp);
>  void pci_device_unset_iommu_device(PCIDevice *dev);
> +bool pci_device_iommu_msi_direct_gpa(PCIDevice *dev, hwaddr *out_doorbell);
>  
>  /**
>   * pci_device_get_viommu_flags: get vIOMMU flags.
> diff --git a/target/arm/kvm.c b/target/arm/kvm.c
> index 0d57081e69..2372de6a6e 100644
> --- a/target/arm/kvm.c
> +++ b/target/arm/kvm.c
> @@ -1620,26 +1620,42 @@ int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
>          return 0;
>      }
>  
> +    /*
> +     * We do have an IOMMU address space, but for some vIOMMU implementations
> +     * (e.g. accelerated SMMUv3) the translation tables are programmed into
> +     * the physical SMMUv3 in the host (nested S1=guest, S2=host). QEMU cannot
> +     * walk these tables in a safe way, so in that case we obtain the MSI
> +     * doorbell GPA directly from the vIOMMU backend and ignore the gIOVA
> +     * @address.
> +     */
> +    if (pci_device_iommu_msi_direct_gpa(dev, &doorbell_gpa)) {
> +        goto set_doorbell;
> +    }
> +
>      /* MSI doorbell address is translated by an IOMMU */
>  
> -    RCU_READ_LOCK_GUARD();
> +    rcu_read_lock();
What is the rationale behind the RCU changes, i.e. replacing RCU_READ_LOCK_GUARD() with explicit rcu_read_lock()/rcu_read_unlock() calls?

Eric
>  
>      mr = address_space_translate(as, address, &xlat, &len, true,
>                                   MEMTXATTRS_UNSPECIFIED);
>  
>      if (!mr) {
> +        rcu_read_unlock();
>          return 1;
>      }
>  
>      mrs = memory_region_find(mr, xlat, 1);
>  
>      if (!mrs.mr) {
> +        rcu_read_unlock();
>          return 1;
>      }
>  
>      doorbell_gpa = mrs.offset_within_address_space;
>      memory_region_unref(mrs.mr);
> +    rcu_read_unlock();
>  
> +set_doorbell:
>      route->u.msi.address_lo = doorbell_gpa;
>      route->u.msi.address_hi = doorbell_gpa >> 32;
>  


Reply via email to