Hi, On 9/28/21 7:44 PM, Eric Auger wrote: > To handle SMMUv3 nested stage support it is practical to > expose the guest with reserved memory regions (RMRs) > covering the IOVAs used by the host kernel to map > physical MSI doorbells. > > Those IOVAs belong to [0x8000000, 0x8100000] matching > MSI_IOVA_BASE and MSI_IOVA_LENGTH definitions in kernel > arm-smmu-v3 driver. This is the window used to allocate > IOVAs matching physical MSI doorbells. > > With those RMRs, the guest is forced to use a flat mapping > for this range. Hence the assigned device is programmed > with one IOVA from this range. Stage 1, owned by the guest > has a flat mapping for this IOVA. Stage2, owned by the VMM > then enforces a mapping from this IOVA to the physical > MSI doorbell. > > At IORT table level, due to the single mapping flag being > set on the ID mapping, 256 IORT RMR nodes need to be > created per bus. This looks awkward from a specification > and implementation point of view. > > This may also produce a warning at execution time: > qemu-system-aarch64: warning: ACPI table size 114709 exceeds > 65536 bytes, migration may not work > (here with 5 pcie root ports, ie. 256 * 6 = 1536 RMR nodes!). > > The creation of those RMR nodes only is relevant if nested > stage SMMU is in use, along with VFIO. As VFIO devices can be > hotplugged, all RMRs need to be created in advance. Hence > the patch introduces a new arm virt "nested-smmuv3" iommu type. > > Signed-off-by: Eric Auger <eric.au...@redhat.com> > Suggested-by: Jean-Philippe Brucker <jean-phili...@linaro.org> > > --- > > Instead of introducing a new IOMMU type, we could introduce > an array of qdev_prop_reserved_region(s). > > Guest can parse the IORT RMR nodes with Shammer's series: > [PATCH v7 0/9] ACPI/IORT: Support for IORT RMR node
While further testing the SMMUv3 nested stage use case I noticed we miss a _DSM in the the ACPI device object of the PCIe host bridge (function 5). This DSM shall return 0 to indicate the OS must honour the PCI config the FW set at boot time. Otherwise, the RMRs are not taken into account in [PATCH v7 9/9] iommu/dma: Reserve any RMR regions associated with a dev (https://www.spinics.net/lists/linux-acpi/msg102492.html) if (!host->preserve_config) + return; So this patch is not sufficient for enabling the whole use case Thanks Eric > > The patch applies on Igor's v4 series [1]+ IORT E.b upgrade [2] > [1] [PATCH v4 00/35] acpi: refactor error prone build_header() > and packed structures usage in ACPI tables > [2] [PATCH 0/3] hw/arm/virt_acpi_build: Upgrate the IORT table > up to revision E.b > --- > include/hw/arm/virt.h | 7 ++++ > hw/arm/virt-acpi-build.c | 75 +++++++++++++++++++++++++++++++++------- > hw/arm/virt.c | 7 +++- > 3 files changed, 76 insertions(+), 13 deletions(-) > > diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h > index b461b8d261..f2f8aee219 100644 > --- a/include/hw/arm/virt.h > +++ b/include/hw/arm/virt.h > @@ -99,6 +99,7 @@ enum { > typedef enum VirtIOMMUType { > VIRT_IOMMU_NONE, > VIRT_IOMMU_SMMUV3, > + VIRT_IOMMU_NESTED_SMMUV3, > VIRT_IOMMU_VIRTIO, > } VirtIOMMUType; > > @@ -190,4 +191,10 @@ static inline int > virt_gicv3_redist_region_count(VirtMachineState *vms) > return MACHINE(vms)->smp.cpus > redist0_capacity ? 2 : 1; > } > > +static inline bool virt_has_smmuv3(const VirtMachineState *vms) > +{ > + return vms->iommu == VIRT_IOMMU_SMMUV3 || > + vms->iommu == VIRT_IOMMU_NESTED_SMMUV3; > +} > + > #endif /* QEMU_ARM_VIRT_H */ > diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c > index ce9311ac19..79842dea4c 100644 > --- a/hw/arm/virt-acpi-build.c > +++ b/hw/arm/virt-acpi-build.c > @@ -245,16 +245,16 @@ static void acpi_dsdt_add_tpm(Aml *scope, > VirtMachineState *vms) > #define ROOT_COMPLEX_ENTRY_SIZE 36 > #define IORT_NODE_OFFSET 48 > > -static void build_iort_id_mapping(GArray *table_data, uint32_t input_base, > - uint32_t id_count, uint32_t out_ref) > +static void > +build_iort_id_mapping(GArray *table_data, uint32_t input_base, > + uint32_t id_count, uint32_t out_ref, uint32_t flags) > { > /* Table 4 ID mapping format */ > build_append_int_noprefix(table_data, input_base, 4); /* Input base */ > build_append_int_noprefix(table_data, id_count, 4); /* Number of IDs */ > build_append_int_noprefix(table_data, input_base, 4); /* Output base */ > build_append_int_noprefix(table_data, out_ref, 4); /* Output Reference */ > - /* Flags */ > - build_append_int_noprefix(table_data, 0 /* Single mapping */, 4); > + build_append_int_noprefix(table_data, flags, 4); /* Flags */ > } > > struct AcpiIortIdMapping { > @@ -296,6 +296,49 @@ static int iort_idmap_compare(gconstpointer a, > gconstpointer b) > return idmap_a->input_base - idmap_b->input_base; > } > > +static void > +build_iort_rmr_node(GArray *table_data, GArray *smmu_idmaps, int > smmu_offset) { > + AcpiIortIdMapping *range; > + int i, j; > + > + for (i = 0; i < smmu_idmaps->len; i++) { > + range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); > + for (j = 0; j < range->id_count; j++) { > + int bdf = range->input_base + j; > + > + /* Table 18 Reserved Memory Range Node */ > + > + build_append_int_noprefix(table_data, 6 /* RMR */, 1); /* Type */ > + /* Length */ > + build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE > + 20, 2); > + build_append_int_noprefix(table_data, 3, 1); /* Revision */ > + build_append_int_noprefix(table_data, i * j + 3, 4); /* > Identifier */ > + /* Number of ID mappings */ > + build_append_int_noprefix(table_data, 1, 4); > + /* Reference to ID Array */ > + build_append_int_noprefix(table_data, 28, 4); > + > + /* RMR specific data */ > + > + /* Flags */ > + build_append_int_noprefix(table_data, 0 /* Disallow remapping > */, 4); > + /* Number of Memory Range Descriptors */ > + build_append_int_noprefix(table_data, 1 , 4); > + /* Reference to Memory Range Descriptors */ > + build_append_int_noprefix(table_data, 28 + > ID_MAPPING_ENTRY_SIZE, 4); > + build_iort_id_mapping(table_data, bdf, 1, smmu_offset, 1); > + > + /* Table 19 Memory Range Descriptor */ > + > + /* Physical Range offset */ > + build_append_int_noprefix(table_data, 0x8000000, 8); > + /* Physical Range length */ > + build_append_int_noprefix(table_data, 0x100000, 8); > + build_append_int_noprefix(table_data, 0, 4); /* Reserved */ > + } > + } > +} > + > /* > * Input Output Remapping Table (IORT) > * Conforms to "IO Remapping Table System Software on ARM Platforms", > @@ -316,12 +359,14 @@ build_iort(GArray *table_data, BIOSLinker *linker, > VirtMachineState *vms) > /* Table 2 The IORT */ > acpi_table_begin(&table, table_data); > > - if (vms->iommu == VIRT_IOMMU_SMMUV3) { > + if (virt_has_smmuv3(vms)) { > AcpiIortIdMapping next_range = {0}; > > object_child_foreach_recursive(object_get_root(), > iort_host_bridges, smmu_idmaps); > > + nb_nodes = 3; /* RC, ITS, SMMUv3 */ > + > /* Sort the smmu idmap by input_base */ > g_array_sort(smmu_idmaps, iort_idmap_compare); > > @@ -338,6 +383,9 @@ build_iort(GArray *table_data, BIOSLinker *linker, > VirtMachineState *vms) > } > > next_range.input_base = idmap->input_base + idmap->id_count; > + if (vms->iommu == VIRT_IOMMU_NESTED_SMMUV3) { > + nb_nodes += idmap->id_count; > + } > } > > /* Append the last RC -> ITS ID mapping */ > @@ -346,7 +394,6 @@ build_iort(GArray *table_data, BIOSLinker *linker, > VirtMachineState *vms) > g_array_append_val(its_idmaps, next_range); > } > > - nb_nodes = 3; /* RC, ITS, SMMUv3 */ > rc_mapping_count = smmu_idmaps->len + its_idmaps->len; > } else { > nb_nodes = 2; /* RC, ITS */ > @@ -371,7 +418,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, > VirtMachineState *vms) > /* GIC ITS Identifier Array */ > build_append_int_noprefix(table_data, 0 /* MADT translation_id */, 4); > > - if (vms->iommu == VIRT_IOMMU_SMMUV3) { > + if (virt_has_smmuv3(vms)) { > int irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE; > > smmu_offset = table_data->len - table.table_offset; > @@ -401,7 +448,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, > VirtMachineState *vms) > build_append_int_noprefix(table_data, 0, 4); > > /* output IORT node is the ITS group node (the first node) */ > - build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); > + build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET, 0); > } > > /* Table 17 Root Complex Node */ > @@ -434,7 +481,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, > VirtMachineState *vms) > build_append_int_noprefix(table_data, 0, 3); /* Reserved */ > > /* Output Reference */ > - if (vms->iommu == VIRT_IOMMU_SMMUV3) { > + if (virt_has_smmuv3(vms)) { > AcpiIortIdMapping *range; > > /* translated RIDs connect to SMMUv3 node: RC -> SMMUv3 -> ITS */ > @@ -442,7 +489,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, > VirtMachineState *vms) > range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); > /* output IORT node is the smmuv3 node */ > build_iort_id_mapping(table_data, range->input_base, > - range->id_count, smmu_offset); > + range->id_count, smmu_offset, 0); > } > > /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ > @@ -450,11 +497,15 @@ build_iort(GArray *table_data, BIOSLinker *linker, > VirtMachineState *vms) > range = &g_array_index(its_idmaps, AcpiIortIdMapping, i); > /* output IORT node is the ITS group node (the first node) */ > build_iort_id_mapping(table_data, range->input_base, > - range->id_count, iort_node_offset); > + range->id_count, iort_node_offset, 0); > } > } else { > /* output IORT node is the ITS group node (the first node) */ > - build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET); > + build_iort_id_mapping(table_data, 0, 0xFFFF, IORT_NODE_OFFSET, 0); > + } > + > + if (vms->iommu == VIRT_IOMMU_NESTED_SMMUV3) { > + build_iort_rmr_node(table_data, smmu_idmaps, smmu_offset); > } > > acpi_table_end(linker, &table); > diff --git a/hw/arm/virt.c b/hw/arm/virt.c > index 1d59f0e59f..f538611f4e 100644 > --- a/hw/arm/virt.c > +++ b/hw/arm/virt.c > @@ -1251,7 +1251,7 @@ static void create_smmu(const VirtMachineState *vms, > DeviceState *dev; > MachineState *ms = MACHINE(vms); > > - if (vms->iommu != VIRT_IOMMU_SMMUV3 || !vms->iommu_phandle) { > + if (!virt_has_smmuv3(vms) || !vms->iommu_phandle) { > return; > } > > @@ -1441,6 +1441,7 @@ static void create_pcie(VirtMachineState *vms) > > switch (vms->iommu) { > case VIRT_IOMMU_SMMUV3: > + case VIRT_IOMMU_NESTED_SMMUV3: > create_smmu(vms, vms->bus); > qemu_fdt_setprop_cells(ms->fdt, nodename, "iommu-map", > 0x0, vms->iommu_phandle, 0x0, 0x10000); > @@ -2314,6 +2315,8 @@ static char *virt_get_iommu(Object *obj, Error **errp) > return g_strdup("none"); > case VIRT_IOMMU_SMMUV3: > return g_strdup("smmuv3"); > + case VIRT_IOMMU_NESTED_SMMUV3: > + return g_strdup("nested-smmuv3"); > default: > g_assert_not_reached(); > } > @@ -2325,6 +2328,8 @@ static void virt_set_iommu(Object *obj, const char > *value, Error **errp) > > if (!strcmp(value, "smmuv3")) { > vms->iommu = VIRT_IOMMU_SMMUV3; > + } else if (!strcmp(value, "nested-smmuv3")) { > + vms->iommu = VIRT_IOMMU_NESTED_SMMUV3; > } else if (!strcmp(value, "none")) { > vms->iommu = VIRT_IOMMU_NONE; > } else {