On 10/9/2025 9:47 PM, Alejandro Jimenez wrote:
Hi Alejandro,
Thanks for reviewing

Hi Sairaj,

Good catch. This issue makes my Linux guest unusable due to kernel watchdog errors. This patch fixes the problem. I have a few comments, but nothing that would fundamentally alter the current behavior. Please see below...

Yep, I also faced a similar issue.


On 10/8/25 12:43 PM, Sairaj Kodilkar wrote:
The AMD IOMMU is set up at boot time and uses PCI bus numbers + devfn
for indexing into DTE. The problem is that before the guest is started,
all PCI bus numbers are 0 as no PCI discovery has happened yet (BIOS and/or
kernel will do that later), so relying on the bus number is wrong.
The immediate effect is that emulated devices cannot do DMA when placed on
a bus other than 0.

Replace the static array of address_space with a hash table which uses devfn
and PCIBus* as the key, as these are not going to change after the guest is
booted.

Co-developed-by: Alexey Kardashevskiy <[email protected]>
Signed-off-by: Alexey Kardashevskiy <[email protected]>
Signed-off-by: Sairaj Kodilkar <[email protected]>
---
  hw/i386/amd_iommu.c | 127 ++++++++++++++++++++++++++------------------
  hw/i386/amd_iommu.h |   2 +-
  2 files changed, 77 insertions(+), 52 deletions(-)

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 378e0cb55eab..0a4b4d46d885 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -59,7 +59,7 @@ const char *amdvi_mmio_high[] = {
  };
    struct AMDVIAddressSpace {
-    uint8_t bus_num;            /* bus number                           */
+    PCIBus *bus;                /* PCIBus (for bus number)              */
      uint8_t devfn;              /* device function                      */
      AMDVIState *iommu_state;    /* AMDVI - one per machine              */
      MemoryRegion root;          /* AMDVI Root memory map region         */
@@ -101,6 +101,11 @@ typedef enum AMDVIFaultReason {
      AMDVI_FR_PT_ENTRY_INV,      /* Failure to read PTE from guest memory */
  } AMDVIFaultReason;
  +typedef struct amdvi_as_key {
+    PCIBus *bus;
+    int devfn;

I'd prefer to use fixed-width types, i.e. uint8_t for devfn. That keeps it consistent with the same field in other local structs and with existing casts in the code (e.g. amdvi_host_dma_iommu()).
Sure, will update it.

+} amdvi_as_key;
+
  uint64_t amdvi_extended_feature_register(AMDVIState *s)
  {
      uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES;
@@ -382,6 +387,42 @@ static guint amdvi_uint64_hash(gconstpointer v)
      return (guint)*(const uint64_t *)v;
  }
  +static gboolean amdvi_as_equal(gconstpointer v1, gconstpointer v2)
+{
+    const struct amdvi_as_key *key1 = v1;
+    const struct amdvi_as_key *key2 = v2;
+
+    return key1->bus == key2->bus && key1->devfn == key2->devfn;
+}
+
+static guint amdvi_as_hash(gconstpointer v)
+{
+    const struct amdvi_as_key *key = v;
+    return (guint)((uint64_t)key->bus | (key->devfn << 24));

Any particular reason to build the hash in 'big endian' format?
I don't see a problem as long as it remains consistent, but it differs from the encoding used by the PCI_* builder macros in pci.h, as well as the vtd equivalent code.

Additionally, using uintptr_t instead of uint64_t when casting key->bus is a good way to document that we are hashing the pointer value itself. In practice I don't see any scenario where there would be a difference in behavior (the result is truncated anyway when casting to guint), but technically/pedantically uintptr_t is the correct choice to convert from a data pointer.

There is no particular reason for it to be big endian; I will make it consistent with pci.h.



+}
+
+static AMDVIAddressSpace *amdvi_as_lookup(AMDVIState *s, PCIBus *bus,
+                                          int devfn)
+{
+    amdvi_as_key key = { .bus = bus, .devfn = devfn };
+    return g_hash_table_lookup(s->address_spaces, &key);
+}
+
+static int amdvi_find_as_by_devid(gpointer key, gpointer value,

This should return a gboolean to exactly match the signature of the predicate argument used by g_hash_table_find(). gboolean is ultimately an int, but I don't know if a strict type checking tool might complain now or in the future, and since we are already using glib defined types we might as well keep it consistent.



Ack.

+ gpointer user_data)
+{
+    amdvi_as_key *as = (struct amdvi_as_key *)key;
+    uint16_t devid = *((uint16_t *)user_data);
+
+    return devid == PCI_BUILD_BDF(pci_bus_num(as->bus), as->devfn);
+}
+
+static AMDVIAddressSpace *amdvi_get_as_by_devid(AMDVIState *s, uint16_t devid)
+{
+    return g_hash_table_find(s->address_spaces,
+                             amdvi_find_as_by_devid, &devid);
+}
+
  static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr,
                                             uint64_t devid)
  {
@@ -551,7 +592,7 @@ static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
    static int amdvi_as_to_dte(AMDVIAddressSpace *as, uint64_t *dte)
  {
-    uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn);
+    uint16_t devid = PCI_BUILD_BDF(pci_bus_num(as->bus), as->devfn);
      AMDVIState *s = as->iommu_state;
        if (!amdvi_get_dte(s, devid, dte)) {
@@ -1011,25 +1052,14 @@ static void amdvi_switch_address_space(AMDVIAddressSpace *amdvi_as)
   */
  static void amdvi_reset_address_translation_all(AMDVIState *s)
  {
-    AMDVIAddressSpace **iommu_as;
-
-    for (int bus_num = 0; bus_num < PCI_BUS_MAX; bus_num++) {
-
-        /* Nothing to do if there are no devices on the current bus */
-        if (!s->address_spaces[bus_num]) {
-            continue;
-        }
-        iommu_as = s->address_spaces[bus_num];
+    AMDVIAddressSpace *iommu_as;
+    GHashTableIter as_it;
  -        for (int devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
+    g_hash_table_iter_init(&as_it, s->address_spaces);
  -            if (!iommu_as[devfn]) {
-                continue;
-            }
-            /* Use passthrough as default mode after reset */
-            iommu_as[devfn]->addr_translation = false;
-            amdvi_switch_address_space(iommu_as[devfn]);
-        }
+    while (g_hash_table_iter_next(&as_it, NULL, (void **)&iommu_as)) {

Let's keep the comment describing the behavior. This is something I want to discuss in a separate thread...

Ahh right, I forgot to re-add the comment after deleting the above part.


           /* Use passthrough as default mode after reset */

+        iommu_as->addr_translation = false;
+        amdvi_switch_address_space(iommu_as);
      }
  }
  @@ -1089,27 +1119,21 @@ static void enable_nodma_mode(AMDVIAddressSpace *as)
   */
  static void amdvi_update_addr_translation_mode(AMDVIState *s, uint16_t devid)
  {
-    uint8_t bus_num, devfn, dte_mode;
+    uint8_t dte_mode;
      AMDVIAddressSpace *as;
      uint64_t dte[4] = { 0 };
      int ret;
  -    /*
-     * Convert the devid encoded in the command to a bus and devfn in
-     * order to retrieve the corresponding address space.
-     */
-    bus_num = PCI_BUS_NUM(devid);
-    devfn = devid & 0xff;
-
      /*
       * The main buffer of size (AMDVIAddressSpace *) * (PCI_BUS_MAX) has already
       * been allocated within AMDVIState, but must be careful to not access
       * unallocated devfn.
       */

I think this block comment can be removed now that we have a better interface to retrieve the address space.

Right, thanks for noticing; will remove it.


-    if (!s->address_spaces[bus_num] || !s->address_spaces[bus_num][devfn]) {
+
+    as = amdvi_get_as_by_devid(s, devid);
+    if (!as) {
          return;
      }
-    as = s->address_spaces[bus_num][devfn];
        ret = amdvi_as_to_dte(as, dte);
  @@ -1783,7 +1807,7 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr,
                                 bool is_write, IOMMUTLBEntry *ret)
  {
      AMDVIState *s = as->iommu_state;
-    uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn);
+    uint16_t devid = PCI_BUILD_BDF(pci_bus_num(as->bus), as->devfn);
      AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid);
      uint64_t entry[4];
      int dte_ret;
@@ -1858,7 +1882,7 @@ static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
      }
        amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret);
-    trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
+    trace_amdvi_translation_result(pci_bus_num(as->bus), PCI_SLOT(as->devfn),
              PCI_FUNC(as->devfn), addr, ret.translated_addr);
      return ret;
  }
@@ -2222,30 +2246,28 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
  {
      char name[128];
      AMDVIState *s = opaque;
-    AMDVIAddressSpace **iommu_as, *amdvi_dev_as;
-    int bus_num = pci_bus_num(bus);
+    AMDVIAddressSpace *amdvi_dev_as;
+    amdvi_as_key *key;
  -    iommu_as = s->address_spaces[bus_num];
+    amdvi_dev_as = amdvi_as_lookup(s, bus, devfn);
        /* allocate memory during the first run */
-    if (!iommu_as) {
-        iommu_as = g_new0(AMDVIAddressSpace *, PCI_DEVFN_MAX);
-        s->address_spaces[bus_num] = iommu_as;
-    }
-
-    /* set up AMD-Vi region */
-    if (!iommu_as[devfn]) {
+    if (!amdvi_dev_as) {
          snprintf(name, sizeof(name), "amd_iommu_devfn_%d", devfn);
  -        iommu_as[devfn] = g_new0(AMDVIAddressSpace, 1);
-        iommu_as[devfn]->bus_num = (uint8_t)bus_num;
-        iommu_as[devfn]->devfn = (uint8_t)devfn;
-        iommu_as[devfn]->iommu_state = s;
-        iommu_as[devfn]->notifier_flags = IOMMU_NOTIFIER_NONE;
-        iommu_as[devfn]->iova_tree = iova_tree_new();
-        iommu_as[devfn]->addr_translation = false;
+        amdvi_dev_as = g_new0(AMDVIAddressSpace, 1);
+        key = g_new0(amdvi_as_key, 1);
  -        amdvi_dev_as = iommu_as[devfn];
+        amdvi_dev_as->bus = bus;
+        amdvi_dev_as->devfn = (uint8_t)devfn;
+        amdvi_dev_as->iommu_state = s;
+        amdvi_dev_as->notifier_flags = IOMMU_NONE;
Keep IOMMU_NOTIFIER_NONE. It is the correct type, as you pointed out in a previous patchset.

Ahh right. I forgot to update that field while rebasing patches from v2 of your series to master.

Thanks
Sairaj

Reply via email to