On 10/23/2025 11:50 PM, Jason Gunthorpe wrote: > AMD IOMMU v1 is unique in supporting contiguous pages with a variable size > and it can decode the full 64 bit VA space. Unlike other x86 page tables > this explicitly does not do sign extension as part of allowing the entire > 64 bit VA space to be supported. > > The general design is quite similar to the x86 PAE format, except with a > 6th level and quite different PTE encoding. > > This format is the only one that uses the PT_FEAT_DYNAMIC_TOP feature in > the existing code as the existing AMDv1 code starts out with a 3 level > table and adds levels on the fly if more IOVA is needed. > > Comparing the performance of several operations to the existing version: > > iommu_map() > pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) > 2^12, 65,64 , 62,61 , -1.01 > 2^13, 70,66 , 67,62 , -8.08 > 2^14, 73,69 , 71,65 , -9.09 > 2^15, 78,75 , 75,71 , -5.05 > 2^16, 89,89 , 86,84 , -2.02 > 2^17, 128,121 , 124,112 , -10.10 > 2^18, 175,175 , 170,163 , -4.04 > 2^19, 264,306 , 261,279 , 6.06 > 2^20, 444,525 , 438,489 , 10.10 > 2^21, 60,62 , 58,59 , 1.01 > 256*2^12, 381,1833 , 367,1795 , 79.79 > 256*2^21, 375,1623 , 356,1555 , 77.77 > 256*2^30, 356,1338 , 349,1277 , 72.72 > > iommu_unmap() > pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) > 2^12, 76,89 , 71,86 , 17.17 > 2^13, 79,89 , 75,86 , 12.12 > 2^14, 78,90 , 74,86 , 13.13 > 2^15, 82,89 , 74,86 , 13.13 > 2^16, 79,89 , 74,86 , 13.13 > 2^17, 81,89 , 77,87 , 11.11 > 2^18, 90,92 , 87,89 , 2.02 > 2^19, 91,93 , 88,90 , 2.02 > 2^20, 96,95 , 91,92 , 1.01 > 2^21, 72,88 , 68,85 , 20.20 > 256*2^12, 372,6583 , 364,6251 , 94.94 > 256*2^21, 398,6032 , 392,5758 , 93.93 > 256*2^30, 396,5665 , 389,5258 , 92.92 > > The ~5-17x speedup when working with multi-PTE map/unmaps is because the > AMD implementation rewalks the entire table on every new PTE while this > version retains its position. The same speedup will be seen with dirtys as > well. 
> > The old implementation triggers a compiler optimization that ends up > generating a "rep stos" memset for contiguous PTEs. Since AMD can have > contiguous PTEs that span 2Kbytes of table this is a huge win compared to > a normal movq loop. It is why the unmap side has a fairly flat runtime as > the contiguous PTE size increases. This version makes it explicit with a > memset64() call. > > Tested-by: Alejandro Jimenez <[email protected]> > Reviewed-by: Kevin Tian <[email protected]> > Signed-off-by: Jason Gunthorpe <[email protected]>
Reviewed-by: Vasant Hegde <[email protected]> > --- > drivers/iommu/Makefile | 1 + > drivers/iommu/generic_pt/Kconfig | 12 + > drivers/iommu/generic_pt/fmt/Makefile | 11 + > drivers/iommu/generic_pt/fmt/amdv1.h | 391 +++++++++++++++++++++ > drivers/iommu/generic_pt/fmt/defs_amdv1.h | 21 ++ > drivers/iommu/generic_pt/fmt/iommu_amdv1.c | 15 + > include/linux/generic_pt/common.h | 19 + > include/linux/generic_pt/iommu.h | 12 + > 8 files changed, 482 insertions(+) > create mode 100644 drivers/iommu/generic_pt/fmt/Makefile > create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h > create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h > create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c > .../... > +$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m))) > diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h > b/drivers/iommu/generic_pt/fmt/amdv1.h > new file mode 100644 > index 00000000000000..1f46e4ab4aea51 > --- /dev/null > +++ b/drivers/iommu/generic_pt/fmt/amdv1.h > @@ -0,0 +1,391 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES > + * > + * AMD IOMMU v1 page table > + * > + * This is described in Section "2.2.3 I/O Page Tables for Host Translations" > + * of the "AMD I/O Virtualization Technology (IOMMU) Specification" > + * > + * Note the level numbering here matches the core code, so level 0 is the > same > + * as mode 1. 
> + * > + */ > +#ifndef __GENERIC_PT_FMT_AMDV1_H > +#define __GENERIC_PT_FMT_AMDV1_H > + > +#include "defs_amdv1.h" > +#include "../pt_defs.h" > + > +#include <asm/page.h> > +#include <linux/bitfield.h> > +#include <linux/container_of.h> > +#include <linux/mem_encrypt.h> > +#include <linux/minmax.h> > +#include <linux/sizes.h> > +#include <linux/string.h> > + > +enum { > + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, > + PT_MAX_VA_ADDRESS_LG2 = 64, > + PT_ITEM_WORD_SIZE = sizeof(u64), > + PT_MAX_TOP_LEVEL = 5, > + PT_GRANULE_LG2SZ = 12, > + PT_TABLEMEM_LG2SZ = 12, > + > + /* The DTE only has these bits for the top physical address */ > + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), > +}; > + > +/* PTE bits */ > +enum { > + AMDV1PT_FMT_PR = BIT(0), > + AMDV1PT_FMT_D = BIT(6), > + AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9), > + AMDV1PT_FMT_OA = GENMASK_ULL(51, 12), > + AMDV1PT_FMT_FC = BIT_ULL(60), > + AMDV1PT_FMT_IR = BIT_ULL(61), > + AMDV1PT_FMT_IW = BIT_ULL(62), > +}; > + > +/* > + * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, > make > + * these defines to avoid it. > + */ > +#define AMDV1PT_FMT_NL_DEFAULT 0 > +#define AMDV1PT_FMT_NL_SIZE 7 > + > +#define common_to_amdv1pt(common_ptr) \ > + container_of_const(common_ptr, struct pt_amdv1, common) > +#define to_amdv1pt(pts) common_to_amdv1pt((pts)->range->common) Unused macros? -Vasant
