Hi Jason,

On Mon, May 05, 2025 at 11:18:37AM -0300, Jason Gunthorpe wrote:
> map is slightly complicated because it has to handle a number of special
> edge cases:
>  - Overmapping a previously shared table with an OA - requires validating
>    and freeing the possibly empty tables
>  - Doing the above across an entire to-be-created contiguous entry
>  - Installing a new shared table level concurrently with another thread
>  - Expanding the table by adding more top levels
>
> Table expansion is a unique feature of AMDv1, this version is quite
> similar except we handle racing concurrent lockless map. The table top
> pointer and starting level are encoded in a single uintptr_t which ensures
> we can READ_ONCE() without tearing. Any op will do the READ_ONCE() and use
> that fixed point as its starting point. Concurrent expansion is handled
> with a table global spinlock.
>
> When inserting a new table entry map checks that the entire portion of the
> table is empty. This includes freeing any empty lower tables that will be
> overwritten by an OA. A separate free list is used while checking and
> collecting all the empty lower tables so that writing the new entry is
> uninterrupted, either the new entry fully writes or nothing changes.
>
> A special fast path for PAGE_SIZE is implemented that does a direct walk
> to the leaf level and installs a single entry. This gives ~15% improvement
> for iommu_map() when mapping lists of single pages.
>
> This version sits under the iommu_domain_ops as map_pages() but does not
> require the external page size calculation. The implementation is actually
> map_range() and can do arbitrary ranges, internally handling all the
> validation and supporting any arrangement of page sizes. A future series
> can optimize iommu_map() to take advantage of this.
>
> Signed-off-by: Jason Gunthorpe <j...@nvidia.com>
> ---
>  drivers/iommu/generic_pt/iommu_pt.h | 481 ++++++++++++++++++++++++++++
>  include/linux/generic_pt/iommu.h    |  58 ++++
>  2 files changed, 539 insertions(+)
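
The encoding trick in the commit message reads well. Just to confirm my
understanding: the table pointer is sufficiently aligned that its low bits
can carry the starting level, so a single READ_ONCE() hands a walker a
consistent (pointer, level) pair with no possibility of tearing. Roughly
like this sketch (the my_* names are mine, purely for illustration, and
the three-bit mask is a guess):

	#define MY_LEVEL_MASK ((uintptr_t)0x7)

	static inline uintptr_t my_top_encode(struct pt_table_p *table,
					      unsigned int level)
	{
		/*
		 * Table allocations are at least 8-byte aligned, so the
		 * low three bits are free to hold the starting level.
		 */
		return (uintptr_t)table | level;
	}

	static inline struct pt_table_p *my_top_table(uintptr_t top)
	{
		return (struct pt_table_p *)(top & ~MY_LEVEL_MASK);
	}

	static inline unsigned int my_top_level(uintptr_t top)
	{
		return top & MY_LEVEL_MASK;
	}

If that is right, it might be worth a sentence near the top_of_table
definition.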
> diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
> index aa15fb79abf24a..7a680017f35bff 100644
> --- a/drivers/iommu/generic_pt/iommu_pt.h
> +++ b/drivers/iommu/generic_pt/iommu_pt.h
> @@ -90,6 +90,23 @@ static __maybe_unused int make_range_u64(struct pt_common *common,
>  		ret;                                                   \
>  	})
>
> +static inline unsigned int compute_best_pgsize(struct pt_state *pts,
> +					       pt_oaddr_t oa)
> +{
> +	struct pt_iommu *iommu_table = iommu_from_common(pts->range->common);
> +
> +	if (!pt_can_have_leaf(pts))
> +		return 0;
> +
> +	/*
> +	 * The page size is limited by the domain's bitmap. This allows the core
> +	 * code to reduce the supported page sizes by changing the bitmap.
> +	 */
> +	return pt_compute_best_pgsize(pt_possible_sizes(pts) &
> +					      iommu_table->domain.pgsize_bitmap,
> +				      pts->range->va, pts->range->last_va, oa);
> +}
> +
>  static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg,
>  					     unsigned int level,
>  					     struct pt_table_p *table,
> @@ -189,6 +206,463 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
>  		log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
>  }
>
> +/* Allocate an interior table */
> +static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
> +					     gfp_t gfp)
> +{
> +	struct pt_iommu *iommu_table =
> +		iommu_from_common(parent_pts->range->common);
> +	struct pt_state child_pts =
> +		pt_init(parent_pts->range, parent_pts->level - 1, NULL);
> +
> +	return iommu_alloc_pages_node_sz(
> +		iommu_table->nid, gfp,
> +		log2_to_int(pt_num_items_lg2(&child_pts) +
> +			    ilog2(PT_ENTRY_WORD_SIZE)));
> +}
> +
> +static inline int pt_iommu_new_table(struct pt_state *pts,
> +				     struct pt_write_attrs *attrs)
> +{
> +	struct pt_table_p *table_mem;
> +	phys_addr_t phys;
> +
> +	/* Given PA/VA/length can't be represented */
> +	if (unlikely(!pt_can_have_table(pts)))
> +		return -ENXIO;
> +
> +	table_mem = table_alloc(pts, attrs->gfp);
> +	if (IS_ERR(table_mem))
> +		return PTR_ERR(table_mem);
> +
> +	phys = virt_to_phys(table_mem);
> +	if (!pt_install_table(pts, phys, attrs)) {
> +		iommu_free_pages(table_mem);
> +		return -EAGAIN;
> +	}
> +
> +	if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
> +		/*
> +		 * The underlying table can't store the physical table address.
> +		 * This happens when kunit testing tables outside their normal
> +		 * environment where a CPU might be limited.
> +		 */
> +		pt_load_single_entry(pts);
> +		if (PT_WARN_ON(pt_table_pa(pts) != phys)) {
> +			pt_clear_entry(pts, ilog2(1));
> +			iommu_free_pages(table_mem);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	pts->table_lower = table_mem;
> +	return 0;
> +}
> +
> +struct pt_iommu_map_args {
> +	struct iommu_iotlb_gather *iotlb_gather;
> +	struct pt_write_attrs attrs;
> +	pt_oaddr_t oa;
> +	unsigned int leaf_pgsize_lg2;
> +	unsigned int leaf_level;
> +};
> +
> +/*
> + * Check that the items in a contiguous block are all empty. This will
> + * recursively check any tables in the block to validate they are empty and
> + * accumulate them on the free list. Makes no change on failure. On success
> + * caller must fill the items.
> + */
> +static int clear_contig(const struct pt_state *start_pts,
> +			struct iommu_iotlb_gather *iotlb_gather,
> +			unsigned int step, unsigned int pgsize_lg2)
> +{
> +	struct pt_iommu *iommu_table =
> +		iommu_from_common(start_pts->range->common);
> +	struct pt_range range = *start_pts->range;
> +	struct pt_state pts =
> +		pt_init(&range, start_pts->level, start_pts->table);
> +	struct pt_iommu_collect_args collect = {
> +		.free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
> +	};
> +	pt_vaddr_t start_va = range.va;
> +	int ret;
> +
> +	pts.index = start_pts->index;
> +	pts.end_index = start_pts->index + step;
> +	for (; _pt_iter_load(&pts); pt_next_entry(&pts)) {
> +		if (pts.type == PT_ENTRY_TABLE) {
> +			ret = pt_walk_descend_all(&pts, __collect_tables,
> +						  &collect);
> +			if (ret)
> +				return ret;
> +
> +			iommu_pages_list_add(&collect.free_list,
> +					     pt_table_ptr(&pts));
> +		} else if (pts.type != PT_ENTRY_EMPTY) {
> +			return -EADDRINUSE;
> +		}
> +	}
> +
> +	if (!iommu_pages_list_empty(&collect.free_list)) {
> +		gather_range(iotlb_gather, iommu_table, start_va,
> +			     range.va - start_va);
> +		iommu_pages_list_splice(&collect.free_list,
> +					&iotlb_gather->freelist);
> +	}
> +	return 0;
> +}
> +
> +static int __map_range_leaf(struct pt_range *range, void *arg,
> +			    unsigned int level, struct pt_table_p *table)
> +{
> +	struct pt_state pts = pt_init(range, level, table);
> +	struct pt_iommu_map_args *map = arg;
> +	unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2;
> +	unsigned int start_index;
> +	pt_oaddr_t oa = map->oa;
> +	unsigned int step;
> +	bool need_contig;
> +
> +	PT_WARN_ON(map->leaf_level != level);
> +	PT_WARN_ON(!pt_can_have_leaf(&pts));
> +
> +	step = log2_to_int_t(unsigned int,
> +			     leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts));
> +	need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts);
> +
> +	_pt_iter_first(&pts);
> +	start_index = pts.index;
> +	do {
> +		pts.type = pt_load_entry_raw(&pts);
> +		if (pts.type != PT_ENTRY_EMPTY || need_contig) {
> +			int ret;
> +
> +			if (pts.index != start_index)
> +				pt_index_to_va(&pts);
> +			ret = clear_contig(&pts, map->iotlb_gather, step,
> +					   leaf_pgsize_lg2);
> +			if (ret) {
> +				map->oa = oa;
> +				return ret;
> +			}
> +		}
> +
> +		PT_WARN_ON(compute_best_pgsize(&pts, oa) != leaf_pgsize_lg2);
> +
> +		pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs);
> +
> +		oa += log2_to_int(leaf_pgsize_lg2);
> +		pts.index += step;
> +	} while (pts.index < pts.end_index);
> +
> +	map->oa = oa;
> +	return 0;
> +}
> +
> +static int __map_range(struct pt_range *range, void *arg, unsigned int level,
> +		       struct pt_table_p *table)
> +{
> +	struct pt_state pts = pt_init(range, level, table);
> +	struct pt_iommu_map_args *map = arg;
> +	int ret;
> +
> +	PT_WARN_ON(map->leaf_level == level);
> +	PT_WARN_ON(!pt_can_have_table(&pts));
> +
> +	_pt_iter_first(&pts);
> +
> +	/* Descend to a child table */
> +	do {
> +		pts.type = pt_load_entry_raw(&pts);
> +
> +		if (pts.type != PT_ENTRY_TABLE) {
> +			if (pts.type != PT_ENTRY_EMPTY)
> +				return -EADDRINUSE;
> +			ret = pt_iommu_new_table(&pts, &map->attrs);
> +			if (ret) {
> +				/*
> +				 * Racing with another thread installing a table
> +				 */
> +				if (ret == -EAGAIN)
> +					continue;
> +				return ret;
> +			}
> +		} else {
> +			pts.table_lower = pt_table_ptr(&pts);
> +		}
> +
> +		/*
> +		 * The already present table can possibly be shared with another
> +		 * concurrent map.
> +		 */
> +		if (map->leaf_level == level - 1)
> +			ret = pt_descend(&pts, arg, __map_range_leaf);
> +		else
> +			ret = pt_descend(&pts, arg, __map_range);
> +		if (ret)
> +			return ret;
> +
> +		pts.index++;
> +		pt_index_to_va(&pts);
> +		if (pts.index >= pts.end_index)
> +			break;
> +		pts.type = pt_load_entry_raw(&pts);
> +	} while (true);
> +	return 0;
> +}
> +
> +static __always_inline int __do_map_single_page(struct pt_range *range,
> +						void *arg, unsigned int level,
> +						struct pt_table_p *table,
> +						pt_level_fn_t descend_fn)
> +{
> +	struct pt_state pts = pt_init(range, level, table);
> +	struct pt_iommu_map_args *map = arg;
> +
> +	pts.type = pt_load_single_entry(&pts);
> +	if (level == 0) {
> +		if (pts.type != PT_ENTRY_EMPTY)
> +			return -EADDRINUSE;
> +		pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT,
> +				      &map->attrs);
> +		map->oa += PAGE_SIZE;
> +		return 0;
> +	}
> +	if (pts.type != PT_ENTRY_TABLE)
> +		return -EAGAIN;
> +	return pt_descend(&pts, arg, descend_fn);
> +}
> +PT_MAKE_LEVELS(__map_single_page, __do_map_single_page);
> +
> +/*
> + * Add a table to the top, increasing the top level as much as necessary to
> + * encompass range.
> + */
> +static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
> +			struct pt_iommu_map_args *map)
> +{
> +	struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list);
> +	struct pt_common *common = common_from_iommu(iommu_table);
> +	uintptr_t top_of_table = READ_ONCE(common->top_of_table);
> +	uintptr_t new_top_of_table = top_of_table;
> +	struct pt_table_p *table_mem;
> +	unsigned int new_level;
> +	spinlock_t *domain_lock;
> +	unsigned long flags;
> +	int ret;
> +
> +	while (true) {
> +		struct pt_range top_range =
> +			_pt_top_range(common, new_top_of_table);
> +		struct pt_state pts = pt_init_top(&top_range);
> +
> +		top_range.va = range->va;
> +		top_range.last_va = range->last_va;
> +
> +		if (!pt_check_range(&top_range) && map->leaf_level <= pts.level)
> +			break;
> +
> +		pts.level++;
> +		if (pts.level > PT_MAX_TOP_LEVEL ||
> +		    pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) {
> +			ret = -ERANGE;
> +			goto err_free;
> +		}
> +
> +		new_level = pts.level;
> +		table_mem = table_alloc_top(
> +			common, _pt_top_set(NULL, pts.level), map->attrs.gfp);
> +		if (IS_ERR(table_mem))
> +			return PTR_ERR(table_mem);
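
If table_alloc_top() fails on the second or a later pass through this
while loop, the tables already queued on free_list by the earlier passes
look like they are leaked, since this early return bypasses err_free. A
minimal sketch of what I would have expected here, reusing the function's
existing err_free label (illustrative only, not tested):

		table_mem = table_alloc_top(
			common, _pt_top_set(NULL, pts.level), map->attrs.gfp);
		if (IS_ERR(table_mem)) {
			/* Also frees tables queued by earlier iterations */
			ret = PTR_ERR(table_mem);
			goto err_free;
		}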
In other words, for every iteration of the while loop after the first,
this should go through err_free rather than returning directly, correct?

> +		iommu_pages_list_add(&free_list, table_mem);
> +
> +		/* The new table links to the lower table always at index 0 */
> +		top_range.va = 0;
> +		top_range.top_level = new_level;
> +		pts.table_lower = pts.table;
> +		pts.table = table_mem;
> +		pt_load_single_entry(&pts);
> +		PT_WARN_ON(pts.index != 0);
> +		pt_install_table(&pts, virt_to_phys(pts.table_lower),
> +				 &map->attrs);
> +		new_top_of_table = _pt_top_set(pts.table, pts.level);
> +	}
> +
> +	/*
> +	 * top_of_table is write locked by the spinlock, but readers can use
> +	 * READ_ONCE() to get the value. Since we encode both the level and the
> +	 * pointer in one quantum the lockless reader will always see something
> +	 * valid. The HW must be updated to the new level under the spinlock
> +	 * before top_of_table is updated so that concurrent readers don't map
> +	 * into the new level until it is fully functional. If another thread
> +	 * already updated it while we were working then throw everything away
> +	 * and try again.
> +	 */
> +	domain_lock = iommu_table->hw_flush_ops->get_top_lock(iommu_table);
> +	spin_lock_irqsave(domain_lock, flags);
> +	if (common->top_of_table != top_of_table) {
> +		spin_unlock_irqrestore(domain_lock, flags);
> +		ret = -EAGAIN;
> +		goto err_free;
> +	}
> +
> +	/*
> +	 * We do not issue any flushes for change_top on the expectation that
> +	 * any walk cache will not become a problem by adding another layer to
> +	 * the tree. Misses will rewalk from the updated top pointer, hits
> +	 * continue to be correct. Negative caching is fine too since all the
> +	 * new IOVA added by the new top is non-present.
> +	 */
> +	iommu_table->hw_flush_ops->change_top(
> +		iommu_table, virt_to_phys(table_mem), new_level);
> +	WRITE_ONCE(common->top_of_table, new_top_of_table);
> +	spin_unlock_irqrestore(domain_lock, flags);
> +	return 0;
> +
> +err_free:
> +	iommu_put_pages_list(&free_list);
> +	return ret;
> +}
> +
> +static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range,
> +			   struct pt_iommu_map_args *map)
> +{
> +	struct pt_common *common = common_from_iommu(iommu_table);
> +	int ret;
> +
> +	do {
> +		ret = pt_check_range(range);
> +		if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
> +			return ret;
> +
> +		if (!ret && map->leaf_level <= range->top_level)
> +			break;
> +
> +		ret = increase_top(iommu_table, range, map);
> +		if (ret && ret != -EAGAIN)
> +			return ret;
> +
> +		/* Reload the new top */
> +		*range = pt_make_range(common, range->va, range->last_va);
> +	} while (ret);
> +	PT_WARN_ON(pt_check_range(range));
> +	return 0;
> +}
> +
> +/**
> + * map_range() - Install translation for an IOVA range
> + * @iommu_table: Table to manipulate
> + * @iova: IO virtual address to start
> + * @paddr: Physical/Output address to start
> + * @len: Length of the range starting from @iova
> + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
> + * @gfp: GFP flags for any memory allocations
> + * @gather: Gather struct that must be flushed on return
> + *
> + * The range starting at IOVA will have paddr installed into it. The range is
> + * automatically segmented into optimally sized table entries, and can have
> + * any valid alignment.
> + *
> + * On error the caller will probably want to invoke unmap on the range from
> + * iova up to the amount indicated by @mapped to return the table back to an
> + * unchanged state.
> + *
> + * Context: The caller must hold a write range lock that includes the whole
> + * range.
> + *
> + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that
> + * were mapped are added to @mapped; @mapped is not zeroed first.
> + */
> +int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
> +			 phys_addr_t paddr, size_t pgsize, size_t pgcount,
> +			 int prot, gfp_t gfp, size_t *mapped)
> +{
> +	struct pt_iommu *iommu_table =
> +		container_of(domain, struct pt_iommu, domain);
> +	pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap;
> +	struct pt_common *common = common_from_iommu(iommu_table);
> +	struct iommu_iotlb_gather iotlb_gather;
> +	pt_vaddr_t len = pgsize * pgcount;
> +	struct pt_iommu_map_args map = {
> +		.iotlb_gather = &iotlb_gather,
> +		.oa = paddr,
> +		.leaf_pgsize_lg2 = log2_ffs(pgsize),
> +	};
> +	bool single_page = false;
> +	struct pt_range range;
> +	int ret;
> +
> +	iommu_iotlb_gather_init(&iotlb_gather);
> +
> +	if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE))))
> +		return -EINVAL;
> +
> +	/* Check the paddr doesn't exceed what the table can store */
> +	if ((sizeof(pt_oaddr_t) > sizeof(paddr) && paddr > PT_VADDR_MAX) ||
> +	    (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 &&
> +	     oalog2_div(paddr, common->max_oasz_lg2)))
> +		return -ERANGE;
> +
> +	ret = pt_iommu_set_prot(common, &map.attrs, prot);
> +	if (ret)
> +		return ret;
> +	map.attrs.gfp = gfp;
> +
> +	ret = make_range_no_check(common, &range, iova, len);
> +	if (ret)
> +		return ret;
> +
> +	/* Calculate target page size and level for the leaves */
> +	if (pt_has_system_page(common) && pgsize == PAGE_SIZE && pgcount == 1) {
> +		PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE));
> +		if (log2_mod(iova | paddr, PAGE_SHIFT))
> +			return -ENXIO;
> +		map.leaf_pgsize_lg2 = PAGE_SHIFT;
> +		map.leaf_level = 0;
> +		single_page = true;
> +	} else {
> +		map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
> +			pgsize_bitmap, range.va, range.last_va, paddr);
> +		if (!map.leaf_pgsize_lg2)
> +			return -ENXIO;
> +		map.leaf_level =
> +			pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
> +	}
> +
> +	ret = check_map_range(iommu_table, &range, &map);
> +	if (ret)
> +		return ret;
> +
> +	PT_WARN_ON(map.leaf_level > range.top_level);
> +
> +	do {
> +		if (single_page) {
> +			ret = pt_walk_range(&range, __map_single_page, &map);
> +			if (ret != -EAGAIN)
> +				break;
> +		}
> +
> +		if (map.leaf_level == range.top_level)
> +			ret = pt_walk_range(&range, __map_range_leaf, &map);
> +		else
> +			ret = pt_walk_range(&range, __map_range, &map);
> +	} while (false);
> +
> +	/*
> +	 * Table levels were freed and replaced with large items, flush any walk
> +	 * cache that may refer to the freed levels.
> +	 */
> +	if (!iommu_pages_list_empty(&iotlb_gather.freelist))
> +		iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather);
> +
> +	/* Bytes successfully mapped */
> +	PT_WARN_ON(!ret && map.oa - paddr != len);
> +	*mapped += map.oa - paddr;
> +	return ret;
> +}
> +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU");
> +
>  struct pt_unmap_args {
>  	struct iommu_pages_list free_list;
>  	pt_vaddr_t unmapped;
> @@ -448,6 +922,7 @@ static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
>  	memset_after(fmt_table, 0, iommu.domain);
>
>  	/* The caller can initialize some of these values */
> +	iommu_table->hw_flush_ops = cfg.hw_flush_ops;
>  	iommu_table->nid = cfg.nid;
>  }
>
> @@ -483,6 +958,12 @@ int pt_iommu_init(struct pt_iommu_table *fmt_table,
>  	if (ret)
>  		return ret;
>
> +	if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
> +	    WARN_ON(!iommu_table->hw_flush_ops ||
> +		    !iommu_table->hw_flush_ops->change_top ||
> +		    !iommu_table->hw_flush_ops->get_top_lock))
> +		return -EINVAL;
> +
>  	if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
>  	    (pt_feature(common, PT_FEAT_FULL_VA) ||
>  	     pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
> diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
> index 862d224c59281e..fd0c598526ce57 100644
> --- a/include/linux/generic_pt/iommu.h
> +++ b/include/linux/generic_pt/iommu.h
> @@ -11,6 +11,7 @@
>
>  struct iommu_iotlb_gather;
>  struct pt_iommu_ops;
> +struct pt_iommu_flush_ops;
>
>  /**
>   * DOC: IOMMU Radix Page Table
> @@ -43,6 +44,12 @@ struct pt_iommu {
>  	 */
>  	const struct pt_iommu_ops *ops;
>
> +	/**
> +	 * @hw_flush_ops - Function pointers provided by the HW driver to flush
> +	 * HW caches after changes to the page table.
> +	 */
> +	const struct pt_iommu_flush_ops *hw_flush_ops;
> +
>  	/**
>  	 * @nid - Node ID to use for table memory allocations. The iommu driver
>  	 * may want to set the NID to the device's NID, if there are multiple
> @@ -84,6 +91,52 @@ struct pt_iommu_ops {
>  	void (*deinit)(struct pt_iommu *iommu_table);
>  };
>
> +/**
> + * struct pt_iommu_flush_ops - HW IOTLB cache flushing operations
> + *
> + * The IOMMU driver should implement these using container_of(iommu_table) to
> + * get to its iommu_domain derived structure. All ops can be called in atomic
> + * contexts as they are buried under DMA API calls.
> + */
> +struct pt_iommu_flush_ops {
> +	/**
> +	 * change_top() - Update the top of table pointer
> +	 * @iommu_table: Table to operate on
> +	 * @top_paddr: New CPU physical address of the top pointer
> +	 * @top_level: IOMMU PT level of the new top
> +	 *
> +	 * Called under the get_top_lock() spinlock. The driver must update all
> +	 * HW references to this domain with a new top address and
> +	 * configuration. On return mappings placed in the new top must be
> +	 * reachable by the HW.
> +	 *
> +	 * top_level encodes the level in IOMMU PT format; level 0 is the
> +	 * smallest page size, increasing from there. This has to be translated
> +	 * to any HW specific format. During this call the new top will not be
> +	 * visible to any other API.
> +	 *
> +	 * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if
> +	 * enabled.
> +	 */
> +	void (*change_top)(struct pt_iommu *iommu_table, phys_addr_t top_paddr,
> +			   unsigned int top_level);
> +
> +	/**
> +	 * get_top_lock() - Return a lock to hold when changing the table top
> +	 * @iommu_table: Table to operate on
> +	 *
> +	 * The lock prevents a stale top of the
> +	 * page table from being stored in HW. The lock will be held prior
> +	 * to calling change_top() and released once the top is fully visible.
> +	 *
> +	 * Typically this would be a lock that protects the iommu_domain's
> +	 * attachment list.
> +	 *
> +	 * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if
> +	 * enabled.
> +	 */
> +	spinlock_t *(*get_top_lock)(struct pt_iommu *iommu_table);
> +};
> +
>  static inline void pt_iommu_deinit(struct pt_iommu *iommu_table)
>  {
>  	iommu_table->ops->deinit(iommu_table);
> @@ -114,6 +167,10 @@ struct pt_iommu_cfg {
>  #define IOMMU_PROTOTYPES(fmt)                                                 \
>  	phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
>  						  dma_addr_t iova);            \
> +	int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain,           \
> +				       unsigned long iova, phys_addr_t paddr, \
> +				       size_t pgsize, size_t pgcount,         \
> +				       int prot, gfp_t gfp, size_t *mapped);  \
>  	size_t pt_iommu_##fmt##_unmap_pages(                                  \
>  		struct iommu_domain *domain, unsigned long iova,              \
>  		size_t pgsize, size_t pgcount,                                \
> @@ -136,6 +193,7 @@ struct pt_iommu_cfg {
>   */
>  #define IOMMU_PT_DOMAIN_OPS(fmt)                                \
>  	.iova_to_phys = &pt_iommu_##fmt##_iova_to_phys,         \
> +	.map_pages = &pt_iommu_##fmt##_map_pages,               \
>  	.unmap_pages = &pt_iommu_##fmt##_unmap_pages
>
>  /*
> --
> 2.43.0
>
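
One more small thought: the pt_iommu_flush_ops kdoc is clear, but it might
help driver writers to see the expected wiring spelled out somewhere, e.g.
under the DOC: section. For PT_FEAT_DYNAMIC_TOP I picture something along
these lines - all of the mydrv_* names below are invented for illustration,
and it assumes the driver already serializes its attachment list with a
spinlock in its domain structure:

	struct mydrv_attach {
		struct list_head node;
		/* per-device state */
	};

	struct mydrv_domain {
		struct pt_iommu iommu;		/* generic_pt table */
		spinlock_t attach_lock;		/* protects attachments */
		struct list_head attachments;
	};

	static void mydrv_change_top(struct pt_iommu *iommu_table,
				     phys_addr_t top_paddr,
				     unsigned int top_level)
	{
		struct mydrv_domain *domain = container_of(
			iommu_table, struct mydrv_domain, iommu);
		struct mydrv_attach *attach;

		/*
		 * Called under attach_lock, cannot sleep. Translate
		 * top_level to the HW encoding and repoint every attached
		 * device at the new top before returning.
		 */
		list_for_each_entry(attach, &domain->attachments, node)
			mydrv_write_device_table(attach, top_paddr, top_level);
	}

	static spinlock_t *mydrv_get_top_lock(struct pt_iommu *iommu_table)
	{
		struct mydrv_domain *domain = container_of(
			iommu_table, struct mydrv_domain, iommu);

		return &domain->attach_lock;
	}

	static const struct pt_iommu_flush_ops mydrv_flush_ops = {
		.change_top = mydrv_change_top,
		.get_top_lock = mydrv_get_top_lock,
	};

With cfg.hw_flush_ops = &mydrv_flush_ops the WARN_ON you added in
pt_iommu_init() is then satisfied. If that matches your intent, a condensed
version of it in the documentation would save the next driver some
guessing.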