On Thu, Oct 23, 2025 at 11:21 AM Jason Gunthorpe <[email protected]> wrote:
>
> unmap_pages removes mappings and any fully contained interior tables from
> the given range. This follows the now-standard iommu_domain API definition
> where it does not split up larger page sizes into smaller. The caller must
> perform unmap only on ranges created by map or it must have somehow
> otherwise determined safe cut points (eg iommufd/vfio use iova_to_phys to
> scan for them).
>
> Future work will provide 'cut', which explicitly does the page size split
> if the HW can support it.
>
> unmap is implemented with a recursive descent of the tree. If the caller
> provides a VA range that spans an entire table item then the table memory
> can be freed as well.
>
> If an entire table item can be freed then this version will also check the
> leaf-only level of the tree to ensure that all entries are present,
> generating -EINVAL if they are not. Many of the existing drivers don't do
> this extra check.
>
> This version sits under the iommu_domain_ops as unmap_pages() but does not
> require the external page size calculation. The implementation is actually
> unmap_range() and can do arbitrary ranges, internally handling all the
> validation and supporting any arrangement of page sizes. A future series
> can optimize __iommu_unmap() to take advantage of this.
>
> Freed page table memory is batched up in the gather and will be freed in
> the driver's iotlb_sync() callback after the IOTLB flush completes.
>
> Tested-by: Alejandro Jimenez <[email protected]>
> Reviewed-by: Kevin Tian <[email protected]>
> Signed-off-by: Jason Gunthorpe <[email protected]>
> ---
>  drivers/iommu/generic_pt/iommu_pt.h | 156 ++++++++++++++++++++++++++++
>  include/linux/generic_pt/iommu.h    |  10 +-
>  2 files changed, 164 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
> index 5ff1b887928a46..e3d1b272723db0 100644
> --- a/drivers/iommu/generic_pt/iommu_pt.h
> +++ b/drivers/iommu/generic_pt/iommu_pt.h
> @@ -14,6 +14,29 @@
>  #include <linux/export.h>
>  #include <linux/iommu.h>
>  #include "../iommu-pages.h"
> +#include <linux/cleanup.h>
> +#include <linux/dma-mapping.h>
> +
> +static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
> +                              struct pt_iommu *iommu_table, pt_vaddr_t iova,
> +                              pt_vaddr_t len,
> +                              struct iommu_pages_list *free_list)
> +{
> +        struct pt_common *common = common_from_iommu(iommu_table);
> +
> +        if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
> +            iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
> +                iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
> +                /*
> +                 * Note that the sync frees the gather's free list, so we must
> +                 * not have any pages on that list that are covered by iova/len
> +                 */
> +        } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
> +                iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
> +        }
> +
> +        iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
> +}
>
>  #define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
>
> @@ -164,6 +187,139 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
>                  log2_to_int(pt_top_memsize_lg2(common, top_of_table)));
>  }
>
> +struct pt_unmap_args {
> +        struct iommu_pages_list free_list;
> +        pt_vaddr_t unmapped;
> +};
> +
> +static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
> +                                        unsigned int level,
> +                                        struct pt_table_p *table)
> +{
> +        struct pt_state pts = pt_init(range, level, table);
> +        struct pt_unmap_args *unmap = arg;
> +        unsigned int num_oas = 0;
> +        unsigned int start_index;
> +        int ret = 0;
> +
> +        _pt_iter_first(&pts);
> +        start_index = pts.index;
> +        pts.type = pt_load_entry_raw(&pts);
> +        /*
> +         * A starting index is in the middle of a contiguous entry
> +         *
> +         * The IOMMU API does not require drivers to support unmapping parts
> +         * of large pages. Long ago VFIO would try to split maps but the
> +         * current version never does.
> +         *
> +         * Instead when unmap reaches a partial unmap of the start of a large
> +         * IOPTE it should remove the entire IOPTE and return that size to the
> +         * caller.
> +         */
> +        if (pts.type == PT_ENTRY_OA) {
> +                if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
> +                        return -EINVAL;
> +                /* Micro optimization */
> +                goto start_oa;
> +        }
> +
> +        do {
> +                if (pts.type != PT_ENTRY_OA) {
> +                        bool fully_covered;
> +
> +                        if (pts.type != PT_ENTRY_TABLE) {
> +                                ret = -EINVAL;
> +                                break;
> +                        }
> +
> +                        if (pts.index != start_index)
> +                                pt_index_to_va(&pts);
> +                        pts.table_lower = pt_table_ptr(&pts);
> +
> +                        fully_covered = pt_entry_fully_covered(
> +                                &pts, pt_table_item_lg2sz(&pts));
> +
> +                        ret = pt_descend(&pts, arg, __unmap_range);
> +                        if (ret)
> +                                break;
> +
> +                        /*
> +                         * If the unmapping range fully covers the table then
> +                         * we can free it as well. The clear is delayed until
> +                         * we succeed in clearing the lower table levels.
> +                         */
> +                        if (fully_covered) {
> +                                iommu_pages_list_add(&unmap->free_list,
> +                                                     pts.table_lower);
> +                                pt_clear_entries(&pts, ilog2(1));
> +                        }
> +                        pts.index++;
> +                } else {
> +                        unsigned int num_contig_lg2;
> +start_oa:
> +                        /*
> +                         * If the caller requested a last that falls within a
> +                         * single entry then the entire entry is unmapped and
> +                         * the length returned will be larger than requested.
> +                         */
> +                        num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
> +                        pt_clear_entries(&pts, num_contig_lg2);
> +                        num_oas += log2_to_int(num_contig_lg2);
> +                        pts.index += log2_to_int(num_contig_lg2);
> +                }
> +                if (pts.index >= pts.end_index)
> +                        break;
> +                pts.type = pt_load_entry_raw(&pts);
> +        } while (true);
> +
> +        unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
> +        return ret;
> +}
> +
> +/**
> + * unmap_pages() - Make a range of IOVA empty/not present
> + * @domain: Domain to manipulate
> + * @iova: IO virtual address to start
> + * @pgsize: Length of each page
> + * @pgcount: Length of the range in pgsize units starting from @iova
> + * @iotlb_gather: Gather struct that must be flushed on return
> + *
> + * unmap_pages() will remove a translation created by map_pages(). It cannot
> + * subdivide a mapping created by map_pages(), so it should be called with
> + * IOVA ranges that match those passed to map_pages(). The IOVA range can
> + * aggregate contiguous map_pages() calls so long as no individual range is
> + * split.
> + *
> + * Context: The caller must hold a write range lock that includes
> + * the whole range.
> + *
> + * Returns: Number of bytes of VA unmapped. iova + res will be the point
> + * unmapping stopped.
> + */
> +size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova,
> +                              size_t pgsize, size_t pgcount,
> +                              struct iommu_iotlb_gather *iotlb_gather)
> +{
> +        struct pt_iommu *iommu_table =
> +                container_of(domain, struct pt_iommu, domain);
> +        struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT(
> +                                               unmap.free_list) };
> +        pt_vaddr_t len = pgsize * pgcount;
> +        struct pt_range range;
> +        int ret;
> +
> +        ret = make_range(common_from_iommu(iommu_table), &range, iova, len);
> +        if (ret)
> +                return 0;
> +
> +        pt_walk_range(&range, __unmap_range, &unmap);
> +
> +        gather_range_pages(iotlb_gather, iommu_table, iova, len,
> +                           &unmap.free_list);
> +
> +        return unmap.unmapped;
> +}
> +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU");
> +
>  static void NS(get_info)(struct pt_iommu *iommu_table,
>                           struct pt_iommu_info *info)
>  {
> diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
> index 5622856e199881..ceb6bc9cea37cd 100644
> --- a/include/linux/generic_pt/iommu.h
> +++ b/include/linux/generic_pt/iommu.h
> @@ -9,6 +9,7 @@
>  #include <linux/iommu.h>
>  #include <linux/mm_types.h>
>
> +struct iommu_iotlb_gather;
>  struct pt_iommu_ops;
>
>  /**
> @@ -119,6 +120,10 @@ struct pt_iommu_cfg {
>  #define IOMMU_PROTOTYPES(fmt)                                                  \
>          phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
>                                                    dma_addr_t iova);            \
> +        size_t pt_iommu_##fmt##_unmap_pages(                                   \
> +                struct iommu_domain *domain, unsigned long iova,               \
> +                size_t pgsize, size_t pgcount,                                 \
> +                struct iommu_iotlb_gather *iotlb_gather);                      \
>          int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table,                \
>                                    const struct pt_iommu_##fmt##_cfg *cfg,      \
>                                    gfp_t gfp);                                  \
> @@ -135,8 +140,9 @@ struct pt_iommu_cfg {
>   * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the
>   * iommu_pt
>   */
> -#define IOMMU_PT_DOMAIN_OPS(fmt) \
> -        .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys,
> +#define IOMMU_PT_DOMAIN_OPS(fmt) \
> +        .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \
> +        .unmap_pages = &pt_iommu_##fmt##_unmap_pages
>
>  /*
>   * The driver should setup its domain struct like
> --
> 2.43.0
>
>
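
One note for readers coming from the per-driver io-pgtable code: the table
pages collected in unmap->free_list are only spliced onto the gather's
freelist here; nothing is released until the driver's iotlb_sync() runs after
the IOTLB flush. A rough caller-side sketch of that flow (illustrative only,
not part of this patch):

        static size_t example_unmap(struct iommu_domain *domain,
                                    unsigned long iova, size_t size)
        {
                struct iommu_iotlb_gather gather;
                size_t unmapped;

                iommu_iotlb_gather_init(&gather);

                /* Ends up in pt_iommu_<fmt>_unmap_pages() via domain->ops */
                unmapped = iommu_unmap_fast(domain, iova, size, &gather);

                /*
                 * The driver's iotlb_sync() flushes the IOTLB and only then
                 * frees the page table memory batched on gather.freelist.
                 */
                iommu_iotlb_sync(domain, &gather);

                return unmapped;
        }

On the driver side the new op is picked up through the macro added to
iommu.h, along the lines of (format name and sync callback made up):

        static const struct iommu_domain_ops my_fmt_domain_ops = {
                IOMMU_PT_DOMAIN_OPS(myfmt),
                .iotlb_sync = my_fmt_iotlb_sync,
        };
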
Reviewed-by: Samiullah Khawaja <[email protected]>
