On Thu, Oct 23, 2025 at 11:21 AM Jason Gunthorpe <[email protected]> wrote: > > IOMMU HW now supports updating a dirty bit in an entry when a DMA writes > to the entry's VA range. iommufd has a uAPI to read and clear the dirty > bits from the tables. > > This is a trivial recursive descent algorithm to read and optionally clear > the dirty bits. The format needs a function to tell if a contiguous entry > is dirty, and a function to clear a contiguous entry back to clean. > > Tested-by: Alejandro Jimenez <[email protected]> > Reviewed-by: Kevin Tian <[email protected]> > Signed-off-by: Jason Gunthorpe <[email protected]> > --- > drivers/iommu/generic_pt/iommu_pt.h | 104 ++++++++++++++++++++++++++++ > include/linux/generic_pt/iommu.h | 6 ++ > 2 files changed, 110 insertions(+) > > diff --git a/drivers/iommu/generic_pt/iommu_pt.h > b/drivers/iommu/generic_pt/iommu_pt.h > index f32e81509f4f09..448c5796d4a861 100644 > --- a/drivers/iommu/generic_pt/iommu_pt.h > +++ b/drivers/iommu/generic_pt/iommu_pt.h > @@ -162,6 +162,108 @@ phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain > *domain, > } > EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU"); > > +struct pt_iommu_dirty_args { > + struct iommu_dirty_bitmap *dirty; > + unsigned int flags; > +}; > + > +static void record_dirty(struct pt_state *pts, > + struct pt_iommu_dirty_args *dirty, > + unsigned int num_contig_lg2) > +{ > + pt_vaddr_t dirty_len; > + > + if (num_contig_lg2 != ilog2(1)) { > + unsigned int index = pts->index; > + unsigned int end_index = log2_set_mod_max_t( > + unsigned int, pts->index, num_contig_lg2); > + > + /* Adjust for being contained inside a contiguous page */ > + end_index = min(end_index, pts->end_index); > + dirty_len = (end_index - index) * > + log2_to_int(pt_table_item_lg2sz(pts)); > + } else { > + dirty_len = log2_to_int(pt_table_item_lg2sz(pts)); > + } > + > + if (dirty->dirty->bitmap) > + iova_bitmap_set(dirty->dirty->bitmap, pts->range->va, > + dirty_len); > + > + if 
(!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) { > + pt_entry_make_write_clean(pts); > + iommu_iotlb_gather_add_range(dirty->dirty->gather, > + pts->range->va, dirty_len); > + } > +} > + > +static inline int __read_and_clear_dirty(struct pt_range *range, void *arg, > + unsigned int level, > + struct pt_table_p *table) > +{ > + struct pt_state pts = pt_init(range, level, table); > + struct pt_iommu_dirty_args *dirty = arg; > + int ret; > + > + for_each_pt_level_entry(&pts) { > + if (pts.type == PT_ENTRY_TABLE) { > + ret = pt_descend(&pts, arg, __read_and_clear_dirty); > + if (ret) > + return ret; > + continue; > + } > + if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts)) > + record_dirty(&pts, dirty, > + pt_entry_num_contig_lg2(&pts)); > + } > + return 0; > +} > + > +/** > + * read_and_clear_dirty() - Manipulate the HW set write dirty state > + * @domain: Domain to manipulate > + * @iova: IO virtual address to start > + * @size: Length of the IOVA > + * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR > + * @dirty: Place to store the dirty bits > + * > + * Iterate over all the entries in the mapped range and record their write > dirty > + * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is specified > then > + * the entries will be left dirty, otherwise they are returned to being not > + * write dirty. > + * > + * Context: The caller must hold a read range lock that includes @iova. > + * > + * Returns: -ERRNO on failure, 0 on success. 
> + */ > +int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain, > + unsigned long iova, size_t size, > + unsigned long flags, > + struct iommu_dirty_bitmap *dirty) > +{ > + struct pt_iommu *iommu_table = > + container_of(domain, struct pt_iommu, domain); > + struct pt_iommu_dirty_args dirty_args = { > + .dirty = dirty, > + .flags = flags, > + }; > + struct pt_range range; > + int ret; > + > +#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty) > + return -EOPNOTSUPP; > +#endif > + > + ret = make_range(common_from_iommu(iommu_table), &range, iova, size); > + if (ret) > + return ret; > + > + ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args); > + PT_WARN_ON(ret); > + return ret; > +} > +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU"); > + > struct pt_iommu_collect_args { > struct iommu_pages_list free_list; > /* Fail if any OAs are within the range */ > @@ -1015,5 +1117,7 @@ EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, > "GENERIC_PT_IOMMU"); > MODULE_LICENSE("GPL"); > MODULE_DESCRIPTION("IOMMU Page table implementation for " > __stringify(PTPFX_RAW)); > MODULE_IMPORT_NS("GENERIC_PT"); > +/* For iommu_dirty_bitmap_record() */ > +MODULE_IMPORT_NS("IOMMUFD"); > > #endif /* __GENERIC_PT_IOMMU_PT_H */ > diff --git a/include/linux/generic_pt/iommu.h > b/include/linux/generic_pt/iommu.h > index 0d59423024d57f..03a906fbe12a83 100644 > --- a/include/linux/generic_pt/iommu.h > +++ b/include/linux/generic_pt/iommu.h > @@ -12,6 +12,7 @@ > struct iommu_iotlb_gather; > struct pt_iommu_ops; > struct pt_iommu_driver_ops; > +struct iommu_dirty_bitmap; > > /** > * DOC: IOMMU Radix Page Table > @@ -182,6 +183,9 @@ struct pt_iommu_cfg { > struct iommu_domain *domain, unsigned long iova, > \ > size_t pgsize, size_t pgcount, > \ > struct iommu_iotlb_gather *iotlb_gather); > \ > + int pt_iommu_##fmt##_read_and_clear_dirty( > \ > + struct iommu_domain *domain, unsigned long iova, size_t size, > \ > + unsigned long flags, 
struct iommu_dirty_bitmap *dirty); > \ > int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, > \ > const struct pt_iommu_##fmt##_cfg *cfg, > \ > gfp_t gfp); > \ > @@ -202,6 +206,8 @@ struct pt_iommu_cfg { > .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ > .map_pages = &pt_iommu_##fmt##_map_pages, \ > .unmap_pages = &pt_iommu_##fmt##_unmap_pages > +#define IOMMU_PT_DIRTY_OPS(fmt) \ > + .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty > > /* > * The driver should setup its domain struct like > -- > 2.43.0 > >
Reviewed-by: Samiullah Khawaja <[email protected]>
